// Code extracted from ulmBLAS: https://github.com/michael-lehn/ulmBLAS-core
// Contains: GEMM (the TRSM part of the original extract is not present in this file).
#ifndef GEMM_HPP
#define GEMM_HPP
#include <algorithm>
#include <cassert>
#include <cstddef>
#include <cstdint>
#include <cstdlib>
#include <type_traits>
#if defined(_OPENMP)
#include <omp.h>
#endif
namespace foo {
//-- malloc with alignment (Blaze ships its own aligned allocator; this is a self-contained fallback) --
// Portable aligned allocation: over-allocate by `alignment`, round the
// returned pointer up, and stash the pointer malloc() actually returned in
// the void* slot just below the aligned address so free_aligned() can
// recover it.
//
// @param alignment  requested alignment in bytes; must be a power of two.
//                   It is raised to at least alignof(void *) so the hidden
//                   bookkeeping slot is itself suitably aligned.
// @param size       number of usable bytes requested.
// @return           pointer aligned to `alignment`, or nullptr on failure.
void *
malloc_aligned(std::size_t alignment, std::size_t size)
{
    alignment = std::max(alignment, alignof(void *));
    // The rounding below only works for power-of-two alignments.
    assert((alignment & (alignment-1)) == 0 && "alignment must be a power of two");
    size += alignment;
    void *ptr = std::malloc(size);
    if (!ptr) {
        // Propagate allocation failure instead of computing from a null
        // pointer and writing through it below.
        return nullptr;
    }
    // Round up to the next multiple of `alignment` (mask in uintptr_t so the
    // bit-twiddling is well-defined on the pointer representation).
    void *ptr2 = (void *)(((uintptr_t)ptr + alignment) & ~(uintptr_t)(alignment-1));
    void **vp = (void**) ptr2 - 1;
    *vp = ptr;   // remember the original block for free_aligned()
    return ptr2;
}
// Release a block obtained from malloc_aligned(). The pointer originally
// returned by std::malloc() is stored one void* slot before `ptr`.
// A null pointer is accepted as a no-op, mirroring std::free() — the
// previous version dereferenced (void**)nullptr - 1.
void
free_aligned(void *ptr)
{
    if (ptr) {
        std::free(*((void**)ptr-1));
    }
}
//-- Config --------------------------------------------------------------------
// SIMD-Register width in bits
// SSE: 128
// AVX/FMA: 256
// AVX-512: 512
#ifndef SIMD_REGISTER_WIDTH
#define SIMD_REGISTER_WIDTH 256
#endif
#ifdef HAVE_FMA
# ifndef BS_D_MR
# define BS_D_MR 4
# endif
# ifndef BS_D_NR
# define BS_D_NR 12
# endif
# ifndef BS_D_MC
# define BS_D_MC 256
# endif
# ifndef BS_D_KC
# define BS_D_KC 512
# endif
# ifndef BS_D_NC
# define BS_D_NC 4092
# endif
#endif
#ifdef HAVE_BLISAVX
# ifndef BS_D_MR
# define BS_D_MR 8
# endif
# ifndef BS_D_NR
# define BS_D_NR 4
# endif
# ifndef BS_D_MC
# define BS_D_MC 96
# endif
# ifndef BS_D_KC
# define BS_D_KC 256
# endif
# ifndef BS_D_NC
# define BS_D_NC 4096
# endif
#endif
#ifndef BS_D_MR
#define BS_D_MR 4
#endif
#ifndef BS_D_NR
#define BS_D_NR 8
#endif
#ifndef BS_D_MC
#define BS_D_MC 256
#endif
#ifndef BS_D_KC
#define BS_D_KC 256
#endif
#ifndef BS_D_NC
#define BS_D_NC 4096
#endif
#if !defined(USE_SIMD) && defined(HAVE_AVX)
#define USE_SIMD
#endif
#if !defined(USE_SIMD) && defined(HAVE_FMA)
#define USE_SIMD
#endif
#if !defined(USE_SIMD) && defined(HAVE_GCCVEC)
#define USE_SIMD
#endif
#if !defined(USE_SIMD) && defined(HAVE_BLISAVX)
#define USE_SIMD
#endif
// Fallback blocking parameters for element types without a dedicated
// specialization. vlen == 0 routes ugemm to the generic scalar micro
// kernel via SFINAE.
template <typename T>
struct BlockSize
{
    static constexpr int MR     = 8;            // micro-tile rows
    static constexpr int NR     = 8;            // micro-tile columns
    static constexpr int MC     = 64;           // rows of a packed A block
    static constexpr int KC     = 64;           // depth (contraction) block
    static constexpr int NC     = 256;          // columns of a packed B block
    static constexpr int rwidth = 0;            // no SIMD registers assumed
    static constexpr int align  = alignof(T);   // natural alignment only
    static constexpr int vlen   = 0;            // scalar kernel selector
    static_assert(MC>0 && KC>0 && NC>0 && MR>0 && NR>0, "Invalid block size.");
    static_assert(MC % MR == 0, "MC must be a multiple of MR.");
    static_assert(NC % NR == 0, "NC must be a multiple of NR.");
};
// Blocking parameters for double precision. The BS_D_* macros are preset
// above for FMA / BLIS-AVX builds and fall back to generic defaults, so
// these values track the build configuration.
template <>
struct BlockSize<double>
{
// Cache blocking: A is packed in MC x KC blocks, B in KC x NC blocks.
static constexpr int MC = BS_D_MC;
static constexpr int KC = BS_D_KC;
static constexpr int NC = BS_D_NC;
// Register blocking: the micro kernel updates MR x NR tiles of C.
static constexpr int MR = BS_D_MR;
static constexpr int NR = BS_D_NR;
// SIMD register width in bits; packing buffers are aligned to one register.
static constexpr int rwidth = SIMD_REGISTER_WIDTH;
static constexpr int align = rwidth / 8;
#ifdef USE_SIMD
// Number of doubles per SIMD register.
static constexpr int vlen = rwidth / (8*sizeof(double));
#else
// vlen == 0 selects the generic scalar micro kernel in ugemm's enable_if.
static constexpr int vlen = 0;
#endif
static_assert(MC>0 && KC>0 && NC>0 && MR>0 && NR>0, "Invalid block size.");
static_assert(MC % MR == 0, "MC must be a multiple of MR.");
static_assert(NC % NR == 0, "NC must be a multiple of NR.");
};
//-- aux routines (can be replaced by calling BLAZE routines...) ---------------
// Y += alpha*X for matrix-like types exposing rows()/columns()/operator().
// Both matrices must have identical dimensions.
template <typename Alpha, typename MX, typename MY>
void
geaxpy(const Alpha &alpha, const MX &X, MY &Y)
{
    assert(X.rows()==Y.rows());
    assert(X.columns()==Y.columns());
    const std::size_t numRows = X.rows();
    const std::size_t numCols = X.columns();
    // Column-major traversal to match the packing/storage conventions used
    // elsewhere in this file.
    for (std::size_t col=0; col<numCols; ++col) {
        for (std::size_t row=0; row<numRows; ++row) {
            Y(row,col) += alpha*X(row,col);
        }
    }
}
// X *= alpha, elementwise, for matrix-like types exposing
// rows()/columns()/operator().
template <typename Alpha, typename MX>
void
gescal(const Alpha &alpha, MX &X)
{
    const std::size_t numRows = X.rows();
    const std::size_t numCols = X.columns();
    for (std::size_t col=0; col<numCols; ++col) {
        for (std::size_t row=0; row<numRows; ++row) {
            X(row,col) *= alpha;
        }
    }
}
// Strided version of Y += alpha*X on raw buffers: an m x n matrix stored
// with arbitrary row/column strides (incRow*/incCol*).
template <typename Index, typename Alpha, typename TX, typename TY>
void
geaxpy(Index m, Index n,
       const Alpha &alpha,
       const TX *X, Index incRowX, Index incColX,
       TY *Y, Index incRowY, Index incColY)
{
    for (Index j=0; j<n; ++j) {
        // Hoist the column base pointers out of the inner loop.
        const TX *xCol = X + j*incColX;
        TY       *yCol = Y + j*incColY;
        for (Index i=0; i<m; ++i) {
            yCol[i*incRowY] += alpha * xCol[i*incRowX];
        }
    }
}
// Strided version of X *= alpha on a raw buffer: an m x n matrix stored
// with arbitrary row/column strides.
template <typename Index, typename Alpha, typename TX>
void
gescal(Index m, Index n,
       const Alpha &alpha,
       TX *X, Index incRowX, Index incColX)
{
    for (Index j=0; j<n; ++j) {
        TX *col = X + j*incColX;   // base of column j
        for (Index i=0; i<m; ++i) {
            col[i*incRowX] *= alpha;
        }
    }
}
// Copy an m x n matrix between raw buffers with independent strides
// (handles layout changes, e.g. row-major -> col-major).
template <typename IndexType, typename MX, typename MY>
void
gecopy(IndexType m, IndexType n,
       const MX *X, IndexType incRowX, IndexType incColX,
       MY *Y, IndexType incRowY, IndexType incColY)
{
    for (IndexType j=0; j<n; ++j) {
        const MX *src = X + j*incColX;
        MY       *dst = Y + j*incColY;
        for (IndexType i=0; i<m; ++i) {
            dst[i*incRowY] = src[i*incRowX];
        }
    }
}
//-- Micro Kernel --------------------------------------------------------------
//-- Micro Kernel --------------------------------------------------------------
// Generic (scalar) micro kernel: C := beta*C + alpha*(A*B) on one MR x NR
// tile. Selected via SFINAE only when no SIMD kernel is configured
// (BlockSize<T>::vlen == 0). A is a packed MR x kc panel (column by column),
// B a packed kc x NR panel (row by row). a_next/b_next are prefetch hints
// consumed by the SIMD kernels and unused here.
template <typename Index, typename T>
typename std::enable_if<BlockSize<T>::vlen == 0,
         void>::type
ugemm(Index kc, T alpha, const T *A, const T *B, T beta,
      T *C, Index incRowC, Index incColC,
      const T *a_next, const T *b_next)
{
    const Index MR = BlockSize<T>::MR;
    const Index NR = BlockSize<T>::NR;
    (void)a_next;   // prefetch hints: only meaningful for SIMD kernels
    (void)b_next;
    // Accumulate the rank-kc update into a zero-initialized local tile.
    T acc[BlockSize<T>::MR*BlockSize<T>::NR] = {};
    for (Index l=0; l<kc; ++l) {
        const T *a = A + l*MR;   // column l of the A panel
        const T *b = B + l*NR;   // row l of the B panel
        for (Index j=0; j<NR; ++j) {
            for (Index i=0; i<MR; ++i) {
                acc[i+j*MR] += a[i]*b[j];
            }
        }
    }
    // Merge into C: scale the existing tile by beta, then add alpha*acc
    // (same operation order as the original: (C*beta) + alpha*acc).
    for (Index j=0; j<NR; ++j) {
        for (Index i=0; i<MR; ++i) {
            T &c = C[i*incRowC+j*incColC];
            c = c*beta + alpha*acc[i+j*MR];
        }
    }
}
#if defined HAVE_AVX
#include "avx.h"
#elif defined HAVE_FMA
#include "fma.h"
#elif defined HAVE_GCCVEC
#include "gccvec.h"
#elif defined HAVE_BLISAVX
#include "blisavx.h"
#endif
//-- Macro Kernel --------------------------------------------------------------
template <typename Index, typename T, typename Beta, typename TC>
void
mgemm(Index mc, Index nc, Index kc, const T &alpha,
const T *A, const T *B, Beta beta,
TC *C, Index incRowC, Index incColC)
{
const Index MR = BlockSize<T>::MR;
const Index NR = BlockSize<T>::NR;
const Index mp = (mc+MR-1) / MR;
const Index np = (nc+NR-1) / NR;
const Index mr_ = mc % MR;
const Index nr_ = nc % NR;
const T *nextA;
const T *nextB;
#if defined(_OPENMP)
#pragma omp parallel for
#endif
for (Index j=0; j<np; ++j) {
const Index nr = (j!=np-1 || nr_==0) ? NR : nr_;
T C_[BlockSize<T>::MR*BlockSize<T>::NR];
nextB = &B[j*kc*NR];
for (Index i=0; i<mp; ++i) {
const Index mr = (i!=mp-1 || mr_==0) ? MR : mr_;
nextA = &A[(i+1)*kc*MR];
if (i==mp-1) {
nextA = A;
nextB = &B[(j+1)*kc*NR];
if (j==np-1) {
nextB = B;
}
}
if (mr==MR && nr==NR) {
ugemm(kc, alpha,
&A[i*kc*MR], &B[j*kc*NR],
beta,
&C[i*MR*incRowC+j*NR*incColC],
incRowC, incColC,
nextA, nextB);
} else {
std::fill_n(C_, MR*NR, T(0));
ugemm(kc, alpha,
&A[i*kc*MR], &B[j*kc*NR],
T(0),
C_, Index(1), MR,
nextA, nextB);
gescal(mr, nr, beta,
&C[i*MR*incRowC+j*NR*incColC],
incRowC, incColC);
geaxpy(mr, nr, T(1), C_, Index(1), MR,
&C[i*MR*incRowC+j*NR*incColC],
incRowC, incColC);
}
}
}
}
//-- Packing blocks ------------------------------------------------------------
//-- Packing blocks ------------------------------------------------------------
// Pack the mc x kc matrix A into p as ceil(mc/MR) consecutive row panels.
// Each panel holds MR rows stored column by column; rows past mc are padded
// with zeros so the micro kernel always sees full MR x kc panels.
template <typename MA, typename T>
void
pack_A(const MA &A, T *p)
{
    const std::size_t mc = A.rows();
    const std::size_t kc = A.columns();
    const std::size_t MR = BlockSize<T>::MR;
    const std::size_t mp = (mc+MR-1) / MR;
    for (std::size_t l=0; l<mp; ++l) {               // panel index
        T *panel = p + l*MR*kc;
        for (std::size_t j=0; j<kc; ++j) {           // column within the panel
            for (std::size_t i0=0; i0<MR; ++i0) {    // row within the panel
                const std::size_t i = l*MR + i0;     // global row index
                panel[j*MR+i0] = (i<mc) ? A(i,j) : T(0);
            }
        }
    }
}
// Pack the kc x nc matrix B into p as ceil(nc/NR) consecutive column panels.
// Each panel holds NR columns stored row by row; columns past nc are padded
// with zeros so the micro kernel always sees full kc x NR panels.
template <typename MB, typename T>
void
pack_B(const MB &B, T *p)
{
    const std::size_t kc = B.rows();
    const std::size_t nc = B.columns();
    const std::size_t NR = BlockSize<T>::NR;
    const std::size_t np = (nc+NR-1) / NR;
    for (std::size_t l=0; l<np; ++l) {               // panel index
        T *panel = p + l*NR*kc;
        for (std::size_t j0=0; j0<NR; ++j0) {        // column within the panel
            const std::size_t j = l*NR + j0;         // global column index
            for (std::size_t i=0; i<kc; ++i) {       // row within the panel
                panel[i*NR+j0] = (j<nc) ? B(i,j) : T(0);
            }
        }
    }
}
//-- Frame routine -------------------------------------------------------------
template <typename Alpha, typename MatrixA, typename MatrixB,
typename Beta, typename MatrixC>
void
gemm(Alpha alpha, const MatrixA &A, const MatrixB &B, Beta beta, MatrixC &C)
{
assert((~A).columns()==(~B).rows());
assert((~C).rows()==(~A).rows());
assert((~C).columns()==(~B).columns());
typedef typename MatrixA::ElementType TA;
typedef typename MatrixB::ElementType TB;
typedef typename MatrixC::ElementType TC;
typedef typename std::common_type<Alpha, TA, TB>::type T;
const std::size_t m = (~C).rows();
const std::size_t n = (~C).columns();
const std::size_t k = (~A).rows();
// Here we should choose block sizes at runtime based on the problem size
const std::size_t MC = BlockSize<T>::MC;
const std::size_t NC = BlockSize<T>::NC;
const std::size_t KC = BlockSize<T>::KC;
const std::size_t MR = BlockSize<T>::MR;
const std::size_t NR = BlockSize<T>::NR;
const std::size_t mb = (m+MC-1) / MC;
const std::size_t nb = (n+NC-1) / NC;
const std::size_t kb = (k+KC-1) / KC;
const std::size_t mc_ = m % MC;
const std::size_t nc_ = n % NC;
const std::size_t kc_ = k % KC;
if (m==0 || n==0 || ((alpha==Alpha(0) || k==0) && (beta==Beta(1)))) {
return;
}
// Actually C is not required to be row- or col-major...
TC *C_ = (~C).data();
const std::size_t incRowC = blaze::IsRowMajorMatrix<MatrixC>::value
? (~C).spacing()
: 1;
const std::size_t incColC = blaze::IsRowMajorMatrix<MatrixC>::value
? 1
: (~C).spacing();
// Here we should use unique pointers for the buffers A_ and B_
T *A_ = (T*) malloc_aligned(BlockSize<T>::align, sizeof(T)*(MC*KC+MR));
T *B_ = (T*) malloc_aligned(BlockSize<T>::align, sizeof(T)*(KC*NC+NR));
if (alpha==Alpha(0) || k==0) {
gescal(beta, ~C);
return;
}
for (std::size_t j=0; j<nb; ++j) {
std::size_t nc = (j!=nb-1 || nc_==0) ? NC : nc_;
for (std::size_t l=0; l<kb; ++l) {
std::size_t kc = (l!=kb-1 || kc_==0) ? KC : kc_;
Beta beta_ = (l==0) ? beta : Beta(1);
const auto Bs = blaze::submatrix(~B, l*KC, j*NC, kc, nc);
pack_B(Bs, B_);
for (std::size_t i=0; i<mb; ++i) {
std::size_t mc = (i!=mb-1 || mc_==0) ? MC : mc_;
const auto As = blaze::submatrix(~A, i*MC, l*KC, mc, kc);
pack_A(As, A_);
mgemm(mc, nc, kc,
T(alpha), A_, B_, beta_,
&C_[i*MC*incRowC+j*NC*incColC],
incRowC, incColC);
}
}
}
free_aligned(A_);
free_aligned(B_);
}
} // namespace foo
#endif