// Code extracted from ulmBLAS: https://github.com/michael-lehn/ulmBLAS-core #ifndef GEMM_HPP #define GEMM_HPP #include #include #if defined(_OPENMP) #include #endif //-- new with alignment -------------------------------------------------------- void * malloc_(std::size_t alignment, std::size_t size) { alignment = std::max(alignment, alignof(void *)); size += alignment; void *ptr = std::malloc(size); void *ptr2 = (void *)(((uintptr_t)ptr + alignment) & ~(alignment-1)); void **vp = (void**) ptr2 - 1; *vp = ptr; return ptr2; } void free_(void *ptr) { std::free(*((void**)ptr-1)); } //-- Config -------------------------------------------------------------------- #ifdef HAVE_FMA # ifndef BS_D_MR # define BS_D_MR 4 # endif # ifndef BS_D_NR # define BS_D_NR 12 # endif # ifndef BS_D_MC # define BS_D_MC 256 # endif # ifndef BS_D_KC # define BS_D_KC 512 # endif # ifndef BS_D_NC # define BS_D_NC 4092 # endif # ifndef BS_D_ALIGN # define BS_D_ALIGN 32 # endif #endif #ifndef BS_D_MR #define BS_D_MR 4 #endif #ifndef BS_D_NR #define BS_D_NR 8 #endif #ifndef BS_D_MC #define BS_D_MC 256 #endif #ifndef BS_D_KC #define BS_D_KC 256 #endif #ifndef BS_D_NC #define BS_D_NC 4096 #endif #ifndef BS_D_ALIGN #define BS_D_ALIGN 32 #endif template struct BlockSize { static const int MC = 64; static const int KC = 64; static const int NC = 256; static const int MR = 8; static const int NR = 8; static const int align = alignof(T); static_assert(MC>0 && KC>0 && NC>0 && MR>0 && NR>0, "Invalid block size."); static_assert(MC % MR == 0, "MC must be a multiple of MR."); static_assert(NC % NR == 0, "NC must be a multiple of NR."); }; template <> struct BlockSize { static const int MC = BS_D_MC; static const int KC = BS_D_KC; static const int NC = BS_D_NC; static const int MR = BS_D_MR; static const int NR = BS_D_NR; static const int align = BS_D_ALIGN; static_assert(MC>0 && KC>0 && NC>0 && MR>0 && NR>0, "Invalid block size."); static_assert(MC % MR == 0, "MC must be a multiple of MR."); static_assert(NC % NR == 0, "NC must be a multiple of NR."); }; //-- aux routines -------------------------------------------------------------- template void geaxpy(const Alpha &alpha, const MX &X, MY &Y) { assert(X.size1()==Y.size1()); assert(X.size2()==Y.size2()); typedef typename MX::size_type size_type; for (size_type j=0; j void gescal(const Alpha &alpha, MX &X) { typedef typename MX::size_type size_type; for (size_type j=0; j void geaxpy(Index m, Index n, const Alpha &alpha, const TX *X, Index incRowX, Index incColX, TY *Y, Index incRowY, Index incColY) { for (Index j=0; j void gescal(Index m, Index n, const Alpha &alpha, TX *X, Index incRowX, Index incColX) { if (alpha!=Alpha(0)) { for (Index j=0; j void ugemm(Index kc, T alpha, const T *A, const T *B, T beta, T *C, Index incRowC, Index incColC) { const Index MR = BlockSize::MR; const Index NR = BlockSize::NR; T P[BlockSize::MR*BlockSize::NR]; for (Index l=0; l void mgemm(Index mc, Index nc, Index kc, T alpha, const T *A, const T *B, Beta beta, TC *C, Index incRowC, Index incColC) { const Index MR = BlockSize::MR; const Index NR = BlockSize::NR; const Index mp = (mc+MR-1) / MR; const Index np = (nc+NR-1) / NR; const Index mr_ = mc % MR; const Index nr_ = nc % NR; #if defined(_OPENMP) #pragma omp parallel for #endif for (Index j=0; j void pack_A(const MA &A, T *p) { typedef typename MA::size_type size_type; size_type mc = A.size1(); size_type kc = A.size2(); size_type MR = BlockSize::MR; size_type mp = (mc+MR-1) / MR; for (size_type j=0; j void pack_B(const MB &B, T *p) { typedef typename MB::size_type size_type; size_type kc = B.size1(); size_type nc = B.size2(); size_type NR = BlockSize::NR; size_type np = (nc+NR-1) / NR; for (size_type l=0; l void gemm(Alpha alpha, const MatrixA &A, const MatrixB &B, Beta beta, MatrixC &C) { assert(A.size2()==B.size1()); namespace ublas = boost::numeric::ublas; typedef typename MatrixC::size_type size_type; typedef typename MatrixA::value_type TA; typedef typename MatrixB::value_type TB; typedef typename MatrixC::value_type TC; typedef typename std::common_type::type T; const size_type MC = BlockSize::MC; const size_type NC = BlockSize::NC; const size_type MR = BlockSize::MR; const size_type NR = BlockSize::NR; const size_type m = C.size1(); const size_type n = C.size2(); const size_type k = A.size2(); const size_type KC = BlockSize::KC; const size_type mb = (m+MC-1) / MC; const size_type nb = (n+NC-1) / NC; const size_type kb = (k+KC-1) / KC; const size_type mc_ = m % MC; const size_type nc_ = n % NC; const size_type kc_ = k % KC; TC *C_ = &C(0,0); const size_type incRowC = &C(1,0) - &C(0,0); const size_type incColC = &C(0,1) - &C(0,0); T *A_ = (T*) malloc_(BlockSize::align, sizeof(T)*(MC*KC+MR)); T *B_ = (T*) malloc_(BlockSize::align, sizeof(T)*(KC*NC+NR)); if (alpha==Alpha(0) || k==0) { gescal(beta, C); return; } for (size_type j=0; j