// Code extracted from ulmBLAS: https://github.com/michael-lehn/ulmBLAS-core // Contains: GEMM and TRSM. #ifndef GEMM_HPP #define GEMM_HPP #include #include #if defined(_OPENMP) #include #endif namespace foo { //-- malloc with alignment (I guess that is already in BLAZE) ------------------ void * malloc_aligned(std::size_t alignment, std::size_t size) { alignment = std::max(alignment, alignof(void *)); size += alignment; void *ptr = std::malloc(size); void *ptr2 = (void *)(((uintptr_t)ptr + alignment) & ~(alignment-1)); void **vp = (void**) ptr2 - 1; *vp = ptr; return ptr2; } void free_aligned(void *ptr) { std::free(*((void**)ptr-1)); } //-- Config -------------------------------------------------------------------- // SIMD-Register width in bits // SSE: 128 // AVX/FMA: 256 // AVX-512: 512 #ifndef SIMD_REGISTER_WIDTH #define SIMD_REGISTER_WIDTH 256 #endif #ifdef HAVE_FMA # ifndef BS_D_MR # define BS_D_MR 4 # endif # ifndef BS_D_NR # define BS_D_NR 12 # endif # ifndef BS_D_MC # define BS_D_MC 256 # endif # ifndef BS_D_KC # define BS_D_KC 512 # endif # ifndef BS_D_NC # define BS_D_NC 4092 # endif #endif #ifdef HAVE_BLISAVX # ifndef BS_D_MR # define BS_D_MR 8 # endif # ifndef BS_D_NR # define BS_D_NR 4 # endif # ifndef BS_D_MC # define BS_D_MC 96 # endif # ifndef BS_D_KC # define BS_D_KC 256 # endif # ifndef BS_D_NC # define BS_D_NC 4096 # endif #endif #ifndef BS_D_MR #define BS_D_MR 4 #endif #ifndef BS_D_NR #define BS_D_NR 8 #endif #ifndef BS_D_MC #define BS_D_MC 256 #endif #ifndef BS_D_KC #define BS_D_KC 256 #endif #ifndef BS_D_NC #define BS_D_NC 4096 #endif #if !defined(USE_SIMD) && defined(HAVE_AVX) #define USE_SIMD #endif #if !defined(USE_SIMD) && defined(HAVE_FMA) #define USE_SIMD #endif #if !defined(USE_SIMD) && defined(HAVE_GCCVEC) #define USE_SIMD #endif #if !defined(USE_SIMD) && defined(HAVE_BLISAVX) #define USE_SIMD #endif template struct BlockSize { static constexpr int MC = 64; static constexpr int KC = 64; static constexpr int NC = 256; static constexpr int MR = 8; static constexpr int NR = 8; static constexpr int rwidth = 0; static constexpr int align = alignof(T); static constexpr int vlen = 0; static_assert(MC>0 && KC>0 && NC>0 && MR>0 && NR>0, "Invalid block size."); static_assert(MC % MR == 0, "MC must be a multiple of MR."); static_assert(NC % NR == 0, "NC must be a multiple of NR."); }; template <> struct BlockSize { static constexpr int MC = BS_D_MC; static constexpr int KC = BS_D_KC; static constexpr int NC = BS_D_NC; static constexpr int MR = BS_D_MR; static constexpr int NR = BS_D_NR; static constexpr int rwidth = SIMD_REGISTER_WIDTH; static constexpr int align = rwidth / 8; #ifdef USE_SIMD static constexpr int vlen = rwidth / (8*sizeof(double)); #else static constexpr int vlen = 0; #endif static_assert(MC>0 && KC>0 && NC>0 && MR>0 && NR>0, "Invalid block size."); static_assert(MC % MR == 0, "MC must be a multiple of MR."); static_assert(NC % NR == 0, "NC must be a multiple of NR."); static_assert(rwidth % sizeof(double) == 0, "SIMD register width not sane."); }; //-- aux routines (can be replaced by calling BLAZE routines...) --------------- template void geaxpy(const Alpha &alpha, const MX &X, MY &Y) { assert(X.rows()==Y.rows()); assert(X.columns()==Y.columns()); for (std::size_t j=0; j void gescal(const Alpha &alpha, MX &X) { for (std::size_t j=0; j void geaxpy(Index m, Index n, const Alpha &alpha, const TX *X, Index incRowX, Index incColX, TY *Y, Index incRowY, Index incColY) { for (Index j=0; j void gescal(Index m, Index n, const Alpha &alpha, TX *X, Index incRowX, Index incColX) { for (Index j=0; j void gecopy(IndexType m, IndexType n, const MX *X, IndexType incRowX, IndexType incColX, MY *Y, IndexType incRowY, IndexType incColY) { for (IndexType j=0; j typename std::enable_if::vlen == 0, void>::type ugemm(Index kc, T alpha, const T *A, const T *B, T beta, T *C, Index incRowC, Index incColC, const T *a_next, const T *b_next) { const Index MR = BlockSize::MR; const Index NR = BlockSize::NR; T P[BlockSize::MR*BlockSize::NR]; for (Index l=0; l void mgemm(Index mc, Index nc, Index kc, const T &alpha, const T *A, const T *B, Beta beta, TC *C, Index incRowC, Index incColC) { const Index MR = BlockSize::MR; const Index NR = BlockSize::NR; const Index mp = (mc+MR-1) / MR; const Index np = (nc+NR-1) / NR; const Index mr_ = mc % MR; const Index nr_ = nc % NR; const T *nextA; const T *nextB; #if defined(_OPENMP) #pragma omp parallel for #endif for (Index j=0; j::MR*BlockSize::NR]; nextB = &B[j*kc*NR]; for (Index i=0; i void pack_A(const MA &A, T *p) { std::size_t mc = A.rows(); std::size_t kc = A.columns(); std::size_t MR = BlockSize::MR; std::size_t mp = (mc+MR-1) / MR; for (std::size_t j=0; j void pack_B(const MB &B, T *p) { std::size_t kc = B.rows(); std::size_t nc = B.columns(); std::size_t NR = BlockSize::NR; std::size_t np = (nc+NR-1) / NR; for (std::size_t l=0; l void gemm(Alpha alpha, const MatrixA &A, const MatrixB &B, Beta beta, MatrixC &C) { assert((~A).columns()==(~B).rows()); assert((~C).rows()==(~A).rows()); assert((~C).columns()==(~B).columns()); typedef typename MatrixA::ElementType TA; typedef typename MatrixB::ElementType TB; typedef typename MatrixC::ElementType TC; typedef typename std::common_type::type T; const std::size_t m = (~C).rows(); const std::size_t n = (~C).columns(); const std::size_t k = (~A).rows(); // Here we should choose block sizes at runtime based on the problem size const std::size_t MC = BlockSize::MC; const std::size_t NC = BlockSize::NC; const std::size_t KC = BlockSize::KC; const std::size_t MR = BlockSize::MR; const std::size_t NR = BlockSize::NR; const std::size_t mb = (m+MC-1) / MC; const std::size_t nb = (n+NC-1) / NC; const std::size_t kb = (k+KC-1) / KC; const std::size_t mc_ = m % MC; const std::size_t nc_ = n % NC; const std::size_t kc_ = k % KC; if (m==0 || n==0 || ((alpha==Alpha(0) || k==0) && (beta==Beta(1)))) { return; } // Actually C is not required to be row- or col-major... TC *C_ = (~C).data(); const std::size_t incRowC = blaze::IsRowMajorMatrix::value ? (~C).spacing() : 1; const std::size_t incColC = blaze::IsRowMajorMatrix::value ? 1 : (~C).spacing(); // Here we should use unique pointers for the buffers A_ and B_ T *A_ = (T*) malloc_aligned(BlockSize::align, sizeof(T)*(MC*KC+MR)); T *B_ = (T*) malloc_aligned(BlockSize::align, sizeof(T)*(KC*NC+NR)); if (alpha==Alpha(0) || k==0) { gescal(beta, ~C); return; } for (std::size_t j=0; j