#ifndef HPC_GEMM_BLOCKED_H #define HPC_GEMM_BLOCKED_H 1 #include #include #include "ulmblas.h" namespace blocked { template struct BlockSize { static const int MC = 256; static const int KC = 512; static const int NC = 4096; static const int MR = 8; static const int NR = 8; static_assert(MC>0 && KC>0 && NC>0 && MR>0 && NR>0, "Invalid block size."); static_assert(MC % MR == 0, "MC must be a multiple of MR."); static_assert(NC % NR == 0, "NC must be a multiple of NR."); }; template <> struct BlockSize { static const int MC = 256; static const int KC = 512; static const int NC = 4096; static const int MR = 8; static const int NR = 8; static_assert(MC>0 && KC>0 && NC>0 && MR>0 && NR>0, "Invalid block size."); static_assert(MC % MR == 0, "MC must be a multiple of MR."); static_assert(NC % NR == 0, "NC must be a multiple of NR."); }; template <> struct BlockSize { static const int MC = 256; static const int KC = 256; static const int NC = 4096; static const int MR = 4; static const int NR = 4; static_assert(MC>0 && KC>0 && NC>0 && MR>0 && NR>0, "Invalid block size."); static_assert(MC % MR == 0, "MC must be a multiple of MR."); static_assert(NC % NR == 0, "NC must be a multiple of NR."); }; template <> struct BlockSize > { static const int MC = 256; static const int KC = 256; static const int NC = 4096; static const int MR = 4; static const int NR = 8; static_assert(MC>0 && KC>0 && NC>0 && MR>0 && NR>0, "Invalid block size."); static_assert(MC % MR == 0, "MC must be a multiple of MR."); static_assert(NC % NR == 0, "NC must be a multiple of NR."); }; template <> struct BlockSize > { static const int MC = 256; static const int KC = 128; static const int NC = 4096; static const int MR = 4; static const int NR = 4; static_assert(MC>0 && KC>0 && NC>0 && MR>0 && NR>0, "Invalid block size."); static_assert(MC % MR == 0, "MC must be a multiple of MR."); static_assert(NC % NR == 0, "NC must be a multiple of NR."); }; template void pack_A(Index mc, Index kc, const T *A, Index incRowA, Index incColA, T *p) { Index MR = BlockSize::MR; Index mp = (mc+MR-1) / MR; for (Index j=0; j void pack_B(Index kc, Index nc, const T *B, Index incRowB, Index incColB, T *p) { Index NR = BlockSize::NR; Index np = (nc+NR-1) / NR; for (Index l=0; l void ugemm(Index kc, T alpha, const T *A, const T *B, T beta, T *C, Index incRowC, Index incColC) { Index MR = BlockSize::MR; Index NR = BlockSize::NR; T P[MR*NR]; for (Index l=0; l void mgemm(Index mc, Index nc, Index kc, T alpha, const T *A, const T *B, T beta, T *C, Index incRowC, Index incColC) { Index MR = BlockSize::MR; Index NR = BlockSize::NR; T C_[MR*NR]; Index mp = (mc+MR-1) / MR; Index np = (nc+NR-1) / NR; Index mr_ = mc % MR; Index nr_ = nc % NR; for (Index j=0; j void gemm(Index m, Index n, Index k, T alpha, const T *A, Index incRowA, Index incColA, const T *B, Index incRowB, Index incColB, T beta, T *C, Index incRowC, Index incColC) { Index MC = BlockSize::MC; Index NC = BlockSize::NC; Index KC = BlockSize::KC; Index mb = (m+MC-1) / MC; Index nb = (n+NC-1) / NC; Index kb = (k+KC-1) / KC; Index mc_ = m % MC; Index nc_ = n % NC; Index kc_ = k % KC; T *A_ = new T[MC*KC]; T *B_ = new T[KC*NC]; if (alpha==T(0) || k==0) { ulmBLAS::gescal(m, n, beta, C, incRowC, incColC); return; } for (Index j=0; j