Lösungsvorschlag
#ifndef HPC_ULMBLAS_GEMM_H #define HPC_ULMBLAS_GEMM_H 1 #include <complex> #include <type_traits> #include <hpc/ulmblas/blocksize.h> #include <hpc/ulmblas/gescal.h> #include <hpc/ulmblas/pack.h> #include <hpc/ulmblas/mgemm.h> #include <hpc/ulmblas/malloc.h> #include <hpc/ulmblas/nukes/ugemm.h> #include <cstdlib> namespace hpc { namespace ulmblas { template <typename Index, typename Alpha, typename TA, typename TB, typename Beta, typename TC> void gemm(Index m, Index n, Index k, Alpha alpha, const TA *A, Index incRowA, Index incColA, const TB *B, Index incRowB, Index incColB, Beta beta, TC *C, Index incRowC, Index incColC) { typedef typename std::common_type<Alpha, TA, TB>::type T; const Index MC = BlockSize<T>::MC; const Index NC = BlockSize<T>::NC; const Index KC = BlockSize<T>::KC; const Index mb = (m+MC-1) / MC; const Index nb = (n+NC-1) / NC; const Index kb = (k+KC-1) / KC; const Index mc_ = m % MC; const Index nc_ = n % NC; const Index kc_ = k % KC; T *A_ = malloc<T>(MC*KC + MR); T *B_ = malloc<T>(KC*NC + NR); if (alpha==Alpha(0) || k==0) { gescal(m, n, beta, C, incRowC, incColC); return; } for (Index j=0; j<nb; ++j) { Index nc = (j!=nb-1 || nc_==0) ? NC : nc_; for (Index l=0; l<kb; ++l) { Index kc = (l!=kb-1 || kc_==0) ? KC : kc_; Beta beta_ = (l==0) ? beta : Beta(1); pack_B(kc, nc, &B[l*KC*incRowB+j*NC*incColB], incRowB, incColB, B_); for (Index i=0; i<mb; ++i) { Index mc = (i!=mb-1 || mc_==0) ? MC : mc_; pack_A(mc, kc, &A[i*MC*incRowA+l*KC*incColA], incRowA, incColA, A_); mgemm(mc, nc, kc, T(alpha), A_, B_, beta_, &C[i*MC*incRowC+j*NC*incColC], incRowC, incColC); } } } free(A_); free(B_); } } } // namespace ulmblas, hpc #endif // HPC_ULMBLAS_GEMM_H
#ifndef HPC_ULMBLAS_MALLOC_H #define HPC_ULMBLAS_MALLOC_H 1 #include <cstddef> #ifdef MEM_ALIGN # include <xmmintrin.h> #endif namespace hpc { namespace ulmblas { template <typename T> T * malloc(std::size_t n) { #ifdef MEM_ALIGN return reinterpret_cast<T *>(_mm_malloc(n*sizeof(T), MEM_ALIGN)); # else return new T[n]; # endif } template <typename T> void free(T *block) { #ifdef MEM_ALIGN _mm_free(reinterpret_cast<void *>(block)); # else delete [] block; # endif } } } // namespace ulmblas, hpc #endif // HPC_ULMBLAS_MALLOC_H
#ifndef HPC_ULMBLAS_CONFIG_H #define HPC_ULMBLAS_CONFIG_H 1 #ifdef SSE #define MEM_ALIGN 16 #elif defined(AVX) || defined(FMA) #define MEM_ALIGN 32 #endif // // default // #ifndef DEFAULT_BLOCKSIZE_MR #define DEFAULT_BLOCKSIZE_MR 8 #endif #ifndef DEFAULT_BLOCKSIZE_NR #define DEFAULT_BLOCKSIZE_NR 8 #endif #ifndef DEFAULT_BLOCKSIZE_MC #define DEFAULT_BLOCKSIZE_MC 64 #endif #ifndef DEFAULT_BLOCKSIZE_KC #define DEFAULT_BLOCKSIZE_KC 64 #endif #ifndef DEFAULT_BLOCKSIZE_NC #define DEFAULT_BLOCKSIZE_NC 256 #endif // // single precision(float) // #ifndef S_BLOCKSIZE_MR #define S_BLOCKSIZE_MR 4 #endif #ifndef S_BLOCKSIZE_NR #define S_BLOCKSIZE_NR 4 #endif #ifndef S_BLOCKSIZE_MC #define S_BLOCKSIZE_MC 512 #endif #ifndef S_BLOCKSIZE_KC #define S_BLOCKSIZE_KC 512 #endif #ifndef S_BLOCKSIZE_NC #define S_BLOCKSIZE_NC 4096 #endif // // double precision(double) // #ifndef D_BLOCKSIZE_MR #define D_BLOCKSIZE_MR 4 #endif #ifndef D_BLOCKSIZE_NR #define D_BLOCKSIZE_NR 4 #endif #ifndef D_BLOCKSIZE_MC #define D_BLOCKSIZE_MC 256 #endif #ifndef D_BLOCKSIZE_KC #define D_BLOCKSIZE_KC 256 #endif #ifndef D_BLOCKSIZE_NC #define D_BLOCKSIZE_NC 4096 #endif // // complex single precision(std::complex<float>) // #ifndef C_BLOCKSIZE_MR #define C_BLOCKSIZE_MR 4 #endif #ifndef C_BLOCKSIZE_NR #define C_BLOCKSIZE_NR 8 #endif #ifndef C_BLOCKSIZE_MC #define C_BLOCKSIZE_MC 256 #endif #ifndef C_BLOCKSIZE_KC #define C_BLOCKSIZE_KC 256 #endif #ifndef C_BLOCKSIZE_NC #define C_BLOCKSIZE_NC 4096 #endif // // complex double precision(std::complex<double>) // #ifndef Z_BLOCKSIZE_MR #define Z_BLOCKSIZE_MR 4 #endif #ifndef Z_BLOCKSIZE_NR #define Z_BLOCKSIZE_NR 4 #endif #ifndef Z_BLOCKSIZE_MC #define Z_BLOCKSIZE_MC 256 #endif #ifndef Z_BLOCKSIZE_KC #define Z_BLOCKSIZE_KC 128 #endif #ifndef Z_BLOCKSIZE_NC #define Z_BLOCKSIZE_NC 4096 #endif #endif // HPC_ULMBLAS_BLOCKSIZE_H