Content |
Lösungsvorschlag
Festlegung von Blockgrößen durch Macros
#ifndef HPC_ULMBLAS_CONFIG_H #define HPC_ULMBLAS_CONFIG_H 1 // // default // #ifndef DEFAULT_BLOCKSIZE_MR #define DEFAULT_BLOCKSIZE_MR 8 #endif #ifndef DEFAULT_BLOCKSIZE_NR #define DEFAULT_BLOCKSIZE_NR 8 #endif #ifndef DEFAULT_BLOCKSIZE_MC #define DEFAULT_BLOCKSIZE_MC 64 #endif #ifndef DEFAULT_BLOCKSIZE_KC #define DEFAULT_BLOCKSIZE_KC 64 #endif #ifndef DEFAULT_BLOCKSIZE_NC #define DEFAULT_BLOCKSIZE_NC 256 #endif // // single precision(float) // #ifndef S_BLOCKSIZE_MR #define S_BLOCKSIZE_MR 4 #endif #ifndef S_BLOCKSIZE_NR #define S_BLOCKSIZE_NR 4 #endif #ifndef S_BLOCKSIZE_MC #define S_BLOCKSIZE_MC 512 #endif #ifndef S_BLOCKSIZE_KC #define S_BLOCKSIZE_KC 512 #endif #ifndef S_BLOCKSIZE_NC #define S_BLOCKSIZE_NC 4096 #endif // // double precision(double) // #ifndef D_BLOCKSIZE_MR #define D_BLOCKSIZE_MR 4 #endif #ifndef D_BLOCKSIZE_NR #define D_BLOCKSIZE_NR 4 #endif #ifndef D_BLOCKSIZE_MC #define D_BLOCKSIZE_MC 256 #endif #ifndef D_BLOCKSIZE_KC #define D_BLOCKSIZE_KC 256 #endif #ifndef D_BLOCKSIZE_NC #define D_BLOCKSIZE_NC 4096 #endif // // complex single precision(std::complex<float>) // #ifndef C_BLOCKSIZE_MR #define C_BLOCKSIZE_MR 4 #endif #ifndef C_BLOCKSIZE_NR #define C_BLOCKSIZE_NR 8 #endif #ifndef C_BLOCKSIZE_MC #define C_BLOCKSIZE_MC 256 #endif #ifndef C_BLOCKSIZE_KC #define C_BLOCKSIZE_KC 256 #endif #ifndef C_BLOCKSIZE_NC #define C_BLOCKSIZE_NC 4096 #endif // // complex double precision(std::complex<double>) // #ifndef Z_BLOCKSIZE_MR #define Z_BLOCKSIZE_MR 4 #endif #ifndef Z_BLOCKSIZE_NR #define Z_BLOCKSIZE_NR 4 #endif #ifndef Z_BLOCKSIZE_MC #define Z_BLOCKSIZE_MC 256 #endif #ifndef Z_BLOCKSIZE_KC #define Z_BLOCKSIZE_KC 128 #endif #ifndef Z_BLOCKSIZE_NC #define Z_BLOCKSIZE_NC 4096 #endif #endif // HPC_ULMBLAS_BLOCKSIZE_H
Festlegung von Blockgrößen durch Traits
#ifndef HPC_ULMBLAS_BLOCKSIZE_H #define HPC_ULMBLAS_BLOCKSIZE_H 1 #include <complex> #include <hpc/ulmblas/config.h> namespace hpc { namespace ulmblas { template <typename T> struct BlockSize { static const int MC = DEFAULT_BLOCKSIZE_MC; static const int KC = DEFAULT_BLOCKSIZE_KC; static const int NC = DEFAULT_BLOCKSIZE_NC; static const int MR = DEFAULT_BLOCKSIZE_MR; static const int NR = DEFAULT_BLOCKSIZE_NR; static_assert(MC>0 && KC>0 && NC>0 && MR>0 && NR>0, "Invalid block size."); static_assert(MC % MR == 0, "MC must be a multiple of MR."); static_assert(NC % NR == 0, "NC must be a multiple of NR."); }; template <> struct BlockSize<float> { static const int MC = S_BLOCKSIZE_MC; static const int KC = S_BLOCKSIZE_KC; static const int NC = S_BLOCKSIZE_NC; static const int MR = S_BLOCKSIZE_MR; static const int NR = S_BLOCKSIZE_NR; static_assert(MC>0 && KC>0 && NC>0 && MR>0 && NR>0, "Invalid block size."); static_assert(MC % MR == 0, "MC must be a multiple of MR."); static_assert(NC % NR == 0, "NC must be a multiple of NR."); }; template <> struct BlockSize<double> { static const int MC = D_BLOCKSIZE_MC; static const int KC = D_BLOCKSIZE_KC; static const int NC = D_BLOCKSIZE_NC; static const int MR = D_BLOCKSIZE_MR; static const int NR = D_BLOCKSIZE_NR; static_assert(MC>0 && KC>0 && NC>0 && MR>0 && NR>0, "Invalid block size."); static_assert(MC % MR == 0, "MC must be a multiple of MR."); static_assert(NC % NR == 0, "NC must be a multiple of NR."); }; template <> struct BlockSize<std::complex<float> > { static const int MC = C_BLOCKSIZE_MC; static const int KC = C_BLOCKSIZE_KC; static const int NC = C_BLOCKSIZE_NC; static const int MR = C_BLOCKSIZE_MR; static const int NR = C_BLOCKSIZE_NR; static_assert(MC>0 && KC>0 && NC>0 && MR>0 && NR>0, "Invalid block size."); static_assert(MC % MR == 0, "MC must be a multiple of MR."); static_assert(NC % NR == 0, "NC must be a multiple of NR."); }; template <> struct BlockSize<std::complex<double> > { static const int MC = Z_BLOCKSIZE_MC; static const int KC = Z_BLOCKSIZE_KC; static const int NC = Z_BLOCKSIZE_NC; static const int MR = Z_BLOCKSIZE_MR; static const int NR = Z_BLOCKSIZE_NR; static_assert(MC>0 && KC>0 && NC>0 && MR>0 && NR>0, "Invalid block size."); static_assert(MC % MR == 0, "MC must be a multiple of MR."); static_assert(NC % NR == 0, "NC must be a multiple of NR."); }; } } // namespace ulmblas, hpc #endif // HPC_ULMBLAS_BLOCKSIZE_H
Funktionen zum Packen von Blöcken
#ifndef HPC_ULMBLAS_PACK_H #define HPC_ULMBLAS_PACK_H 1 #include <hpc/ulmblas/blocksize.h> namespace hpc { namespace ulmblas { template <typename Index, typename TA, typename T> void pack_A(Index mc, Index kc, const TA *A, Index incRowA, Index incColA, T *p) { Index MR = BlockSize<T>::MR; Index mp = (mc+MR-1) / MR; for (Index j=0; j<kc; ++j) { for (Index l=0; l<mp; ++l) { for (Index i0=0; i0<MR; ++i0) { Index i = l*MR + i0; Index nu = l*MR*kc + j*MR + i0; p[nu] = (i<mc) ? A[i*incRowA+j*incColA] : T(0); } } } } template <typename Index, typename TB, typename T> void pack_B(Index kc, Index nc, const TB *B, Index incRowB, Index incColB, T *p) { Index NR = BlockSize<T>::NR; Index np = (nc+NR-1) / NR; for (Index l=0; l<np; ++l) { for (Index j0=0; j0<NR; ++j0) { for (Index i=0; i<kc; ++i) { Index j = l*NR+j0; Index nu = l*NR*kc + i*NR + j0; p[nu] = (j<nc) ? B[i*incRowB+j*incColB] : T(0); } } } } } } // namespace ulmblas, hpc #endif // HPC_ULMBLAS_PACK_H
Macro-Kernel
#ifndef HPC_ULMBLAS_MGEMM_H #define HPC_ULMBLAS_MGEMM_H 1 #include <algorithm> #include <hpc/ulmblas/blocksize.h> #include <hpc/ulmblas/gescal.h> #include <hpc/ulmblas/geaxpy.h> #include <hpc/ulmblas/kernels/ugemm.h> namespace hpc { namespace ulmblas { template <typename Index, typename T, typename Beta, typename TC> void mgemm(Index mc, Index nc, Index kc, T alpha, const T *A, const T *B, Beta beta, TC *C, Index incRowC, Index incColC) { Index MR = BlockSize<T>::MR; Index NR = BlockSize<T>::NR; T C_[BlockSize<T>::MR*BlockSize<T>::NR]; Index mp = (mc+MR-1) / MR; Index np = (nc+NR-1) / NR; Index mr_ = mc % MR; Index nr_ = nc % NR; for (Index j=0; j<np; ++j) { Index nr = (j!=np-1 || nr_==0) ? NR : nr_; for (Index i=0; i<mp; ++i) { Index mr = (i!=mp-1 || mr_==0) ? MR : mr_; if (mr==MR && nr==NR) { ugemm(kc, alpha, &A[i*kc*MR], &B[j*kc*NR], beta, &C[i*MR*incRowC+j*NR*incColC], incRowC, incColC); } else { std::fill_n(C_, MR*NR, T(0)); ugemm(kc, alpha, &A[i*kc*MR], &B[j*kc*NR], T(0), C_, Index(1), MR); gescal(mr, nr, beta, &C[i*MR*incRowC+j*NR*incColC], incRowC, incColC); geaxpy(mr, nr, T(1), C_, Index(1), MR, &C[i*MR*incRowC+j*NR*incColC], incRowC, incColC); } } } } } } // namespace ulmblas, hpc #endif // HPC_ULMBLAS_MGEMM_H
Micro-Kernel
Include-File zum Einbinden aller Micro-Kernel
#ifndef HPC_ULMBLAS_KERNELS_UGEMM_H #define HPC_ULMBLAS_KERNELS_UGEMM_H 1 #include <hpc/ulmblas/config.h> #include <hpc/ulmblas/kernels/ugemm_ref.h> #include <hpc/ulmblas/kernels/ugemm_buf.h> #endif // HPC_ULMBLAS_KERNELS_UGEMM_H
Micro-Kernel Wrapper für inhomogene Typen
#ifndef HPC_ULMBLAS_KERNELS_UGEMM_BUF_H #define HPC_ULMBLAS_KERNELS_UGEMM_BUF_H 1 #include <hpc/ulmblas/blocksize.h> namespace hpc { namespace ulmblas { template <typename Index, typename T, typename Beta, typename TC> void ugemm(Index kc, T alpha, const T *A, const T *B, Beta beta, TC *C, Index incRowC, Index incColC) { const Index MR = BlockSize<T>::MR; const Index NR = BlockSize<T>::NR; T P[BlockSize<T>::MR*BlockSize<T>::NR]; std::fill_n(P, MR*NR, T(0)); ugemm(kc, alpha, A, B, T(0), P, Index(1), MR); gescal(MR, NR, beta, C, incRowC, incColC); geaxpy(MR, NR, T(1), P, Index(1), MR, C, incRowC, incColC); } } } // namespace ulmblas, hpc #endif // HPC_ULMBLAS_KERNELS_UGEMM_BUF_H
Referenz-Implementierung für homogene Typen
#ifndef HPC_ULMBLAS_KERNELS_UGEMM_REF_H #define HPC_ULMBLAS_KERNELS_UGEMM_REF_H 1 #include <hpc/ulmblas/blocksize.h> namespace hpc { namespace ulmblas { template <typename Index, typename T> void ugemm(Index kc, T alpha, const T *A, const T *B, T beta, T *C, Index incRowC, Index incColC) { const Index MR = BlockSize<T>::MR; const Index NR = BlockSize<T>::NR; T P[BlockSize<T>::MR*BlockSize<T>::NR]; for (Index l=0; l<MR*NR; ++l) { P[l] = 0; } for (Index l=0; l<kc; ++l) { for (Index j=0; j<NR; ++j) { for (Index i=0; i<MR; ++i) { P[i+j*MR] += A[i+l*MR]*B[l*NR+j]; } } } for (Index j=0; j<NR; ++j) { for (Index i=0; i<MR; ++i) { C[i*incRowC+j*incColC] *= beta; C[i*incRowC+j*incColC] += alpha*P[i+j*MR]; } } } } } // namespace ulmblas, hpc #endif // HPC_ULMBLAS_KERNELS_UGEMM_REF_H
Frame-Algorithmus
#ifndef HPC_ULMBLAS_GEMM_H #define HPC_ULMBLAS_GEMM_H 1 #include <complex> #include <type_traits> #include <hpc/ulmblas/blocksize.h> #include <hpc/ulmblas/gescal.h> #include <hpc/ulmblas/pack.h> #include <hpc/ulmblas/mgemm.h> #include <hpc/ulmblas/kernels/ugemm.h> #include <cstdlib> namespace hpc { namespace ulmblas { template <typename Index, typename Alpha, typename TA, typename TB, typename Beta, typename TC> void gemm(Index m, Index n, Index k, Alpha alpha, const TA *A, Index incRowA, Index incColA, const TB *B, Index incRowB, Index incColB, Beta beta, TC *C, Index incRowC, Index incColC) { typedef typename std::common_type<Alpha, TA, TB>::type T; const Index MC = BlockSize<T>::MC; const Index NC = BlockSize<T>::NC; const Index KC = BlockSize<T>::KC; const Index mb = (m+MC-1) / MC; const Index nb = (n+NC-1) / NC; const Index kb = (k+KC-1) / KC; const Index mc_ = m % MC; const Index nc_ = n % NC; const Index kc_ = k % KC; T *A_ = new T[MC*KC]; T *B_ = new T[KC*NC]; if (alpha==Alpha(0) || k==0) { gescal(m, n, beta, C, incRowC, incColC); return; } for (Index j=0; j<nb; ++j) { Index nc = (j!=nb-1 || nc_==0) ? NC : nc_; for (Index l=0; l<kb; ++l) { Index kc = (l!=kb-1 || kc_==0) ? KC : kc_; Beta beta_ = (l==0) ? beta : Beta(1); pack_B(kc, nc, &B[l*KC*incRowB+j*NC*incColB], incRowB, incColB, B_); for (Index i=0; i<mb; ++i) { Index mc = (i!=mb-1 || mc_==0) ? MC : mc_; pack_A(mc, kc, &A[i*MC*incRowA+l*KC*incColA], incRowA, incColA, A_); mgemm(mc, nc, kc, T(alpha), A_, B_, beta_, &C[i*MC*incRowC+j*NC*incColC], incRowC, incColC); } } } delete [] A_; delete [] B_; } } } // namespace ulmblas, hpc #endif // HPC_ULMBLAS_GEMM_H