Content |
Using OpenMP
Changes:
-
gemm.hpp uses OpenMP if you compile with -fopenmp.
Tar-Ball for this Session
The tar-ball session3.tgz contains the files:
$shell> tar tfvz session3.tgz -rw-rw-r-- lehn/num 5179 2016-02-02 18:16 session3/matprod.cc -rw-rw-r-- lehn/num 17181 2016-02-02 18:14 session3/avx.hpp -rw-rw-r-- lehn/num 33544 2016-02-02 19:53 session3/fma.hpp -rw-rw-r-- lehn/num 8794 2016-02-02 19:51 session3/gemm.hpp $shell>
Compile and Run Benchmark
For a quick benchmark we again limit \(m\) to \(500\) with -DM_MAX=500. This time we also enable the optimized micro kernel for AVX with -DHAVE_AVX:
$shell> g++ -Ofast -mavx -Wall -std=c++11 -DNDEBUG -DHAVE_AVX -fopenmp -DM_MAX=500 -I ../boost_1_60_0 matprod.cc $shell> ./a.out # m n k uBLAS: t1 MFLOPS Blocked: t2 MFLOPS Residual 100 100 100 0.000817 2447.98 0.00032 6250 0 200 200 200 0.003891 4112.05 0.000412 38835 0 300 300 300 0.010135 5328.07 0.001123 48085.5 1.9329e-16 400 400 400 0.023686 5404.04 0.002335 54818 8.42823e-17 500 500 500 0.04592 5444.25 0.003905 64020.5 3.53681e-17 $shell>
Core Function for Matrix-Matrix Produkt
// Code extracted from ulmBLAS: https://github.com/michael-lehn/ulmBLAS-core #ifndef GEMM_HPP #define GEMM_HPP #include <algorithm> #include <cstdlib> #if defined(_OPENMP) #include <omp.h> #endif //-- new with alignment -------------------------------------------------------- void * malloc_(std::size_t alignment, std::size_t size) { alignment = std::max(alignment, alignof(void *)); size += alignment; void *ptr = std::malloc(size); void *ptr2 = (void *)(((uintptr_t)ptr + alignment) & ~(alignment-1)); void **vp = (void**) ptr2 - 1; *vp = ptr; return ptr2; } void free_(void *ptr) { std::free(*((void**)ptr-1)); } //-- Config -------------------------------------------------------------------- #ifdef HAVE_FMA # ifndef BS_D_MR # define BS_D_MR 4 # endif # ifndef BS_D_NR # define BS_D_NR 12 # endif # ifndef BS_D_MC # define BS_D_MC 256 # endif # ifndef BS_D_KC # define BS_D_KC 512 # endif # ifndef BS_D_NC # define BS_D_NC 4092 # endif # ifndef BS_D_ALIGN # define BS_D_ALIGN 32 # endif #endif #ifndef BS_D_MR #define BS_D_MR 4 #endif #ifndef BS_D_NR #define BS_D_NR 8 #endif #ifndef BS_D_MC #define BS_D_MC 256 #endif #ifndef BS_D_KC #define BS_D_KC 256 #endif #ifndef BS_D_NC #define BS_D_NC 4096 #endif #ifndef BS_D_ALIGN #define BS_D_ALIGN 32 #endif template <typename T> struct BlockSize { static const int MC = 64; static const int KC = 64; static const int NC = 256; static const int MR = 8; static const int NR = 8; static const int align = alignof(T); static_assert(MC>0 && KC>0 && NC>0 && MR>0 && NR>0, "Invalid block size."); static_assert(MC % MR == 0, "MC must be a multiple of MR."); static_assert(NC % NR == 0, "NC must be a multiple of NR."); }; template <> struct BlockSize<double> { static const int MC = BS_D_MC; static const int KC = BS_D_KC; static const int NC = BS_D_NC; static const int MR = BS_D_MR; static const int NR = BS_D_NR; static const int align = BS_D_ALIGN; static_assert(MC>0 && KC>0 && NC>0 && MR>0 && NR>0, "Invalid block size."); static_assert(MC % MR == 0, "MC must be a multiple of MR."); static_assert(NC % NR == 0, "NC must be a multiple of NR."); }; //-- aux routines -------------------------------------------------------------- template <typename Index, typename Alpha, typename TX, typename TY> void geaxpy(Index m, Index n, const Alpha &alpha, const TX *X, Index incRowX, Index incColX, TY *Y, Index incRowY, Index incColY) { for (Index j=0; j<n; ++j) { for (Index i=0; i<m; ++i) { Y[i*incRowY+j*incColY] += alpha*X[i*incRowX+j*incColX]; } } } template <typename Index, typename Alpha, typename TX> void gescal(Index m, Index n, const Alpha &alpha, TX *X, Index incRowX, Index incColX) { if (alpha!=Alpha(0)) { for (Index j=0; j<n; ++j) { for (Index i=0; i<m; ++i) { X[i*incRowX+j*incColX] *= alpha; } } } else { for (Index j=0; j<n; ++j) { for (Index i=0; i<m; ++i) { X[i*incRowX+j*incColX] = Alpha(0); } } } } //-- Micro Kernel -------------------------------------------------------------- template <typename Index, typename T> void ugemm(Index kc, T alpha, const T *A, const T *B, T beta, T *C, Index incRowC, Index incColC) { const Index MR = BlockSize<T>::MR; const Index NR = BlockSize<T>::NR; T P[BlockSize<T>::MR*BlockSize<T>::NR]; for (Index l=0; l<MR*NR; ++l) { P[l] = 0; } for (Index l=0; l<kc; ++l) { for (Index j=0; j<NR; ++j) { for (Index i=0; i<MR; ++i) { P[i+j*MR] += A[i+l*MR]*B[l*NR+j]; } } } for (Index j=0; j<NR; ++j) { for (Index i=0; i<MR; ++i) { C[i*incRowC+j*incColC] *= beta; C[i*incRowC+j*incColC] += alpha*P[i+j*MR]; } } } #ifdef HAVE_AVX #include "avx.hpp" #endif #ifdef HAVE_FMA #include "fma.hpp" #endif //-- Macro Kernel -------------------------------------------------------------- template <typename Index, typename T, typename Beta, typename TC> void mgemm(Index mc, Index nc, Index kc, T alpha, const T *A, const T *B, Beta beta, TC *C, Index incRowC, Index incColC) { const Index MR = BlockSize<T>::MR; const Index NR = BlockSize<T>::NR; const Index mp = (mc+MR-1) / MR; const Index np = (nc+NR-1) / NR; const Index mr_ = mc % MR; const Index nr_ = nc % NR; #pragma omp parallel for for (Index j=0; j<np; ++j) { const Index nr = (j!=np-1 || nr_==0) ? NR : nr_; T C_[MR*NR]; for (Index i=0; i<mp; ++i) { const Index mr = (i!=mp-1 || mr_==0) ? MR : mr_; if (mr==MR && nr==NR) { ugemm(kc, alpha, &A[i*kc*MR], &B[j*kc*NR], beta, &C[i*MR*incRowC+j*NR*incColC], incRowC, incColC); } else { ugemm(kc, alpha, &A[i*kc*MR], &B[j*kc*NR], T(0), C_, Index(1), MR); gescal(mr, nr, beta, &C[i*MR*incRowC+j*NR*incColC], incRowC, incColC); geaxpy(mr, nr, T(1), C_, Index(1), MR, &C[i*MR*incRowC+j*NR*incColC], incRowC, incColC); } } } } //-- Packing blocks ------------------------------------------------------------ template <typename Index, typename TA, typename T> void pack_A(Index mc, Index kc, const TA *A, Index incRowA, Index incColA, T *p) { Index MR = BlockSize<T>::MR; Index mp = (mc+MR-1) / MR; for (Index j=0; j<kc; ++j) { for (Index l=0; l<mp; ++l) { for (Index i0=0; i0<MR; ++i0) { Index i = l*MR + i0; Index nu = l*MR*kc + j*MR + i0; p[nu] = (i<mc) ? A[i*incRowA+j*incColA] : T(0); } } } } template <typename Index, typename TB, typename T> void pack_B(Index kc, Index nc, const TB *B, Index incRowB, Index incColB, T *p) { Index NR = BlockSize<T>::NR; Index np = (nc+NR-1) / NR; for (Index l=0; l<np; ++l) { for (Index j0=0; j0<NR; ++j0) { for (Index i=0; i<kc; ++i) { Index j = l*NR+j0; Index nu = l*NR*kc + i*NR + j0; p[nu] = (j<nc) ? B[i*incRowB+j*incColB] : T(0); } } } } //-- Frame routine ------------------------------------------------------------- template <typename Index, typename Alpha, typename TA, typename TB, typename Beta, typename TC> void gemm(Index m, Index n, Index k, Alpha alpha, const TA *A, Index incRowA, Index incColA, const TB *B, Index incRowB, Index incColB, Beta beta, TC *C, Index incRowC, Index incColC) { typedef typename std::common_type<Alpha, TA, TB>::type T; const Index MC = BlockSize<T>::MC; const Index NC = BlockSize<T>::NC; const Index MR = BlockSize<T>::MR; const Index NR = BlockSize<T>::NR; const Index KC = BlockSize<T>::KC; const Index mb = (m+MC-1) / MC; const Index nb = (n+NC-1) / NC; const Index kb = (k+KC-1) / KC; const Index mc_ = m % MC; const Index nc_ = n % NC; const Index kc_ = k % KC; T *A_ = (T*) malloc_(BlockSize<T>::align, sizeof(T)*(MC*KC+MR)); T *B_ = (T*) malloc_(BlockSize<T>::align, sizeof(T)*(KC*NC+NR)); if (alpha==Alpha(0) || k==0) { gescal(m, n, beta, C, incRowC, incColC); return; } for (Index j=0; j<nb; ++j) { Index nc = (j!=nb-1 || nc_==0) ? NC : nc_; for (Index l=0; l<kb; ++l) { Index kc = (l!=kb-1 || kc_==0) ? KC : kc_; Beta beta_ = (l==0) ? beta : Beta(1); pack_B(kc, nc, &B[l*KC*incRowB+j*NC*incColB], incRowB, incColB, B_); for (Index i=0; i<mb; ++i) { Index mc = (i!=mb-1 || mc_==0) ? MC : mc_; pack_A(mc, kc, &A[i*MC*incRowA+l*KC*incColA], incRowA, incColA, A_); mgemm(mc, nc, kc, T(alpha), A_, B_, beta_, &C[i*MC*incRowC+j*NC*incColC], incRowC, incColC); } } } free_(A_); free_(B_); } #endif
Benchmark Results
$shell> g++ -Ofast -mavx -Wall -std=c++11 -DHAVE_AVX -fopenmp -DNDEBUG -I ../boost_1_60_0 matprod.cc $shell> ./a.out > report.session3 $shell> cat report.session3 # m n k uBLAS: t1 MFLOPS Blocked: t2 MFLOPS Residual 100 100 100 0.00081 2469.14 0.000358 5586.59 0 200 200 200 0.00684 2339.18 0.000853 18757.3 0 300 300 300 0.010001 5399.46 0.001127 47914.8 1.94023e-16 400 400 400 0.0236 5423.73 0.002371 53985.7 8.38768e-17 500 500 500 0.04566 5475.25 0.003914 63873.3 3.52432e-17 600 600 600 0.079563 5429.66 0.006525 66206.9 1.63687e-17 700 700 700 0.132763 5167.1 0.009794 70042.9 8.44612e-18 800 800 800 0.210614 4861.97 0.015314 66866.9 4.71434e-18 900 900 900 0.327674 4449.54 0.020348 71653.2 2.79477e-18 1000 1000 1000 0.48001 4166.58 0.027266 73351.4 1.7654e-18 1100 1100 1100 0.664244 4007.56 0.037167 71622.7 1.14802e-18 1200 1200 1200 0.875123 3949.16 0.048052 71922.1 7.78787e-19 1300 1300 1300 1.08635 4044.73 0.058962 74522.6 5.45706e-19 1400 1400 1400 1.40024 3919.34 0.071781 76454.8 3.89556e-19 1500 1500 1500 1.72635 3909.98 0.08555 78901.2 2.86468e-19 1600 1600 1600 2.09328 3913.47 0.124991 65540.7 2.14057e-19 1700 1700 1700 2.51353 3909.25 0.124764 78756.7 1.62835e-19 1800 1800 1800 2.98552 3906.86 0.151215 77135.2 1.26232e-19 1900 1900 1900 3.49862 3920.97 0.179255 76527.9 9.85343e-20 2000 2000 2000 4.0474 3953.16 0.206362 77533.7 7.82848e-20 2100 2100 2100 4.76861 3884.15 0.23111 80143.7 6.28965e-20 2200 2200 2200 5.61489 3792.77 0.262311 81186.1 5.08613e-20 2300 2300 2300 6.54425 3718.38 0.332712 73138.3 4.17123e-20 2400 2400 2400 7.45089 3710.7 0.349646 79074.3 3.4353e-20 2500 2500 2500 8.44021 3702.51 0.380638 82099 2.8591e-20 2600 2600 2600 9.49611 3701.72 0.435837 80654 2.3931e-20 2700 2700 2700 10.6711 3689.04 0.477382 82462.3 2.01823e-20 2800 2800 2800 11.9161 3684.44 0.544741 80596.1 1.71307e-20 2900 2900 2900 13.2557 3679.76 0.593047 82249.8 1.4607e-20 3000 3000 3000 14.6075 3696.74 0.651973 82825.5 1.25212e-20 3100 3100 3100 16.1247 3695.09 0.715059 83324.6 1.08144e-20 3200 3200 3200 17.9785 3645.25 0.819091 80010.6 9.34998e-21 3300 3300 3300 19.5702 3672.63 0.854799 84082.9 8.15112e-21 3400 3400 3400 21.3683 3678.71 0.954719 82336.3 7.11691e-21 3500 3500 3500 23.279 3683.59 1.01965 84097.6 6.23425e-21 3600 3600 3600 25.2948 3688.97 1.14405 81562.6 5.4966e-21 3700 3700 3700 27.5727 3674.14 1.19846 84530.2 4.84832e-21 3800 3800 3800 29.9404 3665.42 1.33689 82089 4.3006e-21 3900 3900 3900 32.3364 3668.87 1.40714 84311.4 3.82669e-21 4000 4000 4000 34.7918 3679.03 1.52644 83855.2 3.41037e-21 $shell> gnuplot plot.session3.mflops $shell> gnuplot plot.session3.time $shell> gnuplot plot.session3.time_log $shell>