Content |
Using GCC Vector-Extensions for Micro-Kernels
Tar-Ball for this Session
The tar-ball session5.tgz contains the files:
$shell> tar tfvz session5.tgz -rw-rw-r-- lehn/num 5356 2016-01-31 11:28 session5/matprod.cc -rw-rw-r-- lehn/num 17181 2016-02-02 18:17 session5/avx.hpp -rw-rw-r-- lehn/num 33544 2016-02-02 19:53 session5/fma.hpp -rw-rw-r-- lehn/num 1898 2016-02-02 17:11 session5/gccvec.hpp -rw-rw-r-- lehn/num 3353 2016-02-02 01:34 session5/gccvec2.hpp -rw-rw-r-- lehn/num 10291 2016-02-02 19:50 session5/gemm.hpp $shell>
Compiling for FMA or AVX
-
If you have FMA (i.e. AVX2 with fused multiply-add), compile with
g++ -Wall -DNDEBUG -mfma -Ofast -I ../../boost_1_60_0/ -std=c++11 -DHAVE_GCCVEC -DBS_D_NR=12 -DBS_D_NC=4092 matprod.cc
-
If you have AVX but not FMA, compile with
g++ -Wall -DNDEBUG -mavx -Ofast -I ../../boost_1_60_0/ -std=c++11 -DHAVE_GCCVEC matprod.cc
Compile and Run Benchmark
The machine on which this page is generated only has AVX. Also note that the GCC vector extensions need a recent GCC compiler. Here we will use GCC 5.3:
$shell> g++-5.3 -Wall -DNDEBUG -mavx -Ofast -I ../../boost_1_60_0/ -std=c++11 -DHAVE_GCCVEC -DM_MAX=500 matprod.cc $shell> ./a.out # m n k uBLAS: t1 MFLOPS Blocked: t2 MFLOPS Residual 100 100 100 0.000915665 2184.2 0.000364562 5486.04 0 200 200 200 0.00735024 2176.8 0.0021132 7571.46 0 300 300 300 0.0105842 5101.97 0.00308739 17490.5 1.92458e-16 400 400 400 0.0243484 5257.01 0.00691159 18519.6 8.41365e-17 500 500 500 0.0473068 5284.65 0.0129615 19287.9 3.54665e-17 $shell>
Micro-Kernel
#ifndef GCCVEC_HPP #define GCCVEC_HPP #include "gemm.hpp" #include <type_traits> //-- Micro Kernel -------------------------------------------------------------- template <typename Index, typename T> typename std::enable_if<BlockSize<T>::vlen != 0, void>::type ugemm(Index kc, T alpha, const T *A, const T *B, T beta, T *C, Index incRowC, Index incColC) { typedef T vx __attribute__((vector_size (BlockSize<T>::rwidth/8))); static constexpr Index vlen = BlockSize<T>::vlen; static constexpr Index MR = BlockSize<T>::MR; static constexpr Index NR = BlockSize<T>::NR/vlen; A = (const T*) __builtin_assume_aligned (A, BlockSize<T>::align); B = (const T*) __builtin_assume_aligned (B, BlockSize<T>::align); vx P[MR*NR] = {}; for (Index l=0; l<kc; ++l) { const vx *b = (const vx *)B; for (Index i=0; i<MR; ++i) { for (Index j=0; j<NR; ++j) { P[i*NR+j] += A[i]*b[j]; } } A += MR; B += vlen*NR; } if (alpha!=T(1)) { for (Index i=0; i<MR; ++i) { for (Index j=0; j<NR; ++j) { P[i*NR+j] *= alpha; } } } if (beta!=T(0)) { for (Index i=0; i<MR; ++i) { for (Index j=0; j<NR; ++j) { const T *p = (const T *) &P[i*NR+j]; for (Index j1=0; j1<vlen; ++j1) { C[i*incRowC+(j*vlen+j1)*incColC] *= beta; C[i*incRowC+(j*vlen+j1)*incColC] += p[j1]; } } } } else { for (Index i=0; i<MR; ++i) { for (Index j=0; j<NR; ++j) { const T *p = (const T *) &P[i*NR+j]; for (Index j1=0; j1<vlen; ++j1) { C[i*incRowC+(j*vlen+j1)*incColC] = p[j1]; } } } } } #endif // GCCVEC_HPP
Modified gemm.hpp
#ifndef GCCVEC_HPP #define GCCVEC_HPP #include "gemm.hpp" #include <type_traits> //-- Micro Kernel -------------------------------------------------------------- template <typename Index, typename T> typename std::enable_if<BlockSize<T>::vlen != 0, void>::type ugemm(Index kc, T alpha, const T *A, const T *B, T beta, T *C, Index incRowC, Index incColC) { typedef T vx __attribute__((vector_size (BlockSize<T>::rwidth/8))); static constexpr Index vlen = BlockSize<T>::vlen; static constexpr Index MR = BlockSize<T>::MR; static constexpr Index NR = BlockSize<T>::NR/vlen; A = (const T*) __builtin_assume_aligned (A, BlockSize<T>::align); B = (const T*) __builtin_assume_aligned (B, BlockSize<T>::align); vx P[MR*NR] = {}; for (Index l=0; l<kc; ++l) { const vx *b = (const vx *)B; for (Index i=0; i<MR; ++i) { for (Index j=0; j<NR; ++j) { P[i*NR+j] += A[i]*b[j]; } } A += MR; B += vlen*NR; } if (alpha!=T(1)) { for (Index i=0; i<MR; ++i) { for (Index j=0; j<NR; ++j) { P[i*NR+j] *= alpha; } } } if (beta!=T(0)) { for (Index i=0; i<MR; ++i) { for (Index j=0; j<NR; ++j) { const T *p = (const T *) &P[i*NR+j]; for (Index j1=0; j1<vlen; ++j1) { C[i*incRowC+(j*vlen+j1)*incColC] *= beta; C[i*incRowC+(j*vlen+j1)*incColC] += p[j1]; } } } } else { for (Index i=0; i<MR; ++i) { for (Index j=0; j<NR; ++j) { const T *p = (const T *) &P[i*NR+j]; for (Index j1=0; j1<vlen; ++j1) { C[i*incRowC+(j*vlen+j1)*incColC] = p[j1]; } } } } } #endif // GCCVEC_HPP
Benchmark Results for GEMM: Single-Threaded
Results
$shell> g++-5.3 -Wall -DNDEBUG -mavx -Ofast -I ../../boost_1_60_0/ -std=c++11 -DHAVE_GCCVEC matprod.cc $shell> ./a.out > report.session5 $shell> cat report.session5 # m n k uBLAS: t1 MFLOPS Blocked: t2 MFLOPS Residual 100 100 100 0.000911962 2193.07 0.000365959 5465.09 0 200 200 200 0.00735283 2176.03 0.00208917 7658.54 0 300 300 300 0.0106023 5093.23 0.00310073 17415.3 1.94333e-16 400 400 400 0.0242957 5268.42 0.00690516 18536.9 8.42829e-17 500 500 500 0.0471917 5297.54 0.0129306 19334 3.51248e-17 600 600 600 0.0821204 5260.57 0.0226174 19100.3 1.63638e-17 700 700 700 0.135061 5079.19 0.035084 19553.1 8.47304e-18 800 800 800 0.21912 4673.24 0.054134 18916 4.72651e-18 900 900 900 0.334129 4363.59 0.0755758 19291.9 2.79897e-18 1000 1000 1000 0.489293 4087.53 0.102529 19506.7 1.76203e-18 1100 1100 1100 0.684217 3890.58 0.138813 19176.9 1.14774e-18 1200 1200 1200 0.883973 3909.62 0.177891 19427.6 7.78524e-19 1300 1300 1300 1.13136 3883.82 0.227143 19344.7 5.44382e-19 1400 1400 1400 1.40896 3895.08 0.276579 19842.4 3.89672e-19 1500 1500 1500 1.7419 3875.09 0.340231 19839.5 2.85985e-19 1600 1600 1600 2.10443 3892.73 0.448085 18282.3 2.14244e-19 1700 1700 1700 2.55316 3848.56 0.496574 19787.6 1.63099e-19 1800 1800 1800 3.03558 3842.43 0.592272 19693.7 1.26206e-19 1900 1900 1900 3.59253 3818.48 0.709155 19344.1 9.86444e-20 2000 2000 2000 4.19097 3817.73 0.795291 20118.4 7.82567e-20 2100 2100 2100 4.96059 3733.83 0.935513 19798.8 6.28397e-20 2200 2200 2200 5.78887 3678.79 1.0466 20347.9 5.08788e-20 2300 2300 2300 6.72603 3617.89 1.27558 19076.8 4.17594e-20 2400 2400 2400 7.66609 3606.53 1.37846 20057.2 3.43472e-20 2500 2500 2500 8.74354 3574.07 1.56127 20015.8 2.85566e-20 2600 2600 2600 9.85449 3567.1 1.74043 20197.3 2.39598e-20 2700 2700 2700 11.0877 3550.41 1.92683 20430.4 2.01726e-20 2800 2800 2800 12.3331 3559.85 2.1434 20483.3 1.71317e-20 2900 2900 2900 13.7286 3553.02 2.40989 20240.8 1.46133e-20 3000 3000 3000 15.1355 3567.78 2.62543 20568 1.25247e-20 3100 3100 3100 
16.7428 3558.67 2.93044 20332.1 1.08154e-20 3200 3200 3200 18.2769 3585.72 3.28095 19974.7 9.35807e-21 3300 3300 3300 20.2541 3548.61 3.48786 20606.9 8.14622e-21 3400 3400 3400 22.0989 3557.1 3.84574 20440.3 7.1122e-21 3500 3500 3500 24.0929 3559.14 4.17224 20552.5 6.23346e-21 3600 3600 3600 26.1278 3571.36 4.58381 20356.9 5.49784e-21 3700 3700 3700 28.5613 3546.96 4.92577 20566.5 4.85315e-21 3800 3800 3800 31.0737 3531.74 5.43442 20194.2 4.30106e-21 3900 3900 3900 33.416 3550.34 5.80666 20431.4 3.8242e-21 4000 4000 4000 35.7797 3577.45 6.24483 20496.9 3.40942e-21 $shell> gnuplot plot.session5.mflops $shell> gnuplot plot.session5.time $shell> gnuplot plot.session5.time_log $shell>
MFLOPS
Time
Time with Logarithmic scale
Benchmark Results for GEMM: Multi-Threaded
Results
$shell> g++-5.3 -Wall -DNDEBUG -mavx -Ofast -I ../../boost_1_60_0/ -std=c++11 -DHAVE_GCCVEC -fopenmp matprod.cc $shell> ./a.out > report.mt.session5 $shell> cat report.mt.session5 # m n k uBLAS: t1 MFLOPS Blocked: t2 MFLOPS Residual 100 100 100 0.000774379 2582.71 0.000363724 5498.67 0 200 200 200 0.00623561 2565.91 0.000941224 16999.1 0 300 300 300 0.00928075 5818.5 0.00129866 41581.2 1.94577e-16 400 400 400 0.021505 5952.1 0.0027016 47379.4 8.39812e-17 500 500 500 0.0414698 6028.49 0.004542 55041.8 3.52654e-17 600 600 600 0.0722763 5977.06 0.0074851 57714.6 1.62898e-17 700 700 700 0.120367 5699.23 0.0113668 60351.3 8.42435e-18 800 800 800 0.193569 5290.11 0.0173253 59104.4 4.71521e-18 900 900 900 0.307149 4746.88 0.0236151 61740.3 2.80991e-18 1000 1000 1000 0.434627 4601.65 0.0345603 57869.9 1.76465e-18 1100 1100 1100 0.58295 4566.43 0.0438241 60742.9 1.14685e-18 1200 1200 1200 0.762115 4534.75 0.0571088 60516.1 7.77469e-19 1300 1300 1300 0.976507 4499.71 0.0685796 64071.5 5.44636e-19 1400 1400 1400 1.22356 4485.27 0.0831172 66027.3 3.89876e-19 1500 1500 1500 1.51885 4444.16 0.106189 63566.1 2.86703e-19 1600 1600 1600 1.8423 4446.62 0.140199 58431.2 2.13989e-19 1700 1700 1700 2.23963 4387.33 0.144956 67786.2 1.62891e-19 1800 1800 1800 2.67899 4353.87 0.173365 67280.2 1.26145e-19 1900 1900 1900 3.17614 4319.08 0.207838 66003.5 9.85899e-20 2000 2000 2000 3.72609 4294.04 0.237732 67302.5 7.83622e-20 2100 2100 2100 4.45792 4154.85 0.268595 68958.8 6.28453e-20 2200 2200 2200 5.35515 3976.74 0.304674 69897.6 5.09178e-20 2300 2300 2300 6.30052 3862.22 0.381793 63736 4.17415e-20 2400 2400 2400 7.27294 3801.49 0.398319 69411.7 3.43416e-20 2500 2500 2500 8.30512 3762.74 0.441457 70788.2 2.85787e-20 2600 2600 2600 9.36644 3752.97 0.498854 70465.5 2.39259e-20 2700 2700 2700 10.5421 3734.16 0.554967 70934 2.01672e-20 2800 2800 2800 11.7684 3730.66 0.623188 70450.7 1.71452e-20 2900 2900 2900 13.0811 3728.9 0.689614 70732.3 1.46009e-20 3000 3000 3000 14.4417 3739.17 0.752104 
71798.6 1.25351e-20 3100 3100 3100 15.9404 3737.8 0.838195 71083.7 1.08094e-20 3200 3200 3200 17.618 3719.83 0.943513 69459.5 9.35277e-21 3300 3300 3300 19.3273 3718.79 0.992755 72398.5 8.14846e-21 3400 3400 3400 21.1403 3718.4 1.09192 71990.7 7.11666e-21 3500 3500 3500 23.0198 3725.05 1.18521 72350.2 6.23645e-21 3600 3600 3600 24.9776 3735.82 1.3149 70964.9 5.49705e-21 3700 3700 3700 27.2811 3713.41 1.39523 72608.7 4.85061e-21 3800 3800 3800 29.6816 3697.38 1.54116 71208.8 4.30176e-21 3900 3900 3900 31.833 3726.89 1.63634 72502.2 3.82169e-21 4000 4000 4000 34.3442 3726.97 1.77494 72115.3 3.40799e-21 $shell> gnuplot plot.mt.session5.mflops $shell> gnuplot plot.mt.session5.time $shell> gnuplot plot.mt.session5.time_log $shell>