Content

Using GCC Vector-Extensions for Micro-Kernels

Tar-Ball for this Session

The tar-ball session5.tgz contains the files:

$shell> tar tfvz session5.tgz
-rw-rw-r-- lehn/num       5356 2016-01-31 11:28 session5/matprod.cc
-rw-rw-r-- lehn/num      17181 2016-02-02 18:17 session5/avx.hpp
-rw-rw-r-- lehn/num      33544 2016-02-02 19:53 session5/fma.hpp
-rw-rw-r-- lehn/num       1898 2016-02-02 17:11 session5/gccvec.hpp
-rw-rw-r-- lehn/num       3353 2016-02-02 01:34 session5/gccvec2.hpp
-rw-rw-r-- lehn/num      10291 2016-02-02 19:50 session5/gemm.hpp
$shell> 

Compiling for FMA or AVX

Compile and Run Benchmark

The machine on which this page is generated only supports AVX. Also note that the GCC vector extensions require a recent GCC compiler. Here we will use GCC 5.3:

$shell> g++-5.3 -Wall -DNDEBUG -mavx -Ofast -I ../../boost_1_60_0/ -std=c++11 -DHAVE_GCCVEC -DM_MAX=500 matprod.cc
$shell> ./a.out
#   m     n     k  uBLAS:   t1       MFLOPS   Blocked:   t2      MFLOPS        Residual
  100   100   100  0.000915665       2184.2     0.000364562      5486.04               0
  200   200   200   0.00735024       2176.8       0.0021132      7571.46               0
  300   300   300    0.0105842      5101.97      0.00308739      17490.5     1.92458e-16
  400   400   400    0.0243484      5257.01      0.00691159      18519.6     8.41365e-17
  500   500   500    0.0473068      5284.65       0.0129615      19287.9     3.54665e-17
$shell> 

Micro-Kernel

#ifndef GCCVEC_HPP
#define GCCVEC_HPP

#include "gemm.hpp"
#include <type_traits>

//-- Micro Kernel --------------------------------------------------------------
// Micro-kernel built on GCC vector extensions; it is only enabled when
// BlockSize<T>::vlen != 0.
//
// For l = 0 .. kc-1 it reads MR scalars from A and BlockSize<T>::NR/vlen
// vectors (vlen elements each) from B, and accumulates scalar*vector products
// into a local tile.  The tile is then scaled by alpha and merged into C:
//
//     C <- beta*C + alpha*(accumulated products)
//
//  kc                number of accumulation steps
//  alpha, beta       scaling factors; for beta == 0 the tile is stored to C
//                    without reading the previous contents of C
//  A, B              input buffers; the compiler is told that both are aligned
//                    to BlockSize<T>::align
//  C                 output tile; element (i,j) lives at C[i*incRowC+j*incColC]
//  incRowC, incColC  row and column strides of C
template <typename Index, typename T>
typename std::enable_if<BlockSize<T>::vlen != 0,
         void>::type
ugemm(Index kc, T alpha, const T *A, const T *B, T beta,
      T *C, Index incRowC, Index incColC)
{
    // SIMD vector type holding rwidth/8 bytes worth of T.
    typedef T vx __attribute__((vector_size (BlockSize<T>::rwidth/8)));

    constexpr Index vlen       = BlockSize<T>::vlen;
    constexpr Index rows       = BlockSize<T>::MR;
    constexpr Index vecsPerRow = BlockSize<T>::NR/vlen;
    constexpr Index tileSize   = rows*vecsPerRow;

    // Promise the alignment of both inputs so aligned loads can be emitted.
    A = static_cast<const T *>(__builtin_assume_aligned (A, BlockSize<T>::align));
    B = static_cast<const T *>(__builtin_assume_aligned (B, BlockSize<T>::align));

    // Accumulator tile, stored row by row; starts out as all zeros.
    vx acc[tileSize] = {};

    for (Index step=0; step<kc; ++step, A+=rows, B+=vlen*vecsPerRow) {
        const vx *bVec = reinterpret_cast<const vx *>(B);

        for (Index r=0; r<rows; ++r) {
            const T  a      = A[r];
            vx      *accRow = &acc[r*vecsPerRow];

            for (Index c=0; c<vecsPerRow; ++c) {
                // scalar * vector: the scalar is applied to every lane
                accRow[c] += a*bVec[c];
            }
        }
    }

    // Scaling by alpha == 1 would be a no-op, so it is skipped.
    if (alpha!=T(1)) {
        for (Index idx=0; idx<tileSize; ++idx) {
            acc[idx] *= alpha;
        }
    }

    const bool keepOldC = (beta!=T(0));

    for (Index r=0; r<rows; ++r) {
        for (Index c=0; c<vecsPerRow; ++c) {
            // View the lanes of one accumulator vector as plain scalars.
            const T *lane = reinterpret_cast<const T *>(&acc[r*vecsPerRow+c]);

            for (Index k=0; k<vlen; ++k) {
                T &dst = C[r*incRowC+(c*vlen+k)*incColC];

                if (keepOldC) {
                    dst *= beta;
                    dst += lane[k];
                } else {
                    // beta == 0: overwrite, never read the old value of C
                    dst = lane[k];
                }
            }
        }
    }
}

#endif // GCCVEC_HPP

Modified gemm.hpp

#ifndef GCCVEC_HPP
#define GCCVEC_HPP

#include "gemm.hpp"
#include <type_traits>

//-- Micro Kernel --------------------------------------------------------------
// Micro-kernel built on GCC vector extensions; it is only enabled when
// BlockSize<T>::vlen != 0.
//
// For l = 0 .. kc-1 it reads MR scalars from A and BlockSize<T>::NR/vlen
// vectors (vlen elements each) from B, and accumulates scalar*vector products
// into a local tile.  The tile is then scaled by alpha and merged into C:
//
//     C <- beta*C + alpha*(accumulated products)
//
//  kc                number of accumulation steps
//  alpha, beta       scaling factors; for beta == 0 the tile is stored to C
//                    without reading the previous contents of C
//  A, B              input buffers; the compiler is told that both are aligned
//                    to BlockSize<T>::align
//  C                 output tile; element (i,j) lives at C[i*incRowC+j*incColC]
//  incRowC, incColC  row and column strides of C
template <typename Index, typename T>
typename std::enable_if<BlockSize<T>::vlen != 0,
         void>::type
ugemm(Index kc, T alpha, const T *A, const T *B, T beta,
      T *C, Index incRowC, Index incColC)
{
    // SIMD vector type holding rwidth/8 bytes worth of T.
    typedef T vx __attribute__((vector_size (BlockSize<T>::rwidth/8)));

    constexpr Index vlen       = BlockSize<T>::vlen;
    constexpr Index rows       = BlockSize<T>::MR;
    constexpr Index vecsPerRow = BlockSize<T>::NR/vlen;
    constexpr Index tileSize   = rows*vecsPerRow;

    // Promise the alignment of both inputs so aligned loads can be emitted.
    A = static_cast<const T *>(__builtin_assume_aligned (A, BlockSize<T>::align));
    B = static_cast<const T *>(__builtin_assume_aligned (B, BlockSize<T>::align));

    // Accumulator tile, stored row by row; starts out as all zeros.
    vx acc[tileSize] = {};

    for (Index step=0; step<kc; ++step, A+=rows, B+=vlen*vecsPerRow) {
        const vx *bVec = reinterpret_cast<const vx *>(B);

        for (Index r=0; r<rows; ++r) {
            const T  a      = A[r];
            vx      *accRow = &acc[r*vecsPerRow];

            for (Index c=0; c<vecsPerRow; ++c) {
                // scalar * vector: the scalar is applied to every lane
                accRow[c] += a*bVec[c];
            }
        }
    }

    // Scaling by alpha == 1 would be a no-op, so it is skipped.
    if (alpha!=T(1)) {
        for (Index idx=0; idx<tileSize; ++idx) {
            acc[idx] *= alpha;
        }
    }

    const bool keepOldC = (beta!=T(0));

    for (Index r=0; r<rows; ++r) {
        for (Index c=0; c<vecsPerRow; ++c) {
            // View the lanes of one accumulator vector as plain scalars.
            const T *lane = reinterpret_cast<const T *>(&acc[r*vecsPerRow+c]);

            for (Index k=0; k<vlen; ++k) {
                T &dst = C[r*incRowC+(c*vlen+k)*incColC];

                if (keepOldC) {
                    dst *= beta;
                    dst += lane[k];
                } else {
                    // beta == 0: overwrite, never read the old value of C
                    dst = lane[k];
                }
            }
        }
    }
}

#endif // GCCVEC_HPP

Benchmark Results for GEMM: Single-Threaded

Results

$shell> g++-5.3 -Wall -DNDEBUG -mavx -Ofast -I ../../boost_1_60_0/ -std=c++11 -DHAVE_GCCVEC matprod.cc
$shell> ./a.out  > report.session5
$shell> cat report.session5
#   m     n     k  uBLAS:   t1       MFLOPS   Blocked:   t2      MFLOPS        Residual
  100   100   100  0.000911962      2193.07     0.000365959      5465.09               0
  200   200   200   0.00735283      2176.03      0.00208917      7658.54               0
  300   300   300    0.0106023      5093.23      0.00310073      17415.3     1.94333e-16
  400   400   400    0.0242957      5268.42      0.00690516      18536.9     8.42829e-17
  500   500   500    0.0471917      5297.54       0.0129306        19334     3.51248e-17
  600   600   600    0.0821204      5260.57       0.0226174      19100.3     1.63638e-17
  700   700   700     0.135061      5079.19        0.035084      19553.1     8.47304e-18
  800   800   800      0.21912      4673.24        0.054134        18916     4.72651e-18
  900   900   900     0.334129      4363.59       0.0755758      19291.9     2.79897e-18
 1000  1000  1000     0.489293      4087.53        0.102529      19506.7     1.76203e-18
 1100  1100  1100     0.684217      3890.58        0.138813      19176.9     1.14774e-18
 1200  1200  1200     0.883973      3909.62        0.177891      19427.6     7.78524e-19
 1300  1300  1300      1.13136      3883.82        0.227143      19344.7     5.44382e-19
 1400  1400  1400      1.40896      3895.08        0.276579      19842.4     3.89672e-19
 1500  1500  1500       1.7419      3875.09        0.340231      19839.5     2.85985e-19
 1600  1600  1600      2.10443      3892.73        0.448085      18282.3     2.14244e-19
 1700  1700  1700      2.55316      3848.56        0.496574      19787.6     1.63099e-19
 1800  1800  1800      3.03558      3842.43        0.592272      19693.7     1.26206e-19
 1900  1900  1900      3.59253      3818.48        0.709155      19344.1     9.86444e-20
 2000  2000  2000      4.19097      3817.73        0.795291      20118.4     7.82567e-20
 2100  2100  2100      4.96059      3733.83        0.935513      19798.8     6.28397e-20
 2200  2200  2200      5.78887      3678.79          1.0466      20347.9     5.08788e-20
 2300  2300  2300      6.72603      3617.89         1.27558      19076.8     4.17594e-20
 2400  2400  2400      7.66609      3606.53         1.37846      20057.2     3.43472e-20
 2500  2500  2500      8.74354      3574.07         1.56127      20015.8     2.85566e-20
 2600  2600  2600      9.85449       3567.1         1.74043      20197.3     2.39598e-20
 2700  2700  2700      11.0877      3550.41         1.92683      20430.4     2.01726e-20
 2800  2800  2800      12.3331      3559.85          2.1434      20483.3     1.71317e-20
 2900  2900  2900      13.7286      3553.02         2.40989      20240.8     1.46133e-20
 3000  3000  3000      15.1355      3567.78         2.62543        20568     1.25247e-20
 3100  3100  3100      16.7428      3558.67         2.93044      20332.1     1.08154e-20
 3200  3200  3200      18.2769      3585.72         3.28095      19974.7     9.35807e-21
 3300  3300  3300      20.2541      3548.61         3.48786      20606.9     8.14622e-21
 3400  3400  3400      22.0989       3557.1         3.84574      20440.3      7.1122e-21
 3500  3500  3500      24.0929      3559.14         4.17224      20552.5     6.23346e-21
 3600  3600  3600      26.1278      3571.36         4.58381      20356.9     5.49784e-21
 3700  3700  3700      28.5613      3546.96         4.92577      20566.5     4.85315e-21
 3800  3800  3800      31.0737      3531.74         5.43442      20194.2     4.30106e-21
 3900  3900  3900       33.416      3550.34         5.80666      20431.4      3.8242e-21
 4000  4000  4000      35.7797      3577.45         6.24483      20496.9     3.40942e-21
$shell> gnuplot plot.session5.mflops
$shell> gnuplot plot.session5.time
$shell> gnuplot plot.session5.time_log
$shell> 

MFLOPS

Time

Time with Logarithmic scale

Benchmark Results for GEMM: Multi-Threaded

Results

$shell> g++-5.3 -Wall -DNDEBUG -mavx -Ofast -I ../../boost_1_60_0/ -std=c++11 -DHAVE_GCCVEC -fopenmp matprod.cc
$shell> ./a.out  > report.mt.session5
$shell> cat report.mt.session5
#   m     n     k  uBLAS:   t1       MFLOPS   Blocked:   t2      MFLOPS        Residual
  100   100   100  0.000774379      2582.71     0.000363724      5498.67               0
  200   200   200   0.00623561      2565.91     0.000941224      16999.1               0
  300   300   300   0.00928075       5818.5      0.00129866      41581.2     1.94577e-16
  400   400   400     0.021505       5952.1       0.0027016      47379.4     8.39812e-17
  500   500   500    0.0414698      6028.49        0.004542      55041.8     3.52654e-17
  600   600   600    0.0722763      5977.06       0.0074851      57714.6     1.62898e-17
  700   700   700     0.120367      5699.23       0.0113668      60351.3     8.42435e-18
  800   800   800     0.193569      5290.11       0.0173253      59104.4     4.71521e-18
  900   900   900     0.307149      4746.88       0.0236151      61740.3     2.80991e-18
 1000  1000  1000     0.434627      4601.65       0.0345603      57869.9     1.76465e-18
 1100  1100  1100      0.58295      4566.43       0.0438241      60742.9     1.14685e-18
 1200  1200  1200     0.762115      4534.75       0.0571088      60516.1     7.77469e-19
 1300  1300  1300     0.976507      4499.71       0.0685796      64071.5     5.44636e-19
 1400  1400  1400      1.22356      4485.27       0.0831172      66027.3     3.89876e-19
 1500  1500  1500      1.51885      4444.16        0.106189      63566.1     2.86703e-19
 1600  1600  1600       1.8423      4446.62        0.140199      58431.2     2.13989e-19
 1700  1700  1700      2.23963      4387.33        0.144956      67786.2     1.62891e-19
 1800  1800  1800      2.67899      4353.87        0.173365      67280.2     1.26145e-19
 1900  1900  1900      3.17614      4319.08        0.207838      66003.5     9.85899e-20
 2000  2000  2000      3.72609      4294.04        0.237732      67302.5     7.83622e-20
 2100  2100  2100      4.45792      4154.85        0.268595      68958.8     6.28453e-20
 2200  2200  2200      5.35515      3976.74        0.304674      69897.6     5.09178e-20
 2300  2300  2300      6.30052      3862.22        0.381793        63736     4.17415e-20
 2400  2400  2400      7.27294      3801.49        0.398319      69411.7     3.43416e-20
 2500  2500  2500      8.30512      3762.74        0.441457      70788.2     2.85787e-20
 2600  2600  2600      9.36644      3752.97        0.498854      70465.5     2.39259e-20
 2700  2700  2700      10.5421      3734.16        0.554967        70934     2.01672e-20
 2800  2800  2800      11.7684      3730.66        0.623188      70450.7     1.71452e-20
 2900  2900  2900      13.0811       3728.9        0.689614      70732.3     1.46009e-20
 3000  3000  3000      14.4417      3739.17        0.752104      71798.6     1.25351e-20
 3100  3100  3100      15.9404       3737.8        0.838195      71083.7     1.08094e-20
 3200  3200  3200       17.618      3719.83        0.943513      69459.5     9.35277e-21
 3300  3300  3300      19.3273      3718.79        0.992755      72398.5     8.14846e-21
 3400  3400  3400      21.1403       3718.4         1.09192      71990.7     7.11666e-21
 3500  3500  3500      23.0198      3725.05         1.18521      72350.2     6.23645e-21
 3600  3600  3600      24.9776      3735.82          1.3149      70964.9     5.49705e-21
 3700  3700  3700      27.2811      3713.41         1.39523      72608.7     4.85061e-21
 3800  3800  3800      29.6816      3697.38         1.54116      71208.8     4.30176e-21
 3900  3900  3900       31.833      3726.89         1.63634      72502.2     3.82169e-21
 4000  4000  4000      34.3442      3726.97         1.77494      72115.3     3.40799e-21
$shell> gnuplot plot.mt.session5.mflops
$shell> gnuplot plot.mt.session5.time
$shell> gnuplot plot.mt.session5.time_log
$shell> 

MFLOPS

Time

Time with Logarithmic scale