Using OpenMP

Changes:

gemm.hpp uses OpenMP if you compile with -fopenmp.

Tar-Ball for this Session

The tar-ball session3.tgz contains the files:

$shell> tar tfvz session3.tgz
-rw-rw-r-- lehn/num       5179 2016-02-02 18:16 session3/matprod.cc
-rw-rw-r-- lehn/num      17181 2016-02-02 18:14 session3/avx.hpp
-rw-rw-r-- lehn/num      33544 2016-02-02 19:53 session3/fma.hpp
-rw-rw-r-- lehn/num       8794 2016-02-02 19:51 session3/gemm.hpp
$shell>

Compile and Run Benchmark

For a quick benchmark we again limit \(m\) to \(500\) with -DM_MAX=500. This time we also enable the optimized micro kernel for AVX with -DHAVE_AVX:

$shell> g++ -Ofast -mavx -Wall -std=c++11 -DNDEBUG -DHAVE_AVX -fopenmp -DM_MAX=500 -I ../boost_1_60_0 matprod.cc
$shell> ./a.out
#   m     n     k  uBLAS:   t1       MFLOPS   Blocked:   t2      MFLOPS        Residual
  100   100   100     0.000817      2447.98         0.00032         6250               0
  200   200   200     0.003891      4112.05        0.000412        38835               0
  300   300   300     0.010135      5328.07        0.001123      48085.5      1.9329e-16
  400   400   400     0.023686      5404.04        0.002335        54818     8.42823e-17
  500   500   500      0.04592      5444.25        0.003905      64020.5     3.53681e-17
$shell>

Core Function for Matrix-Matrix Produkt

session3/gemm.hpp

// Code extracted from ulmBLAS: https://github.com/michael-lehn/ulmBLAS-core

#ifndef GEMM_HPP
#define GEMM_HPP
#include <algorithm>
#include <cstdlib>

#if defined(_OPENMP)
#include <omp.h>
#endif

//-- new with alignment --------------------------------------------------------
void *
malloc_(std::size_t alignment, std::size_t size)
{
    alignment = std::max(alignment, alignof(void *));
    size     += alignment;

    void *ptr  = std::malloc(size);
    void *ptr2 = (void *)(((uintptr_t)ptr + alignment) & ~(alignment-1));
    void **vp  = (void**) ptr2 - 1;
    *vp        = ptr;
    return ptr2;
}

void
free_(void *ptr)
{
    std::free(*((void**)ptr-1));
}

//-- Config --------------------------------------------------------------------

#ifdef HAVE_FMA

#   ifndef BS_D_MR
#   define BS_D_MR 4
#   endif

#   ifndef BS_D_NR
#   define BS_D_NR 12
#   endif

#   ifndef BS_D_MC
#   define BS_D_MC 256
#   endif

#   ifndef BS_D_KC
#   define BS_D_KC 512
#   endif

#   ifndef BS_D_NC
#   define BS_D_NC 4092
#   endif

#   ifndef BS_D_ALIGN
#   define BS_D_ALIGN 32
#   endif

#endif


#ifndef BS_D_MR
#define BS_D_MR 4
#endif

#ifndef BS_D_NR
#define BS_D_NR 8
#endif

#ifndef BS_D_MC
#define BS_D_MC 256
#endif

#ifndef BS_D_KC
#define BS_D_KC 256
#endif

#ifndef BS_D_NC
#define BS_D_NC 4096
#endif

#ifndef BS_D_ALIGN
#define BS_D_ALIGN 32
#endif


template <typename T>
struct BlockSize
{
    static const int MC = 64;
    static const int KC = 64;
    static const int NC = 256;
    static const int MR = 8;
    static const int NR = 8;

    static const int align = alignof(T);

    static_assert(MC>0 && KC>0 && NC>0 && MR>0 && NR>0, "Invalid block size.");
    static_assert(MC % MR == 0, "MC must be a multiple of MR.");
    static_assert(NC % NR == 0, "NC must be a multiple of NR.");
};


template <>
struct BlockSize<double>
{
    static const int MC    = BS_D_MC;
    static const int KC    = BS_D_KC;
    static const int NC    = BS_D_NC;
    static const int MR    = BS_D_MR;
    static const int NR    = BS_D_NR;

    static const int align = BS_D_ALIGN;

    static_assert(MC>0 && KC>0 && NC>0 && MR>0 && NR>0, "Invalid block size.");
    static_assert(MC % MR == 0, "MC must be a multiple of MR.");
    static_assert(NC % NR == 0, "NC must be a multiple of NR.");
};

//-- aux routines --------------------------------------------------------------
template <typename Index, typename Alpha, typename TX, typename TY>
void
geaxpy(Index m, Index n,
       const Alpha &alpha,
       const TX *X, Index incRowX, Index incColX,
       TY       *Y, Index incRowY, Index incColY)
{
    for (Index j=0; j<n; ++j) {
        for (Index i=0; i<m; ++i) {
            Y[i*incRowY+j*incColY] += alpha*X[i*incRowX+j*incColX];
        }
    }
}

template <typename Index, typename Alpha, typename TX>
void
gescal(Index m, Index n,
       const Alpha &alpha,
       TX *X, Index incRowX, Index incColX)
{
    if (alpha!=Alpha(0)) {
        for (Index j=0; j<n; ++j) {
            for (Index i=0; i<m; ++i) {
                X[i*incRowX+j*incColX] *= alpha;
            }
        }
    } else {
        for (Index j=0; j<n; ++j) {
            for (Index i=0; i<m; ++i) {
                X[i*incRowX+j*incColX] = Alpha(0);
            }
        }
    }
}

//-- Micro Kernel --------------------------------------------------------------
template <typename Index, typename T>
void
ugemm(Index kc, T alpha,
      const T *A, const T *B,
      T beta,
      T *C, Index incRowC, Index incColC)
{
    const Index MR = BlockSize<T>::MR;
    const Index NR = BlockSize<T>::NR;
    T P[BlockSize<T>::MR*BlockSize<T>::NR];

    for (Index l=0; l<MR*NR; ++l) {
        P[l] = 0;
    }
    for (Index l=0; l<kc; ++l) {
        for (Index j=0; j<NR; ++j) {
            for (Index i=0; i<MR; ++i) {
                P[i+j*MR] += A[i+l*MR]*B[l*NR+j];
            }
        }
    }
    for (Index j=0; j<NR; ++j) {
        for (Index i=0; i<MR; ++i) {
            C[i*incRowC+j*incColC] *= beta;
            C[i*incRowC+j*incColC] += alpha*P[i+j*MR];
        }
    }
}

#ifdef HAVE_AVX
#include "avx.hpp"
#endif

#ifdef HAVE_FMA
#include "fma.hpp"
#endif

//-- Macro Kernel --------------------------------------------------------------
template <typename Index, typename T, typename Beta, typename TC>
void
mgemm(Index mc, Index nc, Index kc,
      T alpha,
      const T *A, const T *B,
      Beta beta,
      TC *C, Index incRowC, Index incColC)
{
    const Index MR = BlockSize<T>::MR;
    const Index NR = BlockSize<T>::NR;
    const Index mp  = (mc+MR-1) / MR;
    const Index np  = (nc+NR-1) / NR;
    const Index mr_ = mc % MR;
    const Index nr_ = nc % NR;

    #pragma omp parallel for
    for (Index j=0; j<np; ++j) {
        const Index nr = (j!=np-1 || nr_==0) ? NR : nr_;
        T C_[MR*NR];

        for (Index i=0; i<mp; ++i) {
            const Index mr = (i!=mp-1 || mr_==0) ? MR : mr_;

            if (mr==MR && nr==NR) {
                ugemm(kc, alpha,
                      &A[i*kc*MR], &B[j*kc*NR],
                      beta,
                      &C[i*MR*incRowC+j*NR*incColC],
                      incRowC, incColC);
            } else {
                ugemm(kc, alpha,
                      &A[i*kc*MR], &B[j*kc*NR],
                      T(0),
                      C_, Index(1), MR);
                gescal(mr, nr, beta,
                       &C[i*MR*incRowC+j*NR*incColC],
                       incRowC, incColC);
                geaxpy(mr, nr, T(1), C_, Index(1), MR,
                       &C[i*MR*incRowC+j*NR*incColC],
                       incRowC, incColC);
            }
        }
    }
}

//-- Packing blocks ------------------------------------------------------------
template <typename Index, typename TA, typename T>
void
pack_A(Index mc, Index kc,
       const TA *A, Index incRowA, Index incColA,
       T *p)
{
    Index MR = BlockSize<T>::MR;
    Index mp = (mc+MR-1) / MR;

    for (Index j=0; j<kc; ++j) {
        for (Index l=0; l<mp; ++l) {
            for (Index i0=0; i0<MR; ++i0) {
                Index i  = l*MR + i0;
                Index nu = l*MR*kc + j*MR + i0;
                p[nu]   = (i<mc) ? A[i*incRowA+j*incColA]
                                 : T(0);
            }
        }
    }
}

template <typename Index, typename TB, typename T>
void
pack_B(Index kc, Index nc,
       const TB *B, Index incRowB, Index incColB,
       T *p)
{
    Index NR = BlockSize<T>::NR;
    Index np = (nc+NR-1) / NR;

    for (Index l=0; l<np; ++l) {
        for (Index j0=0; j0<NR; ++j0) {
            for (Index i=0; i<kc; ++i) {
                Index j  = l*NR+j0;
                Index nu = l*NR*kc + i*NR + j0;
                p[nu]   = (j<nc) ? B[i*incRowB+j*incColB]
                                 : T(0);
            }
        }
    }
}

//-- Frame routine -------------------------------------------------------------
template <typename Index, typename Alpha,
         typename TA, typename TB,
         typename Beta,
         typename TC>
void
gemm(Index m, Index n, Index k,
     Alpha alpha,
     const TA *A, Index incRowA, Index incColA,
     const TB *B, Index incRowB, Index incColB,
     Beta beta,
     TC *C, Index incRowC, Index incColC)
{
    typedef typename std::common_type<Alpha, TA, TB>::type  T;

    const Index MC = BlockSize<T>::MC;
    const Index NC = BlockSize<T>::NC;
    const Index MR = BlockSize<T>::MR;
    const Index NR = BlockSize<T>::NR;

    const Index KC = BlockSize<T>::KC;
    const Index mb = (m+MC-1) / MC;
    const Index nb = (n+NC-1) / NC;
    const Index kb = (k+KC-1) / KC;
    const Index mc_ = m % MC;
    const Index nc_ = n % NC;
    const Index kc_ = k % KC;


    T *A_ = (T*) malloc_(BlockSize<T>::align, sizeof(T)*(MC*KC+MR));
    T *B_ = (T*) malloc_(BlockSize<T>::align, sizeof(T)*(KC*NC+NR));

    if (alpha==Alpha(0) || k==0) {
        gescal(m, n, beta, C, incRowC, incColC);
        return;
    }

    for (Index j=0; j<nb; ++j) {
        Index nc = (j!=nb-1 || nc_==0) ? NC : nc_;

        for (Index l=0; l<kb; ++l) {
            Index   kc  = (l!=kb-1 || kc_==0) ? KC : kc_;
            Beta beta_  = (l==0) ? beta : Beta(1);

            pack_B(kc, nc,
                   &B[l*KC*incRowB+j*NC*incColB],
                   incRowB, incColB,
                   B_);

            for (Index i=0; i<mb; ++i) {
                Index mc = (i!=mb-1 || mc_==0) ? MC : mc_;

                pack_A(mc, kc,
                       &A[i*MC*incRowA+l*KC*incColA],
                       incRowA, incColA,
                       A_);

                mgemm(mc, nc, kc,
                      T(alpha), A_, B_, beta_,
                      &C[i*MC*incRowC+j*NC*incColC],
                      incRowC, incColC);
            }
        }
    }
    free_(A_);
    free_(B_);
}

#endif

Benchmark Results

$shell> g++ -Ofast -mavx -Wall -std=c++11 -DHAVE_AVX -fopenmp -DNDEBUG -I ../boost_1_60_0 matprod.cc
$shell> ./a.out  > report.session3
$shell> cat report.session3
#   m     n     k  uBLAS:   t1       MFLOPS   Blocked:   t2      MFLOPS        Residual
  100   100   100      0.00081      2469.14        0.000358      5586.59               0
  200   200   200      0.00684      2339.18        0.000853      18757.3               0
  300   300   300     0.010001      5399.46        0.001127      47914.8     1.94023e-16
  400   400   400       0.0236      5423.73        0.002371      53985.7     8.38768e-17
  500   500   500      0.04566      5475.25        0.003914      63873.3     3.52432e-17
  600   600   600     0.079563      5429.66        0.006525      66206.9     1.63687e-17
  700   700   700     0.132763       5167.1        0.009794      70042.9     8.44612e-18
  800   800   800     0.210614      4861.97        0.015314      66866.9     4.71434e-18
  900   900   900     0.327674      4449.54        0.020348      71653.2     2.79477e-18
 1000  1000  1000      0.48001      4166.58        0.027266      73351.4      1.7654e-18
 1100  1100  1100     0.664244      4007.56        0.037167      71622.7     1.14802e-18
 1200  1200  1200     0.875123      3949.16        0.048052      71922.1     7.78787e-19
 1300  1300  1300      1.08635      4044.73        0.058962      74522.6     5.45706e-19
 1400  1400  1400      1.40024      3919.34        0.071781      76454.8     3.89556e-19
 1500  1500  1500      1.72635      3909.98         0.08555      78901.2     2.86468e-19
 1600  1600  1600      2.09328      3913.47        0.124991      65540.7     2.14057e-19
 1700  1700  1700      2.51353      3909.25        0.124764      78756.7     1.62835e-19
 1800  1800  1800      2.98552      3906.86        0.151215      77135.2     1.26232e-19
 1900  1900  1900      3.49862      3920.97        0.179255      76527.9     9.85343e-20
 2000  2000  2000       4.0474      3953.16        0.206362      77533.7     7.82848e-20
 2100  2100  2100      4.76861      3884.15         0.23111      80143.7     6.28965e-20
 2200  2200  2200      5.61489      3792.77        0.262311      81186.1     5.08613e-20
 2300  2300  2300      6.54425      3718.38        0.332712      73138.3     4.17123e-20
 2400  2400  2400      7.45089       3710.7        0.349646      79074.3      3.4353e-20
 2500  2500  2500      8.44021      3702.51        0.380638        82099      2.8591e-20
 2600  2600  2600      9.49611      3701.72        0.435837        80654      2.3931e-20
 2700  2700  2700      10.6711      3689.04        0.477382      82462.3     2.01823e-20
 2800  2800  2800      11.9161      3684.44        0.544741      80596.1     1.71307e-20
 2900  2900  2900      13.2557      3679.76        0.593047      82249.8      1.4607e-20
 3000  3000  3000      14.6075      3696.74        0.651973      82825.5     1.25212e-20
 3100  3100  3100      16.1247      3695.09        0.715059      83324.6     1.08144e-20
 3200  3200  3200      17.9785      3645.25        0.819091      80010.6     9.34998e-21
 3300  3300  3300      19.5702      3672.63        0.854799      84082.9     8.15112e-21
 3400  3400  3400      21.3683      3678.71        0.954719      82336.3     7.11691e-21
 3500  3500  3500       23.279      3683.59         1.01965      84097.6     6.23425e-21
 3600  3600  3600      25.2948      3688.97         1.14405      81562.6      5.4966e-21
 3700  3700  3700      27.5727      3674.14         1.19846      84530.2     4.84832e-21
 3800  3800  3800      29.9404      3665.42         1.33689        82089      4.3006e-21
 3900  3900  3900      32.3364      3668.87         1.40714      84311.4     3.82669e-21
 4000  4000  4000      34.7918      3679.03         1.52644      83855.2     3.41037e-21
$shell> gnuplot plot.session3.mflops
$shell> gnuplot plot.session3.time
$shell> gnuplot plot.session3.time_log
$shell>

Content

Using OpenMP

Tar-Ball for this Session

Compile and Run Benchmark

Core Function for Matrix-Matrix Produkt

Benchmark Results

MFLOPS

Time

Time with Logarithmic scale