
uBLAS-fying the GEMM

With uBLAS it is fairly straightforward to generalize the GEMM operation; the core function gemm() is listed further below.

However, due to my limited knowledge of uBLAS the implementation is not as elegant as it could be, so things will improve further.
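To give an idea of the interface, here is a minimal sketch (not part of the tar-ball) of how the frame routine gemm() gets called on plain uBLAS matrices. Sizes and values are arbitrary, matrix_proxy.hpp is included explicitly for the subrange() calls made inside gemm.hpp, and the program is compiled with the same flags as the demos below (in particular -mavx -DHAVE_AVX):

#include <iostream>
#include <boost/numeric/ublas/matrix.hpp>
#include <boost/numeric/ublas/matrix_proxy.hpp>
#include "gemm.hpp"

int main()
{
    namespace ublas = boost::numeric::ublas;

    ublas::matrix<double> A = ublas::scalar_matrix<double>(300, 200, 1.0);
    ublas::matrix<double> B = ublas::scalar_matrix<double>(200, 400, 2.0);
    ublas::matrix<double> C = ublas::zero_matrix<double>(300, 400);

    // BLAS-like update: C = 0.5*C + 1.5*A*B, computed by the blocked gemm()
    gemm(1.5, A, B, 0.5, C);

    std::cout << "C(0,0) = " << C(0,0) << std::endl;   // expect 1.5*200*2 = 600
}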

Tar-Ball for this Session

The tar-ball session4.tgz contains the files:

$shell> tar tfvz session4.tgz
-rw-rw-r-- lehn/num       5401 2016-01-27 00:35 session4/matprod.cc
-rw-rw-r-- lehn/num       6913 2016-01-27 00:53 session4/symatprod.cc
-rw-rw-r-- lehn/num      17181 2016-02-02 18:17 session4/avx.hpp
-rw-rw-r-- lehn/num      33544 2016-02-02 19:53 session4/fma.hpp
-rw-rw-r-- lehn/num       9797 2016-02-02 19:50 session4/gemm.hpp
$shell> 

Compile and Run Benchmark

For a quick benchmark we again limit the problem size to \(m = n = k \le 500\) with -DM_MAX=500. This time we also enable the optimized AVX micro kernel with -DHAVE_AVX.

There are two demos.

Demo for GEMM

$shell> g++ -Ofast -mavx -Wall -std=c++11 -DNDEBUG -DHAVE_AVX -fopenmp -DM_MAX=500 -I ../boost_1_60_0 matprod.cc
$shell> ./a.out
#   m     n     k  uBLAS:   t1       MFLOPS   Blocked:   t2      MFLOPS        Residual
  100   100   100     0.000809      2472.19        0.000337      5934.72               0
  200   200   200     0.004352      3676.47        0.000413      38740.9               0
  300   300   300     0.010032      5382.78        0.001125        48000     1.93133e-16
  400   400   400     0.023491       5448.9        0.002363      54168.4     8.44533e-17
  500   500   500     0.045753      5464.12         0.00394      63451.8     3.50952e-17
$shell> 

Demo for SYMM

$shell> g++ -Ofast -mavx -Wall -std=c++11 -DNDEBUG -DHAVE_AVX -fopenmp -DM_MAX=500 -I ../boost_1_60_0 symatprod.cc
g++: error: symatprod.cc: No such file or directory
g++: fatal error: no input files
compilation terminated.
$shell> ./a.out
#   m     n     k  uBLAS:   t1       MFLOPS   Blocked:   t2      MFLOPS        Residual
  100   100   100     0.000819         2442        0.000331       6042.3               0
  200   200   200     0.006857      2333.38        0.000837      19115.9               0
  300   300   300     0.010086      5353.96        0.001121      48171.3     1.93032e-16
  400   400   400     0.023649      5412.49        0.002339      54724.2     8.41258e-17
  500   500   500     0.045708       5469.5        0.003915        63857      3.5178e-17
$shell> 

Core Function for the Matrix-Matrix Product

// Code extracted from ulmBLAS: https://github.com/michael-lehn/ulmBLAS-core

#ifndef GEMM_HPP
#define GEMM_HPP
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <cstdlib>
#include <type_traits>

#if defined(_OPENMP)
#include <omp.h>
#endif

//-- new with alignment --------------------------------------------------------
// Allocates `size` bytes aligned to `alignment` (assumed to be a power of two).
// We over-allocate by `alignment` bytes so that an aligned address exists
// inside the buffer and store the pointer returned by std::malloc directly in
// front of it, where free_() can find it again.
void *
malloc_(std::size_t alignment, std::size_t size)
{
    alignment = std::max(alignment, alignof(void *));
    size     += alignment;

    void *ptr  = std::malloc(size);
    void *ptr2 = (void *)(((std::uintptr_t)ptr + alignment) & ~(alignment-1));
    void **vp  = (void**) ptr2 - 1;
    *vp        = ptr;
    return ptr2;
}

void
free_(void *ptr)
{
    // Release the original pointer stored right in front of the aligned block.
    std::free(*((void**)ptr-1));
}

//-- Config --------------------------------------------------------------------

// SIMD-Register width in bits
// SSE:         128
// AVX/FMA:     256
// AVX-512:     512
#ifndef SIMD_REGISTER_WIDTH
#define SIMD_REGISTER_WIDTH 256
#endif

#ifdef HAVE_FMA

#   ifndef BS_D_MR
#   define BS_D_MR 4
#   endif

#   ifndef BS_D_NR
#   define BS_D_NR 12
#   endif

#   ifndef BS_D_MC
#   define BS_D_MC 256
#   endif

#   ifndef BS_D_KC
#   define BS_D_KC 512
#   endif

#   ifndef BS_D_NC
#   define BS_D_NC 4092
#   endif

#endif


#ifndef BS_D_MR
#define BS_D_MR 4
#endif

#ifndef BS_D_NR
#define BS_D_NR 8
#endif

#ifndef BS_D_MC
#define BS_D_MC 256
#endif

#ifndef BS_D_KC
#define BS_D_KC 256
#endif

#ifndef BS_D_NC
#define BS_D_NC 4096
#endif

template <typename T>
struct BlockSize
{
    static constexpr int MC = 64;
    static constexpr int KC = 64;
    static constexpr int NC = 256;
    static constexpr int MR = 8;
    static constexpr int NR = 8;

    static constexpr int rwidth = 0;
    static constexpr int align  = alignof(T);
    static constexpr int vlen   = 0;

    static_assert(MC>0 && KC>0 && NC>0 && MR>0 && NR>0, "Invalid block size.");
    static_assert(MC % MR == 0, "MC must be a multiple of MR.");
    static_assert(NC % NR == 0, "NC must be a multiple of NR.");
};


template <>
struct BlockSize<double>
{
    static constexpr int MC     = BS_D_MC;
    static constexpr int KC     = BS_D_KC;
    static constexpr int NC     = BS_D_NC;
    static constexpr int MR     = BS_D_MR;
    static constexpr int NR     = BS_D_NR;

    static constexpr int rwidth = SIMD_REGISTER_WIDTH;
    static constexpr int align  = rwidth / 8;
    static constexpr int vlen   = rwidth / (8*sizeof(double));

    static_assert(MC>0 && KC>0 && NC>0 && MR>0 && NR>0, "Invalid block size.");
    static_assert(MC % MR == 0, "MC must be a multiple of MR.");
    static_assert(NC % NR == 0, "NC must be a multiple of NR.");
    static_assert(rwidth % sizeof(double) == 0, "SIMD register width not sane.");
};
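
// With the defaults above and SIMD_REGISTER_WIDTH == 256 (AVX), the double
// configuration resolves to MR = 4, NR = 8, MC = 256, KC = 256, NC = 4096,
// vlen = 4 doubles per SIMD register and align = 32 bytes.  With -DHAVE_FMA
// the larger 4 x 12 micro kernel with KC = 512 and NC = 4092 is used instead.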

//-- aux routines --------------------------------------------------------------
template <typename Alpha, typename MX, typename MY>
void
geaxpy(const Alpha &alpha, const MX &X, MY &Y)
{
    assert(X.size1()==Y.size1());
    assert(X.size2()==Y.size2());

    typedef typename MX::size_type  size_type;

    for (size_type j=0; j<X.size2(); ++j) {
        for (size_type i=0; i<X.size1(); ++i) {
            Y(i,j) += alpha*X(i,j);
        }
    }
}

template <typename Alpha, typename MX>
void
gescal(const Alpha &alpha, MX &X)
{
    typedef typename MX::size_type  size_type;

    for (size_type j=0; j<X.size2(); ++j) {
        for (size_type i=0; i<X.size1(); ++i) {
            X(i,j) *= alpha;
        }
    }
}

template <typename Index, typename Alpha, typename TX, typename TY>
void
geaxpy(Index m, Index n,
       const Alpha &alpha,
       const TX *X, Index incRowX, Index incColX,
       TY       *Y, Index incRowY, Index incColY)
{
    for (Index j=0; j<n; ++j) {
        for (Index i=0; i<m; ++i) {
            Y[i*incRowY+j*incColY] += alpha*X[i*incRowX+j*incColX];
        }
    }
}

template <typename Index, typename Alpha, typename TX>
void
gescal(Index m, Index n,
       const Alpha &alpha,
       TX *X, Index incRowX, Index incColX)
{
    if (alpha!=Alpha(0)) {
        for (Index j=0; j<n; ++j) {
            for (Index i=0; i<m; ++i) {
                X[i*incRowX+j*incColX] *= alpha;
            }
        }
    } else {
        for (Index j=0; j<n; ++j) {
            for (Index i=0; i<m; ++i) {
                X[i*incRowX+j*incColX] = Alpha(0);
            }
        }
    }
}

//-- Micro Kernel --------------------------------------------------------------
// Reference micro kernel (only enabled if BlockSize<T>::vlen == 0, i.e. if no
// SIMD configuration exists for T): computes C = beta*C + alpha*A*B for an
// MR x NR block, where A is an MR x kc and B a kc x NR packed panel.
template <typename Index, typename T>
typename std::enable_if<BlockSize<T>::vlen == 0,
         void>::type
ugemm(Index kc, T alpha,
      const T *A, const T *B,
      T beta,
      T *C, Index incRowC, Index incColC)
{
    const Index MR = BlockSize<T>::MR;
    const Index NR = BlockSize<T>::NR;
    T P[BlockSize<T>::MR*BlockSize<T>::NR];

    for (Index l=0; l<MR*NR; ++l) {
        P[l] = 0;
    }
    for (Index l=0; l<kc; ++l) {
        for (Index j=0; j<NR; ++j) {
            for (Index i=0; i<MR; ++i) {
                P[i+j*MR] += A[i+l*MR]*B[l*NR+j];
            }
        }
    }
    for (Index j=0; j<NR; ++j) {
        for (Index i=0; i<MR; ++i) {
            C[i*incRowC+j*incColC] *= beta;
            C[i*incRowC+j*incColC] += alpha*P[i+j*MR];
        }
    }
}

#ifdef HAVE_AVX
#include "avx.hpp"
#endif

#ifdef HAVE_FMA
#include "fma.hpp"
#endif

#ifdef HAVE_GCCVEC
#include "gccvec.hpp"
#endif

//-- Macro Kernel --------------------------------------------------------------
// Multiplies the packed mc x kc block of A with the packed kc x nc block of B.
// Complete MR x NR tiles are written directly into C; fringe tiles are first
// computed into the local buffer C_ and then merged into C via gescal/geaxpy.
template <typename Index, typename T, typename Beta, typename TC>
void
mgemm(Index mc, Index nc, Index kc,
      T alpha,
      const T *A, const T *B,
      Beta beta,
      TC *C, Index incRowC, Index incColC)
{
    const Index MR = BlockSize<T>::MR;
    const Index NR = BlockSize<T>::NR;
    const Index mp  = (mc+MR-1) / MR;
    const Index np  = (nc+NR-1) / NR;
    const Index mr_ = mc % MR;
    const Index nr_ = nc % NR;

    #if defined(_OPENMP)
    #pragma omp parallel for
    #endif
    for (Index j=0; j<np; ++j) {
        const Index nr = (j!=np-1 || nr_==0) ? NR : nr_;
        T C_[BlockSize<T>::MR*BlockSize<T>::NR];

        for (Index i=0; i<mp; ++i) {
            const Index mr = (i!=mp-1 || mr_==0) ? MR : mr_;

            if (mr==MR && nr==NR) {
                ugemm(kc, alpha,
                      &A[i*kc*MR], &B[j*kc*NR],
                      beta,
                      &C[i*MR*incRowC+j*NR*incColC],
                      incRowC, incColC);
            } else {
                std::fill_n(C_, MR*NR, T(0));
                ugemm(kc, alpha,
                      &A[i*kc*MR], &B[j*kc*NR],
                      T(0),
                      C_, Index(1), MR);
                gescal(mr, nr, beta,
                       &C[i*MR*incRowC+j*NR*incColC],
                       incRowC, incColC);
                geaxpy(mr, nr, T(1), C_, Index(1), MR,
                       &C[i*MR*incRowC+j*NR*incColC],
                       incRowC, incColC);
            }
        }
    }
}

//-- Packing blocks ------------------------------------------------------------
// Pack the mc x kc block A into consecutive MR x kc row-panels (column-major
// within each panel); rows missing in the last panel are padded with zeros.
template <typename MA, typename T>
void
pack_A(const MA &A, T *p)
{
    typedef typename MA::size_type  size_type;

    size_type mc = A.size1();
    size_type kc = A.size2();
    size_type MR = BlockSize<T>::MR;
    size_type mp = (mc+MR-1) / MR;

    for (size_type j=0; j<kc; ++j) {
        for (size_type l=0; l<mp; ++l) {
            for (size_type i0=0; i0<MR; ++i0) {
                size_type i  = l*MR + i0;
                size_type nu = l*MR*kc + j*MR + i0;
                p[nu]        = (i<mc) ? A(i,j) : T(0);
            }
        }
    }
}

// Pack the kc x nc block B into consecutive kc x NR column-panels (row-major
// within each panel); columns missing in the last panel are padded with zeros.
template <typename MB, typename T>
void
pack_B(const MB &B, T *p)
{
    typedef typename MB::size_type  size_type;

    size_type kc = B.size1();
    size_type nc = B.size2();
    size_type NR = BlockSize<T>::NR;
    size_type np = (nc+NR-1) / NR;

    for (size_type l=0; l<np; ++l) {
        for (size_type j0=0; j0<NR; ++j0) {
            for (size_type i=0; i<kc; ++i) {
                size_type j  = l*NR+j0;
                size_type nu = l*NR*kc + i*NR + j0;
                p[nu]        = (j<nc) ? B(i,j) : T(0);
            }
        }
    }
}
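
// Packing example (illustration only): with MR == 4, a 6 x 3 block A is packed
// by pack_A into two MR x kc panels, the second one padded with zeros:
//
//   panel 0:  A(0,0) A(1,0) A(2,0) A(3,0)   A(0,1) ... A(3,1)   A(0,2) ... A(3,2)
//   panel 1:  A(4,0) A(5,0) 0 0   A(4,1) A(5,1) 0 0   A(4,2) A(5,2) 0 0
//
// pack_B stores B analogously in kc x NR panels, row by row within each panel.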

//-- Frame routine -------------------------------------------------------------
template <typename Alpha, typename MatrixA, typename MatrixB,
         typename Beta, typename MatrixC>
void
gemm(Alpha alpha, const MatrixA &A, const MatrixB &B, Beta beta, MatrixC &C)
{
    assert(A.size2()==B.size1());
    namespace ublas = boost::numeric::ublas;

    typedef typename MatrixC::size_type  size_type;
    typedef typename MatrixA::value_type TA;
    typedef typename MatrixB::value_type TB;
    typedef typename MatrixC::value_type TC;
    typedef typename std::common_type<Alpha, TA, TB>::type  T;

    const size_type MC = BlockSize<T>::MC;
    const size_type NC = BlockSize<T>::NC;
    const size_type MR = BlockSize<T>::MR;
    const size_type NR = BlockSize<T>::NR;

    const size_type m = C.size1();
    const size_type n = C.size2();
    const size_type k = A.size2();

    const size_type KC = BlockSize<T>::KC;
    const size_type mb = (m+MC-1) / MC;
    const size_type nb = (n+NC-1) / NC;
    const size_type kb = (k+KC-1) / KC;
    const size_type mc_ = m % MC;
    const size_type nc_ = n % NC;
    const size_type kc_ = k % KC;


    TC *C_ = &C(0,0);
    const size_type incRowC = &C(1,0) - &C(0,0);
    const size_type incColC = &C(0,1) - &C(0,0);

    // quick return (before allocating the buffers, so nothing leaks)
    if (alpha==Alpha(0) || k==0) {
        gescal(beta, C);
        return;
    }

    T *A_ = (T*) malloc_(BlockSize<T>::align, sizeof(T)*(MC*KC+MR));
    T *B_ = (T*) malloc_(BlockSize<T>::align, sizeof(T)*(KC*NC+NR));

    for (size_type j=0; j<nb; ++j) {
        size_type nc = (j!=nb-1 || nc_==0) ? NC : nc_;

        for (size_type l=0; l<kb; ++l) {
            size_type kc = (l!=kb-1 || kc_==0) ? KC : kc_;
            Beta beta_   = (l==0) ? beta : Beta(1);

            const auto Bs = subrange(B, l*KC, l*KC+kc, j*NC, j*NC+nc);
            pack_B(Bs, B_);

            for (size_type i=0; i<mb; ++i) {
                size_type mc = (i!=mb-1 || mc_==0) ? MC : mc_;

                const auto As = subrange(A, i*MC, i*MC+mc, l*KC, l*KC+kc);
                pack_A(As, A_);

                mgemm(mc, nc, kc,
                      T(alpha), A_, B_, beta_,
                      &C_[i*MC*incRowC+j*NC*incColC],
                      incRowC, incColC);
            }
        }
    }
    free_(A_);
    free_(B_);
}

#endif

Benchmark Results for GEMM

The benchmark tests \(C = A B\) for general matrices \(A\), \(B\) and \(C\). Please also test cases where \(A\) or \(B\) are matrix expressions; a small sketch of such a test follows below.
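A simple way to exercise the expression case is to pass matrix ranges (uBLAS proxy objects) instead of plain matrices into gemm(). The following sketch is only an illustration (arbitrary sizes and values, not part of the tar-ball) and is compiled with the same flags as the benchmark program below:

#include <iostream>
#include <boost/numeric/ublas/matrix.hpp>
#include <boost/numeric/ublas/matrix_proxy.hpp>
#include "gemm.hpp"

int main()
{
    namespace ublas = boost::numeric::ublas;

    ublas::matrix<double> A = ublas::scalar_matrix<double>(600, 600, 1.0);
    ublas::matrix<double> B = ublas::scalar_matrix<double>(600, 600, 2.0);
    ublas::matrix<double> C = ublas::zero_matrix<double>(300, 400);

    // A and B enter gemm() as matrix_range expressions, not as containers
    gemm(1.0,
         subrange(A, 0, 300, 0, 200),   // 300 x 200 view of A
         subrange(B, 0, 200, 0, 400),   // 200 x 400 view of B
         0.0, C);

    std::cout << "C(0,0) = " << C(0,0) << std::endl;   // expect 2*200 = 400
}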

Benchmark Program

#include <cassert>
#include <chrono>
#include <cmath>
#include <limits>
#include <random>
#include <type_traits>
#include <boost/timer.hpp>
#include <boost/numeric/ublas/matrix.hpp>
#include <boost/numeric/ublas/io.hpp>
#include <boost/numeric/ublas/operation.hpp>
#include "gemm.hpp"

template <typename T>
struct WallTime
{
    void
    tic()
    {
        t0 = std::chrono::high_resolution_clock::now();
    }

    T
    toc()
    {
        using namespace std::chrono;

        elapsed = high_resolution_clock::now() - t0;
        return duration<T,seconds::period>(elapsed).count();
    }

    std::chrono::high_resolution_clock::time_point t0;
    std::chrono::high_resolution_clock::duration   elapsed;
};

// I guess this trait already exists or can be done more elegantly ...
template <typename M>
struct MatrixType {
    static constexpr bool  isGeneral    = false;
    static constexpr bool  isSymmetric  = false;
    static constexpr bool  isHermitian  = false;
    static constexpr bool  isTriangular = false;
};

template <typename T, typename SO>
struct MatrixType<boost::numeric::ublas::matrix<T,SO> > {
    static constexpr bool  isGeneral    = true;
    static constexpr bool  isSymmetric  = false;
    static constexpr bool  isHermitian  = false;
    static constexpr bool  isTriangular = false;
};

// fill rectangular matrix with random values
template <typename MATRIX>
typename std::enable_if<MatrixType<MATRIX>::isGeneral,
         void>::type
fill(MATRIX &A)
{
    typedef typename MATRIX::size_type  size_type;
    typedef typename MATRIX::value_type T;

    std::random_device                  random;
    std::default_random_engine          mt(random());
    std::uniform_real_distribution<T>   uniform(-100,100);

    for (size_type i=0; i<A.size1(); ++i) {
        for (size_type j=0; j<A.size2(); ++j) {
            A(i,j) = uniform(mt);
        }
    }
}

template <typename MATRIX>
typename MATRIX::value_type
asum(const MATRIX &A)
{
    typedef typename MATRIX::size_type  size_type;
    typedef typename MATRIX::value_type T;

    T asum = 0;

    for (size_type i=0; i<A.size1(); ++i) {
        for (size_type j=0; j<A.size2(); ++j) {
            asum += std::abs(A(i,j));
        }
    }
    return asum;
}

template <typename MA, typename MB, typename MC0, typename MC1>
double
estimateGemmResidual(const MA &A, const MB &B,
                     const MC0 &C0, const MC1 &C1)
{
    typedef typename MC0::value_type   TC0;
    typedef typename MC0::size_type    size_type;

    size_type m= C1.size1();
    size_type n= C1.size2();
    size_type k= A.size2();

    double aNorm = asum(A);
    double bNorm = asum(B);
    double cNorm = asum(C1);
    double diff  = asum(C1-C0);
    // Using eps for double gives upper bound in case elements have lower
    // precision.
    double eps = std::numeric_limits<double>::epsilon();
    double res = diff/(aNorm*bNorm*cNorm*eps*std::max(std::max(m,n),k));
    return res;
}



namespace foo {

template <typename MATRIXA, typename MATRIXB, typename MATRIXC>
void
axpy_prod(const MATRIXA &A, const MATRIXB &B, MATRIXC &C, bool update)
{
    typedef typename MATRIXA::value_type TA;
    typedef typename MATRIXC::value_type TC;

    assert(A.size2()==B.size1());

    gemm(TA(1), A, B, TC(update ? 0 : 1), C);
}

} // namespace foo

#ifndef M_MAX
#define M_MAX 4000
#endif

#ifndef K_MAX
#define K_MAX 4000
#endif

#ifndef N_MAX
#define N_MAX 4000
#endif

int
main()
{
    namespace ublas = boost::numeric::ublas;

    const std::size_t m_min = 100;
    const std::size_t k_min = 100;
    const std::size_t n_min = 100;

    const std::size_t m_max = M_MAX;
    const std::size_t k_max = K_MAX;
    const std::size_t n_max = N_MAX;

    const std::size_t m_inc = 100;
    const std::size_t k_inc = 100;
    const std::size_t n_inc = 100;

    const bool matprodUpdate = true;

    typedef double              T;
    typedef ublas::row_major    SO;

    std::cout << "#   m";
    std::cout << "     n";
    std::cout << "     k";
    std::cout << "  uBLAS:   t1";
    std::cout << "       MFLOPS";
    std::cout << "   Blocked:   t2";
    std::cout << "      MFLOPS";
    std::cout << "        Diff nrm1";
    std::cout << std::endl;


    WallTime<double>  walltime;

    for (std::size_t m=m_min, k=k_min, n=n_min;
         m<=m_max && k<=k_max && n<=n_max;
         m += m_inc, k += k_inc, n += n_inc)
    {
        ublas::matrix<T,SO>     A(m, k);
        ublas::matrix<T,SO>     B(k, n);
        ublas::matrix<T,SO>     C1(m, n);
        ublas::matrix<T,SO>     C2(m, n);

        fill(A);
        fill(B);
        fill(C1);
        C2 = C1;

        walltime.tic();
        ublas::axpy_prod(A, B, C1, matprodUpdate);
        double t1 = walltime.toc();

        walltime.tic();
        foo::axpy_prod(A, B, C2, matprodUpdate);
        double t2 = walltime.toc();

        double res = estimateGemmResidual(A, B, C1, C2);

        std::cout.width(5);  std::cout << m << " ";
        std::cout.width(5);  std::cout << n << " ";
        std::cout.width(5);  std::cout << k << " ";
        std::cout.width(12); std::cout << t1 << " ";
        std::cout.width(12); std::cout << 2.*m/1000.*n/1000.*k/t1 << " ";
        std::cout.width(15); std::cout << t2 << " ";
        std::cout.width(12); std::cout << 2.*m/1000.*n/1000.*k/t2 << " ";
        std::cout.width(15); std::cout << res;
        std::cout << std::endl;
    }
}

Results

$shell> g++ -Ofast -mavx -Wall -std=c++11 -DHAVE_AVX -fopenmp -DNDEBUG -I ../boost_1_60_0 matprod.cc
$shell> ./a.out  > report.gemm.session4
$shell> cat report.gemm.session4
#   m     n     k  uBLAS:   t1       MFLOPS   Blocked:   t2      MFLOPS        Diff nrm1
  100   100   100     0.000827      2418.38        0.000306      6535.95               0
  200   200   200     0.006169      2593.61        0.000408      39215.7               0
  300   300   300     0.010054         5371         0.00114      47368.4     1.95068e-16
  400   400   400     0.023607      5422.12        0.002379      53804.1     8.39536e-17
  500   500   500     0.045778      5461.14        0.003895      64184.9     3.52985e-17
  600   600   600     0.080227      5384.72        0.006512      66339.1     1.62983e-17
  700   700   700     0.130662      5250.19        0.009802      69985.7     8.45155e-18
  800   800   800     0.207417      4936.91         0.01596      64160.4     4.71673e-18
  900   900   900      0.32338      4508.63        0.020666      70550.7     2.80048e-18
 1000  1000  1000     0.474364      4216.17        0.027074      73871.6     1.76058e-18
 1100  1100  1100     0.634478      4195.57        0.037055      71839.2     1.14713e-18
 1200  1200  1200     0.868618      3978.73         0.04757      72650.8     7.78273e-19
 1300  1300  1300      1.07822      4075.22        0.058402      75237.1     5.44648e-19
 1400  1400  1400      1.39077      3946.01        0.071462      76796.1     3.90201e-19
 1500  1500  1500      1.71005      3947.26        0.084887      79517.5     2.86504e-19
 1600  1600  1600      2.08305      3932.69        0.124203      65956.5     2.13818e-19
 1700  1700  1700      2.49311      3941.27        0.123427      79609.8     1.62944e-19
 1800  1800  1800      2.96805      3929.86        0.148842        78365     1.26159e-19
 1900  1900  1900      3.47857      3943.58        0.178341      76920.1     9.86272e-20
 2000  2000  2000      4.03543      3964.88        0.204527      78229.3     7.82724e-20
 2100  2100  2100      4.73412      3912.44        0.230168      80471.7      6.2824e-20
 2200  2200  2200      5.59092      3809.04        0.260903      81624.2     5.09102e-20
 2300  2300  2300       6.5266      3728.43        0.330762      73569.5     4.17457e-20
 2400  2400  2400      7.43927      3716.49        0.345519      80018.8     3.43634e-20
 2500  2500  2500      8.40704      3717.12        0.376525      82995.8     2.85796e-20
 2600  2600  2600      9.45964         3716        0.429706      81804.8      2.3956e-20
 2700  2700  2700      10.6612      3692.46        0.474293      82999.3     2.01636e-20
 2800  2800  2800      11.8969      3690.38        0.538666      81505.1     1.71253e-20
 2900  2900  2900      13.2132      3691.63        0.590193      82647.5      1.4617e-20
 3000  3000  3000      14.5794      3703.85         0.64663      83509.9     1.25305e-20
 3100  3100  3100      16.0776      3705.89         0.71265      83606.3     1.08095e-20
 3200  3200  3200      17.9667      3647.63        0.815718      80341.5     9.34951e-21
 3300  3300  3300      19.5265      3680.85        0.846187      84938.7     8.15149e-21
 3400  3400  3400      21.3146      3687.99        0.940302      83598.7     7.11752e-21
 3500  3500  3500      23.1942      3697.04         1.01066      84845.4     6.24469e-21
 3600  3600  3600      25.2668      3693.06         1.12898      82651.7     5.49412e-21
 3700  3700  3700      27.5336      3679.35         1.19162      85015.4     4.85316e-21
 3800  3800  3800      29.8748      3673.47         1.32286      82959.6     4.30078e-21
 3900  3900  3900      32.2766      3675.67          1.3989      84808.2     3.82272e-21
 4000  4000  4000      34.6874       3690.1         1.51476      84501.8     3.41036e-21
$shell> gnuplot plot.session4.gemm.mflops
$shell> gnuplot plot.session4.gemm.time
$shell> gnuplot plot.session4.gemm.time_log
$shell> 

MFLOPS

Time

Time with Logarithmic scale

Benchmark SYMM

The benchmark tests \(C = A B\) for a symmetric matrix \(A\) in packed storage format. Please also test cases where \(A\) or \(B\) are matrix expressions; a small sketch follows below.
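For a start, here is a minimal sketch (again not part of the tar-ball, arbitrary sizes and values, compiled like the demos above) that feeds a packed ublas::symmetric_matrix directly into the blocked gemm(); a symmetric_adaptor around a general matrix could be passed in the same way:

#include <cstddef>
#include <iostream>
#include <boost/numeric/ublas/matrix.hpp>
#include <boost/numeric/ublas/matrix_proxy.hpp>
#include <boost/numeric/ublas/symmetric.hpp>
#include "gemm.hpp"

int main()
{
    namespace ublas = boost::numeric::ublas;

    // symmetric matrix in packed (lower) storage: only the lower triangle
    // is stored, so only that triangle gets written
    ublas::symmetric_matrix<double> A(500);
    for (std::size_t i=0; i<A.size1(); ++i) {
        for (std::size_t j=0; j<=i; ++j) {
            A(i,j) = 1.0/(1.0+i+j);
        }
    }

    ublas::matrix<double> B = ublas::scalar_matrix<double>(500, 300, 1.0);
    ublas::matrix<double> C = ublas::zero_matrix<double>(500, 300);

    // during packing A is accessed only element-wise via A(i,j), so the
    // packed matrix can be passed to gemm() like a general matrix
    gemm(1.0, A, B, 0.0, C);

    std::cout << "C(0,0) = " << C(0,0) << std::endl;
}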

Benchmark Program

#include <cassert>
#include <chrono>
#include <cmath>
#include <limits>
#include <random>
#include <type_traits>
#include <boost/timer.hpp>
#include <boost/numeric/ublas/matrix.hpp>
#include <boost/numeric/ublas/symmetric.hpp>
#include <boost/numeric/ublas/io.hpp>
#include <boost/numeric/ublas/operation.hpp>
#include "gemm.hpp"

template <typename T>
struct WallTime
{
    void
    tic()
    {
        t0 = std::chrono::high_resolution_clock::now();
    }

    T
    toc()
    {
        using namespace std::chrono;

        elapsed = high_resolution_clock::now() - t0;
        return duration<T,seconds::period>(elapsed).count();
    }

    std::chrono::high_resolution_clock::time_point t0;
    std::chrono::high_resolution_clock::duration   elapsed;
};

// I guess this trait already exists or can be done more elegantly ...
template <typename M>
struct MatrixType {
    static constexpr bool  isGeneral    = false;
    static constexpr bool  isSymmetric  = false;
    static constexpr bool  isHermitian  = false;
    static constexpr bool  isTriangular = false;
};

template <typename T, typename SO>
struct MatrixType<boost::numeric::ublas::matrix<T,SO> > {
    static constexpr bool  isGeneral    = true;
    static constexpr bool  isSymmetric  = false;
    static constexpr bool  isHermitian  = false;
    static constexpr bool  isTriangular = false;
};

template <typename A, typename TRL>
struct MatrixType<boost::numeric::ublas::symmetric_adaptor<A, TRL> > {
    static constexpr bool  isGeneral    = true;
    static constexpr bool  isSymmetric  = false;
    static constexpr bool  isHermitian  = false;
    static constexpr bool  isTriangular = false;
};

template <typename TA>
struct MatrixType<boost::numeric::ublas::symmetric_matrix<TA> > {
    static constexpr bool  isGeneral    = true;
    static constexpr bool  isSymmetric  = false;
    static constexpr bool  isHermitian  = false;
    static constexpr bool  isTriangular = false;
};


// fill rectangular matrix with random values
template <typename MATRIX>
typename std::enable_if<MatrixType<MATRIX>::isGeneral,
         void>::type
fill(MATRIX &A)
{
    typedef typename MATRIX::size_type  size_type;
    typedef typename MATRIX::value_type T;

    std::random_device                  random;
    std::default_random_engine          mt(random());
    std::uniform_real_distribution<T>   uniform(-100,100);

    for (size_type i=0; i<A.size1(); ++i) {
        for (size_type j=0; j<A.size2(); ++j) {
            A(i,j) = uniform(mt);
        }
    }
}

// fill symmetric matrix with random values
template <typename MATRIX>
typename std::enable_if<MatrixType<MATRIX>::isSymmetric,
         void>::type
fill(MATRIX &A)
{
    using namespace boost::numeric::ublas;

    typedef typename MATRIX::size_type       size_type;
    typedef typename MATRIX::value_type      T;
    typedef typename MATRIX::triangular_type triangular_type;

    std::random_device                  random;
    std::default_random_engine          mt(random());
    std::uniform_real_distribution<T>   uniform(-100,100);

    if (std::is_same<triangular_type, upper>::value) {
        for (size_type j=0; j<A.size2(); ++j) {
            for (size_type i=0; i<=j; ++i) {
                A(i,j) = uniform(mt);
            }
        }
    } else {
        for (size_type j=0; j<A.size2(); ++j) {
            for (size_type i=j; i<A.size1(); ++i) {
                A(i,j) = uniform(mt);
            }
        }
    }
}

template <typename MATRIX>
typename MATRIX::value_type
asum(const MATRIX &A)
{
    typedef typename MATRIX::size_type  size_type;
    typedef typename MATRIX::value_type T;

    T asum = 0;

    for (size_type i=0; i<A.size1(); ++i) {
        for (size_type j=0; j<A.size2(); ++j) {
            asum += std::abs(A(i,j));
        }
    }
    return asum;
}

template <typename MA, typename MB, typename MC0, typename MC1>
double
estimateGemmResidual(const MA &A, const MB &B,
                     const MC0 &C0, const MC1 &C1)
{
    typedef typename MC0::value_type   TC0;
    typedef typename MC0::size_type    size_type;

    size_type m= C1.size1();
    size_type n= C1.size2();
    size_type k= A.size2();

    double aNorm = asum(A);
    double bNorm = asum(B);
    double cNorm = asum(C1);
    double diff  = asum(C1-C0);
    // Using eps for double gives upper bound in case elements have lower
    // precision.
    double eps = std::numeric_limits<double>::epsilon();
    double res = diff/(aNorm*bNorm*cNorm*eps*std::max(std::max(m,n),k));
    return res;
}

namespace foo {

template <typename MATRIXA, typename MATRIXB, typename MATRIXC>
void
axpy_prod(const MATRIXA &A, const MATRIXB &B, MATRIXC &C, bool update)
{
    typedef typename MATRIXA::value_type TA;
    typedef typename MATRIXC::value_type TC;

    assert(A.size2()==B.size1());

    gemm(TA(1), A, B, TC(update ? 0 : 1), C);
}

} // namespace foo


#ifndef M_MAX
#define M_MAX 4000
#endif

#ifndef K_MAX
#define K_MAX 4000
#endif

#ifndef N_MAX
#define N_MAX 4000
#endif

#ifndef UPLO
#define UPLO ublas::lower
#endif

int
main()
{
    namespace ublas = boost::numeric::ublas;

    const std::size_t m_min = 100;
    const std::size_t n_min = 100;

    const std::size_t m_max = M_MAX;
    const std::size_t n_max = N_MAX;

    const std::size_t m_inc = 100;
    const std::size_t n_inc = 100;

    const bool matprodUpdate = true;

    typedef double              T;
    typedef ublas::column_major    SO;

    std::cout << "#   m";
    std::cout << "     n";
    std::cout << "  uBLAS:   t1";
    std::cout << "       MFLOPS";
    std::cout << "   Blocked:   t2";
    std::cout << "      MFLOPS";
    std::cout << "        Diff nrm1";
    std::cout << std::endl;


    WallTime<double>  walltime;

    for (std::size_t m=m_min, n=n_min;
         m<=m_max && n<=n_max;
         m += m_inc, n += n_inc)
    {
        ublas::symmetric_matrix<T>                              A(m);
        ublas::matrix<T,SO>                                     B(m, n);
        ublas::matrix<T,SO>                                     C1(m, n);
        ublas::matrix<T,SO>                                     C2(m, n);

        fill(A);
        fill(B);
        fill(C1);
        C2 = C1;

        walltime.tic();
        ublas::axpy_prod(A, B, C1, matprodUpdate);
        double t1 = walltime.toc();

        walltime.tic();
        foo::axpy_prod(A, B, C2, matprodUpdate);
        double t2 = walltime.toc();

        double res = estimateGemmResidual(A, B, C1, C2);

        std::cout.width(5);  std::cout << m << " ";
        std::cout.width(5);  std::cout << n << " ";
        std::cout.width(12); std::cout << t1 << " ";
        std::cout.width(12); std::cout << 2.*m/1000.*n/1000.*m/t1 << " ";
        std::cout.width(15); std::cout << t2 << " ";
        std::cout.width(12); std::cout << 2.*m/1000.*n/1000.*m/t2 << " ";
        std::cout.width(15); std::cout << res;
        std::cout << std::endl;
    }
}

Results

$shell> g++ -Ofast -mavx -Wall -std=c++11 -DHAVE_AVX -fopenmp -DNDEBUG -DM_MAX=1500 -I ../boost_1_60_0 symatprod.cc
$shell> ./a.out  > report.symm.session4
$shell> cat report.symm.session4
#   m     n  uBLAS:   t1       MFLOPS   Blocked:   t2      MFLOPS        Diff nrm1
  100   100     0.003826      522.739        0.000368      5434.78               0
  200   200     0.019738      810.619        0.000448      35714.3               0
  300   300     0.047816      1129.33        0.001153      46834.3      1.9448e-16
  400   400     0.117269      1091.51        0.002268      56437.4     8.38772e-17
  500   500     0.232305      1076.17        0.004088      61154.6     3.54522e-17
  600   600     0.408112      1058.53        0.006774      63773.3     1.63089e-17
  700   700     0.654633      1047.92        0.010262      66848.6      8.4569e-18
  800   800      1.09888      931.862        0.015027        68144     4.71215e-18
  900   900      1.90133      766.831        0.021053      69253.8     2.80651e-18
 1000  1000      3.03804       658.32        0.027982      71474.5     1.75932e-18
 1100  1100      4.76968      558.109         0.03664      72652.8     1.14902e-18
 1200  1200      7.09646      487.003        0.046163      74865.2     7.78259e-19
 1300  1300      9.70487      452.763        0.058498      75113.7     5.45503e-19
 1400  1400         12.9      425.425         0.07085      77459.4     3.89462e-19
 1500  1500      16.4966      409.175        0.085683      78778.8     2.86317e-19
$shell> gnuplot plot.session4.symm.mflops
$shell> gnuplot plot.session4.symm.time
$shell> gnuplot plot.session4.symm.time_log
$shell> 

MFLOPS

Time

Time with Logarithmic scale