Content

Matrix-Matrix Product with C++ and ublas

Computing a matrix-matrix product \(C = AB\) for a \(m\times n\) matrix \(C\), \(m \times k\) matrix \(A\) and \(k \times n\) matrix \(B\).

All kept very simple ...

Compile and Run Benchmark

For a quick benchmark we limit \(m\) to \(500\) with -DM_MAX=500:

$shell> g++ -Ofast -mavx -Wall -DNDEBUG -std=c++11 -DM_MAX=500 -I ../boost_1_60_0 matprod.cc
$shell> ./a.out
#   m     n     k  uBLAS:   t1       MFLOPS   Blocked:   t2      MFLOPS        Diff nrm1
  100   100   100     0.000819         2442        0.000842       2375.3     6.20455e-14
  200   200   200     0.006837      2340.21        0.003703      4320.82     2.71434e-15
  300   300   300     0.010083      5355.55        0.008251      6544.66     4.28955e-16
  400   400   400     0.023768      5385.39        0.018627      6871.75     1.13314e-16
  500   500   500     0.045824      5455.66        0.035936      6956.81     4.11985e-17
$shell> 

The columns in the output are:

Main Program with Benchmark and Test

#include <cassert>
#include <chrono>
#include <cmath>
#include <random>
#include <boost/timer.hpp>
#include <boost/numeric/ublas/matrix.hpp>
#include <boost/numeric/ublas/io.hpp>
#include <boost/numeric/ublas/operation.hpp>
#include "gemm.hpp"

template <typename T>
struct WallTime
{
    void
    tic()
    {
        t0 = std::chrono::high_resolution_clock::now();
    }

    T
    toc()
    {
        using namespace std::chrono;

        elapsed = high_resolution_clock::now() - t0;
        return duration<T,seconds::period>(elapsed).count();
    }

    std::chrono::high_resolution_clock::time_point t0;
    std::chrono::high_resolution_clock::duration   elapsed;
};

template <typename MATRIX>
void
fill(MATRIX &A)
{
    typedef typename MATRIX::size_type  size_type;
    typedef typename MATRIX::value_type T;

    std::random_device                  random;
    std::default_random_engine          mt(random());
    std::uniform_real_distribution<T>   uniform(-100,100);

    for (size_type i=0; i<A.size1(); ++i) {
        for (size_type j=0; j<A.size2(); ++j) {
            A(i,j) = uniform(mt);
        }
    }
}

template <typename MATRIX>
typename MATRIX::value_type
asum(const MATRIX &A)
{
    typedef typename MATRIX::size_type  size_type;
    typedef typename MATRIX::value_type T;

    T asum = 0;

    for (size_type i=0; i<A.size1(); ++i) {
        for (size_type j=0; j<A.size2(); ++j) {
            asum += std::abs(A(i,j));
        }
    }
    return asum;
}

template <typename MA, typename MB, typename MC0, typename MC1>
double
estimateGemmResidual(const MA &A, const MB &B,
                     const MC0 &C0, const MC1 &C1)
{
    typedef typename MC0::size_type    size_type;

    size_type m= C1.size1();
    size_type n= C1.size2();
    size_type k= A.size2();

    double aNorm = asum(A);
    double bNorm = asum(B);
    double cNorm = asum(C1);
    double diff  = asum(C1-C0);
    // Using eps for double gives upper bound in case elements have lower
    // precision.
    double eps = std::numeric_limits<double>::epsilon();
    double res = diff/(aNorm*bNorm*cNorm*eps*std::max(std::max(m,n),k));
    return res;
}

namespace foo {

template <typename MATRIXA, typename MATRIXB, typename MATRIXC>
void
axpy_prod(const MATRIXA &A, const MATRIXB &B, MATRIXC &C, bool update)
{
    typedef typename MATRIXA::value_type TA;
    typedef typename MATRIXC::value_type TC;

    assert(A.size2()==B.size1());

    const std::ptrdiff_t m = C.size1();
    const std::ptrdiff_t n = C.size2();
    const std::ptrdiff_t k = A.size2();

    const std::ptrdiff_t incRowA = &A(1,0) - &A(0,0);
    const std::ptrdiff_t incColA = &A(0,1) - &A(0,0);

    const std::ptrdiff_t incRowB = &B(1,0) - &B(0,0);
    const std::ptrdiff_t incColB = &B(0,1) - &B(0,0);

    const std::ptrdiff_t incRowC = &C(1,0) - &C(0,0);
    const std::ptrdiff_t incColC = &C(0,1) - &C(0,0);

    gemm(m, n, k, TA(1),
         &A(0,0), incRowA, incColA,
         &B(0,0), incRowB, incColB,
         TC(update ? 0 : 1),
         &C(0,0), incRowC, incColC);
}

} // namespace foo

#ifndef M_MAX
#define M_MAX 4000
#endif

#ifndef K_MAX
#define K_MAX 4000
#endif

#ifndef N_MAX
#define N_MAX 4000
#endif

int
main()
{
    namespace ublas = boost::numeric::ublas;

    const std::size_t m_min = 100;
    const std::size_t k_min = 100;
    const std::size_t n_min = 100;

    const std::size_t m_max = M_MAX;
    const std::size_t k_max = K_MAX;
    const std::size_t n_max = N_MAX;

    const std::size_t m_inc = 100;
    const std::size_t k_inc = 100;
    const std::size_t n_inc = 100;

    const bool matprodUpdate = true;

    typedef double              T;
    typedef ublas::row_major    SO;

    std::cout << "#   m";
    std::cout << "     n";
    std::cout << "     k";
    std::cout << "  uBLAS:   t1";
    std::cout << "       MFLOPS";
    std::cout << "   Blocked:   t2";
    std::cout << "      MFLOPS";
    std::cout << "        Diff nrm1";
    std::cout << std::endl;


    WallTime<double>  walltime;

    for (std::size_t m=m_min, k=k_min, n=n_min;
         m<=m_max && k<=k_max && n<=n_max;
         m += m_inc, k += k_inc, n += n_inc)
    {
        ublas::matrix<T,SO>     A(m, k);
        ublas::matrix<T,SO>     B(k, n);
        ublas::matrix<T,SO>     C1(m, n);
        ublas::matrix<T,SO>     C2(m, n);

        fill(A);
        fill(B);
        fill(C1);
        C2 = C1;

        walltime.tic();
        ublas::axpy_prod(A, B, C1, matprodUpdate);
        double t1 = walltime.toc();

        walltime.tic();
        foo::axpy_prod(A, B, C2, matprodUpdate);
        double t2 = walltime.toc();

        double res = estimateGemmResidual(A, B, C1, C2);

        std::cout.width(5);  std::cout << m << " ";
        std::cout.width(5);  std::cout << n << " ";
        std::cout.width(5);  std::cout << k << " ";
        std::cout.width(12); std::cout << t1 << " ";
        std::cout.width(12); std::cout << 2.*m/1000.*n/1000.*k/t1 << " ";
        std::cout.width(15); std::cout << t2 << " ";
        std::cout.width(12); std::cout << 2.*m/1000.*n/1000.*k/t2 << " ";
        std::cout.width(15); std::cout << res;
        std::cout << std::endl;
    }
}

Core Function for Matrix-Matrix Produkt

// Code extracted from ulmBLAS: https://github.com/michael-lehn/ulmBLAS-core

#ifndef GEMM_HPP
#define GEMM_HPP
#include <algorithm>

//-- Config --------------------------------------------------------------------
template <typename T>
struct BlockSize
{
    static const int MC = 64;
    static const int KC = 64;
    static const int NC = 256;
    static const int MR = 8;
    static const int NR = 8;
    static_assert(MC>0 && KC>0 && NC>0 && MR>0 && NR>0, "Invalid block size.");
    static_assert(MC % MR == 0, "MC must be a multiple of MR.");
    static_assert(NC % NR == 0, "NC must be a multiple of NR.");
};


template <>
struct BlockSize<double>
{
    static const int MC = 256;
    static const int KC = 256;
    static const int NC = 4096;
    static const int MR = 4;
    static const int NR = 8;
    static_assert(MC>0 && KC>0 && NC>0 && MR>0 && NR>0, "Invalid block size.");
    static_assert(MC % MR == 0, "MC must be a multiple of MR.");
    static_assert(NC % NR == 0, "NC must be a multiple of NR.");
};

//-- aux routines --------------------------------------------------------------
template <typename Index, typename Alpha, typename TX, typename TY>
void
geaxpy(Index m, Index n,
       const Alpha &alpha,
       const TX *X, Index incRowX, Index incColX,
       TY       *Y, Index incRowY, Index incColY)
{
    for (Index j=0; j<n; ++j) {
        for (Index i=0; i<m; ++i) {
            Y[i*incRowY+j*incColY] += alpha*X[i*incRowX+j*incColX];
        }
    }
}

template <typename Index, typename Alpha, typename TX>
void
gescal(Index m, Index n,
       const Alpha &alpha,
       TX *X, Index incRowX, Index incColX)
{
    if (alpha!=Alpha(0)) {
        for (Index j=0; j<n; ++j) {
            for (Index i=0; i<m; ++i) {
                X[i*incRowX+j*incColX] *= alpha;
            }
        }
    } else {
        for (Index j=0; j<n; ++j) {
            for (Index i=0; i<m; ++i) {
                X[i*incRowX+j*incColX] = Alpha(0);
            }
        }
    }
}

//-- Micro Kernel --------------------------------------------------------------
template <typename Index, typename T>
void
ugemm(Index kc, T alpha,
      const T *A, const T *B,
      T beta,
      T *C, Index incRowC, Index incColC)
{
    const Index MR = BlockSize<T>::MR;
    const Index NR = BlockSize<T>::NR;
    T P[BlockSize<T>::MR*BlockSize<T>::NR];

    for (Index l=0; l<MR*NR; ++l) {
        P[l] = 0;
    }
    for (Index l=0; l<kc; ++l) {
        for (Index j=0; j<NR; ++j) {
            for (Index i=0; i<MR; ++i) {
                P[i+j*MR] += A[i+l*MR]*B[l*NR+j];
            }
        }
    }
    if (beta!=T(0)) {
        for (Index j=0; j<NR; ++j) {
            for (Index i=0; i<MR; ++i) {
                C[i*incRowC+j*incColC] *= beta;
                C[i*incRowC+j*incColC] += alpha*P[i+j*MR];
            }
        }
    } else {
        for (Index j=0; j<NR; ++j) {
            for (Index i=0; i<MR; ++i) {
                C[i*incRowC+j*incColC] = alpha*P[i+j*MR];
            }
        }
    }
}


//-- Macro Kernel --------------------------------------------------------------
template <typename Index, typename T, typename Beta, typename TC>
void
mgemm(Index mc, Index nc, Index kc,
      T alpha,
      const T *A, const T *B,
      Beta beta,
      TC *C, Index incRowC, Index incColC)
{
    const Index MR = BlockSize<T>::MR;
    const Index NR = BlockSize<T>::NR;
    const Index mp  = (mc+MR-1) / MR;
    const Index np  = (nc+NR-1) / NR;
    const Index mr_ = mc % MR;
    const Index nr_ = nc % NR;

    T C_[MR*NR];

    for (Index j=0; j<np; ++j) {
        const Index nr = (j!=np-1 || nr_==0) ? NR : nr_;

        for (Index i=0; i<mp; ++i) {
            const Index mr = (i!=mp-1 || mr_==0) ? MR : mr_;

            if (mr==MR && nr==NR) {
                ugemm(kc, alpha,
                      &A[i*kc*MR], &B[j*kc*NR],
                      beta,
                      &C[i*MR*incRowC+j*NR*incColC],
                      incRowC, incColC);
            } else {
                ugemm(kc, alpha,
                      &A[i*kc*MR], &B[j*kc*NR],
                      T(0),
                      C_, Index(1), MR);
                gescal(mr, nr, beta,
                       &C[i*MR*incRowC+j*NR*incColC],
                       incRowC, incColC);
                geaxpy(mr, nr, T(1), C_, Index(1), MR,
                       &C[i*MR*incRowC+j*NR*incColC],
                       incRowC, incColC);
            }
        }
    }
}

//-- Packing blocks ------------------------------------------------------------
template <typename Index, typename TA, typename T>
void
pack_A(Index mc, Index kc,
       const TA *A, Index incRowA, Index incColA,
       T *p)
{
    Index MR = BlockSize<T>::MR;
    Index mp = (mc+MR-1) / MR;

    for (Index j=0; j<kc; ++j) {
        for (Index l=0; l<mp; ++l) {
            for (Index i0=0; i0<MR; ++i0) {
                Index i  = l*MR + i0;
                Index nu = l*MR*kc + j*MR + i0;
                p[nu]   = (i<mc) ? A[i*incRowA+j*incColA]
                                 : T(0);
            }
        }
    }
}

template <typename Index, typename TB, typename T>
void
pack_B(Index kc, Index nc,
       const TB *B, Index incRowB, Index incColB,
       T *p)
{
    Index NR = BlockSize<T>::NR;
    Index np = (nc+NR-1) / NR;

    for (Index l=0; l<np; ++l) {
        for (Index j0=0; j0<NR; ++j0) {
            for (Index i=0; i<kc; ++i) {
                Index j  = l*NR+j0;
                Index nu = l*NR*kc + i*NR + j0;
                p[nu]   = (j<nc) ? B[i*incRowB+j*incColB]
                                 : T(0);
            }
        }
    }
}

//-- Frame routine -------------------------------------------------------------
template <typename Index, typename Alpha,
         typename TA, typename TB,
         typename Beta,
         typename TC>
void
gemm(Index m, Index n, Index k,
     Alpha alpha,
     const TA *A, Index incRowA, Index incColA,
     const TB *B, Index incRowB, Index incColB,
     Beta beta,
     TC *C, Index incRowC, Index incColC)
{
    typedef typename std::common_type<Alpha, TA, TB>::type  T;

    const Index MC = BlockSize<T>::MC;
    const Index NC = BlockSize<T>::NC;
    const Index KC = BlockSize<T>::KC;
    const Index mb = (m+MC-1) / MC;
    const Index nb = (n+NC-1) / NC;
    const Index kb = (k+KC-1) / KC;
    const Index mc_ = m % MC;
    const Index nc_ = n % NC;
    const Index kc_ = k % KC;

    T *A_ = new T[MC*KC];
    T *B_ = new T[KC*NC];

    if (alpha==Alpha(0) || k==0) {
        gescal(m, n, beta, C, incRowC, incColC);
        return;
    }

    for (Index j=0; j<nb; ++j) {
        Index nc = (j!=nb-1 || nc_==0) ? NC : nc_;

        for (Index l=0; l<kb; ++l) {
            Index   kc  = (l!=kb-1 || kc_==0) ? KC : kc_;
            Beta beta_  = (l==0) ? beta : Beta(1);

            pack_B(kc, nc,
                   &B[l*KC*incRowB+j*NC*incColB],
                   incRowB, incColB,
                   B_);

            for (Index i=0; i<mb; ++i) {
                Index mc = (i!=mb-1 || mc_==0) ? MC : mc_;

                pack_A(mc, kc,
                       &A[i*MC*incRowA+l*KC*incColA],
                       incRowA, incColA,
                       A_);

                mgemm(mc, nc, kc,
                      T(alpha), A_, B_, beta_,
                      &C[i*MC*incRowC+j*NC*incColC],
                      incRowC, incColC);
            }
        }
    }
    delete [] A_;
    delete [] B_;
}

#endif

Benchmark Results

$shell> cat /proc/cpuinfo
processor	: 0
vendor_id	: GenuineIntel
cpu family	: 6
model		: 58
model name	: Intel(R) Core(TM) i5-3470 CPU @ 3.20GHz
stepping	: 9
microcode	: 0x1b
cpu MHz		: 1600.000
cache size	: 6144 KB
physical id	: 0
siblings	: 4
core id		: 0
cpu cores	: 4
apicid		: 0
initial apicid	: 0
fpu		: yes
fpu_exception	: yes
cpuid level	: 13
wp		: yes
flags		: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush dts acpi mmx fxsr sse sse2 ss ht tm pbe syscall nx rdtscp lm constant_tsc arch_perfmon pebs bts rep_good nopl xtopology nonstop_tsc aperfmperf pni pclmulqdq dtes64 monitor ds_cpl vmx smx est tm2 ssse3 cx16 xtpr pdcm pcid sse4_1 sse4_2 x2apic popcnt tsc_deadline_timer aes xsave avx f16c rdrand lahf_lm ida arat epb xsaveopt pln pts dtherm tpr_shadow vnmi flexpriority ept vpid fsgsbase smep erms
bogomips	: 6385.07
clflush size	: 64
cache_alignment	: 64
address sizes	: 36 bits physical, 48 bits virtual
power management:

processor	: 1
vendor_id	: GenuineIntel
cpu family	: 6
model		: 58
model name	: Intel(R) Core(TM) i5-3470 CPU @ 3.20GHz
stepping	: 9
microcode	: 0x1b
cpu MHz		: 1600.000
cache size	: 6144 KB
physical id	: 0
siblings	: 4
core id		: 1
cpu cores	: 4
apicid		: 2
initial apicid	: 2
fpu		: yes
fpu_exception	: yes
cpuid level	: 13
wp		: yes
flags		: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush dts acpi mmx fxsr sse sse2 ss ht tm pbe syscall nx rdtscp lm constant_tsc arch_perfmon pebs bts rep_good nopl xtopology nonstop_tsc aperfmperf pni pclmulqdq dtes64 monitor ds_cpl vmx smx est tm2 ssse3 cx16 xtpr pdcm pcid sse4_1 sse4_2 x2apic popcnt tsc_deadline_timer aes xsave avx f16c rdrand lahf_lm ida arat epb xsaveopt pln pts dtherm tpr_shadow vnmi flexpriority ept vpid fsgsbase smep erms
bogomips	: 6385.19
clflush size	: 64
cache_alignment	: 64
address sizes	: 36 bits physical, 48 bits virtual
power management:

processor	: 2
vendor_id	: GenuineIntel
cpu family	: 6
model		: 58
model name	: Intel(R) Core(TM) i5-3470 CPU @ 3.20GHz
stepping	: 9
microcode	: 0x1b
cpu MHz		: 1600.000
cache size	: 6144 KB
physical id	: 0
siblings	: 4
core id		: 2
cpu cores	: 4
apicid		: 4
initial apicid	: 4
fpu		: yes
fpu_exception	: yes
cpuid level	: 13
wp		: yes
flags		: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush dts acpi mmx fxsr sse sse2 ss ht tm pbe syscall nx rdtscp lm constant_tsc arch_perfmon pebs bts rep_good nopl xtopology nonstop_tsc aperfmperf pni pclmulqdq dtes64 monitor ds_cpl vmx smx est tm2 ssse3 cx16 xtpr pdcm pcid sse4_1 sse4_2 x2apic popcnt tsc_deadline_timer aes xsave avx f16c rdrand lahf_lm ida arat epb xsaveopt pln pts dtherm tpr_shadow vnmi flexpriority ept vpid fsgsbase smep erms
bogomips	: 6385.19
clflush size	: 64
cache_alignment	: 64
address sizes	: 36 bits physical, 48 bits virtual
power management:

processor	: 3
vendor_id	: GenuineIntel
cpu family	: 6
model		: 58
model name	: Intel(R) Core(TM) i5-3470 CPU @ 3.20GHz
stepping	: 9
microcode	: 0x1b
cpu MHz		: 1600.000
cache size	: 6144 KB
physical id	: 0
siblings	: 4
core id		: 3
cpu cores	: 4
apicid		: 6
initial apicid	: 6
fpu		: yes
fpu_exception	: yes
cpuid level	: 13
wp		: yes
flags		: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush dts acpi mmx fxsr sse sse2 ss ht tm pbe syscall nx rdtscp lm constant_tsc arch_perfmon pebs bts rep_good nopl xtopology nonstop_tsc aperfmperf pni pclmulqdq dtes64 monitor ds_cpl vmx smx est tm2 ssse3 cx16 xtpr pdcm pcid sse4_1 sse4_2 x2apic popcnt tsc_deadline_timer aes xsave avx f16c rdrand lahf_lm ida arat epb xsaveopt pln pts dtherm tpr_shadow vnmi flexpriority ept vpid fsgsbase smep erms
bogomips	: 6385.19
clflush size	: 64
cache_alignment	: 64
address sizes	: 36 bits physical, 48 bits virtual
power management:
$shell> g++ -Ofast -mavx -Wall -DNDEBUG -std=c++11 -I ../boost_1_60_0 matprod.cc
$shell> ./a.out  > report.session1
$shell> cat report.session1
#   m     n     k  uBLAS:   t1       MFLOPS   Blocked:   t2      MFLOPS        Diff nrm1
  100   100   100      0.00082      2439.02        0.000832      2403.85     6.15072e-14
  200   200   200     0.006832      2341.92         0.00543      2946.59     2.71864e-15
  300   300   300     0.010102      5345.48        0.008225      6565.35     4.21274e-16
  400   400   400     0.023674      5406.78        0.018666      6857.39     1.14023e-16
  500   500   500     0.045833      5454.59        0.035914      6961.07     4.13976e-17
  600   600   600     0.081284       5314.7        0.062157      6950.14     1.79329e-17
  700   700   700     0.129532      5295.99         0.09774      7018.62     8.90805e-18
  800   800   800     0.210805      4857.57        0.148193      6909.91     4.87688e-18
  900   900   900      0.32142      4536.12          0.2082      7002.88     2.85418e-18
 1000  1000  1000       0.4617      4331.82        0.282644      7076.04     1.76828e-18
 1100  1100  1100      0.64577      4122.21        0.378013      7042.09     1.14454e-18
 1200  1200  1200     0.839814       4115.2        0.484853      7127.93     7.71812e-19
 1300  1300  1300      1.03782      4233.89        0.626335      7015.42     5.37115e-19
 1400  1400  1400      1.30557      4203.51        0.772569      7103.57     3.83696e-19
 1500  1500  1500      1.61648      4175.73        0.942957      7158.33     2.81209e-19
 1600  1600  1600       1.9694      4159.64         1.17412      6977.15     2.09789e-19
 1700  1700  1700      2.37052      4145.08         1.37257      7158.84     1.59207e-19
 1800  1800  1800      2.82108      4134.58         1.63629      7128.31     1.23088e-19
 1900  1900  1900      3.32736      4122.79         1.93345      7095.09     9.63235e-20
 2000  2000  2000      3.88764       4115.6         2.22819      7180.71     7.64161e-20
 2100  2100  2100      4.59908      4027.33          2.5915      7147.21     6.13089e-20
 2200  2200  2200      5.46934       3893.7         2.96199      7189.75     4.96631e-20
 2300  2300  2300      6.40947      3796.57         3.44146      7070.84     4.06371e-20
 2400  2400  2400      7.37459      3749.09         3.85586      7170.38     3.34798e-20
 2500  2500  2500       8.3659       3735.4         4.33895       7202.2     2.78472e-20
 2600  2600  2600      9.45125       3719.3         4.90042      7173.27     2.33145e-20
 2700  2700  2700      10.6491      3696.64         5.46821      7199.06     1.96765e-20
 2800  2800  2800      11.8888      3692.87         6.08888      7210.52     1.66894e-20
 2900  2900  2900       13.206      3693.63          6.7976      7175.77     1.42455e-20
 3000  3000  3000      14.5646      3707.62         7.47821      7220.98     1.22104e-20
 3100  3100  3100      16.0738      3706.78         8.30422      7174.91     1.05387e-20
 3200  3200  3200      17.8987       3661.5         9.15534      7158.23     9.12413e-21
 3300  3300  3300       19.513      3683.38         9.95183      7222.19     7.94422e-21
 3400  3400  3400      21.3111       3688.6         10.9218      7197.32     6.94222e-21
 3500  3500  3500      23.2202       3692.9          11.877      7219.81     6.08876e-21
 3600  3600  3600      25.2252      3699.16         12.9813       7188.2     5.36298e-21
 3700  3700  3700      27.4847      3685.91         14.0403      7215.36     4.73651e-21
 3800  3800  3800       29.853      3676.14         15.2719         7186     4.19908e-21
 3900  3900  3900      32.2609      3677.45         16.4889      7195.04     3.73515e-21
 4000  4000  4000      34.7002      3688.74         17.7425      7214.32     3.32962e-21
$shell> gnuplot plot.session1.mflops
$shell> gnuplot plot.session1.time
$shell> gnuplot plot.session1.time_log
$shell> 

MFLOPS

Time

Time with Logarithmic scale