Content

Using Optimized Micro-Kernels

Changes:

Tar-Ball for this Session

The tar-ball session2.tgz contains the files:

$shell> tar tfvz session2.tgz
-rw-rw-r-- lehn/num       5158 2016-02-02 18:35 session2/matprod.cc
-rw-rw-r-- lehn/num      17181 2016-02-02 18:17 session2/avx.hpp
-rw-rw-r-- lehn/num      33544 2016-02-02 19:53 session2/fma.hpp
-rw-rw-r-- lehn/num       8714 2016-02-02 19:51 session2/gemm.hpp
$shell> 

Compile and Run Benchmark

For a quick benchmark we again limit \(m\) to \(500\) with -DM_MAX=500. This time we also enable the optimized micro kernel for AVX with -DHAVE_AVX:

$shell> g++ -Ofast -mavx -Wall -std=c++11 -DNDEBUG -DHAVE_AVX -DM_MAX=500 -I ../boost_1_60_0 matprod.cc
$shell> ./a.out
#   m     n     k  uBLAS:   t1       MFLOPS   Blocked:   t2      MFLOPS        Residual
  100   100   100     0.000827      2418.38        0.000323      6191.95               0
  200   200   200     0.006825      2344.32        0.001831      8738.39               0
  300   300   300      0.01007      5362.46        0.002697      20022.2     1.92848e-16
  400   400   400     0.023609      5421.66        0.005953      21501.8     8.39383e-17
  500   500   500     0.045823      5455.78        0.011097      22528.6     3.52756e-17
$shell> 

Main Program with Benchmark and Test

#include <cassert>
#include <chrono>
#include <cmath>
#include <random>
#include <boost/timer.hpp>
#include <boost/numeric/ublas/matrix.hpp>
#include <boost/numeric/ublas/io.hpp>
#include <boost/numeric/ublas/operation.hpp>
#include "gemm.hpp"

template <typename T>
struct WallTime
{
    void
    tic()
    {
        t0 = std::chrono::high_resolution_clock::now();
    }

    T
    toc()
    {
        using namespace std::chrono;

        elapsed = high_resolution_clock::now() - t0;
        return duration<T,seconds::period>(elapsed).count();
    }

    std::chrono::high_resolution_clock::time_point t0;
    std::chrono::high_resolution_clock::duration   elapsed;
};

template <typename MATRIX>
void
fill(MATRIX &A)
{
    typedef typename MATRIX::size_type  size_type;
    typedef typename MATRIX::value_type T;

    std::random_device                  random;
    std::default_random_engine          mt(random());
    std::uniform_real_distribution<T>   uniform(-100,100);

    for (size_type i=0; i<A.size1(); ++i) {
        for (size_type j=0; j<A.size2(); ++j) {
            A(i,j) = uniform(mt);
        }
    }
}

template <typename MATRIX>
typename MATRIX::value_type
asum(const MATRIX &A)
{
    typedef typename MATRIX::size_type  size_type;
    typedef typename MATRIX::value_type T;

    T asum = 0;

    for (size_type i=0; i<A.size1(); ++i) {
        for (size_type j=0; j<A.size2(); ++j) {
            asum += std::abs(A(i,j));
        }
    }
    return asum;
}

template <typename MA, typename MB, typename MC0, typename MC1>
double
estimateGemmResidual(const MA &A, const MB &B,
                     const MC0 &C0, const MC1 &C1)
{
    typedef typename MC0::size_type    size_type;

    size_type m= C1.size1();
    size_type n= C1.size2();
    size_type k= A.size2();

    double aNorm = asum(A);
    double bNorm = asum(B);
    double cNorm = asum(C1);
    double diff  = asum(C1-C0);
    // Using eps for double gives upper bound in case elements have lower
    // precision.
    double eps = std::numeric_limits<double>::epsilon();
    double res = diff/(aNorm*bNorm*cNorm*eps*std::max(std::max(m,n),k));
    return res;
}

namespace foo {

template <typename MATRIXA, typename MATRIXB, typename MATRIXC>
void
axpy_prod(const MATRIXA &A, const MATRIXB &B, MATRIXC &C, bool update)
{
    typedef typename MATRIXA::value_type TA;
    typedef typename MATRIXC::value_type TC;

    assert(A.size2()==B.size1());

    const std::ptrdiff_t m = C.size1();
    const std::ptrdiff_t n = C.size2();
    const std::ptrdiff_t k = A.size2();

    const std::ptrdiff_t incRowA = &A(1,0) - &A(0,0);
    const std::ptrdiff_t incColA = &A(0,1) - &A(0,0);

    const std::ptrdiff_t incRowB = &B(1,0) - &B(0,0);
    const std::ptrdiff_t incColB = &B(0,1) - &B(0,0);

    const std::ptrdiff_t incRowC = &C(1,0) - &C(0,0);
    const std::ptrdiff_t incColC = &C(0,1) - &C(0,0);

    gemm(m, n, k, TA(1),
         &A(0,0), incRowA, incColA,
         &B(0,0), incRowB, incColB,
         TC(update ? 0 : 1),
         &C(0,0), incRowC, incColC);
}

} // namespace foo

#ifndef M_MAX
#define M_MAX 4000
#endif

#ifndef K_MAX
#define K_MAX 4000
#endif

#ifndef N_MAX
#define N_MAX 4000
#endif

int
main()
{
    namespace ublas = boost::numeric::ublas;

    const std::size_t m_min = 100;
    const std::size_t k_min = 100;
    const std::size_t n_min = 100;

    const std::size_t m_max = M_MAX;
    const std::size_t k_max = K_MAX;
    const std::size_t n_max = N_MAX;

    const std::size_t m_inc = 100;
    const std::size_t k_inc = 100;
    const std::size_t n_inc = 100;

    const bool matprodUpdate = true;

    typedef double              T;
    typedef ublas::row_major    SO;

    std::cout << "#   m";
    std::cout << "     n";
    std::cout << "     k";
    std::cout << "  uBLAS:   t1";
    std::cout << "       MFLOPS";
    std::cout << "   Blocked:   t2";
    std::cout << "      MFLOPS";
    std::cout << "        Residual";
    std::cout << std::endl;


    WallTime<double>  walltime;

    for (std::size_t m=m_min, k=k_min, n=n_min;
         m<=m_max && k<=k_max && n<=n_max;
         m += m_inc, k += k_inc, n += n_inc)
    {
        ublas::matrix<T,SO>     A(m, k);
        ublas::matrix<T,SO>     B(k, n);
        ublas::matrix<T,SO>     C1(m, n);
        ublas::matrix<T,SO>     C2(m, n);

        fill(A);
        fill(B);
        fill(C1);
        C2 = C1;

        walltime.tic();
        ublas::axpy_prod(A, B, C1, matprodUpdate);
        double t1 = walltime.toc();

        walltime.tic();
        foo::axpy_prod(A, B, C2, matprodUpdate);
        double t2 = walltime.toc();

        double res = estimateGemmResidual(A, B, C1, C2);

        std::cout.width(5);  std::cout << m << " ";
        std::cout.width(5);  std::cout << n << " ";
        std::cout.width(5);  std::cout << k << " ";
        std::cout.width(12); std::cout << t1 << " ";
        std::cout.width(12); std::cout << 2.*m/1000.*n/1000.*k/t1 << " ";
        std::cout.width(15); std::cout << t2 << " ";
        std::cout.width(12); std::cout << 2.*m/1000.*n/1000.*k/t2 << " ";
        std::cout.width(15); std::cout << res;
        std::cout << std::endl;
    }
}

Core Function for Matrix-Matrix Produkt

// Code extracted from ulmBLAS: https://github.com/michael-lehn/ulmBLAS-core

#ifndef GEMM_HPP
#define GEMM_HPP
#include <algorithm>
#include <cstdlib>

//-- new with alignment --------------------------------------------------------
void *
malloc_(std::size_t alignment, std::size_t size)
{
    alignment = std::max(alignment, alignof(void *));
    size     += alignment;

    void *ptr  = std::malloc(size);
    void *ptr2 = (void *)(((uintptr_t)ptr + alignment) & ~(alignment-1));
    void **vp  = (void**) ptr2 - 1;
    *vp        = ptr;
    return ptr2;
}

void
free_(void *ptr)
{
    std::free(*((void**)ptr-1));
}

//-- Config --------------------------------------------------------------------

#ifdef HAVE_FMA

#   ifndef BS_D_MR
#   define BS_D_MR 4
#   endif

#   ifndef BS_D_NR
#   define BS_D_NR 12
#   endif

#   ifndef BS_D_MC
#   define BS_D_MC 256
#   endif

#   ifndef BS_D_KC
#   define BS_D_KC 512
#   endif

#   ifndef BS_D_NC
#   define BS_D_NC 4092
#   endif

#   ifndef BS_D_ALIGN
#   define BS_D_ALIGN 32
#   endif

#endif


#ifndef BS_D_MR
#define BS_D_MR 4
#endif

#ifndef BS_D_NR
#define BS_D_NR 8
#endif

#ifndef BS_D_MC
#define BS_D_MC 256
#endif

#ifndef BS_D_KC
#define BS_D_KC 256
#endif

#ifndef BS_D_NC
#define BS_D_NC 4096
#endif

#ifndef BS_D_ALIGN
#define BS_D_ALIGN 32
#endif


template <typename T>
struct BlockSize
{
    static const int MC = 64;
    static const int KC = 64;
    static const int NC = 256;
    static const int MR = 8;
    static const int NR = 8;
    static const int align = alignof(T);
    static_assert(MC>0 && KC>0 && NC>0 && MR>0 && NR>0, "Invalid block size.");
    static_assert(MC % MR == 0, "MC must be a multiple of MR.");
    static_assert(NC % NR == 0, "NC must be a multiple of NR.");
};


template <>
struct BlockSize<double>
{
    static const int MC    = BS_D_MC;
    static const int KC    = BS_D_KC;
    static const int NC    = BS_D_NC;
    static const int MR    = BS_D_MR;
    static const int NR    = BS_D_NR;

    static const int align = BS_D_ALIGN;

    static_assert(MC>0 && KC>0 && NC>0 && MR>0 && NR>0, "Invalid block size.");
    static_assert(MC % MR == 0, "MC must be a multiple of MR.");
    static_assert(NC % NR == 0, "NC must be a multiple of NR.");
};

//-- aux routines --------------------------------------------------------------
template <typename Index, typename Alpha, typename TX, typename TY>
void
geaxpy(Index m, Index n,
       const Alpha &alpha,
       const TX *X, Index incRowX, Index incColX,
       TY       *Y, Index incRowY, Index incColY)
{
    for (Index j=0; j<n; ++j) {
        for (Index i=0; i<m; ++i) {
            Y[i*incRowY+j*incColY] += alpha*X[i*incRowX+j*incColX];
        }
    }
}

template <typename Index, typename Alpha, typename TX>
void
gescal(Index m, Index n,
       const Alpha &alpha,
       TX *X, Index incRowX, Index incColX)
{
    if (alpha!=Alpha(0)) {
        for (Index j=0; j<n; ++j) {
            for (Index i=0; i<m; ++i) {
                X[i*incRowX+j*incColX] *= alpha;
            }
        }
    } else {
        for (Index j=0; j<n; ++j) {
            for (Index i=0; i<m; ++i) {
                X[i*incRowX+j*incColX] = Alpha(0);
            }
        }
    }
}


//-- Micro Kernel --------------------------------------------------------------
template <typename Index, typename T>
void
ugemm(Index kc, T alpha,
      const T *A, const T *B,
      T beta,
      T *C, Index incRowC, Index incColC)
{
    const Index MR = BlockSize<T>::MR;
    const Index NR = BlockSize<T>::NR;
    T P[BlockSize<T>::MR*BlockSize<T>::NR];

    for (Index l=0; l<MR*NR; ++l) {
        P[l] = 0;
    }
    for (Index l=0; l<kc; ++l) {
        for (Index j=0; j<NR; ++j) {
            for (Index i=0; i<MR; ++i) {
                P[i+j*MR] += A[i+l*MR]*B[l*NR+j];
            }
        }
    }
    for (Index j=0; j<NR; ++j) {
        for (Index i=0; i<MR; ++i) {
            C[i*incRowC+j*incColC] *= beta;
            C[i*incRowC+j*incColC] += alpha*P[i+j*MR];
        }
    }
}

#ifdef HAVE_AVX
#include "avx.hpp"
#endif

#ifdef HAVE_FMA
#include "fma.hpp"
#endif

//-- Macro Kernel --------------------------------------------------------------
template <typename Index, typename T, typename Beta, typename TC>
void
mgemm(Index mc, Index nc, Index kc,
      T alpha,
      const T *A, const T *B,
      Beta beta,
      TC *C, Index incRowC, Index incColC)
{
    const Index MR = BlockSize<T>::MR;
    const Index NR = BlockSize<T>::NR;
    const Index mp  = (mc+MR-1) / MR;
    const Index np  = (nc+NR-1) / NR;
    const Index mr_ = mc % MR;
    const Index nr_ = nc % NR;

    T C_[MR*NR];

    for (Index j=0; j<np; ++j) {
        const Index nr = (j!=np-1 || nr_==0) ? NR : nr_;

        for (Index i=0; i<mp; ++i) {
            const Index mr = (i!=mp-1 || mr_==0) ? MR : mr_;

            if (mr==MR && nr==NR) {
                ugemm(kc, alpha,
                      &A[i*kc*MR], &B[j*kc*NR],
                      beta,
                      &C[i*MR*incRowC+j*NR*incColC],
                      incRowC, incColC);
            } else {
                ugemm(kc, alpha,
                      &A[i*kc*MR], &B[j*kc*NR],
                      T(0),
                      C_, Index(1), MR);
                gescal(mr, nr, beta,
                       &C[i*MR*incRowC+j*NR*incColC],
                       incRowC, incColC);
                geaxpy(mr, nr, T(1), C_, Index(1), MR,
                       &C[i*MR*incRowC+j*NR*incColC],
                       incRowC, incColC);
            }
        }
    }
}

//-- Packing blocks ------------------------------------------------------------
template <typename Index, typename TA, typename T>
void
pack_A(Index mc, Index kc,
       const TA *A, Index incRowA, Index incColA,
       T *p)
{
    Index MR = BlockSize<T>::MR;
    Index mp = (mc+MR-1) / MR;

    for (Index j=0; j<kc; ++j) {
        for (Index l=0; l<mp; ++l) {
            for (Index i0=0; i0<MR; ++i0) {
                Index i  = l*MR + i0;
                Index nu = l*MR*kc + j*MR + i0;
                p[nu]   = (i<mc) ? A[i*incRowA+j*incColA]
                                 : T(0);
            }
        }
    }
}

template <typename Index, typename TB, typename T>
void
pack_B(Index kc, Index nc,
       const TB *B, Index incRowB, Index incColB,
       T *p)
{
    Index NR = BlockSize<T>::NR;
    Index np = (nc+NR-1) / NR;

    for (Index l=0; l<np; ++l) {
        for (Index j0=0; j0<NR; ++j0) {
            for (Index i=0; i<kc; ++i) {
                Index j  = l*NR+j0;
                Index nu = l*NR*kc + i*NR + j0;
                p[nu]   = (j<nc) ? B[i*incRowB+j*incColB]
                                 : T(0);
            }
        }
    }
}

//-- Frame routine -------------------------------------------------------------
template <typename Index, typename Alpha,
         typename TA, typename TB,
         typename Beta,
         typename TC>
void
gemm(Index m, Index n, Index k,
     Alpha alpha,
     const TA *A, Index incRowA, Index incColA,
     const TB *B, Index incRowB, Index incColB,
     Beta beta,
     TC *C, Index incRowC, Index incColC)
{
    typedef typename std::common_type<Alpha, TA, TB>::type  T;

    const Index MC = BlockSize<T>::MC;
    const Index NC = BlockSize<T>::NC;
    const Index MR = BlockSize<T>::MR;
    const Index NR = BlockSize<T>::NR;

    const Index KC = BlockSize<T>::KC;
    const Index mb = (m+MC-1) / MC;
    const Index nb = (n+NC-1) / NC;
    const Index kb = (k+KC-1) / KC;
    const Index mc_ = m % MC;
    const Index nc_ = n % NC;
    const Index kc_ = k % KC;

    T *A_ = (T*) malloc_(BlockSize<T>::align, sizeof(T)*(MC*KC+MR));
    T *B_ = (T*) malloc_(BlockSize<T>::align, sizeof(T)*(KC*NC+NR));

    if (alpha==Alpha(0) || k==0) {
        gescal(m, n, beta, C, incRowC, incColC);
        return;
    }

    for (Index j=0; j<nb; ++j) {
        Index nc = (j!=nb-1 || nc_==0) ? NC : nc_;

        for (Index l=0; l<kb; ++l) {
            Index   kc  = (l!=kb-1 || kc_==0) ? KC : kc_;
            Beta beta_  = (l==0) ? beta : Beta(1);

            pack_B(kc, nc,
                   &B[l*KC*incRowB+j*NC*incColB],
                   incRowB, incColB,
                   B_);

            for (Index i=0; i<mb; ++i) {
                Index mc = (i!=mb-1 || mc_==0) ? MC : mc_;

                pack_A(mc, kc,
                       &A[i*MC*incRowA+l*KC*incColA],
                       incRowA, incColA,
                       A_);

                mgemm(mc, nc, kc,
                      T(alpha), A_, B_, beta_,
                      &C[i*MC*incRowC+j*NC*incColC],
                      incRowC, incColC);
            }
        }
    }
    free_(A_);
    free_(B_);
}

#endif

Micro-Kernel for AVX

#ifndef AVX_HPP
#define AVX_HPP

#include "gemm.hpp"
#include <type_traits>

template <typename Index>
typename std::enable_if<std::is_convertible<Index, std::int64_t>::value
                     && BlockSize<double>::MR==4
                     && BlockSize<double>::NR==8
                     && BlockSize<double>::align==32,
         void>::type
ugemm(Index kc_, double alpha,
      const double *A, const double *B,
      double beta,
      double *C, Index incRowC_, Index incColC_)
{
    int64_t kc      = kc_;
    int64_t incRowC = incRowC_;
    int64_t incColC = incColC_;

    double *pAlpha  = &alpha;
    double *pBeta   = &beta;

//
//  Compute AB = A*B
//
    __asm__ volatile
    (
    "movq      %0,           %%rdi    \n\t"  // kc
    "movq      %1,           %%rsi    \n\t"  // A
    "movq      %2,           %%rdx    \n\t"  // B
    "movq      %5,           %%rcx    \n\t"  // C
    "movq      %6,           %%r8     \n\t"  // incRowC
    "movq      %7,           %%r9     \n\t"  // incColC

    "vmovapd           0 * 32(%%rdx),         %%ymm4\n\t"

    "vbroadcastsd       0 * 8(%%rsi),         %%ymm0\n\t"
    "vbroadcastsd       1 * 8(%%rsi),         %%ymm1\n\t"
    "vbroadcastsd       2 * 8(%%rsi),         %%ymm2\n\t"
    "vbroadcastsd       3 * 8(%%rsi),         %%ymm3\n\t"

    "vxorpd                  %%ymm8,          %%ymm8,          %%ymm8\n\t"
    "vxorpd                  %%ymm9,          %%ymm9,          %%ymm9\n\t"
    "vxorpd                  %%ymm10,         %%ymm10,         %%ymm10\n\t"
    "vxorpd                  %%ymm11,         %%ymm11,         %%ymm11\n\t"
    "vxorpd                  %%ymm12,         %%ymm12,         %%ymm12\n\t"
    "vxorpd                  %%ymm13,         %%ymm13,         %%ymm13\n\t"
    "vxorpd                  %%ymm14,         %%ymm14,         %%ymm14\n\t"
    "vxorpd                  %%ymm15,         %%ymm15,         %%ymm15\n\t"

    "jmp                     check%=\n\t"

    "loop%=:\n\t"

    "vmovapd           1 * 32(%%rdx),         %%ymm5\n\t"

    "vmulpd                  %%ymm0,          %%ymm4,          %%ymm6\n\t"
    "vaddpd                  %%ymm6,          %%ymm8,          %%ymm8\n\t"
    "vmulpd                  %%ymm1,          %%ymm4,          %%ymm7\n\t"
    "vaddpd                  %%ymm7,          %%ymm9,          %%ymm9\n\t"
    "vmulpd                  %%ymm2,          %%ymm4,          %%ymm6\n\t"
    "vaddpd                  %%ymm6,          %%ymm10,         %%ymm10\n\t"
    "vmulpd                  %%ymm3,          %%ymm4,          %%ymm7\n\t"
    "vaddpd                  %%ymm7,          %%ymm11,         %%ymm11\n\t"

    "vmovapd           2 * 32(%%rdx),         %%ymm4\n\t"

    "vmulpd                  %%ymm0,          %%ymm5,          %%ymm6\n\t"
    "vaddpd                  %%ymm6,          %%ymm12,         %%ymm12\n\t"
    "vbroadcastsd       4 * 8(%%rsi),         %%ymm0\n\t"
    "vmulpd                  %%ymm1,          %%ymm5,          %%ymm7\n\t"
    "vaddpd                  %%ymm7,          %%ymm13,         %%ymm13\n\t"
    "vbroadcastsd       5 * 8(%%rsi),         %%ymm1\n\t"
    "vmulpd                  %%ymm2,          %%ymm5,          %%ymm6\n\t"
    "vaddpd                  %%ymm6,          %%ymm14,         %%ymm14\n\t"
    "vbroadcastsd       6 * 8(%%rsi),         %%ymm2\n\t"
    "vmulpd                  %%ymm3,          %%ymm5,          %%ymm7\n\t"
    "vaddpd                  %%ymm7,          %%ymm15,         %%ymm15\n\t"
    "vbroadcastsd       7 * 8(%%rsi),         %%ymm3\n\t"

    "addq                    $32,            %%rsi\n\t"
    "addq                    $2*32,          %%rdx\n\t"
    "decq                    %%rdi\n\t"

    "check%=:\n\t"
    "testq                   %%rdi,           %%rdi\n\t"
    "jg                      loop%=\n\t"

    "movq      %3,           %%rdi                  \n\t"  // alpha
    "movq      %4,           %%rsi                  \n\t"  // beta
    "vbroadcastsd           (%%rdi),          %%ymm6\n\t"
    "vbroadcastsd           (%%rsi),          %%ymm7\n\t"


    "vmulpd                  %%ymm6,          %%ymm8,          %%ymm8\n\t"
    "vmulpd                  %%ymm6,          %%ymm9,          %%ymm9\n\t"
    "vmulpd                  %%ymm6,          %%ymm10,         %%ymm10\n\t"
    "vmulpd                  %%ymm6,          %%ymm11,         %%ymm11\n\t"
    "vmulpd                  %%ymm6,          %%ymm12,         %%ymm12\n\t"
    "vmulpd                  %%ymm6,          %%ymm13,         %%ymm13\n\t"
    "vmulpd                  %%ymm6,          %%ymm14,         %%ymm14\n\t"
    "vmulpd                  %%ymm6,          %%ymm15,         %%ymm15\n\t"

    "leaq                    (,%%r8,8),       %%r8\n\t"
    "leaq                    (,%%r9,8),       %%r9\n\t"

    "leaq                    (,%%r9,2),       %%r10\n\t"
    "leaq                    (%%r10,%%r9),    %%r11\n\t"
    "leaq                    (%%rcx,%%r10,2), %%rdx\n\t"

    // check if beta == 0
    "vxorpd                  %%ymm0,          %%ymm0,          %%ymm0  \n\t"
    "vucomisd                %%xmm0,          %%xmm7                   \n\t"
    "je                      beta_zero%=                              \n\t"

    // case: beta != 0
    "#\n\t"
    "#       Update C(0,:)\n\t"
    "#\n\t"
    "vmovlpd                 (%%rcx),         %%xmm0,          %%xmm0\n\t"
    "vmovhpd                 (%%rcx,%%r9),    %%xmm0,          %%xmm0\n\t"
    "vmovlpd                 (%%rcx,%%r10),   %%xmm1,          %%xmm1\n\t"
    "vmovhpd                 (%%rcx,%%r11),   %%xmm1,          %%xmm1\n\t"
    "vmovlpd                 (%%rdx),         %%xmm2,          %%xmm2\n\t"
    "vmovhpd                 (%%rdx,%%r9),    %%xmm2,          %%xmm2\n\t"
    "vmovlpd                 (%%rdx,%%r10),   %%xmm3,          %%xmm3\n\t"
    "vmovhpd                 (%%rdx,%%r11),   %%xmm3,          %%xmm3\n\t"

    "vmulpd                  %%xmm7,          %%xmm0,          %%xmm0\n\t"
    "vmulpd                  %%xmm7,          %%xmm1,          %%xmm1\n\t"
    "vmulpd                  %%xmm7,          %%xmm2,          %%xmm2\n\t"
    "vmulpd                  %%xmm7,          %%xmm3,          %%xmm3\n\t"

    "vextractf128            $1,              %%ymm8,          %%xmm4\n\t"
    "vextractf128            $1,              %%ymm12,         %%xmm5\n\t"

    "vaddpd                  %%xmm0,          %%xmm8,          %%xmm0\n\t"
    "vaddpd                  %%xmm1,          %%xmm4,          %%xmm1\n\t"
    "vaddpd                  %%xmm2,          %%xmm12,         %%xmm2\n\t"
    "vaddpd                  %%xmm3,          %%xmm5,          %%xmm3\n\t"

    "vmovlpd                 %%xmm0,          (%%rcx)\n\t"
    "vmovhpd                 %%xmm0,          (%%rcx,%%r9)\n\t"
    "vmovlpd                 %%xmm1,          (%%rcx,%%r10)\n\t"
    "vmovhpd                 %%xmm1,          (%%rcx,%%r11)\n\t"
    "vmovlpd                 %%xmm2,          (%%rdx)\n\t"
    "vmovhpd                 %%xmm2,          (%%rdx,%%r9)\n\t"
    "vmovlpd                 %%xmm3,          (%%rdx,%%r10)\n\t"
    "vmovhpd                 %%xmm3,          (%%rdx,%%r11)\n\t"

    "#\n\t"
    "#       Update C(1,:)\n\t"
    "#\n\t"
    "addq                    %%r8,            %%rcx\n\t"
    "addq                    %%r8,            %%rdx\n\t"

    "vmovlpd                 (%%rcx),         %%xmm0,          %%xmm0\n\t"
    "vmovhpd                 (%%rcx,%%r9),    %%xmm0,          %%xmm0\n\t"
    "vmovlpd                 (%%rcx,%%r10),   %%xmm1,          %%xmm1\n\t"
    "vmovhpd                 (%%rcx,%%r11),   %%xmm1,          %%xmm1\n\t"
    "vmovlpd                 (%%rdx),         %%xmm2,          %%xmm2\n\t"
    "vmovhpd                 (%%rdx,%%r9),    %%xmm2,          %%xmm2\n\t"
    "vmovlpd                 (%%rdx,%%r10),   %%xmm3,          %%xmm3\n\t"
    "vmovhpd                 (%%rdx,%%r11),   %%xmm3,          %%xmm3\n\t"

    "vmulpd                  %%xmm7,          %%xmm0,          %%xmm0\n\t"
    "vmulpd                  %%xmm7,          %%xmm1,          %%xmm1\n\t"
    "vmulpd                  %%xmm7,          %%xmm2,          %%xmm2\n\t"
    "vmulpd                  %%xmm7,          %%xmm3,          %%xmm3\n\t"

    "vextractf128            $1,              %%ymm9,          %%xmm4\n\t"
    "vextractf128            $1,              %%ymm13,         %%xmm5\n\t"

    "vaddpd                  %%xmm0,          %%xmm9,          %%xmm0\n\t"
    "vaddpd                  %%xmm1,          %%xmm4,          %%xmm1\n\t"
    "vaddpd                  %%xmm2,          %%xmm13,         %%xmm2\n\t"
    "vaddpd                  %%xmm3,          %%xmm5,          %%xmm3\n\t"

    "vmovlpd                 %%xmm0,          (%%rcx)\n\t"
    "vmovhpd                 %%xmm0,          (%%rcx,%%r9)\n\t"
    "vmovlpd                 %%xmm1,          (%%rcx,%%r10)\n\t"
    "vmovhpd                 %%xmm1,          (%%rcx,%%r11)\n\t"
    "vmovlpd                 %%xmm2,          (%%rdx)\n\t"
    "vmovhpd                 %%xmm2,          (%%rdx,%%r9)\n\t"
    "vmovlpd                 %%xmm3,          (%%rdx,%%r10)\n\t"
    "vmovhpd                 %%xmm3,          (%%rdx,%%r11)\n\t"

    "#\n\t"
    "#       Update C(2,:)\n\t"
    "#\n\t"
    "addq                    %%r8,            %%rcx\n\t"
    "addq                    %%r8,            %%rdx\n\t"

    "vmovlpd                 (%%rcx),         %%xmm0,          %%xmm0\n\t"
    "vmovhpd                 (%%rcx,%%r9),    %%xmm0,          %%xmm0\n\t"
    "vmovlpd                 (%%rcx,%%r10),   %%xmm1,          %%xmm1\n\t"
    "vmovhpd                 (%%rcx,%%r11),   %%xmm1,          %%xmm1\n\t"
    "vmovlpd                 (%%rdx),         %%xmm2,          %%xmm2\n\t"
    "vmovhpd                 (%%rdx,%%r9),    %%xmm2,          %%xmm2\n\t"
    "vmovlpd                 (%%rdx,%%r10),   %%xmm3,          %%xmm3\n\t"
    "vmovhpd                 (%%rdx,%%r11),   %%xmm3,          %%xmm3\n\t"

    "vmulpd                  %%xmm7,          %%xmm0,          %%xmm0\n\t"
    "vmulpd                  %%xmm7,          %%xmm1,          %%xmm1\n\t"
    "vmulpd                  %%xmm7,          %%xmm2,          %%xmm2\n\t"
    "vmulpd                  %%xmm7,          %%xmm3,          %%xmm3\n\t"

    "vextractf128            $1,              %%ymm10,         %%xmm4\n\t"
    "vextractf128            $1,              %%ymm14,         %%xmm5\n\t"

    "vaddpd                  %%xmm0,          %%xmm10,         %%xmm0\n\t"
    "vaddpd                  %%xmm1,          %%xmm4,          %%xmm1\n\t"
    "vaddpd                  %%xmm2,          %%xmm14,         %%xmm2\n\t"
    "vaddpd                  %%xmm3,          %%xmm5,          %%xmm3\n\t"

    "vmovlpd                 %%xmm0,          (%%rcx)\n\t"
    "vmovhpd                 %%xmm0,          (%%rcx,%%r9)\n\t"
    "vmovlpd                 %%xmm1,          (%%rcx,%%r10)\n\t"
    "vmovhpd                 %%xmm1,          (%%rcx,%%r11)\n\t"
    "vmovlpd                 %%xmm2,          (%%rdx)\n\t"
    "vmovhpd                 %%xmm2,          (%%rdx,%%r9)\n\t"
    "vmovlpd                 %%xmm3,          (%%rdx,%%r10)\n\t"
    "vmovhpd                 %%xmm3,          (%%rdx,%%r11)\n\t"

    "#\n\t"
    "#       Update C(3,:)\n\t"
    "#\n\t"
    "addq                    %%r8,            %%rcx\n\t"
    "addq                    %%r8,            %%rdx\n\t"

    "vmovlpd                 (%%rcx),         %%xmm0,          %%xmm0\n\t"
    "vmovhpd                 (%%rcx,%%r9),    %%xmm0,          %%xmm0\n\t"
    "vmovlpd                 (%%rcx,%%r10),   %%xmm1,          %%xmm1\n\t"
    "vmovhpd                 (%%rcx,%%r11),   %%xmm1,          %%xmm1\n\t"
    "vmovlpd                 (%%rdx),         %%xmm2,          %%xmm2\n\t"
    "vmovhpd                 (%%rdx,%%r9),    %%xmm2,          %%xmm2\n\t"
    "vmovlpd                 (%%rdx,%%r10),   %%xmm3,          %%xmm3\n\t"
    "vmovhpd                 (%%rdx,%%r11),   %%xmm3,          %%xmm3\n\t"

    "vmulpd                  %%xmm7,          %%xmm0,          %%xmm0\n\t"
    "vmulpd                  %%xmm7,          %%xmm1,          %%xmm1\n\t"
    "vmulpd                  %%xmm7,          %%xmm2,          %%xmm2\n\t"
    "vmulpd                  %%xmm7,          %%xmm3,          %%xmm3\n\t"

    "vextractf128            $1,              %%ymm11,         %%xmm4\n\t"
    "vextractf128            $1,              %%ymm15,         %%xmm5\n\t"

    "vaddpd                  %%xmm0,          %%xmm11,         %%xmm0\n\t"
    "vaddpd                  %%xmm1,          %%xmm4,          %%xmm1\n\t"
    "vaddpd                  %%xmm2,          %%xmm15,         %%xmm2\n\t"
    "vaddpd                  %%xmm3,          %%xmm5,          %%xmm3\n\t"

    "vmovlpd                 %%xmm0,          (%%rcx)\n\t"
    "vmovhpd                 %%xmm0,          (%%rcx,%%r9)\n\t"
    "vmovlpd                 %%xmm1,          (%%rcx,%%r10)\n\t"
    "vmovhpd                 %%xmm1,          (%%rcx,%%r11)\n\t"
    "vmovlpd                 %%xmm2,          (%%rdx)\n\t"
    "vmovhpd                 %%xmm2,          (%%rdx,%%r9)\n\t"
    "vmovlpd                 %%xmm3,          (%%rdx,%%r10)\n\t"
    "vmovhpd                 %%xmm3,          (%%rdx,%%r11)\n\t"

    "jmp done%=                              \n\t"

    // case: beta == 0
    "beta_zero%=:                           \n\t"
    "#\n\t"
    "#       Update C(0,:)\n\t"
    "#\n\t"
    "vextractf128            $1,              %%ymm8,         %%xmm4\n\t"
    "vextractf128            $1,              %%ymm12,        %%xmm5\n\t"

    "vmovlpd                 %%xmm8,          (%%rcx)\n\t"
    "vmovhpd                 %%xmm8,          (%%rcx,%%r9)\n\t"
    "vmovlpd                 %%xmm4,          (%%rcx,%%r10)\n\t"
    "vmovhpd                 %%xmm4,          (%%rcx,%%r11)\n\t"
    "vmovlpd                 %%xmm12,         (%%rdx)\n\t"
    "vmovhpd                 %%xmm12,         (%%rdx,%%r9)\n\t"
    "vmovlpd                 %%xmm5,          (%%rdx,%%r10)\n\t"
    "vmovhpd                 %%xmm5,          (%%rdx,%%r11)\n\t"

    "#\n\t"
    "#       Update C(1,:)\n\t"
    "#\n\t"
    "addq                    %%r8,            %%rcx\n\t"
    "addq                    %%r8,            %%rdx\n\t"

    "vextractf128            $1,              %%ymm9,         %%xmm4\n\t"
    "vextractf128            $1,              %%ymm13,        %%xmm5\n\t"

    "vmovlpd                 %%xmm9,          (%%rcx)\n\t"
    "vmovhpd                 %%xmm9,          (%%rcx,%%r9)\n\t"
    "vmovlpd                 %%xmm4,          (%%rcx,%%r10)\n\t"
    "vmovhpd                 %%xmm4,          (%%rcx,%%r11)\n\t"
    "vmovlpd                 %%xmm13,         (%%rdx)\n\t"
    "vmovhpd                 %%xmm13,         (%%rdx,%%r9)\n\t"
    "vmovlpd                 %%xmm5,          (%%rdx,%%r10)\n\t"
    "vmovhpd                 %%xmm5,          (%%rdx,%%r11)\n\t"

    "#\n\t"
    "#       Update C(2,:)\n\t"
    "#\n\t"
    "addq                    %%r8,            %%rcx\n\t"
    "addq                    %%r8,            %%rdx\n\t"

    "vextractf128            $1,              %%ymm10,        %%xmm4\n\t"
    "vextractf128            $1,              %%ymm14,        %%xmm5\n\t"

    "vmovlpd                 %%xmm10,         (%%rcx)\n\t"
    "vmovhpd                 %%xmm10,         (%%rcx,%%r9)\n\t"
    "vmovlpd                 %%xmm4,          (%%rcx,%%r10)\n\t"
    "vmovhpd                 %%xmm4,          (%%rcx,%%r11)\n\t"
    "vmovlpd                 %%xmm14,         (%%rdx)\n\t"
    "vmovhpd                 %%xmm14,         (%%rdx,%%r9)\n\t"
    "vmovlpd                 %%xmm5,          (%%rdx,%%r10)\n\t"
    "vmovhpd                 %%xmm5,          (%%rdx,%%r11)\n\t"

    "#\n\t"
    "#       Update C(3,:)\n\t"
    "#\n\t"
    "addq                    %%r8,            %%rcx\n\t"
    "addq                    %%r8,            %%rdx\n\t"

    "vextractf128            $1,              %%ymm11,        %%xmm4\n\t"
    "vextractf128            $1,              %%ymm15,        %%xmm5\n\t"

    "vmovlpd                 %%xmm11,         (%%rcx)\n\t"
    "vmovhpd                 %%xmm11,         (%%rcx,%%r9)\n\t"
    "vmovlpd                 %%xmm4,          (%%rcx,%%r10)\n\t"
    "vmovhpd                 %%xmm4,          (%%rcx,%%r11)\n\t"
    "vmovlpd                 %%xmm15,         (%%rdx)\n\t"
    "vmovhpd                 %%xmm15,         (%%rdx,%%r9)\n\t"
    "vmovlpd                 %%xmm5,          (%%rdx,%%r10)\n\t"
    "vmovhpd                 %%xmm5,          (%%rdx,%%r11)\n\t"

    "done%=:                           \n\t"

    : // output
    : // input
        "m" (kc),       // 0
        "m" (A),        // 1
        "m" (B),        // 2
        "m" (pAlpha),   // 3
        "m" (pBeta),    // 4
        "m" (C),        // 5
        "m" (incRowC),  // 6
        "m" (incColC)   // 7
    : // register clobber list
        "rax",  "rbx",  "rcx",  "rdx",    "rsi",   "rdi",
        "r8",   "r9",   "r10",  "r11",
        "xmm0", "xmm1", "xmm2",  "xmm3",  "xmm4",  "xmm5",  "xmm6",  "xmm7",
        "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15",
        "memory"
    );
}

#endif

Micro-Kernel for FMA

#ifndef FMA_HPP
#define FMA_HPP

#include "gemm.hpp"
#include <type_traits>

template <typename Index>
typename std::enable_if<std::is_convertible<Index, std::int64_t>::value
                     && BlockSize<double>::MR==4
                     && BlockSize<double>::NR==12
                     && BlockSize<double>::align==32,
         void>::type
ugemm(Index kc_, double alpha,
      const double *A, const double *B,
      double beta,
      double *C, Index incRowC_, Index incColC_)
{
    int64_t kc      = kc_;
    int64_t incRowC = incRowC_;
    int64_t incColC = incColC_;

    double *pAlpha  = &alpha;
    double *pBeta   = &beta;

//
//  Compute AB = A*B
//
    __asm__ volatile
    (
    "movq      %0,           %%rdi    \n\t"  // kc
    "movq      %1,           %%rsi    \n\t"  // A
    "movq      %2,           %%rdx    \n\t"  // B
    "movq      %5,           %%rcx    \n\t"  // C
    "movq      %6,           %%r8     \n\t"  // incRowC
    "movq      %7,           %%r9     \n\t"  // incColC



    "vmovapd             0*32(%%rdx),        %%ymm1 \n\t"
    "vmovapd             1*32(%%rdx),        %%ymm2 \n\t"
    "vmovapd             2*32(%%rdx),        %%ymm3 \n\t"

    "vxorpd                  %%ymm4,         %%ymm4,          %%ymm4    \n\t"
    "vxorpd                  %%ymm5,         %%ymm5,          %%ymm5    \n\t"
    "vxorpd                  %%ymm6,         %%ymm6,          %%ymm6    \n\t"
    "vxorpd                  %%ymm7,         %%ymm7,          %%ymm7    \n\t"
    "vxorpd                  %%ymm8,         %%ymm8,          %%ymm8    \n\t"
    "vxorpd                  %%ymm9,         %%ymm9,          %%ymm9    \n\t"
    "vxorpd                  %%ymm10,        %%ymm10,         %%ymm10   \n\t"
    "vxorpd                  %%ymm11,        %%ymm11,         %%ymm11   \n\t"
    "vxorpd                  %%ymm12,        %%ymm12,         %%ymm12   \n\t"
    "vxorpd                  %%ymm13,        %%ymm13,         %%ymm13   \n\t"
    "vxorpd                  %%ymm14,        %%ymm14,         %%ymm14   \n\t"
    "vxorpd                  %%ymm15,        %%ymm15,         %%ymm15   \n\t"

    "movq                    $3*32,          %%r13                      \n\t"
    "movq                    $4* 8,          %%r12                      \n\t"

    "jmp                     check%=                                    \n\t"

    "loop%=:                                                            \n\t"

    "vbroadcastsd       0* 8(%%rsi),          %%ymm0                    \n\t"
    "addq                    %%r13,           %%rdx                     \n\t"
    "vfmadd231pd             %%ymm0,          %%ymm1,          %%ymm4   \n\t"
    "vfmadd231pd             %%ymm0,          %%ymm2,          %%ymm8   \n\t"
    "vfmadd231pd             %%ymm0,          %%ymm3,          %%ymm12  \n\t"

    "vbroadcastsd       1* 8(%%rsi),          %%ymm0                    \n\t"
    "decq                    %%rdi                                      \n\t"
    "vfmadd231pd             %%ymm0,          %%ymm1,          %%ymm5   \n\t"
    "vfmadd231pd             %%ymm0,          %%ymm2,          %%ymm9   \n\t"
    "vfmadd231pd             %%ymm0,          %%ymm3,          %%ymm13  \n\t"


    "vbroadcastsd       2* 8(%%rsi),          %%ymm0                    \n\t"
    "addq                    %%r12,           %%rsi                     \n\t"
    "vfmadd231pd             %%ymm0,          %%ymm1,          %%ymm6   \n\t"
    "vfmadd231pd             %%ymm0,          %%ymm2,          %%ymm10  \n\t"
    "vfmadd231pd             %%ymm0,          %%ymm3,          %%ymm14  \n\t"

    "vbroadcastsd      -1* 8(%%rsi),          %%ymm0                    \n\t"
    "vfmadd231pd             %%ymm0,          %%ymm1,          %%ymm7   \n\t"
    "vmovapd            0*32(%%rdx),          %%ymm1                    \n\t"
    "vfmadd231pd             %%ymm0,          %%ymm2,          %%ymm11  \n\t"
    "vmovapd            1*32(%%rdx),          %%ymm2                    \n\t"
    "vfmadd231pd            %%ymm0,           %%ymm3,          %%ymm15  \n\t"
    "vmovapd            2*32(%%rdx),          %%ymm3                    \n\t"

    "check%=:                                                           \n\t"
    "testq                   %%rdi,           %%rdi                     \n\t"
    "jg                      loop%=                                     \n\t"

    "movq      %3,           %%rdi                      \n\t"  // alpha
    "vbroadcastsd           (%%rdi),          %%ymm0    \n\t"

    "vmulpd                  %%ymm0,         %%ymm4,        %%ymm4      \n\t"
    "vmulpd                  %%ymm0,         %%ymm5,        %%ymm5      \n\t"
    "vmulpd                  %%ymm0,         %%ymm6,        %%ymm6      \n\t"
    "vmulpd                  %%ymm0,         %%ymm7,        %%ymm7      \n\t"
    "vmulpd                  %%ymm0,         %%ymm8,        %%ymm8      \n\t"
    "vmulpd                  %%ymm0,         %%ymm9,        %%ymm9      \n\t"
    "vmulpd                  %%ymm0,         %%ymm10,       %%ymm10     \n\t"
    "vmulpd                  %%ymm0,         %%ymm11,       %%ymm11     \n\t"
    "vmulpd                  %%ymm0,         %%ymm12,       %%ymm12     \n\t"
    "vmulpd                  %%ymm0,         %%ymm13,       %%ymm13     \n\t"
    "vmulpd                  %%ymm0,         %%ymm14,       %%ymm14     \n\t"
    "vmulpd                  %%ymm0,         %%ymm15,       %%ymm15     \n\t"


    "leaq                    (,%%r8,8),       %%r8                            \n\t"
    "leaq                    (,%%r9,8),       %%r9                            \n\t"

    "leaq                    (,%%r9,2),       %%r10        # 2*incColC        \n\t"
    "leaq                    (%%r10,%%r9),    %%r11        # 3*incColC       \n\t"
    "leaq                    (%%rcx,%%r10,2), %%rdx        # C + 4*incColC   \n\t"
    "leaq                    (%%rdx,%%r10,2), %%rax        # C + 8*incColC   \n\t"

    // check if beta == 0
    "movq      %4,           %%rdi                      \n\t"  // beta
    "vbroadcastsd           (%%rdi),          %%ymm0    \n\t"
    "vxorpd                  %%ymm1,          %%ymm1,          %%ymm1  \n\t"
    "vucomisd                %%xmm0,          %%xmm1                   \n\t"
    "je                      beta_zero%=                               \n\t"

    // case: beta != 0
    "#                                                                          \n\t"
    "#       Update C(0,0:3)                                                    \n\t"
    "#                                                                          \n\t"
    "vmovlpd                 (%%rcx),         %%xmm1,          %%xmm1           \n\t"
    "vmovhpd                 (%%rcx,%%r9),    %%xmm1,          %%xmm1          \n\t"
    "vmovlpd                 (%%rcx,%%r10),   %%xmm2,          %%xmm2          \n\t"
    "vmovhpd                 (%%rcx,%%r11),   %%xmm2,          %%xmm2          \n\t"

    "vextractf128            $1,              %%ymm4,          %%xmm3            \n\t"

    "vmulpd                  %%xmm0,          %%xmm1,          %%xmm1           \n\t"
    "vaddpd                  %%xmm1,          %%xmm4,          %%xmm1           \n\t"
    "vmulpd                  %%xmm0,          %%xmm2,          %%xmm2           \n\t"
    "vaddpd                  %%xmm2,          %%xmm3,          %%xmm2           \n\t"

    "vmovlpd                 %%xmm1,          (%%rcx)                           \n\t"
    "vmovhpd                 %%xmm1,          (%%rcx,%%r9)                      \n\t"
    "vmovlpd                 %%xmm2,          (%%rcx,%%r10)                     \n\t"
    "vmovhpd                 %%xmm2,          (%%rcx,%%r11)                     \n\t"

    "#                                                                          \n\t"
    "#       Update C(0,4:7)                                                    \n\t"
    "#                                                                          \n\t"
    "vmovlpd                 (%%rdx),         %%xmm1,          %%xmm1           \n\t"
    "vmovhpd                 (%%rdx,%%r9),    %%xmm1,          %%xmm1           \n\t"
    "vmovlpd                 (%%rdx,%%r10),   %%xmm2,          %%xmm2           \n\t"
    "vmovhpd                 (%%rdx,%%r11),   %%xmm2,          %%xmm2           \n\t"

    "vextractf128            $1,              %%ymm8,          %%xmm3           \n\t"

    "vmulpd                  %%xmm0,          %%xmm1,          %%xmm1           \n\t"
    "vaddpd                  %%xmm1,          %%xmm8,          %%xmm1           \n\t"
    "vmulpd                  %%xmm0,          %%xmm2,          %%xmm2           \n\t"
    "vaddpd                  %%xmm2,          %%xmm3,          %%xmm2           \n\t"

    "vmovlpd                 %%xmm1,          (%%rdx)                           \n\t"
    "vmovhpd                 %%xmm1,          (%%rdx,%%r9)                      \n\t"
    "vmovlpd                 %%xmm2,          (%%rdx,%%r10)                     \n\t"
    "vmovhpd                 %%xmm2,          (%%rdx,%%r11)                     \n\t"

    "#                                                                          \n\t"
    "#       Update C(0,8:11)                                                   \n\t"
    "#                                                                          \n\t"
    "vmovlpd                 (%%rax),         %%xmm1,          %%xmm1           \n\t"
    "vmovhpd                 (%%rax,%%r9),    %%xmm1,          %%xmm1           \n\t"
    "vmovlpd                 (%%rax,%%r10),   %%xmm2,          %%xmm2           \n\t"
    "vmovhpd                 (%%rax,%%r11),   %%xmm2,          %%xmm2           \n\t"

    "vextractf128            $1,              %%ymm12,         %%xmm3           \n\t"

    "vmulpd                  %%xmm0,          %%xmm1,          %%xmm1           \n\t"
    "vaddpd                  %%xmm1,          %%xmm12,         %%xmm1           \n\t"
    "vmulpd                  %%xmm0,          %%xmm2,          %%xmm2           \n\t"
    "vaddpd                  %%xmm2,          %%xmm3,          %%xmm2           \n\t"

    "vmovlpd                 %%xmm1,          (%%rax)                           \n\t"
    "vmovhpd                 %%xmm1,          (%%rax,%%r9)                      \n\t"
    "vmovlpd                 %%xmm2,          (%%rax,%%r10)                     \n\t"
    "vmovhpd                 %%xmm2,          (%%rax,%%r11)                     \n\t"

    "#                                                                          \n\t"
    "#       Update C(1,0:3)                                                    \n\t"
    "#                                                                          \n\t"
    "addq                    %%r8,            %%rcx                             \n\t"
    "addq                    %%r8,            %%rdx                             \n\t"
    "addq                    %%r8,            %%rax                             \n\t"

    "vmovlpd                 (%%rcx),         %%xmm1,          %%xmm1           \n\t"
    "vmovhpd                 (%%rcx,%%r9),    %%xmm1,          %%xmm1           \n\t"
    "vmovlpd                 (%%rcx,%%r10),   %%xmm2,          %%xmm2           \n\t"
    "vmovhpd                 (%%rcx,%%r11),   %%xmm2,          %%xmm2           \n\t"

    "vextractf128            $1,              %%ymm5,          %%xmm3           \n\t"

    "vmulpd                  %%xmm0,          %%xmm1,          %%xmm1           \n\t"
    "vaddpd                  %%xmm1,          %%xmm5,          %%xmm1           \n\t"
    "vmulpd                  %%xmm0,          %%xmm2,          %%xmm2           \n\t"
    "vaddpd                  %%xmm2,          %%xmm3,          %%xmm2           \n\t"

    "vmovlpd                 %%xmm1,          (%%rcx)                           \n\t"
    "vmovhpd                 %%xmm1,          (%%rcx,%%r9)                      \n\t"
    "vmovlpd                 %%xmm2,          (%%rcx,%%r10)                     \n\t"
    "vmovhpd                 %%xmm2,          (%%rcx,%%r11)                     \n\t"

    "#                                                                          \n\t"
    "#       Update C(1,4:7)                                                    \n\t"
    "#                                                                          \n\t"
    "vmovlpd                 (%%rdx),          %%xmm1,          %%xmm1          \n\t"
    "vmovhpd                 (%%rdx,%%r9),     %%xmm1,          %%xmm1          \n\t"
    "vmovlpd                 (%%rdx,%%r10),    %%xmm2,          %%xmm2          \n\t"
    "vmovhpd                 (%%rdx,%%r11),    %%xmm2,          %%xmm2          \n\t"

    "vextractf128            $1,             %%ymm9,          %%xmm3            \n\t"

    "vmulpd                  %%xmm0,          %%xmm1,          %%xmm1           \n\t"
    "vaddpd                  %%xmm1,          %%xmm9,          %%xmm1           \n\t"
    "vmulpd                  %%xmm0,          %%xmm2,          %%xmm2           \n\t"
    "vaddpd                  %%xmm2,          %%xmm3,          %%xmm2           \n\t"

    "vmovlpd                 %%xmm1,          (%%rdx)                           \n\t"
    "vmovhpd                 %%xmm1,          (%%rdx,%%r9)                      \n\t"
    "vmovlpd                 %%xmm2,          (%%rdx,%%r10)                     \n\t"
    "vmovhpd                 %%xmm2,          (%%rdx,%%r11)                     \n\t"

    "#                                                                          \n\t"
    "#       Update C(1,8:11)                                                   \n\t"
    "#                                                                          \n\t"
    "vmovlpd                 (%%rax),         %%xmm1,          %%xmm1           \n\t"
    "vmovhpd                 (%%rax,%%r9),    %%xmm1,          %%xmm1           \n\t"
    "vmovlpd                 (%%rax,%%r10),   %%xmm2,          %%xmm2           \n\t"
    "vmovhpd                 (%%rax,%%r11),   %%xmm2,          %%xmm2           \n\t"

    "vextractf128            $1,              %%ymm13,         %%xmm3           \n\t"

    "vmulpd                  %%xmm0,          %%xmm1,          %%xmm1           \n\t"
    "vaddpd                  %%xmm1,          %%xmm13,         %%xmm1           \n\t"
    "vmulpd                  %%xmm0,          %%xmm2,          %%xmm2           \n\t"
    "vaddpd                  %%xmm2,          %%xmm3,          %%xmm2           \n\t"

    "vmovlpd                 %%xmm1,          (%%rax)                           \n\t"
    "vmovhpd                 %%xmm1,          (%%rax,%%r9)                      \n\t"
    "vmovlpd                 %%xmm2,          (%%rax,%%r10)                     \n\t"
    "vmovhpd                 %%xmm2,          (%%rax,%%r11)                     \n\t"

    "#                                                                          \n\t"
    "#       Update C(2,0:3)                                                    \n\t"
    "#                                                                          \n\t"
    "addq                    %%r8,            %%rcx                             \n\t"
    "addq                    %%r8,            %%rdx                             \n\t"
    "addq                    %%r8,            %%rax                             \n\t"

    "vmovlpd                 (%%rcx),         %%xmm1,          %%xmm1           \n\t"
    "vmovhpd                 (%%rcx,%%r9),    %%xmm1,          %%xmm1           \n\t"
    "vmovlpd                 (%%rcx,%%r10),   %%xmm2,          %%xmm2           \n\t"
    "vmovhpd                 (%%rcx,%%r11),   %%xmm2,          %%xmm2           \n\t"

    "vextractf128            $1,              %%ymm6,          %%xmm3           \n\t"

    "vmulpd                  %%xmm0,          %%xmm1,          %%xmm1           \n\t"
    "vaddpd                  %%xmm1,          %%xmm6,          %%xmm1           \n\t"
    "vmulpd                  %%xmm0,          %%xmm2,          %%xmm2           \n\t"
    "vaddpd                  %%xmm2,          %%xmm3,          %%xmm2           \n\t"

    "vmovlpd                 %%xmm1,          (%%rcx)                           \n\t"
    "vmovhpd                 %%xmm1,          (%%rcx,%%r9)                      \n\t"
    "vmovlpd                 %%xmm2,          (%%rcx,%%r10)                     \n\t"
    "vmovhpd                 %%xmm2,          (%%rcx,%%r11)                     \n\t"

    "#                                                                          \n\t"
    "#       Update C(2,4:7)                                                    \n\t"
    "#                                                                          \n\t"
    "vmovlpd                 (%%rdx),         %%xmm1,          %%xmm1           \n\t"
    "vmovhpd                 (%%rdx,%%r9),    %%xmm1,          %%xmm1           \n\t"
    "vmovlpd                 (%%rdx,%%r10),   %%xmm2,          %%xmm2           \n\t"
    "vmovhpd                 (%%rdx,%%r11),   %%xmm2,          %%xmm2           \n\t"

    "vextractf128            $1,              %%ymm10,         %%xmm3           \n\t"

    "vmulpd                  %%xmm0,          %%xmm1,          %%xmm1           \n\t"
    "vaddpd                  %%xmm1,          %%xmm10,         %%xmm1           \n\t"
    "vmulpd                  %%xmm0,          %%xmm2,          %%xmm2           \n\t"
    "vaddpd                  %%xmm2,          %%xmm3,          %%xmm2           \n\t"

    "vmovlpd                 %%xmm1,          (%%rdx)                           \n\t"
    "vmovhpd                 %%xmm1,          (%%rdx,%%r9)                      \n\t"
    "vmovlpd                 %%xmm2,          (%%rdx,%%r10)                     \n\t"
    "vmovhpd                 %%xmm2,          (%%rdx,%%r11)                     \n\t"

    "#                                                                          \n\t"
    "#       Update C(2,8:11)                                                   \n\t"
    "#                                                                          \n\t"
    "vmovlpd                 (%%rax),         %%xmm1,          %%xmm1           \n\t"
    "vmovhpd                 (%%rax,%%r9),    %%xmm1,          %%xmm1           \n\t"
    "vmovlpd                 (%%rax,%%r10),   %%xmm2,          %%xmm2           \n\t"
    "vmovhpd                 (%%rax,%%r11),   %%xmm2,          %%xmm2           \n\t"

    "vextractf128            $1,              %%ymm14,         %%xmm3           \n\t"

    "vmulpd                  %%xmm0,          %%xmm1,          %%xmm1           \n\t"
    "vaddpd                  %%xmm1,          %%xmm14,         %%xmm1           \n\t"
    "vmulpd                  %%xmm0,          %%xmm2,          %%xmm2           \n\t"
    "vaddpd                  %%xmm2,          %%xmm3,          %%xmm2           \n\t"

    "vmovlpd                 %%xmm1,          (%%rax)                           \n\t"
    "vmovhpd                 %%xmm1,          (%%rax,%%r9)                      \n\t"
    "vmovlpd                 %%xmm2,          (%%rax,%%r10)                     \n\t"
    "vmovhpd                 %%xmm2,          (%%rax,%%r11)                     \n\t"

    "#                                                                          \n\t"
    "#       Update C(3,0:3)                                                    \n\t"
    "#                                                                          \n\t"
    "addq                    %%r8,            %%rcx                             \n\t"
    "addq                    %%r8,            %%rdx                             \n\t"
    "addq                    %%r8,            %%rax                             \n\t"

    "vmovlpd                 (%%rcx),         %%xmm1,          %%xmm1           \n\t"
    "vmovhpd                 (%%rcx,%%r9),    %%xmm1,          %%xmm1           \n\t"
    "vmovlpd                 (%%rcx,%%r10),   %%xmm2,          %%xmm2           \n\t"
    "vmovhpd                 (%%rcx,%%r11),   %%xmm2,          %%xmm2           \n\t"

    "vextractf128            $1,              %%ymm7,          %%xmm3           \n\t"

    "vmulpd                  %%xmm0,          %%xmm1,          %%xmm1           \n\t"
    "vaddpd                  %%xmm1,          %%xmm7,          %%xmm1           \n\t"
    "vmulpd                  %%xmm0,          %%xmm2,          %%xmm2           \n\t"
    "vaddpd                  %%xmm2,          %%xmm3,          %%xmm2           \n\t"

    "vmovlpd                 %%xmm1,          (%%rcx)                           \n\t"
    "vmovhpd                 %%xmm1,          (%%rcx,%%r9)                      \n\t"
    "vmovlpd                 %%xmm2,          (%%rcx,%%r10)                     \n\t"
    "vmovhpd                 %%xmm2,          (%%rcx,%%r11)                     \n\t"

    "#                                                                          \n\t"
    "#       Update C(3,4:7)                                                    \n\t"
    "#                                                                          \n\t"
    "vmovlpd                 (%%rdx),         %%xmm1,          %%xmm1           \n\t"
    "vmovhpd                 (%%rdx,%%r9),    %%xmm1,          %%xmm1           \n\t"
    "vmovlpd                 (%%rdx,%%r10),   %%xmm2,          %%xmm2           \n\t"
    "vmovhpd                 (%%rdx,%%r11),   %%xmm2,          %%xmm2           \n\t"

    "vextractf128            $1,              %%ymm11,         %%xmm3           \n\t"

    "vmulpd                  %%xmm0,          %%xmm1,          %%xmm1           \n\t"
    "vaddpd                  %%xmm1,          %%xmm11,         %%xmm1           \n\t"
    "vmulpd                  %%xmm0,          %%xmm2,          %%xmm2           \n\t"
    "vaddpd                  %%xmm2,          %%xmm3,          %%xmm2           \n\t"

    "vmovlpd                 %%xmm1,          (%%rdx)                           \n\t"
    "vmovhpd                 %%xmm1,          (%%rdx,%%r9)                      \n\t"
    "vmovlpd                 %%xmm2,          (%%rdx,%%r10)                     \n\t"
    "vmovhpd                 %%xmm2,          (%%rdx,%%r11)                     \n\t"

    "#                                                                          \n\t"
    "#       Update C(3,8:11)                                                   \n\t"
    "#                                                                          \n\t"
    "vmovlpd                 (%%rax),         %%xmm1,          %%xmm1           \n\t"
    "vmovhpd                 (%%rax,%%r9),    %%xmm1,          %%xmm1           \n\t"
    "vmovlpd                 (%%rax,%%r10),   %%xmm2,          %%xmm2           \n\t"
    "vmovhpd                 (%%rax,%%r11),   %%xmm2,          %%xmm2           \n\t"

    "vextractf128            $1,              %%ymm15,         %%xmm3           \n\t"

    "vmulpd                  %%xmm0,          %%xmm1,          %%xmm1           \n\t"
    "vaddpd                  %%xmm1,          %%xmm15,         %%xmm1           \n\t"
    "vmulpd                  %%xmm0,          %%xmm2,          %%xmm2           \n\t"
    "vaddpd                  %%xmm2,          %%xmm3,          %%xmm2           \n\t"

    "vmovlpd                 %%xmm1,          (%%rax)                           \n\t"
    "vmovhpd                 %%xmm1,          (%%rax,%%r9)                      \n\t"
    "vmovlpd                 %%xmm2,          (%%rax,%%r10)                     \n\t"
    "vmovhpd                 %%xmm2,          (%%rax,%%r11)                     \n\t"

    "jmp done%=                              \n\t"

    // case: beta == 0
    "beta_zero%=:                           \n\t"
    "#                                                                          \n\t"
    "#       Update C(0,0:3)                                                    \n\t"
    "#                                                                          \n\t"

    "vextractf128            $1,              %%ymm4,          %%xmm3            \n\t"

    "vmovlpd                 %%xmm4,          (%%rcx)                           \n\t"
    "vmovhpd                 %%xmm4,          (%%rcx,%%r9)                      \n\t"
    "vmovlpd                 %%xmm3,          (%%rcx,%%r10)                     \n\t"
    "vmovhpd                 %%xmm3,          (%%rcx,%%r11)                     \n\t"

    "#                                                                          \n\t"
    "#       Update C(0,4:7)                                                    \n\t"
    "#                                                                          \n\t"

    "vextractf128            $1,              %%ymm8,          %%xmm3           \n\t"

    "vmovlpd                 %%xmm8,          (%%rdx)                           \n\t"
    "vmovhpd                 %%xmm8,          (%%rdx,%%r9)                      \n\t"
    "vmovlpd                 %%xmm3,          (%%rdx,%%r10)                     \n\t"
    "vmovhpd                 %%xmm3,          (%%rdx,%%r11)                     \n\t"

    "#                                                                          \n\t"
    "#       Update C(0,8:11)                                                   \n\t"
    "#                                                                          \n\t"

    "vextractf128            $1,              %%ymm12,         %%xmm3           \n\t"

    "vmovlpd                 %%xmm12,         (%%rax)                           \n\t"
    "vmovhpd                 %%xmm12,         (%%rax,%%r9)                      \n\t"
    "vmovlpd                 %%xmm3,          (%%rax,%%r10)                     \n\t"
    "vmovhpd                 %%xmm3,          (%%rax,%%r11)                     \n\t"

    "#                                                                          \n\t"
    "#       Update C(1,0:3)                                                    \n\t"
    "#                                                                          \n\t"
    "addq                    %%r8,            %%rcx                             \n\t"
    "addq                    %%r8,            %%rdx                             \n\t"
    "addq                    %%r8,            %%rax                             \n\t"

    "vextractf128            $1,              %%ymm5,          %%xmm3           \n\t"

    "vmovlpd                 %%xmm5,          (%%rcx)                           \n\t"
    "vmovhpd                 %%xmm5,          (%%rcx,%%r9)                      \n\t"
    "vmovlpd                 %%xmm3,          (%%rcx,%%r10)                     \n\t"
    "vmovhpd                 %%xmm3,          (%%rcx,%%r11)                     \n\t"

    "#                                                                          \n\t"
    "#       Update C(1,4:7)                                                    \n\t"
    "#                                                                          \n\t"

    "vextractf128            $1,             %%ymm9,          %%xmm3            \n\t"

    "vmovlpd                 %%xmm9,          (%%rdx)                           \n\t"
    "vmovhpd                 %%xmm9,          (%%rdx,%%r9)                      \n\t"
    "vmovlpd                 %%xmm3,          (%%rdx,%%r10)                     \n\t"
    "vmovhpd                 %%xmm3,          (%%rdx,%%r11)                     \n\t"

    "#                                                                          \n\t"
    "#       Update C(1,8:11)                                                   \n\t"
    "#                                                                          \n\t"

    "vextractf128            $1,              %%ymm13,         %%xmm3           \n\t"

    "vmovlpd                 %%xmm13,         (%%rax)                           \n\t"
    "vmovhpd                 %%xmm13,         (%%rax,%%r9)                      \n\t"
    "vmovlpd                 %%xmm3,          (%%rax,%%r10)                     \n\t"
    "vmovhpd                 %%xmm3,          (%%rax,%%r11)                     \n\t"

    "#                                                                          \n\t"
    "#       Update C(2,0:3)                                                    \n\t"
    "#                                                                          \n\t"
    "addq                    %%r8,            %%rcx                             \n\t"
    "addq                    %%r8,            %%rdx                             \n\t"
    "addq                    %%r8,            %%rax                             \n\t"

    "vextractf128            $1,              %%ymm6,          %%xmm3           \n\t"

    "vmovlpd                 %%xmm6,          (%%rcx)                           \n\t"
    "vmovhpd                 %%xmm6,          (%%rcx,%%r9)                      \n\t"
    "vmovlpd                 %%xmm3,          (%%rcx,%%r10)                     \n\t"
    "vmovhpd                 %%xmm3,          (%%rcx,%%r11)                     \n\t"

    "#                                                                          \n\t"
    "#       Update C(2,4:7)                                                    \n\t"
    "#                                                                          \n\t"

    "vextractf128            $1,              %%ymm10,         %%xmm3           \n\t"

    "vmovlpd                 %%xmm10,         (%%rdx)                           \n\t"
    "vmovhpd                 %%xmm10,         (%%rdx,%%r9)                      \n\t"
    "vmovlpd                 %%xmm3,          (%%rdx,%%r10)                     \n\t"
    "vmovhpd                 %%xmm3,          (%%rdx,%%r11)                     \n\t"

    "#                                                                          \n\t"
    "#       Update C(2,8:11)                                                   \n\t"
    "#                                                                          \n\t"

    "vextractf128            $1,              %%ymm14,         %%xmm3           \n\t"

    "vmovlpd                 %%xmm14,         (%%rax)                           \n\t"
    "vmovhpd                 %%xmm14,         (%%rax,%%r9)                      \n\t"
    "vmovlpd                 %%xmm3,          (%%rax,%%r10)                     \n\t"
    "vmovhpd                 %%xmm3,          (%%rax,%%r11)                     \n\t"

    "#                                                                          \n\t"
    "#       Update C(3,0:3)                                                    \n\t"
    "#                                                                          \n\t"
    "addq                    %%r8,            %%rcx                             \n\t"
    "addq                    %%r8,            %%rdx                             \n\t"
    "addq                    %%r8,            %%rax                             \n\t"

    "vextractf128            $1,              %%ymm7,          %%xmm3           \n\t"

    "vmovlpd                 %%xmm7,          (%%rcx)                           \n\t"
    "vmovhpd                 %%xmm7,          (%%rcx,%%r9)                      \n\t"
    "vmovlpd                 %%xmm3,          (%%rcx,%%r10)                     \n\t"
    "vmovhpd                 %%xmm3,          (%%rcx,%%r11)                     \n\t"

    "#                                                                          \n\t"
    "#       Update C(3,4:7)                                                    \n\t"
    "#                                                                          \n\t"

    "vextractf128            $1,              %%ymm11,         %%xmm3           \n\t"

    "vmovlpd                 %%xmm11,         (%%rdx)                           \n\t"
    "vmovhpd                 %%xmm11,         (%%rdx,%%r9)                      \n\t"
    "vmovlpd                 %%xmm3,          (%%rdx,%%r10)                     \n\t"
    "vmovhpd                 %%xmm3,          (%%rdx,%%r11)                     \n\t"

    "#                                                                          \n\t"
    "#       Update C(3,8:11)                                                   \n\t"
    "#                                                                          \n\t"

    "vextractf128            $1,              %%ymm15,         %%xmm3           \n\t"

    "vmovlpd                 %%xmm15,         (%%rax)                           \n\t"
    "vmovhpd                 %%xmm15,         (%%rax,%%r9)                      \n\t"
    "vmovlpd                 %%xmm3,          (%%rax,%%r10)                     \n\t"
    "vmovhpd                 %%xmm3,          (%%rax,%%r11)                     \n\t"

    "done%=:                           \n\t"
    : // output
    : // input
        "m" (kc),       // 0
        "m" (A),        // 1
        "m" (B),        // 2
        "m" (pAlpha),   // 3
        "m" (pBeta),    // 4
        "m" (C),        // 5
        "m" (incRowC),  // 6
        "m" (incColC)   // 7
    : // register clobber list
        "rax",  "rcx",  "rdx",    "rsi",   "rdi",
        "r8",   "r9",   "r10",  "r11", "r12", "r13",
        "xmm0", "xmm1", "xmm2",  "xmm3",  "xmm4",  "xmm5",  "xmm6",  "xmm7",
        "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15",
        "memory"
    );
}

#endif

Benchmark Results

$shell> g++ -Ofast -mavx -Wall -std=c++11 -DHAVE_AVX -DNDEBUG -I ../boost_1_60_0 matprod.cc
$shell> ./a.out  > report.session2
$shell> cat report.session2
#   m     n     k  uBLAS:   t1       MFLOPS   Blocked:   t2      MFLOPS        Residual
  100   100   100     0.000809      2472.19        0.000323      6191.95               0
  200   200   200     0.006877       2326.6         0.00152      10526.3               0
  300   300   300     0.010098      5347.59        0.002711      19918.8     1.94339e-16
  400   400   400     0.023644      5413.64        0.005979      21408.3     8.40192e-17
  500   500   500     0.045822      5455.89        0.011117      22488.1     3.52697e-17
  600   600   600     0.080257      5382.71        0.019715      21912.2     1.63972e-17
  700   700   700     0.128891      5322.33        0.030326      22620.9     8.44478e-18
  800   800   800     0.210201      4871.53        0.047688      21472.9     4.71309e-18
  900   900   900     0.325499      4479.28        0.063481      22967.5     2.79898e-18
 1000  1000  1000     0.470301       4252.6        0.086339      23164.5     1.76616e-18
 1100  1100  1100     0.624556      4262.23        0.119658      22246.7       1.149e-18
 1200  1200  1200     0.811154       4260.6         0.15355      22507.3     7.78732e-19
 1300  1300  1300        1.036       4241.3        0.194676      22570.8     5.45933e-19
 1400  1400  1400      1.30352      4210.13        0.244264      22467.5     3.89593e-19
 1500  1500  1500      1.61592      4177.18         0.29786      22661.7      2.8652e-19
 1600  1600  1600      1.96915      4160.16        0.381175      21491.4     2.13964e-19
 1700  1700  1700      2.36788      4149.71        0.435644      22555.1     1.62834e-19
 1800  1800  1800      2.81765      4139.62        0.512775      22746.8     1.26254e-19
 1900  1900  1900      3.32306      4128.12        0.602307      22775.8     9.85705e-20
 2000  2000  2000      3.88255         4121        0.692417      23107.5     7.82885e-20
 2100  2100  2100      4.60511      4022.05        0.792981      23357.4     6.28576e-20
 2200  2200  2200      5.47546      3889.35        0.902376      23599.9     5.08775e-20
 2300  2300  2300      6.39626      3804.41         1.07557      22624.2     4.17117e-20
 2400  2400  2400      7.35844      3757.32         1.20236      22994.8     3.43739e-20
 2500  2500  2500      8.37275      3732.35         1.32592      23568.6     2.85856e-20
 2600  2600  2600      9.45366      3718.35         1.52877      22993.7     2.39389e-20
 2700  2700  2700      10.6354       3701.4         1.67988      23433.9     2.01962e-20
 2800  2800  2800      11.8683      3699.25         1.88038      23348.5     1.71236e-20
 2900  2900  2900       13.202      3694.74         2.09965      23231.5     1.45996e-20
 3000  3000  3000      14.5456      3712.47         2.28774      23604.1     1.25328e-20
 3100  3100  3100      16.0865      3703.85         2.50491      23786.1     1.08093e-20
 3200  3200  3200       17.895      3662.25         2.85963      22917.7     9.35243e-21
 3300  3300  3300      19.5051      3684.88         3.03928      23648.4     8.14503e-21
 3400  3400  3400      21.3004      3690.44         3.39132      23179.2     7.11654e-21
 3500  3500  3500      23.2126      3694.12         3.63449      23593.4     6.24089e-21
 3600  3600  3600      25.2271      3698.88          4.0155        23238     5.49184e-21
 3700  3700  3700      27.4812      3686.37         4.29658      23578.3     4.85238e-21
 3800  3800  3800      29.7953      3683.26           4.696      23369.7      4.3001e-21
 3900  3900  3900      32.1891      3685.66          5.0543      23472.7     3.82209e-21
 4000  4000  4000      34.6342      3695.76         5.34636      23941.5     3.41023e-21
$shell> gnuplot plot.session2.mflops
$shell> gnuplot plot.session2.time
$shell> gnuplot plot.session2.time_log
$shell> 

MFLOPS

Time

Time with Logarithmic scale