#ifndef FMA_HPP
#define FMA_HPP
#include "gemm.hpp"
#include <type_traits>
/**
 * @brief 4x12 double-precision GEMM micro-kernel (x86-64, AVX2 + FMA,
 *        GCC/Clang extended inline assembly).
 *
 * For i in [0,4) and j in [0,12) the kernel computes
 *
 *     AB(i,j) = sum over l in [0,kc) of  A[4*l + i] * B[12*l + j]
 *
 * and then updates the element stored at C[i*incRowC_ + j*incColC_]:
 *
 *     beta == 0 :  C(i,j) = alpha*AB(i,j)                (C is never read)
 *     otherwise :  C(i,j) = alpha*AB(i,j) + beta*C(i,j)
 *
 * A NaN beta takes the general path, so it propagates into C.
 *
 * The overload only participates in overload resolution when Index is
 * convertible to std::int64_t and BlockSize<double> reports MR == 4,
 * NR == 12 and align == 32.
 *
 * @param kc_       Number of rank-1 updates; the accumulation loop is skipped
 *                  when kc_ <= 0.
 * @param alpha     Scalar applied to the accumulated product.
 * @param A         Panel read as kc_ consecutive groups of 4 doubles.
 * @param B         Panel read as kc_ consecutive groups of 12 doubles. It is
 *                  loaded with vmovapd, so every group must be 32-byte aligned.
 * @param beta      Scalar applied to the previous contents of C.
 * @param C         Destination block, accessed element by element.
 * @param incRowC_  Distance (in elements) between consecutive rows of C.
 * @param incColC_  Distance (in elements) between consecutive columns of C.
 *
 * @note B is read before kc is tested, and every loop iteration already loads
 *       the 12 doubles of the following iteration. Hence 96 bytes at B are
 *       read even for kc_ <= 0, and 96 bytes past the last consumed group are
 *       read on the final iteration.
 *       NOTE(review): presumably the packing buffers are padded accordingly;
 *       confirm against the allocator in the caller.
 */
template <typename Index>
typename std::enable_if<std::is_convertible<Index, std::int64_t>::value
                     && BlockSize<double>::MR==4
                     && BlockSize<double>::NR==12
                     && BlockSize<double>::align==32,
         void>::type
ugemm(Index kc_, double alpha,
      const double *A, const double *B,
      double beta,
      double *C, Index incRowC_, Index incColC_)
{
    // All operands are handed to the asm statement as 64-bit memory operands,
    // so widen the indices and take the addresses of the two scalars.
    std::int64_t kc      = kc_;
    std::int64_t incRowC = incRowC_;
    std::int64_t incColC = incColC_;
    double *pAlpha  = &alpha;
    double *pBeta   = &beta;
//
//  Compute AB = A*B
//
//  Register usage inside the loop:
//      ymm0            broadcast of one element of the current A group
//      ymm1..ymm3      the 12 doubles of the current B group
//      ymm(4+i)        AB(i, 0:3)
//      ymm(8+i)        AB(i, 4:7)
//      ymm(12+i)       AB(i, 8:11)      for i = 0..3
//
    __asm__ volatile
    (
    "movq      %0,           %%rdi    \n\t"  // kc
    "movq      %1,           %%rsi    \n\t"  // A
    "movq      %2,           %%rdx    \n\t"  // B
    "movq      %5,           %%rcx    \n\t"  // C
    "movq      %6,           %%r8     \n\t"  // incRowC
    "movq      %7,           %%r9     \n\t"  // incColC
    // preload the first group of B (executed even if kc <= 0)
    "vmovapd             0*32(%%rdx),        %%ymm1 \n\t"
    "vmovapd             1*32(%%rdx),        %%ymm2 \n\t"
    "vmovapd             2*32(%%rdx),        %%ymm3 \n\t"
    // clear the twelve accumulators
    "vxorpd                  %%ymm4,         %%ymm4,          %%ymm4    \n\t"
    "vxorpd                  %%ymm5,         %%ymm5,          %%ymm5    \n\t"
    "vxorpd                  %%ymm6,         %%ymm6,          %%ymm6    \n\t"
    "vxorpd                  %%ymm7,         %%ymm7,          %%ymm7    \n\t"
    "vxorpd                  %%ymm8,         %%ymm8,          %%ymm8    \n\t"
    "vxorpd                  %%ymm9,         %%ymm9,          %%ymm9    \n\t"
    "vxorpd                  %%ymm10,        %%ymm10,         %%ymm10   \n\t"
    "vxorpd                  %%ymm11,        %%ymm11,         %%ymm11   \n\t"
    "vxorpd                  %%ymm12,        %%ymm12,         %%ymm12   \n\t"
    "vxorpd                  %%ymm13,        %%ymm13,         %%ymm13   \n\t"
    "vxorpd                  %%ymm14,        %%ymm14,         %%ymm14   \n\t"
    "vxorpd                  %%ymm15,        %%ymm15,         %%ymm15   \n\t"
    // r13 = byte stride of B per iteration (12 doubles),
    // r12 = byte stride of A per iteration (4 doubles)
    "movq                    $3*32,          %%r13                      \n\t"
    "movq                    $4* 8,          %%r12                      \n\t"
    "jmp                     check%=                                    \n\t"
    "loop%=:                                                            \n\t"
    // row 0 of the rank-1 update; B pointer is advanced for the preload below
    "vbroadcastsd       0* 8(%%rsi),          %%ymm0                    \n\t"
    "addq                    %%r13,           %%rdx                     \n\t"
    "vfmadd231pd             %%ymm0,          %%ymm1,          %%ymm4   \n\t"
    "vfmadd231pd             %%ymm0,          %%ymm2,          %%ymm8   \n\t"
    "vfmadd231pd             %%ymm0,          %%ymm3,          %%ymm12  \n\t"
    // row 1; the loop counter is decremented here
    "vbroadcastsd       1* 8(%%rsi),          %%ymm0                    \n\t"
    "decq                    %%rdi                                      \n\t"
    "vfmadd231pd             %%ymm0,          %%ymm1,          %%ymm5   \n\t"
    "vfmadd231pd             %%ymm0,          %%ymm2,          %%ymm9   \n\t"
    "vfmadd231pd             %%ymm0,          %%ymm3,          %%ymm13  \n\t"
    // row 2; A pointer is advanced, so row 3 is addressed with offset -1*8
    "vbroadcastsd       2* 8(%%rsi),          %%ymm0                    \n\t"
    "addq                    %%r12,           %%rsi                     \n\t"
    "vfmadd231pd             %%ymm0,          %%ymm1,          %%ymm6   \n\t"
    "vfmadd231pd             %%ymm0,          %%ymm2,          %%ymm10  \n\t"
    "vfmadd231pd             %%ymm0,          %%ymm3,          %%ymm14  \n\t"
    // row 3, interleaved with the preload of the next B group
    "vbroadcastsd      -1* 8(%%rsi),          %%ymm0                    \n\t"
    "vfmadd231pd             %%ymm0,          %%ymm1,          %%ymm7   \n\t"
    "vmovapd            0*32(%%rdx),          %%ymm1                    \n\t"
    "vfmadd231pd             %%ymm0,          %%ymm2,          %%ymm11  \n\t"
    "vmovapd            1*32(%%rdx),          %%ymm2                    \n\t"
    "vfmadd231pd            %%ymm0,           %%ymm3,          %%ymm15  \n\t"
    "vmovapd            2*32(%%rdx),          %%ymm3                    \n\t"
    "check%=:                                                           \n\t"
    "testq                   %%rdi,           %%rdi                     \n\t"
    "jg                      loop%=                                     \n\t"
    // scale all accumulators: AB <- alpha*AB
    "movq      %3,           %%rdi                      \n\t"  // alpha
    "vbroadcastsd           (%%rdi),          %%ymm0    \n\t"
    "vmulpd                  %%ymm0,         %%ymm4,        %%ymm4      \n\t"
    "vmulpd                  %%ymm0,         %%ymm5,        %%ymm5      \n\t"
    "vmulpd                  %%ymm0,         %%ymm6,        %%ymm6      \n\t"
    "vmulpd                  %%ymm0,         %%ymm7,        %%ymm7      \n\t"
    "vmulpd                  %%ymm0,         %%ymm8,        %%ymm8      \n\t"
    "vmulpd                  %%ymm0,         %%ymm9,        %%ymm9      \n\t"
    "vmulpd                  %%ymm0,         %%ymm10,       %%ymm10     \n\t"
    "vmulpd                  %%ymm0,         %%ymm11,       %%ymm11     \n\t"
    "vmulpd                  %%ymm0,         %%ymm12,       %%ymm12     \n\t"
    "vmulpd                  %%ymm0,         %%ymm13,       %%ymm13     \n\t"
    "vmulpd                  %%ymm0,         %%ymm14,       %%ymm14     \n\t"
    "vmulpd                  %%ymm0,         %%ymm15,       %%ymm15     \n\t"
    // Turn the element increments into byte offsets (r8, r9 *= 8) and set up
    //   r10 = 2 column steps, r11 = 3 column steps,
    //   rcx / rdx / rax = address of columns 0 / 4 / 8 of the current row.
    "leaq                    (,%%r8,8),       %%r8                            \n\t"
    "leaq                    (,%%r9,8),       %%r9                            \n\t"
    "leaq                    (,%%r9,2),       %%r10        # 2*incColC        \n\t"
    "leaq                    (%%r10,%%r9),    %%r11        # 3*incColC       \n\t"
    "leaq                    (%%rcx,%%r10,2), %%rdx        # C + 4*incColC   \n\t"
    "leaq                    (%%rdx,%%r10,2), %%rax        # C + 8*incColC   \n\t"
    // check if beta == 0
    "movq      %4,           %%rdi                      \n\t"  // beta
    "vbroadcastsd           (%%rdi),          %%ymm0    \n\t"
    "vxorpd                  %%ymm1,          %%ymm1,          %%ymm1  \n\t"
    "vucomisd                %%xmm0,          %%xmm1                   \n\t"
    // An unordered compare (beta is NaN) sets ZF as well as PF, so test PF
    // first; otherwise a NaN beta would be treated like beta == 0.
    "jp                      beta_nonzero%=                            \n\t"
    "je                      beta_zero%=                               \n\t"
    "beta_nonzero%=:                                                   \n\t"
    // case: beta != 0
    // Every 1x4 piece is handled the same way: gather four elements of C into
    // xmm1/xmm2, scale them by beta, add the low/high half of the accumulator
    // and scatter the result back.
    "#                                                                          \n\t"
    "#       Update C(0,0:3)                                                    \n\t"
    "#                                                                          \n\t"
    "vmovlpd                 (%%rcx),         %%xmm1,          %%xmm1           \n\t"
    "vmovhpd                 (%%rcx,%%r9),    %%xmm1,          %%xmm1          \n\t"
    "vmovlpd                 (%%rcx,%%r10),   %%xmm2,          %%xmm2          \n\t"
    "vmovhpd                 (%%rcx,%%r11),   %%xmm2,          %%xmm2          \n\t"
    "vextractf128            $1,              %%ymm4,          %%xmm3            \n\t"
    "vmulpd                  %%xmm0,          %%xmm1,          %%xmm1           \n\t"
    "vaddpd                  %%xmm1,          %%xmm4,          %%xmm1           \n\t"
    "vmulpd                  %%xmm0,          %%xmm2,          %%xmm2           \n\t"
    "vaddpd                  %%xmm2,          %%xmm3,          %%xmm2           \n\t"
    "vmovlpd                 %%xmm1,          (%%rcx)                           \n\t"
    "vmovhpd                 %%xmm1,          (%%rcx,%%r9)                      \n\t"
    "vmovlpd                 %%xmm2,          (%%rcx,%%r10)                     \n\t"
    "vmovhpd                 %%xmm2,          (%%rcx,%%r11)                     \n\t"
    "#                                                                          \n\t"
    "#       Update C(0,4:7)                                                    \n\t"
    "#                                                                          \n\t"
    "vmovlpd                 (%%rdx),         %%xmm1,          %%xmm1           \n\t"
    "vmovhpd                 (%%rdx,%%r9),    %%xmm1,          %%xmm1           \n\t"
    "vmovlpd                 (%%rdx,%%r10),   %%xmm2,          %%xmm2           \n\t"
    "vmovhpd                 (%%rdx,%%r11),   %%xmm2,          %%xmm2           \n\t"
    "vextractf128            $1,              %%ymm8,          %%xmm3           \n\t"
    "vmulpd                  %%xmm0,          %%xmm1,          %%xmm1           \n\t"
    "vaddpd                  %%xmm1,          %%xmm8,          %%xmm1           \n\t"
    "vmulpd                  %%xmm0,          %%xmm2,          %%xmm2           \n\t"
    "vaddpd                  %%xmm2,          %%xmm3,          %%xmm2           \n\t"
    "vmovlpd                 %%xmm1,          (%%rdx)                           \n\t"
    "vmovhpd                 %%xmm1,          (%%rdx,%%r9)                      \n\t"
    "vmovlpd                 %%xmm2,          (%%rdx,%%r10)                     \n\t"
    "vmovhpd                 %%xmm2,          (%%rdx,%%r11)                     \n\t"
    "#                                                                          \n\t"
    "#       Update C(0,8:11)                                                   \n\t"
    "#                                                                          \n\t"
    "vmovlpd                 (%%rax),         %%xmm1,          %%xmm1           \n\t"
    "vmovhpd                 (%%rax,%%r9),    %%xmm1,          %%xmm1           \n\t"
    "vmovlpd                 (%%rax,%%r10),   %%xmm2,          %%xmm2           \n\t"
    "vmovhpd                 (%%rax,%%r11),   %%xmm2,          %%xmm2           \n\t"
    "vextractf128            $1,              %%ymm12,         %%xmm3           \n\t"
    "vmulpd                  %%xmm0,          %%xmm1,          %%xmm1           \n\t"
    "vaddpd                  %%xmm1,          %%xmm12,         %%xmm1           \n\t"
    "vmulpd                  %%xmm0,          %%xmm2,          %%xmm2           \n\t"
    "vaddpd                  %%xmm2,          %%xmm3,          %%xmm2           \n\t"
    "vmovlpd                 %%xmm1,          (%%rax)                           \n\t"
    "vmovhpd                 %%xmm1,          (%%rax,%%r9)                      \n\t"
    "vmovlpd                 %%xmm2,          (%%rax,%%r10)                     \n\t"
    "vmovhpd                 %%xmm2,          (%%rax,%%r11)                     \n\t"
    "#                                                                          \n\t"
    "#       Update C(1,0:3)                                                    \n\t"
    "#                                                                          \n\t"
    // step the three column pointers to the next row of C
    "addq                    %%r8,            %%rcx                             \n\t"
    "addq                    %%r8,            %%rdx                             \n\t"
    "addq                    %%r8,            %%rax                             \n\t"
    "vmovlpd                 (%%rcx),         %%xmm1,          %%xmm1           \n\t"
    "vmovhpd                 (%%rcx,%%r9),    %%xmm1,          %%xmm1           \n\t"
    "vmovlpd                 (%%rcx,%%r10),   %%xmm2,          %%xmm2           \n\t"
    "vmovhpd                 (%%rcx,%%r11),   %%xmm2,          %%xmm2           \n\t"
    "vextractf128            $1,              %%ymm5,          %%xmm3           \n\t"
    "vmulpd                  %%xmm0,          %%xmm1,          %%xmm1           \n\t"
    "vaddpd                  %%xmm1,          %%xmm5,          %%xmm1           \n\t"
    "vmulpd                  %%xmm0,          %%xmm2,          %%xmm2           \n\t"
    "vaddpd                  %%xmm2,          %%xmm3,          %%xmm2           \n\t"
    "vmovlpd                 %%xmm1,          (%%rcx)                           \n\t"
    "vmovhpd                 %%xmm1,          (%%rcx,%%r9)                      \n\t"
    "vmovlpd                 %%xmm2,          (%%rcx,%%r10)                     \n\t"
    "vmovhpd                 %%xmm2,          (%%rcx,%%r11)                     \n\t"
    "#                                                                          \n\t"
    "#       Update C(1,4:7)                                                    \n\t"
    "#                                                                          \n\t"
    "vmovlpd                 (%%rdx),          %%xmm1,          %%xmm1          \n\t"
    "vmovhpd                 (%%rdx,%%r9),     %%xmm1,          %%xmm1          \n\t"
    "vmovlpd                 (%%rdx,%%r10),    %%xmm2,          %%xmm2          \n\t"
    "vmovhpd                 (%%rdx,%%r11),    %%xmm2,          %%xmm2          \n\t"
    "vextractf128            $1,             %%ymm9,          %%xmm3            \n\t"
    "vmulpd                  %%xmm0,          %%xmm1,          %%xmm1           \n\t"
    "vaddpd                  %%xmm1,          %%xmm9,          %%xmm1           \n\t"
    "vmulpd                  %%xmm0,          %%xmm2,          %%xmm2           \n\t"
    "vaddpd                  %%xmm2,          %%xmm3,          %%xmm2           \n\t"
    "vmovlpd                 %%xmm1,          (%%rdx)                           \n\t"
    "vmovhpd                 %%xmm1,          (%%rdx,%%r9)                      \n\t"
    "vmovlpd                 %%xmm2,          (%%rdx,%%r10)                     \n\t"
    "vmovhpd                 %%xmm2,          (%%rdx,%%r11)                     \n\t"
    "#                                                                          \n\t"
    "#       Update C(1,8:11)                                                   \n\t"
    "#                                                                          \n\t"
    "vmovlpd                 (%%rax),         %%xmm1,          %%xmm1           \n\t"
    "vmovhpd                 (%%rax,%%r9),    %%xmm1,          %%xmm1           \n\t"
    "vmovlpd                 (%%rax,%%r10),   %%xmm2,          %%xmm2           \n\t"
    "vmovhpd                 (%%rax,%%r11),   %%xmm2,          %%xmm2           \n\t"
    "vextractf128            $1,              %%ymm13,         %%xmm3           \n\t"
    "vmulpd                  %%xmm0,          %%xmm1,          %%xmm1           \n\t"
    "vaddpd                  %%xmm1,          %%xmm13,         %%xmm1           \n\t"
    "vmulpd                  %%xmm0,          %%xmm2,          %%xmm2           \n\t"
    "vaddpd                  %%xmm2,          %%xmm3,          %%xmm2           \n\t"
    "vmovlpd                 %%xmm1,          (%%rax)                           \n\t"
    "vmovhpd                 %%xmm1,          (%%rax,%%r9)                      \n\t"
    "vmovlpd                 %%xmm2,          (%%rax,%%r10)                     \n\t"
    "vmovhpd                 %%xmm2,          (%%rax,%%r11)                     \n\t"
    "#                                                                          \n\t"
    "#       Update C(2,0:3)                                                    \n\t"
    "#                                                                          \n\t"
    "addq                    %%r8,            %%rcx                             \n\t"
    "addq                    %%r8,            %%rdx                             \n\t"
    "addq                    %%r8,            %%rax                             \n\t"
    "vmovlpd                 (%%rcx),         %%xmm1,          %%xmm1           \n\t"
    "vmovhpd                 (%%rcx,%%r9),    %%xmm1,          %%xmm1           \n\t"
    "vmovlpd                 (%%rcx,%%r10),   %%xmm2,          %%xmm2           \n\t"
    "vmovhpd                 (%%rcx,%%r11),   %%xmm2,          %%xmm2           \n\t"
    "vextractf128            $1,              %%ymm6,          %%xmm3           \n\t"
    "vmulpd                  %%xmm0,          %%xmm1,          %%xmm1           \n\t"
    "vaddpd                  %%xmm1,          %%xmm6,          %%xmm1           \n\t"
    "vmulpd                  %%xmm0,          %%xmm2,          %%xmm2           \n\t"
    "vaddpd                  %%xmm2,          %%xmm3,          %%xmm2           \n\t"
    "vmovlpd                 %%xmm1,          (%%rcx)                           \n\t"
    "vmovhpd                 %%xmm1,          (%%rcx,%%r9)                      \n\t"
    "vmovlpd                 %%xmm2,          (%%rcx,%%r10)                     \n\t"
    "vmovhpd                 %%xmm2,          (%%rcx,%%r11)                     \n\t"
    "#                                                                          \n\t"
    "#       Update C(2,4:7)                                                    \n\t"
    "#                                                                          \n\t"
    "vmovlpd                 (%%rdx),         %%xmm1,          %%xmm1           \n\t"
    "vmovhpd                 (%%rdx,%%r9),    %%xmm1,          %%xmm1           \n\t"
    "vmovlpd                 (%%rdx,%%r10),   %%xmm2,          %%xmm2           \n\t"
    "vmovhpd                 (%%rdx,%%r11),   %%xmm2,          %%xmm2           \n\t"
    "vextractf128            $1,              %%ymm10,         %%xmm3           \n\t"
    "vmulpd                  %%xmm0,          %%xmm1,          %%xmm1           \n\t"
    "vaddpd                  %%xmm1,          %%xmm10,         %%xmm1           \n\t"
    "vmulpd                  %%xmm0,          %%xmm2,          %%xmm2           \n\t"
    "vaddpd                  %%xmm2,          %%xmm3,          %%xmm2           \n\t"
    "vmovlpd                 %%xmm1,          (%%rdx)                           \n\t"
    "vmovhpd                 %%xmm1,          (%%rdx,%%r9)                      \n\t"
    "vmovlpd                 %%xmm2,          (%%rdx,%%r10)                     \n\t"
    "vmovhpd                 %%xmm2,          (%%rdx,%%r11)                     \n\t"
    "#                                                                          \n\t"
    "#       Update C(2,8:11)                                                   \n\t"
    "#                                                                          \n\t"
    "vmovlpd                 (%%rax),         %%xmm1,          %%xmm1           \n\t"
    "vmovhpd                 (%%rax,%%r9),    %%xmm1,          %%xmm1           \n\t"
    "vmovlpd                 (%%rax,%%r10),   %%xmm2,          %%xmm2           \n\t"
    "vmovhpd                 (%%rax,%%r11),   %%xmm2,          %%xmm2           \n\t"
    "vextractf128            $1,              %%ymm14,         %%xmm3           \n\t"
    "vmulpd                  %%xmm0,          %%xmm1,          %%xmm1           \n\t"
    "vaddpd                  %%xmm1,          %%xmm14,         %%xmm1           \n\t"
    "vmulpd                  %%xmm0,          %%xmm2,          %%xmm2           \n\t"
    "vaddpd                  %%xmm2,          %%xmm3,          %%xmm2           \n\t"
    "vmovlpd                 %%xmm1,          (%%rax)                           \n\t"
    "vmovhpd                 %%xmm1,          (%%rax,%%r9)                      \n\t"
    "vmovlpd                 %%xmm2,          (%%rax,%%r10)                     \n\t"
    "vmovhpd                 %%xmm2,          (%%rax,%%r11)                     \n\t"
    "#                                                                          \n\t"
    "#       Update C(3,0:3)                                                    \n\t"
    "#                                                                          \n\t"
    "addq                    %%r8,            %%rcx                             \n\t"
    "addq                    %%r8,            %%rdx                             \n\t"
    "addq                    %%r8,            %%rax                             \n\t"
    "vmovlpd                 (%%rcx),         %%xmm1,          %%xmm1           \n\t"
    "vmovhpd                 (%%rcx,%%r9),    %%xmm1,          %%xmm1           \n\t"
    "vmovlpd                 (%%rcx,%%r10),   %%xmm2,          %%xmm2           \n\t"
    "vmovhpd                 (%%rcx,%%r11),   %%xmm2,          %%xmm2           \n\t"
    "vextractf128            $1,              %%ymm7,          %%xmm3           \n\t"
    "vmulpd                  %%xmm0,          %%xmm1,          %%xmm1           \n\t"
    "vaddpd                  %%xmm1,          %%xmm7,          %%xmm1           \n\t"
    "vmulpd                  %%xmm0,          %%xmm2,          %%xmm2           \n\t"
    "vaddpd                  %%xmm2,          %%xmm3,          %%xmm2           \n\t"
    "vmovlpd                 %%xmm1,          (%%rcx)                           \n\t"
    "vmovhpd                 %%xmm1,          (%%rcx,%%r9)                      \n\t"
    "vmovlpd                 %%xmm2,          (%%rcx,%%r10)                     \n\t"
    "vmovhpd                 %%xmm2,          (%%rcx,%%r11)                     \n\t"
    "#                                                                          \n\t"
    "#       Update C(3,4:7)                                                    \n\t"
    "#                                                                          \n\t"
    "vmovlpd                 (%%rdx),         %%xmm1,          %%xmm1           \n\t"
    "vmovhpd                 (%%rdx,%%r9),    %%xmm1,          %%xmm1           \n\t"
    "vmovlpd                 (%%rdx,%%r10),   %%xmm2,          %%xmm2           \n\t"
    "vmovhpd                 (%%rdx,%%r11),   %%xmm2,          %%xmm2           \n\t"
    "vextractf128            $1,              %%ymm11,         %%xmm3           \n\t"
    "vmulpd                  %%xmm0,          %%xmm1,          %%xmm1           \n\t"
    "vaddpd                  %%xmm1,          %%xmm11,         %%xmm1           \n\t"
    "vmulpd                  %%xmm0,          %%xmm2,          %%xmm2           \n\t"
    "vaddpd                  %%xmm2,          %%xmm3,          %%xmm2           \n\t"
    "vmovlpd                 %%xmm1,          (%%rdx)                           \n\t"
    "vmovhpd                 %%xmm1,          (%%rdx,%%r9)                      \n\t"
    "vmovlpd                 %%xmm2,          (%%rdx,%%r10)                     \n\t"
    "vmovhpd                 %%xmm2,          (%%rdx,%%r11)                     \n\t"
    "#                                                                          \n\t"
    "#       Update C(3,8:11)                                                   \n\t"
    "#                                                                          \n\t"
    "vmovlpd                 (%%rax),         %%xmm1,          %%xmm1           \n\t"
    "vmovhpd                 (%%rax,%%r9),    %%xmm1,          %%xmm1           \n\t"
    "vmovlpd                 (%%rax,%%r10),   %%xmm2,          %%xmm2           \n\t"
    "vmovhpd                 (%%rax,%%r11),   %%xmm2,          %%xmm2           \n\t"
    "vextractf128            $1,              %%ymm15,         %%xmm3           \n\t"
    "vmulpd                  %%xmm0,          %%xmm1,          %%xmm1           \n\t"
    "vaddpd                  %%xmm1,          %%xmm15,         %%xmm1           \n\t"
    "vmulpd                  %%xmm0,          %%xmm2,          %%xmm2           \n\t"
    "vaddpd                  %%xmm2,          %%xmm3,          %%xmm2           \n\t"
    "vmovlpd                 %%xmm1,          (%%rax)                           \n\t"
    "vmovhpd                 %%xmm1,          (%%rax,%%r9)                      \n\t"
    "vmovlpd                 %%xmm2,          (%%rax,%%r10)                     \n\t"
    "vmovhpd                 %%xmm2,          (%%rax,%%r11)                     \n\t"
    "jmp done%=                              \n\t"
    // case: beta == 0
    // C is only written here: the low/high halves of each accumulator are
    // scattered directly, so previous contents of C never enter the result.
    "beta_zero%=:                           \n\t"
    "#                                                                          \n\t"
    "#       Update C(0,0:3)                                                    \n\t"
    "#                                                                          \n\t"
    "vextractf128            $1,              %%ymm4,          %%xmm3            \n\t"
    "vmovlpd                 %%xmm4,          (%%rcx)                           \n\t"
    "vmovhpd                 %%xmm4,          (%%rcx,%%r9)                      \n\t"
    "vmovlpd                 %%xmm3,          (%%rcx,%%r10)                     \n\t"
    "vmovhpd                 %%xmm3,          (%%rcx,%%r11)                     \n\t"
    "#                                                                          \n\t"
    "#       Update C(0,4:7)                                                    \n\t"
    "#                                                                          \n\t"
    "vextractf128            $1,              %%ymm8,          %%xmm3           \n\t"
    "vmovlpd                 %%xmm8,          (%%rdx)                           \n\t"
    "vmovhpd                 %%xmm8,          (%%rdx,%%r9)                      \n\t"
    "vmovlpd                 %%xmm3,          (%%rdx,%%r10)                     \n\t"
    "vmovhpd                 %%xmm3,          (%%rdx,%%r11)                     \n\t"
    "#                                                                          \n\t"
    "#       Update C(0,8:11)                                                   \n\t"
    "#                                                                          \n\t"
    "vextractf128            $1,              %%ymm12,         %%xmm3           \n\t"
    "vmovlpd                 %%xmm12,         (%%rax)                           \n\t"
    "vmovhpd                 %%xmm12,         (%%rax,%%r9)                      \n\t"
    "vmovlpd                 %%xmm3,          (%%rax,%%r10)                     \n\t"
    "vmovhpd                 %%xmm3,          (%%rax,%%r11)                     \n\t"
    "#                                                                          \n\t"
    "#       Update C(1,0:3)                                                    \n\t"
    "#                                                                          \n\t"
    "addq                    %%r8,            %%rcx                             \n\t"
    "addq                    %%r8,            %%rdx                             \n\t"
    "addq                    %%r8,            %%rax                             \n\t"
    "vextractf128            $1,              %%ymm5,          %%xmm3           \n\t"
    "vmovlpd                 %%xmm5,          (%%rcx)                           \n\t"
    "vmovhpd                 %%xmm5,          (%%rcx,%%r9)                      \n\t"
    "vmovlpd                 %%xmm3,          (%%rcx,%%r10)                     \n\t"
    "vmovhpd                 %%xmm3,          (%%rcx,%%r11)                     \n\t"
    "#                                                                          \n\t"
    "#       Update C(1,4:7)                                                    \n\t"
    "#                                                                          \n\t"
    "vextractf128            $1,             %%ymm9,          %%xmm3            \n\t"
    "vmovlpd                 %%xmm9,          (%%rdx)                           \n\t"
    "vmovhpd                 %%xmm9,          (%%rdx,%%r9)                      \n\t"
    "vmovlpd                 %%xmm3,          (%%rdx,%%r10)                     \n\t"
    "vmovhpd                 %%xmm3,          (%%rdx,%%r11)                     \n\t"
    "#                                                                          \n\t"
    "#       Update C(1,8:11)                                                   \n\t"
    "#                                                                          \n\t"
    "vextractf128            $1,              %%ymm13,         %%xmm3           \n\t"
    "vmovlpd                 %%xmm13,         (%%rax)                           \n\t"
    "vmovhpd                 %%xmm13,         (%%rax,%%r9)                      \n\t"
    "vmovlpd                 %%xmm3,          (%%rax,%%r10)                     \n\t"
    "vmovhpd                 %%xmm3,          (%%rax,%%r11)                     \n\t"
    "#                                                                          \n\t"
    "#       Update C(2,0:3)                                                    \n\t"
    "#                                                                          \n\t"
    "addq                    %%r8,            %%rcx                             \n\t"
    "addq                    %%r8,            %%rdx                             \n\t"
    "addq                    %%r8,            %%rax                             \n\t"
    "vextractf128            $1,              %%ymm6,          %%xmm3           \n\t"
    "vmovlpd                 %%xmm6,          (%%rcx)                           \n\t"
    "vmovhpd                 %%xmm6,          (%%rcx,%%r9)                      \n\t"
    "vmovlpd                 %%xmm3,          (%%rcx,%%r10)                     \n\t"
    "vmovhpd                 %%xmm3,          (%%rcx,%%r11)                     \n\t"
    "#                                                                          \n\t"
    "#       Update C(2,4:7)                                                    \n\t"
    "#                                                                          \n\t"
    "vextractf128            $1,              %%ymm10,         %%xmm3           \n\t"
    "vmovlpd                 %%xmm10,         (%%rdx)                           \n\t"
    "vmovhpd                 %%xmm10,         (%%rdx,%%r9)                      \n\t"
    "vmovlpd                 %%xmm3,          (%%rdx,%%r10)                     \n\t"
    "vmovhpd                 %%xmm3,          (%%rdx,%%r11)                     \n\t"
    "#                                                                          \n\t"
    "#       Update C(2,8:11)                                                   \n\t"
    "#                                                                          \n\t"
    "vextractf128            $1,              %%ymm14,         %%xmm3           \n\t"
    "vmovlpd                 %%xmm14,         (%%rax)                           \n\t"
    "vmovhpd                 %%xmm14,         (%%rax,%%r9)                      \n\t"
    "vmovlpd                 %%xmm3,          (%%rax,%%r10)                     \n\t"
    "vmovhpd                 %%xmm3,          (%%rax,%%r11)                     \n\t"
    "#                                                                          \n\t"
    "#       Update C(3,0:3)                                                    \n\t"
    "#                                                                          \n\t"
    "addq                    %%r8,            %%rcx                             \n\t"
    "addq                    %%r8,            %%rdx                             \n\t"
    "addq                    %%r8,            %%rax                             \n\t"
    "vextractf128            $1,              %%ymm7,          %%xmm3           \n\t"
    "vmovlpd                 %%xmm7,          (%%rcx)                           \n\t"
    "vmovhpd                 %%xmm7,          (%%rcx,%%r9)                      \n\t"
    "vmovlpd                 %%xmm3,          (%%rcx,%%r10)                     \n\t"
    "vmovhpd                 %%xmm3,          (%%rcx,%%r11)                     \n\t"
    "#                                                                          \n\t"
    "#       Update C(3,4:7)                                                    \n\t"
    "#                                                                          \n\t"
    "vextractf128            $1,              %%ymm11,         %%xmm3           \n\t"
    "vmovlpd                 %%xmm11,         (%%rdx)                           \n\t"
    "vmovhpd                 %%xmm11,         (%%rdx,%%r9)                      \n\t"
    "vmovlpd                 %%xmm3,          (%%rdx,%%r10)                     \n\t"
    "vmovhpd                 %%xmm3,          (%%rdx,%%r11)                     \n\t"
    "#                                                                          \n\t"
    "#       Update C(3,8:11)                                                   \n\t"
    "#                                                                          \n\t"
    "vextractf128            $1,              %%ymm15,         %%xmm3           \n\t"
    "vmovlpd                 %%xmm15,         (%%rax)                           \n\t"
    "vmovhpd                 %%xmm15,         (%%rax,%%r9)                      \n\t"
    "vmovlpd                 %%xmm3,          (%%rax,%%r10)                     \n\t"
    "vmovhpd                 %%xmm3,          (%%rax,%%r11)                     \n\t"
    "done%=:                           \n\t"
    : // output
    : // input
        "m" (kc),       // 0
        "m" (A),        // 1
        "m" (B),        // 2
        "m" (pAlpha),   // 3
        "m" (pBeta),    // 4
        "m" (C),        // 5
        "m" (incRowC),  // 6
        "m" (incColC)   // 7
    : // register clobber list
        "rax",  "rcx",  "rdx",    "rsi",   "rdi",
        "r8",   "r9",   "r10",  "r11", "r12", "r13",
        "xmm0", "xmm1", "xmm2",  "xmm3",  "xmm4",  "xmm5",  "xmm6",  "xmm7",
        "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15",
        "cc",       // decq/addq/testq/vucomisd modify the flags
        "memory"    // C is written (and A, B, C read) behind the compiler's back
    );
}
#endif