#ifndef FMA_HPP
#define FMA_HPP
#include "gemm.hpp"
#include <type_traits>
/// 4x12 double-precision GEMM micro-kernel (AVX2 + FMA, x86-64 AT&T inline asm).
///
/// Computes  C := beta*C + alpha*(A*B)  for one 4x12 block of C, where A is a
/// packed MR=4 panel (4 doubles per k-step; see r12 = 4*8 below) and B is a
/// packed NR=12 panel (12 doubles = 3*32 bytes per k-step; see r13 = 3*32
/// below).  Enabled via SFINAE only for the exact block sizes / alignment the
/// asm is hard-coded for: MR==4, NR==12, 32-byte alignment (required by the
/// aligned vmovapd loads of B).
///
/// kc_       number of rank-1 updates (k-extent of the packed panels)
/// alpha     scalar applied to the accumulated product A*B
/// A, B      packed panels; B must be 32-byte aligned
/// beta      scalar applied to C; when beta == 0 the C block is never read,
///           only written (so garbage/NaN in uninitialized C is ignored)
/// C         pointer to element (0,0) of the 4x12 C block
/// incRowC_  row stride of C in elements (scaled to bytes inside the asm)
/// incColC_  column stride of C in elements (scaled to bytes inside the asm)
template <typename Index>
typename std::enable_if<std::is_convertible<Index, std::int64_t>::value
&& BlockSize<double>::MR==4
&& BlockSize<double>::NR==12
&& BlockSize<double>::align==32,
void>::type
ugemm(Index kc_, double alpha,
const double *A, const double *B,
double beta,
double *C, Index incRowC_, Index incColC_)
{
// Widen the index arguments to a fixed 64-bit type so the "m" asm operands
// have a known size regardless of what Index is.
int64_t kc = kc_;
int64_t incRowC = incRowC_;
int64_t incColC = incColC_;
// The scalars are passed to the asm by address and vbroadcastsd'ed there.
double *pAlpha = &alpha;
double *pBeta = &beta;
//
// Compute AB = A*B
//
__asm__ volatile
(
// Load the memory operands into fixed registers up front; the rest of the
// kernel assumes this exact register assignment.
"movq %0, %%rdi \n\t" // rdi = kc (loop counter)
"movq %1, %%rsi \n\t" // rsi = A (packed MR=4 panel)
"movq %2, %%rdx \n\t" // rdx = B (packed NR=12 panel)
"movq %5, %%rcx \n\t" // rcx = C
"movq %6, %%r8 \n\t" // r8 = incRowC (in elements, scaled later)
"movq %7, %%r9 \n\t" // r9 = incColC (in elements, scaled later)
// Pre-load the first k-step of B: 12 doubles = 3 ymm registers.
// NOTE(review): these loads happen before the kc check below, so 96 bytes
// of B are read even when kc <= 0 — presumably callers guarantee kc >= 1;
// confirm.
"vmovapd 0*32(%%rdx), %%ymm1 \n\t"
"vmovapd 1*32(%%rdx), %%ymm2 \n\t"
"vmovapd 2*32(%%rdx), %%ymm3 \n\t"
// Zero the 12 accumulators.  Layout: row i of the 4x12 block is held in
// ymm(4+i) = C(i,0:3), ymm(8+i) = C(i,4:7), ymm(12+i) = C(i,8:11).
"vxorpd %%ymm4, %%ymm4, %%ymm4 \n\t"
"vxorpd %%ymm5, %%ymm5, %%ymm5 \n\t"
"vxorpd %%ymm6, %%ymm6, %%ymm6 \n\t"
"vxorpd %%ymm7, %%ymm7, %%ymm7 \n\t"
"vxorpd %%ymm8, %%ymm8, %%ymm8 \n\t"
"vxorpd %%ymm9, %%ymm9, %%ymm9 \n\t"
"vxorpd %%ymm10, %%ymm10, %%ymm10 \n\t"
"vxorpd %%ymm11, %%ymm11, %%ymm11 \n\t"
"vxorpd %%ymm12, %%ymm12, %%ymm12 \n\t"
"vxorpd %%ymm13, %%ymm13, %%ymm13 \n\t"
"vxorpd %%ymm14, %%ymm14, %%ymm14 \n\t"
"vxorpd %%ymm15, %%ymm15, %%ymm15 \n\t"
// Per-k-step panel strides in bytes: B advances 3*32 (12 doubles),
// A advances 4*8 (4 doubles).
"movq $3*32, %%r13 \n\t"
"movq $4* 8, %%r12 \n\t"
"jmp check%= \n\t"
// Main k-loop: one rank-1 update of the 4x12 block per iteration.
// Software-pipelined: the current step uses B in ymm1-3 while the pointer
// bumps, counter decrement, and the *next* step's B loads are interleaved
// with the FMAs.
"loop%=: \n\t"
"vbroadcastsd 0* 8(%%rsi), %%ymm0 \n\t" // ymm0 = A(0,k)
"addq %%r13, %%rdx \n\t" // B += 12 doubles
"vfmadd231pd %%ymm0, %%ymm1, %%ymm4 \n\t" // row 0, cols 0:3
"vfmadd231pd %%ymm0, %%ymm2, %%ymm8 \n\t" // row 0, cols 4:7
"vfmadd231pd %%ymm0, %%ymm3, %%ymm12 \n\t" // row 0, cols 8:11
"vbroadcastsd 1* 8(%%rsi), %%ymm0 \n\t" // ymm0 = A(1,k)
"decq %%rdi \n\t" // --kc
"vfmadd231pd %%ymm0, %%ymm1, %%ymm5 \n\t"
"vfmadd231pd %%ymm0, %%ymm2, %%ymm9 \n\t"
"vfmadd231pd %%ymm0, %%ymm3, %%ymm13 \n\t"
"vbroadcastsd 2* 8(%%rsi), %%ymm0 \n\t" // ymm0 = A(2,k)
"addq %%r12, %%rsi \n\t" // A += 4 doubles (so A(3,k) is now at -8)
"vfmadd231pd %%ymm0, %%ymm1, %%ymm6 \n\t"
"vfmadd231pd %%ymm0, %%ymm2, %%ymm10 \n\t"
"vfmadd231pd %%ymm0, %%ymm3, %%ymm14 \n\t"
"vbroadcastsd -1* 8(%%rsi), %%ymm0 \n\t" // ymm0 = A(3,k) via the bumped rsi
"vfmadd231pd %%ymm0, %%ymm1, %%ymm7 \n\t"
"vmovapd 0*32(%%rdx), %%ymm1 \n\t" // prefetch next step's B(0:3)
"vfmadd231pd %%ymm0, %%ymm2, %%ymm11 \n\t"
"vmovapd 1*32(%%rdx), %%ymm2 \n\t" // next B(4:7)
"vfmadd231pd %%ymm0, %%ymm3, %%ymm15 \n\t"
"vmovapd 2*32(%%rdx), %%ymm3 \n\t" // next B(8:11)
"check%=: \n\t"
"testq %%rdi, %%rdi \n\t"
"jg loop%= \n\t" // loop while kc > 0
// Scale all accumulators by alpha.
"movq %3, %%rdi \n\t" // rdi = &alpha
"vbroadcastsd (%%rdi), %%ymm0 \n\t" // ymm0 = {alpha, alpha, alpha, alpha}
"vmulpd %%ymm0, %%ymm4, %%ymm4 \n\t"
"vmulpd %%ymm0, %%ymm5, %%ymm5 \n\t"
"vmulpd %%ymm0, %%ymm6, %%ymm6 \n\t"
"vmulpd %%ymm0, %%ymm7, %%ymm7 \n\t"
"vmulpd %%ymm0, %%ymm8, %%ymm8 \n\t"
"vmulpd %%ymm0, %%ymm9, %%ymm9 \n\t"
"vmulpd %%ymm0, %%ymm10, %%ymm10 \n\t"
"vmulpd %%ymm0, %%ymm11, %%ymm11 \n\t"
"vmulpd %%ymm0, %%ymm12, %%ymm12 \n\t"
"vmulpd %%ymm0, %%ymm13, %%ymm13 \n\t"
"vmulpd %%ymm0, %%ymm14, %%ymm14 \n\t"
"vmulpd %%ymm0, %%ymm15, %%ymm15 \n\t"
// Convert the element strides to byte strides (x8) and precompute column
// offsets and base pointers for the three 4-column groups of C.
"leaq (,%%r8,8), %%r8 \n\t" // r8 = incRowC in bytes
"leaq (,%%r9,8), %%r9 \n\t" // r9 = incColC in bytes
"leaq (,%%r9,2), %%r10 # 2*incColC \n\t"
"leaq (%%r10,%%r9), %%r11 # 3*incColC \n\t"
"leaq (%%rcx,%%r10,2), %%rdx # C + 4*incColC \n\t"
"leaq (%%rdx,%%r10,2), %%rax # C + 8*incColC \n\t"
// check if beta == 0
// NOTE(review): vucomisd sets ZF when the comparison is unordered, so a
// NaN beta also takes the beta_zero path (C overwritten, not NaN'ed) —
// confirm callers never pass NaN beta.
"movq %4, %%rdi \n\t" // rdi = &beta
"vbroadcastsd (%%rdi), %%ymm0 \n\t" // ymm0 = {beta, ...}; xmm0 used below
"vxorpd %%ymm1, %%ymm1, %%ymm1 \n\t" // ymm1 = 0.0 for the compare
"vucomisd %%xmm0, %%xmm1 \n\t"
"je beta_zero%= \n\t"
// case: beta != 0
// General update, one 4-column group at a time.  Pattern per group:
// gather 4 scattered C elements into xmm1/xmm2 (low/high pairs), scale by
// beta, add the matching alpha-scaled accumulator (low half directly, high
// half via vextractf128), then scatter back with vmovlpd/vmovhpd.
"# \n\t"
"# Update C(0,0:3) \n\t"
"# \n\t"
"vmovlpd (%%rcx), %%xmm1, %%xmm1 \n\t"
"vmovhpd (%%rcx,%%r9), %%xmm1, %%xmm1 \n\t"
"vmovlpd (%%rcx,%%r10), %%xmm2, %%xmm2 \n\t"
"vmovhpd (%%rcx,%%r11), %%xmm2, %%xmm2 \n\t"
"vextractf128 $1, %%ymm4, %%xmm3 \n\t"
"vmulpd %%xmm0, %%xmm1, %%xmm1 \n\t"
"vaddpd %%xmm1, %%xmm4, %%xmm1 \n\t"
"vmulpd %%xmm0, %%xmm2, %%xmm2 \n\t"
"vaddpd %%xmm2, %%xmm3, %%xmm2 \n\t"
"vmovlpd %%xmm1, (%%rcx) \n\t"
"vmovhpd %%xmm1, (%%rcx,%%r9) \n\t"
"vmovlpd %%xmm2, (%%rcx,%%r10) \n\t"
"vmovhpd %%xmm2, (%%rcx,%%r11) \n\t"
"# \n\t"
"# Update C(0,4:7) \n\t"
"# \n\t"
"vmovlpd (%%rdx), %%xmm1, %%xmm1 \n\t"
"vmovhpd (%%rdx,%%r9), %%xmm1, %%xmm1 \n\t"
"vmovlpd (%%rdx,%%r10), %%xmm2, %%xmm2 \n\t"
"vmovhpd (%%rdx,%%r11), %%xmm2, %%xmm2 \n\t"
"vextractf128 $1, %%ymm8, %%xmm3 \n\t"
"vmulpd %%xmm0, %%xmm1, %%xmm1 \n\t"
"vaddpd %%xmm1, %%xmm8, %%xmm1 \n\t"
"vmulpd %%xmm0, %%xmm2, %%xmm2 \n\t"
"vaddpd %%xmm2, %%xmm3, %%xmm2 \n\t"
"vmovlpd %%xmm1, (%%rdx) \n\t"
"vmovhpd %%xmm1, (%%rdx,%%r9) \n\t"
"vmovlpd %%xmm2, (%%rdx,%%r10) \n\t"
"vmovhpd %%xmm2, (%%rdx,%%r11) \n\t"
"# \n\t"
"# Update C(0,8:11) \n\t"
"# \n\t"
"vmovlpd (%%rax), %%xmm1, %%xmm1 \n\t"
"vmovhpd (%%rax,%%r9), %%xmm1, %%xmm1 \n\t"
"vmovlpd (%%rax,%%r10), %%xmm2, %%xmm2 \n\t"
"vmovhpd (%%rax,%%r11), %%xmm2, %%xmm2 \n\t"
"vextractf128 $1, %%ymm12, %%xmm3 \n\t"
"vmulpd %%xmm0, %%xmm1, %%xmm1 \n\t"
"vaddpd %%xmm1, %%xmm12, %%xmm1 \n\t"
"vmulpd %%xmm0, %%xmm2, %%xmm2 \n\t"
"vaddpd %%xmm2, %%xmm3, %%xmm2 \n\t"
"vmovlpd %%xmm1, (%%rax) \n\t"
"vmovhpd %%xmm1, (%%rax,%%r9) \n\t"
"vmovlpd %%xmm2, (%%rax,%%r10) \n\t"
"vmovhpd %%xmm2, (%%rax,%%r11) \n\t"
"# \n\t"
"# Update C(1,0:3) \n\t"
"# \n\t"
// Advance all three group pointers by one C row.
"addq %%r8, %%rcx \n\t"
"addq %%r8, %%rdx \n\t"
"addq %%r8, %%rax \n\t"
"vmovlpd (%%rcx), %%xmm1, %%xmm1 \n\t"
"vmovhpd (%%rcx,%%r9), %%xmm1, %%xmm1 \n\t"
"vmovlpd (%%rcx,%%r10), %%xmm2, %%xmm2 \n\t"
"vmovhpd (%%rcx,%%r11), %%xmm2, %%xmm2 \n\t"
"vextractf128 $1, %%ymm5, %%xmm3 \n\t"
"vmulpd %%xmm0, %%xmm1, %%xmm1 \n\t"
"vaddpd %%xmm1, %%xmm5, %%xmm1 \n\t"
"vmulpd %%xmm0, %%xmm2, %%xmm2 \n\t"
"vaddpd %%xmm2, %%xmm3, %%xmm2 \n\t"
"vmovlpd %%xmm1, (%%rcx) \n\t"
"vmovhpd %%xmm1, (%%rcx,%%r9) \n\t"
"vmovlpd %%xmm2, (%%rcx,%%r10) \n\t"
"vmovhpd %%xmm2, (%%rcx,%%r11) \n\t"
"# \n\t"
"# Update C(1,4:7) \n\t"
"# \n\t"
"vmovlpd (%%rdx), %%xmm1, %%xmm1 \n\t"
"vmovhpd (%%rdx,%%r9), %%xmm1, %%xmm1 \n\t"
"vmovlpd (%%rdx,%%r10), %%xmm2, %%xmm2 \n\t"
"vmovhpd (%%rdx,%%r11), %%xmm2, %%xmm2 \n\t"
"vextractf128 $1, %%ymm9, %%xmm3 \n\t"
"vmulpd %%xmm0, %%xmm1, %%xmm1 \n\t"
"vaddpd %%xmm1, %%xmm9, %%xmm1 \n\t"
"vmulpd %%xmm0, %%xmm2, %%xmm2 \n\t"
"vaddpd %%xmm2, %%xmm3, %%xmm2 \n\t"
"vmovlpd %%xmm1, (%%rdx) \n\t"
"vmovhpd %%xmm1, (%%rdx,%%r9) \n\t"
"vmovlpd %%xmm2, (%%rdx,%%r10) \n\t"
"vmovhpd %%xmm2, (%%rdx,%%r11) \n\t"
"# \n\t"
"# Update C(1,8:11) \n\t"
"# \n\t"
"vmovlpd (%%rax), %%xmm1, %%xmm1 \n\t"
"vmovhpd (%%rax,%%r9), %%xmm1, %%xmm1 \n\t"
"vmovlpd (%%rax,%%r10), %%xmm2, %%xmm2 \n\t"
"vmovhpd (%%rax,%%r11), %%xmm2, %%xmm2 \n\t"
"vextractf128 $1, %%ymm13, %%xmm3 \n\t"
"vmulpd %%xmm0, %%xmm1, %%xmm1 \n\t"
"vaddpd %%xmm1, %%xmm13, %%xmm1 \n\t"
"vmulpd %%xmm0, %%xmm2, %%xmm2 \n\t"
"vaddpd %%xmm2, %%xmm3, %%xmm2 \n\t"
"vmovlpd %%xmm1, (%%rax) \n\t"
"vmovhpd %%xmm1, (%%rax,%%r9) \n\t"
"vmovlpd %%xmm2, (%%rax,%%r10) \n\t"
"vmovhpd %%xmm2, (%%rax,%%r11) \n\t"
"# \n\t"
"# Update C(2,0:3) \n\t"
"# \n\t"
"addq %%r8, %%rcx \n\t"
"addq %%r8, %%rdx \n\t"
"addq %%r8, %%rax \n\t"
"vmovlpd (%%rcx), %%xmm1, %%xmm1 \n\t"
"vmovhpd (%%rcx,%%r9), %%xmm1, %%xmm1 \n\t"
"vmovlpd (%%rcx,%%r10), %%xmm2, %%xmm2 \n\t"
"vmovhpd (%%rcx,%%r11), %%xmm2, %%xmm2 \n\t"
"vextractf128 $1, %%ymm6, %%xmm3 \n\t"
"vmulpd %%xmm0, %%xmm1, %%xmm1 \n\t"
"vaddpd %%xmm1, %%xmm6, %%xmm1 \n\t"
"vmulpd %%xmm0, %%xmm2, %%xmm2 \n\t"
"vaddpd %%xmm2, %%xmm3, %%xmm2 \n\t"
"vmovlpd %%xmm1, (%%rcx) \n\t"
"vmovhpd %%xmm1, (%%rcx,%%r9) \n\t"
"vmovlpd %%xmm2, (%%rcx,%%r10) \n\t"
"vmovhpd %%xmm2, (%%rcx,%%r11) \n\t"
"# \n\t"
"# Update C(2,4:7) \n\t"
"# \n\t"
"vmovlpd (%%rdx), %%xmm1, %%xmm1 \n\t"
"vmovhpd (%%rdx,%%r9), %%xmm1, %%xmm1 \n\t"
"vmovlpd (%%rdx,%%r10), %%xmm2, %%xmm2 \n\t"
"vmovhpd (%%rdx,%%r11), %%xmm2, %%xmm2 \n\t"
"vextractf128 $1, %%ymm10, %%xmm3 \n\t"
"vmulpd %%xmm0, %%xmm1, %%xmm1 \n\t"
"vaddpd %%xmm1, %%xmm10, %%xmm1 \n\t"
"vmulpd %%xmm0, %%xmm2, %%xmm2 \n\t"
"vaddpd %%xmm2, %%xmm3, %%xmm2 \n\t"
"vmovlpd %%xmm1, (%%rdx) \n\t"
"vmovhpd %%xmm1, (%%rdx,%%r9) \n\t"
"vmovlpd %%xmm2, (%%rdx,%%r10) \n\t"
"vmovhpd %%xmm2, (%%rdx,%%r11) \n\t"
"# \n\t"
"# Update C(2,8:11) \n\t"
"# \n\t"
"vmovlpd (%%rax), %%xmm1, %%xmm1 \n\t"
"vmovhpd (%%rax,%%r9), %%xmm1, %%xmm1 \n\t"
"vmovlpd (%%rax,%%r10), %%xmm2, %%xmm2 \n\t"
"vmovhpd (%%rax,%%r11), %%xmm2, %%xmm2 \n\t"
"vextractf128 $1, %%ymm14, %%xmm3 \n\t"
"vmulpd %%xmm0, %%xmm1, %%xmm1 \n\t"
"vaddpd %%xmm1, %%xmm14, %%xmm1 \n\t"
"vmulpd %%xmm0, %%xmm2, %%xmm2 \n\t"
"vaddpd %%xmm2, %%xmm3, %%xmm2 \n\t"
"vmovlpd %%xmm1, (%%rax) \n\t"
"vmovhpd %%xmm1, (%%rax,%%r9) \n\t"
"vmovlpd %%xmm2, (%%rax,%%r10) \n\t"
"vmovhpd %%xmm2, (%%rax,%%r11) \n\t"
"# \n\t"
"# Update C(3,0:3) \n\t"
"# \n\t"
"addq %%r8, %%rcx \n\t"
"addq %%r8, %%rdx \n\t"
"addq %%r8, %%rax \n\t"
"vmovlpd (%%rcx), %%xmm1, %%xmm1 \n\t"
"vmovhpd (%%rcx,%%r9), %%xmm1, %%xmm1 \n\t"
"vmovlpd (%%rcx,%%r10), %%xmm2, %%xmm2 \n\t"
"vmovhpd (%%rcx,%%r11), %%xmm2, %%xmm2 \n\t"
"vextractf128 $1, %%ymm7, %%xmm3 \n\t"
"vmulpd %%xmm0, %%xmm1, %%xmm1 \n\t"
"vaddpd %%xmm1, %%xmm7, %%xmm1 \n\t"
"vmulpd %%xmm0, %%xmm2, %%xmm2 \n\t"
"vaddpd %%xmm2, %%xmm3, %%xmm2 \n\t"
"vmovlpd %%xmm1, (%%rcx) \n\t"
"vmovhpd %%xmm1, (%%rcx,%%r9) \n\t"
"vmovlpd %%xmm2, (%%rcx,%%r10) \n\t"
"vmovhpd %%xmm2, (%%rcx,%%r11) \n\t"
"# \n\t"
"# Update C(3,4:7) \n\t"
"# \n\t"
"vmovlpd (%%rdx), %%xmm1, %%xmm1 \n\t"
"vmovhpd (%%rdx,%%r9), %%xmm1, %%xmm1 \n\t"
"vmovlpd (%%rdx,%%r10), %%xmm2, %%xmm2 \n\t"
"vmovhpd (%%rdx,%%r11), %%xmm2, %%xmm2 \n\t"
"vextractf128 $1, %%ymm11, %%xmm3 \n\t"
"vmulpd %%xmm0, %%xmm1, %%xmm1 \n\t"
"vaddpd %%xmm1, %%xmm11, %%xmm1 \n\t"
"vmulpd %%xmm0, %%xmm2, %%xmm2 \n\t"
"vaddpd %%xmm2, %%xmm3, %%xmm2 \n\t"
"vmovlpd %%xmm1, (%%rdx) \n\t"
"vmovhpd %%xmm1, (%%rdx,%%r9) \n\t"
"vmovlpd %%xmm2, (%%rdx,%%r10) \n\t"
"vmovhpd %%xmm2, (%%rdx,%%r11) \n\t"
"# \n\t"
"# Update C(3,8:11) \n\t"
"# \n\t"
"vmovlpd (%%rax), %%xmm1, %%xmm1 \n\t"
"vmovhpd (%%rax,%%r9), %%xmm1, %%xmm1 \n\t"
"vmovlpd (%%rax,%%r10), %%xmm2, %%xmm2 \n\t"
"vmovhpd (%%rax,%%r11), %%xmm2, %%xmm2 \n\t"
"vextractf128 $1, %%ymm15, %%xmm3 \n\t"
"vmulpd %%xmm0, %%xmm1, %%xmm1 \n\t"
"vaddpd %%xmm1, %%xmm15, %%xmm1 \n\t"
"vmulpd %%xmm0, %%xmm2, %%xmm2 \n\t"
"vaddpd %%xmm2, %%xmm3, %%xmm2 \n\t"
"vmovlpd %%xmm1, (%%rax) \n\t"
"vmovhpd %%xmm1, (%%rax,%%r9) \n\t"
"vmovlpd %%xmm2, (%%rax,%%r10) \n\t"
"vmovhpd %%xmm2, (%%rax,%%r11) \n\t"
"jmp done%= \n\t"
// case: beta == 0
// Store-only path: C is never loaded, each accumulator is scattered out
// directly (low half + vextractf128'd high half).
"beta_zero%=: \n\t"
"# \n\t"
"# Update C(0,0:3) \n\t"
"# \n\t"
"vextractf128 $1, %%ymm4, %%xmm3 \n\t"
"vmovlpd %%xmm4, (%%rcx) \n\t"
"vmovhpd %%xmm4, (%%rcx,%%r9) \n\t"
"vmovlpd %%xmm3, (%%rcx,%%r10) \n\t"
"vmovhpd %%xmm3, (%%rcx,%%r11) \n\t"
"# \n\t"
"# Update C(0,4:7) \n\t"
"# \n\t"
"vextractf128 $1, %%ymm8, %%xmm3 \n\t"
"vmovlpd %%xmm8, (%%rdx) \n\t"
"vmovhpd %%xmm8, (%%rdx,%%r9) \n\t"
"vmovlpd %%xmm3, (%%rdx,%%r10) \n\t"
"vmovhpd %%xmm3, (%%rdx,%%r11) \n\t"
"# \n\t"
"# Update C(0,8:11) \n\t"
"# \n\t"
"vextractf128 $1, %%ymm12, %%xmm3 \n\t"
"vmovlpd %%xmm12, (%%rax) \n\t"
"vmovhpd %%xmm12, (%%rax,%%r9) \n\t"
"vmovlpd %%xmm3, (%%rax,%%r10) \n\t"
"vmovhpd %%xmm3, (%%rax,%%r11) \n\t"
"# \n\t"
"# Update C(1,0:3) \n\t"
"# \n\t"
"addq %%r8, %%rcx \n\t"
"addq %%r8, %%rdx \n\t"
"addq %%r8, %%rax \n\t"
"vextractf128 $1, %%ymm5, %%xmm3 \n\t"
"vmovlpd %%xmm5, (%%rcx) \n\t"
"vmovhpd %%xmm5, (%%rcx,%%r9) \n\t"
"vmovlpd %%xmm3, (%%rcx,%%r10) \n\t"
"vmovhpd %%xmm3, (%%rcx,%%r11) \n\t"
"# \n\t"
"# Update C(1,4:7) \n\t"
"# \n\t"
"vextractf128 $1, %%ymm9, %%xmm3 \n\t"
"vmovlpd %%xmm9, (%%rdx) \n\t"
"vmovhpd %%xmm9, (%%rdx,%%r9) \n\t"
"vmovlpd %%xmm3, (%%rdx,%%r10) \n\t"
"vmovhpd %%xmm3, (%%rdx,%%r11) \n\t"
"# \n\t"
"# Update C(1,8:11) \n\t"
"# \n\t"
"vextractf128 $1, %%ymm13, %%xmm3 \n\t"
"vmovlpd %%xmm13, (%%rax) \n\t"
"vmovhpd %%xmm13, (%%rax,%%r9) \n\t"
"vmovlpd %%xmm3, (%%rax,%%r10) \n\t"
"vmovhpd %%xmm3, (%%rax,%%r11) \n\t"
"# \n\t"
"# Update C(2,0:3) \n\t"
"# \n\t"
"addq %%r8, %%rcx \n\t"
"addq %%r8, %%rdx \n\t"
"addq %%r8, %%rax \n\t"
"vextractf128 $1, %%ymm6, %%xmm3 \n\t"
"vmovlpd %%xmm6, (%%rcx) \n\t"
"vmovhpd %%xmm6, (%%rcx,%%r9) \n\t"
"vmovlpd %%xmm3, (%%rcx,%%r10) \n\t"
"vmovhpd %%xmm3, (%%rcx,%%r11) \n\t"
"# \n\t"
"# Update C(2,4:7) \n\t"
"# \n\t"
"vextractf128 $1, %%ymm10, %%xmm3 \n\t"
"vmovlpd %%xmm10, (%%rdx) \n\t"
"vmovhpd %%xmm10, (%%rdx,%%r9) \n\t"
"vmovlpd %%xmm3, (%%rdx,%%r10) \n\t"
"vmovhpd %%xmm3, (%%rdx,%%r11) \n\t"
"# \n\t"
"# Update C(2,8:11) \n\t"
"# \n\t"
"vextractf128 $1, %%ymm14, %%xmm3 \n\t"
"vmovlpd %%xmm14, (%%rax) \n\t"
"vmovhpd %%xmm14, (%%rax,%%r9) \n\t"
"vmovlpd %%xmm3, (%%rax,%%r10) \n\t"
"vmovhpd %%xmm3, (%%rax,%%r11) \n\t"
"# \n\t"
"# Update C(3,0:3) \n\t"
"# \n\t"
"addq %%r8, %%rcx \n\t"
"addq %%r8, %%rdx \n\t"
"addq %%r8, %%rax \n\t"
"vextractf128 $1, %%ymm7, %%xmm3 \n\t"
"vmovlpd %%xmm7, (%%rcx) \n\t"
"vmovhpd %%xmm7, (%%rcx,%%r9) \n\t"
"vmovlpd %%xmm3, (%%rcx,%%r10) \n\t"
"vmovhpd %%xmm3, (%%rcx,%%r11) \n\t"
"# \n\t"
"# Update C(3,4:7) \n\t"
"# \n\t"
"vextractf128 $1, %%ymm11, %%xmm3 \n\t"
"vmovlpd %%xmm11, (%%rdx) \n\t"
"vmovhpd %%xmm11, (%%rdx,%%r9) \n\t"
"vmovlpd %%xmm3, (%%rdx,%%r10) \n\t"
"vmovhpd %%xmm3, (%%rdx,%%r11) \n\t"
"# \n\t"
"# Update C(3,8:11) \n\t"
"# \n\t"
"vextractf128 $1, %%ymm15, %%xmm3 \n\t"
"vmovlpd %%xmm15, (%%rax) \n\t"
"vmovhpd %%xmm15, (%%rax,%%r9) \n\t"
"vmovlpd %%xmm3, (%%rax,%%r10) \n\t"
"vmovhpd %%xmm3, (%%rax,%%r11) \n\t"
"done%=: \n\t"
: // output
: // input (all "m": the asm loads them into fixed registers itself)
"m" (kc), // 0
"m" (A), // 1
"m" (B), // 2
"m" (pAlpha), // 3
"m" (pBeta), // 4
"m" (C), // 5
"m" (incRowC), // 6
"m" (incColC) // 7
: // register clobber list ("memory": C is written through a pointer)
"rax", "rcx", "rdx", "rsi", "rdi",
"r8", "r9", "r10", "r11", "r12", "r13",
"xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7",
"xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15",
"memory"
);
}
#endif