#include <stddef.h>
#include <stdint.h>
/*
 * dgemm_micro_avx_8x4 -- 8x4 double-precision GEMM micro-kernel (x86-64 AVX).
 *
 * Computes  C := beta*C + alpha * A*B  for one 8x4 block of C, where
 *   kc       number of rank-1 updates (the k dimension of this micro-tile)
 *   A        packed micro-panel: 8 contiguous doubles per k-step,
 *            32-byte aligned (loaded with vmovapd)
 *   B        packed micro-panel: 4 contiguous doubles per k-step,
 *            32-byte aligned (loaded with vmovapd)
 *   C        8x4 block addressed as C[i*incRowC + j*incColC] (strides in
 *            units of doubles)
 *   a_next,  addresses of the next micro-panels; b_next is used only for
 *   b_next   software prefetching, a_next is currently unused.
 *
 * NOTE(review): the pipelined loops below load the A/B vectors for the NEXT
 * k-step before testing the loop counter, so the packed panels must be
 * readable (padded) for at least one extra k-step past kc -- standard for
 * BLIS-style packed buffers; confirm the packing routine guarantees this.
 *
 * Fast path: if incRowC == 1, C is 32-byte aligned and incColC is a multiple
 * of 4, whole columns are updated with aligned vector loads/stores
 * (.DCOLSTORED / .DCOLSTORBZ).  Otherwise a general path updates C two
 * elements at a time via vmovlpd/vmovhpd (.DGENSTORED / .DGENSTORBZ).
 * beta == 0 branches skip reading C entirely (so C may be uninitialized).
 */
void
dgemm_micro_avx_8x4(size_t kc,
                    double alpha,
                    const double *A,
                    const double *B,
                    double beta,
                    double *C,
                    ptrdiff_t incRowC,
                    ptrdiff_t incColC,
                    const double *a_next,
                    const double *b_next
                    )
{
    int64_t rs_c   = incRowC;
    int64_t cs_c   = incColC;
    /* BUGFIX: these two initializers were corrupted to "α"/"β" (an
     * HTML-entity mangling of "&alpha;"/"&beta;"), which is not valid C.
     * The asm dereferences %4/%5 with vbroadcastsd, so they must be the
     * ADDRESSES of the scalars. */
    double *pAlpha = &alpha;
    double *pBeta  = &beta;
    int64_t k_iter = kc / 4;   /* unrolled main loop: 4 k-steps / iteration */
    int64_t k_left = kc % 4;   /* remainder loop: 1 k-step / iteration      */

    (void) a_next;             /* kept for BLIS-style kernel ABI; unused    */

    __asm__ volatile
    (
    /* ---- setup: load panel pointers, preload first A/B vectors ---- */
    "movq          %2, %%rax                 \n\t" /* rax = A            */
    "movq          %3, %%rbx                 \n\t" /* rbx = B            */
    "movq          %9, %%r15                 \n\t" /* r15 = b_next       */
    "addq     $-4 * 64, %%r15                \n\t" /* bias for prefetch  */
    "                                        \n\t"
    "vmovapd  0 * 32(%%rax), %%ymm0          \n\t" /* ymm0 = a[0..3]     */
    "vmovapd  0 * 32(%%rbx), %%ymm2          \n\t" /* ymm2 = b[0..3]     */
    "vpermilpd  $0x5, %%ymm2, %%ymm3         \n\t" /* ymm3 = b swapped   */
    "                                        \n\t"
    "movq          %6, %%rcx                 \n\t" /* rcx = C            */
    "movq          %8, %%rdi                 \n\t" /* rdi = cs_c         */
    "leaq  (,%%rdi,8), %%rdi                 \n\t" /* rdi = cs_c*8 bytes */
    "leaq  (%%rcx,%%rdi,2), %%r10            \n\t" /* r10 = C + 2*cs_c   */
    "                                        \n\t"
    "prefetcht0  3 * 8(%%rcx)                \n\t" /* prefetch C columns */
    "prefetcht0  3 * 8(%%rcx,%%rdi)          \n\t"
    "prefetcht0  3 * 8(%%r10)                \n\t"
    "prefetcht0  3 * 8(%%r10,%%rdi)          \n\t"
    "                                        \n\t"
    "vxorpd   %%ymm8,  %%ymm8,  %%ymm8       \n\t" /* zero accumulators  */
    "vxorpd   %%ymm9,  %%ymm9,  %%ymm9       \n\t"
    "vxorpd   %%ymm10, %%ymm10, %%ymm10      \n\t"
    "vxorpd   %%ymm11, %%ymm11, %%ymm11      \n\t"
    "vxorpd   %%ymm12, %%ymm12, %%ymm12      \n\t"
    "vxorpd   %%ymm13, %%ymm13, %%ymm13      \n\t"
    "vxorpd   %%ymm14, %%ymm14, %%ymm14      \n\t"
    "vxorpd   %%ymm15, %%ymm15, %%ymm15      \n\t"
    "                                        \n\t"
    /* ---- main loop: 4 rank-1 updates per iteration ---- */
    "movq      %0, %%rsi                     \n\t" /* rsi = k_iter       */
    "testq  %%rsi, %%rsi                     \n\t"
    "je     .DCONSIDKLEFT%=                  \n\t"
    "                                        \n\t"
    ".DLOOPKITER%=:                          \n\t"
    "                                        \n\t"
    "addq  $4 * 4 * 8, %%r15                 \n\t" /* advance b prefetch */
    "                                        \n\t"
    /* iteration 0 */
    "vmovapd  1 * 32(%%rax), %%ymm1          \n\t"
    "vmulpd        %%ymm0, %%ymm2, %%ymm6    \n\t"
    "vperm2f128 $0x3, %%ymm2, %%ymm2, %%ymm4 \n\t"
    "vmulpd        %%ymm0, %%ymm3, %%ymm7    \n\t"
    "vperm2f128 $0x3, %%ymm3, %%ymm3, %%ymm5 \n\t"
    "vaddpd        %%ymm15, %%ymm6, %%ymm15  \n\t"
    "vaddpd        %%ymm13, %%ymm7, %%ymm13  \n\t"
    "                                        \n\t"
    "prefetcht0  16 * 32(%%rax)              \n\t"
    "vmulpd        %%ymm1, %%ymm2, %%ymm6    \n\t"
    "vmovapd  1 * 32(%%rbx), %%ymm2          \n\t"
    "vmulpd        %%ymm1, %%ymm3, %%ymm7    \n\t"
    "vpermilpd  $0x5, %%ymm2, %%ymm3         \n\t"
    "vaddpd        %%ymm14, %%ymm6, %%ymm14  \n\t"
    "vaddpd        %%ymm12, %%ymm7, %%ymm12  \n\t"
    "                                        \n\t"
    "vmulpd        %%ymm0, %%ymm4, %%ymm6    \n\t"
    "vmulpd        %%ymm0, %%ymm5, %%ymm7    \n\t"
    "vmovapd  2 * 32(%%rax), %%ymm0          \n\t"
    "vaddpd        %%ymm11, %%ymm6, %%ymm11  \n\t"
    "vaddpd        %%ymm9,  %%ymm7, %%ymm9   \n\t"
    "prefetcht0  0 * 32(%%r15)               \n\t"
    "                                        \n\t"
    "vmulpd        %%ymm1, %%ymm4, %%ymm6    \n\t"
    "vmulpd        %%ymm1, %%ymm5, %%ymm7    \n\t"
    "vaddpd        %%ymm10, %%ymm6, %%ymm10  \n\t"
    "vaddpd        %%ymm8,  %%ymm7, %%ymm8   \n\t"
    "                                        \n\t"
    /* iteration 1 */
    "vmovapd  3 * 32(%%rax), %%ymm1          \n\t"
    "vmulpd        %%ymm0, %%ymm2, %%ymm6    \n\t"
    "vperm2f128 $0x3, %%ymm2, %%ymm2, %%ymm4 \n\t"
    "vmulpd        %%ymm0, %%ymm3, %%ymm7    \n\t"
    "vperm2f128 $0x3, %%ymm3, %%ymm3, %%ymm5 \n\t"
    "vaddpd        %%ymm15, %%ymm6, %%ymm15  \n\t"
    "vaddpd        %%ymm13, %%ymm7, %%ymm13  \n\t"
    "                                        \n\t"
    "prefetcht0  18 * 32(%%rax)              \n\t"
    "vmulpd        %%ymm1, %%ymm2, %%ymm6    \n\t"
    "vmovapd  2 * 32(%%rbx), %%ymm2          \n\t"
    "vmulpd        %%ymm1, %%ymm3, %%ymm7    \n\t"
    "vpermilpd  $0x5, %%ymm2, %%ymm3         \n\t"
    "vaddpd        %%ymm14, %%ymm6, %%ymm14  \n\t"
    "vaddpd        %%ymm12, %%ymm7, %%ymm12  \n\t"
    "                                        \n\t"
    "vmulpd        %%ymm0, %%ymm4, %%ymm6    \n\t"
    "vmulpd        %%ymm0, %%ymm5, %%ymm7    \n\t"
    "vmovapd  4 * 32(%%rax), %%ymm0          \n\t"
    "vaddpd        %%ymm11, %%ymm6, %%ymm11  \n\t"
    "vaddpd        %%ymm9,  %%ymm7, %%ymm9   \n\t"
    "                                        \n\t"
    "vmulpd        %%ymm1, %%ymm4, %%ymm6    \n\t"
    "vmulpd        %%ymm1, %%ymm5, %%ymm7    \n\t"
    "vaddpd        %%ymm10, %%ymm6, %%ymm10  \n\t"
    "vaddpd        %%ymm8,  %%ymm7, %%ymm8   \n\t"
    "                                        \n\t"
    /* iteration 2 */
    "vmovapd  5 * 32(%%rax), %%ymm1          \n\t"
    "vmulpd        %%ymm0, %%ymm2, %%ymm6    \n\t"
    "vperm2f128 $0x3, %%ymm2, %%ymm2, %%ymm4 \n\t"
    "vmulpd        %%ymm0, %%ymm3, %%ymm7    \n\t"
    "vperm2f128 $0x3, %%ymm3, %%ymm3, %%ymm5 \n\t"
    "vaddpd        %%ymm15, %%ymm6, %%ymm15  \n\t"
    "vaddpd        %%ymm13, %%ymm7, %%ymm13  \n\t"
    "                                        \n\t"
    "prefetcht0  20 * 32(%%rax)              \n\t"
    "vmulpd        %%ymm1, %%ymm2, %%ymm6    \n\t"
    "vmovapd  3 * 32(%%rbx), %%ymm2          \n\t"
    "addq  $4 * 4 * 8, %%rbx                 \n\t" /* B += 4 k-steps     */
    "vmulpd        %%ymm1, %%ymm3, %%ymm7    \n\t"
    "vpermilpd  $0x5, %%ymm2, %%ymm3         \n\t"
    "vaddpd        %%ymm14, %%ymm6, %%ymm14  \n\t"
    "vaddpd        %%ymm12, %%ymm7, %%ymm12  \n\t"
    "                                        \n\t"
    "vmulpd        %%ymm0, %%ymm4, %%ymm6    \n\t"
    "vmulpd        %%ymm0, %%ymm5, %%ymm7    \n\t"
    "vmovapd  6 * 32(%%rax), %%ymm0          \n\t"
    "vaddpd        %%ymm11, %%ymm6, %%ymm11  \n\t"
    "vaddpd        %%ymm9,  %%ymm7, %%ymm9   \n\t"
    "prefetcht0  2 * 32(%%r15)               \n\t"
    "                                        \n\t"
    "vmulpd        %%ymm1, %%ymm4, %%ymm6    \n\t"
    "vmulpd        %%ymm1, %%ymm5, %%ymm7    \n\t"
    "vaddpd        %%ymm10, %%ymm6, %%ymm10  \n\t"
    "vaddpd        %%ymm8,  %%ymm7, %%ymm8   \n\t"
    "                                        \n\t"
    /* iteration 3 */
    "vmovapd  7 * 32(%%rax), %%ymm1          \n\t"
    "addq  $4 * 8 * 8, %%rax                 \n\t" /* A += 4 k-steps     */
    "vmulpd        %%ymm0, %%ymm2, %%ymm6    \n\t"
    "vperm2f128 $0x3, %%ymm2, %%ymm2, %%ymm4 \n\t"
    "vmulpd        %%ymm0, %%ymm3, %%ymm7    \n\t"
    "vperm2f128 $0x3, %%ymm3, %%ymm3, %%ymm5 \n\t"
    "vaddpd        %%ymm15, %%ymm6, %%ymm15  \n\t"
    "vaddpd        %%ymm13, %%ymm7, %%ymm13  \n\t"
    "                                        \n\t"
    "prefetcht0  14 * 32(%%rax)              \n\t"
    "vmulpd        %%ymm1, %%ymm2, %%ymm6    \n\t"
    "vmovapd  0 * 32(%%rbx), %%ymm2          \n\t"
    "vmulpd        %%ymm1, %%ymm3, %%ymm7    \n\t"
    "vpermilpd  $0x5, %%ymm2, %%ymm3         \n\t"
    "vaddpd        %%ymm14, %%ymm6, %%ymm14  \n\t"
    "vaddpd        %%ymm12, %%ymm7, %%ymm12  \n\t"
    "                                        \n\t"
    "vmulpd        %%ymm0, %%ymm4, %%ymm6    \n\t"
    "vmulpd        %%ymm0, %%ymm5, %%ymm7    \n\t"
    "vmovapd  0 * 32(%%rax), %%ymm0          \n\t"
    "vaddpd        %%ymm11, %%ymm6, %%ymm11  \n\t"
    "vaddpd        %%ymm9,  %%ymm7, %%ymm9   \n\t"
    "                                        \n\t"
    "vmulpd        %%ymm1, %%ymm4, %%ymm6    \n\t"
    "vmulpd        %%ymm1, %%ymm5, %%ymm7    \n\t"
    "vaddpd        %%ymm10, %%ymm6, %%ymm10  \n\t"
    "vaddpd        %%ymm8,  %%ymm7, %%ymm8   \n\t"
    "                                        \n\t"
    "decq   %%rsi                            \n\t"
    "jne    .DLOOPKITER%=                    \n\t"
    "                                        \n\t"
    /* ---- remainder loop: one rank-1 update per iteration ---- */
    ".DCONSIDKLEFT%=:                        \n\t"
    "                                        \n\t"
    "movq      %1, %%rsi                     \n\t" /* rsi = k_left       */
    "testq  %%rsi, %%rsi                     \n\t"
    "je     .DPOSTACCUM%=                    \n\t"
    "                                        \n\t"
    ".DLOOPKLEFT%=:                          \n\t"
    "                                        \n\t"
    "vmovapd  1 * 32(%%rax), %%ymm1          \n\t"
    "addq  $8 * 1 * 8, %%rax                 \n\t" /* A += 1 k-step      */
    "vmulpd        %%ymm0, %%ymm2, %%ymm6    \n\t"
    "vperm2f128 $0x3, %%ymm2, %%ymm2, %%ymm4 \n\t"
    "vmulpd        %%ymm0, %%ymm3, %%ymm7    \n\t"
    "vperm2f128 $0x3, %%ymm3, %%ymm3, %%ymm5 \n\t"
    "vaddpd        %%ymm15, %%ymm6, %%ymm15  \n\t"
    "vaddpd        %%ymm13, %%ymm7, %%ymm13  \n\t"
    "                                        \n\t"
    "prefetcht0  14 * 32(%%rax)              \n\t"
    "vmulpd        %%ymm1, %%ymm2, %%ymm6    \n\t"
    "vmovapd  1 * 32(%%rbx), %%ymm2          \n\t"
    "addq  $4 * 1 * 8, %%rbx                 \n\t" /* B += 1 k-step      */
    "vmulpd        %%ymm1, %%ymm3, %%ymm7    \n\t"
    "vpermilpd  $0x5, %%ymm2, %%ymm3         \n\t"
    "vaddpd        %%ymm14, %%ymm6, %%ymm14  \n\t"
    "vaddpd        %%ymm12, %%ymm7, %%ymm12  \n\t"
    "                                        \n\t"
    "vmulpd        %%ymm0, %%ymm4, %%ymm6    \n\t"
    "vmulpd        %%ymm0, %%ymm5, %%ymm7    \n\t"
    "vmovapd  0 * 32(%%rax), %%ymm0          \n\t"
    "vaddpd        %%ymm11, %%ymm6, %%ymm11  \n\t"
    "vaddpd        %%ymm9,  %%ymm7, %%ymm9   \n\t"
    "                                        \n\t"
    "vmulpd        %%ymm1, %%ymm4, %%ymm6    \n\t"
    "vmulpd        %%ymm1, %%ymm5, %%ymm7    \n\t"
    "vaddpd        %%ymm10, %%ymm6, %%ymm10  \n\t"
    "vaddpd        %%ymm8,  %%ymm7, %%ymm8   \n\t"
    "                                        \n\t"
    "decq   %%rsi                            \n\t"
    "jne    .DLOOPKLEFT%=                    \n\t"
    "                                        \n\t"
    /* ---- untangle the butterfly accumulator layout ---- */
    ".DPOSTACCUM%=:                          \n\t"
    "                                        \n\t"
    "vmovapd  %%ymm15, %%ymm7                \n\t"
    "vshufpd   $0xa, %%ymm15, %%ymm13, %%ymm15 \n\t"
    "vshufpd   $0xa, %%ymm13, %%ymm7,  %%ymm13 \n\t"
    "                                        \n\t"
    "vmovapd  %%ymm11, %%ymm7                \n\t"
    "vshufpd   $0xa, %%ymm11, %%ymm9,  %%ymm11 \n\t"
    "vshufpd   $0xa, %%ymm9,  %%ymm7,  %%ymm9  \n\t"
    "                                        \n\t"
    "vmovapd  %%ymm14, %%ymm7                \n\t"
    "vshufpd   $0xa, %%ymm14, %%ymm12, %%ymm14 \n\t"
    "vshufpd   $0xa, %%ymm12, %%ymm7,  %%ymm12 \n\t"
    "                                        \n\t"
    "vmovapd  %%ymm10, %%ymm7                \n\t"
    "vshufpd   $0xa, %%ymm10, %%ymm8,  %%ymm10 \n\t"
    "vshufpd   $0xa, %%ymm8,  %%ymm7,  %%ymm8  \n\t"
    "                                        \n\t"
    "vmovapd  %%ymm15, %%ymm7                \n\t"
    "vperm2f128 $0x30, %%ymm15, %%ymm11, %%ymm15 \n\t"
    "vperm2f128 $0x12, %%ymm7,  %%ymm11, %%ymm11 \n\t"
    "                                        \n\t"
    "vmovapd  %%ymm13, %%ymm7                \n\t"
    "vperm2f128 $0x30, %%ymm13, %%ymm9,  %%ymm13 \n\t"
    "vperm2f128 $0x12, %%ymm7,  %%ymm9,  %%ymm9  \n\t"
    "                                        \n\t"
    "vmovapd  %%ymm14, %%ymm7                \n\t"
    "vperm2f128 $0x30, %%ymm14, %%ymm10, %%ymm14 \n\t"
    "vperm2f128 $0x12, %%ymm7,  %%ymm10, %%ymm10 \n\t"
    "                                        \n\t"
    "vmovapd  %%ymm12, %%ymm7                \n\t"
    "vperm2f128 $0x30, %%ymm12, %%ymm8,  %%ymm12 \n\t"
    "vperm2f128 $0x12, %%ymm7,  %%ymm8,  %%ymm8  \n\t"
    "                                        \n\t"
    /* ---- scale by alpha; broadcast beta ---- */
    "movq     %4, %%rax                      \n\t" /* rax = &alpha       */
    "movq     %5, %%rbx                      \n\t" /* rbx = &beta        */
    "vbroadcastsd  (%%rax), %%ymm0           \n\t" /* ymm0 = alpha       */
    "vbroadcastsd  (%%rbx), %%ymm2           \n\t" /* ymm2 = beta        */
    "                                        \n\t"
    "vmulpd   %%ymm0, %%ymm8,  %%ymm8        \n\t"
    "vmulpd   %%ymm0, %%ymm9,  %%ymm9        \n\t"
    "vmulpd   %%ymm0, %%ymm10, %%ymm10       \n\t"
    "vmulpd   %%ymm0, %%ymm11, %%ymm11       \n\t"
    "vmulpd   %%ymm0, %%ymm12, %%ymm12       \n\t"
    "vmulpd   %%ymm0, %%ymm13, %%ymm13       \n\t"
    "vmulpd   %%ymm0, %%ymm14, %%ymm14       \n\t"
    "vmulpd   %%ymm0, %%ymm15, %%ymm15       \n\t"
    "                                        \n\t"
    /* ---- compute C addressing; classify storage case ---- */
    "movq         %7, %%rsi                  \n\t" /* rsi = rs_c         */
    "leaq  (,%%rsi,8), %%rsi                 \n\t" /* rsi = rs_c*8 bytes */
    "                                        \n\t"
    "leaq  (%%rcx,%%rsi,4), %%rdx            \n\t" /* rdx = C + 4 rows   */
    "                                        \n\t"
    "leaq  (,%%rsi,2), %%r12                 \n\t" /* r12 = 2*rs_c bytes */
    "leaq  (%%r12,%%rsi,1), %%r13            \n\t" /* r13 = 3*rs_c bytes */
    "                                        \n\t"
    "cmpq      $8, %%rsi                     \n\t" /* rs_c == 1?         */
    "sete    %%bl                            \n\t"
    "testq    $31, %%rcx                     \n\t" /* C 32B-aligned?     */
    "setz    %%bh                            \n\t"
    "testq    $31, %%rdi                     \n\t" /* cs_c % 4 == 0?     */
    "setz    %%al                            \n\t"
    "                                        \n\t"
    "vxorpd   %%ymm0, %%ymm0, %%ymm0         \n\t"
    "vucomisd %%xmm0, %%xmm2                 \n\t" /* beta == 0?         */
    "je     .DBETAZERO%=                     \n\t"
    "                                        \n\t"
    "andb    %%bl, %%bh                      \n\t" /* all three set ->   */
    "andb    %%bh, %%al                      \n\t" /* column-store path  */
    "jne    .DCOLSTORED%=                    \n\t"
    "                                        \n\t"
    /* ---- general store, beta != 0: C = beta*C + result ---- */
    ".DGENSTORED%=:                          \n\t"
    "                                        \n\t"
    "vextractf128 $1, %%ymm9, %%xmm1         \n\t" /* column 0, rows 0-3 */
    "vmovlpd  (%%rcx),        %%xmm0, %%xmm0 \n\t"
    "vmovhpd  (%%rcx,%%rsi),  %%xmm0, %%xmm0 \n\t"
    "vmulpd   %%xmm2, %%xmm0, %%xmm0         \n\t"
    "vaddpd   %%xmm9, %%xmm0, %%xmm0         \n\t"
    "vmovlpd  %%xmm0, (%%rcx)                \n\t"
    "vmovhpd  %%xmm0, (%%rcx,%%rsi)          \n\t"
    "vmovlpd  (%%rcx,%%r12),  %%xmm0, %%xmm0 \n\t"
    "vmovhpd  (%%rcx,%%r13),  %%xmm0, %%xmm0 \n\t"
    "vmulpd   %%xmm2, %%xmm0, %%xmm0         \n\t"
    "vaddpd   %%xmm1, %%xmm0, %%xmm0         \n\t"
    "vmovlpd  %%xmm0, (%%rcx,%%r12)          \n\t"
    "vmovhpd  %%xmm0, (%%rcx,%%r13)          \n\t"
    "addq     %%rdi, %%rcx                   \n\t"
    "                                        \n\t"
    "vextractf128 $1, %%ymm11, %%xmm1        \n\t" /* column 1, rows 0-3 */
    "vmovlpd  (%%rcx),        %%xmm0, %%xmm0 \n\t"
    "vmovhpd  (%%rcx,%%rsi),  %%xmm0, %%xmm0 \n\t"
    "vmulpd   %%xmm2, %%xmm0, %%xmm0         \n\t"
    "vaddpd   %%xmm11, %%xmm0, %%xmm0        \n\t"
    "vmovlpd  %%xmm0, (%%rcx)                \n\t"
    "vmovhpd  %%xmm0, (%%rcx,%%rsi)          \n\t"
    "vmovlpd  (%%rcx,%%r12),  %%xmm0, %%xmm0 \n\t"
    "vmovhpd  (%%rcx,%%r13),  %%xmm0, %%xmm0 \n\t"
    "vmulpd   %%xmm2, %%xmm0, %%xmm0         \n\t"
    "vaddpd   %%xmm1, %%xmm0, %%xmm0         \n\t"
    "vmovlpd  %%xmm0, (%%rcx,%%r12)          \n\t"
    "vmovhpd  %%xmm0, (%%rcx,%%r13)          \n\t"
    "addq     %%rdi, %%rcx                   \n\t"
    "                                        \n\t"
    "vextractf128 $1, %%ymm13, %%xmm1        \n\t" /* column 2, rows 0-3 */
    "vmovlpd  (%%rcx),        %%xmm0, %%xmm0 \n\t"
    "vmovhpd  (%%rcx,%%rsi),  %%xmm0, %%xmm0 \n\t"
    "vmulpd   %%xmm2, %%xmm0, %%xmm0         \n\t"
    "vaddpd   %%xmm13, %%xmm0, %%xmm0        \n\t"
    "vmovlpd  %%xmm0, (%%rcx)                \n\t"
    "vmovhpd  %%xmm0, (%%rcx,%%rsi)          \n\t"
    "vmovlpd  (%%rcx,%%r12),  %%xmm0, %%xmm0 \n\t"
    "vmovhpd  (%%rcx,%%r13),  %%xmm0, %%xmm0 \n\t"
    "vmulpd   %%xmm2, %%xmm0, %%xmm0         \n\t"
    "vaddpd   %%xmm1, %%xmm0, %%xmm0         \n\t"
    "vmovlpd  %%xmm0, (%%rcx,%%r12)          \n\t"
    "vmovhpd  %%xmm0, (%%rcx,%%r13)          \n\t"
    "addq     %%rdi, %%rcx                   \n\t"
    "                                        \n\t"
    "vextractf128 $1, %%ymm15, %%xmm1        \n\t" /* column 3, rows 0-3 */
    "vmovlpd  (%%rcx),        %%xmm0, %%xmm0 \n\t"
    "vmovhpd  (%%rcx,%%rsi),  %%xmm0, %%xmm0 \n\t"
    "vmulpd   %%xmm2, %%xmm0, %%xmm0         \n\t"
    "vaddpd   %%xmm15, %%xmm0, %%xmm0        \n\t"
    "vmovlpd  %%xmm0, (%%rcx)                \n\t"
    "vmovhpd  %%xmm0, (%%rcx,%%rsi)          \n\t"
    "vmovlpd  (%%rcx,%%r12),  %%xmm0, %%xmm0 \n\t"
    "vmovhpd  (%%rcx,%%r13),  %%xmm0, %%xmm0 \n\t"
    "vmulpd   %%xmm2, %%xmm0, %%xmm0         \n\t"
    "vaddpd   %%xmm1, %%xmm0, %%xmm0         \n\t"
    "vmovlpd  %%xmm0, (%%rcx,%%r12)          \n\t"
    "vmovhpd  %%xmm0, (%%rcx,%%r13)          \n\t"
    "                                        \n\t"
    "vextractf128 $1, %%ymm8, %%xmm1         \n\t" /* column 0, rows 4-7 */
    "vmovlpd  (%%rdx),        %%xmm0, %%xmm0 \n\t"
    "vmovhpd  (%%rdx,%%rsi),  %%xmm0, %%xmm0 \n\t"
    "vmulpd   %%xmm2, %%xmm0, %%xmm0         \n\t"
    "vaddpd   %%xmm8, %%xmm0, %%xmm0         \n\t"
    "vmovlpd  %%xmm0, (%%rdx)                \n\t"
    "vmovhpd  %%xmm0, (%%rdx,%%rsi)          \n\t"
    "vmovlpd  (%%rdx,%%r12),  %%xmm0, %%xmm0 \n\t"
    "vmovhpd  (%%rdx,%%r13),  %%xmm0, %%xmm0 \n\t"
    "vmulpd   %%xmm2, %%xmm0, %%xmm0         \n\t"
    "vaddpd   %%xmm1, %%xmm0, %%xmm0         \n\t"
    "vmovlpd  %%xmm0, (%%rdx,%%r12)          \n\t"
    "vmovhpd  %%xmm0, (%%rdx,%%r13)          \n\t"
    "addq     %%rdi, %%rdx                   \n\t"
    "                                        \n\t"
    "vextractf128 $1, %%ymm10, %%xmm1        \n\t" /* column 1, rows 4-7 */
    "vmovlpd  (%%rdx),        %%xmm0, %%xmm0 \n\t"
    "vmovhpd  (%%rdx,%%rsi),  %%xmm0, %%xmm0 \n\t"
    "vmulpd   %%xmm2, %%xmm0, %%xmm0         \n\t"
    "vaddpd   %%xmm10, %%xmm0, %%xmm0        \n\t"
    "vmovlpd  %%xmm0, (%%rdx)                \n\t"
    "vmovhpd  %%xmm0, (%%rdx,%%rsi)          \n\t"
    "vmovlpd  (%%rdx,%%r12),  %%xmm0, %%xmm0 \n\t"
    "vmovhpd  (%%rdx,%%r13),  %%xmm0, %%xmm0 \n\t"
    "vmulpd   %%xmm2, %%xmm0, %%xmm0         \n\t"
    "vaddpd   %%xmm1, %%xmm0, %%xmm0         \n\t"
    "vmovlpd  %%xmm0, (%%rdx,%%r12)          \n\t"
    "vmovhpd  %%xmm0, (%%rdx,%%r13)          \n\t"
    "addq     %%rdi, %%rdx                   \n\t"
    "                                        \n\t"
    "vextractf128 $1, %%ymm12, %%xmm1        \n\t" /* column 2, rows 4-7 */
    "vmovlpd  (%%rdx),        %%xmm0, %%xmm0 \n\t"
    "vmovhpd  (%%rdx,%%rsi),  %%xmm0, %%xmm0 \n\t"
    "vmulpd   %%xmm2, %%xmm0, %%xmm0         \n\t"
    "vaddpd   %%xmm12, %%xmm0, %%xmm0        \n\t"
    "vmovlpd  %%xmm0, (%%rdx)                \n\t"
    "vmovhpd  %%xmm0, (%%rdx,%%rsi)          \n\t"
    "vmovlpd  (%%rdx,%%r12),  %%xmm0, %%xmm0 \n\t"
    "vmovhpd  (%%rdx,%%r13),  %%xmm0, %%xmm0 \n\t"
    "vmulpd   %%xmm2, %%xmm0, %%xmm0         \n\t"
    "vaddpd   %%xmm1, %%xmm0, %%xmm0         \n\t"
    "vmovlpd  %%xmm0, (%%rdx,%%r12)          \n\t"
    "vmovhpd  %%xmm0, (%%rdx,%%r13)          \n\t"
    "addq     %%rdi, %%rdx                   \n\t"
    "                                        \n\t"
    "vextractf128 $1, %%ymm14, %%xmm1        \n\t" /* column 3, rows 4-7 */
    "vmovlpd  (%%rdx),        %%xmm0, %%xmm0 \n\t"
    "vmovhpd  (%%rdx,%%rsi),  %%xmm0, %%xmm0 \n\t"
    "vmulpd   %%xmm2, %%xmm0, %%xmm0         \n\t"
    "vaddpd   %%xmm14, %%xmm0, %%xmm0        \n\t"
    "vmovlpd  %%xmm0, (%%rdx)                \n\t"
    "vmovhpd  %%xmm0, (%%rdx,%%rsi)          \n\t"
    "vmovlpd  (%%rdx,%%r12),  %%xmm0, %%xmm0 \n\t"
    "vmovhpd  (%%rdx,%%r13),  %%xmm0, %%xmm0 \n\t"
    "vmulpd   %%xmm2, %%xmm0, %%xmm0         \n\t"
    "vaddpd   %%xmm1, %%xmm0, %%xmm0         \n\t"
    "vmovlpd  %%xmm0, (%%rdx,%%r12)          \n\t"
    "vmovhpd  %%xmm0, (%%rdx,%%r13)          \n\t"
    "                                        \n\t"
    "jmp    .DDONE%=                         \n\t"
    "                                        \n\t"
    /* ---- column store, beta != 0: aligned whole-column update ---- */
    ".DCOLSTORED%=:                          \n\t"
    "                                        \n\t"
    "vmovapd  (%%rcx), %%ymm0                \n\t"
    "vmulpd   %%ymm2, %%ymm0, %%ymm0         \n\t"
    "vaddpd   %%ymm9, %%ymm0, %%ymm0         \n\t"
    "vmovapd  %%ymm0, (%%rcx)                \n\t"
    "addq     %%rdi, %%rcx                   \n\t"
    "                                        \n\t"
    "vmovapd  (%%rcx), %%ymm0                \n\t"
    "vmulpd   %%ymm2,  %%ymm0, %%ymm0        \n\t"
    "vaddpd   %%ymm11, %%ymm0, %%ymm0        \n\t"
    "vmovapd  %%ymm0, (%%rcx)                \n\t"
    "addq     %%rdi, %%rcx                   \n\t"
    "                                        \n\t"
    "vmovapd  (%%rcx), %%ymm0                \n\t"
    "vmulpd   %%ymm2,  %%ymm0, %%ymm0        \n\t"
    "vaddpd   %%ymm13, %%ymm0, %%ymm0        \n\t"
    "vmovapd  %%ymm0, (%%rcx)                \n\t"
    "addq     %%rdi, %%rcx                   \n\t"
    "                                        \n\t"
    "vmovapd  (%%rcx), %%ymm0                \n\t"
    "vmulpd   %%ymm2,  %%ymm0, %%ymm0        \n\t"
    "vaddpd   %%ymm15, %%ymm0, %%ymm0        \n\t"
    "vmovapd  %%ymm0, (%%rcx)                \n\t"
    "                                        \n\t"
    "vmovapd  (%%rdx), %%ymm0                \n\t"
    "vmulpd   %%ymm2, %%ymm0, %%ymm0         \n\t"
    "vaddpd   %%ymm8, %%ymm0, %%ymm0         \n\t"
    "vmovapd  %%ymm0, (%%rdx)                \n\t"
    "addq     %%rdi, %%rdx                   \n\t"
    "                                        \n\t"
    "vmovapd  (%%rdx), %%ymm0                \n\t"
    "vmulpd   %%ymm2,  %%ymm0, %%ymm0        \n\t"
    "vaddpd   %%ymm10, %%ymm0, %%ymm0        \n\t"
    "vmovapd  %%ymm0, (%%rdx)                \n\t"
    "addq     %%rdi, %%rdx                   \n\t"
    "                                        \n\t"
    "vmovapd  (%%rdx), %%ymm0                \n\t"
    "vmulpd   %%ymm2,  %%ymm0, %%ymm0        \n\t"
    "vaddpd   %%ymm12, %%ymm0, %%ymm0        \n\t"
    "vmovapd  %%ymm0, (%%rdx)                \n\t"
    "addq     %%rdi, %%rdx                   \n\t"
    "                                        \n\t"
    "vmovapd  (%%rdx), %%ymm0                \n\t"
    "vmulpd   %%ymm2,  %%ymm0, %%ymm0        \n\t"
    "vaddpd   %%ymm14, %%ymm0, %%ymm0        \n\t"
    "vmovapd  %%ymm0, (%%rdx)                \n\t"
    "                                        \n\t"
    "jmp    .DDONE%=                         \n\t"
    "                                        \n\t"
    /* ---- beta == 0: overwrite C without reading it ---- */
    ".DBETAZERO%=:                           \n\t"
    "                                        \n\t"
    "andb    %%bl, %%bh                      \n\t"
    "andb    %%bh, %%al                      \n\t"
    "jne    .DCOLSTORBZ%=                    \n\t"
    "                                        \n\t"
    ".DGENSTORBZ%=:                          \n\t"
    "                                        \n\t"
    "vextractf128 $1, %%ymm9, %%xmm1         \n\t"
    "vmovlpd  %%xmm9, (%%rcx)                \n\t"
    "vmovhpd  %%xmm9, (%%rcx,%%rsi)          \n\t"
    "vmovlpd  %%xmm1, (%%rcx,%%r12)          \n\t"
    "vmovhpd  %%xmm1, (%%rcx,%%r13)          \n\t"
    "addq     %%rdi, %%rcx                   \n\t"
    "                                        \n\t"
    "vextractf128 $1, %%ymm11, %%xmm1        \n\t"
    "vmovlpd  %%xmm11, (%%rcx)               \n\t"
    "vmovhpd  %%xmm11, (%%rcx,%%rsi)         \n\t"
    "vmovlpd  %%xmm1,  (%%rcx,%%r12)         \n\t"
    "vmovhpd  %%xmm1,  (%%rcx,%%r13)         \n\t"
    "addq     %%rdi, %%rcx                   \n\t"
    "                                        \n\t"
    "vextractf128 $1, %%ymm13, %%xmm1        \n\t"
    "vmovlpd  %%xmm13, (%%rcx)               \n\t"
    "vmovhpd  %%xmm13, (%%rcx,%%rsi)         \n\t"
    "vmovlpd  %%xmm1,  (%%rcx,%%r12)         \n\t"
    "vmovhpd  %%xmm1,  (%%rcx,%%r13)         \n\t"
    "addq     %%rdi, %%rcx                   \n\t"
    "                                        \n\t"
    "vextractf128 $1, %%ymm15, %%xmm1        \n\t"
    "vmovlpd  %%xmm15, (%%rcx)               \n\t"
    "vmovhpd  %%xmm15, (%%rcx,%%rsi)         \n\t"
    "vmovlpd  %%xmm1,  (%%rcx,%%r12)         \n\t"
    "vmovhpd  %%xmm1,  (%%rcx,%%r13)         \n\t"
    "                                        \n\t"
    "vextractf128 $1, %%ymm8, %%xmm1         \n\t"
    "vmovlpd  %%xmm8, (%%rdx)                \n\t"
    "vmovhpd  %%xmm8, (%%rdx,%%rsi)          \n\t"
    "vmovlpd  %%xmm1, (%%rdx,%%r12)          \n\t"
    "vmovhpd  %%xmm1, (%%rdx,%%r13)          \n\t"
    "addq     %%rdi, %%rdx                   \n\t"
    "                                        \n\t"
    "vextractf128 $1, %%ymm10, %%xmm1        \n\t"
    "vmovlpd  %%xmm10, (%%rdx)               \n\t"
    "vmovhpd  %%xmm10, (%%rdx,%%rsi)         \n\t"
    "vmovlpd  %%xmm1,  (%%rdx,%%r12)         \n\t"
    "vmovhpd  %%xmm1,  (%%rdx,%%r13)         \n\t"
    "addq     %%rdi, %%rdx                   \n\t"
    "                                        \n\t"
    "vextractf128 $1, %%ymm12, %%xmm1        \n\t"
    "vmovlpd  %%xmm12, (%%rdx)               \n\t"
    "vmovhpd  %%xmm12, (%%rdx,%%rsi)         \n\t"
    "vmovlpd  %%xmm1,  (%%rdx,%%r12)         \n\t"
    "vmovhpd  %%xmm1,  (%%rdx,%%r13)         \n\t"
    "addq     %%rdi, %%rdx                   \n\t"
    "                                        \n\t"
    "vextractf128 $1, %%ymm14, %%xmm1        \n\t"
    "vmovlpd  %%xmm14, (%%rdx)               \n\t"
    "vmovhpd  %%xmm14, (%%rdx,%%rsi)         \n\t"
    "vmovlpd  %%xmm1,  (%%rdx,%%r12)         \n\t"
    "vmovhpd  %%xmm1,  (%%rdx,%%r13)         \n\t"
    "                                        \n\t"
    "jmp    .DDONE%=                         \n\t"
    "                                        \n\t"
    ".DCOLSTORBZ%=:                          \n\t"
    "                                        \n\t"
    "vmovapd  %%ymm9,  (%%rcx)               \n\t"
    "addq     %%rdi, %%rcx                   \n\t"
    "                                        \n\t"
    "vmovapd  %%ymm11, (%%rcx)               \n\t"
    "addq     %%rdi, %%rcx                   \n\t"
    "                                        \n\t"
    "vmovapd  %%ymm13, (%%rcx)               \n\t"
    "addq     %%rdi, %%rcx                   \n\t"
    "                                        \n\t"
    "vmovapd  %%ymm15, (%%rcx)               \n\t"
    "                                        \n\t"
    "vmovapd  %%ymm8,  (%%rdx)               \n\t"
    "addq     %%rdi, %%rdx                   \n\t"
    "                                        \n\t"
    "vmovapd  %%ymm10, (%%rdx)               \n\t"
    "addq     %%rdi, %%rdx                   \n\t"
    "                                        \n\t"
    "vmovapd  %%ymm12, (%%rdx)               \n\t"
    "addq     %%rdi, %%rdx                   \n\t"
    "                                        \n\t"
    "vmovapd  %%ymm14, (%%rdx)               \n\t"
    "                                        \n\t"
    ".DDONE%=:                               \n\t"
    "                                        \n\t"
    : /* no outputs */
    : /* inputs */
      "m" (k_iter),   /* %0 */
      "m" (k_left),   /* %1 */
      "m" (A),        /* %2 */
      "m" (B),        /* %3 */
      "m" (pAlpha),   /* %4 */
      "m" (pBeta),    /* %5 */
      "m" (C),        /* %6 */
      "m" (rs_c),     /* %7 */
      "m" (cs_c),     /* %8 */
      "m" (b_next)    /* %9 */
    : /* clobbers */
      "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
      "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
      "xmm0", "xmm1", "xmm2", "xmm3",
      "xmm4", "xmm5", "xmm6", "xmm7",
      "xmm8", "xmm9", "xmm10", "xmm11",
      "xmm12", "xmm13", "xmm14", "xmm15",
      "memory"
    );
}