=============================== Bestückung des ulmBLAS Arsenals =============================== Wir stellen verschieden GEMM Micro-Kernel zur Verfügung: - `hpc/ulmblas/kernels/ugemm_sse4x4.h` enthält einen für SSE3 optimierten Micor-Kernel, der voraussetzt, dass `MR=4` und `NR=4` ist. - `hpc/ulmblas/kernels/ugemm_avx4x8.h` enthält einen für AVX optimierten Micor-Kernel, der voraussetzt, dass `MR=4` und `NR=8` ist. Diese sollen aktiviert werden, falls beim Übersetzten `-DSSE` bzw. `-DAVX` mitgegeben wird und die Blockgrößen passen. Dazu muss auch das Header File `hpc/ulmblas/kernels/ugemm.h` angepasst werden: :import: session12/page02/hpc/ulmblas/kernels/ugemm.h Aufgabe: SSE Micro-Kernel einbinden =================================== ---- CODE (type=cc) ----------------------------------------------------------- void ugemm(std::int64_t kc_, double alpha, const double *A, const double *B, double beta, double *C, std::int64_t incRowC_, std::int64_t incColC_) { int64_t kc = kc_; int64_t incRowC = incRowC_; int64_t incColC = incColC_; int64_t kb = kc / 4; int64_t kl = kc % 4; __asm__ volatile ( "movq %0, %%rsi \n\t" // kb (32 bit) stored in %rsi "movq %1, %%rdi \n\t" // kl (32 bit) stored in %rdi "movq %2, %%rax \n\t" // Address of A stored in %rax "movq %3, %%rbx \n\t" // Address of B stored in %rbx " \n\t" "movapd (%%rax), %%xmm0 \n\t" // tmp0 = _mm_load_pd(A) "movapd 16(%%rax), %%xmm1 \n\t" // tmp1 = _mm_load_pd(A+2) "movapd (%%rbx), %%xmm2 \n\t" // tmp2 = _mm_load_pd(B) " \n\t" "xorpd %%xmm8, %%xmm8 \n\t" // ab_00_11 = _mm_setzero_pd() "xorpd %%xmm9, %%xmm9 \n\t" // ab_20_31 = _mm_setzero_pd() "xorpd %%xmm10, %%xmm10 \n\t" // ab_01_10 = _mm_setzero_pd() "xorpd %%xmm11, %%xmm11 \n\t" // ab_21_30 = _mm_setzero_pd() "xorpd %%xmm12, %%xmm12 \n\t" // ab_02_13 = _mm_setzero_pd() "xorpd %%xmm13, %%xmm13 \n\t" // ab_22_33 = _mm_setzero_pd() "xorpd %%xmm14, %%xmm14 \n\t" // ab_03_12 = _mm_setzero_pd() "xorpd %%xmm15, %%xmm15 \n\t" // ab_23_32 = _mm_setzero_pd() " \n\t" "xorpd %%xmm3, %%xmm3 \n\t" // tmp3 = _mm_setzero_pd "xorpd %%xmm4, %%xmm4 \n\t" // tmp4 = _mm_setzero_pd "xorpd %%xmm5, %%xmm5 \n\t" // tmp5 = _mm_setzero_pd "xorpd %%xmm6, %%xmm6 \n\t" // tmp6 = _mm_setzero_pd "xorpd %%xmm7, %%xmm7 \n\t" // tmp7 = _mm_setzero_pd "testq %%rdi, %%rdi \n\t" // if kl==0 writeback to AB " \n\t" " \n\t" "testq %%rsi, %%rsi \n\t" // if kb==0 handle remaining kl "je .DCONSIDERLEFT%= \n\t" // update iterations " \n\t" ".DLOOP%=: \n\t" // for l = kb,..,1 do " \n\t" " \n\t" // 1. update "addpd %%xmm3, %%xmm12 \n\t" // ab_02_13 = _mm_add_pd(ab_02_13, tmp3) "movapd 16(%%rbx), %%xmm3 \n\t" // tmp3 = _mm_load_pd(B+2) "addpd %%xmm6, %%xmm13 \n\t" // ab_22_33 = _mm_add_pd(ab_22_33, tmp6) "movapd %%xmm2, %%xmm6 \n\t" // tmp6 = tmp2 "pshufd $78,%%xmm2, %%xmm4 \n\t" // tmp4 = _mm_shuffle_pd(tmp2, tmp2, " \n\t" // _MM_SHUFFLE2(0, 1)) "mulpd %%xmm0, %%xmm2 \n\t" // tmp2 = _mm_mul_pd(tmp2, tmp0); "mulpd %%xmm1, %%xmm6 \n\t" // tmp6 = _mm_mul_pd(tmp6, tmp1); " \n\t" " \n\t" "addpd %%xmm5, %%xmm14 \n\t" // ab_03_12 = _mm_add_pd(ab_03_12, tmp5) "addpd %%xmm7, %%xmm15 \n\t" // ab_23_32 = _mm_add_pd(ab_23_32, tmp7) "movapd %%xmm4, %%xmm7 \n\t" // tmp7 = tmp4 "mulpd %%xmm0, %%xmm4 \n\t" // tmp4 = _mm_mul_pd(tmp4, tmp0) "mulpd %%xmm1, %%xmm7 \n\t" // tmp7 = _mm_mul_pd(tmp7, tmp1) " \n\t" " \n\t" "addpd %%xmm2, %%xmm8 \n\t" // ab_00_11 = _mm_add_pd(ab_00_11, tmp2) "movapd 32(%%rbx), %%xmm2 \n\t" // tmp2 = _mm_load_pd(B+4) "addpd %%xmm6, %%xmm9 \n\t" // ab_20_31 = _mm_add_pd(ab_20_31, tmp6) "movapd %%xmm3, %%xmm6 \n\t" // tmp6 = tmp3 "pshufd $78,%%xmm3, %%xmm5 \n\t" // tmp5 = _mm_shuffle_pd(tmp3, tmp3, " \n\t" // _MM_SHUFFLE2(0, 1)) "mulpd %%xmm0, %%xmm3 \n\t" // tmp3 = _mm_mul_pd(tmp3, tmp0) "mulpd %%xmm1, %%xmm6 \n\t" // tmp6 = _mm_mul_pd(tmp6, tmp1) " \n\t" " \n\t" "addpd %%xmm4, %%xmm10 \n\t" // ab_01_10 = _mm_add_pd(ab_01_10, tmp4) "addpd %%xmm7, %%xmm11 \n\t" // ab_21_30 = _mm_add_pd(ab_21_30, tmp7) "movapd %%xmm5, %%xmm7 \n\t" // tmp7 = tmp5 "mulpd %%xmm0, %%xmm5 \n\t" // tmp5 = _mm_mul_pd(tmp5, tmp0) "movapd 32(%%rax), %%xmm0 \n\t" // tmp0 = _mm_load_pd(A+4) "mulpd %%xmm1, %%xmm7 \n\t" // tmp7 = _mm_mul_pd(tmp7, tmp1) "movapd 48(%%rax), %%xmm1 \n\t" // tmp1 = _mm_load_pd(A+6) " \n\t" " \n\t" " \n\t" " \n\t" // 2. update "addpd %%xmm3, %%xmm12 \n\t" // ab_02_13 = _mm_add_pd(ab_02_13, tmp3) "movapd 48(%%rbx), %%xmm3 \n\t" // tmp3 = _mm_load_pd(B+6) "addpd %%xmm6, %%xmm13 \n\t" // ab_22_33 = _mm_add_pd(ab_22_33, tmp6) "movapd %%xmm2, %%xmm6 \n\t" // tmp6 = tmp2 "pshufd $78,%%xmm2, %%xmm4 \n\t" // tmp4 = _mm_shuffle_pd(tmp2, tmp2, " \n\t" // _MM_SHUFFLE2(0, 1)) "mulpd %%xmm0, %%xmm2 \n\t" // tmp2 = _mm_mul_pd(tmp2, tmp0); "mulpd %%xmm1, %%xmm6 \n\t" // tmp6 = _mm_mul_pd(tmp6, tmp1); " \n\t" " \n\t" "addpd %%xmm5, %%xmm14 \n\t" // ab_03_12 = _mm_add_pd(ab_03_12, tmp5) "addpd %%xmm7, %%xmm15 \n\t" // ab_23_32 = _mm_add_pd(ab_23_32, tmp7) "movapd %%xmm4, %%xmm7 \n\t" // tmp7 = tmp4 "mulpd %%xmm0, %%xmm4 \n\t" // tmp4 = _mm_mul_pd(tmp4, tmp0) "mulpd %%xmm1, %%xmm7 \n\t" // tmp7 = _mm_mul_pd(tmp7, tmp1) " \n\t" " \n\t" "addpd %%xmm2, %%xmm8 \n\t" // ab_00_11 = _mm_add_pd(ab_00_11, tmp2) "movapd 64(%%rbx), %%xmm2 \n\t" // tmp2 = _mm_load_pd(B+8) "addpd %%xmm6, %%xmm9 \n\t" // ab_20_31 = _mm_add_pd(ab_20_31, tmp6) "movapd %%xmm3, %%xmm6 \n\t" // tmp6 = tmp3 "pshufd $78,%%xmm3, %%xmm5 \n\t" // tmp5 = _mm_shuffle_pd(tmp3, tmp3, " \n\t" // _MM_SHUFFLE2(0, 1)) "mulpd %%xmm0, %%xmm3 \n\t" // tmp3 = _mm_mul_pd(tmp3, tmp0) "mulpd %%xmm1, %%xmm6 \n\t" // tmp6 = _mm_mul_pd(tmp6, tmp1) " \n\t" " \n\t" "addpd %%xmm4, %%xmm10 \n\t" // ab_01_10 = _mm_add_pd(ab_01_10, tmp4) "addpd %%xmm7, %%xmm11 \n\t" // ab_21_30 = _mm_add_pd(ab_21_30, tmp7) "movapd %%xmm5, %%xmm7 \n\t" // tmp7 = tmp5 "mulpd %%xmm0, %%xmm5 \n\t" // tmp5 = _mm_mul_pd(tmp5, tmp0) "movapd 64(%%rax), %%xmm0 \n\t" // tmp0 = _mm_load_pd(A+8) "mulpd %%xmm1, %%xmm7 \n\t" // tmp7 = _mm_mul_pd(tmp7, tmp1) "movapd 80(%%rax), %%xmm1 \n\t" // tmp1 = _mm_load_pd(A+10) " \n\t" " \n\t" " \n\t" // 3. update "addpd %%xmm3, %%xmm12 \n\t" // ab_02_13 = _mm_add_pd(ab_02_13, tmp3) "movapd 80(%%rbx), %%xmm3 \n\t" // tmp3 = _mm_load_pd(B+10) "addpd %%xmm6, %%xmm13 \n\t" // ab_22_33 = _mm_add_pd(ab_22_33, tmp6) "movapd %%xmm2, %%xmm6 \n\t" // tmp6 = tmp2 "pshufd $78,%%xmm2, %%xmm4 \n\t" // tmp4 = _mm_shuffle_pd(tmp2, tmp2, " \n\t" // _MM_SHUFFLE2(0, 1)) "mulpd %%xmm0, %%xmm2 \n\t" // tmp2 = _mm_mul_pd(tmp2, tmp0); "mulpd %%xmm1, %%xmm6 \n\t" // tmp6 = _mm_mul_pd(tmp6, tmp1); " \n\t" " \n\t" "addpd %%xmm5, %%xmm14 \n\t" // ab_03_12 = _mm_add_pd(ab_03_12, tmp5) "addpd %%xmm7, %%xmm15 \n\t" // ab_23_32 = _mm_add_pd(ab_23_32, tmp7) "movapd %%xmm4, %%xmm7 \n\t" // tmp7 = tmp4 "mulpd %%xmm0, %%xmm4 \n\t" // tmp4 = _mm_mul_pd(tmp4, tmp0) "mulpd %%xmm1, %%xmm7 \n\t" // tmp7 = _mm_mul_pd(tmp7, tmp1) " \n\t" " \n\t" "addpd %%xmm2, %%xmm8 \n\t" // ab_00_11 = _mm_add_pd(ab_00_11, tmp2) "movapd 96(%%rbx), %%xmm2 \n\t" // tmp2 = _mm_load_pd(B+12) "addpd %%xmm6, %%xmm9 \n\t" // ab_20_31 = _mm_add_pd(ab_20_31, tmp6) "movapd %%xmm3, %%xmm6 \n\t" // tmp6 = tmp3 "pshufd $78,%%xmm3, %%xmm5 \n\t" // tmp5 = _mm_shuffle_pd(tmp3, tmp3, " \n\t" // _MM_SHUFFLE2(0, 1)) "mulpd %%xmm0, %%xmm3 \n\t" // tmp3 = _mm_mul_pd(tmp3, tmp0) "mulpd %%xmm1, %%xmm6 \n\t" // tmp6 = _mm_mul_pd(tmp6, tmp1) " \n\t" " \n\t" "addpd %%xmm4, %%xmm10 \n\t" // ab_01_10 = _mm_add_pd(ab_01_10, tmp4) "addpd %%xmm7, %%xmm11 \n\t" // ab_21_30 = _mm_add_pd(ab_21_30, tmp7) "movapd %%xmm5, %%xmm7 \n\t" // tmp7 = tmp5 "mulpd %%xmm0, %%xmm5 \n\t" // tmp5 = _mm_mul_pd(tmp5, tmp0) "movapd 96(%%rax), %%xmm0 \n\t" // tmp0 = _mm_load_pd(A+12) "mulpd %%xmm1, %%xmm7 \n\t" // tmp7 = _mm_mul_pd(tmp7, tmp1) "movapd 112(%%rax), %%xmm1 \n\t" // tmp1 = _mm_load_pd(A+14) " \n\t" " \n\t" " \n\t" // 4. update "addpd %%xmm3, %%xmm12 \n\t" // ab_02_13 = _mm_add_pd(ab_02_13, tmp3) "movapd 112(%%rbx), %%xmm3 \n\t" // tmp3 = _mm_load_pd(B+14) "addpd %%xmm6, %%xmm13 \n\t" // ab_22_33 = _mm_add_pd(ab_22_33, tmp6) "movapd %%xmm2, %%xmm6 \n\t" // tmp6 = tmp2 "pshufd $78,%%xmm2, %%xmm4 \n\t" // tmp4 = _mm_shuffle_pd(tmp2, tmp2, " \n\t" // _MM_SHUFFLE2(0, 1)) "mulpd %%xmm0, %%xmm2 \n\t" // tmp2 = _mm_mul_pd(tmp2, tmp0); "mulpd %%xmm1, %%xmm6 \n\t" // tmp6 = _mm_mul_pd(tmp6, tmp1); " \n\t" "addq $32*4, %%rax \n\t" // A += 16; " \n\t" "addpd %%xmm5, %%xmm14 \n\t" // ab_03_12 = _mm_add_pd(ab_03_12, tmp5) "addpd %%xmm7, %%xmm15 \n\t" // ab_23_32 = _mm_add_pd(ab_23_32, tmp7) "movapd %%xmm4, %%xmm7 \n\t" // tmp7 = tmp4 "mulpd %%xmm0, %%xmm4 \n\t" // tmp4 = _mm_mul_pd(tmp4, tmp0) "mulpd %%xmm1, %%xmm7 \n\t" // tmp7 = _mm_mul_pd(tmp7, tmp1) " \n\t" " \n\t" "addpd %%xmm2, %%xmm8 \n\t" // ab_00_11 = _mm_add_pd(ab_00_11, tmp2) "movapd 128(%%rbx), %%xmm2 \n\t" // tmp2 = _mm_load_pd(B+16) "addpd %%xmm6, %%xmm9 \n\t" // ab_20_31 = _mm_add_pd(ab_20_31, tmp6) "movapd %%xmm3, %%xmm6 \n\t" // tmp6 = tmp3 "pshufd $78,%%xmm3, %%xmm5 \n\t" // tmp5 = _mm_shuffle_pd(tmp3, tmp3, " \n\t" // _MM_SHUFFLE2(0, 1)) "mulpd %%xmm0, %%xmm3 \n\t" // tmp3 = _mm_mul_pd(tmp3, tmp0) "mulpd %%xmm1, %%xmm6 \n\t" // tmp6 = _mm_mul_pd(tmp6, tmp1) " \n\t" "addq $32*4, %%rbx \n\t" // B += 16; " \n\t" " \n\t" "addpd %%xmm4, %%xmm10 \n\t" // ab_01_10 = _mm_add_pd(ab_01_10, tmp4) "addpd %%xmm7, %%xmm11 \n\t" // ab_21_30 = _mm_add_pd(ab_21_30, tmp7) "movapd %%xmm5, %%xmm7 \n\t" // tmp7 = tmp5 "mulpd %%xmm0, %%xmm5 \n\t" // tmp5 = _mm_mul_pd(tmp5, tmp0) "movapd (%%rax), %%xmm0 \n\t" // tmp0 = _mm_load_pd(A+16) "mulpd %%xmm1, %%xmm7 \n\t" // tmp7 = _mm_mul_pd(tmp7, tmp1) "movapd 16(%%rax), %%xmm1 \n\t" // tmp1 = _mm_load_pd(A+18) " \n\t" " \n\t" "decq %%rsi \n\t" // --l "jne .DLOOP%= \n\t" // if l>= 1 go back " \n\t" " \n\t" ".DCONSIDERLEFT%=: \n\t" "testq %%rdi, %%rdi \n\t" // if kl==0 writeback to AB "je .DPOSTACCUMULATE%=\n\t" " \n\t" ".DLOOPLEFT%=: \n\t" // for l = kl,..,1 do " \n\t" "addpd %%xmm3, %%xmm12 \n\t" // ab_02_13 = _mm_add_pd(ab_02_13, tmp3) "movapd 16(%%rbx), %%xmm3 \n\t" // tmp3 = _mm_load_pd(B+2) "addpd %%xmm6, %%xmm13 \n\t" // ab_22_33 = _mm_add_pd(ab_22_33, tmp6) "movapd %%xmm2, %%xmm6 \n\t" // tmp6 = tmp2 "pshufd $78,%%xmm2, %%xmm4 \n\t" // tmp4 = _mm_shuffle_pd(tmp2, tmp2, " \n\t" // _MM_SHUFFLE2(0, 1)) "mulpd %%xmm0, %%xmm2 \n\t" // tmp2 = _mm_mul_pd(tmp2, tmp0); "mulpd %%xmm1, %%xmm6 \n\t" // tmp6 = _mm_mul_pd(tmp6, tmp1); " \n\t" " \n\t" "addpd %%xmm5, %%xmm14 \n\t" // ab_03_12 = _mm_add_pd(ab_03_12, tmp5) "addpd %%xmm7, %%xmm15 \n\t" // ab_23_32 = _mm_add_pd(ab_23_32, tmp7) "movapd %%xmm4, %%xmm7 \n\t" // tmp7 = tmp4 "mulpd %%xmm0, %%xmm4 \n\t" // tmp4 = _mm_mul_pd(tmp4, tmp0) "mulpd %%xmm1, %%xmm7 \n\t" // tmp7 = _mm_mul_pd(tmp7, tmp1) " \n\t" " \n\t" "addpd %%xmm2, %%xmm8 \n\t" // ab_00_11 = _mm_add_pd(ab_00_11, tmp2) "movapd 32(%%rbx), %%xmm2 \n\t" // tmp2 = _mm_load_pd(B+4) "addpd %%xmm6, %%xmm9 \n\t" // ab_20_31 = _mm_add_pd(ab_20_31, tmp6) "movapd %%xmm3, %%xmm6 \n\t" // tmp6 = tmp3 "pshufd $78,%%xmm3, %%xmm5 \n\t" // tmp5 = _mm_shuffle_pd(tmp3, tmp3, " \n\t" // _MM_SHUFFLE2(0, 1)) "mulpd %%xmm0, %%xmm3 \n\t" // tmp3 = _mm_mul_pd(tmp3, tmp0) "mulpd %%xmm1, %%xmm6 \n\t" // tmp6 = _mm_mul_pd(tmp6, tmp1) " \n\t" " \n\t" "addpd %%xmm4, %%xmm10 \n\t" // ab_01_10 = _mm_add_pd(ab_01_10, tmp4) "addpd %%xmm7, %%xmm11 \n\t" // ab_21_30 = _mm_add_pd(ab_21_30, tmp7) "movapd %%xmm5, %%xmm7 \n\t" // tmp7 = tmp5 "mulpd %%xmm0, %%xmm5 \n\t" // tmp5 = _mm_mul_pd(tmp5, tmp0) "movapd 32(%%rax), %%xmm0 \n\t" // tmp0 = _mm_load_pd(A+4) "mulpd %%xmm1, %%xmm7 \n\t" // tmp7 = _mm_mul_pd(tmp7, tmp1) "movapd 48(%%rax), %%xmm1 \n\t" // tmp1 = _mm_load_pd(A+6) " \n\t" " \n\t" "addq $32, %%rax \n\t" // A += 4; "addq $32, %%rbx \n\t" // B += 4; " \n\t" "decq %%rdi \n\t" // --l "jne .DLOOPLEFT%= \n\t" // if l>= 1 go back " \n\t" ".DPOSTACCUMULATE%=: \n\t" // Update remaining ab_*_* registers " \n\t" "addpd %%xmm3, %%xmm12 \n\t" // ab_02_13 = _mm_add_pd(ab_02_13, tmp3) "addpd %%xmm6, %%xmm13 \n\t" // ab_22_33 = _mm_add_pd(ab_22_33, tmp6) " \n\t" // "addpd %%xmm5, %%xmm14 \n\t" // ab_03_12 = _mm_add_pd(ab_03_12, tmp5) "addpd %%xmm7, %%xmm15 \n\t" // ab_23_32 = _mm_add_pd(ab_23_32, tmp7) " \n\t" // // Update C <- beta*C + alpha*AB // // "movsd %4, %%xmm0 \n\t" // load alpha "movsd %5, %%xmm1 \n\t" // load beta "movq %6, %%rcx \n\t" // Address of C stored in %rcx "movq %7, %%r8 \n\t" // load incRowC "leaq (,%%r8,8), %%r8 \n\t" // incRowC *= sizeof(double) "movq %8, %%r9 \n\t" // load incColC "leaq (,%%r9,8), %%r9 \n\t" // incRowC *= sizeof(double) " \n\t" "leaq (%%rcx,%%r9), %%r10 \n\t" // Store addr of C01 in %r10 "leaq (%%rcx,%%r8,2), %%rdx \n\t" // Store addr of C20 in %rdx "leaq (%%rdx,%%r9), %%r11 \n\t" // Store addr of C21 in %r11 " \n\t" "unpcklpd %%xmm0, %%xmm0 \n\t" // duplicate alpha "unpcklpd %%xmm1, %%xmm1 \n\t" // duplicate beta " \n\t" " \n\t" "movlpd (%%rcx), %%xmm3 \n\t" // load (C00, "movhpd (%%r10,%%r8), %%xmm3 \n\t" // C11) "mulpd %%xmm0, %%xmm8 \n\t" // scale ab_00_11 by alpha "mulpd %%xmm1, %%xmm3 \n\t" // scale (C00, C11) by beta "addpd %%xmm8, %%xmm3 \n\t" // add results "movlpd %%xmm3, (%%rcx) \n\t" // write back (C00, "movhpd %%xmm3, (%%r10,%%r8) \n\t" // C11) " \n\t" "movlpd (%%rdx), %%xmm4 \n\t" // load (C20, "movhpd (%%r11,%%r8), %%xmm4 \n\t" // C31) "mulpd %%xmm0, %%xmm9 \n\t" // scale ab_20_31 by alpha "mulpd %%xmm1, %%xmm4 \n\t" // scale (C20, C31) by beta "addpd %%xmm9, %%xmm4 \n\t" // add results "movlpd %%xmm4, (%%rdx) \n\t" // write back (C20, "movhpd %%xmm4, (%%r11,%%r8) \n\t" // C31) " \n\t" " \n\t" "movlpd (%%r10), %%xmm3 \n\t" // load (C01, "movhpd (%%rcx,%%r8), %%xmm3 \n\t" // C10) "mulpd %%xmm0, %%xmm10\n\t" // scale ab_01_10 by alpha "mulpd %%xmm1, %%xmm3 \n\t" // scale (C01, C10) by beta "addpd %%xmm10, %%xmm3 \n\t" // add results "movlpd %%xmm3, (%%r10) \n\t" // write back (C01, "movhpd %%xmm3, (%%rcx,%%r8) \n\t" // C10) " \n\t" "movlpd (%%r11), %%xmm4 \n\t" // load (C21, "movhpd (%%rdx,%%r8), %%xmm4 \n\t" // C30) "mulpd %%xmm0, %%xmm11\n\t" // scale ab_21_30 by alpha "mulpd %%xmm1, %%xmm4 \n\t" // scale (C21, C30) by beta "addpd %%xmm11, %%xmm4 \n\t" // add results "movlpd %%xmm4, (%%r11) \n\t" // write back (C21, "movhpd %%xmm4, (%%rdx,%%r8) \n\t" // C30) " \n\t" " \n\t" "leaq (%%rcx,%%r9,2), %%rcx \n\t" // Store addr of C02 in %rcx "leaq (%%r10,%%r9,2), %%r10 \n\t" // Store addr of C03 in %r10 "leaq (%%rdx,%%r9,2), %%rdx \n\t" // Store addr of C22 in $rdx "leaq (%%r11,%%r9,2), %%r11 \n\t" // Store addr of C23 in %r11 " \n\t" " \n\t" "movlpd (%%rcx), %%xmm3 \n\t" // load (C02, "movhpd (%%r10,%%r8), %%xmm3 \n\t" // C13) "mulpd %%xmm0, %%xmm12\n\t" // scale ab_02_13 by alpha "mulpd %%xmm1, %%xmm3 \n\t" // scale (C02, C13) by beta "addpd %%xmm12, %%xmm3 \n\t" // add results "movlpd %%xmm3, (%%rcx) \n\t" // write back (C02, "movhpd %%xmm3, (%%r10,%%r8) \n\t" // C13) " \n\t" "movlpd (%%rdx), %%xmm4 \n\t" // load (C22, "movhpd (%%r11, %%r8), %%xmm4 \n\t" // C33) "mulpd %%xmm0, %%xmm13\n\t" // scale ab_22_33 by alpha "mulpd %%xmm1, %%xmm4 \n\t" // scale (C22, C33) by beta "addpd %%xmm13, %%xmm4 \n\t" // add results "movlpd %%xmm4, (%%rdx) \n\t" // write back (C22, "movhpd %%xmm4, (%%r11,%%r8) \n\t" // C33) " \n\t" " \n\t" "movlpd (%%r10), %%xmm3 \n\t" // load (C03, "movhpd (%%rcx,%%r8), %%xmm3 \n\t" // C12) "mulpd %%xmm0, %%xmm14\n\t" // scale ab_03_12 by alpha "mulpd %%xmm1, %%xmm3 \n\t" // scale (C03, C12) by beta "addpd %%xmm14, %%xmm3 \n\t" // add results "movlpd %%xmm3, (%%r10) \n\t" // write back (C03, "movhpd %%xmm3, (%%rcx,%%r8) \n\t" // C12) " \n\t" "movlpd (%%r11), %%xmm4 \n\t" // load (C23, "movhpd (%%rdx,%%r8), %%xmm4 \n\t" // C32) "mulpd %%xmm0, %%xmm15\n\t" // scale ab_23_32 by alpha "mulpd %%xmm1, %%xmm4 \n\t" // scale (C23, C32) by beta "addpd %%xmm15, %%xmm4 \n\t" // add results "movlpd %%xmm4, (%%r11) \n\t" // write back (C23, "movhpd %%xmm4, (%%rdx,%%r8) \n\t" // C32) : // output : // input "m" (kb), // 0 "m" (kl), // 1 "m" (A), // 2 "m" (B), // 3 "m" (alpha), // 4 "m" (beta), // 5 "m" (C), // 6 "m" (incRowC), // 7 "m" (incColC) // 8 : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15" ); } ------------------------------------------------------------------------------- Aufgabe: AVX Micro-Kernel einbinden =================================== ---- CODE (type=cc) ----------------------------------------------------------- void ugemm(std::int64_t kc_, double alpha, const double *A, const double *B, double beta, double *C, std::int64_t incRowC_, std::int64_t incColC_) { int64_t kc = kc_; int64_t incRowC = incRowC_; int64_t incColC = incColC_; double *pAlpha = α double *pBeta = β // // Compute AB = A*B // __asm__ volatile ( "movq %0, %%rdi \n\t" // kc "movq %1, %%rsi \n\t" // A "movq %2, %%rdx \n\t" // B "movq %5, %%rcx \n\t" // C "movq %6, %%r8 \n\t" // incRowC "movq %7, %%r9 \n\t" // incColC "vmovapd 0 * 32(%%rdx), %%ymm4\n\t" "vbroadcastsd 0 * 8(%%rsi), %%ymm0\n\t" "vbroadcastsd 1 * 8(%%rsi), %%ymm1\n\t" "vbroadcastsd 2 * 8(%%rsi), %%ymm2\n\t" "vbroadcastsd 3 * 8(%%rsi), %%ymm3\n\t" "vxorpd %%ymm8, %%ymm8, %%ymm8\n\t" "vxorpd %%ymm9, %%ymm9, %%ymm9\n\t" "vxorpd %%ymm10, %%ymm10, %%ymm10\n\t" "vxorpd %%ymm11, %%ymm11, %%ymm11\n\t" "vxorpd %%ymm12, %%ymm12, %%ymm12\n\t" "vxorpd %%ymm13, %%ymm13, %%ymm13\n\t" "vxorpd %%ymm14, %%ymm14, %%ymm14\n\t" "vxorpd %%ymm15, %%ymm15, %%ymm15\n\t" "jmp check%=\n\t" "loop%=:\n\t" "vmovapd 1 * 32(%%rdx), %%ymm5\n\t" "vmulpd %%ymm0, %%ymm4, %%ymm6\n\t" "vaddpd %%ymm6, %%ymm8, %%ymm8\n\t" "vmulpd %%ymm1, %%ymm4, %%ymm7\n\t" "vaddpd %%ymm7, %%ymm9, %%ymm9\n\t" "vmulpd %%ymm2, %%ymm4, %%ymm6\n\t" "vaddpd %%ymm6, %%ymm10, %%ymm10\n\t" "vmulpd %%ymm3, %%ymm4, %%ymm7\n\t" "vaddpd %%ymm7, %%ymm11, %%ymm11\n\t" "vmovapd 2 * 32(%%rdx), %%ymm4\n\t" "vmulpd %%ymm0, %%ymm5, %%ymm6\n\t" "vaddpd %%ymm6, %%ymm12, %%ymm12\n\t" "vbroadcastsd 4 * 8(%%rsi), %%ymm0\n\t" "vmulpd %%ymm1, %%ymm5, %%ymm7\n\t" "vaddpd %%ymm7, %%ymm13, %%ymm13\n\t" "vbroadcastsd 5 * 8(%%rsi), %%ymm1\n\t" "vmulpd %%ymm2, %%ymm5, %%ymm6\n\t" "vaddpd %%ymm6, %%ymm14, %%ymm14\n\t" "vbroadcastsd 6 * 8(%%rsi), %%ymm2\n\t" "vmulpd %%ymm3, %%ymm5, %%ymm7\n\t" "vaddpd %%ymm7, %%ymm15, %%ymm15\n\t" "vbroadcastsd 7 * 8(%%rsi), %%ymm3\n\t" "addq $32, %%rsi\n\t" "addq $2*32, %%rdx\n\t" "decq %%rdi\n\t" "check%=:\n\t" "testq %%rdi, %%rdi\n\t" "jg loop%=\n\t" "movq %3, %%rdi \n\t" // alpha "movq %4, %%rsi \n\t" // beta "vbroadcastsd (%%rdi), %%ymm6\n\t" "vbroadcastsd (%%rsi), %%ymm7\n\t" "vmulpd %%ymm6, %%ymm8, %%ymm8\n\t" "vmulpd %%ymm6, %%ymm9, %%ymm9\n\t" "vmulpd %%ymm6, %%ymm10, %%ymm10\n\t" "vmulpd %%ymm6, %%ymm11, %%ymm11\n\t" "vmulpd %%ymm6, %%ymm12, %%ymm12\n\t" "vmulpd %%ymm6, %%ymm13, %%ymm13\n\t" "vmulpd %%ymm6, %%ymm14, %%ymm14\n\t" "vmulpd %%ymm6, %%ymm15, %%ymm15\n\t" "leaq (,%%r8,8), %%r8\n\t" "leaq (,%%r9,8), %%r9\n\t" "leaq (,%%r9,2), %%r10\n\t" "leaq (%%r10,%%r9), %%r11\n\t" "leaq (%%rcx,%%r10,2), %%rdx\n\t" "#\n\t" "# Update C(0,:)\n\t" "#\n\t" "vmovlpd (%%rcx), %%xmm0, %%xmm0\n\t" "vmovhpd (%%rcx,%%r9), %%xmm0, %%xmm0\n\t" "vmovlpd (%%rcx,%%r10), %%xmm1, %%xmm1\n\t" "vmovhpd (%%rcx,%%r11), %%xmm1, %%xmm1\n\t" "vmovlpd (%%rdx), %%xmm2, %%xmm2\n\t" "vmovhpd (%%rdx,%%r9), %%xmm2, %%xmm2\n\t" "vmovlpd (%%rdx,%%r10), %%xmm3, %%xmm3\n\t" "vmovhpd (%%rdx,%%r11), %%xmm3, %%xmm3\n\t" "vmulpd %%xmm7, %%xmm0, %%xmm0\n\t" "vmulpd %%xmm7, %%xmm1, %%xmm1\n\t" "vmulpd %%xmm7, %%xmm2, %%xmm2\n\t" "vmulpd %%xmm7, %%xmm3, %%xmm3\n\t" "vextractf128 $1, %%ymm8, %%xmm4\n\t" "vextractf128 $1, %%ymm12, %%xmm5\n\t" "vaddpd %%xmm0, %%xmm8, %%xmm0\n\t" "vaddpd %%xmm1, %%xmm4, %%xmm1\n\t" "vaddpd %%xmm2, %%xmm12, %%xmm2\n\t" "vaddpd %%xmm3, %%xmm5, %%xmm3\n\t" "vmovlpd %%xmm0, (%%rcx)\n\t" "vmovhpd %%xmm0, (%%rcx,%%r9)\n\t" "vmovlpd %%xmm1, (%%rcx,%%r10)\n\t" "vmovhpd %%xmm1, (%%rcx,%%r11)\n\t" "vmovlpd %%xmm2, (%%rdx)\n\t" "vmovhpd %%xmm2, (%%rdx,%%r9)\n\t" "vmovlpd %%xmm3, (%%rdx,%%r10)\n\t" "vmovhpd %%xmm3, (%%rdx,%%r11)\n\t" "#\n\t" "# Update C(1,:)\n\t" "#\n\t" "addq %%r8, %%rcx\n\t" "addq %%r8, %%rdx\n\t" "vmovlpd (%%rcx), %%xmm0, %%xmm0\n\t" "vmovhpd (%%rcx,%%r9), %%xmm0, %%xmm0\n\t" "vmovlpd (%%rcx,%%r10), %%xmm1, %%xmm1\n\t" "vmovhpd (%%rcx,%%r11), %%xmm1, %%xmm1\n\t" "vmovlpd (%%rdx), %%xmm2, %%xmm2\n\t" "vmovhpd (%%rdx,%%r9), %%xmm2, %%xmm2\n\t" "vmovlpd (%%rdx,%%r10), %%xmm3, %%xmm3\n\t" "vmovhpd (%%rdx,%%r11), %%xmm3, %%xmm3\n\t" "vmulpd %%xmm7, %%xmm0, %%xmm0\n\t" "vmulpd %%xmm7, %%xmm1, %%xmm1\n\t" "vmulpd %%xmm7, %%xmm2, %%xmm2\n\t" "vmulpd %%xmm7, %%xmm3, %%xmm3\n\t" "vextractf128 $1, %%ymm9, %%xmm4\n\t" "vextractf128 $1, %%ymm13, %%xmm5\n\t" "vaddpd %%xmm0, %%xmm9, %%xmm0\n\t" "vaddpd %%xmm1, %%xmm4, %%xmm1\n\t" "vaddpd %%xmm2, %%xmm13, %%xmm2\n\t" "vaddpd %%xmm3, %%xmm5, %%xmm3\n\t" "vmovlpd %%xmm0, (%%rcx)\n\t" "vmovhpd %%xmm0, (%%rcx,%%r9)\n\t" "vmovlpd %%xmm1, (%%rcx,%%r10)\n\t" "vmovhpd %%xmm1, (%%rcx,%%r11)\n\t" "vmovlpd %%xmm2, (%%rdx)\n\t" "vmovhpd %%xmm2, (%%rdx,%%r9)\n\t" "vmovlpd %%xmm3, (%%rdx,%%r10)\n\t" "vmovhpd %%xmm3, (%%rdx,%%r11)\n\t" "#\n\t" "# Update C(2,:)\n\t" "#\n\t" "addq %%r8, %%rcx\n\t" "addq %%r8, %%rdx\n\t" "vmovlpd (%%rcx), %%xmm0, %%xmm0\n\t" "vmovhpd (%%rcx,%%r9), %%xmm0, %%xmm0\n\t" "vmovlpd (%%rcx,%%r10), %%xmm1, %%xmm1\n\t" "vmovhpd (%%rcx,%%r11), %%xmm1, %%xmm1\n\t" "vmovlpd (%%rdx), %%xmm2, %%xmm2\n\t" "vmovhpd (%%rdx,%%r9), %%xmm2, %%xmm2\n\t" "vmovlpd (%%rdx,%%r10), %%xmm3, %%xmm3\n\t" "vmovhpd (%%rdx,%%r11), %%xmm3, %%xmm3\n\t" "vmulpd %%xmm7, %%xmm0, %%xmm0\n\t" "vmulpd %%xmm7, %%xmm1, %%xmm1\n\t" "vmulpd %%xmm7, %%xmm2, %%xmm2\n\t" "vmulpd %%xmm7, %%xmm3, %%xmm3\n\t" "vextractf128 $1, %%ymm10, %%xmm4\n\t" "vextractf128 $1, %%ymm14, %%xmm5\n\t" "vaddpd %%xmm0, %%xmm10, %%xmm0\n\t" "vaddpd %%xmm1, %%xmm4, %%xmm1\n\t" "vaddpd %%xmm2, %%xmm14, %%xmm2\n\t" "vaddpd %%xmm3, %%xmm5, %%xmm3\n\t" "vmovlpd %%xmm0, (%%rcx)\n\t" "vmovhpd %%xmm0, (%%rcx,%%r9)\n\t" "vmovlpd %%xmm1, (%%rcx,%%r10)\n\t" "vmovhpd %%xmm1, (%%rcx,%%r11)\n\t" "vmovlpd %%xmm2, (%%rdx)\n\t" "vmovhpd %%xmm2, (%%rdx,%%r9)\n\t" "vmovlpd %%xmm3, (%%rdx,%%r10)\n\t" "vmovhpd %%xmm3, (%%rdx,%%r11)\n\t" "#\n\t" "# Update C(3,:)\n\t" "#\n\t" "addq %%r8, %%rcx\n\t" "addq %%r8, %%rdx\n\t" "vmovlpd (%%rcx), %%xmm0, %%xmm0\n\t" "vmovhpd (%%rcx,%%r9), %%xmm0, %%xmm0\n\t" "vmovlpd (%%rcx,%%r10), %%xmm1, %%xmm1\n\t" "vmovhpd (%%rcx,%%r11), %%xmm1, %%xmm1\n\t" "vmovlpd (%%rdx), %%xmm2, %%xmm2\n\t" "vmovhpd (%%rdx,%%r9), %%xmm2, %%xmm2\n\t" "vmovlpd (%%rdx,%%r10), %%xmm3, %%xmm3\n\t" "vmovhpd (%%rdx,%%r11), %%xmm3, %%xmm3\n\t" "vmulpd %%xmm7, %%xmm0, %%xmm0\n\t" "vmulpd %%xmm7, %%xmm1, %%xmm1\n\t" "vmulpd %%xmm7, %%xmm2, %%xmm2\n\t" "vmulpd %%xmm7, %%xmm3, %%xmm3\n\t" "vextractf128 $1, %%ymm11, %%xmm4\n\t" "vextractf128 $1, %%ymm15, %%xmm5\n\t" "vaddpd %%xmm0, %%xmm11, %%xmm0\n\t" "vaddpd %%xmm1, %%xmm4, %%xmm1\n\t" "vaddpd %%xmm2, %%xmm15, %%xmm2\n\t" "vaddpd %%xmm3, %%xmm5, %%xmm3\n\t" "vmovlpd %%xmm0, (%%rcx)\n\t" "vmovhpd %%xmm0, (%%rcx,%%r9)\n\t" "vmovlpd %%xmm1, (%%rcx,%%r10)\n\t" "vmovhpd %%xmm1, (%%rcx,%%r11)\n\t" "vmovlpd %%xmm2, (%%rdx)\n\t" "vmovhpd %%xmm2, (%%rdx,%%r9)\n\t" "vmovlpd %%xmm3, (%%rdx,%%r10)\n\t" "vmovhpd %%xmm3, (%%rdx,%%r11)\n\t" : // output : // input "m" (kc), // 0 "m" (A), // 1 "m" (B), // 2 "m" (pAlpha), // 3 "m" (pBeta), // 4 "m" (C), // 5 "m" (incRowC), // 6 "m" (incColC) // 7 : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", "memory" ); } ------------------------------------------------------------------------------- Aufgabe ======= Den Benchmark mit -DSSE und -DAVX übersetzen und ausführen: Viel Spass! :navigate: up -> doc:index back -> doc:session12/page08