Content |
Using Optimized Micro-Kernels
Changes:
-
The benchmark program matprod.cc was not modified.
-
In gemm.hpp memory for buffers now gets aligned.
-
If you compile with -DHAVE_AVX the file avx.hpp gets included. This contains inline-assembly code for AVX.
-
If you compile with -DHAVE_FMA the file fma.hpp gets included. This contains inline-assembly code for FMA.
-
Default block sizes can be overridden at compile time with:
-
-DBS_D_MC=512
-
-DBS_D_KC=512
-
...
-
Tar-Ball for this Session
The tar-ball session2.tgz contains the files:
$shell> tar tfvz session2.tgz -rw-rw-r-- lehn/num 5158 2016-02-02 18:35 session2/matprod.cc -rw-rw-r-- lehn/num 17181 2016-02-02 18:17 session2/avx.hpp -rw-rw-r-- lehn/num 33544 2016-02-02 19:53 session2/fma.hpp -rw-rw-r-- lehn/num 8714 2016-02-02 19:51 session2/gemm.hpp $shell>
Compile and Run Benchmark
For a quick benchmark we again limit \(m\) to \(500\) with -DM_MAX=500. This time we also enable the optimized micro kernel for AVX with -DHAVE_AVX:
$shell> g++ -Ofast -mavx -Wall -std=c++11 -DNDEBUG -DHAVE_AVX -DM_MAX=500 -I ../boost_1_60_0 matprod.cc $shell> ./a.out # m n k uBLAS: t1 MFLOPS Blocked: t2 MFLOPS Residual 100 100 100 0.000827 2418.38 0.000323 6191.95 0 200 200 200 0.006825 2344.32 0.001831 8738.39 0 300 300 300 0.01007 5362.46 0.002697 20022.2 1.92848e-16 400 400 400 0.023609 5421.66 0.005953 21501.8 8.39383e-17 500 500 500 0.045823 5455.78 0.011097 22528.6 3.52756e-17 $shell>
Main Program with Benchmark and Test
// Benchmark driver: compares boost::ublas::axpy_prod against the blocked
// GEMM implementation in gemm.hpp and prints, per problem size, timings,
// MFLOPS for both and an estimate of the numerical residual.
// (Per the surrounding text this file is intentionally unchanged.)

#include <cassert>
#include <chrono>
#include <cmath>
#include <random>

// NOTE(review): <boost/timer.hpp> looks unused (timing is done via <chrono>);
// std::numeric_limits is used below without a direct #include <limits> --
// presumably it arrives transitively through the boost headers; confirm.
#include <boost/timer.hpp>
#include <boost/numeric/ublas/matrix.hpp>
#include <boost/numeric/ublas/io.hpp>
#include <boost/numeric/ublas/operation.hpp>

#include "gemm.hpp"

// Minimal stopwatch on std::chrono::high_resolution_clock:
// tic() starts, toc() returns elapsed time in seconds as T.
template <typename T>
struct WallTime
{
    void
    tic()
    {
        t0 = std::chrono::high_resolution_clock::now();
    }

    T
    toc()
    {
        using namespace std::chrono;

        elapsed = high_resolution_clock::now() - t0;
        return duration<T,seconds::period>(elapsed).count();
    }

    std::chrono::high_resolution_clock::time_point t0;
    std::chrono::high_resolution_clock::duration   elapsed;
};

// Fill matrix A with uniformly distributed random values from [-100,100).
template <typename MATRIX>
void
fill(MATRIX &A)
{
    typedef typename MATRIX::size_type  size_type;
    typedef typename MATRIX::value_type T;

    std::random_device                  random;
    std::default_random_engine          mt(random());
    std::uniform_real_distribution<T>   uniform(-100,100);

    for (size_type i=0; i<A.size1(); ++i) {
        for (size_type j=0; j<A.size2(); ++j) {
            A(i,j) = uniform(mt);
        }
    }
}

// Sum of absolute values of all elements of A; used as a norm-like
// scaling factor in the residual estimate below.
template <typename MATRIX>
typename MATRIX::value_type
asum(const MATRIX &A)
{
    typedef typename MATRIX::size_type  size_type;
    typedef typename MATRIX::value_type T;

    T asum = 0;  // local deliberately shadows the function name

    for (size_type i=0; i<A.size1(); ++i) {
        for (size_type j=0; j<A.size2(); ++j) {
            asum += std::abs(A(i,j));
        }
    }
    return asum;
}

// Scaled difference between the two computed products C0 and C1.
// Values of order 1 or smaller indicate agreement to machine precision.
template <typename MA, typename MB, typename MC0, typename MC1>
double
estimateGemmResidual(const MA &A, const MB &B,
                     const MC0 &C0, const MC1 &C1)
{
    typedef typename MC0::size_type    size_type;

    size_type m= C1.size1();
    size_type n= C1.size2();
    size_type k= A.size2();

    double aNorm = asum(A);
    double bNorm = asum(B);
    double cNorm = asum(C1);
    double diff  = asum(C1-C0);
    // Using eps for double gives upper bound in case elements have lower
    // precision.
    double eps = std::numeric_limits<double>::epsilon();
    double res = diff/(aNorm*bNorm*cNorm*eps*std::max(std::max(m,n),k));
    return res;
}

namespace foo {

// Adapter with the same calling convention as ublas::axpy_prod: extracts
// raw pointers and strides from the ublas matrices and forwards to the
// blocked gemm().  update==true overwrites C (beta=0), update==false
// accumulates into C (beta=1), mirroring the ublas convention.
template <typename MATRIXA, typename MATRIXB, typename MATRIXC>
void
axpy_prod(const MATRIXA &A, const MATRIXB &B, MATRIXC &C,
          bool update)
{
    typedef typename MATRIXA::value_type TA;
    typedef typename MATRIXC::value_type TC;

    assert(A.size2()==B.size1());

    const std::ptrdiff_t m = C.size1();
    const std::ptrdiff_t n = C.size2();
    const std::ptrdiff_t k = A.size2();

    // Row/column strides derived from address differences of adjacent
    // elements; works for row-major and column-major storage alike.
    const std::ptrdiff_t incRowA = &A(1,0) - &A(0,0);
    const std::ptrdiff_t incColA = &A(0,1) - &A(0,0);
    const std::ptrdiff_t incRowB = &B(1,0) - &B(0,0);
    const std::ptrdiff_t incColB = &B(0,1) - &B(0,0);
    const std::ptrdiff_t incRowC = &C(1,0) - &C(0,0);
    const std::ptrdiff_t incColC = &C(0,1) - &C(0,0);

    gemm(m, n, k, TA(1),
         &A(0,0), incRowA, incColA,
         &B(0,0), incRowB, incColB,
         TC(update ? 0 : 1),
         &C(0,0), incRowC, incColC);
}

} // namespace foo

// Problem-size limits; overridable on the compiler command line
// (e.g. -DM_MAX=500 for a quick run).
#ifndef M_MAX
#define M_MAX 4000
#endif

#ifndef K_MAX
#define K_MAX 4000
#endif

#ifndef N_MAX
#define N_MAX 4000
#endif

int
main()
{
    namespace ublas = boost::numeric::ublas;

    const std::size_t m_min = 100;
    const std::size_t k_min = 100;
    const std::size_t n_min = 100;

    const std::size_t m_max = M_MAX;
    const std::size_t k_max = K_MAX;
    const std::size_t n_max = N_MAX;

    const std::size_t m_inc = 100;
    const std::size_t k_inc = 100;
    const std::size_t n_inc = 100;

    const bool matprodUpdate = true;

    typedef double            T;
    typedef ublas::row_major  SO;

    // Table header
    std::cout << "# m";
    std::cout << " n";
    std::cout << " k";
    std::cout << " uBLAS: t1";
    std::cout << " MFLOPS";
    std::cout << " Blocked: t2";
    std::cout << " MFLOPS";
    std::cout << " Residual";
    std::cout << std::endl;

    WallTime<double> walltime;

    // m, n and k advance together, so only square-ish problems are timed
    // until the first of the three limits is reached.
    for (std::size_t m=m_min, k=k_min, n=n_min;
         m<=m_max && k<=k_max && n<=n_max;
         m += m_inc, k += k_inc, n += n_inc)
    {
        ublas::matrix<T,SO>   A(m, k);
        ublas::matrix<T,SO>   B(k, n);
        ublas::matrix<T,SO>   C1(m, n);
        ublas::matrix<T,SO>   C2(m, n);

        fill(A);
        fill(B);
        fill(C1);
        C2 = C1;  // both products start from the identical C

        // Reference product: uBLAS
        walltime.tic();
        ublas::axpy_prod(A, B, C1, matprodUpdate);
        double t1 = walltime.toc();

        // Blocked product from gemm.hpp
        walltime.tic();
        foo::axpy_prod(A, B, C2, matprodUpdate);
        double t2 = walltime.toc();
        double res = estimateGemmResidual(A, B, C1, C2);

        // MFLOPS = 2*m*n*k / t / 10^6 (one multiply + one add per term)
        std::cout.width(5);  std::cout << m << " ";
        std::cout.width(5);  std::cout << n << " ";
        std::cout.width(5);  std::cout << k << " ";
        std::cout.width(12); std::cout << t1 << " ";
        std::cout.width(12); std::cout << 2.*m/1000.*n/1000.*k/t1 << " ";
        std::cout.width(15); std::cout << t2 << " ";
        std::cout.width(12); std::cout << 2.*m/1000.*n/1000.*k/t2 << " ";
        std::cout.width(15); std::cout << res;
        std::cout << std::endl;
    }
}
Core Functions for the Matrix-Matrix Product
// Code extracted from ulmBLAS: https://github.com/michael-lehn/ulmBLAS-core

#ifndef GEMM_HPP
#define GEMM_HPP

#include <algorithm>
#include <cstdint>      // std::uintptr_t (was relied on transitively before)
#include <cstdlib>
#include <type_traits>  // std::common_type (was relied on transitively before)

//-- new with alignment --------------------------------------------------------

// Allocate `size` bytes aligned to `alignment` (assumed to be a power of
// two).  The raw pointer returned by std::malloc is stored immediately in
// front of the aligned address so free_() can recover it.
// Returns nullptr if the underlying allocation fails.
void *
malloc_(std::size_t alignment, std::size_t size)
{
    alignment = std::max(alignment, alignof(void *));
    size     += alignment;

    void *ptr  = std::malloc(size);
    if (!ptr) {
        return nullptr;            // don't do pointer arithmetic on null
    }
    void *ptr2 = (void *)(((std::uintptr_t)ptr + alignment) & ~(alignment-1));
    void **vp  = (void**) ptr2 - 1;
    *vp        = ptr;              // remember the original pointer
    return ptr2;
}

// Release memory obtained from malloc_().  Safe to call with nullptr.
void
free_(void *ptr)
{
    if (ptr) {
        std::free(*((void**)ptr-1));
    }
}

//-- Config --------------------------------------------------------------------

// Block sizes for double precision.  The FMA kernel needs NR==12, the
// AVX kernel NR==8; each macro can be overridden on the command line
// (e.g. -DBS_D_MC=512).
#ifdef HAVE_FMA

#   ifndef BS_D_MR
#   define BS_D_MR 4
#   endif

#   ifndef BS_D_NR
#   define BS_D_NR 12
#   endif

#   ifndef BS_D_MC
#   define BS_D_MC 256
#   endif

#   ifndef BS_D_KC
#   define BS_D_KC 512
#   endif

#   ifndef BS_D_NC
#   define BS_D_NC 4092
#   endif

#   ifndef BS_D_ALIGN
#   define BS_D_ALIGN 32
#   endif

#endif

#ifndef BS_D_MR
#define BS_D_MR 4
#endif

#ifndef BS_D_NR
#define BS_D_NR 8
#endif

#ifndef BS_D_MC
#define BS_D_MC 256
#endif

#ifndef BS_D_KC
#define BS_D_KC 256
#endif

#ifndef BS_D_NC
#define BS_D_NC 4096
#endif

#ifndef BS_D_ALIGN
#define BS_D_ALIGN 32
#endif

// Generic (fallback) block sizes for any element type T.
template <typename T>
struct BlockSize
{
    static const int MC = 64;
    static const int KC = 64;
    static const int NC = 256;
    static const int MR = 8;
    static const int NR = 8;

    static const int align = alignof(T);

    static_assert(MC>0 && KC>0 && NC>0 && MR>0 && NR>0, "Invalid block size.");
    static_assert(MC % MR == 0, "MC must be a multiple of MR.");
    static_assert(NC % NR == 0, "NC must be a multiple of NR.");
};

// Specialization for double: picks up the BS_D_* configuration macros.
template <>
struct BlockSize<double>
{
    static const int MC = BS_D_MC;
    static const int KC = BS_D_KC;
    static const int NC = BS_D_NC;
    static const int MR = BS_D_MR;
    static const int NR = BS_D_NR;

    static const int align = BS_D_ALIGN;

    static_assert(MC>0 && KC>0 && NC>0 && MR>0 && NR>0, "Invalid block size.");
    static_assert(MC % MR == 0, "MC must be a multiple of MR.");
    static_assert(NC % NR == 0, "NC must be a multiple of NR.");
};

//-- aux routines --------------------------------------------------------------

// Y += alpha*X for general m x n matrices with arbitrary strides.
template <typename Index, typename Alpha, typename TX, typename TY>
void
geaxpy(Index m, Index n,
       const Alpha &alpha,
       const TX *X, Index incRowX, Index incColX,
       TY       *Y, Index incRowY, Index incColY)
{
    for (Index j=0; j<n; ++j) {
        for (Index i=0; i<m; ++i) {
            Y[i*incRowY+j*incColY] += alpha*X[i*incRowX+j*incColX];
        }
    }
}

// X *= alpha.  alpha==0 is handled separately so that NaN/Inf entries in X
// are overwritten by exact zeros instead of propagating.
template <typename Index, typename Alpha, typename TX>
void
gescal(Index m, Index n,
       const Alpha &alpha,
       TX *X, Index incRowX, Index incColX)
{
    if (alpha!=Alpha(0)) {
        for (Index j=0; j<n; ++j) {
            for (Index i=0; i<m; ++i) {
                X[i*incRowX+j*incColX] *= alpha;
            }
        }
    } else {
        for (Index j=0; j<n; ++j) {
            for (Index i=0; i<m; ++i) {
                X[i*incRowX+j*incColX] = Alpha(0);
            }
        }
    }
}

//-- Micro Kernel --------------------------------------------------------------

// Reference micro kernel: C <- beta*C + alpha*A*B for an MR x NR block,
// where A (MR x kc) and B (kc x NR) are packed contiguously.
// Overridden by the AVX/FMA specializations when enabled.
template <typename Index, typename T>
void
ugemm(Index kc, T alpha,
      const T *A, const T *B,
      T beta,
      T *C, Index incRowC, Index incColC)
{
    const Index MR = BlockSize<T>::MR;
    const Index NR = BlockSize<T>::NR;
    T P[BlockSize<T>::MR*BlockSize<T>::NR];  // accumulator held locally

    for (Index l=0; l<MR*NR; ++l) {
        P[l] = 0;
    }
    for (Index l=0; l<kc; ++l) {
        for (Index j=0; j<NR; ++j) {
            for (Index i=0; i<MR; ++i) {
                P[i+j*MR] += A[i+l*MR]*B[l*NR+j];
            }
        }
    }
    for (Index j=0; j<NR; ++j) {
        for (Index i=0; i<MR; ++i) {
            C[i*incRowC+j*incColC] *= beta;
            C[i*incRowC+j*incColC] += alpha*P[i+j*MR];
        }
    }
}

#ifdef HAVE_AVX
#include "avx.hpp"
#endif

#ifdef HAVE_FMA
#include "fma.hpp"
#endif

//-- Macro Kernel --------------------------------------------------------------

// C <- beta*C + alpha*A*B for an mc x nc block, using packed buffers A and B.
// Full MR x NR tiles go straight to ugemm(); fringe tiles are computed into
// a local buffer C_ first and then merged with the proper bounds.
template <typename Index, typename T, typename Beta, typename TC>
void
mgemm(Index mc, Index nc, Index kc,
      T alpha,
      const T *A, const T *B,
      Beta beta,
      TC *C, Index incRowC, Index incColC)
{
    const Index MR = BlockSize<T>::MR;
    const Index NR = BlockSize<T>::NR;
    const Index mp  = (mc+MR-1) / MR;
    const Index np  = (nc+NR-1) / NR;
    const Index mr_ = mc % MR;
    const Index nr_ = nc % NR;

    T C_[MR*NR];   // scratch block for fringe tiles

    for (Index j=0; j<np; ++j) {
        const Index nr = (j!=np-1 || nr_==0) ? NR : nr_;

        for (Index i=0; i<mp; ++i) {
            const Index mr = (i!=mp-1 || mr_==0) ? MR : mr_;

            if (mr==MR && nr==NR) {
                ugemm(kc, alpha,
                      &A[i*kc*MR], &B[j*kc*NR],
                      beta,
                      &C[i*MR*incRowC+j*NR*incColC],
                      incRowC, incColC);
            } else {
                // Fringe tile: compute full MR x NR into C_, then apply only
                // the valid mr x nr part to C.
                ugemm(kc, alpha,
                      &A[i*kc*MR], &B[j*kc*NR],
                      T(0),
                      C_, Index(1), MR);
                gescal(mr, nr, beta,
                       &C[i*MR*incRowC+j*NR*incColC],
                       incRowC, incColC);
                geaxpy(mr, nr, T(1), C_, Index(1), MR,
                       &C[i*MR*incRowC+j*NR*incColC],
                       incRowC, incColC);
            }
        }
    }
}

//-- Packing blocks ------------------------------------------------------------

// Pack an mc x kc block of A into p, MR rows at a time, zero-padding the
// last row panel if mc is not a multiple of MR.
template <typename Index, typename TA, typename T>
void
pack_A(Index mc, Index kc,
       const TA *A, Index incRowA, Index incColA,
       T *p)
{
    Index MR = BlockSize<T>::MR;
    Index mp = (mc+MR-1) / MR;

    for (Index j=0; j<kc; ++j) {
        for (Index l=0; l<mp; ++l) {
            for (Index i0=0; i0<MR; ++i0) {
                Index i  = l*MR + i0;
                Index nu = l*MR*kc + j*MR + i0;
                p[nu]    = (i<mc) ? A[i*incRowA+j*incColA]
                                  : T(0);
            }
        }
    }
}

// Pack a kc x nc block of B into p, NR columns at a time, zero-padding the
// last column panel if nc is not a multiple of NR.
template <typename Index, typename TB, typename T>
void
pack_B(Index kc, Index nc,
       const TB *B, Index incRowB, Index incColB,
       T *p)
{
    Index NR = BlockSize<T>::NR;
    Index np = (nc+NR-1) / NR;

    for (Index l=0; l<np; ++l) {
        for (Index j0=0; j0<NR; ++j0) {
            for (Index i=0; i<kc; ++i) {
                Index j  = l*NR+j0;
                Index nu = l*NR*kc + i*NR + j0;
                p[nu]    = (j<nc) ? B[i*incRowB+j*incColB]
                                  : T(0);
            }
        }
    }
}

//-- Frame routine -------------------------------------------------------------

// C <- beta*C + alpha*A*B for general strided matrices:
// A is m x k, B is k x n, C is m x n.  Blocks are packed into aligned
// buffers; the element type T of the buffers is the common type of
// Alpha, TA and TB.
template <typename Index, typename Alpha,
          typename TA, typename TB,
          typename Beta,
          typename TC>
void
gemm(Index m, Index n, Index k,
     Alpha alpha,
     const TA *A, Index incRowA, Index incColA,
     const TB *B, Index incRowB, Index incColB,
     Beta beta,
     TC *C, Index incRowC, Index incColC)
{
    typedef typename std::common_type<Alpha, TA, TB>::type  T;

    const Index MC = BlockSize<T>::MC;
    const Index NC = BlockSize<T>::NC;
    const Index MR = BlockSize<T>::MR;
    const Index NR = BlockSize<T>::NR;

    const Index KC = BlockSize<T>::KC;
    const Index mb = (m+MC-1) / MC;
    const Index nb = (n+NC-1) / NC;
    const Index kb = (k+KC-1) / KC;
    const Index mc_ = m % MC;
    const Index nc_ = n % NC;
    const Index kc_ = k % KC;

    // Quick return: no accumulation needed, C only has to be scaled.
    // Checked BEFORE allocating the pack buffers -- the original version
    // allocated first and leaked both buffers on this path.
    if (alpha==Alpha(0) || k==0) {
        gescal(m, n, beta, C, incRowC, incColC);
        return;
    }

    T *A_ = (T*) malloc_(BlockSize<T>::align, sizeof(T)*(MC*KC+MR));
    T *B_ = (T*) malloc_(BlockSize<T>::align, sizeof(T)*(KC*NC+NR));

    for (Index j=0; j<nb; ++j) {
        Index nc = (j!=nb-1 || nc_==0) ? NC : nc_;

        for (Index l=0; l<kb; ++l) {
            Index kc    = (l!=kb-1 || kc_==0) ? KC   : kc_;
            // beta is applied only on the first k-panel; later panels
            // accumulate (beta_ == 1).
            Beta  beta_ = (l==0) ? beta : Beta(1);

            pack_B(kc, nc,
                   &B[l*KC*incRowB+j*NC*incColB],
                   incRowB, incColB,
                   B_);

            for (Index i=0; i<mb; ++i) {
                Index mc = (i!=mb-1 || mc_==0) ? MC : mc_;

                pack_A(mc, kc,
                       &A[i*MC*incRowA+l*KC*incColA],
                       incRowA, incColA,
                       A_);

                mgemm(mc, nc, kc,
                      T(alpha), A_, B_, beta_,
                      &C[i*MC*incRowC+j*NC*incColC],
                      incRowC, incColC);
            }
        }
    }
    free_(A_);
    free_(B_);
}

#endif
Micro-Kernel for AVX
#ifndef AVX_HPP #define AVX_HPP #include "gemm.hpp" #include <type_traits> template <typename Index> typename std::enable_if<std::is_convertible<Index, std::int64_t>::value && BlockSize<double>::MR==4 && BlockSize<double>::NR==8 && BlockSize<double>::align==32, void>::type ugemm(Index kc_, double alpha, const double *A, const double *B, double beta, double *C, Index incRowC_, Index incColC_) { int64_t kc = kc_; int64_t incRowC = incRowC_; int64_t incColC = incColC_; double *pAlpha = α double *pBeta = β // // Compute AB = A*B // __asm__ volatile ( "movq %0, %%rdi \n\t" // kc "movq %1, %%rsi \n\t" // A "movq %2, %%rdx \n\t" // B "movq %5, %%rcx \n\t" // C "movq %6, %%r8 \n\t" // incRowC "movq %7, %%r9 \n\t" // incColC "vmovapd 0 * 32(%%rdx), %%ymm4\n\t" "vbroadcastsd 0 * 8(%%rsi), %%ymm0\n\t" "vbroadcastsd 1 * 8(%%rsi), %%ymm1\n\t" "vbroadcastsd 2 * 8(%%rsi), %%ymm2\n\t" "vbroadcastsd 3 * 8(%%rsi), %%ymm3\n\t" "vxorpd %%ymm8, %%ymm8, %%ymm8\n\t" "vxorpd %%ymm9, %%ymm9, %%ymm9\n\t" "vxorpd %%ymm10, %%ymm10, %%ymm10\n\t" "vxorpd %%ymm11, %%ymm11, %%ymm11\n\t" "vxorpd %%ymm12, %%ymm12, %%ymm12\n\t" "vxorpd %%ymm13, %%ymm13, %%ymm13\n\t" "vxorpd %%ymm14, %%ymm14, %%ymm14\n\t" "vxorpd %%ymm15, %%ymm15, %%ymm15\n\t" "jmp check%=\n\t" "loop%=:\n\t" "vmovapd 1 * 32(%%rdx), %%ymm5\n\t" "vmulpd %%ymm0, %%ymm4, %%ymm6\n\t" "vaddpd %%ymm6, %%ymm8, %%ymm8\n\t" "vmulpd %%ymm1, %%ymm4, %%ymm7\n\t" "vaddpd %%ymm7, %%ymm9, %%ymm9\n\t" "vmulpd %%ymm2, %%ymm4, %%ymm6\n\t" "vaddpd %%ymm6, %%ymm10, %%ymm10\n\t" "vmulpd %%ymm3, %%ymm4, %%ymm7\n\t" "vaddpd %%ymm7, %%ymm11, %%ymm11\n\t" "vmovapd 2 * 32(%%rdx), %%ymm4\n\t" "vmulpd %%ymm0, %%ymm5, %%ymm6\n\t" "vaddpd %%ymm6, %%ymm12, %%ymm12\n\t" "vbroadcastsd 4 * 8(%%rsi), %%ymm0\n\t" "vmulpd %%ymm1, %%ymm5, %%ymm7\n\t" "vaddpd %%ymm7, %%ymm13, %%ymm13\n\t" "vbroadcastsd 5 * 8(%%rsi), %%ymm1\n\t" "vmulpd %%ymm2, %%ymm5, %%ymm6\n\t" "vaddpd %%ymm6, %%ymm14, %%ymm14\n\t" "vbroadcastsd 6 * 8(%%rsi), %%ymm2\n\t" "vmulpd %%ymm3, %%ymm5, 
%%ymm7\n\t" "vaddpd %%ymm7, %%ymm15, %%ymm15\n\t" "vbroadcastsd 7 * 8(%%rsi), %%ymm3\n\t" "addq $32, %%rsi\n\t" "addq $2*32, %%rdx\n\t" "decq %%rdi\n\t" "check%=:\n\t" "testq %%rdi, %%rdi\n\t" "jg loop%=\n\t" "movq %3, %%rdi \n\t" // alpha "movq %4, %%rsi \n\t" // beta "vbroadcastsd (%%rdi), %%ymm6\n\t" "vbroadcastsd (%%rsi), %%ymm7\n\t" "vmulpd %%ymm6, %%ymm8, %%ymm8\n\t" "vmulpd %%ymm6, %%ymm9, %%ymm9\n\t" "vmulpd %%ymm6, %%ymm10, %%ymm10\n\t" "vmulpd %%ymm6, %%ymm11, %%ymm11\n\t" "vmulpd %%ymm6, %%ymm12, %%ymm12\n\t" "vmulpd %%ymm6, %%ymm13, %%ymm13\n\t" "vmulpd %%ymm6, %%ymm14, %%ymm14\n\t" "vmulpd %%ymm6, %%ymm15, %%ymm15\n\t" "leaq (,%%r8,8), %%r8\n\t" "leaq (,%%r9,8), %%r9\n\t" "leaq (,%%r9,2), %%r10\n\t" "leaq (%%r10,%%r9), %%r11\n\t" "leaq (%%rcx,%%r10,2), %%rdx\n\t" // check if beta == 0 "vxorpd %%ymm0, %%ymm0, %%ymm0 \n\t" "vucomisd %%xmm0, %%xmm7 \n\t" "je beta_zero%= \n\t" // case: beta != 0 "#\n\t" "# Update C(0,:)\n\t" "#\n\t" "vmovlpd (%%rcx), %%xmm0, %%xmm0\n\t" "vmovhpd (%%rcx,%%r9), %%xmm0, %%xmm0\n\t" "vmovlpd (%%rcx,%%r10), %%xmm1, %%xmm1\n\t" "vmovhpd (%%rcx,%%r11), %%xmm1, %%xmm1\n\t" "vmovlpd (%%rdx), %%xmm2, %%xmm2\n\t" "vmovhpd (%%rdx,%%r9), %%xmm2, %%xmm2\n\t" "vmovlpd (%%rdx,%%r10), %%xmm3, %%xmm3\n\t" "vmovhpd (%%rdx,%%r11), %%xmm3, %%xmm3\n\t" "vmulpd %%xmm7, %%xmm0, %%xmm0\n\t" "vmulpd %%xmm7, %%xmm1, %%xmm1\n\t" "vmulpd %%xmm7, %%xmm2, %%xmm2\n\t" "vmulpd %%xmm7, %%xmm3, %%xmm3\n\t" "vextractf128 $1, %%ymm8, %%xmm4\n\t" "vextractf128 $1, %%ymm12, %%xmm5\n\t" "vaddpd %%xmm0, %%xmm8, %%xmm0\n\t" "vaddpd %%xmm1, %%xmm4, %%xmm1\n\t" "vaddpd %%xmm2, %%xmm12, %%xmm2\n\t" "vaddpd %%xmm3, %%xmm5, %%xmm3\n\t" "vmovlpd %%xmm0, (%%rcx)\n\t" "vmovhpd %%xmm0, (%%rcx,%%r9)\n\t" "vmovlpd %%xmm1, (%%rcx,%%r10)\n\t" "vmovhpd %%xmm1, (%%rcx,%%r11)\n\t" "vmovlpd %%xmm2, (%%rdx)\n\t" "vmovhpd %%xmm2, (%%rdx,%%r9)\n\t" "vmovlpd %%xmm3, (%%rdx,%%r10)\n\t" "vmovhpd %%xmm3, (%%rdx,%%r11)\n\t" "#\n\t" "# Update C(1,:)\n\t" "#\n\t" "addq %%r8, %%rcx\n\t" 
"addq %%r8, %%rdx\n\t" "vmovlpd (%%rcx), %%xmm0, %%xmm0\n\t" "vmovhpd (%%rcx,%%r9), %%xmm0, %%xmm0\n\t" "vmovlpd (%%rcx,%%r10), %%xmm1, %%xmm1\n\t" "vmovhpd (%%rcx,%%r11), %%xmm1, %%xmm1\n\t" "vmovlpd (%%rdx), %%xmm2, %%xmm2\n\t" "vmovhpd (%%rdx,%%r9), %%xmm2, %%xmm2\n\t" "vmovlpd (%%rdx,%%r10), %%xmm3, %%xmm3\n\t" "vmovhpd (%%rdx,%%r11), %%xmm3, %%xmm3\n\t" "vmulpd %%xmm7, %%xmm0, %%xmm0\n\t" "vmulpd %%xmm7, %%xmm1, %%xmm1\n\t" "vmulpd %%xmm7, %%xmm2, %%xmm2\n\t" "vmulpd %%xmm7, %%xmm3, %%xmm3\n\t" "vextractf128 $1, %%ymm9, %%xmm4\n\t" "vextractf128 $1, %%ymm13, %%xmm5\n\t" "vaddpd %%xmm0, %%xmm9, %%xmm0\n\t" "vaddpd %%xmm1, %%xmm4, %%xmm1\n\t" "vaddpd %%xmm2, %%xmm13, %%xmm2\n\t" "vaddpd %%xmm3, %%xmm5, %%xmm3\n\t" "vmovlpd %%xmm0, (%%rcx)\n\t" "vmovhpd %%xmm0, (%%rcx,%%r9)\n\t" "vmovlpd %%xmm1, (%%rcx,%%r10)\n\t" "vmovhpd %%xmm1, (%%rcx,%%r11)\n\t" "vmovlpd %%xmm2, (%%rdx)\n\t" "vmovhpd %%xmm2, (%%rdx,%%r9)\n\t" "vmovlpd %%xmm3, (%%rdx,%%r10)\n\t" "vmovhpd %%xmm3, (%%rdx,%%r11)\n\t" "#\n\t" "# Update C(2,:)\n\t" "#\n\t" "addq %%r8, %%rcx\n\t" "addq %%r8, %%rdx\n\t" "vmovlpd (%%rcx), %%xmm0, %%xmm0\n\t" "vmovhpd (%%rcx,%%r9), %%xmm0, %%xmm0\n\t" "vmovlpd (%%rcx,%%r10), %%xmm1, %%xmm1\n\t" "vmovhpd (%%rcx,%%r11), %%xmm1, %%xmm1\n\t" "vmovlpd (%%rdx), %%xmm2, %%xmm2\n\t" "vmovhpd (%%rdx,%%r9), %%xmm2, %%xmm2\n\t" "vmovlpd (%%rdx,%%r10), %%xmm3, %%xmm3\n\t" "vmovhpd (%%rdx,%%r11), %%xmm3, %%xmm3\n\t" "vmulpd %%xmm7, %%xmm0, %%xmm0\n\t" "vmulpd %%xmm7, %%xmm1, %%xmm1\n\t" "vmulpd %%xmm7, %%xmm2, %%xmm2\n\t" "vmulpd %%xmm7, %%xmm3, %%xmm3\n\t" "vextractf128 $1, %%ymm10, %%xmm4\n\t" "vextractf128 $1, %%ymm14, %%xmm5\n\t" "vaddpd %%xmm0, %%xmm10, %%xmm0\n\t" "vaddpd %%xmm1, %%xmm4, %%xmm1\n\t" "vaddpd %%xmm2, %%xmm14, %%xmm2\n\t" "vaddpd %%xmm3, %%xmm5, %%xmm3\n\t" "vmovlpd %%xmm0, (%%rcx)\n\t" "vmovhpd %%xmm0, (%%rcx,%%r9)\n\t" "vmovlpd %%xmm1, (%%rcx,%%r10)\n\t" "vmovhpd %%xmm1, (%%rcx,%%r11)\n\t" "vmovlpd %%xmm2, (%%rdx)\n\t" "vmovhpd %%xmm2, (%%rdx,%%r9)\n\t" 
"vmovlpd %%xmm3, (%%rdx,%%r10)\n\t" "vmovhpd %%xmm3, (%%rdx,%%r11)\n\t" "#\n\t" "# Update C(3,:)\n\t" "#\n\t" "addq %%r8, %%rcx\n\t" "addq %%r8, %%rdx\n\t" "vmovlpd (%%rcx), %%xmm0, %%xmm0\n\t" "vmovhpd (%%rcx,%%r9), %%xmm0, %%xmm0\n\t" "vmovlpd (%%rcx,%%r10), %%xmm1, %%xmm1\n\t" "vmovhpd (%%rcx,%%r11), %%xmm1, %%xmm1\n\t" "vmovlpd (%%rdx), %%xmm2, %%xmm2\n\t" "vmovhpd (%%rdx,%%r9), %%xmm2, %%xmm2\n\t" "vmovlpd (%%rdx,%%r10), %%xmm3, %%xmm3\n\t" "vmovhpd (%%rdx,%%r11), %%xmm3, %%xmm3\n\t" "vmulpd %%xmm7, %%xmm0, %%xmm0\n\t" "vmulpd %%xmm7, %%xmm1, %%xmm1\n\t" "vmulpd %%xmm7, %%xmm2, %%xmm2\n\t" "vmulpd %%xmm7, %%xmm3, %%xmm3\n\t" "vextractf128 $1, %%ymm11, %%xmm4\n\t" "vextractf128 $1, %%ymm15, %%xmm5\n\t" "vaddpd %%xmm0, %%xmm11, %%xmm0\n\t" "vaddpd %%xmm1, %%xmm4, %%xmm1\n\t" "vaddpd %%xmm2, %%xmm15, %%xmm2\n\t" "vaddpd %%xmm3, %%xmm5, %%xmm3\n\t" "vmovlpd %%xmm0, (%%rcx)\n\t" "vmovhpd %%xmm0, (%%rcx,%%r9)\n\t" "vmovlpd %%xmm1, (%%rcx,%%r10)\n\t" "vmovhpd %%xmm1, (%%rcx,%%r11)\n\t" "vmovlpd %%xmm2, (%%rdx)\n\t" "vmovhpd %%xmm2, (%%rdx,%%r9)\n\t" "vmovlpd %%xmm3, (%%rdx,%%r10)\n\t" "vmovhpd %%xmm3, (%%rdx,%%r11)\n\t" "jmp done%= \n\t" // case: beta == 0 "beta_zero%=: \n\t" "#\n\t" "# Update C(0,:)\n\t" "#\n\t" "vextractf128 $1, %%ymm8, %%xmm4\n\t" "vextractf128 $1, %%ymm12, %%xmm5\n\t" "vmovlpd %%xmm8, (%%rcx)\n\t" "vmovhpd %%xmm8, (%%rcx,%%r9)\n\t" "vmovlpd %%xmm4, (%%rcx,%%r10)\n\t" "vmovhpd %%xmm4, (%%rcx,%%r11)\n\t" "vmovlpd %%xmm12, (%%rdx)\n\t" "vmovhpd %%xmm12, (%%rdx,%%r9)\n\t" "vmovlpd %%xmm5, (%%rdx,%%r10)\n\t" "vmovhpd %%xmm5, (%%rdx,%%r11)\n\t" "#\n\t" "# Update C(1,:)\n\t" "#\n\t" "addq %%r8, %%rcx\n\t" "addq %%r8, %%rdx\n\t" "vextractf128 $1, %%ymm9, %%xmm4\n\t" "vextractf128 $1, %%ymm13, %%xmm5\n\t" "vmovlpd %%xmm9, (%%rcx)\n\t" "vmovhpd %%xmm9, (%%rcx,%%r9)\n\t" "vmovlpd %%xmm4, (%%rcx,%%r10)\n\t" "vmovhpd %%xmm4, (%%rcx,%%r11)\n\t" "vmovlpd %%xmm13, (%%rdx)\n\t" "vmovhpd %%xmm13, (%%rdx,%%r9)\n\t" "vmovlpd %%xmm5, (%%rdx,%%r10)\n\t" "vmovhpd 
%%xmm5, (%%rdx,%%r11)\n\t" "#\n\t" "# Update C(2,:)\n\t" "#\n\t" "addq %%r8, %%rcx\n\t" "addq %%r8, %%rdx\n\t" "vextractf128 $1, %%ymm10, %%xmm4\n\t" "vextractf128 $1, %%ymm14, %%xmm5\n\t" "vmovlpd %%xmm10, (%%rcx)\n\t" "vmovhpd %%xmm10, (%%rcx,%%r9)\n\t" "vmovlpd %%xmm4, (%%rcx,%%r10)\n\t" "vmovhpd %%xmm4, (%%rcx,%%r11)\n\t" "vmovlpd %%xmm14, (%%rdx)\n\t" "vmovhpd %%xmm14, (%%rdx,%%r9)\n\t" "vmovlpd %%xmm5, (%%rdx,%%r10)\n\t" "vmovhpd %%xmm5, (%%rdx,%%r11)\n\t" "#\n\t" "# Update C(3,:)\n\t" "#\n\t" "addq %%r8, %%rcx\n\t" "addq %%r8, %%rdx\n\t" "vextractf128 $1, %%ymm11, %%xmm4\n\t" "vextractf128 $1, %%ymm15, %%xmm5\n\t" "vmovlpd %%xmm11, (%%rcx)\n\t" "vmovhpd %%xmm11, (%%rcx,%%r9)\n\t" "vmovlpd %%xmm4, (%%rcx,%%r10)\n\t" "vmovhpd %%xmm4, (%%rcx,%%r11)\n\t" "vmovlpd %%xmm15, (%%rdx)\n\t" "vmovhpd %%xmm15, (%%rdx,%%r9)\n\t" "vmovlpd %%xmm5, (%%rdx,%%r10)\n\t" "vmovhpd %%xmm5, (%%rdx,%%r11)\n\t" "done%=: \n\t" : // output : // input "m" (kc), // 0 "m" (A), // 1 "m" (B), // 2 "m" (pAlpha), // 3 "m" (pBeta), // 4 "m" (C), // 5 "m" (incRowC), // 6 "m" (incColC) // 7 : // register clobber list "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", "memory" ); } #endif
Micro-Kernel for FMA
#ifndef FMA_HPP #define FMA_HPP #include "gemm.hpp" #include <type_traits> template <typename Index> typename std::enable_if<std::is_convertible<Index, std::int64_t>::value && BlockSize<double>::MR==4 && BlockSize<double>::NR==12 && BlockSize<double>::align==32, void>::type ugemm(Index kc_, double alpha, const double *A, const double *B, double beta, double *C, Index incRowC_, Index incColC_) { int64_t kc = kc_; int64_t incRowC = incRowC_; int64_t incColC = incColC_; double *pAlpha = α double *pBeta = β // // Compute AB = A*B // __asm__ volatile ( "movq %0, %%rdi \n\t" // kc "movq %1, %%rsi \n\t" // A "movq %2, %%rdx \n\t" // B "movq %5, %%rcx \n\t" // C "movq %6, %%r8 \n\t" // incRowC "movq %7, %%r9 \n\t" // incColC "vmovapd 0*32(%%rdx), %%ymm1 \n\t" "vmovapd 1*32(%%rdx), %%ymm2 \n\t" "vmovapd 2*32(%%rdx), %%ymm3 \n\t" "vxorpd %%ymm4, %%ymm4, %%ymm4 \n\t" "vxorpd %%ymm5, %%ymm5, %%ymm5 \n\t" "vxorpd %%ymm6, %%ymm6, %%ymm6 \n\t" "vxorpd %%ymm7, %%ymm7, %%ymm7 \n\t" "vxorpd %%ymm8, %%ymm8, %%ymm8 \n\t" "vxorpd %%ymm9, %%ymm9, %%ymm9 \n\t" "vxorpd %%ymm10, %%ymm10, %%ymm10 \n\t" "vxorpd %%ymm11, %%ymm11, %%ymm11 \n\t" "vxorpd %%ymm12, %%ymm12, %%ymm12 \n\t" "vxorpd %%ymm13, %%ymm13, %%ymm13 \n\t" "vxorpd %%ymm14, %%ymm14, %%ymm14 \n\t" "vxorpd %%ymm15, %%ymm15, %%ymm15 \n\t" "movq $3*32, %%r13 \n\t" "movq $4* 8, %%r12 \n\t" "jmp check%= \n\t" "loop%=: \n\t" "vbroadcastsd 0* 8(%%rsi), %%ymm0 \n\t" "addq %%r13, %%rdx \n\t" "vfmadd231pd %%ymm0, %%ymm1, %%ymm4 \n\t" "vfmadd231pd %%ymm0, %%ymm2, %%ymm8 \n\t" "vfmadd231pd %%ymm0, %%ymm3, %%ymm12 \n\t" "vbroadcastsd 1* 8(%%rsi), %%ymm0 \n\t" "decq %%rdi \n\t" "vfmadd231pd %%ymm0, %%ymm1, %%ymm5 \n\t" "vfmadd231pd %%ymm0, %%ymm2, %%ymm9 \n\t" "vfmadd231pd %%ymm0, %%ymm3, %%ymm13 \n\t" "vbroadcastsd 2* 8(%%rsi), %%ymm0 \n\t" "addq %%r12, %%rsi \n\t" "vfmadd231pd %%ymm0, %%ymm1, %%ymm6 \n\t" "vfmadd231pd %%ymm0, %%ymm2, %%ymm10 \n\t" "vfmadd231pd %%ymm0, %%ymm3, %%ymm14 \n\t" "vbroadcastsd -1* 8(%%rsi), %%ymm0 \n\t" 
"vfmadd231pd %%ymm0, %%ymm1, %%ymm7 \n\t" "vmovapd 0*32(%%rdx), %%ymm1 \n\t" "vfmadd231pd %%ymm0, %%ymm2, %%ymm11 \n\t" "vmovapd 1*32(%%rdx), %%ymm2 \n\t" "vfmadd231pd %%ymm0, %%ymm3, %%ymm15 \n\t" "vmovapd 2*32(%%rdx), %%ymm3 \n\t" "check%=: \n\t" "testq %%rdi, %%rdi \n\t" "jg loop%= \n\t" "movq %3, %%rdi \n\t" // alpha "vbroadcastsd (%%rdi), %%ymm0 \n\t" "vmulpd %%ymm0, %%ymm4, %%ymm4 \n\t" "vmulpd %%ymm0, %%ymm5, %%ymm5 \n\t" "vmulpd %%ymm0, %%ymm6, %%ymm6 \n\t" "vmulpd %%ymm0, %%ymm7, %%ymm7 \n\t" "vmulpd %%ymm0, %%ymm8, %%ymm8 \n\t" "vmulpd %%ymm0, %%ymm9, %%ymm9 \n\t" "vmulpd %%ymm0, %%ymm10, %%ymm10 \n\t" "vmulpd %%ymm0, %%ymm11, %%ymm11 \n\t" "vmulpd %%ymm0, %%ymm12, %%ymm12 \n\t" "vmulpd %%ymm0, %%ymm13, %%ymm13 \n\t" "vmulpd %%ymm0, %%ymm14, %%ymm14 \n\t" "vmulpd %%ymm0, %%ymm15, %%ymm15 \n\t" "leaq (,%%r8,8), %%r8 \n\t" "leaq (,%%r9,8), %%r9 \n\t" "leaq (,%%r9,2), %%r10 # 2*incColC \n\t" "leaq (%%r10,%%r9), %%r11 # 3*incColC \n\t" "leaq (%%rcx,%%r10,2), %%rdx # C + 4*incColC \n\t" "leaq (%%rdx,%%r10,2), %%rax # C + 8*incColC \n\t" // check if beta == 0 "movq %4, %%rdi \n\t" // beta "vbroadcastsd (%%rdi), %%ymm0 \n\t" "vxorpd %%ymm1, %%ymm1, %%ymm1 \n\t" "vucomisd %%xmm0, %%xmm1 \n\t" "je beta_zero%= \n\t" // case: beta != 0 "# \n\t" "# Update C(0,0:3) \n\t" "# \n\t" "vmovlpd (%%rcx), %%xmm1, %%xmm1 \n\t" "vmovhpd (%%rcx,%%r9), %%xmm1, %%xmm1 \n\t" "vmovlpd (%%rcx,%%r10), %%xmm2, %%xmm2 \n\t" "vmovhpd (%%rcx,%%r11), %%xmm2, %%xmm2 \n\t" "vextractf128 $1, %%ymm4, %%xmm3 \n\t" "vmulpd %%xmm0, %%xmm1, %%xmm1 \n\t" "vaddpd %%xmm1, %%xmm4, %%xmm1 \n\t" "vmulpd %%xmm0, %%xmm2, %%xmm2 \n\t" "vaddpd %%xmm2, %%xmm3, %%xmm2 \n\t" "vmovlpd %%xmm1, (%%rcx) \n\t" "vmovhpd %%xmm1, (%%rcx,%%r9) \n\t" "vmovlpd %%xmm2, (%%rcx,%%r10) \n\t" "vmovhpd %%xmm2, (%%rcx,%%r11) \n\t" "# \n\t" "# Update C(0,4:7) \n\t" "# \n\t" "vmovlpd (%%rdx), %%xmm1, %%xmm1 \n\t" "vmovhpd (%%rdx,%%r9), %%xmm1, %%xmm1 \n\t" "vmovlpd (%%rdx,%%r10), %%xmm2, %%xmm2 \n\t" "vmovhpd (%%rdx,%%r11), 
%%xmm2, %%xmm2 \n\t" "vextractf128 $1, %%ymm8, %%xmm3 \n\t" "vmulpd %%xmm0, %%xmm1, %%xmm1 \n\t" "vaddpd %%xmm1, %%xmm8, %%xmm1 \n\t" "vmulpd %%xmm0, %%xmm2, %%xmm2 \n\t" "vaddpd %%xmm2, %%xmm3, %%xmm2 \n\t" "vmovlpd %%xmm1, (%%rdx) \n\t" "vmovhpd %%xmm1, (%%rdx,%%r9) \n\t" "vmovlpd %%xmm2, (%%rdx,%%r10) \n\t" "vmovhpd %%xmm2, (%%rdx,%%r11) \n\t" "# \n\t" "# Update C(0,8:11) \n\t" "# \n\t" "vmovlpd (%%rax), %%xmm1, %%xmm1 \n\t" "vmovhpd (%%rax,%%r9), %%xmm1, %%xmm1 \n\t" "vmovlpd (%%rax,%%r10), %%xmm2, %%xmm2 \n\t" "vmovhpd (%%rax,%%r11), %%xmm2, %%xmm2 \n\t" "vextractf128 $1, %%ymm12, %%xmm3 \n\t" "vmulpd %%xmm0, %%xmm1, %%xmm1 \n\t" "vaddpd %%xmm1, %%xmm12, %%xmm1 \n\t" "vmulpd %%xmm0, %%xmm2, %%xmm2 \n\t" "vaddpd %%xmm2, %%xmm3, %%xmm2 \n\t" "vmovlpd %%xmm1, (%%rax) \n\t" "vmovhpd %%xmm1, (%%rax,%%r9) \n\t" "vmovlpd %%xmm2, (%%rax,%%r10) \n\t" "vmovhpd %%xmm2, (%%rax,%%r11) \n\t" "# \n\t" "# Update C(1,0:3) \n\t" "# \n\t" "addq %%r8, %%rcx \n\t" "addq %%r8, %%rdx \n\t" "addq %%r8, %%rax \n\t" "vmovlpd (%%rcx), %%xmm1, %%xmm1 \n\t" "vmovhpd (%%rcx,%%r9), %%xmm1, %%xmm1 \n\t" "vmovlpd (%%rcx,%%r10), %%xmm2, %%xmm2 \n\t" "vmovhpd (%%rcx,%%r11), %%xmm2, %%xmm2 \n\t" "vextractf128 $1, %%ymm5, %%xmm3 \n\t" "vmulpd %%xmm0, %%xmm1, %%xmm1 \n\t" "vaddpd %%xmm1, %%xmm5, %%xmm1 \n\t" "vmulpd %%xmm0, %%xmm2, %%xmm2 \n\t" "vaddpd %%xmm2, %%xmm3, %%xmm2 \n\t" "vmovlpd %%xmm1, (%%rcx) \n\t" "vmovhpd %%xmm1, (%%rcx,%%r9) \n\t" "vmovlpd %%xmm2, (%%rcx,%%r10) \n\t" "vmovhpd %%xmm2, (%%rcx,%%r11) \n\t" "# \n\t" "# Update C(1,4:7) \n\t" "# \n\t" "vmovlpd (%%rdx), %%xmm1, %%xmm1 \n\t" "vmovhpd (%%rdx,%%r9), %%xmm1, %%xmm1 \n\t" "vmovlpd (%%rdx,%%r10), %%xmm2, %%xmm2 \n\t" "vmovhpd (%%rdx,%%r11), %%xmm2, %%xmm2 \n\t" "vextractf128 $1, %%ymm9, %%xmm3 \n\t" "vmulpd %%xmm0, %%xmm1, %%xmm1 \n\t" "vaddpd %%xmm1, %%xmm9, %%xmm1 \n\t" "vmulpd %%xmm0, %%xmm2, %%xmm2 \n\t" "vaddpd %%xmm2, %%xmm3, %%xmm2 \n\t" "vmovlpd %%xmm1, (%%rdx) \n\t" "vmovhpd %%xmm1, (%%rdx,%%r9) \n\t" "vmovlpd %%xmm2, 
(%%rdx,%%r10) \n\t" "vmovhpd %%xmm2, (%%rdx,%%r11) \n\t" "# \n\t" "# Update C(1,8:11) \n\t" "# \n\t" "vmovlpd (%%rax), %%xmm1, %%xmm1 \n\t" "vmovhpd (%%rax,%%r9), %%xmm1, %%xmm1 \n\t" "vmovlpd (%%rax,%%r10), %%xmm2, %%xmm2 \n\t" "vmovhpd (%%rax,%%r11), %%xmm2, %%xmm2 \n\t" "vextractf128 $1, %%ymm13, %%xmm3 \n\t" "vmulpd %%xmm0, %%xmm1, %%xmm1 \n\t" "vaddpd %%xmm1, %%xmm13, %%xmm1 \n\t" "vmulpd %%xmm0, %%xmm2, %%xmm2 \n\t" "vaddpd %%xmm2, %%xmm3, %%xmm2 \n\t" "vmovlpd %%xmm1, (%%rax) \n\t" "vmovhpd %%xmm1, (%%rax,%%r9) \n\t" "vmovlpd %%xmm2, (%%rax,%%r10) \n\t" "vmovhpd %%xmm2, (%%rax,%%r11) \n\t" "# \n\t" "# Update C(2,0:3) \n\t" "# \n\t" "addq %%r8, %%rcx \n\t" "addq %%r8, %%rdx \n\t" "addq %%r8, %%rax \n\t" "vmovlpd (%%rcx), %%xmm1, %%xmm1 \n\t" "vmovhpd (%%rcx,%%r9), %%xmm1, %%xmm1 \n\t" "vmovlpd (%%rcx,%%r10), %%xmm2, %%xmm2 \n\t" "vmovhpd (%%rcx,%%r11), %%xmm2, %%xmm2 \n\t" "vextractf128 $1, %%ymm6, %%xmm3 \n\t" "vmulpd %%xmm0, %%xmm1, %%xmm1 \n\t" "vaddpd %%xmm1, %%xmm6, %%xmm1 \n\t" "vmulpd %%xmm0, %%xmm2, %%xmm2 \n\t" "vaddpd %%xmm2, %%xmm3, %%xmm2 \n\t" "vmovlpd %%xmm1, (%%rcx) \n\t" "vmovhpd %%xmm1, (%%rcx,%%r9) \n\t" "vmovlpd %%xmm2, (%%rcx,%%r10) \n\t" "vmovhpd %%xmm2, (%%rcx,%%r11) \n\t" "# \n\t" "# Update C(2,4:7) \n\t" "# \n\t" "vmovlpd (%%rdx), %%xmm1, %%xmm1 \n\t" "vmovhpd (%%rdx,%%r9), %%xmm1, %%xmm1 \n\t" "vmovlpd (%%rdx,%%r10), %%xmm2, %%xmm2 \n\t" "vmovhpd (%%rdx,%%r11), %%xmm2, %%xmm2 \n\t" "vextractf128 $1, %%ymm10, %%xmm3 \n\t" "vmulpd %%xmm0, %%xmm1, %%xmm1 \n\t" "vaddpd %%xmm1, %%xmm10, %%xmm1 \n\t" "vmulpd %%xmm0, %%xmm2, %%xmm2 \n\t" "vaddpd %%xmm2, %%xmm3, %%xmm2 \n\t" "vmovlpd %%xmm1, (%%rdx) \n\t" "vmovhpd %%xmm1, (%%rdx,%%r9) \n\t" "vmovlpd %%xmm2, (%%rdx,%%r10) \n\t" "vmovhpd %%xmm2, (%%rdx,%%r11) \n\t" "# \n\t" "# Update C(2,8:11) \n\t" "# \n\t" "vmovlpd (%%rax), %%xmm1, %%xmm1 \n\t" "vmovhpd (%%rax,%%r9), %%xmm1, %%xmm1 \n\t" "vmovlpd (%%rax,%%r10), %%xmm2, %%xmm2 \n\t" "vmovhpd (%%rax,%%r11), %%xmm2, %%xmm2 \n\t" "vextractf128 
$1, %%ymm14, %%xmm3 \n\t" "vmulpd %%xmm0, %%xmm1, %%xmm1 \n\t" "vaddpd %%xmm1, %%xmm14, %%xmm1 \n\t" "vmulpd %%xmm0, %%xmm2, %%xmm2 \n\t" "vaddpd %%xmm2, %%xmm3, %%xmm2 \n\t" "vmovlpd %%xmm1, (%%rax) \n\t" "vmovhpd %%xmm1, (%%rax,%%r9) \n\t" "vmovlpd %%xmm2, (%%rax,%%r10) \n\t" "vmovhpd %%xmm2, (%%rax,%%r11) \n\t" "# \n\t" "# Update C(3,0:3) \n\t" "# \n\t" "addq %%r8, %%rcx \n\t" "addq %%r8, %%rdx \n\t" "addq %%r8, %%rax \n\t" "vmovlpd (%%rcx), %%xmm1, %%xmm1 \n\t" "vmovhpd (%%rcx,%%r9), %%xmm1, %%xmm1 \n\t" "vmovlpd (%%rcx,%%r10), %%xmm2, %%xmm2 \n\t" "vmovhpd (%%rcx,%%r11), %%xmm2, %%xmm2 \n\t" "vextractf128 $1, %%ymm7, %%xmm3 \n\t" "vmulpd %%xmm0, %%xmm1, %%xmm1 \n\t" "vaddpd %%xmm1, %%xmm7, %%xmm1 \n\t" "vmulpd %%xmm0, %%xmm2, %%xmm2 \n\t" "vaddpd %%xmm2, %%xmm3, %%xmm2 \n\t" "vmovlpd %%xmm1, (%%rcx) \n\t" "vmovhpd %%xmm1, (%%rcx,%%r9) \n\t" "vmovlpd %%xmm2, (%%rcx,%%r10) \n\t" "vmovhpd %%xmm2, (%%rcx,%%r11) \n\t" "# \n\t" "# Update C(3,4:7) \n\t" "# \n\t" "vmovlpd (%%rdx), %%xmm1, %%xmm1 \n\t" "vmovhpd (%%rdx,%%r9), %%xmm1, %%xmm1 \n\t" "vmovlpd (%%rdx,%%r10), %%xmm2, %%xmm2 \n\t" "vmovhpd (%%rdx,%%r11), %%xmm2, %%xmm2 \n\t" "vextractf128 $1, %%ymm11, %%xmm3 \n\t" "vmulpd %%xmm0, %%xmm1, %%xmm1 \n\t" "vaddpd %%xmm1, %%xmm11, %%xmm1 \n\t" "vmulpd %%xmm0, %%xmm2, %%xmm2 \n\t" "vaddpd %%xmm2, %%xmm3, %%xmm2 \n\t" "vmovlpd %%xmm1, (%%rdx) \n\t" "vmovhpd %%xmm1, (%%rdx,%%r9) \n\t" "vmovlpd %%xmm2, (%%rdx,%%r10) \n\t" "vmovhpd %%xmm2, (%%rdx,%%r11) \n\t" "# \n\t" "# Update C(3,8:11) \n\t" "# \n\t" "vmovlpd (%%rax), %%xmm1, %%xmm1 \n\t" "vmovhpd (%%rax,%%r9), %%xmm1, %%xmm1 \n\t" "vmovlpd (%%rax,%%r10), %%xmm2, %%xmm2 \n\t" "vmovhpd (%%rax,%%r11), %%xmm2, %%xmm2 \n\t" "vextractf128 $1, %%ymm15, %%xmm3 \n\t" "vmulpd %%xmm0, %%xmm1, %%xmm1 \n\t" "vaddpd %%xmm1, %%xmm15, %%xmm1 \n\t" "vmulpd %%xmm0, %%xmm2, %%xmm2 \n\t" "vaddpd %%xmm2, %%xmm3, %%xmm2 \n\t" "vmovlpd %%xmm1, (%%rax) \n\t" "vmovhpd %%xmm1, (%%rax,%%r9) \n\t" "vmovlpd %%xmm2, (%%rax,%%r10) \n\t" "vmovhpd 
%%xmm2, (%%rax,%%r11) \n\t" "jmp done%= \n\t" // case: beta == 0 "beta_zero%=: \n\t" "# \n\t" "# Update C(0,0:3) \n\t" "# \n\t" "vextractf128 $1, %%ymm4, %%xmm3 \n\t" "vmovlpd %%xmm4, (%%rcx) \n\t" "vmovhpd %%xmm4, (%%rcx,%%r9) \n\t" "vmovlpd %%xmm3, (%%rcx,%%r10) \n\t" "vmovhpd %%xmm3, (%%rcx,%%r11) \n\t" "# \n\t" "# Update C(0,4:7) \n\t" "# \n\t" "vextractf128 $1, %%ymm8, %%xmm3 \n\t" "vmovlpd %%xmm8, (%%rdx) \n\t" "vmovhpd %%xmm8, (%%rdx,%%r9) \n\t" "vmovlpd %%xmm3, (%%rdx,%%r10) \n\t" "vmovhpd %%xmm3, (%%rdx,%%r11) \n\t" "# \n\t" "# Update C(0,8:11) \n\t" "# \n\t" "vextractf128 $1, %%ymm12, %%xmm3 \n\t" "vmovlpd %%xmm12, (%%rax) \n\t" "vmovhpd %%xmm12, (%%rax,%%r9) \n\t" "vmovlpd %%xmm3, (%%rax,%%r10) \n\t" "vmovhpd %%xmm3, (%%rax,%%r11) \n\t" "# \n\t" "# Update C(1,0:3) \n\t" "# \n\t" "addq %%r8, %%rcx \n\t" "addq %%r8, %%rdx \n\t" "addq %%r8, %%rax \n\t" "vextractf128 $1, %%ymm5, %%xmm3 \n\t" "vmovlpd %%xmm5, (%%rcx) \n\t" "vmovhpd %%xmm5, (%%rcx,%%r9) \n\t" "vmovlpd %%xmm3, (%%rcx,%%r10) \n\t" "vmovhpd %%xmm3, (%%rcx,%%r11) \n\t" "# \n\t" "# Update C(1,4:7) \n\t" "# \n\t" "vextractf128 $1, %%ymm9, %%xmm3 \n\t" "vmovlpd %%xmm9, (%%rdx) \n\t" "vmovhpd %%xmm9, (%%rdx,%%r9) \n\t" "vmovlpd %%xmm3, (%%rdx,%%r10) \n\t" "vmovhpd %%xmm3, (%%rdx,%%r11) \n\t" "# \n\t" "# Update C(1,8:11) \n\t" "# \n\t" "vextractf128 $1, %%ymm13, %%xmm3 \n\t" "vmovlpd %%xmm13, (%%rax) \n\t" "vmovhpd %%xmm13, (%%rax,%%r9) \n\t" "vmovlpd %%xmm3, (%%rax,%%r10) \n\t" "vmovhpd %%xmm3, (%%rax,%%r11) \n\t" "# \n\t" "# Update C(2,0:3) \n\t" "# \n\t" "addq %%r8, %%rcx \n\t" "addq %%r8, %%rdx \n\t" "addq %%r8, %%rax \n\t" "vextractf128 $1, %%ymm6, %%xmm3 \n\t" "vmovlpd %%xmm6, (%%rcx) \n\t" "vmovhpd %%xmm6, (%%rcx,%%r9) \n\t" "vmovlpd %%xmm3, (%%rcx,%%r10) \n\t" "vmovhpd %%xmm3, (%%rcx,%%r11) \n\t" "# \n\t" "# Update C(2,4:7) \n\t" "# \n\t" "vextractf128 $1, %%ymm10, %%xmm3 \n\t" "vmovlpd %%xmm10, (%%rdx) \n\t" "vmovhpd %%xmm10, (%%rdx,%%r9) \n\t" "vmovlpd %%xmm3, (%%rdx,%%r10) \n\t" "vmovhpd 
%%xmm3, (%%rdx,%%r11) \n\t" "# \n\t" "# Update C(2,8:11) \n\t" "# \n\t" "vextractf128 $1, %%ymm14, %%xmm3 \n\t" "vmovlpd %%xmm14, (%%rax) \n\t" "vmovhpd %%xmm14, (%%rax,%%r9) \n\t" "vmovlpd %%xmm3, (%%rax,%%r10) \n\t" "vmovhpd %%xmm3, (%%rax,%%r11) \n\t" "# \n\t" "# Update C(3,0:3) \n\t" "# \n\t" "addq %%r8, %%rcx \n\t" "addq %%r8, %%rdx \n\t" "addq %%r8, %%rax \n\t" "vextractf128 $1, %%ymm7, %%xmm3 \n\t" "vmovlpd %%xmm7, (%%rcx) \n\t" "vmovhpd %%xmm7, (%%rcx,%%r9) \n\t" "vmovlpd %%xmm3, (%%rcx,%%r10) \n\t" "vmovhpd %%xmm3, (%%rcx,%%r11) \n\t" "# \n\t" "# Update C(3,4:7) \n\t" "# \n\t" "vextractf128 $1, %%ymm11, %%xmm3 \n\t" "vmovlpd %%xmm11, (%%rdx) \n\t" "vmovhpd %%xmm11, (%%rdx,%%r9) \n\t" "vmovlpd %%xmm3, (%%rdx,%%r10) \n\t" "vmovhpd %%xmm3, (%%rdx,%%r11) \n\t" "# \n\t" "# Update C(3,8:11) \n\t" "# \n\t" "vextractf128 $1, %%ymm15, %%xmm3 \n\t" "vmovlpd %%xmm15, (%%rax) \n\t" "vmovhpd %%xmm15, (%%rax,%%r9) \n\t" "vmovlpd %%xmm3, (%%rax,%%r10) \n\t" "vmovhpd %%xmm3, (%%rax,%%r11) \n\t" "done%=: \n\t" : // output : // input "m" (kc), // 0 "m" (A), // 1 "m" (B), // 2 "m" (pAlpha), // 3 "m" (pBeta), // 4 "m" (C), // 5 "m" (incRowC), // 6 "m" (incColC) // 7 : // register clobber list "rax", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", "memory" ); } #endif
Benchmark Results
$shell> g++ -Ofast -mavx -Wall -std=c++11 -DHAVE_AVX -DNDEBUG -I ../boost_1_60_0 matprod.cc $shell> ./a.out > report.session2 $shell> cat report.session2 # m n k uBLAS: t1 MFLOPS Blocked: t2 MFLOPS Residual 100 100 100 0.000809 2472.19 0.000323 6191.95 0 200 200 200 0.006877 2326.6 0.00152 10526.3 0 300 300 300 0.010098 5347.59 0.002711 19918.8 1.94339e-16 400 400 400 0.023644 5413.64 0.005979 21408.3 8.40192e-17 500 500 500 0.045822 5455.89 0.011117 22488.1 3.52697e-17 600 600 600 0.080257 5382.71 0.019715 21912.2 1.63972e-17 700 700 700 0.128891 5322.33 0.030326 22620.9 8.44478e-18 800 800 800 0.210201 4871.53 0.047688 21472.9 4.71309e-18 900 900 900 0.325499 4479.28 0.063481 22967.5 2.79898e-18 1000 1000 1000 0.470301 4252.6 0.086339 23164.5 1.76616e-18 1100 1100 1100 0.624556 4262.23 0.119658 22246.7 1.149e-18 1200 1200 1200 0.811154 4260.6 0.15355 22507.3 7.78732e-19 1300 1300 1300 1.036 4241.3 0.194676 22570.8 5.45933e-19 1400 1400 1400 1.30352 4210.13 0.244264 22467.5 3.89593e-19 1500 1500 1500 1.61592 4177.18 0.29786 22661.7 2.8652e-19 1600 1600 1600 1.96915 4160.16 0.381175 21491.4 2.13964e-19 1700 1700 1700 2.36788 4149.71 0.435644 22555.1 1.62834e-19 1800 1800 1800 2.81765 4139.62 0.512775 22746.8 1.26254e-19 1900 1900 1900 3.32306 4128.12 0.602307 22775.8 9.85705e-20 2000 2000 2000 3.88255 4121 0.692417 23107.5 7.82885e-20 2100 2100 2100 4.60511 4022.05 0.792981 23357.4 6.28576e-20 2200 2200 2200 5.47546 3889.35 0.902376 23599.9 5.08775e-20 2300 2300 2300 6.39626 3804.41 1.07557 22624.2 4.17117e-20 2400 2400 2400 7.35844 3757.32 1.20236 22994.8 3.43739e-20 2500 2500 2500 8.37275 3732.35 1.32592 23568.6 2.85856e-20 2600 2600 2600 9.45366 3718.35 1.52877 22993.7 2.39389e-20 2700 2700 2700 10.6354 3701.4 1.67988 23433.9 2.01962e-20 2800 2800 2800 11.8683 3699.25 1.88038 23348.5 1.71236e-20 2900 2900 2900 13.202 3694.74 2.09965 23231.5 1.45996e-20 3000 3000 3000 14.5456 3712.47 2.28774 23604.1 1.25328e-20 3100 3100 3100 16.0865 3703.85 2.50491 23786.1 
1.08093e-20 3200 3200 3200 17.895 3662.25 2.85963 22917.7 9.35243e-21 3300 3300 3300 19.5051 3684.88 3.03928 23648.4 8.14503e-21 3400 3400 3400 21.3004 3690.44 3.39132 23179.2 7.11654e-21 3500 3500 3500 23.2126 3694.12 3.63449 23593.4 6.24089e-21 3600 3600 3600 25.2271 3698.88 4.0155 23238 5.49184e-21 3700 3700 3700 27.4812 3686.37 4.29658 23578.3 4.85238e-21 3800 3800 3800 29.7953 3683.26 4.696 23369.7 4.3001e-21 3900 3900 3900 32.1891 3685.66 5.0543 23472.7 3.82209e-21 4000 4000 4000 34.6342 3695.76 5.34636 23941.5 3.41023e-21 $shell> gnuplot plot.session2.mflops $shell> gnuplot plot.session2.time $shell> gnuplot plot.session2.time_log $shell>