Content |
uBLAS-fying the GEMM
With uBLAS it is pretty straight forward to generalize the GEMM Operation:
-
SYMM, HEMM comes for free. Unlike the traditional BLAS there are no requirements about underlying storage scheme. Having a packed syymetric matrix is just fine.
-
Support for operations like \(C = (\alpha_1 A_1 + \alpha_2 A_2) \cdot (\beta B_1 + \beta B_2)\) comes for free. No temporaries get created and you don't loose performance.
However, due to my lack of deep knowledge of uBLAS the implementation is not as ellegant as could be. So things will become even better.
Tar-Ball for this Session
The tar-ball session4.tgz contains the files:
$shell> tar tfvz session4.tgz -rw-rw-r-- lehn/num 5401 2016-01-27 00:35 session4/matprod.cc -rw-rw-r-- lehn/num 6913 2016-01-27 00:53 session4/symatprod.cc -rw-rw-r-- lehn/num 17181 2016-02-02 18:17 session4/avx.hpp -rw-rw-r-- lehn/num 33544 2016-02-02 19:53 session4/fma.hpp -rw-rw-r-- lehn/num 9797 2016-02-02 19:50 session4/gemm.hpp $shell>
Compile and Run Benchmark
For a quick benchmark we again limit \(m\) to \(500\) with -DM_MAX=500. This time we also enable the optimized micro kernel for AVX with -DHAVE_AVX.
There are two demos.
Demo for GEMM
$shell> g++ -Ofast -mavx -Wall -std=c++11 -DNDEBUG -DHAVE_AVX -fopenmp -DM_MAX=500 -I ../boost_1_60_0 matprod.cc $shell> ./a.out # m n k uBLAS: t1 MFLOPS Blocked: t2 MFLOPS Residual 100 100 100 0.000809 2472.19 0.000337 5934.72 0 200 200 200 0.004352 3676.47 0.000413 38740.9 0 300 300 300 0.010032 5382.78 0.001125 48000 1.93133e-16 400 400 400 0.023491 5448.9 0.002363 54168.4 8.44533e-17 500 500 500 0.045753 5464.12 0.00394 63451.8 3.50952e-17 $shell>
Demo for SYMM
$shell> g++ -Ofast -mavx -Wall -std=c++11 -DNDEBUG -DHAVE_AVX -fopenmp -DM_MAX=500 -I ../boost_1_60_0 symatprod.cc g++: error: symatprod.cc: No such file or directory g++: fatal error: no input files compilation terminated. $shell> ./a.out # m n k uBLAS: t1 MFLOPS Blocked: t2 MFLOPS Residual 100 100 100 0.000819 2442 0.000331 6042.3 0 200 200 200 0.006857 2333.38 0.000837 19115.9 0 300 300 300 0.010086 5353.96 0.001121 48171.3 1.93032e-16 400 400 400 0.023649 5412.49 0.002339 54724.2 8.41258e-17 500 500 500 0.045708 5469.5 0.003915 63857 3.5178e-17 $shell>
Core Function for Matrix-Matrix Produkt
// Code extracted from ulmBLAS: https://github.com/michael-lehn/ulmBLAS-core #ifndef GEMM_HPP #define GEMM_HPP #include <algorithm> #include <cstdlib> #if defined(_OPENMP) #include <omp.h> #endif //-- new with alignment -------------------------------------------------------- void * malloc_(std::size_t alignment, std::size_t size) { alignment = std::max(alignment, alignof(void *)); size += alignment; void *ptr = std::malloc(size); void *ptr2 = (void *)(((uintptr_t)ptr + alignment) & ~(alignment-1)); void **vp = (void**) ptr2 - 1; *vp = ptr; return ptr2; } void free_(void *ptr) { std::free(*((void**)ptr-1)); } //-- Config -------------------------------------------------------------------- // SIMD-Register width in bits // SSE: 128 // AVX/FMA: 256 // AVX-512: 512 #ifndef SIMD_REGISTER_WIDTH #define SIMD_REGISTER_WIDTH 256 #endif #ifdef HAVE_FMA # ifndef BS_D_MR # define BS_D_MR 4 # endif # ifndef BS_D_NR # define BS_D_NR 12 # endif # ifndef BS_D_MC # define BS_D_MC 256 # endif # ifndef BS_D_KC # define BS_D_KC 512 # endif # ifndef BS_D_NC # define BS_D_NC 4092 # endif #endif #ifndef BS_D_MR #define BS_D_MR 4 #endif #ifndef BS_D_NR #define BS_D_NR 8 #endif #ifndef BS_D_MC #define BS_D_MC 256 #endif #ifndef BS_D_KC #define BS_D_KC 256 #endif #ifndef BS_D_NC #define BS_D_NC 4096 #endif template <typename T> struct BlockSize { static constexpr int MC = 64; static constexpr int KC = 64; static constexpr int NC = 256; static constexpr int MR = 8; static constexpr int NR = 8; static constexpr int rwidth = 0; static constexpr int align = alignof(T); static constexpr int vlen = 0; static_assert(MC>0 && KC>0 && NC>0 && MR>0 && NR>0, "Invalid block size."); static_assert(MC % MR == 0, "MC must be a multiple of MR."); static_assert(NC % NR == 0, "NC must be a multiple of NR."); }; template <> struct BlockSize<double> { static constexpr int MC = BS_D_MC; static constexpr int KC = BS_D_KC; static constexpr int NC = BS_D_NC; static constexpr int MR = BS_D_MR; static constexpr int NR = BS_D_NR; static constexpr int rwidth = SIMD_REGISTER_WIDTH; static constexpr int align = rwidth / 8; static constexpr int vlen = rwidth / (8*sizeof(double)); static_assert(MC>0 && KC>0 && NC>0 && MR>0 && NR>0, "Invalid block size."); static_assert(MC % MR == 0, "MC must be a multiple of MR."); static_assert(NC % NR == 0, "NC must be a multiple of NR."); static_assert(rwidth % sizeof(double) == 0, "SIMD register width not sane."); }; //-- aux routines -------------------------------------------------------------- template <typename Alpha, typename MX, typename MY> void geaxpy(const Alpha &alpha, const MX &X, MY &Y) { assert(X.size1()==Y.size1()); assert(X.size2()==Y.size2()); typedef typename MX::size_type size_type; for (size_type j=0; j<X.size2(); ++j) { for (size_type i=0; i<X.size1(); ++i) { Y(i,j) += alpha*X(i,j); } } } template <typename Alpha, typename MX> void gescal(const Alpha &alpha, MX &X) { typedef typename MX::size_type size_type; for (size_type j=0; j<X.size2(); ++j) { for (size_type i=0; i<X.size1(); ++i) { X(i,j) *= alpha; } } } template <typename Index, typename Alpha, typename TX, typename TY> void geaxpy(Index m, Index n, const Alpha &alpha, const TX *X, Index incRowX, Index incColX, TY *Y, Index incRowY, Index incColY) { for (Index j=0; j<n; ++j) { for (Index i=0; i<m; ++i) { Y[i*incRowY+j*incColY] += alpha*X[i*incRowX+j*incColX]; } } } template <typename Index, typename Alpha, typename TX> void gescal(Index m, Index n, const Alpha &alpha, TX *X, Index incRowX, Index incColX) { if (alpha!=Alpha(0)) { for (Index j=0; j<n; ++j) { for (Index i=0; i<m; ++i) { X[i*incRowX+j*incColX] *= alpha; } } } else { for (Index j=0; j<n; ++j) { for (Index i=0; i<m; ++i) { X[i*incRowX+j*incColX] = Alpha(0); } } } } //-- Micro Kernel -------------------------------------------------------------- template <typename Index, typename T> typename std::enable_if<BlockSize<T>::vlen == 0, void>::type ugemm(Index kc, T alpha, const T *A, const T *B, T beta, T *C, Index incRowC, Index incColC) { const Index MR = BlockSize<T>::MR; const Index NR = BlockSize<T>::NR; T P[BlockSize<T>::MR*BlockSize<T>::NR]; for (Index l=0; l<MR*NR; ++l) { P[l] = 0; } for (Index l=0; l<kc; ++l) { for (Index j=0; j<NR; ++j) { for (Index i=0; i<MR; ++i) { P[i+j*MR] += A[i+l*MR]*B[l*NR+j]; } } } for (Index j=0; j<NR; ++j) { for (Index i=0; i<MR; ++i) { C[i*incRowC+j*incColC] *= beta; C[i*incRowC+j*incColC] += alpha*P[i+j*MR]; } } } #ifdef HAVE_AVX #include "avx.hpp" #endif #ifdef HAVE_FMA #include "fma.hpp" #endif #ifdef HAVE_GCCVEC #include "gccvec.hpp" #endif //-- Macro Kernel -------------------------------------------------------------- template <typename Index, typename T, typename Beta, typename TC> void mgemm(Index mc, Index nc, Index kc, T alpha, const T *A, const T *B, Beta beta, TC *C, Index incRowC, Index incColC) { const Index MR = BlockSize<T>::MR; const Index NR = BlockSize<T>::NR; const Index mp = (mc+MR-1) / MR; const Index np = (nc+NR-1) / NR; const Index mr_ = mc % MR; const Index nr_ = nc % NR; #if defined(_OPENMP) #pragma omp parallel for #endif for (Index j=0; j<np; ++j) { const Index nr = (j!=np-1 || nr_==0) ? NR : nr_; T C_[BlockSize<T>::MR*BlockSize<T>::NR]; for (Index i=0; i<mp; ++i) { const Index mr = (i!=mp-1 || mr_==0) ? MR : mr_; if (mr==MR && nr==NR) { ugemm(kc, alpha, &A[i*kc*MR], &B[j*kc*NR], beta, &C[i*MR*incRowC+j*NR*incColC], incRowC, incColC); } else { std::fill_n(C_, MR*NR, T(0)); ugemm(kc, alpha, &A[i*kc*MR], &B[j*kc*NR], T(0), C_, Index(1), MR); gescal(mr, nr, beta, &C[i*MR*incRowC+j*NR*incColC], incRowC, incColC); geaxpy(mr, nr, T(1), C_, Index(1), MR, &C[i*MR*incRowC+j*NR*incColC], incRowC, incColC); } } } } //-- Packing blocks ------------------------------------------------------------ template <typename MA, typename T> void pack_A(const MA &A, T *p) { typedef typename MA::size_type size_type; size_type mc = A.size1(); size_type kc = A.size2(); size_type MR = BlockSize<T>::MR; size_type mp = (mc+MR-1) / MR; for (size_type j=0; j<kc; ++j) { for (size_type l=0; l<mp; ++l) { for (size_type i0=0; i0<MR; ++i0) { size_type i = l*MR + i0; size_type nu = l*MR*kc + j*MR + i0; p[nu] = (i<mc) ? A(i,j) : T(0); } } } } template <typename MB, typename T> void pack_B(const MB &B, T *p) { typedef typename MB::size_type size_type; size_type kc = B.size1(); size_type nc = B.size2(); size_type NR = BlockSize<T>::NR; size_type np = (nc+NR-1) / NR; for (size_type l=0; l<np; ++l) { for (size_type j0=0; j0<NR; ++j0) { for (size_type i=0; i<kc; ++i) { size_type j = l*NR+j0; size_type nu = l*NR*kc + i*NR + j0; p[nu] = (j<nc) ? B(i,j) : T(0); } } } } //-- Frame routine ------------------------------------------------------------- template <typename Alpha, typename MatrixA, typename MatrixB, typename Beta, typename MatrixC> void gemm(Alpha alpha, const MatrixA &A, const MatrixB &B, Beta beta, MatrixC &C) { assert(A.size2()==B.size1()); namespace ublas = boost::numeric::ublas; typedef typename MatrixC::size_type size_type; typedef typename MatrixA::value_type TA; typedef typename MatrixB::value_type TB; typedef typename MatrixC::value_type TC; typedef typename std::common_type<Alpha, TA, TB>::type T; const size_type MC = BlockSize<T>::MC; const size_type NC = BlockSize<T>::NC; const size_type MR = BlockSize<T>::MR; const size_type NR = BlockSize<T>::NR; const size_type m = C.size1(); const size_type n = C.size2(); const size_type k = A.size2(); const size_type KC = BlockSize<T>::KC; const size_type mb = (m+MC-1) / MC; const size_type nb = (n+NC-1) / NC; const size_type kb = (k+KC-1) / KC; const size_type mc_ = m % MC; const size_type nc_ = n % NC; const size_type kc_ = k % KC; TC *C_ = &C(0,0); const size_type incRowC = &C(1,0) - &C(0,0); const size_type incColC = &C(0,1) - &C(0,0); T *A_ = (T*) malloc_(BlockSize<T>::align, sizeof(T)*(MC*KC+MR)); T *B_ = (T*) malloc_(BlockSize<T>::align, sizeof(T)*(KC*NC+NR)); if (alpha==Alpha(0) || k==0) { gescal(beta, C); return; } for (size_type j=0; j<nb; ++j) { size_type nc = (j!=nb-1 || nc_==0) ? NC : nc_; for (size_type l=0; l<kb; ++l) { size_type kc = (l!=kb-1 || kc_==0) ? KC : kc_; Beta beta_ = (l==0) ? beta : Beta(1); const auto Bs = subrange(B, l*KC, l*KC+kc, j*NC, j*NC+nc); pack_B(Bs, B_); for (size_type i=0; i<mb; ++i) { size_type mc = (i!=mb-1 || mc_==0) ? MC : mc_; const auto As = subrange(A, i*MC, i*MC+mc, l*KC, l*KC+kc); pack_A(As, A_); mgemm(mc, nc, kc, T(alpha), A_, B_, beta_, &C_[i*MC*incRowC+j*NC*incColC], incRowC, incColC); } } } free_(A_); free_(B_); } #endif
Benchmark Results for GEMM
The benchmark tests \(C = A B\) for general matrices \(A\), \(B\) and \(C\). Please also test for cases where \(A\) or \(B\) are expressions.
Benchmark Program
#include <cassert> #include <chrono> #include <cmath> #include <limits> #include <random> #include <type_traits> #include <boost/timer.hpp> #include <boost/numeric/ublas/matrix.hpp> #include <boost/numeric/ublas/io.hpp> #include <boost/numeric/ublas/operation.hpp> #include "gemm.hpp" template <typename T> struct WallTime { void tic() { t0 = std::chrono::high_resolution_clock::now(); } T toc() { using namespace std::chrono; elapsed = high_resolution_clock::now() - t0; return duration<T,seconds::period>(elapsed).count(); } std::chrono::high_resolution_clock::time_point t0; std::chrono::high_resolution_clock::duration elapsed; }; // I guess this trait already exists or can be done more elegant ... template <typename M> struct MatrixType { static constexpr bool isGeneral = false; static constexpr bool isSymmetric = false; static constexpr bool isHermitian = false; static constexpr bool isTriangular = false; }; template <typename T, typename SO> struct MatrixType<boost::numeric::ublas::matrix<T,SO> > { static constexpr bool isGeneral = true; static constexpr bool isSymmetric = false; static constexpr bool isHermitian = false; static constexpr bool isTriangular = false; }; // fill rectangular matrix with random values template <typename MATRIX> typename std::enable_if<MatrixType<MATRIX>::isGeneral, void>::type fill(MATRIX &A) { typedef typename MATRIX::size_type size_type; typedef typename MATRIX::value_type T; std::random_device random; std::default_random_engine mt(random()); std::uniform_real_distribution<T> uniform(-100,100); for (size_type i=0; i<A.size1(); ++i) { for (size_type j=0; j<A.size2(); ++j) { A(i,j) = uniform(mt); } } } template <typename MATRIX> typename MATRIX::value_type asum(const MATRIX &A) { typedef typename MATRIX::size_type size_type; typedef typename MATRIX::value_type T; T asum = 0; for (size_type i=0; i<A.size1(); ++i) { for (size_type j=0; j<A.size2(); ++j) { asum += std::abs(A(i,j)); } } return asum; } template <typename MA, typename MB, typename MC0, typename MC1> double estimateGemmResidual(const MA &A, const MB &B, const MC0 &C0, const MC1 &C1) { typedef typename MC0::value_type TC0; typedef typename MC0::size_type size_type; size_type m= C1.size1(); size_type n= C1.size2(); size_type k= A.size2(); double aNorm = asum(A); double bNorm = asum(B); double cNorm = asum(C1); double diff = asum(C1-C0); // Using eps for double gives upper bound in case elements have lower // precision. double eps = std::numeric_limits<double>::epsilon(); double res = diff/(aNorm*bNorm*cNorm*eps*std::max(std::max(m,n),k)); return res; } namespace foo { template <typename MATRIXA, typename MATRIXB, typename MATRIXC> void axpy_prod(const MATRIXA &A, const MATRIXB &B, MATRIXC &C, bool update) { typedef typename MATRIXA::value_type TA; typedef typename MATRIXC::value_type TC; assert(A.size2()==B.size1()); gemm(TA(1), A, B, TC(update ? 0 : 1), C); } } // namespace foo #ifndef M_MAX #define M_MAX 4000 #endif #ifndef K_MAX #define K_MAX 4000 #endif #ifndef N_MAX #define N_MAX 4000 #endif int main() { namespace ublas = boost::numeric::ublas; const std::size_t m_min = 100; const std::size_t k_min = 100; const std::size_t n_min = 100; const std::size_t m_max = M_MAX; const std::size_t k_max = K_MAX; const std::size_t n_max = N_MAX; const std::size_t m_inc = 100; const std::size_t k_inc = 100; const std::size_t n_inc = 100; const bool matprodUpdate = true; typedef double T; typedef ublas::row_major SO; std::cout << "# m"; std::cout << " n"; std::cout << " k"; std::cout << " uBLAS: t1"; std::cout << " MFLOPS"; std::cout << " Blocked: t2"; std::cout << " MFLOPS"; std::cout << " Diff nrm1"; std::cout << std::endl; WallTime<double> walltime; for (std::size_t m=m_min, k=k_min, n=n_min; m<=m_max && k<=k_max && n<=n_max; m += m_inc, k += k_inc, n += n_inc) { ublas::matrix<T,SO> A(m, k); ublas::matrix<T,SO> B(k, n); ublas::matrix<T,SO> C1(m, n); ublas::matrix<T,SO> C2(m, n); fill(A); fill(B); fill(C1); C2 = C1; walltime.tic(); ublas::axpy_prod(A, B, C1, matprodUpdate); double t1 = walltime.toc(); walltime.tic(); foo::axpy_prod(A, B, C2, matprodUpdate); double t2 = walltime.toc(); double res = estimateGemmResidual(A, B, C1, C2); std::cout.width(5); std::cout << m << " "; std::cout.width(5); std::cout << n << " "; std::cout.width(5); std::cout << k << " "; std::cout.width(12); std::cout << t1 << " "; std::cout.width(12); std::cout << 2.*m/1000.*n/1000.*k/t1 << " "; std::cout.width(15); std::cout << t2 << " "; std::cout.width(12); std::cout << 2.*m/1000.*n/1000.*k/t2 << " "; std::cout.width(15); std::cout << res; std::cout << std::endl; } }
Resuts
$shell> g++ -Ofast -mavx -Wall -std=c++11 -DHAVE_AVX -fopenmp -DNDEBUG -I ../boost_1_60_0 matprod.cc $shell> ./a.out > report.gemm.session4 $shell> cat report.gemm.session4 # m n k uBLAS: t1 MFLOPS Blocked: t2 MFLOPS Diff nrm1 100 100 100 0.000827 2418.38 0.000306 6535.95 0 200 200 200 0.006169 2593.61 0.000408 39215.7 0 300 300 300 0.010054 5371 0.00114 47368.4 1.95068e-16 400 400 400 0.023607 5422.12 0.002379 53804.1 8.39536e-17 500 500 500 0.045778 5461.14 0.003895 64184.9 3.52985e-17 600 600 600 0.080227 5384.72 0.006512 66339.1 1.62983e-17 700 700 700 0.130662 5250.19 0.009802 69985.7 8.45155e-18 800 800 800 0.207417 4936.91 0.01596 64160.4 4.71673e-18 900 900 900 0.32338 4508.63 0.020666 70550.7 2.80048e-18 1000 1000 1000 0.474364 4216.17 0.027074 73871.6 1.76058e-18 1100 1100 1100 0.634478 4195.57 0.037055 71839.2 1.14713e-18 1200 1200 1200 0.868618 3978.73 0.04757 72650.8 7.78273e-19 1300 1300 1300 1.07822 4075.22 0.058402 75237.1 5.44648e-19 1400 1400 1400 1.39077 3946.01 0.071462 76796.1 3.90201e-19 1500 1500 1500 1.71005 3947.26 0.084887 79517.5 2.86504e-19 1600 1600 1600 2.08305 3932.69 0.124203 65956.5 2.13818e-19 1700 1700 1700 2.49311 3941.27 0.123427 79609.8 1.62944e-19 1800 1800 1800 2.96805 3929.86 0.148842 78365 1.26159e-19 1900 1900 1900 3.47857 3943.58 0.178341 76920.1 9.86272e-20 2000 2000 2000 4.03543 3964.88 0.204527 78229.3 7.82724e-20 2100 2100 2100 4.73412 3912.44 0.230168 80471.7 6.2824e-20 2200 2200 2200 5.59092 3809.04 0.260903 81624.2 5.09102e-20 2300 2300 2300 6.5266 3728.43 0.330762 73569.5 4.17457e-20 2400 2400 2400 7.43927 3716.49 0.345519 80018.8 3.43634e-20 2500 2500 2500 8.40704 3717.12 0.376525 82995.8 2.85796e-20 2600 2600 2600 9.45964 3716 0.429706 81804.8 2.3956e-20 2700 2700 2700 10.6612 3692.46 0.474293 82999.3 2.01636e-20 2800 2800 2800 11.8969 3690.38 0.538666 81505.1 1.71253e-20 2900 2900 2900 13.2132 3691.63 0.590193 82647.5 1.4617e-20 3000 3000 3000 14.5794 3703.85 0.64663 83509.9 1.25305e-20 3100 3100 3100 16.0776 3705.89 0.71265 83606.3 1.08095e-20 3200 3200 3200 17.9667 3647.63 0.815718 80341.5 9.34951e-21 3300 3300 3300 19.5265 3680.85 0.846187 84938.7 8.15149e-21 3400 3400 3400 21.3146 3687.99 0.940302 83598.7 7.11752e-21 3500 3500 3500 23.1942 3697.04 1.01066 84845.4 6.24469e-21 3600 3600 3600 25.2668 3693.06 1.12898 82651.7 5.49412e-21 3700 3700 3700 27.5336 3679.35 1.19162 85015.4 4.85316e-21 3800 3800 3800 29.8748 3673.47 1.32286 82959.6 4.30078e-21 3900 3900 3900 32.2766 3675.67 1.3989 84808.2 3.82272e-21 4000 4000 4000 34.6874 3690.1 1.51476 84501.8 3.41036e-21 $shell> gnuplot plot.session4.gemm.mflops $shell> gnuplot plot.session4.gemm.time $shell> gnuplot plot.session4.gemm.time_log $shell>
MFLOPS
Time
Time with Logarithmic scale
Benchmark SYMM
The benchmark tests \(C = A B\) for a symmetric matrix \(A\) in packed storage format. Please also test for cases where \(A\) or \(B\) are expressions.
Benchmark Program
#include <cassert> #include <chrono> #include <cmath> #include <random> #include <type_traits> #include <boost/timer.hpp> #include <boost/numeric/ublas/matrix.hpp> #include <boost/numeric/ublas/symmetric.hpp> #include <boost/numeric/ublas/io.hpp> #include <boost/numeric/ublas/operation.hpp> #include "gemm.hpp" template <typename T> struct WallTime { void tic() { t0 = std::chrono::high_resolution_clock::now(); } T toc() { using namespace std::chrono; elapsed = high_resolution_clock::now() - t0; return duration<T,seconds::period>(elapsed).count(); } std::chrono::high_resolution_clock::time_point t0; std::chrono::high_resolution_clock::duration elapsed; }; // I guess this trait already exists or can be done more elegant ... template <typename M> struct MatrixType { static constexpr bool isGeneral = false; static constexpr bool isSymmetric = false; static constexpr bool isHermitian = false; static constexpr bool isTriangular = false; }; template <typename T, typename SO> struct MatrixType<boost::numeric::ublas::matrix<T,SO> > { static constexpr bool isGeneral = true; static constexpr bool isSymmetric = false; static constexpr bool isHermitian = false; static constexpr bool isTriangular = false; }; template <typename A, typename TRL> struct MatrixType<boost::numeric::ublas::symmetric_adaptor<A, TRL> > { static constexpr bool isGeneral = true; static constexpr bool isSymmetric = false; static constexpr bool isHermitian = false; static constexpr bool isTriangular = false; }; template <typename TA> struct MatrixType<boost::numeric::ublas::symmetric_matrix<TA> > { static constexpr bool isGeneral = true; static constexpr bool isSymmetric = false; static constexpr bool isHermitian = false; static constexpr bool isTriangular = false; }; // fill rectangular matrix with random values template <typename MATRIX> typename std::enable_if<MatrixType<MATRIX>::isGeneral, void>::type fill(MATRIX &A) { typedef typename MATRIX::size_type size_type; typedef typename MATRIX::value_type T; std::random_device random; std::default_random_engine mt(random()); std::uniform_real_distribution<T> uniform(-100,100); for (size_type i=0; i<A.size1(); ++i) { for (size_type j=0; j<A.size2(); ++j) { A(i,j) = uniform(mt); } } } // fill symmetric matrix with random values template <typename MATRIX> typename std::enable_if<MatrixType<MATRIX>::isSymmetric, void>::type fill(MATRIX &A) { using namespace boost::numeric::ublas; typedef typename MATRIX::size_type size_type; typedef typename MATRIX::value_type T; typedef typename MATRIX::triangular_type triangular_type; std::random_device random; std::default_random_engine mt(random()); std::uniform_real_distribution<T> uniform(-100,100); if (std::is_same<triangular_type, upper>::value) { for (size_type j=0; j<A.size2(); ++j) { for (size_type i=0; i<=j; ++i) { A(i,j) = uniform(mt); } } } else { for (size_type j=0; j<A.size2(); ++j) { for (size_type i=j; i<=A.size1(); ++i) { A(i,j) = uniform(mt); } } } } template <typename MATRIX> typename MATRIX::value_type asum(const MATRIX &A) { typedef typename MATRIX::size_type size_type; typedef typename MATRIX::value_type T; T asum = 0; for (size_type i=0; i<A.size1(); ++i) { for (size_type j=0; j<A.size2(); ++j) { asum += std::abs(A(i,j)); } } return asum; } template <typename MA, typename MB, typename MC0, typename MC1> double estimateGemmResidual(const MA &A, const MB &B, const MC0 &C0, const MC1 &C1) { typedef typename MC0::value_type TC0; typedef typename MC0::size_type size_type; size_type m= C1.size1(); size_type n= C1.size2(); size_type k= A.size2(); double aNorm = asum(A); double bNorm = asum(B); double cNorm = asum(C1); double diff = asum(C1-C0); // Using eps for double gives upper bound in case elements have lower // precision. double eps = std::numeric_limits<double>::epsilon(); double res = diff/(aNorm*bNorm*cNorm*eps*std::max(std::max(m,n),k)); return res; } namespace foo { template <typename MATRIXA, typename MATRIXB, typename MATRIXC> void axpy_prod(const MATRIXA &A, const MATRIXB &B, MATRIXC &C, bool update) { typedef typename MATRIXA::value_type TA; typedef typename MATRIXC::value_type TC; assert(A.size2()==B.size1()); gemm(TA(1), A, B, TC(update ? 0 : 1), C); } } // namespace foo #ifndef M_MAX #define M_MAX 4000 #endif #ifndef K_MAX #define K_MAX 4000 #endif #ifndef N_MAX #define N_MAX 4000 #endif #ifndef UPLO #define UPLO ublas::lower #endif int main() { namespace ublas = boost::numeric::ublas; const std::size_t m_min = 100; const std::size_t n_min = 100; const std::size_t m_max = M_MAX; const std::size_t n_max = N_MAX; const std::size_t m_inc = 100; const std::size_t n_inc = 100; const bool matprodUpdate = true; typedef double T; typedef ublas::column_major SO; std::cout << "# m"; std::cout << " n"; std::cout << " uBLAS: t1"; std::cout << " MFLOPS"; std::cout << " Blocked: t2"; std::cout << " MFLOPS"; std::cout << " Diff nrm1"; std::cout << std::endl; WallTime<double> walltime; for (std::size_t m=m_min, n=n_min; m<=m_max && n<=n_max; m += m_inc, n += n_inc) { ublas::symmetric_matrix<T> A(m); ublas::matrix<T,SO> B(m, n); ublas::matrix<T,SO> C1(m, n); ublas::matrix<T,SO> C2(m, n); fill(A); fill(B); fill(C1); C2 = C1; walltime.tic(); ublas::axpy_prod(A, B, C1, matprodUpdate); double t1 = walltime.toc(); walltime.tic(); foo::axpy_prod(A, B, C2, matprodUpdate); double t2 = walltime.toc(); double res = estimateGemmResidual(A, B, C1, C2); std::cout.width(5); std::cout << m << " "; std::cout.width(5); std::cout << n << " "; std::cout.width(12); std::cout << t1 << " "; std::cout.width(12); std::cout << 2.*m/1000.*n/1000.*m/t1 << " "; std::cout.width(15); std::cout << t2 << " "; std::cout.width(12); std::cout << 2.*m/1000.*n/1000.*m/t2 << " "; std::cout.width(15); std::cout << res; std::cout << std::endl; } }
Resuts
$shell> g++ -Ofast -mavx -Wall -std=c++11 -DHAVE_AVX -fopenmp -DNDEBUG -DM_MAX=1500 -I ../boost_1_60_0 symatprod.cc $shell> ./a.out > report.symm.session4 $shell> cat report.symm.session4 # m n uBLAS: t1 MFLOPS Blocked: t2 MFLOPS Diff nrm1 100 100 0.003826 522.739 0.000368 5434.78 0 200 200 0.019738 810.619 0.000448 35714.3 0 300 300 0.047816 1129.33 0.001153 46834.3 1.9448e-16 400 400 0.117269 1091.51 0.002268 56437.4 8.38772e-17 500 500 0.232305 1076.17 0.004088 61154.6 3.54522e-17 600 600 0.408112 1058.53 0.006774 63773.3 1.63089e-17 700 700 0.654633 1047.92 0.010262 66848.6 8.4569e-18 800 800 1.09888 931.862 0.015027 68144 4.71215e-18 900 900 1.90133 766.831 0.021053 69253.8 2.80651e-18 1000 1000 3.03804 658.32 0.027982 71474.5 1.75932e-18 1100 1100 4.76968 558.109 0.03664 72652.8 1.14902e-18 1200 1200 7.09646 487.003 0.046163 74865.2 7.78259e-19 1300 1300 9.70487 452.763 0.058498 75113.7 5.45503e-19 1400 1400 12.9 425.425 0.07085 77459.4 3.89462e-19 1500 1500 16.4966 409.175 0.085683 78778.8 2.86317e-19 $shell> gnuplot plot.session4.symm.mflops $shell> gnuplot plot.session4.symm.time $shell> gnuplot plot.session4.symm.time_log $shell>