Content |
Matrix-Matrix Product with C++ and ublas
Computing a matrix-matrix product \(C = AB\) for a \(m\times n\) matrix \(C\), \(m \times k\) matrix \(A\) and \(k \times n\) matrix \(B\).
All kept very simple ...
Compile and Run Benchmark
For a quick benchmark we limit \(m\) to \(500\) with -DM_MAX=500:
$shell> g++ -Ofast -mavx -Wall -DNDEBUG -std=c++11 -DM_MAX=500 -I ../boost_1_60_0 matprod.cc $shell> ./a.out # m n k uBLAS: t1 MFLOPS Blocked: t2 MFLOPS Diff nrm1 100 100 100 0.000819 2442 0.000842 2375.3 6.20455e-14 200 200 200 0.006837 2340.21 0.003703 4320.82 2.71434e-15 300 300 300 0.010083 5355.55 0.008251 6544.66 4.28955e-16 400 400 400 0.023768 5385.39 0.018627 6871.75 1.13314e-16 500 500 500 0.045824 5455.66 0.035936 6956.81 4.11985e-17 $shell>
The columns in the output are:
-
Dimension \(m\). The program uses \(m=n=k\) but you can adjust that.
-
Time elapsed using ublas::axpy_prod to compute \(C_1 = AB\).
-
Time elapsed using the matrix-matrix product from gemm.hpp to compute \(C_2 = AB\).
-
The difference \(\|C_1 - C_2\|_1\). Note that this will in general not be exactly zero because of roundoff errors.
Main Program with Benchmark and Test
#include <cassert> #include <chrono> #include <cmath> #include <random> #include <boost/timer.hpp> #include <boost/numeric/ublas/matrix.hpp> #include <boost/numeric/ublas/io.hpp> #include <boost/numeric/ublas/operation.hpp> #include "gemm.hpp" template <typename T> struct WallTime { void tic() { t0 = std::chrono::high_resolution_clock::now(); } T toc() { using namespace std::chrono; elapsed = high_resolution_clock::now() - t0; return duration<T,seconds::period>(elapsed).count(); } std::chrono::high_resolution_clock::time_point t0; std::chrono::high_resolution_clock::duration elapsed; }; template <typename MATRIX> void fill(MATRIX &A) { typedef typename MATRIX::size_type size_type; typedef typename MATRIX::value_type T; std::random_device random; std::default_random_engine mt(random()); std::uniform_real_distribution<T> uniform(-100,100); for (size_type i=0; i<A.size1(); ++i) { for (size_type j=0; j<A.size2(); ++j) { A(i,j) = uniform(mt); } } } template <typename MATRIX> typename MATRIX::value_type asum(const MATRIX &A) { typedef typename MATRIX::size_type size_type; typedef typename MATRIX::value_type T; T asum = 0; for (size_type i=0; i<A.size1(); ++i) { for (size_type j=0; j<A.size2(); ++j) { asum += std::abs(A(i,j)); } } return asum; } template <typename MA, typename MB, typename MC0, typename MC1> double estimateGemmResidual(const MA &A, const MB &B, const MC0 &C0, const MC1 &C1) { typedef typename MC0::size_type size_type; size_type m= C1.size1(); size_type n= C1.size2(); size_type k= A.size2(); double aNorm = asum(A); double bNorm = asum(B); double cNorm = asum(C1); double diff = asum(C1-C0); // Using eps for double gives upper bound in case elements have lower // precision. double eps = std::numeric_limits<double>::epsilon(); double res = diff/(aNorm*bNorm*cNorm*eps*std::max(std::max(m,n),k)); return res; } namespace foo { template <typename MATRIXA, typename MATRIXB, typename MATRIXC> void axpy_prod(const MATRIXA &A, const MATRIXB &B, MATRIXC &C, bool update) { typedef typename MATRIXA::value_type TA; typedef typename MATRIXC::value_type TC; assert(A.size2()==B.size1()); const std::ptrdiff_t m = C.size1(); const std::ptrdiff_t n = C.size2(); const std::ptrdiff_t k = A.size2(); const std::ptrdiff_t incRowA = &A(1,0) - &A(0,0); const std::ptrdiff_t incColA = &A(0,1) - &A(0,0); const std::ptrdiff_t incRowB = &B(1,0) - &B(0,0); const std::ptrdiff_t incColB = &B(0,1) - &B(0,0); const std::ptrdiff_t incRowC = &C(1,0) - &C(0,0); const std::ptrdiff_t incColC = &C(0,1) - &C(0,0); gemm(m, n, k, TA(1), &A(0,0), incRowA, incColA, &B(0,0), incRowB, incColB, TC(update ? 0 : 1), &C(0,0), incRowC, incColC); } } // namespace foo #ifndef M_MAX #define M_MAX 4000 #endif #ifndef K_MAX #define K_MAX 4000 #endif #ifndef N_MAX #define N_MAX 4000 #endif int main() { namespace ublas = boost::numeric::ublas; const std::size_t m_min = 100; const std::size_t k_min = 100; const std::size_t n_min = 100; const std::size_t m_max = M_MAX; const std::size_t k_max = K_MAX; const std::size_t n_max = N_MAX; const std::size_t m_inc = 100; const std::size_t k_inc = 100; const std::size_t n_inc = 100; const bool matprodUpdate = true; typedef double T; typedef ublas::row_major SO; std::cout << "# m"; std::cout << " n"; std::cout << " k"; std::cout << " uBLAS: t1"; std::cout << " MFLOPS"; std::cout << " Blocked: t2"; std::cout << " MFLOPS"; std::cout << " Diff nrm1"; std::cout << std::endl; WallTime<double> walltime; for (std::size_t m=m_min, k=k_min, n=n_min; m<=m_max && k<=k_max && n<=n_max; m += m_inc, k += k_inc, n += n_inc) { ublas::matrix<T,SO> A(m, k); ublas::matrix<T,SO> B(k, n); ublas::matrix<T,SO> C1(m, n); ublas::matrix<T,SO> C2(m, n); fill(A); fill(B); fill(C1); C2 = C1; walltime.tic(); ublas::axpy_prod(A, B, C1, matprodUpdate); double t1 = walltime.toc(); walltime.tic(); foo::axpy_prod(A, B, C2, matprodUpdate); double t2 = walltime.toc(); double res = estimateGemmResidual(A, B, C1, C2); std::cout.width(5); std::cout << m << " "; std::cout.width(5); std::cout << n << " "; std::cout.width(5); std::cout << k << " "; std::cout.width(12); std::cout << t1 << " "; std::cout.width(12); std::cout << 2.*m/1000.*n/1000.*k/t1 << " "; std::cout.width(15); std::cout << t2 << " "; std::cout.width(12); std::cout << 2.*m/1000.*n/1000.*k/t2 << " "; std::cout.width(15); std::cout << res; std::cout << std::endl; } }
Core Function for Matrix-Matrix Produkt
// Code extracted from ulmBLAS: https://github.com/michael-lehn/ulmBLAS-core #ifndef GEMM_HPP #define GEMM_HPP #include <algorithm> //-- Config -------------------------------------------------------------------- template <typename T> struct BlockSize { static const int MC = 64; static const int KC = 64; static const int NC = 256; static const int MR = 8; static const int NR = 8; static_assert(MC>0 && KC>0 && NC>0 && MR>0 && NR>0, "Invalid block size."); static_assert(MC % MR == 0, "MC must be a multiple of MR."); static_assert(NC % NR == 0, "NC must be a multiple of NR."); }; template <> struct BlockSize<double> { static const int MC = 256; static const int KC = 256; static const int NC = 4096; static const int MR = 4; static const int NR = 8; static_assert(MC>0 && KC>0 && NC>0 && MR>0 && NR>0, "Invalid block size."); static_assert(MC % MR == 0, "MC must be a multiple of MR."); static_assert(NC % NR == 0, "NC must be a multiple of NR."); }; //-- aux routines -------------------------------------------------------------- template <typename Index, typename Alpha, typename TX, typename TY> void geaxpy(Index m, Index n, const Alpha &alpha, const TX *X, Index incRowX, Index incColX, TY *Y, Index incRowY, Index incColY) { for (Index j=0; j<n; ++j) { for (Index i=0; i<m; ++i) { Y[i*incRowY+j*incColY] += alpha*X[i*incRowX+j*incColX]; } } } template <typename Index, typename Alpha, typename TX> void gescal(Index m, Index n, const Alpha &alpha, TX *X, Index incRowX, Index incColX) { if (alpha!=Alpha(0)) { for (Index j=0; j<n; ++j) { for (Index i=0; i<m; ++i) { X[i*incRowX+j*incColX] *= alpha; } } } else { for (Index j=0; j<n; ++j) { for (Index i=0; i<m; ++i) { X[i*incRowX+j*incColX] = Alpha(0); } } } } //-- Micro Kernel -------------------------------------------------------------- template <typename Index, typename T> void ugemm(Index kc, T alpha, const T *A, const T *B, T beta, T *C, Index incRowC, Index incColC) { const Index MR = BlockSize<T>::MR; const Index NR = BlockSize<T>::NR; T P[BlockSize<T>::MR*BlockSize<T>::NR]; for (Index l=0; l<MR*NR; ++l) { P[l] = 0; } for (Index l=0; l<kc; ++l) { for (Index j=0; j<NR; ++j) { for (Index i=0; i<MR; ++i) { P[i+j*MR] += A[i+l*MR]*B[l*NR+j]; } } } if (beta!=T(0)) { for (Index j=0; j<NR; ++j) { for (Index i=0; i<MR; ++i) { C[i*incRowC+j*incColC] *= beta; C[i*incRowC+j*incColC] += alpha*P[i+j*MR]; } } } else { for (Index j=0; j<NR; ++j) { for (Index i=0; i<MR; ++i) { C[i*incRowC+j*incColC] = alpha*P[i+j*MR]; } } } } //-- Macro Kernel -------------------------------------------------------------- template <typename Index, typename T, typename Beta, typename TC> void mgemm(Index mc, Index nc, Index kc, T alpha, const T *A, const T *B, Beta beta, TC *C, Index incRowC, Index incColC) { const Index MR = BlockSize<T>::MR; const Index NR = BlockSize<T>::NR; const Index mp = (mc+MR-1) / MR; const Index np = (nc+NR-1) / NR; const Index mr_ = mc % MR; const Index nr_ = nc % NR; T C_[MR*NR]; for (Index j=0; j<np; ++j) { const Index nr = (j!=np-1 || nr_==0) ? NR : nr_; for (Index i=0; i<mp; ++i) { const Index mr = (i!=mp-1 || mr_==0) ? MR : mr_; if (mr==MR && nr==NR) { ugemm(kc, alpha, &A[i*kc*MR], &B[j*kc*NR], beta, &C[i*MR*incRowC+j*NR*incColC], incRowC, incColC); } else { ugemm(kc, alpha, &A[i*kc*MR], &B[j*kc*NR], T(0), C_, Index(1), MR); gescal(mr, nr, beta, &C[i*MR*incRowC+j*NR*incColC], incRowC, incColC); geaxpy(mr, nr, T(1), C_, Index(1), MR, &C[i*MR*incRowC+j*NR*incColC], incRowC, incColC); } } } } //-- Packing blocks ------------------------------------------------------------ template <typename Index, typename TA, typename T> void pack_A(Index mc, Index kc, const TA *A, Index incRowA, Index incColA, T *p) { Index MR = BlockSize<T>::MR; Index mp = (mc+MR-1) / MR; for (Index j=0; j<kc; ++j) { for (Index l=0; l<mp; ++l) { for (Index i0=0; i0<MR; ++i0) { Index i = l*MR + i0; Index nu = l*MR*kc + j*MR + i0; p[nu] = (i<mc) ? A[i*incRowA+j*incColA] : T(0); } } } } template <typename Index, typename TB, typename T> void pack_B(Index kc, Index nc, const TB *B, Index incRowB, Index incColB, T *p) { Index NR = BlockSize<T>::NR; Index np = (nc+NR-1) / NR; for (Index l=0; l<np; ++l) { for (Index j0=0; j0<NR; ++j0) { for (Index i=0; i<kc; ++i) { Index j = l*NR+j0; Index nu = l*NR*kc + i*NR + j0; p[nu] = (j<nc) ? B[i*incRowB+j*incColB] : T(0); } } } } //-- Frame routine ------------------------------------------------------------- template <typename Index, typename Alpha, typename TA, typename TB, typename Beta, typename TC> void gemm(Index m, Index n, Index k, Alpha alpha, const TA *A, Index incRowA, Index incColA, const TB *B, Index incRowB, Index incColB, Beta beta, TC *C, Index incRowC, Index incColC) { typedef typename std::common_type<Alpha, TA, TB>::type T; const Index MC = BlockSize<T>::MC; const Index NC = BlockSize<T>::NC; const Index KC = BlockSize<T>::KC; const Index mb = (m+MC-1) / MC; const Index nb = (n+NC-1) / NC; const Index kb = (k+KC-1) / KC; const Index mc_ = m % MC; const Index nc_ = n % NC; const Index kc_ = k % KC; T *A_ = new T[MC*KC]; T *B_ = new T[KC*NC]; if (alpha==Alpha(0) || k==0) { gescal(m, n, beta, C, incRowC, incColC); return; } for (Index j=0; j<nb; ++j) { Index nc = (j!=nb-1 || nc_==0) ? NC : nc_; for (Index l=0; l<kb; ++l) { Index kc = (l!=kb-1 || kc_==0) ? KC : kc_; Beta beta_ = (l==0) ? beta : Beta(1); pack_B(kc, nc, &B[l*KC*incRowB+j*NC*incColB], incRowB, incColB, B_); for (Index i=0; i<mb; ++i) { Index mc = (i!=mb-1 || mc_==0) ? MC : mc_; pack_A(mc, kc, &A[i*MC*incRowA+l*KC*incColA], incRowA, incColA, A_); mgemm(mc, nc, kc, T(alpha), A_, B_, beta_, &C[i*MC*incRowC+j*NC*incColC], incRowC, incColC); } } } delete [] A_; delete [] B_; } #endif
Benchmark Results
$shell> cat /proc/cpuinfo processor : 0 vendor_id : GenuineIntel cpu family : 6 model : 58 model name : Intel(R) Core(TM) i5-3470 CPU @ 3.20GHz stepping : 9 microcode : 0x1b cpu MHz : 1600.000 cache size : 6144 KB physical id : 0 siblings : 4 core id : 0 cpu cores : 4 apicid : 0 initial apicid : 0 fpu : yes fpu_exception : yes cpuid level : 13 wp : yes flags : fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush dts acpi mmx fxsr sse sse2 ss ht tm pbe syscall nx rdtscp lm constant_tsc arch_perfmon pebs bts rep_good nopl xtopology nonstop_tsc aperfmperf pni pclmulqdq dtes64 monitor ds_cpl vmx smx est tm2 ssse3 cx16 xtpr pdcm pcid sse4_1 sse4_2 x2apic popcnt tsc_deadline_timer aes xsave avx f16c rdrand lahf_lm ida arat epb xsaveopt pln pts dtherm tpr_shadow vnmi flexpriority ept vpid fsgsbase smep erms bogomips : 6385.07 clflush size : 64 cache_alignment : 64 address sizes : 36 bits physical, 48 bits virtual power management: processor : 1 vendor_id : GenuineIntel cpu family : 6 model : 58 model name : Intel(R) Core(TM) i5-3470 CPU @ 3.20GHz stepping : 9 microcode : 0x1b cpu MHz : 1600.000 cache size : 6144 KB physical id : 0 siblings : 4 core id : 1 cpu cores : 4 apicid : 2 initial apicid : 2 fpu : yes fpu_exception : yes cpuid level : 13 wp : yes flags : fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush dts acpi mmx fxsr sse sse2 ss ht tm pbe syscall nx rdtscp lm constant_tsc arch_perfmon pebs bts rep_good nopl xtopology nonstop_tsc aperfmperf pni pclmulqdq dtes64 monitor ds_cpl vmx smx est tm2 ssse3 cx16 xtpr pdcm pcid sse4_1 sse4_2 x2apic popcnt tsc_deadline_timer aes xsave avx f16c rdrand lahf_lm ida arat epb xsaveopt pln pts dtherm tpr_shadow vnmi flexpriority ept vpid fsgsbase smep erms bogomips : 6385.19 clflush size : 64 cache_alignment : 64 address sizes : 36 bits physical, 48 bits virtual power management: processor : 2 vendor_id : GenuineIntel cpu family : 6 model : 58 model name : Intel(R) Core(TM) i5-3470 CPU @ 3.20GHz stepping : 9 microcode : 0x1b cpu MHz : 1600.000 cache size : 6144 KB physical id : 0 siblings : 4 core id : 2 cpu cores : 4 apicid : 4 initial apicid : 4 fpu : yes fpu_exception : yes cpuid level : 13 wp : yes flags : fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush dts acpi mmx fxsr sse sse2 ss ht tm pbe syscall nx rdtscp lm constant_tsc arch_perfmon pebs bts rep_good nopl xtopology nonstop_tsc aperfmperf pni pclmulqdq dtes64 monitor ds_cpl vmx smx est tm2 ssse3 cx16 xtpr pdcm pcid sse4_1 sse4_2 x2apic popcnt tsc_deadline_timer aes xsave avx f16c rdrand lahf_lm ida arat epb xsaveopt pln pts dtherm tpr_shadow vnmi flexpriority ept vpid fsgsbase smep erms bogomips : 6385.19 clflush size : 64 cache_alignment : 64 address sizes : 36 bits physical, 48 bits virtual power management: processor : 3 vendor_id : GenuineIntel cpu family : 6 model : 58 model name : Intel(R) Core(TM) i5-3470 CPU @ 3.20GHz stepping : 9 microcode : 0x1b cpu MHz : 1600.000 cache size : 6144 KB physical id : 0 siblings : 4 core id : 3 cpu cores : 4 apicid : 6 initial apicid : 6 fpu : yes fpu_exception : yes cpuid level : 13 wp : yes flags : fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush dts acpi mmx fxsr sse sse2 ss ht tm pbe syscall nx rdtscp lm constant_tsc arch_perfmon pebs bts rep_good nopl xtopology nonstop_tsc aperfmperf pni pclmulqdq dtes64 monitor ds_cpl vmx smx est tm2 ssse3 cx16 xtpr pdcm pcid sse4_1 sse4_2 x2apic popcnt tsc_deadline_timer aes xsave avx f16c rdrand lahf_lm ida arat epb xsaveopt pln pts dtherm tpr_shadow vnmi flexpriority ept vpid fsgsbase smep erms bogomips : 6385.19 clflush size : 64 cache_alignment : 64 address sizes : 36 bits physical, 48 bits virtual power management: $shell> g++ -Ofast -mavx -Wall -DNDEBUG -std=c++11 -I ../boost_1_60_0 matprod.cc $shell> ./a.out > report.session1 $shell> cat report.session1 # m n k uBLAS: t1 MFLOPS Blocked: t2 MFLOPS Diff nrm1 100 100 100 0.00082 2439.02 0.000832 2403.85 6.15072e-14 200 200 200 0.006832 2341.92 0.00543 2946.59 2.71864e-15 300 300 300 0.010102 5345.48 0.008225 6565.35 4.21274e-16 400 400 400 0.023674 5406.78 0.018666 6857.39 1.14023e-16 500 500 500 0.045833 5454.59 0.035914 6961.07 4.13976e-17 600 600 600 0.081284 5314.7 0.062157 6950.14 1.79329e-17 700 700 700 0.129532 5295.99 0.09774 7018.62 8.90805e-18 800 800 800 0.210805 4857.57 0.148193 6909.91 4.87688e-18 900 900 900 0.32142 4536.12 0.2082 7002.88 2.85418e-18 1000 1000 1000 0.4617 4331.82 0.282644 7076.04 1.76828e-18 1100 1100 1100 0.64577 4122.21 0.378013 7042.09 1.14454e-18 1200 1200 1200 0.839814 4115.2 0.484853 7127.93 7.71812e-19 1300 1300 1300 1.03782 4233.89 0.626335 7015.42 5.37115e-19 1400 1400 1400 1.30557 4203.51 0.772569 7103.57 3.83696e-19 1500 1500 1500 1.61648 4175.73 0.942957 7158.33 2.81209e-19 1600 1600 1600 1.9694 4159.64 1.17412 6977.15 2.09789e-19 1700 1700 1700 2.37052 4145.08 1.37257 7158.84 1.59207e-19 1800 1800 1800 2.82108 4134.58 1.63629 7128.31 1.23088e-19 1900 1900 1900 3.32736 4122.79 1.93345 7095.09 9.63235e-20 2000 2000 2000 3.88764 4115.6 2.22819 7180.71 7.64161e-20 2100 2100 2100 4.59908 4027.33 2.5915 7147.21 6.13089e-20 2200 2200 2200 5.46934 3893.7 2.96199 7189.75 4.96631e-20 2300 2300 2300 6.40947 3796.57 3.44146 7070.84 4.06371e-20 2400 2400 2400 7.37459 3749.09 3.85586 7170.38 3.34798e-20 2500 2500 2500 8.3659 3735.4 4.33895 7202.2 2.78472e-20 2600 2600 2600 9.45125 3719.3 4.90042 7173.27 2.33145e-20 2700 2700 2700 10.6491 3696.64 5.46821 7199.06 1.96765e-20 2800 2800 2800 11.8888 3692.87 6.08888 7210.52 1.66894e-20 2900 2900 2900 13.206 3693.63 6.7976 7175.77 1.42455e-20 3000 3000 3000 14.5646 3707.62 7.47821 7220.98 1.22104e-20 3100 3100 3100 16.0738 3706.78 8.30422 7174.91 1.05387e-20 3200 3200 3200 17.8987 3661.5 9.15534 7158.23 9.12413e-21 3300 3300 3300 19.513 3683.38 9.95183 7222.19 7.94422e-21 3400 3400 3400 21.3111 3688.6 10.9218 7197.32 6.94222e-21 3500 3500 3500 23.2202 3692.9 11.877 7219.81 6.08876e-21 3600 3600 3600 25.2252 3699.16 12.9813 7188.2 5.36298e-21 3700 3700 3700 27.4847 3685.91 14.0403 7215.36 4.73651e-21 3800 3800 3800 29.853 3676.14 15.2719 7186 4.19908e-21 3900 3900 3900 32.2609 3677.45 16.4889 7195.04 3.73515e-21 4000 4000 4000 34.7002 3688.74 17.7425 7214.32 3.32962e-21 $shell> gnuplot plot.session1.mflops $shell> gnuplot plot.session1.time $shell> gnuplot plot.session1.time_log $shell>