Content |
Comparison with Intel MKL
We are on the right track. But there is still some work to do. If you look at the benchmarks, you will see that the Intel MKL reaches its peak performance much faster. At the moment the unlocked LU factorization is the bottleneck. In order to keep up some BLAS Level 2 functions need improvement.
Compile and Run Benchmark
$shell> g++-5.3 -DM_MAX=800 -Wall -DNDEBUG -mavx -Ofast -std=c++11 -I ../../boost_1_60_0/ -I /opt/intel/compilers_and_libraries/linux/mkl/include -DMKL_ILP64 -DHAVE_AVX -DHAVE_GCCVEC bench_lu.cc -L /opt/intel/compilers_and_libraries/linux/mkl/lib/intel64 -lmkl_intel_ilp64 -lmkl_core -lmkl_sequential -lm -lpthread -Wl,-rpath /opt/intel/compilers_and_libraries/linux/mkl/lib/intel64 -I ../../eigen-eigen-ce5a455b34c0/ $shell> ./a.out 50 50 0 3.405e-11 0.00382 21.46344 0 4.131e-11 0.00007 1174.14544 0 2.313e-11 0.00012 658.43654 0 4.151e-11 0.00006 1354.04123 100 100 0 2.291e-10 0.00023 2903.49853 0 1.926e-10 0.00039 1696.35502 0 1.453e-10 0.00028 2333.53436 0 2.030e-10 0.00028 2373.28340 150 150 0 7.048e-10 0.00054 4133.02097 0 5.808e-10 0.00090 2495.19073 0 4.519e-10 0.00072 3114.32331 0 5.908e-10 0.00035 6383.06062 200 200 0 1.497e-09 0.00044 12196.11102 0 1.382e-09 0.00070 7574.61279 0 9.956e-10 0.00059 8998.16142 0 1.025e-09 0.00074 7155.69610 250 250 0 2.894e-09 0.00073 14265.58426 0 2.914e-09 0.00132 7896.79052 0 2.105e-09 0.00104 9983.52008 0 3.086e-09 0.00145 7173.39841 300 300 0 4.654e-09 0.00119 15100.73464 0 5.368e-09 0.00195 9200.51591 0 3.279e-09 0.00174 10326.97783 0 5.433e-09 0.00229 7831.90392 350 350 0 6.806e-09 0.00171 16692.38816 0 8.498e-09 0.00304 9372.03209 0 4.873e-09 0.00259 11014.51374 0 8.245e-09 0.00363 7856.64146 400 400 0 1.013e-08 0.00240 17731.84744 0 1.392e-08 0.00414 10277.61052 0 7.705e-09 0.00382 11144.06531 0 6.814e-09 0.00521 8176.52906 450 450 0 1.383e-08 0.00328 18467.25337 0 1.907e-08 0.00581 10431.91423 0 1.102e-08 0.00486 12467.76610 0 1.812e-08 0.00739 8209.37797 500 500 0 1.865e-08 0.00430 19345.10663 0 2.580e-08 0.00729 11414.96917 0 1.571e-08 0.00620 13430.56606 0 2.388e-08 0.00977 8512.43287 550 550 0 2.584e-08 0.00567 19521.02455 0 3.563e-08 0.00989 11199.80674 0 2.218e-08 0.00833 13295.14507 0 3.219e-08 0.01328 8337.99920 600 600 0 3.495e-08 0.00723 19888.72437 0 4.606e-08 0.01184 12148.82283 0 2.991e-08 0.01067 13474.35404 0 2.191e-08 0.01676 8582.73821 650 650 0 4.571e-08 0.00909 20114.99784 0 5.888e-08 0.01637 11174.04558 0 4.052e-08 0.01334 13705.89299 0 5.541e-08 0.02171 8423.28059 700 700 0 5.830e-08 0.01096 20832.71391 0 7.548e-08 0.02094 10907.48582 0 5.203e-08 0.01602 14255.15675 0 6.840e-08 0.02687 8499.41149 750 750 0 7.130e-08 0.01357 20709.53373 0 9.011e-08 0.02399 11712.34963 0 6.435e-08 0.02022 13896.63346 0 8.145e-08 0.03371 8335.78515 800 800 0 8.924e-08 0.01633 20884.45922 0 1.125e-07 0.02699 12635.08437 0 8.152e-08 0.02445 13946.51599 0 4.945e-08 0.03969 8591.61312 $shell>
Benchmark (Single Threaded)
$shell> g++-5.3 -DNO_CHECK -Wall -DNDEBUG -mavx -Ofast -std=c++11 -I ../../boost_1_60_0/ -I /opt/intel/compilers_and_libraries/linux/mkl/include -DMKL_ILP64 -DHAVE_AVX -DHAVE_GCCVEC bench_lu.cc -L /opt/intel/compilers_and_libraries/linux/mkl/lib/intel64 -lmkl_intel_ilp64 -lmkl_core -lmkl_sequential -lm -lpthread -Wl,-rpath /opt/intel/compilers_and_libraries/linux/mkl/lib/intel64 -I ../../eigen-eigen-ce5a455b34c0/ $shell> ./a.out > report.session8.lu $shell> gnuplot plot.session8.lu $shell> gnuplot plot.session8.lu.log $shell> gnuplot plot.session8.lu.mflops $shell>
Time (Effectiveness)
MFLOPS (Efficiency)
Source Code
bench_lu.cc
#include <cassert> #include <chrono> #include <cmath> #include <limits> #include <random> #include <type_traits> #include <boost/timer.hpp> #include <boost/numeric/ublas/matrix.hpp> #include <boost/numeric/ublas/io.hpp> #include <boost/numeric/ublas/lu.hpp> #include <boost/numeric/ublas/operation.hpp> #include "gemm.hpp" #include "lu.hpp" #include <mkl_lapack.h> #include <Eigen/Dense> template <typename T> struct WallTime { void tic() { t0 = std::chrono::high_resolution_clock::now(); } T toc() { using namespace std::chrono; elapsed = high_resolution_clock::now() - t0; return duration<T,seconds::period>(elapsed).count(); } std::chrono::high_resolution_clock::time_point t0; std::chrono::high_resolution_clock::duration elapsed; }; // // For checking I don't care for efficiency of elegance ... // template <typename MatrixA1, typename MatrixA2, typename MatrixP> double checkLU(const MatrixA1 &A1, MatrixA2 &A2, const MatrixP &P) { namespace ublas = boost::numeric::ublas; typedef typename MatrixA1::size_type size_type; typedef typename MatrixA1::value_type value_type; size_type m = A1.size1(); size_type n = A1.size2(); ublas::matrix<value_type> L(m,m); // Copy L form A2 and set A2 to U for (size_type j=0; j<m; ++j) { for (size_type i=0; i<m; ++i) { if (i==j) { L(i,i) = 1; } else if (i>j) { L(i,j) = A2(i,j); A2(i,j) = 0; } else if (j>i) { L(i,j) = 0; } } } ublas::matrix<value_type> LU(m,n); ublas::axpy_prod(L, A2, LU, true); // LU <- inv(P)*L*U foo::swap_rows(P, LU, true); double diff = 0; for (size_type j=0; j<n; ++j) { for (size_type i=0; i<m; ++i) { diff += std::abs(A1(i,j)-LU(i,j)); } } return diff; } // fill rectangular matrix with random values template <typename MATRIX> void fill(MATRIX &A) { typedef typename MATRIX::size_type size_type; typedef typename MATRIX::value_type T; std::random_device random; std::default_random_engine mt(random()); std::uniform_real_distribution<T> uniform(-100,100); for (size_type i=0; i<A.size1(); ++i) { for (size_type j=0; j<A.size2(); ++j) { A(i,j) = uniform(mt); } } } #ifndef M_MAX #define M_MAX 10000 #endif #ifndef N_MAX #define N_MAX 10000 #endif int main() { namespace ublas = boost::numeric::ublas; const std::size_t m_min = 50; const std::size_t n_min = 50; const std::size_t m_max = M_MAX; const std::size_t n_max = N_MAX; const std::size_t m_inc = 50; const std::size_t n_inc = 50; typedef double T; typedef ublas::row_major SO; typedef Eigen::Matrix<T,Eigen::Dynamic,Eigen::Dynamic> EigenMat; WallTime<double> walltime; for (std::size_t m=m_min, n=n_min; m<=m_max && n<=n_max; m+=m_inc, n+=n_inc) { ublas::matrix<T,SO> A_org(m, n); ublas::matrix<T,SO> A1(m, n); ublas::matrix<T,SO> A2(m, n); ublas::matrix<T,SO> A3(m, n); ublas::matrix<T,SO> A4(m, n); ublas::permutation_matrix<T> P1(m), P2(m), P3(m), P4(m); fill(A_org); A1 = A_org; A2 = A_org; A3 = A_org; A4 = A_org; double diff = 0, t, mflops; std::size_t info; std::size_t minMN = std::min(m,n); std::size_t maxMN = std::max(m,n); mflops = minMN*minMN*maxMN -(minMN*minMN*minMN/3.) -(minMN*minMN/2.); mflops /= 1000000; double *A_mkl = new double[m*n]; long *p_mkl_ = new long[m]; long long *p_mkl = (long long *)p_mkl_; for (std::size_t i=0; i<m; ++i) { for (std::size_t j=0; j<n; ++j) { A_mkl[i+j*m] = A1(i,j); } } long long m_ = m, n_ = n, info_; walltime.tic(); dgetrf(&m_, &n_, A_mkl, &m_, p_mkl, &info_); t = walltime.toc(); for (std::size_t i=0; i<m; ++i) { for (std::size_t j=0; j<n; ++j) { A1(i,j) = A_mkl[i+j*m]; } P1(i) = p_mkl[i]-1; } info = info_; delete [] A_mkl; delete [] p_mkl_; #ifndef NO_CHECK diff = checkLU(A_org, A1, P1); #endif std::printf("%4ld %4ld %4ld %16.3e %16.5lf %16.5lf", m, n, info, diff, t, mflops/t); walltime.tic(); info = foo::lu_blocked(A2, P2); t = walltime.toc(); #ifndef NO_CHECK diff = checkLU(A_org, A2, P2); #endif std::printf("%4ld %16.3e %16.5lf %16.5lf", info, diff, t, mflops/t); walltime.tic(); info = foo::lu_blocked_recursive(A3, P3); t = walltime.toc(); #ifndef NO_CHECK diff = checkLU(A_org, A3, P3); #endif std::printf("%4ld %16.3e %16.5lf %16.5lf", info, diff, t, mflops/t); EigenMat eigA(m, m); for (std::size_t i=0; i<m; ++i) { for (std::size_t j=0; j<n; ++j) { eigA(i,j) = A4(i,j); } } Eigen::PartialPivLU<EigenMat> lu(eigA); walltime.tic(); lu.compute(eigA); t = walltime.toc(); auto eigA_ = lu.reconstructedMatrix(); #ifndef NO_CHECK diff = 0; for (std::size_t i=0; i<m; ++i) { for (std::size_t j=0; j<n; ++j) { diff += std::abs(A_org(i,j) - eigA_(i,j)); } } #endif std::printf("%4ld %16.3e %16.5lf %16.5lf\n", info, diff, t, mflops/t); } }