Comparison with Intel MKL

We are on the right track. But there is still some work to do. If you look at the benchmarks, you will see that the Intel MKL reaches its peak performance much faster. At the moment the unlocked LU factorization is the bottleneck. In order to keep up some BLAS Level 2 functions need improvement.

Compile and Run Benchmark

$shell> g++-5.3 -DM_MAX=800 -Wall -DNDEBUG -mavx -Ofast -std=c++11 -I ../../boost_1_60_0/ -I /opt/intel/compilers_and_libraries/linux/mkl/include -DMKL_ILP64 -DHAVE_AVX -DHAVE_GCCVEC bench_lu.cc -L /opt/intel/compilers_and_libraries/linux/mkl/lib/intel64  -lmkl_intel_ilp64 -lmkl_core -lmkl_sequential -lm -lpthread -Wl,-rpath /opt/intel/compilers_and_libraries/linux/mkl/lib/intel64 -I ../../eigen-eigen-ce5a455b34c0/
$shell> ./a.out
  50   50    0        3.405e-11          0.00382         21.46344   0        4.131e-11          0.00007       1174.14544   0        2.313e-11          0.00012        658.43654   0        4.151e-11          0.00006       1354.04123
 100  100    0        2.291e-10          0.00023       2903.49853   0        1.926e-10          0.00039       1696.35502   0        1.453e-10          0.00028       2333.53436   0        2.030e-10          0.00028       2373.28340
 150  150    0        7.048e-10          0.00054       4133.02097   0        5.808e-10          0.00090       2495.19073   0        4.519e-10          0.00072       3114.32331   0        5.908e-10          0.00035       6383.06062
 200  200    0        1.497e-09          0.00044      12196.11102   0        1.382e-09          0.00070       7574.61279   0        9.956e-10          0.00059       8998.16142   0        1.025e-09          0.00074       7155.69610
 250  250    0        2.894e-09          0.00073      14265.58426   0        2.914e-09          0.00132       7896.79052   0        2.105e-09          0.00104       9983.52008   0        3.086e-09          0.00145       7173.39841
 300  300    0        4.654e-09          0.00119      15100.73464   0        5.368e-09          0.00195       9200.51591   0        3.279e-09          0.00174      10326.97783   0        5.433e-09          0.00229       7831.90392
 350  350    0        6.806e-09          0.00171      16692.38816   0        8.498e-09          0.00304       9372.03209   0        4.873e-09          0.00259      11014.51374   0        8.245e-09          0.00363       7856.64146
 400  400    0        1.013e-08          0.00240      17731.84744   0        1.392e-08          0.00414      10277.61052   0        7.705e-09          0.00382      11144.06531   0        6.814e-09          0.00521       8176.52906
 450  450    0        1.383e-08          0.00328      18467.25337   0        1.907e-08          0.00581      10431.91423   0        1.102e-08          0.00486      12467.76610   0        1.812e-08          0.00739       8209.37797
 500  500    0        1.865e-08          0.00430      19345.10663   0        2.580e-08          0.00729      11414.96917   0        1.571e-08          0.00620      13430.56606   0        2.388e-08          0.00977       8512.43287
 550  550    0        2.584e-08          0.00567      19521.02455   0        3.563e-08          0.00989      11199.80674   0        2.218e-08          0.00833      13295.14507   0        3.219e-08          0.01328       8337.99920
 600  600    0        3.495e-08          0.00723      19888.72437   0        4.606e-08          0.01184      12148.82283   0        2.991e-08          0.01067      13474.35404   0        2.191e-08          0.01676       8582.73821
 650  650    0        4.571e-08          0.00909      20114.99784   0        5.888e-08          0.01637      11174.04558   0        4.052e-08          0.01334      13705.89299   0        5.541e-08          0.02171       8423.28059
 700  700    0        5.830e-08          0.01096      20832.71391   0        7.548e-08          0.02094      10907.48582   0        5.203e-08          0.01602      14255.15675   0        6.840e-08          0.02687       8499.41149
 750  750    0        7.130e-08          0.01357      20709.53373   0        9.011e-08          0.02399      11712.34963   0        6.435e-08          0.02022      13896.63346   0        8.145e-08          0.03371       8335.78515
 800  800    0        8.924e-08          0.01633      20884.45922   0        1.125e-07          0.02699      12635.08437   0        8.152e-08          0.02445      13946.51599   0        4.945e-08          0.03969       8591.61312
$shell>

Benchmark (Single Threaded)

$shell> g++-5.3 -DNO_CHECK -Wall -DNDEBUG -mavx -Ofast -std=c++11 -I ../../boost_1_60_0/ -I /opt/intel/compilers_and_libraries/linux/mkl/include -DMKL_ILP64 -DHAVE_AVX -DHAVE_GCCVEC bench_lu.cc -L /opt/intel/compilers_and_libraries/linux/mkl/lib/intel64  -lmkl_intel_ilp64 -lmkl_core -lmkl_sequential -lm -lpthread -Wl,-rpath /opt/intel/compilers_and_libraries/linux/mkl/lib/intel64 -I ../../eigen-eigen-ce5a455b34c0/
$shell> ./a.out > report.session8.lu
$shell> gnuplot plot.session8.lu
$shell> gnuplot plot.session8.lu.log
$shell> gnuplot plot.session8.lu.mflops
$shell>

Time (Effectiveness)

MFLOPS (Efficiency)

Source Code

`bench_lu.cc`

session8/bench_lu.cc

#include <cassert>
#include <chrono>
#include <cmath>
#include <limits>
#include <random>
#include <type_traits>
#include <boost/timer.hpp>
#include <boost/numeric/ublas/matrix.hpp>
#include <boost/numeric/ublas/io.hpp>
#include <boost/numeric/ublas/lu.hpp>
#include <boost/numeric/ublas/operation.hpp>
#include "gemm.hpp"
#include "lu.hpp"
#include <mkl_lapack.h>
#include <Eigen/Dense>

template <typename T>
struct WallTime
{
    void
    tic()
    {
        t0 = std::chrono::high_resolution_clock::now();
    }

    T
    toc()
    {
        using namespace std::chrono;

        elapsed = high_resolution_clock::now() - t0;
        return duration<T,seconds::period>(elapsed).count();
    }

    std::chrono::high_resolution_clock::time_point t0;
    std::chrono::high_resolution_clock::duration   elapsed;
};

//
// For checking I don't care for efficiency of elegance ...
//
template <typename MatrixA1, typename MatrixA2, typename MatrixP>
double
checkLU(const MatrixA1 &A1, MatrixA2 &A2, const MatrixP &P)
{
    namespace ublas = boost::numeric::ublas;

    typedef typename MatrixA1::size_type    size_type;
    typedef typename MatrixA1::value_type   value_type;

    size_type m = A1.size1();
    size_type n = A1.size2();

    ublas::matrix<value_type>     L(m,m);

    // Copy L form A2 and set A2 to U
    for (size_type j=0; j<m; ++j) {
        for (size_type i=0; i<m; ++i) {
            if (i==j) {
                L(i,i) = 1;
            } else if (i>j) {
                L(i,j)  = A2(i,j);
                A2(i,j) = 0;
            } else if (j>i) {
                L(i,j)  = 0;
            }
        }
    }
    ublas::matrix<value_type>   LU(m,n);
    ublas::axpy_prod(L, A2, LU, true);

    // LU <- inv(P)*L*U
    foo::swap_rows(P, LU, true);

    double diff = 0;
    for (size_type j=0; j<n; ++j) {
        for (size_type i=0; i<m; ++i) {
            diff += std::abs(A1(i,j)-LU(i,j));
        }
    }
    return diff;
}

// fill rectangular matrix with random values
template <typename MATRIX>
void
fill(MATRIX &A)
{
    typedef typename MATRIX::size_type  size_type;
    typedef typename MATRIX::value_type T;

    std::random_device                  random;
    std::default_random_engine          mt(random());
    std::uniform_real_distribution<T>   uniform(-100,100);

    for (size_type i=0; i<A.size1(); ++i) {
        for (size_type j=0; j<A.size2(); ++j) {
            A(i,j) = uniform(mt);
        }
    }
}

#ifndef M_MAX
#define M_MAX 10000
#endif

#ifndef N_MAX
#define N_MAX 10000
#endif

int
main()
{
    namespace ublas = boost::numeric::ublas;

    const std::size_t m_min = 50;
    const std::size_t n_min = 50;

    const std::size_t m_max = M_MAX;
    const std::size_t n_max = N_MAX;

    const std::size_t m_inc = 50;
    const std::size_t n_inc = 50;

    typedef double              T;
    typedef ublas::row_major    SO;

    typedef Eigen::Matrix<T,Eigen::Dynamic,Eigen::Dynamic> EigenMat;

    WallTime<double>  walltime;

    for (std::size_t m=m_min, n=n_min;
         m<=m_max && n<=n_max;
         m+=m_inc, n+=n_inc)
    {
        ublas::matrix<T,SO>             A_org(m, n);
        ublas::matrix<T,SO>             A1(m, n);
        ublas::matrix<T,SO>             A2(m, n);
        ublas::matrix<T,SO>             A3(m, n);
        ublas::matrix<T,SO>             A4(m, n);
        ublas::permutation_matrix<T>    P1(m), P2(m), P3(m), P4(m);

        fill(A_org);
        A1 = A_org;
        A2 = A_org;
        A3 = A_org;
        A4 = A_org;

        double      diff = 0, t, mflops;
        std::size_t info;

        std::size_t minMN = std::min(m,n);
        std::size_t maxMN = std::max(m,n);

        mflops  = minMN*minMN*maxMN -(minMN*minMN*minMN/3.) -(minMN*minMN/2.);
        mflops /= 1000000;

        double    *A_mkl  = new double[m*n];
        long      *p_mkl_ = new long[m];
        long long *p_mkl  = (long long *)p_mkl_;

        for (std::size_t i=0; i<m; ++i) {
            for (std::size_t j=0; j<n; ++j) {
                A_mkl[i+j*m] = A1(i,j);
            }
        }
        long long m_ = m, n_ = n, info_;

        walltime.tic();
        dgetrf(&m_, &n_, A_mkl, &m_, p_mkl, &info_);
        t = walltime.toc();

        for (std::size_t i=0; i<m; ++i) {
            for (std::size_t j=0; j<n; ++j) {
                A1(i,j) = A_mkl[i+j*m];
            }
            P1(i) = p_mkl[i]-1;
        }
        info = info_;

        delete [] A_mkl;
        delete [] p_mkl_;

#ifndef NO_CHECK
        diff   = checkLU(A_org, A1, P1);
#endif
        std::printf("%4ld %4ld %4ld %16.3e %16.5lf %16.5lf",
                    m, n, info, diff, t, mflops/t);

        walltime.tic();
        info = foo::lu_blocked(A2, P2);
        t = walltime.toc();

#ifndef NO_CHECK
        diff = checkLU(A_org, A2, P2);
#endif
        std::printf("%4ld %16.3e %16.5lf %16.5lf",
                    info, diff, t, mflops/t);

        walltime.tic();
        info = foo::lu_blocked_recursive(A3, P3);
        t = walltime.toc();


#ifndef NO_CHECK
        diff = checkLU(A_org, A3, P3);
#endif
        std::printf("%4ld %16.3e %16.5lf %16.5lf",
                    info, diff, t, mflops/t);


        EigenMat eigA(m, m);
        for (std::size_t i=0; i<m; ++i) {
            for (std::size_t j=0; j<n; ++j) {
                eigA(i,j) = A4(i,j);
            }
        }
        Eigen::PartialPivLU<EigenMat> lu(eigA);
        walltime.tic();
        lu.compute(eigA);
        t = walltime.toc();
        auto eigA_ = lu.reconstructedMatrix();
#ifndef NO_CHECK
        diff = 0;
        for (std::size_t i=0; i<m; ++i) {
            for (std::size_t j=0; j<n; ++j) {
                diff += std::abs(A_org(i,j) - eigA_(i,j));
            }
        }
#endif
        std::printf("%4ld %16.3e %16.5lf %16.5lf\n",
                    info, diff, t, mflops/t);
    }
}