LU Factorization

The performance of solving a system of linear equations depends on the performance of the matrix-matrix product.

What's new:

Triangular solver for \(B \leftarrow L^{-1} B\). Currently this is for the special case where \(L\) is a lower triangular matrix with unit diagonal. But this can easily be generalized. Note that the performance of the triangular solver merely depends on the performance of the GEMM micro kernel.
Two implementations for the LU factorization:
- lu_blocked is basically the algorithm of LAPACK function dgetrf.
- lu_blocked_recursive applies the factorization recursively.
Both variants exploit the performance of the matrix-matrix product and the triangular solver for matrix equations.

Compile and Run Benchmark

$shell> g++-5.3 -Wall -DNDEBUG -mavx -Ofast -std=c++11 -I ../../boost_1_60_0/ -DM_MAX=500 -DHAVE_AVX -DHAVE_GCCVEC bench_lu.cc
$shell> ./a.out
  50   50    0        3.858e-11          0.00007       1257.01889   0        3.858e-11          0.00007       1262.41266   0        3.148e-11          0.00014        574.16190
 100  100    0        2.704e-10          0.00035       1891.03810   0        1.852e-10          0.00039       1708.88958   0        2.017e-10          0.00038       1759.34469
 150  150    0        8.315e-10          0.00102       2188.39717   0        5.930e-10          0.00096       2332.16729   0        5.970e-10          0.00081       2757.47181
 200  200    0        1.833e-09          0.00109       4888.77725   0        1.466e-09          0.00078       6782.49673   0        1.492e-09          0.00070       7579.86438
 250  250    0        3.335e-09          0.00216       4812.57506   0        2.929e-09          0.00137       7571.10183   0        3.030e-09          0.00122       8507.56694
 300  300    0        5.387e-09          0.00388       4627.15277   0        5.214e-09          0.00211       8520.78992   0        4.064e-09          0.00186       9649.06530
 350  350    0        8.423e-09          0.00610       4679.07581   0        8.603e-09          0.00330       8652.79651   0        7.163e-09          0.00282      10113.78357
 400  400    0        1.221e-08          0.00923       4614.49949   0        1.292e-08          0.00448       9506.99858   0        1.080e-08          0.00398      10713.10129
 450  450    0        1.715e-08          0.01302       4659.90448   0        1.923e-08          0.00637       9519.13039   0        1.626e-08          0.00540      11239.84015
 500  500    0        2.273e-08          0.01807       4605.68407   0        2.650e-08          0.00819      10155.92090   0        2.298e-08          0.00714      11659.91454
$shell>

Benchmark (Single Threaded)

$shell> g++-5.3 -Wall -DNDEBUG -mavx -Ofast -std=c++11 -I ../../boost_1_60_0/ -DHAVE_AVX -DHAVE_GCCVEC bench_lu.cc
$shell> ./a.out > report.session7.lu
$shell> gnuplot plot.session7.lu
$shell> gnuplot plot.session7.lu.log
$shell> gnuplot plot.session7.lu.mflops
$shell>

Time (Effectiveness)

For the LU-factorization we are primarily interested how long it takes to get the factorization:

Using a log-sclaing of both axis shows, that the complexity is still of order \(N^3\):

MFLOPS (Efficiency)

We are also interested how fast each algorithm reaches its peak performance:

Benchmark (Multi Threaded)

Test with 2 threads:

$shell> g++-5.3 -fopenmp -Wall -DNDEBUG -mavx -Ofast -std=c++11 -I ../../boost_1_60_0/ -DHAVE_AVX -DHAVE_GCCVEC bench_lu.cc
$shell> export OMP_NUM_THREADS=2; ./a.out > report.session7-mt2.lu
$shell>

Test with 4 threads:

$shell> g++-5.3 -fopenmp -Wall -DNDEBUG -mavx -Ofast -std=c++11 -I ../../boost_1_60_0/ -DHAVE_AVX -DHAVE_GCCVEC bench_lu.cc
$shell> export OMP_NUM_THREADS=4; ./a.out > report.session7-mt4.lu
$shell> gnuplot plot.session7-mt.lu
$shell> gnuplot plot.session7-mt.lu.log
$shell> gnuplot plot.session7-mt.lu.mflops
$shell>

Time (Effectiveness)

MFLOPS (Efficiency)

Source Code

The tar-ball session7.tgz contains the files:

$shell> tar tfvz session7.tgz
-rw-rw-r-- lehn/num       4301 2016-02-11 16:34 session7/bench_lu.cc
-rw-rw-r-- lehn/num      13537 2016-02-02 16:05 session7/avx.hpp
-rw-rw-r-- lehn/num      24390 2016-02-02 16:05 session7/fma.hpp
-rw-rw-r-- lehn/num       1537 2016-02-12 21:01 session7/gccvec.hpp
-rw-rw-r-- lehn/num       3353 2016-02-02 16:05 session7/gccvec2.hpp
-rw-rw-r-- lehn/num      16946 2016-02-12 21:27 session7/gemm.hpp
-rw-rw-r-- lehn/num       6221 2016-02-10 13:15 session7/lu.hpp
$shell>

`lu.hpp`

session7/lu.hpp

#ifndef LU_HPP
#define LU_HPP 1

#include <boost/numeric/ublas/operation.hpp>
#include <boost/numeric/ublas/vector_proxy.hpp>
#include <boost/numeric/ublas/matrix_proxy.hpp>
#include <boost/numeric/ublas/vector.hpp>
#include <boost/numeric/ublas/triangular.hpp>
#include "gemm.hpp"

namespace foo {

template <typename MP, typename MA>
void
swap_rows(const MP &P, MA &A, bool reverse=false)
{
    typedef typename MP::size_type size_type;

    size_type size = P.size();
    if (!reverse) {
        for (size_type i=0; i<size; ++i) {
            if (i!=P(i)) {
                row(A, i).swap(row(A, P(i)));
            }
        }
    } else {
        for (size_type I=size; I>=1; --I) {
            size_type i = I-1;
            if (i!=P(i)) {
                row(A, i).swap(row(A, P(i)));
            }
        }
    }
}

// Unblocked LU factorization with partial pivoting
template <typename MA, typename MP>
typename MA::size_type
lu_unblocked(MA &A, MP &P)
{
    namespace ublas = boost::numeric::ublas;
    using ublas::range;
    using ublas::matrix_column;
    using ublas::matrix_row;

    typedef typename MA::size_type  size_type;
    typedef typename MA::value_type value_type;

    size_type singular = 0;
    size_type m  = A.size1 ();
    size_type n  = A.size2 ();
    size_type mn = std::min(m, n);


    for (size_type i=0; i<mn; ++ i) {
        matrix_column<MA> A_i(column(A,i));
        matrix_row<MA>    Ai_(row(A,i));

        P(i) = i + index_norm_inf(project(A_i, range(i, m)));
        if (A(P(i),i) != value_type()) {
            if (P(i)!=i) {
                row(A, P(i)).swap (Ai_);
            }
        } else {
            singular = i+1;
        }
        value_type alpha = value_type(1)/A(i,i);
        project (A_i, range(i+1, m)) *= alpha;
        project (A, range(i+1, m), range (i+1, n)).minus_assign (
                outer_prod (project(A_i, range(i+1, m)),
                            project(Ai_, range(i+1, n))));
    }
    return singular;
}

// Blocked LU factorization with partial pivoting
template <typename MA, typename MP>
typename MA::size_type
lu_blocked(MA &A, MP &P)
{
    namespace ublas = boost::numeric::ublas;
    using ublas::range;
    using ublas::matrix_column;
    using ublas::matrix_row;

    typedef typename MA::size_type  size_type;
    typedef typename MA::value_type value_type;

    size_type singular  = 0;
    size_type singular_ = 0;
    size_type m  = A.size1 ();
    size_type n  = A.size2 ();
    size_type mn = std::min(m, n);
    size_type bs = 64;

    if (bs>=mn) {
        singular = lu_unblocked(A, P);
    } else {
        for (size_type j=0; j<mn; j+=bs) {
            auto jb = std::min(mn-j, bs);

            auto A_ = project(A, range(j,m), range(j,j+jb));
            auto P_ = project(P, range(j,m));

            singular_ = lu_unblocked(A_, P_);

            if (singular==0 && singular_>0) {
                singular = singular_ + j;
            }

            auto A_left = project(A, range(j,m), range(0,j));
            foo::swap_rows(project(P, range(j,j+jb)), A_left);

            if (j+jb<=n) {
                auto A_right  = project(A, range(j,m), range(j+jb,n));
                foo::swap_rows(project(P, range(j,j+jb)), A_right);

                const auto L  = project(A, range(j,j+jb), range(j,j+jb));
                auto  U_right = project(A, range(j,j+jb), range(j+jb,n));

                //inplace_solve(L, U_right, ublas::unit_lower_tag());
                trlsm(value_type(1), true, L, U_right);

                if (j+jb<=m) {

                    auto A_ = project(A, range(j+jb,m), range(j+jb,n));
                    gemm(value_type(-1),
                         project(A, range(j+jb,m), range(j,j+jb)),
                         project(A, range(j,j+jb), range(j+jb,n)),
                         value_type(1),
                         A_);
                }
            }

            for (size_type i=j; i<std::min(m, j+jb); ++i) {
                P(i) += j;
            }
        }
    }
    return singular;
}

// Blocked recursive LU factorization with partial pivoting
template <typename MA, typename MP>
typename MA::size_type
lu_blocked_recursive(MA &A, MP &P)
{
    namespace ublas = boost::numeric::ublas;
    using ublas::range;
    using ublas::matrix_column;
    using ublas::matrix_row;

    typedef typename MA::size_type  size_type;
    typedef typename MA::value_type value_type;

    size_type singular  = 0;
    size_type singular_ = 0;
    size_type m  = A.size1();
    size_type n  = A.size2();
    size_type mn = std::min(m, n);
    size_type bs = 8;


    if (bs>=mn) {
        singular = lu_unblocked(A, P);
    } else {
        size_type k;
        for (k=1; k<mn/4; k*=2);

        auto A_left  = project(A, range(0,m), range(0,k));
        singular_ = lu_blocked_recursive(A_left, P);
        if (singular==0 && singular_>0) {
            singular = singular_;
        }

        auto A_right = project(A, range(0,m), range(k,n));
        auto mk      = std::min(m, k);
        foo::swap_rows(project(P, range(0,mk)), A_right);

        const auto L  = project(A, range(0,mk), range(0,mk));
        auto  U_right = project(A, range(0,mk), range(mk,n));

        //inplace_solve(L, U_right, ublas::unit_lower_tag());
        trlsm(value_type(1), true, L, U_right);

        auto A_ = project(A, range(mk,m), range(mk,n));
        auto P_ = project(P, range(mk,m));
        gemm(value_type(-1),
             project(A, range(mk,m), range(0,mk)),
             project(A, range(0,mk), range(mk,n)),
             value_type(1),
             A_);
        //std::cout << std::endl;
        //std::cout << "M = " << m << ", N = " << n << ", K = " << k << std::endl;
        //std::cout << "m = " << (m-mk) << ", n = " << (n-mk) << ", k = " << mk << std::endl;

        singular_ = lu_blocked_recursive(A_, P_);
        if (singular==0 && singular_>0) {
            singular = singular_ + mk;
        }

        auto A_left_bottom = project(A, range(mk,m), range(0,k));
        foo::swap_rows(project(P, range(mk,mn)), A_left_bottom);

        for (size_type i=mk; i<mn; ++i) {
            P(i) += mk;
        }
    }
    return singular;
}

} // namespace foo

#endif // LU_HPP

`gemm.hpp`

session7/gemm.hpp

// Code extracted from ulmBLAS: https://github.com/michael-lehn/ulmBLAS-core

#ifndef GEMM_HPP
#define GEMM_HPP
#include <algorithm>
#include <cstdlib>

#if defined(_OPENMP)
#include <omp.h>
#endif

namespace foo {

//-- new with alignment --------------------------------------------------------
void *
malloc_(std::size_t alignment, std::size_t size)
{
    alignment = std::max(alignment, alignof(void *));
    size     += alignment;

    void *ptr  = std::malloc(size);
    void *ptr2 = (void *)(((uintptr_t)ptr + alignment) & ~(alignment-1));
    void **vp  = (void**) ptr2 - 1;
    *vp        = ptr;
    return ptr2;
}

void
free_(void *ptr)
{
    std::free(*((void**)ptr-1));
}

//-- Config --------------------------------------------------------------------

// SIMD-Register width in bits
// SSE:         128
// AVX/FMA:     256
// AVX-512:     512
#ifndef SIMD_REGISTER_WIDTH
#define SIMD_REGISTER_WIDTH 256
#endif

#ifdef HAVE_FMA

#   ifndef BS_D_MR
#   define BS_D_MR 4
#   endif

#   ifndef BS_D_NR
#   define BS_D_NR 12
#   endif

#   ifndef BS_D_MC
#   define BS_D_MC 256
#   endif

#   ifndef BS_D_KC
#   define BS_D_KC 512
#   endif

#   ifndef BS_D_NC
#   define BS_D_NC 4092
#   endif

#endif


#ifndef BS_D_MR
#define BS_D_MR 4
#endif

#ifndef BS_D_NR
#define BS_D_NR 8
#endif

#ifndef BS_D_MC
#define BS_D_MC 256
#endif

#ifndef BS_D_KC
#define BS_D_KC 256
#endif

#ifndef BS_D_NC
#define BS_D_NC 4096
#endif

template <typename T>
struct BlockSize
{
    static constexpr int MC = 64;
    static constexpr int KC = 64;
    static constexpr int NC = 256;
    static constexpr int MR = 8;
    static constexpr int NR = 8;

    static constexpr int rwidth = 0;
    static constexpr int align  = alignof(T);
    static constexpr int vlen   = 0;

    static_assert(MC>0 && KC>0 && NC>0 && MR>0 && NR>0, "Invalid block size.");
    static_assert(MC % MR == 0, "MC must be a multiple of MR.");
    static_assert(NC % NR == 0, "NC must be a multiple of NR.");
};


template <>
struct BlockSize<double>
{
    static constexpr int MC     = BS_D_MC;
    static constexpr int KC     = BS_D_KC;
    static constexpr int NC     = BS_D_NC;
    static constexpr int MR     = BS_D_MR;
    static constexpr int NR     = BS_D_NR;

    static constexpr int rwidth = SIMD_REGISTER_WIDTH;
    static constexpr int align  = rwidth / 8;
#if defined(HAVE_AVX) || defined(HAVE_FMA) || defined(HAVE_GCCVEC)
    static constexpr int vlen   = rwidth / (8*sizeof(double));
#else
    static constexpr int vlen   = 0;
#endif

    static_assert(MC>0 && KC>0 && NC>0 && MR>0 && NR>0, "Invalid block size.");
    static_assert(MC % MR == 0, "MC must be a multiple of MR.");
    static_assert(NC % NR == 0, "NC must be a multiple of NR.");
    static_assert(rwidth % sizeof(double) == 0, "SIMD register width not sane.");
};

//-- aux routines --------------------------------------------------------------
template <typename Alpha, typename MX, typename MY>
void
geaxpy(const Alpha &alpha, const MX &X, MY &Y)
{
    assert(X.size1()==Y.size1());
    assert(X.size2()==Y.size2());

    typedef typename MX::size_type  size_type;

    for (size_type j=0; j<X.size2(); ++j) {
        for (size_type i=0; i<X.size1(); ++i) {
            Y(i,j) += alpha*X(i,j);
        }
    }
}

template <typename Alpha, typename MX>
void
gescal(const Alpha &alpha, MX &X)
{
    typedef typename MX::size_type  size_type;

    for (size_type j=0; j<X.size2(); ++j) {
        for (size_type i=0; i<X.size1(); ++i) {
            X(i,j) *= alpha;
        }
    }
}

template <typename Index, typename Alpha, typename TX, typename TY>
void
geaxpy(Index m, Index n,
       const Alpha &alpha,
       const TX *X, Index incRowX, Index incColX,
       TY       *Y, Index incRowY, Index incColY)
{
    for (Index j=0; j<n; ++j) {
        for (Index i=0; i<m; ++i) {
            Y[i*incRowY+j*incColY] += alpha*X[i*incRowX+j*incColX];
        }
    }
}

template <typename Index, typename Alpha, typename TX>
void
gescal(Index m, Index n,
       const Alpha &alpha,
       TX *X, Index incRowX, Index incColX)
{
    for (Index j=0; j<n; ++j) {
        for (Index i=0; i<m; ++i) {
            X[i*incRowX+j*incColX] *= alpha;
        }
    }
}

template <typename IndexType, typename MX, typename MY>
void
gecopy(IndexType m, IndexType n,
       const MX *X, IndexType incRowX, IndexType incColX,
       MY *Y, IndexType incRowY, IndexType incColY)
{
    for (IndexType j=0; j<n; ++j) {
        for (IndexType i=0; i<m; ++i) {
            Y[i*incRowY+j*incColY] = X[i*incRowX+j*incColX];
        }
    }
}

//-- Micro Kernel --------------------------------------------------------------
template <typename Index, typename T>
typename std::enable_if<BlockSize<T>::vlen == 0,
         void>::type
ugemm(Index kc, T alpha, const T *A, const T *B, T beta,
      T *C, Index incRowC, Index incColC)
{
    const Index MR = BlockSize<T>::MR;
    const Index NR = BlockSize<T>::NR;
    T P[BlockSize<T>::MR*BlockSize<T>::NR];

    for (Index l=0; l<MR*NR; ++l) {
        P[l] = 0;
    }
    for (Index l=0; l<kc; ++l) {
        for (Index j=0; j<NR; ++j) {
            for (Index i=0; i<MR; ++i) {
                P[i+j*MR] += A[i+l*MR]*B[l*NR+j];
            }
        }
    }
    for (Index j=0; j<NR; ++j) {
        for (Index i=0; i<MR; ++i) {
            C[i*incRowC+j*incColC] *= beta;
            C[i*incRowC+j*incColC] += alpha*P[i+j*MR];
        }
    }
}

#if defined HAVE_AVX
#include "avx.hpp"
#elif defined HAVE_FMA
#include "fma.hpp"
#elif defined HAVE_GCCVEC
#include "gccvec.hpp"
#endif


#ifndef HAVE_GCCVEC
template <typename T>
void
utrlsm(const T *A, T *B)
{
    typedef std::size_t IndexType;

    const IndexType MR = BlockSize<T>::MR;
    const IndexType NR = BlockSize<T>::NR;

    T   C_[MR*NR];

    for (IndexType i=0; i<MR; ++i) {
        for (IndexType j=0; j<NR; ++j) {
            C_[i+j*MR] = B[i*NR+j];
        }
    }

    for (IndexType i=0; i<MR; ++i) {
        for (IndexType j=0; j<NR; ++j) {
            C_[i+j*MR] *= A[i];
            for (IndexType l=i+1; l<MR; ++l) {
                C_[l+j*MR] -= A[l]*C_[i+j*MR];
            }
        }
        A += MR;
    }
    for (IndexType i=0; i<MR; ++i) {
        for (IndexType j=0; j<NR; ++j) {
            B[i*NR+j] = C_[i+j*MR];
        }
   }
}
#else
template <typename T>
typename std::enable_if<BlockSize<T>::vlen != 0,
         void>::type
utrlsm(const T *A, T *B)
{
    typedef std::size_t IndexType;
    typedef T           vx __attribute__((vector_size(BlockSize<T>::rwidth/8)));

    static constexpr IndexType vlen = BlockSize<T>::vlen;
    static constexpr IndexType MR   = BlockSize<T>::MR;
    static constexpr IndexType NR   = BlockSize<T>::NR/vlen;

    A = (const T*) __builtin_assume_aligned (A, BlockSize<T>::align);
    B = ( T*)      __builtin_assume_aligned (B, BlockSize<T>::align);

    vx C_[MR*NR];
    vx *B_ = (vx *)B;

    for (IndexType i=0; i<MR*NR; ++i) {
        C_[i] = B_[i];
    }

    for (IndexType i=0; i<MR; ++i) {
        for (IndexType j=0; j<NR; ++j) {
            C_[i*NR+j] *= A[i];
        }
        for (IndexType l=i+1; l<MR; ++l) {
            for (IndexType j=0; j<NR; ++j) {
                C_[l*NR+j] -= A[l]*C_[i*NR+j];
            }
        }
        A += MR;
    }
    for (IndexType i=0; i<MR*NR; ++i) {
        B_[i] = C_[i];
    }
}
#endif

//-- Macro Kernel --------------------------------------------------------------
template <typename Index, typename T, typename Beta, typename TC>
void
mgemm(Index mc, Index nc, Index kc, const T &alpha,
      const T *A, const T *B, Beta beta,
      TC *C, Index incRowC, Index incColC)
{
    const Index MR = BlockSize<T>::MR;
    const Index NR = BlockSize<T>::NR;
    const Index mp  = (mc+MR-1) / MR;
    const Index np  = (nc+NR-1) / NR;
    const Index mr_ = mc % MR;
    const Index nr_ = nc % NR;

    #if defined(_OPENMP)
    #pragma omp parallel for
    #endif
    for (Index j=0; j<np; ++j) {
        const Index nr = (j!=np-1 || nr_==0) ? NR : nr_;
        T C_[BlockSize<T>::MR*BlockSize<T>::NR];

        for (Index i=0; i<mp; ++i) {
            const Index mr = (i!=mp-1 || mr_==0) ? MR : mr_;

            if (mr==MR && nr==NR) {
                ugemm(kc, alpha,
                      &A[i*kc*MR], &B[j*kc*NR],
                      beta,
                      &C[i*MR*incRowC+j*NR*incColC],
                      incRowC, incColC);
            } else {
                std::fill_n(C_, MR*NR, T(0));
                ugemm(kc, alpha,
                      &A[i*kc*MR], &B[j*kc*NR],
                      T(0),
                      C_, Index(1), MR);
                gescal(mr, nr, beta,
                       &C[i*MR*incRowC+j*NR*incColC],
                       incRowC, incColC);
                geaxpy(mr, nr, T(1), C_, Index(1), MR,
                       &C[i*MR*incRowC+j*NR*incColC],
                       incRowC, incColC);
            }
        }
    }
}

template <typename IndexType, typename T>
void
mtrlsm(IndexType mc, IndexType nc, const T &alpha, const T *A_, T *B_)
{
    const IndexType MR = BlockSize<T>::MR;
    const IndexType NR = BlockSize<T>::NR;

    const IndexType mp = (mc+MR-1) / MR;
    const IndexType np = (nc+NR-1) / NR;

    #if defined(_OPENMP)
    #pragma omp parallel for
    #endif
    for (IndexType j=0; j<np; ++j) {

        IndexType ia = 0;
        for (IndexType i=0; i<mp; ++i) {
            IndexType kc    = i*MR;

            ugemm(kc,
                  T(-1), &A_[ia*MR*MR], &B_[j*mc*NR],
                  alpha,
                  &B_[(j*mc+kc)*NR], NR, IndexType(1));

            utrlsm(&A_[(ia*MR+kc)*MR], &B_[(j*mc+kc)*NR]);
            ia += i+1;
        }
    }
}

//-- Packing blocks ------------------------------------------------------------
template <typename MA, typename T>
void
pack_A(const MA &A, T *p)
{
    typedef typename MA::size_type  size_type;

    size_type mc = A.size1();
    size_type kc = A.size2();
    size_type MR = BlockSize<T>::MR;
    size_type mp = (mc+MR-1) / MR;

    for (size_type j=0; j<kc; ++j) {
        for (size_type l=0; l<mp; ++l) {
            for (size_type i0=0; i0<MR; ++i0) {
                size_type i  = l*MR + i0;
                size_type nu = l*MR*kc + j*MR + i0;
                p[nu]        = (i<mc) ? A(i,j) : T(0);
            }
        }
    }
}

template <typename MB, typename T>
void
pack_B(const MB &B, T *p)
{
    typedef typename MB::size_type  size_type;

    size_type kc = B.size1();
    size_type nc = B.size2();
    size_type NR = BlockSize<T>::NR;
    size_type np = (nc+NR-1) / NR;

    for (size_type l=0; l<np; ++l) {
        for (size_type j0=0; j0<NR; ++j0) {
            for (size_type i=0; i<kc; ++i) {
                size_type j  = l*NR+j0;
                size_type nu = l*NR*kc + i*NR + j0;
                p[nu]        = (j<nc) ? B(i,j) : T(0);
            }
        }
    }
}

template <typename T, typename MB>
void
unpack_B(const T *p, MB &B)
{
    typedef typename MB::size_type  size_type;

    size_type kc = B.size1();
    size_type nc = B.size2();
    size_type NR = BlockSize<T>::NR;
    size_type np = (nc+NR-1) / NR;

    for (size_type l=0; l<np; ++l) {
        for (size_type j0=0; j0<NR; ++j0) {
            for (size_type i=0; i<kc; ++i) {
                size_type j  = l*NR+j0;
                size_type nu = l*NR*kc + i*NR + j0;
                if (j<nc) {
                    B(i,j) = p[nu];
                }
            }
        }
    }
}

template <typename ML, typename T>
void
pack_L(const ML &L, T *p)
{
    typedef typename ML::size_type  size_type;

    assert(L.size1()==L.size2());

    size_type mc = L.size1();
    size_type MR = BlockSize<T>::MR;
    size_type mp = (mc+MR-1) / MR;

    for (size_type j=0; j<mp; ++j) {
        for (size_type j0=0; j0<MR; ++j0) {
            for (size_type i=j; i<mp; ++i) {
                for (size_type i0=0; i0<MR; ++i0) {
                    size_type I  = i*MR+i0;
                    size_type J  = j*MR+j0;
                    size_type nu = (i+1)*i/2*MR*MR + j*MR*MR + j0*MR +i0;
                    p[nu]        = (I==J) ? T(1)
                                          : (I>=mc || J>=mc) ? T(0)
                                                             : (I>J) ? L(I,J)
                                                                     : T(0);
                }
            }
        }
    }
}

//-- Frame routine -------------------------------------------------------------
template <typename Alpha, typename MatrixA, typename MatrixB,
         typename Beta, typename MatrixC>
void
gemm(Alpha alpha, const MatrixA &A, const MatrixB &B, Beta beta, MatrixC &C)
{
    assert(A.size2()==B.size1());
    namespace ublas = boost::numeric::ublas;

    typedef typename MatrixC::size_type  size_type;
    typedef typename MatrixA::value_type TA;
    typedef typename MatrixB::value_type TB;
    typedef typename MatrixC::value_type TC;
    typedef typename std::common_type<Alpha, TA, TB>::type  T;

    const size_type MC = BlockSize<T>::MC;
    const size_type NC = BlockSize<T>::NC;
    const size_type MR = BlockSize<T>::MR;
    const size_type NR = BlockSize<T>::NR;

    const size_type m = C.size1();
    const size_type n = C.size2();
    const size_type k = A.size2();

    const size_type KC = BlockSize<T>::KC;
    const size_type mb = (m+MC-1) / MC;
    const size_type nb = (n+NC-1) / NC;
    const size_type kb = (k+KC-1) / KC;
    const size_type mc_ = m % MC;
    const size_type nc_ = n % NC;
    const size_type kc_ = k % KC;

    if (m==0 || n==0 || ((alpha==Alpha(0) || k==0) && (beta==Beta(1)))) {
        return;
    }

    TC *C_ = &C(0,0);
    const size_type incRowC = &C(1,0) - &C(0,0);
    const size_type incColC = &C(0,1) - &C(0,0);

    T *A_ = (T*) malloc_(BlockSize<T>::align, sizeof(T)*(MC*KC+MR));
    T *B_ = (T*) malloc_(BlockSize<T>::align, sizeof(T)*(KC*NC+NR));

    if (alpha==Alpha(0) || k==0) {
        gescal(beta, C);
        return;
    }

    for (size_type j=0; j<nb; ++j) {
        size_type nc = (j!=nb-1 || nc_==0) ? NC : nc_;

        for (size_type l=0; l<kb; ++l) {
            size_type kc = (l!=kb-1 || kc_==0) ? KC : kc_;
            Beta beta_   = (l==0) ? beta : Beta(1);

            const auto Bs = subrange(B, l*KC, l*KC+kc, j*NC, j*NC+nc);
            pack_B(Bs, B_);

            for (size_type i=0; i<mb; ++i) {
                size_type mc = (i!=mb-1 || mc_==0) ? MC : mc_;

                const auto As = subrange(A, i*MC, i*MC+mc, l*KC, l*KC+kc);
                pack_A(As, A_);

                mgemm(mc, nc, kc,
                      T(alpha), A_, B_, beta_,
                      &C_[i*MC*incRowC+j*NC*incColC],
                      incRowC, incColC);
            }
        }
    }
    free_(A_);
    free_(B_);
}

template <typename Alpha, typename MatrixA, typename MatrixB>
void
trlsm(const Alpha   &alpha, bool unitDiag, const MatrixA &A, MatrixB &B)
{
    assert(A.size2()==A.size1());
    namespace ublas = boost::numeric::ublas;

    typedef typename MatrixA::size_type  size_type;
    typedef typename MatrixA::value_type TA;
    typedef typename MatrixB::value_type TB;

    typedef typename std::common_type<Alpha, TA, TB>::type   T_;
    typedef typename std::remove_const<T_>::type             T;

    const size_type MC = BlockSize<T>::MC;
    const size_type NC = BlockSize<T>::NC;
    const size_type MR = BlockSize<T>::MR;
    const size_type NR = BlockSize<T>::NR;

    const size_type m = B.size1();
    const size_type n = B.size2();

    const size_type mb = (m+MC-1) / MC;
    const size_type nb = (n+NC-1) / NC;
    const size_type mc_ = m % MC;
    const size_type nc_ = n % NC;

    if (m==0 || n==0) {
        return;
    }

    const size_type incRowB = &B(1,0) - &B(0,0);
    const size_type incColB = &B(0,1) - &B(0,0);

    if (alpha==Alpha(0)) {
        gescal(Alpha(0), B);
        return;
    }

    T *A_ = (T*) malloc_(BlockSize<T>::align, sizeof(T)*(MC*MC+MR));
    T *B_ = (T*) malloc_(BlockSize<T>::align, sizeof(T)*(MC*NC+NR));

    for (size_type j=0; j<nb; ++j) {
        size_type nc = (j!=nb-1 || nc_==0) ? NC : nc_;

        for (size_type i=0; i<mb; ++i) {
            size_type mc  = (i!=mb-1 || mc_==0) ? MC : mc_;
            Alpha  alpha_ = (i==0) ? alpha : Alpha(1);

            auto Bs = subrange(B, i*MC, i*MC+mc, j*NC, j*NC+nc);
            pack_B(Bs, B_);

            const auto Ls = subrange(A, i*MC, i*MC+mc, i*MC, i*MC+mc);
            pack_L(Ls, A_);

            mtrlsm(mc, nc, T(alpha_), A_, B_);
            unpack_B(B_, Bs);

            for (size_type l=i+1; l<mb; ++l) {
                mc  = (l!=mb-1 || mc_==0) ? MC : mc_;

                const auto As = subrange(A, l*MC, l*MC+mc, i*MC, i*MC+mc);
                pack_A(As, A_);

                mgemm(mc, nc, MC, T(-1), A_, B_, alpha_,
                      &B(l*MC,j*NC), incRowB, incColB);
            }
        }
    }

    free_(A_);
    free_(B_);
}

} // namespace foo

#endif

`bench_lu.cc`

session7/bench_lu.cc

#include <cassert>
#include <chrono>
#include <cmath>
#include <limits>
#include <random>
#include <type_traits>
#include <boost/timer.hpp>
#include <boost/numeric/ublas/matrix.hpp>
#include <boost/numeric/ublas/io.hpp>
#include <boost/numeric/ublas/lu.hpp>
#include <boost/numeric/ublas/operation.hpp>
#include "gemm.hpp"
#include "lu.hpp"

template <typename T>
struct WallTime
{
    void
    tic()
    {
        t0 = std::chrono::high_resolution_clock::now();
    }

    T
    toc()
    {
        using namespace std::chrono;

        elapsed = high_resolution_clock::now() - t0;
        return duration<T,seconds::period>(elapsed).count();
    }

    std::chrono::high_resolution_clock::time_point t0;
    std::chrono::high_resolution_clock::duration   elapsed;
};

//
// For checking I don't care for efficiency of elegance ...
//
template <typename MatrixA1, typename MatrixA2, typename MatrixP>
double
checkLU(const MatrixA1 &A1, MatrixA2 &A2, const MatrixP &P)
{
    namespace ublas = boost::numeric::ublas;

    typedef typename MatrixA1::size_type    size_type;
    typedef typename MatrixA1::value_type   value_type;

    size_type m = A1.size1();
    size_type n = A1.size2();

    ublas::matrix<value_type>     L(m,m);

    // Copy L form A2 and set A2 to U
    for (size_type j=0; j<m; ++j) {
        for (size_type i=0; i<m; ++i) {
            if (i==j) {
                L(i,i) = 1;
            } else if (i>j) {
                L(i,j)  = A2(i,j);
                A2(i,j) = 0;
            } else if (j>i) {
                L(i,j)  = 0;
            }
        }
    }
    ublas::matrix<value_type>   LU(m,n);
    ublas::axpy_prod(L, A2, LU, true);

    // LU <- inv(P)*L*U
    foo::swap_rows(P, LU, true);

    double diff = 0;
    for (size_type j=0; j<n; ++j) {
        for (size_type i=0; i<m; ++i) {
            diff += std::abs(A1(i,j)-LU(i,j));
        }
    }
    return diff;
}

// fill rectangular matrix with random values
template <typename MATRIX>
void
fill(MATRIX &A)
{
    typedef typename MATRIX::size_type  size_type;
    typedef typename MATRIX::value_type T;

    std::random_device                  random;
    std::default_random_engine          mt(random());
    std::uniform_real_distribution<T>   uniform(-100,100);

    for (size_type i=0; i<A.size1(); ++i) {
        for (size_type j=0; j<A.size2(); ++j) {
            A(i,j) = uniform(mt);
        }
    }
}

#ifndef M_MAX
#define M_MAX 4000
#endif

#ifndef N_MAX
#define N_MAX 4000
#endif

int
main()
{
    namespace ublas = boost::numeric::ublas;

    const std::size_t m_min = 50;
    const std::size_t n_min = 50;

    const std::size_t m_max = M_MAX;
    const std::size_t n_max = N_MAX;

    const std::size_t m_inc = 50;
    const std::size_t n_inc = 50;

    typedef double              T;
    typedef ublas::row_major    SO;

    WallTime<double>  walltime;

    for (std::size_t m=m_min, n=n_min;
         m<=m_max && n<=n_max;
         m+=m_inc, n+=n_inc)
    {
        ublas::matrix<T,SO>             A_org(m, n);
        ublas::matrix<T,SO>             A1(m, n);
        ublas::matrix<T,SO>             A2(m, n);
        ublas::matrix<T,SO>             A3(m, n);
        ublas::permutation_matrix<T>    P1(m), P2(m), P3(m);

        fill(A_org);
        A1 = A_org;
        A2 = A_org;
        A3 = A_org;

        double      diff, t, mflops;
        std::size_t info;

        std::size_t minMN = std::min(m,n);
        std::size_t maxMN = std::max(m,n);

        mflops  = minMN*minMN*maxMN -(minMN*minMN*minMN/3.) -(minMN*minMN/2.);
        mflops /= 1000000;

        walltime.tic();
        info = ublas::lu_factorize(A1, P1);
        t = walltime.toc();

        diff   = checkLU(A_org, A1, P1);
        std::printf("%4ld %4ld %4ld %16.3e %16.5lf %16.5lf",
                    m, n, info, diff, t, mflops/t);

        walltime.tic();
        info = foo::lu_blocked(A2, P2);
        t = walltime.toc();

        diff = checkLU(A_org, A2, P2);
        std::printf("%4ld %16.3e %16.5lf %16.5lf",
                    info, diff, t, mflops/t);

        walltime.tic();
        info = foo::lu_blocked_recursive(A3, P3);
        t = walltime.toc();


        diff = checkLU(A_org, A3, P3);
        std::printf("%4ld %16.3e %16.5lf %16.5lf\n",
                    info, diff, t, mflops/t);

    }
}