Content

LU Factorization

The performance of solving a system of linear equations depends on the performance of the matrix-matrix product.

What's new:

Compile and Run Benchmark

$shell> g++-5.3 -Wall -DNDEBUG -mavx -Ofast -std=c++11 -I ../../boost_1_60_0/ -DM_MAX=500 -DHAVE_AVX -DHAVE_GCCVEC bench_lu.cc
$shell> ./a.out
  50   50    0        3.858e-11          0.00007       1257.01889   0        3.858e-11          0.00007       1262.41266   0        3.148e-11          0.00014        574.16190
 100  100    0        2.704e-10          0.00035       1891.03810   0        1.852e-10          0.00039       1708.88958   0        2.017e-10          0.00038       1759.34469
 150  150    0        8.315e-10          0.00102       2188.39717   0        5.930e-10          0.00096       2332.16729   0        5.970e-10          0.00081       2757.47181
 200  200    0        1.833e-09          0.00109       4888.77725   0        1.466e-09          0.00078       6782.49673   0        1.492e-09          0.00070       7579.86438
 250  250    0        3.335e-09          0.00216       4812.57506   0        2.929e-09          0.00137       7571.10183   0        3.030e-09          0.00122       8507.56694
 300  300    0        5.387e-09          0.00388       4627.15277   0        5.214e-09          0.00211       8520.78992   0        4.064e-09          0.00186       9649.06530
 350  350    0        8.423e-09          0.00610       4679.07581   0        8.603e-09          0.00330       8652.79651   0        7.163e-09          0.00282      10113.78357
 400  400    0        1.221e-08          0.00923       4614.49949   0        1.292e-08          0.00448       9506.99858   0        1.080e-08          0.00398      10713.10129
 450  450    0        1.715e-08          0.01302       4659.90448   0        1.923e-08          0.00637       9519.13039   0        1.626e-08          0.00540      11239.84015
 500  500    0        2.273e-08          0.01807       4605.68407   0        2.650e-08          0.00819      10155.92090   0        2.298e-08          0.00714      11659.91454
$shell> 

Benchmark (Single Threaded)

$shell> g++-5.3 -Wall -DNDEBUG -mavx -Ofast -std=c++11 -I ../../boost_1_60_0/ -DHAVE_AVX -DHAVE_GCCVEC bench_lu.cc
$shell> ./a.out > report.session7.lu
$shell> gnuplot plot.session7.lu
$shell> gnuplot plot.session7.lu.log
$shell> gnuplot plot.session7.lu.mflops
$shell> 

Time (Effectiveness)

For the LU-factorization we are primarily interested how long it takes to get the factorization:

Using a log-sclaing of both axis shows, that the complexity is still of order \(N^3\):

MFLOPS (Efficiency)

We are also interested how fast each algorithm reaches its peak performance:

Benchmark (Multi Threaded)

Test with 2 threads:

$shell> g++-5.3 -fopenmp -Wall -DNDEBUG -mavx -Ofast -std=c++11 -I ../../boost_1_60_0/ -DHAVE_AVX -DHAVE_GCCVEC bench_lu.cc
$shell> export OMP_NUM_THREADS=2; ./a.out > report.session7-mt2.lu
$shell> 

Test with 4 threads:

$shell> g++-5.3 -fopenmp -Wall -DNDEBUG -mavx -Ofast -std=c++11 -I ../../boost_1_60_0/ -DHAVE_AVX -DHAVE_GCCVEC bench_lu.cc
$shell> export OMP_NUM_THREADS=4; ./a.out > report.session7-mt4.lu
$shell> gnuplot plot.session7-mt.lu
$shell> gnuplot plot.session7-mt.lu.log
$shell> gnuplot plot.session7-mt.lu.mflops
$shell> 

Time (Effectiveness)

MFLOPS (Efficiency)

Source Code

The tar-ball session7.tgz contains the files:

$shell> tar tfvz session7.tgz
-rw-rw-r-- lehn/num       4301 2016-02-11 16:34 session7/bench_lu.cc
-rw-rw-r-- lehn/num      13537 2016-02-02 16:05 session7/avx.hpp
-rw-rw-r-- lehn/num      24390 2016-02-02 16:05 session7/fma.hpp
-rw-rw-r-- lehn/num       1537 2016-02-12 21:01 session7/gccvec.hpp
-rw-rw-r-- lehn/num       3353 2016-02-02 16:05 session7/gccvec2.hpp
-rw-rw-r-- lehn/num      16946 2016-02-12 21:27 session7/gemm.hpp
-rw-rw-r-- lehn/num       6221 2016-02-10 13:15 session7/lu.hpp
$shell> 

lu.hpp

#ifndef LU_HPP
#define LU_HPP 1

#include <boost/numeric/ublas/operation.hpp>
#include <boost/numeric/ublas/vector_proxy.hpp>
#include <boost/numeric/ublas/matrix_proxy.hpp>
#include <boost/numeric/ublas/vector.hpp>
#include <boost/numeric/ublas/triangular.hpp>
#include "gemm.hpp"

namespace foo {

template <typename MP, typename MA>
void
swap_rows(const MP &P, MA &A, bool reverse=false)
{
    typedef typename MP::size_type size_type;

    size_type size = P.size();
    if (!reverse) {
        for (size_type i=0; i<size; ++i) {
            if (i!=P(i)) {
                row(A, i).swap(row(A, P(i)));
            }
        }
    } else {
        for (size_type I=size; I>=1; --I) {
            size_type i = I-1;
            if (i!=P(i)) {
                row(A, i).swap(row(A, P(i)));
            }
        }
    }
}

// Unblocked LU factorization with partial pivoting
template <typename MA, typename MP>
typename MA::size_type
lu_unblocked(MA &A, MP &P)
{
    namespace ublas = boost::numeric::ublas;
    using ublas::range;
    using ublas::matrix_column;
    using ublas::matrix_row;

    typedef typename MA::size_type  size_type;
    typedef typename MA::value_type value_type;

    size_type singular = 0;
    size_type m  = A.size1 ();
    size_type n  = A.size2 ();
    size_type mn = std::min(m, n);


    for (size_type i=0; i<mn; ++ i) {
        matrix_column<MA> A_i(column(A,i));
        matrix_row<MA>    Ai_(row(A,i));

        P(i) = i + index_norm_inf(project(A_i, range(i, m)));
        if (A(P(i),i) != value_type()) {
            if (P(i)!=i) {
                row(A, P(i)).swap (Ai_);
            }
        } else {
            singular = i+1;
        }
        value_type alpha = value_type(1)/A(i,i);
        project (A_i, range(i+1, m)) *= alpha;
        project (A, range(i+1, m), range (i+1, n)).minus_assign (
                outer_prod (project(A_i, range(i+1, m)),
                            project(Ai_, range(i+1, n))));
    }
    return singular;
}

// Blocked LU factorization with partial pivoting
template <typename MA, typename MP>
typename MA::size_type
lu_blocked(MA &A, MP &P)
{
    namespace ublas = boost::numeric::ublas;
    using ublas::range;
    using ublas::matrix_column;
    using ublas::matrix_row;

    typedef typename MA::size_type  size_type;
    typedef typename MA::value_type value_type;

    size_type singular  = 0;
    size_type singular_ = 0;
    size_type m  = A.size1 ();
    size_type n  = A.size2 ();
    size_type mn = std::min(m, n);
    size_type bs = 64;

    if (bs>=mn) {
        singular = lu_unblocked(A, P);
    } else {
        for (size_type j=0; j<mn; j+=bs) {
            auto jb = std::min(mn-j, bs);

            auto A_ = project(A, range(j,m), range(j,j+jb));
            auto P_ = project(P, range(j,m));

            singular_ = lu_unblocked(A_, P_);

            if (singular==0 && singular_>0) {
                singular = singular_ + j;
            }

            auto A_left = project(A, range(j,m), range(0,j));
            foo::swap_rows(project(P, range(j,j+jb)), A_left);

            if (j+jb<=n) {
                auto A_right  = project(A, range(j,m), range(j+jb,n));
                foo::swap_rows(project(P, range(j,j+jb)), A_right);

                const auto L  = project(A, range(j,j+jb), range(j,j+jb));
                auto  U_right = project(A, range(j,j+jb), range(j+jb,n));

                //inplace_solve(L, U_right, ublas::unit_lower_tag());
                trlsm(value_type(1), true, L, U_right);

                if (j+jb<=m) {

                    auto A_ = project(A, range(j+jb,m), range(j+jb,n));
                    gemm(value_type(-1),
                         project(A, range(j+jb,m), range(j,j+jb)),
                         project(A, range(j,j+jb), range(j+jb,n)),
                         value_type(1),
                         A_);
                }
            }

            for (size_type i=j; i<std::min(m, j+jb); ++i) {
                P(i) += j;
            }
        }
    }
    return singular;
}

// Blocked recursive LU factorization with partial pivoting
template <typename MA, typename MP>
typename MA::size_type
lu_blocked_recursive(MA &A, MP &P)
{
    namespace ublas = boost::numeric::ublas;
    using ublas::range;
    using ublas::matrix_column;
    using ublas::matrix_row;

    typedef typename MA::size_type  size_type;
    typedef typename MA::value_type value_type;

    size_type singular  = 0;
    size_type singular_ = 0;
    size_type m  = A.size1();
    size_type n  = A.size2();
    size_type mn = std::min(m, n);
    size_type bs = 8;


    if (bs>=mn) {
        singular = lu_unblocked(A, P);
    } else {
        size_type k;
        for (k=1; k<mn/4; k*=2);

        auto A_left  = project(A, range(0,m), range(0,k));
        singular_ = lu_blocked_recursive(A_left, P);
        if (singular==0 && singular_>0) {
            singular = singular_;
        }

        auto A_right = project(A, range(0,m), range(k,n));
        auto mk      = std::min(m, k);
        foo::swap_rows(project(P, range(0,mk)), A_right);

        const auto L  = project(A, range(0,mk), range(0,mk));
        auto  U_right = project(A, range(0,mk), range(mk,n));

        //inplace_solve(L, U_right, ublas::unit_lower_tag());
        trlsm(value_type(1), true, L, U_right);

        auto A_ = project(A, range(mk,m), range(mk,n));
        auto P_ = project(P, range(mk,m));
        gemm(value_type(-1),
             project(A, range(mk,m), range(0,mk)),
             project(A, range(0,mk), range(mk,n)),
             value_type(1),
             A_);
        //std::cout << std::endl;
        //std::cout << "M = " << m << ", N = " << n << ", K = " << k << std::endl;
        //std::cout << "m = " << (m-mk) << ", n = " << (n-mk) << ", k = " << mk << std::endl;

        singular_ = lu_blocked_recursive(A_, P_);
        if (singular==0 && singular_>0) {
            singular = singular_ + mk;
        }

        auto A_left_bottom = project(A, range(mk,m), range(0,k));
        foo::swap_rows(project(P, range(mk,mn)), A_left_bottom);

        for (size_type i=mk; i<mn; ++i) {
            P(i) += mk;
        }
    }
    return singular;
}

} // namespace foo

#endif // LU_HPP

gemm.hpp

// Code extracted from ulmBLAS: https://github.com/michael-lehn/ulmBLAS-core

#ifndef GEMM_HPP
#define GEMM_HPP
#include <algorithm>
#include <cstdlib>

#if defined(_OPENMP)
#include <omp.h>
#endif

namespace foo {

//-- new with alignment --------------------------------------------------------
void *
malloc_(std::size_t alignment, std::size_t size)
{
    alignment = std::max(alignment, alignof(void *));
    size     += alignment;

    void *ptr  = std::malloc(size);
    void *ptr2 = (void *)(((uintptr_t)ptr + alignment) & ~(alignment-1));
    void **vp  = (void**) ptr2 - 1;
    *vp        = ptr;
    return ptr2;
}

void
free_(void *ptr)
{
    std::free(*((void**)ptr-1));
}

//-- Config --------------------------------------------------------------------

// SIMD-Register width in bits
// SSE:         128
// AVX/FMA:     256
// AVX-512:     512
#ifndef SIMD_REGISTER_WIDTH
#define SIMD_REGISTER_WIDTH 256
#endif

#ifdef HAVE_FMA

#   ifndef BS_D_MR
#   define BS_D_MR 4
#   endif

#   ifndef BS_D_NR
#   define BS_D_NR 12
#   endif

#   ifndef BS_D_MC
#   define BS_D_MC 256
#   endif

#   ifndef BS_D_KC
#   define BS_D_KC 512
#   endif

#   ifndef BS_D_NC
#   define BS_D_NC 4092
#   endif

#endif


#ifndef BS_D_MR
#define BS_D_MR 4
#endif

#ifndef BS_D_NR
#define BS_D_NR 8
#endif

#ifndef BS_D_MC
#define BS_D_MC 256
#endif

#ifndef BS_D_KC
#define BS_D_KC 256
#endif

#ifndef BS_D_NC
#define BS_D_NC 4096
#endif

template <typename T>
struct BlockSize
{
    static constexpr int MC = 64;
    static constexpr int KC = 64;
    static constexpr int NC = 256;
    static constexpr int MR = 8;
    static constexpr int NR = 8;

    static constexpr int rwidth = 0;
    static constexpr int align  = alignof(T);
    static constexpr int vlen   = 0;

    static_assert(MC>0 && KC>0 && NC>0 && MR>0 && NR>0, "Invalid block size.");
    static_assert(MC % MR == 0, "MC must be a multiple of MR.");
    static_assert(NC % NR == 0, "NC must be a multiple of NR.");
};


template <>
struct BlockSize<double>
{
    static constexpr int MC     = BS_D_MC;
    static constexpr int KC     = BS_D_KC;
    static constexpr int NC     = BS_D_NC;
    static constexpr int MR     = BS_D_MR;
    static constexpr int NR     = BS_D_NR;

    static constexpr int rwidth = SIMD_REGISTER_WIDTH;
    static constexpr int align  = rwidth / 8;
#if defined(HAVE_AVX) || defined(HAVE_FMA) || defined(HAVE_GCCVEC)
    static constexpr int vlen   = rwidth / (8*sizeof(double));
#else
    static constexpr int vlen   = 0;
#endif

    static_assert(MC>0 && KC>0 && NC>0 && MR>0 && NR>0, "Invalid block size.");
    static_assert(MC % MR == 0, "MC must be a multiple of MR.");
    static_assert(NC % NR == 0, "NC must be a multiple of NR.");
    static_assert(rwidth % sizeof(double) == 0, "SIMD register width not sane.");
};

//-- aux routines --------------------------------------------------------------
template <typename Alpha, typename MX, typename MY>
void
geaxpy(const Alpha &alpha, const MX &X, MY &Y)
{
    assert(X.size1()==Y.size1());
    assert(X.size2()==Y.size2());

    typedef typename MX::size_type  size_type;

    for (size_type j=0; j<X.size2(); ++j) {
        for (size_type i=0; i<X.size1(); ++i) {
            Y(i,j) += alpha*X(i,j);
        }
    }
}

template <typename Alpha, typename MX>
void
gescal(const Alpha &alpha, MX &X)
{
    typedef typename MX::size_type  size_type;

    for (size_type j=0; j<X.size2(); ++j) {
        for (size_type i=0; i<X.size1(); ++i) {
            X(i,j) *= alpha;
        }
    }
}

template <typename Index, typename Alpha, typename TX, typename TY>
void
geaxpy(Index m, Index n,
       const Alpha &alpha,
       const TX *X, Index incRowX, Index incColX,
       TY       *Y, Index incRowY, Index incColY)
{
    for (Index j=0; j<n; ++j) {
        for (Index i=0; i<m; ++i) {
            Y[i*incRowY+j*incColY] += alpha*X[i*incRowX+j*incColX];
        }
    }
}

template <typename Index, typename Alpha, typename TX>
void
gescal(Index m, Index n,
       const Alpha &alpha,
       TX *X, Index incRowX, Index incColX)
{
    for (Index j=0; j<n; ++j) {
        for (Index i=0; i<m; ++i) {
            X[i*incRowX+j*incColX] *= alpha;
        }
    }
}

template <typename IndexType, typename MX, typename MY>
void
gecopy(IndexType m, IndexType n,
       const MX *X, IndexType incRowX, IndexType incColX,
       MY *Y, IndexType incRowY, IndexType incColY)
{
    for (IndexType j=0; j<n; ++j) {
        for (IndexType i=0; i<m; ++i) {
            Y[i*incRowY+j*incColY] = X[i*incRowX+j*incColX];
        }
    }
}

//-- Micro Kernel --------------------------------------------------------------
template <typename Index, typename T>
typename std::enable_if<BlockSize<T>::vlen == 0,
         void>::type
ugemm(Index kc, T alpha, const T *A, const T *B, T beta,
      T *C, Index incRowC, Index incColC)
{
    const Index MR = BlockSize<T>::MR;
    const Index NR = BlockSize<T>::NR;
    T P[BlockSize<T>::MR*BlockSize<T>::NR];

    for (Index l=0; l<MR*NR; ++l) {
        P[l] = 0;
    }
    for (Index l=0; l<kc; ++l) {
        for (Index j=0; j<NR; ++j) {
            for (Index i=0; i<MR; ++i) {
                P[i+j*MR] += A[i+l*MR]*B[l*NR+j];
            }
        }
    }
    for (Index j=0; j<NR; ++j) {
        for (Index i=0; i<MR; ++i) {
            C[i*incRowC+j*incColC] *= beta;
            C[i*incRowC+j*incColC] += alpha*P[i+j*MR];
        }
    }
}

#if defined HAVE_AVX
#include "avx.hpp"
#elif defined HAVE_FMA
#include "fma.hpp"
#elif defined HAVE_GCCVEC
#include "gccvec.hpp"
#endif


#ifndef HAVE_GCCVEC
template <typename T>
void
utrlsm(const T *A, T *B)
{
    typedef std::size_t IndexType;

    const IndexType MR = BlockSize<T>::MR;
    const IndexType NR = BlockSize<T>::NR;

    T   C_[MR*NR];

    for (IndexType i=0; i<MR; ++i) {
        for (IndexType j=0; j<NR; ++j) {
            C_[i+j*MR] = B[i*NR+j];
        }
    }

    for (IndexType i=0; i<MR; ++i) {
        for (IndexType j=0; j<NR; ++j) {
            C_[i+j*MR] *= A[i];
            for (IndexType l=i+1; l<MR; ++l) {
                C_[l+j*MR] -= A[l]*C_[i+j*MR];
            }
        }
        A += MR;
    }
    for (IndexType i=0; i<MR; ++i) {
        for (IndexType j=0; j<NR; ++j) {
            B[i*NR+j] = C_[i+j*MR];
        }
   }
}
#else
template <typename T>
typename std::enable_if<BlockSize<T>::vlen != 0,
         void>::type
utrlsm(const T *A, T *B)
{
    typedef std::size_t IndexType;
    typedef T           vx __attribute__((vector_size(BlockSize<T>::rwidth/8)));

    static constexpr IndexType vlen = BlockSize<T>::vlen;
    static constexpr IndexType MR   = BlockSize<T>::MR;
    static constexpr IndexType NR   = BlockSize<T>::NR/vlen;

    A = (const T*) __builtin_assume_aligned (A, BlockSize<T>::align);
    B = ( T*)      __builtin_assume_aligned (B, BlockSize<T>::align);

    vx C_[MR*NR];
    vx *B_ = (vx *)B;

    for (IndexType i=0; i<MR*NR; ++i) {
        C_[i] = B_[i];
    }

    for (IndexType i=0; i<MR; ++i) {
        for (IndexType j=0; j<NR; ++j) {
            C_[i*NR+j] *= A[i];
        }
        for (IndexType l=i+1; l<MR; ++l) {
            for (IndexType j=0; j<NR; ++j) {
                C_[l*NR+j] -= A[l]*C_[i*NR+j];
            }
        }
        A += MR;
    }
    for (IndexType i=0; i<MR*NR; ++i) {
        B_[i] = C_[i];
    }
}
#endif

//-- Macro Kernel --------------------------------------------------------------
template <typename Index, typename T, typename Beta, typename TC>
void
mgemm(Index mc, Index nc, Index kc, const T &alpha,
      const T *A, const T *B, Beta beta,
      TC *C, Index incRowC, Index incColC)
{
    const Index MR = BlockSize<T>::MR;
    const Index NR = BlockSize<T>::NR;
    const Index mp  = (mc+MR-1) / MR;
    const Index np  = (nc+NR-1) / NR;
    const Index mr_ = mc % MR;
    const Index nr_ = nc % NR;

    #if defined(_OPENMP)
    #pragma omp parallel for
    #endif
    for (Index j=0; j<np; ++j) {
        const Index nr = (j!=np-1 || nr_==0) ? NR : nr_;
        T C_[BlockSize<T>::MR*BlockSize<T>::NR];

        for (Index i=0; i<mp; ++i) {
            const Index mr = (i!=mp-1 || mr_==0) ? MR : mr_;

            if (mr==MR && nr==NR) {
                ugemm(kc, alpha,
                      &A[i*kc*MR], &B[j*kc*NR],
                      beta,
                      &C[i*MR*incRowC+j*NR*incColC],
                      incRowC, incColC);
            } else {
                std::fill_n(C_, MR*NR, T(0));
                ugemm(kc, alpha,
                      &A[i*kc*MR], &B[j*kc*NR],
                      T(0),
                      C_, Index(1), MR);
                gescal(mr, nr, beta,
                       &C[i*MR*incRowC+j*NR*incColC],
                       incRowC, incColC);
                geaxpy(mr, nr, T(1), C_, Index(1), MR,
                       &C[i*MR*incRowC+j*NR*incColC],
                       incRowC, incColC);
            }
        }
    }
}

template <typename IndexType, typename T>
void
mtrlsm(IndexType mc, IndexType nc, const T &alpha, const T *A_, T *B_)
{
    const IndexType MR = BlockSize<T>::MR;
    const IndexType NR = BlockSize<T>::NR;

    const IndexType mp = (mc+MR-1) / MR;
    const IndexType np = (nc+NR-1) / NR;

    #if defined(_OPENMP)
    #pragma omp parallel for
    #endif
    for (IndexType j=0; j<np; ++j) {

        IndexType ia = 0;
        for (IndexType i=0; i<mp; ++i) {
            IndexType kc    = i*MR;

            ugemm(kc,
                  T(-1), &A_[ia*MR*MR], &B_[j*mc*NR],
                  alpha,
                  &B_[(j*mc+kc)*NR], NR, IndexType(1));

            utrlsm(&A_[(ia*MR+kc)*MR], &B_[(j*mc+kc)*NR]);
            ia += i+1;
        }
    }
}

//-- Packing blocks ------------------------------------------------------------
template <typename MA, typename T>
void
pack_A(const MA &A, T *p)
{
    typedef typename MA::size_type  size_type;

    size_type mc = A.size1();
    size_type kc = A.size2();
    size_type MR = BlockSize<T>::MR;
    size_type mp = (mc+MR-1) / MR;

    for (size_type j=0; j<kc; ++j) {
        for (size_type l=0; l<mp; ++l) {
            for (size_type i0=0; i0<MR; ++i0) {
                size_type i  = l*MR + i0;
                size_type nu = l*MR*kc + j*MR + i0;
                p[nu]        = (i<mc) ? A(i,j) : T(0);
            }
        }
    }
}

template <typename MB, typename T>
void
pack_B(const MB &B, T *p)
{
    typedef typename MB::size_type  size_type;

    size_type kc = B.size1();
    size_type nc = B.size2();
    size_type NR = BlockSize<T>::NR;
    size_type np = (nc+NR-1) / NR;

    for (size_type l=0; l<np; ++l) {
        for (size_type j0=0; j0<NR; ++j0) {
            for (size_type i=0; i<kc; ++i) {
                size_type j  = l*NR+j0;
                size_type nu = l*NR*kc + i*NR + j0;
                p[nu]        = (j<nc) ? B(i,j) : T(0);
            }
        }
    }
}

template <typename T, typename MB>
void
unpack_B(const T *p, MB &B)
{
    typedef typename MB::size_type  size_type;

    size_type kc = B.size1();
    size_type nc = B.size2();
    size_type NR = BlockSize<T>::NR;
    size_type np = (nc+NR-1) / NR;

    for (size_type l=0; l<np; ++l) {
        for (size_type j0=0; j0<NR; ++j0) {
            for (size_type i=0; i<kc; ++i) {
                size_type j  = l*NR+j0;
                size_type nu = l*NR*kc + i*NR + j0;
                if (j<nc) {
                    B(i,j) = p[nu];
                }
            }
        }
    }
}

template <typename ML, typename T>
void
pack_L(const ML &L, T *p)
{
    typedef typename ML::size_type  size_type;

    assert(L.size1()==L.size2());

    size_type mc = L.size1();
    size_type MR = BlockSize<T>::MR;
    size_type mp = (mc+MR-1) / MR;

    for (size_type j=0; j<mp; ++j) {
        for (size_type j0=0; j0<MR; ++j0) {
            for (size_type i=j; i<mp; ++i) {
                for (size_type i0=0; i0<MR; ++i0) {
                    size_type I  = i*MR+i0;
                    size_type J  = j*MR+j0;
                    size_type nu = (i+1)*i/2*MR*MR + j*MR*MR + j0*MR +i0;
                    p[nu]        = (I==J) ? T(1)
                                          : (I>=mc || J>=mc) ? T(0)
                                                             : (I>J) ? L(I,J)
                                                                     : T(0);
                }
            }
        }
    }
}

//-- Frame routine -------------------------------------------------------------
template <typename Alpha, typename MatrixA, typename MatrixB,
         typename Beta, typename MatrixC>
void
gemm(Alpha alpha, const MatrixA &A, const MatrixB &B, Beta beta, MatrixC &C)
{
    assert(A.size2()==B.size1());
    namespace ublas = boost::numeric::ublas;

    typedef typename MatrixC::size_type  size_type;
    typedef typename MatrixA::value_type TA;
    typedef typename MatrixB::value_type TB;
    typedef typename MatrixC::value_type TC;
    typedef typename std::common_type<Alpha, TA, TB>::type  T;

    const size_type MC = BlockSize<T>::MC;
    const size_type NC = BlockSize<T>::NC;
    const size_type MR = BlockSize<T>::MR;
    const size_type NR = BlockSize<T>::NR;

    const size_type m = C.size1();
    const size_type n = C.size2();
    const size_type k = A.size2();

    const size_type KC = BlockSize<T>::KC;
    const size_type mb = (m+MC-1) / MC;
    const size_type nb = (n+NC-1) / NC;
    const size_type kb = (k+KC-1) / KC;
    const size_type mc_ = m % MC;
    const size_type nc_ = n % NC;
    const size_type kc_ = k % KC;

    if (m==0 || n==0 || ((alpha==Alpha(0) || k==0) && (beta==Beta(1)))) {
        return;
    }

    TC *C_ = &C(0,0);
    const size_type incRowC = &C(1,0) - &C(0,0);
    const size_type incColC = &C(0,1) - &C(0,0);

    T *A_ = (T*) malloc_(BlockSize<T>::align, sizeof(T)*(MC*KC+MR));
    T *B_ = (T*) malloc_(BlockSize<T>::align, sizeof(T)*(KC*NC+NR));

    if (alpha==Alpha(0) || k==0) {
        gescal(beta, C);
        return;
    }

    for (size_type j=0; j<nb; ++j) {
        size_type nc = (j!=nb-1 || nc_==0) ? NC : nc_;

        for (size_type l=0; l<kb; ++l) {
            size_type kc = (l!=kb-1 || kc_==0) ? KC : kc_;
            Beta beta_   = (l==0) ? beta : Beta(1);

            const auto Bs = subrange(B, l*KC, l*KC+kc, j*NC, j*NC+nc);
            pack_B(Bs, B_);

            for (size_type i=0; i<mb; ++i) {
                size_type mc = (i!=mb-1 || mc_==0) ? MC : mc_;

                const auto As = subrange(A, i*MC, i*MC+mc, l*KC, l*KC+kc);
                pack_A(As, A_);

                mgemm(mc, nc, kc,
                      T(alpha), A_, B_, beta_,
                      &C_[i*MC*incRowC+j*NC*incColC],
                      incRowC, incColC);
            }
        }
    }
    free_(A_);
    free_(B_);
}

template <typename Alpha, typename MatrixA, typename MatrixB>
void
trlsm(const Alpha   &alpha, bool unitDiag, const MatrixA &A, MatrixB &B)
{
    assert(A.size2()==A.size1());
    namespace ublas = boost::numeric::ublas;

    typedef typename MatrixA::size_type  size_type;
    typedef typename MatrixA::value_type TA;
    typedef typename MatrixB::value_type TB;

    typedef typename std::common_type<Alpha, TA, TB>::type   T_;
    typedef typename std::remove_const<T_>::type             T;

    const size_type MC = BlockSize<T>::MC;
    const size_type NC = BlockSize<T>::NC;
    const size_type MR = BlockSize<T>::MR;
    const size_type NR = BlockSize<T>::NR;

    const size_type m = B.size1();
    const size_type n = B.size2();

    const size_type mb = (m+MC-1) / MC;
    const size_type nb = (n+NC-1) / NC;
    const size_type mc_ = m % MC;
    const size_type nc_ = n % NC;

    if (m==0 || n==0) {
        return;
    }

    const size_type incRowB = &B(1,0) - &B(0,0);
    const size_type incColB = &B(0,1) - &B(0,0);

    if (alpha==Alpha(0)) {
        gescal(Alpha(0), B);
        return;
    }

    T *A_ = (T*) malloc_(BlockSize<T>::align, sizeof(T)*(MC*MC+MR));
    T *B_ = (T*) malloc_(BlockSize<T>::align, sizeof(T)*(MC*NC+NR));

    for (size_type j=0; j<nb; ++j) {
        size_type nc = (j!=nb-1 || nc_==0) ? NC : nc_;

        for (size_type i=0; i<mb; ++i) {
            size_type mc  = (i!=mb-1 || mc_==0) ? MC : mc_;
            Alpha  alpha_ = (i==0) ? alpha : Alpha(1);

            auto Bs = subrange(B, i*MC, i*MC+mc, j*NC, j*NC+nc);
            pack_B(Bs, B_);

            const auto Ls = subrange(A, i*MC, i*MC+mc, i*MC, i*MC+mc);
            pack_L(Ls, A_);

            mtrlsm(mc, nc, T(alpha_), A_, B_);
            unpack_B(B_, Bs);

            for (size_type l=i+1; l<mb; ++l) {
                mc  = (l!=mb-1 || mc_==0) ? MC : mc_;

                const auto As = subrange(A, l*MC, l*MC+mc, i*MC, i*MC+mc);
                pack_A(As, A_);

                mgemm(mc, nc, MC, T(-1), A_, B_, alpha_,
                      &B(l*MC,j*NC), incRowB, incColB);
            }
        }
    }

    free_(A_);
    free_(B_);
}

} // namespace foo

#endif

bench_lu.cc

#include <cassert>
#include <chrono>
#include <cmath>
#include <limits>
#include <random>
#include <type_traits>
#include <boost/timer.hpp>
#include <boost/numeric/ublas/matrix.hpp>
#include <boost/numeric/ublas/io.hpp>
#include <boost/numeric/ublas/lu.hpp>
#include <boost/numeric/ublas/operation.hpp>
#include "gemm.hpp"
#include "lu.hpp"

template <typename T>
struct WallTime
{
    void
    tic()
    {
        t0 = std::chrono::high_resolution_clock::now();
    }

    T
    toc()
    {
        using namespace std::chrono;

        elapsed = high_resolution_clock::now() - t0;
        return duration<T,seconds::period>(elapsed).count();
    }

    std::chrono::high_resolution_clock::time_point t0;
    std::chrono::high_resolution_clock::duration   elapsed;
};

//
// For checking I don't care for efficiency of elegance ...
//
template <typename MatrixA1, typename MatrixA2, typename MatrixP>
double
checkLU(const MatrixA1 &A1, MatrixA2 &A2, const MatrixP &P)
{
    namespace ublas = boost::numeric::ublas;

    typedef typename MatrixA1::size_type    size_type;
    typedef typename MatrixA1::value_type   value_type;

    size_type m = A1.size1();
    size_type n = A1.size2();

    ublas::matrix<value_type>     L(m,m);

    // Copy L form A2 and set A2 to U
    for (size_type j=0; j<m; ++j) {
        for (size_type i=0; i<m; ++i) {
            if (i==j) {
                L(i,i) = 1;
            } else if (i>j) {
                L(i,j)  = A2(i,j);
                A2(i,j) = 0;
            } else if (j>i) {
                L(i,j)  = 0;
            }
        }
    }
    ublas::matrix<value_type>   LU(m,n);
    ublas::axpy_prod(L, A2, LU, true);

    // LU <- inv(P)*L*U
    foo::swap_rows(P, LU, true);

    double diff = 0;
    for (size_type j=0; j<n; ++j) {
        for (size_type i=0; i<m; ++i) {
            diff += std::abs(A1(i,j)-LU(i,j));
        }
    }
    return diff;
}

// fill rectangular matrix with random values
template <typename MATRIX>
void
fill(MATRIX &A)
{
    typedef typename MATRIX::size_type  size_type;
    typedef typename MATRIX::value_type T;

    std::random_device                  random;
    std::default_random_engine          mt(random());
    std::uniform_real_distribution<T>   uniform(-100,100);

    for (size_type i=0; i<A.size1(); ++i) {
        for (size_type j=0; j<A.size2(); ++j) {
            A(i,j) = uniform(mt);
        }
    }
}

#ifndef M_MAX
#define M_MAX 4000
#endif

#ifndef N_MAX
#define N_MAX 4000
#endif

int
main()
{
    namespace ublas = boost::numeric::ublas;

    const std::size_t m_min = 50;
    const std::size_t n_min = 50;

    const std::size_t m_max = M_MAX;
    const std::size_t n_max = N_MAX;

    const std::size_t m_inc = 50;
    const std::size_t n_inc = 50;

    typedef double              T;
    typedef ublas::row_major    SO;

    WallTime<double>  walltime;

    for (std::size_t m=m_min, n=n_min;
         m<=m_max && n<=n_max;
         m+=m_inc, n+=n_inc)
    {
        ublas::matrix<T,SO>             A_org(m, n);
        ublas::matrix<T,SO>             A1(m, n);
        ublas::matrix<T,SO>             A2(m, n);
        ublas::matrix<T,SO>             A3(m, n);
        ublas::permutation_matrix<T>    P1(m), P2(m), P3(m);

        fill(A_org);
        A1 = A_org;
        A2 = A_org;
        A3 = A_org;

        double      diff, t, mflops;
        std::size_t info;

        std::size_t minMN = std::min(m,n);
        std::size_t maxMN = std::max(m,n);

        mflops  = minMN*minMN*maxMN -(minMN*minMN*minMN/3.) -(minMN*minMN/2.);
        mflops /= 1000000;

        walltime.tic();
        info = ublas::lu_factorize(A1, P1);
        t = walltime.toc();

        diff   = checkLU(A_org, A1, P1);
        std::printf("%4ld %4ld %4ld %16.3e %16.5lf %16.5lf",
                    m, n, info, diff, t, mflops/t);

        walltime.tic();
        info = foo::lu_blocked(A2, P2);
        t = walltime.toc();

        diff = checkLU(A_org, A2, P2);
        std::printf("%4ld %16.3e %16.5lf %16.5lf",
                    info, diff, t, mflops/t);

        walltime.tic();
        info = foo::lu_blocked_recursive(A3, P3);
        t = walltime.toc();


        diff = checkLU(A_org, A3, P3);
        std::printf("%4ld %16.3e %16.5lf %16.5lf\n",
                    info, diff, t, mflops/t);

    }
}