Content |
LU Factorization
The performance of solving a system of linear equations depends on the performance of the matrix-matrix product.
What's new:
-
Triangular solver for \(B \leftarrow L^{-1} B\). Currently this is for the special case where \(L\) is a lower triangular matrix with unit diagonal. But this can easily be generalized. Note that the performance of the triangular solver merely depends on the performance of the GEMM micro kernel.
-
Two implementations for the LU factorization:
-
lu_blocked is basically the algorithm of LAPACK function dgetrf.
-
lu_blocked_recursive applies the factorization recursively.
Both variants exploit the performance of the matrix-matrix product and the triangular solver for matrix equations.
-
Compile and Run Benchmark
$shell> g++-5.3 -Wall -DNDEBUG -mavx -Ofast -std=c++11 -I ../../boost_1_60_0/ -DM_MAX=500 -DHAVE_AVX -DHAVE_GCCVEC bench_lu.cc $shell> ./a.out 50 50 0 3.858e-11 0.00007 1257.01889 0 3.858e-11 0.00007 1262.41266 0 3.148e-11 0.00014 574.16190 100 100 0 2.704e-10 0.00035 1891.03810 0 1.852e-10 0.00039 1708.88958 0 2.017e-10 0.00038 1759.34469 150 150 0 8.315e-10 0.00102 2188.39717 0 5.930e-10 0.00096 2332.16729 0 5.970e-10 0.00081 2757.47181 200 200 0 1.833e-09 0.00109 4888.77725 0 1.466e-09 0.00078 6782.49673 0 1.492e-09 0.00070 7579.86438 250 250 0 3.335e-09 0.00216 4812.57506 0 2.929e-09 0.00137 7571.10183 0 3.030e-09 0.00122 8507.56694 300 300 0 5.387e-09 0.00388 4627.15277 0 5.214e-09 0.00211 8520.78992 0 4.064e-09 0.00186 9649.06530 350 350 0 8.423e-09 0.00610 4679.07581 0 8.603e-09 0.00330 8652.79651 0 7.163e-09 0.00282 10113.78357 400 400 0 1.221e-08 0.00923 4614.49949 0 1.292e-08 0.00448 9506.99858 0 1.080e-08 0.00398 10713.10129 450 450 0 1.715e-08 0.01302 4659.90448 0 1.923e-08 0.00637 9519.13039 0 1.626e-08 0.00540 11239.84015 500 500 0 2.273e-08 0.01807 4605.68407 0 2.650e-08 0.00819 10155.92090 0 2.298e-08 0.00714 11659.91454 $shell>
Benchmark (Single Threaded)
$shell> g++-5.3 -Wall -DNDEBUG -mavx -Ofast -std=c++11 -I ../../boost_1_60_0/ -DHAVE_AVX -DHAVE_GCCVEC bench_lu.cc $shell> ./a.out > report.session7.lu $shell> gnuplot plot.session7.lu $shell> gnuplot plot.session7.lu.log $shell> gnuplot plot.session7.lu.mflops $shell>
Time (Effectiveness)
For the LU-factorization we are primarily interested how long it takes to get the factorization:
Using a log-sclaing of both axis shows, that the complexity is still of order \(N^3\):
MFLOPS (Efficiency)
We are also interested how fast each algorithm reaches its peak performance:
Benchmark (Multi Threaded)
Test with 2 threads:
$shell> g++-5.3 -fopenmp -Wall -DNDEBUG -mavx -Ofast -std=c++11 -I ../../boost_1_60_0/ -DHAVE_AVX -DHAVE_GCCVEC bench_lu.cc $shell> export OMP_NUM_THREADS=2; ./a.out > report.session7-mt2.lu $shell>
Test with 4 threads:
$shell> g++-5.3 -fopenmp -Wall -DNDEBUG -mavx -Ofast -std=c++11 -I ../../boost_1_60_0/ -DHAVE_AVX -DHAVE_GCCVEC bench_lu.cc $shell> export OMP_NUM_THREADS=4; ./a.out > report.session7-mt4.lu $shell> gnuplot plot.session7-mt.lu $shell> gnuplot plot.session7-mt.lu.log $shell> gnuplot plot.session7-mt.lu.mflops $shell>
Time (Effectiveness)
MFLOPS (Efficiency)
Source Code
The tar-ball session7.tgz contains the files:
$shell> tar tfvz session7.tgz -rw-rw-r-- lehn/num 4301 2016-02-11 16:34 session7/bench_lu.cc -rw-rw-r-- lehn/num 13537 2016-02-02 16:05 session7/avx.hpp -rw-rw-r-- lehn/num 24390 2016-02-02 16:05 session7/fma.hpp -rw-rw-r-- lehn/num 1537 2016-02-12 21:01 session7/gccvec.hpp -rw-rw-r-- lehn/num 3353 2016-02-02 16:05 session7/gccvec2.hpp -rw-rw-r-- lehn/num 16946 2016-02-12 21:27 session7/gemm.hpp -rw-rw-r-- lehn/num 6221 2016-02-10 13:15 session7/lu.hpp $shell>
lu.hpp
#ifndef LU_HPP #define LU_HPP 1 #include <boost/numeric/ublas/operation.hpp> #include <boost/numeric/ublas/vector_proxy.hpp> #include <boost/numeric/ublas/matrix_proxy.hpp> #include <boost/numeric/ublas/vector.hpp> #include <boost/numeric/ublas/triangular.hpp> #include "gemm.hpp" namespace foo { template <typename MP, typename MA> void swap_rows(const MP &P, MA &A, bool reverse=false) { typedef typename MP::size_type size_type; size_type size = P.size(); if (!reverse) { for (size_type i=0; i<size; ++i) { if (i!=P(i)) { row(A, i).swap(row(A, P(i))); } } } else { for (size_type I=size; I>=1; --I) { size_type i = I-1; if (i!=P(i)) { row(A, i).swap(row(A, P(i))); } } } } // Unblocked LU factorization with partial pivoting template <typename MA, typename MP> typename MA::size_type lu_unblocked(MA &A, MP &P) { namespace ublas = boost::numeric::ublas; using ublas::range; using ublas::matrix_column; using ublas::matrix_row; typedef typename MA::size_type size_type; typedef typename MA::value_type value_type; size_type singular = 0; size_type m = A.size1 (); size_type n = A.size2 (); size_type mn = std::min(m, n); for (size_type i=0; i<mn; ++ i) { matrix_column<MA> A_i(column(A,i)); matrix_row<MA> Ai_(row(A,i)); P(i) = i + index_norm_inf(project(A_i, range(i, m))); if (A(P(i),i) != value_type()) { if (P(i)!=i) { row(A, P(i)).swap (Ai_); } } else { singular = i+1; } value_type alpha = value_type(1)/A(i,i); project (A_i, range(i+1, m)) *= alpha; project (A, range(i+1, m), range (i+1, n)).minus_assign ( outer_prod (project(A_i, range(i+1, m)), project(Ai_, range(i+1, n)))); } return singular; } // Blocked LU factorization with partial pivoting template <typename MA, typename MP> typename MA::size_type lu_blocked(MA &A, MP &P) { namespace ublas = boost::numeric::ublas; using ublas::range; using ublas::matrix_column; using ublas::matrix_row; typedef typename MA::size_type size_type; typedef typename MA::value_type value_type; size_type singular = 0; size_type singular_ = 0; size_type m = A.size1 (); size_type n = A.size2 (); size_type mn = std::min(m, n); size_type bs = 64; if (bs>=mn) { singular = lu_unblocked(A, P); } else { for (size_type j=0; j<mn; j+=bs) { auto jb = std::min(mn-j, bs); auto A_ = project(A, range(j,m), range(j,j+jb)); auto P_ = project(P, range(j,m)); singular_ = lu_unblocked(A_, P_); if (singular==0 && singular_>0) { singular = singular_ + j; } auto A_left = project(A, range(j,m), range(0,j)); foo::swap_rows(project(P, range(j,j+jb)), A_left); if (j+jb<=n) { auto A_right = project(A, range(j,m), range(j+jb,n)); foo::swap_rows(project(P, range(j,j+jb)), A_right); const auto L = project(A, range(j,j+jb), range(j,j+jb)); auto U_right = project(A, range(j,j+jb), range(j+jb,n)); //inplace_solve(L, U_right, ublas::unit_lower_tag()); trlsm(value_type(1), true, L, U_right); if (j+jb<=m) { auto A_ = project(A, range(j+jb,m), range(j+jb,n)); gemm(value_type(-1), project(A, range(j+jb,m), range(j,j+jb)), project(A, range(j,j+jb), range(j+jb,n)), value_type(1), A_); } } for (size_type i=j; i<std::min(m, j+jb); ++i) { P(i) += j; } } } return singular; } // Blocked recursive LU factorization with partial pivoting template <typename MA, typename MP> typename MA::size_type lu_blocked_recursive(MA &A, MP &P) { namespace ublas = boost::numeric::ublas; using ublas::range; using ublas::matrix_column; using ublas::matrix_row; typedef typename MA::size_type size_type; typedef typename MA::value_type value_type; size_type singular = 0; size_type singular_ = 0; size_type m = A.size1(); size_type n = A.size2(); size_type mn = std::min(m, n); size_type bs = 8; if (bs>=mn) { singular = lu_unblocked(A, P); } else { size_type k; for (k=1; k<mn/4; k*=2); auto A_left = project(A, range(0,m), range(0,k)); singular_ = lu_blocked_recursive(A_left, P); if (singular==0 && singular_>0) { singular = singular_; } auto A_right = project(A, range(0,m), range(k,n)); auto mk = std::min(m, k); foo::swap_rows(project(P, range(0,mk)), A_right); const auto L = project(A, range(0,mk), range(0,mk)); auto U_right = project(A, range(0,mk), range(mk,n)); //inplace_solve(L, U_right, ublas::unit_lower_tag()); trlsm(value_type(1), true, L, U_right); auto A_ = project(A, range(mk,m), range(mk,n)); auto P_ = project(P, range(mk,m)); gemm(value_type(-1), project(A, range(mk,m), range(0,mk)), project(A, range(0,mk), range(mk,n)), value_type(1), A_); //std::cout << std::endl; //std::cout << "M = " << m << ", N = " << n << ", K = " << k << std::endl; //std::cout << "m = " << (m-mk) << ", n = " << (n-mk) << ", k = " << mk << std::endl; singular_ = lu_blocked_recursive(A_, P_); if (singular==0 && singular_>0) { singular = singular_ + mk; } auto A_left_bottom = project(A, range(mk,m), range(0,k)); foo::swap_rows(project(P, range(mk,mn)), A_left_bottom); for (size_type i=mk; i<mn; ++i) { P(i) += mk; } } return singular; } } // namespace foo #endif // LU_HPP
gemm.hpp
// Code extracted from ulmBLAS: https://github.com/michael-lehn/ulmBLAS-core #ifndef GEMM_HPP #define GEMM_HPP #include <algorithm> #include <cstdlib> #if defined(_OPENMP) #include <omp.h> #endif namespace foo { //-- new with alignment -------------------------------------------------------- void * malloc_(std::size_t alignment, std::size_t size) { alignment = std::max(alignment, alignof(void *)); size += alignment; void *ptr = std::malloc(size); void *ptr2 = (void *)(((uintptr_t)ptr + alignment) & ~(alignment-1)); void **vp = (void**) ptr2 - 1; *vp = ptr; return ptr2; } void free_(void *ptr) { std::free(*((void**)ptr-1)); } //-- Config -------------------------------------------------------------------- // SIMD-Register width in bits // SSE: 128 // AVX/FMA: 256 // AVX-512: 512 #ifndef SIMD_REGISTER_WIDTH #define SIMD_REGISTER_WIDTH 256 #endif #ifdef HAVE_FMA # ifndef BS_D_MR # define BS_D_MR 4 # endif # ifndef BS_D_NR # define BS_D_NR 12 # endif # ifndef BS_D_MC # define BS_D_MC 256 # endif # ifndef BS_D_KC # define BS_D_KC 512 # endif # ifndef BS_D_NC # define BS_D_NC 4092 # endif #endif #ifndef BS_D_MR #define BS_D_MR 4 #endif #ifndef BS_D_NR #define BS_D_NR 8 #endif #ifndef BS_D_MC #define BS_D_MC 256 #endif #ifndef BS_D_KC #define BS_D_KC 256 #endif #ifndef BS_D_NC #define BS_D_NC 4096 #endif template <typename T> struct BlockSize { static constexpr int MC = 64; static constexpr int KC = 64; static constexpr int NC = 256; static constexpr int MR = 8; static constexpr int NR = 8; static constexpr int rwidth = 0; static constexpr int align = alignof(T); static constexpr int vlen = 0; static_assert(MC>0 && KC>0 && NC>0 && MR>0 && NR>0, "Invalid block size."); static_assert(MC % MR == 0, "MC must be a multiple of MR."); static_assert(NC % NR == 0, "NC must be a multiple of NR."); }; template <> struct BlockSize<double> { static constexpr int MC = BS_D_MC; static constexpr int KC = BS_D_KC; static constexpr int NC = BS_D_NC; static constexpr int MR = BS_D_MR; static constexpr int NR = BS_D_NR; static constexpr int rwidth = SIMD_REGISTER_WIDTH; static constexpr int align = rwidth / 8; #if defined(HAVE_AVX) || defined(HAVE_FMA) || defined(HAVE_GCCVEC) static constexpr int vlen = rwidth / (8*sizeof(double)); #else static constexpr int vlen = 0; #endif static_assert(MC>0 && KC>0 && NC>0 && MR>0 && NR>0, "Invalid block size."); static_assert(MC % MR == 0, "MC must be a multiple of MR."); static_assert(NC % NR == 0, "NC must be a multiple of NR."); static_assert(rwidth % sizeof(double) == 0, "SIMD register width not sane."); }; //-- aux routines -------------------------------------------------------------- template <typename Alpha, typename MX, typename MY> void geaxpy(const Alpha &alpha, const MX &X, MY &Y) { assert(X.size1()==Y.size1()); assert(X.size2()==Y.size2()); typedef typename MX::size_type size_type; for (size_type j=0; j<X.size2(); ++j) { for (size_type i=0; i<X.size1(); ++i) { Y(i,j) += alpha*X(i,j); } } } template <typename Alpha, typename MX> void gescal(const Alpha &alpha, MX &X) { typedef typename MX::size_type size_type; for (size_type j=0; j<X.size2(); ++j) { for (size_type i=0; i<X.size1(); ++i) { X(i,j) *= alpha; } } } template <typename Index, typename Alpha, typename TX, typename TY> void geaxpy(Index m, Index n, const Alpha &alpha, const TX *X, Index incRowX, Index incColX, TY *Y, Index incRowY, Index incColY) { for (Index j=0; j<n; ++j) { for (Index i=0; i<m; ++i) { Y[i*incRowY+j*incColY] += alpha*X[i*incRowX+j*incColX]; } } } template <typename Index, typename Alpha, typename TX> void gescal(Index m, Index n, const Alpha &alpha, TX *X, Index incRowX, Index incColX) { for (Index j=0; j<n; ++j) { for (Index i=0; i<m; ++i) { X[i*incRowX+j*incColX] *= alpha; } } } template <typename IndexType, typename MX, typename MY> void gecopy(IndexType m, IndexType n, const MX *X, IndexType incRowX, IndexType incColX, MY *Y, IndexType incRowY, IndexType incColY) { for (IndexType j=0; j<n; ++j) { for (IndexType i=0; i<m; ++i) { Y[i*incRowY+j*incColY] = X[i*incRowX+j*incColX]; } } } //-- Micro Kernel -------------------------------------------------------------- template <typename Index, typename T> typename std::enable_if<BlockSize<T>::vlen == 0, void>::type ugemm(Index kc, T alpha, const T *A, const T *B, T beta, T *C, Index incRowC, Index incColC) { const Index MR = BlockSize<T>::MR; const Index NR = BlockSize<T>::NR; T P[BlockSize<T>::MR*BlockSize<T>::NR]; for (Index l=0; l<MR*NR; ++l) { P[l] = 0; } for (Index l=0; l<kc; ++l) { for (Index j=0; j<NR; ++j) { for (Index i=0; i<MR; ++i) { P[i+j*MR] += A[i+l*MR]*B[l*NR+j]; } } } for (Index j=0; j<NR; ++j) { for (Index i=0; i<MR; ++i) { C[i*incRowC+j*incColC] *= beta; C[i*incRowC+j*incColC] += alpha*P[i+j*MR]; } } } #if defined HAVE_AVX #include "avx.hpp" #elif defined HAVE_FMA #include "fma.hpp" #elif defined HAVE_GCCVEC #include "gccvec.hpp" #endif #ifndef HAVE_GCCVEC template <typename T> void utrlsm(const T *A, T *B) { typedef std::size_t IndexType; const IndexType MR = BlockSize<T>::MR; const IndexType NR = BlockSize<T>::NR; T C_[MR*NR]; for (IndexType i=0; i<MR; ++i) { for (IndexType j=0; j<NR; ++j) { C_[i+j*MR] = B[i*NR+j]; } } for (IndexType i=0; i<MR; ++i) { for (IndexType j=0; j<NR; ++j) { C_[i+j*MR] *= A[i]; for (IndexType l=i+1; l<MR; ++l) { C_[l+j*MR] -= A[l]*C_[i+j*MR]; } } A += MR; } for (IndexType i=0; i<MR; ++i) { for (IndexType j=0; j<NR; ++j) { B[i*NR+j] = C_[i+j*MR]; } } } #else template <typename T> typename std::enable_if<BlockSize<T>::vlen != 0, void>::type utrlsm(const T *A, T *B) { typedef std::size_t IndexType; typedef T vx __attribute__((vector_size(BlockSize<T>::rwidth/8))); static constexpr IndexType vlen = BlockSize<T>::vlen; static constexpr IndexType MR = BlockSize<T>::MR; static constexpr IndexType NR = BlockSize<T>::NR/vlen; A = (const T*) __builtin_assume_aligned (A, BlockSize<T>::align); B = ( T*) __builtin_assume_aligned (B, BlockSize<T>::align); vx C_[MR*NR]; vx *B_ = (vx *)B; for (IndexType i=0; i<MR*NR; ++i) { C_[i] = B_[i]; } for (IndexType i=0; i<MR; ++i) { for (IndexType j=0; j<NR; ++j) { C_[i*NR+j] *= A[i]; } for (IndexType l=i+1; l<MR; ++l) { for (IndexType j=0; j<NR; ++j) { C_[l*NR+j] -= A[l]*C_[i*NR+j]; } } A += MR; } for (IndexType i=0; i<MR*NR; ++i) { B_[i] = C_[i]; } } #endif //-- Macro Kernel -------------------------------------------------------------- template <typename Index, typename T, typename Beta, typename TC> void mgemm(Index mc, Index nc, Index kc, const T &alpha, const T *A, const T *B, Beta beta, TC *C, Index incRowC, Index incColC) { const Index MR = BlockSize<T>::MR; const Index NR = BlockSize<T>::NR; const Index mp = (mc+MR-1) / MR; const Index np = (nc+NR-1) / NR; const Index mr_ = mc % MR; const Index nr_ = nc % NR; #if defined(_OPENMP) #pragma omp parallel for #endif for (Index j=0; j<np; ++j) { const Index nr = (j!=np-1 || nr_==0) ? NR : nr_; T C_[BlockSize<T>::MR*BlockSize<T>::NR]; for (Index i=0; i<mp; ++i) { const Index mr = (i!=mp-1 || mr_==0) ? MR : mr_; if (mr==MR && nr==NR) { ugemm(kc, alpha, &A[i*kc*MR], &B[j*kc*NR], beta, &C[i*MR*incRowC+j*NR*incColC], incRowC, incColC); } else { std::fill_n(C_, MR*NR, T(0)); ugemm(kc, alpha, &A[i*kc*MR], &B[j*kc*NR], T(0), C_, Index(1), MR); gescal(mr, nr, beta, &C[i*MR*incRowC+j*NR*incColC], incRowC, incColC); geaxpy(mr, nr, T(1), C_, Index(1), MR, &C[i*MR*incRowC+j*NR*incColC], incRowC, incColC); } } } } template <typename IndexType, typename T> void mtrlsm(IndexType mc, IndexType nc, const T &alpha, const T *A_, T *B_) { const IndexType MR = BlockSize<T>::MR; const IndexType NR = BlockSize<T>::NR; const IndexType mp = (mc+MR-1) / MR; const IndexType np = (nc+NR-1) / NR; #if defined(_OPENMP) #pragma omp parallel for #endif for (IndexType j=0; j<np; ++j) { IndexType ia = 0; for (IndexType i=0; i<mp; ++i) { IndexType kc = i*MR; ugemm(kc, T(-1), &A_[ia*MR*MR], &B_[j*mc*NR], alpha, &B_[(j*mc+kc)*NR], NR, IndexType(1)); utrlsm(&A_[(ia*MR+kc)*MR], &B_[(j*mc+kc)*NR]); ia += i+1; } } } //-- Packing blocks ------------------------------------------------------------ template <typename MA, typename T> void pack_A(const MA &A, T *p) { typedef typename MA::size_type size_type; size_type mc = A.size1(); size_type kc = A.size2(); size_type MR = BlockSize<T>::MR; size_type mp = (mc+MR-1) / MR; for (size_type j=0; j<kc; ++j) { for (size_type l=0; l<mp; ++l) { for (size_type i0=0; i0<MR; ++i0) { size_type i = l*MR + i0; size_type nu = l*MR*kc + j*MR + i0; p[nu] = (i<mc) ? A(i,j) : T(0); } } } } template <typename MB, typename T> void pack_B(const MB &B, T *p) { typedef typename MB::size_type size_type; size_type kc = B.size1(); size_type nc = B.size2(); size_type NR = BlockSize<T>::NR; size_type np = (nc+NR-1) / NR; for (size_type l=0; l<np; ++l) { for (size_type j0=0; j0<NR; ++j0) { for (size_type i=0; i<kc; ++i) { size_type j = l*NR+j0; size_type nu = l*NR*kc + i*NR + j0; p[nu] = (j<nc) ? B(i,j) : T(0); } } } } template <typename T, typename MB> void unpack_B(const T *p, MB &B) { typedef typename MB::size_type size_type; size_type kc = B.size1(); size_type nc = B.size2(); size_type NR = BlockSize<T>::NR; size_type np = (nc+NR-1) / NR; for (size_type l=0; l<np; ++l) { for (size_type j0=0; j0<NR; ++j0) { for (size_type i=0; i<kc; ++i) { size_type j = l*NR+j0; size_type nu = l*NR*kc + i*NR + j0; if (j<nc) { B(i,j) = p[nu]; } } } } } template <typename ML, typename T> void pack_L(const ML &L, T *p) { typedef typename ML::size_type size_type; assert(L.size1()==L.size2()); size_type mc = L.size1(); size_type MR = BlockSize<T>::MR; size_type mp = (mc+MR-1) / MR; for (size_type j=0; j<mp; ++j) { for (size_type j0=0; j0<MR; ++j0) { for (size_type i=j; i<mp; ++i) { for (size_type i0=0; i0<MR; ++i0) { size_type I = i*MR+i0; size_type J = j*MR+j0; size_type nu = (i+1)*i/2*MR*MR + j*MR*MR + j0*MR +i0; p[nu] = (I==J) ? T(1) : (I>=mc || J>=mc) ? T(0) : (I>J) ? L(I,J) : T(0); } } } } } //-- Frame routine ------------------------------------------------------------- template <typename Alpha, typename MatrixA, typename MatrixB, typename Beta, typename MatrixC> void gemm(Alpha alpha, const MatrixA &A, const MatrixB &B, Beta beta, MatrixC &C) { assert(A.size2()==B.size1()); namespace ublas = boost::numeric::ublas; typedef typename MatrixC::size_type size_type; typedef typename MatrixA::value_type TA; typedef typename MatrixB::value_type TB; typedef typename MatrixC::value_type TC; typedef typename std::common_type<Alpha, TA, TB>::type T; const size_type MC = BlockSize<T>::MC; const size_type NC = BlockSize<T>::NC; const size_type MR = BlockSize<T>::MR; const size_type NR = BlockSize<T>::NR; const size_type m = C.size1(); const size_type n = C.size2(); const size_type k = A.size2(); const size_type KC = BlockSize<T>::KC; const size_type mb = (m+MC-1) / MC; const size_type nb = (n+NC-1) / NC; const size_type kb = (k+KC-1) / KC; const size_type mc_ = m % MC; const size_type nc_ = n % NC; const size_type kc_ = k % KC; if (m==0 || n==0 || ((alpha==Alpha(0) || k==0) && (beta==Beta(1)))) { return; } TC *C_ = &C(0,0); const size_type incRowC = &C(1,0) - &C(0,0); const size_type incColC = &C(0,1) - &C(0,0); T *A_ = (T*) malloc_(BlockSize<T>::align, sizeof(T)*(MC*KC+MR)); T *B_ = (T*) malloc_(BlockSize<T>::align, sizeof(T)*(KC*NC+NR)); if (alpha==Alpha(0) || k==0) { gescal(beta, C); return; } for (size_type j=0; j<nb; ++j) { size_type nc = (j!=nb-1 || nc_==0) ? NC : nc_; for (size_type l=0; l<kb; ++l) { size_type kc = (l!=kb-1 || kc_==0) ? KC : kc_; Beta beta_ = (l==0) ? beta : Beta(1); const auto Bs = subrange(B, l*KC, l*KC+kc, j*NC, j*NC+nc); pack_B(Bs, B_); for (size_type i=0; i<mb; ++i) { size_type mc = (i!=mb-1 || mc_==0) ? MC : mc_; const auto As = subrange(A, i*MC, i*MC+mc, l*KC, l*KC+kc); pack_A(As, A_); mgemm(mc, nc, kc, T(alpha), A_, B_, beta_, &C_[i*MC*incRowC+j*NC*incColC], incRowC, incColC); } } } free_(A_); free_(B_); } template <typename Alpha, typename MatrixA, typename MatrixB> void trlsm(const Alpha &alpha, bool unitDiag, const MatrixA &A, MatrixB &B) { assert(A.size2()==A.size1()); namespace ublas = boost::numeric::ublas; typedef typename MatrixA::size_type size_type; typedef typename MatrixA::value_type TA; typedef typename MatrixB::value_type TB; typedef typename std::common_type<Alpha, TA, TB>::type T_; typedef typename std::remove_const<T_>::type T; const size_type MC = BlockSize<T>::MC; const size_type NC = BlockSize<T>::NC; const size_type MR = BlockSize<T>::MR; const size_type NR = BlockSize<T>::NR; const size_type m = B.size1(); const size_type n = B.size2(); const size_type mb = (m+MC-1) / MC; const size_type nb = (n+NC-1) / NC; const size_type mc_ = m % MC; const size_type nc_ = n % NC; if (m==0 || n==0) { return; } const size_type incRowB = &B(1,0) - &B(0,0); const size_type incColB = &B(0,1) - &B(0,0); if (alpha==Alpha(0)) { gescal(Alpha(0), B); return; } T *A_ = (T*) malloc_(BlockSize<T>::align, sizeof(T)*(MC*MC+MR)); T *B_ = (T*) malloc_(BlockSize<T>::align, sizeof(T)*(MC*NC+NR)); for (size_type j=0; j<nb; ++j) { size_type nc = (j!=nb-1 || nc_==0) ? NC : nc_; for (size_type i=0; i<mb; ++i) { size_type mc = (i!=mb-1 || mc_==0) ? MC : mc_; Alpha alpha_ = (i==0) ? alpha : Alpha(1); auto Bs = subrange(B, i*MC, i*MC+mc, j*NC, j*NC+nc); pack_B(Bs, B_); const auto Ls = subrange(A, i*MC, i*MC+mc, i*MC, i*MC+mc); pack_L(Ls, A_); mtrlsm(mc, nc, T(alpha_), A_, B_); unpack_B(B_, Bs); for (size_type l=i+1; l<mb; ++l) { mc = (l!=mb-1 || mc_==0) ? MC : mc_; const auto As = subrange(A, l*MC, l*MC+mc, i*MC, i*MC+mc); pack_A(As, A_); mgemm(mc, nc, MC, T(-1), A_, B_, alpha_, &B(l*MC,j*NC), incRowB, incColB); } } } free_(A_); free_(B_); } } // namespace foo #endif
bench_lu.cc
#include <cassert> #include <chrono> #include <cmath> #include <limits> #include <random> #include <type_traits> #include <boost/timer.hpp> #include <boost/numeric/ublas/matrix.hpp> #include <boost/numeric/ublas/io.hpp> #include <boost/numeric/ublas/lu.hpp> #include <boost/numeric/ublas/operation.hpp> #include "gemm.hpp" #include "lu.hpp" template <typename T> struct WallTime { void tic() { t0 = std::chrono::high_resolution_clock::now(); } T toc() { using namespace std::chrono; elapsed = high_resolution_clock::now() - t0; return duration<T,seconds::period>(elapsed).count(); } std::chrono::high_resolution_clock::time_point t0; std::chrono::high_resolution_clock::duration elapsed; }; // // For checking I don't care for efficiency of elegance ... // template <typename MatrixA1, typename MatrixA2, typename MatrixP> double checkLU(const MatrixA1 &A1, MatrixA2 &A2, const MatrixP &P) { namespace ublas = boost::numeric::ublas; typedef typename MatrixA1::size_type size_type; typedef typename MatrixA1::value_type value_type; size_type m = A1.size1(); size_type n = A1.size2(); ublas::matrix<value_type> L(m,m); // Copy L form A2 and set A2 to U for (size_type j=0; j<m; ++j) { for (size_type i=0; i<m; ++i) { if (i==j) { L(i,i) = 1; } else if (i>j) { L(i,j) = A2(i,j); A2(i,j) = 0; } else if (j>i) { L(i,j) = 0; } } } ublas::matrix<value_type> LU(m,n); ublas::axpy_prod(L, A2, LU, true); // LU <- inv(P)*L*U foo::swap_rows(P, LU, true); double diff = 0; for (size_type j=0; j<n; ++j) { for (size_type i=0; i<m; ++i) { diff += std::abs(A1(i,j)-LU(i,j)); } } return diff; } // fill rectangular matrix with random values template <typename MATRIX> void fill(MATRIX &A) { typedef typename MATRIX::size_type size_type; typedef typename MATRIX::value_type T; std::random_device random; std::default_random_engine mt(random()); std::uniform_real_distribution<T> uniform(-100,100); for (size_type i=0; i<A.size1(); ++i) { for (size_type j=0; j<A.size2(); ++j) { A(i,j) = uniform(mt); } } } #ifndef M_MAX #define M_MAX 4000 #endif #ifndef N_MAX #define N_MAX 4000 #endif int main() { namespace ublas = boost::numeric::ublas; const std::size_t m_min = 50; const std::size_t n_min = 50; const std::size_t m_max = M_MAX; const std::size_t n_max = N_MAX; const std::size_t m_inc = 50; const std::size_t n_inc = 50; typedef double T; typedef ublas::row_major SO; WallTime<double> walltime; for (std::size_t m=m_min, n=n_min; m<=m_max && n<=n_max; m+=m_inc, n+=n_inc) { ublas::matrix<T,SO> A_org(m, n); ublas::matrix<T,SO> A1(m, n); ublas::matrix<T,SO> A2(m, n); ublas::matrix<T,SO> A3(m, n); ublas::permutation_matrix<T> P1(m), P2(m), P3(m); fill(A_org); A1 = A_org; A2 = A_org; A3 = A_org; double diff, t, mflops; std::size_t info; std::size_t minMN = std::min(m,n); std::size_t maxMN = std::max(m,n); mflops = minMN*minMN*maxMN -(minMN*minMN*minMN/3.) -(minMN*minMN/2.); mflops /= 1000000; walltime.tic(); info = ublas::lu_factorize(A1, P1); t = walltime.toc(); diff = checkLU(A_org, A1, P1); std::printf("%4ld %4ld %4ld %16.3e %16.5lf %16.5lf", m, n, info, diff, t, mflops/t); walltime.tic(); info = foo::lu_blocked(A2, P2); t = walltime.toc(); diff = checkLU(A_org, A2, P2); std::printf("%4ld %16.3e %16.5lf %16.5lf", info, diff, t, mflops/t); walltime.tic(); info = foo::lu_blocked_recursive(A3, P3); t = walltime.toc(); diff = checkLU(A_org, A3, P3); std::printf("%4ld %16.3e %16.5lf %16.5lf\n", info, diff, t, mflops/t); } }