Content |
LU Factorization
Matrix/Vector views are absolutely crucial for numerical linear algebra. This becomes evident in the following example where we implement two variants of the \(LU\) factorization:
-
lu_unblk is a so called unblocked variant and
-
lu_blk its blocked variant.
The block variant utilizes (beside the unblocked variant) BLAS Level 3 routines and leads to an immense speedup. For the sake of simplicity we do not consider privoting in this example.
Unblocked Variant
We partition \(A \in \mathbb{R}^{m \times n}\) as follows:
\[ A = \left( \begin{array}{c|c} a_{11} & a_{12}^T \\ \hline a_{21} & A_{22} \end{array}\right) \]After the factorization we want to have a lower triangular matrix \(L\) with unit diagonal
\[ L = \left( \begin{array}{c|c} 1 & \mathcal{0} \\ \hline l_{21} & L_{22} \end{array}\right) \]and an upper triangular matrix \(U\)
\[ U = \left( \begin{array}{c|c} u_{11} & u_{12}^T \\ \hline \mathcal{0} & U_{22} \end{array}\right) \]such that \(A = LU\). This leads to the equations
\[ \begin{eqnarray*} a_{11} &=& u_{11} \\ a_{12} &=& u_{12} \\ a_{21} &=& u_{11} l_{21} \\ A_{22} &=& l_{21} u_{12}^T + L_{22}U_{22} \end{eqnarray*} \]Furthermore, we want that \(A\) get compactly overwritten with \((L\backslash U)\).
This leads to the well-known Gauss algorithm:
-
Partition \(A\) as outlined above.
-
Overwrite \(a_{21}\) with \(\frac{1}{a_{11}} a_{21}\).
-
Overwrite \(A_{22}\) with \(A_{22} - a_{21} a_{12}^T\).
-
Repeat with \(A_{22}\) instead of \(A\).
Implementation of lu_unblk
#include <flens/flens.h>
namespace flens {
template <typename MA>
typename GeMatrix<MA>::IndexType
lu_unblk(GeMatrix<MA> &A)
{
using std::min;
typedef typename GeMatrix<MA>::ElementType ElementType;
typedef typename GeMatrix<MA>::IndexType IndexType;
const IndexType m = A.numRows();
const IndexType n = A.numCols();
const IndexType mn = min(m, n);
const ElementType Zero(0);
const Underscore<IndexType> _;
for (IndexType j=1; j<=mn; ++j) {
const auto a11 = A( j, j);
const auto a12 = A( j, _(j+1,n));
auto a21 = A(_(j+1,m), j);
auto A22 = A(_(j+1,m), _(j+1,n));
if (a11==Zero) {
return j;
}
a21 /= a11;
A22 -= a21*transpose(a12);
}
return 0;
}
} // namespace flens
Code for Testing
#include <flens/flens.cxx>
#include <flens/examples/lu_unblk.h>
using namespace flens;
using namespace std;
int
main()
{
typedef GeMatrix<FullStorage<double> > RealGeMatrix;
const int m = 2000;
const int n = 2000;
const int mn = std::min(m, n);
RealGeMatrix A(m, n), A_check;
fillRandom(A);
A_check = A;
if (lu_unblk(A)) {
cerr << "A is singular" << endl;
return 1;
}
Underscore<int> _;
RealGeMatrix L = A(_,_(1,mn)).lowerUnit();
RealGeMatrix U = A(_(1,mn),_).upper();
A_check -= L*U;
cout << "asum(A-L*U) = " << blas::asum(A_check) << endl;
return 0;
}
The structure of this test is simple:
Setup test case and make a copy of the input
fillRandom(A);
A_check = A;
Factorize \(A\).
cerr << "A is singular" << endl;
return 1;
}
Compute \(\|A - LU\|_1\) by brute force. Note that \(A\) is not required to be square.
RealGeMatrix L = A(_,_(1,mn)).lowerUnit();
RealGeMatrix U = A(_(1,mn),_).upper();
Compile and Run
Note that we compile with ATLAS here. With a BLAS reference implementation the runtime starts to hurt.
$shell> cd flens/examples $shell> g++ -Wall -std=c++11 -I../.. -o lu_unblk-test lu_unblk-test.cc -DWITH_ATLAS -framework vecLib $shell> time ./lu_unblk-test asum(A-L*U) = 1.46413e-09 real 0m15.318s user 0m19.182s sys 0m0.388s
Blocked Variant
We now partition \(A \in \mathbb{R}^{m \times n}\) block-wise
\[ A = \left( \begin{array}{c|c} A_{11} & A_{12} \\ \hline A_{21} & A_{22} \end{array}\right) \]where block \(A_{11}\) is square, i.e. \(A_{11} \in \mathbb{R}^{bs \times bs}\).
Analogously we partition \(L\)
\[ L = \left( \begin{array}{c|c} L_{11} & \mathcal{0} \\ \hline L_{21} & L_{22} \end{array}\right) \]and \(U\)
\[ U = \left( \begin{array}{c|c} U_{11} & U_{12} \\ \hline \mathcal{0} & U_{22} \end{array}\right). \]This time we get from \(A = LU\) the equations
\[ \begin{eqnarray*} A_{11} &=& L_{11} U_{11} \\ A_{12} &=& L_{11} U_{12} \\ A_{21} &=& L_{21} U_{11} \\ A_{22} &=& L_{21} U_{12} + L_{22}U_{22} \end{eqnarray*} \]We compute the \(LU\) factorization of \(A_{11}\) with the unblocked variant and get the following algorithm:
-
Partition \(A\) as outlined above.
-
Overwrite \(A_{11}\) with its \(LU\) factorization.
-
Overwrite \(A_{21}\) with \(A_{21} U_{11}^{-1}\).
-
Overwrite \(A_{12}\) with \(L_{11}^{–1} U_{12}\)
-
Overwrite \(A_{22}\) with \(A_{22} - A_{21}A_{12}\)
-
Repeat with \(A_{22}\) instead of \(A\).
Implementation of lu_blk
#include <flens/flens.h>
#include <flens/examples/lu_unblk.h>
#ifndef BS
#define BS 32
#endif
namespace flens {
template <typename MA>
typename GeMatrix<MA>::IndexType
lu_blk(GeMatrix<MA> &A)
{
using flens::min;
typedef typename GeMatrix<MA>::ElementType ElementType;
typedef typename GeMatrix<MA>::IndexType IndexType;
const IndexType m = A.numRows();
const IndexType n = A.numCols();
const IndexType mn = min(m, n);
const ElementType Zero(0), One(1);
const Underscore<IndexType> _;
IndexType info;
for (IndexType j=1; j<=mn; j+=BS) {
const IndexType bs = min(BS, m-j, n-j);
auto A11 = A(_(j,j+bs-1), _(j,j+bs-1));
auto A12 = A(_(j,j+bs-1), _(j+bs,n));
auto A21 = A(_(j+bs,m), _(j,j+bs-1));
auto A22 = A(_(j+bs,m), _(j+bs,n));
info = lu_unblk(A11);
if (info) {
return info+j-1;
}
// Compute A21*inv(A11.upper())
blas::sm(Right, NoTrans, One, A11.upper(), A21);
// Compute inv(A11.lowerUnit())*A12
blas::sm(Left, NoTrans, One, A11.lowerUnit(), A12);
A22 -= A21*A12;
}
return 0;
}
} // namespace flens
Code for Testing
#include <flens/flens.cxx>
#include <flens/examples/lu_blk.h>
using namespace flens;
using namespace std;
int
main()
{
typedef GeMatrix<FullStorage<double> > RealGeMatrix;
const int m = 2000;
const int n = 2000;
const int mn = std::min(m, n);
RealGeMatrix A(m, n), A_check;
fillRandom(A);
A_check = A;
if (lu_blk(A)) {
cerr << "A is singular" << endl;
return 1;
}
Underscore<int> _;
RealGeMatrix L = A(_,_(1,mn)).lowerUnit();
RealGeMatrix U = A(_(1,mn),_).upper();
A_check -= L*U;
cout << "asum(A-L*U) = " << blas::asum(A_check) << endl;
return 0;
}
Compile and Run
We again compile and run the test with ATLAS here.
$shell> cd flens/examples $shell> g++ -Wall -std=c++11 -I../.. -o lu_blk-test lu_blk-test.cc -DWITH_ATLAS -framework vecLib $shell> time ./lu_blk-test asum(A-L*U) = 1.3317e-09 real 0m3.587s user 0m5.209s sys 0m0.161s