# Unblocked LU factorization

#### Content

In the lecture we introduced the following algorithm for an unblocked LU factorization:

• for $$j=0, \dots, \min\{m,n\}-1$$

• Find pivot $$p_j$$ (using iamax) in $$\left(\begin{array}{c} a_{j,j} \\ \mathbf{a}_{j+1,j} \end{array}\right)$$

• if $$j \neq p_j$$ interchange in $$A$$ row $$j$$ and row $$p_j$$

• if element $$a_{j,j}$$ is exactly zero return $$j$$ (matrix is numerically singular).

• Scaling: $$\mathbf{a}_{j+1,j} \leftarrow \frac{1}{a_{j,j}} \mathbf{a}_{j+1,j}$$

• Rank-1 update: $$A_{j+1,j+1} \leftarrow A_{j+1,j+1} - \mathbf{a}_{j+1,j} \mathbf{a}_{j,j+1}^T$$

• return -1

where we partitioned $$A$$ in the $$j$$-th step as follows:

$A =\left(\begin{array}{l|l|l} A_{0,0} & \mathbf{a}_{0,j} & A_{0,j+1} \\ \hline \mathbf{a}_{j,0}^T & a_{j,j} & \mathbf{a}_{j,j+1}^T \\ \hline A_{j+1,0} & \mathbf{a}_{j+1,j} & A_{j+1,j+1} \end{array}\right)$

## Exercise

Implement the unblocked LU factorization. Using the provided material you only have to implement:

• Function ger in ulmblas_level2.hpp

• Function iamax and lu_ger in ulmblas_level2.hpp

## Provided Matrial

Copy the files ulmblas_level1.hpp, ulmblas_level2.hpp, ulmblas_level3.hpp, test.hpp and lu_exampl.cpp into a new directory.

You can compile the test on theon with g++ -Wall -std=c++17 lu_example.cpp

#ifndef ULMBLAS_LEVEL1_HPP
#define ULMBLAS_LEVEL1_HPP

#include <cstddef>      // for std::size_t, std::ptrdiff_t
#include <cmath>        // for std::abs (floating point)
#include <cstdlib>      // for std::abs (integer)

namespace ulmblas {

//==============================================================================
//
// BLAS Level 1 functions for vectors
//
//==============================================================================

//
// copy:  y <- x
//
//
template <typename TX, typename TY>
void
copy(std::size_t n, const TX *x, std::ptrdiff_t incX,
TY *y, std::ptrdiff_t incY)
{
for (std::size_t i=0; i<n; ++i) {
y[i*incY] = x[i*incX];
}
}

//
// swap:  x <-> y
//
//
template <typename TX, typename TY>
void
swap(std::size_t n, TX *x, std::ptrdiff_t incX, TY *y, std::ptrdiff_t incY)
{
for (std::size_t i=0; i<n; ++i) {
TX t = x[i*incX];
x[i*incX] = y[i*incY];
y[i*incY] = t;
}
}

//
// scal: x <- alpha*x
//
//
template <typename Alpha, typename TX>
void
scal(std::size_t n, Alpha alpha, TX *x, std::ptrdiff_t incX)
{
if (alpha==Alpha(1)) {
return;
}
if (alpha==Alpha(0)) {
for (std::size_t i=0; i<n; ++i) {
x[i*incX] = 0;
}
} else {
for (std::size_t i=0; i<n; ++i) {
x[i*incX] *= alpha;
}
}
}

//
// axpy: y <- y + alpha*x
//
//
template <typename Alpha, typename TX, typename TY>
void
axpy(std::size_t n, Alpha alpha,
const TX *x, std::ptrdiff_t incX,
TY *y, std::ptrdiff_t incY)
{
if (alpha==Alpha(0)) {
return;
}
for (std::size_t i=0; i<n; ++i) {
y[i*incY] += alpha*x[i*incX];
}
}

//==============================================================================
//
// BLAS Level 1 functions for matrices
//
//==============================================================================

//
// gecopy: B <- A
//
//
template <typename TA, typename TB>
void
gecopy(std::size_t m, std::size_t n,
const TA *A, std::ptrdiff_t incRowA, std::ptrdiff_t incColA,
TB *B, std::ptrdiff_t incRowB, std::ptrdiff_t incColB)
{
if (m==0 || n==0) {
return;
}
// if B is row major:   B^T <- A^T
if (std::abs(incRowB) > std::abs(incColB)) {
gecopy(n, m, A, incColA, incRowA, B, incColB, incRowB);
return;
}
// B is col major:
for (std::size_t j=0; j<n; ++j) {
for (std::size_t i=0; i<m; ++i) {
B[i*incRowB+j*incColB] = A[i*incRowA+j*incColA];
}
}
}

//
// gescal: A <- alpha*A
//
//
template <typename Alpha, typename TA>
void
gescal(std::size_t m, std::size_t n, Alpha alpha,
TA *A, std::ptrdiff_t incRowA, std::ptrdiff_t incColA)
{
if (alpha==Alpha(1) || m==0 || n==0) {
return;
}
// if A is row major: A^T <- alpha*A^T
if (std::abs(incRowA) > std::abs(incColA)) {
gescal(n, m, alpha, A, incColA, incRowA);
return;
}
// A is col major:
if (alpha!=Alpha(0)) {
// Scale A column wise with alpha
for (std::size_t j=0; j<n; ++j) {
for (std::size_t i=0; i<m; ++i) {
A[i*incRowA+j*incColA] *= alpha;
}
}
} else {
// A might contain Nan values: overwrite with zeros
for (std::size_t j=0; j<n; ++j) {
for (std::size_t i=0; i<m; ++i) {
A[i*incRowA+j*incColA] = 0;
}
}
}
}

//
// geaxpy: B <- B + alpha*A
//
//
template <typename Alpha, typename TA, typename TB>
void
geaxpy(std::size_t m, std::size_t n, Alpha alpha,
const TA *A, std::ptrdiff_t incRowA, std::ptrdiff_t incColA,
TB *B, std::ptrdiff_t incRowB, std::ptrdiff_t incColB)
{
if (m==0 || n==0) {
return;
}
// if B is row major:   B^T <- alpha*A^T + B^T
if (std::abs(incRowB) > std::abs(incColB)) {
geaxpy(n, m, alpha, A, incColA, incRowA, B, incColB, incRowB);
return;
}
// B is col major:
for (std::size_t j=0; j<n; ++j) {
for (std::size_t i=0; i<m; ++i) {
B[i*incRowB+j*incColB] += alpha*A[i*incRowA+j*incColA];
}
}
}

} // namespace ulmblas

#endif // ULMBLAS_LEVEL1_HPP

#ifndef ULMBLAS_LEVEL2_HPP
#define ULMBLAS_LEVEL2_HPP

#include <cstddef>      // for std::size_t, std::ptrdiff_t
#include <cmath>        // for std::abs (floating point)
#include <cstdlib>      // for std::abs (integer)

namespace ulmblas {

//==============================================================================
//
// BLAS Level 2 functions
//
//==============================================================================

//
//  ger: A <- A + alpha * x*y^T  (rank 1 update)
//
template <typename Alpha, typename TX, typename TY, typename TA>
void
ger(std::size_t m, std::size_t n, Alpha alpha,
const TX *x, std::ptrdiff_t incX,
const TY *y, std::ptrdiff_t incY,
TA *A, std::ptrdiff_t incRowA, std::ptrdiff_t incColA)
{
if (m==0 || n==0 || alpha==Alpha(0)) {
return;
}
// if A is row major:   A^T <- A^T + alpha * y*x^T
if (std::abs(incRowA) > std::abs(incColA)) {
ger(n, m, alpha, y, incY, x, incX, A, incColA, incRowA);
return;
}
// A is col major:
}

} // namespace ulmblas

#endif // ULMBLAS_LEVEL2_HPP

#ifndef ULMBLAS_LEVEL3_HPP
#define ULMBLAS_LEVEL3_HPP

#include <cstddef>              // for std::size_t, std::ptrdiff_t
#include <cmath>                // for std::abs (floating point)
#include <cstdlib>              // for std::abs (integer)

#include "ulmblas_level1.hpp"   // for gescal

namespace ulmblas {

//==============================================================================
//
// BLAS Level 3 functions
//
//==============================================================================

//
//  gemm: C <- beta*C + alpha * A * B
//
//  Note: This is just a simple reference implementation!
//
template <typename Alpha, typename TA, typename TB, typename Beta, typename TC>
void
gemm(std::size_t m, std::size_t n, std::size_t k,
Alpha alpha,
const TA *A, std::ptrdiff_t incRowA, std::ptrdiff_t incColA,
const TB *B, std::ptrdiff_t incRowB, std::ptrdiff_t incColB,
Beta beta,
TC *C, std::ptrdiff_t incRowC, std::ptrdiff_t incColC)
{
gescal(m, n, beta, C, incRowC, incColC);
if (m==0 || n==0 || k==0 || alpha==Alpha(0)) {
return;
}
for (std::size_t l=0; l<k; ++l) {
for (std::size_t j=0; j<n; ++j) {
for (std::size_t i=0; i<m; ++i) {
C[i*incRowC+j*incColC] += alpha*A[i*incRowA+l*incColA]
*B[l*incRowB+j*incColB];
}
}
}
}

} // namespace ulmblas

#endif // ULMBLAS_LEVEL2_HPP

#ifndef TEST_HPP
#define TEST_HPP

#include <cmath>
#include <limits>
#include <algorithm>                 // for std::min
#include "ulmblas_level1.hpp"        // for ulmblas::swap(), ulmblas::geaxpy()
#include "ulmblas_level3.hpp"        // for ulmblas::gemm()

namespace test {

template <typename TA>
auto
norminf(std::size_t m, std::size_t n,
const TA *A, std::ptrdiff_t incRowA, std::ptrdiff_t incColA)
{
TA res = 0;
for (std::size_t i=0; i<m; ++i) {
TA asum = 0;
for (std::size_t j=0; j<n; ++j) {
asum += std::abs(A[i*incRowA + j*incColA]);
}
if (std::isnan(asum)) {
return asum;
}
if (asum>res) {
res = asum;
}
}
return res;
}

template <typename TP, typename TA>
void
swap(std::size_t m, std::size_t n, const TP *p, std::ptrdiff_t incP,
std::size_t i0, std::size_t i1,
TA *A, std::ptrdiff_t incRowA, std::ptrdiff_t incColA)
{
std::ptrdiff_t inc = i0<i1 ? 1 : -1;

i1 += inc;

for (std::size_t i=i0; i!=i1; i+=inc) {
if (i!=p[i*incP]) {
ulmblas::swap(n,
&A[i*incRowA], incColA,
&A[p[i*incP]*incRowA], incColA);
}
}
}

template <typename T, typename TP>
double
lu_err(std::size_t m, std::size_t n,
const T *A0, std::ptrdiff_t incRowA0, std::ptrdiff_t incColA0,
const T *LU, std::ptrdiff_t incRowLU, std::ptrdiff_t incColLU,
const TP *p, std::ptrdiff_t incP)
{
std::size_t k = std::min(m, n);

T *A = new T[m*n];
T *L = new T[m*k];
T *U = new T[k*n];

// copy L-part from A
for (std::size_t l=0; l<k; ++l) {
for (std::size_t i=0; i<m; ++i) {
L[i+l*m] = (i>l)  ? LU[i*incRowLU + l*incColLU] :
(i==l) ? T(1)   :
T(0);
}
}
// copy U-part from A
for (std::size_t j=0; j<n; ++j) {
for (std::size_t l=0; l<k; ++l) {
U[l + j*k] = (l>j)  ? T(0)
: LU[l*incRowLU + j*incColLU];
}
}

// A = L*U
ulmblas::gemm(m, n, k,T(1),
L, 1, m,
U, 1, k,
T(0),
A, 1, m);

// A = P^{-1}*A
swap(m, n, p, incP, std::min(m,n)-1, 0, A, 1, m);

ulmblas::geaxpy(m, n, -1, A0, incRowA0, incColA0, A, 1, m);

auto eps = std::numeric_limits<T>::epsilon();

delete [] U;
delete [] L;
delete [] A;

return norminf(m, n, A, 1, m)
/ (norminf(m, n, A0, incRowA0, incColA0)*eps*std::min(m,n));
}

} // namespace test

#endif //  TEST_HPP

#include <cstdlib>
#include <cmath>
#include <cstdio>
#include <printf.hpp>

#include "test.hpp"
#include "ulmblas_level1.hpp"
#include "ulmblas_level2.hpp"

template <typename TA>
void
initMatrix(std::size_t m, std::size_t n,
TA *A, std::ptrdiff_t incRowA, std::ptrdiff_t incColA,
bool withNan = false)
{
// if A is row major initialize A^T
if (std::abs(incRowA) > std::abs(incColA)) {
initMatrix(n, m, A, incColA, incRowA, withNan);
return;
}
// if A is col major
if (withNan) {
for (size_t j=0; j<n; ++j) {
for (size_t i=0; i<m; ++i) {
A[i*incRowA+j*incColA] = nan("");
}
}
} else {
for (size_t j=0; j<n; ++j) {
for (size_t i=0; i<m; ++i) {
double rValue = ((double)rand() - RAND_MAX/2)*2/RAND_MAX;
A[i*incRowA+j*incColA] = rValue;
}
}
}
}

template <typename TA>
void
printMatrix(std::size_t m, std::size_t n,
const TA *A, std::ptrdiff_t incRowA, std::ptrdiff_t incColA)
{
for (std::size_t i=0; i<m; ++i) {
for (std::size_t j=0; j<n; ++j) {
fmt::printf("%10.3lf ", A[i*incRowA+j*incColA]);
}
std::printf("\n");
}
std::printf("\n");
}

//------------------------------------------------------------------------------

template <typename TX>
std::size_t
iamax(std::size_t n, const TX *x, std::ptrdiff_t incX)
{
}

template <typename TA, typename TP>
std::ptrdiff_t
lu_ger(std::size_t m, std::size_t n,
TA *A, std::ptrdiff_t incRowA, std::ptrdiff_t incColA,
TP *p, std::ptrdiff_t incP)
{
}

//------------------------------------------------------------------------------

#ifndef COLMAJOR
#define COLMAJOR 1
#endif

int
main()
{
std::size_t    m = 5;
std::size_t    n = 6;
std::ptrdiff_t incRowA = COLMAJOR ? 1 : n;
std::ptrdiff_t incColA = COLMAJOR ? m : 1;
std::ptrdiff_t incP    = 1;

double *A       = new double[m*n];
double *A0      = new double[m*n];
std::size_t *p  = new std::size_t[std::min(m,n)];

initMatrix(m, n, A, incRowA, incColA);
ulmblas::gecopy(m, n, A, incRowA, incColA, A0, incRowA, incColA);

printMatrix(m, n, A0, incRowA, incColA);

std::ptrdiff_t info = lu_ger(m, n, A, incRowA, incColA, p, incP);

printMatrix(m, n, A, incRowA, incColA);
printMatrix(std::min(m, n), 1, p, incP, 1);
fmt::printf("info = %d\n", info);

double err = test::lu_err(m, n,
A0, incRowA, incColA,
A, incRowA, incColA,
p, incP);

std::printf("err = %e\n", err);

delete [] A;
delete [] A0;
delete [] p;
}