Unblocked LU factorization

Content

In the lecture we introduced the following algorithm for an unblocked LU factorization:

  • for \(j=0, \dots, \min\{m,n\}-1\)

    • Find pivot \(p_j\) (using iamax) in \(\left(\begin{array}{c} a_{j,j} \\ \mathbf{a}_{j+1,j} \end{array}\right)\)

      • if \(j \neq p_j\) interchange in \(A\) row \(j\) and row \(p_j\)

    • if element \(a_{j,j}\) is exactly zero return \(j\) (matrix is numerically singular).

    • Scaling: \(\mathbf{a}_{j+1,j} \leftarrow \frac{1}{a_{j,j}} \mathbf{a}_{j+1,j}\)

    • Rank-1 update: \(A_{j+1,j+1} \leftarrow A_{j+1,j+1} - \mathbf{a}_{j+1,j} \mathbf{a}_{j,j+1}^T\)

  • return -1

where we partitioned \(A\) in the \(j\)-th step as follows:

\[A =\left(\begin{array}{l|l|l} A_{0,0} & \mathbf{a}_{0,j} & A_{0,j+1} \\ \hline \mathbf{a}_{j,0}^T & a_{j,j} & \mathbf{a}_{j,j+1}^T \\ \hline A_{j+1,0} & \mathbf{a}_{j+1,j} & A_{j+1,j+1} \end{array}\right)\]

Exercise

Implement the unblocked LU factorization. Using the provided material you only have to implement:

Provided Matrial

Copy the files ulmblas_level1.hpp, ulmblas_level2.hpp, ulmblas_level3.hpp, test.hpp and lu_exampl.cpp into a new directory.

You can compile the test on theon with g++ -Wall -std=c++17 lu_example.cpp

#ifndef ULMBLAS_LEVEL1_HPP
#define ULMBLAS_LEVEL1_HPP

#include <cstddef>      // for std::size_t, std::ptrdiff_t
#include <cmath>        // for std::abs (floating point)
#include <cstdlib>      // for std::abs (integer)

namespace ulmblas {

//==============================================================================
//
// BLAS Level 1 functions for vectors
//
//==============================================================================

//
// copy:  y <- x
//
// (adopted from session 3)
//
template <typename TX, typename TY>
void
copy(std::size_t n, const TX *x, std::ptrdiff_t incX,
     TY *y, std::ptrdiff_t incY)
{
    for (std::size_t i=0; i<n; ++i) {
        y[i*incY] = x[i*incX];
    }
}

//
// swap:  x <-> y
//
// (adopted from session 3)
//
template <typename TX, typename TY>
void
swap(std::size_t n, TX *x, std::ptrdiff_t incX, TY *y, std::ptrdiff_t incY)
{
    for (std::size_t i=0; i<n; ++i) {
        TX t = x[i*incX];
        x[i*incX] = y[i*incY];
        y[i*incY] = t;
    }
}

//
// scal: x <- alpha*x
//
// (adopted from session 6)
//
template <typename Alpha, typename TX>
void
scal(std::size_t n, Alpha alpha, TX *x, std::ptrdiff_t incX)
{
    if (alpha==Alpha(1)) {
        return;
    }
    if (alpha==Alpha(0)) {
        for (std::size_t i=0; i<n; ++i) {
            x[i*incX] = 0;
        }
    } else {
        for (std::size_t i=0; i<n; ++i) {
            x[i*incX] *= alpha;
        }
    }
}

//
// axpy: y <- y + alpha*x
//
// (adopted from session 6)
//
template <typename Alpha, typename TX, typename TY>
void
axpy(std::size_t n, Alpha alpha,
     const TX *x, std::ptrdiff_t incX,
     TY *y, std::ptrdiff_t incY)
{
    if (alpha==Alpha(0)) {
        return;
    }
    for (std::size_t i=0; i<n; ++i) {
        y[i*incY] += alpha*x[i*incX];
    }
}

//==============================================================================
//
// BLAS Level 1 functions for matrices
//
//==============================================================================

//
// gecopy: B <- A
//
// (adopted from session 6)
//
template <typename TA, typename TB>
void
gecopy(std::size_t m, std::size_t n,
       const TA *A, std::ptrdiff_t incRowA, std::ptrdiff_t incColA,
       TB *B, std::ptrdiff_t incRowB, std::ptrdiff_t incColB)
{
    if (m==0 || n==0) {
        return;
    }
    // if B is row major:   B^T <- A^T
    if (std::abs(incRowB) > std::abs(incColB)) {
        gecopy(n, m, A, incColA, incRowA, B, incColB, incRowB);
        return;
    }
    // B is col major:
    for (std::size_t j=0; j<n; ++j) {
        for (std::size_t i=0; i<m; ++i) {
            B[i*incRowB+j*incColB] = A[i*incRowA+j*incColA];
        }
    }
}

//
// gescal: A <- alpha*A
//
// (adopted from session 6)
//
template <typename Alpha, typename TA>
void
gescal(std::size_t m, std::size_t n, Alpha alpha,
       TA *A, std::ptrdiff_t incRowA, std::ptrdiff_t incColA)
{
    if (alpha==Alpha(1) || m==0 || n==0) {
        return;
    }
    // if A is row major: A^T <- alpha*A^T
    if (std::abs(incRowA) > std::abs(incColA)) {
        gescal(n, m, alpha, A, incColA, incRowA);
        return;
    }
    // A is col major:
    if (alpha!=Alpha(0)) {
        // Scale A column wise with alpha
       for (std::size_t j=0; j<n; ++j) {
           for (std::size_t i=0; i<m; ++i) {
               A[i*incRowA+j*incColA] *= alpha;
           }
        }
    } else {
        // A might contain Nan values: overwrite with zeros
        for (std::size_t j=0; j<n; ++j) {
            for (std::size_t i=0; i<m; ++i) {
                A[i*incRowA+j*incColA] = 0;
            }
        }
    }
}

//
// geaxpy: B <- B + alpha*A
//
// (adopted from session 6)
//
template <typename Alpha, typename TA, typename TB>
void
geaxpy(std::size_t m, std::size_t n, Alpha alpha,
       const TA *A, std::ptrdiff_t incRowA, std::ptrdiff_t incColA,
       TB *B, std::ptrdiff_t incRowB, std::ptrdiff_t incColB)
{
    if (m==0 || n==0) {
        return;
    }
    // if B is row major:   B^T <- alpha*A^T + B^T
    if (std::abs(incRowB) > std::abs(incColB)) {
        geaxpy(n, m, alpha, A, incColA, incRowA, B, incColB, incRowB);
        return;
    }
    // B is col major:
    for (std::size_t j=0; j<n; ++j) {
        for (std::size_t i=0; i<m; ++i) {
            B[i*incRowB+j*incColB] += alpha*A[i*incRowA+j*incColA];
        }
    }
}

} // namespace ulmblas

#endif // ULMBLAS_LEVEL1_HPP
#ifndef ULMBLAS_LEVEL2_HPP
#define ULMBLAS_LEVEL2_HPP

#include <cstddef>      // for std::size_t, std::ptrdiff_t
#include <cmath>        // for std::abs (floating point)
#include <cstdlib>      // for std::abs (integer)

namespace ulmblas {

//==============================================================================
//
// BLAS Level 2 functions
//
//==============================================================================

//
//  ger: A <- A + alpha * x*y^T  (rank 1 update)
//
template <typename Alpha, typename TX, typename TY, typename TA>
void
ger(std::size_t m, std::size_t n, Alpha alpha,
    const TX *x, std::ptrdiff_t incX,
    const TY *y, std::ptrdiff_t incY,
    TA *A, std::ptrdiff_t incRowA, std::ptrdiff_t incColA)
{
    if (m==0 || n==0 || alpha==Alpha(0)) {
        return;
    }
    // if A is row major:   A^T <- A^T + alpha * y*x^T
    if (std::abs(incRowA) > std::abs(incColA)) {
        ger(n, m, alpha, y, incY, x, incX, A, incColA, incRowA);
        return;
    }
    // A is col major:
    // TODO: Your code
}

} // namespace ulmblas

#endif // ULMBLAS_LEVEL2_HPP
#ifndef ULMBLAS_LEVEL3_HPP
#define ULMBLAS_LEVEL3_HPP

#include <cstddef>              // for std::size_t, std::ptrdiff_t
#include <cmath>                // for std::abs (floating point)
#include <cstdlib>              // for std::abs (integer)

#include "ulmblas_level1.hpp"   // for gescal

namespace ulmblas {

//==============================================================================
//
// BLAS Level 3 functions
//
//==============================================================================

//
//  gemm: C <- beta*C + alpha * A * B
//
//  Note: This is just a simple reference implementation!
//
template <typename Alpha, typename TA, typename TB, typename Beta, typename TC>
void
gemm(std::size_t m, std::size_t n, std::size_t k,
     Alpha alpha,
     const TA *A, std::ptrdiff_t incRowA, std::ptrdiff_t incColA,
     const TB *B, std::ptrdiff_t incRowB, std::ptrdiff_t incColB,
     Beta beta,
     TC *C, std::ptrdiff_t incRowC, std::ptrdiff_t incColC)
{
    gescal(m, n, beta, C, incRowC, incColC);
    if (m==0 || n==0 || k==0 || alpha==Alpha(0)) {
        return;
    }
    for (std::size_t l=0; l<k; ++l) {
        for (std::size_t j=0; j<n; ++j) {
            for (std::size_t i=0; i<m; ++i) {
                C[i*incRowC+j*incColC] += alpha*A[i*incRowA+l*incColA]
                                               *B[l*incRowB+j*incColB];
            }
        }
    }
}

} // namespace ulmblas

#endif // ULMBLAS_LEVEL2_HPP
#ifndef TEST_HPP
#define TEST_HPP

#include <cmath>
#include <limits>
#include <algorithm>                 // for std::min
#include "ulmblas_level1.hpp"        // for ulmblas::swap(), ulmblas::geaxpy()
#include "ulmblas_level3.hpp"        // for ulmblas::gemm()

namespace test {

template <typename TA>
auto
norminf(std::size_t m, std::size_t n,
        const TA *A, std::ptrdiff_t incRowA, std::ptrdiff_t incColA)
{
    TA res = 0;
    for (std::size_t i=0; i<m; ++i) {
        TA asum = 0;
        for (std::size_t j=0; j<n; ++j) {
            asum += std::abs(A[i*incRowA + j*incColA]);
        }
        if (std::isnan(asum)) {
            return asum;
        }
        if (asum>res) {
            res = asum;
        }
    }
    return res;
}

template <typename TP, typename TA>
void
swap(std::size_t m, std::size_t n, const TP *p, std::ptrdiff_t incP,
     std::size_t i0, std::size_t i1,
     TA *A, std::ptrdiff_t incRowA, std::ptrdiff_t incColA)
{
    std::ptrdiff_t inc = i0<i1 ? 1 : -1;

    i1 += inc;

    for (std::size_t i=i0; i!=i1; i+=inc) {
        if (i!=p[i*incP]) {
            ulmblas::swap(n,
                          &A[i*incRowA], incColA,
                          &A[p[i*incP]*incRowA], incColA);
        }
    }
}

template <typename T, typename TP>
double
lu_err(std::size_t m, std::size_t n,
       const T *A0, std::ptrdiff_t incRowA0, std::ptrdiff_t incColA0,
       const T *LU, std::ptrdiff_t incRowLU, std::ptrdiff_t incColLU,
       const TP *p, std::ptrdiff_t incP)
{
    std::size_t k = std::min(m, n);

    T *A = new T[m*n];
    T *L = new T[m*k];
    T *U = new T[k*n];

    // copy L-part from A
    for (std::size_t l=0; l<k; ++l) {
        for (std::size_t i=0; i<m; ++i) {
            L[i+l*m] = (i>l)  ? LU[i*incRowLU + l*incColLU] :
                       (i==l) ? T(1)   :
                                T(0);
        }
    }
    // copy U-part from A
    for (std::size_t j=0; j<n; ++j) {
        for (std::size_t l=0; l<k; ++l) {
            U[l + j*k] = (l>j)  ? T(0)
                                : LU[l*incRowLU + j*incColLU];
        }
    }

    // A = L*U
    ulmblas::gemm(m, n, k,T(1),
                  L, 1, m,
                  U, 1, k,
                  T(0),
                  A, 1, m);

    // A = P^{-1}*A
    swap(m, n, p, incP, std::min(m,n)-1, 0, A, 1, m);

    ulmblas::geaxpy(m, n, -1, A0, incRowA0, incColA0, A, 1, m);

    auto eps = std::numeric_limits<T>::epsilon();

    delete [] U;
    delete [] L;
    delete [] A;

    return norminf(m, n, A, 1, m)
         / (norminf(m, n, A0, incRowA0, incColA0)*eps*std::min(m,n));
}


} // namespace test

#endif //  TEST_HPP
#include <cstdlib>
#include <cmath>
#include <cstdio>
#include <printf.hpp>

#include "test.hpp"
#include "ulmblas_level1.hpp"
#include "ulmblas_level2.hpp"

template <typename TA>
void
initMatrix(std::size_t m, std::size_t n,
           TA *A, std::ptrdiff_t incRowA, std::ptrdiff_t incColA,
           bool withNan = false)
{
    // if A is row major initialize A^T
    if (std::abs(incRowA) > std::abs(incColA)) {
        initMatrix(n, m, A, incColA, incRowA, withNan);
        return;
    }
    // if A is col major
    if (withNan) {
        for (size_t j=0; j<n; ++j) {
            for (size_t i=0; i<m; ++i) {
                A[i*incRowA+j*incColA] = nan("");
            }
        }
    } else {
        for (size_t j=0; j<n; ++j) {
            for (size_t i=0; i<m; ++i) {
                double rValue = ((double)rand() - RAND_MAX/2)*2/RAND_MAX;
                A[i*incRowA+j*incColA] = rValue;
            }
        }
    }
}

template <typename TA>
void
printMatrix(std::size_t m, std::size_t n,
            const TA *A, std::ptrdiff_t incRowA, std::ptrdiff_t incColA)
{
    for (std::size_t i=0; i<m; ++i) {
        for (std::size_t j=0; j<n; ++j) {
            fmt::printf("%10.3lf ", A[i*incRowA+j*incColA]);
        }
        std::printf("\n");
    }
    std::printf("\n");
}

//------------------------------------------------------------------------------

template <typename TX>
std::size_t
iamax(std::size_t n, const TX *x, std::ptrdiff_t incX)
{
    // TODO: Your code
}

template <typename TA, typename TP>
std::ptrdiff_t
lu_ger(std::size_t m, std::size_t n,
       TA *A, std::ptrdiff_t incRowA, std::ptrdiff_t incColA,
       TP *p, std::ptrdiff_t incP)
{
    // TODO: Your code
}

//------------------------------------------------------------------------------


#ifndef COLMAJOR
#define COLMAJOR 1
#endif

int
main()
{
    std::size_t    m = 5;
    std::size_t    n = 6;
    std::ptrdiff_t incRowA = COLMAJOR ? 1 : n;
    std::ptrdiff_t incColA = COLMAJOR ? m : 1;
    std::ptrdiff_t incP    = 1;

    double *A       = new double[m*n];
    double *A0      = new double[m*n];
    std::size_t *p  = new std::size_t[std::min(m,n)];

    initMatrix(m, n, A, incRowA, incColA);
    ulmblas::gecopy(m, n, A, incRowA, incColA, A0, incRowA, incColA);

    printMatrix(m, n, A0, incRowA, incColA);

    std::ptrdiff_t info = lu_ger(m, n, A, incRowA, incColA, p, incP);

    printMatrix(m, n, A, incRowA, incColA);
    printMatrix(std::min(m, n), 1, p, incP, 1);
    fmt::printf("info = %d\n", info);

    double err = test::lu_err(m, n,
                              A0, incRowA, incColA,
                              A, incRowA, incColA,
                              p, incP);

    std::printf("err = %e\n", err);

    delete [] A;
    delete [] A0;
    delete [] p;
}