Possible solution

Content

#ifndef ULMBLAS_LEVEL1_HPP
#define ULMBLAS_LEVEL1_HPP

#include <cstddef>      // for std::size_t, std::ptrdiff_t
#include <cmath>        // for std::abs (floating point)
#include <cstdlib>      // for std::abs (integer)

namespace ulmblas {

//==============================================================================
//
// BLAS Level 1 functions for vectors
//
//==============================================================================

//
// copy:  y <- x
//
// (adopted from session 3)
//
template <typename TX, typename TY>
void
copy(std::size_t n, const TX *x, std::ptrdiff_t incX,
     TY *y, std::ptrdiff_t incY)
{
    for (std::size_t i=0; i<n; ++i) {
        y[i*incY] = x[i*incX];
    }
}

//
// swap:  x <-> y
//
// (adopted from session 3)
//
template <typename TX, typename TY>
void
swap(std::size_t n, TX *x, std::ptrdiff_t incX, TY *y, std::ptrdiff_t incY)
{
    for (std::size_t i=0; i<n; ++i) {
        TX t = x[i*incX];
        x[i*incX] = y[i*incY];
        y[i*incY] = t;
    }
}

//
// scal: x <- alpha*x
//
// (adopted from session 6)
//
template <typename Alpha, typename TX>
void
scal(std::size_t n, Alpha alpha, TX *x, std::ptrdiff_t incX)
{
    if (alpha==Alpha(1)) {
        return;
    }
    if (alpha==Alpha(0)) {
        for (std::size_t i=0; i<n; ++i) {
            x[i*incX] = 0;
        }
    } else {
        for (std::size_t i=0; i<n; ++i) {
            x[i*incX] *= alpha;
        }
    }
}

//
// axpy: y <- y + alpha*x
//
// (adopted from session 6)
//
template <typename Alpha, typename TX, typename TY>
void
axpy(std::size_t n, Alpha alpha,
     const TX *x, std::ptrdiff_t incX,
     TY *y, std::ptrdiff_t incY)
{
    if (alpha==Alpha(0)) {
        return;
    }
    for (std::size_t i=0; i<n; ++i) {
        y[i*incY] += alpha*x[i*incX];
    }
}

//==============================================================================
//
// BLAS Level 1 functions for matrices
//
//==============================================================================

//
// gecopy: B <- A
//
// (adopted from session 6)
//
template <typename TA, typename TB>
void
gecopy(std::size_t m, std::size_t n,
       const TA *A, std::ptrdiff_t incRowA, std::ptrdiff_t incColA,
       TB *B, std::ptrdiff_t incRowB, std::ptrdiff_t incColB)
{
    if (m==0 || n==0) {
        return;
    }
    // if B is row major:   B^T <- A^T
    if (std::abs(incRowB) > std::abs(incColB)) {
        gecopy(n, m, A, incColA, incRowA, B, incColB, incRowB);
        return;
    }
    // B is col major:
    for (std::size_t j=0; j<n; ++j) {
        for (std::size_t i=0; i<m; ++i) {
            B[i*incRowB+j*incColB] = A[i*incRowA+j*incColA];
        }
    }
}

//
// gescal: A <- alpha*A
//
// (adopted from session 6)
//
template <typename Alpha, typename TA>
void
gescal(std::size_t m, std::size_t n, Alpha alpha,
       TA *A, std::ptrdiff_t incRowA, std::ptrdiff_t incColA)
{
    if (alpha==Alpha(1) || m==0 || n==0) {
        return;
    }
    // if A is row major: A^T <- alpha*A^T
    if (std::abs(incRowA) > std::abs(incColA)) {
        gescal(n, m, alpha, A, incColA, incRowA);
        return;
    }
    // A is col major:
    if (alpha!=Alpha(0)) {
        // Scale A column wise with alpha
       for (std::size_t j=0; j<n; ++j) {
           for (std::size_t i=0; i<m; ++i) {
               A[i*incRowA+j*incColA] *= alpha;
           }
        }
    } else {
        // A might contain Nan values: overwrite with zeros
        for (std::size_t j=0; j<n; ++j) {
            for (std::size_t i=0; i<m; ++i) {
                A[i*incRowA+j*incColA] = 0;
            }
        }
    }
}

//
// geaxpy: B <- B + alpha*A
//
// (adopted from session 6)
//
template <typename Alpha, typename TA, typename TB>
void
geaxpy(std::size_t m, std::size_t n, Alpha alpha,
       const TA *A, std::ptrdiff_t incRowA, std::ptrdiff_t incColA,
       TB *B, std::ptrdiff_t incRowB, std::ptrdiff_t incColB)
{
    if (m==0 || n==0) {
        return;
    }
    // if B is row major:   B^T <- alpha*A^T + B^T
    if (std::abs(incRowB) > std::abs(incColB)) {
        geaxpy(n, m, alpha, A, incColA, incRowA, B, incColB, incRowB);
        return;
    }
    // B is col major:
    for (std::size_t j=0; j<n; ++j) {
        for (std::size_t i=0; i<m; ++i) {
            B[i*incRowB+j*incColB] += alpha*A[i*incRowA+j*incColA];
        }
    }
}

} // namespace ulmblas

#endif // ULMBLAS_LEVEL1_HPP
#ifndef ULMBLAS_LEVEL2_HPP
#define ULMBLAS_LEVEL2_HPP

#include <cstddef>      // for std::size_t, std::ptrdiff_t
#include <cmath>        // for std::abs (floating point)
#include <cstdlib>      // for std::abs (integer)

namespace ulmblas {

//==============================================================================
//
// BLAS Level 2 functions
//
//==============================================================================

//
//  ger: A <- A + alpha * x*y^T  (rank 1 update)
//
template <typename Alpha, typename TX, typename TY, typename TA>
void
ger(std::size_t m, std::size_t n, Alpha alpha,
    const TX *x, std::ptrdiff_t incX,
    const TY *y, std::ptrdiff_t incY,
    TA *A, std::ptrdiff_t incRowA, std::ptrdiff_t incColA)
{
    if (m==0 || n==0 || alpha==Alpha(0)) {
        return;
    }
    // if A is row major:   A^T <- A^T + alpha * y*x^T
    if (std::abs(incRowA) > std::abs(incColA)) {
        ger(n, m, alpha, y, incY, x, incX, A, incColA, incRowA);
        return;
    }
    // A is col major:
    for (std::size_t j=0; j<n; ++j) {
        for (std::size_t i=0; i<m; ++i) {
            A[i*incRowA+j*incColA] += alpha * x[i*incX]*y[j*incY];
        }
    }
}

} // namespace ulmblas

#endif // ULMBLAS_LEVEL2_HPP
#ifndef ULMBLAS_LEVEL3_HPP
#define ULMBLAS_LEVEL3_HPP

#include <cstddef>              // for std::size_t, std::ptrdiff_t
#include <cmath>                // for std::abs (floating point)
#include <cstdlib>              // for std::abs (integer)

#include "ulmblas_level1.hpp"   // for gescal

namespace ulmblas {

//==============================================================================
//
// BLAS Level 3 functions
//
//==============================================================================

//
//  gemm: C <- beta*C + alpha * A * B
//
//  Note: This is just a simple reference implementation!
//
template <typename Alpha, typename TA, typename TB, typename Beta, typename TC>
void
gemm(std::size_t m, std::size_t n, std::size_t k,
     Alpha alpha,
     const TA *A, std::ptrdiff_t incRowA, std::ptrdiff_t incColA,
     const TB *B, std::ptrdiff_t incRowB, std::ptrdiff_t incColB,
     Beta beta,
     TC *C, std::ptrdiff_t incRowC, std::ptrdiff_t incColC)
{
    gescal(m, n, beta, C, incRowC, incColC);
    if (m==0 || n==0 || k==0 || alpha==Alpha(0)) {
        return;
    }
    for (std::size_t l=0; l<k; ++l) {
        for (std::size_t j=0; j<n; ++j) {
            for (std::size_t i=0; i<m; ++i) {
                C[i*incRowC+j*incColC] += alpha*A[i*incRowA+l*incColA]
                                               *B[l*incRowB+j*incColB];
            }
        }
    }
}

} // namespace ulmblas

#endif // ULMBLAS_LEVEL2_HPP
#ifndef TEST_HPP
#define TEST_HPP

#include <cmath>
#include <limits>
#include <algorithm>                 // for std::min
#include "ulmblas_level1.hpp"        // for ulmblas::swap(), ulmblas::geaxpy()
#include "ulmblas_level3.hpp"        // for ulmblas::gemm()

namespace test {

template <typename TA>
auto
norminf(std::size_t m, std::size_t n,
        const TA *A, std::ptrdiff_t incRowA, std::ptrdiff_t incColA)
{
    TA res = 0;
    for (std::size_t i=0; i<m; ++i) {
        TA asum = 0;
        for (std::size_t j=0; j<n; ++j) {
            asum += std::abs(A[i*incRowA + j*incColA]);
        }
        if (std::isnan(asum)) {
            return asum;
        }
        if (asum>res) {
            res = asum;
        }
    }
    return res;
}

template <typename TP, typename TA>
void
swap(std::size_t m, std::size_t n, const TP *p, std::ptrdiff_t incP,
     std::size_t i0, std::size_t i1,
     TA *A, std::ptrdiff_t incRowA, std::ptrdiff_t incColA)
{
    std::ptrdiff_t inc = i0<i1 ? 1 : -1;

    i1 += inc;

    for (std::size_t i=i0; i!=i1; i+=inc) {
        if (i!=p[i*incP]) {
            ulmblas::swap(n,
                          &A[i*incRowA], incColA,
                          &A[p[i*incP]*incRowA], incColA);
        }
    }
}

template <typename T, typename TP>
double
lu_err(std::size_t m, std::size_t n,
       const T *A0, std::ptrdiff_t incRowA0, std::ptrdiff_t incColA0,
       const T *LU, std::ptrdiff_t incRowLU, std::ptrdiff_t incColLU,
       const TP *p, std::ptrdiff_t incP)
{
    std::size_t k = std::min(m, n);

    T *A = new T[m*n];
    T *L = new T[m*k];
    T *U = new T[k*n];

    // copy L-part from A
    for (std::size_t l=0; l<k; ++l) {
        for (std::size_t i=0; i<m; ++i) {
            L[i+l*m] = (i>l)  ? LU[i*incRowLU + l*incColLU] :
                       (i==l) ? T(1)   :
                                T(0);
        }
    }
    // copy U-part from A
    for (std::size_t j=0; j<n; ++j) {
        for (std::size_t l=0; l<k; ++l) {
            U[l + j*k] = (l>j)  ? T(0)
                                : LU[l*incRowLU + j*incColLU];
        }
    }

    // A = L*U
    ulmblas::gemm(m, n, k,T(1),
                  L, 1, m,
                  U, 1, k,
                  T(0),
                  A, 1, m);

    // A = P^{-1}*A
    swap(m, n, p, incP, std::min(m,n)-1, 0, A, 1, m);

    ulmblas::geaxpy(m, n, -1, A0, incRowA0, incColA0, A, 1, m);

    auto eps = std::numeric_limits<T>::epsilon();

    delete [] U;
    delete [] L;
    delete [] A;

    return norminf(m, n, A, 1, m)
         / (norminf(m, n, A0, incRowA0, incColA0)*eps*std::min(m,n));
}


} // namespace test

#endif //  TEST_HPP
#include <cstdlib>
#include <cmath>
#include <cstdio>
#include <printf.hpp>

#include "test.hpp"
#include "ulmblas_level1.hpp"
#include "ulmblas_level2.hpp"

template <typename TA>
void
initMatrix(std::size_t m, std::size_t n,
           TA *A, std::ptrdiff_t incRowA, std::ptrdiff_t incColA,
           bool withNan = false)
{
    // if A is row major initialize A^T
    if (std::abs(incRowA) > std::abs(incColA)) {
        initMatrix(n, m, A, incColA, incRowA, withNan);
        return;
    }
    // if A is col major
    if (withNan) {
        for (size_t j=0; j<n; ++j) {
            for (size_t i=0; i<m; ++i) {
                A[i*incRowA+j*incColA] = nan("");
            }
        }
    } else {
        for (size_t j=0; j<n; ++j) {
            for (size_t i=0; i<m; ++i) {
                double rValue = ((double)rand() - RAND_MAX/2)*2/RAND_MAX;
                A[i*incRowA+j*incColA] = rValue;
            }
        }
    }
}

template <typename TA>
void
printMatrix(std::size_t m, std::size_t n,
            const TA *A, std::ptrdiff_t incRowA, std::ptrdiff_t incColA)
{
    for (std::size_t i=0; i<m; ++i) {
        for (std::size_t j=0; j<n; ++j) {
            fmt::printf("%10.3lf ", A[i*incRowA+j*incColA]);
        }
        std::printf("\n");
    }
    std::printf("\n");
}

//------------------------------------------------------------------------------

template <typename TX>
std::size_t
iamax(std::size_t n, const TX *x, std::ptrdiff_t incX)
{
    TX amax          = 0;
    std::size_t res = 0;

    for (std::size_t i=0; i<n; ++i) {
        if (std::abs(x[i*incX]) > amax) {
            amax = std::abs(x[i*incX]);
            res  = i;
        }
    }
    return res;
}

template <typename TA, typename TP>
std::ptrdiff_t
lu_ger(std::size_t m, std::size_t n,
       TA *A, std::ptrdiff_t incRowA, std::ptrdiff_t incColA,
       TP *p, std::ptrdiff_t incP)
{
    std::size_t mn = std::min(m,n);

    for (std::size_t j=0; j<mn; ++j) {
        // pivoting
        std::size_t jp = j + iamax(m-j, &A[j*incRowA+j*incColA], incRowA);
        if (jp!=j) {
            ulmblas::swap(n, &A[j*incRowA], incColA, &A[jp*incRowA], incColA);
        }
        p[j*incP] = jp;
        if (A[j*incRowA+j*incColA]==TA(0)) {
            return j;
        }

        // apply gauss
        ulmblas::scal(m-j-1,
                      1/A[j*incRowA+j*incColA],
                      &A[(j+1)*incRowA + j*incColA], incRowA);
        ulmblas::ger(m-j-1, n-j-1,
                     -1,
                     &A[(j+1)*incRowA + j*incColA], incRowA,
                     &A[j*incRowA + (j+1)*incColA], incColA,
                     &A[(j+1)*incRowA + (j+1)*incColA], incRowA, incColA);
    }
    return -1;
}

//------------------------------------------------------------------------------


#ifndef COLMAJOR
#define COLMAJOR 1
#endif

int
main()
{
    std::size_t    m = 5;
    std::size_t    n = 6;
    std::ptrdiff_t incRowA = COLMAJOR ? 1 : n;
    std::ptrdiff_t incColA = COLMAJOR ? m : 1;
    std::ptrdiff_t incP    = 1;

    double *A       = new double[m*n];
    double *A0      = new double[m*n];
    std::size_t *p  = new std::size_t[std::min(m,n)];

    initMatrix(m, n, A, incRowA, incColA);
    ulmblas::gecopy(m, n, A, incRowA, incColA, A0, incRowA, incColA);

    printMatrix(m, n, A0, incRowA, incColA);

    std::ptrdiff_t info = lu_ger(m, n, A, incRowA, incColA, p, incP);

    printMatrix(m, n, A, incRowA, incColA);
    printMatrix(std::min(m, n), 1, p, incP, 1);
    fmt::printf("info = %d\n", info);

    double err = test::lu_err(m, n,
                              A0, incRowA, incColA,
                              A, incRowA, incColA,
                              p, incP);

    std::printf("err = %e\n", err);

    delete [] A;
    delete [] A0;
    delete [] p;
}