Geblockte LU-Zerlegung

Ziel: Implementiert in der Funktion hpc::ulmlapack::getrf die geblockte LU-Zerlegung mit partieller Pivotisierung.

Idee für den Algorithmus ohne Pivotisierung

Wir wählen eine bestimmte Blockgröße bs (zum Beispiel bs=64). In ersten Schritt betrachten wir für eine Matrix \(A\) die Partitionierung entlang der ersten bs Spalten:

\[A = \left(\begin{array}{c|c} A_1 & A_2 \end{array}\right)\]

Für den Block \(A_1\) bestimmen wir mit der ungeblockten LU-Zerlegung

\[A_1 = L_1 U_1\]

mit

\[L_1 = \left(\begin{array} L_{1,1} \\ \hline L_{2,1} \end{array}\right), \quad L_{1,1} \in \mathbb{R}^{bs \times bs}\]

und

\[U_1 = \left(\begin{array} U_{1,1} \\ \hline \mathbf{0} \end{array}\right), \quad U_{1,1} \in \mathbb{R}^{bs \times bs}\]

Analog kann man nun die Partitionierung

\[A = \left(\begin{array}{c|c} A_1 & A_2 \end{array}\right) = \left(\begin{array}{c|c} A_{1,1} & A_{1,2} \\ \hline A_{2,1} & A_{2,2} \end{array}\right) \quad\text{mit} A_{1,1} \in \mathbb{R}^{bs \times bs}\]

betrachten. Dann folgt

\[A = L U\quad\Leftrightarrow\quad\left(\begin{array}{c|c} A_{1,1} & A_{1,2} \\ \hline A_{2,1} & A_{2,2} \end{array}\right)=\left(\begin{array}{c|c} L_{1,1} & \mathbf{0} \\ \hline L_{2,1} & L_{2,2} \end{array}\right)\left(\begin{array}{c|c} U_{1,1} & U_{1,2} \\ \hline \mathbf{0} & U_{2,2} \end{array}\right)\]

Aufgaben

Leitet damit zunächst einen Algorithmus für eine geblockte LU-Zerlegung ohne Pivotisierung her. Dieser soll als Operationen lediglich die ungeblockte LU-Zerlegung, das Matrix-Produkt und das Vorwärtseinsetzen benutzen.

Haltet außerdem fest, welche Dimensionen die einzelen Blöcke besitzen. Beachtet dabei, dass \(A\) nicht quadratisch sein muss. Die Blockgröße bs soll dabei beliebig aber fest sein!

Pivotisierung einbauen

Die geblockte LU-Zerlegung soll bei der Zerlegung \(A = PLU\) die Permutationsmatrix durch einen Pivotvektor \(p\) repräsentieren. Wir betrachten die Partitionierung

\[A = \left(\begin{array}{c|c} A_1 & A_2 & A_3 \end{array}\right) = \left(\begin{array}{c|c} A_{1,1} & A_{1,2} & A_{1,3} \\ \hline A_{2,1} & A_{2,2} & A_{2,3} \\ \hline A_{3,1} & A_{3,2} & A_{3,3} \\ \end{array}\right) \quad\text{mit} A_{2,2} \in \mathbb{R}^{bs \times bs}\]

Aufgaben

Wie sind die Dimensionen der einzelnen Blöcke?
Die Blöcke \(A_{1,1}\), \(A_{2,1}\), \(A_{3,1}\), \(A_{1,2}\) und \(A_{1,3}\) seien bereits mit der LU-Zerlegung überschrieben:
- Welche Zeilen könnten dann bereits vertauscht worden sein?
- Welcher Teil des Pivotvektors enthält dann schon die endgültigen Werte?
- Wie könnte man bei der Restmatrix
  \[A = \left(\begin{array}{c|c} \overline{A}_1 & \overline{A}_2 \end{array}\right) = \left(\begin{array}{c|c} A_{2,2} & A_{2,3} \\ \hline A_{3,2} & A_{3,3} \\ \end{array}\right)\]
  durch Anwenden der ungeblockten LU-Zerlegung auf \(\overline{A}_1\) weiter vorgehen?

Vorlage für die Implementierung

Als Vorlage für die geblockte LU-Zerlegung hpc::ulmlapack::getrf im Verzeichnis hpc/ulmlapack/ könnt ihr folgendes Gerüst verwenden:

#ifndef HPC_ULMLAPACK_GETRF_H
#define HPC_ULMLAPACK_GETRF_H 1

namespace hpc { namespace ulmlapack {

template <typename Index, typename TA, typename TP>
Index
getrf(Index m, Index n, TA *A, Index incRowA, Index incColA, TP *p, Index incP)
{
    Index mn   = std::min(m,n);
    Index bs   = 64;
    Index info = -1;

    if (bs<=1 || bs>mn) {
        /* Matrix zu klein -> ungeblockte Variante aufrufen */
    } else {
        /* Geblockte LU-Zerlegung */
    }
    return info;
}

} } // namespace ulmblas, hpc

#endif // HPC_ULMLAPACK_GETRF_H

Test für die geblockte LU-Zerlegung

Um die Implementierung zu testen und einen Benchmark zu erzeugen, kann folgendes Programm benutzt werden:

session17/hpc/tests/lu_blocked.cc

#include <cassert>
#include <random>
#include <hpc/aux/walltime.h>
#include <hpc/matvec/copy.h>
#include <hpc/matvec/gematrix.h>
#include <hpc/matvec/mm.h>
#include <hpc/matvec/print.h>
#include <hpc/ulmlapack/getrf.h>

using namespace hpc::matvec;

template <typename T, typename I>
double
checkLU(const GeMatrixView<T, I> &A,
        GeMatrixView<T, I> &A_,
        const GeMatrixView<I, I> &p)
{
    assert(A.numRows==A_.numRows);
    assert(A.numCols==A_.numCols);
    assert(A.numCols>=A.numRows);
    assert(A.numRows==p.numRows);

    typedef typename GeMatrix<T, I>::Index  Index;

    Index m = A.numRows;
    Index n = A.numRows;

    // Copy L form A_ and set A_ to U
    GeMatrix<T, I> L(m, m);
    for (Index j=0; j<m; ++j) {
        for (Index i=0; i<m; ++i) {
            if (i==j) {
                L(i,i) = 1;
            } else if (i>j) {
                L(i,j)  = A_(i,j);
                A_(i,j) = 0;
            } else if (j>i) {
                L(i,j)  = 0;
            }
        }
    }

    // Compute LU = L*U
    GeMatrix<T, I> LU(m, n);
    for (Index j=0; j<n; ++j) {
        for (Index i=0; i<m; ++i) {
            LU(i,j) = 0;
        }
    }
    hpc::matvec::mm(T(1), L, A_, T(0), LU);

    // Apply P
    for (Index i=m; i>0; --i) {
        if (i-1!=p(i-1,0)) {
            hpc::ulmblas::swap(n,
                               &LU(i-1,0), LU.incCol,
                               &LU(p(i-1,0),0), LU.incCol);
        }
    }

    double diff = 0;
    for (Index j=0; j<n; ++j) {
        for (Index i=0; i<m; ++i) {
            diff += std::abs(A(i,j)-LU(i,j));
        }
    }
    return diff;
}

//
//  Random initializer for general matrices: real and complex valued
//
template <typename Index, typename T>
void
randomInit(Index m, Index n, T *A, Index incRowA, Index incColA)
{
    std::random_device                  random;
    std::default_random_engine          mt(random());
    std::uniform_real_distribution<T>   uniform(-100,100);

    for (Index i=0; i<m; ++i) {
        for (Index j=0; j<n; ++j) {
            A[i*incRowA+j*incColA] = uniform(mt);
        }
    }
}

int
main()
{
    const std::ptrdiff_t   m_min = 10;
    const std::ptrdiff_t   n_min = 10;

    const std::ptrdiff_t   m_max = 2000;
    const std::ptrdiff_t   n_max = 2000;

    const std::ptrdiff_t   m_inc = 10;
    const std::ptrdiff_t   n_inc = 10;

    GeMatrix<double, std::ptrdiff_t>      A(m_max, n_max);
    GeMatrix<double, std::ptrdiff_t>      A_org(m_max, n_max);
    GeMatrix<std::ptrdiff_t, std::ptrdiff_t>        p(m_max, 1);
    hpc::aux::WallTime<double>  timer;

    for (std::ptrdiff_t m=m_min, n=n_min; m<=m_max && n<=n_max; m+=m_inc, n+=n_inc) {

        randomInit(m, n, A.data, A.incRow, A.incCol);
        hpc::ulmblas::gecopy(m, n,
                             A.data, A.incRow, A.incCol,
                             A_org.data, A_org.incRow, A_org.incCol);
        //print(A_org, "A_org");

        timer.tic();
        std::ptrdiff_t info = hpc::ulmlapack::getrf(m, n,
                                                    A.data, A.incRow, A.incCol,
                                                    p.data, p.incRow);
        double t = timer.toc();
        //print(A, "A");
        //print(p, "p");

        auto A_ = A_org(0, 0, m, n);
        auto LU = A(0, 0, m, n);
        auto p_ = p(0, 0, m, 1);

        double diff = checkLU(A_, LU, p_);
        //print(A, "LU");
        std::printf("%4ld %4ld %4ld %16.3e %16.5lf\n", m, n, info, diff, t);
    }
}

Aufgaben

Implementiert die Funktion hpc::ulmlapack::getrf
Erzeugt Benchmarks, um die Effektivität von getf2 und getrfzu vergleichen. Natürlich sollen diese mit gnuplot schön dargestellt werden.
Verwendet auch die Assembler-Kernel für die Benchmarks:
- Auf Rechnern mit AVX mit -DAVX -DD_BLOCKSIZE_MR=4 -DD_BLOCKSIZE_NR=8 und
- auf Rechnern mit SSE mit -DSSE
übersetzen.