TRLSM

Die TRLSM-Operation (triangular lower solver for matrices) soll für eine untere \(m\times m\) Matrix \(L\) und \(m \times n\) Matrix \(B\) die Operation

\[B \leftarrow L^{-1} \left( \alpha\,B \right)\]

durchführen. Durch eine blockweise Berechnung kann dabei die effiziente GEMM-Operation ausgenutzt werden.

Ungeblockte Variante

Implementiert zunächst eine einfache Variante basierend auf der Level 2 Operation dtrlsv. Zum Test könnt ihr die unten stehende Vorlage benutzen.

Einfacher Block-Algorithmus

Die Matrix \(B\) soll mit dem Ergebnis der Operation überschrieben werden. Beim Aufschreiben des Algorithmus unterscheiden wir aber zunächste zwischen der ursprünglichen Matrix \(B\) und der Lösung \(X = L^{-1} \left(\alpha\,B\right)\).

Wir betrachtet zunächst eine einfache Partitionierung des Problems:

\[\left(\begin{array}{c|c} L_{1,1} & \\ \hline L_{2,1} & L_{2,2} \end{array}\right)\left(\begin{array}{c|c} X_{1,1} & X_{1,2} \\ \hline X_{2,1} & X_{2,2} \end{array}\right)=\alpha\,\left(\begin{array}{c|c} B_{1,1} & B_{1,2} \\ \hline B_{2,1} & B_{2,2} \end{array}\right)\]

Multipliziert man die linke Seite aus, so erhält man

\[\left(\begin{array}{l|l} L_{1,1} X_{1,1} & L_{1,1} X_{1,2} \\ \hline L_{2,1} X_{1,1} + L_{2,2} X_{2,1} & L_{2,1} X_{1,2} + L_{2,2} X_{2,2} \end{array}\right)=\left(\begin{array}{c|c} \alpha B_{1,1} & \alpha B_{1,2} \\ \hline \alpha B_{2,1} & \alpha B_{2,2} \end{array}\right)\]

Eine Grundidee für einen Algorithmus erhält man indem man teilweise auflöst:

\[\begin{array}{lcllcl}X_{1,1} &=& L_{1,1}^{-1} \alpha B_{1,1}, & X_{1,2} &=& L_{1,1}^{-1} \alpha B_{1,2} \\L_{2,2} X_{2,1} &=& \alpha B_{2,1} - L_{2,1} X_{1,1}, & L_{2,2} X_{2,2} &=& \alpha B_{2,2} - L_{2,1} X_{1,2}\end{array}\]

Verallgemeinert zunächst den Algorithmus für den Fall, dass \(L\) bezüglich einem Blockfaktor \(M_c\) in quadratische Blöcke zerlegt wird und \(B\) bezüglich \(M_c\) und \(N_c\) zeilen- und spaltenweise partitioniert wird.

Vorlage für Test und Benchmark

Das gesamte Projekt (hpc_project.tgz) enthält einen Fehlerschätzer und eine Referenzimplementierung für den Dreieckslöser.

#include <stdio.h>
#include <bench/aux.h>
#include <bench/refblas.h>
#include <bench/errbound.h>
#include <ulmblas/level1.h>
#include <ulmblas/level2.h>
#include <ulmblas/level3.h>


void
dtrlsm_simple(int m, int n, double alpha, int unitDiag,
              const double *A, int incRowA, int incColA,
              double *B, int incRowB, int incColB)
{
    // ... your code here ...
}

#ifndef MIN_M
#define MIN_M 100
#endif

#ifndef MAX_M
#define MAX_M 1500
#endif

#ifndef INC_M
#define INC_M 100
#endif

#ifndef MIN_N
#define MIN_N 100
#endif

#ifndef MAX_N
#define MAX_N 1500
#endif

#ifndef INC_N
#define INC_N 100
#endif

#ifndef ROWMAJOR
#define ROWMAJOR 0
#endif

#ifndef ALPHA
#define ALPHA 2.5
#endif

#ifndef UNITDIAG
#define UNITDIAG 0
#endif

#if (ROWMAJOR==1)
#   define INCROW_A  MAX_M
#   define INCCOL_A  1
#else
#   define INCROW_A  1
#   define INCCOL_A  MAX_M
#endif

#if (ROWMAJOR==1)
#   define INCROW_B  MAX_N
#   define INCCOL_B  1
#else
#   define INCROW_B  1
#   define INCCOL_B  MAX_M
#endif

double A[MAX_M*MAX_M];
double B[MAX_M*MAX_M];
double X0[MAX_M*MAX_N];
double X1[MAX_M*MAX_N];

int
main()
{
    int m, n;

    randGeMatrix(MAX_M, MAX_M, A, INCROW_A, INCCOL_A);
    makeTrlDiagDom(MAX_M, UNITDIAG, A, INCROW_A, INCCOL_A);
    randGeMatrix(MAX_M, MAX_N, B, INCROW_B, INCCOL_B);

    printf("#UNITDIAG=%3d\n", UNITDIAG);
    printf("# %51s %25s\n", "dtrlsv_ref", "dtrlsv_row");
    printf("#%9s %9s %9s", "n", "INCROW_A", "INCCOL_A");
    printf(" %9s %9s", "INCROW_B", "INCCOL_B");
    printf(" %12s %12s", "t", "MFLOPS");
    printf(" %12s %12s %12s", "t", "MFLOPS", "err");
    printf("\n");

    for (m=MIN_M, n=MIN_N; m<=MAX_M && n<=MAX_N; m+=INC_M, n+=INC_N) {
        int     runs = 1;
        double  ops = (double)n*m*(m+1)/2 + (double)n*m*(m-1)/2;
        double  t0, dt, err;

        printf(" %9d %9d %9d", n, INCROW_A, INCCOL_A);
        printf(" %9d %9d", INCROW_B, INCCOL_B);

        t0   = 0;
        runs = 0;
        do {
            dgecopy(m, n, B, INCROW_B, INCCOL_B, X0, INCROW_B, INCCOL_B);
            dt = walltime();

            dtrlsm_ref(m, n, (double)ALPHA, UNITDIAG,
                       A, INCROW_A, INCCOL_A,
                       X0, INCROW_B, INCCOL_B);

            dt = walltime() - dt;
            t0 += dt;
            ++runs;
        } while (t0<0.3);
        t0 /= runs;

        printf(" %12.2e %12.2lf", t0, ops/(1000*1000*t0));

        t0   = 0;
        runs = 0;
        do {
            dgecopy(m, n, B, INCROW_B, INCCOL_B, X1, INCROW_B, INCCOL_B);
            dt = walltime();

            dtrlsm_simple(m, n, (double)ALPHA, UNITDIAG,
                          A, INCROW_A, INCCOL_A,
                          X1, INCROW_B, INCCOL_B);

            dt = walltime() - dt;
            t0 += dt;
            ++runs;
        } while (t0<0.3);
        t0 /= runs;

        err = err_dtrsm(m, n, ALPHA, UNITDIAG, 1,
                        A, INCROW_A, INCCOL_A,
                        X0, INCROW_B, INCCOL_B,
                        X1, INCROW_B, INCCOL_B);

        printf(" %12.2e %12.2lf %12.2e", t0, ops/(1000*1000*t0), err);
        printf("\n");

    }

    return 0;
}