=====
TRLSM
=====

Die TRLSM-Operation (triangular lower solver for matrices) soll für eine untere
$m\times m$ Matrix $L$ und $m \times n$ Matrix $B$ die Operation

---- LATEX ---------------------------------------------------------------------
B \leftarrow L^{-1} \left( \alpha\,B \right)
--------------------------------------------------------------------------------

durchführen.  Durch eine blockweise Berechnung kann dabei die effiziente
GEMM-Operation ausgenutzt werden.


Ungeblockte Variante
====================
Implementiert zunächst eine einfache Variante basierend auf der Level 2 Operation
`dtrlsv`.  Zum Test könnt ihr die unten stehende Vorlage benutzen.


Einfacher Block-Algorithmus
===========================

Die Matrix $B$ soll mit dem Ergebnis der Operation überschrieben werden.  Beim
Aufschreiben des Algorithmus unterscheiden wir aber zunächste zwischen der
ursprünglichen Matrix $B$ und der Lösung $X = L^{-1} \left(\alpha\,B\right)$.

Wir betrachtet zunächst eine einfache Partitionierung des Problems:

---- LATEX ---------------------------------------------------------------------
\left(\begin{array}{c|c}
      L_{1,1} &         \\
      \hline
      L_{2,1} & L_{2,2}
      \end{array}\right)
\left(\begin{array}{c|c}
      X_{1,1} & X_{1,2} \\
      \hline
      X_{2,1} & X_{2,2}
      \end{array}\right)
=
\alpha\,
\left(\begin{array}{c|c}
      B_{1,1} & B_{1,2} \\
      \hline
      B_{2,1} & B_{2,2}
      \end{array}\right)
--------------------------------------------------------------------------------

Multipliziert man die linke Seite aus, so erhält man

---- LATEX ---------------------------------------------------------------------
\left(\begin{array}{l|l}
      L_{1,1} X_{1,1}                    & L_{1,1} X_{1,2} \\
      \hline
      L_{2,1} X_{1,1} +  L_{2,2} X_{2,1} & L_{2,1} X_{1,2} +  L_{2,2} X_{2,2}
      \end{array}\right)
=
\left(\begin{array}{c|c}
      \alpha B_{1,1} & \alpha B_{1,2} \\
      \hline
      \alpha B_{2,1} & \alpha B_{2,2}
      \end{array}\right)
--------------------------------------------------------------------------------

Eine Grundidee für einen Algorithmus erhält man indem man teilweise auflöst:

---- LATEX ---------------------------------------------------------------------
\begin{array}{lcllcl}
X_{1,1}          &=& L_{1,1}^{-1} \alpha B_{1,1},      & X_{1,2}          &=& L_{1,1}^{-1} \alpha B_{1,2}      \\
L_{2,2} X_{2,1}  &=& \alpha B_{2,1} - L_{2,1} X_{1,1}, & L_{2,2} X_{2,2}  &=& \alpha B_{2,2} - L_{2,1} X_{1,2}
\end{array}
--------------------------------------------------------------------------------

Verallgemeinert zunächst den Algorithmus für den Fall, dass $L$ bezüglich einem
Blockfaktor $M_c$ in quadratische Blöcke zerlegt wird und $B$ bezüglich $M_c$ und
$N_c$ zeilen- und spaltenweise partitioniert wird.


Vorlage für Test und Benchmark
==============================

Das gesamte Projekt (__hpc_project.tgz__) enthält einen Fehlerschätzer und eine
Referenzimplementierung für den Dreieckslöser.

:links: hpc_project.tgz -> http://www.mathematik.uni-ulm.de/~lehn/hpc_project.tgz

---- CODE(type=c) --------------------------------------------------------------
#include <stdio.h>
#include <bench/aux.h>
#include <bench/refblas.h>
#include <bench/errbound.h>
#include <ulmblas/level1.h>
#include <ulmblas/level2.h>
#include <ulmblas/level3.h>


void
dtrlsm_simple(int m, int n, double alpha, int unitDiag,
              const double *A, int incRowA, int incColA,
              double *B, int incRowB, int incColB)
{
    // ... your code here ...
}

#ifndef MIN_M
#define MIN_M 100
#endif

#ifndef MAX_M
#define MAX_M 1500
#endif

#ifndef INC_M
#define INC_M 100
#endif

#ifndef MIN_N
#define MIN_N 100
#endif

#ifndef MAX_N
#define MAX_N 1500
#endif

#ifndef INC_N
#define INC_N 100
#endif

#ifndef ROWMAJOR
#define ROWMAJOR 0
#endif

#ifndef ALPHA
#define ALPHA 2.5
#endif

#ifndef UNITDIAG
#define UNITDIAG 0
#endif

#if (ROWMAJOR==1)
#   define INCROW_A  MAX_M
#   define INCCOL_A  1
#else
#   define INCROW_A  1
#   define INCCOL_A  MAX_M
#endif

#if (ROWMAJOR==1)
#   define INCROW_B  MAX_N
#   define INCCOL_B  1
#else
#   define INCROW_B  1
#   define INCCOL_B  MAX_M
#endif

double A[MAX_M*MAX_M];
double B[MAX_M*MAX_M];
double X0[MAX_M*MAX_N];
double X1[MAX_M*MAX_N];

int
main()
{
    int m, n;

    randGeMatrix(MAX_M, MAX_M, A, INCROW_A, INCCOL_A);
    makeTrlDiagDom(MAX_M, UNITDIAG, A, INCROW_A, INCCOL_A);
    randGeMatrix(MAX_M, MAX_N, B, INCROW_B, INCCOL_B);

    printf("#UNITDIAG=%3d\n", UNITDIAG);
    printf("# %51s %25s\n", "dtrlsv_ref", "dtrlsv_row");
    printf("#%9s %9s %9s", "n", "INCROW_A", "INCCOL_A");
    printf(" %9s %9s", "INCROW_B", "INCCOL_B");
    printf(" %12s %12s", "t", "MFLOPS");
    printf(" %12s %12s %12s", "t", "MFLOPS", "err");
    printf("\n");

    for (m=MIN_M, n=MIN_N; m<=MAX_M && n<=MAX_N; m+=INC_M, n+=INC_N) {
        int     runs = 1;
        double  ops = (double)n*m*(m+1)/2 + (double)n*m*(m-1)/2;
        double  t0, dt, err;

        printf(" %9d %9d %9d", n, INCROW_A, INCCOL_A);
        printf(" %9d %9d", INCROW_B, INCCOL_B);

        t0   = 0;
        runs = 0;
        do {
            dgecopy(m, n, B, INCROW_B, INCCOL_B, X0, INCROW_B, INCCOL_B);
            dt = walltime();

            dtrlsm_ref(m, n, (double)ALPHA, UNITDIAG,
                       A, INCROW_A, INCCOL_A,
                       X0, INCROW_B, INCCOL_B);

            dt = walltime() - dt;
            t0 += dt;
            ++runs;
        } while (t0<0.3);
        t0 /= runs;

        printf(" %12.2e %12.2lf", t0, ops/(1000*1000*t0));

        t0   = 0;
        runs = 0;
        do {
            dgecopy(m, n, B, INCROW_B, INCCOL_B, X1, INCROW_B, INCCOL_B);
            dt = walltime();

            dtrlsm_simple(m, n, (double)ALPHA, UNITDIAG,
                          A, INCROW_A, INCCOL_A,
                          X1, INCROW_B, INCCOL_B);

            dt = walltime() - dt;
            t0 += dt;
            ++runs;
        } while (t0<0.3);
        t0 /= runs;

        err = err_dtrsm(m, n, ALPHA, UNITDIAG, 1,
                        A, INCROW_A, INCCOL_A,
                        X0, INCROW_B, INCCOL_B,
                        X1, INCROW_B, INCCOL_B);

        printf(" %12.2e %12.2lf %12.2e", t0, ops/(1000*1000*t0), err);
        printf("\n");

    }

    return 0;
}
--------------------------------------------------------------------------------

:navigate: up    -> doc:index
           next  -> doc:session09/page02