Geblockte LU-Zerlegung

Implementiert die in der Vorlesung besprochende geblockte LU-Variante.

Vorlage

#include <stdio.h>
#include <bench/aux.h>
#include <bench/refblas.h>
#include <bench/errbound.h>
#include <ulmblas/level1.h>
#include <ulmblas/level2.h>
#include <ulmblas/level3.h>
#include <ulmblas/lu.h>

//-- TRLSM with simple cache optimization --------------------------------------

#ifndef DTRLSM_MC
#define DTRLSM_MC 64
#endif

#ifndef DTRLSM_NC
#define DTRLSM_NC 64
#endif

void
dtrlsm_simple_unblk(int m, int n, double alpha, int unitDiag,
                    const double *A, int incRowA, int incColA,
                    double *B, int incRowB, int incColB)
{
    int i, j;

    if (alpha!=1) {
        for (j=0; j<n; ++j) {
            for (i=0; i<m; ++i) {
                B[i*incRowB+j*incColB] *= alpha;
            }
        }
    }
    for (j=0; j<n; ++j) {
        dtrlsv_ref(m, unitDiag, A, incRowA, incColA, &B[j*incColB], incRowB);
    }
}

void
dtrlsm_simple_blk(int m, int n, double alpha, int unitDiag,
                  const double *A, int incRowA, int incColA,
                  double *B, int incRowB, int incColB)
{
    int i, j, l;

    const int NC = DTRLSM_NC;
    const int MC = DTRLSM_MC;

    for (j=0; j<n; j+=NC) {
        const int nc = (j+NC<=n) ? NC : n-j;

        for (i=0; i<m; i+=MC) {
            const int    mc     = (i+MC<=m) ? MC : m-i;
            const double alpha_ = (i==0) ? alpha : 1;

            dtrlsm_simple_unblk(mc, nc, alpha_, unitDiag,
                                &A[i*(incRowA+incColA)], incRowA, incColA,
                                &B[i*incRowB+j*incColB], incRowB, incColB);

            for (l=i+MC; l<m; l+= MC) {
                const int kc = (l+MC<=m) ? MC : m-l;

                dgemm(kc, nc, mc,
                      -1.0,
                      &A[l*incRowA+i*incColA], incRowA, incColA,
                      &B[i*incRowB+j*incColB], incRowB, incColB,
                      alpha_,
                      &B[l*incRowB+j*incColB], incRowB, incColB);
            }
        }
    }
}

//-- LU blocked ----------------------------------------------------------------

void
lu_blk(int m, int n, double *A, int incRowA, int incColA)
{
    // ... your code here ...
}

//------------------------------------------------------------------------------

#ifndef MIN_N
#define MIN_N 100
#endif

#ifndef MAX_N
#define MAX_N 8000
#endif

#ifndef INC_N
#define INC_N 100
#endif

#ifndef MIN_M
#define MIN_M 100
#endif

#ifndef MAX_M
#define MAX_M 8000
#endif

#ifndef INC_M
#define INC_M 100
#endif

#ifndef ROWMAJOR
#define ROWMAJOR 0
#endif

#if (ROWMAJOR==1)
#   define INCROW_A  MAX_N
#   define INCCOL_A  1
#else
#   define INCROW_A  1
#   define INCCOL_A  MAX_M
#endif

#define MIN(X,Y)   ((X)<(Y) ? (X) : (Y))
#define MAX(X,Y)   ((X)>(Y) ? (X) : (Y))

double A_[MAX_N*MAX_N];
double A_0[MAX_N*MAX_N*MIN(INCROW_A,INCCOL_A)];

int
main()
{
    int m, n;

    randGeMatrix(MAX_N, MAX_N, A_, 1, MAX_M);

    printf("#%9s %9s %9s %9s", "m", "n", "INCROW_A", "INCCOL_A");
    printf(" %12s %12s %12s", "t", "MFLOPS", "err");
    printf(" %12s %12s %12s", "t", "MFLOPS", "err");
    printf("\n");

    for (m=MIN_M, n=MIN_N; n<=MAX_N && m<=MAX_M; m+=INC_M, n+=INC_N) {
        double t, dt, err;
        int    runs  = 1;
        int    minMN = MIN(m,n);
        int    maxMN = MAX(m,n);
        double ops   = minMN*minMN*maxMN
                     -(minMN*minMN*minMN/3.)
                     -(minMN*minMN/2.);

        printf(" %9d %9d %9d %9d", m, n, INCROW_A, INCCOL_A);

        t    = 0;
        runs = 0;
        do {
            dgecopy(m, n, A_, 1, MAX_M, A_0, INCROW_A, INCCOL_A);
            dt = walltime();
            lu_unblk_var3(m, n, A_0, INCROW_A, INCCOL_A);
            dt = walltime() - dt;
            t += dt;
            ++runs;
        } while (t<0.3);
        t /= runs;

        err = err_lu(m, n,
                     A_, 1, MAX_M,
                     A_0, INCROW_A, INCCOL_A);

        printf(" %12.2e %12.2lf %12.2e", t, ops/(1000*1000*t), err);

        t    = 0;
        runs = 0;
        do {
            dgecopy(m, n, A_, 1, MAX_M, A_0, INCROW_A, INCCOL_A);
            dt = walltime();
            lu_blk(m, n, A_0, INCROW_A, INCCOL_A);
            dt = walltime() - dt;
            t += dt;
            ++runs;
        } while (t<0.3);
        t /= runs;

        err = err_lu(m, n,
                     A_, 1, MAX_M,
                     A_0, INCROW_A, INCCOL_A);

        printf(" %12.2e %12.2lf %12.2e", t, ops/(1000*1000*t), err);

        printf("\n");
    }

    return 0;
}