Geblockte LU-Zerlegung
Implementiert die in der Vorlesung besprochende geblockte LU-Variante.
Vorlage
#include <stdio.h> #include <bench/aux.h> #include <bench/refblas.h> #include <bench/errbound.h> #include <ulmblas/level1.h> #include <ulmblas/level2.h> #include <ulmblas/level3.h> #include <ulmblas/lu.h> //-- TRLSM with simple cache optimization -------------------------------------- #ifndef DTRLSM_MC #define DTRLSM_MC 64 #endif #ifndef DTRLSM_NC #define DTRLSM_NC 64 #endif void dtrlsm_simple_unblk(int m, int n, double alpha, int unitDiag, const double *A, int incRowA, int incColA, double *B, int incRowB, int incColB) { int i, j; if (alpha!=1) { for (j=0; j<n; ++j) { for (i=0; i<m; ++i) { B[i*incRowB+j*incColB] *= alpha; } } } for (j=0; j<n; ++j) { dtrlsv_ref(m, unitDiag, A, incRowA, incColA, &B[j*incColB], incRowB); } } void dtrlsm_simple_blk(int m, int n, double alpha, int unitDiag, const double *A, int incRowA, int incColA, double *B, int incRowB, int incColB) { int i, j, l; const int NC = DTRLSM_NC; const int MC = DTRLSM_MC; for (j=0; j<n; j+=NC) { const int nc = (j+NC<=n) ? NC : n-j; for (i=0; i<m; i+=MC) { const int mc = (i+MC<=m) ? MC : m-i; const double alpha_ = (i==0) ? alpha : 1; dtrlsm_simple_unblk(mc, nc, alpha_, unitDiag, &A[i*(incRowA+incColA)], incRowA, incColA, &B[i*incRowB+j*incColB], incRowB, incColB); for (l=i+MC; l<m; l+= MC) { const int kc = (l+MC<=m) ? MC : m-l; dgemm(kc, nc, mc, -1.0, &A[l*incRowA+i*incColA], incRowA, incColA, &B[i*incRowB+j*incColB], incRowB, incColB, alpha_, &B[l*incRowB+j*incColB], incRowB, incColB); } } } } //-- LU blocked ---------------------------------------------------------------- void lu_blk(int m, int n, double *A, int incRowA, int incColA) { // ... your code here ... } //------------------------------------------------------------------------------ #ifndef MIN_N #define MIN_N 100 #endif #ifndef MAX_N #define MAX_N 8000 #endif #ifndef INC_N #define INC_N 100 #endif #ifndef MIN_M #define MIN_M 100 #endif #ifndef MAX_M #define MAX_M 8000 #endif #ifndef INC_M #define INC_M 100 #endif #ifndef ROWMAJOR #define ROWMAJOR 0 #endif #if (ROWMAJOR==1) # define INCROW_A MAX_N # define INCCOL_A 1 #else # define INCROW_A 1 # define INCCOL_A MAX_M #endif #define MIN(X,Y) ((X)<(Y) ? (X) : (Y)) #define MAX(X,Y) ((X)>(Y) ? (X) : (Y)) double A_[MAX_N*MAX_N]; double A_0[MAX_N*MAX_N*MIN(INCROW_A,INCCOL_A)]; int main() { int m, n; randGeMatrix(MAX_N, MAX_N, A_, 1, MAX_M); printf("#%9s %9s %9s %9s", "m", "n", "INCROW_A", "INCCOL_A"); printf(" %12s %12s %12s", "t", "MFLOPS", "err"); printf(" %12s %12s %12s", "t", "MFLOPS", "err"); printf("\n"); for (m=MIN_M, n=MIN_N; n<=MAX_N && m<=MAX_M; m+=INC_M, n+=INC_N) { double t, dt, err; int runs = 1; int minMN = MIN(m,n); int maxMN = MAX(m,n); double ops = minMN*minMN*maxMN -(minMN*minMN*minMN/3.) -(minMN*minMN/2.); printf(" %9d %9d %9d %9d", m, n, INCROW_A, INCCOL_A); t = 0; runs = 0; do { dgecopy(m, n, A_, 1, MAX_M, A_0, INCROW_A, INCCOL_A); dt = walltime(); lu_unblk_var3(m, n, A_0, INCROW_A, INCCOL_A); dt = walltime() - dt; t += dt; ++runs; } while (t<0.3); t /= runs; err = err_lu(m, n, A_, 1, MAX_M, A_0, INCROW_A, INCCOL_A); printf(" %12.2e %12.2lf %12.2e", t, ops/(1000*1000*t), err); t = 0; runs = 0; do { dgecopy(m, n, A_, 1, MAX_M, A_0, INCROW_A, INCCOL_A); dt = walltime(); lu_blk(m, n, A_0, INCROW_A, INCCOL_A); dt = walltime() - dt; t += dt; ++runs; } while (t<0.3); t /= runs; err = err_lu(m, n, A_, 1, MAX_M, A_0, INCROW_A, INCCOL_A); printf(" %12.2e %12.2lf %12.2e", t, ops/(1000*1000*t), err); printf("\n"); } return 0; }