Lösungsvorschlag
Auf den Linux-Rechnern im E44 erhalten wir für die Paramter
#define M_C 256 #define K_C 256 #define N_C 1024
folgende Benchmark-Ergebnisse bei einer zeilenweise gespeicherten Matrix
Hier der zuhehörige Programm-Code:
#include <math.h> #include <stdio.h> #include <stdlib.h> #include <sys/times.h> #include <unistd.h> #ifndef COLMAJOR #define COLMAJOR 0 #else #undef COLMAJOR #define COLMAJOR 1 #endif #ifndef MAXDIM_M #define MAXDIM_M 4000 #endif #ifndef MAXDIM_N #define MAXDIM_N 4000 #endif #ifndef MAXDIM_K #define MAXDIM_K 4000 #endif #ifndef MIN_M #define MIN_M 100 #endif #ifndef MIN_N #define MIN_N 100 #endif #ifndef MIN_K #define MIN_K 100 #endif #ifndef MAX_M #define MAX_M 7000 #endif #ifndef MAX_N #define MAX_N 7000 #endif #ifndef MAX_K #define MAX_K 7000 #endif #ifndef INC_M #define INC_M 100 #endif #ifndef INC_N #define INC_N 100 #endif #ifndef INC_K #define INC_K 100 #endif #ifndef ALPHA #define ALPHA 1.5 #endif #ifndef BETA #define BETA 1.5 #endif double A[MAXDIM_M*MAXDIM_K]; double B[MAXDIM_K*MAXDIM_N]; double C1[MAXDIM_M*MAXDIM_N]; double C2[MAXDIM_M*MAXDIM_N]; double C3[MAXDIM_M*MAXDIM_N]; double C4[MAXDIM_M*MAXDIM_N]; double walltime() { struct tms ts; static double ClockTick=0.0; if (ClockTick==0.0) { ClockTick = 1.0 / ((double) sysconf(_SC_CLK_TCK)); } return ((double) times(&ts)) * ClockTick; } void initMatrix(long m, long n, double *A, long incRowA, long incColA) { int i, j; for (j=0; j<n; ++j) { for (i=0; i<m; ++i) { A[i*incRowA+j*incColA] = ((double)rand() - RAND_MAX/2)*200/RAND_MAX; } } } void dgecopy(long m, long n, const double *A, long incRowA, long incColA, double *B, long incRowB, long incColB) { int i, j; for (j=0; j<n; ++j) { for (i=0; i<m; ++i) { B[i*incRowB+j*incColB] = A[i*incRowA+j*incColA]; } } } void dgeaxpy(long m, long n, double alpha, const double *X, long incRowX, long incColX, double *Y, long incRowY, long incColY) { long i, j; for (i=0; i<m; ++i) { for (j=0; j<n; ++j) { Y[i*incRowY+j*incColY] += alpha*X[i*incRowX+j*incColX]; } } } double asumDiffMatrix(long m, long n, const double *A, long incRowA, long incColA, double *B, long incRowB, long incColB) { int i, j; double asum = 0; for (j=0; j<n; ++j) { for (i=0; i<m; ++i) { asum += fabs(B[i*incRowB+j*incColB] - A[i*incRowA+j*incColA]); } } return asum; } //------------------------------------------------------------------------------ // A <- alpha * A //------------------------------------------------------------------------------ void dgescal(long m, long n, double alpha, double *X, long incRowX, long incColX) { long i, j; if (alpha!=1.0) { for (i=0; i<m; ++i) { for (j=0; j<n; ++j) { X[i*incRowX+j*incColX] *= alpha; } } } } //------------------------------------------------------------------------------ // 1. GEMM Variante: Schulmethode //------------------------------------------------------------------------------ void dgemm_var1(long m, long n, long k, double alpha, const double *A, long incRowA, long incColA, const double *B, long incRowB, long incColB, double beta, double *C, long incRowC, long incColC) { long i, j, l; dgescal(m, n, beta, C, incRowC, incColC); for (i=0; i<m; ++i) { for (j=0; j<n; ++j) { for (l=0; l<k; ++l) { C[i*incRowC+j*incColC] += alpha*A[i*incRowA+l*incColA] *B[l*incRowB+j*incColB]; } } } } //------------------------------------------------------------------------------ // 2. GEMM Variante: Zeilenweise //------------------------------------------------------------------------------ void dgemm_var2(long m, long n, long k, double alpha, const double *A, long incRowA, long incColA, const double *B, long incRowB, long incColB, double beta, double *C, long incRowC, long incColC) { long i, j, l; dgescal(m, n, beta, C, incRowC, incColC); for (i=0; i<m; ++i) { for (l=0; l<k; ++l) { for (j=0; j<n; ++j) { C[i*incRowC+j*incColC] += alpha*A[i*incRowA+l*incColA] *B[l*incRowB+j*incColB]; } } } } //------------------------------------------------------------------------------ // 3. GEMM Variante: Spaltenweise //------------------------------------------------------------------------------ void dgemm_var3(long m, long n, long k, double alpha, const double *A, long incRowA, long incColA, const double *B, long incRowB, long incColB, double beta, double *C, long incRowC, long incColC) { long i, j, l; dgescal(m, n, beta, C, incRowC, incColC); for (j=0; j<n; ++j) { for (l=0; l<k; ++l) { for (i=0; i<m; ++i) { C[i*incRowC+j*incColC] += alpha*A[i*incRowA+l*incColA] *B[l*incRowB+j*incColB]; } } } } //------------------------------------------------------------------------------ // 4. GEMM Variante: Bisschen mit Puffer //------------------------------------------------------------------------------ #ifndef M_C #define M_C 256 #endif #ifndef K_C #define K_C 256 #endif #ifndef N_C #define N_C 1024 #endif double A_[M_C*K_C]; double B_[K_C*N_C]; double C_[M_C*N_C]; void dgemm_var4(long m, long n, long k, double alpha, const double *A, long incRowA, long incColA, const double *B, long incRowB, long incColB, double beta, double *C, long incRowC, long incColC) { long i, j, l; long mb = m / M_C; long nb = n / N_C; long kb = k / K_C; long mr = m % M_C; long nr = n % N_C; long kr = k % K_C; dgescal(m, n, beta, C, incRowC, incColC); for (j=0; j<=nb; ++j) { long N = (j<nb || nr==0) ? N_C : nr; for (l=0; l<=kb; ++l) { long K = (l<kb || kr==0) ? K_C : kr; dgecopy(K, N, &B[l*K_C*incRowB+j*N_C*incColB], incRowB, incColB, B_, 1, K_C); for (i=0; i<=mb; ++i) { long M = (i<mb || mr==0) ? M_C : mr; dgecopy(M, K, &A[i*M_C*incRowA+l*K_C*incColA], incRowA, incColA, A_, 1, M_C); dgemm_var3(M, N, K, 1.0, A_, 1, M_C, B_, 1, K_C, 0.0, C_, 1, M_C); dgeaxpy(M, N, alpha, C_, 1, M_C, &C[i*M_C*incRowC+j*N_C*incColC], incRowC, incColC); } } } } int main() { initMatrix(MAXDIM_M, MAXDIM_K, A, 1, MAXDIM_M); initMatrix(MAXDIM_K, MAXDIM_N, B, 1, MAXDIM_K); initMatrix(MAXDIM_M, MAXDIM_N, C1, 1, MAXDIM_M); dgecopy(MAXDIM_M, MAXDIM_N, C1, 1, MAXDIM_M, C2, 1, MAXDIM_M); dgecopy(MAXDIM_M, MAXDIM_N, C1, 1, MAXDIM_M, C3, 1, MAXDIM_M); dgecopy(MAXDIM_M, MAXDIM_N, C1, 1, MAXDIM_M, C4, 1, MAXDIM_M); long m, n, k; // Header-Zeile für die Ausgabe printf("%5s %5s %5s ", "m", "n", "k"); printf("%5s %5s ", "IRA", "ICA"); printf("%5s %5s ", "IRB", "ICB"); printf("%5s %5s ", "IRC", "ICC"); printf("%9s %9s ", "var1: t", "MFLOPS"); printf("%9s %9s %9s", "var2: t", "MFLOPS", "diff"); printf("%9s %9s %9s", "var3: t", "MFLOPS", "diff"); printf("%9s %9s %9s", "var4: t", "MFLOPS", "diff"); printf("\n"); for (m = MIN_M, n = MIN_N, k = MIN_K; m <=MAX_M && n <= MAX_N && k <= MAX_K; m += INC_M, n += INC_N, k += INC_K) { double t0, t1, diff; long incRowA = (COLMAJOR) ? 1 : k; long incColA = (COLMAJOR) ? m : 1; long incRowB = (COLMAJOR) ? 1 : n; long incColB = (COLMAJOR) ? k : 1; long incRowC = (COLMAJOR) ? 1 : n; long incColC = (COLMAJOR) ? m : 1; printf("%5ld %5ld %5ld ", m, n, k); printf("%5ld %5ld ", incRowA, incColA); printf("%5ld %5ld ", incRowB, incColB); printf("%5ld %5ld ", incRowC, incColC); t0 = walltime(); dgemm_var1(m, n, k, ALPHA, A, incRowA, incColA, B, incRowB, incColB, BETA, C1, incRowC, incColC); t1 = walltime() - t0; printf("%9.4lf %9.2lf ", t1, 2.*m/1000*n/1000*k/t1); t0 = walltime(); dgemm_var2(m, n, k, ALPHA, A, incRowA, incColA, B, incRowB, incColB, BETA, C2, incRowC, incColC); t1 = walltime() - t0; diff = asumDiffMatrix(m, n, C1, incRowC, incColC, C2, incRowC, incColC); printf("%9.4lf %9.2lf %9.1e", t1, 2.*m/1000*n/1000*k/t1, diff); t0 = walltime(); dgemm_var3(m, n, k, ALPHA, A, incRowA, incColA, B, incRowB, incColB, BETA, C3, incRowC, incColC); t1 = walltime() - t0; diff = asumDiffMatrix(m, n, C1, incRowC, incColC, C3, incRowC, incColC); printf("%9.4lf %9.2lf %9.1e", t1, 2.*m/1000*n/1000*k/t1, diff); t0 = walltime(); dgemm_var4(m, n, k, ALPHA, A, incRowA, incColA, B, incRowB, incColB, BETA, C4, incRowC, incColC); t1 = walltime() - t0; diff = asumDiffMatrix(m, n, C1, incRowC, incColC, C4, incRowC, incColC); printf("%9.4lf %9.2lf %9.1e", t1, 2.*m/1000*n/1000*k/t1, diff); printf("\n"); } return 0; }