Lösungsvorschlag

Auf den Linux-Rechnern im E44 erhalten wir für die Paramter

#define M_C 256
#define K_C 256
#define N_C 1024

folgende Benchmark-Ergebnisse bei einer zeilenweise gespeicherten Matrix

Hier der zuhehörige Programm-Code:

#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/times.h>
#include <unistd.h>

#ifndef COLMAJOR
#define COLMAJOR 0
#else
#undef  COLMAJOR
#define COLMAJOR 1
#endif

#ifndef MAXDIM_M
#define MAXDIM_M    4000
#endif

#ifndef MAXDIM_N
#define MAXDIM_N    4000
#endif

#ifndef MAXDIM_K
#define MAXDIM_K    4000
#endif

#ifndef MIN_M
#define MIN_M   100
#endif

#ifndef MIN_N
#define MIN_N   100
#endif

#ifndef MIN_K
#define MIN_K   100
#endif

#ifndef MAX_M
#define MAX_M   7000
#endif

#ifndef MAX_N
#define MAX_N   7000
#endif

#ifndef MAX_K
#define MAX_K   7000
#endif

#ifndef INC_M
#define INC_M   100
#endif

#ifndef INC_N
#define INC_N   100
#endif

#ifndef INC_K
#define INC_K   100
#endif

#ifndef ALPHA
#define ALPHA   1.5
#endif

#ifndef BETA
#define BETA    1.5
#endif

double A[MAXDIM_M*MAXDIM_K];
double B[MAXDIM_K*MAXDIM_N];

double C1[MAXDIM_M*MAXDIM_N];
double C2[MAXDIM_M*MAXDIM_N];
double C3[MAXDIM_M*MAXDIM_N];
double C4[MAXDIM_M*MAXDIM_N];

double
walltime()
{
   struct tms    ts;
   static double ClockTick=0.0;

   if (ClockTick==0.0) {
        ClockTick = 1.0 / ((double) sysconf(_SC_CLK_TCK));
   }
   return ((double) times(&ts)) * ClockTick;
}



void
initMatrix(long m, long n, double *A, long incRowA, long incColA)
{
    int i, j;
    for (j=0; j<n; ++j) {
        for (i=0; i<m; ++i) {
            A[i*incRowA+j*incColA] = ((double)rand() - RAND_MAX/2)*200/RAND_MAX;
        }
    }
}

void
dgecopy(long m, long n,
        const double *A, long incRowA, long incColA,
        double *B, long incRowB, long incColB)
{
    int i, j;
    for (j=0; j<n; ++j) {
        for (i=0; i<m; ++i) {
            B[i*incRowB+j*incColB] = A[i*incRowA+j*incColA];
        }
    }
}

void
dgeaxpy(long m, long n,
        double alpha,
        const double *X, long incRowX, long incColX,
        double       *Y, long incRowY, long incColY)
{
    long i, j;

    for (i=0; i<m; ++i) {
        for (j=0; j<n; ++j) {
            Y[i*incRowY+j*incColY] += alpha*X[i*incRowX+j*incColX];
        }
    }
}


double
asumDiffMatrix(long m, long n,
               const double *A, long incRowA, long incColA,
               double *B, long incRowB, long incColB)
{
    int i, j;

    double asum = 0;

    for (j=0; j<n; ++j) {
        for (i=0; i<m; ++i) {
            asum += fabs(B[i*incRowB+j*incColB] - A[i*incRowA+j*incColA]);
        }
    }
    return asum;
}

//------------------------------------------------------------------------------
// A <- alpha * A
//------------------------------------------------------------------------------

void
dgescal(long m, long n,
        double alpha,
        double *X, long incRowX, long incColX)
{
    long i, j;

    if (alpha!=1.0) {
        for (i=0; i<m; ++i) {
            for (j=0; j<n; ++j) {
                X[i*incRowX+j*incColX] *= alpha;
            }
        }
    }
}

//------------------------------------------------------------------------------
// 1. GEMM Variante: Schulmethode
//------------------------------------------------------------------------------
void
dgemm_var1(long m, long n, long k,
           double alpha,
           const double *A, long incRowA, long incColA,
           const double *B, long incRowB, long incColB,
           double beta,
           double *C, long incRowC, long incColC)
{
    long i, j, l;

    dgescal(m, n, beta, C, incRowC, incColC);
    for (i=0; i<m; ++i) {
        for (j=0; j<n; ++j) {
            for (l=0; l<k; ++l) {
                C[i*incRowC+j*incColC] += alpha*A[i*incRowA+l*incColA]
                                               *B[l*incRowB+j*incColB];
            }
        }
    }
}

//------------------------------------------------------------------------------
// 2. GEMM Variante: Zeilenweise
//------------------------------------------------------------------------------
void
dgemm_var2(long m, long n, long k,
           double alpha,
           const double *A, long incRowA, long incColA,
           const double *B, long incRowB, long incColB,
           double beta,
           double *C, long incRowC, long incColC)
{
    long i, j, l;

    dgescal(m, n, beta, C, incRowC, incColC);
    for (i=0; i<m; ++i) {
        for (l=0; l<k; ++l) {
            for (j=0; j<n; ++j) {
                C[i*incRowC+j*incColC] += alpha*A[i*incRowA+l*incColA]
                                               *B[l*incRowB+j*incColB];
            }
        }
    }
}


//------------------------------------------------------------------------------
// 3. GEMM Variante: Spaltenweise
//------------------------------------------------------------------------------
void
dgemm_var3(long m, long n, long k,
           double alpha,
           const double *A, long incRowA, long incColA,
           const double *B, long incRowB, long incColB,
           double beta,
           double *C, long incRowC, long incColC)
{
    long i, j, l;

    dgescal(m, n, beta, C, incRowC, incColC);
    for (j=0; j<n; ++j) {
        for (l=0; l<k; ++l) {
            for (i=0; i<m; ++i) {
                C[i*incRowC+j*incColC] += alpha*A[i*incRowA+l*incColA]
                                               *B[l*incRowB+j*incColB];
            }
        }
    }
}

//------------------------------------------------------------------------------
// 4. GEMM Variante: Bisschen mit Puffer
//------------------------------------------------------------------------------

#ifndef M_C
#define M_C 256
#endif

#ifndef K_C
#define K_C 256
#endif

#ifndef N_C
#define N_C 1024
#endif

double A_[M_C*K_C];
double B_[K_C*N_C];
double C_[M_C*N_C];

void
dgemm_var4(long m, long n, long k,
           double alpha,
           const double *A, long incRowA, long incColA,
           const double *B, long incRowB, long incColB,
           double beta,
           double *C, long incRowC, long incColC)
{
    long i, j, l;

    long mb = m / M_C;
    long nb = n / N_C;
    long kb = k / K_C;

    long mr = m % M_C;
    long nr = n % N_C;
    long kr = k % K_C;

    dgescal(m, n, beta, C, incRowC, incColC);

    for (j=0; j<=nb; ++j) {
        long N = (j<nb || nr==0) ? N_C : nr;

        for (l=0; l<=kb; ++l) {
            long K = (l<kb || kr==0) ? K_C : kr;

            dgecopy(K, N,
                    &B[l*K_C*incRowB+j*N_C*incColB], incRowB, incColB,
                    B_, 1, K_C);

            for (i=0; i<=mb; ++i) {
                long M = (i<mb || mr==0) ? M_C : mr;

                dgecopy(M, K,
                        &A[i*M_C*incRowA+l*K_C*incColA], incRowA, incColA,
                        A_, 1, M_C);

                dgemm_var3(M, N, K,
                           1.0,
                           A_, 1, M_C,
                           B_, 1, K_C,
                           0.0,
                           C_, 1, M_C);

                dgeaxpy(M, N,
                        alpha,
                        C_, 1, M_C,
                        &C[i*M_C*incRowC+j*N_C*incColC], incRowC, incColC);
            }
        }
    }
}

int
main()
{
    initMatrix(MAXDIM_M, MAXDIM_K, A, 1, MAXDIM_M);
    initMatrix(MAXDIM_K, MAXDIM_N, B, 1, MAXDIM_K);
    initMatrix(MAXDIM_M, MAXDIM_N, C1, 1, MAXDIM_M);
    dgecopy(MAXDIM_M, MAXDIM_N, C1, 1, MAXDIM_M, C2, 1, MAXDIM_M);
    dgecopy(MAXDIM_M, MAXDIM_N, C1, 1, MAXDIM_M, C3, 1, MAXDIM_M);
    dgecopy(MAXDIM_M, MAXDIM_N, C1, 1, MAXDIM_M, C4, 1, MAXDIM_M);

    long m, n, k;

    // Header-Zeile für die Ausgabe
    printf("%5s %5s %5s ", "m", "n", "k");
    printf("%5s %5s ", "IRA", "ICA");
    printf("%5s %5s ", "IRB", "ICB");
    printf("%5s %5s ", "IRC", "ICC");
    printf("%9s %9s ", "var1: t", "MFLOPS");
    printf("%9s %9s %9s", "var2: t", "MFLOPS", "diff");
    printf("%9s %9s %9s", "var3: t", "MFLOPS", "diff");
    printf("%9s %9s %9s", "var4: t", "MFLOPS", "diff");
    printf("\n");

    for (m = MIN_M, n = MIN_N, k = MIN_K;
         m <=MAX_M && n <= MAX_N && k <= MAX_K;
         m += INC_M, n += INC_N, k += INC_K)
    {
        double t0, t1, diff;

        long incRowA = (COLMAJOR) ? 1 : k;
        long incColA = (COLMAJOR) ? m : 1;

        long incRowB = (COLMAJOR) ? 1 : n;
        long incColB = (COLMAJOR) ? k : 1;

        long incRowC = (COLMAJOR) ? 1 : n;
        long incColC = (COLMAJOR) ? m : 1;

        printf("%5ld %5ld %5ld ", m, n, k);
        printf("%5ld %5ld ", incRowA, incColA);
        printf("%5ld %5ld ", incRowB, incColB);
        printf("%5ld %5ld ", incRowC, incColC);

        t0 = walltime();
        dgemm_var1(m, n, k, ALPHA,
                   A, incRowA, incColA,
                   B, incRowB, incColB,
                   BETA,
                   C1, incRowC, incColC);
        t1   = walltime() - t0;
        printf("%9.4lf %9.2lf ", t1, 2.*m/1000*n/1000*k/t1);

        t0 = walltime();
        dgemm_var2(m, n, k, ALPHA,
                   A, incRowA, incColA,
                   B, incRowB, incColB,
                   BETA,
                   C2, incRowC, incColC);
        t1 = walltime() - t0;
        diff = asumDiffMatrix(m, n, C1, incRowC, incColC, C2, incRowC, incColC);
        printf("%9.4lf %9.2lf %9.1e", t1, 2.*m/1000*n/1000*k/t1, diff);

        t0 = walltime();
        dgemm_var3(m, n, k, ALPHA,
                   A, incRowA, incColA,
                   B, incRowB, incColB,
                   BETA,
                   C3, incRowC, incColC);
        t1 = walltime() - t0;
        diff = asumDiffMatrix(m, n, C1, incRowC, incColC, C3, incRowC, incColC);
        printf("%9.4lf %9.2lf %9.1e", t1, 2.*m/1000*n/1000*k/t1, diff);

        t0 = walltime();
        dgemm_var4(m, n, k, ALPHA,
                   A, incRowA, incColA,
                   B, incRowB, incColB,
                   BETA,
                   C4, incRowC, incColC);
        t1 = walltime() - t0;
        diff = asumDiffMatrix(m, n, C1, incRowC, incColC, C4, incRowC, incColC);
        printf("%9.4lf %9.2lf %9.1e", t1, 2.*m/1000*n/1000*k/t1, diff);
        printf("\n");
    }
    return 0;
}