Einfache Cache-Optimierungen

Ziele:

Beispiel aus der letzten Session

In der vorherigen Session wurde demonstriert, dass die Laufzeit der Initialisierung einer größeren Matrix je nach Zugriffsart deutlich schwanken kann:

Hier lohnt sich noch einmal ein genauer Blick auf die Funktion initMatrix:

#include <stdio.h>
#include <stdlib.h>
#include <sys/times.h>
#include <unistd.h>
#include <math.h>


#ifndef MINDIM_M
#define MINDIM_M    1000
#endif

#ifndef MINDIM_N
#define MINDIM_N    1000
#endif

#ifndef MINDIM_K
#define MINDIM_K    1000
#endif

#ifndef MAXDIM_M
#define MAXDIM_M    5000
#endif

#ifndef MAXDIM_N
#define MAXDIM_N    5000
#endif

#ifndef MAXDIM_K
#define MAXDIM_K    5000
#endif

#ifndef INC_M
#define INC_M   100
#endif

#ifndef INC_N
#define INC_N   100
#endif

#ifndef INC_K
#define INC_K   100
#endif

#ifndef MIN_T
#define MIN_T   1
#endif

double buffer[MAXDIM_M*MAXDIM_N];
double buffer1[MAXDIM_M*MAXDIM_N];
double buffer2[MAXDIM_M*MAXDIM_N];

double
walltime()
{
   struct tms    ts;
   static double ClockTick=0.0;

   if (ClockTick==0.0) {
        ClockTick = 1.0 / ((double) sysconf(_SC_CLK_TCK));
   }
   return ((double) times(&ts)) * ClockTick;
}

void
gecopy(long m, long n, const double *A, long incRowA, long incColA,
       double *B, long incRowB, long incColB)
{
    long i, j;

    for (i=0; i<m; ++i) {
        for (j=0; j<n; ++j) {
            B[i*incRowB+j*incColB] = A[i*incRowA+j*incColA];
        }
    }
}

double
asumDiff(long m, long n, const double *A, long incRowA, long incColA,
         const double *B, long incRowB, long incColB)
{
    double diff = 0;
    long   i, j;

    for (i=0; i<m; ++i) {
        for (j=0; j<n; ++j) {
            diff += fabs(B[i*incRowB+j*incColB] - A[i*incRowA+j*incColA]);
        }
    }
    return diff;
}


//------------------------------------------------------------------------------

void
initMatrix(long m, long n, double *A, long incRowA, long incColA)
{
    long i, j;

    for (j=0; j<n; ++j) {
        for (i=0; i<m; ++i) {
            A[i*incRowA+j*incColA] = j*n+i+1;
        }
    }
}

void
printMatrix(long m, long n, const double *A, long incRowA, long incColA)
{
    long i, j;

    for (i=0; i<m; ++i) {
        printf("   ");
        for (j=0; j<n; ++j) {
            printf("%4.1lf ", A[i*incRowA+j*incColA]);
        }
        printf("\n");
    }
    printf("\n");
}

int
main()
{
    long    m, n, runs;
    double  t0, t1, t2, diff;

    printf("#  M    N     t1 (colMajor)  t2 (rowMajor)      t2/t1        diff\n");
    printf("#================================================================\n");

    for (m=MINDIM_M, n=MINDIM_N; m<=MAXDIM_M && n<MAXDIM_N; m+=INC_M, n+=INC_N) {

        printf("%4ld %4ld ", m, n);

        runs = 0;
        t1   = 0;
        do {
            t0 = walltime();
            initMatrix(m, n, buffer, 1, m);
            t1 += walltime() - t0;
            ++runs;
        } while (t1<MIN_T);
        t1 /= runs;
        gecopy(m, n, buffer, 1, m, buffer1, 1, m);

        printf(" %12.2lf", t1);

        runs = 0;
        t2 = 0;
        do {
            t0 = walltime();
            initMatrix(m, n, buffer, n, 1);
            t2 += walltime() - t0;
            ++runs;
        } while (t2<MIN_T);
        t2 /= runs;
        gecopy(m, n, buffer, n, 1, buffer2, n, 1);

        diff = asumDiff(m, n, buffer1, 1, m, buffer2, n, 1);

        printf(" %12.2lf %16.2lf", t2, t2/t1);
        printf(" %11.2le", diff);
        printf("\n");
    }

    return 0;
}

Aufgabe 1

Aufgabe 2

Gegeben sei folgende Vorlage:

#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/times.h>
#include <unistd.h>

#ifndef MIN_M
#define MIN_M 1000
#endif

#ifndef MIN_N
#define MIN_N 1000
#endif

#ifndef MAX_M
#define MAX_M 10000
#endif

#ifndef MAX_N
#define MAX_N 10000
#endif

#ifndef INCX
#define INCX 1
#endif

#ifndef INCY
#define INCY 1
#endif

#ifndef ALPHA
#define ALPHA 1
#endif

#ifndef BETA
#define BETA 1
#endif

#ifndef T_MIN
#define T_MIN 1
#endif

double A1[MAX_M*MAX_N];
double A2[MAX_M*MAX_N];
double X[MAX_N*INCX];
double Y[MAX_M*INCY];
double Y1[MAX_M*INCY];
double Y2[MAX_M*INCY];

double
walltime()
{
   struct tms    ts;
   static double ClockTick=0.0;

   if (ClockTick==0.0) {
        ClockTick = 1.0 / ((double) sysconf(_SC_CLK_TCK));
   }
   return ((double) times(&ts)) * ClockTick;
}

void
initMatrix(long m, long n, double *A, long incRowA, long incColA)
{
    int i, j;
    for (j=0; j<n; ++j) {
        for (i=0; i<m; ++i) {
            A[i*incRowA+j*incColA] = ((double)rand() - RAND_MAX/2)*200/RAND_MAX;
        }
    }
}

void
copyMatrix(long m, long n,
           const double *A, long incRowA, long incColA,
           double *B, long incRowB, long incColB)
{
    int i, j;
    for (j=0; j<n; ++j) {
        for (i=0; i<m; ++i) {
            B[i*incRowB+j*incColB] = A[i*incRowA+j*incColA];
        }
    }
}

double
asumDiffMatrix(long m, long n,
               const double *A, long incRowA, long incColA,
               double *B, long incRowB, long incColB)
{
    int i, j;

    double asum = 0;

    for (j=0; j<n; ++j) {
        for (i=0; i<m; ++i) {
            asum += fabs(B[i*incRowB+j*incColB] - A[i*incRowA+j*incColA]);
        }
    }
    return asum;
}

//------------------------------------------------------------------------------

void
dgemv(long m, long n,
      double alpha,
      const double *A, long incRowA, long incColA,
      const double *x, long incX,
      double beta,
      double *y, long incY)
{
    /* to be done */
}

//------------------------------------------------------------------------------

int
main()
{
    long runs, i, m, n, incRowA, incColA;
    double t0, t1, t2;
    double diff;
    double alpha = ALPHA;
    double beta  = BETA;

    initMatrix(MAX_M, MAX_N, A1, 1, MAX_M); /* col major */
    initMatrix(MAX_N, 1, X, INCX, 1);
    initMatrix(MAX_M, 1, Y, INCY, 1);

    printf("RUN     M     N");
    printf("  col major  row major");
    printf("  col major  row major");
    printf("       DIFF");
    printf("\n");
    printf("               ");
    printf("   (t in s)   (t in s)");
    printf("   (GFLOPS)   (GFLOPS)");
    printf("           ");
    printf("\n");

    for (i=0, m=MIN_M, n=MIN_M; m<=MAX_M && n<=MAX_N; ++i, m+=100, n+=100) {
        /* col major */
        incRowA = 1; incColA = m;
        t1   = 0;
        runs = 0;
        while (t1<=T_MIN) {
            copyMatrix(MAX_M, 1, Y, INCY, 1, Y1, INCY, 1);
            t0 = walltime();
            dgemv(m, n, alpha,
                  A1, incRowA, incColA,
                  X, INCX,
                  beta,
                  Y1, INCY);
            t1 += walltime() - t0;
            ++runs;
        }
        t1 /= runs;

        /* row major */
        incRowA = n; incColA = 1;
        copyMatrix(m, n, A1, 1, m, A2, n, 1); /* convert to row major */
        t2   = 0;
        runs = 0;
        while (t2<=T_MIN) {
            copyMatrix(MAX_M, 1, Y, INCY, 1, Y2, INCY, 1);
            t0 = walltime();
            dgemv(m, n, alpha,
                  A2, incRowA, incColA,
                  X, INCX,
                  beta,
                  Y2, INCY);
            t2 += walltime() - t0;
            ++runs;
        }
        t2 /= runs;

        diff = asumDiffMatrix(m, 1, Y1, INCY, 1, Y2, INCY, 1);

        printf("%3ld %5ld %5ld ", i, m, n);
        printf("%10.4lf %10.4lf ", t1, t2);
        printf("%10.4lf ", 2*(m/1000.0)*(n/1000.0)/t1);
        printf("%10.4lf ", 2*(m/1000.0)*(n/1000.0)/t2);
        printf("%10.4lf ", diff);
        printf("\n");
    }

    return 0;
}

Die Funktion dgemv (steht für double, general matrix, matrix vector) soll

\[y \leftarrow \alpha A x + \beta y\]

berechnen. Sie ist so mit Cache-Optimierung zu implementieren, dass es für die Laufzeit keine Rolle spielt, ob die Matrix \(A\) in row oder column major organisiert ist.