Ein erster Benchmark

Am Beispiel der AXPY-Operation soll ein erster Benchmark durchgeführt werden. Die Zutaten für einen Benchmark sind

Eine Referenzimplementierung, d.h. eine Implementierung der wir vertrauen.
Eine Fehlerschranke, um zu testen, ob Abweichungen vom Referenzergebnis im zulässigen Bereich liegt:

Seien \(x, y \in \mathbb{M}^n\) und \(\alpha \in \mathbb{M}^n\). Ferner sei \(\|x\|_\infty \neq 0\). Ist \(\hat{z}\) das Ergebnis der Referenzimplementierung von AXPY und \(z\) das zu testende Ergebnis, dann muss gelten
\[\frac{\|z - \hat{z}\|_\infty}{|\alpha| \cdot \|x\|_\infty \|y\|_\infty \cdot n}\quad\leq\quad \text{eps}.\]
Dabei bezeichnet \(\text{eps}\) die Maschinengenauigkeit.
Der Benchmark wird für Werte \(n_{\text{min}}, n_{\text{min}} + n_{\text{inc}}, \dots, n_{\text{max}}\) durchgeführt. Diese sollen durch Makros N_MIN, N_INC und N_MAX definiert werden. Beim Aufruf des Compilers soll es möglich sein die Deafult-Wert N_MIN=10000, N_INC=1000 und N_MIN=30000 zu überschreiben.
Der skalare Wert für \(\alpha\) soll analog durch das Makro ALPHA gesetzt werden können. Als default-Wert soll \(\alpha = 1\) verwendet werden.
Mit einem Timer für die Wall-Time (Zeit außerhalb des Computers) kann gestoppt werden wie lange die Implementierungen jeweils benötigen. Folgende Daten sollen dann ausgegeben werden:
- time elapsed für beide Implementierungen.
- mflops (mega flops) für beide Implementierungen.
- speedup factor das ist der Quotient
  \[s = \frac{t_{\text{new}}}{t_{\text{ref}}}\]
- error das ist der obige Quotient.
Zum Testen sollen Vektoren mit zufälligen Werten benutzt werden. Wir benötigen also auch noch einen Zufallszahlengenerator.
Zu guter Letzt: Das Ergebnis soll durch schöne Plots visualisiert werden.

Matrizen und Vektoren mit Zufallszahlen

Füllt die Matrix mit Zufallszahlen. Die Funktion rand() gibt Integer-Werte im Bereich 0,...,RAND_MAX-1 zurück.

void
randGeMatrix(int m, int n, double *A, int incRowA, int incColA)
{
    int i, j;
    for (j=0; j<n; ++j) {
        for (i=0; i<m; ++i) {
            A[i*incRowA+j*incColA] = ((double)rand()-RAND_MAX/2)*200/RAND_MAX;
        }
    }
}

Wall-Time

Liefert die aktuelle Wall-Time:

#include <stdlib.h>
#include <sys/times.h>
#include <unistd.h>

double
walltime()
{
   struct tms    ts;
   static double ClockTick=0.0;

   if (ClockTick==0.0) {
        ClockTick = 1.0 / ((double) sysconf(_SC_CLK_TCK));
   }
   return ((double) times(&ts)) * ClockTick;
}

Fehlerschranke

double
err_daxpy(int n, double alpha,
          const double *x, int incX,
          const double *y, int incY,
          const double *z_, int incZ_,
          double *z, int incZ)
{
    double normX = damax(n, x, incX);
    double normY = damax(n, y, incY);
    double normD;

    daxpy(n, -1.0, z_, incZ_, z, incZ);
    normD = damax(n, z, incZ);

    return normD/(fabs(alpha)*n*normX*normY);
}

Das gesamte Benchmark-Programm

session04/example06/axpy_bench.c

#include <stdio.h>
#include <math.h>
#include <stdlib.h>
#include <sys/times.h>
#include <unistd.h>

double
walltime()
{
   struct tms    ts;
   static double ClockTick=0.0;

   if (ClockTick==0.0) {
        ClockTick = 1.0 / ((double) sysconf(_SC_CLK_TCK));
   }
   return ((double) times(&ts)) * ClockTick;
}

void
initGeMatrix(int m, int n, double *A, int incRowA, int incColA)
{
    int i, j;

    for (i=0; i<m; ++i) {
        for (j=0; j<n; ++j) {
            A[i*incRowA+j*incColA] = i*n + j + 1;
        }
    }
}

void
randGeMatrix(int m, int n, double *A, int incRowA, int incColA)
{
    int i, j;
    for (j=0; j<n; ++j) {
        for (i=0; i<m; ++i) {
            A[i*incRowA+j*incColA] = ((double)rand()-RAND_MAX/2)*200/RAND_MAX;
        }
    }
}

void
printGeMatrix(int m, int n, const double *A, int incRowA, int incColA)
{
    int i, j;

    for (i=0; i<m; ++i) {
        for (j=0; j<n; ++j) {
            printf("%10.4lf ", A[i*incRowA+j*incColA]);
        }
        printf("\n");
    }
    printf("\n\n");
}

//-- BLAS level 1 procedures and functions -------------------------------------

double
ddot(int n, const double *x, int incX, const double *y, int incY)
{
    int     i;
    double  alpha = 0;

    for (i=0; i<n; ++i) {
        alpha += x[i*incX]*y[i*incY];

    }
    return alpha;
}

void
daxpy(int n, double alpha, const double *x, int incX, double *y, int incY)
{
    int i;

    if (alpha==0) {
        return;
    }
    for (i=0; i<n; ++i) {
        y[i*incY] += alpha*x[i*incX];
    }
}

void
dscal(int n, double alpha, double *x, int incX)
{
    int i;

    if (alpha==1) {
        return;
    }
    for (i=0; i<n; ++i) {
        x[i*incX] *= alpha;
    }
}

void
dcopy(int n, const double *x, int incX, double *y, int incY)
{
    int i;

    for (i=0; i<n; ++i) {
        y[i*incY] = x[i*incX];
    }
}

void
dswap(int n, double *x, int incX, double *y, int incY)
{
    int i;

    for (i=0; i<n; ++i) {
        double tmp;

        tmp = x[i*incX];
        x[i*incX] = y[i*incY];
        y[i*incY] = tmp;
    }
}

double
damax(int n, const double *x, int incX)
{
    int     i;
    double  result = 0;

    for (i=0; i<n; ++i) {
        if (fabs(x[i*incX])>result) {
            result = fabs(x[i*incX]);
        }
    }
    return result;
}

double
dgenrm1(int m, int n, const double *A, int incRowA, int incColA)
{
    int     i, j;
    double  result = 0;

    for (j=0; j<n; ++j) {
        double sum = 0;
        for (i=0; i<m; ++i) {
            sum += fabs(A[i*incRowA+j*incColA]);
        }
        if (sum>result) {
            result = sum;
        }
    }
    return result;
}

//-- error bounds for BLAS level 1 ---------------------------------------------

double
err_daxpy(int n, double alpha,
          const double *x, int incX,
          const double *y, int incY,
          const double *z_, int incZ_,
          double *z, int incZ)
{
    double normX = damax(n, x, incX);
    double normY = damax(n, y, incY);
    double normD;

    daxpy(n, -1.0, z_, incZ_, z, incZ);
    normD = damax(n, z, incZ);

    return normD/(fabs(alpha)*n*normX*normY);
}

//-- axpy for testing ----------------------------------------------------------

void
daxpy_test(int n, double alpha, const double *x, int incX, double *y, int incY)
{
    // We try to make it slower ... (so that you see a difference).
    int i;

    if (alpha==0) {
        return;
    }
    for (i=0; i<n; i+=2) {
        y[i*incY] += alpha*x[i*incX];
    }
    for (i=1; i<n; i+=2) {
        y[i*incY] += alpha*x[i*incX];
    }
}

#ifndef N_MIN
#define N_MIN  10000
#endif

#ifndef N_MAX
#define N_MAX  30000
#endif

#ifndef N_INC
#define N_INC  1000
#endif

#ifndef INC_X
#define INC_X  1
#endif

#ifndef INC_Y
#define INC_Y  1
#endif

#ifndef ALPHA
#define ALPHA  1.0
#endif

#ifndef ROWMAJOR
#define ROWMAJOR 0
#endif

#if (ROWMAJOR==1)
#   define INCROW_A  DIM_N
#   define INCCOL_A  1
#else
#   define INCROW_A  1
#   define INCCOL_A  DIM_M
#endif

double x[N_MAX*INC_X];
double y[N_MAX];
double z[N_MAX*INC_Y];
double z_[N_MAX*INC_Y];

int
main()
{
    int n;

    randGeMatrix(N_MAX, 1, x, INC_X, 0);
    randGeMatrix(N_MAX, 1, y, 1, 0);

    for (n=N_MIN; n<=N_MAX; n+=N_INC) {
        int     runs = 1;
        double  t0, t1, dt, err;

        printf("%5d %5d %5d %5.2lf", n, INC_X, INC_Y, ALPHA);

        t0   = 0;
        runs = 0;
        do {
            dcopy(n, y, 1, z_, INC_Y);
            dt = walltime();
            daxpy(n, ALPHA, x, INC_X, z_, INC_Y);
            dt = walltime() - dt;
            t0 += dt;
            ++runs;
        } while (t0<0.3);
        t0 /= runs;

        printf(" %8.2e %8.2lf", t0, n/(1000*1000*t0));

        t1   = 0;
        runs = 0;
        do {
            dcopy(n, y, 1, z, INC_Y);
            dt = walltime();
            daxpy(n, ALPHA, x, INC_X, z, INC_Y);
            dt = walltime() - dt;
            t1 += dt;
            ++runs;
        } while (t1<0.3);
        t1 /= runs;

        printf(" %8.2e %8.2lf", t1, n/(1000*1000*t1));

        err = err_daxpy(n, ALPHA, x, INC_X, y, 1, z_, INC_Y, z, INC_Y);

        printf(" %8.3e", err);
        printf("\n");
    }

    return 0;
}

Benchmark durchführen

$shell> gcc -Wall -O1 -o axpy_bench  axpy_bench.c
$shell> ./axpy_bench > report.axpy
$shell> cat report.axpy
10000     1     1  1.00 9.08e-06  1101.65 7.87e-06  1270.60 0.000e+00
11000     1     1  1.00 1.17e-05   937.52 1.11e-05   987.52 0.000e+00
12000     1     1  1.00 1.29e-05   926.92 1.04e-05  1152.28 0.000e+00
13000     1     1  1.00 1.10e-05  1181.20 1.27e-05  1023.85 0.000e+00
14000     1     1  1.00 1.30e-05  1080.61 1.30e-05  1073.05 0.000e+00
15000     1     1  1.00 1.28e-05  1170.45 1.37e-05  1095.65 0.000e+00
16000     1     1  1.00 1.58e-05  1013.47 1.59e-05  1003.20 0.000e+00
17000     1     1  1.00 1.92e-05   885.43 1.66e-05  1025.98 0.000e+00
18000     1     1  1.00 1.78e-05  1009.86 1.79e-05  1004.28 0.000e+00
19000     1     1  1.00 1.95e-05   975.97 1.89e-05  1007.80 0.000e+00
20000     1     1  1.00 1.82e-05  1100.00 1.84e-05  1084.19 0.000e+00
21000     1     1  1.00 1.51e-05  1394.75 1.79e-05  1172.85 0.000e+00
22000     1     1  1.00 2.19e-05  1005.77 2.07e-05  1065.01 0.000e+00
23000     1     1  1.00 2.19e-05  1051.02 2.23e-05  1030.55 0.000e+00
24000     1     1  1.00 1.05e-05  2275.76 2.09e-05  1150.84 0.000e+00
25000     1     1  1.00 2.52e-05   992.58 2.61e-05   958.95 0.000e+00
26000     1     1  1.00 2.77e-05   938.51 2.29e-05  1136.03 0.000e+00
27000     1     1  1.00 2.62e-05  1031.57 2.45e-05  1100.47 0.000e+00
28000     1     1  1.00 2.89e-05   968.15 2.96e-05   946.49 0.000e+00
29000     1     1  1.00 3.21e-05   903.49 2.88e-05  1007.94 0.000e+00
30000     1     1  1.00 2.91e-05  1030.16 2.90e-05  1034.03 0.000e+00
$shell>

Plot-Skript

session04/example06/plot.axpy

set terminal svg size 900, 500
set output "bench.axpy.svg"
set xlabel "Vector length N"
set ylabel "MFLOPS"
set title "AXPY"
set key outside
set pointsize 0.5
plot "report.axpy" using 1:6 with linespoints lt 2 lw 3 title "AXPY-ref", \
     "report.axpy" using 1:8 with linespoints lt 3 lw 3 title "AXPY-test"

Resulate plotten

$shell> gnuplot plot.axpy
$shell>