Content

Matrix-Vektor Produkt GEMV

In BLAS Level 2 wird mit der GEMV-Operation (general matrix vector product)

\[y \leftarrow \beta y + \alpha A x,\quad A \in \mathbb{M}^{m \times n},\; x \in \mathbb{M}^{n},\; y \in \mathbb{M}^{m}\quad\text{und}\quad\alpha,\; \beta \in \mathbb{M}\]

das Matrix-Vektor Produkt für eine General-Matrix \(A\) definiert. Wir stellen zwei algorithmische Varianten vor, die anschliessend implementiert werden sollen.

Col-Major-Variante für GEMV

Die \(m \times n\) Matrix \(A\) wird mit

\[A = \left(a_1, \dots, a_n \right)\]

spaltenweise betrachtet. Für eine konsitente Partitionierung des Produktes wird entsprechend \(x\) komponentenweise als

\[x = \left( \begin{array}{c} x_1 \\ \vdots \\ x_n \end{array} \right)\]

betrachtet. Die Berechnung soll durch

\[\begin{array}{lcl}y &\leftarrow& \beta y \\y &\leftarrow& y + \alpha\, a_1\, x_1\\ &\vdots & \\y &\leftarrow& y + \alpha\,a_n\, x_n\\\end{array}\]

erfolgen. Dazu sollen die BLAS-Level 1 Funktionen für die SCAL und AXPY Operationen benutzt werden.

Als Signatur ist

void
dgemv_col(int m, int n, double alpha,
          const double *A, int incRowA, int incColA,
          const double *x, int incX,
          double beta,
          double *y, int incY);

zu verwenden.

Row-Major-Variante für GEMV

Die \(m \times n\) Matrix \(A\) wird mit

\[A = \left( \begin{array}{c} a_1^T \\ \vdots \\ a_m^T \end{array} \right)\]

zeilenweise betrachtet. Für eine konsitente Partitionierung des Produktes wird entsprechend \(y\) komponentenweise als

\[y = \left( \begin{array}{c} y_1 \\ \vdots \\ y_m \end{array} \right)\]

betrachtet. Die Berechnung soll durch

\[\begin{array}{lcl}y_1 &\leftarrow& \beta\, y_1 + \alpha\,a_1^T\, x \\ &\vdots & \\y_m &\leftarrow& \beta\, y_m + \alpha\,a_m^T\, x \\\end{array}\]

erfolgen. Dazu soll die BLAS-Level 1 Funktion DOT für das Skalarprodukt verwendet werden.

Als Signatur ist

void
dgemv_row(int m, int n, double alpha,
          const double *A, int incRowA, int incColA,
          const double *x, int incX,
          double beta,
          double *y, int incY);

zu verwenden.

Test nach Augenmaß

Testet eure Implementation zunächst an einfachen Beispielen, die man einfach nachrechnen kann. Ein etwas rigoroserer Test erfolgt dann bei den Benchmarks. Ihr könnt nachstehendes Grundgerüst verwenden. Natürlich könnt ihr die BLAS Level 1 Funktionen und Prozeduren (ddot, dscal,...) durch eure eigenen Implementierungen ersetzen.

#include <stdio.h>
#include <math.h>

//-- setup and print matrices --------------------------------------------------

void
initGeMatrix(int m, int n, double *A, int incRowA, int incColA)
{
    int i, j;

    for (i=0; i<m; ++i) {
        for (j=0; j<n; ++j) {
            A[i*incRowA+j*incColA] = i*n + j + 1;
        }
    }
}

void
printGeMatrix(int m, int n, const double *A, int incRowA, int incColA)
{
    int i, j;

    for (i=0; i<m; ++i) {
        for (j=0; j<n; ++j) {
            printf("%10.4lf ", A[i*incRowA+j*incColA]);
        }
        printf("\n");
    }
    printf("\n\n");
}

//-- some BLAS Level 1 procedures and functions --------------------------------

double
ddot(int n, const double *x, int incX, const double *y, int incY)
{
    int     i;
    double  alpha = 0;

    for (i=0; i<n; ++i) {
        alpha += x[i*incX]*y[i*incY];

    }
    return alpha;
}

void
daxpy(int n, double alpha, const double *x, int incX, double *y, int incY)
{
    int i;

    if (alpha==0) {
        return;
    }
    for (i=0; i<n; ++i) {
        y[i*incY] += alpha*x[i*incX];
    }
}

void
dscal(int n, double alpha, double *x, int incX)
{
    int i;

    if (alpha==1) {
        return;
    }
    for (i=0; i<n; ++i) {
        x[i*incX] *= alpha;
    }
}

void
dcopy(int n, const double *x, int incX, double *y, int incY)
{
    int i;

    for (i=0; i<n; ++i) {
        y[i*incY] = x[i*incX];
    }
}

void
dswap(int n, double *x, int incX, double *y, int incY)
{
    int i;

    for (i=0; i<n; ++i) {
        double tmp;

        tmp = x[i*incX];
        x[i*incX] = y[i*incY];
        y[i*incY] = tmp;
    }
}

double
damax(int n, const double *x, int incX)
{
    int     i;
    double  result = 0;

    for (i=0; i<n; ++i) {
        if (fabs(x[i*incX])>result) {
            result = fabs(x[i*incX]);
        }
    }
    return result;
}

double
dgenrm1(int m, int n, const double *A, int incRowA, int incColA)
{
    int     i, j;
    double  result = 0;

    for (j=0; j<n; ++j) {
        double sum = 0;
        for (i=0; i<m; ++i) {
            sum += fabs(A[i*incRowA+j*incColA]);
        }
        if (sum>result) {
            result = sum;
        }
    }
    return result;
}

//-- GEMV implementations ------------------------------------------------------

void
dgemv_col(int m, int n, double alpha,
          const double *A, int incRowA, int incColA,
          const double *x, int incX,
          double beta,
          double *y, int incY)
{
    // your code here
}

void
dgemv_row(int m, int n, double alpha,
          const double *A, int incRowA, int incColA,
          const double *x, int incX,
          double beta,
          double *y, int incY)
{
    // your code here
}

//------------------------------------------------------------------------------

#ifndef DIM_M
#define DIM_M 5
#endif

#ifndef DIM_N
#define DIM_N 6
#endif

#ifndef ROWMAJOR
#define ROWMAJOR 0
#endif

#if (ROWMAJOR==1)
#   define INCROW_A  DIM_N
#   define INCCOL_A  1
#else
#   define INCROW_A  1
#   define INCCOL_A  DIM_M
#endif

#ifndef INC_X
#define INC_X 1
#endif

#ifndef INC_Y
#define INC_Y 1
#endif

#ifndef ALPHA
#define ALPHA 2
#endif

#ifndef BETA
#define BETA 3
#endif


//
//  A, x, y_ will get initialized with initGeMatrix
//
double A[DIM_M*DIM_N];
double x[INC_X*DIM_N];
double y_[DIM_M];

//
// If BETA is not zero the result of BETA*y + ALPHA * A *x depends on the
// initial value of y.  We initialize y with y_ before each call of a gemv
// implementation.  So the results are always the same.
//
double y[INC_Y*DIM_M];

int
main()
{
    initGeMatrix(DIM_M, DIM_N, A, INCROW_A, INCCOL_A);
    initGeMatrix(DIM_N, 1, x, INC_X, 0);
    initGeMatrix(DIM_M, 1, y_, 1, 0);

    printf("A =\n");
    printGeMatrix(DIM_M, DIM_N, A, INCROW_A, INCCOL_A);

    printf("x =\n");
    printGeMatrix(DIM_N, 1, x, INC_X, 0);

    printf("y =\n");
    printGeMatrix(DIM_M, 1, y_, INC_Y, 0);

    //
    //  Aufruf von gemv_col
    //
    printf("gemv_col: %10.4lf * y + %10.4lf * A * x =\n\n",
           (double)BETA, (double)ALPHA);

    dcopy(DIM_M, y_, 1, y, INC_Y);
    dgemv_col(DIM_M, DIM_N,
              ALPHA,
              A, INCROW_A, INCCOL_A,
              x, INC_X,
              BETA,
              y, INC_Y);

    printGeMatrix(DIM_M, 1, y, INC_Y, 0);

    //
    //  Aufruf von gemv_row
    //
    printf("gemv_row: %10.4lf * y + %10.4lf * A * x =\n\n",
           (double)BETA, (double)ALPHA);

    dcopy(DIM_M, y_, 1, y, INC_Y);
    dgemv_row(DIM_M, DIM_N,
              ALPHA,
              A, INCROW_A, INCCOL_A,
              x, INC_X,
              BETA,
              y, INC_Y);

    printGeMatrix(DIM_M, 1, y, INC_Y, 0);


    return 0;
}

Test und Benchmark

Der Benchmark vergleicht beide Implementierungen mit einer Referenz-Implementierung:

Aufgaben

Fügt in dem Grundgerüst eure Implementierung für dgemv_col und dgemv_row ein und führt Tests für verschiedene Parameter und Compiler-Optimierungen durch:

Achtet bei der Ausgabe zuerst darauf, ob eure Implementierung korrekt ist und in den err-Spalten Werte im Bereich der Maschinengenauigkeit stehen. Erst dann auf die MFLOPS schauen!

Wenn alles klaptt, dann sollen die Ergebnisse in hübschen Plots aufbereitet werden. Folgendes Vorgehen kann man später leicht durch Makefiles automatisieren:

Grundgerüst

#include <stdio.h>
#include <math.h>
#include <stdlib.h>
#include <sys/times.h>
#include <unistd.h>

//-- timer for benchmarks ------------------------------------------------------

double
walltime()
{
   struct tms    ts;
   static double ClockTick=0.0;

   if (ClockTick==0.0) {
        ClockTick = 1.0 / ((double) sysconf(_SC_CLK_TCK));
   }
   return ((double) times(&ts)) * ClockTick;
}

//-- setup and print matrices --------------------------------------------------

void
initGeMatrix(int m, int n, double *A, int incRowA, int incColA)
{
    int i, j;

    for (i=0; i<m; ++i) {
        for (j=0; j<n; ++j) {
            A[i*incRowA+j*incColA] = i*n + j + 1;
        }
    }
}

void
randGeMatrix(int m, int n, double *A, int incRowA, int incColA)
{
    int i, j;
    for (j=0; j<n; ++j) {
        for (i=0; i<m; ++i) {
            A[i*incRowA+j*incColA] = ((double)rand()-RAND_MAX/2)*200/RAND_MAX;
        }
    }
}

void
printGeMatrix(int m, int n, const double *A, int incRowA, int incColA)
{
    int i, j;

    for (i=0; i<m; ++i) {
        for (j=0; j<n; ++j) {
            printf("%10.4lf ", A[i*incRowA+j*incColA]);
        }
        printf("\n");
    }
    printf("\n\n");
}

//-- some BLAS Level 1 procedures and functions --------------------------------

double
ddot(int n, const double *x, int incX, const double *y, int incY)
{
    int     i;
    double  alpha = 0;

    for (i=0; i<n; ++i) {
        alpha += x[i*incX]*y[i*incY];

    }
    return alpha;
}

void
daxpy(int n, double alpha, const double *x, int incX, double *y, int incY)
{
    int i;

    if (alpha==0) {
        return;
    }
    for (i=0; i<n; ++i) {
        y[i*incY] += alpha*x[i*incX];
    }
}

void
dscal(int n, double alpha, double *x, int incX)
{
    int i;

    if (alpha==1) {
        return;
    }
    for (i=0; i<n; ++i) {
        x[i*incX] *= alpha;
    }
}

void
dcopy(int n, const double *x, int incX, double *y, int incY)
{
    int i;

    for (i=0; i<n; ++i) {
        y[i*incY] = x[i*incX];
    }
}

void
dswap(int n, double *x, int incX, double *y, int incY)
{
    int i;

    for (i=0; i<n; ++i) {
        double tmp;

        tmp = x[i*incX];
        x[i*incX] = y[i*incY];
        y[i*incY] = tmp;
    }
}

double
damax(int n, const double *x, int incX)
{
    int     i;
    double  result = 0;

    for (i=0; i<n; ++i) {
        if (fabs(x[i*incX])>result) {
            result = fabs(x[i*incX]);
        }
    }
    return result;
}

double
dgenrm1(int m, int n, const double *A, int incRowA, int incColA)
{
    int     i, j;
    double  result = 0;

    for (j=0; j<n; ++j) {
        double sum = 0;
        for (i=0; i<m; ++i) {
            sum += fabs(A[i*incRowA+j*incColA]);
        }
        if (sum>result) {
            result = sum;
        }
    }
    return result;
}

//-- error bound for gemv ------------------------------------------------------

//
//  Vector y0 is the trusted result and vector y1 the vector to be tested.
//
double
err_dgemv(int m, int n, double alpha,
          const double *A, int incRowA, int incColA,
          const double *x, int incX,
          double beta,
          const double *y0, int incY0,
          double *y1, int incY1)
{
    int    max_mn = (m>n) ? m : n;
    double normA  = fabs(alpha)*dgenrm1(m, n, A, incRowA, incColA);
    double normX  = damax(n, x, incX);
    double normY0 = fabs(beta)*damax(m, y0, incY0);
    double normD;

    daxpy(m, -1.0, y0, incY0, y1, incY1);
    normD = damax(n, y1, incY1);

    return normD/(normA*normX*normY0*max_mn);
}


//-- GEMV implementations ------------------------------------------------------

void
dgemv_ref(int m, int n, double alpha,
          const double *A, int incRowA, int incColA,
          const double *x, int incX,
          double beta,
          double *y, int incY)
{
    int i, j;

    for (i=0; i<m; ++i) {
        if (beta==0) {
            y[i*incY] = 0;
        } else {
            y[i*incY] *= beta;
        }
        for (j=0; j<n; ++j) {
            y[i*incY] += alpha*A[i*incRowA+j*incColA]*x[j*incX];
        }
    }
}

void
dgemv_col(int m, int n, double alpha,
          const double *A, int incRowA, int incColA,
          const double *x, int incX,
          double beta,
          double *y, int incY)
{
}

void
dgemv_row(int m, int n, double alpha,
          const double *A, int incRowA, int incColA,
          const double *x, int incX,
          double beta,
          double *y, int incY)
{
}

//------------------------------------------------------------------------------

#ifndef MIN_M
#define MIN_M 100
#endif

#ifndef MIN_N
#define MIN_N 100
#endif

#ifndef MAX_M
#define MAX_M 8000
#endif

#ifndef MAX_N
#define MAX_N 8000
#endif

#ifndef INC_M
#define INC_M 100
#endif

#ifndef INC_N
#define INC_N 100
#endif

#ifndef ROWMAJOR
#define ROWMAJOR 0
#endif

#if (ROWMAJOR==1)
#   define INCROW_A  MAX_N
#   define INCCOL_A  1
#else
#   define INCROW_A  1
#   define INCCOL_A  MAX_M
#endif

#ifndef INC_X
#define INC_X 1
#endif

#ifndef INC_Y
#define INC_Y 1
#endif

#ifndef ALPHA
#define ALPHA 2
#endif

#ifndef BETA
#define BETA 3
#endif


//
//  A, x, y_ will get initialized with initGeMatrix
//
double A[MAX_M*MAX_N];
double x[INC_X*MAX_N];
double y_[MAX_M];

//
// Each implementation gets its own vector for y:
//
double y_0[INC_Y*MAX_M];
double y_1[INC_Y*MAX_M];
double y_2[INC_Y*MAX_M];

int
main()
{
    int m, n;

    randGeMatrix(MAX_M, MAX_N, A, INCROW_A, INCCOL_A);
    randGeMatrix(MAX_N, 1, x, INC_X, 0);
    randGeMatrix(MAX_M, 1, y_, 1, 0);

    printf("#%9s %9s %9s %9s", "m", "n", "INCROW_A", "INCCOL_A");
    printf(" %9s %9s", "INC_X", "INC_Y");
    printf(" %9s %9s", "ALPHA", "BETA");
    printf(" %12s %12s", "t0", "MFLOPS");
    printf(" %12s %12s %12s", "t1", "MFLOPS", "err");
    printf(" %12s %12s %12s", "t2", "MFLOPS", "err");
    printf("\n");

    for (m=MIN_M, n=MIN_N; (m<=MAX_M) && (n<=MAX_N); m+=INC_M, n+=INC_N) {
        int     runs = 1;
        int     ops = m*n + m*(n+1);
        double  t0, t1, t2, dt, err;

        printf(" %9d %9d %9d %9d", m, n, INCROW_A, INCCOL_A);
        printf(" %9d %9d", INC_X, INC_Y);
        printf(" %9.2lf %9.2lf", (double)ALPHA, (double)BETA);

        t0   = 0;
        runs = 0;
        do {
            dcopy(n, y_, 1, y_0, INC_Y);
            dt = walltime();
            dgemv_ref(m, n, ALPHA,
                      A, INCROW_A, INCCOL_A,
                      x, INC_X,
                      BETA,
                      y_0, INC_Y);
            dt = walltime() - dt;
            t0 += dt;
            ++runs;
        } while (t0<0.3);
        t0 /= runs;

        printf(" %12.2e %12.2lf", t0, ops/(1000*1000*t0));

        t1   = 0;
        runs = 0;
        do {
            dcopy(n, y_, 1, y_1, INC_Y);
            dt = walltime();
            dgemv_col(m, n, ALPHA,
                      A, INCROW_A, INCCOL_A,
                      x, INC_X,
                      BETA,
                      y_1, INC_Y);
            dt = walltime() - dt;
            t1 += dt;
            ++runs;
        } while (t1<0.3);
        t1 /= runs;

        err = err_dgemv(m, n, ALPHA,
                        A, INCROW_A, INCCOL_A,
                        x, INC_X,
                        BETA,
                        y_0, INC_Y,
                        y_1, INC_Y);

        printf(" %12.2e %12.2lf %12.2e", t1, ops/(1000*1000*t1), err);

        t2   = 0;
        runs = 0;
        do {
            dcopy(n, y_, 1, y_2, INC_Y);
            dt = walltime();
            dgemv_row(m, n, ALPHA,
                      A, INCROW_A, INCCOL_A,
                      x, INC_X,
                      BETA,
                      y_2, INC_Y);
            dt = walltime() - dt;
            t2 += dt;
            ++runs;
        } while (t2<0.3);
        t2 /= runs;

        err = err_dgemv(m, n, ALPHA,
                        A, INCROW_A, INCCOL_A,
                        x, INC_X,
                        BETA,
                        y_0, INC_Y,
                        y_2, INC_Y);

        printf(" %12.2e %12.2lf %12.2e", t2, ops/(1000*1000*t2), err);
        printf("\n");
    }

    return 0;
}