Matrix-Vektor Produkt GEMV

In BLAS Level 2 wird mit der GEMV-Operation (general matrix vector product)

\[y \leftarrow \beta y + \alpha A x,\quad A \in \mathbb{M}^{m \times n},\; x \in \mathbb{M}^{n},\; y \in \mathbb{M}^{m}\quad\text{und}\quad\alpha,\; \beta \in \mathbb{M}\]

das Matrix-Vektor Produkt für eine General-Matrix \(A\) definiert. Wir stellen zwei algorithmische Varianten vor, die anschliessend implementiert werden sollen.

Col-Major-Variante für GEMV

Die \(m \times n\) Matrix \(A\) wird mit

\[A = \left(a_1, \dots, a_n \right)\]

spaltenweise betrachtet. Für eine konsitente Partitionierung des Produktes wird entsprechend \(x\) komponentenweise als

\[x = \left( \begin{array}{c} x_1 \\ \vdots \\ x_n \end{array} \right)\]

betrachtet. Die Berechnung soll durch

\[\begin{array}{lcl}y &\leftarrow& \beta y \\y &\leftarrow& y + \alpha\, a_1\, x_1\\ &\vdots & \\y &\leftarrow& y + \alpha\,a_n\, x_n\\\end{array}\]

erfolgen. Dazu sollen die BLAS-Level 1 Funktionen für die SCAL und AXPY Operationen benutzt werden.

Als Signatur ist

void
dgemv_col(int m, int n, double alpha,
          const double *A, int incRowA, int incColA,
          const double *x, int incX,
          double beta,
          double *y, int incY);

zu verwenden.

Row-Major-Variante für GEMV

Die \(m \times n\) Matrix \(A\) wird mit

\[A = \left( \begin{array}{c} a_1^T \\ \vdots \\ a_m^T \end{array} \right)\]

zeilenweise betrachtet. Für eine konsitente Partitionierung des Produktes wird entsprechend \(y\) komponentenweise als

\[y = \left( \begin{array}{c} y_1 \\ \vdots \\ y_m \end{array} \right)\]

betrachtet. Die Berechnung soll durch

\[\begin{array}{lcl}y_1 &\leftarrow& \beta\, y_1 + \alpha\,a_1^T\, x \\ &\vdots & \\y_m &\leftarrow& \beta\, y_m + \alpha\,a_m^T\, x \\\end{array}\]

erfolgen. Dazu soll die BLAS-Level 1 Funktion DOT für das Skalarprodukt verwendet werden.

Als Signatur ist

void
dgemv_row(int m, int n, double alpha,
          const double *A, int incRowA, int incColA,
          const double *x, int incX,
          double beta,
          double *y, int incY);

zu verwenden.

Test nach Augenmaß

Testet eure Implementation zunächst an einfachen Beispielen, die man einfach nachrechnen kann. Ein etwas rigoroserer Test erfolgt dann bei den Benchmarks. Ihr könnt nachstehendes Grundgerüst verwenden. Natürlich könnt ihr die BLAS Level 1 Funktionen und Prozeduren (ddot, dscal,...) durch eure eigenen Implementierungen ersetzen.

session05/example01/gemv_test.c

#include <stdio.h>
#include <math.h>

//-- setup and print matrices --------------------------------------------------

void
initGeMatrix(int m, int n, double *A, int incRowA, int incColA)
{
    int i, j;

    for (i=0; i<m; ++i) {
        for (j=0; j<n; ++j) {
            A[i*incRowA+j*incColA] = i*n + j + 1;
        }
    }
}

void
printGeMatrix(int m, int n, const double *A, int incRowA, int incColA)
{
    int i, j;

    for (i=0; i<m; ++i) {
        for (j=0; j<n; ++j) {
            printf("%10.4lf ", A[i*incRowA+j*incColA]);
        }
        printf("\n");
    }
    printf("\n\n");
}

//-- some BLAS Level 1 procedures and functions --------------------------------

double
ddot(int n, const double *x, int incX, const double *y, int incY)
{
    int     i;
    double  alpha = 0;

    for (i=0; i<n; ++i) {
        alpha += x[i*incX]*y[i*incY];

    }
    return alpha;
}

void
daxpy(int n, double alpha, const double *x, int incX, double *y, int incY)
{
    int i;

    if (alpha==0) {
        return;
    }
    for (i=0; i<n; ++i) {
        y[i*incY] += alpha*x[i*incX];
    }
}

void
dscal(int n, double alpha, double *x, int incX)
{
    int i;

    if (alpha==1) {
        return;
    }
    for (i=0; i<n; ++i) {
        x[i*incX] *= alpha;
    }
}

void
dcopy(int n, const double *x, int incX, double *y, int incY)
{
    int i;

    for (i=0; i<n; ++i) {
        y[i*incY] = x[i*incX];
    }
}

void
dswap(int n, double *x, int incX, double *y, int incY)
{
    int i;

    for (i=0; i<n; ++i) {
        double tmp;

        tmp = x[i*incX];
        x[i*incX] = y[i*incY];
        y[i*incY] = tmp;
    }
}

double
damax(int n, const double *x, int incX)
{
    int     i;
    double  result = 0;

    for (i=0; i<n; ++i) {
        if (fabs(x[i*incX])>result) {
            result = fabs(x[i*incX]);
        }
    }
    return result;
}

double
dgenrm1(int m, int n, const double *A, int incRowA, int incColA)
{
    int     i, j;
    double  result = 0;

    for (j=0; j<n; ++j) {
        double sum = 0;
        for (i=0; i<m; ++i) {
            sum += fabs(A[i*incRowA+j*incColA]);
        }
        if (sum>result) {
            result = sum;
        }
    }
    return result;
}

//-- GEMV implementations ------------------------------------------------------

void
dgemv_col(int m, int n, double alpha,
          const double *A, int incRowA, int incColA,
          const double *x, int incX,
          double beta,
          double *y, int incY)
{
    // your code here
}

void
dgemv_row(int m, int n, double alpha,
          const double *A, int incRowA, int incColA,
          const double *x, int incX,
          double beta,
          double *y, int incY)
{
    // your code here
}

//------------------------------------------------------------------------------

#ifndef DIM_M
#define DIM_M 5
#endif

#ifndef DIM_N
#define DIM_N 6
#endif

#ifndef ROWMAJOR
#define ROWMAJOR 0
#endif

#if (ROWMAJOR==1)
#   define INCROW_A  DIM_N
#   define INCCOL_A  1
#else
#   define INCROW_A  1
#   define INCCOL_A  DIM_M
#endif

#ifndef INC_X
#define INC_X 1
#endif

#ifndef INC_Y
#define INC_Y 1
#endif

#ifndef ALPHA
#define ALPHA 2
#endif

#ifndef BETA
#define BETA 3
#endif


//
//  A, x, y_ will get initialized with initGeMatrix
//
double A[DIM_M*DIM_N];
double x[INC_X*DIM_N];
double y_[DIM_M];

//
// If BETA is not zero the result of BETA*y + ALPHA * A *x depends on the
// initial value of y.  We initialize y with y_ before each call of a gemv
// implementation.  So the results are always the same.
//
double y[INC_Y*DIM_M];

int
main()
{
    initGeMatrix(DIM_M, DIM_N, A, INCROW_A, INCCOL_A);
    initGeMatrix(DIM_N, 1, x, INC_X, 0);
    initGeMatrix(DIM_M, 1, y_, 1, 0);

    printf("A =\n");
    printGeMatrix(DIM_M, DIM_N, A, INCROW_A, INCCOL_A);

    printf("x =\n");
    printGeMatrix(DIM_N, 1, x, INC_X, 0);

    printf("y =\n");
    printGeMatrix(DIM_M, 1, y_, INC_Y, 0);

    //
    //  Aufruf von gemv_col
    //
    printf("gemv_col: %10.4lf * y + %10.4lf * A * x =\n\n",
           (double)BETA, (double)ALPHA);

    dcopy(DIM_M, y_, 1, y, INC_Y);
    dgemv_col(DIM_M, DIM_N,
              ALPHA,
              A, INCROW_A, INCCOL_A,
              x, INC_X,
              BETA,
              y, INC_Y);

    printGeMatrix(DIM_M, 1, y, INC_Y, 0);

    //
    //  Aufruf von gemv_row
    //
    printf("gemv_row: %10.4lf * y + %10.4lf * A * x =\n\n",
           (double)BETA, (double)ALPHA);

    dcopy(DIM_M, y_, 1, y, INC_Y);
    dgemv_row(DIM_M, DIM_N,
              ALPHA,
              A, INCROW_A, INCCOL_A,
              x, INC_X,
              BETA,
              y, INC_Y);

    printGeMatrix(DIM_M, 1, y, INC_Y, 0);


    return 0;
}

Test und Benchmark

Der Benchmark vergleicht beide Implementierungen mit einer Referenz-Implementierung:

Die Fehlerschranke wird berechnet durch
\[\text{err} =\frac{ \|y_1 - y_0 \|_\infty}{ \max\{m, n\} \, |\alpha| \, |\beta|\,\|A\|_1 \, \|x\|_\infty \|y_0\|_\infty}\]
wobei mit \(y_0\) das Ergebnis der Referenzimplemnetierung und mit \(y_1\) das Ergebnis einer zu testenden Implementierung bezeichnet wird.
Die Rechenleistung (Rechenoperationen pro Zeit) berechnet sich bei der GEMV-Operation als
\[\text{FLOPS} =\frac{\text{#OP}_{\text{add}} + \text{#OP}_{\text{mult}}}{t} =\frac{m \cdot n + m\cdot (n+1)}{t}\]
und
\[\text{MFLOPS} =\frac{\text{#OP}_{\text{add}} + \text{#OP}_{\text{mult}}}{t} 10^{-6}\]
Das entspricht genau genommen nur dem Fall \(\beta = 1\). Außerdem wird hier stets die Mindestanzahl der Operationen benutzt. Die Implementierung von dgemv_col führt bei einem Aufruf mehr Multiplikationen aus. Nach dem von-Neumann-Modell müsste diese Implementierung also immer schlechter sein als dgemv_row.

Aufgaben

Fügt in dem Grundgerüst eure Implementierung für dgemv_col und dgemv_row ein und führt Tests für verschiedene Parameter und Compiler-Optimierungen durch:

Führt jeden Benchmark einmal für Row-Major (-DROWMAJOR=1) und Col-Major (-DROWMAJOR=0) Matrizen aus.
Übersetzt jeweils mit den Optimierungsflags -O1, -O2, -O3 und -Ofast.

Achtet bei der Ausgabe zuerst darauf, ob eure Implementierung korrekt ist und in den err-Spalten Werte im Bereich der Maschinengenauigkeit stehen. Erst dann auf die MFLOPS schauen!

Wenn alles klaptt, dann sollen die Ergebnisse in hübschen Plots aufbereitet werden. Folgendes Vorgehen kann man später leicht durch Makefiles automatisieren:

Die ausführbaren Dateien bekommen einen Namensuffix _row, falls mit -DROWMAJOR=1 übersetzt wird, und sonst den Zusatz _col. Ein weiteres Suffix _O1, ... _O3 oder _Ofast markiert die verwendete Optimierung. Insgesamt also übersetzen wir mit

gcc -Wall -O1 -DROWMAJOR=1 -o gemv_bench_row_O1 gemv_bench.c
gcc -Wall -O2 -DROWMAJOR=1 -o gemv_bench_row_O2 gemv_bench.c
gcc -Wall -O3 -DROWMAJOR=1 -o gemv_bench_row_O3 gemv_bench.c
gcc -Wall -Ofast -DROWMAJOR=1 -o gemv_bench_row_Ofast gemv_bench.c
gcc -Wall -O1 -DROWMAJOR=0 -o gemv_bench_col_O1 gemv_bench.c
gcc -Wall -O2 -DROWMAJOR=0 -o gemv_bench_col_O2 gemv_bench.c
gcc -Wall -O3 -DROWMAJOR=0 -o gemv_bench_col_O3 gemv_bench.c
gcc -Wall -Ofast -DROWMAJOR=0 -o gemv_bench_col_Ofast gemv_bench.c

Die Ausgaben lenken wir anschließend in Report-Dateien mit jeweils entsprechendem Suffix um:

gemv_bench_row_O1 > report.gemv.row_O1
gemv_bench_row_O2 > report.gemv.row_O2
gemv_bench_row_O3 > report.gemv.row_O3
gemv_bench_row_Ofast > report.gemv.row_Ofast
gemv_bench_col_O1 > report.gemv.col_O1
gemv_bench_col_O2 > report.gemv.col_O2
gemv_bench_col_O3 > report.gemv.col_O3
gemv_bench_col_Ofast > report.gemv.col_Ofast

Mit zwei GNUPLOT-Skripten beschreiben wir, wie daraus Plots erzeugt werden sollen:

Plots für RowMajor-Benchmarks:

session05/example01/plot.gemv_row

set terminal svg size 1200, 500
set output "bench.gemv_row.svg"
set xlabel "Matrix Dimension (M=N)"
set ylabel "MFLOPS"
set yrange[0:4500]
set title "GEMV for Row-Major Matrix"
set key outside
set pointsize 1.25
plot "report.gemv.row_O1"    using 1:10 with linespoints lt  1 lw 1 title "GEMV-ref (-O1)", \
     "report.gemv.row_O1"    using 1:12 with linespoints lt  2 lw 1 title "GEMV-col (-O1)", \
     "report.gemv.row_O1"    using 1:15 with linespoints lt  3 lw 1 title "GEMV-row (-O1)", \
     "report.gemv.row_O2"    using 1:10 with linespoints lt  4 lw 1 title "GEMV-ref (-O2)", \
     "report.gemv.row_O2"    using 1:12 with linespoints lt  5 lw 1 title "GEMV-col (-O2)", \
     "report.gemv.row_O2"    using 1:15 with linespoints lt  6 lw 1 title "GEMV-row (-O2)", \
     "report.gemv.row_O3"    using 1:10 with linespoints lt  7 lw 1 title "GEMV-ref (-O3)", \
     "report.gemv.row_O3"    using 1:12 with linespoints lt  8 lw 1 title "GEMV-col (-O3)", \
     "report.gemv.row_O3"    using 1:15 with linespoints lt  9 lw 1 title "GEMV-row (-O3)", \
     "report.gemv.row_Ofast" using 1:10 with linespoints lt 10 lw 1 title "GEMV-ref (-Ofast)", \
     "report.gemv.row_Ofast" using 1:12 with linespoints lt 11 lw 1 title "GEMV-col (-Ofast)", \
     "report.gemv.row_Ofast" using 1:15 with linespoints lt 12 lw 1 title "GEMV-row (-Ofast)"

Plots für ColMajor-Benchmarks:

session05/example01/plot.gemv_col

set terminal svg size 1200, 500
set output "bench.gemv_col.svg"
set xlabel "Matrix Dimension (M=N)"
set ylabel "MFLOPS"
set yrange[0:4500]
set title "GEMV for Col-Major Matrix"
set key outside
set pointsize 1.25
plot "report.gemv.col_O1"    using 1:10 with linespoints lt  1 lw 1 title "GEMV-ref (-O1)", \
     "report.gemv.col_O1"    using 1:12 with linespoints lt  2 lw 1 title "GEMV-col (-O1)", \
     "report.gemv.col_O1"    using 1:15 with linespoints lt  3 lw 1 title "GEMV-row (-O1)", \
     "report.gemv.col_O2"    using 1:10 with linespoints lt  4 lw 1 title "GEMV-ref (-O2)", \
     "report.gemv.col_O2"    using 1:12 with linespoints lt  5 lw 1 title "GEMV-col (-O2)", \
     "report.gemv.col_O2"    using 1:15 with linespoints lt  6 lw 1 title "GEMV-row (-O2)", \
     "report.gemv.col_O3"    using 1:10 with linespoints lt  7 lw 1 title "GEMV-ref (-O3)", \
     "report.gemv.col_O3"    using 1:12 with linespoints lt  8 lw 1 title "GEMV-col (-O3)", \
     "report.gemv.col_O3"    using 1:15 with linespoints lt  9 lw 1 title "GEMV-row (-O3)", \
     "report.gemv.col_Ofast" using 1:10 with linespoints lt 10 lw 1 title "GEMV-ref (-Ofast)", \
     "report.gemv.col_Ofast" using 1:12 with linespoints lt 11 lw 1 title "GEMV-col (-Ofast)", \
     "report.gemv.col_Ofast" using 1:15 with linespoints lt 12 lw 1 title "GEMV-row (-Ofast)"

Erzeugt werden die Plots dann durch

gnuplot plot.gemv_row
gnuplot plot.gemv_col

Grundgerüst

session05/example01/gemv_bench.c

#include <stdio.h>
#include <math.h>
#include <stdlib.h>
#include <sys/times.h>
#include <unistd.h>

//-- timer for benchmarks ------------------------------------------------------

double
walltime()
{
   struct tms    ts;
   static double ClockTick=0.0;

   if (ClockTick==0.0) {
        ClockTick = 1.0 / ((double) sysconf(_SC_CLK_TCK));
   }
   return ((double) times(&ts)) * ClockTick;
}

//-- setup and print matrices --------------------------------------------------

void
initGeMatrix(int m, int n, double *A, int incRowA, int incColA)
{
    int i, j;

    for (i=0; i<m; ++i) {
        for (j=0; j<n; ++j) {
            A[i*incRowA+j*incColA] = i*n + j + 1;
        }
    }
}

void
randGeMatrix(int m, int n, double *A, int incRowA, int incColA)
{
    int i, j;
    for (j=0; j<n; ++j) {
        for (i=0; i<m; ++i) {
            A[i*incRowA+j*incColA] = ((double)rand()-RAND_MAX/2)*200/RAND_MAX;
        }
    }
}

void
printGeMatrix(int m, int n, const double *A, int incRowA, int incColA)
{
    int i, j;

    for (i=0; i<m; ++i) {
        for (j=0; j<n; ++j) {
            printf("%10.4lf ", A[i*incRowA+j*incColA]);
        }
        printf("\n");
    }
    printf("\n\n");
}

//-- some BLAS Level 1 procedures and functions --------------------------------

double
ddot(int n, const double *x, int incX, const double *y, int incY)
{
    int     i;
    double  alpha = 0;

    for (i=0; i<n; ++i) {
        alpha += x[i*incX]*y[i*incY];

    }
    return alpha;
}

void
daxpy(int n, double alpha, const double *x, int incX, double *y, int incY)
{
    int i;

    if (alpha==0) {
        return;
    }
    for (i=0; i<n; ++i) {
        y[i*incY] += alpha*x[i*incX];
    }
}

void
dscal(int n, double alpha, double *x, int incX)
{
    int i;

    if (alpha==1) {
        return;
    }
    for (i=0; i<n; ++i) {
        x[i*incX] *= alpha;
    }
}

void
dcopy(int n, const double *x, int incX, double *y, int incY)
{
    int i;

    for (i=0; i<n; ++i) {
        y[i*incY] = x[i*incX];
    }
}

void
dswap(int n, double *x, int incX, double *y, int incY)
{
    int i;

    for (i=0; i<n; ++i) {
        double tmp;

        tmp = x[i*incX];
        x[i*incX] = y[i*incY];
        y[i*incY] = tmp;
    }
}

double
damax(int n, const double *x, int incX)
{
    int     i;
    double  result = 0;

    for (i=0; i<n; ++i) {
        if (fabs(x[i*incX])>result) {
            result = fabs(x[i*incX]);
        }
    }
    return result;
}

double
dgenrm1(int m, int n, const double *A, int incRowA, int incColA)
{
    int     i, j;
    double  result = 0;

    for (j=0; j<n; ++j) {
        double sum = 0;
        for (i=0; i<m; ++i) {
            sum += fabs(A[i*incRowA+j*incColA]);
        }
        if (sum>result) {
            result = sum;
        }
    }
    return result;
}

//-- error bound for gemv ------------------------------------------------------

//
//  Vector y0 is the trusted result and vector y1 the vector to be tested.
//
double
err_dgemv(int m, int n, double alpha,
          const double *A, int incRowA, int incColA,
          const double *x, int incX,
          double beta,
          const double *y0, int incY0,
          double *y1, int incY1)
{
    int    max_mn = (m>n) ? m : n;
    double normA  = fabs(alpha)*dgenrm1(m, n, A, incRowA, incColA);
    double normX  = damax(n, x, incX);
    double normY0 = fabs(beta)*damax(m, y0, incY0);
    double normD;

    daxpy(m, -1.0, y0, incY0, y1, incY1);
    normD = damax(n, y1, incY1);

    return normD/(normA*normX*normY0*max_mn);
}


//-- GEMV implementations ------------------------------------------------------

void
dgemv_ref(int m, int n, double alpha,
          const double *A, int incRowA, int incColA,
          const double *x, int incX,
          double beta,
          double *y, int incY)
{
    int i, j;

    for (i=0; i<m; ++i) {
        if (beta==0) {
            y[i*incY] = 0;
        } else {
            y[i*incY] *= beta;
        }
        for (j=0; j<n; ++j) {
            y[i*incY] += alpha*A[i*incRowA+j*incColA]*x[j*incX];
        }
    }
}

void
dgemv_col(int m, int n, double alpha,
          const double *A, int incRowA, int incColA,
          const double *x, int incX,
          double beta,
          double *y, int incY)
{
}

void
dgemv_row(int m, int n, double alpha,
          const double *A, int incRowA, int incColA,
          const double *x, int incX,
          double beta,
          double *y, int incY)
{
}

//------------------------------------------------------------------------------

#ifndef MIN_M
#define MIN_M 100
#endif

#ifndef MIN_N
#define MIN_N 100
#endif

#ifndef MAX_M
#define MAX_M 8000
#endif

#ifndef MAX_N
#define MAX_N 8000
#endif

#ifndef INC_M
#define INC_M 100
#endif

#ifndef INC_N
#define INC_N 100
#endif

#ifndef ROWMAJOR
#define ROWMAJOR 0
#endif

#if (ROWMAJOR==1)
#   define INCROW_A  MAX_N
#   define INCCOL_A  1
#else
#   define INCROW_A  1
#   define INCCOL_A  MAX_M
#endif

#ifndef INC_X
#define INC_X 1
#endif

#ifndef INC_Y
#define INC_Y 1
#endif

#ifndef ALPHA
#define ALPHA 2
#endif

#ifndef BETA
#define BETA 3
#endif


//
//  A, x, y_ will get initialized with initGeMatrix
//
double A[MAX_M*MAX_N];
double x[INC_X*MAX_N];
double y_[MAX_M];

//
// Each implementation gets its own vector for y:
//
double y_0[INC_Y*MAX_M];
double y_1[INC_Y*MAX_M];
double y_2[INC_Y*MAX_M];

int
main()
{
    int m, n;

    randGeMatrix(MAX_M, MAX_N, A, INCROW_A, INCCOL_A);
    randGeMatrix(MAX_N, 1, x, INC_X, 0);
    randGeMatrix(MAX_M, 1, y_, 1, 0);

    printf("#%9s %9s %9s %9s", "m", "n", "INCROW_A", "INCCOL_A");
    printf(" %9s %9s", "INC_X", "INC_Y");
    printf(" %9s %9s", "ALPHA", "BETA");
    printf(" %12s %12s", "t0", "MFLOPS");
    printf(" %12s %12s %12s", "t1", "MFLOPS", "err");
    printf(" %12s %12s %12s", "t2", "MFLOPS", "err");
    printf("\n");

    for (m=MIN_M, n=MIN_N; (m<=MAX_M) && (n<=MAX_N); m+=INC_M, n+=INC_N) {
        int     runs = 1;
        int     ops = m*n + m*(n+1);
        double  t0, t1, t2, dt, err;

        printf(" %9d %9d %9d %9d", m, n, INCROW_A, INCCOL_A);
        printf(" %9d %9d", INC_X, INC_Y);
        printf(" %9.2lf %9.2lf", (double)ALPHA, (double)BETA);

        t0   = 0;
        runs = 0;
        do {
            dcopy(n, y_, 1, y_0, INC_Y);
            dt = walltime();
            dgemv_ref(m, n, ALPHA,
                      A, INCROW_A, INCCOL_A,
                      x, INC_X,
                      BETA,
                      y_0, INC_Y);
            dt = walltime() - dt;
            t0 += dt;
            ++runs;
        } while (t0<0.3);
        t0 /= runs;

        printf(" %12.2e %12.2lf", t0, ops/(1000*1000*t0));

        t1   = 0;
        runs = 0;
        do {
            dcopy(n, y_, 1, y_1, INC_Y);
            dt = walltime();
            dgemv_col(m, n, ALPHA,
                      A, INCROW_A, INCCOL_A,
                      x, INC_X,
                      BETA,
                      y_1, INC_Y);
            dt = walltime() - dt;
            t1 += dt;
            ++runs;
        } while (t1<0.3);
        t1 /= runs;

        err = err_dgemv(m, n, ALPHA,
                        A, INCROW_A, INCCOL_A,
                        x, INC_X,
                        BETA,
                        y_0, INC_Y,
                        y_1, INC_Y);

        printf(" %12.2e %12.2lf %12.2e", t1, ops/(1000*1000*t1), err);

        t2   = 0;
        runs = 0;
        do {
            dcopy(n, y_, 1, y_2, INC_Y);
            dt = walltime();
            dgemv_row(m, n, ALPHA,
                      A, INCROW_A, INCCOL_A,
                      x, INC_X,
                      BETA,
                      y_2, INC_Y);
            dt = walltime() - dt;
            t2 += dt;
            ++runs;
        } while (t2<0.3);
        t2 /= runs;

        err = err_dgemv(m, n, ALPHA,
                        A, INCROW_A, INCCOL_A,
                        x, INC_X,
                        BETA,
                        y_0, INC_Y,
                        y_2, INC_Y);

        printf(" %12.2e %12.2lf %12.2e", t2, ops/(1000*1000*t2), err);
        printf("\n");
    }

    return 0;
}