GEMV-Prozeduren aufräumen

Die Implementierungen von dgemv_blk_row und dgemv_blk_col packen wir in eine Prozedur dgemv_ulm. Durch eine Fallunterscheidung wird die günstigere Implementierung ausgewählt:

void
dgemv_ulm(int m, int n, double alpha,
          const double *A, int incRowA, int incColA,
          const double *x, int incX,
          double beta,
          double *y, int incY)
{
    if (incRowA<incColA) {
        int bs = DGEMV_MxBS;
        int j;

        dscal(m, beta, y, incY);

        for (j=0; j+bs<=n; j+=bs) {
            dgemv_mxBS(m, alpha,
                       &A[j*incColA], incRowA, incColA,
                       &x[j*incX], incX,
                       y, incY);
        }
        for (; j<n; ++j) {
            daxpy(m, alpha*x[j*incX], &A[j*incColA], incRowA, y, incY);
        }
    } else {
        int bs = DGEMV_BSxN;
        int i;

        for (i=0; i+bs<=m; i+=bs) {
            dgemv_BSxn(n, alpha,
                       &A[i*incRowA], incRowA, incColA,
                       x, incX,
                       beta,
                       &y[i*incY], incY);
        }
        for (; i<m; ++i) {
            y[i*incY] *= beta;
            y[i*incY] += alpha*ddot(n, &A[i*incRowA], incColA, x, incX);
        }
    }
}

Benchmark mit der Intel MKL

Wir werden das Matrix-Vektor-Produkt vorerst nicht weiter optimieren. Wir versuchen in einer nächsten Vorlesung erst einmal zu erklären, wie durch eine Variation der Algorithmen überhaupt eine Leistungssteigerung erzielt werden konnte.

Um zu belegen, dass wir aber prinzipiell auf dem richtigen Weg sind, werden wir die aktuelle Version mit der Implementierung der Intel Math Kernel Library (Intel MKL) vergleichen. Im Linux-Pool E44 ist eine Version der MKL installiert. Für den Benchmark kann folgendes Makefile verwendet werden (wget http://www.mathematik.uni-ulm.de/numerik/hpc/ss16/download/session05/example03/Makefile):

session05/example03/Makefile

CC         = gcc
CFLAGS     += -Wall -Ofast

Arch       := intel64
IntelDir   := /opt/intel/compilers_and_libraries/linux
IncludeDir := $(IntelDir)/mkl/include
LibDirs    := $(IntelDir)/lib/$(Arch) $(IntelDir)/mkl/lib/$(Arch)
LinkerFlag := -Wl,
Rpath      := $(patsubst %,$(LinkerFlag)-rpath $(LinkerFlag)%,$(LibDirs))
Lpath      := $(patsubst %,-L%,$(LibDirs))

LDFLAGS    += $(Lpath) $(Rpath)
CPPFLAGS   += -I$(IncludeDir) -DMKL_ILP64
LDLIBS     += -lmkl_intel_ilp64 -lmkl_core -lmkl_sequential -lm -lpthread

TARGETS = gemv_bench_sol_row gemv_bench_sol_col

%_row : %.c
    $(CC) $(CPPFLAGS) $(CFLAGS) $(LDFLAGS) $(LDLIBS) -DROWMAJOR=1 -o $@ $<

%_col : %.c
    $(CC) $(CPPFLAGS) $(CFLAGS) $(LDFLAGS) $(LDLIBS) -DROWMAJOR=0 -o $@ $<

all: $(TARGETS)

clean:
    $(RM) $(TARGETS)

Mit make werden zwei ausführbare Programme gemv_bench_sol_col und gemv_bench_sol_row erzeugt:

$shell> make
gcc -I/opt/intel/compilers_and_libraries/linux/mkl/include -DMKL_ILP64 -Wall -Ofast -L/opt/intel/compilers_and_libraries/linux/lib/intel64 -L/opt/intel/compilers_and_libraries/linux/mkl/lib/intel64 -Wl,-rpath -Wl,/opt/intel/compilers_and_libraries/linux/lib/intel64 -Wl,-rpath -Wl,/opt/intel/compilers_and_libraries/linux/mkl/lib/intel64 -lmkl_intel_ilp64 -lmkl_core -lmkl_sequential -lm -lpthread -DROWMAJOR=1 -o gemv_bench_sol_row gemv_bench_sol.c
gcc -I/opt/intel/compilers_and_libraries/linux/mkl/include -DMKL_ILP64 -Wall -Ofast -L/opt/intel/compilers_and_libraries/linux/lib/intel64 -L/opt/intel/compilers_and_libraries/linux/mkl/lib/intel64 -Wl,-rpath -Wl,/opt/intel/compilers_and_libraries/linux/lib/intel64 -Wl,-rpath -Wl,/opt/intel/compilers_and_libraries/linux/mkl/lib/intel64 -lmkl_intel_ilp64 -lmkl_core -lmkl_sequential -lm -lpthread -DROWMAJOR=0 -o gemv_bench_sol_col gemv_bench_sol.c
$shell>

Wir lenken die Ausgabe wieder in Dateien um, so dass anschließend wieder Plots erzeugt werden können:

$shell> ./gemv_bench_sol_col > report.gemv_mkl_col
$shell> ./gemv_bench_sol_row > report.gemv_mkl_row
$shell> gnuplot plot.gemv_mkl
$shell>

Hier wurde folgendes Skript für Gnuplot verwendet:

session05/example03/plot.gemv_mkl

set terminal svg size 1200, 500
set output "bench.gemv_mkl.svg"
set xlabel "Matrix Dimension (M=N)"
set ylabel "MFLOPS"
set yrange[0:12000]
set title "GEMV for Col-Major and Row-Major Matrices"
set key outside
set pointsize 1.25
plot "report.gemv_mkl_col" using 1:12 with linespoints lt  1 lw 1 title "dgemv_ulm (col major)", \
     "report.gemv_mkl_col" using 1:15 with linespoints lt  2 lw 1 title "dgemv_mkl (col major)", \
     "report.gemv_mkl_row" using 1:12 with linespoints lt  3 lw 1 title "dgemv_ulm (row major)", \
     "report.gemv_mkl_row" using 1:15 with linespoints lt  4 lw 1 title "dgemv_mkl (row major)"

Schließlich noch der Source-Code für den Benchmark:

session05/example03/gemv_bench_sol.c

#include <stdio.h>
#include <math.h>
#include <stdlib.h>
#include <sys/times.h>
#include <unistd.h>

//-- timer for benchmarks ------------------------------------------------------

double
walltime()
{
   struct tms    ts;
   static double ClockTick=0.0;

   if (ClockTick==0.0) {
        ClockTick = 1.0 / ((double) sysconf(_SC_CLK_TCK));
   }
   return ((double) times(&ts)) * ClockTick;
}

//-- setup and print matrices --------------------------------------------------

void
initGeMatrix(int m, int n, double *A, int incRowA, int incColA)
{
    int i, j;

    for (i=0; i<m; ++i) {
        for (j=0; j<n; ++j) {
            A[i*incRowA+j*incColA] = i*n + j + 1;
        }
    }
}

void
randGeMatrix(int m, int n, double *A, int incRowA, int incColA)
{
    int i, j;
    for (j=0; j<n; ++j) {
        for (i=0; i<m; ++i) {
            A[i*incRowA+j*incColA] = ((double)rand()-RAND_MAX/2)*200/RAND_MAX;
        }
    }
}

void
printGeMatrix(int m, int n, const double *A, int incRowA, int incColA)
{
    int i, j;

    for (i=0; i<m; ++i) {
        for (j=0; j<n; ++j) {
            printf("%10.4lf ", A[i*incRowA+j*incColA]);
        }
        printf("\n");
    }
    printf("\n\n");
}

//-- some BLAS Level 1 procedures and functions --------------------------------

double
ddot(int n, const double *x, int incX, const double *y, int incY)
{
    int     i;
    double  alpha = 0;

    for (i=0; i<n; ++i) {
        alpha += x[i*incX]*y[i*incY];

    }
    return alpha;
}

void
daxpy(int n, double alpha, const double *x, int incX, double *y, int incY)
{
    int i;

    if (alpha==0) {
        return;
    }
    for (i=0; i<n; ++i) {
        y[i*incY] += alpha*x[i*incX];
    }
}

void
dscal(int n, double alpha, double *x, int incX)
{
    int i;

    if (alpha==1) {
        return;
    }
    for (i=0; i<n; ++i) {
        x[i*incX] *= alpha;
    }
}

void
dcopy(int n, const double *x, int incX, double *y, int incY)
{
    int i;

    for (i=0; i<n; ++i) {
        y[i*incY] = x[i*incX];
    }
}

void
dswap(int n, double *x, int incX, double *y, int incY)
{
    int i;

    for (i=0; i<n; ++i) {
        double tmp;

        tmp = x[i*incX];
        x[i*incX] = y[i*incY];
        y[i*incY] = tmp;
    }
}

double
damax(int n, const double *x, int incX)
{
    int     i;
    double  result = 0;

    for (i=0; i<n; ++i) {
        if (fabs(x[i*incX])>result) {
            result = fabs(x[i*incX]);
        }
    }
    return result;
}

double
dgenrm1(int m, int n, const double *A, int incRowA, int incColA)
{
    int     i, j;
    double  result = 0;

    for (j=0; j<n; ++j) {
        double sum = 0;
        for (i=0; i<m; ++i) {
            sum += fabs(A[i*incRowA+j*incColA]);
        }
        if (sum>result) {
            result = sum;
        }
    }
    return result;
}

//-- error bound for gemv ------------------------------------------------------

//
//  Vector y0 is the trusted result and vector y1 the vector to be tested.
//
double
err_dgemv(int m, int n, double alpha,
          const double *A, int incRowA, int incColA,
          const double *x, int incX,
          double beta,
          const double *y0, int incY0,
          double *y1, int incY1)
{
    int    max_mn = (m>n) ? m : n;
    double normA  = fabs(alpha)*dgenrm1(m, n, A, incRowA, incColA);
    double normX  = damax(n, x, incX);
    double normY0 = fabs(beta)*damax(m, y0, incY0);
    double normD;

    daxpy(m, -1.0, y0, incY0, y1, incY1);
    normD = damax(n, y1, incY1);

    return normD/(normA*normX*normY0*max_mn);
}

//-- Fused BLAS Level 1 procedures and functions -------------------------------

#ifndef DGEMV_MxBS
#define DGEMV_MxBS 4
#endif

void
dgemv_mxBS(int m, double alpha,
           const double *A, int incRowA, int incColA,
           const double *x, int incX,
           double *y, int incY)
{
    int i, j;

    if (alpha==0) {
        return;
    }
    for (i=0; i<m; ++i) {
        double dot = 0;
        for (j=0; j<DGEMV_MxBS; ++j) {
            dot += A[i*incRowA+j*incColA]*x[j*incX];
        }
        y[i*incY] += alpha*dot;
    }
}

#ifndef DGEMV_BSxN
#define DGEMV_BSxN 4
#endif

void
dgemv_BSxn(int n, double alpha,
           const double *A, int incRowA, int incColA,
           const double *x, int incX,
           double beta,
           double *y, int incY)
{
    int i, j;

    if (alpha==0) {
        return;
    }
    if (beta!=1) {
        for (i=0; i<DGEMV_BSxN; ++i) {
            y[i*incY] *= beta;
        }
    }
    for (j=0; j<n; ++j) {
        for (i=0; i<DGEMV_BSxN; ++i) {
            y[i*incY] += alpha*A[i*incRowA+j*incColA]*x[j*incX];
        }
    }
}

//-- GEMV implementations ------------------------------------------------------

void
dgemv_ref(int m, int n, double alpha,
          const double *A, int incRowA, int incColA,
          const double *x, int incX,
          double beta,
          double *y, int incY)
{
    int i, j;

    for (i=0; i<m; ++i) {
        if (beta==0) {
            y[i*incY] = 0;
        } else {
            y[i*incY] *= beta;
        }
        for (j=0; j<n; ++j) {
            y[i*incY] += alpha*A[i*incRowA+j*incColA]*x[j*incX];
        }
    }
}

void
dgemv_unblk(int m, int n, double alpha,
            const double *A, int incRowA, int incColA,
            const double *x, int incX,
            double beta,
            double *y, int incY)
{
    if (incRowA<incColA) {
        int j;

        dscal(m, beta, y, incY);

        for (j=0; j<n; ++j) {
            daxpy(m, alpha*x[j*incX], &A[j*incColA], incRowA, y, incY);
        }
    } else {
        int i;

        for (i=0; i<m; ++i) {
            y[i*incY] *= beta;
            y[i*incY] += alpha*ddot(n, &A[i*incRowA], incColA, x, incX);
        }
    }
}

void
dgemv_ulm(int m, int n, double alpha,
          const double *A, int incRowA, int incColA,
          const double *x, int incX,
          double beta,
          double *y, int incY)
{
    if (incRowA<incColA) {
        int bs = DGEMV_MxBS;
        int j;

        dscal(m, beta, y, incY);

        for (j=0; j+bs<=n; j+=bs) {
            dgemv_mxBS(m, alpha,
                       &A[j*incColA], incRowA, incColA,
                       &x[j*incX], incX,
                       y, incY);
        }
        dgemv_unblk(m, n-j, alpha,
                    &A[j*incColA], incRowA, incColA,
                    &x[j*incX], incX,
                    1.0,
                    y, incY);
    } else {
        int bs = DGEMV_BSxN;
        int i;

        for (i=0; i+bs<=m; i+=bs) {
            dgemv_BSxn(n, alpha,
                       &A[i*incRowA], incRowA, incColA,
                       x, incX,
                       beta,
                       &y[i*incY], incY);
        }
        dgemv_unblk(m-i, n, alpha,
                    &A[i*incRowA], incRowA, incColA,
                    x, incX,
                    beta,
                    &y[i*incY], incY);
    }
}

//-- Wrapper for the Intel MKL implementation of gemv --------------------------
#include <mkl_types.h>

// pre declaration so we don't need to include the complete mkl_blas header.
void
dgemv(const char *trans,
      const MKL_INT *m, const MKL_INT *n, const double *alpha,
      const double *A, const MKL_INT *ldA, const double *x,
      const MKL_INT *incX,
      const double *beta, double *y, const MKL_INT *incY);

void
dgemv_mkl(MKL_INT m, MKL_INT n,
          double alpha,
          const double *A, MKL_INT incRowA, MKL_INT incColA,
          const double *x, MKL_INT incX,
          double beta,
          double *y, MKL_INT incY)
{
    MKL_INT ldA   = (incRowA==1) ? incColA : incRowA;
    char    trans = (incRowA==1) ? 'N' : 'T';
    MKL_INT M     = (incRowA==1) ? m : n;
    MKL_INT N     = (incRowA==1) ? n : m;

    dgemv(&trans, &M, &N, &alpha, A, &ldA, x, &incX, &beta, y, &incY);
}

//------------------------------------------------------------------------------

#ifndef MIN_M
#define MIN_M 100
#endif

#ifndef MIN_N
#define MIN_N 100
#endif

#ifndef MAX_M
#define MAX_M 8000
#endif

#ifndef MAX_N
#define MAX_N 8000
#endif

#ifndef INC_M
#define INC_M 100
#endif

#ifndef INC_N
#define INC_N 100
#endif

#ifndef ROWMAJOR
#define ROWMAJOR 0
#endif

#if (ROWMAJOR==1)
#   define INCROW_A  MAX_N
#   define INCCOL_A  1
#else
#   define INCROW_A  1
#   define INCCOL_A  MAX_M
#endif

#ifndef INC_X
#define INC_X 1
#endif

#ifndef INC_Y
#define INC_Y 1
#endif

#ifndef ALPHA
#define ALPHA 2
#endif

#ifndef BETA
#define BETA 3
#endif


//
//  A, x, y_ will get initialized with initGeMatrix
//
double A[MAX_M*MAX_N];
double x[INC_X*MAX_N];
double y_[MAX_M];

//
// Each implementation gets its own vector for y:
//
double y_0[INC_Y*MAX_M];
double y_1[INC_Y*MAX_M];
double y_2[INC_Y*MAX_M];

int
main()
{
    int m, n;

    randGeMatrix(MAX_M, MAX_N, A, INCROW_A, INCCOL_A);
    randGeMatrix(MAX_N, 1, x, INC_X, 0);
    randGeMatrix(MAX_M, 1, y_, 1, 0);

    printf("#%9s %9s %9s %9s", "m", "n", "INCROW_A", "INCCOL_A");
    printf(" %9s %9s", "INC_X", "INC_Y");
    printf(" %9s %9s", "ALPHA", "BETA");
    printf(" %12s %12s", "t", "MFLOPS");
    printf(" %12s %12s %12s", "t", "MFLOPS", "err");
    printf(" %12s %12s %12s", "t", "MFLOPS", "err");
    printf("\n");

    for (m=MIN_M, n=MIN_N; (m<=MAX_M) && (n<=MAX_N); m+=INC_M, n+=INC_N) {
        int     runs = 1;
        int     ops = m*n + m*(n+1);
        double  t0, t1, dt, err;

        printf(" %9d %9d %9d %9d", m, n, INCROW_A, INCCOL_A);
        printf(" %9d %9d", INC_X, INC_Y);
        printf(" %9.2lf %9.2lf", (double)ALPHA, (double)BETA);

        t0   = 0;
        runs = 0;
        do {
            dcopy(n, y_, 1, y_0, INC_Y);
            dt = walltime();
            dgemv_ref(m, n, ALPHA,
                      A, INCROW_A, INCCOL_A,
                      x, INC_X,
                      BETA,
                      y_0, INC_Y);
            dt = walltime() - dt;
            t0 += dt;
            ++runs;
        } while (t0<0.3);
        t0 /= runs;

        printf(" %12.2e %12.2lf", t0, ops/(1000*1000*t0));

        t1   = 0;
        runs = 0;
        do {
            dcopy(n, y_, 1, y_2, INC_Y);
            dt = walltime();
            dgemv_ulm(m, n, ALPHA,
                      A, INCROW_A, INCCOL_A,
                      x, INC_X,
                      BETA,
                      y_2, INC_Y);
            dt = walltime() - dt;
            t1 += dt;
            ++runs;
        } while (t1<0.3);
        t1 /= runs;

        err = err_dgemv(m, n, ALPHA,
                        A, INCROW_A, INCCOL_A,
                        x, INC_X,
                        BETA,
                        y_0, INC_Y,
                        y_2, INC_Y);

        printf(" %12.2e %12.2lf %12.2e", t1, ops/(1000*1000*t1), err);

        t1   = 0;
        runs = 0;
        do {
            dcopy(n, y_, 1, y_2, INC_Y);
            dt = walltime();
            dgemv_mkl(m, n, ALPHA,
                      A, INCROW_A, INCCOL_A,
                      x, INC_X,
                      BETA,
                      y_2, INC_Y);
            dt = walltime() - dt;
            t1 += dt;
            ++runs;
        } while (t1<0.3);
        t1 /= runs;

        err = err_dgemv(m, n, ALPHA,
                        A, INCROW_A, INCCOL_A,
                        x, INC_X,
                        BETA,
                        y_0, INC_Y,
                        y_2, INC_Y);

        printf(" %12.2e %12.2lf %12.2e", t1, ops/(1000*1000*t1), err);


        printf("\n");
    }

    return 0;
}

Content

GEMV-Prozeduren aufräumen

Benchmark mit der Intel MKL

Benchmark-Plot