Content |
GEMV-Prozeduren aufräumen
Die Implementierungen von dgemv_blk_row und dgemv_blk_col packen wir in eine Prozedur dgemv_ulm. Durch eine Fallunterscheidung wird die günstigere Implementierung ausgewählt:
void dgemv_ulm(int m, int n, double alpha, const double *A, int incRowA, int incColA, const double *x, int incX, double beta, double *y, int incY) { if (incRowA<incColA) { int bs = DGEMV_MxBS; int j; dscal(m, beta, y, incY); for (j=0; j+bs<=n; j+=bs) { dgemv_mxBS(m, alpha, &A[j*incColA], incRowA, incColA, &x[j*incX], incX, y, incY); } for (; j<n; ++j) { daxpy(m, alpha*x[j*incX], &A[j*incColA], incRowA, y, incY); } } else { int bs = DGEMV_BSxN; int i; for (i=0; i+bs<=m; i+=bs) { dgemv_BSxn(n, alpha, &A[i*incRowA], incRowA, incColA, x, incX, beta, &y[i*incY], incY); } for (; i<m; ++i) { y[i*incY] *= beta; y[i*incY] += alpha*ddot(n, &A[i*incRowA], incColA, x, incX); } } }
Benchmark mit der Intel MKL
Wir werden das Matrix-Vektor-Produkt vorerst nicht weiter optimieren. Wir versuchen in einer nächsten Vorlesung erst einmal zu erklären, wie durch eine Variation der Algorithmen überhaupt eine Leistungssteigerung erzielt werden konnte.
Um zu belegen, dass wir aber prinzipiell auf dem richtigen Weg sind, werden wir die aktuelle Version mit der Implementierung der Intel Math Kernel Library (Intel MKL) vergleichen. Im Linux-Pool E44 ist eine Version der MKL installiert. Für den Benchmark kann folgendes Makefile verwendet werden (wget http://www.mathematik.uni-ulm.de/numerik/hpc/ss16/download/session05/example03/Makefile):
CC = gcc CFLAGS += -Wall -Ofast Arch := intel64 IntelDir := /opt/intel/compilers_and_libraries/linux IncludeDir := $(IntelDir)/mkl/include LibDirs := $(IntelDir)/lib/$(Arch) $(IntelDir)/mkl/lib/$(Arch) LinkerFlag := -Wl, Rpath := $(patsubst %,$(LinkerFlag)-rpath $(LinkerFlag)%,$(LibDirs)) Lpath := $(patsubst %,-L%,$(LibDirs)) LDFLAGS += $(Lpath) $(Rpath) CPPFLAGS += -I$(IncludeDir) -DMKL_ILP64 LDLIBS += -lmkl_intel_ilp64 -lmkl_core -lmkl_sequential -lm -lpthread TARGETS = gemv_bench_sol_row gemv_bench_sol_col %_row : %.c $(CC) $(CPPFLAGS) $(CFLAGS) $(LDFLAGS) $(LDLIBS) -DROWMAJOR=1 -o $@ $< %_col : %.c $(CC) $(CPPFLAGS) $(CFLAGS) $(LDFLAGS) $(LDLIBS) -DROWMAJOR=0 -o $@ $< all: $(TARGETS) clean: $(RM) $(TARGETS)
Mit make werden zwei ausführbare Programme gemv_bench_sol_col und gemv_bench_sol_row erzeugt:
$shell> make gcc -I/opt/intel/compilers_and_libraries/linux/mkl/include -DMKL_ILP64 -Wall -Ofast -L/opt/intel/compilers_and_libraries/linux/lib/intel64 -L/opt/intel/compilers_and_libraries/linux/mkl/lib/intel64 -Wl,-rpath -Wl,/opt/intel/compilers_and_libraries/linux/lib/intel64 -Wl,-rpath -Wl,/opt/intel/compilers_and_libraries/linux/mkl/lib/intel64 -lmkl_intel_ilp64 -lmkl_core -lmkl_sequential -lm -lpthread -DROWMAJOR=1 -o gemv_bench_sol_row gemv_bench_sol.c gcc -I/opt/intel/compilers_and_libraries/linux/mkl/include -DMKL_ILP64 -Wall -Ofast -L/opt/intel/compilers_and_libraries/linux/lib/intel64 -L/opt/intel/compilers_and_libraries/linux/mkl/lib/intel64 -Wl,-rpath -Wl,/opt/intel/compilers_and_libraries/linux/lib/intel64 -Wl,-rpath -Wl,/opt/intel/compilers_and_libraries/linux/mkl/lib/intel64 -lmkl_intel_ilp64 -lmkl_core -lmkl_sequential -lm -lpthread -DROWMAJOR=0 -o gemv_bench_sol_col gemv_bench_sol.c $shell>
Wir lenken die Ausgabe wieder in Dateien um, so dass anschließend wieder Plots erzeugt werden können:
$shell> ./gemv_bench_sol_col > report.gemv_mkl_col $shell> ./gemv_bench_sol_row > report.gemv_mkl_row $shell> gnuplot plot.gemv_mkl $shell>
Hier wurde folgendes Skript für Gnuplot verwendet:
set terminal svg size 1200, 500 set output "bench.gemv_mkl.svg" set xlabel "Matrix Dimension (M=N)" set ylabel "MFLOPS" set yrange[0:12000] set title "GEMV for Col-Major and Row-Major Matrices" set key outside set pointsize 1.25 plot "report.gemv_mkl_col" using 1:12 with linespoints lt 1 lw 1 title "dgemv_ulm (col major)", \ "report.gemv_mkl_col" using 1:15 with linespoints lt 2 lw 1 title "dgemv_mkl (col major)", \ "report.gemv_mkl_row" using 1:12 with linespoints lt 3 lw 1 title "dgemv_ulm (row major)", \ "report.gemv_mkl_row" using 1:15 with linespoints lt 4 lw 1 title "dgemv_mkl (row major)"
Schließlich noch der Source-Code für den Benchmark:
#include <stdio.h> #include <math.h> #include <stdlib.h> #include <sys/times.h> #include <unistd.h> //-- timer for benchmarks ------------------------------------------------------ double walltime() { struct tms ts; static double ClockTick=0.0; if (ClockTick==0.0) { ClockTick = 1.0 / ((double) sysconf(_SC_CLK_TCK)); } return ((double) times(&ts)) * ClockTick; } //-- setup and print matrices -------------------------------------------------- void initGeMatrix(int m, int n, double *A, int incRowA, int incColA) { int i, j; for (i=0; i<m; ++i) { for (j=0; j<n; ++j) { A[i*incRowA+j*incColA] = i*n + j + 1; } } } void randGeMatrix(int m, int n, double *A, int incRowA, int incColA) { int i, j; for (j=0; j<n; ++j) { for (i=0; i<m; ++i) { A[i*incRowA+j*incColA] = ((double)rand()-RAND_MAX/2)*200/RAND_MAX; } } } void printGeMatrix(int m, int n, const double *A, int incRowA, int incColA) { int i, j; for (i=0; i<m; ++i) { for (j=0; j<n; ++j) { printf("%10.4lf ", A[i*incRowA+j*incColA]); } printf("\n"); } printf("\n\n"); } //-- some BLAS Level 1 procedures and functions -------------------------------- double ddot(int n, const double *x, int incX, const double *y, int incY) { int i; double alpha = 0; for (i=0; i<n; ++i) { alpha += x[i*incX]*y[i*incY]; } return alpha; } void daxpy(int n, double alpha, const double *x, int incX, double *y, int incY) { int i; if (alpha==0) { return; } for (i=0; i<n; ++i) { y[i*incY] += alpha*x[i*incX]; } } void dscal(int n, double alpha, double *x, int incX) { int i; if (alpha==1) { return; } for (i=0; i<n; ++i) { x[i*incX] *= alpha; } } void dcopy(int n, const double *x, int incX, double *y, int incY) { int i; for (i=0; i<n; ++i) { y[i*incY] = x[i*incX]; } } void dswap(int n, double *x, int incX, double *y, int incY) { int i; for (i=0; i<n; ++i) { double tmp; tmp = x[i*incX]; x[i*incX] = y[i*incY]; y[i*incY] = tmp; } } double damax(int n, const double *x, int incX) { int i; double result = 0; for (i=0; i<n; ++i) { if (fabs(x[i*incX])>result) { result = fabs(x[i*incX]); } } return result; } double dgenrm1(int m, int n, const double *A, int incRowA, int incColA) { int i, j; double result = 0; for (j=0; j<n; ++j) { double sum = 0; for (i=0; i<m; ++i) { sum += fabs(A[i*incRowA+j*incColA]); } if (sum>result) { result = sum; } } return result; } //-- error bound for gemv ------------------------------------------------------ // // Vector y0 is the trusted result and vector y1 the vector to be tested. // double err_dgemv(int m, int n, double alpha, const double *A, int incRowA, int incColA, const double *x, int incX, double beta, const double *y0, int incY0, double *y1, int incY1) { int max_mn = (m>n) ? m : n; double normA = fabs(alpha)*dgenrm1(m, n, A, incRowA, incColA); double normX = damax(n, x, incX); double normY0 = fabs(beta)*damax(m, y0, incY0); double normD; daxpy(m, -1.0, y0, incY0, y1, incY1); normD = damax(n, y1, incY1); return normD/(normA*normX*normY0*max_mn); } //-- Fused BLAS Level 1 procedures and functions ------------------------------- #ifndef DGEMV_MxBS #define DGEMV_MxBS 4 #endif void dgemv_mxBS(int m, double alpha, const double *A, int incRowA, int incColA, const double *x, int incX, double *y, int incY) { int i, j; if (alpha==0) { return; } for (i=0; i<m; ++i) { double dot = 0; for (j=0; j<DGEMV_MxBS; ++j) { dot += A[i*incRowA+j*incColA]*x[j*incX]; } y[i*incY] += alpha*dot; } } #ifndef DGEMV_BSxN #define DGEMV_BSxN 4 #endif void dgemv_BSxn(int n, double alpha, const double *A, int incRowA, int incColA, const double *x, int incX, double beta, double *y, int incY) { int i, j; if (alpha==0) { return; } if (beta!=1) { for (i=0; i<DGEMV_BSxN; ++i) { y[i*incY] *= beta; } } for (j=0; j<n; ++j) { for (i=0; i<DGEMV_BSxN; ++i) { y[i*incY] += alpha*A[i*incRowA+j*incColA]*x[j*incX]; } } } //-- GEMV implementations ------------------------------------------------------ void dgemv_ref(int m, int n, double alpha, const double *A, int incRowA, int incColA, const double *x, int incX, double beta, double *y, int incY) { int i, j; for (i=0; i<m; ++i) { if (beta==0) { y[i*incY] = 0; } else { y[i*incY] *= beta; } for (j=0; j<n; ++j) { y[i*incY] += alpha*A[i*incRowA+j*incColA]*x[j*incX]; } } } void dgemv_unblk(int m, int n, double alpha, const double *A, int incRowA, int incColA, const double *x, int incX, double beta, double *y, int incY) { if (incRowA<incColA) { int j; dscal(m, beta, y, incY); for (j=0; j<n; ++j) { daxpy(m, alpha*x[j*incX], &A[j*incColA], incRowA, y, incY); } } else { int i; for (i=0; i<m; ++i) { y[i*incY] *= beta; y[i*incY] += alpha*ddot(n, &A[i*incRowA], incColA, x, incX); } } } void dgemv_ulm(int m, int n, double alpha, const double *A, int incRowA, int incColA, const double *x, int incX, double beta, double *y, int incY) { if (incRowA<incColA) { int bs = DGEMV_MxBS; int j; dscal(m, beta, y, incY); for (j=0; j+bs<=n; j+=bs) { dgemv_mxBS(m, alpha, &A[j*incColA], incRowA, incColA, &x[j*incX], incX, y, incY); } dgemv_unblk(m, n-j, alpha, &A[j*incColA], incRowA, incColA, &x[j*incX], incX, 1.0, y, incY); } else { int bs = DGEMV_BSxN; int i; for (i=0; i+bs<=m; i+=bs) { dgemv_BSxn(n, alpha, &A[i*incRowA], incRowA, incColA, x, incX, beta, &y[i*incY], incY); } dgemv_unblk(m-i, n, alpha, &A[i*incRowA], incRowA, incColA, x, incX, beta, &y[i*incY], incY); } } //-- Wrapper for the Intel MKL implementation of gemv -------------------------- #include <mkl_types.h> // pre declaration so we don't need to include the complete mkl_blas header. void dgemv(const char *trans, const MKL_INT *m, const MKL_INT *n, const double *alpha, const double *A, const MKL_INT *ldA, const double *x, const MKL_INT *incX, const double *beta, double *y, const MKL_INT *incY); void dgemv_mkl(MKL_INT m, MKL_INT n, double alpha, const double *A, MKL_INT incRowA, MKL_INT incColA, const double *x, MKL_INT incX, double beta, double *y, MKL_INT incY) { MKL_INT ldA = (incRowA==1) ? incColA : incRowA; char trans = (incRowA==1) ? 'N' : 'T'; MKL_INT M = (incRowA==1) ? m : n; MKL_INT N = (incRowA==1) ? n : m; dgemv(&trans, &M, &N, &alpha, A, &ldA, x, &incX, &beta, y, &incY); } //------------------------------------------------------------------------------ #ifndef MIN_M #define MIN_M 100 #endif #ifndef MIN_N #define MIN_N 100 #endif #ifndef MAX_M #define MAX_M 8000 #endif #ifndef MAX_N #define MAX_N 8000 #endif #ifndef INC_M #define INC_M 100 #endif #ifndef INC_N #define INC_N 100 #endif #ifndef ROWMAJOR #define ROWMAJOR 0 #endif #if (ROWMAJOR==1) # define INCROW_A MAX_N # define INCCOL_A 1 #else # define INCROW_A 1 # define INCCOL_A MAX_M #endif #ifndef INC_X #define INC_X 1 #endif #ifndef INC_Y #define INC_Y 1 #endif #ifndef ALPHA #define ALPHA 2 #endif #ifndef BETA #define BETA 3 #endif // // A, x, y_ will get initialized with initGeMatrix // double A[MAX_M*MAX_N]; double x[INC_X*MAX_N]; double y_[MAX_M]; // // Each implementation gets its own vector for y: // double y_0[INC_Y*MAX_M]; double y_1[INC_Y*MAX_M]; double y_2[INC_Y*MAX_M]; int main() { int m, n; randGeMatrix(MAX_M, MAX_N, A, INCROW_A, INCCOL_A); randGeMatrix(MAX_N, 1, x, INC_X, 0); randGeMatrix(MAX_M, 1, y_, 1, 0); printf("#%9s %9s %9s %9s", "m", "n", "INCROW_A", "INCCOL_A"); printf(" %9s %9s", "INC_X", "INC_Y"); printf(" %9s %9s", "ALPHA", "BETA"); printf(" %12s %12s", "t", "MFLOPS"); printf(" %12s %12s %12s", "t", "MFLOPS", "err"); printf(" %12s %12s %12s", "t", "MFLOPS", "err"); printf("\n"); for (m=MIN_M, n=MIN_N; (m<=MAX_M) && (n<=MAX_N); m+=INC_M, n+=INC_N) { int runs = 1; int ops = m*n + m*(n+1); double t0, t1, dt, err; printf(" %9d %9d %9d %9d", m, n, INCROW_A, INCCOL_A); printf(" %9d %9d", INC_X, INC_Y); printf(" %9.2lf %9.2lf", (double)ALPHA, (double)BETA); t0 = 0; runs = 0; do { dcopy(n, y_, 1, y_0, INC_Y); dt = walltime(); dgemv_ref(m, n, ALPHA, A, INCROW_A, INCCOL_A, x, INC_X, BETA, y_0, INC_Y); dt = walltime() - dt; t0 += dt; ++runs; } while (t0<0.3); t0 /= runs; printf(" %12.2e %12.2lf", t0, ops/(1000*1000*t0)); t1 = 0; runs = 0; do { dcopy(n, y_, 1, y_2, INC_Y); dt = walltime(); dgemv_ulm(m, n, ALPHA, A, INCROW_A, INCCOL_A, x, INC_X, BETA, y_2, INC_Y); dt = walltime() - dt; t1 += dt; ++runs; } while (t1<0.3); t1 /= runs; err = err_dgemv(m, n, ALPHA, A, INCROW_A, INCCOL_A, x, INC_X, BETA, y_0, INC_Y, y_2, INC_Y); printf(" %12.2e %12.2lf %12.2e", t1, ops/(1000*1000*t1), err); t1 = 0; runs = 0; do { dcopy(n, y_, 1, y_2, INC_Y); dt = walltime(); dgemv_mkl(m, n, ALPHA, A, INCROW_A, INCCOL_A, x, INC_X, BETA, y_2, INC_Y); dt = walltime() - dt; t1 += dt; ++runs; } while (t1<0.3); t1 /= runs; err = err_dgemv(m, n, ALPHA, A, INCROW_A, INCCOL_A, x, INC_X, BETA, y_0, INC_Y, y_2, INC_Y); printf(" %12.2e %12.2lf %12.2e", t1, ops/(1000*1000*t1), err); printf("\n"); } return 0; }