Possible Solution and another Exercise
Content |
Source Code
#include <stdlib.h>     // for malloc(), free(), rand(), srand(), abort()
#include <stdio.h>      // for printf()
#include <stddef.h>     // for size_t, ptrdiff_t
#include <math.h>       // for nan(), fabs()
#include <float.h>      // for DBL_EPSILON
#include <stdbool.h>    // for typedef bool
#include <sys/times.h>  // needed for walltime()
#include <unistd.h>     // needed for walltime()

//-- Function for benchmarking and testing -------------------------------------

// Returns the value of times() converted from clock ticks to seconds.  The
// tick length is queried once via sysconf(_SC_CLK_TCK) and cached.
double
walltime(void)
{
    struct tms    ts;
    static double ClockTick = 0.0;

    if (ClockTick==0.0) {
        ClockTick = 1.0 / ((double) sysconf(_SC_CLK_TCK));
    }
    return ((double) times(&ts)) * ClockTick;
}

// Fills the m x n matrix A with the values 1, 2, ..., m*n numbered row by row.
// Element (i,j) is stored at A[i*incRowA + j*incColA].
void
initGeMatrix(size_t m, size_t n,
             double *A, ptrdiff_t incRowA, ptrdiff_t incColA)
{
    for (size_t i=0; i<m; ++i) {
        for (size_t j=0; j<n; ++j) {
            A[i*incRowA + j*incColA] = i*n + j + 1;
        }
    }
}

// Fills the m x n matrix A with values derived from rand(), or with NaN in
// every element if withNan is true.
void
randGeMatrix(size_t m, size_t n, bool withNan,
             double *A, ptrdiff_t incRowA, ptrdiff_t incColA)
{
    for (size_t i=0; i<m; ++i) {
        for (size_t j=0; j<n; ++j) {
            A[i*incRowA + j*incColA] = withNan
                                     ? nan("")
                                     : 2.*(rand()-RAND_MAX/2)/RAND_MAX;
        }
    }
}

// Prints the m x n matrix A row by row, followed by an empty line.
void
printGeMatrix(size_t m, size_t n,
              const double *A, ptrdiff_t incRowA, ptrdiff_t incColA)
{
    for (size_t i=0; i<m; ++i) {
        for (size_t j=0; j<n; ++j) {
            printf("%9.2lf ", A[i*incRowA + j*incColA]);
        }
        printf("\n");
    }
    printf("\n");
}

// Returns the infinity norm of the m x n matrix A, i.e. the largest sum of
// absolute values over a row.  A row whose sum is NaN never replaces the
// running maximum because the comparison 'sum>result' is false for NaN.
double
dgenrm_inf(size_t m, size_t n,
           const double *A, ptrdiff_t incRowA, ptrdiff_t incColA)
{
    double result = 0;

    for (size_t i=0; i<m; ++i) {
        double sum = 0;
        for (size_t j=0; j<n; ++j) {
            sum += fabs(A[i*incRowA + j*incColA]);
        }
        if (sum>result) {
            result = sum;
        }
    }
    return result;
}

//-- BLAS Level 1 functions ----------------------------------------------------

// y <- x  for vectors of length n.
void
dcopy(size_t n,
      const double *x, ptrdiff_t incX,
      double *y, ptrdiff_t incY)
{
    for (size_t i=0; i<n; ++i) {
        y[i*incY] = x[i*incX];
    }
}

// y <- y + alpha*x  for vectors of length n.
void
daxpy(size_t n, double alpha,
      const double *x, ptrdiff_t incX,
      double *y, ptrdiff_t incY)
{
    for (size_t i=0; i<n; ++i) {
        y[i*incY] += alpha*x[i*incX];
    }
}

// Returns the dot product of the vectors x and y of length n.
double
ddot(size_t n,
     const double *x, ptrdiff_t incX,
     const double *y, ptrdiff_t incY)
{
    double result = 0;

    for (size_t i=0; i<n; ++i) {
        result += x[i*incX]*y[i*incY];
    }
    return result;
}

// x <- alpha*x  for a vector of length n.  For alpha==0 the elements are
// overwritten with 0 instead of being multiplied, so the old content of x
// (possibly NaN) is never read.
void
dscal(size_t n, double alpha,
      double *x, ptrdiff_t incX)
{
    if (alpha==1) {
        return;
    }
    if (alpha!=0) {
        for (size_t i=0; i<n; ++i) {
            x[i*incX] *= alpha;
        }
    } else {
        for (size_t i=0; i<n; ++i) {
            x[i*incX] = 0;
        }
    }
}

//-- BLAS Level 2 functions ----------------------------------------------------

// y <- beta*y + alpha*A*x, computed with one dot product per row of A.
// A is m x n, x has length n and y has length m.
void
dgemv_dot(size_t m, size_t n, double alpha,
          const double *A, ptrdiff_t incRowA, ptrdiff_t incColA,
          const double *x, ptrdiff_t incX,
          double beta,
          double *y, ptrdiff_t incY)
{
    dscal(m, beta, y, incY);
    if (alpha==0) {
        return;
    }
    for (size_t i=0; i<m; ++i) {
        y[i*incY] += alpha*ddot(n, &A[i*incRowA], incColA, x, incX);
    }
}

#ifndef DOTF
#define DOTF 4
#endif

// y <- beta*y + alpha*A*x, where DOTF dot products are fused: rows are
// processed in panels of DOTF rows, the remaining m % DOTF rows with ddot().
void
dgemv_dotf(size_t m, size_t n, double alpha,
           const double *A, ptrdiff_t incRowA, ptrdiff_t incColA,
           const double *x, ptrdiff_t incX,
           double beta,
           double *y, ptrdiff_t incY)
{
    dscal(m, beta, y, incY);
    if (alpha==0) {
        return;
    }

    size_t mb = m / DOTF;

    for (size_t i=0; i<mb; ++i) {
        for (size_t j=0; j<n; ++j) {
            for (size_t l=0; l<DOTF; ++l) {
                y[(DOTF*i+l)*incY] += alpha*A[(DOTF*i+l)*incRowA+j*incColA]
                                           *x[j*incX];
            }
        }
    }
    // rows that do not fill a complete panel
    for (size_t i=mb*DOTF; i<m; ++i) {
        y[i*incY] += alpha*ddot(n, &A[i*incRowA], incColA, x, incX);
    }
}

// y <- beta*y + alpha*A*x, computed with one axpy operation per column of A.
void
dgemv_axpy(size_t m, size_t n, double alpha,
           const double *A, ptrdiff_t incRowA, ptrdiff_t incColA,
           const double *x, ptrdiff_t incX,
           double beta,
           double *y, ptrdiff_t incY)
{
    dscal(m, beta, y, incY);
    if (alpha==0) {
        return;
    }
    for (size_t j=0; j<n; ++j) {
        daxpy(m, alpha*x[j*incX], &A[j*incColA], incRowA, y, incY);
    }
}

#ifndef AXPYF
#define AXPYF 4
#endif

// y <- beta*y + alpha*A*x, where AXPYF axpy operations are fused: columns are
// processed in panels of AXPYF columns, the remaining n % AXPYF columns with
// daxpy().
void
dgemv_axpyf(size_t m, size_t n, double alpha,
            const double *A, ptrdiff_t incRowA, ptrdiff_t incColA,
            const double *x, ptrdiff_t incX,
            double beta,
            double *y, ptrdiff_t incY)
{
    dscal(m, beta, y, incY);
    if (alpha==0) {
        return;
    }

    size_t nb = n / AXPYF;

    for (size_t j=0; j<nb; ++j) {
        for (size_t i=0; i<m; ++i) {
            for (size_t l=0; l<AXPYF; ++l) {
                y[i*incY] += alpha*A[i*incRowA+(j*AXPYF+l)*incColA]
                                  *x[(j*AXPYF+l)*incX];
            }
        }
    }
    // Columns that do not fill a complete panel.  A column of A and the
    // vector y both have m elements, so the axpy length has to be m (not n).
    for (size_t j=nb*AXPYF; j<n; ++j) {
        daxpy(m, alpha*x[j*incX], &A[j*incColA], incRowA, y, incY);
    }
}

//-- BLAS Level 2: dgemv reference implementation and error bound --------------

// y <- beta*y + alpha*A*x, written with plain loops and without calling any
// of the BLAS Level 1 functions above.
void
dgemv_ref(size_t m, size_t n, double alpha,
          const double *A, ptrdiff_t incRowA, ptrdiff_t incColA,
          const double *x, ptrdiff_t incX,
          double beta,
          double *y, ptrdiff_t incY)
{
    if (beta!=1) {
        if (beta!=0) {
            for (size_t i=0; i<m; ++i) {
                y[i*incY] *= beta;
            }
        } else {
            for (size_t i=0; i<m; ++i) {
                y[i*incY] = 0;
            }
        }
    }
    if (alpha!=0) {
        for (size_t j=0; j<n; ++j) {
            for (size_t i=0; i<m; ++i) {
                y[i*incY] += alpha*A[i*incRowA+j*incColA]*x[j*incX];
            }
        }
    }
}

// - Computes error bound for the test result ySol of the gemv operation
//   beta*y0 + alpha*A*x.
// - yRef is trusted result.
// - ySol gets overwritten.
// Returns the infinity norm of (ySol - yRef) divided by
//   DBL_EPSILON*(max(m,n)*|alpha|*||A||*||x|| + m*|beta|*||y0||),
// where all norms are infinity norms.
// - ySol is overwritten with the difference ySol - yRef.
// - Returns NaN if the difference contains a NaN.  dgenrm_inf() would skip
//   such an entry, so a NaN that leaked into ySol would otherwise go
//   unreported.
// - Returns 0 if the difference is exactly zero, also when the bound in the
//   denominator is zero (e.g. alpha==0 and beta==0).
double
dgemv_err(size_t m, size_t n, double alpha,
          const double *A, ptrdiff_t incRowA, ptrdiff_t incColA,
          const double *x, ptrdiff_t incX,
          const double *y0, ptrdiff_t incY0,
          double beta,
          const double *yRef, ptrdiff_t incYRef,
          double *ySol, ptrdiff_t incYSol)
{
    double nrmY0 = dgenrm_inf(m, 1, y0, incY0, 1);
    double nrmX  = dgenrm_inf(n, 1, x, incX, 1);
    double nrmA  = dgenrm_inf(m, n, A, incRowA, incColA);
    size_t maxMN = m<n ? n : m;

    // ySol <- ySol - yRef
    daxpy(m, -1, yRef, incYRef, ySol, incYSol);

    for (size_t i=0; i<m; ++i) {
        if (isnan(ySol[i*incYSol])) {
            return NAN;
        }
    }

    double nrmDiff = dgenrm_inf(m, 1, ySol, incYSol, 1);

    if (nrmDiff==0) {
        return 0;       // avoids 0/0 if the bound is zero, too
    }
    return nrmDiff / (DBL_EPSILON*(maxMN*fabs(alpha)*nrmA*nrmX
                                   + m*fabs(beta)*nrmY0));
}

//------------------------------------------------------------------------------

#ifndef COLMAJOR
#define COLMAJOR 0
#endif

#ifndef SEED_RAND
#define SEED_RAND 0
#endif

#ifndef MAX_M
#define MAX_M 4500
#endif

#ifndef MAX_N
#define MAX_N 4500
#endif

#ifndef ALPHA
#define ALPHA 1
#endif

#ifndef BETA
#define BETA 1
#endif

// Benchmarks dgemv_ref and the four dgemv variants for square problem sizes
// m = n = 16, 32, ... up to MAX_M and MAX_N.  Each line of output contains the
// dimensions, then average time and MFLOPS for the reference, then average
// time, MFLOPS and dgemv_err() value for each variant.
int
main(void)
{
    srand(SEED_RAND);

    printf("#COLMAJOR = %d\n", COLMAJOR);
    printf("#ALPHA = %lf\n", (double)ALPHA);
    printf("#BETA = %lf\n", (double)BETA);

    // Convert to size_t before multiplying: MAX_M*MAX_N evaluated as int
    // overflows for large user-supplied dimensions.
    double *A    = malloc((size_t)(MAX_M)*(size_t)(MAX_N)*sizeof(double));
    double *x    = malloc((size_t)(MAX_N)*sizeof(double));
    double *y0   = malloc((size_t)(MAX_M)*sizeof(double));
    double *yRef = malloc((size_t)(MAX_M)*sizeof(double));
    double *ySol = malloc((size_t)(MAX_M)*sizeof(double));

    if (!A || !x || !y0 || !yRef || !ySol) {
        abort();
    }

    // print header
    printf("#%4s %4s ", "m", "n");
    printf("%10s %10s ", "time ref", "mflops ref");
    printf("%10s %10s %7s ", "time 1", "mflops 1", "err");
    printf("%10s %10s %7s ", "time 2", "mflops 2", "err");
    printf("%10s %10s %7s ", "time 3", "mflops 3", "err");
    printf("%10s %10s %7s ", "time 4", "mflops 4", "err");
    printf("\n");

    for (size_t m=16, n=16; m<=MAX_M && n<=MAX_N; m+=16, n+=16) {
        ptrdiff_t incRowA = COLMAJOR ? 1 : n;
        ptrdiff_t incColA = COLMAJOR ? m : 1;
        ptrdiff_t incX    = 1;
        ptrdiff_t incY0   = 1;
        ptrdiff_t incYRef = 1;
        ptrdiff_t incYSol = 1;

        double alpha = ALPHA;
        double beta  = BETA;
        double mflop = m*(2*n+1)/1000./1000.;

        // Operands that must not be read (A and x for ALPHA==0, y0 for
        // BETA==0) are filled with NaN.
        randGeMatrix(m, n, ALPHA==0, A, incRowA, incColA);
        randGeMatrix(n, 1, ALPHA==0, x, incX, 0);
        randGeMatrix(m, 1, BETA==0, y0, incY0, 0);

        printf(" %4zu %4zu ", m, n);

        // Every measurement is repeated until at least 0.1 seconds were
        // accumulated and at least 3 runs were done; the time for restoring
        // the result vector from y0 is not measured.

        // reference implementation
        {
            double t    = 0;
            size_t runs = 0;

            while (t<0.1 || runs<3) {
                dcopy(m, y0, incY0, yRef, incYRef);
                double t0 = walltime();
                dgemv_ref(m, n, alpha,
                          A, incRowA, incColA,
                          x, incX,
                          beta,
                          yRef, incYRef);
                t += walltime() - t0;
                ++runs;
            }
            t /= runs;

            printf("%10.2lf %10.2lf ", t, mflop/t);
        }

        // variant 1: dgemv_dot
        {
            double t    = 0;
            size_t runs = 0;

            while (t<0.1 || runs<3) {
                dcopy(m, y0, incY0, ySol, incYSol);
                double t0 = walltime();
                dgemv_dot(m, n, alpha,
                          A, incRowA, incColA,
                          x, incX,
                          beta,
                          ySol, incYSol);
                t += walltime() - t0;
                ++runs;
            }
            t /= runs;

            double err = dgemv_err(m, n, alpha,
                                   A, incRowA, incColA,
                                   x, incX,
                                   y0, incY0,
                                   beta,
                                   yRef, incYRef,
                                   ySol, incYSol);

            printf("%10.2lf %10.2lf %7.1e ", t, mflop/t, err);
        }

        // variant 2: dgemv_axpy
        {
            double t    = 0;
            size_t runs = 0;

            while (t<0.1 || runs<3) {
                dcopy(m, y0, incY0, ySol, incYSol);
                double t0 = walltime();
                dgemv_axpy(m, n, alpha,
                           A, incRowA, incColA,
                           x, incX,
                           beta,
                           ySol, incYSol);
                t += walltime() - t0;
                ++runs;
            }
            t /= runs;

            double err = dgemv_err(m, n, alpha,
                                   A, incRowA, incColA,
                                   x, incX,
                                   y0, incY0,
                                   beta,
                                   yRef, incYRef,
                                   ySol, incYSol);

            printf("%10.2lf %10.2lf %7.1e ", t, mflop/t, err);
        }

        // variant 3: dgemv_dotf
        {
            double t    = 0;
            size_t runs = 0;

            while (t<0.1 || runs<3) {
                dcopy(m, y0, incY0, ySol, incYSol);
                double t0 = walltime();
                dgemv_dotf(m, n, alpha,
                           A, incRowA, incColA,
                           x, incX,
                           beta,
                           ySol, incYSol);
                t += walltime() - t0;
                ++runs;
            }
            t /= runs;

            double err = dgemv_err(m, n, alpha,
                                   A, incRowA, incColA,
                                   x, incX,
                                   y0, incY0,
                                   beta,
                                   yRef, incYRef,
                                   ySol, incYSol);

            printf("%10.2lf %10.2lf %7.1e ", t, mflop/t, err);
        }

        // variant 4: dgemv_axpyf
        {
            double t    = 0;
            size_t runs = 0;

            while (t<0.1 || runs<3) {
                dcopy(m, y0, incY0, ySol, incYSol);
                double t0 = walltime();
                dgemv_axpyf(m, n, alpha,
                            A, incRowA, incColA,
                            x, incX,
                            beta,
                            ySol, incYSol);
                t += walltime() - t0;
                ++runs;
            }
            t /= runs;

            double err = dgemv_err(m, n, alpha,
                                   A, incRowA, incColA,
                                   x, incX,
                                   y0, incY0,
                                   beta,
                                   yRef, incYRef,
                                   ySol, incYSol);

            printf("%10.2lf %10.2lf %7.1e ", t, mflop/t, err);
        }

        printf("\n");
    }

    free(A);
    free(x);
    free(y0);
    free(yRef);
    free(ySol);

    return 0;
}
Test Run
heim$ gcc -Wall -std=c11 -O3 -DCOLMAJOR=1 -DMAX_M=3000 -o gemv gemv_sol.c heim$ ./gemv | tee gemv_colmajor.dat #COLMAJOR = 1 #ALPHA = 1.000000 #BETA = 1.000000 # m n time ref mflops ref time 1 mflops 1 err time 2 mflops 2 err time 3 mflops 3 err time 4 mflops 4 err 16 16 0.00 898.33 0.00 930.24 6.6e-03 0.00 1067.20 0.0e+00 0.00 1154.03 0.0e+00 0.00 953.51 0.0e+00 32 32 0.00 2019.74 0.00 1381.72 1.2e-02 0.00 3250.35 0.0e+00 0.00 3200.51 0.0e+00 0.00 3052.42 0.0e+00 48 48 0.00 3270.47 0.00 1832.93 8.7e-03 0.00 4053.65 0.0e+00 0.00 4330.73 0.0e+00 0.00 5831.08 0.0e+00 64 64 0.00 2900.48 0.00 2268.67 3.3e-03 0.00 3050.59 0.0e+00 0.00 5552.24 0.0e+00 0.00 7321.72 0.0e+00 80 80 0.00 3945.66 0.00 2143.75 4.3e-03 0.00 3490.87 0.0e+00 0.00 5478.51 0.0e+00 0.00 6680.08 0.0e+00 96 96 0.00 3468.44 0.00 2917.23 2.9e-03 0.00 3766.56 0.0e+00 0.00 4986.07 0.0e+00 0.00 6853.00 0.0e+00 112 112 0.00 3740.44 0.00 2243.03 2.3e-03 0.00 3769.92 0.0e+00 0.00 6126.81 0.0e+00 0.00 8266.10 0.0e+00 128 128 0.00 3818.90 0.00 2129.69 2.1e-03 0.00 3704.39 0.0e+00 0.00 4519.31 0.0e+00 0.00 8993.11 0.0e+00 144 144 0.00 4279.26 0.00 2328.23 3.3e-03 0.00 3911.07 0.0e+00 0.00 5290.53 0.0e+00 0.00 8245.64 0.0e+00 160 160 0.00 4063.09 0.00 2072.14 1.9e-03 0.00 4094.33 0.0e+00 0.00 3972.46 0.0e+00 0.00 8805.44 0.0e+00 176 176 0.00 4491.29 0.00 2286.31 1.2e-03 0.00 4083.67 0.0e+00 0.00 5790.95 0.0e+00 0.00 8839.12 0.0e+00 192 192 0.00 4501.73 0.00 2050.27 2.3e-03 0.00 4002.43 0.0e+00 0.00 4167.74 0.0e+00 0.00 7871.81 0.0e+00 208 208 0.00 4211.90 0.00 2135.28 1.3e-03 0.00 4267.41 0.0e+00 0.00 4442.46 0.0e+00 0.00 7923.33 0.0e+00 224 224 0.00 4532.32 0.00 1911.95 1.7e-03 0.00 4212.12 0.0e+00 0.00 3628.96 0.0e+00 0.00 7718.75 0.0e+00 240 240 0.00 4347.89 0.00 2119.48 1.3e-03 0.00 4148.49 0.0e+00 0.00 4200.97 0.0e+00 0.00 9012.40 0.0e+00 256 256 0.00 4250.25 0.00 1504.30 1.3e-03 0.00 4269.47 0.0e+00 0.00 2744.76 0.0e+00 0.00 8203.22 0.0e+00 272 272 0.00 4341.95 0.00 1950.84 1.2e-03 0.00 4742.33 0.0e+00 0.00 
4026.20 0.0e+00 0.00 7867.50 0.0e+00 288 288 0.00 4453.52 0.00 1824.91 1.2e-03 0.00 4378.74 0.0e+00 0.00 3518.40 0.0e+00 0.00 7820.85 0.0e+00 304 304 0.00 4387.72 0.00 1883.34 9.4e-04 0.00 4310.30 0.0e+00 0.00 3559.66 0.0e+00 0.00 8073.78 0.0e+00 320 320 0.00 4314.98 0.00 1823.70 8.4e-04 0.00 4801.67 0.0e+00 0.00 3322.94 0.0e+00 0.00 8092.92 0.0e+00 336 336 0.00 4414.02 0.00 1800.80 7.7e-04 0.00 4473.22 0.0e+00 0.00 3400.14 0.0e+00 0.00 8065.99 0.0e+00 352 352 0.00 4509.74 0.00 1818.34 9.9e-04 0.00 4444.55 0.0e+00 0.00 3519.36 0.0e+00 0.00 7965.94 0.0e+00 368 368 0.00 4524.38 0.00 1844.27 5.4e-04 0.00 4369.04 0.0e+00 0.00 3476.50 0.0e+00 0.00 8196.15 0.0e+00 384 384 0.00 4348.90 0.00 1516.75 6.9e-04 0.00 4359.64 0.0e+00 0.00 2772.83 0.0e+00 0.00 8217.28 0.0e+00 400 400 0.00 4363.27 0.00 1842.30 6.5e-04 0.00 4555.51 0.0e+00 0.00 3323.42 0.0e+00 0.00 8362.44 0.0e+00 416 416 0.00 4445.95 0.00 1738.94 5.8e-04 0.00 4567.87 0.0e+00 0.00 3274.69 0.0e+00 0.00 8036.30 0.0e+00 432 432 0.00 4538.51 0.00 1803.86 4.8e-04 0.00 4499.11 0.0e+00 0.00 3427.66 0.0e+00 0.00 8142.83 0.0e+00 448 448 0.00 4428.45 0.00 1724.33 7.3e-04 0.00 4409.46 0.0e+00 0.00 3086.98 0.0e+00 0.00 8161.70 0.0e+00 464 464 0.00 4843.50 0.00 1784.57 5.4e-04 0.00 4482.98 0.0e+00 0.00 3232.92 0.0e+00 0.00 8378.16 0.0e+00 480 480 0.00 4783.47 0.00 1780.54 4.5e-04 0.00 4512.16 0.0e+00 0.00 3090.58 0.0e+00 0.00 7957.08 0.0e+00 496 496 0.00 4423.80 0.00 1652.21 5.4e-04 0.00 4576.03 0.0e+00 0.00 3166.95 0.0e+00 0.00 7844.63 0.0e+00 512 512 0.00 4436.95 0.00 920.79 5.1e-04 0.00 4403.55 0.0e+00 0.00 1827.26 0.0e+00 0.00 7803.78 0.0e+00 528 528 0.00 4317.63 0.00 837.14 5.3e-04 0.00 4464.77 0.0e+00 0.00 2161.35 0.0e+00 0.00 7975.70 0.0e+00 544 544 0.00 4303.09 0.00 627.96 7.0e-04 0.00 4513.13 0.0e+00 0.00 1863.42 0.0e+00 0.00 7985.77 0.0e+00 560 560 0.00 4428.56 0.00 410.90 5.7e-04 0.00 4491.34 0.0e+00 0.00 1425.02 0.0e+00 0.00 7732.86 0.0e+00 576 576 0.00 4365.13 0.00 544.58 6.3e-04 0.00 4328.91 0.0e+00 0.00 1829.37 
0.0e+00 0.00 7625.40 0.0e+00 592 592 0.00 4314.35 0.00 478.31 4.7e-04 0.00 4821.36 0.0e+00 0.00 1606.48 0.0e+00 0.00 7864.04 0.0e+00 608 608 0.00 4231.09 0.00 437.23 4.3e-04 0.00 4358.22 0.0e+00 0.00 1432.79 0.0e+00 0.00 7944.22 0.0e+00 624 624 0.00 4247.60 0.00 340.09 4.6e-04 0.00 4414.10 0.0e+00 0.00 1190.32 0.0e+00 0.00 7700.23 0.0e+00 640 640 0.00 4263.17 0.00 342.84 7.3e-04 0.00 4230.37 0.0e+00 0.00 1259.57 0.0e+00 0.00 7393.47 0.0e+00 656 656 0.00 4283.15 0.00 387.60 6.9e-04 0.00 4745.13 0.0e+00 0.00 1326.45 0.0e+00 0.00 7596.91 0.0e+00 672 672 0.00 4091.93 0.00 410.84 4.2e-04 0.00 4190.53 0.0e+00 0.00 1382.88 0.0e+00 0.00 7633.34 0.0e+00 688 688 0.00 4056.49 0.00 312.63 3.3e-04 0.00 4297.64 0.0e+00 0.00 1076.56 0.0e+00 0.00 7406.76 0.0e+00 704 704 0.00 4418.62 0.00 342.67 4.8e-04 0.00 4283.36 0.0e+00 0.00 1220.08 0.0e+00 0.00 7015.69 0.0e+00 720 720 0.00 4131.22 0.00 332.01 3.5e-04 0.00 4131.22 0.0e+00 0.00 1188.43 0.0e+00 0.00 6904.22 0.0e+00 736 736 0.00 3967.91 0.00 364.66 3.3e-04 0.00 4050.70 0.0e+00 0.00 1235.91 0.0e+00 0.00 6948.27 0.0e+00 752 752 0.00 4289.37 0.00 329.24 3.2e-04 0.00 4074.34 0.0e+00 0.00 1090.61 0.0e+00 0.00 6821.43 0.0e+00 768 768 0.00 4260.23 0.00 300.47 3.5e-04 0.00 4120.72 0.0e+00 0.00 1062.37 0.0e+00 0.00 6342.05 0.0e+00 784 784 0.00 3958.67 0.00 307.52 2.9e-04 0.00 3981.04 0.0e+00 0.00 1084.72 0.0e+00 0.00 6407.68 0.0e+00 800 800 0.00 3803.98 0.00 326.02 3.7e-04 0.00 3842.40 0.0e+00 0.00 1075.87 0.0e+00 0.00 6415.64 0.0e+00 816 816 0.00 3930.96 0.00 314.96 3.3e-04 0.00 3784.38 0.0e+00 0.00 1017.57 0.0e+00 0.00 6311.34 0.0e+00 832 832 0.00 3948.05 0.00 304.76 3.5e-04 0.00 3941.75 0.0e+00 0.00 1020.07 0.0e+00 0.00 6012.12 0.0e+00 848 848 0.00 3754.63 0.00 300.89 3.7e-04 0.00 3885.45 0.0e+00 0.00 1059.67 0.0e+00 0.00 5808.55 0.0e+00 864 864 0.00 4078.23 0.00 313.71 3.6e-04 0.00 4003.53 0.0e+00 0.00 1045.70 0.0e+00 0.00 5751.35 0.0e+00 880 880 0.00 3874.20 0.00 309.94 3.0e-04 0.00 3775.58 0.0e+00 0.00 1022.79 0.0e+00 0.00 5790.17 
0.0e+00 896 896 0.00 3727.14 0.01 305.24 3.7e-04 0.00 3841.06 0.0e+00 0.00 1007.73 0.0e+00 0.00 5739.69 0.0e+00 912 912 0.00 3888.64 0.01 302.62 2.5e-04 0.00 3812.99 0.0e+00 0.00 998.64 0.0e+00 0.00 5295.82 0.0e+00 928 928 0.00 3894.65 0.01 310.19 2.2e-04 0.00 3894.65 0.0e+00 0.00 1018.31 0.0e+00 0.00 5138.56 0.0e+00 944 944 0.00 3793.39 0.01 308.01 2.8e-04 0.00 3673.42 0.0e+00 0.00 972.66 0.0e+00 0.00 5333.44 0.0e+00 960 960 0.00 3577.67 0.01 313.51 2.8e-04 0.00 3928.06 0.0e+00 0.00 955.61 0.0e+00 0.00 5145.21 0.0e+00 976 976 0.00 3812.26 0.01 311.91 2.4e-04 0.00 3926.62 0.0e+00 0.00 972.13 0.0e+00 0.00 5163.87 0.0e+00 992 992 0.00 3741.33 0.01 295.37 3.0e-04 0.00 3580.22 0.0e+00 0.00 966.66 0.0e+00 0.00 4958.60 0.0e+00 1008 1008 0.00 3696.61 0.01 332.69 2.1e-04 0.00 3720.64 0.0e+00 0.00 975.91 0.0e+00 0.00 5304.64 0.0e+00 1024 1024 0.00 3776.72 0.01 305.19 2.9e-04 0.00 3566.90 0.0e+00 0.00 839.27 0.0e+00 0.00 5140.53 0.0e+00 1040 1040 0.00 3657.57 0.01 314.80 1.7e-04 0.00 3419.50 0.0e+00 0.00 983.75 0.0e+00 0.00 5367.32 0.0e+00 1056 1056 0.00 3570.12 0.01 290.07 2.4e-04 0.00 3837.88 0.0e+00 0.00 953.39 0.0e+00 0.00 5132.05 0.0e+00 1072 1072 0.00 3532.78 0.01 313.56 2.1e-04 0.00 3762.72 0.0e+00 0.00 1011.75 0.0e+00 0.00 4661.59 0.0e+00 1088 1088 0.00 3694.98 0.01 301.46 2.3e-04 0.00 3766.04 0.0e+00 0.00 1012.03 0.0e+00 0.00 4974.01 0.0e+00 1104 1104 0.00 3536.17 0.01 317.04 1.7e-04 0.00 3536.17 0.0e+00 0.00 931.15 0.0e+00 0.00 5143.52 0.0e+00 1120 1120 0.00 3650.79 0.01 301.19 3.3e-04 0.00 3739.78 0.0e+00 0.00 953.77 0.0e+00 0.00 4517.86 0.0e+00 1136 1136 0.00 3849.72 0.01 305.16 2.3e-04 0.00 3544.56 0.0e+00 0.00 938.96 0.0e+00 0.00 4570.37 0.0e+00 1152 1152 0.00 3558.18 0.01 313.82 2.5e-04 0.00 3531.63 0.0e+00 0.00 902.82 0.0e+00 0.00 4538.25 0.0e+00 1168 1168 0.00 3712.28 0.01 297.78 2.2e-04 0.00 3712.28 0.0e+00 0.00 982.66 0.0e+00 0.00 4838.86 0.0e+00 1184 1184 0.00 3646.36 0.01 280.49 2.3e-04 0.00 3722.86 0.0e+00 0.00 892.47 0.0e+00 0.00 4793.82 0.0e+00 1200 
1200 0.00 3588.40 0.01 288.12 2.4e-04 0.00 3860.81 0.0e+00 0.00 950.80 0.0e+00 0.00 4845.65 0.0e+00 1216 1216 0.00 3523.34 0.01 295.85 2.2e-04 0.00 3461.48 0.0e+00 0.00 887.56 0.0e+00 0.00 4615.30 0.0e+00 1232 1232 0.00 3583.52 0.01 303.69 2.0e-04 0.00 3462.04 0.0e+00 0.00 880.70 0.0e+00 0.00 4737.53 0.0e+00 1248 1248 0.00 3796.17 0.01 283.30 2.0e-04 0.00 3552.53 0.0e+00 0.00 934.88 0.0e+00 0.00 4487.41 0.0e+00 1264 1264 0.00 3516.32 0.01 290.61 3.6e-04 0.00 3580.25 0.0e+00 0.00 990.96 0.0e+00 0.00 4533.44 0.0e+00 1280 1280 0.00 3605.89 0.01 295.03 1.8e-04 0.00 3441.98 0.0e+00 0.00 917.86 0.0e+00 0.00 4499.91 0.0e+00 1296 1296 0.00 3574.38 0.01 274.95 2.1e-04 0.00 3696.58 0.0e+00 0.00 974.55 0.0e+00 0.00 4490.89 0.0e+00 1312 1312 0.00 3581.76 0.01 275.52 2.2e-04 0.00 3581.76 0.0e+00 0.00 895.44 0.0e+00 0.00 4915.53 0.0e+00 1328 1328 0.00 3704.92 0.01 288.70 2.1e-04 0.00 3457.93 0.0e+00 0.00 952.69 0.0e+00 0.00 4426.66 0.0e+00 1344 1344 0.00 3469.46 0.01 289.12 1.7e-04 0.00 3745.43 0.0e+00 0.00 867.36 0.0e+00 0.00 4435.38 0.0e+00 1360 1360 0.00 3465.07 0.01 296.04 2.9e-04 0.00 3633.28 0.0e+00 0.00 925.14 0.0e+00 0.00 4474.31 0.0e+00 1376 1376 0.00 3409.32 0.01 265.17 2.1e-04 0.00 3636.60 0.0e+00 0.00 871.27 0.0e+00 0.00 4304.69 0.0e+00 1392 1392 0.00 3721.65 0.01 281.94 1.7e-04 0.00 3418.56 0.0e+00 0.00 930.41 0.0e+00 0.00 4334.88 0.0e+00 1408 1408 0.00 3331.72 0.01 277.64 1.8e-04 0.00 3450.71 0.0e+00 0.00 872.59 0.0e+00 0.00 4254.80 0.0e+00 1424 1424 0.00 3691.85 0.01 283.99 1.8e-04 0.00 3448.43 0.0e+00 0.00 933.10 0.0e+00 0.00 4178.69 0.0e+00 1440 1440 0.00 3432.06 0.02 276.58 1.5e-04 0.00 3696.06 0.0e+00 0.01 829.73 0.0e+00 0.00 4261.78 0.0e+00 1456 1456 0.00 3605.13 0.02 269.90 1.6e-04 0.00 3508.74 0.0e+00 0.00 933.09 0.0e+00 0.00 4125.66 0.0e+00 1472 1472 0.00 3598.08 0.02 275.87 2.0e-04 0.00 3598.08 0.0e+00 0.01 867.01 0.0e+00 0.00 4031.59 0.0e+00 1488 1488 0.00 3463.28 0.02 281.89 2.5e-04 0.00 3721.01 0.0e+00 0.01 841.66 0.0e+00 0.00 4208.29 0.0e+00 1504 1504 
0.00 3529.92 0.02 263.99 1.3e-04 0.00 3661.57 0.0e+00 0.01 859.85 0.0e+00 0.00 4344.51 0.0e+00 1520 1520 0.00 3655.83 0.02 277.34 1.3e-04 0.00 3697.86 0.0e+00 0.01 832.02 0.0e+00 0.00 3992.00 0.0e+00 1536 1536 0.00 3561.55 0.02 257.46 1.8e-04 0.00 3681.70 0.0e+00 0.01 613.62 0.0e+00 0.00 3823.30 0.0e+00 1552 1552 0.00 3614.22 0.02 262.85 1.5e-04 0.00 3421.46 0.0e+00 0.01 876.17 0.0e+00 0.00 4144.31 0.0e+00 1568 1568 0.00 3639.92 0.02 268.30 1.7e-04 0.00 3639.92 0.0e+00 0.01 885.39 0.0e+00 0.00 4292.78 0.0e+00 1584 1584 0.00 3513.79 0.02 273.80 2.2e-04 0.00 3664.38 0.0e+00 0.01 912.67 0.0e+00 0.00 4266.74 0.0e+00 1600 1600 0.00 3636.34 0.02 279.36 2.3e-04 0.00 3226.61 0.0e+00 0.01 921.89 0.0e+00 0.00 4236.96 0.0e+00 1616 1616 0.00 3448.19 0.02 284.97 1.5e-04 0.00 3291.45 0.0e+00 0.01 949.91 0.0e+00 0.00 4179.62 0.0e+00 1632 1632 0.00 3463.51 0.02 266.42 2.1e-04 0.00 3536.17 0.0e+00 0.01 905.84 0.0e+00 0.00 3972.14 0.0e+00 1648 1648 0.00 3314.41 0.02 271.67 1.3e-04 0.00 3477.41 0.0e+00 0.01 923.69 0.0e+00 0.00 4020.76 0.0e+00 1664 1664 0.00 3489.86 0.02 276.97 1.8e-04 0.00 3600.65 0.0e+00 0.01 886.31 0.0e+00 0.00 4280.49 0.0e+00 1680 1680 0.00 3593.21 0.02 260.61 1.3e-04 0.00 3557.28 0.0e+00 0.01 923.97 0.0e+00 0.00 4065.47 0.0e+00 1696 1696 0.00 3348.09 0.02 261.57 1.5e-04 0.00 3337.63 0.0e+00 0.01 889.34 0.0e+00 0.00 4028.17 0.0e+00 1712 1712 0.00 3459.52 0.02 266.53 1.9e-04 0.00 3459.52 0.0e+00 0.01 879.54 0.0e+00 0.00 3997.91 0.0e+00 1728 1728 0.00 3584.22 0.02 271.53 1.2e-04 0.00 3692.83 0.0e+00 0.01 896.05 0.0e+00 0.00 4181.59 0.0e+00 1744 1744 0.00 3484.94 0.02 276.58 1.2e-04 0.00 3650.89 0.0e+00 0.01 851.87 0.0e+00 0.00 3982.79 0.0e+00 1760 1760 0.00 3532.27 0.02 258.21 1.2e-04 0.00 3492.83 0.0e+00 0.01 845.04 0.0e+00 0.00 3842.12 0.0e+00 1776 1776 0.00 3384.52 0.02 262.92 1.1e-04 0.00 3441.89 0.0e+00 0.01 860.47 0.0e+00 0.00 4302.36 0.0e+00 1792 1792 0.00 3340.65 0.02 267.68 1.1e-04 0.00 3328.97 0.0e+00 0.01 835.16 0.0e+00 0.00 4175.81 0.0e+00 1808 1808 0.00 
3662.14 0.02 272.48 1.3e-04 0.00 3531.35 0.0e+00 0.01 832.30 0.0e+00 0.00 3923.72 0.0e+00 1824 1824 0.00 3327.89 0.03 255.99 1.4e-04 0.00 3751.44 0.0e+00 0.01 907.61 0.0e+00 0.00 4193.14 0.0e+00 1840 1840 0.00 3521.98 0.03 270.92 1.3e-04 0.00 3589.71 0.0e+00 0.01 880.50 0.0e+00 0.00 3928.36 0.0e+00 1856 1856 0.00 3570.96 0.03 265.05 1.8e-04 0.00 3633.61 0.0e+00 0.01 939.73 0.0e+00 0.00 4072.15 0.0e+00 1872 1872 0.00 3435.21 0.03 254.93 1.2e-04 0.00 3295.00 0.0e+00 0.01 828.53 0.0e+00 0.00 4136.28 0.0e+00 1888 1888 0.00 3565.49 0.03 259.31 1.6e-04 0.00 3565.49 0.0e+00 0.01 907.58 0.0e+00 0.00 4084.10 0.0e+00 1904 1904 0.00 3692.10 0.03 263.72 1.2e-04 0.00 3428.38 0.0e+00 0.01 797.76 0.0e+00 0.00 4278.88 0.0e+00 1920 1920 0.00 3419.19 0.03 268.17 1.8e-04 0.00 3419.19 0.0e+00 0.01 811.22 0.0e+00 0.00 4277.34 0.0e+00 1936 1936 0.00 3544.57 0.03 272.66 1.0e-04 0.00 3524.12 0.0e+00 0.01 824.79 0.0e+00 0.00 4048.99 0.0e+00 1952 1952 0.00 3658.83 0.03 254.09 1.2e-04 0.00 3430.15 0.0e+00 0.01 831.55 0.0e+00 0.00 4227.06 0.0e+00 1968 1968 0.00 3521.83 0.03 258.27 1.2e-04 0.00 3486.61 0.0e+00 0.01 852.28 0.0e+00 0.00 3944.44 0.0e+00 1984 1984 0.00 3464.78 0.03 262.48 1.2e-04 0.00 3543.52 0.0e+00 0.01 866.19 0.0e+00 0.00 3937.25 0.0e+00 2000 2000 0.00 3273.55 0.03 246.22 1.5e-04 0.00 3564.53 0.0e+00 0.01 872.95 0.0e+00 0.00 4161.04 0.0e+00 2016 2016 0.00 3496.13 0.03 271.02 1.4e-04 0.00 3547.87 0.0e+00 0.01 813.05 0.0e+00 0.00 3991.35 0.0e+00 2032 2032 0.00 3634.44 0.03 247.80 1.2e-04 0.00 3469.23 0.0e+00 0.01 826.01 0.0e+00 0.00 3799.64 0.0e+00 2048 2048 0.00 3661.38 0.04 228.84 9.5e-05 0.00 3432.54 0.0e+00 0.02 533.95 0.0e+00 0.00 3943.61 0.0e+00 2064 2064 0.00 3408.90 0.03 255.67 1.2e-04 0.00 3238.46 0.0e+00 0.01 852.23 0.0e+00 0.00 4175.91 0.0e+00 2080 2080 0.00 3461.95 0.03 266.30 1.3e-04 0.00 3461.95 0.0e+00 0.01 778.94 0.0e+00 0.00 4091.40 0.0e+00 2096 2096 0.00 3339.64 0.04 251.10 1.1e-04 0.00 3515.41 0.0e+00 0.01 790.97 0.0e+00 0.00 4394.26 0.0e+00 2112 2112 0.00 
3407.04 0.04 254.95 1.1e-04 0.00 3212.35 0.0e+00 0.01 811.20 0.0e+00 0.00 3974.88 0.0e+00 2128 2128 0.00 3261.20 0.04 258.83 1.1e-04 0.00 3458.85 0.0e+00 0.01 823.54 0.0e+00 0.00 4364.74 0.0e+00 2144 2144 0.00 3260.26 0.04 250.79 1.5e-04 0.00 3402.38 0.0e+00 0.01 919.56 0.0e+00 0.00 3954.11 0.0e+00 2160 2160 0.00 3309.10 0.04 254.55 1.0e-04 0.00 3360.01 0.0e+00 0.01 840.00 0.0e+00 0.00 4200.01 0.0e+00 2176 2176 0.00 3599.41 0.04 258.33 1.4e-04 0.00 3409.97 0.0e+00 0.01 852.49 0.0e+00 0.00 4391.62 0.0e+00 2192 2192 0.00 3171.93 0.04 240.30 8.9e-05 0.00 3582.62 0.0e+00 0.01 786.43 0.0e+00 0.00 4133.13 0.0e+00 2208 2208 0.00 3457.79 0.04 265.98 1.6e-04 0.00 3457.79 0.0e+00 0.01 877.75 0.0e+00 0.00 3901.09 0.0e+00 2224 2224 0.00 3562.05 0.04 247.36 1.1e-04 0.00 3418.13 0.0e+00 0.01 791.57 0.0e+00 0.00 4056.78 0.0e+00 2240 2240 0.00 3558.73 0.04 250.94 1.2e-04 0.00 3284.98 0.0e+00 0.01 903.37 0.0e+00 0.00 4197.47 0.0e+00 2256 2256 0.00 3461.65 0.04 254.53 1.3e-04 0.00 3424.63 0.0e+00 0.01 814.51 0.0e+00 0.00 3979.97 0.0e+00 2272 2272 0.00 3510.92 0.04 258.16 1.1e-04 0.00 3614.18 0.0e+00 0.01 826.10 0.0e+00 0.00 4233.76 0.0e+00 2288 2288 0.00 3332.06 0.04 241.67 8.3e-05 0.00 3246.37 0.0e+00 0.01 761.61 0.0e+00 0.00 3998.47 0.0e+00 2304 2304 0.00 3282.28 0.04 245.06 1.3e-04 0.00 3185.74 0.0e+00 0.01 743.34 0.0e+00 0.00 4054.58 0.0e+00 2320 2320 0.00 3445.48 0.04 248.47 1.0e-04 0.00 3328.02 0.0e+00 0.01 807.53 0.0e+00 0.00 4306.85 0.0e+00 2336 2336 0.00 3165.68 0.04 251.91 9.0e-05 0.00 3572.55 0.0e+00 0.01 893.14 0.0e+00 0.00 4167.98 0.0e+00 2352 2352 0.00 3319.85 0.04 255.37 1.0e-04 0.00 3319.85 0.0e+00 0.01 829.96 0.0e+00 0.00 4315.80 0.0e+00 2368 2368 0.00 3477.34 0.04 258.86 8.3e-05 0.00 3365.16 0.0e+00 0.01 917.77 0.0e+00 0.00 4038.20 0.0e+00 2384 2384 0.00 3297.10 0.05 243.63 1.2e-04 0.00 3410.79 0.0e+00 0.01 826.86 0.0e+00 0.00 4030.93 0.0e+00 2400 2400 0.00 3456.72 0.05 246.91 7.7e-05 0.00 3561.47 0.0e+00 0.01 806.57 0.0e+00 0.00 4189.96 0.0e+00 2416 2416 0.00 
3609.11 0.05 233.53 1.0e-04 0.00 3821.41 0.0e+00 0.01 778.44 0.0e+00 0.00 4320.32 0.0e+00 2432 2432 0.00 3549.50 0.05 253.54 1.1e-04 0.00 3431.19 0.0e+00 0.01 860.49 0.0e+00 0.00 4087.31 0.0e+00 2448 2448 0.00 3378.40 0.05 239.76 8.7e-05 0.00 3378.40 0.0e+00 0.02 799.19 0.0e+00 0.00 4315.63 0.0e+00 2464 2464 0.00 3157.71 0.05 260.25 9.1e-05 0.00 3400.62 0.0e+00 0.01 850.15 0.0e+00 0.00 4085.16 0.0e+00 2480 2480 0.00 3444.92 0.05 246.07 1.2e-04 0.00 3198.85 0.0e+00 0.02 782.94 0.0e+00 0.00 4306.15 0.0e+00 2496 2496 0.00 3285.58 0.05 267.05 1.1e-04 0.00 3364.88 0.0e+00 0.01 830.84 0.0e+00 0.00 4418.53 0.0e+00 2512 2512 0.00 3534.38 0.05 236.68 1.2e-04 0.00 3213.08 0.0e+00 0.02 757.37 0.0e+00 0.00 4291.75 0.0e+00 2528 2528 0.00 3323.86 0.05 255.68 1.3e-04 0.00 3137.91 0.0e+00 0.02 745.74 0.0e+00 0.00 4090.91 0.0e+00 2544 2544 0.00 3236.60 0.05 242.75 1.3e-04 0.00 3236.60 0.0e+00 0.02 823.86 0.0e+00 0.00 4401.78 0.0e+00 2560 2560 0.00 3408.54 0.06 231.35 9.3e-05 0.00 3277.44 0.0e+00 0.02 546.24 0.0e+00 0.00 4171.29 0.0e+00 2576 2576 0.00 3499.54 0.06 234.25 7.2e-05 0.00 3499.54 0.0e+00 0.02 796.45 0.0e+00 0.00 4114.98 0.0e+00 2592 2592 0.00 3359.88 0.05 251.99 8.2e-05 0.00 3225.48 0.0e+00 0.02 855.24 0.0e+00 0.00 4154.03 0.0e+00 2608 2608 0.00 3463.33 0.06 240.10 9.1e-05 0.00 3129.37 0.0e+00 0.02 793.68 0.0e+00 0.00 4081.78 0.0e+00 2624 2624 0.00 3030.14 0.05 258.25 1.1e-04 0.00 3167.88 0.0e+00 0.02 826.40 0.0e+00 0.00 4269.75 0.0e+00 2640 2640 0.00 3295.34 0.06 246.03 1.1e-04 0.00 3346.04 0.0e+00 0.02 760.46 0.0e+00 0.00 4055.81 0.0e+00 2656 2656 0.00 3245.61 0.06 249.02 8.7e-05 0.00 3386.72 0.0e+00 0.02 846.68 0.0e+00 0.00 4233.40 0.0e+00 2672 2672 0.00 3427.64 0.06 238.03 9.1e-05 0.00 3427.64 0.0e+00 0.02 779.01 0.0e+00 0.00 4284.55 0.0e+00 2688 2688 0.00 3547.65 0.06 255.06 9.4e-05 0.00 3324.28 0.0e+00 0.02 788.37 0.0e+00 0.00 4204.62 0.0e+00 2704 2704 0.00 3363.97 0.06 230.94 8.8e-05 0.00 3510.22 0.0e+00 0.02 731.30 0.0e+00 0.00 3802.74 0.0e+00 2720 2720 0.00 
3498.07 0.06 233.68 9.1e-05 0.00 3255.89 0.0e+00 0.02 739.98 0.0e+00 0.00 3995.87 0.0e+00 2736 2736 0.00 3675.47 0.06 236.43 9.0e-05 0.00 3675.47 0.0e+00 0.02 816.77 0.0e+00 0.00 3893.27 0.0e+00 2752 2752 0.00 3443.13 0.06 239.21 1.1e-04 0.00 3305.40 0.0e+00 0.02 883.74 0.0e+00 0.00 4241.93 0.0e+00 2768 2768 0.00 3371.81 0.07 229.90 7.2e-05 0.00 3483.28 0.0e+00 0.02 766.32 0.0e+00 0.00 3984.87 0.0e+00 2784 2784 0.00 3523.66 0.06 244.80 9.6e-05 0.00 3523.66 0.0e+00 0.02 775.20 0.0e+00 0.00 3876.02 0.0e+00 2800 2800 0.00 3279.13 0.07 235.24 9.4e-05 0.00 3450.22 0.0e+00 0.02 784.14 0.0e+00 0.00 4234.36 0.0e+00 2816 2816 0.00 3316.71 0.07 237.94 8.1e-05 0.01 3172.51 0.0e+00 0.02 721.02 0.0e+00 0.00 4282.88 0.0e+00 2832 2832 0.00 3529.52 0.07 229.19 7.3e-05 0.00 3529.52 0.0e+00 0.02 729.24 0.0e+00 0.00 4083.74 0.0e+00 2848 2848 0.01 3245.01 0.07 243.38 9.9e-05 0.01 3245.01 0.0e+00 0.02 811.25 0.0e+00 0.00 4277.51 0.0e+00 2864 2864 0.00 3281.57 0.07 234.40 1.2e-04 0.00 3445.65 0.0e+00 0.02 820.39 0.0e+00 0.00 4266.04 0.0e+00 2880 2880 0.00 3318.34 0.07 248.88 8.9e-05 0.00 3469.17 0.0e+00 0.02 829.58 0.0e+00 0.00 3982.00 0.0e+00 2896 2896 0.01 3355.31 0.07 228.77 8.1e-05 0.00 3523.07 0.0e+00 0.02 838.83 0.0e+00 0.00 4026.37 0.0e+00 2912 2912 0.01 3238.28 0.07 242.32 1.1e-04 0.01 3392.48 0.0e+00 0.02 848.12 0.0e+00 0.00 4070.98 0.0e+00 2928 2928 0.01 3258.37 0.07 233.85 7.9e-05 0.01 3429.86 0.0e+00 0.02 779.51 0.0e+00 0.00 4053.47 0.0e+00 2944 2944 0.00 3467.44 0.07 236.42 9.2e-05 0.00 3467.44 0.0e+00 0.02 866.86 0.0e+00 0.00 4160.93 0.0e+00 2960 2960 0.01 3154.71 0.08 228.60 1.0e-04 0.01 3345.90 0.0e+00 0.02 730.26 0.0e+00 0.00 4301.88 0.0e+00 2976 2976 0.01 3382.17 0.08 231.08 9.7e-05 0.01 3366.06 0.0e+00 0.02 805.28 0.0e+00 0.00 4187.45 0.0e+00 2992 2992 0.01 3418.63 0.08 223.84 8.6e-05 0.01 3418.63 0.0e+00 0.02 813.96 0.0e+00 0.00 4297.71 0.0e+00 heim$
Using the gnuplot script
# Plots the MFLOPS columns of the col-major GEMV benchmark output as SVG.
datafile = "gemv_colmajor.dat"

set terminal svg size 900, 500
set output "bench.gemv.svg"

set title "GEMV (col major)"
set xlabel "Matrix dimension M=N"
set ylabel "MFLOPS"
set yrange [0:12000]
set key outside
set pointsize 0.5

# Column 1 holds the dimension m; columns 4, 6, 9, 12 and 15 hold the MFLOPS
# of the reference implementation and of the four variants.
plot datafile using 1:4  with linespoints lt 2 lw 3 title "dgemv_ref", \
     datafile using 1:6  with linespoints lt 3 lw 3 title "dgemv_dot", \
     datafile using 1:9  with linespoints lt 4 lw 3 title "dgemv_axpy", \
     datafile using 1:12 with linespoints lt 7 lw 3 title "dgemv_dotf", \
     datafile using 1:15 with linespoints lt 8 lw 3 title "dgemv_axpyf"
and running it through gnuplot
heim$ gnuplot gemv.plot heim$
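gives the plot bench.gemv.svg, which shows the MFLOPS of dgemv_ref and of the four variants over the matrix dimension M=N.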
Exercise (Gnuplot)
Use gnuplot to visualize benchmark results for the row major case.
Exercise (Fuse Factor)
Find the best fuse factor for axpyf and dotf.