# Possible Solution

## Source code

#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <stddef.h>
#include <sys/times.h>
#include <unistd.h>
#include <mkl_types.h>

#ifndef MIN_M
#define MIN_M 100
#endif

#ifndef MIN_N
#define MIN_N 100
#endif

#ifndef MAX_M
#define MAX_M 1500
#endif

#ifndef MAX_N
#define MAX_N 1500
#endif

#ifndef INCX
#define INCX 1
#endif

#ifndef INCY
#define INCY 1
#endif

#ifndef ALPHA
#define ALPHA 1.5
#endif

#ifndef BETA
#define BETA 1.5
#endif

#ifndef T_MIN
#define T_MIN 5
#endif

double A[MAX_M*MAX_N];
double X[MAX_N*INCX];
double Y[MAX_M*INCY];
double Y1[MAX_M*INCY];
double Y2[MAX_M*INCY];
double Y3[MAX_M*INCY];
double Y4[MAX_M*INCY];
double Y5[MAX_M*INCY];

double
walltime()
{
struct tms    ts;
static double ClockTick=0.0;

if (ClockTick==0.0) {
ClockTick = 1.0 / ((double) sysconf(_SC_CLK_TCK));
}
return ((double) times(&ts)) * ClockTick;
}

void
initMatrix(size_t m, size_t n, double *A, size_t incRowA, size_t incColA)
{
for (size_t j=0; j<n; ++j) {
for (size_t i=0; i<m; ++i) {
A[i*incRowA+j*incColA] = ((double)rand() - RAND_MAX/2)*200/RAND_MAX;
}
}
}

void
copyMatrix(size_t m, size_t n,
const double *A, size_t incRowA, size_t incColA,
double *B, size_t incRowB, size_t incColB)
{
for (size_t j=0; j<n; ++j) {
for (size_t i=0; i<m; ++i) {
B[i*incRowB+j*incColB] = A[i*incRowA+j*incColA];
}
}
}

double
asumDiffMatrix(size_t m, size_t n,
const double *A, size_t incRowA, size_t incColA,
double *B, size_t incRowB, size_t incColB)
{
double asum = 0;

for (size_t j=0; j<n; ++j) {
for (size_t i=0; i<m; ++i) {
asum += fabs(B[i*incRowB+j*incColB] - A[i*incRowA+j*incColA]);
}
}
return asum;
}

//------------------------------------------------------------------------------

void
dgemv(const char *trans,
const MKL_INT *m, const MKL_INT *n, const double *alpha,
const double *A, const MKL_INT *ldA, const double *x,
const MKL_INT *incX,
const double *beta, double *y, const MKL_INT *incY);

void
dgemv_mkl(MKL_INT m, MKL_INT n,
double alpha,
const double *A, MKL_INT incRowA, MKL_INT incColA,
const double *x, MKL_INT incX,
double beta,
double *y, MKL_INT incY)
{
MKL_INT ldA   = (incRowA==1) ? incColA : incRowA;
char    trans = (incRowA==1) ? 'N' : 'T';
MKL_INT M     = (incRowA==1) ? m : n;
MKL_INT N     = (incRowA==1) ? n : m;

dgemv(&trans, &M, &N, &alpha, A, &ldA, x, &incX, &beta, y, &incY);
}

//------------------------------------------------------------------------------

#ifndef DGEMV_DOTF_FUSE
#define DGEMV_DOTF_FUSE  6
#endif

#ifndef DGEMV_AXPYF_FUSE
#define DGEMV_AXPYF_FUSE  6
#endif

void
ddotf_ulm(size_t n,
double alpha,
const double *A, ptrdiff_t incRowA, ptrdiff_t incColA,
const double *x, ptrdiff_t incX,
double *y, ptrdiff_t incY)
{
for (size_t j=0; j<n; ++j) {
for (size_t i=0; i<DGEMV_DOTF_FUSE; ++i) {
y[i*incY] += alpha*A[i*incRowA+j*incColA]*x[j*incX];
}
}
}

void
daxpyf_ulm(size_t m,
double alpha,
const double *A, ptrdiff_t incRowA, ptrdiff_t incColA,
const double *x, ptrdiff_t incX,
double *y, ptrdiff_t incY)
{
for (size_t i=0; i<m; ++i) {
for (size_t j=0; j<DGEMV_AXPYF_FUSE; ++j) {
y[i*incY] += alpha*A[i*incRowA+j*incColA]*x[j*incX];
}
}
}

void
dscal_ulm(size_t n, double alpha, double *x, ptrdiff_t incX)
{
if (alpha==1) {
return;
}
if (alpha==0) {
for (size_t i=0; i<n; ++i) {
x[i*incX] = 0;
}
} else {
for (size_t i=0; i<n; ++i) {
x[i*incX] *= alpha;
}
}
}

double
ddot_ulm(size_t n,
const double *x, ptrdiff_t incX,
const double *y, ptrdiff_t incY)
{
double result = 0;

for (size_t i=0; i<n; ++i) {
result += x[i*incX]*y[i*incY];
}
return result;
}

void
daxpy_ulm(size_t n, double alpha,
const double *x, ptrdiff_t incX,
double *y, ptrdiff_t incY)
{
if (alpha==0) {
return;
}
for (size_t i=0; i<n; ++i) {
y[i*incY] += alpha*x[i*incX];
}
}

void
dgemv_ulm(size_t m, size_t n,
double alpha,
const double *A, ptrdiff_t incRowA, ptrdiff_t incColA,
const double *x, ptrdiff_t incX,
double beta,
double *y, ptrdiff_t incY)
{
dscal_ulm(m, beta, y, incY);

if (alpha==0) {
return;
}

if (incRowA > incColA) {
size_t mb = m / DGEMV_DOTF_FUSE;

for (size_t ib=0; ib<mb; ++ib) {
ddotf_ulm(n, alpha,
&A[ib*DGEMV_DOTF_FUSE*incRowA], incRowA, incColA,
x, incX,
&y[ib*DGEMV_DOTF_FUSE*incY], incY);
}

for (size_t i=mb*DGEMV_DOTF_FUSE; i<m; ++i) {
y[i*incY] += alpha*ddot_ulm(n, &A[i*incRowA], incColA, x, incX);
}
} else  {
size_t nb = n / DGEMV_AXPYF_FUSE;

for (size_t jb=0; jb<nb; ++jb) {
daxpyf_ulm(m, alpha,
&A[jb*DGEMV_AXPYF_FUSE*incColA], incRowA, incColA,
&x[jb*DGEMV_AXPYF_FUSE*incX], incX,
y, incY);
}
for (size_t j=nb*DGEMV_AXPYF_FUSE; j<n; ++j) {
daxpy_ulm(m, alpha*x[j*incX], &A[j*incColA], incRowA, y, incY);
}
}
}

//------------------------------------------------------------------------------

#ifndef COLMAJOR
//#define COLMAJOR 1
#define COLMAJOR 0
#endif

int
main()
{
size_t runs, incRowA, incColA;
double t0, t1, t2;
double diff2;
double alpha = ALPHA;
double beta  = BETA;

initMatrix(MAX_M, MAX_N, A, 1, MAX_M);
initMatrix(MAX_N, 1, X, INCX, 1);
initMatrix(MAX_M, 1, Y, INCY, 1);

printf("# COLMAJOR    = %d\n", COLMAJOR);
printf("# T_MIN       = %d\n", T_MIN);
printf("#RUN    M     N  INCROW  INCCOL");
printf("    GEMV_MKL    GEMV_ULM");
printf("    GEMV_MKL    GEMV_ULM");
printf("       DIFF2");
printf("\n");
printf("#                              ");
printf("    (t in s)    (t in s)");
printf("    (MFLOPS)    (MFLOPS)");
printf("           ");
printf("\n");

for (size_t i=0, m=MIN_M, n=MIN_N; m<=MAX_M && n<=MAX_N;
++i, m+=100, n+=100) {

if (COLMAJOR) {
incRowA = 1;
incColA = m;
} else {
incRowA = n;
incColA = 1;
}

t1   = 0;
runs = 0;
do {
copyMatrix(MAX_M, 1, Y, INCY, 1, Y1, INCY, 1);
t0 = walltime();
dgemv_mkl(m, n, alpha,
A, incRowA, incColA,
X, INCX,
beta,
Y1, INCY);
t1 += walltime() - t0;
++runs;
} while (t1<T_MIN);
t1 /= runs;

t2   = 0;
runs = 0;
do {
copyMatrix(MAX_M, 1, Y, INCY, 1, Y2, INCY, 1);
t0 = walltime();
dgemv_ulm(m, n, alpha,
A, incRowA, incColA,
X, INCX,
beta,
Y2, INCY);
t2 += walltime() - t0;
++runs;
} while (t2<T_MIN);
t2 /= runs;
diff2 = asumDiffMatrix(m, 1, Y1, INCY, 1, Y2, INCY, 1);

printf("%3ld %5ld %5ld %7ld %7ld ", i, m, n, incRowA, incColA);
printf("%11.4lf %11.4lf ", t1, t2);
printf("%11.4lf ", 2*(m/1000.0)*(n/1000.0)/t1);
printf("%11.4lf ", 2*(m/1000.0)*(n/1000.0)/t2);
printf("%11.4lf ", diff2);
printf("\n");
}
}

## Test run

heim$make gcc -DCOLMAJOR=0 -I/opt/intel/compilers_and_libraries/linux/mkl/include -DMKL_ILP64 -std=c99 -Wall -Ofast -L/opt/intel/compilers_and_libraries/linux/lib/intel64 -L/opt/intel/compilers_and_libraries/linux/mkl/lib/intel64 -Wl,-rpath -Wl,/opt/intel/compilers_and_libraries/linux/lib/intel64 -Wl,-rpath -Wl,/opt/intel/compilers_and_libraries/linux/mkl/lib/intel64 -lmkl_intel_ilp64 -lmkl_core -lmkl_sequential -lm -lpthread -o gemv_rowmajor gemv.c gcc -DCOLMAJOR=1 -I/opt/intel/compilers_and_libraries/linux/mkl/include -DMKL_ILP64 -std=c99 -Wall -Ofast -L/opt/intel/compilers_and_libraries/linux/lib/intel64 -L/opt/intel/compilers_and_libraries/linux/mkl/lib/intel64 -Wl,-rpath -Wl,/opt/intel/compilers_and_libraries/linux/lib/intel64 -Wl,-rpath -Wl,/opt/intel/compilers_and_libraries/linux/mkl/lib/intel64 -lmkl_intel_ilp64 -lmkl_core -lmkl_sequential -lm -lpthread -o gemv_colmajor gemv.c heim$ ./gemv_colmajor > gemv_colmajor.dat
heim$./gemv_rowmajor > gemv_rowmajor.dat heim$ cat gemv_colmajor.dat
# COLMAJOR    = 1
# T_MIN       = 5
#RUN    M     N  INCROW  INCCOL    GEMV_MKL    GEMV_ULM    GEMV_MKL    GEMV_ULM       DIFF2
#                                  (t in s)    (t in s)    (MFLOPS)    (MFLOPS)
0   100   100       1     100      0.0000      0.0000   8964.0000   7430.5868      0.0000
1   200   200       1     200      0.0000      0.0000   8762.5230   8150.2560      0.0000
2   300   300       1     300      0.0000      0.0000   8593.0419   8377.6320      0.0000
3   400   400       1     400      0.0000      0.0000   8751.4571   8264.5120      0.0000
4   500   500       1     500      0.0001      0.0001   8756.4000   8682.8343      0.0000
5   600   600       1     600      0.0001      0.0001   8758.6560   8365.0779      0.0000
6   700   700       1     700      0.0001      0.0001   8249.0180   7929.7680      0.0000
7   800   800       1     800      0.0002      0.0002   7207.9360   7078.5788      0.0000
8   900   900       1     900      0.0003      0.0003   5957.3880   6428.8080      0.0000
9  1000  1000       1    1000      0.0004      0.0004   5277.4451   5204.4000      0.0000
10  1100  1100       1    1100      0.0005      0.0005   4714.6440   4674.4720      0.0000
11  1200  1200       1    1200      0.0007      0.0007   4424.6228   4309.0778      0.0000
12  1300  1300       1    1300      0.0008      0.0008   4369.0379   4163.4840      0.0000
13  1400  1400       1    1400      0.0009      0.0009   4308.0800   4145.7920      0.0000
14  1500  1500       1    1500      0.0010      0.0011   4298.4000   4122.0000      0.0000
heim$cat gemv_rowmajor.dat # COLMAJOR = 0 # T_MIN = 5 #RUN M N INCROW INCCOL GEMV_MKL GEMV_ULM GEMV_MKL GEMV_ULM DIFF2 # (t in s) (t in s) (MFLOPS) (MFLOPS) 0 100 100 100 1 0.0000 0.0000 8162.4280 7206.9440 0.0000 1 200 200 200 1 0.0000 0.0000 8502.8480 8374.1760 0.0000 2 300 300 300 1 0.0000 0.0000 8350.4520 8733.6000 0.0000 3 400 400 400 1 0.0000 0.0000 8558.0800 8876.1996 0.0000 4 500 500 500 1 0.0001 0.0001 8545.7000 9061.3000 0.0000 5 600 600 600 1 0.0001 0.0001 8661.0240 8793.9360 0.0000 6 700 700 700 1 0.0001 0.0001 7874.3000 8026.0040 0.0000 7 800 800 800 1 0.0002 0.0002 7038.2080 7240.1920 0.0000 8 900 900 900 1 0.0003 0.0003 5949.6120 6335.1720 0.0000 9 1000 1000 1000 1 0.0004 0.0004 5364.8000 5348.8000 0.0000 10 1100 1100 1100 1 0.0005 0.0005 4768.3680 4688.0240 0.0000 11 1200 1200 1200 1 0.0006 0.0007 4553.8560 4359.6647 0.0000 12 1300 1300 1300 1 0.0008 0.0008 4464.3040 4214.1840 0.0000 13 1400 1400 1400 1 0.0009 0.0009 4454.6880 4215.7605 0.0000 14 1500 1500 1500 1 0.0010 0.0011 4422.7545 4134.6000 0.0000 heim$ gnuplot gemv_colmajor.plot
heim$gnuplot gemv_rowmajor.plot heim$

## Benchmark

• Here the plot for the col major case:

• and here for the row major case: