# Possible Solution

## Source code

#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <stddef.h>
#include <sys/times.h>
#include <unistd.h>
#include <mkl_types.h>

#ifndef MIN_M
#define MIN_M 100
#endif

#ifndef MIN_N
#define MIN_N 100
#endif

#ifndef MAX_M
#define MAX_M 1500
#endif

#ifndef MAX_N
#define MAX_N 1500
#endif

#ifndef INCX
#define INCX 1
#endif

#ifndef INCY
#define INCY 1
#endif

#ifndef ALPHA
#define ALPHA 1.5
#endif

#ifndef BETA
#define BETA 1.5
#endif

#ifndef T_MIN
#define T_MIN 5
#endif

double A[MAX_M*MAX_N];
double X[MAX_N*INCX];
double Y[MAX_M*INCY];
double Y1[MAX_M*INCY];
double Y2[MAX_M*INCY];
double Y3[MAX_M*INCY];
double Y4[MAX_M*INCY];
double Y5[MAX_M*INCY];

double
walltime()
{
struct tms    ts;
static double ClockTick=0.0;

if (ClockTick==0.0) {
ClockTick = 1.0 / ((double) sysconf(_SC_CLK_TCK));
}
return ((double) times(&ts)) * ClockTick;
}

void
initMatrix(size_t m, size_t n, double *A, size_t incRowA, size_t incColA)
{
for (size_t j=0; j<n; ++j) {
for (size_t i=0; i<m; ++i) {
A[i*incRowA+j*incColA] = ((double)rand() - RAND_MAX/2)*200/RAND_MAX;
}
}
}

void
copyMatrix(size_t m, size_t n,
const double *A, size_t incRowA, size_t incColA,
double *B, size_t incRowB, size_t incColB)
{
for (size_t j=0; j<n; ++j) {
for (size_t i=0; i<m; ++i) {
B[i*incRowB+j*incColB] = A[i*incRowA+j*incColA];
}
}
}

double
asumDiffMatrix(size_t m, size_t n,
const double *A, size_t incRowA, size_t incColA,
double *B, size_t incRowB, size_t incColB)
{
double asum = 0;

for (size_t j=0; j<n; ++j) {
for (size_t i=0; i<m; ++i) {
asum += fabs(B[i*incRowB+j*incColB] - A[i*incRowA+j*incColA]);
}
}
return asum;
}

//------------------------------------------------------------------------------

void
dgemv(const char *trans,
const MKL_INT *m, const MKL_INT *n, const double *alpha,
const double *A, const MKL_INT *ldA, const double *x,
const MKL_INT *incX,
const double *beta, double *y, const MKL_INT *incY);

void
dgemv_mkl(MKL_INT m, MKL_INT n,
double alpha,
const double *A, MKL_INT incRowA, MKL_INT incColA,
const double *x, MKL_INT incX,
double beta,
double *y, MKL_INT incY)
{
MKL_INT ldA   = (incRowA==1) ? incColA : incRowA;
char    trans = (incRowA==1) ? 'N' : 'T';
MKL_INT M     = (incRowA==1) ? m : n;
MKL_INT N     = (incRowA==1) ? n : m;

dgemv(&trans, &M, &N, &alpha, A, &ldA, x, &incX, &beta, y, &incY);
}

//------------------------------------------------------------------------------

void
dscal_ulm(size_t n, double alpha, double *x, ptrdiff_t incX)
{
if (alpha==1) {
return;
}
if (alpha==0) {
for (size_t i=0; i<n; ++i) {
x[i*incX] = 0;
}
} else {
for (size_t i=0; i<n; ++i) {
x[i*incX] *= alpha;
}
}
}

double
ddot_ulm(size_t n,
const double *x, ptrdiff_t incX,
const double *y, ptrdiff_t incY)
{
double result = 0;

for (size_t i=0; i<n; ++i) {
result += x[i*incX]*y[i*incY];
}
return result;
}

void
daxpy_ulm(size_t n, double alpha,
const double *x, ptrdiff_t incX,
double *y, ptrdiff_t incY)
{
if (alpha==0) {
return;
}
for (size_t i=0; i<n; ++i) {
y[i*incY] += alpha*x[i*incX];
}
}

void
dgemv_ulm(size_t m, size_t n,
double alpha,
const double *A, ptrdiff_t incRowA, ptrdiff_t incColA,
const double *x, ptrdiff_t incX,
double beta,
double *y, ptrdiff_t incY)
{
dscal_ulm(m, beta, y, incY);

if (alpha==0) {
return;
}

if (incRowA > incColA) {
for (size_t i=0; i<m; ++i) {
y[i*incY] += alpha*ddot_ulm(n, &A[i*incRowA], incColA, x, incX);
}
} else  {
for (size_t j=0; j<n; ++j) {
daxpy_ulm(m, alpha*x[j*incX], &A[j*incColA], incRowA, y, incY);
}
}
}

//------------------------------------------------------------------------------

#ifndef COLMAJOR
//#define COLMAJOR 1
#define COLMAJOR 0
#endif

int
main()
{
size_t runs, incRowA, incColA;
double t0, t1, t2;
double diff2;
double alpha = ALPHA;
double beta  = BETA;

initMatrix(MAX_M, MAX_N, A, 1, MAX_M);
initMatrix(MAX_N, 1, X, INCX, 1);
initMatrix(MAX_M, 1, Y, INCY, 1);

printf("# COLMAJOR    = %d\n", COLMAJOR);
printf("# T_MIN       = %d\n", T_MIN);
printf("#RUN    M     N  INCROW  INCCOL");
printf("    GEMV_MKL    GEMV_ULM");
printf("    GEMV_MKL    GEMV_ULM");
printf("       DIFF2");
printf("\n");
printf("#                              ");
printf("    (t in s)    (t in s)");
printf("    (MFLOPS)    (MFLOPS)");
printf("           ");
printf("\n");

for (size_t i=0, m=MIN_M, n=MIN_N; m<=MAX_M && n<=MAX_N;
++i, m+=100, n+=100) {

if (COLMAJOR) {
incRowA = 1;
incColA = m;
} else {
incRowA = n;
incColA = 1;
}

t1   = 0;
runs = 0;
do {
copyMatrix(MAX_M, 1, Y, INCY, 1, Y1, INCY, 1);
t0 = walltime();
dgemv_mkl(m, n, alpha,
A, incRowA, incColA,
X, INCX,
beta,
Y1, INCY);
t1 += walltime() - t0;
++runs;
} while (t1<T_MIN);
t1 /= runs;

t2   = 0;
runs = 0;
do {
copyMatrix(MAX_M, 1, Y, INCY, 1, Y2, INCY, 1);
t0 = walltime();
dgemv_ulm(m, n, alpha,
A, incRowA, incColA,
X, INCX,
beta,
Y2, INCY);
t2 += walltime() - t0;
++runs;
} while (t2<T_MIN);
t2 /= runs;
diff2 = asumDiffMatrix(m, 1, Y1, INCY, 1, Y2, INCY, 1);

printf("%3ld %5ld %5ld %7ld %7ld ", i, m, n, incRowA, incColA);
printf("%11.4lf %11.4lf ", t1, t2);
printf("%11.4lf ", 2*(m/1000.0)*(n/1000.0)/t1);
printf("%11.4lf ", 2*(m/1000.0)*(n/1000.0)/t2);
printf("%11.4lf ", diff2);
printf("\n");
}
}


## Test run

heim$make gcc -DCOLMAJOR=0 -I/opt/intel/compilers_and_libraries/linux/mkl/include -DMKL_ILP64 -std=c99 -Wall -Ofast -L/opt/intel/compilers_and_libraries/linux/lib/intel64 -L/opt/intel/compilers_and_libraries/linux/mkl/lib/intel64 -Wl,-rpath -Wl,/opt/intel/compilers_and_libraries/linux/lib/intel64 -Wl,-rpath -Wl,/opt/intel/compilers_and_libraries/linux/mkl/lib/intel64 -lmkl_intel_ilp64 -lmkl_core -lmkl_sequential -lm -lpthread -o gemv_rowmajor gemv.c gcc -DCOLMAJOR=1 -I/opt/intel/compilers_and_libraries/linux/mkl/include -DMKL_ILP64 -std=c99 -Wall -Ofast -L/opt/intel/compilers_and_libraries/linux/lib/intel64 -L/opt/intel/compilers_and_libraries/linux/mkl/lib/intel64 -Wl,-rpath -Wl,/opt/intel/compilers_and_libraries/linux/lib/intel64 -Wl,-rpath -Wl,/opt/intel/compilers_and_libraries/linux/mkl/lib/intel64 -lmkl_intel_ilp64 -lmkl_core -lmkl_sequential -lm -lpthread -o gemv_colmajor gemv.c heim$ ./gemv_colmajor > gemv_colmajor.dat
heim$./gemv_rowmajor > gemv_rowmajor.dat heim$ cat gemv_colmajor.dat
# COLMAJOR    = 1
# T_MIN       = 5
#RUN    M     N  INCROW  INCCOL    GEMV_MKL    GEMV_ULM    GEMV_MKL    GEMV_ULM       DIFF2
#                                  (t in s)    (t in s)    (MFLOPS)    (MFLOPS)
0   100   100       1     100      0.0000      0.0000   8669.1120   3936.8383      0.0000
1   200   200       1     200      0.0000      0.0000   8931.2415   4650.1876      0.0000
2   300   300       1     300      0.0000      0.0000   8584.8120   4907.1240      0.0000
3   400   400       1     400      0.0000      0.0001   8777.7086   5053.2535      0.0000
4   500   500       1     500      0.0001      0.0001   8649.0000   5094.9000      0.0000
5   600   600       1     600      0.0001      0.0001   8820.1440   5061.2695      0.0000
6   700   700       1     700      0.0001      0.0002   7939.7640   4919.6000      0.0000
7   800   800       1     800      0.0002      0.0003   7069.3812   4685.0560      0.0000
8   900   900       1     900      0.0003      0.0004   5902.4910   4434.4671      0.0000
9  1000  1000       1    1000      0.0004      0.0005   5428.8000   4160.8000      0.0000
10  1100  1100       1    1100      0.0005      0.0006   5063.6080   4170.9980      0.0000
11  1200  1200       1    1200      0.0006      0.0007   4974.1796   4119.5520      0.0000
12  1300  1300       1    1300      0.0007      0.0008   4763.0960   4100.5269      0.0000
13  1400  1400       1    1400      0.0009      0.0010   4306.5120   3964.6880      0.0000
14  1500  1500       1    1500      0.0010      0.0011   4295.7000   3996.9000      0.0000
heim$cat gemv_rowmajor.dat # COLMAJOR = 0 # T_MIN = 5 #RUN M N INCROW INCCOL GEMV_MKL GEMV_ULM GEMV_MKL GEMV_ULM DIFF2 # (t in s) (t in s) (MFLOPS) (MFLOPS) 0 100 100 100 1 0.0000 0.0000 8464.4960 4336.8400 0.0000 1 200 200 200 1 0.0000 0.0000 8541.9360 4775.8403 0.0000 2 300 300 300 1 0.0000 0.0000 8447.9040 4728.4671 0.0000 3 400 400 400 1 0.0000 0.0001 8638.9142 4815.6487 0.0000 4 500 500 500 1 0.0001 0.0001 8559.9800 4754.7904 0.0000 5 600 600 600 1 0.0001 0.0002 8389.7964 4717.2960 0.0000 6 700 700 700 1 0.0001 0.0002 8078.3360 4681.4600 0.0000 7 800 800 800 1 0.0002 0.0003 6930.9440 4406.6747 0.0000 8 900 900 900 1 0.0003 0.0004 6119.7120 4197.4200 0.0000 9 1000 1000 1000 1 0.0004 0.0005 5330.4000 4051.6000 0.0000 10 1100 1100 1100 1 0.0005 0.0006 4773.6920 3993.0000 0.0000 11 1200 1200 1200 1 0.0006 0.0007 5059.5840 4028.5440 0.0000 12 1300 1300 1300 1 0.0008 0.0009 4479.8520 3964.0640 0.0000 13 1400 1400 1400 1 0.0009 0.0010 4432.7360 3967.0400 0.0000 14 1500 1500 1500 1 0.0010 0.0011 4392.2156 3973.6527 0.0000 heim$ gnuplot gemv_colmajor.plot
heim$gnuplot gemv_rowmajor.plot heim$ 

## Benchmark

• Here the plot for the col major case:

• and here for the row major case: