# Matrix product

#### Content

In BLAS (Basic Linear Algebra Subprograms) the general matrix product performs

$C \leftarrow \beta C + \alpha A B$

which is named GEMM (general matrix-matrix product) and where

$\alpha \in \mathbb{K},\quad \beta \in \mathbb{K},\quad A \in \mathbb{K}^{m\times k},\quad B \in \mathbb{K}^{k\times n},\quad\text{and}\quad C \in \mathbb{K}^{m\times n}$

where $$\mathbb{K}$$ represents a floating point type, i.e. double.

## Special cases

• If $$\alpha=0$$ or dimension $$k=0$$ the operation just computes $$C \leftarrow \beta C$$ and returns.

• If $$\beta = 0$$ then matrix $$C$$ is allowed to contain Nan (not a number) entries

## Exercise

Below we provide a simple benchmark stub for the GEMM operation. It already contains an implementation that traverses matrices $$A$$, $$B$$ and $$C$$ in row major order:

• Make yourself familiar with the provided program:

• What storage is used for the matrices.

• In its current state the program already compiles (but function dgemm_col is not doing anything so far). Before you start with modifying the code: compile and run it. Make sure you understand the produced output.

• Implement dgemm_col such that it traverses all matrices in col major order.

• Compile with -O3 and compare the runtime of both GEMM variants and visualize the benchmark with a plot.

## Stub for the GEMM benchmark

#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/times.h>
#include <unistd.h>

#ifndef MINDIM_M
#define MINDIM_M    100
#endif

#ifndef MINDIM_N
#define MINDIM_N    100
#endif

#ifndef MINDIM_K
#define MINDIM_K    100
#endif

#ifndef MAXDIM_M
#define MAXDIM_M    1000
#endif

#ifndef MAXDIM_N
#define MAXDIM_N    1000
#endif

#ifndef MAXDIM_K
#define MAXDIM_K    1000
#endif

#ifndef INC_M
#define INC_M   100
#endif

#ifndef INC_N
#define INC_N   100
#endif

#ifndef INC_K
#define INC_K   100
#endif

#ifndef MIN_T
#define MIN_T   1
#endif

#ifndef ALPHA
#define ALPHA   1
#endif

#ifndef BETA
#define BETA   1
#endif

double A[MAXDIM_M*MAXDIM_K];
double B[MAXDIM_K*MAXDIM_N];
double C1[MAXDIM_M*MAXDIM_N];
double C2[MAXDIM_M*MAXDIM_N];

/* return real time in seconds since start of the process */
double
wallTime()
{
static clock_t ticks_per_second = 0;
if (!ticks_per_second) {
ticks_per_second = sysconf(_SC_CLK_TCK);
}
struct tms timebuf;
/* times returns the number of real time ticks passed since start */
return (double) times(&timebuf) / ticks_per_second;
}

double
asumDiff(size_t m, size_t n,
const double *A, ptrdiff_t incRowA, ptrdiff_t incColA,
const double *B, ptrdiff_t incRowB, ptrdiff_t incColB)
{
double diff = 0;
for (size_t i = 0; i < m; ++i) {
for (size_t j = 0; j < n; ++j) {
diff += fabs(B[i*incRowB+j*incColB] - A[i*incRowA+j*incColA]);
}
}
return diff;
}

void
initMatrix(size_t m, size_t n, double *A, ptrdiff_t incRowA, ptrdiff_t incColA)
{
for (size_t j = 0; j < n; ++j) {
for (size_t i = 0; i < m; ++i) {
A[i*incRowA+j*incColA] = j*n+i+1;
}
}
}

void
printMatrix(size_t m, size_t n, const double *A,
ptrdiff_t incRowA, ptrdiff_t incColA)
{
for (size_t i = 0; i < m; ++i) {
printf("   ");
for (size_t j = 0; j < n; ++j) {
printf("%4.1lf ", A[i*incRowA+j*incColA]);
}
printf("\n");
}
printf("\n");
}

void
dgemm_row(size_t m, size_t n, size_t k, double alpha,
const double *A, ptrdiff_t incRowA, ptrdiff_t incColA,
const double *B, ptrdiff_t incRowB, ptrdiff_t incColB,
double beta,
double *C, ptrdiff_t incRowC, ptrdiff_t incColC)
{
if (beta!=0) {
if (beta!=1) {
for (size_t i=0; i<m; ++i) {
for (size_t j=0; j<n; ++j) {
C[i*incRowC+j*incColC] *= beta;
}
}
}
} else {
for (size_t i=0; i<m; ++i) {
for (size_t j=0; j<n; ++j) {
C[i*incRowC+j*incColC] = 0;
}
}
}
if (alpha==0 || k==0) {
return;
}
for (size_t i=0; i<m; ++i) {
for (size_t l=0; l<k; ++l) {
for (size_t j=0; j<n; ++j) {
C[i*incRowC+j*incColC] +=
alpha*A[i*incRowA+l*incColA]*B[l*incRowB+j*incColB];
}
}
}
}

void
dgemm_col(size_t m, size_t n, size_t k, double alpha,
const double *A, ptrdiff_t incRowA, ptrdiff_t incColA,
const double *B, ptrdiff_t incRowB, ptrdiff_t incColB,
double beta,
double *C, ptrdiff_t incRowC, ptrdiff_t incColC)
{
// TODO: Your code here ...
}

int
main()
{
printf("   M    N    K      t1      t2   t2/t1       diff\n");
printf("               col-maj row-maj\n");
printf("=================================================\n");

for (size_t m = MINDIM_M, n = MINDIM_N, k = MINDIM_K;
m <= MAXDIM_M && n <= MAXDIM_N && k <= MAXDIM_K;
m += INC_M, n += INC_N, k += INC_K)
{
// set storage order for A, B, C1, C2
ptrdiff_t incRowA = 1, incColA = m;
ptrdiff_t incRowB = 1, incColB = k;
ptrdiff_t incRowC = 1, incColC = m;     // used for C1, C2

initMatrix(m, k, A, incRowA, incColA);
initMatrix(k, n, B, incRowB, incColB);

size_t runs = 0;
double t1 = 0;
do {
initMatrix(m, n, C1, incRowC, incColC);

double t0 = wallTime();
dgemm_col(m, n, k, ALPHA,
A, incRowA, incColA,
B, incRowB, incColB,
BETA,
C1, incRowC, incColC);
t1 += wallTime() - t0;
++runs;
} while (t1 < MIN_T);
t1 /= runs;

runs = 0;
double t2 = 0;
do {
initMatrix(m, n, C2, incRowC, incColC);

double t0 = wallTime();
dgemm_row(m, n, k,
ALPHA,
A, incRowA, incColA,
B, incRowB, incColB,
BETA,
C2, incRowC, incColC);
t2 += wallTime() - t0;
++runs;
} while (t2 < MIN_T);
t2 /= runs;

double diff = asumDiff(m, n,
C1, incRowC, incColC,
C2, incRowC, incColC);

printf("%4zu %4zu %4zu %7.2lf %7.2lf %7.2lf %10.2le\n",
m, n, k, t1, t2, t2/t1, diff);
}
}