# GEMM Micro Kernel

#### Content

The GEMM micro kernel ($$\mu$$-gemm, ugemm) handles the special case of the GEMM operation

$C \leftarrow \beta \cdot C + \alpha \cdot A \, B\qquad\;C \in \mathbb{M}^{M_r \times N_r},\;A \in \mathbb{M}^{M_r \times k},\;B \in \mathbb{M}^{k \times N_r},\;$

where $$M_r$$, $$N_r$$ are given through some global variables or through macros (hence known at compile time in the later case). However, there are further restrictions:

• Matrix $$A$$ is stored in column-major and elements of $$A$$ are in a contiguous memory block. So more precisely the operation requires

$\text{incRow}(A) = 1\quad\text{and}\quad\text{incCol}(A) = M_r.$
• Matrix $$B$$ is stored in row-major and elements of $$B$$ are in a contiguous memory block. So more precisely the operation requires

$\text{incRow}(B) = N_r\quad\text{and}\quad\text{incCol}(B) = 1.$

## Special cases

• The GEMM micro kernel must handle the case $$\beta=0$$ correctly, i.e. assume that $$C$$ might contain NaN (Not a Number) entries.

• The case $$\alpha=0$$ can not occur (so you can assume that $$A$$ and $$B$$ do not contain NaN entries).

• Also the case $$k=0$$ will not occur (although this would be only relevant for performance but not for correctness).

## Implementation

You won't believe it! We provide you with an implementation of the GEMM micro kernel!

#ifndef DUGEMM_MR_DEFAULT
#define DUGEMM_MR_DEFAULT   4
#endif

#ifndef DUGEMM_NR_DEFAULT
#define DUGEMM_NR_DEFAULT   8
#endif

namespace ulmblas {

namespace dugemm_parameter {
std::size_t MR = DUGEMM_MR_DEFAULT;
std::size_t NR = DUGEMM_NR_DEFAULT;
} // namespace dugemm_parameter

void
ugemm_ref(std::size_t k, double alpha,
const double *A, const double *B,
double beta,
double *C, std::ptrdiff_t incRowC, std::ptrdiff_t incColC)
{
using namespace dugemm_parameter;

double AB[MR*NR];

for (std::size_t i=0; i<MR*NR; ++i) {
AB[i] = 0;
}
for (std::size_t l=0; l<k; ++l) {
for (std::size_t i=0; i<MR; ++i) {
for (std::size_t j=0; j<NR; ++j) {
AB[i*NR + j] += A[i]*B[j];
}
}
A += MR;
B += NR;
}
// Yeah, this is unnecessary if (alpha==0). But ok ...
for (std::size_t i=0; i<MR*NR; ++i) {
AB[i] *= alpha;
}
// This check for beta is really necessary
if (beta!=0) {
for (std::size_t j=0; j<NR; ++j) {
for (std::size_t i=0; i<MR; ++i) {
C[i*incRowC+j*incColC] *= beta;
C[i*incRowC+j*incColC] += AB[i*NR+j];
}
}
} else {
for (std::size_t j=0; j<NR; ++j) {
for (std::size_t i=0; i<MR; ++i) {
C[i*incRowC+j*incColC] = AB[i*NR+j];
}
}
}
}

} // namespace ulmblas


## Exercise

Write a simple program for testing the GEMM micro kernel:

• Use the GEMM error estimator from session 6.

• In case $$\beta=0$$ the initial matrix $$C$$ must be initialized with NaN (we provide a function nanMatrix for this purpose).

• Use macros such that different cases for $$\alpha$$, $$\beta$$, $$M_r$$, $$N_r$$ and $$k$$ can be tested.

• Matrix $$C$$ can be stored column-major or row-major.

Here some examples how the test program can be compiled and used:

• Compiled with default values for the macros:

theon$g++ -Wall -std=c++11 -o ugemm_test ugemm_test.cpp theon$ ugemm_test
MR      NR       k incRowC incColC   alpha    beta        error
4       8     128       1       4       1       1     1.35e-03
theon$ • Compiled for $$\alpha=1.5$$ and $$\beta=0$$ theon$ g++ -Wall -std=c++11 -o ugemm_test -DALPHA=1.5 -DBETA=0 ugemm_test.cpp
theon$ugemm_test MR NR k incRowC incColC alpha beta error 4 8 128 1 4 1.50 0 1.28e-03 theon$ 
• Compiled with all macros explicitly defined

theon$g++ -Wall -std=c++11 -o ugemm_test -DDUGEMM_MR_DEFAULT=3 -DDUGEMM_NR_DEFAULT=2 -DDIM_K=1000 -DALPHA=1.5 -DBETA=0 -DCOLMAJOR_C=0 ugemm_test.cpp theon$ ugemm_test
MR      NR       k incRowC incColC   alpha    beta        error
3       2    1000       2       1    1.50       0     1.11e-04
theon\$ 

#include <cassert>
#include <cstdlib>
#include <cstddef>
#include <cmath>
#include <cfloat>
#include <printf.hpp>

//------------------------------------------------------------------------------

void
randMatrix(std::size_t m, std::size_t n,
double *A, std::ptrdiff_t incRowA, std::ptrdiff_t incColA)
{
for (std::size_t j=0; j<n; ++j) {
for (std::size_t i=0; i<m; ++i) {
A[i*incRowA+j*incColA] = ((double)rand() - RAND_MAX/2)*2/RAND_MAX;
}
}
}

void
nanMatrix(std::size_t m, std::size_t n,
double *A, std::ptrdiff_t incRowA, std::ptrdiff_t incColA)
{
for (std::size_t j=0; j<n; ++j) {
for (std::size_t i=0; i<m; ++i) {
A[i*incRowA+j*incColA] = std::nan("");
}
}
}

void
printMatrix(std::size_t m, std::size_t n,
const double *A, std::ptrdiff_t incRowA, std::ptrdiff_t incColA)
{
for (std::size_t i=0; i<m; ++i) {
for (std::size_t j=0; j<n; ++j) {
fmt::printf("%10.3lf ", A[i*incRowA+j*incColA]);
}
fmt::printf("\n");
}
fmt::printf("\n");
}

double
genorm_inf(std::size_t m, std::size_t n,
const double *A, std::ptrdiff_t incRowA, std::ptrdiff_t incColA)
{
double res = 0;
for (std::size_t i=0; i<m; ++i) {
double asum = 0;
for (std::size_t j=0; j<n; ++j) {
asum += std::fabs(A[i*incRowA+j*incColA]);
}
if (std::isnan(asum)) {
return asum;
}
if (asum>res) {
res = asum;
}
}
return res;
}

void
geaxpy(std::size_t m, std::size_t n, double alpha,
const double *A, std::ptrdiff_t incRowA, std::ptrdiff_t incColA,
double *B, std::ptrdiff_t incRowB, std::ptrdiff_t incColB)
{
if (m==0 || n==0 || alpha==0) {
return;
}
// if B is row major:   B^T <- alpha*A^T + B^T
if (incRowB>incColB) {
geaxpy(n, m, alpha, A, incColA, incRowA, B, incColB, incRowB);
return;
}
// B is col major:
for (std::size_t j=0; j<n; ++j) {
for (std::size_t i=0; i<m; ++i) {
B[i*incRowB+j*incColB] += alpha*A[i*incRowA+j*incColA];
}
}
}

void
gecopy(std::size_t m, std::size_t n,
const double *A, std::ptrdiff_t incRowA, std::ptrdiff_t incColA,
double *B, std::ptrdiff_t incRowB, std::ptrdiff_t incColB)
{
if (m==0 || n==0) {
return;
}
// if B is row major:   B^T <- A^T
if (incRowB>incColB) {
gecopy(n, m, A, incColA, incRowA, B, incColB, incRowB);
return;
}
// B is col major:
for (std::size_t j=0; j<n; ++j) {
for (std::size_t i=0; i<m; ++i) {
B[i*incRowB+j*incColB] = A[i*incRowA+j*incColA];
}
}
}

//------------------------------------------------------------------------------

#define MAX(x,y)    ((x)>(y)) ? (x) : (y)

double
gemm_err_est(std::size_t m, std::size_t n, std::size_t k,
double alpha,
const double *A, std::ptrdiff_t incRowA, std::ptrdiff_t incColA,
const double *B, std::ptrdiff_t incRowB, std::ptrdiff_t incColB,
const double *C0, std::ptrdiff_t incRowC0, std::ptrdiff_t incColC0,
double beta,
const double *C_, std::ptrdiff_t incRowC_, std::ptrdiff_t incColC_,
double *C, std::ptrdiff_t incRowC, std::ptrdiff_t incColC)
{
geaxpy(m, n, -1, C_, incRowC_, incColC_, C, incRowC, incColC);

double normD  = genorm_inf(m, n, C, incRowC, incColC);
std::size_t N = MAX(m, MAX(n, k));

if (std::isnan(normD)) {
return normD;
}

if (normD==0) {
return 0;
}

double normA = 0;
double normB = 0;

if (alpha!=0) {
normB  = genorm_inf(k, n, B, incRowB, incColB);
normA  = genorm_inf(m, k, A, incRowA, incColA);
normA  *= fabs(alpha);
}

double normC0 = 0;
if (beta!=0) {
normC0 = genorm_inf(m, n, C0, incRowC0, incColC0);
normC0 *= fabs(beta);
}

return normD/(DBL_EPSILON*(N*normA*normB+normC0));
}

//------------------------------------------------------------------------------

void
gemm_ref(std::size_t m, std::size_t n, std::size_t k,
double alpha,
const double *A, std::ptrdiff_t incRowA, std::ptrdiff_t incColA,
const double *B, std::ptrdiff_t incRowB, std::ptrdiff_t incColB,
double beta,
double *C, std::ptrdiff_t incRowC, std::ptrdiff_t incColC)
{
if (beta!=1) {
if (beta!=0) {
for (std::size_t j=0; j<n; ++j) {
for (std::size_t i=0; i<m; ++i) {
C[i*incRowC+j*incColC] *= beta;
}
}
} else {
for (std::size_t j=0; j<n; ++j) {
for (std::size_t i=0; i<m; ++i) {
C[i*incRowC+j*incColC] = 0;
}
}
}
}
if (k==0 || alpha==0) {
return;
}
for (std::size_t j=0; j<n; ++j) {
for (std::size_t l=0; l<k; ++l) {
for (std::size_t i=0; i<m; ++i) {
C[i*incRowC+j*incColC] += alpha*A[i*incRowA+l*incColA]
*B[l*incRowB+j*incColB];
}
}
}
}

//------------------------------------------------------------------------------

#ifndef DUGEMM_MR_DEFAULT
#define DUGEMM_MR_DEFAULT   4
#endif

#ifndef DUGEMM_NR_DEFAULT
#define DUGEMM_NR_DEFAULT   8
#endif

namespace ulmblas {

namespace dugemm_parameter {
std::size_t MR = DUGEMM_MR_DEFAULT;
std::size_t NR = DUGEMM_NR_DEFAULT;
} // namespace dugemm_parameter

void
ugemm_ref(std::size_t k, double alpha,
const double *A, const double *B,
double beta,
double *C, std::ptrdiff_t incRowC, std::ptrdiff_t incColC)
{
using namespace dugemm_parameter;

double AB[MR*NR];

for (std::size_t i=0; i<MR*NR; ++i) {
AB[i] = 0;
}
for (std::size_t l=0; l<k; ++l) {
for (std::size_t i=0; i<MR; ++i) {
for (std::size_t j=0; j<NR; ++j) {
AB[i*NR + j] += A[i]*B[j];
}
}
A += MR;
B += NR;
}
// Yeah, this is unnecessary if (alpha==0). But ok ...
for (std::size_t i=0; i<MR*NR; ++i) {
AB[i] *= alpha;
}
// This check for beta is really necessary
if (beta!=0) {
for (std::size_t j=0; j<NR; ++j) {
for (std::size_t i=0; i<MR; ++i) {
C[i*incRowC+j*incColC] *= beta;
C[i*incRowC+j*incColC] += AB[i*NR+j];
}
}
} else {
for (std::size_t j=0; j<NR; ++j) {
for (std::size_t i=0; i<MR; ++i) {
C[i*incRowC+j*incColC] = AB[i*NR+j];
}
}
}
}

} // namespace ulmblas

//------------------------------------------------------------------------------

#ifndef ALPHA
#define ALPHA   1
#endif

#ifndef BETA
#define BETA    1
#endif

#ifndef DIM_K
#define DIM_K   128
#endif

#define DIM_MR  DUGEMM_MR_DEFAULT
#define DIM_NR  DUGEMM_NR_DEFAULT

#ifndef COLMAJOR_C
#define COLMAJOR_C 1
#endif

double A[DIM_MR*DIM_K];
double B[DIM_K*DIM_NR];
double C0[DIM_MR*DIM_NR];
double Cref[DIM_MR*DIM_NR];
double Ctst[DIM_MR*DIM_NR];

int
main()
{
/* TODO: your test code */
}