# Improving the solution of quiz 03

#### Content

• In Session 5 we have shown that using buffers and blocking can improve the performance of the GEMM operation. The idea was kept rather simple but the benchmark results were very promising.

• In Session 10 we tried to improve the idea by taking into account that we have a cache hierarchy (and not just one type of cache). The final implementation for this idea was given as assignment in quiz 03.

We first present possible solutions for both quizzes. We then compare the results in common benchmark program gemm_session12.cpp.

## Solutions for quiz 03

Here is a possible solution for quiz 03:

#include <cassert>
#include <cfloat>
#include <cstdlib>
#include <cstddef>
#include <cmath>

#include <algorithm>
#include <complex>
#include <map>
#include <cstdlib>
#include <string>
#include <printf.hpp>

#include <sys/times.h>
#include <unistd.h>

//------------------------------------------------------------------------------

//
// From session 8, page 2
//

namespace tools {

struct DoubleArray
{
DoubleArray(std::size_t n)
: ptr(new double[n])
{
if (!ptr) {
std::abort();
}
}

~DoubleArray()
{
delete [] ptr;
}

operator double *() const
{
return ptr;
}

double * const ptr;

};

} // namespace tools

namespace ulmblas {

void
gecopy(std::size_t m, std::size_t n,
const double *A, std::ptrdiff_t incRowA, std::ptrdiff_t incColA,
double *B, std::ptrdiff_t incRowB, std::ptrdiff_t incColB)
{
if (m==0 || n==0) {
return;
}
// if B is row major:   B^T <- A^T
if (incRowB>incColB) {
gecopy(n, m, A, incColA, incRowA, B, incColB, incRowB);
return;
}
// B is col major:
for (std::size_t j=0; j<n; ++j) {
for (std::size_t i=0; i<m; ++i) {
B[i*incRowB+j*incColB] = A[i*incRowA+j*incColA];
}
}
}

void
gescal(std::size_t m, std::size_t n, double alpha,
double *A, std::ptrdiff_t incRowA, std::ptrdiff_t incColA)
{
if (m==0 || n==0 || alpha==1) {
return;
}
// A is row major: scale A^T
if (incRowA>incColA) {
gescal(n, m, alpha, A, incColA, incRowA);
return;
}
// A is col major:
if (alpha!=0) {
for (std::size_t j=0; j<n; ++j) {
for (std::size_t i=0; i<m; ++i) {
A[i*incRowA+j*incColA] *= alpha;
}
}
} else {
for (std::size_t j=0; j<n; ++j) {
for (std::size_t i=0; i<m; ++i) {
A[i*incRowA+j*incColA] = 0;
}
}
}
}

void
geaxpy(std::size_t m, std::size_t n, double alpha,
const double *A, std::ptrdiff_t incRowA, std::ptrdiff_t incColA,
double *B, std::ptrdiff_t incRowB, std::ptrdiff_t incColB)
{
if (m==0 || n==0 || alpha==0) {
return;
}
// if B is row major:   B^T <- alpha*A^T + B^T
if (incRowB>incColB) {
geaxpy(n, m, alpha, A, incColA, incRowA, B, incColB, incRowB);
return;
}
// B is col major:
for (std::size_t j=0; j<n; ++j) {
for (std::size_t i=0; i<m; ++i) {
B[i*incRowB+j*incColB] += alpha*A[i*incRowA+j*incColA];
}
}
}

// from session 9, page 4
void
pack_A(std::size_t M, std::size_t K,
const double *A, std::ptrdiff_t incRowA, std::ptrdiff_t incColA,
std::size_t M_R,
double *p)
{
std::size_t m_p = (M + M_R - 1) / M_R;

if (incRowA<incColA) {
for (std::size_t J=0; J<K; ++J) {
for (std::size_t I=0; I<M_R*m_p; ++I) {
std::size_t mu = M_R*K*(I/M_R) + J*M_R + (I % M_R);

p[mu] = (I<M) ? A[I*incRowA+J*incColA]
: 0;
}
}
} else {
for (std::size_t I=0; I<M_R*m_p; ++I) {
for (std::size_t J=0; J<K; ++J) {
std::size_t mu = M_R*K*(I/M_R) + J*M_R + (I % M_R);

p[mu] = (I<M) ? A[I*incRowA+J*incColA]
: 0;
}
}
}
}

// from session 9, page 6
void
pack_B(std::size_t K, std::size_t N,
const double *B, std::ptrdiff_t incRowB, std::ptrdiff_t incColB,
std::size_t N_R,
double *p)
{
pack_A(N, K, B, incColB, incRowB, N_R, p);
}

// from session 10, page 1
#ifndef DUGEMM_MR_DEFAULT
#define DUGEMM_MR_DEFAULT   4
#endif

#ifndef DUGEMM_NR_DEFAULT
#define DUGEMM_NR_DEFAULT   8
#endif

namespace dugemm_parameter {
std::size_t MR = DUGEMM_MR_DEFAULT;
std::size_t NR = DUGEMM_NR_DEFAULT;
} // namespace dugemm_parameter

void
ugemm_ref(std::size_t k, double alpha,
const double *A, const double *B,
double beta,
double *C, std::ptrdiff_t incRowC, std::ptrdiff_t incColC)
{
using namespace dugemm_parameter;

double AB[MR*NR];

for (std::size_t i=0; i<MR*NR; ++i) {
AB[i] = 0;
}
for (std::size_t l=0; l<k; ++l) {
for (std::size_t i=0; i<MR; ++i) {
for (std::size_t j=0; j<NR; ++j) {
AB[i*NR + j] += A[i]*B[j];
}
}
A += MR;
B += NR;
}
// Yeah, this is unnecessary if (alpha==0). But ok ...
for (std::size_t i=0; i<MR*NR; ++i) {
AB[i] *= alpha;
}
// This check for beta is really necessary
if (beta!=0) {
for (std::size_t j=0; j<NR; ++j) {
for (std::size_t i=0; i<MR; ++i) {
C[i*incRowC+j*incColC] *= beta;
C[i*incRowC+j*incColC] += AB[i*NR+j];
}
}
} else {
for (std::size_t j=0; j<NR; ++j) {
for (std::size_t i=0; i<MR; ++i) {
C[i*incRowC+j*incColC] = AB[i*NR+j];
}
}
}
}

void
mgemm(std::size_t M, std::size_t N, std::size_t K,
double alpha,
const double *A, const double *B,
double beta,
double *C, std::ptrdiff_t incRowC, std::ptrdiff_t incColC)
{
using namespace dugemm_parameter;

std::size_t mp = (M+MR-1) / MR;
std::size_t np = (N+NR-1) / NR;

std::size_t mr_ = M % MR;
std::size_t nr_ = N % NR;

double C_[MR*NR];

for (std::size_t j=0; j<np; ++j) {
std::size_t nr = (j<np-1 || nr_==0) ? NR
: nr_;
for (std::size_t i=0; i<mp; ++i) {
std::size_t mr = (i<mp-1 || mr_==0) ? MR
: mr_;
if (mr==MR && nr==NR) {
ugemm_ref(K, alpha,
&A[i*MR*K], &B[j*K*NR],
beta,
&C[i*MR*incRowC+j*NR*incColC], incRowC, incColC);
} else {
ugemm_ref(K, alpha,
&A[i*MR*K], &B[j*K*NR],
0,
C_, 1, MR);
gescal(mr, nr, beta,
&C[i*MR*incRowC+j*NR*incColC], incRowC, incColC);
geaxpy(mr, nr, 1,
C_, 1, MR,
&C[i*MR*incRowC+j*NR*incColC], incRowC, incColC);
}
}
}
}

// from session 10, page 7
#ifndef DGEMM_MC_DEFAULT
#define DGEMM_MC_DEFAULT   256
#endif

#ifndef DGEMM_NC_DEFAULT
#define DGEMM_NC_DEFAULT   2048
#endif

#ifndef DGEMM_KC_DEFAULT
#define DGEMM_KC_DEFAULT   256
#endif

namespace dgemm_parameter {
std::size_t MC = DGEMM_MC_DEFAULT;
std::size_t NC = DGEMM_NC_DEFAULT;
std::size_t KC = DGEMM_KC_DEFAULT;
} // namespace dgemm_parameter

void
gemm(std::size_t m, std::size_t n, std::size_t k,
double alpha,
const double *A, std::ptrdiff_t incRowA, std::ptrdiff_t incColA,
const double *B, std::ptrdiff_t incRowB, std::ptrdiff_t incColB,
double beta,
double *C, std::ptrdiff_t incRowC, std::ptrdiff_t incColC)
{
using namespace dugemm_parameter;
using namespace dgemm_parameter;

assert(MC % MR == 0);
assert(NC % NR == 0);

if (alpha==0 || k==0) {
gescal(m, n, beta, C, incRowC, incColC);
return;
}

std::size_t mb = (m + MC-1)/MC;
std::size_t nb = (n + NC-1)/NC;
std::size_t kb = (k + KC-1)/KC;

std::size_t mc_ = m % MC;
std::size_t nc_ = n % NC;
std::size_t kc_ = k % KC;

tools::DoubleArray A_(MC*KC);
tools::DoubleArray B_(KC*NC);

for (std::size_t j=0; j<nb; ++j) {
std::size_t N = (j<nb-1 || nc_==0) ? NC
: nc_;
for (std::size_t l=0; l<kb; ++l) {
std::size_t K = (l<kb-1 || kc_==0) ? KC
: kc_;
double beta_ = (l==0) ? beta
: 1;
pack_B(K, N,
&B[l*KC*incRowB+j*NC*incColB], incRowB, incColB, NR,
B_);

for (std::size_t i=0; i<mb; ++i) {
std::size_t M = (i<mb-1 || mc_==0) ? MC
: mc_;

pack_A(M, K,
&A[i*MC*incRowA+l*KC*incColA], incRowA, incColA, MR,
A_);

mgemm(M, N, K, alpha, A_, B_, beta_,
&C[i*MC*incRowC+j*NC*incColC], incRowC, incColC);
}
}
}
}

} // namespace ulmblas

extern "C" {

void
ulm_dgemm_(const char *transA, const char *transB,
const int *m, const int *n, const int *k,
const double *alpha,
const double *A, const int *ldA, const double *B, const int *ldB,
const double *beta,
double *C, const int *ldC)
{
bool ntA = *transA=='N' || *transA=='n';
bool ntB = *transB=='N' || *transB=='n';

std::ptrdiff_t incRowA = ntA ? 1 : *ldA;
std::ptrdiff_t incColA = ntA ? *ldA : 1;

std::ptrdiff_t incRowB = ntB ? 1 : *ldB;
std::ptrdiff_t incColB = ntB ? *ldB : 1;

ulmblas::gemm(*m, *n, *k, *alpha,
A, incRowA, incColA, B, incRowB, incColB,
*beta, C, 1, *ldC);
}

} // extern "C"



This can be tested with

#include <cassert>
#include <cstdlib>
#include <cstddef>
#include <cmath>
#include <cfloat>
#include <printf.hpp>
#include <sys/times.h>
#include <unistd.h>

//------------------------------------------------------------------------------

#include "gemm.hpp"

//------------------------------------------------------------------------------

namespace test{

double
wallTime() {
static int ticks_per_second = 0;
if (!ticks_per_second) {
ticks_per_second = sysconf(_SC_CLK_TCK);
}
struct tms timebuf;
/* times returns the number of real time ticks passed since start */
return (double) times(&timebuf) / ticks_per_second;
}

void
randMatrix(std::size_t m, std::size_t n,
double *A, std::ptrdiff_t incRowA, std::ptrdiff_t incColA)
{
for (std::size_t j=0; j<n; ++j) {
for (std::size_t i=0; i<m; ++i) {
A[i*incRowA+j*incColA] = ((double)rand() - RAND_MAX/2)*2/RAND_MAX;
}
}
}

void
nanMatrix(std::size_t m, std::size_t n,
double *A, std::ptrdiff_t incRowA, std::ptrdiff_t incColA)
{
for (std::size_t j=0; j<n; ++j) {
for (std::size_t i=0; i<m; ++i) {
A[i*incRowA+j*incColA] = std::nan("");
}
}
}

void
printMatrix(std::size_t m, std::size_t n,
const double *A, std::ptrdiff_t incRowA, std::ptrdiff_t incColA)
{
for (std::size_t i=0; i<m; ++i) {
for (std::size_t j=0; j<n; ++j) {
fmt::printf("%10.3lf ", A[i*incRowA+j*incColA]);
}
fmt::printf("\n");
}
fmt::printf("\n");
}

double
genorm_inf(std::size_t m, std::size_t n,
const double *A, std::ptrdiff_t incRowA, std::ptrdiff_t incColA)
{
double res = 0;
for (std::size_t i=0; i<m; ++i) {
double asum = 0;
for (std::size_t j=0; j<n; ++j) {
asum += std::fabs(A[i*incRowA+j*incColA]);
}
if (std::isnan(asum)) {
return asum;
}
if (asum>res) {
res = asum;
}
}
return res;
}

void
gescal(std::size_t m, std::size_t n, double alpha,
double *A, std::ptrdiff_t incRowA, std::ptrdiff_t incColA)
{
if (m==0 || n==0 || alpha==1) {
return;
}
// A is row major: scale A^T
if (incRowA>incColA) {
gescal(n, m, alpha, A, incColA, incRowA);
return;
}
// A is col major:
if (alpha!=0) {
for (std::size_t j=0; j<n; ++j) {
for (std::size_t i=0; i<m; ++i) {
A[i*incRowA+j*incColA] *= alpha;
}
}
} else {
for (std::size_t j=0; j<n; ++j) {
for (std::size_t i=0; i<m; ++i) {
A[i*incRowA+j*incColA] = 0;
}
}
}
}

void
geaxpy(std::size_t m, std::size_t n, double alpha,
const double *A, std::ptrdiff_t incRowA, std::ptrdiff_t incColA,
double *B, std::ptrdiff_t incRowB, std::ptrdiff_t incColB)
{
if (m==0 || n==0 || alpha==0) {
return;
}
// if B is row major:   B^T <- alpha*A^T + B^T
if (incRowB>incColB) {
geaxpy(n, m, alpha, A, incColA, incRowA, B, incColB, incRowB);
return;
}
// B is col major:
for (std::size_t j=0; j<n; ++j) {
for (std::size_t i=0; i<m; ++i) {
B[i*incRowB+j*incColB] += alpha*A[i*incRowA+j*incColA];
}
}
}

void
gecopy(std::size_t m, std::size_t n,
const double *A, std::ptrdiff_t incRowA, std::ptrdiff_t incColA,
double *B, std::ptrdiff_t incRowB, std::ptrdiff_t incColB)
{
if (m==0 || n==0) {
return;
}
// if B is row major:   B^T <- A^T
if (incRowB>incColB) {
gecopy(n, m, A, incColA, incRowA, B, incColB, incRowB);
return;
}
// B is col major:
for (std::size_t j=0; j<n; ++j) {
for (std::size_t i=0; i<m; ++i) {
B[i*incRowB+j*incColB] = A[i*incRowA+j*incColA];
}
}
}

#define MAX(x,y)    ((x)>(y)) ? (x) : (y)

double
gemm_err_est(std::size_t m, std::size_t n, std::size_t k,
double alpha,
const double *A, std::ptrdiff_t incRowA, std::ptrdiff_t incColA,
const double *B, std::ptrdiff_t incRowB, std::ptrdiff_t incColB,
const double *C0, std::ptrdiff_t incRowC0, std::ptrdiff_t incColC0,
double beta,
const double *C_, std::ptrdiff_t incRowC_, std::ptrdiff_t incColC_,
double *C, std::ptrdiff_t incRowC, std::ptrdiff_t incColC)
{
geaxpy(m, n, -1, C_, incRowC_, incColC_, C, incRowC, incColC);

double normD  = genorm_inf(m, n, C, incRowC, incColC);
std::size_t N = MAX(m, MAX(n, k));

if (std::isnan(normD)) {
return normD;
}

if (normD==0) {
return 0;
}

double normA = 0;
double normB = 0;

if (alpha!=0) {
normB  = genorm_inf(k, n, B, incRowB, incColB);
normA  = genorm_inf(m, k, A, incRowA, incColA);
normA  *= fabs(alpha);
}

double normC0 = 0;
if (beta!=0) {
normC0 = genorm_inf(m, n, C0, incRowC0, incColC0);
normC0 *= fabs(beta);
}

return normD/(DBL_EPSILON*(N*normA*normB+normC0));
}

} // namespace test

//------------------------------------------------------------------------------

void
gemm_ref(std::size_t m, std::size_t n, std::size_t k,
double alpha,
const double *A, std::ptrdiff_t incRowA, std::ptrdiff_t incColA,
const double *B, std::ptrdiff_t incRowB, std::ptrdiff_t incColB,
double beta,
double *C, std::ptrdiff_t incRowC, std::ptrdiff_t incColC)
{
if (beta!=1) {
if (beta!=0) {
for (std::size_t j=0; j<n; ++j) {
for (std::size_t i=0; i<m; ++i) {
C[i*incRowC+j*incColC] *= beta;
}
}
} else {
for (std::size_t j=0; j<n; ++j) {
for (std::size_t i=0; i<m; ++i) {
C[i*incRowC+j*incColC] = 0;
}
}
}
}
if (k==0 || alpha==0) {
return;
}
for (std::size_t j=0; j<n; ++j) {
for (std::size_t l=0; l<k; ++l) {
for (std::size_t i=0; i<m; ++i) {
C[i*incRowC+j*incColC] += alpha*A[i*incRowA+l*incColA]
*B[l*incRowB+j*incColB];
}
}
}
}

//------------------------------------------------------------------------------

#ifndef ALPHA
#define ALPHA   1
#endif

#ifndef BETA
#define BETA    1
#endif

#ifndef DIM_MAX_M
#define DIM_MAX_M   2000
#endif

#ifndef DIM_MAX_N
#define DIM_MAX_N   2000
#endif

#ifndef DIM_MAX_K
#define DIM_MAX_K   2000
#endif

#ifndef COLMAJOR_C
#define COLMAJOR_C 1
#endif

#ifndef COLMAJOR_A
#define COLMAJOR_A 1
#endif

#ifndef COLMAJOR_B
#define COLMAJOR_B 1
#endif

int
main()
{
fmt::printf("#Configuration:\n");
fmt::printf("#\tMC = %5zu\n", ulmblas::dgemm_parameter::MC);
fmt::printf("#\tNC = %5zu\n", ulmblas::dgemm_parameter::NC);
fmt::printf("#\tKC = %5zu\n", ulmblas::dgemm_parameter::KC);
fmt::printf("#\tMR = %5zu\n", ulmblas::dugemm_parameter::MR);
fmt::printf("#\tNR = %5zu\n", ulmblas::dugemm_parameter::NR);

fmt::printf("#\n");
fmt::printf("#Benchmark:\n");
fmt::printf("#%7s %7s %7s %7s %7s %7s %7s %7s %7s %7s %7s %12s "
"%7s %7s %12s %12s\n",
"MR", "NR", "k", "incRowC", "incColC",
"incRowA", "incColA", "incRowB", "incColB",
"alpha", "beta", "error", "tRef", "tTst",
"mflops: ref", "tst");
for (std::size_t m=300, n=300, k=300;
m <= DIM_MAX_M && n <= DIM_MAX_N && k <= DIM_MAX_K;
m += 100, n +=100, k += 100)
{
std::size_t incRowC = (COLMAJOR_C) ? 1 : n;
std::size_t incColC = (COLMAJOR_C) ? m : 1;

std::size_t incRowA = (COLMAJOR_A) ? 1 : k;
std::size_t incColA = (COLMAJOR_A) ? m : 1;

std::size_t incRowB = (COLMAJOR_B) ? 1 : n;
std::size_t incColB = (COLMAJOR_B) ? k : 1;

tools::DoubleArray A(m*k);
tools::DoubleArray B(k*n);
tools::DoubleArray C0(m*n);
tools::DoubleArray Cref(m*n);
tools::DoubleArray Ctst(m*n);

if (BETA==0) {
test::nanMatrix(m, n, C0, incRowC, incColC);
} else {
test::randMatrix(m, n, C0, incRowC, incColC);
}
if (ALPHA==0) {
test::nanMatrix(m, k, A, incRowA, incColA);
test::nanMatrix(k, n, B, incRowB, incColB);
} else {
test::randMatrix(m, k, A, incRowA, incColA);
test::randMatrix(k, n, B, incRowB, incColB);
}

// call reference implementation
test::gecopy(m, n,
C0, incRowC, incColC,
Cref, incRowC, incColC);
double tRef = test::wallTime();
gemm_ref(m, n, k,
ALPHA,
A, incRowA, incColA,
B, incRowB, incColB,
BETA,
Cref, incRowC, incColC);
tRef = test::wallTime() - tRef;

// call, test and bench other implementation

int     runs = 0;
double  tTst = 0;
do {
test::gecopy(m, n,
C0, incRowC, incColC,
Ctst, incRowC, incColC);
double t = test::wallTime();
ulmblas::gemm(m, n, k,
ALPHA,
A, incRowA, incColA,
B, incRowB, incColB,
BETA,
Ctst, incRowC, incColC);
tTst += test::wallTime() - t;
++runs;
} while (tTst < 1);
tTst /= runs;

double err = test::gemm_err_est(m, n, k,
ALPHA,
A, incRowA, incColA,
B, incRowB, incColB,
C0, incRowC, incColC,
BETA,
Cref, incRowC, incColC,
Ctst, incRowC, incColC);

double mflop = 2.*m/1000*n/1000*k;

fmt::printf(" %7zu %7zu %7zu %7zu %7zu %7zu %7zu %7zu %7zu "
"%7.2lf %7.2lf %12.2e %7.2lf %7.2lf %12.2lf %12.2lf\n",
m, n, k, incRowC, incColC, incRowA, incColA,
incRowB, incColB, ALPHA, BETA, err, tRef, tTst,
mflop/tRef, mflop/tTst);
}
}


## Benchmarks (on the Linux machines in E44)

Compiling and running the benchmark on a machine in E44 (here on heim):

heim$g++-7.2 -Wall -std=c++17 -O3 -mavx -o gemm_test gemm_test.cpp heim$ gemm_test | tee bench_gemm.dat
#Configuration:
#	MC =   256
#	NC =  2048
#	KC =   256
#	MR =     4
#	NR =     8
#
#Benchmark:
#     MR      NR       k incRowC incColC incRowA incColA incRowB incColB   alpha    beta        error    tRef    tTst  mflops: ref          tst
300     300     300       1     300       1     300       1     300       1       1     3.83e-04    0.04    0.02      1350.00      2405.94
400     400     400       1     400       1     400       1     400       1       1     3.40e-04    0.05    0.05      2560.00      2584.62
500     500     500       1     500       1     500       1     500       1       1     2.94e-04    0.09    0.10      2777.78      2594.34
600     600     600       1     600       1     600       1     600       1       1     2.52e-04    0.14    0.17      3085.71      2592.00
700     700     700       1     700       1     700       1     700       1       1     2.09e-04    0.24    0.26      2858.33      2588.68
800     800     800       1     800       1     800       1     800       1       1     1.92e-04    0.37    0.39      2767.57      2625.64
900     900     900       1     900       1     900       1     900       1       1     1.66e-04    0.53    0.56      2750.94      2603.57
1000    1000    1000       1    1000       1    1000       1    1000       1       1     1.50e-04    0.76    0.76      2631.58      2631.58
1100    1100    1100       1    1100       1    1100       1    1100       1       1     1.34e-04    0.98    1.01      2716.33      2635.64
1200    1200    1200       1    1200       1    1200       1    1200       1       1     1.23e-04    1.29    1.32      2679.07      2618.18
1300    1300    1300       1    1300       1    1300       1    1300       1       1     1.16e-04    1.62    1.68      2712.35      2615.48
1400    1400    1400       1    1400       1    1400       1    1400       1       1     1.09e-04    2.05    2.08      2677.07      2638.46
1500    1500    1500       1    1500       1    1500       1    1500       1       1     9.90e-05    2.49    2.56      2710.84      2636.72
1600    1600    1600       1    1600       1    1600       1    1600       1       1     9.28e-05    3.03    3.11      2703.63      2634.08
1700    1700    1700       1    1700       1    1700       1    1700       1       1     8.96e-05    3.61    3.73      2721.88      2634.32
1800    1800    1800       1    1800       1    1800       1    1800       1       1     8.37e-05    4.34    4.43      2687.56      2632.96
1900    1900    1900       1    1900       1    1900       1    1900       1       1     7.92e-05    5.05    5.21      2716.44      2633.01
2000    2000    2000       1    2000       1    2000       1    2000       1       1     7.59e-05    5.94    6.06      2693.60      2640.26
heim$cat bench_gemm.dat #Configuration: # MC = 256 # NC = 2048 # KC = 256 # MR = 4 # NR = 8 # #Benchmark: # MR NR k incRowC incColC incRowA incColA incRowB incColB alpha beta error tRef tTst mflops: ref tst 300 300 300 1 300 1 300 1 300 1 1 3.83e-04 0.04 0.02 1350.00 2405.94 400 400 400 1 400 1 400 1 400 1 1 3.40e-04 0.05 0.05 2560.00 2584.62 500 500 500 1 500 1 500 1 500 1 1 2.94e-04 0.09 0.10 2777.78 2594.34 600 600 600 1 600 1 600 1 600 1 1 2.52e-04 0.14 0.17 3085.71 2592.00 700 700 700 1 700 1 700 1 700 1 1 2.09e-04 0.24 0.26 2858.33 2588.68 800 800 800 1 800 1 800 1 800 1 1 1.92e-04 0.37 0.39 2767.57 2625.64 900 900 900 1 900 1 900 1 900 1 1 1.66e-04 0.53 0.56 2750.94 2603.57 1000 1000 1000 1 1000 1 1000 1 1000 1 1 1.50e-04 0.76 0.76 2631.58 2631.58 1100 1100 1100 1 1100 1 1100 1 1100 1 1 1.34e-04 0.98 1.01 2716.33 2635.64 1200 1200 1200 1 1200 1 1200 1 1200 1 1 1.23e-04 1.29 1.32 2679.07 2618.18 1300 1300 1300 1 1300 1 1300 1 1300 1 1 1.16e-04 1.62 1.68 2712.35 2615.48 1400 1400 1400 1 1400 1 1400 1 1400 1 1 1.09e-04 2.05 2.08 2677.07 2638.46 1500 1500 1500 1 1500 1 1500 1 1500 1 1 9.90e-05 2.49 2.56 2710.84 2636.72 1600 1600 1600 1 1600 1 1600 1 1600 1 1 9.28e-05 3.03 3.11 2703.63 2634.08 1700 1700 1700 1 1700 1 1700 1 1700 1 1 8.96e-05 3.61 3.73 2721.88 2634.32 1800 1800 1800 1 1800 1 1800 1 1800 1 1 8.37e-05 4.34 4.43 2687.56 2632.96 1900 1900 1900 1 1900 1 1900 1 1900 1 1 7.92e-05 5.05 5.21 2716.44 2633.01 2000 2000 2000 1 2000 1 2000 1 2000 1 1 7.59e-05 5.94 6.06 2693.60 2640.26 heim$ 

Using the gnuplot script

set terminal svg noenhanced size 900, 500
set output "bench_gemm.svg"
set xlabel "Matrix dimensions: m=n=k"
set ylabel "MFLOPS
set title "GEMM (for double) on a machine in E44"
set key outside
set yrange [0:26000]
plot "bench_gemm.dat" using 1:15 with linespoints lt 1 lw 2 title "dgemm_colmajor", \
"bench_gemm.dat" using 1:16 with linespoints lt 3 lw 2 title "ulmblas::dgemm (step 1)"


we get by running it through gnuplot

theon$gnuplot gemm.gnuplot theon$ 

this plot:

## Conclusion

So far all our effort did not pay off!

## Exercise

• Confirm the benchmarks (i.e. compile gemm_test.cpp as shown above and run the benchmark).

• The parameters M_R and N_R used in the packing functions should be know at compile time. This will allow the compiler to apply loop unrolling.

Change the signature of the packing functions to

template <std::size_t M_R>
void
pack_A(std::size_t M, std::size_t K,
const double *A, std::ptrdiff_t incRowA, std::ptrdiff_t incColA,
double *p)
{
// ...
}

template <std::size_t N_R>
void
pack_B(std::size_t K, std::size_t N,
const double *B, std::ptrdiff_t incRowB, std::ptrdiff_t incColB,
double *p)
{
// ...
}


Further modifications:

• Adapt the implementation of the packing functions accordingly

• and also calls of the packing function in the frame algorithm.

• Run the benchmark after applying all modifications.