Improving the solution of quiz 03

Content

The road so far:

We first present possible solutions for both quizzes. We then compare the results in common benchmark program gemm_session12.cpp.

Solutions for quiz 03

Here is a possible solution for quiz 03:

#include <cassert>
#include <cfloat>
#include <cstdlib>
#include <cstddef>
#include <cmath>

#include <algorithm>
#include <complex>
#include <map>
#include <cstdlib>
#include <string>
#include <printf.hpp>

#include <sys/times.h>
#include <unistd.h>

//------------------------------------------------------------------------------

//
// From session 8, page 2
//

namespace tools {

struct DoubleArray
{
    DoubleArray(std::size_t n)
        : ptr(new double[n])
    {
        if (!ptr) {
            std::abort();
        }
    }

    ~DoubleArray()
    {
        delete [] ptr;
    }

    operator double *() const
    {
        return ptr;
    }

    double * const ptr;

};

} // namespace tools


namespace ulmblas {

void
gecopy(std::size_t m, std::size_t n,
       const double *A, std::ptrdiff_t incRowA, std::ptrdiff_t incColA,
       double *B, std::ptrdiff_t incRowB, std::ptrdiff_t incColB)
{
    if (m==0 || n==0) {
        return;
    }
    // if B is row major:   B^T <- A^T
    if (incRowB>incColB) {
        gecopy(n, m, A, incColA, incRowA, B, incColB, incRowB);
        return;
    }
    // B is col major:
    for (std::size_t j=0; j<n; ++j) {
        for (std::size_t i=0; i<m; ++i) {
            B[i*incRowB+j*incColB] = A[i*incRowA+j*incColA];
        }
    }
}

void
gescal(std::size_t m, std::size_t n, double alpha,
       double *A, std::ptrdiff_t incRowA, std::ptrdiff_t incColA)
{
    if (m==0 || n==0 || alpha==1) {
        return;
    }
    // A is row major: scale A^T
    if (incRowA>incColA) {
        gescal(n, m, alpha, A, incColA, incRowA);
        return;
    }
    // A is col major:
    if (alpha!=0) {
        for (std::size_t j=0; j<n; ++j) {
            for (std::size_t i=0; i<m; ++i) {
                A[i*incRowA+j*incColA] *= alpha;
            }
        }
    } else {
        for (std::size_t j=0; j<n; ++j) {
            for (std::size_t i=0; i<m; ++i) {
                A[i*incRowA+j*incColA] = 0;
            }
        }
    }
}

void
geaxpy(std::size_t m, std::size_t n, double alpha,
       const double *A, std::ptrdiff_t incRowA, std::ptrdiff_t incColA,
       double *B, std::ptrdiff_t incRowB, std::ptrdiff_t incColB)
{
    if (m==0 || n==0 || alpha==0) {
        return;
    }
    // if B is row major:   B^T <- alpha*A^T + B^T
    if (incRowB>incColB) {
        geaxpy(n, m, alpha, A, incColA, incRowA, B, incColB, incRowB);
        return;
    }
    // B is col major:
    for (std::size_t j=0; j<n; ++j) {
        for (std::size_t i=0; i<m; ++i) {
            B[i*incRowB+j*incColB] += alpha*A[i*incRowA+j*incColA];
        }
    }
}

// from session 9, page 4
void
pack_A(std::size_t M, std::size_t K,
       const double *A, std::ptrdiff_t incRowA, std::ptrdiff_t incColA,
       std::size_t M_R,
       double *p)
{
    std::size_t m_p = (M + M_R - 1) / M_R;

    if (incRowA<incColA) {
        for (std::size_t J=0; J<K; ++J) {
            for (std::size_t I=0; I<M_R*m_p; ++I) {
                std::size_t mu = M_R*K*(I/M_R) + J*M_R + (I % M_R);

                p[mu] = (I<M) ? A[I*incRowA+J*incColA]
                              : 0;
            }
        }
    } else {
        for (std::size_t I=0; I<M_R*m_p; ++I) {
            for (std::size_t J=0; J<K; ++J) {
                std::size_t mu = M_R*K*(I/M_R) + J*M_R + (I % M_R);

                p[mu] = (I<M) ? A[I*incRowA+J*incColA]
                              : 0;
            }
        }
    }
}

// from session 9, page 6
void
pack_B(std::size_t K, std::size_t N,
       const double *B, std::ptrdiff_t incRowB, std::ptrdiff_t incColB,
       std::size_t N_R,
       double *p)
{
    pack_A(N, K, B, incColB, incRowB, N_R, p);
}

// from session 10, page 1
#ifndef DUGEMM_MR_DEFAULT
#define DUGEMM_MR_DEFAULT   4
#endif

#ifndef DUGEMM_NR_DEFAULT
#define DUGEMM_NR_DEFAULT   8
#endif

namespace dugemm_parameter {
    std::size_t MR = DUGEMM_MR_DEFAULT;
    std::size_t NR = DUGEMM_NR_DEFAULT;
} // namespace dugemm_parameter

void
ugemm_ref(std::size_t k, double alpha,
          const double *A, const double *B,
          double beta,
          double *C, std::ptrdiff_t incRowC, std::ptrdiff_t incColC)
{
    using namespace dugemm_parameter;

    double AB[MR*NR];

    for (std::size_t i=0; i<MR*NR; ++i) {
        AB[i] = 0;
    }
    for (std::size_t l=0; l<k; ++l) {
        for (std::size_t i=0; i<MR; ++i) {
            for (std::size_t j=0; j<NR; ++j) {
                AB[i*NR + j] += A[i]*B[j];
            }
        }
        A += MR;
        B += NR;
    }
    // Yeah, this is unnecessary if (alpha==0). But ok ...
    for (std::size_t i=0; i<MR*NR; ++i) {
        AB[i] *= alpha;
    }
    // This check for beta is really necessary
    if (beta!=0) {
        for (std::size_t j=0; j<NR; ++j) {
            for (std::size_t i=0; i<MR; ++i) {
                C[i*incRowC+j*incColC] *= beta;
                C[i*incRowC+j*incColC] += AB[i*NR+j];
            }
        }
    } else {
        for (std::size_t j=0; j<NR; ++j) {
            for (std::size_t i=0; i<MR; ++i) {
                C[i*incRowC+j*incColC] = AB[i*NR+j];
            }
        }
    }
}

void
mgemm(std::size_t M, std::size_t N, std::size_t K,
      double alpha,
      const double *A, const double *B,
      double beta,
      double *C, std::ptrdiff_t incRowC, std::ptrdiff_t incColC)
{
    using namespace dugemm_parameter;

    std::size_t mp = (M+MR-1) / MR;
    std::size_t np = (N+NR-1) / NR;

    std::size_t mr_ = M % MR;
    std::size_t nr_ = N % NR;

    double C_[MR*NR];

    for (std::size_t j=0; j<np; ++j) {
        std::size_t nr = (j<np-1 || nr_==0) ? NR
                                            : nr_;
        for (std::size_t i=0; i<mp; ++i) {
            std::size_t mr = (i<mp-1 || mr_==0) ? MR
                                                : mr_;
            if (mr==MR && nr==NR) {
                ugemm_ref(K, alpha,
                          &A[i*MR*K], &B[j*K*NR],
                          beta,
                          &C[i*MR*incRowC+j*NR*incColC], incRowC, incColC);
            } else {
                ugemm_ref(K, alpha,
                          &A[i*MR*K], &B[j*K*NR],
                          0,
                          C_, 1, MR);
                gescal(mr, nr, beta,
                       &C[i*MR*incRowC+j*NR*incColC], incRowC, incColC);
                geaxpy(mr, nr, 1,
                       C_, 1, MR,
                       &C[i*MR*incRowC+j*NR*incColC], incRowC, incColC);
            }
        }
    }
}

// from session 10, page 7
#ifndef DGEMM_MC_DEFAULT
#define DGEMM_MC_DEFAULT   256
#endif

#ifndef DGEMM_NC_DEFAULT
#define DGEMM_NC_DEFAULT   2048
#endif

#ifndef DGEMM_KC_DEFAULT
#define DGEMM_KC_DEFAULT   256
#endif

namespace dgemm_parameter {
    std::size_t MC = DGEMM_MC_DEFAULT;
    std::size_t NC = DGEMM_NC_DEFAULT;
    std::size_t KC = DGEMM_KC_DEFAULT;
} // namespace dgemm_parameter

void
gemm(std::size_t m, std::size_t n, std::size_t k,
      double alpha,
      const double *A, std::ptrdiff_t incRowA, std::ptrdiff_t incColA,
      const double *B, std::ptrdiff_t incRowB, std::ptrdiff_t incColB,
      double beta,
      double *C, std::ptrdiff_t incRowC, std::ptrdiff_t incColC)
{
    using namespace dugemm_parameter;
    using namespace dgemm_parameter;

    assert(MC % MR == 0);
    assert(NC % NR == 0);

    if (alpha==0 || k==0) {
        gescal(m, n, beta, C, incRowC, incColC);
        return;
    }

    std::size_t mb = (m + MC-1)/MC;
    std::size_t nb = (n + NC-1)/NC;
    std::size_t kb = (k + KC-1)/KC;

    std::size_t mc_ = m % MC;
    std::size_t nc_ = n % NC;
    std::size_t kc_ = k % KC;

    tools::DoubleArray A_(MC*KC);
    tools::DoubleArray B_(KC*NC);

    for (std::size_t j=0; j<nb; ++j) {
        std::size_t N = (j<nb-1 || nc_==0) ? NC
                                           : nc_;
        for (std::size_t l=0; l<kb; ++l) {
            std::size_t K = (l<kb-1 || kc_==0) ? KC
                                               : kc_;
            double beta_ = (l==0) ? beta
                                  : 1;
            pack_B(K, N,
                   &B[l*KC*incRowB+j*NC*incColB], incRowB, incColB, NR,
                   B_);

            for (std::size_t i=0; i<mb; ++i) {
                std::size_t M = (i<mb-1 || mc_==0) ? MC
                                                   : mc_;

                pack_A(M, K,
                       &A[i*MC*incRowA+l*KC*incColA], incRowA, incColA, MR,
                       A_);

                mgemm(M, N, K, alpha, A_, B_, beta_,
                      &C[i*MC*incRowC+j*NC*incColC], incRowC, incColC);
            }
        }
    }
}

} // namespace ulmblas

extern "C" {

void
ulm_dgemm_(const char *transA, const char *transB,
           const int *m, const int *n, const int *k,
           const double *alpha,
           const double *A, const int *ldA, const double *B, const int *ldB,
           const double *beta,
           double *C, const int *ldC)
{
    bool ntA = *transA=='N' || *transA=='n';
    bool ntB = *transB=='N' || *transB=='n';

    std::ptrdiff_t incRowA = ntA ? 1 : *ldA;
    std::ptrdiff_t incColA = ntA ? *ldA : 1;

    std::ptrdiff_t incRowB = ntB ? 1 : *ldB;
    std::ptrdiff_t incColB = ntB ? *ldB : 1;

    ulmblas::gemm(*m, *n, *k, *alpha,
                  A, incRowA, incColA, B, incRowB, incColB,
                  *beta, C, 1, *ldC);
}

} // extern "C"


This can be tested with

#include <cassert>
#include <cstdlib>
#include <cstddef>
#include <cmath>
#include <cfloat>
#include <printf.hpp>
#include <sys/times.h>
#include <unistd.h>

//------------------------------------------------------------------------------

#include "gemm.hpp"

//------------------------------------------------------------------------------

namespace test{

double
wallTime() {
    static int ticks_per_second = 0;
    if (!ticks_per_second) {
        ticks_per_second = sysconf(_SC_CLK_TCK);
    }
    struct tms timebuf;
    /* times returns the number of real time ticks passed since start */
    return (double) times(&timebuf) / ticks_per_second;
}


void
randMatrix(std::size_t m, std::size_t n,
           double *A, std::ptrdiff_t incRowA, std::ptrdiff_t incColA)
{
    for (std::size_t j=0; j<n; ++j) {
        for (std::size_t i=0; i<m; ++i) {
            A[i*incRowA+j*incColA] = ((double)rand() - RAND_MAX/2)*2/RAND_MAX;
        }
    }
}

void
nanMatrix(std::size_t m, std::size_t n,
          double *A, std::ptrdiff_t incRowA, std::ptrdiff_t incColA)
{
    for (std::size_t j=0; j<n; ++j) {
        for (std::size_t i=0; i<m; ++i) {
            A[i*incRowA+j*incColA] = std::nan("");
        }
    }
}

void
printMatrix(std::size_t m, std::size_t n,
            const double *A, std::ptrdiff_t incRowA, std::ptrdiff_t incColA)
{
    for (std::size_t i=0; i<m; ++i) {
        for (std::size_t j=0; j<n; ++j) {
            fmt::printf("%10.3lf ", A[i*incRowA+j*incColA]);
        }
        fmt::printf("\n");
    }
    fmt::printf("\n");
}

double
genorm_inf(std::size_t m, std::size_t n,
           const double *A, std::ptrdiff_t incRowA, std::ptrdiff_t incColA)
{
    double res = 0;
    for (std::size_t i=0; i<m; ++i) {
        double asum = 0;
        for (std::size_t j=0; j<n; ++j) {
            asum += std::fabs(A[i*incRowA+j*incColA]);
        }
        if (std::isnan(asum)) {
            return asum;
        }
        if (asum>res) {
            res = asum;
        }
    }
    return res;
}

void
gescal(std::size_t m, std::size_t n, double alpha,
       double *A, std::ptrdiff_t incRowA, std::ptrdiff_t incColA)
{
    if (m==0 || n==0 || alpha==1) {
        return;
    }
    // A is row major: scale A^T
    if (incRowA>incColA) {
        gescal(n, m, alpha, A, incColA, incRowA);
        return;
    }
    // A is col major:
    if (alpha!=0) {
        for (std::size_t j=0; j<n; ++j) {
            for (std::size_t i=0; i<m; ++i) {
                A[i*incRowA+j*incColA] *= alpha;
            }
        }
    } else {
        for (std::size_t j=0; j<n; ++j) {
            for (std::size_t i=0; i<m; ++i) {
                A[i*incRowA+j*incColA] = 0;
            }
        }
    }
}

void
geaxpy(std::size_t m, std::size_t n, double alpha,
       const double *A, std::ptrdiff_t incRowA, std::ptrdiff_t incColA,
       double *B, std::ptrdiff_t incRowB, std::ptrdiff_t incColB)
{
    if (m==0 || n==0 || alpha==0) {
        return;
    }
    // if B is row major:   B^T <- alpha*A^T + B^T
    if (incRowB>incColB) {
        geaxpy(n, m, alpha, A, incColA, incRowA, B, incColB, incRowB);
        return;
    }
    // B is col major:
    for (std::size_t j=0; j<n; ++j) {
        for (std::size_t i=0; i<m; ++i) {
            B[i*incRowB+j*incColB] += alpha*A[i*incRowA+j*incColA];
        }
    }
}

void
gecopy(std::size_t m, std::size_t n,
       const double *A, std::ptrdiff_t incRowA, std::ptrdiff_t incColA,
       double *B, std::ptrdiff_t incRowB, std::ptrdiff_t incColB)
{
    if (m==0 || n==0) {
        return;
    }
    // if B is row major:   B^T <- A^T
    if (incRowB>incColB) {
        gecopy(n, m, A, incColA, incRowA, B, incColB, incRowB);
        return;
    }
    // B is col major:
    for (std::size_t j=0; j<n; ++j) {
        for (std::size_t i=0; i<m; ++i) {
            B[i*incRowB+j*incColB] = A[i*incRowA+j*incColA];
        }
    }
}

#define MAX(x,y)    ((x)>(y)) ? (x) : (y)

double
gemm_err_est(std::size_t m, std::size_t n, std::size_t k,
             double alpha,
             const double *A, std::ptrdiff_t incRowA, std::ptrdiff_t incColA,
             const double *B, std::ptrdiff_t incRowB, std::ptrdiff_t incColB,
             const double *C0, std::ptrdiff_t incRowC0, std::ptrdiff_t incColC0,
             double beta,
             const double *C_, std::ptrdiff_t incRowC_, std::ptrdiff_t incColC_,
             double *C, std::ptrdiff_t incRowC, std::ptrdiff_t incColC)
{
    geaxpy(m, n, -1, C_, incRowC_, incColC_, C, incRowC, incColC);

    double normD  = genorm_inf(m, n, C, incRowC, incColC);
    std::size_t N = MAX(m, MAX(n, k));

    if (std::isnan(normD)) {
        return normD;
    }

    if (normD==0) {
        return 0;
    }

    double normA = 0;
    double normB = 0;

    if (alpha!=0) {
        normB  = genorm_inf(k, n, B, incRowB, incColB);
        normA  = genorm_inf(m, k, A, incRowA, incColA);
        normA  *= fabs(alpha);
    }

    double normC0 = 0;
    if (beta!=0) {
        normC0 = genorm_inf(m, n, C0, incRowC0, incColC0);
        normC0 *= fabs(beta);
    }

    return normD/(DBL_EPSILON*(N*normA*normB+normC0));
}

} // namespace test

//------------------------------------------------------------------------------

void
gemm_ref(std::size_t m, std::size_t n, std::size_t k,
         double alpha,
         const double *A, std::ptrdiff_t incRowA, std::ptrdiff_t incColA,
         const double *B, std::ptrdiff_t incRowB, std::ptrdiff_t incColB,
         double beta,
         double *C, std::ptrdiff_t incRowC, std::ptrdiff_t incColC)
{
    if (beta!=1) {
        if (beta!=0) {
            for (std::size_t j=0; j<n; ++j) {
                for (std::size_t i=0; i<m; ++i) {
                    C[i*incRowC+j*incColC] *= beta;
                }
            }
        } else {
            for (std::size_t j=0; j<n; ++j) {
                for (std::size_t i=0; i<m; ++i) {
                    C[i*incRowC+j*incColC] = 0;
                }
            }
        }
    }
    if (k==0 || alpha==0) {
        return;
    }
    for (std::size_t j=0; j<n; ++j) {
        for (std::size_t l=0; l<k; ++l) {
            for (std::size_t i=0; i<m; ++i) {
                C[i*incRowC+j*incColC] += alpha*A[i*incRowA+l*incColA]
                                               *B[l*incRowB+j*incColB];
            }
        }
    }
}

//------------------------------------------------------------------------------

#ifndef ALPHA
#define ALPHA   1
#endif

#ifndef BETA
#define BETA    1
#endif

#ifndef DIM_MAX_M
#define DIM_MAX_M   2000
#endif

#ifndef DIM_MAX_N
#define DIM_MAX_N   2000
#endif

#ifndef DIM_MAX_K
#define DIM_MAX_K   2000
#endif

#ifndef COLMAJOR_C
#define COLMAJOR_C 1
#endif

#ifndef COLMAJOR_A
#define COLMAJOR_A 1
#endif

#ifndef COLMAJOR_B
#define COLMAJOR_B 1
#endif

int
main()
{
    fmt::printf("#Configuration:\n");
    fmt::printf("#\tMC = %5zu\n", ulmblas::dgemm_parameter::MC);
    fmt::printf("#\tNC = %5zu\n", ulmblas::dgemm_parameter::NC);
    fmt::printf("#\tKC = %5zu\n", ulmblas::dgemm_parameter::KC);
    fmt::printf("#\tMR = %5zu\n", ulmblas::dugemm_parameter::MR);
    fmt::printf("#\tNR = %5zu\n", ulmblas::dugemm_parameter::NR);


    fmt::printf("#\n");
    fmt::printf("#Benchmark:\n");
    fmt::printf("#%7s %7s %7s %7s %7s %7s %7s %7s %7s %7s %7s %12s "
                "%7s %7s %12s %12s\n",
                "MR", "NR", "k", "incRowC", "incColC",
                "incRowA", "incColA", "incRowB", "incColB",
                "alpha", "beta", "error", "tRef", "tTst",
                "mflops: ref", "tst");
    for (std::size_t m=300, n=300, k=300;
         m <= DIM_MAX_M && n <= DIM_MAX_N && k <= DIM_MAX_K;
         m += 100, n +=100, k += 100)
    {
        std::size_t incRowC = (COLMAJOR_C) ? 1 : n;
        std::size_t incColC = (COLMAJOR_C) ? m : 1;

        std::size_t incRowA = (COLMAJOR_A) ? 1 : k;
        std::size_t incColA = (COLMAJOR_A) ? m : 1;

        std::size_t incRowB = (COLMAJOR_B) ? 1 : n;
        std::size_t incColB = (COLMAJOR_B) ? k : 1;

        tools::DoubleArray A(m*k);
        tools::DoubleArray B(k*n);
        tools::DoubleArray C0(m*n);
        tools::DoubleArray Cref(m*n);
        tools::DoubleArray Ctst(m*n);

        if (BETA==0) {
            test::nanMatrix(m, n, C0, incRowC, incColC);
        } else {
            test::randMatrix(m, n, C0, incRowC, incColC);
        }
        if (ALPHA==0) {
            test::nanMatrix(m, k, A, incRowA, incColA);
            test::nanMatrix(k, n, B, incRowB, incColB);
        } else {
            test::randMatrix(m, k, A, incRowA, incColA);
            test::randMatrix(k, n, B, incRowB, incColB);
        }

        // call reference implementation
        test::gecopy(m, n,
                     C0, incRowC, incColC,
                     Cref, incRowC, incColC);
        double tRef = test::wallTime();
        gemm_ref(m, n, k,
                 ALPHA,
                 A, incRowA, incColA,
                 B, incRowB, incColB,
                 BETA,
                 Cref, incRowC, incColC);
        tRef = test::wallTime() - tRef;

        // call, test and bench other implementation

        int     runs = 0;
        double  tTst = 0;
        do {
            test::gecopy(m, n,
                         C0, incRowC, incColC,
                         Ctst, incRowC, incColC);
            double t = test::wallTime();
            ulmblas::gemm(m, n, k,
                          ALPHA,
                          A, incRowA, incColA,
                          B, incRowB, incColB,
                          BETA,
                          Ctst, incRowC, incColC);
            tTst += test::wallTime() - t;
            ++runs;
        } while (tTst < 1);
        tTst /= runs;


        double err = test::gemm_err_est(m, n, k,
                                        ALPHA,
                                        A, incRowA, incColA,
                                        B, incRowB, incColB,
                                        C0, incRowC, incColC,
                                        BETA,
                                        Cref, incRowC, incColC,
                                        Ctst, incRowC, incColC);

        double mflop = 2.*m/1000*n/1000*k;

        fmt::printf(" %7zu %7zu %7zu %7zu %7zu %7zu %7zu %7zu %7zu "
                    "%7.2lf %7.2lf %12.2e %7.2lf %7.2lf %12.2lf %12.2lf\n",
                    m, n, k, incRowC, incColC, incRowA, incColA,
                    incRowB, incColB, ALPHA, BETA, err, tRef, tTst,
                    mflop/tRef, mflop/tTst);
    }
}

Benchmarks (on the Linux machines in E44)

Compiling and running the benchmark on a machine in E44 (here on heim):

heim$ g++-7.2 -Wall -std=c++17 -O3 -mavx -o gemm_test gemm_test.cpp
heim$ gemm_test | tee bench_gemm.dat
#Configuration:
#	MC =   256
#	NC =  2048
#	KC =   256
#	MR =     4
#	NR =     8
#
#Benchmark:
#     MR      NR       k incRowC incColC incRowA incColA incRowB incColB   alpha    beta        error    tRef    tTst  mflops: ref          tst
     300     300     300       1     300       1     300       1     300       1       1     3.83e-04    0.04    0.02      1350.00      2405.94
     400     400     400       1     400       1     400       1     400       1       1     3.40e-04    0.05    0.05      2560.00      2584.62
     500     500     500       1     500       1     500       1     500       1       1     2.94e-04    0.09    0.10      2777.78      2594.34
     600     600     600       1     600       1     600       1     600       1       1     2.52e-04    0.14    0.17      3085.71      2592.00
     700     700     700       1     700       1     700       1     700       1       1     2.09e-04    0.24    0.26      2858.33      2588.68
     800     800     800       1     800       1     800       1     800       1       1     1.92e-04    0.37    0.39      2767.57      2625.64
     900     900     900       1     900       1     900       1     900       1       1     1.66e-04    0.53    0.56      2750.94      2603.57
    1000    1000    1000       1    1000       1    1000       1    1000       1       1     1.50e-04    0.76    0.76      2631.58      2631.58
    1100    1100    1100       1    1100       1    1100       1    1100       1       1     1.34e-04    0.98    1.01      2716.33      2635.64
    1200    1200    1200       1    1200       1    1200       1    1200       1       1     1.23e-04    1.29    1.32      2679.07      2618.18
    1300    1300    1300       1    1300       1    1300       1    1300       1       1     1.16e-04    1.62    1.68      2712.35      2615.48
    1400    1400    1400       1    1400       1    1400       1    1400       1       1     1.09e-04    2.05    2.08      2677.07      2638.46
    1500    1500    1500       1    1500       1    1500       1    1500       1       1     9.90e-05    2.49    2.56      2710.84      2636.72
    1600    1600    1600       1    1600       1    1600       1    1600       1       1     9.28e-05    3.03    3.11      2703.63      2634.08
    1700    1700    1700       1    1700       1    1700       1    1700       1       1     8.96e-05    3.61    3.73      2721.88      2634.32
    1800    1800    1800       1    1800       1    1800       1    1800       1       1     8.37e-05    4.34    4.43      2687.56      2632.96
    1900    1900    1900       1    1900       1    1900       1    1900       1       1     7.92e-05    5.05    5.21      2716.44      2633.01
    2000    2000    2000       1    2000       1    2000       1    2000       1       1     7.59e-05    5.94    6.06      2693.60      2640.26
heim$ cat bench_gemm.dat
#Configuration:
#	MC =   256
#	NC =  2048
#	KC =   256
#	MR =     4
#	NR =     8
#
#Benchmark:
#     MR      NR       k incRowC incColC incRowA incColA incRowB incColB   alpha    beta        error    tRef    tTst  mflops: ref          tst
     300     300     300       1     300       1     300       1     300       1       1     3.83e-04    0.04    0.02      1350.00      2405.94
     400     400     400       1     400       1     400       1     400       1       1     3.40e-04    0.05    0.05      2560.00      2584.62
     500     500     500       1     500       1     500       1     500       1       1     2.94e-04    0.09    0.10      2777.78      2594.34
     600     600     600       1     600       1     600       1     600       1       1     2.52e-04    0.14    0.17      3085.71      2592.00
     700     700     700       1     700       1     700       1     700       1       1     2.09e-04    0.24    0.26      2858.33      2588.68
     800     800     800       1     800       1     800       1     800       1       1     1.92e-04    0.37    0.39      2767.57      2625.64
     900     900     900       1     900       1     900       1     900       1       1     1.66e-04    0.53    0.56      2750.94      2603.57
    1000    1000    1000       1    1000       1    1000       1    1000       1       1     1.50e-04    0.76    0.76      2631.58      2631.58
    1100    1100    1100       1    1100       1    1100       1    1100       1       1     1.34e-04    0.98    1.01      2716.33      2635.64
    1200    1200    1200       1    1200       1    1200       1    1200       1       1     1.23e-04    1.29    1.32      2679.07      2618.18
    1300    1300    1300       1    1300       1    1300       1    1300       1       1     1.16e-04    1.62    1.68      2712.35      2615.48
    1400    1400    1400       1    1400       1    1400       1    1400       1       1     1.09e-04    2.05    2.08      2677.07      2638.46
    1500    1500    1500       1    1500       1    1500       1    1500       1       1     9.90e-05    2.49    2.56      2710.84      2636.72
    1600    1600    1600       1    1600       1    1600       1    1600       1       1     9.28e-05    3.03    3.11      2703.63      2634.08
    1700    1700    1700       1    1700       1    1700       1    1700       1       1     8.96e-05    3.61    3.73      2721.88      2634.32
    1800    1800    1800       1    1800       1    1800       1    1800       1       1     8.37e-05    4.34    4.43      2687.56      2632.96
    1900    1900    1900       1    1900       1    1900       1    1900       1       1     7.92e-05    5.05    5.21      2716.44      2633.01
    2000    2000    2000       1    2000       1    2000       1    2000       1       1     7.59e-05    5.94    6.06      2693.60      2640.26
heim$ 

Using the gnuplot script

set terminal svg noenhanced size 900, 500
set output "bench_gemm.svg"
set xlabel "Matrix dimensions: m=n=k"
set ylabel "MFLOPS
set title "GEMM (for double) on a machine in E44"
set key outside
set yrange [0:26000]
plot "bench_gemm.dat" using 1:15 with linespoints lt 1 lw 2 title "dgemm_colmajor", \
     "bench_gemm.dat" using 1:16 with linespoints lt 3 lw 2 title "ulmblas::dgemm (step 1)"

we get by running it through gnuplot

theon$ gnuplot gemm.gnuplot
theon$ 

this plot:

Conclusion

So far all our effort did not pay off!

Exercise