Possible solution

Content

Source Code

#include <math.h>
#include <stddef.h>
#include <stdio.h>
#include <sys/times.h>
#include <unistd.h>


#ifndef MINDIM_M
#define MINDIM_M    100
#endif

#ifndef MINDIM_N
#define MINDIM_N    100
#endif

#ifndef MINDIM_K
#define MINDIM_K    100
#endif

#ifndef MAXDIM_M
#define MAXDIM_M    1000
#endif

#ifndef MAXDIM_N
#define MAXDIM_N    1000
#endif

#ifndef MAXDIM_K
#define MAXDIM_K    1000
#endif

#ifndef INC_M
#define INC_M   100
#endif

#ifndef INC_N
#define INC_N   100
#endif

#ifndef INC_K
#define INC_K   100
#endif

#ifndef MIN_T
#define MIN_T   1
#endif

#ifndef ALPHA
#define ALPHA   1
#endif

#ifndef BETA
#define BETA   1
#endif

double A[MAXDIM_M*MAXDIM_K];
double B[MAXDIM_K*MAXDIM_N];
double C1[MAXDIM_M*MAXDIM_N];
double C2[MAXDIM_M*MAXDIM_N];

/* return real time in seconds since start of the process */
double
wallTime()
{
   static clock_t ticks_per_second = 0;
   if (!ticks_per_second) {
      ticks_per_second = sysconf(_SC_CLK_TCK);
   }
   struct tms timebuf;
   /* times returns the number of real time ticks passed since start */
   return (double) times(&timebuf) / ticks_per_second;
}

double
asumDiff(size_t m, size_t n,
         const double *A, ptrdiff_t incRowA, ptrdiff_t incColA,
         const double *B, ptrdiff_t incRowB, ptrdiff_t incColB)
{
    double diff = 0;
    for (size_t i = 0; i < m; ++i) {
        for (size_t j = 0; j < n; ++j) {
            diff += fabs(B[i*incRowB+j*incColB] - A[i*incRowA+j*incColA]);
        }
    }
    return diff;
}

void
initMatrix(size_t m, size_t n, double *A, ptrdiff_t incRowA, ptrdiff_t incColA)
{
    for (size_t j = 0; j < n; ++j) {
        for (size_t i = 0; i < m; ++i) {
            A[i*incRowA+j*incColA] = j*n+i+1;
        }
    }
}

void
printMatrix(size_t m, size_t n,
            const double *A, ptrdiff_t incRowA, ptrdiff_t incColA)
{
    for (size_t i = 0; i < m; ++i) {
        printf("   ");
        for (size_t j = 0; j < n; ++j) {
            printf("%4.1lf ", A[i*incRowA+j*incColA]);
        }
        printf("\n");
    }
    printf("\n");
}

void
dgemm_row(size_t m, size_t n, size_t k, double alpha,
          const double *A, ptrdiff_t incRowA, ptrdiff_t incColA,
          const double *B, ptrdiff_t incRowB, ptrdiff_t incColB,
          double beta,
          double *C, ptrdiff_t incRowC, ptrdiff_t incColC)
{
    if (beta!=0) {
        if (beta!=1) {
            for (size_t i=0; i<m; ++i) {
                for (size_t j=0; j<n; ++j) {
                    C[i*incRowC+j*incColC] *= beta;
                }
            }
        }
    } else {
        for (size_t i=0; i<m; ++i) {
            for (size_t j=0; j<n; ++j) {
                C[i*incRowC+j*incColC] = 0;
            }
        }
    }
    if (alpha==0 || k==0) {
        return;
    }
    for (size_t i=0; i<m; ++i) {
        for (size_t l=0; l<k; ++l) {
            for (size_t j=0; j<n; ++j) {
                C[i*incRowC+j*incColC] +=
                    alpha*A[i*incRowA+l*incColA]*B[l*incRowB+j*incColB];
            }
        }
    }
}

void
dgemm_col(size_t m, size_t n, size_t k, double alpha,
          const double *A, ptrdiff_t incRowA, ptrdiff_t incColA,
          const double *B, ptrdiff_t incRowB, ptrdiff_t incColB,
          double beta,
          double *C, ptrdiff_t incRowC, ptrdiff_t incColC)
{
    if (beta!=0) {
        if (beta!=1) {
            for (size_t j=0; j<n; ++j) {
                for (size_t i=0; i<m; ++i) {
                    C[i*incRowC+j*incColC] *= beta;
                }
            }
        }
    } else {
        for (size_t j=0; j<n; ++j) {
            for (size_t i=0; i<m; ++i) {
                C[i*incRowC+j*incColC] = 0;
            }
        }
    }
    if (alpha==0 || k==0) {
        return;
    }
    for (size_t j=0; j<n; ++j) {
        for (size_t l=0; l<k; ++l) {
            for (size_t i=0; i<m; ++i) {
                C[i*incRowC+j*incColC] +=
                    alpha*A[i*incRowA+l*incColA]*B[l*incRowB+j*incColB];
            }
        }
    }
}


int
main()
{
    printf("   M    N    K      t1      t2   t2/t1       diff\n");
    printf("               col-maj row-maj\n");
    printf("=================================================\n");

    for (size_t m = MINDIM_M, n = MINDIM_N, k = MINDIM_K;
         m <= MAXDIM_M && n <= MAXDIM_N && k <= MAXDIM_K;
	 m += INC_M, n += INC_N, k += INC_K)
    {
        // set storage order for A, B, C1, C2
        ptrdiff_t incRowA = 1, incColA = m;
        ptrdiff_t incRowB = 1, incColB = k;
        ptrdiff_t incRowC = 1, incColC = m;     // used for C1, C2

        initMatrix(m, k, A, incRowA, incColA);
        initMatrix(k, n, B, incRowB, incColB);

        size_t runs = 0;
        double t1 = 0;
        do {
            initMatrix(m, n, C1, incRowC, incColC);

            double t0 = wallTime();
            dgemm_col(m, n, k, ALPHA,
                      A, incRowA, incColA,
                      B, incRowB, incColB,
                      BETA,
                      C1, incRowC, incColC);
            t1 += wallTime() - t0;
            ++runs;
        } while (t1 < MIN_T);
        t1 /= runs;

        runs = 0;
        double t2 = 0;
        do {
            initMatrix(m, n, C2, incRowC, incColC);

            double t0 = wallTime();
            dgemm_row(m, n, k,
                      ALPHA,
                      A, incRowA, incColA,
                      B, incRowB, incColB,
                      BETA,
                      C2, incRowC, incColC);
            t2 += wallTime() - t0;
            ++runs;
        } while (t2 < MIN_T);
        t2 /= runs;

        double diff = asumDiff(m, n,
                               C1, incRowC, incColC,
                               C2, incRowC, incColC);

        printf("%4zu %4zu %4zu %7.2lf %7.2lf %7.2lf %10.2le\n",
	       m, n, k, t1, t2, t2/t1, diff);
    }
}

Compile and run code

theon$ gcc -DMAXDIM_M=1800 -DMAXDIM_N=1800 -DMAXDIM_K=1800 -Wall -O3 -DMIN_T=0 -o bench_gemm_sol bench_gemm_sol.c
theon$ ./bench_gemm_sol > bench_gemm.data
theon$ cat bench_gemm.data
   M    N    K      t1      t2   t2/t1       diff
               col-maj row-maj
=================================================
 100  100  100    0.00    0.01     Inf   0.00e+00
 200  200  200    0.00    0.01     Inf   0.00e+00
 300  300  300    0.02    0.07    3.50   0.00e+00
 400  400  400    0.04    0.20    5.00   0.00e+00
 500  500  500    0.07    0.36    5.14   0.00e+00
 600  600  600    0.11    0.63    5.73   0.00e+00
 700  700  700    0.19    1.19    6.26   0.00e+00
 800  800  800    0.28    2.84   10.14   0.00e+00
 900  900  900    0.39    6.69   17.15   0.00e+00
1000 1000 1000    0.54   10.91   20.20   0.00e+00
1100 1100 1100    0.71   19.32   27.21   0.00e+00
1200 1200 1200    0.92   30.99   33.68   0.00e+00
1300 1300 1300    1.17   40.18   34.34   0.00e+00
1400 1400 1400    1.46   52.22   35.77   0.00e+00
1500 1500 1500    1.81   71.18   39.33   0.00e+00
1600 1600 1600    2.22   86.81   39.10   0.00e+00
1700 1700 1700    2.72  124.35   45.72   0.00e+00
1800 1800 1800    3.34  160.20   47.96   0.00e+00
theon$ 

Plot

set terminal svg size 900, 500
set output "bench_gemm.svg"
set xlabel "Matrix dim A: M=N"
set ylabel "Runtime"
set logscale x
set logscale y
set title "GEMM"
set key outside
set pointsize 0.5
plot "bench_gemm.data" using 1:4 with linespoints lt 2 lw 3 title "col-major", \
     "bench_gemm.data" using 1:5 with linespoints lt 3 lw 3 title "row-major"
heim$ gnuplot bench_gemm.gnuplot
heim$