GEMM Micro Kernel

Content

The GEMM micro kernel is supposed to perform the GEMM operation

\[C \leftarrow \beta \cdot C + \alpha \cdot A \cdot B\]

for the following special case:

The dimensions of the matrices are as follows:
- \(C\) is a \(M_r \times N_r\) matrix.
- \(A\) is a \(M_r \times k\) matrix.
- \(B\) is a \(k \times N_r\) matrix.
Storage of \(A\) and \(B\):
- Matrix \(A\) is stored col major with \(\text{incRow}_A = 1\) and \(\text{incCol}_A = M_r\). That means, all elements of \(A\) are located in a contiguous memory block (with \(M_r \cdot k\)) elements.
- Matrix \(B\) is stored row major with \(\text{incRow}_B = N_r\) and \(\text{incCol}_B = 1\). That means, all elements of \(B\) are located in a contiguous memory block (with \(k \cdot N_r\)) elements.
Parameters \(M_r\) and \(N_r\) are given as literal constants.

Notes on requirements for the Algorithm and the Implementation

The performance of the micro kernel will be extremely critical for the overall performance of the GEMM operation. So where possible loops should be unrolled. In particular, nested loops should be ordered such that inner loops can be unrolled if possible. Do not call functions (unless it is guaranteed that the compiler can inline these calls).

The implementation should operate as follows:

first compute the matrix product \(A \cdot B\) and store the result in CPU registers. For the sake of simplicity we assume that the compiler maps local variables directly to registers. So assume that storing results in a local variable means to store them in a register and not on the stack.
Next, scale \(C\), i.e. perform the operation \(C \leftarrow \beta C\). In case \(\beta =0\) matrix \(C\) can contain NaN entries!
Finally, update the scaled matrix \(C\) with the previously updated product \(AB\) Apply the scaling of \(AB\) withinthis update step, i.e. perform here the operation
\[C \leftarrow C + \alpha \cdot (AB)\]

Exercise

Implement function dgemm_micro_ref in the test program below. Before you start coding: Make yourself familiar with the test program below.

Simple Test Program

session16/simple_test_micro_ex.c

#include <stdlib.h>     // for malloc(), free(), abort()
#include <stdio.h>      // for printf()
#include <stddef.h>     // for size_t, ptrdiff_t
#include <stdbool.h>    // for typedef bool
#include <math.h>       // for nan()

//-- (literal) constants for DGEMM ---------------------------------------------

#ifndef DGEMM_MC
#define DGEMM_MC    4
#endif

#ifndef DGEMM_NC
#define DGEMM_NC    6
#endif

#ifndef DGEMM_KC
#define DGEMM_KC    5
#endif

#ifndef DGEMM_MR
#define DGEMM_MR    2
#endif

#ifndef DGEMM_NR
#define DGEMM_NR    3
#endif

#if (DGEMM_MC % DGEMM_MR != 0)
#error "DGEMM_MC must be a multiple of DEGMM_MR."
#endif

#if (DGEMM_NC % DGEMM_NR != 0)
#error "DGEMM_NC must be a multiple of DEGMM_NR."
#endif

//-- packing blocks of A -------------------------------------------------------

void
pack_A(size_t mc, size_t kc,
       const double *A, ptrdiff_t incRowA, ptrdiff_t incColA,
       double *A_)
{
    size_t mb = mc / DGEMM_MR;

    // pack whole panels of A
    for (size_t ib=0; ib<mb; ++ib) {
        for (size_t j=0; j<kc; ++j) {
            for (size_t i=0; i<DGEMM_MR; ++i) {
                A_[ib*DGEMM_MR*kc + j*DGEMM_MR + i]
                    = A[(ib*DGEMM_MR+i)*incRowA + j*incColA];
            }
        }
    }

    // if necessary: pack last panel with zero padding
    if (mb*DGEMM_MR<mc) {
        size_t mr = mc % DGEMM_MR;
        for (size_t j=0; j<kc; ++j) {
            for (size_t i=0; i<mr; ++i) {
                A_[mb*DGEMM_MR*kc + j*DGEMM_MR + i]
                    = A[(mb*DGEMM_MR+i)*incRowA + j*incColA];
            }
            for (size_t i=mr; i<DGEMM_MR; ++i) {
                A_[mb*DGEMM_MR*kc + j*DGEMM_MR + i] = 0;
            }
        }
    }
}

//-- packing blocks of B -------------------------------------------------------

void
pack_B(size_t kc, size_t nc,
       const double *B, ptrdiff_t incRowB, ptrdiff_t incColB,
       double *B_)
{
    size_t nb = nc / DGEMM_NR;

    // pack whole panels of B
    for (size_t jb=0; jb<nb; ++jb) {
        for (size_t i=0; i<kc; ++i) {
            for (size_t j=0; j<DGEMM_NR; ++j) {
                B_[jb*DGEMM_NR*kc + i*DGEMM_NR + j]
                    = B[i*incRowB + (jb*DGEMM_NR+j)*incColB];
            }
        }
    }

    // if necessary: pack last panel with zero padding
    if (nb*DGEMM_NR<nc) {
        size_t nr = nc % DGEMM_NR;
        for (size_t i=0; i<kc; ++i) {
            for (size_t j=0; j<nr; ++j) {
                B_[nb*DGEMM_NR*kc + i*DGEMM_NR + j]
                    = B[i*incRowB + (nb*DGEMM_NR+j)*incColB];
            }
            for (size_t j=nr; j<DGEMM_NR; ++j) {
                B_[nb*DGEMM_NR*kc + i*DGEMM_NR + j] = 0;
            }
        }
    }
}

//-- GEMM micro kernel (reference implementation) ------------------------------

void
dgemm_micro_ref(size_t k, double alpha,
                const double *A, const double *B,
                double beta,
                double *C, ptrdiff_t incRowC, ptrdiff_t incColC)
{
    double AB[DGEMM_MR*DGEMM_NR];

    /*

        TODO: Your implementation

    */
}

//-- auxiliary functions and test program --------------------------------------

void
initDGeMatrix(size_t m, size_t n, bool withNan,
              double *A,
              ptrdiff_t incRowA, ptrdiff_t incColA)
{
    for (size_t i=0; i<m; ++i) {
        for (size_t j=0; j<n; ++j) {
            A[i*incRowA + j*incColA] = withNan ? nan("")
                                               : i*n + j + 1;
        }
    }
}

void
printDGeMatrix(size_t m, size_t n,
               const double *A,
               ptrdiff_t incRowA, ptrdiff_t incColA)
{
    for (size_t i=0; i<m; ++i) {
        for (size_t j=0; j<n; ++j) {
            printf("%9.2lf ", A[i*incRowA + j*incColA]);
        }
        printf("\n");
    }
    printf("\n");
}

#ifndef COLMAJOR
#define COLMAJOR 1
#endif

int
main()
{
    size_t      k       = 6;
    ptrdiff_t   incRowC = COLMAJOR ? 1 : DGEMM_NR;
    ptrdiff_t   incColC = COLMAJOR ? DGEMM_MR : 1;

    double      *A = malloc(DGEMM_MR*k*sizeof(double));
    double      *B = malloc(k*DGEMM_NR*sizeof(double));
    double      *C = malloc(DGEMM_MR*DGEMM_NR*sizeof(double));

    if (!A || !B || !C) {
        abort();
    }
    initDGeMatrix(DGEMM_MR, k, false, A, 1, DGEMM_MR);
    initDGeMatrix(k, DGEMM_NR, false, B, DGEMM_NR, 1);
    initDGeMatrix(DGEMM_MR, DGEMM_NR, true, C, incRowC, incColC);

    printf("k = %zu\n", k);
    printf("A =\n");
    printDGeMatrix(DGEMM_MR, k, A, 1, DGEMM_MR);

    printf("B =\n");
    printDGeMatrix(k, DGEMM_NR, B, DGEMM_NR, 1);

    printf("C =\n");
    printDGeMatrix(DGEMM_MR, DGEMM_NR, C, incRowC, incColC);

    double alpha = 1;
    double beta  = 0;

    // call micro kernel
    printf("C <- %5.2lf * C + %5.2lf * A * B\n", beta, alpha);
    dgemm_micro_ref(k, alpha, A, B, beta, C, incRowC, incColC);

    printf("C =\n");
    printDGeMatrix(DGEMM_MR, DGEMM_NR, C, incRowC, incColC);

    alpha = 2;
    beta  = 1;

    // call micro kernel
    printf("C <- %5.2lf * C + %5.2lf * A * B\n", beta, alpha);
    dgemm_micro_ref(k, alpha, A, B, beta, C, incRowC, incColC);
    printf("C =\n");
    printDGeMatrix(DGEMM_MR, DGEMM_NR, C, incRowC, incColC);

    free(A);
    free(B);
    free(C);
}