GEMM Micro Kernel

Content

The GEMM micro kernel is supposed to perform the GEMM operation

\[C \leftarrow \beta \cdot C + \alpha \cdot A \cdot B\]

for the following special case:

Notes on requirements for the Algorithm and the Implementation

The performance of the micro kernel will be extremely critical for the overall performance of the GEMM operation. So where possible loops should be unrolled. In particular, nested loops should be ordered such that inner loops can be unrolled if possible. Do not call functions (unless it is guaranteed that the compiler can inline these calls).

The implementation should operate as follows:

Exercise

Simple Test Program

#include <stdlib.h>     // for malloc(), free(), abort()
#include <stdio.h>      // for printf()
#include <stddef.h>     // for size_t, ptrdiff_t
#include <stdbool.h>    // for typedef bool
#include <math.h>       // for nan()

//-- (literal) constants for DGEMM ---------------------------------------------

#ifndef DGEMM_MC
#define DGEMM_MC    4
#endif

#ifndef DGEMM_NC
#define DGEMM_NC    6
#endif

#ifndef DGEMM_KC
#define DGEMM_KC    5
#endif

#ifndef DGEMM_MR
#define DGEMM_MR    2
#endif

#ifndef DGEMM_NR
#define DGEMM_NR    3
#endif

#if (DGEMM_MC % DGEMM_MR != 0)
#error "DGEMM_MC must be a multiple of DEGMM_MR."
#endif

#if (DGEMM_NC % DGEMM_NR != 0)
#error "DGEMM_NC must be a multiple of DEGMM_NR."
#endif

//-- packing blocks of A -------------------------------------------------------

void
pack_A(size_t mc, size_t kc,
       const double *A, ptrdiff_t incRowA, ptrdiff_t incColA,
       double *A_)
{
    size_t mb = mc / DGEMM_MR;

    // pack whole panels of A
    for (size_t ib=0; ib<mb; ++ib) {
        for (size_t j=0; j<kc; ++j) {
            for (size_t i=0; i<DGEMM_MR; ++i) {
                A_[ib*DGEMM_MR*kc + j*DGEMM_MR + i]
                    = A[(ib*DGEMM_MR+i)*incRowA + j*incColA];
            }
        }
    }

    // if necessary: pack last panel with zero padding
    if (mb*DGEMM_MR<mc) {
        size_t mr = mc % DGEMM_MR;
        for (size_t j=0; j<kc; ++j) {
            for (size_t i=0; i<mr; ++i) {
                A_[mb*DGEMM_MR*kc + j*DGEMM_MR + i]
                    = A[(mb*DGEMM_MR+i)*incRowA + j*incColA];
            }
            for (size_t i=mr; i<DGEMM_MR; ++i) {
                A_[mb*DGEMM_MR*kc + j*DGEMM_MR + i] = 0;
            }
        }
    }
}

//-- packing blocks of B -------------------------------------------------------

void
pack_B(size_t kc, size_t nc,
       const double *B, ptrdiff_t incRowB, ptrdiff_t incColB,
       double *B_)
{
    size_t nb = nc / DGEMM_NR;

    // pack whole panels of B
    for (size_t jb=0; jb<nb; ++jb) {
        for (size_t i=0; i<kc; ++i) {
            for (size_t j=0; j<DGEMM_NR; ++j) {
                B_[jb*DGEMM_NR*kc + i*DGEMM_NR + j]
                    = B[i*incRowB + (jb*DGEMM_NR+j)*incColB];
            }
        }
    }

    // if necessary: pack last panel with zero padding
    if (nb*DGEMM_NR<nc) {
        size_t nr = nc % DGEMM_NR;
        for (size_t i=0; i<kc; ++i) {
            for (size_t j=0; j<nr; ++j) {
                B_[nb*DGEMM_NR*kc + i*DGEMM_NR + j]
                    = B[i*incRowB + (nb*DGEMM_NR+j)*incColB];
            }
            for (size_t j=nr; j<DGEMM_NR; ++j) {
                B_[nb*DGEMM_NR*kc + i*DGEMM_NR + j] = 0;
            }
        }
    }
}

//-- GEMM micro kernel (reference implementation) ------------------------------

void
dgemm_micro_ref(size_t k, double alpha,
                const double *A, const double *B,
                double beta,
                double *C, ptrdiff_t incRowC, ptrdiff_t incColC)
{
    double AB[DGEMM_MR*DGEMM_NR];

    /*

        TODO: Your implementation

    */
}

//-- auxiliary functions and test program --------------------------------------

void
initDGeMatrix(size_t m, size_t n, bool withNan,
              double *A,
              ptrdiff_t incRowA, ptrdiff_t incColA)
{
    for (size_t i=0; i<m; ++i) {
        for (size_t j=0; j<n; ++j) {
            A[i*incRowA + j*incColA] = withNan ? nan("")
                                               : i*n + j + 1;
        }
    }
}

void
printDGeMatrix(size_t m, size_t n,
               const double *A,
               ptrdiff_t incRowA, ptrdiff_t incColA)
{
    for (size_t i=0; i<m; ++i) {
        for (size_t j=0; j<n; ++j) {
            printf("%9.2lf ", A[i*incRowA + j*incColA]);
        }
        printf("\n");
    }
    printf("\n");
}

#ifndef COLMAJOR
#define COLMAJOR 1
#endif

int
main()
{
    size_t      k       = 6;
    ptrdiff_t   incRowC = COLMAJOR ? 1 : DGEMM_NR;
    ptrdiff_t   incColC = COLMAJOR ? DGEMM_MR : 1;

    double      *A = malloc(DGEMM_MR*k*sizeof(double));
    double      *B = malloc(k*DGEMM_NR*sizeof(double));
    double      *C = malloc(DGEMM_MR*DGEMM_NR*sizeof(double));

    if (!A || !B || !C) {
        abort();
    }
    initDGeMatrix(DGEMM_MR, k, false, A, 1, DGEMM_MR);
    initDGeMatrix(k, DGEMM_NR, false, B, DGEMM_NR, 1);
    initDGeMatrix(DGEMM_MR, DGEMM_NR, true, C, incRowC, incColC);

    printf("k = %zu\n", k);
    printf("A =\n");
    printDGeMatrix(DGEMM_MR, k, A, 1, DGEMM_MR);

    printf("B =\n");
    printDGeMatrix(k, DGEMM_NR, B, DGEMM_NR, 1);

    printf("C =\n");
    printDGeMatrix(DGEMM_MR, DGEMM_NR, C, incRowC, incColC);

    double alpha = 1;
    double beta  = 0;

    // call micro kernel
    printf("C <- %5.2lf * C + %5.2lf * A * B\n", beta, alpha);
    dgemm_micro_ref(k, alpha, A, B, beta, C, incRowC, incColC);

    printf("C =\n");
    printDGeMatrix(DGEMM_MR, DGEMM_NR, C, incRowC, incColC);

    alpha = 2;
    beta  = 1;

    // call micro kernel
    printf("C <- %5.2lf * C + %5.2lf * A * B\n", beta, alpha);
    dgemm_micro_ref(k, alpha, A, B, beta, C, incRowC, incColC);
    printf("C =\n");
    printDGeMatrix(DGEMM_MR, DGEMM_NR, C, incRowC, incColC);

    free(A);
    free(B);
    free(C);
}