GEMM Macro Kernel

Content

The GEMM macro kernel is supposed to perform the GEMM operation

\[C \leftarrow \beta \cdot C + \alpha \cdot A \cdot B\]

for the following special case:

The macro kernel performs its operation by using the micro kernel (multiplying panels of \(A\) with panels of \(B\)).

Exercise

Simple Test Program

#include <stdlib.h>     // for malloc(), free(), abort()
#include <stdio.h>      // for printf()
#include <stddef.h>     // for size_t, ptrdiff_t
#include <stdbool.h>    // for typedef bool
#include <math.h>       // for nan()

//-- (literal) constants for DGEMM ---------------------------------------------

#ifndef DGEMM_MC
#define DGEMM_MC    4
#endif

#ifndef DGEMM_NC
#define DGEMM_NC    6
#endif

#ifndef DGEMM_KC
#define DGEMM_KC    5
#endif

#ifndef DGEMM_MR
#define DGEMM_MR    2
#endif

#ifndef DGEMM_NR
#define DGEMM_NR    3
#endif

#if (DGEMM_MC % DGEMM_MR != 0)
#error "DGEMM_MC must be a multiple of DEGMM_MR."
#endif

#if (DGEMM_NC % DGEMM_NR != 0)
#error "DGEMM_NC must be a multiple of DEGMM_NR."
#endif

//-- packing blocks of A -------------------------------------------------------

void
pack_A(size_t mc, size_t kc,
       const double *A, ptrdiff_t incRowA, ptrdiff_t incColA,
       double *A_)
{
    size_t mb = mc / DGEMM_MR;

    // pack whole panels of A
    for (size_t ib=0; ib<mb; ++ib) {
        for (size_t j=0; j<kc; ++j) {
            for (size_t i=0; i<DGEMM_MR; ++i) {
                A_[ib*DGEMM_MR*kc + j*DGEMM_MR + i]
                    = A[(ib*DGEMM_MR+i)*incRowA + j*incColA];
            }
        }
    }

    // if necessary: pack last panel with zero padding
    if (mb*DGEMM_MR<mc) {
        size_t mr = mc % DGEMM_MR;
        for (size_t j=0; j<kc; ++j) {
            for (size_t i=0; i<mr; ++i) {
                A_[mb*DGEMM_MR*kc + j*DGEMM_MR + i]
                    = A[(mb*DGEMM_MR+i)*incRowA + j*incColA];
            }
            for (size_t i=mr; i<DGEMM_MR; ++i) {
                A_[mb*DGEMM_MR*kc + j*DGEMM_MR + i] = 0;
            }
        }
    }
}

//-- packing blocks of B -------------------------------------------------------

void
pack_B(size_t kc, size_t nc,
       const double *B, ptrdiff_t incRowB, ptrdiff_t incColB,
       double *B_)
{
    size_t nb = nc / DGEMM_NR;

    // pack whole panels of B
    for (size_t jb=0; jb<nb; ++jb) {
        for (size_t i=0; i<kc; ++i) {
            for (size_t j=0; j<DGEMM_NR; ++j) {
                B_[jb*DGEMM_NR*kc + i*DGEMM_NR + j]
                    = B[i*incRowB + (jb*DGEMM_NR+j)*incColB];
            }
        }
    }

    // if necessary: pack last panel with zero padding
    if (nb*DGEMM_NR<nc) {
        size_t nr = nc % DGEMM_NR;
        for (size_t i=0; i<kc; ++i) {
            for (size_t j=0; j<nr; ++j) {
                B_[nb*DGEMM_NR*kc + i*DGEMM_NR + j]
                    = B[i*incRowB + (nb*DGEMM_NR+j)*incColB];
            }
            for (size_t j=nr; j<DGEMM_NR; ++j) {
                B_[nb*DGEMM_NR*kc + i*DGEMM_NR + j] = 0;
            }
        }
    }
}

//-- GEMM micro kernel (reference implementation) ------------------------------

void
dgemm_micro_ref(size_t k, double alpha,
                const double *A, const double *B,
                double beta,
                double *C, ptrdiff_t incRowC, ptrdiff_t incColC)
{
    double R[DGEMM_MR*DGEMM_NR];

    // R <- 0
    for (size_t i=0; i<DGEMM_MR; ++i) {
        for (size_t j=0; j<DGEMM_NR; ++j) {
            R[i*DGEMM_NR+j] = 0;
        }
    }
    // R <- R + A*B
    for (size_t l=0; l<k; ++l) {
        for (size_t i=0; i<DGEMM_MR; ++i) {
            for (size_t j=0; j<DGEMM_NR; ++j) {
                R[i*DGEMM_NR+j] += A[i+l*DGEMM_MR]*B[l*DGEMM_NR+j];
            }
        }
    }
    // R <- alpha*R
    for (size_t i=0; i<DGEMM_MR; ++i) {
        for (size_t j=0; j<DGEMM_NR; ++j) {
            R[i*DGEMM_NR+j] *= alpha;
        }
    }
    // C <- beta*C + R
    if (beta==0) {
        for (size_t i=0; i<DGEMM_MR; ++i) {
            for (size_t j=0; j<DGEMM_NR; ++j) {
                C[i*incRowC+j*incColC] = R[i*DGEMM_NR+j];
            }
        }
    } else {
        for (size_t i=0; i<DGEMM_MR; ++i) {
            for (size_t j=0; j<DGEMM_NR; ++j) {
                C[i*incRowC+j*incColC] *= beta;
                C[i*incRowC+j*incColC] += R[i*DGEMM_NR+j];
            }
        }
    }
}

//-- GEMM macro kernel (reference implementation) ------------------------------
void
ge_dscal(size_t m, size_t n,
         double alpha,
         double *A, ptrdiff_t incRowA, ptrdiff_t incColA)
{
    if (alpha!=0) {
        for (size_t j=0; j<n; ++j) {
            for (size_t i=0; i<m; ++i) {
                A[i*incRowA + j*incColA] *= alpha;
            }
        }
    } else {
        for (size_t j=0; j<n; ++j) {
            for (size_t i=0; i<m; ++i) {
                A[i*incRowA + j*incColA] = 0;
            }
        }
    }
}

void
ge_daxpy(size_t m, size_t n, double alpha,
         const double *A, ptrdiff_t incRowA, ptrdiff_t incColA,
         double *B, ptrdiff_t incRowB, ptrdiff_t incColB)
{
    if (m==0 || n==0 || alpha==0) {
        return;
    }
    for (size_t j=0; j<n; ++j) {
        for (size_t i=0; i<m; ++i) {
            B[i*incRowB + j*incColB] += alpha*A[i*incRowA + j*incColA];
        }
    }
}


void
dgemm_macro(size_t mc, size_t nc, size_t kc,
            double alpha,
            const double *A, const double *B,
            double beta,
            double *C, ptrdiff_t incRowC, ptrdiff_t incColC)
{
    double AB[DGEMM_MR*DGEMM_NR];

    /*

        TODO: Your implementation

    */
}

//-- auxiliary functions and test program --------------------------------------

void
initDGeMatrix(size_t m, size_t n, bool withNan,
              double *A,
              ptrdiff_t incRowA, ptrdiff_t incColA)
{
    for (size_t i=0; i<m; ++i) {
        for (size_t j=0; j<n; ++j) {
            A[i*incRowA + j*incColA] = withNan ? nan("")
                                               : i*n + j + 1;
        }
    }
}

void
printDGeMatrix(size_t m, size_t n,
               const double *A,
               ptrdiff_t incRowA, ptrdiff_t incColA)
{
    for (size_t i=0; i<m; ++i) {
        for (size_t j=0; j<n; ++j) {
            printf("%9.2lf ", A[i*incRowA + j*incColA]);
        }
        printf("\n");
    }
    printf("\n");
}

void
test_macro(size_t m, size_t n, size_t k,
           double alpha,
           const double *A, ptrdiff_t incRowA, ptrdiff_t incColA,
           const double *B, ptrdiff_t incRowB, ptrdiff_t incColB,
           double beta,
           double *C, ptrdiff_t incRowC, ptrdiff_t incColC)
{
    double *A_ = malloc(DGEMM_MC*DGEMM_KC*sizeof(double));
    double *B_ = malloc(DGEMM_KC*DGEMM_NC*sizeof(double));

    if (!A_ || !B_) {
        abort();
    }

    size_t mc = m < DGEMM_MC ? m : DGEMM_MC;
    size_t nc = n < DGEMM_NC ? n : DGEMM_NC;
    size_t kc = k < DGEMM_KC ? k : DGEMM_KC;

    printf("mc = %zu\n", mc);
    printf("nc = %zu\n", nc);
    printf("kc = %zu\n", kc);
    printf("Block from A =\n");
    printDGeMatrix(mc, kc, A, incRowA, incColA);

    printf("Block from B =\n");
    printDGeMatrix(kc, nc, B, incRowB, incColB);

    printf("Block from C =\n");
    printDGeMatrix(mc, nc, C, incRowC, incColC);


    pack_A(mc, kc, A, incRowA, incColA, A_);
    pack_B(kc, nc, B, incRowB, incColB, B_);

    dgemm_macro(mc, nc, kc, alpha, A_, B_, beta, C, incRowC, incColC);

    printf("C <- %5.2lf * C + %5.2lf * A * B\n", beta, alpha);
    printf("C =\n");
    printDGeMatrix(mc, nc, C, incRowC, incColC);

    free(A_);
    free(B_);
}

#ifndef COLMAJOR
#define COLMAJOR 1
#endif

int
main()
{
    size_t      m       = 2;
    size_t      n       = 4;
    size_t      k       = 4;

    ptrdiff_t   incRowA = COLMAJOR ? 1 : k;
    ptrdiff_t   incColA = COLMAJOR ? m : 1;

    ptrdiff_t   incRowB = COLMAJOR ? 1 : n;
    ptrdiff_t   incColB = COLMAJOR ? k : 1;

    ptrdiff_t   incRowC = COLMAJOR ? 1 : n;
    ptrdiff_t   incColC = COLMAJOR ? m : 1;

    double      *A = malloc(m*k*sizeof(double));
    double      *B = malloc(k*n*sizeof(double));
    double      *C = malloc(m*n*sizeof(double));

    if (!A || !B || !C) {
        abort();
    }
    initDGeMatrix(m, k, false, A, incRowA, incColA);
    initDGeMatrix(k, n, false, B, incRowB, incColB);
    initDGeMatrix(m, n, true, C, incRowC, incColC);

    double alpha = 1;
    double beta  = 0;

    test_macro(m, n, k,
               alpha,
               A, incRowA, incColA,
               B, incRowB, incColB,
               beta,
               C, incRowC, incColC);

    free(A);
    free(B);
    free(C);
}