Packing Blocks from Matrix A

Content

In the cache optimized GEMM-Operation \(\beta C + \alpha A B \to C\) the matrix \(A\) gets partitioned into blocks with maximal dimension \(M_c \times K_c\). Each block \(A_{i,l}\) of \(A\) gets packed into col-major horizontal panels with \(M_r\) rows.

Exercise

Simple Test Program

#include <stdlib.h>     // for malloc(), free(), abort()
#include <stdio.h>      // for printf()
#include <stddef.h>     // for size_t, ptrdiff_t

//-- (literal) constants for DGEMM ---------------------------------------------

#ifndef DGEMM_MC
#define DGEMM_MC    4
#endif

#ifndef DGEMM_KC
#define DGEMM_KC    5
#endif

#ifndef DGEMM_MR
#define DGEMM_MR    2
#endif

#if (DGEMM_MC % DGEMM_MR != 0)
#error "DGEMM_MC must be a multiple of DEGMM_MR."
#endif

//-- packing blocks of A -------------------------------------------------------

void
dpack_A(size_t mc, size_t kc,
        const double *A, ptrdiff_t incRowA, ptrdiff_t incColA,
        double *A_)
{
    // TODO: Your code
}

//-- auxiliary functions and test program --------------------------------------

void
initDGeMatrix(size_t m, size_t n,
              double *A,
              ptrdiff_t incRowA, ptrdiff_t incColA)
{
    for (size_t i=0; i<m; ++i) {
        for (size_t j=0; j<n; ++j) {
            A[i*incRowA + j*incColA] = i*n + j + 1;
        }
    }
}

void
printDGeMatrix(size_t m, size_t n,
               const double *A,
               ptrdiff_t incRowA, ptrdiff_t incColA)
{
    for (size_t i=0; i<m; ++i) {
        for (size_t j=0; j<n; ++j) {
            printf("%9.2lf ", A[i*incRowA + j*incColA]);
        }
        printf("\n");
    }
    printf("\n");
}

void
test_pack_A(size_t m, size_t k,
            double *A,
            ptrdiff_t incRowA, ptrdiff_t incColA)
{
    double *A_ = malloc(DGEMM_MC*DGEMM_KC*sizeof(double));
    if (! A_) {
        abort();
    }

    size_t mb  = (m+DGEMM_MC-1) / DGEMM_MC;
    size_t kb  = (k+DGEMM_KC-1) / DGEMM_KC;

    size_t mc_ = m % DGEMM_MC;
    size_t kc_ = k % DGEMM_KC;

    printf("DGEMM_MC = %d\n", DGEMM_MC);
    printf("DGEMM_KC = %d\n", DGEMM_KC);
    printf("DGEMM_MR = %d\n", DGEMM_MR);

    for (size_t ib=0; ib<mb; ++ib) {
        size_t mc = ib<mb-1 || mc_==0 ? DGEMM_MC
                                      : mc_;

        for (size_t lb=0; lb<kb; ++lb) {
            size_t kc = lb<kb-1 || kc_==0 ? DGEMM_KC
                                          : kc_;

            printf("packing %zu x %zu block from A(%zu, %zu)\n",
                   mc, kc, ib*DGEMM_MC, lb*DGEMM_KC);

            dpack_A(mc, kc,
                    &A[ib*DGEMM_MC*incRowA + lb*DGEMM_KC*incColA],
                    incRowA, incColA,
                    A_);

            printf("Panels in buffer A_:\n");
            printDGeMatrix(DGEMM_MR, DGEMM_MC*DGEMM_KC/DGEMM_MR,
                           A_, 1, DGEMM_MR);

            printf("A_ as array:\n");
            printDGeMatrix(1, DGEMM_MC*DGEMM_KC, A_, 0, 1);
        }
    }

    free(A_);
}

#ifndef COLMAJOR
#define COLMAJOR 1
#endif

int
main()
{
    size_t      m       = 7;
    size_t      k       = 8;
    ptrdiff_t   incRowA = COLMAJOR ? 1 : k;
    ptrdiff_t   incColA = COLMAJOR ? m : 1;
    double      *A      = malloc(m*k*sizeof(double));

    if (!A) {
        abort();
    }
    initDGeMatrix(m, k, A, incRowA, incColA);

    printf("m = %zu\n", m);
    printf("k = %zu\n", k);
    printf("A =\n");
    printDGeMatrix(m, k, A, incRowA, incColA);

    test_pack_A(m, k, A, incRowA, incColA);

    free(A);
}