Possible Solution

Content

#include <stdlib.h>     // for malloc(), free(), abort()
#include <stdio.h>      // for printf()
#include <stddef.h>     // for size_t, ptrdiff_t

//-- (literal) constants for DGEMM ---------------------------------------------

#ifndef DGEMM_MC
#define DGEMM_MC    4
#endif

#ifndef DGEMM_NC
#define DGEMM_NC    6
#endif

#ifndef DGEMM_KC
#define DGEMM_KC    5
#endif

#ifndef DGEMM_MR
#define DGEMM_MR    2
#endif

#ifndef DGEMM_NR
#define DGEMM_NR    3
#endif

#if (DGEMM_MC % DGEMM_MR != 0)
#error "DGEMM_MC must be a multiple of DEGMM_MR."
#endif

#if (DGEMM_NC % DGEMM_NR != 0)
#error "DGEMM_NC must be a multiple of DEGMM_NR."
#endif

//-- packing blocks of A -------------------------------------------------------

void
dpack_A(size_t mc, size_t kc,
        const double *A, ptrdiff_t incRowA, ptrdiff_t incColA,
        double *A_)
{
    size_t mb = mc / DGEMM_MR;

    // pack whole panels of A
    for (size_t ib=0; ib<mb; ++ib) {
        for (size_t j=0; j<kc; ++j) {
            for (size_t i=0; i<DGEMM_MR; ++i) {
                A_[ib*DGEMM_MR*kc + j*DGEMM_MR + i]
                    = A[(ib*DGEMM_MR+i)*incRowA + j*incColA];
            }
        }
    }

    // if necessary: pack last panel with zero padding
    if (mb*DGEMM_MR<mc) {
        size_t mr = mc % DGEMM_MR;
        for (size_t j=0; j<kc; ++j) {
            for (size_t i=0; i<mr; ++i) {
                A_[mb*DGEMM_MR*kc + j*DGEMM_MR + i]
                    = A[(mb*DGEMM_MR+i)*incRowA + j*incColA];
            }
            for (size_t i=mr; i<DGEMM_MR; ++i) {
                A_[mb*DGEMM_MR*kc + j*DGEMM_MR + i] = 0;
            }
        }
    }
}

//-- packing blocks of B -------------------------------------------------------

void
dpack_B(size_t kc, size_t nc,
        const double *B, ptrdiff_t incRowB, ptrdiff_t incColB,
        double *B_)
{
    size_t nb = nc / DGEMM_NR;

    // pack whole panels of B
    for (size_t jb=0; jb<nb; ++jb) {
        for (size_t i=0; i<kc; ++i) {
            for (size_t j=0; j<DGEMM_NR; ++j) {
                B_[jb*DGEMM_NR*kc + i*DGEMM_NR + j]
                    = B[i*incRowB + (jb*DGEMM_NR+j)*incColB];
            }
        }
    }

    // if necessary: pack last panel with zero padding
    if (nb*DGEMM_NR<nc) {
        size_t nr = nc % DGEMM_NR;
        for (size_t i=0; i<kc; ++i) {
            for (size_t j=0; j<nr; ++j) {
                B_[nb*DGEMM_NR*kc + i*DGEMM_NR + j]
                    = B[i*incRowB + (nb*DGEMM_NR+j)*incColB];
            }
            for (size_t j=nr; j<DGEMM_NR; ++j) {
                B_[nb*DGEMM_NR*kc + i*DGEMM_NR + j] = 0;
            }
        }
    }
}


//-- auxiliary functions and test program --------------------------------------

void
initDGeMatrix(size_t m, size_t n,
              double *A,
              ptrdiff_t incRowA, ptrdiff_t incColA)
{
    for (size_t i=0; i<m; ++i) {
        for (size_t j=0; j<n; ++j) {
            A[i*incRowA + j*incColA] = i*n + j + 1;
        }
    }
}

void
printDGeMatrix(size_t m, size_t n,
               const double *A,
               ptrdiff_t incRowA, ptrdiff_t incColA)
{
    for (size_t i=0; i<m; ++i) {
        for (size_t j=0; j<n; ++j) {
            printf("%9.2lf ", A[i*incRowA + j*incColA]);
        }
        printf("\n");
    }
    printf("\n");
}

void
test_pack_B(size_t k, size_t n,
            double *B,
            ptrdiff_t incRowB, ptrdiff_t incColB)
{
    double *B_ = malloc(DGEMM_KC*DGEMM_NC*sizeof(double));
    if (! B_) {
        abort();
    }

    size_t kb  = (k+DGEMM_KC-1) / DGEMM_KC;
    size_t nb  = (n+DGEMM_NC-1) / DGEMM_NC;

    size_t kc_ = k % DGEMM_KC;
    size_t nc_ = n % DGEMM_NC;

    printf("DGEMM_KC = %d\n", DGEMM_KC);
    printf("DGEMM_NC = %d\n", DGEMM_NC);
    printf("DGEMM_NR = %d\n", DGEMM_NR);

    for (size_t lb=0; lb<kb; ++lb) {
        size_t kc = lb<kb-1 || kc_==0 ? DGEMM_KC
                                      : kc_;

        for (size_t jb=0; jb<nb; ++jb) {
            size_t nc = jb<nb-1 || nc_==0 ? DGEMM_NC
                                          : nc_;

            printf("packing %zu x %zu block from B(%zu, %zu)\n",
                   kc, nc, lb*DGEMM_KC, jb*DGEMM_NC);

            dpack_B(kc, nc,
                    &B[lb*DGEMM_KC*incRowB + jb*DGEMM_NC*incColB],
                    incRowB, incColB,
                    B_);

            printf("Panels in buffer B_:\n");
            printDGeMatrix(DGEMM_KC*DGEMM_NC/DGEMM_NR, DGEMM_NR,
                           B_, DGEMM_NR, 1);

            printf("B_ as array:\n");
            printDGeMatrix(1, DGEMM_KC*DGEMM_NC, B_, 0, 1);
        }
    }

    free(B_);
}

#ifndef COLMAJOR
#define COLMAJOR 1
#endif

int
main()
{
    size_t      k       = 6;
    size_t      n       = 8;
    ptrdiff_t   incRowB = COLMAJOR ? 1 : n;
    ptrdiff_t   incColB = COLMAJOR ? k : 1;
    double      *B      = malloc(k*n*sizeof(double));

    if (!B) {
        abort();
    }
    initDGeMatrix(k, n, B, incRowB, incColB);

    printf("k = %zu\n", k);
    printf("n = %zu\n", n);
    printf("B =\n");
    printDGeMatrix(k, n, B, incRowB, incColB);

    test_pack_B(k, n, B, incRowB, incColB);

    free(B);
}