Packing Blocks from Matrix B
In the cache optimized GEMM-Operation \(\beta C + \alpha A B \to C\) the matrix \(B\) gets partitioned into blocks with maximal dimension \(K_c \times N_c\). Each block \(B_{l,j}\) of \(B\) gets packed into row-major vertical panels with \(N_r\) columns.
Exercise
-
Implement function dpack_B in the test program below. Before you start coding: Make yourself familiar with the test program below.
Simple Test Program
#include <stdlib.h> // for malloc(), free(), abort() #include <stdio.h> // for printf() #include <stddef.h> // for size_t, ptrdiff_t //-- (literal) constants for DGEMM --------------------------------------------- #ifndef DGEMM_MC #define DGEMM_MC 4 #endif #ifndef DGEMM_NC #define DGEMM_NC 6 #endif #ifndef DGEMM_KC #define DGEMM_KC 5 #endif #ifndef DGEMM_MR #define DGEMM_MR 2 #endif #ifndef DGEMM_NR #define DGEMM_NR 3 #endif #if (DGEMM_MC % DGEMM_MR != 0) #error "DGEMM_MC must be a multiple of DEGMM_MR." #endif #if (DGEMM_NC % DGEMM_NR != 0) #error "DGEMM_NC must be a multiple of DEGMM_NR." #endif //-- packing blocks of A ------------------------------------------------------- void dpack_A(size_t mc, size_t kc, const double *A, ptrdiff_t incRowA, ptrdiff_t incColA, double *A_) { size_t mb = mc / DGEMM_MR; // pack whole panels of A for (size_t ib=0; ib<mb; ++ib) { for (size_t j=0; j<kc; ++j) { for (size_t i=0; i<DGEMM_MR; ++i) { A_[ib*DGEMM_MR*kc + j*DGEMM_MR + i] = A[(ib*DGEMM_MR+i)*incRowA + j*incColA]; } } } // if necessary: pack last panel with zero padding if (mb*DGEMM_MR<mc) { size_t mr = mc % DGEMM_MR; for (size_t j=0; j<kc; ++j) { for (size_t i=0; i<mr; ++i) { A_[mb*DGEMM_MR*kc + j*DGEMM_MR + i] = A[(mb*DGEMM_MR+i)*incRowA + j*incColA]; } for (size_t i=mr; i<DGEMM_MR; ++i) { A_[mb*DGEMM_MR*kc + j*DGEMM_MR + i] = 0; } } } } //-- packing blocks of B ------------------------------------------------------- void dpack_B(size_t kc, size_t nc, const double *B, ptrdiff_t incRowB, ptrdiff_t incColB, double *B_) { // TODO: Your code } //-- auxiliary functions and test program -------------------------------------- void initDGeMatrix(size_t m, size_t n, double *A, ptrdiff_t incRowA, ptrdiff_t incColA) { for (size_t i=0; i<m; ++i) { for (size_t j=0; j<n; ++j) { A[i*incRowA + j*incColA] = i*n + j + 1; } } } void printDGeMatrix(size_t m, size_t n, const double *A, ptrdiff_t incRowA, ptrdiff_t incColA) { for (size_t i=0; i<m; ++i) { for (size_t j=0; j<n; ++j) { printf("%9.2lf ", A[i*incRowA + j*incColA]); } printf("\n"); } printf("\n"); } void test_pack_B(size_t k, size_t n, double *B, ptrdiff_t incRowB, ptrdiff_t incColB) { double *B_ = malloc(DGEMM_KC*DGEMM_NC*sizeof(double)); if (! B_) { abort(); } size_t kb = (k+DGEMM_KC-1) / DGEMM_KC; size_t nb = (n+DGEMM_NC-1) / DGEMM_NC; size_t kc_ = k % DGEMM_KC; size_t nc_ = n % DGEMM_NC; printf("DGEMM_KC = %d\n", DGEMM_KC); printf("DGEMM_NC = %d\n", DGEMM_NC); printf("DGEMM_NR = %d\n", DGEMM_NR); for (size_t lb=0; lb<kb; ++lb) { size_t kc = lb<kb-1 || kc_==0 ? DGEMM_KC : kc_; for (size_t jb=0; jb<nb; ++jb) { size_t nc = jb<nb-1 || nc_==0 ? DGEMM_NC : nc_; printf("packing %zu x %zu block from B(%zu, %zu)\n", kc, nc, lb*DGEMM_KC, jb*DGEMM_NC); dpack_B(kc, nc, &B[lb*DGEMM_KC*incRowB + jb*DGEMM_NC*incColB], incRowB, incColB, B_); printf("Panels in buffer B_:\n"); printDGeMatrix(DGEMM_KC*DGEMM_NC/DGEMM_NR, DGEMM_NR, B_, DGEMM_NR, 1); printf("B_ as array:\n"); printDGeMatrix(1, DGEMM_KC*DGEMM_NC, B_, 0, 1); } } free(B_); } #ifndef COLMAJOR #define COLMAJOR 1 #endif int main() { size_t k = 6; size_t n = 8; ptrdiff_t incRowB = COLMAJOR ? 1 : n; ptrdiff_t incColB = COLMAJOR ? k : 1; double *B = malloc(k*n*sizeof(double)); if (!B) { abort(); } initDGeMatrix(k, n, B, incRowB, incColB); printf("k = %zu\n", k); printf("n = %zu\n", n); printf("B =\n"); printDGeMatrix(k, n, B, incRowB, incColB); test_pack_B(k, n, B, incRowB, incColB); free(B); }