Packing Blocks from Matrix A
In the cache optimized GEMM-Operation \(\beta C + \alpha A B \to C\) the matrix \(A\) gets partitioned into blocks with maximal dimension \(M_c \times K_c\). Each block \(A_{i,l}\) of \(A\) gets packed into col-major horizontal panels with \(M_r\) rows.
Exercise
-
Implement function dpack_A in the test program below. Before you start coding: Make yourself familiar with the test program below.
Simple Test Program
#include <stdlib.h> // for malloc(), free(), abort() #include <stdio.h> // for printf() #include <stddef.h> // for size_t, ptrdiff_t //-- (literal) constants for DGEMM --------------------------------------------- #ifndef DGEMM_MC #define DGEMM_MC 4 #endif #ifndef DGEMM_KC #define DGEMM_KC 5 #endif #ifndef DGEMM_MR #define DGEMM_MR 2 #endif #if (DGEMM_MC % DGEMM_MR != 0) #error "DGEMM_MC must be a multiple of DEGMM_MR." #endif //-- packing blocks of A ------------------------------------------------------- void dpack_A(size_t mc, size_t kc, const double *A, ptrdiff_t incRowA, ptrdiff_t incColA, double *A_) { // TODO: Your code } //-- auxiliary functions and test program -------------------------------------- void initDGeMatrix(size_t m, size_t n, double *A, ptrdiff_t incRowA, ptrdiff_t incColA) { for (size_t i=0; i<m; ++i) { for (size_t j=0; j<n; ++j) { A[i*incRowA + j*incColA] = i*n + j + 1; } } } void printDGeMatrix(size_t m, size_t n, const double *A, ptrdiff_t incRowA, ptrdiff_t incColA) { for (size_t i=0; i<m; ++i) { for (size_t j=0; j<n; ++j) { printf("%9.2lf ", A[i*incRowA + j*incColA]); } printf("\n"); } printf("\n"); } void test_pack_A(size_t m, size_t k, double *A, ptrdiff_t incRowA, ptrdiff_t incColA) { double *A_ = malloc(DGEMM_MC*DGEMM_KC*sizeof(double)); if (! A_) { abort(); } size_t mb = (m+DGEMM_MC-1) / DGEMM_MC; size_t kb = (k+DGEMM_KC-1) / DGEMM_KC; size_t mc_ = m % DGEMM_MC; size_t kc_ = k % DGEMM_KC; printf("DGEMM_MC = %d\n", DGEMM_MC); printf("DGEMM_KC = %d\n", DGEMM_KC); printf("DGEMM_MR = %d\n", DGEMM_MR); for (size_t ib=0; ib<mb; ++ib) { size_t mc = ib<mb-1 || mc_==0 ? DGEMM_MC : mc_; for (size_t lb=0; lb<kb; ++lb) { size_t kc = lb<kb-1 || kc_==0 ? DGEMM_KC : kc_; printf("packing %zu x %zu block from A(%zu, %zu)\n", mc, kc, ib*DGEMM_MC, lb*DGEMM_KC); dpack_A(mc, kc, &A[ib*DGEMM_MC*incRowA + lb*DGEMM_KC*incColA], incRowA, incColA, A_); printf("Panels in buffer A_:\n"); printDGeMatrix(DGEMM_MR, DGEMM_MC*DGEMM_KC/DGEMM_MR, A_, 1, DGEMM_MR); printf("A_ as array:\n"); printDGeMatrix(1, DGEMM_MC*DGEMM_KC, A_, 0, 1); } } free(A_); } #ifndef COLMAJOR #define COLMAJOR 1 #endif int main() { size_t m = 7; size_t k = 8; ptrdiff_t incRowA = COLMAJOR ? 1 : k; ptrdiff_t incColA = COLMAJOR ? m : 1; double *A = malloc(m*k*sizeof(double)); if (!A) { abort(); } initDGeMatrix(m, k, A, incRowA, incColA); printf("m = %zu\n", m); printf("k = %zu\n", k); printf("A =\n"); printDGeMatrix(m, k, A, incRowA, incColA); test_pack_A(m, k, A, incRowA, incColA); free(A); }