GEMM Macro Kernel
The GEMM macro kernel is supposed to perform the GEMM operation
\[C \leftarrow \beta \cdot C + \alpha \cdot A \cdot B\]for the following special case:
-
\(C\) is a \(m_c \times n_c\) matrix.
-
Matrices \(A\) and \(B\) are given in a packed storage format:
-
Originally matrix \(A\) was a \(m_c \times k_c\) matrix. It was partitioned into horizontal \(M_r \times k_c\) panels and these panels were packed into buffer. The macro kernel has only access to this buffer. Eventually the last panel was zero padded.
-
Originally matrix \(B\) was a \(k_c \times n_c\) matrix. It was partitioned into vertical \(k_c \times N_r\) panels and these panels were packed into buffer. The macro kernel has only access to this buffer. Eventually the last panel was zero padded.
-
The macro kernel performs its operation by using the micro kernel (multiplying panels of \(A\) with panels of \(B\)).
Exercise
-
Implement function dgemm_macro in the test program below. Before you start coding: Make yourself familiar with the test program below.
Simple Test Program
#include <stdlib.h> // for malloc(), free(), abort() #include <stdio.h> // for printf() #include <stddef.h> // for size_t, ptrdiff_t #include <stdbool.h> // for typedef bool #include <math.h> // for nan() //-- (literal) constants for DGEMM --------------------------------------------- #ifndef DGEMM_MC #define DGEMM_MC 4 #endif #ifndef DGEMM_NC #define DGEMM_NC 6 #endif #ifndef DGEMM_KC #define DGEMM_KC 5 #endif #ifndef DGEMM_MR #define DGEMM_MR 2 #endif #ifndef DGEMM_NR #define DGEMM_NR 3 #endif #if (DGEMM_MC % DGEMM_MR != 0) #error "DGEMM_MC must be a multiple of DEGMM_MR." #endif #if (DGEMM_NC % DGEMM_NR != 0) #error "DGEMM_NC must be a multiple of DEGMM_NR." #endif //-- packing blocks of A ------------------------------------------------------- void pack_A(size_t mc, size_t kc, const double *A, ptrdiff_t incRowA, ptrdiff_t incColA, double *A_) { size_t mb = mc / DGEMM_MR; // pack whole panels of A for (size_t ib=0; ib<mb; ++ib) { for (size_t j=0; j<kc; ++j) { for (size_t i=0; i<DGEMM_MR; ++i) { A_[ib*DGEMM_MR*kc + j*DGEMM_MR + i] = A[(ib*DGEMM_MR+i)*incRowA + j*incColA]; } } } // if necessary: pack last panel with zero padding if (mb*DGEMM_MR<mc) { size_t mr = mc % DGEMM_MR; for (size_t j=0; j<kc; ++j) { for (size_t i=0; i<mr; ++i) { A_[mb*DGEMM_MR*kc + j*DGEMM_MR + i] = A[(mb*DGEMM_MR+i)*incRowA + j*incColA]; } for (size_t i=mr; i<DGEMM_MR; ++i) { A_[mb*DGEMM_MR*kc + j*DGEMM_MR + i] = 0; } } } } //-- packing blocks of B ------------------------------------------------------- void pack_B(size_t kc, size_t nc, const double *B, ptrdiff_t incRowB, ptrdiff_t incColB, double *B_) { size_t nb = nc / DGEMM_NR; // pack whole panels of B for (size_t jb=0; jb<nb; ++jb) { for (size_t i=0; i<kc; ++i) { for (size_t j=0; j<DGEMM_NR; ++j) { B_[jb*DGEMM_NR*kc + i*DGEMM_NR + j] = B[i*incRowB + (jb*DGEMM_NR+j)*incColB]; } } } // if necessary: pack last panel with zero padding if (nb*DGEMM_NR<nc) { size_t nr = nc % DGEMM_NR; for (size_t i=0; i<kc; ++i) { for (size_t j=0; j<nr; ++j) { B_[nb*DGEMM_NR*kc + i*DGEMM_NR + j] = B[i*incRowB + (nb*DGEMM_NR+j)*incColB]; } for (size_t j=nr; j<DGEMM_NR; ++j) { B_[nb*DGEMM_NR*kc + i*DGEMM_NR + j] = 0; } } } } //-- GEMM micro kernel (reference implementation) ------------------------------ void dgemm_micro_ref(size_t k, double alpha, const double *A, const double *B, double beta, double *C, ptrdiff_t incRowC, ptrdiff_t incColC) { double R[DGEMM_MR*DGEMM_NR]; // R <- 0 for (size_t i=0; i<DGEMM_MR; ++i) { for (size_t j=0; j<DGEMM_NR; ++j) { R[i*DGEMM_NR+j] = 0; } } // R <- R + A*B for (size_t l=0; l<k; ++l) { for (size_t i=0; i<DGEMM_MR; ++i) { for (size_t j=0; j<DGEMM_NR; ++j) { R[i*DGEMM_NR+j] += A[i+l*DGEMM_MR]*B[l*DGEMM_NR+j]; } } } // R <- alpha*R for (size_t i=0; i<DGEMM_MR; ++i) { for (size_t j=0; j<DGEMM_NR; ++j) { R[i*DGEMM_NR+j] *= alpha; } } // C <- beta*C + R if (beta==0) { for (size_t i=0; i<DGEMM_MR; ++i) { for (size_t j=0; j<DGEMM_NR; ++j) { C[i*incRowC+j*incColC] = R[i*DGEMM_NR+j]; } } } else { for (size_t i=0; i<DGEMM_MR; ++i) { for (size_t j=0; j<DGEMM_NR; ++j) { C[i*incRowC+j*incColC] *= beta; C[i*incRowC+j*incColC] += R[i*DGEMM_NR+j]; } } } } //-- GEMM macro kernel (reference implementation) ------------------------------ void ge_dscal(size_t m, size_t n, double alpha, double *A, ptrdiff_t incRowA, ptrdiff_t incColA) { if (alpha!=0) { for (size_t j=0; j<n; ++j) { for (size_t i=0; i<m; ++i) { A[i*incRowA + j*incColA] *= alpha; } } } else { for (size_t j=0; j<n; ++j) { for (size_t i=0; i<m; ++i) { A[i*incRowA + j*incColA] = 0; } } } } void ge_daxpy(size_t m, size_t n, double alpha, const double *A, ptrdiff_t incRowA, ptrdiff_t incColA, double *B, ptrdiff_t incRowB, ptrdiff_t incColB) { if (m==0 || n==0 || alpha==0) { return; } for (size_t j=0; j<n; ++j) { for (size_t i=0; i<m; ++i) { B[i*incRowB + j*incColB] += alpha*A[i*incRowA + j*incColA]; } } } void dgemm_macro(size_t mc, size_t nc, size_t kc, double alpha, const double *A, const double *B, double beta, double *C, ptrdiff_t incRowC, ptrdiff_t incColC) { double AB[DGEMM_MR*DGEMM_NR]; /* TODO: Your implementation */ } //-- auxiliary functions and test program -------------------------------------- void initDGeMatrix(size_t m, size_t n, bool withNan, double *A, ptrdiff_t incRowA, ptrdiff_t incColA) { for (size_t i=0; i<m; ++i) { for (size_t j=0; j<n; ++j) { A[i*incRowA + j*incColA] = withNan ? nan("") : i*n + j + 1; } } } void printDGeMatrix(size_t m, size_t n, const double *A, ptrdiff_t incRowA, ptrdiff_t incColA) { for (size_t i=0; i<m; ++i) { for (size_t j=0; j<n; ++j) { printf("%9.2lf ", A[i*incRowA + j*incColA]); } printf("\n"); } printf("\n"); } void test_macro(size_t m, size_t n, size_t k, double alpha, const double *A, ptrdiff_t incRowA, ptrdiff_t incColA, const double *B, ptrdiff_t incRowB, ptrdiff_t incColB, double beta, double *C, ptrdiff_t incRowC, ptrdiff_t incColC) { double *A_ = malloc(DGEMM_MC*DGEMM_KC*sizeof(double)); double *B_ = malloc(DGEMM_KC*DGEMM_NC*sizeof(double)); if (!A_ || !B_) { abort(); } size_t mc = m < DGEMM_MC ? m : DGEMM_MC; size_t nc = n < DGEMM_NC ? n : DGEMM_NC; size_t kc = k < DGEMM_KC ? k : DGEMM_KC; printf("mc = %zu\n", mc); printf("nc = %zu\n", nc); printf("kc = %zu\n", kc); printf("Block from A =\n"); printDGeMatrix(mc, kc, A, incRowA, incColA); printf("Block from B =\n"); printDGeMatrix(kc, nc, B, incRowB, incColB); printf("Block from C =\n"); printDGeMatrix(mc, nc, C, incRowC, incColC); pack_A(mc, kc, A, incRowA, incColA, A_); pack_B(kc, nc, B, incRowB, incColB, B_); dgemm_macro(mc, nc, kc, alpha, A_, B_, beta, C, incRowC, incColC); printf("C <- %5.2lf * C + %5.2lf * A * B\n", beta, alpha); printf("C =\n"); printDGeMatrix(mc, nc, C, incRowC, incColC); free(A_); free(B_); } #ifndef COLMAJOR #define COLMAJOR 1 #endif int main() { size_t m = 2; size_t n = 4; size_t k = 4; ptrdiff_t incRowA = COLMAJOR ? 1 : k; ptrdiff_t incColA = COLMAJOR ? m : 1; ptrdiff_t incRowB = COLMAJOR ? 1 : n; ptrdiff_t incColB = COLMAJOR ? k : 1; ptrdiff_t incRowC = COLMAJOR ? 1 : n; ptrdiff_t incColC = COLMAJOR ? m : 1; double *A = malloc(m*k*sizeof(double)); double *B = malloc(k*n*sizeof(double)); double *C = malloc(m*n*sizeof(double)); if (!A || !B || !C) { abort(); } initDGeMatrix(m, k, false, A, incRowA, incColA); initDGeMatrix(k, n, false, B, incRowB, incColB); initDGeMatrix(m, n, true, C, incRowC, incColC); double alpha = 1; double beta = 0; test_macro(m, n, k, alpha, A, incRowA, incColA, B, incRowB, incColB, beta, C, incRowC, incColC); free(A); free(B); free(C); }