GEMM Frame Algorithm
Implement the GEMM frame algorithm that uses packing blocks of \(A\) and \(B\) as well as the GEMM macro kernel.
You can use the following skeleton:
#include <stddef.h> #include <stdio.h> #include <stdlib.h> void initGeMatrix(size_t m, size_t n, double *A, ptrdiff_t incRowA, ptrdiff_t incColA) { for (size_t i=0; i<m; ++i) { for (size_t j=0; j<n; ++j) { A[i*incRowA+j*incColA] = i*n + j + 1; } } } void printGeMatrix(size_t m, size_t n, const double *A, ptrdiff_t incRowA, ptrdiff_t incColA) { for (size_t i=0; i<m; ++i) { for (size_t j=0; j<n; ++j) { printf("%11.4lf ", A[i*incRowA+j*incColA]); } printf("\n"); } printf("\n"); } void dgemm_ref(size_t m, size_t n, size_t k, double alpha, const double *A, ptrdiff_t incRowA, ptrdiff_t incColA, const double *B, ptrdiff_t incRowB, ptrdiff_t incColB, double beta, double *C, ptrdiff_t incRowC, ptrdiff_t incColC) { size_t i, j, l; if (beta!=1) { if (beta!=0) { for (i=0; i<m; ++i) { for (j=0; j<n; ++j) { C[i*incRowC+j*incColC] *= beta; } } } else { for (i=0; i<m; ++i) { for (j=0; j<n; ++j) { C[i*incRowC+j*incColC] = 0; } } } } if (alpha!=0) { for (i=0; i<m; ++i) { for (j=0; j<n; ++j) { for (l=0; l<k; ++l) { C[i*incRowC+j*incColC] += alpha*A[i*incRowA+l*incColA] *B[l*incRowB+j*incColB]; } } } } } #ifndef DGEMM_MR #define DGEMM_MR 4 #endif #ifndef DGEMM_NR #define DGEMM_NR 5 #endif #ifndef DGEMM_MC #define DGEMM_MC 8 #endif #ifndef DGEMM_NC #define DGEMM_NC 10 #endif #ifndef DGEMM_KC #define DGEMM_KC 9 #endif void dgepack_A(size_t m, size_t k, const double *A, ptrdiff_t incRowA, ptrdiff_t incColA, double *p) { size_t mb = (m+DGEMM_MR-1)/DGEMM_MR; for (size_t l=0; l<k; ++l) { for (size_t i1=0; i1<mb; ++i1) { for (size_t i0=0; i0<DGEMM_MR; ++i0) { size_t i = i1*DGEMM_MR + i0; size_t nu = i1*DGEMM_MR*k + l*DGEMM_MR + i0; p[nu] = (i<m) ? A[i*incRowA + l*incColA] : 0; } } } } void dgepack_B(size_t k, size_t n, const double *B, ptrdiff_t incRowB, ptrdiff_t incColB, double *p) { size_t nb = (n+DGEMM_NR-1)/DGEMM_NR; for (size_t j1=0; j1<nb; ++j1) { for (size_t j0=0; j0<DGEMM_NR; ++j0) { for (size_t l=0; l<k; ++l) { size_t j = j1*DGEMM_NR + j0; size_t nu = j1*DGEMM_NR*k + l*DGEMM_NR + j0; p[nu] = (j<n) ? B[l*incRowB + j*incColB] : 0; } } } } void dgemm_micro(size_t k, double alpha, const double *A, const double *B, double beta, double *C, ptrdiff_t incRowC, ptrdiff_t incColC) { double AB[DGEMM_MR*DGEMM_NR]; // AB <- A*B for (size_t i=0; i<DGEMM_MR*DGEMM_NR; ++i) { AB[i] = 0; } for (size_t l=0; l<k; ++l) { for (size_t i=0; i<DGEMM_MR; ++i) { for (size_t j=0; j<DGEMM_NR; ++j) { AB[i+j*DGEMM_MR] += A[i+l*DGEMM_MR]*B[l*DGEMM_NR+j]; } } } // C <- beta*C if (beta!=1) { if (beta!=0) { for (size_t i=0; i<DGEMM_MR; ++i) { for (size_t j=0; j<DGEMM_NR; ++j) { C[i*incRowC+j*incColC] *= beta; } } } else { for (size_t i=0; i<DGEMM_MR; ++i) { for (size_t j=0; j<DGEMM_NR; ++j) { C[i*incRowC+j*incColC] = 0; } } } } // C <- C + alpha*AB for (size_t i=0; i<DGEMM_MR; ++i) { for (size_t j=0; j<DGEMM_NR; ++j) { C[i*incRowC+j*incColC] += alpha*AB[i+j*DGEMM_MR]; } } } void dgescal(size_t m, size_t n, double alpha, double *X, size_t incRowX, size_t incColX) { if (alpha==1) { return; } if (alpha!=0) { for (size_t i=0; i<m; ++i) { for (size_t j=0; j<n; ++j) { X[i*incRowX+j*incColX] *= alpha; } } } else { for (size_t i=0; i<m; ++i) { for (size_t j=0; j<n; ++j) { X[i*incRowX+j*incColX] = 0; } } } } void dgeaxpy(size_t m, size_t n, double alpha, const double *X, size_t incRowX, size_t incColX, double *Y, size_t incRowY, size_t incColY) { if (alpha==0) { return; } for (size_t i=0; i<m; ++i) { for (size_t j=0; j<n; ++j) { Y[i*incRowY+j*incColY] += alpha*X[i*incRowX+j*incColX]; } } } void dgemm_macro(size_t m, size_t n, size_t k, double alpha, const double *A, const double *B, double beta, double *C, ptrdiff_t incRowC, ptrdiff_t incColC) { double AB[DGEMM_MR*DGEMM_NR]; size_t mb = (m+DGEMM_MR-1) / DGEMM_MR; size_t nb = (n+DGEMM_NR-1) / DGEMM_NR; size_t mr = m % DGEMM_MR; size_t nr = n % DGEMM_NR; for (size_t i=0; i<mb; ++i) { size_t m_ = (i<mb-1 || mr==0) ? DGEMM_MR : mr; for (size_t j=0; j<nb; ++j) { size_t n_ = (j<nb-1 || nr==0) ? DGEMM_NR : nr; if (m_==DGEMM_MR && n_==DGEMM_NR) { dgemm_micro(k, alpha, &A[i*DGEMM_MR*k], &B[j*k*DGEMM_NR], beta, &C[i*DGEMM_MR*incRowC+j*DGEMM_NR*incColC], incRowC, incColC); } else { dgemm_micro(k, alpha, &A[i*DGEMM_MR*k], &B[j*k*DGEMM_NR], 0, AB, 1, DGEMM_MR); dgescal(DGEMM_MR, DGEMM_NR, beta, &C[i*DGEMM_MR*incRowC+j*DGEMM_NR*incColC], incRowC, incColC); dgeaxpy(DGEMM_MR, DGEMM_NR, 1, AB, 1, DGEMM_MR, &C[i*DGEMM_MR*incRowC+j*DGEMM_NR*incColC], incRowC, incColC); } } } } void dgemm_frame(size_t m, size_t n, size_t k, double alpha, const double *A, ptrdiff_t incRowA, ptrdiff_t incColA, const double *B, ptrdiff_t incRowB, ptrdiff_t incColB, double beta, double *C, ptrdiff_t incRowC, ptrdiff_t incColC) { if (k==0 || alpha==0) { dgescal(m, n, beta, C, incRowC, incColC); return; } // your code here } int main() { size_t m = 2*DGEMM_MC - 1; size_t n = 2*DGEMM_NC - 2; size_t k = 2*DGEMM_KC - 3; ptrdiff_t incRowA = 1; ptrdiff_t incColA = m; ptrdiff_t incRowB = 1; ptrdiff_t incColB = k; ptrdiff_t incRowC = 1; ptrdiff_t incColC = m; double *A = malloc(m*k*sizeof(*A)); double *B = malloc(k*n*sizeof(*B)); double C0[m*n]; double C1[m*n]; initGeMatrix(m, k, A, incRowA, incColA); initGeMatrix(k, n, B, incRowB, incColB); initGeMatrix(m, n, C0, incRowC, incColC); initGeMatrix(m, n, C1, incRowC, incColC); printf("A=\n"); printGeMatrix(m, k, A, incRowA, incColA); printf("B=\n"); printGeMatrix(k, n, B, incRowB, incColB); printf("C=\n"); printGeMatrix(m, n, C0, incRowC, incColC); double alpha = 1; double beta = 1; dgemm_ref(m, n, k, alpha, A, incRowA, incColA, B, incRowB, incColB, beta, C0, incRowC, incColC); dgemm_frame(m, n, k, alpha, A, incRowA, incColA, B, incRowB, incColB, beta, C1, incRowC, incColC); printf("gemm_ref computed C=\n"); printGeMatrix(m, n, C0, incRowC, incColC); printf("gemm_macro computed C=\n"); printGeMatrix(m, n, C1, incRowC, incColC); free(A); free(B); }