Content |
Aufgabe: GEMM-Micro-Kernel
Die GEMM-Operation \(C \leftarrow \beta\,C + \alpha\,A B\) soll für folgenden Spezialfall programmiert werden:
-
\(C\) ist eine \(M_r \times N_r\) Matrix im Full-Storage-Format.
-
\(A\) ist eine \(M_r \times k\) Col-Major Matrix.
-
\(B\) ist eine \(k \times N_r\) Row-Major Matrix.
-
Die Elemente von \(A\) und \(B\) liegen zusammenhängend im Speicher.
Algorithmus
Bei der Implementierung soll seine lokale \(M_r \times N_r\) Matrix \(\overline{AB}\) benutzt werden die zunächst mit Null initialisiert und dann mit dem Produkt \(A \cdot B\) überschrieben wird:
-
\(\overline{AB} \leftarrow \mathbf{0}\)
-
\(\overline{AB} \leftarrow A \cdot B\)
-
Falls \(\beta = 0\)
-
\(C \leftarrow \alpha\,\overline{AB}\)
-
-
Sonst
-
\(C \leftarrow \beta\,C + \alpha\,\overline{AB}\)
-
Weitere Anforderungen:
-
Die Funktion soll keine Funktionsaufrufe beinhalten.
-
Bei geschachtelten Schleifen sollen die innersten Schleifen vom Compiler ausgrollt werden können.
Vorlage
#include <stdio.h> #include <math.h> #include <stdlib.h> #include <sys/times.h> #include <unistd.h> //-- setup and print matrices -------------------------------------------------- void initGeMatrix(int m, int n, double *A, int incRowA, int incColA) { int i, j; for (i=0; i<m; ++i) { for (j=0; j<n; ++j) { A[i*incRowA+j*incColA] = i*n + j + 1; } } } void printGeMatrix(int m, int n, const double *A, int incRowA, int incColA) { int i, j; for (i=0; i<m; ++i) { for (j=0; j<n; ++j) { printf("%10.4lf ", A[i*incRowA+j*incColA]); } printf("\n"); } printf("\n\n"); } //------------------------------------------------------------------------------ #ifndef DGEMM_MC #define DGEMM_MC 8 #endif #ifndef DGEMM_NC #define DGEMM_NC 9 #endif #ifndef DGEMM_KC #define DGEMM_KC 10 #endif #ifndef DGEMM_MR #define DGEMM_MR 4 #endif #ifndef DGEMM_NR #define DGEMM_NR 3 #endif //------------------------------------------------------------------------------ void dpack_A(int m, int k, const double *A, int incRowA, int incColA, double *p) { int i, i0, j, l, nu; int mp = (m+DGEMM_MR-1) / DGEMM_MR; for (j=0; j<k; ++j) { for (l=0; l<mp; ++l) { for (i0=0; i0<DGEMM_MR; ++i0) { i = l*DGEMM_MR + i0; nu = l*DGEMM_MR*k + j*DGEMM_MR + i0; p[nu] = (i<m) ? A[i*incRowA+j*incColA] : 0; } } } } void dpack_B(int k, int n, const double *B, int incRowB, int incColB, double *p) { int i, j, j0, l, nu; int np = (n+DGEMM_NR-1) / DGEMM_NR; for (l=0; l<np; ++l) { for (j0=0; j0<DGEMM_NR; ++j0) { j = l*DGEMM_NR + j0; for (i=0; i<k; ++i) { nu = l*DGEMM_NR*k + i*DGEMM_NR + j0; p[nu] = (j<n) ? B[i*incRowB+j*incColB] : 0; } } } } void dgemm_micro(int k, double alpha, const double *A, const double *B, double beta, double *C, int incRowC, int incColC) { int i, j, l; double AB[DGEMM_MR*DGEMM_NR]; // ... your code here ... } //------------------------------------------------------------------------------ void dgecopy(int m, int n, const double *X, int incRowX, int incColX, double *Y, int incRowY, int incColY) { int i, j; for (i=0; i<m; ++i) { for (j=0; j<n; ++j) { Y[i*incRowY+j*incColY] = X[i*incRowX+j*incColX]; } } } void dgemm_ilj(int m, int n, int k, double alpha, const double *A, int incRowA, int incColA, const double *B, int incRowB, int incColB, double beta, double *C, int incRowC, int incColC) { int i, j, l; for (i=0; i<m; ++i) { for (l=0; l<k; ++l) { for (j=0; j<n; ++j) { if (l==0) { C[i*incRowC+j*incColC] *= beta; } C[i*incRowC+j*incColC] += alpha*A[i*incRowA+l*incColA] *B[l*incRowB+j*incColB]; } } } } //------------------------------------------------------------------------------ #ifndef DIM_K #define DIM_K 12 #endif #ifndef ALPHA #define ALPHA 2 #endif #ifndef BETA #define BETA 2 #endif #ifndef ROWMAJOR #define INCROW_C 1 #define INCCOL_C DGEMM_MR #else #define INCROW_C DGEMM_NR #define INCCOL_C 1 #endif double A_panel[DIM_K*DGEMM_MR]; double B_panel[DIM_K*DGEMM_NR]; double C[DGEMM_MR*DGEMM_NR]; double C_ref[DGEMM_MR*DGEMM_NR]; int main() { initGeMatrix(DGEMM_MR, DIM_K, A_panel, 1, DGEMM_MR); initGeMatrix(DIM_K, DGEMM_NR, B_panel, DGEMM_NR, 1); initGeMatrix(DGEMM_MR, DGEMM_NR, C, INCROW_C, INCCOL_C); dgecopy(DGEMM_MR, DGEMM_NR, C, INCROW_C, INCCOL_C, C_ref, INCROW_C, INCCOL_C); printf("A_panel =\n"); printGeMatrix(DGEMM_MR, DIM_K, A_panel, 1, DGEMM_MR); printf("B_panel =\n"); printGeMatrix(DIM_K, DGEMM_NR, B_panel, DGEMM_NR, 1); printf("C =\n"); printGeMatrix(DGEMM_MR, DGEMM_NR, C, INCROW_C, INCCOL_C); /* ... your code: call dgemm_micro! */ printf("%lf*C + %lf*A*B =\n", (double)ALPHA, (double)BETA); printGeMatrix(DGEMM_MR, DGEMM_NR, C, INCROW_C, INCCOL_C); printf("C_ref =\n"); dgemm_ilj(DGEMM_MR, DGEMM_NR, DIM_K, (double)ALPHA, A_panel, 1, DGEMM_MR, B_panel, DGEMM_NR, 1, (double)BETA, C_ref, INCROW_C, INCCOL_C); printGeMatrix(DGEMM_MR, DGEMM_NR, C_ref, INCROW_C, INCCOL_C); return 0; }