Content |
Aufgabe: GEMM-Macro-Kernel
Bezüglich \(M_r\) und \(N_r\) wird die \(m \times n\) Matrix \(C\) partitioniert. Dann entstehen \(m_p = \left\lceil \frac{m}{M_r} \right\rceil\) Blockzeilen und \(n_p = \left\lceil \frac{n}{N_r} \right\rceil\) Blockspalten:
\[C = \left(\begin{array}{c|c|c|c} C_{0,\,0} & C_{0,\,N_r} & \dots & C_{0,\, N_r\cdot (n_p-1)} \\ \hline C_{M_r,\,0} & C_{M_r,\,N_r} & \dots & C_{M_r,\,N_r\cdot (n_p-1)} \\ \hline \vdots & \vdots & & \vdots \\ \hline C_{M_r\cdot (m_p-1),\,0} & C_{M_r\cdot (m_p-1),\,N_r} & \dots & C_{M_r\cdot (m_p-1),\,N_r\cdot (n_p-1)} \\ \end{array}\right)\]Für \(i \in \{0, M_r, \dots, M_r\cdot (m_p-1)\}\) und \(j \in \{0, N_r, \dots, N_r\cdot (n_p-1)\}\) ist der Block \(C_{i,\,j}\) eine \(m_r \times n_r\) Matrix mit
\[m_r =\begin{cases}M_r, & i+M_r < m, \\m - i, & \text{else}\end{cases}\qquad\text{und}\qquad n_r =\begin{cases}N_r, & j+N_r < n, \\n - j, & \text{else.}\end{cases}\]Für die GEMM-Operation
\[C \leftarrow \beta \, C + \alpha\,A B\]liegen die Matrizen \(A\) und \(B\) in einem entsprechend gepackten Format als \(\overline{A}\) und \(\overline{B}\) vor:
-
\(\overline{A}\) ist ein gepackter \(m_c \times k_c\) Block aus \(A\). Dieser besteht aus \(m_p = \left\lceil \frac{m_c}{M_r} \right\rceil\) horizontalen Paneelen:
\[\overline{A} =\left(\begin{array}{c} A_0 \\ \hline A_{M_r} \\ \hline \vdots \\ \hline A_{M_r \cdot (m_p-1)} \end{array}\right)\]Jede Paneele hat die Dimension \(M_r \times k_c\) (Die letzte Paneele wurde beim Packen eventuell mit Nullen auf \(M_r\) Zeilen aufgefüllt).
-
\(\overline{B}\) ist ein gepackter \(k_c \times n_c\) Block aus \(B\). Dieser besteht aus \(n_p = \left\lceil \frac{n_c}{N_r} \right\rceil\) vertikalen Paneelen:
\[\overline{B} =\left(\begin{array}{c|c|c|c} B_0 & B_{N_r} & \dots & B_{N_r\cdot (n_p-1)} \end{array}\right)\]Jede Paneele hat die Dimension \(k_c \times N_r\) (Die letzte Paneele wurde beim Packen eventuell mit Nullen auf \(N_r\) Spalten aufgefüllt).
Das Produkt einer Paneele aus \(\overline{A}\) mit einer Paneele aus \(\overline{B}\) ist somit stets ein Block mit Dimension \(M_r \times N_r\) Im Algorithmus ist somit eine Fallunterscheidung notwendig:
-
\(\overline{C}\) sei eine lokale \(M_r \times N_r\) Matrix
-
for \(i=0, M_r, \dots, M_r\cdot(m_p-1)\)
-
\(m_r = \begin{cases} M_r,& i+M_r < m,\\ m - i,& \text{else} \end{cases}\)
-
for \(j=0, N_r, \dots, N_r\cdot(n_p-1)\)
-
\(n_r = \begin{cases} N_r,& j+N_r < n,\\ n - j,& \text{else} \end{cases}\)
-
if \(m_r = M_r \;\land\; n_r = N_r\)
-
\(C_{i,j} \leftarrow \beta C_{i,j} + \alpha \overline{A}_i \overline{B}_j\)
-
-
else
-
\(\overline{C} \leftarrow \alpha \overline{A}_i \overline{B}_j\)
-
\(C_{i,j} \leftarrow \beta C_{i,j} + \overline{C}_{(0:m_r-1)\times(0:n_r-1)}\)
-
-
-
Hinweise:
-
Das Produkt von Paneelen soll natürlich mit dem Micro-Kernel berechnet werden.
-
Mit \(\overline{C}_{(0:m_r-1)\times(0:n_r-1)}\) bezeichnen wir die ersten \(m_r\) Zeilen und \(n_r\) Spalten von \(\overline{C}\).
Vorlage
Die Vorlage ist wieder unabhängig vom HPC-Projekt. Funktionen wie beispielsweise dgescal oder dgeaxpy wurden manuell vom HPC-Projekt übernommen.
Vervollständigt die Funktion dgemm_macro:
#include <stdio.h> #include <math.h> #include <stdlib.h> #include <sys/times.h> #include <unistd.h> //-- setup and print matrices -------------------------------------------------- void initGeMatrix(int m, int n, double *A, int incRowA, int incColA) { int i, j; for (i=0; i<m; ++i) { for (j=0; j<n; ++j) { A[i*incRowA+j*incColA] = i*n + j + 1; } } } void printGeMatrix(int m, int n, const double *A, int incRowA, int incColA) { int i, j; for (i=0; i<m; ++i) { for (j=0; j<n; ++j) { printf("%10.4lf ", A[i*incRowA+j*incColA]); } printf("\n"); } printf("\n\n"); } //------------------------------------------------------------------------------ void dgecopy(int m, int n, const double *X, int incRowX, int incColX, double *Y, int incRowY, int incColY) { int i, j; for (i=0; i<m; ++i) { for (j=0; j<n; ++j) { Y[i*incRowY+j*incColY] = X[i*incRowX+j*incColX]; } } } void dgescal(int m, int n, double alpha, double *X, int incRowX, int incColX) { int i, j; for (j=0; j<n; ++j) { for (i=0; i<m; ++i) { X[i*incRowX+j*incColX] *= alpha; } } } void dgeaxpy(int m, int n, double alpha, const double *X, int incRowX, int incColX, double *Y, int incRowY, int incColY) { int i, j; if (alpha==0 || m==0 || n==0) { return; } for (j=0; j<n; ++j) { for (i=0; i<m; ++i) { Y[i*incRowY+j*incColY] += alpha*X[i*incRowX+j*incColX]; } } } void dgemm_ilj(int m, int n, int k, double alpha, const double *A, int incRowA, int incColA, const double *B, int incRowB, int incColB, double beta, double *C, int incRowC, int incColC) { int i, j, l; for (i=0; i<m; ++i) { for (l=0; l<k; ++l) { for (j=0; j<n; ++j) { if (l==0) { C[i*incRowC+j*incColC] *= beta; } C[i*incRowC+j*incColC] += alpha*A[i*incRowA+l*incColA] *B[l*incRowB+j*incColB]; } } } } //------------------------------------------------------------------------------ #ifndef DGEMM_MC #define DGEMM_MC 8 #endif #ifndef DGEMM_NC #define DGEMM_NC 9 #endif #ifndef DGEMM_KC #define DGEMM_KC 10 #endif #ifndef DGEMM_MR #define DGEMM_MR 4 #endif #ifndef DGEMM_NR #define DGEMM_NR 3 #endif //------------------------------------------------------------------------------ void dpack_A(int m, int k, const double *A, int incRowA, int incColA, double *p) { int i, i0, j, l, nu; int mp = (m+DGEMM_MR-1) / DGEMM_MR; for (j=0; j<k; ++j) { for (l=0; l<mp; ++l) { for (i0=0; i0<DGEMM_MR; ++i0) { i = l*DGEMM_MR + i0; nu = l*DGEMM_MR*k + j*DGEMM_MR + i0; p[nu] = (i<m) ? A[i*incRowA+j*incColA] : 0; } } } } void dpack_B(int k, int n, const double *B, int incRowB, int incColB, double *p) { int i, j, j0, l, nu; int np = (n+DGEMM_NR-1) / DGEMM_NR; for (l=0; l<np; ++l) { for (j0=0; j0<DGEMM_NR; ++j0) { j = l*DGEMM_NR + j0; for (i=0; i<k; ++i) { nu = l*DGEMM_NR*k + i*DGEMM_NR + j0; p[nu] = (j<n) ? B[i*incRowB+j*incColB] : 0; } } } } void dgemm_micro(int k, double alpha, const double *A, const double *B, double beta, double *C, int incRowC, int incColC) { int i, j, l; double AB[DGEMM_MR*DGEMM_NR]; for (l=0; l<DGEMM_MR*DGEMM_NR; ++l) { AB[l] = 0; } for (l=0; l<k; ++l) { for (j=0; j<DGEMM_NR; ++j) { for (i=0; i<DGEMM_MR; ++i) { AB[i+j*DGEMM_MR] += A[i+l*DGEMM_MR]*B[j+l*DGEMM_NR]; } } } if (beta==0) { for (j=0; j<DGEMM_NR; ++j) { for (i=0; i<DGEMM_MR; ++i) { C[i*incRowC+j*incColC] = alpha*AB[i+j*DGEMM_MR]; } } } else { for (j=0; j<DGEMM_NR; ++j) { for (i=0; i<DGEMM_MR; ++i) { C[i*incRowC+j*incColC] = beta*C[i*incRowC+j*incColC] + alpha*AB[i+j*DGEMM_MR]; } } } } void dgemm_macro(int m, int n, int k, double alpha, const double *A, const double *B, double beta, double *C, int incRowC, int incColC) { double C_[DGEMM_MR*DGEMM_NR]; int i, j; const int MR = DGEMM_MR; const int NR = DGEMM_NR; for (i=0; i<m; i+=MR) { int mr = (i+MR<m) ? MR : m - i; // ... your code here .... } } //------------------------------------------------------------------------------ #ifndef DIM_M #define DIM_M 8 #endif #ifndef DIM_N #define DIM_N 7 #endif #ifndef DIM_K #define DIM_K 12 #endif #ifndef ALPHA #define ALPHA 2 #endif #ifndef BETA #define BETA 2 #endif #ifndef ROWMAJOR_A #define INCROW_A 1 #define INCCOL_A DIM_M #else #define INCROW_A DIM_K #define INCCOL_A 1 #endif #ifndef ROWMAJOR_B #define INCROW_B 1 #define INCCOL_B DIM_K #else #define INCROW_B DIM_N #define INCCOL_B 1 #endif #ifndef ROWMAJOR_C #define INCROW_C 1 #define INCCOL_C DIM_M #else #define INCROW_C DIM_N #define INCCOL_C 1 #endif double A[DIM_M*DIM_K]; double B[DIM_K*DIM_N]; double A_[DGEMM_MC*DGEMM_KC]; double B_[DGEMM_KC*DGEMM_NC]; double C[DIM_M*DIM_N]; double C_ref[DIM_M*DIM_N]; #define MIN(X,Y) (X) < (Y) ? (X) : (Y) int main() { initGeMatrix(DIM_M, DIM_K, A, INCROW_A, INCCOL_A); initGeMatrix(DIM_K, DIM_N, B, INCROW_B, INCCOL_B); initGeMatrix(DIM_M, DIM_N, C, INCROW_C, INCCOL_C); dgecopy(DIM_M, DIM_N, C, INCROW_C, INCCOL_C, C_ref, INCROW_C, INCCOL_C); // pack block from A dpack_A(MIN(DIM_M, DGEMM_MC), MIN(DIM_K, DGEMM_KC), A, INCROW_A, INCCOL_A, A_); // pack block from B dpack_B(MIN(DIM_K, DGEMM_KC), MIN(DIM_N, DGEMM_NC), B, INCROW_B, INCCOL_B, B_); printf("A =\n"); printGeMatrix(DIM_M, DIM_K, A, INCROW_A, INCCOL_A); printf("B =\n"); printGeMatrix(DIM_K, DIM_N, B, INCROW_B, INCCOL_B); printf("C =\n"); printGeMatrix(DIM_M, DIM_N, C, INCROW_C, INCCOL_C); // pack block from A dpack_A(MIN(DIM_M, DGEMM_MC), MIN(DIM_K, DGEMM_KC), A, INCROW_A, INCCOL_A, A_); // pack block from B dpack_B(MIN(DIM_K, DGEMM_KC), MIN(DIM_N, DGEMM_NC), B, INCROW_B, INCCOL_B, B_); // multiply with macro-kernel dgemm_macro(MIN(DIM_M, DGEMM_MC), MIN(DIM_N, DGEMM_NC), MIN(DIM_K, DGEMM_KC), (double)ALPHA, A_, B_, (double)BETA, C, INCROW_C, INCCOL_C); printf("%lf*C + %lf*A*B =\n", (double)ALPHA, (double)BETA); printGeMatrix(DIM_M, DIM_N, C, INCROW_C, INCCOL_C); printf("C_ref =\n"); dgemm_ilj(MIN(DIM_M, DGEMM_MC), MIN(DIM_N, DGEMM_NC), MIN(DIM_K, DGEMM_KC), (double)ALPHA, A, INCROW_A, INCCOL_A, B, INCROW_B, INCCOL_B, (double)BETA, C_ref, INCROW_C, INCCOL_C); printGeMatrix(DIM_M, DIM_N, C_ref, INCROW_C, INCCOL_C); return 0; }