Content |
Das Finale
Vom sogenannten Frame-Algorithmus soll der Makro-Kernel benutzt werden, um die gesammte GEMM-Operation durchzuführen. Dies soll analog zur gemm_jli Variante von page01 geschehen. Statt
-
For \(j=0, \dots, n-1\)
-
For \(\ell=0, \dots, k-1\)
-
For \(i=0, \dots, m-1\)
-
Falls \(\ell = 0 \)
-
\(c_{i,j} \leftarrow \beta c_{i,j}\)
-
-
\(c_{i,j} \leftarrow c_{i,j} + \alpha a_{i,\ell} b_{\ell, j}\)
-
-
-
wird dies blockweise durchgeführt. Die Matrizen werden dabei bezüglich \(M_c\), \(N_c\) und \(K_c\) partitioniert:
-
Die \(m \times k\) Matrix \(A\) bezüglich \(M_c\) und \(K_c\).
\[A = \left(\begin{array}{c|c|c|c} A_{0,\,0} & A_{0,\,K_c} & \dots & A_{0,\, K_c\cdot (k_b-1)} \\ \hline A_{M_c,\,0} & A_{M_c,\,K_c} & \dots & A_{M_c,\,K_c\cdot (k_b-1)} \\ \hline \vdots & \vdots & & \vdots \\ \hline A_{M_c\cdot (m_b-1),\,0} & A_{M_c\cdot (m_b-1),\,K_c} & \dots & A_{M_c\cdot (m_b-1),\,K_c\cdot (k_b-1)} \\ \end{array}\right)\] -
Die \(k \times n\) Matrix \(B\) bezüglich \(K_c\) und \(N_c\).
\[B = \left(\begin{array}{c|c|c|c} B_{0,\,0} & B_{0,\,N_c} & \dots & B_{0,\, N_c\cdot (n_b-1)} \\ \hline B_{K_c,\,0} & B_{K_c,\,N_c} & \dots & B_{K_c,\,N_c\cdot (n_b-1)} \\ \hline \vdots & \vdots & & \vdots \\ \hline B_{K_c\cdot (k_b-1),\,0} & B_{K_c\cdot (k_b-1),\,N_c} & \dots & B_{K_c\cdot (k_b-1),\,N_c\cdot (n_b-1)} \\ \end{array}\right)\] -
Die \(m \times n\) Matrix \(C\) bezüglich \(M_c\) und \(N_c\).
\[C = \left(\begin{array}{c|c|c|c} C_{0,\,0} & C_{0,\,N_c} & \dots & C_{0,\, N_c\cdot (n_b-1)} \\ \hline C_{M_c,\,0} & C_{M_c,\,N_c} & \dots & C_{M_c,\,N_c\cdot (n_b-1)} \\ \hline \vdots & \vdots & & \vdots \\ \hline C_{M_c\cdot (m_b-1),\,0} & C_{M_c\cdot (m_b-1),\,N_c} & \dots & C_{M_c\cdot (m_b-1),\,N_c\cdot (n_b-1)} \\ \end{array}\right)\]
Für die Anzahl der Blöcke gilt dabei formal:
-
\(m_b = \left\lceil \frac{m}{M_c} \right\rceil\)
-
\(n_b = \left\lceil \frac{n}{N_c} \right\rceil\)
-
\(k_b = \left\lceil \frac{k}{K_c} \right\rceil\)
Als Puffer zum Packen von Blöcken stehen \(\overline{A}\) für \(M_c \cdot K_c\) und \(\overline{B}\) für \(K_c \cdot N_c\) Elemente zur Verfügung.
-
For \(j=0, N_c, \dots, N_c\cdot(n_b-1)\)
-
\(n_c = \begin{cases} N_c,& j+N_c < n,\\ n - j,& \text{else} \end{cases}\)
-
For \(\ell=0, K_c,\dots, K_c\cdot(k_b-1)\)
-
\(\tilde{\beta} = \begin{cases} \beta, &\ell=0,\\ 1, &\text{sonst.} \end{cases}\)
-
\(\text{pack}_B:\; \overline{B} \leftarrow B_{\ell,\,j}\)
-
For \(i=0, M_c, \dots, M_c\cdot(m_b-1)\)
-
\(m_c = \begin{cases} M_c,& i+M_c < m,\\ m - i,& \text{else} \end{cases}\)
-
\(\text{pack}_A:\; \overline{A} \leftarrow A_{i,\,\ell}\)
-
\(C_{i,j} \leftarrow \beta C_{i,j} + \alpha \overline{A} \overline{B}\)
-
-
-
Vorlage
Ergänzt die Prozedur dgemm:
#include <stdio.h> #include <math.h> #include <stdlib.h> #include <sys/times.h> #include <unistd.h> //-- setup and print matrices -------------------------------------------------- double walltime() { struct tms ts; static double ClockTick=0.0; if (ClockTick==0.0) { ClockTick = 1.0 / ((double) sysconf(_SC_CLK_TCK)); } return ((double) times(&ts)) * ClockTick; } void initGeMatrix(int m, int n, double *A, int incRowA, int incColA) { int i, j; for (i=0; i<m; ++i) { for (j=0; j<n; ++j) { A[i*incRowA+j*incColA] = i*n + j + 1; } } } void randGeMatrix(int m, int n, double *A, int incRowA, int incColA) { int i, j; for (j=0; j<n; ++j) { for (i=0; i<m; ++i) { A[i*incRowA+j*incColA] = ((double)rand()-RAND_MAX/2)*200/RAND_MAX; } } } void printGeMatrix(int m, int n, const double *A, int incRowA, int incColA) { int i, j; for (i=0; i<m; ++i) { for (j=0; j<n; ++j) { printf("%10.4lf ", A[i*incRowA+j*incColA]); } printf("\n"); } printf("\n\n"); } void dgemm_ref(int m, int n, int k, double alpha, const double *A, int incRowA, int incColA, const double *B, int incRowB, int incColB, double beta, double *C, int incRowC, int incColC) { int i, j, l; if (beta!=1) { for (i=0; i<m; ++i) { for (j=0; j<n; ++j) { if (beta!=0) { C[i*incRowC+j*incColC] *= beta; } else { C[i*incRowC+j*incColC] = 0; } } } } if (alpha!=0) { for (i=0; i<m; ++i) { for (j=0; j<n; ++j) { for (l=0; l<k; ++l) { C[i*incRowC+j*incColC] += alpha*A[i*incRowA+l*incColA] *B[l*incRowB+j*incColB]; } } } } } double dgenrm1(int m, int n, const double *A, int incRowA, int incColA) { int i, j; double result = 0; for (j=0; j<n; ++j) { double sum = 0; for (i=0; i<m; ++i) { sum += fabs(A[i*incRowA+j*incColA]); } if (sum>result) { result = sum; } } return result; } void dgecopy(int m, int n, const double *X, int incRowX, int incColX, double *Y, int incRowY, int incColY) { int i, j; for (i=0; i<m; ++i) { for (j=0; j<n; ++j) { Y[i*incRowY+j*incColY] = X[i*incRowX+j*incColX]; } } } void dgescal(int m, int n, double alpha, double *X, int incRowX, int incColX) { int i, j; for (j=0; j<n; ++j) { for (i=0; i<m; ++i) { X[i*incRowX+j*incColX] *= alpha; } } } void dgeaxpy(int m, int n, double alpha, const double *X, int incRowX, int incColX, double *Y, int incRowY, int incColY) { int i, j; if (alpha==0 || m==0 || n==0) { return; } for (j=0; j<n; ++j) { for (i=0; i<m; ++i) { Y[i*incRowY+j*incColY] += alpha*X[i*incRowX+j*incColX]; } } } #define MIN(X,Y) ((X)<(Y) ? (X) : (Y)) #define MAX(X,Y) ((X)>(Y) ? (X) : (Y)) double err_dgemm(int m, int n, int k, double alpha, const double *A, int incRowA, int incColA, const double *B, int incRowB, int incColB, double beta, const double *C0, int incRowC0, int incColC0, double *C, int incRowC, int incColC) { double normA = dgenrm1(m, k, A, incRowA, incColA); double normB = dgenrm1(k, n, B, incRowB, incColB); double normC = dgenrm1(m, n, C, incRowC0, incColC0); double normD; int mn = (m>n) ? m : n; int mnk = (mn>k) ? mn : k; normA = MAX(normA, fabs(alpha)*normA); normC = MAX(normC, fabs(beta)*normC); dgeaxpy(m, n, -1.0, C0, incRowC0, incColC0, C, incRowC, incColC); normD = dgenrm1(m, n, C, incRowC, incColC); return normD/(mnk*normA*normB*normC); } //------------------------------------------------------------------------------ #ifndef DGEMM_MC #define DGEMM_MC 256 #endif #ifndef DGEMM_NC #define DGEMM_NC 512 #endif #ifndef DGEMM_KC #define DGEMM_KC 256 #endif #ifndef DGEMM_MR #define DGEMM_MR 4 #endif #ifndef DGEMM_NR #define DGEMM_NR 4 #endif //------------------------------------------------------------------------------ void dpack_A(int m, int k, const double *A, int incRowA, int incColA, double *p) { int i, i0, j, l, nu; int mp = (m+DGEMM_MR-1) / DGEMM_MR; for (j=0; j<k; ++j) { for (l=0; l<mp; ++l) { for (i0=0; i0<DGEMM_MR; ++i0) { i = l*DGEMM_MR + i0; nu = l*DGEMM_MR*k + j*DGEMM_MR + i0; p[nu] = (i<m) ? A[i*incRowA+j*incColA] : 0; } } } } void dpack_B(int k, int n, const double *B, int incRowB, int incColB, double *p) { int i, j, j0, l, nu; int np = (n+DGEMM_NR-1) / DGEMM_NR; for (l=0; l<np; ++l) { for (j0=0; j0<DGEMM_NR; ++j0) { j = l*DGEMM_NR + j0; for (i=0; i<k; ++i) { nu = l*DGEMM_NR*k + i*DGEMM_NR + j0; p[nu] = (j<n) ? B[i*incRowB+j*incColB] : 0; } } } } void dgemm_micro(int k, double alpha, const double *A, const double *B, double beta, double *C, int incRowC, int incColC) { int i, j, l; double AB[DGEMM_MR*DGEMM_NR]; for (l=0; l<DGEMM_MR*DGEMM_NR; ++l) { AB[l] = 0; } for (l=0; l<k; ++l) { for (j=0; j<DGEMM_NR; ++j) { for (i=0; i<DGEMM_MR; ++i) { AB[i+j*DGEMM_MR] += A[i+l*DGEMM_MR]*B[l*DGEMM_NR+j]; } } } if (beta==0) { for (j=0; j<DGEMM_NR; ++j) { for (i=0; i<DGEMM_MR; ++i) { C[i*incRowC+j*incColC] = alpha*AB[i+j*DGEMM_MR]; } } } else { for (j=0; j<DGEMM_NR; ++j) { for (i=0; i<DGEMM_MR; ++i) { C[i*incRowC+j*incColC] = beta*C[i*incRowC+j*incColC] + alpha*AB[i+j*DGEMM_MR]; } } } } void dgemm_macro(int m, int n, int k, double alpha, const double *A, const double *B, double beta, double *C, int incRowC, int incColC) { double C_[DGEMM_MR*DGEMM_NR]; int i, j; const int MR = DGEMM_MR; const int NR = DGEMM_NR; for (j=0; j<n; j+=NR) { int nr = (j+NR<n) ? NR : n - j; for (i=0; i<m; i+=MR) { int mr = (i+MR<m) ? MR : m - i; if (mr==MR && nr==NR) { dgemm_micro(k, alpha, &A[i*k], &B[j*k], beta, &C[i*incRowC+j*incColC], incRowC, incColC); } else { dgemm_micro(k, alpha, &A[i*k], &B[j*k], 0., C_, 1, MR); dgescal(mr, nr, beta, &C[i*incRowC+j*incColC], incRowC, incColC); dgeaxpy(mr, nr, 1., C_, 1, MR, &C[i*incRowC+j*incColC], incRowC, incColC); } } } } void dgemm(int m, int n, int k, double alpha, const double *A, int incRowA, int incColA, const double *B, int incRowB, int incColB, double beta, double *C, int incRowC, int incColC) { int i, j, l; const int MC = DGEMM_MC; const int NC = DGEMM_NC; const int KC = DGEMM_KC; if (alpha==0.0 || k==0) { dgescal(m, n, beta, C, incRowC, incColC); } else { double *A_ = (double *)malloc(sizeof(double)*DGEMM_MC*DGEMM_KC); double *B_ = (double *)malloc(sizeof(double)*DGEMM_KC*DGEMM_NC); for (j=0; j<n; j+=NC) { int nc = (j+NC<=n) ? NC : n - j; // ... your code here ... } free(A_); free(B_); } } //------------------------------------------------------------------------------ #ifndef ALPHA #define ALPHA 2 #endif #ifndef BETA #define BETA 2 #endif #ifndef MIN_N #define MIN_N 101 #endif #ifndef MAX_N #define MAX_N 1500 #endif #ifndef INC_N #define INC_N 100 #endif #ifndef MIN_M #define MIN_M 101 #endif #ifndef MAX_M #define MAX_M 1500 #endif #ifndef INC_M #define INC_M 100 #endif #ifndef MIN_K #define MIN_K 100 #endif #ifndef MAX_K #define MAX_K 1500 #endif #ifndef INC_K #define INC_K 100 #endif #ifndef ROWMAJOR_A #define ROWMAJOR_A 0 #endif #ifndef ROWMAJOR_B #define ROWMAJOR_B 0 #endif #ifndef ROWMAJOR_C #define ROWMAJOR_C 0 #endif #if (ROWMAJOR_A==1) # define INCROW_A MAX_K # define INCCOL_A 1 #else # define INCROW_A 1 # define INCCOL_A MAX_M #endif #if (ROWMAJOR_B==1) # define INCROW_B MAX_N # define INCCOL_B 1 #else # define INCROW_B 1 # define INCCOL_B MAX_K #endif #if (ROWMAJOR_C==1) # define INCROW_C MAX_N # define INCCOL_C 1 #else # define INCROW_C 1 # define INCCOL_C MAX_M #endif double A_[MAX_M*MAX_K*MIN(INCROW_A,INCCOL_A)]; double B_[MAX_K*MAX_N*MIN(INCROW_B,INCCOL_B)]; double C_[MAX_M*MAX_N]; double C_0[MAX_M*MAX_N*MIN(INCROW_A,INCCOL_A)]; double C_1[MAX_M*MAX_N*MIN(INCROW_A,INCCOL_A)]; int main() { int m, n, k; randGeMatrix(MAX_M, MAX_K, A_, INCROW_A, INCCOL_A); randGeMatrix(MAX_K, MAX_N, B_, INCROW_B, INCCOL_B); randGeMatrix(MAX_M, MAX_N, C_, 1, MAX_M); printf("#%9s %9s %9s", "m", "n", "k"); printf(" %12s %12s %12s", "t", "MFLOPS", "err"); printf(" %12s %12s %12s", "t", "MFLOPS", "err"); printf(" %12s %12s %12s", "t", "MFLOPS", "err"); printf("\n"); for (m=MIN_M, n=MIN_N, k=MIN_K; n<=MAX_N && m<=MAX_M && k<=MAX_K; m+=INC_M, n+=INC_N, k+=INC_K) { double t, dt, err; int runs = 1; double ops = 2.0*m/1000*n/1000*k; printf(" %9d %9d %9d", m, n, k); // compute reference solution dgecopy(m, n, C_, 1, MAX_M, C_0, INCROW_C, INCCOL_C); dgemm_ref(m, n, k, ALPHA, A_, INCROW_A, INCCOL_A, B_, INCROW_B, INCCOL_B, BETA, C_0, INCROW_C, INCCOL_C); // bench dgemm t = 0; runs = 0; do { dgecopy(m, n, C_, 1, MAX_M, C_1, INCROW_C, INCCOL_C); dt = walltime(); dgemm(m, n, k, ALPHA, A_, INCROW_A, INCCOL_A, B_, INCROW_B, INCCOL_B, BETA, C_1, INCROW_C, INCCOL_C); dt = walltime() - dt; t += dt; ++runs; } while (t<0.3); t /= runs; err = err_dgemm(m, n, k, ALPHA, A_, INCROW_A, INCCOL_A, B_, INCROW_B, INCCOL_B, BETA, C_0, INCROW_C, INCCOL_C, C_1, INCROW_C, INCCOL_C); printf(" %12.2e %12.2lf %12.2e", t, ops/t, err); printf("\n"); } return 0; }