========== Das Finale [TOC] ========== Vom sogenannten Frame-Algorithmus soll der Makro-Kernel benutzt werden, um die gesamte GEMM-Operation durchzuführen. Dies soll analog zur `gemm_jli`-Variante von __page01__ geschehen. Statt ---- BOX ------------------------------------------------------------- - For $j=0, \dots, n-1$ - For $\ell=0, \dots, k-1$ - For $i=0, \dots, m-1$ - Falls $\ell = 0 $ - $c_{i,j} \leftarrow \beta c_{i,j}$ - $c_{i,j} \leftarrow c_{i,j} + \alpha a_{i,\ell} b_{\ell, j}$ ---------------------------------------------------------------------- wird dies blockweise durchgeführt. Die Matrizen werden dabei bezüglich $M_c$, $N_c$ und $K_c$ partitioniert: - Die $m \times k$ Matrix $A$ bezüglich $M_c$ und $K_c$. ---- LATEX --------------------------------------------------------------------- A = \left(\begin{array}{c|c|c|c} A_{0,\,0} & A_{0,\,K_c} & \dots & A_{0,\, K_c\cdot (k_b-1)} \\ \hline A_{M_c,\,0} & A_{M_c,\,K_c} & \dots & A_{M_c,\,K_c\cdot (k_b-1)} \\ \hline \vdots & \vdots & & \vdots \\ \hline A_{M_c\cdot (m_b-1),\,0} & A_{M_c\cdot (m_b-1),\,K_c} & \dots & A_{M_c\cdot (m_b-1),\,K_c\cdot (k_b-1)} \\ \end{array}\right) -------------------------------------------------------------------------------- - Die $k \times n$ Matrix $B$ bezüglich $K_c$ und $N_c$. ---- LATEX --------------------------------------------------------------------- B = \left(\begin{array}{c|c|c|c} B_{0,\,0} & B_{0,\,N_c} & \dots & B_{0,\, N_c\cdot (n_b-1)} \\ \hline B_{K_c,\,0} & B_{K_c,\,N_c} & \dots & B_{K_c,\,N_c\cdot (n_b-1)} \\ \hline \vdots & \vdots & & \vdots \\ \hline B_{K_c\cdot (k_b-1),\,0} & B_{K_c\cdot (k_b-1),\,N_c} & \dots & B_{K_c\cdot (k_b-1),\,N_c\cdot (n_b-1)} \\ \end{array}\right) -------------------------------------------------------------------------------- - Die $m \times n$ Matrix $C$ bezüglich $M_c$ und $N_c$. 
---- LATEX --------------------------------------------------------------------- C = \left(\begin{array}{c|c|c|c} C_{0,\,0} & C_{0,\,N_c} & \dots & C_{0,\, N_c\cdot (n_b-1)} \\ \hline C_{M_c,\,0} & C_{M_c,\,N_c} & \dots & C_{M_c,\,N_c\cdot (n_b-1)} \\ \hline \vdots & \vdots & & \vdots \\ \hline C_{M_c\cdot (m_b-1),\,0} & C_{M_c\cdot (m_b-1),\,N_c} & \dots & C_{M_c\cdot (m_b-1),\,N_c\cdot (n_b-1)} \\ \end{array}\right) -------------------------------------------------------------------------------- Für die Anzahl der Blöcke gilt dabei formal: - $m_b = \left\lceil \frac{m}{M_c} \right\rceil$ - $n_b = \left\lceil \frac{n}{N_c} \right\rceil$ - $k_b = \left\lceil \frac{k}{K_c} \right\rceil$ Als Puffer zum Packen von Blöcken stehen $\overline{A}$ für $M_c \cdot K_c$ und $\overline{B}$ für $K_c \cdot N_c$ Elemente zur Verfügung. ---- BOX ------------------------------------------------------------- - For $j=0, N_c, \dots, N_c\cdot(n_b-1)$ - $n_c = \begin{cases} N_c,& j+N_c < n,\\ n - j,& \text{sonst} \end{cases}$ - For $\ell=0, K_c,\dots, K_c\cdot(k_b-1)$ - $\tilde{\beta} = \begin{cases} \beta, &\ell=0,\\ 1, &\text{sonst.} \end{cases}$ - $\text{pack}_B:\; \overline{B} \leftarrow B_{\ell,\,j}$ - For $i=0, M_c, \dots, M_c\cdot(m_b-1)$ - $m_c = \begin{cases} M_c,& i+M_c < m,\\ m - i,& \text{sonst} \end{cases}$ - $\text{pack}_A:\; \overline{A} \leftarrow A_{i,\,\ell}$ - $C_{i,j} \leftarrow \tilde{\beta} C_{i,j} + \alpha \overline{A} \overline{B}$ ---------------------------------------------------------------------- Vorlage ======= Ergänzt die Prozedur `dgemm`: ---- CODE (type=c) ------------------------------------------------------------- #include #include #include #include #include //-- setup and print matrices -------------------------------------------------- double walltime() { struct tms ts; static double ClockTick=0.0; if (ClockTick==0.0) { ClockTick = 1.0 / ((double) sysconf(_SC_CLK_TCK)); } return ((double) times(&ts)) * ClockTick; } void initGeMatrix(int m, int n, double *A, 
int incRowA, int incColA) { int i, j; for (i=0; iresult) { result = sum; } } return result; } void dgecopy(int m, int n, const double *X, int incRowX, int incColX, double *Y, int incRowY, int incColY) { int i, j; for (i=0; i(Y) ? (X) : (Y)) double err_dgemm(int m, int n, int k, double alpha, const double *A, int incRowA, int incColA, const double *B, int incRowB, int incColB, double beta, const double *C0, int incRowC0, int incColC0, double *C, int incRowC, int incColC) { double normA = dgenrm1(m, k, A, incRowA, incColA); double normB = dgenrm1(k, n, B, incRowB, incColB); double normC = dgenrm1(m, n, C, incRowC0, incColC0); double normD; int mn = (m>n) ? m : n; int mnk = (mn>k) ? mn : k; normA = MAX(normA, fabs(alpha)*normA); normC = MAX(normC, fabs(beta)*normC); dgeaxpy(m, n, -1.0, C0, incRowC0, incColC0, C, incRowC, incColC); normD = dgenrm1(m, n, C, incRowC, incColC); return normD/(mnk*normA*normB*normC); } //------------------------------------------------------------------------------ #ifndef DGEMM_MC #define DGEMM_MC 256 #endif #ifndef DGEMM_NC #define DGEMM_NC 512 #endif #ifndef DGEMM_KC #define DGEMM_KC 256 #endif #ifndef DGEMM_MR #define DGEMM_MR 4 #endif #ifndef DGEMM_NR #define DGEMM_NR 4 #endif //------------------------------------------------------------------------------ void dpack_A(int m, int k, const double *A, int incRowA, int incColA, double *p) { int i, i0, j, l, nu; int mp = (m+DGEMM_MR-1) / DGEMM_MR; for (j=0; j doc:session07/page01 :navigate: up -> doc:index back -> doc:session07/page10 next -> doc:session07/page12