============================= Einfache GEMM Block-Varianten [TOC:2] ============================= Durch Makros werden Konstanten $M_c$, $N_c$ und $K_c$ festgelegt. Die Matrizen werden dann wie folgt partitioniert: - Die $m \times k$ Matrix $A$ quasi-äquidistant in $M_c \times K_c$ Blöcke. - Die $k \times n$ Matrix $B$ quasi-äquidistant in $K_c \times N_c$ Blöcke. - Die $m \times n$ Matrix $C$ quasi-äquidistant in $M_c \times N_c$ Blöcke. Beachtet, dass bei einer _quasi-äquidistanten_ Partitionierung damit die maximale Größe eines Blockes angegeben werden. Blöcke am unteren oder rechten Rand können kleiner sein. Die Multiplikation wird dabei auf die Multiplikation ---- LATEX --------------------------------------------------------------------- C_{i,j} \leftarrow \beta\,C_{i,j} + \alpha\,A_{i,\ell} B_{\ell,j} -------------------------------------------------------------------------------- von Blöcken zurückgeführt. Dafür sind wieder 6 Varianten möglich, die sich darin unterscheiden wie die Schleifen über $i$, $j$ und $\ell$ verschachtelt sind. In allen Varianten sollen die Blöcke $A_{i,\ell}$ und $B_{\ell,j}$ vor der Block-Multiplikation gepuffert werden: Mit ---- LATEX --------------------------------------------------------------------- \overline{A} \leftarrow A_{i,\ell},\quad \overline{B} \leftarrow B_{\ell,j},\quad \overline{C} \leftarrow C_{i,j} -------------------------------------------------------------------------------- deuten wir an, dass die Blöcke $A_{i,\ell}$, $B_{\ell,j}$ und $C_{i,j}$ jeweils kopiert werden. Das Speicherformat für die Matrizen $\overline{A}$, $\overline{B}$ und $\overline{C}$ sei stets Col-Major. Die Operation ---- LATEX --------------------------------------------------------------------- \overline{C} \leftarrow \overline{A}\overline{B} -------------------------------------------------------------------------------- soll dann mit der *ungeblockten `dgemm_jli` Variante für GEMM* ausgeführt werden. Anschließend kann mit GEAXPY ---- LATEX --------------------------------------------------------------------- C_{i,j} \leftarrow C_{i,j} + \alpha \overline{C} -------------------------------------------------------------------------------- Am Anfang der Funktion (außerhalb der Schleifen) sei bereits C mit ---- LATEX --------------------------------------------------------------------- C \leftarrow \beta C \qquad(\text{GESCAL}) -------------------------------------------------------------------------------- überschrieben worden. Vorlage ======= Grundgerüst für `level3.h` -------------------------- Die Header-Datei ist um eine Signatur für `dgemm_blk_jli` ergänzt worden: ---- CODE(type=c)-------------------------------------------------------------- #ifndef ULMBLAS_LEVEL3_H #define ULMBLAS_LEVEL3_H 1 // // BLAS Level 3 // //-- GEMM ---------------------------------------------------------------------- void dgemm_mv(int m, int n, int k, double alpha, const double *A, int incRowA, int incColA, const double *B, int incRowB, int incColB, double beta, double *C, int incRowC, int incColC); void dgemm_jil(int m, int n, int k, double alpha, const double *A, int incRowA, int incColA, const double *B, int incRowB, int incColB, double beta, double *C, int incRowC, int incColC); void dgemm_jli(int m, int n, int k, double alpha, const double *A, int incRowA, int incColA, const double *B, int incRowB, int incColB, double beta, double *C, int incRowC, int incColC); void dgemm_vm(int m, int n, int k, double alpha, const double *A, int incRowA, int incColA, const double *B, int incRowB, int incColB, double beta, double *C, int incRowC, int incColC); void dgemm_ijl(int m, int n, int k, double alpha, const double *A, int incRowA, int incColA, const double *B, int incRowB, int incColB, double beta, double *C, int incRowC, int incColC); void dgemm_ilj(int m, int n, int k, double alpha, const double *A, int incRowA, int incColA, const double *B, int incRowB, int incColB, double beta, double *C, int incRowC, int incColC); void dgemm_vv(int m, int n, int k, double alpha, const double *A, int incRowA, int incColA, const double *B, int incRowB, int incColB, double beta, double *C, int incRowC, int incColC); void dgemm_lij(int m, int n, int k, double alpha, const double *A, int incRowA, int incColA, const double *B, int incRowB, int incColB, double beta, double *C, int incRowC, int incColC); void dgemm_lji(int m, int n, int k, double alpha, const double *A, int incRowA, int incColA, const double *B, int incRowB, int incColB, double beta, double *C, int incRowC, int incColC); void dgemm_blk_jli(int m, int n, int k, double alpha, const double *A, int incRowA, int incColA, const double *B, int incRowB, int incColB, double beta, double *C, int incRowC, int incColC); #endif // ULMBLAS_LEVEL3_H ------------------------------------------------------------------------------- Grundgerüst für `level3.c` -------------------------- Die Source-Datei wird entsprechned um die Funktion `dgemm_blk_jli` erweitert: ---- CODE (type=c) ------------------------------------------------------------ #include #include #include #include // // BLAS Level 3 // //-- GEMM ---------------------------------------------------------------------- void dgemm_mv(int m, int n, int k, double alpha, const double *A, int incRowA, int incColA, const double *B, int incRowB, int incColB, double beta, double *C, int incRowC, int incColC) { int j; for (j=0; j #include #include #include #include #include #include #ifndef ALPHA #define ALPHA 2 #endif #ifndef BETA #define BETA 2 #endif #ifndef MIN_N #define MIN_N 100 #endif #ifndef MAX_N #define MAX_N 1500 #endif #ifndef INC_N #define INC_N 100 #endif #ifndef MIN_M #define MIN_M 100 #endif #ifndef MAX_M #define MAX_M 1500 #endif #ifndef INC_M #define INC_M 100 #endif #ifndef MIN_K #define MIN_K 100 #endif #ifndef MAX_K #define MAX_K 1500 #endif #ifndef INC_K #define INC_K 100 #endif #ifndef ROWMAJOR_A #define ROWMAJOR_A 0 #endif #ifndef ROWMAJOR_B #define ROWMAJOR_B 0 #endif #ifndef ROWMAJOR_C #define ROWMAJOR_C 0 #endif #if (ROWMAJOR_A==1) # define INCROW_A MAX_K # define INCCOL_A 1 #else # define INCROW_A 1 # define INCCOL_A MAX_M #endif #if (ROWMAJOR_B==1) # define INCROW_B MAX_N # define INCCOL_B 1 #else # define INCROW_B 1 # define INCCOL_B MAX_K #endif #if (ROWMAJOR_C==1) # define INCROW_C MAX_N # define INCCOL_C 1 #else # define INCROW_C 1 # define INCCOL_C MAX_M #endif #define MIN(X,Y) ((X)<(Y) ? (X) : (Y)) #define MAX(X,Y) ((X)>(Y) ? (X) : (Y)) double A_[MAX_M*MAX_K*MIN(INCROW_A,INCCOL_A)]; double B_[MAX_K*MAX_N*MIN(INCROW_B,INCCOL_B)]; double C_[MAX_M*MAX_N]; double C_0[MAX_M*MAX_N*MIN(INCROW_A,INCCOL_A)]; double C_1[MAX_M*MAX_N*MIN(INCROW_A,INCCOL_A)]; int main() { int m, n, k; randGeMatrix(MAX_M, MAX_K, A_, INCROW_A, INCCOL_A); randGeMatrix(MAX_K, MAX_N, B_, INCROW_B, INCCOL_B); randGeMatrix(MAX_M, MAX_N, C_, 1, MAX_M); printf("#%9s %9s %9s", "m", "n", "k"); printf(" %12s %12s %12s", "t", "MFLOPS", "err"); printf(" %12s %12s %12s", "t", "MFLOPS", "err"); printf(" %12s %12s %12s", "t", "MFLOPS", "err"); printf("\n"); for (m=MIN_M, n=MIN_N, k=MIN_K; n<=MAX_N && m<=MAX_M && k<=MAX_K; m+=INC_M, n+=INC_N, k+=INC_K) { double t, dt, err; int runs = 1; double ops = 2.0*m/1000*n/1000*k; printf(" %9d %9d %9d", m, n, k); // compute reference solution dgecopy(m, n, C_, 1, MAX_M, C_0, INCROW_C, INCCOL_C); dgemm_ref(m, n, k, ALPHA, A_, INCROW_A, INCCOL_A, B_, INCROW_B, INCCOL_B, BETA, C_0, INCROW_C, INCCOL_C); // bench matrix-vector variants // dgemm_mv t = 0; runs = 0; do { dgecopy(m, n, C_, 1, MAX_M, C_1, INCROW_C, INCCOL_C); dt = walltime(); dgemm_mv(m, n, k, ALPHA, A_, INCROW_A, INCCOL_A, B_, INCROW_B, INCCOL_B, BETA, C_1, INCROW_C, INCCOL_C); dt = walltime() - dt; t += dt; ++runs; } while (t<0.3); t /= runs; err = err_dgemm(m, n, k, ALPHA, A_, INCROW_A, INCCOL_A, B_, INCROW_B, INCCOL_B, BETA, C_0, INCROW_C, INCCOL_C, C_1, INCROW_C, INCCOL_C); printf(" %12.2e %12.2lf %12.2e", t, ops/t, err); // dgemm_blk_jli t = 0; runs = 0; do { dgecopy(m, n, C_, 1, MAX_M, C_1, INCROW_C, INCCOL_C); dt = walltime(); dgemm_blk_jli(m, n, k, ALPHA, A_, INCROW_A, INCCOL_A, B_, INCROW_B, INCCOL_B, BETA, C_1, INCROW_C, INCCOL_C); dt = walltime() - dt; t += dt; ++runs; } while (t<0.3); t /= runs; err = err_dgemm(m, n, k, ALPHA, A_, INCROW_A, INCCOL_A, B_, INCROW_B, INCCOL_B, BETA, C_0, INCROW_C, INCCOL_C, C_1, INCROW_C, INCCOL_C); printf(" %12.2e %12.2lf %12.2e", t, ops/t, err); printf("\n"); } return 0; } ------------------------------------------------------------------------------- :navigate: up -> doc:index next -> doc:session07/page04 back -> doc:session07/page02