Content |
Einfache GEMM Block-Varianten
Durch Makros werden Konstanten \(M_c\), \(N_c\) und \(K_c\) festgelegt. Die Matrizen werden dann wie folgt partitioniert:
-
Die \(m \times k\) Matrix \(A\) quasi-äquidistant in \(M_c \times K_c\) Blöcke.
-
Die \(k \times n\) Matrix \(B\) quasi-äquidistant in \(K_c \times N_c\) Blöcke.
-
Die \(m \times n\) Matrix \(C\) quasi-äquidistant in \(M_c \times N_c\) Blöcke.
Beachtet, dass bei einer quasi-äquidistanten Partitionierung damit die maximale Größe eines Blockes angegeben werden. Blöcke am unteren oder rechten Rand können kleiner sein.
Die Multiplikation wird dabei auf die Multiplikation
\[ C_{i,j} \leftarrow \beta\,C_{i,j} + \alpha\,A_{i,\ell} B_{\ell,j}\]von Blöcken zurückgeführt. Dafür sind wieder 6 Varianten möglich, die sich darin unterscheiden wie die Schleifen über \(i\), \(j\) und \(\ell\) verschachtelt sind.
In allen Varianten sollen die Blöcke \(A_{i,\ell}\) und \(B_{\ell,j}\) vor der Block-Multiplikation gepuffert werden: Mit
\[ \overline{A} \leftarrow A_{i,\ell},\quad \overline{B} \leftarrow B_{\ell,j},\quad \overline{C} \leftarrow C_{i,j}\]deuten wir an, dass die Blöcke \(A_{i,\ell}\), \(B_{\ell,j}\) und \(C_{i,j}\) jeweils kopiert werden. Das Speicherformat für die Matrizen \(\overline{A}\), \(\overline{B}\) und \(\overline{C}\) sei stets Col-Major.
Die Operation
\[ \overline{C} \leftarrow \overline{A}\overline{B}\]soll dann mit der ungeblockten dgemm_jli Variante für GEMM ausgeführt werden.
Anschließend kann mit GEAXPY
\[ C_{i,j} \leftarrow C_{i,j} + \alpha \overline{C}\]Am Anfang der Funktion (außerhalb der Schleifen) sei bereits C mit
\[ C \leftarrow \beta C \qquad(\text{GESCAL})\]überschrieben worden.
Vorlage
Grundgerüst für level3.h
Die Header-Datei ist um eine Signatur für dgemm_blk_jli ergänzt worden:
#ifndef ULMBLAS_LEVEL3_H #define ULMBLAS_LEVEL3_H 1 // // BLAS Level 3 // //-- GEMM ---------------------------------------------------------------------- void dgemm_mv(int m, int n, int k, double alpha, const double *A, int incRowA, int incColA, const double *B, int incRowB, int incColB, double beta, double *C, int incRowC, int incColC); void dgemm_jil(int m, int n, int k, double alpha, const double *A, int incRowA, int incColA, const double *B, int incRowB, int incColB, double beta, double *C, int incRowC, int incColC); void dgemm_jli(int m, int n, int k, double alpha, const double *A, int incRowA, int incColA, const double *B, int incRowB, int incColB, double beta, double *C, int incRowC, int incColC); void dgemm_vm(int m, int n, int k, double alpha, const double *A, int incRowA, int incColA, const double *B, int incRowB, int incColB, double beta, double *C, int incRowC, int incColC); void dgemm_ijl(int m, int n, int k, double alpha, const double *A, int incRowA, int incColA, const double *B, int incRowB, int incColB, double beta, double *C, int incRowC, int incColC); void dgemm_ilj(int m, int n, int k, double alpha, const double *A, int incRowA, int incColA, const double *B, int incRowB, int incColB, double beta, double *C, int incRowC, int incColC); void dgemm_vv(int m, int n, int k, double alpha, const double *A, int incRowA, int incColA, const double *B, int incRowB, int incColB, double beta, double *C, int incRowC, int incColC); void dgemm_lij(int m, int n, int k, double alpha, const double *A, int incRowA, int incColA, const double *B, int incRowB, int incColB, double beta, double *C, int incRowC, int incColC); void dgemm_lji(int m, int n, int k, double alpha, const double *A, int incRowA, int incColA, const double *B, int incRowB, int incColB, double beta, double *C, int incRowC, int incColC); void dgemm_blk_jli(int m, int n, int k, double alpha, const double *A, int incRowA, int incColA, const double *B, int incRowB, int incColB, double beta, double *C, int incRowC, int incColC); #endif // ULMBLAS_LEVEL3_H
Grundgerüst für level3.c
Die Source-Datei wird entsprechned um die Funktion dgemm_blk_jli erweitert:
#include <ulmblas/level3.h> #include <ulmblas/level2.h> #include <ulmblas/level1.h> #include <stdlib.h> // // BLAS Level 3 // //-- GEMM ---------------------------------------------------------------------- void dgemm_mv(int m, int n, int k, double alpha, const double *A, int incRowA, int incColA, const double *B, int incRowB, int incColB, double beta, double *C, int incRowC, int incColC) { int j; for (j=0; j<n; ++j) { dgemv(m, k, alpha, A, incRowA, incColA, &B[j*incColB], incRowB, beta, &C[j*incColC], incRowC); } } void dgemm_jil(int m, int n, int k, double alpha, const double *A, int incRowA, int incColA, const double *B, int incRowB, int incColB, double beta, double *C, int incRowC, int incColC) { int i, j, l; for (j=0; j<n; ++j) { for (i=0; i<m; ++i) { C[i*incRowC+j*incColC] *= beta; for (l=0; l<k; ++l) { C[i*incRowC+j*incColC] += alpha*A[i*incRowA+l*incColA] *B[l*incRowB+j*incColB]; } } } } void dgemm_jli(int m, int n, int k, double alpha, const double *A, int incRowA, int incColA, const double *B, int incRowB, int incColB, double beta, double *C, int incRowC, int incColC) { int i, j, l; for (j=0; j<n; ++j) { for (l=0; l<k; ++l) { for (i=0; i<m; ++i) { if (l==0) { C[i*incRowC+j*incColC] *= beta; } C[i*incRowC+j*incColC] += alpha*A[i*incRowA+l*incColA] *B[l*incRowB+j*incColB]; } } } } void dgemm_vm(int m, int n, int k, double alpha, const double *A, int incRowA, int incColA, const double *B, int incRowB, int incColB, double beta, double *C, int incRowC, int incColC) { int i; for (i=0; i<m; ++i) { dgemv(n, k, alpha, B, incColB, incRowB, &A[i*incRowA], incColA, beta, &C[i*incRowC], incColC); } } void dgemm_ilj(int m, int n, int k, double alpha, const double *A, int incRowA, int incColA, const double *B, int incRowB, int incColB, double beta, double *C, int incRowC, int incColC) { int i, j, l; for (i=0; i<m; ++i) { for (l=0; l<k; ++l) { for (j=0; j<n; ++j) { if (l==0) { C[i*incRowC+j*incColC] *= beta; } C[i*incRowC+j*incColC] += alpha*A[i*incRowA+l*incColA] *B[l*incRowB+j*incColB]; } } } } void dgemm_ijl(int m, int n, int k, double alpha, const double *A, int incRowA, int incColA, const double *B, int incRowB, int incColB, double beta, double *C, int incRowC, int incColC) { int i, j, l; for (i=0; i<m; ++i) { for (j=0; j<n; ++j) { C[i*incRowC+j*incColC] *= beta; for (l=0; l<k; ++l) { C[i*incRowC+j*incColC] += alpha*A[i*incRowA+l*incColA] *B[l*incRowB+j*incColB]; } } } } void dgemm_vv(int m, int n, int k, double alpha, const double *A, int incRowA, int incColA, const double *B, int incRowB, int incColB, double beta, double *C, int incRowC, int incColC) { int l; dgescal(m, n, alpha, C, incRowC, incColC); for (l=0; l<k; ++l) { dger(m, n, alpha, &A[l*incColA], incRowA, &B[l*incRowB], incColB, C, incRowC, incColC); } } void dgemm_lij(int m, int n, int k, double alpha, const double *A, int incRowA, int incColA, const double *B, int incRowB, int incColB, double beta, double *C, int incRowC, int incColC) { int i, j, l; for (j=0; j<n; ++j) { for (i=0; i<m; ++i) { C[i*incRowC+j*incColC] *= beta; } } for (l=0; l<k; ++l) { for (i=0; i<m; ++i) { for (j=0; j<n; ++j) { C[i*incRowC+j*incColC] += alpha*A[i*incRowA+l*incColA] *B[l*incRowB+j*incColB]; } } } } void dgemm_lji(int m, int n, int k, double alpha, const double *A, int incRowA, int incColA, const double *B, int incRowB, int incColB, double beta, double *C, int incRowC, int incColC) { int i, j, l; for (j=0; j<n; ++j) { for (i=0; i<m; ++i) { C[i*incRowC+j*incColC] *= beta; } } for (l=0; l<k; ++l) { for (j=0; j<n; ++j) { for (i=0; i<m; ++i) { C[i*incRowC+j*incColC] += alpha*A[i*incRowA+l*incColA] *B[l*incRowB+j*incColB]; } } } } //-------------------------------------------------------------------------------- #ifndef GEMM_MC #define GEMM_MC 256 #endif #ifndef GEMM_NC #define GEMM_NC 256 #endif #ifndef GEMM_KC #define GEMM_KC 1024 #endif void dgemm_blk_jli(int m, int n, int k, double alpha, const double *A, int incRowA, int incColA, const double *B, int incRowB, int incColB, double beta, double *C, int incRowC, int incColC) { int i, j, l; double *A_ = (double *)malloc(sizeof(double)*GEMM_MC*GEMM_KC); double *B_ = (double *)malloc(sizeof(double)*GEMM_KC*GEMM_NC); double *C_ = (double *)malloc(sizeof(double)*GEMM_MC*GEMM_NC); dgescal(m, n, beta, C, incRowC, incColC); for (j=0; j<n; j+=GEMM_NC) { int nc = (j+GEMM_NC<=n) ? GEMM_NC : n - j; // ... your code here ... } free(A_); free(B_); free(C_); }
Program für Benchmark
#include <stdio.h> #include <stdlib.h> #include <ulmblas/level1.h> #include <ulmblas/level3.h> #include <bench/aux.h> #include <bench/errbound.h> #include <bench/refblas.h> #ifndef ALPHA #define ALPHA 2 #endif #ifndef BETA #define BETA 2 #endif #ifndef MIN_N #define MIN_N 100 #endif #ifndef MAX_N #define MAX_N 1500 #endif #ifndef INC_N #define INC_N 100 #endif #ifndef MIN_M #define MIN_M 100 #endif #ifndef MAX_M #define MAX_M 1500 #endif #ifndef INC_M #define INC_M 100 #endif #ifndef MIN_K #define MIN_K 100 #endif #ifndef MAX_K #define MAX_K 1500 #endif #ifndef INC_K #define INC_K 100 #endif #ifndef ROWMAJOR_A #define ROWMAJOR_A 0 #endif #ifndef ROWMAJOR_B #define ROWMAJOR_B 0 #endif #ifndef ROWMAJOR_C #define ROWMAJOR_C 0 #endif #if (ROWMAJOR_A==1) # define INCROW_A MAX_K # define INCCOL_A 1 #else # define INCROW_A 1 # define INCCOL_A MAX_M #endif #if (ROWMAJOR_B==1) # define INCROW_B MAX_N # define INCCOL_B 1 #else # define INCROW_B 1 # define INCCOL_B MAX_K #endif #if (ROWMAJOR_C==1) # define INCROW_C MAX_N # define INCCOL_C 1 #else # define INCROW_C 1 # define INCCOL_C MAX_M #endif #define MIN(X,Y) ((X)<(Y) ? (X) : (Y)) #define MAX(X,Y) ((X)>(Y) ? (X) : (Y)) double A_[MAX_M*MAX_K*MIN(INCROW_A,INCCOL_A)]; double B_[MAX_K*MAX_N*MIN(INCROW_B,INCCOL_B)]; double C_[MAX_M*MAX_N]; double C_0[MAX_M*MAX_N*MIN(INCROW_A,INCCOL_A)]; double C_1[MAX_M*MAX_N*MIN(INCROW_A,INCCOL_A)]; int main() { int m, n, k; randGeMatrix(MAX_M, MAX_K, A_, INCROW_A, INCCOL_A); randGeMatrix(MAX_K, MAX_N, B_, INCROW_B, INCCOL_B); randGeMatrix(MAX_M, MAX_N, C_, 1, MAX_M); printf("#%9s %9s %9s", "m", "n", "k"); printf(" %12s %12s %12s", "t", "MFLOPS", "err"); printf(" %12s %12s %12s", "t", "MFLOPS", "err"); printf(" %12s %12s %12s", "t", "MFLOPS", "err"); printf("\n"); for (m=MIN_M, n=MIN_N, k=MIN_K; n<=MAX_N && m<=MAX_M && k<=MAX_K; m+=INC_M, n+=INC_N, k+=INC_K) { double t, dt, err; int runs = 1; double ops = 2.0*m/1000*n/1000*k; printf(" %9d %9d %9d", m, n, k); // compute reference solution dgecopy(m, n, C_, 1, MAX_M, C_0, INCROW_C, INCCOL_C); dgemm_ref(m, n, k, ALPHA, A_, INCROW_A, INCCOL_A, B_, INCROW_B, INCCOL_B, BETA, C_0, INCROW_C, INCCOL_C); // bench matrix-vector variants // dgemm_mv t = 0; runs = 0; do { dgecopy(m, n, C_, 1, MAX_M, C_1, INCROW_C, INCCOL_C); dt = walltime(); dgemm_mv(m, n, k, ALPHA, A_, INCROW_A, INCCOL_A, B_, INCROW_B, INCCOL_B, BETA, C_1, INCROW_C, INCCOL_C); dt = walltime() - dt; t += dt; ++runs; } while (t<0.3); t /= runs; err = err_dgemm(m, n, k, ALPHA, A_, INCROW_A, INCCOL_A, B_, INCROW_B, INCCOL_B, BETA, C_0, INCROW_C, INCCOL_C, C_1, INCROW_C, INCCOL_C); printf(" %12.2e %12.2lf %12.2e", t, ops/t, err); // dgemm_blk_jli t = 0; runs = 0; do { dgecopy(m, n, C_, 1, MAX_M, C_1, INCROW_C, INCCOL_C); dt = walltime(); dgemm_blk_jli(m, n, k, ALPHA, A_, INCROW_A, INCCOL_A, B_, INCROW_B, INCCOL_B, BETA, C_1, INCROW_C, INCCOL_C); dt = walltime() - dt; t += dt; ++runs; } while (t<0.3); t /= runs; err = err_dgemm(m, n, k, ALPHA, A_, INCROW_A, INCCOL_A, B_, INCROW_B, INCCOL_B, BETA, C_0, INCROW_C, INCCOL_C, C_1, INCROW_C, INCCOL_C); printf(" %12.2e %12.2lf %12.2e", t, ops/t, err); printf("\n"); } return 0; }