Test, ob alles noch tut :-)
Mit folgendem Code könnt Ihr testen, ob soweit alles passt:
#include <cstdio> #include "bench.h" #include "ulmblas.h" #include "gemm_refcolmajor.h" #include "gemm_blocked.h" #ifndef COLMAJOR #define COLMAJOR 1 #endif #ifndef MAXDIM_M #define MAXDIM_M 7000 #endif #ifndef MAXDIM_N #define MAXDIM_N 7000 #endif #ifndef MAXDIM_K #define MAXDIM_K 7000 #endif #ifndef MIN_M #define MIN_M 100 #endif #ifndef MIN_N #define MIN_N 100 #endif #ifndef MIN_K #define MIN_K 100 #endif #ifndef MAX_M #define MAX_M 7000 #endif #ifndef MAX_N #define MAX_N 7000 #endif #ifndef MAX_K #define MAX_K 7000 #endif #ifndef INC_M #define INC_M 100 #endif #ifndef INC_N #define INC_N 100 #endif #ifndef INC_K #define INC_K 100 #endif #ifndef ALPHA #define ALPHA 1.5 #endif #ifndef BETA #define BETA 1.5 #endif int main() { typedef double T; T *A = new T[MAXDIM_M*MAXDIM_K]; T *B = new T[MAXDIM_K*MAXDIM_N]; T *C1 = new T[MAXDIM_M*MAXDIM_N]; T *C2 = new T[MAXDIM_M*MAXDIM_N]; bench::initGeMatrix(MAXDIM_M, MAXDIM_K, A, 1, MAXDIM_M); bench::initGeMatrix(MAXDIM_K, MAXDIM_N, B, 1, MAXDIM_K); bench::initGeMatrix(MAXDIM_M, MAXDIM_N, C1, 1, MAXDIM_M); ulmBLAS::gecopy(MAXDIM_M, MAXDIM_N, C1, 1, MAXDIM_M, C2, 1, MAXDIM_M); // Header-Zeile fuer die Ausgabe printf("%5s %5s %5s ", "m", "n", "k"); printf("%5s %5s ", "IRA", "ICA"); printf("%5s %5s ", "IRB", "ICB"); printf("%5s %5s ", "IRC", "ICC"); printf("%20s %9s", "refColMajor: t", "MFLOPS"); printf("%20s %9s %9s", "blocked GEMM: t", "MFLOPS", "diff"); printf("\n"); bench::WallTime<double> wallTime; for (long m = MIN_M, n = MIN_N, k = MIN_K; m <=MAX_M && n <= MAX_N && k <= MAX_K; m += INC_M, n += INC_N, k += INC_K) { double t, diff; long incRowA = (COLMAJOR) ? 1 : k; long incColA = (COLMAJOR) ? m : 1; long incRowB = (COLMAJOR) ? 1 : n; long incColB = (COLMAJOR) ? k : 1; long incRowC = (COLMAJOR) ? 1 : n; long incColC = (COLMAJOR) ? 
m : 1; printf("%5ld %5ld %5ld ", m, n, k); printf("%5ld %5ld ", incRowA, incColA); printf("%5ld %5ld ", incRowB, incColB); printf("%5ld %5ld ", incRowC, incColC); wallTime.tic(); refColMajor::gemm(m, n, k, ALPHA, A, incRowA, incColA, B, incRowB, incColB, BETA, C1, incRowC, incColC); t = wallTime.toc(); printf("%20.4lf %9.2lf", t, 2.*m/1000*n/1000*k/t); wallTime.tic(); blocked::gemm(m, n, k, ALPHA, A, incRowA, incColA, B, incRowB, incColB, BETA, C2, incRowC, incColC); t = wallTime.toc(); diff = bench::asumDiffGeMatrix(m, n, C1, incRowC, incColC, C2, incRowC, incColC)/(m*n); printf("%20.4lf %9.2lf %9.1e", t, 2.*m/1000*n/1000*k/t, diff); printf("\n"); } delete [] A; delete [] B; delete [] C1; delete [] C2; }
Für einen kurzen Test kann mit -DMAX_M=500 die maximale Matrixgröße eingeschränkt werden. Beim Default-Wert werden die Matrizen bis zu \(7000 \times 7000\) groß:
$shell> g++ -Wall -std=c++11 -DMAX_M=500 -O3 -o gemm_bench gemm_bench.cc
$shell> ./gemm_bench
    m     n     k   IRA   ICA   IRB   ICB   IRC   ICC       refColMajor: t    MFLOPS     blocked GEMM: t    MFLOPS      diff
  100   100   100     1   100     1   100     1   100               0.0008   2594.26              0.0010   1937.07   1.7e-11
  200   200   200     1   200     1   200     1   200               0.0064   2497.92              0.0062   2564.44   3.8e-11
  300   300   300     1   300     1   300     1   300               0.0212   2550.27              0.0184   2929.12   7.1e-11
  400   400   400     1   400     1   400     1   400               0.0498   2571.82              0.0430   2973.68   1.2e-10
  500   500   500     1   500     1   500     1   500               0.0964   2594.13              0.0822   3040.85   1.9e-10