#include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifndef COLMAJOR #define COLMAJOR 1 #endif #ifndef MAXDIM_M #define MAXDIM_M 7000 #endif #ifndef MAXDIM_N #define MAXDIM_N 7000 #endif #ifndef MAXDIM_K #define MAXDIM_K 7000 #endif #ifndef MIN_M #define MIN_M 100 #endif #ifndef MIN_N #define MIN_N 100 #endif #ifndef MIN_K #define MIN_K 100 #endif #ifndef MAX_M #define MAX_M 7000 #endif #ifndef MAX_N #define MAX_N 7000 #endif #ifndef MAX_K #define MAX_K 7000 #endif #ifndef INC_M #define INC_M 100 #endif #ifndef INC_N #define INC_N 100 #endif #ifndef INC_K #define INC_K 100 #endif #ifndef ALPHA #define ALPHA 1.5 #endif #ifndef BETA #define BETA 1.5 #endif using namespace hpc::matvec; template typename std::enable_if::value, void>::type blocked_apply(MA &A, unsigned int nof_row_threads, unsigned int nof_col_threads, Func func) { using Index = typename MA::Index; using hpc::aux::foreach_slice; std::vector threads(nof_row_threads * nof_col_threads); unsigned int ti = 0; foreach_slice(nof_row_threads, A.numRows, [&] (Index rows_offset, Index rows_size) { foreach_slice(nof_col_threads, A.numCols, [&,rows_offset,rows_size] (Index cols_offset, Index cols_size) { threads[ti++] = std::thread( [&,rows_offset,rows_size,cols_offset,cols_size]() { func(rows_offset, cols_offset, rows_size, cols_size); }); }); }); for (auto& t: threads) { t.join(); } } template typename std::enable_if::value, void>::type blocked_apply(const MA &A, unsigned int nof_row_threads, unsigned int nof_col_threads, Func func) { using Index = typename MA::Index; using hpc::aux::foreach_slice; std::vector threads(nof_row_threads * nof_col_threads); unsigned int ti = 0; foreach_slice(nof_row_threads, A.numRows, [&] (Index rows_offset, Index rows_size) { foreach_slice(nof_col_threads, A.numCols, [&,rows_offset,rows_size] (Index cols_offset, Index cols_size) { threads[ti++] = std::thread([=,&func]() { func(rows_offset, cols_offset, rows_size, cols_size); }); }); }); for (auto& t: threads) { t.join(); } } template typename std::enable_if::value, void>::type apply(MA &A, Func func, unsigned int nof_row_threads, unsigned int nof_col_threads) { using Index = typename MA::Index; blocked_apply(A, nof_row_threads, nof_col_threads, [&](Index rows_offset, Index cols_offset, Index rows_size, Index cols_size) { auto A_ = A(rows_offset, cols_offset, rows_size, cols_size); apply(A_, func); }); } template typename std::enable_if::value, void>::type apply(const MA &A, Func func, unsigned int nof_row_threads, unsigned int nof_col_threads) { using Index = typename MA::Index; blocked_apply(A, nof_row_threads, nof_col_threads, [&](Index rows_offset, Index cols_offset, Index rows_size, Index cols_size) { auto A_ = A(rows_offset, cols_offset, rows_size, cols_size); apply(A_, func); }); } template typename std::enable_if::value && IsGeMatrix::value, void>::type copy(const MA &A, MB &B, unsigned int nof_row_threads, unsigned int nof_col_threads) { assert(A.numRows==B.numRows); assert(A.numCols==B.numCols); using Index = typename std::common_type::type; blocked_apply(A, nof_row_threads, nof_col_threads, [&](Index rows_offset, Index cols_offset, Index rows_size, Index cols_size) { auto A_ = A(rows_offset, cols_offset, rows_size, cols_size); auto B_ = B(rows_offset, cols_offset, rows_size, cols_size); copy(A_, B_); }); } template typename std::enable_if::value && IsGeMatrix::value && IsGeMatrix::value, void>::type mm(const Alpha& alpha, const MA& A, const MB& B, const Beta& beta, MC& C, unsigned int nof_row_threads, unsigned int nof_col_threads) { using Index = typename std::common_type::type; assert(nof_row_threads > 0); assert(nof_col_threads > 0); assert(A.numCols == B.numRows && A.numRows == C.numRows && B.numCols == C.numCols); Index m = A.numRows; Index n = B.numCols; Index k = A.numCols; std::vector threads(nof_row_threads * nof_col_threads); unsigned int ti = 0; using hpc::aux::foreach_slice; foreach_slice(nof_row_threads, m, [&] (Index rows_offset, Index rows_size) { foreach_slice(nof_col_threads, n, [&,rows_offset,rows_size] (Index cols_offset, Index cols_size) { threads[ti++] = std::thread( [&,rows_offset,rows_size,cols_offset,cols_size]() { auto A_ = A(rows_offset, 0, rows_size, A.numCols); auto B_ = B(0, cols_offset, B.numRows, cols_size); auto C_ = C(rows_offset, cols_offset, rows_size, cols_size); mm(alpha, A_, B_, beta, C_); }); }); }); for (auto& t: threads) { t.join(); } } template typename std::enable_if::value, void>::type randomInit(MA &A) { typedef typename MA::ElementType T; typedef typename MA::Index Index; std::random_device random; std::mt19937 mt(random()); std::uniform_real_distribution uniform(-100,100); auto func = [&](T &val, Index i, Index j) mutable -> void { val = uniform(mt); }; hpc::matvec::apply(A, func); } template typename std::enable_if::value, void>::type randomInit(MA &A, unsigned int nof_row_threads, unsigned int nof_col_threads) { using Index = typename MA::Index; blocked_apply(A, nof_row_threads, nof_col_threads, [&](Index rows_offset, Index cols_offset, Index rows_size, Index cols_size) { auto A_ = A(rows_offset, cols_offset, rows_size, cols_size); randomInit(A_); }); } // // C0 is the trusted result of C <- beta C + alpha*A*B // C1 is computed by a routine subject to testing // template typename std::enable_if::value && hpc::matvec::IsGeMatrix::value && hpc::matvec::IsGeMatrix::value && hpc::matvec::IsGeMatrix::value, double>::type estimateGemmResidual(const Alpha &alpha, const MA &A, const MB &B, const Beta &beta, const MC0 &C0, const MC1 &C1) { typedef typename MC0::ElementType TC0; typedef typename MC0::Index Index; Index m= C1.numRows; Index n= C1.numCols; Index k= A.numCols; double aNorm = hpc::matvec::asum(A) * std::abs(alpha); double bNorm = hpc::matvec::asum(B); double cNorm = hpc::matvec::asum(C1) * std::abs(beta); double diff = 0; hpc::matvec::apply(C0, [=,&diff](const TC0 &val, Index i, Index j) -> void { diff += std::abs(C1(i,j) - val); }); // Using eps for double gives upper bound in case elements have lower // precision. double eps = std::numeric_limits::epsilon(); double res = diff/(aNorm*bNorm*cNorm*eps*std::max(std::max(m,n),k)); return res; } namespace refblas { template void gemm(Index m, Index n, Index k, Alpha alpha, const TA *A, Index incRowA, Index incColA, const TB *B, Index incRowB, Index incColB, Beta beta, TC *C, Index incRowC, Index incColC) { hpc::ulmblas::gescal(m, n, beta, C, incRowC, incColC); for (Index j=0; j A_(MAXDIM_M, MAXDIM_K, order); hpc::matvec::GeMatrix B_(MAXDIM_K, MAXDIM_N, order); hpc::matvec::GeMatrix C1_(MAXDIM_M, MAXDIM_N, order); hpc::matvec::GeMatrix C2_(MAXDIM_M, MAXDIM_N, order); hpc::matvec::GeMatrix C3_(MAXDIM_M, MAXDIM_N, order); hpc::matvec::GeMatrix C4_(MAXDIM_M, MAXDIM_N, order); hpc::matvec::GeMatrix C5_(MAXDIM_M, MAXDIM_N, order); randomInit(A_, 2, 2); randomInit(B_, 2, 2); randomInit(C1_, 2, 2); const Alpha alpha(ALPHA); const Beta beta(BETA); copy(C1_, C2_, 2, 2); copy(C1_, C3_, 2, 2); copy(C1_, C4_, 2, 2); copy(C1_, C5_, 2, 2); // Header-Zeile fuer die Ausgabe printf("%5s %5s %5s ", "m", "n", "k"); printf("%20s %9s", "refColMajor: t", "MFLOPS"); printf("%20s %9s", "blocked GEMM: t", "MFLOPS"); printf("%20s %9s %9s", "2x2 mt/blk GEMM: t", "MFLOPS", "speedup"); printf("%20s %9s %9s", "1x4 mt/blk GEMM: t", "MFLOPS", "speedup"); printf("%20s %9s %9s", "4x1 mt/blk GEMM: t", "MFLOPS", "speedup"); printf(" %9s %9s", "res12", "res13"); printf(" %9s %9s", "res14", "res15"); printf("\n"); hpc::aux::WallTime wallTime; for (long m = MIN_M, n = MIN_N, k = MIN_K; m <=MAX_M && n <= MAX_N && k <= MAX_K; m += INC_M, n += INC_N, k += INC_K) { double t; auto A = A_(0,0,m,k); auto B = B_(0,0,k,n); auto C1 = C1_(0,0,m,n); auto C2 = C2_(0,0,m,n); auto C3 = C3_(0,0,m,n); auto C4 = C4_(0,0,m,n); auto C5 = C5_(0,0,m,n); printf("%5ld %5ld %5ld ", m, n, k); wallTime.tic(); refblas::gemm(Index(m), Index(n), Index(k), alpha, A.data, A.incRow, A.incCol, B.data, B.incRow, B.incCol, beta, C1.data, C1.incRow, C1.incCol); t = wallTime.toc(); printf("%20.4lf %9.2lf", t, 2.*m/1000*n/1000*k/t); wallTime.tic(); mm(alpha, A, B, beta, C2); double t1 = wallTime.toc(); double res12 = estimateGemmResidual(alpha, A, B, beta, C1, C2); printf("%20.4lf %9.2lf", t1, 2.*m/1000*n/1000*k/t1); wallTime.tic(); mm(alpha, A, B, beta, C3, 2, 2); double t2 = wallTime.toc(); double res13 = estimateGemmResidual(alpha, A, B, beta, C1, C3); double speedup = t1/t2; printf("%20.4lf %9.2lf %9.2lf", t2, 2.*m/1000*n/1000*k/t2, speedup); wallTime.tic(); mm(alpha, A, B, beta, C4, 1, 4); t2 = wallTime.toc(); double res14 = estimateGemmResidual(alpha, A, B, beta, C1, C4); speedup = t1/t2; printf("%20.4lf %9.2lf %9.2lf", t2, 2.*m/1000*n/1000*k/t2, speedup); wallTime.tic(); mm(alpha, A, B, beta, C5, 4, 1); t2 = wallTime.toc(); double res15 = estimateGemmResidual(alpha, A, B, beta, C1, C5); speedup = t1/t2; printf("%20.4lf %9.2lf %9.2lf", t2, 2.*m/1000*n/1000*k/t2, speedup); printf(" %9.1e %9.1e %9.1e %9.1e", res12, res13, res14, res15); printf("\n"); } }