#include <cassert>
#include <cmath>
#include <cstddef>
#include <cstdio>
#include <limits>
#include <random>
#include <thread>
#include <hpc/aux/primitive_type.h>
#include <hpc/aux/slices.h>
#include <hpc/aux/walltime.h>
#include <hpc/matvec/apply.h>
#include <hpc/matvec/asum.h>
#include <hpc/matvec/axpy.h>
#include <hpc/matvec/copy.h>
#include <hpc/matvec/gematrix.h>
#include <hpc/matvec/isgematrix.h>
#include <hpc/matvec/mm.h>
#include <hpc/matvec/print.h>
#include <hpc/matvec/scal.h>
#ifndef COLMAJOR
#define COLMAJOR 1
#endif
#ifndef MAXDIM_M
#define MAXDIM_M 7000
#endif
#ifndef MAXDIM_N
#define MAXDIM_N 7000
#endif
#ifndef MAXDIM_K
#define MAXDIM_K 7000
#endif
#ifndef MIN_M
#define MIN_M 100
#endif
#ifndef MIN_N
#define MIN_N 100
#endif
#ifndef MIN_K
#define MIN_K 100
#endif
#ifndef MAX_M
#define MAX_M 7000
#endif
#ifndef MAX_N
#define MAX_N 7000
#endif
#ifndef MAX_K
#define MAX_K 7000
#endif
#ifndef INC_M
#define INC_M 100
#endif
#ifndef INC_N
#define INC_N 100
#endif
#ifndef INC_K
#define INC_K 100
#endif
#ifndef ALPHA
#define ALPHA 1.5
#endif
#ifndef BETA
#define BETA 1.5
#endif
using namespace hpc::matvec;
template <typename MA, typename Func>
typename std::enable_if<IsGeMatrix<MA>::value, void>::type
blocked_apply(MA &A,
unsigned int nof_row_threads, unsigned int nof_col_threads,
Func func) {
using Index = typename MA::Index;
using hpc::aux::foreach_slice;
std::vector<std::thread> threads(nof_row_threads * nof_col_threads);
unsigned int ti = 0;
foreach_slice(nof_row_threads, A.numRows, [&]
(Index rows_offset, Index rows_size) {
foreach_slice(nof_col_threads, A.numCols, [&,rows_offset,rows_size]
(Index cols_offset, Index cols_size) {
threads[ti++] = std::thread(
[&,rows_offset,rows_size,cols_offset,cols_size]() {
func(rows_offset, cols_offset, rows_size, cols_size);
});
});
});
for (auto& t: threads) {
t.join();
}
}
template <typename MA, typename Func>
typename std::enable_if<IsGeMatrix<MA>::value, void>::type
blocked_apply(const MA &A,
unsigned int nof_row_threads, unsigned int nof_col_threads,
Func func) {
using Index = typename MA::Index;
using hpc::aux::foreach_slice;
std::vector<std::thread> threads(nof_row_threads * nof_col_threads);
unsigned int ti = 0;
foreach_slice(nof_row_threads, A.numRows, [&]
(Index rows_offset, Index rows_size) {
foreach_slice(nof_col_threads, A.numCols, [&,rows_offset,rows_size]
(Index cols_offset, Index cols_size) {
threads[ti++] = std::thread([=,&func]() {
func(rows_offset, cols_offset, rows_size, cols_size);
});
});
});
for (auto& t: threads) {
t.join();
}
}
template <typename MA, typename Func>
typename std::enable_if<IsGeMatrix<MA>::value, void>::type
apply(MA &A, Func func,
unsigned int nof_row_threads, unsigned int nof_col_threads) {
using Index = typename MA::Index;
blocked_apply(A, nof_row_threads, nof_col_threads,
[&](Index rows_offset, Index cols_offset,
Index rows_size, Index cols_size) {
auto A_ = A(rows_offset, cols_offset, rows_size, cols_size);
apply(A_, func);
});
}
template <typename MA, typename Func>
typename std::enable_if<IsGeMatrix<MA>::value, void>::type
apply(const MA &A, Func func,
unsigned int nof_row_threads, unsigned int nof_col_threads) {
using Index = typename MA::Index;
blocked_apply(A, nof_row_threads, nof_col_threads,
[&](Index rows_offset, Index cols_offset,
Index rows_size, Index cols_size) {
auto A_ = A(rows_offset, cols_offset, rows_size, cols_size);
apply(A_, func);
});
}
template <typename MA, typename MB>
typename std::enable_if<IsGeMatrix<MA>::value
&& IsGeMatrix<MB>::value,
void>::type
copy(const MA &A, MB &B,
unsigned int nof_row_threads, unsigned int nof_col_threads) {
assert(A.numRows==B.numRows);
assert(A.numCols==B.numCols);
using Index = typename std::common_type<typename MA::Index,
typename MB::Index>::type;
blocked_apply(A, nof_row_threads, nof_col_threads,
[&](Index rows_offset, Index cols_offset,
Index rows_size, Index cols_size) {
auto A_ = A(rows_offset, cols_offset, rows_size, cols_size);
auto B_ = B(rows_offset, cols_offset, rows_size, cols_size);
copy(A_, B_);
});
}
template<typename Alpha, typename MA, typename MB, typename Beta, typename MC>
typename std::enable_if<IsGeMatrix<MA>::value
&& IsGeMatrix<MB>::value
&& IsGeMatrix<MC>::value,
void>::type
mm(const Alpha& alpha, const MA& A, const MB& B, const Beta& beta, MC& C,
unsigned int nof_row_threads,
unsigned int nof_col_threads) {
using Index =
typename std::common_type<typename MA::Index,
typename MB::Index, typename MC::Index>::type;
assert(nof_row_threads > 0); assert(nof_col_threads > 0);
assert(A.numCols == B.numRows && A.numRows == C.numRows &&
B.numCols == C.numCols);
Index m = A.numRows; Index n = B.numCols; Index k = A.numCols;
std::vector<std::thread> threads(nof_row_threads * nof_col_threads);
unsigned int ti = 0;
using hpc::aux::foreach_slice;
foreach_slice(nof_row_threads, m, [&]
(Index rows_offset, Index rows_size) {
foreach_slice(nof_col_threads, n, [&,rows_offset,rows_size]
(Index cols_offset, Index cols_size) {
threads[ti++] = std::thread(
[&,rows_offset,rows_size,cols_offset,cols_size]() {
auto A_ = A(rows_offset, 0, rows_size, A.numCols);
auto B_ = B(0, cols_offset, B.numRows, cols_size);
auto C_ = C(rows_offset, cols_offset, rows_size, cols_size);
mm(alpha, A_, B_, beta, C_);
});
});
});
for (auto& t: threads) {
t.join();
}
}
template <typename MA>
typename std::enable_if<hpc::matvec::IsRealGeMatrix<MA>::value,
void>::type
randomInit(MA &A)
{
typedef typename MA::ElementType T;
typedef typename MA::Index Index;
std::random_device random;
std::mt19937 mt(random());
std::uniform_real_distribution<T> uniform(-100,100);
auto func = [&](T &val, Index i, Index j) mutable -> void
{
val = uniform(mt);
};
hpc::matvec::apply(A, func);
}
template <typename MA>
typename std::enable_if<hpc::matvec::IsRealGeMatrix<MA>::value,
void>::type
randomInit(MA &A,
unsigned int nof_row_threads, unsigned int nof_col_threads) {
using Index = typename MA::Index;
blocked_apply(A, nof_row_threads, nof_col_threads,
[&](Index rows_offset, Index cols_offset,
Index rows_size, Index cols_size) {
auto A_ = A(rows_offset, cols_offset, rows_size, cols_size);
randomInit(A_);
});
}
//
// C0 is the trusted result of C <- beta C + alpha*A*B
// C1 is computed by a routine subject to testing
//
template <typename Alpha, typename MA, typename MB, typename Beta,
typename MC0, typename MC1>
typename std::enable_if<hpc::matvec::IsGeMatrix<MA>::value
&& hpc::matvec::IsGeMatrix<MB>::value
&& hpc::matvec::IsGeMatrix<MC0>::value
&& hpc::matvec::IsGeMatrix<MC1>::value,
double>::type
estimateGemmResidual(const Alpha &alpha, const MA &A, const MB &B,
const Beta &beta, const MC0 &C0, const MC1 &C1)
{
typedef typename MC0::ElementType TC0;
typedef typename MC0::Index Index;
Index m= C1.numRows;
Index n= C1.numCols;
Index k= A.numCols;
double aNorm = hpc::matvec::asum(A) * std::abs(alpha);
double bNorm = hpc::matvec::asum(B);
double cNorm = hpc::matvec::asum(C1) * std::abs(beta);
double diff = 0;
hpc::matvec::apply(C0, [=,&diff](const TC0 &val, Index i, Index j) -> void
{
diff += std::abs(C1(i,j) - val);
});
// Using eps for double gives upper bound in case elements have lower
// precision.
double eps = std::numeric_limits<double>::epsilon();
double res = diff/(aNorm*bNorm*cNorm*eps*std::max(std::max(m,n),k));
return res;
}
namespace refblas {
template <typename Index, typename Alpha, typename TA, typename TB,
typename Beta, typename TC>
void
gemm(Index m, Index n, Index k,
Alpha alpha,
const TA *A, Index incRowA, Index incColA,
const TB *B, Index incRowB, Index incColB,
Beta beta,
TC *C, Index incRowC, Index incColC)
{
hpc::ulmblas::gescal(m, n, beta, C, incRowC, incColC);
for (Index j=0; j<n; ++j) {
for (Index l=0; l<k; ++l) {
for (Index i=0; i<m; ++i) {
C[i*incRowC+j*incColC] += alpha*A[i*incRowA+l*incColA]
*B[l*incRowB+j*incColB];
}
}
}
}
} // namespace refblas
int
main()
{
typedef double Alpha;
typedef double TA;
typedef double TB;
typedef double Beta;
typedef double TC;
typedef std::size_t Index;
hpc::matvec::StorageOrder order = (COLMAJOR)
? hpc::matvec::StorageOrder::ColMajor
: hpc::matvec::StorageOrder::RowMajor;
hpc::matvec::GeMatrix<TA, Index> A_(MAXDIM_M, MAXDIM_K, order);
hpc::matvec::GeMatrix<TB, Index> B_(MAXDIM_K, MAXDIM_N, order);
hpc::matvec::GeMatrix<TC, Index> C1_(MAXDIM_M, MAXDIM_N, order);
hpc::matvec::GeMatrix<TC, Index> C2_(MAXDIM_M, MAXDIM_N, order);
hpc::matvec::GeMatrix<TC, Index> C3_(MAXDIM_M, MAXDIM_N, order);
hpc::matvec::GeMatrix<TC, Index> C4_(MAXDIM_M, MAXDIM_N, order);
hpc::matvec::GeMatrix<TC, Index> C5_(MAXDIM_M, MAXDIM_N, order);
randomInit(A_, 2, 2);
randomInit(B_, 2, 2);
randomInit(C1_, 2, 2);
const Alpha alpha(ALPHA);
const Beta beta(BETA);
copy(C1_, C2_, 2, 2);
copy(C1_, C3_, 2, 2);
copy(C1_, C4_, 2, 2);
copy(C1_, C5_, 2, 2);
// Header-Zeile fuer die Ausgabe
printf("%5s %5s %5s ", "m", "n", "k");
printf("%20s %9s", "refColMajor: t", "MFLOPS");
printf("%20s %9s", "blocked GEMM: t", "MFLOPS");
printf("%20s %9s %9s", "2x2 mt/blk GEMM: t", "MFLOPS", "speedup");
printf("%20s %9s %9s", "1x4 mt/blk GEMM: t", "MFLOPS", "speedup");
printf("%20s %9s %9s", "4x1 mt/blk GEMM: t", "MFLOPS", "speedup");
printf(" %9s %9s", "res12", "res13");
printf(" %9s %9s", "res14", "res15");
printf("\n");
hpc::aux::WallTime<double> wallTime;
for (long m = MIN_M, n = MIN_N, k = MIN_K;
m <=MAX_M && n <= MAX_N && k <= MAX_K;
m += INC_M, n += INC_N, k += INC_K)
{
double t;
auto A = A_(0,0,m,k);
auto B = B_(0,0,k,n);
auto C1 = C1_(0,0,m,n);
auto C2 = C2_(0,0,m,n);
auto C3 = C3_(0,0,m,n);
auto C4 = C4_(0,0,m,n);
auto C5 = C5_(0,0,m,n);
printf("%5ld %5ld %5ld ", m, n, k);
wallTime.tic();
refblas::gemm(Index(m), Index(n), Index(k), alpha,
A.data, A.incRow, A.incCol,
B.data, B.incRow, B.incCol,
beta,
C1.data, C1.incRow, C1.incCol);
t = wallTime.toc();
printf("%20.4lf %9.2lf", t, 2.*m/1000*n/1000*k/t);
wallTime.tic();
mm(alpha, A, B, beta, C2);
double t1 = wallTime.toc();
double res12 = estimateGemmResidual(alpha, A, B, beta, C1, C2);
printf("%20.4lf %9.2lf", t1, 2.*m/1000*n/1000*k/t1);
wallTime.tic();
mm(alpha, A, B, beta, C3, 2, 2);
double t2 = wallTime.toc();
double res13 = estimateGemmResidual(alpha, A, B, beta, C1, C3);
double speedup = t1/t2;
printf("%20.4lf %9.2lf %9.2lf", t2, 2.*m/1000*n/1000*k/t2, speedup);
wallTime.tic();
mm(alpha, A, B, beta, C4, 1, 4);
t2 = wallTime.toc();
double res14 = estimateGemmResidual(alpha, A, B, beta, C1, C4);
speedup = t1/t2;
printf("%20.4lf %9.2lf %9.2lf", t2, 2.*m/1000*n/1000*k/t2, speedup);
wallTime.tic();
mm(alpha, A, B, beta, C5, 4, 1);
t2 = wallTime.toc();
double res15 = estimateGemmResidual(alpha, A, B, beta, C1, C5);
speedup = t1/t2;
printf("%20.4lf %9.2lf %9.2lf", t2, 2.*m/1000*n/1000*k/t2, speedup);
printf(" %9.1e %9.1e %9.1e %9.1e", res12, res13, res14, res15);
printf("\n");
}
}