Lösungsvorschlag

#include <cassert>
#include <cmath>
#include <cstddef>
#include <cstdio>
#include <limits>
#include <random>
#include <thread>
#include <hpc/aux/primitive_type.h>
#include <hpc/aux/slices.h>
#include <hpc/aux/walltime.h>
#include <hpc/matvec/apply.h>
#include <hpc/matvec/asum.h>
#include <hpc/matvec/axpy.h>
#include <hpc/matvec/copy.h>
#include <hpc/matvec/gematrix.h>
#include <hpc/matvec/isgematrix.h>
#include <hpc/matvec/mm.h>
#include <hpc/matvec/print.h>
#include <hpc/matvec/scal.h>

#ifndef COLMAJOR
#define COLMAJOR 1
#endif

#ifndef MAXDIM_M
#define MAXDIM_M    7000
#endif

#ifndef MAXDIM_N
#define MAXDIM_N    7000
#endif

#ifndef MAXDIM_K
#define MAXDIM_K    7000
#endif

#ifndef MIN_M
#define MIN_M   100
#endif

#ifndef MIN_N
#define MIN_N   100
#endif

#ifndef MIN_K
#define MIN_K   100
#endif

#ifndef MAX_M
#define MAX_M   7000
#endif

#ifndef MAX_N
#define MAX_N   7000
#endif

#ifndef MAX_K
#define MAX_K   7000
#endif

#ifndef INC_M
#define INC_M   100
#endif

#ifndef INC_N
#define INC_N   100
#endif

#ifndef INC_K
#define INC_K   100
#endif

#ifndef ALPHA
#define ALPHA   1.5
#endif

#ifndef BETA
#define BETA    1.5
#endif

using namespace hpc::matvec;

template <typename MA, typename Func>
typename std::enable_if<IsGeMatrix<MA>::value, void>::type
blocked_apply(MA &A,
      unsigned int nof_row_threads, unsigned int nof_col_threads,
      Func func) {
   using Index = typename MA::Index;
   using hpc::aux::foreach_slice;
   std::vector<std::thread> threads(nof_row_threads * nof_col_threads);
   unsigned int ti = 0;
   foreach_slice(nof_row_threads, A.numRows, [&]
         (Index rows_offset, Index rows_size) {
      foreach_slice(nof_col_threads, A.numCols, [&,rows_offset,rows_size]
            (Index cols_offset, Index cols_size) {
         threads[ti++] = std::thread(
               [&,rows_offset,rows_size,cols_offset,cols_size]() {
            func(rows_offset, cols_offset, rows_size, cols_size);
         });
      });
   });
   for (auto& t: threads) {
      t.join();
   }
}

template <typename MA, typename Func>
typename std::enable_if<IsGeMatrix<MA>::value, void>::type
blocked_apply(const MA &A,
      unsigned int nof_row_threads, unsigned int nof_col_threads,
      Func func) {
   using Index = typename MA::Index;
   using hpc::aux::foreach_slice;
   std::vector<std::thread> threads(nof_row_threads * nof_col_threads);
   unsigned int ti = 0;
   foreach_slice(nof_row_threads, A.numRows, [&]
         (Index rows_offset, Index rows_size) {
      foreach_slice(nof_col_threads, A.numCols, [&,rows_offset,rows_size]
            (Index cols_offset, Index cols_size) {
         threads[ti++] = std::thread([=,&func]() {
            func(rows_offset, cols_offset, rows_size, cols_size);
         });
      });
   });
   for (auto& t: threads) {
      t.join();
   }
}

template <typename MA, typename Func>
typename std::enable_if<IsGeMatrix<MA>::value, void>::type
apply(MA &A, Func func,
      unsigned int nof_row_threads, unsigned int nof_col_threads) {
   using Index = typename MA::Index;
   blocked_apply(A, nof_row_threads, nof_col_threads,
         [&](Index rows_offset, Index cols_offset,
            Index rows_size, Index cols_size) {
      auto A_ = A(rows_offset, cols_offset, rows_size, cols_size);
      apply(A_, func);
   });
}

template <typename MA, typename Func>
typename std::enable_if<IsGeMatrix<MA>::value, void>::type
apply(const MA &A, Func func,
      unsigned int nof_row_threads, unsigned int nof_col_threads) {
   using Index = typename MA::Index;
   blocked_apply(A, nof_row_threads, nof_col_threads,
         [&](Index rows_offset, Index cols_offset,
            Index rows_size, Index cols_size) {
      auto A_ = A(rows_offset, cols_offset, rows_size, cols_size);
      apply(A_, func);
   });
}

template <typename MA, typename MB>
typename std::enable_if<IsGeMatrix<MA>::value
                     && IsGeMatrix<MB>::value,
         void>::type
copy(const MA &A, MB &B,
      unsigned int nof_row_threads, unsigned int nof_col_threads) {
   assert(A.numRows==B.numRows);
   assert(A.numCols==B.numCols);
   using Index = typename std::common_type<typename MA::Index,
      typename MB::Index>::type;
   blocked_apply(A, nof_row_threads, nof_col_threads,
         [&](Index rows_offset, Index cols_offset,
            Index rows_size, Index cols_size) {
      auto A_ = A(rows_offset, cols_offset, rows_size, cols_size);
      auto B_ = B(rows_offset, cols_offset, rows_size, cols_size);
      copy(A_, B_);
   });
}

template<typename Alpha, typename MA, typename MB, typename Beta, typename MC>
typename std::enable_if<IsGeMatrix<MA>::value
                     && IsGeMatrix<MB>::value
                     && IsGeMatrix<MC>::value,
         void>::type
mm(const Alpha& alpha, const MA& A, const MB& B, const Beta& beta, MC& C,
      unsigned int nof_row_threads,
      unsigned int nof_col_threads) {
   using Index =
      typename std::common_type<typename MA::Index,
         typename MB::Index, typename MC::Index>::type;
   assert(nof_row_threads > 0); assert(nof_col_threads > 0);
   assert(A.numCols == B.numRows && A.numRows == C.numRows &&
      B.numCols == C.numCols);
   Index m = A.numRows; Index n = B.numCols; Index k = A.numCols;

   std::vector<std::thread> threads(nof_row_threads * nof_col_threads);
   unsigned int ti = 0;

   using hpc::aux::foreach_slice;

   foreach_slice(nof_row_threads, m, [&]
         (Index rows_offset, Index rows_size) {
      foreach_slice(nof_col_threads, n, [&,rows_offset,rows_size]
            (Index cols_offset, Index cols_size) {
         threads[ti++] = std::thread(
               [&,rows_offset,rows_size,cols_offset,cols_size]() {
            auto A_ = A(rows_offset, 0, rows_size, A.numCols);
            auto B_ = B(0, cols_offset, B.numRows, cols_size);
            auto C_ = C(rows_offset, cols_offset, rows_size, cols_size);
            mm(alpha, A_, B_, beta, C_);
         });
      });
   });
   for (auto& t: threads) {
      t.join();
   }
}

template <typename MA>
typename std::enable_if<hpc::matvec::IsRealGeMatrix<MA>::value,
         void>::type
randomInit(MA &A)
{
    typedef typename MA::ElementType  T;
    typedef typename MA::Index        Index;

    std::random_device                  random;
    std::mt19937                        mt(random());
    std::uniform_real_distribution<T>   uniform(-100,100);

    auto func = [&](T &val, Index i, Index j) mutable -> void
                {
                    val = uniform(mt);
                };

    hpc::matvec::apply(A, func);
}

template <typename MA>
typename std::enable_if<hpc::matvec::IsRealGeMatrix<MA>::value,
         void>::type
randomInit(MA &A,
      unsigned int nof_row_threads, unsigned int nof_col_threads) {
   using Index = typename MA::Index;
   blocked_apply(A, nof_row_threads, nof_col_threads,
      [&](Index rows_offset, Index cols_offset,
         Index rows_size, Index cols_size) {
      auto A_ = A(rows_offset, cols_offset, rows_size, cols_size);
      randomInit(A_);
   });
}

//
//  C0 is the trusted result of C <- beta C + alpha*A*B
//  C1 is computed by a routine subject to testing
//
template <typename Alpha, typename MA, typename MB, typename Beta,
          typename MC0, typename MC1>
typename std::enable_if<hpc::matvec::IsGeMatrix<MA>::value
                     && hpc::matvec::IsGeMatrix<MB>::value
                     && hpc::matvec::IsGeMatrix<MC0>::value
                     && hpc::matvec::IsGeMatrix<MC1>::value,
         double>::type
estimateGemmResidual(const Alpha &alpha, const MA &A, const MB &B,
                     const Beta &beta, const MC0 &C0, const MC1 &C1)
{
    typedef typename MC0::ElementType   TC0;
    typedef typename MC0::Index         Index;

    Index m= C1.numRows;
    Index n= C1.numCols;
    Index k= A.numCols;

    double aNorm = hpc::matvec::asum(A) * std::abs(alpha);
    double bNorm = hpc::matvec::asum(B);
    double cNorm = hpc::matvec::asum(C1) * std::abs(beta);
    double diff = 0;
    hpc::matvec::apply(C0, [=,&diff](const TC0 &val, Index i, Index j) -> void
                           {
                               diff += std::abs(C1(i,j) - val);
                           });
    // Using eps for double gives upper bound in case elements have lower
    // precision.
    double eps = std::numeric_limits<double>::epsilon();
    double res = diff/(aNorm*bNorm*cNorm*eps*std::max(std::max(m,n),k));
    return res;
}

namespace refblas {

template <typename Index, typename Alpha, typename TA, typename TB,
          typename Beta, typename TC>
void
gemm(Index m, Index n, Index k,
     Alpha alpha,
     const TA *A, Index incRowA, Index incColA,
     const TB *B, Index incRowB, Index incColB,
     Beta beta,
     TC *C, Index incRowC, Index incColC)
{
    hpc::ulmblas::gescal(m, n, beta, C, incRowC, incColC);
    for (Index j=0; j<n; ++j) {
        for (Index l=0; l<k; ++l) {
            for (Index i=0; i<m; ++i) {
                C[i*incRowC+j*incColC] += alpha*A[i*incRowA+l*incColA]
                                               *B[l*incRowB+j*incColB];
            }
        }
    }
}

} // namespace refblas

int
main()
{
    typedef double          Alpha;
    typedef double          TA;
    typedef double          TB;
    typedef double          Beta;
    typedef double          TC;
    typedef std::size_t     Index;

    hpc::matvec::StorageOrder order = (COLMAJOR)
                                    ? hpc::matvec::StorageOrder::ColMajor
                                    : hpc::matvec::StorageOrder::RowMajor;

    hpc::matvec::GeMatrix<TA, Index> A_(MAXDIM_M, MAXDIM_K, order);
    hpc::matvec::GeMatrix<TB, Index> B_(MAXDIM_K, MAXDIM_N, order);
    hpc::matvec::GeMatrix<TC, Index> C1_(MAXDIM_M, MAXDIM_N, order);
    hpc::matvec::GeMatrix<TC, Index> C2_(MAXDIM_M, MAXDIM_N, order);
    hpc::matvec::GeMatrix<TC, Index> C3_(MAXDIM_M, MAXDIM_N, order);
    hpc::matvec::GeMatrix<TC, Index> C4_(MAXDIM_M, MAXDIM_N, order);
    hpc::matvec::GeMatrix<TC, Index> C5_(MAXDIM_M, MAXDIM_N, order);


    randomInit(A_, 2, 2);
    randomInit(B_, 2, 2);
    randomInit(C1_, 2, 2);

    const Alpha alpha(ALPHA);
    const Beta  beta(BETA);

    copy(C1_, C2_, 2, 2);
    copy(C1_, C3_, 2, 2);
    copy(C1_, C4_, 2, 2);
    copy(C1_, C5_, 2, 2);

    // Header-Zeile fuer die Ausgabe
    printf("%5s %5s %5s ", "m", "n", "k");
    printf("%20s %9s", "refColMajor: t", "MFLOPS");
    printf("%20s %9s", "blocked GEMM: t", "MFLOPS");
    printf("%20s %9s %9s", "2x2 mt/blk GEMM: t", "MFLOPS", "speedup");
    printf("%20s %9s %9s", "1x4 mt/blk GEMM: t", "MFLOPS", "speedup");
    printf("%20s %9s %9s", "4x1 mt/blk GEMM: t", "MFLOPS", "speedup");
    printf(" %9s %9s", "res12", "res13");
    printf(" %9s %9s", "res14", "res15");
    printf("\n");

    hpc::aux::WallTime<double> wallTime;

    for (long m = MIN_M, n = MIN_N, k = MIN_K;
         m <=MAX_M && n <= MAX_N && k <= MAX_K;
         m += INC_M, n += INC_N, k += INC_K)
    {
        double t;

        auto A  = A_(0,0,m,k);
        auto B  = B_(0,0,k,n);
        auto C1 = C1_(0,0,m,n);
        auto C2 = C2_(0,0,m,n);
        auto C3 = C3_(0,0,m,n);
        auto C4 = C4_(0,0,m,n);
        auto C5 = C5_(0,0,m,n);

        printf("%5ld %5ld %5ld ", m, n, k);

        wallTime.tic();
        refblas::gemm(Index(m), Index(n), Index(k), alpha,
                      A.data, A.incRow, A.incCol,
                      B.data, B.incRow, B.incCol,
                      beta,
                      C1.data, C1.incRow, C1.incCol);
        t = wallTime.toc();
        printf("%20.4lf %9.2lf", t, 2.*m/1000*n/1000*k/t);

        wallTime.tic();
        mm(alpha, A, B, beta, C2);
        double t1 = wallTime.toc();
        double res12 = estimateGemmResidual(alpha, A, B, beta, C1, C2);
        printf("%20.4lf %9.2lf", t1, 2.*m/1000*n/1000*k/t1);

        wallTime.tic();
        mm(alpha, A, B, beta, C3, 2, 2);
        double t2 = wallTime.toc();
        double res13 = estimateGemmResidual(alpha, A, B, beta, C1, C3);
        double speedup = t1/t2;
        printf("%20.4lf %9.2lf %9.2lf", t2, 2.*m/1000*n/1000*k/t2, speedup);

        wallTime.tic();
        mm(alpha, A, B, beta, C4, 1, 4);
        t2 = wallTime.toc();
        double res14 = estimateGemmResidual(alpha, A, B, beta, C1, C4);
        speedup = t1/t2;
        printf("%20.4lf %9.2lf %9.2lf", t2, 2.*m/1000*n/1000*k/t2, speedup);

        wallTime.tic();
        mm(alpha, A, B, beta, C5, 4, 1);
        t2 = wallTime.toc();
        double res15 = estimateGemmResidual(alpha, A, B, beta, C1, C5);
        speedup = t1/t2;
        printf("%20.4lf %9.2lf %9.2lf", t2, 2.*m/1000*n/1000*k/t2, speedup);

        printf(" %9.1e %9.1e %9.1e %9.1e", res12, res13, res14, res15);
        printf("\n");
    }
}