Parallelization of the matrix-matrix product

Content

A parallelization appears particularly useful within the macro kernel, i.e. within the mgemm function by parallelizing the outer for-loop.

Exercises

Preparation and compilation

To adapt gemm.hpp and mgemm.hpp you should create a local directory named hpc/ulmblas and copy the header files gemm.hpp and mgemm.hpp into it. This can be done using the following commands:

mkdir -p hpc/ulmblas
cp /home/numerik/pub/hpc/ws18/session22/hpc/ulmblas/{gemm.hpp,mgemm.hpp} hpc/ulmblas

The following three compilation commands show how to compile a single-threaded version, a version based on OpenMP, and one of the thread pool of our own library. On our hosts in E.44:

heim$ g++-7.2 -DD_BLOCKSIZE_MR=4 -DD_BLOCKSIZE_NR=8 -O3 -g -I. -I/home/numerik/pub/hpc/ws18/session22 -std=c++17 -o gemm_session22_serial gemm_session22.cpp -pthread
heim$ g++-7.2 -DD_BLOCKSIZE_MR=4 -DD_BLOCKSIZE_NR=8 -O3 -g -I. -I/home/numerik/pub/hpc/ws18/session22 -std=c++17 -fopenmp -o gemm_session22_openmp gemm_session22.cpp
heim$ g++-7.2 -DD_BLOCKSIZE_MR=4 -DD_BLOCKSIZE_NR=8 -O3 -g -I. -I/home/numerik/pub/hpc/ws18/session22 -std=c++17 -DGLOBAL_THREAD_POOL=global_tpool -o gemm_session22_tpool gemm_session22.cpp -pthread
heim$ 

On theon:

theon$ g++ -DD_BLOCKSIZE_MR=4 -DD_BLOCKSIZE_NR=8 -O3 -g -I. -I/home/numerik/pub/hpc/ws18/session22 -std=c++17 -o gemm_session22_serial gemm_session22.cpp
theon$ g++ -DD_BLOCKSIZE_MR=4 -DD_BLOCKSIZE_NR=8 -O3 -g -I. -I/home/numerik/pub/hpc/ws18/session22 -std=c++17 -fopenmp -o gemm_session22_openmp gemm_session22.cpp
theon$ g++ -DD_BLOCKSIZE_MR=4 -DD_BLOCKSIZE_NR=8 -O3 -g -I. -I/home/numerik/pub/hpc/ws18/session22 -std=c++17 -DGLOBAL_THREAD_POOL=global_tpool -o gemm_session22_tpool gemm_session22.cpp
theon$ 

Please note that the order of the ā€œ-Iā€ options is important. These directories are searched from left to right. Hence, your local directory must be given first such that it can take precedence over the regular library code at /home/numerik/pub/hpc/ws18/session22.

Source code

#include <algorithm>
#include <cstdlib>
#include <cstring>
#include <printf.hpp>
#include <hpc/aux/cache-line.hpp>
#include <hpc/aux/repool.hpp>
#include <hpc/aux/slices.hpp>
#include <hpc/aux/walltime.hpp>
#include <hpc/matvec/axpy.hpp>
#include <hpc/matvec/copy.hpp>
#include <hpc/matvec/gematrix.hpp>
#include <hpc/matvec/iterators.hpp>
#include <hpc/matvec/mm.hpp>
#include <hpc/matvec/print.hpp>
#include <hpc/matvec/scal.hpp>
#include <hpc/mt/thread_pool.hpp>
#include <hpc/tools/buffer.hpp>

#ifndef ALPHA
#define ALPHA 1
#endif

#ifndef BETA
#define BETA 1
#endif

#ifndef DIM_MAX_M
#define DIM_MAX_M 1500
#endif

#ifndef DIM_MAX_N
#define DIM_MAX_N 1500
#endif

#ifndef DIM_MAX_K
#define DIM_MAX_K 1500
#endif

#ifndef COLMAJOR_C
#define COLMAJOR_C 1
#endif

#ifndef COLMAJOR_A
#define COLMAJOR_A 1
#endif

#ifndef COLMAJOR_B
#define COLMAJOR_B 1
#endif

using namespace hpc;
using namespace hpc::aux;
using namespace hpc::matvec;
using namespace hpc::mt;

template<typename T>
std::size_t align_dim(std::size_t dim) {
   return align(dim * sizeof(T),
      std::max(get_cache_line_size(), sizeof(T))) / sizeof(T);
}

template<typename T>
struct AlignedMatrixSpace : public hpc::tools::Buffer<T> {
   AlignedMatrixSpace(std::size_t m, std::size_t n) :
      hpc::tools::Buffer<T>(align_dim<T>(m) * align_dim<T>(n),
	 get_cache_line_size()) {
   }
};

namespace hpc { namespace matvec {

   template<typename T>
   struct AlignedGeMatrixView : public GeMatrixView<T> {
      using is_GeMatrix = std::true_type;

      AlignedGeMatrixView(AlignedMatrixSpace<T>& space,
	    std::size_t m, std::size_t n,
	    Order order = Order::ColMajor) :
	 GeMatrixView<T>(m, n,
	    /* conj = */ false,
	    /* data = */ space.data,
	    /* incRow = */ order==Order::RowMajor? align_dim<T>(n): 1,
	    /* incCol = */ order==Order::ColMajor? align_dim<T>(m): 1) {
      }
   };

} } // namespaces matvec, hpc

constexpr hpc::matvec::Order select_order(int colmajor) {
   using namespace hpc::matvec;
   return colmajor > 0? Order::ColMajor: Order::RowMajor;
}

template <
   template<typename> class MatrixA, typename T,
   typename RandomEnginePool,
   Require< Ge<MatrixA<T>> > = true
>
void randomInit(MatrixA<T>& A, RandomEnginePool& repool) {
   using EngineType = typename RandomEnginePool::EngineType;
   RandomEngineGuard<EngineType> guard(repool);
   std::uniform_real_distribution<double> uniform(-100, 100);
   auto& engine = guard.get();

   for (auto [i, j, Aij]: A) {
      Aij = uniform(engine);
      (void) i; (void) j; // suppress gcc warning
   }
}

template <
   template<typename> class MatrixA, typename T,
   typename RandomEnginePool,
   Require< Ge<MatrixA<T>> > = true
>
void randomInit(MatrixA<T>& A, RandomEnginePool& repool, ThreadPool& tpool) {
   auto granularity = get_cache_line_size() / sizeof(A(0, 0));
   auto nof_tasks = align(A.numRows(), granularity) *
      align(A.numCols(), granularity);

   std::vector<std::future<void>> futures(nof_tasks);

   std::size_t job_index = 0;
   foreach_slice<AlignedSlices>(tpool.size(), A.numRows(), granularity, [&]
         (std::size_t rows_offset, std::size_t rows_size) {
      foreach_slice<AlignedSlices>(tpool.size(), A.numCols(),
	    [&,rows_offset,rows_size]
	    (std::size_t cols_offset, std::size_t cols_size) {
	 futures[job_index++] = tpool.submit([=, &A, &repool]() {
	    auto A_ = A.view(rows_offset, cols_offset,
	       rows_size, cols_size);
	    randomInit(A_, repool);
	 });
      });
   });

   for (auto& future: futures) {
      if (future.valid()) future.get();
   }
}

/*
   return the infinity norm (or maximum absolute row sum norm) of a matrix,
   see http://mathworld.wolfram.com/MaximumAbsoluteRowSumNorm.html
*/

template <
   template<typename> class MatrixA, typename T,
   Require< Ge<MatrixA<T>> > = true
>
auto matrix_inf_norm(const MatrixA<T>& A) {
   using TA = decltype(std::abs(A(0, 0)));
   DenseVector<TA> abs_row_sum(A.numRows());
   TA max = TA{};
   for (auto [i, j, Aij]: A) {
      if (std::isnan(Aij)) return Aij;
      if (j == 0) {
	 abs_row_sum(i) = std::abs(Aij);
      } else {
	 abs_row_sum(i) += std::abs(Aij);
      }
   }
   for (auto [i, sumi]: abs_row_sum) {
      if (sumi > max) max = sumi;
      (void) i; // suppress gcc warning
   }
   return max;
}

/*
   error estimator for C <- beta C + alpha*A*B where
      C0    is the original state of C
      Cref  the result of a trusted implementation
      Ctst  is computed by a routine subject to testing
            (and modified by this function to compute Cref - Ctst)
*/
template <
   template<typename> class MatrixA,
   template<typename> class MatrixB,
   template<typename> class MatrixC0,
   template<typename> class MatrixCref,
   template<typename> class MatrixCtst,
   typename T,
   Require<
      Ge<MatrixA<T>>,
      Ge<MatrixB<T>>,
      Ge<MatrixC0<T>>,
      Ge<MatrixCref<T>>,
      Ge<MatrixCtst<T>>
   > = true
>
auto gemm_err_est(T alpha,
      const MatrixA<T>& A,
      const MatrixB<T>& B,
      const MatrixC0<T>& C0,
      T beta,
      const MatrixCref<T>& Cref,
      MatrixCtst<T>& Ctst) {
   using PT = decltype(std::abs(A(0, 0)));

   auto m = C0.numRows();
   auto n = C0.numCols();
   auto k = A.numCols();
   auto N = std::max({m, n, k});

   axpy(PT(-1), Cref, Ctst);
   auto normD = matrix_inf_norm(Ctst);
   if (std::isnan(normD)) {
      return normD;
   }
   if (normD == PT(0)) {
      return normD;
   }
   PT normA, normB;
   if (alpha != PT(0)) {
      normA = matrix_inf_norm(A) * alpha;
      normB = matrix_inf_norm(B);
   } else {
      normA = normB = PT(0);
   }
   PT normC0;
   if (beta != PT(0)) {
      normC0 = matrix_inf_norm(C0) * beta;
   } else {
      normC0 = 0;
   }
   auto eps = std::numeric_limits<PT>::epsilon();
   return normD / (eps * (N * normA * normB + normC0));
}

template <
   template<typename> class MatrixA,
   template<typename> class MatrixB,
   template<typename> class MatrixC,
   typename T,
   typename ThreadPool,
   Require<
      Ge<MatrixA<T>>,
      Ge<MatrixB<T>>,
      Ge<MatrixC<T>>
   > = true
>
void mm(const T& alpha, const MatrixA<T>& A, const MatrixB<T>& B, const T& beta,
      MatrixC<T>& C, ThreadPool& tpool) {
   auto granularity = get_cache_line_size() / sizeof(C(0, 0));
   auto nof_tasks = align(C.numRows(), granularity) *
      align(C.numCols(), granularity);

   std::vector<std::future<void>> futures(nof_tasks);

   std::size_t job_index = 0;
   foreach_slice<AlignedSlices>(tpool.size(), C.numRows(), granularity, [&]
         (std::size_t rows_offset, std::size_t rows_size) {
      foreach_slice<AlignedSlices>(tpool.size(), C.numCols(),
	    [&, rows_offset, rows_size]
	    (std::size_t cols_offset, std::size_t cols_size) {
	 futures[job_index++] = tpool.submit([=, &A, &B, &C]() {
	    auto A_ = A.constView(rows_offset, 0, rows_size, A.numCols());
	    auto B_ = B.constView(0, cols_offset, A.numCols(), cols_size);
	    auto C_ = C.view(rows_offset, cols_offset,
	       rows_size, cols_size);
	    mm(alpha, A_, B_, beta, C_);
	 });
      });
   });

   for (auto& future: futures) {
      if (future.valid()) future.get();
   }
}

namespace ref {

template <
   typename Alpha,
   typename Beta,
   typename T,
   template<typename> class MatrixA,
   template<typename> class MatrixB,
   typename MatrixC,
   Require<
      Ge<MatrixA<T>>,
      Ge<MatrixB<T>>,
      Ge<MatrixC>
   > = true>
/// - gemm operation $C \leftarrow \beta C + \alpha A B$
void mm(const Alpha& alpha, const MatrixA<T>& A, const MatrixB<T>& B,
      const Beta& beta, MatrixC&& C) {
   assert(A.numCols()==B.numRows());
   assert(C.numRows()==A.numRows());
   assert(C.numCols()==B.numCols());

   scal(beta, C);
   for (auto [i, j, Cij]: C) {
      for (std::size_t l = 0; l < A.numCols(); ++l) {
	 Cij += alpha * A(i, l) * B(l, j);
      }
   }
}

} // namespace ref

hpc::mt::ThreadPool global_tpool;

int main() {
   RandomEnginePool<std::mt19937> repool(global_tpool.size());

   using Real = double;
   AlignedMatrixSpace<Real> A_space(DIM_MAX_N, DIM_MAX_K);
   AlignedMatrixSpace<Real> B_space(DIM_MAX_K, DIM_MAX_N);
   AlignedMatrixSpace<Real> C0_space(DIM_MAX_M, DIM_MAX_N);
   AlignedMatrixSpace<Real> Cref_space(DIM_MAX_M, DIM_MAX_N);
   AlignedMatrixSpace<Real> Ctst_space(DIM_MAX_M, DIM_MAX_N);

   fmt::printf("#%109s | %35s |\n",
               "| refmm              ",
               "updated matvec::mm:                ");
   fmt::printf("#%7s %7s %7s %7s %7s %7s %7s %7s %7s %7s %7s"
               " | %7s %11s | %7s %7s %11s %7s |\n",
               "m", "n", "k", "incRowC", "incColC",
               "incRowA", "incColA", "incRowB", "incColB",
               "alpha", "beta",
               "time", "mflops",
               "time", "error", "mflops", "factor");

   for (std::size_t m=300, n=300, k=300;
	 m <= DIM_MAX_M && n <= DIM_MAX_N && k <= DIM_MAX_K;
	 m += 100, n +=100, k += 100) {
      using namespace hpc::matvec;
      AlignedGeMatrixView<Real> A(A_space, m, k, select_order(COLMAJOR_A));
      AlignedGeMatrixView<Real> B(B_space, k, n, select_order(COLMAJOR_B));
      AlignedGeMatrixView<Real> C0(C0_space, m, n, select_order(COLMAJOR_C));
      AlignedGeMatrixView<Real> Cref(Cref_space,
	 m, n, select_order(COLMAJOR_C));
      AlignedGeMatrixView<Real> Ctst(Ctst_space,
	 m, n, select_order(COLMAJOR_C));
      randomInit(A, repool, global_tpool);
      randomInit(B, repool, global_tpool);
      randomInit(C0, repool, global_tpool);

      WallTime<double> walltime;

      /* run non-parallelized reference implementation */
      copy(C0, Cref);
      walltime.tic();
      ref::mm(Real(ALPHA), A, B, Real(BETA), Cref);
      auto tref = walltime.toc();

      /* run possibly parallelized algorithm */
      copy(C0, Ctst);
      walltime.tic();
      mm(Real(ALPHA), A, B, Real(BETA), Ctst);
      auto tpar = walltime.toc();

      /* compare results */
      auto err_est = gemm_err_est(Real(ALPHA), A, B, C0,
	 Real(BETA), Cref, Ctst);

      /* print results */
      double mflop = 2.*m/1000*n/1000*k;
      fmt::printf(" %7zu %7zu %7zu %7zu %7zu %7zu %7zu %7zu %7zu "
                    "%7.2lf %7.2lf | %7.2lf %11.2lf | "
		    "%7.2lf %7.0e %11.2lf %7.2lf |\n",
                    m, n, k, C0.incRow(), C0.incCol(), A.incRow(), A.incCol(),
                    B.incRow(), B.incCol(), ALPHA, BETA,
                    tref, mflop/tref,
		    tpar, err_est, mflop/tpar, tref/tpar);
      std::cout << std::flush;
   }
}