Two-dimensional aggregation

Content

Assume we want to run the Jacobi iteration until

\[ \max_{i,j = 1, \dots, N-1} \left| A^{(k+1)}_{i,j} - A^{(k)}_{i,j} \right| \le \epsilon \]

for some given threshold \(\epsilon\). Then we need to compute the global maximum of all the differences. This can be done by applying a global aggregation operator. As CUDA provides no convenient support for aggregation, this has to be programmed explicitly.

Fortunately, it is possible to do this using lambda expressions, which have experimental support in CUDA and need to be enabled with the --expt-extended-lambda flag of nvcc (already included in the generic Makefile for this session). Such a self-written aggregation operator could then be used as follows:

   T maxdiff = hpc::cuda::aggregate(global_diff, [] __device__
	 (T x, T y) {
      return x > y? x: y;
   });

Note the __device__ specifier between [] and the parameter list: it causes the entire lambda expression to be compiled for the device. The operator is then applied in some unspecified order to all elements of the matrix. The aggregate function can be declared as follows:

template<
   template<typename> class Matrix,
   typename T,
   typename Aggregator,
   Require< DeviceGe<Matrix<T>> > = true
>
T aggregate(const Matrix<T>& A, Aggregator&& aggregator) {
   /* ... */
}
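
If the experimental extended-lambda support cannot be used, the same effect can be achieved with an ordinary function object whose call operator carries the __device__ specifier. A minimal sketch (MaxOp is just an illustrative name, not part of the library):

template<typename T>
struct MaxOp {
   __device__ T operator()(T x, T y) const {
      return x > y? x: y;
   }
};

/* used like the lambda above: */
T maxdiff = hpc::cuda::aggregate(global_diff, MaxOp<T>{});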

How can the aggregation be parallelized? One simple option is to run one thread for each element of the matrix. However, this wastes most of the computing capacity, as quite a number of the threads will idle. Another option is to aggregate either row-wise or column-wise, whereby each thread aggregates a row or a column within a block, respectively. This needs more iterations, however, as you have to switch between row-wise and column-wise aggregation until just one element remains, which is then returned. The second approach is faster despite the greater number of kernel function calls.
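
For illustration, the row-wise step of the second approach could be sketched as follows. This is only a sketch under several assumptions, not the implementation given below under Sources: it uses one-dimensional blocks, omits the Require constraints and the column-wise counterpart, and relies on the device matrix views used in the sources.

template<
   template<typename> class MatrixA,
   template<typename> class MatrixB,
   typename T,
   typename Aggregator
>
__global__ void aggregate_row_wise(const MatrixA<T> A, MatrixB<T> B,
      Aggregator aggregator) {
   /* each thread aggregates one row of a tile that is blockDim.x columns
      wide; the result matrix B has one column per block column */
   std::size_t i = threadIdx.x + blockIdx.x * blockDim.x;
   std::size_t j0 = blockIdx.y * blockDim.x;
   if (i < A.numRows() && j0 < A.numCols()) {
      T val = A(i, j0);
      for (std::size_t j = j0 + 1;
            j < j0 + blockDim.x && j < A.numCols(); ++j) {
         val = aggregator(val, A(i, j));
      }
      B(i, blockIdx.y) = val;
   }
}

The column-wise counterpart would work analogously on the reduced matrix, and both kernels would be launched alternately until a single element remains.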

Exercise

Sources

Test program and a trivial implementation:

#include <cmath>
#include <limits>
#include <random>
#include <printf.hpp>
#include <hpc/cuda/aggregate.hpp>
#include <hpc/cuda/copy.hpp>
#include <hpc/cuda/gematrix.hpp>
#include <hpc/cuda/timer.hpp>
#include <hpc/matvec/gematrix.hpp>
#include <hpc/matvec/for-all-indices.hpp>
#include <hpc/matvec/print.hpp>

using namespace hpc;
using namespace hpc::aux;
using namespace hpc::cuda;
using namespace hpc::matvec;

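/* keeps track of the smallest and the largest value generated so far;
   used to verify the results computed on the device */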
template<typename T>
struct minmax {
   minmax() :
      min(std::numeric_limits<T>::max()),
      max(std::numeric_limits<T>::lowest()) {
   }
   void update(T value) {
      if (value < min) min = value;
      if (value > max) max = value;
   }
   T min;
   T max;
};

template <
   template<typename> class MatrixA, typename T,
   Require< Ge<MatrixA<T>> > = true
>
minmax<T> randomInit(MatrixA<T>& A) {
   std::random_device random;
   std::mt19937 mt{random()};
   std::uniform_real_distribution<T> uniform(-100, 100);
   minmax<T> mm;
   
   for_all_indices(A, [&](std::size_t i, std::size_t j) {
      A(i, j) = uniform(mt);
      mm.update(A(i, j));
   });
   return mm;
}

template<typename T>
void run_test() {
   for (std::size_t M: {1, 2, 7, 42, 666, 2048}) {
      for (std::size_t N: {1, 2, 7, 42, 666, 2048}) {
	 GeMatrix<T> A(M, N, Order::ColMajor);
	 auto mm = randomInit(A);
	 DeviceGeMatrix<T> devA(A.numRows(), A.numCols(), Order::ColMajor);
	 copy(A, devA);
	 Timer max_timer, min_timer;
	 max_timer.start();
	 auto max = aggregate(devA, [] __device__
	       (T x, T y) {
	    return x > y? x: y;
	 });
	 max_timer.stop();
	 min_timer.start();
	 auto min = aggregate(devA, [] __device__
	       (T x, T y) {
	    return x < y? x: y;
	 });
	 min_timer.stop();
	 fmt::printf("%4d x %4d: %.5f ms %.5f ms\n",
	    M, N, min_timer.elapsed() * 1000, max_timer.elapsed() * 1000);
	 if (min != mm.min) {
	    fmt::printf("min differs: host: %8g, device: %8g\n", mm.min, min);
	 }
	 if (max != mm.max) {
	    fmt::printf("max differs: host: %8g, device: %8g\n", mm.max, max);
	 }
	 if (min != mm.min || max != mm.max) {
	    print(A);
	 }
      }
   }
}

int main() {
   run_test<double>();
}
The trivial implementation:

#ifndef TRIVIAL_AGGREGATE_HPP
#define TRIVIAL_AGGREGATE_HPP

#ifdef __CUDACC__

#include <cstddef>
#include <hpc/cuda/check.hpp>
#include <hpc/cuda/gematrix.hpp>

namespace hpc { namespace cuda {

namespace trivial_aggregate_impl {

constexpr std::size_t BLOCK_DIM = 32;

template<
   template<typename> class MatrixA,
   template<typename> class MatrixB,
   typename T,
   typename Aggregator,
   Require<
      DeviceGe<MatrixA<T>>, DeviceView<MatrixA<T>>,
      DeviceGe<MatrixB<T>>, DeviceView<MatrixB<T>>
   > = true
>
__global__ void aggregate2d(const MatrixA<T> A, MatrixB<T> B,
      Aggregator aggregator) {
   std::size_t i = threadIdx.x + blockIdx.x * blockDim.x;
   std::size_t j = threadIdx.y + blockIdx.y * blockDim.y;

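   /* each block aggregates its tile of A within shared memory */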
   __shared__ T local_block[BLOCK_DIM][BLOCK_DIM];

   std::size_t me_i = threadIdx.x;
   std::size_t me_j = threadIdx.y;
   if (i < A.numRows() && j < A.numCols()) {
      local_block[me_i][me_j] = A(i, j);
   }

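   /* tree reduction within the block: the number of active rows is halved
      first, then the number of active columns; in the end thread (0, 0)
      holds the aggregated value of the tile in local_block[0][0] */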
   std::size_t active_i = blockDim.x / 2;
   std::size_t active_j = blockDim.y / 2;

   while (active_i > 0) {
      __syncthreads();
      if (me_i < active_i && i + active_i < A.numRows()) {
	 local_block[me_i][me_j] = aggregator(local_block[me_i][me_j],
	    local_block[me_i + active_i][me_j]);
      }
      active_i /= 2;
   }
   while (active_j > 0) {
      __syncthreads();
      if (me_j < active_j && j + active_j < A.numCols()) {
	 local_block[me_i][me_j] = aggregator(local_block[me_i][me_j],
	    local_block[me_i][me_j + active_j]);
      }
      active_j /= 2;
   }
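   /* one thread per block writes the aggregated value of its tile to B */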
   if (me_i == 0 && me_j == 0) {
      B(blockIdx.x, blockIdx.y) = local_block[0][0];
   }
}

inline constexpr std::size_t ceildiv(std::size_t x, std::size_t y) {
   /* note that we expect x > 0 && y > 0;
      not safe against overflows but we expect y to be small */
   return (x + y - 1) / y;
}

} // namespace trivial_aggregate_impl

template<
   template<typename> class Matrix,
   typename T,
   typename Aggregator,
   Require< DeviceGe<Matrix<T>> > = true
>
T aggregate(const Matrix<T>& A, Aggregator&& aggregator) {
   using namespace trivial_aggregate_impl;
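   /* as long as more than one element is left, reduce A block-wise into the
      smaller matrix B and recurse; otherwise copy the single remaining
      element back to the host */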
   if (A.numRows() > 1 || A.numCols() > 1) {
      DeviceGeMatrix<T> B(ceildiv(A.numRows(), BLOCK_DIM),
	 ceildiv(A.numCols(), BLOCK_DIM));
      auto target = B.view();
      const auto source = A.view();
      dim3 grid_dim(ceildiv(A.numRows(), BLOCK_DIM),
	 ceildiv(A.numCols(), BLOCK_DIM));
      dim3 block_dim(BLOCK_DIM, BLOCK_DIM);
      aggregate2d<<<grid_dim, block_dim>>>(source, target, aggregator);
      return aggregate(target, std::move(aggregator));
   } else {
      T result;
      CHECK_CUDA(cudaMemcpy, &result, A.data(), sizeof(T),
	 cudaMemcpyDeviceToHost);
      return result;
   }
}

} } // namespaces cuda, hpc

#else
#	error This CUDA source must be compiled using nvcc
#endif

#endif