Sample solution

The following approach takes into account that aggregation does not come with a well-defined order. Hence, the implementation is free to choose any order it wants. In our particular case of seeking the maximum, the order is indeed irrelevant. But in other cases it may play a role, for example when we sum up all elements without any precautions against a loss of significance (the Kahan summation algorithm can be helpful here). We are already using this freedom when we decide to aggregate either row- or column-wise.
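
This is not part of the solution but, to illustrate the remark, here is a minimal host-side sketch of Kahan summation (plain C++, with a hypothetical helper name kahan_sum): a running compensation term captures the low-order bits that would otherwise be lost whenever a small value is added to a much larger partial sum, which is exactly the effect that makes the summation order significant in the first place.

#include <cstddef>
#include <cstdio>
#include <vector>

/* compensated (Kahan) summation */
template<typename T>
T kahan_sum(const T* values, std::size_t n) {
   T sum = 0;
   T c = 0;                      /* running compensation */
   for (std::size_t i = 0; i < n; ++i) {
      T val = values[i] - c;     /* apply the previous correction */
      T t = sum + val;           /* low-order bits of val may get lost here ... */
      c = (t - sum) - val;       /* ... and are recovered here */
      sum = t;
   }
   return sum;
}

int main() {
   std::vector<float> v(10000000, 0.1f);
   float naive = 0;
   for (auto x: v) naive += x;
   std::printf("naive sum: %f\n", naive);
   std::printf("kahan sum: %f\n", kahan_sum(v.data(), v.size()));
}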

The interesting point is that when the order does not matter, we are free to work with transposing views of a matrix. If a matrix is stored in row-major order, a transposing view of it is in column-major order, and vice versa.

Hence, we add to the implementation a function named transposed_view which delivers a view of the matrix in the opposite storage order:

template<
   template<typename> class Matrix,
   typename T,
   Require< DeviceGe<Matrix<T>> > = true
>
auto transposed_view(const Matrix<T>& A) {
   return DeviceGeMatrixConstView<T>(A.numCols(), A.numRows(), A.conj(),
      A.data(), A.incCol(), A.incRow());
}
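
Nothing is copied by such a view; it merely swaps the dimensions and the increments. A small sketch of this (the final header shown at the end of this page is assumed to be included; the increments named in the comment are hypothetical and presume a row-major DeviceGeMatrix):

#include <cassert>
#include <hpc/cuda/aggregate.hpp>
#include <hpc/cuda/gematrix.hpp>

using namespace hpc::cuda;

int main() {
   DeviceGeMatrix<double> A(2, 3);   /* presumably incRow() == 3, incCol() == 1 */
   auto Av = A.view();
   auto At = aggregate_impl::transposed_view(Av);
   /* same buffer: element (i, j) of At refers to element (j, i) of A */
   assert(At.numRows() == Av.numCols() && At.numCols() == Av.numRows());
   assert(At.incRow() == Av.incCol() && At.incCol() == Av.incRow());
}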

Now we can create functions that deliver a view in the preferred storage order:

template<
   template<typename> class Matrix,
   typename T,
   Require< DeviceGe<Matrix<T>> > = true
>
auto row_major_view(const Matrix<T>& A) {
   if (A.incRow() < A.incCol()) {
      return transposed_view(A);
   } else {
      return A.view();
   }
}

template<
   template<typename> class Matrix,
   typename T,
   Require< DeviceGe<Matrix<T>> > = true
>
auto col_major_view(const Matrix<T>& A) {
   if (A.incRow() > A.incCol()) {
      return transposed_view(A);
   } else {
      return A.view();
   }
}
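
Note that the view delivered by these helpers is possibly the transpose of the original matrix. For an aggregation over all elements this is harmless as the set of visited elements stays the same. As a small illustration of which branch is taken (hypothetically assuming a row-major DeviceGeMatrix, i.e. incRow() > incCol()):

#include <hpc/cuda/aggregate.hpp>
#include <hpc/cuda/gematrix.hpp>

using namespace hpc::cuda;
using namespace hpc::cuda::aggregate_impl;

int main() {
   DeviceGeMatrix<double> A(100, 50);  /* hypothetically incRow() == 50, incCol() == 1 */
   auto R = row_major_view(A.view());  /* incRow() > incCol(): the plain view, 100 x 50 */
   auto C = col_major_view(A.view());  /* transposed view: 50 x 100 with incRow() == 1 */
}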

Then we always choose the best storage order. In addition, a special case was added for when the matrix fits into a single block:

template<
   template<typename> class Matrix,
   typename T,
   typename Aggregator,
   Require< DeviceGe<Matrix<T>> > = true
>
T aggregate(const Matrix<T>& A, Aggregator&& aggregator) {
   using namespace aggregate_impl;
   using namespace hpc::aux;
   if (A.numRows() > 1 || A.numCols() > 1) {
      if (A.numRows() <= BLOCK_DIM && A.numCols() <= BLOCK_DIM) {
         /* most efficiently handled by a trivial 2d aggregator */
         DeviceGeMatrix<T> B(1, 1);
         const auto source = col_major_view(A);
         dim3 block_dim(align_to_power_of_2(source.numRows()),
            align_to_power_of_2(source.numCols()));
         auto target = B.view();
         aggregate2d<<<1, block_dim>>>(source, target, aggregator);
         return aggregate(target, std::move(aggregator));
      } else {
         /* aggregate column- or row-wise */
         if (A.numRows() > A.numCols()) {
            auto source = row_major_view(A);
            dim3 grid_dim(ceildiv(source.numRows(), BLOCK_DIM),
               ceildiv(source.numCols(), BLOCK_DIM));
            DeviceGeMatrix<T> B(ceildiv(source.numRows(), BLOCK_DIM),
               source.numCols());
            auto target = B.view();
            dim3 block_dim(1, BLOCK_DIM);
            aggregate_col_wise<<<grid_dim, block_dim>>>(source, target,
               aggregator);
            return aggregate(target, std::move(aggregator));
         } else {
            auto source = col_major_view(A);
            dim3 grid_dim(ceildiv(source.numRows(), BLOCK_DIM),
               ceildiv(source.numCols(), BLOCK_DIM));
            DeviceGeMatrix<T> B(source.numRows(),
               ceildiv(source.numCols(), BLOCK_DIM));
            auto target = B.view();
            dim3 block_dim(BLOCK_DIM, 1);
            aggregate_row_wise<<<grid_dim, block_dim>>>(source, target,
               aggregator);
            return aggregate(target, std::move(aggregator));
         }
      }
   } else {
      T result;
      CHECK_CUDA(cudaMemcpy, &result, A.data(), sizeof(T),
         cudaMemcpyDeviceToHost);
      return result;
   }
}
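
As a usage sketch (this is not the actual test driver test-aggregate.cu, just a hypothetical example): thanks to --expt-extended-lambda we can pass a device lambda as aggregator, for instance to determine the maximum of all elements:

#include <cstdio>
#include <hpc/cuda/aggregate.hpp>
#include <hpc/cuda/gematrix.hpp>

using namespace hpc::cuda;

int main() {
   DeviceGeMatrix<double> A(666, 2048);
   /* ... fill A on the device (omitted here) ... */
   double max = aggregate(A.view(), [] __device__ (double x, double y) {
      return x > y ? x : y;
   });
   std::printf("max = %g\n", max);
}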

Now we obtain some significant performance improvements:

livingstone$ make EXTRA=step02 test-aggregate
nvcc -c -std=c++14 -Istep02 -I/home/numerik/pub/hpc/ws19/session30 -I/usr/include/gdk-pixbuf-2.0 -I/usr/include/libpng16 -I/usr/include/glib-2.0 -I/usr/lib/x86_64-linux-gnu/glib-2.0/include --expt-extended-lambda test-aggregate.cu
nvcc -w -o test-aggregate -std=c++14 --expt-extended-lambda test-aggregate.o -lgdk_pixbuf-2.0 -lgobject-2.0 -lglib-2.0
livingstone$ ./test-aggregate
   1 x    1: 0.02112 ms 0.01930 ms
   1 x    2: 0.02973 ms 0.04115 ms
   1 x    7: 0.03629 ms 0.02986 ms
   1 x   42: 0.04019 ms 0.05293 ms
   1 x  666: 0.04723 ms 0.04915 ms
   1 x 2048: 0.05994 ms 0.04678 ms
   2 x    1: 0.03453 ms 0.02979 ms
   2 x    2: 0.04285 ms 0.02979 ms
   2 x    7: 0.03504 ms 0.03021 ms
   2 x   42: 0.03878 ms 0.04854 ms
   2 x  666: 0.03968 ms 0.04470 ms
   2 x 2048: 0.05808 ms 0.04035 ms
   7 x    1: 0.03478 ms 0.02976 ms
   7 x    2: 0.03722 ms 0.02947 ms
   7 x    7: 0.03674 ms 0.03094 ms
   7 x   42: 0.04086 ms 0.05277 ms
   7 x  666: 0.04912 ms 0.03894 ms
   7 x 2048: 0.06061 ms 0.05018 ms
  42 x    1: 0.05277 ms 0.05776 ms
  42 x    2: 0.06246 ms 0.05443 ms
  42 x    7: 0.05184 ms 0.05853 ms
  42 x   42: 0.06640 ms 0.06272 ms
  42 x  666: 0.06806 ms 0.04835 ms
  42 x 2048: 0.08464 ms 0.06534 ms
 666 x    1: 0.05050 ms 0.05648 ms
 666 x    2: 0.05843 ms 0.05846 ms
 666 x    7: 0.04976 ms 0.04029 ms
 666 x   42: 0.05933 ms 0.04237 ms
 666 x  666: 0.47030 ms 0.32147 ms
 666 x 2048: 0.58477 ms 0.45366 ms
2048 x    1: 0.06506 ms 0.06246 ms
2048 x    2: 0.06326 ms 0.04669 ms
2048 x    7: 0.06186 ms 0.05645 ms
2048 x   42: 0.07149 ms 0.05869 ms
2048 x  666: 0.59434 ms 0.45837 ms
2048 x 2048: 0.93987 ms 0.81520 ms
livingstone$ 

The complete header file (<hpc/cuda/aggregate.hpp>) of this solution:
#ifndef HPC_CUDA_AGGREGATE_HPP
#define HPC_CUDA_AGGREGATE_HPP 1

#ifdef __CUDACC__

#include <hpc/aux/ceildiv.hpp>
#include <hpc/cuda/check.hpp>
#include <hpc/cuda/gematrix.hpp>
#include <hpc/cuda/properties.hpp>

namespace hpc { namespace cuda {

namespace aggregate_impl {

/* we expect our GPU to support BLOCK_DIM x BLOCK_DIM threads per block;
   this must be a power of 2 and is best a multiple of the warp size */
constexpr std::size_t BLOCK_DIM = 32;

/* kernel that operates with a thread for each row
   of each block of matrix A */
template<
   template<typename> class MatrixA,
   template<typename> class MatrixB,
   typename T,
   typename Aggregator,
   Require<
      DeviceGe<MatrixA<T>>, DeviceView<MatrixA<T>>,
      DeviceGe<MatrixB<T>>, DeviceView<MatrixB<T>>
   > = true
>
__global__ void aggregate_row_wise(const MatrixA<T> A, MatrixB<T> B,
      Aggregator aggregator) {
   std::size_t i = threadIdx.x + blockIdx.x * BLOCK_DIM;
   std::size_t j = threadIdx.y + blockIdx.y * BLOCK_DIM;

   if (i < A.numRows()) {
      T val = A(i, j);
      std::size_t maxoffset = BLOCK_DIM;
      if (j + maxoffset >= A.numCols()) {
         maxoffset = A.numCols() - j;
      }
      for (std::size_t offset = 1; offset < maxoffset; ++offset) {
         val = aggregator(val, A(i, j + offset));
      }
      B(i, blockIdx.y) = val;
   }
}

/* kernel that operates with a thread for each column
   of each block of matrix A */
template<
   template<typename> class MatrixA,
   template<typename> class MatrixB,
   typename T,
   typename Aggregator,
   Require<
      DeviceGe<MatrixA<T>>, DeviceView<MatrixA<T>>,
      DeviceGe<MatrixB<T>>, DeviceView<MatrixB<T>>
   > = true
>
__global__ void aggregate_col_wise(const MatrixA<T> A, MatrixB<T> B,
      Aggregator aggregator) {
   std::size_t i = threadIdx.x + blockIdx.x * BLOCK_DIM;
   std::size_t j = threadIdx.y + blockIdx.y * BLOCK_DIM;

   if (j < A.numCols()) {
      T val = A(i, j);
      std::size_t maxoffset = BLOCK_DIM;
      if (i + maxoffset >= A.numRows()) {
         maxoffset = A.numRows() - i;
      }
      for (std::size_t offset = 1; offset < maxoffset; ++offset) {
         val = aggregator(val, A(i + offset, j));
      }
      B(blockIdx.x, j) = val;
   }
}

/* kernel that operates with a thread for each element of matrix A;
   note that this kernel function is expected to be configured
   with blockDim.x and blockDim.y to be powers of 2 */
template<
   template<typename> class MatrixA,
   template<typename> class MatrixB,
   typename T,
   typename Aggregator,
   Require<
      DeviceGe<MatrixA<T>>, DeviceView<MatrixA<T>>,
      DeviceGe<MatrixB<T>>, DeviceView<MatrixB<T>>
   > = true
>
__global__ void aggregate2d(const MatrixA<T> A, MatrixB<T> B,
      Aggregator aggregator) {
   std::size_t i = threadIdx.x + blockIdx.x * blockDim.x;
   std::size_t j = threadIdx.y + blockIdx.y * blockDim.y;

   __shared__ T local_block[BLOCK_DIM][BLOCK_DIM];

   std::size_t me_i = threadIdx.x;
   std::size_t me_j = threadIdx.y;
   if (i < A.numRows() && j < A.numCols()) {
      local_block[me_i][me_j] = A(i, j);
   }

   /* reduce the block within shared memory using a binary tree scheme:
      the rows are folded pairwise until row 0 holds the aggregate of
      each column, then the columns are folded until local_block[0][0]
      holds the aggregate of the entire block; this requires blockDim.x
      and blockDim.y to be powers of 2 */
   std::size_t active_i = blockDim.x / 2;
   std::size_t active_j = blockDim.y / 2;

   while (active_i > 0) {
      __syncthreads();
      if (me_i < active_i && i + active_i < A.numRows()) {
         local_block[me_i][me_j] = aggregator(local_block[me_i][me_j],
            local_block[me_i + active_i][me_j]);
      }
      active_i /= 2;
   }
   while (active_j > 0) {
      __syncthreads();
      if (me_j < active_j && j + active_j < A.numCols()) {
         local_block[me_i][me_j] = aggregator(local_block[me_i][me_j],
            local_block[me_i][me_j + active_j]);
      }
      active_j /= 2;
   }
   if (me_i == 0 && me_j == 0) {
      B(blockIdx.x, blockIdx.y) = local_block[0][0];
   }
}

template<
   template<typename> class Matrix,
   typename T,
   Require< DeviceGe<Matrix<T>> > = true
>
auto transposed_view(const Matrix<T>& A) {
   return DeviceGeMatrixConstView<T>(A.numCols(), A.numRows(), A.conj(),
      A.data(), A.incCol(), A.incRow());
}

template<
   template<typename> class Matrix,
   typename T,
   Require< DeviceGe<Matrix<T>> > = true
>
auto row_major_view(const Matrix<T>& A) {
   if (A.incRow() < A.incCol()) {
      return transposed_view(A);
   } else {
      return A.view();
   }
}

template<
   template<typename> class Matrix,
   typename T,
   Require< DeviceGe<Matrix<T>> > = true
>
auto col_major_view(const Matrix<T>& A) {
   if (A.incRow() > A.incCol()) {
      return transposed_view(A);
   } else {
      return A.view();
   }
}

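/* round val up to the next power of 2, e.g. align_to_power_of_2(7) == 8;
   used to configure the block dimensions for aggregate2d which are
   required to be powers of 2 */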
template<typename T>
inline constexpr T align_to_power_of_2(T val) {
   T res = 1;
   while (res < val) {
      res *= 2;
   }
   return res;
}

} // namespace aggregate_impl

template<
   template<typename> class Matrix,
   typename T,
   typename Aggregator,
   Require< DeviceGe<Matrix<T>> > = true
>
T aggregate(const Matrix<T>& A, Aggregator&& aggregator) {
   using namespace aggregate_impl;
   using namespace hpc::aux;
   if (A.numRows() > 1 || A.numCols() > 1) {
      if (A.numRows() <= BLOCK_DIM && A.numCols() <= BLOCK_DIM) {
         /* most efficiently handled by a trivial 2d aggregator */
         DeviceGeMatrix<T> B(1, 1);
         const auto source = col_major_view(A);
         dim3 block_dim(align_to_power_of_2(source.numRows()),
            align_to_power_of_2(source.numCols()));
         auto target = B.view();
         aggregate2d<<<1, block_dim>>>(source, target, aggregator);
         return aggregate(target, std::move(aggregator));
      } else {
         /* aggregate column- or row-wise */
         if (A.numRows() > A.numCols()) {
            auto source = row_major_view(A);
            dim3 grid_dim(ceildiv(source.numRows(), BLOCK_DIM),
               ceildiv(source.numCols(), BLOCK_DIM));
            DeviceGeMatrix<T> B(ceildiv(source.numRows(), BLOCK_DIM),
               source.numCols());
            auto target = B.view();
            dim3 block_dim(1, BLOCK_DIM);
            aggregate_col_wise<<<grid_dim, block_dim>>>(source, target,
               aggregator);
            return aggregate(target, std::move(aggregator));
         } else {
            auto source = col_major_view(A);
            dim3 grid_dim(ceildiv(source.numRows(), BLOCK_DIM),
               ceildiv(source.numCols(), BLOCK_DIM));
            DeviceGeMatrix<T> B(source.numRows(),
               ceildiv(source.numCols(), BLOCK_DIM));
            auto target = B.view();
            dim3 block_dim(BLOCK_DIM, 1);
            aggregate_row_wise<<<grid_dim, block_dim>>>(source, target,
               aggregator);
            return aggregate(target, std::move(aggregator));
         }
      }
   } else {
      T result;
      CHECK_CUDA(cudaMemcpy, &result, A.data(), sizeof(T),
         cudaMemcpyDeviceToHost);
      return result;
   }
}

} } // namespaces cuda and hpc

#else
#	error This CUDA source must be compiled using nvcc
#endif

#endif