Sample solution

The following approach takes into account that aggregation does not come with a well-defined order. Hence, the implementation is free to choose any order it wants. In our particular case of seeking the maximum, the order is indeed irrelevant. But in other cases it may play a role – for example when we sum up all elements without any precautions against a loss of significance (by the way, the Kahan summation algorithm can be helpful here). We already exploit this freedom when we decide to aggregate either row- or column-wise.
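
For reference, such a compensated summation can be sketched in a few lines of plain C++. This is merely an illustration of the Kahan algorithm mentioned above; the name kahan_sum and its iterator-based interface are our own choice and not part of the lecture library:

template<typename T, typename Iterator>
T kahan_sum(Iterator first, Iterator last) {
   T sum = 0;
   T c = 0;                /* running compensation */
   for (; first != last; ++first) {
      T y = *first - c;    /* corrected summand */
      T t = sum + y;       /* low-order bits of y may get lost here ... */
      c = (t - sum) - y;   /* ... and are recovered in the compensation */
      sum = t;
   }
   return sum;
}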

The interesting point is that whenever the order does not matter we are free to work with transposing views of a matrix. If a matrix is in row-major order, a transposing view of it is in column-major order, and vice versa.

Hence, we add to the implementation a function named transposed_view which delivers a view of the matrix in the opposite storage order:

template<
   template<typename> class Matrix,
   typename T,
   Require< DeviceGe<Matrix<T>> > = true
>
auto transposed_view(const Matrix<T>& A) {
   return DeviceGeMatrixConstView<T>(A.numCols(), A.numRows(), A.conj(),
      A.data(), A.incCol(), A.incRow());
}
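
As a small illustration (not part of the solution): the view merely swaps dimensions and increments; no element is copied or moved on the device. The helper check_transposed_view and the concrete dimensions are our own choices, and we assume transposed_view is in scope:

#include <cassert>
#include <hpc/cuda/gematrix.hpp>

void check_transposed_view() {
   using namespace hpc::cuda;
   DeviceGeMatrix<double> A(7, 3);
   auto At = transposed_view(A);
   /* dimensions and increments are swapped ... */
   assert(At.numRows() == A.numCols() && At.numCols() == A.numRows());
   assert(At.incRow() == A.incCol() && At.incCol() == A.incRow());
   /* ... while both refer to the same device memory */
   assert(At.data() == A.data());
}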

Now we can create functions that deliver a view in the preferred storage order:

template<
   template<typename> class Matrix,
   typename T,
   Require< DeviceGe<Matrix<T>> > = true
>
auto row_major_view(const Matrix<T>& A) {
   if (A.incRow() < A.incCol()) {
      return transposed_view(A);
   } else {
      return A.view();
   }
}

template<
   template<typename> class Matrix,
   typename T,
   Require< DeviceGe<Matrix<T>> > = true
>
auto col_major_view(const Matrix<T>& A) {
   if (A.incRow() > A.incCol()) {
      return transposed_view(A);
   } else {
      return A.view();
   }
}
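
Again just as an illustration (the function name and dimensions are ours): regardless of how A is actually stored, the view returned by row_major_view reaches the next element of a row with the smaller increment, and col_major_view guarantees the same for columns:

#include <cassert>
#include <hpc/cuda/gematrix.hpp>

void check_preferred_views() {
   using namespace hpc::cuda;
   DeviceGeMatrix<double> A(4, 5);
   auto R = row_major_view(A);         /* assuming the functions above are in scope */
   auto C = col_major_view(A);
   assert(R.incCol() <= R.incRow());   /* consecutive elements of a row are closest */
   assert(C.incRow() <= C.incCol());   /* consecutive elements of a column are closest */
}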

The aggregate function then always picks the most suitable storage order. In addition, a special case has been added for matrices that fit into a single block:

template<
   template<typename> class Matrix,
   typename T,
   typename Aggregator,
   Require< DeviceGe<Matrix<T>> > = true
>
T aggregate(const Matrix<T>& A, Aggregator&& aggregator) {
   using namespace aggregate_impl;
   if (A.numRows() > 1 || A.numCols() > 1) {
      if (A.numRows() <= BLOCK_DIM && A.numCols() <= BLOCK_DIM) {
         /* most efficiently handled by a trivial 2d aggregator */
         DeviceGeMatrix<T> B(1, 1);
         const auto source = col_major_view(A);
         dim3 block_dim(align_to_power_of_2(source.numRows()),
            align_to_power_of_2(source.numCols()));
         auto target = B.view();
         aggregate2d<<<1, block_dim>>>(source, target, aggregator);
         return aggregate(target, std::move(aggregator));
      } else {
         /* aggregate column- or row-wise */
         if (A.numRows() > A.numCols()) {
            auto source = row_major_view(A);
            dim3 grid_dim(ceildiv(source.numRows(), BLOCK_DIM),
               ceildiv(source.numCols(), BLOCK_DIM));
            DeviceGeMatrix<T> B(ceildiv(source.numRows(), BLOCK_DIM),
               source.numCols());
            auto target = B.view();
            dim3 block_dim(1, BLOCK_DIM);
            aggregate_col_wise<<<grid_dim, block_dim>>>(source, target,
               aggregator);
            return aggregate(target, std::move(aggregator));
         } else {
            auto source = col_major_view(A);
            dim3 grid_dim(ceildiv(source.numRows(), BLOCK_DIM),
               ceildiv(source.numCols(), BLOCK_DIM));
            DeviceGeMatrix<T> B(source.numRows(),
               ceildiv(source.numCols(), BLOCK_DIM));
            auto target = B.view();
            dim3 block_dim(BLOCK_DIM, 1);
            aggregate_row_wise<<<grid_dim, block_dim>>>(source, target,
               aggregator);
            return aggregate(target, std::move(aggregator));
         }
      }
   } else {
      T result;
      CHECK_CUDA(cudaMemcpy, &result, A.data(), sizeof(T),
         cudaMemcpyDeviceToHost);
      return result;
   }
}
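
For the original task of finding the maximal element, an invocation could look as follows. This is only a sketch: the Max functor and the maximum wrapper are our own, and we assume that the header shown below is available as <hpc/cuda/aggregate.hpp>; only aggregate and DeviceGeMatrix come from the solution:

#include <hpc/cuda/aggregate.hpp>
#include <hpc/cuda/gematrix.hpp>

/* aggregator functor; operator() must be callable on the device
   as it is invoked within the kernels */
template<typename T>
struct Max {
   __host__ __device__ T operator()(T x, T y) const {
      return x > y? x: y;
   }
};

template<typename T>
T maximum(const hpc::cuda::DeviceGeMatrix<T>& A) {
   return hpc::cuda::aggregate(A, Max<T>{});
}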

Now we obtain some significant performance improvements:

livingstone$ ./test-aggregate
   1 x    1: 0.01338 ms 0.01866 ms
   1 x    2: 0.03770 ms 0.04144 ms
   1 x    7: 0.03510 ms 0.03216 ms
   1 x   42: 0.05661 ms 0.05312 ms
   1 x  666: 0.04237 ms 0.05232 ms
   1 x 2048: 0.05184 ms 0.04922 ms
   2 x    1: 0.03565 ms 0.03133 ms
   2 x    2: 0.03584 ms 0.03152 ms
   2 x    7: 0.03562 ms 0.03037 ms
   2 x   42: 0.04045 ms 0.06445 ms
   2 x  666: 0.04026 ms 0.04704 ms
   2 x 2048: 0.06304 ms 0.04656 ms
   7 x    1: 0.03722 ms 0.03293 ms
   7 x    2: 0.04496 ms 0.03245 ms
   7 x    7: 0.02976 ms 0.03299 ms
   7 x   42: 0.04365 ms 0.05325 ms
   7 x  666: 0.05120 ms 0.04003 ms
   7 x 2048: 0.06221 ms 0.04326 ms
  42 x    1: 0.05398 ms 0.06029 ms
  42 x    2: 0.05654 ms 0.07037 ms
  42 x    7: 0.05290 ms 0.06013 ms
  42 x   42: 0.06675 ms 0.06304 ms
  42 x  666: 0.06307 ms 0.04931 ms
  42 x 2048: 0.08496 ms 0.07021 ms
 666 x    1: 0.05098 ms 0.05606 ms
 666 x    2: 0.05760 ms 0.05814 ms
 666 x    7: 0.05142 ms 0.04013 ms
 666 x   42: 0.05933 ms 0.04742 ms
 666 x  666: 0.47325 ms 0.32544 ms
 666 x 2048: 0.58461 ms 0.45338 ms
2048 x    1: 0.06493 ms 0.07059 ms
2048 x    2: 0.06227 ms 0.04794 ms
2048 x    7: 0.06253 ms 0.05690 ms
2048 x   42: 0.07162 ms 0.06064 ms
2048 x  666: 0.58288 ms 0.45914 ms
2048 x 2048: 0.93130 ms 0.80144 ms
livingstone$

For reference, here is the complete source of the aggregate header:
#ifndef HPC_CUDA_AGGREGATE_HPP
#define HPC_CUDA_AGGREGATE_HPP 1

#ifdef __CUDACC__

#include <hpc/cuda/check.hpp>
#include <hpc/cuda/gematrix.hpp>
#include <hpc/cuda/properties.hpp>

namespace hpc { namespace cuda {

namespace aggregate_impl {

/* we expect our GPU to support BLOCK_DIM x BLOCK_DIM threads per block;
   this must be a power of 2 and is best a multiple of the warp size */
constexpr std::size_t BLOCK_DIM = 32;

/* kernel that operates with a thread for each row
   of each block of matrix A */
template<
   template<typename> class MatrixA,
   template<typename> class MatrixB,
   typename T,
   typename Aggregator,
   Require<
      DeviceGe<MatrixA<T>>, DeviceView<MatrixA<T>>,
      DeviceGe<MatrixB<T>>, DeviceView<MatrixB<T>>
   > = true
>
__global__ void aggregate_row_wise(const MatrixA<T> A, MatrixB<T> B,
      Aggregator aggregator) {
   std::size_t i = threadIdx.x + blockIdx.x * BLOCK_DIM;
   std::size_t j = threadIdx.y + blockIdx.y * BLOCK_DIM;

   if (i < A.numRows()) {
      T val = A(i, j);
      std::size_t maxoffset = BLOCK_DIM;
      if (j + maxoffset >= A.numCols()) {
	 maxoffset = A.numCols() - j;
      }
      for (std::size_t offset = 1; offset < maxoffset; ++offset) {
	 val = aggregator(val, A(i, j + offset));
      }
      B(i, blockIdx.y) = val;
   }
}

/* kernel that operates with a thread for each column
   of each block of matrix A */
template<
   template<typename> class MatrixA,
   template<typename> class MatrixB,
   typename T,
   typename Aggregator,
   Require<
      DeviceGe<MatrixA<T>>, DeviceView<MatrixA<T>>,
      DeviceGe<MatrixB<T>>, DeviceView<MatrixB<T>>
   > = true
>
__global__ void aggregate_col_wise(const MatrixA<T> A, MatrixB<T> B,
      Aggregator aggregator) {
   std::size_t i = threadIdx.x + blockIdx.x * BLOCK_DIM;
   std::size_t j = threadIdx.y + blockIdx.y * BLOCK_DIM;

   if (j < A.numCols()) {
      T val = A(i, j);
      std::size_t maxoffset = BLOCK_DIM;
      if (i + maxoffset >= A.numRows()) {
	 maxoffset = A.numRows() - i;
      }
      for (std::size_t offset = 1; offset < maxoffset; ++offset) {
	 val = aggregator(val, A(i + offset, j));
      }
      B(blockIdx.x, j) = val;
   }
}

/* kernel that operates with a thread for each element of matrix A;
   note that this kernel function is expected to be configured
   with blockDim.x and blockDim.y to be powers of 2 */
template<
   template<typename> class MatrixA,
   template<typename> class MatrixB,
   typename T,
   typename Aggregator,
   Require<
      DeviceGe<MatrixA<T>>, DeviceView<MatrixA<T>>,
      DeviceGe<MatrixB<T>>, DeviceView<MatrixB<T>>
   > = true
>
__global__ void aggregate2d(const MatrixA<T> A, MatrixB<T> B,
      Aggregator aggregator) {
   std::size_t i = threadIdx.x + blockIdx.x * blockDim.x;
   std::size_t j = threadIdx.y + blockIdx.y * blockDim.y;

   __shared__ T local_block[BLOCK_DIM][BLOCK_DIM];

   std::size_t me_i = threadIdx.x;
   std::size_t me_j = threadIdx.y;
   if (i < A.numRows() && j < A.numCols()) {
      local_block[me_i][me_j] = A(i, j);
   }

   std::size_t active_i = blockDim.x / 2;
   std::size_t active_j = blockDim.y / 2;

   while (active_i > 0) {
      __syncthreads();
      if (me_i < active_i && i + active_i < A.numRows()) {
	 local_block[me_i][me_j] = aggregator(local_block[me_i][me_j],
	    local_block[me_i + active_i][me_j]);
      }
      active_i /= 2;
   }
   while (active_j > 0) {
      __syncthreads();
      if (me_j < active_j && j + active_j < A.numCols()) {
	 local_block[me_i][me_j] = aggregator(local_block[me_i][me_j],
	    local_block[me_i][me_j + active_j]);
      }
      active_j /= 2;
   }
   if (me_i == 0 && me_j == 0) {
      B(blockIdx.x, blockIdx.y) = local_block[0][0];
   }
}

template<
   template<typename> class Matrix,
   typename T,
   Require< DeviceGe<Matrix<T>> > = true
>
auto transposed_view(const Matrix<T>& A) {
   return DeviceGeMatrixConstView<T>(A.numCols(), A.numRows(), A.conj(),
      A.data(), A.incCol(), A.incRow());
}

template<
   template<typename> class Matrix,
   typename T,
   Require< DeviceGe<Matrix<T>> > = true
>
auto row_major_view(const Matrix<T>& A) {
   if (A.incRow() < A.incCol()) {
      return transposed_view(A);
   } else {
      return A.view();
   }
}

template<
   template<typename> class Matrix,
   typename T,
   Require< DeviceGe<Matrix<T>> > = true
>
auto col_major_view(const Matrix<T>& A) {
   if (A.incRow() > A.incCol()) {
      return transposed_view(A);
   } else {
      return A.view();
   }
}

inline constexpr std::size_t ceildiv(std::size_t x, std::size_t y) {
   /* note that we expect x > 0 && y > 0;
      not safe against overflows but we expect y to be small */
   return (x + y - 1) / y;
}

template<typename T>
inline constexpr T align_to_power_of_2(T val) {
   T res = 1;
   while (res < val) {
      res *= 2;
   }
   return res;
}

} // namespace aggregate_impl

template<
   template<typename> class Matrix,
   typename T,
   typename Aggregator,
   Require< DeviceGe<Matrix<T>> > = true
>
T aggregate(const Matrix<T>& A, Aggregator&& aggregator) {
   using namespace aggregate_impl;
   if (A.numRows() > 1 || A.numCols() > 1) {
      if (A.numRows() <= BLOCK_DIM && A.numCols() <= BLOCK_DIM) {
	 /* most efficiently handled by a trivial 2d aggregator */
	 DeviceGeMatrix<T> B(1, 1);
	 const auto source = col_major_view(A);
	 dim3 block_dim(align_to_power_of_2(source.numRows()),
	    align_to_power_of_2(source.numCols()));
	 auto target = B.view();
	 aggregate2d<<<1, block_dim>>>(source, target, aggregator);
	 return aggregate(target, std::move(aggregator));
      } else {
	 /* aggregate column- or row-wise */
	 if (A.numRows() > A.numCols()) {
	    auto source = row_major_view(A);
	    dim3 grid_dim(ceildiv(source.numRows(), BLOCK_DIM),
	       ceildiv(source.numCols(), BLOCK_DIM));
	    DeviceGeMatrix<T> B(ceildiv(source.numRows(), BLOCK_DIM),
	       source.numCols());
	    auto target = B.view();
	    dim3 block_dim(1, BLOCK_DIM);
	    aggregate_col_wise<<<grid_dim, block_dim>>>(source, target,
	       aggregator);
	    return aggregate(target, std::move(aggregator));
	 } else {
	    auto source = col_major_view(A);
	    dim3 grid_dim(ceildiv(source.numRows(), BLOCK_DIM),
	       ceildiv(source.numCols(), BLOCK_DIM));
	    DeviceGeMatrix<T> B(source.numRows(),
	       ceildiv(source.numCols(), BLOCK_DIM));
	    auto target = B.view();
	    dim3 block_dim(BLOCK_DIM, 1);
	    aggregate_row_wise<<<grid_dim, block_dim>>>(source, target,
	       aggregator);
	    return aggregate(target, std::move(aggregator));
	 }
      }
   } else {
      T result;
      CHECK_CUDA(cudaMemcpy, &result, A.data(), sizeof(T),
	 cudaMemcpyDeviceToHost);
      return result;
   }
}

} } // namespaces cuda and hpc

#else
#	error This CUDA source must be compiled using nvcc
#endif

#endif