#ifndef HPC_CUDA_AGGREGATE_HPP
#define HPC_CUDA_AGGREGATE_HPP 1
#ifdef __CUDACC__
#include <hpc/cuda/check.hpp>
#include <hpc/cuda/gematrix.hpp>
#include <hpc/cuda/properties.hpp>
namespace hpc { namespace cuda {
namespace aggregate_impl {
/* BLOCK_DIM is the number of elements each thread aggregates and also
   the number of threads per block (which matches the warp size);
   the kernels below are launched with blocks of BLOCK_DIM x 1 or
   1 x BLOCK_DIM threads */
constexpr std::size_t BLOCK_DIM = 32;
/* each thread of this kernel aggregates up to BLOCK_DIM consecutive
   elements of one row of A and stores the result in B(i, blockIdx.y);
   it is expected to be launched with blocks of BLOCK_DIM x 1 threads */
template<
   template<typename> class MatrixA,
   template<typename> class MatrixB,
   typename T,
   typename Aggregator,
   Require<
      DeviceGe<MatrixA<T>>, DeviceView<MatrixA<T>>,
      DeviceGe<MatrixB<T>>, DeviceView<MatrixB<T>>
   > = true
>
__global__ void aggregate_row_wise(const MatrixA<T> A, MatrixB<T> B,
      Aggregator aggregator) {
   std::size_t i = threadIdx.x + blockIdx.x * BLOCK_DIM;
   std::size_t j = threadIdx.y + blockIdx.y * BLOCK_DIM;
   if (i < A.numRows()) {
      T val = A(i, j);
      std::size_t maxoffset = BLOCK_DIM;
      if (j + maxoffset >= A.numCols()) {
	 maxoffset = A.numCols() - j;
      }
      for (std::size_t offset = 1; offset < maxoffset; ++offset) {
	 val = aggregator(val, A(i, j + offset));
      }
      B(i, blockIdx.y) = val;
   }
}
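/* worked example (purely illustrative numbers, not taken from the code):
   for a 70 x 100 matrix A and BLOCK_DIM = 32, aggregate() below launches
   this kernel with grid_dim = (3, 4) and block_dim = (32, 1);
   the thread with threadIdx.x = 5 in block (1, 3) gets i = 37 and j = 96,
   maxoffset becomes 100 - 96 = 4, and it folds
   A(37,96), A(37,97), A(37,98), A(37,99) into B(37, 3);
   the resulting B has dimensions 70 x 4 */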
/* each thread of this kernel aggregates up to BLOCK_DIM consecutive
   elements of one column of A and stores the result in B(blockIdx.x, j);
   it is expected to be launched with blocks of 1 x BLOCK_DIM threads */
template<
   template<typename> class MatrixA,
   template<typename> class MatrixB,
   typename T,
   typename Aggregator,
   Require<
      DeviceGe<MatrixA<T>>, DeviceView<MatrixA<T>>,
      DeviceGe<MatrixB<T>>, DeviceView<MatrixB<T>>
   > = true
>
__global__ void aggregate_col_wise(const MatrixA<T> A, MatrixB<T> B,
      Aggregator aggregator) {
   std::size_t i = threadIdx.x + blockIdx.x * BLOCK_DIM;
   std::size_t j = threadIdx.y + blockIdx.y * BLOCK_DIM;
   if (j < A.numCols()) {
      T val = A(i, j);
      std::size_t maxoffset = BLOCK_DIM;
      if (i + maxoffset >= A.numRows()) {
	 maxoffset = A.numRows() - i;
      }
      for (std::size_t offset = 1; offset < maxoffset; ++offset) {
	 val = aggregator(val, A(i + offset, j));
      }
      B(blockIdx.x, j) = val;
   }
}
inline constexpr std::size_t ceildiv(std::size_t x, std::size_t y) {
   /* note that we expect x > 0 && y > 0;
      not safe against overflows but we expect y to be small */
   return (x + y - 1) / y;
}
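/* example: ceildiv(70, 32) == 3 and ceildiv(64, 32) == 2, i.e. the number
   of blocks of size y needed to cover x elements */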
} // namespace aggregate_impl
/* aggregate all elements of A into a single value using the given binary
   aggregator; the matrix is reduced block-wise on the device, recursing
   on successively smaller intermediate matrices until just one element
   is left, which is then copied to the host */
template<
   template<typename> class Matrix,
   typename T,
   typename Aggregator,
   Require< DeviceGe<Matrix<T>> > = true
>
T aggregate(const Matrix<T>& A, Aggregator&& aggregator) {
   using namespace aggregate_impl;
   if (A.numRows() > 1 || A.numCols() > 1) {
      const auto source = A.view();
      dim3 grid_dim(ceildiv(A.numRows(), BLOCK_DIM),
	 ceildiv(A.numCols(), BLOCK_DIM));
      if (A.numRows() > A.numCols()) {
	 /* more rows than columns: reduce along the columns so that
	    B has just ceildiv(numRows, BLOCK_DIM) rows */
	 DeviceGeMatrix<T> B(ceildiv(A.numRows(), BLOCK_DIM), A.numCols());
	 auto target = B.view();
	 dim3 block_dim(1, BLOCK_DIM);
	 aggregate_col_wise<<<grid_dim, block_dim>>>(source, target,
	    aggregator);
	 return aggregate(target, std::move(aggregator));
      } else {
	 /* otherwise reduce along the rows so that
	    B has just ceildiv(numCols, BLOCK_DIM) columns */
	 DeviceGeMatrix<T> B(A.numRows(), ceildiv(A.numCols(), BLOCK_DIM));
	 auto target = B.view();
	 dim3 block_dim(BLOCK_DIM, 1);
	 aggregate_row_wise<<<grid_dim, block_dim>>>(source, target,
	    aggregator);
	 return aggregate(target, std::move(aggregator));
      }
   } else {
      /* base case: A is 1x1, fetch the remaining element from the device */
      T result;
      CHECK_CUDA(cudaMemcpy, &result, A.data(), sizeof(T),
	 cudaMemcpyDeviceToHost);
      return result;
   }
}
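/* usage sketch: the following merely illustrates how aggregate() could be
   invoked from host code; GeMatrix, copy() and their exact signatures are
   assumptions about the surrounding hpc library and not part of this header:

      #include <hpc/matvec/gematrix.hpp>
      #include <hpc/cuda/copy.hpp>
      #include <hpc/cuda/aggregate.hpp>

      // functor with a __device__ call operator so it can be invoked
      // inside the kernels
      struct Max {
	 __device__ double operator()(double x, double y) const {
	    return x < y? y : x;
	 }
      };

      // on the host: fill a matrix, transfer it to the device,
      // then reduce it to its maximal element
      hpc::matvec::GeMatrix<double> A(1000, 1000);
      // ... fill A ...
      hpc::cuda::DeviceGeMatrix<double> devA(A.numRows(), A.numCols());
      hpc::cuda::copy(A, devA.view());
      double maxval = hpc::cuda::aggregate(devA, Max{});
*/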
} } // namespaces cuda and hpc
#else
# error This CUDA source must be compiled using nvcc
#endif
#endif