#ifndef HPC_CUDA_AGGREGATE_HPP
#define HPC_CUDA_AGGREGATE_HPP 1

#ifdef __CUDACC__

/* assumption: the headers below provide std::size_t, CHECK_CUDA,
   DeviceGeMatrix, and the DeviceGe/DeviceView/Require traits used in this file */
#include <cstddef>
#include <hpc/cuda/check.hpp>
#include <hpc/cuda/gematrix.hpp>

namespace hpc { namespace cuda {

namespace aggregate_impl {

constexpr std::size_t BLOCK_DIM = 32;

/* each thread reduces up to BLOCK_DIM consecutive elements of one row
   (along the column index), so the number of columns shrinks by a factor
   of BLOCK_DIM */
template<
   template<typename> class MatrixA,
   template<typename> class MatrixB,
   typename T,
   typename Aggregator,
   Require<
      DeviceGe<MatrixA<T>>, DeviceView<MatrixA<T>>,
      DeviceGe<MatrixB<T>>, DeviceView<MatrixB<T>>
   > = true
>
__global__ void aggregate_row_wise(const MatrixA<T> A, MatrixB<T> B,
      Aggregator aggregator) {
   std::size_t i = threadIdx.x + blockIdx.x * BLOCK_DIM;
   std::size_t j = threadIdx.y + blockIdx.y * BLOCK_DIM;
   if (i < A.numRows()) {
      T val = A(i, j);
      std::size_t maxoffset = BLOCK_DIM;
      if (j + maxoffset >= A.numCols()) {
         maxoffset = A.numCols() - j;
      }
      for (std::size_t offset = 1; offset < maxoffset; ++offset) {
         val = aggregator(val, A(i, j + offset));
      }
      B(i, blockIdx.y) = val;
   }
}

/* each thread reduces up to BLOCK_DIM consecutive elements of one column
   (along the row index), so the number of rows shrinks by a factor
   of BLOCK_DIM */
template<
   template<typename> class MatrixA,
   template<typename> class MatrixB,
   typename T,
   typename Aggregator,
   Require<
      DeviceGe<MatrixA<T>>, DeviceView<MatrixA<T>>,
      DeviceGe<MatrixB<T>>, DeviceView<MatrixB<T>>
   > = true
>
__global__ void aggregate_col_wise(const MatrixA<T> A, MatrixB<T> B,
      Aggregator aggregator) {
   std::size_t i = threadIdx.x + blockIdx.x * BLOCK_DIM;
   std::size_t j = threadIdx.y + blockIdx.y * BLOCK_DIM;
   if (j < A.numCols()) {
      T val = A(i, j);
      std::size_t maxoffset = BLOCK_DIM;
      if (i + maxoffset >= A.numRows()) {
         maxoffset = A.numRows() - i;
      }
      for (std::size_t offset = 1; offset < maxoffset; ++offset) {
         val = aggregator(val, A(i + offset, j));
      }
      B(blockIdx.x, j) = val;
   }
}

inline constexpr std::size_t ceildiv(std::size_t x, std::size_t y) {
   /* we expect x > 0 && y > 0; this is not safe against overflows,
      but we expect y to be small */
   return (x + y - 1) / y;
}

} // namespace aggregate_impl

template<
   template<typename> class Matrix,
   typename T,
   typename Aggregator,
   Require< DeviceGe<Matrix<T>> > = true
>
T aggregate(const Matrix<T>& A, Aggregator&& aggregator) {
   using namespace aggregate_impl;
   if (A.numRows() > 1 || A.numCols() > 1) {
      const auto source = A.view();
      dim3 grid_dim(ceildiv(A.numRows(), BLOCK_DIM),
         ceildiv(A.numCols(), BLOCK_DIM));
      if (A.numRows() > A.numCols()) {
         /* more rows than columns: shrink the row dimension first */
         DeviceGeMatrix<T> B(ceildiv(A.numRows(), BLOCK_DIM), A.numCols());
         auto target = B.view();
         dim3 block_dim(1, BLOCK_DIM);
         aggregate_col_wise<<<grid_dim, block_dim>>>(source, target, aggregator);
         return aggregate(target, std::move(aggregator));
      } else {
         /* more columns than rows (or square): shrink the column dimension first */
         DeviceGeMatrix<T> B(A.numRows(), ceildiv(A.numCols(), BLOCK_DIM));
         auto target = B.view();
         dim3 block_dim(BLOCK_DIM, 1);
         aggregate_row_wise<<<grid_dim, block_dim>>>(source, target, aggregator);
         return aggregate(target, std::move(aggregator));
      }
   } else {
      /* a 1x1 matrix remains: copy the final result back to the host */
      T result;
      CHECK_CUDA(cudaMemcpy, &result, A.data(), sizeof(T),
         cudaMemcpyDeviceToHost);
      return result;
   }
}

} } // namespaces cuda and hpc

#else
#   error This CUDA source must be compiled using nvcc
#endif

#endif
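
/* Usage sketch (illustrative only): this assumes the DeviceGeMatrix interface
 * above and a matrix A that has already been filled on the device (e.g. via a
 * host-to-device copy or an initialization kernel); the Max functor is
 * hypothetical and not part of this header. The aggregator is passed by value
 * to the kernels and invoked in device code, so its call operator must be
 * __device__-callable.
 *
 *    struct Max {
 *       __host__ __device__
 *       double operator()(double x, double y) const {
 *          return x < y ? y : x;
 *       }
 *    };
 *
 *    hpc::cuda::DeviceGeMatrix<double> A(1000, 1000);
 *    // ... initialize A on the device ...
 *    double largest = hpc::cuda::aggregate(A, Max{});
 */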