#ifndef TRIVIAL_AGGREGATE_HPP
#define TRIVIAL_AGGREGATE_HPP

#ifdef __CUDACC__

#include <cstddef>
/* the exact include paths below are assumptions: headers of the hpc library
   that provide CHECK_CUDA, DeviceGeMatrix, and the Require/DeviceGe/DeviceView
   traits used in this file */
#include <hpc/cuda/check.hpp>
#include <hpc/cuda/gematrix.hpp>
#include <hpc/cuda/properties.hpp>

namespace hpc { namespace cuda {

namespace trivial_aggregate_impl {

constexpr std::size_t BLOCK_DIM = 32;

/* reduce one BLOCK_DIM x BLOCK_DIM tile of A per block and store the
   per-block result in B(blockIdx.x, blockIdx.y) */
template<
   template<typename> class MatrixA,
   template<typename> class MatrixB,
   typename T,
   typename Aggregator,
   Require<
      DeviceGe<MatrixA<T>>, DeviceView<MatrixA<T>>,
      DeviceGe<MatrixB<T>>, DeviceView<MatrixB<T>>
   > = true
>
__global__ void aggregate2d(const MatrixA<T> A, MatrixB<T> B,
      Aggregator aggregator) {
   std::size_t i = threadIdx.x + blockIdx.x * blockDim.x;
   std::size_t j = threadIdx.y + blockIdx.y * blockDim.y;

   __shared__ T local_block[BLOCK_DIM][BLOCK_DIM];

   std::size_t me_i = threadIdx.x;
   std::size_t me_j = threadIdx.y;
   if (i < A.numRows() && j < A.numCols()) {
      local_block[me_i][me_j] = A(i, j);
   }

   /* reduce along the row dimension first ... */
   std::size_t active_i = blockDim.x / 2;
   std::size_t active_j = blockDim.y / 2;

   while (active_i > 0) {
      __syncthreads();
      if (me_i < active_i && i + active_i < A.numRows()) {
         local_block[me_i][me_j] = aggregator(local_block[me_i][me_j],
            local_block[me_i + active_i][me_j]);
      }
      active_i /= 2;
   }
   /* ... then along the column dimension; the block result ends up
      in local_block[0][0] */
   while (active_j > 0) {
      __syncthreads();
      if (me_j < active_j && j + active_j < A.numCols()) {
         local_block[me_i][me_j] = aggregator(local_block[me_i][me_j],
            local_block[me_i][me_j + active_j]);
      }
      active_j /= 2;
   }
   if (me_i == 0 && me_j == 0) {
      B(blockIdx.x, blockIdx.y) = local_block[0][0];
   }
}

inline constexpr std::size_t ceildiv(std::size_t x, std::size_t y) {
   /* note that we expect x > 0 && y > 0;
      not safe against overflows but we expect y to be small */
   return (x + y - 1) / y;
}

} // namespace trivial_aggregate_impl

/* aggregate all elements of the device matrix A with the given aggregator:
   each kernel launch reduces A to a matrix of per-block results, which is
   aggregated recursively until a single value remains and is copied back
   to the host */
template<
   template<typename> class Matrix,
   typename T,
   typename Aggregator,
   Require< DeviceGe<Matrix<T>> > = true
>
T aggregate(const Matrix<T>& A, Aggregator&& aggregator) {
   using namespace trivial_aggregate_impl;
   if (A.numRows() > 1 || A.numCols() > 1) {
      DeviceGeMatrix<T> B(ceildiv(A.numRows(), BLOCK_DIM),
         ceildiv(A.numCols(), BLOCK_DIM));
      auto target = B.view();
      const auto source = A.view();
      dim3 grid_dim(ceildiv(A.numRows(), BLOCK_DIM),
         ceildiv(A.numCols(), BLOCK_DIM));
      dim3 block_dim(BLOCK_DIM, BLOCK_DIM);
      aggregate2d<<<grid_dim, block_dim>>>(source, target, aggregator);
      return aggregate(target, std::move(aggregator));
   } else {
      T result;
      CHECK_CUDA(cudaMemcpy, &result, A.data(), sizeof(T),
         cudaMemcpyDeviceToHost);
      return result;
   }
}

} } // namespaces cuda, hpc

#else
#   error This CUDA source must be compiled using nvcc
#endif

#endif
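
/* Usage sketch (illustration only, not part of the header): one way the
 * aggregate() function above might be called.  The functor MaxAggregator
 * and the way the matrix A is obtained are assumptions for this example;
 * only aggregate() and DeviceGeMatrix are taken from this file.  Because
 * the aggregator is invoked inside the kernel, its call operator needs a
 * __device__ qualifier.
 *
 *    template<typename T>
 *    struct MaxAggregator {
 *       __host__ __device__ T operator()(T x, T y) const {
 *          return x > y? x: y;
 *       }
 *    };
 *
 *    // A is assumed to be an hpc::cuda::DeviceGeMatrix<double> that has
 *    // already been filled on the device:
 *    double max_value = hpc::cuda::aggregate(A, MaxAggregator<double>{});
 */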