#include <cmath>
#include <cstddef>
#include <printf.hpp>
#include <hpc/cuda/check.hpp>
/* problem size and launch configuration */
constexpr std::size_t N = 8192;
/* fixed original syntax error: '=' and ';' were missing */
constexpr std::size_t THREADS_PER_BLOCK = 1024;
/* ceil-div so a partial tail block covers the remainder of N */
constexpr std::size_t NUM_BLOCKS =
    (N + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
/* Host reference implementation of a BLAS-style dot product:
   returns sum_{i=0}^{n-1} x[i*incX] * y[i*incY], accumulated in TX.
   Following the BLAS convention, a negative increment walks the vector
   backwards starting from its far end, so x[0 .. (n-1)*|incX|] is the
   addressed range in every case. Returns 0 for n == 0. */
template<typename TX, typename TY>
TX dot(std::size_t n, const TX* x, std::ptrdiff_t incX,
       const TY* y, std::ptrdiff_t incY) {
    TX res = 0;
    /* signed offsets: the original multiplied a size_t index by a
       ptrdiff_t increment, which wraps for negative increments */
    std::ptrdiff_t ix = incX >= 0 ? 0 : (1 - std::ptrdiff_t(n)) * incX;
    std::ptrdiff_t iy = incY >= 0 ? 0 : (1 - std::ptrdiff_t(n)) * incY;
    for (std::size_t i = 0; i < n; ++i, ix += incX, iy += incY) {
        res += x[ix] * y[iy];
    }
    return res;
}
/* Reduction kernel: sums[blockIdx.x] receives the sum of this block's
   slice of x (plain sum, despite the BLAS-like name: no absolute value
   is taken). Preconditions: blockDim.x is a power of two (required by
   the halving tree reduction) and blockDim.x <= THREADS_PER_BLOCK
   (size of the shared-memory scratch array). */
template<typename T>
__global__ void asum(std::size_t n, const T* x, T* sums) {
    /* per-thread partials in shared memory; use T rather than the
       original hard-coded double so non-double instantiations keep
       their own type's precision */
    __shared__ T sums_per_block[THREADS_PER_BLOCK];
    std::size_t index = threadIdx.x + blockIdx.x * blockDim.x;
    std::size_t me = threadIdx.x;
    /* pad out-of-range threads with 0 so the tree stays uniform */
    sums_per_block[me] = index < n ? x[index] : T(0);
    /* binary tree reduction: halve the number of active threads
       each round until slot 0 holds the block's total */
    for (std::size_t active = blockDim.x / 2; active > 0; active /= 2) {
        __syncthreads();  /* barrier kept outside the divergent branch */
        if (me < active) {
            sums_per_block[me] += sums_per_block[me + active];
        }
    }
    if (me == 0) {
        sums[blockIdx.x] = sums_per_block[0];
    }
}
/* Dot-product kernel: sums[blockIdx.x] receives the partial dot product
   sum over this block's indices of x[i*incX] * y[i*incY], accumulated
   in T. Preconditions: blockDim.x is a power of two and
   blockDim.x <= THREADS_PER_BLOCK (shared scratch size); incX and incY
   must be non-negative (no BLAS backward-walk convention here). */
template<typename TX, typename TY, typename T>
__global__ void dot(std::size_t n,
        const TX* x, std::ptrdiff_t incX, const TY* y, std::ptrdiff_t incY,
        T* sums) {
    std::size_t index = threadIdx.x + blockIdx.x * blockDim.x;
    /* one product per thread; signed cast avoids unsigned wraparound
       in the stride multiply; threads past n contribute 0 */
    T res = index < n
        ? T(x[std::ptrdiff_t(index) * incX] * y[std::ptrdiff_t(index) * incY])
        : T(0);
    /* use T rather than the original hard-coded double so the
       accumulator type matches the instantiation */
    __shared__ T sums_per_block[THREADS_PER_BLOCK];
    std::size_t me = threadIdx.x;
    sums_per_block[me] = res;
    /* binary tree reduction within the block */
    for (std::size_t active = blockDim.x / 2; active > 0; active /= 2) {
        __syncthreads();  /* all threads reach this; branch below diverges */
        if (me < active) {
            sums_per_block[me] += sums_per_block[me + active];
        }
    }
    if (me == 0) {
        sums[blockIdx.x] = sums_per_block[0];
    }
}
/* Computes dot(a, b) for a[i] = i, b[i] = i^2 on the GPU via a
   per-block partial-product kernel followed by repeated sum-reduction
   passes, then prints the difference to a host reference result. */
int main() {
    /* initialize host vectors: a[i] = i, b[i] = i^2 */
    double a[N]; double b[N];
    for (std::size_t i = 0; i < N; ++i) {
        a[i] = i; b[i] = i * i;
    }
    /* transfer vectors to GPU memory */
    double* device_a;
    CHECK_CUDA(cudaMalloc, (void**)&device_a, N * sizeof(double));
    CHECK_CUDA(cudaMemcpy, device_a, a, N * sizeof(double),
        cudaMemcpyHostToDevice);
    double* device_b;
    CHECK_CUDA(cudaMalloc, (void**)&device_b, N * sizeof(double));
    CHECK_CUDA(cudaMemcpy, device_b, b, N * sizeof(double),
        cudaMemcpyHostToDevice);
    double* device_sums;
    CHECK_CUDA(cudaMalloc, (void**)&device_sums, NUM_BLOCKS * sizeof(double));
    /* first pass: per-block partial dot products on the GPU */
    dot<<<NUM_BLOCKS, THREADS_PER_BLOCK>>>(N,
        device_a, 1, device_b, 1, device_sums);
    /* kernel launches return no status; catch bad launch configs here
       (NOTE(review): assumes CHECK_CUDA accepts a zero-argument call) */
    CHECK_CUDA(cudaGetLastError);
    /* reduce the per-block sums until a single value remains */
    std::size_t len = NUM_BLOCKS;
    while (len > 1) {
        std::size_t num_blocks =
            (len + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
        double* device_sums2;
        CHECK_CUDA(cudaMalloc, (void**)&device_sums2,
            num_blocks * sizeof(double));
        asum<<<num_blocks, THREADS_PER_BLOCK>>>(len, device_sums, device_sums2);
        CHECK_CUDA(cudaGetLastError);
        CHECK_CUDA(cudaFree, device_sums); device_sums = device_sums2;
        len = num_blocks;
    }
    /* transfer the single remaining sum from GPU to host memory;
       cudaMemcpy synchronizes, so kernel completion is guaranteed */
    double sum;
    CHECK_CUDA(cudaMemcpy, &sum, device_sums, sizeof(double),
        cudaMemcpyDeviceToHost);
    /* free space allocated at GPU memory */
    CHECK_CUDA(cudaFree, device_a); CHECK_CUDA(cudaFree, device_b);
    CHECK_CUDA(cudaFree, device_sums);
    /* print difference to the host reference result */
    double local_sum = dot(N, a, 1, b, 1);
    fmt::printf("diff: %12.4lg\n", std::abs(sum - local_sum));
}