Sample solution
Content |
Regarding the question: the access operator is tagged as __device__ and may therefore no longer be invoked on the host (CPU) side. Hence, we need to use x.data() instead of &x(0) to obtain the device pointer.
#include <cmath>
#include <memory>
#include <printf.hpp>
#include <hpc/cuda/check.hpp>
#include <hpc/cuda/copy.hpp>
#include <hpc/cuda/densevector.hpp>
#include <hpc/matvec/densevector.hpp>

constexpr std::size_t N = 8192;
/* fixed: original read "THREADS_PER_BLOCK 1024" — missing "=" and ";" */
constexpr std::size_t THREADS_PER_BLOCK = 1024;
constexpr std::size_t NUM_BLOCKS = (N + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;

/* both reduction kernels halve the number of active threads per step;
   this is only correct when the block size is a power of two */
static_assert((THREADS_PER_BLOCK & (THREADS_PER_BLOCK - 1)) == 0,
   "THREADS_PER_BLOCK must be a power of two");

/* host-side reference implementation of the dot product:
   returns sum over x[i*incX] * y[i*incY] for i in [0, n);
   y is read-only, hence const; incY as ptrdiff_t for consistency
   with incX and with the kernel below */
template<typename TX, typename TY>
TX dot(std::size_t n, const TX* x, std::ptrdiff_t incX,
      const TY* y, std::ptrdiff_t incY) {
   TX res = 0;
   for (std::size_t index = 0; index < n; ++index) {
      res += x[index * incX] * y[index * incY];
   }
   return res;
}

/* per-block partial sum of x[0..n): each block writes one partial
   sum to sums[blockIdx.x]; expects a 1D launch with
   blockDim.x == THREADS_PER_BLOCK (shared array is sized for that);
   shared buffer typed T (original hard-coded double, which would be
   silently lossy for other element types) */
template<typename T>
__global__ void asum(std::size_t n, const T* x, T* sums) {
   __shared__ T sums_per_block[THREADS_PER_BLOCK];
   std::size_t index = threadIdx.x + blockIdx.x * blockDim.x;
   std::size_t me = threadIdx.x;
   /* out-of-range threads contribute a neutral 0 so the
      tree reduction below needs no bounds checks */
   if (index < n) {
      sums_per_block[me] = x[index];
   } else {
      sums_per_block[me] = 0;
   }
   /* aggregate sum within a block: pairwise tree reduction;
      __syncthreads() is outside the divergent branch so every
      thread of the block reaches it */
   std::size_t active = blockDim.x / 2;
   while (active > 0) {
      __syncthreads();
      if (me < active) {
         sums_per_block[me] += sums_per_block[me + active];
      }
      active /= 2;
   }
   if (me == 0) {
      sums[blockIdx.x] = sums_per_block[0];
   }
}

/* elementwise products x[i*incX] * y[i*incY] reduced per block;
   sums[blockIdx.x] receives the partial dot product of this block;
   expects blockDim.x == THREADS_PER_BLOCK (size of the shared array) */
template<typename TX, typename TY, typename T>
__global__ void dot(std::size_t n, const TX* x, std::ptrdiff_t incX,
      const TY* y, std::ptrdiff_t incY, T* sums) {
   std::size_t index = threadIdx.x + blockIdx.x * blockDim.x;
   T res;
   if (index < n) {
      res = x[index * incX] * y[index * incY];
   } else {
      res = 0; /* neutral element for the reduction */
   }
   __shared__ T sums_per_block[THREADS_PER_BLOCK];
   std::size_t me = threadIdx.x;
   sums_per_block[me] = res;
   /* aggregate sum within a block (see asum) */
   std::size_t active = blockDim.x / 2;
   while (active > 0) {
      __syncthreads();
      if (me < active) {
         sums_per_block[me] += sums_per_block[me + active];
      }
      active /= 2;
   }
   if (me == 0) {
      sums[blockIdx.x] = sums_per_block[0];
   }
}

int main() {
   using T = double;
   hpc::matvec::DenseVector<T> a(N);
   hpc::matvec::DenseVector<T> b(a.length());
   for (std::size_t i = 0; i < a.length(); ++i) {
      a(i) = i;
      b(i) = i * i;
   }
   /* transfer vectors to GPU memory */
   hpc::cuda::DeviceDenseVector<T> dev_a(a.length());
   hpc::cuda::DeviceDenseVector<T> dev_b(b.length());
   copy(a, dev_a);
   copy(b, dev_b);
   /* unique_ptr instead of raw new/delete: no leak if copy() or a
      launch throws, and the ping-pong reassignment below stays simple */
   auto dev_sums =
      std::make_unique<hpc::cuda::DeviceDenseVector<T>>(NUM_BLOCKS);
   /* execute kernel function on GPU: one partial sum per block */
   dot<<<NUM_BLOCKS, THREADS_PER_BLOCK>>>(dev_a.length(),
      dev_a.data(), dev_a.inc(), dev_b.data(), dev_b.inc(),
      dev_sums->data());
   /* repeatedly reduce the partial sums until one value remains */
   std::size_t len = NUM_BLOCKS;
   while (len > 1) {
      std::size_t num_blocks =
         (len + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
      auto dev_sums2 =
         std::make_unique<hpc::cuda::DeviceDenseVector<T>>(num_blocks);
      asum<<<num_blocks, THREADS_PER_BLOCK>>>(len,
         dev_sums->data(), dev_sums2->data());
      dev_sums = std::move(dev_sums2); /* old buffer freed here */
      len = num_blocks;
   }
   /* transfer result vector from GPU to host memory */
   hpc::matvec::DenseVector<T> result(1);
   copy(*dev_sums, result);
   /* print difference to local result */
   auto local_sum = dot(a.length(), a.data(), a.inc(),
      b.data(), b.inc());
   fmt::printf("diff: %12.4lg\n", std::abs(result(0) - local_sum));
}