Sample solution

#include <cstddef>
#include <printf.hpp>
#include <hpc/cuda/check.hpp>

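/* scal kernel: each thread computes its global index and, if it lies
   within the vector length n, scales the corresponding element of x
   (accessed with stride incX) by alpha */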
template<typename Alpha, typename TX>
__global__ void scal(std::size_t n, Alpha alpha, TX* x, std::ptrdiff_t incX) {
   std::size_t index = threadIdx.x + blockIdx.x * blockDim.x;
   if (index < n) {
      x[index*incX] *= alpha;
   }
}

constexpr std::size_t N = 8192;

int main() {
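   /* initialize the host vector with a[i] = i */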
   double a[N];
   for (std::size_t i = 0; i < N; ++i) {
      a[i] = i;
   }

   /* transfer the vector to GPU memory */
   double* device_a;
   CHECK_CUDA(cudaMalloc, (void**)&device_a, N * sizeof(double));
   CHECK_CUDA(cudaMemcpy, device_a, a, N * sizeof(double),
      cudaMemcpyHostToDevice);

   /* execute kernel function on GPU */
   std::size_t threads_per_block = 128; /* a multiple of the warp size */
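   /* round up so that num_blocks * threads_per_block >= N */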
   std::size_t num_blocks = (N + threads_per_block - 1) / threads_per_block;
   scal<<<num_blocks, threads_per_block>>>(N, 2.0, device_a, 1);

   /* transfer result vector from GPU to host memory */
   CHECK_CUDA(cudaMemcpy, a, device_a, N * sizeof(double),
      cudaMemcpyDeviceToHost);
   /* free the memory allocated on the GPU */
   CHECK_CUDA(cudaFree, device_a);

   /* print result */
   for (std::size_t i = 0; i < N; ++i) {
      fmt::printf(" %12.4lg", a[i]);
      if (i % 10 == 9) fmt::printf("\n");
   }
   fmt::printf("\n");
}
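
The CHECK_CUDA macro from <hpc/cuda/check.hpp> wraps each CUDA runtime call and aborts with a diagnostic if the call does not return cudaSuccess. The header itself belongs to the course library; the following is only a minimal sketch of a wrapper with the same calling convention, not the actual course implementation:

#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>

/* illustrative sketch only: report the failing call and terminate
   if a CUDA runtime function does not return cudaSuccess */
inline void check_cuda_call(cudaError_t err, const char* name,
      const char* file, int line) {
   if (err != cudaSuccess) {
      std::fprintf(stderr, "%s:%d: %s failed: %s\n",
         file, line, name, cudaGetErrorString(err));
      std::exit(EXIT_FAILURE);
   }
}

#define CHECK_CUDA(func, ...) \
   check_cuda_call(func(__VA_ARGS__), #func, __FILE__, __LINE__)

With a wrapper of this kind, the return value of every cudaMalloc, cudaMemcpy, and cudaFree call above is checked at the call site instead of being silently discarded.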