Lösungsvorschlag

#include <cstdio>
#include <hpc/cuda/check.h>

template<typename Index, typename Alpha, typename TX>
__global__ void scal(Index n, Alpha alpha, TX* x, Index incX) {
   Index index = threadIdx.x + blockIdx.x * blockDim.x;
   if (index < n) {
      x[index*incX] *= alpha;
   }
}

#define N 8192

int main() {
   double a[N];
   for (unsigned int i = 0; i < N; ++i) {
      a[i] = i;
   }

   /* transfer vectors to GPU memory */
   double* cuda_a;
   CHECK_CUDA(cudaMalloc, (void**)&cuda_a, N * sizeof(double));
   CHECK_CUDA(cudaMemcpy, cuda_a, a, N * sizeof(double),
      cudaMemcpyHostToDevice);

   /* execute kernel function on GPU */
   unsigned int threads_per_block = 128; /* a multiple of the warp size */
   unsigned int num_blocks = (N + threads_per_block - 1) / threads_per_block;
   scal<<<num_blocks, threads_per_block>>>(N, 2.0, cuda_a, 1);

   /* transfer result vector from GPU to host memory */
   CHECK_CUDA(cudaMemcpy, a, cuda_a, N * sizeof(double),
      cudaMemcpyDeviceToHost);
   /* free space allocated at GPU memory */
   CHECK_CUDA(cudaFree, cuda_a);

   /* print result */
   for (unsigned int i = 0; i < N; ++i) {
      std::printf(" %12.4lg", a[i]);
      if (i % 10 == 9) std::printf("\n");
   }
   std::printf("\n");
}