#include <cstdio>
#include <hpc/cuda/check.h>
template<typename Index, typename Alpha, typename TX>
__global__ void scal(Index n, Alpha alpha, TX* x, Index incX) {
Index index = threadIdx.x + blockIdx.x * blockDim.x;
if (index < n) {
x[index*incX] *= alpha;
}
}
#define N 8192
int main() {
double a[N];
for (unsigned int i = 0; i < N; ++i) {
a[i] = i;
}
/* transfer vectors to GPU memory */
double* cuda_a;
CHECK_CUDA(cudaMalloc, (void**)&cuda_a, N * sizeof(double));
CHECK_CUDA(cudaMemcpy, cuda_a, a, N * sizeof(double),
cudaMemcpyHostToDevice);
/* execute kernel function on GPU */
unsigned int threads_per_block = 128; /* a multiple of the warp size */
unsigned int num_blocks = (N + threads_per_block - 1) / threads_per_block;
scal<<<num_blocks, threads_per_block>>>(N, 2.0, cuda_a, 1);
/* transfer result vector from GPU to host memory */
CHECK_CUDA(cudaMemcpy, a, cuda_a, N * sizeof(double),
cudaMemcpyDeviceToHost);
/* free space allocated at GPU memory */
CHECK_CUDA(cudaFree, cuda_a);
/* print result */
for (unsigned int i = 0; i < N; ++i) {
std::printf(" %12.4lg", a[i]);
if (i % 10 == 9) std::printf("\n");
}
std::printf("\n");
}