#include <cstddef>
#include <printf.hpp>
#include <hpc/cuda/check.hpp>
template<typename Alpha, typename TX>
__global__ void scal(std::size_t n, Alpha alpha, TX* x, std::ptrdiff_t incX) {
std::size_t index = threadIdx.x + blockIdx.x * blockDim.x;
if (index < n) {
x[index*incX] *= alpha;
}
}
constexpr std::size_t N = 8192;
int main() {
double a[N];
for (std::size_t i = 0; i < N; ++i) {
a[i] = i;
}
/* transfer vectors to GPU memory */
double* device_a;
CHECK_CUDA(cudaMalloc, (void**)&device_a, N * sizeof(double));
CHECK_CUDA(cudaMemcpy, device_a, a, N * sizeof(double),
cudaMemcpyHostToDevice);
/* execute kernel function on GPU */
std::size_t threads_per_block = 128; /* a multiple of the warp size */
std::size_t num_blocks = (N + threads_per_block - 1) / threads_per_block;
scal<<<num_blocks, threads_per_block>>>(N, 2.0, device_a, 1);
/* transfer result vector from GPU to host memory */
CHECK_CUDA(cudaMemcpy, a, device_a, N * sizeof(double),
cudaMemcpyDeviceToHost);
/* free space allocated at GPU memory */
CHECK_CUDA(cudaFree, device_a);
/* print result */
for (std::size_t i = 0; i < N; ++i) {
fmt::printf(" %12.4lg", a[i]);
if (i % 10 == 9) fmt::printf("\n");
}
fmt::printf("\n");
}