Sample solution

Content

#include <cstddef>
#include <printf.hpp>
#include <hpc/cuda/check.hpp>
#include <hpc/cuda/copy.hpp>
#include <hpc/cuda/gematrix.hpp>
#include <hpc/matvec/gematrix.hpp>
#include <hpc/ulmblas/scal.hpp>

template<typename Alpha, typename TX>
__global__ void gescal(std::size_t m, std::size_t n, Alpha alpha,
      TX* X, std::ptrdiff_t incRowX, std::ptrdiff_t incColX) {
   std::size_t i = threadIdx.x + blockIdx.x * blockDim.x;
   std::size_t j = threadIdx.y + blockIdx.y * blockDim.y;
   if (i < m && j < n) {
      X[i*incRowX + j*incColX] *= alpha;
   }
}

int main() {
   hpc::matvec::GeMatrix<double> A(48, 4);
   for (std::size_t i = 0; i < A.numRows(); ++i) {
      for (std::size_t j = 0; j < A.numCols(); ++j) {
	 A(i, j) = 1000 * i + j;
      }
   }
   hpc::cuda::DeviceGeMatrix<double> B(A.numRows(), A.numCols());
   hpc::cuda::copy(A, B);

   std::size_t dim_per_block = 16;
   std::size_t num_blocks_per_row = (A.numRows() + dim_per_block - 1) /
      dim_per_block;
   std::size_t num_blocks_per_col = (A.numCols() + dim_per_block - 1) /
      dim_per_block;
   dim3 block(dim_per_block, dim_per_block);
   dim3 grid(num_blocks_per_row, num_blocks_per_col);

   gescal<<<grid, block>>>(B.numRows(), B.numCols(),
      2, B.data(), B.incRow(), B.incCol());

   hpc::cuda::copy(B, A);
   for (std::size_t i = 0; i < A.numRows(); ++i) {
      for (std::size_t j = 0; j < A.numCols(); ++j) {
	 fmt::printf(" %12lg", A(i, j));
      }
      fmt::printf("\n");
   }
}