Lösungsvorschlag

#include <cstdio>
#include <hpc/cuda/check.h>
#include <hpc/ulmblas/gescal.h>
#include <hpc/matvec/gematrix.h>
#include <hpc/cuda/gematrix.h>
#include <hpc/cuda/copy.h>

template<typename Index, typename Alpha, typename TX>
__global__ void gescal(Index m, Index n, Alpha alpha,
      TX* X, Index incRowX, Index incColX) {
   Index i = threadIdx.x + blockIdx.x * blockDim.x;
   Index j = threadIdx.y + blockIdx.y * blockDim.y;
   if (i < m && j < n) {
      X[i*incRowX + j*incColX] *= alpha;
   }
}

int main() {
   hpc::matvec::GeMatrix<double> A(48, 4);
   for (int i = 0; i < A.numRows; ++i) {
      for (int j = 0; j < A.numCols; ++j) {
         A(i, j) = 1000 * i + j;
      }
   }
   hpc::cuda::GeMatrix<double> B(A.numRows, A.numCols);
   hpc::cuda::copy(A, B);

   unsigned int dim_per_block = 16;
   unsigned int num_blocks_per_row = (A.numRows + dim_per_block - 1) /
      dim_per_block;
   unsigned int num_blocks_per_col = (A.numCols + dim_per_block - 1) /
      dim_per_block;
   dim3 block(dim_per_block, dim_per_block);
   dim3 grid(num_blocks_per_row, num_blocks_per_col);

   gescal<<<grid, block>>>(B.numRows, B.numCols,
      2, B.data, B.incRow, B.incCol);

   hpc::cuda::copy(B, A);
   for (int i = 0; i < A.numRows; ++i) {
      for (int j = 0; j < A.numCols; ++j) {
         std::printf(" %12lg", A(i, j));
      }
      std::printf("\n");
   }
}