# Lösungsvorschlag

#include <cstdio>
#include <hpc/cuda/check.h>
#include <hpc/ulmblas/gescal.h>
#include <hpc/matvec/gematrix.h>
#include <hpc/cuda/gematrix.h>
#include <hpc/cuda/copy.h>

template<typename Index, typename Alpha, typename TX>
__global__ void gescal(Index m, Index n, Alpha alpha,
TX* X, Index incRowX, Index incColX) {
Index i = threadIdx.x + blockIdx.x * blockDim.x;
Index j = threadIdx.y + blockIdx.y * blockDim.y;
if (i < m && j < n) {
X[i*incRowX + j*incColX] *= alpha;
}
}

int main() {
hpc::matvec::GeMatrix<double> A(48, 4);
for (int i = 0; i < A.numRows; ++i) {
for (int j = 0; j < A.numCols; ++j) {
A(i, j) = 1000 * i + j;
}
}
hpc::cuda::GeMatrix<double> B(A.numRows, A.numCols);
hpc::cuda::copy(A, B);

unsigned int dim_per_block = 16;
unsigned int num_blocks_per_row = (A.numRows + dim_per_block - 1) /
dim_per_block;
unsigned int num_blocks_per_col = (A.numCols + dim_per_block - 1) /
dim_per_block;
dim3 block(dim_per_block, dim_per_block);
dim3 grid(num_blocks_per_row, num_blocks_per_col);

gescal<<<grid, block>>>(B.numRows, B.numCols,
2, B.data, B.incRow, B.incCol);

hpc::cuda::copy(B, A);
for (int i = 0; i < A.numRows; ++i) {
for (int j = 0; j < A.numCols; ++j) {
std::printf(" %12lg", A(i, j));
}
std::printf("\n");
}
}