#include <cstddef>
#include <printf.hpp>
#include <hpc/cuda/check.hpp>
#include <hpc/cuda/copy.hpp>
#include <hpc/cuda/gematrix.hpp>
#include <hpc/matvec/gematrix.hpp>
#include <hpc/ulmblas/scal.hpp>
template<typename Alpha, typename TX>
__global__ void gescal(std::size_t m, std::size_t n, Alpha alpha,
TX* X, std::ptrdiff_t incRowX, std::ptrdiff_t incColX) {
std::size_t i = threadIdx.x + blockIdx.x * blockDim.x;
std::size_t j = threadIdx.y + blockIdx.y * blockDim.y;
if (i < m && j < n) {
X[i*incRowX + j*incColX] *= alpha;
}
}
int main() {
hpc::matvec::GeMatrix<double> A(48, 4);
for (std::size_t i = 0; i < A.numRows(); ++i) {
for (std::size_t j = 0; j < A.numCols(); ++j) {
A(i, j) = 1000 * i + j;
}
}
hpc::cuda::DeviceGeMatrix<double> B(A.numRows(), A.numCols());
hpc::cuda::copy(A, B);
std::size_t dim_per_block = 16;
std::size_t num_blocks_per_row = (A.numRows() + dim_per_block - 1) /
dim_per_block;
std::size_t num_blocks_per_col = (A.numCols() + dim_per_block - 1) /
dim_per_block;
dim3 block(dim_per_block, dim_per_block);
dim3 grid(num_blocks_per_row, num_blocks_per_col);
gescal<<<grid, block>>>(B.numRows(), B.numCols(),
2, B.data(), B.incRow(), B.incCol());
hpc::cuda::copy(B, A);
for (std::size_t i = 0; i < A.numRows(); ++i) {
for (std::size_t j = 0; j < A.numCols(); ++j) {
fmt::printf(" %12lg", A(i, j));
}
fmt::printf("\n");
}
}