# Matrix-Produkt mit Cuda

Folgende Vorlage kann zum Testen der Matrix Implementierung benutzt werden:

#include <cuda_runtime.h>
#include <cublas_v2.h>
#include <cstdio>
#include <hpc/cuda/check.h>
#include <hpc/ulmblas/gescal.h>
#include <hpc/matvec/gematrix.h>
#include <hpc/cuda/gematrix.h>
#include <hpc/cuda/mm.h>
#include <hpc/cuda/copy.h>
#include <hpc/ulmblas/gecopy.h>

void
printMatrix(const hpc::matvec::GeMatrix<double> &A, const char *name=0)
{
if (name) {
printf("%s =\n", name);
}
for (int i=0; i<A.numRows; ++i) {
for (int j=0; j<A.numCols; ++j) {
printf("%6.2lf ", A(i,j));
}
printf("\n");
}
printf("\n");
}

void
initMatrix(hpc::matvec::GeMatrix<double> &A)
{
for (int i=0; i<A.numRows; ++i) {
for (int j=0; j<A.numCols; ++j) {
A(i,j) = i + j*A.numRows;
}
}
}

template <typename Index, typename Alpha, typename TX>
void
gescal(Index m, Index n,
Alpha alpha,
TX *X, Index incRowX, Index incColX)
{
if (alpha!=1.0) {
for (long i=0; i<m; ++i) {
for (long j=0; j<n; ++j) {
X[i*incRowX+j*incColX] *= alpha;
}
}
}
}

template <typename Index, typename Alpha, typename TA, typename TB,
typename Beta, typename TC>
void
refGemm(Index m, Index n, Index k,
Alpha alpha,
const TA *A, Index incRowA, Index incColA,
const TB *B, Index incRowB, Index incColB,
Beta beta,
TC *C, Index incRowC, Index incColC)
{
gescal(m, n, beta, C, incRowC, incColC);
for (Index j=0; j<n; ++j) {
for (Index l=0; l<k; ++l) {
for (Index i=0; i<m; ++i) {
C[i*incRowC+j*incColC] += alpha*A[i*incRowA+l*incColA]
*B[l*incRowB+j*incColB];
}
}
}
}

int
main() {
//
//  Matrizen auf Host anlegen und initialisieren
//
std::size_t m = 16;
std::size_t n = 16;
std::size_t k = 16;

hpc::matvec::GeMatrix<double> A(m, k);
hpc::matvec::GeMatrix<double> B(k, n);
hpc::matvec::GeMatrix<double> C(m, n);
hpc::matvec::GeMatrix<double> C_check(m, n);
double alpha = 1;
double beta  = 0;

initMatrix(A);
initMatrix(B);
initMatrix(C);
//hpc::matvec::copy(C, C_check);
hpc::ulmblas::gecopy(C.numRows, C.numCols,
C.data, C.incRow, C.incCol,
C_check.data, C_check.incRow, C_check.incCol);

printMatrix(A, "A");
printMatrix(B, "B");
printMatrix(C, "C");

//
//  Matrizen auf Device anlegen
//
hpc::cuda::GeMatrix<double> dA(A.numRows, A.numCols);
hpc::cuda::GeMatrix<double> dB(B.numRows, B.numCols);
hpc::cuda::GeMatrix<double> dC(C.numRows, C.numCols);

//
//  Matrizen von Host auf Device kopieren
//
hpc::cuda::copy(A, dA);
hpc::cuda::copy(B, dB);
hpc::cuda::copy(C, dC);

//
//  Operation auf dem Device ausfuehren
//
hpc::cuda::mm(1.0, dA, dB, 0.0, dC);

//
//  Ergebnis von Device auf den Host kopieren
//
hpc::cuda::copy(dC, C);

printf("C <- %5.3lf*A*B + %5.3lf*C\n", alpha, beta);

printMatrix(C, "C");

//
//  Zum Test mit matvec Implementierung vergleichen
//
//hpc::matvec::mm(1.0, A, B, 0.0, C_check);
refGemm(C.numRows, C.numCols, A.numCols,
alpha,
A.data, A.incRow, A.incCol,
B.data, B.incRow, B.incCol,
beta,
C_check.data, C_check.incRow, C_check.incCol);

printMatrix(C_check, "C_check");
}


Das Vorlesungsprojekt aus Session 24 muss dazu nur um die Implementierung in hpc/cuda/mm.h ergänzt werden:

#ifndef HPC_CUDA_MM_H
#define HPC_CUDA_MM_H 1

#include <cassert>
#include <hpc/cuda/check.h>
#include <hpc/cuda/gematrix.h>

namespace hpc { namespace cuda {

template <typename T>
struct BlockSize
{
static const int M     = 64;
static const int N     = 64;
static const int K     = 16;
static const int DIM   = 16;

static const int THR_M = M / DIM;
static const int THR_N = N / DIM;

//static_assert(M>0 && N>0, "Invalid block size.");
//static_assert(M % DIM == 0, "M must be a multiple of DIM.");
//static_assert(N % DIM == 0, "N must be a multiple of DIM.");
};

template <typename Index, typename T>
__device__ void
gemm_fermi_kernel(Index m, Index n, Index k,
const T alpha,
const T *A, Index incRowA, Index incColA,
const T *B, Index incRowB, Index incColB,
const T beta,
T *C, Index incRowC, Index incColC)
{
/*
Index i0  = blockIdx.y * BlockSize<T>::M;
Index j0  = blockIdx.x * BlockSize<T>::N;

A += i0*incRowA;
B += j0*incColB;
C += i0*incRowC + j0*incColC;

__shared__ T A_shared[BlockSize<T>::M][BlockSize<T>::K];
__shared__ T B_shared[BlockSize<T>::K][BlockSize<T>::N];

T C_register[BlockSize<T>::THR_M][BlockSize<T>::THR_N];
T A_register[BlockSize<T>::THR_M];
T B_register[BlockSize<T>::THR_N];

*/

}

template <typename Index, typename T>
__global__ void
gemm_fermi(Index m, Index n, Index k,
const T alpha,
const T *A, Index incRowA, Index incColA,
const T *B, Index incRowB, Index incColB,
const T beta,
T *C, Index incRowC, Index incColC)
{
gemm_fermi_kernel(m, n, k,
alpha,
A, incRowA, incColA,
B, incRowB, incColB,
beta,
C, incRowC, incColC);
}

template <typename Alpha, typename T, typename Index, typename Beta>
void
mm(const Alpha &alpha, const GeMatrix<T, Index> &A, const GeMatrix<T, Index> &B,
const Beta &beta, GeMatrix<T, Index> &C)
{
assert(A.numCols==B.numRows);

const Index m = C.numRows;
const Index n = C.numCols;
const Index k = A.numCols;

dim3 dimBlock(BlockSize<T>::DIM, BlockSize<T>::DIM);
dim3 dimGrid((m+BlockSize<T>::M-1)/BlockSize<T>::M,
(n+BlockSize<T>::N-1)/BlockSize<T>::N);

gemm_fermi<<<dimGrid, dimBlock>>>(m, n, k,
alpha,
A.data, A.incRow, A.incCol,
B.data, B.incRow, B.incCol,
beta,
C.data, C.incRow, C.incCol);

nvcc  -I ../../ --gpu-architecture compute_20 -code sm_20 --ptxas-options=-v bench_cugemm.cu