Matrix-Produkt mit Cuda

Folgende Vorlage kann zum Testen der Matrix Implementierung benutzt werden:

#include <cuda_runtime.h>
#include <cublas_v2.h>
#include <cstdio>
#include <hpc/cuda/check.h>
#include <hpc/ulmblas/gescal.h>
#include <hpc/matvec/gematrix.h>
#include <hpc/cuda/gematrix.h>
#include <hpc/cuda/mm.h>
#include <hpc/cuda/copy.h>
#include <hpc/ulmblas/gecopy.h>


void
printMatrix(const hpc::matvec::GeMatrix<double> &A, const char *name=0)
{
    if (name) {
        printf("%s =\n", name);
    }
    for (int i=0; i<A.numRows; ++i) {
        for (int j=0; j<A.numCols; ++j) {
            printf("%6.2lf ", A(i,j));
        }
        printf("\n");
    }
    printf("\n");
}

void
initMatrix(hpc::matvec::GeMatrix<double> &A)
{
    for (int i=0; i<A.numRows; ++i) {
        for (int j=0; j<A.numCols; ++j) {
            A(i,j) = i + j*A.numRows;
        }
    }
}

template <typename Index, typename Alpha, typename TX>
void
gescal(Index m, Index n,
       Alpha alpha,
       TX *X, Index incRowX, Index incColX)
{
    if (alpha!=1.0) {
        for (long i=0; i<m; ++i) {
            for (long j=0; j<n; ++j) {
                X[i*incRowX+j*incColX] *= alpha;
            }
        }
    }
}

template <typename Index, typename Alpha, typename TA, typename TB,
          typename Beta, typename TC>
void
refGemm(Index m, Index n, Index k,
        Alpha alpha,
        const TA *A, Index incRowA, Index incColA,
        const TB *B, Index incRowB, Index incColB,
        Beta beta,
        TC *C, Index incRowC, Index incColC)
{
     gescal(m, n, beta, C, incRowC, incColC);
     for (Index j=0; j<n; ++j) {
         for (Index l=0; l<k; ++l) {
             for (Index i=0; i<m; ++i) {
                 C[i*incRowC+j*incColC] += alpha*A[i*incRowA+l*incColA]
                                                *B[l*incRowB+j*incColB];
             }
         }
    }
}

int
main() {
//
//  Matrizen auf Host anlegen und initialisieren
//
    std::size_t m = 16;
    std::size_t n = 16;
    std::size_t k = 16;

    hpc::matvec::GeMatrix<double> A(m, k);
    hpc::matvec::GeMatrix<double> B(k, n);
    hpc::matvec::GeMatrix<double> C(m, n);
    hpc::matvec::GeMatrix<double> C_check(m, n);
    double alpha = 1;
    double beta  = 0;

    initMatrix(A);
    initMatrix(B);
    initMatrix(C);
    //hpc::matvec::copy(C, C_check);
    hpc::ulmblas::gecopy(C.numRows, C.numCols,
                         C.data, C.incRow, C.incCol,
                         C_check.data, C_check.incRow, C_check.incCol);

    printMatrix(A, "A");
    printMatrix(B, "B");
    printMatrix(C, "C");


//
//  Matrizen auf Device anlegen
//
    hpc::cuda::GeMatrix<double> dA(A.numRows, A.numCols);
    hpc::cuda::GeMatrix<double> dB(B.numRows, B.numCols);
    hpc::cuda::GeMatrix<double> dC(C.numRows, C.numCols);

//
//  Matrizen von Host auf Device kopieren
//
    hpc::cuda::copy(A, dA);
    hpc::cuda::copy(B, dB);
    hpc::cuda::copy(C, dC);

//
//  Operation auf dem Device ausfuehren
//
    hpc::cuda::mm(1.0, dA, dB, 0.0, dC);

//
//  Ergebnis von Device auf den Host kopieren
//
    hpc::cuda::copy(dC, C);

    printf("C <- %5.3lf*A*B + %5.3lf*C\n", alpha, beta);

    printMatrix(C, "C");

//
//  Zum Test mit matvec Implementierung vergleichen
//
    //hpc::matvec::mm(1.0, A, B, 0.0, C_check);
    refGemm(C.numRows, C.numCols, A.numCols,
            alpha,
            A.data, A.incRow, A.incCol,
            B.data, B.incRow, B.incCol,
            beta,
            C_check.data, C_check.incRow, C_check.incCol);

    printMatrix(C_check, "C_check");
}

Das Vorlesungsprojekt aus Session 24 muss dazu nur um die Implementierung in hpc/cuda/mm.h ergänzt werden:

session25/hpc/cuda/mm.h

#ifndef HPC_CUDA_MM_H
#define HPC_CUDA_MM_H 1

#include <cassert>
#include <hpc/cuda/check.h>
#include <hpc/cuda/gematrix.h>

namespace hpc { namespace cuda {

template <typename T>
struct BlockSize
{
    static const int M     = 64;
    static const int N     = 64;
    static const int K     = 16;
    static const int DIM   = 16;

    static const int THR_M = M / DIM;
    static const int THR_N = N / DIM;

    //static_assert(M>0 && N>0, "Invalid block size.");
    //static_assert(M % DIM == 0, "M must be a multiple of DIM.");
    //static_assert(N % DIM == 0, "N must be a multiple of DIM.");
};

template <typename Index, typename T>
__device__ void
gemm_fermi_kernel(Index m, Index n, Index k,
                  const T alpha,
                  const T *A, Index incRowA, Index incColA,
                  const T *B, Index incRowB, Index incColB,
                  const T beta,
                  T *C, Index incRowC, Index incColC)
{
    /*
    Index i0  = blockIdx.y * BlockSize<T>::M;
    Index j0  = blockIdx.x * BlockSize<T>::N;

    A += i0*incRowA;
    B += j0*incColB;
    C += i0*incRowC + j0*incColC;

    Index threadRow = threadIdx.x;
    Index threadCol = threadIdx.y;

    __shared__ T A_shared[BlockSize<T>::M][BlockSize<T>::K];
    __shared__ T B_shared[BlockSize<T>::K][BlockSize<T>::N];

    T C_register[BlockSize<T>::THR_M][BlockSize<T>::THR_N];
    T A_register[BlockSize<T>::THR_M];
    T B_register[BlockSize<T>::THR_N];



        .... your code here ....

     */

}

template <typename Index, typename T>
__global__ void
gemm_fermi(Index m, Index n, Index k,
           const T alpha,
           const T *A, Index incRowA, Index incColA,
           const T *B, Index incRowB, Index incColB,
           const T beta,
           T *C, Index incRowC, Index incColC)
{
    gemm_fermi_kernel(m, n, k,
                      alpha,
                      A, incRowA, incColA,
                      B, incRowB, incColB,
                      beta,
                      C, incRowC, incColC);
}

template <typename Alpha, typename T, typename Index, typename Beta>
void
mm(const Alpha &alpha, const GeMatrix<T, Index> &A, const GeMatrix<T, Index> &B,
   const Beta &beta, GeMatrix<T, Index> &C)
{
    assert(A.numCols==B.numRows);

    const Index m = C.numRows;
    const Index n = C.numCols;
    const Index k = A.numCols;

    dim3 dimBlock(BlockSize<T>::DIM, BlockSize<T>::DIM);
    dim3 dimGrid((m+BlockSize<T>::M-1)/BlockSize<T>::M,
                 (n+BlockSize<T>::N-1)/BlockSize<T>::N);

    gemm_fermi<<<dimGrid, dimBlock>>>(m, n, k,
                                      alpha,
                                      A.data, A.incRow, A.incCol,
                                      B.data, B.incRow, B.incCol,
                                      beta,
                                      C.data, C.incRow, C.incCol);
    CHECK_CUDA(cudaDeviceSynchronize);
}

} } // namespace matvec, hpc

#endif // HPC_CUDA_MM_H

Im Verzeichnis hpc/tests/ kann obiger Test mit

nvcc  -I ../../ --gpu-architecture compute_20 -code sm_20 --ptxas-options=-v bench_cugemm.cu

übersetzt werden.