Matrix-Produkt mit Cuda
Folgende Vorlage kann zum Testen der Matrix Implementierung benutzt werden:
#include <cuda_runtime.h> #include <cublas_v2.h> #include <cstdio> #include <hpc/cuda/check.h> #include <hpc/ulmblas/gescal.h> #include <hpc/matvec/gematrix.h> #include <hpc/cuda/gematrix.h> #include <hpc/cuda/mm.h> #include <hpc/cuda/copy.h> #include <hpc/ulmblas/gecopy.h> void printMatrix(const hpc::matvec::GeMatrix<double> &A, const char *name=0) { if (name) { printf("%s =\n", name); } for (int i=0; i<A.numRows; ++i) { for (int j=0; j<A.numCols; ++j) { printf("%6.2lf ", A(i,j)); } printf("\n"); } printf("\n"); } void initMatrix(hpc::matvec::GeMatrix<double> &A) { for (int i=0; i<A.numRows; ++i) { for (int j=0; j<A.numCols; ++j) { A(i,j) = i + j*A.numRows; } } } template <typename Index, typename Alpha, typename TX> void gescal(Index m, Index n, Alpha alpha, TX *X, Index incRowX, Index incColX) { if (alpha!=1.0) { for (long i=0; i<m; ++i) { for (long j=0; j<n; ++j) { X[i*incRowX+j*incColX] *= alpha; } } } } template <typename Index, typename Alpha, typename TA, typename TB, typename Beta, typename TC> void refGemm(Index m, Index n, Index k, Alpha alpha, const TA *A, Index incRowA, Index incColA, const TB *B, Index incRowB, Index incColB, Beta beta, TC *C, Index incRowC, Index incColC) { gescal(m, n, beta, C, incRowC, incColC); for (Index j=0; j<n; ++j) { for (Index l=0; l<k; ++l) { for (Index i=0; i<m; ++i) { C[i*incRowC+j*incColC] += alpha*A[i*incRowA+l*incColA] *B[l*incRowB+j*incColB]; } } } } int main() { // // Matrizen auf Host anlegen und initialisieren // std::size_t m = 16; std::size_t n = 16; std::size_t k = 16; hpc::matvec::GeMatrix<double> A(m, k); hpc::matvec::GeMatrix<double> B(k, n); hpc::matvec::GeMatrix<double> C(m, n); hpc::matvec::GeMatrix<double> C_check(m, n); double alpha = 1; double beta = 0; initMatrix(A); initMatrix(B); initMatrix(C); //hpc::matvec::copy(C, C_check); hpc::ulmblas::gecopy(C.numRows, C.numCols, C.data, C.incRow, C.incCol, C_check.data, C_check.incRow, C_check.incCol); printMatrix(A, "A"); printMatrix(B, "B"); printMatrix(C, "C"); // // Matrizen auf Device anlegen // hpc::cuda::GeMatrix<double> dA(A.numRows, A.numCols); hpc::cuda::GeMatrix<double> dB(B.numRows, B.numCols); hpc::cuda::GeMatrix<double> dC(C.numRows, C.numCols); // // Matrizen von Host auf Device kopieren // hpc::cuda::copy(A, dA); hpc::cuda::copy(B, dB); hpc::cuda::copy(C, dC); // // Operation auf dem Device ausfuehren // hpc::cuda::mm(1.0, dA, dB, 0.0, dC); // // Ergebnis von Device auf den Host kopieren // hpc::cuda::copy(dC, C); printf("C <- %5.3lf*A*B + %5.3lf*C\n", alpha, beta); printMatrix(C, "C"); // // Zum Test mit matvec Implementierung vergleichen // //hpc::matvec::mm(1.0, A, B, 0.0, C_check); refGemm(C.numRows, C.numCols, A.numCols, alpha, A.data, A.incRow, A.incCol, B.data, B.incRow, B.incCol, beta, C_check.data, C_check.incRow, C_check.incCol); printMatrix(C_check, "C_check"); }
Das Vorlesungsprojekt aus Session 24 muss dazu nur um die Implementierung in hpc/cuda/mm.h ergänzt werden:
#ifndef HPC_CUDA_MM_H #define HPC_CUDA_MM_H 1 #include <cassert> #include <hpc/cuda/check.h> #include <hpc/cuda/gematrix.h> namespace hpc { namespace cuda { template <typename T> struct BlockSize { static const int M = 64; static const int N = 64; static const int K = 16; static const int DIM = 16; static const int THR_M = M / DIM; static const int THR_N = N / DIM; //static_assert(M>0 && N>0, "Invalid block size."); //static_assert(M % DIM == 0, "M must be a multiple of DIM."); //static_assert(N % DIM == 0, "N must be a multiple of DIM."); }; template <typename Index, typename T> __device__ void gemm_fermi_kernel(Index m, Index n, Index k, const T alpha, const T *A, Index incRowA, Index incColA, const T *B, Index incRowB, Index incColB, const T beta, T *C, Index incRowC, Index incColC) { /* Index i0 = blockIdx.y * BlockSize<T>::M; Index j0 = blockIdx.x * BlockSize<T>::N; A += i0*incRowA; B += j0*incColB; C += i0*incRowC + j0*incColC; Index threadRow = threadIdx.x; Index threadCol = threadIdx.y; __shared__ T A_shared[BlockSize<T>::M][BlockSize<T>::K]; __shared__ T B_shared[BlockSize<T>::K][BlockSize<T>::N]; T C_register[BlockSize<T>::THR_M][BlockSize<T>::THR_N]; T A_register[BlockSize<T>::THR_M]; T B_register[BlockSize<T>::THR_N]; .... your code here .... */ } template <typename Index, typename T> __global__ void gemm_fermi(Index m, Index n, Index k, const T alpha, const T *A, Index incRowA, Index incColA, const T *B, Index incRowB, Index incColB, const T beta, T *C, Index incRowC, Index incColC) { gemm_fermi_kernel(m, n, k, alpha, A, incRowA, incColA, B, incRowB, incColB, beta, C, incRowC, incColC); } template <typename Alpha, typename T, typename Index, typename Beta> void mm(const Alpha &alpha, const GeMatrix<T, Index> &A, const GeMatrix<T, Index> &B, const Beta &beta, GeMatrix<T, Index> &C) { assert(A.numCols==B.numRows); const Index m = C.numRows; const Index n = C.numCols; const Index k = A.numCols; dim3 dimBlock(BlockSize<T>::DIM, BlockSize<T>::DIM); dim3 dimGrid((m+BlockSize<T>::M-1)/BlockSize<T>::M, (n+BlockSize<T>::N-1)/BlockSize<T>::N); gemm_fermi<<<dimGrid, dimBlock>>>(m, n, k, alpha, A.data, A.incRow, A.incCol, B.data, B.incRow, B.incCol, beta, C.data, C.incRow, C.incCol); CHECK_CUDA(cudaDeviceSynchronize); } } } // namespace matvec, hpc #endif // HPC_CUDA_MM_H
Im Verzeichnis hpc/tests/ kann obiger Test mit
nvcc -I ../../ --gpu-architecture compute_20 -code sm_20 --ptxas-options=-v bench_cugemm.cu
übersetzt werden.