#ifndef HPC_CUDA_MM_H
#define HPC_CUDA_MM_H 1

// NOTE(review): the original include targets were lost in extraction; the
// three directives below are reconstructed from the names this file uses
// (assert, CHECK_CUDA, GeMatrix) — confirm against the project layout.
#include <cassert>
#include <hpc/cuda/check.h>     // CHECK_CUDA
#include <hpc/cuda/gematrix.h>  // GeMatrix

namespace hpc { namespace cuda {

// Tile-size configuration for the Fermi-style GEMM kernel.
//
// A thread block of DIM x DIM threads computes an M x N tile of C,
// iterating over the K dimension in chunks of K.  Each thread therefore
// owns a THR_M x THR_N sub-tile of the block's C tile.
template <typename T>
struct BlockSize {
    static const int M     = 64;
    static const int N     = 64;
    static const int K     = 16;
    static const int DIM   = 16;
    static const int THR_M = M / DIM;
    static const int THR_N = N / DIM;

    // Preconditions the kernel's register/shared-memory tiling relies on.
    static_assert(M > 0 && N > 0, "Invalid block size.");
    static_assert(M % DIM == 0, "M must be a multiple of DIM.");
    static_assert(N % DIM == 0, "N must be a multiple of DIM.");
};

// Device-side worker for C = beta*C + alpha*A*B on one M x N tile of C.
//
// Expected launch layout (see mm() below): blockDim = (DIM, DIM),
// gridDim = (ceil(n/N), ceil(m/M)); per the skeleton, blockIdx.y selects
// the row tile and blockIdx.x the column tile.  The body is intentionally
// left as a commented skeleton to be filled in.
template <typename Index, typename T>
__device__ void
gemm_fermi_kernel(Index m, Index n, Index k,
                  const T alpha,
                  const T *A, Index incRowA, Index incColA,
                  const T *B, Index incRowB, Index incColB,
                  const T beta,
                  T *C, Index incRowC, Index incColC)
{
    /*
    Index i0 = blockIdx.y * BlockSize<T>::M;
    Index j0 = blockIdx.x * BlockSize<T>::N;

    A += i0*incRowA;
    B += j0*incColB;
    C += i0*incRowC + j0*incColC;

    Index threadRow = threadIdx.x;
    Index threadCol = threadIdx.y;

    __shared__ T A_shared[BlockSize<T>::M][BlockSize<T>::K];
    __shared__ T B_shared[BlockSize<T>::K][BlockSize<T>::N];

    T C_register[BlockSize<T>::THR_M][BlockSize<T>::THR_N];
    T A_register[BlockSize<T>::THR_M];
    T B_register[BlockSize<T>::THR_N];

    .... your code here ....
    */
}

// __global__ entry point: thin wrapper so the tiling logic can live in a
// __device__ function (and be reused/tested separately).
template <typename Index, typename T>
__global__ void
gemm_fermi(Index m, Index n, Index k,
           const T alpha,
           const T *A, Index incRowA, Index incColA,
           const T *B, Index incRowB, Index incColB,
           const T beta,
           T *C, Index incRowC, Index incColC)
{
    gemm_fermi_kernel(m, n, k,
                      alpha,
                      A, incRowA, incColA,
                      B, incRowB, incColB,
                      beta,
                      C, incRowC, incColC);
}

// Host-side driver: C = beta*C + alpha*A*B for device-resident GeMatrix
// operands.  Blocks until the kernel has finished (synchronous semantics).
template <typename Alpha, typename T, typename Index, typename Beta>
void
mm(const Alpha &alpha,
   const GeMatrix<T, Index> &A,
   const GeMatrix<T, Index> &B,
   const Beta &beta,
   GeMatrix<T, Index> &C)
{
    assert(A.numCols == B.numRows);

    const Index m = C.numRows;
    const Index n = C.numCols;
    const Index k = A.numCols;

    dim3 dimBlock(BlockSize<T>::DIM, BlockSize<T>::DIM);
    // Ceil-division so partial tiles at the matrix edges are covered.
    // Grid x spans the column tiles (n/N) and grid y the row tiles (m/M),
    // matching the skeleton's blockIdx.x -> j0 / blockIdx.y -> i0 mapping.
    dim3 dimGrid((n + BlockSize<T>::N - 1) / BlockSize<T>::N,
                 (m + BlockSize<T>::M - 1) / BlockSize<T>::M);

    gemm_fermi<<<dimGrid, dimBlock>>>(m, n, k,
                                      alpha,
                                      A.data, A.incRow, A.incCol,
                                      B.data, B.incRow, B.incCol,
                                      beta,
                                      C.data, C.incRow, C.incCol);
    // NOTE(review): assumes CHECK_CUDA is the course macro that receives the
    // API function *name* and invokes it (hence no parentheses) — confirm
    // against its definition in check.h.
    CHECK_CUDA(cudaDeviceSynchronize);
}

} } // namespace cuda, hpc

#endif // HPC_CUDA_MM_H