Sample solution
Content |
The difference between both variants is significant. On average, the kernel function of the row-major variant takes ca. 1.87 ms, whereas the col-major variant needs just 0.69 ms.
livingstone$ make jacobi1-colmaj jacobi1-rowmaj nvcc -c -std=c++14 -I/home/numerik/pub/pp/ss19/lib -I/usr/include/gdk-pixbuf-2.0 -I/usr/include/libpng16 -I/usr/include/glib-2.0 -I/usr/lib/x86_64-linux-gnu/glib-2.0/include jacobi1-colmaj.cu nvcc -w -o jacobi1-colmaj -std=c++14 jacobi1-colmaj.o -lgdk_pixbuf-2.0 -lgobject-2.0 -lglib-2.0 nvcc -c -std=c++14 -I/home/numerik/pub/pp/ss19/lib -I/usr/include/gdk-pixbuf-2.0 -I/usr/include/libpng16 -I/usr/include/glib-2.0 -I/usr/lib/x86_64-linux-gnu/glib-2.0/include jacobi1-rowmaj.cu nvcc -w -o jacobi1-rowmaj -std=c++14 jacobi1-rowmaj.o -lgdk_pixbuf-2.0 -lgobject-2.0 -lglib-2.0 livingstone$ diff jacobi1-colmaj.cu jacobi1-rowmaj.cu 50c50 < GeMatrix<double> A(N + 2, N + 2, Order::ColMajor); --- > GeMatrix<double> A(N + 2, N + 2, Order::RowMajor); 64c64 < DeviceGeMatrix<double> devA(A.numRows(), A.numCols(), Order::ColMajor); --- > DeviceGeMatrix<double> devA(A.numRows(), A.numCols(), Order::RowMajor); livingstone$ nvprof ./jacobi1-rowmaj ==837== NVPROF is profiling process 837, command: ./jacobi1-rowmaj ==837== Profiling application: ./jacobi1-rowmaj ==837== Profiling result: Type Time(%) Time Calls Avg Min Max Name GPU activities: 99.58% 1.8778ms 1 1.8778ms 1.8778ms 1.8778ms void jacobi<hpc::cuda::DeviceGeMatrixView, double, bool=1>(hpc<double>::cuda::DeviceGeMatrixView, unsigned int) 0.29% 5.4400us 1 5.4400us 5.4400us 5.4400us [CUDA memcpy DtoH] 0.13% 2.5280us 1 2.5280us 2.5280us 2.5280us [CUDA memcpy HtoD] API calls: 97.86% 207.63ms 1 207.63ms 207.63ms 207.63ms cudaMalloc 0.91% 1.9260ms 2 963.01us 25.282us 1.9007ms cudaMemcpy 0.51% 1.0777ms 97 11.110us 1.9560us 395.71us cuDeviceGetAttribute 0.38% 803.78us 1 803.78us 803.78us 803.78us cudaGetDeviceProperties 0.15% 314.70us 1 314.70us 314.70us 314.70us cudaFree 0.12% 258.69us 1 258.69us 258.69us 258.69us cuDeviceTotalMem 0.05% 104.55us 1 104.55us 104.55us 104.55us cuDeviceGetName 0.02% 33.383us 1 33.383us 33.383us 33.383us cudaLaunchKernel 0.00% 9.2890us 3 3.0960us 2.5140us 
4.1910us cuDeviceGetCount 0.00% 6.6340us 1 6.6340us 6.6340us 6.6340us cudaGetDevice 0.00% 6.2150us 2 3.1070us 2.4440us 3.7710us cuDeviceGet 0.00% 5.7270us 1 5.7270us 5.7270us 5.7270us cuDeviceGetPCIBusId 0.00% 2.7240us 1 2.7240us 2.7240us 2.7240us cuDeviceGetUuid livingstone$ nvprof ./jacobi1-colmaj ==861== NVPROF is profiling process 861, command: ./jacobi1-colmaj ==861== Profiling application: ./jacobi1-colmaj ==861== Profiling result: Type Time(%) Time Calls Avg Min Max Name GPU activities: 98.96% 690.97us 1 690.97us 690.97us 690.97us void jacobi<hpc::cuda::DeviceGeMatrixView, double, bool=1>(hpc<double>::cuda::DeviceGeMatrixView, unsigned int) 0.68% 4.7360us 1 4.7360us 4.7360us 4.7360us [CUDA memcpy DtoH] 0.36% 2.4960us 1 2.4960us 2.4960us 2.4960us [CUDA memcpy HtoD] API calls: 98.07% 158.02ms 1 158.02ms 158.02ms 158.02ms cudaMalloc 0.62% 998.10us 97 10.289us 1.8860us 354.44us cuDeviceGetAttribute 0.48% 765.58us 1 765.58us 765.58us 765.58us cudaGetDeviceProperties 0.46% 733.80us 2 366.90us 20.183us 713.62us cudaMemcpy 0.15% 237.38us 1 237.38us 237.38us 237.38us cuDeviceTotalMem 0.14% 222.16us 1 222.16us 222.16us 222.16us cudaFree 0.06% 94.981us 1 94.981us 94.981us 94.981us cuDeviceGetName 0.02% 29.192us 1 29.192us 29.192us 29.192us cudaLaunchKernel 0.01% 9.6380us 3 3.2120us 2.3750us 4.6090us cuDeviceGetCount 0.00% 8.0310us 1 8.0310us 8.0310us 8.0310us cuDeviceGetPCIBusId 0.00% 6.0060us 1 6.0060us 6.0060us 6.0060us cudaGetDevice 0.00% 5.6570us 2 2.8280us 2.4450us 3.2120us cuDeviceGet 0.00% 2.6540us 1 2.6540us 2.6540us 2.6540us cuDeviceGetUuid livingstone$
Let us have a look at the kernel function:
/* Jacobi iteration kernel: each thread updates one interior point of A.
   The matrix includes a boundary ring, hence the "+ 1" offsets. */
template<
   template<typename> class Matrix,
   typename T,
   Require< DeviceGe<Matrix<T>>, DeviceView<Matrix<T>> > = true
>
__global__ void jacobi(Matrix<T> A, unsigned int nofiterations) {
   /* map thread coordinates to interior matrix indices (skip boundary row/col 0) */
   std::size_t i = threadIdx.x + 1;
   std::size_t j = threadIdx.y + 1;
   for (unsigned int it = 0; it < nofiterations; ++it) {
      /* average of the four neighbours */
      T Aij = (A(i-1, j) + A(i, j-1) + A(i, j+1) + A(i+1, j)) / 4.0;
      __syncthreads(); /* all threads must have read before anyone writes */
      A(i, j) = Aij;
      __syncthreads(); /* all writes must finish before the next iteration reads */
   }
}
The index i depends on threadIdx.x, j on threadIdx.y. Within a warp, j is the same for all threads while threadIdx.x ranges from 0 to 31, i.e. i ranges from 1 to 32. With a row-major layout, the individual threads of a warp access distant memory locations for A(i-1, j) etc., whereas with a col-major layout those values are stored consecutively in memory and are thereby much more likely to fall within the same cache line. Consequently, far less time is spent loading and storing.
Of course, we do not really require matrices on the GPU to be in col-major as we could have easily switched the indices i and j for a row-major matrix.
#include <cstddef> #include <cmath> #include <printf.hpp> #include <hpc/aux/hsvcolor.hpp> #include <hpc/cuda/check.hpp> #include <hpc/cuda/copy.hpp> #include <hpc/cuda/properties.hpp> #include <hpc/matvec/for-all-indices.hpp> #include <hpc/matvec/gematrix.hpp> #include <hpc/matvec/matrix2pixbuf.hpp> template<typename T> const T PI = std::acos(T(-1.0)); template<typename T> const T E = std::exp(T(1.0)); template<typename T> const T E_POWER_MINUS_PI = std::pow(E<T>, -PI<T>); using namespace hpc; template< template<typename> class Matrix, typename T, Require< DeviceGe<Matrix<T>>, DeviceView<Matrix<T>> > = true > __global__ void jacobi(Matrix<T> A, unsigned int nofiterations) { std::size_t i = threadIdx.x + 1; std::size_t j = threadIdx.y + 1; for (unsigned int it = 0; it < nofiterations; ++it) { T Aij = (A(i-1, j) + A(i, j-1) + A(i, j+1) + A(i+1, j)) / 4.0; __syncthreads(); A(i, j) = Aij; __syncthreads(); } } template<typename T> constexpr T int_sqrt(T n) { T result = 1; while (result * result <= n) { ++result; } return result - 1; } int main(int argc, char** argv) { using namespace hpc::matvec; using namespace hpc::cuda; using namespace hpc::aux; using T = double; std::size_t max_threads = get_max_threads_per_block(); std::size_t N = int_sqrt(max_threads); GeMatrix<T> A(N + 2, N + 2, Order::ColMajor); /* initialize the entire matrix, including its borders */ for_all_indices(A, [&](std::size_t i, std::size_t j) -> void { if (j == 0) { A(i, j) = std::sin(PI<T> * (T(i)/(A.numRows()-1))); } else if (j == A.numCols() - 1) { A(i, j) = std::sin(PI<T> * (T(i)/(A.numRows()-1))) * E_POWER_MINUS_PI<T>; } else { A(i, j) = 0; } }); DeviceGeMatrix<T> devA(A.numRows(), A.numCols(), Order::ColMajor); copy(A, devA); dim3 block(N, N); jacobi<<<1, block>>>(devA.view(), 520); copy(devA, A); auto pixbuf = create_pixbuf(A, [](T val) -> HSVColor<T> { return HSVColor<T>((1-val) * 240, 1, 1); }, 8); gdk_pixbuf_save(pixbuf, "jacobi.jpg", "jpeg", nullptr, "quality", "100", nullptr); }