Sample solution

Content

The difference between both variants is significant. On average, the kernel function of the row-major variant takes approximately 1.87 ms, whereas the col-major variant needs just 0.69 ms.

livingstone$ make jacobi1-colmaj jacobi1-rowmaj
nvcc -c -std=c++14 -I/home/numerik/pub/pp/ss19/lib -I/usr/include/gdk-pixbuf-2.0 -I/usr/include/libpng16 -I/usr/include/glib-2.0 -I/usr/lib/x86_64-linux-gnu/glib-2.0/include  jacobi1-colmaj.cu
nvcc -w -o jacobi1-colmaj -std=c++14  jacobi1-colmaj.o -lgdk_pixbuf-2.0 -lgobject-2.0 -lglib-2.0
nvcc -c -std=c++14 -I/home/numerik/pub/pp/ss19/lib -I/usr/include/gdk-pixbuf-2.0 -I/usr/include/libpng16 -I/usr/include/glib-2.0 -I/usr/lib/x86_64-linux-gnu/glib-2.0/include  jacobi1-rowmaj.cu
nvcc -w -o jacobi1-rowmaj -std=c++14  jacobi1-rowmaj.o -lgdk_pixbuf-2.0 -lgobject-2.0 -lglib-2.0
livingstone$ diff jacobi1-colmaj.cu jacobi1-rowmaj.cu
50c50
<    GeMatrix<double> A(N + 2, N + 2, Order::ColMajor);
---
>    GeMatrix<double> A(N + 2, N + 2, Order::RowMajor);
64c64
<    DeviceGeMatrix<double> devA(A.numRows(), A.numCols(), Order::ColMajor);
---
>    DeviceGeMatrix<double> devA(A.numRows(), A.numCols(), Order::RowMajor);
livingstone$ nvprof ./jacobi1-rowmaj
==837== NVPROF is profiling process 837, command: ./jacobi1-rowmaj
==837== Profiling application: ./jacobi1-rowmaj
==837== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:   99.58%  1.8778ms         1  1.8778ms  1.8778ms  1.8778ms  void jacobi<hpc::cuda::DeviceGeMatrixView, double, bool=1>(hpc<double>::cuda::DeviceGeMatrixView, unsigned int)
                    0.29%  5.4400us         1  5.4400us  5.4400us  5.4400us  [CUDA memcpy DtoH]
                    0.13%  2.5280us         1  2.5280us  2.5280us  2.5280us  [CUDA memcpy HtoD]
      API calls:   97.86%  207.63ms         1  207.63ms  207.63ms  207.63ms  cudaMalloc
                    0.91%  1.9260ms         2  963.01us  25.282us  1.9007ms  cudaMemcpy
                    0.51%  1.0777ms        97  11.110us  1.9560us  395.71us  cuDeviceGetAttribute
                    0.38%  803.78us         1  803.78us  803.78us  803.78us  cudaGetDeviceProperties
                    0.15%  314.70us         1  314.70us  314.70us  314.70us  cudaFree
                    0.12%  258.69us         1  258.69us  258.69us  258.69us  cuDeviceTotalMem
                    0.05%  104.55us         1  104.55us  104.55us  104.55us  cuDeviceGetName
                    0.02%  33.383us         1  33.383us  33.383us  33.383us  cudaLaunchKernel
                    0.00%  9.2890us         3  3.0960us  2.5140us  4.1910us  cuDeviceGetCount
                    0.00%  6.6340us         1  6.6340us  6.6340us  6.6340us  cudaGetDevice
                    0.00%  6.2150us         2  3.1070us  2.4440us  3.7710us  cuDeviceGet
                    0.00%  5.7270us         1  5.7270us  5.7270us  5.7270us  cuDeviceGetPCIBusId
                    0.00%  2.7240us         1  2.7240us  2.7240us  2.7240us  cuDeviceGetUuid
livingstone$ nvprof ./jacobi1-colmaj
==861== NVPROF is profiling process 861, command: ./jacobi1-colmaj
==861== Profiling application: ./jacobi1-colmaj
==861== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:   98.96%  690.97us         1  690.97us  690.97us  690.97us  void jacobi<hpc::cuda::DeviceGeMatrixView, double, bool=1>(hpc<double>::cuda::DeviceGeMatrixView, unsigned int)
                    0.68%  4.7360us         1  4.7360us  4.7360us  4.7360us  [CUDA memcpy DtoH]
                    0.36%  2.4960us         1  2.4960us  2.4960us  2.4960us  [CUDA memcpy HtoD]
      API calls:   98.07%  158.02ms         1  158.02ms  158.02ms  158.02ms  cudaMalloc
                    0.62%  998.10us        97  10.289us  1.8860us  354.44us  cuDeviceGetAttribute
                    0.48%  765.58us         1  765.58us  765.58us  765.58us  cudaGetDeviceProperties
                    0.46%  733.80us         2  366.90us  20.183us  713.62us  cudaMemcpy
                    0.15%  237.38us         1  237.38us  237.38us  237.38us  cuDeviceTotalMem
                    0.14%  222.16us         1  222.16us  222.16us  222.16us  cudaFree
                    0.06%  94.981us         1  94.981us  94.981us  94.981us  cuDeviceGetName
                    0.02%  29.192us         1  29.192us  29.192us  29.192us  cudaLaunchKernel
                    0.01%  9.6380us         3  3.2120us  2.3750us  4.6090us  cuDeviceGetCount
                    0.00%  8.0310us         1  8.0310us  8.0310us  8.0310us  cuDeviceGetPCIBusId
                    0.00%  6.0060us         1  6.0060us  6.0060us  6.0060us  cudaGetDevice
                    0.00%  5.6570us         2  2.8280us  2.4450us  3.2120us  cuDeviceGet
                    0.00%  2.6540us         1  2.6540us  2.6540us  2.6540us  cuDeviceGetUuid
livingstone$

Let us have a look at the kernel function:

template<
   template<typename> class Matrix,
   typename T,
   Require< DeviceGe<Matrix<T>>, DeviceView<Matrix<T>> > = true
>
__global__ void jacobi(Matrix<T> A, unsigned int nofiterations) {
   std::size_t i = threadIdx.x + 1;
   std::size_t j = threadIdx.y + 1;
   for (unsigned int it = 0; it < nofiterations; ++it) {
      T Aij = (A(i-1, j) + A(i, j-1) + A(i, j+1) + A(i+1, j)) / 4.0;
      __syncthreads();
      A(i, j) = Aij;
      __syncthreads();
   }
}

The index i depends on threadIdx.x, j on threadIdx.y. Within a warp, j is the same for all threads but i ranges from 1 to 32 (as i = threadIdx.x + 1 and threadIdx.x runs from 0 to 31 within a warp). If we have row-major order, the individual threads of a warp access distant values for A(i-1, j) etc., whereas in case of col-major order the accessed values are stored consecutively in memory and thereby more likely within the same cache line. This means much less time is spent loading and storing.

Of course, we do not really require matrices on the GPU to be stored in col-major order: we could just as easily have swapped the roles of the indices i and j for a row-major matrix.

#include <cstddef>
#include <cmath>
#include <printf.hpp>
#include <hpc/aux/hsvcolor.hpp>
#include <hpc/cuda/check.hpp>
#include <hpc/cuda/copy.hpp>
#include <hpc/cuda/properties.hpp>
#include <hpc/matvec/for-all-indices.hpp>
#include <hpc/matvec/gematrix.hpp>
#include <hpc/matvec/matrix2pixbuf.hpp>

/* mathematical constants as variable templates, evaluated at the
   precision of the instantiating type T */
template<typename T>
const T PI = std::acos(T(-1.0));
template<typename T>
const T E = std::exp(T(1.0));
/* e^(-pi); used below to scale the values on the right border of A */
template<typename T>
const T E_POWER_MINUS_PI = std::pow(E<T>, -PI<T>);

using namespace hpc;

/* perform nofiterations Jacobi iterations on the device matrix view A:
   each interior element is repeatedly replaced by the average of its
   four neighbours while the border of A remains fixed.

   Expected launch configuration: a single block whose blockDim.x and
   blockDim.y equal the number of interior rows/columns, i.e. one thread
   per interior element of the (N+2) x (N+2) matrix (see the launch in
   main: jacobi<<<1, block>>> with block = dim3(N, N)). The two
   __syncthreads() barriers make each iteration a proper Jacobi step:
   all threads must have read the old values before any thread stores
   its new value, and all stores must be visible before the next
   iteration starts reading. */
template<
   template<typename> class Matrix,
   typename T,
   Require< DeviceGe<Matrix<T>>, DeviceView<Matrix<T>> > = true
>
__global__ void jacobi(Matrix<T> A, unsigned int nofiterations) {
   /* +1 skips the fixed border row/column */
   std::size_t i = threadIdx.x + 1;
   std::size_t j = threadIdx.y + 1;
   for (unsigned int it = 0; it < nofiterations; ++it) {
      /* average of the four direct neighbours */
      T Aij = (A(i-1, j) + A(i, j-1) + A(i, j+1) + A(i+1, j)) / 4.0;
      __syncthreads();
      A(i, j) = Aij;
      __syncthreads();
   }
}

/* compute the integer square root of n, i.e. the largest value r
   such that r * r <= n; constexpr so it can be evaluated at compile
   time (C++14 relaxed constexpr permits the loop).

   The loop condition is written as result <= n / result, which is
   equivalent to result * result <= n for positive integers but avoids
   the overflow (wrap-around for unsigned T) of result * result when
   n is close to the maximum representable value of T. */
template<typename T>
constexpr T int_sqrt(T n) {
   T result = 1;
   while (result <= n / result) {
      ++result;
   }
   return result - 1;
}

int main(int argc, char** argv) {
   using namespace hpc::matvec;
   using namespace hpc::cuda;
   using namespace hpc::aux;

   using Real = double;

   /* choose the largest quadratic block the device supports:
      N x N threads, one per interior element */
   std::size_t max_threads = get_max_threads_per_block();
   std::size_t N = int_sqrt(max_threads);

   GeMatrix<Real> A(N + 2, N + 2, Order::ColMajor);

   /* initialize the entire matrix, including its borders:
      a sine curve on the first column, the same curve scaled by
      e^(-pi) on the last column, zero everywhere else */
   for_all_indices(A, [&](std::size_t row, std::size_t col) -> void {
      if (col == 0) {
	 A(row, col) = std::sin(PI<Real> * (Real(row)/(A.numRows()-1)));
      } else if (col == A.numCols() - 1) {
	 A(row, col) = std::sin(PI<Real> * (Real(row)/(A.numRows()-1))) *
	    E_POWER_MINUS_PI<Real>;
      } else {
	 A(row, col) = 0;
      }
   });

   /* run 520 Jacobi iterations on the device in a single block
      and fetch the result back */
   DeviceGeMatrix<Real> devA(A.numRows(), A.numCols(), Order::ColMajor);
   copy(A, devA);
   dim3 blockdim(N, N);
   jacobi<<<1, blockdim>>>(devA.view(), 520);
   copy(devA, A);

   /* visualize A by mapping values onto the HSV hue range and
      store the image as jpeg */
   auto pixbuf = create_pixbuf(A, [](Real val) -> HSVColor<Real> {
      return HSVColor<Real>((1-val) * 240, 1, 1);
   }, 8);
   gdk_pixbuf_save(pixbuf, "jacobi.jpg", "jpeg", nullptr,
      "quality", "100", nullptr);
}