Sample solution

Content

The difference between both variants is significant. On average, the kernel function for the row-major variant uses ca. 1.9 ms where the col-major variant needs just 0.69 ms.

livingstone$ cp /home/numerik/pub/hpc/ws19/session29/misc/Makefile .
livingstone$ make depend
gcc-makedepend -std=c++14 -I/home/numerik/pub/hpc/ws19/session29 -D__CUDACC__ -x c++ jacobi1-colmaj.cu -x c++ jacobi1-rowmaj.cu
livingstone$ make jacobi1-colmaj jacobi1-rowmaj
nvcc -c -std=c++14 -I/home/numerik/pub/hpc/ws19/session29 -I/usr/include/gdk-pixbuf-2.0 -I/usr/include/libpng16 -I/usr/include/glib-2.0 -I/usr/lib/x86_64-linux-gnu/glib-2.0/include  jacobi1-colmaj.cu
nvcc -w -o jacobi1-colmaj -std=c++14  jacobi1-colmaj.o -lgdk_pixbuf-2.0 -lgobject-2.0 -lglib-2.0
nvcc -c -std=c++14 -I/home/numerik/pub/hpc/ws19/session29 -I/usr/include/gdk-pixbuf-2.0 -I/usr/include/libpng16 -I/usr/include/glib-2.0 -I/usr/lib/x86_64-linux-gnu/glib-2.0/include  jacobi1-rowmaj.cu
nvcc -w -o jacobi1-rowmaj -std=c++14  jacobi1-rowmaj.o -lgdk_pixbuf-2.0 -lgobject-2.0 -lglib-2.0
livingstone$ diff jacobi1-colmaj.cu jacobi1-rowmaj.cu
50c50
<    GeMatrix<double> A(N + 2, N + 2, Order::ColMajor);
---
>    GeMatrix<double> A(N + 2, N + 2, Order::RowMajor);
64c64
<    DeviceGeMatrix<double> devA(A.numRows(), A.numCols(), Order::ColMajor);
---
>    DeviceGeMatrix<double> devA(A.numRows(), A.numCols(), Order::RowMajor);
livingstone$ nvprof ./jacobi1-rowmaj
==25450== NVPROF is profiling process 25450, command: ./jacobi1-rowmaj
==25450== Profiling application: ./jacobi1-rowmaj
==25450== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:   99.61%  1.8616ms         1  1.8616ms  1.8616ms  1.8616ms  void jacobi<hpc::cuda::DeviceGeMatrixView, double, bool=1>(hpc<double>::cuda::DeviceGeMatrixView, unsigned int)
                    0.25%  4.7360us         1  4.7360us  4.7360us  4.7360us  [CUDA memcpy DtoH]
                    0.13%  2.4640us         1  2.4640us  2.4640us  2.4640us  [CUDA memcpy HtoD]
      API calls:   97.24%  156.32ms         1  156.32ms  156.32ms  156.32ms  cudaMalloc
                    1.19%  1.9049ms         2  952.43us  20.533us  1.8843ms  cudaMemcpy
                    0.66%  1.0568ms        97  10.894us  2.0260us  373.43us  cuDeviceGetAttribute
                    0.50%  810.14us         1  810.14us  810.14us  810.14us  cudaGetDeviceProperties
                    0.17%  269.23us         1  269.23us  269.23us  269.23us  cuDeviceTotalMem
                    0.15%  233.47us         1  233.47us  233.47us  233.47us  cudaFree
                    0.06%  95.331us         1  95.331us  95.331us  95.331us  cuDeviceGetName
                    0.02%  29.472us         1  29.472us  29.472us  29.472us  cudaLaunchKernel
                    0.01%  10.128us         3  3.3760us  2.5850us  4.8890us  cuDeviceGetCount
                    0.00%  6.5650us         1  6.5650us  6.5650us  6.5650us  cudaGetDevice
                    0.00%  5.9360us         2  2.9680us  2.3740us  3.5620us  cuDeviceGet
                    0.00%  5.1680us         1  5.1680us  5.1680us  5.1680us  cuDeviceGetPCIBusId
                    0.00%  2.7940us         1  2.7940us  2.7940us  2.7940us  cuDeviceGetUuid
livingstone$ nvprof ./jacobi1-colmaj
==25535== NVPROF is profiling process 25535, command: ./jacobi1-colmaj
==25535== Profiling application: ./jacobi1-colmaj
==25535== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:   98.83%  692.57us         1  692.57us  692.57us  692.57us  void jacobi<hpc::cuda::DeviceGeMatrixView, double, bool=1>(hpc<double>::cuda::DeviceGeMatrixView, unsigned int)
                    0.75%  5.2480us         1  5.2480us  5.2480us  5.2480us  [CUDA memcpy DtoH]
                    0.42%  2.9760us         1  2.9760us  2.9760us  2.9760us  [CUDA memcpy HtoD]
      API calls:   98.00%  140.08ms         1  140.08ms  140.08ms  140.08ms  cudaMalloc
                    0.64%  920.76us        97  9.4920us  1.6060us  332.37us  cuDeviceGetAttribute
                    0.51%  734.85us         2  367.43us  20.533us  714.32us  cudaMemcpy
                    0.47%  674.37us         1  674.37us  674.37us  674.37us  cudaGetDeviceProperties
                    0.14%  199.18us         1  199.18us  199.18us  199.18us  cudaFree
                    0.14%  196.32us         1  196.32us  196.32us  196.32us  cuDeviceTotalMem
                    0.06%  82.480us         1  82.480us  82.480us  82.480us  cuDeviceGetName
                    0.02%  28.285us         1  28.285us  28.285us  28.285us  cudaLaunchKernel
                    0.01%  8.3100us         3  2.7700us  2.1650us  3.7010us  cuDeviceGetCount
                    0.00%  6.2850us         1  6.2850us  6.2850us  6.2850us  cuDeviceGetPCIBusId
                    0.00%  5.3080us         1  5.3080us  5.3080us  5.3080us  cudaGetDevice
                    0.00%  5.1680us         2  2.5840us  2.2350us  2.9330us  cuDeviceGet
                    0.00%  2.2350us         1  2.2350us  2.2350us  2.2350us  cuDeviceGetUuid
livingstone$ 

Let us have a look at the kernel function:

template<
   template<typename> class Matrix,
   typename T,
   Require< DeviceGe<Matrix<T>>, DeviceView<Matrix<T>> > = true
>
__global__ void jacobi(Matrix<T> A, unsigned int nofiterations) {
   std::size_t i = threadIdx.x + 1;
   std::size_t j = threadIdx.y + 1;
   for (unsigned int it = 0; it < nofiterations; ++it) {
      T Aij = (A(i-1, j) + A(i, j-1) + A(i, j+1) + A(i+1, j)) / 4.0;
      __syncthreads();
      A(i, j) = Aij;
      __syncthreads();
   }
}

The index i depends on threadIdx.x, j on threadIdx.y. Within a warp, j is the same for all threads but i ranges from 0 to 31. If we have row-major, the individual threads will access distant values of A(i-1, j) etc. whereas in case of col-major the values will be stored consecutively in memory and thereby more likely within the same cache line. This means much less time will be spent with loading and storing.

Of course, we do not really require matrices on the GPU to be in col-major as we could have easily switched the indices i and j for a row-major matrix.

#include <cstddef>
#include <cmath>
#include <printf.hpp>
#include <hpc/aux/hsvcolor.hpp>
#include <hpc/cuda/check.hpp>
#include <hpc/cuda/copy.hpp>
#include <hpc/cuda/properties.hpp>
#include <hpc/matvec/for-all-indices.hpp>
#include <hpc/matvec/gematrix.hpp>
#include <hpc/matvec/matrix2pixbuf.hpp>

const auto PI = std::acos(-1.0);
const auto E = std::exp(1.0);
const auto E_POWER_MINUS_PI = std::pow(E, -PI);

using namespace hpc;

template<
   template<typename> class Matrix,
   typename T,
   Require< DeviceGe<Matrix<T>>, DeviceView<Matrix<T>> > = true
>
__global__ void jacobi(Matrix<T> A, unsigned int nofiterations) {
   std::size_t i = threadIdx.x + 1;
   std::size_t j = threadIdx.y + 1;
   for (unsigned int it = 0; it < nofiterations; ++it) {
      T Aij = (A(i-1, j) + A(i, j-1) + A(i, j+1) + A(i+1, j)) / 4.0;
      __syncthreads();
      A(i, j) = Aij;
      __syncthreads();
   }
}

template<typename T>
constexpr T int_sqrt(T n) {
   T result = 1;
   while (result * result <= n) {
      ++result;
   }
   return result - 1;
}

int main(int argc, char** argv) {
   using namespace hpc::matvec;
   using namespace hpc::cuda;
   using namespace hpc::aux;

   std::size_t max_threads = get_max_threads_per_block();
   std::size_t N = int_sqrt(max_threads);
   GeMatrix<double> A(N + 2, N + 2, Order::ColMajor);

   /* initialize the entire matrix, including its borders */
   for_all_indices(A, [&](std::size_t i, std::size_t j) -> void {
      if (j == 0) {
	 A(i, j) = std::sin(PI * ((double)i/(A.numRows()-1)));
      } else if (j == A.numCols() - 1) {
	 A(i, j) = std::sin(PI * ((double)i/(A.numRows()-1))) *
	    E_POWER_MINUS_PI;
      } else {
	 A(i, j) = 0;
      }
   });

   DeviceGeMatrix<double> devA(A.numRows(), A.numCols(), Order::ColMajor);
   copy(A, devA);
   dim3 block(N, N);
   jacobi<<<1, block>>>(devA.view(), 520);
   copy(devA, A);

   auto pixbuf = create_pixbuf(A, [](double val) -> HSVColor<double> {
      return HSVColor<double>((1-val) * 240, 1, 1);
   }, 8);
   gdk_pixbuf_save(pixbuf, "jacobi.jpg", "jpeg", nullptr,
      "quality", "100", nullptr);
}