Simple Jacobi solver for the GPU

Content

We revisit now the Jacobi solver for the GPU. We start with a very simple approach where we operate with one block on the GPU only. This easies synchronization as the threads of a block can easily use __sync_threads() to keep in sync.

For reasons of simplicity we work initially with a fixed number of iterations. We will see later how this can be improved.

Exercise

Develop a kernel function for the Jacobi solver that operates on one block only. The number of iterations is to be passed as parameter. Each thread shall operate only on one \(A_{i,j}\).

Try to solve this with one matrix only. This can indeed be done with proper synchronization. Make sure that the kernel operates on the inner part of \(A\) only.

Think about you access the matrix \(A\) within the kernel function. Which approach is more cache-friendly for the GPU? Or, alternatively, try both variants of matrix storage organisation. Compare the times of both variants.

You can simply profile your application using the nvprof utility, i.e. invoke nvprof ./jacobi1 instead of simply ./jacobi1. Look for the first section titled GPU activities where you will find void jacobi... – interesting is the total time and the average time (Avg).

Explain the difference. Consider that within a warp threadIdx.y is identical for all threads but the values of threadIdx.x are numbered consecutively.

The lecture library is available at /home/numerik/pub/pp/ss19/lib.

Skeleton

#include <cassert>
#include <cmath>
#include <printf.hpp>
#include <hpc/aux/hsvcolor.hpp>
#include <hpc/cuda/check.hpp>
#include <hpc/cuda/copy.hpp>
#include <hpc/cuda/properties.hpp>
#include <hpc/matvec/for-all-indices.hpp>
#include <hpc/matvec/gematrix.hpp>
#include <hpc/matvec/matrix2pixbuf.hpp>

template<typename T>
const T PI = std::acos(T(-1.0));
template<typename T>
const T E = std::exp(T(1.0));
template<typename T>
const T E_POWER_MINUS_PI = std::pow(E<T>, -PI<T>);

using namespace hpc;

template<
   template<typename> class Matrix,
   typename T,
   Require< DeviceGe<Matrix<T>>, DeviceView<Matrix<T>> > = true
>
__global__ void jacobi(Matrix<T> A, unsigned int nofiterations) {
   /* to be done */
}

template<typename T>
constexpr T int_sqrt(T n) {
   T result = 1;
   while (result * result <= n) {
      ++result;
   }
   return result - 1;
}

int main(int argc, char** argv) {
   using namespace hpc::aux;
   using namespace hpc::cuda;
   using namespace hpc::matvec;

   using T = double;
   std::size_t max_threads = get_max_threads_per_block();
   std::size_t N = int_sqrt(max_threads);
   GeMatrix<T> A(N + 2, N + 2, Order::RowMajor);

   /* initialize the entire matrix, including its borders */
   for_all_indices(A, [&](std::size_t i, std::size_t j) -> void {
      if (j == 0) {
	 A(i, j) = std::sin(PI<T> * (T(i)/(A.numRows()-1)));
      } else if (j == A.numCols() - 1) {
	 A(i, j) = std::sin(PI<T> * (T(i)/(A.numRows()-1))) *
	    E_POWER_MINUS_PI<T>;
      } else {
	 A(i, j) = 0;
      }
   });

   DeviceGeMatrix<T> devA(A.numRows(), A.numCols(), Order::RowMajor);
   copy(A, devA);
   dim3 block(N, N);
   jacobi<<<1, block>>>(devA.view(), 520);
   copy(devA, A);

   auto pixbuf = create_pixbuf(A, [](T val) -> HSVColor<T> {
      return HSVColor<T>((1-val) * 240, 1, 1);
   }, 8);
   gdk_pixbuf_save(pixbuf, "jacobi.jpg", "jpeg", nullptr,
      "quality", "100", nullptr);
}

Makefile

CudaSources := $(wildcard *.cu)
CudaObjects := $(patsubst %.cu,%.o,$(CudaSources))
Targets := $(patsubst %.cu,%,$(CudaSources))
STD := -std=c++14
CPPFLAGS := $(STD) -I/home/numerik/pub/pp/ss19/lib
NVCCFLAGS :=
NVCC := nvcc
CXX := $(NVCC)
CC := $(NVCC)
CXXFLAGS := $(shell pkg-config --cflags gdk-pixbuf-2.0 | sed 's/-pthread //')
LDLIBS := $(shell pkg-config --libs gdk-pixbuf-2.0 | sed 's/-pthread //; s/-Wl/-Xlinker /g')

.PHONY:	all
all:	$(Targets)

$(CudaObjects):	%.o: %.cu
		$(NVCC) -c $(CPPFLAGS) $(CXXFLAGS) $(NVCCFLAGS) $<

$(Targets): %: %.o
		$(NVCC) -w -o $@ $(STD) $(NVCCFLAGS) $< $(LDLIBS)

GCCSources := $(patsubst %,-x c++ %,$(CudaSources))
.PHONY: depend
depend:
		gcc-makedepend $(CPPFLAGS) -D__CUDACC__ $(GCCSources)

.PHONY:	clean
clean:
		rm -f $(Targets) $(CudaObjects)
# DO NOT DELETE
jacobi1-colmaj.o: jacobi1-colmaj.cu \
 /home/numerik/pub/pp/ss19/lib/hpc/aux/hsvcolor.hpp \
 /home/numerik/pub/pp/ss19/lib/hpc/aux/rgbcolor.hpp \
 /home/numerik/pub/pp/ss19/lib/hpc/cuda/hostdev.hpp \
 /home/numerik/pub/pp/ss19/lib/hpc/cuda/check.hpp \
 /home/numerik/pub/pp/ss19/lib/hpc/cuda/copy.hpp \
 /home/numerik/pub/pp/ss19/lib/hpc/cuda/gematrix.hpp \
 /home/numerik/pub/pp/ss19/lib/hpc/cuda/densevector.hpp \
 /home/numerik/pub/pp/ss19/lib/hpc/cuda/mixin.hpp \
 /home/numerik/pub/pp/ss19/lib/hpc/cuda/buffer.hpp \
 /home/numerik/pub/pp/ss19/lib/hpc/aux/cache-line.hpp \
 /home/numerik/pub/pp/ss19/lib/hpc/cuda/views.hpp \
 /home/numerik/pub/pp/ss19/lib/hpc/cuda/traits.hpp \
 /home/numerik/pub/pp/ss19/lib/hpc/matvec/traits.hpp \
 /home/numerik/pub/pp/ss19/lib/hpc/matvec/views.hpp \
 /home/numerik/pub/pp/ss19/lib/hpc/matvec/storage/mixin.hpp \
 /home/numerik/pub/pp/ss19/lib/hpc/tools/buffer.hpp \
 /home/numerik/pub/pp/ss19/lib/hpc/matvec/gematrix.hpp \
 /home/numerik/pub/pp/ss19/lib/hpc/matvec/densevector.hpp \
 /home/numerik/pub/pp/ss19/lib/hpc/cuda/properties.hpp \
 /home/numerik/pub/pp/ss19/lib/hpc/matvec/for-all-indices.hpp \
 /home/numerik/pub/pp/ss19/lib/hpc/matvec/matrix2pixbuf.hpp
jacobi0.o: jacobi0.cu /home/numerik/pub/pp/ss19/lib/hpc/aux/hsvcolor.hpp \
 /home/numerik/pub/pp/ss19/lib/hpc/aux/rgbcolor.hpp \
 /home/numerik/pub/pp/ss19/lib/hpc/cuda/hostdev.hpp \
 /home/numerik/pub/pp/ss19/lib/hpc/cuda/check.hpp \
 /home/numerik/pub/pp/ss19/lib/hpc/cuda/copy.hpp \
 /home/numerik/pub/pp/ss19/lib/hpc/cuda/gematrix.hpp \
 /home/numerik/pub/pp/ss19/lib/hpc/cuda/densevector.hpp \
 /home/numerik/pub/pp/ss19/lib/hpc/cuda/mixin.hpp \
 /home/numerik/pub/pp/ss19/lib/hpc/cuda/buffer.hpp \
 /home/numerik/pub/pp/ss19/lib/hpc/aux/cache-line.hpp \
 /home/numerik/pub/pp/ss19/lib/hpc/cuda/views.hpp \
 /home/numerik/pub/pp/ss19/lib/hpc/cuda/traits.hpp \
 /home/numerik/pub/pp/ss19/lib/hpc/matvec/traits.hpp \
 /home/numerik/pub/pp/ss19/lib/hpc/matvec/views.hpp \
 /home/numerik/pub/pp/ss19/lib/hpc/matvec/storage/mixin.hpp \
 /home/numerik/pub/pp/ss19/lib/hpc/tools/buffer.hpp \
 /home/numerik/pub/pp/ss19/lib/hpc/matvec/gematrix.hpp \
 /home/numerik/pub/pp/ss19/lib/hpc/matvec/densevector.hpp \
 /home/numerik/pub/pp/ss19/lib/hpc/cuda/properties.hpp \
 /home/numerik/pub/pp/ss19/lib/hpc/matvec/for-all-indices.hpp \
 /home/numerik/pub/pp/ss19/lib/hpc/matvec/matrix2pixbuf.hpp
jacobi1-rowmaj.o: jacobi1-rowmaj.cu \
 /home/numerik/pub/pp/ss19/lib/hpc/aux/hsvcolor.hpp \
 /home/numerik/pub/pp/ss19/lib/hpc/aux/rgbcolor.hpp \
 /home/numerik/pub/pp/ss19/lib/hpc/cuda/hostdev.hpp \
 /home/numerik/pub/pp/ss19/lib/hpc/cuda/check.hpp \
 /home/numerik/pub/pp/ss19/lib/hpc/cuda/copy.hpp \
 /home/numerik/pub/pp/ss19/lib/hpc/cuda/gematrix.hpp \
 /home/numerik/pub/pp/ss19/lib/hpc/cuda/densevector.hpp \
 /home/numerik/pub/pp/ss19/lib/hpc/cuda/mixin.hpp \
 /home/numerik/pub/pp/ss19/lib/hpc/cuda/buffer.hpp \
 /home/numerik/pub/pp/ss19/lib/hpc/aux/cache-line.hpp \
 /home/numerik/pub/pp/ss19/lib/hpc/cuda/views.hpp \
 /home/numerik/pub/pp/ss19/lib/hpc/cuda/traits.hpp \
 /home/numerik/pub/pp/ss19/lib/hpc/matvec/traits.hpp \
 /home/numerik/pub/pp/ss19/lib/hpc/matvec/views.hpp \
 /home/numerik/pub/pp/ss19/lib/hpc/matvec/storage/mixin.hpp \
 /home/numerik/pub/pp/ss19/lib/hpc/tools/buffer.hpp \
 /home/numerik/pub/pp/ss19/lib/hpc/matvec/gematrix.hpp \
 /home/numerik/pub/pp/ss19/lib/hpc/matvec/densevector.hpp \
 /home/numerik/pub/pp/ss19/lib/hpc/cuda/properties.hpp \
 /home/numerik/pub/pp/ss19/lib/hpc/matvec/for-all-indices.hpp \
 /home/numerik/pub/pp/ss19/lib/hpc/matvec/matrix2pixbuf.hpp
jacobi3.o: jacobi3.cu /home/numerik/pub/pp/ss19/lib/hpc/aux/hsvcolor.hpp \
 /home/numerik/pub/pp/ss19/lib/hpc/aux/rgbcolor.hpp \
 /home/numerik/pub/pp/ss19/lib/hpc/cuda/hostdev.hpp \
 /home/numerik/pub/pp/ss19/lib/hpc/cuda/check.hpp \
 /home/numerik/pub/pp/ss19/lib/hpc/cuda/copy.hpp \
 /home/numerik/pub/pp/ss19/lib/hpc/cuda/gematrix.hpp \
 /home/numerik/pub/pp/ss19/lib/hpc/cuda/densevector.hpp \
 /home/numerik/pub/pp/ss19/lib/hpc/cuda/mixin.hpp \
 /home/numerik/pub/pp/ss19/lib/hpc/cuda/buffer.hpp \
 /home/numerik/pub/pp/ss19/lib/hpc/aux/cache-line.hpp \
 /home/numerik/pub/pp/ss19/lib/hpc/cuda/views.hpp \
 /home/numerik/pub/pp/ss19/lib/hpc/cuda/traits.hpp \
 /home/numerik/pub/pp/ss19/lib/hpc/matvec/traits.hpp \
 /home/numerik/pub/pp/ss19/lib/hpc/matvec/views.hpp \
 /home/numerik/pub/pp/ss19/lib/hpc/matvec/storage/mixin.hpp \
 /home/numerik/pub/pp/ss19/lib/hpc/tools/buffer.hpp \
 /home/numerik/pub/pp/ss19/lib/hpc/matvec/gematrix.hpp \
 /home/numerik/pub/pp/ss19/lib/hpc/matvec/densevector.hpp \
 /home/numerik/pub/pp/ss19/lib/hpc/cuda/properties.hpp \
 /home/numerik/pub/pp/ss19/lib/hpc/matvec/matrix2pixbuf.hpp
jacobi2.o: jacobi2.cu /home/numerik/pub/pp/ss19/lib/hpc/aux/hsvcolor.hpp \
 /home/numerik/pub/pp/ss19/lib/hpc/aux/rgbcolor.hpp \
 /home/numerik/pub/pp/ss19/lib/hpc/cuda/hostdev.hpp \
 /home/numerik/pub/pp/ss19/lib/hpc/cuda/check.hpp \
 /home/numerik/pub/pp/ss19/lib/hpc/cuda/copy.hpp \
 /home/numerik/pub/pp/ss19/lib/hpc/cuda/gematrix.hpp \
 /home/numerik/pub/pp/ss19/lib/hpc/cuda/densevector.hpp \
 /home/numerik/pub/pp/ss19/lib/hpc/cuda/mixin.hpp \
 /home/numerik/pub/pp/ss19/lib/hpc/cuda/buffer.hpp \
 /home/numerik/pub/pp/ss19/lib/hpc/aux/cache-line.hpp \
 /home/numerik/pub/pp/ss19/lib/hpc/cuda/views.hpp \
 /home/numerik/pub/pp/ss19/lib/hpc/cuda/traits.hpp \
 /home/numerik/pub/pp/ss19/lib/hpc/matvec/traits.hpp \
 /home/numerik/pub/pp/ss19/lib/hpc/matvec/views.hpp \
 /home/numerik/pub/pp/ss19/lib/hpc/matvec/storage/mixin.hpp \
 /home/numerik/pub/pp/ss19/lib/hpc/tools/buffer.hpp \
 /home/numerik/pub/pp/ss19/lib/hpc/matvec/gematrix.hpp \
 /home/numerik/pub/pp/ss19/lib/hpc/matvec/densevector.hpp \
 /home/numerik/pub/pp/ss19/lib/hpc/cuda/properties.hpp \
 /home/numerik/pub/pp/ss19/lib/hpc/matvec/matrix2pixbuf.hpp