Simple Jacobi solver for the GPU


We revisit now the Jacobi solver (see sessions 24 and 25) for the GPU. We start with a very simple approach where we operate with one block on the GPU only. This eases synchronization, as the threads of a single block can use __syncthreads() to keep in sync.

For reasons of simplicity we work initially with a fixed number of iterations. We will see later how this can be improved.


Develop a kernel function for the Jacobi solver that operates on one block only. The number of iterations is to be passed as parameter. Each thread shall operate only on one \(A_{i,j}\).

Try to solve this with one matrix only. This can indeed be done with proper synchronization. Make sure that the kernel operates on the inner part of \(A\) only.

Think about how you access the matrix \(A\) within the kernel function. Which approach is more cache-friendly for the GPU? Or, alternatively, try both variants of matrix storage organisation. Compare the times of both variants.

You can simply profile your application using the nvprof utility, i.e. invoke nvprof ./jacobi1 instead of simply ./jacobi1. Look for the first section titled GPU activities where you will find void jacobi... – interesting is the total time and the average time (Avg).

Explain the difference. Consider that within a warp threadIdx.y is identical for all threads but the values of threadIdx.x are numbered consecutively.

The lecture library is available at /home/numerik/pub/hpc/ws19/session29.


#include <cassert>
#include <cmath>
#include <printf.hpp>
#include <hpc/aux/hsvcolor.hpp>
#include <hpc/cuda/check.hpp>
#include <hpc/cuda/copy.hpp>
#include <hpc/cuda/properties.hpp>
#include <hpc/matvec/for-all-indices.hpp>
#include <hpc/matvec/gematrix.hpp>
#include <hpc/matvec/matrix2pixbuf.hpp>

/* mathematical constants used for the boundary conditions of the
   Jacobi problem: sin(pi x) on one border, sin(pi x) * e^{-pi} on
   the opposite one */
const auto PI = std::acos(-1.0);
const auto E = std::exp(1.0);
const auto E_POWER_MINUS_PI = std::pow(E, -PI); /* e^{-pi} */

using namespace hpc;

   template<typename> class Matrix,
   typename T,
   Require< DeviceGe<Matrix<T>>, DeviceView<Matrix<T>> > = true
__global__ void jacobi(Matrix<T> A, unsigned int nofiterations) {
   /* to be done */

/* Returns the integer square root of n, i.e. the largest value r
 * with r*r <= n.  Used below to determine the largest quadratic
 * N x N thread block that still fits within the maximal number of
 * threads per block.
 *
 * Note: the original loop body lost its increment, which would have
 * caused an endless loop; restored here.  O(sqrt(n)) is perfectly
 * adequate for the small values involved.
 */
template<typename T>
constexpr T int_sqrt(T n) {
   T result = 1;
   while (result * result <= n) {
      ++result;
   }
   return result - 1;
}

int main(int argc, char** argv) {
   using namespace hpc::aux;
   using namespace hpc::cuda;
   using namespace hpc::matvec;

   std::size_t max_threads = get_max_threads_per_block();
   std::size_t N = int_sqrt(max_threads);
   GeMatrix<double> A(N + 2, N + 2, Order::RowMajor);

   /* initialize the entire matrix, including its borders */
   for_all_indices(A, [&](std::size_t i, std::size_t j) -> void {
      if (j == 0) {
	 A(i, j) = std::sin(PI * ((double)i/(A.numRows()-1)));
      } else if (j == A.numCols() - 1) {
	 A(i, j) = std::sin(PI * ((double)i/(A.numRows()-1))) *
      } else {
	 A(i, j) = 0;

   DeviceGeMatrix<double> devA(A.numRows(), A.numCols(), Order::RowMajor);
   copy(A, devA);
   dim3 block(N, N);
   jacobi<<<1, block>>>(devA.view(), 520);
   copy(devA, A);

   auto pixbuf = create_pixbuf(A, [](double val) -> HSVColor<double> {
      return HSVColor<double>((1-val) * 240, 1, 1);
   }, 8);
   gdk_pixbuf_save(pixbuf, "jacobi.jpg", "jpeg", nullptr,
      "quality", "100", nullptr);


A generic Makefile is provided at /home/numerik/pub/hpc/ws19/session29/misc/Makefile:

CudaSources := $(wildcard *.cu)
# patsubst requires a source pattern as its first argument
CudaObjects := $(patsubst %.cu,%.o,$(CudaSources))
Targets := $(patsubst %.cu,%,$(CudaSources))
STD := -std=c++14
CPPFLAGS := $(STD) -I/home/numerik/pub/hpc/ws19/session29
NVCC := nvcc
CXX := $(NVCC)
CC := $(NVCC)
CXXFLAGS := $(shell pkg-config --cflags gdk-pixbuf-2.0 | sed 's/-pthread //')
LDLIBS := $(shell pkg-config --libs gdk-pixbuf-2.0 | sed 's/-pthread //; s/-Wl/-Xlinker /g')

.PHONY:	all
all:	$(Targets)

# compile each CUDA source file into its object file
$(CudaObjects):	%.o:	%.cu
		$(NVCC) -w -c $(CPPFLAGS) $(CXXFLAGS) $<

# link each object file into its executable
$(Targets): %: %.o
		$(NVCC) -w -o $@ $(STD) $(NVCCFLAGS) $< $(LDLIBS)

# regenerate header dependencies (treat .cu files as C++ for gcc)
GCCSources := $(patsubst %,-x c++ %,$(CudaSources))
.PHONY: depend
depend:
		gcc-makedepend $(CPPFLAGS) -D__CUDACC__ $(GCCSources)

.PHONY:	clean
clean:
		rm -f $(Targets) $(CudaObjects)