Global synchronization

Content

Global synchronization
- Exercise
- Sources
Sample solution
Two-dimensional aggregation
Sample solution
Improving the two-dimensional aggregation
Sample solution

In the moment we move from a single block to multiple blocks we are no longer able to synchronize globally among all threads within the GPU. However, it is possible to synchronize at the host as by default a sequence of kernel function calls is serialized. Hence, we can move the loop for the Jacobi solver steps to the host for a global synchronization.

We are no longer able to do this on just one matrix \(A\) as this trick was based on per-block synchronization. Instead we work now with matrices \(A\) and \(B\). In the first step we move from \(A\) to \(B\), then from \(B\) to \(A\) etc. We must take care not just to initialize \(A\) but also at least the border of \(B\) (the interior of \(B\) gets initialized during the first Jacobi step).

Exercise

Develop a kernel function init_matrix_border that works like init_matrix but initializes the border only. Think about how this kernel function is to be configured. As we have just to initialize its border we do need much less threads than for initializing the entire matrix.

Test this separately by copying \(B\) back and generating a graphic for it.
Develop a kernel function jacobi_iteration that performs a single Jacobi step. Make sure that it does not change the fixed border. Invoke the kernel function within a loop on the host with at least 1941 Jacobi steps.

Sources

Program text from the last session:

session09/jacobi3.cu

#include <cmath>
#include <cstddef>
#include <memory>
#include <gdk-pixbuf/gdk-pixbuf.h>
#include <printf.hpp>
#include <hpc/aux/hsvcolor.hpp>
#include <hpc/aux/rgbcolor.hpp>
#include <hpc/cuda/check.hpp>
#include <hpc/cuda/copy.hpp>
#include <hpc/cuda/gematrix.hpp>
#include <hpc/cuda/properties.hpp>
#include <hpc/matvec/gematrix.hpp>
#include <hpc/matvec/matrix2pixbuf.hpp>

using T = double;

using namespace hpc;

template<
   template<typename> class Matrix,
   typename T,
   Require< DeviceGe<Matrix<T>>, DeviceView<Matrix<T>> > = true
>
__global__ void init_matrix(Matrix<T> A) {
   std::size_t i = threadIdx.x + blockIdx.x * blockDim.x;
   std::size_t j = threadIdx.y + blockIdx.y * blockDim.y;

   const auto PI = std::acos(-T(1.0));
   const auto E = std::exp(T(1.0));
   const auto E_POWER_MINUS_PI = std::pow(E, -PI);

   std::size_t N = A.numRows();
   T value;
   if (j == N-1) {
      value = std::sin(PI * (T(i)/N)) * E_POWER_MINUS_PI;
   } else if (j == 0) {
      value = std::sin(PI * (T(i)/N));
   } else {
      value = 0;
   }
   A(i, j) = value;
}

template<typename T>
constexpr T int_sqrt(T n) {
   T result = 1;
   while (result * result <= n) {
      ++result;
   }
   return result - 1;
}

int main() {
   using namespace hpc::matvec;
   using namespace hpc::cuda;
   using namespace hpc::aux;

   const std::size_t max_threads = get_max_threads_per_block();
   const std::size_t BLOCK_DIM = int_sqrt(max_threads);
   const std::size_t GRID_DIM = 2;
   const std::size_t N = BLOCK_DIM * GRID_DIM;

   GeMatrix<T> A(N, N, Order::ColMajor);
   DeviceGeMatrix<T> devA(A.numRows(), A.numCols(), Order::ColMajor);
   copy(A, devA);
   dim3 block_dim(BLOCK_DIM, BLOCK_DIM);
   dim3 grid_dim(GRID_DIM, GRID_DIM);
   init_matrix<<<grid_dim, block_dim>>>(devA.view());
   copy(devA, A);

   auto pixbuf = create_pixbuf(A, [](T val) -> HSVColor<T> {
      return HSVColor<T>((1-val) * 240, 1, 1);
   }, 8);
   gdk_pixbuf_save(pixbuf, "jacobi.jpg", "jpeg", nullptr,
      "quality", "100", nullptr);
}

Generic Makefile for this session:

session10/Makefile

CudaSources := $(wildcard *.cu)
CudaObjects := $(patsubst %.cu,%.o,$(CudaSources))
Targets := $(patsubst %.cu,%,$(CudaSources))
STD := -std=c++14
CPPFLAGS := $(STD) -I. -I/home/numerik/pub/pp/ss19/lib
NVCCFLAGS := --expt-extended-lambda
NVCC := nvcc
CXX := $(NVCC)
CC := $(NVCC)
CXXFLAGS := $(shell pkg-config --cflags gdk-pixbuf-2.0 | sed 's/-pthread //')
LDLIBS := $(shell pkg-config --libs gdk-pixbuf-2.0 | sed 's/-pthread //; s/-Wl/-Xlinker /g')

.PHONY:	all
all:	$(Targets)

$(CudaObjects):	%.o: %.cu
		$(NVCC) -c $(CPPFLAGS) $(CXXFLAGS) $(NVCCFLAGS) $<

$(Targets): %: %.o
		$(NVCC) -w -o $@ $(STD) $(NVCCFLAGS) $< $(LDLIBS)

GCCSources := $(patsubst %,-x c++ %,$(CudaSources))
.PHONY: depend
depend:
		gcc-makedepend $(CPPFLAGS) -D__CUDACC__ $(GCCSources)

.PHONY:	clean
clean:
		rm -f $(Targets) $(CudaObjects)
# DO NOT DELETE