Global synchronization


In the moment we move from a single block to multiple blocks we are no longer able to synchronize globally among all threads within the GPU. However, it is possible to synchronize at the host as by default a sequence of kernel function calls is serialized. Hence, we can move the loop for the Jacobi solver steps to the host for a global synchronization.

We are no longer able to do this on just one matrix \(A\) as this trick was based on per-block synchronization. Instead we work now with matrices \(A\) and \(B\). In the first step we move from \(A\) to \(B\), then from \(B\) to \(A\) etc. We must take care not just to initialize \(A\) but also at least the border of \(B\) (the interior of \(B\) gets initialized during the first Jacobi step).



Program text from the last session:

#include <cmath>
#include <cstddef>
#include <memory>
#include <gdk-pixbuf/gdk-pixbuf.h>
#include <printf.hpp>
#include <hpc/aux/hsvcolor.hpp>
#include <hpc/aux/rgbcolor.hpp>
#include <hpc/cuda/check.hpp>
#include <hpc/cuda/copy.hpp>
#include <hpc/cuda/gematrix.hpp>
#include <hpc/cuda/properties.hpp>
#include <hpc/matvec/gematrix.hpp>
#include <hpc/matvec/matrix2pixbuf.hpp>

using T = double;

using namespace hpc;

   template<typename> class Matrix,
   typename T,
   Require< DeviceGe<Matrix<T>>, DeviceView<Matrix<T>> > = true
__global__ void init_matrix(Matrix<T> A) {
   std::size_t i = threadIdx.x + blockIdx.x * blockDim.x;
   std::size_t j = threadIdx.y + blockIdx.y * blockDim.y;

   const auto PI = std::acos(-T(1.0));
   const auto E = std::exp(T(1.0));
   const auto E_POWER_MINUS_PI = std::pow(E, -PI);

   std::size_t N = A.numRows();
   T value;
   if (j == N-1) {
      value = std::sin(M_PI * (static_cast<T>(i)/N)) * E_POWER_MINUS_PI;
   } else if (j == 0) {
      value = std::sin(M_PI * (static_cast<T>(i)/N));
   } else {
      value = 0;
   A(i, j) = value;

template<typename T>
constexpr T int_sqrt(T n) {
   T result = 1;
   while (result * result <= n) {
   return result - 1;

int main() {
   using namespace hpc::matvec;
   using namespace hpc::cuda;
   using namespace hpc::aux;

   const std::size_t max_threads = get_max_threads_per_block();
   const std::size_t BLOCK_DIM = int_sqrt(max_threads);
   const std::size_t GRID_DIM = 2;
   const std::size_t N = BLOCK_DIM * GRID_DIM;

   GeMatrix<T> A(N, N, Order::ColMajor);
   DeviceGeMatrix<T> devA(A.numRows(), A.numCols(), Order::ColMajor);
   copy(A, devA);
   dim3 block_dim(BLOCK_DIM, BLOCK_DIM);
   dim3 grid_dim(GRID_DIM, GRID_DIM);
   init_matrix<<<grid_dim, block_dim>>>(devA.view());
   copy(devA, A);

   auto pixbuf = create_pixbuf(A, [](T val) -> HSVColor<double> {
      return HSVColor<T>((1-val) * 240, 1, 1);
   }, 8);
   gdk_pixbuf_save(pixbuf, "jacobi.jpg", "jpeg", nullptr,
      "quality", "100", nullptr);

A generic Makefile is provided at /home/numerik/pub/hpc/ws19/session30/misc/Makefile:

CudaSources := $(wildcard *.cu)
CudaObjects := $(patsubst,%.o,$(CudaSources))
Targets := $(patsubst,%,$(CudaSources))
STD := -std=c++14
ifdef EXTRA
CPPFLAGS := $(STD) -I$(EXTRA) -I/home/numerik/pub/hpc/ws19/session30
CPPFLAGS := $(STD) -I/home/numerik/pub/hpc/ws19/session30
NVCCFLAGS := --expt-extended-lambda
NVCC := nvcc
CXX := $(NVCC)
CC := $(NVCC)
CXXFLAGS := $(shell pkg-config --cflags gdk-pixbuf-2.0 | sed 's/-pthread //')
LDLIBS := $(shell pkg-config --libs gdk-pixbuf-2.0 | sed 's/-pthread //; s/-Wl/-Xlinker /g')

.PHONY:	all
all:	$(Targets)

$(CudaObjects):	%.o:

$(Targets): %: %.o
		$(NVCC) -w -o $@ $(STD) $(NVCCFLAGS) $< $(LDLIBS)

GCCSources := $(patsubst %,-x c++ %,$(CudaSources))
.PHONY: depend
		gcc-makedepend $(CPPFLAGS) -D__CUDACC__ $(GCCSources)

.PHONY:	clean
		rm -f $(Targets) $(CudaObjects)

This Makefile supports an extra subdirectory with headers if the EXTRA macro is specified at the command line. If you have, for example, a local subdirectory named hpc with header files which is to be visible within #include <hpc/...>, you should make as follows: make EXTRA=. depend; make EXTRA=. all. Likewise, if you have a subdirectory named step01/hpc, you should invoke make EXTRA=step01 depend; make EXTRA=step01 all. You do not need to specify the EXTRA parameter unless you have such a subdirectory with header files.

As before, all programs are to be tested on livingstone.