Global synchronization

Content

The moment we move from a single block to multiple blocks, we are no longer able to synchronize globally among all threads within the GPU. However, it is possible to synchronize at the host, since by default a sequence of kernel function calls is serialized. Hence, we can move the loop for the Jacobi solver steps to the host to obtain a global synchronization.

We are no longer able to do this on just one matrix \(A\) as this trick was based on per-block synchronization. Instead we work now with matrices \(A\) and \(B\). In the first step we move from \(A\) to \(B\), then from \(B\) to \(A\) etc. We must take care not just to initialize \(A\) but also at least the border of \(B\) (the interior of \(B\) gets initialized during the first Jacobi step).

Exercise

Sources

Program text from the last session:

#include <cmath>
#include <cstddef>
#include <memory>
#include <gdk-pixbuf/gdk-pixbuf.h>
#include <printf.hpp>
#include <hpc/aux/hsvcolor.hpp>
#include <hpc/aux/rgbcolor.hpp>
#include <hpc/cuda/check.hpp>
#include <hpc/cuda/copy.hpp>
#include <hpc/cuda/gematrix.hpp>
#include <hpc/cuda/properties.hpp>
#include <hpc/matvec/gematrix.hpp>
#include <hpc/matvec/matrix2pixbuf.hpp>

using T = double;

using namespace hpc;

template<
   template<typename> class Matrix,
   typename T,
   Require< DeviceGe<Matrix<T>>, DeviceView<Matrix<T>> > = true
>
/* kernel that initializes the boundary values of A for the Jacobi solver:
   sin(pi * i/N) on the j == 0 edge, the same curve damped by e^{-pi} on the
   j == N-1 edge, and zero in the interior; one thread per matrix element */
__global__ void init_matrix(Matrix<T> A) {
   std::size_t i = threadIdx.x + blockIdx.x * blockDim.x;
   std::size_t j = threadIdx.y + blockIdx.y * blockDim.y;

   /* compute pi in the working precision T instead of mixing in the
      double macro M_PI, which would promote float computations */
   const auto PI = std::acos(-T(1.0));
   /* e^{-pi}, computed directly rather than via pow(e, -pi) */
   const auto E_POWER_MINUS_PI = std::exp(-PI);

   std::size_t N = A.numRows(); /* assumes a square matrix — TODO confirm */
   if (i >= N || j >= N) return; /* guard against an over-sized grid */
   T value;
   if (j == N-1) {
      value = std::sin(PI * (static_cast<T>(i)/N)) * E_POWER_MINUS_PI;
   } else if (j == 0) {
      value = std::sin(PI * (static_cast<T>(i)/N));
   } else {
      value = 0;
   }
   A(i, j) = value;
}

/* integer square root: largest r such that r*r <= n (0 for n <= 0),
   found by a simple linear search; usable in constant expressions */
template<typename T>
constexpr T int_sqrt(T n) {
   T r = 0;
   while ((r + 1) * (r + 1) <= n) {
      ++r;
   }
   return r;
}

int main() {
   using namespace hpc::matvec;
   using namespace hpc::cuda;
   using namespace hpc::aux;

   const std::size_t max_threads = get_max_threads_per_block();
   const std::size_t BLOCK_DIM = int_sqrt(max_threads);
   const std::size_t GRID_DIM = 2;
   const std::size_t N = BLOCK_DIM * GRID_DIM;

   GeMatrix<T> A(N, N, Order::ColMajor);
   DeviceGeMatrix<T> devA(A.numRows(), A.numCols(), Order::ColMajor);
   copy(A, devA);
   dim3 block_dim(BLOCK_DIM, BLOCK_DIM);
   dim3 grid_dim(GRID_DIM, GRID_DIM);
   init_matrix<<<grid_dim, block_dim>>>(devA.view());
   copy(devA, A);

   auto pixbuf = create_pixbuf(A, [](T val) -> HSVColor<double> {
      return HSVColor<T>((1-val) * 240, 1, 1);
   }, 8);
   gdk_pixbuf_save(pixbuf, "jacobi.jpg", "jpeg", nullptr,
      "quality", "100", nullptr);
}

Generic Makefile for this session:

# Build every CUDA source (*.cu) in this directory into a like-named
# executable, using nvcc both for compiling and for linking.
CudaSources := $(wildcard *.cu)
CudaObjects := $(patsubst %.cu,%.o,$(CudaSources))
Targets := $(patsubst %.cu,%,$(CudaSources))
STD := -std=c++14
CPPFLAGS := $(STD) -I. -I/usr/local/hpc/session30
NVCCFLAGS := --expt-extended-lambda
NVCC := nvcc
# route the implicit C/C++ rules through nvcc as well
CXX := $(NVCC)
CC := $(NVCC)
# gdk-pixbuf flags via pkg-config; -pthread is stripped because nvcc
# does not accept it, and -Wl,... is rewritten to nvcc's -Xlinker form
CXXFLAGS := $(shell pkg-config --cflags gdk-pixbuf-2.0 | sed 's/-pthread //')
LDLIBS := $(shell pkg-config --libs gdk-pixbuf-2.0 | sed 's/-pthread //; s/-Wl/-Xlinker /g')

.PHONY:	all
all:	$(Targets)

# static pattern rule: compile each .cu into its .o
$(CudaObjects):	%.o: %.cu
		$(NVCC) -c $(CPPFLAGS) $(CXXFLAGS) $(NVCCFLAGS) $<

# link each object into its executable
$(Targets): %: %.o
		$(NVCC) -w -o $@ $(STD) $(NVCCFLAGS) $< $(LDLIBS)

# regenerate header dependencies, treating .cu files as C++
GCCSources := $(patsubst %,-x c++ %,$(CudaSources))
.PHONY: depend
depend:
		gcc-makedepend $(CPPFLAGS) -D__CUDACC__ $(GCCSources)

.PHONY:	clean
clean:
		rm -f $(Targets) $(CudaObjects)
# DO NOT DELETE