Working with a 2-dimensional grid of 2-dimensional blocks


The Jacobi solver was very limited as just the dimensions of one block were supported. We want now to move to a more general solution (which will be completed in the next session).

Firstly, we do no longer want to initialize the matrix \(A\) on the host. Hence, we no longer need to copy the matrix from host to device as the device can perform the initialization much faster than the host.


Take the skeleton below and develop a kernel function that is responsible for the initialization of \(A\). Do this for a grid of \(2 \times 2\) blocks (see GRID_DIM in the skeleton).

Then generate the graphics which shall look as following:


#include <cmath>
#include <cstddef>
#include <memory>
#include <gdk-pixbuf/gdk-pixbuf.h>
#include <printf.hpp>
#include <hpc/aux/hsvcolor.hpp>
#include <hpc/aux/rgbcolor.hpp>
#include <hpc/cuda/check.hpp>
#include <hpc/cuda/copy.hpp>
#include <hpc/cuda/gematrix.hpp>
#include <hpc/cuda/properties.hpp>
#include <hpc/matvec/gematrix.hpp>
#include <hpc/matvec/matrix2pixbuf.hpp>

using T = double;

using namespace hpc;

   template<typename> class Matrix,
   typename T,
   Require< DeviceGe<Matrix<T>>, DeviceView<Matrix<T>> > = true
__global__ void init_matrix(Matrix<T> A) {
   /* ... TBD ... */

template<typename T>
constexpr T int_sqrt(T n) {
   T result = 1;
   while (result * result <= n) {
   return result - 1;

int main() {
   using namespace hpc::matvec;
   using namespace hpc::cuda;
   using namespace hpc::aux;

   const std::size_t max_threads = get_max_threads_per_block();
   const std::size_t BLOCK_DIM = int_sqrt(max_threads);
   const std::size_t GRID_DIM = 2;
   const std::size_t N = BLOCK_DIM * GRID_DIM;

   GeMatrix<T> A(N, N, Order::ColMajor);

   /* ... TBD ... */

   auto pixbuf = create_pixbuf(A, [](T val) -> HSVColor<double> {
      return HSVColor<T>((1-val) * 240, 1, 1);
   }, 8);
   gdk_pixbuf_save(pixbuf, "jacobi.jpg", "jpeg", nullptr,
      "quality", "100", nullptr);