Sample solution

Contents: the CUDA source of the Jacobi solver, followed by an nvprof profiling transcript of a sample run.

#include <cmath>
#include <cstddef>
#include <memory>
#include <gdk-pixbuf/gdk-pixbuf.h>
#include <printf.hpp>
#include <hpc/aux/hsvcolor.hpp>
#include <hpc/aux/rgbcolor.hpp>
#include <hpc/cuda/check.hpp>
#include <hpc/cuda/copy.hpp>
#include <hpc/cuda/gematrix.hpp>
#include <hpc/cuda/properties.hpp>
#include <hpc/matvec/gematrix.hpp>
#include <hpc/matvec/matrix2pixbuf.hpp>

using T = double;

using namespace hpc;

template<
   template<typename> class Matrix,
   typename T,
   Require< DeviceGe<Matrix<T>>, DeviceView<Matrix<T>> > = true
>
/* Initialize the full matrix for the Jacobi problem: the first and last
   columns (j == 0 and j == N-1) get the sine boundary values, everything
   else is zeroed.  Expects a 2-D launch whose grid covers at least
   numRows() x numCols() threads; excess threads return early. */
__global__ void init_matrix(Matrix<T> A) {
   std::size_t i = threadIdx.x + blockIdx.x * blockDim.x;
   std::size_t j = threadIdx.y + blockIdx.y * blockDim.y;

   std::size_t N = A.numRows();
   /* guard: the launch configuration may overshoot the matrix extents */
   if (i >= N || j >= A.numCols()) return;

   /* compute the constants in the precision of T; the original mixed the
      double macro M_PI into otherwise T-typed expressions */
   const auto PI = std::acos(-T(1.0));
   const auto E_POWER_MINUS_PI = std::exp(-PI);  /* == pow(e, -pi) */

   /* NOTE(review): j (a column index) is compared against numRows();
      this assumes a square matrix, which main() guarantees here */
   T value;
   if (j == N-1) {
      value = std::sin(PI * (T(i)/N)) * E_POWER_MINUS_PI;
   } else if (j == 0) {
      value = std::sin(PI * (T(i)/N));
   } else {
      value = 0;
   }
   A(i, j) = value;
}

template<
   template<typename> class Matrix,
   typename T,
   Require< DeviceGe<Matrix<T>>, DeviceView<Matrix<T>> > = true
>
/* Initialize just the border of A: sine boundary values in the first and
   last columns, zeros on the remaining edge cells of the first and last
   rows.  Expects a 1-D launch with at least numRows() threads in x;
   excess threads return early. */
__global__ void init_matrix_border(Matrix<T> A) {
   std::size_t index = threadIdx.x + blockIdx.x * blockDim.x;

   std::size_t N = A.numRows();
   /* guard: without it, an overshooting grid writes out of bounds */
   if (index >= N) return;

   /* constants in the precision of T (was: double M_PI mixed in) */
   const auto PI = std::acos(-T(1.0));
   const auto E_POWER_MINUS_PI = std::exp(-PI);  /* == pow(e, -pi) */

   A(index, N-1) = std::sin(PI * (T(index)/N)) *
      E_POWER_MINUS_PI;
   A(index, 0) = std::sin(PI * (T(index)/N));
   /* corners already covered by the column writes above */
   if (index > 0 && index < N-1) {
      A(0, index) = 0; A(N-1, index) = 0;
   }
}

template<
   template<typename> class MatrixA,
   template<typename> class MatrixB,
   typename T,
   Require<
      DeviceGe<MatrixA<T>>, DeviceView<MatrixA<T>>,
      DeviceGe<MatrixB<T>>, DeviceView<MatrixB<T>>
   > = true
>
/* One Jacobi sweep: B(i,j) becomes the 4-neighbour average of A for
   interior points; border values are copied through unchanged.  Expects a
   2-D launch covering numRows() x numCols(); excess threads return early. */
__global__ void jacobi_iteration(const MatrixA<T> A, MatrixB<T> B) {
   /* map threadIdx.x to the row index: with the ColMajor storage used by
      the caller, consecutive x-threads then touch consecutive elements
      (coalesced), matching the mapping used by init_matrix */
   std::size_t i = threadIdx.x + blockIdx.x * blockDim.x;
   std::size_t j = threadIdx.y + blockIdx.y * blockDim.y;

   /* guard against a grid larger than the matrix */
   if (i >= A.numRows() || j >= A.numCols()) return;

   T value;
   if (i == 0 || j == 0 || i == A.numRows()-1 || j == A.numCols()-1) {
      value = A(i, j);
   } else {
      /* T(0.25): keep the arithmetic in T (a bare 0.25 is double and
         silently promotes the whole expression when T is float) */
      value = T(0.25) *
	 (A(i-1, j) + A(i, j-1) + A(i, j+1) + A(i+1, j));
   }
   B(i, j) = value;
}

/* Integer square root: largest r with r*r <= n, computed at compile time
   when possible.  Returns 0 for n == 0.
   The comparison is written as `result <= n / result` instead of
   `result * result <= n` so it cannot overflow for n close to the
   maximum of T (the multiplication form is UB for signed T and loops
   forever for unsigned T when n is the type's maximum). */
template<typename T>
constexpr T int_sqrt(T n) {
   T result = 1;
   while (result <= n / result) {
      ++result;
   }
   return result - 1;
}

/* Driver: initialize an N x N grid on the device, run a fixed number of
   Jacobi sweeps ping-ponging between devA and devB, copy the result back
   and render it as a JPEG heat map. */
int main() {
   using namespace hpc::aux;
   using namespace hpc::cuda;
   using namespace hpc::matvec;

   /* largest square thread block the device supports */
   const std::size_t max_threads = get_max_threads_per_block();
   const std::size_t BLOCK_DIM = int_sqrt(max_threads);
   const std::size_t GRID_DIM = 2;
   const std::size_t N = BLOCK_DIM * GRID_DIM;   /* grid covers N exactly */

   GeMatrix<T> A(N, N, Order::ColMajor);
   DeviceGeMatrix<T> devA(A.numRows(), A.numCols(), Order::ColMajor);
   DeviceGeMatrix<T> devB(A.numRows(), A.numCols(), Order::ColMajor);

   dim3 block_dim(BLOCK_DIM, BLOCK_DIM);
   dim3 grid_dim(GRID_DIM, GRID_DIM);
   init_matrix<<<grid_dim, block_dim>>>(devA.view());
   /* 1-D launch: one thread per border element.  (The original launched
      this on the 2-D grid_dim, so every border cell was written GRID_DIM
      times concurrently by redundant y-blocks.) */
   init_matrix_border<<<GRID_DIM, BLOCK_DIM>>>(devB.view());

   auto A_ = devA.view(); auto B_ = devB.view();
   /* ping-pong sweeps; after the loop B_ holds the state after an odd
      number (1941) of iterations, which is what gets copied back */
   std::size_t iterations = 0;
   while (iterations < 1941) {
      jacobi_iteration<<<grid_dim, block_dim>>>(A_, B_); ++iterations;
      jacobi_iteration<<<grid_dim, block_dim>>>(B_, A_); ++iterations;
   }
   copy(B_, A);   /* device-to-host transfer; synchronizes with the kernels */

   /* map [0,1] values onto a blue-to-red hue ramp, 8x8 pixels per cell */
   auto pixbuf = create_pixbuf(A, [](T val) -> HSVColor<T> {
      return HSVColor<T>((1-val) * 240, 1, 1);
   }, 8);
   gdk_pixbuf_save(pixbuf, "jacobi.jpg", "jpeg", nullptr,
      "quality", "100", nullptr);
}
livingstone$ nvprof ./jacobi4
==989== NVPROF is profiling process 989, command: ./jacobi4
==989== Profiling application: ./jacobi4
==989== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:   99.77%  12.256ms      1942  6.3110us  6.1750us  7.1030us  void jacobi_iteration<hpc::cuda::DeviceGeMatrixView, hpc::cuda::DeviceGeMatrixView, double, bool=1>(hpc<double>::cuda::DeviceGeMatrixView, hpc<hpc<double>::cuda::DeviceGeMatrixView>::cuda::DeviceGeMatrixView)
                    0.16%  19.264us         1  19.264us  19.264us  19.264us  [CUDA memcpy DtoH]
                    0.04%  4.4800us         1  4.4800us  4.4800us  4.4800us  void init_matrix_border<hpc::cuda::DeviceGeMatrixView, double, bool=1>(hpc<double>::cuda::DeviceGeMatrixView)
                    0.03%  4.0320us         1  4.0320us  4.0320us  4.0320us  void init_matrix<hpc::cuda::DeviceGeMatrixView, double, bool=1>(hpc<double>::cuda::DeviceGeMatrixView)
      API calls:   91.30%  222.44ms         2  111.22ms  10.895us  222.43ms  cudaMalloc
                    7.64%  18.616ms      1944  9.5750us  7.8220us  37.853us  cudaLaunchKernel
                    0.45%  1.0859ms        97  11.195us  2.0950us  397.95us  cuDeviceGetAttribute
                    0.33%  802.17us         1  802.17us  802.17us  802.17us  cudaGetDeviceProperties
                    0.11%  260.57us         1  260.57us  260.57us  260.57us  cuDeviceTotalMem
                    0.10%  247.44us         2  123.72us  39.878us  207.56us  cudaFree
                    0.04%  100.22us         1  100.22us  100.22us  100.22us  cuDeviceGetName
                    0.03%  61.668us         1  61.668us  61.668us  61.668us  cudaMemcpy
                    0.00%  9.7770us         3  3.2590us  2.5840us  4.4700us  cuDeviceGetCount
                    0.00%  6.8440us         1  6.8440us  6.8440us  6.8440us  cuDeviceGetPCIBusId
                    0.00%  6.6350us         1  6.6350us  6.6350us  6.6350us  cudaGetDevice
                    0.00%  6.2850us         2  3.1420us  2.5140us  3.7710us  cuDeviceGet
                    0.00%  2.7930us         1  2.7930us  2.7930us  2.7930us  cuDeviceGetUuid
livingstone$