Sample solution

Content

#ifndef HPC_CUDA_AGGREGATE_HPP
#define HPC_CUDA_AGGREGATE_HPP 1

#ifdef __CUDACC__

#include <hpc/aux/ceildiv.hpp>
#include <hpc/cuda/check.hpp>
#include <hpc/cuda/gematrix.hpp>
#include <hpc/cuda/properties.hpp>

namespace hpc { namespace cuda {

namespace aggregate_impl {

constexpr std::size_t BLOCK_DIM = 32;

template<
   template<typename> class MatrixA,
   template<typename> class MatrixB,
   typename T,
   typename Aggregator,
   Require<
      DeviceGe<MatrixA<T>>, DeviceView<MatrixA<T>>,
      DeviceGe<MatrixB<T>>, DeviceView<MatrixB<T>>
   > = true
>
__global__ void aggregate_row_wise(const MatrixA<T> A, MatrixB<T> B,
      Aggregator aggregator) {
   std::size_t i = threadIdx.x + blockIdx.x * BLOCK_DIM;
   std::size_t j = threadIdx.y + blockIdx.y * BLOCK_DIM;

   if (i < A.numRows()) {
      T val = A(i, j);
      std::size_t maxoffset = BLOCK_DIM;
      if (j + maxoffset >= A.numCols()) {
	 maxoffset = A.numCols() - j;
      }
      for (std::size_t offset = 1; offset < maxoffset; ++offset) {
	 val = aggregator(val, A(i, j + offset));
      }
      B(i, blockIdx.y) = val;
   }
}

template<
   template<typename> class MatrixA,
   template<typename> class MatrixB,
   typename T,
   typename Aggregator,
   Require<
      DeviceGe<MatrixA<T>>, DeviceView<MatrixA<T>>,
      DeviceGe<MatrixB<T>>, DeviceView<MatrixB<T>>
   > = true
>
__global__ void aggregate_col_wise(const MatrixA<T> A, MatrixB<T> B,
      Aggregator aggregator) {
   std::size_t i = threadIdx.x + blockIdx.x * BLOCK_DIM;
   std::size_t j = threadIdx.y + blockIdx.y * BLOCK_DIM;

   if (j < A.numCols()) {
      T val = A(i, j);
      std::size_t maxoffset = BLOCK_DIM;
      if (i + maxoffset >= A.numRows()) {
	 maxoffset = A.numRows() - i;
      }
      for (std::size_t offset = 1; offset < maxoffset; ++offset) {
	 val = aggregator(val, A(i + offset, j));
      }
      B(blockIdx.x, j) = val;
   }
}

} // namespace aggregate_impl

template<
   template<typename> class Matrix,
   typename T,
   typename Aggregator,
   Require< DeviceGe<Matrix<T>> > = true
>
T aggregate(const Matrix<T>& A, Aggregator&& aggregator) {
   using namespace aggregate_impl;
   using namespace hpc::aux;
   if (A.numRows() > 1 || A.numCols() > 1) {
      const auto source = A.view();
      dim3 grid_dim(ceildiv(A.numRows(), BLOCK_DIM),
	 ceildiv(A.numCols(), BLOCK_DIM));
      if (A.numRows() > A.numCols()) {
	 DeviceGeMatrix<T> B(ceildiv(A.numRows(), BLOCK_DIM), A.numCols());
	 auto target = B.view();
	 dim3 block_dim(1, BLOCK_DIM);
	 aggregate_col_wise<<<grid_dim, block_dim>>>(source, target,
	    aggregator);
	 return aggregate(target, std::move(aggregator));
      } else {
	 DeviceGeMatrix<T> B(A.numRows(), ceildiv(A.numCols(), BLOCK_DIM));
	 auto target = B.view();
	 dim3 block_dim(BLOCK_DIM, 1);
	 aggregate_row_wise<<<grid_dim, block_dim>>>(source, target,
	    aggregator);
	 return aggregate(target, std::move(aggregator));
      }
   } else {
      T result;
      CHECK_CUDA(cudaMemcpy, &result, A.data(), sizeof(T),
	 cudaMemcpyDeviceToHost);
      return result;
   }
}

} } // namespaces cuda and hpc

#else
#	error This CUDA source must be compiled using nvcc
#endif

#endif
#include <cmath>
#include <cstddef>
#include <memory>
#include <gdk-pixbuf/gdk-pixbuf.h>
#include <printf.hpp>
#include <hpc/aux/hsvcolor.hpp>
#include <hpc/aux/rgbcolor.hpp>
#include <hpc/cuda/aggregate.hpp>
#include <hpc/cuda/check.hpp>
#include <hpc/cuda/copy.hpp>
#include <hpc/cuda/gematrix.hpp>
#include <hpc/cuda/properties.hpp>
#include <hpc/matvec/gematrix.hpp>
#include <hpc/matvec/matrix2pixbuf.hpp>

using T = double;

using namespace hpc;

template<
   template<typename> class Matrix,
   typename T,
   Require< DeviceGe<Matrix<T>>, DeviceView<Matrix<T>> > = true
>
__global__ void init_matrix(Matrix<T> A) {
   std::size_t i = threadIdx.x + blockIdx.x * blockDim.x;
   std::size_t j = threadIdx.y + blockIdx.y * blockDim.y;

   const auto PI = std::acos(-T(1.0));
   const auto E = std::exp(T(1.0));
   const auto E_POWER_MINUS_PI = std::pow(E, -PI);

   std::size_t N = A.numRows();
   T value;
   if (j == N-1) {
      value = std::sin(M_PI * (static_cast<T>(i)/N)) * E_POWER_MINUS_PI;
   } else if (j == 0) {
      value = std::sin(M_PI * (static_cast<T>(i)/N));
   } else {
      value = 0;
   }
   A(i, j) = value;
}

template<
   template<typename> class Matrix,
   typename T,
   Require< DeviceGe<Matrix<T>>, DeviceView<Matrix<T>> > = true
>
__global__ void init_matrix_border(Matrix<T> A) {
   std::size_t index = threadIdx.x + blockIdx.x * blockDim.x;

   const auto PI = std::acos(-T(1.0));
   const auto E = std::exp(T(1.0));
   const auto E_POWER_MINUS_PI = std::pow(E, -PI);

   std::size_t N = A.numRows();
   A(index, N-1) = std::sin(M_PI * (static_cast<T>(index)/N)) * E_POWER_MINUS_PI;
   A(index, 0) = std::sin(M_PI * (static_cast<T>(index)/N));
   if (index > 0 && index < N-1) {
      A(0, index) = 0; A(N-1, index) = 0;
   }
}

template<
   template<typename> class MatrixA,
   template<typename> class MatrixB,
   template<typename> class Distance,
   typename T,
   Require<
      DeviceGe<MatrixA<T>>, DeviceView<MatrixA<T>>,
      DeviceGe<MatrixB<T>>, DeviceView<MatrixB<T>>,
      DeviceGe<Distance<T>>, DeviceView<Distance<T>>
   > = true
>
__global__ void jacobi_iteration(const MatrixA<T> A, MatrixB<T> B,
      Distance<T> D) {
   std::size_t i = threadIdx.y + blockIdx.y * blockDim.y;
   std::size_t j = threadIdx.x + blockIdx.x * blockDim.x;

   T value;
   if (i == 0 || j == 0 || i == A.numRows()-1 || j == A.numCols()-1) {
      value = A(i, j);
   } else {
      value = 0.25 *
	 (A(i-1, j) + A(i, j-1) + A(i, j+1) + A(i+1, j));
   }
   D(i, j) = std::abs(value - A(i, j));
   B(i, j) = value;
}

template<
   template<typename> class MatrixA,
   template<typename> class MatrixB,
   typename T,
   Require<
      DeviceGe<MatrixA<T>>, DeviceView<MatrixA<T>>,
      DeviceGe<MatrixB<T>>, DeviceView<MatrixB<T>>
   > = true
>
__global__ void jacobi_iteration(const MatrixA<T> A, MatrixB<T> B) {
   std::size_t i = threadIdx.y + blockIdx.y * blockDim.y;
   std::size_t j = threadIdx.x + blockIdx.x * blockDim.x;

   T value;
   if (i == 0 || j == 0 || i == A.numRows()-1 || j == A.numCols()-1) {
      value = A(i, j);
   } else {
      value = 0.25 *
	 (A(i-1, j) + A(i, j-1) + A(i, j+1) + A(i+1, j));
   }
   B(i, j) = value;
}

template<typename T>
constexpr T int_sqrt(T n) {
   T result = 1;
   while (result * result <= n) {
      ++result;
   }
   return result - 1;
}

int main() {
   using namespace hpc::matvec;
   using namespace hpc::cuda;
   using namespace hpc::aux;

   const std::size_t max_threads = get_max_threads_per_block();
   const std::size_t BLOCK_DIM = int_sqrt(max_threads);
   const std::size_t GRID_DIM = 2;
   const std::size_t N = BLOCK_DIM * GRID_DIM;

   GeMatrix<T> A(N, N, Order::ColMajor);
   DeviceGeMatrix<T> devA(A.numRows(), A.numCols(), Order::ColMajor);
   DeviceGeMatrix<T> devB(A.numRows(), A.numCols(), Order::ColMajor);

   dim3 block_dim(BLOCK_DIM, BLOCK_DIM);
   dim3 grid_dim(GRID_DIM, GRID_DIM);
   init_matrix<<<grid_dim, block_dim>>>(devA.view());
   init_matrix_border<<<grid_dim, BLOCK_DIM>>>(devB.view());

   auto A_ = devA.view(); auto B_ = devB.view();

   DeviceGeMatrix<T> global_diff_dev(A.numRows(), A.numCols(),
      Order::ColMajor);
   auto global_diff = global_diff_dev.view();

   std::size_t iterations = 0;
   for (;;) {
      for (int i = 0; i < 10; ++i) {
	 jacobi_iteration<<<grid_dim, block_dim>>>(A_, B_); ++iterations;
	 jacobi_iteration<<<grid_dim, block_dim>>>(B_, A_); ++iterations;
      }
      jacobi_iteration<<<grid_dim, block_dim>>>(A_, B_); ++iterations;
      jacobi_iteration<<<grid_dim, block_dim>>>(B_, A_, global_diff);
      ++iterations;

      T maxdiff = hpc::cuda::aggregate(global_diff, [] __device__
	    (T x, T y) {
	 return x > y? x: y;
      });
      if (maxdiff < 1E-7) break;
   }
   fmt::printf("%d iterations\n", iterations);
   copy(B_, A);

   auto pixbuf = create_pixbuf(A, [](T val) -> HSVColor<double> {
      return HSVColor<T>((1-val) * 240, 1, 1);
   }, 8);
   gdk_pixbuf_save(pixbuf, "jacobi.jpg", "jpeg", nullptr,
      "quality", "100", nullptr);
}
livingstone$ make EXTRA=step01 jacobi5
nvcc -c -std=c++14 -Istep01 -I/home/numerik/pub/hpc/ws19/session30 -I/usr/include/gdk-pixbuf-2.0 -I/usr/include/libpng16 -I/usr/include/glib-2.0 -I/usr/lib/x86_64-linux-gnu/glib-2.0/include --expt-extended-lambda jacobi5.cu
nvcc -w -o jacobi5 -std=c++14 --expt-extended-lambda jacobi5.o -lgdk_pixbuf-2.0 -lgobject-2.0 -lglib-2.0
livingstone$ nvprof ./jacobi5
==2775== NVPROF is profiling process 2775, command: ./jacobi5
6710 iterations
==2775== Profiling application: ./jacobi5
==2775== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:   85.44%  40.511ms      6405  6.3240us  6.1750us  7.4550us  void jacobi_iteration<hpc::cuda::DeviceGeMatrixView, hpc::cuda::DeviceGeMatrixView, double, bool=1>(hpc<double>::cuda::DeviceGeMatrixView, hpc<hpc<double>::cuda::DeviceGeMatrixView>::cuda::DeviceGeMatrixView)
                    5.60%  2.6547ms       305  8.7030us  8.5750us  9.0240us  void jacobi_iteration<hpc::cuda::DeviceGeMatrixView, hpc::cuda::DeviceGeMatrixView, hpc::cuda::DeviceGeMatrixView, double, bool=1>(hpc<double>::cuda::DeviceGeMatrixView, hpc<hpc<double>::cuda::DeviceGeMatrixView>::cuda::DeviceGeMatrixView, hpc<hpc<double>::cuda::DeviceGeMatrixView>::cuda::DeviceGeMatrixView)
                    3.89%  1.8434ms       610  3.0210us  1.7280us  4.8960us  _ZN3hpc4cuda14aggregate_impl18aggregate_col_wiseINS0_23DeviceGeMatrixConstViewENS0_18DeviceGeMatrixViewEdZ4mainEUlddE_Lb1EEEvT_IT1_ET0_IS7_ET2_
                    3.88%  1.8415ms       610  3.0180us  1.8230us  4.7360us  _ZN3hpc4cuda14aggregate_impl18aggregate_row_wiseINS0_23DeviceGeMatrixConstViewENS0_18DeviceGeMatrixViewEdZ4mainEUlddE_Lb1EEEvT_IT1_ET0_IS7_ET2_
                    1.17%  556.35us       306  1.8180us  1.4720us  18.624us  [CUDA memcpy DtoH]
                    0.01%  4.3840us         1  4.3840us  4.3840us  4.3840us  void init_matrix_border<hpc::cuda::DeviceGeMatrixView, double, bool=1>(hpc<double>::cuda::DeviceGeMatrixView)
                    0.01%  3.9040us         1  3.9040us  3.9040us  3.9040us  void init_matrix<hpc::cuda::DeviceGeMatrixView, double, bool=1>(hpc<double>::cuda::DeviceGeMatrixView)
      API calls:   75.32%  272.22ms      1223  222.58us  6.1460us  263.24ms  cudaMalloc
                   20.08%  72.575ms      7932  9.1490us  7.6830us  303.24us  cudaLaunchKernel
                    2.15%  7.7748ms      1223  6.3570us  5.0280us  213.92us  cudaFree
                    1.86%  6.7074ms       306  21.919us  20.393us  48.189us  cudaMemcpy
                    0.28%  998.28us        97  10.291us  1.8160us  356.04us  cuDeviceGetAttribute
                    0.21%  767.74us         1  767.74us  767.74us  767.74us  cudaGetDeviceProperties
                    0.07%  238.22us         1  238.22us  238.22us  238.22us  cuDeviceTotalMem
                    0.03%  97.006us         1  97.006us  97.006us  97.006us  cuDeviceGetName
                    0.00%  9.2190us         3  3.0730us  2.3750us  4.1900us  cuDeviceGetCount
                    0.00%  6.3550us         1  6.3550us  6.3550us  6.3550us  cudaGetDevice
                    0.00%  5.7970us         2  2.8980us  2.3750us  3.4220us  cuDeviceGet
                    0.00%  5.4480us         1  5.4480us  5.4480us  5.4480us  cuDeviceGetPCIBusId
                    0.00%  2.5840us         1  2.5840us  2.5840us  2.5840us  cuDeviceGetUuid
livingstone$