1
       2
       3
       4
       5
       6
       7
       8
       9
      10
      11
      12
      13
      14
      15
      16
      17
      18
      19
      20
      21
      22
      23
      24
      25
      26
      27
      28
      29
      30
      31
      32
      33
      34
      35
      36
      37
      38
      39
      40
      41
      42
      43
      44
      45
      46
      47
      48
      49
      50
      51
      52
      53
      54
      55
      56
      57
      58
      59
      60
      61
      62
      63
      64
      65
      66
      67
      68
      69
      70
      71
      72
      73
      74
      75
      76
      77
      78
      79
      80
      81
      82
      83
      84
      85
      86
      87
      88
      89
      90
      91
      92
      93
      94
      95
      96
      97
      98
      99
     100
     101
     102
     103
     104
     105
     106
     107
     108
     109
     110
     111
     112
     113
     114
     115
     116
     117
     118
     119
     120
     121
#ifndef HPC_CUDA_AGGREGATE_HPP
#define HPC_CUDA_AGGREGATE_HPP 1

#ifdef __CUDACC__

#include <hpc/cuda/check.hpp>
#include <hpc/cuda/gematrix.hpp>
#include <hpc/cuda/properties.hpp>

namespace hpc { namespace cuda {

namespace aggregate_impl {

constexpr std::size_t BLOCK_DIM = 32;

template<
   template<typename> class MatrixA,
   template<typename> class MatrixB,
   typename T,
   typename Aggregator,
   Require<
      DeviceGe<MatrixA<T>>, DeviceView<MatrixA<T>>,
      DeviceGe<MatrixB<T>>, DeviceView<MatrixB<T>>
   > = true
>
__global__ void aggregate_row_wise(const MatrixA<T> A, MatrixB<T> B,
      Aggregator aggregator) {
   std::size_t i = threadIdx.x + blockIdx.x * BLOCK_DIM;
   std::size_t j = threadIdx.y + blockIdx.y * BLOCK_DIM;

   if (i < A.numRows()) {
      T val = A(i, j);
      std::size_t maxoffset = BLOCK_DIM;
      if (j + maxoffset >= A.numCols()) {
	 maxoffset = A.numCols() - j;
      }
      for (std::size_t offset = 1; offset < maxoffset; ++offset) {
	 val = aggregator(val, A(i, j + offset));
      }
      B(i, blockIdx.y) = val;
   }
}

template<
   template<typename> class MatrixA,
   template<typename> class MatrixB,
   typename T,
   typename Aggregator,
   Require<
      DeviceGe<MatrixA<T>>, DeviceView<MatrixA<T>>,
      DeviceGe<MatrixB<T>>, DeviceView<MatrixB<T>>
   > = true
>
__global__ void aggregate_col_wise(const MatrixA<T> A, MatrixB<T> B,
      Aggregator aggregator) {
   std::size_t i = threadIdx.x + blockIdx.x * BLOCK_DIM;
   std::size_t j = threadIdx.y + blockIdx.y * BLOCK_DIM;

   if (j < A.numCols()) {
      T val = A(i, j);
      std::size_t maxoffset = BLOCK_DIM;
      if (i + maxoffset >= A.numRows()) {
	 maxoffset = A.numRows() - i;
      }
      for (std::size_t offset = 1; offset < maxoffset; ++offset) {
	 val = aggregator(val, A(i + offset, j));
      }
      B(blockIdx.x, j) = val;
   }
}

inline constexpr std::size_t ceildiv(std::size_t x, std::size_t y) {
   /* note that we expect x > 0 && y > 0;
      not safe against overflows but we expect y to be small */
   return (x + y - 1) / y;
}

} // namespace aggregate_impl

template<
   template<typename> class Matrix,
   typename T,
   typename Aggregator,
   Require< DeviceGe<Matrix<T>> > = true
>
T aggregate(const Matrix<T>& A, Aggregator&& aggregator) {
   using namespace aggregate_impl;
   if (A.numRows() > 1 || A.numCols() > 1) {
      const auto source = A.view();
      dim3 grid_dim(ceildiv(A.numRows(), BLOCK_DIM),
	 ceildiv(A.numCols(), BLOCK_DIM));
      if (A.numRows() > A.numCols()) {
	 DeviceGeMatrix<T> B(ceildiv(A.numRows(), BLOCK_DIM), A.numCols());
	 auto target = B.view();
	 dim3 block_dim(1, BLOCK_DIM);
	 aggregate_col_wise<<<grid_dim, block_dim>>>(source, target,
	    aggregator);
	 return aggregate(target, std::move(aggregator));
      } else {
	 DeviceGeMatrix<T> B(A.numRows(), ceildiv(A.numCols(), BLOCK_DIM));
	 auto target = B.view();
	 dim3 block_dim(BLOCK_DIM, 1);
	 aggregate_row_wise<<<grid_dim, block_dim>>>(source, target,
	    aggregator);
	 return aggregate(target, std::move(aggregator));
      }
   } else {
      T result;
      CHECK_CUDA(cudaMemcpy, &result, A.data(), sizeof(T),
	 cudaMemcpyDeviceToHost);
      return result;
   }
}

} } // namespaces cuda and hpc

#else
#	error This CUDA source must be compiled using nvcc
#endif

#endif