===============
Sample solution                 [TOC]
===============

The following approach takes into account that aggregation does not come
with a well-defined order. Hence, the implementation is free to chose
any order it wants. In our particular case of seeking the maximum, the
order is indeed irrelevant. But in other cases it may play a role – 
for example when we sum up all elements without any precautions for a
loss of significance (BTW, the Kahan summation algorithm can be helpful
here). We are already using this freedom when we decide to aggregate
either row or column-wise.

The interesting point is that when order does not matter we have
the freedom to use transposing views of a matrix. If a matrix
is in row-major order, a transposing view is in col-major order,
and vice versa.

Hence, we add to the implementation a function named
`transposed_view` which delivers a matrix in the opposite
storage order:

---- CODE (type=cpp) ----------------------------------------------------------
template<
   template<typename> class Matrix,
   typename T,
   Require< DeviceGe<Matrix<T>> > = true
>
auto transposed_view(const Matrix<T>& A) {
   return DeviceGeMatrixConstView<T>(A.numCols(), A.numRows(), A.conj(),
      A.data(), A.incCol(), A.incRow());
}
-------------------------------------------------------------------------------

Now we can create functions that deliver a view in
the preferred storage order:

---- CODE (type=cpp) ----------------------------------------------------------
template<
   template<typename> class Matrix,
   typename T,
   Require< DeviceGe<Matrix<T>> > = true
>
auto row_major_view(const Matrix<T>& A) {
   if (A.incRow() < A.incCol()) {
      return transposed_view(A);
   } else {
      return A.view();
   }
}

template<
   template<typename> class Matrix,
   typename T,
   Require< DeviceGe<Matrix<T>> > = true
>
auto col_major_view(const Matrix<T>& A) {
   if (A.incRow() > A.incCol()) {
      return transposed_view(A);
   } else {
      return A.view();
   }
}
-------------------------------------------------------------------------------

Then, we chose always the best storage order. In addition,
a special case was added when the matrix fits into one block:

---- CODE (type=cpp) ----------------------------------------------------------
template<
   template<typename> class Matrix,
   typename T,
   typename Aggregator,
   Require< DeviceGe<Matrix<T>> > = true
>
T aggregate(const Matrix<T>& A, Aggregator&& aggregator) {
   using namespace aggregate_impl;
   if (A.numRows() > 1 || A.numCols() > 1) {
      if (A.numRows() <= BLOCK_DIM && A.numCols() <= BLOCK_DIM) {
         /* most efficiently handled by a trivial 2d aggregator */
         DeviceGeMatrix<T> B(1, 1);
         const auto source = col_major_view(A);
         dim3 block_dim(align_to_power_of_2(source.numRows()),
            align_to_power_of_2(source.numCols()));
         auto target = B.view();
         aggregate2d<<<1, block_dim>>>(source, target, aggregator);
         return aggregate(target, std::move(aggregator));
      } else {
         /* aggregate column- or row-wise */
         if (A.numRows() > A.numCols()) {
            auto source = row_major_view(A);
            dim3 grid_dim(ceildiv(source.numRows(), BLOCK_DIM),
               ceildiv(source.numCols(), BLOCK_DIM));
            DeviceGeMatrix<T> B(ceildiv(source.numRows(), BLOCK_DIM),
               source.numCols());
            auto target = B.view();
            dim3 block_dim(1, BLOCK_DIM);
            aggregate_col_wise<<<grid_dim, block_dim>>>(source, target,
               aggregator);
            return aggregate(target, std::move(aggregator));
         } else {
            auto source = col_major_view(A);
            dim3 grid_dim(ceildiv(source.numRows(), BLOCK_DIM),
               ceildiv(source.numCols(), BLOCK_DIM));
            DeviceGeMatrix<T> B(source.numRows(),
               ceildiv(source.numCols(), BLOCK_DIM));
            auto target = B.view();
            dim3 block_dim(BLOCK_DIM, 1);
            aggregate_row_wise<<<grid_dim, block_dim>>>(source, target,
               aggregator);
            return aggregate(target, std::move(aggregator));
         }
      }
   } else {
      T result;
      CHECK_CUDA(cudaMemcpy, &result, A.data(), sizeof(T),
         cudaMemcpyDeviceToHost);
      return result;
   }
}
-------------------------------------------------------------------------------

Now we yield some significant performance improvements:

---- CODE (type=sh) -----------------------------------------------------------
livingstone$ ./test-aggregate
   1 x    1: 0.01338 ms 0.01866 ms
   1 x    2: 0.03770 ms 0.04144 ms
   1 x    7: 0.03510 ms 0.03216 ms
   1 x   42: 0.05661 ms 0.05312 ms
   1 x  666: 0.04237 ms 0.05232 ms
   1 x 2048: 0.05184 ms 0.04922 ms
   2 x    1: 0.03565 ms 0.03133 ms
   2 x    2: 0.03584 ms 0.03152 ms
   2 x    7: 0.03562 ms 0.03037 ms
   2 x   42: 0.04045 ms 0.06445 ms
   2 x  666: 0.04026 ms 0.04704 ms
   2 x 2048: 0.06304 ms 0.04656 ms
   7 x    1: 0.03722 ms 0.03293 ms
   7 x    2: 0.04496 ms 0.03245 ms
   7 x    7: 0.02976 ms 0.03299 ms
   7 x   42: 0.04365 ms 0.05325 ms
   7 x  666: 0.05120 ms 0.04003 ms
   7 x 2048: 0.06221 ms 0.04326 ms
  42 x    1: 0.05398 ms 0.06029 ms
  42 x    2: 0.05654 ms 0.07037 ms
  42 x    7: 0.05290 ms 0.06013 ms
  42 x   42: 0.06675 ms 0.06304 ms
  42 x  666: 0.06307 ms 0.04931 ms
  42 x 2048: 0.08496 ms 0.07021 ms
 666 x    1: 0.05098 ms 0.05606 ms
 666 x    2: 0.05760 ms 0.05814 ms
 666 x    7: 0.05142 ms 0.04013 ms
 666 x   42: 0.05933 ms 0.04742 ms
 666 x  666: 0.47325 ms 0.32544 ms
 666 x 2048: 0.58461 ms 0.45338 ms
2048 x    1: 0.06493 ms 0.07059 ms
2048 x    2: 0.06227 ms 0.04794 ms
2048 x    7: 0.06253 ms 0.05690 ms
2048 x   42: 0.07162 ms 0.06064 ms
2048 x  666: 0.58288 ms 0.45914 ms
2048 x 2048: 0.93130 ms 0.80144 ms
livingstone$
-------------------------------------------------------------------------------

:import:session10/step02/hpc/cuda/aggregate.hpp [fold]

:navigate: up    -> doc:index
           back  -> doc:session10/page05