Improving the two-dimensional aggregation

Content

A close look at the results of test-aggregate reveals asymmetric run times. For example, aggregating a \(2048 \times 666\) matrix takes much longer than aggregating a \(666 \times 2048\) matrix:

livingstone$ make EXTRA=step01 test-aggregate
nvcc -c -std=c++14 -Istep01 -I/home/numerik/pub/hpc/ws19/session30 -I/usr/include/gdk-pixbuf-2.0 -I/usr/include/libpng16 -I/usr/include/glib-2.0 -I/usr/lib/x86_64-linux-gnu/glib-2.0/include --expt-extended-lambda test-aggregate.cu
nvcc -w -o test-aggregate -std=c++14 --expt-extended-lambda test-aggregate.o -lgdk_pixbuf-2.0 -lgobject-2.0 -lglib-2.0
livingstone$ ./test-aggregate
   1 x    1: 0.01606 ms 0.02397 ms
   1 x    2: 0.04195 ms 0.05498 ms
   1 x    7: 0.03965 ms 0.04429 ms
   1 x   42: 0.05792 ms 0.06426 ms
   1 x  666: 0.05706 ms 0.06477 ms
   1 x 2048: 0.07424 ms 0.07312 ms
   2 x    1: 0.03949 ms 0.04349 ms
   2 x    2: 0.05645 ms 0.06166 ms
   2 x    7: 0.05421 ms 0.06019 ms
   2 x   42: 0.07600 ms 0.07862 ms
   2 x  666: 0.06678 ms 0.07286 ms
   2 x 2048: 0.08608 ms 0.07120 ms
   7 x    1: 0.03747 ms 0.04093 ms
   7 x    2: 0.05290 ms 0.05718 ms
   7 x    7: 0.05878 ms 0.05773 ms
   7 x   42: 0.06717 ms 0.07411 ms
   7 x  666: 0.07018 ms 0.05136 ms
   7 x 2048: 0.08509 ms 0.05779 ms
  42 x    1: 0.05123 ms 0.06397 ms
  42 x    2: 0.06893 ms 0.07418 ms
  42 x    7: 0.06970 ms 0.07264 ms
  42 x   42: 0.09802 ms 0.08499 ms
  42 x  666: 0.06390 ms 0.04794 ms
  42 x 2048: 0.07565 ms 0.07370 ms
 666 x    1: 0.04723 ms 0.04979 ms
 666 x    2: 0.05904 ms 0.05050 ms
 666 x    7: 0.05114 ms 0.03930 ms
 666 x   42: 0.06272 ms 0.05514 ms
 666 x  666: 0.47450 ms 0.31971 ms
 666 x 2048: 0.60512 ms 0.47197 ms
2048 x    1: 0.05200 ms 0.05251 ms
2048 x    2: 0.06368 ms 0.04688 ms
2048 x    7: 0.06224 ms 0.04739 ms
2048 x   42: 0.09318 ms 0.08192 ms
2048 x  666: 1.03763 ms 0.91270 ms
2048 x 2048: 0.97466 ms 0.83706 ms
livingstone$ 

A close look at aggregate reveals that we select aggregate_col_wise or aggregate_row_wise without taking the storage organisation of A into account:

template<
   template<typename> class Matrix,
   typename T,
   typename Aggregator,
   Require< DeviceGe<Matrix<T>> > = true
>
T aggregate(const Matrix<T>& A, Aggregator&& aggregator) {
   using namespace aggregate_impl;
   if (A.numRows() > 1 || A.numCols() > 1) {
      const auto source = A.view();
      dim3 grid_dim(ceildiv(A.numRows(), BLOCK_DIM),
	 ceildiv(A.numCols(), BLOCK_DIM));
      if (A.numRows() > A.numCols()) {
	 DeviceGeMatrix<T> B(ceildiv(A.numRows(), BLOCK_DIM), A.numCols());
	 auto target = B.view();
	 dim3 block_dim(1, BLOCK_DIM);
	 aggregate_col_wise<<<grid_dim, block_dim>>>(source, target,
	    aggregator);
	 return aggregate(target, std::move(aggregator));
      } else {
	 DeviceGeMatrix<T> B(A.numRows(), ceildiv(A.numCols(), BLOCK_DIM));
	 auto target = B.view();
	 dim3 block_dim(BLOCK_DIM, 1);
	 aggregate_row_wise<<<grid_dim, block_dim>>>(source, target,
	    aggregator);
	 return aggregate(target, std::move(aggregator));
      }
   } else {
      T result;
      CHECK_CUDA(cudaMemcpy, &result, A.data(), sizeof(T),
	 cudaMemcpyDeviceToHost);
      return result;
   }
}

Let us consider aggregate_col_wise. Does it provide better performance if A is stored in row-major or in col-major organization?

template<
   template<typename> class MatrixA,
   template<typename> class MatrixB,
   typename T,
   typename Aggregator,
   Require<
      DeviceGe<MatrixA<T>>, DeviceView<MatrixA<T>>,
      DeviceGe<MatrixB<T>>, DeviceView<MatrixB<T>>
   > = true
>
__global__ void aggregate_col_wise(const MatrixA<T> A, MatrixB<T> B,
      Aggregator aggregator) {
   std::size_t i = threadIdx.x + blockIdx.x * BLOCK_DIM;
   std::size_t j = threadIdx.y + blockIdx.y * BLOCK_DIM;

   if (j < A.numCols()) {
      T val = A(i, j);
      std::size_t maxoffset = BLOCK_DIM;
      if (i + maxoffset >= A.numRows()) {
	 maxoffset = A.numRows() - i;
      }
      for (std::size_t offset = 1; offset < maxoffset; ++offset) {
	 val = aggregator(val, A(i + offset, j));
      }
      B(blockIdx.x, j) = val;
   }
}

The relevant access operation is A(i + offset, j). Within a warp, just threadIdx.y varies, as blockDim.x is 1 while blockDim.y is BLOCK_DIM, i.e. 32. The only index that depends on threadIdx.y is j. Hence, while each thread runs through an individual column, the warp as a whole accesses consecutive elements of a row during each iteration of the loop. Consequently, we can expect the best performance for a matrix in row-major organization, where the elements of a row are stored contiguously and the loads of a warp can be coalesced.
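To see this effect in terms of memory transactions, the following small host-side sketch (not part of the session's sources) prints the element offsets that the 32 threads of a warp touch in a single iteration of the loop above, i.e. for A(i + offset, j) with a fixed row index and j = 0, ..., 31. The strides are hypothetical values chosen for illustration: incRow = numCols, incCol = 1 models row-major storage, while incRow = 1, incCol = numRows models col-major storage.

#include <cstddef>
#include <cstdio>

int main() {
   constexpr std::size_t numRows = 2048;
   constexpr std::size_t numCols = 666;
   constexpr std::size_t warp_size = 32;
   constexpr std::size_t fixed_row = 0; /* i + offset, equal for all threads of the warp */

   struct Layout { const char* name; std::size_t incRow; std::size_t incCol; };
   const Layout layouts[] = {
      {"row-major", numCols, 1},
      {"col-major", 1, numRows},
   };
   for (const auto& layout: layouts) {
      std::printf("%-9s:", layout.name);
      for (std::size_t j = 0; j < warp_size; ++j) {
         /* offset of A(fixed_row, j) relative to the first element of A */
         std::printf(" %zu", fixed_row * layout.incRow + j * layout.incCol);
      }
      std::printf("\n");
   }
}

For the row-major layout the warp touches 32 consecutive elements, which the hardware can combine into few memory transactions; for the col-major layout the accesses are numRows elements apart, so each thread causes a transaction of its own.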

Likewise, aggregate_row_wise operates best on a matrix in col-major organization.
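Hence an improved aggregate should select the kernel according to the storage organisation of A rather than only its shape. The following standalone sketch merely illustrates the selection criterion; it assumes that the strides incRow and incCol of the matrix are known (how to obtain them from the session's matrix classes is not shown here), and it deliberately leaves open everything else, such as the shape of the intermediate matrix B.

#include <cstddef>
#include <cstdio>

enum class Kernel { col_wise, row_wise };

/* if the elements of a row are contiguous (row-major, incCol == 1), a warp of
   aggregate_col_wise loads consecutive addresses; otherwise (col-major,
   incRow == 1) aggregate_row_wise yields coalesced accesses */
Kernel select_kernel(std::size_t incRow, std::size_t incCol) {
   return incCol <= incRow? Kernel::col_wise: Kernel::row_wise;
}

int main() {
   auto name = [](Kernel k) {
      return k == Kernel::col_wise? "aggregate_col_wise": "aggregate_row_wise";
   };
   /* 2048 x 666 matrix in row-major order: incRow = 666, incCol = 1 */
   std::printf("row-major: %s\n", name(select_kernel(666, 1)));
   /* 2048 x 666 matrix in col-major order: incRow = 1, incCol = 2048 */
   std::printf("col-major: %s\n", name(select_kernel(1, 2048)));
}

A full solution still has to decide how to proceed when the recursion reaches a single row or column and which storage order the intermediate matrices should use; this sketch does not address these points.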

Exercise