=============== Sample solution [TOC] =============== The following approach takes into account that aggregation does not come with a well-defined order. Hence, the implementation is free to chose any order it wants. In our particular case of seeking the maximum, the order is indeed irrelevant. But in other cases it may play a role – for example when we sum up all elements without any precautions for a loss of significance (BTW, the Kahan summation algorithm can be helpful here). We are already using this freedom when we decide to aggregate either row or column-wise. The interesting point is that when order does not matter we have the freedom to use transposing views of a matrix. If a matrix is in row-major order, a transposing view is in col-major order, and vice versa. Hence, we add to the implementation a function named `transposed_view` which delivers a matrix in the opposite storage order: ---- CODE (type=cpp) ---------------------------------------------------------- template< template class Matrix, typename T, Require< DeviceGe> > = true > auto transposed_view(const Matrix& A) { return DeviceGeMatrixConstView(A.numCols(), A.numRows(), A.conj(), A.data(), A.incCol(), A.incRow()); } ------------------------------------------------------------------------------- Now we can create functions that deliver a view in the preferred storage order: ---- CODE (type=cpp) ---------------------------------------------------------- template< template class Matrix, typename T, Require< DeviceGe> > = true > auto row_major_view(const Matrix& A) { if (A.incRow() < A.incCol()) { return transposed_view(A); } else { return A.view(); } } template< template class Matrix, typename T, Require< DeviceGe> > = true > auto col_major_view(const Matrix& A) { if (A.incRow() > A.incCol()) { return transposed_view(A); } else { return A.view(); } } ------------------------------------------------------------------------------- Then, we chose always the best storage order. In addition, a special case was added when the matrix fits into one block: ---- CODE (type=cpp) ---------------------------------------------------------- template< template class Matrix, typename T, typename Aggregator, Require< DeviceGe> > = true > T aggregate(const Matrix& A, Aggregator&& aggregator) { using namespace aggregate_impl; if (A.numRows() > 1 || A.numCols() > 1) { if (A.numRows() <= BLOCK_DIM && A.numCols() <= BLOCK_DIM) { /* most efficiently handled by a trivial 2d aggregator */ DeviceGeMatrix B(1, 1); const auto source = col_major_view(A); dim3 block_dim(align_to_power_of_2(source.numRows()), align_to_power_of_2(source.numCols())); auto target = B.view(); aggregate2d<<<1, block_dim>>>(source, target, aggregator); return aggregate(target, std::move(aggregator)); } else { /* aggregate column- or row-wise */ if (A.numRows() > A.numCols()) { auto source = row_major_view(A); dim3 grid_dim(ceildiv(source.numRows(), BLOCK_DIM), ceildiv(source.numCols(), BLOCK_DIM)); DeviceGeMatrix B(ceildiv(source.numRows(), BLOCK_DIM), source.numCols()); auto target = B.view(); dim3 block_dim(1, BLOCK_DIM); aggregate_col_wise<<>>(source, target, aggregator); return aggregate(target, std::move(aggregator)); } else { auto source = col_major_view(A); dim3 grid_dim(ceildiv(source.numRows(), BLOCK_DIM), ceildiv(source.numCols(), BLOCK_DIM)); DeviceGeMatrix B(source.numRows(), ceildiv(source.numCols(), BLOCK_DIM)); auto target = B.view(); dim3 block_dim(BLOCK_DIM, 1); aggregate_row_wise<<>>(source, target, aggregator); return aggregate(target, std::move(aggregator)); } } } else { T result; CHECK_CUDA(cudaMemcpy, &result, A.data(), sizeof(T), cudaMemcpyDeviceToHost); return result; } } ------------------------------------------------------------------------------- Now we yield some significant performance improvements: ---- CODE (type=sh) ----------------------------------------------------------- livingstone$ ./test-aggregate 1 x 1: 0.01338 ms 0.01866 ms 1 x 2: 0.03770 ms 0.04144 ms 1 x 7: 0.03510 ms 0.03216 ms 1 x 42: 0.05661 ms 0.05312 ms 1 x 666: 0.04237 ms 0.05232 ms 1 x 2048: 0.05184 ms 0.04922 ms 2 x 1: 0.03565 ms 0.03133 ms 2 x 2: 0.03584 ms 0.03152 ms 2 x 7: 0.03562 ms 0.03037 ms 2 x 42: 0.04045 ms 0.06445 ms 2 x 666: 0.04026 ms 0.04704 ms 2 x 2048: 0.06304 ms 0.04656 ms 7 x 1: 0.03722 ms 0.03293 ms 7 x 2: 0.04496 ms 0.03245 ms 7 x 7: 0.02976 ms 0.03299 ms 7 x 42: 0.04365 ms 0.05325 ms 7 x 666: 0.05120 ms 0.04003 ms 7 x 2048: 0.06221 ms 0.04326 ms 42 x 1: 0.05398 ms 0.06029 ms 42 x 2: 0.05654 ms 0.07037 ms 42 x 7: 0.05290 ms 0.06013 ms 42 x 42: 0.06675 ms 0.06304 ms 42 x 666: 0.06307 ms 0.04931 ms 42 x 2048: 0.08496 ms 0.07021 ms 666 x 1: 0.05098 ms 0.05606 ms 666 x 2: 0.05760 ms 0.05814 ms 666 x 7: 0.05142 ms 0.04013 ms 666 x 42: 0.05933 ms 0.04742 ms 666 x 666: 0.47325 ms 0.32544 ms 666 x 2048: 0.58461 ms 0.45338 ms 2048 x 1: 0.06493 ms 0.07059 ms 2048 x 2: 0.06227 ms 0.04794 ms 2048 x 7: 0.06253 ms 0.05690 ms 2048 x 42: 0.07162 ms 0.06064 ms 2048 x 666: 0.58288 ms 0.45914 ms 2048 x 2048: 0.93130 ms 0.80144 ms livingstone$ ------------------------------------------------------------------------------- :import:session10/step02/hpc/cuda/aggregate.hpp [fold] :navigate: up -> doc:index back -> doc:session10/page05