Lösungsvorschlag

/**
 * Scatters the matrix A held by process `root` block-wise over all
 * processes of the two-dimensional Cartesian communicator `grid`.
 *
 * The inner area of A (A without a border of width `overlap` on each
 * side) is partitioned with hpc::aux::Slices into dims[0] row slices and
 * dims[1] column slices. The process with grid coordinates (r, c)
 * receives the block belonging to row slice r and column slice c,
 * enlarged by `overlap` rows/columns on every side, in B.
 *
 * @param A       source matrix; its dimensions are read on every process,
 *                its elements only on `root`
 * @param B       destination block of the calling process; presumably it
 *                must already have the dimensions of the incoming block
 *                (TODO confirm against hpc::matvec::copy / get_type)
 * @param root    rank within `grid` of the process that owns A
 * @param grid    Cartesian communicator with two dimensions
 * @param dims    number of processes per grid dimension
 * @param coords  grid coordinates of the caller (not used by this function)
 * @param overlap width of the border shared with neighbouring blocks
 * @return MPI_SUCCESS, or the first MPI error code encountered. With the
 *         default MPI error handler (MPI_ERRORS_ARE_FATAL) errors abort
 *         instead of being returned.
 */
template<typename MA, typename MB>
typename std::enable_if<
   hpc::matvec::IsGeMatrix<MA>::value && hpc::matvec::IsGeMatrix<MB>::value &&
      std::is_same<typename MA::ElementType, typename MB::ElementType>::value,
   int>::type
scatter_by_block(const MA& A, MB& B, int root,
      MPI_Comm grid, int dims[2], int coords[2], int overlap = 0) {
   assert(overlap < A.numRows && overlap < A.numCols);

   int nof_processes; MPI_Comm_size(grid, &nof_processes);
   int rank; MPI_Comm_rank(grid, &rank);
   hpc::aux::Slices<int> rows(dims[0], A.numRows - 2*overlap);
   hpc::aux::Slices<int> columns(dims[1], A.numCols - 2*overlap);

   if (rank != root) {
      /* all other processes simply receive their block from root */
      return MPI_Recv(&B(0, 0), 1, get_type(B), root, 0, grid,
         MPI_STATUS_IGNORE);
   }

   /* Root: one non-blocking send per remote process. The array is sized
      with nof_processes (instead of nof_processes-1) so that it never has
      length zero when the grid consists of a single process.
      NOTE(review): this is still a runtime-sized array, i.e. a compiler
      extension; switch to std::vector once <vector> is included. */
   MPI_Request requests[nof_processes];
   int pending = 0;          /* number of successfully started sends */
   int result = MPI_SUCCESS; /* first error seen so far */
   for (int i = 0; i < nof_processes; ++i) {
      int coords_[2];
      MPI_Cart_coords(grid, i, 2, coords_);
      /* view on the block of process i including its overlap border;
         the view starts `overlap` rows/columns before the inner block,
         hence the offsets are used without adding `overlap` */
      auto A_ = A(rows.offset(coords_[0]), columns.offset(coords_[1]),
         rows.size(coords_[0]) + 2*overlap,
         columns.size(coords_[1]) + 2*overlap);
      if (i == root) {
         /* no message to ourselves, copy locally instead */
         hpc::matvec::copy(A_, B);
      } else {
         /* NOTE(review): get_type() is declared elsewhere; if it commits a
            fresh MPI datatype per call, that type is never freed here */
         int rc = MPI_Isend(&A_(0, 0), 1, get_type(A_),
            i, 0, grid, &requests[pending]);
         if (rc == MPI_SUCCESS) {
            ++pending;
         } else if (result == MPI_SUCCESS) {
            result = rc;
         }
      }
   }
   /* wait only for the requests that were actually started */
   int rc = MPI_Waitall(pending, requests, MPI_STATUSES_IGNORE);
   return result != MPI_SUCCESS? result: rc;
}

/**
 * Gathers the blocks A of all processes of the two-dimensional Cartesian
 * communicator `grid` into the matrix B held by process `root`.
 *
 * The inner area of B (B without a border of width `overlap` on each
 * side) is partitioned with hpc::aux::Slices into dims[0] row slices and
 * dims[1] column slices. Every process contributes the inner part of its
 * local block A (A without its `overlap` border); the contribution of the
 * process with grid coordinates (r, c) is stored in the part of B that
 * belongs to row slice r and column slice c. The border of B itself is
 * not written.
 *
 * @param A       local block of the calling process, including its
 *                overlap border
 * @param B       destination matrix; its dimensions are read on every
 *                process, its elements are written only on `root`
 * @param root    rank within `grid` of the process that owns B
 * @param grid    Cartesian communicator with two dimensions
 * @param dims    number of processes per grid dimension
 * @param coords  grid coordinates of the caller (not used by this function)
 * @param overlap width of the border that is excluded from the transfer
 * @return MPI_SUCCESS, or the first MPI error code encountered. With the
 *         default MPI error handler (MPI_ERRORS_ARE_FATAL) errors abort
 *         instead of being returned.
 */
template<typename MA, typename MB>
typename std::enable_if<
   hpc::matvec::IsGeMatrix<MA>::value && hpc::matvec::IsGeMatrix<MB>::value &&
      std::is_same<typename MA::ElementType, typename MB::ElementType>::value,
   int>::type
gather_by_block(const MA& A, MB& B, int root,
      MPI_Comm grid, int dims[2], int coords[2], int overlap = 0) {
   assert(overlap < A.numRows && overlap < A.numCols);

   int nof_processes; MPI_Comm_size(grid, &nof_processes);
   int rank; MPI_Comm_rank(grid, &rank);
   hpc::aux::Slices<int> rows(dims[0], B.numRows - 2*overlap);
   hpc::aux::Slices<int> columns(dims[1], B.numCols - 2*overlap);

   /* inner part of the local block, i.e. without the overlap border */
   auto A_ = A(overlap, overlap, A.numRows - 2*overlap, A.numCols - 2*overlap);

   if (rank != root) {
      /* all other processes simply send their inner block to root */
      return MPI_Send(&A_(0, 0), 1, get_type(A_), root, 0, grid);
   }

   /* Root: one non-blocking receive per remote process. The array is sized
      with nof_processes (instead of nof_processes-1) so that it never has
      length zero when the grid consists of a single process.
      NOTE(review): this is still a runtime-sized array, i.e. a compiler
      extension; switch to std::vector once <vector> is included. */
   MPI_Request requests[nof_processes];
   int pending = 0;          /* number of successfully started receives */
   int result = MPI_SUCCESS; /* first error seen so far */
   for (int i = 0; i < nof_processes; ++i) {
      int coords_[2];
      MPI_Cart_coords(grid, i, 2, coords_);
      /* view on the inner block of process i within B; the offsets are
         shifted by `overlap` to skip the border of B */
      auto B_ = B(rows.offset(coords_[0]) + overlap,
         columns.offset(coords_[1]) + overlap,
         rows.size(coords_[0]), columns.size(coords_[1]));
      if (i == root) {
         /* no message from ourselves, copy locally instead */
         hpc::matvec::copy(A_, B_);
      } else {
         /* NOTE(review): get_type() is declared elsewhere; if it commits a
            fresh MPI datatype per call, that type is never freed here */
         int rc = MPI_Irecv(&B_(0, 0), 1, get_type(B_),
            i, 0, grid, &requests[pending]);
         if (rc == MPI_SUCCESS) {
            ++pending;
         } else if (result == MPI_SUCCESS) {
            result = rc;
         }
      }
   }
   /* wait only for the requests that were actually started */
   int rc = MPI_Waitall(pending, requests, MPI_STATUSES_IGNORE);
   return result != MPI_SUCCESS? result: rc;
}