Scatter and Gather with non-uniform partitions

Content

Both MPI_Scatter and MPI_Gather require that each of the processes works on exactly the same number of elements (like rows of a matrix). What can be done if the number of rows is not a multiple of the number of processes?

For these cases, MPI offers the alternative operations MPI_Scatterv and MPI_Gatherv, where the number of elements can be specified for each of the processes along with the respective offsets:

int MPI_Scatterv(const void* sendbuf, const int sendcounts[], const int displs[],
                 MPI_Datatype sendtype,
                 void* recvbuf, int recvcount, MPI_Datatype recvtype,
                 int root, MPI_Comm comm);

int MPI_Gatherv(const void* sendbuf, int sendcount, MPI_Datatype sendtype,
                void* recvbuf, const int recvcounts[], const int displs[],
                MPI_Datatype recvtype, int root, MPI_Comm comm);

Exercise

It appears advantageous to offer functions for scattering matrices by rows where all the parameters are derived from the actual matrix. The partitioning can be done with the help of an hpc::aux::UniformSlices object of the lecture library. Such an object delivers the necessary values for the arrays sendcounts[]/recvcounts[] and displs[].

The two following functions are expected to copy from A to B. In case of scatter_by_row the matrix A is to be partitioned and the matrix B represents the local partition of A. In case of gather_by_row the matrix A represents the local partition and B is the matrix of the root process that collects all individual partitions.

#include <type_traits>
#include <vector>
#include <mpi.h>
#include <hpc/aux/slices.hpp>
#include <hpc/matvec/gematrix.hpp>
#include <hpc/mpi/fundamental.hpp>

template<typename T,
   template<typename> class MA,
   template<typename> class MB,
   Require<Ge<MA<T>>, Ge<MB<T>>> = true>
int scatter_by_row(const MA<T>& A, MB<T>& B, int root, MPI_Comm comm) {
   /* Exercise stub: distribute the rows of A (significant at the root
      process only) across all processes of comm; on return B holds this
      process' partition of A.
      Expected approach: partition the rows with hpc::aux::UniformSlices,
      fill per-process sendcounts[]/displs[] arrays, obtain row datatypes
      for A and B, and invoke MPI_Scatterv.
      Presumably returns the MPI error code (MPI_SUCCESS on success) --
      confirm against the lecture library's conventions.
      NOTE(review): the stub falls off the end of a non-void function,
      which is undefined behavior; the implementation must return the
      MPI_Scatterv result. */

   /* ... */
}

template<typename T,
   template<typename> class MA,
   template<typename> class MB,
   Require<Ge<MA<T>>, Ge<MB<T>>> = true>
int gather_by_row(const MA<T>& A, MB<T>& B, int root, MPI_Comm comm) {
   /* Exercise stub: collect the local partition A from every process of
      comm into the full matrix B on the root process (inverse of
      scatter_by_row).
      Expected approach: partition B's rows with hpc::aux::UniformSlices,
      fill per-process recvcounts[]/displs[] arrays, obtain row datatypes
      for A and B, and invoke MPI_Gatherv.
      Presumably returns the MPI error code (MPI_SUCCESS on success) --
      confirm against the lecture library's conventions.
      NOTE(review): the stub falls off the end of a non-void function,
      which is undefined behavior; the implementation must return the
      MPI_Gatherv result. */

   /* ... */
}

Test both functions with the help of the following test program:

#include <mpi.h>
#include <hpc/matvec/gematrix.hpp>
#include <hpc/matvec/iterators.hpp>
#include <hpc/matvec/print.hpp>
#include <hpc/mpi/matrix.hpp>

/* Test driver: root initializes A, scatters it by rows, every process
 * stamps its partition with a rank-dependent offset, the result is
 * gathered back into A and printed by the root process. */
int main(int argc, char** argv) {
   MPI_Init(&argc, &argv);

   int nof_processes; MPI_Comm_size(MPI_COMM_WORLD, &nof_processes);
   int rank; MPI_Comm_rank(MPI_COMM_WORLD, &rank);

   using namespace hpc::matvec;
   using namespace hpc::mpi;
   using namespace hpc::aux;

   using Matrix = GeMatrix<double>;
   int share = 3;
   /* deliberately NOT a multiple of nof_processes so the non-uniform
      partitioning of MPI_Scatterv/MPI_Gatherv is actually exercised */
   int num_rows = nof_processes * share + 1;
   int num_cols = 5;

   Matrix A(num_rows, num_cols); /* entire matrix */

   /* each process allocates only its own share of rows;
      scatter_by_row/gather_by_row derive the row datatypes and
      counts/displacements themselves, so no explicit get_row_type
      calls are needed here (the former rowtype_A/rowtype_B locals
      were unused) */
   UniformSlices<int> slices(nof_processes, num_rows);
   Matrix B(slices.size(rank), num_cols,
      Order::RowMajor); /* individual share */

   /* initialize A on the root only; the other processes receive
      their portions through scatter_by_row */
   if (rank == 0) {
      for (auto [i, j, Aij]: A) {
         Aij = i * 100 + j;
      }
   }

   scatter_by_row(A, B, 0, MPI_COMM_WORLD);
   /* stamp each element with the owning rank so the gathered matrix
      shows which process handled which rows */
   for (auto [i, j, Bij]: B) {
      Bij += 10000 * (rank + 1);
      (void) i; (void) j; // suppress gcc warning
   }
   gather_by_row(B, A, 0, MPI_COMM_WORLD);

   MPI_Finalize();

   /* A is plain local memory, so printing after MPI_Finalize is fine */
   if (rank == 0) {
      print(A, " %6g");
   }
}