#ifndef HPC_MPI_MATRIX_HPP
#define HPC_MPI_MATRIX_HPP 1
#include <cassert>
#include <memory>
#include <type_traits>
#include <vector>
#include <mpi.h>
#include <hpc/aux/slices.hpp>
#include <hpc/matvec/copy.hpp>
#include <hpc/matvec/gematrix.hpp>
#include <hpc/mpi/fundamental.hpp>
namespace hpc { namespace mpi {
/* construct an MPI data type for a single row of matrix A;
its extent is adapted such that consecutive rows can be
transferred with count > 1 even if the resized extents overlap */
template<typename T, template<typename> class Matrix,
Require<Ge<Matrix<T>>> = true>
MPI_Datatype get_row_type(const Matrix<T>& A) {
MPI_Datatype rowtype;
MPI_Type_vector(
/* count = */ A.numCols(),
/* blocklength = */ 1,
/* stride = */ A.incCol(),
/* element type = */ get_type(A(0, 0)),
/* newly created type = */ &rowtype);
/* if the natural extent of the vector type already matches the
offset between consecutive rows (as for a densely packed
row-major matrix) we are finished */
if ((A.numCols() - 1) * A.incCol() + 1 == A.incRow()) {
MPI_Type_commit(&rowtype);
return rowtype;
}
/* the extent of the MPI data type does not match
the offset of subsequent rows -- this is a problem
whenever we want to handle more than one row;
to fix this we need to use the resize function
which allows us to adapt the extent to A.incRow() */
MPI_Datatype resized_rowtype;
MPI_Type_create_resized(rowtype, 0, /* lb remains unchanged */
A.incRow() * sizeof(T), &resized_rowtype);
MPI_Type_commit(&resized_rowtype);
MPI_Type_free(&rowtype);
return resized_rowtype;
}
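/* usage sketch (hypothetical, not part of this header): thanks to the
resized extent, several consecutive rows can be transferred with a
single row type and count > 1; GeMatrix is the matrix type from
hpc/matvec/gematrix.hpp

      hpc::matvec::GeMatrix<double> A(4, 6);
      MPI_Datatype row_t = hpc::mpi::get_row_type(A);
      // send the first two rows of A to rank 1 in one message
      MPI_Send(&A(0, 0), 2, row_t, 1, 0, MPI_COMM_WORLD);
      MPI_Type_free(&row_t);
*/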
/* create MPI data type for matrix A */
template<typename T, template<typename> class Matrix,
Require<Ge<Matrix<T>>> = true>
MPI_Datatype get_type(const Matrix<T>& A) {
MPI_Datatype datatype;
if (A.incCol() == 1) {
MPI_Type_vector(
/* count = */ A.numRows(),
/* blocklength = */ A.numCols(),
/* stride = */ A.incRow(),
/* element type = */ get_type(A(0, 0)),
/* newly created type = */ &datatype);
} else {
/* concatenate numRows resized row vectors */
MPI_Datatype rowtype = get_row_type(A);
MPI_Type_contiguous(A.numRows(), rowtype, &datatype);
MPI_Type_free(&rowtype);
}
MPI_Type_commit(&datatype);
return datatype;
}
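/* usage sketch (hypothetical, not part of this header): a whole matrix,
whether contiguously stored or not, can be transferred as one element
of the committed type

      hpc::matvec::GeMatrix<double> A(4, 6);
      MPI_Datatype matrix_t = hpc::mpi::get_type(A);
      MPI_Bcast(&A(0, 0), 1, matrix_t, 0, MPI_COMM_WORLD);
      MPI_Type_free(&matrix_t);
*/
/* scatter the rows of matrix A on the root process row-block-wise
to the local matrices B of all processes of comm; every B must
already be sized to the slice of its process */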
template<typename T,
template<typename> class MA,
template<typename> class MB,
Require<Ge<MA<T>>> = true, Require<Ge<MB<T>>> = true>
int scatter_by_row(const MA<T>& A, MB<T>& B, int root, MPI_Comm comm) {
int nof_processes; MPI_Comm_size(comm, &nof_processes);
int rank; MPI_Comm_rank(comm, &rank);
MPI_Datatype rowtype_B = get_row_type(B);
int rval;
if (rank == root) {
assert(A.numCols() == B.numCols());
hpc::aux::UniformSlices<int> slices(nof_processes, A.numRows());
std::vector<int> counts(nof_processes);
std::vector<int> offsets(nof_processes);
MPI_Datatype rowtype_A = get_row_type(A);
for (int i = 0; i < nof_processes; ++i) {
if (i < A.numRows()) {
counts[i] = slices.size(i);
offsets[i] = slices.offset(i);
} else {
counts[i] = 0; offsets[i] = 0;
}
}
int recvcount = counts[rank];
assert(B.numRows() == recvcount);
/* the aged Open MPI releases shipped with Debian wheezy and jessie
expect void* instead of const void*; hence we cast away const */
rval = MPI_Scatterv(
const_cast<void*>(reinterpret_cast<const void*>(&A(0, 0))),
&counts[0], &offsets[0], rowtype_A,
&B(0, 0), recvcount, rowtype_B, root, comm);
MPI_Type_free(&rowtype_A);
} else {
int recvcount = B.numRows();
/* the send parameters are ignored on non-root processes;
MPI_DATATYPE_NULL remains portable to MPI implementations
where MPI_Datatype is not a pointer type */
rval = MPI_Scatterv(nullptr, nullptr, nullptr, MPI_DATATYPE_NULL,
&B(0, 0), recvcount, rowtype_B, root, comm);
}
MPI_Type_free(&rowtype_B);
return rval;
}
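/* usage sketch (hypothetical, not part of this header): each process
derives the size of its local matrix B from UniformSlices before
the scatter

      int nof_processes; MPI_Comm_size(MPI_COMM_WORLD, &nof_processes);
      int rank; MPI_Comm_rank(MPI_COMM_WORLD, &rank);
      hpc::matvec::GeMatrix<double> A(1000, 1000); // filled on root
      hpc::aux::UniformSlices<int> slices(nof_processes, A.numRows());
      hpc::matvec::GeMatrix<double> B(slices.size(rank), A.numCols());
      hpc::mpi::scatter_by_row(A, B, 0, MPI_COMM_WORLD);
*/
/* gather the local matrices A of all processes of comm
row-block-wise into matrix B on the root process */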
template<typename T,
template<typename> class MA,
template<typename> class MB,
Require<Ge<MA<T>>> = true, Require<Ge<MB<T>>> = true>
int gather_by_row(const MA<T>& A, MB<T>& B, int root, MPI_Comm comm) {
int nof_processes; MPI_Comm_size(comm, &nof_processes);
int rank; MPI_Comm_rank(comm, &rank);
MPI_Datatype rowtype_A = get_row_type(A);
int rval;
if (rank == root) {
assert(A.numCols() == B.numCols());
hpc::aux::UniformSlices<int> slices(nof_processes, B.numRows());
std::vector<int> counts(nof_processes);
std::vector<int> offsets(nof_processes);
for (int i = 0; i < nof_processes; ++i) {
if (i < B.numRows()) {
counts[i] = slices.size(i);
offsets[i] = slices.offset(i);
} else {
counts[i] = 0; offsets[i] = 0;
}
}
int sendcount = counts[rank];
assert(A.numRows() == sendcount);
MPI_Datatype rowtype_B = get_row_type(B);
/* the aged Open MPI releases shipped with Debian wheezy and jessie
expect void* instead of const void*; hence we cast away const */
rval = MPI_Gatherv(
const_cast<void*>(reinterpret_cast<const void*>(&A(0, 0))),
sendcount, rowtype_A,
&B(0, 0), &counts[0], &offsets[0], rowtype_B, root, comm);
MPI_Type_free(&rowtype_B);
} else {
int sendcount = A.numRows();
/* the receive parameters are ignored on non-root processes */
rval = MPI_Gatherv(
const_cast<void*>(reinterpret_cast<const void*>(&A(0, 0))),
sendcount, rowtype_A,
nullptr, nullptr, nullptr, MPI_DATATYPE_NULL, root, comm);
}
MPI_Type_free(&rowtype_A);
return rval;
}
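/* usage sketch (hypothetical, not part of this header): the counterpart
of the scatter example above -- after the local rows of B have been
updated they are collected back into the global matrix A on root

      // ... update the local slice B ...
      hpc::mpi::gather_by_row(B, A, 0, MPI_COMM_WORLD);
*/
/* scatter matrix A on the root process block-wise onto a
two-dimensional process grid; each local matrix B receives its
block extended by a border of overlap rows and columns */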
template<typename T,
template<typename> class MA,
template<typename> class MB,
Require<Ge<MA<T>>, Ge<MB<T>>> = true>
int scatter_by_block(const MA<T>& A, MB<T>& B, int root,
MPI_Comm grid, int overlap = 0) {
assert(overlap < A.numRows() && overlap < A.numCols());
int nof_processes; MPI_Comm_size(grid, &nof_processes);
int rank; MPI_Comm_rank(grid, &rank);
int dims[2]; int coords[2]; int periods[2];
MPI_Cart_get(grid, 2, dims, periods, coords);
hpc::aux::UniformSlices<int> rows(dims[0], A.numRows() - 2*overlap);
hpc::aux::UniformSlices<int> columns(dims[1], A.numCols() - 2*overlap);
int rval = MPI_SUCCESS;
if (rank == root) {
/* a std::vector avoids the non-standard variable-length array */
std::vector<MPI_Request> requests(nof_processes - 1); int ri = 0;
for (int i = 0; i < nof_processes; ++i) {
MPI_Cart_coords(grid, i, 2, coords);
auto A_ = A.block(rows.offset(coords[0]),
columns.offset(coords[1])).dim(
rows.size(coords[0]) + 2*overlap,
columns.size(coords[1]) + 2*overlap);
if (i == root) {
hpc::matvec::copy(A_, B);
} else {
/* the aged Open MPI releases shipped with Debian wheezy and jessie
expect void* instead of const void*; hence we cast away const */
MPI_Datatype type_A_ = get_type(A_);
MPI_Isend(
const_cast<void*>(reinterpret_cast<const void*>(&A_(0, 0))),
1, type_A_, i, 0, grid, &requests[ri++]);
/* freeing the data type is safe: MPI keeps it alive
until the pending send has completed */
MPI_Type_free(&type_A_);
}
}
rval = MPI_Waitall(nof_processes - 1, requests.data(),
MPI_STATUSES_IGNORE);
} else {
MPI_Status status;
MPI_Datatype type_B = get_type(B);
rval = MPI_Recv(&B(0, 0), 1, type_B, root, 0, grid, &status);
MPI_Type_free(&type_B);
}
return rval;
}
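/* usage sketch (hypothetical, not part of this header): distribute a
global matrix across a two-dimensional Cartesian grid with an
overlap of one, as needed by stencil codes such as a Jacobi solver

      int nof_processes; MPI_Comm_size(MPI_COMM_WORLD, &nof_processes);
      int dims[2] = {0, 0}; int periods[2] = {0, 0};
      MPI_Dims_create(nof_processes, 2, dims);
      MPI_Comm grid;
      MPI_Cart_create(MPI_COMM_WORLD, 2, dims, periods, 1, &grid);
      // B must already have the local block dimensions including
      // the overlap, i.e. rows.size(coords[0]) + 2*overlap etc.
      hpc::mpi::scatter_by_block(A, B, 0, grid, 1);
*/
/* gather the inner parts (i.e. without the overlap border) of the
local matrices A of a two-dimensional process grid into matrix B
on the root process */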
template<typename T,
template<typename> class MA,
template<typename> class MB,
Require<Ge<MA<T>>, Ge<MB<T>>> = true>
int gather_by_block(const MA<T>& A, MB<T>& B, int root,
MPI_Comm grid, int overlap = 0) {
assert(overlap < A.numRows() && overlap < A.numCols());
int nof_processes; MPI_Comm_size(grid, &nof_processes);
int rank; MPI_Comm_rank(grid, &rank);
int dims[2]; int coords[2]; int periods[2];
MPI_Cart_get(grid, 2, dims, periods, coords);
hpc::aux::UniformSlices<int> rows(dims[0], B.numRows() - 2*overlap);
hpc::aux::UniformSlices<int> columns(dims[1], B.numCols() - 2*overlap);
auto A_ = A.block(overlap, overlap).dim(
A.numRows() - 2*overlap, A.numCols() - 2*overlap);
int rval = MPI_SUCCESS;
if (rank == root) {
std::vector<MPI_Request> requests(nof_processes - 1); int ri = 0;
for (int i = 0; i < nof_processes; ++i) {
MPI_Cart_coords(grid, i, 2, coords);
auto B_ = B.block(rows.offset(coords[0]) + overlap,
columns.offset(coords[1]) + overlap).dim(
rows.size(coords[0]), columns.size(coords[1]));
if (i == root) {
hpc::matvec::copy(A_, B_);
} else {
MPI_Datatype type_B_ = get_type(B_);
MPI_Irecv(&B_(0, 0), 1, type_B_,
i, 0, grid, &requests[ri++]);
/* safe to free: MPI keeps the data type alive until
the pending receive has completed */
MPI_Type_free(&type_B_);
}
}
rval = MPI_Waitall(nof_processes - 1, requests.data(),
MPI_STATUSES_IGNORE);
} else {
/* the aged Open MPI releases shipped with Debian wheezy and jessie
expect void* instead of const void*; hence we cast away const */
MPI_Datatype type_A_ = get_type(A_);
rval = MPI_Send(
const_cast<void*>(reinterpret_cast<const void*>(&A_(0, 0))),
1, type_A_, root, 0, grid);
MPI_Type_free(&type_A_);
}
return rval;
}
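/* usage sketch (hypothetical, not part of this header): the inverse of
the scatter example above -- the inner part of each local block,
without the overlap border, is collected back on root

      hpc::mpi::gather_by_block(B, A, 0, grid, 1);
*/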
} } // namespaces mpi, hpc
#endif // HPC_MPI_MATRIX_HPP