Sample solution
This solution works with additional template functions in hpc/mpi/matrix.hpp that support scatter and gather operations for matrices. These are one-to-many communications where a master process distributes partitions of a matrix to the other processes (scatter) and where the other processes send their updated partitions back to the master process, which merges them (gather). MPI provides the following operations for this purpose:
int MPI_Scatterv(const void* sendbuf, int* sendcounts, int* displs,
                 MPI_Datatype sendtype,
                 void* recvbuf, int recvcount, MPI_Datatype recvtype,
                 int root, MPI_Comm comm);
int MPI_Gatherv(const void* sendbuf, int sendcount, MPI_Datatype sendtype,
                void* recvbuf, int* recvcounts, int* displs,
                MPI_Datatype recvtype,
                int root, MPI_Comm comm);
There also exist the operations MPI_Scatter and MPI_Gather which, unfortunately, do not support cases where the slices differ in their extents.
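For illustration, here is a minimal, self-contained sketch (not part of the sample solution) that distributes the rows of a small matrix stored in a plain array using MPI_Scatterv and collects them again with MPI_Gatherv. In this sketch the counts and displacements are given in elements of type double, whereas the wrapped functions below work with row data types; all names are illustrative only.

#include <mpi.h>
#include <vector>

int main(int argc, char** argv) {
   MPI_Init(&argc, &argv);
   int nof_processes; MPI_Comm_size(MPI_COMM_WORLD, &nof_processes);
   int rank; MPI_Comm_rank(MPI_COMM_WORLD, &rank);

   const int rows = 7, cols = 4;
   /* compute how many elements each process receives and at which
      offset its partition begins (larger slices come first) */
   std::vector<int> counts(nof_processes), displs(nof_processes);
   int offset = 0;
   for (int i = 0; i < nof_processes; ++i) {
      counts[i] = (rows / nof_processes + (i < rows % nof_processes? 1: 0)) * cols;
      displs[i] = offset; offset += counts[i];
   }

   std::vector<double> A;                  /* entire matrix, significant at the root only */
   if (rank == 0) A.resize(rows * cols, 1.0);
   std::vector<double> B(counts[rank]);    /* local partition */

   MPI_Scatterv(A.data(), counts.data(), displs.data(), MPI_DOUBLE,
      B.data(), counts[rank], MPI_DOUBLE, 0, MPI_COMM_WORLD);
   /* ... work on the local partition B ... */
   MPI_Gatherv(B.data(), counts[rank], MPI_DOUBLE,
      A.data(), counts.data(), displs.data(), MPI_DOUBLE, 0, MPI_COMM_WORLD);

   MPI_Finalize();
}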
These two operations are wrapped into scatter_by_row and gather_by_row:
template<typename T,
   template<typename> typename MA,
   template<typename> typename MB,
   Require<Ge<MA<T>>> = true, Require<Ge<MB<T>>> = true>
int scatter_by_row(const MA<T>& A, MB<T>& B, int root, MPI_Comm comm) {
   /* ... */
}

template<typename T,
   template<typename> typename MA,
   template<typename> typename MB,
   Require<Ge<MA<T>>> = true, Require<Ge<MB<T>>> = true>
int gather_by_row(const MA<T>& A, MB<T>& B, int root, MPI_Comm comm) {
   /* ... */
}
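Their use follows the usual pattern: the root provides the full matrix, and every process provides a local partition whose number of rows matches the slice assigned to it. A rough usage sketch (variable names are illustrative, rank and nof_processes are assumed to be set up as usual; the actual solution below distributes only the interior of A):

using namespace hpc::matvec;
using namespace hpc::mpi;
using namespace hpc::aux;

GeMatrix<double> A(100, 100, Order::RowMajor);        /* significant at the root only */
UniformSlices<std::size_t> slices(nof_processes, A.numRows());
GeMatrix<double> B(slices.size(rank), A.numCols(), Order::RowMajor);

scatter_by_row(A, B, /* root = */ 0, MPI_COMM_WORLD); /* distribute partitions */
/* ... update the local partition B ... */
gather_by_row(B, A, /* root = */ 0, MPI_COMM_WORLD);  /* collect results at the root */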
We will see more about this communication pattern in the next session.
#include <cassert>
#include <cmath>
#include <cstdlib>
#include <unistd.h>
#include <mpi.h>
#include <printf.hpp>
#include <hpc/matvec/gematrix.hpp>
#include <hpc/mpi/matrix.hpp>
#include <hpc/mpi/vector.hpp>
#include <hpc/matvec/copy.hpp>
#include <hpc/matvec/iterators.hpp>
#include <hpc/matvec/matrix2pixbuf.hpp>
#include <hpc/aux/slices.hpp>
#include <hpc/aux/hsvcolor.hpp>

const auto PI = std::acos(-1.0);
const auto E = std::exp(1.0);
const auto E_POWER_MINUS_PI = std::pow(E, -PI);

using namespace hpc;

template<typename T, template<typename> typename Matrix,
   Require<Ge<Matrix<T>>> = true>
T jacobi_iteration(const Matrix<T>& A, Matrix<T>& B) {
   assert(A.numRows() > 2 && A.numCols() > 2);
   T maxdiff = 0;
   for (std::size_t i = 1; i + 1 < B.numRows(); ++i) {
      for (std::size_t j = 1; j + 1 < B.numCols(); ++j) {
         B(i, j) = 0.25 * (A(i - 1, j) + A(i + 1, j) + A(i, j - 1) + A(i, j + 1));
         T diff = std::fabs(A(i, j) - B(i, j));
         if (diff > maxdiff) maxdiff = diff;
      }
   }
   return maxdiff;
}

template<typename Matrix>
void exchange_with_neighbors(Matrix& A, int previous, int next, MPI_Datatype rowtype) {
   MPI_Status status;
   // upward
   MPI_Sendrecv(&A(1, 0), 1, rowtype, previous, 0,
      &A(A.numRows()-1, 0), 1, rowtype, next, 0, MPI_COMM_WORLD, &status);
   // downward
   MPI_Sendrecv(&A(A.numRows()-2, 0), 1, rowtype, next, 0,
      &A(0, 0), 1, rowtype, previous, 0, MPI_COMM_WORLD, &status);
}

int main(int argc, char** argv) {
   MPI_Init(&argc, &argv);
   int nof_processes; MPI_Comm_size(MPI_COMM_WORLD, &nof_processes);
   int rank; MPI_Comm_rank(MPI_COMM_WORLD, &rank);

   using namespace hpc::matvec;
   using namespace hpc::mpi;
   using namespace hpc::aux;

   using Matrix = GeMatrix<double>;
   Matrix A(100, 100, Order::RowMajor);
   if (rank == 0) {
      for (auto [i, j, Aij]: A) {
         if (j == 0) {
            Aij = std::sin(PI * ((double)i/(A.numRows()-1)));
         } else if (j == A.numCols() - 1) {
            Aij = std::sin(PI * ((double)i/(A.numRows()-1))) * E_POWER_MINUS_PI;
         } else {
            Aij = 0;
         }
      }
   }
   UniformSlices<std::size_t> slices(nof_processes, A.numRows() - 2);
   Matrix B1(slices.size(rank) + 2, A.numCols(), Order::RowMajor);
   for (auto [i, j, Bij]: B1) {
      Bij = 0;
   }
   auto B = B1.block(1, 0).dim(B1.numRows() - 2, B1.numCols());

   MPI_Datatype rowtype = get_row_type(B);

   /* distribute main body of A */
   auto A_ = A.block(1, 0).dim(A.numRows() - 2, A.numCols());
   scatter_by_row(A_, B, 0, MPI_COMM_WORLD);

   /* distribute first and last row of A */
   if (rank == 0) {
      copy(A.block(0, 0).dim(1, A.numCols()),
         B1.block(0, 0).dim(1, B1.numCols()));
   }
   if (nof_processes == 1) {
      copy(A.block(A.numRows()-1, 0).dim(1, A.numCols()),
         B1.block(B1.numRows()-1, 0).dim(1, B1.numCols()));
   } else if (rank == 0) {
      MPI_Send(&A(A.numRows()-1, 0), 1, rowtype, nof_processes-1, 0, MPI_COMM_WORLD);
   } else if (rank == nof_processes - 1) {
      MPI_Status status;
      MPI_Recv(&B1(B1.numRows()-1, 0), 1, rowtype, 0, 0, MPI_COMM_WORLD, &status);
   }

   Matrix B2(B1.numRows(), B1.numCols(), Order::RowMajor);
   copy(B1, B2); /* actually just the border needs to be copied */

   int previous = rank == 0? MPI_PROC_NULL: rank-1;
   int next = rank == nof_processes-1? MPI_PROC_NULL: rank+1;

   double eps = 1e-6;
   unsigned int iterations;
   for (iterations = 0; ; ++iterations) {
      double maxdiff = jacobi_iteration(B1, B2);
      exchange_with_neighbors(B2, previous, next, rowtype);
      maxdiff = jacobi_iteration(B2, B1);
      if (iterations % 10 == 0) {
         double global_max;
         MPI_Reduce(&maxdiff, &global_max, 1, get_type(maxdiff),
            MPI_MAX, 0, MPI_COMM_WORLD);
         MPI_Bcast(&global_max, 1, get_type(maxdiff), 0, MPI_COMM_WORLD);
         if (global_max < eps) break;
      }
      exchange_with_neighbors(B1, previous, next, rowtype);
   }
   if (rank == 0) fmt::printf("%d iterations\n", iterations);

   gather_by_row(B, A_, 0, MPI_COMM_WORLD);

   MPI_Finalize();

   if (rank == 0) {
      auto pixbuf = create_pixbuf(A, [](double val) -> HSVColor<double> {
         return HSVColor<double>((1-val) * 240, 1, 1);
      }, 8);
      gdk_pixbuf_save(pixbuf, "jacobi.jpg", "jpeg", nullptr,
         "quality", "100", nullptr);
   }
}
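Note that exchange_with_neighbors needs no special cases at the borders: sends and receives addressed to MPI_PROC_NULL complete immediately without transferring data. A stripped-down sketch of the same pattern (independent of the solution above, exchanging a single double per process instead of a matrix row):

int previous = rank == 0? MPI_PROC_NULL: rank - 1;
int next = rank == nof_processes - 1? MPI_PROC_NULL: rank + 1;

double upper_ghost = 0, lower_ghost = 0, my_value = rank;
MPI_Status status;
/* send upward, receive from the neighbor below */
MPI_Sendrecv(&my_value, 1, MPI_DOUBLE, previous, 0,
   &lower_ghost, 1, MPI_DOUBLE, next, 0, MPI_COMM_WORLD, &status);
/* send downward, receive from the neighbor above */
MPI_Sendrecv(&my_value, 1, MPI_DOUBLE, next, 0,
   &upper_ghost, 1, MPI_DOUBLE, previous, 0, MPI_COMM_WORLD, &status);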
#ifndef HPC_MPI_MATRIX_HPP
#define HPC_MPI_MATRIX_HPP 1

#include <cassert>
#include <memory>
#include <type_traits>
#include <vector>
#include <mpi.h>
#include <hpc/aux/slices.hpp>
#include <hpc/matvec/gematrix.hpp>
#include <hpc/mpi/fundamental.hpp>

namespace hpc { namespace mpi {

/* construct MPI data type for a row of Matrix A
   where the extent is adapted such that consecutive
   rows can be combined even if they overlap */
template<typename T, template<typename> class Matrix,
   Require<Ge<Matrix<T>>> = true>
MPI_Datatype get_row_type(const Matrix<T>& A) {
   MPI_Datatype rowtype;
   MPI_Type_vector(
      /* count = */ A.numCols(),
      /* blocklength = */ 1,
      /* stride = */ A.incCol(),
      /* element type = */ get_type(A(0, 0)),
      /* newly created type = */ &rowtype);
   /* in case of row-major we are finished */
   if (A.incRow() == A.numCols()) {
      MPI_Type_commit(&rowtype);
      return rowtype;
   }
   /* the extent of the MPI data type does not match
      the offset of subsequent rows -- this is a problem
      whenever we want to handle more than one row;
      to fix this we need to use the resize function
      which allows us to adapt the extent to A.incRow() */
   MPI_Datatype resized_rowtype;
   MPI_Type_create_resized(rowtype, 0, /* lb remains unchanged */
      A.incRow() * sizeof(T), &resized_rowtype);
   MPI_Type_commit(&resized_rowtype);
   MPI_Type_free(&rowtype);
   return resized_rowtype;
}

/* create MPI data type for matrix A */
template<typename T, template<typename> class Matrix,
   Require<Ge<Matrix<T>>> = true>
MPI_Datatype get_type(const Matrix<T>& A) {
   MPI_Datatype datatype;
   if (A.incCol() == 1) {
      MPI_Type_vector(
         /* count = */ A.numRows(),
         /* blocklength = */ A.numCols(),
         /* stride = */ A.incRow(),
         /* element type = */ get_type(A(0, 0)),
         /* newly created type = */ &datatype);
   } else {
      /* vector of row vectors */
      MPI_Datatype rowtype = get_row_type(A);
      MPI_Type_contiguous(A.numRows(), rowtype, &datatype);
      MPI_Type_free(&rowtype);
   }
   MPI_Type_commit(&datatype);
   return datatype;
}

template<typename T,
   template<typename> class MA,
   template<typename> class MB,
   Require<Ge<MA<T>>> = true, Require<Ge<MB<T>>> = true>
int scatter_by_row(const MA<T>& A, MB<T>& B, int root, MPI_Comm comm) {
   int nof_processes; MPI_Comm_size(comm, &nof_processes);
   int rank; MPI_Comm_rank(comm, &rank);
   MPI_Datatype rowtype_B = get_row_type(B);
   int rval;
   if (rank == root) {
      assert(A.numCols() == B.numCols());
      hpc::aux::UniformSlices<int> slices(nof_processes, A.numRows());
      std::vector<int> counts(nof_processes);
      std::vector<int> offsets(nof_processes);
      MPI_Datatype rowtype_A = get_row_type(A);
      for (int i = 0; i < nof_processes; ++i) {
         if (i < A.numRows()) {
            counts[i] = slices.size(i);
            offsets[i] = slices.offset(i);
         } else {
            counts[i] = 0; offsets[i] = 0;
         }
      }
      int recvcount = counts[rank];
      assert(B.numRows() == recvcount);
      /* aged OpenMPI implementations of Debian wheezy and jessie
         expect void* instead of const void*; hence we need to remove const */
      rval = MPI_Scatterv(
         const_cast<void*>(reinterpret_cast<const void*>(&A(0, 0))),
         &counts[0], &offsets[0], rowtype_A,
         &B(0, 0), recvcount, rowtype_B,
         root, comm);
      MPI_Type_free(&rowtype_A);
   } else {
      int recvcount = B.numRows();
      rval = MPI_Scatterv(nullptr, nullptr, nullptr, nullptr,
         &B(0, 0), recvcount, rowtype_B,
         root, comm);
   }
   MPI_Type_free(&rowtype_B);
   return rval;
}

template<typename T,
   template<typename> class MA,
   template<typename> class MB,
   Require<Ge<MA<T>>> = true, Require<Ge<MB<T>>> = true>
int gather_by_row(const MA<T>& A, MB<T>& B, int root, MPI_Comm comm) {
   int nof_processes; MPI_Comm_size(comm, &nof_processes);
   int rank; MPI_Comm_rank(comm, &rank);
   MPI_Datatype rowtype_A = get_row_type(A);
   int rval;
   if (rank == root) {
      assert(A.numCols() == B.numCols());
      hpc::aux::UniformSlices<int> slices(nof_processes, B.numRows());
      std::vector<int> counts(nof_processes);
      std::vector<int> offsets(nof_processes);
      for (int i = 0; i < nof_processes; ++i) {
         if (i < B.numRows()) {
            counts[i] = slices.size(i);
            offsets[i] = slices.offset(i);
         } else {
            counts[i] = 0; offsets[i] = 0;
         }
      }
      int sendcount = counts[rank];
      assert(A.numRows() == sendcount);
      MPI_Datatype rowtype_B = get_row_type(B);
      /* aged OpenMPI implementations of Debian wheezy and jessie
         expect void* instead of const void*; hence we need to remove const */
      rval = MPI_Gatherv(
         const_cast<void*>(reinterpret_cast<const void*>(&A(0, 0))),
         sendcount, rowtype_A,
         &B(0, 0), &counts[0], &offsets[0], rowtype_B,
         root, comm);
      MPI_Type_free(&rowtype_B);
   } else {
      int sendcount = A.numRows();
      rval = MPI_Gatherv((void*) &A(0, 0), sendcount, rowtype_A,
         nullptr, nullptr, nullptr, nullptr,
         root, comm);
   }
   MPI_Type_free(&rowtype_A);
   return rval;
}

} } // namespaces mpi, hpc

#endif // HPC_MPI_MATRIX_HPP
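The resized row type pays off whenever a matrix view with non-contiguous rows is communicated: several consecutive rows can then still be transferred with a single count of the row data type. A hypothetical sketch (assuming at least two processes with rank already determined, and that GeMatrix blocks share the storage of the original matrix so that B.incRow() exceeds B.numCols()):

using namespace hpc::matvec;
using namespace hpc::mpi;

GeMatrix<double> A(10, 10, Order::RowMajor);
auto B = A.block(1, 1).dim(8, 8);        /* interior view: B.incRow() == 10, B.numCols() == 8 */
MPI_Datatype rowtype = get_row_type(B);  /* extent resized to B.incRow() * sizeof(double) */
if (rank == 0) {
   MPI_Send(&B(0, 0), /* count = */ int(B.numRows()), rowtype, 1, 0, MPI_COMM_WORLD);
} else if (rank == 1) {
   MPI_Status status;
   MPI_Recv(&B(0, 0), int(B.numRows()), rowtype, 0, 0, MPI_COMM_WORLD, &status);
}
MPI_Type_free(&rowtype);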