#include <cassert>
#include <cmath>
#include <cstddef>
#include <mpi.h>
#include <gdk-pixbuf/gdk-pixbuf.h>
#include <fmt/printf.h>
/* hpc library headers (paths assumed from the hpc::* namespaces used below) */
#include <hpc/aux/hsvcolor.hpp>
#include <hpc/aux/slices.hpp>
#include <hpc/matvec/copy.hpp>
#include <hpc/matvec/gematrix.hpp>
#include <hpc/matvec/iterators.hpp>
#include <hpc/matvec/matrix2pixbuf.hpp>
#include <hpc/mpi/fundamental.hpp>
#include <hpc/mpi/matrix.hpp>

using namespace hpc;

template<typename T>
const T PI = std::acos(T(-1.0));
template<typename T>
const T E = std::exp(T(1.0));
template<typename T>
const T E_POWER_MINUS_PI = std::pow(E<T>, -PI<T>);

/* used to specify an I/O transfer with one of our neighbors;
   each process has four input and four output operations per iteration */
struct IOTransfer {
   int rank;              /* rank of neighbor */
   void* addr;            /* start address */
   MPI_Datatype datatype; /* either a column or row */
};

/* perform a Jacobi iteration:
      A: previous state, i.e. A_n
      B: to be computed state A_{n+1}
      in_vectors & out_vectors: I/O operations with our four neighbors,
         reading into B (outer border) and sending from B (inner border) */
template<typename T, template<typename> typename Matrix,
   Require<Ge<Matrix<T>>> = true>
T jacobi_iteration(const Matrix<T>& A, Matrix<T>& B,
      const IOTransfer (&in_vectors)[4], const IOTransfer (&out_vectors)[4],
      MPI_Comm grid) {
   assert(A.numRows() > 2 && A.numCols() > 2);
   T maxdiff = 0;
   auto jacobi = [&](std::size_t i, std::size_t j) {
      B(i, j) = 0.25 * (A(i - 1, j) + A(i + 1, j) + A(i, j - 1) + A(i, j + 1));
      T diff = std::fabs(A(i, j) - B(i, j));
      if (diff > maxdiff) maxdiff = diff;
   };
   /* compute the border first which is sent in advance to our neighbors */
   for (std::size_t i = 1; i + 1 < B.numRows(); ++i) {
      jacobi(i, 1); jacobi(i, B.numCols()-2);
   }
   for (std::size_t j = 2; j + 2 < B.numCols(); ++j) {
      jacobi(1, j); jacobi(B.numRows()-2, j);
   }
   /* send border to our neighbors and initiate the receive operations */
   MPI_Request requests[8]; int ri = 0;
   for (auto& in_vector: in_vectors) {
      MPI_Irecv(in_vector.addr, 1, in_vector.datatype,
         in_vector.rank, 0, grid, &requests[ri++]);
   }
   for (auto& out_vector: out_vectors) {
      MPI_Isend(out_vector.addr, 1, out_vector.datatype,
         out_vector.rank, 0, grid, &requests[ri++]);
   }
   /* compute the inner block in parallel to the initiated communication */
   for (std::size_t i = 2; i + 2 < B.numRows(); ++i) {
      for (std::size_t j = 2; j + 2 < B.numCols(); ++j) {
         jacobi(i, j);
      }
   }
   /* wait for the initiated I/O to finish */
   for (auto& request: requests) {
      MPI_Status status;
      MPI_Wait(&request, &status);
   }
   return maxdiff;
}

int main(int argc, char** argv) {
   MPI_Init(&argc, &argv);
   int nof_processes; MPI_Comm_size(MPI_COMM_WORLD, &nof_processes);
   int rank; MPI_Comm_rank(MPI_COMM_WORLD, &rank);

   using namespace hpc::matvec;
   using namespace hpc::mpi;
   using namespace hpc::aux;

   using T = double;
   using Matrix = GeMatrix<T>;

   /* create two-dimensional Cartesian grid for our processes */
   int dims[2] = {0, 0}; int periods[2] = {false, false};
   MPI_Dims_create(nof_processes, 2, dims);
   MPI_Comm grid;
   MPI_Cart_create(MPI_COMM_WORLD,
      2,        // number of dimensions
      dims,     // actual dimensions
      periods,  // both dimensions are non-periodic
      true,     // reorder is permitted
      &grid     // newly created communication domain
   );
   MPI_Comm_rank(grid, &rank); // update rank (could have changed)

   /* initialize the entire matrix, including its borders */
   Matrix A(100, 100, Order::RowMajor);
   if (rank == 0) {
      for (auto [i, j, Aij]: A) {
         if (j == 0) {
            Aij = std::sin(PI<T> * (T(i)/(A.numRows()-1)));
         } else if (j == A.numCols() - 1) {
            Aij = std::sin(PI<T> * (T(i)/(A.numRows()-1))) * E_POWER_MINUS_PI<T>;
         } else {
            Aij = 0;
         }
      }
   }

   /* get our position within the grid */
   int coords[2]; MPI_Cart_coords(grid, rank, 2, coords);

   /* create matrices B1, B2 for our subarea */
   int overlap = 1;
   UniformSlices<std::size_t> rows(dims[0], A.numRows() - 2*overlap);
   UniformSlices<std::size_t> columns(dims[1], A.numCols() - 2*overlap);
   Matrix B1(rows.size(coords[0]) + 2*overlap,
      columns.size(coords[1]) + 2*overlap, Order::RowMajor);
   Matrix B2(rows.size(coords[0]) + 2*overlap,
      columns.size(coords[1]) + 2*overlap, Order::RowMajor);

   /* distribute main body of A including left and right border */
   scatter_by_block(A, B1, 0, grid, overlap);
   copy(B1, B2); /* actually just the border needs to be copied */

   /* get the process numbers of our neighbors */
   int left, right, upper, lower;
   MPI_Cart_shift(grid, 0, 1, &upper, &lower);
   MPI_Cart_shift(grid, 1, 1, &left, &right);

   /* compute type for inner rows and cols without the border */
   auto B_inner_row = B1.block(0, 1).dim(overlap, B1.numCols() - 2*overlap);
   MPI_Datatype rowtype = get_type(B_inner_row);
   auto B_inner_col = B1.block(0, 1).dim(B1.numRows() - 2*overlap, overlap);
   MPI_Datatype coltype = get_type(B_inner_col);

   IOTransfer B1_in_vectors[] = {
      {left, &B1(1, 0), coltype},
      {upper, &B1(0, 1), rowtype},
      {right, &B1(1, B1.numCols()-overlap), coltype},
      {lower, &B1(B1.numRows()-overlap, 1), rowtype},
   };
   IOTransfer B1_out_vectors[] = {
      {left, &B1(1, 1), coltype},
      {upper, &B1(1, 1), rowtype},
      {right, &B1(1, B1.numCols()-2*overlap), coltype},
      {lower, &B1(B1.numRows()-2*overlap, 1), rowtype},
   };
   IOTransfer B2_in_vectors[] = {
      {left, &B2(1, 0), coltype},
      {upper, &B2(0, 1), rowtype},
      {right, &B2(1, B2.numCols()-overlap), coltype},
      {lower, &B2(B2.numRows()-overlap, 1), rowtype},
   };
   IOTransfer B2_out_vectors[] = {
      {left, &B2(1, 1), coltype},
      {upper, &B2(1, 1), rowtype},
      {right, &B2(1, B2.numCols()-2*overlap), coltype},
      {lower, &B2(B2.numRows()-2*overlap, 1), rowtype},
   };

   /* main loop for the Jacobi iterations */
   T eps = 1e-6;
   unsigned int iterations;
   for (iterations = 0; ; ++iterations) {
      jacobi_iteration(B1, B2, B2_in_vectors, B2_out_vectors, grid);
      T maxdiff = jacobi_iteration(B2, B1, B1_in_vectors, B1_out_vectors, grid);
      if (iterations % 10 == 0) {
         T global_max;
         MPI_Reduce(&maxdiff, &global_max, 1, get_type(maxdiff),
            MPI_MAX, 0, grid);
         MPI_Bcast(&global_max, 1, get_type(maxdiff), 0, grid);
         if (global_max < eps) break;
      }
   }
   if (rank == 0) fmt::printf("%d iterations\n", iterations);

   /* collect results */
   gather_by_block(B1, A, 0, grid, overlap);

   MPI_Finalize();

   /* generate JPG file with the result */
   if (rank == 0) {
      auto pixbuf = create_pixbuf(A, [](T val) -> HSVColor<float> {
         return HSVColor<float>((1-val) * 240, 1, 1);
      }, 8);
      gdk_pixbuf_save(pixbuf, "jacobi.jpg", "jpeg", nullptr,
         "quality", "100", nullptr);
   }
}