#include <cassert>
#include <cmath>
#include <printf.hpp>
#include <mpi.h>
#include <hpc/aux/hsvcolor.hpp>
#include <hpc/aux/slices.hpp>
#include <hpc/matvec/copy.hpp>
#include <hpc/matvec/gematrix.hpp>
#include <hpc/matvec/iterators.hpp>
#include <hpc/matvec/matrix2pixbuf.hpp>
#include <hpc/mpi/matrix.hpp>
#include <hpc/mpi/vector.hpp>
using namespace hpc;
template<typename T>
const T PI = std::acos(T(-1.0));
template<typename T>
const T E = std::exp(T(1.0));
template<typename T>
const T E_POWER_MINUS_PI = std::pow(E<T>, -PI<T>);
/* used to specify an I/O transfer with one
   of our neighbors;
   each process has four input and four output
   operations per iteration
*/
struct IOTransfer {
   int rank; /* rank of neighbor */
   void* addr; /* start address */
   MPI_Datatype datatype; /* either a column or row */
};
/* perform a Jacobi iteration:
      A: previous state, i.e. A_n
      B: to be computed state A_{n+1}
      in_vectors & out_vectors:
	 I/O operations with our four neighbors,
	 reading into B (outer border) and
	 sending from B (inner border)
*/
template<typename T, template<typename> typename Matrix,
   Require<Ge<Matrix<T>>> = true>
T jacobi_iteration(const Matrix<T>& A, Matrix<T>& B,
      const IOTransfer (&in_vectors)[4],
      const IOTransfer (&out_vectors)[4],
      MPI_Comm grid) {
   assert(A.numRows() > 2 && A.numCols() > 2);
   T maxdiff = 0;
   auto jacobi = [&](std::size_t i, std::size_t j) {
      B(i, j) = 0.25 *
	 (A(i - 1, j) + A(i + 1, j) + A(i, j - 1) + A(i, j + 1));
      T diff = std::fabs(A(i, j) - B(i, j));
      if (diff > maxdiff) maxdiff = diff;
   };
   /* compute the border first which is sent in advance
      to our neighbors */
   for (std::size_t i = 1; i + 1 < B.numRows(); ++i) {
      jacobi(i, 1);
      jacobi(i, B.numCols()-2);
   }
   for (std::size_t j = 2; j + 2 < B.numCols(); ++j) {
      jacobi(1, j);
      jacobi(B.numRows()-2, j);
   }
   /* send border to our neighbors and
      initiate the receive operations */ 
   MPI_Request requests[8]; int ri = 0;
   for (auto& in_vector: in_vectors) {
      MPI_Irecv(in_vector.addr, 1, in_vector.datatype,
	 in_vector.rank, 0, grid, &requests[ri++]);
   }
   for (auto& out_vector: out_vectors) {
      MPI_Isend(out_vector.addr, 1, out_vector.datatype,
	 out_vector.rank, 0, grid, &requests[ri++]);
   }
   /* compute the inner block in parallel
      to the initiated communication */
   for (std::size_t i = 2; i + 2 < B.numRows(); ++i) {
      for (std::size_t j = 2; j + 2 < B.numCols(); ++j) {
	 jacobi(i, j);
      }
   }
   /* wait for the initiated I/O to finish */
   for (auto& request: requests) {
      MPI_Status status;
      MPI_Wait(&request, &status);
   }
   return maxdiff;
}
int main(int argc, char** argv) {
   MPI_Init(&argc, &argv);
   int nof_processes; MPI_Comm_size(MPI_COMM_WORLD, &nof_processes);
   int rank; MPI_Comm_rank(MPI_COMM_WORLD, &rank);
   using namespace hpc::matvec;
   using namespace hpc::mpi;
   using namespace hpc::aux;
   using T = double;
   using Matrix = GeMatrix<T>;
   /* create two-dimensional Cartesian grid for our prcesses */
   int dims[2] = {0, 0}; int periods[2] = {false, false};
   MPI_Dims_create(nof_processes, 2, dims);
   MPI_Comm grid;
   MPI_Cart_create(MPI_COMM_WORLD,
      2,        // number of dimensions
      dims,     // actual dimensions
      periods,  // both dimensions are non-periodical
      true,     // reorder is permitted
      &grid     // newly created communication domain
   );
   MPI_Comm_rank(grid, &rank); // update rank (could have changed)
   /* initialize the entire matrix, including its borders */
   Matrix A(100, 100, Order::RowMajor);
   if (rank == 0) {
      for (auto [i, j, Aij]: A) {
	 if (j == 0) {
	    Aij = std::sin(PI<T> * (T(i)/(A.numRows()-1)));
	 } else if (j == A.numCols() - 1) {
	    Aij = std::sin(PI<T> * (T(i)/(A.numRows()-1))) *
	       E_POWER_MINUS_PI<T>;
	 } else {
	    Aij = 0;
	 }
      }
   }
   /* get our position within the grid */
   int coords[2];
   MPI_Cart_coords(grid, rank, 2, coords);
   /* create matrices B1, B2 for our subarea */
   int overlap = 1;
   UniformSlices<int> rows(dims[0], A.numRows() - 2*overlap);
   UniformSlices<int> columns(dims[1], A.numCols() - 2*overlap);
   Matrix B1(rows.size(coords[0]) + 2*overlap,
	     columns.size(coords[1]) + 2*overlap,
             Order::RowMajor);
   Matrix B2(rows.size(coords[0]) + 2*overlap,
             columns.size(coords[1]) + 2*overlap,
             Order::RowMajor);
   /* distribute main body of A including left and right border */
   scatter_by_block(A, B1, 0, grid, overlap);
   copy(B1, B2); /* actually just the border needs to be copied */
   /* get the process numbers of our neighbors */
   int left, right, upper, lower;
   MPI_Cart_shift(grid, 0, 1, &upper, &lower);
   MPI_Cart_shift(grid, 1, 1, &left, &right);
   /* compute type for inner rows and cols without the border */
   auto B_inner_row = B1.block(0, 1).dim(overlap, B1.numCols() - 2*overlap);
   MPI_Datatype rowtype = get_type(B_inner_row);
   auto B_inner_col = B1.block(0, 1).dim(B1.numRows() - 2*overlap, overlap);
   MPI_Datatype coltype = get_type(B_inner_col);
   IOTransfer B1_in_vectors[] = {
      {left, &B1(1, 0), coltype},
      {upper, &B1(0, 1), rowtype},
      {right, &B1(1, B1.numCols()-overlap), coltype},
      {lower, &B1(B1.numRows()-overlap, 1), rowtype},
   };
   IOTransfer B1_out_vectors[] = {
      {left, &B1(1, 1), coltype},
      {upper, &B1(1, 1), rowtype},
      {right, &B1(1, B1.numCols()-2*overlap), coltype},
      {lower, &B1(B1.numRows()-2*overlap, 1), rowtype},
   };
   IOTransfer B2_in_vectors[] = {
      {left, &B2(1, 0), coltype},
      {upper, &B2(0, 1), rowtype},
      {right, &B2(1, B2.numCols()-overlap), coltype},
      {lower, &B2(B2.numRows()-overlap, 1), rowtype},
   };
   IOTransfer B2_out_vectors[] = {
      {left, &B2(1, 1), coltype},
      {upper, &B2(1, 1), rowtype},
      {right, &B2(1, B2.numCols()-2*overlap), coltype},
      {lower, &B2(B2.numRows()-2*overlap, 1), rowtype},
   };
   /* main loop for the Jacobi iterations */
   T eps = 1e-6; unsigned int iterations;
   for (iterations = 0; ; ++iterations) {
      T maxdiff = jacobi_iteration(B1, B2, B2_in_vectors, B2_out_vectors,
	 grid);
      maxdiff = jacobi_iteration(B2, B1, B1_in_vectors, B1_out_vectors, grid);
      if (iterations % 10 == 0) {
	 T global_max;
	 MPI_Reduce(&maxdiff, &global_max, 1, get_type(maxdiff),
	    MPI_MAX, 0, grid);
	 MPI_Bcast(&global_max, 1, get_type(maxdiff), 0, grid);
	 if (global_max < eps) break;
      }
   }
   if (rank == 0) fmt::printf("%d iterations\n", iterations);
   /* collect results */
   gather_by_block(B1, A, 0, grid, overlap);
   MPI_Finalize();
   /* generate JPG file with the result */
   if (rank == 0) {
      auto pixbuf = create_pixbuf(A, [](T val) -> HSVColor<float> {
	 return HSVColor<float>((1-val) * 240, 1, 1);
      }, 8);
      gdk_pixbuf_save(pixbuf, "jacobi.jpg", "jpeg", nullptr,
	 "quality", "100", nullptr);
   }
}