Exercise: GEMM for distributed matrices


The function hpc::mpi::mm is supposed to compute the general matrix-matrix product \(C \leftarrow \alpha A B + \beta C\) for distributed matrices.

Update for class hpc::mpi::Grid

We extend class hpc::mpi::Grid by row and column communicators that are created with MPI_Comm_split. Assume we have eight nodes arranged in a \(2 \times 4\) grid:

\[\begin{array}{cccc} (0,0) & (0,1) & (0,2) & (0,3) \\ (1,0) & (1,1) & (1,2) & (1,3) \\\end{array}\]

MPI_Comm_split creates the communicators commRow and commCol: commRow groups all nodes of the same grid row (for example \((0,0), (0,1), (0,2), (0,3)\)), whereas commCol groups all nodes of the same grid column (for example \((0,1)\) and \((1,1)\)):

#ifndef HPC_MPI_GRID_HPP
#define HPC_MPI_GRID_HPP 1

#include <cassert>
#include <mpi.h>

namespace hpc { namespace mpi {

struct Grid
{
    Grid()
    {
        MPI_Comm_size(MPI_COMM_WORLD, &nof_processes);
        MPI_Comm_rank(MPI_COMM_WORLD, &rank);

        int dims[2] = {0, 0};
        int periods[2] = {false, false};
        MPI_Dims_create(nof_processes, 2, dims);
        MPI_Cart_create(MPI_COMM_WORLD,
                        2,        // number of dimensions
                        dims,     // actual dimensions
                        periods,  // both dimensions are non-periodical
                        true,     // reorder is permitted
                        &comm     // newly created communication domain
                       );
        MPI_Comm_rank(comm, &rank); // update rank (could have changed)
        numNodeRows = dims[0];
        numNodeCols = dims[1];

        /* get our position within the grid */
        int coords[2];
        MPI_Cart_coords(comm, rank, 2, coords);
        nodeRow = coords[0];
        nodeCol = coords[1];

        /* split MPI_COMM_WORLD to get row and col comms */
        MPI_Comm_split(MPI_COMM_WORLD, nodeRow, rank, &commRow);
        MPI_Comm_split(MPI_COMM_WORLD, nodeCol, rank, &commCol);

        int rankInCol, rankInRow;

        MPI_Comm_rank(commRow, &rankInRow);
        MPI_Comm_rank(commCol, &rankInCol);

        assert(rankInCol == nodeRow);
        assert(rankInRow == nodeCol);
    }

    int         numNodeRows, numNodeCols;
    int         nodeRow, nodeCol;
    int         nof_processes, rank;

    MPI_Comm    comm, commRow, commCol;
};

} } // namespaces mpi, hpc

#endif // HPC_MPI_GRID_HPP
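
To illustrate the grouping, here is a small test program (a sketch, not part of the exercise material) that prints for each process its coordinates within the grid together with the sizes of its row and column communicators. Each commRow has numNodeCols members and each commCol has numNodeRows members:

#include <cstdio>
#include <mpi.h>
#include "grid.hpp"

int main(int argc, char** argv)
{
    MPI_Init(&argc, &argv);

    hpc::mpi::Grid grid;

    int rowSize, colSize;
    MPI_Comm_size(grid.commRow, &rowSize);
    MPI_Comm_size(grid.commCol, &colSize);

    std::printf("rank %d: position (%d,%d), commRow size %d, commCol size %d\n",
                grid.rank, grid.nodeRow, grid.nodeCol, rowSize, colSize);

    MPI_Finalize();
}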

Update for class hpc::mpi::GeMatrix

In the previous session, methods like rowOffset or colOffset received a node rank and then had to compute that node's position within the grid themselves. This was changed: the methods now receive a grid coordinate (the node row or the node column) directly.

This simplifies the implementation as follows:

#ifndef HPC_MPI_GEMATRIX_HPP
#define HPC_MPI_GEMATRIX_HPP 1

#include <cassert>
#include <memory>
#include <type_traits>
#include <mpi.h>
#include <hpc/aux/slices.hpp>
#include <hpc/matvec/gematrix.hpp>
#include <hpc/mpi/fundamental.hpp>
#include "grid.hpp"

namespace hpc { namespace mpi {

template <typename T>
struct GeMatrix
{
    GeMatrix(std::size_t numRows, std::size_t numCols, Grid grid)
        : numRows(numRows), numCols(numCols),
          rows(grid.numNodeRows, numRows),
          cols(grid.numNodeCols, numCols),
          grid(grid),
          buffer(rows.size(grid.nodeRow), cols.size(grid.nodeCol))
    {
    }

    std::size_t
    rowOffset(int nodeRow) const
    {
        return rows.offset(nodeRow);
    }

    std::size_t
    colOffset(int nodeCol) const
    {
        return cols.offset(nodeCol);
    }

    std::size_t
    numLocalRows(int nodeRow) const
    {
        return rows.size(nodeRow);
    }

    std::size_t
    numLocalCols(int nodeCol) const
    {
        return cols.size(nodeCol);
    }

    std::size_t                  numRows, numCols;
    hpc::aux::UniformSlices<int> rows, cols;
    Grid                         grid;
    hpc::matvec::GeMatrix<T>     buffer;
};

} } // namespaces mpi, hpc

#endif // HPC_MPI_GEMATRIX_HPP
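
The actual partitioning is done by hpc::aux::UniformSlices. The following sketch (assuming just the constructor and the offset and size methods used above) shows, for instance, how ten columns are split among the four node columns of the \(2 \times 4\) grid from the example above:

#include <cstdio>
#include <hpc/aux/slices.hpp>

int main()
{
    // ten columns distributed over four node columns
    hpc::aux::UniformSlices<int> cols(4, 10);

    for (int c = 0; c < 4; ++c) {
        std::printf("node column %d: offset %d, %d local columns\n",
                    c, (int) cols.offset(c), (int) cols.size(c));
    }
}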

Update for scatter and gather

The scatter and gather functions were adapted to the above modifications:

#ifndef HPC_MPI_COPY_HPP
#define HPC_MPI_COPY_HPP 1

#include <cassert>
#include <mpi.h>
#include <hpc/matvec/gematrix.hpp>
#include <hpc/matvec/copy.hpp>
#include "gematrix.hpp"
#include <hpc/mpi/matrix.hpp>

namespace hpc { namespace mpi {

// scatter: Basically this is 'scatter_by_block' from session 25
template <typename T>
void
copy(const hpc::matvec::GeMatrix<T> &A, GeMatrix<T> &B)
{
    auto &grid = B.grid;

    if (grid.rank == 0) {
        MPI_Request requests[grid.nof_processes-1]; int ri = 0;
        for (int r = 0; r < grid.nof_processes; ++r) {
            // get position of r in grid
            int coords[2];
            MPI_Cart_coords(grid.comm, r, 2, coords);

            // get offset and dimension of block stored by r
            auto i0 = B.rowOffset(coords[0]);
            auto j0 = B.colOffset(coords[1]);
            auto m0 = B.numLocalRows(coords[0]);
            auto n0 = B.numLocalCols(coords[1]);

            auto A_ = A.block(i0, j0).dim(m0, n0);

            if (r == 0) {
                hpc::matvec::copy(A_, B.buffer);
            } else {
                /* aged OpenMPI implementations of Debian wheezy and jessie
                   expect void* instead of const void*;
                   hence we need to remove const */
                MPI_Isend(
                    const_cast<void*>(reinterpret_cast<const void*>(&A_(0, 0))),
                    1, get_type(A_), r, 0, grid.comm, &requests[ri++]);
            }
        }
        for (auto& request: requests) {
            MPI_Status status;
            MPI_Wait(&request, &status);
        }
    } else {
        auto &B_ = B.buffer;

        MPI_Status status;
        MPI_Recv(&B_(0, 0), 1, get_type(B_), 0, 0, grid.comm, &status);
    }
}

// gather: Basically this is 'gather_by_block' from session 25
template <typename T>
void
copy(const GeMatrix<T> &A, hpc::matvec::GeMatrix<T> &B)
{
    auto &grid = A.grid;

    if (grid.rank == 0) {
        MPI_Request requests[grid.nof_processes-1]; int ri = 0;
        for (int r = 0; r < grid.nof_processes; ++r) {
            // get position of r in grid
            int coords[2];
            MPI_Cart_coords(grid.comm, r, 2, coords);

            // get offset and dimension of block stored by r
            auto i0 = A.rowOffset(coords[0]);
            auto j0 = A.colOffset(coords[1]);
            auto m0 = A.numLocalRows(coords[0]);
            auto n0 = A.numLocalCols(coords[1]);

            auto B_ = B.block(i0, j0).dim(m0, n0);
            if (r == 0) {
                hpc::matvec::copy(A.buffer, B_);
            } else {
                MPI_Irecv(&B_(0, 0), 1, get_type(B_),
                          r, 0, grid.comm, &requests[ri++]);
            }
        }
        for (auto& request: requests) {
            MPI_Status status;
            MPI_Wait(&request, &status);
        }
    } else {
        auto &A_ = A.buffer;
        /* aged OpenMPI implementations of Debian wheezy and jessie
           expect void* instead of const void*; hence we need to remove const */
        MPI_Send(const_cast<void*>(reinterpret_cast<const void*>(&A_(0, 0))),
                 1, get_type(A_), 0, 0, grid.comm);
    }
}

} } // namespaces mpi, hpc

#endif // HPC_MPI_COPY_HPP
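
For illustration, here is a minimal round trip through these two overloads (a sketch; it is essentially what the test program at the end of this page does): rank 0 initializes a matrix, scatters it onto the grid, and gathers it back.

#include <mpi.h>
#include <hpc/matvec/gematrix.hpp>
#include <hpc/matvec/iterators.hpp>
#include "grid.hpp"
#include "gematrix.hpp"
#include "copy.hpp"

int main(int argc, char** argv)
{
    MPI_Init(&argc, &argv);

    hpc::mpi::Grid grid;

    hpc::matvec::GeMatrix<double> A(8, 10), A2(8, 10);
    hpc::mpi::GeMatrix<double>    dA(8, 10, grid);

    if (grid.rank == 0) {
        for (auto [i, j, Aij]: A) {
            Aij = i * 10 + j;
        }
    }

    hpc::mpi::copy(A, dA);   // scatter: rank 0 distributes the blocks
    hpc::mpi::copy(dA, A2);  // gather: rank 0 collects the blocks into A2

    MPI_Finalize();
}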

Skeleton for hpc::mpi::mm

The skeleton already implements the broadcast of blocks from \(A\):

#ifndef HPC_MPI_GEMM_HPP
#define HPC_MPI_GEMM_HPP 1

#include <cassert>
#include <mpi.h>
#include <hpc/matvec/gematrix.hpp>
#include <hpc/matvec/copy.hpp>
#include <hpc/matvec/mm.hpp>
#include "gematrix.hpp"
#include <hpc/mpi/matrix.hpp>

namespace hpc { namespace mpi {

template <typename T>
void
mm(T alpha, const GeMatrix<T> &A, const GeMatrix<T> &B, T beta, GeMatrix<T> &C)
{
    auto grid        = A.grid;

    assert(A.numCols == B.numRows);
    assert(C.numRows == A.numRows);
    assert(C.numCols == B.numCols);

    std::size_t k    = A.numCols;
    std::size_t bs   = 1;

    assert(C.numLocalRows(grid.nodeRow) == A.numLocalRows(grid.nodeRow));
    assert(C.numLocalCols(grid.nodeCol) == B.numLocalCols(grid.nodeCol));
    assert(C.rowOffset(grid.nodeRow) == A.rowOffset(grid.nodeRow));
    assert(C.colOffset(grid.nodeCol) == B.colOffset(grid.nodeCol));

    auto i0 = A.rowOffset(grid.nodeRow);
    auto j0 = B.colOffset(grid.nodeCol);

    auto m0 = A.numLocalRows(grid.nodeRow);
    auto n0 = B.numLocalCols(grid.nodeCol);

    hpc::matvec::GeMatrix<T>    A_(m0, bs);
    hpc::matvec::GeMatrix<T>    B_(bs, n0);

    auto typeA_ = get_type(A_);
    auto typeB_ = get_type(B_);


    for (std::size_t l=0; l<k; ++l) {

        // Broadcast block of A to node row
        int rootA = -1;     // node column that owns global column l of A
        for (int c=0; c<grid.numNodeCols; ++c) {
            auto l0 = A.colOffset(c);
            auto l1 = l0 + A.numLocalCols(c);

            // check if we have the block
            if (l >= l0 && l < l1) {
                hpc::matvec::copy(A.buffer.block(0, l-l0).dim(m0, bs), A_);
                rootA = c;
            }
        }
        MPI_Bcast(&A_(0,0), 1, typeA_, rootA, grid.commRow);

        // Broadcast block of B to node col
        /*

            TODO: Your code

        */

        T beta_ = (l == 0) ? beta : T(1);  // apply beta only in the first update step

        hpc::matvec::mm(alpha, A_, B_, beta_, C.buffer);
    }
}

} } // namespaces mpi, hpc

#endif // HPC_MPI_GEMM_HPP
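
To see how the given broadcast works, consider the dimensions of the test program below (\(m = 8\), \(k = 12\)) on the \(2 \times 4\) grid assumed above: each node column then stores three consecutive columns of \(A\). For \(l = 7\) the loop finds \(l_0 = 6\) and \(l_1 = 9\) at node column \(c = 2\), so the processes of node column 2 copy their local column \(l - l_0 = 1\) into A_ and become the root of the MPI_Bcast within their respective node row. The missing part of the exercise distributes the corresponding block of \(B\) within each node column in the same fashion.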

Exercise

Complete the skeleton of hpc::mpi::mm: broadcast the block of \(B\) within each node column, analogously to the broadcast of the block of \(A\) within each node row, and verify your implementation with the test program below.

Test program

#include <hpc/matvec/print.hpp>
#include "grid.hpp"
#include "gematrix.hpp"
#include "copy.hpp"
#include "gemm.hpp"
#include <mpi.h>
#include <hpc/matvec/iterators.hpp>
#include <hpc/matvec/axpy.hpp>
#include <hpc/matvec/mm.hpp>


int
main(int argc, char** argv)
{
    MPI_Init(&argc, &argv);

    hpc::mpi::Grid          grid;

    std::size_t m = 8;
    std::size_t n = 10;
    std::size_t k = 12;

    hpc::matvec::GeMatrix<double>    A(m, k), B(k, n), C(m, n),
                                     Cref(m, n), Cmpi(m,n);
    double                           alpha = 1, beta = 0;
    hpc::mpi::GeMatrix<double>       dA(m, k, grid),
                                     dB(k, n, grid),
                                     dC(m, n, grid);

    if (grid.rank==0) {
        for (auto [i, j, Aij]: A) {
            Aij = i * 100 + j;
        }
        for (auto [i, j, Bij]: B) {
            Bij = i * 10 + j;
        }
        for (auto [i, j, Cij]: C) {
            Cij = i * 100 + j;
        }
        fmt::printf("A =\n");
        hpc::matvec::print(A);
        fmt::printf("B =\n");
        hpc::matvec::print(B);
        fmt::printf("C = \n");
        hpc::matvec::print(C);
        fmt::printf("C <- %f * A*B + %f*C\n", alpha, beta);
        hpc::matvec::copy(C, Cref);
        hpc::matvec::mm(alpha, A, B, beta, Cref);
        fmt::printf("Cref =\n");
        hpc::matvec::print(Cref);
    }

    hpc::mpi::copy(A, dA);
    hpc::mpi::copy(B, dB);
    hpc::mpi::copy(C, dC);

    hpc::mpi::mm(alpha, dA, dB, beta, dC);

    hpc::mpi::copy(dC, Cmpi);

    if (grid.rank==0) {
        fmt::printf("Cmpi =\n");
        hpc::matvec::print(Cmpi);

        hpc::matvec::axpy(-1, Cref, Cmpi);

        fmt::printf("Cref - Cmpi =\n");
        hpc::matvec::print(Cmpi);
    }

    MPI_Finalize();
}

You can compile the test program on theon with

theon$ mpic++ -g -std=c++17 -I. -I/home/numerik/pub/hpc/ws18/session25 -o test_gemm test_gemm.cpp
theon$
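
Afterwards you can run the test program with eight processes, matching the grid from the example above. Depending on the local MPI installation this should look like:

theon$ mpirun -np 8 ./test_gemm
theon$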