# Exercise: GEMM for distributed matrices

#### Content

The function hpc::mpi::mm is supposed to compute the general matrix-matrix product for distributed matrices.

## Update for class hpc::mpi::Grid

We extend the class hpc::mpi::Grid with row and column communicators that are created using MPI_Comm_split. Assume we have eight nodes within a $$2 \times 4$$ grid:

$$\begin{array}{cccc} (0,0) & (0,1) & (0,2) & (0,3) \\ (1,0) & (1,1) & (1,2) & (1,3) \end{array}$$

MPI_Comm_split will create the communicators commRow and commCol, which group the nodes as follows:

• commRow connects either:

  • the nodes where nodeRow is 0, i.e. $$(0,0)$$, $$(0,1)$$, $$(0,2)$$, $$(0,3)$$,

  • or the nodes where nodeRow is 1, i.e. $$(1,0)$$, $$(1,1)$$, $$(1,2)$$, $$(1,3)$$.

• commCol connects either:

  • the nodes where nodeCol is 0, i.e. $$(0,0)$$, $$(1,0)$$,

  • or the nodes where nodeCol is 1, i.e. $$(0,1)$$, $$(1,1)$$,

  • or the nodes where nodeCol is 2, i.e. $$(0,2)$$, $$(1,2)$$,

  • or the nodes where nodeCol is 3, i.e. $$(0,3)$$, $$(1,3)$$.
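
As a reminder of the MPI_Comm_split semantics used for this: all processes that pass the same color end up in the same new communicator, and the key determines their rank within it. A minimal, self-contained sketch, assuming eight processes and a hard-coded row-major mapping of ranks to the $$2 \times 4$$ grid (the real Grid class obtains the coordinates from MPI_Cart_coords instead):

```cpp
#include <mpi.h>
#include <cstdio>

int main(int argc, char** argv)
{
    MPI_Init(&argc, &argv);
    int rank;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);

    // assumed mapping: rank r sits at grid position (r / 4, r % 4)
    int nodeRow = rank / 4;
    int nodeCol = rank % 4;

    MPI_Comm commRow;
    MPI_Comm_split(MPI_COMM_WORLD,
                   nodeRow, /* color: same color => same communicator */
                   rank,    /* key: determines ordering within the group */
                   &commRow);

    int rankInRow;
    MPI_Comm_rank(commRow, &rankInRow);
    // within commRow the rank equals the column index nodeCol
    std::printf("rank %d = (%d,%d): rank within commRow = %d\n",
                rank, nodeRow, nodeCol, rankInRow);

    MPI_Comm_free(&commRow);
    MPI_Finalize();
}
```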

```cpp
#ifndef HPC_MPI_GRID_HPP
#define HPC_MPI_GRID_HPP 1

#include <cassert>
#include <mpi.h>

namespace hpc { namespace mpi {

struct Grid
{
Grid()
{
MPI_Comm_size(MPI_COMM_WORLD, &nof_processes);
MPI_Comm_rank(MPI_COMM_WORLD, &rank);

int dims[2] = {0, 0};
int periods[2] = {false, false};
MPI_Dims_create(nof_processes, 2, dims);
MPI_Cart_create(MPI_COMM_WORLD,
2,        // number of dimensions
dims,     // actual dimensions
periods,  // both dimensions are non-periodical
true,     // reorder is permitted
&comm     // newly created communication domain
);
MPI_Comm_rank(comm, &rank); // update rank (could have changed)
numNodeRows = dims[0];
numNodeCols = dims[1];

/* get our position within the grid */
int coords[2];
MPI_Cart_coords(comm, rank, 2, coords);
nodeRow = coords[0];
nodeCol = coords[1];

/* split MPI_COMM_WORLD to get row and col comms */
MPI_Comm_split(MPI_COMM_WORLD, nodeRow, rank, &commRow);
MPI_Comm_split(MPI_COMM_WORLD, nodeCol, rank, &commCol);

int rankInCol, rankInRow;

MPI_Comm_rank(commRow, &rankInRow);
MPI_Comm_rank(commCol, &rankInCol);

assert(rankInCol == nodeRow);
assert(rankInRow == nodeCol);
}

int         numNodeRows, numNodeCols;
int         nodeRow, nodeCol;
int         nof_processes, rank;

MPI_Comm    comm, commRow, commCol;
};

} } // namespaces mpi, hpc

#endif // HPC_MPI_GRID_HPP
```
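
A small, hypothetical test of the extended class could let every process report its grid coordinates together with its ranks within the row and column communicators (this file is not part of the session material):

```cpp
#include <mpi.h>
#include <printf.hpp>
#include "grid.hpp"

int main(int argc, char** argv)
{
    MPI_Init(&argc, &argv);
    hpc::mpi::Grid grid;

    int rankInRow, rankInCol;
    MPI_Comm_rank(grid.commRow, &rankInRow);
    MPI_Comm_rank(grid.commCol, &rankInCol);

    // each process prints its coordinates and its ranks in the split communicators
    fmt::printf("rank %d: node (%d,%d) in a %d x %d grid, "
                "rank in commRow = %d, rank in commCol = %d\n",
                grid.rank, grid.nodeRow, grid.nodeCol,
                grid.numNodeRows, grid.numNodeCols,
                rankInRow, rankInCol);

    MPI_Finalize();
}
```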


## Update for class hpc::mpi::GeMatrix

In the previous session, methods like rowOffset or colOffset received a node rank and then computed the corresponding position within the grid. This has been changed:

• Methods that return the row offset or the number of locally stored rows now receive just the node row, and

• methods that return the col offset or the number of locally stored cols now receive just the node col.

This simplifies the implementation as follows:

```cpp
#ifndef HPC_MPI_GEMATRIX_HPP
#define HPC_MPI_GEMATRIX_HPP 1

#include <cassert>
#include <memory>
#include <type_traits>
#include <mpi.h>
#include <hpc/aux/slices.hpp>
#include <hpc/matvec/gematrix.hpp>
#include <hpc/mpi/fundamental.hpp>
#include "grid.hpp"

namespace hpc { namespace mpi {

template <typename T>
struct GeMatrix
{
GeMatrix(std::size_t numRows, std::size_t numCols, Grid grid)
: numRows(numRows), numCols(numCols),
rows(grid.numNodeRows, numRows),
cols(grid.numNodeCols, numCols),
grid(grid),
buffer(rows.size(grid.nodeRow),cols.size(grid.nodeCol))
{
}

std::size_t
rowOffset(int nodeRow) const
{
return rows.offset(nodeRow);
}

std::size_t
colOffset(int nodeCol) const
{
return cols.offset(nodeCol);
}

std::size_t
numLocalRows(int nodeRow) const
{
return rows.size(nodeRow);
}

std::size_t
numLocalCols(int nodeCol) const
{
return cols.size(nodeCol);
}

std::size_t                  numRows, numCols;
hpc::aux::UniformSlices<int> rows, cols;
Grid                         grid;
hpc::matvec::GeMatrix<T>     buffer;
};

} } // namespaces mpi, hpc

#endif // HPC_MPI_GEMATRIX_HPP
```
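
The actual partitioning of rows and columns is delegated to hpc::aux::UniformSlices, exactly as in the constructor above. A hypothetical stand-alone sketch of how the offsets and local sizes could be inspected for an $$8 \times 10$$ matrix on the $$2 \times 4$$ grid (the concrete values depend on how UniformSlices distributes the remainder):

```cpp
#include <cstdio>
#include <hpc/aux/slices.hpp>

int main()
{
    // 8 matrix rows over 2 node rows, 10 matrix cols over 4 node cols
    hpc::aux::UniformSlices<int> rows(2, 8);
    hpc::aux::UniformSlices<int> cols(4, 10);

    for (int r = 0; r < 2; ++r) {
        std::printf("node row %d: rowOffset = %d, numLocalRows = %d\n",
                    r, rows.offset(r), rows.size(r));
    }
    for (int c = 0; c < 4; ++c) {
        std::printf("node col %d: colOffset = %d, numLocalCols = %d\n",
                    c, cols.offset(c), cols.size(c));
    }
}
```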


## Update for scatter and gather

The scatter and gather operations were adapted to the above modifications:

```cpp
#ifndef HPC_MPI_COPY_HPP
#define HPC_MPI_COPY_HPP 1

#include <cassert>
#include <mpi.h>
#include <hpc/matvec/gematrix.hpp>
#include <hpc/matvec/copy.hpp>
#include "gematrix.hpp"
#include <hpc/mpi/matrix.hpp>

namespace hpc { namespace mpi {

// scatter: Basically this is 'scatter_by_block' from session 25
template <typename T>
void
copy(const hpc::matvec::GeMatrix<T> &A, GeMatrix<T> &B)
{
auto &grid        = B.grid;

auto &rows        = B.rows;
auto &columns     = B.cols;

if (grid.rank == 0) {
MPI_Request requests[grid.nof_processes-1]; int ri = 0;
for (int r = 0; r < grid.nof_processes; ++r) {
// get position of r in grid
int coords[2];
MPI_Cart_coords(grid.comm, r, 2, coords);

// get offset and dimension of block stored by r
auto i0 = B.rowOffset(coords[0]);
auto j0 = B.colOffset(coords[1]);
auto m0 = B.numLocalRows(coords[0]);
auto n0 = B.numLocalCols(coords[1]);

auto A_ = A.block(i0, j0).dim(m0, n0);

if (r == 0) {
hpc::matvec::copy(A_, B.buffer);
} else {
/* aged OpenMPI implementations of Debian wheezy and jessie
expect void* instead of const void*;
hence we need to remove const */
MPI_Isend(
const_cast<void*>(reinterpret_cast<const void*>(&A_(0, 0))),
1, get_type(A_), r, 0, grid.comm, &requests[ri++]);
}
}
for (auto& request: requests) {
MPI_Status status;
MPI_Wait(&request, &status);
}
} else {
auto &B_ = B.buffer;

MPI_Status status;
MPI_Recv(&B_(0, 0), 1, get_type(B_), 0, 0, grid.comm, &status);
}
}

// gather: Basically this is 'gather_by_block' from session 25
template <typename T>
void
copy(const GeMatrix<T> &A, hpc::matvec::GeMatrix<T> &B)
{
auto &grid        = A.grid;

auto &rows        = A.rows;
auto &columns     = A.cols;

if (grid.rank == 0) {
MPI_Request requests[grid.nof_processes-1]; int ri = 0;
for (int r = 0; r < grid.nof_processes; ++r) {
// get position of r in grid
int coords[2];
MPI_Cart_coords(grid.comm, r, 2, coords);

// get offset and dimension of block stored by r
auto i0 = A.rowOffset(coords[0]);
auto j0 = A.colOffset(coords[1]);
auto m0 = A.numLocalRows(coords[0]);
auto n0 = A.numLocalCols(coords[1]);

auto B_ = B.block(i0, j0).dim(m0, n0);
if (r == 0) {
hpc::matvec::copy(A.buffer, B_);
} else {
MPI_Irecv(&B_(0, 0), 1, get_type(B_),
r, 0, grid.comm, &requests[ri++]);
}
}
for (auto& request: requests) {
MPI_Status status;
MPI_Wait(&request, &status);
}
} else {
auto &A_ = A.buffer;
/* aged OpenMPI implementations of Debian wheezy and jessie
expect void* instead of const void*; hence we need to remove const */
MPI_Send(const_cast<void*>(reinterpret_cast<const void*>(&A_(0, 0))),
1, get_type(A_), 0, 0, grid.comm);
}
}

} } // namespaces mpi, hpc

#endif // HPC_MPI_COPY_HPP
```
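
Both functions rely on get_type (provided by the included hpc/mpi headers) to obtain an MPI datatype that describes an entire matrix block, so that each block can be transferred with a single send or receive. The following sketch merely illustrates the underlying idea with plain MPI calls; it is not the library's implementation and assumes a row-major $$m \times n$$ block of doubles whose rows are incRow elements apart:

```cpp
#include <mpi.h>

// Sketch: MPI datatype for an m x n block of doubles that is embedded
// row-major in a larger matrix with leading dimension incRow.
MPI_Datatype block_type(int m, int n, int incRow)
{
    MPI_Datatype type;
    MPI_Type_vector(m,       // number of rows
                    n,       // contiguous elements per row
                    incRow,  // distance between the first elements of two rows
                    MPI_DOUBLE, &type);
    MPI_Type_commit(&type);
    return type;             // the caller eventually calls MPI_Type_free
}
```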


## Skeleton for hpc::mpi::mm

The skeleton already implements the broadcast of blocks from $$A$$: in iteration $$l$$, the node column that locally stores column $$l$$ of $$A$$ copies it into the buffer A_ and broadcasts it within its node row via commRow:

```cpp
#ifndef HPC_MPI_GEMM_HPP
#define HPC_MPI_GEMM_HPP 1

#include <cassert>
#include <mpi.h>
#include <hpc/matvec/copy.hpp>
#include <hpc/matvec/gematrix.hpp>
#include <hpc/matvec/mm.hpp>
#include <hpc/mpi/matrix.hpp>
#include "gematrix.hpp"

namespace hpc { namespace mpi {

template <typename T>
void
mm(T alpha, const GeMatrix<T> &A, const GeMatrix<T> &B, T beta, GeMatrix<T> &C)
{
auto grid        = A.grid;

assert(A.numCols == B.numRows);
assert(C.numRows == A.numRows);
assert(C.numCols == B.numCols);

std::size_t k    = A.numCols;
std::size_t bs   = 1;

assert(C.numLocalRows(grid.nodeRow) == A.numLocalRows(grid.nodeRow));
assert(C.numLocalCols(grid.nodeCol) == B.numLocalCols(grid.nodeCol));
assert(C.rowOffset(grid.nodeRow) == A.rowOffset(grid.nodeRow));
assert(C.colOffset(grid.nodeCol) == B.colOffset(grid.nodeCol));

auto i0 = A.rowOffset(grid.nodeRow);
auto j0 = B.colOffset(grid.nodeCol);

auto m0 = A.numLocalRows(grid.nodeRow);
auto n0 = B.numLocalCols(grid.nodeCol);

hpc::matvec::GeMatrix<T>    A_(m0, bs);
hpc::matvec::GeMatrix<T>    B_(bs, n0);

auto typeA_ = get_type(A_);
auto typeB_ = get_type(B_);

for (std::size_t l=0; l<k; ++l) {

// Broadcast block of A to node row
int rootA;
for (int c=0; c<grid.numNodeCols; ++c) {
auto l0 = A.colOffset(c);
auto l1 = l0 + A.numLocalCols(c);

// check if we have the block
if (l >= l0 && l < l1) {
hpc::matvec::copy(A.buffer.block(0, l-l0).dim(m0, bs), A_);
rootA = c;
}
}
MPI_Bcast(&A_(0,0), 1, typeA_, rootA, grid.commRow);

// Broadcast block of B to node col
/*

*/

auto beta_ = l==0 ? beta : 1;

hpc::matvec::mm(alpha, A_, B_, beta_, C.buffer);
}
}

} } // namespaces mpi, hpc

#endif // HPC_MPI_GEMM_HPP
```
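
The loop realizes the outer-product formulation of the matrix product: with block size $$bs = 1$$, every node accumulates one rank-1 update per iteration into its local block of $$C$$,

$$C \leftarrow \beta C + \alpha \sum_{l=0}^{k-1} A_{:,l} B_{l,:},$$

where $$\beta$$ is applied only in the first iteration (beta_ = l==0 ? beta : 1) and all later iterations add to C.buffer. In iteration $$l$$, the node column that stores column $$l$$ of $$A$$ broadcasts its part along commRow; analogously, the node row that stores row $$l$$ of $$B$$ has to broadcast its part along commCol, which is exactly the missing piece.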


## Exercise

• Implement the distributed GEMM operation (you only have to add the broadcasting of the blocks from $$B$$).

## Test program

```cpp
#include <mpi.h>
#include <printf.hpp>
#include <hpc/matvec/axpy.hpp>
#include <hpc/matvec/iterators.hpp>
#include <hpc/matvec/mm.hpp>
#include <hpc/matvec/print.hpp>
#include "copy.hpp"
#include "gematrix.hpp"
#include "gemm.hpp"
#include "grid.hpp"

int
main(int argc, char** argv)
{
MPI_Init(&argc, &argv);

hpc::mpi::Grid          grid;

std::size_t m = 8;
std::size_t n = 10;
std::size_t k = 12;

hpc::matvec::GeMatrix<double>    A(m, k), B(k, n), C(m, n),
Cref(m, n), Cmpi(m,n);
double                           alpha = 1, beta = 0;
hpc::mpi::GeMatrix<double>       dA(m, k, grid),
dB(k, n, grid),
dC(m, n, grid);

if (grid.rank==0) {
for (auto [i, j, Aij]: A) {
Aij = i * 100 + j;
}
for (auto [i, j, Bij]: B) {
Bij = i * 10 + j;
}
for (auto [i, j, Cij]: C) {
Cij = i * 100 + j;
}
fmt::printf("A =\n");
hpc::matvec::print(A);
fmt::printf("B =\n");
hpc::matvec::print(B);
fmt::printf("C = \n");
hpc::matvec::print(C);
fmt::printf("C <- %f * A*B + %f*C\n", alpha, beta);
hpc::matvec::copy(C, Cref);
hpc::matvec::mm(alpha, A, B, beta, Cref);
fmt::printf("Cref =\n");
hpc::matvec::print(Cref);
}

hpc::mpi::copy(A, dA);
hpc::mpi::copy(B, dB);
hpc::mpi::copy(C, dC);

hpc::mpi::mm(alpha, dA, dB, beta, dC);

hpc::mpi::copy(dC, Cmpi);

if (grid.rank==0) {
fmt::printf("Cmpi =\n");
hpc::matvec::print(Cmpi);

hpc::matvec::axpy(-1, Cref, Cmpi);

fmt::printf("Cref - Cmpi =\n");
hpc::matvec::print(Cmpi);
}

MPI_Finalize();
}
```


You can compile on theon as follows:

```
theon$ mpic++ -g -std=c++17 -I. -I/home/numerik/pub/hpc/ws19/session27 -o test_gemm test_gemm.cpp
theon$
```

Or on one of the machines in E.44:

```
heim$ OMPI_CXX=g++-8.3 mpic++ -g -std=c++17 -I. -I/home/numerik/pub/hpc/ws19/session27 -o test_gemm test_gemm.cpp
heim$
```
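
Afterwards the test program can be launched with, for example, eight processes (assuming the usual mpirun launcher of the installed MPI; the exact invocation may differ on your system):

```
theon$ mpirun -np 8 ./test_gemm
theon$
```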