Sample solution

Content

Summary

Source files

#ifndef HPC_ULMBLAS_GEMM_HPP
#define HPC_ULMBLAS_GEMM_HPP

#include <cstddef>
#include <hpc/tools/buffer.hpp>
#include <hpc/ulmblas/ugemm/config.hpp>
#if defined(_OPENMP)
#include <omp.h>
#endif
#ifdef GLOBAL_THREAD_POOL
#include <hpc/mt/thread_pool.hpp>
#endif
#ifdef GLOBAL_THREAD_POOL
extern hpc::mt::ThreadPool GLOBAL_THREAD_POOL;
#endif

namespace hpc { namespace ulmblas {

template <typename T>
void
gemm(std::size_t m, std::size_t n, std::size_t k,
      T alpha,
      bool conjA, const T *A, std::ptrdiff_t incRowA, std::ptrdiff_t incColA,
      bool conjB, const T *B, std::ptrdiff_t incRowB, std::ptrdiff_t incColB,
      T beta,
      T *C, std::ptrdiff_t incRowC, std::ptrdiff_t incColC)
{
    if (alpha==T(0) || k==0) {
        gescal(m, n, beta, C, incRowC, incColC);
        return;
    }

#if defined(GLOBAL_THREAD_POOL)
    int nof_threads = ::GLOBAL_THREAD_POOL.size();
#elif defined(_OPENMP)
    int nof_threads;
    #pragma omp parallel
    {
	if (omp_get_thread_num() == 0) {
	    nof_threads = omp_get_num_threads();
	}
    }
#else
    int nof_threads = 1;
#endif

    GemmParameter<T>   p(m, n, k);
    std::size_t MC = p.MC * nof_threads;
    std::size_t NC = p.NC;
    std::size_t KC = p.KC;

    std::size_t mb = (m + MC-1)/MC;
    std::size_t nb = (n + NC-1)/NC;
    std::size_t kb = (k + KC-1)/KC;

    std::size_t mc_ = m % MC;
    std::size_t nc_ = n % NC;
    std::size_t kc_ = k % KC;

    tools::Buffer<T> A_(MC*KC+p.extra_A, p.alignment);
    tools::Buffer<T> B_(KC*NC+p.extra_B, p.alignment);

    for (std::size_t j=0; j<nb; ++j) {
        std::size_t N = (j<nb-1 || nc_==0) ? NC
                                           : nc_;
        for (std::size_t l=0; l<kb; ++l) {
            std::size_t K = (l<kb-1 || kc_==0) ? KC
                                               : kc_;
            T beta_ = (l==0) ? beta
                             : 1;
            p.pack_B(K, N, conjB,
                     &B[l*KC*incRowB+j*NC*incColB], incRowB, incColB,
                     B_.data);

            for (std::size_t i=0; i<mb; ++i) {
                std::size_t M = (i<mb-1 || mc_==0) ? MC
                                                   : mc_;

                p.pack_A(M, K, conjA,
                         &A[i*MC*incRowA+l*KC*incColA], incRowA, incColA,
                         A_.data);

                p.mgemm(M, N, K, alpha, A_.data, B_.data, beta_,
                        &C[i*MC*incRowC+j*NC*incColC], incRowC, incColC);
            }
        }
    }
}


} } // namespace ulmblas, hpc

#endif // HPC_ULMBLAS_GEMM_HPP
#ifndef HPC_ULMBLAS_MGEMM_HPP
#define HPC_ULMBLAS_MGEMM_HPP

#include <cstddef>
#include <hpc/ulmblas/axpy.hpp>
#include <hpc/ulmblas/scal.hpp>
#ifdef GLOBAL_THREAD_POOL
#include <hpc/mt/thread_pool.hpp>
#include <hpc/aux/slices.hpp>
#elif defined(_OPENMP)
#include <omp.h>
#include <hpc/aux/slices.hpp>
#endif

#ifdef GLOBAL_THREAD_POOL
extern hpc::mt::ThreadPool GLOBAL_THREAD_POOL;
#endif

namespace hpc { namespace ulmblas {

template <typename T>
using UGemm = void (*)(std::size_t, T,
                       const T *, const T *,
                       T,
                       T *, std::ptrdiff_t, std::ptrdiff_t,
                       const T *, const T *);

template <typename T, std::size_t MR, std::size_t NR, UGemm<T> ugemm>
void
mgemm(std::size_t M, std::size_t N, std::size_t K,
      T alpha,
      const T *A, const T *B,
      T beta,
      T *C, std::ptrdiff_t incRowC, std::ptrdiff_t incColC)
{
    std::size_t mp = (M+MR-1) / MR;
    std::size_t np = (N+NR-1) / NR;

    std::size_t mr_ = M % MR;
    std::size_t nr_ = N % NR;

    auto mgemm_body = [=](std::size_t start_index, std::size_t size) {
	T C_[MR*NR];
	const T *a_next = A;
	const T *b_next = nullptr;
	for (std::size_t j=start_index; j<start_index+size; ++j) {
	    std::size_t nr = (j<np-1 || nr_==0) ? NR
						: nr_;
	    b_next = &B[j*K*NR];
	    for (std::size_t i=0; i<mp; ++i) {
		std::size_t mr = (i<mp-1 || mr_==0) ? MR
						    : mr_;

		a_next = &A[(i+1)*MR*K];
		if (i==mp-1) {
		    a_next = A;
		    b_next = &B[(j+1)*K*NR];
		    if (j==np-1) {
			b_next = B;
		    }
		}

		if (mr==MR && nr==NR) {
		    ugemm(K, alpha,
			  &A[i*MR*K], &B[j*K*NR],
			  beta,
			  &C[i*MR*incRowC+j*NR*incColC], incRowC, incColC,
			  a_next, b_next);
		} else {
		    ugemm(K, alpha,
			  &A[i*MR*K], &B[j*K*NR],
			  0,
			  C_, 1, MR,
			  a_next, b_next);
		    gescal(mr, nr, beta,
			   &C[i*MR*incRowC+j*NR*incColC], incRowC, incColC);
		    geaxpy(mr, nr, T(1),
			   false, C_, 1, MR,
			   &C[i*MR*incRowC+j*NR*incColC], incRowC, incColC);
		}
	    }
	}
    };

#ifdef GLOBAL_THREAD_POOL
    using ::GLOBAL_THREAD_POOL;
    hpc::mt::ThreadPool& tpool(GLOBAL_THREAD_POOL);
    std::vector<std::future<void>> futures(tpool.size());
    int index = 0;
    /* fork ... */
    hpc::aux::foreach_slice<hpc::aux::UniformSlices>(tpool.size(), np,
	    [=,&index, &futures,&tpool](std::size_t start_index,
		std::size_t size) {
	futures[index++] = tpool.submit([=]() {
	    mgemm_body(start_index, size);
	});
    });
    /* ... and join() */
    for (auto& f: futures) {
	if (f.valid()) f.get();
    }
#elif defined(_OPENMP)
    #pragma omp parallel
    {
        hpc::aux::UniformSlices<size_t> slices(omp_get_num_threads(), np);
	int i = omp_get_thread_num();
	mgemm_body(slices.offset(i), slices.size(i));
    }
#else
    mgemm_body(0, np);
#endif
}

} } // namespace ulmblas, hpc

#endif // HPC_ULMBLAS_MGEMM_HPP

Compilation and Execution

theon$ g++ -DD_BLOCKSIZE_MR=4 -DD_BLOCKSIZE_NR=8 -O3 -g -I. -I/home/numerik/pub/hpc/ws18/session22 -std=c++17 -o gemm_session22_serial gemm_session22.cpp
theon$ g++ -DD_BLOCKSIZE_MR=4 -DD_BLOCKSIZE_NR=8 -O3 -g -I. -I/home/numerik/pub/hpc/ws18/session22 -std=c++17 -fopenmp -o gemm_session22_openmp gemm_session22.cpp
theon$ g++ -DD_BLOCKSIZE_MR=4 -DD_BLOCKSIZE_NR=8 -O3 -g -I. -I/home/numerik/pub/hpc/ws18/session22 -std=c++17 -DGLOBAL_THREAD_POOL=global_tpool -o gemm_session22_tpool gemm_session22.cpp
theon$ ./gemm_session22_serial
#                                                                                        | refmm               | updated matvec::mm:                 |
#      m       n       k incRowC incColC incRowA incColA incRowB incColB   alpha    beta |    time      mflops |    time   error      mflops  factor |
     300     300     300       1     304       1     304       1     304       1       1 |    0.04     1236.23 |    0.02   3e-04     2525.72    2.04 |
     400     400     400       1     400       1     400       1     400       1       1 |    0.14      927.91 |    0.04   3e-04     3048.52    3.29 |
     500     500     500       1     504       1     504       1     504       1       1 |    0.22     1115.25 |    0.08   3e-04     3107.84    2.79 |
     600     600     600       1     600       1     600       1     600       1       1 |    0.36     1202.15 |    0.13   2e-04     3259.18    2.71 |
     700     700     700       1     704       1     704       1     704       1       1 |    0.80      853.21 |    0.20   2e-04     3467.72    4.06 |
     800     800     800       1     800       1     800       1     800       1       1 |    1.09      935.92 |    0.31   2e-04     3347.73    3.58 |
     900     900     900       1     904       1     904       1     904       1       1 |    1.18     1230.51 |    0.45   2e-04     3252.59    2.64 |
    1000    1000    1000       1    1000       1    1000       1    1000       1       1 |    1.60     1248.76 |    0.59   2e-04     3400.97    2.72 |
    1100    1100    1100       1    1104       1    1104       1    1104       1       1 |    2.74      972.05 |    0.88   1e-04     3013.09    3.10 |
    1200    1200    1200       1    1200       1    1200       1    1200       1       1 |    3.56      971.75 |    1.02   1e-04     3396.11    3.49 |
    1300    1300    1300       1    1304       1    1304       1    1304       1       1 |    4.55      966.46 |    1.32   1e-04     3337.64    3.45 |
    1400    1400    1400       1    1400       1    1400       1    1400       1       1 |   11.59      473.61 |    1.60   1e-04     3431.76    7.25 |
    1500    1500    1500       1    1504       1    1504       1    1504       1       1 |   14.23      474.51 |    2.00   1e-04     3369.82    7.10 |
theon$ ./gemm_session22_openmp
#                                                                                        | refmm               | updated matvec::mm:                 |
#      m       n       k incRowC incColC incRowA incColA incRowB incColB   alpha    beta |    time      mflops |    time   error      mflops  factor |
     300     300     300       1     304       1     304       1     304       1       1 |    0.08      709.98 |    0.03   3e-04     1734.70    2.44 |
     400     400     400       1     400       1     400       1     400       1       1 |    0.11     1199.43 |    0.02   3e-04     6532.29    5.45 |
     500     500     500       1     504       1     504       1     504       1       1 |    0.20     1223.33 |    0.02   3e-04    10733.43    8.77 |
     600     600     600       1     600       1     600       1     600       1       1 |    0.35     1224.33 |    0.03   2e-04    13081.55   10.68 |
     700     700     700       1     704       1     704       1     704       1       1 |    0.97      704.04 |    0.05   2e-04    13397.64   19.03 |
     800     800     800       1     800       1     800       1     800       1       1 |    1.10      926.77 |    0.07   2e-04    14946.31   16.13 |
     900     900     900       1     904       1     904       1     904       1       1 |    1.18     1237.00 |    0.11   2e-04    12822.70   10.37 |
    1000    1000    1000       1    1000       1    1000       1    1000       1       1 |    1.63     1230.58 |    0.13   1e-04    14929.51   12.13 |
    1100    1100    1100       1    1104       1    1104       1    1104       1       1 |    2.53     1051.53 |    0.15   1e-04    17689.09   16.82 |
    1200    1200    1200       1    1200       1    1200       1    1200       1       1 |    3.49      991.42 |    0.18   1e-04    19731.73   19.90 |
    1300    1300    1300       1    1304       1    1304       1    1304       1       1 |    4.62      951.26 |    0.20   1e-04    21524.44   22.63 |
    1400    1400    1400       1    1400       1    1400       1    1400       1       1 |   11.78      465.84 |    0.20   1e-04    27279.33   58.56 |
    1500    1500    1500       1    1504       1    1504       1    1504       1       1 |   14.44      467.49 |    0.27   1e-04    24891.45   53.24 |
theon$ ./gemm_session22_tpool
#                                                                                        | refmm               | updated matvec::mm:                 |
#      m       n       k incRowC incColC incRowA incColA incRowB incColB   alpha    beta |    time      mflops |    time   error      mflops  factor |
     300     300     300       1     304       1     304       1     304       1       1 |    0.06      971.77 |    0.03   3e-04     2043.29    2.10 |
     400     400     400       1     400       1     400       1     400       1       1 |    0.13      976.84 |    0.01   3e-04    10656.60   10.91 |
     500     500     500       1     504       1     504       1     504       1       1 |    0.24     1042.06 |    0.02   3e-04    13284.04   12.75 |
     600     600     600       1     600       1     600       1     600       1       1 |    0.48      897.05 |    0.03   2e-04    15197.80   16.94 |
     700     700     700       1     704       1     704       1     704       1       1 |    0.78      884.59 |    0.05   2e-04    12572.06   14.21 |
     800     800     800       1     800       1     800       1     800       1       1 |    1.06      967.41 |    0.05   2e-04    21266.07   21.98 |
     900     900     900       1     904       1     904       1     904       1       1 |    1.21     1205.54 |    0.09   2e-04    16823.02   13.95 |
    1000    1000    1000       1    1000       1    1000       1    1000       1       1 |    1.60     1248.09 |    0.11   1e-04    17576.73   14.08 |
    1100    1100    1100       1    1104       1    1104       1    1104       1       1 |    2.65     1004.95 |    0.09   1e-04    29443.26   29.30 |
    1200    1200    1200       1    1200       1    1200       1    1200       1       1 |    3.44     1003.92 |    0.12   1e-04    29419.97   29.31 |
    1300    1300    1300       1    1304       1    1304       1    1304       1       1 |    4.58      959.97 |    0.15   1e-04    28762.06   29.96 |
    1400    1400    1400       1    1400       1    1400       1    1400       1       1 |   11.59      473.57 |    0.17   1e-04    31825.22   67.20 |
    1500    1500    1500       1    1504       1    1504       1    1504       1       1 |   14.29      472.31 |    0.21   1e-04    32908.37   69.68 |
theon$