Lösungsvorschlag

Festlegung von Blockgrößen durch Macros

#ifndef HPC_ULMBLAS_CONFIG_H
#define HPC_ULMBLAS_CONFIG_H 1

//
// default
//
#ifndef DEFAULT_BLOCKSIZE_MR
#define DEFAULT_BLOCKSIZE_MR 8
#endif

#ifndef DEFAULT_BLOCKSIZE_NR
#define DEFAULT_BLOCKSIZE_NR 8
#endif

#ifndef DEFAULT_BLOCKSIZE_MC
#define DEFAULT_BLOCKSIZE_MC 64
#endif

#ifndef DEFAULT_BLOCKSIZE_KC
#define DEFAULT_BLOCKSIZE_KC 64
#endif

#ifndef DEFAULT_BLOCKSIZE_NC
#define DEFAULT_BLOCKSIZE_NC 256
#endif

//
// single precision(float)
//
#ifndef S_BLOCKSIZE_MR
#define S_BLOCKSIZE_MR 4
#endif

#ifndef S_BLOCKSIZE_NR
#define S_BLOCKSIZE_NR 4
#endif

#ifndef S_BLOCKSIZE_MC
#define S_BLOCKSIZE_MC 512
#endif

#ifndef S_BLOCKSIZE_KC
#define S_BLOCKSIZE_KC 512
#endif

#ifndef S_BLOCKSIZE_NC
#define S_BLOCKSIZE_NC 4096
#endif

//
// double precision(double)
//
#ifndef D_BLOCKSIZE_MR
#define D_BLOCKSIZE_MR 4
#endif

#ifndef D_BLOCKSIZE_NR
#define D_BLOCKSIZE_NR 4
#endif

#ifndef D_BLOCKSIZE_MC
#define D_BLOCKSIZE_MC 256
#endif

#ifndef D_BLOCKSIZE_KC
#define D_BLOCKSIZE_KC 256
#endif

#ifndef D_BLOCKSIZE_NC
#define D_BLOCKSIZE_NC 4096
#endif

//
// complex single precision(std::complex<float>)
//
#ifndef C_BLOCKSIZE_MR
#define C_BLOCKSIZE_MR 4
#endif

#ifndef C_BLOCKSIZE_NR
#define C_BLOCKSIZE_NR 8
#endif

#ifndef C_BLOCKSIZE_MC
#define C_BLOCKSIZE_MC 256
#endif

#ifndef C_BLOCKSIZE_KC
#define C_BLOCKSIZE_KC 256
#endif

#ifndef C_BLOCKSIZE_NC
#define C_BLOCKSIZE_NC 4096
#endif

//
// complex double precision(std::complex<double>)
//
#ifndef Z_BLOCKSIZE_MR
#define Z_BLOCKSIZE_MR 4
#endif

#ifndef Z_BLOCKSIZE_NR
#define Z_BLOCKSIZE_NR 4
#endif

#ifndef Z_BLOCKSIZE_MC
#define Z_BLOCKSIZE_MC 256
#endif

#ifndef Z_BLOCKSIZE_KC
#define Z_BLOCKSIZE_KC 128
#endif

#ifndef Z_BLOCKSIZE_NC
#define Z_BLOCKSIZE_NC 4096
#endif

#endif // HPC_ULMBLAS_BLOCKSIZE_H

Festlegung von Blockgrößen durch Traits

session12/page01/hpc/ulmblas/blocksize.h

#ifndef HPC_ULMBLAS_BLOCKSIZE_H
#define HPC_ULMBLAS_BLOCKSIZE_H 1

#include <complex>
#include <hpc/ulmblas/config.h>

namespace hpc { namespace ulmblas {

template <typename T>
struct BlockSize
{
    static const int MC = DEFAULT_BLOCKSIZE_MC;
    static const int KC = DEFAULT_BLOCKSIZE_KC;
    static const int NC = DEFAULT_BLOCKSIZE_NC;
    static const int MR = DEFAULT_BLOCKSIZE_MR;
    static const int NR = DEFAULT_BLOCKSIZE_NR;


    static_assert(MC>0 && KC>0 && NC>0 && MR>0 && NR>0, "Invalid block size.");
    static_assert(MC % MR == 0, "MC must be a multiple of MR.");
    static_assert(NC % NR == 0, "NC must be a multiple of NR.");
};

template <>
struct BlockSize<float>
{
    static const int MC = S_BLOCKSIZE_MC;
    static const int KC = S_BLOCKSIZE_KC;
    static const int NC = S_BLOCKSIZE_NC;
    static const int MR = S_BLOCKSIZE_MR;
    static const int NR = S_BLOCKSIZE_NR;

    static_assert(MC>0 && KC>0 && NC>0 && MR>0 && NR>0, "Invalid block size.");
    static_assert(MC % MR == 0, "MC must be a multiple of MR.");
    static_assert(NC % NR == 0, "NC must be a multiple of NR.");
};

template <>
struct BlockSize<double>
{
    static const int MC = D_BLOCKSIZE_MC;
    static const int KC = D_BLOCKSIZE_KC;
    static const int NC = D_BLOCKSIZE_NC;
    static const int MR = D_BLOCKSIZE_MR;
    static const int NR = D_BLOCKSIZE_NR;

    static_assert(MC>0 && KC>0 && NC>0 && MR>0 && NR>0, "Invalid block size.");
    static_assert(MC % MR == 0, "MC must be a multiple of MR.");
    static_assert(NC % NR == 0, "NC must be a multiple of NR.");
};

template <>
struct BlockSize<std::complex<float> >
{
    static const int MC = C_BLOCKSIZE_MC;
    static const int KC = C_BLOCKSIZE_KC;
    static const int NC = C_BLOCKSIZE_NC;
    static const int MR = C_BLOCKSIZE_MR;
    static const int NR = C_BLOCKSIZE_NR;

    static_assert(MC>0 && KC>0 && NC>0 && MR>0 && NR>0, "Invalid block size.");
    static_assert(MC % MR == 0, "MC must be a multiple of MR.");
    static_assert(NC % NR == 0, "NC must be a multiple of NR.");
};

template <>
struct BlockSize<std::complex<double> >
{
    static const int MC = Z_BLOCKSIZE_MC;
    static const int KC = Z_BLOCKSIZE_KC;
    static const int NC = Z_BLOCKSIZE_NC;
    static const int MR = Z_BLOCKSIZE_MR;
    static const int NR = Z_BLOCKSIZE_NR;

    static_assert(MC>0 && KC>0 && NC>0 && MR>0 && NR>0, "Invalid block size.");
    static_assert(MC % MR == 0, "MC must be a multiple of MR.");
    static_assert(NC % NR == 0, "NC must be a multiple of NR.");
};

} } // namespace ulmblas, hpc

#endif // HPC_ULMBLAS_BLOCKSIZE_H

Funktionen zum Packen von Blöcken

session12/page01/hpc/ulmblas/pack.h

#ifndef HPC_ULMBLAS_PACK_H
#define HPC_ULMBLAS_PACK_H 1

#include <hpc/ulmblas/blocksize.h>

namespace hpc { namespace ulmblas {

template <typename Index, typename TA, typename T>
void
pack_A(Index mc, Index kc,
       const TA *A, Index incRowA, Index incColA,
       T *p)
{
    Index MR = BlockSize<T>::MR;
    Index mp = (mc+MR-1) / MR;

    for (Index j=0; j<kc; ++j) {
        for (Index l=0; l<mp; ++l) {
            for (Index i0=0; i0<MR; ++i0) {
                Index i  = l*MR + i0;
                Index nu = l*MR*kc + j*MR + i0;
                p[nu]   = (i<mc) ? A[i*incRowA+j*incColA]
                                 : T(0);
            }
        }
    }
}

template <typename Index, typename TB, typename T>
void
pack_B(Index kc, Index nc,
       const TB *B, Index incRowB, Index incColB,
       T *p)
{
    Index NR = BlockSize<T>::NR;
    Index np = (nc+NR-1) / NR;

    for (Index l=0; l<np; ++l) {
        for (Index j0=0; j0<NR; ++j0) {
            for (Index i=0; i<kc; ++i) {
                Index j  = l*NR+j0;
                Index nu = l*NR*kc + i*NR + j0;
                p[nu]   = (j<nc) ? B[i*incRowB+j*incColB]
                                 : T(0);
            }
        }
    }
}

} } // namespace ulmblas, hpc

#endif // HPC_ULMBLAS_PACK_H

Macro-Kernel

session12/page01/hpc/ulmblas/mgemm.h

#ifndef HPC_ULMBLAS_MGEMM_H
#define HPC_ULMBLAS_MGEMM_H 1

#include <algorithm>
#include <hpc/ulmblas/blocksize.h>
#include <hpc/ulmblas/gescal.h>
#include <hpc/ulmblas/geaxpy.h>
#include <hpc/ulmblas/kernels/ugemm.h>

namespace hpc { namespace ulmblas {

template <typename Index, typename T, typename Beta, typename TC>
void
mgemm(Index mc, Index nc, Index kc,
      T alpha,
      const T *A, const T *B,
      Beta beta,
      TC *C, Index incRowC, Index incColC)
{
    Index MR = BlockSize<T>::MR;
    Index NR = BlockSize<T>::NR;
    T C_[BlockSize<T>::MR*BlockSize<T>::NR];

    Index mp  = (mc+MR-1) / MR;
    Index np  = (nc+NR-1) / NR;
    Index mr_ = mc % MR;
    Index nr_ = nc % NR;

    for (Index j=0; j<np; ++j) {
        Index nr = (j!=np-1 || nr_==0) ? NR : nr_;

        for (Index i=0; i<mp; ++i) {
            Index mr = (i!=mp-1 || mr_==0) ? MR : mr_;

            if (mr==MR && nr==NR) {
                ugemm(kc, alpha,
                      &A[i*kc*MR], &B[j*kc*NR],
                      beta,
                      &C[i*MR*incRowC+j*NR*incColC],
                      incRowC, incColC);
            } else {
                std::fill_n(C_, MR*NR, T(0));
                ugemm(kc, alpha,
                      &A[i*kc*MR], &B[j*kc*NR],
                      T(0),
                      C_, Index(1), MR);
                gescal(mr, nr, beta,
                       &C[i*MR*incRowC+j*NR*incColC],
                       incRowC, incColC);
                geaxpy(mr, nr, T(1), C_, Index(1), MR,
                       &C[i*MR*incRowC+j*NR*incColC],
                       incRowC, incColC);
            }
        }
    }
}

} } // namespace ulmblas, hpc

#endif // HPC_ULMBLAS_MGEMM_H

Micro-Kernel

Include-File zum Einbinden aller Micro-Kernel

session12/page01/hpc/ulmblas/kernels/ugemm.h

#ifndef HPC_ULMBLAS_KERNELS_UGEMM_H
#define HPC_ULMBLAS_KERNELS_UGEMM_H 1

#include <hpc/ulmblas/config.h>
#include <hpc/ulmblas/kernels/ugemm_ref.h>
#include <hpc/ulmblas/kernels/ugemm_buf.h>

#endif // HPC_ULMBLAS_KERNELS_UGEMM_H

Micro-Kernel Wrapper für inhomogene Typen

session12/page01/hpc/ulmblas/kernels/ugemm_buf.h

#ifndef HPC_ULMBLAS_KERNELS_UGEMM_BUF_H
#define HPC_ULMBLAS_KERNELS_UGEMM_BUF_H 1

#include <hpc/ulmblas/blocksize.h>

namespace hpc { namespace ulmblas {

template <typename Index, typename T, typename Beta, typename TC>
void
ugemm(Index kc, T alpha,
      const T *A, const T *B,
      Beta beta,
      TC *C, Index incRowC, Index incColC)
{
    const Index MR = BlockSize<T>::MR;
    const Index NR = BlockSize<T>::NR;
    T P[BlockSize<T>::MR*BlockSize<T>::NR];

    std::fill_n(P, MR*NR, T(0));
    ugemm(kc, alpha, A, B, T(0), P, Index(1), MR);
    gescal(MR, NR, beta, C, incRowC, incColC);
    geaxpy(MR, NR, T(1), P, Index(1), MR, C, incRowC, incColC);
}

} } // namespace ulmblas, hpc

#endif // HPC_ULMBLAS_KERNELS_UGEMM_BUF_H

Referenz-Implementierung für homogene Typen

session12/page01/hpc/ulmblas/kernels/ugemm_ref.h

#ifndef HPC_ULMBLAS_KERNELS_UGEMM_REF_H
#define HPC_ULMBLAS_KERNELS_UGEMM_REF_H 1

#include <hpc/ulmblas/blocksize.h>

namespace hpc { namespace ulmblas {

template <typename Index, typename T>
void
ugemm(Index kc, T alpha,
      const T *A, const T *B,
      T beta,
      T *C, Index incRowC, Index incColC)
{
    const Index MR = BlockSize<T>::MR;
    const Index NR = BlockSize<T>::NR;
    T P[BlockSize<T>::MR*BlockSize<T>::NR];

    for (Index l=0; l<MR*NR; ++l) {
        P[l] = 0;
    }
    for (Index l=0; l<kc; ++l) {
        for (Index j=0; j<NR; ++j) {
            for (Index i=0; i<MR; ++i) {
                P[i+j*MR] += A[i+l*MR]*B[l*NR+j];
            }
        }
    }
    for (Index j=0; j<NR; ++j) {
        for (Index i=0; i<MR; ++i) {
            C[i*incRowC+j*incColC] *= beta;
            C[i*incRowC+j*incColC] += alpha*P[i+j*MR];
        }
    }
}

} } // namespace ulmblas, hpc

#endif // HPC_ULMBLAS_KERNELS_UGEMM_REF_H

Frame-Algorithmus

session12/page01/hpc/ulmblas/gemm.h

#ifndef HPC_ULMBLAS_GEMM_H
#define HPC_ULMBLAS_GEMM_H 1

#include <complex>
#include <type_traits>
#include <hpc/ulmblas/blocksize.h>
#include <hpc/ulmblas/gescal.h>
#include <hpc/ulmblas/pack.h>
#include <hpc/ulmblas/mgemm.h>
#include <hpc/ulmblas/kernels/ugemm.h>
#include <cstdlib>

namespace hpc { namespace ulmblas {

template <typename Index, typename Alpha,
         typename TA, typename TB,
         typename Beta,
         typename TC>
void
gemm(Index m, Index n, Index k,
     Alpha alpha,
     const TA *A, Index incRowA, Index incColA,
     const TB *B, Index incRowB, Index incColB,
     Beta beta,
     TC *C, Index incRowC, Index incColC)
{
    typedef typename std::common_type<Alpha, TA, TB>::type  T;

    const Index MC = BlockSize<T>::MC;
    const Index NC = BlockSize<T>::NC;
    const Index KC = BlockSize<T>::KC;

    const Index mb = (m+MC-1) / MC;
    const Index nb = (n+NC-1) / NC;
    const Index kb = (k+KC-1) / KC;

    const Index mc_ = m % MC;
    const Index nc_ = n % NC;
    const Index kc_ = k % KC;

    T *A_ = new T[MC*KC];
    T *B_ = new T[KC*NC];

    if (alpha==Alpha(0) || k==0) {
        gescal(m, n, beta, C, incRowC, incColC);
        return;
    }

    for (Index j=0; j<nb; ++j) {
        Index nc = (j!=nb-1 || nc_==0) ? NC : nc_;

        for (Index l=0; l<kb; ++l) {
            Index   kc  = (l!=kb-1 || kc_==0) ? KC : kc_;
            Beta beta_  = (l==0) ? beta : Beta(1);

            pack_B(kc, nc,
                   &B[l*KC*incRowB+j*NC*incColB],
                   incRowB, incColB,
                   B_);

            for (Index i=0; i<mb; ++i) {
                Index mc = (i!=mb-1 || mc_==0) ? MC : mc_;

                pack_A(mc, kc,
                       &A[i*MC*incRowA+l*KC*incColA],
                       incRowA, incColA,
                       A_);

                mgemm(mc, nc, kc,
                      T(alpha), A_, B_, beta_,
                      &C[i*MC*incRowC+j*NC*incColC],
                      incRowC, incColC);
            }
        }
    }
    delete [] A_;
    delete [] B_;
}

} } // namespace ulmblas, hpc

#endif // HPC_ULMBLAS_GEMM_H