Geblocktes GEMM

Implementiert den geblockten GEMM Algorithmus aus der Vorlesung. Als Vorlage könnt ihr folgenden Code verwenden oder den eigenen Code entsprechend ergänzen. Zu implementieren sind:

Den Micro-Kernel ugemm
Den Macro-Kernel mgemm
Den Frame-Algorithmus gemm
Die Pack-Funktionen pack_A und pack_B

Die verschiedenen GEMM Algorithmen

Alle Algorithmen berechnen die Operation

\[C \leftarrow \beta\,C + \alpha\, A B\quad\text{mit}\;A \in \mathbb{M}^{m \times k},\;B \in \mathbb{M}^{k \times n}\;\text{und}\;C \in \mathbb{M}^{m \times n}.\]

Unterschiede ergeben sich aus verschiedenen Gründen:

Speicherformat

Der Frame-Algorithmus erhält Matrizen, die zeilen- oder spaltenweise im Speicher liegen können. Beim Macro- und Micro-Kernel ist das Speicherformat aber nicht beliebig.
Matrix-Dimensionen

Beim Frame-Algorithmus können die Dimensionen beliebig groß sein. Beim Macro-Kernel sind die Dimensionen nach oben beschränkt. Und beim Micro-Kernel sind gewisse Dimensionen sogar fest vorgegeben.

Die Algorithmen werden also immer spezieller.

Micro-Kernel

Für die Dimensionen gilt
\[A \in \mathbb{M}^{M_r \times k},\;B \in \mathbb{M}^{k \times N_r}\;\text{und}\;C \in \mathbb{M}^{M_r \times N_r}.\]
wobei \(M_r\) und \(N_r\) bekannte Konstanten sind.
Für das Speicherformat gilt:
- \(A\) ist spaltenweise und \(B\) ist zeilenweise gespeichert. Zudem sind die Elemente in einem zusammenhängenden Speicherbereich.
- Bezüglich \(C\) werden keine Annahmen gemacht.

Macro-Kernel

Für die Dimensionen gilt
\[A \in \mathbb{M}^{m \times k},\;B \in \mathbb{M}^{k \times n}\;\text{und}\;C \in \mathbb{M}^{m \times n}.\]
dabei gilt
\[m \leq M_C,\quad k \leq K_C,\quad n \leq N_C\]
wobei \(M_C\), \(K_C\) und \(N_r\) bekannte Konstanten sind.
Für das Speicherformat gilt:
- Bezüglich \(C\) wird keine Annahme gemacht
- Matrix \(A\) liegt so im Speicher, dass bei der Partitionierung in Paneele der Höhe \(M_r\), d.h.
  \[A = \left(\begin{array}{c} A_0 \\ \hline A_1 \\ \hline \vdots \\ A_{m_b-1} \end{array}\right),\quad m_b = \lceil m/M_r \rceil\]
  die einzelnen Paneele in aufeinanderfolgenden, zusammenhängenden Blöcken im Speicher liegen. Die letzte Paneele \(A_{m_b-1}\) wurde eventuell mit Nullen auf die Höhe \(M_r\) aufgefüllt (Zero Padding).
- Matrix \(B\) liegt so im Speicher, dass bei der Partitionierung in Paneele der Breite \(N_r\), d.h.
  \[B = \left(\begin{array}{c|c|c|c} B_0 & B_1 & \dots & B_{n_b-1} \end{array}\right),\quad n_b = \lceil n/N_r \rceil\]
  die einzelnen Paneele in aufeinanderfolgenden, zusammenhängenden Blöcken im Speicher liegen. Die letzte Paneele \(B_{n_b-1}\) wurde eventuell mit Nullen auf die Breite \(N_r\) aufgefüllt (Zero Padding).

Vorlage

session8/gemm_blk.cc

#include <cmath>
#include <cstdio>
#include <cstdlib>
#include <sys/times.h>
#include <unistd.h>

#ifndef COLMAJOR
#define COLMAJOR 1
#endif

#ifndef MAXDIM_M
#define MAXDIM_M    4000
#endif

#ifndef MAXDIM_N
#define MAXDIM_N    4000
#endif

#ifndef MAXDIM_K
#define MAXDIM_K    4000
#endif

#ifndef MIN_M
#define MIN_M   100
#endif

#ifndef MIN_N
#define MIN_N   100
#endif

#ifndef MIN_K
#define MIN_K   100
#endif

#ifndef MAX_M
#define MAX_M   7000
#endif

#ifndef MAX_N
#define MAX_N   7000
#endif

#ifndef MAX_K
#define MAX_K   7000
#endif

#ifndef INC_M
#define INC_M   100
#endif

#ifndef INC_N
#define INC_N   100
#endif

#ifndef INC_K
#define INC_K   100
#endif

#ifndef ALPHA
#define ALPHA   1.5
#endif

#ifndef BETA
#define BETA    1.5
#endif


//==============================================================================
// bench
//==============================================================================

namespace bench {

//------------------------------------------------------------------------------
// Auxiliary data for benchmarking
//------------------------------------------------------------------------------

double A[MAXDIM_M*MAXDIM_K];
double B[MAXDIM_K*MAXDIM_N];

double C1[MAXDIM_M*MAXDIM_N];
double C2[MAXDIM_M*MAXDIM_N];
double C3[MAXDIM_M*MAXDIM_N];

//------------------------------------------------------------------------------
// Auxiliary functions for benchmarking
//------------------------------------------------------------------------------

double
walltime()
{
   struct tms    ts;
   static double ClockTick=0.0;

   if (ClockTick==0.0) {
        ClockTick = 1.0 / ((double) sysconf(_SC_CLK_TCK));
   }
   return ((double) times(&ts)) * ClockTick;
}

void
initMatrix(long m, long n, double *A, long incRowA, long incColA)
{
    for (long j=0; j<n; ++j) {
        for (long i=0; i<m; ++i) {
            A[i*incRowA+j*incColA] = ((double)rand() - RAND_MAX/2)*200/RAND_MAX;
        }
    }
}

double
asumDiffMatrix(long m, long n,
               const double *A, long incRowA, long incColA,
               double *B, long incRowB, long incColB)
{

    double asum = 0;

    for (long j=0; j<n; ++j) {
        for (long i=0; i<m; ++i) {
            asum += fabs(B[i*incRowB+j*incColB] - A[i*incRowA+j*incColA]);
        }
    }
    return asum;
}

} // namespace bench


//==============================================================================
// ulmBLAS
//==============================================================================

namespace ulmBLAS {

//------------------------------------------------------------------------------
// A <- B
//------------------------------------------------------------------------------

void
gecopy(long m, long n,
       const double *A, long incRowA, long incColA,
       double *B, long incRowB, long incColB)
{
    for (long j=0; j<n; ++j) {
        for (long i=0; i<m; ++i) {
            B[i*incRowB+j*incColB] = A[i*incRowA+j*incColA];
        }
    }
}

//------------------------------------------------------------------------------
// Y <- Y + alpha*Y�#
//------------------------------------------------------------------------------

void
geaxpy(long m, long n,
       double alpha,
       const double *X, long incRowX, long incColX,
       double       *Y, long incRowY, long incColY)
{
    for (long i=0; i<m; ++i) {
        for (long j=0; j<n; ++j) {
            Y[i*incRowY+j*incColY] += alpha*X[i*incRowX+j*incColX];
        }
    }
}

//------------------------------------------------------------------------------
// A <- alpha * A
//------------------------------------------------------------------------------

void
gescal(long m, long n,
       double alpha,
       double *X, long incRowX, long incColX)
{
    if (alpha!=1.0) {
        for (long i=0; i<m; ++i) {
            for (long j=0; j<n; ++j) {
                X[i*incRowX+j*incColX] *= alpha;
            }
        }
    }
}

} // namespace ulmBLAS

//==============================================================================
// refColMajor
//==============================================================================

namespace refColMajor {

void
gemm(long m, long n, long k,
     double alpha,
     const double *A, long incRowA, long incColA,
     const double *B, long incRowB, long incColB,
     double beta,
     double *C, long incRowC, long incColC)
{
    ulmBLAS::gescal(m, n, beta, C, incRowC, incColC);
    for (long j=0; j<n; ++j) {
        for (long l=0; l<k; ++l) {
            for (long i=0; i<m; ++i) {
                C[i*incRowC+j*incColC] += alpha*A[i*incRowA+l*incColA]
                                               *B[l*incRowB+j*incColB];
            }
        }
    }
}

} // namespace refColMajor

//==============================================================================
// simpleBuffer
//==============================================================================

#ifndef SIMPLE_PUFFER_M_C
#define SIMPLE_PUFFER_M_C 256
#endif

#ifndef SIMPLE_PUFFER_K_C
#define SIMPLE_PUFFER_K_C 256
#endif

#ifndef SIMPLE_PUFFER_N_C
#define SIMPLE_PUFFER_N_C 1024
#endif

namespace simpleBuffer {

double A_[SIMPLE_PUFFER_M_C*SIMPLE_PUFFER_K_C];
double B_[SIMPLE_PUFFER_K_C*SIMPLE_PUFFER_N_C];
double C_[SIMPLE_PUFFER_M_C*SIMPLE_PUFFER_N_C];

void
gemm(long m, long n, long k,
     double alpha,
     const double *A, long incRowA, long incColA,
     const double *B, long incRowB, long incColB,
     double beta,
     double *C, long incRowC, long incColC)
{
    long i, j, l;

    long M_C = SIMPLE_PUFFER_M_C;
    long N_C = SIMPLE_PUFFER_N_C;
    long K_C = SIMPLE_PUFFER_K_C;

    long mb = m / M_C;
    long nb = n / N_C;
    long kb = k / K_C;

    long mr = m % M_C;
    long nr = n % N_C;
    long kr = k % K_C;

    using ulmBLAS::geaxpy;
    using ulmBLAS::gecopy;
    using ulmBLAS::gescal;
    using refColMajor::gemm;

    gescal(m, n, beta, C, incRowC, incColC);

    for (j=0; j<=nb; ++j) {
        long N = (j<nb || nr==0) ? N_C : nr;

        for (l=0; l<=kb; ++l) {
            long K = (l<kb || kr==0) ? K_C : kr;

            gecopy(K, N,
                   &B[l*K_C*incRowB+j*N_C*incColB], incRowB, incColB,
                   B_, 1, K_C);

            for (i=0; i<=mb; ++i) {
                long M = (i<mb || mr==0) ? M_C : mr;

                gecopy(M, K,
                       &A[i*M_C*incRowA+l*K_C*incColA], incRowA, incColA,
                       A_, 1, M_C);

                gemm(M, N, K,
                     1.0,
                     A_, 1, M_C,
                     B_, 1, K_C,
                     0.0,
                     C_, 1, M_C);

                geaxpy(M, N, alpha,
                       C_, 1, M_C,
                       &C[i*M_C*incRowC+j*N_C*incColC], incRowC, incColC);
            }
        }
    }
}

} // namespace simpleBuffer

//==============================================================================
// blocked
//==============================================================================


#ifndef BLOCKED_MC
#define BLOCKED_MC 256
#endif

#ifndef BLOCKED_KC
#define BLOCKED_KC 256
#endif

#ifndef BLOCKED_NC
#define BLOCKED_NC 1024
#endif

#ifndef BLOCKED_MR
#define BLOCKED_MR 2
#endif

#ifndef BLOCKED_NR
#define BLOCKED_NR 8
#endif

namespace blocked {

double A_[BLOCKED_MC*BLOCKED_KC];
double B_[BLOCKED_KC*BLOCKED_NC];

void
pack_A(long mc, long kc,
       const double *A, long incRowA, long incColA,
       double *p)
{
    long MR = BLOCKED_MR;
    long mp = (mc+MR-1) / MR;

    // ...
}

void
pack_B(long kc, long nc,
       const double *B, long incRowB, long incColB,
       double *p)
{
    long NR = BLOCKED_NR;
    long np = (nc+NR-1) / NR;

    // ...
}

void
ugemm(long kc, double alpha,
      const double *A, const double *B,
      double beta,
      double *C, long incRowC, long incColC)
{
    double P[BLOCKED_MR*BLOCKED_NR];
    long MR = BLOCKED_MR;
    long NR = BLOCKED_NR;

    // ...
}

void
mgemm(long mc, long nc, long kc,
      double alpha,
      const double *A, const double *B,
      double beta,
      double *C, long incRowC, long incColC)
{
    double C_[BLOCKED_MR*BLOCKED_NR];
    long MR = BLOCKED_MR;
    long NR = BLOCKED_NR;

    long mp  = (mc+MR-1) / MR;
    long np  = (nc+NR-1) / NR;
    long mr_ = mc % MR;
    long nr_ = nc % NR;

    // ...
}

void
gemm(long m, long n, long k,
     double alpha,
     const double *A, long incRowA, long incColA,
     const double *B, long incRowB, long incColB,
     double beta,
     double *C, long incRowC, long incColC)
{
    long MC = BLOCKED_MC;
    long NC = BLOCKED_NC;
    long KC = BLOCKED_KC;

    long mb = (m+MC-1) / MC;
    long nb = (n+NC-1) / NC;
    long kb = (k+KC-1) / KC;

    long mc_ = m % MC;
    long nc_ = n % NC;
    long kc_ = k % KC;

    // ...
}

} // namespace blocked

int
main()
{
    using namespace bench;
    using namespace ulmBLAS;
    using namespace std;

    initMatrix(MAXDIM_M, MAXDIM_K, A, 1, MAXDIM_M);
    initMatrix(MAXDIM_K, MAXDIM_N, B, 1, MAXDIM_K);
    initMatrix(MAXDIM_M, MAXDIM_N, C1, 1, MAXDIM_M);
    gecopy(MAXDIM_M, MAXDIM_N, C1, 1, MAXDIM_M, C2, 1, MAXDIM_M);
    gecopy(MAXDIM_M, MAXDIM_N, C1, 1, MAXDIM_M, C3, 1, MAXDIM_M);

    // Header-Zeile f�r die Ausgabe
    printf("%5s %5s %5s ", "m", "n", "k");
    printf("%5s %5s ", "IRA", "ICA");
    printf("%5s %5s ", "IRB", "ICB");
    printf("%5s %5s ", "IRC", "ICC");
    printf("%20s %9s", "refColMajor: t", "MFLOPS");
    printf("%20s %9s %9s", "refSimpleBuffer: t", "MFLOPS", "diff");
    printf("\n");

    for (long m = MIN_M, n = MIN_N, k = MIN_K;
         m <=MAX_M && n <= MAX_N && k <= MAX_K;
         m += INC_M, n += INC_N, k += INC_K)
    {
        double t0, t1, diff;

        long incRowA = (COLMAJOR) ? 1 : k;
        long incColA = (COLMAJOR) ? m : 1;

        long incRowB = (COLMAJOR) ? 1 : n;
        long incColB = (COLMAJOR) ? k : 1;

        long incRowC = (COLMAJOR) ? 1 : n;
        long incColC = (COLMAJOR) ? m : 1;

        printf("%5ld %5ld %5ld ", m, n, k);
        printf("%5ld %5ld ", incRowA, incColA);
        printf("%5ld %5ld ", incRowB, incColB);
        printf("%5ld %5ld ", incRowC, incColC);

        t0 = walltime();
        refColMajor::gemm(m, n, k, ALPHA,
                          A, incRowA, incColA,
                          B, incRowB, incColB,
                          BETA,
                          C1, incRowC, incColC);
        t1 = walltime() - t0;
        printf("%20.4lf %9.2lf", t1, 2.*m/1000*n/1000*k/t1);

        t0 = walltime();
        simpleBuffer::gemm(m, n, k, ALPHA,
                           A, incRowA, incColA,
                           B, incRowB, incColB,
                           BETA,
                           C2, incRowC, incColC);
        t1 = walltime() - t0;
        diff = asumDiffMatrix(m, n,
                              C1, incRowC, incColC,
                              C2, incRowC, incColC);
        printf("%20.4lf %9.2lf %9.1e", t1, 2.*m/1000*n/1000*k/t1, diff);

        t0 = walltime();
        blocked::gemm(m, n, k, ALPHA,
                      A, incRowA, incColA,
                      B, incRowB, incColB,
                      BETA,
                      C3, incRowC, incColC);
        t1 = walltime() - t0;
        diff = asumDiffMatrix(m, n,
                              C1, incRowC, incColC,
                              C3, incRowC, incColC);
        printf("%20.4lf %9.2lf %9.1e", t1, 2.*m/1000*n/1000*k/t1, diff);
        printf("\n");
    }
    return 0;
}