Aufgabe: GEMM-Macro-Kernel

Bezüglich \(M_r\) und \(N_r\) wird die \(m \times n\) Matrix \(C\) partitioniert. Dann entstehen \(m_p = \left\lceil \frac{m}{M_r} \right\rceil\) Blockzeilen und \(n_p = \left\lceil \frac{n}{N_r} \right\rceil\) Blockspalten:

\[C = \left(\begin{array}{c|c|c|c} C_{0,\,0} & C_{0,\,N_r} & \dots & C_{0,\, N_r\cdot (n_p-1)} \\ \hline C_{M_r,\,0} & C_{M_r,\,N_r} & \dots & C_{M_r,\,N_r\cdot (n_p-1)} \\ \hline \vdots & \vdots & & \vdots \\ \hline C_{M_r\cdot (m_p-1),\,0} & C_{M_r\cdot (m_p-1),\,N_r} & \dots & C_{M_r\cdot (m_p-1),\,N_r\cdot (n_p-1)} \\ \end{array}\right)\]

Für \(i \in \{0, M_r, \dots, M_r\cdot (m_p-1)\}\) und \(j \in \{0, N_r, \dots, N_r\cdot (n_p-1)\}\) ist der Block \(C_{i,\,j}\) eine \(m_r \times n_r\) Matrix mit

\[m_r =\begin{cases}M_r, & i+M_r < m, \\m - i, & \text{else}\end{cases}\qquad\text{und}\qquad n_r =\begin{cases}N_r, & j+N_r < n, \\n - j, & \text{else.}\end{cases}\]

Für die GEMM-Operation

\[C \leftarrow \beta \, C + \alpha\,A B\]

liegen die Matrizen \(A\) und \(B\) in einem entsprechend gepackten Format als \(\overline{A}\) und \(\overline{B}\) vor:

\(\overline{A}\) ist ein gepackter \(m_c \times k_c\) Block aus \(A\). Dieser besteht aus \(m_p = \left\lceil \frac{m_c}{M_r} \right\rceil\) horizontalen Paneelen:
\[\overline{A} =\left(\begin{array}{c} A_0 \\ \hline A_{M_r} \\ \hline \vdots \\ \hline A_{M_r \cdot (m_p-1)} \end{array}\right)\]
Jede Paneele hat die Dimension \(M_r \times k_c\) (Die letzte Paneele wurde beim Packen eventuell mit Nullen auf \(M_r\) Zeilen aufgefüllt).
\(\overline{B}\) ist ein gepackter \(k_c \times n_c\) Block aus \(B\). Dieser besteht aus \(n_p = \left\lceil \frac{n_c}{N_r} \right\rceil\) vertikalen Paneelen:
\[\overline{B} =\left(\begin{array}{c|c|c|c} B_0 & B_{N_r} & \dots & B_{N_r\cdot (n_p-1)} \end{array}\right)\]
Jede Paneele hat die Dimension \(k_c \times N_r\) (Die letzte Paneele wurde beim Packen eventuell mit Nullen auf \(N_r\) Spalten aufgefüllt).

Das Produkt einer Paneele aus \(\overline{A}\) mit einer Paneele aus \(\overline{B}\) ist somit stets ein Block mit Dimension \(M_r \times N_r\) Im Algorithmus ist somit eine Fallunterscheidung notwendig:

\(\overline{C}\) sei eine lokale \(M_r \times N_r\) Matrix
for \(i=0, M_r, \dots, M_r\cdot(m_p-1)\)
- \(m_r = \begin{cases} M_r,& i+M_r < m,\\ m - i,& \text{else} \end{cases}\)
- for \(j=0, N_r, \dots, N_r\cdot(n_p-1)\)
  - \(n_r = \begin{cases} N_r,& j+N_r < n,\\ n - j,& \text{else} \end{cases}\)
  - if \(m_r = M_r \;\land\; n_r = N_r\)
    - \(C_{i,j} \leftarrow \beta C_{i,j} + \alpha \overline{A}_i \overline{B}_j\)
  - else
    - \(\overline{C} \leftarrow \alpha \overline{A}_i \overline{B}_j\)
    - \(C_{i,j} \leftarrow \beta C_{i,j} + \overline{C}_{(0:m_r-1)\times(0:n_r-1)}\)

Hinweise:

Das Produkt von Paneelen soll natürlich mit dem Micro-Kernel berechnet werden.
Mit \(\overline{C}_{(0:m_r-1)\times(0:n_r-1)}\) bezeichnen wir die ersten \(m_r\) Zeilen und \(n_r\) Spalten von \(\overline{C}\).

Vorlage

Die Vorlage ist wieder unabhängig vom HPC-Projekt. Funktionen wie beispielsweise dgescal oder dgeaxpy wurden manuell vom HPC-Projekt übernommen.

Vervollständigt die Funktion dgemm_macro:

#include <stdio.h>
#include <math.h>
#include <stdlib.h>
#include <sys/times.h>
#include <unistd.h>

//-- setup and print matrices --------------------------------------------------

void
initGeMatrix(int m, int n, double *A, int incRowA, int incColA)
{
    int i, j;

    for (i=0; i<m; ++i) {
        for (j=0; j<n; ++j) {
            A[i*incRowA+j*incColA] = i*n + j + 1;
        }
    }
}

void
printGeMatrix(int m, int n, const double *A, int incRowA, int incColA)
{
    int i, j;

    for (i=0; i<m; ++i) {
        for (j=0; j<n; ++j) {
            printf("%10.4lf ", A[i*incRowA+j*incColA]);
        }
        printf("\n");
    }
    printf("\n\n");
}

//------------------------------------------------------------------------------

void
dgecopy(int m, int n,
        const double *X, int incRowX, int incColX,
        double *Y, int incRowY, int incColY)
{
    int i, j;

    for (i=0; i<m; ++i) {
        for (j=0; j<n; ++j) {
            Y[i*incRowY+j*incColY] = X[i*incRowX+j*incColX];
        }
    }
}

void
dgescal(int m, int n, double alpha,
        double *X, int incRowX, int incColX)
{
    int i, j;

    for (j=0; j<n; ++j) {
        for (i=0; i<m; ++i) {
            X[i*incRowX+j*incColX] *= alpha;
        }
    }
}

void
dgeaxpy(int m, int n, double alpha,
        const double *X, int incRowX, int incColX,
        double *Y, int incRowY, int incColY)
{
    int i, j;

    if (alpha==0 || m==0 || n==0) {
        return;
    }
    for (j=0; j<n; ++j) {
        for (i=0; i<m; ++i) {
            Y[i*incRowY+j*incColY] += alpha*X[i*incRowX+j*incColX];
        }
    }
}

void
dgemm_ilj(int m, int n, int k,
          double alpha,
          const double *A, int incRowA, int incColA,
          const double *B, int incRowB, int incColB,
          double beta,
          double *C, int incRowC, int incColC)
{
    int i, j, l;

    for (i=0; i<m; ++i) {
        for (l=0; l<k; ++l) {
            for (j=0; j<n; ++j) {
                if (l==0) {
                    C[i*incRowC+j*incColC] *= beta;
                }
                C[i*incRowC+j*incColC] += alpha*A[i*incRowA+l*incColA]
                                               *B[l*incRowB+j*incColB];
            }
        }
    }
}

//------------------------------------------------------------------------------

#ifndef DGEMM_MC
#define DGEMM_MC 8
#endif

#ifndef DGEMM_NC
#define DGEMM_NC 9
#endif

#ifndef DGEMM_KC
#define DGEMM_KC 10
#endif

#ifndef DGEMM_MR
#define DGEMM_MR 4
#endif

#ifndef DGEMM_NR
#define DGEMM_NR 3
#endif



//------------------------------------------------------------------------------

void
dpack_A(int m, int k, const double *A, int incRowA, int incColA, double *p)
{
    int i, i0, j, l, nu;
    int mp = (m+DGEMM_MR-1) / DGEMM_MR;

    for (j=0; j<k; ++j) {
        for (l=0; l<mp; ++l) {
            for (i0=0; i0<DGEMM_MR; ++i0) {
                i     = l*DGEMM_MR + i0;
                nu    = l*DGEMM_MR*k + j*DGEMM_MR + i0;
                p[nu] = (i<m) ? A[i*incRowA+j*incColA]
                              : 0;
            }
        }
    }
}

void
dpack_B(int k, int n, const double *B, int incRowB, int incColB, double *p)
{
    int i, j, j0, l, nu;
    int np = (n+DGEMM_NR-1) / DGEMM_NR;

    for (l=0; l<np; ++l) {
        for (j0=0; j0<DGEMM_NR; ++j0) {
            j = l*DGEMM_NR + j0;
            for (i=0; i<k; ++i) {
                nu    = l*DGEMM_NR*k + i*DGEMM_NR + j0;
                p[nu] = (j<n) ? B[i*incRowB+j*incColB]
                              : 0;
            }
        }
    }
}

void
dgemm_micro(int k,
            double alpha,
            const double *A,
            const double *B,
            double beta,
            double *C, int incRowC, int incColC)
{
    int    i, j, l;
    double AB[DGEMM_MR*DGEMM_NR];

    for (l=0; l<DGEMM_MR*DGEMM_NR; ++l) {
        AB[l] = 0;
    }
    for (l=0; l<k; ++l) {
        for (j=0; j<DGEMM_NR; ++j) {
            for (i=0; i<DGEMM_MR; ++i) {
                AB[i+j*DGEMM_MR] += A[i+l*DGEMM_MR]*B[j+l*DGEMM_NR];
            }
        }
    }
    if (beta==0) {
        for (j=0; j<DGEMM_NR; ++j) {
            for (i=0; i<DGEMM_MR; ++i) {
                C[i*incRowC+j*incColC] = alpha*AB[i+j*DGEMM_MR];
            }
        }
    } else {
        for (j=0; j<DGEMM_NR; ++j) {
            for (i=0; i<DGEMM_MR; ++i) {
                C[i*incRowC+j*incColC] = beta*C[i*incRowC+j*incColC]
                                       + alpha*AB[i+j*DGEMM_MR];
            }
        }
    }
}

void
dgemm_macro(int m, int n, int k,
            double alpha,
            const double *A,
            const double *B,
            double beta,
            double *C, int incRowC, int incColC)
{
    double C_[DGEMM_MR*DGEMM_NR];
    int i, j;

    const int MR = DGEMM_MR;
    const int NR = DGEMM_NR;

    for (i=0; i<m; i+=MR) {
        int mr = (i+MR<m) ? MR
                          : m - i;

        // ... your code here ....
    }
}

//------------------------------------------------------------------------------
#ifndef DIM_M
#define DIM_M 8
#endif

#ifndef DIM_N
#define DIM_N 7
#endif

#ifndef DIM_K
#define DIM_K 12
#endif

#ifndef ALPHA
#define ALPHA 2
#endif

#ifndef BETA
#define BETA 2
#endif

#ifndef ROWMAJOR_A
#define INCROW_A 1
#define INCCOL_A DIM_M
#else
#define INCROW_A DIM_K
#define INCCOL_A 1
#endif

#ifndef ROWMAJOR_B
#define INCROW_B 1
#define INCCOL_B DIM_K
#else
#define INCROW_B DIM_N
#define INCCOL_B 1
#endif

#ifndef ROWMAJOR_C
#define INCROW_C 1
#define INCCOL_C DIM_M
#else
#define INCROW_C DIM_N
#define INCCOL_C 1
#endif

double A[DIM_M*DIM_K];
double B[DIM_K*DIM_N];

double A_[DGEMM_MC*DGEMM_KC];
double B_[DGEMM_KC*DGEMM_NC];

double C[DIM_M*DIM_N];
double C_ref[DIM_M*DIM_N];

#define MIN(X,Y) (X) < (Y) ? (X) : (Y)

int
main()
{
    initGeMatrix(DIM_M, DIM_K, A, INCROW_A, INCCOL_A);
    initGeMatrix(DIM_K, DIM_N, B, INCROW_B, INCCOL_B);
    initGeMatrix(DIM_M, DIM_N, C, INCROW_C, INCCOL_C);
    dgecopy(DIM_M, DIM_N,
            C, INCROW_C, INCCOL_C,
            C_ref, INCROW_C, INCCOL_C);

    // pack block from A
    dpack_A(MIN(DIM_M, DGEMM_MC), MIN(DIM_K, DGEMM_KC),
            A, INCROW_A, INCCOL_A, A_);

    // pack block from B
    dpack_B(MIN(DIM_K, DGEMM_KC), MIN(DIM_N, DGEMM_NC),
            B, INCROW_B, INCCOL_B, B_);

    printf("A =\n");
    printGeMatrix(DIM_M, DIM_K, A, INCROW_A, INCCOL_A);
    printf("B =\n");
    printGeMatrix(DIM_K, DIM_N, B, INCROW_B, INCCOL_B);
    printf("C =\n");
    printGeMatrix(DIM_M, DIM_N, C, INCROW_C, INCCOL_C);

    // pack block from A
    dpack_A(MIN(DIM_M, DGEMM_MC), MIN(DIM_K, DGEMM_KC),
            A, INCROW_A, INCCOL_A, A_);

    // pack block from B
    dpack_B(MIN(DIM_K, DGEMM_KC), MIN(DIM_N, DGEMM_NC),
            B, INCROW_B, INCCOL_B, B_);

    // multiply with macro-kernel
    dgemm_macro(MIN(DIM_M, DGEMM_MC),
                MIN(DIM_N, DGEMM_NC),
                MIN(DIM_K, DGEMM_KC),
                (double)ALPHA,
                A_, B_,
                (double)BETA,
                C, INCROW_C, INCCOL_C);

    printf("%lf*C + %lf*A*B =\n", (double)ALPHA, (double)BETA);
    printGeMatrix(DIM_M, DIM_N, C, INCROW_C, INCCOL_C);

    printf("C_ref =\n");
    dgemm_ilj(MIN(DIM_M, DGEMM_MC),
              MIN(DIM_N, DGEMM_NC),
              MIN(DIM_K, DGEMM_KC),
              (double)ALPHA,
              A, INCROW_A, INCCOL_A,
              B, INCROW_B, INCCOL_B,
              (double)BETA,
              C_ref, INCROW_C, INCCOL_C);
    printGeMatrix(DIM_M, DIM_N, C_ref, INCROW_C, INCCOL_C);

    return 0;
}