Content

Einfache GEMM Block-Varianten

Durch Makros werden Konstanten \(M_c\), \(N_c\) und \(K_c\) festgelegt. Die Matrizen werden dann wie folgt partitioniert:

Beachtet, dass bei einer quasi-äquidistanten Partitionierung damit die maximale Größe eines Blockes angegeben werden. Blöcke am unteren oder rechten Rand können kleiner sein.

Die Multiplikation wird dabei auf die Multiplikation

\[ C_{i,j} \leftarrow \beta\,C_{i,j} + \alpha\,A_{i,\ell} B_{\ell,j}\]

von Blöcken zurückgeführt. Dafür sind wieder 6 Varianten möglich, die sich darin unterscheiden wie die Schleifen über \(i\), \(j\) und \(\ell\) verschachtelt sind.

In allen Varianten sollen die Blöcke \(A_{i,\ell}\) und \(B_{\ell,j}\) vor der Block-Multiplikation gepuffert werden: Mit

\[ \overline{A} \leftarrow A_{i,\ell},\quad \overline{B} \leftarrow B_{\ell,j},\quad \overline{C} \leftarrow C_{i,j}\]

deuten wir an, dass die Blöcke \(A_{i,\ell}\), \(B_{\ell,j}\) und \(C_{i,j}\) jeweils kopiert werden. Das Speicherformat für die Matrizen \(\overline{A}\), \(\overline{B}\) und \(\overline{C}\) sei stets Col-Major.

Die Operation

\[ \overline{C} \leftarrow \overline{A}\overline{B}\]

soll dann mit der ungeblockten dgemm_jli Variante für GEMM ausgeführt werden.

Anschließend kann mit GEAXPY

\[ C_{i,j} \leftarrow C_{i,j} + \alpha \overline{C}\]

Am Anfang der Funktion (außerhalb der Schleifen) sei bereits C mit

\[ C \leftarrow \beta C \qquad(\text{GESCAL})\]

überschrieben worden.

Vorlage

Grundgerüst für level3.h

Die Header-Datei ist um eine Signatur für dgemm_blk_jli ergänzt worden:

#ifndef ULMBLAS_LEVEL3_H
#define ULMBLAS_LEVEL3_H 1

//
// BLAS Level 3
//

//-- GEMM ----------------------------------------------------------------------

void
dgemm_mv(int m, int n, int k,
         double alpha,
         const double *A, int incRowA, int incColA,
         const double *B, int incRowB, int incColB,
         double beta,
         double *C, int incRowC, int incColC);

void
dgemm_jil(int m, int n, int k,
          double alpha,
          const double *A, int incRowA, int incColA,
          const double *B, int incRowB, int incColB,
          double beta,
          double *C, int incRowC, int incColC);

void
dgemm_jli(int m, int n, int k,
          double alpha,
          const double *A, int incRowA, int incColA,
          const double *B, int incRowB, int incColB,
          double beta,
          double *C, int incRowC, int incColC);

void
dgemm_vm(int m, int n, int k,
         double alpha,
         const double *A, int incRowA, int incColA,
         const double *B, int incRowB, int incColB,
         double beta,
         double *C, int incRowC, int incColC);

void
dgemm_ijl(int m, int n, int k,
          double alpha,
          const double *A, int incRowA, int incColA,
          const double *B, int incRowB, int incColB,
          double beta,
          double *C, int incRowC, int incColC);

void
dgemm_ilj(int m, int n, int k,
          double alpha,
          const double *A, int incRowA, int incColA,
          const double *B, int incRowB, int incColB,
          double beta,
          double *C, int incRowC, int incColC);

void
dgemm_vv(int m, int n, int k,
         double alpha,
         const double *A, int incRowA, int incColA,
         const double *B, int incRowB, int incColB,
         double beta,
         double *C, int incRowC, int incColC);

void
dgemm_lij(int m, int n, int k,
          double alpha,
          const double *A, int incRowA, int incColA,
          const double *B, int incRowB, int incColB,
          double beta,
          double *C, int incRowC, int incColC);

void
dgemm_lji(int m, int n, int k,
          double alpha,
          const double *A, int incRowA, int incColA,
          const double *B, int incRowB, int incColB,
          double beta,
          double *C, int incRowC, int incColC);

void
dgemm_blk_jli(int m, int n, int k,
              double alpha,
              const double *A, int incRowA, int incColA,
              const double *B, int incRowB, int incColB,
              double beta,
              double *C, int incRowC, int incColC);

#endif // ULMBLAS_LEVEL3_H

Grundgerüst für level3.c

Die Source-Datei wird entsprechned um die Funktion dgemm_blk_jli erweitert:

#include <ulmblas/level3.h>
#include <ulmblas/level2.h>
#include <ulmblas/level1.h>
#include <stdlib.h>

//
// BLAS Level 3
//

//-- GEMM ----------------------------------------------------------------------

void
dgemm_mv(int m, int n, int k,
         double alpha,
         const double *A, int incRowA, int incColA,
         const double *B, int incRowB, int incColB,
         double beta,
         double *C, int incRowC, int incColC)
{
    int j;

    for (j=0; j<n; ++j) {
        dgemv(m, k,
              alpha,
              A, incRowA, incColA,
              &B[j*incColB], incRowB,
              beta,
              &C[j*incColC], incRowC);
    }
}

void
dgemm_jil(int m, int n, int k,
          double alpha,
          const double *A, int incRowA, int incColA,
          const double *B, int incRowB, int incColB,
          double beta,
          double *C, int incRowC, int incColC)
{
    int i, j, l;

    for (j=0; j<n; ++j) {
        for (i=0; i<m; ++i) {
            C[i*incRowC+j*incColC] *= beta;
            for (l=0; l<k; ++l) {
                C[i*incRowC+j*incColC] += alpha*A[i*incRowA+l*incColA]
                                               *B[l*incRowB+j*incColB];
            }
        }
    }
}

void
dgemm_jli(int m, int n, int k,
          double alpha,
          const double *A, int incRowA, int incColA,
          const double *B, int incRowB, int incColB,
          double beta,
          double *C, int incRowC, int incColC)
{
    int i, j, l;

    for (j=0; j<n; ++j) {
        for (l=0; l<k; ++l) {
            for (i=0; i<m; ++i) {
                if (l==0) {
                    C[i*incRowC+j*incColC] *= beta;
                }
                C[i*incRowC+j*incColC] += alpha*A[i*incRowA+l*incColA]
                                               *B[l*incRowB+j*incColB];
            }
        }
    }
}

void
dgemm_vm(int m, int n, int k,
         double alpha,
         const double *A, int incRowA, int incColA,
         const double *B, int incRowB, int incColB,
         double beta,
         double *C, int incRowC, int incColC)
{
    int i;

    for (i=0; i<m; ++i) {
        dgemv(n, k,
              alpha,
              B, incColB, incRowB,
              &A[i*incRowA], incColA,
              beta,
              &C[i*incRowC], incColC);
    }
}

void
dgemm_ilj(int m, int n, int k,
          double alpha,
          const double *A, int incRowA, int incColA,
          const double *B, int incRowB, int incColB,
          double beta,
          double *C, int incRowC, int incColC)
{
    int i, j, l;

    for (i=0; i<m; ++i) {
        for (l=0; l<k; ++l) {
            for (j=0; j<n; ++j) {
                if (l==0) {
                    C[i*incRowC+j*incColC] *= beta;
                }
                C[i*incRowC+j*incColC] += alpha*A[i*incRowA+l*incColA]
                                               *B[l*incRowB+j*incColB];
            }
        }
    }
}

void
dgemm_ijl(int m, int n, int k,
          double alpha,
          const double *A, int incRowA, int incColA,
          const double *B, int incRowB, int incColB,
          double beta,
          double *C, int incRowC, int incColC)
{
    int i, j, l;

    for (i=0; i<m; ++i) {
        for (j=0; j<n; ++j) {
            C[i*incRowC+j*incColC] *= beta;
            for (l=0; l<k; ++l) {
                C[i*incRowC+j*incColC] += alpha*A[i*incRowA+l*incColA]
                                               *B[l*incRowB+j*incColB];
            }
        }
    }
}

void
dgemm_vv(int m, int n, int k,
         double alpha,
         const double *A, int incRowA, int incColA,
         const double *B, int incRowB, int incColB,
         double beta,
         double *C, int incRowC, int incColC)
{
    int l;

    dgescal(m, n, alpha, C, incRowC, incColC);

    for (l=0; l<k; ++l) {
        dger(m, n, alpha,
             &A[l*incColA], incRowA,
             &B[l*incRowB], incColB,
             C, incRowC, incColC);
    }
}

void
dgemm_lij(int m, int n, int k,
          double alpha,
          const double *A, int incRowA, int incColA,
          const double *B, int incRowB, int incColB,
          double beta,
          double *C, int incRowC, int incColC)
{
    int i, j, l;

    for (j=0; j<n; ++j) {
        for (i=0; i<m; ++i) {
            C[i*incRowC+j*incColC] *= beta;
        }
    }
    for (l=0; l<k; ++l) {
        for (i=0; i<m; ++i) {
            for (j=0; j<n; ++j) {
                C[i*incRowC+j*incColC] += alpha*A[i*incRowA+l*incColA]
                                               *B[l*incRowB+j*incColB];
            }
        }
    }
}

void
dgemm_lji(int m, int n, int k,
          double alpha,
          const double *A, int incRowA, int incColA,
          const double *B, int incRowB, int incColB,
          double beta,
          double *C, int incRowC, int incColC)
{
    int i, j, l;

    for (j=0; j<n; ++j) {
        for (i=0; i<m; ++i) {
            C[i*incRowC+j*incColC] *= beta;
        }
    }
    for (l=0; l<k; ++l) {
        for (j=0; j<n; ++j) {
            for (i=0; i<m; ++i) {
                C[i*incRowC+j*incColC] += alpha*A[i*incRowA+l*incColA]
                                               *B[l*incRowB+j*incColB];
            }
        }
    }
}


//--------------------------------------------------------------------------------
#ifndef GEMM_MC
#define GEMM_MC     256
#endif

#ifndef GEMM_NC
#define GEMM_NC     256
#endif

#ifndef GEMM_KC
#define GEMM_KC     1024
#endif

void
dgemm_blk_jli(int m, int n, int k,
              double alpha,
              const double *A, int incRowA, int incColA,
              const double *B, int incRowB, int incColB,
              double beta,
              double *C, int incRowC, int incColC)
{
    int i, j, l;

    double *A_ = (double *)malloc(sizeof(double)*GEMM_MC*GEMM_KC);
    double *B_ = (double *)malloc(sizeof(double)*GEMM_KC*GEMM_NC);
    double *C_ = (double *)malloc(sizeof(double)*GEMM_MC*GEMM_NC);

    dgescal(m, n, beta, C, incRowC, incColC);

    for (j=0; j<n; j+=GEMM_NC) {
        int nc = (j+GEMM_NC<=n) ? GEMM_NC
                                : n - j;

        // ... your code here ...
    }

    free(A_);
    free(B_);
    free(C_);
}

Program für Benchmark

#include <stdio.h>

#include <stdlib.h>
#include <ulmblas/level1.h>
#include <ulmblas/level3.h>
#include <bench/aux.h>
#include <bench/errbound.h>
#include <bench/refblas.h>

#ifndef ALPHA
#define ALPHA 2
#endif

#ifndef BETA
#define BETA 2
#endif

#ifndef MIN_N
#define MIN_N 100
#endif

#ifndef MAX_N
#define MAX_N 1500
#endif

#ifndef INC_N
#define INC_N 100
#endif

#ifndef MIN_M
#define MIN_M 100
#endif

#ifndef MAX_M
#define MAX_M 1500
#endif

#ifndef INC_M
#define INC_M 100
#endif

#ifndef MIN_K
#define MIN_K 100
#endif

#ifndef MAX_K
#define MAX_K 1500
#endif

#ifndef INC_K
#define INC_K 100
#endif

#ifndef ROWMAJOR_A
#define ROWMAJOR_A 0
#endif

#ifndef ROWMAJOR_B
#define ROWMAJOR_B 0
#endif

#ifndef ROWMAJOR_C
#define ROWMAJOR_C 0
#endif

#if (ROWMAJOR_A==1)
#   define INCROW_A  MAX_K
#   define INCCOL_A  1
#else
#   define INCROW_A  1
#   define INCCOL_A  MAX_M
#endif

#if (ROWMAJOR_B==1)
#   define INCROW_B  MAX_N
#   define INCCOL_B  1
#else
#   define INCROW_B  1
#   define INCCOL_B  MAX_K
#endif

#if (ROWMAJOR_C==1)
#   define INCROW_C  MAX_N
#   define INCCOL_C  1
#else
#   define INCROW_C  1
#   define INCCOL_C  MAX_M
#endif

#define MIN(X,Y)   ((X)<(Y) ? (X) : (Y))
#define MAX(X,Y)   ((X)>(Y) ? (X) : (Y))

double A_[MAX_M*MAX_K*MIN(INCROW_A,INCCOL_A)];
double B_[MAX_K*MAX_N*MIN(INCROW_B,INCCOL_B)];
double C_[MAX_M*MAX_N];

double C_0[MAX_M*MAX_N*MIN(INCROW_A,INCCOL_A)];
double C_1[MAX_M*MAX_N*MIN(INCROW_A,INCCOL_A)];

int
main()
{
    int m, n, k;

    randGeMatrix(MAX_M, MAX_K, A_, INCROW_A, INCCOL_A);
    randGeMatrix(MAX_K, MAX_N, B_, INCROW_B, INCCOL_B);
    randGeMatrix(MAX_M, MAX_N, C_, 1, MAX_M);

    printf("#%9s %9s %9s", "m", "n", "k");
    printf(" %12s %12s %12s", "t", "MFLOPS", "err");
    printf(" %12s %12s %12s", "t", "MFLOPS", "err");
    printf(" %12s %12s %12s", "t", "MFLOPS", "err");
    printf("\n");

    for (m=MIN_M, n=MIN_N, k=MIN_K;
         n<=MAX_N && m<=MAX_M && k<=MAX_K;
         m+=INC_M, n+=INC_N, k+=INC_K)
    {
        double t, dt, err;
        int    runs  = 1;
        double ops   = 2.0*m/1000*n/1000*k;

        printf(" %9d %9d %9d", m, n, k);

        // compute reference solution
        dgecopy(m, n, C_, 1, MAX_M, C_0, INCROW_C, INCCOL_C);
        dgemm_ref(m, n, k,
                  ALPHA,
                  A_, INCROW_A, INCCOL_A,
                  B_, INCROW_B, INCCOL_B,
                  BETA,
                  C_0, INCROW_C, INCCOL_C);

        // bench matrix-vector variants

        // dgemm_mv

        t    = 0;
        runs = 0;
        do {
            dgecopy(m, n, C_, 1, MAX_M, C_1, INCROW_C, INCCOL_C);
            dt = walltime();
            dgemm_mv(m, n, k,
                     ALPHA,
                     A_, INCROW_A, INCCOL_A,
                     B_, INCROW_B, INCCOL_B,
                     BETA,
                     C_1, INCROW_C, INCCOL_C);
            dt = walltime() - dt;
            t += dt;
            ++runs;
        } while (t<0.3);
        t /= runs;

        err = err_dgemm(m, n, k,
                        ALPHA,
                        A_, INCROW_A, INCCOL_A,
                        B_, INCROW_B, INCCOL_B,
                        BETA,
                        C_0, INCROW_C, INCCOL_C,
                        C_1, INCROW_C, INCCOL_C);

        printf(" %12.2e %12.2lf %12.2e", t, ops/t, err);

        // dgemm_blk_jli

        t    = 0;
        runs = 0;
        do {
            dgecopy(m, n, C_, 1, MAX_M, C_1, INCROW_C, INCCOL_C);
            dt = walltime();
            dgemm_blk_jli(m, n, k,
                          ALPHA,
                          A_, INCROW_A, INCCOL_A,
                          B_, INCROW_B, INCCOL_B,
                          BETA,
                          C_1, INCROW_C, INCCOL_C);
            dt = walltime() - dt;
            t += dt;
            ++runs;
        } while (t<0.3);
        t /= runs;

        err = err_dgemm(m, n, k,
                        ALPHA,
                        A_, INCROW_A, INCCOL_A,
                        B_, INCROW_B, INCCOL_B,
                        BETA,
                        C_0, INCROW_C, INCCOL_C,
                        C_1, INCROW_C, INCCOL_C);

        printf(" %12.2e %12.2lf %12.2e", t, ops/t, err);
        printf("\n");
    }

    return 0;
}