Lösungsvorschlag

$shell> gcc -Wall dgemm_micro.c
$shell> ./a.out
A_panel =
    1.0000     2.0000     3.0000     4.0000     5.0000     6.0000     7.0000     8.0000     9.0000    10.0000    11.0000    12.0000 
   13.0000    14.0000    15.0000    16.0000    17.0000    18.0000    19.0000    20.0000    21.0000    22.0000    23.0000    24.0000 
   25.0000    26.0000    27.0000    28.0000    29.0000    30.0000    31.0000    32.0000    33.0000    34.0000    35.0000    36.0000 
   37.0000    38.0000    39.0000    40.0000    41.0000    42.0000    43.0000    44.0000    45.0000    46.0000    47.0000    48.0000 


B_panel =
    1.0000     2.0000     3.0000 
    4.0000     5.0000     6.0000 
    7.0000     8.0000     9.0000 
   10.0000    11.0000    12.0000 
   13.0000    14.0000    15.0000 
   16.0000    17.0000    18.0000 
   19.0000    20.0000    21.0000 
   22.0000    23.0000    24.0000 
   25.0000    26.0000    27.0000 
   28.0000    29.0000    30.0000 
   31.0000    32.0000    33.0000 
   34.0000    35.0000    36.0000 


C =
    1.0000     2.0000     3.0000 
    4.0000     5.0000     6.0000 
    7.0000     8.0000     9.0000 
   10.0000    11.0000    12.0000 


2.000000*C + 2.000000*A*B =
 3590.0000  3748.0000  3906.0000 
 8636.0000  9082.0000  9528.0000 
13682.0000 14416.0000 15150.0000 
18728.0000 19750.0000 20772.0000 


C_ref =
 3590.0000  3748.0000  3906.0000 
 8636.0000  9082.0000  9528.0000 
13682.0000 14416.0000 15150.0000 
18728.0000 19750.0000 20772.0000 
$shell> 
#include <stdio.h>
#include <math.h>
#include <stdlib.h>
#include <sys/times.h>
#include <unistd.h>

//-- setup and print matrices --------------------------------------------------

void
initGeMatrix(int m, int n, double *A, int incRowA, int incColA)
{
    int i, j;

    for (i=0; i<m; ++i) {
        for (j=0; j<n; ++j) {
            A[i*incRowA+j*incColA] = i*n + j + 1;
        }
    }
}

void
printGeMatrix(int m, int n, const double *A, int incRowA, int incColA)
{
    int i, j;

    for (i=0; i<m; ++i) {
        for (j=0; j<n; ++j) {
            printf("%10.4lf ", A[i*incRowA+j*incColA]);
        }
        printf("\n");
    }
    printf("\n\n");
}

//------------------------------------------------------------------------------

void
dgecopy(int m, int n,
        const double *X, int incRowX, int incColX,
        double *Y, int incRowY, int incColY)
{
    int i, j;

    for (i=0; i<m; ++i) {
        for (j=0; j<n; ++j) {
            Y[i*incRowY+j*incColY] = X[i*incRowX+j*incColX];
        }
    }
}

void
dgescal(int m, int n, double alpha,
        double *X, int incRowX, int incColX)
{
    int i, j;

    for (j=0; j<n; ++j) {
        for (i=0; i<m; ++i) {
            X[i*incRowX+j*incColX] *= alpha;
        }
    }
}

void
dgeaxpy(int m, int n, double alpha,
        const double *X, int incRowX, int incColX,
        double *Y, int incRowY, int incColY)
{
    int i, j;

    if (alpha==0 || m==0 || n==0) {
        return;
    }
    for (j=0; j<n; ++j) {
        for (i=0; i<m; ++i) {
            Y[i*incRowY+j*incColY] += alpha*X[i*incRowX+j*incColX];
        }
    }
}

void
dgemm_ilj(int m, int n, int k,
          double alpha,
          const double *A, int incRowA, int incColA,
          const double *B, int incRowB, int incColB,
          double beta,
          double *C, int incRowC, int incColC)
{
    int i, j, l;

    for (i=0; i<m; ++i) {
        for (l=0; l<k; ++l) {
            for (j=0; j<n; ++j) {
                if (l==0) {
                    C[i*incRowC+j*incColC] *= beta;
                }
                C[i*incRowC+j*incColC] += alpha*A[i*incRowA+l*incColA]
                                               *B[l*incRowB+j*incColB];
            }
        }
    }
}

//------------------------------------------------------------------------------

#ifndef DGEMM_MC
#define DGEMM_MC 8
#endif

#ifndef DGEMM_NC
#define DGEMM_NC 9
#endif

#ifndef DGEMM_KC
#define DGEMM_KC 10
#endif

#ifndef DGEMM_MR
#define DGEMM_MR 4
#endif

#ifndef DGEMM_NR
#define DGEMM_NR 3
#endif



//------------------------------------------------------------------------------

void
dpack_A(int m, int k, const double *A, int incRowA, int incColA, double *p)
{
    int i, i0, j, l, nu;
    int mp = (m+DGEMM_MR-1) / DGEMM_MR;

    for (j=0; j<k; ++j) {
        for (l=0; l<mp; ++l) {
            for (i0=0; i0<DGEMM_MR; ++i0) {
                i     = l*DGEMM_MR + i0;
                nu    = l*DGEMM_MR*k + j*DGEMM_MR + i0;
                p[nu] = (i<m) ? A[i*incRowA+j*incColA]
                              : 0;
            }
        }
    }
}

void
dpack_B(int k, int n, const double *B, int incRowB, int incColB, double *p)
{
    int i, j, j0, l, nu;
    int np = (n+DGEMM_NR-1) / DGEMM_NR;

    for (l=0; l<np; ++l) {
        for (j0=0; j0<DGEMM_NR; ++j0) {
            j = l*DGEMM_NR + j0;
            for (i=0; i<k; ++i) {
                nu    = l*DGEMM_NR*k + i*DGEMM_NR + j0;
                p[nu] = (j<n) ? B[i*incRowB+j*incColB]
                              : 0;
            }
        }
    }
}

void
dgemm_micro(int k,
            double alpha,
            const double *A,
            const double *B,
            double beta,
            double *C, int incRowC, int incColC)
{
    int    i, j, l;
    double AB[DGEMM_MR*DGEMM_NR];

    for (l=0; l<DGEMM_MR*DGEMM_NR; ++l) {
        AB[l] = 0;
    }
    for (l=0; l<k; ++l) {
        for (j=0; j<DGEMM_NR; ++j) {
            for (i=0; i<DGEMM_MR; ++i) {
                AB[i+j*DGEMM_MR] += A[i+l*DGEMM_MR]*B[j+l*DGEMM_NR];
            }
        }
    }
    if (beta==0) {
        for (j=0; j<DGEMM_NR; ++j) {
            for (i=0; i<DGEMM_MR; ++i) {
                C[i*incRowC+j*incColC] = alpha*AB[i+j*DGEMM_MR];
            }
        }
    } else {
        for (j=0; j<DGEMM_NR; ++j) {
            for (i=0; i<DGEMM_MR; ++i) {
                C[i*incRowC+j*incColC] = beta*C[i*incRowC+j*incColC]
                                       + alpha*AB[i+j*DGEMM_MR];
            }
        }
    }
}

//------------------------------------------------------------------------------

#ifndef DIM_K
#define DIM_K 12
#endif

#ifndef ALPHA
#define ALPHA 2
#endif

#ifndef BETA
#define BETA 2
#endif

#ifndef ROWMAJOR
#define INCROW_C 1
#define INCCOL_C DGEMM_MR
#else
#define INCROW_C DGEMM_NR
#define INCCOL_C 1
#endif


double A_panel[DIM_K*DGEMM_MR];
double B_panel[DIM_K*DGEMM_NR];
double C[DGEMM_MR*DGEMM_NR];
double C_ref[DGEMM_MR*DGEMM_NR];

int
main()
{
    initGeMatrix(DGEMM_MR, DIM_K, A_panel, 1, DGEMM_MR);
    initGeMatrix(DIM_K, DGEMM_NR, B_panel, DGEMM_NR, 1);
    initGeMatrix(DGEMM_MR, DGEMM_NR, C, INCROW_C, INCCOL_C);
    dgecopy(DGEMM_MR, DGEMM_NR,
            C, INCROW_C, INCCOL_C,
            C_ref, INCROW_C, INCCOL_C);

    printf("A_panel =\n");
    printGeMatrix(DGEMM_MR, DIM_K, A_panel, 1, DGEMM_MR);
    printf("B_panel =\n");
    printGeMatrix(DIM_K, DGEMM_NR, B_panel, DGEMM_NR, 1);
    printf("C =\n");
    printGeMatrix(DGEMM_MR, DGEMM_NR, C, INCROW_C, INCCOL_C);

    dgemm_micro(DIM_K,
                (double)ALPHA,
                A_panel, B_panel,
                (double)BETA,
                C, INCROW_C, INCCOL_C);
    printf("%lf*C + %lf*A*B =\n", (double)ALPHA, (double)BETA);
    printGeMatrix(DGEMM_MR, DGEMM_NR, C, INCROW_C, INCCOL_C);

    printf("C_ref =\n");
    dgemm_ilj(DGEMM_MR, DGEMM_NR, DIM_K,
              (double)ALPHA,
              A_panel, 1, DGEMM_MR,
              B_panel, DGEMM_NR, 1,
              (double)BETA,
              C_ref, INCROW_C, INCCOL_C);
    printGeMatrix(DGEMM_MR, DGEMM_NR, C_ref, INCROW_C, INCCOL_C);

    return 0;
}