Possible Solution

Set up a Framework for Testing

#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>

// Fill the m x n matrix A with the values 1, 2, ..., m*n in row-major
// order, i.e. the element in row i and column j receives i*n + j + 1.
//
// m, n        number of rows / columns of A
// A           storage of the matrix
// incRowA     distance in elements between vertically adjacent entries
// incColA     distance in elements between horizontally adjacent entries
//
// Element (i, j) is stored at A[i*incRowA + j*incColA].
void
initGeMatrix(size_t m, size_t n,
             double *A,
             ptrdiff_t incRowA, ptrdiff_t incColA)
{
    // The elements are visited row by row, so a running counter yields
    // the same values as the closed form i*n + j + 1.
    size_t value = 1;

    for (size_t row=0; row<m; ++row) {
        for (size_t col=0; col<n; ++col, ++value) {
            A[row*incRowA+col*incColA] = value;
        }
    }
}

// Print the m x n matrix A to stdout: one line per matrix row, every
// element formatted with "%10.4lf " and an empty line after the last row.
//
// Element (i, j) is read from A[i*incRowA + j*incColA].
void
printGeMatrix(size_t m, size_t n,
              const double *A,
              ptrdiff_t incRowA, ptrdiff_t incColA)
{
    for (size_t row=0; row<m; ++row) {
        for (size_t col=0; col<n; ++col) {
            const double entry = A[row*incRowA+col*incColA];

            printf("%10.4lf ", entry);
        }
        // end of the current matrix row
        printf("\n");
    }
    // blank line separating this matrix from subsequent output
    printf("\n");
}

// Reference implementation of the matrix-matrix product
//
//     C <- beta*C + alpha*A*B
//
// with A of size m x k, B of size k x n and C of size m x n.
//
// Element (i, j) of a matrix X is accessed as X[i*incRowX + j*incColX].
//
// beta == 0 overwrites C with zeros instead of multiplying it, beta == 1
// leaves C untouched before the update.  For alpha == 0 the matrices A
// and B are not read at all.
void
dgemm_ref(size_t m, size_t n, size_t k,
          double alpha,
          const double *A, ptrdiff_t incRowA, ptrdiff_t incColA,
          const double *B, ptrdiff_t incRowB, ptrdiff_t incColB,
          double beta,
          double *C, ptrdiff_t incRowC, ptrdiff_t incColC)
{
    // C <- beta*C
    if (beta==0) {
        for (size_t row=0; row<m; ++row) {
            for (size_t col=0; col<n; ++col) {
                C[row*incRowC+col*incColC] = 0;
            }
        }
    } else if (beta!=1) {
        for (size_t row=0; row<m; ++row) {
            for (size_t col=0; col<n; ++col) {
                C[row*incRowC+col*incColC] *= beta;
            }
        }
    }

    // Nothing to add for alpha == 0
    if (alpha==0) {
        return;
    }

    // C <- C + alpha*A*B, one dot product per element of C
    for (size_t row=0; row<m; ++row) {
        for (size_t col=0; col<n; ++col) {
            double *c = &C[row*incRowC+col*incColC];

            for (size_t p=0; p<k; ++p) {
                *c += alpha*A[row*incRowA+p*incColA]
                           *B[p*incRowB+col*incColB];
            }
        }
    }
}


// Number of rows of the test matrices A and C; main() also passes it as
// the column increment of A and C.  The #ifndef guard keeps a definition
// made before this point (e.g. on the compiler command line).
#ifndef DGEMM_MR
#define DGEMM_MR    4
#endif

// Number of columns of the test matrices B and C; main() also passes it
// as the row increment of B.  Guarded like DGEMM_MR.
#ifndef DGEMM_NR
#define DGEMM_NR    5
#endif


// Test driver (framework only).
//
// Sets up a DGEMM_MR x k matrix A (row increment 1, column increment
// DGEMM_MR), a k x DGEMM_NR matrix B (row increment DGEMM_NR, column
// increment 1) and two identically initialized DGEMM_MR x DGEMM_NR
// matrices C0 and C1, applies dgemm_ref to C0 and prints all matrices.
//
// Returns 0 on success and EXIT_FAILURE if A or B could not be allocated.
int
main(void)
{
    size_t      k       = 7;

    double      *A      = malloc(DGEMM_MR*k*sizeof(*A));
    double      *B      = malloc(k*DGEMM_NR*sizeof(*B));

    // malloc can fail; never hand a null pointer to initGeMatrix
    if (!A || !B) {
        fprintf(stderr, "main: out of memory\n");
        free(A);
        free(B);
        return EXIT_FAILURE;
    }

    double      C0[DGEMM_MR*DGEMM_NR];
    double      C1[DGEMM_MR*DGEMM_NR];

    initGeMatrix(DGEMM_MR, k, A, 1, DGEMM_MR);
    initGeMatrix(k, DGEMM_NR, B, DGEMM_NR, 1);
    initGeMatrix(DGEMM_MR, DGEMM_NR, C0, 1, DGEMM_MR);
    initGeMatrix(DGEMM_MR, DGEMM_NR, C1, 1, DGEMM_MR);

    printf("A=\n");
    printGeMatrix(DGEMM_MR, k, A, 1, DGEMM_MR);
    printf("B=\n");
    printGeMatrix(k, DGEMM_NR, B, DGEMM_NR, 1);
    printf("C=\n");
    printGeMatrix(DGEMM_MR, DGEMM_NR, C0, 1, DGEMM_MR);

    // C0 <- beta*C0 + alpha*A*B
    double alpha = 1;
    double beta  = 1;
    dgemm_ref(DGEMM_MR, DGEMM_NR, k,
              alpha,
              A, 1, DGEMM_MR,
              B, DGEMM_NR, 1,
              beta,
              C0, 1, DGEMM_MR);

    printf("gemm_ref computed C=\n");
    printGeMatrix(DGEMM_MR, DGEMM_NR, C0, 1, DGEMM_MR);

    // No micro kernel is called here, so C1 still holds its initial
    // values when it is printed.
    printf("gemm_micro computed C=\n");
    printGeMatrix(DGEMM_MR, DGEMM_NR, C1, 1, DGEMM_MR);

    free(A);
    free(B);
    return 0;
}

Implement and test dgemm_micro

#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>

// Fill the m x n matrix A with the values 1, 2, ..., m*n in row-major
// order, i.e. the element in row i and column j receives i*n + j + 1.
//
// m, n        number of rows / columns of A
// A           storage of the matrix
// incRowA     distance in elements between vertically adjacent entries
// incColA     distance in elements between horizontally adjacent entries
//
// Element (i, j) is stored at A[i*incRowA + j*incColA].
void
initGeMatrix(size_t m, size_t n,
             double *A,
             ptrdiff_t incRowA, ptrdiff_t incColA)
{
    // The elements are visited row by row, so a running counter yields
    // the same values as the closed form i*n + j + 1.
    size_t value = 1;

    for (size_t row=0; row<m; ++row) {
        for (size_t col=0; col<n; ++col, ++value) {
            A[row*incRowA+col*incColA] = value;
        }
    }
}

// Print the m x n matrix A to stdout: one line per matrix row, every
// element formatted with "%10.4lf " and an empty line after the last row.
//
// Element (i, j) is read from A[i*incRowA + j*incColA].
void
printGeMatrix(size_t m, size_t n,
              const double *A,
              ptrdiff_t incRowA, ptrdiff_t incColA)
{
    for (size_t row=0; row<m; ++row) {
        for (size_t col=0; col<n; ++col) {
            const double entry = A[row*incRowA+col*incColA];

            printf("%10.4lf ", entry);
        }
        // end of the current matrix row
        printf("\n");
    }
    // blank line separating this matrix from subsequent output
    printf("\n");
}

// Reference implementation of the matrix-matrix product
//
//     C <- beta*C + alpha*A*B
//
// with A of size m x k, B of size k x n and C of size m x n.
//
// Element (i, j) of a matrix X is accessed as X[i*incRowX + j*incColX].
//
// beta == 0 overwrites C with zeros instead of multiplying it, beta == 1
// leaves C untouched before the update.  For alpha == 0 the matrices A
// and B are not read at all.
void
dgemm_ref(size_t m, size_t n, size_t k,
          double alpha,
          const double *A, ptrdiff_t incRowA, ptrdiff_t incColA,
          const double *B, ptrdiff_t incRowB, ptrdiff_t incColB,
          double beta,
          double *C, ptrdiff_t incRowC, ptrdiff_t incColC)
{
    // C <- beta*C
    if (beta==0) {
        for (size_t row=0; row<m; ++row) {
            for (size_t col=0; col<n; ++col) {
                C[row*incRowC+col*incColC] = 0;
            }
        }
    } else if (beta!=1) {
        for (size_t row=0; row<m; ++row) {
            for (size_t col=0; col<n; ++col) {
                C[row*incRowC+col*incColC] *= beta;
            }
        }
    }

    // Nothing to add for alpha == 0
    if (alpha==0) {
        return;
    }

    // C <- C + alpha*A*B, one dot product per element of C
    for (size_t row=0; row<m; ++row) {
        for (size_t col=0; col<n; ++col) {
            double *c = &C[row*incRowC+col*incColC];

            for (size_t p=0; p<k; ++p) {
                *c += alpha*A[row*incRowA+p*incColA]
                           *B[p*incRowB+col*incColB];
            }
        }
    }
}


// Dimensions of the block of C that dgemm_micro() updates
// (DGEMM_MR rows, DGEMM_NR columns).  The #ifndef guards keep a
// definition made before this point (e.g. on the compiler command line).
#ifndef DGEMM_MR
#define DGEMM_MR    4
#endif

#ifndef DGEMM_NR
#define DGEMM_NR    5
#endif

// Micro kernel: C <- beta*C + alpha*A*B for a DGEMM_MR x DGEMM_NR block C.
//
// k           number of columns of A and rows of B
// alpha       factor for the product A*B; for alpha == 0 the product is
//             skipped and A and B are not read (same rule as dgemm_ref),
//             so NaN/Inf entries in A or B cannot leak into C
// A           DGEMM_MR x k, element (i, l) at A[i + l*DGEMM_MR]
// B           k x DGEMM_NR, element (l, j) at B[l*DGEMM_NR + j]
// beta        factor for C; beta == 0 overwrites C with zeros instead of
//             multiplying it, beta == 1 leaves C untouched before the update
// C           element (i, j) at C[i*incRowC + j*incColC]
void
dgemm_micro(size_t k, double alpha,
            const double *A, const double *B,
            double beta,
            double *C, ptrdiff_t incRowC, ptrdiff_t incColC)
{
    const int   addProduct = (alpha!=0);

    // Local accumulator for A*B, element (i, j) at AB[i + j*DGEMM_MR]
    double AB[DGEMM_MR*DGEMM_NR] = {0};

    // AB <- A*B
    if (addProduct) {
        for (size_t l=0; l<k; ++l) {
            for (size_t i=0; i<DGEMM_MR; ++i) {
                for (size_t j=0; j<DGEMM_NR; ++j) {
                    AB[i+j*DGEMM_MR] += A[i+l*DGEMM_MR]*B[l*DGEMM_NR+j];
                }
            }
        }
    }
    // C <- beta*C
    if (beta!=1) {
        if (beta!=0) {
            for (size_t i=0; i<DGEMM_MR; ++i) {
                for (size_t j=0; j<DGEMM_NR; ++j) {
                    C[i*incRowC+j*incColC] *= beta;
                }
            }
        } else {
            for (size_t i=0; i<DGEMM_MR; ++i) {
                for (size_t j=0; j<DGEMM_NR; ++j) {
                    C[i*incRowC+j*incColC] = 0;
                }
            }
        }
    }
    // C <- C + alpha*AB
    if (addProduct) {
        for (size_t i=0; i<DGEMM_MR; ++i) {
            for (size_t j=0; j<DGEMM_NR; ++j) {
                C[i*incRowC+j*incColC] += alpha*AB[i+j*DGEMM_MR];
            }
        }
    }
}


// Test driver for the micro kernel.
//
// Sets up a DGEMM_MR x k matrix A (row increment 1, column increment
// DGEMM_MR), a k x DGEMM_NR matrix B (row increment DGEMM_NR, column
// increment 1) and two identically initialized DGEMM_MR x DGEMM_NR
// matrices C0 and C1.  C0 is updated by dgemm_ref, C1 by dgemm_micro, and
// both results are printed so they can be compared.
//
// Returns 0 on success and EXIT_FAILURE if A or B could not be allocated.
int
main(void)
{
    size_t      k       = 7;

    double      *A      = malloc(DGEMM_MR*k*sizeof(*A));
    double      *B      = malloc(k*DGEMM_NR*sizeof(*B));

    // malloc can fail; never hand a null pointer to initGeMatrix
    if (!A || !B) {
        fprintf(stderr, "main: out of memory\n");
        free(A);
        free(B);
        return EXIT_FAILURE;
    }

    double      C0[DGEMM_MR*DGEMM_NR];
    double      C1[DGEMM_MR*DGEMM_NR];

    initGeMatrix(DGEMM_MR, k, A, 1, DGEMM_MR);
    initGeMatrix(k, DGEMM_NR, B, DGEMM_NR, 1);
    initGeMatrix(DGEMM_MR, DGEMM_NR, C0, 1, DGEMM_MR);
    initGeMatrix(DGEMM_MR, DGEMM_NR, C1, 1, DGEMM_MR);

    printf("A=\n");
    printGeMatrix(DGEMM_MR, k, A, 1, DGEMM_MR);
    printf("B=\n");
    printGeMatrix(k, DGEMM_NR, B, DGEMM_NR, 1);
    printf("C=\n");
    printGeMatrix(DGEMM_MR, DGEMM_NR, C0, 1, DGEMM_MR);

    double alpha = 1;
    double beta  = 1;

    // Reference result: C0 <- beta*C0 + alpha*A*B
    dgemm_ref(DGEMM_MR, DGEMM_NR, k,
              alpha,
              A, 1, DGEMM_MR,
              B, DGEMM_NR, 1,
              beta,
              C0, 1, DGEMM_MR);

    // Same operation on C1 through the micro kernel
    dgemm_micro(k, alpha, A, B, beta, C1, 1, DGEMM_MR);


    printf("gemm_ref computed C=\n");
    printGeMatrix(DGEMM_MR, DGEMM_NR, C0, 1, DGEMM_MR);

    printf("gemm_micro computed C=\n");
    printGeMatrix(DGEMM_MR, DGEMM_NR, C1, 1, DGEMM_MR);

    free(A);
    free(B);
    return 0;
}