#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>
/*
 * Fill the m x n matrix A with the values 1, 2, ..., m*n, numbered row by
 * row: element (i, j) receives i*n + j + 1.
 *
 * Element (i, j) lives at A[i*incRowA + j*incColA].  The offset is computed
 * in ptrdiff_t so that a negative increment yields a genuinely negative
 * offset instead of a wrapped-around unsigned value.
 */
void
initGeMatrix(size_t m, size_t n,
             double *A,
             ptrdiff_t incRowA, ptrdiff_t incColA)
{
    for (size_t i = 0; i < m; ++i) {
        for (size_t j = 0; j < n; ++j) {
            ptrdiff_t offset = (ptrdiff_t)i * incRowA
                             + (ptrdiff_t)j * incColA;

            A[offset] = (double)(i * n + j + 1);
        }
    }
}
/*
 * Print the m x n matrix A to stdout, one matrix row per output line,
 * followed by an empty line.
 *
 * Element (i, j) is read from A[i*incRowA + j*incColA].  The offset is
 * computed in ptrdiff_t so that negative increments produce a negative
 * offset rather than relying on unsigned wraparound.
 */
void
printGeMatrix(size_t m, size_t n,
              const double *A,
              ptrdiff_t incRowA, ptrdiff_t incColA)
{
    for (size_t i = 0; i < m; ++i) {
        for (size_t j = 0; j < n; ++j) {
            ptrdiff_t offset = (ptrdiff_t)i * incRowA
                             + (ptrdiff_t)j * incColA;

            printf("%10.4lf ", A[offset]);
        }
        printf("\n");
    }
    printf("\n");
}
/*
 * Reference matrix-matrix product:  C <- beta*C + alpha*A*B
 *
 *   A is m x k, element (i, l) at A[i*incRowA + l*incColA]
 *   B is k x n, element (l, j) at B[l*incRowB + j*incColB]
 *   C is m x n, element (i, j) at C[i*incRowC + j*incColC]
 *
 * beta == 0 overwrites C with zeros instead of multiplying, so the previous
 * contents of C (including NaN/Inf) never reach the result.
 * alpha == 0 skips the product entirely; A and B are not read in that case.
 *
 * All offsets are computed in ptrdiff_t so that negative increments give
 * negative offsets instead of wrapped-around unsigned values.
 */
void
dgemm_ref(size_t m, size_t n, size_t k,
          double alpha,
          const double *A, ptrdiff_t incRowA, ptrdiff_t incColA,
          const double *B, ptrdiff_t incRowB, ptrdiff_t incColB,
          double beta,
          double *C, ptrdiff_t incRowC, ptrdiff_t incColC)
{
    /* Step 1: C <- beta*C (nothing to do for beta == 1). */
    if (beta == 0) {
        for (size_t i = 0; i < m; ++i) {
            for (size_t j = 0; j < n; ++j) {
                C[(ptrdiff_t)i * incRowC + (ptrdiff_t)j * incColC] = 0;
            }
        }
    } else if (beta != 1) {
        for (size_t i = 0; i < m; ++i) {
            for (size_t j = 0; j < n; ++j) {
                C[(ptrdiff_t)i * incRowC + (ptrdiff_t)j * incColC] *= beta;
            }
        }
    }

    /* Step 2: C <- C + alpha*A*B. */
    if (alpha == 0) {
        return;
    }
    for (size_t i = 0; i < m; ++i) {
        for (size_t j = 0; j < n; ++j) {
            double *c = &C[(ptrdiff_t)i * incRowC + (ptrdiff_t)j * incColC];

            for (size_t l = 0; l < k; ++l) {
                ptrdiff_t offA = (ptrdiff_t)i * incRowA
                               + (ptrdiff_t)l * incColA;
                ptrdiff_t offB = (ptrdiff_t)l * incRowB
                               + (ptrdiff_t)j * incColB;

                *c += alpha * A[offA] * B[offB];
            }
        }
    }
}
/*
 * Dimensions of the micro-tile handled by dgemm_micro.  Both can be
 * overridden at compile time (e.g. -DDGEMM_MR=8).
 */
#ifndef DGEMM_MR
#define DGEMM_MR 4
#endif
#ifndef DGEMM_NR
#define DGEMM_NR 5
#endif

/*
 * Micro kernel:  C <- beta*C + alpha*A*B  for one DGEMM_MR x DGEMM_NR tile.
 *
 *   A holds a DGEMM_MR x k panel, element (i, l) at A[i + l*DGEMM_MR]
 *   B holds a k x DGEMM_NR panel, element (l, j) at B[l*DGEMM_NR + j]
 *   C is DGEMM_MR x DGEMM_NR,     element (i, j) at C[i*incRowC + j*incColC]
 *
 * beta == 0 overwrites C with zeros instead of multiplying, so previous
 * contents of C (including NaN/Inf) never reach the result.
 * alpha == 0 skips the product, matching dgemm_ref: A and B are not read,
 * so NaN/Inf in the panels cannot leak into C through 0*AB.
 *
 * Offsets into C are computed in ptrdiff_t so that negative increments give
 * negative offsets instead of wrapped-around unsigned values.
 */
void
dgemm_micro(size_t k, double alpha,
            const double *A, const double *B,
            double beta,
            double *C, ptrdiff_t incRowC, ptrdiff_t incColC)
{
    /* Step 1: C <- beta*C (nothing to do for beta == 1). */
    if (beta == 0) {
        for (size_t i = 0; i < DGEMM_MR; ++i) {
            for (size_t j = 0; j < DGEMM_NR; ++j) {
                C[(ptrdiff_t)i * incRowC + (ptrdiff_t)j * incColC] = 0;
            }
        }
    } else if (beta != 1) {
        for (size_t i = 0; i < DGEMM_MR; ++i) {
            for (size_t j = 0; j < DGEMM_NR; ++j) {
                C[(ptrdiff_t)i * incRowC + (ptrdiff_t)j * incColC] *= beta;
            }
        }
    }

    if (alpha == 0) {
        return;
    }

    /* Step 2: accumulate A*B in a local column-major buffer. */
    double AB[DGEMM_MR*DGEMM_NR] = {0};

    for (size_t l = 0; l < k; ++l) {
        for (size_t i = 0; i < DGEMM_MR; ++i) {
            for (size_t j = 0; j < DGEMM_NR; ++j) {
                AB[i + j*DGEMM_MR] += A[i + l*DGEMM_MR] * B[l*DGEMM_NR + j];
            }
        }
    }

    /* Step 3: C <- C + alpha*AB. */
    for (size_t i = 0; i < DGEMM_MR; ++i) {
        for (size_t j = 0; j < DGEMM_NR; ++j) {
            C[(ptrdiff_t)i * incRowC + (ptrdiff_t)j * incColC]
                += alpha * AB[i + j*DGEMM_MR];
        }
    }
}
/*
 * Demo driver: runs dgemm_ref and dgemm_micro on the same operands with
 * alpha = beta = 1 and prints both results so they can be compared.
 *
 * A is DGEMM_MR x k, stored column-major (incRow 1, incCol DGEMM_MR);
 * B is k x DGEMM_NR, stored row-major (incRow DGEMM_NR, incCol 1);
 * C0 and C1 are DGEMM_MR x DGEMM_NR, stored column-major.
 *
 * Returns EXIT_FAILURE if the operand buffers cannot be allocated,
 * EXIT_SUCCESS otherwise.
 */
int
main(void)
{
    size_t k = 7;

    double *A = malloc(DGEMM_MR*k*sizeof(*A));
    double *B = malloc(k*DGEMM_NR*sizeof(*B));

    /* Bail out before the buffers are written; free(NULL) is a no-op. */
    if (A == NULL || B == NULL) {
        fprintf(stderr, "out of memory\n");
        free(A);
        free(B);
        return EXIT_FAILURE;
    }

    double C0[DGEMM_MR*DGEMM_NR];
    double C1[DGEMM_MR*DGEMM_NR];

    initGeMatrix(DGEMM_MR, k, A, 1, DGEMM_MR);
    initGeMatrix(k, DGEMM_NR, B, DGEMM_NR, 1);
    /* Both result buffers start from identical contents. */
    initGeMatrix(DGEMM_MR, DGEMM_NR, C0, 1, DGEMM_MR);
    initGeMatrix(DGEMM_MR, DGEMM_NR, C1, 1, DGEMM_MR);

    printf("A=\n");
    printGeMatrix(DGEMM_MR, k, A, 1, DGEMM_MR);
    printf("B=\n");
    printGeMatrix(k, DGEMM_NR, B, DGEMM_NR, 1);
    printf("C=\n");
    printGeMatrix(DGEMM_MR, DGEMM_NR, C0, 1, DGEMM_MR);

    double alpha = 1;
    double beta = 1;

    dgemm_ref(DGEMM_MR, DGEMM_NR, k,
              alpha,
              A, 1, DGEMM_MR,
              B, DGEMM_NR, 1,
              beta,
              C0, 1, DGEMM_MR);
    dgemm_micro(k, alpha, A, B, beta, C1, 1, DGEMM_MR);

    printf("gemm_ref computed C=\n");
    printGeMatrix(DGEMM_MR, DGEMM_NR, C0, 1, DGEMM_MR);
    printf("gemm_micro computed C=\n");
    printGeMatrix(DGEMM_MR, DGEMM_NR, C1, 1, DGEMM_MR);

    free(A);
    free(B);
    return EXIT_SUCCESS;
}