#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>
void
initGeMatrix(size_t m, size_t n,
double *A,
ptrdiff_t incRowA, ptrdiff_t incColA)
{
for (size_t i=0; i<m; ++i) {
for (size_t j=0; j<n; ++j) {
A[i*incRowA+j*incColA] = i*n + j + 1;
}
}
}
void
printGeMatrix(size_t m, size_t n,
const double *A,
ptrdiff_t incRowA, ptrdiff_t incColA)
{
for (size_t i=0; i<m; ++i) {
for (size_t j=0; j<n; ++j) {
printf("%10.4lf ", A[i*incRowA+j*incColA]);
}
printf("\n");
}
printf("\n");
}
void
dgemm_ref(size_t m, size_t n, size_t k,
double alpha,
const double *A, ptrdiff_t incRowA, ptrdiff_t incColA,
const double *B, ptrdiff_t incRowB, ptrdiff_t incColB,
double beta,
double *C, ptrdiff_t incRowC, ptrdiff_t incColC)
{
size_t i, j, l;
if (beta!=1) {
if (beta!=0) {
for (i=0; i<m; ++i) {
for (j=0; j<n; ++j) {
C[i*incRowC+j*incColC] *= beta;
}
}
} else {
for (i=0; i<m; ++i) {
for (j=0; j<n; ++j) {
C[i*incRowC+j*incColC] = 0;
}
}
}
}
if (alpha!=0) {
for (i=0; i<m; ++i) {
for (j=0; j<n; ++j) {
for (l=0; l<k; ++l) {
C[i*incRowC+j*incColC] += alpha*A[i*incRowA+l*incColA]
*B[l*incRowB+j*incColB];
}
}
}
}
}
#ifndef DGEMM_MR
#define DGEMM_MR 4
#endif
#ifndef DGEMM_NR
#define DGEMM_NR 5
#endif
#ifndef DGEMM_MC
#define DGEMM_MC 8
#endif
#ifndef DGEMM_NC
#define DGEMM_NC 10
#endif
#ifndef DGEMM_KC
#define DGEMM_KC 9
#endif
void
dgepack_A(size_t m, size_t k,
const double *A, ptrdiff_t incRowA, ptrdiff_t incColA,
double *p)
{
size_t mb = (m+DGEMM_MR-1)/DGEMM_MR;
for (size_t l=0; l<k; ++l) {
for (size_t i1=0; i1<mb; ++i1) {
for (size_t i0=0; i0<DGEMM_MR; ++i0) {
size_t i = i1*DGEMM_MR + i0;
size_t nu = i1*DGEMM_MR*k + l*DGEMM_MR + i0;
p[nu] = (i<m) ? A[i*incRowA + l*incColA]
: 0;
}
}
}
}
void
dgepack_B(size_t k, size_t n,
const double *B, ptrdiff_t incRowB, ptrdiff_t incColB,
double *p)
{
size_t nb = (n+DGEMM_NR-1)/DGEMM_NR;
for (size_t j1=0; j1<nb; ++j1) {
for (size_t j0=0; j0<DGEMM_NR; ++j0) {
for (size_t l=0; l<k; ++l) {
size_t j = j1*DGEMM_NR + j0;
size_t nu = j1*DGEMM_NR*k + l*DGEMM_NR + j0;
p[nu] = (j<n) ? B[l*incRowB + j*incColB]
: 0;
}
}
}
}
void
dgemm_micro(size_t k, double alpha,
const double *A, const double *B,
double beta,
double *C, ptrdiff_t incRowC, ptrdiff_t incColC)
{
double AB[DGEMM_MR*DGEMM_NR];
for (size_t i=0; i<DGEMM_MR*DGEMM_NR; ++i) {
AB[i] = 0;
}
for (size_t l=0; l<k; ++l) {
for (size_t i=0; i<DGEMM_MR; ++i) {
for (size_t j=0; j<DGEMM_NR; ++j) {
AB[i+j*DGEMM_MR] += A[i+l*DGEMM_MR]*B[l*DGEMM_NR+j];
}
}
}
if (beta!=1) {
if (beta!=0) {
for (size_t i=0; i<DGEMM_MR; ++i) {
for (size_t j=0; j<DGEMM_NR; ++j) {
C[i*incRowC+j*incColC] *= beta;
}
}
} else {
for (size_t i=0; i<DGEMM_MR; ++i) {
for (size_t j=0; j<DGEMM_NR; ++j) {
C[i*incRowC+j*incColC] = 0;
}
}
}
}
for (size_t i=0; i<DGEMM_MR; ++i) {
for (size_t j=0; j<DGEMM_NR; ++j) {
C[i*incRowC+j*incColC] += alpha*AB[i+j*DGEMM_MR];
}
}
}
void
dgescal(size_t m, size_t n,
double alpha,
double *X, size_t incRowX, size_t incColX)
{
if (alpha==1) {
return;
}
if (alpha!=0) {
for (size_t i=0; i<m; ++i) {
for (size_t j=0; j<n; ++j) {
X[i*incRowX+j*incColX] *= alpha;
}
}
} else {
for (size_t i=0; i<m; ++i) {
for (size_t j=0; j<n; ++j) {
X[i*incRowX+j*incColX] = 0;
}
}
}
}
void
dgeaxpy(size_t m, size_t n,
double alpha,
const double *X, size_t incRowX, size_t incColX,
double *Y, size_t incRowY, size_t incColY)
{
if (alpha==0) {
return;
}
for (size_t i=0; i<m; ++i) {
for (size_t j=0; j<n; ++j) {
Y[i*incRowY+j*incColY] += alpha*X[i*incRowX+j*incColX];
}
}
}
void
dgemm_macro(size_t m, size_t n, size_t k, double alpha,
const double *A, const double *B,
double beta,
double *C, ptrdiff_t incRowC, ptrdiff_t incColC)
{
double AB[DGEMM_MR*DGEMM_NR];
size_t mb = (m+DGEMM_MR-1) / DGEMM_MR;
size_t nb = (n+DGEMM_NR-1) / DGEMM_NR;
size_t mr = m % DGEMM_MR;
size_t nr = n % DGEMM_NR;
for (size_t i=0; i<mb; ++i) {
size_t m_ = (i<mb-1 || mr==0) ? DGEMM_MR
: mr;
for (size_t j=0; j<nb; ++j) {
size_t n_ = (j<nb-1 || nr==0) ? DGEMM_NR
: nr;
if (m_==DGEMM_MR && n_==DGEMM_NR) {
dgemm_micro(k, alpha,
&A[i*DGEMM_MR*k], &B[j*k*DGEMM_NR],
beta,
&C[i*DGEMM_MR*incRowC+j*DGEMM_NR*incColC],
incRowC, incColC);
} else {
dgemm_micro(k, alpha,
&A[i*DGEMM_MR*k], &B[j*k*DGEMM_NR],
0,
AB, 1, DGEMM_MR);
dgescal(m_, n_,
beta,
&C[i*DGEMM_MR*incRowC+j*DGEMM_NR*incColC],
incRowC, incColC);
dgeaxpy(m_, n_,
1,
AB, 1, DGEMM_MR,
&C[i*DGEMM_MR*incRowC+j*DGEMM_NR*incColC],
incRowC, incColC);
}
}
}
}
int
main()
{
size_t m = DGEMM_MC - 1;
size_t n = DGEMM_NC - 2;
size_t k = DGEMM_KC - 3;
ptrdiff_t incRowA = 1;
ptrdiff_t incColA = m;
ptrdiff_t incRowB = 1;
ptrdiff_t incColB = k;
ptrdiff_t incRowC = 1;
ptrdiff_t incColC = m;
double *A = malloc(m*k*sizeof(*A));
double *B = malloc(k*n*sizeof(*B));
double C0[m*n];
double C1[m*n];
initGeMatrix(m, k, A, incRowA, incColA);
initGeMatrix(k, n, B, incRowB, incColB);
initGeMatrix(m, n, C0, incRowC, incColC);
initGeMatrix(m, n, C1, incRowC, incColC);
printf("A=\n");
printGeMatrix(m, k, A, incRowA, incColA);
printf("B=\n");
printGeMatrix(k, n, B, incRowB, incColB);
printf("C=\n");
printGeMatrix(m, n, C0, incRowC, incColC);
double alpha = 1;
double beta = 1;
dgemm_ref(m, n, k,
alpha,
A, incRowA, incColA,
B, incRowB, incColB,
beta,
C0, incRowC, incColC);
double *Ap = malloc(DGEMM_MC*DGEMM_KC*sizeof(*Ap));
double *Bp = malloc(DGEMM_KC*DGEMM_NC*sizeof(*Bp));
dgepack_A(m, k, A, incRowA, incColA, Ap);
dgepack_B(k, n, B, incRowB, incColB, Bp);
dgemm_macro(m, n, k, alpha, Ap, Bp, beta, C1, incRowC, incColC);
free(Bp);
free(Ap);
printf("gemm_ref computed C=\n");
printGeMatrix(m, n, C0, incRowC, incColC);
printf("gemm_macro computed C=\n");
printGeMatrix(m, n, C1, incRowC, incColC);
free(A);
free(B);
}