#include <stdio.h>
#include <math.h>
#include <stdlib.h>
#include <sys/times.h>
#include <unistd.h>
void
initGeMatrix(int m, int n, double *A, int incRowA, int incColA)
{
int i, j;
for (i=0; i<m; ++i) {
for (j=0; j<n; ++j) {
A[i*incRowA+j*incColA] = i*n + j + 1;
}
}
}
void
printGeMatrix(int m, int n, const double *A, int incRowA, int incColA)
{
int i, j;
for (i=0; i<m; ++i) {
for (j=0; j<n; ++j) {
printf("%10.4lf ", A[i*incRowA+j*incColA]);
}
printf("\n");
}
printf("\n\n");
}
void
dgecopy(int m, int n,
const double *X, int incRowX, int incColX,
double *Y, int incRowY, int incColY)
{
int i, j;
for (i=0; i<m; ++i) {
for (j=0; j<n; ++j) {
Y[i*incRowY+j*incColY] = X[i*incRowX+j*incColX];
}
}
}
void
dgescal(int m, int n, double alpha,
double *X, int incRowX, int incColX)
{
int i, j;
for (j=0; j<n; ++j) {
for (i=0; i<m; ++i) {
X[i*incRowX+j*incColX] *= alpha;
}
}
}
void
dgeaxpy(int m, int n, double alpha,
const double *X, int incRowX, int incColX,
double *Y, int incRowY, int incColY)
{
int i, j;
if (alpha==0 || m==0 || n==0) {
return;
}
for (j=0; j<n; ++j) {
for (i=0; i<m; ++i) {
Y[i*incRowY+j*incColY] += alpha*X[i*incRowX+j*incColX];
}
}
}
void
dgemm_ilj(int m, int n, int k,
double alpha,
const double *A, int incRowA, int incColA,
const double *B, int incRowB, int incColB,
double beta,
double *C, int incRowC, int incColC)
{
int i, j, l;
for (i=0; i<m; ++i) {
for (l=0; l<k; ++l) {
for (j=0; j<n; ++j) {
if (l==0) {
C[i*incRowC+j*incColC] *= beta;
}
C[i*incRowC+j*incColC] += alpha*A[i*incRowA+l*incColA]
*B[l*incRowB+j*incColB];
}
}
}
}
#ifndef DGEMM_MC
#define DGEMM_MC 8
#endif
#ifndef DGEMM_NC
#define DGEMM_NC 9
#endif
#ifndef DGEMM_KC
#define DGEMM_KC 10
#endif
#ifndef DGEMM_MR
#define DGEMM_MR 4
#endif
#ifndef DGEMM_NR
#define DGEMM_NR 3
#endif
void
dpack_A(int m, int k, const double *A, int incRowA, int incColA, double *p)
{
int i, i0, j, l, nu;
int mp = (m+DGEMM_MR-1) / DGEMM_MR;
for (j=0; j<k; ++j) {
for (l=0; l<mp; ++l) {
for (i0=0; i0<DGEMM_MR; ++i0) {
i = l*DGEMM_MR + i0;
nu = l*DGEMM_MR*k + j*DGEMM_MR + i0;
p[nu] = (i<m) ? A[i*incRowA+j*incColA]
: 0;
}
}
}
}
void
dpack_B(int k, int n, const double *B, int incRowB, int incColB, double *p)
{
int i, j, j0, l, nu;
int np = (n+DGEMM_NR-1) / DGEMM_NR;
for (l=0; l<np; ++l) {
for (j0=0; j0<DGEMM_NR; ++j0) {
j = l*DGEMM_NR + j0;
for (i=0; i<k; ++i) {
nu = l*DGEMM_NR*k + i*DGEMM_NR + j0;
p[nu] = (j<n) ? B[i*incRowB+j*incColB]
: 0;
}
}
}
}
void
dgemm_micro(int k,
double alpha,
const double *A,
const double *B,
double beta,
double *C, int incRowC, int incColC)
{
int i, j, l;
double AB[DGEMM_MR*DGEMM_NR];
for (l=0; l<DGEMM_MR*DGEMM_NR; ++l) {
AB[l] = 0;
}
for (l=0; l<k; ++l) {
for (j=0; j<DGEMM_NR; ++j) {
for (i=0; i<DGEMM_MR; ++i) {
AB[i+j*DGEMM_MR] += A[i+l*DGEMM_MR]*B[j+l*DGEMM_NR];
}
}
}
if (beta==0) {
for (j=0; j<DGEMM_NR; ++j) {
for (i=0; i<DGEMM_MR; ++i) {
C[i*incRowC+j*incColC] = alpha*AB[i+j*DGEMM_MR];
}
}
} else {
for (j=0; j<DGEMM_NR; ++j) {
for (i=0; i<DGEMM_MR; ++i) {
C[i*incRowC+j*incColC] = beta*C[i*incRowC+j*incColC]
+ alpha*AB[i+j*DGEMM_MR];
}
}
}
}
void
dgemm_macro(int m, int n, int k,
double alpha,
const double *A,
const double *B,
double beta,
double *C, int incRowC, int incColC)
{
double C_[DGEMM_MR*DGEMM_NR];
int i, j;
const int MR = DGEMM_MR;
const int NR = DGEMM_NR;
for (j=0; j<n; j+=NR) {
int nr = (j+NR<n) ? NR
: n - j;
for (i=0; i<m; i+=MR) {
int mr = (i+MR<m) ? MR
: m - i;
if (mr==MR && nr==NR) {
dgemm_micro(k, alpha,
&A[i*k], &B[j*k],
beta,
&C[i*incRowC+j*incColC], incRowC, incColC);
} else {
dgemm_micro(k, alpha,
&A[i*k], &B[j*k],
0.,
C_, 1, MR);
dgescal(mr, nr, beta,
&C[i*incRowC+j*incColC], incRowC, incColC);
dgeaxpy(mr, nr, 1., C_, 1, MR,
&C[i*incRowC+j*incColC], incRowC, incColC);
}
}
}
}
#ifndef DIM_M
#define DIM_M 8
#endif
#ifndef DIM_N
#define DIM_N 7
#endif
#ifndef DIM_K
#define DIM_K 12
#endif
#ifndef ALPHA
#define ALPHA 2
#endif
#ifndef BETA
#define BETA 2
#endif
#ifndef ROWMAJOR_A
#define INCROW_A 1
#define INCCOL_A DIM_M
#else
#define INCROW_A DIM_K
#define INCCOL_A 1
#endif
#ifndef ROWMAJOR_B
#define INCROW_B 1
#define INCCOL_B DIM_K
#else
#define INCROW_B DIM_N
#define INCCOL_B 1
#endif
#ifndef ROWMAJOR_C
#define INCROW_C 1
#define INCCOL_C DIM_M
#else
#define INCROW_C DIM_N
#define INCCOL_C 1
#endif
double A[DIM_M*DIM_K];
double B[DIM_K*DIM_N];
double A_[DGEMM_MC*DGEMM_KC];
double B_[DGEMM_KC*DGEMM_NC];
double C[DIM_M*DIM_N];
double C_ref[DIM_M*DIM_N];
#define MIN(X,Y) (X) < (Y) ? (X) : (Y)
int
main()
{
initGeMatrix(DIM_M, DIM_K, A, INCROW_A, INCCOL_A);
initGeMatrix(DIM_K, DIM_N, B, INCROW_B, INCCOL_B);
initGeMatrix(DIM_M, DIM_N, C, INCROW_C, INCCOL_C);
dgecopy(DIM_M, DIM_N,
C, INCROW_C, INCCOL_C,
C_ref, INCROW_C, INCCOL_C);
dpack_A(MIN(DIM_M, DGEMM_MC), MIN(DIM_K, DGEMM_KC),
A, INCROW_A, INCCOL_A, A_);
dpack_B(MIN(DIM_K, DGEMM_KC), MIN(DIM_N, DGEMM_NC),
B, INCROW_B, INCCOL_B, B_);
printf("A =\n");
printGeMatrix(DIM_M, DIM_K, A, INCROW_A, INCCOL_A);
printf("B =\n");
printGeMatrix(DIM_K, DIM_N, B, INCROW_B, INCCOL_B);
printf("C =\n");
printGeMatrix(DIM_M, DIM_N, C, INCROW_C, INCCOL_C);
dpack_A(MIN(DIM_M, DGEMM_MC), MIN(DIM_K, DGEMM_KC),
A, INCROW_A, INCCOL_A, A_);
dpack_B(MIN(DIM_K, DGEMM_KC), MIN(DIM_N, DGEMM_NC),
B, INCROW_B, INCCOL_B, B_);
dgemm_macro(MIN(DIM_M, DGEMM_MC),
MIN(DIM_N, DGEMM_NC),
MIN(DIM_K, DGEMM_KC),
(double)ALPHA,
A_, B_,
(double)BETA,
C, INCROW_C, INCCOL_C);
printf("%lf*C + %lf*A*B =\n", (double)ALPHA, (double)BETA);
printGeMatrix(DIM_M, DIM_N, C, INCROW_C, INCCOL_C);
printf("C_ref =\n");
dgemm_ilj(MIN(DIM_M, DGEMM_MC),
MIN(DIM_N, DGEMM_NC),
MIN(DIM_K, DGEMM_KC),
(double)ALPHA,
A, INCROW_A, INCCOL_A,
B, INCROW_B, INCCOL_B,
(double)BETA,
C_ref, INCROW_C, INCCOL_C);
printGeMatrix(DIM_M, DIM_N, C_ref, INCROW_C, INCCOL_C);
return 0;
}