#include <stdio.h>
#include <math.h>
#include <stdlib.h>
#include <sys/times.h>
#include <unistd.h>
#include <stdint.h>
double
walltime()
{
struct tms ts;
static double ClockTick=0.0;
if (ClockTick==0.0) {
ClockTick = 1.0 / ((double) sysconf(_SC_CLK_TCK));
}
return ((double) times(&ts)) * ClockTick;
}
void
initGeMatrix(int m, int n, double *A, int incRowA, int incColA)
{
int i, j;
for (i=0; i<m; ++i) {
for (j=0; j<n; ++j) {
A[i*incRowA+j*incColA] = i*n + j + 1;
}
}
}
void
randGeMatrix(int m, int n, double *A, int incRowA, int incColA)
{
int i, j;
for (j=0; j<n; ++j) {
for (i=0; i<m; ++i) {
A[i*incRowA+j*incColA] = ((double)rand()-RAND_MAX/2)*200/RAND_MAX;
}
}
}
void
printGeMatrix(int m, int n, const double *A, int incRowA, int incColA)
{
int i, j;
for (i=0; i<m; ++i) {
for (j=0; j<n; ++j) {
printf("%10.4lf ", A[i*incRowA+j*incColA]);
}
printf("\n");
}
printf("\n\n");
}
void
dgemm_ref(int m, int n, int k,
double alpha,
const double *A, int incRowA, int incColA,
const double *B, int incRowB, int incColB,
double beta,
double *C, int incRowC, int incColC)
{
int i, j, l;
if (beta!=1) {
for (i=0; i<m; ++i) {
for (j=0; j<n; ++j) {
if (beta!=0) {
C[i*incRowC+j*incColC] *= beta;
} else {
C[i*incRowC+j*incColC] = 0;
}
}
}
}
if (alpha!=0) {
for (i=0; i<m; ++i) {
for (j=0; j<n; ++j) {
for (l=0; l<k; ++l) {
C[i*incRowC+j*incColC] += alpha*A[i*incRowA+l*incColA]
*B[l*incRowB+j*incColB];
}
}
}
}
}
double
dgenrm1(int m, int n, const double *A, int incRowA, int incColA)
{
int i, j;
double result = 0;
for (j=0; j<n; ++j) {
double sum = 0;
for (i=0; i<m; ++i) {
sum += fabs(A[i*incRowA+j*incColA]);
}
if (sum>result) {
result = sum;
}
}
return result;
}
void
dgecopy(int m, int n,
const double *X, int incRowX, int incColX,
double *Y, int incRowY, int incColY)
{
int i, j;
for (i=0; i<m; ++i) {
for (j=0; j<n; ++j) {
Y[i*incRowY+j*incColY] = X[i*incRowX+j*incColX];
}
}
}
void
dgescal(int m, int n, double alpha,
double *X, int incRowX, int incColX)
{
int i, j;
for (j=0; j<n; ++j) {
for (i=0; i<m; ++i) {
X[i*incRowX+j*incColX] *= alpha;
}
}
}
void
dgeaxpy(int m, int n, double alpha,
const double *X, int incRowX, int incColX,
double *Y, int incRowY, int incColY)
{
int i, j;
if (alpha==0 || m==0 || n==0) {
return;
}
for (j=0; j<n; ++j) {
for (i=0; i<m; ++i) {
Y[i*incRowY+j*incColY] += alpha*X[i*incRowX+j*incColX];
}
}
}
#define MIN(X,Y) ((X)<(Y) ? (X) : (Y))
#define MAX(X,Y) ((X)>(Y) ? (X) : (Y))
double
err_dgemm(int m, int n, int k,
double alpha,
const double *A, int incRowA, int incColA,
const double *B, int incRowB, int incColB,
double beta,
const double *C0, int incRowC0, int incColC0,
double *C, int incRowC, int incColC)
{
double normA = dgenrm1(m, k, A, incRowA, incColA);
double normB = dgenrm1(k, n, B, incRowB, incColB);
double normC = dgenrm1(m, n, C, incRowC0, incColC0);
double normD;
int mn = (m>n) ? m : n;
int mnk = (mn>k) ? mn : k;
normA = MAX(normA, fabs(alpha)*normA);
normC = MAX(normC, fabs(beta)*normC);
dgeaxpy(m, n, -1.0, C0, incRowC0, incColC0, C, incRowC, incColC);
normD = dgenrm1(m, n, C, incRowC, incColC);
return normD/(mnk*normA*normB*normC);
}
void *
malloc_(size_t alignment, size_t size)
{
size += alignment;
void *ptr = malloc(size);
void *ptr2 = (void *)(((size_t)ptr + alignment) & ~(alignment-1));
void **vp = (void**) ptr2 - 1;
*vp = ptr;
return ptr2;
}
void
free_(void *ptr)
{
free(*((void**)ptr-1));
}
#define DGEMM_MC 256
#define DGEMM_NC 512
#define DGEMM_KC 256
#define DGEMM_MR 4
#define DGEMM_NR 8
void
dpack_A(int m, int k, const double *A, int incRowA, int incColA, double *p)
{
int i, i0, j, l, nu;
int mp = (m+DGEMM_MR-1) / DGEMM_MR;
for (j=0; j<k; ++j) {
for (l=0; l<mp; ++l) {
for (i0=0; i0<DGEMM_MR; ++i0) {
i = l*DGEMM_MR + i0;
nu = l*DGEMM_MR*k + j*DGEMM_MR + i0;
p[nu] = (i<m) ? A[i*incRowA+j*incColA]
: 0;
}
}
}
}
void
dpack_B(int k, int n, const double *B, int incRowB, int incColB, double *p)
{
int i, j, j0, l, nu;
int np = (n+DGEMM_NR-1) / DGEMM_NR;
for (l=0; l<np; ++l) {
for (j0=0; j0<DGEMM_NR; ++j0) {
j = l*DGEMM_NR + j0;
for (i=0; i<k; ++i) {
nu = l*DGEMM_NR*k + i*DGEMM_NR + j0;
p[nu] = (j<n) ? B[i*incRowB+j*incColB]
: 0;
}
}
}
}
void
ugemm_4_8(int64_t k,
double alpha,
const double *A,
const double *B,
double beta,
double *C, int64_t incRowC, int64_t incColC);
void
dgemm_macro(int m, int n, int k,
double alpha,
const double *A,
const double *B,
double beta,
double *C, int incRowC, int incColC)
{
double C_[DGEMM_MR*DGEMM_NR];
int i, j;
const int MR = DGEMM_MR;
const int NR = DGEMM_NR;
for (j=0; j<n; j+=NR) {
int nr = (j+NR<n) ? NR
: n - j;
for (i=0; i<m; i+=MR) {
int mr = (i+MR<m) ? MR
: m - i;
if (mr==MR && nr==NR) {
ugemm_4_8(k, alpha,
&A[i*k], &B[j*k],
beta,
&C[i*incRowC+j*incColC], incRowC, incColC);
} else {
ugemm_4_8(k, alpha,
&A[i*k], &B[j*k],
0.,
C_, 1, MR);
dgescal(mr, nr, beta,
&C[i*incRowC+j*incColC], incRowC, incColC);
dgeaxpy(mr, nr, 1., C_, 1, MR,
&C[i*incRowC+j*incColC], incRowC, incColC);
}
}
}
}
void
dgemm(int m, int n, int k,
double alpha,
const double *A, int incRowA, int incColA,
const double *B, int incRowB, int incColB,
double beta,
double *C, int incRowC, int incColC)
{
int i, j, l;
const int MC = DGEMM_MC;
const int NC = DGEMM_NC;
const int KC = DGEMM_KC;
if (alpha==0.0 || k==0) {
dgescal(m, n, beta, C, incRowC, incColC);
} else {
double *A_ = (double *)malloc_(32, sizeof(double)*DGEMM_MC*DGEMM_KC);
double *B_ = (double *)malloc_(32, sizeof(double)*DGEMM_KC*DGEMM_NC);
for (j=0; j<n; j+=NC) {
int nc = (j+NC<=n) ? NC
: n - j;
for (l=0; l<k; l+=KC) {
int kc = (l+KC<=k) ? KC
: k - l;
double beta_ = (l==0) ? beta
: 1;
dpack_B(kc, nc,
&B[l*incRowB+j*incColB], incRowB, incColB,
B_);
for (i=0; i<m; i+=MC) {
int mc = (i+MC<=m) ? MC
: m - i;
dpack_A(mc, kc,
&A[i*incRowA+l*incColA], incRowA, incColA,
A_);
dgemm_macro(mc, nc, kc,
alpha,
A_, B_,
beta_,
&C[i*incRowC+j*incColC], incRowC, incColC);
}
}
}
free_(A_);
free_(B_);
}
}
#ifndef ALPHA
#define ALPHA 2
#endif
#ifndef BETA
#define BETA 2
#endif
#ifndef MIN_N
#define MIN_N 100
#endif
#ifndef MAX_N
#define MAX_N 1500
#endif
#ifndef INC_N
#define INC_N 100
#endif
#ifndef MIN_M
#define MIN_M 100
#endif
#ifndef MAX_M
#define MAX_M 1500
#endif
#ifndef INC_M
#define INC_M 100
#endif
#ifndef MIN_K
#define MIN_K 100
#endif
#ifndef MAX_K
#define MAX_K 1500
#endif
#ifndef INC_K
#define INC_K 100
#endif
#ifndef ROWMAJOR_A
#define ROWMAJOR_A 0
#endif
#ifndef ROWMAJOR_B
#define ROWMAJOR_B 0
#endif
#ifndef ROWMAJOR_C
#define ROWMAJOR_C 0
#endif
#if (ROWMAJOR_A==1)
# define INCROW_A MAX_K
# define INCCOL_A 1
#else
# define INCROW_A 1
# define INCCOL_A MAX_M
#endif
#if (ROWMAJOR_B==1)
# define INCROW_B MAX_N
# define INCCOL_B 1
#else
# define INCROW_B 1
# define INCCOL_B MAX_K
#endif
#if (ROWMAJOR_C==1)
# define INCROW_C MAX_N
# define INCCOL_C 1
#else
# define INCROW_C 1
# define INCCOL_C MAX_M
#endif
double A_[MAX_M*MAX_K*MIN(INCROW_A,INCCOL_A)];
double B_[MAX_K*MAX_N*MIN(INCROW_B,INCCOL_B)];
double C_[MAX_M*MAX_N];
double C_0[MAX_M*MAX_N*MIN(INCROW_A,INCCOL_A)];
double C_1[MAX_M*MAX_N*MIN(INCROW_A,INCCOL_A)];
int
main()
{
int m, n, k;
randGeMatrix(MAX_M, MAX_K, A_, INCROW_A, INCCOL_A);
randGeMatrix(MAX_K, MAX_N, B_, INCROW_B, INCCOL_B);
randGeMatrix(MAX_M, MAX_N, C_, 1, MAX_M);
printf("#%9s %9s %9s", "m", "n", "k");
printf(" %12s %12s %12s", "t", "MFLOPS", "err");
printf(" %12s %12s %12s", "t", "MFLOPS", "err");
printf(" %12s %12s %12s", "t", "MFLOPS", "err");
printf("\n");
for (m=MIN_M, n=MIN_N, k=MIN_K;
n<=MAX_N && m<=MAX_M && k<=MAX_K;
m+=INC_M, n+=INC_N, k+=INC_K)
{
double t, dt, err;
int runs = 1;
double ops = 2.0*m/1000*n/1000*k;
printf(" %9d %9d %9d", m, n, k);
dgecopy(m, n, C_, 1, MAX_M, C_0, INCROW_C, INCCOL_C);
dgemm_ref(m, n, k,
ALPHA,
A_, INCROW_A, INCCOL_A,
B_, INCROW_B, INCCOL_B,
BETA,
C_0, INCROW_C, INCCOL_C);
t = 0;
runs = 0;
do {
dgecopy(m, n, C_, 1, MAX_M, C_1, INCROW_C, INCCOL_C);
dt = walltime();
dgemm(m, n, k,
ALPHA,
A_, INCROW_A, INCCOL_A,
B_, INCROW_B, INCCOL_B,
BETA,
C_1, INCROW_C, INCCOL_C);
dt = walltime() - dt;
t += dt;
++runs;
} while (t<0.3);
t /= runs;
err = err_dgemm(m, n, k,
ALPHA,
A_, INCROW_A, INCCOL_A,
B_, INCROW_B, INCCOL_B,
BETA,
C_0, INCROW_C, INCCOL_C,
C_1, INCROW_C, INCCOL_C);
printf(" %12.2e %12.2lf %12.2e", t, ops/t, err);
printf("\n");
}
return 0;
}