#include <cmath>
#include <cassert>
#include <cstdio>
#include <cstdlib>
#include <sys/times.h>
#include <unistd.h>
#include <mkl_blas.h>
#ifndef COLMAJOR
#define COLMAJOR 1
#endif
#ifndef MAXDIM_M
#define MAXDIM_M 5120
#endif
#ifndef MAXDIM_N
#define MAXDIM_N 5120
#endif
#ifndef MAXDIM_K
#define MAXDIM_K 5120
#endif
#ifndef MIN_M
#define MIN_M 500
#endif
#ifndef MIN_N
#define MIN_N 500
#endif
#ifndef MIN_K
#define MIN_K 500
#endif
#ifndef MAX_M
#define MAX_M 5000
#endif
#ifndef MAX_N
#define MAX_N 5000
#endif
#ifndef MAX_K
#define MAX_K 5000
#endif
#ifndef INC_M
#define INC_M 100
#endif
#ifndef INC_N
#define INC_N 100
#endif
#ifndef INC_K
#define INC_K 100
#endif
#ifndef ALPHA
#define ALPHA 1
#endif
#ifndef BETA
#define BETA 1
#endif
//==============================================================================
// bench
//==============================================================================
namespace bench {
//------------------------------------------------------------------------------
// Auxiliary data for benchmarking
//------------------------------------------------------------------------------
double A[MAXDIM_M*MAXDIM_K] __attribute__ ((aligned (32)));
double B[MAXDIM_K*MAXDIM_N] __attribute__ ((aligned (32)));
double C1[MAXDIM_M*MAXDIM_N] __attribute__ ((aligned (32)));
double C2[MAXDIM_M*MAXDIM_N] __attribute__ ((aligned (32)));
double C3[MAXDIM_M*MAXDIM_N] __attribute__ ((aligned (32)));
double C4[MAXDIM_M*MAXDIM_N] __attribute__ ((aligned (32)));
//------------------------------------------------------------------------------
// Auxiliary functions for benchmarking
//------------------------------------------------------------------------------
double
walltime()
{
struct tms ts;
static double ClockTick=0.0;
if (ClockTick==0.0) {
ClockTick = 1.0 / ((double) sysconf(_SC_CLK_TCK));
}
return ((double) times(&ts)) * ClockTick;
}
void
initMatrix(long m, long n, double *A, long incRowA, long incColA)
{
for (long j=0; j<n; ++j) {
for (long i=0; i<m; ++i) {
A[i*incRowA+j*incColA] = ((double)rand() - RAND_MAX/2)*200/RAND_MAX;
}
}
}
double
asumDiffMatrix(long m, long n,
const double *A, long incRowA, long incColA,
double *B, long incRowB, long incColB)
{
double asum = 0;
for (long j=0; j<n; ++j) {
for (long i=0; i<m; ++i) {
asum += fabs(B[i*incRowB+j*incColB] - A[i*incRowA+j*incColA]);
}
}
return asum;
}
} // namespace bench
//==============================================================================
// ulmBLAS
//==============================================================================
namespace ulmBLAS {
//------------------------------------------------------------------------------
// A <- B
//------------------------------------------------------------------------------
void
gecopy(long m, long n,
const double *A, long incRowA, long incColA,
double *B, long incRowB, long incColB)
{
for (long j=0; j<n; ++j) {
for (long i=0; i<m; ++i) {
B[i*incRowB+j*incColB] = A[i*incRowA+j*incColA];
}
}
}
//------------------------------------------------------------------------------
// Y <- Y + alpha*Yä#
//------------------------------------------------------------------------------
void
geaxpy(long m, long n,
double alpha,
const double *X, long incRowX, long incColX,
double *Y, long incRowY, long incColY)
{
for (long i=0; i<m; ++i) {
for (long j=0; j<n; ++j) {
Y[i*incRowY+j*incColY] += alpha*X[i*incRowX+j*incColX];
}
}
}
//------------------------------------------------------------------------------
// A <- alpha * A
//------------------------------------------------------------------------------
void
gescal(long m, long n,
double alpha,
double *X, long incRowX, long incColX)
{
if (alpha!=1.0) {
for (long i=0; i<m; ++i) {
for (long j=0; j<n; ++j) {
X[i*incRowX+j*incColX] *= alpha;
}
}
}
}
} // namespace ulmBLAS
//==============================================================================
// refColMajor
//==============================================================================
namespace refColMajor {
void
gemm(long m, long n, long k,
double alpha,
const double *A, long incRowA, long incColA,
const double *B, long incRowB, long incColB,
double beta,
double *C, long incRowC, long incColC)
{
ulmBLAS::gescal(m, n, beta, C, incRowC, incColC);
for (long j=0; j<n; ++j) {
for (long l=0; l<k; ++l) {
for (long i=0; i<m; ++i) {
C[i*incRowC+j*incColC] += alpha*A[i*incRowA+l*incColA]
*B[l*incRowB+j*incColB];
}
}
}
}
} // namespace refColMajor
//==============================================================================
// simpleBuffer
//==============================================================================
#ifndef SIMPLE_PUFFER_M_C
#define SIMPLE_PUFFER_M_C 256
#endif
#ifndef SIMPLE_PUFFER_K_C
#define SIMPLE_PUFFER_K_C 256
#endif
#ifndef SIMPLE_PUFFER_N_C
#define SIMPLE_PUFFER_N_C 1024
#endif
namespace simpleBuffer {
double A_[SIMPLE_PUFFER_M_C*SIMPLE_PUFFER_K_C];
double B_[SIMPLE_PUFFER_K_C*SIMPLE_PUFFER_N_C];
double C_[SIMPLE_PUFFER_M_C*SIMPLE_PUFFER_N_C];
void
gemm(long m, long n, long k,
double alpha,
const double *A, long incRowA, long incColA,
const double *B, long incRowB, long incColB,
double beta,
double *C, long incRowC, long incColC)
{
long i, j, l;
long M_C = SIMPLE_PUFFER_M_C;
long N_C = SIMPLE_PUFFER_N_C;
long K_C = SIMPLE_PUFFER_K_C;
long mb = (m+M_C-1) / M_C;
long nb = (n+N_C-1) / N_C;
long kb = (k+K_C-1) / K_C;
long mr = m % M_C;
long nr = n % N_C;
long kr = k % K_C;
using ulmBLAS::geaxpy;
using ulmBLAS::gecopy;
using ulmBLAS::gescal;
gescal(m, n, beta, C, incRowC, incColC);
for (j=0; j<nb; ++j) {
long N = (j<nb-1 || nr==0) ? N_C : nr;
for (l=0; l<kb; ++l) {
long K = (l<kb-1 || kr==0) ? K_C : kr;
gecopy(K, N,
&B[l*K_C*incRowB+j*N_C*incColB], incRowB, incColB,
B_, 1, K_C);
for (i=0; i<mb; ++i) {
long M = (i<mb-1 || mr==0) ? M_C : mr;
gecopy(M, K,
&A[i*M_C*incRowA+l*K_C*incColA], incRowA, incColA,
A_, 1, M_C);
refColMajor::gemm(M, N, K,
1.0,
A_, 1, M_C,
B_, 1, K_C,
0.0,
C_, 1, M_C);
geaxpy(M, N, alpha,
C_, 1, M_C,
&C[i*M_C*incRowC+j*N_C*incColC], incRowC, incColC);
}
}
}
}
} // namespace simpleBuffer
//==============================================================================
// blocked
//==============================================================================
#ifndef BLOCKED_MC
#define BLOCKED_MC 256
#endif
#ifndef BLOCKED_KC
#define BLOCKED_KC 256
#endif
#ifndef BLOCKED_NC
#define BLOCKED_NC 4096
#endif
#ifndef BLOCKED_MR
#define BLOCKED_MR 4
#endif
#ifndef BLOCKED_NR
#define BLOCKED_NR 4
#endif
namespace blocked {
double A_[BLOCKED_MC*BLOCKED_KC] __attribute__ ((aligned (32)));
double B_[BLOCKED_KC*BLOCKED_NC] __attribute__ ((aligned (32)));
void
pack_A(long mc, long kc,
const double *A, long incRowA, long incColA,
double *p)
{
long MR = BLOCKED_MR;
long mp = (mc+MR-1) / MR;
for (long j=0; j<kc; ++j) {
for (long l=0; l<mp; ++l) {
for (long i0=0; i0<MR; ++i0) {
long i = l*MR + i0;
long nu = l*MR*kc + j*MR + i0;
p[nu] = (i<mc) ? A[i*incRowA+j*incColA]
: 0.0;
}
}
}
}
void
pack_B(long kc, long nc,
const double *B, long incRowB, long incColB,
double *p)
{
long NR = BLOCKED_NR;
long np = (nc+NR-1) / NR;
for (long l=0; l<np; ++l) {
for (long j0=0; j0<NR; ++j0) {
for (long i=0; i<kc; ++i) {
long j = l*NR+j0;
long nu = l*NR*kc + i*NR + j0;
p[nu] = (j<nc) ? B[i*incRowB+j*incColB]
: 0.0;
}
}
}
}
void
ugemm(long kc, double alpha,
const double *A, const double *B,
double beta,
double *C, long incRowC, long incColC)
{
double P[BLOCKED_MR*BLOCKED_NR];
long MR = BLOCKED_MR;
long NR = BLOCKED_NR;
for (long l=0; l<MR*NR; ++l) {
P[l] = 0;
}
for (long l=0; l<kc; ++l) {
for (long j=0; j<NR; ++j) {
for (long i=0; i<MR; ++i) {
P[i+j*MR] += A[i+l*MR]*B[l*NR+j];
}
}
}
for (long j=0; j<NR; ++j) {
for (long i=0; i<MR; ++i) {
C[i*incRowC+j*incColC] *= beta;
C[i*incRowC+j*incColC] += alpha*P[i+j*MR];
}
}
}
extern "C" {
void
ugemm_8_4(long kc, double alpha,
const double *A, const double *B,
double beta,
double *C, long incRowC, long incColC);
void
ugemm_4_8(long kc, double alpha,
const double *A, const double *B,
double beta,
double *C, long incRowC, long incColC);
void
ugemm_2_16(long kc, double alpha,
const double *A, const double *B,
double beta,
double *C, long incRowC, long incColC);
} // extern "C"
void
mgemm(long mc, long nc, long kc,
double alpha,
const double *A, const double *B,
double beta,
double *C, long incRowC, long incColC)
{
double C_[BLOCKED_MR*BLOCKED_NR];
long MR = BLOCKED_MR;
long NR = BLOCKED_NR;
long mp = (mc+MR-1) / MR;
long np = (nc+NR-1) / NR;
long mr_ = mc % MR;
long nr_ = nc % NR;
for (long j=0; j<np; ++j) {
long nr = (j!=np-1 || nr_==0) ? NR : nr_;
for (long i=0; i<mp; ++i) {
long mr = (i!=mp-1 || mr_==0) ? MR : mr_;
if (mr==MR && nr==NR) {
ugemm(kc, alpha,
&A[i*kc*MR], &B[j*kc*NR],
beta,
&C[i*MR*incRowC+j*NR*incColC],
incRowC, incColC);
} else {
ugemm(kc, alpha,
&A[i*kc*MR], &B[j*kc*NR],
0.0,
C_, 1, MR);
ulmBLAS::gescal(mr, nr, beta,
&C[i*MR*incRowC+j*NR*incColC],
incRowC, incColC);
ulmBLAS::geaxpy(mr, nr, 1.0, C_, 1, MR,
&C[i*MR*incRowC+j*NR*incColC],
incRowC, incColC);
}
}
}
}
void
gemm(long m, long n, long k,
double alpha,
const double *A, long incRowA, long incColA,
const double *B, long incRowB, long incColB,
double beta,
double *C, long incRowC, long incColC)
{
long MC = BLOCKED_MC;
long NC = BLOCKED_NC;
long KC = BLOCKED_KC;
long mb = (m+MC-1) / MC;
long nb = (n+NC-1) / NC;
long kb = (k+KC-1) / KC;
long mc_ = m % MC;
long nc_ = n % NC;
long kc_ = k % KC;
// if (alpha==0.0 || k==0) {
// ulmBLAS::gescal(m, n, beta, C, incRowC, incColC);
// return;
// }
for (long j=0; j<nb; ++j) {
long nc = (j!=nb-1 || nc_==0) ? NC : nc_;
for (long l=0; l<kb; ++l) {
long kc = (l!=kb-1 || kc_==0) ? KC : kc_;
double beta_ = (l==0) ? beta : 1.0;
pack_B(kc, nc,
&B[l*KC*incRowB+j*NC*incColB],
incRowB, incColB,
B_);
for (long i=0; i<mb; ++i) {
long mc = (i!=mb-1 || mc_==0) ? MC : mc_;
pack_A(mc, kc,
&A[i*MC*incRowA+l*KC*incColA],
incRowA, incColA,
A_);
mgemm(mc, nc, kc,
alpha, A_, B_, beta_,
&C[i*MC*incRowC+j*NC*incColC],
incRowC, incColC);
}
}
}
}
} // namespace blocked
//------------------------------------------------------------------------------
namespace mkl {
void
gemm(MKL_INT m, MKL_INT n, MKL_INT k,
double alpha,
const double *A, MKL_INT incRowA, MKL_INT incColA,
const double *B, MKL_INT incRowB, MKL_INT incColB,
double beta,
double *C, MKL_INT incRowC, MKL_INT incColC)
{
assert(incRowC==1);
char transA = (incRowA==1) ? 'N' : 'T';
char transB = (incRowB==1) ? 'N' : 'T';
MKL_INT ldA = (incRowA==1) ? incColA : incRowA;
MKL_INT ldB = (incRowB==1) ? incColB : incRowB;
MKL_INT ldC = incColC;
dgemm(&transA, &transB,
&m, &n, &k,
&alpha,
A, &ldA,
B, &ldB,
&beta,
C, &ldC);
}
} // namespace mkl
//------------------------------------------------------------------------------
int
main()
{
using namespace bench;
using namespace ulmBLAS;
using namespace std;
initMatrix(MAXDIM_M, MAXDIM_K, A, 1, MAXDIM_M);
initMatrix(MAXDIM_K, MAXDIM_N, B, 1, MAXDIM_K);
initMatrix(MAXDIM_M, MAXDIM_N, C1, 1, MAXDIM_M);
gecopy(MAXDIM_M, MAXDIM_N, C1, 1, MAXDIM_M, C2, 1, MAXDIM_M);
gecopy(MAXDIM_M, MAXDIM_N, C1, 1, MAXDIM_M, C3, 1, MAXDIM_M);
gecopy(MAXDIM_M, MAXDIM_N, C1, 1, MAXDIM_M, C4, 1, MAXDIM_M);
// Header-Zeile für die Ausgabe
printf("%5s %5s %5s ", "m", "n", "k");
printf("%5s %5s ", "IRA", "ICA");
printf("%5s %5s ", "IRB", "ICB");
printf("%5s %5s ", "IRC", "ICC");
printf("%20s %9s", "refColMajor: t", "MFLOPS");
printf("%20s %9s %9s", "refSimpleBuffer: t", "MFLOPS", "diff");
printf("%20s %9s %9s", "blocked GEMM: t", "MFLOPS", "diff");
printf("%20s %9s %9s", "Intel MKL: t", "MFLOPS", "diff");
printf("\n");
for (long m = MIN_M, n = MIN_N, k = MIN_K;
m <=MAX_M && n <= MAX_N && k <= MAX_K;
m += INC_M, n += INC_N, k += INC_K)
{
double t0, t1, diff;
long incRowA = (COLMAJOR) ? 1 : k;
long incColA = (COLMAJOR) ? m : 1;
long incRowB = (COLMAJOR) ? 1 : n;
long incColB = (COLMAJOR) ? k : 1;
long incRowC = (COLMAJOR) ? 1 : n;
long incColC = (COLMAJOR) ? m : 1;
printf("%5ld %5ld %5ld ", m, n, k);
printf("%5ld %5ld ", incRowA, incColA);
printf("%5ld %5ld ", incRowB, incColB);
printf("%5ld %5ld ", incRowC, incColC);
t0 = walltime();
refColMajor::gemm(m, n, k, ALPHA,
A, incRowA, incColA,
B, incRowB, incColB,
BETA,
C1, incRowC, incColC);
t1 = walltime() - t0;
printf("%20.4lf %9.2lf", t1, 2.*m/1000*n/1000*k/t1);
t0 = walltime();
simpleBuffer::gemm(m, n, k, ALPHA,
A, incRowA, incColA,
B, incRowB, incColB,
BETA,
C2, incRowC, incColC);
t1 = walltime() - t0;
diff = asumDiffMatrix(m, n,
C1, incRowC, incColC,
C2, incRowC, incColC);
printf("%20.4lf %9.2lf %9.1e", t1, 2.*m/1000*n/1000*k/t1, diff);
t0 = walltime();
blocked::gemm(m, n, k, ALPHA,
A, incRowA, incColA,
B, incRowB, incColB,
BETA,
C3, incRowC, incColC);
t1 = walltime() - t0;
diff = asumDiffMatrix(m, n,
C1, incRowC, incColC,
C3, incRowC, incColC);
printf("%20.4lf %9.2lf %9.1e", t1, 2.*m/1000*n/1000*k/t1, diff);
t0 = walltime();
mkl::gemm(m, n, k, ALPHA,
A, incRowA, incColA,
B, incRowB, incColB,
BETA,
C4, incRowC, incColC);
t1 = walltime() - t0;
diff = asumDiffMatrix(m, n,
C1, incRowC, incColC,
C4, incRowC, incColC);
printf("%20.4lf %9.2lf %9.1e", t1, 2.*m/1000*n/1000*k/t1, diff);
printf("\n");
}
return 0;
}