#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/times.h>
#include <unistd.h>
#ifndef COLMAJOR
#define COLMAJOR 1
#endif
#ifndef MAXDIM_M
#define MAXDIM_M 4000
#endif
#ifndef MAXDIM_N
#define MAXDIM_N 4000
#endif
#ifndef MAXDIM_K
#define MAXDIM_K 4000
#endif
#ifndef MIN_M
#define MIN_M 100
#endif
#ifndef MIN_N
#define MIN_N 100
#endif
#ifndef MIN_K
#define MIN_K 100
#endif
#ifndef MAX_M
#define MAX_M 7000
#endif
#ifndef MAX_N
#define MAX_N 7000
#endif
#ifndef MAX_K
#define MAX_K 7000
#endif
#ifndef INC_M
#define INC_M 100
#endif
#ifndef INC_N
#define INC_N 100
#endif
#ifndef INC_K
#define INC_K 100
#endif
#ifndef ALPHA
#define ALPHA 1.5
#endif
#ifndef BETA
#define BETA 1.5
#endif
//==============================================================================
// bench
//==============================================================================
//------------------------------------------------------------------------------
// Auxiliary data for benchmarking
//------------------------------------------------------------------------------
double A_bench[MAXDIM_M*MAXDIM_K];
double B_bench[MAXDIM_K*MAXDIM_N];
double C1_bench[MAXDIM_M*MAXDIM_N];
double C2_bench[MAXDIM_M*MAXDIM_N];
//------------------------------------------------------------------------------
// Auxiliary functions for benchmarking
//------------------------------------------------------------------------------
double
walltime_bench()
{
struct tms ts;
static double ClockTick=0.0;
if (ClockTick==0.0) {
ClockTick = 1.0 / ((double) sysconf(_SC_CLK_TCK));
}
return ((double) times(&ts)) * ClockTick;
}
void
initMatrix_bench(long m, long n, double *A, long incRowA, long incColA)
{
long i, j;
for (j=0; j<n; ++j) {
for (i=0; i<m; ++i) {
A[i*incRowA+j*incColA] = ((double)rand() - RAND_MAX/2)*200/RAND_MAX;
}
}
}
double
dasumDiffMatrix_bench(long m, long n,
const double *A, long incRowA, long incColA,
double *B, long incRowB, long incColB)
{
long i, j;
double asum = 0;
for (j=0; j<n; ++j) {
for (i=0; i<m; ++i) {
asum += fabs(B[i*incRowB+j*incColB] - A[i*incRowA+j*incColA]);
}
}
return asum;
}
//==============================================================================
// ulmBLAS
//==============================================================================
//------------------------------------------------------------------------------
// A <- B
//------------------------------------------------------------------------------
void
dgecopy_ulmBLAS(long m, long n,
const double *A, long incRowA, long incColA,
double *B, long incRowB, long incColB)
{
long i, j;
for (j=0; j<n; ++j) {
for (i=0; i<m; ++i) {
B[i*incRowB+j*incColB] = A[i*incRowA+j*incColA];
}
}
}
//------------------------------------------------------------------------------
// Y <- Y + alpha*Y
//------------------------------------------------------------------------------
void
dgeaxpy_ulmBLAS(long m, long n,
double alpha,
const double *X, long incRowX, long incColX,
double *Y, long incRowY, long incColY)
{
long i, j;
for (i=0; i<m; ++i) {
for (j=0; j<n; ++j) {
Y[i*incRowY+j*incColY] += alpha*X[i*incRowX+j*incColX];
}
}
}
//------------------------------------------------------------------------------
// A <- alpha * A
//------------------------------------------------------------------------------
void
dgescal_ulmBLAS(long m, long n,
double alpha,
double *X, long incRowX, long incColX)
{
long i, j;
if (alpha!=1.0) {
for (i=0; i<m; ++i) {
for (j=0; j<n; ++j) {
X[i*incRowX+j*incColX] *= alpha;
}
}
}
}
//==============================================================================
// refColMajor
//==============================================================================
void
dgemm_refColMajor(long m, long n, long k,
double alpha,
const double *A, long incRowA, long incColA,
const double *B, long incRowB, long incColB,
double beta,
double *C, long incRowC, long incColC)
{
long i, j, l;
dgescal_ulmBLAS(m, n, beta, C, incRowC, incColC);
for (j=0; j<n; ++j) {
for (l=0; l<k; ++l) {
for (i=0; i<m; ++i) {
C[i*incRowC+j*incColC] += alpha*A[i*incRowA+l*incColA]
*B[l*incRowB+j*incColB];
}
}
}
}
//==============================================================================
// simpleBuffer
//==============================================================================
#ifndef SIMPLE_PUFFER_MC
#define SIMPLE_PUFFER_MC 256
#endif
#ifndef SIMPLE_PUFFER_KC
#define SIMPLE_PUFFER_KC 256
#endif
#ifndef SIMPLE_PUFFER_NC
#define SIMPLE_PUFFER_NC 1024
#endif
double A_simpleBuffer[SIMPLE_PUFFER_MC*SIMPLE_PUFFER_KC];
double B_simpleBuffer[SIMPLE_PUFFER_KC*SIMPLE_PUFFER_NC];
double C_simpleBuffer[SIMPLE_PUFFER_MC*SIMPLE_PUFFER_NC];
void
dgemm_simpleBuffer(long m, long n, long k,
double alpha,
const double *A, long incRowA, long incColA,
const double *B, long incRowB, long incColB,
double beta,
double *C, long incRowC, long incColC)
{
long i, j, l;
long MC = SIMPLE_PUFFER_MC;
long NC = SIMPLE_PUFFER_NC;
long KC = SIMPLE_PUFFER_KC;
long mb = m / MC;
long nb = n / NC;
long kb = k / KC;
long mr = m % MC;
long nr = n % NC;
long kr = k % KC;
dgescal_ulmBLAS(m, n, beta, C, incRowC, incColC);
for (j=0; j<=nb; ++j) {
long N = (j<nb || nr==0) ? NC : nr;
for (l=0; l<=kb; ++l) {
long K = (l<kb || kr==0) ? KC : kr;
dgecopy_ulmBLAS(K, N,
&B[l*KC*incRowB+j*NC*incColB], incRowB, incColB,
B_simpleBuffer, 1, KC);
for (i=0; i<=mb; ++i) {
long M = (i<mb || mr==0) ? MC : mr;
dgecopy_ulmBLAS(M, K,
&A[i*MC*incRowA+l*KC*incColA], incRowA, incColA,
A_simpleBuffer, 1, MC);
dgemm_refColMajor(M, N, K,
1.0,
A_simpleBuffer, 1, MC,
B_simpleBuffer, 1, KC,
0.0,
C_simpleBuffer, 1, MC);
dgeaxpy_ulmBLAS(M, N,
alpha,
C_simpleBuffer, 1, MC,
&C[i*MC*incRowC+j*NC*incColC], incRowC, incColC);
}
}
}
}
int
main()
{
initMatrix_bench(MAXDIM_M, MAXDIM_K, A_bench, 1, MAXDIM_M);
initMatrix_bench(MAXDIM_K, MAXDIM_N, B_bench, 1, MAXDIM_K);
initMatrix_bench(MAXDIM_M, MAXDIM_N, C1_bench, 1, MAXDIM_M);
dgecopy_ulmBLAS(MAXDIM_M, MAXDIM_N,
C1_bench, 1, MAXDIM_M,
C2_bench, 1, MAXDIM_M);
long m, n, k;
// Header-Zeile für die Ausgabe
printf("%5s %5s %5s ", "m", "n", "k");
printf("%5s %5s ", "IRA", "ICA");
printf("%5s %5s ", "IRB", "ICB");
printf("%5s %5s ", "IRC", "ICC");
printf("%20s %9s", "refColMajor: t", "MFLOPS");
printf("%20s %9s %9s", "refSimpleBuffer: t", "MFLOPS", "diff");
printf("\n");
for (m = MIN_M, n = MIN_N, k = MIN_K;
m <=MAX_M && n <= MAX_N && k <= MAX_K;
m += INC_M, n += INC_N, k += INC_K)
{
double t0, t1, diff;
long incRowA = (COLMAJOR) ? 1 : k;
long incColA = (COLMAJOR) ? m : 1;
long incRowB = (COLMAJOR) ? 1 : n;
long incColB = (COLMAJOR) ? k : 1;
long incRowC = (COLMAJOR) ? 1 : n;
long incColC = (COLMAJOR) ? m : 1;
printf("%5ld %5ld %5ld ", m, n, k);
printf("%5ld %5ld ", incRowA, incColA);
printf("%5ld %5ld ", incRowB, incColB);
printf("%5ld %5ld ", incRowC, incColC);
t0 = walltime_bench();
dgemm_refColMajor(m, n, k, ALPHA,
A_bench, incRowA, incColA,
B_bench, incRowB, incColB,
BETA,
C1_bench, incRowC, incColC);
t1 = walltime_bench() - t0;
printf("%20.4lf %9.2lf", t1, 2.*m/1000*n/1000*k/t1);
t0 = walltime_bench();
dgemm_simpleBuffer(m, n, k, ALPHA,
A_bench, incRowA, incColA,
B_bench, incRowB, incColB,
BETA,
C2_bench, incRowC, incColC);
t1 = walltime_bench() - t0;
diff = dasumDiffMatrix_bench(m, n,
C1_bench, incRowC, incColC,
C2_bench, incRowC, incColC);
printf("%20.4lf %9.2lf %9.1e", t1, 2.*m/1000*n/1000*k/t1, diff);
printf("\n");
}
return 0;
}