#include <stdio.h>
#include <bench/aux.h>
#include <bench/refblas.h>
#include <bench/errbound.h>
#include <ulmblas/level1.h>
#include <ulmblas/level3.h>
#ifndef ALPHA
#define ALPHA 2
#endif
#ifndef BETA
#define BETA 2
#endif
#ifndef MIN_N
#define MIN_N 100
#endif
#ifndef MAX_N
#define MAX_N 1500
#endif
#ifndef INC_N
#define INC_N 100
#endif
#ifndef MIN_M
#define MIN_M 100
#endif
#ifndef MAX_M
#define MAX_M 1500
#endif
#ifndef INC_M
#define INC_M 100
#endif
#ifndef MIN_K
#define MIN_K 100
#endif
#ifndef MAX_K
#define MAX_K 1500
#endif
#ifndef INC_K
#define INC_K 100
#endif
#ifndef ROWMAJOR_A
#define ROWMAJOR_A 0
#endif
#ifndef ROWMAJOR_B
#define ROWMAJOR_B 0
#endif
#ifndef ROWMAJOR_C
#define ROWMAJOR_C 0
#endif
#if (ROWMAJOR_A==1)
#   define INCROW_A  MAX_K
#   define INCCOL_A  1
#else
#   define INCROW_A  1
#   define INCCOL_A  MAX_M
#endif
#if (ROWMAJOR_B==1)
#   define INCROW_B  MAX_N
#   define INCCOL_B  1
#else
#   define INCROW_B  1
#   define INCCOL_B  MAX_K
#endif
#if (ROWMAJOR_C==1)
#   define INCROW_C  MAX_N
#   define INCCOL_C  1
#else
#   define INCROW_C  1
#   define INCCOL_C  MAX_M
#endif
#define MIN(X,Y)   ((X)<(Y) ? (X) : (Y))
#define MAX(X,Y)   ((X)>(Y) ? (X) : (Y))
double A_[MAX_M*MAX_K*MIN(INCROW_A,INCCOL_A)];
double B_[MAX_K*MAX_N*MIN(INCROW_B,INCCOL_B)];
double C_[MAX_M*MAX_N];
double C_0[MAX_M*MAX_N*MIN(INCROW_A,INCCOL_A)];
double C_1[MAX_M*MAX_N*MIN(INCROW_A,INCCOL_A)];
int
main()
{
    int m, n, k;
    randGeMatrix(MAX_M, MAX_K, A_, INCROW_A, INCCOL_A);
    randGeMatrix(MAX_K, MAX_N, B_, INCROW_B, INCCOL_B);
    randGeMatrix(MAX_M, MAX_N, C_, 1, MAX_M);
    printf("#%9s %9s %9s", "m", "n", "k");
    printf(" %12s %12s %12s", "t", "MFLOPS", "err");
    printf(" %12s %12s %12s", "t", "MFLOPS", "err");
    printf("\n");
    for (m=MIN_M, n=MIN_N, k=MIN_K;
         n<=MAX_N && m<=MAX_M && k<=MAX_K;
         m+=INC_M, n+=INC_N, k+=INC_K)
    {
        double t, dt, err;
        int    runs  = 1;
        double ops   = 2.0*m/1000*n/1000*k;
        printf(" %9d %9d %9d", m, n, k);
        
        dgecopy(m, n, C_, 1, MAX_M, C_0, INCROW_C, INCCOL_C);
        dgemm_ref(m, n, k,
                  ALPHA,
                  A_, INCROW_A, INCCOL_A,
                  B_, INCROW_B, INCCOL_B,
                  BETA,
                  C_0, INCROW_C, INCCOL_C);
        
        t    = 0;
        runs = 0;
        do {
            dgecopy(m, n, C_, 1, MAX_M, C_1, INCROW_C, INCCOL_C);
            dt = walltime();
            dgemm(m, n, k,
                  ALPHA,
                  A_, INCROW_A, INCCOL_A,
                  B_, INCROW_B, INCCOL_B,
                  BETA,
                  C_1, INCROW_C, INCCOL_C);
            dt = walltime() - dt;
            t += dt;
            ++runs;
        } while (t<0.3);
        t /= runs;
        err = err_dgemm(m, n, k,
                        ALPHA,
                        A_, INCROW_A, INCCOL_A,
                        B_, INCROW_B, INCCOL_B,
                        BETA,
                        C_0, INCROW_C, INCCOL_C,
                        C_1, INCROW_C, INCCOL_C);
        printf(" %12.2e %12.2lf %12.2e", t, ops/t, err);
        printf("\n");
    }
    return 0;
}