#include <stdio.h>
#include <math.h>
#include <stdlib.h>
#include <sys/times.h>
#include <unistd.h>
//-- timer for benchmarks ------------------------------------------------------
double
walltime()
{
struct tms ts;
static double ClockTick=0.0;
if (ClockTick==0.0) {
ClockTick = 1.0 / ((double) sysconf(_SC_CLK_TCK));
}
return ((double) times(&ts)) * ClockTick;
}
//-- setup and print matrices --------------------------------------------------
void
initGeMatrix(int m, int n, double *A, int incRowA, int incColA)
{
int i, j;
for (i=0; i<m; ++i) {
for (j=0; j<n; ++j) {
A[i*incRowA+j*incColA] = i*n + j + 1;
}
}
}
void
randGeMatrix(int m, int n, double *A, int incRowA, int incColA)
{
int i, j;
for (j=0; j<n; ++j) {
for (i=0; i<m; ++i) {
A[i*incRowA+j*incColA] = ((double)rand()-RAND_MAX/2)*200/RAND_MAX;
}
}
}
void
makeTrlDiagDom(int n, int unitDiag, double *A, int incRowA, int incColA)
{
int i, j;
for (j=0; j<n; ++j) {
double asum = 0;
double A_jj = (unitDiag) ? 1
: A[j*(incRowA+incColA)];
for (i=j+1; i<n; ++i) {
asum += fabs(A[i*incRowA+j*incColA]);
}
for (i=j+1; i<n; ++i) {
A[i*incRowA+j*incColA] *= A_jj/(asum*1.1);
}
}
}
void
makeGeDiagDom(int n, int m, double *A, int incRowA, int incColA)
{
int i, j;
for (j=0; j<n; ++j) {
double asum = 0;
for (i=0; i<m; ++i) {
if (i!=j) {
asum += fabs(A[i*incRowA+j*incColA]);
}
}
for (i=0; i<m; ++i) {
if (i!=j) {
A[i*incRowA+j*incColA] *= A[j*(incRowA+incColA)]/(asum*1.1);
}
}
}
}
void
printGeMatrix(int m, int n, const double *A, int incRowA, int incColA)
{
int i, j;
for (i=0; i<m; ++i) {
for (j=0; j<n; ++j) {
printf("%10.4lf ", A[i*incRowA+j*incColA]);
}
printf("\n");
}
printf("\n\n");
}
void
printTrMatrix(int m, int n, int unitDiag, int lower,
const double *A, int incRowA, int incColA)
{
int i, j;
for (i=0; i<m; ++i) {
for (j=0; j<n; ++j) {
if (unitDiag && (i==j)) {
printf("%10.4lf ", 1.0);
} else if ((lower && (i>=j)) || (!lower && (i<=j))) {
printf("%10.4lf ", A[i*incRowA+j*incColA]);
} else {
printf("%10.4lf ", 0.0);
}
}
printf("\n");
}
printf("\n\n");
}
//
// BLAS Level 1
//
double
ddot(int n, const double *x, int incX, const double *y, int incY)
{
int i;
double alpha = 0;
for (i=0; i<n; ++i) {
alpha += x[i*incX]*y[i*incY];
}
return alpha;
}
void
daxpy(int n, double alpha, const double *x, int incX, double *y, int incY)
{
int i;
if (alpha==0) {
return;
}
for (i=0; i<n; ++i) {
y[i*incY] += alpha*x[i*incX];
}
}
void
dgeaxpy(int m, int n, double alpha,
const double *X, int incRowX, int incColX,
double *Y, int incRowY, int incColY)
{
int i, j;
if (alpha==0 || m==0 || n==0) {
return;
}
for (j=0; j<n; ++j) {
for (i=0; i<m; ++i) {
Y[i*incRowY+j*incColY] += alpha*X[i*incRowX+j*incColX];
}
}
}
void
dscal(int n, double alpha, double *x, int incX)
{
int i;
if (alpha==1) {
return;
}
for (i=0; i<n; ++i) {
x[i*incX] *= alpha;
}
}
void
dcopy(int n, const double *x, int incX, double *y, int incY)
{
int i;
for (i=0; i<n; ++i) {
y[i*incY] = x[i*incX];
}
}
void
dgecopy(int m, int n,
const double *X, int incRowX, int incColX,
double *Y, int incRowY, int incColY)
{
int i, j;
for (i=0; i<m; ++i) {
for (j=0; j<n; ++j) {
Y[i*incRowY+j*incColY] = X[i*incRowX+j*incColX];
}
}
}
void
dswap(int n, double *x, int incX, double *y, int incY)
{
int i;
for (i=0; i<n; ++i) {
double tmp;
tmp = x[i*incX];
x[i*incX] = y[i*incY];
y[i*incY] = tmp;
}
}
double
damax(int n, const double *x, int incX)
{
int i;
double result = 0;
for (i=0; i<n; ++i) {
if (fabs(x[i*incX])>result) {
result = fabs(x[i*incX]);
}
}
return result;
}
double
dgenrm1(int m, int n, const double *A, int incRowA, int incColA)
{
int i, j;
double result = 0;
for (j=0; j<n; ++j) {
double sum = 0;
for (i=0; i<m; ++i) {
sum += fabs(A[i*incRowA+j*incColA]);
}
if (sum>result) {
result = sum;
}
}
return result;
}
double
dtrnrm1(int m, int n, int unitDiag, int lower,
const double *A, int incRowA, int incColA)
{
int i, j;
double result = 0;
for (j=0; j<n; ++j) {
double sum = 0;
for (i=0; i<m; ++i) {
if (unitDiag && (i==j)) {
sum += 1.0;
} else if ((lower && (i>=j)) || (!lower && (i<=j))) {
sum += fabs(A[i*incRowA+j*incColA]);
}
}
if (sum>result) {
result = sum;
}
}
return result;
}
//
// BLAS Level 2
//
#ifndef DGEMV_MxBS
#define DGEMV_MxBS 4
#endif
void
dgemv_mxBS(int m, double alpha,
const double *A, int incRowA, int incColA,
const double *x, int incX,
double *y, int incY)
{
int i, j;
if (alpha==0) {
return;
}
for (i=0; i<m; ++i) {
double dot = 0;
for (j=0; j<DGEMV_MxBS; ++j) {
dot += A[i*incRowA+j*incColA]*x[j*incX];
}
y[i*incY] += alpha*dot;
}
}
#ifndef DGEMV_BSxN
#define DGEMV_BSxN 4
#endif
void
dgemv_BSxn(int n, double alpha,
const double *A, int incRowA, int incColA,
const double *x, int incX,
double beta,
double *y, int incY)
{
int i, j;
if (alpha==0) {
return;
}
if (beta!=1) {
for (i=0; i<DGEMV_BSxN; ++i) {
y[i*incY] *= beta;
}
}
for (j=0; j<n; ++j) {
for (i=0; i<DGEMV_BSxN; ++i) {
y[i*incY] += alpha*A[i*incRowA+j*incColA]*x[j*incX];
}
}
}
//-- GEMV implementations ------------------------------------------------------
void
dgemv_ref(int m, int n, double alpha,
const double *A, int incRowA, int incColA,
const double *x, int incX,
double beta,
double *y, int incY)
{
int i, j;
for (i=0; i<m; ++i) {
if (beta==0) {
y[i*incY] = 0;
} else {
y[i*incY] *= beta;
}
for (j=0; j<n; ++j) {
y[i*incY] += alpha*A[i*incRowA+j*incColA]*x[j*incX];
}
}
}
void
dgemv_unblk(int m, int n, double alpha,
const double *A, int incRowA, int incColA,
const double *x, int incX,
double beta,
double *y, int incY)
{
if (incRowA<incColA) {
int j;
dscal(m, beta, y, incY);
for (j=0; j<n; ++j) {
daxpy(m, alpha*x[j*incX], &A[j*incColA], incRowA, y, incY);
}
} else {
int i;
for (i=0; i<m; ++i) {
y[i*incY] *= beta;
y[i*incY] += alpha*ddot(n, &A[i*incRowA], incColA, x, incX);
}
}
}
void
dgemv(int m, int n, double alpha,
const double *A, int incRowA, int incColA,
const double *x, int incX,
double beta,
double *y, int incY)
{
if (incRowA<incColA) {
int bs = DGEMV_MxBS;
int j;
dscal(m, beta, y, incY);
for (j=0; j+bs<=n; j+=bs) {
dgemv_mxBS(m, alpha,
&A[j*incColA], incRowA, incColA,
&x[j*incX], incX,
y, incY);
}
dgemv_unblk(m, n-j, alpha,
&A[j*incColA], incRowA, incColA,
&x[j*incX], incX,
1.0,
y, incY);
} else {
int bs = DGEMV_BSxN;
int i;
for (i=0; i+bs<=m; i+=bs) {
dgemv_BSxn(n, alpha,
&A[i*incRowA], incRowA, incColA,
x, incX,
beta,
&y[i*incY], incY);
}
dgemv_unblk(m-i, n, alpha,
&A[i*incRowA], incRowA, incColA,
x, incX,
beta,
&y[i*incY], incY);
}
}
//-- TRMV implementations ------------------------------------------------------
void
dtrlmv_ref(int n, int unitDiag,
const double *A, int incRowA, int incColA,
double *x, int incX)
{
int i, j;
for (j=n-1; j>=0; --j) {
for (i=j+1; i<n; ++i) {
x[i*incX] += A[i*incRowA+j*incColA]*x[j*incX];
}
if (!unitDiag) {
x[j*incX] *= A[j*(incRowA+incColA)];
}
}
}
void
dtrlmv_col(int n, int unitDiag,
const double *A, int incRowA, int incColA,
double *x, int incX)
{
int k;
if (unitDiag) {
for (k=n-1; k>=0; --k) {
daxpy(n-k-1, x[k*incX],
&A[(k+1)*incRowA+k*incColA], incRowA,
&x[(k+1)*incX], incX);
}
} else {
for (k=n-1; k>=0; --k) {
daxpy(n-k-1, x[k*incX],
&A[(k+1)*incRowA+k*incColA], incRowA,
&x[(k+1)*incX], incX);
x[k*incX] *= A[k*(incRowA+incColA)];
}
}
}
void
dtrlmv_row(int n, int unitDiag,
const double *A, int incRowA, int incColA,
double *x, int incX)
{
int k;
if (unitDiag) {
for (k=n-1; k>=0; --k) {
x[k*incX] += ddot(k, &A[k*incRowA], incColA, x, incX);
}
} else {
for (k=n-1; k>=0; --k) {
x[k*incX] = ddot(k+1, &A[k*incRowA], incColA, x, incX);
}
}
}
void
dtrlmv(int n, int unitDiag,
const double *A, int incRowA, int incColA,
double *x, int incX)
{
if (incRowA<incColA) {
int bs = DGEMV_MxBS;
int k = (n/bs)*bs;
dtrlmv_col(n-k, unitDiag,
&A[k*(incRowA+incColA)], incRowA, incColA,
&x[k*incX], incX);
for (k-=bs; k>=0; k-=bs) {
dgemv_mxBS(n-k-bs, 1.0,
&A[(k+bs)*incRowA+k*incColA], incRowA, incColA,
&x[k*incX], incX,
&x[(k+bs)*incX], incX);
dtrlmv_col(bs, unitDiag,
&A[k*(incRowA+incColA)], incRowA, incColA,
&x[k*incX], incX);
}
} else {
double buffer[DGEMV_BSxN];
int bs = DGEMV_BSxN;
int k = (n/bs)*bs;
dgemv(n-k, k, 1.0,
&A[k*incRowA], incRowA, incColA,
x, incX,
0.0,
buffer,1);
dtrlmv_row(n-k, unitDiag,
&A[k*(incRowA+incColA)], incRowA, incColA,
&x[k*incX], incX);
daxpy(n-k, 1.0, buffer, 1, &x[k*incX], incX);
for (k-=bs; k>=0; k-=bs) {
dgemv_BSxn(k, 1.0,
&A[k*incRowA], incRowA, incColA,
x, incX,
0.0,
buffer, 1);
dtrlmv_col(bs, unitDiag,
&A[k*(incRowA+incColA)], incRowA, incColA,
&x[k*incX], incX);
daxpy(bs, 1.0, buffer, 1, &x[k*incX], incX);
}
}
}
//-- TRSV implementations ------------------------------------------------------
void
dtrlsv_ref(int n, int unitDiag,
const double *A, int incRowA, int incColA,
double *x, int incX)
{
int i, j;
for (i=0; i<n; ++i) {
for (j=0; j<i; ++j) {
x[i*incX] -= A[i*incRowA+j*incColA]*x[j*incX];
}
x[i*incX] /= (unitDiag) ? 1.0 : A[i*(incRowA+incColA)];
}
}
void
dtrlsv_row(int n, int unitDiag,
const double *A, int incRowA, int incColA,
double *x, int incX)
{
int k;
for (k=0; k<n; ++k) {
x[k*incX] -= ddot(k, &A[k*incRowA], incColA, x, incX);
x[k*incX] /= (unitDiag) ? 1.0 : A[k*(incRowA+incColA)];
}
}
void
dtrlsv_col(int n, int unitDiag,
const double *A, int incRowA, int incColA,
double *x, int incX)
{
int k;
for (k=0; k<n; ++k) {
x[k*incX] /= (unitDiag) ? 1.0 : A[k*(incRowA+incColA)];
daxpy(n-k-1, -x[k*incX],
&A[(k+1)*incRowA+k*incColA], incRowA,
&x[(k+1)*incX], incX);
}
}
void
dtrlsv(int n, int unitDiag,
const double *A, int incRowA, int incColA,
double *x, int incX)
{
if (incRowA<incColA) {
dtrlsv_col(n, unitDiag, A, incRowA, incColA, x, incX);
} else {
dtrlsv_row(n, unitDiag, A, incRowA, incColA, x, incX);
}
}
//-- GER implementations -------------------------------------------------------
void
dger_ref(int m, int n, double alpha,
const double *x, int incX,
const double *y, int incY,
double *A, int incRowA, int incColA)
{
int i, j;
for (j=0; j<n; ++j) {
for (i=0; i<m; ++i) {
A[i*incRowA+j*incColA] += alpha*x[i*incX]*y[j*incY];
}
}
}
void
dger_row(int m, int n, double alpha,
const double *x, int incX,
const double *y, int incY,
double *A, int incRowA, int incColA)
{
int i;
for (i=0; i<m; ++i) {
daxpy(n, alpha*x[i*incX], y, incY, &A[i*incRowA], incColA);
}
}
void
dger_col(int m, int n, double alpha,
const double *x, int incX,
const double *y, int incY,
double *A, int incRowA, int incColA)
{
int j;
for (j=0; j<n; ++j) {
daxpy(m, alpha*y[j*incY], x, incX, &A[j*incColA], incRowA);
}
}
void
dger(int m, int n, double alpha,
const double *x, int incX,
const double *y, int incY,
double *A, int incRowA, int incColA)
{
if (incRowA<incColA) {
dger_col(m, n, alpha, x, incX, y, incY, A, incRowA, incColA);
} else {
dger_row(m, n, alpha, x, incX, y, incY, A, incRowA, incColA);
}
}
//
// BLAS Level 3
//
//-- GEMM ----------------------------------------------------------------------
void
dgemm_ref(int m, int n, int k,
double alpha,
const double *A, int incRowA, int incColA,
const double *B, int incRowB, int incColB,
double beta,
double *C, int incRowC, int incColC)
{
int i, j, l;
if (beta!=1) {
for (i=0; i<m; ++i) {
for (j=0; j<n; ++j) {
if (beta!=0) {
C[i*incRowC+j*incColC] *= beta;
} else {
C[i*incRowC+j*incColC] = 0;
}
}
}
}
if (alpha!=0) {
for (i=0; i<m; ++i) {
for (j=0; j<n; ++j) {
for (l=0; l<k; ++l) {
C[i*incRowC+j*incColC] += alpha*A[i*incRowA+l*incColA]
*B[l*incRowB+j*incColB];
}
}
}
}
}
void
dgemm_mv(int m, int n, int k,
double alpha,
const double *A, int incRowA, int incColA,
const double *B, int incRowB, int incColB,
double beta,
double *C, int incRowC, int incColC)
{
int j;
for (j=0; j<n; ++j) {
dgemv(m, k,
alpha,
A, incRowA, incColA,
&B[j*incColB], incRowB,
beta,
&C[j*incColC], incRowC);
}
}
void
dgemm_jil(int m, int n, int k,
double alpha,
const double *A, int incRowA, int incColA,
const double *B, int incRowB, int incColB,
double beta,
double *C, int incRowC, int incColC)
{
int i, j, l;
for (j=0; j<n; ++j) {
for (i=0; i<m; ++i) {
if (beta!=0) {
C[i*incRowC+j*incColC] *= beta;
} else {
C[i*incRowC+j*incColC] = 0;
}
for (l=0; l<k; ++l) {
C[i*incRowC+j*incColC] += alpha*A[i*incRowA+l*incColA]
*B[l*incRowB+j*incColB];
}
}
}
}
void
dgemm_jli(int m, int n, int k,
double alpha,
const double *A, int incRowA, int incColA,
const double *B, int incRowB, int incColB,
double beta,
double *C, int incRowC, int incColC)
{
int i, j, l;
for (j=0; j<n; ++j) {
for (l=0; l<k; ++l) {
for (i=0; i<m; ++i) {
if (l==0) {
if (beta!=0) {
C[i*incRowC+j*incColC] *= beta;
} else {
C[i*incRowC+j*incColC] = 0;
}
}
C[i*incRowC+j*incColC] += alpha*A[i*incRowA+l*incColA]
*B[l*incRowB+j*incColB];
}
}
}
}
//-- LU factorization ----------------------------------------------------------
void
lu_unblk_var1(int m, int n, double *A, int incRowA, int incColA)
{
int min_mn = (m<n) ? m : n;
int max_mn = (m>n) ? m : n;
int j;
for (j=1; j<max_mn; ++j) {
int k = (min_mn < j) ? min_mn : j;
if (j<n) {
dtrlsv(k, 1,
A, incRowA, incColA,
&A[j*incColA], incRowA);
}
if (j<m) {
dtrlsv(k, 0,
A, incColA, incRowA,
&A[j*incRowA], incColA);
}
if (j<m && j<n) {
A[j*(incRowA+incColA)] -= ddot(j,
&A[j*incRowA], incColA,
&A[j*incColA], incRowA);
}
}
}
void
lu_unblk_var2(int m, int n, double *A, int incRowA, int incColA)
{
int min_mn = (m<n) ? m : n;
int j;
for (j=1; j<m; ++j) {
int k = (min_mn < j) ? min_mn : j;
if (j<m) {
dtrlsv(k, 0,
A, incColA, incRowA,
&A[j*incRowA], incColA);
}
if (j<n) {
dgemv(n-j, j,
-1.0,
&A[j*incColA], incColA, incRowA,
&A[j*incRowA], incColA,
1.0,
&A[j*(incRowA+incColA)], incColA);
}
}
}
void
lu_unblk_var3(int m, int n, double *A, int incRowA, int incColA)
{
int min_mn = (m<n) ? m : n;
int j;
for (j=0; j<n; ++j) {
int k = (min_mn < j) ? min_mn : j;
dtrlsv(k, 1,
A, incRowA, incColA,
&A[j*incColA], incRowA);
if (j<m) {
dgemv(m-j, j,
-1.0,
&A[j*incRowA], incRowA, incColA,
&A[j*incColA], incRowA,
1.0,
&A[j*(incRowA+incColA)], incRowA);
if (j+1<m) {
dscal(m-j-1,
1.0/A[j*(incRowA+incColA)],
&A[(j+1)*incRowA+j*incColA], incRowA);
}
}
}
}
void
lu_unblk_var4(int m, int n, double *A, int incRowA, int incColA)
{
int min_mn = (m<n) ? m : n;
int j;
for (j=0; j<min_mn; ++j) {
dgemv(n-j, j,
-1.0,
&A[j*incColA], incColA, incRowA,
&A[j*incRowA], incColA,
1.0,
&A[j*(incRowA+incColA)], incColA);
if (j+1<m) {
dgemv(m-j-1, j,
-1.0,
&A[(j+1)*incRowA], incRowA, incColA,
&A[j*incColA], incRowA,
1.0,
&A[(j+1)*incRowA+j*incColA], incRowA);
dscal(m-j-1,
1.0/A[j*(incRowA+incColA)],
&A[(j+1)*incRowA+j*incColA], incRowA);
}
}
}
void
lu_unblk_var5(int m, int n, double *A, int incRowA, int incColA)
{
int min_mn = (m<n) ? m : n;
int j;
for (j=0; j<min_mn; ++j) {
dscal(m-j-1,
1.0/A[j*(incRowA+incColA)],
&A[(j+1)*incRowA+j*incColA], incRowA);
dger(m-j-1, n-j-1,
-1.0,
&A[(j+1)*incRowA+j*incColA], incRowA,
&A[j*incRowA+(j+1)*incColA], incColA,
&A[(j+1)*(incRowA+incColA)], incRowA, incColA);
}
}
//-- error bounds --------------------------------------------------------------
#define MIN(X,Y) ((X)<(Y) ? (X) : (Y))
#define MAX(X,Y) ((X)>(Y) ? (X) : (Y))
double
err_dgemv(int m, int n, double alpha,
const double *A, int incRowA, int incColA,
const double *x, int incX,
double beta,
const double *y0, int incY0,
double *y1, int incY1)
{
int max_mn = (m>n) ? m : n;
double normA = fabs(alpha)*dgenrm1(m, n, A, incRowA, incColA);
double normX = damax(n, x, incX);
double normY0 = fabs(beta)*damax(m, y0, incY0);
double normD;
daxpy(m, -1.0, y0, incY0, y1, incY1);
normD = damax(n, y1, incY1);
return normD/(normA*normX*normY0*max_mn);
}
double
err_dtrmv(int n, int unitDiag, int lower,
const double *A, int incRowA, int incColA,
const double *x0, int incX0,
double *x1, int incX1)
{
double normA = dtrnrm1(n, n, unitDiag, lower, A, incRowA, incColA);
double normX0 = damax(n, x0, incX0);
double normD;
daxpy(n, -1.0, x0, incX0, x1, incX1);
normD = damax(n, x1, incX1);
return normD/(n*normA*normX0);
}
double
err_dtrsv(int n, int unitDiag, int lower,
const double *A, int incRowA, int incColA,
const double *x0, int incX0,
double *x1, int incX1)
{
double normA = dtrnrm1(n, n, unitDiag, lower, A, incRowA, incColA);
double normX0 = damax(n, x0, incX0);
double normD;
daxpy(n, -1.0, x0, incX0, x1, incX1);
normD = damax(n, x1, incX1);
return normD/(n*normA*normX0);
}
double
err_dger(int m, int n, double alpha,
const double *x, int incX,
const double *y, int incY,
const double *A0, int incRowA0, int incColA0,
double *A, int incRowA, int incColA)
{
double normA0 = dgenrm1(m, n, A0, incRowA0, incColA0);
double normX = fabs(alpha)*damax(m, x, incX);
double normY = damax(n, y, incY);
double normD;
int mn = (m>n) ? m : n;
dgeaxpy(m, n, -1.0, A0, incRowA0, incColA0, A, incRowA, incColA);
normD = dgenrm1(m, n, A, incRowA, incColA);
return normD/(mn*normA0*normX*normY);
}
double
err_dgemm(int m, int n, int k,
double alpha,
const double *A, int incRowA, int incColA,
const double *B, int incRowB, int incColB,
double beta,
const double *C0, int incRowC0, int incColC0,
double *C, int incRowC, int incColC)
{
double normA = dgenrm1(m, k, A, incRowA, incColA);
double normB = dgenrm1(k, n, B, incRowB, incColB);
double normC = dgenrm1(m, n, C, incRowC0, incColC0);
double normD;
int mnk = MAX(m, MAX(n, k));
normA = MAX(normA, fabs(alpha)*normA);
normC = MAX(normC, fabs(beta)*normC);
dgeaxpy(m, n, -1.0, C0, incRowC0, incColC0, C, incRowC, incColC);
normD = dgenrm1(m, n, C, incRowC, incColC);
return normD/(mnk*normA*normB*normC);
}
double
err_lu(int m, int n,
const double *A, int incRowA, int incColA,
double *LU, int incRowLU, int incColLU)
{
double aNrm1 = 0;
double err = 0;
int i,j;
int k = (m<n) ? m : n;
double *L = (double *)malloc(m*k*sizeof(double));
double *U = (double *)malloc(k*n*sizeof(double));
for (j=0; j<k; ++j) {
for (i=0; i<m; ++i) {
L[i+j*m] = (i>j) ? LU[i*incRowLU+j*incColLU]
: (i==j) ? 1
: 0;
}
}
for (j=0; j<n; ++j) {
for (i=0; i<k; ++i) {
U[i+j*k] = (i<=j) ? LU[i*incRowLU+j*incColLU]
: 0;
}
}
for (j=0; j<n; ++j) {
for (i=0; i<m; ++i) {
aNrm1 += fabs(A[i*incRowA+j*incColA]);
}
}
dgemm_ref(m, n, k, 1.0,
L, 1, m,
U, 1, k,
0.0,
LU, incRowLU, incColLU);
for (j=0; j<n; ++j) {
for (i=0; i<m; ++i) {
err += fabs(A[i*incRowA+j*incColA]-LU[i*incRowLU+j*incColLU]);
}
}
err /= (aNrm1*k);
free(L);
free(U);
return err;
}
//------------------------------------------------------------------------------
#ifndef ALPHA
#define ALPHA 2
#endif
#ifndef BETA
#define BETA 2
#endif
#ifndef MIN_N
#define MIN_N 100
#endif
#ifndef MAX_N
#define MAX_N 1000
#endif
#ifndef INC_N
#define INC_N 100
#endif
#ifndef MIN_M
#define MIN_M 100
#endif
#ifndef MAX_M
#define MAX_M 1000
#endif
#ifndef INC_M
#define INC_M 100
#endif
#ifndef MIN_K
#define MIN_K 100
#endif
#ifndef MAX_K
#define MAX_K 1000
#endif
#ifndef INC_K
#define INC_K 100
#endif
#ifndef ROWMAJOR_A
#define ROWMAJOR_A 0
#endif
#ifndef ROWMAJOR_B
#define ROWMAJOR_B 0
#endif
#ifndef ROWMAJOR_C
#define ROWMAJOR_C 0
#endif
#if (ROWMAJOR_A==1)
# define INCROW_A MAX_K
# define INCCOL_A 1
#else
# define INCROW_A 1
# define INCCOL_A MAX_M
#endif
#if (ROWMAJOR_B==1)
# define INCROW_B MAX_N
# define INCCOL_B 1
#else
# define INCROW_B 1
# define INCCOL_B MAX_K
#endif
#if (ROWMAJOR_C==1)
# define INCROW_C MAX_N
# define INCCOL_C 1
#else
# define INCROW_C 1
# define INCCOL_C MAX_M
#endif
double A_[MAX_M*MAX_K*MIN(INCROW_A,INCCOL_A)];
double B_[MAX_K*MAX_N*MIN(INCROW_B,INCCOL_B)];
double C_[MAX_M*MAX_N];
double C_0[MAX_M*MAX_N*MIN(INCROW_A,INCCOL_A)];
double C_1[MAX_M*MAX_N*MIN(INCROW_A,INCCOL_A)];
int
main()
{
int m, n, k;
randGeMatrix(MAX_M, MAX_K, A_, INCROW_A, INCCOL_A);
randGeMatrix(MAX_K, MAX_N, B_, INCROW_B, INCCOL_B);
randGeMatrix(MAX_M, MAX_N, C_, 1, MAX_M);
printf("#%9s %9s %9s", "m", "n", "k");
printf(" %12s %12s %12s", "t", "MFLOPS", "err");
printf(" %12s %12s %12s", "t", "MFLOPS", "err");
printf(" %12s %12s %12s", "t", "MFLOPS", "err");
printf("\n");
for (m=MIN_M, n=MIN_N, k=MIN_K;
n<=MAX_N && m<=MAX_M && k<=MAX_K;
m+=INC_M, n+=INC_N, k+=INC_K)
{
double t, dt, err;
int runs = 1;
double ops = 2*m*n*k;
printf(" %9d %9d %9d", m, n, k);
// compute reference solution
dgecopy(m, n, C_, 1, MAX_M, C_0, INCROW_C, INCCOL_C);
dgemm_ref(m, n, k,
ALPHA,
A_, INCROW_A, INCCOL_A,
B_, INCROW_B, INCCOL_B,
BETA,
C_0, INCROW_C, INCCOL_C);
// bench matrix-vector variants
// dgemm_mv
t = 0;
runs = 0;
do {
dgecopy(m, n, C_, 1, MAX_M, C_1, INCROW_C, INCCOL_C);
dt = walltime();
dgemm_mv(m, n, k,
ALPHA,
A_, INCROW_A, INCCOL_A,
B_, INCROW_B, INCCOL_B,
BETA,
C_1, INCROW_C, INCCOL_C);
dt = walltime() - dt;
t += dt;
++runs;
} while (t<0.3);
t /= runs;
err = err_dgemm(m, n, k,
ALPHA,
A_, INCROW_A, INCCOL_A,
B_, INCROW_B, INCCOL_B,
BETA,
C_0, INCROW_C, INCCOL_C,
C_1, INCROW_C, INCCOL_C);
printf(" %12.2e %12.2lf %12.2e", t, ops/(1000*1000*t), err);
// dgemm_jil
t = 0;
runs = 0;
do {
dgecopy(m, n, C_, 1, MAX_M, C_1, INCROW_C, INCCOL_C);
dt = walltime();
dgemm_jil(m, n, k,
ALPHA,
A_, INCROW_A, INCCOL_A,
B_, INCROW_B, INCCOL_B,
BETA,
C_1, INCROW_C, INCCOL_C);
dt = walltime() - dt;
t += dt;
++runs;
} while (t<0.3);
t /= runs;
err = err_dgemm(m, n, k,
ALPHA,
A_, INCROW_A, INCCOL_A,
B_, INCROW_B, INCCOL_B,
BETA,
C_0, INCROW_C, INCCOL_C,
C_1, INCROW_C, INCCOL_C);
printf(" %12.2e %12.2lf %12.2e", t, ops/(1000*1000*t), err);
// dgemm_jli
t = 0;
runs = 0;
do {
dgecopy(m, n, C_, 1, MAX_M, C_1, INCROW_C, INCCOL_C);
dt = walltime();
dgemm_jli(m, n, k,
ALPHA,
A_, INCROW_A, INCCOL_A,
B_, INCROW_B, INCCOL_B,
BETA,
C_1, INCROW_C, INCCOL_C);
dt = walltime() - dt;
t += dt;
++runs;
} while (t<0.3);
t /= runs;
err = err_dgemm(m, n, k,
ALPHA,
A_, INCROW_A, INCCOL_A,
B_, INCROW_B, INCCOL_B,
BETA,
C_0, INCROW_C, INCCOL_C,
C_1, INCROW_C, INCCOL_C);
printf(" %12.2e %12.2lf %12.2e", t, ops/(1000*1000*t), err);
printf("\n");
}
return 0;
}