#include <stdlib.h>
#include <stdio.h>
#include <stddef.h>
#include <math.h>
#include <float.h>
#include <stdbool.h>
#include <sys/times.h>
#include <unistd.h>
double
walltime()
{
struct tms ts;
static double ClockTick=0.0;
if (ClockTick==0.0) {
ClockTick = 1.0 / ((double) sysconf(_SC_CLK_TCK));
}
return ((double) times(&ts)) * ClockTick;
}
void
initGeMatrix(size_t m, size_t n,
double *A,
ptrdiff_t incRowA, ptrdiff_t incColA)
{
for (size_t i=0; i<m; ++i) {
for (size_t j=0; j<n; ++j) {
A[i*incRowA + j*incColA] = i*n + j +1;
}
}
}
void
randGeMatrix(size_t m, size_t n, bool withNan,
double *A,
ptrdiff_t incRowA, ptrdiff_t incColA)
{
for (size_t i=0; i<m; ++i) {
for (size_t j=0; j<n; ++j) {
A[i*incRowA + j*incColA] = withNan
? nan("")
: 2.*(rand()-RAND_MAX/2)/RAND_MAX;
}
}
}
void
printGeMatrix(size_t m, size_t n,
const double *A,
ptrdiff_t incRowA, ptrdiff_t incColA)
{
for (size_t i=0; i<m; ++i) {
for (size_t j=0; j<n; ++j) {
printf("%9.2lf ", A[i*incRowA + j*incColA]);
}
printf("\n");
}
printf("\n");
}
double
dgenrm_inf(size_t m, size_t n,
const double *A,
ptrdiff_t incRowA, ptrdiff_t incColA)
{
double result = 0;
for (size_t i=0; i<m; ++i) {
double sum = 0;
for (size_t j=0; j<n; ++j) {
sum += fabs(A[i*incRowA+j*incColA]);
}
if (sum>result) {
result = sum;
}
}
return result;
}
void
dcopy(size_t n,
const double *x, ptrdiff_t incX,
double *y, ptrdiff_t incY)
{
for (size_t i=0; i<n; ++i) {
y[i*incY] = x[i*incX];
}
}
void
daxpy(size_t n, double alpha,
const double *x, ptrdiff_t incX,
double *y, ptrdiff_t incY)
{
for (size_t i=0; i<n; ++i) {
y[i*incY] += alpha*x[i*incX];
}
}
double
ddot(size_t n,
const double *x, ptrdiff_t incX,
const double *y, ptrdiff_t incY)
{
double result = 0;
for (size_t i=0; i<n; ++i) {
result += x[i*incX]*y[i*incY];
}
return result;
}
void
dscal(size_t n,
double alpha,
double *x, ptrdiff_t incX)
{
if (alpha==1) {
return;
}
if (alpha!=0) {
for (size_t i=0; i<n; ++i) {
x[i*incX] *= alpha;
}
} else {
for (size_t i=0; i<n; ++i) {
x[i*incX] = 0;
}
}
}
void
dgemv_dot(size_t m, size_t n,
double alpha,
const double *A, ptrdiff_t incRowA, ptrdiff_t incColA,
const double *x, ptrdiff_t incX,
double beta,
double *y, ptrdiff_t incY)
{
dscal(m, beta, y, incY);
if (alpha==0) {
return;
}
for (size_t i=0; i<m; ++i) {
y[i*incY] += alpha*ddot(n, &A[i*incRowA], incColA, x, incX);
}
}
#ifndef DOTF
#define DOTF 4
#endif
void
dgemv_dotf(size_t m, size_t n,
double alpha,
const double *A, ptrdiff_t incRowA, ptrdiff_t incColA,
const double *x, ptrdiff_t incX,
double beta,
double *y, ptrdiff_t incY)
{
dscal(m, beta, y, incY);
if (alpha==0) {
return;
}
size_t mb = m / DOTF;
for (size_t i=0; i<mb; ++i) {
for (size_t j=0; j<n; ++j) {
for (size_t l=0; l<DOTF; ++l) {
y[(DOTF*i+l)*incY]
+= alpha*A[(DOTF*i+l)*incRowA+j*incColA]*x[j*incX];
}
}
}
for (size_t i=mb*DOTF; i<m; ++i) {
y[i*incY] += alpha*ddot(n, &A[i*incRowA], incColA, x, incX);
}
}
void
dgemv_axpy(size_t m, size_t n,
double alpha,
const double *A, ptrdiff_t incRowA, ptrdiff_t incColA,
const double *x, ptrdiff_t incX,
double beta,
double *y, ptrdiff_t incY)
{
dscal(m, beta, y, incY);
if (alpha==0) {
return;
}
for (size_t j=0; j<n; ++j) {
daxpy(m, alpha*x[j*incX], &A[j*incColA], incRowA, y, incY);
}
}
#ifndef AXPYF
#define AXPYF 4
#endif
void
dgemv_axpyf(size_t m, size_t n,
double alpha,
const double *A, ptrdiff_t incRowA, ptrdiff_t incColA,
const double *x, ptrdiff_t incX,
double beta,
double *y, ptrdiff_t incY)
{
dscal(m, beta, y, incY);
if (alpha==0) {
return;
}
size_t nb = n / AXPYF;
for (size_t j=0; j<nb; ++j) {
for (size_t i=0; i<m; ++i) {
for (size_t l=0; l<AXPYF; ++l) {
y[i*incY] += alpha*A[i*incRowA+(j*AXPYF+l)*incColA]
*x[(j*AXPYF+l)*incX];
}
}
}
for (size_t j=nb*AXPYF; j<n; ++j) {
daxpy(n, alpha*x[j*incX], &A[j*incColA], incRowA, y, incY);
}
}
void
dgemv_ref(size_t m, size_t n,
double alpha,
const double *A, ptrdiff_t incRowA, ptrdiff_t incColA,
const double *x, ptrdiff_t incX,
double beta,
double *y, ptrdiff_t incY)
{
if (beta!=1) {
if (beta!=0) {
for (size_t i=0; i<m; ++i) {
y[i*incY] *= beta;
}
} else {
for (size_t i=0; i<m; ++i) {
y[i*incY] = 0;
}
}
}
if (alpha!=0) {
for (size_t j=0; j<n; ++j) {
for (size_t i=0; i<m; ++i) {
y[i*incY] += alpha*A[i*incRowA+j*incColA]*x[j*incX];
}
}
}
}
double
dgemv_err(size_t m, size_t n,
double alpha,
const double *A, ptrdiff_t incRowA, ptrdiff_t incColA,
const double *x, ptrdiff_t incX,
const double *y0, ptrdiff_t incY0,
double beta,
const double *yRef, ptrdiff_t incYRef,
double *ySol, ptrdiff_t incYSol)
{
double nrmY0 = dgenrm_inf(m, 1, y0, incY0, 1);
double nrmX = dgenrm_inf(n, 1, x, incX, 1);
double nrmA = dgenrm_inf(m, n, A, incRowA, incColA);
size_t maxMN = m<n ? n : m;
daxpy(m, -1, yRef, incYRef, ySol, incYSol);
double nrmDiff = dgenrm_inf(m, 1, ySol, incYSol, 1);
return nrmDiff /
(DBL_EPSILON*(maxMN*fabs(alpha)*nrmA*nrmX + m*fabs(beta)*nrmY0));
}
#ifndef COLMAJOR
#define COLMAJOR 0
#endif
#ifndef SEED_RAND
#define SEED_RAND 0
#endif
#ifndef MAX_M
#define MAX_M 4500
#endif
#ifndef MAX_N
#define MAX_N 4500
#endif
#ifndef ALPHA
#define ALPHA 1
#endif
#ifndef BETA
#define BETA 1
#endif
int
main()
{
srand(SEED_RAND);
printf("#COLMAJOR = %d\n", COLMAJOR);
printf("#ALPHA = %lf\n", (double)ALPHA);
printf("#BETA = %lf\n", (double)BETA);
double *A = malloc(MAX_M*MAX_N*sizeof(double));
double *x = malloc(MAX_N*sizeof(double));
double *y0 = malloc(MAX_M*sizeof(double));
double *yRef = malloc(MAX_M*sizeof(double));
double *ySol = malloc(MAX_M*sizeof(double));
if (!A || !x || !y0 || !yRef || !ySol) {
abort();
}
printf("#%4s %4s ", "m", "n");
printf("%10s %10s ", "time ref", "mflops ref");
printf("%10s %10s %7s ", "time 1", "mflops 1", "err");
printf("%10s %10s %7s ", "time 2", "mflops 2", "err");
printf("%10s %10s %7s ", "time 3", "mflops 3", "err");
printf("%10s %10s %7s ", "time 4", "mflops 4", "err");
printf("\n");
for (size_t m=16, n=16; m<=MAX_M && n<=MAX_N; m+=16, n+=16) {
ptrdiff_t incRowA = COLMAJOR ? 1 : n;
ptrdiff_t incColA = COLMAJOR ? m : 1;
ptrdiff_t incX = 1;
ptrdiff_t incY0 = 1;
ptrdiff_t incYRef = 1;
ptrdiff_t incYSol = 1;
double alpha = ALPHA;
double beta = BETA;
double mflop = m*(2*n+1)/1000./1000.;
randGeMatrix(m, n, ALPHA==0, A, incRowA, incColA);
randGeMatrix(n, 1, ALPHA==0, x, incX, 0);
randGeMatrix(m, 1, BETA==0, y0, incY0, 0);
printf(" %4zu %4zu ", m, n);
{
double t = 0;
size_t runs = 0;
while (t<0.1 || runs<3) {
dcopy(m, y0, incY0, yRef, incYRef);
double t0 = walltime();
dgemv_ref(m, n,
alpha,
A, incRowA, incColA,
x, incX,
beta,
yRef, incYRef);
t += walltime() - t0;
++runs;
}
t /= runs;
printf("%10.2lf %10.2lf ", t, mflop/t);
}
{
double t = 0;
size_t runs = 0;
while (t<0.1 || runs<3) {
dcopy(m, y0, incY0, ySol, incYSol);
double t0 = walltime();
dgemv_dot(m, n,
alpha,
A, incRowA, incColA,
x, incX,
beta,
ySol, incYSol);
t += walltime() - t0;
++runs;
}
t /= runs;
double err = dgemv_err(m, n, alpha,
A, incRowA, incColA,
x, incX,
y0, incY0,
beta,
yRef, incYRef,
ySol, incYSol);
printf("%10.2lf %10.2lf %7.1e ", t, mflop/t, err);
}
{
double t = 0;
size_t runs = 0;
while (t<0.1 || runs<3) {
dcopy(m, y0, incY0, ySol, incYSol);
double t0 = walltime();
dgemv_axpy(m, n,
alpha,
A, incRowA, incColA,
x, incX,
beta,
ySol, incYSol);
t += walltime() - t0;
++runs;
}
t /= runs;
double err = dgemv_err(m, n, alpha,
A, incRowA, incColA,
x, incX,
y0, incY0,
beta,
yRef, incYRef,
ySol, incYSol);
printf("%10.2lf %10.2lf %7.1e ", t, mflop/t, err);
}
{
double t = 0;
size_t runs = 0;
while (t<0.1 || runs<3) {
dcopy(m, y0, incY0, ySol, incYSol);
double t0 = walltime();
dgemv_dotf(m, n,
alpha,
A, incRowA, incColA,
x, incX,
beta,
ySol, incYSol);
t += walltime() - t0;
++runs;
}
t /= runs;
double err = dgemv_err(m, n, alpha,
A, incRowA, incColA,
x, incX,
y0, incY0,
beta,
yRef, incYRef,
ySol, incYSol);
printf("%10.2lf %10.2lf %7.1e ", t, mflop/t, err);
}
{
double t = 0;
size_t runs = 0;
while (t<0.1 || runs<3) {
dcopy(m, y0, incY0, ySol, incYSol);
double t0 = walltime();
dgemv_axpyf(m, n,
alpha,
A, incRowA, incColA,
x, incX,
beta,
ySol, incYSol);
t += walltime() - t0;
++runs;
}
t /= runs;
double err = dgemv_err(m, n, alpha,
A, incRowA, incColA,
x, incX,
y0, incY0,
beta,
yRef, incYRef,
ySol, incYSol);
printf("%10.2lf %10.2lf %7.1e ", t, mflop/t, err);
}
printf("\n");
}
free(A);
free(x);
free(y0);
free(yRef);
free(ySol);
}