#include #include #include #include #include //-- timer for benchmarks ------------------------------------------------------ double walltime() { struct tms ts; static double ClockTick=0.0; if (ClockTick==0.0) { ClockTick = 1.0 / ((double) sysconf(_SC_CLK_TCK)); } return ((double) times(&ts)) * ClockTick; } //-- setup and print matrices -------------------------------------------------- void initGeMatrix(int m, int n, double *A, int incRowA, int incColA) { int i, j; for (i=0; i=j)) || (!lower && (i<=j))) { printf("%10.4lf ", A[i*incRowA+j*incColA]); } else { printf("%10.4lf ", 0.0); } } printf("\n"); } printf("\n\n"); } //-- some BLAS Level 1 procedures and functions -------------------------------- double ddot(int n, const double *x, int incX, const double *y, int incY) { int i; double alpha = 0; for (i=0; iresult) { result = fabs(x[i*incX]); } } return result; } double dgenrm1(int m, int n, const double *A, int incRowA, int incColA) { int i, j; double result = 0; for (j=0; jresult) { result = sum; } } return result; } double dtrnrm1(int m, int n, int unitDiag, int lower, const double *A, int incRowA, int incColA) { int i, j; double result = 0; for (j=0; j=j)) || (!lower && (i<=j))) { sum += fabs(A[i*incRowA+j*incColA]); } } if (sum>result) { result = sum; } } return result; } //-- error bounds -------------------------------------------------------------- double err_dgemv(int m, int n, double alpha, const double *A, int incRowA, int incColA, const double *x, int incX, double beta, const double *y0, int incY0, double *y1, int incY1) { int max_mn = (m>n) ? m : n; double normA = fabs(alpha)*dgenrm1(m, n, A, incRowA, incColA); double normX = damax(n, x, incX); double normY0 = fabs(beta)*damax(m, y0, incY0); double normD; daxpy(m, -1.0, y0, incY0, y1, incY1); normD = damax(n, y1, incY1); return normD/(normA*normX*normY0*max_mn); } double err_dtrmv(int n, int unitDiag, int lower, const double *A, int incRowA, int incColA, const double *x0, int incX0, double *x1, int incX1) { double normA = dtrnrm1(n, n, unitDiag, lower, A, incRowA, incColA); double normX0 = damax(n, x0, incX0); double normD; daxpy(n, -1.0, x0, incX0, x1, incX1); normD = damax(n, x1, incX1); return normD/(n*normA*normX0); } double err_dtrsv(int n, int unitDiag, int lower, const double *A, int incRowA, int incColA, const double *x0, int incX0, double *x1, int incX1) { double normA = dtrnrm1(n, n, unitDiag, lower, A, incRowA, incColA); double normX0 = damax(n, x0, incX0); double normD; daxpy(n, -1.0, x0, incX0, x1, incX1); normD = damax(n, x1, incX1); return normD/(n*normA*normX0); } double err_dger(int m, int n, double alpha, const double *x, int incX, const double *y, int incY, const double *A0, int incRowA0, int incColA0, double *A, int incRowA, int incColA) { double normA0 = dgenrm1(m, n, A0, incRowA0, incColA0); double normX = fabs(alpha)*damax(m, x, incX); double normY = damax(n, y, incY); double normD; int mn = (m>n) ? m : n; dgeaxpy(m, n, -1.0, A0, incRowA0, incColA0, A, incRowA, incColA); normD = dgenrm1(m, n, A, incRowA, incColA); return normD/(mn*normA0*normX*normY); } //-- Fused BLAS Level 1 procedures and functions ------------------------------- #ifndef DGEMV_MxBS #define DGEMV_MxBS 4 #endif void dgemv_mxBS(int m, double alpha, const double *A, int incRowA, int incColA, const double *x, int incX, double *y, int incY) { int i, j; if (alpha==0) { return; } for (i=0; i=0; --j) { for (i=j+1; i=0; --k) { daxpy(n-k-1, x[k*incX], &A[(k+1)*incRowA+k*incColA], incRowA, &x[(k+1)*incX], incX); } } else { for (k=n-1; k>=0; --k) { daxpy(n-k-1, x[k*incX], &A[(k+1)*incRowA+k*incColA], incRowA, &x[(k+1)*incX], incX); x[k*incX] *= A[k*(incRowA+incColA)]; } } } void dtrlmv_row(int n, int unitDiag, const double *A, int incRowA, int incColA, double *x, int incX) { int k; if (unitDiag) { for (k=n-1; k>=0; --k) { x[k*incX] += ddot(k, &A[k*incRowA], incColA, x, incX); } } else { for (k=n-1; k>=0; --k) { x[k*incX] = ddot(k+1, &A[k*incRowA], incColA, x, incX); } } } void dtrlmv_ulm(int n, int unitDiag, const double *A, int incRowA, int incColA, double *x, int incX) { if (incRowA=0; k-=bs) { dgemv_mxBS(n-k-bs, 1.0, &A[(k+bs)*incRowA+k*incColA], incRowA, incColA, &x[k*incX], incX, &x[(k+bs)*incX], incX); dtrlmv_col(bs, unitDiag, &A[k*(incRowA+incColA)], incRowA, incColA, &x[k*incX], incX); } } else { double buffer[DGEMV_BSxN]; int bs = DGEMV_BSxN; int k = (n/bs)*bs; dgemv_ulm(n-k, k, 1.0, &A[k*incRowA], incRowA, incColA, x, incX, 0.0, buffer,1); dtrlmv_row(n-k, unitDiag, &A[k*(incRowA+incColA)], incRowA, incColA, &x[k*incX], incX); daxpy(n-k, 1.0, buffer, 1, &x[k*incX], incX); for (k-=bs; k>=0; k-=bs) { dgemv_BSxn(k, 1.0, &A[k*incRowA], incRowA, incColA, x, incX, 0.0, buffer, 1); dtrlmv_col(bs, unitDiag, &A[k*(incRowA+incColA)], incRowA, incColA, &x[k*incX], incX); daxpy(bs, 1.0, buffer, 1, &x[k*incX], incX); } } } //-- TRSV implementations ------------------------------------------------------ void dtrlsv_ref(int n, int unitDiag, const double *A, int incRowA, int incColA, double *x, int incX) { int i, j; for (i=0; i // pre declaration so we don't need to include the complete mkl_blas header. void dgemv(const char *trans, const MKL_INT *m, const MKL_INT *n, const double *alpha, const double *A, const MKL_INT *ldA, const double *x, const MKL_INT *incX, const double *beta, double *y, const MKL_INT *incY); void dgemv_mkl(MKL_INT m, MKL_INT n, double alpha, const double *A, MKL_INT incRowA, MKL_INT incColA, const double *x, MKL_INT incX, double beta, double *y, MKL_INT incY) { MKL_INT ldA = (incRowA==1) ? incColA : incRowA; char trans = (incRowA==1) ? 'N' : 'T'; MKL_INT M = (incRowA==1) ? m : n; MKL_INT N = (incRowA==1) ? n : m; dgemv(&trans, &M, &N, &alpha, A, &ldA, x, &incX, &beta, y, &incY); } void dtrmv(const char *uplo, const char *transa, const char *diag, const MKL_INT *n, const double *a, const MKL_INT *lda, double *b, const MKL_INT *incx); void dtrmv_mkl(int lower, int unitDiag, MKL_INT n, const double *A, int incRowA, int incColA, double *x, int incX) { MKL_INT ldA = (incRowA==1) ? incColA : incRowA; dtrmv(&N, A, &ldA, x, &incX); } #endif // USE_MKL //------------------------------------------------------------------------------ #ifndef MIN_N #define MIN_N 100 #endif #ifndef MAX_N #define MAX_N 8000 #endif #ifndef INC_N #define INC_N 100 #endif #ifndef MIN_M #define MIN_M 100 #endif #ifndef MAX_M #define MAX_M 8000 #endif #ifndef INC_M #define INC_M 100 #endif #ifndef ROWMAJOR #define ROWMAJOR 0 #endif #ifndef UNITDIAG #define UNITDIAG 0 #endif #if (ROWMAJOR==1) # define INCROW_A MAX_N # define INCCOL_A 1 #else # define INCROW_A 1 # define INCCOL_A MAX_M #endif #ifndef INC_X #define INC_X 1 #endif #ifndef INC_Y #define INC_Y 1 #endif #ifndef ALPHA #define ALPHA 1 #endif #define MIN(X,Y) ((X)<(Y) ? (X) : (Y)) double A_[MAX_N*MAX_N]; double A_0[MAX_N*MAX_N*MIN(INCROW_A,INCCOL_A)]; double A_1[MAX_N*MAX_N*MIN(INCROW_A,INCCOL_A)]; double x[MAX_M*INC_X]; double y[MAX_N*INC_Y]; int main() { int m, n; randGeMatrix(MAX_N, MAX_N, A_, 1, MAX_M); randGeMatrix(MAX_M, 1, x, INC_X, 0); randGeMatrix(MAX_N, 1, y, INC_Y, 0); printf("#UNITDIAG=%3d\n", UNITDIAG); printf("#%9s %9s %9s %9s", "m", "n", "INCROW_A", "INCCOL_A"); printf(" %9s %9s %9s", "INC_X", "INC_Y", "ALPHA"); printf(" %12s %12s", "t", "MFLOPS"); printf(" %12s %12s %12s", "t", "MFLOPS", "err"); printf(" %12s %12s %12s", "t", "MFLOPS", "err"); printf("\n"); for (m=MIN_M, n=MIN_N; n<=MAX_N && m<=MAX_M; m+=INC_M, n+=INC_N) { int runs = 1; int ops = n*(n+1)/2 + (n-1)*n/2; double t0, t1, dt, err; printf(" %9d %9d %9d %9d", m, n, INCROW_A, INCCOL_A); printf(" %9d %9d %9lf", INC_X, INC_Y, (double)ALPHA); t0 = 0; runs = 0; do { dgecopy(m, n, A_, 1, MAX_M, A_0, INCROW_A, INCCOL_A); dt = walltime(); dger_ref(m, n, ALPHA, x, INC_X, y, INC_Y, A_0, INCROW_A, INCCOL_A); dt = walltime() - dt; t0 += dt; ++runs; } while (t0<0.3); t0 /= runs; printf(" %12.2e %12.2lf", t0, ops/(1000*1000*t0)); t1 = 0; runs = 0; do { dgecopy(m, n, A_, 1, MAX_M, A_1, INCROW_A, INCCOL_A); dt = walltime(); dger_row(m, n, ALPHA, x, INC_X, y, INC_Y, A_1, INCROW_A, INCCOL_A); dt = walltime() - dt; t1 += dt; ++runs; } while (t1<0.3); t1 /= runs; err = err_dger(m, n, ALPHA, x, INC_X, y, INC_Y, A_0, INCROW_A, INCCOL_A, A_1, INCROW_A, INCCOL_A); printf(" %12.2e %12.2lf %12.2e", t1, ops/(1000*1000*t1), err); t1 = 0; runs = 0; do { dgecopy(m, n, A_, 1, MAX_M, A_1, INCROW_A, INCCOL_A); dt = walltime(); dger_col(m, n, ALPHA, x, INC_X, y, INC_Y, A_1, INCROW_A, INCCOL_A); dt = walltime() - dt; t1 += dt; ++runs; } while (t1<0.3); t1 /= runs; err = err_dger(m, n, ALPHA, x, INC_X, y, INC_Y, A_0, INCROW_A, INCCOL_A, A_1, INCROW_A, INCCOL_A); printf(" %12.2e %12.2lf %12.2e", t1, ops/(1000*1000*t1), err); printf("\n"); } return 0; }