#include #include #include #include #include #include //-- setup and print matrices -------------------------------------------------- double walltime() { struct tms ts; static double ClockTick=0.0; if (ClockTick==0.0) { ClockTick = 1.0 / ((double) sysconf(_SC_CLK_TCK)); } return ((double) times(&ts)) * ClockTick; } void initGeMatrix(int m, int n, double *A, int incRowA, int incColA) { int i, j; for (i=0; iresult) { result = sum; } } return result; } void dgecopy(int m, int n, const double *X, int incRowX, int incColX, double *Y, int incRowY, int incColY) { int i, j; for (i=0; i(Y) ? (X) : (Y)) double err_dgemm(int m, int n, int k, double alpha, const double *A, int incRowA, int incColA, const double *B, int incRowB, int incColB, double beta, const double *C0, int incRowC0, int incColC0, double *C, int incRowC, int incColC) { double normA = dgenrm1(m, k, A, incRowA, incColA); double normB = dgenrm1(k, n, B, incRowB, incColB); double normC = dgenrm1(m, n, C, incRowC0, incColC0); double normD; int mn = (m>n) ? m : n; int mnk = (mn>k) ? mn : k; normA = MAX(normA, fabs(alpha)*normA); normC = MAX(normC, fabs(beta)*normC); dgeaxpy(m, n, -1.0, C0, incRowC0, incColC0, C, incRowC, incColC); normD = dgenrm1(m, n, C, incRowC, incColC); return normD/(mnk*normA*normB*normC); } //------------------------------------------------------------------------------ //-- new with alignment -------------------------------------------------------- void * malloc_(size_t alignment, size_t size) { size += alignment; void *ptr = malloc(size); void *ptr2 = (void *)(((size_t)ptr + alignment) & ~(alignment-1)); void **vp = (void**) ptr2 - 1; *vp = ptr; return ptr2; } void free_(void *ptr) { free(*((void**)ptr-1)); } //------------------------------------------------------------------------------ #define DGEMM_MC 256 #define DGEMM_NC 512 #define DGEMM_KC 256 #define DGEMM_MR 4 #define DGEMM_NR 8 //------------------------------------------------------------------------------ void dpack_A(int m, int k, const double *A, int incRowA, int incColA, double *p) { int i, i0, j, l, nu; int mp = (m+DGEMM_MR-1) / DGEMM_MR; for (j=0; j