Content |
Ungeblockte LU-Zerlegung ohne Pivotisierung
Für eine diagonaldominante \(m \times n\) Matrix \(A\) soll die LU-Zerlegung berechnet werden. Wir betrachten dabei wieder die Partitionierung
\[A =\left(\begin{array}{c|c|c} A_{0,0} & a_{0,j} & A_{0,j+1} \\ \hline a_{j,0}^T & a_{j,j} & a_{j,j+1}^T \\ \hline A_{j+1,0} & a_{j+1,j} & A_{j+1,j+1} \\ \end{array}\right)\]Die Herleitung der Varianten erfolgt wie in der Volesung gezeigt wurde.
LU-Variante 1
-
For \(j=0, \dots, \min\{m,n\}-1\):
-
If j<n
-
\(a_{0,j} \leftarrow L_1(A_{0,0})^{-1} a_{0,j}\)
-
-
If j<m
-
\(a_{j,0} \leftarrow U(A_{0,0})^{-T} a_{j,0}\)
-
-
If j<m and j<n
-
\(a_{j,j} \leftarrow a_{j,j} - a_{j,0}^T a_{0,j}\)
-
-
LU-Variante 2
-
For \(j=0, \dots, \min\{m,n\}-1\):
-
If j<m
-
\(a_{j,0} \leftarrow U(A_{0,0})^{-T} a_{j,0}\)
-
-
If j<n
-
\(a_{j,j} \leftarrow a_{j,j} - a_{0,j}^T a_{j,0}\)
-
\(a_{j,j+1} \leftarrow a_{j,j+1} - A_{0,j+1}^T a_{j,0}\)
-
-
LU-Variante 3
-
For \(j=0, \dots, \min\{m,n\}-1\):
-
If j<n
-
\(a_{0,j} \leftarrow L_1(A_{0,0})^{-1} a_{0,j}\)
-
-
If j<m
-
\(a_{j,j} \leftarrow a_{j,j} - a_{j,0}^T a_{0,j}\)
-
-
If j+1<m
-
\(a_{j+1,j} \leftarrow \frac{1}{a_{j,j}}\left(a_{j+1,j} - A_{j+1,0} a_{0,j}\right)\)
-
-
LU-Variante 4
-
For \(j=0, \dots, \min\{m,n\}-1\):
-
\(a_{j,j} \leftarrow a_{j,j} - a_{0,j}^T a_{j,0}\)
-
\(a_{j,j+1} \leftarrow a_{j,j+1} - A_{0,j+1}^{T} a_{j,0}\)
-
If j+1<m
-
\(a_{j+1,j} \leftarrow \frac{1}{a_{j,j}}\left(a_{j+1,j} - A_{j+1,0}^T a_{0,j}\right)\)
-
-
Hier nützt man natürlich aus, dass dann
\[\left(\begin{array}{c} a_{j,j} \\ \hline a_{j,j+1} \end{array}\right)=\left(\begin{array}{c} a_{j,j} \\ \hline a_{j,j+1} \end{array}\right)-\left(\begin{array}{c} a_{0,j} \\ \hline A_{0,j+1} \end{array}\right)^Ta_{j,0}\]mit einer GEMV-Operation berechnte werden kann.
LU-Variante 5
-
For \(j=0, \dots, \min\{m,n\}-1\):
-
\(a_{j+1,j} \leftarrow \frac{1}{a_{j,j}} a_{j+1,j}\)
-
\(A_{j+1,j+1} \leftarrow A_{j+1,j+1} - a_{j+1,j} a_{j,j+1}^T\)
-
Aufgabe
-
Ergänzt die Vorlage
-
Verwendet verschiedene Optimierungen
-
Testet die Implementierung für zeilen- und splatenweise gespeicherte Matrizen
-
Testet die Implementierung für nicht-quadratische Matrizen
#include <stdio.h> #include <math.h> #include <stdlib.h> #include <sys/times.h> #include <unistd.h> //-- timer for benchmarks ------------------------------------------------------ double walltime() { struct tms ts; static double ClockTick=0.0; if (ClockTick==0.0) { ClockTick = 1.0 / ((double) sysconf(_SC_CLK_TCK)); } return ((double) times(&ts)) * ClockTick; } //-- setup and print matrices -------------------------------------------------- void initGeMatrix(int m, int n, double *A, int incRowA, int incColA) { int i, j; for (i=0; i<m; ++i) { for (j=0; j<n; ++j) { A[i*incRowA+j*incColA] = i*n + j + 1; } } } void randGeMatrix(int m, int n, double *A, int incRowA, int incColA) { int i, j; for (j=0; j<n; ++j) { for (i=0; i<m; ++i) { A[i*incRowA+j*incColA] = ((double)rand()-RAND_MAX/2)*200/RAND_MAX; } } } void makeTrlDiagDom(int n, int unitDiag, double *A, int incRowA, int incColA) { int i, j; for (j=0; j<n; ++j) { double asum = 0; double A_jj = (unitDiag) ? 1 : A[j*(incRowA+incColA)]; for (i=j+1; i<n; ++i) { asum += fabs(A[i*incRowA+j*incColA]); } for (i=j+1; i<n; ++i) { A[i*incRowA+j*incColA] *= A_jj/(asum*1.1); } } } void makeGeDiagDom(int n, int m, double *A, int incRowA, int incColA) { int i, j; for (j=0; j<n; ++j) { double asum = 0; for (i=0; i<m; ++i) { if (i!=j) { asum += fabs(A[i*incRowA+j*incColA]); } } for (i=0; i<m; ++i) { if (i!=j) { A[i*incRowA+j*incColA] *= A[j*(incRowA+incColA)]/(asum*1.1); } } } } void printGeMatrix(int m, int n, const double *A, int incRowA, int incColA) { int i, j; for (i=0; i<m; ++i) { for (j=0; j<n; ++j) { printf("%10.4lf ", A[i*incRowA+j*incColA]); } printf("\n"); } printf("\n\n"); } void printTrMatrix(int m, int n, int unitDiag, int lower, const double *A, int incRowA, int incColA) { int i, j; for (i=0; i<m; ++i) { for (j=0; j<n; ++j) { if (unitDiag && (i==j)) { printf("%10.4lf ", 1.0); } else if ((lower && (i>=j)) || (!lower && (i<=j))) { printf("%10.4lf ", A[i*incRowA+j*incColA]); } else { printf("%10.4lf ", 0.0); } } printf("\n"); } printf("\n\n"); } //-- some BLAS Level 1 procedures and functions -------------------------------- double ddot(int n, const double *x, int incX, const double *y, int incY) { int i; double alpha = 0; for (i=0; i<n; ++i) { alpha += x[i*incX]*y[i*incY]; } return alpha; } void daxpy(int n, double alpha, const double *x, int incX, double *y, int incY) { int i; if (alpha==0) { return; } for (i=0; i<n; ++i) { y[i*incY] += alpha*x[i*incX]; } } void dgeaxpy(int m, int n, double alpha, const double *X, int incRowX, int incColX, double *Y, int incRowY, int incColY) { int i, j; if (alpha==0 || m==0 || n==0) { return; } for (j=0; j<n; ++j) { for (i=0; i<m; ++i) { Y[i*incRowY+j*incColY] += alpha*X[i*incRowX+j*incColX]; } } } void dscal(int n, double alpha, double *x, int incX) { int i; if (alpha==1) { return; } for (i=0; i<n; ++i) { x[i*incX] *= alpha; } } void dcopy(int n, const double *x, int incX, double *y, int incY) { int i; for (i=0; i<n; ++i) { y[i*incY] = x[i*incX]; } } void dgecopy(int m, int n, const double *X, int incRowX, int incColX, double *Y, int incRowY, int incColY) { int i, j; for (i=0; i<m; ++i) { for (j=0; j<n; ++j) { Y[i*incRowY+j*incColY] = X[i*incRowX+j*incColX]; } } } void dswap(int n, double *x, int incX, double *y, int incY) { int i; for (i=0; i<n; ++i) { double tmp; tmp = x[i*incX]; x[i*incX] = y[i*incY]; y[i*incY] = tmp; } } double damax(int n, const double *x, int incX) { int i; double result = 0; for (i=0; i<n; ++i) { if (fabs(x[i*incX])>result) { result = fabs(x[i*incX]); } } return result; } double dgenrm1(int m, int n, const double *A, int incRowA, int incColA) { int i, j; double result = 0; for (j=0; j<n; ++j) { double sum = 0; for (i=0; i<m; ++i) { sum += fabs(A[i*incRowA+j*incColA]); } if (sum>result) { result = sum; } } return result; } double dtrnrm1(int m, int n, int unitDiag, int lower, const double *A, int incRowA, int incColA) { int i, j; double result = 0; for (j=0; j<n; ++j) { double sum = 0; for (i=0; i<m; ++i) { if (unitDiag && (i==j)) { sum += 1.0; } else if ((lower && (i>=j)) || (!lower && (i<=j))) { sum += fabs(A[i*incRowA+j*incColA]); } } if (sum>result) { result = sum; } } return result; } //-- error bounds -------------------------------------------------------------- double err_dgemv(int m, int n, double alpha, const double *A, int incRowA, int incColA, const double *x, int incX, double beta, const double *y0, int incY0, double *y1, int incY1) { int max_mn = (m>n) ? m : n; double normA = fabs(alpha)*dgenrm1(m, n, A, incRowA, incColA); double normX = damax(n, x, incX); double normY0 = fabs(beta)*damax(m, y0, incY0); double normD; daxpy(m, -1.0, y0, incY0, y1, incY1); normD = damax(n, y1, incY1); return normD/(normA*normX*normY0*max_mn); } double err_dtrmv(int n, int unitDiag, int lower, const double *A, int incRowA, int incColA, const double *x0, int incX0, double *x1, int incX1) { double normA = dtrnrm1(n, n, unitDiag, lower, A, incRowA, incColA); double normX0 = damax(n, x0, incX0); double normD; daxpy(n, -1.0, x0, incX0, x1, incX1); normD = damax(n, x1, incX1); return normD/(n*normA*normX0); } double err_dtrsv(int n, int unitDiag, int lower, const double *A, int incRowA, int incColA, const double *x0, int incX0, double *x1, int incX1) { double normA = dtrnrm1(n, n, unitDiag, lower, A, incRowA, incColA); double normX0 = damax(n, x0, incX0); double normD; daxpy(n, -1.0, x0, incX0, x1, incX1); normD = damax(n, x1, incX1); return normD/(n*normA*normX0); } double err_dger(int m, int n, double alpha, const double *x, int incX, const double *y, int incY, const double *A0, int incRowA0, int incColA0, double *A, int incRowA, int incColA) { double normA0 = dgenrm1(m, n, A0, incRowA0, incColA0); double normX = fabs(alpha)*damax(m, x, incX); double normY = damax(n, y, incY); double normD; int mn = (m>n) ? m : n; dgeaxpy(m, n, -1.0, A0, incRowA0, incColA0, A, incRowA, incColA); normD = dgenrm1(m, n, A, incRowA, incColA); return normD/(mn*normA0*normX*normY); } void dgemm(int m, int n, int k, double alpha, const double *A, int incRowA, int incColA, const double *B, int incRowB, int incColB, double beta, double *C, int incRowC, int incColC) { int i, j, l; if (beta!=1) { for (i=0; i<m; ++i) { for (j=0; j<n; ++j) { C[i*incRowC+j*incColC] *= beta; } } } if (alpha!=0) { for (i=0; i<m; ++i) { for (j=0; j<n; ++j) { for (l=0; l<k; ++l) { C[i*incRowC+j*incColC] += alpha*A[i*incRowA+l*incColA] *B[l*incRowB+j*incColB]; } } } } } double err_lu(int m, int n, const double *A, int incRowA, int incColA, double *LU, int incRowLU, int incColLU) { double aNrm1 = 0; double err = 0; int i,j; int k = (m<n) ? m : n; double *L = (double *)malloc(m*k*sizeof(double)); double *U = (double *)malloc(k*n*sizeof(double)); for (j=0; j<k; ++j) { for (i=0; i<m; ++i) { L[i+j*m] = (i>j) ? LU[i*incRowLU+j*incColLU] : (i==j) ? 1 : 0; } } for (j=0; j<n; ++j) { for (i=0; i<k; ++i) { U[i+j*k] = (i<=j) ? LU[i*incRowLU+j*incColLU] : 0; } } for (j=0; j<n; ++j) { for (i=0; i<m; ++i) { aNrm1 += fabs(A[i*incRowA+j*incColA]); } } dgemm(m, n, k, 1.0, L, 1, m, U, 1, k, 0.0, LU, incRowLU, incColLU); for (j=0; j<n; ++j) { for (i=0; i<m; ++i) { err += fabs(A[i*incRowA+j*incColA]-LU[i*incRowLU+j*incColLU]); } } err /= (aNrm1*k); free(L); free(U); return err; } //-- Fused BLAS Level 1 procedures and functions ------------------------------- #ifndef DGEMV_MxBS #define DGEMV_MxBS 4 #endif void dgemv_mxBS(int m, double alpha, const double *A, int incRowA, int incColA, const double *x, int incX, double *y, int incY) { int i, j; if (alpha==0) { return; } for (i=0; i<m; ++i) { double dot = 0; for (j=0; j<DGEMV_MxBS; ++j) { dot += A[i*incRowA+j*incColA]*x[j*incX]; } y[i*incY] += alpha*dot; } } #ifndef DGEMV_BSxN #define DGEMV_BSxN 4 #endif void dgemv_BSxn(int n, double alpha, const double *A, int incRowA, int incColA, const double *x, int incX, double beta, double *y, int incY) { int i, j; if (alpha==0) { return; } if (beta!=1) { for (i=0; i<DGEMV_BSxN; ++i) { y[i*incY] *= beta; } } for (j=0; j<n; ++j) { for (i=0; i<DGEMV_BSxN; ++i) { y[i*incY] += alpha*A[i*incRowA+j*incColA]*x[j*incX]; } } } //-- GEMV implementations ------------------------------------------------------ void dgemv_ref(int m, int n, double alpha, const double *A, int incRowA, int incColA, const double *x, int incX, double beta, double *y, int incY) { int i, j; for (i=0; i<m; ++i) { if (beta==0) { y[i*incY] = 0; } else { y[i*incY] *= beta; } for (j=0; j<n; ++j) { y[i*incY] += alpha*A[i*incRowA+j*incColA]*x[j*incX]; } } } void dgemv_unblk(int m, int n, double alpha, const double *A, int incRowA, int incColA, const double *x, int incX, double beta, double *y, int incY) { if (incRowA<incColA) { int j; dscal(m, beta, y, incY); for (j=0; j<n; ++j) { daxpy(m, alpha*x[j*incX], &A[j*incColA], incRowA, y, incY); } } else { int i; for (i=0; i<m; ++i) { y[i*incY] *= beta; y[i*incY] += alpha*ddot(n, &A[i*incRowA], incColA, x, incX); } } } void dgemv(int m, int n, double alpha, const double *A, int incRowA, int incColA, const double *x, int incX, double beta, double *y, int incY) { if (incRowA<incColA) { int bs = DGEMV_MxBS; int j; dscal(m, beta, y, incY); for (j=0; j+bs<=n; j+=bs) { dgemv_mxBS(m, alpha, &A[j*incColA], incRowA, incColA, &x[j*incX], incX, y, incY); } dgemv_unblk(m, n-j, alpha, &A[j*incColA], incRowA, incColA, &x[j*incX], incX, 1.0, y, incY); } else { int bs = DGEMV_BSxN; int i; for (i=0; i+bs<=m; i+=bs) { dgemv_BSxn(n, alpha, &A[i*incRowA], incRowA, incColA, x, incX, beta, &y[i*incY], incY); } dgemv_unblk(m-i, n, alpha, &A[i*incRowA], incRowA, incColA, x, incX, beta, &y[i*incY], incY); } } //-- TRMV implementations ------------------------------------------------------ void dtrlmv_ref(int n, int unitDiag, const double *A, int incRowA, int incColA, double *x, int incX) { int i, j; for (j=n-1; j>=0; --j) { for (i=j+1; i<n; ++i) { x[i*incX] += A[i*incRowA+j*incColA]*x[j*incX]; } if (!unitDiag) { x[j*incX] *= A[j*(incRowA+incColA)]; } } } void dtrlmv_col(int n, int unitDiag, const double *A, int incRowA, int incColA, double *x, int incX) { int k; if (unitDiag) { for (k=n-1; k>=0; --k) { daxpy(n-k-1, x[k*incX], &A[(k+1)*incRowA+k*incColA], incRowA, &x[(k+1)*incX], incX); } } else { for (k=n-1; k>=0; --k) { daxpy(n-k-1, x[k*incX], &A[(k+1)*incRowA+k*incColA], incRowA, &x[(k+1)*incX], incX); x[k*incX] *= A[k*(incRowA+incColA)]; } } } void dtrlmv_row(int n, int unitDiag, const double *A, int incRowA, int incColA, double *x, int incX) { int k; if (unitDiag) { for (k=n-1; k>=0; --k) { x[k*incX] += ddot(k, &A[k*incRowA], incColA, x, incX); } } else { for (k=n-1; k>=0; --k) { x[k*incX] = ddot(k+1, &A[k*incRowA], incColA, x, incX); } } } void dtrlmv(int n, int unitDiag, const double *A, int incRowA, int incColA, double *x, int incX) { if (incRowA<incColA) { int bs = DGEMV_MxBS; int k = (n/bs)*bs; dtrlmv_col(n-k, unitDiag, &A[k*(incRowA+incColA)], incRowA, incColA, &x[k*incX], incX); for (k-=bs; k>=0; k-=bs) { dgemv_mxBS(n-k-bs, 1.0, &A[(k+bs)*incRowA+k*incColA], incRowA, incColA, &x[k*incX], incX, &x[(k+bs)*incX], incX); dtrlmv_col(bs, unitDiag, &A[k*(incRowA+incColA)], incRowA, incColA, &x[k*incX], incX); } } else { double buffer[DGEMV_BSxN]; int bs = DGEMV_BSxN; int k = (n/bs)*bs; dgemv(n-k, k, 1.0, &A[k*incRowA], incRowA, incColA, x, incX, 0.0, buffer,1); dtrlmv_row(n-k, unitDiag, &A[k*(incRowA+incColA)], incRowA, incColA, &x[k*incX], incX); daxpy(n-k, 1.0, buffer, 1, &x[k*incX], incX); for (k-=bs; k>=0; k-=bs) { dgemv_BSxn(k, 1.0, &A[k*incRowA], incRowA, incColA, x, incX, 0.0, buffer, 1); dtrlmv_col(bs, unitDiag, &A[k*(incRowA+incColA)], incRowA, incColA, &x[k*incX], incX); daxpy(bs, 1.0, buffer, 1, &x[k*incX], incX); } } } //-- TRSV implementations ------------------------------------------------------ void dtrlsv_ref(int n, int unitDiag, const double *A, int incRowA, int incColA, double *x, int incX) { int i, j; for (i=0; i<n; ++i) { for (j=0; j<i; ++j) { x[i*incX] -= A[i*incRowA+j*incColA]*x[j*incX]; } x[i*incX] /= (unitDiag) ? 1.0 : A[i*(incRowA+incColA)]; } } void dtrlsv_row(int n, int unitDiag, const double *A, int incRowA, int incColA, double *x, int incX) { int k; for (k=0; k<n; ++k) { x[k*incX] -= ddot(k, &A[k*incRowA], incColA, x, incX); x[k*incX] /= (unitDiag) ? 1.0 : A[k*(incRowA+incColA)]; } } void dtrlsv_col(int n, int unitDiag, const double *A, int incRowA, int incColA, double *x, int incX) { int k; for (k=0; k<n; ++k) { x[k*incX] /= (unitDiag) ? 1.0 : A[k*(incRowA+incColA)]; daxpy(n-k-1, -x[k*incX], &A[(k+1)*incRowA+k*incColA], incRowA, &x[(k+1)*incX], incX); } } void dtrlsv(int n, int unitDiag, const double *A, int incRowA, int incColA, double *x, int incX) { if (incRowA<incColA) { // ... your code here ... } else { // ... your code here ... } } //-- GER implementations ------------------------------------------------------- void dger_ref(int m, int n, double alpha, const double *x, int incX, const double *y, int incY, double *A, int incRowA, int incColA) { int i, j; for (j=0; j<n; ++j) { for (i=0; i<m; ++i) { A[i*incRowA+j*incColA] += alpha*x[i*incX]*y[j*incY]; } } } void dger_row(int m, int n, double alpha, const double *x, int incX, const double *y, int incY, double *A, int incRowA, int incColA) { int i; for (i=0; i<m; ++i) { daxpy(n, alpha*x[i*incX], y, incY, &A[i*incRowA], incColA); } } void dger_col(int m, int n, double alpha, const double *x, int incX, const double *y, int incY, double *A, int incRowA, int incColA) { int j; for (j=0; j<n; ++j) { daxpy(m, alpha*y[j*incY], x, incX, &A[j*incColA], incRowA); } } void dger(int m, int n, double alpha, const double *x, int incX, const double *y, int incY, double *A, int incRowA, int incColA) { if (incRowA<incColA) { // ... your code here ... } else { // ... your code here ... } } //-- LU factorization ---------------------------------------------------------- void lu_unblk_var1(int m, int n, double *A, int incRowA, int incColA) { int min_mn = (m<n) ? m : n; int max_mn = (m>n) ? m : n; int j; // ... your code here ... } void lu_unblk_var2(int m, int n, double *A, int incRowA, int incColA) { int min_mn = (m<n) ? m : n; int j; // ... your code here ... } void lu_unblk_var3(int m, int n, double *A, int incRowA, int incColA) { int min_mn = (m<n) ? m : n; int j; // ... your code here ... } void lu_unblk_var4(int m, int n, double *A, int incRowA, int incColA) { int min_mn = (m<n) ? m : n; int j; // ... your code here ... } void lu_unblk_var5(int m, int n, double *A, int incRowA, int incColA) { int min_mn = (m<n) ? m : n; int j; // ... your code here ... } //------------------------------------------------------------------------------ #ifndef MIN_N #define MIN_N 100 #endif #ifndef MAX_N #define MAX_N 8000 #endif #ifndef INC_N #define INC_N 100 #endif #ifndef MIN_M #define MIN_M 100 #endif #ifndef MAX_M #define MAX_M 8000 #endif #ifndef INC_M #define INC_M 100 #endif #ifndef ROWMAJOR #define ROWMAJOR 0 #endif #if (ROWMAJOR==1) # define INCROW_A MAX_N # define INCCOL_A 1 #else # define INCROW_A 1 # define INCCOL_A MAX_M #endif #define MIN(X,Y) ((X)<(Y) ? (X) : (Y)) #define MAX(X,Y) ((X)>(Y) ? (X) : (Y)) double A_[MAX_N*MAX_N]; double A_0[MAX_N*MAX_N*MIN(INCROW_A,INCCOL_A)]; int main() { int m, n; randGeMatrix(MAX_N, MAX_N, A_, 1, MAX_M); printf("#%9s %9s %9s %9s", "m", "n", "INCROW_A", "INCCOL_A"); printf(" %12s %12s %12s", "t", "MFLOPS", "err"); printf(" %12s %12s %12s", "t", "MFLOPS", "err"); printf(" %12s %12s %12s", "t", "MFLOPS", "err"); printf(" %12s %12s %12s", "t", "MFLOPS", "err"); printf(" %12s %12s %12s", "t", "MFLOPS", "err"); printf("\n"); for (m=MIN_M, n=MIN_N; n<=MAX_N && m<=MAX_M; m+=INC_M, n+=INC_N) { double t, dt, err; int runs = 1; int minMN = MIN(m,n); int maxMN = MAX(m,n); double ops = minMN*minMN*maxMN -(minMN*minMN*minMN/3.) -(minMN*minMN/2.); printf(" %9d %9d %9d %9d", m, n, INCROW_A, INCCOL_A); t = 0; runs = 0; do { dgecopy(m, n, A_, 1, MAX_M, A_0, INCROW_A, INCCOL_A); dt = walltime(); lu_unblk_var1(m, n, A_0, INCROW_A, INCCOL_A); dt = walltime() - dt; t += dt; ++runs; } while (t<0.3); t /= runs; err = err_lu(m, n, A_, 1, MAX_M, A_0, INCROW_A, INCCOL_A); printf(" %12.2e %12.2lf %12.2e", t, ops/(1000*1000*t), err); t = 0; runs = 0; do { dgecopy(m, n, A_, 1, MAX_M, A_0, INCROW_A, INCCOL_A); dt = walltime(); lu_unblk_var2(m, n, A_0, INCROW_A, INCCOL_A); dt = walltime() - dt; t += dt; ++runs; } while (t<0.3); t /= runs; err = err_lu(m, n, A_, 1, MAX_M, A_0, INCROW_A, INCCOL_A); printf(" %12.2e %12.2lf %12.2e", t, ops/(1000*1000*t), err); t = 0; runs = 0; do { dgecopy(m, n, A_, 1, MAX_M, A_0, INCROW_A, INCCOL_A); dt = walltime(); lu_unblk_var3(m, n, A_0, INCROW_A, INCCOL_A); dt = walltime() - dt; t += dt; ++runs; } while (t<0.3); t /= runs; err = err_lu(m, n, A_, 1, MAX_M, A_0, INCROW_A, INCCOL_A); printf(" %12.2e %12.2lf %12.2e", t, ops/(1000*1000*t), err); t = 0; runs = 0; do { dgecopy(m, n, A_, 1, MAX_M, A_0, INCROW_A, INCCOL_A); dt = walltime(); lu_unblk_var4(m, n, A_0, INCROW_A, INCCOL_A); dt = walltime() - dt; t += dt; ++runs; } while (t<0.3); t /= runs; err = err_lu(m, n, A_, 1, MAX_M, A_0, INCROW_A, INCCOL_A); printf(" %12.2e %12.2lf %12.2e", t, ops/(1000*1000*t), err); t = 0; runs = 0; do { dgecopy(m, n, A_, 1, MAX_M, A_0, INCROW_A, INCCOL_A); dt = walltime(); lu_unblk_var5(m, n, A_0, INCROW_A, INCCOL_A); dt = walltime() - dt; t += dt; ++runs; } while (t<0.3); t /= runs; err = err_lu(m, n, A_, 1, MAX_M, A_0, INCROW_A, INCCOL_A); printf(" %12.2e %12.2lf %12.2e", t, ops/(1000*1000*t), err); printf("\n"); } return 0; }