1 /*
   2  *   Copyright (c) 2012, Michael Lehn
   3  *
   4  *   All rights reserved.
   5  *
   6  *   Redistribution and use in source and binary forms, with or without
   7  *   modification, are permitted provided that the following conditions
   8  *   are met:
   9  *
  10  *   1) Redistributions of source code must retain the above copyright
  11  *      notice, this list of conditions and the following disclaimer.
  12  *   2) Redistributions in binary form must reproduce the above copyright
  13  *      notice, this list of conditions and the following disclaimer in
  14  *      the documentation and/or other materials provided with the
  15  *      distribution.
  16  *   3) Neither the name of the FLENS development group nor the names of
  17  *      its contributors may be used to endorse or promote products derived
  18  *      from this software without specific prior written permission.
  19  *
  20  *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  21  *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  22  *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  23  *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
  24  *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  25  *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  26  *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  27  *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  28  *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  29  *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  30  *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  31  */
  32 
  33 /* Based on
  34  *
  35        SUBROUTINE DGEJSV( JOBA, JOBU, JOBV, JOBR, JOBT, JOBP,
  36       $                   M, N, A, LDA, SVA, U, LDU, V, LDV,
  37       $                   WORK, LWORK, IWORK, INFO )
  38  *
  39  *  -- LAPACK routine (version 3.3.1)                                    --
  40  *
  41  *  -- Contributed by Zlatko Drmac of the University of Zagreb and     --
  42  *  -- Kresimir Veselic of the Fernuniversitaet Hagen                  --
  43  *  -- April 2011                                                      --
  44  *
  45  *  -- LAPACK is a software package provided by Univ. of Tennessee,    --
  46  *  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
  47  *
 * This routine is also part of SIGMA (version 1.23, October 23, 2008).
 * SIGMA is a library of highly accurate algorithms for the computation
 * of the SVD, PSVD, QSVD, (H,K)-SVD, and for the solution of the
 * eigenvalue problems Hx = lambda M x, H M x = lambda x with H, M > 0.
  52  *
  53  */
  54 
  55 #ifndef FLENS_LAPACK_SVD_JSV_TCC
  56 #define FLENS_LAPACK_SVD_JSV_TCC 1
  57 
  58 #include <flens/blas/blas.h>
  59 #include <flens/lapack/lapack.h>
  60 
  61 namespace flens { namespace lapack {
  62 
  63 //== generic lapack implementation =============================================
  64 /*
  65 template <typename MA, typename VSVA, typename MU, typename MV,
  66           typename VWORK, typename VIWORK>
  67 typename GeMatrix<MA>::IndexType
  68 jsv_generic(JSV::Accuracy             accuracy,
  69             JSV::JobU                 jobU,
  70             JSV::JobV                 jobV,
  71             bool                      restrictedRange,
  72             bool                      considerTransA,
  73             bool                      perturb,
  74             GeMatrix<MA>              &A,
  75             DenseVector<VSVA>         &sva,
  76             GeMatrix<MU>              &U,
  77             GeMatrix<MV>              &V,
  78             DenseVector<VWORK>        &work,
  79             DenseVector<VIWORK>       &iwork)
  80 {
    using std::abs;
    using std::log;
    using std::max;
    using std::min;
    using std::pow;
    using std::sqrt;
    using std::swap;

    typedef typename GeMatrix<MA>::ElementType  ElementType;
    typedef typename GeMatrix<MA>::IndexType    IndexType;

    const ElementType Zero(0), One(1);

    const Underscore<IndexType>  _;

    const IndexType m     = A.numRows();
    const IndexType n     = A.numCols();
    const IndexType nu    = U.numCols();
    const IndexType lWork = work.length();

//  Scalar work variables used throughout the routine
    ElementType aapp, aaqq, xsc, tmp;
    ElementType entra, entrat, sconda, condr1, condr2, cond_ok;
    ElementType uScale1, uScale2;
    IndexType   numRank, warning;
  92 
    bool       lsvec  = (jobU==ComputeU) || (jobU==FullsetU);
    bool       jracc  = (char(jobV)=='J');
    bool       rsvec  = (char(jobV)=='V') || jracc;
    bool       rowpiv = (char(accuracy)=='F') || (char(accuracy)=='G');
    bool       l2rank = (char(accuracy)=='R');
    const bool l2aber = (char(accuracy)=='A');
    const bool errest = (char(accuracy)=='E') || (char(accuracy)=='G');
    bool       l2tran = considerTransA;
    bool       l2kill = restrictedRange;
    const bool defr   = !restrictedRange;
    bool       l2pert = perturb;
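//
//  Meaning of the flags (cf. the DGEJSV documentation): lsvec/rsvec
//  request the left/right singular vectors, jracc selects the variant
//  with explicitly accumulated Jacobi rotations, rowpiv enables the
//  Bjoerck-style row pivoting, l2rank/l2aber/errest control the rank
//  decisions and the condition number estimate, l2tran allows working
//  with A^t instead of A, l2kill permits flushing of tiny columns, and
//  l2pert allows an artificial perturbation that suppresses denormals.
//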
 104 
 105     IndexType info = 0;
 106 
 107 //
 108 //  Quick return for void matrix (Y3K safe)
 109 //#:)
 110     if (m==0 || n==0) {
 111         return info;
 112     }
 113 //
 114 //  Set numerical parameters
 115 //
 116 //! NOTE: Make sure DLAMCH() does not fail on the target architecture.
 117 //
 118     const ElementType eps     = lamch<ElementType>(Eps);
 119     const ElementType safeMin = lamch<ElementType>(SafeMin);
 120     const ElementType small   = safeMin / eps;
 121     const ElementType big     = lamch<ElementType>(OverflowThreshold);
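//
//  Here eps is the machine precision, safeMin the smallest positive
//  number whose reciprocal does not overflow, and small/big bracket the
//  range of column norms that can be handled without harmful underflow
//  or overflow.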
 122 //
 123 //  Initialize SVA(1:N) = diag( ||A e_i||_2 )_1^N
 124 //
 125 //! If necessary, scale SVA() to protect the largest norm from
 126 //  overflow. It is possible that this scaling pushes the smallest
 127 //  column norm left from the underflow threshold (extreme case).
 128 //
 129     ElementType scaleM = One / sqrt(ElementType(m)*ElementType(n));
 130     bool noScale = true;
 131     bool goScale = true;
 132     for (IndexType p=1; p<=n; ++p) {
 133         aapp = Zero;
 134         aaqq = One;
 135         lassq(A(_,p), aapp, aaqq);
 136         if (aapp>big) {
 137             return -9;
 138         }
 139         aaqq = sqrt(aaqq);
        if (aapp<(big/aaqq) && noScale) {
            sva(p)  = aapp * aaqq;
        } else {
            noScale = false;
            sva(p)  = aapp * (aaqq * scaleM);
 145             if (goScale) {
 146                 goScale = false;
 147                 sva(_(1,p)) *= scaleM;
 148             }
 149         }
 150     }
 151 
 152     if (noScale) {
 153         scaleM = One;
 154     }
 155 
 156     aapp = Zero;
 157     aaqq = big;
    for (IndexType p=1; p<=n; ++p) {
 159         aapp = max(aapp, sva(p));
 160         if (sva(p)!=Zero) {
 161             aaqq = min(aaqq, sva(p));
 162         }
 163     }
 164 //
 165 //    Quick return for zero M x N matrix
 166 //#:)
 167     if (aapp==Zero) {
 168         if (lsvec) {
 169             U = Zero;
 170             U.diag(0) = One;
 171         }
 172         if (rsvec) {
 173             V = Zero;
 174             V.diag(0) = One;
 175         }
 176         work(1) = One;
 177         work(2) = One;
 178         if (errest) {
 179             work(3) = One;
 180         }
 181         if (lsvec && rsvec) {
 182             work(4) = One;
 183             work(5) = One;
 184         }
 185         if (l2tran) {
 186             work(6) = Zero;
 187             work(7) = Zero;
 188         }
 189         iwork(1) = 0;
 190         iwork(2) = 0;
 191         iwork(3) = 0;
 192         return info;
 193     }
 194 //
 195 //  Issue warning if denormalized column norms detected. Override the
 196 //  high relative accuracy request. Issue licence to kill columns
 197 //  (set them to zero) whose norm is less than sigma_max / BIG (roughly).
 198 //#:(
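//  (The value of 'warning' is returned in iwork(3) at the end of the
//  routine.)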
    warning = 0;
    if (aaqq<=safeMin) {
 201         l2rank = true;
 202         l2kill = true;
 203         warning = 1;
 204     }
 205 //
 206 //  Quick return for one-column matrix
 207 //#:)
 208     auto U1 = U(_,_(1,n));
 209     if (n==1) {
 210 
 211         if (lsvec) {
 212             lascl(LASCL::FullMatrix, 0, 0, sva(1), scaleM, A);
 213             U1 = A;
 214 //          computing all M left singular vectors of the M x 1 matrix
 215             if (nu!=n) {
 216                 auto tau   = work(_(1,n));
 217                 auto _work = work(_(n+1,lWork));
 218                 qrf(U1, tau, _work);
 219                 orgqr(1, U, tau, _work);
 220                 U1 = A;
 221             }
 222         }
 223         if (rsvec) {
            V(1,1) = One;
 225         }
 226         if (sva(1)<big*scaleM) {
 227             sva(1) /= scaleM;
            scaleM = One;
 229         }
 230         work(1) = One / scaleM;
 231         work(2) = One;
 232         if (sva(1)!=Zero) {
 233             iwork(1) = 1;
 234             if (sva(1)/scaleM>=safeMin) {
 235                 iwork(2) = 1;
 236             } else {
 237                 iwork(2) = 0;
 238             }
 239         } else {
 240             iwork(1) = 0;
 241             iwork(2) = 0;
 242         }
 243         if (errest) {
 244             work(3) = One;
 245         }
 246         if (lsvec && rsvec) {
 247             work(4) = One;
 248             work(5) = One;
 249         }
 250         if (l2tran) {
 251             work(6) = Zero;
 252             work(7) = Zero;
 253         }
 254         return info;
 255 
 256     }
 257 
 258     bool transp = false;
    l2tran = l2tran && (m==n);
 260 
 261     ElementType aatMax = -One;
 262     ElementType aatMin = big;
 263     if (rowpiv || l2tran) {
 264 //
 265 //      Compute the row norms, needed to determine row pivoting sequence
 266 //      (in the case of heavily row weighted A, row pivoting is strongly
 267 //      advised) and to collect information needed to compare the
//      structures of A * A^t and A^t * A (in the case l2tran==true).
 269 //
 270         if (l2tran) {
 271             for (IndexType p=1; p<=m; ++p) {
 272                 xsc = Zero;
 273                 tmp = One;
 274                 lassq(A(p,_), xsc, tmp);
 275 //              DLASSQ gets both the ell_2 and the ell_infinity norm
 276 //              in one pass through the vector
 277                 work(m+n+p) = xsc * scaleM;
                work(n+p)   = xsc * (scaleM*sqrt(tmp));
 279                 aatMax      = max(aatMax, work(n+p));
 280                 if (work(n+p)!=Zero) {
 281                     aatMin = min(aatMin, work(n+p));
 282                 }
 283             }
 284         } else {
 285             for (IndexType p=1; p<=m; ++p) {
 286                 const IndexType jp = blas::iamax(A(p,_));
 287                 work(m+n+p) = scaleM*abs(A(p,jp));
 288                 aatMax = max(aatMax, work(m+n+p));
 289                 aatMin = min(aatMin, work(m+n+p));
 290             }
 291         }
 292 
 293     }
 294 //
 295 //  For square matrix A try to determine whether A^t  would be  better
 296 //  input for the preconditioned Jacobi SVD, with faster convergence.
 297 //  The decision is based on an O(N) function of the vector of column
 298 //  and row norms of A, based on the Shannon entropy. This should give
 299 //  the right choice in most cases when the difference actually matters.
 300 //  It may fail and pick the slower converging side.
 301 //
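//  With p_i = sva(i)^2 / ||A||_F^2 the column entropy computed below is
//  H_col = -sum_i p_i*log(p_i) / log(n); the analogous row entropy is
//  computed from the row norms stored in work(n+1:n+m) and normalized
//  by log(m).
//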
    entra  = Zero;
    entrat = Zero;
 304     if (l2tran) {
 305 
 306         xsc = Zero;
 307         tmp = One;
 308         lassq(sva, xsc, tmp);
 309         tmp = One / tmp;
 310 
        entra = Zero;
 312         for (IndexType p=1; p<=n; ++p) {
 313             const ElementType _big = pow(sva(p)/xsc, 2) * tmp;
 314             if (_big!=Zero) {
 315                 entra += _big * log(_big);
 316             }
 317         }
 318         entra = - entra / log(ElementType(n));
 319 //
 320 //      Now, SVA().^2/Trace(A^t * A) is a point in the probability simplex.
 321 //      It is derived from the diagonal of  A^t * A.  Do the same with the
 322 //      diagonal of A * A^t, compute the entropy of the corresponding
 323 //      probability distribution. Note that A * A^t and A^t * A have the
 324 //      same trace.
 325 //
        entrat = Zero;
 327         for (IndexType p=n+1; p<=n+m; ++p) {
 328             const ElementType _big = pow(work(p)/xsc, 2) * tmp;
 329             if (_big!=Zero) {
 330                 entrat += _big * log(_big);
 331             }
 332         }
 333         entrat = -entrat / log(ElementType(m));
 334 //
 335 //      Analyze the entropies and decide A or A^t. Smaller entropy
 336 //      usually means better input for the algorithm.
 337 //
 338         transp = entrat<entra;
 339 //
 340 //      If A^t is better than A, transpose A.
 341 //
 342         if (transp) {
 343 //          In an optimal implementation, this trivial transpose
 344 //          should be replaced with faster transpose.
 345 //          TODO: in-place transpose:
 346 //                transpose(A);
 347             for (IndexType p=1; p<=n-1; ++p) {
 348                 for (IndexType q=p+1; q<=n; ++q) {
 349                     swap(A(q,p), A(p,q));
 350                 }
 351             }
 352             for (IndexType p=1; p<=n; ++p) {
 353                 work(m+n+p) = sva(p);
 354                 sva(p)      = work(n+p);
 355             }
 356             swap(aapp, aatMax);
 357             swap(aaqq, aatMin);
 358             swap(lsvec, rsvec);
 359             if (lsvec) {
 360                 // Lehn: transposing A is only considered if A is square,
 361                 //       so in this case m==n and therefore nu==n.  Or am I
 362                 //       wrong??
 363                 ASSERT(nu==n);
 364             }
 365 
 366             rowpiv = true;
 367         }
 368 
 369     }
 370 //
 371 //  Scale the matrix so that its maximal singular value remains less
 372 //  than DSQRT(BIG) -- the matrix is scaled so that its maximal column
 373 //  has Euclidean norm equal to DSQRT(BIG/N). The only reason to keep
 374 //  DSQRT(BIG) instead of BIG is the fact that DGEJSV uses LAPACK and
 375 //  BLAS routines that, in some implementations, are not capable of
 376 //  working in the full interval [SFMIN,BIG] and that they may provoke
 377 //  overflows in the intermediate results. If the singular values spread
 378 //  from SFMIN to BIG, then DGESVJ will compute them. So, in that case,
 379 //  one should use DGESVJ instead of DGEJSV.
 380 //
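//  Concretely: sva is rescaled by tmp/aapp with tmp = sqrt(big/n), and A
//  by scaleM*sqrt(big/n)/aapp, so that afterwards the largest column norm
//  of the scaled A is roughly sqrt(big/n).
//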
 381     const ElementType bigRoot = sqrt(big);
 382     tmp = sqrt(big/ElementType(n));
 383 
 384     lascl(LASCL::FullMatrix, 0, 0, aapp, tmp, sva);
 385     if (aaqq>aapp*safeMin) {
 386         aaqq = (aaqq/aapp) * tmp;
 387     } else {
 388         aaqq = (aaqq*tmp) / aapp;
 389     }
 390     tmp *= scaleM;
 391     lascl(LASCL::FullMatrix, 0, 0, aapp, tmp, A);
 392 //
 393 //  To undo scaling at the end of this procedure, multiply the
 394 //  computed singular values with USCAL2 / USCAL1.
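//  That is, the singular values of the original matrix are recovered as
//  sigma_i = (uScale2/uScale1) * sva(i).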
 395 //
 396     uScale1 = tmp;
 397     uScale2 = aapp;
 398 
 399     if (l2kill) {
 400 //      L2KILL enforces computation of nonzero singular values in
 401 //      the restricted range of condition number of the initial A,
 402 //      sigma_max(A) / sigma_min(A) approx. DSQRT(BIG)/DSQRT(SFMIN).
 403         xsc = sqrt(safeMin);
 404     } else {
 405         xsc = small;
 406 //
 407 //      Now, if the condition number of A is too big,
 408 //      sigma_max(A) / sigma_min(A)>DSQRT(BIG/N) * EPSLN / SFMIN,
//      as a precautionary measure, the full SVD is computed using DGESVJ
 410 //      with accumulated Jacobi rotations. This provides numerically
 411 //      more robust computation, at the cost of slightly increased run
 412 //      time. Depending on the concrete implementation of BLAS and LAPACK
 413 //      (i.e. how they behave in presence of extreme ill-conditioning) the
 414 //      implementor may decide to remove this switch.
 415         if (aaqq<sqrt(safeMin) && lsvec && rsvec ) {
 416             jracc = true;
 417         }
 418 
 419     }
 420     if (aaqq<xsc) {
 421         for (IndexType p=1; p<=n; ++p) {
 422             if (sva(p)<xsc) {
 423                 A(_,p) = Zero;
 424                 sva(p) = Zero;
 425             }
 426         }
 427     }
 428 //
 429 //  Preconditioning using QR factorization with pivoting
 430 //
 431     if (rowpiv) {
 432 //      Optional row permutation (Bjoerck row pivoting):
 433 //      A result by Cox and Higham shows that the Bjoerck's
 434 //      row pivoting combined with standard column pivoting
 435 //      has similar effect as Powell-Reid complete pivoting.
 436 //      The ell-infinity norms of A are made nonincreasing.
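//      The row interchanges are recorded in iwork(2*n+1:2*n+m-1) so that
//      they can be undone on the rows of U after the left singular
//      vectors have been assembled (see the laswp with reversed pivots
//      further below).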
 437         for (IndexType p=1; p<=m-1; ++p) {
 438             const IndexType q = blas::iamax(work(_(m+n+p,2*m+n))) + p - 1;
 439             iwork(2*n+p) = q;
 440             if (p!=q) {
 441                 swap(work(m+n+p), work(m+n+q));
 442             }
 443         }
 444         laswp(A, iwork(_(2*n+1, 2*n+m-1)));
 445     }
 446 //
 447 //  End of the preparation phase (scaling, optional sorting and
 448 //  transposing, optional flushing of small columns).
 449 //
 450 //  Preconditioning
 451 //
 452 //  If the full SVD is needed, the right singular vectors are computed
 453 //  from a matrix equation, and for that we need theoretical analysis
 454 //  of the Businger-Golub pivoting. So we use DGEQP3 as the first RR QRF.
 455 //  In all other cases the first RR QRF can be chosen by other criteria
//  (e.g. speed by replacing global with restricted window pivoting, such
//  as in SGEQPX from TOMS #782). Good results will be obtained using
//  SGEQPX with properly (!) chosen numerical parameters.
//  Any improvement of DGEQP3 improves the overall performance of DGEJSV.
 460 //
 461 //  A * P1 = Q1 * [ R1^t 0]^t:
 462     auto _tau   = work(_(1,n));
 463     auto _work  = work(_(n+1,lWork));
 464     auto _iwork = iwork(_(1,n));
 465 
 466 //      .. all columns are free columns
 467     _iwork = 0;
 468 
 469     qp3(A, _iwork, _tau, _work);
 470 //
 471 //  The upper triangular matrix R1 from the first QRF is inspected for
 472 //  rank deficiency and possibilities for deflation, or possible
 473 //  ill-conditioning. Depending on the user specified flag L2RANK,
 474 //  the procedure explores possibilities to reduce the numerical
 475 //  rank by inspecting the computed upper triangular factor. If
 476 //  L2RANK or l2aber are up, then DGEJSV will compute the SVD of
 477 //  A + dA, where ||dA|| <= f(M,N)*EPSLN.
 478 //
 479     IndexType nr = 1;
 480     if (l2aber) {
 481 //      Standard absolute error bound suffices. All sigma_i with
 482 //      sigma_i < N*EPSLN*||A|| are flushed to zero. This is an
//      aggressive enforcement of lower numerical rank by introducing a
 484 //      backward error of the order of N*EPSLN*||A||.
 485         tmp = sqrt(ElementType(n))*eps;
 486         for (IndexType p=2; p<=n; ++p) {
 487             if (abs(A(p,p))>=tmp*abs(A(1,1))) {
 488                 ++nr;
 489             } else {
 490                 break;
 491             }
 492         }
 493     } else if (l2rank) {
//      .. similarly as above, only slightly more gentle (less aggressive).
//      A sudden drop on the diagonal of R1 is used as the criterion for
//      being close to rank deficient.
 497         tmp = sqrt(safeMin);
 498         for (IndexType p=2; p<=n; ++p) {
 499             if (abs(A(p,p))<eps*abs(A(p-1,p-1)) || abs(A(p,p))<small
 500              || (l2kill && abs(A(p,p))<tmp))
 501             {
 502                 break;
 503             }
 504             ++nr;
 505         }
 506 
 507     } else {
 508 //      The goal is high relative accuracy. However, if the matrix
 509 //      has high scaled condition number the relative accuracy is in
 510 //      general not feasible. Later on, a condition number estimator
 511 //      will be deployed to estimate the scaled condition number.
 512 //      Here we just remove the underflowed part of the triangular
 513 //      factor. This prevents the situation in which the code is
 514 //      working hard to get the accuracy not warranted by the data.
 515         tmp = sqrt(safeMin);
 516         for (IndexType p=2; p<=n; ++p) {
 517             if (abs(A(p,p))<small || (l2kill && abs(A(p,p))<tmp)) {
 518                 break;
 519             }
 520             ++nr;
 521         }
 522 
 523     }
 524 
 525     bool almort = false;
 526     if (nr==n) {
 527         ElementType maxprj = One;
 528         for (IndexType p=2; p<=n; ++p) {
 529             maxprj = min(maxprj, abs(A(p,p))/sva(iwork(p)));
 530         }
 531         if (pow(maxprj,2)>=One-ElementType(n)*eps) {
 532             almort = true;
 533         }
 534     }
 535 
 536 
 537     sconda = -One;
 538     condr1 = -One;
 539     condr2 = -One;
 540 
 541     if (errest) {
 542         if (n==nr) {
 543             if (rsvec) {
 544 //              .. V is available as workspace
 545                 V.upper() = A(_(1,n),_).upper();
 546                 for (IndexType p=1; p<=n; ++p) {
 547                     tmp = sva(iwork(p));
 548                     V(_(1,p),p) *= One/tmp;
 549                 }
 550                 auto _work  = work(_(n+1, 4*n));
 551                 auto _iwork = iwork(_(2*n+m+1, 3*n+m));
 552                 pocon(V.upper(), One, tmp, _work, _iwork);
 553             } else if (lsvec) {
 554 //              .. U is available as workspace
 555                 auto _U = U(_(1,n),_(1,n));
 556                 _U.upper() = A(_(1,n),_).upper();
 557                 for (IndexType p=1; p<=n; ++p) {
 558                     tmp = sva(iwork(p));
 559                     U(_(1,p),p) *= One/tmp;
 560                 }
 561                 auto _work  = work(_(n+1, 4*n));
 562                 auto _iwork = iwork(_(2*n+m+1, 3*n+m));
 563                 pocon(_U.upper(), One, tmp, _work, _iwork);
 564             } else {
 565                 auto                       _work1 = work(_(n+1,n+n*n));
 566                 auto                       _work2 = work(_(n+n*n+1,n+n*n+3*n));
                auto                       _iwork = iwork(_(2*n+m+1,3*n+m));
 568                 GeMatrixView<ElementType>  Work(n, n, _work1, n);
 569 
 570                 Work.upper() = A(_(1,n),_).upper();
 571                 for (IndexType p=1; p<=n; ++p) {
 572                     tmp = sva(iwork(p));
 573                     Work(_(1,p),p) *= One/tmp;
 574                 }
 575 //              .. the columns of R are scaled to have unit Euclidean lengths.
 576                 pocon(Work.upper(), One, tmp, _work2, _iwork);
 577             }
 578             sconda = One/sqrt(tmp);
 579 //          SCONDA is an estimate of DSQRT(||(R^t * R)^(-1)||_1).
 580 //          N^(-1/4) * SCONDA <= ||R^(-1)||_2 <= N^(1/4) * SCONDA
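//          When an error estimate is requested (errest), SCONDA is
//          returned in work(3) at the end of the routine.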
 581         } else {
 582             sconda = -One;
 583         }
 584     }
 585 
 586     l2pert = l2pert && abs(A(1,1)/A(nr,nr))>sqrt(bigRoot);
 587 //  If there is no violent scaling, artificial perturbation is not needed.
 588 //
 589 //  Phase 3:
 590 //
 591     if (!(rsvec || lsvec)) {
 592 //
 593 //      Singular Values only
 594 //
 595 //      .. transpose A(1:NR,1:N)
 596         for (IndexType p=1; p<=min(n-1, nr); ++p) {
 597             A(p,_(p+1,n)) = A(_(p+1,n),p);
 598         }
 599 //
 600 //      The following two DO-loops introduce small relative perturbation
 601 //      into the strict upper triangle of the lower triangular matrix.
 602 //      Small entries below the main diagonal are also changed.
 603 //      This modification is useful if the computing environment does not
//      provide/allow flush-to-zero underflow, for it prevents many
 605 //      annoying denormalized numbers in case of strongly scaled matrices.
 606 //      The perturbation is structured so that it does not introduce any
 607 //      new perturbation of the singular values, and it does not destroy
 608 //      the job done by the preconditioner.
 609 //      The licence for this perturbation is in the variable L2PERT, which
//      should be false if flush-to-zero underflow is active.
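//      In the perturbed branch below each targeted entry is replaced by
//      +-xsc*|A(q,q)| with xsc = eps/n, i.e. the noise is tiny relative
//      to the corresponding diagonal entry of the triangular factor.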
 611 //
 612         if (! almort) {
 613 
 614             if (l2pert) {
 615 //              XSC = DSQRT(SMALL)
 616                 xsc = eps / ElementType(n);
 617                 for (IndexType q=1; q<=nr; ++q) {
 618                     tmp = xsc*abs(A(q,q));
                    for (IndexType p=1; p<=n; ++p) {
 620                         if ((p>q && abs(A(p,q))<=tmp) || p<q) {
 621                             A(p,q) = sign(tmp, A(p,q));
 622                         }
 623                     }
 624                 }
 625             } else {
 626                 A(_(1,nr),_(1,nr)).strictUpper() = Zero;
 627             }
 628 //
 629 //          .. second preconditioning using the QR factorization
 630 //
 631             auto _A    = A(_(1,n),_(1,nr));
 632             auto _tau  = work(_(1,nr));
 633             auto _work = work(_(n+1,lWork));
 634             qrf(_A, _tau, _work);
 635 //
 636 //          .. and transpose upper to lower triangular
 637             for (IndexType p=1; p<=nr-1; ++p) {
 638                 A(p,_(p+1,nr)) = A(_(p+1,nr),p);
 639             }
 640 
 641         }
 642 //
 643 //      Row-cyclic Jacobi SVD algorithm with column pivoting
 644 //
 645 //      .. again some perturbation (a "background noise") is added
 646 //      to drown denormals
 647         if (l2pert) {
 648 //          XSC = DSQRT(SMALL)
 649             xsc = eps / ElementType(n);
 650             for (IndexType q=1; q<=nr; ++q) {
 651                 ElementType tmp = xsc*abs(A(q,q));
 652                 for (IndexType p=1; p<=nr; ++p) {
 653                     if (((p>q) && abs(A(p,q))<=tmp) || p<q) {
 654                         A(p,q) = sign(tmp, A(p,q));
 655                     }
 656                 }
 657             }
 658         } else {
 659             A(_(1,nr),_(1,nr)).strictUpper() = Zero;
 660         }
 661 //
 662 //      .. and one-sided Jacobi rotations are started on a lower
 663 //      triangular matrix (plus perturbation which is ignored in
 664 //      the part which destroys triangular form (confusing?!))
 665 //
 666         auto _A    = A(_(1,nr),_(1,nr));
 667         auto _sva  = sva(_(1,nr));
 668 
 669         svj(SVJ::Lower, SVJ::NoU, SVJ::NoV, _A, _sva, V, work);
 670 
 671         scaleM  = work(1);
 672         numRank = nint(work(2));
 673 
 674 
 675     } else if (rsvec && !lsvec) {
 676 //
 677 //      -> Singular Values and Right Singular Vectors <-
 678 //
 679         if (almort) {
 680 //
 681 //          .. in this case NR equals N
 682             ASSERT(nr==n);
 683             for (IndexType p=1; p<=nr; ++p) {
 684                 V(_(p,n),p) = A(p,_(p,n));
 685             }
 686 
 687             auto _V = V(_,_(1,nr));
 688             _V.strictUpper() = Zero;
 689             svj(SVJ::Lower, SVJ::ComputeU, SVJ::NoV, _V, sva, A, work);
 690 
 691             scaleM  = work(1);
 692             numRank = nint(work(2));
 693 
 694         } else {
 695 //
 696 //      .. two more QR factorizations ( one QRF is not enough, two require
 697 //      accumulated product of Jacobi rotations, three are perfect )
 698 //
 699             A(_(1,nr),_(1,nr)).strictLower() = Zero;
 700 
 701             auto _A     = A(_(1,nr),_);
 702             auto _tau1  = work(_(1,nr));
 703             auto _work1 = work(_(n+1,lWork));
 704             lqf(_A, _tau1, _work1);
 705 
            auto _V = V(_(1,nr),_(1,nr));
 707             _V.lower()       = _A(_,_(1,nr)).lower();
 708             _V.strictUpper() = Zero;
 709 
 710             auto _tau2  = work(_(n+1,n+nr));
 711             auto _work2 = work(_(2*n+1,lWork));
 712             qrf(_V, _tau2, _work2);
 713 
 714             for (IndexType p=1; p<=nr; ++p) {
 715                 V(_(p,nr),p) = V(p,_(p,nr));
 716             }
 717             _V.strictUpper() = Zero;
 718 
 719             auto _sva  = sva(_(1,nr));
 720             svj(SVJ::Lower, SVJ::ComputeU, SVJ::NoV, _V, _sva, U, _work1);
 721 
 722             scaleM  = work(n+1);
 723             numRank = nint(work(n+2));
 724             if (nr<n) {
 725                 V(_(nr+1,n),_(1,nr)) = Zero;
 726                 V(_(1,nr),_(nr+1,n)) = Zero;
 727 
 728                 V(_(nr+1,n),_(nr+1,n)) = Zero;
 729                 V(_(nr+1,n),_(nr+1,n)).diag(0) = One;
 730             }
 731 
 732             ormlq(Left, Trans, _A, _tau1, V, _work1);
 733 
 734         }
 735 
 736         for (IndexType p=1; p<=n; ++p) {
 737             A(iwork(p),_) = V(p,_);
 738         }
 739         V = A;
 740 
 741         if (transp) {
 742             U = V;
 743         }
 744 
 745     } else if (lsvec && !rsvec) {
 746 //
 747 //      .. Singular Values and Left Singular Vectors                 ..
 748 //
 749 //      .. second preconditioning step to avoid need to accumulate
 750 //      Jacobi rotations in the Jacobi iterations.
 751 
 752         auto _U = U(_(1,nr),_(1,nr));
 753         auto _tau   = work(_(n+1,n+nr));
 754         auto _sva   = sva(_(1,nr));
 755         auto _work1 = work(_(n+1,lWork));
 756         auto _work2 = work(_(2*n+1,lWork));
 757 
 758         for (IndexType p=1; p<=nr; ++p) {
 759             A(p,_(p,nr)) = U(_(p,nr),p);
 760         }
 761         _U.strictUpper() = Zero;
 762 
 763         qrf(U(_(1,n),_(1,nr)), _tau, _work2);
 764 
 765         for (IndexType p=1; p<=nr-1; ++p) {
 766             U(p,_(p+1,nr)) = U(_(p+1,nr),p);
 767         }
 768         _U.strictUpper() = Zero;
 769 
 770         svj(SVJ::Lower, SVJ::ComputeU, SVJ::NoV, _U, _sva, A, _work1);
 771         scaleM  = work(n+1);
 772         numRank = nint(work(n+2));
 773 
 774         if (nr<m) {
 775             U(_(nr+1,m),_(1,nr)) = Zero;
 776             if (nr<nu) {
 777                 U(_(1,nr),_(nr+1,nu))           = Zero;
 778 
 779                 U(_(nr+1,m),_(nr+1,nu))         = Zero;
 780                 U(_(nr+1,m),_(nr+1,nu)).diag(0) = One;
 781             }
 782         }
 783 
 784         ormqr(Left, NoTrans, A, work, U);
 785 
 786         if (rowpiv) {
 787             auto piv = iwork(_(2*n+1,2*n+m-1));
 788             laswp(U, piv.reverse());
 789         }
 790 
 791         for (IndexType p=1; p<=nu; ++p) {
 792             xsc = One / blas::nrm2(U(_,p));
 793             U(_,p) *= xsc;
 794         }
 795 
 796         if (transp) {
 797             V = U;
 798         }
 799 //
 800     } else {
 801 //
 802 //      .. Full SVD ..
 803 //
 804         if (!jracc) {
 805 
 806             if (!almort) {
 807 //
 808 //              Second Preconditioning Step (QRF [with pivoting])
 809 //              Note that the composition of TRANSPOSE, QRF and TRANSPOSE is
 810 //              equivalent to an LQF CALL. Since in many libraries the QRF
 811 //              seems to be better optimized than the LQF, we do explicit
 812 //              transpose and use the QRF. This is subject to changes in an
 813 //              optimized implementation of DGEJSV.
 814 //
 815                 for (IndexType p=1; p<=nr; ++p) {
 816                     V(_(p,n),p) = A(p,_(p,n));
 817                 }
 818 //
 819 //              .. the following two loops perturb small entries to avoid
 820 //              denormals in the second QR factorization, where they are
 821 //              as good as zeros. This is done to avoid painfully slow
 822 //              computation with denormals. The relative size of the
 823 //              perturbation is a parameter that can be changed by the
 824 //              implementer. This perturbation device will be obsolete on
 825 //              machines with properly implemented arithmetic.
//              To switch it off, set L2PERT=false. To remove it from the
//              code, remove the action under L2PERT=true and leave the ELSE
 828 //              part. The following two loops should be blocked and fused with
 829 //              the transposed copy above.
 830 //
 831                 if (l2pert) {
 832                     xsc = sqrt(small);
 833                     for (IndexType q=1; q<=nr; ++q) {
 834                         tmp = xsc*abs(V(q,q));
 835                         for (IndexType p=1; p<=n; ++p) {
 836                             if (p>q && abs(V(p,q))<=tmp || p<q) {
 837                                 V(p,q) = sign(tmp, V(p,q));
 838                             }
 839                             if (p<q) {
 840                                 V(p,q) = -V(p,q);
 841                             }
 842                         }
 843                     }
 844                 } else {
 845                     V(_(1,nr),_(1,nr)).strictUpper() = Zero;
 846                 }
 847 //
 848 //              Estimate the row scaled condition number of R1
 849 //              (If R1 is rectangular, N > NR, then the condition number
 850 //              of the leading NR x NR submatrix is estimated.)
 851 //
 852                 auto          _work1 = work(_(2*n+1,2*n+nr*nr));
 853                 auto          _work2 = work(_(2*n+nr*nr+1, 2*n+nr*nr+3*nr));
 854                 auto          _iwork = iwork(_(m+2*n+1,m+2*n+nr));
 855                 GeMatrixView<ElementType>  Work(nr, nr, _work1, nr);
 856 
 857                 Work.lower() = V.lower();
 858                 for (IndexType p=1; p<=nr; ++p) {
 859                     tmp = blas::nrm2(Work(_(p,nr),p));
 860                     Work(_(p,nr),p) *= One/tmp;
 861                 }
 862                 pocon(Work, One, tmp, _work2, _iwork);
 863                 condr1 = One / sqrt(tmp);
//              .. here we need a second opinion on the condition number
//              .. then assume the worst-case scenario
 866 //              R1 is OK for inverse <=> condr1 < DBLE(N)
 867 //              more conservative    <=> condr1 < DSQRT(DBLE(N))
 868 //
 869                 cond_ok = sqrt(ElementType(nr));
 870 //[TP]          COND_OK is a tuning parameter.
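//              With cond_ok = sqrt(nr) the cheaper unpivoted second QRF
//              below is trusted only as long as the row-scaled R1 is at
//              most mildly ill-conditioned; otherwise a pivoted QRF (and,
//              if necessary, an additional LQ factorization) is used.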
 871 
 872                 if (condr1<cond_ok) {
 873 //                  .. the second QRF without pivoting. Note: in an optimized
 874 //                  implementation, this QRF should be implemented as the QRF
 875 //                  of a lower triangular matrix.
 876 //                  R1^t = Q2 * R2
 877                     auto tau    = work(_(n+1,n+nr));
 878                     auto _work  = work(_(2*n+1,lWork));
 879 
 880                     qrf(V(_,_(1,nr)), tau, _work);
 881 
 882                     if (l2pert) {
                        xsc = sqrt(small) / eps;
 884                         for (IndexType p=2; p<=nr; ++p) {
 885                             for (IndexType q=1; q<=p-1; ++q) {
 886                                 tmp = xsc*min(abs(V(p,p)), abs(V(q,q)));
 887                                 if (abs(V(q,p))<=tmp) {
 888                                     V(q,p) = sign(tmp, V(q,p));
 889                                 }
 890                             }
 891                         }
 892                     }
 893 //
 894                     if (nr!=n) {
 895                         auto _work = work(_(2*n+1, 2*n+n*nr));
 896                         GeMatrixView<ElementType>  Work(n, nr, _work, n);
 897 
 898                         Work = V(_,_(1,nr));
 899                     }
 900 //                  .. save ...
 901 //
 902 //               .. this transposed copy should be better than naive
 903 //                  TODO:  auto _V = V(_(1,nr),_(1,nr));
 904 //                         _V.lower() = transpose(_V.upper());
 905 //
 906                     for (IndexType p=1; p<=nr-1; ++p) {
 907                         V(_(p+1,nr),p) = V(p,_(p+1,nr));
 908                     }
 909 
 910                     condr2 = condr1;
 911 
 912                 } else {
 913 //
 914 //                  .. ill-conditioned case: second QRF with pivoting
//                  Note that windowed pivoting would be equally good
//                  numerically, and more run-time efficient. So, in
//                  an optimal implementation, the next call to DGEQP3
//                  should be replaced with e.g. CALL SGEQPX (ACM TOMS #782)
 919 //                  with properly (carefully) chosen parameters.
 920 //
 921 //                  R1^t * P2 = Q2 * R2
 922                     auto _V    = V(_,_(1,nr));
 923                     auto piv   = iwork(_(n+1,n+nr));
                    auto tau   = work(_(n+1,n+nr));
 925                     auto _work = work(_(2*n+1, lWork));
 926 
 927                     piv = 0;
 928                     qp3(_V, piv, tau, _work);
 929                     if (l2pert) {
 930                         xsc = sqrt(small);
 931                         for (IndexType p=2; p<=nr; ++p) {
 932                             for (IndexType q=1; q<=p-1; ++q) {
 933                                 tmp = xsc*min(abs(V(p,p)), abs(V(q,q)));
 934                                 if (abs(V(q,p))<=tmp) {
 935                                     V(q,p) = sign(tmp, V(q,p));
 936                                 }
 937                             }
 938                         }
 939                     }
 940 
 941                     auto  _work1 = work(_(2*n+1, 2*n+n*nr));
 942                     GeMatrixView<ElementType>  Work1(n, nr, _work1, n);
 943 
 944                     Work1 = V(_,_(1,nr));
 945 
 946                     if (l2pert) {
 947                         xsc = sqrt(small);
 948                         for (IndexType p=2; p<=nr; ++p) {
 949                             for (IndexType q=1; q<=p-1; ++q) {
 950                                 tmp = xsc * min(abs(V(p,p)), abs(V(q,q)));
 951                                 V(p,q) = -sign(tmp, V(q,p));
 952                             }
 953                         }
 954                     } else {
 955                         V(_(1,nr),_(1,nr)).strictLower() = Zero;
 956                     }
//                  Now, compute R2 = L3 * Q3, the LQ factorization.
                    auto _V2    = V(_(1,nr),_(1,nr));
                    auto tau3   = work(_(2*n+n*nr+1,2*n+n*nr+nr));
                    auto _work2 = work(_(2*n+n*nr+nr+1,lWork));

                    lqf(_V2, tau3, _work2);
//                  .. and estimate the condition number
                    auto  _work3 = work(_(2*n+n*nr+nr+1,2*n+n*nr+nr+nr*nr));
                    GeMatrixView<ElementType>  Work3(nr, nr, _work3, nr);

                    Work3.lower() = _V2.lower();
 968 
 969                     for (IndexType p=1; p<=nr; ++p) {
 970                         tmp = blas::nrm2(Work3(p,_(1,p)));
 971                         Work3(p,_(1,p)) *= One/tmp;
 972                     }
 973                     auto _work4 = work(_(2*n+n*nr+nr+nr*nr+1,
 974                                          2*n+n*nr+nr+nr*nr+3*nr));
 975                     auto _iwork = iwork(_(m+2*n+1, m+2*n+nr));
 976                     pocon(Work3.lower(), One, tmp, _work4, _iwork);
 977                     condr2 = One / sqrt(tmp);
 978 
 979                     if (condr2>=cond_ok) {
 980 //                      .. save the Householder vectors used for Q3
//                      (this overwrites the copy of R2, as it will not be
//                      needed in this branch, but it does not overwrite the
//                      Householder vectors of Q2).
                        Work1(_(1,nr),_(1,nr)).upper() = V(_(1,nr),_(1,nr)).upper();
 985 //                      .. and the rest of the information on Q3 is in
 986 //                      WORK(2*N+N*NR+1:2*N+N*NR+N)
 987                     }
 988 
 989                 }
 990 
 991                 if (l2pert) {
 992                     xsc = sqrt(small);
 993                     for (IndexType q=2; q<=nr; ++q) {
                        tmp = xsc * abs(V(q,q));
 995                         for (IndexType p=1; p<=q-1; ++p) {
 996 //                          V(p,q) = - DSIGN( TEMP1, V(q,p) )
 997                             V(p,q) = -sign(tmp, V(p,q));
 998                         }
 999                     }
1000                 } else {
                    V(_(1,nr),_(1,nr)).strictLower() = Zero;
1002                 }
1003 //
1004 //              Second preconditioning finished; continue with Jacobi SVD
1005 //              The input matrix is lower triangular.
1006 //
1007 //              Recover the right singular vectors as solution of a well
1008 //              conditioned triangular matrix equation.
1009 //
1010                 if (condr1<cond_ok) {
1011                     auto _U    = U(_(1,nr),_(1,nr));
1012                     auto _V    = V(_(1,nr),_(1,nr));
1013                     auto _sva  = sva(_(1,nr));
1014                     auto _work = work(_(2*n+n*nr+nr+1,lWork));
1015 
1016                     svj(SVJ::Lower, SVJ::ComputeU, SVJ::NoV,
1017                         _V, _sva, U, _work);
1018                     scaleM = _work(1);
1019                     numRank = nint(_work(2));
1020 
1021                     for (IndexType p=1; p<=nr; ++p) {
1022                         _U(_,p) = _V(_,p);
1023                         _V(_,p) *= sva(p);
1024                     }
1025 
1026 //                  .. pick the right matrix equation and solve it
1027 //
1028                     if (nr==n) {
1029 //:))                   .. best case, R1 is inverted. The solution of this
1030 //                      matrix equation is Q2*V2 = the product of the Jacobi
1031 //                      rotations used in DGESVJ, premultiplied with the
1032 //                      orthogonal matrix  from the second QR factorization.
1033                         const auto _A = A(_(1,nr),_(1,nr));
1034                         blas::sm(Left, NoTrans, One, _A.upper(), _V);
1035                     } else {
1036 //                      .. R1 is well conditioned, but non-square. Transpose(R2)
1037 //                      is inverted to get the product of the Jacobi rotations
1038 //                      used in DGESVJ. The Q-factor from the second QR
1039 //                      factorization is then built in explicitly.
1040 
1041                         auto  _work = work(_(2*n+1, 2*n+n*nr));
1042                         GeMatrixView<ElementType>  Work1(nr, nr, _work, n);
1043                         GeMatrixView<ElementType>  Work(n, nr, _work, n);
1044 
1045                         blas::sm(Left, NoTrans, One, Work1.upper(), _V);
1046                         if (nr<n) {
1047                             V(_(nr+1,n),_(1,nr)) = Zero;
1048                             V(_(1,nr),_(nr+1,n)) = Zero;
1049                             V(_(nr+1,n),_(nr+1,n)) = Zero;
1050                             V(_(nr+1,n),_(nr+1,n)).diag(0) = One;
1051                         }
1052                         auto tau = work(_(n+1,n+nr));
1053                         auto _work_ormqr = work(_(2*n+n*nr+nr+1,lWork));
1054                         ormqr(Left, NoTrans, Work, tau, V, _work_ormqr);
1055                     }
1056 //
1057                 } else if (condr2<cond_ok) {
1058 //
1059 //:)                .. the input matrix A is very likely a relative of
1060 //                  the Kahan matrix :)
1061 //                  The matrix R2 is inverted. The solution of the matrix
1062 //                  equation is Q3^T*V3 = the product of the Jacobi rotations
//                  (applied to the lower triangular L3 from the LQ
1064 //                  factorization of R2=L3*Q3), pre-multiplied with the
1065 //                  transposed Q3.
1066                     CALL DGESVJ( 'L', 'U', 'N', NR, NR, V, LDV, SVA, NR, U,
1067      $                  LDU, work(2*N+N*NR+NR+1), LWORK-2*N-N*NR-NR, INFO )
1068                     SCALEM  = work(2*N+N*NR+NR+1)
1069                     numRank = IDNINT(work(2*N+N*NR+NR+2))
1070                     DO 3870 p = 1, NR
1071                         CALL DCOPY( NR, V(1,p), 1, U(1,p), 1 )
1072                         CALL DSCAL( NR, SVA(p),    U(1,p), 1 )
1073  3870               CONTINUE
1074                     CALL DTRSM('L','U','N','N',NR,NR,One,work(2*N+1),N,U,LDU)
1075 //                  .. apply the permutation from the second QR factorization
1076                     DO 873 q = 1, NR
1077                         DO 872 p = 1, NR
1078                             work(2*N+N*NR+NR+iwork(N+p)) = U(p,q)
1079  872                    CONTINUE
1080                         DO 874 p = 1, NR
1081                             U(p,q) = work(2*N+N*NR+NR+p)
1082  874                    CONTINUE
1083  873                CONTINUE
                    if (nr<n) {
                        V(_(nr+1,n),_(1,nr)) = Zero;
                        V(_(1,nr),_(nr+1,n)) = Zero;
                        V(_(nr+1,n),_(nr+1,n)) = Zero;
                        V(_(nr+1,n),_(nr+1,n)).diag(0) = One;
                    }
1089                     CALL DORMQR( 'L','N',N,N,NR,work(2*N+1),N,work(N+1),
1090      $                      V,LDV,work(2*N+N*NR+NR+1),LWORK-2*N-N*NR-NR,IERR )
1091                 } else {
1092 //                  Last line of defense.
1093 //#:(               This is a rather pathological case: no scaled condition
//                  improvement after two pivoted QR factorizations. Another
//                  possibility is that the rank-revealing QR factorization
1096 //                  or the condition estimator has failed, or the COND_OK
1097 //                  is set very close to One (which is unnecessary). Normally,
1098 //                  this branch should never be executed, but in rare cases of
1099 //                  failure of the RRQR or condition estimator, the last line of
1100 //                  defense ensures that DGEJSV completes the task.
1101 //                  Compute the full SVD of L3 using DGESVJ with explicit
1102 //                  accumulation of Jacobi rotations.
1103                     CALL DGESVJ( 'L', 'U', 'V', NR, NR, V, LDV, SVA, NR, U,
1104      $                  LDU, work(2*N+N*NR+NR+1), LWORK-2*N-N*NR-NR, INFO )
1105                     SCALEM  = work(2*N+N*NR+NR+1)
1106                     numRank = IDNINT(work(2*N+N*NR+NR+2))
                    if (nr<n) {
                        V(_(nr+1,n),_(1,nr)) = Zero;
                        V(_(1,nr),_(nr+1,n)) = Zero;
                        V(_(nr+1,n),_(nr+1,n)) = Zero;
                        V(_(nr+1,n),_(nr+1,n)).diag(0) = One;
                    }
1112                     CALL DORMQR( 'L','N',N,N,NR,work(2*N+1),N,work(N+1),
1113      $                      V,LDV,work(2*N+N*NR+NR+1),LWORK-2*N-N*NR-NR,IERR )
1114 //
1115                     CALL DORMLQ( 'L', 'T', NR, NR, NR, work(2*N+1), N,
1116      $                      work(2*N+N*NR+1), U, LDU, work(2*N+N*NR+NR+1),
1117      $                      LWORK-2*N-N*NR-NR, IERR )
1118                     DO 773 q = 1, NR
1119                         DO 772 p = 1, NR
1120                             work(2*N+N*NR+NR+iwork(N+p)) = U(p,q)
1121  772                    CONTINUE
1122                         DO 774 p = 1, NR
1123                             U(p,q) = work(2*N+N*NR+NR+p)
1124  774                    CONTINUE
1125  773                CONTINUE
1126 //
1127                 }
1128 //
1129 //              Permute the rows of V using the (column) permutation from the
1130 //              first QRF. Also, scale the columns to make them unit in
1131 //              Euclidean norm. This applies to all cases.
1132 //
                tmp = sqrt(ElementType(n))*eps;
                for (IndexType q=1; q<=n; ++q) {
                    for (IndexType p=1; p<=n; ++p) {
                        work(2*n+n*nr+nr+iwork(p)) = V(p,q);
                    }
                    for (IndexType p=1; p<=n; ++p) {
                        V(p,q) = work(2*n+n*nr+nr+p);
                    }
                    xsc = One / blas::nrm2(V(_,q));
                    if (xsc<One-tmp || xsc>One+tmp) {
                        V(_,q) *= xsc;
                    }
                }
1145 //              At this moment, V contains the right singular vectors of A.
1146 //              Next, assemble the left singular vector matrix U (M x N).
                if (nr<m) {
                    U(_(nr+1,m),_(1,nr)) = Zero;
                    if (nr<nu) {
                        U(_(1,nr),_(nr+1,nu)) = Zero;
                        U(_(nr+1,m),_(nr+1,nu)) = Zero;
                        U(_(nr+1,m),_(nr+1,nu)).diag(0) = One;
                    }
                }
1154 //
1155 //              The Q matrix from the first QRF is built into the left singular
1156 //              matrix U. This applies to all cases.
1157 //
1158                 CALL DORMQR( 'Left', 'No_Tr', M, N1, N, A, LDA, work, U,
1159      $           LDU, work(N+1), LWORK-N, IERR )
1160 
1161 //              The columns of U are normalized. The cost is O(M*N) flops.
                tmp = sqrt(ElementType(m))*eps;
                for (IndexType p=1; p<=nr; ++p) {
                    xsc = One / blas::nrm2(U(_,p));
                    if (xsc<One-tmp || xsc>One+tmp) {
                        U(_,p) *= xsc;
                    }
                }
1168 //
1169 //              If the initial QRF is computed with row pivoting, the left
1170 //              singular vectors must be adjusted.
1171 //
                if (rowpiv) {
                    auto piv = iwork(_(2*n+1,2*n+m-1));
                    laswp(U, piv.reverse());
                }
1174 //
1175             } else {
1176 //
1177 //              .. the initial matrix A has almost orthogonal columns and
1178 //              the second QRF is not needed
1179 //
1180                 CALL DLACPY( 'Upper', N, N, A, LDA, work(N+1), N )
                if (l2pert) {
                    xsc = sqrt(small);
                    for (IndexType p=2; p<=n; ++p) {
                        tmp = xsc * work(n+(p-1)*n+p);
                        for (IndexType q=1; q<=p-1; ++q) {
                            work(n+(q-1)*n+p) = -sign(tmp, work(n+(p-1)*n+q));
                        }
                    }
                } else {
                    auto _work1 = work(_(n+1,n+n*n));
                    GeMatrixView<ElementType>  Work(n, n, _work1, n);
                    Work.strictLower() = Zero;
                }
1192 //
1193                 CALL DGESVJ( 'Upper', 'U', 'N', N, N, work(N+1), N, SVA,
1194      $           N, U, LDU, work(N+N*N+1), LWORK-N-N*N, INFO )
1195 //
1196                 SCALEM  = work(N+N*N+1)
1197                 numRank = IDNINT(work(N+N*N+2))
1198                 DO 6970 p = 1, N
1199                     CALL DCOPY( N, work(N+(p-1)*N+1), 1, U(1,p), 1 )
1200                     CALL DSCAL( N, SVA(p), work(N+(p-1)*N+1), 1 )
1201  6970           CONTINUE
1202 //
1203                 CALL DTRSM( 'Left', 'Upper', 'NoTrans', 'No UD', N, N,
1204      $               One, A, LDA, work(N+1), N )
1205                 DO 6972 p = 1, N
1206                     CALL DCOPY( N, work(N+p), N, V(iwork(p),1), LDV )
1207  6972           CONTINUE
                tmp = sqrt(ElementType(n))*eps;
                for (IndexType p=1; p<=n; ++p) {
                    xsc = One / blas::nrm2(V(_,p));
                    if (xsc<One-tmp || xsc>One+tmp) {
                        V(_,p) *= xsc;
                    }
                }
1214 //
1215 //              Assemble the left singular vector matrix U (M x N).
1216 //
                if (n<m) {
                    U(_(n+1,m),_(1,n)) = Zero;
                    if (n<nu) {
                        U(_(1,n),_(n+1,nu)) = Zero;
                        U(_(n+1,m),_(n+1,nu)) = Zero;
                        U(_(n+1,m),_(n+1,nu)).diag(0) = One;
                    }
                }
1224                 CALL DORMQR( 'Left', 'No Tr', M, N1, N, A, LDA, work, U,
1225      $           LDU, work(N+1), LWORK-N, IERR )
                tmp = sqrt(ElementType(m))*eps;
                for (IndexType p=1; p<=nu; ++p) {
                    xsc = One / blas::nrm2(U(_,p));
                    if (xsc<One-tmp || xsc>One+tmp) {
                        U(_,p) *= xsc;
                    }
                }
1232 //
                if (rowpiv) {
                    auto piv = iwork(_(2*n+1,2*n+m-1));
                    laswp(U, piv.reverse());
                }
1235 //
1236             }
1237 //
1238 //          end of the  >> almost orthogonal case <<  in the full SVD
1239 //
1240         } else {
1241 //
1242 //          This branch deploys a preconditioned Jacobi SVD with explicitly
1243 //          accumulated rotations. It is included as optional, mainly for
//          experimental purposes. It does perform well and can also be used.
1245 //          In this implementation, this branch will be automatically activated
1246 //          if the  condition number sigma_max(A) / sigma_min(A) is predicted
1247 //          to be greater than the overflow threshold. This is because the
1248 //          a posteriori computation of the singular vectors assumes robust
1249 //          implementation of BLAS and some LAPACK procedures, capable of
1250 //          working in presence of extreme values. Since that is not always
1251 //          the case, ...
1252 //
1253             DO 7968 p = 1, NR
1254                 CALL DCOPY( N-p+1, A(p,p), LDA, V(p,p), 1 )
1255  7968       CONTINUE
1256 //
            if (l2pert) {
                xsc = sqrt(small/eps);
1259                 for (IndexType q=1; q<=nr; ++q) {
1260                     tmp = xsc*abs(V(q,q));
1261                     for (IndexType p=1; p<=n; ++p) {
1262                         if (p>q && abs(V(p,q))<=tmp || p<q) {
1263                             V(p,q) = sign(tmp, V(p,q));
1264                         }
1265                         if (p<q) {
1266                             V(p,q) = - V(p,q);
1267                         }
1268                     }
1269                 }
            } else {
                V(_(1,nr),_(1,nr)).strictUpper() = Zero;
            }
1273 
1274             CALL DGEQRF( N, NR, V, LDV, work(N+1), work(2*N+1),
1275      $        LWORK-2*N, IERR )
1276             CALL DLACPY( 'L', N, NR, V, LDV, work(2*N+1), N )
1277 //
1278             DO 7969 p = 1, NR
1279                 CALL DCOPY( NR-p+1, V(p,p), LDV, U(p,p), 1 )
1280  7969       CONTINUE
1281 
            if (l2pert) {
                xsc = sqrt(small/eps);
                for (IndexType q=2; q<=nr; ++q) {
                    for (IndexType p=1; p<=q-1; ++p) {
                        tmp = xsc * min(abs(U(p,p)), abs(U(q,q)));
                        U(p,q) = -sign(tmp, U(q,p));
                    }
                }
            } else {
                U(_(1,nr),_(1,nr)).strictUpper() = Zero;
            }
1293 
1294             CALL DGESVJ( 'G', 'U', 'V', NR, NR, U, LDU, SVA,
1295      $        N, V, LDV, work(2*N+N*NR+1), LWORK-2*N-N*NR, INFO )
1296             SCALEM  = work(2*N+N*NR+1)
1297             numRank = IDNINT(work(2*N+N*NR+2))
1298 
            if (nr<n) {
                V(_(nr+1,n),_(1,nr)) = Zero;
                V(_(1,nr),_(nr+1,n)) = Zero;
                V(_(nr+1,n),_(nr+1,n)) = Zero;
                V(_(nr+1,n),_(nr+1,n)).diag(0) = One;
            }
1304 
1305             CALL DORMQR( 'L','N',N,N,NR,work(2*N+1),N,work(N+1),
1306      $        V,LDV,work(2*N+N*NR+NR+1),LWORK-2*N-N*NR-NR,IERR )
1307 //
1308 //          Permute the rows of V using the (column) permutation from the
1309 //          first QRF. Also, scale the columns to make them unit in
1310 //          Euclidean norm. This applies to all cases.
1311 //
            tmp = sqrt(ElementType(n))*eps;
            for (IndexType q=1; q<=n; ++q) {
                for (IndexType p=1; p<=n; ++p) {
                    work(2*n+n*nr+nr+iwork(p)) = V(p,q);
                }
                for (IndexType p=1; p<=n; ++p) {
                    V(p,q) = work(2*n+n*nr+nr+p);
                }
                xsc = One / blas::nrm2(V(_,q));
                if (xsc<One-tmp || xsc>One+tmp) {
                    V(_,q) *= xsc;
                }
            }
1324 //
1325 //          At this moment, V contains the right singular vectors of A.
1326 //          Next, assemble the left singular vector matrix U (M x N).
1327 //
1328             if (NR<M) {
1329                 CALL DLASET( 'A',  M-NR, NR, Zero, Zero, U(NR+1,1), LDU )
1330                 if (NR<N1) {
1331                     CALL DLASET( 'A',NR,  N1-NR, Zero, Zero,  U(1,NR+1),LDU )
1332                     CALL DLASET( 'A',M-NR,N1-NR, Zero, One,U(NR+1,NR+1),LDU )
1333                 }
1334             }
1335 
1336             CALL DORMQR( 'Left', 'No Tr', M, N1, N, A, LDA, work, U,
1337      $        LDU, work(N+1), LWORK-N, IERR )
1338 
            if (ROWPIV) {
                CALL DLASWP( N1, U, LDU, 1, M-1, iwork(2*N+1), -1 )
            }
1341 
1342 
1343         }
1344         if ( TRANSP ) {
1345 //          .. swap U and V because the procedure worked on A^t
1346             for (IndexType p=1; p<=n; ++p) {
1347                 blas::swap(U(_,p),V(_,p));
1348             }
1349         }
1350 
1351     }
1352 //  end of the full SVD
1353 //
1354 //  Undo scaling, if necessary (and possible)
1355 //
1356     if ( USCAL2 <= (BIG/SVA(1))*USCAL1 ) {
1357         CALL DLASCL( 'G', 0, 0, USCAL1, USCAL2, NR, 1, SVA, N, IERR )
        USCAL1 = One;
        USCAL2 = One;
1360     }
1361 
    if (nr<n) {
1363         sva(_(nr+1,n)) = Zero;
1364     }
1365 
    work(1) = USCAL2 * SCALEM;
    work(2) = USCAL1;
    if ( errest ) {
        work(3) = SCONDA;
    }
    if ( lsvec && rsvec ) {
        work(4) = condr1;
        work(5) = condr2;
    }
    if ( L2TRAN ) {
        work(6) = entra;
        work(7) = entrat;
    }

    iwork(1) = NR;
    iwork(2) = numRank;
    iwork(3) = warning;
1381 }
1382 */
//== interface for native lapack ===============================================
1384 #ifdef CHECK_CXXLAPACK
1385 
template <typename MA, typename VSVA, typename MU, typename MV,
          typename VWORK, typename VIWORK>
1388 typename GeMatrix<MA>::IndexType
1389 jsv_native(JSV::Accuracy             accuracy,
1390            JSV::JobU                 jobU,
1391            JSV::JobV                 jobV,
1392            bool                      restrictedRange,
1393            bool                      considerTransA,
1394            bool                      perturb,
1395            GeMatrix<MA>              &A,
1396            DenseVector<VSVA>         &sva,
1397            GeMatrix<MU>              &U,
1398            GeMatrix<MV>              &V,
1399            DenseVector<VWORK>        &work,
1400            DenseVector<VIWORK>       &iwork)
1401 {
1402     typedef typename GeMatrix<MA>::ElementType  T;
1403 
1404     const char       JOBA = char(accuracy);
1405     const char       JOBU = char(jobU);
1406     const char       JOBV = char(jobV);
1407     const char       JOBR = (restrictedRange) ? 'R' : 'N';
1408     const char       JOBT = (considerTransA) ? 'T' : 'N';
1409     const char       JOBP = (perturb) ? 'P' : 'N';
1410     const INTEGER    M = A.numRows();
1411     const INTEGER    N = A.numCols();
1412     const INTEGER    LDA = A.leadingDimension();
1413     const INTEGER    LDU = U.leadingDimension();
1414     const INTEGER    LDV = V.leadingDimension();
1415     const INTEGER    LWORK = work.length();
1416     INTEGER          INFO;
1417 
1418     if (IsSame<T,DOUBLE>::value) {
1419         LAPACK_IMPL(dgejsv)(&JOBA,
1420                             &JOBU,
1421                             &JOBV,
1422                             &JOBR,
1423                             &JOBT,
1424                             &JOBP,
1425                             &M,
1426                             &N,
1427                             A.data(),
1428                             &LDA,
1429                             sva.data(),
1430                             U.data(),
1431                             &LDU,
1432                             V.data(),
1433                             &LDV,
1434                             work.data(),
1435                             &LWORK,
1436                             iwork.data(),
1437                             &INFO);
1438     } else {
1439         ASSERT(0);
1440     }
1441     return INFO;
1442 }
1443 
1444 #endif // CHECK_CXXLAPACK
1445 
1446 
1447 //== public interface ==========================================================
1448 template <typename MA, typename VSVA, typename MU, typename MV,
1449           typename VWORK, typename VIWORK>
1450 typename GeMatrix<MA>::IndexType
1451 jsv(JSV::Accuracy             accuracy,
1452     JSV::JobU                 jobU,
1453     JSV::JobV                 jobV,
1454     bool                      restrictedRange,
1455     bool                      considerTransA,
1456     bool                      perturb,
1457     GeMatrix<MA>              &A,
1458     DenseVector<VSVA>         &sva,
1459     GeMatrix<MU>              &U,
1460     GeMatrix<MV>              &V,
1461     DenseVector<VWORK>        &work,
1462     DenseVector<VIWORK>       &iwork)
1463 {
1464 //
1465 //  Test the input parameters
1466 //
1467 #   ifndef NDEBUG
1468     typedef typename GeMatrix<MA>::IndexType    IndexType;
1469 
1470     ASSERT(A.firstRow()==1);
1471     ASSERT(A.firstCol()==1);
1472 
1473     const IndexType m = A.numRows();
1474     const IndexType n = A.numCols();
1475 
1476     ASSERT(m>=n);
1477 
1478     ASSERT(sva.firstIndex()==1);
1479     ASSERT(sva.length()==n);
1480 
1481     ASSERT(U.firstRow()==1);
1482     ASSERT(U.firstCol()==1);
1483 
1484     ASSERT(V.firstRow()==1);
1485     ASSERT(V.firstCol()==1);
1486 
1487     ASSERT(iwork.length()==m+3*n);
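
//  Note: the dimensions of U and V are not checked here; they must satisfy
//  the DGEJSV requirements for the chosen jobU/jobV (for a full SVD this
//  typically means U with at least m rows and V being n x n).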
1488 #   endif
1489 
1490 //
1491 //  Make copies of output arguments
1492 //
1493 #   ifdef CHECK_CXXLAPACK
1494     typename GeMatrix<MA>::NoView        A_org     = A;
1495     typename DenseVector<VSVA>::NoView   sva_org   = sva;
1496     typename GeMatrix<MU>::NoView        U_org     = U;
1497     typename GeMatrix<MV>::NoView        V_org     = V;
1498     typename DenseVector<VWORK>::NoView  work_org  = work;
1499 #   endif
1500 
1501 //
1502 //  Call implementation
1503 //
1504     /*
1505     IndexType info = jsv_generic(accuracy, jobU, jobV,
1506                                  restrictedRange, considerTransA, perturb,
1507                                  A, sva, U, V, work);
1508     */
1509     IndexType info = jsv_native(accuracy, jobU, jobV,
1510                                 restrictedRange, considerTransA, perturb,
1511                                 A, sva, U, V, work, iwork);
1512 
1513 #   ifdef CHECK_CXXLAPACK
1514 //
1515 //  Make copies of results computed by the generic implementation
1516 //
1517     typename GeMatrix<MA>::NoView        A_generic     = A;
1518     typename DenseVector<VSVA>::NoView   sva_generic   = sva;
1519     typename GeMatrix<MU>::NoView        U_generic     = U;
1520     typename GeMatrix<MV>::NoView        V_generic     = V;
1521     typename DenseVector<VWORK>::NoView  work_generic  = work;
1522 //
1523 //  restore output arguments
1524 //
1525     A    = A_org;
1526     sva  = sva_org;
1527     U    = U_org;
1528     V    = V_org;
1529     work = work_org;
1530 //
1531 //  Compare generic results with results from the native implementation
1532 //
1533     IndexType _info = jsv_native(accuracy, jobU, jobV,
1534                                  restrictedRange, considerTransA, perturb,
1535                                  A, sva, U, V, work, iwork);
1536     bool failed = false;
    if (! isIdentical(A_generic, A, "A_generic", "A")) {
1538         std::cerr << "CXXLAPACK: A_generic = " << A_generic << std::endl;
1539         std::cerr << "F77LAPACK: A = " << A << std::endl;
1540         failed = true;
1541     }
    if (! isIdentical(sva_generic, sva, "sva_generic", "sva")) {
1543         std::cerr << "CXXLAPACK: sva_generic = " << sva_generic << std::endl;
1544         std::cerr << "F77LAPACK: sva = " << sva << std::endl;
1545         failed = true;
1546     }
    if (! isIdentical(U_generic, U, "U_generic", "U")) {
1548         std::cerr << "CXXLAPACK: U_generic = " << U_generic << std::endl;
1549         std::cerr << "F77LAPACK: U = " << U << std::endl;
1550         failed = true;
1551     }
    if (! isIdentical(V_generic, V, "V_generic", "V")) {
1553         std::cerr << "CXXLAPACK: V_generic = " << V_generic << std::endl;
1554         std::cerr << "F77LAPACK: V = " << V << std::endl;
1555         failed = true;
1556     }
    if (! isIdentical(work_generic, work, "work_generic", "work")) {
1558         std::cerr << "CXXLAPACK: work_generic = " << work_generic << std::endl;
1559         std::cerr << "F77LAPACK: work = " << work << std::endl;
1560         failed = true;
1561     }
    if (! isIdentical(info, _info, "info", "_info")) {
1563         std::cerr << "CXXLAPACK: info = " << info << std::endl;
1564         std::cerr << "F77LAPACK: _info = " << _info << std::endl;
1565         failed = true;
1566     }
1567 
1568     if (failed) {
1569         std::cerr << "error in: jsv.tcc " << std::endl;
1570         ASSERT(0);
1571     } else {
1572         std::cerr << "passed: jsv.tcc " << std::endl;
1573     }
1574 #   endif
1575 
1576     return info;
1577 }
1578 
1579 //-- forwarding ----------------------------------------------------------------
1580 template <typename MA, typename VSVA, typename MU, typename MV,
          typename VWORK, typename VIWORK>
1582 typename MA::IndexType
1583 jsv(JSV::Accuracy             accuracy,
1584     JSV::JobU                 jobU,
1585     JSV::JobV                 jobV,
1586     bool                      restrictedRange,
1587     bool                      considerTransA,
1588     bool                      perturb,
1589     MA                        &&A,
1590     VSVA                      &&sva,
1591     MU                        &&U,
1592     MV                        &&V,
1593     VWORK                     &&work,
1594     VIWORK                    &&iwork)
1595 {
1596     typename MA::IndexType info;
1597 
1598     CHECKPOINT_ENTER;
1599     info = jsv(accuracy, jobU, jobV,
1600                restrictedRange, considerTransA, perturb,
1601                A, sva, U, V, work, iwork);
1602     CHECKPOINT_LEAVE;
1603 
1604     return info;
1605 }
1606 
1607 } } // namespace lapack, flens
1608 
1609 #endif // FLENS_LAPACK_SVD_JSV_TCC