1 /*
   2  *   Copyright (c) 2012, Michael Lehn
   3  *
   4  *   All rights reserved.
   5  *
   6  *   Redistribution and use in source and binary forms, with or without
   7  *   modification, are permitted provided that the following conditions
   8  *   are met:
   9  *
  10  *   1) Redistributions of source code must retain the above copyright
  11  *      notice, this list of conditions and the following disclaimer.
  12  *   2) Redistributions in binary form must reproduce the above copyright
  13  *      notice, this list of conditions and the following disclaimer in
  14  *      the documentation and/or other materials provided with the
  15  *      distribution.
  16  *   3) Neither the name of the FLENS development group nor the names of
  17  *      its contributors may be used to endorse or promote products derived
  18  *      from this software without specific prior written permission.
  19  *
  20  *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  21  *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  22  *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  23  *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
  24  *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  25  *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  26  *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  27  *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  28  *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  29  *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  30  *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  31  */
  32 
  33 /* Based on
  34  *
  35        SUBROUTINE DGEJSV( JOBA, JOBU, JOBV, JOBR, JOBT, JOBP,
  36       $                   M, N, A, LDA, SVA, U, LDU, V, LDV,
  37       $                   WORK, LWORK, IWORK, INFO )
  38  *
  39  *  -- LAPACK routine (version 3.3.1)                                    --
  40  *
  41  *  -- Contributed by Zlatko Drmac of the University of Zagreb and     --
  42  *  -- Kresimir Veselic of the Fernuniversitaet Hagen                  --
  43  *  -- April 2011                                                      --
  44  *
  45  *  -- LAPACK is a software package provided by Univ. of Tennessee,    --
  46  *  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
  47  *
 * This routine is also part of SIGMA (version 1.23, October 23, 2008).
 * SIGMA is a library of highly accurate algorithms for the computation
 * of the SVD, PSVD, QSVD, (H,K)-SVD, and for the solution of the
 * eigenvalue problems Hx = lambda M x, H M x = lambda x with H, M > 0.
  52  *
  53  */
  54 
  55 #ifndef FLENS_LAPACK_SVD_JSV_TCC
  56 #define FLENS_LAPACK_SVD_JSV_TCC 1
  57 
  58 #include <flens/blas/blas.h>
  59 #include <flens/lapack/lapack.h>
  60 
  61 namespace flens { namespace lapack {
  62 
  63 //== generic lapack implementation =============================================
  64 /*
  65 template <typename MA, typename VSVA, typename MU, typename MV,
  66           typename VWORK, typename VIWORK>
  67 typename GeMatrix<MA>::IndexType
  68 jsv_generic(JSV::Accuracy             accuracy,
  69             JSV::JobU                 jobU,
  70             JSV::JobV                 jobV,
  71             bool                      restrictedRange,
  72             bool                      considerTransA,
  73             bool                      perturb,
  74             GeMatrix<MA>              &A,
  75             DenseVector<VSVA>         &sva,
  76             GeMatrix<MU>              &U,
  77             GeMatrix<MV>              &V,
  78             DenseVector<VWORK>        &work,
  79             DenseVector<VIWORK>       &iwork)
  80 {
    using std::abs;
    using std::log;
    using std::max;
    using std::min;
    using std::pow;
    using std::sqrt;
    using std::swap;

    typedef typename GeMatrix<MA>::ElementType  ElementType;
    typedef typename GeMatrix<MA>::IndexType    IndexType;

    const ElementType Zero(0), One(1);

    const Underscore<IndexType>  _;

    const IndexType m     = A.numRows();
    const IndexType n     = A.numCols();
    const IndexType nu    = U.numCols();
    const IndexType lWork = work.length();

//  Scalar work variables used throughout the routine
    ElementType aapp, aaqq, xsc, tmp;
    ElementType entra, entrat, sconda, condr1, condr2, cond_ok;
    ElementType uScale1, uScale2;
    IndexType   numRank, warning;
  92 
    bool       lsvec  = (jobU==ComputeU) || (jobU==FullsetU);
    bool       jracc  = (char(jobV)=='J');
    bool       rsvec  = (char(jobV)=='V') || jracc;
    bool       rowpiv = (char(accuracy)=='F') || (char(accuracy)=='G');
    bool       l2rank = (char(accuracy)=='R');
    const bool l2aber = (char(accuracy)=='A');
    const bool errest = (char(accuracy)=='E') || (char(accuracy)=='G');
    bool       l2tran = considerTransA;
    bool       l2kill = restrictedRange;
    const bool defr   = !restrictedRange;
    bool       l2pert = perturb;
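//
//  Meaning of the flags (cf. the DGEJSV documentation): lsvec/rsvec
//  request the left/right singular vectors, jracc selects the variant
//  with explicitly accumulated Jacobi rotations, rowpiv enables the
//  Bjoerck-style row pivoting, l2rank/l2aber/errest control the rank
//  decisions and the condition number estimate, l2tran allows working
//  with A^t instead of A, l2kill permits flushing of tiny columns, and
//  l2pert allows an artificial perturbation that suppresses denormals.
//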
 104 
 105     IndexType info = 0;
 106 
 107 //
 108 //  Quick return for void matrix (Y3K safe)
 109 //#:)
 110     if (m==0 || n==0) {
 111         return info;
 112     }
 113 //
 114 //  Set numerical parameters
 115 //
 116 //! NOTE: Make sure DLAMCH() does not fail on the target architecture.
 117 //
 118     const ElementType eps     = lamch<ElementType>(Eps);
 119     const ElementType safeMin = lamch<ElementType>(SafeMin);
 120     const ElementType small   = safeMin / eps;
 121     const ElementType big     = lamch<ElementType>(OverflowThreshold);
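//
//  Here eps is the machine precision, safeMin the smallest positive
//  number whose reciprocal does not overflow, and small/big bracket the
//  range of column norms that can be handled without harmful underflow
//  or overflow.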
 122 //
 123 //  Initialize SVA(1:N) = diag( ||A e_i||_2 )_1^N
 124 //
 125 //! If necessary, scale SVA() to protect the largest norm from
 126 //  overflow. It is possible that this scaling pushes the smallest
 127 //  column norm left from the underflow threshold (extreme case).
 128 //
 129     ElementType scaleM = One / sqrt(ElementType(m)*ElementType(n));
 130     bool noScale = true;
 131     bool goScale = true;
 132     for (IndexType p=1; p<=n; ++p) {
 133         aapp = Zero;
 134         aaqq = One;
 135         lassq(A(_,p), aapp, aaqq);
 136         if (aapp>big) {
 137             return -9;
 138         }
 139         aaqq = sqrt(aaqq);
        if (aapp<(big/aaqq) && noScale) {
            sva(p)  = aapp * aaqq;
        } else {
            noScale = false;
            sva(p)  = aapp * (aaqq * scaleM);
 145             if (goScale) {
 146                 goScale = false;
 147                 sva(_(1,p)) *= scaleM;
 148             }
 149         }
 150     }
 151 
 152     if (noScale) {
 153         scaleM = One;
 154     }
 155 
 156     aapp = Zero;
 157     aaqq = big;
    for (IndexType p=1; p<=n; ++p) {
 159         aapp = max(aapp, sva(p));
 160         if (sva(p)!=Zero) {
 161             aaqq = min(aaqq, sva(p));
 162         }
 163     }
 164 //
 165 //    Quick return for zero M x N matrix
 166 //#:)
 167     if (aapp==Zero) {
 168         if (lsvec) {
 169             U = Zero;
 170             U.diag(0) = One;
 171         }
 172         if (rsvec) {
 173             V = Zero;
 174             V.diag(0) = One;
 175         }
 176         work(1) = One;
 177         work(2) = One;
 178         if (errest) {
 179             work(3) = One;
 180         }
 181         if (lsvec && rsvec) {
 182             work(4) = One;
 183             work(5) = One;
 184         }
 185         if (l2tran) {
 186             work(6) = Zero;
 187             work(7) = Zero;
 188         }
 189         iwork(1) = 0;
 190         iwork(2) = 0;
 191         iwork(3) = 0;
 192         return info;
 193     }
 194 //
 195 //  Issue warning if denormalized column norms detected. Override the
 196 //  high relative accuracy request. Issue licence to kill columns
 197 //  (set them to zero) whose norm is less than sigma_max / BIG (roughly).
 198 //#:(
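//  (The value of 'warning' is returned in iwork(3) at the end of the
//  routine.)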
    warning = 0;
    if (aaqq<=safeMin) {
 201         l2rank = true;
 202         l2kill = true;
 203         warning = 1;
 204     }
 205 //
 206 //  Quick return for one-column matrix
 207 //#:)
 208     auto U1 = U(_,_(1,n));
 209     if (n==1) {
 210 
 211         if (lsvec) {
 212             lascl(LASCL::FullMatrix, 0, 0, sva(1), scaleM, A);
 213             U1 = A;
 214 //          computing all M left singular vectors of the M x 1 matrix
 215             if (nu!=n) {
 216                 auto tau   = work(_(1,n));
 217                 auto _work = work(_(n+1,lWork));
 218                 qrf(U1, tau, _work);
 219                 orgqr(1, U, tau, _work);
 220                 U1 = A;
 221             }
 222         }
 223         if (rsvec) {
            V(1,1) = One;
 225         }
 226         if (sva(1)<big*scaleM) {
 227             sva(1) /= scaleM;
            scaleM = One;
 229         }
 230         work(1) = One / scaleM;
 231         work(2) = One;
 232         if (sva(1)!=Zero) {
 233             iwork(1) = 1;
 234             if (sva(1)/scaleM>=safeMin) {
 235                 iwork(2) = 1;
 236             } else {
 237                 iwork(2) = 0;
 238             }
 239         } else {
 240             iwork(1) = 0;
 241             iwork(2) = 0;
 242         }
 243         if (errest) {
 244             work(3) = One;
 245         }
 246         if (lsvec && rsvec) {
 247             work(4) = One;
 248             work(5) = One;
 249         }
 250         if (l2tran) {
 251             work(6) = Zero;
 252             work(7) = Zero;
 253         }
 254         return info;
 255 
 256     }
 257 
 258     bool transp = false;
    l2tran = l2tran && (m==n);
 260 
 261     ElementType aatMax = -One;
 262     ElementType aatMin = big;
 263     if (rowpiv || l2tran) {
 264 //
 265 //      Compute the row norms, needed to determine row pivoting sequence
 266 //      (in the case of heavily row weighted A, row pivoting is strongly
 267 //      advised) and to collect information needed to compare the
//      structures of A * A^t and A^t * A (in the case l2tran==true).
 269 //
 270         if (l2tran) {
 271             for (IndexType p=1; p<=m; ++p) {
 272                 xsc = Zero;
 273                 tmp = One;
 274                 lassq(A(p,_), xsc, tmp);
 275 //              DLASSQ gets both the ell_2 and the ell_infinity norm
 276 //              in one pass through the vector
 277                 work(m+n+p) = xsc * scaleM;
                work(n+p)   = xsc * (scaleM*sqrt(tmp));
 279                 aatMax      = max(aatMax, work(n+p));
 280                 if (work(n+p)!=Zero) {
 281                     aatMin = min(aatMin, work(n+p));
 282                 }
 283             }
 284         } else {
 285             for (IndexType p=1; p<=m; ++p) {
 286                 const IndexType jp = blas::iamax(A(p,_));
 287                 work(m+n+p) = scaleM*abs(A(p,jp));
 288                 aatMax = max(aatMax, work(m+n+p));
 289                 aatMin = min(aatMin, work(m+n+p));
 290             }
 291         }
 292 
 293     }
 294 //
 295 //  For square matrix A try to determine whether A^t  would be  better
 296 //  input for the preconditioned Jacobi SVD, with faster convergence.
 297 //  The decision is based on an O(N) function of the vector of column
 298 //  and row norms of A, based on the Shannon entropy. This should give
 299 //  the right choice in most cases when the difference actually matters.
 300 //  It may fail and pick the slower converging side.
 301 //
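//  With p_i = sva(i)^2 / ||A||_F^2 the column entropy computed below is
//  H_col = -sum_i p_i*log(p_i) / log(n); the analogous row entropy is
//  computed from the row norms stored in work(n+1:n+m) and normalized
//  by log(m).
//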
    entra  = Zero;
    entrat = Zero;
 304     if (l2tran) {
 305 
 306         xsc = Zero;
 307         tmp = One;
 308         lassq(sva, xsc, tmp);
 309         tmp = One / tmp;
 310 
        entra = Zero;
 312         for (IndexType p=1; p<=n; ++p) {
 313             const ElementType _big = pow(sva(p)/xsc, 2) * tmp;
 314             if (_big!=Zero) {
 315                 entra += _big * log(_big);
 316             }
 317         }
 318         entra = - entra / log(ElementType(n));
 319 //
 320 //      Now, SVA().^2/Trace(A^t * A) is a point in the probability simplex.
 321 //      It is derived from the diagonal of  A^t * A.  Do the same with the
 322 //      diagonal of A * A^t, compute the entropy of the corresponding
 323 //      probability distribution. Note that A * A^t and A^t * A have the
 324 //      same trace.
 325 //
        entrat = Zero;
 327         for (IndexType p=n+1; p<=n+m; ++p) {
 328             const ElementType _big = pow(work(p)/xsc, 2) * tmp;
 329             if (_big!=Zero) {
 330                 entrat += _big * log(_big);
 331             }
 332         }
 333         entrat = -entrat / log(ElementType(m));
 334 //
 335 //      Analyze the entropies and decide A or A^t. Smaller entropy
 336 //      usually means better input for the algorithm.
 337 //
 338         transp = entrat<entra;
 339 //
 340 //      If A^t is better than A, transpose A.
 341 //
 342         if (transp) {
 343 //          In an optimal implementation, this trivial transpose
 344 //          should be replaced with faster transpose.
 345 //          TODO: in-place transpose:
 346 //                transpose(A);
 347             for (IndexType p=1; p<=n-1; ++p) {
 348                 for (IndexType q=p+1; q<=n; ++q) {
 349                     swap(A(q,p), A(p,q));
 350                 }
 351             }
 352             for (IndexType p=1; p<=n; ++p) {
 353                 work(m+n+p) = sva(p);
 354                 sva(p)      = work(n+p);
 355             }
 356             swap(aapp, aatMax);
 357             swap(aaqq, aatMin);
 358             swap(lsvec, rsvec);
 359             if (lsvec) {
 360                 // Lehn: transposing A is only considered if A is square,
 361                 //       so in this case m==n and therefore nu==n.  Or am I
 362                 //       wrong??
 363                 ASSERT(nu==n);
 364             }
 365 
 366             rowpiv = true;
 367         }
 368 
 369     }
 370 //
 371 //  Scale the matrix so that its maximal singular value remains less
 372 //  than DSQRT(BIG) -- the matrix is scaled so that its maximal column
 373 //  has Euclidean norm equal to DSQRT(BIG/N). The only reason to keep
 374 //  DSQRT(BIG) instead of BIG is the fact that DGEJSV uses LAPACK and
 375 //  BLAS routines that, in some implementations, are not capable of
 376 //  working in the full interval [SFMIN,BIG] and that they may provoke
 377 //  overflows in the intermediate results. If the singular values spread
 378 //  from SFMIN to BIG, then DGESVJ will compute them. So, in that case,
 379 //  one should use DGESVJ instead of DGEJSV.
 380 //
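//  Concretely: sva is rescaled by tmp/aapp with tmp = sqrt(big/n), and A
//  by scaleM*sqrt(big/n)/aapp, so that afterwards the largest column norm
//  of the scaled A is roughly sqrt(big/n).
//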
 381     const ElementType bigRoot = sqrt(big);
 382     tmp = sqrt(big/ElementType(n));
 383 
 384     lascl(LASCL::FullMatrix, 0, 0, aapp, tmp, sva);
 385     if (aaqq>aapp*safeMin) {
 386         aaqq = (aaqq/aapp) * tmp;
 387     } else {
 388         aaqq = (aaqq*tmp) / aapp;
 389     }
 390     tmp *= scaleM;
 391     lascl(LASCL::FullMatrix, 0, 0, aapp, tmp, A);
 392 //
 393 //  To undo scaling at the end of this procedure, multiply the
 394 //  computed singular values with USCAL2 / USCAL1.
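//  That is, the singular values of the original matrix are recovered as
//  sigma_i = (uScale2/uScale1) * sva(i).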
 395 //
 396     uScale1 = tmp;
 397     uScale2 = aapp;
 398 
 399     if (l2kill) {
 400 //      L2KILL enforces computation of nonzero singular values in
 401 //      the restricted range of condition number of the initial A,
 402 //      sigma_max(A) / sigma_min(A) approx. DSQRT(BIG)/DSQRT(SFMIN).
 403         xsc = sqrt(safeMin);
 404     } else {
 405         xsc = small;
 406 //
 407 //      Now, if the condition number of A is too big,
 408 //      sigma_max(A) / sigma_min(A)>DSQRT(BIG/N) * EPSLN / SFMIN,
//      as a precautionary measure, the full SVD is computed using DGESVJ
 410 //      with accumulated Jacobi rotations. This provides numerically
 411 //      more robust computation, at the cost of slightly increased run
 412 //      time. Depending on the concrete implementation of BLAS and LAPACK
 413 //      (i.e. how they behave in presence of extreme ill-conditioning) the
 414 //      implementor may decide to remove this switch.
 415         if (aaqq<sqrt(safeMin) && lsvec && rsvec ) {
 416             jracc = true;
 417         }
 418 
 419     }
 420     if (aaqq<xsc) {
 421         for (IndexType p=1; p<=n; ++p) {
 422             if (sva(p)<xsc) {
 423                 A(_,p) = Zero;
 424                 sva(p) = Zero;
 425             }
 426         }
 427     }
 428 //
 429 //  Preconditioning using QR factorization with pivoting
 430 //
 431     if (rowpiv) {
 432 //      Optional row permutation (Bjoerck row pivoting):
 433 //      A result by Cox and Higham shows that the Bjoerck's
 434 //      row pivoting combined with standard column pivoting
 435 //      has similar effect as Powell-Reid complete pivoting.
 436 //      The ell-infinity norms of A are made nonincreasing.
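//      The row interchanges are recorded in iwork(2*n+1:2*n+m-1) so that
//      they can be undone on the rows of U after the left singular
//      vectors have been assembled (see the laswp with reversed pivots
//      further below).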
 437         for (IndexType p=1; p<=m-1; ++p) {
 438             const IndexType q = blas::iamax(work(_(m+n+p,2*m+n))) + p - 1;
 439             iwork(2*n+p) = q;
 440             if (p!=q) {
 441                 swap(work(m+n+p), work(m+n+q));
 442             }
 443         }
 444         laswp(A, iwork(_(2*n+1, 2*n+m-1)));
 445     }
 446 //
 447 //  End of the preparation phase (scaling, optional sorting and
 448 //  transposing, optional flushing of small columns).
 449 //
 450 //  Preconditioning
 451 //
 452 //  If the full SVD is needed, the right singular vectors are computed
 453 //  from a matrix equation, and for that we need theoretical analysis
 454 //  of the Businger-Golub pivoting. So we use DGEQP3 as the first RR QRF.
 455 //  In all other cases the first RR QRF can be chosen by other criteria
//  (e.g. speed by replacing global with restricted window pivoting, such
//  as in SGEQPX from TOMS #782). Good results will be obtained using
//  SGEQPX with properly (!) chosen numerical parameters.
//  Any improvement of DGEQP3 improves the overall performance of DGEJSV.
 460 //
 461 //  A * P1 = Q1 * [ R1^t 0]^t:
 462     auto _tau   = work(_(1,n));
 463     auto _work  = work(_(n+1,lWork));
 464     auto _iwork = iwork(_(1,n));
 465 
 466 //      .. all columns are free columns
 467     _iwork = 0;
 468 
 469     qp3(A, _iwork, _tau, _work);
 470 //
 471 //  The upper triangular matrix R1 from the first QRF is inspected for
 472 //  rank deficiency and possibilities for deflation, or possible
 473 //  ill-conditioning. Depending on the user specified flag L2RANK,
 474 //  the procedure explores possibilities to reduce the numerical
 475 //  rank by inspecting the computed upper triangular factor. If
 476 //  L2RANK or l2aber are up, then DGEJSV will compute the SVD of
 477 //  A + dA, where ||dA|| <= f(M,N)*EPSLN.
 478 //
 479     IndexType nr = 1;
 480     if (l2aber) {
 481 //      Standard absolute error bound suffices. All sigma_i with
 482 //      sigma_i < N*EPSLN*||A|| are flushed to zero. This is an
//      aggressive enforcement of lower numerical rank by introducing a
 484 //      backward error of the order of N*EPSLN*||A||.
 485         tmp = sqrt(ElementType(n))*eps;
 486         for (IndexType p=2; p<=n; ++p) {
 487             if (abs(A(p,p))>=tmp*abs(A(1,1))) {
 488                 ++nr;
 489             } else {
 490                 break;
 491             }
 492         }
 493     } else if (l2rank) {
//      .. similarly as above, only slightly more gentle (less aggressive).
//      A sudden drop on the diagonal of R1 is used as the criterion for
//      being close to rank deficient.
 497         tmp = sqrt(safeMin);
 498         for (IndexType p=2; p<=n; ++p) {
 499             if (abs(A(p,p))<eps*abs(A(p-1,p-1)) || abs(A(p,p))<small
 500              || (l2kill && abs(A(p,p))<tmp))
 501             {
 502                 break;
 503             }
 504             ++nr;
 505         }
 506 
 507     } else {
 508 //      The goal is high relative accuracy. However, if the matrix
 509 //      has high scaled condition number the relative accuracy is in
 510 //      general not feasible. Later on, a condition number estimator
 511 //      will be deployed to estimate the scaled condition number.
 512 //      Here we just remove the underflowed part of the triangular
 513 //      factor. This prevents the situation in which the code is
 514 //      working hard to get the accuracy not warranted by the data.
 515         tmp = sqrt(safeMin);
 516         for (IndexType p=2; p<=n; ++p) {
 517             if (abs(A(p,p))<small || (l2kill && abs(A(p,p))<tmp)) {
 518                 break;
 519             }
 520             ++nr;
 521         }
 522 
 523     }
 524 
 525     bool almort = false;
 526     if (nr==n) {
 527         ElementType maxprj = One;
 528         for (IndexType p=2; p<=n; ++p) {
 529             maxprj = min(maxprj, abs(A(p,p))/sva(iwork(p)));
 530         }
 531         if (pow(maxprj,2)>=One-ElementType(n)*eps) {
 532             almort = true;
 533         }
 534     }
 535 
 536 
 537     sconda = -One;
 538     condr1 = -One;
 539     condr2 = -One;
 540 
 541     if (errest) {
 542         if (n==nr) {
 543             if (rsvec) {
 544 //              .. V is available as workspace
 545                 V.upper() = A(_(1,n),_).upper();
 546                 for (IndexType p=1; p<=n; ++p) {
 547                     tmp = sva(iwork(p));
 548                     V(_(1,p),p) *= One/tmp;
 549                 }
 550                 auto _work  = work(_(n+1, 4*n));
 551                 auto _iwork = iwork(_(2*n+m+1, 3*n+m));
 552                 pocon(V.upper(), One, tmp, _work, _iwork);
 553             } else if (lsvec) {
 554 //              .. U is available as workspace
 555                 auto _U = U(_(1,n),_(1,n));
 556                 _U.upper() = A(_(1,n),_).upper();
 557                 for (IndexType p=1; p<=n; ++p) {
 558                     tmp = sva(iwork(p));
 559                     U(_(1,p),p) *= One/tmp;
 560                 }
 561                 auto _work  = work(_(n+1, 4*n));
 562                 auto _iwork = iwork(_(2*n+m+1, 3*n+m));
 563                 pocon(_U.upper(), One, tmp, _work, _iwork);
 564             } else {
 565                 auto                       _work1 = work(_(n+1,n+n*n));
 566                 auto                       _work2 = work(_(n+n*n+1,n+n*n+3*n));
                auto                       _iwork = iwork(_(2*n+m+1,3*n+m));
 568                 GeMatrixView<ElementType>  Work(n, n, _work1, n);
 569 
 570                 Work.upper() = A(_(1,n),_).upper();
 571                 for (IndexType p=1; p<=n; ++p) {
 572                     tmp = sva(iwork(p));
 573                     Work(_(1,p),p) *= One/tmp;
 574                 }
 575 //              .. the columns of R are scaled to have unit Euclidean lengths.
 576                 pocon(Work.upper(), One, tmp, _work2, _iwork);
 577             }
 578             sconda = One/sqrt(tmp);
 579 //          SCONDA is an estimate of DSQRT(||(R^t * R)^(-1)||_1).
 580 //          N^(-1/4) * SCONDA <= ||R^(-1)||_2 <= N^(1/4) * SCONDA
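//          When an error estimate is requested (errest), SCONDA is
//          returned in work(3) at the end of the routine.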
 581         } else {
 582             sconda = -One;
 583         }
 584     }
 585 
 586     l2pert = l2pert && abs(A(1,1)/A(nr,nr))>sqrt(bigRoot);
 587 //  If there is no violent scaling, artificial perturbation is not needed.
 588 //
 589 //  Phase 3:
 590 //
 591     if (!(rsvec || lsvec)) {
 592 //
 593 //      Singular Values only
 594 //
 595 //      .. transpose A(1:NR,1:N)
 596         for (IndexType p=1; p<=min(n-1, nr); ++p) {
 597             A(p,_(p+1,n)) = A(_(p+1,n),p);
 598         }
 599 //
 600 //      The following two DO-loops introduce small relative perturbation
 601 //      into the strict upper triangle of the lower triangular matrix.
 602 //      Small entries below the main diagonal are also changed.
 603 //      This modification is useful if the computing environment does not
//      provide/allow flush-to-zero underflow, for it prevents many
 605 //      annoying denormalized numbers in case of strongly scaled matrices.
 606 //      The perturbation is structured so that it does not introduce any
 607 //      new perturbation of the singular values, and it does not destroy
 608 //      the job done by the preconditioner.
 609 //      The licence for this perturbation is in the variable L2PERT, which
//      should be false if flush-to-zero underflow is active.
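//      In the perturbed branch below each targeted entry is replaced by
//      +-xsc*|A(q,q)| with xsc = eps/n, i.e. the noise is tiny relative
//      to the corresponding diagonal entry of the triangular factor.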
 611 //
 612         if (! almort) {
 613 
 614             if (l2pert) {
 615 //              XSC = DSQRT(SMALL)
 616                 xsc = eps / ElementType(n);
 617                 for (IndexType q=1; q<=nr; ++q) {
 618                     tmp = xsc*abs(A(q,q));
                    for (IndexType p=1; p<=n; ++p) {
 620                         if ((p>q && abs(A(p,q))<=tmp) || p<q) {
 621                             A(p,q) = sign(tmp, A(p,q));
 622                         }
 623                     }
 624                 }
 625             } else {
 626                 A(_(1,nr),_(1,nr)).strictUpper() = Zero;
 627             }
 628 //
 629 //          .. second preconditioning using the QR factorization
 630 //
 631             auto _A    = A(_(1,n),_(1,nr));
 632             auto _tau  = work(_(1,nr));
 633             auto _work = work(_(n+1,lWork));
 634             qrf(_A, _tau, _work);
 635 //
 636 //          .. and transpose upper to lower triangular
 637             for (IndexType p=1; p<=nr-1; ++p) {
 638                 A(p,_(p+1,nr)) = A(_(p+1,nr),p);
 639             }
 640 
 641         }
 642 //
 643 //      Row-cyclic Jacobi SVD algorithm with column pivoting
 644 //
 645 //      .. again some perturbation (a "background noise") is added
 646 //      to drown denormals
 647         if (l2pert) {
 648 //          XSC = DSQRT(SMALL)
 649             xsc = eps / ElementType(n);
 650             for (IndexType q=1; q<=nr; ++q) {
 651                 ElementType tmp = xsc*abs(A(q,q));
 652                 for (IndexType p=1; p<=nr; ++p) {
 653                     if (((p>q) && abs(A(p,q))<=tmp) || p<q) {
 654                         A(p,q) = sign(tmp, A(p,q));
 655                     }
 656                 }
 657             }
 658         } else {
 659             A(_(1,nr),_(1,nr)).strictUpper() = Zero;
 660         }
 661 //
 662 //      .. and one-sided Jacobi rotations are started on a lower
 663 //      triangular matrix (plus perturbation which is ignored in
 664 //      the part which destroys triangular form (confusing?!))
 665 //
 666         auto _A    = A(_(1,nr),_(1,nr));
 667         auto _sva  = sva(_(1,nr));
 668 
 669         svj(SVJ::Lower, SVJ::NoU, SVJ::NoV, _A, _sva, V, work);
 670 
 671         scaleM  = work(1);
 672         numRank = nint(work(2));
 673 
 674 
 675     } else if (rsvec && !lsvec) {
 676 //
 677 //      -> Singular Values and Right Singular Vectors <-
 678 //
 679         if (almort) {
 680 //
 681 //          .. in this case NR equals N
 682             ASSERT(nr==n);
 683             for (IndexType p=1; p<=nr; ++p) {
 684                 V(_(p,n),p) = A(p,_(p,n));
 685             }
 686 
 687             auto _V = V(_,_(1,nr));
 688             _V.strictUpper() = Zero;
 689             svj(SVJ::Lower, SVJ::ComputeU, SVJ::NoV, _V, sva, A, work);
 690 
 691             scaleM  = work(1);
 692             numRank = nint(work(2));
 693 
 694         } else {
 695 //
 696 //      .. two more QR factorizations ( one QRF is not enough, two require
 697 //      accumulated product of Jacobi rotations, three are perfect )
 698 //
 699             A(_(1,nr),_(1,nr)).strictLower() = Zero;
 700 
 701             auto _A     = A(_(1,nr),_);
 702             auto _tau1  = work(_(1,nr));
 703             auto _work1 = work(_(n+1,lWork));
 704             lqf(_A, _tau1, _work1);
 705 
            auto _V = V(_(1,nr),_(1,nr));
 707             _V.lower()       = _A(_,_(1,nr)).lower();
 708             _V.strictUpper() = Zero;
 709 
 710             auto _tau2  = work(_(n+1,n+nr));
 711             auto _work2 = work(_(2*n+1,lWork));
 712             qrf(_V, _tau2, _work2);
 713 
 714             for (IndexType p=1; p<=nr; ++p) {
 715                 V(_(p,nr),p) = V(p,_(p,nr));
 716             }
 717             _V.strictUpper() = Zero;
 718 
 719             auto _sva  = sva(_(1,nr));
 720             svj(SVJ::Lower, SVJ::ComputeU, SVJ::NoV, _V, _sva, U, _work1);
 721 
 722             scaleM  = work(n+1);
 723             numRank = nint(work(n+2));
 724             if (nr<n) {
 725                 V(_(nr+1,n),_(1,nr)) = Zero;
 726                 V(_(1,nr),_(nr+1,n)) = Zero;
 727 
 728                 V(_(nr+1,n),_(nr+1,n)) = Zero;
 729                 V(_(nr+1,n),_(nr+1,n)).diag(0) = One;
 730             }
 731 
 732             ormlq(Left, Trans, _A, _tau1, V, _work1);
 733 
 734         }
 735 
 736         for (IndexType p=1; p<=n; ++p) {
 737             A(iwork(p),_) = V(p,_);
 738         }
 739         V = A;
 740 
 741         if (transp) {
 742             U = V;
 743         }
 744 
 745     } else if (lsvec && !rsvec) {
 746 //
 747 //      .. Singular Values and Left Singular Vectors                 ..
 748 //
 749 //      .. second preconditioning step to avoid need to accumulate
 750 //      Jacobi rotations in the Jacobi iterations.
 751 
 752         auto _U = U(_(1,nr),_(1,nr));
 753         auto _tau   = work(_(n+1,n+nr));
 754         auto _sva   = sva(_(1,nr));
 755         auto _work1 = work(_(n+1,lWork));
 756         auto _work2 = work(_(2*n+1,lWork));
 757 
 758         for (IndexType p=1; p<=nr; ++p) {
 759             A(p,_(p,nr)) = U(_(p,nr),p);
 760         }
 761         _U.strictUpper() = Zero;
 762 
 763         qrf(U(_(1,n),_(1,nr)), _tau, _work2);
 764 
 765         for (IndexType p=1; p<=nr-1; ++p) {
 766             U(p,_(p+1,nr)) = U(_(p+1,nr),p);
 767         }
 768         _U.strictUpper() = Zero;
 769 
 770         svj(SVJ::Lower, SVJ::ComputeU, SVJ::NoV, _U, _sva, A, _work1);
 771         scaleM  = work(n+1);
 772         numRank = nint(work(n+2));
 773 
 774         if (nr<m) {
 775             U(_(nr+1,m),_(1,nr)) = Zero;
 776             if (nr<nu) {
 777                 U(_(1,nr),_(nr+1,nu))           = Zero;
 778 
 779                 U(_(nr+1,m),_(nr+1,nu))         = Zero;
 780                 U(_(nr+1,m),_(nr+1,nu)).diag(0) = One;
 781             }
 782         }
 783 
 784         ormqr(Left, NoTrans, A, work, U);
 785 
 786         if (rowpiv) {
 787             auto piv = iwork(_(2*n+1,2*n+m-1));
 788             laswp(U, piv.reverse());
 789         }
 790 
 791         for (IndexType p=1; p<=nu; ++p) {
 792             xsc = One / blas::nrm2(U(_,p));
 793             U(_,p) *= xsc;
 794         }
 795 
 796         if (transp) {
 797             V = U;
 798         }
 799 //
 800     } else {
 801 //
 802 //      .. Full SVD ..
 803 //
 804         if (!jracc) {
 805 
 806             if (!almort) {
 807 //
 808 //              Second Preconditioning Step (QRF [with pivoting])
 809 //              Note that the composition of TRANSPOSE, QRF and TRANSPOSE is
 810 //              equivalent to an LQF CALL. Since in many libraries the QRF
 811 //              seems to be better optimized than the LQF, we do explicit
 812 //              transpose and use the QRF. This is subject to changes in an
 813 //              optimized implementation of DGEJSV.
 814 //
 815                 for (IndexType p=1; p<=nr; ++p) {
 816                     V(_(p,n),p) = A(p,_(p,n));
 817                 }
 818 //
 819 //              .. the following two loops perturb small entries to avoid
 820 //              denormals in the second QR factorization, where they are
 821 //              as good as zeros. This is done to avoid painfully slow
 822 //              computation with denormals. The relative size of the
 823 //              perturbation is a parameter that can be changed by the
 824 //              implementer. This perturbation device will be obsolete on
 825 //              machines with properly implemented arithmetic.
//              To switch it off, set L2PERT=false. To remove it from the
//              code, remove the action under L2PERT=true and leave the ELSE
 828 //              part. The following two loops should be blocked and fused with
 829 //              the transposed copy above.
 830 //
 831                 if (l2pert) {
 832                     xsc = sqrt(small);
 833                     for (IndexType q=1; q<=nr; ++q) {
 834                         tmp = xsc*abs(V(q,q));
 835                         for (IndexType p=1; p<=n; ++p) {
 836                             if (p>q && abs(V(p,q))<=tmp || p<q) {
 837                                 V(p,q) = sign(tmp, V(p,q));
 838                             }
 839                             if (p<q) {
 840                                 V(p,q) = -V(p,q);
 841                             }
 842                         }
 843                     }
 844                 } else {
 845                     V(_(1,nr),_(1,nr)).strictUpper() = Zero;
 846                 }
 847 //
 848 //              Estimate the row scaled condition number of R1
 849 //              (If R1 is rectangular, N > NR, then the condition number
 850 //              of the leading NR x NR submatrix is estimated.)
 851 //
 852                 auto          _work1 = work(_(2*n+1,2*n+nr*nr));
 853                 auto          _work2 = work(_(2*n+nr*nr+1, 2*n+nr*nr+3*nr));
 854                 auto          _iwork = iwork(_(m+2*n+1,m+2*n+nr));
 855                 GeMatrixView<ElementType>  Work(nr, nr, _work1, nr);
 856 
 857                 Work.lower() = V.lower();
 858                 for (IndexType p=1; p<=nr; ++p) {
 859                     tmp = blas::nrm2(Work(_(p,nr),p));
 860                     Work(_(p,nr),p) *= One/tmp;
 861                 }
 862                 pocon(Work, One, tmp, _work2, _iwork);
 863                 condr1 = One / sqrt(tmp);
//              .. here we need a second opinion on the condition number
//              .. then assume the worst-case scenario
 866 //              R1 is OK for inverse <=> condr1 < DBLE(N)
 867 //              more conservative    <=> condr1 < DSQRT(DBLE(N))
 868 //
 869                 cond_ok = sqrt(ElementType(nr));
 870 //[TP]          COND_OK is a tuning parameter.
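//              With cond_ok = sqrt(nr) the cheaper unpivoted second QRF
//              below is trusted only as long as the row-scaled R1 is at
//              most mildly ill-conditioned; otherwise a pivoted QRF (and,
//              if necessary, an additional LQ factorization) is used.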
 871 
 872                 if (condr1<cond_ok) {
 873 //                  .. the second QRF without pivoting. Note: in an optimized
 874 //                  implementation, this QRF should be implemented as the QRF
 875 //                  of a lower triangular matrix.
 876 //                  R1^t = Q2 * R2
 877                     auto tau    = work(_(n+1,n+nr));
 878                     auto _work  = work(_(2*n+1,lWork));
 879 
 880                     qrf(V(_,_(1,nr)), tau, _work);
 881 
 882                     if (l2pert) {
                        xsc = sqrt(small) / eps;
 884                         for (IndexType p=2; p<=nr; ++p) {
 885                             for (IndexType q=1; q<=p-1; ++q) {
 886                                 tmp = xsc*min(abs(V(p,p)), abs(V(q,q)));
 887                                 if (abs(V(q,p))<=tmp) {
 888                                     V(q,p) = sign(tmp, V(q,p));
 889                                 }
 890                             }
 891                         }
 892                     }
 893 //
 894                     if (nr!=n) {
 895                         auto _work = work(_(2*n+1, 2*n+n*nr));
 896                         GeMatrixView<ElementType>  Work(n, nr, _work, n);
 897 
 898                         Work = V(_,_(1,nr));
 899                     }
 900 //                  .. save ...
 901 //
 902 //               .. this transposed copy should be better than naive
 903 //                  TODO:  auto _V = V(_(1,nr),_(1,nr));
 904 //                         _V.lower() = transpose(_V.upper());
 905 //
 906                     for (IndexType p=1; p<=nr-1; ++p) {
 907                         V(_(p+1,nr),p) = V(p,_(p+1,nr));
 908                     }
 909 
 910                     condr2 = condr1;
 911 
 912                 } else {
 913 //
 914 //                  .. ill-conditioned case: second QRF with pivoting
//                  Note that windowed pivoting would be equally good
//                  numerically, and more run-time efficient. So, in
//                  an optimal implementation, the next call to DGEQP3
//                  should be replaced with e.g. CALL SGEQPX (ACM TOMS #782)
 919 //                  with properly (carefully) chosen parameters.
 920 //
 921 //                  R1^t * P2 = Q2 * R2
 922                     auto _V    = V(_,_(1,nr));
 923                     auto piv   = iwork(_(n+1,n+nr));
                    auto tau   = work(_(n+1,n+nr));
 925                     auto _work = work(_(2*n+1, lWork));
 926 
 927                     piv = 0;
 928                     qp3(_V, piv, tau, _work);
 929                     if (l2pert) {
 930                         xsc = sqrt(small);
 931                         for (IndexType p=2; p<=nr; ++p) {
 932                             for (IndexType q=1; q<=p-1; ++q) {
 933                                 tmp = xsc*min(abs(V(p,p)), abs(V(q,q)));
 934                                 if (abs(V(q,p))<=tmp) {
 935                                     V(q,p) = sign(tmp, V(q,p));
 936                                 }
 937                             }
 938                         }
 939                     }
 940 
 941                     auto  _work1 = work(_(2*n+1, 2*n+n*nr));
 942                     GeMatrixView<ElementType>  Work1(n, nr, _work1, n);
 943 
 944                     Work1 = V(_,_(1,nr));
 945 
 946                     if (l2pert) {
 947                         xsc = sqrt(small);
 948                         for (IndexType p=2; p<=nr; ++p) {
 949                             for (IndexType q=1; q<=p-1; ++q) {
 950                                 tmp = xsc * min(abs(V(p,p)), abs(V(q,q)));
 951                                 V(p,q) = -sign(tmp, V(q,p));
 952                             }
 953                         }
 954                     } else {
 955                         V(_(1,nr),_(1,nr)).strictLower() = Zero;
 956                     }
//                  Now, compute R2 = L3 * Q3, the LQ factorization.
                    auto _V2    = V(_(1,nr),_(1,nr));
                    auto tau3   = work(_(2*n+n*nr+1,2*n+n*nr+nr));
                    auto _work2 = work(_(2*n+n*nr+nr+1,lWork));

                    lqf(_V2, tau3, _work2);
//                  .. and estimate the condition number
                    auto  _work3 = work(_(2*n+n*nr+nr+1,2*n+n*nr+nr+nr*nr));
                    GeMatrixView<ElementType>  Work3(nr, nr, _work3, nr);

                    Work3.lower() = _V2.lower();
 968 
 969                     for (IndexType p=1; p<=nr; ++p) {
 970                         tmp = blas::nrm2(Work3(p,_(1,p)));
 971                         Work3(p,_(1,p)) *= One/tmp;
 972                     }
 973                     auto _work4 = work(_(2*n+n*nr+nr+nr*nr+1,
 974                                          2*n+n*nr+nr+nr*nr+3*nr));
 975                     auto _iwork = iwork(_(m+2*n+1, m+2*n+nr));
 976                     pocon(Work3.lower(), One, tmp, _work4, _iwork);
 977                     condr2 = One / sqrt(tmp);
 978 
 979                     if (condr2>=cond_ok) {
 980 //                      .. save the Householder vectors used for Q3
//                      (this overwrites the copy of R2, as it will not be
//                      needed in this branch, but it does not overwrite the
//                      Householder vectors of Q2).
                        Work1(_(1,nr),_(1,nr)).upper() = V(_(1,nr),_(1,nr)).upper();
 985 //                      .. and the rest of the information on Q3 is in
 986 //                      WORK(2*N+N*NR+1:2*N+N*NR+N)
 987                     }
 988 
 989                 }
 990 
 991                 if (l2pert) {
 992                     xsc = sqrt(small);
 993                     for (IndexType q=2; q<=nr; ++q) {
                        tmp = xsc * abs(V(q,q));
 995                         for (IndexType p=1; p<=q-1; ++p) {
 996 //                          V(p,q) = - DSIGN( TEMP1, V(q,p) )
 997                             V(p,q) = -sign(tmp, V(p,q));
 998                         }
 999                     }
1000                 } else {
                    V(_(1,nr),_(1,nr)).strictLower() = Zero;
1002                 }
1003 //
1004 //              Second preconditioning finished; continue with Jacobi SVD
1005 //              The input matrix is lower triangular.
1006 //
1007 //              Recover the right singular vectors as solution of a well
1008 //              conditioned triangular matrix equation.
1009 //
1010                 if (condr1<cond_ok) {
1011                     auto _U    = U(_(1,nr),_(1,nr));
1012                     auto _V    = V(_(1,nr),_(1,nr));
1013                     auto _sva  = sva(_(1,nr));
1014                     auto _work = work(_(2*n+n*nr+nr+1,lWork));
1015 
1016                     svj(SVJ::Lower, SVJ::ComputeU, SVJ::NoV,
1017                         _V, _sva, U, _work);
1018                     scaleM = _work(1);
1019                     numRank = nint(_work(2));
1020 
1021                     for (IndexType p=1; p<=nr; ++p) {
1022                         _U(_,p) = _V(_,p);
1023                         _V(_,p) *= sva(p);
1024                     }
1025 
1026 //                  .. pick the right matrix equation and solve it
1027 //
1028                     if (nr==n) {
1029 //:))                   .. best case, R1 is inverted. The solution of this
1030 //                      matrix equation is Q2*V2 = the product of the Jacobi
1031 //                      rotations used in DGESVJ, premultiplied with the
1032 //                      orthogonal matrix  from the second QR factorization.
1033                         const auto _A = A(_(1,nr),_(1,nr));
1034                         blas::sm(Left, NoTrans, One, _A.upper(), _V);
1035                     } else {
1036 //                      .. R1 is well conditioned, but non-square. Transpose(R2)
1037 //                      is inverted to get the product of the Jacobi rotations
1038 //                      used in DGESVJ. The Q-factor from the second QR
1039 //                      factorization is then built in explicitly.
1040 
1041                         auto  _work = work(_(2*n+1, 2*n+n*nr));
1042                         GeMatrixView<ElementType>  Work1(nr, nr, _work, n);
1043                         GeMatrixView<ElementType>  Work(n, nr, _work, n);
1044 
1045                         blas::sm(Left, NoTrans, One, Work1.upper(), _V);
1046                         if (nr<n) {
1047                             V(_(nr+1,n),_(1,nr)) = Zero;
1048                             V(_(1,nr),_(nr+1,n)) = Zero;
1049                             V(_(nr+1,n),_(nr+1,n)) = Zero;
1050                             V(_(nr+1,n),_(nr+1,n)).diag(0) = One;
1051                         }
1052                         auto tau = work(_(n+1,n+nr));
1053                         auto _work_ormqr = work(_(2*n+n*nr+nr+1,lWork));
1054                         ormqr(Left, NoTrans, Work, tau, V, _work_ormqr);
1055                     }
1056 //
1057                 } else if (condr2<cond_ok) {
1058 //
1059 //:)                .. the input matrix A is very likely a relative of
1060 //                  the Kahan matrix :)
1061 //                  The matrix R2 is inverted. The solution of the matrix
1062 //                  equation is Q3^T*V3 = the product of the Jacobi rotations
//                  (applied to the lower triangular L3 from the LQ
1064 //                  factorization of R2=L3*Q3), pre-multiplied with the
1065 //                  transposed Q3.
1066                     CALL DGESVJ( 'L', 'U', 'N', NR, NR, V, LDV, SVA, NR, U,
1067      $                  LDU, work(2*N+N*NR+NR+1), LWORK-2*N-N*NR-NR, INFO )
1068                     SCALEM  = work(2*N+N*NR+NR+1)
1069                     numRank = IDNINT(work(2*N+N*NR+NR+2))
1070                     DO 3870 p = 1, NR
1071                         CALL DCOPY( NR, V(1,p), 1, U(1,p), 1 )
1072                         CALL DSCAL( NR, SVA(p),    U(1,p), 1 )
1073  3870               CONTINUE
1074                     CALL DTRSM('L','U','N','N',NR,NR,One,work(2*N+1),N,U,LDU)
1075 //                  .. apply the permutation from the second QR factorization
1076                     DO 873 q = 1, NR
1077                         DO 872 p = 1, NR
1078                             work(2*N+N*NR+NR+iwork(N+p)) = U(p,q)
1079  872                    CONTINUE
1080                         DO 874 p = 1, NR
1081                             U(p,q) = work(2*N+N*NR+NR+p)
1082  874                    CONTINUE
1083  873                CONTINUE
                    if (nr<n) {
                        V(_(nr+1,n),_(1,nr)) = Zero;
                        V(_(1,nr),_(nr+1,n)) = Zero;
                        V(_(nr+1,n),_(nr+1,n)) = Zero;
                        V(_(nr+1,n),_(nr+1,n)).diag(0) = One;
                    }
1089                     CALL DORMQR( 'L','N',N,N,NR,work(2*N+1),N,work(N+1),
1090      $                      V,LDV,work(2*N+N*NR+NR+1),LWORK-2*N-N*NR-NR,IERR )
1091                 } else {
1092 //                  Last line of defense.
1093 //#:(               This is a rather pathological case: no scaled condition
//                  improvement after two pivoted QR factorizations. Another
//                  possibility is that the rank-revealing QR factorization
1096 //                  or the condition estimator has failed, or the COND_OK
1097 //                  is set very close to One (which is unnecessary). Normally,
1098 //                  this branch should never be executed, but in rare cases of
1099 //                  failure of the RRQR or condition estimator, the last line of
1100 //                  defense ensures that DGEJSV completes the task.
1101 //                  Compute the full SVD of L3 using DGESVJ with explicit
1102 //                  accumulation of Jacobi rotations.
1103                     CALL DGESVJ( 'L', 'U', 'V', NR, NR, V, LDV, SVA, NR, U,
1104      $                  LDU, work(2*N+N*NR+NR+1), LWORK-2*N-N*NR-NR, INFO )
1105                     SCALEM  = work(2*N+N*NR+NR+1)
1106                     numRank = IDNINT(work(2*N+N*NR+NR+2))
                    if (nr<n) {
                        V(_(nr+1,n),_(1,nr)) = Zero;
                        V(_(1,nr),_(nr+1,n)) = Zero;
                        V(_(nr+1,n),_(nr+1,n)) = Zero;
                        V(_(nr+1,n),_(nr+1,n)).diag(0) = One;
                    }
1112                     CALL DORMQR( 'L','N',N,N,NR,work(2*N+1),N,work(N+1),
1113      $                      V,LDV,work(2*N+N*NR+NR+1),LWORK-2*N-N*NR-NR,IERR )
1114 //
1115                     CALL DORMLQ( 'L', 'T', NR, NR, NR, work(2*N+1), N,
1116      $                      work(2*N+N*NR+1), U, LDU, work(2*N+N*NR+NR+1),
1117      $                      LWORK-2*N-N*NR-NR, IERR )
1118                     DO 773 q = 1, NR
1119                         DO 772 p = 1, NR
1120                             work(2*N+N*NR+NR+iwork(N+p)) = U(p,q)
1121  772                    CONTINUE
1122                         DO 774 p = 1, NR
1123                             U(p,q) = work(2*N+N*NR+NR+p)
1124  774                    CONTINUE
1125  773                CONTINUE
1126 //
1127                 }
1128 //
1129 //              Permute the rows of V using the (column) permutation from the
1130 //              first QRF. Also, scale the columns to make them unit in
1131 //              Euclidean norm. This applies to all cases.
1132 //
                tmp = sqrt(ElementType(n))*eps;
                for (IndexType q=1; q<=n; ++q) {
                    for (IndexType p=1; p<=n; ++p) {
                        work(2*n+n*nr+nr+iwork(p)) = V(p,q);
                    }
                    for (IndexType p=1; p<=n; ++p) {
                        V(p,q) = work(2*n+n*nr+nr+p);
                    }
                    xsc = One / blas::nrm2(V(_,q));
                    if (xsc<One-tmp || xsc>One+tmp) {
                        V(_,q) *= xsc;
                    }
                }
1145 //              At this moment, V contains the right singular vectors of A.
1146 //              Next, assemble the left singular vector matrix U (M x N).
                if (nr<m) {
                    U(_(nr+1,m),_(1,nr)) = Zero;
                    if (nr<nu) {
                        U(_(1,nr),_(nr+1,nu)) = Zero;
                        U(_(nr+1,m),_(nr+1,nu)) = Zero;
                        U(_(nr+1,m),_(nr+1,nu)).diag(0) = One;
                    }
                }
1154 //
1155 //              The Q matrix from the first QRF is built into the left singular
1156 //              matrix U. This applies to all cases.
1157 //
1158                 CALL DORMQR( 'Left', 'No_Tr', M, N1, N, A, LDA, work, U,
1159      $           LDU, work(N+1), LWORK-N, IERR )
1160 
1161 //              The columns of U are normalized. The cost is O(M*N) flops.
                tmp = sqrt(ElementType(m))*eps;
                for (IndexType p=1; p<=nr; ++p) {
                    xsc = One / blas::nrm2(U(_,p));
                    if (xsc<One-tmp || xsc>One+tmp) {
                        U(_,p) *= xsc;
                    }
                }
1168 //
1169 //              If the initial QRF is computed with row pivoting, the left
1170 //              singular vectors must be adjusted.
1171 //
                if (rowpiv) {
                    auto piv = iwork(_(2*n+1,2*n+m-1));
                    laswp(U, piv.reverse());
                }
1174 //
1175             } else {
1176 //
1177 //              .. the initial matrix A has almost orthogonal columns and
1178 //              the second QRF is not needed
1179 //
1180                 CALL DLACPY( 'Upper', N, N, A, LDA, work(N+1), N )
                if (l2pert) {
                    xsc = sqrt(small);
                    for (IndexType p=2; p<=n; ++p) {
                        tmp = xsc * work(n+(p-1)*n+p);
                        for (IndexType q=1; q<=p-1; ++q) {
                            work(n+(q-1)*n+p) = -sign(tmp, work(n+(p-1)*n+q));
                        }
                    }
                } else {
                    auto _work1 = work(_(n+1,n+n*n));
                    GeMatrixView<ElementType>  Work(n, n, _work1, n);
                    Work.strictLower() = Zero;
                }
1192 //
1193                 CALL DGESVJ( 'Upper', 'U', 'N', N, N, work(N+1), N, SVA,
1194      $           N, U, LDU, work(N+N*N+1), LWORK-N-N*N, INFO )
1195 //
1196                 SCALEM  = work(N+N*N+1)
1197                 numRank = IDNINT(work(N+N*N+2))
1198                 DO 6970 p = 1, N
1199                     CALL DCOPY( N, work(N+(p-1)*N+1), 1, U(1,p), 1 )
1200                     CALL DSCAL( N, SVA(p), work(N+(p-1)*N+1), 1 )
1201  6970           CONTINUE
1202 //
1203                 CALL DTRSM( 'Left', 'Upper', 'NoTrans', 'No UD', N, N,
1204      $               One, A, LDA, work(N+1), N )
1205                 DO 6972 p = 1, N
1206                     CALL DCOPY( N, work(N+p), N, V(iwork(p),1), LDV )
1207  6972           CONTINUE
                tmp = sqrt(ElementType(n))*eps;
                for (IndexType p=1; p<=n; ++p) {
                    xsc = One / blas::nrm2(V(_,p));
                    if (xsc<One-tmp || xsc>One+tmp) {
                        V(_,p) *= xsc;
                    }
                }
1214 //
1215 //              Assemble the left singular vector matrix U (M x N).
1216 //
                if (n<m) {
                    U(_(n+1,m),_(1,n)) = Zero;
                    if (n<nu) {
                        U(_(1,n),_(n+1,nu)) = Zero;
                        U(_(n+1,m),_(n+1,nu)) = Zero;
                        U(_(n+1,m),_(n+1,nu)).diag(0) = One;
                    }
                }
1224                 CALL DORMQR( 'Left', 'No Tr', M, N1, N, A, LDA, work, U,
1225      $           LDU, work(N+1), LWORK-N, IERR )
                tmp = sqrt(ElementType(m))*eps;
                for (IndexType p=1; p<=nu; ++p) {
                    xsc = One / blas::nrm2(U(_,p));
                    if (xsc<One-tmp || xsc>One+tmp) {
                        U(_,p) *= xsc;
                    }
                }
1232 //
                if (rowpiv) {
                    auto piv = iwork(_(2*n+1,2*n+m-1));
                    laswp(U, piv.reverse());
                }
1235 //
1236             }
1237 //
1238 //          end of the  >> almost orthogonal case <<  in the full SVD
1239 //
1240         } else {
1241 //
1242 //          This branch deploys a preconditioned Jacobi SVD with explicitly
1243 //          accumulated rotations. It is included as optional, mainly for
//          experimental purposes. It does perform well and can also be used.
1245 //          In this implementation, this branch will be automatically activated
1246 //          if the  condition number sigma_max(A) / sigma_min(A) is predicted
1247 //          to be greater than the overflow threshold. This is because the
1248 //          a posteriori computation of the singular vectors assumes robust
1249 //          implementation of BLAS and some LAPACK procedures, capable of
1250 //          working in presence of extreme values. Since that is not always
1251 //          the case, ...
1252 //
1253             DO 7968 p = 1, NR
1254                 CALL DCOPY( N-p+1, A(p,p), LDA, V(p,p), 1 )
1255  7968       CONTINUE
1256 //
            if (l2pert) {
                xsc = sqrt(small/eps);
1259                 for (IndexType q=1; q<=nr; ++q) {
1260                     tmp = xsc*abs(V(q,q));
1261                     for (IndexType p=1; p<=n; ++p) {
1262                         if (p>q && abs(V(p,q))<=tmp || p<q) {
1263                             V(p,q) = sign(tmp, V(p,q));
1264                         }
1265                         if (p<q) {
1266                             V(p,q) = - V(p,q);
1267                         }
1268                     }
1269                 }
            } else {
                V(_(1,nr),_(1,nr)).strictUpper() = Zero;
            }
1273 
1274             CALL DGEQRF( N, NR, V, LDV, work(N+1), work(2*N+1),
1275      $        LWORK-2*N, IERR )
1276             CALL DLACPY( 'L', N, NR, V, LDV, work(2*N+1), N )
1277 //
1278             DO 7969 p = 1, NR
1279                 CALL DCOPY( NR-p+1, V(p,p), LDV, U(p,p), 1 )
1280  7969       CONTINUE
1281 
            if (l2pert) {
                xsc = sqrt(small/eps);
                for (IndexType q=2; q<=nr; ++q) {
                    for (IndexType p=1; p<=q-1; ++p) {
                        tmp = xsc * min(abs(U(p,p)), abs(U(q,q)));
                        U(p,q) = -sign(tmp, U(q,p));
                    }
                }
            } else {
                U(_(1,nr),_(1,nr)).strictUpper() = Zero;
            }
1293 
1294             CALL DGESVJ( 'G', 'U', 'V', NR, NR, U, LDU, SVA,
1295      $        N, V, LDV, work(2*N+N*NR+1), LWORK-2*N-N*NR, INFO )
1296             SCALEM  = work(2*N+N*NR+1)
1297             numRank = IDNINT(work(2*N+N*NR+2))
1298 
            if (nr<n) {
                V(_(nr+1,n),_(1,nr)) = Zero;
                V(_(1,nr),_(nr+1,n)) = Zero;
                V(_(nr+1,n),_(nr+1,n)) = Zero;
                V(_(nr+1,n),_(nr+1,n)).diag(0) = One;
            }
1304 
1305             CALL DORMQR( 'L','N',N,N,NR,work(2*N+1),N,work(N+1),
1306      $        V,LDV,work(2*N+N*NR+NR+1),LWORK-2*N-N*NR-NR,IERR )
1307 //
1308 //          Permute the rows of V using the (column) permutation from the
1309 //          first QRF. Also, scale the columns to make them unit in
1310 //          Euclidean norm. This applies to all cases.
1311 //
            tmp = sqrt(ElementType(n))*eps;
            for (IndexType q=1; q<=n; ++q) {
                for (IndexType p=1; p<=n; ++p) {
                    work(2*n+n*nr+nr+iwork(p)) = V(p,q);
                }
                for (IndexType p=1; p<=n; ++p) {
                    V(p,q) = work(2*n+n*nr+nr+p);
                }
                xsc = One / blas::nrm2(V(_,q));
                if (xsc<One-tmp || xsc>One+tmp) {
                    V(_,q) *= xsc;
                }
            }
1324 //
1325 //          At this moment, V contains the right singular vectors of A.
1326 //          Next, assemble the left singular vector matrix U (M x N).
1327 //
1328             if (NR<M) {
1329                 CALL DLASET( 'A',  M-NR, NR, Zero, Zero, U(NR+1,1), LDU )
1330                 if (NR<N1) {
1331                     CALL DLASET( 'A',NR,  N1-NR, Zero, Zero,  U(1,NR+1),LDU )
1332                     CALL DLASET( 'A',M-NR,N1-NR, Zero, One,U(NR+1,NR+1),LDU )
1333                 }
1334             }
1335 
1336             CALL DORMQR( 'Left', 'No Tr', M, N1, N, A, LDA, work, U,
1337      $        LDU, work(N+1), LWORK-N, IERR )
1338 
            if (ROWPIV) {
                CALL DLASWP( N1, U, LDU, 1, M-1, iwork(2*N+1), -1 )
            }
1341 
1342 
1343         }
1344         if ( TRANSP ) {
1345 //          .. swap U and V because the procedure worked on A^t
1346             for (IndexType p=1; p<=n; ++p) {
1347                 blas::swap(U(_,p),V(_,p));
1348             }
1349         }
1350 
1351     }
1352 //  end of the full SVD
1353 //
1354 //  Undo scaling, if necessary (and possible)
1355 //
1356     if ( USCAL2 <= (BIG/SVA(1))*USCAL1 ) {
1357         CALL DLASCL( 'G', 0, 0, USCAL1, USCAL2, NR, 1, SVA, N, IERR )
        USCAL1 = One;
        USCAL2 = One;
1360     }
1361 
    if (nr<n) {
1363         sva(_(nr+1,n)) = Zero;
1364     }
1365 
    work(1) = USCAL2 * SCALEM;
    work(2) = USCAL1;
    if ( errest ) {
        work(3) = SCONDA;
    }
    if ( lsvec && rsvec ) {
        work(4) = condr1;
        work(5) = condr2;
    }
    if ( L2TRAN ) {
        work(6) = entra;
        work(7) = entrat;
    }

    iwork(1) = NR;
    iwork(2) = numRank;
    iwork(3) = warning;
1381 }
1382 */
//== interface for native lapack ===============================================
1384 #ifdef CHECK_CXXLAPACK
1385 
template <typename MA, typename VSVA, typename MU, typename MV,
          typename VWORK, typename VIWORK>
1388 typename GeMatrix<MA>::IndexType
1389 jsv_native(JSV::Accuracy             accuracy,
1390            JSV::JobU                 jobU,
1391            JSV::JobV                 jobV,
1392            bool                      restrictedRange,
1393            bool                      considerTransA,
1394            bool                      perturb,
1395            GeMatrix<MA>              &A,
1396            DenseVector<VSVA>         &sva,
1397            GeMatrix<MU>              &U,
1398            GeMatrix<MV>              &V,
1399            DenseVector<VWORK>        &work,
1400            DenseVector<VIWORK>       &iwork)
1401 {
1402     typedef typename GeMatrix<MA>::ElementType  T;
1403 
1404     const char       JOBA = char(accuracy);
1405     const char       JOBU = char(jobU);
1406     const char       JOBV = char(jobV);
1407     const char       JOBR = (restrictedRange) ? 'R' : 'N';
1408     const char       JOBT = (considerTransA) ? 'T' : 'N';
1409     const char       JOBP = (perturb) ? 'P' : 'N';
1410     const INTEGER    M = A.numRows();
1411     const INTEGER    N = A.numCols();
1412     const INTEGER    LDA = A.leadingDimension();
1413     const INTEGER    LDU = U.leadingDimension();
1414     const INTEGER    LDV = V.leadingDimension();
1415     const INTEGER    LWORK = work.length();
1416     INTEGER          INFO;
1417 
1418     if (IsSame<T,DOUBLE>::value) {
1419         LAPACK_IMPL(dgejsv)(&JOBA,
1420                             &JOBU,
1421                             &JOBV,
1422                             &JOBR,
1423                             &JOBT,
1424                             &JOBP,
1425                             &M,
1426                             &N,
1427                             A.data(),
1428                             &LDA,
1429                             sva.data(),
1430                             U.data(),
1431                             &LDU,
1432                             V.data(),
1433                             &LDV,
1434                             work.data(),
1435                             &LWORK,
1436                             iwork.data(),
1437                             &INFO);
1438     } else {
1439         ASSERT(0);
1440     }
1441     return INFO;
1442 }
1443 
1444 #endif // CHECK_CXXLAPACK
1445 
1446 
1447 //== public interface ==========================================================
1448 template <typename MA, typename VSVA, typename MU, typename MV,
1449           typename VWORK, typename VIWORK>
1450 typename GeMatrix<MA>::IndexType
1451 jsv(JSV::Accuracy             accuracy,
1452     JSV::JobU                 jobU,
1453     JSV::JobV                 jobV,
1454     bool                      restrictedRange,
1455     bool                      considerTransA,
1456     bool                      perturb,
1457     GeMatrix<MA>              &A,
1458     DenseVector<VSVA>         &sva,
1459     GeMatrix<MU>              &U,
1460     GeMatrix<MV>              &V,
1461     DenseVector<VWORK>        &work,
1462     DenseVector<VIWORK>       &iwork)
1463 {
1464 //
1465 //  Test the input parameters
1466 //
1467 #   ifndef NDEBUG
1468     typedef typename GeMatrix<MA>::IndexType    IndexType;
1469 
1470     ASSERT(A.firstRow()==1);
1471     ASSERT(A.firstCol()==1);
1472 
1473     const IndexType m = A.numRows();
1474     const IndexType n = A.numCols();
1475 
1476     ASSERT(m>=n);
1477 
1478     ASSERT(sva.firstIndex()==1);
1479     ASSERT(sva.length()==n);
1480 
1481     ASSERT(U.firstRow()==1);
1482     ASSERT(U.firstCol()==1);
1483 
1484     ASSERT(V.firstRow()==1);
1485     ASSERT(V.firstCol()==1);
1486 
1487     ASSERT(iwork.length()==m+3*n);
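
//  Note: the dimensions of U and V are not checked here; they must satisfy
//  the DGEJSV requirements for the chosen jobU/jobV (for a full SVD this
//  typically means U with at least m rows and V being n x n).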
1488 #   endif
1489 
1490 //
1491 //  Make copies of output arguments
1492 //
1493 #   ifdef CHECK_CXXLAPACK
1494     typename GeMatrix<MA>::NoView        A_org     = A;
1495     typename DenseVector<VSVA>::NoView   sva_org   = sva;
1496     typename GeMatrix<MU>::NoView        U_org     = U;
1497     typename GeMatrix<MV>::NoView        V_org     = V;
1498     typename DenseVector<VWORK>::NoView  work_org  = work;
1499 #   endif
1500 
1501 //
1502 //  Call implementation
1503 //
1504     /*
1505     IndexType info = jsv_generic(accuracy, jobU, jobV,
1506                                  restrictedRange, considerTransA, perturb,
1507                                  A, sva, U, V, work);
1508     */
1509     IndexType info = jsv_native(accuracy, jobU, jobV,
1510                                 restrictedRange, considerTransA, perturb,
1511                                 A, sva, U, V, work, iwork);
1512 
1513 #   ifdef CHECK_CXXLAPACK
1514 //
1515 //  Make copies of results computed by the generic implementation
1516 //
1517     typename GeMatrix<MA>::NoView        A_generic     = A;
1518     typename DenseVector<VSVA>::NoView   sva_generic   = sva;
1519     typename GeMatrix<MU>::NoView        U_generic     = U;
1520     typename GeMatrix<MV>::NoView        V_generic     = V;
1521     typename DenseVector<VWORK>::NoView  work_generic  = work;
1522 //
1523 //  restore output arguments
1524 //
1525     A    = A_org;
1526     sva  = sva_org;
1527     U    = U_org;
1528     V    = V_org;
1529     work = work_org;
1530 //
1531 //  Compare generic results with results from the native implementation
1532 //
1533     IndexType _info = jsv_native(accuracy, jobU, jobV,
1534                                  restrictedRange, considerTransA, perturb,
1535                                  A, sva, U, V, work, iwork);
1536     bool failed = false;
    if (! isIdentical(A_generic, A, "A_generic", "A")) {
1538         std::cerr << "CXXLAPACK: A_generic = " << A_generic << std::endl;
1539         std::cerr << "F77LAPACK: A = " << A << std::endl;
1540         failed = true;
1541     }
    if (! isIdentical(sva_generic, sva, "sva_generic", "sva")) {
1543         std::cerr << "CXXLAPACK: sva_generic = " << sva_generic << std::endl;
1544         std::cerr << "F77LAPACK: sva = " << sva << std::endl;
1545         failed = true;
1546     }
    if (! isIdentical(U_generic, U, "U_generic", "U")) {
1548         std::cerr << "CXXLAPACK: U_generic = " << U_generic << std::endl;
1549         std::cerr << "F77LAPACK: U = " << U << std::endl;
1550         failed = true;
1551     }
    if (! isIdentical(V_generic, V, "V_generic", "V")) {
1553         std::cerr << "CXXLAPACK: V_generic = " << V_generic << std::endl;
1554         std::cerr << "F77LAPACK: V = " << V << std::endl;
1555         failed = true;
1556     }
    if (! isIdentical(work_generic, work, "work_generic", "work")) {
1558         std::cerr << "CXXLAPACK: work_generic = " << work_generic << std::endl;
1559         std::cerr << "F77LAPACK: work = " << work << std::endl;
1560         failed = true;
1561     }
    if (! isIdentical(info, _info, "info", "_info")) {
1563         std::cerr << "CXXLAPACK: info = " << info << std::endl;
1564         std::cerr << "F77LAPACK: _info = " << _info << std::endl;
1565         failed = true;
1566     }
1567 
1568     if (failed) {
1569         std::cerr << "error in: jsv.tcc " << std::endl;
1570         ASSERT(0);
1571     } else {
1572         std::cerr << "passed: jsv.tcc " << std::endl;
1573     }
1574 #   endif
1575 
1576     return info;
1577 }
1578 
1579 //-- forwarding ----------------------------------------------------------------
1580 template <typename MA, typename VSVA, typename MU, typename MV,
          typename VWORK, typename VIWORK>
1582 typename MA::IndexType
1583 jsv(JSV::Accuracy             accuracy,
1584     JSV::JobU                 jobU,
1585     JSV::JobV                 jobV,
1586     bool                      restrictedRange,
1587     bool                      considerTransA,
1588     bool                      perturb,
1589     MA                        &&A,
1590     VSVA                      &&sva,
1591     MU                        &&U,
1592     MV                        &&V,
1593     VWORK                     &&work,
1594     VIWORK                    &&iwork)
1595 {
1596     typename MA::IndexType info;
1597 
1598     CHECKPOINT_ENTER;
1599     info = jsv(accuracy, jobU, jobV,
1600                restrictedRange, considerTransA, perturb,
1601                A, sva, U, V, work, iwork);
1602     CHECKPOINT_LEAVE;
1603 
1604     return info;
1605 }
1606 
1607 } } // namespace lapack, flens
1608 
1609 #endif // FLENS_LAPACK_SVD_JSV_TCC