1 /*
   2  *   Copyright (c) 2012, Michael Lehn
   3  *
   4  *   All rights reserved.
   5  *
   6  *   Redistribution and use in source and binary forms, with or without
   7  *   modification, are permitted provided that the following conditions
   8  *   are met:
   9  *
  10  *   1) Redistributions of source code must retain the above copyright
  11  *      notice, this list of conditions and the following disclaimer.
  12  *   2) Redistributions in binary form must reproduce the above copyright
  13  *      notice, this list of conditions and the following disclaimer in
  14  *      the documentation and/or other materials provided with the
  15  *      distribution.
  16  *   3) Neither the name of the FLENS development group nor the names of
  17  *      its contributors may be used to endorse or promote products derived
  18  *      from this software without specific prior written permission.
  19  *
  20  *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  21  *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  22  *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  23  *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
  24  *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  25  *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  26  *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  27  *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  28  *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  29  *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  30  *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  31  */
  32 
  33 /* Based on
  34  *
  35        SUBROUTINE DGESVJ( JOBA, JOBU, JOBV, M, N, A, LDA, SVA, MV, V,
  36       $                   LDV, WORK, LWORK, INFO )
  37  *
  38  *  -- LAPACK routine (version 3.3.1)                                  --
  39  *
  40  *  -- Contributed by Zlatko Drmac of the University of Zagreb and     --
  41  *  -- Kresimir Veselic of the Fernuniversitaet Hagen                  --
  42  *  -- April 2011                                                      --
  43  *
  44  *  -- LAPACK is a software package provided by Univ. of Tennessee,    --
  45  *  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
  46  *
  47  * This routine is also part of SIGMA (version 1.23, October 23. 2008.)
  48  * SIGMA is a library of algorithms for highly accurate algorithms for
  49  * computation of SVD, PSVD, QSVD, (H,K)-SVD, and for solution of the
  50  * eigenvalue problems Hx = lambda M x, H M x = lambda x with H, M > 0.
  51  *
  52  */
  53 
  54 #ifndef FLENS_LAPACK_SVD_SVJ_TCC
  55 #define FLENS_LAPACK_SVD_SVJ_TCC 1
  56 
  57 #include <flens/blas/blas.h>
  58 #include <flens/lapack/lapack.h>
  59 
  60 namespace flens { namespace lapack {
  61 
  62 //== generic lapack implementation =============================================
  63 
  64 template <typename MA, typename VSVA, typename MV, typename VWORK>
  65 typename GeMatrix<MA>::IndexType
  66 svj_generic(SVJ::TypeA                typeA,
  67             SVJ::JobU                 jobU,
  68             SVJ::JobV                 jobV,
  69             GeMatrix<MA>              &A,
  70             DenseVector<VSVA>         &sva,
  71             GeMatrix<MV>              &V,
  72             DenseVector<VWORK>        &work)
  73 {
  74     using std::abs;
  75     using std::max;
  76     using std::min;
  77     using std::sqrt;
  78     using std::swap;
  79 
  80     typedef typename GeMatrix<MA>::ElementType  ElementType;
  81     typedef typename GeMatrix<MA>::IndexType    IndexType;
  82 
  83     const ElementType  Zero(0), Half(0.5), One(1);
  84     const IndexType    nSweep = 30;
  85 
  86     const Underscore<IndexType>  _;
  87 
  88     ElementType fastr_data[5];
  89     DenseVectorView<ElementType>
  90         fastr  = typename DenseVectorView<ElementType>::Engine(5, fastr_data);
  91 
  92     const IndexType  m     = A.numRows();
  93     const IndexType  n     = A.numCols();
  94     const IndexType  mv    = V.numRows();
  95     const IndexType  lWork = work.length();
  96 
  97     const bool lower    = (typeA==SVJ::Lower);
  98     const bool upper    = (typeA==SVJ::Upper);
  99     const bool lhsVec   = (jobU==SVJ::ComputeU);
 100     const bool controlU = (jobU==SVJ::ControlU);
 101     const bool applyV   = (jobV==SVJ::ApplyV);
 102     const bool rhsVec   = (jobV==SVJ::ComputeV) || applyV;
 103 
 104     IndexType  info = 0;
 105 //
 106 //#:) Quick return for void matrix
 107 //
 108     if ((m==0) || (n==0)) {
 109         return info;
 110     }
 111 //
 112 //  Set numerical parameters
 113 //  The stopping criterion for Jacobi rotations is
 114 //
 115 //  max_{i<>j}|A(:,i)^T * A(:,j)|/(||A(:,i)||*||A(:,j)||) < CTOL*EPS
 116 //
 117 //  where EPS is the round-off and CTOL is defined as follows:
 118 //
 119     ElementType  cTol;
 120     if (controlU) {
 121 //      ... user controlled
 122         cTol = work(1);
 123     } else {
 124 //      ... default
 125         if (lhsVec || rhsVec) {
 126             cTol = sqrt(ElementType(m));
 127         } else {
 128             cTol = ElementType(m);
 129         }
 130     }
 131 //    ... and the machine dependent parameters are
 132 //[!]  (Make sure that DLAMCH() works properly on the target machine.)
 133 //
 134     const ElementType eps = lamch<ElementType>(Eps);
 135     const ElementType rootEps = sqrt(eps);
 136     const ElementType safeMin = lamch<ElementType>(SafeMin);
 137     const ElementType rootSafeMin = sqrt(safeMin);
 138     const ElementType small = safeMin / eps;
 139     const ElementType big = lamch<ElementType>(OverflowThreshold);
 140 //  const ElementType big = One / safeMin;
 141     const ElementType rootBig = One / rootSafeMin;
 142     const ElementType bigTheta = One / rootEps;
 143 
 144     const ElementType tol = cTol*eps;
 145     const ElementType rootTol = sqrt(tol);
 146 
 147     if (ElementType(m)*eps>=One) {
 148 //      we return -4 to keep it compatible with the LAPACK implementation
 149         return -4;
 150     }
 151 //
 152 //    Initialize the right singular vector matrix.
 153 //
 154     if (rhsVec && (!applyV)) {
 155         V         = Zero;
 156         V.diag(0) = One;
 157     }
 158 //
 159 //  Initialize SVA( 1:N ) = ( ||A e_i||_2, i = 1:N )
 160 //(!) If necessary, scale A to protect the largest singular value
 161 //  from overflow. It is possible that saving the largest singular
 162 //  value destroys the information about the small ones.
 163 //  This initial scaling is almost minimal in the sense that the
 164 //  goal is to make sure that no column norm overflows, and that
 165 //  DSQRT(N)*max_i SVA(i) does not overflow. If INFinite entries
 166 //  in A are detected, the procedure returns with INFO=-6.
 167 //
 168     ElementType skl = One / sqrt(ElementType(m)*ElementType(n));
 169     bool noScale = true;
 170     bool goScale = true;
 171 
 172     ElementType aapp, aaqq, tmp;
 173 
 174     if (lower) {
 175 //      the input matrix is M-by-N lower triangular (trapezoidal)
 176         for (IndexType p=1; p<=n; ++p) {
 177             aapp = Zero;
 178             aaqq = One;
 179             lassq(A(_(p,m),p), aapp, aaqq);
 180             if (aapp>big) {
 181                 return -6;
 182             }
 183             aaqq = sqrt(aaqq);
 184             if ((aapp<(big/aaqq)) && noScale) {
 185                 sva(p) = aapp*aaqq;
 186             } else {
 187                 noScale = false;
 188                 sva(p) = aapp*(aaqq*skl);
 189                 if (goScale) {
 190                     goScale = false;
 191                     sva(_(1,p-1)) *= skl;
 192                 }
 193             }
 194         }
 195     } else if (upper) {
 196 //      the input matrix is M-by-N upper triangular (trapezoidal)
 197         for (IndexType p=1; p<=n; ++p) {
 198             aapp = Zero;
 199             aaqq = One;
 200             lassq(A(_(1,p), p), aapp, aaqq);
 201             if (aapp>big) {
 202                 return -6;
 203             }
 204             aaqq = sqrt(aaqq);
 205             if ((aapp<(big/aaqq)) && noScale) {
 206                 sva(p) = aapp*aaqq;
 207             } else {
 208                 noScale = false;
 209                 sva(p) = aapp*(aaqq*skl);
 210                 if (goScale) {
 211                     goScale = false;
 212                     sva(_(1,p-1)) *= skl;
 213                 }
 214             }
 215         }
 216     } else {
 217 //      the input matrix is M-by-N general dense
 218         for (IndexType p=1; p<=n; ++p) {
 219             aapp = Zero;
 220             aaqq = One;
 221             lassq(A(_,p), aapp, aaqq);
 222             if (aapp>big) {
 223                 return -6;
 224             }
 225             aaqq = sqrt(aaqq);
 226             if ((aapp<(big/aaqq)) && noScale) {
 227                 sva(p) = aapp*aaqq;
 228             } else {
 229                 noScale = false;
 230                 sva(p) = aapp*(aaqq*skl);
 231                 if (goScale) {
 232                     goScale = false;
 233                     sva(_(1,p-1)) *= skl;
 234                 }
 235             }
 236         }
 237     }
 238 
 239     if (noScale) {
 240         skl = One;
 241     }
 242 //
 243 //  Move the smaller part of the spectrum from the underflow threshold
 244 //(!) Start by determining the position of the nonzero entries of the
 245 //  array SVA() relative to ( SFMIN, BIG ).
 246 //
 247     aapp = Zero;
 248     aaqq = big;
 249     for (IndexType p=1; p<=n; ++p) {
 250         if (sva(p)!=Zero) {
 251             aaqq = min(aaqq,sva(p));
 252         }
 253         aapp = max(aapp,sva(p));
 254     }
 255 //
 256 //#:) Quick return for zero matrix
 257 //
 258     if (aapp==Zero) {
 259         if (lhsVec) {
 260             A = Zero;
 261             A.diag(0) = One;
 262         }
 263         work(1)      = One;
 264         work(_(2,6)) = Zero;
 265         return info;
 266     }
 267 //
 268 //#:) Quick return for one-column matrix
 269 //
 270     if (n==1) {
 271         if (lhsVec) {
 272             lascl(LASCL::FullMatrix, 00, sva(1), skl, A);
 273         }
 274         work(1) = One / skl;
 275         if (sva(1)>=safeMin) {
 276             work(2) = One;
 277         } else {
 278             work(2) = Zero;
 279         }
 280         work(_(3,6)) = Zero;
 281         return info;
 282     }
 283 //
 284 //  Protect small singular values from underflow, and try to
 285 //  avoid underflows/overflows in computing Jacobi rotations.
 286 //
 287     ElementType cs, sn;
 288 
 289     sn = sqrt(safeMin/eps);
 290     tmp = sqrt(big/ElementType(n));
 291     if ((aapp<=sn) || (aaqq>=tmp) || (sn<=aaqq && aapp<=tmp)) {
 292         tmp = min(big, tmp/aapp);
 293 //      aaqq  = aaqq*tmp
 294 //      aapp  = aapp*tmp
 295     } else if ((aaqq<=sn) && (aapp<=tmp)) {
 296         tmp = min(sn/aaqq, big/(aapp*sqrt(ElementType(n))));
 297 //      aaqq  = aaqq*tmp
 298 //      aapp  = aapp*tmp
 299     } else if ((aaqq>=sn) && (aapp>=tmp)) {
 300         tmp = max(sn/aaqq, tmp/aapp);
 301 //      aaqq  = aaqq*tmp
 302 //      aapp  = aapp*tmp
 303     } else if ((aaqq<=sn) && (aapp>=tmp)) {
 304         tmp = min(sn/aaqq, big/(sqrt(ElementType(n))*aapp));
 305 //      aaqq  = aaqq*tmp
 306 //      aapp  = aapp*tmp
 307     } else {
 308         tmp = One;
 309     }
 310 //
 311 //  Scale, if necessary
 312 //
 313     if (tmp!=One) {
 314         lascl(LASCL::FullMatrix, 00, One, tmp, sva);
 315     }
 316     skl *= tmp;
 317     if (skl!=One) {
 318         if (upper) {
 319             lascl(LASCL::UpperTriangular, 00, One, skl, A);
 320         } else if (lower) {
 321             lascl(LASCL::LowerTriangular, 00, One, skl, A);
 322         } else {
 323             lascl(LASCL::FullMatrix, 00, One, skl, A);
 324         }
 325         skl = One / skl;
 326     }
 327 //
 328 //  Row-cyclic Jacobi SVD algorithm with column pivoting
 329 //
 330     const IndexType emptsw = n*(n-1)/2;
 331     IndexType notRot = 0;
 332     fastr(1) = Zero;
 333 //
 334 //  A is represented in factored form A = A * diag(WORK), where diag(WORK)
 335 //  is initialized to identity. WORK is updated during fast scaled
 336 //  rotations.
 337 //
 338     work(_(1,n)) = One;
 339 //
 340 //
 341     IndexType swBand = 3;
 342 //[TP] SWBAND is a tuning parameter [TP]. It is meaningful and effective
 343 //  if DGESVJ is used as a computational routine in the preconditioned
 344 //  Jacobi SVD algorithm DGESVJ. For sweeps i=1:SWBAND the procedure
 345 //  works on pivots inside a band-like region around the diagonal.
 346 //  The boundaries are determined dynamically, based on the number of
 347 //  pivots above a threshold.
 348 //
 349     const IndexType kbl = min(IndexType(8),n);
 350 //[TP] KBL is a tuning parameter that defines the tile size in the
 351 //  tiling of the p-q loops of pivot pairs. In general, an optimal
 352 //  value of KBL depends on the matrix dimensions and on the
 353 //  parameters of the computer's memory.
 354 //
 355     IndexType nbl = n / kbl;
 356     if (nbl*kbl!=n) {
 357         ++nbl;
 358     }
 359 
 360     const IndexType blSkip = pow(kbl, 2);
 361 //[TP] BLKSKIP is a tuning parameter that depends on SWBAND and KBL.
 362 //
 363     const IndexType rowSkip = min(IndexType(5), kbl);
 364 //[TP] ROWSKIP is a tuning parameter.
 365 //
 366     const IndexType lkAhead = 1;
 367 //[TP] LKAHEAD is a tuning parameter.
 368 //
 369 //  Quasi block transformations, using the lower (upper) triangular
 370 //  structure of the input matrix. The quasi-block-cycling usually
 371 //  invokes cubic convergence. Big part of this cycle is done inside
 372 //  canonical subspaces of dimensions less than M.
 373 //
 374     if ((lower || upper) && (n>max(IndexType(64), 4*kbl))) {
 375 //[TP]  The number of partition levels and the actual partition are
 376 //      tuning parameters.
 377         const IndexType n4 = n / 4;
 378         const IndexType n2 = n / 2;
 379         const IndexType n34 = 3*n4;
 380         const IndexType q = (applyV) ? 0 : 1;
 381         const IndexType qm = (applyV) ? mv : 0;
 382 
 383         auto _work = work(_(n+1,lWork));
 384 
 385         if (lower) {
 386 //
 387 //          This works very well on lower triangular matrices, in particular
 388 //          in the framework of the preconditioned Jacobi SVD (xGEJSV).
 389 //          The idea is simple:
 390 //          [+ 0 0 0]   Note that Jacobi transformations of [0 0]
 391 //          [+ + 0 0]                                       [0 0]
 392 //          [+ + x 0]   actually work on [x 0]              [x 0]
 393 //          [+ + x x]                    [x x].             [x x]
 394 //
 395             auto A1 = A(_(    1,m),_(    1, n4));
 396             auto A2 = A(_( n4+1,m),_( n4+1, n2));
 397             auto A3 = A(_( n2+1,m),_( n2+1,n34));
 398             auto A4 = A(_(n34+1,m),_(n34+1,  n));
 399 
 400             auto A12 = A(_(   1,m),_(   1,n2));
 401             auto A34 = A(_(n2+1,m),_(n2+1, n));
 402 
 403             auto V1 = V(_(      1, n4*q+qm),_(    1,n4));
 404             auto V2 = V(_( n4*q+1, n2*q+qm),_( n4+1,n2));
 405             auto V3 = V(_( n2*q+1,n34*q+qm),_( n2+1,n34));
 406             auto V4 = V(_(n34*q+1, mv*q+qm),_(n34+1,n));
 407 
 408             auto V12 = V(_(      1,n2*q+qm),_(   1,n2));
 409             auto V34 = V(_( n2*q+1,mv*q+qm),_(n2+1,n));
 410 
 411             auto d1 = work(_(    1, n4));
 412             auto d2 = work(_( n4+1, n2));
 413             auto d3 = work(_( n2+1,n34));
 414             auto d4 = work(_(n34+1,  n));
 415 
 416             auto d12 = work(_(    1,n2));
 417             auto d34 = work(_( n2+1, n));
 418 
 419             auto sva1 = sva(_(    1, n4));
 420             auto sva2 = sva(_( n4+1, n2));
 421             auto sva3 = sva(_( n2+1,n34));
 422             auto sva4 = sva(_(n34+1,  n));
 423 
 424             auto sva12 = sva(_(   1,n2));
 425             auto sva34 = sva(_(n2+1, n));
 426 
 427             svj0(jobV, A4, d4, sva4, V4, eps, safeMin, tol, 2, _work);
 428             svj0(jobV, A3, d3, sva3, V3, eps, safeMin, tol, 2, _work);
 429             svj1(jobV, n4, A34, d34, sva34, V34, eps, safeMin, tol, 1, _work);
 430             svj0(jobV, A2, d2, sva2, V2, eps, safeMin, tol, 1, _work);
 431             svj0(jobV, A1, d1, sva1, V1, eps, safeMin, tol, 1, _work);
 432             svj1(jobV, n4, A12, d12, sva12, V12, eps, safeMin, tol, 1, _work);
 433 
 434         } else if (upper) {
 435 
 436             auto A1 = A(_(1,   n4),_(    1,   n4));
 437             auto A2 = A(_(1,   n2),_( n4+1,n4+n4));
 438             auto A3 = A(_(1,n2+n4),_( n2+1,n2+n4));
 439 
 440             auto A12 = A(_(1,n2),_(1,n2));
 441 
 442             auto V1 = V(_(      1,     n4*q+qm),_(    1,   n4));
 443             auto V2 = V(_( n4*q+1,(n4+n4)*q+qm),_( n4+1,n4+n4));
 444             auto V3 = V(_( n2*q+1,(n2+n4)*q+qm),_( n2+1,n2+n4));
 445 
 446             auto V12 = V(_(1,n2*q+qm),_(1,n2));
 447 
 448             auto d1 = work(_(    1,   n4));
 449             auto d2 = work(_( n4+1,n4+n4));
 450             auto d3 = work(_( n2+1,n2+n4));
 451 
 452             auto d12 = work(_(1,n2));
 453 
 454             auto sva1 = sva(_(    1,   n4));
 455             auto sva2 = sva(_( n4+1,n4+n4));
 456             auto sva3 = sva(_( n2+1,n2+n4));
 457 
 458             auto sva12 = sva(_(1,n2));
 459 
 460             svj0(jobV, A1, d1, sva1, V1, eps, safeMin, tol, 2, _work);
 461             svj0(jobV, A2, d2, sva2, V2, eps, safeMin, tol, 1, _work);
 462             svj1(jobV, n4, A12, d12, sva12, V12, eps, safeMin, tol, 1, _work);
 463             svj0(jobV, A3, d3, sva3, V3, eps, safeMin, tol, 1, _work);
 464 
 465         }
 466 
 467     }
 468 //
 469 //  .. Row-cyclic pivot strategy with de Rijk's pivoting ..
 470 //
 471     ElementType max_aapq, aapq, aapp0, aqoap, apoaq;
 472     ElementType max_sinj;
 473     ElementType theta, thetaSign, t;
 474 
 475     IndexType   i, iswRot, pSkipped, ibr, igl, jgl, ir1, p, q, jbc, ijblsk;
 476 
 477     bool        converged = false;
 478     bool        rotOk;
 479 
 480     for (i=1; i<=nSweep; ++i) {
 481 //
 482 //  .. go go go ...
 483 //
 484        max_aapq = Zero;
 485        max_sinj = Zero;
 486 
 487        iswRot   = 0;
 488        pSkipped = 0;
 489 
 490        notRot   = 0;
 491 //
 492 //     Each sweep is unrolled using KBL-by-KBL tiles over the pivot pairs
 493 //     1 <= p < q <= N. This is the first step toward a blocked implementation
 494 //     of the rotations. New implementation, based on block transformations,
 495 //     is under development.
 496 //
 497        for (ibr=1; ibr<=nbl; ++ibr) {
 498 
 499           igl = (ibr-1)*kbl + 1;
 500 
 501           for (ir1=0; ir1<=min(lkAhead,nbl-ibr); ++ir1) {
 502 
 503              igl += ir1*kbl;
 504 
 505              for (p=igl; p<=min(igl+kbl-1,n-1); ++p) {
 506 //
 507 //    .. de Rijk's pivoting
 508 //
 509                 q = blas::iamax(sva(_(p,n))) + p - 1;
 510                 if (p!=q) {
 511                     blas::swap(A(_,p),A(_,q));
 512                     if (rhsVec) {
 513                         blas::swap(V(_,p),V(_,q));
 514                     }
 515                     swap(sva(p),sva(q));
 516                     swap(work(p),work(q));
 517                 }
 518 
 519                 if (ir1==0) {
 520 //
 521 //     Column norms are periodically updated by explicit
 522 //     norm computation.
 523 //     Caveat:
 524 //     Unfortunately, some BLAS implementations compute DNRM2(M,A(1,p),1)
 525 //     as DSQRT(DDOT(M,A(1,p),1,A(1,p),1)), which may cause the result to
 526 //     overflow for ||A(:,p)||_2 > DSQRT(overflow_threshold), and to
 527 //     underflow for ||A(:,p)||_2 < DSQRT(underflow_threshold).
 528 //     Hence, DNRM2 cannot be trusted, not even in the case when
 529 //     the true norm is far from the under(over)flow boundaries.
 530 //     If properly implemented DNRM2 is available, the IF-THEN-ELSE
 531 //     below should read "AAPP = DNRM2( M, A(1,p), 1 ) * WORK(p)".
 532 //
 533 
 534                    if ((sva(p)<rootBig) && (sva(p)>rootSafeMin)) {
 535                       sva(p) = blas::nrm2(A(_,p)) * work(p);
 536                    } else {
 537                        tmp = Zero;
 538                        aapp = One;
 539                        lassq(A(_,p), tmp, aapp);
 540                        sva(p) = tmp * sqrt(aapp) * work(p);
 541                    }
 542                    aapp = sva(p);
 543                 } else {
 544                    aapp = sva(p);
 545                 }
 546 
 547                 if (aapp>Zero) {
 548 
 549                    pSkipped = 0;
 550 
 551                    for (q=p+1; q<=min(igl+kbl-1,n); ++q) {
 552 
 553                       aaqq = sva(q);
 554 
 555                       if (aaqq>Zero) {
 556 
 557                          aapp0 = aapp;
 558                          if (aaqq>=One) {
 559                             rotOk = (small*aapp<=aaqq);
 560                             if (aapp<big/aaqq) {
 561                                aapq = (A(_,p)*A(_,q)*work(p)*work(q)/aaqq)
 562                                     / aapp;
 563                             } else {
 564                                auto _work = work(_(n+1,n+m));
 565 
 566                                _work = A(_,p);
 567                                lascl(LASCL::FullMatrix, 00,
 568                                      aapp, work(p), _work);
 569                                aapq = _work*A(_,q)*work(q)/aaqq;
 570                             }
 571                          } else {
 572                             rotOk = (aapp<=aaqq/small);
 573                             if (aapp>(small/aaqq)) {
 574                                 aapq = (A(_,p)*A(_,q)*work(p)*work(q)/aaqq)
 575                                      / aapp;
 576                             } else {
 577                                auto _work = work(_(n+1,n+m));
 578 
 579                                _work = A(_,q);
 580                                lascl(LASCL::FullMatrix, 00,
 581                                      aaqq, work(q), _work);
 582                                aapq = _work*A(_,p)*work(p)/aapp;
 583                             }
 584                          }
 585 
 586                          max_aapq = max(max_aapq, abs(aapq));
 587 //
 588 //     TO rotate or NOT to rotate, THAT is the question ...
 589 //
 590                          if (abs(aapq)>tol) {
 591 //
 592 //        .. rotate
 593 //[RTD]    ROTATED = ROTATED + ONE
 594 //
 595                             if (ir1==0) {
 596                                notRot = 0;
 597                                pSkipped = 0;
 598                                ++iswRot;
 599                             }
 600 
 601                             if (rotOk) {
 602 
 603                                aqoap = aaqq / aapp;
 604                                apoaq = aapp / aaqq;
 605                                theta = -Half*abs(aqoap-apoaq)/aapq;
 606 
 607                                if (abs(theta)>bigTheta) {
 608 
 609                                   t = Half / theta;
 610                                   fastr(3) =  t*work(p) / work(q);
 611                                   fastr(4) = -t*work(q) / work(p);
 612                                   blas::rotm(A(_,p), A(_,q), fastr);
 613                                   if (rhsVec) {
 614                                       blas::rotm(V(_,p), V(_,q), fastr);
 615                                   }
 616                                   sva(q) =aaqq*sqrt(max(Zero,One+t*apoaq*aapq));
 617                                   aapp *= sqrt(max(Zero, One-t*aqoap*aapq));
 618                                   max_sinj = max(max_sinj, abs(t));
 619 
 620                                } else {
 621 //
 622 //              .. choose correct signum for THETA and rotate
 623 //
 624                                   thetaSign = -sign(One,aapq);
 625                                   t = One
 626                                     / (theta +thetaSign*sqrt(One+theta*theta));
 627                                   cs = sqrt(One / (One+t*t));
 628                                   sn = t*cs;
 629 
 630                                   max_sinj = max(max_sinj, abs(sn));
 631                                   sva(q) = aaqq
 632                                           *sqrt(max(Zero, One+t*apoaq*aapq));
 633                                   aapp *= sqrt(max(Zero, One-t*aqoap*aapq));
 634 
 635                                   apoaq = work(p) / work(q);
 636                                   aqoap = work(q) / work(p);
 637                                   if (work(p)>=One) {
 638                                      if (work(q)>=One) {
 639                                         fastr(3) =  t*apoaq;
 640                                         fastr(4) = -t*aqoap;
 641                                         work(p) *= cs;
 642                                         work(q) *= cs;
 643                                         blas::rotm(A(_,p), A(_,q), fastr);
 644                                         if (rhsVec) {
 645                                             blas::rotm(V(_,p), V(_,q), fastr);
 646                                         }
 647                                      } else {
 648                                         A(_,p) -= t*aqoap*A(_,q);
 649                                         A(_,q) += cs*sn*apoaq*A(_,p);
 650                                         work(p) *= cs;
 651                                         work(q) /= cs;
 652                                         if (rhsVec) {
 653                                            V(_,p) -= t*aqoap*V(_,q);
 654                                            V(_,q) += cs*sn*apoaq*V(_,p);
 655                                         }
 656                                      }
 657                                   } else {
 658                                      if (work(q)>=One) {
 659                                         A(_,q) += t*apoaq*A(_,p);
 660                                         A(_,p) -= cs*sn*aqoap*A(_,q);
 661                                         work(p) /= cs;
 662                                         work(q) *= cs;
 663                                         if (rhsVec) {
 664                                            V(_,q) += t*apoaq*V(_,p);
 665                                            V(_,p) -= cs*sn*aqoap*V(_,q);
 666                                         }
 667                                      } else {
 668                                         if (work(p)>=work(q)) {
 669                                            A(_,p) -= t*aqoap*A(_,q);
 670                                            A(_,q) += cs*sn*apoaq*A(_,p);
 671                                            work(p) *= cs;
 672                                            work(q) /= cs;
 673                                            if (rhsVec) {
 674                                               V(_,p) -= t*aqoap*V(_,q);
 675                                               V(_,q) += cs*sn*apoaq*V(_,p);
 676                                            }
 677                                         } else {
 678                                            A(_,q) += t*apoaq*A(_,p);
 679                                            A(_,p) -= cs*sn*aqoap*A(_,q);
 680                                            work(p) /= cs;
 681                                            work(q) *= cs;
 682                                            if (rhsVec) {
 683                                               V(_,q) += t*apoaq*V(_,p);
 684                                               V(_,p) -= cs*sn*aqoap*V(_,q);
 685                                            }
 686                                         }
 687                                      }
 688                                   }
 689                                }
 690 
 691                             } else {
 692 //             .. have to use modified Gram-Schmidt like transformation
 693                                auto _work = work(_(n+1,lWork));
 694 
 695                                _work = A(_,p);
 696                                lascl(LASCL::FullMatrix, 00,
 697                                      aapp, One, _work);
 698                                lascl(LASCL::FullMatrix, 00,
 699                                      aaqq, One, A(_,q));
 700                                tmp = -aapq*work(p)/work(q);
 701                                A(_,q) += tmp*_work;
 702                                lascl(LASCL::FullMatrix, 00,
 703                                      One, aaqq, A(_,q));
 704                                sva(q) = aaqq*sqrt(max(Zero, One-aapq*aapq));
 705                                max_sinj = max(max_sinj, safeMin);
 706                             }
 707 //        END IF ROTOK THEN ... ELSE
 708 //
 709 //        In the case of cancellation in updating SVA(q), SVA(p)
 710 //        recompute SVA(q), SVA(p).
 711 //
 712                             if (pow(sva(q)/aaqq,2)<=rootEps) {
 713                                if ((aaqq<rootBig) && (aaqq>rootSafeMin)) {
 714                                   sva(q) = blas::nrm2(A(_,q))*work(q);
 715                                } else {
 716                                   t = Zero;
 717                                   aaqq = One;
 718                                   lassq(A(_,q), t, aaqq);
 719                                   sva(q) = t*sqrt(aaqq)*work(q);
 720                                }
 721                             }
 722                             if (aapp/aapp0<=rootEps) {
 723                                if ((aapp<rootBig) && (aapp>rootSafeMin)) {
 724                                   aapp = blas::nrm2(A(_,p))*work(p);
 725                                } else {
 726                                   t = Zero;
 727                                   aapp = One;
 728                                   lassq(A(_,p), t, aapp);
 729                                   aapp = t*sqrt(aapp)*work(p);
 730                                }
 731                                sva(p) = aapp;
 732                             }
 733 //
 734                          } else {
 735 //     A(:,p) and A(:,q) already numerically orthogonal
 736                             if (ir1==0) {
 737                                 ++notRot;
 738                             }
 739 //[RTD]    SKIPPED  = SKIPPED  + 1
 740                             ++pSkipped;
 741                          }
 742                       } else {
 743 //     A(:,q) is zero column
 744                          if (ir1==0) {
 745                              ++notRot;
 746                          }
 747                          ++pSkipped;
 748                       }
 749 //
 750                       if ((i<=swBand) && (pSkipped>rowSkip)) {
 751                          if (ir1==0) {
 752                              aapp = -aapp;
 753                          }
 754                          notRot = 0;
 755                          break;
 756                       }
 757 
 758                    }
 759 //  END q-LOOP
 760 //
 761                    sva(p) = aapp;
 762 
 763                 } else {
 764                    sva(p) = aapp;
 765                    if ((ir1==0) && (aapp==Zero)) {
 766                        notRot += min(igl+kbl-1,n) - p;
 767                    }
 768                 }
 769 
 770              }
 771 //  end of the p-loop
 772 //  end of doing the block ( ibr, ibr )
 773           }
 774 //  end of ir1-loop
 775 //
 776 //... go to the off diagonal blocks
 777 //
 778           igl = (ibr-1)*kbl + 1;
 779 
 780           for (jbc=ibr+1; jbc<=nbl; ++jbc) {
 781 
 782              jgl = (jbc-1)*kbl + 1;
 783 //
 784 //     doing the block at ( ibr, jbc )
 785 //
 786              ijblsk = 0;
 787              for (p=igl; p<=min(igl+kbl-1,n); ++p) {
 788 
 789                 aapp = sva(p);
 790                 if (aapp>Zero) {
 791 
 792                    pSkipped = 0;
 793 
 794                    for (q=jgl; q<=min(jgl+kbl-1,n); ++q) {
 795 
 796                       aaqq = sva(q);
 797                       if (aaqq>Zero) {
 798                          aapp0 = aapp;
 799 
 800 //
 801 //  .. M x 2 Jacobi SVD ..
 802 //
 803 //     Safe Gram matrix computation
 804 //
 805                          if (aaqq>=One) {
 806                             if (aapp>=aaqq) {
 807                                rotOk = (small*aapp)<=aaqq;
 808                             } else {
 809                                rotOk = (small*aaqq)<=aapp;
 810                             }
 811                             if (aapp<(big/aaqq)) {
 812                                aapq = (A(_,p)*A(_,q)*work(p)*work(q)/aaqq)
 813                                     / aapp;
 814                             } else {
 815                                auto _work = work(_(n+1,n+m));
 816 
 817                                _work = A(_,p);
 818                                lascl(LASCL::FullMatrix, 00,
 819                                      aapp, work(p), _work);
 820                                aapq = _work*A(_,q)*work(q) / aaqq;
 821                             }
 822                          } else {
 823                             if (aapp>=aaqq) {
 824                                rotOk = aapp<=(aaqq/small);
 825                             } else {
 826                                rotOk = aaqq<=(aapp/small);
 827                             }
 828                             if (aapp>(small/aaqq)) {
 829                                aapq = (A(_,p)*A(_,q)*work(p)*work(q)/aaqq)
 830                                     / aapp;
 831                             } else {
 832                                auto _work = work(_(n+1,n+m));
 833 
 834                                _work = A(_,q);
 835                                lascl(LASCL::FullMatrix, 00,
 836                                      aaqq, work(q), _work);
 837                                aapq = _work*A(_,p)*work(p)/aapp;
 838                             }
 839                          }
 840 
 841                          max_aapq = max(max_aapq, abs(aapq));
 842 //
 843 //     TO rotate or NOT to rotate, THAT is the question ...
 844 //
 845                          if (abs(aapq)>tol) {
 846                             notRot = 0;
 847 //[RTD]    ROTATED  = ROTATED + 1
 848                             pSkipped = 0;
 849                             ++iswRot;
 850 
 851                             if (rotOk) {
 852 
 853                                aqoap = aaqq / aapp;
 854                                apoaq = aapp / aaqq;
 855                                theta = -Half*abs(aqoap-apoaq)/aapq;
 856                                if (aaqq>aapp0) {
 857                                   theta = -theta;
 858                                }
 859 
 860                                if (abs(theta)>bigTheta) {
 861                                   t = Half/theta;
 862                                   fastr(3) =  t*work(p) / work(q);
 863                                   fastr(4) = -t*work(q) / work(p);
 864                                   blas::rotm(A(_,p), A(_,q), fastr);
 865                                   if (rhsVec) {
 866                                      blas::rotm(V(_,p), V(_,q), fastr);
 867                                   }
 868                                   sva(q) = aaqq
 869                                          * sqrt(max(Zero, One+t*apoaq*aapq));
 870                                   aapp *= sqrt(max(Zero,One-t*aqoap*aapq));
 871                                   max_sinj = max(max_sinj, abs(t));
 872                                } else {
 873 //
 874 //              .. choose correct signum for THETA and rotate
 875 //
 876                                   thetaSign = -sign(One,aapq);
 877                                   if (aaqq>aapp0) {
 878                                       thetaSign = -thetaSign;
 879                                   }
 880                                   t = One
 881                                     / (theta+thetaSign*sqrt(One+theta*theta));
 882                                   cs = sqrt(One / (One + t*t));
 883                                   sn = t*cs;
 884                                   max_sinj = max(max_sinj, abs(sn));
 885                                   sva(q) = aaqq
 886                                          * sqrt(max(Zero, One+t*apoaq*aapq));
 887                                   aapp *= sqrt(max(Zero, One-t*aqoap*aapq));
 888 
 889                                   apoaq = work(p) / work(q);
 890                                   aqoap = work(q) / work(p);
 891                                   if (work(p)>=One) {
 892 
 893                                      if (work(q)>=One) {
 894                                         fastr(3) =  t*apoaq;
 895                                         fastr(4) = -t*aqoap;
 896                                         work(p) *= cs;
 897                                         work(q) *= cs;
 898                                         blas::rotm(A(_,p), A(_,q), fastr);
 899                                         if (rhsVec) {
 900                                             blas::rotm(V(_,p), V(_,q), fastr);
 901                                         }
 902                                      } else {
 903                                         A(_,p) -= t*aqoap*A(_,q);
 904                                         A(_,q) += cs*sn*apoaq*A(_,p);
 905                                         if (rhsVec) {
 906                                            V(_,p) -= t*aqoap*V(_,q);
 907                                            V(_,q) += cs*sn*apoaq*V(_,p);
 908                                         }
 909                                         work(p) *= cs;
 910                                         work(q) /= cs;
 911                                      }
 912                                   } else {
 913                                      if (work(q)>=One) {
 914                                         A(_,q) += t*apoaq*A(_,p);
 915                                         A(_,p) -= cs*sn*aqoap*A(_,q);
 916                                         if (rhsVec) {
 917                                            V(_,q) += t*apoaq*V(_,p);
 918                                            V(_,p) -= cs*sn*aqoap*V(_,q);
 919                                         }
 920                                         work(p) /= cs;
 921                                         work(q) *= cs;
 922                                      } else {
 923                                         if (work(p)>=work(q)) {
 924                                            A(_,p) -= t*aqoap*A(_,q);
 925                                            A(_,q) += cs*sn*apoaq*A(_,p);
 926                                            work(p) *= cs;
 927                                            work(q) /= cs;
 928                                            if (rhsVec) {
 929                                               V(_,p) -= t*aqoap*V(_,q);
 930                                               V(_,q) += cs*sn*apoaq*V(_,p);
 931                                            }
 932                                         } else {
 933                                            A(_,q) += t*apoaq*A(_,p);
 934                                            A(_,p) -= cs*sn*aqoap*A(_,q);
 935                                            work(p) /= cs;
 936                                            work(q) *= cs;
 937                                            if (rhsVec) {
 938                                               V(_,q) += t*apoaq*V(_,p);
 939                                               V(_,p) -= cs*sn*aqoap*V(_,q);
 940                                            }
 941                                         }
 942                                      }
 943                                   }
 944                                }
 945 
 946                             } else {
 947                                auto _work = work(_(n+1,lWork));
 948 
 949                                if (aapp>aaqq) {
 950                                   _work = A(_,p);
 951                                   lascl(LASCL::FullMatrix, 00,
 952                                         aapp, One, _work);
 953                                   lascl(LASCL::FullMatrix, 00,
 954                                         aaqq, One, A(_,q));
 955                                   tmp = -aapq*work(p) / work(q);
 956                                   A(_,q) += tmp*_work;
 957                                   lascl(LASCL::FullMatrix, 00,
 958                                         One, aaqq, A(_,q));
 959                                   sva(q) = aaqq*sqrt(max(Zero, One-aapq*aapq));
 960                                   max_sinj = max(max_sinj, safeMin);
 961                                } else {
 962                                   _work = A(_,q);
 963                                   lascl(LASCL::FullMatrix, 00,
 964                                         aaqq, One, _work);
 965                                   lascl(LASCL::FullMatrix, 00,
 966                                         aapp, One, A(_,p));
 967                                   tmp = -aapq*work(q) / work(p);
 968                                   A(_,p) += tmp * _work;
 969                                   lascl(LASCL::FullMatrix, 00,
 970                                         One, aapp, A(_,p));
 971                                   sva(p) = aapp*sqrt(max(Zero, One-aapq*aapq));
 972                                   max_sinj = max(max_sinj, safeMin);
 973                                }
 974                             }
 975 //        END IF ROTOK THEN ... ELSE
 976 //
 977 //        In the case of cancellation in updating SVA(q)
 978 //        .. recompute SVA(q)
 979                             if (pow(sva(q)/aaqq,2)<=rootEps) {
 980                                if ((aaqq<rootBig) && (aaqq>rootSafeMin)) {
 981                                   sva(q) = blas::nrm2(A(_,q))*work(q);
 982                                } else {
 983                                   t = Zero;
 984                                   aaqq = One;
 985                                   lassq(A(_,q), t, aaqq);
 986                                   sva(q) = t*sqrt(aaqq)*work(q);
 987                                }
 988                             }
 989                             if (pow(aapp/aapp0,2)<=rootEps) {
 990                                if ((aapp<rootBig) && (aapp>rootSafeMin)) {
 991                                   aapp = blas::nrm2(A(_,p))*work(p);
 992                                } else {
 993                                   t = Zero;
 994                                   aapp = One;
 995                                   lassq(A(_,p), t, aapp);
 996                                   aapp = t*sqrt(aapp)*work(p);
 997                                }
 998                                sva(p) = aapp;
 999                             }
1000 //             end of OK rotation
1001                          } else {
1002                             ++notRot;
1003 //[RTD]      SKIPPED  = SKIPPED  + 1
1004                             ++pSkipped;
1005                             ++ijblsk;
1006                          }
1007                       } else {
1008                          ++notRot;
1009                          ++pSkipped;
1010                          ++ijblsk;
1011                       }
1012 
1013                       if ((i<=swBand) && (ijblsk>=blSkip)) {
1014                          sva(p) = aapp;
1015                          notRot = 0;
1016                          goto jbcLoopExit;
1017                       }
1018                       if ((i<=swBand) && (pSkipped>rowSkip)) {
1019                          aapp = -aapp;
1020                          notRot = 0;
1021                          break;
1022                       }
1023 
1024                    }
1025 //       end of the q-loop
1026 
1027                    sva(p) = aapp;
1028 
1029                 } else {
1030 
1031                    if (aapp==Zero) {
1032                       notRot += min(jgl+kbl-1,n) -jgl + 1;
1033                    }
1034                    if (aapp<Zero) {
1035                       notRot = 0;
1036                    }
1037 
1038                 }
1039 
1040              }
1041 //  end of the p-loop
1042           }
1043 //  end of the jbc-loop
1044        jbcLoopExit:
1045 //  bailed out of the jbc-loop
1046           for (p=igl; p<=min(igl+kbl-1,n); ++p) {
1047              sva(p) = abs(sva(p));
1048           }
1049 //**
1050        }
1051 //  end of the ibr-loop
1052 //
1053 //  .. update SVA(N)
1054        if ((sva(n)<rootBig) && (sva(n)>rootSafeMin)) {
1055           sva(n) = blas::nrm2(A(_,n))*work(n);
1056        } else {
1057           t = Zero;
1058           aapp = One;
1059           lassq(A(_,n), t, aapp);
1060           sva(n) = t*sqrt(aapp)*work(n);
1061        }
1062 //
1063 //  Additional steering devices
1064 //
1065        if ((i<swBand) && ((max_aapq<=rootTol) || (iswRot<=n))) {
1066           swBand = i;
1067        }
1068 
1069        if (i>swBand+1 && max_aapq<sqrt(ElementType(n))*tol
1070         && ElementType(n)*max_aapq*max_sinj<tol) {
1071            converged = true;
1072            break;
1073        }
1074 
1075        if (notRot>=emptsw) {
1076            converged = true;
1077            break;
1078        }
1079 
1080     }
1081 //  end i=1:NSWEEP loop
1082 //
1083     if (converged) {
1084 //#:) INFO = 0 confirms successful iterations.
1085         info = 0;
1086     } else {
1087 //#:( Reaching this point means that the procedure has not converged.
1088         info = nSweep - 1;
1089     }
1090 //
1091 //  Sort the singular values and find how many are above
1092 //  the underflow threshold.
1093 //
1094     IndexType n2 = 0;
1095     IndexType n4 = 0;
1096     for (IndexType p=1; p<=n-1; ++p) {
1097         const IndexType q = blas::iamax(sva(_(p,n))) + p - 1;
1098         if (p!=q) {
1099             swap(sva(p), sva(q));
1100             swap(work(p), work(q));
1101             blas::swap(A(_,p), A(_,q));
1102             if (rhsVec) {
1103                 blas::swap(V(_,p), V(_,q));
1104             }
1105         }
1106         if (sva(p)!=Zero) {
1107             ++n4;
1108             if (sva(p)*skl>safeMin) {
1109                 ++n2;
1110             }
1111         }
1112     }
1113     if (sva(n)!=Zero) {
1114         ++n4;
1115         if (sva(n)*skl>safeMin) {
1116             ++n2;
1117         }
1118     }
1119 //
1120 //  Normalize the left singular vectors.
1121 //
1122     if (lhsVec || controlU) {
1123         for (IndexType p=1; p<=n2; ++p) {
1124             A(_,p) *= work(p)/sva(p);
1125         }
1126     }
1127 //
1128 //  Scale the product of Jacobi rotations (assemble the fast rotations).
1129 //
1130     if (rhsVec) {
1131         if (applyV) {
1132             for (IndexType p=1; p<=n; ++p) {
1133                 V(_,p) *= work(p);
1134             }
1135         } else {
1136             for (IndexType p=1; p<=n; ++p) {
1137                 V(_,p) *= One / blas::nrm2(V(_,p));
1138             }
1139         }
1140     }
1141 //
1142 //  Undo scaling, if necessary (and possible).
1143     if (((skl>One) && (sva(1)<big/skl)) || ((skl<One) && (sva(n2)>safeMin/skl))) {
1144         sva *= skl;
1145         skl = One;
1146     }
1147 //
1148     work(1) = skl;
1149 //  The singular values of A are SKL*SVA(1:N). If SKL.NE.ONE
1150 //  then some of the singular values may overflow or underflow and
1151 //  the spectrum is given in this factored representation.
1152 //
1153     work(2) = ElementType(n4);
1154 //  N4 is the number of computed nonzero singular values of A.
1155 //
1156     work(3) = ElementType(n2);
1157 //  N2 is the number of singular values of A greater than SFMIN.
1158 //  If N2<N, SVA(N2:N) contains ZEROS and/or denormalized numbers
1159 //  that may carry some information.
1160 //
1161     work(4) = ElementType(i);
1162 //  i is the index of the last sweep before declaring convergence.
1163 //
1164     work(5) = max_aapq;
1165 //  MXAAPQ is the largest absolute value of scaled pivots in the
1166 //  last sweep
1167 //
1168     work(6) = max_sinj;
1169 //  MXSINJ is the largest absolute value of the sines of Jacobi angles
1170 //  in the last sweep
1171 //
1172     return info;
1173 }
1174 
1175 //== interface for native lapack ===============================================
1176 #ifdef CHECK_CXXLAPACK
1177 
1178 template <typename MA, typename VSVA, typename MV, typename VWORK>
1179 typename GeMatrix<MA>::IndexType
1180 svj_native(SVJ::TypeA                typeA,
1181            SVJ::JobU                 jobU,
1182            SVJ::JobV                 jobV,
1183            GeMatrix<MA>              &A,
1184            DenseVector<VSVA>         &sva,
1185            GeMatrix<MV>              &V,
1186            DenseVector<VWORK>        &work)
1187 {
1188     typedef typename GeMatrix<MA>::ElementType  T;
1189 
1190     const char       JOBA = char(typeA);
1191     const char       JOBU = char(jobU);
1192     const char       JOBV = char(jobV);
1193     const INTEGER    M = A.numRows();
1194     const INTEGER    N = A.numCols();
1195     const INTEGER    LDA = A.leadingDimension();
1196     const INTEGER    _MV = V.numRows();
1197     const INTEGER    LDV = V.leadingDimension();
1198     const INTEGER    LWORK = work.length();
1199     INTEGER          INFO;
1200 
1201     if (IsSame<T,DOUBLE>::value) {
1202         LAPACK_IMPL(dgesvj)(&JOBA,
1203                             &JOBU,
1204                             &JOBV,
1205                             &M,
1206                             &N,
1207                             A.data(),
1208                             &LDA,
1209                             sva.data(),
1210                             &_MV,
1211                             V.data(),
1212                             &LDV,
1213                             work.data(),
1214                             &LWORK,
1215                             &INFO);
1216     } else {
1217         ASSERT(0);
1218     }
1219     return INFO;
1220 }
1221 
1222 #endif // CHECK_CXXLAPACK
1223 
1224 //== public interface ==========================================================
1225 template <typename MA, typename VSVA, typename MV, typename VWORK>
1226 typename GeMatrix<MA>::IndexType
1227 svj_(SVJ::TypeA                typeA,
1228      SVJ::JobU                 jobU,
1229      SVJ::JobV                 jobV,
1230      GeMatrix<MA>              &A,
1231      DenseVector<VSVA>         &sva,
1232      GeMatrix<MV>              &V,
1233      DenseVector<VWORK>        &work)
1234 {
1235     using std::max;
1236     using std::min;
1237 //
1238 //  Test the input parameters
1239 //
1240 #   ifndef NDEBUG
1241     typedef typename GeMatrix<MA>::IndexType    IndexType;
1242 
1243     ASSERT(A.firstRow()==1);
1244     ASSERT(A.firstCol()==1);
1245 
1246     const IndexType m = A.numRows();
1247     const IndexType n = A.numCols();
1248 
1249     ASSERT(m>=n);
1250 
1251     ASSERT(sva.firstIndex()==1);
1252     ASSERT(sva.length()==n);
1253 
1254     ASSERT(V.firstRow()==1);
1255     ASSERT(V.firstCol()==1);
1256 
1257     if (jobV==SVJ::ComputeV) {
1258         ASSERT(V.numCols()==n);
1259         ASSERT(V.numRows()==n);
1260     }
1261     if (jobV==SVJ::ApplyV) {
1262         ASSERT(V.numCols()==n);
1263     }
1264 
1265     if (work.length()>0) {
1266         ASSERT(work.length()>=max(IndexType(6),m+n));
1267     }
1268 #   endif
1269 
1270 //
1271 //  Make copies of output arguments
1272 //
1273 #   ifdef CHECK_CXXLAPACK
1274     typename GeMatrix<MA>::NoView       A_org     = A;
1275     typename DenseVector<VSVA>::NoView  sva_org   = sva;
1276     typename GeMatrix<MV>::NoView       V_org     = V;
1277     typename DenseVector<VWORK>::NoView work_org  = work;
1278 #   endif
1279 
1280 //
1281 //  Call implementation
1282 //
1283     IndexType info = svj_generic(typeA, jobU, jobV, A, sva, V, work);
1284 
1285 #   ifdef CHECK_CXXLAPACK
1286 //
1287 //  Make copies of results computed by the generic implementation
1288 //
1289     typename GeMatrix<MA>::NoView       A_generic     = A;
1290     typename DenseVector<VSVA>::NoView  sva_generic   = sva;
1291     typename GeMatrix<MV>::NoView       V_generic     = V;
1292     typename DenseVector<VWORK>::NoView work_generic  = work;
1293 //
1294 //  restore output arguments
1295 //
1296     A    = A_org;
1297     sva  = sva_org;
1298     V    = V_org;
1299     work = work_org;
1300 //
1301 //  Compare generic results with results from the native implementation
1302 //
1303     IndexType _info = svj_native(typeA, jobU, jobV, A, sva, V, work);
1304 
1305     bool failed = false;
1306     if (! isIdentical(A_generic, A, "A_generic""A")) {
1307         std::cerr << "CXXLAPACK: A_generic = " << A_generic << std::endl;
1308         std::cerr << "F77LAPACK: A = " << A << std::endl;
1309         failed = true;
1310     }
1311     if (! isIdentical(sva_generic, sva, "sva_generic""sva")) {
1312         std::cerr << "CXXLAPACK: sva_generic = " << sva_generic << std::endl;
1313         std::cerr << "F77LAPACK: sva = " << sva << std::endl;
1314         failed = true;
1315     }
1316     if (! isIdentical(V_generic, V, "V_generic""V")) {
1317         std::cerr << "CXXLAPACK: V_generic = " << V_generic << std::endl;
1318         std::cerr << "F77LAPACK: V = " << V << std::endl;
1319         failed = true;
1320     }
1321     if (! isIdentical(work_generic, work, "work_generic""work")) {
1322         std::cerr << "CXXLAPACK: work_generic = " << work_generic << std::endl;
1323         std::cerr << "F77LAPACK: work = " << work << std::endl;
1324         failed = true;
1325     }
1326     if (! isIdentical(info, _info, "info""_info")) {
1327         std::cerr << "CXXLAPACK: info = " << info << std::endl;
1328         std::cerr << "F77LAPACK: _info = " << _info << std::endl;
1329         failed = true;
1330     }
1331 
1332     if (failed) {
1333         std::cerr << "error in: svj.tcc ("
1334                   << ", m = " << m
1335                   << ", n = " << n
1336                   << ", typeA = " << char(typeA)
1337                   << ", jobU = " << char(jobU)
1338                   << ", jobV = " << char(jobV)
1339                   << ", info = " << info
1340                   << ") " << std::endl;
1341         ASSERT(0);
1342     } else {
1343         /*
1344         std::cerr << "passed: svj.tcc ("
1345                   << ", m = " << m
1346                   << ", n = " << n
1347                   << ", typeA = " << char(typeA)
1348                   << ", jobU = " << char(jobU)
1349                   << ", jobV = " << char(jobV)
1350                   << ", info = " << info
1351                   << ") " << std::endl;
1352         */
1353     }
1354 #   endif
1355 
1356     return info;
1357 }
1358 
1359 template <typename MA, typename VSVA, typename MV, typename VWORK>
1360 typename GeMatrix<MA>::IndexType
1361 svj(SVJ::TypeA                typeA,
1362     SVJ::JobU                 jobU,
1363     SVJ::JobV                 jobV,
1364     GeMatrix<MA>              &A,
1365     DenseVector<VSVA>         &sva,
1366     GeMatrix<MV>              &V,
1367     DenseVector<VWORK>        &work)
1368 {
1369 #   ifdef CHECK_CXXLAPACK
1370     typename GeMatrix<MA>::NoView       A_org    = A;
1371     typename DenseVector<VSVA>::NoView  sva_org  = sva;
1372     typename GeMatrix<MV>::NoView       V_org    = V;
1373     typename DenseVector<VSVA>::NoView  work_org = work;
1374 
1375     svj_(SVJ::Lower, jobU, jobV, A, sva, V, work);
1376 
1377     A    = A_org;
1378     sva  = sva_org;
1379     V    = V_org;
1380     work = work_org;
1381 
1382     svj_(SVJ::Upper, jobU, jobV, A, sva, V, work);
1383 
1384     A    = A_org;
1385     sva  = sva_org;
1386     V    = V_org;
1387     work = work_org;
1388     svj_(SVJ::General, jobU, jobV, A, sva, V, work);
1389 
1390     A    = A_org;
1391     sva  = sva_org;
1392     V    = V_org;
1393     work = work_org;
1394 #   endif
1395 
1396     return svj_(typeA, jobU, jobV, A, sva, V, work);
1397 }
1398 
1399 
1400 template <typename MA, typename VSVA, typename MV, typename VWORK>
1401 typename GeMatrix<MA>::IndexType
1402 svj(SVJ::JobU                 jobU,
1403     SVJ::JobV                 jobV,
1404     GeMatrix<MA>              &A,
1405     DenseVector<VSVA>         &sva,
1406     GeMatrix<MV>              &V,
1407     DenseVector<VWORK>        &work)
1408 {
1409     return svj(SVJ::General, jobU, jobV, A, sva, V, work);
1410 }
1411 
1412 template <typename MA, typename VSVA, typename MV, typename VWORK>
1413 typename TrMatrix<MA>::IndexType
1414 svj(SVJ::JobU                 jobU,
1415     SVJ::JobV                 jobV,
1416     TrMatrix<MA>              &A,
1417     DenseVector<VSVA>         &sva,
1418     GeMatrix<MV>              &V,
1419     DenseVector<VWORK>        &work)
1420 {
1421     SVJ::TypeA  upLo = (A.upLo()==Upper) ? SVJ::Upper : SVJ::Lower;
1422 
1423     return svj(upLo, jobU, jobV, A.general(), sva, V, work);
1424 }
1425 
1426 //-- forwarding ----------------------------------------------------------------
1427 template <typename MA, typename VSVA, typename MV, typename VWORK>
1428 typename MA::IndexType
1429 svj(SVJ::TypeA                typeA,
1430     SVJ::JobU                 jobU,
1431     SVJ::JobV                 jobV,
1432     MA                        &&A,
1433     VSVA                      &&sva,
1434     MV                        &&V,
1435     VWORK                     &&work)
1436 {
1437     typename MA::IndexType info;
1438 
1439     CHECKPOINT_ENTER;
1440     info = svj(typeA, jobU, jobV, A, sva, V, work);
1441     CHECKPOINT_LEAVE;
1442 
1443     return info;
1444 }
1445 
1446 template <typename MA, typename VSVA, typename MV, typename VWORK>
1447 typename MA::IndexType
1448 svj(SVJ::JobU                 jobU,
1449     SVJ::JobV                 jobV,
1450     MA                        &&A,
1451     VSVA                      &&sva,
1452     MV                        &&V,
1453     VWORK                     &&work)
1454 {
1455     typename MA::IndexType info;
1456 
1457     CHECKPOINT_ENTER;
1458     info = svj(jobU, jobV, A, sva, V, work);
1459     CHECKPOINT_LEAVE;
1460 
1461     return info;
1462 }
1463 
1464 } } // namespace lapack, flens
1465 
1466 #endif // FLENS_LAPACK_SVD_SVJ_TCC