svj.tcc (flens/lapack/svd/svj.tcc)

   1 /*

   2  *   Copyright (c) 2012, Michael Lehn

   3  *

   4  *   All rights reserved.

   5  *

   6  *   Redistribution and use in source and binary forms, with or without

   7  *   modification, are permitted provided that the following conditions

   8  *   are met:

   9  *

  10  *   1) Redistributions of source code must retain the above copyright

  11  *      notice, this list of conditions and the following disclaimer.

  12  *   2) Redistributions in binary form must reproduce the above copyright

  13  *      notice, this list of conditions and the following disclaimer in

  14  *      the documentation and/or other materials provided with the

  15  *      distribution.

  16  *   3) Neither the name of the FLENS development group nor the names of

  17  *      its contributors may be used to endorse or promote products derived

  18  *      from this software without specific prior written permission.

  19  *

  20  *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS

  21  *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT

  22  *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR

  23  *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT

  24  *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,

  25  *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT

  26  *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,

  27  *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY

  28  *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT

  29  *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE

  30  *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

  31  */

  32 

  33 /* Based on

  34  *

  35        SUBROUTINE DGESVJ( JOBA, JOBU, JOBV, M, N, A, LDA, SVA, MV, V,

  36       $                   LDV, WORK, LWORK, INFO )

  37  *

  38  *  -- LAPACK routine (version 3.3.1)                                  --

  39  *

  40  *  -- Contributed by Zlatko Drmac of the University of Zagreb and     --

  41  *  -- Kresimir Veselic of the Fernuniversitaet Hagen                  --

  42  *  -- April 2011                                                      --

  43  *

  44  *  -- LAPACK is a software package provided by Univ. of Tennessee,    --

  45  *  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--

  46  *

  47  * This routine is also part of SIGMA (version 1.23, October 23. 2008.)

  48  * SIGMA is a library of algorithms for highly accurate algorithms for

  49  * computation of SVD, PSVD, QSVD, (H,K)-SVD, and for solution of the

  50  * eigenvalue problems Hx = lambda M x, H M x = lambda x with H, M > 0.

  51  *

  52  */

  53 

  54 #ifndef FLENS_LAPACK_SVD_SVJ_TCC

  55 #define FLENS_LAPACK_SVD_SVJ_TCC 1

  56 

  57 #include <flens/blas/blas.h>

  58 #include <flens/lapack/lapack.h>

  59 

  60 namespace flens { namespace lapack {

  61 

  62 //== generic lapack implementation =============================================

  63 

  64 template <typename MA, typename VSVA, typename MV, typename VWORK>

  65 typename GeMatrix<MA>::IndexType

  66 svj_generic(SVJ::TypeA                typeA,

  67             SVJ::JobU                 jobU,

  68             SVJ::JobV                 jobV,

  69             GeMatrix<MA>              &A,

  70             DenseVector<VSVA>         &sva,

  71             GeMatrix<MV>              &V,

  72             DenseVector<VWORK>        &work)

  73 {

  74     using std::abs;

  75     using std::max;

  76     using std::min;

  77     using std::sqrt;

  78     using std::swap;

  79 

  80     typedef typename GeMatrix<MA>::ElementType  ElementType;

  81     typedef typename GeMatrix<MA>::IndexType    IndexType;

  82 

  83     const ElementType  Zero(0), Half(0.5), One(1);

  84     const IndexType    nSweep = 30;

  85 

  86     const Underscore<IndexType>  _;

  87 

  88     ElementType fastr_data[5];

  89     DenseVectorView<ElementType>

  90         fastr  = typename DenseVectorView<ElementType>::Engine(5, fastr_data);

  91 

  92     const IndexType  m     = A.numRows();

  93     const IndexType  n     = A.numCols();

  94     const IndexType  mv    = V.numRows();

  95     const IndexType  lWork = work.length();

  96 

  97     const bool lower    = (typeA==SVJ::Lower);

  98     const bool upper    = (typeA==SVJ::Upper);

  99     const bool lhsVec   = (jobU==SVJ::ComputeU);

 100     const bool controlU = (jobU==SVJ::ControlU);

 101     const bool applyV   = (jobV==SVJ::ApplyV);

 102     const bool rhsVec   = (jobV==SVJ::ComputeV) || applyV;

 103 

 104     IndexType  info = 0;

 105 //

 106 //#:) Quick return for void matrix

 107 //

 108     if ((m==0) || (n==0)) {

 109         return info;

 110     }

 111 //

 112 //  Set numerical parameters

 113 //  The stopping criterion for Jacobi rotations is

 114 //

 115 //  max_{i<>j}|A(:,i)^T * A(:,j)|/(||A(:,i)||*||A(:,j)||) < CTOL*EPS

 116 //

 117 //  where EPS is the round-off and CTOL is defined as follows:

 118 //

 119     ElementType  cTol;

 120     if (controlU) {

 121 //      ... user controlled

 122         cTol = work(1);

 123     } else {

 124 //      ... default

 125         if (lhsVec || rhsVec) {

 126             cTol = sqrt(ElementType(m));

 127         } else {

 128             cTol = ElementType(m);

 129         }

 130     }

 131 //    ... and the machine dependent parameters are

 132 //[!]  (Make sure that DLAMCH() works properly on the target machine.)

 133 //

 134     const ElementType eps = lamch<ElementType>(Eps);

 135     const ElementType rootEps = sqrt(eps);

 136     const ElementType safeMin = lamch<ElementType>(SafeMin);

 137     const ElementType rootSafeMin = sqrt(safeMin);

 138     const ElementType small = safeMin / eps;

 139     const ElementType big = lamch<ElementType>(OverflowThreshold);

 140 //  const ElementType big = One / safeMin;

 141     const ElementType rootBig = One / rootSafeMin;

 142     const ElementType bigTheta = One / rootEps;

 143 

 144     const ElementType tol = cTol*eps;

 145     const ElementType rootTol = sqrt(tol);

 146 

 147     if (ElementType(m)*eps>=One) {

 148 //      we return -4 to keep it compatible with the LAPACK implementation

 149         return -4;

 150     }

 151 //

 152 //    Initialize the right singular vector matrix.

 153 //

 154     if (rhsVec && (!applyV)) {

 155         V         = Zero;

 156         V.diag(0) = One;

 157     }

 158 //

 159 //  Initialize SVA( 1:N ) = ( ||A e_i||_2, i = 1:N )

 160 //(!) If necessary, scale A to protect the largest singular value

 161 //  from overflow. It is possible that saving the largest singular

 162 //  value destroys the information about the small ones.

 163 //  This initial scaling is almost minimal in the sense that the

 164 //  goal is to make sure that no column norm overflows, and that

 165 //  DSQRT(N)*max_i SVA(i) does not overflow. If INFinite entries

 166 //  in A are detected, the procedure returns with INFO=-6.

 167 //

 168     ElementType skl = One / sqrt(ElementType(m)*ElementType(n));

 169     bool noScale = true;

 170     bool goScale = true;

 171 

 172     ElementType aapp, aaqq, tmp;

 173 

 174     if (lower) {

 175 //      the input matrix is M-by-N lower triangular (trapezoidal)

 176         for (IndexType p=1; p<=n; ++p) {

 177             aapp = Zero;

 178             aaqq = One;

 179             lassq(A(_(p,m),p), aapp, aaqq);

 180             if (aapp>big) {

 181                 return -6;

 182             }

 183             aaqq = sqrt(aaqq);

 184             if ((aapp<(big/aaqq)) && noScale) {

 185                 sva(p) = aapp*aaqq;

 186             } else {

 187                 noScale = false;

 188                 sva(p) = aapp*(aaqq*skl);

 189                 if (goScale) {

 190                     goScale = false;

 191                     sva(_(1,p-1)) *= skl;

 192                 }

 193             }

 194         }

 195     } else if (upper) {

 196 //      the input matrix is M-by-N upper triangular (trapezoidal)

 197         for (IndexType p=1; p<=n; ++p) {

 198             aapp = Zero;

 199             aaqq = One;

 200             lassq(A(_(1,p), p), aapp, aaqq);

 201             if (aapp>big) {

 202                 return -6;

 203             }

 204             aaqq = sqrt(aaqq);

 205             if ((aapp<(big/aaqq)) && noScale) {

 206                 sva(p) = aapp*aaqq;

 207             } else {

 208                 noScale = false;

 209                 sva(p) = aapp*(aaqq*skl);

 210                 if (goScale) {

 211                     goScale = false;

 212                     sva(_(1,p-1)) *= skl;

 213                 }

 214             }

 215         }

 216     } else {

 217 //      the input matrix is M-by-N general dense

 218         for (IndexType p=1; p<=n; ++p) {

 219             aapp = Zero;

 220             aaqq = One;

 221             lassq(A(_,p), aapp, aaqq);

 222             if (aapp>big) {

 223                 return -6;

 224             }

 225             aaqq = sqrt(aaqq);

 226             if ((aapp<(big/aaqq)) && noScale) {

 227                 sva(p) = aapp*aaqq;

 228             } else {

 229                 noScale = false;

 230                 sva(p) = aapp*(aaqq*skl);

 231                 if (goScale) {

 232                     goScale = false;

 233                     sva(_(1,p-1)) *= skl;

 234                 }

 235             }

 236         }

 237     }

 238 

 239     if (noScale) {

 240         skl = One;

 241     }

 242 //

 243 //  Move the smaller part of the spectrum from the underflow threshold

 244 //(!) Start by determining the position of the nonzero entries of the

 245 //  array SVA() relative to ( SFMIN, BIG ).

 246 //

 247     aapp = Zero;

 248     aaqq = big;

 249     for (IndexType p=1; p<=n; ++p) {

 250         if (sva(p)!=Zero) {

 251             aaqq = min(aaqq,sva(p));

 252         }

 253         aapp = max(aapp,sva(p));

 254     }

 255 //

 256 //#:) Quick return for zero matrix

 257 //

 258     if (aapp==Zero) {

 259         if (lhsVec) {

 260             A = Zero;

 261             A.diag(0) = One;

 262         }

 263         work(1)      = One;

 264         work(_(2,6)) = Zero;

 265         return info;

 266     }

 267 //

 268 //#:) Quick return for one-column matrix

 269 //

 270     if (n==1) {

 271         if (lhsVec) {

 272             lascl(LASCL::FullMatrix, 0, 0, sva(1), skl, A);

 273         }

 274         work(1) = One / skl;

 275         if (sva(1)>=safeMin) {

 276             work(2) = One;

 277         } else {

 278             work(2) = Zero;

 279         }

 280         work(_(3,6)) = Zero;

 281         return info;

 282     }

 283 //

 284 //  Protect small singular values from underflow, and try to

 285 //  avoid underflows/overflows in computing Jacobi rotations.

 286 //

 287     ElementType cs, sn;

 288 

 289     sn = sqrt(safeMin/eps);

 290     tmp = sqrt(big/ElementType(n));

 291     if ((aapp<=sn) || (aaqq>=tmp) || (sn<=aaqq && aapp<=tmp)) {

 292         tmp = min(big, tmp/aapp);

 293 //      aaqq  = aaqq*tmp

 294 //      aapp  = aapp*tmp

 295     } else if ((aaqq<=sn) && (aapp<=tmp)) {

 296         tmp = min(sn/aaqq, big/(aapp*sqrt(ElementType(n))));

 297 //      aaqq  = aaqq*tmp

 298 //      aapp  = aapp*tmp

 299     } else if ((aaqq>=sn) && (aapp>=tmp)) {

 300         tmp = max(sn/aaqq, tmp/aapp);

 301 //      aaqq  = aaqq*tmp

 302 //      aapp  = aapp*tmp

 303     } else if ((aaqq<=sn) && (aapp>=tmp)) {

 304         tmp = min(sn/aaqq, big/(sqrt(ElementType(n))*aapp));

 305 //      aaqq  = aaqq*tmp

 306 //      aapp  = aapp*tmp

 307     } else {

 308         tmp = One;

 309     }

 310 //

 311 //  Scale, if necessary

 312 //

 313     if (tmp!=One) {

 314         lascl(LASCL::FullMatrix, 0, 0, One, tmp, sva);

 315     }

 316     skl *= tmp;

 317     if (skl!=One) {

 318         if (upper) {

 319             lascl(LASCL::UpperTriangular, 0, 0, One, skl, A);

 320         } else if (lower) {

 321             lascl(LASCL::LowerTriangular, 0, 0, One, skl, A);

 322         } else {

 323             lascl(LASCL::FullMatrix, 0, 0, One, skl, A);

 324         }

 325         skl = One / skl;

 326     }

 327 //

 328 //  Row-cyclic Jacobi SVD algorithm with column pivoting

 329 //

 330     const IndexType emptsw = n*(n-1)/2;

 331     IndexType notRot = 0;

 332     fastr(1) = Zero;

 333 //

 334 //  A is represented in factored form A = A * diag(WORK), where diag(WORK)

 335 //  is initialized to identity. WORK is updated during fast scaled

 336 //  rotations.

 337 //

 338     work(_(1,n)) = One;

 339 //

 340 //

 341     IndexType swBand = 3;

 342 //[TP] SWBAND is a tuning parameter [TP]. It is meaningful and effective

 343 //  if DGESVJ is used as a computational routine in the preconditioned

 344 //  Jacobi SVD algorithm DGESVJ. For sweeps i=1:SWBAND the procedure

 345 //  works on pivots inside a band-like region around the diagonal.

 346 //  The boundaries are determined dynamically, based on the number of

 347 //  pivots above a threshold.

 348 //

 349     const IndexType kbl = min(IndexType(8),n);

 350 //[TP] KBL is a tuning parameter that defines the tile size in the

 351 //  tiling of the p-q loops of pivot pairs. In general, an optimal

 352 //  value of KBL depends on the matrix dimensions and on the

 353 //  parameters of the computer's memory.

 354 //

 355     IndexType nbl = n / kbl;

 356     if (nbl*kbl!=n) {

 357         ++nbl;

 358     }

 359 

 360     const IndexType blSkip = pow(kbl, 2);

 361 //[TP] BLKSKIP is a tuning parameter that depends on SWBAND and KBL.

 362 //

 363     const IndexType rowSkip = min(IndexType(5), kbl);

 364 //[TP] ROWSKIP is a tuning parameter.

 365 //

 366     const IndexType lkAhead = 1;

 367 //[TP] LKAHEAD is a tuning parameter.

 368 //

 369 //  Quasi block transformations, using the lower (upper) triangular

 370 //  structure of the input matrix. The quasi-block-cycling usually

 371 //  invokes cubic convergence. Big part of this cycle is done inside

 372 //  canonical subspaces of dimensions less than M.

 373 //

 374     if ((lower || upper) && (n>max(IndexType(64), 4*kbl))) {

 375 //[TP]  The number of partition levels and the actual partition are

 376 //      tuning parameters.

 377         const IndexType n4 = n / 4;

 378         const IndexType n2 = n / 2;

 379         const IndexType n34 = 3*n4;

 380         const IndexType q = (applyV) ? 0 : 1;

 381         const IndexType qm = (applyV) ? mv : 0;

 382 

 383         auto _work = work(_(n+1,lWork));

 384 

 385         if (lower) {

 386 //

 387 //          This works very well on lower triangular matrices, in particular

 388 //          in the framework of the preconditioned Jacobi SVD (xGEJSV).

 389 //          The idea is simple:

 390 //          [+ 0 0 0]   Note that Jacobi transformations of [0 0]

 391 //          [+ + 0 0]                                       [0 0]

 392 //          [+ + x 0]   actually work on [x 0]              [x 0]

 393 //          [+ + x x]                    [x x].             [x x]

 394 //

 395             auto A1 = A(_(    1,m),_(    1, n4));

 396             auto A2 = A(_( n4+1,m),_( n4+1, n2));

 397             auto A3 = A(_( n2+1,m),_( n2+1,n34));

 398             auto A4 = A(_(n34+1,m),_(n34+1,  n));

 399 

 400             auto A12 = A(_(   1,m),_(   1,n2));

 401             auto A34 = A(_(n2+1,m),_(n2+1, n));

 402 

 403             auto V1 = V(_(      1, n4*q+qm),_(    1,n4));

 404             auto V2 = V(_( n4*q+1, n2*q+qm),_( n4+1,n2));

 405             auto V3 = V(_( n2*q+1,n34*q+qm),_( n2+1,n34));

 406             auto V4 = V(_(n34*q+1, mv*q+qm),_(n34+1,n));

 407 

 408             auto V12 = V(_(      1,n2*q+qm),_(   1,n2));

 409             auto V34 = V(_( n2*q+1,mv*q+qm),_(n2+1,n));

 410 

 411             auto d1 = work(_(    1, n4));

 412             auto d2 = work(_( n4+1, n2));

 413             auto d3 = work(_( n2+1,n34));

 414             auto d4 = work(_(n34+1,  n));

 415 

 416             auto d12 = work(_(    1,n2));

 417             auto d34 = work(_( n2+1, n));

 418 

 419             auto sva1 = sva(_(    1, n4));

 420             auto sva2 = sva(_( n4+1, n2));

 421             auto sva3 = sva(_( n2+1,n34));

 422             auto sva4 = sva(_(n34+1,  n));

 423 

 424             auto sva12 = sva(_(   1,n2));

 425             auto sva34 = sva(_(n2+1, n));

 426 

 427             svj0(jobV, A4, d4, sva4, V4, eps, safeMin, tol, 2, _work);

 428             svj0(jobV, A3, d3, sva3, V3, eps, safeMin, tol, 2, _work);

 429             svj1(jobV, n4, A34, d34, sva34, V34, eps, safeMin, tol, 1, _work);

 430             svj0(jobV, A2, d2, sva2, V2, eps, safeMin, tol, 1, _work);

 431             svj0(jobV, A1, d1, sva1, V1, eps, safeMin, tol, 1, _work);

 432             svj1(jobV, n4, A12, d12, sva12, V12, eps, safeMin, tol, 1, _work);

 433 

 434         } else if (upper) {

 435 

 436             auto A1 = A(_(1,   n4),_(    1,   n4));

 437             auto A2 = A(_(1,   n2),_( n4+1,n4+n4));

 438             auto A3 = A(_(1,n2+n4),_( n2+1,n2+n4));

 439 

 440             auto A12 = A(_(1,n2),_(1,n2));

 441 

 442             auto V1 = V(_(      1,     n4*q+qm),_(    1,   n4));

 443             auto V2 = V(_( n4*q+1,(n4+n4)*q+qm),_( n4+1,n4+n4));

 444             auto V3 = V(_( n2*q+1,(n2+n4)*q+qm),_( n2+1,n2+n4));

 445 

 446             auto V12 = V(_(1,n2*q+qm),_(1,n2));

 447 

 448             auto d1 = work(_(    1,   n4));

 449             auto d2 = work(_( n4+1,n4+n4));

 450             auto d3 = work(_( n2+1,n2+n4));

 451 

 452             auto d12 = work(_(1,n2));

 453 

 454             auto sva1 = sva(_(    1,   n4));

 455             auto sva2 = sva(_( n4+1,n4+n4));

 456             auto sva3 = sva(_( n2+1,n2+n4));

 457 

 458             auto sva12 = sva(_(1,n2));

 459 

 460             svj0(jobV, A1, d1, sva1, V1, eps, safeMin, tol, 2, _work);

 461             svj0(jobV, A2, d2, sva2, V2, eps, safeMin, tol, 1, _work);

 462             svj1(jobV, n4, A12, d12, sva12, V12, eps, safeMin, tol, 1, _work);

 463             svj0(jobV, A3, d3, sva3, V3, eps, safeMin, tol, 1, _work);

 464 

 465         }

 466 

 467     }

 468 //

 469 //  .. Row-cyclic pivot strategy with de Rijk's pivoting ..

 470 //

 471     ElementType max_aapq, aapq, aapp0, aqoap, apoaq;

 472     ElementType max_sinj;

 473     ElementType theta, thetaSign, t;

 474 

 475     IndexType   i, iswRot, pSkipped, ibr, igl, jgl, ir1, p, q, jbc, ijblsk;

 476 

 477     bool        converged = false;

 478     bool        rotOk;

 479 

 480     for (i=1; i<=nSweep; ++i) {

 481 //

 482 //  .. go go go ...

 483 //

 484        max_aapq = Zero;

 485        max_sinj = Zero;

 486 

 487        iswRot   = 0;

 488        pSkipped = 0;

 489 

 490        notRot   = 0;

 491 //

 492 //     Each sweep is unrolled using KBL-by-KBL tiles over the pivot pairs

 493 //     1 <= p < q <= N. This is the first step toward a blocked implementation

 494 //     of the rotations. New implementation, based on block transformations,

 495 //     is under development.

 496 //

 497        for (ibr=1; ibr<=nbl; ++ibr) {

 498 

 499           igl = (ibr-1)*kbl + 1;

 500 

 501           for (ir1=0; ir1<=min(lkAhead,nbl-ibr); ++ir1) {

 502 

 503              igl += ir1*kbl;

 504 

 505              for (p=igl; p<=min(igl+kbl-1,n-1); ++p) {

 506 //

 507 //    .. de Rijk's pivoting

 508 //

 509                 q = blas::iamax(sva(_(p,n))) + p - 1;

 510                 if (p!=q) {

 511                     blas::swap(A(_,p),A(_,q));

 512                     if (rhsVec) {

 513                         blas::swap(V(_,p),V(_,q));

 514                     }

 515                     swap(sva(p),sva(q));

 516                     swap(work(p),work(q));

 517                 }

 518 

 519                 if (ir1==0) {

 520 //

 521 //     Column norms are periodically updated by explicit

 522 //     norm computation.

 523 //     Caveat:

 524 //     Unfortunately, some BLAS implementations compute DNRM2(M,A(1,p),1)

 525 //     as DSQRT(DDOT(M,A(1,p),1,A(1,p),1)), which may cause the result to

 526 //     overflow for ||A(:,p)||_2 > DSQRT(overflow_threshold), and to

 527 //     underflow for ||A(:,p)||_2 < DSQRT(underflow_threshold).

 528 //     Hence, DNRM2 cannot be trusted, not even in the case when

 529 //     the true norm is far from the under(over)flow boundaries.

 530 //     If properly implemented DNRM2 is available, the IF-THEN-ELSE

 531 //     below should read "AAPP = DNRM2( M, A(1,p), 1 ) * WORK(p)".

 532 //

 533 

 534                    if ((sva(p)<rootBig) && (sva(p)>rootSafeMin)) {

 535                       sva(p) = blas::nrm2(A(_,p)) * work(p);

 536                    } else {

 537                        tmp = Zero;

 538                        aapp = One;

 539                        lassq(A(_,p), tmp, aapp);

 540                        sva(p) = tmp * sqrt(aapp) * work(p);

 541                    }

 542                    aapp = sva(p);

 543                 } else {

 544                    aapp = sva(p);

 545                 }

 546 

 547                 if (aapp>Zero) {

 548 

 549                    pSkipped = 0;

 550 

 551                    for (q=p+1; q<=min(igl+kbl-1,n); ++q) {

 552 

 553                       aaqq = sva(q);

 554 

 555                       if (aaqq>Zero) {

 556 

 557                          aapp0 = aapp;

 558                          if (aaqq>=One) {

 559                             rotOk = (small*aapp<=aaqq);

 560                             if (aapp<big/aaqq) {

 561                                aapq = (A(_,p)*A(_,q)*work(p)*work(q)/aaqq)

 562                                     / aapp;

 563                             } else {

 564                                auto _work = work(_(n+1,n+m));

 565 

 566                                _work = A(_,p);

 567                                lascl(LASCL::FullMatrix, 0, 0,

 568                                      aapp, work(p), _work);

 569                                aapq = _work*A(_,q)*work(q)/aaqq;

 570                             }

 571                          } else {

 572                             rotOk = (aapp<=aaqq/small);

 573                             if (aapp>(small/aaqq)) {

 574                                 aapq = (A(_,p)*A(_,q)*work(p)*work(q)/aaqq)

 575                                      / aapp;

 576                             } else {

 577                                auto _work = work(_(n+1,n+m));

 578 

 579                                _work = A(_,q);

 580                                lascl(LASCL::FullMatrix, 0, 0,

 581                                      aaqq, work(q), _work);

 582                                aapq = _work*A(_,p)*work(p)/aapp;

 583                             }

 584                          }

 585 

 586                          max_aapq = max(max_aapq, abs(aapq));

 587 //

 588 //     TO rotate or NOT to rotate, THAT is the question ...

 589 //

 590                          if (abs(aapq)>tol) {

 591 //

 592 //        .. rotate

 593 //[RTD]    ROTATED = ROTATED + ONE

 594 //

 595                             if (ir1==0) {

 596                                notRot = 0;

 597                                pSkipped = 0;

 598                                ++iswRot;

 599                             }

 600 

 601                             if (rotOk) {

 602 

 603                                aqoap = aaqq / aapp;

 604                                apoaq = aapp / aaqq;

 605                                theta = -Half*abs(aqoap-apoaq)/aapq;

 606 

 607                                if (abs(theta)>bigTheta) {

 608 

 609                                   t = Half / theta;

 610                                   fastr(3) =  t*work(p) / work(q);

 611                                   fastr(4) = -t*work(q) / work(p);

 612                                   blas::rotm(A(_,p), A(_,q), fastr);

 613                                   if (rhsVec) {

 614                                       blas::rotm(V(_,p), V(_,q), fastr);

 615                                   }

 616                                   sva(q) =aaqq*sqrt(max(Zero,One+t*apoaq*aapq));

 617                                   aapp *= sqrt(max(Zero, One-t*aqoap*aapq));

 618                                   max_sinj = max(max_sinj, abs(t));

 619 

 620                                } else {

 621 //

 622 //              .. choose correct signum for THETA and rotate

 623 //

 624                                   thetaSign = -sign(One,aapq);

 625                                   t = One

 626                                     / (theta +thetaSign*sqrt(One+theta*theta));

 627                                   cs = sqrt(One / (One+t*t));

 628                                   sn = t*cs;

 629 

 630                                   max_sinj = max(max_sinj, abs(sn));

 631                                   sva(q) = aaqq

 632                                           *sqrt(max(Zero, One+t*apoaq*aapq));

 633                                   aapp *= sqrt(max(Zero, One-t*aqoap*aapq));

 634 

 635                                   apoaq = work(p) / work(q);

 636                                   aqoap = work(q) / work(p);

 637                                   if (work(p)>=One) {

 638                                      if (work(q)>=One) {

 639                                         fastr(3) =  t*apoaq;

 640                                         fastr(4) = -t*aqoap;

 641                                         work(p) *= cs;

 642                                         work(q) *= cs;

 643                                         blas::rotm(A(_,p), A(_,q), fastr);

 644                                         if (rhsVec) {

 645                                             blas::rotm(V(_,p), V(_,q), fastr);

 646                                         }

 647                                      } else {

 648                                         A(_,p) -= t*aqoap*A(_,q);

 649                                         A(_,q) += cs*sn*apoaq*A(_,p);

 650                                         work(p) *= cs;

 651                                         work(q) /= cs;

 652                                         if (rhsVec) {

 653                                            V(_,p) -= t*aqoap*V(_,q);

 654                                            V(_,q) += cs*sn*apoaq*V(_,p);

 655                                         }

 656                                      }

 657                                   } else {

 658                                      if (work(q)>=One) {

 659                                         A(_,q) += t*apoaq*A(_,p);

 660                                         A(_,p) -= cs*sn*aqoap*A(_,q);

 661                                         work(p) /= cs;

 662                                         work(q) *= cs;

 663                                         if (rhsVec) {

 664                                            V(_,q) += t*apoaq*V(_,p);

 665                                            V(_,p) -= cs*sn*aqoap*V(_,q);

 666                                         }

 667                                      } else {

 668                                         if (work(p)>=work(q)) {

 669                                            A(_,p) -= t*aqoap*A(_,q);

 670                                            A(_,q) += cs*sn*apoaq*A(_,p);

 671                                            work(p) *= cs;

 672                                            work(q) /= cs;

 673                                            if (rhsVec) {

 674                                               V(_,p) -= t*aqoap*V(_,q);

 675                                               V(_,q) += cs*sn*apoaq*V(_,p);

 676                                            }

 677                                         } else {

 678                                            A(_,q) += t*apoaq*A(_,p);

 679                                            A(_,p) -= cs*sn*aqoap*A(_,q);

 680                                            work(p) /= cs;

 681                                            work(q) *= cs;

 682                                            if (rhsVec) {

 683                                               V(_,q) += t*apoaq*V(_,p);

 684                                               V(_,p) -= cs*sn*aqoap*V(_,q);

 685                                            }

 686                                         }

 687                                      }

 688                                   }

 689                                }

 690 

 691                             } else {

 692 //             .. have to use modified Gram-Schmidt like transformation

 693                                auto _work = work(_(n+1,lWork));

 694 

 695                                _work = A(_,p);

 696                                lascl(LASCL::FullMatrix, 0, 0,

 697                                      aapp, One, _work);

 698                                lascl(LASCL::FullMatrix, 0, 0,

 699                                      aaqq, One, A(_,q));

 700                                tmp = -aapq*work(p)/work(q);

 701                                A(_,q) += tmp*_work;

 702                                lascl(LASCL::FullMatrix, 0, 0,

 703                                      One, aaqq, A(_,q));

 704                                sva(q) = aaqq*sqrt(max(Zero, One-aapq*aapq));

 705                                max_sinj = max(max_sinj, safeMin);

 706                             }

 707 //        END IF ROTOK THEN ... ELSE

 708 //

 709 //        In the case of cancellation in updating SVA(q), SVA(p)

 710 //        recompute SVA(q), SVA(p).

 711 //

 712                             if (pow(sva(q)/aaqq,2)<=rootEps) {

 713                                if ((aaqq<rootBig) && (aaqq>rootSafeMin)) {

 714                                   sva(q) = blas::nrm2(A(_,q))*work(q);

 715                                } else {

 716                                   t = Zero;

 717                                   aaqq = One;

 718                                   lassq(A(_,q), t, aaqq);

 719                                   sva(q) = t*sqrt(aaqq)*work(q);

 720                                }

 721                             }

 722                             if (aapp/aapp0<=rootEps) {

 723                                if ((aapp<rootBig) && (aapp>rootSafeMin)) {

 724                                   aapp = blas::nrm2(A(_,p))*work(p);

 725                                } else {

 726                                   t = Zero;

 727                                   aapp = One;

 728                                   lassq(A(_,p), t, aapp);

 729                                   aapp = t*sqrt(aapp)*work(p);

 730                                }

 731                                sva(p) = aapp;

 732                             }

 733 //

 734                          } else {

 735 //     A(:,p) and A(:,q) already numerically orthogonal

 736                             if (ir1==0) {

 737                                 ++notRot;

 738                             }

 739 //[RTD]    SKIPPED  = SKIPPED  + 1

 740                             ++pSkipped;

 741                          }

 742                       } else {

 743 //     A(:,q) is zero column

 744                          if (ir1==0) {

 745                              ++notRot;

 746                          }

 747                          ++pSkipped;

 748                       }

 749 //

 750                       if ((i<=swBand) && (pSkipped>rowSkip)) {

 751                          if (ir1==0) {

 752                              aapp = -aapp;

 753                          }

 754                          notRot = 0;

 755                          break;

 756                       }

 757 

 758                    }

 759 //  END q-LOOP

 760 //

 761                    sva(p) = aapp;

 762 

 763                 } else {

 764                    sva(p) = aapp;

 765                    if ((ir1==0) && (aapp==Zero)) {

 766                        notRot += min(igl+kbl-1,n) - p;

 767                    }

 768                 }

 769 

 770              }

 771 //  end of the p-loop

 772 //  end of doing the block ( ibr, ibr )

 773           }

 774 //  end of ir1-loop

 775 //

 776 //... go to the off diagonal blocks

 777 //

 778           igl = (ibr-1)*kbl + 1;

 779 

 780           for (jbc=ibr+1; jbc<=nbl; ++jbc) {

 781 

 782              jgl = (jbc-1)*kbl + 1;

 783 //

 784 //     doing the block at ( ibr, jbc )

 785 //

 786              ijblsk = 0;

 787              for (p=igl; p<=min(igl+kbl-1,n); ++p) {

 788 

 789                 aapp = sva(p);

 790                 if (aapp>Zero) {

 791 

 792                    pSkipped = 0;

 793 

 794                    for (q=jgl; q<=min(jgl+kbl-1,n); ++q) {

 795 

 796                       aaqq = sva(q);

 797                       if (aaqq>Zero) {

 798                          aapp0 = aapp;

 799 

 800 //

 801 //  .. M x 2 Jacobi SVD ..

 802 //

 803 //     Safe Gram matrix computation

 804 //

 805                          if (aaqq>=One) {

 806                             if (aapp>=aaqq) {

 807                                rotOk = (small*aapp)<=aaqq;

 808                             } else {

 809                                rotOk = (small*aaqq)<=aapp;

 810                             }

 811                             if (aapp<(big/aaqq)) {

 812                                aapq = (A(_,p)*A(_,q)*work(p)*work(q)/aaqq)

 813                                     / aapp;

 814                             } else {

 815                                auto _work = work(_(n+1,n+m));

 816 

 817                                _work = A(_,p);

 818                                lascl(LASCL::FullMatrix, 0, 0,

 819                                      aapp, work(p), _work);

 820                                aapq = _work*A(_,q)*work(q) / aaqq;

 821                             }

 822                          } else {

 823                             if (aapp>=aaqq) {

 824                                rotOk = aapp<=(aaqq/small);

 825                             } else {

 826                                rotOk = aaqq<=(aapp/small);

 827                             }

 828                             if (aapp>(small/aaqq)) {

 829                                aapq = (A(_,p)*A(_,q)*work(p)*work(q)/aaqq)

 830                                     / aapp;

 831                             } else {

 832                                auto _work = work(_(n+1,n+m));

 833 

 834                                _work = A(_,q);

 835                                lascl(LASCL::FullMatrix, 0, 0,

 836                                      aaqq, work(q), _work);

 837                                aapq = _work*A(_,p)*work(p)/aapp;

 838                             }

 839                          }

 840 

 841                          max_aapq = max(max_aapq, abs(aapq));

 842 //

 843 //     TO rotate or NOT to rotate, THAT is the question ...

 844 //

 845                          if (abs(aapq)>tol) {

 846                             notRot = 0;

 847 //[RTD]    ROTATED  = ROTATED + 1

 848                             pSkipped = 0;

 849                             ++iswRot;

 850 

 851                             if (rotOk) {

 852 

 853                                aqoap = aaqq / aapp;

 854                                apoaq = aapp / aaqq;

 855                                theta = -Half*abs(aqoap-apoaq)/aapq;

 856                                if (aaqq>aapp0) {

 857                                   theta = -theta;

 858                                }

 859 

 860                                if (abs(theta)>bigTheta) {

 861                                   t = Half/theta;

 862                                   fastr(3) =  t*work(p) / work(q);

 863                                   fastr(4) = -t*work(q) / work(p);

 864                                   blas::rotm(A(_,p), A(_,q), fastr);

 865                                   if (rhsVec) {

 866                                      blas::rotm(V(_,p), V(_,q), fastr);

 867                                   }

 868                                   sva(q) = aaqq

 869                                          * sqrt(max(Zero, One+t*apoaq*aapq));

 870                                   aapp *= sqrt(max(Zero,One-t*aqoap*aapq));

 871                                   max_sinj = max(max_sinj, abs(t));

 872                                } else {

 873 //

 874 //              .. choose correct signum for THETA and rotate

 875 //

 876                                   thetaSign = -sign(One,aapq);

 877                                   if (aaqq>aapp0) {

 878                                       thetaSign = -thetaSign;

 879                                   }

 880                                   t = One

 881                                     / (theta+thetaSign*sqrt(One+theta*theta));

 882                                   cs = sqrt(One / (One + t*t));

 883                                   sn = t*cs;

 884                                   max_sinj = max(max_sinj, abs(sn));

 885                                   sva(q) = aaqq

 886                                          * sqrt(max(Zero, One+t*apoaq*aapq));

 887                                   aapp *= sqrt(max(Zero, One-t*aqoap*aapq));

 888 

 889                                   apoaq = work(p) / work(q);

 890                                   aqoap = work(q) / work(p);

 891                                   if (work(p)>=One) {

 892 

 893                                      if (work(q)>=One) {

 894                                         fastr(3) =  t*apoaq;

 895                                         fastr(4) = -t*aqoap;

 896                                         work(p) *= cs;

 897                                         work(q) *= cs;

 898                                         blas::rotm(A(_,p), A(_,q), fastr);

 899                                         if (rhsVec) {

 900                                             blas::rotm(V(_,p), V(_,q), fastr);

 901                                         }

 902                                      } else {

 903                                         A(_,p) -= t*aqoap*A(_,q);

 904                                         A(_,q) += cs*sn*apoaq*A(_,p);

 905                                         if (rhsVec) {

 906                                            V(_,p) -= t*aqoap*V(_,q);

 907                                            V(_,q) += cs*sn*apoaq*V(_,p);

 908                                         }

 909                                         work(p) *= cs;

 910                                         work(q) /= cs;

 911                                      }

 912                                   } else {

 913                                      if (work(q)>=One) {

 914                                         A(_,q) += t*apoaq*A(_,p);

 915                                         A(_,p) -= cs*sn*aqoap*A(_,q);

 916                                         if (rhsVec) {

 917                                            V(_,q) += t*apoaq*V(_,p);

 918                                            V(_,p) -= cs*sn*aqoap*V(_,q);

 919                                         }

 920                                         work(p) /= cs;

 921                                         work(q) *= cs;

 922                                      } else {

 923                                         if (work(p)>=work(q)) {

 924                                            A(_,p) -= t*aqoap*A(_,q);

 925                                            A(_,q) += cs*sn*apoaq*A(_,p);

 926                                            work(p) *= cs;

 927                                            work(q) /= cs;

 928                                            if (rhsVec) {

 929                                               V(_,p) -= t*aqoap*V(_,q);

 930                                               V(_,q) += cs*sn*apoaq*V(_,p);

 931                                            }

 932                                         } else {

 933                                            A(_,q) += t*apoaq*A(_,p);

 934                                            A(_,p) -= cs*sn*aqoap*A(_,q);

 935                                            work(p) /= cs;

 936                                            work(q) *= cs;

 937                                            if (rhsVec) {

 938                                               V(_,q) += t*apoaq*V(_,p);

 939                                               V(_,p) -= cs*sn*aqoap*V(_,q);

 940                                            }

 941                                         }

 942                                      }

 943                                   }

 944                                }

 945 

 946                             } else {

 947                                auto _work = work(_(n+1,lWork));

 948 

 949                                if (aapp>aaqq) {

 950                                   _work = A(_,p);

 951                                   lascl(LASCL::FullMatrix, 0, 0,

 952                                         aapp, One, _work);

 953                                   lascl(LASCL::FullMatrix, 0, 0,

 954                                         aaqq, One, A(_,q));

 955                                   tmp = -aapq*work(p) / work(q);

 956                                   A(_,q) += tmp*_work;

 957                                   lascl(LASCL::FullMatrix, 0, 0,

 958                                         One, aaqq, A(_,q));

 959                                   sva(q) = aaqq*sqrt(max(Zero, One-aapq*aapq));

 960                                   max_sinj = max(max_sinj, safeMin);

 961                                } else {

 962                                   _work = A(_,q);

 963                                   lascl(LASCL::FullMatrix, 0, 0,

 964                                         aaqq, One, _work);

 965                                   lascl(LASCL::FullMatrix, 0, 0,

 966                                         aapp, One, A(_,p));

 967                                   tmp = -aapq*work(q) / work(p);

 968                                   A(_,p) += tmp * _work;

 969                                   lascl(LASCL::FullMatrix, 0, 0,

 970                                         One, aapp, A(_,p));

 971                                   sva(p) = aapp*sqrt(max(Zero, One-aapq*aapq));

 972                                   max_sinj = max(max_sinj, safeMin);

 973                                }

 974                             }

 975 //        END IF ROTOK THEN ... ELSE

 976 //

 977 //        In the case of cancellation in updating SVA(q)

 978 //        .. recompute SVA(q)

 979                             if (pow(sva(q)/aaqq,2)<=rootEps) {

 980                                if ((aaqq<rootBig) && (aaqq>rootSafeMin)) {

 981                                   sva(q) = blas::nrm2(A(_,q))*work(q);

 982                                } else {

 983                                   t = Zero;

 984                                   aaqq = One;

 985                                   lassq(A(_,q), t, aaqq);

 986                                   sva(q) = t*sqrt(aaqq)*work(q);

 987                                }

 988                             }

 989                             if (pow(aapp/aapp0,2)<=rootEps) {

 990                                if ((aapp<rootBig) && (aapp>rootSafeMin)) {

 991                                   aapp = blas::nrm2(A(_,p))*work(p);

 992                                } else {

 993                                   t = Zero;

 994                                   aapp = One;

 995                                   lassq(A(_,p), t, aapp);

 996                                   aapp = t*sqrt(aapp)*work(p);

 997                                }

 998                                sva(p) = aapp;

 999                             }

1000 //             end of OK rotation

1001                          } else {

1002                             ++notRot;

1003 //[RTD]      SKIPPED  = SKIPPED  + 1

1004                             ++pSkipped;

1005                             ++ijblsk;

1006                          }

1007                       } else {

1008                          ++notRot;

1009                          ++pSkipped;

1010                          ++ijblsk;

1011                       }

1012 

1013                       if ((i<=swBand) && (ijblsk>=blSkip)) {

1014                          sva(p) = aapp;

1015                          notRot = 0;

1016                          goto jbcLoopExit;

1017                       }

1018                       if ((i<=swBand) && (pSkipped>rowSkip)) {

1019                          aapp = -aapp;

1020                          notRot = 0;

1021                          break;

1022                       }

1023 

1024                    }

1025 //       end of the q-loop

1026 

1027                    sva(p) = aapp;

1028 

1029                 } else {

1030 

1031                    if (aapp==Zero) {

1032                       notRot += min(jgl+kbl-1,n) -jgl + 1;

1033                    }

1034                    if (aapp<Zero) {

1035                       notRot = 0;

1036                    }

1037 

1038                 }

1039 

1040              }

1041 //  end of the p-loop

1042           }

1043 //  end of the jbc-loop

1044        jbcLoopExit:

1045 //  bailed out of the jbc-loop

1046           for (p=igl; p<=min(igl+kbl-1,n); ++p) {

1047              sva(p) = abs(sva(p));

1048           }

1049 //**

1050        }

1051 //  end of the ibr-loop

1052 //

1053 //  .. update SVA(N)

1054        if ((sva(n)<rootBig) && (sva(n)>rootSafeMin)) {

1055           sva(n) = blas::nrm2(A(_,n))*work(n);

1056        } else {

1057           t = Zero;

1058           aapp = One;

1059           lassq(A(_,n), t, aapp);

1060           sva(n) = t*sqrt(aapp)*work(n);

1061        }

1062 //

1063 //  Additional steering devices

1064 //

1065        if ((i<swBand) && ((max_aapq<=rootTol) || (iswRot<=n))) {

1066           swBand = i;

1067        }

1068 

1069        if (i>swBand+1 && max_aapq<sqrt(ElementType(n))*tol

1070         && ElementType(n)*max_aapq*max_sinj<tol) {

1071            converged = true;

1072            break;

1073        }

1074 

1075        if (notRot>=emptsw) {

1076            converged = true;

1077            break;

1078        }

1079 

1080     }

1081 //  end i=1:NSWEEP loop

1082 //

1083     if (converged) {

1084 //#:) INFO = 0 confirms successful iterations.

1085         info = 0;

1086     } else {

1087 //#:( Reaching this point means that the procedure has not converged.

1088         info = nSweep - 1;

1089     }

1090 //

1091 //  Sort the singular values and find how many are above

1092 //  the underflow threshold.

1093 //

1094     IndexType n2 = 0;

1095     IndexType n4 = 0;

1096     for (IndexType p=1; p<=n-1; ++p) {

1097         const IndexType q = blas::iamax(sva(_(p,n))) + p - 1;

1098         if (p!=q) {

1099             swap(sva(p), sva(q));

1100             swap(work(p), work(q));

1101             blas::swap(A(_,p), A(_,q));

1102             if (rhsVec) {

1103                 blas::swap(V(_,p), V(_,q));

1104             }

1105         }

1106         if (sva(p)!=Zero) {

1107             ++n4;

1108             if (sva(p)*skl>safeMin) {

1109                 ++n2;

1110             }

1111         }

1112     }

1113     if (sva(n)!=Zero) {

1114         ++n4;

1115         if (sva(n)*skl>safeMin) {

1116             ++n2;

1117         }

1118     }

1119 //

1120 //  Normalize the left singular vectors.

1121 //

1122     if (lhsVec || controlU) {

1123         for (IndexType p=1; p<=n2; ++p) {

1124             A(_,p) *= work(p)/sva(p);

1125         }

1126     }

1127 //

1128 //  Scale the product of Jacobi rotations (assemble the fast rotations).

1129 //

1130     if (rhsVec) {

1131         if (applyV) {

1132             for (IndexType p=1; p<=n; ++p) {

1133                 V(_,p) *= work(p);

1134             }

1135         } else {

1136             for (IndexType p=1; p<=n; ++p) {

1137                 V(_,p) *= One / blas::nrm2(V(_,p));

1138             }

1139         }

1140     }

1141 //

1142 //  Undo scaling, if necessary (and possible).

1143     if (((skl>One) && (sva(1)<big/skl)) || ((skl<One) && (sva(n2)>safeMin/skl))) {

1144         sva *= skl;

1145         skl = One;

1146     }

1147 //

1148     work(1) = skl;

1149 //  The singular values of A are SKL*SVA(1:N). If SKL.NE.ONE

1150 //  then some of the singular values may overflow or underflow and

1151 //  the spectrum is given in this factored representation.

1152 //

1153     work(2) = ElementType(n4);

1154 //  N4 is the number of computed nonzero singular values of A.

1155 //

1156     work(3) = ElementType(n2);

1157 //  N2 is the number of singular values of A greater than SFMIN.

1158 //  If N2<N, SVA(N2:N) contains ZEROS and/or denormalized numbers

1159 //  that may carry some information.

1160 //

1161     work(4) = ElementType(i);

1162 //  i is the index of the last sweep before declaring convergence.

1163 //

1164     work(5) = max_aapq;

1165 //  MXAAPQ is the largest absolute value of scaled pivots in the

1166 //  last sweep

1167 //

1168     work(6) = max_sinj;

1169 //  MXSINJ is the largest absolute value of the sines of Jacobi angles

1170 //  in the last sweep

1171 //

1172     return info;

1173 }

1174 

1175 //== interface for native lapack ===============================================

1176 #ifdef CHECK_CXXLAPACK

1177 

1178 template <typename MA, typename VSVA, typename MV, typename VWORK>

1179 typename GeMatrix<MA>::IndexType

1180 svj_native(SVJ::TypeA                typeA,

1181            SVJ::JobU                 jobU,

1182            SVJ::JobV                 jobV,

1183            GeMatrix<MA>              &A,

1184            DenseVector<VSVA>         &sva,

1185            GeMatrix<MV>              &V,

1186            DenseVector<VWORK>        &work)

1187 {

1188     typedef typename GeMatrix<MA>::ElementType  T;

1189 

1190     const char       JOBA = char(typeA);

1191     const char       JOBU = char(jobU);

1192     const char       JOBV = char(jobV);

1193     const INTEGER    M = A.numRows();

1194     const INTEGER    N = A.numCols();

1195     const INTEGER    LDA = A.leadingDimension();

1196     const INTEGER    _MV = V.numRows();

1197     const INTEGER    LDV = V.leadingDimension();

1198     const INTEGER    LWORK = work.length();

1199     INTEGER          INFO;

1200 

1201     if (IsSame<T,DOUBLE>::value) {

1202         LAPACK_IMPL(dgesvj)(&JOBA,

1203                             &JOBU,

1204                             &JOBV,

1205                             &M,

1206                             &N,

1207                             A.data(),

1208                             &LDA,

1209                             sva.data(),

1210                             &_MV,

1211                             V.data(),

1212                             &LDV,

1213                             work.data(),

1214                             &LWORK,

1215                             &INFO);

1216     } else {

1217         ASSERT(0);

1218     }

1219     return INFO;

1220 }

1221 

1222 #endif // CHECK_CXXLAPACK

1223 

1224 //== public interface ==========================================================

1225 template <typename MA, typename VSVA, typename MV, typename VWORK>

1226 typename GeMatrix<MA>::IndexType

1227 svj_(SVJ::TypeA                typeA,

1228      SVJ::JobU                 jobU,

1229      SVJ::JobV                 jobV,

1230      GeMatrix<MA>              &A,

1231      DenseVector<VSVA>         &sva,

1232      GeMatrix<MV>              &V,

1233      DenseVector<VWORK>        &work)

1234 {

1235     using std::max;

1236     using std::min;

1237 //

1238 //  Test the input parameters

1239 //

1240 #   ifndef NDEBUG

1241     typedef typename GeMatrix<MA>::IndexType    IndexType;

1242 

1243     ASSERT(A.firstRow()==1);

1244     ASSERT(A.firstCol()==1);

1245 

1246     const IndexType m = A.numRows();

1247     const IndexType n = A.numCols();

1248 

1249     ASSERT(m>=n);

1250 

1251     ASSERT(sva.firstIndex()==1);

1252     ASSERT(sva.length()==n);

1253 

1254     ASSERT(V.firstRow()==1);

1255     ASSERT(V.firstCol()==1);

1256 

1257     if (jobV==SVJ::ComputeV) {

1258         ASSERT(V.numCols()==n);

1259         ASSERT(V.numRows()==n);

1260     }

1261     if (jobV==SVJ::ApplyV) {

1262         ASSERT(V.numCols()==n);

1263     }

1264 

1265     if (work.length()>0) {

1266         ASSERT(work.length()>=max(IndexType(6),m+n));

1267     }

1268 #   endif

1269 

1270 //

1271 //  Make copies of output arguments

1272 //

1273 #   ifdef CHECK_CXXLAPACK

1274     typename GeMatrix<MA>::NoView       A_org     = A;

1275     typename DenseVector<VSVA>::NoView  sva_org   = sva;

1276     typename GeMatrix<MV>::NoView       V_org     = V;

1277     typename DenseVector<VWORK>::NoView work_org  = work;

1278 #   endif

1279 

1280 //

1281 //  Call implementation

1282 //

1283     IndexType info = svj_generic(typeA, jobU, jobV, A, sva, V, work);

1284 

1285 #   ifdef CHECK_CXXLAPACK

1286 //

1287 //  Make copies of results computed by the generic implementation

1288 //

1289     typename GeMatrix<MA>::NoView       A_generic     = A;

1290     typename DenseVector<VSVA>::NoView  sva_generic   = sva;

1291     typename GeMatrix<MV>::NoView       V_generic     = V;

1292     typename DenseVector<VWORK>::NoView work_generic  = work;

1293 //

1294 //  restore output arguments

1295 //

1296     A    = A_org;

1297     sva  = sva_org;

1298     V    = V_org;

1299     work = work_org;

1300 //

1301 //  Compare generic results with results from the native implementation

1302 //

1303     IndexType _info = svj_native(typeA, jobU, jobV, A, sva, V, work);

1304 

1305     bool failed = false;

1306     if (! isIdentical(A_generic, A, "A_generic", "A")) {

1307         std::cerr << "CXXLAPACK: A_generic = " << A_generic << std::endl;

1308         std::cerr << "F77LAPACK: A = " << A << std::endl;

1309         failed = true;

1310     }

1311     if (! isIdentical(sva_generic, sva, "sva_generic", "sva")) {

1312         std::cerr << "CXXLAPACK: sva_generic = " << sva_generic << std::endl;

1313         std::cerr << "F77LAPACK: sva = " << sva << std::endl;

1314         failed = true;

1315     }

1316     if (! isIdentical(V_generic, V, "V_generic", "V")) {

1317         std::cerr << "CXXLAPACK: V_generic = " << V_generic << std::endl;

1318         std::cerr << "F77LAPACK: V = " << V << std::endl;

1319         failed = true;

1320     }

1321     if (! isIdentical(work_generic, work, "work_generic", "work")) {

1322         std::cerr << "CXXLAPACK: work_generic = " << work_generic << std::endl;

1323         std::cerr << "F77LAPACK: work = " << work << std::endl;

1324         failed = true;

1325     }

1326     if (! isIdentical(info, _info, "info", "_info")) {

1327         std::cerr << "CXXLAPACK: info = " << info << std::endl;

1328         std::cerr << "F77LAPACK: _info = " << _info << std::endl;

1329         failed = true;

1330     }

1331 

1332     if (failed) {

1333         std::cerr << "error in: svj.tcc ("

1334                   << ", m = " << m

1335                   << ", n = " << n

1336                   << ", typeA = " << char(typeA)

1337                   << ", jobU = " << char(jobU)

1338                   << ", jobV = " << char(jobV)

1339                   << ", info = " << info

1340                   << ") " << std::endl;

1341         ASSERT(0);

1342     } else {

1343         /*

1344         std::cerr << "passed: svj.tcc ("

1345                   << ", m = " << m

1346                   << ", n = " << n

1347                   << ", typeA = " << char(typeA)

1348                   << ", jobU = " << char(jobU)

1349                   << ", jobV = " << char(jobV)

1350                   << ", info = " << info

1351                   << ") " << std::endl;

1352         */

1353     }

1354 #   endif

1355 

1356     return info;

1357 }

1358 

1359 template <typename MA, typename VSVA, typename MV, typename VWORK>

1360 typename GeMatrix<MA>::IndexType

1361 svj(SVJ::TypeA                typeA,

1362     SVJ::JobU                 jobU,

1363     SVJ::JobV                 jobV,

1364     GeMatrix<MA>              &A,

1365     DenseVector<VSVA>         &sva,

1366     GeMatrix<MV>              &V,

1367     DenseVector<VWORK>        &work)

1368 {

1369 #   ifdef CHECK_CXXLAPACK

1370     typename GeMatrix<MA>::NoView       A_org    = A;

1371     typename DenseVector<VSVA>::NoView  sva_org  = sva;

1372     typename GeMatrix<MV>::NoView       V_org    = V;

1373     typename DenseVector<VSVA>::NoView  work_org = work;

1374 

1375     svj_(SVJ::Lower, jobU, jobV, A, sva, V, work);

1376 

1377     A    = A_org;

1378     sva  = sva_org;

1379     V    = V_org;

1380     work = work_org;

1381 

1382     svj_(SVJ::Upper, jobU, jobV, A, sva, V, work);

1383 

1384     A    = A_org;

1385     sva  = sva_org;

1386     V    = V_org;

1387     work = work_org;

1388     svj_(SVJ::General, jobU, jobV, A, sva, V, work);

1389 

1390     A    = A_org;

1391     sva  = sva_org;

1392     V    = V_org;

1393     work = work_org;

1394 #   endif

1395 

1396     return svj_(typeA, jobU, jobV, A, sva, V, work);

1397 }

1398 

1399 

1400 template <typename MA, typename VSVA, typename MV, typename VWORK>

1401 typename GeMatrix<MA>::IndexType

1402 svj(SVJ::JobU                 jobU,

1403     SVJ::JobV                 jobV,

1404     GeMatrix<MA>              &A,

1405     DenseVector<VSVA>         &sva,

1406     GeMatrix<MV>              &V,

1407     DenseVector<VWORK>        &work)

1408 {

1409     return svj(SVJ::General, jobU, jobV, A, sva, V, work);

1410 }

1411 

1412 template <typename MA, typename VSVA, typename MV, typename VWORK>

1413 typename TrMatrix<MA>::IndexType

1414 svj(SVJ::JobU                 jobU,

1415     SVJ::JobV                 jobV,

1416     TrMatrix<MA>              &A,

1417     DenseVector<VSVA>         &sva,

1418     GeMatrix<MV>              &V,

1419     DenseVector<VWORK>        &work)

1420 {

1421     SVJ::TypeA  upLo = (A.upLo()==Upper) ? SVJ::Upper : SVJ::Lower;

1422 

1423     return svj(upLo, jobU, jobV, A.general(), sva, V, work);

1424 }

1425 

1426 //-- forwarding ----------------------------------------------------------------

1427 template <typename MA, typename VSVA, typename MV, typename VWORK>

1428 typename MA::IndexType

1429 svj(SVJ::TypeA                typeA,

1430     SVJ::JobU                 jobU,

1431     SVJ::JobV                 jobV,

1432     MA                        &&A,

1433     VSVA                      &&sva,

1434     MV                        &&V,

1435     VWORK                     &&work)

1436 {

1437     typename MA::IndexType info;

1438 

1439     CHECKPOINT_ENTER;

1440     info = svj(typeA, jobU, jobV, A, sva, V, work);

1441     CHECKPOINT_LEAVE;

1442 

1443     return info;

1444 }

1445 

1446 template <typename MA, typename VSVA, typename MV, typename VWORK>

1447 typename MA::IndexType

1448 svj(SVJ::JobU                 jobU,

1449     SVJ::JobV                 jobV,

1450     MA                        &&A,

1451     VSVA                      &&sva,

1452     MV                        &&V,

1453     VWORK                     &&work)

1454 {

1455     typename MA::IndexType info;

1456 

1457     CHECKPOINT_ENTER;

1458     info = svj(jobU, jobV, A, sva, V, work);

1459     CHECKPOINT_LEAVE;

1460 

1461     return info;

1462 }

1463 

1464 } } // namespace lapack, flens

1465 

1466 #endif // FLENS_LAPACK_SVD_SVJ_TCC