1 /*
2 * Copyright (c) 2012, Michael Lehn
3 *
4 * All rights reserved.
5 *
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions
8 * are met:
9 *
10 * 1) Redistributions of source code must retain the above copyright
11 * notice, this list of conditions and the following disclaimer.
12 * 2) Redistributions in binary form must reproduce the above copyright
13 * notice, this list of conditions and the following disclaimer in
14 * the documentation and/or other materials provided with the
15 * distribution.
16 * 3) Neither the name of the FLENS development group nor the names of
17 * its contributors may be used to endorse or promote products derived
18 * from this software without specific prior written permission.
19 *
20 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
21 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
22 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
23 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
24 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
25 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
26 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
27 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
28 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
29 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
30 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
31 */
32
33 /* Based on
34 *
35 SUBROUTINE DGESVJ( JOBA, JOBU, JOBV, M, N, A, LDA, SVA, MV, V,
36 $ LDV, WORK, LWORK, INFO )
37 *
38 * -- LAPACK routine (version 3.3.1) --
39 *
40 * -- Contributed by Zlatko Drmac of the University of Zagreb and --
41 * -- Kresimir Veselic of the Fernuniversitaet Hagen --
42 * -- April 2011 --
43 *
44 * -- LAPACK is a software package provided by Univ. of Tennessee, --
45 * -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
46 *
47 * This routine is also part of SIGMA (version 1.23, October 23. 2008.)
48 * SIGMA is a library of algorithms for highly accurate algorithms for
49 * computation of SVD, PSVD, QSVD, (H,K)-SVD, and for solution of the
50 * eigenvalue problems Hx = lambda M x, H M x = lambda x with H, M > 0.
51 *
52 */
53
54 #ifndef FLENS_LAPACK_SVD_SVJ_TCC
55 #define FLENS_LAPACK_SVD_SVJ_TCC 1
56
57 #include <flens/blas/blas.h>
58 #include <flens/lapack/lapack.h>
59
60 namespace flens { namespace lapack {
61
62 //== generic lapack implementation =============================================
63
64 template <typename MA, typename VSVA, typename MV, typename VWORK>
65 typename GeMatrix<MA>::IndexType
66 svj_generic(SVJ::TypeA typeA,
67 SVJ::JobU jobU,
68 SVJ::JobV jobV,
69 GeMatrix<MA> &A,
70 DenseVector<VSVA> &sva,
71 GeMatrix<MV> &V,
72 DenseVector<VWORK> &work)
73 {
74 using std::abs;
75 using std::max;
76 using std::min;
77 using std::sqrt;
78 using std::swap;
79
80 typedef typename GeMatrix<MA>::ElementType ElementType;
81 typedef typename GeMatrix<MA>::IndexType IndexType;
82
83 const ElementType Zero(0), Half(0.5), One(1);
84 const IndexType nSweep = 30;
85
86 const Underscore<IndexType> _;
87
88 ElementType fastr_data[5];
89 DenseVectorView<ElementType>
90 fastr = typename DenseVectorView<ElementType>::Engine(5, fastr_data);
91
92 const IndexType m = A.numRows();
93 const IndexType n = A.numCols();
94 const IndexType mv = V.numRows();
95 const IndexType lWork = work.length();
96
97 const bool lower = (typeA==SVJ::Lower);
98 const bool upper = (typeA==SVJ::Upper);
99 const bool lhsVec = (jobU==SVJ::ComputeU);
100 const bool controlU = (jobU==SVJ::ControlU);
101 const bool applyV = (jobV==SVJ::ApplyV);
102 const bool rhsVec = (jobV==SVJ::ComputeV) || applyV;
103
104 IndexType info = 0;
105 //
106 //#:) Quick return for void matrix
107 //
108 if ((m==0) || (n==0)) {
109 return info;
110 }
111 //
112 // Set numerical parameters
113 // The stopping criterion for Jacobi rotations is
114 //
115 // max_{i<>j}|A(:,i)^T * A(:,j)|/(||A(:,i)||*||A(:,j)||) < CTOL*EPS
116 //
117 // where EPS is the round-off and CTOL is defined as follows:
118 //
119 ElementType cTol;
120 if (controlU) {
121 // ... user controlled
122 cTol = work(1);
123 } else {
124 // ... default
125 if (lhsVec || rhsVec) {
126 cTol = sqrt(ElementType(m));
127 } else {
128 cTol = ElementType(m);
129 }
130 }
131 // ... and the machine dependent parameters are
132 //[!] (Make sure that DLAMCH() works properly on the target machine.)
133 //
134 const ElementType eps = lamch<ElementType>(Eps);
135 const ElementType rootEps = sqrt(eps);
136 const ElementType safeMin = lamch<ElementType>(SafeMin);
137 const ElementType rootSafeMin = sqrt(safeMin);
138 const ElementType small = safeMin / eps;
139 const ElementType big = lamch<ElementType>(OverflowThreshold);
140 // const ElementType big = One / safeMin;
141 const ElementType rootBig = One / rootSafeMin;
142 const ElementType bigTheta = One / rootEps;
143
144 const ElementType tol = cTol*eps;
145 const ElementType rootTol = sqrt(tol);
146
147 if (ElementType(m)*eps>=One) {
148 // we return -4 to keep it compatible with the LAPACK implementation
149 return -4;
150 }
151 //
152 // Initialize the right singular vector matrix.
153 //
154 if (rhsVec && (!applyV)) {
155 V = Zero;
156 V.diag(0) = One;
157 }
158 //
159 // Initialize SVA( 1:N ) = ( ||A e_i||_2, i = 1:N )
160 //(!) If necessary, scale A to protect the largest singular value
161 // from overflow. It is possible that saving the largest singular
162 // value destroys the information about the small ones.
163 // This initial scaling is almost minimal in the sense that the
164 // goal is to make sure that no column norm overflows, and that
165 // DSQRT(N)*max_i SVA(i) does not overflow. If INFinite entries
166 // in A are detected, the procedure returns with INFO=-6.
167 //
168 ElementType skl = One / sqrt(ElementType(m)*ElementType(n));
169 bool noScale = true;
170 bool goScale = true;
171
172 ElementType aapp, aaqq, tmp;
173
174 if (lower) {
175 // the input matrix is M-by-N lower triangular (trapezoidal)
176 for (IndexType p=1; p<=n; ++p) {
177 aapp = Zero;
178 aaqq = One;
179 lassq(A(_(p,m),p), aapp, aaqq);
180 if (aapp>big) {
181 return -6;
182 }
183 aaqq = sqrt(aaqq);
184 if ((aapp<(big/aaqq)) && noScale) {
185 sva(p) = aapp*aaqq;
186 } else {
187 noScale = false;
188 sva(p) = aapp*(aaqq*skl);
189 if (goScale) {
190 goScale = false;
191 sva(_(1,p-1)) *= skl;
192 }
193 }
194 }
195 } else if (upper) {
196 // the input matrix is M-by-N upper triangular (trapezoidal)
197 for (IndexType p=1; p<=n; ++p) {
198 aapp = Zero;
199 aaqq = One;
200 lassq(A(_(1,p), p), aapp, aaqq);
201 if (aapp>big) {
202 return -6;
203 }
204 aaqq = sqrt(aaqq);
205 if ((aapp<(big/aaqq)) && noScale) {
206 sva(p) = aapp*aaqq;
207 } else {
208 noScale = false;
209 sva(p) = aapp*(aaqq*skl);
210 if (goScale) {
211 goScale = false;
212 sva(_(1,p-1)) *= skl;
213 }
214 }
215 }
216 } else {
217 // the input matrix is M-by-N general dense
218 for (IndexType p=1; p<=n; ++p) {
219 aapp = Zero;
220 aaqq = One;
221 lassq(A(_,p), aapp, aaqq);
222 if (aapp>big) {
223 return -6;
224 }
225 aaqq = sqrt(aaqq);
226 if ((aapp<(big/aaqq)) && noScale) {
227 sva(p) = aapp*aaqq;
228 } else {
229 noScale = false;
230 sva(p) = aapp*(aaqq*skl);
231 if (goScale) {
232 goScale = false;
233 sva(_(1,p-1)) *= skl;
234 }
235 }
236 }
237 }
238
239 if (noScale) {
240 skl = One;
241 }
242 //
243 // Move the smaller part of the spectrum from the underflow threshold
244 //(!) Start by determining the position of the nonzero entries of the
245 // array SVA() relative to ( SFMIN, BIG ).
246 //
247 aapp = Zero;
248 aaqq = big;
249 for (IndexType p=1; p<=n; ++p) {
250 if (sva(p)!=Zero) {
251 aaqq = min(aaqq,sva(p));
252 }
253 aapp = max(aapp,sva(p));
254 }
255 //
256 //#:) Quick return for zero matrix
257 //
258 if (aapp==Zero) {
259 if (lhsVec) {
260 A = Zero;
261 A.diag(0) = One;
262 }
263 work(1) = One;
264 work(_(2,6)) = Zero;
265 return info;
266 }
267 //
268 //#:) Quick return for one-column matrix
269 //
270 if (n==1) {
271 if (lhsVec) {
272 lascl(LASCL::FullMatrix, 0, 0, sva(1), skl, A);
273 }
274 work(1) = One / skl;
275 if (sva(1)>=safeMin) {
276 work(2) = One;
277 } else {
278 work(2) = Zero;
279 }
280 work(_(3,6)) = Zero;
281 return info;
282 }
283 //
284 // Protect small singular values from underflow, and try to
285 // avoid underflows/overflows in computing Jacobi rotations.
286 //
287 ElementType cs, sn;
288
289 sn = sqrt(safeMin/eps);
290 tmp = sqrt(big/ElementType(n));
291 if ((aapp<=sn) || (aaqq>=tmp) || (sn<=aaqq && aapp<=tmp)) {
292 tmp = min(big, tmp/aapp);
293 // aaqq = aaqq*tmp
294 // aapp = aapp*tmp
295 } else if ((aaqq<=sn) && (aapp<=tmp)) {
296 tmp = min(sn/aaqq, big/(aapp*sqrt(ElementType(n))));
297 // aaqq = aaqq*tmp
298 // aapp = aapp*tmp
299 } else if ((aaqq>=sn) && (aapp>=tmp)) {
300 tmp = max(sn/aaqq, tmp/aapp);
301 // aaqq = aaqq*tmp
302 // aapp = aapp*tmp
303 } else if ((aaqq<=sn) && (aapp>=tmp)) {
304 tmp = min(sn/aaqq, big/(sqrt(ElementType(n))*aapp));
305 // aaqq = aaqq*tmp
306 // aapp = aapp*tmp
307 } else {
308 tmp = One;
309 }
310 //
311 // Scale, if necessary
312 //
313 if (tmp!=One) {
314 lascl(LASCL::FullMatrix, 0, 0, One, tmp, sva);
315 }
316 skl *= tmp;
317 if (skl!=One) {
318 if (upper) {
319 lascl(LASCL::UpperTriangular, 0, 0, One, skl, A);
320 } else if (lower) {
321 lascl(LASCL::LowerTriangular, 0, 0, One, skl, A);
322 } else {
323 lascl(LASCL::FullMatrix, 0, 0, One, skl, A);
324 }
325 skl = One / skl;
326 }
327 //
328 // Row-cyclic Jacobi SVD algorithm with column pivoting
329 //
330 const IndexType emptsw = n*(n-1)/2;
331 IndexType notRot = 0;
332 fastr(1) = Zero;
333 //
334 // A is represented in factored form A = A * diag(WORK), where diag(WORK)
335 // is initialized to identity. WORK is updated during fast scaled
336 // rotations.
337 //
338 work(_(1,n)) = One;
339 //
340 //
341 IndexType swBand = 3;
342 //[TP] SWBAND is a tuning parameter [TP]. It is meaningful and effective
343 // if DGESVJ is used as a computational routine in the preconditioned
344 // Jacobi SVD algorithm DGESVJ. For sweeps i=1:SWBAND the procedure
345 // works on pivots inside a band-like region around the diagonal.
346 // The boundaries are determined dynamically, based on the number of
347 // pivots above a threshold.
348 //
349 const IndexType kbl = min(IndexType(8),n);
350 //[TP] KBL is a tuning parameter that defines the tile size in the
351 // tiling of the p-q loops of pivot pairs. In general, an optimal
352 // value of KBL depends on the matrix dimensions and on the
353 // parameters of the computer's memory.
354 //
355 IndexType nbl = n / kbl;
356 if (nbl*kbl!=n) {
357 ++nbl;
358 }
359
360 const IndexType blSkip = pow(kbl, 2);
361 //[TP] BLKSKIP is a tuning parameter that depends on SWBAND and KBL.
362 //
363 const IndexType rowSkip = min(IndexType(5), kbl);
364 //[TP] ROWSKIP is a tuning parameter.
365 //
366 const IndexType lkAhead = 1;
367 //[TP] LKAHEAD is a tuning parameter.
368 //
369 // Quasi block transformations, using the lower (upper) triangular
370 // structure of the input matrix. The quasi-block-cycling usually
371 // invokes cubic convergence. Big part of this cycle is done inside
372 // canonical subspaces of dimensions less than M.
373 //
374 if ((lower || upper) && (n>max(IndexType(64), 4*kbl))) {
375 //[TP] The number of partition levels and the actual partition are
376 // tuning parameters.
377 const IndexType n4 = n / 4;
378 const IndexType n2 = n / 2;
379 const IndexType n34 = 3*n4;
380 const IndexType q = (applyV) ? 0 : 1;
381 const IndexType qm = (applyV) ? mv : 0;
382
383 auto _work = work(_(n+1,lWork));
384
385 if (lower) {
386 //
387 // This works very well on lower triangular matrices, in particular
388 // in the framework of the preconditioned Jacobi SVD (xGEJSV).
389 // The idea is simple:
390 // [+ 0 0 0] Note that Jacobi transformations of [0 0]
391 // [+ + 0 0] [0 0]
392 // [+ + x 0] actually work on [x 0] [x 0]
393 // [+ + x x] [x x]. [x x]
394 //
395 auto A1 = A(_( 1,m),_( 1, n4));
396 auto A2 = A(_( n4+1,m),_( n4+1, n2));
397 auto A3 = A(_( n2+1,m),_( n2+1,n34));
398 auto A4 = A(_(n34+1,m),_(n34+1, n));
399
400 auto A12 = A(_( 1,m),_( 1,n2));
401 auto A34 = A(_(n2+1,m),_(n2+1, n));
402
403 auto V1 = V(_( 1, n4*q+qm),_( 1,n4));
404 auto V2 = V(_( n4*q+1, n2*q+qm),_( n4+1,n2));
405 auto V3 = V(_( n2*q+1,n34*q+qm),_( n2+1,n34));
406 auto V4 = V(_(n34*q+1, mv*q+qm),_(n34+1,n));
407
408 auto V12 = V(_( 1,n2*q+qm),_( 1,n2));
409 auto V34 = V(_( n2*q+1,mv*q+qm),_(n2+1,n));
410
411 auto d1 = work(_( 1, n4));
412 auto d2 = work(_( n4+1, n2));
413 auto d3 = work(_( n2+1,n34));
414 auto d4 = work(_(n34+1, n));
415
416 auto d12 = work(_( 1,n2));
417 auto d34 = work(_( n2+1, n));
418
419 auto sva1 = sva(_( 1, n4));
420 auto sva2 = sva(_( n4+1, n2));
421 auto sva3 = sva(_( n2+1,n34));
422 auto sva4 = sva(_(n34+1, n));
423
424 auto sva12 = sva(_( 1,n2));
425 auto sva34 = sva(_(n2+1, n));
426
427 svj0(jobV, A4, d4, sva4, V4, eps, safeMin, tol, 2, _work);
428 svj0(jobV, A3, d3, sva3, V3, eps, safeMin, tol, 2, _work);
429 svj1(jobV, n4, A34, d34, sva34, V34, eps, safeMin, tol, 1, _work);
430 svj0(jobV, A2, d2, sva2, V2, eps, safeMin, tol, 1, _work);
431 svj0(jobV, A1, d1, sva1, V1, eps, safeMin, tol, 1, _work);
432 svj1(jobV, n4, A12, d12, sva12, V12, eps, safeMin, tol, 1, _work);
433
434 } else if (upper) {
435
436 auto A1 = A(_(1, n4),_( 1, n4));
437 auto A2 = A(_(1, n2),_( n4+1,n4+n4));
438 auto A3 = A(_(1,n2+n4),_( n2+1,n2+n4));
439
440 auto A12 = A(_(1,n2),_(1,n2));
441
442 auto V1 = V(_( 1, n4*q+qm),_( 1, n4));
443 auto V2 = V(_( n4*q+1,(n4+n4)*q+qm),_( n4+1,n4+n4));
444 auto V3 = V(_( n2*q+1,(n2+n4)*q+qm),_( n2+1,n2+n4));
445
446 auto V12 = V(_(1,n2*q+qm),_(1,n2));
447
448 auto d1 = work(_( 1, n4));
449 auto d2 = work(_( n4+1,n4+n4));
450 auto d3 = work(_( n2+1,n2+n4));
451
452 auto d12 = work(_(1,n2));
453
454 auto sva1 = sva(_( 1, n4));
455 auto sva2 = sva(_( n4+1,n4+n4));
456 auto sva3 = sva(_( n2+1,n2+n4));
457
458 auto sva12 = sva(_(1,n2));
459
460 svj0(jobV, A1, d1, sva1, V1, eps, safeMin, tol, 2, _work);
461 svj0(jobV, A2, d2, sva2, V2, eps, safeMin, tol, 1, _work);
462 svj1(jobV, n4, A12, d12, sva12, V12, eps, safeMin, tol, 1, _work);
463 svj0(jobV, A3, d3, sva3, V3, eps, safeMin, tol, 1, _work);
464
465 }
466
467 }
468 //
469 // .. Row-cyclic pivot strategy with de Rijk's pivoting ..
470 //
471 ElementType max_aapq, aapq, aapp0, aqoap, apoaq;
472 ElementType max_sinj;
473 ElementType theta, thetaSign, t;
474
475 IndexType i, iswRot, pSkipped, ibr, igl, jgl, ir1, p, q, jbc, ijblsk;
476
477 bool converged = false;
478 bool rotOk;
479
480 for (i=1; i<=nSweep; ++i) {
481 //
482 // .. go go go ...
483 //
484 max_aapq = Zero;
485 max_sinj = Zero;
486
487 iswRot = 0;
488 pSkipped = 0;
489
490 notRot = 0;
491 //
492 // Each sweep is unrolled using KBL-by-KBL tiles over the pivot pairs
493 // 1 <= p < q <= N. This is the first step toward a blocked implementation
494 // of the rotations. New implementation, based on block transformations,
495 // is under development.
496 //
497 for (ibr=1; ibr<=nbl; ++ibr) {
498
499 igl = (ibr-1)*kbl + 1;
500
501 for (ir1=0; ir1<=min(lkAhead,nbl-ibr); ++ir1) {
502
503 igl += ir1*kbl;
504
505 for (p=igl; p<=min(igl+kbl-1,n-1); ++p) {
506 //
507 // .. de Rijk's pivoting
508 //
509 q = blas::iamax(sva(_(p,n))) + p - 1;
510 if (p!=q) {
511 blas::swap(A(_,p),A(_,q));
512 if (rhsVec) {
513 blas::swap(V(_,p),V(_,q));
514 }
515 swap(sva(p),sva(q));
516 swap(work(p),work(q));
517 }
518
519 if (ir1==0) {
520 //
521 // Column norms are periodically updated by explicit
522 // norm computation.
523 // Caveat:
524 // Unfortunately, some BLAS implementations compute DNRM2(M,A(1,p),1)
525 // as DSQRT(DDOT(M,A(1,p),1,A(1,p),1)), which may cause the result to
526 // overflow for ||A(:,p)||_2 > DSQRT(overflow_threshold), and to
527 // underflow for ||A(:,p)||_2 < DSQRT(underflow_threshold).
528 // Hence, DNRM2 cannot be trusted, not even in the case when
529 // the true norm is far from the under(over)flow boundaries.
530 // If properly implemented DNRM2 is available, the IF-THEN-ELSE
531 // below should read "AAPP = DNRM2( M, A(1,p), 1 ) * WORK(p)".
532 //
533
534 if ((sva(p)<rootBig) && (sva(p)>rootSafeMin)) {
535 sva(p) = blas::nrm2(A(_,p)) * work(p);
536 } else {
537 tmp = Zero;
538 aapp = One;
539 lassq(A(_,p), tmp, aapp);
540 sva(p) = tmp * sqrt(aapp) * work(p);
541 }
542 aapp = sva(p);
543 } else {
544 aapp = sva(p);
545 }
546
547 if (aapp>Zero) {
548
549 pSkipped = 0;
550
551 for (q=p+1; q<=min(igl+kbl-1,n); ++q) {
552
553 aaqq = sva(q);
554
555 if (aaqq>Zero) {
556
557 aapp0 = aapp;
558 if (aaqq>=One) {
559 rotOk = (small*aapp<=aaqq);
560 if (aapp<big/aaqq) {
561 aapq = (A(_,p)*A(_,q)*work(p)*work(q)/aaqq)
562 / aapp;
563 } else {
564 auto _work = work(_(n+1,n+m));
565
566 _work = A(_,p);
567 lascl(LASCL::FullMatrix, 0, 0,
568 aapp, work(p), _work);
569 aapq = _work*A(_,q)*work(q)/aaqq;
570 }
571 } else {
572 rotOk = (aapp<=aaqq/small);
573 if (aapp>(small/aaqq)) {
574 aapq = (A(_,p)*A(_,q)*work(p)*work(q)/aaqq)
575 / aapp;
576 } else {
577 auto _work = work(_(n+1,n+m));
578
579 _work = A(_,q);
580 lascl(LASCL::FullMatrix, 0, 0,
581 aaqq, work(q), _work);
582 aapq = _work*A(_,p)*work(p)/aapp;
583 }
584 }
585
586 max_aapq = max(max_aapq, abs(aapq));
587 //
588 // TO rotate or NOT to rotate, THAT is the question ...
589 //
590 if (abs(aapq)>tol) {
591 //
592 // .. rotate
593 //[RTD] ROTATED = ROTATED + ONE
594 //
595 if (ir1==0) {
596 notRot = 0;
597 pSkipped = 0;
598 ++iswRot;
599 }
600
601 if (rotOk) {
602
603 aqoap = aaqq / aapp;
604 apoaq = aapp / aaqq;
605 theta = -Half*abs(aqoap-apoaq)/aapq;
606
607 if (abs(theta)>bigTheta) {
608
609 t = Half / theta;
610 fastr(3) = t*work(p) / work(q);
611 fastr(4) = -t*work(q) / work(p);
612 blas::rotm(A(_,p), A(_,q), fastr);
613 if (rhsVec) {
614 blas::rotm(V(_,p), V(_,q), fastr);
615 }
616 sva(q) =aaqq*sqrt(max(Zero,One+t*apoaq*aapq));
617 aapp *= sqrt(max(Zero, One-t*aqoap*aapq));
618 max_sinj = max(max_sinj, abs(t));
619
620 } else {
621 //
622 // .. choose correct signum for THETA and rotate
623 //
624 thetaSign = -sign(One,aapq);
625 t = One
626 / (theta +thetaSign*sqrt(One+theta*theta));
627 cs = sqrt(One / (One+t*t));
628 sn = t*cs;
629
630 max_sinj = max(max_sinj, abs(sn));
631 sva(q) = aaqq
632 *sqrt(max(Zero, One+t*apoaq*aapq));
633 aapp *= sqrt(max(Zero, One-t*aqoap*aapq));
634
635 apoaq = work(p) / work(q);
636 aqoap = work(q) / work(p);
637 if (work(p)>=One) {
638 if (work(q)>=One) {
639 fastr(3) = t*apoaq;
640 fastr(4) = -t*aqoap;
641 work(p) *= cs;
642 work(q) *= cs;
643 blas::rotm(A(_,p), A(_,q), fastr);
644 if (rhsVec) {
645 blas::rotm(V(_,p), V(_,q), fastr);
646 }
647 } else {
648 A(_,p) -= t*aqoap*A(_,q);
649 A(_,q) += cs*sn*apoaq*A(_,p);
650 work(p) *= cs;
651 work(q) /= cs;
652 if (rhsVec) {
653 V(_,p) -= t*aqoap*V(_,q);
654 V(_,q) += cs*sn*apoaq*V(_,p);
655 }
656 }
657 } else {
658 if (work(q)>=One) {
659 A(_,q) += t*apoaq*A(_,p);
660 A(_,p) -= cs*sn*aqoap*A(_,q);
661 work(p) /= cs;
662 work(q) *= cs;
663 if (rhsVec) {
664 V(_,q) += t*apoaq*V(_,p);
665 V(_,p) -= cs*sn*aqoap*V(_,q);
666 }
667 } else {
668 if (work(p)>=work(q)) {
669 A(_,p) -= t*aqoap*A(_,q);
670 A(_,q) += cs*sn*apoaq*A(_,p);
671 work(p) *= cs;
672 work(q) /= cs;
673 if (rhsVec) {
674 V(_,p) -= t*aqoap*V(_,q);
675 V(_,q) += cs*sn*apoaq*V(_,p);
676 }
677 } else {
678 A(_,q) += t*apoaq*A(_,p);
679 A(_,p) -= cs*sn*aqoap*A(_,q);
680 work(p) /= cs;
681 work(q) *= cs;
682 if (rhsVec) {
683 V(_,q) += t*apoaq*V(_,p);
684 V(_,p) -= cs*sn*aqoap*V(_,q);
685 }
686 }
687 }
688 }
689 }
690
691 } else {
692 // .. have to use modified Gram-Schmidt like transformation
693 auto _work = work(_(n+1,lWork));
694
695 _work = A(_,p);
696 lascl(LASCL::FullMatrix, 0, 0,
697 aapp, One, _work);
698 lascl(LASCL::FullMatrix, 0, 0,
699 aaqq, One, A(_,q));
700 tmp = -aapq*work(p)/work(q);
701 A(_,q) += tmp*_work;
702 lascl(LASCL::FullMatrix, 0, 0,
703 One, aaqq, A(_,q));
704 sva(q) = aaqq*sqrt(max(Zero, One-aapq*aapq));
705 max_sinj = max(max_sinj, safeMin);
706 }
707 // END IF ROTOK THEN ... ELSE
708 //
709 // In the case of cancellation in updating SVA(q), SVA(p)
710 // recompute SVA(q), SVA(p).
711 //
712 if (pow(sva(q)/aaqq,2)<=rootEps) {
713 if ((aaqq<rootBig) && (aaqq>rootSafeMin)) {
714 sva(q) = blas::nrm2(A(_,q))*work(q);
715 } else {
716 t = Zero;
717 aaqq = One;
718 lassq(A(_,q), t, aaqq);
719 sva(q) = t*sqrt(aaqq)*work(q);
720 }
721 }
722 if (aapp/aapp0<=rootEps) {
723 if ((aapp<rootBig) && (aapp>rootSafeMin)) {
724 aapp = blas::nrm2(A(_,p))*work(p);
725 } else {
726 t = Zero;
727 aapp = One;
728 lassq(A(_,p), t, aapp);
729 aapp = t*sqrt(aapp)*work(p);
730 }
731 sva(p) = aapp;
732 }
733 //
734 } else {
735 // A(:,p) and A(:,q) already numerically orthogonal
736 if (ir1==0) {
737 ++notRot;
738 }
739 //[RTD] SKIPPED = SKIPPED + 1
740 ++pSkipped;
741 }
742 } else {
743 // A(:,q) is zero column
744 if (ir1==0) {
745 ++notRot;
746 }
747 ++pSkipped;
748 }
749 //
750 if ((i<=swBand) && (pSkipped>rowSkip)) {
751 if (ir1==0) {
752 aapp = -aapp;
753 }
754 notRot = 0;
755 break;
756 }
757
758 }
759 // END q-LOOP
760 //
761 sva(p) = aapp;
762
763 } else {
764 sva(p) = aapp;
765 if ((ir1==0) && (aapp==Zero)) {
766 notRot += min(igl+kbl-1,n) - p;
767 }
768 }
769
770 }
771 // end of the p-loop
772 // end of doing the block ( ibr, ibr )
773 }
774 // end of ir1-loop
775 //
776 //... go to the off diagonal blocks
777 //
778 igl = (ibr-1)*kbl + 1;
779
780 for (jbc=ibr+1; jbc<=nbl; ++jbc) {
781
782 jgl = (jbc-1)*kbl + 1;
783 //
784 // doing the block at ( ibr, jbc )
785 //
786 ijblsk = 0;
787 for (p=igl; p<=min(igl+kbl-1,n); ++p) {
788
789 aapp = sva(p);
790 if (aapp>Zero) {
791
792 pSkipped = 0;
793
794 for (q=jgl; q<=min(jgl+kbl-1,n); ++q) {
795
796 aaqq = sva(q);
797 if (aaqq>Zero) {
798 aapp0 = aapp;
799
800 //
801 // .. M x 2 Jacobi SVD ..
802 //
803 // Safe Gram matrix computation
804 //
805 if (aaqq>=One) {
806 if (aapp>=aaqq) {
807 rotOk = (small*aapp)<=aaqq;
808 } else {
809 rotOk = (small*aaqq)<=aapp;
810 }
811 if (aapp<(big/aaqq)) {
812 aapq = (A(_,p)*A(_,q)*work(p)*work(q)/aaqq)
813 / aapp;
814 } else {
815 auto _work = work(_(n+1,n+m));
816
817 _work = A(_,p);
818 lascl(LASCL::FullMatrix, 0, 0,
819 aapp, work(p), _work);
820 aapq = _work*A(_,q)*work(q) / aaqq;
821 }
822 } else {
823 if (aapp>=aaqq) {
824 rotOk = aapp<=(aaqq/small);
825 } else {
826 rotOk = aaqq<=(aapp/small);
827 }
828 if (aapp>(small/aaqq)) {
829 aapq = (A(_,p)*A(_,q)*work(p)*work(q)/aaqq)
830 / aapp;
831 } else {
832 auto _work = work(_(n+1,n+m));
833
834 _work = A(_,q);
835 lascl(LASCL::FullMatrix, 0, 0,
836 aaqq, work(q), _work);
837 aapq = _work*A(_,p)*work(p)/aapp;
838 }
839 }
840
841 max_aapq = max(max_aapq, abs(aapq));
842 //
843 // TO rotate or NOT to rotate, THAT is the question ...
844 //
845 if (abs(aapq)>tol) {
846 notRot = 0;
847 //[RTD] ROTATED = ROTATED + 1
848 pSkipped = 0;
849 ++iswRot;
850
851 if (rotOk) {
852
853 aqoap = aaqq / aapp;
854 apoaq = aapp / aaqq;
855 theta = -Half*abs(aqoap-apoaq)/aapq;
856 if (aaqq>aapp0) {
857 theta = -theta;
858 }
859
860 if (abs(theta)>bigTheta) {
861 t = Half/theta;
862 fastr(3) = t*work(p) / work(q);
863 fastr(4) = -t*work(q) / work(p);
864 blas::rotm(A(_,p), A(_,q), fastr);
865 if (rhsVec) {
866 blas::rotm(V(_,p), V(_,q), fastr);
867 }
868 sva(q) = aaqq
869 * sqrt(max(Zero, One+t*apoaq*aapq));
870 aapp *= sqrt(max(Zero,One-t*aqoap*aapq));
871 max_sinj = max(max_sinj, abs(t));
872 } else {
873 //
874 // .. choose correct signum for THETA and rotate
875 //
876 thetaSign = -sign(One,aapq);
877 if (aaqq>aapp0) {
878 thetaSign = -thetaSign;
879 }
880 t = One
881 / (theta+thetaSign*sqrt(One+theta*theta));
882 cs = sqrt(One / (One + t*t));
883 sn = t*cs;
884 max_sinj = max(max_sinj, abs(sn));
885 sva(q) = aaqq
886 * sqrt(max(Zero, One+t*apoaq*aapq));
887 aapp *= sqrt(max(Zero, One-t*aqoap*aapq));
888
889 apoaq = work(p) / work(q);
890 aqoap = work(q) / work(p);
891 if (work(p)>=One) {
892
893 if (work(q)>=One) {
894 fastr(3) = t*apoaq;
895 fastr(4) = -t*aqoap;
896 work(p) *= cs;
897 work(q) *= cs;
898 blas::rotm(A(_,p), A(_,q), fastr);
899 if (rhsVec) {
900 blas::rotm(V(_,p), V(_,q), fastr);
901 }
902 } else {
903 A(_,p) -= t*aqoap*A(_,q);
904 A(_,q) += cs*sn*apoaq*A(_,p);
905 if (rhsVec) {
906 V(_,p) -= t*aqoap*V(_,q);
907 V(_,q) += cs*sn*apoaq*V(_,p);
908 }
909 work(p) *= cs;
910 work(q) /= cs;
911 }
912 } else {
913 if (work(q)>=One) {
914 A(_,q) += t*apoaq*A(_,p);
915 A(_,p) -= cs*sn*aqoap*A(_,q);
916 if (rhsVec) {
917 V(_,q) += t*apoaq*V(_,p);
918 V(_,p) -= cs*sn*aqoap*V(_,q);
919 }
920 work(p) /= cs;
921 work(q) *= cs;
922 } else {
923 if (work(p)>=work(q)) {
924 A(_,p) -= t*aqoap*A(_,q);
925 A(_,q) += cs*sn*apoaq*A(_,p);
926 work(p) *= cs;
927 work(q) /= cs;
928 if (rhsVec) {
929 V(_,p) -= t*aqoap*V(_,q);
930 V(_,q) += cs*sn*apoaq*V(_,p);
931 }
932 } else {
933 A(_,q) += t*apoaq*A(_,p);
934 A(_,p) -= cs*sn*aqoap*A(_,q);
935 work(p) /= cs;
936 work(q) *= cs;
937 if (rhsVec) {
938 V(_,q) += t*apoaq*V(_,p);
939 V(_,p) -= cs*sn*aqoap*V(_,q);
940 }
941 }
942 }
943 }
944 }
945
946 } else {
947 auto _work = work(_(n+1,lWork));
948
949 if (aapp>aaqq) {
950 _work = A(_,p);
951 lascl(LASCL::FullMatrix, 0, 0,
952 aapp, One, _work);
953 lascl(LASCL::FullMatrix, 0, 0,
954 aaqq, One, A(_,q));
955 tmp = -aapq*work(p) / work(q);
956 A(_,q) += tmp*_work;
957 lascl(LASCL::FullMatrix, 0, 0,
958 One, aaqq, A(_,q));
959 sva(q) = aaqq*sqrt(max(Zero, One-aapq*aapq));
960 max_sinj = max(max_sinj, safeMin);
961 } else {
962 _work = A(_,q);
963 lascl(LASCL::FullMatrix, 0, 0,
964 aaqq, One, _work);
965 lascl(LASCL::FullMatrix, 0, 0,
966 aapp, One, A(_,p));
967 tmp = -aapq*work(q) / work(p);
968 A(_,p) += tmp * _work;
969 lascl(LASCL::FullMatrix, 0, 0,
970 One, aapp, A(_,p));
971 sva(p) = aapp*sqrt(max(Zero, One-aapq*aapq));
972 max_sinj = max(max_sinj, safeMin);
973 }
974 }
975 // END IF ROTOK THEN ... ELSE
976 //
977 // In the case of cancellation in updating SVA(q)
978 // .. recompute SVA(q)
979 if (pow(sva(q)/aaqq,2)<=rootEps) {
980 if ((aaqq<rootBig) && (aaqq>rootSafeMin)) {
981 sva(q) = blas::nrm2(A(_,q))*work(q);
982 } else {
983 t = Zero;
984 aaqq = One;
985 lassq(A(_,q), t, aaqq);
986 sva(q) = t*sqrt(aaqq)*work(q);
987 }
988 }
989 if (pow(aapp/aapp0,2)<=rootEps) {
990 if ((aapp<rootBig) && (aapp>rootSafeMin)) {
991 aapp = blas::nrm2(A(_,p))*work(p);
992 } else {
993 t = Zero;
994 aapp = One;
995 lassq(A(_,p), t, aapp);
996 aapp = t*sqrt(aapp)*work(p);
997 }
998 sva(p) = aapp;
999 }
1000 // end of OK rotation
1001 } else {
1002 ++notRot;
1003 //[RTD] SKIPPED = SKIPPED + 1
1004 ++pSkipped;
1005 ++ijblsk;
1006 }
1007 } else {
1008 ++notRot;
1009 ++pSkipped;
1010 ++ijblsk;
1011 }
1012
1013 if ((i<=swBand) && (ijblsk>=blSkip)) {
1014 sva(p) = aapp;
1015 notRot = 0;
1016 goto jbcLoopExit;
1017 }
1018 if ((i<=swBand) && (pSkipped>rowSkip)) {
1019 aapp = -aapp;
1020 notRot = 0;
1021 break;
1022 }
1023
1024 }
1025 // end of the q-loop
1026
1027 sva(p) = aapp;
1028
1029 } else {
1030
1031 if (aapp==Zero) {
1032 notRot += min(jgl+kbl-1,n) -jgl + 1;
1033 }
1034 if (aapp<Zero) {
1035 notRot = 0;
1036 }
1037
1038 }
1039
1040 }
1041 // end of the p-loop
1042 }
1043 // end of the jbc-loop
1044 jbcLoopExit:
1045 // bailed out of the jbc-loop
1046 for (p=igl; p<=min(igl+kbl-1,n); ++p) {
1047 sva(p) = abs(sva(p));
1048 }
1049 //**
1050 }
1051 // end of the ibr-loop
1052 //
1053 // .. update SVA(N)
1054 if ((sva(n)<rootBig) && (sva(n)>rootSafeMin)) {
1055 sva(n) = blas::nrm2(A(_,n))*work(n);
1056 } else {
1057 t = Zero;
1058 aapp = One;
1059 lassq(A(_,n), t, aapp);
1060 sva(n) = t*sqrt(aapp)*work(n);
1061 }
1062 //
1063 // Additional steering devices
1064 //
1065 if ((i<swBand) && ((max_aapq<=rootTol) || (iswRot<=n))) {
1066 swBand = i;
1067 }
1068
1069 if (i>swBand+1 && max_aapq<sqrt(ElementType(n))*tol
1070 && ElementType(n)*max_aapq*max_sinj<tol) {
1071 converged = true;
1072 break;
1073 }
1074
1075 if (notRot>=emptsw) {
1076 converged = true;
1077 break;
1078 }
1079
1080 }
1081 // end i=1:NSWEEP loop
1082 //
1083 if (converged) {
1084 //#:) INFO = 0 confirms successful iterations.
1085 info = 0;
1086 } else {
1087 //#:( Reaching this point means that the procedure has not converged.
1088 info = nSweep - 1;
1089 }
1090 //
1091 // Sort the singular values and find how many are above
1092 // the underflow threshold.
1093 //
1094 IndexType n2 = 0;
1095 IndexType n4 = 0;
1096 for (IndexType p=1; p<=n-1; ++p) {
1097 const IndexType q = blas::iamax(sva(_(p,n))) + p - 1;
1098 if (p!=q) {
1099 swap(sva(p), sva(q));
1100 swap(work(p), work(q));
1101 blas::swap(A(_,p), A(_,q));
1102 if (rhsVec) {
1103 blas::swap(V(_,p), V(_,q));
1104 }
1105 }
1106 if (sva(p)!=Zero) {
1107 ++n4;
1108 if (sva(p)*skl>safeMin) {
1109 ++n2;
1110 }
1111 }
1112 }
1113 if (sva(n)!=Zero) {
1114 ++n4;
1115 if (sva(n)*skl>safeMin) {
1116 ++n2;
1117 }
1118 }
1119 //
1120 // Normalize the left singular vectors.
1121 //
1122 if (lhsVec || controlU) {
1123 for (IndexType p=1; p<=n2; ++p) {
1124 A(_,p) *= work(p)/sva(p);
1125 }
1126 }
1127 //
1128 // Scale the product of Jacobi rotations (assemble the fast rotations).
1129 //
1130 if (rhsVec) {
1131 if (applyV) {
1132 for (IndexType p=1; p<=n; ++p) {
1133 V(_,p) *= work(p);
1134 }
1135 } else {
1136 for (IndexType p=1; p<=n; ++p) {
1137 V(_,p) *= One / blas::nrm2(V(_,p));
1138 }
1139 }
1140 }
1141 //
1142 // Undo scaling, if necessary (and possible).
1143 if (((skl>One) && (sva(1)<big/skl)) || ((skl<One) && (sva(n2)>safeMin/skl))) {
1144 sva *= skl;
1145 skl = One;
1146 }
1147 //
1148 work(1) = skl;
1149 // The singular values of A are SKL*SVA(1:N). If SKL.NE.ONE
1150 // then some of the singular values may overflow or underflow and
1151 // the spectrum is given in this factored representation.
1152 //
1153 work(2) = ElementType(n4);
1154 // N4 is the number of computed nonzero singular values of A.
1155 //
1156 work(3) = ElementType(n2);
1157 // N2 is the number of singular values of A greater than SFMIN.
1158 // If N2<N, SVA(N2:N) contains ZEROS and/or denormalized numbers
1159 // that may carry some information.
1160 //
1161 work(4) = ElementType(i);
1162 // i is the index of the last sweep before declaring convergence.
1163 //
1164 work(5) = max_aapq;
1165 // MXAAPQ is the largest absolute value of scaled pivots in the
1166 // last sweep
1167 //
1168 work(6) = max_sinj;
1169 // MXSINJ is the largest absolute value of the sines of Jacobi angles
1170 // in the last sweep
1171 //
1172 return info;
1173 }
1174
1175 //== interface for native lapack ===============================================
1176 #ifdef CHECK_CXXLAPACK
1177
1178 template <typename MA, typename VSVA, typename MV, typename VWORK>
1179 typename GeMatrix<MA>::IndexType
1180 svj_native(SVJ::TypeA typeA,
1181 SVJ::JobU jobU,
1182 SVJ::JobV jobV,
1183 GeMatrix<MA> &A,
1184 DenseVector<VSVA> &sva,
1185 GeMatrix<MV> &V,
1186 DenseVector<VWORK> &work)
1187 {
1188 typedef typename GeMatrix<MA>::ElementType T;
1189
1190 const char JOBA = char(typeA);
1191 const char JOBU = char(jobU);
1192 const char JOBV = char(jobV);
1193 const INTEGER M = A.numRows();
1194 const INTEGER N = A.numCols();
1195 const INTEGER LDA = A.leadingDimension();
1196 const INTEGER _MV = V.numRows();
1197 const INTEGER LDV = V.leadingDimension();
1198 const INTEGER LWORK = work.length();
1199 INTEGER INFO;
1200
1201 if (IsSame<T,DOUBLE>::value) {
1202 LAPACK_IMPL(dgesvj)(&JOBA,
1203 &JOBU,
1204 &JOBV,
1205 &M,
1206 &N,
1207 A.data(),
1208 &LDA,
1209 sva.data(),
1210 &_MV,
1211 V.data(),
1212 &LDV,
1213 work.data(),
1214 &LWORK,
1215 &INFO);
1216 } else {
1217 ASSERT(0);
1218 }
1219 return INFO;
1220 }
1221
1222 #endif // CHECK_CXXLAPACK
1223
1224 //== public interface ==========================================================
1225 template <typename MA, typename VSVA, typename MV, typename VWORK>
1226 typename GeMatrix<MA>::IndexType
1227 svj_(SVJ::TypeA typeA,
1228 SVJ::JobU jobU,
1229 SVJ::JobV jobV,
1230 GeMatrix<MA> &A,
1231 DenseVector<VSVA> &sva,
1232 GeMatrix<MV> &V,
1233 DenseVector<VWORK> &work)
1234 {
1235 using std::max;
1236 using std::min;
1237 //
1238 // Test the input parameters
1239 //
1240 # ifndef NDEBUG
1241 typedef typename GeMatrix<MA>::IndexType IndexType;
1242
1243 ASSERT(A.firstRow()==1);
1244 ASSERT(A.firstCol()==1);
1245
1246 const IndexType m = A.numRows();
1247 const IndexType n = A.numCols();
1248
1249 ASSERT(m>=n);
1250
1251 ASSERT(sva.firstIndex()==1);
1252 ASSERT(sva.length()==n);
1253
1254 ASSERT(V.firstRow()==1);
1255 ASSERT(V.firstCol()==1);
1256
1257 if (jobV==SVJ::ComputeV) {
1258 ASSERT(V.numCols()==n);
1259 ASSERT(V.numRows()==n);
1260 }
1261 if (jobV==SVJ::ApplyV) {
1262 ASSERT(V.numCols()==n);
1263 }
1264
1265 if (work.length()>0) {
1266 ASSERT(work.length()>=max(IndexType(6),m+n));
1267 }
1268 # endif
1269
1270 //
1271 // Make copies of output arguments
1272 //
1273 # ifdef CHECK_CXXLAPACK
1274 typename GeMatrix<MA>::NoView A_org = A;
1275 typename DenseVector<VSVA>::NoView sva_org = sva;
1276 typename GeMatrix<MV>::NoView V_org = V;
1277 typename DenseVector<VWORK>::NoView work_org = work;
1278 # endif
1279
1280 //
1281 // Call implementation
1282 //
1283 IndexType info = svj_generic(typeA, jobU, jobV, A, sva, V, work);
1284
1285 # ifdef CHECK_CXXLAPACK
1286 //
1287 // Make copies of results computed by the generic implementation
1288 //
1289 typename GeMatrix<MA>::NoView A_generic = A;
1290 typename DenseVector<VSVA>::NoView sva_generic = sva;
1291 typename GeMatrix<MV>::NoView V_generic = V;
1292 typename DenseVector<VWORK>::NoView work_generic = work;
1293 //
1294 // restore output arguments
1295 //
1296 A = A_org;
1297 sva = sva_org;
1298 V = V_org;
1299 work = work_org;
1300 //
1301 // Compare generic results with results from the native implementation
1302 //
1303 IndexType _info = svj_native(typeA, jobU, jobV, A, sva, V, work);
1304
1305 bool failed = false;
1306 if (! isIdentical(A_generic, A, "A_generic", "A")) {
1307 std::cerr << "CXXLAPACK: A_generic = " << A_generic << std::endl;
1308 std::cerr << "F77LAPACK: A = " << A << std::endl;
1309 failed = true;
1310 }
1311 if (! isIdentical(sva_generic, sva, "sva_generic", "sva")) {
1312 std::cerr << "CXXLAPACK: sva_generic = " << sva_generic << std::endl;
1313 std::cerr << "F77LAPACK: sva = " << sva << std::endl;
1314 failed = true;
1315 }
1316 if (! isIdentical(V_generic, V, "V_generic", "V")) {
1317 std::cerr << "CXXLAPACK: V_generic = " << V_generic << std::endl;
1318 std::cerr << "F77LAPACK: V = " << V << std::endl;
1319 failed = true;
1320 }
1321 if (! isIdentical(work_generic, work, "work_generic", "work")) {
1322 std::cerr << "CXXLAPACK: work_generic = " << work_generic << std::endl;
1323 std::cerr << "F77LAPACK: work = " << work << std::endl;
1324 failed = true;
1325 }
1326 if (! isIdentical(info, _info, "info", "_info")) {
1327 std::cerr << "CXXLAPACK: info = " << info << std::endl;
1328 std::cerr << "F77LAPACK: _info = " << _info << std::endl;
1329 failed = true;
1330 }
1331
1332 if (failed) {
1333 std::cerr << "error in: svj.tcc ("
1334 << ", m = " << m
1335 << ", n = " << n
1336 << ", typeA = " << char(typeA)
1337 << ", jobU = " << char(jobU)
1338 << ", jobV = " << char(jobV)
1339 << ", info = " << info
1340 << ") " << std::endl;
1341 ASSERT(0);
1342 } else {
1343 /*
1344 std::cerr << "passed: svj.tcc ("
1345 << ", m = " << m
1346 << ", n = " << n
1347 << ", typeA = " << char(typeA)
1348 << ", jobU = " << char(jobU)
1349 << ", jobV = " << char(jobV)
1350 << ", info = " << info
1351 << ") " << std::endl;
1352 */
1353 }
1354 # endif
1355
1356 return info;
1357 }
1358
1359 template <typename MA, typename VSVA, typename MV, typename VWORK>
1360 typename GeMatrix<MA>::IndexType
1361 svj(SVJ::TypeA typeA,
1362 SVJ::JobU jobU,
1363 SVJ::JobV jobV,
1364 GeMatrix<MA> &A,
1365 DenseVector<VSVA> &sva,
1366 GeMatrix<MV> &V,
1367 DenseVector<VWORK> &work)
1368 {
1369 # ifdef CHECK_CXXLAPACK
1370 typename GeMatrix<MA>::NoView A_org = A;
1371 typename DenseVector<VSVA>::NoView sva_org = sva;
1372 typename GeMatrix<MV>::NoView V_org = V;
1373 typename DenseVector<VSVA>::NoView work_org = work;
1374
1375 svj_(SVJ::Lower, jobU, jobV, A, sva, V, work);
1376
1377 A = A_org;
1378 sva = sva_org;
1379 V = V_org;
1380 work = work_org;
1381
1382 svj_(SVJ::Upper, jobU, jobV, A, sva, V, work);
1383
1384 A = A_org;
1385 sva = sva_org;
1386 V = V_org;
1387 work = work_org;
1388 svj_(SVJ::General, jobU, jobV, A, sva, V, work);
1389
1390 A = A_org;
1391 sva = sva_org;
1392 V = V_org;
1393 work = work_org;
1394 # endif
1395
1396 return svj_(typeA, jobU, jobV, A, sva, V, work);
1397 }
1398
1399
1400 template <typename MA, typename VSVA, typename MV, typename VWORK>
1401 typename GeMatrix<MA>::IndexType
1402 svj(SVJ::JobU jobU,
1403 SVJ::JobV jobV,
1404 GeMatrix<MA> &A,
1405 DenseVector<VSVA> &sva,
1406 GeMatrix<MV> &V,
1407 DenseVector<VWORK> &work)
1408 {
1409 return svj(SVJ::General, jobU, jobV, A, sva, V, work);
1410 }
1411
1412 template <typename MA, typename VSVA, typename MV, typename VWORK>
1413 typename TrMatrix<MA>::IndexType
1414 svj(SVJ::JobU jobU,
1415 SVJ::JobV jobV,
1416 TrMatrix<MA> &A,
1417 DenseVector<VSVA> &sva,
1418 GeMatrix<MV> &V,
1419 DenseVector<VWORK> &work)
1420 {
1421 SVJ::TypeA upLo = (A.upLo()==Upper) ? SVJ::Upper : SVJ::Lower;
1422
1423 return svj(upLo, jobU, jobV, A.general(), sva, V, work);
1424 }
1425
1426 //-- forwarding ----------------------------------------------------------------
1427 template <typename MA, typename VSVA, typename MV, typename VWORK>
1428 typename MA::IndexType
1429 svj(SVJ::TypeA typeA,
1430 SVJ::JobU jobU,
1431 SVJ::JobV jobV,
1432 MA &&A,
1433 VSVA &&sva,
1434 MV &&V,
1435 VWORK &&work)
1436 {
1437 typename MA::IndexType info;
1438
1439 CHECKPOINT_ENTER;
1440 info = svj(typeA, jobU, jobV, A, sva, V, work);
1441 CHECKPOINT_LEAVE;
1442
1443 return info;
1444 }
1445
1446 template <typename MA, typename VSVA, typename MV, typename VWORK>
1447 typename MA::IndexType
1448 svj(SVJ::JobU jobU,
1449 SVJ::JobV jobV,
1450 MA &&A,
1451 VSVA &&sva,
1452 MV &&V,
1453 VWORK &&work)
1454 {
1455 typename MA::IndexType info;
1456
1457 CHECKPOINT_ENTER;
1458 info = svj(jobU, jobV, A, sva, V, work);
1459 CHECKPOINT_LEAVE;
1460
1461 return info;
1462 }
1463
1464 } } // namespace lapack, flens
1465
1466 #endif // FLENS_LAPACK_SVD_SVJ_TCC
2 * Copyright (c) 2012, Michael Lehn
3 *
4 * All rights reserved.
5 *
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions
8 * are met:
9 *
10 * 1) Redistributions of source code must retain the above copyright
11 * notice, this list of conditions and the following disclaimer.
12 * 2) Redistributions in binary form must reproduce the above copyright
13 * notice, this list of conditions and the following disclaimer in
14 * the documentation and/or other materials provided with the
15 * distribution.
16 * 3) Neither the name of the FLENS development group nor the names of
17 * its contributors may be used to endorse or promote products derived
18 * from this software without specific prior written permission.
19 *
20 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
21 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
22 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
23 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
24 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
25 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
26 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
27 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
28 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
29 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
30 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
31 */
32
33 /* Based on
34 *
35 SUBROUTINE DGESVJ( JOBA, JOBU, JOBV, M, N, A, LDA, SVA, MV, V,
36 $ LDV, WORK, LWORK, INFO )
37 *
38 * -- LAPACK routine (version 3.3.1) --
39 *
40 * -- Contributed by Zlatko Drmac of the University of Zagreb and --
41 * -- Kresimir Veselic of the Fernuniversitaet Hagen --
42 * -- April 2011 --
43 *
44 * -- LAPACK is a software package provided by Univ. of Tennessee, --
45 * -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
46 *
47 * This routine is also part of SIGMA (version 1.23, October 23. 2008.)
48 * SIGMA is a library of algorithms for highly accurate algorithms for
49 * computation of SVD, PSVD, QSVD, (H,K)-SVD, and for solution of the
50 * eigenvalue problems Hx = lambda M x, H M x = lambda x with H, M > 0.
51 *
52 */
53
54 #ifndef FLENS_LAPACK_SVD_SVJ_TCC
55 #define FLENS_LAPACK_SVD_SVJ_TCC 1
56
57 #include <flens/blas/blas.h>
58 #include <flens/lapack/lapack.h>
59
60 namespace flens { namespace lapack {
61
62 //== generic lapack implementation =============================================
63
64 template <typename MA, typename VSVA, typename MV, typename VWORK>
65 typename GeMatrix<MA>::IndexType
66 svj_generic(SVJ::TypeA typeA,
67 SVJ::JobU jobU,
68 SVJ::JobV jobV,
69 GeMatrix<MA> &A,
70 DenseVector<VSVA> &sva,
71 GeMatrix<MV> &V,
72 DenseVector<VWORK> &work)
73 {
74 using std::abs;
75 using std::max;
76 using std::min;
77 using std::sqrt;
78 using std::swap;
79
80 typedef typename GeMatrix<MA>::ElementType ElementType;
81 typedef typename GeMatrix<MA>::IndexType IndexType;
82
83 const ElementType Zero(0), Half(0.5), One(1);
84 const IndexType nSweep = 30;
85
86 const Underscore<IndexType> _;
87
88 ElementType fastr_data[5];
89 DenseVectorView<ElementType>
90 fastr = typename DenseVectorView<ElementType>::Engine(5, fastr_data);
91
92 const IndexType m = A.numRows();
93 const IndexType n = A.numCols();
94 const IndexType mv = V.numRows();
95 const IndexType lWork = work.length();
96
97 const bool lower = (typeA==SVJ::Lower);
98 const bool upper = (typeA==SVJ::Upper);
99 const bool lhsVec = (jobU==SVJ::ComputeU);
100 const bool controlU = (jobU==SVJ::ControlU);
101 const bool applyV = (jobV==SVJ::ApplyV);
102 const bool rhsVec = (jobV==SVJ::ComputeV) || applyV;
103
104 IndexType info = 0;
105 //
106 //#:) Quick return for void matrix
107 //
108 if ((m==0) || (n==0)) {
109 return info;
110 }
111 //
112 // Set numerical parameters
113 // The stopping criterion for Jacobi rotations is
114 //
115 // max_{i<>j}|A(:,i)^T * A(:,j)|/(||A(:,i)||*||A(:,j)||) < CTOL*EPS
116 //
117 // where EPS is the round-off and CTOL is defined as follows:
118 //
119 ElementType cTol;
120 if (controlU) {
121 // ... user controlled
122 cTol = work(1);
123 } else {
124 // ... default
125 if (lhsVec || rhsVec) {
126 cTol = sqrt(ElementType(m));
127 } else {
128 cTol = ElementType(m);
129 }
130 }
131 // ... and the machine dependent parameters are
132 //[!] (Make sure that DLAMCH() works properly on the target machine.)
133 //
134 const ElementType eps = lamch<ElementType>(Eps);
135 const ElementType rootEps = sqrt(eps);
136 const ElementType safeMin = lamch<ElementType>(SafeMin);
137 const ElementType rootSafeMin = sqrt(safeMin);
138 const ElementType small = safeMin / eps;
139 const ElementType big = lamch<ElementType>(OverflowThreshold);
140 // const ElementType big = One / safeMin;
141 const ElementType rootBig = One / rootSafeMin;
142 const ElementType bigTheta = One / rootEps;
143
144 const ElementType tol = cTol*eps;
145 const ElementType rootTol = sqrt(tol);
146
147 if (ElementType(m)*eps>=One) {
148 // we return -4 to keep it compatible with the LAPACK implementation
149 return -4;
150 }
151 //
152 // Initialize the right singular vector matrix.
153 //
154 if (rhsVec && (!applyV)) {
155 V = Zero;
156 V.diag(0) = One;
157 }
158 //
159 // Initialize SVA( 1:N ) = ( ||A e_i||_2, i = 1:N )
160 //(!) If necessary, scale A to protect the largest singular value
161 // from overflow. It is possible that saving the largest singular
162 // value destroys the information about the small ones.
163 // This initial scaling is almost minimal in the sense that the
164 // goal is to make sure that no column norm overflows, and that
165 // DSQRT(N)*max_i SVA(i) does not overflow. If INFinite entries
166 // in A are detected, the procedure returns with INFO=-6.
167 //
168 ElementType skl = One / sqrt(ElementType(m)*ElementType(n));
169 bool noScale = true;
170 bool goScale = true;
171
172 ElementType aapp, aaqq, tmp;
173
174 if (lower) {
175 // the input matrix is M-by-N lower triangular (trapezoidal)
176 for (IndexType p=1; p<=n; ++p) {
177 aapp = Zero;
178 aaqq = One;
179 lassq(A(_(p,m),p), aapp, aaqq);
180 if (aapp>big) {
181 return -6;
182 }
183 aaqq = sqrt(aaqq);
184 if ((aapp<(big/aaqq)) && noScale) {
185 sva(p) = aapp*aaqq;
186 } else {
187 noScale = false;
188 sva(p) = aapp*(aaqq*skl);
189 if (goScale) {
190 goScale = false;
191 sva(_(1,p-1)) *= skl;
192 }
193 }
194 }
195 } else if (upper) {
196 // the input matrix is M-by-N upper triangular (trapezoidal)
197 for (IndexType p=1; p<=n; ++p) {
198 aapp = Zero;
199 aaqq = One;
200 lassq(A(_(1,p), p), aapp, aaqq);
201 if (aapp>big) {
202 return -6;
203 }
204 aaqq = sqrt(aaqq);
205 if ((aapp<(big/aaqq)) && noScale) {
206 sva(p) = aapp*aaqq;
207 } else {
208 noScale = false;
209 sva(p) = aapp*(aaqq*skl);
210 if (goScale) {
211 goScale = false;
212 sva(_(1,p-1)) *= skl;
213 }
214 }
215 }
216 } else {
217 // the input matrix is M-by-N general dense
218 for (IndexType p=1; p<=n; ++p) {
219 aapp = Zero;
220 aaqq = One;
221 lassq(A(_,p), aapp, aaqq);
222 if (aapp>big) {
223 return -6;
224 }
225 aaqq = sqrt(aaqq);
226 if ((aapp<(big/aaqq)) && noScale) {
227 sva(p) = aapp*aaqq;
228 } else {
229 noScale = false;
230 sva(p) = aapp*(aaqq*skl);
231 if (goScale) {
232 goScale = false;
233 sva(_(1,p-1)) *= skl;
234 }
235 }
236 }
237 }
238
239 if (noScale) {
240 skl = One;
241 }
242 //
243 // Move the smaller part of the spectrum from the underflow threshold
244 //(!) Start by determining the position of the nonzero entries of the
245 // array SVA() relative to ( SFMIN, BIG ).
246 //
247 aapp = Zero;
248 aaqq = big;
249 for (IndexType p=1; p<=n; ++p) {
250 if (sva(p)!=Zero) {
251 aaqq = min(aaqq,sva(p));
252 }
253 aapp = max(aapp,sva(p));
254 }
255 //
256 //#:) Quick return for zero matrix
257 //
258 if (aapp==Zero) {
259 if (lhsVec) {
260 A = Zero;
261 A.diag(0) = One;
262 }
263 work(1) = One;
264 work(_(2,6)) = Zero;
265 return info;
266 }
267 //
268 //#:) Quick return for one-column matrix
269 //
270 if (n==1) {
271 if (lhsVec) {
272 lascl(LASCL::FullMatrix, 0, 0, sva(1), skl, A);
273 }
274 work(1) = One / skl;
275 if (sva(1)>=safeMin) {
276 work(2) = One;
277 } else {
278 work(2) = Zero;
279 }
280 work(_(3,6)) = Zero;
281 return info;
282 }
283 //
284 // Protect small singular values from underflow, and try to
285 // avoid underflows/overflows in computing Jacobi rotations.
286 //
287 ElementType cs, sn;
288
289 sn = sqrt(safeMin/eps);
290 tmp = sqrt(big/ElementType(n));
291 if ((aapp<=sn) || (aaqq>=tmp) || (sn<=aaqq && aapp<=tmp)) {
292 tmp = min(big, tmp/aapp);
293 // aaqq = aaqq*tmp
294 // aapp = aapp*tmp
295 } else if ((aaqq<=sn) && (aapp<=tmp)) {
296 tmp = min(sn/aaqq, big/(aapp*sqrt(ElementType(n))));
297 // aaqq = aaqq*tmp
298 // aapp = aapp*tmp
299 } else if ((aaqq>=sn) && (aapp>=tmp)) {
300 tmp = max(sn/aaqq, tmp/aapp);
301 // aaqq = aaqq*tmp
302 // aapp = aapp*tmp
303 } else if ((aaqq<=sn) && (aapp>=tmp)) {
304 tmp = min(sn/aaqq, big/(sqrt(ElementType(n))*aapp));
305 // aaqq = aaqq*tmp
306 // aapp = aapp*tmp
307 } else {
308 tmp = One;
309 }
310 //
311 // Scale, if necessary
312 //
313 if (tmp!=One) {
314 lascl(LASCL::FullMatrix, 0, 0, One, tmp, sva);
315 }
316 skl *= tmp;
317 if (skl!=One) {
318 if (upper) {
319 lascl(LASCL::UpperTriangular, 0, 0, One, skl, A);
320 } else if (lower) {
321 lascl(LASCL::LowerTriangular, 0, 0, One, skl, A);
322 } else {
323 lascl(LASCL::FullMatrix, 0, 0, One, skl, A);
324 }
325 skl = One / skl;
326 }
327 //
328 // Row-cyclic Jacobi SVD algorithm with column pivoting
329 //
330 const IndexType emptsw = n*(n-1)/2;
331 IndexType notRot = 0;
332 fastr(1) = Zero;
333 //
334 // A is represented in factored form A = A * diag(WORK), where diag(WORK)
335 // is initialized to identity. WORK is updated during fast scaled
336 // rotations.
337 //
338 work(_(1,n)) = One;
339 //
340 //
341 IndexType swBand = 3;
342 //[TP] SWBAND is a tuning parameter [TP]. It is meaningful and effective
343 // if DGESVJ is used as a computational routine in the preconditioned
344 // Jacobi SVD algorithm DGESVJ. For sweeps i=1:SWBAND the procedure
345 // works on pivots inside a band-like region around the diagonal.
346 // The boundaries are determined dynamically, based on the number of
347 // pivots above a threshold.
348 //
349 const IndexType kbl = min(IndexType(8),n);
350 //[TP] KBL is a tuning parameter that defines the tile size in the
351 // tiling of the p-q loops of pivot pairs. In general, an optimal
352 // value of KBL depends on the matrix dimensions and on the
353 // parameters of the computer's memory.
354 //
355 IndexType nbl = n / kbl;
356 if (nbl*kbl!=n) {
357 ++nbl;
358 }
359
360 const IndexType blSkip = pow(kbl, 2);
361 //[TP] BLKSKIP is a tuning parameter that depends on SWBAND and KBL.
362 //
363 const IndexType rowSkip = min(IndexType(5), kbl);
364 //[TP] ROWSKIP is a tuning parameter.
365 //
366 const IndexType lkAhead = 1;
367 //[TP] LKAHEAD is a tuning parameter.
368 //
369 // Quasi block transformations, using the lower (upper) triangular
370 // structure of the input matrix. The quasi-block-cycling usually
371 // invokes cubic convergence. Big part of this cycle is done inside
372 // canonical subspaces of dimensions less than M.
373 //
374 if ((lower || upper) && (n>max(IndexType(64), 4*kbl))) {
375 //[TP] The number of partition levels and the actual partition are
376 // tuning parameters.
377 const IndexType n4 = n / 4;
378 const IndexType n2 = n / 2;
379 const IndexType n34 = 3*n4;
380 const IndexType q = (applyV) ? 0 : 1;
381 const IndexType qm = (applyV) ? mv : 0;
382
383 auto _work = work(_(n+1,lWork));
384
385 if (lower) {
386 //
387 // This works very well on lower triangular matrices, in particular
388 // in the framework of the preconditioned Jacobi SVD (xGEJSV).
389 // The idea is simple:
390 // [+ 0 0 0] Note that Jacobi transformations of [0 0]
391 // [+ + 0 0] [0 0]
392 // [+ + x 0] actually work on [x 0] [x 0]
393 // [+ + x x] [x x]. [x x]
394 //
395 auto A1 = A(_( 1,m),_( 1, n4));
396 auto A2 = A(_( n4+1,m),_( n4+1, n2));
397 auto A3 = A(_( n2+1,m),_( n2+1,n34));
398 auto A4 = A(_(n34+1,m),_(n34+1, n));
399
400 auto A12 = A(_( 1,m),_( 1,n2));
401 auto A34 = A(_(n2+1,m),_(n2+1, n));
402
403 auto V1 = V(_( 1, n4*q+qm),_( 1,n4));
404 auto V2 = V(_( n4*q+1, n2*q+qm),_( n4+1,n2));
405 auto V3 = V(_( n2*q+1,n34*q+qm),_( n2+1,n34));
406 auto V4 = V(_(n34*q+1, mv*q+qm),_(n34+1,n));
407
408 auto V12 = V(_( 1,n2*q+qm),_( 1,n2));
409 auto V34 = V(_( n2*q+1,mv*q+qm),_(n2+1,n));
410
411 auto d1 = work(_( 1, n4));
412 auto d2 = work(_( n4+1, n2));
413 auto d3 = work(_( n2+1,n34));
414 auto d4 = work(_(n34+1, n));
415
416 auto d12 = work(_( 1,n2));
417 auto d34 = work(_( n2+1, n));
418
419 auto sva1 = sva(_( 1, n4));
420 auto sva2 = sva(_( n4+1, n2));
421 auto sva3 = sva(_( n2+1,n34));
422 auto sva4 = sva(_(n34+1, n));
423
424 auto sva12 = sva(_( 1,n2));
425 auto sva34 = sva(_(n2+1, n));
426
427 svj0(jobV, A4, d4, sva4, V4, eps, safeMin, tol, 2, _work);
428 svj0(jobV, A3, d3, sva3, V3, eps, safeMin, tol, 2, _work);
429 svj1(jobV, n4, A34, d34, sva34, V34, eps, safeMin, tol, 1, _work);
430 svj0(jobV, A2, d2, sva2, V2, eps, safeMin, tol, 1, _work);
431 svj0(jobV, A1, d1, sva1, V1, eps, safeMin, tol, 1, _work);
432 svj1(jobV, n4, A12, d12, sva12, V12, eps, safeMin, tol, 1, _work);
433
434 } else if (upper) {
435
436 auto A1 = A(_(1, n4),_( 1, n4));
437 auto A2 = A(_(1, n2),_( n4+1,n4+n4));
438 auto A3 = A(_(1,n2+n4),_( n2+1,n2+n4));
439
440 auto A12 = A(_(1,n2),_(1,n2));
441
442 auto V1 = V(_( 1, n4*q+qm),_( 1, n4));
443 auto V2 = V(_( n4*q+1,(n4+n4)*q+qm),_( n4+1,n4+n4));
444 auto V3 = V(_( n2*q+1,(n2+n4)*q+qm),_( n2+1,n2+n4));
445
446 auto V12 = V(_(1,n2*q+qm),_(1,n2));
447
448 auto d1 = work(_( 1, n4));
449 auto d2 = work(_( n4+1,n4+n4));
450 auto d3 = work(_( n2+1,n2+n4));
451
452 auto d12 = work(_(1,n2));
453
454 auto sva1 = sva(_( 1, n4));
455 auto sva2 = sva(_( n4+1,n4+n4));
456 auto sva3 = sva(_( n2+1,n2+n4));
457
458 auto sva12 = sva(_(1,n2));
459
460 svj0(jobV, A1, d1, sva1, V1, eps, safeMin, tol, 2, _work);
461 svj0(jobV, A2, d2, sva2, V2, eps, safeMin, tol, 1, _work);
462 svj1(jobV, n4, A12, d12, sva12, V12, eps, safeMin, tol, 1, _work);
463 svj0(jobV, A3, d3, sva3, V3, eps, safeMin, tol, 1, _work);
464
465 }
466
467 }
468 //
469 // .. Row-cyclic pivot strategy with de Rijk's pivoting ..
470 //
471 ElementType max_aapq, aapq, aapp0, aqoap, apoaq;
472 ElementType max_sinj;
473 ElementType theta, thetaSign, t;
474
475 IndexType i, iswRot, pSkipped, ibr, igl, jgl, ir1, p, q, jbc, ijblsk;
476
477 bool converged = false;
478 bool rotOk;
479
480 for (i=1; i<=nSweep; ++i) {
481 //
482 // .. go go go ...
483 //
484 max_aapq = Zero;
485 max_sinj = Zero;
486
487 iswRot = 0;
488 pSkipped = 0;
489
490 notRot = 0;
491 //
492 // Each sweep is unrolled using KBL-by-KBL tiles over the pivot pairs
493 // 1 <= p < q <= N. This is the first step toward a blocked implementation
494 // of the rotations. New implementation, based on block transformations,
495 // is under development.
496 //
497 for (ibr=1; ibr<=nbl; ++ibr) {
498
499 igl = (ibr-1)*kbl + 1;
500
501 for (ir1=0; ir1<=min(lkAhead,nbl-ibr); ++ir1) {
502
503 igl += ir1*kbl;
504
505 for (p=igl; p<=min(igl+kbl-1,n-1); ++p) {
506 //
507 // .. de Rijk's pivoting
508 //
509 q = blas::iamax(sva(_(p,n))) + p - 1;
510 if (p!=q) {
511 blas::swap(A(_,p),A(_,q));
512 if (rhsVec) {
513 blas::swap(V(_,p),V(_,q));
514 }
515 swap(sva(p),sva(q));
516 swap(work(p),work(q));
517 }
518
519 if (ir1==0) {
520 //
521 // Column norms are periodically updated by explicit
522 // norm computation.
523 // Caveat:
524 // Unfortunately, some BLAS implementations compute DNRM2(M,A(1,p),1)
525 // as DSQRT(DDOT(M,A(1,p),1,A(1,p),1)), which may cause the result to
526 // overflow for ||A(:,p)||_2 > DSQRT(overflow_threshold), and to
527 // underflow for ||A(:,p)||_2 < DSQRT(underflow_threshold).
528 // Hence, DNRM2 cannot be trusted, not even in the case when
529 // the true norm is far from the under(over)flow boundaries.
530 // If properly implemented DNRM2 is available, the IF-THEN-ELSE
531 // below should read "AAPP = DNRM2( M, A(1,p), 1 ) * WORK(p)".
532 //
533
534 if ((sva(p)<rootBig) && (sva(p)>rootSafeMin)) {
535 sva(p) = blas::nrm2(A(_,p)) * work(p);
536 } else {
537 tmp = Zero;
538 aapp = One;
539 lassq(A(_,p), tmp, aapp);
540 sva(p) = tmp * sqrt(aapp) * work(p);
541 }
542 aapp = sva(p);
543 } else {
544 aapp = sva(p);
545 }
546
547 if (aapp>Zero) {
548
549 pSkipped = 0;
550
551 for (q=p+1; q<=min(igl+kbl-1,n); ++q) {
552
553 aaqq = sva(q);
554
555 if (aaqq>Zero) {
556
557 aapp0 = aapp;
558 if (aaqq>=One) {
559 rotOk = (small*aapp<=aaqq);
560 if (aapp<big/aaqq) {
561 aapq = (A(_,p)*A(_,q)*work(p)*work(q)/aaqq)
562 / aapp;
563 } else {
564 auto _work = work(_(n+1,n+m));
565
566 _work = A(_,p);
567 lascl(LASCL::FullMatrix, 0, 0,
568 aapp, work(p), _work);
569 aapq = _work*A(_,q)*work(q)/aaqq;
570 }
571 } else {
572 rotOk = (aapp<=aaqq/small);
573 if (aapp>(small/aaqq)) {
574 aapq = (A(_,p)*A(_,q)*work(p)*work(q)/aaqq)
575 / aapp;
576 } else {
577 auto _work = work(_(n+1,n+m));
578
579 _work = A(_,q);
580 lascl(LASCL::FullMatrix, 0, 0,
581 aaqq, work(q), _work);
582 aapq = _work*A(_,p)*work(p)/aapp;
583 }
584 }
585
586 max_aapq = max(max_aapq, abs(aapq));
587 //
588 // TO rotate or NOT to rotate, THAT is the question ...
589 //
590 if (abs(aapq)>tol) {
591 //
592 // .. rotate
593 //[RTD] ROTATED = ROTATED + ONE
594 //
595 if (ir1==0) {
596 notRot = 0;
597 pSkipped = 0;
598 ++iswRot;
599 }
600
601 if (rotOk) {
602
603 aqoap = aaqq / aapp;
604 apoaq = aapp / aaqq;
605 theta = -Half*abs(aqoap-apoaq)/aapq;
606
607 if (abs(theta)>bigTheta) {
608
609 t = Half / theta;
610 fastr(3) = t*work(p) / work(q);
611 fastr(4) = -t*work(q) / work(p);
612 blas::rotm(A(_,p), A(_,q), fastr);
613 if (rhsVec) {
614 blas::rotm(V(_,p), V(_,q), fastr);
615 }
616 sva(q) =aaqq*sqrt(max(Zero,One+t*apoaq*aapq));
617 aapp *= sqrt(max(Zero, One-t*aqoap*aapq));
618 max_sinj = max(max_sinj, abs(t));
619
620 } else {
621 //
622 // .. choose correct signum for THETA and rotate
623 //
624 thetaSign = -sign(One,aapq);
625 t = One
626 / (theta +thetaSign*sqrt(One+theta*theta));
627 cs = sqrt(One / (One+t*t));
628 sn = t*cs;
629
630 max_sinj = max(max_sinj, abs(sn));
631 sva(q) = aaqq
632 *sqrt(max(Zero, One+t*apoaq*aapq));
633 aapp *= sqrt(max(Zero, One-t*aqoap*aapq));
634
635 apoaq = work(p) / work(q);
636 aqoap = work(q) / work(p);
637 if (work(p)>=One) {
638 if (work(q)>=One) {
639 fastr(3) = t*apoaq;
640 fastr(4) = -t*aqoap;
641 work(p) *= cs;
642 work(q) *= cs;
643 blas::rotm(A(_,p), A(_,q), fastr);
644 if (rhsVec) {
645 blas::rotm(V(_,p), V(_,q), fastr);
646 }
647 } else {
648 A(_,p) -= t*aqoap*A(_,q);
649 A(_,q) += cs*sn*apoaq*A(_,p);
650 work(p) *= cs;
651 work(q) /= cs;
652 if (rhsVec) {
653 V(_,p) -= t*aqoap*V(_,q);
654 V(_,q) += cs*sn*apoaq*V(_,p);
655 }
656 }
657 } else {
658 if (work(q)>=One) {
659 A(_,q) += t*apoaq*A(_,p);
660 A(_,p) -= cs*sn*aqoap*A(_,q);
661 work(p) /= cs;
662 work(q) *= cs;
663 if (rhsVec) {
664 V(_,q) += t*apoaq*V(_,p);
665 V(_,p) -= cs*sn*aqoap*V(_,q);
666 }
667 } else {
668 if (work(p)>=work(q)) {
669 A(_,p) -= t*aqoap*A(_,q);
670 A(_,q) += cs*sn*apoaq*A(_,p);
671 work(p) *= cs;
672 work(q) /= cs;
673 if (rhsVec) {
674 V(_,p) -= t*aqoap*V(_,q);
675 V(_,q) += cs*sn*apoaq*V(_,p);
676 }
677 } else {
678 A(_,q) += t*apoaq*A(_,p);
679 A(_,p) -= cs*sn*aqoap*A(_,q);
680 work(p) /= cs;
681 work(q) *= cs;
682 if (rhsVec) {
683 V(_,q) += t*apoaq*V(_,p);
684 V(_,p) -= cs*sn*aqoap*V(_,q);
685 }
686 }
687 }
688 }
689 }
690
691 } else {
692 // .. have to use modified Gram-Schmidt like transformation
693 auto _work = work(_(n+1,lWork));
694
695 _work = A(_,p);
696 lascl(LASCL::FullMatrix, 0, 0,
697 aapp, One, _work);
698 lascl(LASCL::FullMatrix, 0, 0,
699 aaqq, One, A(_,q));
700 tmp = -aapq*work(p)/work(q);
701 A(_,q) += tmp*_work;
702 lascl(LASCL::FullMatrix, 0, 0,
703 One, aaqq, A(_,q));
704 sva(q) = aaqq*sqrt(max(Zero, One-aapq*aapq));
705 max_sinj = max(max_sinj, safeMin);
706 }
707 // END IF ROTOK THEN ... ELSE
708 //
709 // In the case of cancellation in updating SVA(q), SVA(p)
710 // recompute SVA(q), SVA(p).
711 //
712 if (pow(sva(q)/aaqq,2)<=rootEps) {
713 if ((aaqq<rootBig) && (aaqq>rootSafeMin)) {
714 sva(q) = blas::nrm2(A(_,q))*work(q);
715 } else {
716 t = Zero;
717 aaqq = One;
718 lassq(A(_,q), t, aaqq);
719 sva(q) = t*sqrt(aaqq)*work(q);
720 }
721 }
722 if (aapp/aapp0<=rootEps) {
723 if ((aapp<rootBig) && (aapp>rootSafeMin)) {
724 aapp = blas::nrm2(A(_,p))*work(p);
725 } else {
726 t = Zero;
727 aapp = One;
728 lassq(A(_,p), t, aapp);
729 aapp = t*sqrt(aapp)*work(p);
730 }
731 sva(p) = aapp;
732 }
733 //
734 } else {
735 // A(:,p) and A(:,q) already numerically orthogonal
736 if (ir1==0) {
737 ++notRot;
738 }
739 //[RTD] SKIPPED = SKIPPED + 1
740 ++pSkipped;
741 }
742 } else {
743 // A(:,q) is zero column
744 if (ir1==0) {
745 ++notRot;
746 }
747 ++pSkipped;
748 }
749 //
750 if ((i<=swBand) && (pSkipped>rowSkip)) {
751 if (ir1==0) {
752 aapp = -aapp;
753 }
754 notRot = 0;
755 break;
756 }
757
758 }
759 // END q-LOOP
760 //
761 sva(p) = aapp;
762
763 } else {
764 sva(p) = aapp;
765 if ((ir1==0) && (aapp==Zero)) {
766 notRot += min(igl+kbl-1,n) - p;
767 }
768 }
769
770 }
771 // end of the p-loop
772 // end of doing the block ( ibr, ibr )
773 }
774 // end of ir1-loop
775 //
776 //... go to the off diagonal blocks
777 //
778 igl = (ibr-1)*kbl + 1;
779
780 for (jbc=ibr+1; jbc<=nbl; ++jbc) {
781
782 jgl = (jbc-1)*kbl + 1;
783 //
784 // doing the block at ( ibr, jbc )
785 //
786 ijblsk = 0;
787 for (p=igl; p<=min(igl+kbl-1,n); ++p) {
788
789 aapp = sva(p);
790 if (aapp>Zero) {
791
792 pSkipped = 0;
793
794 for (q=jgl; q<=min(jgl+kbl-1,n); ++q) {
795
796 aaqq = sva(q);
797 if (aaqq>Zero) {
798 aapp0 = aapp;
799
800 //
801 // .. M x 2 Jacobi SVD ..
802 //
803 // Safe Gram matrix computation
804 //
805 if (aaqq>=One) {
806 if (aapp>=aaqq) {
807 rotOk = (small*aapp)<=aaqq;
808 } else {
809 rotOk = (small*aaqq)<=aapp;
810 }
811 if (aapp<(big/aaqq)) {
812 aapq = (A(_,p)*A(_,q)*work(p)*work(q)/aaqq)
813 / aapp;
814 } else {
815 auto _work = work(_(n+1,n+m));
816
817 _work = A(_,p);
818 lascl(LASCL::FullMatrix, 0, 0,
819 aapp, work(p), _work);
820 aapq = _work*A(_,q)*work(q) / aaqq;
821 }
822 } else {
823 if (aapp>=aaqq) {
824 rotOk = aapp<=(aaqq/small);
825 } else {
826 rotOk = aaqq<=(aapp/small);
827 }
828 if (aapp>(small/aaqq)) {
829 aapq = (A(_,p)*A(_,q)*work(p)*work(q)/aaqq)
830 / aapp;
831 } else {
832 auto _work = work(_(n+1,n+m));
833
834 _work = A(_,q);
835 lascl(LASCL::FullMatrix, 0, 0,
836 aaqq, work(q), _work);
837 aapq = _work*A(_,p)*work(p)/aapp;
838 }
839 }
840
841 max_aapq = max(max_aapq, abs(aapq));
842 //
843 // TO rotate or NOT to rotate, THAT is the question ...
844 //
845 if (abs(aapq)>tol) {
846 notRot = 0;
847 //[RTD] ROTATED = ROTATED + 1
848 pSkipped = 0;
849 ++iswRot;
850
851 if (rotOk) {
852
853 aqoap = aaqq / aapp;
854 apoaq = aapp / aaqq;
855 theta = -Half*abs(aqoap-apoaq)/aapq;
856 if (aaqq>aapp0) {
857 theta = -theta;
858 }
859
860 if (abs(theta)>bigTheta) {
861 t = Half/theta;
862 fastr(3) = t*work(p) / work(q);
863 fastr(4) = -t*work(q) / work(p);
864 blas::rotm(A(_,p), A(_,q), fastr);
865 if (rhsVec) {
866 blas::rotm(V(_,p), V(_,q), fastr);
867 }
868 sva(q) = aaqq
869 * sqrt(max(Zero, One+t*apoaq*aapq));
870 aapp *= sqrt(max(Zero,One-t*aqoap*aapq));
871 max_sinj = max(max_sinj, abs(t));
872 } else {
873 //
874 // .. choose correct signum for THETA and rotate
875 //
876 thetaSign = -sign(One,aapq);
877 if (aaqq>aapp0) {
878 thetaSign = -thetaSign;
879 }
880 t = One
881 / (theta+thetaSign*sqrt(One+theta*theta));
882 cs = sqrt(One / (One + t*t));
883 sn = t*cs;
884 max_sinj = max(max_sinj, abs(sn));
885 sva(q) = aaqq
886 * sqrt(max(Zero, One+t*apoaq*aapq));
887 aapp *= sqrt(max(Zero, One-t*aqoap*aapq));
888
889 apoaq = work(p) / work(q);
890 aqoap = work(q) / work(p);
891 if (work(p)>=One) {
892
893 if (work(q)>=One) {
894 fastr(3) = t*apoaq;
895 fastr(4) = -t*aqoap;
896 work(p) *= cs;
897 work(q) *= cs;
898 blas::rotm(A(_,p), A(_,q), fastr);
899 if (rhsVec) {
900 blas::rotm(V(_,p), V(_,q), fastr);
901 }
902 } else {
903 A(_,p) -= t*aqoap*A(_,q);
904 A(_,q) += cs*sn*apoaq*A(_,p);
905 if (rhsVec) {
906 V(_,p) -= t*aqoap*V(_,q);
907 V(_,q) += cs*sn*apoaq*V(_,p);
908 }
909 work(p) *= cs;
910 work(q) /= cs;
911 }
912 } else {
913 if (work(q)>=One) {
914 A(_,q) += t*apoaq*A(_,p);
915 A(_,p) -= cs*sn*aqoap*A(_,q);
916 if (rhsVec) {
917 V(_,q) += t*apoaq*V(_,p);
918 V(_,p) -= cs*sn*aqoap*V(_,q);
919 }
920 work(p) /= cs;
921 work(q) *= cs;
922 } else {
923 if (work(p)>=work(q)) {
924 A(_,p) -= t*aqoap*A(_,q);
925 A(_,q) += cs*sn*apoaq*A(_,p);
926 work(p) *= cs;
927 work(q) /= cs;
928 if (rhsVec) {
929 V(_,p) -= t*aqoap*V(_,q);
930 V(_,q) += cs*sn*apoaq*V(_,p);
931 }
932 } else {
933 A(_,q) += t*apoaq*A(_,p);
934 A(_,p) -= cs*sn*aqoap*A(_,q);
935 work(p) /= cs;
936 work(q) *= cs;
937 if (rhsVec) {
938 V(_,q) += t*apoaq*V(_,p);
939 V(_,p) -= cs*sn*aqoap*V(_,q);
940 }
941 }
942 }
943 }
944 }
945
946 } else {
947 auto _work = work(_(n+1,lWork));
948
949 if (aapp>aaqq) {
950 _work = A(_,p);
951 lascl(LASCL::FullMatrix, 0, 0,
952 aapp, One, _work);
953 lascl(LASCL::FullMatrix, 0, 0,
954 aaqq, One, A(_,q));
955 tmp = -aapq*work(p) / work(q);
956 A(_,q) += tmp*_work;
957 lascl(LASCL::FullMatrix, 0, 0,
958 One, aaqq, A(_,q));
959 sva(q) = aaqq*sqrt(max(Zero, One-aapq*aapq));
960 max_sinj = max(max_sinj, safeMin);
961 } else {
962 _work = A(_,q);
963 lascl(LASCL::FullMatrix, 0, 0,
964 aaqq, One, _work);
965 lascl(LASCL::FullMatrix, 0, 0,
966 aapp, One, A(_,p));
967 tmp = -aapq*work(q) / work(p);
968 A(_,p) += tmp * _work;
969 lascl(LASCL::FullMatrix, 0, 0,
970 One, aapp, A(_,p));
971 sva(p) = aapp*sqrt(max(Zero, One-aapq*aapq));
972 max_sinj = max(max_sinj, safeMin);
973 }
974 }
975 // END IF ROTOK THEN ... ELSE
976 //
977 // In the case of cancellation in updating SVA(q)
978 // .. recompute SVA(q)
979 if (pow(sva(q)/aaqq,2)<=rootEps) {
980 if ((aaqq<rootBig) && (aaqq>rootSafeMin)) {
981 sva(q) = blas::nrm2(A(_,q))*work(q);
982 } else {
983 t = Zero;
984 aaqq = One;
985 lassq(A(_,q), t, aaqq);
986 sva(q) = t*sqrt(aaqq)*work(q);
987 }
988 }
989 if (pow(aapp/aapp0,2)<=rootEps) {
990 if ((aapp<rootBig) && (aapp>rootSafeMin)) {
991 aapp = blas::nrm2(A(_,p))*work(p);
992 } else {
993 t = Zero;
994 aapp = One;
995 lassq(A(_,p), t, aapp);
996 aapp = t*sqrt(aapp)*work(p);
997 }
998 sva(p) = aapp;
999 }
1000 // end of OK rotation
1001 } else {
1002 ++notRot;
1003 //[RTD] SKIPPED = SKIPPED + 1
1004 ++pSkipped;
1005 ++ijblsk;
1006 }
1007 } else {
1008 ++notRot;
1009 ++pSkipped;
1010 ++ijblsk;
1011 }
1012
1013 if ((i<=swBand) && (ijblsk>=blSkip)) {
1014 sva(p) = aapp;
1015 notRot = 0;
1016 goto jbcLoopExit;
1017 }
1018 if ((i<=swBand) && (pSkipped>rowSkip)) {
1019 aapp = -aapp;
1020 notRot = 0;
1021 break;
1022 }
1023
1024 }
1025 // end of the q-loop
1026
1027 sva(p) = aapp;
1028
1029 } else {
1030
1031 if (aapp==Zero) {
1032 notRot += min(jgl+kbl-1,n) -jgl + 1;
1033 }
1034 if (aapp<Zero) {
1035 notRot = 0;
1036 }
1037
1038 }
1039
1040 }
1041 // end of the p-loop
1042 }
1043 // end of the jbc-loop
1044 jbcLoopExit:
1045 // bailed out of the jbc-loop
1046 for (p=igl; p<=min(igl+kbl-1,n); ++p) {
1047 sva(p) = abs(sva(p));
1048 }
1049 //**
1050 }
1051 // end of the ibr-loop
1052 //
1053 // .. update SVA(N)
1054 if ((sva(n)<rootBig) && (sva(n)>rootSafeMin)) {
1055 sva(n) = blas::nrm2(A(_,n))*work(n);
1056 } else {
1057 t = Zero;
1058 aapp = One;
1059 lassq(A(_,n), t, aapp);
1060 sva(n) = t*sqrt(aapp)*work(n);
1061 }
1062 //
1063 // Additional steering devices
1064 //
1065 if ((i<swBand) && ((max_aapq<=rootTol) || (iswRot<=n))) {
1066 swBand = i;
1067 }
1068
1069 if (i>swBand+1 && max_aapq<sqrt(ElementType(n))*tol
1070 && ElementType(n)*max_aapq*max_sinj<tol) {
1071 converged = true;
1072 break;
1073 }
1074
1075 if (notRot>=emptsw) {
1076 converged = true;
1077 break;
1078 }
1079
1080 }
1081 // end i=1:NSWEEP loop
1082 //
1083 if (converged) {
1084 //#:) INFO = 0 confirms successful iterations.
1085 info = 0;
1086 } else {
1087 //#:( Reaching this point means that the procedure has not converged.
1088 info = nSweep - 1;
1089 }
1090 //
1091 // Sort the singular values and find how many are above
1092 // the underflow threshold.
1093 //
1094 IndexType n2 = 0;
1095 IndexType n4 = 0;
1096 for (IndexType p=1; p<=n-1; ++p) {
1097 const IndexType q = blas::iamax(sva(_(p,n))) + p - 1;
1098 if (p!=q) {
1099 swap(sva(p), sva(q));
1100 swap(work(p), work(q));
1101 blas::swap(A(_,p), A(_,q));
1102 if (rhsVec) {
1103 blas::swap(V(_,p), V(_,q));
1104 }
1105 }
1106 if (sva(p)!=Zero) {
1107 ++n4;
1108 if (sva(p)*skl>safeMin) {
1109 ++n2;
1110 }
1111 }
1112 }
1113 if (sva(n)!=Zero) {
1114 ++n4;
1115 if (sva(n)*skl>safeMin) {
1116 ++n2;
1117 }
1118 }
1119 //
1120 // Normalize the left singular vectors.
1121 //
1122 if (lhsVec || controlU) {
1123 for (IndexType p=1; p<=n2; ++p) {
1124 A(_,p) *= work(p)/sva(p);
1125 }
1126 }
1127 //
1128 // Scale the product of Jacobi rotations (assemble the fast rotations).
1129 //
1130 if (rhsVec) {
1131 if (applyV) {
1132 for (IndexType p=1; p<=n; ++p) {
1133 V(_,p) *= work(p);
1134 }
1135 } else {
1136 for (IndexType p=1; p<=n; ++p) {
1137 V(_,p) *= One / blas::nrm2(V(_,p));
1138 }
1139 }
1140 }
1141 //
1142 // Undo scaling, if necessary (and possible).
1143 if (((skl>One) && (sva(1)<big/skl)) || ((skl<One) && (sva(n2)>safeMin/skl))) {
1144 sva *= skl;
1145 skl = One;
1146 }
1147 //
1148 work(1) = skl;
1149 // The singular values of A are SKL*SVA(1:N). If SKL.NE.ONE
1150 // then some of the singular values may overflow or underflow and
1151 // the spectrum is given in this factored representation.
1152 //
1153 work(2) = ElementType(n4);
1154 // N4 is the number of computed nonzero singular values of A.
1155 //
1156 work(3) = ElementType(n2);
1157 // N2 is the number of singular values of A greater than SFMIN.
1158 // If N2<N, SVA(N2:N) contains ZEROS and/or denormalized numbers
1159 // that may carry some information.
1160 //
1161 work(4) = ElementType(i);
1162 // i is the index of the last sweep before declaring convergence.
1163 //
1164 work(5) = max_aapq;
1165 // MXAAPQ is the largest absolute value of scaled pivots in the
1166 // last sweep
1167 //
1168 work(6) = max_sinj;
1169 // MXSINJ is the largest absolute value of the sines of Jacobi angles
1170 // in the last sweep
1171 //
1172 return info;
1173 }
1174
1175 //== interface for native lapack ===============================================
1176 #ifdef CHECK_CXXLAPACK
1177
1178 template <typename MA, typename VSVA, typename MV, typename VWORK>
1179 typename GeMatrix<MA>::IndexType
1180 svj_native(SVJ::TypeA typeA,
1181 SVJ::JobU jobU,
1182 SVJ::JobV jobV,
1183 GeMatrix<MA> &A,
1184 DenseVector<VSVA> &sva,
1185 GeMatrix<MV> &V,
1186 DenseVector<VWORK> &work)
1187 {
1188 typedef typename GeMatrix<MA>::ElementType T;
1189
1190 const char JOBA = char(typeA);
1191 const char JOBU = char(jobU);
1192 const char JOBV = char(jobV);
1193 const INTEGER M = A.numRows();
1194 const INTEGER N = A.numCols();
1195 const INTEGER LDA = A.leadingDimension();
1196 const INTEGER _MV = V.numRows();
1197 const INTEGER LDV = V.leadingDimension();
1198 const INTEGER LWORK = work.length();
1199 INTEGER INFO;
1200
1201 if (IsSame<T,DOUBLE>::value) {
1202 LAPACK_IMPL(dgesvj)(&JOBA,
1203 &JOBU,
1204 &JOBV,
1205 &M,
1206 &N,
1207 A.data(),
1208 &LDA,
1209 sva.data(),
1210 &_MV,
1211 V.data(),
1212 &LDV,
1213 work.data(),
1214 &LWORK,
1215 &INFO);
1216 } else {
1217 ASSERT(0);
1218 }
1219 return INFO;
1220 }
1221
1222 #endif // CHECK_CXXLAPACK
1223
1224 //== public interface ==========================================================
1225 template <typename MA, typename VSVA, typename MV, typename VWORK>
1226 typename GeMatrix<MA>::IndexType
1227 svj_(SVJ::TypeA typeA,
1228 SVJ::JobU jobU,
1229 SVJ::JobV jobV,
1230 GeMatrix<MA> &A,
1231 DenseVector<VSVA> &sva,
1232 GeMatrix<MV> &V,
1233 DenseVector<VWORK> &work)
1234 {
1235 using std::max;
1236 using std::min;
1237 //
1238 // Test the input parameters
1239 //
1240 # ifndef NDEBUG
1241 typedef typename GeMatrix<MA>::IndexType IndexType;
1242
1243 ASSERT(A.firstRow()==1);
1244 ASSERT(A.firstCol()==1);
1245
1246 const IndexType m = A.numRows();
1247 const IndexType n = A.numCols();
1248
1249 ASSERT(m>=n);
1250
1251 ASSERT(sva.firstIndex()==1);
1252 ASSERT(sva.length()==n);
1253
1254 ASSERT(V.firstRow()==1);
1255 ASSERT(V.firstCol()==1);
1256
1257 if (jobV==SVJ::ComputeV) {
1258 ASSERT(V.numCols()==n);
1259 ASSERT(V.numRows()==n);
1260 }
1261 if (jobV==SVJ::ApplyV) {
1262 ASSERT(V.numCols()==n);
1263 }
1264
1265 if (work.length()>0) {
1266 ASSERT(work.length()>=max(IndexType(6),m+n));
1267 }
1268 # endif
1269
1270 //
1271 // Make copies of output arguments
1272 //
1273 # ifdef CHECK_CXXLAPACK
1274 typename GeMatrix<MA>::NoView A_org = A;
1275 typename DenseVector<VSVA>::NoView sva_org = sva;
1276 typename GeMatrix<MV>::NoView V_org = V;
1277 typename DenseVector<VWORK>::NoView work_org = work;
1278 # endif
1279
1280 //
1281 // Call implementation
1282 //
1283 IndexType info = svj_generic(typeA, jobU, jobV, A, sva, V, work);
1284
1285 # ifdef CHECK_CXXLAPACK
1286 //
1287 // Make copies of results computed by the generic implementation
1288 //
1289 typename GeMatrix<MA>::NoView A_generic = A;
1290 typename DenseVector<VSVA>::NoView sva_generic = sva;
1291 typename GeMatrix<MV>::NoView V_generic = V;
1292 typename DenseVector<VWORK>::NoView work_generic = work;
1293 //
1294 // restore output arguments
1295 //
1296 A = A_org;
1297 sva = sva_org;
1298 V = V_org;
1299 work = work_org;
1300 //
1301 // Compare generic results with results from the native implementation
1302 //
1303 IndexType _info = svj_native(typeA, jobU, jobV, A, sva, V, work);
1304
1305 bool failed = false;
1306 if (! isIdentical(A_generic, A, "A_generic", "A")) {
1307 std::cerr << "CXXLAPACK: A_generic = " << A_generic << std::endl;
1308 std::cerr << "F77LAPACK: A = " << A << std::endl;
1309 failed = true;
1310 }
1311 if (! isIdentical(sva_generic, sva, "sva_generic", "sva")) {
1312 std::cerr << "CXXLAPACK: sva_generic = " << sva_generic << std::endl;
1313 std::cerr << "F77LAPACK: sva = " << sva << std::endl;
1314 failed = true;
1315 }
1316 if (! isIdentical(V_generic, V, "V_generic", "V")) {
1317 std::cerr << "CXXLAPACK: V_generic = " << V_generic << std::endl;
1318 std::cerr << "F77LAPACK: V = " << V << std::endl;
1319 failed = true;
1320 }
1321 if (! isIdentical(work_generic, work, "work_generic", "work")) {
1322 std::cerr << "CXXLAPACK: work_generic = " << work_generic << std::endl;
1323 std::cerr << "F77LAPACK: work = " << work << std::endl;
1324 failed = true;
1325 }
1326 if (! isIdentical(info, _info, "info", "_info")) {
1327 std::cerr << "CXXLAPACK: info = " << info << std::endl;
1328 std::cerr << "F77LAPACK: _info = " << _info << std::endl;
1329 failed = true;
1330 }
1331
1332 if (failed) {
1333 std::cerr << "error in: svj.tcc ("
1334 << ", m = " << m
1335 << ", n = " << n
1336 << ", typeA = " << char(typeA)
1337 << ", jobU = " << char(jobU)
1338 << ", jobV = " << char(jobV)
1339 << ", info = " << info
1340 << ") " << std::endl;
1341 ASSERT(0);
1342 } else {
1343 /*
1344 std::cerr << "passed: svj.tcc ("
1345 << ", m = " << m
1346 << ", n = " << n
1347 << ", typeA = " << char(typeA)
1348 << ", jobU = " << char(jobU)
1349 << ", jobV = " << char(jobV)
1350 << ", info = " << info
1351 << ") " << std::endl;
1352 */
1353 }
1354 # endif
1355
1356 return info;
1357 }
1358
1359 template <typename MA, typename VSVA, typename MV, typename VWORK>
1360 typename GeMatrix<MA>::IndexType
1361 svj(SVJ::TypeA typeA,
1362 SVJ::JobU jobU,
1363 SVJ::JobV jobV,
1364 GeMatrix<MA> &A,
1365 DenseVector<VSVA> &sva,
1366 GeMatrix<MV> &V,
1367 DenseVector<VWORK> &work)
1368 {
1369 # ifdef CHECK_CXXLAPACK
1370 typename GeMatrix<MA>::NoView A_org = A;
1371 typename DenseVector<VSVA>::NoView sva_org = sva;
1372 typename GeMatrix<MV>::NoView V_org = V;
1373 typename DenseVector<VSVA>::NoView work_org = work;
1374
1375 svj_(SVJ::Lower, jobU, jobV, A, sva, V, work);
1376
1377 A = A_org;
1378 sva = sva_org;
1379 V = V_org;
1380 work = work_org;
1381
1382 svj_(SVJ::Upper, jobU, jobV, A, sva, V, work);
1383
1384 A = A_org;
1385 sva = sva_org;
1386 V = V_org;
1387 work = work_org;
1388 svj_(SVJ::General, jobU, jobV, A, sva, V, work);
1389
1390 A = A_org;
1391 sva = sva_org;
1392 V = V_org;
1393 work = work_org;
1394 # endif
1395
1396 return svj_(typeA, jobU, jobV, A, sva, V, work);
1397 }
1398
1399
1400 template <typename MA, typename VSVA, typename MV, typename VWORK>
1401 typename GeMatrix<MA>::IndexType
1402 svj(SVJ::JobU jobU,
1403 SVJ::JobV jobV,
1404 GeMatrix<MA> &A,
1405 DenseVector<VSVA> &sva,
1406 GeMatrix<MV> &V,
1407 DenseVector<VWORK> &work)
1408 {
1409 return svj(SVJ::General, jobU, jobV, A, sva, V, work);
1410 }
1411
1412 template <typename MA, typename VSVA, typename MV, typename VWORK>
1413 typename TrMatrix<MA>::IndexType
1414 svj(SVJ::JobU jobU,
1415 SVJ::JobV jobV,
1416 TrMatrix<MA> &A,
1417 DenseVector<VSVA> &sva,
1418 GeMatrix<MV> &V,
1419 DenseVector<VWORK> &work)
1420 {
1421 SVJ::TypeA upLo = (A.upLo()==Upper) ? SVJ::Upper : SVJ::Lower;
1422
1423 return svj(upLo, jobU, jobV, A.general(), sva, V, work);
1424 }
1425
1426 //-- forwarding ----------------------------------------------------------------
1427 template <typename MA, typename VSVA, typename MV, typename VWORK>
1428 typename MA::IndexType
1429 svj(SVJ::TypeA typeA,
1430 SVJ::JobU jobU,
1431 SVJ::JobV jobV,
1432 MA &&A,
1433 VSVA &&sva,
1434 MV &&V,
1435 VWORK &&work)
1436 {
1437 typename MA::IndexType info;
1438
1439 CHECKPOINT_ENTER;
1440 info = svj(typeA, jobU, jobV, A, sva, V, work);
1441 CHECKPOINT_LEAVE;
1442
1443 return info;
1444 }
1445
1446 template <typename MA, typename VSVA, typename MV, typename VWORK>
1447 typename MA::IndexType
1448 svj(SVJ::JobU jobU,
1449 SVJ::JobV jobV,
1450 MA &&A,
1451 VSVA &&sva,
1452 MV &&V,
1453 VWORK &&work)
1454 {
1455 typename MA::IndexType info;
1456
1457 CHECKPOINT_ENTER;
1458 info = svj(jobU, jobV, A, sva, V, work);
1459 CHECKPOINT_LEAVE;
1460
1461 return info;
1462 }
1463
1464 } } // namespace lapack, flens
1465
1466 #endif // FLENS_LAPACK_SVD_SVJ_TCC