1 /*
2 * Copyright (c) 2012, Michael Lehn
3 *
4 * All rights reserved.
5 *
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions
8 * are met:
9 *
10 * 1) Redistributions of source code must retain the above copyright
11 * notice, this list of conditions and the following disclaimer.
12 * 2) Redistributions in binary form must reproduce the above copyright
13 * notice, this list of conditions and the following disclaimer in
14 * the documentation and/or other materials provided with the
15 * distribution.
16 * 3) Neither the name of the FLENS development group nor the names of
17 * its contributors may be used to endorse or promote products derived
18 * from this software without specific prior written permission.
19 *
20 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
21 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
22 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
23 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
24 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
25 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
26 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
27 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
28 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
29 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
30 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
31 */
32
33 /* Based on
34 *
35 SUBROUTINE DGEJSV( JOBA, JOBU, JOBV, JOBR, JOBT, JOBP,
36 $ M, N, A, LDA, SVA, U, LDU, V, LDV,
37 $ WORK, LWORK, IWORK, INFO )
38 *
39 * -- LAPACK routine (version 3.3.1) --
40 *
41 * -- Contributed by Zlatko Drmac of the University of Zagreb and --
42 * -- Kresimir Veselic of the Fernuniversitaet Hagen --
43 * -- April 2011 --
44 *
45 * -- LAPACK is a software package provided by Univ. of Tennessee, --
46 * -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
47 *
48 * This routine is also part of SIGMA (version 1.23, October 23. 2008.)
49 * SIGMA is a library of algorithms for highly accurate algorithms for
50 * computation of SVD, PSVD, QSVD, (H,K)-SVD, and for solution of the
51 * eigenvalue problems Hx = lambda M x, H M x = lambda x with H, M > 0.
52 *
53 */
54
55 #ifndef FLENS_LAPACK_SVD_JSV_TCC
56 #define FLENS_LAPACK_SVD_JSV_TCC 1
57
58 #include <flens/blas/blas.h>
59 #include <flens/lapack/lapack.h>
60
61 namespace flens { namespace lapack {
62
63 //== generic lapack implementation =============================================
64 /*
65 template <typename MA, typename VSVA, typename MU, typename MV,
66 typename VWORK, typename VIWORK>
67 typename GeMatrix<MA>::IndexType
68 jsv_generic(JSV::Accuracy accuracy,
69 JSV::JobU jobU,
70 JSV::JobV jobV,
71 bool restrictedRange,
72 bool considerTransA,
73 bool perturb,
74 GeMatrix<MA> &A,
75 DenseVector<VSVA> &sva,
76 GeMatrix<MU> &U,
77 GeMatrix<MV> &V,
78 DenseVector<VWORK> &work,
79 DenseVector<VIWORK> &iwork)
80 {
81 using std::abs;
82 using std::max;
83 using std::min;
84 using std::sqrt;
85
86 typedef typename GeMatrix<MA>::ElementType ElementType;
87 typedef typename GeMatrix<MA>::IndexType IndexType;
88
89 const ElementType Zero(0), One(1);
90
91 const Underscore<IndexType> _;
92
93 const bool lsvec = (jobU==ComputeU) || (jobU==FullsetU);
94 const bool jracc = (jobV=='J');
95 const bool rsvec = (jobV=='V') || jracc;
96 const bool rowpiv = (jobA=='F') || (jobA=='G');
97 const bool l2rank = (jobA=='R');
98 const bool l2aber = (jobA=='A');
99 const bool errest = (jobA=='E') || (jobA=='G');
100 const bool l2tran = (jobT=='T');
101 const bool l2kill = (jobR=='R');
102 const bool defr = (jobR=='N');
103 const bool l2pert = (jobP=='P');
104
105 IndexType info = 0;
106
107 //
108 // Quick return for void matrix (Y3K safe)
109 //#:)
110 if (m==0 || n==0) {
111 return info;
112 }
113 //
114 // Set numerical parameters
115 //
116 //! NOTE: Make sure DLAMCH() does not fail on the target architecture.
117 //
118 const ElementType eps = lamch<ElementType>(Eps);
119 const ElementType safeMin = lamch<ElementType>(SafeMin);
120 const ElementType small = safeMin / eps;
121 const ElementType big = lamch<ElementType>(OverflowThreshold);
122 //
123 // Initialize SVA(1:N) = diag( ||A e_i||_2 )_1^N
124 //
125 //! If necessary, scale SVA() to protect the largest norm from
126 // overflow. It is possible that this scaling pushes the smallest
127 // column norm left from the underflow threshold (extreme case).
128 //
129 ElementType scaleM = One / sqrt(ElementType(m)*ElementType(n));
130 bool noScale = true;
131 bool goScale = true;
132 for (IndexType p=1; p<=n; ++p) {
133 aapp = Zero;
134 aaqq = One;
135 lassq(A(_,p), aapp, aaqq);
136 if (aapp>big) {
137 return -9;
138 }
139 aaqq = sqrt(aaqq);
140 if (aapp<(big/aaqq) && noscal) {
141 sva(p) = aapp * aaqq;
142 } else {
143 noscal = false;
144 sva(p) = aapp * ( aaqq * scalem )
145 if (goScale) {
146 goScale = false;
147 sva(_(1,p)) *= scaleM;
148 }
149 }
150 }
151
152 if (noScale) {
153 scaleM = One;
154 }
155
156 aapp = Zero;
157 aaqq = big;
158 for (IndexType p=1; p<=n) {
159 aapp = max(aapp, sva(p));
160 if (sva(p)!=Zero) {
161 aaqq = min(aaqq, sva(p));
162 }
163 }
164 //
165 // Quick return for zero M x N matrix
166 //#:)
167 if (aapp==Zero) {
168 if (lsvec) {
169 U = Zero;
170 U.diag(0) = One;
171 }
172 if (rsvec) {
173 V = Zero;
174 V.diag(0) = One;
175 }
176 work(1) = One;
177 work(2) = One;
178 if (errest) {
179 work(3) = One;
180 }
181 if (lsvec && rsvec) {
182 work(4) = One;
183 work(5) = One;
184 }
185 if (l2tran) {
186 work(6) = Zero;
187 work(7) = Zero;
188 }
189 iwork(1) = 0;
190 iwork(2) = 0;
191 iwork(3) = 0;
192 return info;
193 }
194 //
195 // Issue warning if denormalized column norms detected. Override the
196 // high relative accuracy request. Issue licence to kill columns
197 // (set them to zero) whose norm is less than sigma_max / BIG (roughly).
198 //#:(
199 warning = 0
200 if (aaqq<=SFMIN) {
201 l2rank = true;
202 l2kill = true;
203 warning = 1;
204 }
205 //
206 // Quick return for one-column matrix
207 //#:)
208 auto U1 = U(_,_(1,n));
209 if (n==1) {
210
211 if (lsvec) {
212 lascl(LASCL::FullMatrix, 0, 0, sva(1), scaleM, A);
213 U1 = A;
214 // computing all M left singular vectors of the M x 1 matrix
215 if (nu!=n) {
216 auto tau = work(_(1,n));
217 auto _work = work(_(n+1,lWork));
218 qrf(U1, tau, _work);
219 orgqr(1, U, tau, _work);
220 U1 = A;
221 }
222 }
223 if (rsvec) {
224 V(1,1) = One
225 }
226 if (sva(1)<big*scaleM) {
227 sva(1) /= scaleM;
228 scalem = One;
229 }
230 work(1) = One / scaleM;
231 work(2) = One;
232 if (sva(1)!=Zero) {
233 iwork(1) = 1;
234 if (sva(1)/scaleM>=safeMin) {
235 iwork(2) = 1;
236 } else {
237 iwork(2) = 0;
238 }
239 } else {
240 iwork(1) = 0;
241 iwork(2) = 0;
242 }
243 if (errest) {
244 work(3) = One;
245 }
246 if (lsvec && rsvec) {
247 work(4) = One;
248 work(5) = One;
249 }
250 if (l2tran) {
251 work(6) = Zero;
252 work(7) = Zero;
253 }
254 return info;
255
256 }
257
258 bool transp = false;
259 bool l2tran = l2tran && m==n;
260
261 ElementType aatMax = -One;
262 ElementType aatMin = big;
263 if (rowpiv || l2tran) {
264 //
265 // Compute the row norms, needed to determine row pivoting sequence
266 // (in the case of heavily row weighted A, row pivoting is strongly
267 // advised) and to collect information needed to compare the
268 // structures of A * A^t and A^t * A (in the case L2TRAN.EQ.true).
269 //
270 if (l2tran) {
271 for (IndexType p=1; p<=m; ++p) {
272 xsc = Zero;
273 tmp = One;
274 lassq(A(p,_), xsc, tmp);
275 // DLASSQ gets both the ell_2 and the ell_infinity norm
276 // in one pass through the vector
277 work(m+n+p) = xsc * scaleM;
278 work(n+p) = xsc * (scaleN*sqrt(tmp));
279 aatMax = max(aatMax, work(n+p));
280 if (work(n+p)!=Zero) {
281 aatMin = min(aatMin, work(n+p));
282 }
283 }
284 } else {
285 for (IndexType p=1; p<=m; ++p) {
286 const IndexType jp = blas::iamax(A(p,_));
287 work(m+n+p) = scaleM*abs(A(p,jp));
288 aatMax = max(aatMax, work(m+n+p));
289 aatMin = min(aatMin, work(m+n+p));
290 }
291 }
292
293 }
294 //
295 // For square matrix A try to determine whether A^t would be better
296 // input for the preconditioned Jacobi SVD, with faster convergence.
297 // The decision is based on an O(N) function of the vector of column
298 // and row norms of A, based on the Shannon entropy. This should give
299 // the right choice in most cases when the difference actually matters.
300 // It may fail and pick the slower converging side.
301 //
302 entra = Zero
303 entrat = Zero
304 if (l2tran) {
305
306 xsc = Zero;
307 tmp = One;
308 lassq(sva, xsc, tmp);
309 tmp = One / tmp;
310
311 entra = Zero
312 for (IndexType p=1; p<=n; ++p) {
313 const ElementType _big = pow(sva(p)/xsc, 2) * tmp;
314 if (_big!=Zero) {
315 entra += _big * log(_big);
316 }
317 }
318 entra = - entra / log(ElementType(n));
319 //
320 // Now, SVA().^2/Trace(A^t * A) is a point in the probability simplex.
321 // It is derived from the diagonal of A^t * A. Do the same with the
322 // diagonal of A * A^t, compute the entropy of the corresponding
323 // probability distribution. Note that A * A^t and A^t * A have the
324 // same trace.
325 //
326 entrat = Zero
327 for (IndexType p=n+1; p<=n+m; ++p) {
328 const ElementType _big = pow(work(p)/xsc, 2) * tmp;
329 if (_big!=Zero) {
330 entrat += _big * log(_big);
331 }
332 }
333 entrat = -entrat / log(ElementType(m));
334 //
335 // Analyze the entropies and decide A or A^t. Smaller entropy
336 // usually means better input for the algorithm.
337 //
338 transp = entrat<entra;
339 //
340 // If A^t is better than A, transpose A.
341 //
342 if (transp) {
343 // In an optimal implementation, this trivial transpose
344 // should be replaced with faster transpose.
345 // TODO: in-place transpose:
346 // transpose(A);
347 for (IndexType p=1; p<=n-1; ++p) {
348 for (IndexType q=p+1; q<=n; ++q) {
349 swap(A(q,p), A(p,q));
350 }
351 }
352 for (IndexType p=1; p<=n; ++p) {
353 work(m+n+p) = sva(p);
354 sva(p) = work(n+p);
355 }
356 swap(aapp, aatMax);
357 swap(aaqq, aatMin);
358 swap(lsvec, rsvec);
359 if (lsvec) {
360 // Lehn: transposing A is only considered if A is square,
361 // so in this case m==n and therefore nu==n. Or am I
362 // wrong??
363 ASSERT(nu==n);
364 }
365
366 rowpiv = true;
367 }
368
369 }
370 //
371 // Scale the matrix so that its maximal singular value remains less
372 // than DSQRT(BIG) -- the matrix is scaled so that its maximal column
373 // has Euclidean norm equal to DSQRT(BIG/N). The only reason to keep
374 // DSQRT(BIG) instead of BIG is the fact that DGEJSV uses LAPACK and
375 // BLAS routines that, in some implementations, are not capable of
376 // working in the full interval [SFMIN,BIG] and that they may provoke
377 // overflows in the intermediate results. If the singular values spread
378 // from SFMIN to BIG, then DGESVJ will compute them. So, in that case,
379 // one should use DGESVJ instead of DGEJSV.
380 //
381 const ElementType bigRoot = sqrt(big);
382 tmp = sqrt(big/ElementType(n));
383
384 lascl(LASCL::FullMatrix, 0, 0, aapp, tmp, sva);
385 if (aaqq>aapp*safeMin) {
386 aaqq = (aaqq/aapp) * tmp;
387 } else {
388 aaqq = (aaqq*tmp) / aapp;
389 }
390 tmp *= scaleM;
391 lascl(LASCL::FullMatrix, 0, 0, aapp, tmp, A);
392 //
393 // To undo scaling at the end of this procedure, multiply the
394 // computed singular values with USCAL2 / USCAL1.
395 //
396 uScale1 = tmp;
397 uScale2 = aapp;
398
399 if (l2kill) {
400 // L2KILL enforces computation of nonzero singular values in
401 // the restricted range of condition number of the initial A,
402 // sigma_max(A) / sigma_min(A) approx. DSQRT(BIG)/DSQRT(SFMIN).
403 xsc = sqrt(safeMin);
404 } else {
405 xsc = small;
406 //
407 // Now, if the condition number of A is too big,
408 // sigma_max(A) / sigma_min(A)>DSQRT(BIG/N) * EPSLN / SFMIN,
409 // as a precaution measure, the full SVD is computed using DGESVJ
410 // with accumulated Jacobi rotations. This provides numerically
411 // more robust computation, at the cost of slightly increased run
412 // time. Depending on the concrete implementation of BLAS and LAPACK
413 // (i.e. how they behave in presence of extreme ill-conditioning) the
414 // implementor may decide to remove this switch.
415 if (aaqq<sqrt(safeMin) && lsvec && rsvec ) {
416 jracc = true;
417 }
418
419 }
420 if (aaqq<xsc) {
421 for (IndexType p=1; p<=n; ++p) {
422 if (sva(p)<xsc) {
423 A(_,p) = Zero;
424 sva(p) = Zero;
425 }
426 }
427 }
428 //
429 // Preconditioning using QR factorization with pivoting
430 //
431 if (rowpiv) {
432 // Optional row permutation (Bjoerck row pivoting):
433 // A result by Cox and Higham shows that the Bjoerck's
434 // row pivoting combined with standard column pivoting
435 // has similar effect as Powell-Reid complete pivoting.
436 // The ell-infinity norms of A are made nonincreasing.
437 for (IndexType p=1; p<=m-1; ++p) {
438 const IndexType q = blas::iamax(work(_(m+n+p,2*m+n))) + p - 1;
439 iwork(2*n+p) = q;
440 if (p!=q) {
441 swap(work(m+n+p), work(m+n+q));
442 }
443 }
444 laswp(A, iwork(_(2*n+1, 2*n+m-1)));
445 }
446 //
447 // End of the preparation phase (scaling, optional sorting and
448 // transposing, optional flushing of small columns).
449 //
450 // Preconditioning
451 //
452 // If the full SVD is needed, the right singular vectors are computed
453 // from a matrix equation, and for that we need theoretical analysis
454 // of the Businger-Golub pivoting. So we use DGEQP3 as the first RR QRF.
455 // In all other cases the first RR QRF can be chosen by other criteria
456 // (eg speed by replacing global with restricted window pivoting, such
457 // as in SGEQPX from TOMS # 782). Good results will be obtained using
458 // SGEQPX with properly (!) chosen numerical parameters.
459 // Any improvement of DGEQP3 improves overal performance of DGEJSV.
460 //
461 // A * P1 = Q1 * [ R1^t 0]^t:
462 auto _tau = work(_(1,n));
463 auto _work = work(_(n+1,lWork));
464 auto _iwork = iwork(_(1,n));
465
466 // .. all columns are free columns
467 _iwork = 0;
468
469 qp3(A, _iwork, _tau, _work);
470 //
471 // The upper triangular matrix R1 from the first QRF is inspected for
472 // rank deficiency and possibilities for deflation, or possible
473 // ill-conditioning. Depending on the user specified flag L2RANK,
474 // the procedure explores possibilities to reduce the numerical
475 // rank by inspecting the computed upper triangular factor. If
476 // L2RANK or l2aber are up, then DGEJSV will compute the SVD of
477 // A + dA, where ||dA|| <= f(M,N)*EPSLN.
478 //
479 IndexType nr = 1;
480 if (l2aber) {
481 // Standard absolute error bound suffices. All sigma_i with
482 // sigma_i < N*EPSLN*||A|| are flushed to zero. This is an
483 // agressive enforcement of lower numerical rank by introducing a
484 // backward error of the order of N*EPSLN*||A||.
485 tmp = sqrt(ElementType(n))*eps;
486 for (IndexType p=2; p<=n; ++p) {
487 if (abs(A(p,p))>=tmp*abs(A(1,1))) {
488 ++nr;
489 } else {
490 break;
491 }
492 }
493 } else if (l2rank) {
494 // .. similarly as above, only slightly more gentle (less agressive).
495 // Sudden drop on the diagonal of R1 is used as the criterion for
496 // close-to-rank-defficient.
497 tmp = sqrt(safeMin);
498 for (IndexType p=2; p<=n; ++p) {
499 if (abs(A(p,p))<eps*abs(A(p-1,p-1)) || abs(A(p,p))<small
500 || (l2kill && abs(A(p,p))<tmp))
501 {
502 break;
503 }
504 ++nr;
505 }
506
507 } else {
508 // The goal is high relative accuracy. However, if the matrix
509 // has high scaled condition number the relative accuracy is in
510 // general not feasible. Later on, a condition number estimator
511 // will be deployed to estimate the scaled condition number.
512 // Here we just remove the underflowed part of the triangular
513 // factor. This prevents the situation in which the code is
514 // working hard to get the accuracy not warranted by the data.
515 tmp = sqrt(safeMin);
516 for (IndexType p=2; p<=n; ++p) {
517 if (abs(A(p,p))<small || (l2kill && abs(A(p,p))<tmp)) {
518 break;
519 }
520 ++nr;
521 }
522
523 }
524
525 bool almort = false;
526 if (nr==n) {
527 ElementType maxprj = One;
528 for (IndexType p=2; p<=n; ++p) {
529 maxprj = min(maxprj, abs(A(p,p))/sva(iwork(p)));
530 }
531 if (pow(maxprj,2)>=One-ElementType(n)*eps) {
532 almort = true;
533 }
534 }
535
536
537 sconda = -One;
538 condr1 = -One;
539 condr2 = -One;
540
541 if (errest) {
542 if (n==nr) {
543 if (rsvec) {
544 // .. V is available as workspace
545 V.upper() = A(_(1,n),_).upper();
546 for (IndexType p=1; p<=n; ++p) {
547 tmp = sva(iwork(p));
548 V(_(1,p),p) *= One/tmp;
549 }
550 auto _work = work(_(n+1, 4*n));
551 auto _iwork = iwork(_(2*n+m+1, 3*n+m));
552 pocon(V.upper(), One, tmp, _work, _iwork);
553 } else if (lsvec) {
554 // .. U is available as workspace
555 auto _U = U(_(1,n),_(1,n));
556 _U.upper() = A(_(1,n),_).upper();
557 for (IndexType p=1; p<=n; ++p) {
558 tmp = sva(iwork(p));
559 U(_(1,p),p) *= One/tmp;
560 }
561 auto _work = work(_(n+1, 4*n));
562 auto _iwork = iwork(_(2*n+m+1, 3*n+m));
563 pocon(_U.upper(), One, tmp, _work, _iwork);
564 } else {
565 auto _work1 = work(_(n+1,n+n*n));
566 auto _work2 = work(_(n+n*n+1,n+n*n+3*n));
567 auto _iwork = work(_(2*n+m+1,3*n+m));
568 GeMatrixView<ElementType> Work(n, n, _work1, n);
569
570 Work.upper() = A(_(1,n),_).upper();
571 for (IndexType p=1; p<=n; ++p) {
572 tmp = sva(iwork(p));
573 Work(_(1,p),p) *= One/tmp;
574 }
575 // .. the columns of R are scaled to have unit Euclidean lengths.
576 pocon(Work.upper(), One, tmp, _work2, _iwork);
577 }
578 sconda = One/sqrt(tmp);
579 // SCONDA is an estimate of DSQRT(||(R^t * R)^(-1)||_1).
580 // N^(-1/4) * SCONDA <= ||R^(-1)||_2 <= N^(1/4) * SCONDA
581 } else {
582 sconda = -One;
583 }
584 }
585
586 l2pert = l2pert && abs(A(1,1)/A(nr,nr))>sqrt(bigRoot);
587 // If there is no violent scaling, artificial perturbation is not needed.
588 //
589 // Phase 3:
590 //
591 if (!(rsvec || lsvec)) {
592 //
593 // Singular Values only
594 //
595 // .. transpose A(1:NR,1:N)
596 for (IndexType p=1; p<=min(n-1, nr); ++p) {
597 A(p,_(p+1,n)) = A(_(p+1,n),p);
598 }
599 //
600 // The following two DO-loops introduce small relative perturbation
601 // into the strict upper triangle of the lower triangular matrix.
602 // Small entries below the main diagonal are also changed.
603 // This modification is useful if the computing environment does not
604 // provide/allow FLUSH TO Zero underflow, for it prevents many
605 // annoying denormalized numbers in case of strongly scaled matrices.
606 // The perturbation is structured so that it does not introduce any
607 // new perturbation of the singular values, and it does not destroy
608 // the job done by the preconditioner.
609 // The licence for this perturbation is in the variable L2PERT, which
610 // should be false if FLUSH TO Zero underflow is active.
611 //
612 if (! almort) {
613
614 if (l2pert) {
615 // XSC = DSQRT(SMALL)
616 xsc = eps / ElementType(n);
617 for (IndexType q=1; q<=nr; ++q) {
618 tmp = xsc*abs(A(q,q));
619 for (IndexType p=1; p<=n, ++p) {
620 if ((p>q && abs(A(p,q))<=tmp) || p<q) {
621 A(p,q) = sign(tmp, A(p,q));
622 }
623 }
624 }
625 } else {
626 A(_(1,nr),_(1,nr)).strictUpper() = Zero;
627 }
628 //
629 // .. second preconditioning using the QR factorization
630 //
631 auto _A = A(_(1,n),_(1,nr));
632 auto _tau = work(_(1,nr));
633 auto _work = work(_(n+1,lWork));
634 qrf(_A, _tau, _work);
635 //
636 // .. and transpose upper to lower triangular
637 for (IndexType p=1; p<=nr-1; ++p) {
638 A(p,_(p+1,nr)) = A(_(p+1,nr),p);
639 }
640
641 }
642 //
643 // Row-cyclic Jacobi SVD algorithm with column pivoting
644 //
645 // .. again some perturbation (a "background noise") is added
646 // to drown denormals
647 if (l2pert) {
648 // XSC = DSQRT(SMALL)
649 xsc = eps / ElementType(n);
650 for (IndexType q=1; q<=nr; ++q) {
651 ElementType tmp = xsc*abs(A(q,q));
652 for (IndexType p=1; p<=nr; ++p) {
653 if (((p>q) && abs(A(p,q))<=tmp) || p<q) {
654 A(p,q) = sign(tmp, A(p,q));
655 }
656 }
657 }
658 } else {
659 A(_(1,nr),_(1,nr)).strictUpper() = Zero;
660 }
661 //
662 // .. and one-sided Jacobi rotations are started on a lower
663 // triangular matrix (plus perturbation which is ignored in
664 // the part which destroys triangular form (confusing?!))
665 //
666 auto _A = A(_(1,nr),_(1,nr));
667 auto _sva = sva(_(1,nr));
668
669 svj(SVJ::Lower, SVJ::NoU, SVJ::NoV, _A, _sva, V, work);
670
671 scaleM = work(1);
672 numRank = nint(work(2));
673
674
675 } else if (rsvec && !lsvec) {
676 //
677 // -> Singular Values and Right Singular Vectors <-
678 //
679 if (almort) {
680 //
681 // .. in this case NR equals N
682 ASSERT(nr==n);
683 for (IndexType p=1; p<=nr; ++p) {
684 V(_(p,n),p) = A(p,_(p,n));
685 }
686
687 auto _V = V(_,_(1,nr));
688 _V.strictUpper() = Zero;
689 svj(SVJ::Lower, SVJ::ComputeU, SVJ::NoV, _V, sva, A, work);
690
691 scaleM = work(1);
692 numRank = nint(work(2));
693
694 } else {
695 //
696 // .. two more QR factorizations ( one QRF is not enough, two require
697 // accumulated product of Jacobi rotations, three are perfect )
698 //
699 A(_(1,nr),_(1,nr)).strictLower() = Zero;
700
701 auto _A = A(_(1,nr),_);
702 auto _tau1 = work(_(1,nr));
703 auto _work1 = work(_(n+1,lWork));
704 lqf(_A, _tau1, _work1);
705
706 auto _V = V(_(1,nr),_(1,nr))
707 _V.lower() = _A(_,_(1,nr)).lower();
708 _V.strictUpper() = Zero;
709
710 auto _tau2 = work(_(n+1,n+nr));
711 auto _work2 = work(_(2*n+1,lWork));
712 qrf(_V, _tau2, _work2);
713
714 for (IndexType p=1; p<=nr; ++p) {
715 V(_(p,nr),p) = V(p,_(p,nr));
716 }
717 _V.strictUpper() = Zero;
718
719 auto _sva = sva(_(1,nr));
720 svj(SVJ::Lower, SVJ::ComputeU, SVJ::NoV, _V, _sva, U, _work1);
721
722 scaleM = work(n+1);
723 numRank = nint(work(n+2));
724 if (nr<n) {
725 V(_(nr+1,n),_(1,nr)) = Zero;
726 V(_(1,nr),_(nr+1,n)) = Zero;
727
728 V(_(nr+1,n),_(nr+1,n)) = Zero;
729 V(_(nr+1,n),_(nr+1,n)).diag(0) = One;
730 }
731
732 ormlq(Left, Trans, _A, _tau1, V, _work1);
733
734 }
735
736 for (IndexType p=1; p<=n; ++p) {
737 A(iwork(p),_) = V(p,_);
738 }
739 V = A;
740
741 if (transp) {
742 U = V;
743 }
744
745 } else if (lsvec && !rsvec) {
746 //
747 // .. Singular Values and Left Singular Vectors ..
748 //
749 // .. second preconditioning step to avoid need to accumulate
750 // Jacobi rotations in the Jacobi iterations.
751
752 auto _U = U(_(1,nr),_(1,nr));
753 auto _tau = work(_(n+1,n+nr));
754 auto _sva = sva(_(1,nr));
755 auto _work1 = work(_(n+1,lWork));
756 auto _work2 = work(_(2*n+1,lWork));
757
758 for (IndexType p=1; p<=nr; ++p) {
759 A(p,_(p,nr)) = U(_(p,nr),p);
760 }
761 _U.strictUpper() = Zero;
762
763 qrf(U(_(1,n),_(1,nr)), _tau, _work2);
764
765 for (IndexType p=1; p<=nr-1; ++p) {
766 U(p,_(p+1,nr)) = U(_(p+1,nr),p);
767 }
768 _U.strictUpper() = Zero;
769
770 svj(SVJ::Lower, SVJ::ComputeU, SVJ::NoV, _U, _sva, A, _work1);
771 scaleM = work(n+1);
772 numRank = nint(work(n+2));
773
774 if (nr<m) {
775 U(_(nr+1,m),_(1,nr)) = Zero;
776 if (nr<nu) {
777 U(_(1,nr),_(nr+1,nu)) = Zero;
778
779 U(_(nr+1,m),_(nr+1,nu)) = Zero;
780 U(_(nr+1,m),_(nr+1,nu)).diag(0) = One;
781 }
782 }
783
784 ormqr(Left, NoTrans, A, work, U);
785
786 if (rowpiv) {
787 auto piv = iwork(_(2*n+1,2*n+m-1));
788 laswp(U, piv.reverse());
789 }
790
791 for (IndexType p=1; p<=nu; ++p) {
792 xsc = One / blas::nrm2(U(_,p));
793 U(_,p) *= xsc;
794 }
795
796 if (transp) {
797 V = U;
798 }
799 //
800 } else {
801 //
802 // .. Full SVD ..
803 //
804 if (!jracc) {
805
806 if (!almort) {
807 //
808 // Second Preconditioning Step (QRF [with pivoting])
809 // Note that the composition of TRANSPOSE, QRF and TRANSPOSE is
810 // equivalent to an LQF CALL. Since in many libraries the QRF
811 // seems to be better optimized than the LQF, we do explicit
812 // transpose and use the QRF. This is subject to changes in an
813 // optimized implementation of DGEJSV.
814 //
815 for (IndexType p=1; p<=nr; ++p) {
816 V(_(p,n),p) = A(p,_(p,n));
817 }
818 //
819 // .. the following two loops perturb small entries to avoid
820 // denormals in the second QR factorization, where they are
821 // as good as zeros. This is done to avoid painfully slow
822 // computation with denormals. The relative size of the
823 // perturbation is a parameter that can be changed by the
824 // implementer. This perturbation device will be obsolete on
825 // machines with properly implemented arithmetic.
826 // To switch it off, set L2PERT=false To remove it from the
827 // code, remove the action under L2PERT=true, leave the ELSE
828 // part. The following two loops should be blocked and fused with
829 // the transposed copy above.
830 //
831 if (l2pert) {
832 xsc = sqrt(small);
833 for (IndexType q=1; q<=nr; ++q) {
834 tmp = xsc*abs(V(q,q));
835 for (IndexType p=1; p<=n; ++p) {
836 if (p>q && abs(V(p,q))<=tmp || p<q) {
837 V(p,q) = sign(tmp, V(p,q));
838 }
839 if (p<q) {
840 V(p,q) = -V(p,q);
841 }
842 }
843 }
844 } else {
845 V(_(1,nr),_(1,nr)).strictUpper() = Zero;
846 }
847 //
848 // Estimate the row scaled condition number of R1
849 // (If R1 is rectangular, N > NR, then the condition number
850 // of the leading NR x NR submatrix is estimated.)
851 //
852 auto _work1 = work(_(2*n+1,2*n+nr*nr));
853 auto _work2 = work(_(2*n+nr*nr+1, 2*n+nr*nr+3*nr));
854 auto _iwork = iwork(_(m+2*n+1,m+2*n+nr));
855 GeMatrixView<ElementType> Work(nr, nr, _work1, nr);
856
857 Work.lower() = V.lower();
858 for (IndexType p=1; p<=nr; ++p) {
859 tmp = blas::nrm2(Work(_(p,nr),p));
860 Work(_(p,nr),p) *= One/tmp;
861 }
862 pocon(Work, One, tmp, _work2, _iwork);
863 condr1 = One / sqrt(tmp);
864 // .. here need a second oppinion on the condition number
865 // .. then assume worst case scenario
866 // R1 is OK for inverse <=> condr1 < DBLE(N)
867 // more conservative <=> condr1 < DSQRT(DBLE(N))
868 //
869 cond_ok = sqrt(ElementType(nr));
870 //[TP] COND_OK is a tuning parameter.
871
872 if (condr1<cond_ok) {
873 // .. the second QRF without pivoting. Note: in an optimized
874 // implementation, this QRF should be implemented as the QRF
875 // of a lower triangular matrix.
876 // R1^t = Q2 * R2
877 auto tau = work(_(n+1,n+nr));
878 auto _work = work(_(2*n+1,lWork));
879
880 qrf(V(_,_(1,nr)), tau, _work);
881
882 if (l2pert) {
883 xsc = sqrt(small) / Eps;
884 for (IndexType p=2; p<=nr; ++p) {
885 for (IndexType q=1; q<=p-1; ++q) {
886 tmp = xsc*min(abs(V(p,p)), abs(V(q,q)));
887 if (abs(V(q,p))<=tmp) {
888 V(q,p) = sign(tmp, V(q,p));
889 }
890 }
891 }
892 }
893 //
894 if (nr!=n) {
895 auto _work = work(_(2*n+1, 2*n+n*nr));
896 GeMatrixView<ElementType> Work(n, nr, _work, n);
897
898 Work = V(_,_(1,nr));
899 }
900 // .. save ...
901 //
902 // .. this transposed copy should be better than naive
903 // TODO: auto _V = V(_(1,nr),_(1,nr));
904 // _V.lower() = transpose(_V.upper());
905 //
906 for (IndexType p=1; p<=nr-1; ++p) {
907 V(_(p+1,nr),p) = V(p,_(p+1,nr));
908 }
909
910 condr2 = condr1;
911
912 } else {
913 //
914 // .. ill-conditioned case: second QRF with pivoting
915 // Note that windowed pivoting would be equaly good
916 // numerically, and more run-time efficient. So, in
917 // an optimal implementation, the next call to DGEQP3
918 // should be replaced with eg. CALL SGEQPX (ACM TOMS #782)
919 // with properly (carefully) chosen parameters.
920 //
921 // R1^t * P2 = Q2 * R2
922 auto _V = V(_,_(1,nr));
923 auto piv = iwork(_(n+1,n+nr));
924 auto tau = work(_(n+1,,n+nr));
925 auto _work = work(_(2*n+1, lWork));
926
927 piv = 0;
928 qp3(_V, piv, tau, _work);
929 if (l2pert) {
930 xsc = sqrt(small);
931 for (IndexType p=2; p<=nr; ++p) {
932 for (IndexType q=1; q<=p-1; ++q) {
933 tmp = xsc*min(abs(V(p,p)), abs(V(q,q)));
934 if (abs(V(q,p))<=tmp) {
935 V(q,p) = sign(tmp, V(q,p));
936 }
937 }
938 }
939 }
940
941 auto _work1 = work(_(2*n+1, 2*n+n*nr));
942 GeMatrixView<ElementType> Work1(n, nr, _work1, n);
943
944 Work1 = V(_,_(1,nr));
945
946 if (l2pert) {
947 xsc = sqrt(small);
948 for (IndexType p=2; p<=nr; ++p) {
949 for (IndexType q=1; q<=p-1; ++q) {
950 tmp = xsc * min(abs(V(p,p)), abs(V(q,q)));
951 V(p,q) = -sign(tmp, V(q,p));
952 }
953 }
954 } else {
955 V(_(1,nr),_(1,nr)).strictLower() = Zero;
956 }
957 // Now, compute R2 = L3 * Q3, the LQ factorization.
958 auto _V = V(_(1,nr),_(1,nr));
959 auto tau = work(_(2*n+n*nr+1,2*n+n*nr+nr));
960 auto _work2 = work(_(2*n+n*nr+nr+1,lWork));
961
962 lqf(_V, tau, _work2);
963 // .. and estimate the condition number
964 auto _work3 = work(_(2*n+n*nr+nr+1,2*n+n*nr+nr+nr*nr));
965 GeMatrixView<ElementType> Work3(nr, nr, _work3, nr);
966
967 Work3.lower() = _V.lower();
968
969 for (IndexType p=1; p<=nr; ++p) {
970 tmp = blas::nrm2(Work3(p,_(1,p)));
971 Work3(p,_(1,p)) *= One/tmp;
972 }
973 auto _work4 = work(_(2*n+n*nr+nr+nr*nr+1,
974 2*n+n*nr+nr+nr*nr+3*nr));
975 auto _iwork = iwork(_(m+2*n+1, m+2*n+nr));
976 pocon(Work3.lower(), One, tmp, _work4, _iwork);
977 condr2 = One / sqrt(tmp);
978
979 if (condr2>=cond_ok) {
980 // .. save the Householder vectors used for Q3
981 // (this overwrittes the copy of R2, as it will not be
982 // needed in this branch, but it does not overwritte the
983 // Huseholder vectors of Q2.).
984 Work1(_(1,nr),_(1,nr)).upper() = V.upper();
985 // .. and the rest of the information on Q3 is in
986 // WORK(2*N+N*NR+1:2*N+N*NR+N)
987 }
988
989 }
990
991 if (l2pert) {
992 xsc = sqrt(small);
993 for (IndexType q=2; q<=nr; ++q) {
994 tmp = xsc * V(q,q);
995 for (IndexType p=1; p<=q-1; ++p) {
996 // V(p,q) = - DSIGN( TEMP1, V(q,p) )
997 V(p,q) = -sign(tmp, V(p,q));
998 }
999 }
1000 } else {
1001 V(_(1,nr),_(1,nr)).strictLower();
1002 }
1003 //
1004 // Second preconditioning finished; continue with Jacobi SVD
1005 // The input matrix is lower triangular.
1006 //
1007 // Recover the right singular vectors as solution of a well
1008 // conditioned triangular matrix equation.
1009 //
1010 if (condr1<cond_ok) {
1011 auto _U = U(_(1,nr),_(1,nr));
1012 auto _V = V(_(1,nr),_(1,nr));
1013 auto _sva = sva(_(1,nr));
1014 auto _work = work(_(2*n+n*nr+nr+1,lWork));
1015
1016 svj(SVJ::Lower, SVJ::ComputeU, SVJ::NoV,
1017 _V, _sva, U, _work);
1018 scaleM = _work(1);
1019 numRank = nint(_work(2));
1020
1021 for (IndexType p=1; p<=nr; ++p) {
1022 _U(_,p) = _V(_,p);
1023 _V(_,p) *= sva(p);
1024 }
1025
1026 // .. pick the right matrix equation and solve it
1027 //
1028 if (nr==n) {
1029 //:)) .. best case, R1 is inverted. The solution of this
1030 // matrix equation is Q2*V2 = the product of the Jacobi
1031 // rotations used in DGESVJ, premultiplied with the
1032 // orthogonal matrix from the second QR factorization.
1033 const auto _A = A(_(1,nr),_(1,nr));
1034 blas::sm(Left, NoTrans, One, _A.upper(), _V);
1035 } else {
1036 // .. R1 is well conditioned, but non-square. Transpose(R2)
1037 // is inverted to get the product of the Jacobi rotations
1038 // used in DGESVJ. The Q-factor from the second QR
1039 // factorization is then built in explicitly.
1040
1041 auto _work = work(_(2*n+1, 2*n+n*nr));
1042 GeMatrixView<ElementType> Work1(nr, nr, _work, n);
1043 GeMatrixView<ElementType> Work(n, nr, _work, n);
1044
1045 blas::sm(Left, NoTrans, One, Work1.upper(), _V);
1046 if (nr<n) {
1047 V(_(nr+1,n),_(1,nr)) = Zero;
1048 V(_(1,nr),_(nr+1,n)) = Zero;
1049 V(_(nr+1,n),_(nr+1,n)) = Zero;
1050 V(_(nr+1,n),_(nr+1,n)).diag(0) = One;
1051 }
1052 auto tau = work(_(n+1,n+nr));
1053 auto _work_ormqr = work(_(2*n+n*nr+nr+1,lWork));
1054 ormqr(Left, NoTrans, Work, tau, V, _work_ormqr);
1055 }
1056 //
1057 } else if (condr2<cond_ok) {
1058 //
1059 //:) .. the input matrix A is very likely a relative of
1060 // the Kahan matrix :)
1061 // The matrix R2 is inverted. The solution of the matrix
1062 // equation is Q3^T*V3 = the product of the Jacobi rotations
1063 // (appplied to the lower triangular L3 from the LQ
1064 // factorization of R2=L3*Q3), pre-multiplied with the
1065 // transposed Q3.
1066 CALL DGESVJ( 'L', 'U', 'N', NR, NR, V, LDV, SVA, NR, U,
1067 $ LDU, work(2*N+N*NR+NR+1), LWORK-2*N-N*NR-NR, INFO )
1068 SCALEM = work(2*N+N*NR+NR+1)
1069 numRank = IDNINT(work(2*N+N*NR+NR+2))
1070 DO 3870 p = 1, NR
1071 CALL DCOPY( NR, V(1,p), 1, U(1,p), 1 )
1072 CALL DSCAL( NR, SVA(p), U(1,p), 1 )
1073 3870 CONTINUE
1074 CALL DTRSM('L','U','N','N',NR,NR,One,work(2*N+1),N,U,LDU)
1075 // .. apply the permutation from the second QR factorization
1076 DO 873 q = 1, NR
1077 DO 872 p = 1, NR
1078 work(2*N+N*NR+NR+iwork(N+p)) = U(p,q)
1079 872 CONTINUE
1080 DO 874 p = 1, NR
1081 U(p,q) = work(2*N+N*NR+NR+p)
1082 874 CONTINUE
1083 873 CONTINUE
1084 if ( NR < N ) {
1085 CALL DLASET( 'A',N-NR,NR,Zero,Zero,V(NR+1,1),LDV )
1086 CALL DLASET( 'A',NR,N-NR,Zero,Zero,V(1,NR+1),LDV )
1087 CALL DLASET( 'A',N-NR,N-NR,Zero,One,V(NR+1,NR+1),LDV )
1088 }
1089 CALL DORMQR( 'L','N',N,N,NR,work(2*N+1),N,work(N+1),
1090 $ V,LDV,work(2*N+N*NR+NR+1),LWORK-2*N-N*NR-NR,IERR )
1091 } else {
1092 // Last line of defense.
1093 //#:( This is a rather pathological case: no scaled condition
1094 // improvement after two pivoted QR factorizations. Other
1095 // possibility is that the rank revealing QR factorization
1096 // or the condition estimator has failed, or the COND_OK
1097 // is set very close to One (which is unnecessary). Normally,
1098 // this branch should never be executed, but in rare cases of
1099 // failure of the RRQR or condition estimator, the last line of
1100 // defense ensures that DGEJSV completes the task.
1101 // Compute the full SVD of L3 using DGESVJ with explicit
1102 // accumulation of Jacobi rotations.
1103 CALL DGESVJ( 'L', 'U', 'V', NR, NR, V, LDV, SVA, NR, U,
1104 $ LDU, work(2*N+N*NR+NR+1), LWORK-2*N-N*NR-NR, INFO )
1105 SCALEM = work(2*N+N*NR+NR+1)
1106 numRank = IDNINT(work(2*N+N*NR+NR+2))
1107 if ( NR < N ) {
1108 CALL DLASET( 'A',N-NR,NR,Zero,Zero,V(NR+1,1),LDV )
1109 CALL DLASET( 'A',NR,N-NR,Zero,Zero,V(1,NR+1),LDV )
1110 CALL DLASET( 'A',N-NR,N-NR,Zero,One,V(NR+1,NR+1),LDV )
1111 }
1112 CALL DORMQR( 'L','N',N,N,NR,work(2*N+1),N,work(N+1),
1113 $ V,LDV,work(2*N+N*NR+NR+1),LWORK-2*N-N*NR-NR,IERR )
1114 //
1115 CALL DORMLQ( 'L', 'T', NR, NR, NR, work(2*N+1), N,
1116 $ work(2*N+N*NR+1), U, LDU, work(2*N+N*NR+NR+1),
1117 $ LWORK-2*N-N*NR-NR, IERR )
1118 DO 773 q = 1, NR
1119 DO 772 p = 1, NR
1120 work(2*N+N*NR+NR+iwork(N+p)) = U(p,q)
1121 772 CONTINUE
1122 DO 774 p = 1, NR
1123 U(p,q) = work(2*N+N*NR+NR+p)
1124 774 CONTINUE
1125 773 CONTINUE
1126 //
1127 }
1128 //
1129 // Permute the rows of V using the (column) permutation from the
1130 // first QRF. Also, scale the columns to make them unit in
1131 // Euclidean norm. This applies to all cases.
1132 //
1133 TEMP1 = DSQRT(DBLE(N)) * EPSLN
1134 DO 1972 q = 1, N
1135 DO 972 p = 1, N
1136 work(2*N+N*NR+NR+iwork(p)) = V(p,q)
1137 972 CONTINUE
1138 DO 973 p = 1, N
1139 V(p,q) = work(2*N+N*NR+NR+p)
1140 973 CONTINUE
1141 XSC = One / DNRM2( N, V(1,q), 1 )
1142 if ( (XSC < (One-TEMP1)) || (XSC>(One+TEMP1)) )
1143 $ CALL DSCAL( N, XSC, V(1,q), 1 )
1144 1972 CONTINUE
1145 // At this moment, V contains the right singular vectors of A.
1146 // Next, assemble the left singular vector matrix U (M x N).
1147 if ( NR < M ) {
1148 CALL DLASET( 'A', M-NR, NR, Zero, Zero, U(NR+1,1), LDU )
1149 if ( NR < N1 ) {
1150 CALL DLASET('A',NR,N1-NR,Zero,Zero,U(1,NR+1),LDU)
1151 CALL DLASET('A',M-NR,N1-NR,Zero,One,U(NR+1,NR+1),LDU)
1152 }
1153 }
1154 //
1155 // The Q matrix from the first QRF is built into the left singular
1156 // matrix U. This applies to all cases.
1157 //
1158 CALL DORMQR( 'Left', 'No_Tr', M, N1, N, A, LDA, work, U,
1159 $ LDU, work(N+1), LWORK-N, IERR )
1160
1161 // The columns of U are normalized. The cost is O(M*N) flops.
1162 TEMP1 = DSQRT(DBLE(M)) * EPSLN
1163 DO 1973 p = 1, NR
1164 XSC = One / DNRM2( M, U(1,p), 1 )
1165 if ( (XSC < (One-TEMP1)) || (XSC>(One+TEMP1)) )
1166 $ CALL DSCAL( M, XSC, U(1,p), 1 )
1167 1973 CONTINUE
1168 //
1169 // If the initial QRF is computed with row pivoting, the left
1170 // singular vectors must be adjusted.
1171 //
1172 if ( ROWPIV )
1173 $ CALL DLASWP( N1, U, LDU, 1, M-1, iwork(2*N+1), -1 )
1174 //
1175 } else {
1176 //
1177 // .. the initial matrix A has almost orthogonal columns and
1178 // the second QRF is not needed
1179 //
1180 CALL DLACPY( 'Upper', N, N, A, LDA, work(N+1), N )
1181 if ( L2PERT ) {
1182 XSC = DSQRT(SMALL)
1183 DO 5970 p = 2, N
1184 TEMP1 = XSC * work( N + (p-1)*N + p )
1185 DO 5971 q = 1, p - 1
1186 work(N+(q-1)*N+p)=-DSIGN(TEMP1,work(N+(p-1)*N+q))
1187 5971 CONTINUE
1188 5970 CONTINUE
1189 } else {
1190 CALL DLASET( 'Lower',N-1,N-1,Zero,Zero,work(N+2),N )
1191 }
1192 //
1193 CALL DGESVJ( 'Upper', 'U', 'N', N, N, work(N+1), N, SVA,
1194 $ N, U, LDU, work(N+N*N+1), LWORK-N-N*N, INFO )
1195 //
1196 SCALEM = work(N+N*N+1)
1197 numRank = IDNINT(work(N+N*N+2))
1198 DO 6970 p = 1, N
1199 CALL DCOPY( N, work(N+(p-1)*N+1), 1, U(1,p), 1 )
1200 CALL DSCAL( N, SVA(p), work(N+(p-1)*N+1), 1 )
1201 6970 CONTINUE
1202 //
1203 CALL DTRSM( 'Left', 'Upper', 'NoTrans', 'No UD', N, N,
1204 $ One, A, LDA, work(N+1), N )
1205 DO 6972 p = 1, N
1206 CALL DCOPY( N, work(N+p), N, V(iwork(p),1), LDV )
1207 6972 CONTINUE
1208 TEMP1 = DSQRT(DBLE(N))*EPSLN
1209 DO 6971 p = 1, N
1210 XSC = One / DNRM2( N, V(1,p), 1 )
1211 if ( (XSC < (One-TEMP1)) || (XSC>(One+TEMP1)) )
1212 $ CALL DSCAL( N, XSC, V(1,p), 1 )
1213 6971 CONTINUE
1214 //
1215 // Assemble the left singular vector matrix U (M x N).
1216 //
1217 if ( N < M ) {
1218 CALL DLASET( 'A', M-N, N, Zero, Zero, U(N+1,1), LDU )
1219 if ( N < N1 ) {
1220 CALL DLASET( 'A',N, N1-N, Zero, Zero, U(1,N+1),LDU )
1221 CALL DLASET( 'A',M-N,N1-N, Zero, One,U(N+1,N+1),LDU )
1222 }
1223 }
1224 CALL DORMQR( 'Left', 'No Tr', M, N1, N, A, LDA, work, U,
1225 $ LDU, work(N+1), LWORK-N, IERR )
1226 TEMP1 = DSQRT(DBLE(M))*EPSLN
1227 DO 6973 p = 1, N1
1228 XSC = One / DNRM2( M, U(1,p), 1 )
1229 if ( (XSC < (One-TEMP1)) || (XSC>(One+TEMP1)) )
1230 $ CALL DSCAL( M, XSC, U(1,p), 1 )
1231 6973 CONTINUE
1232 //
1233 if ( ROWPIV )
1234 $ CALL DLASWP( N1, U, LDU, 1, M-1, iwork(2*N+1), -1 )
1235 //
1236 }
1237 //
1238 // end of the >> almost orthogonal case << in the full SVD
1239 //
1240 } else {
1241 //
1242 // This branch deploys a preconditioned Jacobi SVD with explicitly
1243 // accumulated rotations. It is included as optional, mainly for
1244 // experimental purposes. It does perfom well, and can also be used.
1245 // In this implementation, this branch will be automatically activated
1246 // if the condition number sigma_max(A) / sigma_min(A) is predicted
1247 // to be greater than the overflow threshold. This is because the
1248 // a posteriori computation of the singular vectors assumes robust
1249 // implementation of BLAS and some LAPACK procedures, capable of
1250 // working in presence of extreme values. Since that is not always
1251 // the case, ...
1252 //
1253 DO 7968 p = 1, NR
1254 CALL DCOPY( N-p+1, A(p,p), LDA, V(p,p), 1 )
1255 7968 CONTINUE
1256 //
1257 if ( L2PERT ) {
1258 XSC = DSQRT(SMALL/EPSLN)
1259 for (IndexType q=1; q<=nr; ++q) {
1260 tmp = xsc*abs(V(q,q));
1261 for (IndexType p=1; p<=n; ++p) {
1262 if (p>q && abs(V(p,q))<=tmp || p<q) {
1263 V(p,q) = sign(tmp, V(p,q));
1264 }
1265 if (p<q) {
1266 V(p,q) = - V(p,q);
1267 }
1268 }
1269 }
1270 } else {
1271 CALL DLASET( 'U', NR-1, NR-1, Zero, Zero, V(1,2), LDV )
1272 }
1273
1274 CALL DGEQRF( N, NR, V, LDV, work(N+1), work(2*N+1),
1275 $ LWORK-2*N, IERR )
1276 CALL DLACPY( 'L', N, NR, V, LDV, work(2*N+1), N )
1277 //
1278 DO 7969 p = 1, NR
1279 CALL DCOPY( NR-p+1, V(p,p), LDV, U(p,p), 1 )
1280 7969 CONTINUE
1281
1282 if ( L2PERT ) {
1283 XSC = DSQRT(SMALL/EPSLN)
1284 DO 9970 q = 2, NR
1285 DO 9971 p = 1, q - 1
1286 TEMP1 = XSC * DMIN1(DABS(U(p,p)),DABS(U(q,q)))
1287 U(p,q) = - DSIGN( TEMP1, U(q,p) )
1288 9971 CONTINUE
1289 9970 CONTINUE
1290 } else {
1291 CALL DLASET('U', NR-1, NR-1, Zero, Zero, U(1,2), LDU )
1292 }
1293
1294 CALL DGESVJ( 'G', 'U', 'V', NR, NR, U, LDU, SVA,
1295 $ N, V, LDV, work(2*N+N*NR+1), LWORK-2*N-N*NR, INFO )
1296 SCALEM = work(2*N+N*NR+1)
1297 numRank = IDNINT(work(2*N+N*NR+2))
1298
1299 if ( NR < N ) {
1300 CALL DLASET( 'A',N-NR,NR,Zero,Zero,V(NR+1,1),LDV )
1301 CALL DLASET( 'A',NR,N-NR,Zero,Zero,V(1,NR+1),LDV )
1302 CALL DLASET( 'A',N-NR,N-NR,Zero,One,V(NR+1,NR+1),LDV )
1303 }
1304
1305 CALL DORMQR( 'L','N',N,N,NR,work(2*N+1),N,work(N+1),
1306 $ V,LDV,work(2*N+N*NR+NR+1),LWORK-2*N-N*NR-NR,IERR )
1307 //
1308 // Permute the rows of V using the (column) permutation from the
1309 // first QRF. Also, scale the columns to make them unit in
1310 // Euclidean norm. This applies to all cases.
1311 //
1312 TEMP1 = DSQRT(DBLE(N)) * EPSLN
1313 DO 7972 q = 1, N
1314 DO 8972 p = 1, N
1315 work(2*N+N*NR+NR+iwork(p)) = V(p,q)
1316 8972 CONTINUE
1317 DO 8973 p = 1, N
1318 V(p,q) = work(2*N+N*NR+NR+p)
1319 8973 CONTINUE
1320 XSC = One / DNRM2( N, V(1,q), 1 )
1321 if ( (XSC < (One-TEMP1)) || (XSC>(One+TEMP1)) )
1322 $ CALL DSCAL( N, XSC, V(1,q), 1 )
1323 7972 CONTINUE
1324 //
1325 // At this moment, V contains the right singular vectors of A.
1326 // Next, assemble the left singular vector matrix U (M x N).
1327 //
1328 if (NR<M) {
1329 CALL DLASET( 'A', M-NR, NR, Zero, Zero, U(NR+1,1), LDU )
1330 if (NR<N1) {
1331 CALL DLASET( 'A',NR, N1-NR, Zero, Zero, U(1,NR+1),LDU )
1332 CALL DLASET( 'A',M-NR,N1-NR, Zero, One,U(NR+1,NR+1),LDU )
1333 }
1334 }
1335
1336 CALL DORMQR( 'Left', 'No Tr', M, N1, N, A, LDA, work, U,
1337 $ LDU, work(N+1), LWORK-N, IERR )
1338
1339 if (ROWPIV)
1340 $ CALL DLASWP( N1, U, LDU, 1, M-1, iwork(2*N+1), -1 )
1341
1342
1343 }
1344 if ( TRANSP ) {
1345 // .. swap U and V because the procedure worked on A^t
1346 for (IndexType p=1; p<=n; ++p) {
1347 blas::swap(U(_,p),V(_,p));
1348 }
1349 }
1350
1351 }
1352 // end of the full SVD
1353 //
1354 // Undo scaling, if necessary (and possible)
1355 //
1356 if ( USCAL2 <= (BIG/SVA(1))*USCAL1 ) {
1357 CALL DLASCL( 'G', 0, 0, USCAL1, USCAL2, NR, 1, SVA, N, IERR )
1358 USCAL1 = One
1359 USCAL2 = One
1360 }
1361
1362 if ( NR < N ) {
1363 sva(_(nr+1,n)) = Zero;
1364 }
1365
1366 work(1) = USCAL2 * SCALEM
1367 work(2) = USCAL1
1368 if ( errest ) work(3) = SCONDA
1369 if ( lsvec && rsvec ) {
1370 work(4) = condr1
1371 work(5) = condr2
1372 }
1373 if ( L2TRAN ) {
1374 work(6) = entra
1375 work(7) = entrat
1376 }
1377
1378 iwork(1) = NR
1379 iwork(2) = numRank
1380 iwork(3) = warning
1381 }
1382 */
1383 //== interface for native lapack ===============================================
1384 #ifdef CHECK_CXXLAPACK
1385
1386 template <typename MA, typename VSVA, typename MU, typename MV,
1387 typename VWORK, typename VIWORK>
1388 typename GeMatrix<MA>::IndexType
1389 jsv_native(JSV::Accuracy accuracy,
1390 JSV::JobU jobU,
1391 JSV::JobV jobV,
1392 bool restrictedRange,
1393 bool considerTransA,
1394 bool perturb,
1395 GeMatrix<MA> &A,
1396 DenseVector<VSVA> &sva,
1397 GeMatrix<MU> &U,
1398 GeMatrix<MV> &V,
1399 DenseVector<VWORK> &work,
1400 DenseVector<VIWORK> &iwork)
1401 {
1402 typedef typename GeMatrix<MA>::ElementType T;
1403
1404 const char JOBA = char(accuracy);
1405 const char JOBU = char(jobU);
1406 const char JOBV = char(jobV);
1407 const char JOBR = (restrictedRange) ? 'R' : 'N';
1408 const char JOBT = (considerTransA) ? 'T' : 'N';
1409 const char JOBP = (perturb) ? 'P' : 'N';
1410 const INTEGER M = A.numRows();
1411 const INTEGER N = A.numCols();
1412 const INTEGER LDA = A.leadingDimension();
1413 const INTEGER LDU = U.leadingDimension();
1414 const INTEGER LDV = V.leadingDimension();
1415 const INTEGER LWORK = work.length();
1416 INTEGER INFO;
1417
1418 if (IsSame<T,DOUBLE>::value) {
1419 LAPACK_IMPL(dgejsv)(&JOBA,
1420 &JOBU,
1421 &JOBV,
1422 &JOBR,
1423 &JOBT,
1424 &JOBP,
1425 &M,
1426 &N,
1427 A.data(),
1428 &LDA,
1429 sva.data(),
1430 U.data(),
1431 &LDU,
1432 V.data(),
1433 &LDV,
1434 work.data(),
1435 &LWORK,
1436 iwork.data(),
1437 &INFO);
1438 } else {
1439 ASSERT(0);
1440 }
1441 return INFO;
1442 }
1443
1444 #endif // CHECK_CXXLAPACK
1445
1446
1447 //== public interface ==========================================================
1448 template <typename MA, typename VSVA, typename MU, typename MV,
1449 typename VWORK, typename VIWORK>
1450 typename GeMatrix<MA>::IndexType
1451 jsv(JSV::Accuracy accuracy,
1452 JSV::JobU jobU,
1453 JSV::JobV jobV,
1454 bool restrictedRange,
1455 bool considerTransA,
1456 bool perturb,
1457 GeMatrix<MA> &A,
1458 DenseVector<VSVA> &sva,
1459 GeMatrix<MU> &U,
1460 GeMatrix<MV> &V,
1461 DenseVector<VWORK> &work,
1462 DenseVector<VIWORK> &iwork)
1463 {
1464 //
1465 // Test the input parameters
1466 //
1467 # ifndef NDEBUG
1468 typedef typename GeMatrix<MA>::IndexType IndexType;
1469
1470 ASSERT(A.firstRow()==1);
1471 ASSERT(A.firstCol()==1);
1472
1473 const IndexType m = A.numRows();
1474 const IndexType n = A.numCols();
1475
1476 ASSERT(m>=n);
1477
1478 ASSERT(sva.firstIndex()==1);
1479 ASSERT(sva.length()==n);
1480
1481 ASSERT(U.firstRow()==1);
1482 ASSERT(U.firstCol()==1);
1483
1484 ASSERT(V.firstRow()==1);
1485 ASSERT(V.firstCol()==1);
1486
1487 ASSERT(iwork.length()==m+3*n);
1488 # endif
1489
1490 //
1491 // Make copies of output arguments
1492 //
1493 # ifdef CHECK_CXXLAPACK
1494 typename GeMatrix<MA>::NoView A_org = A;
1495 typename DenseVector<VSVA>::NoView sva_org = sva;
1496 typename GeMatrix<MU>::NoView U_org = U;
1497 typename GeMatrix<MV>::NoView V_org = V;
1498 typename DenseVector<VWORK>::NoView work_org = work;
1499 # endif
1500
1501 //
1502 // Call implementation
1503 //
1504 /*
1505 IndexType info = jsv_generic(accuracy, jobU, jobV,
1506 restrictedRange, considerTransA, perturb,
1507 A, sva, U, V, work);
1508 */
1509 IndexType info = jsv_native(accuracy, jobU, jobV,
1510 restrictedRange, considerTransA, perturb,
1511 A, sva, U, V, work, iwork);
1512
1513 # ifdef CHECK_CXXLAPACK
1514 //
1515 // Make copies of results computed by the generic implementation
1516 //
1517 typename GeMatrix<MA>::NoView A_generic = A;
1518 typename DenseVector<VSVA>::NoView sva_generic = sva;
1519 typename GeMatrix<MU>::NoView U_generic = U;
1520 typename GeMatrix<MV>::NoView V_generic = V;
1521 typename DenseVector<VWORK>::NoView work_generic = work;
1522 //
1523 // restore output arguments
1524 //
1525 A = A_org;
1526 sva = sva_org;
1527 U = U_org;
1528 V = V_org;
1529 work = work_org;
1530 //
1531 // Compare generic results with results from the native implementation
1532 //
1533 IndexType _info = jsv_native(accuracy, jobU, jobV,
1534 restrictedRange, considerTransA, perturb,
1535 A, sva, U, V, work, iwork);
1536 bool failed = false;
1537 if (! isIdentical(A_generic, A, "A_generic", "A")) {
1538 std::cerr << "CXXLAPACK: A_generic = " << A_generic << std::endl;
1539 std::cerr << "F77LAPACK: A = " << A << std::endl;
1540 failed = true;
1541 }
1542 if (! isIdentical(sva_generic, sva, "sva_generic", "sva")) {
1543 std::cerr << "CXXLAPACK: sva_generic = " << sva_generic << std::endl;
1544 std::cerr << "F77LAPACK: sva = " << sva << std::endl;
1545 failed = true;
1546 }
1547 if (! isIdentical(U_generic, U, "U_generic", "U")) {
1548 std::cerr << "CXXLAPACK: U_generic = " << U_generic << std::endl;
1549 std::cerr << "F77LAPACK: U = " << U << std::endl;
1550 failed = true;
1551 }
1552 if (! isIdentical(V_generic, V, "V_generic", "V")) {
1553 std::cerr << "CXXLAPACK: V_generic = " << V_generic << std::endl;
1554 std::cerr << "F77LAPACK: V = " << V << std::endl;
1555 failed = true;
1556 }
1557 if (! isIdentical(work_generic, work, "work_generic", "work")) {
1558 std::cerr << "CXXLAPACK: work_generic = " << work_generic << std::endl;
1559 std::cerr << "F77LAPACK: work = " << work << std::endl;
1560 failed = true;
1561 }
1562 if (! isIdentical(info, _info, "info", "_info")) {
1563 std::cerr << "CXXLAPACK: info = " << info << std::endl;
1564 std::cerr << "F77LAPACK: _info = " << _info << std::endl;
1565 failed = true;
1566 }
1567
1568 if (failed) {
1569 std::cerr << "error in: jsv.tcc " << std::endl;
1570 ASSERT(0);
1571 } else {
1572 std::cerr << "passed: jsv.tcc " << std::endl;
1573 }
1574 # endif
1575
1576 return info;
1577 }
1578
1579 //-- forwarding ----------------------------------------------------------------
1580 template <typename MA, typename VSVA, typename MU, typename MV,
1581 typename VWORK, typename VIWORK>
1582 typename MA::IndexType
1583 jsv(JSV::Accuracy accuracy,
1584 JSV::JobU jobU,
1585 JSV::JobV jobV,
1586 bool restrictedRange,
1587 bool considerTransA,
1588 bool perturb,
1589 MA &&A,
1590 VSVA &&sva,
1591 MU &&U,
1592 MV &&V,
1593 VWORK &&work,
1594 VIWORK &&iwork)
1595 {
1596 typename MA::IndexType info;
1597
1598 CHECKPOINT_ENTER;
1599 info = jsv(accuracy, jobU, jobV,
1600 restrictedRange, considerTransA, perturb,
1601 A, sva, U, V, work, iwork);
1602 CHECKPOINT_LEAVE;
1603
1604 return info;
1605 }
1606
1607 } } // namespace lapack, flens
1608
1609 #endif // FLENS_LAPACK_SVD_JSV_TCC
2 * Copyright (c) 2012, Michael Lehn
3 *
4 * All rights reserved.
5 *
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions
8 * are met:
9 *
10 * 1) Redistributions of source code must retain the above copyright
11 * notice, this list of conditions and the following disclaimer.
12 * 2) Redistributions in binary form must reproduce the above copyright
13 * notice, this list of conditions and the following disclaimer in
14 * the documentation and/or other materials provided with the
15 * distribution.
16 * 3) Neither the name of the FLENS development group nor the names of
17 * its contributors may be used to endorse or promote products derived
18 * from this software without specific prior written permission.
19 *
20 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
21 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
22 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
23 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
24 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
25 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
26 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
27 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
28 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
29 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
30 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
31 */
32
33 /* Based on
34 *
35 SUBROUTINE DGEJSV( JOBA, JOBU, JOBV, JOBR, JOBT, JOBP,
36 $ M, N, A, LDA, SVA, U, LDU, V, LDV,
37 $ WORK, LWORK, IWORK, INFO )
38 *
39 * -- LAPACK routine (version 3.3.1) --
40 *
41 * -- Contributed by Zlatko Drmac of the University of Zagreb and --
42 * -- Kresimir Veselic of the Fernuniversitaet Hagen --
43 * -- April 2011 --
44 *
45 * -- LAPACK is a software package provided by Univ. of Tennessee, --
46 * -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
47 *
48 * This routine is also part of SIGMA (version 1.23, October 23. 2008.)
49 * SIGMA is a library of algorithms for highly accurate algorithms for
50 * computation of SVD, PSVD, QSVD, (H,K)-SVD, and for solution of the
51 * eigenvalue problems Hx = lambda M x, H M x = lambda x with H, M > 0.
52 *
53 */
54
55 #ifndef FLENS_LAPACK_SVD_JSV_TCC
56 #define FLENS_LAPACK_SVD_JSV_TCC 1
57
58 #include <flens/blas/blas.h>
59 #include <flens/lapack/lapack.h>
60
61 namespace flens { namespace lapack {
62
63 //== generic lapack implementation =============================================
64 /*
65 template <typename MA, typename VSVA, typename MU, typename MV,
66 typename VWORK, typename VIWORK>
67 typename GeMatrix<MA>::IndexType
68 jsv_generic(JSV::Accuracy accuracy,
69 JSV::JobU jobU,
70 JSV::JobV jobV,
71 bool restrictedRange,
72 bool considerTransA,
73 bool perturb,
74 GeMatrix<MA> &A,
75 DenseVector<VSVA> &sva,
76 GeMatrix<MU> &U,
77 GeMatrix<MV> &V,
78 DenseVector<VWORK> &work,
79 DenseVector<VIWORK> &iwork)
80 {
81 using std::abs;
82 using std::max;
83 using std::min;
84 using std::sqrt;
85
86 typedef typename GeMatrix<MA>::ElementType ElementType;
87 typedef typename GeMatrix<MA>::IndexType IndexType;
88
89 const ElementType Zero(0), One(1);
90
91 const Underscore<IndexType> _;
92
93 const bool lsvec = (jobU==ComputeU) || (jobU==FullsetU);
94 const bool jracc = (jobV=='J');
95 const bool rsvec = (jobV=='V') || jracc;
96 const bool rowpiv = (jobA=='F') || (jobA=='G');
97 const bool l2rank = (jobA=='R');
98 const bool l2aber = (jobA=='A');
99 const bool errest = (jobA=='E') || (jobA=='G');
100 const bool l2tran = (jobT=='T');
101 const bool l2kill = (jobR=='R');
102 const bool defr = (jobR=='N');
103 const bool l2pert = (jobP=='P');
104
105 IndexType info = 0;
106
107 //
108 // Quick return for void matrix (Y3K safe)
109 //#:)
110 if (m==0 || n==0) {
111 return info;
112 }
113 //
114 // Set numerical parameters
115 //
116 //! NOTE: Make sure DLAMCH() does not fail on the target architecture.
117 //
118 const ElementType eps = lamch<ElementType>(Eps);
119 const ElementType safeMin = lamch<ElementType>(SafeMin);
120 const ElementType small = safeMin / eps;
121 const ElementType big = lamch<ElementType>(OverflowThreshold);
122 //
123 // Initialize SVA(1:N) = diag( ||A e_i||_2 )_1^N
124 //
125 //! If necessary, scale SVA() to protect the largest norm from
126 // overflow. It is possible that this scaling pushes the smallest
127 // column norm left from the underflow threshold (extreme case).
128 //
129 ElementType scaleM = One / sqrt(ElementType(m)*ElementType(n));
130 bool noScale = true;
131 bool goScale = true;
132 for (IndexType p=1; p<=n; ++p) {
133 aapp = Zero;
134 aaqq = One;
135 lassq(A(_,p), aapp, aaqq);
136 if (aapp>big) {
137 return -9;
138 }
139 aaqq = sqrt(aaqq);
140 if (aapp<(big/aaqq) && noscal) {
141 sva(p) = aapp * aaqq;
142 } else {
143 noscal = false;
144 sva(p) = aapp * ( aaqq * scalem )
145 if (goScale) {
146 goScale = false;
147 sva(_(1,p)) *= scaleM;
148 }
149 }
150 }
151
152 if (noScale) {
153 scaleM = One;
154 }
155
156 aapp = Zero;
157 aaqq = big;
158 for (IndexType p=1; p<=n) {
159 aapp = max(aapp, sva(p));
160 if (sva(p)!=Zero) {
161 aaqq = min(aaqq, sva(p));
162 }
163 }
164 //
165 // Quick return for zero M x N matrix
166 //#:)
167 if (aapp==Zero) {
168 if (lsvec) {
169 U = Zero;
170 U.diag(0) = One;
171 }
172 if (rsvec) {
173 V = Zero;
174 V.diag(0) = One;
175 }
176 work(1) = One;
177 work(2) = One;
178 if (errest) {
179 work(3) = One;
180 }
181 if (lsvec && rsvec) {
182 work(4) = One;
183 work(5) = One;
184 }
185 if (l2tran) {
186 work(6) = Zero;
187 work(7) = Zero;
188 }
189 iwork(1) = 0;
190 iwork(2) = 0;
191 iwork(3) = 0;
192 return info;
193 }
194 //
195 // Issue warning if denormalized column norms detected. Override the
196 // high relative accuracy request. Issue licence to kill columns
197 // (set them to zero) whose norm is less than sigma_max / BIG (roughly).
198 //#:(
199 warning = 0
200 if (aaqq<=SFMIN) {
201 l2rank = true;
202 l2kill = true;
203 warning = 1;
204 }
205 //
206 // Quick return for one-column matrix
207 //#:)
208 auto U1 = U(_,_(1,n));
209 if (n==1) {
210
211 if (lsvec) {
212 lascl(LASCL::FullMatrix, 0, 0, sva(1), scaleM, A);
213 U1 = A;
214 // computing all M left singular vectors of the M x 1 matrix
215 if (nu!=n) {
216 auto tau = work(_(1,n));
217 auto _work = work(_(n+1,lWork));
218 qrf(U1, tau, _work);
219 orgqr(1, U, tau, _work);
220 U1 = A;
221 }
222 }
223 if (rsvec) {
224 V(1,1) = One
225 }
226 if (sva(1)<big*scaleM) {
227 sva(1) /= scaleM;
228 scalem = One;
229 }
230 work(1) = One / scaleM;
231 work(2) = One;
232 if (sva(1)!=Zero) {
233 iwork(1) = 1;
234 if (sva(1)/scaleM>=safeMin) {
235 iwork(2) = 1;
236 } else {
237 iwork(2) = 0;
238 }
239 } else {
240 iwork(1) = 0;
241 iwork(2) = 0;
242 }
243 if (errest) {
244 work(3) = One;
245 }
246 if (lsvec && rsvec) {
247 work(4) = One;
248 work(5) = One;
249 }
250 if (l2tran) {
251 work(6) = Zero;
252 work(7) = Zero;
253 }
254 return info;
255
256 }
257
258 bool transp = false;
259 bool l2tran = l2tran && m==n;
260
261 ElementType aatMax = -One;
262 ElementType aatMin = big;
263 if (rowpiv || l2tran) {
264 //
265 // Compute the row norms, needed to determine row pivoting sequence
266 // (in the case of heavily row weighted A, row pivoting is strongly
267 // advised) and to collect information needed to compare the
268 // structures of A * A^t and A^t * A (in the case L2TRAN.EQ.true).
269 //
270 if (l2tran) {
271 for (IndexType p=1; p<=m; ++p) {
272 xsc = Zero;
273 tmp = One;
274 lassq(A(p,_), xsc, tmp);
275 // DLASSQ gets both the ell_2 and the ell_infinity norm
276 // in one pass through the vector
277 work(m+n+p) = xsc * scaleM;
278 work(n+p) = xsc * (scaleN*sqrt(tmp));
279 aatMax = max(aatMax, work(n+p));
280 if (work(n+p)!=Zero) {
281 aatMin = min(aatMin, work(n+p));
282 }
283 }
284 } else {
285 for (IndexType p=1; p<=m; ++p) {
286 const IndexType jp = blas::iamax(A(p,_));
287 work(m+n+p) = scaleM*abs(A(p,jp));
288 aatMax = max(aatMax, work(m+n+p));
289 aatMin = min(aatMin, work(m+n+p));
290 }
291 }
292
293 }
294 //
295 // For square matrix A try to determine whether A^t would be better
296 // input for the preconditioned Jacobi SVD, with faster convergence.
297 // The decision is based on an O(N) function of the vector of column
298 // and row norms of A, based on the Shannon entropy. This should give
299 // the right choice in most cases when the difference actually matters.
300 // It may fail and pick the slower converging side.
301 //
302 entra = Zero
303 entrat = Zero
304 if (l2tran) {
305
306 xsc = Zero;
307 tmp = One;
308 lassq(sva, xsc, tmp);
309 tmp = One / tmp;
310
311 entra = Zero
312 for (IndexType p=1; p<=n; ++p) {
313 const ElementType _big = pow(sva(p)/xsc, 2) * tmp;
314 if (_big!=Zero) {
315 entra += _big * log(_big);
316 }
317 }
318 entra = - entra / log(ElementType(n));
319 //
320 // Now, SVA().^2/Trace(A^t * A) is a point in the probability simplex.
321 // It is derived from the diagonal of A^t * A. Do the same with the
322 // diagonal of A * A^t, compute the entropy of the corresponding
323 // probability distribution. Note that A * A^t and A^t * A have the
324 // same trace.
325 //
326 entrat = Zero
327 for (IndexType p=n+1; p<=n+m; ++p) {
328 const ElementType _big = pow(work(p)/xsc, 2) * tmp;
329 if (_big!=Zero) {
330 entrat += _big * log(_big);
331 }
332 }
333 entrat = -entrat / log(ElementType(m));
334 //
335 // Analyze the entropies and decide A or A^t. Smaller entropy
336 // usually means better input for the algorithm.
337 //
338 transp = entrat<entra;
339 //
340 // If A^t is better than A, transpose A.
341 //
342 if (transp) {
343 // In an optimal implementation, this trivial transpose
344 // should be replaced with faster transpose.
345 // TODO: in-place transpose:
346 // transpose(A);
347 for (IndexType p=1; p<=n-1; ++p) {
348 for (IndexType q=p+1; q<=n; ++q) {
349 swap(A(q,p), A(p,q));
350 }
351 }
352 for (IndexType p=1; p<=n; ++p) {
353 work(m+n+p) = sva(p);
354 sva(p) = work(n+p);
355 }
356 swap(aapp, aatMax);
357 swap(aaqq, aatMin);
358 swap(lsvec, rsvec);
359 if (lsvec) {
360 // Lehn: transposing A is only considered if A is square,
361 // so in this case m==n and therefore nu==n. Or am I
362 // wrong??
363 ASSERT(nu==n);
364 }
365
366 rowpiv = true;
367 }
368
369 }
370 //
371 // Scale the matrix so that its maximal singular value remains less
372 // than DSQRT(BIG) -- the matrix is scaled so that its maximal column
373 // has Euclidean norm equal to DSQRT(BIG/N). The only reason to keep
374 // DSQRT(BIG) instead of BIG is the fact that DGEJSV uses LAPACK and
375 // BLAS routines that, in some implementations, are not capable of
376 // working in the full interval [SFMIN,BIG] and that they may provoke
377 // overflows in the intermediate results. If the singular values spread
378 // from SFMIN to BIG, then DGESVJ will compute them. So, in that case,
379 // one should use DGESVJ instead of DGEJSV.
380 //
381 const ElementType bigRoot = sqrt(big);
382 tmp = sqrt(big/ElementType(n));
383
384 lascl(LASCL::FullMatrix, 0, 0, aapp, tmp, sva);
385 if (aaqq>aapp*safeMin) {
386 aaqq = (aaqq/aapp) * tmp;
387 } else {
388 aaqq = (aaqq*tmp) / aapp;
389 }
390 tmp *= scaleM;
391 lascl(LASCL::FullMatrix, 0, 0, aapp, tmp, A);
392 //
393 // To undo scaling at the end of this procedure, multiply the
394 // computed singular values with USCAL2 / USCAL1.
395 //
396 uScale1 = tmp;
397 uScale2 = aapp;
398
399 if (l2kill) {
400 // L2KILL enforces computation of nonzero singular values in
401 // the restricted range of condition number of the initial A,
402 // sigma_max(A) / sigma_min(A) approx. DSQRT(BIG)/DSQRT(SFMIN).
403 xsc = sqrt(safeMin);
404 } else {
405 xsc = small;
406 //
407 // Now, if the condition number of A is too big,
408 // sigma_max(A) / sigma_min(A)>DSQRT(BIG/N) * EPSLN / SFMIN,
409 // as a precaution measure, the full SVD is computed using DGESVJ
410 // with accumulated Jacobi rotations. This provides numerically
411 // more robust computation, at the cost of slightly increased run
412 // time. Depending on the concrete implementation of BLAS and LAPACK
413 // (i.e. how they behave in presence of extreme ill-conditioning) the
414 // implementor may decide to remove this switch.
415 if (aaqq<sqrt(safeMin) && lsvec && rsvec ) {
416 jracc = true;
417 }
418
419 }
420 if (aaqq<xsc) {
421 for (IndexType p=1; p<=n; ++p) {
422 if (sva(p)<xsc) {
423 A(_,p) = Zero;
424 sva(p) = Zero;
425 }
426 }
427 }
428 //
429 // Preconditioning using QR factorization with pivoting
430 //
431 if (rowpiv) {
432 // Optional row permutation (Bjoerck row pivoting):
433 // A result by Cox and Higham shows that the Bjoerck's
434 // row pivoting combined with standard column pivoting
435 // has similar effect as Powell-Reid complete pivoting.
436 // The ell-infinity norms of A are made nonincreasing.
437 for (IndexType p=1; p<=m-1; ++p) {
438 const IndexType q = blas::iamax(work(_(m+n+p,2*m+n))) + p - 1;
439 iwork(2*n+p) = q;
440 if (p!=q) {
441 swap(work(m+n+p), work(m+n+q));
442 }
443 }
444 laswp(A, iwork(_(2*n+1, 2*n+m-1)));
445 }
446 //
447 // End of the preparation phase (scaling, optional sorting and
448 // transposing, optional flushing of small columns).
449 //
450 // Preconditioning
451 //
452 // If the full SVD is needed, the right singular vectors are computed
453 // from a matrix equation, and for that we need theoretical analysis
454 // of the Businger-Golub pivoting. So we use DGEQP3 as the first RR QRF.
455 // In all other cases the first RR QRF can be chosen by other criteria
456 // (eg speed by replacing global with restricted window pivoting, such
457 // as in SGEQPX from TOMS # 782). Good results will be obtained using
458 // SGEQPX with properly (!) chosen numerical parameters.
459 // Any improvement of DGEQP3 improves overal performance of DGEJSV.
460 //
461 // A * P1 = Q1 * [ R1^t 0]^t:
462 auto _tau = work(_(1,n));
463 auto _work = work(_(n+1,lWork));
464 auto _iwork = iwork(_(1,n));
465
466 // .. all columns are free columns
467 _iwork = 0;
468
469 qp3(A, _iwork, _tau, _work);
470 //
471 // The upper triangular matrix R1 from the first QRF is inspected for
472 // rank deficiency and possibilities for deflation, or possible
473 // ill-conditioning. Depending on the user specified flag L2RANK,
474 // the procedure explores possibilities to reduce the numerical
475 // rank by inspecting the computed upper triangular factor. If
476 // L2RANK or l2aber are up, then DGEJSV will compute the SVD of
477 // A + dA, where ||dA|| <= f(M,N)*EPSLN.
478 //
479 IndexType nr = 1;
480 if (l2aber) {
481 // Standard absolute error bound suffices. All sigma_i with
482 // sigma_i < N*EPSLN*||A|| are flushed to zero. This is an
483 // agressive enforcement of lower numerical rank by introducing a
484 // backward error of the order of N*EPSLN*||A||.
485 tmp = sqrt(ElementType(n))*eps;
486 for (IndexType p=2; p<=n; ++p) {
487 if (abs(A(p,p))>=tmp*abs(A(1,1))) {
488 ++nr;
489 } else {
490 break;
491 }
492 }
493 } else if (l2rank) {
494 // .. similarly as above, only slightly more gentle (less agressive).
495 // Sudden drop on the diagonal of R1 is used as the criterion for
496 // close-to-rank-defficient.
497 tmp = sqrt(safeMin);
498 for (IndexType p=2; p<=n; ++p) {
499 if (abs(A(p,p))<eps*abs(A(p-1,p-1)) || abs(A(p,p))<small
500 || (l2kill && abs(A(p,p))<tmp))
501 {
502 break;
503 }
504 ++nr;
505 }
506
507 } else {
508 // The goal is high relative accuracy. However, if the matrix
509 // has high scaled condition number the relative accuracy is in
510 // general not feasible. Later on, a condition number estimator
511 // will be deployed to estimate the scaled condition number.
512 // Here we just remove the underflowed part of the triangular
513 // factor. This prevents the situation in which the code is
514 // working hard to get the accuracy not warranted by the data.
515 tmp = sqrt(safeMin);
516 for (IndexType p=2; p<=n; ++p) {
517 if (abs(A(p,p))<small || (l2kill && abs(A(p,p))<tmp)) {
518 break;
519 }
520 ++nr;
521 }
522
523 }
524
525 bool almort = false;
526 if (nr==n) {
527 ElementType maxprj = One;
528 for (IndexType p=2; p<=n; ++p) {
529 maxprj = min(maxprj, abs(A(p,p))/sva(iwork(p)));
530 }
531 if (pow(maxprj,2)>=One-ElementType(n)*eps) {
532 almort = true;
533 }
534 }
535
536
537 sconda = -One;
538 condr1 = -One;
539 condr2 = -One;
540
541 if (errest) {
542 if (n==nr) {
543 if (rsvec) {
544 // .. V is available as workspace
545 V.upper() = A(_(1,n),_).upper();
546 for (IndexType p=1; p<=n; ++p) {
547 tmp = sva(iwork(p));
548 V(_(1,p),p) *= One/tmp;
549 }
550 auto _work = work(_(n+1, 4*n));
551 auto _iwork = iwork(_(2*n+m+1, 3*n+m));
552 pocon(V.upper(), One, tmp, _work, _iwork);
553 } else if (lsvec) {
554 // .. U is available as workspace
555 auto _U = U(_(1,n),_(1,n));
556 _U.upper() = A(_(1,n),_).upper();
557 for (IndexType p=1; p<=n; ++p) {
558 tmp = sva(iwork(p));
559 U(_(1,p),p) *= One/tmp;
560 }
561 auto _work = work(_(n+1, 4*n));
562 auto _iwork = iwork(_(2*n+m+1, 3*n+m));
563 pocon(_U.upper(), One, tmp, _work, _iwork);
564 } else {
565 auto _work1 = work(_(n+1,n+n*n));
566 auto _work2 = work(_(n+n*n+1,n+n*n+3*n));
567 auto _iwork = work(_(2*n+m+1,3*n+m));
568 GeMatrixView<ElementType> Work(n, n, _work1, n);
569
570 Work.upper() = A(_(1,n),_).upper();
571 for (IndexType p=1; p<=n; ++p) {
572 tmp = sva(iwork(p));
573 Work(_(1,p),p) *= One/tmp;
574 }
575 // .. the columns of R are scaled to have unit Euclidean lengths.
576 pocon(Work.upper(), One, tmp, _work2, _iwork);
577 }
578 sconda = One/sqrt(tmp);
579 // SCONDA is an estimate of DSQRT(||(R^t * R)^(-1)||_1).
580 // N^(-1/4) * SCONDA <= ||R^(-1)||_2 <= N^(1/4) * SCONDA
581 } else {
582 sconda = -One;
583 }
584 }
585
586 l2pert = l2pert && abs(A(1,1)/A(nr,nr))>sqrt(bigRoot);
587 // If there is no violent scaling, artificial perturbation is not needed.
588 //
589 // Phase 3:
590 //
591 if (!(rsvec || lsvec)) {
592 //
593 // Singular Values only
594 //
595 // .. transpose A(1:NR,1:N)
596 for (IndexType p=1; p<=min(n-1, nr); ++p) {
597 A(p,_(p+1,n)) = A(_(p+1,n),p);
598 }
599 //
600 // The following two DO-loops introduce small relative perturbation
601 // into the strict upper triangle of the lower triangular matrix.
602 // Small entries below the main diagonal are also changed.
603 // This modification is useful if the computing environment does not
604 // provide/allow FLUSH TO Zero underflow, for it prevents many
605 // annoying denormalized numbers in case of strongly scaled matrices.
606 // The perturbation is structured so that it does not introduce any
607 // new perturbation of the singular values, and it does not destroy
608 // the job done by the preconditioner.
609 // The licence for this perturbation is in the variable L2PERT, which
610 // should be false if FLUSH TO Zero underflow is active.
611 //
612 if (! almort) {
613
614 if (l2pert) {
615 // XSC = DSQRT(SMALL)
616 xsc = eps / ElementType(n);
617 for (IndexType q=1; q<=nr; ++q) {
618 tmp = xsc*abs(A(q,q));
619 for (IndexType p=1; p<=n, ++p) {
620 if ((p>q && abs(A(p,q))<=tmp) || p<q) {
621 A(p,q) = sign(tmp, A(p,q));
622 }
623 }
624 }
625 } else {
626 A(_(1,nr),_(1,nr)).strictUpper() = Zero;
627 }
628 //
629 // .. second preconditioning using the QR factorization
630 //
631 auto _A = A(_(1,n),_(1,nr));
632 auto _tau = work(_(1,nr));
633 auto _work = work(_(n+1,lWork));
634 qrf(_A, _tau, _work);
635 //
636 // .. and transpose upper to lower triangular
637 for (IndexType p=1; p<=nr-1; ++p) {
638 A(p,_(p+1,nr)) = A(_(p+1,nr),p);
639 }
640
641 }
642 //
643 // Row-cyclic Jacobi SVD algorithm with column pivoting
644 //
645 // .. again some perturbation (a "background noise") is added
646 // to drown denormals
647 if (l2pert) {
648 // XSC = DSQRT(SMALL)
649 xsc = eps / ElementType(n);
650 for (IndexType q=1; q<=nr; ++q) {
651 ElementType tmp = xsc*abs(A(q,q));
652 for (IndexType p=1; p<=nr; ++p) {
653 if (((p>q) && abs(A(p,q))<=tmp) || p<q) {
654 A(p,q) = sign(tmp, A(p,q));
655 }
656 }
657 }
658 } else {
659 A(_(1,nr),_(1,nr)).strictUpper() = Zero;
660 }
661 //
662 // .. and one-sided Jacobi rotations are started on a lower
663 // triangular matrix (plus perturbation which is ignored in
664 // the part which destroys triangular form (confusing?!))
665 //
666 auto _A = A(_(1,nr),_(1,nr));
667 auto _sva = sva(_(1,nr));
668
669 svj(SVJ::Lower, SVJ::NoU, SVJ::NoV, _A, _sva, V, work);
670
671 scaleM = work(1);
672 numRank = nint(work(2));
673
674
675 } else if (rsvec && !lsvec) {
676 //
677 // -> Singular Values and Right Singular Vectors <-
678 //
679 if (almort) {
680 //
681 // .. in this case NR equals N
682 ASSERT(nr==n);
683 for (IndexType p=1; p<=nr; ++p) {
684 V(_(p,n),p) = A(p,_(p,n));
685 }
686
687 auto _V = V(_,_(1,nr));
688 _V.strictUpper() = Zero;
689 svj(SVJ::Lower, SVJ::ComputeU, SVJ::NoV, _V, sva, A, work);
690
691 scaleM = work(1);
692 numRank = nint(work(2));
693
694 } else {
695 //
696 // .. two more QR factorizations ( one QRF is not enough, two require
697 // accumulated product of Jacobi rotations, three are perfect )
698 //
699 A(_(1,nr),_(1,nr)).strictLower() = Zero;
700
701 auto _A = A(_(1,nr),_);
702 auto _tau1 = work(_(1,nr));
703 auto _work1 = work(_(n+1,lWork));
704 lqf(_A, _tau1, _work1);
705
706 auto _V = V(_(1,nr),_(1,nr))
707 _V.lower() = _A(_,_(1,nr)).lower();
708 _V.strictUpper() = Zero;
709
710 auto _tau2 = work(_(n+1,n+nr));
711 auto _work2 = work(_(2*n+1,lWork));
712 qrf(_V, _tau2, _work2);
713
714 for (IndexType p=1; p<=nr; ++p) {
715 V(_(p,nr),p) = V(p,_(p,nr));
716 }
717 _V.strictUpper() = Zero;
718
719 auto _sva = sva(_(1,nr));
720 svj(SVJ::Lower, SVJ::ComputeU, SVJ::NoV, _V, _sva, U, _work1);
721
722 scaleM = work(n+1);
723 numRank = nint(work(n+2));
724 if (nr<n) {
725 V(_(nr+1,n),_(1,nr)) = Zero;
726 V(_(1,nr),_(nr+1,n)) = Zero;
727
728 V(_(nr+1,n),_(nr+1,n)) = Zero;
729 V(_(nr+1,n),_(nr+1,n)).diag(0) = One;
730 }
731
732 ormlq(Left, Trans, _A, _tau1, V, _work1);
733
734 }
735
736 for (IndexType p=1; p<=n; ++p) {
737 A(iwork(p),_) = V(p,_);
738 }
739 V = A;
740
741 if (transp) {
742 U = V;
743 }
744
745 } else if (lsvec && !rsvec) {
746 //
747 // .. Singular Values and Left Singular Vectors ..
748 //
749 // .. second preconditioning step to avoid need to accumulate
750 // Jacobi rotations in the Jacobi iterations.
751
752 auto _U = U(_(1,nr),_(1,nr));
753 auto _tau = work(_(n+1,n+nr));
754 auto _sva = sva(_(1,nr));
755 auto _work1 = work(_(n+1,lWork));
756 auto _work2 = work(_(2*n+1,lWork));
757
758 for (IndexType p=1; p<=nr; ++p) {
759 A(p,_(p,nr)) = U(_(p,nr),p);
760 }
761 _U.strictUpper() = Zero;
762
763 qrf(U(_(1,n),_(1,nr)), _tau, _work2);
764
765 for (IndexType p=1; p<=nr-1; ++p) {
766 U(p,_(p+1,nr)) = U(_(p+1,nr),p);
767 }
768 _U.strictUpper() = Zero;
769
770 svj(SVJ::Lower, SVJ::ComputeU, SVJ::NoV, _U, _sva, A, _work1);
771 scaleM = work(n+1);
772 numRank = nint(work(n+2));
773
774 if (nr<m) {
775 U(_(nr+1,m),_(1,nr)) = Zero;
776 if (nr<nu) {
777 U(_(1,nr),_(nr+1,nu)) = Zero;
778
779 U(_(nr+1,m),_(nr+1,nu)) = Zero;
780 U(_(nr+1,m),_(nr+1,nu)).diag(0) = One;
781 }
782 }
783
784 ormqr(Left, NoTrans, A, work, U);
785
786 if (rowpiv) {
787 auto piv = iwork(_(2*n+1,2*n+m-1));
788 laswp(U, piv.reverse());
789 }
790
791 for (IndexType p=1; p<=nu; ++p) {
792 xsc = One / blas::nrm2(U(_,p));
793 U(_,p) *= xsc;
794 }
795
796 if (transp) {
797 V = U;
798 }
799 //
800 } else {
801 //
802 // .. Full SVD ..
803 //
804 if (!jracc) {
805
806 if (!almort) {
807 //
808 // Second Preconditioning Step (QRF [with pivoting])
809 // Note that the composition of TRANSPOSE, QRF and TRANSPOSE is
810 // equivalent to an LQF CALL. Since in many libraries the QRF
811 // seems to be better optimized than the LQF, we do explicit
812 // transpose and use the QRF. This is subject to changes in an
813 // optimized implementation of DGEJSV.
814 //
815 for (IndexType p=1; p<=nr; ++p) {
816 V(_(p,n),p) = A(p,_(p,n));
817 }
818 //
819 // .. the following two loops perturb small entries to avoid
820 // denormals in the second QR factorization, where they are
821 // as good as zeros. This is done to avoid painfully slow
822 // computation with denormals. The relative size of the
823 // perturbation is a parameter that can be changed by the
824 // implementer. This perturbation device will be obsolete on
825 // machines with properly implemented arithmetic.
826 // To switch it off, set L2PERT=false To remove it from the
827 // code, remove the action under L2PERT=true, leave the ELSE
828 // part. The following two loops should be blocked and fused with
829 // the transposed copy above.
830 //
831 if (l2pert) {
832 xsc = sqrt(small);
833 for (IndexType q=1; q<=nr; ++q) {
834 tmp = xsc*abs(V(q,q));
835 for (IndexType p=1; p<=n; ++p) {
836 if (p>q && abs(V(p,q))<=tmp || p<q) {
837 V(p,q) = sign(tmp, V(p,q));
838 }
839 if (p<q) {
840 V(p,q) = -V(p,q);
841 }
842 }
843 }
844 } else {
845 V(_(1,nr),_(1,nr)).strictUpper() = Zero;
846 }
847 //
848 // Estimate the row scaled condition number of R1
849 // (If R1 is rectangular, N > NR, then the condition number
850 // of the leading NR x NR submatrix is estimated.)
851 //
852 auto _work1 = work(_(2*n+1,2*n+nr*nr));
853 auto _work2 = work(_(2*n+nr*nr+1, 2*n+nr*nr+3*nr));
854 auto _iwork = iwork(_(m+2*n+1,m+2*n+nr));
855 GeMatrixView<ElementType> Work(nr, nr, _work1, nr);
856
857 Work.lower() = V.lower();
858 for (IndexType p=1; p<=nr; ++p) {
859 tmp = blas::nrm2(Work(_(p,nr),p));
860 Work(_(p,nr),p) *= One/tmp;
861 }
862 pocon(Work, One, tmp, _work2, _iwork);
863 condr1 = One / sqrt(tmp);
864 // .. here need a second oppinion on the condition number
865 // .. then assume worst case scenario
866 // R1 is OK for inverse <=> condr1 < DBLE(N)
867 // more conservative <=> condr1 < DSQRT(DBLE(N))
868 //
869 cond_ok = sqrt(ElementType(nr));
870 //[TP] COND_OK is a tuning parameter.
871
872 if (condr1<cond_ok) {
873 // .. the second QRF without pivoting. Note: in an optimized
874 // implementation, this QRF should be implemented as the QRF
875 // of a lower triangular matrix.
876 // R1^t = Q2 * R2
877 auto tau = work(_(n+1,n+nr));
878 auto _work = work(_(2*n+1,lWork));
879
880 qrf(V(_,_(1,nr)), tau, _work);
881
882 if (l2pert) {
883 xsc = sqrt(small) / Eps;
884 for (IndexType p=2; p<=nr; ++p) {
885 for (IndexType q=1; q<=p-1; ++q) {
886 tmp = xsc*min(abs(V(p,p)), abs(V(q,q)));
887 if (abs(V(q,p))<=tmp) {
888 V(q,p) = sign(tmp, V(q,p));
889 }
890 }
891 }
892 }
893 //
894 if (nr!=n) {
895 auto _work = work(_(2*n+1, 2*n+n*nr));
896 GeMatrixView<ElementType> Work(n, nr, _work, n);
897
898 Work = V(_,_(1,nr));
899 }
900 // .. save ...
901 //
902 // .. this transposed copy should be better than naive
903 // TODO: auto _V = V(_(1,nr),_(1,nr));
904 // _V.lower() = transpose(_V.upper());
905 //
906 for (IndexType p=1; p<=nr-1; ++p) {
907 V(_(p+1,nr),p) = V(p,_(p+1,nr));
908 }
909
910 condr2 = condr1;
911
912 } else {
913 //
914 // .. ill-conditioned case: second QRF with pivoting
915 // Note that windowed pivoting would be equaly good
916 // numerically, and more run-time efficient. So, in
917 // an optimal implementation, the next call to DGEQP3
918 // should be replaced with eg. CALL SGEQPX (ACM TOMS #782)
919 // with properly (carefully) chosen parameters.
920 //
921 // R1^t * P2 = Q2 * R2
922 auto _V = V(_,_(1,nr));
923 auto piv = iwork(_(n+1,n+nr));
924 auto tau = work(_(n+1,,n+nr));
925 auto _work = work(_(2*n+1, lWork));
926
927 piv = 0;
928 qp3(_V, piv, tau, _work);
929 if (l2pert) {
930 xsc = sqrt(small);
931 for (IndexType p=2; p<=nr; ++p) {
932 for (IndexType q=1; q<=p-1; ++q) {
933 tmp = xsc*min(abs(V(p,p)), abs(V(q,q)));
934 if (abs(V(q,p))<=tmp) {
935 V(q,p) = sign(tmp, V(q,p));
936 }
937 }
938 }
939 }
940
941 auto _work1 = work(_(2*n+1, 2*n+n*nr));
942 GeMatrixView<ElementType> Work1(n, nr, _work1, n);
943
944 Work1 = V(_,_(1,nr));
945
946 if (l2pert) {
947 xsc = sqrt(small);
948 for (IndexType p=2; p<=nr; ++p) {
949 for (IndexType q=1; q<=p-1; ++q) {
950 tmp = xsc * min(abs(V(p,p)), abs(V(q,q)));
951 V(p,q) = -sign(tmp, V(q,p));
952 }
953 }
954 } else {
955 V(_(1,nr),_(1,nr)).strictLower() = Zero;
956 }
957 // Now, compute R2 = L3 * Q3, the LQ factorization.
958 auto _V = V(_(1,nr),_(1,nr));
959 auto tau = work(_(2*n+n*nr+1,2*n+n*nr+nr));
960 auto _work2 = work(_(2*n+n*nr+nr+1,lWork));
961
962 lqf(_V, tau, _work2);
963 // .. and estimate the condition number
964 auto _work3 = work(_(2*n+n*nr+nr+1,2*n+n*nr+nr+nr*nr));
965 GeMatrixView<ElementType> Work3(nr, nr, _work3, nr);
966
967 Work3.lower() = _V.lower();
968
969 for (IndexType p=1; p<=nr; ++p) {
970 tmp = blas::nrm2(Work3(p,_(1,p)));
971 Work3(p,_(1,p)) *= One/tmp;
972 }
973 auto _work4 = work(_(2*n+n*nr+nr+nr*nr+1,
974 2*n+n*nr+nr+nr*nr+3*nr));
975 auto _iwork = iwork(_(m+2*n+1, m+2*n+nr));
976 pocon(Work3.lower(), One, tmp, _work4, _iwork);
977 condr2 = One / sqrt(tmp);
978
979 if (condr2>=cond_ok) {
980 // .. save the Householder vectors used for Q3
981 // (this overwrittes the copy of R2, as it will not be
982 // needed in this branch, but it does not overwritte the
983 // Huseholder vectors of Q2.).
984 Work1(_(1,nr),_(1,nr)).upper() = V.upper();
985 // .. and the rest of the information on Q3 is in
986 // WORK(2*N+N*NR+1:2*N+N*NR+N)
987 }
988
989 }
990
991 if (l2pert) {
992 xsc = sqrt(small);
993 for (IndexType q=2; q<=nr; ++q) {
994 tmp = xsc * V(q,q);
995 for (IndexType p=1; p<=q-1; ++p) {
996 // V(p,q) = - DSIGN( TEMP1, V(q,p) )
997 V(p,q) = -sign(tmp, V(p,q));
998 }
999 }
1000 } else {
1001 V(_(1,nr),_(1,nr)).strictLower();
1002 }
1003 //
1004 // Second preconditioning finished; continue with Jacobi SVD
1005 // The input matrix is lower triangular.
1006 //
1007 // Recover the right singular vectors as solution of a well
1008 // conditioned triangular matrix equation.
1009 //
1010 if (condr1<cond_ok) {
1011 auto _U = U(_(1,nr),_(1,nr));
1012 auto _V = V(_(1,nr),_(1,nr));
1013 auto _sva = sva(_(1,nr));
1014 auto _work = work(_(2*n+n*nr+nr+1,lWork));
1015
1016 svj(SVJ::Lower, SVJ::ComputeU, SVJ::NoV,
1017 _V, _sva, U, _work);
1018 scaleM = _work(1);
1019 numRank = nint(_work(2));
1020
1021 for (IndexType p=1; p<=nr; ++p) {
1022 _U(_,p) = _V(_,p);
1023 _V(_,p) *= sva(p);
1024 }
1025
1026 // .. pick the right matrix equation and solve it
1027 //
1028 if (nr==n) {
1029 //:)) .. best case, R1 is inverted. The solution of this
1030 // matrix equation is Q2*V2 = the product of the Jacobi
1031 // rotations used in DGESVJ, premultiplied with the
1032 // orthogonal matrix from the second QR factorization.
1033 const auto _A = A(_(1,nr),_(1,nr));
1034 blas::sm(Left, NoTrans, One, _A.upper(), _V);
1035 } else {
1036 // .. R1 is well conditioned, but non-square. Transpose(R2)
1037 // is inverted to get the product of the Jacobi rotations
1038 // used in DGESVJ. The Q-factor from the second QR
1039 // factorization is then built in explicitly.
1040
1041 auto _work = work(_(2*n+1, 2*n+n*nr));
1042 GeMatrixView<ElementType> Work1(nr, nr, _work, n);
1043 GeMatrixView<ElementType> Work(n, nr, _work, n);
1044
1045 blas::sm(Left, NoTrans, One, Work1.upper(), _V);
1046 if (nr<n) {
1047 V(_(nr+1,n),_(1,nr)) = Zero;
1048 V(_(1,nr),_(nr+1,n)) = Zero;
1049 V(_(nr+1,n),_(nr+1,n)) = Zero;
1050 V(_(nr+1,n),_(nr+1,n)).diag(0) = One;
1051 }
1052 auto tau = work(_(n+1,n+nr));
1053 auto _work_ormqr = work(_(2*n+n*nr+nr+1,lWork));
1054 ormqr(Left, NoTrans, Work, tau, V, _work_ormqr);
1055 }
1056 //
1057 } else if (condr2<cond_ok) {
1058 //
1059 //:) .. the input matrix A is very likely a relative of
1060 // the Kahan matrix :)
1061 // The matrix R2 is inverted. The solution of the matrix
1062 // equation is Q3^T*V3 = the product of the Jacobi rotations
1063 // (appplied to the lower triangular L3 from the LQ
1064 // factorization of R2=L3*Q3), pre-multiplied with the
1065 // transposed Q3.
1066 CALL DGESVJ( 'L', 'U', 'N', NR, NR, V, LDV, SVA, NR, U,
1067 $ LDU, work(2*N+N*NR+NR+1), LWORK-2*N-N*NR-NR, INFO )
1068 SCALEM = work(2*N+N*NR+NR+1)
1069 numRank = IDNINT(work(2*N+N*NR+NR+2))
1070 DO 3870 p = 1, NR
1071 CALL DCOPY( NR, V(1,p), 1, U(1,p), 1 )
1072 CALL DSCAL( NR, SVA(p), U(1,p), 1 )
1073 3870 CONTINUE
1074 CALL DTRSM('L','U','N','N',NR,NR,One,work(2*N+1),N,U,LDU)
1075 // .. apply the permutation from the second QR factorization
1076 DO 873 q = 1, NR
1077 DO 872 p = 1, NR
1078 work(2*N+N*NR+NR+iwork(N+p)) = U(p,q)
1079 872 CONTINUE
1080 DO 874 p = 1, NR
1081 U(p,q) = work(2*N+N*NR+NR+p)
1082 874 CONTINUE
1083 873 CONTINUE
1084 if ( NR < N ) {
1085 CALL DLASET( 'A',N-NR,NR,Zero,Zero,V(NR+1,1),LDV )
1086 CALL DLASET( 'A',NR,N-NR,Zero,Zero,V(1,NR+1),LDV )
1087 CALL DLASET( 'A',N-NR,N-NR,Zero,One,V(NR+1,NR+1),LDV )
1088 }
1089 CALL DORMQR( 'L','N',N,N,NR,work(2*N+1),N,work(N+1),
1090 $ V,LDV,work(2*N+N*NR+NR+1),LWORK-2*N-N*NR-NR,IERR )
1091 } else {
1092 // Last line of defense.
1093 //#:( This is a rather pathological case: no scaled condition
1094 // improvement after two pivoted QR factorizations. Other
1095 // possibility is that the rank revealing QR factorization
1096 // or the condition estimator has failed, or the COND_OK
1097 // is set very close to One (which is unnecessary). Normally,
1098 // this branch should never be executed, but in rare cases of
1099 // failure of the RRQR or condition estimator, the last line of
1100 // defense ensures that DGEJSV completes the task.
1101 // Compute the full SVD of L3 using DGESVJ with explicit
1102 // accumulation of Jacobi rotations.
1103 CALL DGESVJ( 'L', 'U', 'V', NR, NR, V, LDV, SVA, NR, U,
1104 $ LDU, work(2*N+N*NR+NR+1), LWORK-2*N-N*NR-NR, INFO )
1105 SCALEM = work(2*N+N*NR+NR+1)
1106 numRank = IDNINT(work(2*N+N*NR+NR+2))
1107 if ( NR < N ) {
1108 CALL DLASET( 'A',N-NR,NR,Zero,Zero,V(NR+1,1),LDV )
1109 CALL DLASET( 'A',NR,N-NR,Zero,Zero,V(1,NR+1),LDV )
1110 CALL DLASET( 'A',N-NR,N-NR,Zero,One,V(NR+1,NR+1),LDV )
1111 }
1112 CALL DORMQR( 'L','N',N,N,NR,work(2*N+1),N,work(N+1),
1113 $ V,LDV,work(2*N+N*NR+NR+1),LWORK-2*N-N*NR-NR,IERR )
1114 //
1115 CALL DORMLQ( 'L', 'T', NR, NR, NR, work(2*N+1), N,
1116 $ work(2*N+N*NR+1), U, LDU, work(2*N+N*NR+NR+1),
1117 $ LWORK-2*N-N*NR-NR, IERR )
1118 DO 773 q = 1, NR
1119 DO 772 p = 1, NR
1120 work(2*N+N*NR+NR+iwork(N+p)) = U(p,q)
1121 772 CONTINUE
1122 DO 774 p = 1, NR
1123 U(p,q) = work(2*N+N*NR+NR+p)
1124 774 CONTINUE
1125 773 CONTINUE
1126 //
1127 }
1128 //
1129 // Permute the rows of V using the (column) permutation from the
1130 // first QRF. Also, scale the columns to make them unit in
1131 // Euclidean norm. This applies to all cases.
1132 //
1133 TEMP1 = DSQRT(DBLE(N)) * EPSLN
1134 DO 1972 q = 1, N
1135 DO 972 p = 1, N
1136 work(2*N+N*NR+NR+iwork(p)) = V(p,q)
1137 972 CONTINUE
1138 DO 973 p = 1, N
1139 V(p,q) = work(2*N+N*NR+NR+p)
1140 973 CONTINUE
1141 XSC = One / DNRM2( N, V(1,q), 1 )
1142 if ( (XSC < (One-TEMP1)) || (XSC>(One+TEMP1)) )
1143 $ CALL DSCAL( N, XSC, V(1,q), 1 )
1144 1972 CONTINUE
1145 // At this moment, V contains the right singular vectors of A.
1146 // Next, assemble the left singular vector matrix U (M x N).
1147 if ( NR < M ) {
1148 CALL DLASET( 'A', M-NR, NR, Zero, Zero, U(NR+1,1), LDU )
1149 if ( NR < N1 ) {
1150 CALL DLASET('A',NR,N1-NR,Zero,Zero,U(1,NR+1),LDU)
1151 CALL DLASET('A',M-NR,N1-NR,Zero,One,U(NR+1,NR+1),LDU)
1152 }
1153 }
1154 //
1155 // The Q matrix from the first QRF is built into the left singular
1156 // matrix U. This applies to all cases.
1157 //
1158 CALL DORMQR( 'Left', 'No_Tr', M, N1, N, A, LDA, work, U,
1159 $ LDU, work(N+1), LWORK-N, IERR )
1160
1161 // The columns of U are normalized. The cost is O(M*N) flops.
1162 TEMP1 = DSQRT(DBLE(M)) * EPSLN
1163 DO 1973 p = 1, NR
1164 XSC = One / DNRM2( M, U(1,p), 1 )
1165 if ( (XSC < (One-TEMP1)) || (XSC>(One+TEMP1)) )
1166 $ CALL DSCAL( M, XSC, U(1,p), 1 )
1167 1973 CONTINUE
1168 //
1169 // If the initial QRF is computed with row pivoting, the left
1170 // singular vectors must be adjusted.
1171 //
1172 if ( ROWPIV )
1173 $ CALL DLASWP( N1, U, LDU, 1, M-1, iwork(2*N+1), -1 )
1174 //
1175 } else {
1176 //
1177 // .. the initial matrix A has almost orthogonal columns and
1178 // the second QRF is not needed
1179 //
1180 CALL DLACPY( 'Upper', N, N, A, LDA, work(N+1), N )
1181 if ( L2PERT ) {
1182 XSC = DSQRT(SMALL)
1183 DO 5970 p = 2, N
1184 TEMP1 = XSC * work( N + (p-1)*N + p )
1185 DO 5971 q = 1, p - 1
1186 work(N+(q-1)*N+p)=-DSIGN(TEMP1,work(N+(p-1)*N+q))
1187 5971 CONTINUE
1188 5970 CONTINUE
1189 } else {
1190 CALL DLASET( 'Lower',N-1,N-1,Zero,Zero,work(N+2),N )
1191 }
1192 //
1193 CALL DGESVJ( 'Upper', 'U', 'N', N, N, work(N+1), N, SVA,
1194 $ N, U, LDU, work(N+N*N+1), LWORK-N-N*N, INFO )
1195 //
1196 SCALEM = work(N+N*N+1)
1197 numRank = IDNINT(work(N+N*N+2))
1198 DO 6970 p = 1, N
1199 CALL DCOPY( N, work(N+(p-1)*N+1), 1, U(1,p), 1 )
1200 CALL DSCAL( N, SVA(p), work(N+(p-1)*N+1), 1 )
1201 6970 CONTINUE
1202 //
1203 CALL DTRSM( 'Left', 'Upper', 'NoTrans', 'No UD', N, N,
1204 $ One, A, LDA, work(N+1), N )
1205 DO 6972 p = 1, N
1206 CALL DCOPY( N, work(N+p), N, V(iwork(p),1), LDV )
1207 6972 CONTINUE
1208 TEMP1 = DSQRT(DBLE(N))*EPSLN
1209 DO 6971 p = 1, N
1210 XSC = One / DNRM2( N, V(1,p), 1 )
1211 if ( (XSC < (One-TEMP1)) || (XSC>(One+TEMP1)) )
1212 $ CALL DSCAL( N, XSC, V(1,p), 1 )
1213 6971 CONTINUE
1214 //
1215 // Assemble the left singular vector matrix U (M x N).
1216 //
1217 if ( N < M ) {
1218 CALL DLASET( 'A', M-N, N, Zero, Zero, U(N+1,1), LDU )
1219 if ( N < N1 ) {
1220 CALL DLASET( 'A',N, N1-N, Zero, Zero, U(1,N+1),LDU )
1221 CALL DLASET( 'A',M-N,N1-N, Zero, One,U(N+1,N+1),LDU )
1222 }
1223 }
1224 CALL DORMQR( 'Left', 'No Tr', M, N1, N, A, LDA, work, U,
1225 $ LDU, work(N+1), LWORK-N, IERR )
1226 TEMP1 = DSQRT(DBLE(M))*EPSLN
1227 DO 6973 p = 1, N1
1228 XSC = One / DNRM2( M, U(1,p), 1 )
1229 if ( (XSC < (One-TEMP1)) || (XSC>(One+TEMP1)) )
1230 $ CALL DSCAL( M, XSC, U(1,p), 1 )
1231 6973 CONTINUE
1232 //
1233 if ( ROWPIV )
1234 $ CALL DLASWP( N1, U, LDU, 1, M-1, iwork(2*N+1), -1 )
1235 //
1236 }
1237 //
1238 // end of the >> almost orthogonal case << in the full SVD
1239 //
1240 } else {
1241 //
1242 // This branch deploys a preconditioned Jacobi SVD with explicitly
1243 // accumulated rotations. It is included as optional, mainly for
1244 // experimental purposes. It does perfom well, and can also be used.
1245 // In this implementation, this branch will be automatically activated
1246 // if the condition number sigma_max(A) / sigma_min(A) is predicted
1247 // to be greater than the overflow threshold. This is because the
1248 // a posteriori computation of the singular vectors assumes robust
1249 // implementation of BLAS and some LAPACK procedures, capable of
1250 // working in presence of extreme values. Since that is not always
1251 // the case, ...
1252 //
1253 DO 7968 p = 1, NR
1254 CALL DCOPY( N-p+1, A(p,p), LDA, V(p,p), 1 )
1255 7968 CONTINUE
1256 //
1257 if ( L2PERT ) {
1258 XSC = DSQRT(SMALL/EPSLN)
1259 for (IndexType q=1; q<=nr; ++q) {
1260 tmp = xsc*abs(V(q,q));
1261 for (IndexType p=1; p<=n; ++p) {
1262 if (p>q && abs(V(p,q))<=tmp || p<q) {
1263 V(p,q) = sign(tmp, V(p,q));
1264 }
1265 if (p<q) {
1266 V(p,q) = - V(p,q);
1267 }
1268 }
1269 }
1270 } else {
1271 CALL DLASET( 'U', NR-1, NR-1, Zero, Zero, V(1,2), LDV )
1272 }
1273
1274 CALL DGEQRF( N, NR, V, LDV, work(N+1), work(2*N+1),
1275 $ LWORK-2*N, IERR )
1276 CALL DLACPY( 'L', N, NR, V, LDV, work(2*N+1), N )
1277 //
1278 DO 7969 p = 1, NR
1279 CALL DCOPY( NR-p+1, V(p,p), LDV, U(p,p), 1 )
1280 7969 CONTINUE
1281
1282 if ( L2PERT ) {
1283 XSC = DSQRT(SMALL/EPSLN)
1284 DO 9970 q = 2, NR
1285 DO 9971 p = 1, q - 1
1286 TEMP1 = XSC * DMIN1(DABS(U(p,p)),DABS(U(q,q)))
1287 U(p,q) = - DSIGN( TEMP1, U(q,p) )
1288 9971 CONTINUE
1289 9970 CONTINUE
1290 } else {
1291 CALL DLASET('U', NR-1, NR-1, Zero, Zero, U(1,2), LDU )
1292 }
1293
1294 CALL DGESVJ( 'G', 'U', 'V', NR, NR, U, LDU, SVA,
1295 $ N, V, LDV, work(2*N+N*NR+1), LWORK-2*N-N*NR, INFO )
1296 SCALEM = work(2*N+N*NR+1)
1297 numRank = IDNINT(work(2*N+N*NR+2))
1298
1299 if ( NR < N ) {
1300 CALL DLASET( 'A',N-NR,NR,Zero,Zero,V(NR+1,1),LDV )
1301 CALL DLASET( 'A',NR,N-NR,Zero,Zero,V(1,NR+1),LDV )
1302 CALL DLASET( 'A',N-NR,N-NR,Zero,One,V(NR+1,NR+1),LDV )
1303 }
1304
1305 CALL DORMQR( 'L','N',N,N,NR,work(2*N+1),N,work(N+1),
1306 $ V,LDV,work(2*N+N*NR+NR+1),LWORK-2*N-N*NR-NR,IERR )
1307 //
1308 // Permute the rows of V using the (column) permutation from the
1309 // first QRF. Also, scale the columns to make them unit in
1310 // Euclidean norm. This applies to all cases.
1311 //
1312 TEMP1 = DSQRT(DBLE(N)) * EPSLN
1313 DO 7972 q = 1, N
1314 DO 8972 p = 1, N
1315 work(2*N+N*NR+NR+iwork(p)) = V(p,q)
1316 8972 CONTINUE
1317 DO 8973 p = 1, N
1318 V(p,q) = work(2*N+N*NR+NR+p)
1319 8973 CONTINUE
1320 XSC = One / DNRM2( N, V(1,q), 1 )
1321 if ( (XSC < (One-TEMP1)) || (XSC>(One+TEMP1)) )
1322 $ CALL DSCAL( N, XSC, V(1,q), 1 )
1323 7972 CONTINUE
1324 //
1325 // At this moment, V contains the right singular vectors of A.
1326 // Next, assemble the left singular vector matrix U (M x N).
1327 //
1328 if (NR<M) {
1329 CALL DLASET( 'A', M-NR, NR, Zero, Zero, U(NR+1,1), LDU )
1330 if (NR<N1) {
1331 CALL DLASET( 'A',NR, N1-NR, Zero, Zero, U(1,NR+1),LDU )
1332 CALL DLASET( 'A',M-NR,N1-NR, Zero, One,U(NR+1,NR+1),LDU )
1333 }
1334 }
1335
1336 CALL DORMQR( 'Left', 'No Tr', M, N1, N, A, LDA, work, U,
1337 $ LDU, work(N+1), LWORK-N, IERR )
1338
1339 if (ROWPIV)
1340 $ CALL DLASWP( N1, U, LDU, 1, M-1, iwork(2*N+1), -1 )
1341
1342
1343 }
1344 if ( TRANSP ) {
1345 // .. swap U and V because the procedure worked on A^t
1346 for (IndexType p=1; p<=n; ++p) {
1347 blas::swap(U(_,p),V(_,p));
1348 }
1349 }
1350
1351 }
1352 // end of the full SVD
1353 //
1354 // Undo scaling, if necessary (and possible)
1355 //
1356 if ( USCAL2 <= (BIG/SVA(1))*USCAL1 ) {
1357 CALL DLASCL( 'G', 0, 0, USCAL1, USCAL2, NR, 1, SVA, N, IERR )
1358 USCAL1 = One
1359 USCAL2 = One
1360 }
1361
1362 if ( NR < N ) {
1363 sva(_(nr+1,n)) = Zero;
1364 }
1365
1366 work(1) = USCAL2 * SCALEM
1367 work(2) = USCAL1
1368 if ( errest ) work(3) = SCONDA
1369 if ( lsvec && rsvec ) {
1370 work(4) = condr1
1371 work(5) = condr2
1372 }
1373 if ( L2TRAN ) {
1374 work(6) = entra
1375 work(7) = entrat
1376 }
1377
1378 iwork(1) = NR
1379 iwork(2) = numRank
1380 iwork(3) = warning
1381 }
1382 */
1383 //== interface for native lapack ===============================================
1384 #ifdef CHECK_CXXLAPACK
1385
1386 template <typename MA, typename VSVA, typename MU, typename MV,
1387 typename VWORK, typename VIWORK>
1388 typename GeMatrix<MA>::IndexType
1389 jsv_native(JSV::Accuracy accuracy,
1390 JSV::JobU jobU,
1391 JSV::JobV jobV,
1392 bool restrictedRange,
1393 bool considerTransA,
1394 bool perturb,
1395 GeMatrix<MA> &A,
1396 DenseVector<VSVA> &sva,
1397 GeMatrix<MU> &U,
1398 GeMatrix<MV> &V,
1399 DenseVector<VWORK> &work,
1400 DenseVector<VIWORK> &iwork)
1401 {
1402 typedef typename GeMatrix<MA>::ElementType T;
1403
1404 const char JOBA = char(accuracy);
1405 const char JOBU = char(jobU);
1406 const char JOBV = char(jobV);
1407 const char JOBR = (restrictedRange) ? 'R' : 'N';
1408 const char JOBT = (considerTransA) ? 'T' : 'N';
1409 const char JOBP = (perturb) ? 'P' : 'N';
1410 const INTEGER M = A.numRows();
1411 const INTEGER N = A.numCols();
1412 const INTEGER LDA = A.leadingDimension();
1413 const INTEGER LDU = U.leadingDimension();
1414 const INTEGER LDV = V.leadingDimension();
1415 const INTEGER LWORK = work.length();
1416 INTEGER INFO;
1417
1418 if (IsSame<T,DOUBLE>::value) {
1419 LAPACK_IMPL(dgejsv)(&JOBA,
1420 &JOBU,
1421 &JOBV,
1422 &JOBR,
1423 &JOBT,
1424 &JOBP,
1425 &M,
1426 &N,
1427 A.data(),
1428 &LDA,
1429 sva.data(),
1430 U.data(),
1431 &LDU,
1432 V.data(),
1433 &LDV,
1434 work.data(),
1435 &LWORK,
1436 iwork.data(),
1437 &INFO);
1438 } else {
1439 ASSERT(0);
1440 }
1441 return INFO;
1442 }
1443
1444 #endif // CHECK_CXXLAPACK
1445
1446
1447 //== public interface ==========================================================
1448 template <typename MA, typename VSVA, typename MU, typename MV,
1449 typename VWORK, typename VIWORK>
1450 typename GeMatrix<MA>::IndexType
1451 jsv(JSV::Accuracy accuracy,
1452 JSV::JobU jobU,
1453 JSV::JobV jobV,
1454 bool restrictedRange,
1455 bool considerTransA,
1456 bool perturb,
1457 GeMatrix<MA> &A,
1458 DenseVector<VSVA> &sva,
1459 GeMatrix<MU> &U,
1460 GeMatrix<MV> &V,
1461 DenseVector<VWORK> &work,
1462 DenseVector<VIWORK> &iwork)
1463 {
1464 //
1465 // Test the input parameters
1466 //
1467 # ifndef NDEBUG
1468 typedef typename GeMatrix<MA>::IndexType IndexType;
1469
1470 ASSERT(A.firstRow()==1);
1471 ASSERT(A.firstCol()==1);
1472
1473 const IndexType m = A.numRows();
1474 const IndexType n = A.numCols();
1475
1476 ASSERT(m>=n);
1477
1478 ASSERT(sva.firstIndex()==1);
1479 ASSERT(sva.length()==n);
1480
1481 ASSERT(U.firstRow()==1);
1482 ASSERT(U.firstCol()==1);
1483
1484 ASSERT(V.firstRow()==1);
1485 ASSERT(V.firstCol()==1);
1486
1487 ASSERT(iwork.length()==m+3*n);
1488 # endif
1489
1490 //
1491 // Make copies of output arguments
1492 //
1493 # ifdef CHECK_CXXLAPACK
1494 typename GeMatrix<MA>::NoView A_org = A;
1495 typename DenseVector<VSVA>::NoView sva_org = sva;
1496 typename GeMatrix<MU>::NoView U_org = U;
1497 typename GeMatrix<MV>::NoView V_org = V;
1498 typename DenseVector<VWORK>::NoView work_org = work;
1499 # endif
1500
1501 //
1502 // Call implementation
1503 //
1504 /*
1505 IndexType info = jsv_generic(accuracy, jobU, jobV,
1506 restrictedRange, considerTransA, perturb,
1507 A, sva, U, V, work);
1508 */
1509 IndexType info = jsv_native(accuracy, jobU, jobV,
1510 restrictedRange, considerTransA, perturb,
1511 A, sva, U, V, work, iwork);
1512
1513 # ifdef CHECK_CXXLAPACK
1514 //
1515 // Make copies of results computed by the generic implementation
1516 //
1517 typename GeMatrix<MA>::NoView A_generic = A;
1518 typename DenseVector<VSVA>::NoView sva_generic = sva;
1519 typename GeMatrix<MU>::NoView U_generic = U;
1520 typename GeMatrix<MV>::NoView V_generic = V;
1521 typename DenseVector<VWORK>::NoView work_generic = work;
1522 //
1523 // restore output arguments
1524 //
1525 A = A_org;
1526 sva = sva_org;
1527 U = U_org;
1528 V = V_org;
1529 work = work_org;
1530 //
1531 // Compare generic results with results from the native implementation
1532 //
1533 IndexType _info = jsv_native(accuracy, jobU, jobV,
1534 restrictedRange, considerTransA, perturb,
1535 A, sva, U, V, work, iwork);
1536 bool failed = false;
1537 if (! isIdentical(A_generic, A, "A_generic", "A")) {
1538 std::cerr << "CXXLAPACK: A_generic = " << A_generic << std::endl;
1539 std::cerr << "F77LAPACK: A = " << A << std::endl;
1540 failed = true;
1541 }
1542 if (! isIdentical(sva_generic, sva, "sva_generic", "sva")) {
1543 std::cerr << "CXXLAPACK: sva_generic = " << sva_generic << std::endl;
1544 std::cerr << "F77LAPACK: sva = " << sva << std::endl;
1545 failed = true;
1546 }
1547 if (! isIdentical(U_generic, U, "U_generic", "U")) {
1548 std::cerr << "CXXLAPACK: U_generic = " << U_generic << std::endl;
1549 std::cerr << "F77LAPACK: U = " << U << std::endl;
1550 failed = true;
1551 }
1552 if (! isIdentical(V_generic, V, "V_generic", "V")) {
1553 std::cerr << "CXXLAPACK: V_generic = " << V_generic << std::endl;
1554 std::cerr << "F77LAPACK: V = " << V << std::endl;
1555 failed = true;
1556 }
1557 if (! isIdentical(work_generic, work, "work_generic", "work")) {
1558 std::cerr << "CXXLAPACK: work_generic = " << work_generic << std::endl;
1559 std::cerr << "F77LAPACK: work = " << work << std::endl;
1560 failed = true;
1561 }
1562 if (! isIdentical(info, _info, "info", "_info")) {
1563 std::cerr << "CXXLAPACK: info = " << info << std::endl;
1564 std::cerr << "F77LAPACK: _info = " << _info << std::endl;
1565 failed = true;
1566 }
1567
1568 if (failed) {
1569 std::cerr << "error in: jsv.tcc " << std::endl;
1570 ASSERT(0);
1571 } else {
1572 std::cerr << "passed: jsv.tcc " << std::endl;
1573 }
1574 # endif
1575
1576 return info;
1577 }
1578
1579 //-- forwarding ----------------------------------------------------------------
1580 template <typename MA, typename VSVA, typename MU, typename MV,
1581 typename VWORK, typename VIWORK>
1582 typename MA::IndexType
1583 jsv(JSV::Accuracy accuracy,
1584 JSV::JobU jobU,
1585 JSV::JobV jobV,
1586 bool restrictedRange,
1587 bool considerTransA,
1588 bool perturb,
1589 MA &&A,
1590 VSVA &&sva,
1591 MU &&U,
1592 MV &&V,
1593 VWORK &&work,
1594 VIWORK &&iwork)
1595 {
1596 typename MA::IndexType info;
1597
1598 CHECKPOINT_ENTER;
1599 info = jsv(accuracy, jobU, jobV,
1600 restrictedRange, considerTransA, perturb,
1601 A, sva, U, V, work, iwork);
1602 CHECKPOINT_LEAVE;
1603
1604 return info;
1605 }
1606
1607 } } // namespace lapack, flens
1608
1609 #endif // FLENS_LAPACK_SVD_JSV_TCC