1 /*
2 * Copyright (c) 2012, Michael Lehn
3 *
4 * All rights reserved.
5 *
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions
8 * are met:
9 *
10 * 1) Redistributions of source code must retain the above copyright
11 * notice, this list of conditions and the following disclaimer.
12 * 2) Redistributions in binary form must reproduce the above copyright
13 * notice, this list of conditions and the following disclaimer in
14 * the documentation and/or other materials provided with the
15 * distribution.
16 * 3) Neither the name of the FLENS development group nor the names of
17 * its contributors may be used to endorse or promote products derived
18 * from this software without specific prior written permission.
19 *
20 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
21 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
22 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
23 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
24 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
25 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
26 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
27 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
28 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
29 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
30 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
31 */
32
33 /* Based on
34 *
35 SUBROUTINE DGSVJ0( JOBV, M, N, A, LDA, D, SVA, MV, V, LDV, EPS,
36 $ SFMIN, TOL, NSWEEP, WORK, LWORK, INFO )
37 *
38 * -- LAPACK routine (version 3.3.1) --
39 *
40 * -- Contributed by Zlatko Drmac of the University of Zagreb and --
41 * -- Kresimir Veselic of the Fernuniversitaet Hagen --
42 * -- April 2011 --
43 *
44 * -- LAPACK is a software package provided by Univ. of Tennessee, --
45 * -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
46 *
47 * This routine is also part of SIGMA (version 1.23, October 23. 2008.)
48 * SIGMA is a library of algorithms for highly accurate algorithms for
49 * computation of SVD, PSVD, QSVD, (H,K)-SVD, and for solution of the
50 * eigenvalue problems Hx = lambda M x, H M x = lambda x with H, M > 0.
51 *
52 */
53
54 #ifndef FLENS_LAPACK_SVD_SVJ0_TCC
55 #define FLENS_LAPACK_SVD_SVJ0_TCC 1
56
57 #include <flens/blas/blas.h>
58 #include <flens/lapack/lapack.h>
59
60 namespace flens { namespace lapack {
61
62 //== generic lapack implementation =============================================
63
64 template <typename MA, typename VD, typename VSVA, typename MV, typename VWORK>
65 typename GeMatrix<MA>::IndexType
66 svj0_generic(SVJ::JobV jobV,
67 GeMatrix<MA> &A,
68 DenseVector<VD> &d,
69 DenseVector<VSVA> &sva,
70 GeMatrix<MV> &V,
71 const typename GeMatrix<MA>::ElementType &eps,
72 const typename GeMatrix<MA>::ElementType &safeMin,
73 const typename GeMatrix<MA>::ElementType &tol,
74 typename GeMatrix<MA>::IndexType nSweep,
75 DenseVector<VWORK> &work)
76 {
77 using std::abs;
78 using std::max;
79 using std::min;
80 using std::sqrt;
81 using std::swap;
82
83 typedef typename GeMatrix<MA>::ElementType ElementType;
84 typedef typename GeMatrix<MA>::IndexType IndexType;
85
86 const ElementType Zero(0), Half(0.5), One(1);
87
88 const Underscore<IndexType> _;
89
90 ElementType fastr_data[5];
91 DenseVectorView<ElementType>
92 fastr = typename DenseVectorView<ElementType>::Engine(5, fastr_data);
93
94 const IndexType m = A.numRows();
95 const IndexType n = A.numCols();
96
97 auto _work = work(_(1,m));
98
99 const bool applyV = (jobV==SVJ::ApplyV);
100 const bool rhsVec = (jobV==SVJ::ComputeV) || applyV;
101
102 const ElementType rootEps = sqrt(eps);
103 const ElementType rootSafeMin = sqrt(safeMin);
104 const ElementType small = safeMin / eps;
105 const ElementType big = One / safeMin;
106 const ElementType rootBig = One / rootSafeMin;
107 const ElementType bigTheta = One / rootEps;
108 const ElementType rootTol = sqrt(tol);
109
110 IndexType info = 0;
111 //
112 // -#- Row-cyclic Jacobi SVD algorithm with column pivoting -#-
113 //
114 const IndexType emptsw = n*(n-1)/2;
115 IndexType notRot = 0;
116 fastr(1) = Zero;
117 //
118 // -#- Row-cyclic pivot strategy with de Rijk's pivoting -#-
119 //
120
121 IndexType swBand = 0;
122 //[TP] SWBAND is a tuning parameter. It is meaningful and effective
123 // if SGESVJ is used as a computational routine in the preconditioned
124 // Jacobi SVD algorithm SGESVJ. For sweeps i=1:SWBAND the procedure
125 // ......
126
127 const IndexType kbl = min(IndexType(8),n);
128 //[TP] KBL is a tuning parameter that defines the tile size in the
129 // tiling of the p-q loops of pivot pairs. In general, an optimal
130 // value of KBL depends on the matrix dimensions and on the
131 // parameters of the computer's memory.
132 //
133 IndexType nbl = n / kbl;
134 if (nbl*kbl!=n) {
135 ++nbl;
136 }
137
138 const IndexType blSkip = pow(kbl,2) + 1;
139 //[TP] BLKSKIP is a tuning parameter that depends on SWBAND and KBL.
140
141 const IndexType rowSkip = min(IndexType(5),kbl);
142 //[TP] ROWSKIP is a tuning parameter.
143
144 const IndexType lkAhead = 1;
145 //[TP] LKAHEAD is a tuning parameter.
146 IndexType pSkipped = 0;
147
148 ElementType aapp, aapp0, aaqq, aapq, aqoap, apoaq;
149 ElementType tmp, cs, sn, t, theta, thetaSign;
150
151 bool rotOk;
152 bool converged = false;
153
154 for (IndexType i=1; i<=nSweep; ++i) {
155 // .. go go go ...
156 //
157 ElementType max_aapq = Zero;
158 ElementType max_sinj = Zero;
159
160 IndexType iswRot = 0;
161
162 pSkipped = 0;
163 notRot = 0;
164
165 for (IndexType ibr=1; ibr<=nbl; ++ibr) {
166
167 IndexType igl = (ibr-1)*kbl + 1;
168
169 for (IndexType ir1=0; ir1<=min(lkAhead, nbl-ibr); ++ir1) {
170
171 igl += ir1*kbl;
172
173 for (IndexType p=igl; p<=min(igl+kbl-1,n-1); ++p) {
174
175 // .. de Rijk's pivoting
176 IndexType q = blas::iamax(sva(_(p,n))) + p - 1;
177 if (p!=q) {
178 blas::swap(A(_,p), A(_,q));
179 if (rhsVec) {
180 blas::swap(V(_,p), V(_,q));
181 }
182 swap(sva(p),sva(q));
183 swap(d(p),d(q));
184 }
185
186 if (ir1==0) {
187 //
188 // Column norms are periodically updated by explicit
189 // norm computation.
190 // Caveat:
191 // Some BLAS implementations compute DNRM2(M,A(1,p),1)
192 // as DSQRT(DDOT(M,A(1,p),1,A(1,p),1)), which may result in
193 // overflow for ||A(:,p)||_2 > DSQRT(overflow_threshold), and
194 // undeflow for ||A(:,p)||_2 < DSQRT(underflow_threshold).
195 // Hence, DNRM2 cannot be trusted, not even in the case when
196 // the true norm is far from the under(over)flow boundaries.
197 // If properly implemented DNRM2 is available, the IF-THEN-ELSE
198 // below should read "AAPP = DNRM2( M, A(1,p), 1 ) * D(p)".
199 //
200 if (sva(p)<rootBig && sva(p)>rootSafeMin) {
201 sva(p) = blas::nrm2(A(_,p))*d(p);
202 } else {
203 ElementType tmp = Zero;
204 aapp = One;
205 lassq(A(_,p), tmp, aapp);
206 sva(p) = tmp*sqrt(aapp)*d(p);
207 }
208 aapp = sva(p);
209 } else {
210 aapp = sva(p);
211 }
212
213 if (aapp>Zero) {
214
215 pSkipped = 0;
216
217 for (IndexType q=p+1; q<=min(igl+kbl-1,n); ++q) {
218
219 aaqq = sva(q);
220
221 if (aaqq>Zero) {
222
223 aapp0 = aapp;
224 if (aaqq>=One) {
225 rotOk = small*aapp<=aaqq;
226 if (aapp<big/aaqq) {
227 aapq = A(_,p)*A(_,q)*d(p)*d(q)/aaqq/aapp;
228 } else {
229 _work = A(_,p);
230 lascl(LASCL::FullMatrix, 0, 0,
231 aapp, d(p), _work);
232 aapq = _work*A(_,q)*d(q)/aaqq;
233 }
234 } else {
235 rotOk = aapp<=aaqq/small;
236 if (aapp>small/aaqq) {
237 aapq = A(_,p)*A(_,q)*d(p)*d(q)/aaqq/aapp;
238 } else {
239 _work = A(_,q);
240 lascl(LASCL::FullMatrix, 0, 0,
241 aaqq, d(q), _work);
242 aapq = _work*A(_,p)*d(p)/aapp;
243 }
244 }
245 //
246 max_aapq = max(max_aapq, abs(aapq));
247 //
248 // TO rotate or NOT to rotate, THAT is the question ...
249 //
250 if (abs(aapq)>tol) {
251 //
252 // .. rotate
253 // ROTATED = ROTATED + ONE
254 //
255 if (ir1==0) {
256 notRot = 0;
257 pSkipped = 0;
258 ++iswRot;
259 }
260
261 if (rotOk) {
262
263 aqoap = aaqq / aapp;
264 apoaq = aapp / aaqq;
265 theta = -Half*abs(aqoap-apoaq)/aapq;
266
267 if (abs(theta)>bigTheta) {
268
269 t = Half / theta;
270 fastr(3) = t*d(p)/d(q);
271 fastr(4) = -t*d(q)/d(p);
272 blas::rotm(A(_,p), A(_,q), fastr);
273 if (rhsVec) {
274 blas::rotm(V(_,p), V(_,q), fastr);
275 }
276 sva(q) = aaqq
277 *sqrt(max(Zero, One+t*apoaq*aapq));
278 aapp *= sqrt(max(Zero, One-t*aqoap*aapq));
279 max_sinj = max(max_sinj, abs(t));
280
281 } else {
282 //
283 // .. choose correct signum for THETA and rotate
284 //
285 thetaSign = -sign(One, aapq);
286 t = One
287 / (theta+thetaSign*sqrt(One+theta*theta));
288 cs = sqrt(One/(One+t*t));
289 sn = t*cs;
290
291 max_sinj = max(max_sinj, abs(sn));
292 sva(q) = aaqq
293 * sqrt(max(Zero, One+t*apoaq*aapq));
294 aapp *= sqrt(max(Zero, One-t*aqoap*aapq));
295
296 apoaq = d(p) / d(q);
297 aqoap = d(q) / d(p);
298 if (d(p)>=One) {
299 if (d(q)>=One) {
300 fastr(3) = t*apoaq;
301 fastr(4) = -t*aqoap;
302 d(p) *= cs;
303 d(q) *= cs;
304 blas::rotm(A(_,p), A(_,q), fastr);
305 if (rhsVec) {
306 blas::rotm(V(_,p), V(_,q), fastr);
307 }
308 } else {
309 A(_,p) -= t*aqoap*A(_,q);
310 A(_,q) += cs*sn*apoaq*A(_,p);
311 d(p) *= cs;
312 d(q) /= cs;
313 if (rhsVec) {
314 V(_,p) -= t*aqoap*V(_,q);
315 V(_,q) += cs*sn*apoaq*V(_,p);
316 }
317 }
318 } else {
319 if (d(q)>=One) {
320 A(_,q) += t*apoaq*A(_,p);
321 A(_,p) -= cs*sn*aqoap*A(_,q);
322 d(p) /= cs;
323 d(q) *= cs;
324 if (rhsVec) {
325 V(_,q) += t*apoaq*V(_,p);
326 V(_,p) -= cs*sn*aqoap*V(_,q);
327 }
328 } else {
329 if (d(p)>=d(q)) {
330 A(_,p) -= t*aqoap*A(_,q);
331 A(_,q) += cs*sn*apoaq*A(_,p);
332 d(p) *= cs;
333 d(q) /= cs;
334 if (rhsVec) {
335 V(_,p) -= t*aqoap*V(_,q);
336 V(_,q) += cs*sn*apoaq*V(_,p);
337 }
338 } else {
339 A(_,q) += t*apoaq*A(_,p);
340 A(_,p) -= cs*sn*aqoap*A(_,q);
341 d(p) /= cs;
342 d(q) *= cs;
343 if (rhsVec) {
344 V(_,q) += t*apoaq*V(_,p);
345 V(_,p) -= cs*sn*aqoap*V(_,q);
346 }
347 }
348 }
349 }
350 }
351 //
352 } else {
353 // .. have to use modified Gram-Schmidt like transformation
354 _work = A(_,p);
355 lascl(LASCL::FullMatrix, 0, 0,
356 aapp, One, _work);
357 lascl(LASCL::FullMatrix, 0, 0,
358 aaqq, One, A(_,q));
359 tmp = -aapq*d(p)/d(q);
360 A(_,q) += tmp*_work;
361 lascl(LASCL::FullMatrix, 0, 0,
362 One, aaqq, A(_,q));
363 sva(q) = aaqq*sqrt(max(Zero,One-aapq*aapq));
364 max_sinj = max(max_sinj, safeMin);
365 }
366 // END IF ROTOK THEN ... ELSE
367 //
368 // In the case of cancellation in updating SVA(q), SVA(p)
369 // recompute SVA(q), SVA(p).
370 if (pow(sva(q)/aaqq,2)<=rootEps) {
371 if (aaqq<rootBig && aaqq>rootSafeMin) {
372 sva(q) = blas::nrm2(A(_,q))*d(q);
373 } else {
374 t = Zero;
375 aaqq = One;
376 lassq(A(_,q), t, aaqq);
377 sva(q) = t*sqrt(aaqq)*d(q);
378 }
379 }
380 if (aapp/aapp0<=rootEps) {
381 if (aapp<rootBig && aapp>rootSafeMin) {
382 aapp = blas::nrm2(A(_,p))*d(p);
383 } else {
384 t = Zero;
385 aapp = One;
386 lassq(A(_,p), t, aapp);
387 aapp = t*sqrt(aapp)*d(p);
388 }
389 sva(p) = aapp;
390 }
391 //
392 } else {
393 // A(:,p) and A(:,q) already numerically orthogonal
394 if (ir1==0) {
395 ++notRot;
396 }
397 ++pSkipped;
398 }
399 } else {
400 // A(:,q) is zero column
401 if (ir1==0) {
402 ++notRot;
403 }
404 ++pSkipped;
405 }
406
407 if (i<=swBand && pSkipped>rowSkip) {
408 if (ir1==0) {
409 aapp = -aapp;
410 }
411 notRot = 0;
412 break;
413 }
414
415 }
416 // END q-LOOP
417 //
418
419 // bailed out of q-loop
420
421 sva(p) = aapp;
422
423 } else {
424 sva(p) = aapp;
425 if (ir1==0 && aapp==Zero) {
426 notRot += min(igl+kbl-1,n) - p;
427 }
428 }
429
430 }
431 // end of the p-loop
432 // end of doing the block ( ibr, ibr )
433 }
434 // end of ir1-loop
435 //
436 //........................................................
437 //... go to the off diagonal blocks
438 //
439 igl = (ibr-1)*kbl + 1;
440
441 for (IndexType jbc=ibr+1; jbc<=nbl; ++jbc) {
442
443 IndexType jgl = (jbc-1)*kbl + 1;
444 //
445 // doing the block at ( ibr, jbc )
446 //
447 IndexType ijblsk = 0;
448 for (IndexType p=igl; p<=min(igl+kbl-1,n); ++p) {
449
450 aapp = sva(p);
451
452 if (aapp>Zero) {
453
454 pSkipped = 0;
455
456 for (IndexType q=jgl; q<=min(jgl+kbl-1,n); ++q) {
457
458 aaqq = sva(q);
459
460 if (aaqq>Zero) {
461 aapp0 = aapp;
462 //
463 // -#- M x 2 Jacobi SVD -#-
464 //
465 // -#- Safe Gram matrix computation -#-
466 //
467 if (aaqq>=One) {
468 if (aapp>=aaqq) {
469 rotOk = small*aapp<=aaqq;
470 } else {
471 rotOk = small*aaqq<=aapp;
472 }
473 if (aapp<big/aaqq) {
474 aapq = A(_,p)*A(_,q)*d(p)*d(q)/aaqq/aapp;
475 } else {
476 _work = A(_,p);
477 lascl(LASCL::FullMatrix, 0, 0,
478 aapp, d(p), _work);
479 aapq = _work*A(_,q)*d(q)/aaqq;
480 }
481 } else {
482 if (aapp>=aaqq) {
483 rotOk = aapp<=aaqq/small;
484 } else {
485 rotOk = aaqq<=aapp/small;
486 }
487 if (aapp>small/aaqq) {
488 aapq = A(_,p)*A(_,q)*d(p)*d(q)/aaqq/aapp;
489 } else {
490 _work = A(_,q);
491 lascl(LASCL::FullMatrix, 0, 0,
492 aaqq, d(q), _work);
493 aapq = _work*A(_,p)*d(p)/aapp;
494 }
495 }
496
497 max_aapq = max(max_aapq, abs(aapq));
498 //
499 // TO rotate or NOT to rotate, THAT is the question ...
500 //
501 if (abs(aapq)>tol) {
502 notRot = 0;
503 // ROTATED = ROTATED + 1
504 pSkipped = 0;
505 ++iswRot;
506
507 if (rotOk) {
508
509 aqoap = aaqq / aapp;
510 apoaq = aapp / aaqq;
511 theta = -Half*abs(aqoap-apoaq)/aapq;
512 if (aaqq>aapp0) {
513 theta = -theta;
514 }
515
516 if (abs(theta)>bigTheta) {
517 t = Half / theta;
518 fastr(3) = t*d(p)/d(q);
519 fastr(4) = -t*d(q)/d(p);
520 blas::rotm(A(_,p), A(_,q), fastr);
521 if (rhsVec) {
522 blas::rotm(V(_,p), V(_,q), fastr);
523 }
524 sva(q) = aaqq
525 *sqrt(max(Zero, One+t*apoaq*aapq));
526 aapp *= sqrt(max(Zero, One-t*aqoap*aapq));
527 max_sinj = max(max_sinj, abs(t));
528 } else {
529 //
530 // .. choose correct signum for THETA and rotate
531 //
532 thetaSign = -sign(One, aapq);
533 if (aaqq>aapp0) {
534 thetaSign = -thetaSign;
535 }
536 t = One
537 / (theta+thetaSign*sqrt(One+theta*theta));
538 cs = sqrt(One/(One+t*t));
539 sn = t*cs;
540 max_sinj = max(max_sinj, abs(sn));
541 sva(q) = aaqq
542 *sqrt(max(Zero, One+t*apoaq*aapq));
543 aapp *= sqrt(max(Zero, One-t*aqoap*aapq));
544
545 apoaq = d(p) / d(q);
546 aqoap = d(q) / d(p);
547 if (d(p)>=One) {
548
549 if (d(q)>=One) {
550 fastr(3) = t*apoaq;
551 fastr(4) = -t*aqoap;
552 d(p) *= cs;
553 d(q) *= cs;
554 blas::rotm(A(_,p), A(_,q), fastr);
555 if (rhsVec) {
556 blas::rotm(V(_,p), V(_,q), fastr);
557 }
558 } else {
559 A(_,p) -= t*aqoap*A(_,q);
560 A(_,q) += cs*sn*apoaq*A(_,p);
561 if (rhsVec) {
562 V(_,p) -= t*aqoap*V(_,q);
563 V(_,q) += cs*sn*apoaq*V(_,p);
564 }
565 d(p) *= cs;
566 d(q) /= cs;
567 }
568 } else {
569 if (d(q)>=One) {
570 A(_,q) += t*apoaq*A(_,p);
571 A(_,p) -= cs*sn*aqoap*A(_,q);
572 if (rhsVec) {
573 V(_,q) += t*apoaq*V(_,p);
574 V(_,p) -= cs*sn*aqoap*V(_,q);
575 }
576 d(p) /= cs;
577 d(q) *= cs;
578 } else {
579 if (d(p)>=d(q)) {
580 A(_,p) -= t*aqoap*A(_,q);
581 A(_,q) += cs*sn*apoaq*A(_,p);
582 d(p) *= cs;
583 d(q) /= cs;
584 if (rhsVec) {
585 V(_,p) -= t*aqoap*V(_,q);
586 V(_,q) += cs*sn*apoaq*V(_,p);
587 }
588 } else {
589 A(_,q) += t*apoaq*A(_,p);
590 A(_,p) -= cs*sn*aqoap*A(_,q);
591 d(p) /= cs;
592 d(q) *= cs;
593 if (rhsVec) {
594 V(_,q) += t*apoaq*V(_,p);
595 V(_,p) -= cs*sn*aqoap*V(_,q);
596 }
597 }
598 }
599 }
600 }
601
602 } else {
603 if (aapp>aaqq) {
604 _work = A(_,p);
605 lascl(LASCL::FullMatrix, 0, 0,
606 aapp, One, _work);
607 lascl(LASCL::FullMatrix, 0, 0,
608 aaqq, One, A(_,q));
609 tmp = -aapq*d(p)/d(q);
610 A(_,q) += tmp*_work;
611 lascl(LASCL::FullMatrix, 0, 0,
612 One, aaqq, A(_,q));
613 sva(q) = aaqq*sqrt(max(Zero, One-aapq*aapq));
614 max_sinj = max(max_sinj, safeMin);
615 } else {
616 _work = A(_,q);
617 lascl(LASCL::FullMatrix, 0, 0,
618 aaqq, One, _work);
619 lascl(LASCL::FullMatrix, 0, 0,
620 aapp, One, A(_,p));
621 tmp = -aapq*d(q)/d(p);
622 A(_,p) += tmp*_work;
623 lascl(LASCL::FullMatrix, 0, 0,
624 One, aapp, A(_,p));
625 sva(p) = aapp*sqrt(max(Zero,One-aapq*aapq));
626 max_sinj = max(max_sinj, safeMin);
627 }
628 }
629 // END IF ROTOK THEN ... ELSE
630 //
631 // In the case of cancellation in updating SVA(q)
632 // .. recompute SVA(q)
633 if (pow(sva(q)/aaqq, 2)<=rootEps) {
634 if (aaqq<rootBig && aaqq>rootSafeMin) {
635 sva(q) = blas::nrm2(A(_,q))*d(q);
636 } else {
637 t = Zero;
638 aaqq = One;
639 lassq(A(_,q), t, aaqq);
640 sva(q) = t*sqrt(aaqq)*d(q);
641 }
642 }
643 if (pow(aapp/aapp0, 2)<=rootEps) {
644 if (aapp<rootBig && aapp>rootSafeMin) {
645 aapp = blas::nrm2(A(_,p))*d(p);
646 } else {
647 t = Zero;
648 aapp = One;
649 lassq(A(_,p), t, aapp);
650 aapp = t*sqrt(aapp)*d(p);
651 }
652 sva(p) = aapp;
653 }
654 // end of OK rotation
655 } else {
656 ++notRot;
657 ++pSkipped;
658 ++ijblsk;
659 }
660 } else {
661 ++notRot;
662 ++pSkipped;
663 ++ijblsk;
664 }
665
666 if (i<=swBand && ijblsk>=blSkip) {
667 sva(p) = aapp;
668 notRot = 0;
669 goto jbcLoopExit;
670 }
671 if (i<=swBand && pSkipped>rowSkip) {
672 aapp = -aapp;
673 notRot = 0;
674 break;
675 }
676
677 }
678 // end of the q-loop
679
680 sva(p) = aapp;
681
682 } else {
683 if (aapp==Zero) {
684 notRot += min(jgl+kbl-1, n) -jgl + 1;
685 }
686 if (aapp<Zero) {
687 notRot = 0;
688 }
689 }
690
691 }
692 // end of the p-loop
693 }
694 // end of the jbc-loop
695 jbcLoopExit:
696 //2011 bailed out of the jbc-loop
697 for (IndexType p=igl; p<=min(igl+kbl-1,n); ++p) {
698 sva(p) = abs(sva(p));
699 }
700
701 }
702 //2000 :: end of the ibr-loop
703 //
704 // .. update SVA(N)
705 if (sva(n)<rootBig && sva(n)>rootSafeMin) {
706 sva(n) = blas::nrm2(A(_,n))*d(n);
707 } else {
708 ElementType t = Zero;
709 aapp = One;
710 lassq(A(_,n), t, aapp);
711 sva(n) = t*sqrt(aapp)*d(n);
712 }
713 //
714 // Additional steering devices
715 //
716 if (i<swBand && (max_aapq<=rootTol || iswRot<=n)) {
717 swBand = i;
718 }
719
720 if (i>swBand+1 && max_aapq<n*tol && n*max_aapq*max_sinj<tol) {
721 converged = true;
722 break;
723 }
724
725 if (notRot>=emptsw) {
726 converged = true;
727 break;
728 }
729
730 }
731 // end i=1:NSWEEP loop
732 if (converged) {
733 //#:) Reaching this point means that during the i-th sweep all pivots were
734 // below the given tolerance, causing early exit.
735 info = 0;
736 } else {
737 //#:) Reaching this point means that the procedure has comleted the given
738 // number of iterations.
739 info = nSweep - 1;
740 }
741 //
742 // Sort the vector D.
743 for (IndexType p=1; p<=n-1; ++p) {
744 IndexType q = blas::iamax(sva(_(p,n))) + p - 1;
745 if (p!=q) {
746 swap(sva(p), sva(q));
747 swap(d(p), d(q));
748 blas::swap(A(_,p), A(_,q));
749 if (rhsVec) {
750 blas::swap(V(_,p), V(_,q));
751 }
752 }
753 }
754 return info;
755 }
756
757 //== interface for native lapack ===============================================
758 #ifdef CHECK_CXXLAPACK
759
760 template <typename MA, typename VD, typename VSVA, typename MV, typename VWORK>
761 typename GeMatrix<MA>::IndexType
762 svj0_native(SVJ::JobV jobV,
763 GeMatrix<MA> &A,
764 DenseVector<VD> &d,
765 DenseVector<VSVA> &sva,
766 GeMatrix<MV> &V,
767 const typename GeMatrix<MA>::ElementType &eps,
768 const typename GeMatrix<MA>::ElementType &safeMin,
769 const typename GeMatrix<MA>::ElementType &tol,
770 typename GeMatrix<MA>::IndexType nSweep,
771 DenseVector<VWORK> &work)
772 {
773 typedef typename GeMatrix<MA>::ElementType T;
774
775 const char JOBV = char(jobV);
776 const INTEGER M = A.numRows();
777 const INTEGER N = A.numCols();
778 const INTEGER LDA = A.leadingDimension();
779 const INTEGER _MV = V.numRows();
780 const INTEGER LDV = V.leadingDimension();
781 const DOUBLE EPS = eps;
782 const DOUBLE SFMIN = safeMin;
783 const DOUBLE TOL = tol;
784 const INTEGER NSWEEP = nSweep;
785 const INTEGER LWORK = work.length();
786 INTEGER INFO;
787
788 if (IsSame<T,DOUBLE>::value) {
789 LAPACK_DECL(dgsvj0)(&JOBV,
790 &M,
791 &N,
792 A.data(),
793 &LDA,
794 d.data(),
795 sva.data(),
796 &_MV,
797 V.data(),
798 &LDV,
799 &EPS,
800 &SFMIN,
801 &TOL,
802 &NSWEEP,
803 work.data(),
804 &LWORK,
805 &INFO);
806 } else {
807 ASSERT(0);
808 }
809 ASSERT(INFO>=0);
810 return INFO;
811 }
812
813 #endif // CHECK_CXXLAPACK
814
815 //== public interface ==========================================================
816 template <typename MA, typename VD, typename VSVA, typename MV, typename VWORK>
817 typename GeMatrix<MA>::IndexType
818 svj0(SVJ::JobV jobV,
819 GeMatrix<MA> &A,
820 DenseVector<VD> &d,
821 DenseVector<VSVA> &sva,
822 GeMatrix<MV> &V,
823 const typename GeMatrix<MA>::ElementType &eps,
824 const typename GeMatrix<MA>::ElementType &safeMin,
825 const typename GeMatrix<MA>::ElementType &tol,
826 typename GeMatrix<MA>::IndexType nSweep,
827 DenseVector<VWORK> &work)
828 {
829 using std::max;
830 using std::min;
831
832 //
833 // Test the input parameters
834 //
835 # ifndef NDEBUG
836 typedef typename GeMatrix<MA>::IndexType IndexType;
837
838 ASSERT(A.firstRow()==1);
839 ASSERT(A.firstCol()==1);
840
841 const IndexType m = A.numRows();
842 const IndexType n = A.numCols();
843
844 ASSERT(m>=n);
845
846 ASSERT(d.firstIndex()==1);
847 ASSERT(d.length()==n);
848
849 ASSERT(sva.firstIndex()==1);
850 ASSERT(sva.length()==n);
851
852 ASSERT(V.firstRow()==1);
853 ASSERT(V.firstCol()==1);
854
855 if (jobV==SVJ::ComputeV) {
856 ASSERT(V.numCols()==n);
857 ASSERT(V.numRows()==n);
858 }
859 if (jobV==SVJ::ApplyV) {
860 ASSERT(V.numCols()==n);
861 }
862
863 if (work.length()>0) {
864 ASSERT(work.length()>=m);
865 }
866 # endif
867 //
868 // Make copies of output arguments
869 //
870 # ifdef CHECK_CXXLAPACK
871 typename GeMatrix<MA>::NoView A_org = A;
872 typename DenseVector<VD>::NoView d_org = d;
873 typename DenseVector<VSVA>::NoView sva_org = sva;
874 typename GeMatrix<MV>::NoView V_org = V;
875 typename DenseVector<VWORK>::NoView work_org = work;
876 # endif
877
878 //
879 // Call implementation
880 //
881 IndexType info = svj0_generic(jobV, A, d, sva, V, eps, safeMin, tol,
882 nSweep, work);
883
884 # ifdef CHECK_CXXLAPACK
885 //
886 // Make copies of results computed by the generic implementation
887 //
888 typename GeMatrix<MA>::NoView A_generic = A;
889 typename DenseVector<VD>::NoView d_generic = d;
890 typename DenseVector<VSVA>::NoView sva_generic = sva;
891 typename GeMatrix<MV>::NoView V_generic = V;
892 typename DenseVector<VWORK>::NoView work_generic = work;
893 //
894 // restore output arguments
895 //
896 A = A_org;
897 d = d_org;
898 sva = sva_org;
899 V = V_org;
900 work = work_org;
901 //
902 // Compare generic results with results from the native implementation
903 //
904 IndexType _info = svj0_native(jobV, A, d, sva, V, eps, safeMin, tol,
905 nSweep, work);
906
907 bool failed = false;
908 if (! isIdentical(A_generic, A, "A_generic", "A")) {
909 std::cerr << "CXXLAPACK: A_generic = " << A_generic << std::endl;
910 std::cerr << "F77LAPACK: A = " << A << std::endl;
911 failed = true;
912 }
913 if (! isIdentical(d_generic, d, "d_generic", "d")) {
914 std::cerr << "CXXLAPACK: d_generic = " << d_generic << std::endl;
915 std::cerr << "F77LAPACK: d = " << d << std::endl;
916 failed = true;
917 }
918 if (! isIdentical(sva_generic, sva, "sva_generic", "sva")) {
919 std::cerr << "CXXLAPACK: sva_generic = " << sva_generic << std::endl;
920 std::cerr << "F77LAPACK: sva = " << sva << std::endl;
921 failed = true;
922 }
923 if (! isIdentical(V_generic, V, "V_generic", "V")) {
924 std::cerr << "CXXLAPACK: V_generic = " << V_generic << std::endl;
925 std::cerr << "F77LAPACK: V = " << V << std::endl;
926 failed = true;
927 }
928 if (! isIdentical(work_generic, work, "work_generic", "work")) {
929 std::cerr << "CXXLAPACK: work_generic = " << work_generic << std::endl;
930 std::cerr << "F77LAPACK: work = " << work << std::endl;
931 failed = true;
932 }
933 if (! isIdentical(info, _info, "info", "_info")) {
934 std::cerr << "CXXLAPACK: info = " << info << std::endl;
935 std::cerr << "F77LAPACK: _info = " << _info << std::endl;
936 failed = true;
937 }
938
939 if (failed) {
940 std::cerr << "error in: svj0.tcc" << std::endl;
941 ASSERT(0);
942 } else {
943 //std::cerr << "passed: svj0.tcc" << std::endl;
944 }
945 # endif
946
947 return info;
948 }
949
950 //-- forwarding ----------------------------------------------------------------
951 template <typename MA, typename VD, typename VSVA, typename MV, typename VWORK>
952 typename MA::IndexType
953 svj0(SVJ::JobV jobV,
954 MA &&A,
955 VD &&d,
956 VSVA &&sva,
957 MV &&V,
958 const typename MA::ElementType &eps,
959 const typename MA::ElementType &safeMin,
960 const typename MA::ElementType &tol,
961 typename MA::IndexType nSweep,
962 VWORK &&work)
963 {
964 typename MA::IndexType info;
965
966 CHECKPOINT_ENTER;
967 info = svj0(jobV, A, d, sva, V, eps, safeMin, tol, nSweep, work);
968 CHECKPOINT_LEAVE;
969
970 return info;
971 }
972
973 } } // namespace lapack, flens
974
975 #endif // FLENS_LAPACK_SVD_SVJ0_TCC
2 * Copyright (c) 2012, Michael Lehn
3 *
4 * All rights reserved.
5 *
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions
8 * are met:
9 *
10 * 1) Redistributions of source code must retain the above copyright
11 * notice, this list of conditions and the following disclaimer.
12 * 2) Redistributions in binary form must reproduce the above copyright
13 * notice, this list of conditions and the following disclaimer in
14 * the documentation and/or other materials provided with the
15 * distribution.
16 * 3) Neither the name of the FLENS development group nor the names of
17 * its contributors may be used to endorse or promote products derived
18 * from this software without specific prior written permission.
19 *
20 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
21 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
22 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
23 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
24 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
25 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
26 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
27 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
28 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
29 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
30 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
31 */
32
33 /* Based on
34 *
35 SUBROUTINE DGSVJ0( JOBV, M, N, A, LDA, D, SVA, MV, V, LDV, EPS,
36 $ SFMIN, TOL, NSWEEP, WORK, LWORK, INFO )
37 *
38 * -- LAPACK routine (version 3.3.1) --
39 *
40 * -- Contributed by Zlatko Drmac of the University of Zagreb and --
41 * -- Kresimir Veselic of the Fernuniversitaet Hagen --
42 * -- April 2011 --
43 *
44 * -- LAPACK is a software package provided by Univ. of Tennessee, --
45 * -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
46 *
47 * This routine is also part of SIGMA (version 1.23, October 23. 2008.)
48 * SIGMA is a library of algorithms for highly accurate algorithms for
49 * computation of SVD, PSVD, QSVD, (H,K)-SVD, and for solution of the
50 * eigenvalue problems Hx = lambda M x, H M x = lambda x with H, M > 0.
51 *
52 */
53
54 #ifndef FLENS_LAPACK_SVD_SVJ0_TCC
55 #define FLENS_LAPACK_SVD_SVJ0_TCC 1
56
57 #include <flens/blas/blas.h>
58 #include <flens/lapack/lapack.h>
59
60 namespace flens { namespace lapack {
61
62 //== generic lapack implementation =============================================
63
64 template <typename MA, typename VD, typename VSVA, typename MV, typename VWORK>
65 typename GeMatrix<MA>::IndexType
66 svj0_generic(SVJ::JobV jobV,
67 GeMatrix<MA> &A,
68 DenseVector<VD> &d,
69 DenseVector<VSVA> &sva,
70 GeMatrix<MV> &V,
71 const typename GeMatrix<MA>::ElementType &eps,
72 const typename GeMatrix<MA>::ElementType &safeMin,
73 const typename GeMatrix<MA>::ElementType &tol,
74 typename GeMatrix<MA>::IndexType nSweep,
75 DenseVector<VWORK> &work)
76 {
77 using std::abs;
78 using std::max;
79 using std::min;
80 using std::sqrt;
81 using std::swap;
82
83 typedef typename GeMatrix<MA>::ElementType ElementType;
84 typedef typename GeMatrix<MA>::IndexType IndexType;
85
86 const ElementType Zero(0), Half(0.5), One(1);
87
88 const Underscore<IndexType> _;
89
90 ElementType fastr_data[5];
91 DenseVectorView<ElementType>
92 fastr = typename DenseVectorView<ElementType>::Engine(5, fastr_data);
93
94 const IndexType m = A.numRows();
95 const IndexType n = A.numCols();
96
97 auto _work = work(_(1,m));
98
99 const bool applyV = (jobV==SVJ::ApplyV);
100 const bool rhsVec = (jobV==SVJ::ComputeV) || applyV;
101
102 const ElementType rootEps = sqrt(eps);
103 const ElementType rootSafeMin = sqrt(safeMin);
104 const ElementType small = safeMin / eps;
105 const ElementType big = One / safeMin;
106 const ElementType rootBig = One / rootSafeMin;
107 const ElementType bigTheta = One / rootEps;
108 const ElementType rootTol = sqrt(tol);
109
110 IndexType info = 0;
111 //
112 // -#- Row-cyclic Jacobi SVD algorithm with column pivoting -#-
113 //
114 const IndexType emptsw = n*(n-1)/2;
115 IndexType notRot = 0;
116 fastr(1) = Zero;
117 //
118 // -#- Row-cyclic pivot strategy with de Rijk's pivoting -#-
119 //
120
121 IndexType swBand = 0;
122 //[TP] SWBAND is a tuning parameter. It is meaningful and effective
123 // if SGESVJ is used as a computational routine in the preconditioned
124 // Jacobi SVD algorithm SGESVJ. For sweeps i=1:SWBAND the procedure
125 // ......
126
127 const IndexType kbl = min(IndexType(8),n);
128 //[TP] KBL is a tuning parameter that defines the tile size in the
129 // tiling of the p-q loops of pivot pairs. In general, an optimal
130 // value of KBL depends on the matrix dimensions and on the
131 // parameters of the computer's memory.
132 //
133 IndexType nbl = n / kbl;
134 if (nbl*kbl!=n) {
135 ++nbl;
136 }
137
138 const IndexType blSkip = pow(kbl,2) + 1;
139 //[TP] BLKSKIP is a tuning parameter that depends on SWBAND and KBL.
140
141 const IndexType rowSkip = min(IndexType(5),kbl);
142 //[TP] ROWSKIP is a tuning parameter.
143
144 const IndexType lkAhead = 1;
145 //[TP] LKAHEAD is a tuning parameter.
146 IndexType pSkipped = 0;
147
148 ElementType aapp, aapp0, aaqq, aapq, aqoap, apoaq;
149 ElementType tmp, cs, sn, t, theta, thetaSign;
150
151 bool rotOk;
152 bool converged = false;
153
154 for (IndexType i=1; i<=nSweep; ++i) {
155 // .. go go go ...
156 //
157 ElementType max_aapq = Zero;
158 ElementType max_sinj = Zero;
159
160 IndexType iswRot = 0;
161
162 pSkipped = 0;
163 notRot = 0;
164
165 for (IndexType ibr=1; ibr<=nbl; ++ibr) {
166
167 IndexType igl = (ibr-1)*kbl + 1;
168
169 for (IndexType ir1=0; ir1<=min(lkAhead, nbl-ibr); ++ir1) {
170
171 igl += ir1*kbl;
172
173 for (IndexType p=igl; p<=min(igl+kbl-1,n-1); ++p) {
174
175 // .. de Rijk's pivoting
176 IndexType q = blas::iamax(sva(_(p,n))) + p - 1;
177 if (p!=q) {
178 blas::swap(A(_,p), A(_,q));
179 if (rhsVec) {
180 blas::swap(V(_,p), V(_,q));
181 }
182 swap(sva(p),sva(q));
183 swap(d(p),d(q));
184 }
185
186 if (ir1==0) {
187 //
188 // Column norms are periodically updated by explicit
189 // norm computation.
190 // Caveat:
191 // Some BLAS implementations compute DNRM2(M,A(1,p),1)
192 // as DSQRT(DDOT(M,A(1,p),1,A(1,p),1)), which may result in
193 // overflow for ||A(:,p)||_2 > DSQRT(overflow_threshold), and
194 // undeflow for ||A(:,p)||_2 < DSQRT(underflow_threshold).
195 // Hence, DNRM2 cannot be trusted, not even in the case when
196 // the true norm is far from the under(over)flow boundaries.
197 // If properly implemented DNRM2 is available, the IF-THEN-ELSE
198 // below should read "AAPP = DNRM2( M, A(1,p), 1 ) * D(p)".
199 //
200 if (sva(p)<rootBig && sva(p)>rootSafeMin) {
201 sva(p) = blas::nrm2(A(_,p))*d(p);
202 } else {
203 ElementType tmp = Zero;
204 aapp = One;
205 lassq(A(_,p), tmp, aapp);
206 sva(p) = tmp*sqrt(aapp)*d(p);
207 }
208 aapp = sva(p);
209 } else {
210 aapp = sva(p);
211 }
212
213 if (aapp>Zero) {
214
215 pSkipped = 0;
216
217 for (IndexType q=p+1; q<=min(igl+kbl-1,n); ++q) {
218
219 aaqq = sva(q);
220
221 if (aaqq>Zero) {
222
223 aapp0 = aapp;
224 if (aaqq>=One) {
225 rotOk = small*aapp<=aaqq;
226 if (aapp<big/aaqq) {
227 aapq = A(_,p)*A(_,q)*d(p)*d(q)/aaqq/aapp;
228 } else {
229 _work = A(_,p);
230 lascl(LASCL::FullMatrix, 0, 0,
231 aapp, d(p), _work);
232 aapq = _work*A(_,q)*d(q)/aaqq;
233 }
234 } else {
235 rotOk = aapp<=aaqq/small;
236 if (aapp>small/aaqq) {
237 aapq = A(_,p)*A(_,q)*d(p)*d(q)/aaqq/aapp;
238 } else {
239 _work = A(_,q);
240 lascl(LASCL::FullMatrix, 0, 0,
241 aaqq, d(q), _work);
242 aapq = _work*A(_,p)*d(p)/aapp;
243 }
244 }
245 //
246 max_aapq = max(max_aapq, abs(aapq));
247 //
248 // TO rotate or NOT to rotate, THAT is the question ...
249 //
250 if (abs(aapq)>tol) {
251 //
252 // .. rotate
253 // ROTATED = ROTATED + ONE
254 //
255 if (ir1==0) {
256 notRot = 0;
257 pSkipped = 0;
258 ++iswRot;
259 }
260
261 if (rotOk) {
262
263 aqoap = aaqq / aapp;
264 apoaq = aapp / aaqq;
265 theta = -Half*abs(aqoap-apoaq)/aapq;
266
267 if (abs(theta)>bigTheta) {
268
269 t = Half / theta;
270 fastr(3) = t*d(p)/d(q);
271 fastr(4) = -t*d(q)/d(p);
272 blas::rotm(A(_,p), A(_,q), fastr);
273 if (rhsVec) {
274 blas::rotm(V(_,p), V(_,q), fastr);
275 }
276 sva(q) = aaqq
277 *sqrt(max(Zero, One+t*apoaq*aapq));
278 aapp *= sqrt(max(Zero, One-t*aqoap*aapq));
279 max_sinj = max(max_sinj, abs(t));
280
281 } else {
282 //
283 // .. choose correct signum for THETA and rotate
284 //
285 thetaSign = -sign(One, aapq);
286 t = One
287 / (theta+thetaSign*sqrt(One+theta*theta));
288 cs = sqrt(One/(One+t*t));
289 sn = t*cs;
290
291 max_sinj = max(max_sinj, abs(sn));
292 sva(q) = aaqq
293 * sqrt(max(Zero, One+t*apoaq*aapq));
294 aapp *= sqrt(max(Zero, One-t*aqoap*aapq));
295
296 apoaq = d(p) / d(q);
297 aqoap = d(q) / d(p);
298 if (d(p)>=One) {
299 if (d(q)>=One) {
300 fastr(3) = t*apoaq;
301 fastr(4) = -t*aqoap;
302 d(p) *= cs;
303 d(q) *= cs;
304 blas::rotm(A(_,p), A(_,q), fastr);
305 if (rhsVec) {
306 blas::rotm(V(_,p), V(_,q), fastr);
307 }
308 } else {
309 A(_,p) -= t*aqoap*A(_,q);
310 A(_,q) += cs*sn*apoaq*A(_,p);
311 d(p) *= cs;
312 d(q) /= cs;
313 if (rhsVec) {
314 V(_,p) -= t*aqoap*V(_,q);
315 V(_,q) += cs*sn*apoaq*V(_,p);
316 }
317 }
318 } else {
319 if (d(q)>=One) {
320 A(_,q) += t*apoaq*A(_,p);
321 A(_,p) -= cs*sn*aqoap*A(_,q);
322 d(p) /= cs;
323 d(q) *= cs;
324 if (rhsVec) {
325 V(_,q) += t*apoaq*V(_,p);
326 V(_,p) -= cs*sn*aqoap*V(_,q);
327 }
328 } else {
329 if (d(p)>=d(q)) {
330 A(_,p) -= t*aqoap*A(_,q);
331 A(_,q) += cs*sn*apoaq*A(_,p);
332 d(p) *= cs;
333 d(q) /= cs;
334 if (rhsVec) {
335 V(_,p) -= t*aqoap*V(_,q);
336 V(_,q) += cs*sn*apoaq*V(_,p);
337 }
338 } else {
339 A(_,q) += t*apoaq*A(_,p);
340 A(_,p) -= cs*sn*aqoap*A(_,q);
341 d(p) /= cs;
342 d(q) *= cs;
343 if (rhsVec) {
344 V(_,q) += t*apoaq*V(_,p);
345 V(_,p) -= cs*sn*aqoap*V(_,q);
346 }
347 }
348 }
349 }
350 }
351 //
352 } else {
353 // .. have to use modified Gram-Schmidt like transformation
354 _work = A(_,p);
355 lascl(LASCL::FullMatrix, 0, 0,
356 aapp, One, _work);
357 lascl(LASCL::FullMatrix, 0, 0,
358 aaqq, One, A(_,q));
359 tmp = -aapq*d(p)/d(q);
360 A(_,q) += tmp*_work;
361 lascl(LASCL::FullMatrix, 0, 0,
362 One, aaqq, A(_,q));
363 sva(q) = aaqq*sqrt(max(Zero,One-aapq*aapq));
364 max_sinj = max(max_sinj, safeMin);
365 }
366 // END IF ROTOK THEN ... ELSE
367 //
368 // In the case of cancellation in updating SVA(q), SVA(p)
369 // recompute SVA(q), SVA(p).
370 if (pow(sva(q)/aaqq,2)<=rootEps) {
371 if (aaqq<rootBig && aaqq>rootSafeMin) {
372 sva(q) = blas::nrm2(A(_,q))*d(q);
373 } else {
374 t = Zero;
375 aaqq = One;
376 lassq(A(_,q), t, aaqq);
377 sva(q) = t*sqrt(aaqq)*d(q);
378 }
379 }
380 if (aapp/aapp0<=rootEps) {
381 if (aapp<rootBig && aapp>rootSafeMin) {
382 aapp = blas::nrm2(A(_,p))*d(p);
383 } else {
384 t = Zero;
385 aapp = One;
386 lassq(A(_,p), t, aapp);
387 aapp = t*sqrt(aapp)*d(p);
388 }
389 sva(p) = aapp;
390 }
391 //
392 } else {
393 // A(:,p) and A(:,q) already numerically orthogonal
394 if (ir1==0) {
395 ++notRot;
396 }
397 ++pSkipped;
398 }
399 } else {
400 // A(:,q) is zero column
401 if (ir1==0) {
402 ++notRot;
403 }
404 ++pSkipped;
405 }
406
407 if (i<=swBand && pSkipped>rowSkip) {
408 if (ir1==0) {
409 aapp = -aapp;
410 }
411 notRot = 0;
412 break;
413 }
414
415 }
416 // END q-LOOP
417 //
418
419 // bailed out of q-loop
420
421 sva(p) = aapp;
422
423 } else {
424 sva(p) = aapp;
425 if (ir1==0 && aapp==Zero) {
426 notRot += min(igl+kbl-1,n) - p;
427 }
428 }
429
430 }
431 // end of the p-loop
432 // end of doing the block ( ibr, ibr )
433 }
434 // end of ir1-loop
435 //
436 //........................................................
437 //... go to the off diagonal blocks
438 //
439 igl = (ibr-1)*kbl + 1;
440
441 for (IndexType jbc=ibr+1; jbc<=nbl; ++jbc) {
442
443 IndexType jgl = (jbc-1)*kbl + 1;
444 //
445 // doing the block at ( ibr, jbc )
446 //
447 IndexType ijblsk = 0;
448 for (IndexType p=igl; p<=min(igl+kbl-1,n); ++p) {
449
450 aapp = sva(p);
451
452 if (aapp>Zero) {
453
454 pSkipped = 0;
455
456 for (IndexType q=jgl; q<=min(jgl+kbl-1,n); ++q) {
457
458 aaqq = sva(q);
459
460 if (aaqq>Zero) {
461 aapp0 = aapp;
462 //
463 // -#- M x 2 Jacobi SVD -#-
464 //
465 // -#- Safe Gram matrix computation -#-
466 //
467 if (aaqq>=One) {
468 if (aapp>=aaqq) {
469 rotOk = small*aapp<=aaqq;
470 } else {
471 rotOk = small*aaqq<=aapp;
472 }
473 if (aapp<big/aaqq) {
474 aapq = A(_,p)*A(_,q)*d(p)*d(q)/aaqq/aapp;
475 } else {
476 _work = A(_,p);
477 lascl(LASCL::FullMatrix, 0, 0,
478 aapp, d(p), _work);
479 aapq = _work*A(_,q)*d(q)/aaqq;
480 }
481 } else {
482 if (aapp>=aaqq) {
483 rotOk = aapp<=aaqq/small;
484 } else {
485 rotOk = aaqq<=aapp/small;
486 }
487 if (aapp>small/aaqq) {
488 aapq = A(_,p)*A(_,q)*d(p)*d(q)/aaqq/aapp;
489 } else {
490 _work = A(_,q);
491 lascl(LASCL::FullMatrix, 0, 0,
492 aaqq, d(q), _work);
493 aapq = _work*A(_,p)*d(p)/aapp;
494 }
495 }
496
497 max_aapq = max(max_aapq, abs(aapq));
498 //
499 // TO rotate or NOT to rotate, THAT is the question ...
500 //
501 if (abs(aapq)>tol) {
502 notRot = 0;
503 // ROTATED = ROTATED + 1
504 pSkipped = 0;
505 ++iswRot;
506
507 if (rotOk) {
508
509 aqoap = aaqq / aapp;
510 apoaq = aapp / aaqq;
511 theta = -Half*abs(aqoap-apoaq)/aapq;
512 if (aaqq>aapp0) {
513 theta = -theta;
514 }
515
516 if (abs(theta)>bigTheta) {
517 t = Half / theta;
518 fastr(3) = t*d(p)/d(q);
519 fastr(4) = -t*d(q)/d(p);
520 blas::rotm(A(_,p), A(_,q), fastr);
521 if (rhsVec) {
522 blas::rotm(V(_,p), V(_,q), fastr);
523 }
524 sva(q) = aaqq
525 *sqrt(max(Zero, One+t*apoaq*aapq));
526 aapp *= sqrt(max(Zero, One-t*aqoap*aapq));
527 max_sinj = max(max_sinj, abs(t));
528 } else {
529 //
530 // .. choose correct signum for THETA and rotate
531 //
532 thetaSign = -sign(One, aapq);
533 if (aaqq>aapp0) {
534 thetaSign = -thetaSign;
535 }
536 t = One
537 / (theta+thetaSign*sqrt(One+theta*theta));
538 cs = sqrt(One/(One+t*t));
539 sn = t*cs;
540 max_sinj = max(max_sinj, abs(sn));
541 sva(q) = aaqq
542 *sqrt(max(Zero, One+t*apoaq*aapq));
543 aapp *= sqrt(max(Zero, One-t*aqoap*aapq));
544
545 apoaq = d(p) / d(q);
546 aqoap = d(q) / d(p);
547 if (d(p)>=One) {
548
549 if (d(q)>=One) {
550 fastr(3) = t*apoaq;
551 fastr(4) = -t*aqoap;
552 d(p) *= cs;
553 d(q) *= cs;
554 blas::rotm(A(_,p), A(_,q), fastr);
555 if (rhsVec) {
556 blas::rotm(V(_,p), V(_,q), fastr);
557 }
558 } else {
559 A(_,p) -= t*aqoap*A(_,q);
560 A(_,q) += cs*sn*apoaq*A(_,p);
561 if (rhsVec) {
562 V(_,p) -= t*aqoap*V(_,q);
563 V(_,q) += cs*sn*apoaq*V(_,p);
564 }
565 d(p) *= cs;
566 d(q) /= cs;
567 }
568 } else {
569 if (d(q)>=One) {
570 A(_,q) += t*apoaq*A(_,p);
571 A(_,p) -= cs*sn*aqoap*A(_,q);
572 if (rhsVec) {
573 V(_,q) += t*apoaq*V(_,p);
574 V(_,p) -= cs*sn*aqoap*V(_,q);
575 }
576 d(p) /= cs;
577 d(q) *= cs;
578 } else {
579 if (d(p)>=d(q)) {
580 A(_,p) -= t*aqoap*A(_,q);
581 A(_,q) += cs*sn*apoaq*A(_,p);
582 d(p) *= cs;
583 d(q) /= cs;
584 if (rhsVec) {
585 V(_,p) -= t*aqoap*V(_,q);
586 V(_,q) += cs*sn*apoaq*V(_,p);
587 }
588 } else {
589 A(_,q) += t*apoaq*A(_,p);
590 A(_,p) -= cs*sn*aqoap*A(_,q);
591 d(p) /= cs;
592 d(q) *= cs;
593 if (rhsVec) {
594 V(_,q) += t*apoaq*V(_,p);
595 V(_,p) -= cs*sn*aqoap*V(_,q);
596 }
597 }
598 }
599 }
600 }
601
602 } else {
603 if (aapp>aaqq) {
604 _work = A(_,p);
605 lascl(LASCL::FullMatrix, 0, 0,
606 aapp, One, _work);
607 lascl(LASCL::FullMatrix, 0, 0,
608 aaqq, One, A(_,q));
609 tmp = -aapq*d(p)/d(q);
610 A(_,q) += tmp*_work;
611 lascl(LASCL::FullMatrix, 0, 0,
612 One, aaqq, A(_,q));
613 sva(q) = aaqq*sqrt(max(Zero, One-aapq*aapq));
614 max_sinj = max(max_sinj, safeMin);
615 } else {
616 _work = A(_,q);
617 lascl(LASCL::FullMatrix, 0, 0,
618 aaqq, One, _work);
619 lascl(LASCL::FullMatrix, 0, 0,
620 aapp, One, A(_,p));
621 tmp = -aapq*d(q)/d(p);
622 A(_,p) += tmp*_work;
623 lascl(LASCL::FullMatrix, 0, 0,
624 One, aapp, A(_,p));
625 sva(p) = aapp*sqrt(max(Zero,One-aapq*aapq));
626 max_sinj = max(max_sinj, safeMin);
627 }
628 }
629 // END IF ROTOK THEN ... ELSE
630 //
631 // In the case of cancellation in updating SVA(q)
632 // .. recompute SVA(q)
633 if (pow(sva(q)/aaqq, 2)<=rootEps) {
634 if (aaqq<rootBig && aaqq>rootSafeMin) {
635 sva(q) = blas::nrm2(A(_,q))*d(q);
636 } else {
637 t = Zero;
638 aaqq = One;
639 lassq(A(_,q), t, aaqq);
640 sva(q) = t*sqrt(aaqq)*d(q);
641 }
642 }
643 if (pow(aapp/aapp0, 2)<=rootEps) {
644 if (aapp<rootBig && aapp>rootSafeMin) {
645 aapp = blas::nrm2(A(_,p))*d(p);
646 } else {
647 t = Zero;
648 aapp = One;
649 lassq(A(_,p), t, aapp);
650 aapp = t*sqrt(aapp)*d(p);
651 }
652 sva(p) = aapp;
653 }
654 // end of OK rotation
655 } else {
656 ++notRot;
657 ++pSkipped;
658 ++ijblsk;
659 }
660 } else {
661 ++notRot;
662 ++pSkipped;
663 ++ijblsk;
664 }
665
666 if (i<=swBand && ijblsk>=blSkip) {
667 sva(p) = aapp;
668 notRot = 0;
669 goto jbcLoopExit;
670 }
671 if (i<=swBand && pSkipped>rowSkip) {
672 aapp = -aapp;
673 notRot = 0;
674 break;
675 }
676
677 }
678 // end of the q-loop
679
680 sva(p) = aapp;
681
682 } else {
683 if (aapp==Zero) {
684 notRot += min(jgl+kbl-1, n) -jgl + 1;
685 }
686 if (aapp<Zero) {
687 notRot = 0;
688 }
689 }
690
691 }
692 // end of the p-loop
693 }
694 // end of the jbc-loop
695 jbcLoopExit:
696 //2011 bailed out of the jbc-loop
697 for (IndexType p=igl; p<=min(igl+kbl-1,n); ++p) {
698 sva(p) = abs(sva(p));
699 }
700
701 }
702 //2000 :: end of the ibr-loop
703 //
704 // .. update SVA(N)
705 if (sva(n)<rootBig && sva(n)>rootSafeMin) {
706 sva(n) = blas::nrm2(A(_,n))*d(n);
707 } else {
708 ElementType t = Zero;
709 aapp = One;
710 lassq(A(_,n), t, aapp);
711 sva(n) = t*sqrt(aapp)*d(n);
712 }
713 //
714 // Additional steering devices
715 //
716 if (i<swBand && (max_aapq<=rootTol || iswRot<=n)) {
717 swBand = i;
718 }
719
720 if (i>swBand+1 && max_aapq<n*tol && n*max_aapq*max_sinj<tol) {
721 converged = true;
722 break;
723 }
724
725 if (notRot>=emptsw) {
726 converged = true;
727 break;
728 }
729
730 }
731 // end i=1:NSWEEP loop
732 if (converged) {
733 //#:) Reaching this point means that during the i-th sweep all pivots were
734 // below the given tolerance, causing early exit.
735 info = 0;
736 } else {
737 //#:) Reaching this point means that the procedure has comleted the given
738 // number of iterations.
739 info = nSweep - 1;
740 }
741 //
742 // Sort the vector D.
743 for (IndexType p=1; p<=n-1; ++p) {
744 IndexType q = blas::iamax(sva(_(p,n))) + p - 1;
745 if (p!=q) {
746 swap(sva(p), sva(q));
747 swap(d(p), d(q));
748 blas::swap(A(_,p), A(_,q));
749 if (rhsVec) {
750 blas::swap(V(_,p), V(_,q));
751 }
752 }
753 }
754 return info;
755 }
756
757 //== interface for native lapack ===============================================
758 #ifdef CHECK_CXXLAPACK
759
760 template <typename MA, typename VD, typename VSVA, typename MV, typename VWORK>
761 typename GeMatrix<MA>::IndexType
762 svj0_native(SVJ::JobV jobV,
763 GeMatrix<MA> &A,
764 DenseVector<VD> &d,
765 DenseVector<VSVA> &sva,
766 GeMatrix<MV> &V,
767 const typename GeMatrix<MA>::ElementType &eps,
768 const typename GeMatrix<MA>::ElementType &safeMin,
769 const typename GeMatrix<MA>::ElementType &tol,
770 typename GeMatrix<MA>::IndexType nSweep,
771 DenseVector<VWORK> &work)
772 {
773 typedef typename GeMatrix<MA>::ElementType T;
774
775 const char JOBV = char(jobV);
776 const INTEGER M = A.numRows();
777 const INTEGER N = A.numCols();
778 const INTEGER LDA = A.leadingDimension();
779 const INTEGER _MV = V.numRows();
780 const INTEGER LDV = V.leadingDimension();
781 const DOUBLE EPS = eps;
782 const DOUBLE SFMIN = safeMin;
783 const DOUBLE TOL = tol;
784 const INTEGER NSWEEP = nSweep;
785 const INTEGER LWORK = work.length();
786 INTEGER INFO;
787
788 if (IsSame<T,DOUBLE>::value) {
789 LAPACK_DECL(dgsvj0)(&JOBV,
790 &M,
791 &N,
792 A.data(),
793 &LDA,
794 d.data(),
795 sva.data(),
796 &_MV,
797 V.data(),
798 &LDV,
799 &EPS,
800 &SFMIN,
801 &TOL,
802 &NSWEEP,
803 work.data(),
804 &LWORK,
805 &INFO);
806 } else {
807 ASSERT(0);
808 }
809 ASSERT(INFO>=0);
810 return INFO;
811 }
812
813 #endif // CHECK_CXXLAPACK
814
815 //== public interface ==========================================================
816 template <typename MA, typename VD, typename VSVA, typename MV, typename VWORK>
817 typename GeMatrix<MA>::IndexType
818 svj0(SVJ::JobV jobV,
819 GeMatrix<MA> &A,
820 DenseVector<VD> &d,
821 DenseVector<VSVA> &sva,
822 GeMatrix<MV> &V,
823 const typename GeMatrix<MA>::ElementType &eps,
824 const typename GeMatrix<MA>::ElementType &safeMin,
825 const typename GeMatrix<MA>::ElementType &tol,
826 typename GeMatrix<MA>::IndexType nSweep,
827 DenseVector<VWORK> &work)
828 {
829 using std::max;
830 using std::min;
831
832 //
833 // Test the input parameters
834 //
835 # ifndef NDEBUG
836 typedef typename GeMatrix<MA>::IndexType IndexType;
837
838 ASSERT(A.firstRow()==1);
839 ASSERT(A.firstCol()==1);
840
841 const IndexType m = A.numRows();
842 const IndexType n = A.numCols();
843
844 ASSERT(m>=n);
845
846 ASSERT(d.firstIndex()==1);
847 ASSERT(d.length()==n);
848
849 ASSERT(sva.firstIndex()==1);
850 ASSERT(sva.length()==n);
851
852 ASSERT(V.firstRow()==1);
853 ASSERT(V.firstCol()==1);
854
855 if (jobV==SVJ::ComputeV) {
856 ASSERT(V.numCols()==n);
857 ASSERT(V.numRows()==n);
858 }
859 if (jobV==SVJ::ApplyV) {
860 ASSERT(V.numCols()==n);
861 }
862
863 if (work.length()>0) {
864 ASSERT(work.length()>=m);
865 }
866 # endif
867 //
868 // Make copies of output arguments
869 //
870 # ifdef CHECK_CXXLAPACK
871 typename GeMatrix<MA>::NoView A_org = A;
872 typename DenseVector<VD>::NoView d_org = d;
873 typename DenseVector<VSVA>::NoView sva_org = sva;
874 typename GeMatrix<MV>::NoView V_org = V;
875 typename DenseVector<VWORK>::NoView work_org = work;
876 # endif
877
878 //
879 // Call implementation
880 //
881 IndexType info = svj0_generic(jobV, A, d, sva, V, eps, safeMin, tol,
882 nSweep, work);
883
884 # ifdef CHECK_CXXLAPACK
885 //
886 // Make copies of results computed by the generic implementation
887 //
888 typename GeMatrix<MA>::NoView A_generic = A;
889 typename DenseVector<VD>::NoView d_generic = d;
890 typename DenseVector<VSVA>::NoView sva_generic = sva;
891 typename GeMatrix<MV>::NoView V_generic = V;
892 typename DenseVector<VWORK>::NoView work_generic = work;
893 //
894 // restore output arguments
895 //
896 A = A_org;
897 d = d_org;
898 sva = sva_org;
899 V = V_org;
900 work = work_org;
901 //
902 // Compare generic results with results from the native implementation
903 //
904 IndexType _info = svj0_native(jobV, A, d, sva, V, eps, safeMin, tol,
905 nSweep, work);
906
907 bool failed = false;
908 if (! isIdentical(A_generic, A, "A_generic", "A")) {
909 std::cerr << "CXXLAPACK: A_generic = " << A_generic << std::endl;
910 std::cerr << "F77LAPACK: A = " << A << std::endl;
911 failed = true;
912 }
913 if (! isIdentical(d_generic, d, "d_generic", "d")) {
914 std::cerr << "CXXLAPACK: d_generic = " << d_generic << std::endl;
915 std::cerr << "F77LAPACK: d = " << d << std::endl;
916 failed = true;
917 }
918 if (! isIdentical(sva_generic, sva, "sva_generic", "sva")) {
919 std::cerr << "CXXLAPACK: sva_generic = " << sva_generic << std::endl;
920 std::cerr << "F77LAPACK: sva = " << sva << std::endl;
921 failed = true;
922 }
923 if (! isIdentical(V_generic, V, "V_generic", "V")) {
924 std::cerr << "CXXLAPACK: V_generic = " << V_generic << std::endl;
925 std::cerr << "F77LAPACK: V = " << V << std::endl;
926 failed = true;
927 }
928 if (! isIdentical(work_generic, work, "work_generic", "work")) {
929 std::cerr << "CXXLAPACK: work_generic = " << work_generic << std::endl;
930 std::cerr << "F77LAPACK: work = " << work << std::endl;
931 failed = true;
932 }
933 if (! isIdentical(info, _info, "info", "_info")) {
934 std::cerr << "CXXLAPACK: info = " << info << std::endl;
935 std::cerr << "F77LAPACK: _info = " << _info << std::endl;
936 failed = true;
937 }
938
939 if (failed) {
940 std::cerr << "error in: svj0.tcc" << std::endl;
941 ASSERT(0);
942 } else {
943 //std::cerr << "passed: svj0.tcc" << std::endl;
944 }
945 # endif
946
947 return info;
948 }
949
950 //-- forwarding ----------------------------------------------------------------
951 template <typename MA, typename VD, typename VSVA, typename MV, typename VWORK>
952 typename MA::IndexType
953 svj0(SVJ::JobV jobV,
954 MA &&A,
955 VD &&d,
956 VSVA &&sva,
957 MV &&V,
958 const typename MA::ElementType &eps,
959 const typename MA::ElementType &safeMin,
960 const typename MA::ElementType &tol,
961 typename MA::IndexType nSweep,
962 VWORK &&work)
963 {
964 typename MA::IndexType info;
965
966 CHECKPOINT_ENTER;
967 info = svj0(jobV, A, d, sva, V, eps, safeMin, tol, nSweep, work);
968 CHECKPOINT_LEAVE;
969
970 return info;
971 }
972
973 } } // namespace lapack, flens
974
975 #endif // FLENS_LAPACK_SVD_SVJ0_TCC