1 /*
2 * Copyright (c) 2011, Michael Lehn
3 *
4 * All rights reserved.
5 *
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions
8 * are met:
9 *
10 * 1) Redistributions of source code must retain the above copyright
11 * notice, this list of conditions and the following disclaimer.
12 * 2) Redistributions in binary form must reproduce the above copyright
13 * notice, this list of conditions and the following disclaimer in
14 * the documentation and/or other materials provided with the
15 * distribution.
16 * 3) Neither the name of the FLENS development group nor the names of
17 * its contributors may be used to endorse or promote products derived
18 * from this software without specific prior written permission.
19 *
20 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
21 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
22 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
23 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
24 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
25 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
26 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
27 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
28 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
29 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
30 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
31 */
32
33 /* Based on
34 *
35 SUBROUTINE DLAQR0( WANTT, WANTZ, N, ILO, IHI, H, LDH, WR, WI,
36 $ ILOZ, IHIZ, Z, LDZ, WORK, LWORK, INFO )
37 *
38 * -- LAPACK auxiliary routine (version 3.2) --
39 * Univ. of Tennessee, Univ. of California Berkeley,
40 * Univ. of Colorado Denver and NAG Ltd..
41 * November 2006
42 */
43
44 #ifndef FLENS_LAPACK_EIG_LAQR0_TCC
45 #define FLENS_LAPACK_EIG_LAQR0_TCC 1
46
47 #include <flens/blas/blas.h>
48 #include <flens/lapack/lapack.h>
49
50 namespace flens { namespace lapack {
51
52 //== generic lapack implementation =============================================
53
54 template <typename IndexType, typename MH>
55 IndexType
56 laqr0_generic_wsq(bool wantT,
57 bool wantZ,
58 IndexType iLo,
59 IndexType iHi,
60 const GeMatrix<MH> &H)
61 {
62 using std::max;
63 using std::min;
64
65 typedef typename GeMatrix<MH>::ElementType T;
66
67 const IndexType nTiny = 11;
68 const IndexType n = H.numRows();
69
70 if ((n==0) || (n<=nTiny)) {
71 return 1;
72 }
73
74 char job[3];
75 job[0] = (wantT) ? 'S' : 'E';
76 job[1] = (wantZ) ? 'V' : 'N';
77 job[2] = 0;
78 //
79 // ==== NWR = recommended deflation window size. At this
80 // . point, N .GT. NTINY = 11, so there is enough
81 // . subdiagonal workspace for NWR.GE.2 as required.
82 // . (In fact, there is enough subdiagonal space for
83 // . NWR.GE.3.) ====
84 //
85 IndexType nwr = ilaenv<T>(13, "LAQR0", job, n, iLo, iHi, -1);
86 nwr = max(IndexType(2), nwr);
87 nwr = min(min(IndexType(iHi-iLo+1), (n-1)/3), nwr);
88 //
89 // ==== NSR = recommended number of simultaneous shifts.
90 // . At this point N .GT. NTINY = 11, so there is at
91 // . enough subdiagonal workspace for NSR to be even
92 // . and greater than or equal to two as required. ====
93 //
94 IndexType nsr = ilaenv<T>(15, "LAQR0", job, n, iLo, iHi, -1);
95 nsr = min(min(nsr, (n+6)/9), IndexType(iHi-iLo));
96 nsr = max(IndexType(2), nsr-(nsr%2));
97 //
98 // ==== Estimate optimal workspace ====
99 //
100 // ==== Workspace query call to DLAQR3 ====
101 //
102 IndexType lWorkOpt = laqr3_wsq(IndexType(iLo), IndexType(iHi), nwr+1, H);
103 //
104 // ==== Optimal workspace = MAX(DLAQR5, DLAQR3) ====
105 //
106 return max(3*nsr/2, lWorkOpt);
107 }
108
109 template <typename IndexType, typename MH, typename VWR, typename VWI,
110 typename MZ, typename VWORK>
111 IndexType
112 laqr0_generic(bool wantT,
113 bool wantZ,
114 IndexType iLo,
115 IndexType iHi,
116 GeMatrix<MH> &H,
117 DenseVector<VWR> &wr,
118 DenseVector<VWI> &wi,
119 IndexType iLoZ,
120 IndexType iHiZ,
121 GeMatrix<MZ> &Z,
122 DenseVector<VWORK> &work)
123 {
124 using std::abs;
125 using std::max;
126 using std::min;
127 using std::swap;
128
129 typedef typename GeMatrix<MH>::ElementType T;
130
131 const Underscore<IndexType> _;
132
133 const IndexType n = H.numRows();
134
135 // ==== Matrices of order NTINY or smaller must be processed by
136 // . DLAHQR because of insufficient subdiagonal scratch space.
137 // . (This is a hard limit.) ====
138 const IndexType nTiny = 11;
139
140 // ==== Exceptional deflation windows: try to cure rare
141 // . slow convergence by varying the size of the
142 // . deflation window after KEXNW iterations. ====
143 const IndexType kexNw = 5;
144
145 //
146 // ==== Exceptional shifts: try to cure rare slow convergence
147 // . with ad-hoc exceptional shifts every KEXSH iterations.
148 // . ====
149 const IndexType kexSh = 6;
150
151 //
152 // ==== The constants WILK1 and WILK2 are used to form the
153 // . exceptional shifts. ====
154 const T wilk1 = T(0.75),
155 wilk2 = T(-0.4375);
156
157 const T Zero(0), One(1);
158
159 IndexType info = 0;
160 IndexType lWork, lWorkOpt;
161
162 IndexType nDec = -1;
163
164 //
165 // ==== Perform and apply a workspace query if necessary ====
166 //
167 if (work.length()==0) {
168 lWorkOpt = laqr0_generic_wsq(wantT, wantZ, iLo, iHi, H);
169 work.resize(lWorkOpt);
170 }
171
172 lWork = work.length();
173
174 //
175 // ==== Quick return for N = 0: nothing to do. ====
176 //
177 if (n==0) {
178 work(1) = One;
179 return info;
180 }
181
182 if (n<=nTiny) {
183 //
184 // ==== Tiny matrices must use DLAHQR. ====
185 //
186 lWorkOpt = 1;
187 info = lahqr(wantT, wantZ, iLo, iHi, H, wr, wi, iLoZ, iHiZ, Z);
188 } else {
189 //
190 // ==== Use small bulge multi-shift QR with aggressive early
191 // . deflation on larger-than-tiny matrices. ====
192 //
193 // ==== Hope for the best. ====
194 //
195 info = 0;
196 //
197 // ==== Set up job flags for ILAENV. ====
198 //
199 char job[3];
200 job[0] = (wantT) ? 'S' : 'E';
201 job[1] = (wantZ) ? 'V' : 'N';
202 job[2] = 0;
203 //
204 // ==== NWR = recommended deflation window size. At this
205 // . point, N .GT. NTINY = 11, so there is enough
206 // . subdiagonal workspace for NWR.GE.2 as required.
207 // . (In fact, there is enough subdiagonal space for
208 // . NWR.GE.3.) ====
209 //
210 IndexType nwr = ilaenv<T>(13, "LAQR0", job, n, iLo, iHi, lWork);
211 nwr = max(IndexType(2), nwr);
212 nwr = min(min(IndexType(iHi-iLo+1), (n-1)/3), nwr);
213 //
214 // ==== NSR = recommended number of simultaneous shifts.
215 // . At this point N .GT. NTINY = 11, so there is at
216 // . enough subdiagonal workspace for NSR to be even
217 // . and greater than or equal to two as required. ====
218 //
219 IndexType nsr = ilaenv<T>(15, "LAQR0", job, n, iLo, iHi, lWork);
220 nsr = min(min(nsr, (n+6)/9), IndexType(iHi-iLo));
221 nsr = max(IndexType(2), nsr - (nsr%2));
222 //
223 // ==== Estimate optimal workspace ====
224 //
225 // ==== Workspace query call to DLAQR3 ====
226 //
227 lWorkOpt = laqr3_wsq(IndexType(iLo), IndexType(iHi), nwr+1, H);
228 //
229 // ==== Optimal workspace = MAX(DLAQR5, DLAQR3) ====
230 //
231 lWorkOpt = max(3*nsr/2, lWorkOpt);
232 //
233 // ==== DLAHQR/DLAQR0 crossover point ====
234 //
235 IndexType nMin = ilaenv<T>(12, "LAQR0", job, n, iLo, iHi, lWork);
236 nMin = max(nTiny, nMin);
237 //
238 // ==== Nibble crossover point ====
239 //
240 IndexType nibble = ilaenv<T>(14, "LAQR0", job, n, iLo, iHi, lWork);
241 nibble = max(IndexType(0), nibble);
242 //
243 // ==== Accumulate reflections during ttswp? Use block
244 // . 2-by-2 structure during matrix-matrix multiply? ====
245 //
246 IndexType kacc22 = ilaenv<T>(16, "LAQR0", job, n, iLo, iHi, lWork);
247 kacc22 = max(IndexType(0), kacc22);
248 kacc22 = min(IndexType(2), kacc22);
249 //
250 // ==== NWMAX = the largest possible deflation window for
251 // . which there is sufficient workspace. ====
252 //
253 IndexType nwMax = min((n-1)/3, lWork/2);
254 IndexType nw = nwMax;
255 //
256 // ==== NSMAX = the Largest number of simultaneous shifts
257 // . for which there is sufficient workspace. ====
258 //
259 IndexType nsMax = min((n+6 )/9, 2*lWork/3);
260 nsMax = nsMax - (nsMax%2);
261 //
262 // ==== NDFL: an iteration count restarted at deflation. ====
263 //
264 IndexType nDfl = 1;
265 //
266 // ==== ITMAX = iteration limit ====
267 //
268 IndexType itMax = max(IndexType(30), 2*kexSh)
269 * max(IndexType(10), IndexType(iHi-iLo+1));
270 //
271 // ==== Last row and column in the active block ====
272 //
273 IndexType kBot = iHi;
274 //
275 // ==== Main Loop ====
276 //
277 IndexType it;
278 for (it=1; it<=itMax; ++it) {
279 //
280 // ==== Done when KBOT falls below ILO ====
281 //
282 if (kBot<iLo) {
283 break;
284 }
285 //
286 // ==== Locate active block ====
287 //
288 IndexType k;
289 for (k=kBot; k>=iLo+1; --k) {
290 if (H(k,k-1)==Zero) {
291 break;
292 }
293 }
294 ASSERT(k==iLo || H(k,k-1)==Zero);
295 const IndexType kTop = k;
296 //
297 // ==== Select deflation window size:
298 // . Typical Case:
299 // . If possible and advisable, nibble the entire
300 // . active block. If not, use size MIN(NWR,NWMAX)
301 // . or MIN(NWR+1,NWMAX) depending upon which has
302 // . the smaller corresponding subdiagonal entry
303 // . (a heuristic).
304 // .
305 // . Exceptional Case:
306 // . If there have been no deflations in KEXNW or
307 // . more iterations, then vary the deflation window
308 // . size. At first, because, larger windows are,
309 // . in general, more powerful than smaller ones,
310 // . rapidly increase the window to the maximum possible.
311 // . Then, gradually reduce the window size. ====
312 //
313 IndexType nh = kBot - kTop + 1;
314 IndexType nwUpBd = min(nh, nwMax);
315 if (nDfl<kexNw) {
316 nw = min(nwUpBd, nwr);
317 } else {
318 nw = min(nwUpBd, 2*nw);
319 }
320 if (nw<nwMax) {
321 if (nw>=nh-1) {
322 nw = nh;
323 } else {
324 const IndexType kwTop = kBot - nw + 1;
325 if (abs(H(kwTop,kwTop-1))>abs(H(kwTop-1, kwTop-2))) {
326 ++nw;
327 }
328 }
329 }
330 if (nDfl<kexNw) {
331 nDec = -1;
332 } else if (nDec>=0 || nw>=nwUpBd) {
333 ++nDec;
334 if (nw-nDec<2) {
335 nDec = 0;
336 }
337 nw -= nDec;
338 }
339 //
340 // ==== Aggressive early deflation:
341 // . split workspace under the subdiagonal into
342 // . - an nw-by-nw work array V in the lower
343 // . left-hand-corner,
344 // . - an NW-by-at-least-NW-but-more-is-better
345 // . (NW-by-NHO) horizontal work array along
346 // . the bottom edge,
347 // . - an at-least-NW-but-more-is-better (NHV-by-NW)
348 // . vertical work array along the left-hand-edge.
349 // . ====
350 //
351 auto _V = H(_(n-nw+1, n), _( 1, nw));
352 auto _T = H(_(n-nw+1, n), _(nw+1, n-nw-1));
353 auto _WV = H(_( nw+2, n-nw), _( 1, nw));
354 //
355 // ==== Aggressive early deflation ====
356 //
357 IndexType ls, ld;
358
359 laqr3(wantT, wantZ, kTop, kBot, nw, H,
360 IndexType(iLoZ), IndexType(iHiZ), Z,
361 ls, ld, wr(_(1,kBot)), wi(_(1,kBot)),
362 _V, _T, _WV, work);
363 //
364 // ==== Adjust KBOT accounting for new deflations. ====
365 //
366 kBot -= ld;
367 //
368 // ==== KS points to the shifts. ====
369 //
370 IndexType ks = kBot - ls + 1;
371 //
372 // ==== Skip an expensive QR sweep if there is a (partly
373 // . heuristic) reason to expect that many eigenvalues
374 // . will deflate without it. Here, the QR sweep is
375 // . skipped if many eigenvalues have just been deflated
376 // . or if the remaining active block is small.
377 //
378 if ((ld==0)
379 || ((100*ld<=nw*nibble) && (kBot-kTop+1>min(nMin, nwMax))))
380 {
381 //
382 // ==== NS = nominal number of simultaneous shifts.
383 // . This may be lowered (slightly) if DLAQR3
384 // . did not provide that many shifts. ====
385 //
386 IndexType ns = min(min(nsMax, nsr),
387 max(IndexType(2), kBot-kTop));
388 ns = ns - (ns % 2);
389 //
390 // ==== If there have been no deflations
391 // . in a multiple of KEXSH iterations,
392 // . then try exceptional shifts.
393 // . Otherwise use shifts provided by
394 // . DLAQR3 above or from the eigenvalues
395 // . of a trailing principal submatrix. ====
396 //
397 if (nDfl%kexSh==0) {
398 ks = kBot - ns + 1;
399 for (IndexType i=kBot; i>=max(ks+1,kTop+2); i-=2) {
400 const T ss = abs(H(i,i-1)) + abs(H(i-1,i-2));
401 T aa = wilk1*ss + H(i,i);
402 T bb = ss;
403 T cc = wilk2*ss;
404 T dd = aa;
405 T cs, sn;
406 lanv2(aa, bb, cc, dd,
407 wr(i-1), wi(i-1), wr(i), wi(i),
408 cs, sn);
409 }
410 if (ks==kTop) {
411 wr(ks+1) = H(ks+1, ks+1);
412 wi(ks+1) = Zero;
413 wr(ks) = wr(ks+1);
414 wi(ks) = wi(ks+1);
415 }
416 } else {
417 //
418 // ==== Got NS/2 or fewer shifts? Use DLAQR4 or
419 // . DLAHQR on a trailing principal submatrix to
420 // . get more. (Since NS.LE.NSMAX.LE.(N+6)/9,
421 // . there is enough space below the subdiagonal
422 // . to fit an NS-by-NS scratch array.) ====
423 //
424 if (kBot-ks+1<=ns/2) {
425 ks = kBot - ns +1;
426 H(_(ks,kBot),_(1,ns)) = H(_(ks,kBot),_(ks,kBot));
427 if (ns>nMin) {
428 // TODO: avoid the need for ZDummy
429 typename GeMatrix<MZ>::NoView ZDummy;
430 ks += laqr4(false, false,
431 IndexType(1), ns,
432 H(_(ks,kBot),_(1,ns)),
433 wr(_(ks,kBot)), wi(_(ks,kBot)),
434 IndexType(1), IndexType(1),
435 ZDummy, work);
436 } else {
437 // TODO: avoid the need for ZDummy
438 typename GeMatrix<MZ>::NoView ZDummy;
439 ks += lahqr(false, false,
440 IndexType(1), ns,
441 H(_(ks,kBot),_(1,ns)),
442 wr(_(ks,kBot)), wi(_(ks,kBot)),
443 IndexType(1), IndexType(1),
444 ZDummy);
445 }
446 //
447 // ==== In case of a rare QR failure use
448 // . eigenvalues of the trailing 2-by-2
449 // . principal submatrix. ====
450 //
451 if (ks>=kBot) {
452 T aa = H(kBot-1,kBot-1);
453 T cc = H(kBot, kBot-1);
454 T bb = H(kBot-1,kBot);
455 T dd = H(kBot, kBot);
456 T cs, sn;
457 lanv2(aa, bb, cc, dd,
458 wr(kBot-1), wi(kBot-1), wr(kBot), wi(kBot),
459 cs, sn);
460 ks = kBot - 1;
461 }
462 }
463
464 if (kBot-ks+1>ns) {
465 //
466 // ==== Sort the shifts (Helps a little)
467 // . Bubble sort keeps complex conjugate
468 // . pairs together. ====
469 //
470 bool sorted = false;
471 for (IndexType k=kBot; k>=ks+1; --k) {
472 if (sorted) {
473 break;
474 }
475 sorted = true;
476 for (IndexType i=ks; i<=k-1; ++i) {
477 if (abs(wr(i))+abs(wi(i))
478 < abs(wr(i+1))+abs(wi(i+1)))
479 {
480 sorted = false;
481 swap(wr(i), wr(i+1));
482 swap(wi(i), wi(i+1));
483 }
484 }
485 }
486 }
487 //
488 // ==== Shuffle shifts into pairs of real shifts
489 // . and pairs of complex conjugate shifts
490 // . assuming complex conjugate shifts are
491 // . already adjacent to one another. (Yes,
492 // . they are.) ====
493 //
494 for (IndexType i=kBot; i>=ks+2; i-=2) {
495 if (wi(i)!=-wi(i-1)) {
496 T tmp = wr(i);
497 wr(i) = wr(i-1);
498 wr(i-1) = wr(i-2);
499 wr(i-2) = tmp;
500
501 tmp = wi(i);
502 wi(i) = wi(i-1);
503 wi(i-1) = wi(i-2);
504 wi(i-2) = tmp;
505 }
506 }
507 }
508 //
509 // ==== If there are only two shifts and both are
510 // . real, then use only one. ====
511 //
512 if (kBot-ks+1==2) {
513 if (wi(kBot)==0) {
514 const T _H = H(kBot,kBot);
515 if (abs(wr(kBot)-_H) < abs(wr(kBot-1)-_H)) {
516 wr(kBot-1) = wr(kBot);
517 } else {
518 wr(kBot) = wr(kBot-1);
519 }
520 }
521 }
522 //
523 // ==== Use up to NS of the the smallest magnatiude
524 // . shifts. If there aren't NS shifts available,
525 // . then use them all, possibly dropping one to
526 // . make the number of shifts even. ====
527 //
528 ns = min(ns, kBot-ks+1);
529 ns = ns - (ns%2);
530 ks = kBot - ns + 1;
531 //
532 // ==== Small-bulge multi-shift QR sweep:
533 // . split workspace under the subdiagonal into
534 // . - a KDU-by-KDU work array U in the lower
535 // . left-hand-corner,
536 // . - a KDU-by-at-least-KDU-but-more-is-better
537 // . (KDU-by-NHo) horizontal work array WH along
538 // . the bottom edge,
539 // . - and an at-least-KDU-but-more-is-better-by-KDU
540 // . (NVE-by-KDU) vertical work WV arrow along
541 // . the left-hand-edge. ====
542 //
543 IndexType kdu = 3*ns - 3;
544 IndexType ku = n - kdu + 1;
545 IndexType kmv = kdu + 4;
546 IndexType nho = (n-kdu+1-4) - (kdu+1) + 1;
547
548 typedef typename GeMatrix<MH>::View GeMatrixView;
549 GeMatrixView _V(IndexType(3), ns/2, work(_(1,3*ns/2)));
550 auto _U = H(_( ku, n), _( 1, kdu));
551 auto _WV = H(_(kmv,n-kdu), _( 1, kdu));
552 auto _WH = H(_( ku, n), _(kdu+1,kdu+nho));
553 //
554 // ==== Small-bulge multi-shift QR sweep ====
555 //
556 laqr5(wantT, wantZ, kacc22, kTop, kBot, ns,
557 wr(_(ks,kBot)), wi(_(ks,kBot)), H,
558 IndexType(iLoZ), IndexType(iHiZ), Z,
559 _V, _U, _WV, _WH);
560 }
561 //
562 // ==== Note progress (or the lack of it). ====
563 //
564 if (ld>0) {
565 nDfl = 1;
566 } else {
567 ++nDfl;
568 }
569 //
570 // ==== End of main loop ====
571 //
572 }
573 //
574 // ==== Iteration limit exceeded. Set INFO to show where
575 // . the problem occurred and exit. ====
576 //
577 if (it>itMax) {
578 info = kBot;
579 }
580 }
581
582 work(1) = lWorkOpt;
583 return info;
584 }
585
586 //== interface for native lapack ===============================================
587
588 #ifdef CHECK_CXXLAPACK
589
590 template <typename IndexType, typename MH>
591 IndexType
592 laqr0_native_wsq(bool wantT,
593 bool wantZ,
594 IndexType iLo,
595 IndexType iHi,
596 const GeMatrix<MH> &H)
597 {
598 typedef typename GeMatrix<MH>::ElementType T;
599
600 const LOGICAL WANTT = wantT;
601 const LOGICAL WANTZ = wantZ;
602 const INTEGER N = H.numRows();
603 const INTEGER ILO = iLo;
604 const INTEGER IHI = iHi;
605 const INTEGER LDH = H.leadingDimension();
606 const INTEGER ILOZ = 0;
607 const INTEGER IHIZ = 0;
608 const INTEGER LDZ = 0;
609 T WORK;
610 T DUMMY;
611 const INTEGER LWORK = -1;
612 INTEGER INFO;
613
614 if (IsSame<T,DOUBLE>::value) {
615 LAPACK_IMPL(dlaqr0)(&WANTT,
616 &WANTZ,
617 &N,
618 &ILO,
619 &IHI,
620 &DUMMY,
621 &LDH,
622 &DUMMY,
623 &DUMMY,
624 &ILOZ,
625 &IHIZ,
626 &DUMMY,
627 &LDZ,
628 &WORK,
629 &LWORK,
630 &INFO);
631 } else {
632 ASSERT(0);
633 }
634 return WORK;
635 }
636
637 template <typename IndexType, typename MH, typename VWR, typename VWI,
638 typename MZ, typename VWORK>
639 IndexType
640 laqr0_native(bool wantT,
641 bool wantZ,
642 IndexType iLo,
643 IndexType iHi,
644 GeMatrix<MH> &H,
645 DenseVector<VWR> &wr,
646 DenseVector<VWI> &wi,
647 IndexType iLoZ,
648 IndexType iHiZ,
649 GeMatrix<MZ> &Z,
650 DenseVector<VWORK> &work)
651 {
652 typedef typename GeMatrix<MH>::ElementType T;
653
654 const LOGICAL WANTT = wantT;
655 const LOGICAL WANTZ = wantZ;
656 const INTEGER N = H.numRows();
657 const INTEGER ILO = iLo;
658 const INTEGER IHI = iHi;
659 const INTEGER LDH = H.leadingDimension();
660 const INTEGER ILOZ = iLoZ;
661 const INTEGER IHIZ = iHiZ;
662 const INTEGER LDZ = Z.leadingDimension();
663 const INTEGER LWORK = work.length();
664 INTEGER INFO;
665
666 if (IsSame<T,DOUBLE>::value) {
667 LAPACK_IMPL(dlaqr0)(&WANTT,
668 &WANTZ,
669 &N,
670 &ILO,
671 &IHI,
672 H.data(),
673 &LDH,
674 wr.data(),
675 wi.data(),
676 &ILOZ,
677 &IHIZ,
678 Z.data(),
679 &LDZ,
680 work.data(),
681 &LWORK,
682 &INFO);
683 } else {
684 ASSERT(0);
685 }
686 ASSERT(INFO>=0);
687 return INFO;
688 }
689
690 #endif // CHECK_CXXLAPACK
691
692 //== public interface ==========================================================
693 template <typename IndexType, typename MH>
694 IndexType
695 laqr0_wsq(bool wantT,
696 bool wantZ,
697 IndexType iLo,
698 IndexType iHi,
699 const GeMatrix<MH> &H)
700 {
701 using std::max;
702 //
703 // Test the input parameters
704 //
705 # ifndef NDEBUG
706 ASSERT(H.firstRow()==1);
707 ASSERT(H.firstCol()==1);
708 ASSERT(H.numRows()==H.numCols());
709
710 const IndexType n = H.numRows();
711
712 if (n>0) {
713 ASSERT(1<=iLo);
714 ASSERT(iLo<=iHi);
715 ASSERT(iHi<=n);
716 } else {
717 ASSERT(iLo==1);
718 ASSERT(iHi==0);
719 }
720 # endif
721
722 //
723 // Call implementation
724 //
725 IndexType info = laqr0_generic_wsq(wantT, wantZ, iLo, iHi, H);
726
727 # ifdef CHECK_CXXLAPACK
728 //
729 // Compare results
730 //
731 IndexType _info = laqr0_native_wsq(wantT, wantZ, iLo, iHi, H);
732
733 if (info!=_info) {
734 std::cerr << "CXXLAPACK: info = " << info << std::endl;
735 std::cerr << "F77LAPACK: _info = " << _info << std::endl;
736 ASSERT(0);
737 }
738 # endif
739 return info;
740 }
741
742 template <typename IndexType, typename MH, typename VWR, typename VWI,
743 typename MZ, typename VWORK>
744 IndexType
745 laqr0(bool wantT,
746 bool wantZ,
747 IndexType iLo,
748 IndexType iHi,
749 GeMatrix<MH> &H,
750 DenseVector<VWR> &wr,
751 DenseVector<VWI> &wi,
752 IndexType iLoZ,
753 IndexType iHiZ,
754 GeMatrix<MZ> &Z,
755 DenseVector<VWORK> &work)
756 {
757 LAPACK_DEBUG_OUT("laqr0");
758
759 using std::max;
760 //
761 // Test the input parameters
762 //
763 # ifndef NDEBUG
764 ASSERT(H.firstRow()==1);
765 ASSERT(H.firstCol()==1);
766 ASSERT(H.numRows()==H.numCols());
767
768 const IndexType n = H.numRows();
769
770 if (n>0) {
771 ASSERT(1<=iLo);
772 ASSERT(iLo<=iHi);
773 ASSERT(iHi<=n);
774 } else {
775 ASSERT(iLo==1);
776 ASSERT(iHi==0);
777 }
778
779 ASSERT(wr.firstIndex()==1);
780 ASSERT(wr.length()>=iHi);
781
782 ASSERT(wi.firstIndex()==1);
783 ASSERT(wi.length()>=iHi);
784
785 ASSERT(1<=iLoZ);
786 ASSERT(iLoZ<=iLo);
787 ASSERT(iHi<=iHiZ);
788 ASSERT(iHiZ<=n);
789
790 ASSERT(Z.firstRow()==1);
791 ASSERT(Z.firstCol()==1);
792 ASSERT(Z.numRows()>=iHi);
793 ASSERT(Z.numCols()>=iHi);
794
795 ASSERT((work.length()==0) || (work.length()>=n));
796 # endif
797
798 //
799 // Make copies of output arguments
800 //
801 # ifdef CHECK_CXXLAPACK
802 typename GeMatrix<MH>::NoView H_org = H;
803 typename DenseVector<VWR>::NoView wr_org = wr;
804 typename DenseVector<VWI>::NoView wi_org = wi;
805 typename GeMatrix<MZ>::NoView Z_org = Z;
806 typename DenseVector<VWORK>::NoView work_org = work;
807 # endif
808
809 //
810 // Call implementation
811 //
812 IndexType info = laqr0_generic(wantT, wantZ, iLo, iHi, H, wr, wi,
813 iLoZ, iHiZ, Z, work);
814
815 # ifdef CHECK_CXXLAPACK
816 //
817 // Make copies of results computed by the generic implementation
818 //
819 typename GeMatrix<MH>::NoView H_generic = H;
820 typename DenseVector<VWR>::NoView wr_generic = wr;
821 typename DenseVector<VWI>::NoView wi_generic = wi;
822 typename GeMatrix<MZ>::NoView Z_generic = Z;
823 typename DenseVector<VWORK>::NoView work_generic = work;
824 //
825 // restore output arguments
826 //
827 H = H_org;
828 wr = wr_org;
829 wi = wi_org;
830 Z = Z_org;
831 work = work_org;
832 //
833 // Compare generic results with results from the native implementation
834 //
835 IndexType _info = laqr0_native(wantT, wantZ, iLo, iHi, H, wr, wi,
836 iLoZ, iHiZ, Z, work);
837
838 bool failed = false;
839 if (! isIdentical(H_generic, H, "H_generic", "H")) {
840 std::cerr << "CXXLAPACK: H_generic = " << H_generic << std::endl;
841 std::cerr << "F77LAPACK: H = " << H << std::endl;
842 failed = true;
843 }
844
845 if (! isIdentical(wr_generic, wr, "wr_generic", "wr")) {
846 std::cerr << "CXXLAPACK: wr_generic = " << wr_generic << std::endl;
847 std::cerr << "F77LAPACK: wr = " << wr << std::endl;
848 failed = true;
849 }
850
851 if (! isIdentical(wi_generic, wi, "wi_generic", "wi")) {
852 std::cerr << "CXXLAPACK: wi_generic = " << wi_generic << std::endl;
853 std::cerr << "F77LAPACK: wi = " << wi << std::endl;
854 failed = true;
855 }
856
857 if (! isIdentical(Z_generic, Z, "Z_generic", "Z")) {
858 std::cerr << "CXXLAPACK: Z_generic = " << Z_generic << std::endl;
859 std::cerr << "F77LAPACK: Z = " << Z << std::endl;
860 failed = true;
861 }
862
863 if (! isIdentical(info, _info, " info", "_info")) {
864 std::cerr << "CXXLAPACK: info = " << info << std::endl;
865 std::cerr << "F77LAPACK: _info = " << _info << std::endl;
866 failed = true;
867 }
868
869 if (! isIdentical(work_generic, work, "work_generic", "work")) {
870 std::cerr << "CXXLAPACK: work_generic = " << work_generic << std::endl;
871 std::cerr << "F77LAPACK: work = " << work << std::endl;
872 failed = true;
873 }
874
875 if (failed) {
876 std::cerr << "error in: laqr0.tcc" << std::endl;
877 ASSERT(0);
878 } else {
879 // std::cerr << "passed: laqr0.tcc" << std::endl;
880 }
881 # endif
882
883 return info;
884 }
885
886 //-- forwarding ----------------------------------------------------------------
887 template <typename IndexType, typename MH>
888 IndexType
889 laqr0_wsq(bool wantT,
890 bool wantZ,
891 IndexType iLo,
892 IndexType iHi,
893 const MH &&H)
894 {
895 CHECKPOINT_ENTER;
896 const IndexType info = laqr0_wsq(wantT, wantZ, iLo, iHi, H);
897 CHECKPOINT_LEAVE;
898
899 return info;
900 }
901
902 template <typename IndexType, typename MH, typename VWR, typename VWI,
903 typename MZ, typename VWORK>
904 IndexType
905 laqr0(bool wantT,
906 bool wantZ,
907 IndexType iLo,
908 IndexType iHi,
909 MH &&H,
910 VWR &&wr,
911 VWI &&wi,
912 IndexType iLoZ,
913 IndexType iHiZ,
914 MZ &&Z,
915 VWORK &&work)
916 {
917 CHECKPOINT_ENTER;
918 const IndexType info = laqr0(wantT, wantZ, iLo, iHi, H, wr, wi,
919 iLoZ, iHiZ, Z, work);
920 CHECKPOINT_LEAVE;
921
922 return info;
923 }
924
925 } } // namespace lapack, flens
926
927 #endif // FLENS_LAPACK_EIG_LAQR0_TCC
2 * Copyright (c) 2011, Michael Lehn
3 *
4 * All rights reserved.
5 *
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions
8 * are met:
9 *
10 * 1) Redistributions of source code must retain the above copyright
11 * notice, this list of conditions and the following disclaimer.
12 * 2) Redistributions in binary form must reproduce the above copyright
13 * notice, this list of conditions and the following disclaimer in
14 * the documentation and/or other materials provided with the
15 * distribution.
16 * 3) Neither the name of the FLENS development group nor the names of
17 * its contributors may be used to endorse or promote products derived
18 * from this software without specific prior written permission.
19 *
20 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
21 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
22 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
23 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
24 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
25 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
26 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
27 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
28 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
29 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
30 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
31 */
32
33 /* Based on
34 *
35 SUBROUTINE DLAQR0( WANTT, WANTZ, N, ILO, IHI, H, LDH, WR, WI,
36 $ ILOZ, IHIZ, Z, LDZ, WORK, LWORK, INFO )
37 *
38 * -- LAPACK auxiliary routine (version 3.2) --
39 * Univ. of Tennessee, Univ. of California Berkeley,
40 * Univ. of Colorado Denver and NAG Ltd..
41 * November 2006
42 */
43
44 #ifndef FLENS_LAPACK_EIG_LAQR0_TCC
45 #define FLENS_LAPACK_EIG_LAQR0_TCC 1
46
47 #include <flens/blas/blas.h>
48 #include <flens/lapack/lapack.h>
49
50 namespace flens { namespace lapack {
51
52 //== generic lapack implementation =============================================
53
54 template <typename IndexType, typename MH>
55 IndexType
56 laqr0_generic_wsq(bool wantT,
57 bool wantZ,
58 IndexType iLo,
59 IndexType iHi,
60 const GeMatrix<MH> &H)
61 {
62 using std::max;
63 using std::min;
64
65 typedef typename GeMatrix<MH>::ElementType T;
66
67 const IndexType nTiny = 11;
68 const IndexType n = H.numRows();
69
70 if ((n==0) || (n<=nTiny)) {
71 return 1;
72 }
73
74 char job[3];
75 job[0] = (wantT) ? 'S' : 'E';
76 job[1] = (wantZ) ? 'V' : 'N';
77 job[2] = 0;
78 //
79 // ==== NWR = recommended deflation window size. At this
80 // . point, N .GT. NTINY = 11, so there is enough
81 // . subdiagonal workspace for NWR.GE.2 as required.
82 // . (In fact, there is enough subdiagonal space for
83 // . NWR.GE.3.) ====
84 //
85 IndexType nwr = ilaenv<T>(13, "LAQR0", job, n, iLo, iHi, -1);
86 nwr = max(IndexType(2), nwr);
87 nwr = min(min(IndexType(iHi-iLo+1), (n-1)/3), nwr);
88 //
89 // ==== NSR = recommended number of simultaneous shifts.
90 // . At this point N .GT. NTINY = 11, so there is at
91 // . enough subdiagonal workspace for NSR to be even
92 // . and greater than or equal to two as required. ====
93 //
94 IndexType nsr = ilaenv<T>(15, "LAQR0", job, n, iLo, iHi, -1);
95 nsr = min(min(nsr, (n+6)/9), IndexType(iHi-iLo));
96 nsr = max(IndexType(2), nsr-(nsr%2));
97 //
98 // ==== Estimate optimal workspace ====
99 //
100 // ==== Workspace query call to DLAQR3 ====
101 //
102 IndexType lWorkOpt = laqr3_wsq(IndexType(iLo), IndexType(iHi), nwr+1, H);
103 //
104 // ==== Optimal workspace = MAX(DLAQR5, DLAQR3) ====
105 //
106 return max(3*nsr/2, lWorkOpt);
107 }
108
109 template <typename IndexType, typename MH, typename VWR, typename VWI,
110 typename MZ, typename VWORK>
111 IndexType
112 laqr0_generic(bool wantT,
113 bool wantZ,
114 IndexType iLo,
115 IndexType iHi,
116 GeMatrix<MH> &H,
117 DenseVector<VWR> &wr,
118 DenseVector<VWI> &wi,
119 IndexType iLoZ,
120 IndexType iHiZ,
121 GeMatrix<MZ> &Z,
122 DenseVector<VWORK> &work)
123 {
124 using std::abs;
125 using std::max;
126 using std::min;
127 using std::swap;
128
129 typedef typename GeMatrix<MH>::ElementType T;
130
131 const Underscore<IndexType> _;
132
133 const IndexType n = H.numRows();
134
135 // ==== Matrices of order NTINY or smaller must be processed by
136 // . DLAHQR because of insufficient subdiagonal scratch space.
137 // . (This is a hard limit.) ====
138 const IndexType nTiny = 11;
139
140 // ==== Exceptional deflation windows: try to cure rare
141 // . slow convergence by varying the size of the
142 // . deflation window after KEXNW iterations. ====
143 const IndexType kexNw = 5;
144
145 //
146 // ==== Exceptional shifts: try to cure rare slow convergence
147 // . with ad-hoc exceptional shifts every KEXSH iterations.
148 // . ====
149 const IndexType kexSh = 6;
150
151 //
152 // ==== The constants WILK1 and WILK2 are used to form the
153 // . exceptional shifts. ====
154 const T wilk1 = T(0.75),
155 wilk2 = T(-0.4375);
156
157 const T Zero(0), One(1);
158
159 IndexType info = 0;
160 IndexType lWork, lWorkOpt;
161
162 IndexType nDec = -1;
163
164 //
165 // ==== Perform and apply a workspace query if necessary ====
166 //
167 if (work.length()==0) {
168 lWorkOpt = laqr0_generic_wsq(wantT, wantZ, iLo, iHi, H);
169 work.resize(lWorkOpt);
170 }
171
172 lWork = work.length();
173
174 //
175 // ==== Quick return for N = 0: nothing to do. ====
176 //
177 if (n==0) {
178 work(1) = One;
179 return info;
180 }
181
182 if (n<=nTiny) {
183 //
184 // ==== Tiny matrices must use DLAHQR. ====
185 //
186 lWorkOpt = 1;
187 info = lahqr(wantT, wantZ, iLo, iHi, H, wr, wi, iLoZ, iHiZ, Z);
188 } else {
189 //
190 // ==== Use small bulge multi-shift QR with aggressive early
191 // . deflation on larger-than-tiny matrices. ====
192 //
193 // ==== Hope for the best. ====
194 //
195 info = 0;
196 //
197 // ==== Set up job flags for ILAENV. ====
198 //
199 char job[3];
200 job[0] = (wantT) ? 'S' : 'E';
201 job[1] = (wantZ) ? 'V' : 'N';
202 job[2] = 0;
203 //
204 // ==== NWR = recommended deflation window size. At this
205 // . point, N .GT. NTINY = 11, so there is enough
206 // . subdiagonal workspace for NWR.GE.2 as required.
207 // . (In fact, there is enough subdiagonal space for
208 // . NWR.GE.3.) ====
209 //
210 IndexType nwr = ilaenv<T>(13, "LAQR0", job, n, iLo, iHi, lWork);
211 nwr = max(IndexType(2), nwr);
212 nwr = min(min(IndexType(iHi-iLo+1), (n-1)/3), nwr);
213 //
214 // ==== NSR = recommended number of simultaneous shifts.
215 // . At this point N .GT. NTINY = 11, so there is at
216 // . enough subdiagonal workspace for NSR to be even
217 // . and greater than or equal to two as required. ====
218 //
219 IndexType nsr = ilaenv<T>(15, "LAQR0", job, n, iLo, iHi, lWork);
220 nsr = min(min(nsr, (n+6)/9), IndexType(iHi-iLo));
221 nsr = max(IndexType(2), nsr - (nsr%2));
222 //
223 // ==== Estimate optimal workspace ====
224 //
225 // ==== Workspace query call to DLAQR3 ====
226 //
227 lWorkOpt = laqr3_wsq(IndexType(iLo), IndexType(iHi), nwr+1, H);
228 //
229 // ==== Optimal workspace = MAX(DLAQR5, DLAQR3) ====
230 //
231 lWorkOpt = max(3*nsr/2, lWorkOpt);
232 //
233 // ==== DLAHQR/DLAQR0 crossover point ====
234 //
235 IndexType nMin = ilaenv<T>(12, "LAQR0", job, n, iLo, iHi, lWork);
236 nMin = max(nTiny, nMin);
237 //
238 // ==== Nibble crossover point ====
239 //
240 IndexType nibble = ilaenv<T>(14, "LAQR0", job, n, iLo, iHi, lWork);
241 nibble = max(IndexType(0), nibble);
242 //
243 // ==== Accumulate reflections during ttswp? Use block
244 // . 2-by-2 structure during matrix-matrix multiply? ====
245 //
246 IndexType kacc22 = ilaenv<T>(16, "LAQR0", job, n, iLo, iHi, lWork);
247 kacc22 = max(IndexType(0), kacc22);
248 kacc22 = min(IndexType(2), kacc22);
249 //
250 // ==== NWMAX = the largest possible deflation window for
251 // . which there is sufficient workspace. ====
252 //
253 IndexType nwMax = min((n-1)/3, lWork/2);
254 IndexType nw = nwMax;
255 //
256 // ==== NSMAX = the Largest number of simultaneous shifts
257 // . for which there is sufficient workspace. ====
258 //
259 IndexType nsMax = min((n+6 )/9, 2*lWork/3);
260 nsMax = nsMax - (nsMax%2);
261 //
262 // ==== NDFL: an iteration count restarted at deflation. ====
263 //
264 IndexType nDfl = 1;
265 //
266 // ==== ITMAX = iteration limit ====
267 //
268 IndexType itMax = max(IndexType(30), 2*kexSh)
269 * max(IndexType(10), IndexType(iHi-iLo+1));
270 //
271 // ==== Last row and column in the active block ====
272 //
273 IndexType kBot = iHi;
274 //
275 // ==== Main Loop ====
276 //
277 IndexType it;
278 for (it=1; it<=itMax; ++it) {
279 //
280 // ==== Done when KBOT falls below ILO ====
281 //
282 if (kBot<iLo) {
283 break;
284 }
285 //
286 // ==== Locate active block ====
287 //
288 IndexType k;
289 for (k=kBot; k>=iLo+1; --k) {
290 if (H(k,k-1)==Zero) {
291 break;
292 }
293 }
294 ASSERT(k==iLo || H(k,k-1)==Zero);
295 const IndexType kTop = k;
296 //
297 // ==== Select deflation window size:
298 // . Typical Case:
299 // . If possible and advisable, nibble the entire
300 // . active block. If not, use size MIN(NWR,NWMAX)
301 // . or MIN(NWR+1,NWMAX) depending upon which has
302 // . the smaller corresponding subdiagonal entry
303 // . (a heuristic).
304 // .
305 // . Exceptional Case:
306 // . If there have been no deflations in KEXNW or
307 // . more iterations, then vary the deflation window
308 // . size. At first, because, larger windows are,
309 // . in general, more powerful than smaller ones,
310 // . rapidly increase the window to the maximum possible.
311 // . Then, gradually reduce the window size. ====
312 //
313 IndexType nh = kBot - kTop + 1;
314 IndexType nwUpBd = min(nh, nwMax);
315 if (nDfl<kexNw) {
316 nw = min(nwUpBd, nwr);
317 } else {
318 nw = min(nwUpBd, 2*nw);
319 }
320 if (nw<nwMax) {
321 if (nw>=nh-1) {
322 nw = nh;
323 } else {
324 const IndexType kwTop = kBot - nw + 1;
325 if (abs(H(kwTop,kwTop-1))>abs(H(kwTop-1, kwTop-2))) {
326 ++nw;
327 }
328 }
329 }
330 if (nDfl<kexNw) {
331 nDec = -1;
332 } else if (nDec>=0 || nw>=nwUpBd) {
333 ++nDec;
334 if (nw-nDec<2) {
335 nDec = 0;
336 }
337 nw -= nDec;
338 }
339 //
340 // ==== Aggressive early deflation:
341 // . split workspace under the subdiagonal into
342 // . - an nw-by-nw work array V in the lower
343 // . left-hand-corner,
344 // . - an NW-by-at-least-NW-but-more-is-better
345 // . (NW-by-NHO) horizontal work array along
346 // . the bottom edge,
347 // . - an at-least-NW-but-more-is-better (NHV-by-NW)
348 // . vertical work array along the left-hand-edge.
349 // . ====
350 //
351 auto _V = H(_(n-nw+1, n), _( 1, nw));
352 auto _T = H(_(n-nw+1, n), _(nw+1, n-nw-1));
353 auto _WV = H(_( nw+2, n-nw), _( 1, nw));
354 //
355 // ==== Aggressive early deflation ====
356 //
357 IndexType ls, ld;
358
359 laqr3(wantT, wantZ, kTop, kBot, nw, H,
360 IndexType(iLoZ), IndexType(iHiZ), Z,
361 ls, ld, wr(_(1,kBot)), wi(_(1,kBot)),
362 _V, _T, _WV, work);
363 //
364 // ==== Adjust KBOT accounting for new deflations. ====
365 //
366 kBot -= ld;
367 //
368 // ==== KS points to the shifts. ====
369 //
370 IndexType ks = kBot - ls + 1;
371 //
372 // ==== Skip an expensive QR sweep if there is a (partly
373 // . heuristic) reason to expect that many eigenvalues
374 // . will deflate without it. Here, the QR sweep is
375 // . skipped if many eigenvalues have just been deflated
376 // . or if the remaining active block is small.
377 //
378 if ((ld==0)
379 || ((100*ld<=nw*nibble) && (kBot-kTop+1>min(nMin, nwMax))))
380 {
381 //
382 // ==== NS = nominal number of simultaneous shifts.
383 // . This may be lowered (slightly) if DLAQR3
384 // . did not provide that many shifts. ====
385 //
386 IndexType ns = min(min(nsMax, nsr),
387 max(IndexType(2), kBot-kTop));
388 ns = ns - (ns % 2);
389 //
390 // ==== If there have been no deflations
391 // . in a multiple of KEXSH iterations,
392 // . then try exceptional shifts.
393 // . Otherwise use shifts provided by
394 // . DLAQR3 above or from the eigenvalues
395 // . of a trailing principal submatrix. ====
396 //
397 if (nDfl%kexSh==0) {
398 ks = kBot - ns + 1;
399 for (IndexType i=kBot; i>=max(ks+1,kTop+2); i-=2) {
400 const T ss = abs(H(i,i-1)) + abs(H(i-1,i-2));
401 T aa = wilk1*ss + H(i,i);
402 T bb = ss;
403 T cc = wilk2*ss;
404 T dd = aa;
405 T cs, sn;
406 lanv2(aa, bb, cc, dd,
407 wr(i-1), wi(i-1), wr(i), wi(i),
408 cs, sn);
409 }
410 if (ks==kTop) {
411 wr(ks+1) = H(ks+1, ks+1);
412 wi(ks+1) = Zero;
413 wr(ks) = wr(ks+1);
414 wi(ks) = wi(ks+1);
415 }
416 } else {
417 //
418 // ==== Got NS/2 or fewer shifts? Use DLAQR4 or
419 // . DLAHQR on a trailing principal submatrix to
420 // . get more. (Since NS.LE.NSMAX.LE.(N+6)/9,
421 // . there is enough space below the subdiagonal
422 // . to fit an NS-by-NS scratch array.) ====
423 //
424 if (kBot-ks+1<=ns/2) {
425 ks = kBot - ns +1;
426 H(_(ks,kBot),_(1,ns)) = H(_(ks,kBot),_(ks,kBot));
427 if (ns>nMin) {
428 // TODO: avoid the need for ZDummy
429 typename GeMatrix<MZ>::NoView ZDummy;
430 ks += laqr4(false, false,
431 IndexType(1), ns,
432 H(_(ks,kBot),_(1,ns)),
433 wr(_(ks,kBot)), wi(_(ks,kBot)),
434 IndexType(1), IndexType(1),
435 ZDummy, work);
436 } else {
437 // TODO: avoid the need for ZDummy
438 typename GeMatrix<MZ>::NoView ZDummy;
439 ks += lahqr(false, false,
440 IndexType(1), ns,
441 H(_(ks,kBot),_(1,ns)),
442 wr(_(ks,kBot)), wi(_(ks,kBot)),
443 IndexType(1), IndexType(1),
444 ZDummy);
445 }
446 //
447 // ==== In case of a rare QR failure use
448 // . eigenvalues of the trailing 2-by-2
449 // . principal submatrix. ====
450 //
451 if (ks>=kBot) {
452 T aa = H(kBot-1,kBot-1);
453 T cc = H(kBot, kBot-1);
454 T bb = H(kBot-1,kBot);
455 T dd = H(kBot, kBot);
456 T cs, sn;
457 lanv2(aa, bb, cc, dd,
458 wr(kBot-1), wi(kBot-1), wr(kBot), wi(kBot),
459 cs, sn);
460 ks = kBot - 1;
461 }
462 }
463
464 if (kBot-ks+1>ns) {
465 //
466 // ==== Sort the shifts (Helps a little)
467 // . Bubble sort keeps complex conjugate
468 // . pairs together. ====
469 //
470 bool sorted = false;
471 for (IndexType k=kBot; k>=ks+1; --k) {
472 if (sorted) {
473 break;
474 }
475 sorted = true;
476 for (IndexType i=ks; i<=k-1; ++i) {
477 if (abs(wr(i))+abs(wi(i))
478 < abs(wr(i+1))+abs(wi(i+1)))
479 {
480 sorted = false;
481 swap(wr(i), wr(i+1));
482 swap(wi(i), wi(i+1));
483 }
484 }
485 }
486 }
487 //
488 // ==== Shuffle shifts into pairs of real shifts
489 // . and pairs of complex conjugate shifts
490 // . assuming complex conjugate shifts are
491 // . already adjacent to one another. (Yes,
492 // . they are.) ====
493 //
494 for (IndexType i=kBot; i>=ks+2; i-=2) {
495 if (wi(i)!=-wi(i-1)) {
496 T tmp = wr(i);
497 wr(i) = wr(i-1);
498 wr(i-1) = wr(i-2);
499 wr(i-2) = tmp;
500
501 tmp = wi(i);
502 wi(i) = wi(i-1);
503 wi(i-1) = wi(i-2);
504 wi(i-2) = tmp;
505 }
506 }
507 }
508 //
509 // ==== If there are only two shifts and both are
510 // . real, then use only one. ====
511 //
512 if (kBot-ks+1==2) {
513 if (wi(kBot)==0) {
514 const T _H = H(kBot,kBot);
515 if (abs(wr(kBot)-_H) < abs(wr(kBot-1)-_H)) {
516 wr(kBot-1) = wr(kBot);
517 } else {
518 wr(kBot) = wr(kBot-1);
519 }
520 }
521 }
522 //
523 // ==== Use up to NS of the the smallest magnatiude
524 // . shifts. If there aren't NS shifts available,
525 // . then use them all, possibly dropping one to
526 // . make the number of shifts even. ====
527 //
528 ns = min(ns, kBot-ks+1);
529 ns = ns - (ns%2);
530 ks = kBot - ns + 1;
531 //
532 // ==== Small-bulge multi-shift QR sweep:
533 // . split workspace under the subdiagonal into
534 // . - a KDU-by-KDU work array U in the lower
535 // . left-hand-corner,
536 // . - a KDU-by-at-least-KDU-but-more-is-better
537 // . (KDU-by-NHo) horizontal work array WH along
538 // . the bottom edge,
539 // . - and an at-least-KDU-but-more-is-better-by-KDU
540 // . (NVE-by-KDU) vertical work WV arrow along
541 // . the left-hand-edge. ====
542 //
543 IndexType kdu = 3*ns - 3;
544 IndexType ku = n - kdu + 1;
545 IndexType kmv = kdu + 4;
546 IndexType nho = (n-kdu+1-4) - (kdu+1) + 1;
547
548 typedef typename GeMatrix<MH>::View GeMatrixView;
549 GeMatrixView _V(IndexType(3), ns/2, work(_(1,3*ns/2)));
550 auto _U = H(_( ku, n), _( 1, kdu));
551 auto _WV = H(_(kmv,n-kdu), _( 1, kdu));
552 auto _WH = H(_( ku, n), _(kdu+1,kdu+nho));
553 //
554 // ==== Small-bulge multi-shift QR sweep ====
555 //
556 laqr5(wantT, wantZ, kacc22, kTop, kBot, ns,
557 wr(_(ks,kBot)), wi(_(ks,kBot)), H,
558 IndexType(iLoZ), IndexType(iHiZ), Z,
559 _V, _U, _WV, _WH);
560 }
561 //
562 // ==== Note progress (or the lack of it). ====
563 //
564 if (ld>0) {
565 nDfl = 1;
566 } else {
567 ++nDfl;
568 }
569 //
570 // ==== End of main loop ====
571 //
572 }
573 //
574 // ==== Iteration limit exceeded. Set INFO to show where
575 // . the problem occurred and exit. ====
576 //
577 if (it>itMax) {
578 info = kBot;
579 }
580 }
581
582 work(1) = lWorkOpt;
583 return info;
584 }
585
586 //== interface for native lapack ===============================================
587
588 #ifdef CHECK_CXXLAPACK
589
590 template <typename IndexType, typename MH>
591 IndexType
592 laqr0_native_wsq(bool wantT,
593 bool wantZ,
594 IndexType iLo,
595 IndexType iHi,
596 const GeMatrix<MH> &H)
597 {
598 typedef typename GeMatrix<MH>::ElementType T;
599
600 const LOGICAL WANTT = wantT;
601 const LOGICAL WANTZ = wantZ;
602 const INTEGER N = H.numRows();
603 const INTEGER ILO = iLo;
604 const INTEGER IHI = iHi;
605 const INTEGER LDH = H.leadingDimension();
606 const INTEGER ILOZ = 0;
607 const INTEGER IHIZ = 0;
608 const INTEGER LDZ = 0;
609 T WORK;
610 T DUMMY;
611 const INTEGER LWORK = -1;
612 INTEGER INFO;
613
614 if (IsSame<T,DOUBLE>::value) {
615 LAPACK_IMPL(dlaqr0)(&WANTT,
616 &WANTZ,
617 &N,
618 &ILO,
619 &IHI,
620 &DUMMY,
621 &LDH,
622 &DUMMY,
623 &DUMMY,
624 &ILOZ,
625 &IHIZ,
626 &DUMMY,
627 &LDZ,
628 &WORK,
629 &LWORK,
630 &INFO);
631 } else {
632 ASSERT(0);
633 }
634 return WORK;
635 }
636
637 template <typename IndexType, typename MH, typename VWR, typename VWI,
638 typename MZ, typename VWORK>
639 IndexType
640 laqr0_native(bool wantT,
641 bool wantZ,
642 IndexType iLo,
643 IndexType iHi,
644 GeMatrix<MH> &H,
645 DenseVector<VWR> &wr,
646 DenseVector<VWI> &wi,
647 IndexType iLoZ,
648 IndexType iHiZ,
649 GeMatrix<MZ> &Z,
650 DenseVector<VWORK> &work)
651 {
652 typedef typename GeMatrix<MH>::ElementType T;
653
654 const LOGICAL WANTT = wantT;
655 const LOGICAL WANTZ = wantZ;
656 const INTEGER N = H.numRows();
657 const INTEGER ILO = iLo;
658 const INTEGER IHI = iHi;
659 const INTEGER LDH = H.leadingDimension();
660 const INTEGER ILOZ = iLoZ;
661 const INTEGER IHIZ = iHiZ;
662 const INTEGER LDZ = Z.leadingDimension();
663 const INTEGER LWORK = work.length();
664 INTEGER INFO;
665
666 if (IsSame<T,DOUBLE>::value) {
667 LAPACK_IMPL(dlaqr0)(&WANTT,
668 &WANTZ,
669 &N,
670 &ILO,
671 &IHI,
672 H.data(),
673 &LDH,
674 wr.data(),
675 wi.data(),
676 &ILOZ,
677 &IHIZ,
678 Z.data(),
679 &LDZ,
680 work.data(),
681 &LWORK,
682 &INFO);
683 } else {
684 ASSERT(0);
685 }
686 ASSERT(INFO>=0);
687 return INFO;
688 }
689
690 #endif // CHECK_CXXLAPACK
691
692 //== public interface ==========================================================
693 template <typename IndexType, typename MH>
694 IndexType
695 laqr0_wsq(bool wantT,
696 bool wantZ,
697 IndexType iLo,
698 IndexType iHi,
699 const GeMatrix<MH> &H)
700 {
701 using std::max;
702 //
703 // Test the input parameters
704 //
705 # ifndef NDEBUG
706 ASSERT(H.firstRow()==1);
707 ASSERT(H.firstCol()==1);
708 ASSERT(H.numRows()==H.numCols());
709
710 const IndexType n = H.numRows();
711
712 if (n>0) {
713 ASSERT(1<=iLo);
714 ASSERT(iLo<=iHi);
715 ASSERT(iHi<=n);
716 } else {
717 ASSERT(iLo==1);
718 ASSERT(iHi==0);
719 }
720 # endif
721
722 //
723 // Call implementation
724 //
725 IndexType info = laqr0_generic_wsq(wantT, wantZ, iLo, iHi, H);
726
727 # ifdef CHECK_CXXLAPACK
728 //
729 // Compare results
730 //
731 IndexType _info = laqr0_native_wsq(wantT, wantZ, iLo, iHi, H);
732
733 if (info!=_info) {
734 std::cerr << "CXXLAPACK: info = " << info << std::endl;
735 std::cerr << "F77LAPACK: _info = " << _info << std::endl;
736 ASSERT(0);
737 }
738 # endif
739 return info;
740 }
741
742 template <typename IndexType, typename MH, typename VWR, typename VWI,
743 typename MZ, typename VWORK>
744 IndexType
745 laqr0(bool wantT,
746 bool wantZ,
747 IndexType iLo,
748 IndexType iHi,
749 GeMatrix<MH> &H,
750 DenseVector<VWR> &wr,
751 DenseVector<VWI> &wi,
752 IndexType iLoZ,
753 IndexType iHiZ,
754 GeMatrix<MZ> &Z,
755 DenseVector<VWORK> &work)
756 {
757 LAPACK_DEBUG_OUT("laqr0");
758
759 using std::max;
760 //
761 // Test the input parameters
762 //
763 # ifndef NDEBUG
764 ASSERT(H.firstRow()==1);
765 ASSERT(H.firstCol()==1);
766 ASSERT(H.numRows()==H.numCols());
767
768 const IndexType n = H.numRows();
769
770 if (n>0) {
771 ASSERT(1<=iLo);
772 ASSERT(iLo<=iHi);
773 ASSERT(iHi<=n);
774 } else {
775 ASSERT(iLo==1);
776 ASSERT(iHi==0);
777 }
778
779 ASSERT(wr.firstIndex()==1);
780 ASSERT(wr.length()>=iHi);
781
782 ASSERT(wi.firstIndex()==1);
783 ASSERT(wi.length()>=iHi);
784
785 ASSERT(1<=iLoZ);
786 ASSERT(iLoZ<=iLo);
787 ASSERT(iHi<=iHiZ);
788 ASSERT(iHiZ<=n);
789
790 ASSERT(Z.firstRow()==1);
791 ASSERT(Z.firstCol()==1);
792 ASSERT(Z.numRows()>=iHi);
793 ASSERT(Z.numCols()>=iHi);
794
795 ASSERT((work.length()==0) || (work.length()>=n));
796 # endif
797
798 //
799 // Make copies of output arguments
800 //
801 # ifdef CHECK_CXXLAPACK
802 typename GeMatrix<MH>::NoView H_org = H;
803 typename DenseVector<VWR>::NoView wr_org = wr;
804 typename DenseVector<VWI>::NoView wi_org = wi;
805 typename GeMatrix<MZ>::NoView Z_org = Z;
806 typename DenseVector<VWORK>::NoView work_org = work;
807 # endif
808
809 //
810 // Call implementation
811 //
812 IndexType info = laqr0_generic(wantT, wantZ, iLo, iHi, H, wr, wi,
813 iLoZ, iHiZ, Z, work);
814
815 # ifdef CHECK_CXXLAPACK
816 //
817 // Make copies of results computed by the generic implementation
818 //
819 typename GeMatrix<MH>::NoView H_generic = H;
820 typename DenseVector<VWR>::NoView wr_generic = wr;
821 typename DenseVector<VWI>::NoView wi_generic = wi;
822 typename GeMatrix<MZ>::NoView Z_generic = Z;
823 typename DenseVector<VWORK>::NoView work_generic = work;
824 //
825 // restore output arguments
826 //
827 H = H_org;
828 wr = wr_org;
829 wi = wi_org;
830 Z = Z_org;
831 work = work_org;
832 //
833 // Compare generic results with results from the native implementation
834 //
835 IndexType _info = laqr0_native(wantT, wantZ, iLo, iHi, H, wr, wi,
836 iLoZ, iHiZ, Z, work);
837
838 bool failed = false;
839 if (! isIdentical(H_generic, H, "H_generic", "H")) {
840 std::cerr << "CXXLAPACK: H_generic = " << H_generic << std::endl;
841 std::cerr << "F77LAPACK: H = " << H << std::endl;
842 failed = true;
843 }
844
845 if (! isIdentical(wr_generic, wr, "wr_generic", "wr")) {
846 std::cerr << "CXXLAPACK: wr_generic = " << wr_generic << std::endl;
847 std::cerr << "F77LAPACK: wr = " << wr << std::endl;
848 failed = true;
849 }
850
851 if (! isIdentical(wi_generic, wi, "wi_generic", "wi")) {
852 std::cerr << "CXXLAPACK: wi_generic = " << wi_generic << std::endl;
853 std::cerr << "F77LAPACK: wi = " << wi << std::endl;
854 failed = true;
855 }
856
857 if (! isIdentical(Z_generic, Z, "Z_generic", "Z")) {
858 std::cerr << "CXXLAPACK: Z_generic = " << Z_generic << std::endl;
859 std::cerr << "F77LAPACK: Z = " << Z << std::endl;
860 failed = true;
861 }
862
863 if (! isIdentical(info, _info, " info", "_info")) {
864 std::cerr << "CXXLAPACK: info = " << info << std::endl;
865 std::cerr << "F77LAPACK: _info = " << _info << std::endl;
866 failed = true;
867 }
868
869 if (! isIdentical(work_generic, work, "work_generic", "work")) {
870 std::cerr << "CXXLAPACK: work_generic = " << work_generic << std::endl;
871 std::cerr << "F77LAPACK: work = " << work << std::endl;
872 failed = true;
873 }
874
875 if (failed) {
876 std::cerr << "error in: laqr0.tcc" << std::endl;
877 ASSERT(0);
878 } else {
879 // std::cerr << "passed: laqr0.tcc" << std::endl;
880 }
881 # endif
882
883 return info;
884 }
885
886 //-- forwarding ----------------------------------------------------------------
887 template <typename IndexType, typename MH>
888 IndexType
889 laqr0_wsq(bool wantT,
890 bool wantZ,
891 IndexType iLo,
892 IndexType iHi,
893 const MH &&H)
894 {
895 CHECKPOINT_ENTER;
896 const IndexType info = laqr0_wsq(wantT, wantZ, iLo, iHi, H);
897 CHECKPOINT_LEAVE;
898
899 return info;
900 }
901
902 template <typename IndexType, typename MH, typename VWR, typename VWI,
903 typename MZ, typename VWORK>
904 IndexType
905 laqr0(bool wantT,
906 bool wantZ,
907 IndexType iLo,
908 IndexType iHi,
909 MH &&H,
910 VWR &&wr,
911 VWI &&wi,
912 IndexType iLoZ,
913 IndexType iHiZ,
914 MZ &&Z,
915 VWORK &&work)
916 {
917 CHECKPOINT_ENTER;
918 const IndexType info = laqr0(wantT, wantZ, iLo, iHi, H, wr, wi,
919 iLoZ, iHiZ, Z, work);
920 CHECKPOINT_LEAVE;
921
922 return info;
923 }
924
925 } } // namespace lapack, flens
926
927 #endif // FLENS_LAPACK_EIG_LAQR0_TCC