1 /*
2 * Copyright (c) 2011, Michael Lehn
3 *
4 * All rights reserved.
5 *
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions
8 * are met:
9 *
10 * 1) Redistributions of source code must retain the above copyright
11 * notice, this list of conditions and the following disclaimer.
12 * 2) Redistributions in binary form must reproduce the above copyright
13 * notice, this list of conditions and the following disclaimer in
14 * the documentation and/or other materials provided with the
15 * distribution.
16 * 3) Neither the name of the FLENS development group nor the names of
17 * its contributors may be used to endorse or promote products derived
18 * from this software without specific prior written permission.
19 *
20 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
21 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
22 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
23 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
24 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
25 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
26 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
27 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
28 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
29 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
30 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
31 */
32
33 /* Based on
34 *
35 SUBROUTINE DLAQR4( WANTT, WANTZ, N, ILO, IHI, H, LDH, WR, WI,
36 $ ILOZ, IHIZ, Z, LDZ, WORK, LWORK, INFO )
37 *
38 * -- LAPACK auxiliary routine (version 3.2) --
39 * Univ. of Tennessee, Univ. of California Berkeley,
40 * Univ. of Colorado Denver and NAG Ltd..
41 * November 2006
42 */
43
44 #ifndef FLENS_LAPACK_EIG_LAQR4_TCC
45 #define FLENS_LAPACK_EIG_LAQR4_TCC 1
46
47 #include <flens/blas/blas.h>
48 #include <flens/lapack/lapack.h>
49
50 namespace flens { namespace lapack {
51
52 //== generic lapack implementation =============================================
53
54 template <typename IndexType, typename MH>
55 IndexType
56 laqr4_generic_wsq(bool wantT,
57 bool wantZ,
58 IndexType iLo,
59 IndexType iHi,
60 const GeMatrix<MH> &H)
61 {
62 using std::max;
63 using std::min;
64
65 typedef typename GeMatrix<MH>::ElementType T;
66
67 const IndexType nTiny = 11;
68 const IndexType n = H.numRows();
69
70 if ((n==0) || (n<=nTiny)) {
71 return 1;
72 }
73
74 char job[3];
75 job[0] = (wantT) ? 'S' : 'E';
76 job[1] = (wantZ) ? 'V' : 'N';
77 job[2] = 0;
78 //
79 // ==== NWR = recommended deflation window size. At this
80 // . point, N .GT. NTINY = 11, so there is enough
81 // . subdiagonal workspace for NWR.GE.2 as required.
82 // . (In fact, there is enough subdiagonal space for
83 // . NWR.GE.3.) ====
84 //
85 IndexType nwr = ilaenv<T>(13, "LAQR4", job, n, iLo, iHi, -1);
86 nwr = max(IndexType(2), nwr);
87 nwr = min(min(iHi-iLo+1, (n-1)/3), nwr);
88 //
89 // ==== NSR = recommended number of simultaneous shifts.
90 // . At this point N .GT. NTINY = 11, so there is at
91 // . enough subdiagonal workspace for NSR to be even
92 // . and greater than or equal to two as required. ====
93 //
94 IndexType nsr = ilaenv<T>(15, "LAQR4", job, n, iLo, iHi, -1);
95 nsr = min(min(nsr, (n+6)/9), iHi-iLo);
96 nsr = max(IndexType(2), nsr-(nsr%2));
97 //
98 // ==== Estimate optimal workspace ====
99 //
100 // ==== Workspace query call to DLAQR2 ====
101 //
102 IndexType lWorkOpt = laqr2_wsq(iLo, iHi, nwr+1, H);
103 //
104 // ==== Optimal workspace = MAX(DLAQR5, DLAQR2) ====
105 //
106 return max(3*nsr/2, lWorkOpt);
107 }
108
109 template <typename IndexType, typename MH, typename VWR, typename VWI,
110 typename MZ, typename VWORK>
111 IndexType
112 laqr4_generic(bool wantT,
113 bool wantZ,
114 IndexType iLo,
115 IndexType iHi,
116 GeMatrix<MH> &H,
117 DenseVector<VWR> &wr,
118 DenseVector<VWI> &wi,
119 IndexType iLoZ,
120 IndexType iHiZ,
121 GeMatrix<MZ> &Z,
122 DenseVector<VWORK> &work)
123 {
124 using std::abs;
125 using std::max;
126 using std::min;
127 using std::swap;
128
129 typedef typename GeMatrix<MH>::ElementType T;
130
131 const Underscore<IndexType> _;
132
133 const IndexType n = H.numRows();
134
135 // ==== Matrices of order NTINY or smaller must be processed by
136 // . DLAHQR because of insufficient subdiagonal scratch space.
137 // . (This is a hard limit.) ====
138 const IndexType nTiny = 11;
139
140 // ==== Exceptional deflation windows: try to cure rare
141 // . slow convergence by varying the size of the
142 // . deflation window after KEXNW iterations. ====
143 const IndexType kexNw = 5;
144
145 //
146 // ==== Exceptional shifts: try to cure rare slow convergence
147 // . with ad-hoc exceptional shifts every KEXSH iterations.
148 // . ====
149 const IndexType kexSh = 6;
150
151 //
152 // ==== The constants WILK1 and WILK2 are used to form the
153 // . exceptional shifts. ====
154 const T wilk1 = T(0.75),
155 wilk2 = T(-0.4375);
156
157 const T Zero(0), One(1);
158
159 IndexType info = 0;
160 IndexType lWork, lWorkOpt;
161
162 IndexType nDec = -1;
163
164 //
165 // ==== Perform and apply a workspace query if necessary ====
166 //
167 if (work.length()==0) {
168 lWorkOpt = laqr0_generic_wsq(wantT, wantZ, iLo, iHi, H);
169 work.resize(lWorkOpt);
170 }
171
172 lWork = work.length();
173
174 //
175 // ==== Quick return for N = 0: nothing to do. ====
176 //
177 if (n==0) {
178 work(1) = One;
179 return info;
180 }
181
182 if (n<=nTiny) {
183 //
184 // ==== Tiny matrices must use DLAHQR. ====
185 //
186 lWorkOpt = 1;
187 info = lahqr(wantT, wantZ, iLo, iHi, H, wr, wi, iLoZ, iHiZ, Z);
188 } else {
189 //
190 // ==== Use small bulge multi-shift QR with aggressive early
191 // . deflation on larger-than-tiny matrices. ====
192 //
193 // ==== Hope for the best. ====
194 //
195 info = 0;
196 //
197 // ==== Set up job flags for ILAENV. ====
198 //
199 char job[3];
200 job[0] = (wantT) ? 'S' : 'E';
201 job[1] = (wantZ) ? 'V' : 'N';
202 job[2] = 0;
203 //
204 // ==== NWR = recommended deflation window size. At this
205 // . point, N .GT. NTINY = 11, so there is enough
206 // . subdiagonal workspace for NWR.GE.2 as required.
207 // . (In fact, there is enough subdiagonal space for
208 // . NWR.GE.3.) ====
209 //
210 IndexType nwr = ilaenv<T>(13, "LAQR0", job, n, iLo, iHi, lWork);
211 nwr = max(IndexType(2), nwr);
212 nwr = min(min(iHi-iLo+1, (n-1)/3), nwr);
213 //
214 // ==== NSR = recommended number of simultaneous shifts.
215 // . At this point N .GT. NTINY = 11, so there is at
216 // . enough subdiagonal workspace for NSR to be even
217 // . and greater than or equal to two as required. ====
218 //
219 IndexType nsr = ilaenv<T>(15, "LAQR0", job, n, iLo, iHi, lWork);
220 nsr = min(min(nsr, (n+6)/9), iHi-iLo);
221 nsr = max(IndexType(2), nsr - (nsr%2));
222 //
223 // ==== Estimate optimal workspace ====
224 //
225 // ==== Workspace query call to DLAQR2 ====
226 //
227 lWorkOpt = laqr2_wsq(iLo, iHi, nwr+1, H);
228 //
229 // ==== Optimal workspace = MAX(DLAQR5, DLAQR2) ====
230 //
231 lWorkOpt = max(3*nsr/2, lWorkOpt);
232 //
233 // ==== DLAHQR/DLAQR0 crossover point ====
234 //
235 IndexType nMin = ilaenv<T>(12, "LAQR0", job, n, iLo, iHi, lWork);
236 nMin = max(nTiny, nMin);
237 //
238 // ==== Nibble crossover point ====
239 //
240 IndexType nibble = ilaenv<T>(14, "LAQR0", job, n, iLo, iHi, lWork);
241 nibble = max(IndexType(0), nibble);
242 //
243 // ==== Accumulate reflections during ttswp? Use block
244 // . 2-by-2 structure during matrix-matrix multiply? ====
245 //
246 IndexType kacc22 = ilaenv<T>(16, "LAQR0", job, n, iLo, iHi, lWork);
247 kacc22 = max(IndexType(0), kacc22);
248 kacc22 = min(IndexType(2), kacc22);
249 //
250 // ==== NWMAX = the largest possible deflation window for
251 // . which there is sufficient workspace. ====
252 //
253 IndexType nwMax = min((n-1)/3, lWork/2);
254 IndexType nw = nwMax;
255 //
256 // ==== NSMAX = the Largest number of simultaneous shifts
257 // . for which there is sufficient workspace. ====
258 //
259 IndexType nsMax = min((n+6 )/9, 2*lWork/3);
260 nsMax = nsMax - (nsMax%2);
261 //
262 // ==== NDFL: an iteration count restarted at deflation. ====
263 //
264 IndexType nDfl = 1;
265 //
266 // ==== ITMAX = iteration limit ====
267 //
268 IndexType itMax = max(IndexType(30), 2*kexSh)
269 * max(IndexType(10), (iHi-iLo+1));
270 //
271 // ==== Last row and column in the active block ====
272 //
273 IndexType kBot = iHi;
274 //
275 // ==== Main Loop ====
276 //
277 IndexType it;
278 for (it=1; it<=itMax; ++it) {
279 //
280 // ==== Done when KBOT falls below ILO ====
281 //
282 if (kBot<iLo) {
283 break;
284 }
285 //
286 // ==== Locate active block ====
287 //
288 IndexType k;
289 for (k=kBot; k>=iLo+1; --k) {
290 if (H(k,k-1)==Zero) {
291 break;
292 }
293 }
294 ASSERT(k==iLo || H(k,k-1)==Zero);
295 const IndexType kTop = k;
296 //
297 // ==== Select deflation window size:
298 // . Typical Case:
299 // . If possible and advisable, nibble the entire
300 // . active block. If not, use size MIN(NWR,NWMAX)
301 // . or MIN(NWR+1,NWMAX) depending upon which has
302 // . the smaller corresponding subdiagonal entry
303 // . (a heuristic).
304 // .
305 // . Exceptional Case:
306 // . If there have been no deflations in KEXNW or
307 // . more iterations, then vary the deflation window
308 // . size. At first, because, larger windows are,
309 // . in general, more powerful than smaller ones,
310 // . rapidly increase the window to the maximum possible.
311 // . Then, gradually reduce the window size. ====
312 //
313 IndexType nh = kBot - kTop + 1;
314 IndexType nwUpBd = min(nh, nwMax);
315 if (nDfl<kexNw) {
316 nw = min(nwUpBd, nwr);
317 } else {
318 nw = min(nwUpBd, 2*nw);
319 }
320 if (nw<nwMax) {
321 if (nw>=nh-1) {
322 nw = nh;
323 } else {
324 const IndexType kwTop = kBot - nw + 1;
325 if (abs(H(kwTop,kwTop-1))>abs(H(kwTop-1, kwTop-2))) {
326 ++nw;
327 }
328 }
329 }
330 if (nDfl<kexNw) {
331 nDec = -1;
332 } else if (nDec>=0 || nw>=nwUpBd) {
333 ++nDec;
334 if (nw-nDec<2) {
335 nDec = 0;
336 }
337 nw -= nDec;
338 }
339 //
340 // ==== Aggressive early deflation:
341 // . split workspace under the subdiagonal into
342 // . - an nw-by-nw work array V in the lower
343 // . left-hand-corner,
344 // . - an NW-by-at-least-NW-but-more-is-better
345 // . (NW-by-NHO) horizontal work array along
346 // . the bottom edge,
347 // . - an at-least-NW-but-more-is-better (NHV-by-NW)
348 // . vertical work array along the left-hand-edge.
349 // . ====
350 //
351 auto _V = H(_(n-nw+1, n), _( 1, nw));
352 auto _T = H(_(n-nw+1, n), _(nw+1, n-nw-1));
353 auto _WV = H(_( nw+2, n-nw), _( 1, nw));
354 //
355 // ==== Aggressive early deflation ====
356 //
357 IndexType ls, ld;
358
359 laqr2(wantT, wantZ, kTop, kBot, nw, H, iLoZ, iHiZ, Z,
360 ls, ld, wr(_(1,kBot)), wi(_(1,kBot)),
361 _V, _T, _WV, work);
362 //
363 // ==== Adjust KBOT accounting for new deflations. ====
364 //
365 kBot -= ld;
366 //
367 // ==== KS points to the shifts. ====
368 //
369 IndexType ks = kBot - ls + 1;
370 //
371 // ==== Skip an expensive QR sweep if there is a (partly
372 // . heuristic) reason to expect that many eigenvalues
373 // . will deflate without it. Here, the QR sweep is
374 // . skipped if many eigenvalues have just been deflated
375 // . or if the remaining active block is small.
376 //
377 if ((ld==0)
378 || ((100*ld<=nw*nibble) && (kBot-kTop+1>min(nMin, nwMax))))
379 {
380 //
381 // ==== NS = nominal number of simultaneous shifts.
382 // . This may be lowered (slightly) if DLAQR2
383 // . did not provide that many shifts. ====
384 //
385 IndexType ns = min(min(nsMax, nsr),
386 max(IndexType(2), kBot-kTop));
387 ns = ns - (ns % 2);
388 //
389 // ==== If there have been no deflations
390 // . in a multiple of KEXSH iterations,
391 // . then try exceptional shifts.
392 // . Otherwise use shifts provided by
393 // . DLAQR2 above or from the eigenvalues
394 // . of a trailing principal submatrix. ====
395 //
396 if (nDfl%kexSh==0) {
397 ks = kBot - ns + 1;
398 for (IndexType i=kBot; i>=max(ks+1,kTop+2); i-=2) {
399 const T ss = abs(H(i,i-1)) + abs(H(i-1,i-2));
400 T aa = wilk1*ss + H(i,i);
401 T bb = ss;
402 T cc = wilk2*ss;
403 T dd = aa;
404 T cs, sn;
405 lanv2(aa, bb, cc, dd,
406 wr(i-1), wi(i-1), wr(i), wi(i),
407 cs, sn);
408 }
409 if (ks==kTop) {
410 wr(ks+1) = H(ks+1, ks+1);
411 wi(ks+1) = Zero;
412 wr(ks) = wr(ks+1);
413 wi(ks) = wi(ks+1);
414 }
415 } else {
416 //
417 // ==== Got NS/2 or fewer shifts? Use DLAQR4 or
418 // . DLAHQR on a trailing principal submatrix to
419 // . get more. (Since NS.LE.NSMAX.LE.(N+6)/9,
420 // . there is enough space below the subdiagonal
421 // . to fit an NS-by-NS scratch array.) ====
422 //
423 if (kBot-ks+1<=ns/2) {
424 ks = kBot - ns +1;
425 H(_(ks,kBot),_(1,ns)) = H(_(ks,kBot),_(ks,kBot));
426 if (ns>nMin) {
427 // TODO: avoid the need for ZDummy
428 typename GeMatrix<MZ>::NoView ZDummy;
429 ks += laqr4(false, false,
430 IndexType(1), ns,
431 H(_(ks,kBot),_(1,ns)),
432 wr(_(ks,kBot)), wi(_(ks,kBot)),
433 IndexType(1), IndexType(1),
434 ZDummy, work);
435 } else {
436 // TODO: avoid the need for ZDummy
437 typename GeMatrix<MZ>::NoView ZDummy;
438 ks += lahqr(false, false,
439 IndexType(1), ns,
440 H(_(ks,kBot),_(1,ns)),
441 wr(_(ks,kBot)), wi(_(ks,kBot)),
442 IndexType(1), IndexType(1),
443 ZDummy);
444 }
445 //
446 // ==== In case of a rare QR failure use
447 // . eigenvalues of the trailing 2-by-2
448 // . principal submatrix. ====
449 //
450 if (ks>=kBot) {
451 T aa = H(kBot-1,kBot-1);
452 T cc = H(kBot, kBot-1);
453 T bb = H(kBot-1,kBot);
454 T dd = H(kBot, kBot);
455 T cs, sn;
456 lanv2(aa, bb, cc, dd,
457 wr(kBot-1), wi(kBot-1), wr(kBot), wi(kBot),
458 cs, sn);
459 ks = kBot - 1;
460 }
461 }
462
463 if (kBot-ks+1>ns) {
464 //
465 // ==== Sort the shifts (Helps a little)
466 // . Bubble sort keeps complex conjugate
467 // . pairs together. ====
468 //
469 bool sorted = false;
470 for (IndexType k=kBot; k>=ks+1; --k) {
471 if (sorted) {
472 break;
473 }
474 sorted = true;
475 for (IndexType i=ks; i<=k-1; ++i) {
476 if (abs(wr(i))+abs(wi(i))
477 < abs(wr(i+1))+abs(wi(i+1)))
478 {
479 sorted = false;
480 swap(wr(i), wr(i+1));
481 swap(wi(i), wi(i+1));
482 }
483 }
484 }
485 }
486 //
487 // ==== Shuffle shifts into pairs of real shifts
488 // . and pairs of complex conjugate shifts
489 // . assuming complex conjugate shifts are
490 // . already adjacent to one another. (Yes,
491 // . they are.) ====
492 //
493 for (IndexType i=kBot; i>=ks+2; i-=2) {
494 if (wi(i)!=-wi(i-1)) {
495 T tmp = wr(i);
496 wr(i) = wr(i-1);
497 wr(i-1) = wr(i-2);
498 wr(i-2) = tmp;
499
500 tmp = wi(i);
501 wi(i) = wi(i-1);
502 wi(i-1) = wi(i-2);
503 wi(i-2) = tmp;
504 }
505 }
506 }
507 //
508 // ==== If there are only two shifts and both are
509 // . real, then use only one. ====
510 //
511 if (kBot-ks+1==2) {
512 if (wi(kBot)==0) {
513 const T _H = H(kBot,kBot);
514 if (abs(wr(kBot)-_H) < abs(wr(kBot-1)-_H)) {
515 wr(kBot-1) = wr(kBot);
516 } else {
517 wr(kBot) = wr(kBot-1);
518 }
519 }
520 }
521 //
522 // ==== Use up to NS of the the smallest magnatiude
523 // . shifts. If there aren't NS shifts available,
524 // . then use them all, possibly dropping one to
525 // . make the number of shifts even. ====
526 //
527 ns = min(ns, kBot-ks+1);
528 ns = ns - (ns%2);
529 ks = kBot - ns + 1;
530 //
531 // ==== Small-bulge multi-shift QR sweep:
532 // . split workspace under the subdiagonal into
533 // . - a KDU-by-KDU work array U in the lower
534 // . left-hand-corner,
535 // . - a KDU-by-at-least-KDU-but-more-is-better
536 // . (KDU-by-NHo) horizontal work array WH along
537 // . the bottom edge,
538 // . - and an at-least-KDU-but-more-is-better-by-KDU
539 // . (NVE-by-KDU) vertical work WV arrow along
540 // . the left-hand-edge. ====
541 //
542 IndexType kdu = 3*ns - 3;
543 IndexType ku = n - kdu + 1;
544 IndexType kwv = kdu + 4;
545 IndexType nho = (n-kdu+1-4) - (kdu+1) + 1;
546
547 // NOTE: _WH.numCols()==_WV.numRows()
548
549 typedef typename GeMatrix<MH>::View GeMatrixView;
550 GeMatrixView _V(IndexType(3), ns/2, work(_(1,3*ns/2)));
551 auto _U = H(_( ku, n), _( 1, kdu));
552 auto _WV = H(_(kwv,n-kdu), _( 1, kdu));
553 auto _WH = H(_( ku, n), _(kdu+1,kdu+nho));
554 //
555 // ==== Small-bulge multi-shift QR sweep ====
556 //
557 laqr5(wantT, wantZ, kacc22, kTop, kBot, ns,
558 wr(_(ks,kBot)), wi(_(ks,kBot)), H,
559 iLoZ, iHiZ, Z,
560 _V, _U, _WV, _WH);
561 }
562 //
563 // ==== Note progress (or the lack of it). ====
564 //
565 if (ld>0) {
566 nDfl = 1;
567 } else {
568 ++nDfl;
569 }
570 //
571 // ==== End of main loop ====
572 //
573 }
574 //
575 // ==== Iteration limit exceeded. Set INFO to show where
576 // . the problem occurred and exit. ====
577 //
578 if (it>itMax) {
579 info = kBot;
580 }
581 }
582
583 work(1) = lWorkOpt;
584 return info;
585 }
586
587 //== interface for native lapack ===============================================
588
589 #ifdef CHECK_CXXLAPACK
590
591 template <typename IndexType, typename MH>
592 IndexType
593 laqr4_native_wsq(bool wantT,
594 bool wantZ,
595 IndexType iLo,
596 IndexType iHi,
597 const GeMatrix<MH> &H)
598 {
599 typedef typename GeMatrix<MH>::ElementType T;
600
601 const LOGICAL WANTT = wantT;
602 const LOGICAL WANTZ = wantZ;
603 const INTEGER N = H.numRows();
604 const INTEGER ILO = iLo;
605 const INTEGER IHI = iHi;
606 const INTEGER LDH = H.leadingDimension();
607 const INTEGER ILOZ = 1;
608 const INTEGER IHIZ = 0;
609 const INTEGER LDZ = 1;
610 T WORK;
611 T DUMMY;
612 const INTEGER LWORK = -1;
613 INTEGER INFO;
614
615 if (IsSame<T,DOUBLE>::value) {
616 LAPACK_IMPL(dlaqr4)(&WANTT,
617 &WANTZ,
618 &N,
619 &ILO,
620 &IHI,
621 &DUMMY,
622 &LDH,
623 &DUMMY,
624 &DUMMY,
625 &ILOZ,
626 &IHIZ,
627 &DUMMY,
628 &LDZ,
629 &WORK,
630 &LWORK,
631 &INFO);
632 } else {
633 ASSERT(0);
634 }
635 return WORK;
636 }
637
638 template <typename IndexType, typename MH, typename VWR, typename VWI,
639 typename MZ, typename VWORK>
640 IndexType
641 laqr4_native(bool wantT,
642 bool wantZ,
643 IndexType iLo,
644 IndexType iHi,
645 GeMatrix<MH> &H,
646 DenseVector<VWR> &wr,
647 DenseVector<VWI> &wi,
648 IndexType iLoZ,
649 IndexType iHiZ,
650 GeMatrix<MZ> &Z,
651 DenseVector<VWORK> &work)
652 {
653 typedef typename GeMatrix<MH>::ElementType T;
654
655 const LOGICAL WANTT = wantT;
656 const LOGICAL WANTZ = wantZ;
657 const INTEGER N = H.numRows();
658 const INTEGER ILO = iLo;
659 const INTEGER IHI = iHi;
660 const INTEGER LDH = H.leadingDimension();
661 const INTEGER ILOZ = iLoZ;
662 const INTEGER IHIZ = iHiZ;
663 const INTEGER LDZ = Z.leadingDimension();
664 const INTEGER LWORK = work.length();
665 INTEGER INFO;
666
667 if (IsSame<T,DOUBLE>::value) {
668 LAPACK_IMPL(dlaqr4)(&WANTT,
669 &WANTZ,
670 &N,
671 &ILO,
672 &IHI,
673 H.data(),
674 &LDH,
675 wr.data(),
676 wi.data(),
677 &ILOZ,
678 &IHIZ,
679 Z.data(),
680 &LDZ,
681 work.data(),
682 &LWORK,
683 &INFO);
684 } else {
685 ASSERT(0);
686 }
687 ASSERT(INFO>=0);
688 return INFO;
689 }
690
691 #endif // CHECK_CXXLAPACK
692
693 //== public interface ==========================================================
694 template <typename IndexType, typename MH>
695 IndexType
696 laqr4_wsq(bool wantT,
697 bool wantZ,
698 IndexType iLo,
699 IndexType iHi,
700 const GeMatrix<MH> &H)
701 {
702 using std::max;
703 //
704 // Test the input parameters
705 //
706 # ifndef NDEBUG
707 ASSERT(H.firstRow()==1);
708 ASSERT(H.firstCol()==1);
709 ASSERT(H.numRows()==H.numCols());
710
711 const IndexType n = H.numRows();
712
713 if (n>0) {
714 ASSERT(1<=iLo);
715 ASSERT(iLo<=iHi);
716 ASSERT(iHi<=n);
717 } else {
718 ASSERT(iLo==1);
719 ASSERT(iHi==0);
720 }
721
722 ASSERT(1<=iLo);
723 ASSERT(iLo<=iHi);
724 ASSERT(iHi<=n);
725 # endif
726
727 //
728 // Call implementation
729 //
730 // TODO: call generic implementation
731 IndexType info = laqr4_generic_wsq(wantT, wantZ, iLo, iHi, H);
732
733 # ifdef CHECK_CXXLAPACK
734 //
735 // Compare results
736 //
737 IndexType _info = laqr4_native_wsq(wantT, wantZ, iLo, iHi, H);
738
739 if (info!=_info) {
740 std::cerr << "CXXLAPACK: info = " << info << std::endl;
741 std::cerr << "F77LAPACK: _info = " << _info << std::endl;
742 ASSERT(0);
743 }
744 # endif
745 return info;
746
747 }
748
749 template <typename IndexType, typename MH, typename VWR, typename VWI,
750 typename MZ, typename VWORK>
751 IndexType
752 laqr4(bool wantT,
753 bool wantZ,
754 IndexType iLo,
755 IndexType iHi,
756 GeMatrix<MH> &H,
757 DenseVector<VWR> &wr,
758 DenseVector<VWI> &wi,
759 IndexType iLoZ,
760 IndexType iHiZ,
761 GeMatrix<MZ> &Z,
762 DenseVector<VWORK> &work)
763 {
764 LAPACK_DEBUG_OUT("laqr4");
765
766 using std::max;
767 //
768 // Test the input parameters
769 //
770 # ifndef NDEBUG
771 ASSERT(H.firstRow()==1);
772 ASSERT(H.firstCol()==1);
773 ASSERT(H.numRows()==H.numCols());
774
775 const IndexType n = H.numRows();
776
777 if (n>0) {
778 ASSERT(1<=iLo);
779 ASSERT(iLo<=iHi);
780 ASSERT(iHi<=n);
781 } else {
782 ASSERT(iLo==1);
783 ASSERT(iHi==0);
784 }
785
786 ASSERT(wr.firstIndex()==1);
787 ASSERT(wr.length()>=iHi);
788
789 ASSERT(wi.firstIndex()==1);
790 ASSERT(wi.length()>=iHi);
791
792 if (wantZ) {
793 ASSERT(1<=iLoZ);
794 ASSERT(iLoZ<=iLo);
795 ASSERT(iHi<=iHiZ);
796 ASSERT(iHiZ<=n);
797
798 ASSERT(Z.firstRow()==1);
799 ASSERT(Z.firstCol()==1);
800 ASSERT(Z.numRows()>=iHi);
801 ASSERT(Z.numCols()>=iHi);
802 }
803
804 ASSERT((work.length()==0) || (work.length()>=n));
805 # endif
806
807 //
808 // Make copies of output arguments
809 //
810 # ifdef CHECK_CXXLAPACK
811 typename GeMatrix<MH>::NoView H_org = H;
812 typename DenseVector<VWR>::NoView wr_org = wr;
813 typename DenseVector<VWI>::NoView wi_org = wi;
814 typename GeMatrix<MZ>::NoView Z_org = Z;
815 typename DenseVector<VWORK>::NoView work_org = work;
816 # endif
817
818 //
819 // Call implementation
820 //
821 IndexType info = laqr4_generic(wantT, wantZ, iLo, iHi, H, wr, wi,
822 iLoZ, iHiZ, Z, work);
823 # ifdef CHECK_CXXLAPACK
824 //
825 // Make copies of results computed by the generic implementation
826 //
827 typename GeMatrix<MH>::NoView H_generic = H;
828 typename DenseVector<VWR>::NoView wr_generic = wr;
829 typename DenseVector<VWI>::NoView wi_generic = wi;
830 typename GeMatrix<MZ>::NoView Z_generic = Z;
831 typename DenseVector<VWORK>::NoView work_generic = work;
832 //
833 // restore output arguments
834 //
835 H = H_org;
836 wr = wr_org;
837 wi = wi_org;
838 Z = Z_org;
839 work = work_org;
840
841 //
842 // Compare results
843 //
844 IndexType _info = laqr4_native(wantT, wantZ, iLo, iHi, H, wr, wi,
845 iLoZ, iHiZ, Z, work);
846
847 bool failed = false;
848 if (! isIdentical(H_generic, H, "H_generic", "H")) {
849 std::cerr << "CXXLAPACK: H_generic = " << H_generic << std::endl;
850 std::cerr << "F77LAPACK: H = " << H << std::endl;
851 failed = true;
852 }
853
854 if (! isIdentical(wr_generic, wr, "wr_generic", "wr")) {
855 std::cerr << "CXXLAPACK: wr_generic = " << wr_generic << std::endl;
856 std::cerr << "F77LAPACK: wr = " << wr << std::endl;
857 failed = true;
858 }
859
860 if (! isIdentical(wi_generic, wi, "wi_generic", "wi")) {
861 std::cerr << "CXXLAPACK: wi_generic = " << wi_generic << std::endl;
862 std::cerr << "F77LAPACK: wi = " << wi << std::endl;
863 failed = true;
864 }
865
866 if (! isIdentical(Z_generic, Z, "Z_generic", "Z")) {
867 std::cerr << "CXXLAPACK: Z_generic = " << Z_generic << std::endl;
868 std::cerr << "F77LAPACK: Z = " << Z << std::endl;
869 failed = true;
870 }
871
872 if (! isIdentical(info, _info, " info", "_info")) {
873 std::cerr << "CXXLAPACK: info = " << info << std::endl;
874 std::cerr << "F77LAPACK: _info = " << _info << std::endl;
875 failed = true;
876 }
877
878 if (! isIdentical(work_generic, work, " work_generic", "work")) {
879 std::cerr << "CXXLAPACK: work_generic = " << work_generic << std::endl;
880 std::cerr << "F77LAPACK: work = " << work << std::endl;
881 failed = true;
882 }
883
884 if (failed) {
885 std::cerr << "error in: laqr4.tcc" << std::endl;
886 ASSERT(0);
887 } else {
888 // std::cerr << "passed: laqr4.tcc" << std::endl;
889 }
890 # endif
891 return info;
892 }
893
894 //-- forwarding ----------------------------------------------------------------
895 template <typename IndexType, typename MH>
896 IndexType
897 laqr4_wsq(bool wantT,
898 bool wantZ,
899 IndexType iLo,
900 IndexType iHi,
901 const MH &&H)
902 {
903 CHECKPOINT_ENTER;
904 const IndexType info = laqr4_wsq(wantT, wantZ, iLo, iHi, H);
905 CHECKPOINT_LEAVE;
906
907 return info;
908 }
909
910 template <typename IndexType, typename MH, typename VWR, typename VWI,
911 typename MZ, typename VWORK>
912 IndexType
913 laqr4(bool wantT,
914 bool wantZ,
915 IndexType iLo,
916 IndexType iHi,
917 MH &&H,
918 VWR &&wr,
919 VWI &&wi,
920 IndexType iLoZ,
921 IndexType iHiZ,
922 MZ &&Z,
923 VWORK &&work)
924 {
925 CHECKPOINT_ENTER;
926 const IndexType info = laqr4(wantT, wantZ, iLo, iHi, H, wr, wi,
927 iLoZ, iHiZ, Z, work);
928 CHECKPOINT_LEAVE;
929
930 return info;
931 }
932
933 } } // namespace lapack, flens
934
935 #endif // FLENS_LAPACK_EIG_LAQR4_TCC
2 * Copyright (c) 2011, Michael Lehn
3 *
4 * All rights reserved.
5 *
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions
8 * are met:
9 *
10 * 1) Redistributions of source code must retain the above copyright
11 * notice, this list of conditions and the following disclaimer.
12 * 2) Redistributions in binary form must reproduce the above copyright
13 * notice, this list of conditions and the following disclaimer in
14 * the documentation and/or other materials provided with the
15 * distribution.
16 * 3) Neither the name of the FLENS development group nor the names of
17 * its contributors may be used to endorse or promote products derived
18 * from this software without specific prior written permission.
19 *
20 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
21 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
22 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
23 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
24 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
25 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
26 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
27 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
28 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
29 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
30 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
31 */
32
33 /* Based on
34 *
35 SUBROUTINE DLAQR4( WANTT, WANTZ, N, ILO, IHI, H, LDH, WR, WI,
36 $ ILOZ, IHIZ, Z, LDZ, WORK, LWORK, INFO )
37 *
38 * -- LAPACK auxiliary routine (version 3.2) --
39 * Univ. of Tennessee, Univ. of California Berkeley,
40 * Univ. of Colorado Denver and NAG Ltd..
41 * November 2006
42 */
43
44 #ifndef FLENS_LAPACK_EIG_LAQR4_TCC
45 #define FLENS_LAPACK_EIG_LAQR4_TCC 1
46
47 #include <flens/blas/blas.h>
48 #include <flens/lapack/lapack.h>
49
50 namespace flens { namespace lapack {
51
52 //== generic lapack implementation =============================================
53
54 template <typename IndexType, typename MH>
55 IndexType
56 laqr4_generic_wsq(bool wantT,
57 bool wantZ,
58 IndexType iLo,
59 IndexType iHi,
60 const GeMatrix<MH> &H)
61 {
62 using std::max;
63 using std::min;
64
65 typedef typename GeMatrix<MH>::ElementType T;
66
67 const IndexType nTiny = 11;
68 const IndexType n = H.numRows();
69
70 if ((n==0) || (n<=nTiny)) {
71 return 1;
72 }
73
74 char job[3];
75 job[0] = (wantT) ? 'S' : 'E';
76 job[1] = (wantZ) ? 'V' : 'N';
77 job[2] = 0;
78 //
79 // ==== NWR = recommended deflation window size. At this
80 // . point, N .GT. NTINY = 11, so there is enough
81 // . subdiagonal workspace for NWR.GE.2 as required.
82 // . (In fact, there is enough subdiagonal space for
83 // . NWR.GE.3.) ====
84 //
85 IndexType nwr = ilaenv<T>(13, "LAQR4", job, n, iLo, iHi, -1);
86 nwr = max(IndexType(2), nwr);
87 nwr = min(min(iHi-iLo+1, (n-1)/3), nwr);
88 //
89 // ==== NSR = recommended number of simultaneous shifts.
90 // . At this point N .GT. NTINY = 11, so there is at
91 // . enough subdiagonal workspace for NSR to be even
92 // . and greater than or equal to two as required. ====
93 //
94 IndexType nsr = ilaenv<T>(15, "LAQR4", job, n, iLo, iHi, -1);
95 nsr = min(min(nsr, (n+6)/9), iHi-iLo);
96 nsr = max(IndexType(2), nsr-(nsr%2));
97 //
98 // ==== Estimate optimal workspace ====
99 //
100 // ==== Workspace query call to DLAQR2 ====
101 //
102 IndexType lWorkOpt = laqr2_wsq(iLo, iHi, nwr+1, H);
103 //
104 // ==== Optimal workspace = MAX(DLAQR5, DLAQR2) ====
105 //
106 return max(3*nsr/2, lWorkOpt);
107 }
108
109 template <typename IndexType, typename MH, typename VWR, typename VWI,
110 typename MZ, typename VWORK>
111 IndexType
112 laqr4_generic(bool wantT,
113 bool wantZ,
114 IndexType iLo,
115 IndexType iHi,
116 GeMatrix<MH> &H,
117 DenseVector<VWR> &wr,
118 DenseVector<VWI> &wi,
119 IndexType iLoZ,
120 IndexType iHiZ,
121 GeMatrix<MZ> &Z,
122 DenseVector<VWORK> &work)
123 {
124 using std::abs;
125 using std::max;
126 using std::min;
127 using std::swap;
128
129 typedef typename GeMatrix<MH>::ElementType T;
130
131 const Underscore<IndexType> _;
132
133 const IndexType n = H.numRows();
134
135 // ==== Matrices of order NTINY or smaller must be processed by
136 // . DLAHQR because of insufficient subdiagonal scratch space.
137 // . (This is a hard limit.) ====
138 const IndexType nTiny = 11;
139
140 // ==== Exceptional deflation windows: try to cure rare
141 // . slow convergence by varying the size of the
142 // . deflation window after KEXNW iterations. ====
143 const IndexType kexNw = 5;
144
145 //
146 // ==== Exceptional shifts: try to cure rare slow convergence
147 // . with ad-hoc exceptional shifts every KEXSH iterations.
148 // . ====
149 const IndexType kexSh = 6;
150
151 //
152 // ==== The constants WILK1 and WILK2 are used to form the
153 // . exceptional shifts. ====
154 const T wilk1 = T(0.75),
155 wilk2 = T(-0.4375);
156
157 const T Zero(0), One(1);
158
159 IndexType info = 0;
160 IndexType lWork, lWorkOpt;
161
162 IndexType nDec = -1;
163
164 //
165 // ==== Perform and apply a workspace query if necessary ====
166 //
167 if (work.length()==0) {
168 lWorkOpt = laqr0_generic_wsq(wantT, wantZ, iLo, iHi, H);
169 work.resize(lWorkOpt);
170 }
171
172 lWork = work.length();
173
174 //
175 // ==== Quick return for N = 0: nothing to do. ====
176 //
177 if (n==0) {
178 work(1) = One;
179 return info;
180 }
181
182 if (n<=nTiny) {
183 //
184 // ==== Tiny matrices must use DLAHQR. ====
185 //
186 lWorkOpt = 1;
187 info = lahqr(wantT, wantZ, iLo, iHi, H, wr, wi, iLoZ, iHiZ, Z);
188 } else {
189 //
190 // ==== Use small bulge multi-shift QR with aggressive early
191 // . deflation on larger-than-tiny matrices. ====
192 //
193 // ==== Hope for the best. ====
194 //
195 info = 0;
196 //
197 // ==== Set up job flags for ILAENV. ====
198 //
199 char job[3];
200 job[0] = (wantT) ? 'S' : 'E';
201 job[1] = (wantZ) ? 'V' : 'N';
202 job[2] = 0;
203 //
204 // ==== NWR = recommended deflation window size. At this
205 // . point, N .GT. NTINY = 11, so there is enough
206 // . subdiagonal workspace for NWR.GE.2 as required.
207 // . (In fact, there is enough subdiagonal space for
208 // . NWR.GE.3.) ====
209 //
210 IndexType nwr = ilaenv<T>(13, "LAQR0", job, n, iLo, iHi, lWork);
211 nwr = max(IndexType(2), nwr);
212 nwr = min(min(iHi-iLo+1, (n-1)/3), nwr);
213 //
214 // ==== NSR = recommended number of simultaneous shifts.
215 // . At this point N .GT. NTINY = 11, so there is at
216 // . enough subdiagonal workspace for NSR to be even
217 // . and greater than or equal to two as required. ====
218 //
219 IndexType nsr = ilaenv<T>(15, "LAQR0", job, n, iLo, iHi, lWork);
220 nsr = min(min(nsr, (n+6)/9), iHi-iLo);
221 nsr = max(IndexType(2), nsr - (nsr%2));
222 //
223 // ==== Estimate optimal workspace ====
224 //
225 // ==== Workspace query call to DLAQR2 ====
226 //
227 lWorkOpt = laqr2_wsq(iLo, iHi, nwr+1, H);
228 //
229 // ==== Optimal workspace = MAX(DLAQR5, DLAQR2) ====
230 //
231 lWorkOpt = max(3*nsr/2, lWorkOpt);
232 //
233 // ==== DLAHQR/DLAQR0 crossover point ====
234 //
235 IndexType nMin = ilaenv<T>(12, "LAQR0", job, n, iLo, iHi, lWork);
236 nMin = max(nTiny, nMin);
237 //
238 // ==== Nibble crossover point ====
239 //
240 IndexType nibble = ilaenv<T>(14, "LAQR0", job, n, iLo, iHi, lWork);
241 nibble = max(IndexType(0), nibble);
242 //
243 // ==== Accumulate reflections during ttswp? Use block
244 // . 2-by-2 structure during matrix-matrix multiply? ====
245 //
246 IndexType kacc22 = ilaenv<T>(16, "LAQR0", job, n, iLo, iHi, lWork);
247 kacc22 = max(IndexType(0), kacc22);
248 kacc22 = min(IndexType(2), kacc22);
249 //
250 // ==== NWMAX = the largest possible deflation window for
251 // . which there is sufficient workspace. ====
252 //
253 IndexType nwMax = min((n-1)/3, lWork/2);
254 IndexType nw = nwMax;
255 //
256 // ==== NSMAX = the Largest number of simultaneous shifts
257 // . for which there is sufficient workspace. ====
258 //
259 IndexType nsMax = min((n+6 )/9, 2*lWork/3);
260 nsMax = nsMax - (nsMax%2);
261 //
262 // ==== NDFL: an iteration count restarted at deflation. ====
263 //
264 IndexType nDfl = 1;
265 //
266 // ==== ITMAX = iteration limit ====
267 //
268 IndexType itMax = max(IndexType(30), 2*kexSh)
269 * max(IndexType(10), (iHi-iLo+1));
270 //
271 // ==== Last row and column in the active block ====
272 //
273 IndexType kBot = iHi;
274 //
275 // ==== Main Loop ====
276 //
277 IndexType it;
278 for (it=1; it<=itMax; ++it) {
279 //
280 // ==== Done when KBOT falls below ILO ====
281 //
282 if (kBot<iLo) {
283 break;
284 }
285 //
286 // ==== Locate active block ====
287 //
288 IndexType k;
289 for (k=kBot; k>=iLo+1; --k) {
290 if (H(k,k-1)==Zero) {
291 break;
292 }
293 }
294 ASSERT(k==iLo || H(k,k-1)==Zero);
295 const IndexType kTop = k;
296 //
297 // ==== Select deflation window size:
298 // . Typical Case:
299 // . If possible and advisable, nibble the entire
300 // . active block. If not, use size MIN(NWR,NWMAX)
301 // . or MIN(NWR+1,NWMAX) depending upon which has
302 // . the smaller corresponding subdiagonal entry
303 // . (a heuristic).
304 // .
305 // . Exceptional Case:
306 // . If there have been no deflations in KEXNW or
307 // . more iterations, then vary the deflation window
308 // . size. At first, because, larger windows are,
309 // . in general, more powerful than smaller ones,
310 // . rapidly increase the window to the maximum possible.
311 // . Then, gradually reduce the window size. ====
312 //
313 IndexType nh = kBot - kTop + 1;
314 IndexType nwUpBd = min(nh, nwMax);
315 if (nDfl<kexNw) {
316 nw = min(nwUpBd, nwr);
317 } else {
318 nw = min(nwUpBd, 2*nw);
319 }
320 if (nw<nwMax) {
321 if (nw>=nh-1) {
322 nw = nh;
323 } else {
324 const IndexType kwTop = kBot - nw + 1;
325 if (abs(H(kwTop,kwTop-1))>abs(H(kwTop-1, kwTop-2))) {
326 ++nw;
327 }
328 }
329 }
330 if (nDfl<kexNw) {
331 nDec = -1;
332 } else if (nDec>=0 || nw>=nwUpBd) {
333 ++nDec;
334 if (nw-nDec<2) {
335 nDec = 0;
336 }
337 nw -= nDec;
338 }
339 //
340 // ==== Aggressive early deflation:
341 // . split workspace under the subdiagonal into
342 // . - an nw-by-nw work array V in the lower
343 // . left-hand-corner,
344 // . - an NW-by-at-least-NW-but-more-is-better
345 // . (NW-by-NHO) horizontal work array along
346 // . the bottom edge,
347 // . - an at-least-NW-but-more-is-better (NHV-by-NW)
348 // . vertical work array along the left-hand-edge.
349 // . ====
350 //
351 auto _V = H(_(n-nw+1, n), _( 1, nw));
352 auto _T = H(_(n-nw+1, n), _(nw+1, n-nw-1));
353 auto _WV = H(_( nw+2, n-nw), _( 1, nw));
354 //
355 // ==== Aggressive early deflation ====
356 //
357 IndexType ls, ld;
358
359 laqr2(wantT, wantZ, kTop, kBot, nw, H, iLoZ, iHiZ, Z,
360 ls, ld, wr(_(1,kBot)), wi(_(1,kBot)),
361 _V, _T, _WV, work);
362 //
363 // ==== Adjust KBOT accounting for new deflations. ====
364 //
365 kBot -= ld;
366 //
367 // ==== KS points to the shifts. ====
368 //
369 IndexType ks = kBot - ls + 1;
370 //
371 // ==== Skip an expensive QR sweep if there is a (partly
372 // . heuristic) reason to expect that many eigenvalues
373 // . will deflate without it. Here, the QR sweep is
374 // . skipped if many eigenvalues have just been deflated
375 // . or if the remaining active block is small.
376 //
377 if ((ld==0)
378 || ((100*ld<=nw*nibble) && (kBot-kTop+1>min(nMin, nwMax))))
379 {
380 //
381 // ==== NS = nominal number of simultaneous shifts.
382 // . This may be lowered (slightly) if DLAQR2
383 // . did not provide that many shifts. ====
384 //
385 IndexType ns = min(min(nsMax, nsr),
386 max(IndexType(2), kBot-kTop));
387 ns = ns - (ns % 2);
388 //
389 // ==== If there have been no deflations
390 // . in a multiple of KEXSH iterations,
391 // . then try exceptional shifts.
392 // . Otherwise use shifts provided by
393 // . DLAQR2 above or from the eigenvalues
394 // . of a trailing principal submatrix. ====
395 //
396 if (nDfl%kexSh==0) {
397 ks = kBot - ns + 1;
398 for (IndexType i=kBot; i>=max(ks+1,kTop+2); i-=2) {
399 const T ss = abs(H(i,i-1)) + abs(H(i-1,i-2));
400 T aa = wilk1*ss + H(i,i);
401 T bb = ss;
402 T cc = wilk2*ss;
403 T dd = aa;
404 T cs, sn;
405 lanv2(aa, bb, cc, dd,
406 wr(i-1), wi(i-1), wr(i), wi(i),
407 cs, sn);
408 }
409 if (ks==kTop) {
410 wr(ks+1) = H(ks+1, ks+1);
411 wi(ks+1) = Zero;
412 wr(ks) = wr(ks+1);
413 wi(ks) = wi(ks+1);
414 }
415 } else {
416 //
417 // ==== Got NS/2 or fewer shifts? Use DLAQR4 or
418 // . DLAHQR on a trailing principal submatrix to
419 // . get more. (Since NS.LE.NSMAX.LE.(N+6)/9,
420 // . there is enough space below the subdiagonal
421 // . to fit an NS-by-NS scratch array.) ====
422 //
423 if (kBot-ks+1<=ns/2) {
424 ks = kBot - ns +1;
425 H(_(ks,kBot),_(1,ns)) = H(_(ks,kBot),_(ks,kBot));
426 if (ns>nMin) {
427 // TODO: avoid the need for ZDummy
428 typename GeMatrix<MZ>::NoView ZDummy;
429 ks += laqr4(false, false,
430 IndexType(1), ns,
431 H(_(ks,kBot),_(1,ns)),
432 wr(_(ks,kBot)), wi(_(ks,kBot)),
433 IndexType(1), IndexType(1),
434 ZDummy, work);
435 } else {
436 // TODO: avoid the need for ZDummy
437 typename GeMatrix<MZ>::NoView ZDummy;
438 ks += lahqr(false, false,
439 IndexType(1), ns,
440 H(_(ks,kBot),_(1,ns)),
441 wr(_(ks,kBot)), wi(_(ks,kBot)),
442 IndexType(1), IndexType(1),
443 ZDummy);
444 }
445 //
446 // ==== In case of a rare QR failure use
447 // . eigenvalues of the trailing 2-by-2
448 // . principal submatrix. ====
449 //
450 if (ks>=kBot) {
451 T aa = H(kBot-1,kBot-1);
452 T cc = H(kBot, kBot-1);
453 T bb = H(kBot-1,kBot);
454 T dd = H(kBot, kBot);
455 T cs, sn;
456 lanv2(aa, bb, cc, dd,
457 wr(kBot-1), wi(kBot-1), wr(kBot), wi(kBot),
458 cs, sn);
459 ks = kBot - 1;
460 }
461 }
462
463 if (kBot-ks+1>ns) {
464 //
465 // ==== Sort the shifts (Helps a little)
466 // . Bubble sort keeps complex conjugate
467 // . pairs together. ====
468 //
469 bool sorted = false;
470 for (IndexType k=kBot; k>=ks+1; --k) {
471 if (sorted) {
472 break;
473 }
474 sorted = true;
475 for (IndexType i=ks; i<=k-1; ++i) {
476 if (abs(wr(i))+abs(wi(i))
477 < abs(wr(i+1))+abs(wi(i+1)))
478 {
479 sorted = false;
480 swap(wr(i), wr(i+1));
481 swap(wi(i), wi(i+1));
482 }
483 }
484 }
485 }
486 //
487 // ==== Shuffle shifts into pairs of real shifts
488 // . and pairs of complex conjugate shifts
489 // . assuming complex conjugate shifts are
490 // . already adjacent to one another. (Yes,
491 // . they are.) ====
492 //
493 for (IndexType i=kBot; i>=ks+2; i-=2) {
494 if (wi(i)!=-wi(i-1)) {
495 T tmp = wr(i);
496 wr(i) = wr(i-1);
497 wr(i-1) = wr(i-2);
498 wr(i-2) = tmp;
499
500 tmp = wi(i);
501 wi(i) = wi(i-1);
502 wi(i-1) = wi(i-2);
503 wi(i-2) = tmp;
504 }
505 }
506 }
507 //
508 // ==== If there are only two shifts and both are
509 // . real, then use only one. ====
510 //
511 if (kBot-ks+1==2) {
512 if (wi(kBot)==0) {
513 const T _H = H(kBot,kBot);
514 if (abs(wr(kBot)-_H) < abs(wr(kBot-1)-_H)) {
515 wr(kBot-1) = wr(kBot);
516 } else {
517 wr(kBot) = wr(kBot-1);
518 }
519 }
520 }
521 //
522 // ==== Use up to NS of the the smallest magnatiude
523 // . shifts. If there aren't NS shifts available,
524 // . then use them all, possibly dropping one to
525 // . make the number of shifts even. ====
526 //
527 ns = min(ns, kBot-ks+1);
528 ns = ns - (ns%2);
529 ks = kBot - ns + 1;
530 //
531 // ==== Small-bulge multi-shift QR sweep:
532 // . split workspace under the subdiagonal into
533 // . - a KDU-by-KDU work array U in the lower
534 // . left-hand-corner,
535 // . - a KDU-by-at-least-KDU-but-more-is-better
536 // . (KDU-by-NHo) horizontal work array WH along
537 // . the bottom edge,
538 // . - and an at-least-KDU-but-more-is-better-by-KDU
539 // . (NVE-by-KDU) vertical work WV arrow along
540 // . the left-hand-edge. ====
541 //
542 IndexType kdu = 3*ns - 3;
543 IndexType ku = n - kdu + 1;
544 IndexType kwv = kdu + 4;
545 IndexType nho = (n-kdu+1-4) - (kdu+1) + 1;
546
547 // NOTE: _WH.numCols()==_WV.numRows()
548
549 typedef typename GeMatrix<MH>::View GeMatrixView;
550 GeMatrixView _V(IndexType(3), ns/2, work(_(1,3*ns/2)));
551 auto _U = H(_( ku, n), _( 1, kdu));
552 auto _WV = H(_(kwv,n-kdu), _( 1, kdu));
553 auto _WH = H(_( ku, n), _(kdu+1,kdu+nho));
554 //
555 // ==== Small-bulge multi-shift QR sweep ====
556 //
557 laqr5(wantT, wantZ, kacc22, kTop, kBot, ns,
558 wr(_(ks,kBot)), wi(_(ks,kBot)), H,
559 iLoZ, iHiZ, Z,
560 _V, _U, _WV, _WH);
561 }
562 //
563 // ==== Note progress (or the lack of it). ====
564 //
565 if (ld>0) {
566 nDfl = 1;
567 } else {
568 ++nDfl;
569 }
570 //
571 // ==== End of main loop ====
572 //
573 }
574 //
575 // ==== Iteration limit exceeded. Set INFO to show where
576 // . the problem occurred and exit. ====
577 //
578 if (it>itMax) {
579 info = kBot;
580 }
581 }
582
583 work(1) = lWorkOpt;
584 return info;
585 }
586
587 //== interface for native lapack ===============================================
588
589 #ifdef CHECK_CXXLAPACK
590
591 template <typename IndexType, typename MH>
592 IndexType
593 laqr4_native_wsq(bool wantT,
594 bool wantZ,
595 IndexType iLo,
596 IndexType iHi,
597 const GeMatrix<MH> &H)
598 {
599 typedef typename GeMatrix<MH>::ElementType T;
600
601 const LOGICAL WANTT = wantT;
602 const LOGICAL WANTZ = wantZ;
603 const INTEGER N = H.numRows();
604 const INTEGER ILO = iLo;
605 const INTEGER IHI = iHi;
606 const INTEGER LDH = H.leadingDimension();
607 const INTEGER ILOZ = 1;
608 const INTEGER IHIZ = 0;
609 const INTEGER LDZ = 1;
610 T WORK;
611 T DUMMY;
612 const INTEGER LWORK = -1;
613 INTEGER INFO;
614
615 if (IsSame<T,DOUBLE>::value) {
616 LAPACK_IMPL(dlaqr4)(&WANTT,
617 &WANTZ,
618 &N,
619 &ILO,
620 &IHI,
621 &DUMMY,
622 &LDH,
623 &DUMMY,
624 &DUMMY,
625 &ILOZ,
626 &IHIZ,
627 &DUMMY,
628 &LDZ,
629 &WORK,
630 &LWORK,
631 &INFO);
632 } else {
633 ASSERT(0);
634 }
635 return WORK;
636 }
637
638 template <typename IndexType, typename MH, typename VWR, typename VWI,
639 typename MZ, typename VWORK>
640 IndexType
641 laqr4_native(bool wantT,
642 bool wantZ,
643 IndexType iLo,
644 IndexType iHi,
645 GeMatrix<MH> &H,
646 DenseVector<VWR> &wr,
647 DenseVector<VWI> &wi,
648 IndexType iLoZ,
649 IndexType iHiZ,
650 GeMatrix<MZ> &Z,
651 DenseVector<VWORK> &work)
652 {
653 typedef typename GeMatrix<MH>::ElementType T;
654
655 const LOGICAL WANTT = wantT;
656 const LOGICAL WANTZ = wantZ;
657 const INTEGER N = H.numRows();
658 const INTEGER ILO = iLo;
659 const INTEGER IHI = iHi;
660 const INTEGER LDH = H.leadingDimension();
661 const INTEGER ILOZ = iLoZ;
662 const INTEGER IHIZ = iHiZ;
663 const INTEGER LDZ = Z.leadingDimension();
664 const INTEGER LWORK = work.length();
665 INTEGER INFO;
666
667 if (IsSame<T,DOUBLE>::value) {
668 LAPACK_IMPL(dlaqr4)(&WANTT,
669 &WANTZ,
670 &N,
671 &ILO,
672 &IHI,
673 H.data(),
674 &LDH,
675 wr.data(),
676 wi.data(),
677 &ILOZ,
678 &IHIZ,
679 Z.data(),
680 &LDZ,
681 work.data(),
682 &LWORK,
683 &INFO);
684 } else {
685 ASSERT(0);
686 }
687 ASSERT(INFO>=0);
688 return INFO;
689 }
690
691 #endif // CHECK_CXXLAPACK
692
693 //== public interface ==========================================================
694 template <typename IndexType, typename MH>
695 IndexType
696 laqr4_wsq(bool wantT,
697 bool wantZ,
698 IndexType iLo,
699 IndexType iHi,
700 const GeMatrix<MH> &H)
701 {
702 using std::max;
703 //
704 // Test the input parameters
705 //
706 # ifndef NDEBUG
707 ASSERT(H.firstRow()==1);
708 ASSERT(H.firstCol()==1);
709 ASSERT(H.numRows()==H.numCols());
710
711 const IndexType n = H.numRows();
712
713 if (n>0) {
714 ASSERT(1<=iLo);
715 ASSERT(iLo<=iHi);
716 ASSERT(iHi<=n);
717 } else {
718 ASSERT(iLo==1);
719 ASSERT(iHi==0);
720 }
721
722 ASSERT(1<=iLo);
723 ASSERT(iLo<=iHi);
724 ASSERT(iHi<=n);
725 # endif
726
727 //
728 // Call implementation
729 //
730 // TODO: call generic implementation
731 IndexType info = laqr4_generic_wsq(wantT, wantZ, iLo, iHi, H);
732
733 # ifdef CHECK_CXXLAPACK
734 //
735 // Compare results
736 //
737 IndexType _info = laqr4_native_wsq(wantT, wantZ, iLo, iHi, H);
738
739 if (info!=_info) {
740 std::cerr << "CXXLAPACK: info = " << info << std::endl;
741 std::cerr << "F77LAPACK: _info = " << _info << std::endl;
742 ASSERT(0);
743 }
744 # endif
745 return info;
746
747 }
748
749 template <typename IndexType, typename MH, typename VWR, typename VWI,
750 typename MZ, typename VWORK>
751 IndexType
752 laqr4(bool wantT,
753 bool wantZ,
754 IndexType iLo,
755 IndexType iHi,
756 GeMatrix<MH> &H,
757 DenseVector<VWR> &wr,
758 DenseVector<VWI> &wi,
759 IndexType iLoZ,
760 IndexType iHiZ,
761 GeMatrix<MZ> &Z,
762 DenseVector<VWORK> &work)
763 {
764 LAPACK_DEBUG_OUT("laqr4");
765
766 using std::max;
767 //
768 // Test the input parameters
769 //
770 # ifndef NDEBUG
771 ASSERT(H.firstRow()==1);
772 ASSERT(H.firstCol()==1);
773 ASSERT(H.numRows()==H.numCols());
774
775 const IndexType n = H.numRows();
776
777 if (n>0) {
778 ASSERT(1<=iLo);
779 ASSERT(iLo<=iHi);
780 ASSERT(iHi<=n);
781 } else {
782 ASSERT(iLo==1);
783 ASSERT(iHi==0);
784 }
785
786 ASSERT(wr.firstIndex()==1);
787 ASSERT(wr.length()>=iHi);
788
789 ASSERT(wi.firstIndex()==1);
790 ASSERT(wi.length()>=iHi);
791
792 if (wantZ) {
793 ASSERT(1<=iLoZ);
794 ASSERT(iLoZ<=iLo);
795 ASSERT(iHi<=iHiZ);
796 ASSERT(iHiZ<=n);
797
798 ASSERT(Z.firstRow()==1);
799 ASSERT(Z.firstCol()==1);
800 ASSERT(Z.numRows()>=iHi);
801 ASSERT(Z.numCols()>=iHi);
802 }
803
804 ASSERT((work.length()==0) || (work.length()>=n));
805 # endif
806
807 //
808 // Make copies of output arguments
809 //
810 # ifdef CHECK_CXXLAPACK
811 typename GeMatrix<MH>::NoView H_org = H;
812 typename DenseVector<VWR>::NoView wr_org = wr;
813 typename DenseVector<VWI>::NoView wi_org = wi;
814 typename GeMatrix<MZ>::NoView Z_org = Z;
815 typename DenseVector<VWORK>::NoView work_org = work;
816 # endif
817
818 //
819 // Call implementation
820 //
821 IndexType info = laqr4_generic(wantT, wantZ, iLo, iHi, H, wr, wi,
822 iLoZ, iHiZ, Z, work);
823 # ifdef CHECK_CXXLAPACK
824 //
825 // Make copies of results computed by the generic implementation
826 //
827 typename GeMatrix<MH>::NoView H_generic = H;
828 typename DenseVector<VWR>::NoView wr_generic = wr;
829 typename DenseVector<VWI>::NoView wi_generic = wi;
830 typename GeMatrix<MZ>::NoView Z_generic = Z;
831 typename DenseVector<VWORK>::NoView work_generic = work;
832 //
833 // restore output arguments
834 //
835 H = H_org;
836 wr = wr_org;
837 wi = wi_org;
838 Z = Z_org;
839 work = work_org;
840
841 //
842 // Compare results
843 //
844 IndexType _info = laqr4_native(wantT, wantZ, iLo, iHi, H, wr, wi,
845 iLoZ, iHiZ, Z, work);
846
847 bool failed = false;
848 if (! isIdentical(H_generic, H, "H_generic", "H")) {
849 std::cerr << "CXXLAPACK: H_generic = " << H_generic << std::endl;
850 std::cerr << "F77LAPACK: H = " << H << std::endl;
851 failed = true;
852 }
853
854 if (! isIdentical(wr_generic, wr, "wr_generic", "wr")) {
855 std::cerr << "CXXLAPACK: wr_generic = " << wr_generic << std::endl;
856 std::cerr << "F77LAPACK: wr = " << wr << std::endl;
857 failed = true;
858 }
859
860 if (! isIdentical(wi_generic, wi, "wi_generic", "wi")) {
861 std::cerr << "CXXLAPACK: wi_generic = " << wi_generic << std::endl;
862 std::cerr << "F77LAPACK: wi = " << wi << std::endl;
863 failed = true;
864 }
865
866 if (! isIdentical(Z_generic, Z, "Z_generic", "Z")) {
867 std::cerr << "CXXLAPACK: Z_generic = " << Z_generic << std::endl;
868 std::cerr << "F77LAPACK: Z = " << Z << std::endl;
869 failed = true;
870 }
871
872 if (! isIdentical(info, _info, " info", "_info")) {
873 std::cerr << "CXXLAPACK: info = " << info << std::endl;
874 std::cerr << "F77LAPACK: _info = " << _info << std::endl;
875 failed = true;
876 }
877
878 if (! isIdentical(work_generic, work, " work_generic", "work")) {
879 std::cerr << "CXXLAPACK: work_generic = " << work_generic << std::endl;
880 std::cerr << "F77LAPACK: work = " << work << std::endl;
881 failed = true;
882 }
883
884 if (failed) {
885 std::cerr << "error in: laqr4.tcc" << std::endl;
886 ASSERT(0);
887 } else {
888 // std::cerr << "passed: laqr4.tcc" << std::endl;
889 }
890 # endif
891 return info;
892 }
893
894 //-- forwarding ----------------------------------------------------------------
895 template <typename IndexType, typename MH>
896 IndexType
897 laqr4_wsq(bool wantT,
898 bool wantZ,
899 IndexType iLo,
900 IndexType iHi,
901 const MH &&H)
902 {
903 CHECKPOINT_ENTER;
904 const IndexType info = laqr4_wsq(wantT, wantZ, iLo, iHi, H);
905 CHECKPOINT_LEAVE;
906
907 return info;
908 }
909
910 template <typename IndexType, typename MH, typename VWR, typename VWI,
911 typename MZ, typename VWORK>
912 IndexType
913 laqr4(bool wantT,
914 bool wantZ,
915 IndexType iLo,
916 IndexType iHi,
917 MH &&H,
918 VWR &&wr,
919 VWI &&wi,
920 IndexType iLoZ,
921 IndexType iHiZ,
922 MZ &&Z,
923 VWORK &&work)
924 {
925 CHECKPOINT_ENTER;
926 const IndexType info = laqr4(wantT, wantZ, iLo, iHi, H, wr, wi,
927 iLoZ, iHiZ, Z, work);
928 CHECKPOINT_LEAVE;
929
930 return info;
931 }
932
933 } } // namespace lapack, flens
934
935 #endif // FLENS_LAPACK_EIG_LAQR4_TCC