1 /*
  2  *   Copyright (c) 2011, Michael Lehn
  3  *
  4  *   All rights reserved.
  5  *
  6  *   Redistribution and use in source and binary forms, with or without
  7  *   modification, are permitted provided that the following conditions
  8  *   are met:
  9  *
 10  *   1) Redistributions of source code must retain the above copyright
 11  *      notice, this list of conditions and the following disclaimer.
 12  *   2) Redistributions in binary form must reproduce the above copyright
 13  *      notice, this list of conditions and the following disclaimer in
 14  *      the documentation and/or other materials provided with the
 15  *      distribution.
 16  *   3) Neither the name of the FLENS development group nor the names of
 17  *      its contributors may be used to endorse or promote products derived
 18  *      from this software without specific prior written permission.
 19  *
 20  *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 21  *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 22  *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 23  *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 24  *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 25  *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 26  *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 27  *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 28  *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 29  *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 30  *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 31  */
 32 
 33 /* Based on
 34  *
 35        SUBROUTINE DLAQR4( WANTT, WANTZ, N, ILO, IHI, H, LDH, WR, WI,
 36       $                   ILOZ, IHIZ, Z, LDZ, WORK, LWORK, INFO )
 37  *
 38  *  -- LAPACK auxiliary routine (version 3.2) --
 39  *     Univ. of Tennessee, Univ. of California Berkeley,
 40  *     Univ. of Colorado Denver and NAG Ltd..
 41  *     November 2006
 42  */
 43 
 44 #ifndef FLENS_LAPACK_EIG_LAQR4_TCC
 45 #define FLENS_LAPACK_EIG_LAQR4_TCC 1
 46 
 47 #include <flens/blas/blas.h>
 48 #include <flens/lapack/lapack.h>
 49 
 50 namespace flens { namespace lapack {
 51 
 52 //== generic lapack implementation =============================================
 53 
 54 template <typename IndexType, typename MH>
 55 IndexType
 56 laqr4_generic_wsq(bool                  wantT,
 57                   bool                  wantZ,
 58                   IndexType             iLo,
 59                   IndexType             iHi,
 60                   const GeMatrix<MH>    &H)
 61 {
 62     using std::max;
 63     using std::min;
 64 
 65     typedef typename GeMatrix<MH>::ElementType  T;
 66 
 67     const IndexType nTiny   = 11;
 68     const IndexType n       = H.numRows();
 69 
 70     if ((n==0) || (n<=nTiny)) {
 71         return 1;
 72     }
 73 
 74     char job[3];
 75     job[0] = (wantT) ? 'S' : 'E';
 76     job[1] = (wantZ) ? 'V' : 'N';
 77     job[2] = 0;
 78 //  
 79 //  ==== NWR = recommended deflation window size.  At this
 80 //  .    point,  N .GT. NTINY = 11, so there is enough
 81 //  .    subdiagonal workspace for NWR.GE.2 as required.
 82 //  .    (In fact, there is enough subdiagonal space for
 83 //  .    NWR.GE.3.) ====
 84 //  
 85     IndexType nwr = ilaenv<T>(13"LAQR4", job, n, iLo, iHi, -1);
 86     nwr = max(IndexType(2), nwr);
 87     nwr = min(min(iHi-iLo+1, (n-1)/3), nwr);
 88 //
 89 //  ==== NSR = recommended number of simultaneous shifts.
 90 //  .    At this point N .GT. NTINY = 11, so there is at
 91 //  .    enough subdiagonal workspace for NSR to be even
 92 //  .    and greater than or equal to two as required. ====
 93 //
 94     IndexType nsr = ilaenv<T>(15"LAQR4", job, n, iLo, iHi, -1);
 95     nsr = min(min(nsr, (n+6)/9), iHi-iLo);
 96     nsr = max(IndexType(2), nsr-(nsr%2));
 97 //
 98 //  ==== Estimate optimal workspace ====
 99 //
100 //  ==== Workspace query call to DLAQR2 ====
101 //
102     IndexType lWorkOpt = laqr2_wsq(iLo, iHi, nwr+1, H);
103 //
104 //  ==== Optimal workspace = MAX(DLAQR5, DLAQR2) ====
105 //
106     return max(3*nsr/2, lWorkOpt);
107 }
108 
109 template <typename IndexType, typename MH, typename VWR, typename VWI,
110           typename MZ, typename VWORK>
111 IndexType
112 laqr4_generic(bool                  wantT,
113               bool                  wantZ,
114               IndexType             iLo,
115               IndexType             iHi,
116               GeMatrix<MH>          &H,
117               DenseVector<VWR>      &wr,
118               DenseVector<VWI>      &wi,
119               IndexType             iLoZ,
120               IndexType             iHiZ,
121               GeMatrix<MZ>          &Z,
122               DenseVector<VWORK>    &work)
123 {
124     using std::abs;
125     using std::max;
126     using std::min;
127     using std::swap;
128 
129     typedef typename GeMatrix<MH>::ElementType  T;
130 
131     const Underscore<IndexType>     _;
132 
133     const IndexType n       = H.numRows();
134 
135 //  ==== Matrices of order NTINY or smaller must be processed by
136 //  .    DLAHQR because of insufficient subdiagonal scratch space.
137 //  .    (This is a hard limit.) ====
138     const IndexType nTiny   = 11;
139 
140 //  ==== Exceptional deflation windows:  try to cure rare
141 //  .    slow convergence by varying the size of the
142 //  .    deflation window after KEXNW iterations. ====
143     const IndexType kexNw   = 5;
144 
145 //
146 //  ==== Exceptional shifts: try to cure rare slow convergence
147 //  .    with ad-hoc exceptional shifts every KEXSH iterations.
148 //  .    ====
149     const IndexType kexSh   = 6;
150 
151 //
152 //  ==== The constants WILK1 and WILK2 are used to form the
153 //  .    exceptional shifts. ====
154     const T         wilk1 = T(0.75),
155                     wilk2 = T(-0.4375);
156 
157     const T         Zero(0), One(1);
158 
159     IndexType info          = 0;
160     IndexType lWork, lWorkOpt;
161 
162     IndexType nDec          = -1;
163 
164 //
165 //  ==== Perform and apply a workspace query if necessary ====
166 //
167     if (work.length()==0) {
168         lWorkOpt = laqr0_generic_wsq(wantT, wantZ, iLo, iHi, H);
169         work.resize(lWorkOpt);
170     }
171 
172     lWork = work.length();
173 
174 //
175 //  ==== Quick return for N = 0: nothing to do. ====
176 //
177     if (n==0) {
178         work(1) = One;
179         return info;
180     }
181 
182     if (n<=nTiny) {
183 //
184 //      ==== Tiny matrices must use DLAHQR. ====
185 //
186         lWorkOpt = 1;
187         info = lahqr(wantT, wantZ, iLo, iHi, H, wr, wi, iLoZ, iHiZ, Z);
188     } else {
189 //
190 //      ==== Use small bulge multi-shift QR with aggressive early
191 //      .    deflation on larger-than-tiny matrices. ====
192 //
193 //      ==== Hope for the best. ====
194 //
195         info = 0;
196 //
197 //      ==== Set up job flags for ILAENV. ====
198 //
199         char job[3];
200         job[0] = (wantT) ? 'S' : 'E';
201         job[1] = (wantZ) ? 'V' : 'N';
202         job[2] = 0;
203 //
204 //      ==== NWR = recommended deflation window size.  At this
205 //      .    point,  N .GT. NTINY = 11, so there is enough
206 //      .    subdiagonal workspace for NWR.GE.2 as required.
207 //      .    (In fact, there is enough subdiagonal space for
208 //      .    NWR.GE.3.) ====
209 //
210         IndexType nwr = ilaenv<T>(13"LAQR0", job, n, iLo, iHi, lWork);
211         nwr = max(IndexType(2), nwr);
212         nwr = min(min(iHi-iLo+1, (n-1)/3), nwr);
213 //
214 //      ==== NSR = recommended number of simultaneous shifts.
215 //      .    At this point N .GT. NTINY = 11, so there is at
216 //      .    enough subdiagonal workspace for NSR to be even
217 //      .    and greater than or equal to two as required. ====
218 //
219         IndexType nsr = ilaenv<T>(15"LAQR0", job, n, iLo, iHi, lWork);
220         nsr = min(min(nsr, (n+6)/9), iHi-iLo);
221         nsr = max(IndexType(2), nsr - (nsr%2));
222 //
223 //      ==== Estimate optimal workspace ====
224 //
225 //      ==== Workspace query call to DLAQR2 ====
226 //
227         lWorkOpt = laqr2_wsq(iLo, iHi, nwr+1, H);
228 //
229 //      ==== Optimal workspace = MAX(DLAQR5, DLAQR2) ====
230 //
231         lWorkOpt = max(3*nsr/2, lWorkOpt);
232 //
233 //      ==== DLAHQR/DLAQR0 crossover point ====
234 //
235         IndexType nMin = ilaenv<T>(12"LAQR0", job, n, iLo, iHi, lWork);
236         nMin = max(nTiny, nMin);
237 //
238 //      ==== Nibble crossover point ====
239 //
240         IndexType nibble = ilaenv<T>(14"LAQR0", job, n, iLo, iHi, lWork);
241         nibble = max(IndexType(0), nibble);
242 //
243 //      ==== Accumulate reflections during ttswp?  Use block
244 //      .    2-by-2 structure during matrix-matrix multiply? ====
245 //
246         IndexType kacc22 = ilaenv<T>(16"LAQR0", job, n, iLo, iHi, lWork);
247         kacc22 = max(IndexType(0), kacc22);
248         kacc22 = min(IndexType(2), kacc22);
249 //
250 //      ==== NWMAX = the largest possible deflation window for
251 //      .    which there is sufficient workspace. ====
252 //
253         IndexType nwMax = min((n-1)/3, lWork/2);
254         IndexType nw = nwMax;
255 //
256 //      ==== NSMAX = the Largest number of simultaneous shifts
257 //      .    for which there is sufficient workspace. ====
258 //
259         IndexType nsMax = min((n+6 )/92*lWork/3);
260         nsMax = nsMax - (nsMax%2);
261 //
262 //      ==== NDFL: an iteration count restarted at deflation. ====
263 //
264         IndexType nDfl = 1;
265 //
266 //      ==== ITMAX = iteration limit ====
267 //
268         IndexType itMax = max(IndexType(30), 2*kexSh)
269                         * max(IndexType(10), (iHi-iLo+1));
270 //
271 //      ==== Last row and column in the active block ====
272 //
273         IndexType kBot = iHi;
274 //
275 //      ==== Main Loop ====
276 //
277         IndexType it;
278         for (it=1; it<=itMax; ++it) {
279 //
280 //          ==== Done when KBOT falls below ILO ====
281 //
282             if (kBot<iLo) {
283                 break;
284             }
285 //
286 //          ==== Locate active block ====
287 //
288             IndexType k;
289             for (k=kBot; k>=iLo+1; --k) {
290                 if (H(k,k-1)==Zero) {
291                     break;
292                 }
293             }
294             ASSERT(k==iLo || H(k,k-1)==Zero);
295             const IndexType kTop = k;
296 // 
297 //          ==== Select deflation window size:
298 //          .    Typical Case:
299 //          .      If possible and advisable, nibble the entire
300 //          .      active block.  If not, use size MIN(NWR,NWMAX)
301 //          .      or MIN(NWR+1,NWMAX) depending upon which has
302 //          .      the smaller corresponding subdiagonal entry
303 //          .      (a heuristic).
304 //          .
305 //          .    Exceptional Case:
306 //          .      If there have been no deflations in KEXNW or
307 //          .      more iterations, then vary the deflation window
308 //          .      size.   At first, because, larger windows are,
309 //          .      in general, more powerful than smaller ones,
310 //          .      rapidly increase the window to the maximum possible.
311 //          .      Then, gradually reduce the window size. ====
312 // 
313             IndexType nh = kBot - kTop + 1;
314             IndexType nwUpBd = min(nh, nwMax);
315             if (nDfl<kexNw) {
316                 nw = min(nwUpBd, nwr);
317             } else {
318                 nw = min(nwUpBd, 2*nw);
319             }
320             if (nw<nwMax) {
321                 if (nw>=nh-1) {
322                     nw = nh;
323                 } else {
324                     const IndexType kwTop = kBot - nw + 1;
325                     if (abs(H(kwTop,kwTop-1))>abs(H(kwTop-1, kwTop-2))) {
326                         ++nw;
327                     }
328                 }
329             }
330             if (nDfl<kexNw) {
331                 nDec = -1;
332             } else if (nDec>=0 || nw>=nwUpBd) {
333                 ++nDec;
334                 if (nw-nDec<2) {
335                     nDec = 0;
336                 }
337                 nw -= nDec;
338             }
339 // 
340 //          ==== Aggressive early deflation:
341 //          .    split workspace under the subdiagonal into
342 //          .      - an nw-by-nw work array V in the lower
343 //          .        left-hand-corner,
344 //          .      - an NW-by-at-least-NW-but-more-is-better
345 //          .        (NW-by-NHO) horizontal work array along
346 //          .        the bottom edge,
347 //          .      - an at-least-NW-but-more-is-better (NHV-by-NW)
348 //          .        vertical work array along the left-hand-edge.
349 //          .        ====
350 // 
351             auto _V     = H(_(n-nw+1,    n), _(   1,     nw));
352             auto _T     = H(_(n-nw+1,    n), _(nw+1, n-nw-1));
353             auto _WV    = H(_(  nw+2, n-nw), _(   1,     nw));
354 //
355 //          ==== Aggressive early deflation ====
356 //
357             IndexType   ls, ld;
358 
359             laqr2(wantT, wantZ, kTop, kBot, nw, H, iLoZ, iHiZ, Z,
360                   ls, ld, wr(_(1,kBot)), wi(_(1,kBot)),
361                   _V, _T, _WV, work);
362 //
363 //          ==== Adjust KBOT accounting for new deflations. ====
364 //
365             kBot -= ld;
366 //
367 //          ==== KS points to the shifts. ====
368 //
369             IndexType ks = kBot - ls + 1;
370 //
371 //          ==== Skip an expensive QR sweep if there is a (partly
372 //          .    heuristic) reason to expect that many eigenvalues
373 //          .    will deflate without it.  Here, the QR sweep is
374 //          .    skipped if many eigenvalues have just been deflated
375 //          .    or if the remaining active block is small.
376 //
377             if ((ld==0)
378              || ((100*ld<=nw*nibble) && (kBot-kTop+1>min(nMin, nwMax))))
379             {
380 //
381 //              ==== NS = nominal number of simultaneous shifts.
382 //              .    This may be lowered (slightly) if DLAQR2
383 //              .    did not provide that many shifts. ====
384 //
385                 IndexType ns = min(min(nsMax, nsr),
386                                    max(IndexType(2), kBot-kTop));
387                 ns = ns - (ns % 2);
388 //
389 //              ==== If there have been no deflations
390 //              .    in a multiple of KEXSH iterations,
391 //              .    then try exceptional shifts.
392 //              .    Otherwise use shifts provided by
393 //              .    DLAQR2 above or from the eigenvalues
394 //              .    of a trailing principal submatrix. ====
395 //
396                 if (nDfl%kexSh==0) {
397                     ks = kBot - ns + 1;
398                     for (IndexType i=kBot; i>=max(ks+1,kTop+2); i-=2) {
399                         const T ss = abs(H(i,i-1)) + abs(H(i-1,i-2));
400                         T aa = wilk1*ss + H(i,i);
401                         T bb = ss;
402                         T cc = wilk2*ss;
403                         T dd = aa;
404                         T cs, sn;
405                         lanv2(aa, bb, cc, dd,
406                               wr(i-1), wi(i-1), wr(i), wi(i),
407                               cs, sn);
408                     }
409                     if (ks==kTop) {
410                         wr(ks+1)    = H(ks+1, ks+1);
411                         wi(ks+1)    = Zero;
412                         wr(ks)      = wr(ks+1);
413                         wi(ks)      = wi(ks+1);
414                     }
415                 } else {
416 //
417 //                  ==== Got NS/2 or fewer shifts? Use DLAQR4 or
418 //                  .    DLAHQR on a trailing principal submatrix to
419 //                  .    get more. (Since NS.LE.NSMAX.LE.(N+6)/9,
420 //                  .    there is enough space below the subdiagonal
421 //                  .    to fit an NS-by-NS scratch array.) ====
422 //
423                     if (kBot-ks+1<=ns/2) {
424                         ks = kBot - ns +1;
425                         H(_(ks,kBot),_(1,ns)) = H(_(ks,kBot),_(ks,kBot));
426                         if (ns>nMin) {
427                             // TODO: avoid the need for ZDummy
428                             typename GeMatrix<MZ>::NoView ZDummy;
429                             ks += laqr4(falsefalse,
430                                         IndexType(1), ns,
431                                         H(_(ks,kBot),_(1,ns)),
432                                         wr(_(ks,kBot)), wi(_(ks,kBot)),
433                                         IndexType(1), IndexType(1),
434                                         ZDummy, work);
435                         } else {
436                             // TODO: avoid the need for ZDummy
437                             typename GeMatrix<MZ>::NoView ZDummy;
438                             ks += lahqr(falsefalse,
439                                         IndexType(1), ns,
440                                         H(_(ks,kBot),_(1,ns)),
441                                         wr(_(ks,kBot)), wi(_(ks,kBot)),
442                                         IndexType(1), IndexType(1),
443                                         ZDummy);
444                         }
445 //
446 //                      ==== In case of a rare QR failure use
447 //                      .    eigenvalues of the trailing 2-by-2
448 //                      .    principal submatrix.  ====
449 //
450                         if (ks>=kBot) {
451                             T aa = H(kBot-1,kBot-1);
452                             T cc = H(kBot,  kBot-1);
453                             T bb = H(kBot-1,kBot);
454                             T dd = H(kBot,  kBot);
455                             T cs, sn;
456                             lanv2(aa, bb, cc, dd,
457                                   wr(kBot-1), wi(kBot-1), wr(kBot), wi(kBot),
458                                   cs, sn);
459                             ks = kBot - 1;
460                         }
461                     }
462 
463                     if (kBot-ks+1>ns) {
464 //
465 //                      ==== Sort the shifts (Helps a little)
466 //                      .    Bubble sort keeps complex conjugate
467 //                      .    pairs together. ====
468 //
469                         bool sorted = false;
470                         for (IndexType k=kBot; k>=ks+1; --k) {
471                             if (sorted) {
472                                 break;
473                             }
474                             sorted = true;
475                             for (IndexType i=ks; i<=k-1; ++i) {
476                                 if (abs(wr(i))+abs(wi(i))
477                                     < abs(wr(i+1))+abs(wi(i+1)))
478                                 {
479                                     sorted = false;
480                                     swap(wr(i), wr(i+1));
481                                     swap(wi(i), wi(i+1));
482                                 }
483                             }
484                         }
485                     }
486 //
487 //                  ==== Shuffle shifts into pairs of real shifts
488 //                  .    and pairs of complex conjugate shifts
489 //                  .    assuming complex conjugate shifts are
490 //                  .    already adjacent to one another. (Yes,
491 //                  .    they are.)  ====
492 //
493                     for (IndexType i=kBot; i>=ks+2; i-=2) {
494                         if (wi(i)!=-wi(i-1)) {
495                             T tmp  = wr(i);
496                             wr(i)   = wr(i-1);
497                             wr(i-1) = wr(i-2);
498                             wr(i-2) = tmp;
499 
500                             tmp     = wi(i);
501                             wi(i)   = wi(i-1);
502                             wi(i-1) = wi(i-2);
503                             wi(i-2) = tmp;
504                         }
505                     }
506                 }
507 //
508 //              ==== If there are only two shifts and both are
509 //              .    real, then use only one.  ====
510 //
511                 if (kBot-ks+1==2) {
512                     if (wi(kBot)==0) {
513                         const T _H = H(kBot,kBot);
514                         if (abs(wr(kBot)-_H) < abs(wr(kBot-1)-_H)) {
515                             wr(kBot-1) = wr(kBot);
516                         } else {
517                             wr(kBot) = wr(kBot-1);
518                         }
519                     }
520                 }
521 //
522 //              ==== Use up to NS of the the smallest magnatiude
523 //              .    shifts.  If there aren't NS shifts available,
524 //              .    then use them all, possibly dropping one to
525 //              .    make the number of shifts even. ====
526 //
527                 ns = min(ns, kBot-ks+1);
528                 ns = ns - (ns%2);
529                 ks = kBot - ns + 1;
530 //
531 //              ==== Small-bulge multi-shift QR sweep:
532 //              .    split workspace under the subdiagonal into
533 //              .    - a KDU-by-KDU work array U in the lower
534 //              .      left-hand-corner,
535 //              .    - a KDU-by-at-least-KDU-but-more-is-better
536 //              .      (KDU-by-NHo) horizontal work array WH along
537 //              .      the bottom edge,
538 //              .    - and an at-least-KDU-but-more-is-better-by-KDU
539 //              .      (NVE-by-KDU) vertical work WV arrow along
540 //              .      the left-hand-edge. ====
541 //
542                 IndexType kdu   = 3*ns - 3;
543                 IndexType ku    = n - kdu + 1;
544                 IndexType kwv   = kdu + 4;
545                 IndexType nho   = (n-kdu+1-4) - (kdu+1) + 1;
546 
547 //              NOTE: _WH.numCols()==_WV.numRows()
548 
549                 typedef typename GeMatrix<MH>::View GeMatrixView;
550                 GeMatrixView _V(IndexType(3), ns/2, work(_(1,3*ns/2)));
551                 auto _U     = H(_( ku,    n), _(    1,    kdu));
552                 auto _WV    = H(_(kwv,n-kdu), _(    1,    kdu));
553                 auto _WH    = H(_( ku,    n), _(kdu+1,kdu+nho));
554 //
555 //              ==== Small-bulge multi-shift QR sweep ====
556 //
557                 laqr5(wantT, wantZ, kacc22, kTop, kBot, ns,
558                       wr(_(ks,kBot)), wi(_(ks,kBot)), H,
559                       iLoZ, iHiZ, Z,
560                       _V, _U, _WV, _WH);
561             }
562 //
563 //          ==== Note progress (or the lack of it). ====
564 //
565             if (ld>0) {
566                 nDfl = 1;
567             } else {
568                 ++nDfl;
569             }
570 //
571 //          ==== End of main loop ====
572 //
573         }
574 //
575 //      ==== Iteration limit exceeded.  Set INFO to show where
576 //      .    the problem occurred and exit. ====
577 //
578         if (it>itMax) {
579             info = kBot;
580         }
581     }
582 
583     work(1) = lWorkOpt;
584     return info;
585 }
586 
587 //== interface for native lapack ===============================================
588 
589 #ifdef CHECK_CXXLAPACK
590 
591 template <typename IndexType, typename MH>
592 IndexType
593 laqr4_native_wsq(bool                  wantT,
594                  bool                  wantZ,
595                  IndexType             iLo,
596                  IndexType             iHi,
597                  const GeMatrix<MH>    &H)
598 {
599     typedef typename GeMatrix<MH>::ElementType  T;
600 
601     const LOGICAL   WANTT   = wantT;
602     const LOGICAL   WANTZ   = wantZ;
603     const INTEGER   N       = H.numRows();
604     const INTEGER   ILO     = iLo;
605     const INTEGER   IHI     = iHi;
606     const INTEGER   LDH     = H.leadingDimension();
607     const INTEGER   ILOZ    = 1;
608     const INTEGER   IHIZ    = 0;
609     const INTEGER   LDZ     = 1;
610     T               WORK;
611     T               DUMMY;
612     const INTEGER   LWORK   = -1;
613     INTEGER         INFO;
614 
615     if (IsSame<T,DOUBLE>::value) {
616         LAPACK_IMPL(dlaqr4)(&WANTT,
617                             &WANTZ,
618                             &N,
619                             &ILO,
620                             &IHI,
621                             &DUMMY,
622                             &LDH,
623                             &DUMMY,
624                             &DUMMY,
625                             &ILOZ,
626                             &IHIZ,
627                             &DUMMY,
628                             &LDZ,
629                             &WORK,
630                             &LWORK,
631                             &INFO);
632     } else {
633         ASSERT(0);
634     }
635     return WORK;
636 }
637 
638 template <typename IndexType, typename MH, typename VWR, typename VWI,
639           typename MZ, typename VWORK>
640 IndexType
641 laqr4_native(bool                  wantT,
642              bool                  wantZ,
643              IndexType             iLo,
644              IndexType             iHi,
645              GeMatrix<MH>          &H,
646              DenseVector<VWR>      &wr,
647              DenseVector<VWI>      &wi,
648              IndexType             iLoZ,
649              IndexType             iHiZ,
650              GeMatrix<MZ>          &Z,
651              DenseVector<VWORK>    &work)
652 {
653     typedef typename GeMatrix<MH>::ElementType  T;
654 
655     const LOGICAL   WANTT   = wantT;
656     const LOGICAL   WANTZ   = wantZ;
657     const INTEGER   N       = H.numRows();
658     const INTEGER   ILO     = iLo;
659     const INTEGER   IHI     = iHi;
660     const INTEGER   LDH     = H.leadingDimension();
661     const INTEGER   ILOZ    = iLoZ;
662     const INTEGER   IHIZ    = iHiZ;
663     const INTEGER   LDZ     = Z.leadingDimension();
664     const INTEGER   LWORK   = work.length();
665     INTEGER         INFO;
666 
667     if (IsSame<T,DOUBLE>::value) {
668         LAPACK_IMPL(dlaqr4)(&WANTT,
669                             &WANTZ,
670                             &N,
671                             &ILO,
672                             &IHI,
673                             H.data(),
674                             &LDH,
675                             wr.data(),
676                             wi.data(),
677                             &ILOZ,
678                             &IHIZ,
679                             Z.data(),
680                             &LDZ,
681                             work.data(),
682                             &LWORK,
683                             &INFO);
684     } else {
685         ASSERT(0);
686     }
687     ASSERT(INFO>=0);
688     return INFO;
689 }
690 
691 #endif // CHECK_CXXLAPACK
692 
693 //== public interface ==========================================================
694 template <typename IndexType, typename MH>
695 IndexType
696 laqr4_wsq(bool                  wantT,
697           bool                  wantZ,
698           IndexType             iLo,
699           IndexType             iHi,
700           const GeMatrix<MH>    &H)
701 {
702     using std::max;
703 //
704 //  Test the input parameters
705 //
706 #   ifndef NDEBUG
707     ASSERT(H.firstRow()==1);
708     ASSERT(H.firstCol()==1);
709     ASSERT(H.numRows()==H.numCols());
710 
711     const IndexType n = H.numRows();
712 
713     if (n>0) {
714         ASSERT(1<=iLo);
715         ASSERT(iLo<=iHi);
716         ASSERT(iHi<=n);
717     } else {
718         ASSERT(iLo==1);
719         ASSERT(iHi==0);
720     }
721 
722     ASSERT(1<=iLo);
723     ASSERT(iLo<=iHi);
724     ASSERT(iHi<=n);
725 #   endif
726 
727 //
728 //  Call implementation
729 //
730     // TODO: call generic implementation
731     IndexType info = laqr4_generic_wsq(wantT, wantZ, iLo, iHi, H);
732 
733 #   ifdef CHECK_CXXLAPACK
734 //
735 //  Compare results
736 //
737     IndexType _info = laqr4_native_wsq(wantT, wantZ, iLo, iHi, H);
738 
739     if (info!=_info) {
740         std::cerr << "CXXLAPACK:  info = " << info << std::endl;
741         std::cerr << "F77LAPACK: _info = " << _info << std::endl;
742         ASSERT(0);
743     }
744 #   endif
745     return info;
746 
747 }
748 
749 template <typename IndexType, typename MH, typename VWR, typename VWI,
750           typename MZ, typename VWORK>
751 IndexType
752 laqr4(bool                  wantT,
753       bool                  wantZ,
754       IndexType             iLo,
755       IndexType             iHi,
756       GeMatrix<MH>          &H,
757       DenseVector<VWR>      &wr,
758       DenseVector<VWI>      &wi,
759       IndexType             iLoZ,
760       IndexType             iHiZ,
761       GeMatrix<MZ>          &Z,
762       DenseVector<VWORK>    &work)
763 {
764     LAPACK_DEBUG_OUT("laqr4");
765 
766     using std::max;
767 //
768 //  Test the input parameters
769 //
770 #   ifndef NDEBUG
771     ASSERT(H.firstRow()==1);
772     ASSERT(H.firstCol()==1);
773     ASSERT(H.numRows()==H.numCols());
774 
775     const IndexType n = H.numRows();
776 
777     if (n>0) {
778         ASSERT(1<=iLo);
779         ASSERT(iLo<=iHi);
780         ASSERT(iHi<=n);
781     } else {
782         ASSERT(iLo==1);
783         ASSERT(iHi==0);
784     }
785 
786     ASSERT(wr.firstIndex()==1);
787     ASSERT(wr.length()>=iHi);
788 
789     ASSERT(wi.firstIndex()==1);
790     ASSERT(wi.length()>=iHi);
791 
792     if (wantZ) {
793         ASSERT(1<=iLoZ);
794         ASSERT(iLoZ<=iLo);
795         ASSERT(iHi<=iHiZ);
796         ASSERT(iHiZ<=n);
797 
798         ASSERT(Z.firstRow()==1);
799         ASSERT(Z.firstCol()==1);
800         ASSERT(Z.numRows()>=iHi);
801         ASSERT(Z.numCols()>=iHi);
802     }
803 
804     ASSERT((work.length()==0) || (work.length()>=n));
805 #   endif
806 
807 //
808 //  Make copies of output arguments
809 //
810 #   ifdef CHECK_CXXLAPACK
811     typename GeMatrix<MH>::NoView          H_org       = H;
812     typename DenseVector<VWR>::NoView      wr_org      = wr;
813     typename DenseVector<VWI>::NoView      wi_org      = wi;
814     typename GeMatrix<MZ>::NoView          Z_org       = Z;
815     typename DenseVector<VWORK>::NoView    work_org    = work;
816 #   endif
817 
818 //
819 //  Call implementation
820 //
821     IndexType info = laqr4_generic(wantT, wantZ, iLo, iHi, H, wr, wi,
822                                   iLoZ, iHiZ, Z, work);
823 #   ifdef CHECK_CXXLAPACK
824 //
825 //  Make copies of results computed by the generic implementation
826 //
827     typename GeMatrix<MH>::NoView          H_generic       = H;
828     typename DenseVector<VWR>::NoView      wr_generic      = wr;
829     typename DenseVector<VWI>::NoView      wi_generic      = wi;
830     typename GeMatrix<MZ>::NoView          Z_generic       = Z;
831     typename DenseVector<VWORK>::NoView    work_generic    = work;
832 //
833 //  restore output arguments
834 //
835     H = H_org;
836     wr = wr_org;
837     wi = wi_org;
838     Z = Z_org;
839     work = work_org;
840 
841 //
842 //  Compare results
843 //
844     IndexType _info = laqr4_native(wantT, wantZ, iLo, iHi, H, wr, wi,
845                                    iLoZ, iHiZ, Z, work);
846 
847     bool failed = false;
848     if (! isIdentical(H_generic, H, "H_generic""H")) {
849         std::cerr << "CXXLAPACK: H_generic = " << H_generic << std::endl;
850         std::cerr << "F77LAPACK: H = " << H << std::endl;
851         failed = true;
852     }
853 
854     if (! isIdentical(wr_generic, wr, "wr_generic""wr")) {
855         std::cerr << "CXXLAPACK: wr_generic = " << wr_generic << std::endl;
856         std::cerr << "F77LAPACK: wr = " << wr << std::endl;
857         failed = true;
858     }
859 
860     if (! isIdentical(wi_generic, wi, "wi_generic""wi")) {
861         std::cerr << "CXXLAPACK: wi_generic = " << wi_generic << std::endl;
862         std::cerr << "F77LAPACK: wi = " << wi << std::endl;
863         failed = true;
864     }
865 
866     if (! isIdentical(Z_generic, Z, "Z_generic""Z")) {
867         std::cerr << "CXXLAPACK: Z_generic = " << Z_generic << std::endl;
868         std::cerr << "F77LAPACK: Z = " << Z << std::endl;
869         failed = true;
870     }
871 
872     if (! isIdentical(info, _info, " info""_info")) {
873         std::cerr << "CXXLAPACK:  info = " << info << std::endl;
874         std::cerr << "F77LAPACK: _info = " << _info << std::endl;
875         failed = true;
876     }
877 
878     if (! isIdentical(work_generic, work, " work_generic""work")) {
879         std::cerr << "CXXLAPACK: work_generic = " << work_generic << std::endl;
880         std::cerr << "F77LAPACK: work = " << work << std::endl;
881         failed = true;
882     }
883 
884     if (failed) {
885         std::cerr << "error in: laqr4.tcc" << std::endl;
886         ASSERT(0);
887     } else {
888 //        std::cerr << "passed: laqr4.tcc" << std::endl;
889     }
890 #   endif
891     return info;
892 }
893 
894 //-- forwarding ----------------------------------------------------------------
895 template <typename IndexType, typename MH>
896 IndexType
897 laqr4_wsq(bool                  wantT,
898           bool                  wantZ,
899           IndexType             iLo,
900           IndexType             iHi,
901           const MH              &&H)
902 {
903     CHECKPOINT_ENTER;
904     const IndexType info = laqr4_wsq(wantT, wantZ, iLo, iHi, H);
905     CHECKPOINT_LEAVE;
906 
907     return info;
908 }
909 
910 template <typename IndexType, typename MH, typename VWR, typename VWI,
911           typename MZ, typename VWORK>
912 IndexType
913 laqr4(bool                  wantT,
914       bool                  wantZ,
915       IndexType             iLo,
916       IndexType             iHi,
917       MH                    &&H,
918       VWR                   &&wr,
919       VWI                   &&wi,
920       IndexType             iLoZ,
921       IndexType             iHiZ,
922       MZ                    &&Z,
923       VWORK                 &&work)
924 {
925     CHECKPOINT_ENTER;
926     const IndexType info = laqr4(wantT, wantZ, iLo, iHi, H, wr, wi,
927                                  iLoZ, iHiZ, Z, work);
928     CHECKPOINT_LEAVE;
929 
930     return info;
931 }
932 
933 } } // namespace lapack, flens
934 
935 #endif // FLENS_LAPACK_EIG_LAQR4_TCC