/*
 * Copyright (c) 2011, Michael Lehn
 *
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1) Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2) Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3) Neither the name of the FLENS development group nor the names of
 *    its contributors may be used to endorse or promote products derived
 *    from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

/* Based on
 *
       SUBROUTINE DLAQR5( WANTT, WANTZ, KACC22, N, KTOP, KBOT, NSHFTS,
      $                   SR, SI, H, LDH, ILOZ, IHIZ, Z, LDZ, V, LDV, U,
      $                   LDU, NV, WV, LDWV, NH, WH, LDWH )
 *
 *  -- LAPACK auxiliary routine (version 3.3.0) --
 *  -- LAPACK is a software package provided by Univ. of Tennessee,    --
 *  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
 *     November 2010
 *
 */
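
//  laqr5 performs a single small-bulge multi-shift QR sweep: chains of
//  nShifts/2 double-shift bulges are introduced at the top of the active
//  block H(kTop:kBot,kTop:kBot) and chased down the subdiagonal; the
//  orthogonal transformations are optionally accumulated in U and Z.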

#ifndef FLENS_LAPACK_EIG_LAQR5_TCC
#define FLENS_LAPACK_EIG_LAQR5_TCC 1

#include <flens/blas/blas.h>
#include <flens/lapack/lapack.h>

namespace flens { namespace lapack {

//== generic lapack implementation =============================================

template <typename IndexType, typename VSR, typename VSI, typename MH,
          typename MZ, typename MV, typename MU, typename MWV, typename MWH>
void
laqr5_generic(bool              wantT,
              bool              wantZ,
              IndexType         kacc22,
              IndexType         kTop,
              IndexType         kBot,
              IndexType         nShifts,
              DenseVector<VSR>  &sr,
              DenseVector<VSI>  &si,
              GeMatrix<MH>      &H,
              IndexType         iLoZ,
              IndexType         iHiZ,
              GeMatrix<MZ>      &Z,
              GeMatrix<MV>      &V,
              GeMatrix<MU>      &U,
              GeMatrix<MWV>     &WV,
              GeMatrix<MWH>     &WH)
{
    using std::abs;
    using std::max;
    using std::min;

    typedef typename GeMatrix<MH>::ElementType  T;

    const T Zero(0), One(1);

    const Underscore<IndexType> _;

    const IndexType n  = H.numRows();
    const IndexType nv = WV.numRows();
    const IndexType nh = WH.numCols();

    typedef typename GeMatrix<MH>::VectorView  VectorView;
    T           vtBuffer[3];
    VectorView  vt = typename VectorView::Engine(3, vtBuffer);

//
//  ==== If there are no shifts, then there is nothing to do. ====
//
    if (nShifts<2) {
        return;
    }
//
//  ==== If the active block is empty or 1-by-1, then there
//  .    is nothing to do. ====
//
    if (kTop>=kBot) {
        return;
    }
//
//  ==== Shuffle shifts into pairs of real shifts and pairs
//  .    of complex conjugate shifts assuming complex
//  .    conjugate shifts are already adjacent to one
//  .    another. ====
//
    for (IndexType i=1; i<=nShifts-2; i+=2) {
        if (si(i)!=-si(i+1)) {
            T tmp   = sr(i);
            sr(i)   = sr(i+1);
            sr(i+1) = sr(i+2);
            sr(i+2) = tmp;

            tmp     = si(i);
            si(i)   = si(i+1);
            si(i+1) = si(i+2);
            si(i+2) = tmp;
        }
    }
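//  (For example, if the shifts arrive ordered as (r1, c, conj(c), r2), then
//  at i=1 the test si(1)!=-si(2) fires and the 3-cycle above produces
//  (c, conj(c), r1, r2): the conjugate pair stays adjacent and the two real
//  shifts end up paired with each other.)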
//
//  ==== NSHFTS is supposed to be even, but if it is odd,
//  .    then simply reduce it by one.  The shuffle above
//  .    ensures that the dropped shift is real and that
//  .    the remaining shifts are paired. ====
//
    const IndexType ns = nShifts - (nShifts % 2);
//
//  ==== Machine constants for deflation ====
//
    T safeMin = lamch<T>(SafeMin);
    T safeMax = One/safeMin;
    labad(safeMin, safeMax);
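//  (labad mirrors LAPACK's DLABAD: on machines with an extreme exponent
//   range it replaces safeMin/safeMax by their square roots; on IEEE
//   machines it leaves both values unchanged.)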

    const T ulp      = lamch<T>(Precision);
    const T smallNum = safeMin*(T(n)/ulp);
//
//  ==== Use accumulated reflections to update far-from-diagonal
//  .    entries ? ====
//
    const bool accum = (kacc22==1) || (kacc22==2);
//
//  ==== If so, exploit the 2-by-2 block structure? ====
//
    const bool blk22 = (ns>2) && (kacc22==2);
//
//  ==== clear trash ====
//
    if (kTop+2<=kBot) {
        H(kTop+2,kTop) = Zero;
    }
//
//  ==== NBMPS = number of 2-shift bulges in the chain ====
//
    const IndexType nBmps = ns/2;
//
//  ==== KDU = width of slab ====
//
    const IndexType kdu = 6*nBmps - 3;
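//  (For example, ns=4 shifts give nBmps=2 bulges and a slab of width
//   kdu = 6*2-3 = 9; each pass of the outer loop below then advances the
//   chain by 3*nBmps-2 = 4 columns.)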
//
//  ==== Create and chase chains of NBMPS bulges ====
//
    for (IndexType inCol=3*(1-nBmps)+kTop-1; inCol<=kBot-2; inCol+=3*nBmps-2) {
        IndexType ndCol = inCol + kdu;
        if (accum) {
            auto U_ = U(_(1,kdu),_(1,kdu));
            U_ = Zero;
            U_.diag(0) = One;
        }
//
//      ==== Near-the-diagonal bulge chase.  The following loop
//      .    performs the near-the-diagonal part of a small bulge
//      .    multi-shift QR sweep.  Each 6*NBMPS-2 column diagonal
//      .    chunk extends from column INCOL to column NDCOL
//      .    (including both column INCOL and column NDCOL).  The
//      .    following loop chases a 3*NBMPS column long chain of
//      .    NBMPS bulges 3*NBMPS-2 columns to the right.  (INCOL
//      .    may be less than KTOP and NDCOL may be greater than
//      .    KBOT indicating phantom columns from which to chase
//      .    bulges before they are actually introduced or to which
//      .    to chase bulges beyond column KBOT.) ====
//
        const IndexType krColMax = min(inCol+3*nBmps-3, kBot-2);

        for (IndexType krCol=inCol; krCol<=krColMax; ++krCol) {
//
//          ==== Bulges numbered MTOP to MBOT are active double implicit
//          .    shift bulges.  There may or may not also be a small
//          .    2-by-2 bulge, if there is room.  The inactive bulges
//          .    (if any) must wait until the active bulges have moved
//          .    down the diagonal to make room.  The phantom matrix
//          .    paradigm described above helps keep track. ====
//
            const IndexType mTop  = max(IndexType(1), ((kTop-1)-krCol+2)/3+1);
            const IndexType mBot  = min(nBmps, (kBot-krCol)/3);
            const IndexType m22   = mBot + 1;
            const bool      bmp22 = (mBot<nBmps) && (krCol+3*(m22-1))==(kBot-2);
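//          (Bulge m is chased from position k = krCol+3*(m-1); its 3-element
//           Householder reflector is generated below into column m of V,
//           with the scalar tau stored in V(1,m).)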
//
//          ==== Generate reflections to chase the chain right
//          .    one column.  (The minimum value of K is KTOP-1.) ====
//
            IndexType k;
            T         alpha, beta;

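//          (laqr1 below sets its output vector to a scalar multiple of the
//           first column of (H - s1*I)*(H - s2*I) for the shift pair
//           (s1, s2) = (sr(2m-1) + i*si(2m-1), sr(2m) + i*si(2m)); larfg then
//           turns that column into a reflector I - tau*v*v^T with v(1)=1.)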
            for (IndexType m=mTop; m<=mBot; ++m) {
                k = krCol + 3*(m-1);
                if (k==kTop-1) {
                    laqr1(H(_(kTop,kTop+2),_(kTop,kTop+2)),
                          sr(2*m-1), si(2*m-1), sr(2*m), si(2*m),
                          V(_(1,3),m));
                    alpha = V(1,m);
                    larfg(IndexType(3), alpha, V(_(2,3),m), V(1,m));
                } else {
                    beta   = H(k+1,k);
                    V(2,m) = H(k+2,k);
                    V(3,m) = H(k+3,k);
                    larfg(IndexType(3), beta, V(_(2,3),m), V(1,m));
//
//                  ==== A bulge may collapse because of vigilant
//                  .    deflation or destructive underflow.  In the
//                  .    underflow case, try the two-small-subdiagonals
//                  .    trick to try to reinflate the bulge. ====
//
                    if (H(k+3,k)!=Zero
                     || H(k+3,k+1)!=Zero
                     || H(k+3,k+2)==Zero) {
//
//                      ==== Typical case: not collapsed (yet). ====
//
                        H(k+1,k) = beta;
                        H(k+2,k) = Zero;
                        H(k+3,k) = Zero;
                    } else {
//
//                      ==== Atypical case: collapsed.  Attempt to
//                      .    reintroduce ignoring H(K+1,K) and H(K+2,K).
//                      .    If the fill resulting from the new
//                      .    reflector is too large, then abandon it.
//                      .    Otherwise, use the new one. ====
//
                        laqr1(H(_(k+1,k+3),_(k+1,k+3)),
                              sr(2*m-1), si(2*m-1), sr(2*m), si(2*m),
                              vt);
                        alpha = vt(1);
                        larfg(IndexType(3), alpha, vt(_(2,3)), vt(1));
                        const T refSum = vt(1)*(H(k+1,k) + vt(2)*H(k+2,k));
                        if (abs(H(k+2,k)-refSum*vt(2)) + abs(refSum*vt(3))
                            > ulp*(abs(H(k,k))+abs(H(k+1,k+1))+abs(H(k+2,k+2))))
                        {
//
//                          ==== Starting a new bulge here would
//                          .    create non-negligible fill.  Use
//                          .    the old one with trepidation. ====
//
                            H(k+1,k) = beta;
                            H(k+2,k) = Zero;
                            H(k+3,k) = Zero;
                        } else {
//
//                          ==== Starting a new bulge here would
//                          .    create only negligible fill.
//                          .    Replace the old reflector with
//                          .    the new one. ====
//
                            H(k+1,k) -= refSum;
                            H(k+2,k)  = Zero;
                            H(k+3,k)  = Zero;
                            V(1,m)    = vt(1);
                            V(2,m)    = vt(2);
                            V(3,m)    = vt(3);
                        }
                    }
                }
            }
//
//          ==== Generate a 2-by-2 reflection, if needed. ====
//
            k = krCol + 3*(m22-1);
            if (bmp22) {
                if (k==kTop-1) {
                    laqr1(H(_(k+1,k+2),_(k+1,k+2)),
                          sr(2*m22-1), si(2*m22-1), sr(2*m22), si(2*m22),
                          V(_(1,2),m22));
                    beta = V(1,m22);
                    larfg(IndexType(2), beta, V(_(2,2),m22), V(1,m22));
                } else {
                    beta     = H(k+1,k);
                    V(2,m22) = H(k+2,k);
                    larfg(IndexType(2), beta, V(_(2,2),m22), V(1,m22));
                    H(k+1,k) = beta;
                    H(k+2,k) = Zero;
                }
            }
//
//          ==== Multiply H by reflections from the left ====
//
            IndexType jBot;
            if (accum) {
                jBot = min(ndCol, kBot);
            } else if (wantT) {
                jBot = n;
            } else {
                jBot = kBot;
            }
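//          (Each reflector is Q = I - tau*v*v^T with v = (1, V(2,m), V(3,m))^T
//           and tau = V(1,m).  Applied from the left to column j of H this is
//               refSum    = tau*(H(k+1,j) + v2*H(k+2,j) + v3*H(k+3,j));
//               H(k+.,j) -= refSum*v(.);
//           which is exactly the hand-unrolled update in the loop below.)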
            for (IndexType j=max(kTop,krCol); j<=jBot; ++j) {
                IndexType mEnd = min(mBot, (j-krCol+2)/3);
                for (IndexType m=mTop; m<=mEnd; ++m) {
                    k = krCol + 3*(m-1);
                    const T refSum = V(1,m)*(H(k+1,j) + V(2,m)*H(k+2,j)
                                                      + V(3,m)*H(k+3,j));
                    H(k+1,j) -= refSum;
                    H(k+2,j) -= refSum*V(2,m);
                    H(k+3,j) -= refSum*V(3,m);
                }
            }
            if (bmp22) {
                k = krCol + 3*(m22-1);
                for (IndexType j=max(k+1,kTop); j<=jBot; ++j) {
                    const T refSum = V(1,m22)*(H(k+1,j)+V(2,m22)*H(k+2,j));
                    H(k+1,j) -= refSum;
                    H(k+2,j) -= refSum*V(2,m22);
                }
            }
//
//          ==== Multiply H by reflections from the right.
//          .    Delay filling in the last row until the
//          .    vigilant deflation check is complete. ====
//
            IndexType jTop;
            if (accum) {
                jTop = max(kTop, inCol);
            } else if (wantT) {
                jTop = 1;
            } else {
                jTop = kTop;
            }
            for (IndexType m=mTop; m<=mBot; ++m) {
                if (V(1,m)!=Zero) {
                    k = krCol + 3*(m-1);
                    for (IndexType j=jTop; j<=min(kBot,k+3); ++j) {
                        const T refSum = V(1,m)*(H(j,k+1) + V(2,m)*H(j,k+2)
                                                          + V(3,m)*H(j,k+3));
                        H(j,k+1) -= refSum;
                        H(j,k+2) -= refSum*V(2,m);
                        H(j,k+3) -= refSum*V(3,m);
                    }

                    if (accum) {
//
//                      ==== Accumulate U.  (If necessary, update Z later
//                      .    with an efficient matrix-matrix
//                      .    multiply.) ====
//
                        IndexType kms = k - inCol;
                        IndexType j1  = max(IndexType(1), kTop-inCol);

                        for (IndexType j=j1; j<=kdu; ++j) {
                            const T refSum = V(1,m)*(U(j,kms+1)
                                                     + V(2,m)*U(j,kms+2)
                                                     + V(3,m)*U(j,kms+3));
                            U(j,kms+1) -= refSum;
                            U(j,kms+2) -= refSum*V(2,m);
                            U(j,kms+3) -= refSum*V(3,m);
                        }
                    } else if (wantZ) {
//
//                      ==== U is not accumulated, so update Z
//                      .    now by multiplying by reflections
//                      .    from the right. ====
//
                        for (IndexType j=iLoZ; j<=iHiZ; ++j) {
                            const T refSum = V(1,m)*(Z(j,k+1)
                                                     + V(2,m)*Z(j,k+2)
                                                     + V(3,m)*Z(j,k+3));
                            Z(j,k+1) -= refSum;
                            Z(j,k+2) -= refSum*V(2,m);
                            Z(j,k+3) -= refSum*V(3,m);
                        }
                    }
                }
            }
//
//          ==== Special case: 2-by-2 reflection (if needed) ====
//
            k = krCol + 3*(m22-1);
            if (bmp22) {
                if (V(1,m22)!=Zero) {
                    for (IndexType j=jTop; j<=min(kBot,k+3); ++j) {
                        const T refSum = V(1,m22)*(H(j,k+1)+V(2,m22)*H(j,k+2));
                        H(j,k+1) -= refSum;
                        H(j,k+2) -= refSum*V(2,m22);
                    }

                    if (accum) {
                        IndexType kms = k - inCol;
                        IndexType j1  = max(IndexType(1), kTop-inCol);

                        for (IndexType j=j1; j<=kdu; ++j) {
                            const T refSum = V(1,m22)*(U(j,kms+1)
                                                       + V(2,m22)*U(j,kms+2));
                            U(j,kms+1) -= refSum;
                            U(j,kms+2) -= refSum*V(2,m22);
                        }
                    } else if (wantZ) {
                        for (IndexType j=iLoZ; j<=iHiZ; ++j) {
                            const T refSum = V(1,m22)*(Z(j,k+1)
                                                       + V(2,m22)*Z(j,k+2));
                            Z(j,k+1) -= refSum;
                            Z(j,k+2) -= refSum*V(2,m22);
                        }
                    }
                }
            }
//
//          ==== Vigilant deflation check ====
//
            IndexType mStart = mTop;
            if (krCol+3*(mStart-1)<kTop) {
                ++mStart;
            }
            IndexType mEnd = mBot;
            if (bmp22) {
                ++mEnd;
            }
            if (krCol==kBot-2) {
                ++mEnd;
            }
            for (IndexType m=mStart; m<=mEnd; ++m) {
                k = min(kBot-1, krCol+3*(m-1));
//
//              ==== The following convergence test requires that
//              .    the traditional small-compared-to-nearby-diagonals
//              .    criterion and the Ahues & Tisseur (LAWN 122, 1997)
//              .    criteria both be satisfied.  The latter improves
//              .    accuracy in some examples.  Falling back on an
//              .    alternate convergence criterion when TST1 or TST2
//              .    is zero (as done here) is traditional but probably
//              .    unnecessary. ====
//
                if (H(k+1,k)!=Zero) {
                    T test1 = abs(H(k,k)) + abs(H(k+1,k+1));
                    if (test1==Zero) {
                        if (k>=kTop+1) {
                            test1 += abs(H(k,k-1));
                        }
                        if (k>=kTop+2) {
                            test1 += abs(H(k,k-2));
                        }
                        if (k>=kTop+3) {
                            test1 += abs(H(k,k-3));
                        }
                        if (k<=kBot-2) {
                            test1 += abs(H(k+2,k+1));
                        }
                        if (k<=kBot-3) {
                            test1 += abs(H(k+3,k+1));
                        }
                        if (k<=kBot-4) {
                            test1 += abs(H(k+4,k+1));
                        }
                    }
                    if (abs(H(k+1,k))<=max(smallNum, ulp*test1)) {
                        const T H12 = max(abs(H(k+1,k)), abs(H(k,k+1)));
                        const T H21 = min(abs(H(k+1,k)), abs(H(k,k+1)));
                        const T H11 = max(abs(H(k+1,k+1)),
                                          abs(H(k,k)-H(k+1,k+1)));
                        const T H22 = min(abs(H(k+1,k+1)),
                                          abs(H(k,k)-H(k+1,k+1)));
                        const T scal  = H11 + H12;
                        const T test2 = H22*(H11/scal);

                        if (test2==Zero
                         || H21*(H12/scal)<=max(smallNum, ulp*test2))
                        {
                            H(k+1,k) = Zero;
                        }
                    }
                }
            }
//
//          ==== Fill in the last row of each bulge. ====
//
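//          (This is the part of the right-update that was delayed above:
//           the loop there stopped at row min(kBot,k+3), skipping row k+4.
//           In that row H(k+4,k+1) and H(k+4,k+2) are still zero, so the
//           reflection reduces to refSum = tau*v3*H(k+4,k+3), written out
//           element by element below.)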
            mEnd = min(nBmps, (kBot-krCol-1)/3);
            for (IndexType m=mTop; m<=mEnd; ++m) {
                k = krCol + 3*(m-1);
                const T refSum = V(1,m)*V(3,m)*H(k+4,k+3);
                H(k+4,k+1)  = -refSum;
                H(k+4,k+2)  = -refSum*V(2,m);
                H(k+4,k+3) -=  refSum*V(3,m);
            }
//
//          ==== End of near-the-diagonal bulge chase. ====
//
        }
//
//      ==== Use U (if accumulated) to update far-from-diagonal
//      .    entries in H.  If required, use U to update Z as
//      .    well. ====
//
        if (accum) {
            IndexType jTop, jBot;
            if (wantT) {
                jTop = 1;
                jBot = n;
            } else {
                jTop = kTop;
                jBot = kBot;
            }
            if ((!blk22) || (inCol<kTop) || (ndCol>kBot) || (ns<=2)) {
//
//              ==== Updates not exploiting the 2-by-2 block
//              .    structure of U.  K1 and NU keep track of
//              .    the location and size of U in the special
//              .    cases of introducing bulges and chasing
//              .    bulges off the bottom.  In these special
//              .    cases and in case the number of shifts
//              .    is NS = 2, there is no 2-by-2 block
//              .    structure to exploit. ====
//
                const IndexType k1  = max(IndexType(1), kTop-inCol);
                const IndexType nu  = (kdu-max(IndexType(0), ndCol-kBot)) - k1 + 1;
                const IndexType _nu = (kdu-max(IndexType(0), ndCol-kBot));
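//              (_nu = k1+nu-1 is the last active row/column index of U, so
//               U(_(k1,_nu),_(k1,_nu)) below is exactly the nu-by-nu block
//               that differs from the identity.)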
//
//              ==== Horizontal Multiply ====
//
                for (IndexType jCol=min(ndCol,kBot)+1; jCol<=jBot; jCol+=nh) {
                    const IndexType jLen = min(nh, jBot-jCol+1);

                    auto _U  = U(_(k1,_nu),_(k1,_nu));
                    auto _H  = H(_(inCol+k1,inCol+_nu),_(jCol,jCol+jLen-1));
                    auto _WH = WH(_(1,nu),_(1,jLen));

                    blas::mm(ConjTrans, NoTrans, One, _U, _H, Zero, _WH);
                    _H = _WH;
                }
//
//              ==== Vertical multiply ====
//
                for (IndexType jRow=jTop; jRow<=max(kTop,inCol)-1; jRow+=nv) {
                    const IndexType jLen = min(nv, max(kTop,inCol)-jRow);

                    auto _H  = H(_(jRow,jRow+jLen-1),_(inCol+k1,inCol+_nu));
                    auto _U  = U(_(k1,_nu),_(k1,_nu));
                    auto _WV = WV(_(1,jLen),_(1,nu));

                    blas::mm(NoTrans, NoTrans, One, _H, _U, Zero, _WV);
                    _H = _WV;
                }
//
//              ==== Z multiply (also vertical) ====
//
                if (wantZ) {
                    for (IndexType jRow=iLoZ; jRow<=iHiZ; jRow+=nv) {
                        const IndexType jLen = min(nv, iHiZ-jRow+1);

                        auto _Z  = Z(_(jRow,jRow+jLen-1),_(inCol+k1,inCol+_nu));
                        auto _U  = U(_(k1,_nu),_(k1,_nu));
                        auto _WV = WV(_(1,jLen),_(1,nu));

                        blas::mm(NoTrans, NoTrans, One, _Z, _U, Zero, _WV);
                        _Z = _WV;
                    }
                }
            } else {
//
//              ==== Updates exploiting U's 2-by-2 block structure.
//              .    (I2, I4, J2, J4 are the last rows and columns
//              .    of the blocks.) ====
//
                const IndexType i2 = (kdu+1)/2;
                const IndexType i4 = kdu;
                const IndexType j2 = i4 - i2;
                const IndexType j4 = kdu;
//
//              ==== KZS and KNZ deal with the band of zeros
//              .    along the diagonal of one of the triangular
//              .    blocks. ====
//
                const IndexType kZs = (j4-j2) - (ns+1);
                const IndexType kNz = ns + 1;
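//              (For example, nBmps=3 bulges give kdu=15, i2=8, j2=7, so the
//               zero band has kZs = (15-7)-(6+1) = 1 row and kNz = 7 nonzero
//               rows; for nBmps=2 the band is empty, kZs = 0.)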
//
//              ==== Horizontal multiply ====
//
                for (IndexType jCol=min(ndCol,kBot)+1; jCol<=jBot; jCol+=nh) {
                    const IndexType jLen = min(nh, jBot-jCol+1);
//
//                  ==== Copy bottom of H to top+KZS of scratch
//                  .    (the first KZS rows get multiplied by zero) ====
//
                    WH(_(kZs+1,kZs+kNz),_(1,jLen))
                        = H(_(inCol+j2+1,inCol+j2+kNz),_(jCol,jCol+jLen-1));
//
//                  ==== Multiply by U21**T ====
//
                    WH(_(1,kZs),_(1,jLen)) = Zero;
                    blas::mm(Left, ConjTrans, One,
                             U(_(j2+1,j2+kNz),_(kZs+1,kZs+kNz)).upper(),
                             WH(_(kZs+1,kZs+kNz),_(1,jLen)));
//
//                  ==== Multiply top of H by U11**T ====
//
                    blas::mm(ConjTrans, NoTrans, One,
                             U(_(1,j2),_(1,i2)),
                             H(_(inCol+1,inCol+j2),_(jCol,jCol+jLen-1)),
                             One,
                             WH(_(1,i2),_(1,jLen)));
//
//                  ==== Copy top of H to bottom of WH ====
//
                    WH(_(i2+1,i2+j2),_(1,jLen))
                        = H(_(inCol+1,inCol+j2),_(jCol,jCol+jLen-1));
//
//                  ==== Multiply by U21**T ====
//
                    blas::mm(Left, ConjTrans, One,
                             U(_(1,j2),_(i2+1,i2+j2)).lower(),
                             WH(_(i2+1,i2+j2),_(1,jLen)));
//
//                  ==== Multiply by U22 ====
//
                    blas::mm(ConjTrans, NoTrans, One,
                             U(_(j2+1,j4),_(i2+1,i4)),
                             H(_(inCol+j2+1,inCol+j4),_(jCol,jCol+jLen-1)),
                             One,
                             WH(_(i2+1,i4),_(1,jLen)));
//
//                  ==== Copy it back ====
//
                    H(_(inCol+1,inCol+kdu),_(jCol,jCol+jLen-1))
                        = WH(_(1,kdu),_(1,jLen));
                }
//
//              ==== Vertical multiply ====
//
                for (IndexType jRow=jTop; jRow<=max(inCol,kTop)-1; jRow+=nv) {
                    const IndexType jLen = min(nv, max(inCol,kTop)-jRow);
//
//                  ==== Copy right of H to scratch (the first KZS
//                  .    columns get multiplied by zero) ====
//
                    WV(_(1,jLen),_(kZs+1,kZs+kNz))
                        = H(_(jRow,jRow+jLen-1),_(inCol+j2+1,inCol+j2+kNz));
//
//                  ==== Multiply by U21 ====
//
                    WV(_(1,jLen),_(1,kZs)) = Zero;
                    blas::mm(Right, NoTrans, One,
                             U(_(j2+1,j2+kNz),_(kZs+1,kZs+kNz)).upper(),
                             WV(_(1,jLen),_(kZs+1,kZs+kNz)));
//
//                  ==== Multiply by U11 ====
//
                    blas::mm(NoTrans, NoTrans, One,
                             H(_(jRow,jRow+jLen-1),_(inCol+1,inCol+j2)),
                             U(_(1,j2),_(1,i2)),
                             One,
                             WV(_(1,jLen),_(1,i2)));
//
//                  ==== Copy left of H to right of scratch ====
//
                    WV(_(1,jLen),_(i2+1,i2+j2))
                        = H(_(jRow,jRow+jLen-1),_(inCol+1,inCol+j2));
//
//                  ==== Multiply by U21 ====
//
                    blas::mm(Right, NoTrans, One,
                             U(_(1,i4-i2),_(i2+1,i4)).lower(),
                             WV(_(1,jLen),_(i2+1,i4)));
//
//                  ==== Multiply by U22 ====
//
                    blas::mm(NoTrans, NoTrans, One,
                             H(_(jRow,jRow+jLen-1),_(inCol+j2+1,inCol+j4)),
                             U(_(j2+1,j4),_(i2+1,i4)),
                             One,
                             WV(_(1,jLen),_(i2+1,i4)));
//
//                  ==== Copy it back ====
//
                    H(_(jRow,jRow+jLen-1),_(inCol+1,inCol+kdu))
                        = WV(_(1,jLen),_(1,kdu));
                }
//
//              ==== Multiply Z (also vertical) ====
//
                if (wantZ) {
                    for (IndexType jRow=iLoZ; jRow<=iHiZ; jRow+=nv) {
                        const IndexType jLen = min(nv, iHiZ-jRow+1);
//
//                      ==== Copy right of Z to left of scratch (first
//                      .    KZS columns get multiplied by zero) ====
//
                        WV(_(1,jLen),_(kZs+1,kZs+kNz))
                            = Z(_(jRow,jRow+jLen-1),_(inCol+j2+1,inCol+j2+kNz));
//
//                      ==== Multiply by U12 ====
//
                        WV(_(1,jLen),_(1,kZs)) = Zero;
                        blas::mm(Right, NoTrans, One,
                                 U(_(j2+1,j2+kNz),_(kZs+1,kZs+kNz)).upper(),
                                 WV(_(1,jLen),_(kZs+1,kZs+kNz)));
//
//                      ==== Multiply by U11 ====
//
                        blas::mm(NoTrans, NoTrans, One,
                                 Z(_(jRow,jRow+jLen-1),_(inCol+1,inCol+j2)),
                                 U(_(1,j2),_(1,i2)),
                                 One,
                                 WV(_(1,jLen),_(1,i2)));
//
//                      ==== Copy left of Z to right of scratch ====
//
                        WV(_(1,jLen),_(i2+1,i2+j2))
                            = Z(_(jRow,jRow+jLen-1),_(inCol+1,inCol+j2));
//
//                      ==== Multiply by U21 ====
//
                        blas::mm(Right, NoTrans, One,
                                 U(_(1,i4-i2),_(i2+1,i4)).lower(),
                                 WV(_(1,jLen),_(i2+1,i4)));
//
//                      ==== Multiply by U22 ====
//
                        blas::mm(NoTrans, NoTrans, One,
                                 Z(_(jRow,jRow+jLen-1),_(inCol+j2+1,inCol+j4)),
                                 U(_(j2+1,j4),_(i2+1,i4)),
                                 One,
                                 WV(_(1,jLen),_(i2+1,i4)));
//
//                      ==== Copy the result back to Z ====
//
                        Z(_(jRow,jRow+jLen-1),_(inCol+1,inCol+kdu))
                            = WV(_(1,jLen),_(1,kdu));
                    }
                }
            }
        }
    }
}

//== interface for native lapack ===============================================

#ifdef CHECK_CXXLAPACK

template <typename IndexType, typename VSR, typename VSI, typename MH,
          typename MZ, typename MV, typename MU, typename MWV, typename MWH>
void
laqr5_native(bool              wantT,
             bool              wantZ,
             IndexType         kacc22,
             IndexType         kTop,
             IndexType         kBot,
             IndexType         nShifts,
             DenseVector<VSR>  &sr,
             DenseVector<VSI>  &si,
             GeMatrix<MH>      &H,
             IndexType         iLoZ,
             IndexType         iHiZ,
             GeMatrix<MZ>      &Z,
             GeMatrix<MV>      &V,
             GeMatrix<MU>      &U,
             GeMatrix<MWV>     &WV,
             GeMatrix<MWH>     &WH)
{
    typedef typename GeMatrix<MH>::ElementType  T;

    const LOGICAL WANTT  = wantT;
    const LOGICAL WANTZ  = wantZ;
    const INTEGER KACC22 = kacc22;
    const INTEGER N      = H.numRows();
    const INTEGER KTOP   = kTop;
    const INTEGER KBOT   = kBot;
    const INTEGER NSHFTS = nShifts;
    const INTEGER LDH    = H.leadingDimension();
    const INTEGER ILOZ   = iLoZ;
    const INTEGER IHIZ   = iHiZ;
    const INTEGER LDZ    = Z.leadingDimension();
    const INTEGER LDV    = V.leadingDimension();
    const INTEGER LDU    = U.leadingDimension();
    const INTEGER NV     = WV.numRows();
    const INTEGER LDWV   = WV.leadingDimension();
    const INTEGER NH     = WH.numCols();
    const INTEGER LDWH   = WH.leadingDimension();

    if (IsSame<T,DOUBLE>::value) {
        LAPACK_IMPL(dlaqr5)(&WANTT,
                            &WANTZ,
                            &KACC22,
                            &N,
                            &KTOP,
                            &KBOT,
                            &NSHFTS,
                            sr.data(),
                            si.data(),
                            H.data(),
                            &LDH,
                            &ILOZ,
                            &IHIZ,
                            Z.data(),
                            &LDZ,
                            V.data(),
                            &LDV,
                            U.data(),
                            &LDU,
                            &NV,
                            WV.data(),
                            &LDWV,
                            &NH,
                            WH.data(),
                            &LDWH);
    } else {
        ASSERT(0);
    }
}

#endif // CHECK_CXXLAPACK

//== public interface ==========================================================
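//
//  A minimal usage sketch (illustrative only; the sizes below are
//  hypothetical and, inside FLENS, this routine is normally driven by the
//  laqr0/laqr4 QR drivers rather than called directly):
//
//      typedef GeMatrix<FullStorage<double, ColMajor> >  DMatrix;
//      typedef DenseVector<Array<double> >               DVector;
//
//      const int n = 100, ns = 4;             // ns shifts -> ns/2 bulges
//      DMatrix  H(n, n), Z(n, n);             // upper Hessenberg H, orthogonal Z
//      DVector  sr(ns), si(ns);               // real/imaginary parts of shifts
//      DMatrix  V(3, ns/2), U(3*ns-3, 3*ns-3);
//      DMatrix  WV(n, 3*ns-3), WH(3*ns-3, n);
//      // ... set up H, Z (=identity), sr/si, then sweep the whole matrix:
//      laqr5(true, true, 2, 1, n, ns, sr, si, H, 1, n, Z, V, U, WV, WH);
//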
template <typename IndexType, typename VSR, typename VSI, typename MH,
          typename MZ, typename MV, typename MU, typename MWV, typename MWH>
void
laqr5(bool              wantT,
      bool              wantZ,
      IndexType         kacc22,
      IndexType         kTop,
      IndexType         kBot,
      IndexType         nShifts,
      DenseVector<VSR>  &sr,
      DenseVector<VSI>  &si,
      GeMatrix<MH>      &H,
      IndexType         iLoZ,
      IndexType         iHiZ,
      GeMatrix<MZ>      &Z,
      GeMatrix<MV>      &V,
      GeMatrix<MU>      &U,
      GeMatrix<MWV>     &WV,
      GeMatrix<MWH>     &WH)
{
    LAPACK_DEBUG_OUT("laqr5");

    using std::max;
//
//  Test the input parameters
//
#   ifndef NDEBUG
    ASSERT((kacc22==0)||(kacc22==1)||(kacc22==2));
    ASSERT(H.firstRow()==1);
    ASSERT(H.firstCol()==1);
    ASSERT(H.numRows()==H.numCols());

    const IndexType n = H.numRows();

    ASSERT(1<=kTop);
    ASSERT(kBot<=n);

    ASSERT(nShifts>0);
    ASSERT(nShifts % 2 == 0);

    ASSERT(sr.length()==nShifts);
    ASSERT(si.length()==nShifts);

    if (wantZ) {
        ASSERT(1<=iLoZ);
        ASSERT(iLoZ<=iHiZ);
        ASSERT(iHiZ<=n);
    }

    ASSERT(V.firstRow()==1);
    ASSERT(V.firstCol()==1);
    ASSERT(V.numRows()>=3);
    ASSERT(V.numCols()==nShifts/2);

    ASSERT(U.firstRow()==1);
    ASSERT(U.firstCol()==1);
    ASSERT(U.numRows()>=3*nShifts-3);

    ASSERT(WH.firstRow()==1);
    ASSERT(WH.firstCol()==1);
    ASSERT(WH.numRows()>=3*nShifts-3);
    ASSERT(WH.numCols()>=1);

    ASSERT(WV.firstRow()==1);
    ASSERT(WV.firstCol()==1);
    ASSERT(WV.numRows()>=1);
    ASSERT(WV.numCols()>=3*nShifts-3);
#   endif

//
//  Make copies of output arguments
//
#   ifdef CHECK_CXXLAPACK
    typename DenseVector<VSR>::NoView  sr_org = sr;
    typename DenseVector<VSI>::NoView  si_org = si;
    typename GeMatrix<MH>::NoView      H_org  = H;
    typename GeMatrix<MZ>::NoView      Z_org  = Z;
    typename GeMatrix<MV>::NoView      V_org  = V;
    typename GeMatrix<MU>::NoView      U_org  = U;
    typename GeMatrix<MWV>::NoView     WV_org = WV;
    typename GeMatrix<MWH>::NoView     WH_org = WH;
#   endif

//
//  Call implementation
//
    laqr5_generic(wantT, wantZ, kacc22, kTop, kBot, nShifts,
                  sr, si, H, iLoZ, iHiZ, Z,
                  V, U, WV, WH);

#   ifdef CHECK_CXXLAPACK
//
//  Make copies of results computed by the generic implementation
//
    typename DenseVector<VSR>::NoView  sr_generic = sr;
    typename DenseVector<VSI>::NoView  si_generic = si;
    typename GeMatrix<MH>::NoView      H_generic  = H;
    typename GeMatrix<MZ>::NoView      Z_generic  = Z;
    typename GeMatrix<MV>::NoView      V_generic  = V;
    typename GeMatrix<MU>::NoView      U_generic  = U;
    typename GeMatrix<MWV>::NoView     WV_generic = WV;
    typename GeMatrix<MWH>::NoView     WH_generic = WH;
//
//  restore output arguments
//
    sr = sr_org;
    si = si_org;
    H  = H_org;
    Z  = Z_org;
    V  = V_org;
    U  = U_org;
    WV = WV_org;
    WH = WH_org;

//
//  Compare generic results with results from the native implementation
//
    laqr5_native(wantT, wantZ, kacc22, kTop, kBot, nShifts,
                 sr, si, H, iLoZ, iHiZ, Z,
                 V, U, WV, WH);

    bool failed = false;
    if (! isIdentical(sr_generic, sr, "sr_generic", "sr")) {
        // std::cerr << "CXXLAPACK: sr_generic = " << sr_generic << std::endl;
        // std::cerr << "F77LAPACK: sr = " << sr << std::endl;
        failed = true;
    }

    if (! isIdentical(si_generic, si, "si_generic", "si")) {
        // std::cerr << "CXXLAPACK: si_generic = " << si_generic << std::endl;
        // std::cerr << "F77LAPACK: si = " << si << std::endl;
        failed = true;
    }

    if (! isIdentical(H_generic, H, "H_generic", "H")) {
        // std::cerr << "CXXLAPACK: H_generic = " << H_generic << std::endl;
        // std::cerr << "F77LAPACK: H = " << H << std::endl;
        failed = true;
    }

    if (! isIdentical(Z_generic, Z, "Z_generic", "Z")) {
        // std::cerr << "CXXLAPACK: Z_generic = " << Z_generic << std::endl;
        // std::cerr << "F77LAPACK: Z = " << Z << std::endl;
        failed = true;
    }

    if (! isIdentical(V_generic, V, "V_generic", "V")) {
        // std::cerr << "CXXLAPACK: V_generic = " << V_generic << std::endl;
        // std::cerr << "F77LAPACK: V = " << V << std::endl;
        failed = true;
    }

    if (! isIdentical(U_generic, U, "U_generic", "U")) {
        // std::cerr << "CXXLAPACK: U_generic = " << U_generic << std::endl;
        // std::cerr << "F77LAPACK: U = " << U << std::endl;
        failed = true;
    }

    if (! isIdentical(WV_generic, WV, "WV_generic", "WV")) {
        // std::cerr << "CXXLAPACK: WV_generic = " << WV_generic << std::endl;
        // std::cerr << "F77LAPACK: WV = " << WV << std::endl;
        failed = true;
    }

    if (! isIdentical(WH_generic, WH, "WH_generic", "WH")) {
        // std::cerr << "CXXLAPACK: WH_generic = " << WH_generic << std::endl;
        // std::cerr << "F77LAPACK: WH = " << WH << std::endl;
        failed = true;
    }

    if (failed) {
        std::cerr << "error in: laqr5.tcc" << std::endl;
        std::cerr << "N = H.numRows() = " << H.numRows() << std::endl;
        std::cerr << "H.numCols() = " << H.numCols() << std::endl;
        std::cerr << "NV = WV.numRows() = " << WV.numRows() << std::endl;
        std::cerr << "WV.numCols() = " << WV.numCols() << std::endl;
        std::cerr << "WH.numRows() = " << WH.numRows() << std::endl;
        std::cerr << "NH = WH.numCols() = " << WH.numCols() << std::endl;
        ASSERT(0);
    } else {
        // std::cerr << "passed: laqr5.tcc" << std::endl;
    }
#   endif
}

//-- forwarding ----------------------------------------------------------------
template <typename IndexType, typename VSR, typename VSI, typename MH,
          typename MZ, typename MV, typename MU, typename MWV, typename MWH>
void
laqr5(bool       wantT,
      bool       wantZ,
      IndexType  kacc22,
      IndexType  kTop,
      IndexType  kBot,
      IndexType  nShifts,
      VSR        &&sr,
      VSI        &&si,
      MH         &&H,
      IndexType  iLoZ,
      IndexType  iHiZ,
      MZ         &&Z,
      MV         &&V,
      MU         &&U,
      MWV        &&WV,
      MWH        &&WH)
{
    CHECKPOINT_ENTER;
    laqr5(wantT, wantZ, kacc22, kTop, kBot, nShifts, sr, si, H, iLoZ, iHiZ, Z,
          V, U, WV, WH);
    CHECKPOINT_LEAVE;
}

} } // namespace lapack, flens

#endif // FLENS_LAPACK_EIG_LAQR5_TCC