laqr5.tcc (flens/lapack/eig/laqr5.tcc)

   1 /*

   2  *   Copyright (c) 2011, Michael Lehn

   3  *

   4  *   All rights reserved.

   5  *

   6  *   Redistribution and use in source and binary forms, with or without

   7  *   modification, are permitted provided that the following conditions

   8  *   are met:

   9  *

  10  *   1) Redistributions of source code must retain the above copyright

  11  *      notice, this list of conditions and the following disclaimer.

  12  *   2) Redistributions in binary form must reproduce the above copyright

  13  *      notice, this list of conditions and the following disclaimer in

  14  *      the documentation and/or other materials provided with the

  15  *      distribution.

  16  *   3) Neither the name of the FLENS development group nor the names of

  17  *      its contributors may be used to endorse or promote products derived

  18  *      from this software without specific prior written permission.

  19  *

  20  *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS

  21  *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT

  22  *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR

  23  *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT

  24  *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,

  25  *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT

  26  *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,

  27  *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY

  28  *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT

  29  *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE

  30  *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

  31  */

  32 

  33 /* Based on

  34  *

  35        SUBROUTINE DLAQR5( WANTT, WANTZ, KACC22, N, KTOP, KBOT, NSHFTS,

  36       $                   SR, SI, H, LDH, ILOZ, IHIZ, Z, LDZ, V, LDV, U,

  37       $                   LDU, NV, WV, LDWV, NH, WH, LDWH )

  38  *

  39  *  -- LAPACK auxiliary routine (version 3.3.0) --

  40  *  -- LAPACK is a software package provided by Univ. of Tennessee,    --

  41  *  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--

  42  *     November 2010

  43  *

  44  */

  45 

  46 #ifndef FLENS_LAPACK_EIG_LAQR5_TCC

  47 #define FLENS_LAPACK_EIG_LAQR5_TCC 1

  48 

  49 #include <flens/blas/blas.h>

  50 #include <flens/lapack/lapack.h>

  51 

  52 namespace flens { namespace lapack {

  53 

  54 //== generic lapack implementation =============================================

  55 

  56 template <typename IndexType, typename VSR, typename VSI, typename MH,

  57           typename MZ, typename MV, typename MU, typename MWV, typename MWH>

  58 void

  59 laqr5_generic(bool                      wantT,

  60               bool                      wantZ,

  61               IndexType                 kacc22,

  62               IndexType                 kTop,

  63               IndexType                 kBot,

  64               IndexType                 nShifts,

  65               DenseVector<VSR>          &sr,

  66               DenseVector<VSI>          &si,

  67               GeMatrix<MH>              &H,

  68               IndexType                 iLoZ,

  69               IndexType                 iHiZ,

  70               GeMatrix<MZ>              &Z,

  71               GeMatrix<MV>              &V,

  72               GeMatrix<MU>              &U,

  73               GeMatrix<MWV>             &WV,

  74               GeMatrix<MWH>             &WH)

  75 {

  76     using std::abs;

  77     using std::max;

  78     using std::min;

  79 

  80     typedef typename GeMatrix<MH>::ElementType  T;

  81 

  82     const T Zero(0), One(1);

  83 

  84     const Underscore<IndexType> _;

  85 

  86     const IndexType n   = H.numRows();

  87     const IndexType nv  = WV.numRows();

  88     const IndexType nh  = WH.numCols();

  89 

  90     typedef typename GeMatrix<MH>::VectorView   VectorView;

  91     T           vtBuffer[3];

  92     VectorView  vt = typename VectorView::Engine(3, vtBuffer);

  93 

  94 //

  95 //  ==== If there are no shifts, then there is nothing to do. ====

  96 //

  97     if (nShifts<2) {

  98         return;

  99     }

 100 //

 101 //  ==== If the active block is empty or 1-by-1, then there

 102 //  .    is nothing to do. ====

 103 //

 104     if (kTop>=kBot) {

 105         return;

 106     }

 107 //

 108 //  ==== Shuffle shifts into pairs of real shifts and pairs

 109 //  .    of complex conjugate shifts assuming complex

 110 //  .    conjugate shifts are already adjacent to one

 111 //  .    another. ====

 112 //

 113     for (IndexType i=1; i<=nShifts-2; i+=2) {

 114         if (si(i)!=-si(i+1)) {

 115             T tmp = sr(i);

 116             sr(i) = sr(i+1);

 117             sr(i+1) = sr(i+2);

 118             sr(i+2) = tmp;

 119 

 120             tmp = si(i);

 121             si(i) = si(i+1);

 122             si(i+1) = si(i+2);

 123             si(i+2) = tmp;

 124         }

 125     }

 126 //

 127 //  ==== NSHFTS is supposed to be even, but if it is odd,

 128 //  .    then simply reduce it by one.  The shuffle above

 129 //  .    ensures that the dropped shift is real and that

 130 //  .    the remaining shifts are paired. ====

 131 //

 132     const IndexType ns = nShifts - (nShifts % 2);

 133 //

 134 //  ==== Machine constants for deflation ====

 135 //

 136     T safeMin   = lamch<T>(SafeMin);

 137     T safeMax   = One/safeMin;

 138     labad(safeMin, safeMax);

 139 

 140     const T ulp         = lamch<T>(Precision);

 141     const T smallNum    = safeMin*(T(n)/ulp);

 142 //

 143 //  ==== Use accumulated reflections to update far-from-diagonal

 144 //  .    entries ? ====

 145 //

 146     const bool accum = (kacc22==1) || (kacc22==2);

 147 //

 148 //  ==== If so, exploit the 2-by-2 block structure? ====

 149 //

 150     const bool blk22 = (ns>2) && (kacc22==2);

 151 //

 152 //  ==== clear trash ====

 153 //

 154     if (kTop+2<=kBot) {

 155         H(kTop+2,kTop) = Zero;

 156     }

 157 //

 158 //  ==== NBMPS = number of 2-shift bulges in the chain ====

 159 //

 160     const IndexType nBmps = ns/2;

 161 //

 162 //  ==== KDU = width of slab ====

 163 //

 164     const IndexType kdu = 6*nBmps - 3;

 165 //

 166 //  ==== Create and chase chains of NBMPS bulges ====

 167 //

 168     for (IndexType inCol=3*(1-nBmps)+kTop-1; inCol<=kBot-2; inCol+=3*nBmps-2) {

 169         IndexType ndCol = inCol + kdu;

 170         if (accum) {

 171             auto U_ = U(_(1,kdu),_(1,kdu));

 172             U_           = Zero;

 173             U_.diag(0)   = One;

 174         }

 175 //

 176 //      ==== Near-the-diagonal bulge chase.  The following loop

 177 //      .    performs the near-the-diagonal part of a small bulge

 178 //      .    multi-shift QR sweep.  Each 6*NBMPS-2 column diagonal

 179 //      .    chunk extends from column INCOL to column NDCOL

 180 //      .    (including both column INCOL and column NDCOL). The

 181 //      .    following loop chases a 3*NBMPS column long chain of

 182 //      .    NBMPS bulges 3*NBMPS-2 columns to the right.  (INCOL

 183 //      .    may be less than KTOP and and NDCOL may be greater than

 184 //      .    KBOT indicating phantom columns from which to chase

 185 //      .    bulges before they are actually introduced or to which

 186 //      .    to chase bulges beyond column KBOT.)  ====

 187 //

 188         const IndexType krColMax = min(inCol+3*nBmps-3, kBot-2);

 189 

 190         for (IndexType krCol=inCol; krCol<=krColMax; ++krCol) {

 191 //

 192 //          ==== Bulges number MTOP to MBOT are active double implicit

 193 //          .    shift bulges.  There may or may not also be small

 194 //          .    2-by-2 bulge, if there is room.  The inactive bulges

 195 //          .    (if any) must wait until the active bulges have moved

 196 //          .    down the diagonal to make room.  The phantom matrix

 197 //          .    paradigm described above helps keep track.  ====

 198 //

 199             const IndexType mTop = max(IndexType(1), ((kTop-1)-krCol+2)/3+1);

 200             const IndexType mBot = min(nBmps, (kBot-krCol)/3);

 201             const IndexType m22  = mBot + 1;

 202             const bool bmp22     = (mBot<nBmps ) && (krCol+3*(m22-1))==(kBot-2);

 203 //

 204 //          ==== Generate reflections to chase the chain right

 205 //          .    one column.  (The minimum value of K is KTOP-1.) ====

 206 //

 207             IndexType   k;

 208             T           alpha, beta;

 209 

 210             for (IndexType m=mTop; m<=mBot; ++m) {

 211                 k = krCol + 3*(m-1);

 212                 if (k==kTop-1) {

 213                     laqr1(H(_(kTop,kTop+2),_(kTop,kTop+2)),

 214                           sr(2*m-1), si(2*m-1), sr(2*m), si(2*m),

 215                           V(_(1,3),m));

 216                     alpha = V(1,m);

 217                     larfg(IndexType(3), alpha, V(_(2,3),m), V(1,m));

 218                 } else {

 219                     beta = H(k+1,k);

 220                     V(2,m) = H(k+2,k);

 221                     V(3,m) = H(k+3,k);

 222                     larfg(IndexType(3), beta, V(_(2,3),m), V(1,m));

 223 //

 224 //                  ==== A Bulge may collapse because of vigilant

 225 //                  .    deflation or destructive underflow.  In the

 226 //                  .    underflow case, try the two-small-subdiagonals

 227 //                  .    trick to try to reinflate the bulge.  ====

 228 //

 229                     if (H(k+3,k)!=Zero

 230                      || H(k+3,k+1)!=Zero

 231                      || H(k+3,k+2)==Zero) {

 232 //

 233 //                   ==== Typical case: not collapsed (yet). ====

 234 //

 235                         H(k+1, k) = beta;

 236                         H(k+2, k) = Zero;

 237                         H(k+3, k) = Zero;

 238                     } else {

 239 //

 240 //                      ==== Atypical case: collapsed.  Attempt to

 241 //                      .    reintroduce ignoring H(K+1,K) and H(K+2,K).

 242 //                      .    If the fill resulting from the new

 243 //                      .    reflector is too large, then abandon it.

 244 //                      .    Otherwise, use the new one. ====

 245 //

 246                         laqr1(H(_(k+1,k+3),_(k+1,k+3)),

 247                               sr(2*m-1), si(2*m-1), sr(2*m), si(2*m),

 248                               vt);

 249                         alpha = vt(1);

 250                         larfg(3, alpha, vt(_(2,3)), vt(1));

 251                         const T refSum = vt(1)*(H(k+1,k) + vt(2)*H(k+2,k));

 252                         if (abs(H(k+2,k)-refSum*vt(2)) + abs(refSum*vt(3))

 253                             > ulp*(abs(H(k,k))+abs(H(k+1,k+1))+abs(H(k+2,k+2))))

 254                         {

 255 //

 256 //                          ==== Starting a new bulge here would

 257 //                          .    create non-negligible fill.  Use

 258 //                          .    the old one with trepidation. ====

 259 //

 260                             H(k+1, k) = beta;

 261                             H(k+2, k) = Zero;

 262                             H(k+3, k) = Zero;

 263                         } else {

 264 //

 265 //                          ==== Stating a new bulge here would

 266 //                          .    create only negligible fill.

 267 //                          .    Replace the old reflector with

 268 //                          .    the new one. ====

 269 //

 270                             H(k+1, k) -= refSum;

 271                             H(k+2, k) = Zero;

 272                             H(k+3, k) = Zero;

 273                             V(1, m) = vt(1);

 274                             V(2, m) = vt(2);

 275                             V(3, m) = vt(3);

 276                         }

 277                     }

 278                 }

 279             }

 280 //

 281 //          ==== Generate a 2-by-2 reflection, if needed. ====

 282 //

 283             k = krCol + 3*(m22-1);

 284             if (bmp22) {

 285                 if (k==kTop-1) {

 286                     laqr1(H(_(k+1,k+2),_(k+1,k+2)),

 287                           sr(2*m22-1), si(2*m22-1), sr(2*m22), si(2*m22),

 288                           V(_(1,2),m22));

 289                     beta = V(1, m22);

 290                     larfg(IndexType(2), beta, V(_(2,2),m22), V(1,m22));

 291                 } else {

 292                     beta = H(k+1, k);

 293                     V(2, m22) = H(k+2,k);

 294                     larfg(IndexType(2), beta, V(_(2,2),m22), V(1,m22));

 295                     H(k+1, k) = beta;

 296                     H(k+2, k) = Zero;

 297                 }

 298             }

 299 //

 300 //          ==== Multiply H by reflections from the left ====

 301 //

 302             IndexType jBot;

 303             if (accum) {

 304                 jBot = min(ndCol, kBot);

 305             } else if (wantT) {

 306                 jBot = n;

 307             } else {

 308                 jBot = kBot;

 309             }

 310             for (IndexType j=max(kTop,krCol); j<=jBot; ++j) {

 311                 IndexType mEnd = min(mBot, (j-krCol+2)/3);

 312                 for (IndexType m=mTop; m<=mEnd; ++m) {

 313                     k = krCol + 3*(m-1);

 314                     const T refSum = V(1,m)*(H(k+1,j) + V(2,m)*H(k+2,j)

 315                                                       + V(3,m)*H(k+3,j));

 316                     H(k+1,j) -= refSum;

 317                     H(k+2,j) -= refSum*V(2,m);

 318                     H(k+3,j) -= refSum*V(3,m);

 319                 }

 320             }

 321             if (bmp22) {

 322                 k = krCol + 3*(m22-1);

 323                 for (IndexType j=max(k+1,kTop); j<=jBot; ++j) {

 324                     const T refSum = V(1,m22)*(H(k+1,j)+V(2,m22)*H(k+2,j));

 325                     H(k+1,j) -= refSum;

 326                     H(k+2,j) -= refSum*V(2,m22);

 327                 }

 328             }

 329 //

 330 //          ==== Multiply H by reflections from the right.

 331 //          .    Delay filling in the last row until the

 332 //          .    vigilant deflation check is complete. ====

 333 //

 334             IndexType jTop;

 335             if (accum) {

 336                 jTop = max(kTop, inCol);

 337             } else if (wantT) {

 338                 jTop = 1;

 339             } else {

 340                 jTop = kTop;

 341             }

 342             for (IndexType m=mTop; m<=mBot; ++m) {

 343                 if (V(1,m)!=Zero) {

 344                     k = krCol + 3*(m-1);

 345                     for (IndexType j=jTop; j<=min(kBot,k+3); ++j) {

 346                         const T refSum = V(1,m)*(H(j,k+1) + V(2,m)*H(j,k+2)

 347                                                           + V(3,m)*H(j,k+3));

 348                         H(j, k+1) -= refSum;

 349                         H(j, k+2) -= refSum*V(2, m);

 350                         H(j, k+3) -= refSum*V(3, m);

 351                     }

 352 

 353                     if (accum) {

 354 //

 355 //                   ==== Accumulate U. (If necessary, update Z later

 356 //                   .    with with an efficient matrix-matrix

 357 //                   .    multiply.) ====

 358 //

 359                         IndexType kms   = k - inCol;

 360                         IndexType j1    = max(IndexType(1),kTop-inCol);

 361 

 362                         for (IndexType j=j1; j<=kdu; ++j) {

 363                             const T refSum = V(1,m)*(U(j,kms+1)

 364                                                         + V(2,m)*U(j,kms+2)

 365                                                         + V(3,m)*U(j,kms+3));

 366                             U(j, kms+1) -= refSum;

 367                             U(j, kms+2) -= refSum*V(2, m);

 368                             U(j, kms+3) -= refSum*V(3, m);

 369                         }

 370                     } else if (wantZ) {

 371 //

 372 //                      ==== U is not accumulated, so update Z

 373 //                      .    now by multiplying by reflections

 374 //                      .    from the right. ====

 375 //

 376                         for (IndexType j=iLoZ; j<=iHiZ; ++j) {

 377                             const T refSum = V(1,m)*(Z(j,k+1)

 378                                                         +V(2,m)*Z(j,k+2)

 379                                                         +V(3,m)*Z(j,k+3));

 380                             Z(j,k+1) -= refSum;

 381                             Z(j,k+2) -= refSum*V(2,m);

 382                             Z(j,k+3) -= refSum*V(3,m);

 383                         }

 384                     }

 385                 }

 386             }

 387 //

 388 //          ==== Special case: 2-by-2 reflection (if needed) ====

 389 //

 390             k = krCol + 3*(m22-1);

 391             if (bmp22) {

 392                 if (V(1,m22)!=Zero) {

 393                     for (IndexType j=jTop; j<=min(kBot,k+3); ++j) {

 394                         const T refSum = V(1,m22)*(H(j,k+1)+V(2,m22)*H(j,k+2));

 395                         H(j,k+1) -= refSum;

 396                         H(j,k+2) -= refSum*V(2,m22);

 397                     }

 398 

 399                     if (accum) {

 400                         IndexType kms   = k - inCol;

 401                         IndexType j1    = max(IndexType(1),kTop-inCol);

 402 

 403                         for (IndexType j=j1; j<=kdu; ++j) {

 404                             const T refSum = V(1,m22)*(U(j,kms+1)

 405                                                         + V(2,m22)*U(j,kms+2));

 406                             U(j,kms+1 ) -= refSum;

 407                             U(j,kms+2 ) -= refSum*V(2,m22);

 408                         }

 409                     } else if (wantZ) {

 410                         for (IndexType j=iLoZ; j<=iHiZ; ++j) {

 411                             const T refSum = V(1,m22)*(Z(j,k+1)

 412                                                         + V(2,m22)*Z(j,k+2));

 413                             Z(j,k+1) -= refSum;

 414                             Z(j,k+2) -= refSum*V(2,m22);

 415                         }

 416                     }

 417                 }

 418             }

 419 //

 420 //          ==== Vigilant deflation check ====

 421 //

 422             IndexType mStart = mTop;

 423             if (krCol+3*(mStart-1)<kTop) {

 424                 ++mStart;

 425             }

 426             IndexType mEnd = mBot;

 427             if (bmp22) {

 428                 ++mEnd;

 429             }

 430             if (krCol==kBot-2) {

 431                 ++mEnd;

 432             }

 433             for (IndexType m=mStart; m<=mEnd; ++m) {

 434                 k = min(kBot-1, krCol+3*(m-1));

 435 //

 436 //              ==== The following convergence test requires that

 437 //              .    the tradition small-compared-to-nearby-diagonals

 438 //              .    criterion and the Ahues & Tisseur (LAWN 122, 1997)

 439 //              .    criteria both be satisfied.  The latter improves

 440 //              .    accuracy in some examples. Falling back on an

 441 //              .    alternate convergence criterion when TST1 or TST2

 442 //              .    is zero (as done here) is traditional but probably

 443 //              .    unnecessary. ====

 444 //

 445                 if (H(k+1,k)!=Zero) {

 446                     T test1 = abs(H(k,k)) + abs(H(k+1,k+1));

 447                     if (test1==Zero) {

 448                         if (k>=kTop+1) {

 449                             test1 += abs(H(k,k-1));

 450                         }

 451                         if (k>=kTop+2) {

 452                             test1 += abs(H(k,k-2));

 453                         }

 454                         if (k>=kTop+3) {

 455                             test1 += abs(H(k,k-3));

 456                         }

 457                         if (k<=kBot-2) {

 458                             test1 += abs(H(k+2,k+1));

 459                         }

 460                         if (k<=kBot-3) {

 461                             test1 += abs(H(k+3,k+1));

 462                         }

 463                         if (k<=kBot-4) {

 464                             test1 += abs(H(k+4,k+1));

 465                         }

 466                     }

 467                     if (abs(H(k+1,k))<=max(smallNum, ulp*test1)) {

 468                         const T H12 = max(abs(H(k+1,k)), abs(H(k,k+1)));

 469                         const T H21 = min(abs(H(k+1,k)), abs(H(k,k+1)));

 470                         const T H11 = max(abs(H(k+1,k+1)),

 471                                           abs(H(k,k)-H(k+1,k+1)));

 472                         const T H22 = min(abs(H(k+1,k+1)),

 473                                           abs(H(k,k)-H(k+1,k+1)));

 474                         const T scal = H11 + H12;

 475                         const T test2 = H22*(H11/scal);

 476 

 477                         if (test2==Zero

 478                          || H21*(H12/scal)<=max(smallNum,ulp*test2))

 479                         {

 480                             H(k+1,k) = Zero;

 481                         }

 482                     }

 483                 }

 484             }

 485 //

 486 //          ==== Fill in the last row of each bulge. ====

 487 //

 488             mEnd = min(nBmps, (kBot-krCol-1)/3);

 489             for (IndexType m=mTop; m<=mEnd; ++m) {

 490                 k = krCol + 3*(m-1);

 491                 const T refSum = V(1,m)*V(3,m)*H(k+4,k+3);

 492                 H(k+4,k+1) = -refSum;

 493                 H(k+4,k+2) = -refSum*V(2,m);

 494                 H(k+4,k+3) -= refSum*V(3,m);

 495             }

 496 //

 497 //          ==== End of near-the-diagonal bulge chase. ====

 498 //

 499         }

 500 //

 501 //      ==== Use U (if accumulated) to update far-from-diagonal

 502 //      .    entries in H.  If required, use U to update Z as

 503 //      .    well. ====

 504 //

 505         if (accum) {

 506             IndexType jTop, jBot;

 507             if (wantT) {

 508                 jTop = 1;

 509                 jBot = n;

 510             } else {

 511                 jTop = kTop;

 512                 jBot = kBot;

 513             }

 514             if ((!blk22) || (inCol<kTop) || (ndCol>kBot) || (ns<=2)) {

 515 //

 516 //              ==== Updates not exploiting the 2-by-2 block

 517 //              .    structure of U.  K1 and NU keep track of

 518 //              .    the location and size of U in the special

 519 //              .    cases of introducing bulges and chasing

 520 //              .    bulges off the bottom.  In these special

 521 //              .    cases and in case the number of shifts

 522 //              .    is NS = 2, there is no 2-by-2 block

 523 //              .    structure to exploit.  ====

 524 //

 525                 const IndexType k1 = max(IndexType(1), kTop-inCol);

 526                 const IndexType nu  = (kdu-max(IndexType(0), ndCol-kBot)) -k1+1;

 527                 const IndexType _nu = (kdu-max(IndexType(0), ndCol-kBot));

 528 //

 529 //              ==== Horizontal Multiply ====

 530 //

 531                 for (IndexType jCol=min(ndCol,kBot)+1; jCol<=jBot; jCol+=nh) {

 532                     const IndexType jLen = min(nh, jBot-jCol+1);

 533 

 534                     auto _U     = U(_(k1, _nu), _(k1, _nu));

 535                     auto _H     = H(_(inCol+k1,inCol+_nu),_(jCol,jCol+jLen-1));

 536                     auto _WH    = WH(_(1,nu),_(1,jLen));

 537 

 538                     blas::mm(ConjTrans, NoTrans, One, _U, _H, Zero, _WH);

 539                     _H = _WH;

 540                 }

 541 //

 542 //              ==== Vertical multiply ====

 543 //

 544                 for (IndexType jRow=jTop; jRow<=max(kTop,inCol)-1; jRow+=nv) {

 545                     const IndexType jLen = min(nv, max(kTop,inCol)-jRow);

 546 

 547                     auto _H     = H(_(jRow,jRow+jLen-1),_(inCol+k1,inCol+_nu));

 548                     auto _U     = U(_(k1,_nu),_(k1,_nu));

 549                     auto _WV    = WV(_(1,jLen),_(1,nu));

 550 

 551                     blas::mm(NoTrans, NoTrans, One, _H, _U, Zero, _WV);

 552                     _H = _WV;

 553                 }

 554 //

 555 //              ==== Z multiply (also vertical) ====

 556 //

 557                 if (wantZ) {

 558                     for (IndexType jRow=iLoZ; jRow<=iHiZ; jRow+=nv) {

 559                         const IndexType jLen = min(nv, iHiZ-jRow+1);

 560 

 561                         auto _Z  = Z(_(jRow,jRow+jLen-1),_(inCol+k1,inCol+_nu));

 562                         auto _U  = U(_(k1,_nu),_(k1,_nu));

 563                         auto _WV = WV(_(1,jLen),_(1,nu));

 564 

 565                         blas::mm(NoTrans, NoTrans, One, _Z, _U, Zero, _WV);

 566                         _Z = _WV;

 567                     }

 568                 }

 569             } else {

 570 //

 571 //              ==== Updates exploiting U's 2-by-2 block structure.

 572 //              .    (I2, I4, J2, J4 are the last rows and columns

 573 //              .    of the blocks.) ====

 574 //

 575                 const IndexType i2 = (kdu+1 ) / 2;

 576                 const IndexType i4 = kdu;

 577                 const IndexType j2 = i4 - i2;

 578                 const IndexType j4 = kdu;

 579 //

 580 //              ==== KZS and KNZ deal with the band of zeros

 581 //              .    along the diagonal of one of the triangular

 582 //              .    blocks. ====

 583 //

 584                 const IndexType kZs = (j4-j2) - (ns+1);

 585                 const IndexType kNz = ns + 1;

 586 //

 587 //              ==== Horizontal multiply ====

 588 //

 589                 for (IndexType jCol=min(ndCol, kBot)+1; jCol<=jBot; jCol+=nh) {

 590                     const IndexType jLen = min(nh, jBot-jCol+1);

 591 //

 592 //                  ==== Copy bottom of H to top+KZS of scratch ====

 593 //                  (The first KZS rows get multiplied by zero.) ====

 594 //

 595                     WH(_(kZs+1,kZs+kNz),_(1,jLen))

 596                         = H(_(inCol+j2+1,inCol+j2+kNz),_(jCol,jCol+jLen-1));

 597 //

 598 //                  ==== Multiply by U21**T ====

 599 //

 600                     WH(_(1,kZs),_(1,jLen)) = Zero;

 601                     blas::mm(Left, ConjTrans, One,

 602                              U(_(j2+1,j2+kNz),_(kZs+1,kZs+kNz)).upper(),

 603                              WH(_(kZs+1,kZs+kNz),_(1,jLen)));

 604 //

 605 //                  ==== Multiply top of H by U11**T ====

 606 //

 607                     blas::mm(ConjTrans, NoTrans, One,

 608                              U(_(1,j2),_(1,i2)),

 609                              H(_(inCol+1,inCol+j2),_(jCol,jCol+jLen-1)),

 610                              One,

 611                              WH(_(1,i2),_(1,jLen)));

 612 //

 613 //                  ==== Copy top of H to bottom of WH ====

 614 //

 615                     WH(_(i2+1,i2+j2),_(1,jLen))

 616                         = H(_(inCol+1,inCol+j2),_(jCol,jCol+jLen-1));

 617 //

 618 //                  ==== Multiply by U21**T ====

 619 //

 620                     blas::mm(Left, ConjTrans, One,

 621                              U(_(1,j2),_(i2+1,i2+j2)).lower(),

 622                              WH(_(i2+1,i2+j2),_(1,jLen)));

 623 //

 624 //                  ==== Multiply by U22 ====

 625 //

 626                     blas::mm(ConjTrans, NoTrans, One,

 627                              U(_(j2+1,j4),_(i2+1,i4)),

 628                              H(_(inCol+j2+1,inCol+j4),_(jCol,jCol+jLen-1)),

 629                              One,

 630                              WH(_(i2+1,i4),_(1,jLen)));

 631 //

 632 //                  ==== Copy it back ====

 633 //

 634                     H(_(inCol+1,inCol+kdu),_(jCol,jCol+jLen-1))

 635                         = WH(_(1,kdu),_(1,jLen));

 636                 }

 637 //

 638 //              ==== Vertical multiply ====

 639 //

 640                 for (IndexType jRow=jTop; jRow<=max(inCol,kTop)-1; jRow+=nv) {

 641                     const IndexType jLen = min(nv, max(inCol,kTop) -jRow);

 642 //

 643 //                  ==== Copy right of H to scratch (the first KZS

 644 //                  .    columns get multiplied by zero) ====

 645 //

 646                     WV(_(1,jLen),_(kZs+1,kZs+kNz))

 647                         = H(_(jRow,jRow+jLen-1),_(inCol+j2+1, inCol+j2+kNz));

 648 //

 649 //                  ==== Multiply by U21 ====

 650 //

 651                     WV(_(1,jLen),_(1,kZs)) = Zero;

 652                     blas::mm(Right, NoTrans, One,

 653                              U(_(j2+1,j2+kNz),_(kZs+1,kZs+kNz)).upper(),

 654                              WV(_(1,jLen),_(kZs+1,kZs+kNz)));

 655 //

 656 //                  ==== Multiply by U11 ====

 657 //

 658                     blas::mm(NoTrans, NoTrans, One,

 659                              H(_(jRow,jRow+jLen-1),_(inCol+1,inCol+j2)),

 660                              U(_(1,j2),_(1,i2)),

 661                              One,

 662                              WV(_(1,jLen),_(1,i2)));

 663 //

 664 //                  ==== Copy left of H to right of scratch ====

 665 //

 666                     WV(_(1,jLen),_(i2+1,i2+j2))

 667                         = H(_(jRow,jRow+jLen-1), _(inCol+1,inCol+j2));

 668 //

 669 //                  ==== Multiply by U21 ====

 670 //

 671                     blas::mm(Right, NoTrans, One,

 672                              U(_(1,i4-i2),_(i2+1,i4)).lower(),

 673                              WV(_(1,jLen),_(i2+1,i4)));

 674 //

 675 //                  ==== Multiply by U22 ====

 676 //

 677                     blas::mm(NoTrans, NoTrans, One,

 678                              H(_(jRow,jRow+jLen-1),_(inCol+j2+1,inCol+j4)),

 679                              U(_(j2+1,j4),_(i2+1,i4)),

 680                              One,

 681                              WV(_(1,jLen),_(i2+1,i4)));

 682 //

 683 //                  ==== Copy it back ====

 684 //

 685                     H(_(jRow,jRow+jLen-1),_(inCol+1,inCol+kdu))

 686                         = WV(_(1,jLen),_(1,kdu));

 687                 }

 688 //

 689 //              ==== Multiply Z (also vertical) ====

 690 //

 691                 if (wantZ) {

 692                     for (IndexType jRow=iLoZ; jRow<=iHiZ; jRow+=nv) {

 693                         const IndexType jLen = min(nv,iHiZ-jRow+1);

 694 //

 695 //                      ==== Copy right of Z to left of scratch (first

 696 //                      .     KZS columns get multiplied by zero) ====

 697 //

 698                         WV(_(1,jLen),_(kZs+1,kZs+kNz))

 699                             = Z(_(jRow,jRow+jLen-1),_(inCol+j2+1,inCol+j2+kNz));

 700 //

 701 //                      ==== Multiply by U12 ====

 702 //

 703                         WV(_(1,jLen),_(1,kZs)) = Zero;

 704                         blas::mm(Right, NoTrans, One,

 705                                  U(_(j2+1,j2+kNz), _(kZs+1,kZs+kNz)).upper(),

 706                                  WV(_(1,jLen),_(kZs+1,kZs+kNz)));

 707 //

 708 //                      ==== Multiply by U11 ====

 709 //

 710                         blas::mm(NoTrans, NoTrans, One,

 711                                  Z(_(jRow,jRow+jLen-1), _(inCol+1,inCol+j2)),

 712                                  U(_(1,j2),_(1,i2)),

 713                                  One,

 714                                  WV(_(1,jLen),_(1,i2)));

 715 //

 716 //                      ==== Copy left of Z to right of scratch ====

 717 //

 718                         WV(_(1,jLen),_(i2+1,i2+j2))

 719                             = Z(_(jRow,jRow+jLen-1), _(inCol+1,inCol+j2));

 720 //

 721 //                      ==== Multiply by U21 ====

 722 //

 723                         blas::mm(Right, NoTrans, One,

 724                                  U(_(1,i4-i2),_(i2+1,i4)).lower(),

 725                                  WV(_(1,jLen),_(i2+1,i4)));

 726 //

 727 //                      ==== Multiply by U22 ====

 728 //

 729                         blas::mm(NoTrans, NoTrans, One,

 730                                  Z(_(jRow,jRow+jLen-1),_(inCol+j2+1,inCol+j4)),

 731                                  U(_(j2+1,j4),_(i2+1,i4)),

 732                                  One,

 733                                  WV(_(1,jLen),_(i2+1,i4)));

 734 //

 735 //                      ==== Copy the result back to Z ====

 736 //

 737                         Z(_(jRow,jRow+jLen-1),_(inCol+1,inCol+kdu))

 738                             = WV(_(1,jLen),_(1,kdu));

 739                     }

 740                 }

 741             }

 742         }

 743     }

 744 }

 745 

 746 //== interface for native lapack ===============================================

 747 

 748 #ifdef CHECK_CXXLAPACK

 749 

 750 template <typename IndexType, typename VSR, typename VSI, typename MH,

 751           typename MZ, typename MV, typename MU, typename MWV, typename MWH>

 752 void

 753 laqr5_native(bool                      wantT,

 754              bool                      wantZ,

 755              IndexType                 kacc22,

 756              IndexType                 kTop,

 757              IndexType                 kBot,

 758              IndexType                 nShifts,

 759              DenseVector<VSR>          &sr,

 760              DenseVector<VSI>          &si,

 761              GeMatrix<MH>              &H,

 762              IndexType                 iLoZ,

 763              IndexType                 iHiZ,

 764              GeMatrix<MZ>              &Z,

 765              GeMatrix<MV>              &V,

 766              GeMatrix<MU>              &U,

 767              GeMatrix<MWV>             &WV,

 768              GeMatrix<MWH>             &WH)

 769 {

 770     typedef typename GeMatrix<MH>::ElementType  T;

 771 

 772     const LOGICAL    WANTT      = wantT;

 773     const LOGICAL    WANTZ      = wantZ;

 774     const INTEGER    KACC22     = kacc22;

 775     const INTEGER    N          = H.numRows();

 776     const INTEGER    KTOP       = kTop;

 777     const INTEGER    KBOT       = kBot;

 778     const INTEGER    NSHFTS     = nShifts;

 779     const INTEGER    LDH        = H.leadingDimension();

 780     const INTEGER    ILOZ       = iLoZ;

 781     const INTEGER    IHIZ       = iHiZ;

 782     const INTEGER    LDZ        = Z.leadingDimension();

 783     const INTEGER    LDV        = V.leadingDimension();

 784     const INTEGER    LDU        = U.leadingDimension();

 785     const INTEGER    NV         = WV.numRows();

 786     const INTEGER    LDWV       = WV.leadingDimension();

 787     const INTEGER    NH         = WH.numCols();

 788     const INTEGER    LDWH       = WH.leadingDimension();

 789 

 790     if (IsSame<T,DOUBLE>::value) {

 791         LAPACK_IMPL(dlaqr5)(&WANTT,

 792                             &WANTZ,

 793                             &KACC22,

 794                             &N,

 795                             &KTOP,

 796                             &KBOT,

 797                             &NSHFTS,

 798                             sr.data(),

 799                             si.data(),

 800                             H.data(),

 801                             &LDH,

 802                             &ILOZ,

 803                             &IHIZ,

 804                             Z.data(),

 805                             &LDZ,

 806                             V.data(),

 807                             &LDV,

 808                             U.data(),

 809                             &LDU,

 810                             &NV,

 811                             WV.data(),

 812                             &LDWV,

 813                             &NH,

 814                             WH.data(),

 815                             &LDWH);

 816     } else {

 817         ASSERT(0);

 818     }

 819 }

 820 

 821 #endif // CHECK_CXXLAPACK

 822 

 823 //== public interface ==========================================================

 824 template <typename IndexType, typename VSR, typename VSI, typename MH,

 825           typename MZ, typename MV, typename MU, typename MWV, typename MWH>

 826 void

 827 laqr5(bool                      wantT,

 828       bool                      wantZ,

 829       IndexType                 kacc22,

 830       IndexType                 kTop,

 831       IndexType                 kBot,

 832       IndexType                 nShifts,

 833       DenseVector<VSR>          &sr,

 834       DenseVector<VSI>          &si,

 835       GeMatrix<MH>              &H,

 836       IndexType                 iLoZ,

 837       IndexType                 iHiZ,

 838       GeMatrix<MZ>              &Z,

 839       GeMatrix<MV>              &V,

 840       GeMatrix<MU>              &U,

 841       GeMatrix<MWV>             &WV,

 842       GeMatrix<MWH>             &WH)

 843 {

 844     LAPACK_DEBUG_OUT("laqr5");

 845 

 846     using std::max;

 847 //

 848 //  Test the input parameters

 849 //

 850 #   ifndef NDEBUG

 851     ASSERT((kacc22==0)||(kacc22==1)||(kacc22==2));

 852     ASSERT(H.firstRow()==1);

 853     ASSERT(H.firstCol()==1);

 854     ASSERT(H.numRows()==H.numCols());

 855 

 856     const IndexType n = H.numRows();

 857 

 858     ASSERT(1<=kTop);

 859     ASSERT(kBot<=n);

 860 

 861     ASSERT(nShifts>0);

 862     ASSERT(nShifts % 2 == 0);

 863 

 864     ASSERT(sr.length()==nShifts);

 865     ASSERT(si.length()==nShifts);

 866 

 867     if (wantZ) {

 868         ASSERT(1<=iLoZ);

 869         ASSERT(iLoZ<=iHiZ);

 870         ASSERT(iHiZ<=n);

 871     }

 872 

 873     ASSERT(V.firstRow()==1);

 874     ASSERT(V.firstCol()==1);

 875     ASSERT(V.numRows()>=3);

 876     ASSERT(V.numCols()==nShifts/2);

 877 

 878     ASSERT(U.firstRow()==1);

 879     ASSERT(U.firstCol()==1);

 880     ASSERT(U.numRows()>=3*nShifts-3);

 881 

 882     ASSERT(WH.firstRow()==1);

 883     ASSERT(WH.firstCol()==1);

 884     ASSERT(WH.numRows()>=3*nShifts-3);

 885     ASSERT(WH.numCols()>=1);

 886 

 887     ASSERT(WV.firstRow()==1);

 888     ASSERT(WV.firstCol()==1);

 889     ASSERT(WV.numRows()>=1);

 890     ASSERT(WV.numCols()>=3*nShifts-3);

 891 #   endif

 892 

 893 //

 894 //  Make copies of output arguments

 895 //

 896 #   ifdef CHECK_CXXLAPACK

 897     typename DenseVector<VSR>::NoView   sr_org = sr;

 898     typename DenseVector<VSI>::NoView   si_org = si;

 899     typename GeMatrix<MH>::NoView       H_org  = H;

 900     typename GeMatrix<MZ>::NoView       Z_org  = Z;

 901     typename GeMatrix<MV>::NoView       V_org  = V;

 902     typename GeMatrix<MU>::NoView       U_org  = U;

 903     typename GeMatrix<MWV>::NoView      WV_org = WV;

 904     typename GeMatrix<MWH>::NoView      WH_org = WH;

 905 #   endif

 906 

 907 //

 908 //  Call implementation

 909 //

 910     laqr5_generic(wantT, wantZ, kacc22, kTop, kBot, nShifts,

 911                   sr, si, H, iLoZ, iHiZ, Z,

 912                   V, U, WV, WH);

 913 

 914 #   ifdef CHECK_CXXLAPACK

 915 //

 916 //  Make copies of results computed by the generic implementation

 917 //

 918     typename DenseVector<VSR>::NoView   sr_generic = sr;

 919     typename DenseVector<VSI>::NoView   si_generic = si;

 920     typename GeMatrix<MH>::NoView       H_generic  = H;

 921     typename GeMatrix<MZ>::NoView       Z_generic  = Z;

 922     typename GeMatrix<MV>::NoView       V_generic  = V;

 923     typename GeMatrix<MU>::NoView       U_generic  = U;

 924     typename GeMatrix<MWV>::NoView      WV_generic = WV;

 925     typename GeMatrix<MWH>::NoView      WH_generic = WH;

 926 //

 927 //  restore output arguments

 928 //

 929     sr = sr_org;

 930     si = si_org;

 931     H  = H_org;

 932     Z  = Z_org;

 933     V  = V_org;

 934     U  = U_org;

 935     WV = WV_org;

 936     WH = WH_org;

 937 

 938 //

 939 //  Compare generic results with results from the native implementation

 940 //

 941     laqr5_native(wantT, wantZ, kacc22, kTop, kBot, nShifts,

 942                  sr, si, H, iLoZ, iHiZ, Z,

 943                  V, U, WV, WH);

 944 

 945     bool failed = false;

 946     if (! isIdentical(sr_generic, sr, "sr_generic", "sr")) {

 947 //        std::cerr << "CXXLAPACK: sr_generic = " << sr_generic << std::endl;

 948 //        std::cerr << "F77LAPACK: sr = " << sr << std::endl;

 949         failed = true;

 950     }

 951 

 952     if (! isIdentical(si_generic, si, "si_generic", "si")) {

 953 //        std::cerr << "CXXLAPACK: si_generic = " << si_generic << std::endl;

 954 //        std::cerr << "F77LAPACK: si = " << si << std::endl;

 955         failed = true;

 956     }

 957 

 958     if (! isIdentical(H_generic, H, "H_generic", "H")) {

 959 //        std::cerr << "CXXLAPACK: H_generic = " << H_generic << std::endl;

 960 //        std::cerr << "F77LAPACK: H = " << H << std::endl;

 961         failed = true;

 962     }

 963 

 964     if (! isIdentical(Z_generic, Z, "Z_generic", "Z")) {

 965 //        std::cerr << "CXXLAPACK: Z_generic = " << Z_generic << std::endl;

 966 //        std::cerr << "F77LAPACK: Z = " << Z << std::endl;

 967         failed = true;

 968     }

 969 

 970     if (! isIdentical(V_generic, V, "V_generic", "V")) {

 971 //        std::cerr << "CXXLAPACK: V_generic = " << V_generic << std::endl;

 972 //        std::cerr << "F77LAPACK: V = " << V << std::endl;

 973         failed = true;

 974     }

 975 

 976     if (! isIdentical(U_generic, U, "U_generic", "U")) {

 977 //        std::cerr << "CXXLAPACK: U_generic = " << U_generic << std::endl;

 978 //        std::cerr << "F77LAPACK: U = " << U << std::endl;

 979         failed = true;

 980     }

 981 

 982     if (! isIdentical(WV_generic, WV, "WV_generic", "WV")) {

 983 //        std::cerr << "CXXLAPACK: WV_generic = " << WV_generic << std::endl;

 984 //        std::cerr << "F77LAPACK: WV = " << WV << std::endl;

 985         failed = true;

 986     }

 987 

 988     if (! isIdentical(WH_generic, WH, "WH_generic", "WH")) {

 989 //        std::cerr << "CXXLAPACK: WH_generic = " << WH_generic << std::endl;

 990 //        std::cerr << "F77LAPACK: WH = " << WH << std::endl;

 991         failed = true;

 992     }

 993 

 994     if (failed) {

 995         std::cerr << "error in: laqr5.tcc" << std::endl;

 996         std::cerr << "N = H.numRows() =   " << H.numRows() << std::endl;

 997         std::cerr << "H.numCols() =       " << H.numCols() << std::endl;

 998         std::cerr << "NV = WV.numRows() = " << WV.numRows() << std::endl;

 999         std::cerr << "WV.numCols() =      " << WV.numCols() << std::endl;

1000         std::cerr << "NH = WH.numRows() = " << WH.numRows() << std::endl;

1001         std::cerr << "WH.numCols() =      " << WH.numCols() << std::endl;

1002         ASSERT(0);

1003     } else {

1004         // std::cerr << "passed: laqr5.tcc" << std::endl;

1005     }

1006 #   endif

1007 }

1008 

1009 //-- forwarding ----------------------------------------------------------------

1010 template <typename IndexType, typename VSR, typename VSI, typename MH,

1011           typename MZ, typename MV, typename MU, typename MWV, typename MWH>

1012 void

1013 laqr5(bool                      wantT,

1014       bool                      wantZ,

1015       IndexType                 kacc22,

1016       IndexType                 kTop,

1017       IndexType                 kBot,

1018       IndexType                 nShifts,

1019       VSR                       &&sr,

1020       VSI                       &&si,

1021       MH                        &&H,

1022       IndexType                 iLoZ,

1023       IndexType                 iHiZ,

1024       MZ                        &&Z,

1025       MV                        &&V,

1026       MU                        &&U,

1027       MWV                       &&WV,

1028       MWH                       &&WH)

1029 {

1030     CHECKPOINT_ENTER;

1031     laqr5(wantT, wantZ, kacc22, kTop, kBot, nShifts, sr, si, H, iLoZ, iHiZ, Z,

1032           V, U, WV, WH);

1033     CHECKPOINT_LEAVE;

1034 }

1035 

1036 } } // namespace lapack, flens

1037 

1038 #endif // FLENS_LAPACK_EIG_LAQR5_TCC