1 /*
   2  *   Copyright (c) 2011, Michael Lehn
   3  *
   4  *   All rights reserved.
   5  *
   6  *   Redistribution and use in source and binary forms, with or without
   7  *   modification, are permitted provided that the following conditions
   8  *   are met:
   9  *
  10  *   1) Redistributions of source code must retain the above copyright
  11  *      notice, this list of conditions and the following disclaimer.
  12  *   2) Redistributions in binary form must reproduce the above copyright
  13  *      notice, this list of conditions and the following disclaimer in
  14  *      the documentation and/or other materials provided with the
  15  *      distribution.
  16  *   3) Neither the name of the FLENS development group nor the names of
  17  *      its contributors may be used to endorse or promote products derived
  18  *      from this software without specific prior written permission.
  19  *
  20  *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  21  *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  22  *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  23  *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
  24  *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  25  *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  26  *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  27  *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  28  *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  29  *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  30  *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  31  */
  32 
  33 /* Based on
  34  *
  35        SUBROUTINE DLAQR5( WANTT, WANTZ, KACC22, N, KTOP, KBOT, NSHFTS,
  36       $                   SR, SI, H, LDH, ILOZ, IHIZ, Z, LDZ, V, LDV, U,
  37       $                   LDU, NV, WV, LDWV, NH, WH, LDWH )
  38  *
  39  *  -- LAPACK auxiliary routine (version 3.3.0) --
  40  *  -- LAPACK is a software package provided by Univ. of Tennessee,    --
  41  *  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
  42  *     November 2010
  43  *
  44  */
  45 
  46 #ifndef FLENS_LAPACK_EIG_LAQR5_TCC
  47 #define FLENS_LAPACK_EIG_LAQR5_TCC 1
  48 
  49 #include <flens/blas/blas.h>
  50 #include <flens/lapack/lapack.h>
  51 
  52 namespace flens { namespace lapack {
  53 
  54 //== generic lapack implementation =============================================
  55 
  56 template <typename IndexType, typename VSR, typename VSI, typename MH,
  57           typename MZ, typename MV, typename MU, typename MWV, typename MWH>
  58 void
  59 laqr5_generic(bool                      wantT,
  60               bool                      wantZ,
  61               IndexType                 kacc22,
  62               IndexType                 kTop,
  63               IndexType                 kBot,
  64               IndexType                 nShifts,
  65               DenseVector<VSR>          &sr,
  66               DenseVector<VSI>          &si,
  67               GeMatrix<MH>              &H,
  68               IndexType                 iLoZ,
  69               IndexType                 iHiZ,
  70               GeMatrix<MZ>              &Z,
  71               GeMatrix<MV>              &V,
  72               GeMatrix<MU>              &U,
  73               GeMatrix<MWV>             &WV,
  74               GeMatrix<MWH>             &WH)
  75 {
  76     using std::abs;
  77     using std::max;
  78     using std::min;
  79 
  80     typedef typename GeMatrix<MH>::ElementType  T;
  81 
  82     const T Zero(0), One(1);
  83 
  84     const Underscore<IndexType> _;
  85 
  86     const IndexType n   = H.numRows();
  87     const IndexType nv  = WV.numRows();
  88     const IndexType nh  = WH.numCols();
  89 
  90     typedef typename GeMatrix<MH>::VectorView   VectorView;
  91     T           vtBuffer[3];
  92     VectorView  vt = typename VectorView::Engine(3, vtBuffer);
  93 
  94 //
  95 //  ==== If there are no shifts, then there is nothing to do. ====
  96 //
  97     if (nShifts<2) {
  98         return;
  99     }
 100 //
 101 //  ==== If the active block is empty or 1-by-1, then there
 102 //  .    is nothing to do. ====
 103 //
 104     if (kTop>=kBot) {
 105         return;
 106     }
 107 //
 108 //  ==== Shuffle shifts into pairs of real shifts and pairs
 109 //  .    of complex conjugate shifts assuming complex
 110 //  .    conjugate shifts are already adjacent to one
 111 //  .    another. ====
 112 //
 113     for (IndexType i=1; i<=nShifts-2; i+=2) {
 114         if (si(i)!=-si(i+1)) {
 115             T tmp = sr(i);
 116             sr(i) = sr(i+1);
 117             sr(i+1) = sr(i+2);
 118             sr(i+2) = tmp;
 119 
 120             tmp = si(i);
 121             si(i) = si(i+1);
 122             si(i+1) = si(i+2);
 123             si(i+2) = tmp;
 124         }
 125     }
 126 //
 127 //  ==== NSHFTS is supposed to be even, but if it is odd,
 128 //  .    then simply reduce it by one.  The shuffle above
 129 //  .    ensures that the dropped shift is real and that
 130 //  .    the remaining shifts are paired. ====
 131 //
 132     const IndexType ns = nShifts - (nShifts % 2);
 133 //
 134 //  ==== Machine constants for deflation ====
 135 //
 136     T safeMin   = lamch<T>(SafeMin);
 137     T safeMax   = One/safeMin;
 138     labad(safeMin, safeMax);
 139 
 140     const T ulp         = lamch<T>(Precision);
 141     const T smallNum    = safeMin*(T(n)/ulp);
 142 //
 143 //  ==== Use accumulated reflections to update far-from-diagonal
 144 //  .    entries ? ====
 145 //
 146     const bool accum = (kacc22==1) || (kacc22==2);
 147 //
 148 //  ==== If so, exploit the 2-by-2 block structure? ====
 149 //
 150     const bool blk22 = (ns>2) && (kacc22==2);
 151 //
 152 //  ==== clear trash ====
 153 //
 154     if (kTop+2<=kBot) {
 155         H(kTop+2,kTop) = Zero;
 156     }
 157 //
 158 //  ==== NBMPS = number of 2-shift bulges in the chain ====
 159 //
 160     const IndexType nBmps = ns/2;
 161 //
 162 //  ==== KDU = width of slab ====
 163 //
 164     const IndexType kdu = 6*nBmps - 3;
 165 //
 166 //  ==== Create and chase chains of NBMPS bulges ====
 167 //
 168     for (IndexType inCol=3*(1-nBmps)+kTop-1; inCol<=kBot-2; inCol+=3*nBmps-2) {
 169         IndexType ndCol = inCol + kdu;
 170         if (accum) {
 171             auto U_ = U(_(1,kdu),_(1,kdu));
 172             U_           = Zero;
 173             U_.diag(0)   = One;
 174         }
 175 //
 176 //      ==== Near-the-diagonal bulge chase.  The following loop
 177 //      .    performs the near-the-diagonal part of a small bulge
 178 //      .    multi-shift QR sweep.  Each 6*NBMPS-2 column diagonal
 179 //      .    chunk extends from column INCOL to column NDCOL
 180 //      .    (including both column INCOL and column NDCOL). The
 181 //      .    following loop chases a 3*NBMPS column long chain of
 182 //      .    NBMPS bulges 3*NBMPS-2 columns to the right.  (INCOL
 183 //      .    may be less than KTOP and and NDCOL may be greater than
 184 //      .    KBOT indicating phantom columns from which to chase
 185 //      .    bulges before they are actually introduced or to which
 186 //      .    to chase bulges beyond column KBOT.)  ====
 187 //
 188         const IndexType krColMax = min(inCol+3*nBmps-3, kBot-2);
 189 
 190         for (IndexType krCol=inCol; krCol<=krColMax; ++krCol) {
 191 //
 192 //          ==== Bulges number MTOP to MBOT are active double implicit
 193 //          .    shift bulges.  There may or may not also be small
 194 //          .    2-by-2 bulge, if there is room.  The inactive bulges
 195 //          .    (if any) must wait until the active bulges have moved
 196 //          .    down the diagonal to make room.  The phantom matrix
 197 //          .    paradigm described above helps keep track.  ====
 198 //
 199             const IndexType mTop = max(IndexType(1), ((kTop-1)-krCol+2)/3+1);
 200             const IndexType mBot = min(nBmps, (kBot-krCol)/3);
 201             const IndexType m22  = mBot + 1;
 202             const bool bmp22     = (mBot<nBmps ) && (krCol+3*(m22-1))==(kBot-2);
 203 //
 204 //          ==== Generate reflections to chase the chain right
 205 //          .    one column.  (The minimum value of K is KTOP-1.) ====
 206 //
 207             IndexType   k;
 208             T           alpha, beta;
 209 
 210             for (IndexType m=mTop; m<=mBot; ++m) {
 211                 k = krCol + 3*(m-1);
 212                 if (k==kTop-1) {
 213                     laqr1(H(_(kTop,kTop+2),_(kTop,kTop+2)),
 214                           sr(2*m-1), si(2*m-1), sr(2*m), si(2*m),
 215                           V(_(1,3),m));
 216                     alpha = V(1,m);
 217                     larfg(IndexType(3), alpha, V(_(2,3),m), V(1,m));
 218                 } else {
 219                     beta = H(k+1,k);
 220                     V(2,m) = H(k+2,k);
 221                     V(3,m) = H(k+3,k);
 222                     larfg(IndexType(3), beta, V(_(2,3),m), V(1,m));
 223 //
 224 //                  ==== A Bulge may collapse because of vigilant
 225 //                  .    deflation or destructive underflow.  In the
 226 //                  .    underflow case, try the two-small-subdiagonals
 227 //                  .    trick to try to reinflate the bulge.  ====
 228 //
 229                     if (H(k+3,k)!=Zero
 230                      || H(k+3,k+1)!=Zero
 231                      || H(k+3,k+2)==Zero) {
 232 //
 233 //                   ==== Typical case: not collapsed (yet). ====
 234 //
 235                         H(k+1, k) = beta;
 236                         H(k+2, k) = Zero;
 237                         H(k+3, k) = Zero;
 238                     } else {
 239 //
 240 //                      ==== Atypical case: collapsed.  Attempt to
 241 //                      .    reintroduce ignoring H(K+1,K) and H(K+2,K).
 242 //                      .    If the fill resulting from the new
 243 //                      .    reflector is too large, then abandon it.
 244 //                      .    Otherwise, use the new one. ====
 245 //
 246                         laqr1(H(_(k+1,k+3),_(k+1,k+3)),
 247                               sr(2*m-1), si(2*m-1), sr(2*m), si(2*m),
 248                               vt);
 249                         alpha = vt(1);
 250                         larfg(3, alpha, vt(_(2,3)), vt(1));
 251                         const T refSum = vt(1)*(H(k+1,k) + vt(2)*H(k+2,k));
 252                         if (abs(H(k+2,k)-refSum*vt(2)) + abs(refSum*vt(3))
 253                             > ulp*(abs(H(k,k))+abs(H(k+1,k+1))+abs(H(k+2,k+2))))
 254                         {
 255 //
 256 //                          ==== Starting a new bulge here would
 257 //                          .    create non-negligible fill.  Use
 258 //                          .    the old one with trepidation. ====
 259 //
 260                             H(k+1, k) = beta;
 261                             H(k+2, k) = Zero;
 262                             H(k+3, k) = Zero;
 263                         } else {
 264 //
 265 //                          ==== Stating a new bulge here would
 266 //                          .    create only negligible fill.
 267 //                          .    Replace the old reflector with
 268 //                          .    the new one. ====
 269 //
 270                             H(k+1, k) -= refSum;
 271                             H(k+2, k) = Zero;
 272                             H(k+3, k) = Zero;
 273                             V(1, m) = vt(1);
 274                             V(2, m) = vt(2);
 275                             V(3, m) = vt(3);
 276                         }
 277                     }
 278                 }
 279             }
 280 //
 281 //          ==== Generate a 2-by-2 reflection, if needed. ====
 282 //
 283             k = krCol + 3*(m22-1);
 284             if (bmp22) {
 285                 if (k==kTop-1) {
 286                     laqr1(H(_(k+1,k+2),_(k+1,k+2)),
 287                           sr(2*m22-1), si(2*m22-1), sr(2*m22), si(2*m22),
 288                           V(_(1,2),m22));
 289                     beta = V(1, m22);
 290                     larfg(IndexType(2), beta, V(_(2,2),m22), V(1,m22));
 291                 } else {
 292                     beta = H(k+1, k);
 293                     V(2, m22) = H(k+2,k);
 294                     larfg(IndexType(2), beta, V(_(2,2),m22), V(1,m22));
 295                     H(k+1, k) = beta;
 296                     H(k+2, k) = Zero;
 297                 }
 298             }
 299 //
 300 //          ==== Multiply H by reflections from the left ====
 301 //
 302             IndexType jBot;
 303             if (accum) {
 304                 jBot = min(ndCol, kBot);
 305             } else if (wantT) {
 306                 jBot = n;
 307             } else {
 308                 jBot = kBot;
 309             }
 310             for (IndexType j=max(kTop,krCol); j<=jBot; ++j) {
 311                 IndexType mEnd = min(mBot, (j-krCol+2)/3);
 312                 for (IndexType m=mTop; m<=mEnd; ++m) {
 313                     k = krCol + 3*(m-1);
 314                     const T refSum = V(1,m)*(H(k+1,j) + V(2,m)*H(k+2,j)
 315                                                       + V(3,m)*H(k+3,j));
 316                     H(k+1,j) -= refSum;
 317                     H(k+2,j) -= refSum*V(2,m);
 318                     H(k+3,j) -= refSum*V(3,m);
 319                 }
 320             }
 321             if (bmp22) {
 322                 k = krCol + 3*(m22-1);
 323                 for (IndexType j=max(k+1,kTop); j<=jBot; ++j) {
 324                     const T refSum = V(1,m22)*(H(k+1,j)+V(2,m22)*H(k+2,j));
 325                     H(k+1,j) -= refSum;
 326                     H(k+2,j) -= refSum*V(2,m22);
 327                 }
 328             }
 329 //
 330 //          ==== Multiply H by reflections from the right.
 331 //          .    Delay filling in the last row until the
 332 //          .    vigilant deflation check is complete. ====
 333 //
 334             IndexType jTop;
 335             if (accum) {
 336                 jTop = max(kTop, inCol);
 337             } else if (wantT) {
 338                 jTop = 1;
 339             } else {
 340                 jTop = kTop;
 341             }
 342             for (IndexType m=mTop; m<=mBot; ++m) {
 343                 if (V(1,m)!=Zero) {
 344                     k = krCol + 3*(m-1);
 345                     for (IndexType j=jTop; j<=min(kBot,k+3); ++j) {
 346                         const T refSum = V(1,m)*(H(j,k+1) + V(2,m)*H(j,k+2)
 347                                                           + V(3,m)*H(j,k+3));
 348                         H(j, k+1) -= refSum;
 349                         H(j, k+2) -= refSum*V(2, m);
 350                         H(j, k+3) -= refSum*V(3, m);
 351                     }
 352 
 353                     if (accum) {
 354 //
 355 //                   ==== Accumulate U. (If necessary, update Z later
 356 //                   .    with with an efficient matrix-matrix
 357 //                   .    multiply.) ====
 358 //
 359                         IndexType kms   = k - inCol;
 360                         IndexType j1    = max(IndexType(1),kTop-inCol);
 361 
 362                         for (IndexType j=j1; j<=kdu; ++j) {
 363                             const T refSum = V(1,m)*(U(j,kms+1)
 364                                                         + V(2,m)*U(j,kms+2)
 365                                                         + V(3,m)*U(j,kms+3));
 366                             U(j, kms+1) -= refSum;
 367                             U(j, kms+2) -= refSum*V(2, m);
 368                             U(j, kms+3) -= refSum*V(3, m);
 369                         }
 370                     } else if (wantZ) {
 371 //
 372 //                      ==== U is not accumulated, so update Z
 373 //                      .    now by multiplying by reflections
 374 //                      .    from the right. ====
 375 //
 376                         for (IndexType j=iLoZ; j<=iHiZ; ++j) {
 377                             const T refSum = V(1,m)*(Z(j,k+1)
 378                                                         +V(2,m)*Z(j,k+2)
 379                                                         +V(3,m)*Z(j,k+3));
 380                             Z(j,k+1) -= refSum;
 381                             Z(j,k+2) -= refSum*V(2,m);
 382                             Z(j,k+3) -= refSum*V(3,m);
 383                         }
 384                     }
 385                 }
 386             }
 387 //
 388 //          ==== Special case: 2-by-2 reflection (if needed) ====
 389 //
 390             k = krCol + 3*(m22-1);
 391             if (bmp22) {
 392                 if (V(1,m22)!=Zero) {
 393                     for (IndexType j=jTop; j<=min(kBot,k+3); ++j) {
 394                         const T refSum = V(1,m22)*(H(j,k+1)+V(2,m22)*H(j,k+2));
 395                         H(j,k+1) -= refSum;
 396                         H(j,k+2) -= refSum*V(2,m22);
 397                     }
 398 
 399                     if (accum) {
 400                         IndexType kms   = k - inCol;
 401                         IndexType j1    = max(IndexType(1),kTop-inCol);
 402 
 403                         for (IndexType j=j1; j<=kdu; ++j) {
 404                             const T refSum = V(1,m22)*(U(j,kms+1)
 405                                                         + V(2,m22)*U(j,kms+2));
 406                             U(j,kms+1 ) -= refSum;
 407                             U(j,kms+2 ) -= refSum*V(2,m22);
 408                         }
 409                     } else if (wantZ) {
 410                         for (IndexType j=iLoZ; j<=iHiZ; ++j) {
 411                             const T refSum = V(1,m22)*(Z(j,k+1)
 412                                                         + V(2,m22)*Z(j,k+2));
 413                             Z(j,k+1) -= refSum;
 414                             Z(j,k+2) -= refSum*V(2,m22);
 415                         }
 416                     }
 417                 }
 418             }
 419 //
 420 //          ==== Vigilant deflation check ====
 421 //
 422             IndexType mStart = mTop;
 423             if (krCol+3*(mStart-1)<kTop) {
 424                 ++mStart;
 425             }
 426             IndexType mEnd = mBot;
 427             if (bmp22) {
 428                 ++mEnd;
 429             }
 430             if (krCol==kBot-2) {
 431                 ++mEnd;
 432             }
 433             for (IndexType m=mStart; m<=mEnd; ++m) {
 434                 k = min(kBot-1, krCol+3*(m-1));
 435 //
 436 //              ==== The following convergence test requires that
 437 //              .    the tradition small-compared-to-nearby-diagonals
 438 //              .    criterion and the Ahues & Tisseur (LAWN 122, 1997)
 439 //              .    criteria both be satisfied.  The latter improves
 440 //              .    accuracy in some examples. Falling back on an
 441 //              .    alternate convergence criterion when TST1 or TST2
 442 //              .    is zero (as done here) is traditional but probably
 443 //              .    unnecessary. ====
 444 //
 445                 if (H(k+1,k)!=Zero) {
 446                     T test1 = abs(H(k,k)) + abs(H(k+1,k+1));
 447                     if (test1==Zero) {
 448                         if (k>=kTop+1) {
 449                             test1 += abs(H(k,k-1));
 450                         }
 451                         if (k>=kTop+2) {
 452                             test1 += abs(H(k,k-2));
 453                         }
 454                         if (k>=kTop+3) {
 455                             test1 += abs(H(k,k-3));
 456                         }
 457                         if (k<=kBot-2) {
 458                             test1 += abs(H(k+2,k+1));
 459                         }
 460                         if (k<=kBot-3) {
 461                             test1 += abs(H(k+3,k+1));
 462                         }
 463                         if (k<=kBot-4) {
 464                             test1 += abs(H(k+4,k+1));
 465                         }
 466                     }
 467                     if (abs(H(k+1,k))<=max(smallNum, ulp*test1)) {
 468                         const T H12 = max(abs(H(k+1,k)), abs(H(k,k+1)));
 469                         const T H21 = min(abs(H(k+1,k)), abs(H(k,k+1)));
 470                         const T H11 = max(abs(H(k+1,k+1)),
 471                                           abs(H(k,k)-H(k+1,k+1)));
 472                         const T H22 = min(abs(H(k+1,k+1)),
 473                                           abs(H(k,k)-H(k+1,k+1)));
 474                         const T scal = H11 + H12;
 475                         const T test2 = H22*(H11/scal);
 476 
 477                         if (test2==Zero
 478                          || H21*(H12/scal)<=max(smallNum,ulp*test2))
 479                         {
 480                             H(k+1,k) = Zero;
 481                         }
 482                     }
 483                 }
 484             }
 485 //
 486 //          ==== Fill in the last row of each bulge. ====
 487 //
 488             mEnd = min(nBmps, (kBot-krCol-1)/3);
 489             for (IndexType m=mTop; m<=mEnd; ++m) {
 490                 k = krCol + 3*(m-1);
 491                 const T refSum = V(1,m)*V(3,m)*H(k+4,k+3);
 492                 H(k+4,k+1) = -refSum;
 493                 H(k+4,k+2) = -refSum*V(2,m);
 494                 H(k+4,k+3) -= refSum*V(3,m);
 495             }
 496 //
 497 //          ==== End of near-the-diagonal bulge chase. ====
 498 //
 499         }
 500 //
 501 //      ==== Use U (if accumulated) to update far-from-diagonal
 502 //      .    entries in H.  If required, use U to update Z as
 503 //      .    well. ====
 504 //
 505         if (accum) {
 506             IndexType jTop, jBot;
 507             if (wantT) {
 508                 jTop = 1;
 509                 jBot = n;
 510             } else {
 511                 jTop = kTop;
 512                 jBot = kBot;
 513             }
 514             if ((!blk22) || (inCol<kTop) || (ndCol>kBot) || (ns<=2)) {
 515 //
 516 //              ==== Updates not exploiting the 2-by-2 block
 517 //              .    structure of U.  K1 and NU keep track of
 518 //              .    the location and size of U in the special
 519 //              .    cases of introducing bulges and chasing
 520 //              .    bulges off the bottom.  In these special
 521 //              .    cases and in case the number of shifts
 522 //              .    is NS = 2, there is no 2-by-2 block
 523 //              .    structure to exploit.  ====
 524 //
 525                 const IndexType k1 = max(IndexType(1), kTop-inCol);
 526                 const IndexType nu  = (kdu-max(IndexType(0), ndCol-kBot)) -k1+1;
 527                 const IndexType _nu = (kdu-max(IndexType(0), ndCol-kBot));
 528 //
 529 //              ==== Horizontal Multiply ====
 530 //
 531                 for (IndexType jCol=min(ndCol,kBot)+1; jCol<=jBot; jCol+=nh) {
 532                     const IndexType jLen = min(nh, jBot-jCol+1);
 533 
 534                     auto _U     = U(_(k1, _nu), _(k1, _nu));
 535                     auto _H     = H(_(inCol+k1,inCol+_nu),_(jCol,jCol+jLen-1));
 536                     auto _WH    = WH(_(1,nu),_(1,jLen));
 537 
 538                     blas::mm(ConjTrans, NoTrans, One, _U, _H, Zero, _WH);
 539                     _H = _WH;
 540                 }
 541 //
 542 //              ==== Vertical multiply ====
 543 //
 544                 for (IndexType jRow=jTop; jRow<=max(kTop,inCol)-1; jRow+=nv) {
 545                     const IndexType jLen = min(nv, max(kTop,inCol)-jRow);
 546 
 547                     auto _H     = H(_(jRow,jRow+jLen-1),_(inCol+k1,inCol+_nu));
 548                     auto _U     = U(_(k1,_nu),_(k1,_nu));
 549                     auto _WV    = WV(_(1,jLen),_(1,nu));
 550 
 551                     blas::mm(NoTrans, NoTrans, One, _H, _U, Zero, _WV);
 552                     _H = _WV;
 553                 }
 554 //
 555 //              ==== Z multiply (also vertical) ====
 556 //
 557                 if (wantZ) {
 558                     for (IndexType jRow=iLoZ; jRow<=iHiZ; jRow+=nv) {
 559                         const IndexType jLen = min(nv, iHiZ-jRow+1);
 560 
 561                         auto _Z  = Z(_(jRow,jRow+jLen-1),_(inCol+k1,inCol+_nu));
 562                         auto _U  = U(_(k1,_nu),_(k1,_nu));
 563                         auto _WV = WV(_(1,jLen),_(1,nu));
 564 
 565                         blas::mm(NoTrans, NoTrans, One, _Z, _U, Zero, _WV);
 566                         _Z = _WV;
 567                     }
 568                 }
 569             } else {
 570 //
 571 //              ==== Updates exploiting U's 2-by-2 block structure.
 572 //              .    (I2, I4, J2, J4 are the last rows and columns
 573 //              .    of the blocks.) ====
 574 //
 575                 const IndexType i2 = (kdu+1 ) / 2;
 576                 const IndexType i4 = kdu;
 577                 const IndexType j2 = i4 - i2;
 578                 const IndexType j4 = kdu;
 579 //
 580 //              ==== KZS and KNZ deal with the band of zeros
 581 //              .    along the diagonal of one of the triangular
 582 //              .    blocks. ====
 583 //
 584                 const IndexType kZs = (j4-j2) - (ns+1);
 585                 const IndexType kNz = ns + 1;
 586 //
 587 //              ==== Horizontal multiply ====
 588 //
 589                 for (IndexType jCol=min(ndCol, kBot)+1; jCol<=jBot; jCol+=nh) {
 590                     const IndexType jLen = min(nh, jBot-jCol+1);
 591 //
 592 //                  ==== Copy bottom of H to top+KZS of scratch ====
 593 //                  (The first KZS rows get multiplied by zero.) ====
 594 //
 595                     WH(_(kZs+1,kZs+kNz),_(1,jLen))
 596                         = H(_(inCol+j2+1,inCol+j2+kNz),_(jCol,jCol+jLen-1));
 597 //
 598 //                  ==== Multiply by U21**T ====
 599 //
 600                     WH(_(1,kZs),_(1,jLen)) = Zero;
 601                     blas::mm(Left, ConjTrans, One,
 602                              U(_(j2+1,j2+kNz),_(kZs+1,kZs+kNz)).upper(),
 603                              WH(_(kZs+1,kZs+kNz),_(1,jLen)));
 604 //
 605 //                  ==== Multiply top of H by U11**T ====
 606 //
 607                     blas::mm(ConjTrans, NoTrans, One,
 608                              U(_(1,j2),_(1,i2)),
 609                              H(_(inCol+1,inCol+j2),_(jCol,jCol+jLen-1)),
 610                              One,
 611                              WH(_(1,i2),_(1,jLen)));
 612 //
 613 //                  ==== Copy top of H to bottom of WH ====
 614 //
 615                     WH(_(i2+1,i2+j2),_(1,jLen))
 616                         = H(_(inCol+1,inCol+j2),_(jCol,jCol+jLen-1));
 617 //
 618 //                  ==== Multiply by U21**T ====
 619 //
 620                     blas::mm(Left, ConjTrans, One,
 621                              U(_(1,j2),_(i2+1,i2+j2)).lower(),
 622                              WH(_(i2+1,i2+j2),_(1,jLen)));
 623 //
 624 //                  ==== Multiply by U22 ====
 625 //
 626                     blas::mm(ConjTrans, NoTrans, One,
 627                              U(_(j2+1,j4),_(i2+1,i4)),
 628                              H(_(inCol+j2+1,inCol+j4),_(jCol,jCol+jLen-1)),
 629                              One,
 630                              WH(_(i2+1,i4),_(1,jLen)));
 631 //
 632 //                  ==== Copy it back ====
 633 //
 634                     H(_(inCol+1,inCol+kdu),_(jCol,jCol+jLen-1))
 635                         = WH(_(1,kdu),_(1,jLen));
 636                 }
 637 //
 638 //              ==== Vertical multiply ====
 639 //
 640                 for (IndexType jRow=jTop; jRow<=max(inCol,kTop)-1; jRow+=nv) {
 641                     const IndexType jLen = min(nv, max(inCol,kTop) -jRow);
 642 //
 643 //                  ==== Copy right of H to scratch (the first KZS
 644 //                  .    columns get multiplied by zero) ====
 645 //
 646                     WV(_(1,jLen),_(kZs+1,kZs+kNz))
 647                         = H(_(jRow,jRow+jLen-1),_(inCol+j2+1, inCol+j2+kNz));
 648 //
 649 //                  ==== Multiply by U21 ====
 650 //
 651                     WV(_(1,jLen),_(1,kZs)) = Zero;
 652                     blas::mm(Right, NoTrans, One,
 653                              U(_(j2+1,j2+kNz),_(kZs+1,kZs+kNz)).upper(),
 654                              WV(_(1,jLen),_(kZs+1,kZs+kNz)));
 655 //
 656 //                  ==== Multiply by U11 ====
 657 //
 658                     blas::mm(NoTrans, NoTrans, One,
 659                              H(_(jRow,jRow+jLen-1),_(inCol+1,inCol+j2)),
 660                              U(_(1,j2),_(1,i2)),
 661                              One,
 662                              WV(_(1,jLen),_(1,i2)));
 663 //
 664 //                  ==== Copy left of H to right of scratch ====
 665 //
 666                     WV(_(1,jLen),_(i2+1,i2+j2))
 667                         = H(_(jRow,jRow+jLen-1), _(inCol+1,inCol+j2));
 668 //
 669 //                  ==== Multiply by U21 ====
 670 //
 671                     blas::mm(Right, NoTrans, One,
 672                              U(_(1,i4-i2),_(i2+1,i4)).lower(),
 673                              WV(_(1,jLen),_(i2+1,i4)));
 674 //
 675 //                  ==== Multiply by U22 ====
 676 //
 677                     blas::mm(NoTrans, NoTrans, One,
 678                              H(_(jRow,jRow+jLen-1),_(inCol+j2+1,inCol+j4)),
 679                              U(_(j2+1,j4),_(i2+1,i4)),
 680                              One,
 681                              WV(_(1,jLen),_(i2+1,i4)));
 682 //
 683 //                  ==== Copy it back ====
 684 //
 685                     H(_(jRow,jRow+jLen-1),_(inCol+1,inCol+kdu))
 686                         = WV(_(1,jLen),_(1,kdu));
 687                 }
 688 //
 689 //              ==== Multiply Z (also vertical) ====
 690 //
 691                 if (wantZ) {
 692                     for (IndexType jRow=iLoZ; jRow<=iHiZ; jRow+=nv) {
 693                         const IndexType jLen = min(nv,iHiZ-jRow+1);
 694 //
 695 //                      ==== Copy right of Z to left of scratch (first
 696 //                      .     KZS columns get multiplied by zero) ====
 697 //
 698                         WV(_(1,jLen),_(kZs+1,kZs+kNz))
 699                             = Z(_(jRow,jRow+jLen-1),_(inCol+j2+1,inCol+j2+kNz));
 700 //
 701 //                      ==== Multiply by U12 ====
 702 //
 703                         WV(_(1,jLen),_(1,kZs)) = Zero;
 704                         blas::mm(Right, NoTrans, One,
 705                                  U(_(j2+1,j2+kNz), _(kZs+1,kZs+kNz)).upper(),
 706                                  WV(_(1,jLen),_(kZs+1,kZs+kNz)));
 707 //
 708 //                      ==== Multiply by U11 ====
 709 //
 710                         blas::mm(NoTrans, NoTrans, One,
 711                                  Z(_(jRow,jRow+jLen-1), _(inCol+1,inCol+j2)),
 712                                  U(_(1,j2),_(1,i2)),
 713                                  One,
 714                                  WV(_(1,jLen),_(1,i2)));
 715 //
 716 //                      ==== Copy left of Z to right of scratch ====
 717 //
 718                         WV(_(1,jLen),_(i2+1,i2+j2))
 719                             = Z(_(jRow,jRow+jLen-1), _(inCol+1,inCol+j2));
 720 //
 721 //                      ==== Multiply by U21 ====
 722 //
 723                         blas::mm(Right, NoTrans, One,
 724                                  U(_(1,i4-i2),_(i2+1,i4)).lower(),
 725                                  WV(_(1,jLen),_(i2+1,i4)));
 726 //
 727 //                      ==== Multiply by U22 ====
 728 //
 729                         blas::mm(NoTrans, NoTrans, One,
 730                                  Z(_(jRow,jRow+jLen-1),_(inCol+j2+1,inCol+j4)),
 731                                  U(_(j2+1,j4),_(i2+1,i4)),
 732                                  One,
 733                                  WV(_(1,jLen),_(i2+1,i4)));
 734 //
 735 //                      ==== Copy the result back to Z ====
 736 //
 737                         Z(_(jRow,jRow+jLen-1),_(inCol+1,inCol+kdu))
 738                             = WV(_(1,jLen),_(1,kdu));
 739                     }
 740                 }
 741             }
 742         }
 743     }
 744 }
 745 
 746 //== interface for native lapack ===============================================
 747 
 748 #ifdef CHECK_CXXLAPACK
 749 
 750 template <typename IndexType, typename VSR, typename VSI, typename MH,
 751           typename MZ, typename MV, typename MU, typename MWV, typename MWH>
 752 void
 753 laqr5_native(bool                      wantT,
 754              bool                      wantZ,
 755              IndexType                 kacc22,
 756              IndexType                 kTop,
 757              IndexType                 kBot,
 758              IndexType                 nShifts,
 759              DenseVector<VSR>          &sr,
 760              DenseVector<VSI>          &si,
 761              GeMatrix<MH>              &H,
 762              IndexType                 iLoZ,
 763              IndexType                 iHiZ,
 764              GeMatrix<MZ>              &Z,
 765              GeMatrix<MV>              &V,
 766              GeMatrix<MU>              &U,
 767              GeMatrix<MWV>             &WV,
 768              GeMatrix<MWH>             &WH)
 769 {
 770     typedef typename GeMatrix<MH>::ElementType  T;
 771 
 772     const LOGICAL    WANTT      = wantT;
 773     const LOGICAL    WANTZ      = wantZ;
 774     const INTEGER    KACC22     = kacc22;
 775     const INTEGER    N          = H.numRows();
 776     const INTEGER    KTOP       = kTop;
 777     const INTEGER    KBOT       = kBot;
 778     const INTEGER    NSHFTS     = nShifts;
 779     const INTEGER    LDH        = H.leadingDimension();
 780     const INTEGER    ILOZ       = iLoZ;
 781     const INTEGER    IHIZ       = iHiZ;
 782     const INTEGER    LDZ        = Z.leadingDimension();
 783     const INTEGER    LDV        = V.leadingDimension();
 784     const INTEGER    LDU        = U.leadingDimension();
 785     const INTEGER    NV         = WV.numRows();
 786     const INTEGER    LDWV       = WV.leadingDimension();
 787     const INTEGER    NH         = WH.numCols();
 788     const INTEGER    LDWH       = WH.leadingDimension();
 789 
 790     if (IsSame<T,DOUBLE>::value) {
 791         LAPACK_IMPL(dlaqr5)(&WANTT,
 792                             &WANTZ,
 793                             &KACC22,
 794                             &N,
 795                             &KTOP,
 796                             &KBOT,
 797                             &NSHFTS,
 798                             sr.data(),
 799                             si.data(),
 800                             H.data(),
 801                             &LDH,
 802                             &ILOZ,
 803                             &IHIZ,
 804                             Z.data(),
 805                             &LDZ,
 806                             V.data(),
 807                             &LDV,
 808                             U.data(),
 809                             &LDU,
 810                             &NV,
 811                             WV.data(),
 812                             &LDWV,
 813                             &NH,
 814                             WH.data(),
 815                             &LDWH);
 816     } else {
 817         ASSERT(0);
 818     }
 819 }
 820 
 821 #endif // CHECK_CXXLAPACK
 822 
 823 //== public interface ==========================================================
 824 template <typename IndexType, typename VSR, typename VSI, typename MH,
 825           typename MZ, typename MV, typename MU, typename MWV, typename MWH>
 826 void
 827 laqr5(bool                      wantT,
 828       bool                      wantZ,
 829       IndexType                 kacc22,
 830       IndexType                 kTop,
 831       IndexType                 kBot,
 832       IndexType                 nShifts,
 833       DenseVector<VSR>          &sr,
 834       DenseVector<VSI>          &si,
 835       GeMatrix<MH>              &H,
 836       IndexType                 iLoZ,
 837       IndexType                 iHiZ,
 838       GeMatrix<MZ>              &Z,
 839       GeMatrix<MV>              &V,
 840       GeMatrix<MU>              &U,
 841       GeMatrix<MWV>             &WV,
 842       GeMatrix<MWH>             &WH)
 843 {
 844     LAPACK_DEBUG_OUT("laqr5");
 845 
 846     using std::max;
 847 //
 848 //  Test the input parameters
 849 //
 850 #   ifndef NDEBUG
 851     ASSERT((kacc22==0)||(kacc22==1)||(kacc22==2));
 852     ASSERT(H.firstRow()==1);
 853     ASSERT(H.firstCol()==1);
 854     ASSERT(H.numRows()==H.numCols());
 855 
 856     const IndexType n = H.numRows();
 857 
 858     ASSERT(1<=kTop);
 859     ASSERT(kBot<=n);
 860 
 861     ASSERT(nShifts>0);
 862     ASSERT(nShifts % 2 == 0);
 863 
 864     ASSERT(sr.length()==nShifts);
 865     ASSERT(si.length()==nShifts);
 866 
 867     if (wantZ) {
 868         ASSERT(1<=iLoZ);
 869         ASSERT(iLoZ<=iHiZ);
 870         ASSERT(iHiZ<=n);
 871     }
 872 
 873     ASSERT(V.firstRow()==1);
 874     ASSERT(V.firstCol()==1);
 875     ASSERT(V.numRows()>=3);
 876     ASSERT(V.numCols()==nShifts/2);
 877 
 878     ASSERT(U.firstRow()==1);
 879     ASSERT(U.firstCol()==1);
 880     ASSERT(U.numRows()>=3*nShifts-3);
 881 
 882     ASSERT(WH.firstRow()==1);
 883     ASSERT(WH.firstCol()==1);
 884     ASSERT(WH.numRows()>=3*nShifts-3);
 885     ASSERT(WH.numCols()>=1);
 886 
 887     ASSERT(WV.firstRow()==1);
 888     ASSERT(WV.firstCol()==1);
 889     ASSERT(WV.numRows()>=1);
 890     ASSERT(WV.numCols()>=3*nShifts-3);
 891 #   endif
 892 
 893 //
 894 //  Make copies of output arguments
 895 //
 896 #   ifdef CHECK_CXXLAPACK
 897     typename DenseVector<VSR>::NoView   sr_org = sr;
 898     typename DenseVector<VSI>::NoView   si_org = si;
 899     typename GeMatrix<MH>::NoView       H_org  = H;
 900     typename GeMatrix<MZ>::NoView       Z_org  = Z;
 901     typename GeMatrix<MV>::NoView       V_org  = V;
 902     typename GeMatrix<MU>::NoView       U_org  = U;
 903     typename GeMatrix<MWV>::NoView      WV_org = WV;
 904     typename GeMatrix<MWH>::NoView      WH_org = WH;
 905 #   endif
 906 
 907 //
 908 //  Call implementation
 909 //
 910     laqr5_generic(wantT, wantZ, kacc22, kTop, kBot, nShifts,
 911                   sr, si, H, iLoZ, iHiZ, Z,
 912                   V, U, WV, WH);
 913 
 914 #   ifdef CHECK_CXXLAPACK
 915 //
 916 //  Make copies of results computed by the generic implementation
 917 //
 918     typename DenseVector<VSR>::NoView   sr_generic = sr;
 919     typename DenseVector<VSI>::NoView   si_generic = si;
 920     typename GeMatrix<MH>::NoView       H_generic  = H;
 921     typename GeMatrix<MZ>::NoView       Z_generic  = Z;
 922     typename GeMatrix<MV>::NoView       V_generic  = V;
 923     typename GeMatrix<MU>::NoView       U_generic  = U;
 924     typename GeMatrix<MWV>::NoView      WV_generic = WV;
 925     typename GeMatrix<MWH>::NoView      WH_generic = WH;
 926 //
 927 //  restore output arguments
 928 //
 929     sr = sr_org;
 930     si = si_org;
 931     H  = H_org;
 932     Z  = Z_org;
 933     V  = V_org;
 934     U  = U_org;
 935     WV = WV_org;
 936     WH = WH_org;
 937 
 938 //
 939 //  Compare generic results with results from the native implementation
 940 //
 941     laqr5_native(wantT, wantZ, kacc22, kTop, kBot, nShifts,
 942                  sr, si, H, iLoZ, iHiZ, Z,
 943                  V, U, WV, WH);
 944 
 945     bool failed = false;
 946     if (! isIdentical(sr_generic, sr, "sr_generic""sr")) {
 947 //        std::cerr << "CXXLAPACK: sr_generic = " << sr_generic << std::endl;
 948 //        std::cerr << "F77LAPACK: sr = " << sr << std::endl;
 949         failed = true;
 950     }
 951 
 952     if (! isIdentical(si_generic, si, "si_generic""si")) {
 953 //        std::cerr << "CXXLAPACK: si_generic = " << si_generic << std::endl;
 954 //        std::cerr << "F77LAPACK: si = " << si << std::endl;
 955         failed = true;
 956     }
 957 
 958     if (! isIdentical(H_generic, H, "H_generic""H")) {
 959 //        std::cerr << "CXXLAPACK: H_generic = " << H_generic << std::endl;
 960 //        std::cerr << "F77LAPACK: H = " << H << std::endl;
 961         failed = true;
 962     }
 963 
 964     if (! isIdentical(Z_generic, Z, "Z_generic""Z")) {
 965 //        std::cerr << "CXXLAPACK: Z_generic = " << Z_generic << std::endl;
 966 //        std::cerr << "F77LAPACK: Z = " << Z << std::endl;
 967         failed = true;
 968     }
 969 
 970     if (! isIdentical(V_generic, V, "V_generic""V")) {
 971 //        std::cerr << "CXXLAPACK: V_generic = " << V_generic << std::endl;
 972 //        std::cerr << "F77LAPACK: V = " << V << std::endl;
 973         failed = true;
 974     }
 975 
 976     if (! isIdentical(U_generic, U, "U_generic""U")) {
 977 //        std::cerr << "CXXLAPACK: U_generic = " << U_generic << std::endl;
 978 //        std::cerr << "F77LAPACK: U = " << U << std::endl;
 979         failed = true;
 980     }
 981 
 982     if (! isIdentical(WV_generic, WV, "WV_generic""WV")) {
 983 //        std::cerr << "CXXLAPACK: WV_generic = " << WV_generic << std::endl;
 984 //        std::cerr << "F77LAPACK: WV = " << WV << std::endl;
 985         failed = true;
 986     }
 987 
 988     if (! isIdentical(WH_generic, WH, "WH_generic""WH")) {
 989 //        std::cerr << "CXXLAPACK: WH_generic = " << WH_generic << std::endl;
 990 //        std::cerr << "F77LAPACK: WH = " << WH << std::endl;
 991         failed = true;
 992     }
 993 
 994     if (failed) {
 995         std::cerr << "error in: laqr5.tcc" << std::endl;
 996         std::cerr << "N = H.numRows() =   " << H.numRows() << std::endl;
 997         std::cerr << "H.numCols() =       " << H.numCols() << std::endl;
 998         std::cerr << "NV = WV.numRows() = " << WV.numRows() << std::endl;
 999         std::cerr << "WV.numCols() =      " << WV.numCols() << std::endl;
1000         std::cerr << "NH = WH.numRows() = " << WH.numRows() << std::endl;
1001         std::cerr << "WH.numCols() =      " << WH.numCols() << std::endl;
1002         ASSERT(0);
1003     } else {
1004         // std::cerr << "passed: laqr5.tcc" << std::endl;
1005     }
1006 #   endif
1007 }
1008 
1009 //-- forwarding ----------------------------------------------------------------
1010 template <typename IndexType, typename VSR, typename VSI, typename MH,
1011           typename MZ, typename MV, typename MU, typename MWV, typename MWH>
1012 void
1013 laqr5(bool                      wantT,
1014       bool                      wantZ,
1015       IndexType                 kacc22,
1016       IndexType                 kTop,
1017       IndexType                 kBot,
1018       IndexType                 nShifts,
1019       VSR                       &&sr,
1020       VSI                       &&si,
1021       MH                        &&H,
1022       IndexType                 iLoZ,
1023       IndexType                 iHiZ,
1024       MZ                        &&Z,
1025       MV                        &&V,
1026       MU                        &&U,
1027       MWV                       &&WV,
1028       MWH                       &&WH)
1029 {
1030     CHECKPOINT_ENTER;
1031     laqr5(wantT, wantZ, kacc22, kTop, kBot, nShifts, sr, si, H, iLoZ, iHiZ, Z,
1032           V, U, WV, WH);
1033     CHECKPOINT_LEAVE;
1034 }
1035 
1036 } } // namespace lapack, flens
1037 
1038 #endif // FLENS_LAPACK_EIG_LAQR5_TCC