1 /*
  2  *   Copyright (c) 2012, Michael Lehn
  3  *
  4  *   All rights reserved.
  5  *
  6  *   Redistribution and use in source and binary forms, with or without
  7  *   modification, are permitted provided that the following conditions
  8  *   are met:
  9  *
 10  *   1) Redistributions of source code must retain the above copyright
 11  *      notice, this list of conditions and the following disclaimer.
 12  *   2) Redistributions in binary form must reproduce the above copyright
 13  *      notice, this list of conditions and the following disclaimer in
 14  *      the documentation and/or other materials provided with the
 15  *      distribution.
 16  *   3) Neither the name of the FLENS development group nor the names of
 17  *      its contributors may be used to endorse or promote products derived
 18  *      from this software without specific prior written permission.
 19  *
 20  *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 21  *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 22  *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 23  *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 24  *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 25  *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 26  *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 27  *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 28  *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 29  *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 30  *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 31  */
 32 
 33 /* Based on
 34  *
 35        SUBROUTINE DGSVJ0( JOBV, M, N, A, LDA, D, SVA, MV, V, LDV, EPS,
 36       $                   SFMIN, TOL, NSWEEP, WORK, LWORK, INFO )
 37  *
 38  *  -- LAPACK routine (version 3.3.1)                                  --
 39  *
 40  *  -- Contributed by Zlatko Drmac of the University of Zagreb and     --
 41  *  -- Kresimir Veselic of the Fernuniversitaet Hagen                  --
 42  *  -- April 2011                                                      --
 43  *
 44  *  -- LAPACK is a software package provided by Univ. of Tennessee,    --
 45  *  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
 46  *
 47  * This routine is also part of SIGMA (version 1.23, October 23. 2008.)
 48  * SIGMA is a library of algorithms for highly accurate algorithms for
 49  * computation of SVD, PSVD, QSVD, (H,K)-SVD, and for solution of the
 50  * eigenvalue problems Hx = lambda M x, H M x = lambda x with H, M > 0.
 51  *
 52  */
 53 
 54 #ifndef FLENS_LAPACK_SVD_SVJ0_TCC
 55 #define FLENS_LAPACK_SVD_SVJ0_TCC 1
 56 
 57 #include <flens/blas/blas.h>
 58 #include <flens/lapack/lapack.h>
 59 
 60 namespace flens { namespace lapack {
 61 
 62 //== generic lapack implementation =============================================
 63 
 64 template <typename MA, typename VD, typename VSVA, typename MV, typename VWORK>
 65 typename GeMatrix<MA>::IndexType
 66 svj0_generic(SVJ::JobV                                  jobV,
 67              GeMatrix<MA>                               &A,
 68              DenseVector<VD>                            &d,
 69              DenseVector<VSVA>                          &sva,
 70              GeMatrix<MV>                               &V,
 71              const typename GeMatrix<MA>::ElementType   &eps,
 72              const typename GeMatrix<MA>::ElementType   &safeMin,
 73              const typename GeMatrix<MA>::ElementType   &tol,
 74              typename GeMatrix<MA>::IndexType           nSweep,
 75              DenseVector<VWORK>                         &work)
 76 {
 77     using std::abs;
 78     using std::max;
 79     using std::min;
 80     using std::sqrt;
 81     using std::swap;
 82 
 83     typedef typename GeMatrix<MA>::ElementType  ElementType;
 84     typedef typename GeMatrix<MA>::IndexType    IndexType;
 85 
 86     const ElementType  Zero(0), Half(0.5), One(1);
 87 
 88     const Underscore<IndexType>  _;
 89 
 90     ElementType fastr_data[5];
 91     DenseVectorView<ElementType>
 92         fastr  = typename DenseVectorView<ElementType>::Engine(5, fastr_data);
 93 
 94     const IndexType  m     = A.numRows();
 95     const IndexType  n     = A.numCols();
 96 
 97     auto _work = work(_(1,m));
 98 
 99     const bool applyV   = (jobV==SVJ::ApplyV);
100     const bool rhsVec   = (jobV==SVJ::ComputeV) || applyV;
101 
102     const ElementType rootEps = sqrt(eps);
103     const ElementType rootSafeMin = sqrt(safeMin);
104     const ElementType small = safeMin / eps;
105     const ElementType big = One / safeMin;
106     const ElementType rootBig = One / rootSafeMin;
107     const ElementType bigTheta = One / rootEps;
108     const ElementType rootTol = sqrt(tol);
109 
110     IndexType  info = 0;
111 //
112 //  -#- Row-cyclic Jacobi SVD algorithm with column pivoting -#-
113 //
114     const IndexType emptsw = n*(n-1)/2;
115     IndexType notRot = 0;
116     fastr(1) = Zero;
117 //
118 //  -#- Row-cyclic pivot strategy with de Rijk's pivoting -#-
119 //
120 
121     IndexType swBand = 0;
122 //[TP] SWBAND is a tuning parameter. It is meaningful and effective
123 //  if SGESVJ is used as a computational routine in the preconditioned
124 //  Jacobi SVD algorithm SGESVJ. For sweeps i=1:SWBAND the procedure
125 //  ......
126 
127     const IndexType kbl = min(IndexType(8),n);
128 //[TP] KBL is a tuning parameter that defines the tile size in the
129 //  tiling of the p-q loops of pivot pairs. In general, an optimal
130 //  value of KBL depends on the matrix dimensions and on the
131 //  parameters of the computer's memory.
132 //
133     IndexType nbl = n / kbl;
134     if (nbl*kbl!=n) {
135         ++nbl;
136     }
137 
138     const IndexType blSkip = pow(kbl,2) + 1;
139 //[TP] BLKSKIP is a tuning parameter that depends on SWBAND and KBL.
140 
141     const IndexType rowSkip = min(IndexType(5),kbl);
142 //[TP] ROWSKIP is a tuning parameter.
143 
144     const IndexType lkAhead = 1;
145 //[TP] LKAHEAD is a tuning parameter.
146     IndexType pSkipped = 0;
147 
148     ElementType aapp, aapp0, aaqq, aapq, aqoap, apoaq;
149     ElementType tmp, cs, sn, t, theta, thetaSign;
150 
151     bool rotOk;
152     bool converged = false;
153 
154     for (IndexType i=1; i<=nSweep; ++i) {
155 //  .. go go go ...
156 //
157        ElementType max_aapq = Zero;
158        ElementType max_sinj = Zero;
159 
160        IndexType iswRot = 0;
161 
162        pSkipped = 0;
163        notRot = 0;
164 
165        for (IndexType ibr=1; ibr<=nbl; ++ibr) {
166 
167           IndexType igl = (ibr-1)*kbl + 1;
168 
169           for (IndexType ir1=0; ir1<=min(lkAhead, nbl-ibr); ++ir1) {
170 
171              igl += ir1*kbl;
172 
173              for (IndexType p=igl; p<=min(igl+kbl-1,n-1); ++p) {
174 
175 //  .. de Rijk's pivoting
176                 IndexType q = blas::iamax(sva(_(p,n))) + p - 1;
177                 if (p!=q) {
178                    blas::swap(A(_,p), A(_,q));
179                    if (rhsVec) {
180                       blas::swap(V(_,p), V(_,q));
181                    }
182                    swap(sva(p),sva(q));
183                    swap(d(p),d(q));
184                 }
185 
186                 if (ir1==0) {
187 //
188 //     Column norms are periodically updated by explicit
189 //     norm computation.
190 //     Caveat:
191 //     Some BLAS implementations compute DNRM2(M,A(1,p),1)
192 //     as DSQRT(DDOT(M,A(1,p),1,A(1,p),1)), which may result in
193 //     overflow for ||A(:,p)||_2 > DSQRT(overflow_threshold), and
194 //     undeflow for ||A(:,p)||_2 < DSQRT(underflow_threshold).
195 //     Hence, DNRM2 cannot be trusted, not even in the case when
196 //     the true norm is far from the under(over)flow boundaries.
197 //     If properly implemented DNRM2 is available, the IF-THEN-ELSE
198 //     below should read "AAPP = DNRM2( M, A(1,p), 1 ) * D(p)".
199 //
200                    if (sva(p)<rootBig && sva(p)>rootSafeMin) {
201                       sva(p) = blas::nrm2(A(_,p))*d(p);
202                    } else {
203                       ElementType tmp = Zero;
204                       aapp = One;
205                       lassq(A(_,p), tmp, aapp);
206                       sva(p) = tmp*sqrt(aapp)*d(p);
207                    }
208                    aapp = sva(p);
209                 } else {
210                    aapp = sva(p);
211                 }
212 
213                 if (aapp>Zero) {
214 
215                    pSkipped = 0;
216 
217                    for (IndexType q=p+1; q<=min(igl+kbl-1,n); ++q) {
218 
219                       aaqq = sva(q);
220 
221                       if (aaqq>Zero) {
222 
223                          aapp0 = aapp;
224                          if (aaqq>=One) {
225                             rotOk = small*aapp<=aaqq;
226                             if (aapp<big/aaqq) {
227                                aapq = A(_,p)*A(_,q)*d(p)*d(q)/aaqq/aapp;
228                             } else {
229                                _work = A(_,p);
230                                lascl(LASCL::FullMatrix, 00,
231                                      aapp, d(p), _work);
232                                aapq = _work*A(_,q)*d(q)/aaqq;
233                             }
234                          } else {
235                             rotOk = aapp<=aaqq/small;
236                             if (aapp>small/aaqq) {
237                                aapq = A(_,p)*A(_,q)*d(p)*d(q)/aaqq/aapp;
238                             } else {
239                                _work = A(_,q);
240                                lascl(LASCL::FullMatrix, 00,
241                                      aaqq, d(q), _work);
242                                aapq = _work*A(_,p)*d(p)/aapp;
243                             }
244                          }
245 //
246                          max_aapq = max(max_aapq, abs(aapq));
247 //
248 //     TO rotate or NOT to rotate, THAT is the question ...
249 //
250                          if (abs(aapq)>tol) {
251 //
252 //        .. rotate
253 //        ROTATED = ROTATED + ONE
254 //
255                             if (ir1==0) {
256                                notRot = 0;
257                                pSkipped = 0;
258                                ++iswRot;
259                             }
260 
261                             if (rotOk) {
262 
263                                aqoap = aaqq / aapp;
264                                apoaq = aapp / aaqq;
265                                theta = -Half*abs(aqoap-apoaq)/aapq;
266 
267                                if (abs(theta)>bigTheta) {
268 
269                                   t = Half / theta;
270                                   fastr(3) =  t*d(p)/d(q);
271                                   fastr(4) = -t*d(q)/d(p);
272                                   blas::rotm(A(_,p), A(_,q), fastr);
273                                   if (rhsVec) {
274                                      blas::rotm(V(_,p), V(_,q), fastr);
275                                   }
276                                   sva(q) = aaqq
277                                           *sqrt(max(Zero, One+t*apoaq*aapq));
278                                   aapp *= sqrt(max(Zero, One-t*aqoap*aapq));
279                                   max_sinj = max(max_sinj, abs(t));
280 
281                                } else {
282 //
283 //              .. choose correct signum for THETA and rotate
284 //
285                                   thetaSign = -sign(One, aapq);
286                                   t = One
287                                      / (theta+thetaSign*sqrt(One+theta*theta));
288                                   cs = sqrt(One/(One+t*t));
289                                   sn = t*cs;
290 
291                                   max_sinj = max(max_sinj, abs(sn));
292                                   sva(q) = aaqq
293                                           * sqrt(max(Zero, One+t*apoaq*aapq));
294                                   aapp *= sqrt(max(Zero, One-t*aqoap*aapq));
295 
296                                   apoaq = d(p) / d(q);
297                                   aqoap = d(q) / d(p);
298                                   if (d(p)>=One) {
299                                      if (d(q)>=One) {
300                                         fastr(3) =  t*apoaq;
301                                         fastr(4) = -t*aqoap;
302                                         d(p) *= cs;
303                                         d(q) *= cs;
304                                         blas::rotm(A(_,p), A(_,q), fastr);
305                                         if (rhsVec) {
306                                            blas::rotm(V(_,p), V(_,q), fastr);
307                                         }
308                                      } else {
309                                         A(_,p) -= t*aqoap*A(_,q);
310                                         A(_,q) += cs*sn*apoaq*A(_,p);
311                                         d(p) *= cs;
312                                         d(q) /= cs;
313                                         if (rhsVec) {
314                                            V(_,p) -= t*aqoap*V(_,q);
315                                            V(_,q) += cs*sn*apoaq*V(_,p);
316                                         }
317                                      }
318                                   } else {
319                                      if (d(q)>=One) {
320                                         A(_,q) += t*apoaq*A(_,p);
321                                         A(_,p) -= cs*sn*aqoap*A(_,q);
322                                         d(p) /= cs;
323                                         d(q) *= cs;
324                                         if (rhsVec) {
325                                            V(_,q) += t*apoaq*V(_,p);
326                                            V(_,p) -= cs*sn*aqoap*V(_,q);
327                                         }
328                                      } else {
329                                         if (d(p)>=d(q)) {
330                                            A(_,p) -= t*aqoap*A(_,q);
331                                            A(_,q) += cs*sn*apoaq*A(_,p);
332                                            d(p) *= cs;
333                                            d(q) /= cs;
334                                            if (rhsVec) {
335                                               V(_,p) -= t*aqoap*V(_,q);
336                                               V(_,q) += cs*sn*apoaq*V(_,p);
337                                            }
338                                         } else {
339                                            A(_,q) += t*apoaq*A(_,p);
340                                            A(_,p) -= cs*sn*aqoap*A(_,q);
341                                            d(p) /= cs;
342                                            d(q) *= cs;
343                                            if (rhsVec) {
344                                               V(_,q) += t*apoaq*V(_,p);
345                                               V(_,p) -= cs*sn*aqoap*V(_,q);
346                                            }
347                                         }
348                                      }
349                                   }
350                                }
351 //
352                             } else {
353 //           .. have to use modified Gram-Schmidt like transformation
354                                _work = A(_,p);
355                                lascl(LASCL::FullMatrix, 00,
356                                      aapp, One, _work);
357                                lascl(LASCL::FullMatrix, 00,
358                                      aaqq, One, A(_,q));
359                                tmp = -aapq*d(p)/d(q);
360                                A(_,q) += tmp*_work;
361                                lascl(LASCL::FullMatrix, 00,
362                                      One, aaqq, A(_,q));
363                                sva(q) = aaqq*sqrt(max(Zero,One-aapq*aapq));
364                                max_sinj = max(max_sinj, safeMin);
365                             }
366 //        END IF ROTOK THEN ... ELSE
367 //
368 //        In the case of cancellation in updating SVA(q), SVA(p)
369 //        recompute SVA(q), SVA(p).
370                             if (pow(sva(q)/aaqq,2)<=rootEps) {
371                                if (aaqq<rootBig && aaqq>rootSafeMin) {
372                                   sva(q) = blas::nrm2(A(_,q))*d(q);
373                                } else {
374                                   t = Zero;
375                                   aaqq = One;
376                                   lassq(A(_,q), t, aaqq);
377                                   sva(q) = t*sqrt(aaqq)*d(q);
378                                }
379                             }
380                             if (aapp/aapp0<=rootEps) {
381                                if (aapp<rootBig && aapp>rootSafeMin) {
382                                   aapp = blas::nrm2(A(_,p))*d(p);
383                                } else {
384                                   t = Zero;
385                                   aapp = One;
386                                   lassq(A(_,p), t, aapp);
387                                   aapp = t*sqrt(aapp)*d(p);
388                                }
389                                sva(p) = aapp;
390                             }
391 //
392                          } else {
393 //     A(:,p) and A(:,q) already numerically orthogonal
394                             if (ir1==0) {
395                                ++notRot;
396                             }
397                             ++pSkipped;
398                          }
399                       } else {
400 //     A(:,q) is zero column
401                          if (ir1==0) {
402                             ++notRot;
403                          }
404                          ++pSkipped;
405                       }
406 
407                       if (i<=swBand && pSkipped>rowSkip) {
408                          if (ir1==0) {
409                             aapp = -aapp;
410                          }
411                          notRot = 0;
412                          break;
413                       }
414 
415                    }
416 //  END q-LOOP
417 //
418 
419 //  bailed out of q-loop
420 
421                    sva(p) = aapp;
422 
423                 } else {
424                    sva(p) = aapp;
425                    if (ir1==0 && aapp==Zero) {
426                       notRot += min(igl+kbl-1,n) - p;
427                    }
428                 }
429 
430              }
431 //  end of the p-loop
432 //  end of doing the block ( ibr, ibr )
433           }
434 //  end of ir1-loop
435 //
436 //........................................................
437 //... go to the off diagonal blocks
438 //
439           igl = (ibr-1)*kbl + 1;
440 
441           for (IndexType jbc=ibr+1; jbc<=nbl; ++jbc) {
442 
443              IndexType jgl = (jbc-1)*kbl + 1;
444 //
445 //     doing the block at ( ibr, jbc )
446 //
447              IndexType ijblsk = 0;
448              for (IndexType p=igl; p<=min(igl+kbl-1,n); ++p) {
449 
450                 aapp = sva(p);
451 
452                 if (aapp>Zero) {
453 
454                    pSkipped = 0;
455 
456                    for (IndexType q=jgl; q<=min(jgl+kbl-1,n); ++q) {
457 
458                       aaqq = sva(q);
459 
460                       if (aaqq>Zero) {
461                          aapp0 = aapp;
462 //
463 //  -#- M x 2 Jacobi SVD -#-
464 //
465 //     -#- Safe Gram matrix computation -#-
466 //
467                          if (aaqq>=One) {
468                             if (aapp>=aaqq) {
469                                rotOk = small*aapp<=aaqq;
470                             } else {
471                                rotOk = small*aaqq<=aapp;
472                             }
473                             if (aapp<big/aaqq) {
474                                aapq = A(_,p)*A(_,q)*d(p)*d(q)/aaqq/aapp;
475                             } else {
476                                _work = A(_,p);
477                                lascl(LASCL::FullMatrix, 00,
478                                      aapp, d(p), _work);
479                                aapq = _work*A(_,q)*d(q)/aaqq;
480                             }
481                          } else {
482                             if (aapp>=aaqq) {
483                               rotOk = aapp<=aaqq/small;
484                             } else {
485                               rotOk = aaqq<=aapp/small;
486                             }
487                             if (aapp>small/aaqq) {
488                                aapq = A(_,p)*A(_,q)*d(p)*d(q)/aaqq/aapp;
489                             } else {
490                                _work = A(_,q);
491                                lascl(LASCL::FullMatrix, 00,
492                                      aaqq, d(q), _work);
493                                aapq = _work*A(_,p)*d(p)/aapp;
494                             }
495                          }
496 
497                          max_aapq = max(max_aapq, abs(aapq));
498 //
499 //     TO rotate or NOT to rotate, THAT is the question ...
500 //
501                          if (abs(aapq)>tol) {
502                             notRot = 0;
503 //        ROTATED  = ROTATED + 1
504                             pSkipped = 0;
505                             ++iswRot;
506 
507                             if (rotOk) {
508 
509                                aqoap = aaqq / aapp;
510                                apoaq = aapp / aaqq;
511                                theta = -Half*abs(aqoap-apoaq)/aapq;
512                                if (aaqq>aapp0) {
513                                   theta = -theta;
514                                }
515 
516                                if (abs(theta)>bigTheta) {
517                                   t = Half / theta;
518                                   fastr(3) =  t*d(p)/d(q);
519                                   fastr(4) = -t*d(q)/d(p);
520                                   blas::rotm(A(_,p), A(_,q), fastr);
521                                   if (rhsVec) {
522                                      blas::rotm(V(_,p), V(_,q), fastr);
523                                   }
524                                   sva(q) = aaqq
525                                           *sqrt(max(Zero, One+t*apoaq*aapq));
526                                   aapp *= sqrt(max(Zero, One-t*aqoap*aapq));
527                                   max_sinj = max(max_sinj, abs(t));
528                                } else {
529 //
530 //              .. choose correct signum for THETA and rotate
531 //
532                                   thetaSign = -sign(One, aapq);
533                                   if (aaqq>aapp0) {
534                                      thetaSign = -thetaSign;
535                                   }
536                                   t = One
537                                      / (theta+thetaSign*sqrt(One+theta*theta));
538                                   cs = sqrt(One/(One+t*t));
539                                   sn = t*cs;
540                                   max_sinj = max(max_sinj, abs(sn));
541                                   sva(q) = aaqq
542                                           *sqrt(max(Zero, One+t*apoaq*aapq));
543                                   aapp *= sqrt(max(Zero, One-t*aqoap*aapq));
544 
545                                   apoaq = d(p) / d(q);
546                                   aqoap = d(q) / d(p);
547                                   if (d(p)>=One) {
548 
549                                      if (d(q)>=One) {
550                                         fastr(3) =  t*apoaq;
551                                         fastr(4) = -t*aqoap;
552                                         d(p) *= cs;
553                                         d(q) *= cs;
554                                         blas::rotm(A(_,p), A(_,q), fastr);
555                                         if (rhsVec) {
556                                             blas::rotm(V(_,p), V(_,q), fastr);
557                                         }
558                                      } else {
559                                         A(_,p) -= t*aqoap*A(_,q);
560                                         A(_,q) += cs*sn*apoaq*A(_,p);
561                                         if (rhsVec) {
562                                            V(_,p) -= t*aqoap*V(_,q);
563                                            V(_,q) += cs*sn*apoaq*V(_,p);
564                                         }
565                                         d(p) *= cs;
566                                         d(q) /= cs;
567                                      }
568                                   } else {
569                                      if (d(q)>=One) {
570                                         A(_,q) += t*apoaq*A(_,p);
571                                         A(_,p) -= cs*sn*aqoap*A(_,q);
572                                         if (rhsVec) {
573                                            V(_,q) += t*apoaq*V(_,p);
574                                            V(_,p) -= cs*sn*aqoap*V(_,q);
575                                         }
576                                         d(p) /= cs;
577                                         d(q) *= cs;
578                                      } else {
579                                         if (d(p)>=d(q)) {
580                                            A(_,p) -= t*aqoap*A(_,q);
581                                            A(_,q) += cs*sn*apoaq*A(_,p);
582                                            d(p) *= cs;
583                                            d(q) /= cs;
584                                            if (rhsVec) {
585                                               V(_,p) -= t*aqoap*V(_,q);
586                                               V(_,q) += cs*sn*apoaq*V(_,p);
587                                            }
588                                         } else {
589                                            A(_,q) += t*apoaq*A(_,p);
590                                            A(_,p) -= cs*sn*aqoap*A(_,q);
591                                            d(p) /= cs;
592                                            d(q) *= cs;
593                                            if (rhsVec) {
594                                               V(_,q) += t*apoaq*V(_,p);
595                                               V(_,p) -= cs*sn*aqoap*V(_,q);
596                                            }
597                                         }
598                                      }
599                                   }
600                                }
601 
602                             } else {
603                                if (aapp>aaqq) {
604                                   _work = A(_,p);
605                                   lascl(LASCL::FullMatrix, 00,
606                                         aapp, One, _work);
607                                   lascl(LASCL::FullMatrix, 00,
608                                         aaqq, One, A(_,q));
609                                   tmp = -aapq*d(p)/d(q);
610                                   A(_,q) += tmp*_work;
611                                   lascl(LASCL::FullMatrix, 00,
612                                         One, aaqq, A(_,q));
613                                   sva(q) = aaqq*sqrt(max(Zero, One-aapq*aapq));
614                                   max_sinj = max(max_sinj, safeMin);
615                                } else {
616                                   _work = A(_,q);
617                                   lascl(LASCL::FullMatrix, 00,
618                                         aaqq, One, _work);
619                                   lascl(LASCL::FullMatrix, 00,
620                                         aapp, One, A(_,p));
621                                   tmp = -aapq*d(q)/d(p);
622                                   A(_,p) += tmp*_work;
623                                   lascl(LASCL::FullMatrix, 00,
624                                         One, aapp, A(_,p));
625                                   sva(p) = aapp*sqrt(max(Zero,One-aapq*aapq));
626                                   max_sinj = max(max_sinj, safeMin);
627                                }
628                             }
629 //        END IF ROTOK THEN ... ELSE
630 //
631 //        In the case of cancellation in updating SVA(q)
632 //        .. recompute SVA(q)
633                             if (pow(sva(q)/aaqq, 2)<=rootEps) {
634                                if (aaqq<rootBig && aaqq>rootSafeMin) {
635                                   sva(q) = blas::nrm2(A(_,q))*d(q);
636                                } else {
637                                   t = Zero;
638                                   aaqq = One;
639                                   lassq(A(_,q), t, aaqq);
640                                   sva(q) = t*sqrt(aaqq)*d(q);
641                                }
642                             }
643                             if (pow(aapp/aapp0, 2)<=rootEps) {
644                                if (aapp<rootBig && aapp>rootSafeMin) {
645                                   aapp = blas::nrm2(A(_,p))*d(p);
646                                } else {
647                                   t = Zero;
648                                   aapp = One;
649                                   lassq(A(_,p), t, aapp);
650                                   aapp = t*sqrt(aapp)*d(p);
651                                }
652                                sva(p) = aapp;
653                             }
654 //           end of OK rotation
655                          } else {
656                             ++notRot;
657                             ++pSkipped;
658                             ++ijblsk;
659                          }
660                       } else {
661                          ++notRot;
662                          ++pSkipped;
663                          ++ijblsk;
664                       }
665 
666                       if (i<=swBand && ijblsk>=blSkip) {
667                          sva(p) = aapp;
668                          notRot = 0;
669                          goto jbcLoopExit;
670                       }
671                       if (i<=swBand && pSkipped>rowSkip) {
672                          aapp = -aapp;
673                          notRot = 0;
674                          break;
675                       }
676 
677                    }
678 //     end of the q-loop
679 
680                    sva(p) = aapp;
681 
682                 } else {
683                    if (aapp==Zero) {
684                       notRot += min(jgl+kbl-1, n) -jgl + 1;
685                    }
686                    if (aapp<Zero) {
687                       notRot = 0;
688                    }
689                 }
690 
691              }
692 //  end of the p-loop
693           }
694 //  end of the jbc-loop
695        jbcLoopExit:
696 //2011 bailed out of the jbc-loop
697           for (IndexType p=igl; p<=min(igl+kbl-1,n); ++p) {
698              sva(p) = abs(sva(p));
699           }
700 
701        }
702 //2000 :: end of the ibr-loop
703 //
704 //  .. update SVA(N)
705        if (sva(n)<rootBig  && sva(n)>rootSafeMin) {
706           sva(n) = blas::nrm2(A(_,n))*d(n);
707        } else {
708           ElementType t = Zero;
709           aapp = One;
710           lassq(A(_,n), t, aapp);
711           sva(n) = t*sqrt(aapp)*d(n);
712        }
713 //
714 //  Additional steering devices
715 //
716        if (i<swBand && (max_aapq<=rootTol || iswRot<=n)) {
717            swBand = i;
718        }
719 
720        if (i>swBand+1 && max_aapq<n*tol && n*max_aapq*max_sinj<tol) {
721            converged = true;
722            break;
723        }
724 
725        if (notRot>=emptsw) {
726            converged = true;
727            break;
728        }
729 
730     }
731 //    end i=1:NSWEEP loop
732     if (converged) {
733 //#:) Reaching this point means that during the i-th sweep all pivots were
734 //    below the given tolerance, causing early exit.
735        info = 0;
736     } else {
737 //#:) Reaching this point means that the procedure has comleted the given
738 //    number of iterations.
739         info = nSweep - 1;
740     }
741 //
742 //  Sort the vector D.
743     for (IndexType p=1; p<=n-1; ++p) {
744         IndexType q = blas::iamax(sva(_(p,n))) + p - 1;
745         if (p!=q) {
746             swap(sva(p), sva(q));
747             swap(d(p), d(q));
748             blas::swap(A(_,p), A(_,q));
749             if (rhsVec) {
750                 blas::swap(V(_,p), V(_,q));
751             }
752         }
753     }
754     return info;
755 }
756 
757 //== interface for native lapack ===============================================
758 #ifdef CHECK_CXXLAPACK
759 
760 template <typename MA, typename VD, typename VSVA, typename MV, typename VWORK>
761 typename GeMatrix<MA>::IndexType
762 svj0_native(SVJ::JobV                                  jobV,
763             GeMatrix<MA>                               &A,
764             DenseVector<VD>                            &d,
765             DenseVector<VSVA>                          &sva,
766             GeMatrix<MV>                               &V,
767             const typename GeMatrix<MA>::ElementType   &eps,
768             const typename GeMatrix<MA>::ElementType   &safeMin,
769             const typename GeMatrix<MA>::ElementType   &tol,
770             typename GeMatrix<MA>::IndexType           nSweep,
771             DenseVector<VWORK>                         &work)
772 {
773     typedef typename GeMatrix<MA>::ElementType  T;
774 
775     const char       JOBV = char(jobV);
776     const INTEGER    M = A.numRows();
777     const INTEGER    N = A.numCols();
778     const INTEGER    LDA = A.leadingDimension();
779     const INTEGER    _MV = V.numRows();
780     const INTEGER    LDV = V.leadingDimension();
781     const DOUBLE     EPS = eps;
782     const DOUBLE     SFMIN = safeMin;
783     const DOUBLE     TOL = tol;
784     const INTEGER    NSWEEP = nSweep;
785     const INTEGER    LWORK = work.length();
786     INTEGER          INFO;
787 
788     if (IsSame<T,DOUBLE>::value) {
789         LAPACK_DECL(dgsvj0)(&JOBV,
790                             &M,
791                             &N,
792                             A.data(),
793                             &LDA,
794                             d.data(),
795                             sva.data(),
796                             &_MV,
797                             V.data(),
798                             &LDV,
799                             &EPS,
800                             &SFMIN,
801                             &TOL,
802                             &NSWEEP,
803                             work.data(),
804                             &LWORK,
805                             &INFO);
806     } else {
807         ASSERT(0);
808     }
809     ASSERT(INFO>=0);
810     return INFO;
811 }
812 
813 #endif // CHECK_CXXLAPACK
814 
815 //== public interface ==========================================================
816 template <typename MA, typename VD, typename VSVA, typename MV, typename VWORK>
817 typename GeMatrix<MA>::IndexType
818 svj0(SVJ::JobV                                  jobV,
819      GeMatrix<MA>                               &A,
820      DenseVector<VD>                            &d,
821      DenseVector<VSVA>                          &sva,
822      GeMatrix<MV>                               &V,
823      const typename GeMatrix<MA>::ElementType   &eps,
824      const typename GeMatrix<MA>::ElementType   &safeMin,
825      const typename GeMatrix<MA>::ElementType   &tol,
826      typename GeMatrix<MA>::IndexType           nSweep,
827      DenseVector<VWORK>                         &work)
828 {
829     using std::max;
830     using std::min;
831 
832 //
833 //  Test the input parameters
834 //
835 #   ifndef NDEBUG
836     typedef typename GeMatrix<MA>::IndexType    IndexType;
837 
838     ASSERT(A.firstRow()==1);
839     ASSERT(A.firstCol()==1);
840 
841     const IndexType m = A.numRows();
842     const IndexType n = A.numCols();
843 
844     ASSERT(m>=n);
845 
846     ASSERT(d.firstIndex()==1);
847     ASSERT(d.length()==n);
848 
849     ASSERT(sva.firstIndex()==1);
850     ASSERT(sva.length()==n);
851 
852     ASSERT(V.firstRow()==1);
853     ASSERT(V.firstCol()==1);
854 
855     if (jobV==SVJ::ComputeV) {
856         ASSERT(V.numCols()==n);
857         ASSERT(V.numRows()==n);
858     }
859     if (jobV==SVJ::ApplyV) {
860         ASSERT(V.numCols()==n);
861     }
862 
863     if (work.length()>0) {
864         ASSERT(work.length()>=m);
865     }
866 #   endif
867 //
868 //  Make copies of output arguments
869 //
870 #   ifdef CHECK_CXXLAPACK
871     typename GeMatrix<MA>::NoView       A_org     = A;
872     typename DenseVector<VD>::NoView    d_org     = d;
873     typename DenseVector<VSVA>::NoView  sva_org   = sva;
874     typename GeMatrix<MV>::NoView       V_org     = V;
875     typename DenseVector<VWORK>::NoView work_org  = work;
876 #   endif
877 
878 //
879 //  Call implementation
880 //
881     IndexType info = svj0_generic(jobV, A, d, sva, V, eps, safeMin, tol,
882                                   nSweep, work);
883 
884 #   ifdef CHECK_CXXLAPACK
885 //
886 //  Make copies of results computed by the generic implementation
887 //
888     typename GeMatrix<MA>::NoView       A_generic     = A;
889     typename DenseVector<VD>::NoView    d_generic     = d;
890     typename DenseVector<VSVA>::NoView  sva_generic   = sva;
891     typename GeMatrix<MV>::NoView       V_generic     = V;
892     typename DenseVector<VWORK>::NoView work_generic  = work;
893 //
894 //  restore output arguments
895 //
896     A    = A_org;
897     d    = d_org;
898     sva  = sva_org;
899     V    = V_org;
900     work = work_org;
901 //
902 //  Compare generic results with results from the native implementation
903 //
904     IndexType _info = svj0_native(jobV, A, d, sva, V, eps, safeMin, tol,
905                                   nSweep, work);
906 
907     bool failed = false;
908     if (! isIdentical(A_generic, A, "A_generic""A")) {
909         std::cerr << "CXXLAPACK: A_generic = " << A_generic << std::endl;
910         std::cerr << "F77LAPACK: A = " << A << std::endl;
911         failed = true;
912     }
913     if (! isIdentical(d_generic, d, "d_generic""d")) {
914         std::cerr << "CXXLAPACK: d_generic = " << d_generic << std::endl;
915         std::cerr << "F77LAPACK: d = " << d << std::endl;
916         failed = true;
917     }
918     if (! isIdentical(sva_generic, sva, "sva_generic""sva")) {
919         std::cerr << "CXXLAPACK: sva_generic = " << sva_generic << std::endl;
920         std::cerr << "F77LAPACK: sva = " << sva << std::endl;
921         failed = true;
922     }
923     if (! isIdentical(V_generic, V, "V_generic""V")) {
924         std::cerr << "CXXLAPACK: V_generic = " << V_generic << std::endl;
925         std::cerr << "F77LAPACK: V = " << V << std::endl;
926        failed = true;
927     }
928     if (! isIdentical(work_generic, work, "work_generic""work")) {
929         std::cerr << "CXXLAPACK: work_generic = " << work_generic << std::endl;
930         std::cerr << "F77LAPACK: work = " << work << std::endl;
931         failed = true;
932     }
933     if (! isIdentical(info, _info, "info""_info")) {
934         std::cerr << "CXXLAPACK: info = " << info << std::endl;
935         std::cerr << "F77LAPACK: _info = " << _info << std::endl;
936         failed = true;
937     }
938 
939     if (failed) {
940         std::cerr << "error in: svj0.tcc" << std::endl;
941         ASSERT(0);
942     } else {
943         //std::cerr << "passed: svj0.tcc" << std::endl;
944     }
945 #   endif
946 
947     return info;
948 }
949 
950 //-- forwarding ----------------------------------------------------------------
951 template <typename MA, typename VD, typename VSVA, typename MV, typename VWORK>
952 typename MA::IndexType
953 svj0(SVJ::JobV                                  jobV,
954      MA                                         &&A,
955      VD                                         &&d,
956      VSVA                                       &&sva,
957      MV                                         &&V,
958      const typename MA::ElementType             &eps,
959      const typename MA::ElementType             &safeMin,
960      const typename MA::ElementType             &tol,
961      typename MA::IndexType                     nSweep,
962      VWORK                                      &&work)
963 {
964     typename MA::IndexType   info;
965 
966     CHECKPOINT_ENTER;
967     info = svj0(jobV, A, d, sva, V, eps, safeMin, tol, nSweep, work);
968     CHECKPOINT_LEAVE;
969 
970     return info;
971 }
972 
973 } } // namespace lapack, flens
974 
975 #endif // FLENS_LAPACK_SVD_SVJ0_TCC