1       SUBROUTINE DLASD2( NL, NR, SQRE, K, D, Z, ALPHA, BETA, U, LDU, VT,
  2      $                   LDVT, DSIGMA, U2, LDU2, VT2, LDVT2, IDXP, IDX,
  3      $                   IDXC, IDXQ, COLTYP, INFO )
  4 *
  5 *  -- LAPACK auxiliary routine (version 3.2) --
  6 *  -- LAPACK is a software package provided by Univ. of Tennessee,    --
  7 *  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
  8 *     November 2006
  9 *
 10 *     .. Scalar Arguments ..
 11       INTEGER            INFO, K, LDU, LDU2, LDVT, LDVT2, NL, NR, SQRE
 12       DOUBLE PRECISION   ALPHA, BETA
 13 *     ..
 14 *     .. Array Arguments ..
 15       INTEGER            COLTYP( * ), IDX( * ), IDXC( * ), IDXP( * ),
 16      $                   IDXQ( * )
 17       DOUBLE PRECISION   D( * ), DSIGMA( * ), U( LDU, * ),
 18      $                   U2( LDU2, * ), VT( LDVT, * ), VT2( LDVT2, * ),
 19      $                   Z( * )
 20 *     ..
 21 *
 22 *  Purpose
 23 *  =======
 24 *
 25 *  DLASD2 merges the two sets of singular values together into a single
 26 *  sorted set.  Then it tries to deflate the size of the problem.
 27 *  There are two ways in which deflation can occur:  when two or more
 28 *  singular values are close together or if there is a tiny entry in the
 29 *  Z vector.  For each such occurrence the order of the related secular
 30 *  equation problem is reduced by one.
 31 *
 32 *  DLASD2 is called from DLASD1.
 33 *
 34 *  Arguments
 35 *  =========
 36 *
 37 *  NL     (input) INTEGER
 38 *         The row dimension of the upper block.  NL >= 1.
 39 *
 40 *  NR     (input) INTEGER
 41 *         The row dimension of the lower block.  NR >= 1.
 42 *
 43 *  SQRE   (input) INTEGER
 44 *         = 0: the lower block is an NR-by-NR square matrix.
 45 *         = 1: the lower block is an NR-by-(NR+1) rectangular matrix.
 46 *
 47 *         The bidiagonal matrix has N = NL + NR + 1 rows and
 48 *         M = N + SQRE >= N columns.
 49 *
 50 *  K      (output) INTEGER
 51 *         Contains the dimension of the non-deflated matrix,
 52 *         This is the order of the related secular equation. 1 <= K <=N.
 53 *
 54 *  D      (input/output) DOUBLE PRECISION array, dimension(N)
 55 *         On entry D contains the singular values of the two submatrices
 56 *         to be combined.  On exit D contains the trailing (N-K) updated
 57 *         singular values (those which were deflated) sorted into
 58 *         increasing order.
 59 *
 60 *  Z      (output) DOUBLE PRECISION array, dimension(N)
 61 *         On exit Z contains the updating row vector in the secular
 62 *         equation.
 63 *
 64 *  ALPHA  (input) DOUBLE PRECISION
 65 *         Contains the diagonal element associated with the added row.
 66 *
 67 *  BETA   (input) DOUBLE PRECISION
 68 *         Contains the off-diagonal element associated with the added
 69 *         row.
 70 *
 71 *  U      (input/output) DOUBLE PRECISION array, dimension(LDU,N)
 72 *         On entry U contains the left singular vectors of two
 73 *         submatrices in the two square blocks with corners at (1,1),
 74 *         (NL, NL), and (NL+2, NL+2), (N,N).
 75 *         On exit U contains the trailing (N-K) updated left singular
 76 *         vectors (those which were deflated) in its last N-K columns.
 77 *
 78 *  LDU    (input) INTEGER
 79 *         The leading dimension of the array U.  LDU >= N.
 80 *
 81 *  VT     (input/output) DOUBLE PRECISION array, dimension(LDVT,M)
 82 *         On entry VT**T contains the right singular vectors of two
 83 *         submatrices in the two square blocks with corners at (1,1),
 84 *         (NL+1, NL+1), and (NL+2, NL+2), (M,M).
 85 *         On exit VT**T contains the trailing (N-K) updated right singular
 86 *         vectors (those which were deflated) in its last N-K columns.
 87 *         In case SQRE =1, the last row of VT spans the right null
 88 *         space.
 89 *
 90 *  LDVT   (input) INTEGER
 91 *         The leading dimension of the array VT.  LDVT >= M.
 92 *
 93 *  DSIGMA (output) DOUBLE PRECISION array, dimension (N)
 94 *         Contains a copy of the diagonal elements (K-1 singular values
 95 *         and one zero) in the secular equation.
 96 *
 97 *  U2     (output) DOUBLE PRECISION array, dimension(LDU2,N)
 98 *         Contains a copy of the first K-1 left singular vectors which
 99 *         will be used by DLASD3 in a matrix multiply (DGEMM) to solve
100 *         for the new left singular vectors. U2 is arranged into four
101 *         blocks. The first block contains a column with 1 at NL+1 and
102 *         zero everywhere else; the second block contains non-zero
103 *         entries only at and above NL; the third contains non-zero
104 *         entries only below NL+1; and the fourth is dense.
105 *
106 *  LDU2   (input) INTEGER
107 *         The leading dimension of the array U2.  LDU2 >= N.
108 *
109 *  VT2    (output) DOUBLE PRECISION array, dimension(LDVT2,N)
110 *         VT2**T contains a copy of the first K right singular vectors
111 *         which will be used by DLASD3 in a matrix multiply (DGEMM) to
112 *         solve for the new right singular vectors. VT2 is arranged into
113 *         three blocks. The first block contains a row that corresponds
114 *         to the special 0 diagonal element in SIGMA; the second block
115 *         contains non-zeros only at and before NL +1; the third block
116 *         contains non-zeros only at and after  NL +2.
117 *
118 *  LDVT2  (input) INTEGER
119 *         The leading dimension of the array VT2.  LDVT2 >= M.
120 *
121 *  IDXP   (workspace) INTEGER array dimension(N)
122 *         This will contain the permutation used to place deflated
123 *         values of D at the end of the array. On output IDXP(2:K)
124 *         points to the nondeflated D-values and IDXP(K+1:N)
125 *         points to the deflated singular values.
126 *
127 *  IDX    (workspace) INTEGER array dimension(N)
128 *         This will contain the permutation used to sort the contents of
129 *         D into ascending order.
130 *
131 *  IDXC   (output) INTEGER array dimension(N)
132 *         This will contain the permutation used to arrange the columns
133 *         of the deflated U matrix into three groups:  the first group
134 *         contains non-zero entries only at and above NL, the second
135 *         contains non-zero entries only below NL+2, and the third is
136 *         dense.
137 *
138 *  IDXQ   (input/output) INTEGER array dimension(N)
139 *         This contains the permutation which separately sorts the two
140 *         sub-problems in D into ascending order.  Note that entries in
141 *         the first hlaf of this permutation must first be moved one
142 *         position backward; and entries in the second half
143 *         must first have NL+1 added to their values.
144 *
145 *  COLTYP (workspace/output) INTEGER array dimension(N)
146 *         As workspace, this will contain a label which will indicate
147 *         which of the following types a column in the U2 matrix or a
148 *         row in the VT2 matrix is:
149 *         1 : non-zero in the upper half only
150 *         2 : non-zero in the lower half only
151 *         3 : dense
152 *         4 : deflated
153 *
154 *         On exit, it is an array of dimension 4, with COLTYP(I) being
155 *         the dimension of the I-th type columns.
156 *
157 *  INFO   (output) INTEGER
158 *          = 0:  successful exit.
159 *          < 0:  if INFO = -i, the i-th argument had an illegal value.
160 *
161 *  Further Details
162 *  ===============
163 *
164 *  Based on contributions by
165 *     Ming Gu and Huan Ren, Computer Science Division, University of
166 *     California at Berkeley, USA
167 *
168 *  =====================================================================
169 *
170 *     .. Parameters ..
171       DOUBLE PRECISION   ZERO, ONE, TWO, EIGHT
172       PARAMETER          ( ZERO = 0.0D+0, ONE = 1.0D+0, TWO = 2.0D+0,
173      $                   EIGHT = 8.0D+0 )
174 *     ..
175 *     .. Local Arrays ..
176       INTEGER            CTOT( 4 ), PSM( 4 )
177 *     ..
178 *     .. Local Scalars ..
179       INTEGER            CT, I, IDXI, IDXJ, IDXJP, J, JP, JPREV, K2, M,
180      $                   N, NLP1, NLP2
181       DOUBLE PRECISION   C, EPS, HLFTOL, S, TAU, TOL, Z1
182 *     ..
183 *     .. External Functions ..
184       DOUBLE PRECISION   DLAMCH, DLAPY2
185       EXTERNAL           DLAMCH, DLAPY2
186 *     ..
187 *     .. External Subroutines ..
188       EXTERNAL           DCOPY, DLACPY, DLAMRG, DLASET, DROT, XERBLA
189 *     ..
190 *     .. Intrinsic Functions ..
191       INTRINSIC          ABSMAX
192 *     ..
193 *     .. Executable Statements ..
194 *
195 *     Test the input parameters.
196 *
197       INFO = 0
198 *
199       IF( NL.LT.1 ) THEN
200          INFO = -1
201       ELSE IF( NR.LT.1 ) THEN
202          INFO = -2
203       ELSE IF( ( SQRE.NE.1 ) .AND. ( SQRE.NE.0 ) ) THEN
204          INFO = -3
205       END IF
206 *
207       N = NL + NR + 1
208       M = N + SQRE
209 *
210       IF( LDU.LT.N ) THEN
211          INFO = -10
212       ELSE IF( LDVT.LT.M ) THEN
213          INFO = -12
214       ELSE IF( LDU2.LT.N ) THEN
215          INFO = -15
216       ELSE IF( LDVT2.LT.M ) THEN
217          INFO = -17
218       END IF
219       IF( INFO.NE.0 ) THEN
220          CALL XERBLA( 'DLASD2'-INFO )
221          RETURN
222       END IF
223 *
224       NLP1 = NL + 1
225       NLP2 = NL + 2
226 *
227 *     Generate the first part of the vector Z; and move the singular
228 *     values in the first part of D one position backward.
229 *
230       Z1 = ALPHA*VT( NLP1, NLP1 )
231       Z( 1 ) = Z1
232       DO 10 I = NL, 1-1
233          Z( I+1 ) = ALPHA*VT( I, NLP1 )
234          D( I+1 ) = D( I )
235          IDXQ( I+1 ) = IDXQ( I ) + 1
236    10 CONTINUE
237 *
238 *     Generate the second part of the vector Z.
239 *
240       DO 20 I = NLP2, M
241          Z( I ) = BETA*VT( I, NLP2 )
242    20 CONTINUE
243 *
244 *     Initialize some reference arrays.
245 *
246       DO 30 I = 2, NLP1
247          COLTYP( I ) = 1
248    30 CONTINUE
249       DO 40 I = NLP2, N
250          COLTYP( I ) = 2
251    40 CONTINUE
252 *
253 *     Sort the singular values into increasing order
254 *
255       DO 50 I = NLP2, N
256          IDXQ( I ) = IDXQ( I ) + NLP1
257    50 CONTINUE
258 *
259 *     DSIGMA, IDXC, IDXC, and the first column of U2
260 *     are used as storage space.
261 *
262       DO 60 I = 2, N
263          DSIGMA( I ) = D( IDXQ( I ) )
264          U2( I, 1 ) = Z( IDXQ( I ) )
265          IDXC( I ) = COLTYP( IDXQ( I ) )
266    60 CONTINUE
267 *
268       CALL DLAMRG( NL, NR, DSIGMA( 2 ), 11, IDX( 2 ) )
269 *
270       DO 70 I = 2, N
271          IDXI = 1 + IDX( I )
272          D( I ) = DSIGMA( IDXI )
273          Z( I ) = U2( IDXI, 1 )
274          COLTYP( I ) = IDXC( IDXI )
275    70 CONTINUE
276 *
277 *     Calculate the allowable deflation tolerance
278 *
279       EPS = DLAMCH( 'Epsilon' )
280       TOL = MAXABS( ALPHA ), ABS( BETA ) )
281       TOL = EIGHT*EPS*MAXABS( D( N ) ), TOL )
282 *
283 *     There are 2 kinds of deflation -- first a value in the z-vector
284 *     is small, second two (or more) singular values are very close
285 *     together (their difference is small).
286 *
287 *     If the value in the z-vector is small, we simply permute the
288 *     array so that the corresponding singular value is moved to the
289 *     end.
290 *
291 *     If two values in the D-vector are close, we perform a two-sided
292 *     rotation designed to make one of the corresponding z-vector
293 *     entries zero, and then permute the array so that the deflated
294 *     singular value is moved to the end.
295 *
296 *     If there are multiple singular values then the problem deflates.
297 *     Here the number of equal singular values are found.  As each equal
298 *     singular value is found, an elementary reflector is computed to
299 *     rotate the corresponding singular subspace so that the
300 *     corresponding components of Z are zero in this new basis.
301 *
302       K = 1
303       K2 = N + 1
304       DO 80 J = 2, N
305          IFABS( Z( J ) ).LE.TOL ) THEN
306 *
307 *           Deflate due to small z component.
308 *
309             K2 = K2 - 1
310             IDXP( K2 ) = J
311             COLTYP( J ) = 4
312             IF( J.EQ.N )
313      $         GO TO 120
314          ELSE
315             JPREV = J
316             GO TO 90
317          END IF
318    80 CONTINUE
319    90 CONTINUE
320       J = JPREV
321   100 CONTINUE
322       J = J + 1
323       IF( J.GT.N )
324      $   GO TO 110
325       IFABS( Z( J ) ).LE.TOL ) THEN
326 *
327 *        Deflate due to small z component.
328 *
329          K2 = K2 - 1
330          IDXP( K2 ) = J
331          COLTYP( J ) = 4
332       ELSE
333 *
334 *        Check if singular values are close enough to allow deflation.
335 *
336          IFABS( D( J )-D( JPREV ) ).LE.TOL ) THEN
337 *
338 *           Deflation is possible.
339 *
340             S = Z( JPREV )
341             C = Z( J )
342 *
343 *           Find sqrt(a**2+b**2) without overflow or
344 *           destructive underflow.
345 *
346             TAU = DLAPY2( C, S )
347             C = C / TAU
348             S = -/ TAU
349             Z( J ) = TAU
350             Z( JPREV ) = ZERO
351 *
352 *           Apply back the Givens rotation to the left and right
353 *           singular vector matrices.
354 *
355             IDXJP = IDXQ( IDX( JPREV )+1 )
356             IDXJ = IDXQ( IDX( J )+1 )
357             IF( IDXJP.LE.NLP1 ) THEN
358                IDXJP = IDXJP - 1
359             END IF
360             IF( IDXJ.LE.NLP1 ) THEN
361                IDXJ = IDXJ - 1
362             END IF
363             CALL DROT( N, U( 1, IDXJP ), 1, U( 1, IDXJ ), 1, C, S )
364             CALL DROT( M, VT( IDXJP, 1 ), LDVT, VT( IDXJ, 1 ), LDVT, C,
365      $                 S )
366             IF( COLTYP( J ).NE.COLTYP( JPREV ) ) THEN
367                COLTYP( J ) = 3
368             END IF
369             COLTYP( JPREV ) = 4
370             K2 = K2 - 1
371             IDXP( K2 ) = JPREV
372             JPREV = J
373          ELSE
374             K = K + 1
375             U2( K, 1 ) = Z( JPREV )
376             DSIGMA( K ) = D( JPREV )
377             IDXP( K ) = JPREV
378             JPREV = J
379          END IF
380       END IF
381       GO TO 100
382   110 CONTINUE
383 *
384 *     Record the last singular value.
385 *
386       K = K + 1
387       U2( K, 1 ) = Z( JPREV )
388       DSIGMA( K ) = D( JPREV )
389       IDXP( K ) = JPREV
390 *
391   120 CONTINUE
392 *
393 *     Count up the total number of the various types of columns, then
394 *     form a permutation which positions the four column types into
395 *     four groups of uniform structure (although one or more of these
396 *     groups may be empty).
397 *
398       DO 130 J = 14
399          CTOT( J ) = 0
400   130 CONTINUE
401       DO 140 J = 2, N
402          CT = COLTYP( J )
403          CTOT( CT ) = CTOT( CT ) + 1
404   140 CONTINUE
405 *
406 *     PSM(*) = Position in SubMatrix (of types 1 through 4)
407 *
408       PSM( 1 ) = 2
409       PSM( 2 ) = 2 + CTOT( 1 )
410       PSM( 3 ) = PSM( 2 ) + CTOT( 2 )
411       PSM( 4 ) = PSM( 3 ) + CTOT( 3 )
412 *
413 *     Fill out the IDXC array so that the permutation which it induces
414 *     will place all type-1 columns first, all type-2 columns next,
415 *     then all type-3's, and finally all type-4's, starting from the
416 *     second column. This applies similarly to the rows of VT.
417 *
418       DO 150 J = 2, N
419          JP = IDXP( J )
420          CT = COLTYP( JP )
421          IDXC( PSM( CT ) ) = J
422          PSM( CT ) = PSM( CT ) + 1
423   150 CONTINUE
424 *
425 *     Sort the singular values and corresponding singular vectors into
426 *     DSIGMA, U2, and VT2 respectively.  The singular values/vectors
427 *     which were not deflated go into the first K slots of DSIGMA, U2,
428 *     and VT2 respectively, while those which were deflated go into the
429 *     last N - K slots, except that the first column/row will be treated
430 *     separately.
431 *
432       DO 160 J = 2, N
433          JP = IDXP( J )
434          DSIGMA( J ) = D( JP )
435          IDXJ = IDXQ( IDX( IDXP( IDXC( J ) ) )+1 )
436          IF( IDXJ.LE.NLP1 ) THEN
437             IDXJ = IDXJ - 1
438          END IF
439          CALL DCOPY( N, U( 1, IDXJ ), 1, U2( 1, J ), 1 )
440          CALL DCOPY( M, VT( IDXJ, 1 ), LDVT, VT2( J, 1 ), LDVT2 )
441   160 CONTINUE
442 *
443 *     Determine DSIGMA(1), DSIGMA(2) and Z(1)
444 *
445       DSIGMA( 1 ) = ZERO
446       HLFTOL = TOL / TWO
447       IFABS( DSIGMA( 2 ) ).LE.HLFTOL )
448      $   DSIGMA( 2 ) = HLFTOL
449       IF( M.GT.N ) THEN
450          Z( 1 ) = DLAPY2( Z1, Z( M ) )
451          IF( Z( 1 ).LE.TOL ) THEN
452             C = ONE
453             S = ZERO
454             Z( 1 ) = TOL
455          ELSE
456             C = Z1 / Z( 1 )
457             S = Z( M ) / Z( 1 )
458          END IF
459       ELSE
460          IFABS( Z1 ).LE.TOL ) THEN
461             Z( 1 ) = TOL
462          ELSE
463             Z( 1 ) = Z1
464          END IF
465       END IF
466 *
467 *     Move the rest of the updating row to Z.
468 *
469       CALL DCOPY( K-1, U2( 21 ), 1, Z( 2 ), 1 )
470 *
471 *     Determine the first column of U2, the first row of VT2 and the
472 *     last row of VT.
473 *
474       CALL DLASET( 'A', N, 1, ZERO, ZERO, U2, LDU2 )
475       U2( NLP1, 1 ) = ONE
476       IF( M.GT.N ) THEN
477          DO 170 I = 1, NLP1
478             VT( M, I ) = -S*VT( NLP1, I )
479             VT2( 1, I ) = C*VT( NLP1, I )
480   170    CONTINUE
481          DO 180 I = NLP2, M
482             VT2( 1, I ) = S*VT( M, I )
483             VT( M, I ) = C*VT( M, I )
484   180    CONTINUE
485       ELSE
486          CALL DCOPY( M, VT( NLP1, 1 ), LDVT, VT2( 11 ), LDVT2 )
487       END IF
488       IF( M.GT.N ) THEN
489          CALL DCOPY( M, VT( M, 1 ), LDVT, VT2( M, 1 ), LDVT2 )
490       END IF
491 *
492 *     The deflated singular values and their corresponding vectors go
493 *     into the back of D, U, and V respectively.
494 *
495       IF( N.GT.K ) THEN
496          CALL DCOPY( N-K, DSIGMA( K+1 ), 1, D( K+1 ), 1 )
497          CALL DLACPY( 'A', N, N-K, U2( 1, K+1 ), LDU2, U( 1, K+1 ),
498      $                LDU )
499          CALL DLACPY( 'A', N-K, M, VT2( K+11 ), LDVT2, VT( K+11 ),
500      $                LDVT )
501       END IF
502 *
503 *     Copy CTOT into COLTYP for referencing in DLASD3.
504 *
505       DO 190 J = 14
506          COLTYP( J ) = CTOT( J )
507   190 CONTINUE
508 *
509       RETURN
510 *
511 *     End of DLASD2
512 *
513       END