1 /*
  2  *   Copyright (c) 2012, Michael Lehn
  3  *
  4  *   All rights reserved.
  5  *
  6  *   Redistribution and use in source and binary forms, with or without
  7  *   modification, are permitted provided that the following conditions
  8  *   are met:
  9  *
 10  *   1) Redistributions of source code must retain the above copyright
 11  *      notice, this list of conditions and the following disclaimer.
 12  *   2) Redistributions in binary form must reproduce the above copyright
 13  *      notice, this list of conditions and the following disclaimer in
 14  *      the documentation and/or other materials provided with the
 15  *      distribution.
 16  *   3) Neither the name of the FLENS development group nor the names of
 17  *      its contributors may be used to endorse or promote products derived
 18  *      from this software without specific prior written permission.
 19  *
 20  *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 21  *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 22  *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 23  *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 24  *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 25  *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 26  *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 27  *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 28  *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 29  *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 30  *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 31  */
 32 
 33 /* Based on
 34  *
 35        SUBROUTINE DGEQP3( M, N, A, LDA, JPVT, TAU, WORK, LWORK, INFO )
 36  *
 37  *  -- LAPACK routine (version 3.3.1) --
 38  *  -- LAPACK is a software package provided by Univ. of Tennessee,    --
 39  *  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
 40  *  -- April 2011                                                      --
 41  */
 42 
 43 #ifndef FLENS_LAPACK_QR_QP3_TCC
 44 #define FLENS_LAPACK_QR_QP3_TCC 1
 45 
 46 #include <flens/blas/blas.h>
 47 #include <flens/lapack/lapack.h>
 48 
 49 namespace flens { namespace lapack {
 50 
 51 //== generic lapack implementation =============================================
 52 
 53 template <typename MA, typename JPIV, typename VTAU, typename VWORK>
 54 void
 55 qp3_generic(GeMatrix<MA> &A, DenseVector<JPIV> &jPiv, DenseVector<VTAU> &tau,
 56             DenseVector<VWORK> &work)
 57 {
 58     using std::max;
 59     using std::min;
 60 
 61     typedef typename GeMatrix<MA>::ElementType  T;
 62     typedef typename GeMatrix<MA>::IndexType    IndexType;
 63 
 64     const Underscore<IndexType> _;
 65 
 66     const IndexType m     = A.numRows();
 67     const IndexType n     = A.numCols();
 68     const IndexType minmn = min(m, n);
 69 
 70     IndexType iws, lwOpt;
 71 
 72     if (minmn==0) {
 73         iws   = 1;
 74         lwOpt = 1;
 75     } else {
 76         iws = 3*n + 1;
 77         const IndexType nb = ilaenv<T>(1"GEQRF""", m, n);
 78         lwOpt = 2*n +(n+1)*nb;
 79     }
 80 
 81     if (work.length()==0) {
 82         work.resize(lwOpt);
 83     }
 84     work(1) = lwOpt;
 85 
 86     IndexType lWork = work.length();
 87 
 88 //
 89 //  Quick return if possible.
 90 //
 91     if (minmn==0) {
 92         return;
 93     }
 94 //
 95 //  Move initial columns up front.
 96 //
 97     IndexType  nFixed = 1;
 98     for (IndexType j=1; j<=n; ++j) {
 99         if (jPiv(j)!=0) {
100             if (j!=nFixed) {
101                 blas::swap(A(_,j), A(_,nFixed));
102                 jPiv(j)      = jPiv(nFixed);
103                 jPiv(nFixed) = j;
104             } else {
105                 jPiv(j) = j;
106             }
107             ++nFixed;
108         } else {
109             jPiv(j) = j;
110         }
111     }
112     --nFixed;
113 //
114 //  Factorize fixed columns
115 //  =======================
116 //
117 //  Compute the QR factorization of fixed columns and update
118 //  remaining columns.
119 //
120     if (nFixed>0) {
121         const IndexType na = min(m, nFixed);
122         auto A1 = A(_,_(1,na));
123         auto A2 = A(_,_(na+1,n));
124         auto tau1 = tau(_(1,na));
125 
126         std::cerr << "-> qrf" << std::endl;
127         qrf(A1, tau1, work);
128         std::cerr << "qrf." << std::endl;
129         iws = max(iws, IndexType(work(1)));
130         if (na<n) {
131             std::cerr << "-> ormqr" << std::endl;
132             ormqr(Left, Trans, A1, tau1, A2, work);
133             std::cerr << "ormqr." << std::endl;
134             iws = max(iws, IndexType(work(1)));
135         }
136     }
137 //
138 //  Factorize free columns
139 //  ======================
140 //
141     if (nFixed<minmn) {
142 
143         IndexType sm = m - nFixed;
144         IndexType sn = n - nFixed;
145         IndexType sminmn = minmn - nFixed;
146 //
147 //      Determine the block size.
148 //
149         IndexType nb = ilaenv<T>(1"GEQRF""", sm, sn);
150         IndexType nbMin = 2;
151         IndexType nx = 0;
152 
153         if (nb>1 && nb<sminmn) {
154 //
155 //          Determine when to cross over from blocked to unblocked code.
156 //
157             nx = max(IndexType(0), ilaenv<T>(3"GEQRF""", sm, sn));
158 //
159 //
160             if (nx<sminmn) {
161 //
162 //              Determine if workspace is large enough for blocked code.
163 //
164                 IndexType minWs = 2*sn + (sn+1)*nb;
165                 iws = max(iws, minWs);
166                 if (lWork<minWs) {
167 //
168 //                  Not enough workspace to use optimal NB: Reduce NB and
169 //                  determine the minimum value of NB.
170 //
171                     nb = (lWork-2*sn) / (sn+1);
172                     nbMin = max(IndexType(2),
173                                 ilaenv<T>(2"GEQRF""", sm, sn));
174 
175                 }
176             }
177         }
178 //
179 //      Initialize partial column norms. The first N elements of work
180 //      store the exact column norms.
181 //
182         for (IndexType j=nFixed+1; j<=n; ++j) {
183             work(j) = blas::nrm2(A(_(nFixed+1,m),j));
184             work(n+j) = work(j);
185         }
186 
187         IndexType j;
188 
189         if (nb>=nbMin && nb<sminmn && nx<sminmn) {
190 //
191 //          Use blocked code initially.
192 //
193             j = nFixed + 1;
194 //
195 //          Compute factorization: while loop.
196 //
197 //
198             const IndexType topbmn = minmn - nx;
199             while (j<=topbmn) {
200                 const IndexType jb = min(nb, topbmn-j+1);
201 //
202 //              Factorize JB columns among columns J:N.
203 //
204                 IndexType fjb;
205                 auto _A    = A(_,_(j,n));
206                 auto _jPiv = jPiv(_(j,n));
207                 auto _tau  = tau(_(j,min(j+jb-1,minmn)));
208                 auto vn1   = work(_(j,n));
209                 auto vn2   = work(_(j+n,2*n));
210                 auto aux   = work(_(2*n+12*n+jb));
211 
212                 IndexType fLen = jb*(n-j+1);
213                 auto      f    = work(_(2*n+jb+1,2*n+jb+fLen));
214 
215                 GeMatrixView<T> F(n-j+1, jb, f, n-j+1);
216 
217                 laqps(j-1, jb, fjb, _A, _jPiv, _tau, vn1, vn2, aux, F);
218 
219                 j += fjb;
220             }
221         } else {
222             j = nFixed + 1;
223         }
224 //
225 //      Use unblocked code to factor the last or only block.
226 //
227 //
228         if (j<=minmn) {
229             auto _A    = A(_,_(j,n));
230             auto _jPiv = jPiv(_(j,n));
231             auto _tau  = tau(_(j,minmn));
232             auto vn1   = work(_(j,n));
233             auto vn2   = work(_(j+n,2*n));
234             auto _work = work(_(2*n+13*n+1-j));
235 
236             laqp2(j-1, _A, _jPiv, _tau, vn1, vn2, _work);
237         }
238 
239     }
240 
241     work(1) = iws;
242 }
243 
244 //== interface for native lapack ===============================================
245 
246 #ifdef CHECK_CXXLAPACK
247 
248 template <typename MA, typename JPIV, typename VTAU, typename VWORK>
249 void
250 qp3_native(GeMatrix<MA> &A, DenseVector<JPIV> &jPiv,  DenseVector<VTAU> &tau,
251            DenseVector<VWORK> &work)
252 {
253     typedef typename GeMatrix<MA>::ElementType  T;
254 
255     const INTEGER    M = A.numRows();
256     const INTEGER    N = A.numCols();
257     const INTEGER    LDA = A.leadingDimension();
258     INTEGER          LWORK = work.length();
259     INTEGER          INFO;
260 
261     if (IsSame<T, DOUBLE>::value) {
262         LAPACK_IMPL(dgeqp3)(&M, &N, A.data(), &LDA,
263                             jPiv.data(), tau.data(),
264                             work.data(), &LWORK,
265                             &INFO);
266         assert(INFO==0);
267     } else {
268         ASSERT(0);
269     }
270 }
271 
272 #endif // CHECK_CXXLAPACK
273 
274 //== public interface ==========================================================
275 
276 template <typename MA, typename JPIV, typename VTAU, typename VWORK>
277 void
278 qp3(GeMatrix<MA> &A, DenseVector<JPIV> &jPiv, DenseVector<VTAU> &tau,
279     DenseVector<VWORK> &work)
280 {
281     std::cerr << "enter: qp3" << std::endl;
282 
283     using std::min;
284     typedef typename GeMatrix<MA>::ElementType  ElementType;
285     typedef typename GeMatrix<MA>::IndexType    IndexType;
286 
287 #   ifndef NDEBUG
288 //
289 //  Test the input parameters
290 //
291     ASSERT(A.firstRow()==1);
292     ASSERT(A.firstCol()==1);
293     ASSERT(jPiv.firstIndex()==1);
294     ASSERT(tau.firstIndex()==1);
295     ASSERT(work.firstIndex()==1);
296 
297     const IndexType m = A.numRows();
298     const IndexType n = A.numCols();
299     const IndexType k = min(m, n);
300 
301     ASSERT(jPiv.length()==n);
302     ASSERT(tau.length()==k);
303     ASSERT(work.length()>=3*n+1 || work.length()==IndexType(0));
304 #   endif
305 
306 #   ifdef CHECK_CXXLAPACK
307 //
308 //  Make copies of output arguments
309 //
310     typename GeMatrix<MA>::NoView       A_org      = A;
311     typename DenseVector<JPIV>::NoView  jPiv_org   = jPiv;
312     typename DenseVector<VTAU>::NoView  tau_org    = tau;
313     typename DenseVector<VWORK>::NoView work_org   = work;
314 #   endif
315 
316 //
317 //  Call implementation
318 //
319     qp3_generic(A, jPiv, tau, work);
320 
321 #   ifdef CHECK_CXXLAPACK
322 //
323 //  Restore output arguments
324 //
325     typename GeMatrix<MA>::NoView       A_generic    = A;
326     typename DenseVector<JPIV>::NoView  jPiv_generic = jPiv;
327     typename DenseVector<VTAU>::NoView  tau_generic  = tau;
328     typename DenseVector<VWORK>::NoView work_generic = work;
329 
330     A    = A_org;
331     jPiv = jPiv_org;
332     tau  = tau_org;
333 
334     // if the generic implementation resized work due to a work size query
335     // we must not restore the work array
336     if (work_org.length()>0) {
337         work = work_org;
338     } else {
339         work = 0;
340     }
341 //
342 //  Compare results
343 //
344     qp3_native(A, jPiv, tau, work);
345 
346     bool failed = false;
347     if (! isIdentical(A_generic, A, "A_generic""A")) {
348         std::cerr << "CXXLAPACK: A_generic = " << A_generic << std::endl;
349         std::cerr << "F77LAPACK: A = " << A << std::endl;
350         failed = true;
351     }
352 
353     if (! isIdentical(jPiv_generic, jPiv, "jPiv_generic""jPiv")) {
354         std::cerr << "CXXLAPACK: jPiv_generic = " << jPiv_generic << std::endl;
355         std::cerr << "F77LAPACK: jPiv = " << jPiv << std::endl;
356         failed = true;
357     }
358 
359     if (! isIdentical(tau_generic, tau, "tau_generic""tau")) {
360         std::cerr << "CXXLAPACK: tau_generic = " << tau_generic << std::endl;
361         std::cerr << "F77LAPACK: tau = " << tau << std::endl;
362         failed = true;
363     }
364 
365     if (! isIdentical(work_generic, work, "work_generic""work")) {
366         std::cerr << "CXXLAPACK: work_generic = " << work_generic << std::endl;
367         std::cerr << "F77LAPACK: work = " << work << std::endl;
368         failed = true;
369     }
370 
371     if (failed) {
372         ASSERT(0);
373     }
374 #   endif
375 
376     std::cerr << "leave: qp3" << std::endl;
377 }
378 
379 //-- forwarding ----------------------------------------------------------------
380 template <typename MA, typename JPIV, typename VTAU, typename VWORK>
381 void
382 qp3(MA &&A, JPIV &&jPiv, VTAU &&tau, VWORK &&work)
383 {
384     CHECKPOINT_ENTER;
385     qp3(A, jPiv, tau, work);
386     CHECKPOINT_LEAVE;
387 }
388 
389 } } // namespace lapack, flens
390 
391 #endif // FLENS_LAPACK_QR_QP3_TCC