1 /*
  2  *   Copyright (c) 2007, Michael Lehn
  3  *
  4  *   All rights reserved.
  5  *
  6  *   Redistribution and use in source and binary forms, with or without
  7  *   modification, are permitted provided that the following conditions
  8  *   are met:
  9  *
 10  *   1) Redistributions of source code must retain the above copyright
 11  *      notice, this list of conditions and the following disclaimer.
 12  *   2) Redistributions in binary form must reproduce the above copyright
 13  *      notice, this list of conditions and the following disclaimer in
 14  *      the documentation and/or other materials provided with the
 15  *      distribution.
 16  *   3) Neither the name of the FLENS development group nor the names of
 17  *      its contributors may be used to endorse or promote products derived
 18  *      from this software without specific prior written permission.
 19  *
 20  *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 21  *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 22  *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 23  *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 24  *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 25  *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 26  *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 27  *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 28  *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 29  *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 30  *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 31  */
 32 
 33 #ifndef FLENS_BLAS_CLOSURES_AXPY_TCC
 34 #define FLENS_BLAS_CLOSURES_AXPY_TCC 1
 35 
 36 #include <flens/aux/aux.h>
 37 #include <flens/blas/closures/debugclosure.h>
 38 #include <flens/blas/closures/mmswitch.h>
 39 #include <flens/blas/closures/mvswitch.h>
 40 #include <flens/blas/closures/prune.h>
 41 #include <flens/blas/closures/result.h>
 42 #include <flens/blas/level1/level1.h>
 43 #include <flens/blas/level2/level2.h>
 44 #include <flens/typedefs.h>
 45 
 46 #ifdef FLENS_DEBUG_CLOSURES
 47 #   include <flens/blas/blaslogon.h>
 48 #else
 49 #   include <flens/blas/blaslogoff.h>
 50 #endif
 51 
 52 namespace flens { namespace blas {
 53 
 54 //
 55 //== vector closures ===========================================================
 56 //
 57 
 58 //
 59 //  y += alpha*(x1+x2)
 60 //
 61 #ifndef FLENS_DEBUG_CLOSURES
 62 
 63 template <typename ALPHA, typename VL, typename VR, typename VY>
 64 void
 65 axpy(const ALPHA &, const VectorClosure<OpAdd, VL, VR> &, Vector<VY> &)
 66 {
 67 //
 68 //  Note that y += x1+x2  or  y -= x1+x2 is equivalent to y = y + (x1+x2) or
 69 //  y = y - (x1+x2) respectively.  Hence, the result of x1+x2 has to be
 70 //  computed *before* we can update y.  This means that a temporary vectors is
 71 //  needed to hold the result of x1+x2.  We only allow this if the
 72 //  FLENS_DEBUG_CLOSURES macro is defined such that a user has a chance to
 73 //  optimize his/her expressions.
 74 //
 75     ASSERT(0);
 76 }
 77 
 78 #else
 79 
 80 template <typename ALPHA, typename VL, typename VR, typename VY>
 81 void
 82 axpy(const ALPHA &alpha,
 83      const VectorClosure<OpAdd, VL, VR> &x, Vector<VY> &y)
 84 {
 85     FLENS_BLASLOG_BEGIN_AXPY(alpha, x, y);
 86     typedef VectorClosure<OpAdd, VL, VR>  VC;
 87 
 88 //
 89 //  Compute the result of closure x = (x.left()+x.right()) first and store
 90 //  it in tmp.
 91 //
 92     FLENS_BLASLOG_TMP_TRON;
 93     typename Result<VC>::Type tmp = x;
 94     FLENS_BLASLOG_TMP_TROFF;
 95 //
 96 //  Update y with tmp, i.e. compute y = y + alpha*tmp
 97 //
 98     axpy(alpha, tmp, y.impl());
 99 
100     FLENS_BLASLOG_TMP_REMOVE(tmp, x);
101     FLENS_BLASLOG_END;
102 }
103 
104 #endif
105 
106 //------------------------------------------------------------------------------
107 //
108 //  y += alpha*(x1-x2)
109 //
110 #ifndef FLENS_DEBUG_CLOSURES
111 
112 template <typename ALPHA, typename VL, typename VR, typename VY>
113 void
114 axpy(const ALPHA &, const VectorClosure<OpSub, VL, VR> &, Vector<VY> &)
115 {
116 //
117 //  Note that y += x1-x2  or  y -= x1-x2 is equivalent to y = y + (x1-x2) or
118 //  y = y - (x1-x2) respectively.  Hence, the result of x1-x2 has to be
119 //  computed *before* we can update y.  This means that a temporary vectors is
120 //  needed to hold the result of x1-x2.  We only allow this if the
121 //  FLENS_DEBUG_CLOSURES macro is defined such that a user has a chance to
122 //  optimize his/her expressions.
123 //
124     ASSERT(0);
125 }
126 
127 #else
128 
129 template <typename ALPHA, typename VL, typename VR, typename VY>
130 void
131 axpy(const ALPHA &alpha,
132      const VectorClosure<OpSub, VL, VR> &x, Vector<VY> &y)
133 {
134     FLENS_BLASLOG_BEGIN_AXPY(alpha, x, y);
135     typedef VectorClosure<OpSub, VL, VR>  VC;
136 
137 //
138 //  Compute the result of closure x = (x.left()-x.right()) first and store
139 //  it in tmp.
140 //
141     FLENS_BLASLOG_TMP_TRON;
142     typename Result<VC>::Type tmp = x;
143     FLENS_BLASLOG_TMP_TROFF;
144 //
145 //  Update y with tmp, i.e. compute y = y + alpha*tmp
146 //
147     axpy(alpha, tmp, y.impl());
148 
149     FLENS_BLASLOG_TMP_REMOVE(tmp, x);
150     FLENS_BLASLOG_END;
151 }
152 
153 #endif
154 
155 //------------------------------------------------------------------------------
156 //
157 //  y += scalar*x
158 //
159 //  We evalute this with a scalSwitch
160 //  case 1: x is no closure
161 //  case 2: x is a closure
162 
163 //
164 //  case 1: x is no closure
165 //
166 template <typename ALPHA, typename VX, typename VY>
167 void
168 axpySwitch(const ALPHA &alpha, const Vector<VX> &x, Vector<VY> &y)
169 {
170 //
171 //  No need to add another log-entry as we simply pass-through to the
172 //  BLAS implementation
173 //
174     axpy(alpha, x.impl(), y.impl());
175 }
176 
177 //
178 //  case 2: x is a closure
179 //
180 #ifndef FLENS_DEBUG_CLOSURES
181 
182 template <typename ALPHA, typename Op, typename L, typename R, typename VY>
183 void
184 axpySwitch(const ALPHA &, const VectorClosure<Op, L, R> &, Vector<VY> &)
185 {
186 //  Creation of temporary not allowed
187 
188     ASSERT(0);
189 }
190 
191 #else
192 
193 template <typename ALPHA, typename Op, typename L, typename R, typename VY>
194 void
195 axpySwitch(const ALPHA &alpha, const VectorClosure<Op, L, R> &x, Vector<VY> &y)
196 {
197     FLENS_BLASLOG_BEGIN_AXPY(alpha, x, y);
198     typedef VectorClosure<Op, L, R>  VC;
199 
200 //
201 //  Compute the result of closure x and store it in tmp.
202 //
203     FLENS_BLASLOG_TMP_TRON;
204     typename Result<VC>::Type tmp = x;
205     FLENS_BLASLOG_TMP_TROFF;
206 //
207 //  Update y with tmp, i.e. compute y = y + alpha*tmp
208 //
209     axpy(alpha, tmp, y.impl());
210 
211     FLENS_BLASLOG_TMP_REMOVE(tmp, x);
212     FLENS_BLASLOG_END;
213 }
214 
215 #endif
216 
217 //
218 //  entry point for switch
219 //
220 template <typename ALPHA, typename T, typename VX, typename VY>
221 void
222 axpy(const ALPHA &alpha,
223      const VectorClosure<OpMult, ScalarValue<T>, VX> &x, Vector<VY> &y)
224 {
225     FLENS_BLASLOG_BEGIN_AXPY(alpha, x, y);
226 //
227 //  In the entry point we either have y += x or y -= x
228 //
229     ASSERT(alpha==ALPHA(1) || alpha==ALPHA(-1));
230 //
231 //  Switch: 1) If x is a closure we need a temporary otherwise
232 //          2) call BLAS implementation directly.
233 //
234     axpySwitch(alpha*x.left().value(), x.right(), y.impl());
235 
236     FLENS_BLASLOG_END;
237 }
238 
239 //------------------------------------------------------------------------------
240 //
241 //  y += x/scalar
242 //
243 //  We evalute this with a rscalSwitch
244 //  case 1: x is no closure
245 //  case 2: x is a closure
246 
247 //
248 //  case 1: x is no closure
249 //
250 template <typename ALPHA, typename VX, typename VY>
251 void
252 raxpySwitch(const ALPHA &alpha, const Vector<VX> &x, Vector<VY> &y)
253 {
254 //
255 //  No need to add another log-entry as we simply pass-through to the
256 //  BLAS implementation
257 //
258     raxpy(alpha, x.impl(), y.impl());
259 }
260 
261 //
262 //  case 2: x is a closure
263 //
264 #ifndef FLENS_DEBUG_CLOSURES
265 
266 template <typename ALPHA, typename Op, typename L, typename R, typename VY>
267 void
268 raxpySwitch(const ALPHA &, const VectorClosure<Op, L, R> &, Vector<VY> &)
269 {
270 //
271 //  Creation of temporary not allowed
272 //
273     ASSERT(0);
274 }
275 
276 #else
277 
278 template <typename ALPHA, typename Op, typename L, typename R, typename VY>
279 void
280 raxpySwitch(const ALPHA &alpha, const VectorClosure<Op, L, R> &x, Vector<VY> &y)
281 {
282     FLENS_BLASLOG_BEGIN_RAXPY(alpha, x, y);
283     typedef VectorClosure<Op, L, R>  VC;
284 
285 //
286 //  Compute the result of closure x and store it in tmp.
287 //
288     FLENS_BLASLOG_TMP_TRON;
289     typename Result<VC>::Type tmp = x;
290     FLENS_BLASLOG_TMP_TROFF;
291 //
292 //  Update y with tmp, i.e. compute y = y + tmp/alpha
293 //
294     raxpy(alpha, tmp, y.impl());
295 
296     FLENS_BLASLOG_TMP_REMOVE(tmp, x);
297     FLENS_BLASLOG_END;
298 }
299 
300 #endif
301 
302 //
303 //  entry point for switch
304 //
305 template <typename ALPHA, typename VX, typename T, typename VY>
306 void
307 axpy(const ALPHA &alpha,
308      const VectorClosure<OpDiv, VX, ScalarValue<T> > &x, Vector<VY> &y)
309 {
310     FLENS_BLASLOG_BEGIN_AXPY(alpha, x, y);
311 //
312 //  In the entry point we either have y += x/scalar or y -= x/scalar
313 //
314     ASSERT(alpha==ALPHA(1) || alpha==ALPHA(-1));
315 //
316 //  Switch: 1) If x is a closure we need a temporary otherwise
317 //          2) call BLAS implementation directly.
318 //
319     raxpySwitch(alpha*x.right().value(), x.left(), y.impl());
320 
321     FLENS_BLASLOG_END;
322 }
323 
324 //------------------------------------------------------------------------------
325 //
326 // y += A*x
327 //
328 template <typename ALPHA, typename ML, typename VR, typename VY>
329 typename RestrictTo<ClosureType<OpMult, ML, VR>::isMatrixVectorProduct,
330          void>::Type
331 axpy(const ALPHA &alpha,
332      const VectorClosure<OpMult, ML, VR> &Ax, Vector<VY> &y)
333 {
334     FLENS_BLASLOG_BEGIN_AXPY(alpha, Ax, y);
335 //
336 //  In the entry point we either have y += A*x or y -= A*x
337 //
338     ASSERT(alpha==ALPHA(1) || alpha==ALPHA(-1));
339 
340 //
341 //  If A is a closure then prune arbitrary many OpTrans/OpConj
342 //
343     typedef typename PruneConjTrans<ML>::Remainder MA;
344 
345     Transpose trans = PruneConjTrans<ML>::trans;
346     const MA  &A    = PruneConjTrans<ML>::remainder(Ax.left());
347 //
348 //  If x is a closure it gets evaluated.  In this case a temporary gets
349 //  created.  Otherwise we only keep a reference
350 //
351     FLENS_BLASLOG_TMP_TRON;
352     const typename Result<VR>::Type  &x = Ax.right();
353     FLENS_BLASLOG_TMP_TROFF;
354 //
355 //  Call mv implementation
356 //
357     typename VY::Impl::ElementType  One(1);
358     mvSwitch(Transpose(Trans^trans), alpha, A, x, One, y.impl());
359 
360 //
361 //  If a temporary was created and registered before we now unregister it
362 //
363 #   ifdef FLENS_DEBUG_CLOSURES
364     if (!IsSame<VR, typename Result<VR>::Type>::value) {
365         FLENS_BLASLOG_TMP_REMOVE(x, Ax.right());
366     }
367 #   endif
368 
369     FLENS_BLASLOG_END;
370 }
371 
372 //------------------------------------------------------------------------------
373 //
374 // y += x*A
375 //
376 template <typename ALPHA, typename VL, typename MR, typename VY>
377 typename RestrictTo<ClosureType<OpMult, VL, MR>::isVectorMatrixProduct,
378          void>::Type
379 axpy(const ALPHA &alpha, const VectorClosure<OpMult, VL, MR> &xA, Vector<VY> &y)
380 {
381     FLENS_BLASLOG_BEGIN_AXPY(alpha, xA, y);
382 //
383 //  In the entry point we either have y += x*A or y -= x*A
384 //
385     ASSERT(alpha==ALPHA(1) || alpha==ALPHA(-1));
386 
387 //
388 //  If A is a closure then prune arbitrary many OpTrans/OpConj
389 //
390     typedef typename PruneConjTrans<MR>::Remainder MA;
391 
392     Transpose trans = PruneConjTrans<MR>::trans;
393     const MA  &A    = PruneConjTrans<MR>::remainder(xA.right());
394 //
395 //  If x is a closure it gets evaluated.  In this case a temporary gets
396 //  created.  Otherwise we only keep a reference
397 //
398     FLENS_BLASLOG_TMP_TRON;
399     const typename Result<VL>::Type  &x = xA.left();
400     FLENS_BLASLOG_TMP_TROFF;
401 //
402 //  Call mv implementation
403 //
404     typename VY::Impl::ElementType  One(1);
405     mvSwitch(Transpose(Trans^trans), alpha, A, x, One, y.impl());
406 
407 //
408 //  If a temporary was created and registered before we now unregister it
409 //
410 #   ifdef FLENS_DEBUG_CLOSURES
411     if (!IsSame<VL, typename Result<VL>::Type>::value) {
412         FLENS_BLASLOG_TMP_REMOVE(x, xA.left());
413     }
414 #   endif
415 
416     FLENS_BLASLOG_END;
417 }
418 
419 //------------------------------------------------------------------------------
420 //
421 //  y += <Unknown Closure>
422 //
423 #ifndef FLENS_DEBUG_CLOSURES
424 
425 template <typename ALPHA, typename Op, typename VL, typename VR, typename VY>
426 void
427 axpy(const ALPHA &, const VectorClosure<Op, VL, VR> &, Vector<VY> &)
428 {
429     ASSERT(0);
430 }
431 
432 #else
433 
434 template <typename ALPHA, typename Op, typename VL, typename VR, typename VY>
435 void
436 axpy(const ALPHA &alpha, const VectorClosure<Op, VL, VR> &x, Vector<VY> &y)
437 {
438     FLENS_BLASLOG_ERROR_AXPY(alpha, x, y);
439     ASSERT(0);
440 }
441 
442 #endif
443 
444 //-- matrix closures -----------------------------------------------------------
445 //
446 //  In the following comments op(X) denotes  x, X^T or X^H
447 //
448 
449 //
450 //  B += alpha*op(A1 + A2)
451 //
452 #ifndef FLENS_DEBUG_CLOSURES
453 
454 template <typename ALPHA, typename ML, typename MR, typename MB>
455 void
456 axpy(Transpose, const ALPHA &, const MatrixClosure<OpAdd, ML, MR> &,
457      Matrix<MB> &)
458 {
459 //
460 //  Note that B += A1+A2  or  B -= A1+A2 is equivalent to B = B + (A1+A2) or
461 //  B = B - (A1+A2) respectively.  Hence, the result of A1+A2 has to be
462 //  computed *before* we can update y.  This means that a temporary vectors is
463 //  needed to hold the result of A1+A2.  We only allow this if the
464 //  FLENS_DEBUG_CLOSURES macro is defined such that a user has a chance to
465 //  optimize his/her expressions.
466 //
467     ASSERT(0);
468 }
469 
470 #else
471 
472 template <typename ALPHA, typename ML, typename MR, typename MB>
473 void
474 axpy(Transpose trans, const ALPHA &alpha,
475      const MatrixClosure<OpAdd, ML, MR> &A, Matrix<MB> &B)
476 {
477     FLENS_BLASLOG_BEGIN_MAXPY(trans, alpha, A, B);
478     typedef MatrixClosure<OpAdd, ML, MR>  MC;
479 
480 //
481 //  Compute the result of closure A = (A.left()+A.right()) first and store
482 //  it in tmp.
483 //
484     FLENS_BLASLOG_TMP_TRON;
485     typename Result<MC>::Type tmp = A;
486     FLENS_BLASLOG_TMP_TROFF;
487 //
488 //  Update y with tmp, i.e. compute B = B + alpha*tmp
489 //
490     axpy(trans, alpha, tmp, B.impl());
491 
492     FLENS_BLASLOG_TMP_REMOVE(tmp, A);
493     FLENS_BLASLOG_END;
494 }
495 
496 #endif
497 
498 //------------------------------------------------------------------------------
499 //
500 //  B += alpha*op(A1 - A2)
501 //
502 #ifndef FLENS_DEBUG_CLOSURES
503 
504 template <typename ALPHA, typename ML, typename MR, typename MB>
505 void
506 axpy(Transpose, const ALPHA &, const MatrixClosure<OpSub, ML, MR> &,
507      Matrix<MB> &)
508 {
509 //
510 //  Note that B += A1+A2  or  B -= A1+A2 is equivalent to B = B + (A1+A2) or
511 //  B = B - (A1+A2) respectively.  Hence, the result of A1+A2 has to be
512 //  computed *before* we can update y.  This means that a temporary vectors is
513 //  needed to hold the result of A1+A2.  We only allow this if the
514 //  FLENS_DEBUG_CLOSURES macro is defined such that a user has a chance to
515 //  optimize his/her expressions.
516 //
517     ASSERT(0);
518 }
519 
520 
521 #else
522 
523 template <typename ALPHA, typename ML, typename MR, typename MB>
524 void
525 axpy(Transpose trans, const ALPHA &alpha,
526      const MatrixClosure<OpSub, ML, MR> &A, Matrix<MB> &B)
527 {
528     typedef MatrixClosure<OpSub, ML, MR>  MC;
529     FLENS_BLASLOG_BEGIN_MAXPY(trans, alpha, A, B);
530 
531 //
532 //  Compute the result of closure A = (A.left()+A.right()) first and store
533 //  it in tmp.
534 //
535     FLENS_BLASLOG_TMP_TRON;
536     typename Result<MC>::Type tmp = A;
537     FLENS_BLASLOG_TMP_TROFF;
538 //
539 //  Update y with tmp, i.e. compute B = B + alpha*tmp
540 //
541     axpy(trans, alpha, tmp, B.impl());
542 
543     FLENS_BLASLOG_TMP_REMOVE(tmp, A);
544     FLENS_BLASLOG_END;
545 }
546 
547 #endif
548 
549 //------------------------------------------------------------------------------
550 //
551 //  B += scalar*op(A)
552 //
553 
554 //  We evalute this with a scalSwitch
555 //  case 1: A is no closure
556 //  case 2: A is a closure
557 
558 //
559 //  case 1: A is no closure
560 //
561 template <typename ALPHA, typename MA, typename MB>
562 void
563 axpySwitch(Transpose trans,
564            const ALPHA &alpha, const Matrix<MA> &A, Matrix<MB> &B)
565 {
566 //
567 //  No need to add another log-entry as we simply pass-through to the
568 //  BLAS implementation
569 //
570     axpy(trans, alpha, A.impl(), B.impl());
571 }
572 
573 //
574 //  case 2: A is a closure
575 //
576 #ifndef FLENS_DEBUG_CLOSURES
577 
578 template <typename ALPHA, typename Op, typename L, typename R, typename VY>
579 void
580 axpySwitch(Transpose, const ALPHA &, const MatrixClosure<Op, L, R> &,
581            Matrix<VY> &)
582 {
583 //
584 //  Creation of temporary not allowed
585 //
586     ASSERT(0);
587 }
588 
589 #else
590 
591 template <typename ALPHA, typename Op, typename L, typename R, typename VY>
592 void
593 axpySwitch(Transpose trans,
594            const ALPHA &alpha, const MatrixClosure<Op, L, R> &A, Matrix<VY> &B)
595 {
596     typedef MatrixClosure<Op, L, R>  MC;
597 
598     FLENS_BLASLOG_BEGIN_MAXPY(trans, alpha, A, B);
599 
600 //
601 //  Compute the result of closure A and store it in tmp.
602 //
603     FLENS_BLASLOG_TMP_TRON;
604     typename Result<MC>::Type tmp = A;
605     FLENS_BLASLOG_TMP_TROFF;
606 //
607 //  Update B with tmp, i.e. compute B = B + alpha*tmp
608 //
609     axpy(trans, alpha, tmp, B.impl());
610 
611     FLENS_BLASLOG_TMP_REMOVE(tmp, A);
612     FLENS_BLASLOG_END;
613 }
614 
615 #   endif
616 
617 //
618 //  entry point for switch
619 //
620 template <typename ALPHA, typename T, typename MA, typename MB>
621 void
622 axpy(Transpose trans, const ALPHA &alpha,
623      const MatrixClosure<OpMult, ScalarValue<T>, MA> &A, Matrix<MB> &B)
624 {
625     FLENS_BLASLOG_BEGIN_MAXPY(trans, alpha, A, B);
626 //
627 //  In the entry point we either have B += alpha*A or B -= alpha*A
628 //
629     ASSERT(alpha==ALPHA(1) || alpha==ALPHA(-1));
630 //
631 //  Switch: 1) If A is a closure we need a temporary otherwise
632 //          2) call BLAS implementation directly.
633 //
634     axpySwitch(trans, alpha*A.left().value(), A.right(), B.impl());
635 
636     FLENS_BLASLOG_END;
637 }
638 
639 //------------------------------------------------------------------------------
640 //
641 //  B += op(A)/scalar
642 //
643 
644 //  We evalute this with a scalSwitch
645 //  case 1: A is no closure
646 //  case 2: A is a closure
647 
648 //
649 //  case 1: A is no closure
650 //
651 template <typename ALPHA, typename MA, typename MB>
652 void
653 raxpySwitch(Transpose trans,
654             const ALPHA &alpha, const Matrix<MA> &A, Matrix<MB> &B)
655 {
656 //
657 //  No need to add another log-entry as we simply pass-through to the
658 //  BLAS implementation
659 //
660     raxpy(trans, alpha, A.impl(), B.impl());
661 }
662 
663 //
664 //  case 2: A is a closure
665 //
666 #ifndef FLENS_DEBUG_CLOSURES
667 
668 template <typename ALPHA, typename Op, typename L, typename R, typename VY>
669 void
670 raxpySwitch(Transpose, const ALPHA &, const MatrixClosure<Op, L, R> &,
671             Matrix<VY> &)
672 {
673 //
674 //  Creation of temporary not allowed
675 //
676     ASSERT(0);
677 }
678 
679 #else
680 
681 template <typename ALPHA, typename Op, typename L, typename R, typename VY>
682 void
683 raxpySwitch(Transpose trans,
684             const ALPHA &alpha, const MatrixClosure<Op, L, R> &A, Matrix<VY> &B)
685 {
686     typedef MatrixClosure<Op, L, R>  MC;
687 
688     FLENS_BLASLOG_BEGIN_MRAXPY(trans, alpha, A, B);
689 
690 //
691 //  Compute the result of closure A and store it in tmp.
692 //
693     FLENS_BLASLOG_TMP_TRON;
694     typename Result<MC>::Type tmp = A;
695     FLENS_BLASLOG_TMP_TROFF;
696 //
697 //  Update B with tmp, i.e. compute B = B + tmp/alpha
698 //
699     raxpy(trans, alpha, tmp, B.impl());
700 
701     FLENS_BLASLOG_TMP_REMOVE(tmp, A);
702     FLENS_BLASLOG_END;
703 }
704 
705 #endif
706 
707 //
708 //  entry point for switch
709 //
710 template <typename ALPHA, typename MA, typename T, typename MB>
711 void
712 axpy(Transpose trans, const ALPHA &alpha,
713      const MatrixClosure<OpDiv, MA, ScalarValue<T> > &A, Matrix<MB> &B)
714 {
715     FLENS_BLASLOG_BEGIN_MRAXPY(trans, alpha, A, B);
716 //
717 //  In the entry point we either have B += A/scalar or B -= A/scalar
718 //
719     ASSERT(alpha==ALPHA(1) || alpha==ALPHA(-1));
720 //
721 //  Switch: 1) If A is a closure we need a temporary otherwise
722 //          2) call BLAS implementation directly.
723 //
724     raxpySwitch(trans, alpha*A.right().value(), A.left(), B.impl());
725 
726     FLENS_BLASLOG_END;
727 }
728 
729 //------------------------------------------------------------------------------
730 //
731 //  B += op(A^T)
732 //
733 template <typename ALPHA, typename MA, typename MB>
734 void
735 axpy(Transpose trans, const ALPHA &alpha,
736      const MatrixClosureOpTrans<MA> &A, Matrix<MB> &B)
737 {
738     FLENS_BLASLOG_BEGIN_MAXPY(trans, alpha, A, B);
739 
740     trans = Transpose(trans^Trans);
741 
742     axpy(trans, alpha, A.left(), B.impl());
743 
744     FLENS_BLASLOG_END;
745 }
746 
747 //------------------------------------------------------------------------------
748 //
749 //  C += A*B
750 //
751 template <typename ALPHA, typename MA, typename MB, typename MC>
752 typename RestrictTo<ClosureType<OpMult, MA, MB>::isMatrixMatrixProduct,
753          void>::Type
754 axpy(Transpose trans, const ALPHA &alpha,
755      const MatrixClosure<OpMult,MA,MB> &AB, Matrix<MC> &C)
756 {
757     FLENS_BLASLOG_BEGIN_MAXPY(trans, alpha, AB, C);
758 
759     typename MC::Impl::ElementType  One(1);
760 
761     if (trans==NoTrans || trans==ConjTrans) {
762         mmSwitch(trans, trans, alpha, AB.left(), AB.right(), One, C.impl());
763     } else {
764         mmSwitch(trans, trans, alpha, AB.right(), AB.left(), One, C.impl());
765     }
766 
767     FLENS_BLASLOG_END;
768 }
769 
770 //------------------------------------------------------------------------------
771 //
772 //  B += <Unknown Closure>
773 //
774 #ifndef FLENS_DEBUG_CLOSURES
775 
776 template <typename ALPHA, typename Op, typename ML, typename MR, typename MB>
777 void
778 axpy(Transpose, const ALPHA &, const MatrixClosure<Op, ML, MR> &, Matrix<MB> &)
779 {
780     ASSERT(0);
781 }
782 
783 #else
784 
785 template <typename ALPHA, typename Op, typename ML, typename MR, typename MB>
786 void
787 axpy(Transpose trans, const ALPHA &alpha,
788      const MatrixClosure<Op, ML, MR> &A, Matrix<MB> &B)
789 {
790     FLENS_BLASLOG_ERROR_MAXPY(trans, alpha, A, B);
791     ASSERT(0);
792 }
793 
794 #endif
795 
796 
797 #ifdef FLENS_DEBUG_CLOSURES
798 //------------------------------------------------------------------------------
799 //
800 // B += Some Matrix
801 //
802 template <typename ALPHA, typename MA, typename MB>
803 void
804 axpy(Transpose trans, const ALPHA &alpha,
805      const Matrix<MA> &A, Matrix<MB> &B)
806 {
807     typedef typename MA::Impl::ElementType        TA;
808     typedef GeMatrix<FullStorage<TA, ColMajor> >  RMA;
809 
810     FLENS_BLASLOG_TMP_TRON;
811     RMA _A = A.impl();
812     FLENS_BLASLOG_TMP_TROFF;
813 
814     axpy(trans, alpha, _A, B.impl());
815 
816     FLENS_BLASLOG_TMP_REMOVE(_A, A);
817 }
818 #endif
819 
820 
821 } } // namespace blas, flens
822 
823 #endif // FLENS_BLAS_CLOSURES_AXPY_TCC