1 /*
2 * Copyright (c) 2007, Michael Lehn
3 *
4 * All rights reserved.
5 *
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions
8 * are met:
9 *
10 * 1) Redistributions of source code must retain the above copyright
11 * notice, this list of conditions and the following disclaimer.
12 * 2) Redistributions in binary form must reproduce the above copyright
13 * notice, this list of conditions and the following disclaimer in
14 * the documentation and/or other materials provided with the
15 * distribution.
16 * 3) Neither the name of the FLENS development group nor the names of
17 * its contributors may be used to endorse or promote products derived
18 * from this software without specific prior written permission.
19 *
20 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
21 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
22 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
23 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
24 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
25 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
26 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
27 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
28 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
29 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
30 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
31 */
32
33 #ifndef FLENS_BLAS_CLOSURES_AXPY_TCC
34 #define FLENS_BLAS_CLOSURES_AXPY_TCC 1
35
36 #include <flens/aux/aux.h>
37 #include <flens/blas/closures/debugclosure.h>
38 #include <flens/blas/closures/mmswitch.h>
39 #include <flens/blas/closures/mvswitch.h>
40 #include <flens/blas/closures/prune.h>
41 #include <flens/blas/closures/result.h>
42 #include <flens/blas/level1/level1.h>
43 #include <flens/blas/level2/level2.h>
44 #include <flens/typedefs.h>
45
46 #ifdef FLENS_DEBUG_CLOSURES
47 # include <flens/blas/blaslogon.h>
48 #else
49 # include <flens/blas/blaslogoff.h>
50 #endif
51
52 namespace flens { namespace blas {
53
54 //
55 //== vector closures ===========================================================
56 //
57
58 //
59 // y += alpha*(x1+x2)
60 //
61 #ifndef FLENS_DEBUG_CLOSURES
62
63 template <typename ALPHA, typename VL, typename VR, typename VY>
64 void
65 axpy(const ALPHA &, const VectorClosure<OpAdd, VL, VR> &, Vector<VY> &)
66 {
67 //
68 // Note that y += x1+x2 or y -= x1+x2 is equivalent to y = y + (x1+x2) or
69 // y = y - (x1+x2) respectively. Hence, the result of x1+x2 has to be
70 // computed *before* we can update y. This means that a temporary vectors is
71 // needed to hold the result of x1+x2. We only allow this if the
72 // FLENS_DEBUG_CLOSURES macro is defined such that a user has a chance to
73 // optimize his/her expressions.
74 //
75 ASSERT(0);
76 }
77
78 #else
79
80 template <typename ALPHA, typename VL, typename VR, typename VY>
81 void
82 axpy(const ALPHA &alpha,
83 const VectorClosure<OpAdd, VL, VR> &x, Vector<VY> &y)
84 {
85 FLENS_BLASLOG_BEGIN_AXPY(alpha, x, y);
86 typedef VectorClosure<OpAdd, VL, VR> VC;
87
88 //
89 // Compute the result of closure x = (x.left()+x.right()) first and store
90 // it in tmp.
91 //
92 FLENS_BLASLOG_TMP_TRON;
93 typename Result<VC>::Type tmp = x;
94 FLENS_BLASLOG_TMP_TROFF;
95 //
96 // Update y with tmp, i.e. compute y = y + alpha*tmp
97 //
98 axpy(alpha, tmp, y.impl());
99
100 FLENS_BLASLOG_TMP_REMOVE(tmp, x);
101 FLENS_BLASLOG_END;
102 }
103
104 #endif
105
106 //------------------------------------------------------------------------------
107 //
108 // y += alpha*(x1-x2)
109 //
110 #ifndef FLENS_DEBUG_CLOSURES
111
112 template <typename ALPHA, typename VL, typename VR, typename VY>
113 void
114 axpy(const ALPHA &, const VectorClosure<OpSub, VL, VR> &, Vector<VY> &)
115 {
116 //
117 // Note that y += x1-x2 or y -= x1-x2 is equivalent to y = y + (x1-x2) or
118 // y = y - (x1-x2) respectively. Hence, the result of x1-x2 has to be
119 // computed *before* we can update y. This means that a temporary vectors is
120 // needed to hold the result of x1-x2. We only allow this if the
121 // FLENS_DEBUG_CLOSURES macro is defined such that a user has a chance to
122 // optimize his/her expressions.
123 //
124 ASSERT(0);
125 }
126
127 #else
128
129 template <typename ALPHA, typename VL, typename VR, typename VY>
130 void
131 axpy(const ALPHA &alpha,
132 const VectorClosure<OpSub, VL, VR> &x, Vector<VY> &y)
133 {
134 FLENS_BLASLOG_BEGIN_AXPY(alpha, x, y);
135 typedef VectorClosure<OpSub, VL, VR> VC;
136
137 //
138 // Compute the result of closure x = (x.left()-x.right()) first and store
139 // it in tmp.
140 //
141 FLENS_BLASLOG_TMP_TRON;
142 typename Result<VC>::Type tmp = x;
143 FLENS_BLASLOG_TMP_TROFF;
144 //
145 // Update y with tmp, i.e. compute y = y + alpha*tmp
146 //
147 axpy(alpha, tmp, y.impl());
148
149 FLENS_BLASLOG_TMP_REMOVE(tmp, x);
150 FLENS_BLASLOG_END;
151 }
152
153 #endif
154
155 //------------------------------------------------------------------------------
156 //
157 // y += scalar*x
158 //
159 // We evalute this with a scalSwitch
160 // case 1: x is no closure
161 // case 2: x is a closure
162
163 //
164 // case 1: x is no closure
165 //
166 template <typename ALPHA, typename VX, typename VY>
167 void
168 axpySwitch(const ALPHA &alpha, const Vector<VX> &x, Vector<VY> &y)
169 {
170 //
171 // No need to add another log-entry as we simply pass-through to the
172 // BLAS implementation
173 //
174 axpy(alpha, x.impl(), y.impl());
175 }
176
177 //
178 // case 2: x is a closure
179 //
180 #ifndef FLENS_DEBUG_CLOSURES
181
182 template <typename ALPHA, typename Op, typename L, typename R, typename VY>
183 void
184 axpySwitch(const ALPHA &, const VectorClosure<Op, L, R> &, Vector<VY> &)
185 {
186 // Creation of temporary not allowed
187
188 ASSERT(0);
189 }
190
191 #else
192
193 template <typename ALPHA, typename Op, typename L, typename R, typename VY>
194 void
195 axpySwitch(const ALPHA &alpha, const VectorClosure<Op, L, R> &x, Vector<VY> &y)
196 {
197 FLENS_BLASLOG_BEGIN_AXPY(alpha, x, y);
198 typedef VectorClosure<Op, L, R> VC;
199
200 //
201 // Compute the result of closure x and store it in tmp.
202 //
203 FLENS_BLASLOG_TMP_TRON;
204 typename Result<VC>::Type tmp = x;
205 FLENS_BLASLOG_TMP_TROFF;
206 //
207 // Update y with tmp, i.e. compute y = y + alpha*tmp
208 //
209 axpy(alpha, tmp, y.impl());
210
211 FLENS_BLASLOG_TMP_REMOVE(tmp, x);
212 FLENS_BLASLOG_END;
213 }
214
215 #endif
216
217 //
218 // entry point for switch
219 //
220 template <typename ALPHA, typename T, typename VX, typename VY>
221 void
222 axpy(const ALPHA &alpha,
223 const VectorClosure<OpMult, ScalarValue<T>, VX> &x, Vector<VY> &y)
224 {
225 FLENS_BLASLOG_BEGIN_AXPY(alpha, x, y);
226 //
227 // In the entry point we either have y += x or y -= x
228 //
229 ASSERT(alpha==ALPHA(1) || alpha==ALPHA(-1));
230 //
231 // Switch: 1) If x is a closure we need a temporary otherwise
232 // 2) call BLAS implementation directly.
233 //
234 axpySwitch(alpha*x.left().value(), x.right(), y.impl());
235
236 FLENS_BLASLOG_END;
237 }
238
239 //------------------------------------------------------------------------------
240 //
241 // y += x/scalar
242 //
243 // We evalute this with a rscalSwitch
244 // case 1: x is no closure
245 // case 2: x is a closure
246
247 //
248 // case 1: x is no closure
249 //
250 template <typename ALPHA, typename VX, typename VY>
251 void
252 raxpySwitch(const ALPHA &alpha, const Vector<VX> &x, Vector<VY> &y)
253 {
254 //
255 // No need to add another log-entry as we simply pass-through to the
256 // BLAS implementation
257 //
258 raxpy(alpha, x.impl(), y.impl());
259 }
260
261 //
262 // case 2: x is a closure
263 //
264 #ifndef FLENS_DEBUG_CLOSURES
265
266 template <typename ALPHA, typename Op, typename L, typename R, typename VY>
267 void
268 raxpySwitch(const ALPHA &, const VectorClosure<Op, L, R> &, Vector<VY> &)
269 {
270 //
271 // Creation of temporary not allowed
272 //
273 ASSERT(0);
274 }
275
276 #else
277
278 template <typename ALPHA, typename Op, typename L, typename R, typename VY>
279 void
280 raxpySwitch(const ALPHA &alpha, const VectorClosure<Op, L, R> &x, Vector<VY> &y)
281 {
282 FLENS_BLASLOG_BEGIN_RAXPY(alpha, x, y);
283 typedef VectorClosure<Op, L, R> VC;
284
285 //
286 // Compute the result of closure x and store it in tmp.
287 //
288 FLENS_BLASLOG_TMP_TRON;
289 typename Result<VC>::Type tmp = x;
290 FLENS_BLASLOG_TMP_TROFF;
291 //
292 // Update y with tmp, i.e. compute y = y + tmp/alpha
293 //
294 raxpy(alpha, tmp, y.impl());
295
296 FLENS_BLASLOG_TMP_REMOVE(tmp, x);
297 FLENS_BLASLOG_END;
298 }
299
300 #endif
301
302 //
303 // entry point for switch
304 //
305 template <typename ALPHA, typename VX, typename T, typename VY>
306 void
307 axpy(const ALPHA &alpha,
308 const VectorClosure<OpDiv, VX, ScalarValue<T> > &x, Vector<VY> &y)
309 {
310 FLENS_BLASLOG_BEGIN_AXPY(alpha, x, y);
311 //
312 // In the entry point we either have y += x/scalar or y -= x/scalar
313 //
314 ASSERT(alpha==ALPHA(1) || alpha==ALPHA(-1));
315 //
316 // Switch: 1) If x is a closure we need a temporary otherwise
317 // 2) call BLAS implementation directly.
318 //
319 raxpySwitch(alpha*x.right().value(), x.left(), y.impl());
320
321 FLENS_BLASLOG_END;
322 }
323
324 //------------------------------------------------------------------------------
325 //
326 // y += A*x
327 //
328 template <typename ALPHA, typename ML, typename VR, typename VY>
329 typename RestrictTo<ClosureType<OpMult, ML, VR>::isMatrixVectorProduct,
330 void>::Type
331 axpy(const ALPHA &alpha,
332 const VectorClosure<OpMult, ML, VR> &Ax, Vector<VY> &y)
333 {
334 FLENS_BLASLOG_BEGIN_AXPY(alpha, Ax, y);
335 //
336 // In the entry point we either have y += A*x or y -= A*x
337 //
338 ASSERT(alpha==ALPHA(1) || alpha==ALPHA(-1));
339
340 //
341 // If A is a closure then prune arbitrary many OpTrans/OpConj
342 //
343 typedef typename PruneConjTrans<ML>::Remainder MA;
344
345 Transpose trans = PruneConjTrans<ML>::trans;
346 const MA &A = PruneConjTrans<ML>::remainder(Ax.left());
347 //
348 // If x is a closure it gets evaluated. In this case a temporary gets
349 // created. Otherwise we only keep a reference
350 //
351 FLENS_BLASLOG_TMP_TRON;
352 const typename Result<VR>::Type &x = Ax.right();
353 FLENS_BLASLOG_TMP_TROFF;
354 //
355 // Call mv implementation
356 //
357 typename VY::Impl::ElementType One(1);
358 mvSwitch(Transpose(Trans^trans), alpha, A, x, One, y.impl());
359
360 //
361 // If a temporary was created and registered before we now unregister it
362 //
363 # ifdef FLENS_DEBUG_CLOSURES
364 if (!IsSame<VR, typename Result<VR>::Type>::value) {
365 FLENS_BLASLOG_TMP_REMOVE(x, Ax.right());
366 }
367 # endif
368
369 FLENS_BLASLOG_END;
370 }
371
372 //------------------------------------------------------------------------------
373 //
374 // y += x*A
375 //
376 template <typename ALPHA, typename VL, typename MR, typename VY>
377 typename RestrictTo<ClosureType<OpMult, VL, MR>::isVectorMatrixProduct,
378 void>::Type
379 axpy(const ALPHA &alpha, const VectorClosure<OpMult, VL, MR> &xA, Vector<VY> &y)
380 {
381 FLENS_BLASLOG_BEGIN_AXPY(alpha, xA, y);
382 //
383 // In the entry point we either have y += x*A or y -= x*A
384 //
385 ASSERT(alpha==ALPHA(1) || alpha==ALPHA(-1));
386
387 //
388 // If A is a closure then prune arbitrary many OpTrans/OpConj
389 //
390 typedef typename PruneConjTrans<MR>::Remainder MA;
391
392 Transpose trans = PruneConjTrans<MR>::trans;
393 const MA &A = PruneConjTrans<MR>::remainder(xA.right());
394 //
395 // If x is a closure it gets evaluated. In this case a temporary gets
396 // created. Otherwise we only keep a reference
397 //
398 FLENS_BLASLOG_TMP_TRON;
399 const typename Result<VL>::Type &x = xA.left();
400 FLENS_BLASLOG_TMP_TROFF;
401 //
402 // Call mv implementation
403 //
404 typename VY::Impl::ElementType One(1);
405 mvSwitch(Transpose(Trans^trans), alpha, A, x, One, y.impl());
406
407 //
408 // If a temporary was created and registered before we now unregister it
409 //
410 # ifdef FLENS_DEBUG_CLOSURES
411 if (!IsSame<VL, typename Result<VL>::Type>::value) {
412 FLENS_BLASLOG_TMP_REMOVE(x, xA.left());
413 }
414 # endif
415
416 FLENS_BLASLOG_END;
417 }
418
419 //------------------------------------------------------------------------------
420 //
421 // y += <Unknown Closure>
422 //
423 #ifndef FLENS_DEBUG_CLOSURES
424
425 template <typename ALPHA, typename Op, typename VL, typename VR, typename VY>
426 void
427 axpy(const ALPHA &, const VectorClosure<Op, VL, VR> &, Vector<VY> &)
428 {
429 ASSERT(0);
430 }
431
432 #else
433
434 template <typename ALPHA, typename Op, typename VL, typename VR, typename VY>
435 void
436 axpy(const ALPHA &alpha, const VectorClosure<Op, VL, VR> &x, Vector<VY> &y)
437 {
438 FLENS_BLASLOG_ERROR_AXPY(alpha, x, y);
439 ASSERT(0);
440 }
441
442 #endif
443
444 //-- matrix closures -----------------------------------------------------------
445 //
446 // In the following comments op(X) denotes x, X^T or X^H
447 //
448
449 //
450 // B += alpha*op(A1 + A2)
451 //
452 #ifndef FLENS_DEBUG_CLOSURES
453
454 template <typename ALPHA, typename ML, typename MR, typename MB>
455 void
456 axpy(Transpose, const ALPHA &, const MatrixClosure<OpAdd, ML, MR> &,
457 Matrix<MB> &)
458 {
459 //
460 // Note that B += A1+A2 or B -= A1+A2 is equivalent to B = B + (A1+A2) or
461 // B = B - (A1+A2) respectively. Hence, the result of A1+A2 has to be
462 // computed *before* we can update y. This means that a temporary vectors is
463 // needed to hold the result of A1+A2. We only allow this if the
464 // FLENS_DEBUG_CLOSURES macro is defined such that a user has a chance to
465 // optimize his/her expressions.
466 //
467 ASSERT(0);
468 }
469
470 #else
471
472 template <typename ALPHA, typename ML, typename MR, typename MB>
473 void
474 axpy(Transpose trans, const ALPHA &alpha,
475 const MatrixClosure<OpAdd, ML, MR> &A, Matrix<MB> &B)
476 {
477 FLENS_BLASLOG_BEGIN_MAXPY(trans, alpha, A, B);
478 typedef MatrixClosure<OpAdd, ML, MR> MC;
479
480 //
481 // Compute the result of closure A = (A.left()+A.right()) first and store
482 // it in tmp.
483 //
484 FLENS_BLASLOG_TMP_TRON;
485 typename Result<MC>::Type tmp = A;
486 FLENS_BLASLOG_TMP_TROFF;
487 //
488 // Update y with tmp, i.e. compute B = B + alpha*tmp
489 //
490 axpy(trans, alpha, tmp, B.impl());
491
492 FLENS_BLASLOG_TMP_REMOVE(tmp, A);
493 FLENS_BLASLOG_END;
494 }
495
496 #endif
497
498 //------------------------------------------------------------------------------
499 //
500 // B += alpha*op(A1 - A2)
501 //
502 #ifndef FLENS_DEBUG_CLOSURES
503
504 template <typename ALPHA, typename ML, typename MR, typename MB>
505 void
506 axpy(Transpose, const ALPHA &, const MatrixClosure<OpSub, ML, MR> &,
507 Matrix<MB> &)
508 {
509 //
510 // Note that B += A1+A2 or B -= A1+A2 is equivalent to B = B + (A1+A2) or
511 // B = B - (A1+A2) respectively. Hence, the result of A1+A2 has to be
512 // computed *before* we can update y. This means that a temporary vectors is
513 // needed to hold the result of A1+A2. We only allow this if the
514 // FLENS_DEBUG_CLOSURES macro is defined such that a user has a chance to
515 // optimize his/her expressions.
516 //
517 ASSERT(0);
518 }
519
520
521 #else
522
523 template <typename ALPHA, typename ML, typename MR, typename MB>
524 void
525 axpy(Transpose trans, const ALPHA &alpha,
526 const MatrixClosure<OpSub, ML, MR> &A, Matrix<MB> &B)
527 {
528 typedef MatrixClosure<OpSub, ML, MR> MC;
529 FLENS_BLASLOG_BEGIN_MAXPY(trans, alpha, A, B);
530
531 //
532 // Compute the result of closure A = (A.left()+A.right()) first and store
533 // it in tmp.
534 //
535 FLENS_BLASLOG_TMP_TRON;
536 typename Result<MC>::Type tmp = A;
537 FLENS_BLASLOG_TMP_TROFF;
538 //
539 // Update y with tmp, i.e. compute B = B + alpha*tmp
540 //
541 axpy(trans, alpha, tmp, B.impl());
542
543 FLENS_BLASLOG_TMP_REMOVE(tmp, A);
544 FLENS_BLASLOG_END;
545 }
546
547 #endif
548
549 //------------------------------------------------------------------------------
550 //
551 // B += scalar*op(A)
552 //
553
554 // We evalute this with a scalSwitch
555 // case 1: A is no closure
556 // case 2: A is a closure
557
558 //
559 // case 1: A is no closure
560 //
561 template <typename ALPHA, typename MA, typename MB>
562 void
563 axpySwitch(Transpose trans,
564 const ALPHA &alpha, const Matrix<MA> &A, Matrix<MB> &B)
565 {
566 //
567 // No need to add another log-entry as we simply pass-through to the
568 // BLAS implementation
569 //
570 axpy(trans, alpha, A.impl(), B.impl());
571 }
572
573 //
574 // case 2: A is a closure
575 //
576 #ifndef FLENS_DEBUG_CLOSURES
577
578 template <typename ALPHA, typename Op, typename L, typename R, typename VY>
579 void
580 axpySwitch(Transpose, const ALPHA &, const MatrixClosure<Op, L, R> &,
581 Matrix<VY> &)
582 {
583 //
584 // Creation of temporary not allowed
585 //
586 ASSERT(0);
587 }
588
589 #else
590
591 template <typename ALPHA, typename Op, typename L, typename R, typename VY>
592 void
593 axpySwitch(Transpose trans,
594 const ALPHA &alpha, const MatrixClosure<Op, L, R> &A, Matrix<VY> &B)
595 {
596 typedef MatrixClosure<Op, L, R> MC;
597
598 FLENS_BLASLOG_BEGIN_MAXPY(trans, alpha, A, B);
599
600 //
601 // Compute the result of closure A and store it in tmp.
602 //
603 FLENS_BLASLOG_TMP_TRON;
604 typename Result<MC>::Type tmp = A;
605 FLENS_BLASLOG_TMP_TROFF;
606 //
607 // Update B with tmp, i.e. compute B = B + alpha*tmp
608 //
609 axpy(trans, alpha, tmp, B.impl());
610
611 FLENS_BLASLOG_TMP_REMOVE(tmp, A);
612 FLENS_BLASLOG_END;
613 }
614
615 # endif
616
617 //
618 // entry point for switch
619 //
620 template <typename ALPHA, typename T, typename MA, typename MB>
621 void
622 axpy(Transpose trans, const ALPHA &alpha,
623 const MatrixClosure<OpMult, ScalarValue<T>, MA> &A, Matrix<MB> &B)
624 {
625 FLENS_BLASLOG_BEGIN_MAXPY(trans, alpha, A, B);
626 //
627 // In the entry point we either have B += alpha*A or B -= alpha*A
628 //
629 ASSERT(alpha==ALPHA(1) || alpha==ALPHA(-1));
630 //
631 // Switch: 1) If A is a closure we need a temporary otherwise
632 // 2) call BLAS implementation directly.
633 //
634 axpySwitch(trans, alpha*A.left().value(), A.right(), B.impl());
635
636 FLENS_BLASLOG_END;
637 }
638
639 //------------------------------------------------------------------------------
640 //
641 // B += op(A)/scalar
642 //
643
644 // We evalute this with a scalSwitch
645 // case 1: A is no closure
646 // case 2: A is a closure
647
648 //
649 // case 1: A is no closure
650 //
651 template <typename ALPHA, typename MA, typename MB>
652 void
653 raxpySwitch(Transpose trans,
654 const ALPHA &alpha, const Matrix<MA> &A, Matrix<MB> &B)
655 {
656 //
657 // No need to add another log-entry as we simply pass-through to the
658 // BLAS implementation
659 //
660 raxpy(trans, alpha, A.impl(), B.impl());
661 }
662
663 //
664 // case 2: A is a closure
665 //
666 #ifndef FLENS_DEBUG_CLOSURES
667
668 template <typename ALPHA, typename Op, typename L, typename R, typename VY>
669 void
670 raxpySwitch(Transpose, const ALPHA &, const MatrixClosure<Op, L, R> &,
671 Matrix<VY> &)
672 {
673 //
674 // Creation of temporary not allowed
675 //
676 ASSERT(0);
677 }
678
679 #else
680
681 template <typename ALPHA, typename Op, typename L, typename R, typename VY>
682 void
683 raxpySwitch(Transpose trans,
684 const ALPHA &alpha, const MatrixClosure<Op, L, R> &A, Matrix<VY> &B)
685 {
686 typedef MatrixClosure<Op, L, R> MC;
687
688 FLENS_BLASLOG_BEGIN_MRAXPY(trans, alpha, A, B);
689
690 //
691 // Compute the result of closure A and store it in tmp.
692 //
693 FLENS_BLASLOG_TMP_TRON;
694 typename Result<MC>::Type tmp = A;
695 FLENS_BLASLOG_TMP_TROFF;
696 //
697 // Update B with tmp, i.e. compute B = B + tmp/alpha
698 //
699 raxpy(trans, alpha, tmp, B.impl());
700
701 FLENS_BLASLOG_TMP_REMOVE(tmp, A);
702 FLENS_BLASLOG_END;
703 }
704
705 #endif
706
707 //
708 // entry point for switch
709 //
710 template <typename ALPHA, typename MA, typename T, typename MB>
711 void
712 axpy(Transpose trans, const ALPHA &alpha,
713 const MatrixClosure<OpDiv, MA, ScalarValue<T> > &A, Matrix<MB> &B)
714 {
715 FLENS_BLASLOG_BEGIN_MRAXPY(trans, alpha, A, B);
716 //
717 // In the entry point we either have B += A/scalar or B -= A/scalar
718 //
719 ASSERT(alpha==ALPHA(1) || alpha==ALPHA(-1));
720 //
721 // Switch: 1) If A is a closure we need a temporary otherwise
722 // 2) call BLAS implementation directly.
723 //
724 raxpySwitch(trans, alpha*A.right().value(), A.left(), B.impl());
725
726 FLENS_BLASLOG_END;
727 }
728
729 //------------------------------------------------------------------------------
730 //
731 // B += op(A^T)
732 //
733 template <typename ALPHA, typename MA, typename MB>
734 void
735 axpy(Transpose trans, const ALPHA &alpha,
736 const MatrixClosureOpTrans<MA> &A, Matrix<MB> &B)
737 {
738 FLENS_BLASLOG_BEGIN_MAXPY(trans, alpha, A, B);
739
740 trans = Transpose(trans^Trans);
741
742 axpy(trans, alpha, A.left(), B.impl());
743
744 FLENS_BLASLOG_END;
745 }
746
747 //------------------------------------------------------------------------------
748 //
749 // C += A*B
750 //
751 template <typename ALPHA, typename MA, typename MB, typename MC>
752 typename RestrictTo<ClosureType<OpMult, MA, MB>::isMatrixMatrixProduct,
753 void>::Type
754 axpy(Transpose trans, const ALPHA &alpha,
755 const MatrixClosure<OpMult,MA,MB> &AB, Matrix<MC> &C)
756 {
757 FLENS_BLASLOG_BEGIN_MAXPY(trans, alpha, AB, C);
758
759 typename MC::Impl::ElementType One(1);
760
761 if (trans==NoTrans || trans==ConjTrans) {
762 mmSwitch(trans, trans, alpha, AB.left(), AB.right(), One, C.impl());
763 } else {
764 mmSwitch(trans, trans, alpha, AB.right(), AB.left(), One, C.impl());
765 }
766
767 FLENS_BLASLOG_END;
768 }
769
770 //------------------------------------------------------------------------------
771 //
772 // B += <Unknown Closure>
773 //
774 #ifndef FLENS_DEBUG_CLOSURES
775
776 template <typename ALPHA, typename Op, typename ML, typename MR, typename MB>
777 void
778 axpy(Transpose, const ALPHA &, const MatrixClosure<Op, ML, MR> &, Matrix<MB> &)
779 {
780 ASSERT(0);
781 }
782
783 #else
784
785 template <typename ALPHA, typename Op, typename ML, typename MR, typename MB>
786 void
787 axpy(Transpose trans, const ALPHA &alpha,
788 const MatrixClosure<Op, ML, MR> &A, Matrix<MB> &B)
789 {
790 FLENS_BLASLOG_ERROR_MAXPY(trans, alpha, A, B);
791 ASSERT(0);
792 }
793
794 #endif
795
796
797 #ifdef FLENS_DEBUG_CLOSURES
798 //------------------------------------------------------------------------------
799 //
800 // B += Some Matrix
801 //
802 template <typename ALPHA, typename MA, typename MB>
803 void
804 axpy(Transpose trans, const ALPHA &alpha,
805 const Matrix<MA> &A, Matrix<MB> &B)
806 {
807 typedef typename MA::Impl::ElementType TA;
808 typedef GeMatrix<FullStorage<TA, ColMajor> > RMA;
809
810 FLENS_BLASLOG_TMP_TRON;
811 RMA _A = A.impl();
812 FLENS_BLASLOG_TMP_TROFF;
813
814 axpy(trans, alpha, _A, B.impl());
815
816 FLENS_BLASLOG_TMP_REMOVE(_A, A);
817 }
818 #endif
819
820
821 } } // namespace blas, flens
822
823 #endif // FLENS_BLAS_CLOSURES_AXPY_TCC
2 * Copyright (c) 2007, Michael Lehn
3 *
4 * All rights reserved.
5 *
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions
8 * are met:
9 *
10 * 1) Redistributions of source code must retain the above copyright
11 * notice, this list of conditions and the following disclaimer.
12 * 2) Redistributions in binary form must reproduce the above copyright
13 * notice, this list of conditions and the following disclaimer in
14 * the documentation and/or other materials provided with the
15 * distribution.
16 * 3) Neither the name of the FLENS development group nor the names of
17 * its contributors may be used to endorse or promote products derived
18 * from this software without specific prior written permission.
19 *
20 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
21 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
22 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
23 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
24 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
25 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
26 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
27 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
28 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
29 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
30 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
31 */
32
33 #ifndef FLENS_BLAS_CLOSURES_AXPY_TCC
34 #define FLENS_BLAS_CLOSURES_AXPY_TCC 1
35
36 #include <flens/aux/aux.h>
37 #include <flens/blas/closures/debugclosure.h>
38 #include <flens/blas/closures/mmswitch.h>
39 #include <flens/blas/closures/mvswitch.h>
40 #include <flens/blas/closures/prune.h>
41 #include <flens/blas/closures/result.h>
42 #include <flens/blas/level1/level1.h>
43 #include <flens/blas/level2/level2.h>
44 #include <flens/typedefs.h>
45
46 #ifdef FLENS_DEBUG_CLOSURES
47 # include <flens/blas/blaslogon.h>
48 #else
49 # include <flens/blas/blaslogoff.h>
50 #endif
51
52 namespace flens { namespace blas {
53
54 //
55 //== vector closures ===========================================================
56 //
57
58 //
59 // y += alpha*(x1+x2)
60 //
61 #ifndef FLENS_DEBUG_CLOSURES
62
63 template <typename ALPHA, typename VL, typename VR, typename VY>
64 void
65 axpy(const ALPHA &, const VectorClosure<OpAdd, VL, VR> &, Vector<VY> &)
66 {
67 //
68 // Note that y += x1+x2 or y -= x1+x2 is equivalent to y = y + (x1+x2) or
69 // y = y - (x1+x2) respectively. Hence, the result of x1+x2 has to be
70 // computed *before* we can update y. This means that a temporary vectors is
71 // needed to hold the result of x1+x2. We only allow this if the
72 // FLENS_DEBUG_CLOSURES macro is defined such that a user has a chance to
73 // optimize his/her expressions.
74 //
75 ASSERT(0);
76 }
77
78 #else
79
80 template <typename ALPHA, typename VL, typename VR, typename VY>
81 void
82 axpy(const ALPHA &alpha,
83 const VectorClosure<OpAdd, VL, VR> &x, Vector<VY> &y)
84 {
85 FLENS_BLASLOG_BEGIN_AXPY(alpha, x, y);
86 typedef VectorClosure<OpAdd, VL, VR> VC;
87
88 //
89 // Compute the result of closure x = (x.left()+x.right()) first and store
90 // it in tmp.
91 //
92 FLENS_BLASLOG_TMP_TRON;
93 typename Result<VC>::Type tmp = x;
94 FLENS_BLASLOG_TMP_TROFF;
95 //
96 // Update y with tmp, i.e. compute y = y + alpha*tmp
97 //
98 axpy(alpha, tmp, y.impl());
99
100 FLENS_BLASLOG_TMP_REMOVE(tmp, x);
101 FLENS_BLASLOG_END;
102 }
103
104 #endif
105
106 //------------------------------------------------------------------------------
107 //
108 // y += alpha*(x1-x2)
109 //
110 #ifndef FLENS_DEBUG_CLOSURES
111
112 template <typename ALPHA, typename VL, typename VR, typename VY>
113 void
114 axpy(const ALPHA &, const VectorClosure<OpSub, VL, VR> &, Vector<VY> &)
115 {
116 //
117 // Note that y += x1-x2 or y -= x1-x2 is equivalent to y = y + (x1-x2) or
118 // y = y - (x1-x2) respectively. Hence, the result of x1-x2 has to be
119 // computed *before* we can update y. This means that a temporary vectors is
120 // needed to hold the result of x1-x2. We only allow this if the
121 // FLENS_DEBUG_CLOSURES macro is defined such that a user has a chance to
122 // optimize his/her expressions.
123 //
124 ASSERT(0);
125 }
126
127 #else
128
129 template <typename ALPHA, typename VL, typename VR, typename VY>
130 void
131 axpy(const ALPHA &alpha,
132 const VectorClosure<OpSub, VL, VR> &x, Vector<VY> &y)
133 {
134 FLENS_BLASLOG_BEGIN_AXPY(alpha, x, y);
135 typedef VectorClosure<OpSub, VL, VR> VC;
136
137 //
138 // Compute the result of closure x = (x.left()-x.right()) first and store
139 // it in tmp.
140 //
141 FLENS_BLASLOG_TMP_TRON;
142 typename Result<VC>::Type tmp = x;
143 FLENS_BLASLOG_TMP_TROFF;
144 //
145 // Update y with tmp, i.e. compute y = y + alpha*tmp
146 //
147 axpy(alpha, tmp, y.impl());
148
149 FLENS_BLASLOG_TMP_REMOVE(tmp, x);
150 FLENS_BLASLOG_END;
151 }
152
153 #endif
154
155 //------------------------------------------------------------------------------
156 //
157 // y += scalar*x
158 //
159 // We evalute this with a scalSwitch
160 // case 1: x is no closure
161 // case 2: x is a closure
162
163 //
164 // case 1: x is no closure
165 //
166 template <typename ALPHA, typename VX, typename VY>
167 void
168 axpySwitch(const ALPHA &alpha, const Vector<VX> &x, Vector<VY> &y)
169 {
170 //
171 // No need to add another log-entry as we simply pass-through to the
172 // BLAS implementation
173 //
174 axpy(alpha, x.impl(), y.impl());
175 }
176
177 //
178 // case 2: x is a closure
179 //
180 #ifndef FLENS_DEBUG_CLOSURES
181
182 template <typename ALPHA, typename Op, typename L, typename R, typename VY>
183 void
184 axpySwitch(const ALPHA &, const VectorClosure<Op, L, R> &, Vector<VY> &)
185 {
186 // Creation of temporary not allowed
187
188 ASSERT(0);
189 }
190
191 #else
192
193 template <typename ALPHA, typename Op, typename L, typename R, typename VY>
194 void
195 axpySwitch(const ALPHA &alpha, const VectorClosure<Op, L, R> &x, Vector<VY> &y)
196 {
197 FLENS_BLASLOG_BEGIN_AXPY(alpha, x, y);
198 typedef VectorClosure<Op, L, R> VC;
199
200 //
201 // Compute the result of closure x and store it in tmp.
202 //
203 FLENS_BLASLOG_TMP_TRON;
204 typename Result<VC>::Type tmp = x;
205 FLENS_BLASLOG_TMP_TROFF;
206 //
207 // Update y with tmp, i.e. compute y = y + alpha*tmp
208 //
209 axpy(alpha, tmp, y.impl());
210
211 FLENS_BLASLOG_TMP_REMOVE(tmp, x);
212 FLENS_BLASLOG_END;
213 }
214
215 #endif
216
217 //
218 // entry point for switch
219 //
220 template <typename ALPHA, typename T, typename VX, typename VY>
221 void
222 axpy(const ALPHA &alpha,
223 const VectorClosure<OpMult, ScalarValue<T>, VX> &x, Vector<VY> &y)
224 {
225 FLENS_BLASLOG_BEGIN_AXPY(alpha, x, y);
226 //
227 // In the entry point we either have y += x or y -= x
228 //
229 ASSERT(alpha==ALPHA(1) || alpha==ALPHA(-1));
230 //
231 // Switch: 1) If x is a closure we need a temporary otherwise
232 // 2) call BLAS implementation directly.
233 //
234 axpySwitch(alpha*x.left().value(), x.right(), y.impl());
235
236 FLENS_BLASLOG_END;
237 }
238
239 //------------------------------------------------------------------------------
240 //
241 // y += x/scalar
242 //
243 // We evalute this with a rscalSwitch
244 // case 1: x is no closure
245 // case 2: x is a closure
246
247 //
248 // case 1: x is no closure
249 //
250 template <typename ALPHA, typename VX, typename VY>
251 void
252 raxpySwitch(const ALPHA &alpha, const Vector<VX> &x, Vector<VY> &y)
253 {
254 //
255 // No need to add another log-entry as we simply pass-through to the
256 // BLAS implementation
257 //
258 raxpy(alpha, x.impl(), y.impl());
259 }
260
261 //
262 // case 2: x is a closure
263 //
264 #ifndef FLENS_DEBUG_CLOSURES
265
266 template <typename ALPHA, typename Op, typename L, typename R, typename VY>
267 void
268 raxpySwitch(const ALPHA &, const VectorClosure<Op, L, R> &, Vector<VY> &)
269 {
270 //
271 // Creation of temporary not allowed
272 //
273 ASSERT(0);
274 }
275
276 #else
277
278 template <typename ALPHA, typename Op, typename L, typename R, typename VY>
279 void
280 raxpySwitch(const ALPHA &alpha, const VectorClosure<Op, L, R> &x, Vector<VY> &y)
281 {
282 FLENS_BLASLOG_BEGIN_RAXPY(alpha, x, y);
283 typedef VectorClosure<Op, L, R> VC;
284
285 //
286 // Compute the result of closure x and store it in tmp.
287 //
288 FLENS_BLASLOG_TMP_TRON;
289 typename Result<VC>::Type tmp = x;
290 FLENS_BLASLOG_TMP_TROFF;
291 //
292 // Update y with tmp, i.e. compute y = y + tmp/alpha
293 //
294 raxpy(alpha, tmp, y.impl());
295
296 FLENS_BLASLOG_TMP_REMOVE(tmp, x);
297 FLENS_BLASLOG_END;
298 }
299
300 #endif
301
302 //
303 // entry point for switch
304 //
305 template <typename ALPHA, typename VX, typename T, typename VY>
306 void
307 axpy(const ALPHA &alpha,
308 const VectorClosure<OpDiv, VX, ScalarValue<T> > &x, Vector<VY> &y)
309 {
310 FLENS_BLASLOG_BEGIN_AXPY(alpha, x, y);
311 //
312 // In the entry point we either have y += x/scalar or y -= x/scalar
313 //
314 ASSERT(alpha==ALPHA(1) || alpha==ALPHA(-1));
315 //
316 // Switch: 1) If x is a closure we need a temporary otherwise
317 // 2) call BLAS implementation directly.
318 //
319 raxpySwitch(alpha*x.right().value(), x.left(), y.impl());
320
321 FLENS_BLASLOG_END;
322 }
323
324 //------------------------------------------------------------------------------
325 //
326 // y += A*x
327 //
328 template <typename ALPHA, typename ML, typename VR, typename VY>
329 typename RestrictTo<ClosureType<OpMult, ML, VR>::isMatrixVectorProduct,
330 void>::Type
331 axpy(const ALPHA &alpha,
332 const VectorClosure<OpMult, ML, VR> &Ax, Vector<VY> &y)
333 {
334 FLENS_BLASLOG_BEGIN_AXPY(alpha, Ax, y);
335 //
336 // In the entry point we either have y += A*x or y -= A*x
337 //
338 ASSERT(alpha==ALPHA(1) || alpha==ALPHA(-1));
339
340 //
341 // If A is a closure then prune arbitrary many OpTrans/OpConj
342 //
343 typedef typename PruneConjTrans<ML>::Remainder MA;
344
345 Transpose trans = PruneConjTrans<ML>::trans;
346 const MA &A = PruneConjTrans<ML>::remainder(Ax.left());
347 //
348 // If x is a closure it gets evaluated. In this case a temporary gets
349 // created. Otherwise we only keep a reference
350 //
351 FLENS_BLASLOG_TMP_TRON;
352 const typename Result<VR>::Type &x = Ax.right();
353 FLENS_BLASLOG_TMP_TROFF;
354 //
355 // Call mv implementation
356 //
357 typename VY::Impl::ElementType One(1);
358 mvSwitch(Transpose(Trans^trans), alpha, A, x, One, y.impl());
359
360 //
361 // If a temporary was created and registered before we now unregister it
362 //
363 # ifdef FLENS_DEBUG_CLOSURES
364 if (!IsSame<VR, typename Result<VR>::Type>::value) {
365 FLENS_BLASLOG_TMP_REMOVE(x, Ax.right());
366 }
367 # endif
368
369 FLENS_BLASLOG_END;
370 }
371
372 //------------------------------------------------------------------------------
373 //
374 // y += x*A
375 //
376 template <typename ALPHA, typename VL, typename MR, typename VY>
377 typename RestrictTo<ClosureType<OpMult, VL, MR>::isVectorMatrixProduct,
378 void>::Type
379 axpy(const ALPHA &alpha, const VectorClosure<OpMult, VL, MR> &xA, Vector<VY> &y)
380 {
381 FLENS_BLASLOG_BEGIN_AXPY(alpha, xA, y);
382 //
383 // In the entry point we either have y += x*A or y -= x*A
384 //
385 ASSERT(alpha==ALPHA(1) || alpha==ALPHA(-1));
386
387 //
388 // If A is a closure then prune arbitrary many OpTrans/OpConj
389 //
390 typedef typename PruneConjTrans<MR>::Remainder MA;
391
392 Transpose trans = PruneConjTrans<MR>::trans;
393 const MA &A = PruneConjTrans<MR>::remainder(xA.right());
394 //
395 // If x is a closure it gets evaluated. In this case a temporary gets
396 // created. Otherwise we only keep a reference
397 //
398 FLENS_BLASLOG_TMP_TRON;
399 const typename Result<VL>::Type &x = xA.left();
400 FLENS_BLASLOG_TMP_TROFF;
401 //
402 // Call mv implementation
403 //
404 typename VY::Impl::ElementType One(1);
405 mvSwitch(Transpose(Trans^trans), alpha, A, x, One, y.impl());
406
407 //
408 // If a temporary was created and registered before we now unregister it
409 //
410 # ifdef FLENS_DEBUG_CLOSURES
411 if (!IsSame<VL, typename Result<VL>::Type>::value) {
412 FLENS_BLASLOG_TMP_REMOVE(x, xA.left());
413 }
414 # endif
415
416 FLENS_BLASLOG_END;
417 }
418
419 //------------------------------------------------------------------------------
420 //
421 // y += <Unknown Closure>
422 //
423 #ifndef FLENS_DEBUG_CLOSURES
424
425 template <typename ALPHA, typename Op, typename VL, typename VR, typename VY>
426 void
427 axpy(const ALPHA &, const VectorClosure<Op, VL, VR> &, Vector<VY> &)
428 {
429 ASSERT(0);
430 }
431
432 #else
433
434 template <typename ALPHA, typename Op, typename VL, typename VR, typename VY>
435 void
436 axpy(const ALPHA &alpha, const VectorClosure<Op, VL, VR> &x, Vector<VY> &y)
437 {
438 FLENS_BLASLOG_ERROR_AXPY(alpha, x, y);
439 ASSERT(0);
440 }
441
442 #endif
443
444 //-- matrix closures -----------------------------------------------------------
445 //
446 // In the following comments op(X) denotes x, X^T or X^H
447 //
448
449 //
450 // B += alpha*op(A1 + A2)
451 //
452 #ifndef FLENS_DEBUG_CLOSURES
453
454 template <typename ALPHA, typename ML, typename MR, typename MB>
455 void
456 axpy(Transpose, const ALPHA &, const MatrixClosure<OpAdd, ML, MR> &,
457 Matrix<MB> &)
458 {
459 //
460 // Note that B += A1+A2 or B -= A1+A2 is equivalent to B = B + (A1+A2) or
461 // B = B - (A1+A2) respectively. Hence, the result of A1+A2 has to be
462 // computed *before* we can update y. This means that a temporary vectors is
463 // needed to hold the result of A1+A2. We only allow this if the
464 // FLENS_DEBUG_CLOSURES macro is defined such that a user has a chance to
465 // optimize his/her expressions.
466 //
467 ASSERT(0);
468 }
469
470 #else
471
472 template <typename ALPHA, typename ML, typename MR, typename MB>
473 void
474 axpy(Transpose trans, const ALPHA &alpha,
475 const MatrixClosure<OpAdd, ML, MR> &A, Matrix<MB> &B)
476 {
477 FLENS_BLASLOG_BEGIN_MAXPY(trans, alpha, A, B);
478 typedef MatrixClosure<OpAdd, ML, MR> MC;
479
480 //
481 // Compute the result of closure A = (A.left()+A.right()) first and store
482 // it in tmp.
483 //
484 FLENS_BLASLOG_TMP_TRON;
485 typename Result<MC>::Type tmp = A;
486 FLENS_BLASLOG_TMP_TROFF;
487 //
488 // Update y with tmp, i.e. compute B = B + alpha*tmp
489 //
490 axpy(trans, alpha, tmp, B.impl());
491
492 FLENS_BLASLOG_TMP_REMOVE(tmp, A);
493 FLENS_BLASLOG_END;
494 }
495
496 #endif
497
498 //------------------------------------------------------------------------------
499 //
500 // B += alpha*op(A1 - A2)
501 //
502 #ifndef FLENS_DEBUG_CLOSURES
503
504 template <typename ALPHA, typename ML, typename MR, typename MB>
505 void
506 axpy(Transpose, const ALPHA &, const MatrixClosure<OpSub, ML, MR> &,
507 Matrix<MB> &)
508 {
509 //
510 // Note that B += A1+A2 or B -= A1+A2 is equivalent to B = B + (A1+A2) or
511 // B = B - (A1+A2) respectively. Hence, the result of A1+A2 has to be
512 // computed *before* we can update y. This means that a temporary vectors is
513 // needed to hold the result of A1+A2. We only allow this if the
514 // FLENS_DEBUG_CLOSURES macro is defined such that a user has a chance to
515 // optimize his/her expressions.
516 //
517 ASSERT(0);
518 }
519
520
521 #else
522
523 template <typename ALPHA, typename ML, typename MR, typename MB>
524 void
525 axpy(Transpose trans, const ALPHA &alpha,
526 const MatrixClosure<OpSub, ML, MR> &A, Matrix<MB> &B)
527 {
528 typedef MatrixClosure<OpSub, ML, MR> MC;
529 FLENS_BLASLOG_BEGIN_MAXPY(trans, alpha, A, B);
530
531 //
532 // Compute the result of closure A = (A.left()+A.right()) first and store
533 // it in tmp.
534 //
535 FLENS_BLASLOG_TMP_TRON;
536 typename Result<MC>::Type tmp = A;
537 FLENS_BLASLOG_TMP_TROFF;
538 //
539 // Update y with tmp, i.e. compute B = B + alpha*tmp
540 //
541 axpy(trans, alpha, tmp, B.impl());
542
543 FLENS_BLASLOG_TMP_REMOVE(tmp, A);
544 FLENS_BLASLOG_END;
545 }
546
547 #endif
548
549 //------------------------------------------------------------------------------
550 //
551 // B += scalar*op(A)
552 //
553
554 // We evalute this with a scalSwitch
555 // case 1: A is no closure
556 // case 2: A is a closure
557
558 //
559 // case 1: A is no closure
560 //
561 template <typename ALPHA, typename MA, typename MB>
562 void
563 axpySwitch(Transpose trans,
564 const ALPHA &alpha, const Matrix<MA> &A, Matrix<MB> &B)
565 {
566 //
567 // No need to add another log-entry as we simply pass-through to the
568 // BLAS implementation
569 //
570 axpy(trans, alpha, A.impl(), B.impl());
571 }
572
573 //
574 // case 2: A is a closure
575 //
576 #ifndef FLENS_DEBUG_CLOSURES
577
578 template <typename ALPHA, typename Op, typename L, typename R, typename VY>
579 void
580 axpySwitch(Transpose, const ALPHA &, const MatrixClosure<Op, L, R> &,
581 Matrix<VY> &)
582 {
583 //
584 // Creation of temporary not allowed
585 //
586 ASSERT(0);
587 }
588
589 #else
590
591 template <typename ALPHA, typename Op, typename L, typename R, typename VY>
592 void
593 axpySwitch(Transpose trans,
594 const ALPHA &alpha, const MatrixClosure<Op, L, R> &A, Matrix<VY> &B)
595 {
596 typedef MatrixClosure<Op, L, R> MC;
597
598 FLENS_BLASLOG_BEGIN_MAXPY(trans, alpha, A, B);
599
600 //
601 // Compute the result of closure A and store it in tmp.
602 //
603 FLENS_BLASLOG_TMP_TRON;
604 typename Result<MC>::Type tmp = A;
605 FLENS_BLASLOG_TMP_TROFF;
606 //
607 // Update B with tmp, i.e. compute B = B + alpha*tmp
608 //
609 axpy(trans, alpha, tmp, B.impl());
610
611 FLENS_BLASLOG_TMP_REMOVE(tmp, A);
612 FLENS_BLASLOG_END;
613 }
614
615 # endif
616
617 //
618 // entry point for switch
619 //
620 template <typename ALPHA, typename T, typename MA, typename MB>
621 void
622 axpy(Transpose trans, const ALPHA &alpha,
623 const MatrixClosure<OpMult, ScalarValue<T>, MA> &A, Matrix<MB> &B)
624 {
625 FLENS_BLASLOG_BEGIN_MAXPY(trans, alpha, A, B);
626 //
627 // In the entry point we either have B += alpha*A or B -= alpha*A
628 //
629 ASSERT(alpha==ALPHA(1) || alpha==ALPHA(-1));
630 //
631 // Switch: 1) If A is a closure we need a temporary otherwise
632 // 2) call BLAS implementation directly.
633 //
634 axpySwitch(trans, alpha*A.left().value(), A.right(), B.impl());
635
636 FLENS_BLASLOG_END;
637 }
638
639 //------------------------------------------------------------------------------
640 //
641 // B += op(A)/scalar
642 //
643
644 // We evalute this with a scalSwitch
645 // case 1: A is no closure
646 // case 2: A is a closure
647
648 //
649 // case 1: A is no closure
650 //
651 template <typename ALPHA, typename MA, typename MB>
652 void
653 raxpySwitch(Transpose trans,
654 const ALPHA &alpha, const Matrix<MA> &A, Matrix<MB> &B)
655 {
656 //
657 // No need to add another log-entry as we simply pass-through to the
658 // BLAS implementation
659 //
660 raxpy(trans, alpha, A.impl(), B.impl());
661 }
662
663 //
664 // case 2: A is a closure
665 //
666 #ifndef FLENS_DEBUG_CLOSURES
667
668 template <typename ALPHA, typename Op, typename L, typename R, typename VY>
669 void
670 raxpySwitch(Transpose, const ALPHA &, const MatrixClosure<Op, L, R> &,
671 Matrix<VY> &)
672 {
673 //
674 // Creation of temporary not allowed
675 //
676 ASSERT(0);
677 }
678
679 #else
680
681 template <typename ALPHA, typename Op, typename L, typename R, typename VY>
682 void
683 raxpySwitch(Transpose trans,
684 const ALPHA &alpha, const MatrixClosure<Op, L, R> &A, Matrix<VY> &B)
685 {
686 typedef MatrixClosure<Op, L, R> MC;
687
688 FLENS_BLASLOG_BEGIN_MRAXPY(trans, alpha, A, B);
689
690 //
691 // Compute the result of closure A and store it in tmp.
692 //
693 FLENS_BLASLOG_TMP_TRON;
694 typename Result<MC>::Type tmp = A;
695 FLENS_BLASLOG_TMP_TROFF;
696 //
697 // Update B with tmp, i.e. compute B = B + tmp/alpha
698 //
699 raxpy(trans, alpha, tmp, B.impl());
700
701 FLENS_BLASLOG_TMP_REMOVE(tmp, A);
702 FLENS_BLASLOG_END;
703 }
704
705 #endif
706
707 //
708 // entry point for switch
709 //
710 template <typename ALPHA, typename MA, typename T, typename MB>
711 void
712 axpy(Transpose trans, const ALPHA &alpha,
713 const MatrixClosure<OpDiv, MA, ScalarValue<T> > &A, Matrix<MB> &B)
714 {
715 FLENS_BLASLOG_BEGIN_MRAXPY(trans, alpha, A, B);
716 //
717 // In the entry point we either have B += A/scalar or B -= A/scalar
718 //
719 ASSERT(alpha==ALPHA(1) || alpha==ALPHA(-1));
720 //
721 // Switch: 1) If A is a closure we need a temporary otherwise
722 // 2) call BLAS implementation directly.
723 //
724 raxpySwitch(trans, alpha*A.right().value(), A.left(), B.impl());
725
726 FLENS_BLASLOG_END;
727 }
728
729 //------------------------------------------------------------------------------
730 //
731 // B += op(A^T)
732 //
733 template <typename ALPHA, typename MA, typename MB>
734 void
735 axpy(Transpose trans, const ALPHA &alpha,
736 const MatrixClosureOpTrans<MA> &A, Matrix<MB> &B)
737 {
738 FLENS_BLASLOG_BEGIN_MAXPY(trans, alpha, A, B);
739
740 trans = Transpose(trans^Trans);
741
742 axpy(trans, alpha, A.left(), B.impl());
743
744 FLENS_BLASLOG_END;
745 }
746
747 //------------------------------------------------------------------------------
748 //
749 // C += A*B
750 //
751 template <typename ALPHA, typename MA, typename MB, typename MC>
752 typename RestrictTo<ClosureType<OpMult, MA, MB>::isMatrixMatrixProduct,
753 void>::Type
754 axpy(Transpose trans, const ALPHA &alpha,
755 const MatrixClosure<OpMult,MA,MB> &AB, Matrix<MC> &C)
756 {
757 FLENS_BLASLOG_BEGIN_MAXPY(trans, alpha, AB, C);
758
759 typename MC::Impl::ElementType One(1);
760
761 if (trans==NoTrans || trans==ConjTrans) {
762 mmSwitch(trans, trans, alpha, AB.left(), AB.right(), One, C.impl());
763 } else {
764 mmSwitch(trans, trans, alpha, AB.right(), AB.left(), One, C.impl());
765 }
766
767 FLENS_BLASLOG_END;
768 }
769
770 //------------------------------------------------------------------------------
771 //
772 // B += <Unknown Closure>
773 //
774 #ifndef FLENS_DEBUG_CLOSURES
775
776 template <typename ALPHA, typename Op, typename ML, typename MR, typename MB>
777 void
778 axpy(Transpose, const ALPHA &, const MatrixClosure<Op, ML, MR> &, Matrix<MB> &)
779 {
780 ASSERT(0);
781 }
782
783 #else
784
785 template <typename ALPHA, typename Op, typename ML, typename MR, typename MB>
786 void
787 axpy(Transpose trans, const ALPHA &alpha,
788 const MatrixClosure<Op, ML, MR> &A, Matrix<MB> &B)
789 {
790 FLENS_BLASLOG_ERROR_MAXPY(trans, alpha, A, B);
791 ASSERT(0);
792 }
793
794 #endif
795
796
797 #ifdef FLENS_DEBUG_CLOSURES
798 //------------------------------------------------------------------------------
799 //
800 // B += Some Matrix
801 //
802 template <typename ALPHA, typename MA, typename MB>
803 void
804 axpy(Transpose trans, const ALPHA &alpha,
805 const Matrix<MA> &A, Matrix<MB> &B)
806 {
807 typedef typename MA::Impl::ElementType TA;
808 typedef GeMatrix<FullStorage<TA, ColMajor> > RMA;
809
810 FLENS_BLASLOG_TMP_TRON;
811 RMA _A = A.impl();
812 FLENS_BLASLOG_TMP_TROFF;
813
814 axpy(trans, alpha, _A, B.impl());
815
816 FLENS_BLASLOG_TMP_REMOVE(_A, A);
817 }
818 #endif
819
820
821 } } // namespace blas, flens
822
823 #endif // FLENS_BLAS_CLOSURES_AXPY_TCC