1
       2
       3
       4
       5
       6
       7
       8
       9
      10
      11
      12
      13
      14
      15
      16
      17
      18
      19
      20
      21
      22
      23
      24
      25
      26
      27
      28
      29
      30
      31
      32
      33
      34
      35
      36
      37
      38
      39
      40
      41
      42
      43
      44
      45
      46
      47
      48
      49
      50
      51
      52
      53
      54
      55
      56
      57
      58
      59
      60
      61
      62
      63
      64
      65
      66
      67
      68
      69
      70
      71
      72
      73
      74
      75
      76
      77
      78
      79
      80
      81
      82
      83
      84
      85
      86
      87
      88
      89
      90
      91
      92
      93
      94
      95
      96
      97
      98
      99
     100
     101
     102
     103
     104
     105
     106
     107
     108
     109
     110
     111
     112
     113
     114
     115
     116
     117
     118
     119
     120
     121
     122
     123
     124
     125
     126
/*
 * Copyright (C) 2014, The University of Texas at Austin
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are
 * met:
 *  - Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *  - Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *  - Neither the name of The University of Texas at Austin nor the names
 *    of its contributors may be used to endorse or promote products
 *    derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 */

/*
 * Copyright (C) 2014-2015, Michael Lehn
 *
 * ulmBLAS adopted general ideas from BLIS.  Using micro kernels from BLIS
 * only requires minor modifications.
 *
 */

#ifndef ULMBLAS_IMPL_LEVEL1_KERNEL_SSE_AXPY_TCC
#define ULMBLAS_IMPL_LEVEL1_KERNEL_SSE_AXPY_TCC 1

#include <immintrin.h>

#include <ulmblas/impl/auxiliary/isaligned.h>
#include <ulmblas/impl/level1/kernel/ref/axpy.h>
#include <ulmblas/impl/level1/kernel/sse/axpy.h>

namespace ulmBLAS { namespace sse {

//
// ----------------
// Double Precision
// ----------------
//

template <typename IndexType>
void
axpy(IndexType      n,
     const double   &alpha,
     const double   *x,
     IndexType      incX,
     double         *y,
     IndexType      incY)
{
    if (n<=0 || alpha==double(0)) {
        return;
    }

    if (incX!=1 || incY!=1) {
        ref::axpy(n, alpha, x, incX, y, incY);
        return;
    }

    bool xAligned = isAligned(x, 16);
    bool yAligned = isAligned(y, 16);

    if (!xAligned && !yAligned) {
        y[0] += alpha*x[0];
        ++x;
        ++y;
        --n;
        xAligned = yAligned = true;
    }
    if (xAligned && yAligned) {
        IndexType nb = n / 6;
        IndexType nl = n % 6;

        __m128d alpha11, x12, x34, x56, y12, y34, y56;

        alpha11 = _mm_loaddup_pd(&alpha);

        for (IndexType i=0; i<nb; ++i) {
            x12 = _mm_load_pd(x);
            y12 = _mm_load_pd(y);

            x12 = x12 * alpha11;
            y12 = y12 + x12;
            _mm_store_pd(y, y12);

            x34 = _mm_load_pd(x+2);
            y34 = _mm_load_pd(y+2);

            x34 = x34 * alpha11;
            y34 = y34 + x34;
            _mm_store_pd(y+2, y34);

            x56 = _mm_load_pd(x+4);
            y56 = _mm_load_pd(y+4);

            x56 = x56 * alpha11;
            y56 = y56 + x56;
            _mm_store_pd(y+4, y56);

            x += 6;
            y += 6;
        }
        for (IndexType i=0; i<nl; ++i) {
            y[i] += alpha*x[i];
        }
    } else {
        ref::axpy(n, alpha, x, IndexType(1), y, IndexType(1));
    }
}

} } // namespace sse, ulmBLAS

#endif // ULMBLAS_IMPL_LEVEL1_KERNEL_SSE_AXPY_TCC 1