1
       2
       3
       4
       5
       6
       7
       8
       9
      10
      11
      12
      13
      14
      15
      16
      17
      18
      19
      20
      21
      22
      23
      24
      25
      26
      27
      28
      29
      30
      31
      32
      33
      34
      35
      36
      37
      38
      39
      40
      41
      42
      43
      44
      45
      46
      47
      48
      49
      50
      51
      52
      53
      54
      55
      56
      57
      58
      59
      60
      61
      62
      63
      64
      65
      66
      67
      68
      69
      70
      71
      72
      73
      74
      75
      76
      77
      78
      79
      80
      81
      82
      83
      84
      85
      86
      87
      88
      89
      90
      91
      92
      93
      94
      95
      96
      97
      98
      99
     100
     101
     102
     103
     104
     105
     106
     107
     108
     109
     110
     111
     112
     113
     114
     115
     116
     117
     118
     119
#ifndef GCCVEC_HPP
#define GCCVEC_HPP

#include "gemm.hpp"
#include <type_traits>

//-- Micro Kernel --------------------------------------------------------------
template <typename Index>
typename std::enable_if<std::is_convertible<Index, std::int64_t>::value
                    && BlockSize<double>::MR==4
                    && BlockSize<double>::NR==8
                    && BlockSize<double>::vlen == 4,
        void>::type
ugemm(Index kc, double alpha,
     const double *A, const double *B,
     double beta,
     double *C, Index incRowC, Index incColC)
{
    static const Index MR = BlockSize<double>::MR;
    static const Index NR = BlockSize<double>::NR/4;
    typedef double vx __attribute__((vector_size (4*sizeof(double))));

    A = (const double*) __builtin_assume_aligned (A, 32);
    B = (const double*) __builtin_assume_aligned (B, 32);

    vx b0, b1, tmp1, tmp2;

    const vx *b = (const vx *)B;
    b0 = b[0];

    vx P0_03 = {}; vx P0_47 = {};
    vx P1_03 = {}; vx P1_47 = {};
    vx P2_03 = {}; vx P2_47 = {};
    vx P3_03 = {}; vx P3_47 = {};

    for (Index l=0; l<kc; ++l) {
        b1 = b[1];

        tmp1 = A[0]*b0;  P0_03 += tmp1;
        tmp2 = A[1]*b0;  P1_03 += tmp2;
        tmp1 = A[2]*b0;  P2_03 += tmp1;
        tmp2 = A[3]*b0;  P3_03 += tmp2;

        b0 = b[2];

        tmp1 = A[0]*b1;  P0_47 += tmp1;
        tmp2 = A[1]*b1;  P1_47 += tmp2;
        tmp1 = A[2]*b1;  P2_47 += tmp1;
        tmp2 = A[3]*b1;  P3_47 += tmp2;

        A += MR;
        b += NR;
    }

    if (alpha!=double(1)) {
        P0_03 *= alpha;  P0_47 *= alpha;
        P1_03 *= alpha;  P1_47 *= alpha;
        P2_03 *= alpha;  P2_47 *= alpha;
        P3_03 *= alpha;  P3_47 *= alpha;
    }

    if (beta!=double(1)) {
        for (Index i=0; i<MR; ++i) {
            for (Index j=0; j<NR; ++j) {
                C[i*incRowC+j*incColC] *= beta;
            }
        }
    }

    const double *p = (const double *) &P0_03;
    C[0*incRowC+0*incColC] += p[0];
    C[0*incRowC+1*incColC] += p[1];
    C[0*incRowC+2*incColC] += p[2];
    C[0*incRowC+3*incColC] += p[3];

    p = (const double *) &P0_47;
    C[0*incRowC+4*incColC] += p[0];
    C[0*incRowC+5*incColC] += p[1];
    C[0*incRowC+6*incColC] += p[2];
    C[0*incRowC+7*incColC] += p[3];

    p = (const double *) &P1_03;
    C[1*incRowC+0*incColC] += p[0];
    C[1*incRowC+1*incColC] += p[1];
    C[1*incRowC+2*incColC] += p[2];
    C[1*incRowC+3*incColC] += p[3];

    p = (const double *) &P1_47;
    C[1*incRowC+4*incColC] += p[0];
    C[1*incRowC+5*incColC] += p[1];
    C[1*incRowC+6*incColC] += p[2];
    C[1*incRowC+7*incColC] += p[3];

    p = (const double *) &P2_03;
    C[2*incRowC+0*incColC] += p[0];
    C[2*incRowC+1*incColC] += p[1];
    C[2*incRowC+2*incColC] += p[2];
    C[2*incRowC+3*incColC] += p[3];

    p = (const double *) &P2_47;
    C[2*incRowC+4*incColC] += p[0];
    C[2*incRowC+5*incColC] += p[1];
    C[2*incRowC+6*incColC] += p[2];
    C[2*incRowC+7*incColC] += p[3];

    p = (const double *) &P3_03;
    C[3*incRowC+0*incColC] += p[0];
    C[3*incRowC+1*incColC] += p[1];
    C[3*incRowC+2*incColC] += p[2];
    C[3*incRowC+3*incColC] += p[3];

    p = (const double *) &P3_47;
    C[3*incRowC+4*incColC] += p[0];
    C[3*incRowC+5*incColC] += p[1];
    C[3*incRowC+6*incColC] += p[2];
    C[3*incRowC+7*incColC] += p[3];
}

#endif // GCCVEC_HPP