1
      2
      3
      4
      5
      6
      7
      8
      9
     10
     11
     12
     13
     14
     15
     16
     17
     18
     19
     20
     21
     22
     23
     24
     25
     26
     27
     28
     29
     30
     31
     32
     33
     34
     35
     36
     37
     38
     39
     40
     41
     42
     43
     44
     45
     46
     47
     48
     49
     50
     51
     52
     53
     54
     55
#ifndef GCCVEC_HPP
#define GCCVEC_HPP

#include "gemm.hpp"
#include <type_traits>

//-- Micro Kernel --------------------------------------------------------------
// GEMM micro kernel implemented with GCC vector extensions.
//
// Updates one block of C in place:
//
//     C <- beta*C + alpha*A*B
//
// The block has MR rows and NR*vlen columns, where MR, NR and vlen are the
// local constants derived from BlockSize<T> below.
//
// Parameters:
//   kc       number of rank-1 updates to accumulate; nothing is accumulated
//            if kc <= 0 (the product term is then zero).
//   alpha    scalar applied to the accumulated product A*B.
//   A        packed panel: kc consecutive groups of MR elements; element i
//            of group l multiplies row i of the block.
//   B        packed panel: kc consecutive groups of NR vectors (vlen
//            elements each); read through vector-typed pointers.
//   beta     scalar applied to the existing content of C.  If beta is zero,
//            C is overwritten without being read, so NaN/Inf or
//            uninitialized values in C do not leak into the result.
//   C        output block; element (i, j) is C[i*incRowC + j*incColC].
//   incRowC  stride between consecutive rows of C.
//   incColC  stride between consecutive columns of C.
//
// Only participates in overload resolution if BlockSize<T>::vlen != 0.
//
// NOTE(review): A and B are declared to the compiler as aligned to
// BlockSize<T>::align; callers must guarantee this.
// NOTE(review): presumably BlockSize<T>::rwidth is the vector register width
// in bits and BlockSize<T>::NR is a multiple of vlen -- confirm in gemm.hpp.
template <typename Index, typename T>
typename std::enable_if<BlockSize<T>::vlen != 0,
         void>::type
ugemm(Index kc, T alpha, const T *A, const T *B, T beta,
      T *C, Index incRowC, Index incColC)
{
    // GCC vector type holding BlockSize<T>::rwidth/8 bytes of T elements.
    typedef T vx __attribute__((vector_size (BlockSize<T>::rwidth/8)));

    static constexpr Index vlen = BlockSize<T>::vlen;
    static constexpr Index MR   = BlockSize<T>::MR;
    // Number of vectors per row of the block (not the number of columns).
    static constexpr Index NR   = BlockSize<T>::NR/vlen;

    A = static_cast<const T *>(
            __builtin_assume_aligned (A, BlockSize<T>::align));
    B = static_cast<const T *>(
            __builtin_assume_aligned (B, BlockSize<T>::align));

    // Accumulator for A*B, zero-initialized; row i occupies P[i*NR .. i*NR+NR-1].
    vx P[MR*NR] = {};

    for (Index l=0; l<kc; ++l) {
        const vx *b = reinterpret_cast<const vx *>(B);
        for (Index i=0; i<MR; ++i) {
            for (Index j=0; j<NR; ++j) {
                // Scalar A[i] is broadcast across the lanes of b[j].
                P[i*NR+j] += A[i]*b[j];
            }
        }
        A += MR;
        B += vlen*NR;
    }

    if (alpha!=T(1)) {
        for (Index i=0; i<MR; ++i) {
            for (Index j=0; j<NR; ++j) {
                P[i*NR+j] *= alpha;
            }
        }
    }

    // Write back lane by lane, because C may have arbitrary strides.
    const bool overwriteC = (beta==T(0));

    for (Index i=0; i<MR; ++i) {
        for (Index j=0; j<NR; ++j) {
            const T *p = reinterpret_cast<const T *>(&P[i*NR+j]);
            for (Index j1=0; j1<vlen; ++j1) {
                T &c = C[i*incRowC+(j*vlen+j1)*incColC];
                if (overwriteC) {
                    // beta == 0: do not read C; 0*NaN would yield NaN.
                    c = p[j1];
                } else {
                    c *= beta;
                    c += p[j1];
                }
            }
        }
    }
}

#endif // GCCVEC_HPP