// gccvec.hpp (session8/gccvec.hpp)

#ifndef GCCVEC_HPP
#define GCCVEC_HPP

#include "gemm.hpp"
#include <type_traits>

//-- Micro Kernel --------------------------------------------------------------
/// GEMM micro kernel implemented with GCC vector extensions.
///
/// Updates an MR x (NR*vlen) block of C (in terms of the local constants
/// below) as
///
///     C <- beta*C + alpha*(A*B)
///
/// This overload only participates in overload resolution when
/// BlockSize<T>::vlen is non-zero.
///
/// @param kc       Number of rank-1 updates to accumulate.
/// @param alpha    Scalar applied to the accumulated product A*B.
/// @param A        Panel read as kc consecutive slices of MR elements each.
///                 Promised to the compiler to be aligned to
///                 BlockSize<T>::align bytes.
/// @param B        Panel read as kc consecutive slices of NR vectors (vlen
///                 elements each). Same alignment promise as for A; it is
///                 dereferenced through a vector pointer.
/// @param beta     Scalar applied to the previous content of C. If beta is
///                 exactly zero, C is overwritten and never read, so
///                 uninitialised/NaN/Inf entries in C do not propagate.
/// @param C        Output block; element (i, j) lives at
///                 C[i*incRowC + j*incColC].
/// @param incRowC  Stride between consecutive rows of C.
/// @param incColC  Stride between consecutive columns of C.
///
/// NOTE(review): A and B are presumably packed by the calling GEMM frame
/// routine in exactly this layout -- confirm against gemm.hpp.
template <typename Index, typename T>
typename std::enable_if<BlockSize<T>::vlen != 0,
         void>::type
ugemm(Index kc, T alpha, const T *A, const T *B, T beta,
      T *C, Index incRowC, Index incColC)
{
    // SIMD vector of T spanning BlockSize<T>::rwidth bits.
    typedef T vx __attribute__((vector_size (BlockSize<T>::rwidth/8)));

    static constexpr Index vlen = BlockSize<T>::vlen;
    static constexpr Index MR   = BlockSize<T>::MR;
    // From here on NR counts vectors per row, not scalar columns.
    static constexpr Index NR   = BlockSize<T>::NR/vlen;

    // The division above truncates; a non-multiple would silently drop
    // columns of the block.
    static_assert(BlockSize<T>::NR % BlockSize<T>::vlen == 0,
                  "BlockSize<T>::NR must be a multiple of BlockSize<T>::vlen");

    A = static_cast<const T *>(
            __builtin_assume_aligned(A, BlockSize<T>::align));
    B = static_cast<const T *>(
            __builtin_assume_aligned(B, BlockSize<T>::align));

    // Accumulator for A*B: row i, vector column j is stored at P[i*NR+j].
    vx P[MR*NR] = {};

    for (Index l=0; l<kc; ++l) {
        const vx *b = reinterpret_cast<const vx *>(B);
        for (Index i=0; i<MR; ++i) {
            for (Index j=0; j<NR; ++j) {
                // Scalar A[i] is broadcast across the vector b[j].
                P[i*NR+j] += A[i]*b[j];
            }
        }
        A += MR;
        B += vlen*NR;
    }

    // Skip the scaling pass for the common case alpha == 1.
    if (alpha!=T(1)) {
        for (Index k=0; k<MR*NR; ++k) {
            P[k] *= alpha;
        }
    }

    // With beta == 0 the old content of C must not be read: 0*NaN is NaN,
    // and GEMM semantics allow C to be unset on input in that case.
    const bool overwriteC = (beta==T(0));

    for (Index i=0; i<MR; ++i) {
        for (Index j=0; j<NR; ++j) {
            // View the accumulated vector as vlen scalars.
            const T *p = reinterpret_cast<const T *>(&P[i*NR+j]);
            for (Index j1=0; j1<vlen; ++j1) {
                T &c = C[i*incRowC+(j*vlen+j1)*incColC];
                if (overwriteC) {
                    c = p[j1];
                } else {
                    c *= beta;
                    c += p[j1];
                }
            }
        }
    }
}

#endif // GCCVEC_HPP