#ifndef GCCVEC_HPP #define GCCVEC_HPP #include "gemm.hpp" #include <type_traits> //-- Micro Kernel -------------------------------------------------------------- template <typename Index, typename T> typename std::enable_if<BlockSize<T>::vlen != 0, void>::type ugemm(Index kc, T alpha, const T *A, const T *B, T beta, T *C, Index incRowC, Index incColC) { typedef T vx __attribute__((vector_size (BlockSize<T>::rwidth/8))); static constexpr Index vlen = BlockSize<T>::vlen; static constexpr Index MR = BlockSize<T>::MR; static constexpr Index NR = BlockSize<T>::NR/vlen; A = (const T*) __builtin_assume_aligned (A, BlockSize<T>::align); B = (const T*) __builtin_assume_aligned (B, BlockSize<T>::align); vx P[MR*NR] = {}; for (Index l=0; l<kc; ++l) { const vx *b = (const vx *)B; for (Index i=0; i<MR; ++i) { for (Index j=0; j<NR; ++j) { P[i*NR+j] += A[i]*b[j]; } } A += MR; B += vlen*NR; } if (alpha!=T(1)) { for (Index i=0; i<MR; ++i) { for (Index j=0; j<NR; ++j) { P[i*NR+j] *= alpha; } } } for (Index i=0; i<MR; ++i) { for (Index j=0; j<NR; ++j) { const T *p = (const T *) &P[i*NR+j]; for (Index j1=0; j1<vlen; ++j1) { C[i*incRowC+(j*vlen+j1)*incColC] *= beta; C[i*incRowC+(j*vlen+j1)*incColC] += p[j1]; } } } } #endif // GCCVEC_HPP |