#ifndef GCCVEC_HPP
#define GCCVEC_HPP
#include "gemm.hpp"
#include <type_traits>
//-- Micro Kernel --------------------------------------------------------------
template <typename Index>
typename std::enable_if<std::is_convertible<Index, std::int64_t>::value
&& BlockSize<double>::MR==4
&& BlockSize<double>::NR==8
&& BlockSize<double>::vlen == 4,
void>::type
ugemm(Index kc, double alpha,
const double *A, const double *B,
double beta,
double *C, Index incRowC, Index incColC)
{
static const Index MR = BlockSize<double>::MR;
static const Index NR = BlockSize<double>::NR/4;
typedef double vx __attribute__((vector_size (4*sizeof(double))));
A = (const double*) __builtin_assume_aligned (A, 32);
B = (const double*) __builtin_assume_aligned (B, 32);
vx b0, b1, tmp1, tmp2;
const vx *b = (const vx *)B;
b0 = b[0];
vx P0_03 = {}; vx P0_47 = {};
vx P1_03 = {}; vx P1_47 = {};
vx P2_03 = {}; vx P2_47 = {};
vx P3_03 = {}; vx P3_47 = {};
for (Index l=0; l<kc; ++l) {
b1 = b[1];
tmp1 = A[0]*b0; P0_03 += tmp1;
tmp2 = A[1]*b0; P1_03 += tmp2;
tmp1 = A[2]*b0; P2_03 += tmp1;
tmp2 = A[3]*b0; P3_03 += tmp2;
b0 = b[2];
tmp1 = A[0]*b1; P0_47 += tmp1;
tmp2 = A[1]*b1; P1_47 += tmp2;
tmp1 = A[2]*b1; P2_47 += tmp1;
tmp2 = A[3]*b1; P3_47 += tmp2;
A += MR;
b += NR;
}
if (alpha!=double(1)) {
P0_03 *= alpha; P0_47 *= alpha;
P1_03 *= alpha; P1_47 *= alpha;
P2_03 *= alpha; P2_47 *= alpha;
P3_03 *= alpha; P3_47 *= alpha;
}
if (beta!=double(1)) {
for (Index i=0; i<MR; ++i) {
for (Index j=0; j<NR; ++j) {
C[i*incRowC+j*incColC] *= beta;
}
}
}
const double *p = (const double *) &P0_03;
C[0*incRowC+0*incColC] += p[0];
C[0*incRowC+1*incColC] += p[1];
C[0*incRowC+2*incColC] += p[2];
C[0*incRowC+3*incColC] += p[3];
p = (const double *) &P0_47;
C[0*incRowC+4*incColC] += p[0];
C[0*incRowC+5*incColC] += p[1];
C[0*incRowC+6*incColC] += p[2];
C[0*incRowC+7*incColC] += p[3];
p = (const double *) &P1_03;
C[1*incRowC+0*incColC] += p[0];
C[1*incRowC+1*incColC] += p[1];
C[1*incRowC+2*incColC] += p[2];
C[1*incRowC+3*incColC] += p[3];
p = (const double *) &P1_47;
C[1*incRowC+4*incColC] += p[0];
C[1*incRowC+5*incColC] += p[1];
C[1*incRowC+6*incColC] += p[2];
C[1*incRowC+7*incColC] += p[3];
p = (const double *) &P2_03;
C[2*incRowC+0*incColC] += p[0];
C[2*incRowC+1*incColC] += p[1];
C[2*incRowC+2*incColC] += p[2];
C[2*incRowC+3*incColC] += p[3];
p = (const double *) &P2_47;
C[2*incRowC+4*incColC] += p[0];
C[2*incRowC+5*incColC] += p[1];
C[2*incRowC+6*incColC] += p[2];
C[2*incRowC+7*incColC] += p[3];
p = (const double *) &P3_03;
C[3*incRowC+0*incColC] += p[0];
C[3*incRowC+1*incColC] += p[1];
C[3*incRowC+2*incColC] += p[2];
C[3*incRowC+3*incColC] += p[3];
p = (const double *) &P3_47;
C[3*incRowC+4*incColC] += p[0];
C[3*incRowC+5*incColC] += p[1];
C[3*incRowC+6*incColC] += p[2];
C[3*incRowC+7*incColC] += p[3];
}
#endif // GCCVEC_HPP