GNU Vector Extensions

Content

In this session we give you a taste of the performance boost you can achieve by applying hardware optimizations to the micro kernel.

But note: you can only observe this performance improvement because we have previously exploited the cache hierarchy.

Exercise

Make the following modifications to ulmblas.c:

Compile with additional flags -O3 and -mavx and re-run the benchmarks.

GEMM Micro Kernel using the GNU Vector Extensions

//-- GEMM micro kernel (gcc vector extensions) ---------------------------------

#ifndef DGEMM_GCC_VECBITS
#define DGEMM_GCC_VECBITS     256
#endif

#define DGEMM_GCC_VECBYTES    (DGEMM_GCC_VECBITS / 8)
#define DGEMM_GCC_VECDBLS     (DGEMM_GCC_VECBITS / (8*sizeof(double)))
#define DGEMM_GCC_NR          (DGEMM_NR / DGEMM_GCC_VECDBLS)

/* Micro kernel: C <- beta*C + alpha*A*B for an MR x NR block, with A packed
 * column-major (MR doubles per rank-1 step) and B packed row-major (NR doubles
 * per step).  Each row of the accumulator is held as DGEMM_GCC_NR vector
 * registers of DGEMM_GCC_VECDBLS doubles.
 *
 * Requires A and B to be DGEMM_GCC_VECBYTES-aligned (promised to the compiler
 * via __builtin_assume_aligned).  C may be unaligned and strided; incRowC /
 * incColC give its row/column strides in doubles.
 */
void
dgemm_micro_gcc(size_t k, double alpha,
                const double *A, const double *B,
                double beta,
                double *C, ptrdiff_t incRowC, ptrdiff_t incColC)
{
    typedef double vec __attribute__((vector_size (DGEMM_GCC_VECBYTES)));

    vec acc[DGEMM_MR*DGEMM_GCC_NR] = {};   /* zero-initialized accumulator */

    A = (const double*) __builtin_assume_aligned (A, DGEMM_GCC_VECBYTES);
    B = (const double*) __builtin_assume_aligned (B, DGEMM_GCC_VECBYTES);

    /* acc <- A*B as a sum of k rank-1 updates; scalar A[i] broadcasts
     * across the vector lanes of bv[j].  */
    for (size_t l=0; l<k; ++l, A += DGEMM_MR, B += DGEMM_NR) {
        const vec *bv = (const vec *)B;
        for (size_t i=0; i<DGEMM_MR; ++i) {
            vec *row = &acc[i*DGEMM_GCC_NR];
            for (size_t j=0; j<DGEMM_GCC_NR; ++j) {
                row[j] += A[i]*bv[j];
            }
        }
    }

    /* acc <- alpha*acc (flat sweep over the whole accumulator) */
    for (size_t idx=0; idx<DGEMM_MR*DGEMM_GCC_NR; ++idx) {
        acc[idx] *= alpha;
    }

    /* Write back: C <- beta*C + acc, lane by lane through the strided C.
     * The beta==0 branch overwrites C so NaN/Inf in the old C never leak
     * through (standard BLAS convention -- exact compare is intentional). */
    if (beta!=0) {
        for (size_t i=0; i<DGEMM_MR; ++i) {
            for (size_t j=0; j<DGEMM_GCC_NR; ++j) {
                const double *lane = (const double *) &acc[i*DGEMM_GCC_NR+j];
                for (size_t v=0; v<DGEMM_GCC_VECDBLS; ++v) {
                    double *c = &C[i*incRowC + (j*DGEMM_GCC_VECDBLS + v)*incColC];
                    *c = *c * beta + lane[v];
                }
            }
        }
    } else {
        for (size_t i=0; i<DGEMM_MR; ++i) {
            for (size_t j=0; j<DGEMM_GCC_NR; ++j) {
                const double *lane = (const double *) &acc[i*DGEMM_GCC_NR+j];
                for (size_t v=0; v<DGEMM_GCC_VECDBLS; ++v) {
                    C[i*incRowC + (j*DGEMM_GCC_VECDBLS + v)*incColC] = lane[v];
                }
            }
        }
    }
}