Using fused AXPY and DOT Operations for TRSV

Content

In the previous benchmark the LU variant based on GEMV/TRSV has more efficient. That mainly due to the fact that the GEMV implementation uses fuesed AXPY and fused DOT operations.

We will now alos use the fused operations for TRSV and hope to achieve a further improvement in the performance.

TRSV: Optimized for col major case

The TRSV operation can be optimized using fused AXPY or fused DOT operations. In the code of ulmblas.c (found in /home/numerik/pub/hpc/ss18/ulmblas/session14b/) this kind of optimization was applied for the col major case. Further modifications:

#include <ulmblas.h>
#include <ulmaux.h>

#include <math.h>           // for nan(), fabs()

//-- BLAS Level 1 functions ----------------------------------------------------

void
dcopy(size_t n,
      const double *x, ptrdiff_t incX,
      double *y, ptrdiff_t incY)
{
    for (size_t i=0; i<n; ++i) {
        y[i*incY] = x[i*incX];
    }
}

void
daxpy(size_t n, double alpha,
      const double *x, ptrdiff_t incX,
      double *y, ptrdiff_t incY)
{
    for (size_t i=0; i<n; ++i) {
        y[i*incY] += alpha*x[i*incX];
    }
}

double
ddot(size_t n,
     const double *x, ptrdiff_t incX,
     const double *y, ptrdiff_t incY)
{
    double result = 0;

    for (size_t i=0; i<n; ++i) {
        result += x[i*incX]*y[i*incY];
    }

    return result;
}

void
dscal(size_t n,
      double alpha,
      double *x, ptrdiff_t incX)
{
    if (alpha==1) {
        return;
    }
    if (alpha!=0) {
        for (size_t i=0; i<n; ++i) {
            x[i*incX] *= alpha;
        }
    } else {
        for (size_t i=0; i<n; ++i) {
            x[i*incX] = 0;
        }
    }
}

size_t
idamax(size_t n, const double *x, ptrdiff_t incX)
{
    double max = 0;
    size_t I   = 0;

    for (size_t i=0; i<n; ++i) {
        if (fabs(x[i*incX])>max) {
            I   = i;
            max = fabs(x[i*incX]);
        }
    }
    return I;
}

void
dswap(size_t n, double *x, ptrdiff_t incX, double *y, ptrdiff_t incY)
{
    for (size_t i=0; i<n; ++i) {
        double tmp = x[i*incX];
        x[i*incX] = y[i*incY];
        y[i*incY] = tmp;
    }
}

//-- BLAS Level 2 functions ----------------------------------------------------

#ifndef AXPYF
#define AXPYF 4
#endif

// y <- alpha*A*x + y where A is a m x AXPYF matrix
void
daxpyf(size_t m, double alpha,
       const double *A, ptrdiff_t incRowA, ptrdiff_t incColA,
       const double *x, ptrdiff_t incX,
       double *y, ptrdiff_t incY)
{
    for (size_t i=0; i<m; ++i) {
        for (size_t l=0; l<AXPYF; ++l) {
            y[i*incY] += alpha * A[i*incRowA+l*incColA] * x[l*incX];
        }
    }
}

void
dgemv_axpyf(size_t m, size_t n,
            double alpha,
            const double *A, ptrdiff_t incRowA, ptrdiff_t incColA,
            const double *x, ptrdiff_t incX,
            double *y, ptrdiff_t incY)
{
    size_t nb = n / AXPYF;
    for (size_t j=0; j<nb; ++j) {
        daxpyf(m, alpha,
               &A[j*AXPYF*incColA], incRowA, incColA,
               &x[j*AXPYF*incX], incX,
               y, incY);
    }
    for (size_t j=nb*AXPYF; j<n; ++j) {
        daxpy(m, alpha*x[j*incX], &A[j*incColA], incRowA, y, incY);
    }
}

#ifndef DOTF
#define DOTF 4
#endif

void
dgemv_dotf(size_t m, size_t n,
           double alpha,
           const double *A, ptrdiff_t incRowA, ptrdiff_t incColA,
           const double *x, ptrdiff_t incX,
           double *y, ptrdiff_t incY)
{
    size_t mb = m / DOTF;

    for (size_t i=0; i<mb; ++i) {
        for (size_t j=0; j<n; ++j) {
            for (size_t l=0; l<DOTF; ++l) {
                y[(DOTF*i+l)*incY]
                    += alpha*A[(DOTF*i+l)*incRowA+j*incColA]*x[j*incX];
            }
        }
    }

    for (size_t i=mb*DOTF; i<m; ++i) {
        y[i*incY] += alpha*ddot(n, &A[i*incRowA], incColA, x, incX);
    }
}

void
dgemv(size_t m, size_t n,
      double alpha,
      const double *A, ptrdiff_t incRowA, ptrdiff_t incColA,
      const double *x, ptrdiff_t incX,
      double beta,
      double *y, ptrdiff_t incY)
{
    dscal(m, beta, y, incY);

    if (alpha==0 || m==0 || n==0) {
        return;
    }

    if (incRowA<incColA) {
        dgemv_axpyf(m, n, alpha, A, incRowA, incColA, x, incX, y, incY);
    } else {
        dgemv_dotf(m, n, alpha, A, incRowA, incColA, x, incX, y, incY);
    }
}

void
dger(size_t m, size_t n,
     double alpha,
     const double *x, ptrdiff_t incX,
     const double *y, ptrdiff_t incY,
     double *A, ptrdiff_t incRowA, ptrdiff_t incColA)
{
    if (alpha==0 || m==0 || n==0) {
        return;
    }

    if (incRowA>incColA) {
        dger(n, m, alpha, y, incY, x, incX, A, incColA, incRowA);
        return;
    }

    for (size_t j=0; j<n; ++j) {
        for (size_t i=0; i<m; ++i) {
            A[i*incRowA+j*incColA] += alpha*x[i*incX]*y[j*incY];
        }
    }
}

void
dtrsv(size_t n, bool lower, bool unit,
      const double *A, ptrdiff_t incRowA, ptrdiff_t incColA,
      double *x, ptrdiff_t incX)
{
    if (lower) {
        if (incRowA<incColA) {
            // A is col major

            size_t nb = n / AXPYF;
            for (size_t j=0; j<nb; ++j) {
                for (size_t l=0; l<AXPYF; ++l) {
                    size_t J = j*AXPYF + l;

                    if (!unit) {
                        x[J*incX] /= A[J*(incRowA+incColA)];
                    }
                    daxpy(AXPYF-1-l, -x[J*incX],
                          &A[(J+1)*incRowA + J*incColA], incRowA,
                          &x[(J+1)*incX], incX);
                }
                daxpyf(n-(j+1)*AXPYF, -1,
                       &A[((j+1)*incRowA + j*incColA)*AXPYF], incRowA, incColA,
                       &x[j*incX*AXPYF], incX,
                       &x[(j+1)*incX*AXPYF], incX);
            }
            for (size_t j=nb*AXPYF; j<n; ++j) {
                if (!unit) {
                    x[j*incX] /= A[j*(incRowA+incColA)];
                }
                daxpy(n-1-j, -x[j*incX],
                      &A[(j+1)*incRowA+j*incColA], incRowA,
                      &x[(j+1)*incX], incX);
            }
        } else {
            // A is row major
            for (size_t j=0; j<n; ++j) {
                x[j*incX] -= ddot(j, &A[j*incRowA], incColA, x, incX);
                if (!unit) {
                    x[j*incX] /= A[j*(incRowA+incColA)];
                }
            }
        }
    } else {
        if (incRowA<incColA) {
            // A is col major
            for (size_t j=n; j-- >0; ) {
                if (!unit) {
                    x[j*incX] /= A[j*(incRowA+incColA)];
                }
                daxpy(j, -x[j*incX], &A[j*incColA], incRowA,
                      x, incX);
            }
        } else {
            // A is row major
            for (size_t j=n; j-- >0; ) {
                x[j*incX] -= ddot(n-1-j,
                                  &A[j*incRowA+(j+1)*incColA], incColA,
                                  &x[(j+1)*incX], incX);
                if (!unit) {
                    x[j*incX] /= A[j*(incRowA+incColA)];
                }
            }
        }
    }
}

//-- LAPACK functions ----------------------------------------------------------

ptrdiff_t
dgetrf_ger(size_t m, size_t n,
           double *A, ptrdiff_t incRowA, ptrdiff_t incColA,
           size_t *p, ptrdiff_t incP)
{
    size_t k = m<n ? m : n;

    for (size_t j=0; j<k; ++j) {
        size_t pj = j + idamax(m-j, &A[j*incRowA + j*incColA], incRowA);

        if (j!=pj) {
            dswap(n, &A[j*incRowA], incColA, &A[pj*incRowA], incColA);
        }
        p[j*incP] = pj;

        double alpha = A[j*incRowA + j*incColA];
        if (alpha==0) {
            return j;
        } else {
            alpha = 1/alpha;
        }

        dscal(m-1-j, alpha, &A[(j+1)*incRowA + j*incColA], incRowA);
        dger(m-1-j, n-1-j, -1,
             &A[(j+1)*incRowA +  j   *incColA], incRowA,
             &A[ j   *incRowA + (j+1)*incColA], incColA,
             &A[(j+1)*incRowA + (j+1)*incColA], incRowA, incColA);
    }

    return -1;
}

ptrdiff_t
dgetrf_gemv(size_t m, size_t n,
            double *A, ptrdiff_t incRowA, ptrdiff_t incColA,
            size_t *p, ptrdiff_t incP)
{
    size_t k = m<n ? m : n;

    for (size_t j=0; j<k; ++j) {
        dtrsv(j, true, true,
              A, incRowA, incColA,
              &A[j*incColA], incRowA);

        dgemv(m-j, j, -1,
              &A[j*incRowA], incRowA, incColA,
              &A[j*incColA], incRowA,
              1,
              &A[j*incRowA+j*incColA], incRowA);

        size_t pj = j + idamax(m-j, &A[j*incRowA + j*incColA], incRowA);

        if (j!=pj) {
            dswap(n, &A[j*incRowA], incColA, &A[pj*incRowA], incColA);
        }
        p[j*incP] = pj;

        double alpha = A[j*incRowA + j*incColA];
        if (alpha==0) {
            return j;
        }

        dscal(m-j-1, 1/alpha, &A[(j+1)*incRowA+j*incColA], incRowA);
    }
    for (size_t j=k; j<n; ++j) {
        dtrsv(m, true, true,
              A, incRowA, incColA,
              &A[j*incColA], incRowA);
    }

    return -1;
}

#define GETRF_GER   1
#define GETRF_GEMV  2

#ifndef GETRF
#define GETRF   0
#endif

// this pragma makes gcc stop immediately on '#error'
#pragma GCC diagnostic error "-Wfatal-errors"

ptrdiff_t
dgetrf(size_t m, size_t n,
       double *A, ptrdiff_t incRowA, ptrdiff_t incColA,
       size_t *p, ptrdiff_t incP)
{
#if  GETRF == GETRF_GER
    return dgetrf_ger(m, n, A, incRowA, incColA, p, incP);
#elif GETRF == GETRF_GEMV
    return dgetrf_gemv(m, n, A, incRowA, incColA, p, incP);
#else
#error "Compile with either -DGETRF=GETRF_GER or -DGETRF=GETRF_GEMV"
#endif
}

Exercise

Test and Benchmark