#include <ulmblas/level2.h>
#include <ulmblas/level1.h>
//
// BLAS Level 2
//
#ifndef DGEMV_MxBS
#define DGEMV_MxBS 4
#endif
void
dgemv_mxBS(int m, double alpha,
           const double *A, int incRowA, int incColA,
           const double *x, int incX,
           double *y, int incY)
{
    int i, j;
    if (alpha==0) {
        return;
    }
    for (i=0; i<m; ++i) {
        double dot = 0;
        for (j=0; j<DGEMV_MxBS; ++j) {
            dot += A[i*incRowA+j*incColA]*x[j*incX];
        }
        y[i*incY] += alpha*dot;
    }
}
#ifndef DGEMV_BSxN
#define DGEMV_BSxN 4
#endif
void
dgemv_BSxn(int n, double alpha,
           const double *A, int incRowA, int incColA,
           const double *x, int incX,
           double beta,
           double *y, int incY)
{
    int i, j;
    if (alpha==0) {
        return;
    }
    if (beta!=1) {
        for (i=0; i<DGEMV_BSxN; ++i) {
            y[i*incY] *= beta;
        }
    }
    for (j=0; j<n; ++j) {
        for (i=0; i<DGEMV_BSxN; ++i) {
            y[i*incY] += alpha*A[i*incRowA+j*incColA]*x[j*incX];
        }
    }
}
//-- GEMV implementations ------------------------------------------------------
void
dgemv_unblk(int m, int n, double alpha,
            const double *A, int incRowA, int incColA,
            const double *x, int incX,
            double beta,
            double *y, int incY)
{
    if (incRowA<incColA) {
        int j;
        dscal(m, beta, y, incY);
        for (j=0; j<n; ++j) {
            daxpy(m, alpha*x[j*incX], &A[j*incColA], incRowA, y, incY);
        }
    } else {
        int i;
        for (i=0; i<m; ++i) {
            y[i*incY] *= beta;
            y[i*incY] += alpha*ddot(n, &A[i*incRowA], incColA, x, incX);
        }
    }
}
void
dgemv(int m, int n, double alpha,
      const double *A, int incRowA, int incColA,
      const double *x, int incX,
      double beta,
      double *y, int incY)
{
    if (incRowA<incColA) {
        int bs = DGEMV_MxBS;
        int j;
        dscal(m, beta, y, incY);
        for (j=0; j+bs<=n; j+=bs) {
            dgemv_mxBS(m, alpha,
                       &A[j*incColA], incRowA, incColA,
                       &x[j*incX], incX,
                       y, incY);
        }
        dgemv_unblk(m, n-j, alpha,
                    &A[j*incColA], incRowA, incColA,
                    &x[j*incX], incX,
                    1.0,
                    y, incY);
    } else {
        int bs = DGEMV_BSxN;
        int i;
        for (i=0; i+bs<=m; i+=bs) {
            dgemv_BSxn(n, alpha,
                       &A[i*incRowA], incRowA, incColA,
                       x, incX,
                       beta,
                       &y[i*incY], incY);
        }
        dgemv_unblk(m-i, n, alpha,
                    &A[i*incRowA], incRowA, incColA,
                    x, incX,
                    beta,
                    &y[i*incY], incY);
    }
}
//-- TRMV implementations ------------------------------------------------------
void
dtrlmv_col(int n, int unitDiag,
           const double *A, int incRowA, int incColA,
           double *x, int incX)
{
    int k;
    if (unitDiag) {
        for (k=n-1; k>=0; --k) {
            daxpy(n-k-1, x[k*incX],
                  &A[(k+1)*incRowA+k*incColA], incRowA,
                  &x[(k+1)*incX], incX);
        }
    } else {
        for (k=n-1; k>=0; --k) {
            daxpy(n-k-1, x[k*incX],
                  &A[(k+1)*incRowA+k*incColA], incRowA,
                  &x[(k+1)*incX], incX);
            x[k*incX] *= A[k*(incRowA+incColA)];
        }
    }
}
void
dtrlmv_row(int n, int unitDiag,
           const double *A, int incRowA, int incColA,
           double *x, int incX)
{
    int k;
    if (unitDiag) {
        for (k=n-1; k>=0; --k) {
            x[k*incX] += ddot(k, &A[k*incRowA], incColA, x, incX);
        }
    } else {
        for (k=n-1; k>=0; --k) {
            x[k*incX] = ddot(k+1, &A[k*incRowA], incColA, x, incX);
        }
    }
}
void
dtrlmv(int n, int unitDiag,
       const double *A, int incRowA, int incColA,
       double *x, int incX)
{
    if (incRowA<incColA) {
        int bs = DGEMV_MxBS;
        int k  = (n/bs)*bs;
        dtrlmv_col(n-k, unitDiag,
                   &A[k*(incRowA+incColA)], incRowA, incColA,
                   &x[k*incX], incX);
        for (k-=bs; k>=0; k-=bs) {
            dgemv_mxBS(n-k-bs, 1.0,
                       &A[(k+bs)*incRowA+k*incColA], incRowA, incColA,
                       &x[k*incX], incX,
                       &x[(k+bs)*incX], incX);
            dtrlmv_col(bs, unitDiag,
                       &A[k*(incRowA+incColA)], incRowA, incColA,
                       &x[k*incX], incX);
        }
    } else {
        double buffer[DGEMV_BSxN];
        int    bs = DGEMV_BSxN;
        int    k  = (n/bs)*bs;
        dgemv(n-k, k, 1.0,
              &A[k*incRowA], incRowA, incColA,
              x, incX,
              0.0,
              buffer,1);
        dtrlmv_row(n-k, unitDiag,
                   &A[k*(incRowA+incColA)], incRowA, incColA,
                   &x[k*incX], incX);
        daxpy(n-k, 1.0, buffer, 1, &x[k*incX], incX);
        for (k-=bs; k>=0; k-=bs) {
            dgemv_BSxn(k, 1.0,
                       &A[k*incRowA], incRowA, incColA,
                       x, incX,
                       0.0,
                       buffer, 1);
            dtrlmv_col(bs, unitDiag,
                       &A[k*(incRowA+incColA)], incRowA, incColA,
                       &x[k*incX], incX);
            daxpy(bs, 1.0, buffer, 1, &x[k*incX], incX);
        }
    }
}
//-- TRSV implementations ------------------------------------------------------
void
dtrlsv_row(int n, int unitDiag,
           const double *A, int incRowA, int incColA,
           double *x, int incX)
{
    int k;
    for (k=0; k<n; ++k) {
        x[k*incX] -= ddot(k, &A[k*incRowA], incColA, x, incX);
        x[k*incX] /= (unitDiag) ? 1.0 : A[k*(incRowA+incColA)];
    }
}
void
dtrlsv_col(int n, int unitDiag,
           const double *A, int incRowA, int incColA,
           double *x, int incX)
{
    int k;
    for (k=0; k<n; ++k) {
        x[k*incX] /= (unitDiag) ? 1.0 : A[k*(incRowA+incColA)];
        daxpy(n-k-1, -x[k*incX],
              &A[(k+1)*incRowA+k*incColA], incRowA,
              &x[(k+1)*incX], incX);
    }
}
void
dtrlsv(int n, int unitDiag,
       const double *A, int incRowA, int incColA,
       double *x, int incX)
{
    if (incRowA<incColA) {
        dtrlsv_col(n, unitDiag, A, incRowA, incColA, x, incX);
    } else {
        dtrlsv_row(n, unitDiag, A, incRowA, incColA, x, incX);
    }
}
//-- GER implementations -------------------------------------------------------
void
dger_row(int m, int n, double alpha,
         const double *x, int incX,
         const double *y, int incY,
         double *A, int incRowA, int incColA)
{
    int i;
    for (i=0; i<m; ++i) {
        daxpy(n, alpha*x[i*incX], y, incY, &A[i*incRowA], incColA);
    }
}
void
dger_col(int m, int n, double alpha,
         const double *x, int incX,
         const double *y, int incY,
         double *A, int incRowA, int incColA)
{
    int j;
    for (j=0; j<n; ++j) {
        daxpy(m, alpha*y[j*incY], x, incX, &A[j*incColA], incRowA);
    }
}
void
dger(int m, int n, double alpha,
     const double *x, int incX,
     const double *y, int incY,
     double *A, int incRowA, int incColA)
{
    if (incRowA<incColA) {
        dger_col(m, n, alpha, x, incX, y, incY, A, incRowA, incColA);
    } else {
        dger_row(m, n, alpha, x, incX, y, incY, A, incRowA, incColA);
    }
}