#include <ulmblas.h>
#include <ulmaux.h>
#include <math.h>
#include <stdlib.h>
void
dcopy(size_t n,
const double *x, ptrdiff_t incX,
double *y, ptrdiff_t incY)
{
for (size_t i=0; i<n; ++i) {
y[i*incY] = x[i*incX];
}
}
void
daxpy(size_t n, double alpha,
const double *x, ptrdiff_t incX,
double *y, ptrdiff_t incY)
{
for (size_t i=0; i<n; ++i) {
y[i*incY] += alpha*x[i*incX];
}
}
double
ddot(size_t n,
const double *x, ptrdiff_t incX,
const double *y, ptrdiff_t incY)
{
double result = 0;
for (size_t i=0; i<n; ++i) {
result += x[i*incX]*y[i*incY];
}
return result;
}
void
dscal(size_t n,
double alpha,
double *x, ptrdiff_t incX)
{
if (alpha==1) {
return;
}
if (alpha!=0) {
for (size_t i=0; i<n; ++i) {
x[i*incX] *= alpha;
}
} else {
for (size_t i=0; i<n; ++i) {
x[i*incX] = 0;
}
}
}
size_t
idamax(size_t n, const double *x, ptrdiff_t incX)
{
double max = 0;
size_t I = 0;
for (size_t i=0; i<n; ++i) {
if (fabs(x[i*incX])>max) {
I = i;
max = fabs(x[i*incX]);
}
}
return I;
}
void
dswap(size_t n, double *x, ptrdiff_t incX, double *y, ptrdiff_t incY)
{
for (size_t i=0; i<n; ++i) {
double tmp = x[i*incX];
x[i*incX] = y[i*incY];
y[i*incY] = tmp;
}
}
#ifndef AXPYF
#define AXPYF 4
#endif
void
daxpyf(size_t m, double alpha,
const double *A, ptrdiff_t incRowA, ptrdiff_t incColA,
const double *x, ptrdiff_t incX,
double *y, ptrdiff_t incY)
{
for (size_t i=0; i<m; ++i) {
for (size_t l=0; l<AXPYF; ++l) {
y[i*incY] += alpha * A[i*incRowA+l*incColA] * x[l*incX];
}
}
}
void
dgemv_axpyf(size_t m, size_t n,
double alpha,
const double *A, ptrdiff_t incRowA, ptrdiff_t incColA,
const double *x, ptrdiff_t incX,
double *y, ptrdiff_t incY)
{
size_t nb = n / AXPYF;
for (size_t j=0; j<nb; ++j) {
daxpyf(m, alpha,
&A[j*AXPYF*incColA], incRowA, incColA,
&x[j*AXPYF*incX], incX,
y, incY);
}
for (size_t j=nb*AXPYF; j<n; ++j) {
daxpy(m, alpha*x[j*incX], &A[j*incColA], incRowA, y, incY);
}
}
#ifndef DOTF
#define DOTF 4
#endif
void
dgemv_dotf(size_t m, size_t n,
double alpha,
const double *A, ptrdiff_t incRowA, ptrdiff_t incColA,
const double *x, ptrdiff_t incX,
double *y, ptrdiff_t incY)
{
size_t mb = m / DOTF;
for (size_t i=0; i<mb; ++i) {
for (size_t j=0; j<n; ++j) {
for (size_t l=0; l<DOTF; ++l) {
y[(DOTF*i+l)*incY]
+= alpha*A[(DOTF*i+l)*incRowA+j*incColA]*x[j*incX];
}
}
}
for (size_t i=mb*DOTF; i<m; ++i) {
y[i*incY] += alpha*ddot(n, &A[i*incRowA], incColA, x, incX);
}
}
void
dgemv(size_t m, size_t n,
double alpha,
const double *A, ptrdiff_t incRowA, ptrdiff_t incColA,
const double *x, ptrdiff_t incX,
double beta,
double *y, ptrdiff_t incY)
{
dscal(m, beta, y, incY);
if (alpha==0 || m==0 || n==0) {
return;
}
if (incRowA<incColA) {
dgemv_axpyf(m, n, alpha, A, incRowA, incColA, x, incX, y, incY);
} else {
dgemv_dotf(m, n, alpha, A, incRowA, incColA, x, incX, y, incY);
}
}
void
dger(size_t m, size_t n,
double alpha,
const double *x, ptrdiff_t incX,
const double *y, ptrdiff_t incY,
double *A, ptrdiff_t incRowA, ptrdiff_t incColA)
{
if (alpha==0 || m==0 || n==0) {
return;
}
if (incRowA>incColA) {
dger(n, m, alpha, y, incY, x, incX, A, incColA, incRowA);
return;
}
for (size_t j=0; j<n; ++j) {
for (size_t i=0; i<m; ++i) {
A[i*incRowA+j*incColA] += alpha*x[i*incX]*y[j*incY];
}
}
}
void
dtrsv(size_t n, bool lower, bool unit,
const double *A, ptrdiff_t incRowA, ptrdiff_t incColA,
double *x, ptrdiff_t incX)
{
if (lower) {
if (incRowA<incColA) {
size_t nb = n / AXPYF;
for (size_t j=0; j<nb; ++j) {
for (size_t l=0; l<AXPYF; ++l) {
size_t J = j*AXPYF + l;
if (!unit) {
x[J*incX] /= A[J*(incRowA+incColA)];
}
daxpy(AXPYF-1-l, -x[J*incX],
&A[(J+1)*incRowA + J*incColA], incRowA,
&x[(J+1)*incX], incX);
}
daxpyf(n-(j+1)*AXPYF, -1,
&A[((j+1)*incRowA + j*incColA)*AXPYF], incRowA, incColA,
&x[j*incX*AXPYF], incX,
&x[(j+1)*incX*AXPYF], incX);
}
for (size_t j=nb*AXPYF; j<n; ++j) {
if (!unit) {
x[j*incX] /= A[j*(incRowA+incColA)];
}
daxpy(n-1-j, -x[j*incX],
&A[(j+1)*incRowA+j*incColA], incRowA,
&x[(j+1)*incX], incX);
}
} else {
for (size_t j=0; j<n; ++j) {
x[j*incX] -= ddot(j, &A[j*incRowA], incColA, x, incX);
if (!unit) {
x[j*incX] /= A[j*(incRowA+incColA)];
}
}
}
} else {
if (incRowA<incColA) {
for (size_t j=n; j-- >0; ) {
if (!unit) {
x[j*incX] /= A[j*(incRowA+incColA)];
}
daxpy(j, -x[j*incX], &A[j*incColA], incRowA,
x, incX);
}
} else {
for (size_t j=n; j-- >0; ) {
x[j*incX] -= ddot(n-1-j,
&A[j*incRowA+(j+1)*incColA], incColA,
&x[(j+1)*incX], incX);
if (!unit) {
x[j*incX] /= A[j*(incRowA+incColA)];
}
}
}
}
}
void
ge_dscal(size_t m, size_t n,
double alpha,
double *A, ptrdiff_t incRowA, ptrdiff_t incColA)
{
if (alpha!=0) {
for (size_t j=0; j<n; ++j) {
for (size_t i=0; i<m; ++i) {
A[i*incRowA + j*incColA] *= alpha;
}
}
} else {
for (size_t j=0; j<n; ++j) {
for (size_t i=0; i<m; ++i) {
A[i*incRowA + j*incColA] = 0;
}
}
}
}
void
ge_dcopy(size_t m, size_t n,
const double *A, ptrdiff_t incRowA, ptrdiff_t incColA,
double *B, ptrdiff_t incRowB, ptrdiff_t incColB)
{
for (size_t j=0; j<n; ++j) {
for (size_t i=0; i<m; ++i) {
B[i*incRowB + j*incColB] = A[i*incRowA + j*incColA];
}
}
}
void
dgemm_jli(size_t m, size_t n, size_t k,
double alpha,
const double *A, ptrdiff_t incRowA, ptrdiff_t incColA,
const double *B, ptrdiff_t incRowB, ptrdiff_t incColB,
double *C, ptrdiff_t incRowC, ptrdiff_t incColC)
{
for (size_t j=0; j<n; ++j) {
for (size_t l=0; l<k; ++l) {
for (size_t i=0; i<m; ++i) {
C[i*incRowC + j*incColC] += alpha*A[i*incRowA + l*incColA]
*B[l*incRowB + j*incColB];
}
}
}
}
#ifndef DGEMM_MC
#define DGEMM_MC 8
#endif
#ifndef DGEMM_KC
#define DGEMM_KC 256
#endif
#ifndef DGEMM_NC
#define DGEMM_NC 1024
#endif
void
dgemm(size_t m, size_t n, size_t k,
double alpha,
const double *A, ptrdiff_t incRowA, ptrdiff_t incColA,
const double *B, ptrdiff_t incRowB, ptrdiff_t incColB,
double beta,
double *C, ptrdiff_t incRowC, ptrdiff_t incColC)
{
ge_dscal(m, n, beta, C, incRowC, incColC);
if (k==0 || alpha==0) {
return;
}
double *A_ = malloc(DGEMM_MC*DGEMM_KC*sizeof(double));
double *B_ = malloc(DGEMM_KC*DGEMM_NC*sizeof(double));
size_t mb = (m + DGEMM_MC - 1)/DGEMM_MC;
size_t nb = (n + DGEMM_NC - 1)/DGEMM_NC;
size_t kb = (k + DGEMM_KC - 1)/DGEMM_KC;
size_t mr = m % DGEMM_MC;
size_t nr = n % DGEMM_NC;
size_t kr = k % DGEMM_KC;
for (size_t jb=0; jb<nb; ++jb) {
size_t nc = (jb!=nb-1 || nr==0) ? DGEMM_NC : nr;
for (size_t lb=0; lb<kb; ++lb) {
size_t kc = (lb!=kb-1 || kr==0) ? DGEMM_KC : kr;
ge_dcopy(kc, nc, &B[lb*DGEMM_KC*incRowB + jb*DGEMM_NC*incColB],
incRowB, incColB,
B_, 1, DGEMM_KC);
for (size_t ib=0; ib<mb; ++ib) {
size_t mc = (ib!=mb-1 || mr==0) ? DGEMM_MC : mr;
ge_dcopy(mc, kc, &A[ib*DGEMM_MC*incRowA + lb*DGEMM_KC*incColA],
incRowA, incColA,
A_, 1, DGEMM_MC);
dgemm_jli(mc, nc, kc,
alpha,
A_, 1, DGEMM_MC,
B_, 1, DGEMM_KC,
&C[ib*DGEMM_MC*incRowC + jb*DGEMM_NC*incColC],
incRowC, incColC);
}
}
}
free(A_);
free(B_);
}
void
dtrsm(size_t m, size_t n, bool lower, bool unit, double alpha,
const double *A, ptrdiff_t incRowA, ptrdiff_t incColA,
double *X, ptrdiff_t incRowX, ptrdiff_t incColX)
{
if (alpha!=0) {
for (size_t j=0; j<n; ++j) {
dtrsv(m, lower, unit, A, incRowA, incColA, &X[j*incColX], incRowX);
}
}
ge_dscal(m, n, alpha, X, incRowX, incColX);
}
ptrdiff_t
dgetrf_ger(size_t m, size_t n,
double *A, ptrdiff_t incRowA, ptrdiff_t incColA,
size_t *p, ptrdiff_t incP)
{
size_t k = m<n ? m : n;
for (size_t j=0; j<k; ++j) {
size_t pj = j + idamax(m-j, &A[j*incRowA + j*incColA], incRowA);
if (j!=pj) {
dswap(n, &A[j*incRowA], incColA, &A[pj*incRowA], incColA);
}
p[j*incP] = pj;
double alpha = A[j*incRowA + j*incColA];
if (alpha==0) {
return j;
} else {
alpha = 1/alpha;
}
dscal(m-1-j, alpha, &A[(j+1)*incRowA + j*incColA], incRowA);
dger(m-1-j, n-1-j, -1,
&A[(j+1)*incRowA + j *incColA], incRowA,
&A[ j *incRowA + (j+1)*incColA], incColA,
&A[(j+1)*incRowA + (j+1)*incColA], incRowA, incColA);
}
return -1;
}
ptrdiff_t
dgetrf_gemv(size_t m, size_t n,
double *A, ptrdiff_t incRowA, ptrdiff_t incColA,
size_t *p, ptrdiff_t incP)
{
size_t k = m<n ? m : n;
for (size_t j=0; j<k; ++j) {
dtrsv(j, true, true,
A, incRowA, incColA,
&A[j*incColA], incRowA);
dgemv(m-j, j, -1,
&A[j*incRowA], incRowA, incColA,
&A[j*incColA], incRowA,
1,
&A[j*incRowA+j*incColA], incRowA);
size_t pj = j + idamax(m-j, &A[j*incRowA + j*incColA], incRowA);
if (j!=pj) {
dswap(n, &A[j*incRowA], incColA, &A[pj*incRowA], incColA);
}
p[j*incP] = pj;
double alpha = A[j*incRowA + j*incColA];
if (alpha==0) {
return j;
}
dscal(m-j-1, 1/alpha, &A[(j+1)*incRowA+j*incColA], incRowA);
}
for (size_t j=k; j<n; ++j) {
dtrsv(m, true, true,
A, incRowA, incColA,
&A[j*incColA], incRowA);
}
return -1;
}
ptrdiff_t
dgetrf(size_t m, size_t n,
double *A, ptrdiff_t incRowA, ptrdiff_t incColA,
size_t *p, ptrdiff_t incP)
{
return dgetrf_gemv(m, n, A, incRowA, incColA, p, incP);
}
void
dlaswp(size_t n, size_t k0, size_t k1,
const size_t *p, ptrdiff_t incP,
double *A, ptrdiff_t incRowA, ptrdiff_t incColA)
{
ptrdiff_t kInc = k0<k1 ? 1 : -1;
k1 += kInc;
for (size_t k=k0; k!=k1; k+=kInc) {
if (k!=p[k*incP]) {
dswap(n, &A[k*incRowA], incColA, &A[p[k*incP]], incColA);
}
}
}