#include <ulmblas.h>
#include <ulmaux.h>
#include <math.h>
#include <stdlib.h>
void
dcopy(size_t n,
const double *x, ptrdiff_t incX,
double *y, ptrdiff_t incY)
{
for (size_t i=0; i<n; ++i) {
y[i*incY] = x[i*incX];
}
}
void
daxpy(size_t n, double alpha,
const double *x, ptrdiff_t incX,
double *y, ptrdiff_t incY)
{
for (size_t i=0; i<n; ++i) {
y[i*incY] += alpha*x[i*incX];
}
}
double
ddot(size_t n,
const double *x, ptrdiff_t incX,
const double *y, ptrdiff_t incY)
{
double result = 0;
for (size_t i=0; i<n; ++i) {
result += x[i*incX]*y[i*incY];
}
return result;
}
void
dscal(size_t n,
double alpha,
double *x, ptrdiff_t incX)
{
if (alpha==1) {
return;
}
if (alpha!=0) {
for (size_t i=0; i<n; ++i) {
x[i*incX] *= alpha;
}
} else {
for (size_t i=0; i<n; ++i) {
x[i*incX] = 0;
}
}
}
size_t
idamax(size_t n, const double *x, ptrdiff_t incX)
{
double max = 0;
size_t I = 0;
for (size_t i=0; i<n; ++i) {
if (fabs(x[i*incX])>max) {
I = i;
max = fabs(x[i*incX]);
}
}
return I;
}
void
dswap(size_t n, double *x, ptrdiff_t incX, double *y, ptrdiff_t incY)
{
for (size_t i=0; i<n; ++i) {
double tmp = x[i*incX];
x[i*incX] = y[i*incY];
y[i*incY] = tmp;
}
}
#ifndef AXPYF
#define AXPYF 4
#endif
void
daxpyf(size_t m, double alpha,
const double *A, ptrdiff_t incRowA, ptrdiff_t incColA,
const double *x, ptrdiff_t incX,
double *y, ptrdiff_t incY)
{
for (size_t i=0; i<m; ++i) {
for (size_t l=0; l<AXPYF; ++l) {
y[i*incY] += alpha * A[i*incRowA+l*incColA] * x[l*incX];
}
}
}
void
dgemv_axpyf(size_t m, size_t n,
double alpha,
const double *A, ptrdiff_t incRowA, ptrdiff_t incColA,
const double *x, ptrdiff_t incX,
double *y, ptrdiff_t incY)
{
size_t nb = n / AXPYF;
for (size_t j=0; j<nb; ++j) {
daxpyf(m, alpha,
&A[j*AXPYF*incColA], incRowA, incColA,
&x[j*AXPYF*incX], incX,
y, incY);
}
for (size_t j=nb*AXPYF; j<n; ++j) {
daxpy(m, alpha*x[j*incX], &A[j*incColA], incRowA, y, incY);
}
}
#ifndef DOTF
#define DOTF 4
#endif
void
dgemv_dotf(size_t m, size_t n,
double alpha,
const double *A, ptrdiff_t incRowA, ptrdiff_t incColA,
const double *x, ptrdiff_t incX,
double *y, ptrdiff_t incY)
{
size_t mb = m / DOTF;
for (size_t i=0; i<mb; ++i) {
for (size_t j=0; j<n; ++j) {
for (size_t l=0; l<DOTF; ++l) {
y[(DOTF*i+l)*incY]
+= alpha*A[(DOTF*i+l)*incRowA+j*incColA]*x[j*incX];
}
}
}
for (size_t i=mb*DOTF; i<m; ++i) {
y[i*incY] += alpha*ddot(n, &A[i*incRowA], incColA, x, incX);
}
}
void
dgemv(size_t m, size_t n,
double alpha,
const double *A, ptrdiff_t incRowA, ptrdiff_t incColA,
const double *x, ptrdiff_t incX,
double beta,
double *y, ptrdiff_t incY)
{
dscal(m, beta, y, incY);
if (alpha==0 || m==0 || n==0) {
return;
}
if (incRowA<incColA) {
dgemv_axpyf(m, n, alpha, A, incRowA, incColA, x, incX, y, incY);
} else {
dgemv_dotf(m, n, alpha, A, incRowA, incColA, x, incX, y, incY);
}
}
void
dger(size_t m, size_t n,
double alpha,
const double *x, ptrdiff_t incX,
const double *y, ptrdiff_t incY,
double *A, ptrdiff_t incRowA, ptrdiff_t incColA)
{
if (alpha==0 || m==0 || n==0) {
return;
}
if (incRowA>incColA) {
dger(n, m, alpha, y, incY, x, incX, A, incColA, incRowA);
return;
}
for (size_t j=0; j<n; ++j) {
for (size_t i=0; i<m; ++i) {
A[i*incRowA+j*incColA] += alpha*x[i*incX]*y[j*incY];
}
}
}
void
dtrsv(size_t n, bool lower, bool unit,
const double *A, ptrdiff_t incRowA, ptrdiff_t incColA,
double *x, ptrdiff_t incX)
{
if (lower) {
if (incRowA<incColA) {
size_t nb = n / AXPYF;
for (size_t j=0; j<nb; ++j) {
for (size_t l=0; l<AXPYF; ++l) {
size_t J = j*AXPYF + l;
if (!unit) {
x[J*incX] /= A[J*(incRowA+incColA)];
}
daxpy(AXPYF-1-l, -x[J*incX],
&A[(J+1)*incRowA + J*incColA], incRowA,
&x[(J+1)*incX], incX);
}
daxpyf(n-(j+1)*AXPYF, -1,
&A[((j+1)*incRowA + j*incColA)*AXPYF], incRowA, incColA,
&x[j*incX*AXPYF], incX,
&x[(j+1)*incX*AXPYF], incX);
}
for (size_t j=nb*AXPYF; j<n; ++j) {
if (!unit) {
x[j*incX] /= A[j*(incRowA+incColA)];
}
daxpy(n-1-j, -x[j*incX],
&A[(j+1)*incRowA+j*incColA], incRowA,
&x[(j+1)*incX], incX);
}
} else {
for (size_t j=0; j<n; ++j) {
x[j*incX] -= ddot(j, &A[j*incRowA], incColA, x, incX);
if (!unit) {
x[j*incX] /= A[j*(incRowA+incColA)];
}
}
}
} else {
if (incRowA<incColA) {
for (size_t j=n; j-- >0; ) {
if (!unit) {
x[j*incX] /= A[j*(incRowA+incColA)];
}
daxpy(j, -x[j*incX], &A[j*incColA], incRowA,
x, incX);
}
} else {
for (size_t j=n; j-- >0; ) {
x[j*incX] -= ddot(n-1-j,
&A[j*incRowA+(j+1)*incColA], incColA,
&x[(j+1)*incX], incX);
if (!unit) {
x[j*incX] /= A[j*(incRowA+incColA)];
}
}
}
}
}
#ifndef DGEMM_MC
#define DGEMM_MC 4
#endif
#ifndef DGEMM_NC
#define DGEMM_NC 6
#endif
#ifndef DGEMM_KC
#define DGEMM_KC 5
#endif
#ifndef DGEMM_MR
#define DGEMM_MR 2
#endif
#ifndef DGEMM_NR
#define DGEMM_NR 3
#endif
#if (DGEMM_MC % DGEMM_MR != 0)
#error "DGEMM_MC must be a multiple of DEGMM_MR."
#endif
#if (DGEMM_NC % DGEMM_NR != 0)
#error "DGEMM_NC must be a multiple of DEGMM_NR."
#endif
void
dpack_A(size_t mc, size_t kc,
const double *A, ptrdiff_t incRowA, ptrdiff_t incColA,
double *A_)
{
size_t mb = mc / DGEMM_MR;
for (size_t ib=0; ib<mb; ++ib) {
for (size_t j=0; j<kc; ++j) {
for (size_t i=0; i<DGEMM_MR; ++i) {
A_[ib*DGEMM_MR*kc + j*DGEMM_MR + i]
= A[(ib*DGEMM_MR+i)*incRowA + j*incColA];
}
}
}
if (mb*DGEMM_MR<mc) {
size_t mr = mc % DGEMM_MR;
for (size_t j=0; j<kc; ++j) {
for (size_t i=0; i<mr; ++i) {
A_[mb*DGEMM_MR*kc + j*DGEMM_MR + i]
= A[(mb*DGEMM_MR+i)*incRowA + j*incColA];
}
for (size_t i=mr; i<DGEMM_MR; ++i) {
A_[mb*DGEMM_MR*kc + j*DGEMM_MR + i] = 0;
}
}
}
}
void
dpack_B(size_t kc, size_t nc,
const double *B, ptrdiff_t incRowB, ptrdiff_t incColB,
double *B_)
{
size_t nb = nc / DGEMM_NR;
for (size_t jb=0; jb<nb; ++jb) {
for (size_t i=0; i<kc; ++i) {
for (size_t j=0; j<DGEMM_NR; ++j) {
B_[jb*DGEMM_NR*kc + i*DGEMM_NR + j]
= B[i*incRowB + (jb*DGEMM_NR+j)*incColB];
}
}
}
if (nb*DGEMM_NR<nc) {
size_t nr = nc % DGEMM_NR;
for (size_t i=0; i<kc; ++i) {
for (size_t j=0; j<nr; ++j) {
B_[nb*DGEMM_NR*kc + i*DGEMM_NR + j]
= B[i*incRowB + (nb*DGEMM_NR+j)*incColB];
}
for (size_t j=nr; j<DGEMM_NR; ++j) {
B_[nb*DGEMM_NR*kc + i*DGEMM_NR + j] = 0;
}
}
}
}
void
dgemm_micro_ref(size_t k, double alpha,
const double *A, const double *B,
double beta,
double *C, ptrdiff_t incRowC, ptrdiff_t incColC)
{
double R[DGEMM_MR*DGEMM_NR];
for (size_t i=0; i<DGEMM_MR; ++i) {
for (size_t j=0; j<DGEMM_NR; ++j) {
R[i*DGEMM_NR+j] = 0;
}
}
for (size_t l=0; l<k; ++l) {
for (size_t i=0; i<DGEMM_MR; ++i) {
for (size_t j=0; j<DGEMM_NR; ++j) {
R[i*DGEMM_NR+j] += A[i+l*DGEMM_MR]*B[l*DGEMM_NR+j];
}
}
}
for (size_t i=0; i<DGEMM_MR; ++i) {
for (size_t j=0; j<DGEMM_NR; ++j) {
R[i*DGEMM_NR+j] *= alpha;
}
}
if (beta==0) {
for (size_t i=0; i<DGEMM_MR; ++i) {
for (size_t j=0; j<DGEMM_NR; ++j) {
C[i*incRowC+j*incColC] = R[i*DGEMM_NR+j];
}
}
} else {
for (size_t i=0; i<DGEMM_MR; ++i) {
for (size_t j=0; j<DGEMM_NR; ++j) {
C[i*incRowC+j*incColC] *= beta;
C[i*incRowC+j*incColC] += R[i*DGEMM_NR+j];
}
}
}
}
void
ge_dscal(size_t m, size_t n,
double alpha,
double *A, ptrdiff_t incRowA, ptrdiff_t incColA)
{
if (alpha!=0) {
for (size_t j=0; j<n; ++j) {
for (size_t i=0; i<m; ++i) {
A[i*incRowA + j*incColA] *= alpha;
}
}
} else {
for (size_t j=0; j<n; ++j) {
for (size_t i=0; i<m; ++i) {
A[i*incRowA + j*incColA] = 0;
}
}
}
}
void
ge_daxpy(size_t m, size_t n, double alpha,
const double *A, ptrdiff_t incRowA, ptrdiff_t incColA,
double *B, ptrdiff_t incRowB, ptrdiff_t incColB)
{
if (m==0 || n==0 || alpha==0) {
return;
}
for (size_t j=0; j<n; ++j) {
for (size_t i=0; i<m; ++i) {
B[i*incRowB + j*incColB] += alpha*A[i*incRowA + j*incColA];
}
}
}
void
dgemm_macro(size_t mc, size_t nc, size_t kc,
double alpha,
const double *A, const double *B,
double beta,
double *C, ptrdiff_t incRowC, ptrdiff_t incColC)
{
double AB[DGEMM_MR*DGEMM_NR];
size_t mb = (mc+DGEMM_MR-1) / DGEMM_MR;
size_t nb = (nc+DGEMM_NR-1) / DGEMM_NR;
size_t mr_ = mc % DGEMM_MR;
size_t nr_ = nc % DGEMM_NR;
for (size_t ib=0; ib<mb; ++ib) {
size_t mr = (ib<mb-1 || mr_==0) ? DGEMM_MR
: mr_;
for (size_t jb=0; jb<nb; ++jb) {
size_t nr = (jb<nb-1 || nr_==0) ? DGEMM_NR
: nr_;
if (mr==DGEMM_MR && nr==DGEMM_NR) {
dgemm_micro_ref(kc, alpha,
&A[ib*DGEMM_MR*kc], &B[jb*kc*DGEMM_NR],
beta,
&C[ib*DGEMM_MR*incRowC+jb*DGEMM_NR*incColC],
incRowC, incColC);
} else {
dgemm_micro_ref(kc, alpha,
&A[ib*DGEMM_MR*kc], &B[jb*kc*DGEMM_NR],
0,
AB, 1, DGEMM_MR);
ge_dscal(mr, nr,
beta,
&C[ib*DGEMM_MR*incRowC + jb*DGEMM_NR*incColC],
incRowC, incColC);
ge_daxpy(mr, nr,
1,
AB, 1, DGEMM_MR,
&C[ib*DGEMM_MR*incRowC + jb*DGEMM_NR*incColC],
incRowC, incColC);
}
}
}
}
void
dgemm(size_t m, size_t n, size_t k,
double alpha,
const double *A, ptrdiff_t incRowA, ptrdiff_t incColA,
const double *B, ptrdiff_t incRowB, ptrdiff_t incColB,
double beta,
double *C, ptrdiff_t incRowC, ptrdiff_t incColC)
{
if (k==0 || alpha==0) {
ge_dscal(m, n, beta, C, incRowC, incColC);
return;
}
double *A_ = malloc(DGEMM_MC*DGEMM_KC*sizeof(double));
double *B_ = malloc(DGEMM_KC*DGEMM_NC*sizeof(double));
size_t mb = (m + DGEMM_MC - 1) / DGEMM_MC;
size_t nb = (n + DGEMM_NC - 1) / DGEMM_NC;
size_t kb = (k + DGEMM_KC - 1) / DGEMM_KC;
size_t mc_ = m % DGEMM_MC;
size_t nc_ = n % DGEMM_NC;
size_t kc_ = k % DGEMM_KC;
for (size_t jb=0; jb<nb; ++jb) {
size_t nc = (jb<nb-1 || nc_==0) ? DGEMM_NC
: nc_;
for (size_t lb=0; lb<kb; ++lb) {
size_t kc = (lb<kb-1 || kc_==0) ? DGEMM_KC
: kc_;
double beta_ = (lb==0) ? beta
: 1;
dpack_B(kc, nc,
&B[lb*DGEMM_KC*incRowB+jb*DGEMM_NC*incColB],
incRowB, incColB,
B_);
for (size_t ib=0; ib<mb; ++ib) {
size_t mc = (ib<mb-1 || mc_==0) ? DGEMM_MC
: mc_;
dpack_A(mc, kc,
&A[ib*DGEMM_MC*incRowA+lb*DGEMM_KC*incColA],
incRowA, incColA,
A_);
dgemm_macro(mc, nc, kc,
alpha,
A_, B_,
beta_,
&C[ib*DGEMM_MC*incRowC+jb*DGEMM_NC*incColC],
incRowC, incColC);
}
}
}
free(A_);
free(B_);
}
ptrdiff_t
dgetrf_ger(size_t m, size_t n,
double *A, ptrdiff_t incRowA, ptrdiff_t incColA,
size_t *p, ptrdiff_t incP)
{
size_t k = m<n ? m : n;
for (size_t j=0; j<k; ++j) {
size_t pj = j + idamax(m-j, &A[j*incRowA + j*incColA], incRowA);
if (j!=pj) {
dswap(n, &A[j*incRowA], incColA, &A[pj*incRowA], incColA);
}
p[j*incP] = pj;
double alpha = A[j*incRowA + j*incColA];
if (alpha==0) {
return j;
} else {
alpha = 1/alpha;
}
dscal(m-1-j, alpha, &A[(j+1)*incRowA + j*incColA], incRowA);
dger(m-1-j, n-1-j, -1,
&A[(j+1)*incRowA + j *incColA], incRowA,
&A[ j *incRowA + (j+1)*incColA], incColA,
&A[(j+1)*incRowA + (j+1)*incColA], incRowA, incColA);
}
return -1;
}
ptrdiff_t
dgetrf_gemv(size_t m, size_t n,
double *A, ptrdiff_t incRowA, ptrdiff_t incColA,
size_t *p, ptrdiff_t incP)
{
size_t k = m<n ? m : n;
for (size_t j=0; j<k; ++j) {
dtrsv(j, true, true,
A, incRowA, incColA,
&A[j*incColA], incRowA);
dgemv(m-j, j, -1,
&A[j*incRowA], incRowA, incColA,
&A[j*incColA], incRowA,
1,
&A[j*incRowA+j*incColA], incRowA);
size_t pj = j + idamax(m-j, &A[j*incRowA + j*incColA], incRowA);
if (j!=pj) {
dswap(n, &A[j*incRowA], incColA, &A[pj*incRowA], incColA);
}
p[j*incP] = pj;
double alpha = A[j*incRowA + j*incColA];
if (alpha==0) {
return j;
}
dscal(m-j-1, 1/alpha, &A[(j+1)*incRowA+j*incColA], incRowA);
}
for (size_t j=k; j<n; ++j) {
dtrsv(m, true, true,
A, incRowA, incColA,
&A[j*incColA], incRowA);
}
return -1;
}
ptrdiff_t
dgetrf(size_t m, size_t n,
double *A, ptrdiff_t incRowA, ptrdiff_t incColA,
size_t *p, ptrdiff_t incP)
{
return dgetrf_gemv(m, n, A, incRowA, incColA, p, incP);
}
void
dlaswp(size_t n, size_t k0, size_t k1,
const size_t *p, ptrdiff_t incP,
double *A, ptrdiff_t incRowA, ptrdiff_t incColA)
{
ptrdiff_t kInc = k0<k1 ? 1 : -1;
k1 += kInc;
for (size_t k=k0; k!=k1; k+=kInc) {
if (k!=p[k*incP]) {
dswap(n, &A[k*incRowA], incColA, &A[p[k*incP]], incColA);
}
}
}