#include <ulmblas.h>
#include <ulmaux.h>
#include <math.h>
#include <stdlib.h>
/*
 * Copy a strided vector: y[i*incY] = x[i*incX] for i = 0, ..., n-1.
 *
 * n     number of elements to copy (nothing happens for n == 0)
 * x     source vector, read with element stride incX
 * y     destination vector, written with element stride incY
 */
void
dcopy(size_t n,
      const double *x, ptrdiff_t incX,
      double *y, ptrdiff_t incY)
{
    size_t idx = 0;

    while (idx < n) {
        y[idx*incY] = x[idx*incX];
        ++idx;
    }
}
/*
 * Scaled vector addition: y[i*incY] += alpha * x[i*incX] for i = 0, ..., n-1.
 *
 * n      number of elements to process
 * alpha  scalar factor applied to every element of x
 * x      input vector, read with element stride incX
 * y      vector updated in place, accessed with element stride incY
 */
void
daxpy(size_t n, double alpha,
      const double *x, ptrdiff_t incX,
      double *y, ptrdiff_t incY)
{
    for (size_t k = 0; k != n; ++k) {
        double *dst = &y[k*incY];

        *dst = *dst + alpha * x[k*incX];
    }
}
/*
 * Dot product of two strided vectors.
 *
 * Returns the sum of x[i*incX] * y[i*incY] for i = 0, ..., n-1, accumulated
 * in ascending index order starting from 0. The result is 0 for n == 0.
 */
double
ddot(size_t n,
     const double *x, ptrdiff_t incX,
     const double *y, ptrdiff_t incY)
{
    double sum = 0;
    size_t k = 0;

    while (k < n) {
        sum += x[k*incX] * y[k*incY];
        ++k;
    }
    return sum;
}
/*
 * Scale a strided vector in place: x[i*incX] *= alpha for i = 0, ..., n-1.
 *
 * Special cases:
 *   alpha == 1  the vector is left untouched
 *   alpha == 0  every element is overwritten with 0 instead of being
 *               multiplied, so NaN/Inf entries are cleared as well
 */
void
dscal(size_t n,
      double alpha,
      double *x, ptrdiff_t incX)
{
    if (alpha == 1) {
        return;
    }

    /* Written as !(alpha != 0) so that a NaN alpha takes the multiply path. */
    const int clear = !(alpha != 0);

    for (size_t k = 0; k < n; ++k) {
        double *elem = &x[k*incX];

        *elem = clear ? 0 : *elem * alpha;
    }
}
/*
 * Index of the element with the largest absolute value.
 *
 * Scans x[i*incX] for i = 0, ..., n-1 and returns the smallest index i whose
 * |x[i*incX]| is strictly greater than all previous ones. Returns 0 when
 * n == 0 or when no element has an absolute value greater than 0.
 */
size_t
idamax(size_t n, const double *x, ptrdiff_t incX)
{
    size_t best = 0;
    double bestAbs = 0;

    for (size_t k = 0; k < n; ++k) {
        const double mag = fabs(x[k*incX]);

        if (mag > bestAbs) {
            bestAbs = mag;
            best = k;
        }
    }
    return best;
}
/*
 * Exchange the contents of two strided vectors element by element:
 * x[i*incX] <-> y[i*incY] for i = 0, ..., n-1.
 */
void
dswap(size_t n, double *x, ptrdiff_t incX, double *y, ptrdiff_t incY)
{
    for (size_t k = 0; k < n; ++k) {
        double *px = &x[k*incX];
        double *py = &y[k*incY];
        const double keep = *py;

        *py = *px;
        *px = keep;
    }
}
/* Number of matrix columns fused into one daxpyf call (overridable at build). */
#ifndef AXPYF
#define AXPYF 4
#endif

/*
 * Fused axpy: y += alpha * A * x for an m x AXPYF panel A.
 *
 * For every row i = 0, ..., m-1 the products
 * alpha * A[i*incRowA + l*incColA] * x[l*incX], l = 0, ..., AXPYF-1,
 * are added one after another to y[i*incY].
 *
 * m        number of rows of the panel and length of y
 * alpha    scalar factor
 * A        panel with row stride incRowA and column stride incColA
 * x        vector of AXPYF elements with stride incX
 * y        vector updated in place, stride incY
 */
void
daxpyf(size_t m, double alpha,
       const double *A, ptrdiff_t incRowA, ptrdiff_t incColA,
       const double *x, ptrdiff_t incX,
       double *y, ptrdiff_t incY)
{
    for (size_t row = 0; row < m; ++row) {
        double *dst = &y[row*incY];
        size_t col = 0;

        while (col < AXPYF) {
            *dst += alpha * A[row*incRowA + col*incColA] * x[col*incX];
            ++col;
        }
    }
}
/*
 * Matrix-vector update y += alpha * A * x, built from column operations.
 *
 * The n columns of the m x n matrix A are consumed in groups of AXPYF via
 * daxpyf; the remaining n % AXPYF columns are handled one at a time with
 * daxpy, each scaled by alpha * x[j*incX].
 */
void
dgemv_axpyf(size_t m, size_t n,
            double alpha,
            const double *A, ptrdiff_t incRowA, ptrdiff_t incColA,
            const double *x, ptrdiff_t incX,
            double *y, ptrdiff_t incY)
{
    const size_t numPanels = n / AXPYF;
    size_t col = 0;

    /* Full panels of AXPYF columns. */
    for (size_t panel = 0; panel < numPanels; ++panel, col += AXPYF) {
        daxpyf(m, alpha,
               &A[col*incColA], incRowA, incColA,
               &x[col*incX], incX,
               y, incY);
    }

    /* Leftover columns that do not fill a whole panel. */
    for (; col < n; ++col) {
        daxpy(m, alpha*x[col*incX], &A[col*incColA], incRowA, y, incY);
    }
}
/* Number of matrix rows fused in the dot-product based kernel (overridable). */
#ifndef DOTF
#define DOTF 4
#endif

/*
 * Matrix-vector update y += alpha * A * x, built from row operations.
 *
 * Rows are processed in groups of DOTF: for each group every column j
 * contributes alpha * A[row, j] * x[j] to the DOTF entries of y belonging to
 * that group. The remaining m % DOTF rows are each updated with a single
 * ddot over the row.
 */
void
dgemv_dotf(size_t m, size_t n,
           double alpha,
           const double *A, ptrdiff_t incRowA, ptrdiff_t incColA,
           const double *x, ptrdiff_t incX,
           double *y, ptrdiff_t incY)
{
    /* Number of rows covered by complete groups of DOTF rows. */
    const size_t fusedRows = (m / DOTF) * DOTF;

    for (size_t base = 0; base < fusedRows; base += DOTF) {
        for (size_t col = 0; col < n; ++col) {
            for (size_t l = 0; l < DOTF; ++l) {
                const size_t row = base + l;

                y[row*incY]
                    += alpha*A[row*incRowA+col*incColA]*x[col*incX];
            }
        }
    }

    /* Rows that do not fill a complete group. */
    for (size_t row = fusedRows; row < m; ++row) {
        y[row*incY] += alpha*ddot(n, &A[row*incRowA], incColA, x, incX);
    }
}
/*
 * General matrix-vector product: y = beta * y + alpha * A * x.
 *
 * y is first scaled by beta through dscal. If alpha is zero or the m x n
 * matrix A is empty, nothing more is done. Otherwise the product is
 * accumulated with the column-based kernel when incRowA < incColA and with
 * the row-based kernel in all other cases.
 */
void
dgemv(size_t m, size_t n,
      double alpha,
      const double *A, ptrdiff_t incRowA, ptrdiff_t incColA,
      const double *x, ptrdiff_t incX,
      double beta,
      double *y, ptrdiff_t incY)
{
    dscal(m, beta, y, incY);

    /* Negated form of the early exit; a NaN alpha still reaches the kernels. */
    if (!(alpha == 0) && m != 0 && n != 0) {
        if (incRowA < incColA) {
            dgemv_axpyf(m, n, alpha, A, incRowA, incColA, x, incX, y, incY);
        } else {
            dgemv_dotf(m, n, alpha, A, incRowA, incColA, x, incX, y, incY);
        }
    }
}
/*
 * Rank-1 update: A[i, j] += alpha * x[i] * y[j] for an m x n matrix A.
 *
 * x has m elements (stride incX), y has n elements (stride incY), and A is
 * addressed as A[i*incRowA + j*incColA]. Nothing is done when alpha is zero
 * or A is empty.
 *
 * The innermost loop always runs along the smaller of the two strides of A:
 * when incRowA > incColA the roles of (x, rows) and (y, columns) are
 * exchanged, which is the same update applied to the transposed view.
 */
void
dger(size_t m, size_t n,
     double alpha,
     const double *x, ptrdiff_t incX,
     const double *y, ptrdiff_t incY,
     double *A, ptrdiff_t incRowA, ptrdiff_t incColA)
{
    if (alpha == 0 || m == 0 || n == 0) {
        return;
    }

    /* "inner" is walked by the innermost loop, "outer" by the outer loop. */
    const double *inner = x;
    const double *outer = y;
    size_t numInner = m;
    size_t numOuter = n;
    ptrdiff_t incInner = incX;
    ptrdiff_t incOuter = incY;
    ptrdiff_t strideInner = incRowA;
    ptrdiff_t strideOuter = incColA;

    if (incRowA > incColA) {
        inner = y;
        outer = x;
        numInner = n;
        numOuter = m;
        incInner = incY;
        incOuter = incX;
        strideInner = incColA;
        strideOuter = incRowA;
    }

    for (size_t o = 0; o < numOuter; ++o) {
        for (size_t k = 0; k < numInner; ++k) {
            A[k*strideInner + o*strideOuter]
                += alpha*inner[k*incInner]*outer[o*incOuter];
        }
    }
}
/*
 * Triangular solve with a single right-hand side, performed in place.
 *
 * On entry x holds the right-hand side b of A*x = b; on return it holds the
 * solution. A is an n x n matrix addressed as A[i*incRowA + j*incColA], of
 * which only one triangle is read.
 *
 * n      order of A and number of elements in x
 * lower  true: A is lower triangular (forward substitution);
 *        false: A is upper triangular (backward substitution)
 * unit   true: the diagonal of A is not read and no division is performed
 *        (i.e. the diagonal is treated as all ones)
 * x      right-hand side / solution, element stride incX
 *
 * For each triangle there are two variants selected by the strides of A:
 * when incRowA < incColA the solve walks down columns of A with daxpy/daxpyf,
 * otherwise it walks along rows of A with ddot.
 */
void
dtrsv(size_t n, bool lower, bool unit,
const double *A, ptrdiff_t incRowA, ptrdiff_t incColA,
double *x, ptrdiff_t incX)
{
if (lower) {
if (incRowA<incColA) {
// Column-oriented forward substitution in panels of AXPYF columns.
size_t nb = n / AXPYF;
for (size_t j=0; j<nb; ++j) {
// Solve the AXPYF x AXPYF diagonal block of panel j: each solved
// entry x[J] is eliminated only from the entries left in this panel.
for (size_t l=0; l<AXPYF; ++l) {
size_t J = j*AXPYF + l;
if (!unit) {
x[J*incX] /= A[J*(incRowA+incColA)];
}
daxpy(AXPYF-1-l, -x[J*incX],
&A[(J+1)*incRowA + J*incColA], incRowA,
&x[(J+1)*incX], incX);
}
// Eliminate all AXPYF solved entries of this panel from every entry of
// x below the panel in one fused call (block of A below the diagonal
// block, rows (j+1)*AXPYF .. n-1, columns j*AXPYF .. (j+1)*AXPYF-1).
daxpyf(n-(j+1)*AXPYF, -1,
&A[((j+1)*incRowA + j*incColA)*AXPYF], incRowA, incColA,
&x[j*incX*AXPYF], incX,
&x[(j+1)*incX*AXPYF], incX);
}
// Remaining n % AXPYF columns: plain column-wise forward substitution.
for (size_t j=nb*AXPYF; j<n; ++j) {
if (!unit) {
x[j*incX] /= A[j*(incRowA+incColA)];
}
daxpy(n-1-j, -x[j*incX],
&A[(j+1)*incRowA+j*incColA], incRowA,
&x[(j+1)*incX], incX);
}
} else {
// Row-oriented forward substitution: subtract the dot product of row j
// (columns 0 .. j-1) with the already solved entries, then divide.
for (size_t j=0; j<n; ++j) {
x[j*incX] -= ddot(j, &A[j*incRowA], incColA, x, incX);
if (!unit) {
x[j*incX] /= A[j*(incRowA+incColA)];
}
}
}
} else {
if (incRowA<incColA) {
// Column-oriented backward substitution. The loop header counts j from
// n-1 down to 0 without letting the unsigned index wrap inside the body.
for (size_t j=n; j-- >0; ) {
if (!unit) {
x[j*incX] /= A[j*(incRowA+incColA)];
}
// Eliminate x[j] from entries 0 .. j-1 using column j above the diagonal.
daxpy(j, -x[j*incX], &A[j*incColA], incRowA,
x, incX);
}
} else {
// Row-oriented backward substitution: subtract the dot product of row j
// (columns j+1 .. n-1) with the already solved entries, then divide.
for (size_t j=n; j-- >0; ) {
x[j*incX] -= ddot(n-1-j,
&A[j*incRowA+(j+1)*incColA], incColA,
&x[(j+1)*incX], incX);
if (!unit) {
x[j*incX] /= A[j*(incRowA+incColA)];
}
}
}
}
}
/*
 * Scale an m x n matrix in place: A[i, j] *= alpha.
 *
 * A is addressed as A[i*incRowA + j*incColA]. When alpha == 0 the entries
 * are overwritten with 0 rather than multiplied, which also clears NaN/Inf
 * values. Elements are visited column by column.
 */
void
ge_dscal(size_t m, size_t n,
         double alpha,
         double *A, ptrdiff_t incRowA, ptrdiff_t incColA)
{
    /* Written as !(alpha != 0) so that a NaN alpha takes the multiply path. */
    const int clear = !(alpha != 0);

    for (size_t col = 0; col < n; ++col) {
        for (size_t row = 0; row < m; ++row) {
            double *elem = &A[row*incRowA + col*incColA];

            if (clear) {
                *elem = 0;
            } else {
                *elem *= alpha;
            }
        }
    }
}
/*
 * Copy an m x n matrix: B[i, j] = A[i, j].
 *
 * A is addressed as A[i*incRowA + j*incColA] and B as
 * B[i*incRowB + j*incColB], so source and destination may use different
 * storage layouts. Elements are visited column by column.
 */
void
ge_dcopy(size_t m, size_t n,
         const double *A, ptrdiff_t incRowA, ptrdiff_t incColA,
         double *B, ptrdiff_t incRowB, ptrdiff_t incColB)
{
    size_t col = 0;

    while (col < n) {
        size_t row = 0;

        while (row < m) {
            B[row*incRowB + col*incColB] = A[row*incRowA + col*incColA];
            ++row;
        }
        ++col;
    }
}
/*
 * Matrix-matrix accumulation C += alpha * A * B with loop order j, l, i.
 *
 * A is m x k, B is k x n and C is m x n; each matrix is addressed through
 * its own row and column stride. The innermost loop runs over the rows of
 * one column of C.
 */
void
dgemm_jli(size_t m, size_t n, size_t k,
          double alpha,
          const double *A, ptrdiff_t incRowA, ptrdiff_t incColA,
          const double *B, ptrdiff_t incRowB, ptrdiff_t incColB,
          double *C, ptrdiff_t incRowC, ptrdiff_t incColC)
{
    for (size_t col = 0; col < n; ++col) {
        for (size_t mid = 0; mid < k; ++mid) {
            size_t row = 0;

            while (row < m) {
                double *dst = &C[row*incRowC + col*incColC];

                *dst += alpha*A[row*incRowA + mid*incColA]
                        *B[mid*incRowB + col*incColB];
                ++row;
            }
        }
    }
}
/*
 * Matrix-matrix accumulation C += alpha * A * B with loop order i, j, l.
 *
 * A is m x k, B is k x n and C is m x n; each matrix is addressed through
 * its own row and column stride. The innermost loop adds the k products
 * belonging to one entry of C, one at a time, directly into that entry.
 */
void
dgemm_ijl(size_t m, size_t n, size_t k,
          double alpha,
          const double *A, ptrdiff_t incRowA, ptrdiff_t incColA,
          const double *B, ptrdiff_t incRowB, ptrdiff_t incColB,
          double *C, ptrdiff_t incRowC, ptrdiff_t incColC)
{
    for (size_t row = 0; row < m; ++row) {
        for (size_t col = 0; col < n; ++col) {
            double *dst = &C[row*incRowC + col*incColC];
            size_t mid = 0;

            while (mid < k) {
                *dst += alpha*A[row*incRowA + mid*incColA]
                        *B[mid*incRowB + col*incColB];
                ++mid;
            }
        }
    }
}
/*
 * Block sizes for dgemm (overridable at build time).
 * NOTE(review): the current dgemm below does not use them; presumably they
 * are reserved for a cache-blocked implementation -- confirm before removing.
 */
#ifndef DGEMM_MC
#define DGEMM_MC 8
#endif
#ifndef DGEMM_KC
#define DGEMM_KC 256
#endif
#ifndef DGEMM_NC
#define DGEMM_NC 1024
#endif

/*
 * General matrix-matrix product: C = beta * C + alpha * A * B.
 *
 * A is m x k, B is k x n and C is m x n; each matrix is addressed as
 * M[i*incRowM + j*incColM].
 *
 * m, n, k  matrix dimensions
 * alpha    factor applied to the product A * B
 * beta     factor applied to C before the product is added; with beta == 0
 *          C is overwritten with zeros first (see ge_dscal)
 *
 * Quick returns:
 *   - C is empty (m == 0 or n == 0), or
 *   - the product contributes nothing (alpha == 0 or k == 0) and beta == 1:
 *     C is left untouched;
 *   - after scaling, alpha == 0 or k == 0: only the scaling is applied.
 */
void
dgemm(size_t m, size_t n, size_t k,
      double alpha,
      const double *A, ptrdiff_t incRowA, ptrdiff_t incColA,
      const double *B, ptrdiff_t incRowB, ptrdiff_t incColB,
      double beta,
      double *C, ptrdiff_t incRowC, ptrdiff_t incColC)
{
    if (m==0 || n==0 || ((alpha==0 || k==0) && beta==1)) {
        return;
    }

    ge_dscal(m, n, beta, C, incRowC, incColC);

    /* Nothing to accumulate: avoid touching A and B at all. */
    if (alpha==0 || k==0) {
        return;
    }

    dgemm_ijl(m, n, k, alpha, A, incRowA, incColA, B, incRowB, incColB,
              C, incRowC, incColC);
}
/*
 * Triangular solve with multiple right-hand sides, performed in place.
 *
 * Solves A * X = alpha * B for X, where B is the content of X on entry and
 * the solution overwrites it on return.
 *
 * NOTE(review): the original body was an unimplemented placeholder. The
 * contract implemented here (A applied from the left, not transposed) is
 * inferred from the signature and the usual BLAS trsm convention -- confirm
 * against the declaration in the project header.
 *
 * m        order of the triangular matrix A and number of rows of X
 * n        number of columns of X (number of right-hand sides)
 * lower    true: A is lower triangular; false: A is upper triangular.
 *          Only that triangle of A is read.
 * unit     true: the diagonal of A is not read and is treated as all ones
 * alpha    scalar applied to the right-hand sides; with alpha == 0 the
 *          matrix X is set to zero and A is not read at all
 * A        m x m matrix addressed as A[i*incRowA + j*incColA]
 * X        m x n matrix addressed as X[i*incRowX + j*incColX]
 */
void
dtrsm(size_t m, size_t n, bool lower, bool unit, double alpha,
      const double *A, ptrdiff_t incRowA, ptrdiff_t incColA,
      double *X, ptrdiff_t incRowX, ptrdiff_t incColX)
{
    if (m == 0 || n == 0) {
        return;
    }

    /* alpha == 0: the result is exactly zero, even if X held NaN/Inf. */
    if (alpha == 0) {
        for (size_t c = 0; c < n; ++c) {
            for (size_t r = 0; r < m; ++r) {
                X[r*incRowX + c*incColX] = 0;
            }
        }
        return;
    }

    /* Every column of X is an independent triangular system. */
    for (size_t c = 0; c < n; ++c) {
        double *x = &X[c*incColX];

        if (lower) {
            /* Forward substitution: entries 0 .. i-1 are already solved. */
            for (size_t i = 0; i < m; ++i) {
                double s = alpha * x[i*incRowX];

                for (size_t l = 0; l < i; ++l) {
                    s -= A[i*incRowA + l*incColA] * x[l*incRowX];
                }
                if (!unit) {
                    s /= A[i*(incRowA + incColA)];
                }
                x[i*incRowX] = s;
            }
        } else {
            /*
             * Backward substitution: entries i+1 .. m-1 are already solved.
             * The loop header counts i from m-1 down to 0 without unsigned
             * wrap-around.
             */
            for (size_t i = m; i-- > 0; ) {
                double s = alpha * x[i*incRowX];

                for (size_t l = i + 1; l < m; ++l) {
                    s -= A[i*incRowA + l*incColA] * x[l*incRowX];
                }
                if (!unit) {
                    s /= A[i*(incRowA + incColA)];
                }
                x[i*incRowX] = s;
            }
        }
    }
}
/*
 * LU factorization with row pivoting, based on rank-1 updates.
 *
 * For each of the min(m, n) elimination steps j:
 *   1. the entry of largest absolute value in column j, rows j .. m-1, is
 *      located with idamax and its row is swapped with row j (full rows of
 *      length n are exchanged);
 *   2. the chosen row index is recorded in p[j*incP];
 *   3. the entries below the diagonal in column j are divided by the pivot;
 *   4. the trailing (m-1-j) x (n-1-j) block is updated with dger.
 *
 * A is m x n, addressed as A[i*incRowA + j*incColA], and is overwritten.
 *
 * Returns -1 if every pivot was non-zero, otherwise the index j of the
 * first step whose pivot is exactly zero (the factorization stops there).
 */
ptrdiff_t
dgetrf_ger(size_t m, size_t n,
           double *A, ptrdiff_t incRowA, ptrdiff_t incColA,
           size_t *p, ptrdiff_t incP)
{
    const size_t numSteps = (n < m) ? n : m;

    for (size_t j = 0; j < numSteps; ++j) {
        double *diag = &A[j*incRowA + j*incColA];

        /* Pivot search and row interchange. */
        const size_t pivotRow = j + idamax(m-j, diag, incRowA);
        if (pivotRow != j) {
            dswap(n, &A[j*incRowA], incColA, &A[pivotRow*incRowA], incColA);
        }
        p[j*incP] = pivotRow;

        const double pivot = *diag;
        if (pivot == 0) {
            return j;
        }

        /* Scale the column below the pivot, then update the trailing block. */
        double *below = &A[(j+1)*incRowA + j*incColA];

        dscal(m-1-j, 1/pivot, below, incRowA);
        dger(m-1-j, n-1-j, -1,
             below, incRowA,
             &A[j*incRowA + (j+1)*incColA], incColA,
             &A[(j+1)*incRowA + (j+1)*incColA], incRowA, incColA);
    }
    return -1;
}
/*
 * LU factorization with row pivoting, based on matrix-vector operations.
 *
 * Columns are finished one at a time. For each of the min(m, n) steps j:
 *   1. rows 0 .. j-1 of column j are solved against the unit lower
 *      triangle of the leading j x j block (dtrsv);
 *   2. rows j .. m-1 of column j are updated with the product of
 *      A[j.., 0..j-1] and that solved part (dgemv);
 *   3. the entry of largest absolute value in rows j .. m-1 is located and
 *      its row swapped with row j (full rows of length n);
 *   4. the chosen row index is recorded in p[j*incP];
 *   5. the entries below the diagonal are divided by the pivot.
 * If n > m, the remaining columns m .. n-1 are then each solved against the
 * unit lower triangle of the leading m x m block.
 *
 * A is m x n, addressed as A[i*incRowA + j*incColA], and is overwritten.
 *
 * Returns -1 if every pivot was non-zero, otherwise the index j of the
 * first step whose pivot is exactly zero (the factorization stops there).
 */
ptrdiff_t
dgetrf_gemv(size_t m, size_t n,
            double *A, ptrdiff_t incRowA, ptrdiff_t incColA,
            size_t *p, ptrdiff_t incP)
{
    const size_t numSteps = (n < m) ? n : m;
    size_t col = 0;

    while (col < numSteps) {
        double *colTop = &A[col*incColA];
        double *diag = &A[col*incRowA+col*incColA];

        /* Bring column `col` up to date with all previous steps. */
        dtrsv(col, true, true,
              A, incRowA, incColA,
              colTop, incRowA);
        dgemv(m-col, col, -1,
              &A[col*incRowA], incRowA, incColA,
              colTop, incRowA,
              1,
              diag, incRowA);

        /* Pivot search and row interchange. */
        const size_t pivotRow = col + idamax(m-col, diag, incRowA);
        if (pivotRow != col) {
            dswap(n, &A[col*incRowA], incColA, &A[pivotRow*incRowA], incColA);
        }
        p[col*incP] = pivotRow;

        const double pivot = *diag;
        if (pivot == 0) {
            return col;
        }
        dscal(m-col-1, 1/pivot, &A[(col+1)*incRowA+col*incColA], incRowA);
        ++col;
    }

    /* Columns to the right of the square part (only present when n > m). */
    for (; col < n; ++col) {
        dtrsv(m, true, true,
              A, incRowA, incColA,
              &A[col*incColA], incRowA);
    }
    return -1;
}
/*
 * LU factorization entry point.
 *
 * Forwards all arguments unchanged to dgetrf_gemv and returns its result.
 */
ptrdiff_t
dgetrf(size_t m, size_t n,
       double *A, ptrdiff_t incRowA, ptrdiff_t incColA,
       size_t *p, ptrdiff_t incP)
{
    const ptrdiff_t info = dgetrf_gemv(m, n, A, incRowA, incColA, p, incP);

    return info;
}
/*
 * Apply a sequence of row interchanges to a matrix with n columns.
 *
 * For i = k0, ..., k1-1 (in this order) row i of A is exchanged with row
 * p[i*incP], unless both indices are equal. Pivot entries are absolute row
 * indices, which is the form in which the dgetrf_* routines of this file
 * store them.
 *
 * NOTE(review): the original body was an unimplemented placeholder. The
 * half-open range [k0, k1) is an assumption based on the C-style 0-based
 * indexing used throughout this file -- confirm against the callers.
 *
 * n        number of columns of A affected by each interchange
 * k0, k1   first pivot index applied and one past the last one
 * p        pivot vector with stride incP (read only)
 * A        matrix addressed as A[i*incRowA + j*incColA], modified in place
 */
void
dlaswp(size_t n, size_t k0, size_t k1,
       const size_t *p, ptrdiff_t incP,
       double *A, ptrdiff_t incRowA, ptrdiff_t incColA)
{
    for (size_t i = k0; i < k1; ++i) {
        const size_t other = p[i*incP];

        if (other == i) {
            continue;
        }
        for (size_t j = 0; j < n; ++j) {
            double *a = &A[i*incRowA + j*incColA];
            double *b = &A[other*incRowA + j*incColA];
            const double keep = *a;

            *a = *b;
            *b = keep;
        }
    }
}