#include <cstdio>
#include <cassert>
#include <chrono>
#include <complex>
#include <cmath>
#include <limits>
#include <random>
#include <flens/flens.cxx>
#ifndef TYPE_P
#define TYPE_P int64_t
#endif
#ifndef MAX_N
#define MAX_N 6000
#endif
#ifndef MAX_M
#define MAX_M 6000
#endif
#include "common.h"
//------------------------------------------------------------------------------
// Unblocked LU factorization with partial pivoting
template <typename MA, typename VP>
typename MA::IndexType
lu_unblocked_gemv(flens::GeMatrix<MA> &A, flens::DenseVector<VP> &p)
{
typedef typename MA::IndexType IndexType;
typedef typename MA::ElementType ElementType;
flens::Underscore<IndexType> _;
constexpr ElementType Zero(0);
constexpr ElementType One(1);
constexpr ElementType MinusOne(-1);
assert(A.numRows()==p.length());
const IndexType m = A.numRows();
const IndexType n = A.numCols();
const IndexType mn = std::min(m, n);
IndexType info = 0;
for (IndexType j=1; j<=n; ++ j) {
const IndexType jm = std::min(j, m);
for (IndexType i=1; i<jm; ++i) {
if (i!=p(i)) {
std::swap(A(i,j), A(p(i),j));
}
}
for (IndexType i=2; i<jm; ++i) {
const auto Ai_ = A(i,_(1,i-1));
const auto A_j = A(_(1,i-1),j);
A(i,j) -= flens::blas::dot(Ai_,A_j);
}
if (j<=m) {
flens::blas::mv(flens::NoTrans, MinusOne,
A(_(j,m),_(1,j-1)), A(_(1,j-1),j),
One,
A(_(j,m),j));
p(j) = j -1 + flens::blas::iamax(A(_(j,m),j));
ElementType alpha = A(p(j),j);
if (alpha!=Zero) {
if (j!=p(j)) {
flens::blas::swap(A(j,_(1,j)), A(p(j),_(1,j)));
}
A(_(j+1,m),j) *= One/alpha;
} else {
if (info==0) {
info = j;
}
}
}
}
return info;
}
// Unblocked LU factorization with partial pivoting
template <typename MA, typename VP>
typename MA::IndexType
lu_unblocked_ger(flens::GeMatrix<MA> &A, flens::DenseVector<VP> &p)
{
typedef typename MA::IndexType IndexType;
typedef typename MA::ElementType ElementType;
flens::Underscore<IndexType> _;
constexpr ElementType Zero(0);
constexpr ElementType One(1);
constexpr ElementType MinusOne(-1);
assert(A.numRows()==p.length());
const IndexType m = A.numRows();
const IndexType n = A.numCols();
const IndexType mn = std::min(m, n);
IndexType info = 0;
for (IndexType i=1; i<=mn; ++ i) {
auto A_i = A(_,i);
auto Ai_ = A(i,_);
p(i) = i -1 + flens::blas::iamax(A_i(_(i, m)));
if (A(p(i),i) != Zero) {
if (p(i)!=i) {
flens::blas::swap(Ai_, A(p(i),_));
}
} else {
if (info==0) {
info = i;
}
}
ElementType alpha = One/A(i,i);
A_i(_(i+1,m)) *= alpha;
flens::blas::r(MinusOne, A_i(_(i+1,m)), Ai_(_(i+1,n)),
A(_(i+1,m),_(i+1,n)));
}
return info;
}
// Blocked recursive LU factorization with partial pivoting
template <typename MA, typename VP>
typename MA::IndexType
lu_blocked_recursive(flens::GeMatrix<MA> &A, flens::DenseVector<VP> &p)
{
typedef typename MA::IndexType IndexType;
typedef typename MA::ElementType ElementType;
flens::Underscore<IndexType> _;
constexpr ElementType One(1);
constexpr ElementType MinusOne(-1);
IndexType info = 0;
IndexType info_ = 0;
IndexType m = A.numRows();
IndexType n = A.numCols();
IndexType mn = std::min(m, n);
IndexType bs = 4;
if (m==0 || n==0) {
return info;
}
if (bs>=mn) {
info = lu_unblocked_gemv(A, p);
} else {
IndexType k;
for (k=1; k<mn/2; k*=2);
auto A_left = A(_,_(1,k));
info_ = lu_blocked_recursive(A_left, p);
if (info==0 && info_>0) {
info = info_;
}
auto A_right = A(_,_(k+1,n));
auto mk = std::min(m, k);
//flens::lapack::laswp(A_right, p(_(1,mk)));
cxxblas::laswp(n-k,
A_right.data(), A_right.strideRow(), A_right.strideCol(),
1, mk,
p.data(),
p.stride());
const auto L = A(_(1,mk),_(1,mk)).lowerUnit();
auto U_right = A(_(1,mk),_(mk+1,n));
flens::blas::sm(flens::Left, flens::NoTrans, One, L, U_right);
auto A_ = A(_(mk+1,m),_(mk+1,n));
auto p_ = p(_(mk+1,m));
flens::blas::mm(flens::NoTrans, flens::NoTrans,
MinusOne, A(_(mk+1,m),_(1,mk)), A(_(1,mk),_(mk+1,n)),
One, A_);
info_ = lu_blocked_recursive(A_, p_);
if (info==0 && info_>0) {
info = info_ + mk + 1;
}
//auto A_left_bottom = A(_(mk+1,m),_(1,k));
//flens::lapack::laswp(A_left_bottom, p(_(mk+1,mn)));
for (IndexType i=mk+1; i<=mn; ++i) {
p(i) += mk;
}
cxxblas::laswp(k,
A.data(), A.strideRow(), A.strideCol(),
mk+1, mn,
p.data(),
p.stride());
}
return info;
}
//------------------------------------------------------------------------------
#ifndef BLASINT
#define BLASINT int64_t
#endif
#define BLASINDEX TYPE_P
extern "C" {
void
sgetrf_(const BLASINT *m, const BLASINT *n,
float *A, const BLASINT *ldA,
BLASINDEX *ipiv,
BLASINT *info);
void
dgetrf_(const BLASINT *m, const BLASINT *n,
double *A, const BLASINT *ldA,
BLASINDEX *ipiv,
BLASINT *info);
void
cgetrf_(const BLASINT *m, const BLASINT *n,
float *A, const BLASINT *ldA,
BLASINDEX *ipiv,
BLASINT *info);
void
zgetrf_(const BLASINT *m, const BLASINT *n,
double *A, const BLASINT *ldA,
BLASINDEX *ipiv,
BLASINT *info);
} // extern "C"
template <typename Index>
Index
getrf(Index m, Index n, float *A, Index ldA, BLASINDEX *p)
{
BLASINT M = m;
BLASINT N = n;
BLASINT LDA = ldA;
BLASINT INFO = 0;
sgetrf_(&M, &N, A, &LDA, p, &INFO);
return INFO;
}
template <typename Index>
Index
getrf(Index m, Index n, double *A, Index ldA, BLASINDEX *p)
{
BLASINT M = m;
BLASINT N = n;
BLASINT LDA = ldA;
BLASINT INFO = 0;
dgetrf_(&M, &N, A, &LDA, p, &INFO);
return INFO;
}
template <typename Index>
Index
getrf(Index m, Index n, std::complex<float> *A, Index ldA, BLASINDEX *p)
{
BLASINT M = m;
BLASINT N = n;
BLASINT LDA = ldA;
BLASINT INFO = 0;
cgetrf_(&M, &N, (float *)A, &LDA, p, &INFO);
return INFO;
}
template <typename Index>
Index
getrf(Index m, Index n, std::complex<double> *A, Index ldA, BLASINDEX *p)
{
BLASINT M = m;
BLASINT N = n;
BLASINT LDA = ldA;
BLASINT INFO = 0;
zgetrf_(&M, &N, (double *)A, &LDA, p, &INFO);
return INFO;
}
//------------------------------------------------------------------------------
//
// For checking I don't care for efficiency of elegance ...
//
template <typename MA1, typename MA2, typename VP>
double
checkLU(const flens::GeMatrix<MA1> &A1, flens::GeMatrix<MA2> &A2,
const flens::DenseVector<VP> &p)
{
typedef typename MA1::IndexType IndexType;
typedef typename MA1::ElementType T;
typedef typename ComplexTrait<T>::PT PT;
constexpr T Zero(0);
constexpr T One(1);
IndexType m = A1.numRows();
IndexType n = A1.numCols();
flens::GeMatrix<MA1> L(m,m);
// Copy L form A2 and set A2 to U
for (IndexType j=1; j<=std::min(m,n); ++j) {
for (IndexType i=1; i<=m; ++i) {
if (i==j) {
L(i,i) = 1;
} else if (i>j) {
L(i,j) = A2(i,j);
A2(i,j) = 0;
} else if (j>i) {
L(i,j) = 0;
}
}
}
flens::GeMatrix<MA1> LU(m,n);
flens::blas::mm(flens::NoTrans, flens::NoTrans, One, L, A2, Zero, LU);
// LU <- inv(P)*L*U
flens::lapack::laswp(LU, p.reverse());
double diff = 0;
for (IndexType j=1; j<=n; ++j) {
for (IndexType i=1; i<=m; ++i) {
diff += std::abs(A1(i,j)-LU(i,j));
}
}
double aNorm = asum(A1);
diff /= (aNorm*std::min(m,n)*std::numeric_limits<PT>::epsilon());
return diff;
}
//------------------------------------------------------------------------------
int
main()
{
typedef flens::GeMatrix<flens::FullStorage<TYPE_A> > GeMatrixA;
typedef flens::DenseVector<flens::Array<BLASINDEX> > DenseVectorP;
const std::size_t min_m = MIN_M;
const std::size_t min_n = MIN_N;
const std::size_t max_m = MAX_M;
const std::size_t max_n = MAX_N;
const std::size_t inc_m = INC_M;
const std::size_t inc_n = INC_N;
std::printf("#%5s %5s %5s", "m", "n", "info");
std::printf("%20s %9s", "FLENS/ulmBLAS: t", "MFLOPS");
std::printf("%20s %9s %9s", BLAS_LIB ": t", "MFLOPS", "Residual");
std::printf("\n");
WallTime<double> walltime;
for (std::size_t m=min_m, n=min_n;
m<=max_m && n<=max_n;
m += inc_m, n += inc_n)
{
GeMatrixA A_org(m, n);
DenseVectorP p(m);
GeMatrixA A(m, n);
double res;
std::size_t info;
std::size_t minMN = std::min(m,n);
std::size_t maxMN = std::max(m,n);
std::size_t flops;
flops = minMN*minMN*maxMN -(minMN*minMN*minMN/3.) -(minMN*minMN/2.);
fill(A_org);
A = A_org;
walltime.tic();
info = lu_blocked_recursive(A, p);
//info = lu_unblocked_gemv(A, p);
double t1 = walltime.toc();
res = checkLU(A_org, A, p);
std::printf(" %5ld %5ld %5ld", m, n, info);
std::printf("%20.4lf %9.2lf", t1, flops/(1000000*t1));
std::printf(" %9.1e", res);
std::printf("\n");
}
}