#include <cassert>
#include <chrono>
#include <cmath>
#include <limits>
#include <random>
#include <type_traits>
#include <boost/timer.hpp>
#include <boost/numeric/ublas/matrix.hpp>
#include <boost/numeric/ublas/io.hpp>
#include <boost/numeric/ublas/lu.hpp>
#include <boost/numeric/ublas/operation.hpp>
#include "gemm.hpp"
#include "lu.hpp"
#include <mkl_lapack.h>
#include <Eigen/Dense>
template <typename T>
struct WallTime
{
    void
    tic()
    {
        t0 = std::chrono::high_resolution_clock::now();
    }
    T
    toc()
    {
        using namespace std::chrono;
        elapsed = high_resolution_clock::now() - t0;
        return duration<T,seconds::period>(elapsed).count();
    }
    std::chrono::high_resolution_clock::time_point t0;
    std::chrono::high_resolution_clock::duration   elapsed;
};
//
// For checking I don't care for efficiency of elegance ...
//
template <typename MatrixA1, typename MatrixA2, typename MatrixP>
double
checkLU(const MatrixA1 &A1, MatrixA2 &A2, const MatrixP &P)
{
    namespace ublas = boost::numeric::ublas;
    typedef typename MatrixA1::size_type    size_type;
    typedef typename MatrixA1::value_type   value_type;
    size_type m = A1.size1();
    size_type n = A1.size2();
    ublas::matrix<value_type>     L(m,m);
    // Copy L form A2 and set A2 to U
    for (size_type j=0; j<m; ++j) {
        for (size_type i=0; i<m; ++i) {
            if (i==j) {
                L(i,i) = 1;
            } else if (i>j) {
                L(i,j)  = A2(i,j);
                A2(i,j) = 0;
            } else if (j>i) {
                L(i,j)  = 0;
            }
        }
    }
    ublas::matrix<value_type>   LU(m,n);
    ublas::axpy_prod(L, A2, LU, true);
    // LU <- inv(P)*L*U
    foo::swap_rows(P, LU, true);
    double diff = 0;
    for (size_type j=0; j<n; ++j) {
        for (size_type i=0; i<m; ++i) {
            diff += std::abs(A1(i,j)-LU(i,j));
        }
    }
    return diff;
}
// fill rectangular matrix with random values
template <typename MATRIX>
void
fill(MATRIX &A)
{
    typedef typename MATRIX::size_type  size_type;
    typedef typename MATRIX::value_type T;
    std::random_device                  random;
    std::default_random_engine          mt(random());
    std::uniform_real_distribution<T>   uniform(-100,100);
    for (size_type i=0; i<A.size1(); ++i) {
        for (size_type j=0; j<A.size2(); ++j) {
            A(i,j) = uniform(mt);
        }
    }
}
#ifndef M_MAX
#define M_MAX 10000
#endif
#ifndef N_MAX
#define N_MAX 10000
#endif
int
main()
{
    namespace ublas = boost::numeric::ublas;
    const std::size_t m_min = 50;
    const std::size_t n_min = 50;
    const std::size_t m_max = M_MAX;
    const std::size_t n_max = N_MAX;
    const std::size_t m_inc = 50;
    const std::size_t n_inc = 50;
    typedef double              T;
    typedef ublas::row_major    SO;
    typedef Eigen::Matrix<T,Eigen::Dynamic,Eigen::Dynamic> EigenMat;
    WallTime<double>  walltime;
    for (std::size_t m=m_min, n=n_min;
         m<=m_max && n<=n_max;
         m+=m_inc, n+=n_inc)
    {
        ublas::matrix<T,SO>             A_org(m, n);
        ublas::matrix<T,SO>             A1(m, n);
        ublas::matrix<T,SO>             A2(m, n);
        ublas::matrix<T,SO>             A3(m, n);
        ublas::matrix<T,SO>             A4(m, n);
        ublas::permutation_matrix<T>    P1(m), P2(m), P3(m), P4(m);
        fill(A_org);
        A1 = A_org;
        A2 = A_org;
        A3 = A_org;
        A4 = A_org;
        double      diff = 0, t, mflops;
        std::size_t info;
        std::size_t minMN = std::min(m,n);
        std::size_t maxMN = std::max(m,n);
        mflops  = minMN*minMN*maxMN -(minMN*minMN*minMN/3.) -(minMN*minMN/2.);
        mflops /= 1000000;
        double    *A_mkl  = new double[m*n];
        long      *p_mkl_ = new long[m];
        long long *p_mkl  = (long long *)p_mkl_;
        for (std::size_t i=0; i<m; ++i) {
            for (std::size_t j=0; j<n; ++j) {
                A_mkl[i+j*m] = A1(i,j);
            }
        }
        long long m_ = m, n_ = n, info_;
        walltime.tic();
        dgetrf(&m_, &n_, A_mkl, &m_, p_mkl, &info_);
        t = walltime.toc();
        for (std::size_t i=0; i<m; ++i) {
            for (std::size_t j=0; j<n; ++j) {
                A1(i,j) = A_mkl[i+j*m];
            }
            P1(i) = p_mkl[i]-1;
        }
        info = info_;
        delete [] A_mkl;
        delete [] p_mkl_;
#ifndef NO_CHECK
        diff   = checkLU(A_org, A1, P1);
#endif
        std::printf("%4ld %4ld %4ld %16.3e %16.5lf %16.5lf",
                    m, n, info, diff, t, mflops/t);
        walltime.tic();
        info = foo::lu_blocked(A2, P2);
        t = walltime.toc();
#ifndef NO_CHECK
        diff = checkLU(A_org, A2, P2);
#endif
        std::printf("%4ld %16.3e %16.5lf %16.5lf",
                    info, diff, t, mflops/t);
        walltime.tic();
        info = foo::lu_blocked_recursive(A3, P3);
        t = walltime.toc();
#ifndef NO_CHECK
        diff = checkLU(A_org, A3, P3);
#endif
        std::printf("%4ld %16.3e %16.5lf %16.5lf",
                    info, diff, t, mflops/t);
        EigenMat eigA(m, m);
        for (std::size_t i=0; i<m; ++i) {
            for (std::size_t j=0; j<n; ++j) {
                eigA(i,j) = A4(i,j);
            }
        }
        Eigen::PartialPivLU<EigenMat> lu(eigA);
        walltime.tic();
        lu.compute(eigA);
        t = walltime.toc();
        auto eigA_ = lu.reconstructedMatrix();
#ifndef NO_CHECK
        diff = 0;
        for (std::size_t i=0; i<m; ++i) {
            for (std::size_t j=0; j<n; ++j) {
                diff += std::abs(A_org(i,j) - eigA_(i,j));
            }
        }
#endif
        std::printf("%4ld %16.3e %16.5lf %16.5lf\n",
                    info, diff, t, mflops/t);
    }
}