1
      2
      3
      4
      5
      6
      7
      8
      9
     10
     11
     12
     13
     14
     15
     16
     17
     18
     19
     20
     21
     22
     23
     24
     25
     26
     27
     28
     29
     30
     31
     32
     33
     34
     35
     36
     37
     38
     39
     40
     41
     42
     43
     44
     45
     46
     47
     48
     49
     50
     51
     52
     53
     54
     55
     56
     57
     58
     59
     60
     61
     62
     63
     64
     65
     66
     67
     68
     69
     70
     71
     72
     73
     74
     75
     76
     77
     78
     79
#ifndef ULMBLAS_IMPL_LAPACK_GETRF_TCC
#define ULMBLAS_IMPL_LAPACK_GETRF_TCC 1

#include <algorithm>
#include <ulmblas/impl/lapack/getrf.h>
#include <ulmblas/impl/lapack/getf2.h>
#include <ulmblas/impl/lapack/laenv.h>
#include <ulmblas/impl/lapack/laswp.h>
#include <ulmblas/impl/level3/gemm.h>
#include <ulmblas/impl/level3/trlsm.h>

namespace ulmBLAS {

template <typename IndexType, typename T>
IndexType
getrf(IndexType    m,
      IndexType    n,
      T            *A,
      IndexType    incRowA,
      IndexType    incColA,
      IndexType    *piv,
      IndexType    incPiv)
{
    const T One(1);

    if (m==0 || n==0) {
        return 0;
    }

    const IndexType nb = laenv<T>(1"GETRF""", m, n);

    IndexType info = 0;

    if (nb<=1 || nb>=std::min(m,n)) {
        info = getf2(m, n, A, incRowA, incColA, piv, incPiv);
    } else {
        for (IndexType j=0; j<std::min(m,n); j+=nb) {
            const IndexType jb = std::min(std::min(m,n)-j, nb);

            IndexType info_ = getf2(m-j, jb,
                                    &A[j*(incRowA+incColA)], incRowA, incColA,
                                    &piv[j*incPiv], incPiv);

            if (info==0 && info_>0) {
                info = info_ + j;
            }

            for (IndexType i=j; i<std::min(m,j+jb); ++i) {
                piv[i*incPiv] += j;
            }

            laswp(j, A, incRowA, incColA, j, j+jb, piv, incPiv);

            if (j+jb<n) {
                laswp(n-j-jb,
                      &A[(j+jb)*incColA], incRowA, incColA,
                      j, j+jb,
                      piv, incPiv);

                trlsm(jb, n-j-jb, One,
                      falsetrue, &A[j*(incRowA+incColA)], incRowA, incColA,
                      &A[j*incRowA+(j+jb)*incColA], incRowA, incColA);

                if (j+jb<m) {
                    gemm(m-j-jb, n-j-jb, jb, -One,
                         false, &A[(j+jb)*incRowA+j*incColA], incRowA, incColA,
                         false, &A[j*incRowA+(j+jb)*incColA], incRowA, incColA,
                         One,
                         &A[(j+jb)*(incRowA+incColA)], incRowA, incColA);
                }
            }
        }
    }
    return info;
}

// namespace ulmBLAS

#endif // ULMBLAS_IMPL_LAPACK_GETRF_TCC