#include <ulmblas/level3.h>
#include <ulmblas/level2.h>
#include <ulmblas/level1.h>
#include <stdlib.h>
void
dgemm_mv(int m, int n, int k,
double alpha,
const double *A, int incRowA, int incColA,
const double *B, int incRowB, int incColB,
double beta,
double *C, int incRowC, int incColC)
{
int j;
for (j=0; j<n; ++j) {
dgemv(m, k,
alpha,
A, incRowA, incColA,
&B[j*incColB], incRowB,
beta,
&C[j*incColC], incRowC);
}
}
void
dgemm_jil(int m, int n, int k,
double alpha,
const double *A, int incRowA, int incColA,
const double *B, int incRowB, int incColB,
double beta,
double *C, int incRowC, int incColC)
{
int i, j, l;
for (j=0; j<n; ++j) {
for (i=0; i<m; ++i) {
C[i*incRowC+j*incColC] *= beta;
for (l=0; l<k; ++l) {
C[i*incRowC+j*incColC] += alpha*A[i*incRowA+l*incColA]
*B[l*incRowB+j*incColB];
}
}
}
}
void
dgemm_jli(int m, int n, int k,
double alpha,
const double *A, int incRowA, int incColA,
const double *B, int incRowB, int incColB,
double beta,
double *C, int incRowC, int incColC)
{
int i, j, l;
for (j=0; j<n; ++j) {
for (l=0; l<k; ++l) {
for (i=0; i<m; ++i) {
if (l==0) {
C[i*incRowC+j*incColC] *= beta;
}
C[i*incRowC+j*incColC] += alpha*A[i*incRowA+l*incColA]
*B[l*incRowB+j*incColB];
}
}
}
}
void
dgemm_vm(int m, int n, int k,
double alpha,
const double *A, int incRowA, int incColA,
const double *B, int incRowB, int incColB,
double beta,
double *C, int incRowC, int incColC)
{
int i;
for (i=0; i<m; ++i) {
dgemv(n, k,
alpha,
B, incColB, incRowB,
&A[i*incRowA], incColA,
beta,
&C[i*incRowC], incColC);
}
}
void
dgemm_ilj(int m, int n, int k,
double alpha,
const double *A, int incRowA, int incColA,
const double *B, int incRowB, int incColB,
double beta,
double *C, int incRowC, int incColC)
{
int i, j, l;
for (i=0; i<m; ++i) {
for (l=0; l<k; ++l) {
for (j=0; j<n; ++j) {
if (l==0) {
C[i*incRowC+j*incColC] *= beta;
}
C[i*incRowC+j*incColC] += alpha*A[i*incRowA+l*incColA]
*B[l*incRowB+j*incColB];
}
}
}
}
void
dgemm_ijl(int m, int n, int k,
double alpha,
const double *A, int incRowA, int incColA,
const double *B, int incRowB, int incColB,
double beta,
double *C, int incRowC, int incColC)
{
int i, j, l;
for (i=0; i<m; ++i) {
for (j=0; j<n; ++j) {
C[i*incRowC+j*incColC] *= beta;
for (l=0; l<k; ++l) {
C[i*incRowC+j*incColC] += alpha*A[i*incRowA+l*incColA]
*B[l*incRowB+j*incColB];
}
}
}
}
void
dgemm_vv(int m, int n, int k,
double alpha,
const double *A, int incRowA, int incColA,
const double *B, int incRowB, int incColB,
double beta,
double *C, int incRowC, int incColC)
{
int l;
dgescal(m, n, alpha, C, incRowC, incColC);
for (l=0; l<k; ++l) {
dger(m, n, alpha,
&A[l*incColA], incRowA,
&B[l*incRowB], incColB,
C, incRowC, incColC);
}
}
void
dgemm_lij(int m, int n, int k,
double alpha,
const double *A, int incRowA, int incColA,
const double *B, int incRowB, int incColB,
double beta,
double *C, int incRowC, int incColC)
{
int i, j, l;
for (j=0; j<n; ++j) {
for (i=0; i<m; ++i) {
C[i*incRowC+j*incColC] *= beta;
}
}
for (l=0; l<k; ++l) {
for (i=0; i<m; ++i) {
for (j=0; j<n; ++j) {
C[i*incRowC+j*incColC] += alpha*A[i*incRowA+l*incColA]
*B[l*incRowB+j*incColB];
}
}
}
}
void
dgemm_lji(int m, int n, int k,
double alpha,
const double *A, int incRowA, int incColA,
const double *B, int incRowB, int incColB,
double beta,
double *C, int incRowC, int incColC)
{
int i, j, l;
for (j=0; j<n; ++j) {
for (i=0; i<m; ++i) {
C[i*incRowC+j*incColC] *= beta;
}
}
for (l=0; l<k; ++l) {
for (j=0; j<n; ++j) {
for (i=0; i<m; ++i) {
C[i*incRowC+j*incColC] += alpha*A[i*incRowA+l*incColA]
*B[l*incRowB+j*incColB];
}
}
}
}
#ifndef GEMM_MC
#define GEMM_MC 256
#endif
#ifndef GEMM_NC
#define GEMM_NC 1024
#endif
#ifndef GEMM_KC
#define GEMM_KC 256
#endif
void
dgemm_blk_jli(int m, int n, int k,
double alpha,
const double *A, int incRowA, int incColA,
const double *B, int incRowB, int incColB,
double beta,
double *C, int incRowC, int incColC)
{
double *A_ = (double *)malloc(sizeof(double)*GEMM_MC*GEMM_KC);
double *B_ = (double *)malloc(sizeof(double)*GEMM_KC*GEMM_NC);
double *C_ = (double *)malloc(sizeof(double)*GEMM_MC*GEMM_NC);
int i, j, l;
dgescal(m, n, beta, C, incRowC, incColC);
for (j=0; j<n; j+=GEMM_NC) {
int nc = (j+GEMM_NC<=n) ? GEMM_NC
: n - j;
for (l=0; l<k; l+=GEMM_KC) {
int kc = (l+GEMM_KC<=k) ? GEMM_KC
: k - l;
dgecopy(kc, nc,
&B[l*incRowB+j*incColB], incRowB, incColB,
B_, 1, kc);
for (i=0; i<m; i+=GEMM_MC) {
int mc = (i+GEMM_MC<=m) ? GEMM_MC
: m - i;
dgecopy(mc, kc,
&A[i*incRowA+l*incColA], incRowA, incColA,
A_, 1, mc);
dgemm_jli(mc, nc, kc,
1.0,
A_, 1, mc,
B_, 1, kc,
0.0,
C_, 1, mc);
dgeaxpy(mc, nc, alpha,
C_, 1, mc,
&C[i*incRowC+j*incColC], incRowC, incColC);
}
}
}
free(A_);
free(B_);
free(C_);
}