Lösungsvorschlag

#include <ulmblas/level3.h>
#include <ulmblas/level2.h>
#include <ulmblas/level1.h>
#include <stdlib.h>

//
// BLAS Level 3
//

//-- GEMM ----------------------------------------------------------------------

void
dgemm_mv(int m, int n, int k,
         double alpha,
         const double *A, int incRowA, int incColA,
         const double *B, int incRowB, int incColB,
         double beta,
         double *C, int incRowC, int incColC)
{
    int j;

    for (j=0; j<n; ++j) {
        dgemv(m, k,
              alpha,
              A, incRowA, incColA,
              &B[j*incColB], incRowB,
              beta,
              &C[j*incColC], incRowC);
    }
}

void
dgemm_jil(int m, int n, int k,
          double alpha,
          const double *A, int incRowA, int incColA,
          const double *B, int incRowB, int incColB,
          double beta,
          double *C, int incRowC, int incColC)
{
    int i, j, l;

    for (j=0; j<n; ++j) {
        for (i=0; i<m; ++i) {
            C[i*incRowC+j*incColC] *= beta;
            for (l=0; l<k; ++l) {
                C[i*incRowC+j*incColC] += alpha*A[i*incRowA+l*incColA]
                                               *B[l*incRowB+j*incColB];
            }
        }
    }
}

void
dgemm_jli(int m, int n, int k,
          double alpha,
          const double *A, int incRowA, int incColA,
          const double *B, int incRowB, int incColB,
          double beta,
          double *C, int incRowC, int incColC)
{
    int i, j, l;

    for (j=0; j<n; ++j) {
        for (l=0; l<k; ++l) {
            for (i=0; i<m; ++i) {
                if (l==0) {
                    C[i*incRowC+j*incColC] *= beta;
                }
                C[i*incRowC+j*incColC] += alpha*A[i*incRowA+l*incColA]
                                               *B[l*incRowB+j*incColB];
            }
        }
    }
}

void
dgemm_vm(int m, int n, int k,
         double alpha,
         const double *A, int incRowA, int incColA,
         const double *B, int incRowB, int incColB,
         double beta,
         double *C, int incRowC, int incColC)
{
    int i;

    for (i=0; i<m; ++i) {
        dgemv(n, k,
              alpha,
              B, incColB, incRowB,
              &A[i*incRowA], incColA,
              beta,
              &C[i*incRowC], incColC);
    }
}

void
dgemm_ilj(int m, int n, int k,
          double alpha,
          const double *A, int incRowA, int incColA,
          const double *B, int incRowB, int incColB,
          double beta,
          double *C, int incRowC, int incColC)
{
    int i, j, l;

    for (i=0; i<m; ++i) {
        for (l=0; l<k; ++l) {
            for (j=0; j<n; ++j) {
                if (l==0) {
                    C[i*incRowC+j*incColC] *= beta;
                }
                C[i*incRowC+j*incColC] += alpha*A[i*incRowA+l*incColA]
                                               *B[l*incRowB+j*incColB];
            }
        }
    }
}

void
dgemm_ijl(int m, int n, int k,
          double alpha,
          const double *A, int incRowA, int incColA,
          const double *B, int incRowB, int incColB,
          double beta,
          double *C, int incRowC, int incColC)
{
    int i, j, l;

    for (i=0; i<m; ++i) {
        for (j=0; j<n; ++j) {
            C[i*incRowC+j*incColC] *= beta;
            for (l=0; l<k; ++l) {
                C[i*incRowC+j*incColC] += alpha*A[i*incRowA+l*incColA]
                                               *B[l*incRowB+j*incColB];
            }
        }
    }
}

void
dgemm_vv(int m, int n, int k,
         double alpha,
         const double *A, int incRowA, int incColA,
         const double *B, int incRowB, int incColB,
         double beta,
         double *C, int incRowC, int incColC)
{
    int l;

    dgescal(m, n, alpha, C, incRowC, incColC);

    for (l=0; l<k; ++l) {
        dger(m, n, alpha,
             &A[l*incColA], incRowA,
             &B[l*incRowB], incColB,
             C, incRowC, incColC);
    }
}

void
dgemm_lij(int m, int n, int k,
          double alpha,
          const double *A, int incRowA, int incColA,
          const double *B, int incRowB, int incColB,
          double beta,
          double *C, int incRowC, int incColC)
{
    int i, j, l;

    for (j=0; j<n; ++j) {
        for (i=0; i<m; ++i) {
            C[i*incRowC+j*incColC] *= beta;
        }
    }
    for (l=0; l<k; ++l) {
        for (i=0; i<m; ++i) {
            for (j=0; j<n; ++j) {
                C[i*incRowC+j*incColC] += alpha*A[i*incRowA+l*incColA]
                                               *B[l*incRowB+j*incColB];
            }
        }
    }
}

void
dgemm_lji(int m, int n, int k,
          double alpha,
          const double *A, int incRowA, int incColA,
          const double *B, int incRowB, int incColB,
          double beta,
          double *C, int incRowC, int incColC)
{
    int i, j, l;

    for (j=0; j<n; ++j) {
        for (i=0; i<m; ++i) {
            C[i*incRowC+j*incColC] *= beta;
        }
    }
    for (l=0; l<k; ++l) {
        for (j=0; j<n; ++j) {
            for (i=0; i<m; ++i) {
                C[i*incRowC+j*incColC] += alpha*A[i*incRowA+l*incColA]
                                               *B[l*incRowB+j*incColB];
            }
        }
    }
}


//--------------------------------------------------------------------------------
#ifndef GEMM_MC
#define GEMM_MC     256
#endif

#ifndef GEMM_NC
#define GEMM_NC     256
#endif

#ifndef GEMM_KC
#define GEMM_KC     1024
#endif

void
dgemm_blk_jli(int m, int n, int k,
              double alpha,
              const double *A, int incRowA, int incColA,
              const double *B, int incRowB, int incColB,
              double beta,
              double *C, int incRowC, int incColC)
{
    double *A_ = (double *)malloc(sizeof(double)*GEMM_MC*GEMM_KC);
    double *B_ = (double *)malloc(sizeof(double)*GEMM_KC*GEMM_NC);
    double *C_ = (double *)malloc(sizeof(double)*GEMM_MC*GEMM_NC);

    int i, j, l;

    dgescal(m, n, beta, C, incRowC, incColC);

    for (j=0; j<n; j+=GEMM_NC) {
        int nc = (j+GEMM_NC<=n) ? GEMM_NC
                                : n - j;


        for (l=0; l<k; l+=GEMM_KC) {
            int kc = (l+GEMM_KC<=k) ? GEMM_KC
                                    : k - l;

            dgecopy(kc, nc,
                    &B[l*incRowB+j*incColB], incRowB, incColB,
                    B_, 1, kc);

            for (i=0; i<m; i+=GEMM_MC) {
                int mc = (i+GEMM_MC<=m) ? GEMM_MC
                                        : m - i;

                dgecopy(mc, kc,
                        &A[i*incRowA+l*incColA], incRowA, incColA,
                        A_, 1, mc);

                dgemm_jli(mc, nc, kc,
                          1.0,
                          A_, 1, mc,
                          B_, 1, kc,
                          0.0,
                          C_, 1, mc);

                dgeaxpy(mc, nc, alpha,
                        C_, 1, mc,
                        &C[i*incRowC+j*incColC], incRowC, incColC);
            }
        }
    }

    free(A_);
    free(B_);
    free(C_);
}

Benchmark

$shell> pwd
/home/numerik/hpc/ss16/session07/hpc_project_blk/ulmblas
$shell> make
make: Nothing to be done for 'all'.
$shell> 
$shell> pwd
/home/numerik/hpc/ss16/session07/hpc_project_blk/bench
$shell> make
gcc -Wall -mavx -O3 -I ..   -c -o aux.o aux.c
gcc -Wall -mavx -O3 -I ..   -c -o refblas.o refblas.c
gcc -Wall -mavx -O3 -I ..   -c -o errbound.o errbound.c
gcc -Wall -mavx -O3 -I ..  aux.o  refblas.o  errbound.o -o bench_gemm_blk bench_gemm_blk.c ../ulmblas.a
gcc -Wall -mavx -O3 -I ..  aux.o  refblas.o  errbound.o -o bench_gemm_vv bench_gemm_vv.c ../ulmblas.a
gcc -Wall -mavx -O3 -I ..  aux.o  refblas.o  errbound.o -o bench_gemm_vm bench_gemm_vm.c ../ulmblas.a
gcc -Wall -mavx -O3 -I ..  aux.o  refblas.o  errbound.o -o bench_gemm_mv bench_gemm_mv.c ../ulmblas.a
rm refblas.o aux.o errbound.o
$shell> 
$shell> ./bench_gemm_blk > report.gemm_blk_ccc
$shell> 
set terminal svg size 1200, 500
set output "bench.gemm_ccc.svg"
set xlabel "Matrix Dimension (M=N=K)"
set ylabel "MFLOPS"
set title "GEMM (Blocked): A,B, C Col-Major"
set key outside
set pointsize 1.25
plot "report.gemm_mv_ccc"  using 1:8  with linespoints lt 1 lw 1 title "dgemm_jil", \
     "report.gemm_mv_ccc"  using 1:11 with linespoints lt 2 lw 1 title "dgemm_jli", \
     "report.gemm_vm_ccc"  using 1:8  with linespoints lt 3 lw 1 title "dgemm_ijl", \
     "report.gemm_vm_ccc"  using 1:11 with linespoints lt 4 lw 1 title "dgemm_ilj", \
     "report.gemm_vv_ccc"  using 1:8  with linespoints lt 5 lw 1 title "dgemm_lij", \
     "report.gemm_vv_ccc"  using 1:11 with linespoints lt 6 lw 1 title "dgemm_lji", \
     "report.gemm_blk_ccc" using 1:8  with linespoints lt 7 lw 1 title "dgemm_blk_jli"
$shell> gnuplot plot.gemm_blk
$shell>