Possible Solution

Setup a Framework for Testing

So again, we postpone the actual implementation of gepack_B!

#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>

void
initGeMatrix(size_t m, size_t n,
             double *A,
             ptrdiff_t incRowA, ptrdiff_t incColA)
{
    for (size_t i=0; i<m; ++i) {
        for (size_t j=0; j<n; ++j) {
            A[i*incRowA+j*incColA] = i*n + j + 1;
        }
    }
}

void
printGeMatrix(size_t m, size_t n,
              const double *A,
              ptrdiff_t incRowA, ptrdiff_t incColA)
{
    for (size_t i=0; i<m; ++i) {
        for (size_t j=0; j<n; ++j) {
            printf("%10.4lf ", A[i*incRowA+j*incColA]);
        }
        printf("\n");
    }
    printf("\n");
}

#ifndef DGEMM_KC
#define DGEMM_KC    8
#endif

#ifndef DGEMM_NC
#define DGEMM_NC    10
#endif

#ifndef DGEMM_NR
#define DGEMM_NR    5
#endif

int
main()
{
    size_t      k       = 7;
    size_t      n       = 8;
    double      *B      = malloc(k*n*sizeof(*B));
    ptrdiff_t   incRowB = 1;
    ptrdiff_t   incColB = k;

    double      *p      = malloc(DGEMM_KC*DGEMM_NC*sizeof(*p));

    initGeMatrix(k, n, B, incRowB, incColB);
    printGeMatrix(k, n, B, incRowB, incColB);
    printGeMatrix(1, DGEMM_KC*DGEMM_NC, p, 1, 1);

    free(p);
    free(B);
}

Implement and test dgepack_B

After compiling and running the above code we start to implement dgepack_B:

#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>

void
initGeMatrix(size_t m, size_t n,
             double *A,
             ptrdiff_t incRowA, ptrdiff_t incColA)
{
    for (size_t i=0; i<m; ++i) {
        for (size_t j=0; j<n; ++j) {
            A[i*incRowA+j*incColA] = i*n + j + 1;
        }
    }
}

void
printGeMatrix(size_t m, size_t n,
              const double *A,
              ptrdiff_t incRowA, ptrdiff_t incColA)
{
    for (size_t i=0; i<m; ++i) {
        for (size_t j=0; j<n; ++j) {
            printf("%10.4lf ", A[i*incRowA+j*incColA]);
        }
        printf("\n");
    }
    printf("\n");
}

#ifndef DGEMM_KC
#define DGEMM_KC    8
#endif

#ifndef DGEMM_NC
#define DGEMM_NC    10
#endif

#ifndef DGEMM_NR
#define DGEMM_NR    5
#endif

void
dgepack_B(size_t k, size_t n,
          const double *B, ptrdiff_t incRowB, ptrdiff_t incColB,
          double *p)
{
    size_t nb = (n+DGEMM_NR-1)/DGEMM_NR;

    for (size_t j1=0; j1<nb; ++j1) {
        for (size_t j0=0; j0<DGEMM_NR; ++j0) {
            for (size_t l=0; l<k; ++l) {
                size_t j  = j1*DGEMM_NR + j0;
                size_t nu = j1*DGEMM_NR*k + l*DGEMM_NR + j0;
                p[nu] = (j<n) ? B[l*incRowB + j*incColB]
                              : 0;
            }
        }
    }
}

int
main()
{
    size_t      k       = 7;
    size_t      n       = 8;
    double      *B      = malloc(k*n*sizeof(*B));
    ptrdiff_t   incRowB = 1;
    ptrdiff_t   incColB = k;

    double      *p      = malloc(DGEMM_KC*DGEMM_NC*sizeof(*p));

    initGeMatrix(k, n, B, incRowB, incColB);
    printGeMatrix(k, n, B, incRowB, incColB);
    dgepack_B(k, n, B, incRowB, incColB, p);
    printGeMatrix(1, DGEMM_KC*DGEMM_NC, p, 1, 1);

    free(p);
    free(B);
}