Possible Solution

Set up a Framework for Testing

So we postpone the actual implementation of dgepack_A!

#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>

/*
 * Fill the m x n matrix A with the values 1, 2, ..., m*n numbered row by
 * row, i.e. element (i,j) is set to i*n + j + 1.
 *
 * Element (i,j) is stored at A[i*incRowA + j*incColA].  The offset is
 * computed in ptrdiff_t (not size_t), so negative strides address the
 * intended element without relying on unsigned wrap-around.
 */
void
initGeMatrix(size_t m, size_t n,
             double *A,
             ptrdiff_t incRowA, ptrdiff_t incColA)
{
    for (size_t i=0; i<m; ++i) {
        for (size_t j=0; j<n; ++j) {
            ptrdiff_t offset = (ptrdiff_t)i*incRowA + (ptrdiff_t)j*incColA;

            A[offset] = (double)(i*n + j + 1);
        }
    }
}

/*
 * Print the m x n matrix A to stdout: one matrix row per output line,
 * each element formatted with "%10.4lf ", followed by one empty line.
 *
 * Element (i,j) is read from A[i*incRowA + j*incColA].  The offset is
 * computed in ptrdiff_t (not size_t), so negative strides address the
 * intended element without relying on unsigned wrap-around.
 */
void
printGeMatrix(size_t m, size_t n,
              const double *A,
              ptrdiff_t incRowA, ptrdiff_t incColA)
{
    for (size_t i=0; i<m; ++i) {
        for (size_t j=0; j<n; ++j) {
            ptrdiff_t offset = (ptrdiff_t)i*incRowA + (ptrdiff_t)j*incColA;

            printf("%10.4lf ", A[offset]);
        }
        printf("\n");
    }
    printf("\n");
}

/*
 * Block-size parameters.  Each one is only defined here if it was not
 * already defined (e.g. on the compiler command line with -DDGEMM_MC=...),
 * so the values below are defaults.
 */

/* main allocates the buffer p with DGEMM_MC*DGEMM_KC doubles. */
#ifndef DGEMM_MC
#define DGEMM_MC    8
#endif

#ifndef DGEMM_KC
#define DGEMM_KC    9
#endif

/* Panel height for dgepack_A, whose implementation is postponed in this
   first version; not referenced by the code below. */
#ifndef DGEMM_MR
#define DGEMM_MR    4
#endif

/*
 * Test driver (framework only, no packing yet).
 *
 * Creates a 6 x 7 matrix A stored column by column (incRowA = 1,
 * incColA = m), fills it with initGeMatrix and prints it.  Then prints
 * the buffer p of DGEMM_MC*DGEMM_KC doubles as a single row.
 *
 * Returns EXIT_SUCCESS, or EXIT_FAILURE if an allocation fails.
 */
int
main(void)
{
    size_t      m       = 6;
    size_t      k       = 7;
    ptrdiff_t   incRowA = 1;
    ptrdiff_t   incColA = (ptrdiff_t)m;

    double      *A      = malloc(m*k*sizeof(*A));

    /* p is printed below although nothing is stored in it yet; calloc
       makes sure we print zeros instead of reading indeterminate memory. */
    double      *p      = calloc(DGEMM_MC*DGEMM_KC, sizeof(*p));

    if (!A || !p) {
        fprintf(stderr, "out of memory\n");
        free(p);
        free(A);
        return EXIT_FAILURE;
    }

    initGeMatrix(m, k, A, incRowA, incColA);
    printGeMatrix(m, k, A, incRowA, incColA);
    printGeMatrix(1, DGEMM_MC*DGEMM_KC, p, 1, 1);

    free(p);
    free(A);
    return EXIT_SUCCESS;
}

Implement and test dgepack_A

After compiling and running the above code we start to implement dgepack_A:

#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>

/*
 * Fill the m x n matrix A with the values 1, 2, ..., m*n numbered row by
 * row, i.e. element (i,j) is set to i*n + j + 1.
 *
 * Element (i,j) is stored at A[i*incRowA + j*incColA].  The offset is
 * computed in ptrdiff_t (not size_t), so negative strides address the
 * intended element without relying on unsigned wrap-around.
 */
void
initGeMatrix(size_t m, size_t n,
             double *A,
             ptrdiff_t incRowA, ptrdiff_t incColA)
{
    for (size_t i=0; i<m; ++i) {
        for (size_t j=0; j<n; ++j) {
            ptrdiff_t offset = (ptrdiff_t)i*incRowA + (ptrdiff_t)j*incColA;

            A[offset] = (double)(i*n + j + 1);
        }
    }
}

/*
 * Print the m x n matrix A to stdout: one matrix row per output line,
 * each element formatted with "%10.4lf ", followed by one empty line.
 *
 * Element (i,j) is read from A[i*incRowA + j*incColA].  The offset is
 * computed in ptrdiff_t (not size_t), so negative strides address the
 * intended element without relying on unsigned wrap-around.
 */
void
printGeMatrix(size_t m, size_t n,
              const double *A,
              ptrdiff_t incRowA, ptrdiff_t incColA)
{
    for (size_t i=0; i<m; ++i) {
        for (size_t j=0; j<n; ++j) {
            ptrdiff_t offset = (ptrdiff_t)i*incRowA + (ptrdiff_t)j*incColA;

            printf("%10.4lf ", A[offset]);
        }
        printf("\n");
    }
    printf("\n");
}

/*
 * Buffer dimensions.  Each macro is only defined here if it was not
 * already defined (e.g. on the compiler command line with -DDGEMM_MC=...),
 * so the values below are defaults.  main allocates the packing buffer p
 * with DGEMM_MC*DGEMM_KC doubles.
 */
#ifndef DGEMM_MC
#define DGEMM_MC    8
#endif

#ifndef DGEMM_KC
#define DGEMM_KC    9
#endif

/* Height of the row panels produced by dgepack_A (default 4, can be
   overridden at compile time, e.g. -DDGEMM_MR=...). */
#ifndef DGEMM_MR
#define DGEMM_MR    4
#endif

/*
 * Pack the m x k matrix A into the buffer p.
 *
 * A is split into mb = ceil(m/DGEMM_MR) row panels of height DGEMM_MR.
 * The panels are stored one after another; inside a panel the elements
 * are stored column by column.  Element (i,l) with i = i1*DGEMM_MR + i0
 * therefore ends up at
 *
 *      p[i1*DGEMM_MR*k + l*DGEMM_MR + i0]
 *
 * Rows i >= m of the last panel are filled with zeros.
 *
 * Parameters:
 *   m, k              dimensions of A
 *   A                 element (i,l) is read from A[i*incRowA + l*incColA]
 *   incRowA, incColA  strides of A (offsets are computed in ptrdiff_t,
 *                     so negative strides are handled correctly)
 *   p                 destination; must have room for mb*DGEMM_MR*k doubles
 */
void
dgepack_A(size_t m, size_t k,
          const double *A, ptrdiff_t incRowA, ptrdiff_t incColA,
          double *p)
{
    size_t mb = (m+DGEMM_MR-1)/DGEMM_MR;
    size_t nu = 0;

    /* Loop order panel -> column -> row within panel visits the
       destination sequentially: nu == i1*DGEMM_MR*k + l*DGEMM_MR + i0. */
    for (size_t i1=0; i1<mb; ++i1) {
        for (size_t l=0; l<k; ++l) {
            for (size_t i0=0; i0<DGEMM_MR; ++i0) {
                size_t    i      = i1*DGEMM_MR + i0;
                ptrdiff_t offset = (ptrdiff_t)i*incRowA
                                 + (ptrdiff_t)l*incColA;

                /* A is only dereferenced for rows that exist */
                p[nu++] = (i<m) ? A[offset]
                                : 0.0;
            }
        }
    }
}

int
main()
{
    size_t      m       = 6;
    size_t      k       = 7;
    double      *A      = malloc(m*k*sizeof(*A));
    ptrdiff_t   incRowA = 1;
    ptrdiff_t   incColA = m;

    double      *p      = malloc(DGEMM_MC*DGEMM_KC*sizeof(*p));

    initGeMatrix(m, k, A, incRowA, incColA);
    printGeMatrix(m, k, A, incRowA, incColA);
    dgepack_A(m, k, A, incRowA, incColA, p);
    printGeMatrix(1, DGEMM_MC*DGEMM_KC, p, 1, 1);

    free(p);
    free(A);
}