1
       2
       3
       4
       5
       6
       7
       8
       9
      10
      11
      12
      13
      14
      15
      16
      17
      18
      19
      20
      21
      22
      23
      24
      25
      26
      27
      28
      29
      30
      31
      32
      33
      34
      35
      36
      37
      38
      39
      40
      41
      42
      43
      44
      45
      46
      47
      48
      49
      50
      51
      52
      53
      54
      55
      56
      57
      58
      59
      60
      61
      62
      63
      64
      65
      66
      67
      68
      69
      70
      71
      72
      73
      74
      75
      76
      77
      78
      79
      80
      81
      82
      83
      84
      85
      86
      87
      88
      89
      90
      91
      92
      93
      94
      95
      96
      97
      98
      99
     100
     101
     102
     103
     104
     105
     106
     107
     108
     109
     110
     111
     112
     113
     114
     115
     116
     117
     118
     119
     120
     121
     122
     123
     124
     125
     126
static void
dgemm_micro_kernel(int kc,
                   double alpha, const double *A, const double *B,
                   double beta,
                   double *C, int incRowC, int incColC)
{
    double _AB[MR*NR] __attribute__ ((aligned (16)));
    double *AB = _AB;

    int i, j;

//
//  Compute AB = A*B
//
    __asm__ volatile
    (
        "movq        %1, %%rax           \n\t"
        "movq        %2, %%rbx           \n\t"
        "movq        %3, %%rcx           \n\t"
        "                                \n\t"
        "xorpd   %%xmm3, %%xmm3          \n\t"
        "xorpd   %%xmm4, %%xmm4          \n\t"
        "xorpd   %%xmm5, %%xmm5          \n\t"
        "xorpd   %%xmm6, %%xmm6          \n\t"
        "                                \n\t"
        "xorpd   %%xmm8, %%xmm8          \n\t"
        "xorpd   %%xmm9, %%xmm9          \n\t"
        "xorpd   %%xmm10, %%xmm10        \n\t"
        "xorpd   %%xmm11, %%xmm11        \n\t"
        "xorpd   %%xmm12, %%xmm12        \n\t"
        "xorpd   %%xmm13, %%xmm13        \n\t"
        "xorpd   %%xmm14, %%xmm14        \n\t"
        "xorpd   %%xmm15, %%xmm15        \n\t"
        "                                \n\t"
        "movaps    (%%rax), %%xmm0       \n\t"
        "movaps  16(%%rax), %%xmm1       \n\t"
        "movaps    (%%rbx), %%xmm2       \n\t"
        "                                \n\t"
        "movl        %0, %%esi           \n\t"
        "testl    %%esi, %%esi           \n\t"
        "je      .DWRITEBACK%=           \n\t"
        "                                \n\t"
        ".DLOOP%=:                       \n\t"
        "                                \n\t"
        "addpd   %%xmm3, %%xmm11         \n\t"
        "movaps  16(%%rbx), %%xmm3       \n\t"
        "addpd   %%xmm4, %%xmm15         \n\t"
        "movaps  %%xmm2, %%xmm4          \n\t"
        "pshufd   $0x4e, %%xmm2, %%xmm7  \n\t"
        "mulpd   %%xmm0, %%xmm2          \n\t"
        "mulpd   %%xmm1, %%xmm4          \n\t"
        "                                \n\t"
        "addpd   %%xmm5, %%xmm10         \n\t"
        "addpd   %%xmm6, %%xmm14         \n\t"
        "movaps  %%xmm7, %%xmm6          \n\t"
        "mulpd   %%xmm0, %%xmm7          \n\t"
        "mulpd   %%xmm1, %%xmm6          \n\t"
        "                                \n\t"
        "addpd   %%xmm2, %%xmm9          \n\t"
        "movaps  32(%%rbx), %%xmm2       \n\t"
        "addpd   %%xmm4, %%xmm13         \n\t"
        "movaps  %%xmm3, %%xmm4          \n\t"
        "pshufd   $0x4e, %%xmm3, %%xmm5  \n\t"
        "mulpd   %%xmm0, %%xmm3          \n\t"
        "mulpd   %%xmm1, %%xmm4          \n\t"
        "                                \n\t"
        "addpd   %%xmm7, %%xmm8          \n\t"
        "addpd   %%xmm6, %%xmm12         \n\t"
        "movaps  %%xmm5, %%xmm6          \n\t"
        "mulpd   %%xmm0, %%xmm5          \n\t"
        "movaps  32(%%rax), %%xmm0       \n\t"
        "mulpd   %%xmm1, %%xmm6          \n\t"
        "movaps  48(%%rax), %%xmm1       \n\t"
        "                                \n\t"
        "                                \n\t"
        "addq    $32, %%rax              \n\t"
        "addq    $32, %%rbx              \n\t"
        "                                \n\t"
        "                                \n\t"
        "decl   %%esi                    \n\t"
        "jne    .DLOOP%=                 \n\t"
        "                                \n\t"
        "                                \n\t"
        "addpd   %%xmm3, %%xmm11         \n\t"
        "addpd   %%xmm4, %%xmm15         \n\t"
        "addpd   %%xmm5, %%xmm10         \n\t"
        "addpd   %%xmm6, %%xmm14         \n\t"
        "                                \n\t"
        ".DWRITEBACK%=:                  \n\t"
        "                                \n\t"
        "movlpd  %%xmm9,    (%%rcx)      \n\t"
        "movhpd  %%xmm8,   8(%%rcx)      \n\t"
        "movlpd  %%xmm13, 16(%%rcx)      \n\t"
        "movhpd  %%xmm12, 24(%%rcx)      \n\t"
        "                                \n\t"
        "addq  $32, %%rcx                \n\t"
        "movlpd  %%xmm8,    (%%rcx)      \n\t"
        "movhpd  %%xmm9,   8(%%rcx)      \n\t"
        "movlpd  %%xmm12, 16(%%rcx)      \n\t"
        "movhpd  %%xmm13, 24(%%rcx)      \n\t"
        "                                \n\t"
        "addq  $32, %%rcx                \n\t"
        "movlpd  %%xmm11,   (%%rcx)      \n\t"
        "movhpd  %%xmm10,  8(%%rcx)      \n\t"
        "movlpd  %%xmm15, 16(%%rcx)      \n\t"
        "movhpd  %%xmm14, 24(%%rcx)      \n\t"
        "                                \n\t"
        "addq  $32, %%rcx                \n\t"
        "movlpd  %%xmm10,   (%%rcx)      \n\t"
        "movhpd  %%xmm11,  8(%%rcx)      \n\t"
        "movlpd  %%xmm14, 16(%%rcx)      \n\t"
        "movhpd  %%xmm15, 24(%%rcx)      \n\t"
    : // output
    : // input
        "m" (kc),     // 0
        "m" (A),      // 1
        "m" (B),      // 2
        "m" (AB)      // 3
    : // register clobber list
        "rax""rbx""rcx""esi",
        "xmm0""xmm1""xmm2""xmm3",
        "xmm4""xmm5""xmm6""xmm7",
        "xmm8""xmm9""xmm10""xmm11",
        "xmm12""xmm13""xmm14""xmm15"
    );