1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
|
static void
dgemm_micro_kernel(int kc,
double alpha, const double *A, const double *B,
double beta,
double *C, int incRowC, int incColC)
{
double _AB[MR*NR] __attribute__ ((aligned (16)));
double *AB = _AB;
int i, j;
//
// Compute AB = A*B
//
__asm__ volatile
(
"movq %1, %%rax \n\t"
"movq %2, %%rbx \n\t"
"movq %3, %%rcx \n\t"
" \n\t"
"xorpd %%xmm3, %%xmm3 \n\t"
"xorpd %%xmm4, %%xmm4 \n\t"
"xorpd %%xmm5, %%xmm5 \n\t"
"xorpd %%xmm6, %%xmm6 \n\t"
" \n\t"
"xorpd %%xmm8, %%xmm8 \n\t"
"xorpd %%xmm9, %%xmm9 \n\t"
"xorpd %%xmm10, %%xmm10 \n\t"
"xorpd %%xmm11, %%xmm11 \n\t"
"xorpd %%xmm12, %%xmm12 \n\t"
"xorpd %%xmm13, %%xmm13 \n\t"
"xorpd %%xmm14, %%xmm14 \n\t"
"xorpd %%xmm15, %%xmm15 \n\t"
" \n\t"
"movaps (%%rax), %%xmm0 \n\t"
"movaps 16(%%rax), %%xmm1 \n\t"
"movaps (%%rbx), %%xmm2 \n\t"
" \n\t"
"movl %0, %%esi \n\t"
"testl %%esi, %%esi \n\t"
"je .DWRITEBACK%= \n\t"
" \n\t"
".DLOOP%=: \n\t"
" \n\t"
"addpd %%xmm3, %%xmm11 \n\t"
"movaps 16(%%rbx), %%xmm3 \n\t"
"addpd %%xmm4, %%xmm15 \n\t"
"movaps %%xmm2, %%xmm4 \n\t"
"pshufd $0x4e, %%xmm2, %%xmm7 \n\t"
"mulpd %%xmm0, %%xmm2 \n\t"
"mulpd %%xmm1, %%xmm4 \n\t"
" \n\t"
"addpd %%xmm5, %%xmm10 \n\t"
"addpd %%xmm6, %%xmm14 \n\t"
"movaps %%xmm7, %%xmm6 \n\t"
"mulpd %%xmm0, %%xmm7 \n\t"
"mulpd %%xmm1, %%xmm6 \n\t"
" \n\t"
"addpd %%xmm2, %%xmm9 \n\t"
"movaps 32(%%rbx), %%xmm2 \n\t"
"addpd %%xmm4, %%xmm13 \n\t"
"movaps %%xmm3, %%xmm4 \n\t"
"pshufd $0x4e, %%xmm3, %%xmm5 \n\t"
"mulpd %%xmm0, %%xmm3 \n\t"
"mulpd %%xmm1, %%xmm4 \n\t"
" \n\t"
"addpd %%xmm7, %%xmm8 \n\t"
"addpd %%xmm6, %%xmm12 \n\t"
"movaps %%xmm5, %%xmm6 \n\t"
"mulpd %%xmm0, %%xmm5 \n\t"
"movaps 32(%%rax), %%xmm0 \n\t"
"mulpd %%xmm1, %%xmm6 \n\t"
"movaps 48(%%rax), %%xmm1 \n\t"
" \n\t"
" \n\t"
"addq $32, %%rax \n\t"
"addq $32, %%rbx \n\t"
" \n\t"
" \n\t"
"decl %%esi \n\t"
"jne .DLOOP%= \n\t"
" \n\t"
" \n\t"
"addpd %%xmm3, %%xmm11 \n\t"
"addpd %%xmm4, %%xmm15 \n\t"
"addpd %%xmm5, %%xmm10 \n\t"
"addpd %%xmm6, %%xmm14 \n\t"
" \n\t"
".DWRITEBACK%=: \n\t"
" \n\t"
"movlpd %%xmm9, (%%rcx) \n\t"
"movhpd %%xmm8, 8(%%rcx) \n\t"
"movlpd %%xmm13, 16(%%rcx) \n\t"
"movhpd %%xmm12, 24(%%rcx) \n\t"
" \n\t"
"addq $32, %%rcx \n\t"
"movlpd %%xmm8, (%%rcx) \n\t"
"movhpd %%xmm9, 8(%%rcx) \n\t"
"movlpd %%xmm12, 16(%%rcx) \n\t"
"movhpd %%xmm13, 24(%%rcx) \n\t"
" \n\t"
"addq $32, %%rcx \n\t"
"movlpd %%xmm11, (%%rcx) \n\t"
"movhpd %%xmm10, 8(%%rcx) \n\t"
"movlpd %%xmm15, 16(%%rcx) \n\t"
"movhpd %%xmm14, 24(%%rcx) \n\t"
" \n\t"
"addq $32, %%rcx \n\t"
"movlpd %%xmm10, (%%rcx) \n\t"
"movhpd %%xmm11, 8(%%rcx) \n\t"
"movlpd %%xmm14, 16(%%rcx) \n\t"
"movhpd %%xmm15, 24(%%rcx) \n\t"
: // output
: // input
"m" (kc), // 0
"m" (A), // 1
"m" (B), // 2
"m" (AB) // 3
: // register clobber list
"rax", "rbx", "rcx", "esi",
"xmm0", "xmm1", "xmm2", "xmm3",
"xmm4", "xmm5", "xmm6", "xmm7",
"xmm8", "xmm9", "xmm10", "xmm11",
"xmm12", "xmm13", "xmm14", "xmm15"
);
|