Pointer und Optimierungen

C-Beispiel mit Pointer

// Two global doubles; blub() reads a and writes both a and b.
double a, b;

// Global pointer; blub() makes it point at a.
double *p;

// Points p at a, copies a into b through p, then stores a+b back through p.
void

blub()

{

    p = &a;         // p now refers to a

    b = *p;         // read a through the pointer

    *p = a+b;       // write a+b into a through the pointer

}

Assembler Code

Wir übersetzen den C-Code zunächst wie üblich in Assembler:

$shell> gcc-4.8 -S -fno-asynchronous-unwind-tables blub.c

Damit erhalten wir

day04/blub.s

                # Unoptimized compiler output for blub(): each C statement
                # reloads the addresses of p, a and b and goes through memory.
                .comm           _a,8,3  # 8-byte common symbol for the double a

                .comm           _b,8,3  # 8-byte common symbol for the double b

                .comm           _p,8,3  # 8-byte common symbol for the pointer p

                .text

                .globl _blub

_blub:

                pushq           %rbp  # save the caller's frame pointer

                movq            %rsp, %rbp  # set up this function's frame pointer

                movq            _p@GOTPCREL(%rip), %rax  # rax = &p

                movq            _a@GOTPCREL(%rip), %rdx  # rdx = &a

                movq            %rdx, (%rax)  # p = &a

                movq            _p@GOTPCREL(%rip), %rax  # rax = &p (reloaded)

                movq            (%rax), %rax  # rax = p

                movq            (%rax), %rax  # rax = 64 bits read through p (*p)

                movq            _b@GOTPCREL(%rip), %rdx  # rdx = &b

                movq            %rax, (%rdx)  # b = *p

                movq            _p@GOTPCREL(%rip), %rax  # rax = &p (reloaded)

                movq            (%rax), %rcx  # rcx = p, the destination of the final store

                movq            _a@GOTPCREL(%rip), %rax  # rax = &a

                movq            (%rax), %rdx  # rdx = bits of a

                movq            _b@GOTPCREL(%rip), %rax  # rax = &b

                movq            (%rax), %rax  # rax = bits of b

                movd            %rdx, %xmm0  # xmm0 = a

                movd            %rax, %xmm1  # xmm1 = b

                addsd           %xmm1, %xmm0  # xmm0 = a + b

                movd            %xmm0, %rax  # rax = bits of the sum

                movq            %rax, (%rcx)  # *p = a + b

                popq            %rbp  # restore the caller's frame pointer

                ret

                .subsections_via_symbols

Optimierter Assembler Code

Jetzt übersetzen wir mit voller Optimierung (-O3)

$shell> gcc-4.8 -S -O3 -fno-asynchronous-unwind-tables blub.c

Damit ist der Assembler Code plötzlich sehr übersichtlich.

day04/blub.s

                # -O3 compiler output for blub(): a is read once into xmm0 and
                # all stores use the address of a directly instead of going
                # through p.
                .text

                .align 4,0x90  # alignment padding is filled with 0x90 (x86 NOP)

                .globl _blub

_blub:

                movq            _a@GOTPCREL(%rip), %rax  # rax = &a

                movq            _p@GOTPCREL(%rip), %rdx  # rdx = &p

                movsd           (%rax), %xmm0  # xmm0 = a, read directly

                movq            %rax, (%rdx)  # p = &a

                movq            _b@GOTPCREL(%rip), %rdx  # rdx = &b

                movsd           %xmm0, (%rdx)  # b = a

                addsd           %xmm0, %xmm0  # xmm0 = a + a, which equals a + b since b == a

                movsd           %xmm0, (%rax)  # store the sum into a directly

                ret

                .comm           _p,8,3

                .comm           _b,8,3

                .comm           _a,8,3

                .subsections_via_symbols

Benchmark Trouble

Ein Compiler darf eine Menge tricksen, solange er garantieren kann, dass stets das gleiche Endergebnis herauskommt. Bei Benchmarks ist das aber problematisch, denn das Rechenergebnis interessiert uns dort nicht, sondern nur, wie lange eine Berechnung dauert. Weil eine einzelne Rechnung unterhalb der Messgenauigkeit liegt und weil Ausreißer das Ergebnis verfälschen können, wiederholt man eine Rechnung \(x\)-mal und mittelt die Zeiten.

Benchmark für Speicherzugriff

Wir wollen testen, wie teuer ein Speicherzugriff ist. Dazu schreiben wir das folgende Programm:

day04/bench_assign.c

// Global loop counter used by bench().
unsigned j;

// Global variable that bench() assigns to in every iteration.
unsigned x;

// Benchmark for assignment: stores 10 into x once per loop iteration.
void

bench(void)

{

    // j runs from 1 up to 999999999.
    for (j = 1 ; j < 1000000000 ; j++) {

        x = 10;

    }

}

Zunächst übersetzen wir ohne Optimierung (wir schalten diese sogar explizit aus mit -O0)

$shell> gcc-4.8 -S -O0 -fno-asynchronous-unwind-tables bench_assign.c

Und erhalten

day04/bench_assign.s

                # -O0 compiler output for the assignment benchmark: j and x are
                # read from and written to memory in every iteration.
                .comm           _j,4,2  # 4-byte common symbol for the counter j

                .comm           _x,4,2  # 4-byte common symbol for x

                .text

                .globl _bench

_bench:

                pushq           %rbp  # save the caller's frame pointer

                movq            %rsp, %rbp  # set up this function's frame pointer

                movq            _j@GOTPCREL(%rip), %rax  # rax = &j

                movl            $1, (%rax)  # j = 1

                jmp             L2  # go to the loop condition first

L3:

                movq            _x@GOTPCREL(%rip), %rax  # rax = &x

                movl            $10, (%rax)  # x = 10 (the loop body)

                movq            _j@GOTPCREL(%rip), %rax  # rax = &j

                movl            (%rax), %eax  # eax = j

                leal            1(%rax), %edx  # edx = j + 1

                movq            _j@GOTPCREL(%rip), %rax  # rax = &j (reloaded)

                movl            %edx, (%rax)  # j = j + 1

L2:

                movq            _j@GOTPCREL(%rip), %rax  # rax = &j

                movl            (%rax), %eax  # eax = j

                cmpl            $999999999, %eax  # compare j with 999999999

                jbe             L3  # unsigned: keep looping while j <= 999999999

                popq            %rbp  # restore the caller's frame pointer

                ret

                .subsections_via_symbols

Dann schalten wir die Optimierung an:

$shell> gcc-4.8 -S -O3 -fno-asynchronous-unwind-tables bench_assign.c

Jetzt erhalten wir

:import: day04/bench_assign.s

Benchmark für Pointerzugriff

Wir wollen testen, wie teuer ein indirekter Variablenzugriff über einen Pointer ist:

day04/bench_pointer.c

// Global value that is read and written through p.
unsigned x;

// Global pointer; bench() points it at x.
unsigned *p;

// Global temporary that receives the value read through p.
unsigned t;

// Global loop counter used by bench().
unsigned j;

// Benchmark for pointer access: in every iteration x is read through p
// and t+x is written back through p.
void

bench(void)

{

    p = &x;         // p refers to x for the whole loop

    // j runs from 1 up to 999999999.
    for (j = 1 ; j < 1000000000 ; j++) {

        t = *p;     // read x through the pointer

        *p = t+x;   // write t+x into x through the pointer

    }

}

Zunächst übersetzen wir ohne Optimierung (wir schalten diese sogar explizit aus mit -O0)

$shell> gcc-4.8 -S -O0 -fno-asynchronous-unwind-tables bench_pointer.c

Und erhalten

day04/bench_pointer.s

                # -O0 compiler output for the pointer benchmark: p, t, x and j
                # are reloaded from memory for every access inside the loop.
                .comm           _x,4,2  # 4-byte common symbol for x

                .comm           _p,8,3  # 8-byte common symbol for the pointer p

                .comm           _t,4,2  # 4-byte common symbol for t

                .comm           _j,4,2  # 4-byte common symbol for the counter j

                .text

                .globl _bench

_bench:

                pushq           %rbp  # save the caller's frame pointer

                movq            %rsp, %rbp  # set up this function's frame pointer

                movq            _p@GOTPCREL(%rip), %rax  # rax = &p

                movq            _x@GOTPCREL(%rip), %rdx  # rdx = &x

                movq            %rdx, (%rax)  # p = &x

                movq            _j@GOTPCREL(%rip), %rax  # rax = &j

                movl            $1, (%rax)  # j = 1

                jmp             L2  # go to the loop condition first

L3:

                movq            _p@GOTPCREL(%rip), %rax  # rax = &p

                movq            (%rax), %rax  # rax = p

                movl            (%rax), %edx  # edx = *p

                movq            _t@GOTPCREL(%rip), %rax  # rax = &t

                movl            %edx, (%rax)  # t = *p

                movq            _p@GOTPCREL(%rip), %rax  # rax = &p (reloaded)

                movq            (%rax), %rax  # rax = p, the destination of the store below

                movq            _t@GOTPCREL(%rip), %rdx  # rdx = &t

                movl            (%rdx), %ecx  # ecx = t

                movq            _x@GOTPCREL(%rip), %rdx  # rdx = &x

                movl            (%rdx), %edx  # edx = x

                addl            %ecx, %edx  # edx = t + x

                movl            %edx, (%rax)  # *p = t + x

                movq            _j@GOTPCREL(%rip), %rax  # rax = &j

                movl            (%rax), %eax  # eax = j

                leal            1(%rax), %edx  # edx = j + 1

                movq            _j@GOTPCREL(%rip), %rax  # rax = &j (reloaded)

                movl            %edx, (%rax)  # j = j + 1

L2:

                movq            _j@GOTPCREL(%rip), %rax  # rax = &j

                movl            (%rax), %eax  # eax = j

                cmpl            $999999999, %eax  # compare j with 999999999

                jbe             L3  # unsigned: keep looping while j <= 999999999

                popq            %rbp  # restore the caller's frame pointer

                ret

                .subsections_via_symbols

Dann schalten wir die Optimierung an:

$shell> gcc-4.8 -S -O3 -fno-asynchronous-unwind-tables bench_pointer.c

Jetzt erhalten wir

day04/bench_pointer.s

                # -O3 compiler output for the pointer benchmark: the loop works
                # only on registers (eax = t, ecx = x, edx = remaining count);
                # x, t and j are written to memory once, after the loop.
                .text

                .align 4,0x90  # alignment padding is filled with 0x90 (x86 NOP)

                .globl _bench

_bench:

                movq            _x@GOTPCREL(%rip), %rsi  # rsi = &x

                movl            $999999999, %edx  # edx = number of iterations left

                movq            _p@GOTPCREL(%rip), %rax  # rax = &p

                movq            %rsi, (%rax)  # p = &x

                movl            (%rsi), %eax  # eax = x, read directly (not through p)

                movl            %eax, %ecx  # ecx = running value of x, kept in a register

                jmp             L3  # first iteration: eax already holds t = x

                .align 4,0x90

L5:

                movl            %ecx, %eax  # t = current x (register copy, no memory access)

L3:

                addl            %eax, %ecx  # x = t + x, in registers only

                subl            $1, %edx  # one iteration fewer to go

                jne             L5  # loop until the count reaches zero

                movq            _t@GOTPCREL(%rip), %rdx  # rdx = &t

                movl            %ecx, (%rsi)  # write the final x to memory

                movl            %eax, (%rdx)  # write the final t to memory

                movq            _j@GOTPCREL(%rip), %rax  # rax = &j

                movl            $1000000000, (%rax)  # j is set to 1000000000 with a single store

                ret

                .comm           _j,4,2

                .comm           _t,4,2

                .comm           _p,8,3

                .comm           _x,4,2

                .subsections_via_symbols

Der Compiler hat erkannt, wie man direkt zugreifen kann, und hat die Indirektion komplett wegoptimiert. Das ist toll in der Praxis! Aber es ist blöd, wenn man genau dafür einen Benchmark machen möchte.