The document discusses Automatic Feedback-Directed Optimization (AutoFDO) which uses profile feedback to optimize programs at compile time. It provides an example of using the perf tool to profile a bubble sort program written in C, and then using the Autofdo tools to process the profile data and compile the program with profile-guided optimization flags to improve performance. AutoFDO has been deployed in various projects like CPython, Firefox, Google datacenters, Chrome, and Clearlinux to improve performance by 5-10+% on average. Additional resources on AutoFDO are provided.
2. Only C, No C++ in this presentation:
Rationale
●
Linux inventor Linus Torvalds’ comments on C++ from last week at the Embedded Linux Conference 2017 keynote
●
https://youtu.be/NLQZzEvavGs?list=PLbzoR-pLrL6pISWAq-1cXP4_UZAyRtesk&t=1343
3. Bubble sort
“In computer graphics bubble sort is popular ... in almost-sorted arrays … with just linear complexity (2n)” – wikipedia
void bubble_sort(u64 *a, int n) {
    u64 i, temp, swap_flag = 1;
    while (swap_flag) {
        swap_flag = 0;
        for (i = 1; i < n; i++) {
            if (a[i] < a[i - 1]) {
                /* swap */
                temp = a[i];
                a[i] = a[i - 1];
                a[i - 1] = temp;
                swap_flag = 1;
            }
        }
    }
}
Condition predictable?
Function inline?
Loop unroll?
Minimize branches
4. Common practice
●
gcc -g -O3 -o sort sort.c
●
./sort 30000
What did the compiler do?:
– gcc -S -fverbose-asm -o sort.S sort.c
– objdump -d -S sort
– perf record ./sort 3000; perf annotate
8. Less common practice:
software-based instrumentation
●
gcc -g -O3 -fprofile-generate -o sort sort.c
●
./sort 3000
●
gcc -g -O3 -fprofile-use -o sort sort.c
●
./sort 30000
9. What did the compiler do differently?
│ b6: lea 0x8(%rcx),%rdx
│ for (i = 1; i < n; i++) { // 6
│ cmp %rdx,%r8
│ ↓ je 2e0
│ test %rax,%rax
│ ↓ je 24b
│ cmp $0x1,%rax
│ ↓ je 1a6
│ cmp $0x2,%rax
│ ↓ je 18a
│ cmp $0x3,%rax
│ ↓ je 16e
│ cmp $0x4,%rax
│ ↓ je 152
│ cmp $0x5,%rax
│ ↓ je 136
│ cmp $0x6,%rax
│ ↓ je 11a
│ if (a[i] < a[i - 1]) { // 7
│ mov 0x8(%rcx),%r9
│ mov -0x8(%rdx),%r10
│ cmp %r10,%r9
│ ↓ jae 116
│ a[i] = a[i - 1]; // 9
│ mov %r10,0x8(%rcx)
│ swap_flag = 1; // 11
│ mov $0x1,%edi
│ a[i - 1] = temp; // 10
│ mov %r9,-0x8(%rdx)
│116: add $0x8,%rdx
│ if (a[i] < a[i - 1]) { // 7
│11a: mov (%rdx),%r11
│ mov -0x8(%rdx),%r12
│ cmp %r12,%r11
│ ↓ jae 132
│ a[i] = a[i - 1]; // 9
│ mov %r12,(%rdx)
│ a[i - 1] = temp; // 10
│ mov %r11,-0x8(%rdx)
– It unrolled the inner loop
10. Loop unwinding
●
The goal of loop unwinding is to increase a program's speed by:
– reducing or eliminating instructions that control the loop, such as pointer arithmetic and "end of loop" tests on each iteration;
– reducing branch penalties; as well as
– hiding latencies, including the delay in reading data from memory.
●
To eliminate this computational overhead, loops can be rewritten as a repeated sequence of similar independent statements.
– wikipedia
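The technique can be sketched by hand-unrolling the sort's inner loop in C. This is an illustration of what GCC's unroller generates automatically, not the compiler's actual output; the `swap_if_less` helper and the unroll factor of 4 are assumptions for brevity (the annotated assembly earlier suggests GCC chose a larger factor):

```c
/* Hand-unrolled sketch of one bubble-sort pass (factor 4).
   `u64` is assumed to be a typedef for uint64_t. */
#include <stdint.h>

typedef uint64_t u64;

/* Hypothetical helper: compare-and-swap one adjacent pair. */
static void swap_if_less(u64 *a, int i, u64 *swap_flag) {
    if (a[i] < a[i - 1]) {
        u64 temp = a[i];
        a[i] = a[i - 1];
        a[i - 1] = temp;
        *swap_flag = 1;
    }
}

void bubble_pass_unrolled(u64 *a, int n, u64 *swap_flag) {
    int i = 1;
    /* Main unrolled body: 4 compare/swap steps per iteration,
       so the `i < n` test and increment run 4x less often. */
    for (; i + 3 < n; i += 4) {
        swap_if_less(a, i,     swap_flag);
        swap_if_less(a, i + 1, swap_flag);
        swap_if_less(a, i + 2, swap_flag);
        swap_if_less(a, i + 3, swap_flag);
    }
    /* Epilogue: handle the 0-3 leftover elements. */
    for (; i < n; i++)
        swap_if_less(a, i, swap_flag);
}
```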
11. Software-based instrumentation:
deployments
●
Ahem, I don’t really know (need a survey?)
●
Do know git supports building itself like this
– Full profile
– Fast profile
●
Hands up if any project you have worked on does this!
16. least^WGoogle common practice 2:
AutoFDO via runtime process attachment
gcc -g -O3 -o sort sort.c
./sort 300000 &
~/git/pmu-tools/ocperf.py record -b -e br_inst_retired.near_taken:pp -p <PID>
kill %1
~/git/autofdo-andikleen/create_gcov -debug_dump -logtostderr --binary=./sort --profile=perf.data --gcov=./sort.gcov -gcov_version=1
~/git/autofdo-andikleen/dump_gcov -gcov_version=1 ./sort.gcov
gcc -g -O3 -fauto-profile=sort.gcov -o sort sort.c
17. Deployed
●
CPython (rumour: 5% off the interpreter loop)
●
Firefox
●
Google datacenters (“over 50% of cycles spent are
optimized with FDO”)
●
Chrome, ChromeOS
●
Clearlinux
●
Github: kevinquinnyo/php7-wp-build-docker: Builds latest
stable php releases in docker container, optimizes the
build for wordpress with GCC AutoFDO and builds …
18. Extra tidbits
●
Coverage files (.gcov, etc.) are CPU arch-independent: generate once, use on x86, Arm, Power
●
AutoFDO supports LLVM (different coverage files)
●
5-10+% improvement consistently observed at Google; most of the gain arrives within 3-5-7 iterations, even with little sample data
●
6-month old (“stale”) coverage files still good for at
least ½ of the original performance benefit
19. Additional resources
●
Tutorial:
– https://gcc.gnu.org/wiki/AutoFDO/Tutorial
●
Where to get create_gcov:
– https://github.com/google/autofdo
●
Where to get ocperf.py:
– git://github.com/andikleen/pmu-tools.git
●
Dehao Chen’s presentation at GCC Cauldron conf.:
– https://www.youtube.com/watch?v=26SrOC6MXWg
●
Co-worker’s presentation at Embedded Linux conf.:
– https://www.youtube.com/watch?v=S2Q1OJuZoX4
●
Large CERN project experience (5-13% improvement):
– https://indico.cern.ch/event/587970/contributions/2369824/attachments/1374948/2087355/slides.pdf
●
Me:
– kim.phillips@arm.com
20. Excerpt from git’s INSTALL file:
If you're willing to trade off (much) longer build time for a later faster git you
can also do a profile feedback build...
This will run the complete test suite as training workload and then rebuild
git with the generated profile feedback. This results in a git which is a few
percent faster on CPU intensive workloads. This may be a good tradeoff
for distribution packagers.
Alternatively you can run profile feedback only with the git benchmark suite.
This runs significantly faster than the full test suite, but has less coverage...
As a caveat: a profile-optimized build takes a *lot* longer since the git tree
must be built twice, and in order for the profiling measurements to work
properly, ccache must be disabled and the test suite has to be run using
only a single CPU. In addition, the profile feedback build stage currently
generates a lot of additional compiler warnings.