SlideShare a Scribd company logo
AutoFDO
●
Automatic Feedback-Directed Optimization
●
Some say PGO: Profile Guided Optimization
Only C, No C++ in this presentation:
Rationale
●
Linux inventor Linus Torvalds’ comments on
C++ from last week at the Embedded Linux
Conference 2017 keynote
●
https://youtu.be/NLQZzEvavGs?list=PLbzoR-pLrL6pISWAq-1cXP4_UZAyRtesk&t=1343
Bubble sort
”In computer graphics bubble sort is popular ... in almost-sorted arrays … with just linear
complexity (2n)” - wikipedia
/*
 * Sort the first n elements of a into ascending order, in place, using
 * bubble sort.
 *
 * a: array to sort; elements are exchanged in place.
 * n: number of elements in a.
 *
 * Makes repeated left-to-right passes, swapping every adjacent pair that is
 * out of order, and stops after the first pass that performs no swap.
 *
 * NOTE(review): i is u64 while n is int. If u64 is an unsigned 64-bit type
 * (its definition is not shown here -- confirm), a negative n is converted
 * to a very large unsigned value in `i < n` and the loop would index past
 * the end of the array.
 */
void bubble_sort(u64 *a, int n) {
u64 i, temp, swap_flag = 1;
/* swap_flag starts at 1 so that at least one pass is made. */
while (swap_flag) {
/* Cleared at the start of each pass; set again only if a swap happens. */
swap_flag = 0;
/* Start at 1 because each step compares a[i] with its predecessor. */
for (i = 1; i < n; i++) {
if (a[i] < a[i - 1]) {
/* swap */
temp = a[i];
a[i] = a[i - 1];
a[i - 1] = temp;
swap_flag = 1;
}
}
}
}
Condition predictable?
Function inline?
Loop unroll?
Minimize branches
Common practice
●
gcc -g -O3 -o sort sort.c
●
./sort 30000
What did the compiler do?:
– gcc -S -fverbose-asm -o sort.S sort.c
– objdump -d -S sort
– perf record ./sort 3000; perf annotate
Bubble sort, -O3
/*
 * Sort the first n elements of a into ascending order, in place, using
 * bubble sort.
 *
 * a: array to sort; elements are exchanged in place.
 * n: number of elements in a.
 *
 * Makes repeated left-to-right passes, swapping every adjacent pair that is
 * out of order, and stops after the first pass that performs no swap.
 *
 * NOTE(review): i is u64 while n is int. If u64 is an unsigned 64-bit type
 * (its definition is not shown here -- confirm), a negative n is converted
 * to a very large unsigned value in `i < n` and the loop would index past
 * the end of the array.
 */
void bubble_sort(u64 *a, int n) {
u64 i, temp, swap_flag = 1;
/* swap_flag starts at 1 so that at least one pass is made. */
while (swap_flag) {
/* Cleared at the start of each pass; set again only if a swap happens. */
swap_flag = 0;
/* Start at 1 because each step compares a[i] with its predecessor. */
for (i = 1; i < n; i++) {
if (a[i] < a[i - 1]) {
/* swap */
temp = a[i];
a[i] = a[i - 1];
a[i - 1] = temp;
swap_flag = 1;
}
}
}
}
│ bubble_sort():
│ for (i = 1; i < n; i++) { // 6
0.01 │ 90: cmp $0x1,%rbp
│ ↓ jbe c6
│ lea 0x8(%rbx),%rax
│ xor %edi,%edi
│ nop
│ if (a[i] < a[i - 1]) { // 7
7.78 │ a0: mov (%rax),%rdx
5.62 │ mov -0x8(%rax),%rcx
3.52 │ cmp %rcx,%rdx
12.50 │ ↓ jae b8
│ a[i] = a[i - 1]; // 9
27.75 │ mov %rcx,(%rax)
│ a[i - 1] = temp; // 10
5.42 │ mov %rdx,-0x8(%rax)
│ swap_flag = 1; // 11
│ mov $0x1,%edi
20.62 │ b8: add $0x8,%rax
│ for (i = 1; i < n; i++) { // 6
│ cmp %rsi,%rax
16.76 │ ↑ jne a0
│ while (swap_flag) {
│ test %rdi,%rdi
│ ↑ jne 90
less common practice:
software based instrumentation
●
gcc -g -O3 -fprofile-generate -o sort sort.c
●
./sort 3000
(arm64) gcc 4.8 -O3 -fprofile-generate
stp x29, x30, [sp,#-32]!
adrp x2, __gcov_i_c_c
mov x29, sp
str x19, [sp,#16]
mrs x19, tpidr_el0
add x19, x19, #0x0, lsl #12
add x19, x19, #0x10
mov x1, #0x0
add x2, x2, #0xd60
ldr x0, [x19]
ldr x3, [x19,#8]
bl __gcov_i_c_p
adrp x11, a+0x1cf00
add x0, x11, #0x670
mov w7, #29998
str xzr, [x19,#8]
ldr x6, [x0,#8]
ldr x10, [x0,#24]
mov w0, #0x0
cmp w0, w7
adrp x2, _G_O_T_+0x48
add w1, w0, #0x1
ldr x8, [x11,#1648]
mov w9, #0x1
add x2, x2, #0x100
sbfiz x4, x0, #2, #32
sbfiz x3, x1, #2, #32
b.hi main+0xac
ldr w0, [x2,x4]
ldr w5, [x2,x3]
add x6, x6, #0x1
cmp w0, w5
b.le main+0x94
str w5, [x2,x4]
str w0, [x2,x3]
add x8, x8, #0x1
mov w9, #0x0
mov w0, w1
cmp w0, w7
add w1, w0, #0x1
sbfiz x4, x0, #2, #32
sbfiz x3, x1, #2, #32
b.ls 400dd0
cbnz w9, 400e24
mov w1, w9
add x10, x10, #0x1
mov w9, #0x1
mov w0, w1
b 400df8 <main+0x98>
add x1, x11, #0x670
mov w0, #0x0
ldr x19, [sp,#16]
ldr x2, [x1,#16]
str x6, [x1,#8]
add x2, x2, #0x1
str x10, [x1,#24]
str x2, [x1,#16]
str x8, [x11,#1648]
ldp x29, x30, [sp],#32
ret
less common practice:
software based instrumentation
●
gcc -g -O3 -fprofile-generate -o sort sort.c
●
./sort 3000
●
gcc -g -O3 -fprofile-use -o sort sort.c
●
./sort 30000
What did the compiler do differently?
│ b6: lea 0x8(%rcx),%rdx
│ for (i = 1; i < n; i++) { // 6
│ cmp %rdx,%r8
│ ↓ je 2e0
│ test %rax,%rax
│ ↓ je 24b
│ cmp $0x1,%rax
│ ↓ je 1a6
│ cmp $0x2,%rax
│ ↓ je 18a
│ cmp $0x3,%rax
│ ↓ je 16e
│ cmp $0x4,%rax
│ ↓ je 152
│ cmp $0x5,%rax
│ ↓ je 136
│ cmp $0x6,%rax
│ ↓ je 11a
│ if (a[i] < a[i - 1]) { // 7
│ mov 0x8(%rcx),%r9
│ mov -0x8(%rdx),%r10
│ cmp %r10,%r9
│ ↓ jae 116
│ a[i] = a[i - 1]; // 9
│ mov %r10,0x8(%rcx)
│ swap_flag = 1; // 11
│ mov $0x1,%edi
│ a[i - 1] = temp; // 10
│ mov %r9,-0x8(%rdx)
│116: add $0x8,%rdx
│ if (a[i] < a[i - 1]) { // 7
│11a: mov (%rdx),%r11
│ mov -0x8(%rdx),%r12
│ cmp %r12,%r11
│ ↓ jae 132
│ a[i] = a[i - 1]; // 9
│ mov %r12,(%rdx)
│ a[i - 1] = temp; // 10
│ mov %r11,-0x8(%rdx)
– It unrolled the inner loop
Loop unwinding
●
The goal of loop unwinding is to increase a program's
speed by:
– reducing or eliminating instructions that control the loop, such as
pointer arithmetic and "end of loop" tests on each iteration;
– reducing branch penalties; as well as
– hiding latencies including the delay in reading data from memory.
●
To eliminate this computational overhead, loops can be re-
written as a repeated sequence of similar independent
statements.
– wikipedia
software based instrumentation:
deployments
●
Ahem, I don’t really know (need a survey?)
●
Do know git supports building itself like this
– Full profile
– Fast profile
●
Hands up if any of the projects you have worked on use it!
AutoFDO
How to get a good profile
●
perf record [-e <event>] <workload>
perf record [-e cycles] ./sort [-O3]
/*
 * Sort the first n elements of a into ascending order, in place, using
 * bubble sort.
 *
 * a: array to sort; elements are exchanged in place.
 * n: number of elements in a.
 *
 * Makes repeated left-to-right passes, swapping every adjacent pair that is
 * out of order, and stops after the first pass that performs no swap.
 *
 * NOTE(review): i is u64 while n is int. If u64 is an unsigned 64-bit type
 * (its definition is not shown here -- confirm), a negative n is converted
 * to a very large unsigned value in `i < n` and the loop would index past
 * the end of the array.
 */
void bubble_sort(u64 *a, int n) {
u64 i, temp, swap_flag = 1;
/* swap_flag starts at 1 so that at least one pass is made. */
while (swap_flag) {
/* Cleared at the start of each pass; set again only if a swap happens. */
swap_flag = 0;
/* Start at 1 because each step compares a[i] with its predecessor. */
for (i = 1; i < n; i++) {
if (a[i] < a[i - 1]) {
/* swap */
temp = a[i];
a[i] = a[i - 1];
a[i - 1] = temp;
swap_flag = 1;
}
}
}
}
│ bubble_sort():
│ for (i = 1; i < n; i++) { // 6
0.01 │ 90: cmp $0x1,%rbp
│ ↓ jbe c6
│ lea 0x8(%rbx),%rax
│ xor %edi,%edi
│ nop
│ if (a[i] < a[i - 1]) { // 7
7.78 │ a0: mov (%rax),%rdx
5.62 │ mov -0x8(%rax),%rcx
3.52 │ cmp %rcx,%rdx
12.50 │ ↓ jae b8
│ a[i] = a[i - 1]; // 9
27.75 │ mov %rcx,(%rax)
│ a[i - 1] = temp; // 10
5.42 │ mov %rdx,-0x8(%rax)
│ swap_flag = 1; // 11
│ mov $0x1,%edi
20.62 │ b8: add $0x8,%rax
│ for (i = 1; i < n; i++) { // 6
│ cmp %rsi,%rax
16.76 │ ↑ jne a0
│ while (swap_flag) {
│ test %rdi,%rdi
│ ↑ jne 90
least^WGoogle common practice:
AutoFDO
gcc -g -O3 -o sort sort.c
~/git/pmu-tools/ocperf.py record -b -e
br_inst_retired.near_taken:pp -- ./sort 30000
~/git/autofdo-andikleen/create_gcov -debug_dump -logtostderr
--binary=./sort --profile=perf.data --gcov=./sort.gcov
-gcov_version=1
~/git/autofdo-andikleen/dump_gcov -gcov_version=1 ./sort.gcov
gcc -g -O3 -fauto-profile=sort.gcov -o sort sort.c
least^WGoogle common practice 2:
AutoFDO via runtime process attachment
gcc -g -O3 -o sort sort.c
./sort 300000 &
~/git/pmu-tools/ocperf.py record -b -e
br_inst_retired.near_taken:pp -p <PID>
kill %1
~/git/autofdo-andikleen/create_gcov -debug_dump -logtostderr
--binary=./sort --profile=perf.data --gcov=./sort.gcov
-gcov_version=1
~/git/autofdo-andikleen/dump_gcov -gcov_version=1 ./sort.gcov
gcc -g -O3 -fauto-profile=sort.gcov -o sort sort.c
Deployed
●
CPython (rumour: 5% off the interpreter loop)
●
Firefox
●
Google datacenters (“over 50% of cycles spent are
optimized with FDO”)
●
Chrome, ChromeOS
●
Clearlinux
●
Github: kevinquinnyo/php7-wp-build-docker: Builds latest
stable php releases in docker container, optimizes the
build for wordpress with GCC AutoFDO and builds …
Extra tidbits
●
Coverage files (.gcov, etc.) are CPU arch-
independent: generate once, use x86, Arm, Power
●
AutoFDO supports LLVM (different coverage files)
●
5-10+% improvement consistently observed at
Google, most gain within 3-5-7 iterations with little
sample data
●
6-month old (“stale”) coverage files still good for at
least ½ of the original performance benefit
Additional resources
●
Tutorial:
– https://gcc.gnu.org/wiki/AutoFDO/Tutorial
●
Where to get create_gcov:
– https://github.com/google/autofdo
●
Where to get ocperf.py:
– git://github.com/andikleen/pmu-tools.git
●
Dehao Chen’s presentation at GCC Cauldron conf.:
– https://www.youtube.com/watch?v=26SrOC6MXWg
●
Co-worker’s presentation at Embedded Linux conf.:
– https://www.youtube.com/watch?v=S2Q1OJuZoX4
●
Large CERN project experience (5-13% improvement):
– https://indico.cern.ch/event/587970/contributions/2369824/attachments/1374948/2087355/slides.pdf
●
Me:
– kim.phillips@arm.com
Excerpt from git’s INSTALL file:
If you're willing to trade off (much) longer build time for a later faster git you
can also do a profile feedback build...
This will run the complete test suite as training workload and then rebuild
git with the generated profile feedback. This results in a git which is a few
percent faster on CPU intensive workloads. This may be a good tradeoff
for distribution packagers.
Alternatively you can run profile feedback only with the git benchmark suite.
This runs significantly faster than the full test suite, but has less coverage...
As a caveat: a profile-optimized build takes a *lot* longer since the git tree
must be built twice, and in order for the profiling measurements to work
properly, ccache must be disabled and the test suite has to be run using
only a single CPU. In addition, the profile feedback build stage currently
generates a lot of additional compiler warnings.

More Related Content

What's hot

GPGPU Seminar (PyCUDA)
GPGPU Seminar (PyCUDA)GPGPU Seminar (PyCUDA)
GPGPU Seminar (PyCUDA)
智啓 出川
 
Introduction to open_sbi
Introduction to open_sbiIntroduction to open_sbi
Introduction to open_sbi
Nylon
 
C/C++プログラマのための開発ツール
C/C++プログラマのための開発ツールC/C++プログラマのための開発ツール
C/C++プログラマのための開発ツール
MITSUNARI Shigeo
 
20111015 勉強会 (PCIe / SR-IOV)
20111015 勉強会 (PCIe / SR-IOV)20111015 勉強会 (PCIe / SR-IOV)
20111015 勉強会 (PCIe / SR-IOV)
Kentaro Ebisawa
 
PHP Version Up と AWS への移行
PHP Version Up と AWS への移行PHP Version Up と AWS への移行
PHP Version Up と AWS への移行
gree_tech
 
今時のオンプレなgithubクローン環境構築
今時のオンプレなgithubクローン環境構築今時のオンプレなgithubクローン環境構築
今時のオンプレなgithubクローン環境構築
You&I
 
한국IT산업과 고용환경의변화
한국IT산업과 고용환경의변화한국IT산업과 고용환경의변화
한국IT산업과 고용환경의변화
수보 김
 
本当に怖いパフォーマンスが悪い実装 #phpcon2013
本当に怖いパフォーマンスが悪い実装 #phpcon2013本当に怖いパフォーマンスが悪い実装 #phpcon2013
本当に怖いパフォーマンスが悪い実装 #phpcon2013
Yahoo!デベロッパーネットワーク
 
Metasploitでペネトレーションテスト
MetasploitでペネトレーションテストMetasploitでペネトレーションテスト
Metasploitでペネトレーションテストsuper_a1ice
 
そうだったのか! よくわかる process.nextTick() node.jsのイベントループを理解する
そうだったのか! よくわかる process.nextTick() node.jsのイベントループを理解するそうだったのか! よくわかる process.nextTick() node.jsのイベントループを理解する
そうだったのか! よくわかる process.nextTick() node.jsのイベントループを理解するshigeki_ohtsu
 
C#, C/CLI と CUDAによる画像処理ことはじめ
C#, C/CLI と CUDAによる画像処理ことはじめC#, C/CLI と CUDAによる画像処理ことはじめ
C#, C/CLI と CUDAによる画像処理ことはじめ
NVIDIA Japan
 
Embedded Hypervisor for ARM
Embedded Hypervisor for ARMEmbedded Hypervisor for ARM
Embedded Hypervisor for ARM
National Cheng Kung University
 
Pci express transaction
Pci express transactionPci express transaction
Pci express transaction
y38y38
 
ZynqMPのQEMU
ZynqMPのQEMUZynqMPのQEMU
ZynqMPのQEMU
Mr. Vengineer
 
Galileo computing software testing
Galileo computing software testingGalileo computing software testing
Galileo computing software testing
Qualister
 
MOVで実践したサーバーAPI実装の超最適化について [MOBILITY:dev]
MOVで実践したサーバーAPI実装の超最適化について [MOBILITY:dev]MOVで実践したサーバーAPI実装の超最適化について [MOBILITY:dev]
MOVで実践したサーバーAPI実装の超最適化について [MOBILITY:dev]
DeNA
 
unique_ptrにポインタ以外のものを持たせるとき
unique_ptrにポインタ以外のものを持たせるときunique_ptrにポインタ以外のものを持たせるとき
unique_ptrにポインタ以外のものを持たせるとき
Shintarou Okada
 
'Embedding' a meta state machine
'Embedding' a meta state machine'Embedding' a meta state machine
'Embedding' a meta state machine
emBO_Conference
 
gcc and friends
gcc and friendsgcc and friends
gcc and friends
Anil Kumar Pugalia
 
Kernel Recipes 2015: Representing device-tree peripherals in ACPI
Kernel Recipes 2015: Representing device-tree peripherals in ACPIKernel Recipes 2015: Representing device-tree peripherals in ACPI
Kernel Recipes 2015: Representing device-tree peripherals in ACPI
Anne Nicolas
 

What's hot (20)

GPGPU Seminar (PyCUDA)
GPGPU Seminar (PyCUDA)GPGPU Seminar (PyCUDA)
GPGPU Seminar (PyCUDA)
 
Introduction to open_sbi
Introduction to open_sbiIntroduction to open_sbi
Introduction to open_sbi
 
C/C++プログラマのための開発ツール
C/C++プログラマのための開発ツールC/C++プログラマのための開発ツール
C/C++プログラマのための開発ツール
 
20111015 勉強会 (PCIe / SR-IOV)
20111015 勉強会 (PCIe / SR-IOV)20111015 勉強会 (PCIe / SR-IOV)
20111015 勉強会 (PCIe / SR-IOV)
 
PHP Version Up と AWS への移行
PHP Version Up と AWS への移行PHP Version Up と AWS への移行
PHP Version Up と AWS への移行
 
今時のオンプレなgithubクローン環境構築
今時のオンプレなgithubクローン環境構築今時のオンプレなgithubクローン環境構築
今時のオンプレなgithubクローン環境構築
 
한국IT산업과 고용환경의변화
한국IT산업과 고용환경의변화한국IT산업과 고용환경의변화
한국IT산업과 고용환경의변화
 
本当に怖いパフォーマンスが悪い実装 #phpcon2013
本当に怖いパフォーマンスが悪い実装 #phpcon2013本当に怖いパフォーマンスが悪い実装 #phpcon2013
本当に怖いパフォーマンスが悪い実装 #phpcon2013
 
Metasploitでペネトレーションテスト
MetasploitでペネトレーションテストMetasploitでペネトレーションテスト
Metasploitでペネトレーションテスト
 
そうだったのか! よくわかる process.nextTick() node.jsのイベントループを理解する
そうだったのか! よくわかる process.nextTick() node.jsのイベントループを理解するそうだったのか! よくわかる process.nextTick() node.jsのイベントループを理解する
そうだったのか! よくわかる process.nextTick() node.jsのイベントループを理解する
 
C#, C/CLI と CUDAによる画像処理ことはじめ
C#, C/CLI と CUDAによる画像処理ことはじめC#, C/CLI と CUDAによる画像処理ことはじめ
C#, C/CLI と CUDAによる画像処理ことはじめ
 
Embedded Hypervisor for ARM
Embedded Hypervisor for ARMEmbedded Hypervisor for ARM
Embedded Hypervisor for ARM
 
Pci express transaction
Pci express transactionPci express transaction
Pci express transaction
 
ZynqMPのQEMU
ZynqMPのQEMUZynqMPのQEMU
ZynqMPのQEMU
 
Galileo computing software testing
Galileo computing software testingGalileo computing software testing
Galileo computing software testing
 
MOVで実践したサーバーAPI実装の超最適化について [MOBILITY:dev]
MOVで実践したサーバーAPI実装の超最適化について [MOBILITY:dev]MOVで実践したサーバーAPI実装の超最適化について [MOBILITY:dev]
MOVで実践したサーバーAPI実装の超最適化について [MOBILITY:dev]
 
unique_ptrにポインタ以外のものを持たせるとき
unique_ptrにポインタ以外のものを持たせるときunique_ptrにポインタ以外のものを持たせるとき
unique_ptrにポインタ以外のものを持たせるとき
 
'Embedding' a meta state machine
'Embedding' a meta state machine'Embedding' a meta state machine
'Embedding' a meta state machine
 
gcc and friends
gcc and friendsgcc and friends
gcc and friends
 
Kernel Recipes 2015: Representing device-tree peripherals in ACPI
Kernel Recipes 2015: Representing device-tree peripherals in ACPIKernel Recipes 2015: Representing device-tree peripherals in ACPI
Kernel Recipes 2015: Representing device-tree peripherals in ACPI
 

Similar to C c++-meetup-1nov2017-autofdo

Linux kernel tracing superpowers in the cloud
Linux kernel tracing superpowers in the cloudLinux kernel tracing superpowers in the cloud
Linux kernel tracing superpowers in the cloud
Andrea Righi
 
Building a DSL with GraalVM (VoxxedDays Luxembourg)
Building a DSL with GraalVM (VoxxedDays Luxembourg)Building a DSL with GraalVM (VoxxedDays Luxembourg)
Building a DSL with GraalVM (VoxxedDays Luxembourg)
Maarten Mulders
 
CONFidence 2015: DTrace + OSX = Fun - Andrzej Dyjak
CONFidence 2015: DTrace + OSX = Fun - Andrzej Dyjak   CONFidence 2015: DTrace + OSX = Fun - Andrzej Dyjak
CONFidence 2015: DTrace + OSX = Fun - Andrzej Dyjak
PROIDEA
 
Being functional in PHP (PHPDay Italy 2016)
Being functional in PHP (PHPDay Italy 2016)Being functional in PHP (PHPDay Italy 2016)
Being functional in PHP (PHPDay Italy 2016)
David de Boer
 
A CTF Hackers Toolbox
A CTF Hackers ToolboxA CTF Hackers Toolbox
A CTF Hackers Toolbox
Stefan
 
All I know about rsc.io/c2go
All I know about rsc.io/c2goAll I know about rsc.io/c2go
All I know about rsc.io/c2goMoriyoshi Koizumi
 
Being functional in PHP (DPC 2016)
Being functional in PHP (DPC 2016)Being functional in PHP (DPC 2016)
Being functional in PHP (DPC 2016)
David de Boer
 
Boosting Developer Productivity with Clang
Boosting Developer Productivity with ClangBoosting Developer Productivity with Clang
Boosting Developer Productivity with Clang
Samsung Open Source Group
 
[FOSDEM 2015] How to test OpenGL drivers using Free Software
[FOSDEM 2015] How to test OpenGL drivers using Free Software[FOSDEM 2015] How to test OpenGL drivers using Free Software
[FOSDEM 2015] How to test OpenGL drivers using Free Software
Samuel Iglesias Gonsálvez
 
How to test OpenGL drivers using Free Software (FOSDEM 2015)
How to test OpenGL drivers using Free Software (FOSDEM 2015)How to test OpenGL drivers using Free Software (FOSDEM 2015)
How to test OpenGL drivers using Free Software (FOSDEM 2015)
Igalia
 
Building a DSL with GraalVM (CodeOne)
Building a DSL with GraalVM (CodeOne)Building a DSL with GraalVM (CodeOne)
Building a DSL with GraalVM (CodeOne)
Maarten Mulders
 
The why and how of moving to PHP 5.5/5.6
The why and how of moving to PHP 5.5/5.6The why and how of moving to PHP 5.5/5.6
The why and how of moving to PHP 5.5/5.6
Wim Godden
 
Being functional in PHP
Being functional in PHPBeing functional in PHP
Being functional in PHP
David de Boer
 
Profiling your Applications using the Linux Perf Tools
Profiling your Applications using the Linux Perf ToolsProfiling your Applications using the Linux Perf Tools
Profiling your Applications using the Linux Perf Tools
emBO_Conference
 
Beyond Breakpoints: A Tour of Dynamic Analysis
Beyond Breakpoints: A Tour of Dynamic AnalysisBeyond Breakpoints: A Tour of Dynamic Analysis
Beyond Breakpoints: A Tour of Dynamic Analysis
C4Media
 
RISC-V : Berkeley Boot Loader & Proxy Kernelのソースコード解析
RISC-V : Berkeley Boot Loader & Proxy Kernelのソースコード解析RISC-V : Berkeley Boot Loader & Proxy Kernelのソースコード解析
RISC-V : Berkeley Boot Loader & Proxy Kernelのソースコード解析
Mr. Vengineer
 
Global Interpreter Lock: Episode I - Break the Seal
Global Interpreter Lock: Episode I - Break the SealGlobal Interpreter Lock: Episode I - Break the Seal
Global Interpreter Lock: Episode I - Break the Seal
Tzung-Bi Shih
 
Google App Engine: Basic
Google App Engine: BasicGoogle App Engine: Basic
Google App Engine: Basic
KAI CHU CHUNG
 
Cvim half precision floating point
Cvim half precision floating pointCvim half precision floating point
Cvim half precision floating point
tomoaki0705
 
Go Go Gadget! - An Intro to Return Oriented Programming (ROP)
Go Go Gadget! - An Intro to Return Oriented Programming (ROP)Go Go Gadget! - An Intro to Return Oriented Programming (ROP)
Go Go Gadget! - An Intro to Return Oriented Programming (ROP)
Miguel Arroyo
 

Similar to C c++-meetup-1nov2017-autofdo (20)

Linux kernel tracing superpowers in the cloud
Linux kernel tracing superpowers in the cloudLinux kernel tracing superpowers in the cloud
Linux kernel tracing superpowers in the cloud
 
Building a DSL with GraalVM (VoxxedDays Luxembourg)
Building a DSL with GraalVM (VoxxedDays Luxembourg)Building a DSL with GraalVM (VoxxedDays Luxembourg)
Building a DSL with GraalVM (VoxxedDays Luxembourg)
 
CONFidence 2015: DTrace + OSX = Fun - Andrzej Dyjak
CONFidence 2015: DTrace + OSX = Fun - Andrzej Dyjak   CONFidence 2015: DTrace + OSX = Fun - Andrzej Dyjak
CONFidence 2015: DTrace + OSX = Fun - Andrzej Dyjak
 
Being functional in PHP (PHPDay Italy 2016)
Being functional in PHP (PHPDay Italy 2016)Being functional in PHP (PHPDay Italy 2016)
Being functional in PHP (PHPDay Italy 2016)
 
A CTF Hackers Toolbox
A CTF Hackers ToolboxA CTF Hackers Toolbox
A CTF Hackers Toolbox
 
All I know about rsc.io/c2go
All I know about rsc.io/c2goAll I know about rsc.io/c2go
All I know about rsc.io/c2go
 
Being functional in PHP (DPC 2016)
Being functional in PHP (DPC 2016)Being functional in PHP (DPC 2016)
Being functional in PHP (DPC 2016)
 
Boosting Developer Productivity with Clang
Boosting Developer Productivity with ClangBoosting Developer Productivity with Clang
Boosting Developer Productivity with Clang
 
[FOSDEM 2015] How to test OpenGL drivers using Free Software
[FOSDEM 2015] How to test OpenGL drivers using Free Software[FOSDEM 2015] How to test OpenGL drivers using Free Software
[FOSDEM 2015] How to test OpenGL drivers using Free Software
 
How to test OpenGL drivers using Free Software (FOSDEM 2015)
How to test OpenGL drivers using Free Software (FOSDEM 2015)How to test OpenGL drivers using Free Software (FOSDEM 2015)
How to test OpenGL drivers using Free Software (FOSDEM 2015)
 
Building a DSL with GraalVM (CodeOne)
Building a DSL with GraalVM (CodeOne)Building a DSL with GraalVM (CodeOne)
Building a DSL with GraalVM (CodeOne)
 
The why and how of moving to PHP 5.5/5.6
The why and how of moving to PHP 5.5/5.6The why and how of moving to PHP 5.5/5.6
The why and how of moving to PHP 5.5/5.6
 
Being functional in PHP
Being functional in PHPBeing functional in PHP
Being functional in PHP
 
Profiling your Applications using the Linux Perf Tools
Profiling your Applications using the Linux Perf ToolsProfiling your Applications using the Linux Perf Tools
Profiling your Applications using the Linux Perf Tools
 
Beyond Breakpoints: A Tour of Dynamic Analysis
Beyond Breakpoints: A Tour of Dynamic AnalysisBeyond Breakpoints: A Tour of Dynamic Analysis
Beyond Breakpoints: A Tour of Dynamic Analysis
 
RISC-V : Berkeley Boot Loader & Proxy Kernelのソースコード解析
RISC-V : Berkeley Boot Loader & Proxy Kernelのソースコード解析RISC-V : Berkeley Boot Loader & Proxy Kernelのソースコード解析
RISC-V : Berkeley Boot Loader & Proxy Kernelのソースコード解析
 
Global Interpreter Lock: Episode I - Break the Seal
Global Interpreter Lock: Episode I - Break the SealGlobal Interpreter Lock: Episode I - Break the Seal
Global Interpreter Lock: Episode I - Break the Seal
 
Google App Engine: Basic
Google App Engine: BasicGoogle App Engine: Basic
Google App Engine: Basic
 
Cvim half precision floating point
Cvim half precision floating pointCvim half precision floating point
Cvim half precision floating point
 
Go Go Gadget! - An Intro to Return Oriented Programming (ROP)
Go Go Gadget! - An Intro to Return Oriented Programming (ROP)Go Go Gadget! - An Intro to Return Oriented Programming (ROP)
Go Go Gadget! - An Intro to Return Oriented Programming (ROP)
 

Recently uploaded

Top Features to Include in Your Winzo Clone App for Business Growth (4).pptx
Top Features to Include in Your Winzo Clone App for Business Growth (4).pptxTop Features to Include in Your Winzo Clone App for Business Growth (4).pptx
Top Features to Include in Your Winzo Clone App for Business Growth (4).pptx
rickgrimesss22
 
Accelerate Enterprise Software Engineering with Platformless
Accelerate Enterprise Software Engineering with PlatformlessAccelerate Enterprise Software Engineering with Platformless
Accelerate Enterprise Software Engineering with Platformless
WSO2
 
Prosigns: Transforming Business with Tailored Technology Solutions
Prosigns: Transforming Business with Tailored Technology SolutionsProsigns: Transforming Business with Tailored Technology Solutions
Prosigns: Transforming Business with Tailored Technology Solutions
Prosigns
 
Lecture 1 Introduction to games development
Lecture 1 Introduction to games developmentLecture 1 Introduction to games development
Lecture 1 Introduction to games development
abdulrafaychaudhry
 
Globus Connect Server Deep Dive - GlobusWorld 2024
Globus Connect Server Deep Dive - GlobusWorld 2024Globus Connect Server Deep Dive - GlobusWorld 2024
Globus Connect Server Deep Dive - GlobusWorld 2024
Globus
 
May Marketo Masterclass, London MUG May 22 2024.pdf
May Marketo Masterclass, London MUG May 22 2024.pdfMay Marketo Masterclass, London MUG May 22 2024.pdf
May Marketo Masterclass, London MUG May 22 2024.pdf
Adele Miller
 
Vitthal Shirke Microservices Resume Montevideo
Vitthal Shirke Microservices Resume MontevideoVitthal Shirke Microservices Resume Montevideo
Vitthal Shirke Microservices Resume Montevideo
Vitthal Shirke
 
top nidhi software solution freedownload
top nidhi software solution freedownloadtop nidhi software solution freedownload
top nidhi software solution freedownload
vrstrong314
 
RISE with SAP and Journey to the Intelligent Enterprise
RISE with SAP and Journey to the Intelligent EnterpriseRISE with SAP and Journey to the Intelligent Enterprise
RISE with SAP and Journey to the Intelligent Enterprise
Srikant77
 
Enhancing Research Orchestration Capabilities at ORNL.pdf
Enhancing Research Orchestration Capabilities at ORNL.pdfEnhancing Research Orchestration Capabilities at ORNL.pdf
Enhancing Research Orchestration Capabilities at ORNL.pdf
Globus
 
Understanding Globus Data Transfers with NetSage
Understanding Globus Data Transfers with NetSageUnderstanding Globus Data Transfers with NetSage
Understanding Globus Data Transfers with NetSage
Globus
 
Graphic Design Crash Course for beginners
Graphic Design Crash Course for beginnersGraphic Design Crash Course for beginners
Graphic Design Crash Course for beginners
e20449
 
Custom Healthcare Software for Managing Chronic Conditions and Remote Patient...
Custom Healthcare Software for Managing Chronic Conditions and Remote Patient...Custom Healthcare Software for Managing Chronic Conditions and Remote Patient...
Custom Healthcare Software for Managing Chronic Conditions and Remote Patient...
Mind IT Systems
 
Orion Context Broker introduction 20240604
Orion Context Broker introduction 20240604Orion Context Broker introduction 20240604
Orion Context Broker introduction 20240604
Fermin Galan
 
Quarkus Hidden and Forbidden Extensions
Quarkus Hidden and Forbidden ExtensionsQuarkus Hidden and Forbidden Extensions
Quarkus Hidden and Forbidden Extensions
Max Andersen
 
GlobusWorld 2024 Opening Keynote session
GlobusWorld 2024 Opening Keynote sessionGlobusWorld 2024 Opening Keynote session
GlobusWorld 2024 Opening Keynote session
Globus
 
OpenFOAM solver for Helmholtz equation, helmholtzFoam / helmholtzBubbleFoam
OpenFOAM solver for Helmholtz equation, helmholtzFoam / helmholtzBubbleFoamOpenFOAM solver for Helmholtz equation, helmholtzFoam / helmholtzBubbleFoam
OpenFOAM solver for Helmholtz equation, helmholtzFoam / helmholtzBubbleFoam
takuyayamamoto1800
 
Providing Globus Services to Users of JASMIN for Environmental Data Analysis
Providing Globus Services to Users of JASMIN for Environmental Data AnalysisProviding Globus Services to Users of JASMIN for Environmental Data Analysis
Providing Globus Services to Users of JASMIN for Environmental Data Analysis
Globus
 
TROUBLESHOOTING 9 TYPES OF OUTOFMEMORYERROR
TROUBLESHOOTING 9 TYPES OF OUTOFMEMORYERRORTROUBLESHOOTING 9 TYPES OF OUTOFMEMORYERROR
TROUBLESHOOTING 9 TYPES OF OUTOFMEMORYERROR
Tier1 app
 
Globus Compute Introduction - GlobusWorld 2024
Globus Compute Introduction - GlobusWorld 2024Globus Compute Introduction - GlobusWorld 2024
Globus Compute Introduction - GlobusWorld 2024
Globus
 

Recently uploaded (20)

Top Features to Include in Your Winzo Clone App for Business Growth (4).pptx
Top Features to Include in Your Winzo Clone App for Business Growth (4).pptxTop Features to Include in Your Winzo Clone App for Business Growth (4).pptx
Top Features to Include in Your Winzo Clone App for Business Growth (4).pptx
 
Accelerate Enterprise Software Engineering with Platformless
Accelerate Enterprise Software Engineering with PlatformlessAccelerate Enterprise Software Engineering with Platformless
Accelerate Enterprise Software Engineering with Platformless
 
Prosigns: Transforming Business with Tailored Technology Solutions
Prosigns: Transforming Business with Tailored Technology SolutionsProsigns: Transforming Business with Tailored Technology Solutions
Prosigns: Transforming Business with Tailored Technology Solutions
 
Lecture 1 Introduction to games development
Lecture 1 Introduction to games developmentLecture 1 Introduction to games development
Lecture 1 Introduction to games development
 
Globus Connect Server Deep Dive - GlobusWorld 2024
Globus Connect Server Deep Dive - GlobusWorld 2024Globus Connect Server Deep Dive - GlobusWorld 2024
Globus Connect Server Deep Dive - GlobusWorld 2024
 
May Marketo Masterclass, London MUG May 22 2024.pdf
May Marketo Masterclass, London MUG May 22 2024.pdfMay Marketo Masterclass, London MUG May 22 2024.pdf
May Marketo Masterclass, London MUG May 22 2024.pdf
 
Vitthal Shirke Microservices Resume Montevideo
Vitthal Shirke Microservices Resume MontevideoVitthal Shirke Microservices Resume Montevideo
Vitthal Shirke Microservices Resume Montevideo
 
top nidhi software solution freedownload
top nidhi software solution freedownloadtop nidhi software solution freedownload
top nidhi software solution freedownload
 
RISE with SAP and Journey to the Intelligent Enterprise
RISE with SAP and Journey to the Intelligent EnterpriseRISE with SAP and Journey to the Intelligent Enterprise
RISE with SAP and Journey to the Intelligent Enterprise
 
Enhancing Research Orchestration Capabilities at ORNL.pdf
Enhancing Research Orchestration Capabilities at ORNL.pdfEnhancing Research Orchestration Capabilities at ORNL.pdf
Enhancing Research Orchestration Capabilities at ORNL.pdf
 
Understanding Globus Data Transfers with NetSage
Understanding Globus Data Transfers with NetSageUnderstanding Globus Data Transfers with NetSage
Understanding Globus Data Transfers with NetSage
 
Graphic Design Crash Course for beginners
Graphic Design Crash Course for beginnersGraphic Design Crash Course for beginners
Graphic Design Crash Course for beginners
 
Custom Healthcare Software for Managing Chronic Conditions and Remote Patient...
Custom Healthcare Software for Managing Chronic Conditions and Remote Patient...Custom Healthcare Software for Managing Chronic Conditions and Remote Patient...
Custom Healthcare Software for Managing Chronic Conditions and Remote Patient...
 
Orion Context Broker introduction 20240604
Orion Context Broker introduction 20240604Orion Context Broker introduction 20240604
Orion Context Broker introduction 20240604
 
Quarkus Hidden and Forbidden Extensions
Quarkus Hidden and Forbidden ExtensionsQuarkus Hidden and Forbidden Extensions
Quarkus Hidden and Forbidden Extensions
 
GlobusWorld 2024 Opening Keynote session
GlobusWorld 2024 Opening Keynote sessionGlobusWorld 2024 Opening Keynote session
GlobusWorld 2024 Opening Keynote session
 
OpenFOAM solver for Helmholtz equation, helmholtzFoam / helmholtzBubbleFoam
OpenFOAM solver for Helmholtz equation, helmholtzFoam / helmholtzBubbleFoamOpenFOAM solver for Helmholtz equation, helmholtzFoam / helmholtzBubbleFoam
OpenFOAM solver for Helmholtz equation, helmholtzFoam / helmholtzBubbleFoam
 
Providing Globus Services to Users of JASMIN for Environmental Data Analysis
Providing Globus Services to Users of JASMIN for Environmental Data AnalysisProviding Globus Services to Users of JASMIN for Environmental Data Analysis
Providing Globus Services to Users of JASMIN for Environmental Data Analysis
 
TROUBLESHOOTING 9 TYPES OF OUTOFMEMORYERROR
TROUBLESHOOTING 9 TYPES OF OUTOFMEMORYERRORTROUBLESHOOTING 9 TYPES OF OUTOFMEMORYERROR
TROUBLESHOOTING 9 TYPES OF OUTOFMEMORYERROR
 
Globus Compute Introduction - GlobusWorld 2024
Globus Compute Introduction - GlobusWorld 2024Globus Compute Introduction - GlobusWorld 2024
Globus Compute Introduction - GlobusWorld 2024
 

C c++-meetup-1nov2017-autofdo

  • 2. Only C, No C++ in this presentation: Rationale ● Linux inventor Linus Torvalds’ comments on C++ from last week at the Embedded Linux Conference 2017 keynote ● https://youtu.be/NLQZzEvavGs?list=PLbzoR-pLrL6pISWAq-1cXP4_UZAyRtesk&t=1343
  • 3. Bubble sort ”In computer graphics bubble sort is popular ... in almost-sorted arrays … with just linear complexity (2n)” - wikipedia void bubble_sort(u64 *a, int n) { u64 i, temp, swap_flag = 1; while (swap_flag) { swap_flag = 0; for (i = 1; i < n; i++) { if (a[i] < a[i - 1]) { /* swap */ temp = a[i]; a[i] = a[i - 1]; a[i - 1] = temp; swap_flag = 1; } } } } Condition predictable? Function inline? Loop unroll? Minimize branches
  • 4. Common practice ● gcc -g -O3 -o sort sort.c ● ./sort 30000 What did the compiler do?: – gcc -S -fverbose-asm -o sort.S sort.c – objdump -d -S sort – perf record ./sort 3000; perf annotate
  • 5. Bubble sort, -O3 void bubble_sort(u64 *a, int n) { u64 i, temp, swap_flag = 1; while (swap_flag) { swap_flag = 0; for (i = 1; i < n; i++) { if (a[i] < a[i - 1]) { /* swap */ temp = a[i]; a[i] = a[i - 1]; a[i - 1] = temp; swap_flag = 1; } } } } │ bubble_sort(): │ for (i = 1; i < n; i++) { // 6 0.01 │ 90: cmp $0x1,%rbp │ ↓ jbe c6 │ lea 0x8(%rbx),%rax │ xor %edi,%edi │ nop │ if (a[i] < a[i - 1]) { // 7 7.78 │ a0: mov (%rax),%rdx 5.62 │ mov -0x8(%rax),%rcx 3.52 │ cmp %rcx,%rdx 12.50 │ ↓ jae b8 │ a[i] = a[i - 1]; // 9 27.75 │ mov %rcx,(%rax) │ a[i - 1] = temp; // 10 5.42 │ mov %rdx,-0x8(%rax) │ swap_flag = 1; // 11 │ mov $0x1,%edi 20.62 │ b8: add $0x8,%rax │ for (i = 1; i < n; i++) { // 6 │ cmp %rsi,%rax 16.76 │ ↑ jne a0 │ while (swap_flag) { │ test %rdi,%rdi │ ↑ jne 90
  • 6. less common practice: software based instrumentation ● gcc -g -O3 -fprofile-generate -o sort sort.c ● ./sort 3000
  • 7. (arm64) gcc 4.8 -O3 -fprofile-generate stp x29, x30, [sp,#-32]! adrp x2, __gcov_i_c_c mov x29, sp str x19, [sp,#16] mrs x19, tpidr_el0 add x19, x19, #0x0, lsl #12 add x19, x19, #0x10 mov x1, #0x0 add x2, x2, #0xd60 ldr x0, [x19] ldr x3, [x19,#8] bl __gcov_i_c_p adrp x11, a+0x1cf00 add x0, x11, #0x670 mov w7, #29998 str xzr, [x19,#8] ldr x6, [x0,#8] ldr x10, [x0,#24] mov w0, #0x0 cmp w0, w7 adrp x2, _G_O_T_+0x48 add w1, w0, #0x1 ldr x8, [x11,#1648] mov w9, #0x1 add x2, x2, #0x100 sbfiz x4, x0, #2, #32 sbfiz x3, x1, #2, #32 b.hi main+0xac ldr w0, [x2,x4] ldr w5, [x2,x3] add x6, x6, #0x1 cmp w0, w5 b.le main+0x94 str w5, [x2,x4] str w0, [x2,x3] add x8, x8, #0x1 mov w9, #0x0 mov w0, w1 cmp w0, w7 add w1, w0, #0x1 sbfiz x4, x0, #2, #32 sbfiz x3, x1, #2, #32 b.ls 400dd0 cbnz w9, 400e24 mov w1, w9 add x10, x10, #0x1 mov w9, #0x1 mov w0, w1 b 400df8 <main+0x98> add x1, x11, #0x670 mov w0, #0x0 ldr x19, [sp,#16] ldr x2, [x1,#16] str x6, [x1,#8] add x2, x2, #0x1 str x10, [x1,#24] str x2, [x1,#16] str x8, [x11,#1648] ldp x29, x30, [sp],#32 ret
  • 8. less common practice: software based instrumentation ● gcc -g -O3 -fprofile-generate -o sort sort.c ● ./sort 3000 ● gcc -g -O3 -fprofile-use -o sort sort.c ● ./sort 30000
  • 9. What did the compiler do differently? │ b6: lea 0x8(%rcx),%rdx │ for (i = 1; i < n; i++) { // 6 │ cmp %rdx,%r8 │ ↓ je 2e0 │ test %rax,%rax │ ↓ je 24b │ cmp $0x1,%rax │ ↓ je 1a6 │ cmp $0x2,%rax │ ↓ je 18a │ cmp $0x3,%rax │ ↓ je 16e │ cmp $0x4,%rax │ ↓ je 152 │ cmp $0x5,%rax │ ↓ je 136 │ cmp $0x6,%rax │ ↓ je 11a │ if (a[i] < a[i - 1]) { // 7 │ mov 0x8(%rcx),%r9 │ mov -0x8(%rdx),%r10 │ cmp %r10,%r9 │ ↓ jae 116 │ a[i] = a[i - 1]; // 9 │ mov %r10,0x8(%rcx) │ swap_flag = 1; // 11 │ mov $0x1,%edi │ a[i - 1] = temp; // 10 │ mov %r9,-0x8(%rdx) │116: add $0x8,%rdx │ if (a[i] < a[i - 1]) { // 7 │11a: mov (%rdx),%r11 │ mov -0x8(%rdx),%r12 │ cmp %r12,%r11 │ ↓ jae 132 │ a[i] = a[i - 1]; // 9 │ mov %r12,(%rdx) │ a[i - 1] = temp; // 10 │ mov %r11,-0x8(%rdx) – It unrolled the inner loop
  • 10. Loop unwinding ● The goal of loop unwinding is to increase a program's speed by: – reducing or eliminating instructions that control the loop, such as pointer arithmetic and "end of loop" tests on each iteration; – reducing branch penalties; as well as – hiding latencies including the delay in reading data from memory. ● To eliminate this computational overhead, loops can be rewritten as a repeated sequence of similar independent statements. – wikipedia
  • 11. software based instrumentation: deployments ● Ahem, I don’t really know (need a survey?) ● I do know git supports building itself like this – Full profile – Fast profile ● Hands up if any of the projects you have worked on do this!
  • 13. How to get a good profile ● perf record [-e <event>] <workload>
  • 14. perf record [-e cycles] ./sort [-O3] void bubble_sort(u64 *a, int n) { u64 i, temp, swap_flag = 1; while (swap_flag) { swap_flag = 0; for (i = 1; i < n; i++) { if (a[i] < a[i - 1]) { /* swap */ temp = a[i]; a[i] = a[i - 1]; a[i - 1] = temp; swap_flag = 1; } } } } │ bubble_sort(): │ for (i = 1; i < n; i++) { // 6 0.01 │ 90: cmp $0x1,%rbp │ ↓ jbe c6 │ lea 0x8(%rbx),%rax │ xor %edi,%edi │ nop │ if (a[i] < a[i - 1]) { // 7 7.78 │ a0: mov (%rax),%rdx 5.62 │ mov -0x8(%rax),%rcx 3.52 │ cmp %rcx,%rdx 12.50 │ ↓ jae b8 │ a[i] = a[i - 1]; // 9 27.75 │ mov %rcx,(%rax) │ a[i - 1] = temp; // 10 5.42 │ mov %rdx,-0x8(%rax) │ swap_flag = 1; // 11 │ mov $0x1,%edi 20.62 │ b8: add $0x8,%rax │ for (i = 1; i < n; i++) { // 6 │ cmp %rsi,%rax 16.76 │ ↑ jne a0 │ while (swap_flag) { │ test %rdi,%rdi │ ↑ jne 90
  • 15. least^WGoogle common practice: AutoFDO gcc -g -O3 -o sort sort.c ~/git/pmu-tools/ocperf.py record -b -e br_inst_retired.near_taken:pp -- ./sort 30000 ~/git/autofdo-andikleen/create_gcov -debug_dump -logtostderr --binary=./sort --profile=perf.data --gcov=./sort.gcov -gcov_version=1 ~/git/autofdo-andikleen/dump_gcov -gcov_version=1 ./sort.gcov gcc -g -O3 -fauto-profile=sort.gcov -o sort sort.c
  • 16. least^WGoogle common practice 2: AutoFDO via runtime process attachment gcc -g -O3 -o sort sort.c ./sort 300000 & ~/git/pmu-tools/ocperf.py record -b -e br_inst_retired.near_taken:pp -p <PID> kill %1 ~/git/autofdo-andikleen/create_gcov -debug_dump -logtostderr --binary=./sort --profile=perf.data --gcov=./sort.gcov -gcov_version=1 ~/git/autofdo-andikleen/dump_gcov -gcov_version=1 ./sort.gcov gcc -g -O3 -fauto-profile=sort.gcov -o sort sort.c
  • 17. Deployed ● CPython (rumour: 5% off the interpreter loop) ● Firefox ● Google datacenters (“over 50% of cycles spent are optimized with FDO”) ● Chrome, ChromeOS ● Clear Linux ● Github: kevinquinnyo/php7-wp-build-docker: Builds latest stable php releases in docker container, optimizes the build for wordpress with GCC AutoFDO and builds …
  • 18. Extra tidbits ● Coverage files (.gcov, etc.) are CPU arch-independent: generate once, use x86, Arm, Power ● AutoFDO supports LLVM (different coverage files) ● 5-10+% improvement consistently observed at Google, most gain within 3-5-7 iterations with little sample data ● 6-month old (“stale”) coverage files still good for at least ½ of the original performance benefit
  • 19. Additional resources ● Tutorial: – https://gcc.gnu.org/wiki/AutoFDO/Tutorial ● Where to get create_gcov: – https://github.com/google/autofdo ● Where to get ocperf.py: – git://github.com/andikleen/pmu-tools.git ● Dehao Chen’s presentation at GCC Cauldron conf.: – https://www.youtube.com/watch?v=26SrOC6MXWg ● Co-worker’s presentation at Embedded Linux conf.: – https://www.youtube.com/watch?v=S2Q1OJuZoX4 ● Large CERN project experience (5-13% improvement): – https://indico.cern.ch/event/587970/contributions/2369824/attachments/1374948/2087355/slides.pdf ● Me: – kim.phillips@arm.com
  • 20. Excerpt from git’s INSTALL file: If you're willing to trade off (much) longer build time for a later faster git you can also do a profile feedback build... This will run the complete test suite as training workload and then rebuild git with the generated profile feedback. This results in a git which is a few percent faster on CPU intensive workloads. This may be a good tradeoff for distribution packagers. Alternatively you can run profile feedback only with the git benchmark suite. This runs significantly faster than the full test suite, but has less coverage... As a caveat: a profile-optimized build takes a *lot* longer since the git tree must be built twice, and in order for the profiling measurements to work properly, ccache must be disabled and the test suite has to be run using only a single CPU. In addition, the profile feedback build stage currently generates a lot of additional compiler warnings.