SlideShare a Scribd company logo
1 of 20
Download to read offline
AutoFDO
●
Automatic Feedback-Directed Optimization
●
Some say PGO: Profile Guided Optimization
Only C, No C++ in this presentation:
Rationale
●
Linux inventor Linus Torvalds’ comments on
C++ from last week at the Embedded Linux
Conference 2017 keynote
●
https://youtu.be/NLQZzEvavGs?list=PLbzoR-pLrL6pISWAq-1cXP4_UZAyRtesk&t=1343
Bubble sort
“In computer graphics bubble sort is popular ... in almost-sorted arrays … with just linear
complexity (2n)” – Wikipedia
/*
 * bubble_sort - sort the first n elements of a into non-decreasing
 * (ascending) order, in place.
 *
 * Each pass of the inner loop compares every adjacent pair and swaps the
 * pair when the right-hand element is smaller; passes repeat until one
 * completes without a swap.
 *
 * a: array to sort; elements a[0] .. a[n - 1] are read and may be rewritten.
 * n: number of elements. With n of 0 or 1 the inner loop body never runs.
 *
 * NOTE(review): i is u64 while n is int. If u64 is an unsigned 64-bit
 * typedef (its definition is not shown here), a negative n is converted to a
 * very large unsigned bound in `i < n` and the loop would index far past
 * the array - confirm callers never pass n < 0.
 */
void bubble_sort(u64 *a, int n) {
u64 i, temp, swap_flag = 1; /* swap_flag starts at 1 so the first pass always runs */
while (swap_flag) {
swap_flag = 0; /* cleared before each pass; set again only if a swap happens */
for (i = 1; i < n; i++) {
if (a[i] < a[i - 1]) {
/* adjacent pair is out of order: exchange a[i] and a[i - 1] */
temp = a[i];
a[i] = a[i - 1];
a[i - 1] = temp;
swap_flag = 1; /* at least one swap, so another pass is needed */
}
}
}
}
Condition predictable?
Function inline?
Loop unroll?
Minimize branches
Common practice
●
gcc -g -O3 -o sort sort.c
●
./sort 30000
What did the compiler do?:
– gcc -S -fverbose-asm -o sort.S sort.c
– objdump -d -S sort
– perf record ./sort 3000; perf annotate
Bubble sort, -O3
/*
 * bubble_sort - sort the first n elements of a into non-decreasing
 * (ascending) order, in place.
 *
 * Each pass of the inner loop compares every adjacent pair and swaps the
 * pair when the right-hand element is smaller; passes repeat until one
 * completes without a swap.
 *
 * a: array to sort; elements a[0] .. a[n - 1] are read and may be rewritten.
 * n: number of elements. With n of 0 or 1 the inner loop body never runs.
 *
 * NOTE(review): i is u64 while n is int. If u64 is an unsigned 64-bit
 * typedef (its definition is not shown here), a negative n is converted to a
 * very large unsigned bound in `i < n` and the loop would index far past
 * the array - confirm callers never pass n < 0.
 */
void bubble_sort(u64 *a, int n) {
u64 i, temp, swap_flag = 1; /* swap_flag starts at 1 so the first pass always runs */
while (swap_flag) {
swap_flag = 0; /* cleared before each pass; set again only if a swap happens */
for (i = 1; i < n; i++) {
if (a[i] < a[i - 1]) {
/* adjacent pair is out of order: exchange a[i] and a[i - 1] */
temp = a[i];
a[i] = a[i - 1];
a[i - 1] = temp;
swap_flag = 1; /* at least one swap, so another pass is needed */
}
}
}
}
│ bubble_sort():
│ for (i = 1; i < n; i++) { // 6
0.01 │ 90: cmp $0x1,%rbp
│ ↓ jbe c6
│ lea 0x8(%rbx),%rax
│ xor %edi,%edi
│ nop
│ if (a[i] < a[i - 1]) { // 7
7.78 │ a0: mov (%rax),%rdx
5.62 │ mov -0x8(%rax),%rcx
3.52 │ cmp %rcx,%rdx
12.50 │ ↓ jae b8
│ a[i] = a[i - 1]; // 9
27.75 │ mov %rcx,(%rax)
│ a[i - 1] = temp; // 10
5.42 │ mov %rdx,-0x8(%rax)
│ swap_flag = 1; // 11
│ mov $0x1,%edi
20.62 │ b8: add $0x8,%rax
│ for (i = 1; i < n; i++) { // 6
│ cmp %rsi,%rax
16.76 │ ↑ jne a0
│ while (swap_flag) {
│ test %rdi,%rdi
│ ↑ jne 90
less common practice:
software based instrumentation
●
gcc -g -O3 -fprofile-generate -o sort sort.c
●
./sort 3000
(arm64) gcc 4.8 -O3 -fprofile-generate
stp x29, x30, [sp,#-32]!
adrp x2, __gcov_i_c_c
mov x29, sp
str x19, [sp,#16]
mrs x19, tpidr_el0
add x19, x19, #0x0, lsl #12
add x19, x19, #0x10
mov x1, #0x0
add x2, x2, #0xd60
ldr x0, [x19]
ldr x3, [x19,#8]
bl __gcov_i_c_p
adrp x11, a+0x1cf00
add x0, x11, #0x670
mov w7, #29998
str xzr, [x19,#8]
ldr x6, [x0,#8]
ldr x10, [x0,#24]
mov w0, #0x0
cmp w0, w7
adrp x2, _G_O_T_+0x48
add w1, w0, #0x1
ldr x8, [x11,#1648]
mov w9, #0x1
add x2, x2, #0x100
sbfiz x4, x0, #2, #32
sbfiz x3, x1, #2, #32
b.hi main+0xac
ldr w0, [x2,x4]
ldr w5, [x2,x3]
add x6, x6, #0x1
cmp w0, w5
b.le main+0x94
str w5, [x2,x4]
str w0, [x2,x3]
add x8, x8, #0x1
mov w9, #0x0
mov w0, w1
cmp w0, w7
add w1, w0, #0x1
sbfiz x4, x0, #2, #32
sbfiz x3, x1, #2, #32
b.ls 400dd0
cbnz w9, 400e24
mov w1, w9
add x10, x10, #0x1
mov w9, #0x1
mov w0, w1
b 400df8 <main+0x98>
add x1, x11, #0x670
mov w0, #0x0
ldr x19, [sp,#16]
ldr x2, [x1,#16]
str x6, [x1,#8]
add x2, x2, #0x1
str x10, [x1,#24]
str x2, [x1,#16]
str x8, [x11,#1648]
ldp x29, x30, [sp],#32
ret
less common practice:
software based instrumentation
●
gcc -g -O3 -fprofile-generate -o sort sort.c
●
./sort 3000
●
gcc -g -O3 -fprofile-use -o sort sort.c
●
./sort 30000
What did the compiler do differently?
│ b6: lea 0x8(%rcx),%rdx
│ for (i = 1; i < n; i++) { // 6
│ cmp %rdx,%r8
│ ↓ je 2e0
│ test %rax,%rax
│ ↓ je 24b
│ cmp $0x1,%rax
│ ↓ je 1a6
│ cmp $0x2,%rax
│ ↓ je 18a
│ cmp $0x3,%rax
│ ↓ je 16e
│ cmp $0x4,%rax
│ ↓ je 152
│ cmp $0x5,%rax
│ ↓ je 136
│ cmp $0x6,%rax
│ ↓ je 11a
│ if (a[i] < a[i - 1]) { // 7
│ mov 0x8(%rcx),%r9
│ mov -0x8(%rdx),%r10
│ cmp %r10,%r9
│ ↓ jae 116
│ a[i] = a[i - 1]; // 9
│ mov %r10,0x8(%rcx)
│ swap_flag = 1; // 11
│ mov $0x1,%edi
│ a[i - 1] = temp; // 10
│ mov %r9,-0x8(%rdx)
│116: add $0x8,%rdx
│ if (a[i] < a[i - 1]) { // 7
│11a: mov (%rdx),%r11
│ mov -0x8(%rdx),%r12
│ cmp %r12,%r11
│ ↓ jae 132
│ a[i] = a[i - 1]; // 9
│ mov %r12,(%rdx)
│ a[i - 1] = temp; // 10
│ mov %r11,-0x8(%rdx)
– It unrolled the inner loop
Loop unwinding
●
The goal of loop unwinding is to increase a program's
speed by:
– reducing or eliminating instructions that control the loop, such as
pointer arithmetic and "end of loop" tests on each iteration;
– reducing branch penalties; as well as
– hiding latencies including the delay in reading data from memory.
●
To eliminate this computational overhead, loops can be re-
written as a repeated sequence of similar independent
statements.
– wikipedia
software based instrumentation:
deployments
●
Ahem, I don’t really know (need a survey?)
●
I do know git supports building itself like this
– Full profile
– Fast profile
●
Hands up if any of the projects you have worked on do this!
AutoFDO
How to get a good profile
●
perf record [-e <event>] <workload>
perf record [-e cycles] ./sort [-O3]
/*
 * bubble_sort - sort the first n elements of a into non-decreasing
 * (ascending) order, in place.
 *
 * Each pass of the inner loop compares every adjacent pair and swaps the
 * pair when the right-hand element is smaller; passes repeat until one
 * completes without a swap.
 *
 * a: array to sort; elements a[0] .. a[n - 1] are read and may be rewritten.
 * n: number of elements. With n of 0 or 1 the inner loop body never runs.
 *
 * NOTE(review): i is u64 while n is int. If u64 is an unsigned 64-bit
 * typedef (its definition is not shown here), a negative n is converted to a
 * very large unsigned bound in `i < n` and the loop would index far past
 * the array - confirm callers never pass n < 0.
 */
void bubble_sort(u64 *a, int n) {
u64 i, temp, swap_flag = 1; /* swap_flag starts at 1 so the first pass always runs */
while (swap_flag) {
swap_flag = 0; /* cleared before each pass; set again only if a swap happens */
for (i = 1; i < n; i++) {
if (a[i] < a[i - 1]) {
/* adjacent pair is out of order: exchange a[i] and a[i - 1] */
temp = a[i];
a[i] = a[i - 1];
a[i - 1] = temp;
swap_flag = 1; /* at least one swap, so another pass is needed */
}
}
}
}
│ bubble_sort():
│ for (i = 1; i < n; i++) { // 6
0.01 │ 90: cmp $0x1,%rbp
│ ↓ jbe c6
│ lea 0x8(%rbx),%rax
│ xor %edi,%edi
│ nop
│ if (a[i] < a[i - 1]) { // 7
7.78 │ a0: mov (%rax),%rdx
5.62 │ mov -0x8(%rax),%rcx
3.52 │ cmp %rcx,%rdx
12.50 │ ↓ jae b8
│ a[i] = a[i - 1]; // 9
27.75 │ mov %rcx,(%rax)
│ a[i - 1] = temp; // 10
5.42 │ mov %rdx,-0x8(%rax)
│ swap_flag = 1; // 11
│ mov $0x1,%edi
20.62 │ b8: add $0x8,%rax
│ for (i = 1; i < n; i++) { // 6
│ cmp %rsi,%rax
16.76 │ ↑ jne a0
│ while (swap_flag) {
│ test %rdi,%rdi
│ ↑ jne 90
least^WGoogle common practice:
AutoFDO
gcc -g -O3 -o sort sort.c
~/git/pmu-tools/ocperf.py record -b -e \
    br_inst_retired.near_taken:pp -- ./sort 30000
~/git/autofdo-andikleen/create_gcov -debug_dump -logtostderr \
    --binary=./sort --profile=perf.data --gcov=./sort.gcov \
    -gcov_version=1
~/git/autofdo-andikleen/dump_gcov -gcov_version=1 ./sort.gcov
gcc -g -O3 -fauto-profile=sort.gcov -o sort sort.c
least^WGoogle common practice 2:
AutoFDO via runtime process attachment
gcc -g -O3 -o sort sort.c
./sort 300000 &
~/git/pmu-tools/ocperf.py record -b -e \
    br_inst_retired.near_taken:pp -p <PID>
kill %1
~/git/autofdo-andikleen/create_gcov -debug_dump -logtostderr \
    --binary=./sort --profile=perf.data --gcov=./sort.gcov \
    -gcov_version=1
~/git/autofdo-andikleen/dump_gcov -gcov_version=1 ./sort.gcov
gcc -g -O3 -fauto-profile=sort.gcov -o sort sort.c
Deployed
●
CPython (rumour: 5% off the interpreter loop)
●
Firefox
●
Google datacenters (“over 50% of cycles spent are
optimized with FDO”)
●
Chrome, ChromeOS
●
Clear Linux
●
Github: kevinquinnyo/php7-wp-build-docker: Builds latest
stable php releases in docker container, optimizes the
build for wordpress with GCC AutoFDO and builds …
Extra tidbits
●
Coverage files (.gcov, etc.) are CPU arch-
independent: generate once, use x86, Arm, Power
●
AutoFDO supports LLVM (different coverage files)
●
5-10+% improvement consistently observed at
Google, most gain within 3-5-7 iterations with little
sample data
●
6-month old (“stale”) coverage files still good for at
least ½ of the original performance benefit
Additional resources
●
Tutorial:
– https://gcc.gnu.org/wiki/AutoFDO/Tutorial
●
Where to get create_gcov:
– https://github.com/google/autofdo
●
Where to get ocperf.py:
– https://github.com/andikleen/pmu-tools.git
●
Dehao Chen’s presentation at GCC Cauldron conf.:
– https://www.youtube.com/watch?v=26SrOC6MXWg
●
Co-worker’s presentation at Embedded Linux conf.:
– https://www.youtube.com/watch?v=S2Q1OJuZoX4
●
Large CERN project experience (5-13% improvement):
– https://indico.cern.ch/event/587970/contributions/2369824/attachments/1374948/2087355/slides.pdf
●
Me:
– kim.phillips@arm.com
Excerpt from git’s INSTALL file:
If you're willing to trade off (much) longer build time for a later faster git you
can also do a profile feedback build...
This will run the complete test suite as training workload and then rebuild
git with the generated profile feedback. This results in a git which is a few
percent faster on CPU intensive workloads. This may be a good tradeoff
for distribution packagers.
Alternatively you can run profile feedback only with the git benchmark suite.
This runs significantly faster than the full test suite, but has less coverage...
As a caveat: a profile-optimized build takes a *lot* longer since the git tree
must be built twice, and in order for the profiling measurements to work
properly, ccache must be disabled and the test suite has to be run using
only a single CPU. In addition, the profile feedback build stage currently
generates a lot of additional compiler warnings.

More Related Content

What's hot

Евгений Крутько, Многопоточные вычисления, современный подход.
Евгений Крутько, Многопоточные вычисления, современный подход.Евгений Крутько, Многопоточные вычисления, современный подход.
Евгений Крутько, Многопоточные вычисления, современный подход.Platonov Sergey
 
Codepot - Pig i Hive: szybkie wprowadzenie / Pig and Hive crash course
Codepot - Pig i Hive: szybkie wprowadzenie / Pig and Hive crash courseCodepot - Pig i Hive: szybkie wprowadzenie / Pig and Hive crash course
Codepot - Pig i Hive: szybkie wprowadzenie / Pig and Hive crash courseSages
 
Wprowadzenie do technologi Big Data i Apache Hadoop
Wprowadzenie do technologi Big Data i Apache HadoopWprowadzenie do technologi Big Data i Apache Hadoop
Wprowadzenie do technologi Big Data i Apache HadoopSages
 
Алексей Кутумов, Coroutines everywhere
Алексей Кутумов, Coroutines everywhereАлексей Кутумов, Coroutines everywhere
Алексей Кутумов, Coroutines everywhereSergey Platonov
 
RxJS 5 in Depth
RxJS 5 in DepthRxJS 5 in Depth
RxJS 5 in DepthC4Media
 
Cluj.py Meetup: Extending Python in C
Cluj.py Meetup: Extending Python in CCluj.py Meetup: Extending Python in C
Cluj.py Meetup: Extending Python in CSteffen Wenz
 
Kirk Shoop, Reactive programming in C++
Kirk Shoop, Reactive programming in C++Kirk Shoop, Reactive programming in C++
Kirk Shoop, Reactive programming in C++Sergey Platonov
 
PyCon KR 2019 sprint - RustPython by example
PyCon KR 2019 sprint  - RustPython by examplePyCon KR 2019 sprint  - RustPython by example
PyCon KR 2019 sprint - RustPython by exampleYunWon Jeong
 
Wprowadzenie do technologii Big Data / Intro to Big Data Ecosystem
Wprowadzenie do technologii Big Data / Intro to Big Data EcosystemWprowadzenie do technologii Big Data / Intro to Big Data Ecosystem
Wprowadzenie do technologii Big Data / Intro to Big Data EcosystemSages
 
Python opcodes
Python opcodesPython opcodes
Python opcodesalexgolec
 
TCO in Python via bytecode manipulation.
TCO in Python via bytecode manipulation.TCO in Python via bytecode manipulation.
TCO in Python via bytecode manipulation.lnikolaeva
 
NS2: AWK and GNUplot - PArt III
NS2: AWK and GNUplot - PArt IIINS2: AWK and GNUplot - PArt III
NS2: AWK and GNUplot - PArt IIIAjit Nayak
 
Cluj Big Data Meetup - Big Data in Practice
Cluj Big Data Meetup - Big Data in PracticeCluj Big Data Meetup - Big Data in Practice
Cluj Big Data Meetup - Big Data in PracticeSteffen Wenz
 
Protocol handler in Gecko
Protocol handler in GeckoProtocol handler in Gecko
Protocol handler in GeckoChih-Hsuan Kuo
 
Ns2: Introduction - Part I
Ns2: Introduction - Part INs2: Introduction - Part I
Ns2: Introduction - Part IAjit Nayak
 

What's hot (20)

Евгений Крутько, Многопоточные вычисления, современный подход.
Евгений Крутько, Многопоточные вычисления, современный подход.Евгений Крутько, Многопоточные вычисления, современный подход.
Евгений Крутько, Многопоточные вычисления, современный подход.
 
Codepot - Pig i Hive: szybkie wprowadzenie / Pig and Hive crash course
Codepot - Pig i Hive: szybkie wprowadzenie / Pig and Hive crash courseCodepot - Pig i Hive: szybkie wprowadzenie / Pig and Hive crash course
Codepot - Pig i Hive: szybkie wprowadzenie / Pig and Hive crash course
 
Wprowadzenie do technologi Big Data i Apache Hadoop
Wprowadzenie do technologi Big Data i Apache HadoopWprowadzenie do technologi Big Data i Apache Hadoop
Wprowadzenie do technologi Big Data i Apache Hadoop
 
Алексей Кутумов, Coroutines everywhere
Алексей Кутумов, Coroutines everywhereАлексей Кутумов, Coroutines everywhere
Алексей Кутумов, Coroutines everywhere
 
RxJS 5 in Depth
RxJS 5 in DepthRxJS 5 in Depth
RxJS 5 in Depth
 
dplyr
dplyrdplyr
dplyr
 
Cluj.py Meetup: Extending Python in C
Cluj.py Meetup: Extending Python in CCluj.py Meetup: Extending Python in C
Cluj.py Meetup: Extending Python in C
 
Kirk Shoop, Reactive programming in C++
Kirk Shoop, Reactive programming in C++Kirk Shoop, Reactive programming in C++
Kirk Shoop, Reactive programming in C++
 
PyCon KR 2019 sprint - RustPython by example
PyCon KR 2019 sprint  - RustPython by examplePyCon KR 2019 sprint  - RustPython by example
PyCon KR 2019 sprint - RustPython by example
 
Wprowadzenie do technologii Big Data / Intro to Big Data Ecosystem
Wprowadzenie do technologii Big Data / Intro to Big Data EcosystemWprowadzenie do technologii Big Data / Intro to Big Data Ecosystem
Wprowadzenie do technologii Big Data / Intro to Big Data Ecosystem
 
Python opcodes
Python opcodesPython opcodes
Python opcodes
 
TCO in Python via bytecode manipulation.
TCO in Python via bytecode manipulation.TCO in Python via bytecode manipulation.
TCO in Python via bytecode manipulation.
 
Clang tidy
Clang tidyClang tidy
Clang tidy
 
Rcpp11 useR2014
Rcpp11 useR2014Rcpp11 useR2014
Rcpp11 useR2014
 
Dafunctor
DafunctorDafunctor
Dafunctor
 
NS2: AWK and GNUplot - PArt III
NS2: AWK and GNUplot - PArt IIINS2: AWK and GNUplot - PArt III
NS2: AWK and GNUplot - PArt III
 
Cluj Big Data Meetup - Big Data in Practice
Cluj Big Data Meetup - Big Data in PracticeCluj Big Data Meetup - Big Data in Practice
Cluj Big Data Meetup - Big Data in Practice
 
user2015 keynote talk
user2015 keynote talkuser2015 keynote talk
user2015 keynote talk
 
Protocol handler in Gecko
Protocol handler in GeckoProtocol handler in Gecko
Protocol handler in Gecko
 
Ns2: Introduction - Part I
Ns2: Introduction - Part INs2: Introduction - Part I
Ns2: Introduction - Part I
 

Similar to AutoFDO Optimizes Code with Profile Data

Linux kernel tracing superpowers in the cloud
Linux kernel tracing superpowers in the cloudLinux kernel tracing superpowers in the cloud
Linux kernel tracing superpowers in the cloudAndrea Righi
 
Building a DSL with GraalVM (VoxxedDays Luxembourg)
Building a DSL with GraalVM (VoxxedDays Luxembourg)Building a DSL with GraalVM (VoxxedDays Luxembourg)
Building a DSL with GraalVM (VoxxedDays Luxembourg)Maarten Mulders
 
CONFidence 2015: DTrace + OSX = Fun - Andrzej Dyjak
CONFidence 2015: DTrace + OSX = Fun - Andrzej Dyjak   CONFidence 2015: DTrace + OSX = Fun - Andrzej Dyjak
CONFidence 2015: DTrace + OSX = Fun - Andrzej Dyjak PROIDEA
 
Being functional in PHP (PHPDay Italy 2016)
Being functional in PHP (PHPDay Italy 2016)Being functional in PHP (PHPDay Italy 2016)
Being functional in PHP (PHPDay Italy 2016)David de Boer
 
A CTF Hackers Toolbox
A CTF Hackers ToolboxA CTF Hackers Toolbox
A CTF Hackers ToolboxStefan
 
Being functional in PHP (DPC 2016)
Being functional in PHP (DPC 2016)Being functional in PHP (DPC 2016)
Being functional in PHP (DPC 2016)David de Boer
 
[FOSDEM 2015] How to test OpenGL drivers using Free Software
[FOSDEM 2015] How to test OpenGL drivers using Free Software[FOSDEM 2015] How to test OpenGL drivers using Free Software
[FOSDEM 2015] How to test OpenGL drivers using Free SoftwareSamuel Iglesias Gonsálvez
 
How to test OpenGL drivers using Free Software (FOSDEM 2015)
How to test OpenGL drivers using Free Software (FOSDEM 2015)How to test OpenGL drivers using Free Software (FOSDEM 2015)
How to test OpenGL drivers using Free Software (FOSDEM 2015)Igalia
 
Building a DSL with GraalVM (CodeOne)
Building a DSL with GraalVM (CodeOne)Building a DSL with GraalVM (CodeOne)
Building a DSL with GraalVM (CodeOne)Maarten Mulders
 
The why and how of moving to PHP 5.5/5.6
The why and how of moving to PHP 5.5/5.6The why and how of moving to PHP 5.5/5.6
The why and how of moving to PHP 5.5/5.6Wim Godden
 
Being functional in PHP
Being functional in PHPBeing functional in PHP
Being functional in PHPDavid de Boer
 
Profiling your Applications using the Linux Perf Tools
Profiling your Applications using the Linux Perf ToolsProfiling your Applications using the Linux Perf Tools
Profiling your Applications using the Linux Perf ToolsemBO_Conference
 
Beyond Breakpoints: A Tour of Dynamic Analysis
Beyond Breakpoints: A Tour of Dynamic AnalysisBeyond Breakpoints: A Tour of Dynamic Analysis
Beyond Breakpoints: A Tour of Dynamic AnalysisC4Media
 
RISC-V : Berkeley Boot Loader & Proxy Kernelのソースコード解析
RISC-V : Berkeley Boot Loader & Proxy Kernelのソースコード解析RISC-V : Berkeley Boot Loader & Proxy Kernelのソースコード解析
RISC-V : Berkeley Boot Loader & Proxy Kernelのソースコード解析Mr. Vengineer
 
Global Interpreter Lock: Episode I - Break the Seal
Global Interpreter Lock: Episode I - Break the SealGlobal Interpreter Lock: Episode I - Break the Seal
Global Interpreter Lock: Episode I - Break the SealTzung-Bi Shih
 
Google App Engine: Basic
Google App Engine: BasicGoogle App Engine: Basic
Google App Engine: BasicKAI CHU CHUNG
 
Cvim half precision floating point
Cvim half precision floating pointCvim half precision floating point
Cvim half precision floating pointtomoaki0705
 
Go Go Gadget! - An Intro to Return Oriented Programming (ROP)
Go Go Gadget! - An Intro to Return Oriented Programming (ROP)Go Go Gadget! - An Intro to Return Oriented Programming (ROP)
Go Go Gadget! - An Intro to Return Oriented Programming (ROP)Miguel Arroyo
 
Deep learning - the conf br 2018
Deep learning - the conf br 2018Deep learning - the conf br 2018
Deep learning - the conf br 2018Fabio Janiszevski
 

Similar to AutoFDO Optimizes Code with Profile Data (20)

Linux kernel tracing superpowers in the cloud
Linux kernel tracing superpowers in the cloudLinux kernel tracing superpowers in the cloud
Linux kernel tracing superpowers in the cloud
 
Building a DSL with GraalVM (VoxxedDays Luxembourg)
Building a DSL with GraalVM (VoxxedDays Luxembourg)Building a DSL with GraalVM (VoxxedDays Luxembourg)
Building a DSL with GraalVM (VoxxedDays Luxembourg)
 
CONFidence 2015: DTrace + OSX = Fun - Andrzej Dyjak
CONFidence 2015: DTrace + OSX = Fun - Andrzej Dyjak   CONFidence 2015: DTrace + OSX = Fun - Andrzej Dyjak
CONFidence 2015: DTrace + OSX = Fun - Andrzej Dyjak
 
Being functional in PHP (PHPDay Italy 2016)
Being functional in PHP (PHPDay Italy 2016)Being functional in PHP (PHPDay Italy 2016)
Being functional in PHP (PHPDay Italy 2016)
 
A CTF Hackers Toolbox
A CTF Hackers ToolboxA CTF Hackers Toolbox
A CTF Hackers Toolbox
 
Being functional in PHP (DPC 2016)
Being functional in PHP (DPC 2016)Being functional in PHP (DPC 2016)
Being functional in PHP (DPC 2016)
 
Boosting Developer Productivity with Clang
Boosting Developer Productivity with ClangBoosting Developer Productivity with Clang
Boosting Developer Productivity with Clang
 
[FOSDEM 2015] How to test OpenGL drivers using Free Software
[FOSDEM 2015] How to test OpenGL drivers using Free Software[FOSDEM 2015] How to test OpenGL drivers using Free Software
[FOSDEM 2015] How to test OpenGL drivers using Free Software
 
How to test OpenGL drivers using Free Software (FOSDEM 2015)
How to test OpenGL drivers using Free Software (FOSDEM 2015)How to test OpenGL drivers using Free Software (FOSDEM 2015)
How to test OpenGL drivers using Free Software (FOSDEM 2015)
 
Building a DSL with GraalVM (CodeOne)
Building a DSL with GraalVM (CodeOne)Building a DSL with GraalVM (CodeOne)
Building a DSL with GraalVM (CodeOne)
 
The why and how of moving to PHP 5.5/5.6
The why and how of moving to PHP 5.5/5.6The why and how of moving to PHP 5.5/5.6
The why and how of moving to PHP 5.5/5.6
 
Being functional in PHP
Being functional in PHPBeing functional in PHP
Being functional in PHP
 
Profiling your Applications using the Linux Perf Tools
Profiling your Applications using the Linux Perf ToolsProfiling your Applications using the Linux Perf Tools
Profiling your Applications using the Linux Perf Tools
 
Beyond Breakpoints: A Tour of Dynamic Analysis
Beyond Breakpoints: A Tour of Dynamic AnalysisBeyond Breakpoints: A Tour of Dynamic Analysis
Beyond Breakpoints: A Tour of Dynamic Analysis
 
RISC-V : Berkeley Boot Loader & Proxy Kernelのソースコード解析
RISC-V : Berkeley Boot Loader & Proxy Kernelのソースコード解析RISC-V : Berkeley Boot Loader & Proxy Kernelのソースコード解析
RISC-V : Berkeley Boot Loader & Proxy Kernelのソースコード解析
 
Global Interpreter Lock: Episode I - Break the Seal
Global Interpreter Lock: Episode I - Break the SealGlobal Interpreter Lock: Episode I - Break the Seal
Global Interpreter Lock: Episode I - Break the Seal
 
Google App Engine: Basic
Google App Engine: BasicGoogle App Engine: Basic
Google App Engine: Basic
 
Cvim half precision floating point
Cvim half precision floating pointCvim half precision floating point
Cvim half precision floating point
 
Go Go Gadget! - An Intro to Return Oriented Programming (ROP)
Go Go Gadget! - An Intro to Return Oriented Programming (ROP)Go Go Gadget! - An Intro to Return Oriented Programming (ROP)
Go Go Gadget! - An Intro to Return Oriented Programming (ROP)
 
Deep learning - the conf br 2018
Deep learning - the conf br 2018Deep learning - the conf br 2018
Deep learning - the conf br 2018
 

Recently uploaded

Der Spagat zwischen BIAS und FAIRNESS (2024)
Der Spagat zwischen BIAS und FAIRNESS (2024)Der Spagat zwischen BIAS und FAIRNESS (2024)
Der Spagat zwischen BIAS und FAIRNESS (2024)OPEN KNOWLEDGE GmbH
 
Russian Call Girls in Karol Bagh Aasnvi ➡️ 8264348440 💋📞 Independent Escort S...
Russian Call Girls in Karol Bagh Aasnvi ➡️ 8264348440 💋📞 Independent Escort S...Russian Call Girls in Karol Bagh Aasnvi ➡️ 8264348440 💋📞 Independent Escort S...
Russian Call Girls in Karol Bagh Aasnvi ➡️ 8264348440 💋📞 Independent Escort S...soniya singh
 
Xen Safety Embedded OSS Summit April 2024 v4.pdf
Xen Safety Embedded OSS Summit April 2024 v4.pdfXen Safety Embedded OSS Summit April 2024 v4.pdf
Xen Safety Embedded OSS Summit April 2024 v4.pdfStefano Stabellini
 
What is Fashion PLM and Why Do You Need It
What is Fashion PLM and Why Do You Need ItWhat is Fashion PLM and Why Do You Need It
What is Fashion PLM and Why Do You Need ItWave PLM
 
Building a General PDE Solving Framework with Symbolic-Numeric Scientific Mac...
Building a General PDE Solving Framework with Symbolic-Numeric Scientific Mac...Building a General PDE Solving Framework with Symbolic-Numeric Scientific Mac...
Building a General PDE Solving Framework with Symbolic-Numeric Scientific Mac...stazi3110
 
How to Track Employee Performance A Comprehensive Guide.pdf
How to Track Employee Performance A Comprehensive Guide.pdfHow to Track Employee Performance A Comprehensive Guide.pdf
How to Track Employee Performance A Comprehensive Guide.pdfLivetecs LLC
 
SpotFlow: Tracking Method Calls and States at Runtime
SpotFlow: Tracking Method Calls and States at RuntimeSpotFlow: Tracking Method Calls and States at Runtime
SpotFlow: Tracking Method Calls and States at Runtimeandrehoraa
 
Software Project Health Check: Best Practices and Techniques for Your Product...
Software Project Health Check: Best Practices and Techniques for Your Product...Software Project Health Check: Best Practices and Techniques for Your Product...
Software Project Health Check: Best Practices and Techniques for Your Product...Velvetech LLC
 
Maximizing Efficiency and Profitability with OnePlan’s Professional Service A...
Maximizing Efficiency and Profitability with OnePlan’s Professional Service A...Maximizing Efficiency and Profitability with OnePlan’s Professional Service A...
Maximizing Efficiency and Profitability with OnePlan’s Professional Service A...OnePlan Solutions
 
办理学位证(UQ文凭证书)昆士兰大学毕业证成绩单原版一模一样
办理学位证(UQ文凭证书)昆士兰大学毕业证成绩单原版一模一样办理学位证(UQ文凭证书)昆士兰大学毕业证成绩单原版一模一样
办理学位证(UQ文凭证书)昆士兰大学毕业证成绩单原版一模一样umasea
 
Automate your Kamailio Test Calls - Kamailio World 2024
Automate your Kamailio Test Calls - Kamailio World 2024Automate your Kamailio Test Calls - Kamailio World 2024
Automate your Kamailio Test Calls - Kamailio World 2024Andreas Granig
 
Cloud Management Software Platforms: OpenStack
Cloud Management Software Platforms: OpenStackCloud Management Software Platforms: OpenStack
Cloud Management Software Platforms: OpenStackVICTOR MAESTRE RAMIREZ
 
GOING AOT WITH GRAALVM – DEVOXX GREECE.pdf
GOING AOT WITH GRAALVM – DEVOXX GREECE.pdfGOING AOT WITH GRAALVM – DEVOXX GREECE.pdf
GOING AOT WITH GRAALVM – DEVOXX GREECE.pdfAlina Yurenko
 
EY_Graph Database Powered Sustainability
EY_Graph Database Powered SustainabilityEY_Graph Database Powered Sustainability
EY_Graph Database Powered SustainabilityNeo4j
 
Unveiling Design Patterns: A Visual Guide with UML Diagrams
Unveiling Design Patterns: A Visual Guide with UML DiagramsUnveiling Design Patterns: A Visual Guide with UML Diagrams
Unveiling Design Patterns: A Visual Guide with UML DiagramsAhmed Mohamed
 
BATTLEFIELD ORM: TIPS, TACTICS AND STRATEGIES FOR CONQUERING YOUR DATABASE
BATTLEFIELD ORM: TIPS, TACTICS AND STRATEGIES FOR CONQUERING YOUR DATABASEBATTLEFIELD ORM: TIPS, TACTICS AND STRATEGIES FOR CONQUERING YOUR DATABASE
BATTLEFIELD ORM: TIPS, TACTICS AND STRATEGIES FOR CONQUERING YOUR DATABASEOrtus Solutions, Corp
 
Balasore Best It Company|| Top 10 IT Company || Balasore Software company Odisha
Balasore Best It Company|| Top 10 IT Company || Balasore Software company OdishaBalasore Best It Company|| Top 10 IT Company || Balasore Software company Odisha
Balasore Best It Company|| Top 10 IT Company || Balasore Software company Odishasmiwainfosol
 
MYjobs Presentation Django-based project
MYjobs Presentation Django-based projectMYjobs Presentation Django-based project
MYjobs Presentation Django-based projectAnoyGreter
 
What is Advanced Excel and what are some best practices for designing and cre...
What is Advanced Excel and what are some best practices for designing and cre...What is Advanced Excel and what are some best practices for designing and cre...
What is Advanced Excel and what are some best practices for designing and cre...Technogeeks
 
Recruitment Management Software Benefits (Infographic)
Recruitment Management Software Benefits (Infographic)Recruitment Management Software Benefits (Infographic)
Recruitment Management Software Benefits (Infographic)Hr365.us smith
 

Recently uploaded (20)

Der Spagat zwischen BIAS und FAIRNESS (2024)
Der Spagat zwischen BIAS und FAIRNESS (2024)Der Spagat zwischen BIAS und FAIRNESS (2024)
Der Spagat zwischen BIAS und FAIRNESS (2024)
 
Russian Call Girls in Karol Bagh Aasnvi ➡️ 8264348440 💋📞 Independent Escort S...
Russian Call Girls in Karol Bagh Aasnvi ➡️ 8264348440 💋📞 Independent Escort S...Russian Call Girls in Karol Bagh Aasnvi ➡️ 8264348440 💋📞 Independent Escort S...
Russian Call Girls in Karol Bagh Aasnvi ➡️ 8264348440 💋📞 Independent Escort S...
 
Xen Safety Embedded OSS Summit April 2024 v4.pdf
Xen Safety Embedded OSS Summit April 2024 v4.pdfXen Safety Embedded OSS Summit April 2024 v4.pdf
Xen Safety Embedded OSS Summit April 2024 v4.pdf
 
What is Fashion PLM and Why Do You Need It
What is Fashion PLM and Why Do You Need ItWhat is Fashion PLM and Why Do You Need It
What is Fashion PLM and Why Do You Need It
 
Building a General PDE Solving Framework with Symbolic-Numeric Scientific Mac...
Building a General PDE Solving Framework with Symbolic-Numeric Scientific Mac...Building a General PDE Solving Framework with Symbolic-Numeric Scientific Mac...
Building a General PDE Solving Framework with Symbolic-Numeric Scientific Mac...
 
How to Track Employee Performance A Comprehensive Guide.pdf
How to Track Employee Performance A Comprehensive Guide.pdfHow to Track Employee Performance A Comprehensive Guide.pdf
How to Track Employee Performance A Comprehensive Guide.pdf
 
SpotFlow: Tracking Method Calls and States at Runtime
SpotFlow: Tracking Method Calls and States at RuntimeSpotFlow: Tracking Method Calls and States at Runtime
SpotFlow: Tracking Method Calls and States at Runtime
 
Software Project Health Check: Best Practices and Techniques for Your Product...
Software Project Health Check: Best Practices and Techniques for Your Product...Software Project Health Check: Best Practices and Techniques for Your Product...
Software Project Health Check: Best Practices and Techniques for Your Product...
 
Maximizing Efficiency and Profitability with OnePlan’s Professional Service A...
Maximizing Efficiency and Profitability with OnePlan’s Professional Service A...Maximizing Efficiency and Profitability with OnePlan’s Professional Service A...
Maximizing Efficiency and Profitability with OnePlan’s Professional Service A...
 
办理学位证(UQ文凭证书)昆士兰大学毕业证成绩单原版一模一样
办理学位证(UQ文凭证书)昆士兰大学毕业证成绩单原版一模一样办理学位证(UQ文凭证书)昆士兰大学毕业证成绩单原版一模一样
办理学位证(UQ文凭证书)昆士兰大学毕业证成绩单原版一模一样
 
Automate your Kamailio Test Calls - Kamailio World 2024
Automate your Kamailio Test Calls - Kamailio World 2024Automate your Kamailio Test Calls - Kamailio World 2024
Automate your Kamailio Test Calls - Kamailio World 2024
 
Cloud Management Software Platforms: OpenStack
Cloud Management Software Platforms: OpenStackCloud Management Software Platforms: OpenStack
Cloud Management Software Platforms: OpenStack
 
GOING AOT WITH GRAALVM – DEVOXX GREECE.pdf
GOING AOT WITH GRAALVM – DEVOXX GREECE.pdfGOING AOT WITH GRAALVM – DEVOXX GREECE.pdf
GOING AOT WITH GRAALVM – DEVOXX GREECE.pdf
 
EY_Graph Database Powered Sustainability
EY_Graph Database Powered SustainabilityEY_Graph Database Powered Sustainability
EY_Graph Database Powered Sustainability
 
Unveiling Design Patterns: A Visual Guide with UML Diagrams
Unveiling Design Patterns: A Visual Guide with UML DiagramsUnveiling Design Patterns: A Visual Guide with UML Diagrams
Unveiling Design Patterns: A Visual Guide with UML Diagrams
 
BATTLEFIELD ORM: TIPS, TACTICS AND STRATEGIES FOR CONQUERING YOUR DATABASE
BATTLEFIELD ORM: TIPS, TACTICS AND STRATEGIES FOR CONQUERING YOUR DATABASEBATTLEFIELD ORM: TIPS, TACTICS AND STRATEGIES FOR CONQUERING YOUR DATABASE
BATTLEFIELD ORM: TIPS, TACTICS AND STRATEGIES FOR CONQUERING YOUR DATABASE
 
Balasore Best It Company|| Top 10 IT Company || Balasore Software company Odisha
Balasore Best It Company|| Top 10 IT Company || Balasore Software company OdishaBalasore Best It Company|| Top 10 IT Company || Balasore Software company Odisha
Balasore Best It Company|| Top 10 IT Company || Balasore Software company Odisha
 
MYjobs Presentation Django-based project
MYjobs Presentation Django-based projectMYjobs Presentation Django-based project
MYjobs Presentation Django-based project
 
What is Advanced Excel and what are some best practices for designing and cre...
What is Advanced Excel and what are some best practices for designing and cre...What is Advanced Excel and what are some best practices for designing and cre...
What is Advanced Excel and what are some best practices for designing and cre...
 
Recruitment Management Software Benefits (Infographic)
Recruitment Management Software Benefits (Infographic)Recruitment Management Software Benefits (Infographic)
Recruitment Management Software Benefits (Infographic)
 

AutoFDO Optimizes Code with Profile Data

  • 2. Only C, No C++ in this presentation: Rationale ● Linux inventor Linus Torvalds’ comments on C++ from last week at the Embedded Linux Conference 2017 keynote ● https://youtu.be/NLQZzEvavGs?list=PLbzoR-pLrL6pISWAq-1cXP4_UZAyRtesk&t=1343
  • 3. Bubble sort ”In computer graphics bubble sort is popular ... in almost-sorted arrays … with just linear complexity (2n)” - wikipedia void bubble_sort(u64 *a, int n) { u64 i, temp, swap_flag = 1; while (swap_flag) { swap_flag = 0; for (i = 1; i < n; i++) { if (a[i] < a[i - 1]) { /* swap */ temp = a[i]; a[i] = a[i - 1]; a[i - 1] = temp; swap_flag = 1; } } } } Condition predictable? Function inline? Loop unroll? Minimize branches
  • 4. Common practice ● gcc -g -O3 -o sort sort.c ● ./sort 30000 What did the compiler do?: – gcc -S -fverbose-asm -o sort.S sort.c – objdump -d -S sort – perf record ./sort 3000; perf annotate
  • 5. Bubble sort, -O3 void bubble_sort(u64 *a, int n) { u64 i, temp, swap_flag = 1; while (swap_flag) { swap_flag = 0; for (i = 1; i < n; i++) { if (a[i] < a[i - 1]) { /* swap */ temp = a[i]; a[i] = a[i - 1]; a[i - 1] = temp; swap_flag = 1; } } } } │ bubble_sort(): │ for (i = 1; i < n; i++) { // 6 0.01 │ 90: cmp $0x1,%rbp │ ↓ jbe c6 │ lea 0x8(%rbx),%rax │ xor %edi,%edi │ nop │ if (a[i] < a[i - 1]) { // 7 7.78 │ a0: mov (%rax),%rdx 5.62 │ mov -0x8(%rax),%rcx 3.52 │ cmp %rcx,%rdx 12.50 │ ↓ jae b8 │ a[i] = a[i - 1]; // 9 27.75 │ mov %rcx,(%rax) │ a[i - 1] = temp; // 10 5.42 │ mov %rdx,-0x8(%rax) │ swap_flag = 1; // 11 │ mov $0x1,%edi 20.62 │ b8: add $0x8,%rax │ for (i = 1; i < n; i++) { // 6 │ cmp %rsi,%rax 16.76 │ ↑ jne a0 │ while (swap_flag) { │ test %rdi,%rdi │ ↑ jne 90
  • 6. less common practice: software based instrumentation ● gcc -g -O3 -fprofile-generate -o sort sort.c ● ./sort 3000
  • 7. (arm64) gcc 4.8 -O3 -fprofile-generate stp x29, x30, [sp,#-32]! adrp x2, __gcov_i_c_c mov x29, sp str x19, [sp,#16] mrs x19, tpidr_el0 add x19, x19, #0x0, lsl #12 add x19, x19, #0x10 mov x1, #0x0 add x2, x2, #0xd60 ldr x0, [x19] ldr x3, [x19,#8] bl __gcov_i_c_p adrp x11, a+0x1cf00 add x0, x11, #0x670 mov w7, #29998 str xzr, [x19,#8] ldr x6, [x0,#8] ldr x10, [x0,#24] mov w0, #0x0 cmp w0, w7 adrp x2, _G_O_T_+0x48 add w1, w0, #0x1 ldr x8, [x11,#1648] mov w9, #0x1 add x2, x2, #0x100 sbfiz x4, x0, #2, #32 sbfiz x3, x1, #2, #32 b.hi main+0xac ldr w0, [x2,x4] ldr w5, [x2,x3] add x6, x6, #0x1 cmp w0, w5 b.le main+0x94 str w5, [x2,x4] str w0, [x2,x3] add x8, x8, #0x1 mov w9, #0x0 mov w0, w1 cmp w0, w7 add w1, w0, #0x1 sbfiz x4, x0, #2, #32 sbfiz x3, x1, #2, #32 b.ls 400dd0 cbnz w9, 400e24 mov w1, w9 add x10, x10, #0x1 mov w9, #0x1 mov w0, w1 b 400df8 <main+0x98> add x1, x11, #0x670 mov w0, #0x0 ldr x19, [sp,#16] ldr x2, [x1,#16] str x6, [x1,#8] add x2, x2, #0x1 str x10, [x1,#24] str x2, [x1,#16] str x8, [x11,#1648] ldp x29, x30, [sp],#32 ret
  • 8. less common practice: software based instrumentation ● gcc -g -O3 -fprofile-generate -o sort sort.c ● ./sort 3000 ● gcc -g -O3 -fprofile-use -o sort sort.c ● ./sort 30000
  • 9. What did the compiler do differently? │ b6: lea 0x8(%rcx),%rdx │ for (i = 1; i < n; i++) { // 6 │ cmp %rdx,%r8 │ ↓ je 2e0 │ test %rax,%rax │ ↓ je 24b │ cmp $0x1,%rax │ ↓ je 1a6 │ cmp $0x2,%rax │ ↓ je 18a │ cmp $0x3,%rax │ ↓ je 16e │ cmp $0x4,%rax │ ↓ je 152 │ cmp $0x5,%rax │ ↓ je 136 │ cmp $0x6,%rax │ ↓ je 11a │ if (a[i] < a[i - 1]) { // 7 │ mov 0x8(%rcx),%r9 │ mov -0x8(%rdx),%r10 │ cmp %r10,%r9 │ ↓ jae 116 │ a[i] = a[i - 1]; // 9 │ mov %r10,0x8(%rcx) │ swap_flag = 1; // 11 │ mov $0x1,%edi │ a[i - 1] = temp; // 10 │ mov %r9,-0x8(%rdx) │116: add $0x8,%rdx │ if (a[i] < a[i - 1]) { // 7 │11a: mov (%rdx),%r11 │ mov -0x8(%rdx),%r12 │ cmp %r12,%r11 │ ↓ jae 132 │ a[i] = a[i - 1]; // 9 │ mov %r12,(%rdx) │ a[i - 1] = temp; // 10 │ mov %r11,-0x8(%rdx) – It unrolled the inner loop
  • 10. Loop unwinding ● The goal of loop unwinding is to increase a program's speed by: – reducing or eliminating instructions that control the loop, such as pointer arithmetic and "end of loop" tests on each iteration; – reducing branch penalties; as well as – hiding latencies including the delay in reading data from memory. ● To eliminate this computational overhead, loops can be rewritten as a repeated sequence of similar independent statements. – wikipedia
  • 11. software based instrumentation: deployments ● Ahem, I don’t really know (need a survey?) ● Do know git supports building itself like this – Full profile – Fast profile ● Hands up if any of the projects you have worked on do!
  • 13. How to get a good profile ● perf record [-e <event>] <workload>
  • 14. perf record [-e cycles] ./sort [-O3] void bubble_sort(u64 *a, int n) { u64 i, temp, swap_flag = 1; while (swap_flag) { swap_flag = 0; for (i = 1; i < n; i++) { if (a[i] < a[i - 1]) { /* swap */ temp = a[i]; a[i] = a[i - 1]; a[i - 1] = temp; swap_flag = 1; } } } } │ bubble_sort(): │ for (i = 1; i < n; i++) { // 6 0.01 │ 90: cmp $0x1,%rbp │ ↓ jbe c6 │ lea 0x8(%rbx),%rax │ xor %edi,%edi │ nop │ if (a[i] < a[i - 1]) { // 7 7.78 │ a0: mov (%rax),%rdx 5.62 │ mov -0x8(%rax),%rcx 3.52 │ cmp %rcx,%rdx 12.50 │ ↓ jae b8 │ a[i] = a[i - 1]; // 9 27.75 │ mov %rcx,(%rax) │ a[i - 1] = temp; // 10 5.42 │ mov %rdx,-0x8(%rax) │ swap_flag = 1; // 11 │ mov $0x1,%edi 20.62 │ b8: add $0x8,%rax │ for (i = 1; i < n; i++) { // 6 │ cmp %rsi,%rax 16.76 │ ↑ jne a0 │ while (swap_flag) { │ test %rdi,%rdi │ ↑ jne 90
  • 15. least^WGoogle common practice: AutoFDO gcc -g -O3 -o sort sort.c ~/git/pmu-tools/ocperf.py record -b -e br_inst_retired.near_taken:pp -- ./sort 30000 ~/git/autofdo-andikleen/create_gcov -debug_dump -logtostderr --binary=./sort --profile=perf.data --gcov=./sort.gcov -gcov_version=1 ~/git/autofdo-andikleen/dump_gcov -gcov_version=1 ./sort.gcov gcc -g -O3 -fauto-profile=sort.gcov -o sort sort.c
  • 16. least^WGoogle common practice 2: AutoFDO via runtime process attachment gcc -g -O3 -o sort sort.c ./sort 300000 & ~/git/pmu-tools/ocperf.py record -b -e br_inst_retired.near_taken:pp -p <PID> kill %1 ~/git/autofdo-andikleen/create_gcov -debug_dump -logtostderr --binary=./sort --profile=perf.data --gcov=./sort.gcov -gcov_version=1 ~/git/autofdo-andikleen/dump_gcov -gcov_version=1 ./sort.gcov gcc -g -O3 -fauto-profile=sort.gcov -o sort sort.c
  • 17. Deployed ● Cpython (rumour: 5% off the interpreter loop) ● Firefox ● Google datacenters (“over 50% of cycles spent are optimized with FDO”) ● Chrome, ChromeOS ● Clearlinux ● Github: kevinquinnyo/php7-wp-build-docker: Builds latest stable php releases in docker container, optimizes the build for wordpress with GCC AutoFDO and builds …
  • 18. Extra tidbits ● Coverage files (.gcov, etc.) are CPU arch-independent: generate once, use x86, Arm, Power ● AutoFDO supports LLVM (different coverage files) ● 5-10+% improvement consistently observed at Google, most gain within 3-5-7 iterations with little sample data ● 6-month-old (“stale”) coverage files still good for at least ½ of the original performance benefit
  • 19. Additional resources ● Tutorial: – https://gcc.gnu.org/wiki/AutoFDO/Tutorial ● Where to get create_gcov: – https://github.com/google/autofdo ● Where to get ocperf.py: – git://github.com/andikleen/pmu-tools.git ● Dehao Chen’s presentation at GCC Cauldron conf.: – https://www.youtube.com/watch?v=26SrOC6MXWg ● Co-worker’s presentation at Embedded Linux conf.: – https://www.youtube.com/watch?v=S2Q1OJuZoX4 ● Large CERN project experience (5-13% improvement): – https://indico.cern.ch/event/587970/contributions/2369824/attachments/1374948/2087355/slides.pdf ● Me: – kim.phillips@arm.com
  • 20. Excerpt from git’s INSTALL file: If you're willing to trade off (much) longer build time for a later faster git you can also do a profile feedback build... This will run the complete test suite as training workload and then rebuild git with the generated profile feedback. This results in a git which is a few percent faster on CPU intensive workloads. This may be a good tradeoff for distribution packagers. Alternatively you can run profile feedback only with the git benchmark suite. This runs significantly faster than the full test suite, but has less coverage... As a caveat: a profile-optimized build takes a *lot* longer since the git tree must be built twice, and in order for the profiling measurements to work properly, ccache must be disabled and the test suite has to be run using only a single CPU. In addition, the profile feedback build stage currently generates a lot of additional compiler warnings.