Как помочь и как помешать компилятору. Андрей Олейников ➠ CoreHard Autumn 2019

Как помочь и как помешать
компилятору
Андрей Олейников,
разработчик, Беспилотные автомобили Яндекса
andreyol@yandex-team.ru

Введение
› Компиляторные оптимизации
› Возможность на них повлиять
Предупреждения
› Clang/LLVM (6.0)
› x64 (i7-8750H CPU)
› Нестандартные расширения
› Синтетические примеры
https://github.com/duke-gh/CoreHard2019
2

План доклада
› Inline
› Loop unrolling
› Instruction combining
› Branching
› LTO
4

Inline cpp
// Per-iteration workload of the inlining demo: returns sin(i / 100.0) + cos(i / 100.0).
double calc(int i) {
return std::sin(i / 100.0) + std::cos(i / 100.0);
}
// Accumulates calc(i) for every i in [0, it_count) and returns the sum.
double get_res() {
const int it_count{100000000};
double res{0.0};
for (int i = 0; i < it_count; i++) {
// This call is the inlining candidate; the next slide shows its body
// substituted directly into the loop.
res += calc(i);
}
return res;
}
6

Псевдокод после оптимизации
return std::sin(i / 100.0) + std::cos(i / 100.0);
}
double get_res() {
double res{0.0};
res += std::sin(i / 100.0) + std::cos(i / 100.0);
}
return res;
}
7

Inline в Clang
› Passes: AlwaysInlinerPass, InlinerPass
› inline
› __attribute__((always_inline))
› __attribute__((noinline))
8

Inline пример 1
double calc_value(int i, int branch) {
switch (i) {...}
...
switch (i) {...}
double di = static_cast<double>(i);
if (branch) {
return i + di * di + di * di * di + di * di * di * 0.2 + i % 2;
} else {
return i * 0.1 + di * di / 2.0 + di * di * di * 0.3
+ di * di * di * 0.4 + i / 2.0;
}
}
9

// Thin wrapper: forwards both arguments unchanged to calc_value().
double get_first_value(int i, int branch) {
return calc_value(i, branch);
}
double get_second_value(int i, int branch) {
}
double get_res() {
double res{0.0};
res += get_first_value(i, 0) + get_second_value(i, 0);
}
return res;
} 10

$ clang++ -O2 -Rpass=inline -c calc.cpp
calc.cpp:87:12: remark: _Z15get_first_valueii inlined into
_Z7get_resv with cost=0 (threshold=337) [-Rpass=inline]
res += get_first_value(i, 0) + get_second_value(i, 0) ;
^
calc.cpp:87:37: remark: _Z16get_second_valueii inlined into
_Z7get_resv with cost=0 (threshold=337) [-Rpass=inline]
res += get_first_value(i, 0) + get_second_value(i, 0) ;
11

$ clang++ -O2 -Rpass=inline -Rpass-missed=inline -c calc.cpp
calc.cpp:76:10: remark: _Z10calc_valueii not inlined into
_Z15get_first_valueii because too costly to inline
(cost=320, threshold=225) [-Rpass-missed=inline]
^
calc.cpp:80:10: remark: _Z10calc_valueii not inlined into
_Z16get_second_valueii because too costly to inline
(cost=320, threshold=225) [-Rpass-missed=inline]
...
12

$ sudo perf stat -B ./a.out
Spent 1109395529 ns
Performance counter stats for './a.out':
1 111,60 msec task-clock
3 context-switches
0 cpu-migrations
117 page-faults
4 261 552 332 cycles
4 604 496 295 instructions
1 100 771 723 branches
25 792 branch-misses
1,112127349 seconds time elapsed
1,112106000 seconds user
0,000000000 seconds sys 13

inline double calc_value(int i, int branch) {
...
}
14

// llvm/lib/Analysis/InlineCost.cpp:916
if (Callee.hasFnAttribute(Attribute::InlineHint))
Threshold = MaxIfValid(Threshold, Params.HintThreshold);
15

$ clang++ -O2 -Rpass=inline -c calc.cpp
calc.cpp:76:10: remark: _Z10calc_valueii inlined into
_Z15get_first_valueii with cost=320 (threshold=325)
[-Rpass=inline]
^
_Z16get_second_valueii with cost=320 (threshold=325)
[-Rpass=inline]
^
16

Spent 1823334387 ns
4 context-switches
0 cpu-migrations
117 page-faults
7 239 846 942 cycles
2 100 884 626 branches
0,000000000 seconds sys 17

double get_first_value(int i, int branch) {
}
double get_second_value(int i, int branch) {
}
double get_res() {
double res{0.0};
res += get_first_value(i, 0) + get_second_value(i, 0);
}
return res;
} 18

Inline пример 2
switch (i) {...}
...
switch (i) {...}
if (branch) {
} else {
return i * 0.1 + di * di / 2.0 + di * di * di * 0.3 + di * di * di * 0.4 + i
/ 2.0;
}
}
19

// Thin wrapper: calls calc_value() with the branch argument fixed to 0.
double get_first_value(int i) {
return calc_value(i, 0);
}
double get_second_value(int i) {
}
double get_res() {
double res{0.0};
res += get_first_value(i) + get_second_value(i);
}
return res;
} 20

calc.cpp:89:12: remark: _Z15get_first_valuei inlined into
_Z7get_resv with cost=5 (threshold=337) [-Rpass=inline]
^
calc.cpp:89:33: remark: _Z16get_second_valuei inlined into
21

Spent 1691298621 ns
2 context-switches
0 cpu-migrations
119 page-faults
6 762 472 189 cycles
2 100 883 212 branches
0,000000000 seconds sys
22

inline double calc_value(int i, int branch) {
...
}
23

_Z15get_first_valuei with cost=265 (threshold=325) [-
Rpass=inline]
^
_Z16get_second_valuei with cost=255 (threshold=325) [-
Rpass=inline]
^
24

Spent 1690155986 ns
3 context-switches
0 cpu-migrations
116 page-faults
6 763 570 064 cycles
1 900 880 186 branches
25

inline double get_first_value(int i) { ... }
inline double get_second_value(int i) { ... }
26

_Z15get_first_valuei with cost=265 (threshold=325)
^
...
^
...
27

Spent 1823334387 ns
424,66 msec task-clock
0 context-switches
0 cpu-migrations
115 page-faults
1 700 420 431 cycles
1 500 629 258 branches
28

switch (i) {...}
...
switch (i) {...}
if (branch) {
} else {
return i * 0.1 + di * di / 2.0 + di * di * di * 0.3 + di * di * di * 0.4 + i
/ 2.0;
}
}
29

__attribute__((always_inline)) double get_first_value(int i) { ... }
__attribute__((noinline)) double get_second_value(int i) { ... }
30

$ clang++ -O2 -Rpass=inline -Rpass-missed=inline -c calc.cpp
...
_Z7get_resv with cost=always [-Rpass=inline]
^
...
calc.cpp:89:33: remark: _Z16get_second_valuei not inlined into
_Z7get_resv because it should never be inlined
(cost=never) [-Rpass-missed=inline]
31

double get_res() {
double res{0.0};
res += get_first_light_value(i) + get_second_light_value(i + 1);
if (i < 10) {
res += get_first_heavy_value(i, res);
res += get_second_heavy_value(i, res);
}
res += get_second_light_value(i) + get_first_light_value(i - 1);
}
return res;
}
32

Результаты за 10 запусков с always_inline и без:
среднее, мс p90 p10
по умолчанию 3567.6 3584.4 3555.1
always_inline 3701.5 3714.5 3691.9
33

Итого, inline
› Позволяет делать подсказки только у вызываемой функции.
› Можно облегчить компилятору работу, указав явно, как
поступать.
› Потенциально руками можно обогнать эвристики.
34

Loop unroll
return std::sin(i);
}
double get_res(int it_count) {
double res{0.0};
res += calc(i);
}
return res;
}
36

IR перед оптимизацией
%1:
%2 = icmp sgt i32 %0, 0
br i1 %2, label %3, label %6
T F
%3:
br label %8
%6:
%7 = phi double [ 0.000000e+00, %1 ], [ %5, %4 ]
ret double %7
%8:
%9 = phi i32 [ %14, %8 ], [ 0, %3 ]
%10 = phi double [ %13, %8 ], [ 0.000000e+00, %3 ]
%11 = sitofp i32 %9 to double
%12 = tail call double @sin(double %11)
%13 = fadd double %10, %12
%14 = add nuw nsw i32 %9, 1
%15 = icmp eq i32 %14, %0
T F
%4:
%5 = phi double [ %13, %8 ]
br label %6
37

IR после оптимизации
%1:
%2 = icmp sgt i32 %0, 0
T F
%3:
%4 = add i32 %0, -1
%5 = and i32 %0, 3
%6 = icmp ult i32 %4, 3
T F
%29:
%30 = phi double [ 0.000000e+00, %1 ], [ %28, %27 ]
ret double %30
%10:
%11 = phi double [ undef, %3 ], [ %49, %9 ]
%12 = phi i32 [ 0, %3 ], [ %50, %9 ]
%13 = phi double [ 0.000000e+00, %3 ], [ %49, %9 ]
%14 = icmp eq i32 %5, 0
T F
%7:
%8 = sub i32 %0, %5
br label %31
%27:
%28 = phi double [ %11, %10 ], [ %22, %26 ]
br label %29
%15:
br label %16
%31:
%32 = phi i32 [ 0, %7 ], [ %50, %31 ]
%33 = phi double [ 0.000000e+00, %7 ], [ %49, %31 ]
%34 = phi i32 [ %8, %7 ], [ %51, %31 ]
%37 = fadd double %33, %36
%38 = or i32 %32, 1
%41 = fadd double %37, %40
%42 = or i32 %32, 2
%45 = fadd double %41, %44
%46 = or i32 %32, 3
%49 = fadd double %45, %48
%50 = add nuw nsw i32 %32, 4
%51 = add i32 %34, -4
%52 = icmp eq i32 %51, 0
T F
%9:
br label %10
%16:
%17 = phi i32 [ %23, %16 ], [ %12, %15 ]
%18 = phi double [ %22, %16 ], [ %13, %15 ]
%19 = phi i32 [ %24, %16 ], [ %5, %15 ]
%22 = fadd double %18, %21
%23 = add nuw nsw i32 %17, 1
%24 = add i32 %19, -1
%25 = icmp eq i32 %24, 0
br i1 %25, label %26, label %16, !llvm.loop !0
T F
%26:
br label %27
38

// before
res += calc(i);
}
// after
int i = 0;
for (; it_count - i > 3; i += 4) {
res += calc(i);
res += calc(i + 1);
res += calc(i + 2);
res += calc(i + 3);
}
for (; i < it_count; i++) {
res += calc(i);
} 39

Разворачивание циклов в Clang
› Pass: LoopUnrollPass
› #pragma unroll
› #pragma nounroll
› #pragma clang loop unroll(full)
› #pragma clang loop unroll_count(12)
40

// llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp:960
if (HasUnrollDisablePragma(L))
return LoopUnrollResult::Unmodified;
...
// llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp:747
if (ExplicitUnroll && TripCount != 0) {
// If the loop has an unrolling pragma, we want to be more aggressive with
// unrolling limits. Set thresholds to at least the PragmaThreshold value
// which is larger than the default limits.
UP.Threshold = std::max<unsigned>(UP.Threshold, PragmaUnrollThreshold);
UP.PartialThreshold =
std::max<unsigned>(UP.PartialThreshold, PragmaUnrollThreshold);
}
41

Loop unroll пример
return std::sin(i);
}
double get_res_unroll_12(int it_count) {
double res{0.0};
#pragma clang loop unroll_count(12)
res += calc(i);
}
return res;
}
42

%1:
%2 = icmp sgt i32 %0, 0
T F
%3:
%4 = add i32 %0, -1
%5 = urem i32 %4, 12
%6 = add nuw nsw i32 %5, 1
%7 = urem i32 %6, 12
%8 = icmp ult i32 %4, 11
T F
%27:
%28 = phi double [ 0.000000e+00, %1 ], [ %12, %11 ], [ %23, %17 ]
ret double %28
%11:
%12 = phi double [ undef, %3 ], [ %79, %29 ]
%13 = phi i32 [ 0, %3 ], [ %80, %29 ]
%14 = phi double [ 0.000000e+00, %3 ], [ %79, %29 ]
%15 = icmp eq i32 %7, 0
T F
%9:
%10 = sub i32 %0, %7
br label %29
%16:
br label %17
%29:
%30 = phi i32 [ 0, %9 ], [ %80, %29 ]
%31 = phi double [ 0.000000e+00, %9 ], [ %79, %29 ]
%32 = phi i32 [ %10, %9 ], [ %81, %29 ]
%34 = tail call double @sin(double %33) #2
%35 = fadd double %31, %34
%36 = or i32 %30, 1
%39 = fadd double %35, %38
%40 = or i32 %30, 2
%43 = fadd double %39, %42
%44 = or i32 %30, 3
%47 = fadd double %43, %46
%48 = add nuw nsw i32 %30, 4
%51 = fadd double %47, %50
%52 = add nuw nsw i32 %30, 5
%55 = fadd double %51, %54
%56 = add nuw nsw i32 %30, 6
%59 = fadd double %55, %58
%60 = add nuw nsw i32 %30, 7
%63 = fadd double %59, %62
%64 = add nuw nsw i32 %30, 8
%67 = fadd double %63, %66
%68 = add nuw nsw i32 %30, 9
%71 = fadd double %67, %70
%72 = add nuw nsw i32 %30, 10
%75 = fadd double %71, %74
%76 = add nuw nsw i32 %30, 11
%79 = fadd double %75, %78
%80 = add nuw nsw i32 %30, 12
%81 = add i32 %32, -12
%82 = icmp eq i32 %81, 0
T F
%17:
%18 = phi i32 [ %24, %17 ], [ %13, %16 ]
%19 = phi double [ %23, %17 ], [ %14, %16 ]
%20 = phi i32 [ %25, %17 ], [ %7, %16 ]
%23 = fadd double %19, %22
%24 = add nuw nsw i32 %18, 1
%25 = add i32 %20, -1
%26 = icmp eq i32 %25, 0
T F
43

Результаты за 10 запусков с различными прагмами:
branches average p90 p10
nounroll 1 001 796 322 1045.3 1067.3 1028.5
unroll 250 920 668 1038.6 1057.2 1019.8
unroll_count(12) 84 168 844 1023.7 1053.4 1009.9
44

Итого, разворачивание циклов
› Можно делать подсказки у конкретного цикла.
› Не является обязующей директивой.
› Потенциально руками можно обогнать эвристики.
45

Instruction combining
// Instruction-combining demo: a deliberately redundant integer expression.
// Algebraically it equals (b + 2 + c) * a + c, which is the folded form
// shown on the following "results" slide.
int get_res(int a, int b, int c) {
  return a * b + a * c + a + a + a + c - a;
}
.
47

Instruction combining: результаты
Было:
𝑎𝑏 + 𝑎𝑐 + 𝑎 + 𝑎 + 𝑎 + 𝑐 − 𝑎
Стало:
(𝑏 + 2 + 𝑐)𝑎 + 𝑐
48

Instruction combining в Clang
› Passes: InstCombinePass, ReassociatePass
› -ffast-math
› -fno-honor-infinities
› -fno-honor-nans
› -fno-math-errno
› -ffinite-math-only
› -fassociative-math
› -freciprocal-math
› -fno-signed-zeros
› -fno-trapping-math
› -ffp-contract=fast
49

// Floating-point demo for -ffast-math: computes a*d + b*e + c*e in source
// order. With the inputs used by main() on this slide (a = 5e8, b = -5e8,
// c = d = e = 1) the next slide shows it printing 1 under -O2 and 0 under
// -ffast-math -O2.
float get_res(float a, float b, float c, float d, float e) {
  return a * d + b * e + c * e;
}
// Driver for the -ffast-math demo: prints get_res() for a large positive a,
// its negation b, and c = d = e = 1.
int main() {
  float a = 500000000.0f;
  float b = -500000000.0f;
  float c = 1.0f;
  std::cout << get_res(a, b, c, 1.0f, 1.0f) << std::endl;
  return 0;
}
50

$ clang++ -O2 main.cpp
$ ./a.out
1
$ clang++ -ffast-math -O2 main.cpp
$ ./a.out
0
51

// a = 500000000.0f;
// b = -500000000.0f;
// c, d, e = 1.0f;
// Source form of the expression: the three products are summed in source order.
float get_res(float a, float b, float c, float d, float e) {
  return a * d + b * e + c * e;
}
// Reassociated form of get_res: the common factor e is pulled out, so b + c
// is added before multiplying. For the slide's inputs (b = -5e8, c = 1) the
// float sum b + c rounds back to b, so this form returns 0 where the source
// form returns 1.
float get_res_opt(float a, float b, float c, float d, float e) {
  return a * d + (b + c) * e;
}
52

Итого, упрощение выражений
› Ключ для сборки - влияет на всю единицу трансляции.
› Говорит компилятору быть менее консервативным.
53

Branching
void function(bool should_fail) {
if (should_fail) {
...
// Some long, rarely used error handling
...
} else {
...
// Main execution path
...
}
}
55

Branching в Clang
› __builtin_expect(long exp, long c)
56

Branching пример
if (__builtin_expect(i>0, 0)) {
switch (i) { ... }
switch (i) { ... }
return di * di + di * 0.1;
} else {
return di * di * di + di * 0.3;
}
}
57

Ассемблер без подсказок
0: test %edi,%edi
2: jle 44 <_Z4calci+0x44>
T F
4: lea -0x1(%rdi),%eax
7: cmp $0x2,%eax
a: mov $0xfffffffe,%eax
f: cmovb %eax,%edi
12: cmp $0x2,%edi
15: jbe 61 <_Z4calci+0x61>
...
44: cvtsi2sd %edi,%xmm1
48: movapd %xmm1,%xmm0
4c: mulsd %xmm0,%xmm0
50: mulsd %xmm1,%xmm0
54: mulsd 0x0(%rip),%xmm1
5b: 00
5c: addsd %xmm1,%xmm0
60: retq
58

Ассемблер с подсказками
0: test %edi,%edi
2: jg 21 <_Z4calci+0x21>
T F
21: lea -0x1(%rdi),%eax
24: cmp $0x2,%eax
27: mov $0xfffffffe,%eax
2c: cmovb %eax,%edi
2f: cmp $0x2,%edi
32: jbe 61 <_Z4calci+0x61>
...
4: cvtsi2sd %edi,%xmm1
8: movapd %xmm1,%xmm0
c: mulsd %xmm0,%xmm0
10: mulsd %xmm1,%xmm0
14: mulsd 0x0(%rip),%xmm1
1b: 00
1c: addsd %xmm1,%xmm0
20: retq
59

Итого, подсказки при ветвлении
› Можно делать подсказки у каждого отдельного условного
оператора.
› Компилятор не знает, на каких данных будет запускаться
программа.
60

// main.cpp
#include <iostream>
#include <chrono>
#include "source2.h"
int main() {
double res{0.0};
const auto start = std::chrono::high_resolution_clock::now().time_since_epoch
().count();
res = get_res();
const auto end = std::chrono::high_resolution_clock::now().time_since_epoch().
count();
std::cout << "res = " << res << std::endl;
...
return 0;
} 64

// source2.cpp
double get_res() {
double res{0.0};
res += calc_value(i, 0);
}
return res;
}
65

// source1.cpp
int calc_value(int i, int branch) {
int original_i = i;
switch (i) { ... }
switch (i) { ... }
...
if (branch) {
return i;
} else {
return original_i;
}
}
66

$ clang++ -O2 main.cpp source1.cpp source2.cpp
$ ./a.out
res = 5e+17
Spent 3536021382 ns (3536 ms)
4.2.1 Compatible Clang 6.0.0 (tags/RELEASE_600/final)
67

$ clang++ -O2 -flto -c main.cpp source1.cpp source2.cpp
$ clang++ -O2 -flto main.o source1.o source2.o
$ ./a.out
res = 5e+17
Spent 1013128016 ns (1013 ms)
$ ls -n a.out
-rwxr-xr-x 1 1000 1000 9072 ноя 19 11:53 a.out
$ llvm-dis-6.0 source1.o -o source1.ll
68

$ clang++ -O2 -flto=thin -c main.cpp source1.cpp source2.cpp
$ clang++ -O2 -flto=thin main.o source1.o source2.o
$ ./a.out
res = 5e+17
Spent 1013771808 ns (1013 ms)
$ ls -ln a.out
-rwxr-xr-x 1 1000 1000 13248 ноя 19 03:03 a.out
69

// source2.cpp
double get_res() {
double res{0.0};
res += calc_value(i, 0);
}
return res;
}
70

// source1.cpp
int calc_value(int i, int branch) {
int original_i = i;
switch (i) { ... }
switch (i) { ... }
...
if (branch) {
return i;
} else {
return original_i;
}
}
71

Итого, оптимизации на этапе линковки
› Требуют изменения процесса сборки проекта.
› Ограничивают возможный выбор линкера.
› Увеличивают время сборки (особенно инкрементальной).
› Дают компилятору больше контекста для оптимизаций.
72

Заключение
› LTO, если не жалко скорость сборки.
› Подсказки компилятору, если есть время и желание
проверять результат.
› Стоит изучать свои инструменты
73

Спасибо за внимание!
Андрей Олейников
andreyol@yandex-team.ru

Как помочь и как помешать компилятору. Андрей Олейников ➠ CoreHard Autumn 2019

More Related Content

What's hot

More from corehard_by

Как помочь и как помешать компилятору. Андрей Олейников ➠ CoreHard Autumn 2019