Как помочь и как помешать
компилятору
Андрей Олейников,
разработчик, Беспилотные автомобили Яндекса
andreyol@yandex-team.ru
Введение
› Компиляторные оптимизации
› Возможность на них повлиять
Предупреждения
› Clang/LLVM (6.0)
› x64 (i7-8750H CPU)
› Нестандартные расширения
› Синтетические примеры
https://github.com/duke-gh/CoreHard2019
2
Clang/LLVM
3
План доклада
› Inline
› Loop unrolling
› Instruction combining
› Branching
› LTO
4
Inline
Inline cpp
double calc(int i) {
return std::sin(i / 100.0) + std::cos(i / 100.0);
}
double get_res() {
const int it_count{100000000};
double res{0.0};
for (int i = 0; i < it_count; i++) {
res += calc(i);
}
return res;
}
6
Псевдокод после оптимизации
double calc(int i) {
return std::sin(i / 100.0) + std::cos(i / 100.0);
}
double get_res() {
const int it_count{100000000};
double res{0.0};
for (int i = 0; i < it_count; i++) {
res += std::sin(i / 100.0) + std::cos(i / 100.0);
}
return res;
}
7
Inline в Clang
› Passes: AlwaysInlinerPass, InlinerPass
› inline
› __attribute__((always_inline))
› __attribute__((noinline))
8
Inline пример 1
double calc_value(int i, int branch) {
switch (i) {...}
...
switch (i) {...}
double di = static_cast<double>(i);
if (branch) {
return i + di * di + di * di * di + di * di * di * 0.2 + i % 2;
} else {
return i * 0.1 + di * di / 2.0 + di * di * di * 0.3
+ di * di * di * 0.4 + i / 2.0;
}
}
9
double get_first_value(int i, int branch) {
return calc_value(i, branch);
}
double get_second_value(int i, int branch) {
return calc_value(i, branch);
}
double get_res() {
const int it_count{100000000};
double res{0.0};
for (int i = 0; i < it_count; i++) {
res += get_first_value(i, 0) + get_second_value(i, 0);
}
return res;
} 10
$ clang++ -O2 -Rpass=inline -c calc.cpp
calc.cpp:87:12: remark: _Z15get_first_valueii inlined into
_Z7get_resv with cost=0 (threshold=337) [-Rpass=inline]
res += get_first_value(i, 0) + get_second_value(i, 0) ;
^
calc.cpp:87:37: remark: _Z16get_second_valueii inlined into
_Z7get_resv with cost=0 (threshold=337) [-Rpass=inline]
res += get_first_value(i, 0) + get_second_value(i, 0) ;
11
$ clang++ -O2 -Rpass=inline -Rpass-missed=inline -c calc.cpp
calc.cpp:76:10: remark: _Z10calc_valueii not inlined into
_Z15get_first_valueii because too costly to inline
(cost=320, threshold=225) [-Rpass-missed=inline]
return calc_value(i, branch);
^
calc.cpp:80:10: remark: _Z10calc_valueii not inlined into
_Z16get_second_valueii because too costly to inline
(cost=320, threshold=225) [-Rpass-missed=inline]
return calc_value(i, branch);
...
12
$ sudo perf stat -B ./a.out
Spent 1109395529 ns
Performance counter stats for './a.out':
1 111,60 msec task-clock
3 context-switches
0 cpu-migrations
117 page-faults
4 261 552 332 cycles
4 604 496 295 instructions
1 100 771 723 branches
25 792 branch-misses
1,112127349 seconds time elapsed
1,112106000 seconds user
0,000000000 seconds sys 13
inline double calc_value(int i, int branch) {
...
}
14
// llvm/lib/Analysis/InlineCost.cpp:916
if (Callee.hasFnAttribute(Attribute::InlineHint))
Threshold = MaxIfValid(Threshold, Params.HintThreshold);
15
$ clang++ -O2 -Rpass=inline -c calc.cpp
calc.cpp:76:10: remark: _Z10calc_valueii inlined into
_Z15get_first_valueii with cost=320 (threshold=325)
[-Rpass=inline]
return calc_value(i, branch);
^
calc.cpp:80:10: remark: _Z10calc_valueii inlined into
_Z16get_second_valueii with cost=320 (threshold=325)
[-Rpass=inline]
return calc_value(i, branch);
^
16
$ sudo perf stat -B ./a.out
Spent 1823334387 ns
Performance counter stats for './a.out':
1 824,14 msec task-clock
4 context-switches
0 cpu-migrations
117 page-faults
7 239 846 942 cycles
8 405 174 671 instructions
2 100 884 626 branches
28 842 branch-misses
1,824386069 seconds time elapsed
1,824351000 seconds user
0,000000000 seconds sys 17
double get_first_value(int i, int branch) {
return calc_value(i, branch);
}
double get_second_value(int i, int branch) {
return calc_value(i, branch);
}
double get_res() {
const int it_count{100000000};
double res{0.0};
for (int i = 0; i < it_count; i++) {
res += get_first_value(i, 0) + get_second_value(i, 0);
}
return res;
} 18
Inline пример 2
double calc_value(int i, int branch) {
switch (i) {...}
...
switch (i) {...}
double di = static_cast<double>(i);
if (branch) {
return i + di * di + di * di * di + di * di * di * 0.2 + i % 2;
} else {
return i * 0.1 + di * di / 2.0 + di * di * di * 0.3 + di * di * di * 0.4 + i
/ 2.0;
}
}
19
double get_first_value(int i) {
return calc_value(i, 0);
}
double get_second_value(int i) {
return calc_value(i, 1);
}
double get_res() {
const int it_count{100000000};
double res{0.0};
for (int i = 0; i < it_count; i++) {
res += get_first_value(i) + get_second_value(i);
}
return res;
} 20
$ clang++ -O2 -Rpass=inline -c calc.cpp
calc.cpp:89:12: remark: _Z15get_first_valuei inlined into
_Z7get_resv with cost=5 (threshold=337) [-Rpass=inline]
res += get_first_value(i) + get_second_value(i);
^
calc.cpp:89:33: remark: _Z16get_second_valuei inlined into
_Z7get_resv with cost=5 (threshold=337) [-Rpass=inline]
res += get_first_value(i) + get_second_value(i);
21
$ sudo perf stat -B ./a.out
Spent 1691298621 ns
Performance counter stats for './a.out':
1 692,13 msec task-clock
2 context-switches
0 cpu-migrations
119 page-faults
6 762 472 189 cycles
8 405 218 118 instructions
2 100 883 212 branches
27 518 branch-misses
1,692417574 seconds time elapsed
1,692363000 seconds user
0,000000000 seconds sys
22
inline double calc_value(int i, int branch) {
...
}
23
$ clang++ -O2 -Rpass=inline -c calc.cpp
calc.cpp:78:10: remark: _Z10calc_valueii inlined into
_Z15get_first_valuei with cost=265 (threshold=325)
[-Rpass=inline]
return calc_value(i, 0);
^
calc.cpp:82:10: remark: _Z10calc_valueii inlined into
_Z16get_second_valuei with cost=255 (threshold=325)
[-Rpass=inline]
return calc_value(i, 1);
^
24
$ sudo perf stat -B ./a.out
Spent 1690155986 ns
Performance counter stats for './a.out':
1 691,02 msec task-clock
3 context-switches
0 cpu-migrations
116 page-faults
6 763 570 064 cycles
7 805 197 904 instructions
1 900 880 186 branches
28 067 branch-misses
1,691254028 seconds time elapsed
1,691260000 seconds user
0,000000000 seconds sys
25
inline double get_first_value(int i) { ... }
inline double get_second_value(int i) { ... }
26
$ clang++ -O2 -Rpass=inline -c calc.cpp
calc.cpp:78:10: remark: _Z10calc_valueii inlined into
_Z15get_first_valuei with cost=265 (threshold=325)
return calc_value(i, 0);
^
...
calc.cpp:89:12: remark: _Z15get_first_valuei inlined into
_Z7get_resv with cost=270 (threshold=325) [-Rpass=inline]
res += get_first_value(i) + get_second_value(i);
^
...
27
$ sudo perf stat -B ./a.out
Spent 1823334387 ns
Performance counter stats for './a.out':
424,66 msec task-clock
0 context-switches
0 cpu-migrations
115 page-faults
1 700 420 431 cycles
7 203 660 181 instructions
1 500 629 258 branches
20 223 branch-misses
0,424871806 seconds time elapsed
0,424891000 seconds user
0,000000000 seconds sys
28
double calc_value(int i, int branch) {
switch (i) {...}
...
switch (i) {...}
double di = static_cast<double>(i);
if (branch) {
return i + di * di + di * di * di + di * di * di * 0.2 + i % 2;
} else {
return i * 0.1 + di * di / 2.0 + di * di * di * 0.3 + di * di * di * 0.4 + i
/ 2.0;
}
}
29
__attribute__((always_inline)) double get_first_value(int i) { ... }
__attribute__((noinline)) double get_second_value(int i) { ... }
30
$ clang++ -O2 -Rpass=inline -Rpass-missed=inline -c calc.cpp
...
calc.cpp:89:12: remark: _Z15get_first_valuei inlined into
_Z7get_resv with cost=always [-Rpass=inline]
res += get_first_value(i) + get_second_value(i);
^
...
calc.cpp:89:33: remark: _Z16get_second_valuei not inlined into
_Z7get_resv because it should never be inlined
(cost=never) [-Rpass-missed=inline]
res += get_first_value(i) + get_second_value(i);
31
double get_res() {
const int it_count{1000000000};
double res{0.0};
for (int i = 0; i < it_count; i++) {
res += get_first_light_value(i) + get_second_light_value(i + 1);
if (i < 10) {
res += get_first_heavy_value(i, res);
res += get_second_heavy_value(i, res);
}
res += get_second_light_value(i) + get_first_light_value(i - 1);
}
return res;
}
32
Результаты за 10 запусков с always_inline и без:
среднее, мс p90 p10
по умолчанию 3567.6 3584.4 3555.1
always_inline 3701.5 3714.5 3691.9
33
Итого, inline
› Позволяет делать подсказки только у вызываемой функции.
› Можно облегчить компилятору работу, указав явно, как
поступать.
› Потенциально руками можно обогнать эвристики.
34
Loop unroll
Loop unroll
double calc(int i) {
return std::sin(i);
}
double get_res(int it_count) {
double res{0.0};
for (int i = 0; i < it_count; i++) {
res += calc(i);
}
return res;
}
36
IR перед оптимизацией
%1:
%2 = icmp sgt i32 %0, 0
br i1 %2, label %3, label %6
T F
%3:
br label %8
%6:
%7 = phi double [ 0.000000e+00, %1 ], [ %5, %4 ]
ret double %7
%8:
%9 = phi i32 [ %14, %8 ], [ 0, %3 ]
%10 = phi double [ %13, %8 ], [ 0.000000e+00, %3 ]
%11 = sitofp i32 %9 to double
%12 = tail call double @sin(double %11)
%13 = fadd double %10, %12
%14 = add nuw nsw i32 %9, 1
%15 = icmp eq i32 %14, %0
br i1 %15, label %4, label %8
T F
%4:
%5 = phi double [ %13, %8 ]
br label %6
37
IR после оптимизации
%1:
%2 = icmp sgt i32 %0, 0
br i1 %2, label %3, label %29
T F
%3:
%4 = add i32 %0, -1
%5 = and i32 %0, 3
%6 = icmp ult i32 %4, 3
br i1 %6, label %10, label %7
T F
%29:
%30 = phi double [ 0.000000e+00, %1 ], [ %28, %27 ]
ret double %30
%10:
%11 = phi double [ undef, %3 ], [ %49, %9 ]
%12 = phi i32 [ 0, %3 ], [ %50, %9 ]
%13 = phi double [ 0.000000e+00, %3 ], [ %49, %9 ]
%14 = icmp eq i32 %5, 0
br i1 %14, label %27, label %15
T F
%7:
%8 = sub i32 %0, %5
br label %31
%27:
%28 = phi double [ %11, %10 ], [ %22, %26 ]
br label %29
%15:
br label %16
%31:
%32 = phi i32 [ 0, %7 ], [ %50, %31 ]
%33 = phi double [ 0.000000e+00, %7 ], [ %49, %31 ]
%34 = phi i32 [ %8, %7 ], [ %51, %31 ]
%35 = sitofp i32 %32 to double
%36 = tail call double @sin(double %35)
%37 = fadd double %33, %36
%38 = or i32 %32, 1
%39 = sitofp i32 %38 to double
%40 = tail call double @sin(double %39)
%41 = fadd double %37, %40
%42 = or i32 %32, 2
%43 = sitofp i32 %42 to double
%44 = tail call double @sin(double %43)
%45 = fadd double %41, %44
%46 = or i32 %32, 3
%47 = sitofp i32 %46 to double
%48 = tail call double @sin(double %47)
%49 = fadd double %45, %48
%50 = add nuw nsw i32 %32, 4
%51 = add i32 %34, -4
%52 = icmp eq i32 %51, 0
br i1 %52, label %9, label %31
T F
%9:
br label %10
%16:
%17 = phi i32 [ %23, %16 ], [ %12, %15 ]
%18 = phi double [ %22, %16 ], [ %13, %15 ]
%19 = phi i32 [ %24, %16 ], [ %5, %15 ]
%20 = sitofp i32 %17 to double
%21 = tail call double @sin(double %20)
%22 = fadd double %18, %21
%23 = add nuw nsw i32 %17, 1
%24 = add i32 %19, -1
%25 = icmp eq i32 %24, 0
br i1 %25, label %26, label %16, !llvm.loop !0
T F
%26:
br label %27
38
// before
for (int i = 0; i < it_count; i++) {
res += calc(i);
}
// after
int i = 0;
for (; it_count - i > 3; i += 4) {
res += calc(i);
res += calc(i + 1);
res += calc(i + 2);
res += calc(i + 3);
}
for (; i < it_count; i++) {
res += calc(i);
} 39
Разворачивание циклов в Clang
› Pass: LoopUnrollPass
› #pragma unroll
› #pragma nounroll
› #pragma clang loop unroll(full)
› #pragma clang loop unroll_count(12)
40
// llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp:960
if (HasUnrollDisablePragma(L))
return LoopUnrollResult::Unmodified;
...
// llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp:747
if (ExplicitUnroll && TripCount != 0) {
// If the loop has an unrolling pragma, we want to be more aggressive with
// unrolling limits. Set thresholds to at least the PragmaThreshold value
// which is larger than the default limits.
UP.Threshold = std::max<unsigned>(UP.Threshold, PragmaUnrollThreshold);
UP.PartialThreshold =
std::max<unsigned>(UP.PartialThreshold, PragmaUnrollThreshold);
}
41
Loop unroll пример
double calc(int i) {
return std::sin(i);
}
double get_res_unroll_12(int it_count) {
double res{0.0};
#pragma clang loop unroll_count(12)
for (int i = 0; i < it_count; i++) {
res += calc(i);
}
return res;
}
42
%1:
%2 = icmp sgt i32 %0, 0
br i1 %2, label %3, label %27
T F
%3:
%4 = add i32 %0, -1
%5 = urem i32 %4, 12
%6 = add nuw nsw i32 %5, 1
%7 = urem i32 %6, 12
%8 = icmp ult i32 %4, 11
br i1 %8, label %11, label %9
T F
%27:
%28 = phi double [ 0.000000e+00, %1 ], [ %12, %11 ], [ %23, %17 ]
ret double %28
%11:
%12 = phi double [ undef, %3 ], [ %79, %29 ]
%13 = phi i32 [ 0, %3 ], [ %80, %29 ]
%14 = phi double [ 0.000000e+00, %3 ], [ %79, %29 ]
%15 = icmp eq i32 %7, 0
br i1 %15, label %27, label %16
T F
%9:
%10 = sub i32 %0, %7
br label %29
%16:
br label %17
%29:
%30 = phi i32 [ 0, %9 ], [ %80, %29 ]
%31 = phi double [ 0.000000e+00, %9 ], [ %79, %29 ]
%32 = phi i32 [ %10, %9 ], [ %81, %29 ]
%33 = sitofp i32 %30 to double
%34 = tail call double @sin(double %33) #2
%35 = fadd double %31, %34
%36 = or i32 %30, 1
%37 = sitofp i32 %36 to double
%38 = tail call double @sin(double %37) #2
%39 = fadd double %35, %38
%40 = or i32 %30, 2
%41 = sitofp i32 %40 to double
%42 = tail call double @sin(double %41) #2
%43 = fadd double %39, %42
%44 = or i32 %30, 3
%45 = sitofp i32 %44 to double
%46 = tail call double @sin(double %45) #2
%47 = fadd double %43, %46
%48 = add nuw nsw i32 %30, 4
%49 = sitofp i32 %48 to double
%50 = tail call double @sin(double %49) #2
%51 = fadd double %47, %50
%52 = add nuw nsw i32 %30, 5
%53 = sitofp i32 %52 to double
%54 = tail call double @sin(double %53) #2
%55 = fadd double %51, %54
%56 = add nuw nsw i32 %30, 6
%57 = sitofp i32 %56 to double
%58 = tail call double @sin(double %57) #2
%59 = fadd double %55, %58
%60 = add nuw nsw i32 %30, 7
%61 = sitofp i32 %60 to double
%62 = tail call double @sin(double %61) #2
%63 = fadd double %59, %62
%64 = add nuw nsw i32 %30, 8
%65 = sitofp i32 %64 to double
%66 = tail call double @sin(double %65) #2
%67 = fadd double %63, %66
%68 = add nuw nsw i32 %30, 9
%69 = sitofp i32 %68 to double
%70 = tail call double @sin(double %69) #2
%71 = fadd double %67, %70
%72 = add nuw nsw i32 %30, 10
%73 = sitofp i32 %72 to double
%74 = tail call double @sin(double %73) #2
%75 = fadd double %71, %74
%76 = add nuw nsw i32 %30, 11
%77 = sitofp i32 %76 to double
%78 = tail call double @sin(double %77) #2
%79 = fadd double %75, %78
%80 = add nuw nsw i32 %30, 12
%81 = add i32 %32, -12
%82 = icmp eq i32 %81, 0
br i1 %82, label %11, label %29, !llvm.loop !4
T F
%17:
%18 = phi i32 [ %24, %17 ], [ %13, %16 ]
%19 = phi double [ %23, %17 ], [ %14, %16 ]
%20 = phi i32 [ %25, %17 ], [ %7, %16 ]
%21 = sitofp i32 %18 to double
%22 = tail call double @sin(double %21) #2
%23 = fadd double %19, %22
%24 = add nuw nsw i32 %18, 1
%25 = add i32 %20, -1
%26 = icmp eq i32 %25, 0
br i1 %26, label %27, label %17, !llvm.loop !2
T F
43
Результаты за 10 запусков с различными прагмами:
branches average p90 p10
nounroll 1 001 796 322 1045.3 1067.3 1028.5
unroll 250 920 668 1038.6 1057.2 1019.8
unroll_count(12) 84 168 844 1023.7 1053.4 1009.9
44
Итого, разворачивание циклов
› Можно делать подсказки у конкретного цикла.
› Не является обязующей директивой.
› Потенциально руками можно обогнать эвристики.
45
Instruction combining
Instruction combining
int get_res(int a, int b, int c) {
return a * b + a * c + a + a + a + c - a;
}
.
47
Instruction combining: результаты
Было:
𝑎𝑏 + 𝑎𝑐 + 𝑎 + 𝑎 + 𝑎 + 𝑐 − 𝑎
Стало:
(𝑏 + 2 + 𝑐)𝑎 + 𝑐
48
Instruction combining в Clang
› Passes: InstCombinePass, ReassociatePass
› -ffast-math
› -fno-honor-infinities
› -fno-honor-nans
› -fno-math-errno
› -ffinite-math-only
› -fassociative-math
› -freciprocal-math
› -fno-signed-zeros
› -fno-trapping-math
› -ffp-contract=fast
49
float get_res(float a, float b, float c, float d, float e) {
return a * d + b * e + c * e;
}
int main() {
float a = 500000000.0f;
float b = -500000000.0f;
float c = 1.0f;
std::cout << get_res(a, b, c, 1.0f, 1.0f) << std::endl;
return 0;
}
50
$ clang++ -O2 main.cpp
$ ./a.out
1
$ clang++ -ffast-math -O2 main.cpp
$ ./a.out
0
51
// a = 500000000.0f;
// b = -500000000.0f;
// c, d, e = 1.0f;
float get_res(float a, float b, float c, float d, float e) {
return a * d + b * e + c * e;
}
float get_res_opt(float a, float b, float c, float d, float e) {
return a * d + (b + c) * e;
}
52
Итого, упрощение выражений
› Ключ для сборки — влияет на всю единицу трансляции.
› Говорит компилятору быть менее консервативным.
53
Branching
Branching
void function(bool should_fail) {
if (should_fail) {
...
// Some long, rarely used error handling
...
} else {
...
// Main execution path
...
}
}
55
Branching в Clang
› __builtin_expect(long exp, long c)
56
Branching пример
double calc(int i) {
if (__builtin_expect(i>0, 0)) {
switch (i) { ... }
switch (i) { ... }
double di = static_cast<double>(i);
return di * di + di * 0.1;
} else {
double di = static_cast<double>(i);
return di * di * di + di * 0.3;
}
}
57
Ассемблер без подсказок
0: test %edi,%edi
2: jle 44 <_Z4calci+0x44>
T F
4: lea -0x1(%rdi),%eax
7: cmp $0x2,%eax
a: mov $0xfffffffe,%eax
f: cmovb %eax,%edi
12: cmp $0x2,%edi
15: jbe 61 <_Z4calci+0x61>
...
44: cvtsi2sd %edi,%xmm1
48: movapd %xmm1,%xmm0
4c: mulsd %xmm0,%xmm0
50: mulsd %xmm1,%xmm0
54: mulsd 0x0(%rip),%xmm1
5b: 00
5c: addsd %xmm1,%xmm0
60: retq
58
Ассемблер с подсказками
0: test %edi,%edi
2: jg 21 <_Z4calci+0x21>
T F
21: lea -0x1(%rdi),%eax
24: cmp $0x2,%eax
27: mov $0xfffffffe,%eax
2c: cmovb %eax,%edi
2f: cmp $0x2,%edi
32: jbe 61 <_Z4calci+0x61>
...
4: cvtsi2sd %edi,%xmm1
8: movapd %xmm1,%xmm0
c: mulsd %xmm0,%xmm0
10: mulsd %xmm1,%xmm0
14: mulsd 0x0(%rip),%xmm1
1b: 00
1c: addsd %xmm1,%xmm0
20: retq
59
Итого, подсказки при ветвлении
› Можно делать подсказки у каждого отдельного условного
оператора.
› Компилятор не знает, на каких данных будет запускаться
программа.
60
Link-time optimization
Сборка
62
Сборка с LTO
63
// main.cpp
#include <iostream>
#include <chrono>
#include "source2.h"
int main() {
double res{0.0};
const auto start = std::chrono::high_resolution_clock::now().time_since_epoch
().count();
res = get_res();
const auto end = std::chrono::high_resolution_clock::now().time_since_epoch().
count();
std::cout << "res = " << res << std::endl;
...
return 0;
} 64
// source2.cpp
#include "source1.h"
double get_res() {
const int it_count{1000000000};
double res{0.0};
for (int i = 0; i < it_count; i++) {
res += calc_value(i, 0);
}
return res;
}
65
// source1.cpp
int calc_value(int i, int branch) {
int original_i = i;
switch (i) { ... }
switch (i) { ... }
...
if (branch) {
return i;
} else {
return original_i;
}
}
66
$ clang++ -O2 main.cpp source1.cpp source2.cpp
$ ./a.out
res = 5e+17
Spent 3536021382 ns (3536 ms)
4.2.1 Compatible Clang 6.0.0 (tags/RELEASE_600/final)
67
$ clang++ -O2 -flto -c main.cpp source1.cpp source2.cpp
$ clang++ -O2 -flto main.o source1.o source2.o
$ ./a.out
res = 5e+17
Spent 1013128016 ns (1013 ms)
4.2.1 Compatible Clang 6.0.0 (tags/RELEASE_600/final)
$ ls -n a.out
-rwxr-xr-x 1 1000 1000 9072 ноя 19 11:53 a.out
$ llvm-dis-6.0 source1.o -o source1.ll
68
$ clang++ -O2 -flto=thin -c main.cpp source1.cpp source2.cpp
$ clang++ -O2 -flto=thin main.o source1.o source2.o
$ ./a.out
res = 5e+17
Spent 1013771808 ns (1013 ms)
4.2.1 Compatible Clang 6.0.0 (tags/RELEASE_600/final)
$ ls -ln a.out
-rwxr-xr-x 1 1000 1000 13248 ноя 19 03:03 a.out
69
// source2.cpp
#include "source1.h"
double get_res() {
const int it_count{1000000000};
double res{0.0};
for (int i = 0; i < it_count; i++) {
res += calc_value(i, 0);
}
return res;
}
70
// source1.cpp
int calc_value(int i, int branch) {
int original_i = i;
switch (i) { ... }
switch (i) { ... }
...
if (branch) {
return i;
} else {
return original_i;
}
}
71
Итого, оптимизации на этапе линковки
› Требуют изменения процесса сборки проекта.
› Ограничивают возможный выбор линкера.
› Увеличивают время сборки (особенно инкрементальной).
› Дают компилятору больше контекста для оптимизаций.
72
Заключение
› LTO, если не жалко скорость сборки.
› Подсказки компилятору, если есть время и желание
проверять результат.
› Стоит изучать свои инструменты.
73
Спасибо за внимание!
Андрей Олейников
andreyol@yandex-team.ru

Как помочь и как помешать компилятору. Андрей Олейников ➠ CoreHard Autumn 2019

  • 1.
    Как помочь и как помешать компилятору Андрей Олейников, разработчик, Беспилотные автомобили Яндекса andreyol@yandex-team.ru
  • 2.
    Введение › Компиляторные оптимизации › Возможность на них повлиять Предупреждения › Clang/LLVM (6.0) › x64 (i7-8750H CPU) › Нестандартные расширения › Синтетические примеры https://github.com/duke-gh/CoreHard2019 2
  • 3.
  • 4.
    План доклада › Inline › Loop unrolling › Instruction combining › Branching › LTO 4
  • 5.
  • 6.
    Inline cpp double calc(inti) { return std::sin(i / 100.0) + std::cos(i / 100.0); } double get_res() { const int it_count{100000000}; double res{0.0}; for (int i = 0; i < it_count; i++) { res += calc(i); } return res; } 6
  • 7.
    Псевдокод после оптимизации doublecalc(int i) { return std::sin(i / 100.0) + std::cos(i / 100.0); } double get_res() { const int it_count{100000000}; double res{0.0}; for (int i = 0; i < it_count; i++) { res += std::sin(i / 100.0) + std::cos(i / 100.0); } return res; } 7
  • 8.
    Inline в Clang › Passes: AlwaysInlinerPass, InlinerPass › inline › __attribute__((always_inline)) › __attribute__((noinline)) 8
  • 9.
    Inline пример 1 doublecalc_value(int i, int branch) { switch (i) {...} ... switch (i) {...} double di = static_cast<double>(i); if (branch) { return i + di ∗ di + di ∗ di ∗ di + di ∗ di ∗ di ∗ 0.2 + i % 2; } else { return i ∗ 0.1 + di ∗ di / 2.0 + di ∗ di ∗ di ∗ 0.3 + di ∗ di ∗ di ∗ 0.4 + i / 2.0; } } 9
  • 10.
    double get_first_value(int i,int branch) { return calc_value(i, branch); } double get_second_value(int i, int branch) { return calc_value(i, branch); } double get_res() { const int it_count{100000000}; double res{0.0}; for (int i = 0; i < it_count; i++) { res += get_first_value(i, 0) + get_second_value(i, 0); } return res; } 10
  • 11.
    $ clang++ —O2—Rpass=inline —c calc.cpp calc.cpp:87:12: remark: _Z15get_first_valueii inlined into _Z7get_resv with cost=0 (threshold=337) [—Rpass=inline] res += get_first_value(i, 0) + get_second_value(i, 0) ; ^ calc.cpp:87:37: remark: _Z16get_second_valueii inlined into _Z7get_resv with cost=0 (threshold=337) [—Rpass=inline] res += get_first_value(i, 0) + get_second_value(i, 0) ; 11
  • 12.
    $ clang++ -O2-Rpass=inline -Rpass-missed=inline -c calc.cpp calc.cpp:76:10: remark: _Z10calc_valueii not inlined into _Z15get_first_valueii because too costly to inline (cost=320, threshold=225) [-Rpass-missed=inline] return calc_value(i, branch); ^ calc.cpp:80:10: remark: _Z10calc_valueii not inlined into _Z16get_second_valueii because too costly to inline (cost=320, threshold=225) [-Rpass-missed=inline] return calc_value(i, branch); ... 12
  • 13.
    $ sudo perfstat -B ./a.out Spent 1109395529 ns Performance counter stats for './a.out': 1 111,60 msec task-clock 3 context-switches 0 cpu-migrations 117 page-faults 4 261 552 332 cycles 4 604 496 295 instructions 1 100 771 723 branches 25 792 branch-misses 1,112127349 seconds time elapsed 1,112106000 seconds user 0,000000000 seconds sys 13
  • 14.
    inline double calc_value(inti, int branch) { ... } 14
  • 15.
  • 16.
    $ clang++ -O2-Rpass=inline -c calc.cpp calc.cpp:76:10: remark: _Z10calc_valueii inlined into _Z15get_first_valueii with cost=320 (threshold=325) [-Rpass=inline] return calc_value(i, branch); ^ calc.cpp:80:10: remark: _Z10calc_valueii inlined into _Z16get_second_valueii with cost=320 (threshold=325) [-Rpass=inline] return calc_value(i, branch); ^ 16
  • 17.
    $ sudo perfstat -B ./a.out Spent 1823334387 ns Performance counter stats for './a.out': 1 824,14 msec task-clock 4 context-switches 0 cpu-migrations 117 page-faults 7 239 846 942 cycles 8 405 174 671 instructions 2 100 884 626 branches 28 842 branch-misses 1,824386069 seconds time elapsed 1,824351000 seconds user 0,000000000 seconds sys 17
  • 18.
    double get_first_value(int i,int branch) { return calc_value(i, branch); } double get_second_value(int i, int branch) { return calc_value(i, branch); } double get_res() { const int it_count{100000000}; double res{0.0}; for (int i = 0; i < it_count; i++) { res += get_first_value(i, 0) + get_second_value(i, 0); } return res; } 18
  • 19.
    Inline пример 2 doublecalc_value(int i, int branch) { switch (i) {...} ... switch (i) {...} double di = static_cast<double>(i); if (branch) { return i + di ∗ di + di ∗ di ∗ di + di ∗ di ∗ di ∗ 0.2 + i % 2; } else { return i ∗ 0.1 + di ∗ di / 2.0 + di ∗ di ∗ di ∗ 0.3 + di ∗ di ∗ di ∗ 0.4 + i / 2.0; } } 19
  • 20.
    double get_first_value(int i){ return calc_value(i, 0); } double get_second_value(int i) { return calc_value(i, 1); } double get_res() { const int it_count{100000000}; double res{0.0}; for (int i = 0; i < it_count; i++) { res += get_first_value(i) + get_second_value(i); } return res; } 20
  • 21.
    $ clang++ -O2-Rpass=inline -c calc.cpp calc.cpp:89:12: remark: _Z15get_first_valuei inlined into _Z7get_resv with cost=5 (threshold=337) [-Rpass=inline] res += get_first_value(i) + get_second_value(i); ^ calc.cpp:89:33: remark: _Z16get_second_valuei inlined into _Z7get_resv with cost=5 (threshold=337) [-Rpass=inline] res += get_first_value(i) + get_second_value(i); 21
  • 22.
    $ sudo perfstat -B ./a.out Spent 1691298621 ns Performance counter stats for './a.out': 1 692,13 msec task-clock 2 context-switches 0 cpu-migrations 119 page-faults 6 762 472 189 cycles 8 405 218 118 instructions 2 100 883 212 branches 27 518 branch-misses 1,692417574 seconds time elapsed 1,692363000 seconds user 0,000000000 seconds sys 22
  • 23.
    inline double calc_value(inti, int branch) { ... } 23
  • 24.
    $ clang++ -O2-Rpass=inline -c calc.cpp calc.cpp:78:10: remark: _Z10calc_valueii inlined into _Z15get_first_valuei with cost=265 (threshold=325) [- Rpass=inline] return calc_value(i, 0); ^ calc.cpp:82:10: remark: _Z10calc_valueii inlined into _Z16get_second_valuei with cost=255 (threshold=325) [- Rpass=inline] return calc_value(i, 1); ^ 24
  • 25.
    $ sudo perfstat -B ./a.out Spent 1690155986 ns Performance counter stats for './a.out': 1 691,02 msec task-clock 3 context-switches 0 cpu-migrations 116 page-faults 6 763 570 064 cycles 7 805 197 904 instructions 1 900 880 186 branches 28 067 branch-misses 1,691254028 seconds time elapsed 1,691260000 seconds user 0,000000000 seconds sys 25
  • 26.
    inline double get_first_value(inti) { ... } inline double get_second_value(int i) { ... } 26
  • 27.
    $ clang++ -O2-Rpass=inline -c calc.cpp calc.cpp:78:10: remark: _Z10calc_valueii inlined into _Z15get_first_valuei with cost=265 (threshold=325) return calc_value(i, 0); ^ ... calc.cpp:89:12: remark: _Z15get_first_valuei inlined into _Z7get_resv with cost=270 (threshold=325) [-Rpass=inline] res += get_first_value(i) + get_second_value(i); ^ ... 27
  • 28.
    $ sudo perfstat -B ./a.out Spent 1823334387 ns Performance counter stats for './a.out': 424,66 msec task-clock 0 context-switches 0 cpu-migrations 115 page-faults 1 700 420 431 cycles 7 203 660 181 instructions 1 500 629 258 branches 20 223 branch-misses 0,424871806 seconds time elapsed 0,424891000 seconds user 0,000000000 seconds sys 28
  • 29.
    double calc_value(int i,int branch) { switch (i) {...} ... switch (i) {...} double di = static_cast<double>(i); if (branch) { return i + di ∗ di + di ∗ di ∗ di + di ∗ di ∗ di ∗ 0.2 + i % 2; } else { return i ∗ 0.1 + di ∗ di / 2.0 + di ∗ di ∗ di ∗ 0.3 + di ∗ di ∗ di ∗ 0.4 + i / 2.0; } } 29
  • 30.
    __attribute__((always_inline)) double get_first_value(inti) { ... } __attribute__((noinline)) double get_second_value(int i) { ... } 30
  • 31.
    $ clang++ -O2-Rpass=inline -Rpass-missed=inline -c calc.cpp ... calc.cpp:89:12: remark: _Z15get_first_valuei inlined into _Z7get_resv with cost=always [-Rpass=inline] res += get_first_value(i) + get_second_value(i); ^ ... calc.cpp:89:33: remark: _Z16get_second_valuei not inlined into _Z7get_resv because it should never be inlined (cost=never) [-Rpass-missed=inline] res += get_first_value(i) + get_second_value(i); 31
  • 32.
    double get_res() { constint it_count{1000000000}; double res{0.0}; for (int i = 0; i < it_count; i++) { res += get_first_light_value(i) + get_second_light_value(i + 1); if (i < 10) { res += get_first_heavy_value(i, res); res += get_second_heavy_value(i, res); } res += get_second_light_value(i) + get_first_light_value(i — 1); } return res; } 32
  • 33.
    Результаты за 10 запусков с always_inline и без: среднее, мс p90 p10 по умолчанию 3567.6 3584.4 3555.1 always_inline 3701.5 3714.5 3691.9 33
  • 34.
    Итого, inline › Позволяет делать подсказки только у вызываемой функции. › Можно облегчить компилятору работу, указав явно, как поступать. › Потенциально руками можно обогнать эвристики. 34
  • 35.
  • 36.
    Loop unroll double calc(inti) { return std::sin(i); } double get_res(int it_count) { double res{0.0}; for (int i = 0; i < it_count; i++) { res += calc(i); } return res; } 36
  • 37.
    IR перед оптимизацией %1: %2= icmp sgt i32 %0, 0 br i1 %2, label %3, label %6 T F %3: br label %8 %6: %7 = phi double [ 0.000000e+00, %1 ], [ %5, %4 ] ret double %7 %8: %9 = phi i32 [ %14, %8 ], [ 0, %3 ] %10 = phi double [ %13, %8 ], [ 0.000000e+00, %3 ] %11 = sitofp i32 %9 to double %12 = tail call double @sin(double %11) %13 = fadd double %10, %12 %14 = add nuw nsw i32 %9, 1 %15 = icmp eq i32 %14, %0 br i1 %15, label %4, label %8 T F %4: %5 = phi double [ %13, %8 ] br label %6 37
  • 38.
    IR после оптимизации %1: %2= icmp sgt i32 %0, 0 br i1 %2, label %3, label %29 T F %3: %4 = add i32 %0, -1 %5 = and i32 %0, 3 %6 = icmp ult i32 %4, 3 br i1 %6, label %10, label %7 T F %29: %30 = phi double [ 0.000000e+00, %1 ], [ %28, %27 ] ret double %30 %10: %11 = phi double [ undef, %3 ], [ %49, %9 ] %12 = phi i32 [ 0, %3 ], [ %50, %9 ] %13 = phi double [ 0.000000e+00, %3 ], [ %49, %9 ] %14 = icmp eq i32 %5, 0 br i1 %14, label %27, label %15 T F %7: %8 = sub i32 %0, %5 br label %31 %27: %28 = phi double [ %11, %10 ], [ %22, %26 ] br label %29 %15: br label %16 %31: %32 = phi i32 [ 0, %7 ], [ %50, %31 ] %33 = phi double [ 0.000000e+00, %7 ], [ %49, %31 ] %34 = phi i32 [ %8, %7 ], [ %51, %31 ] %35 = sitofp i32 %32 to double %36 = tail call double @sin(double %35) %37 = fadd double %33, %36 %38 = or i32 %32, 1 %39 = sitofp i32 %38 to double %40 = tail call double @sin(double %39) %41 = fadd double %37, %40 %42 = or i32 %32, 2 %43 = sitofp i32 %42 to double %44 = tail call double @sin(double %43) %45 = fadd double %41, %44 %46 = or i32 %32, 3 %47 = sitofp i32 %46 to double %48 = tail call double @sin(double %47) %49 = fadd double %45, %48 %50 = add nuw nsw i32 %32, 4 %51 = add i32 %34, -4 %52 = icmp eq i32 %51, 0 br i1 %52, label %9, label %31 T F %9: br label %10 %16: %17 = phi i32 [ %23, %16 ], [ %12, %15 ] %18 = phi double [ %22, %16 ], [ %13, %15 ] %19 = phi i32 [ %24, %16 ], [ %5, %15 ] %20 = sitofp i32 %17 to double %21 = tail call double @sin(double %20) %22 = fadd double %18, %21 %23 = add nuw nsw i32 %17, 1 %24 = add i32 %19, -1 %25 = icmp eq i32 %24, 0 br i1 %25, label %26, label %16, !llvm.loop !0 T F %26: br label %27 38
  • 39.
    // before for (inti = 0; i < it_count; i++) { res += calc(i); } // after int i = 0; for (; it_count — i > 3; i += 4) { res += calc(i); res += calc(i + 1); res += calc(i + 2); res += calc(i + 3); } for (; i < it_count; i++) { res += calc(i); } 39
  • 40.
    Разворачивание циклов в Clang › Pass: LoopUnrollPass › #pragma unroll › #pragma nounroll › #pragma clang loop unroll(full) › #pragma clang loop unroll_count(12) 40
  • 41.
    // llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp:960 if (HasUnrollDisablePragma(L)) returnLoopUnrollResult::Unmodified; ... // llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp:747 if (ExplicitUnroll && TripCount != 0) { // If the loop has an unrolling pragma, we want to be more aggressive with // unrolling limits. Set thresholds to at least the PragmaThreshold value // which is larger than the default limits. UP.Threshold = std::max<unsigned>(UP.Threshold, PragmaUnrollThreshold); UP.PartialThreshold = std::max<unsigned>(UP.PartialThreshold, PragmaUnrollThreshold); } 41
  • 42.
    Loop unroll пример doublecalc(int i) { return std::sin(i); } double get_res_unroll_12(int it_count) { double res{0.0}; #pragma clang loop unroll_count(12) for (int i = 0; i < it_count; i++) { res += calc(i); } return res; } 42
  • 43.
    %1: %2 = icmpsgt i32 %0, 0 br i1 %2, label %3, label %27 T F %3: %4 = add i32 %0, -1 %5 = urem i32 %4, 12 %6 = add nuw nsw i32 %5, 1 %7 = urem i32 %6, 12 %8 = icmp ult i32 %4, 11 br i1 %8, label %11, label %9 T F %27: %28 = phi double [ 0.000000e+00, %1 ], [ %12, %11 ], [ %23, %17 ] ret double %28 %11: %12 = phi double [ undef, %3 ], [ %79, %29 ] %13 = phi i32 [ 0, %3 ], [ %80, %29 ] %14 = phi double [ 0.000000e+00, %3 ], [ %79, %29 ] %15 = icmp eq i32 %7, 0 br i1 %15, label %27, label %16 T F %9: %10 = sub i32 %0, %7 br label %29 %16: br label %17 %29: %30 = phi i32 [ 0, %9 ], [ %80, %29 ] %31 = phi double [ 0.000000e+00, %9 ], [ %79, %29 ] %32 = phi i32 [ %10, %9 ], [ %81, %29 ] %33 = sitofp i32 %30 to double %34 = tail call double @sin(double %33) #2 %35 = fadd double %31, %34 %36 = or i32 %30, 1 %37 = sitofp i32 %36 to double %38 = tail call double @sin(double %37) #2 %39 = fadd double %35, %38 %40 = or i32 %30, 2 %41 = sitofp i32 %40 to double %42 = tail call double @sin(double %41) #2 %43 = fadd double %39, %42 %44 = or i32 %30, 3 %45 = sitofp i32 %44 to double %46 = tail call double @sin(double %45) #2 %47 = fadd double %43, %46 %48 = add nuw nsw i32 %30, 4 %49 = sitofp i32 %48 to double %50 = tail call double @sin(double %49) #2 %51 = fadd double %47, %50 %52 = add nuw nsw i32 %30, 5 %53 = sitofp i32 %52 to double %54 = tail call double @sin(double %53) #2 %55 = fadd double %51, %54 %56 = add nuw nsw i32 %30, 6 %57 = sitofp i32 %56 to double %58 = tail call double @sin(double %57) #2 %59 = fadd double %55, %58 %60 = add nuw nsw i32 %30, 7 %61 = sitofp i32 %60 to double %62 = tail call double @sin(double %61) #2 %63 = fadd double %59, %62 %64 = add nuw nsw i32 %30, 8 %65 = sitofp i32 %64 to double %66 = tail call double @sin(double %65) #2 %67 = fadd double %63, %66 %68 = add nuw nsw i32 %30, 9 %69 = sitofp i32 %68 to double %70 = tail call double @sin(double %69) #2 %71 = fadd double %67, %70 %72 = add nuw nsw i32 %30, 10 %73 = sitofp i32 %72 to double 
%74 = tail call double @sin(double %73) #2 %75 = fadd double %71, %74 %76 = add nuw nsw i32 %30, 11 %77 = sitofp i32 %76 to double %78 = tail call double @sin(double %77) #2 %79 = fadd double %75, %78 %80 = add nuw nsw i32 %30, 12 %81 = add i32 %32, -12 %82 = icmp eq i32 %81, 0 br i1 %82, label %11, label %29, !llvm.loop !4 T F %17: %18 = phi i32 [ %24, %17 ], [ %13, %16 ] %19 = phi double [ %23, %17 ], [ %14, %16 ] %20 = phi i32 [ %25, %17 ], [ %7, %16 ] %21 = sitofp i32 %18 to double %22 = tail call double @sin(double %21) #2 %23 = fadd double %19, %22 %24 = add nuw nsw i32 %18, 1 %25 = add i32 %20, -1 %26 = icmp eq i32 %25, 0 br i1 %26, label %27, label %17, !llvm.loop !2 T F 43
  • 44.
    Результаты за 10 запусков с различными прагмами: branches average p90 p10 nounroll 1 001 796 322 1045.3 1067.3 1028.5 unroll 250 920 668 1038.6 1057.2 1019.8 unroll_count(12) 84 168 844 1023.7 1053.4 1009.9 44
  • 45.
    Итого, разворачивание циклов › Можно делать подсказки у конкретного цикла. › Не является обязующей директивой. › Потенциально руками можно обогнать эвристики. 45
  • 46.
  • 47.
    Instruction combining int get_res(inta, int b, int c) { return a ∗ b + a ∗ c + a + a + a + c — a; } . 47
  • 48.
    Instruction combining: результаты Было: 𝑎𝑏+ 𝑎𝑐 + 𝑎 + 𝑎 + 𝑎 + 𝑐 − 𝑎 Стало: (𝑏 + 2 + 𝑐)𝑎 + 𝑐 48
  • 49.
    Instruction combining в Clang › Passes: InstCombinePass, ReassociatePass › -ffast-math › -fno-honor-infinities › -fno-honor-nans › -fno-math-errno › -ffinite-math › -fassociative-math › -freciprocal-math › -fno-signed-zeros › -fno-trapping-math › -ffp-contract=fast 49
  • 50.
    float get_res(float a,float b, float c, float d, float e) { return a ∗ d + b ∗ e + c ∗ e; } int main() { float a = 500000000.0f; float b = —500000000.0f; float c = 1.0f; std::cout << get_res(a, b, c, 1.0f, 1.0f) << std::endl; return 0; } 50
  • 51.
    $ clang++ -O2 main.cpp $ ./a.out 1 $ clang++ -ffast-math -O2 main.cpp $ ./a.out 0 51
  • 52.
    // a =500000000.0f; // b = —500000000.0f; // c, d, e = 1.0f; float get_res(float a, float b, float c, float d, float e) { return a ∗ d + b ∗ e + c ∗ e; } float get_res_opt(float a, float b, float c, float d, float e) { return a ∗ d + (b + c) ∗ e; } 52
  • 53.
    Итого, упрощение выражений › Ключ для сборки - влияет на всю единицу трансляции. › Говорит компилятору быть менее консервативным. 53
  • 54.
  • 55.
    Branching void function(bool should_fail){ if (should_fail) { ... // Some long, rarely used error handling ... } else { ... // Main execution path ... } } 55
  • 56.
    Branching в Clang › __builtin_expect(long exp, long c) 56
  • 57.
    Branching пример double calc(inti) { if (__builtin_expect(i>0, 0)) { switch (i) { ... } switch (i) { ... } double di = static_cast<double>(i); return di ∗ di + di ∗ 0.1; } else { double di = static_cast<double>(i); return di ∗ di ∗ di + di ∗ 0.3; } } 57
  • 58.
    Ассемблер без подсказок 0:test %edi,%edi 2: jle 44 <_Z4calci+0x44> T F 4: lea -0x1(%rdi),%eax 7: cmp $0x2,%eax a: mov $0xfffffffe,%eax f: cmovb %eax,%edi 12: cmp $0x2,%edi 15: jbe 61 <_Z4calci+0x61> ... 44: cvtsi2sd %edi,%xmm1 48: movapd %xmm1,%xmm0 4c: mulsd %xmm0,%xmm0 50: mulsd %xmm1,%xmm0 54: mulsd 0x0(%rip),%xmm1 5b: 00 5c: addsd %xmm1,%xmm0 60:retq 58
  • 59.
    Ассемблер с подсказками 0:test %edi,%edi 2: jg 21 <_Z4calci+0x21> T F 21: lea -0x1(%rdi),%eax 24: cmp $0x2,%eax 27: mov $0xfffffffe,%eax 2c: cmovb %eax,%edi 2f: cmp $0x2,%edi 32: jbe 61 <_Z4calci+0x61> ... 4: cvtsi2sd %edi,%xmm1 8: movapd %xmm1,%xmm0 c: mulsd %xmm0,%xmm0 10: mulsd %xmm1,%xmm0 14: mulsd 0x0(%rip),%xmm1 1b: 00 1c: addsd %xmm1,%xmm0 20: retq 59
  • 60.
    Итого, подсказки при ветвлении › Можно делать подсказки у каждого отдельного условного оператора. › Компилятор не знает, на каких данных будет запускаться программа. 60
  • 61.
  • 62.
  • 63.
  • 64.
    // main.cpp #include <iostream> #include<chrono> #include "source2.h" int main() { double res{0.0}; const auto start = std::chrono::high_resolution_clock::now().time_since_epoch ().count(); res = get_res(); const auto end = std::chrono::high_resolution_clock::now().time_since_epoch(). count(); std::cout << "res = " << res << std::endl; ... return 0; } 64
  • 65.
    // source2.cpp #include "source1.h" doubleget_res() { const int it_count{1000000000}; double res{0.0}; for (int i = 0; i < it_count; i++) { res += calc_value(i, 0); } return res; } 65
  • 66.
    // source1.cpp int calc_value(inti, int branch) { int original_i = i; switch (i) { ... } switch (i) { ... } ... if (branch) { return i; } else { return original_i; } } 66
  • 67.
    $ clang++ -O2 main.cpp source1.cpp source2.cpp $ ./a.out res = 5e+17 Spent 3536021382 ns (3536 ms) 4.2.1 Compatible Clang 6.0.0 (tags/RELEASE_600/final) 67
  • 68.
    $ clang++ -O2 -flto -c main.cpp source1.cpp source2.cpp $ clang++ -O2 -flto main.o source1.o source2.o $ ./a.out res = 5e+17 Spent 1013128016 ns (1013 ms) 4.2.1 Compatible Clang 6.0.0 (tags/RELEASE_600/final) $ ls -n a.out -rwxr-xr-x 1 1000 1000 9072 ноя 19 11:53 a.out $ llvm-dis-6.0 source1.o -o source1.ll 68
  • 69.
    $ clang++ -O2 -flto=thin -c main.cpp source1.cpp source2.cpp $ clang++ -O2 -flto=thin main.o source1.o source2.o $ ./a.out res = 5e+17 Spent 1013771808 ns (1013 ms) 4.2.1 Compatible Clang 6.0.0 (tags/RELEASE_600/final) $ ls -ln a.out -rwxr-xr-x 1 1000 1000 13248 ноя 19 03:03 a.out 69
  • 70.
    // source2.cpp #include "source1.h" doubleget_res() { const int it_count{1000000000}; double res{0.0}; for (int i = 0; i < it_count; i++) { res += calc_value(i, 0); } return res; } 70
  • 71.
    // source1.cpp int calc_value(inti, int branch) { int original_i = i; switch (i) { ... } switch (i) { ... } ... if (branch) { return i; } else { return original_i; } } 71
  • 72.
    Итого, оптимизации на этапе линковки › Требуют изменения процесса сборки проекта. › Ограничивают возможный выбор линкера. › Увеличивают время сборки (особенно инкрементальной). › Дают компилятору больше контекста для оптимизаций. 72
  • 73.
    Заключение › LTO, если не жалко скорость сборки. › Подсказки компилятору, если есть время и желание проверять результат. › Стоит изучать свои инструменты 73
  • 74.
    Спасибо за внимание! Андрей Олейников andreyol@yandex-team.ru