Successfully reported this slideshow.
We use your LinkedIn profile and activity data to personalize ads and to show you more relevant ads. You can change your ad preferences anytime.

.NET Fest 2019. Николай Балакин. Микрооптимизации в мире .NET

21 views

Published on

Что делать, если всё, что можно, уже закэшировано, а код всё ещё тормозит? В этом докладе мы обсудим, как работают некоторые низкоуровневые механизмы .NET и как мы с их помощью можем выиграть драгоценные секунды, когда счёт идёт на отдельные такты процессора.

Published in: Education
  • Be the first to comment

  • Be the first to like this

.NET Fest 2019. Николай Балакин. Микрооптимизации в мире .NET

  1. 1. Micro-optimizations in .NET world .NET LEVEL UP .NET CONFERENCE #1 IN UKRAINE KYIV 2019
  2. 2. What is micro-optimization
  3. 3. What is micro-optimization DataController.Get MyService.GetData DataRepository.Load DbCommand.ExecuteRead MyService.Process MyService.Calculate Total time Own time Method 15 234 ms 15 221 ms 14 163 ms 14 028 ms 1 034 ms 975 ms 13 ms 24 ms 135 ms 3 ms 59 ms 975 ms
  4. 4. What is micro-optimization DataController.Get MyService.GetData DataRepository.Load DbCommand.ExecuteRead MyService.Process MyService.Calculate Total time Own time Method 15 234 ms 15 221 ms 832 ms 697 ms 14 365 ms 14 227 ms 13 ms 24 ms 135 ms 5 ms 138 ms 14 227 ms
  5. 5. What is micro-optimization Noun micro-optimization (plural micro-optimizations) (programming, computer architecture)
 Optimization at the level of individual instructions and operations.
  6. 6. Неделя оптимизации кода А разговоров-то было
  7. 7. Intrinsics
  8. 8. Intrinsics Noun intrinsic (plural intrinsics) (computing, programming) A built-in function that is implemented directly by the compiler, without any intermediate call to a library.
  9. 9. 2015 2014 2013 2012 2011 2010 2009 2008 2007 2006 2005 2004 2003 2002 2001 2000 1999 1998 1997 MMX SSE 3DNow! SSE2 SSE4 SSSE3 SSE3 ADX AVX2, BMI2 AVX FMA4 CLMUL, AES-NI FMA3, BMI1 AVX-512
  10. 10. PADDW (Add Packed Integers) x0x1x2x3x4x5x6x7 y0y1y2y3y4y5y6y7 x0+y0x1+y1x2+y2x3+y3x4+y4x5+y5x6+y6x7+y7 Source 1 Source 2 Result
  11. 11. PDEP (Parallel Bits Deposit) x0x1x2x3x4x5x6x7Source 1 Source 2 Result 00101100 x0x1x2x3x4x5 0010 xn 000 … … 00x00x1x20000x30000 …
  12. 12. Intrinsics • SSE, SSE2, SSE3, SSE4, SSE4.1, SSE4.2, SSSE3 • Lzcnt, Popcnt • AVX, AVX2 • AES • BMI1, BMI2 • FMA • CLMUL • NEON (ARM)
  13. 13. using System.Runtime.Intrinsics.X86; uint CalculateCrc32(byte[] data) {     if (Sse42.IsSupported)     {         uint result = 0;         foreach (var b in data)             result = Sse42.Crc32(result, b);         return result;     }     else     {         // Fallback implementation of the method         // without using intrinsics     } }
  14. 14. namespace System.Numerics {     public static class BitOperations     {         public static int Log2(uint value)         {             if (Lzcnt.IsSupported)             {                 if (value == 0)                 {                     return 0;                 }                 // LZCNT contract is 0->32                 return 31 - (int)Lzcnt.LeadingZeroCount(value);             }             // Fallback contract is 0->0             return Log2SoftwareFallback(value);         } } } Benchmark: | Time | ---------- |--------:| Lzcnt | 0.58 ns | Fallback | 1.52 ns |
  15. 15. Vectorization byte[] data = ... for (var i = 0; i < data.Length; i++) {     var item = data[i];     DoAction(item); } byte[] data = ... for (var i = 0; i < data.Length; i += 8) {     var vector = data[i..i+8];     DoAction(vector); } Verb vectorize (computing, transitive)
 To convert a program that operates on scalar values into the equivalent program operating on vectors.
  16. 16. public static int IndexOf(this ReadOnlySpan<char> source, char value, int startIndex) {     for (int i = startIndex; i < source.Length; i++)     {         if (source[i] == value)         {             return i;         }     }     return -1; }
  17. 17. public static unsafe int IndexOf(ref char searchSpace, char value, int length) { if (Sse2.IsSupported)         Determine how many to iterate to get data 16 bytes aligned SequentialScan: Iterate byte by byte if (Avx2.IsSupported)     { if (not 32 bytes aligned) Check 16 bytes using SSE2 Iterate by 32 bytes using AVX2 if (more than 16 bytes left) Check 16 bytes using SSE2 if (not all data iterated) goto SequentialScan;     }     else if (Sse2.IsSupported)     { Iterate by 16 bytes using SSE2 if (not all data iterated) goto SequentialScan: } Found: return offset; } Benchmark: Length | Time | ----- |--------:|-----------:| Old | 15 | 8.817 ns | New | 15 | 4.577 ns | Old | 1024 | 68.530 ns | New | 1024 | 49.741 ns |
  18. 18. Heap allocations
  19. 19. (almost) New features • ArrayPool class • Span and ReadOnlySpan structures • stackalloc
  20. 20. namespace System {     internal static class DateTimeFormat     {         internal static string Format(DateTime dateTime, string? format, IFormatProvider? provider, TimeSpan offset)         {             if (format != null && format.Length == 1)             {                 // Optimize for these standard formats that are not affected by culture.                 switch (format[0])                 {                     // Round trip format                     case 'o':                     case 'O':                         const int MaxFormatOLength = 33;                         Span<char> span = stackalloc char[MaxFormatOLength];                         TryFormatO(dateTime, offset, span, out int ochars);                         return span.Slice(0, ochars).ToString();                 }             } // ... More code goes here...         } } }
  21. 21. namespace System.Data.SqlClient {     internal sealed partial class TdsParser {         private bool TryReadSqlDateTime(SqlBuffer value, byte tdsType, int length, byte scale, TdsParserStateObject stateObj)         {             Span<byte> datetimeBuffer = ((uint)length <= 16) ? stackalloc byte[16] : new byte[length];             if (!stateObj.TryReadByteArray(datetimeBuffer, length))             {                 return false;             }             ReadOnlySpan<byte> dateTimeData = datetimeBuffer.Slice(0, length); // ... More code goes here... } } }
  22. 22. namespace System.IO {     public abstract partial class TextReader : MarshalByRefObject, IDisposable {         public virtual async Task<string> ReadToEndAsync()         {             var sb = new StringBuilder(4096);             char[] chars = ArrayPool<char>.Shared.Rent(4096);             try             {                 int len;                 while ((len = await ReadAsyncInternal(chars, default).ConfigureAwait(false)) != 0)                 {                     sb.Append(chars, 0, len);                 }             }             finally             {                 ArrayPool<char>.Shared.Return(chars);             }             return sb.ToString();         } } }
  23. 23. namespace System.Text.Json {     public static partial class JsonSerializer     {         private static ReadOnlySpan<byte> GetUnescapedString(ReadOnlySpan<byte> utf8Source, int idx)         {             // The escaped name is always longer than the unescaped, so it is safe to use escaped name for the buffer length.             int length = utf8Source.Length;             byte[] pooledName = null;             Span<byte> unescapedName = length <= JsonConstants.StackallocThreshold ?                 stackalloc byte[length] :                 (pooledName = ArrayPool<byte>.Shared.Rent(length));             JsonReaderHelper.Unescape(utf8Source, unescapedName, idx, out int written);             ReadOnlySpan<byte> propertyName = unescapedName.Slice(0, written).ToArray();             if (pooledName != null)             {                 // We clear the array because it is "user data" (although a property name).                 new Span<byte>(pooledName, 0, written).Clear();                 ArrayPool<byte>.Shared.Return(pooledName);             }             return propertyName;         }     } }
  24. 24. namespace System {     public readonly struct Range : IEquatable<Range> {         public override string ToString()         { // 2 for "..", then for each index 1 for '^' and 10 for longest possible uint             Span<char> span = stackalloc char[2 + (2 * 11)];             int charsWritten;             int pos = 0;             if (Start.IsFromEnd)             {                 span[0] = '^';                 pos = 1;             }             bool formatted = ((uint)Start.Value).TryFormat(span.Slice(pos), out charsWritten);             pos += charsWritten;             span[pos++] = '.';             span[pos++] = '.';             if (End.IsFromEnd)             {                 span[pos++] = '^';             }             formatted = ((uint)End.Value).TryFormat(span.Slice(pos), out charsWritten);             pos += charsWritten;             return new string(span.Slice(0, pos));         } } }
  25. 25. Parsing with Span
  26. 26. Object stack allocation class AddOperation {     public int First { get; set; }     public int Second { get; set; }     public int Calculate()     {         return First + Second;     } } public int Test() {     var operation = new AddOperation();     operation.First = _first;     operation.Second = _second;     return operation.Calculate(); } Object stack allocation disabled: push rbx mov rbx, rdi movabs rdi, 0x113dc1bc0 call 0x1049f2920 (JitHelp: CORINFO_HELP_NEWSFAST) mov edi, dword ptr [rbx + 0x8] mov dword ptr [rax + 0x8], edi mov edi, dword ptr [rbx + 0xc] mov dword ptr [rax + 0xc], edi mov edi, dword ptr [rax + 0x8] add edi, dword ptr [rax + 0xc] mov eax, edi pop rbx ret
  27. 27. Object stack allocation class AddOperation {     public int First { get; set; }     public int Second { get; set; }     public int Calculate()     {         return First + Second;     } } public int Test() {     var operation = new AddOperation();     operation.First = _first;     operation.Second = _second;     return operation.Calculate(); } Object stack allocation disabled: push rbx mov rbx, rdi movabs rdi, 0x113dc1bc0 call 0x1049f2920 (JitHelp: CORINFO_HELP_NEWSFAST) mov edi, dword ptr [rbx + 0x8] mov dword ptr [rax + 0x8], edi mov edi, dword ptr [rbx + 0xc] mov dword ptr [rax + 0xc], edi mov edi, dword ptr [rax + 0x8] add edi, dword ptr [rax + 0xc] mov eax, edi pop rbx ret
  28. 28. Object stack allocation class AddOperation {     public int First { get; set; }     public int Second { get; set; }     public int Calculate()     {         return First + Second;     } } public int Test() {     var operation = new AddOperation();     operation.First = _first;     operation.Second = _second;     return operation.Calculate(); } Object stack allocation disabled: push rbx mov rbx, rdi movabs rdi, 0x113dc1bc0 call 0x1049f2920 (JitHelp: CORINFO_HELP_NEWSFAST) mov edi, dword ptr [rbx + 0x8] mov dword ptr [rax + 0x8], edi mov edi, dword ptr [rbx + 0xc] mov dword ptr [rax + 0xc], edi mov edi, dword ptr [rax + 0x8] add edi, dword ptr [rax + 0xc] mov eax, edi pop rbx ret
  29. 29. Object stack allocation class AddOperation {     public int First { get; set; }     public int Second { get; set; }     public int Calculate()     {         return First + Second;     } } public int Test() {     var operation = new AddOperation();     operation.First = _first;     operation.Second = _second;     return operation.Calculate(); } Object stack allocation disabled: push rbx mov rbx, rdi movabs rdi, 0x113dc1bc0 call 0x1049f2920 (JitHelp: CORINFO_HELP_NEWSFAST) mov edi, dword ptr [rbx + 0x8] mov dword ptr [rax + 0x8], edi mov edi, dword ptr [rbx + 0xc] mov dword ptr [rax + 0xc], edi mov edi, dword ptr [rax + 0x8] add edi, dword ptr [rax + 0xc] mov eax, edi pop rbx ret
  30. 30. Object stack allocation class AddOperation {     public int First { get; set; }     public int Second { get; set; }     public int Calculate()     {         return First + Second;     } } public int Test() {     var operation = new AddOperation();     operation.First = _first;     operation.Second = _second;     return operation.Calculate(); } Object stack allocation disabled: push rbx mov rbx, rdi movabs rdi, 0x113dc1bc0 call 0x1049f2920 (JitHelp: CORINFO_HELP_NEWSFAST) mov edi, dword ptr [rbx + 0x8] mov dword ptr [rax + 0x8], edi mov edi, dword ptr [rbx + 0xc] mov dword ptr [rax + 0xc], edi mov edi, dword ptr [rax + 0x8] add edi, dword ptr [rax + 0xc] mov eax, edi pop rbx ret
  31. 31. Object stack allocation class AddOperation {     public int First { get; set; }     public int Second { get; set; }     public int Calculate()     {         return First + Second;     } } public int Test() {     var operation = new AddOperation();     operation.First = _first;     operation.Second = _second;     return operation.Calculate(); } Object stack allocation enabled: mov eax, dword ptr [rdi + 0x8] mov edi, dword ptr [rdi + 0xc] add eax, edi ret Object stack allocation disabled: push rbx mov rbx, rdi movabs rdi, 0x113dc1bc0 call 0x1049f2920 (JitHelp: CORINFO_HELP_NEWSFAST) mov edi, dword ptr [rbx + 0x8] mov dword ptr [rax + 0x8], edi mov edi, dword ptr [rbx + 0xc] mov dword ptr [rax + 0xc], edi mov edi, dword ptr [rax + 0x8] add edi, dword ptr [rax + 0xc] mov eax, edi pop rbx ret
  32. 32. Stack allocations
  33. 33. Stack structure Local data Previous frame address Return address Arguments Current method stack frame Previous stack frame Previous stack frame Previous stack frame Stack growth
 direction
  34. 34. Stack structure int Test() {     var first = new YearMonth(2019, 1);     var second = new YearMonth(2018, 10);     return TheMethod(first, second); } push    rbp sub     rsp, 0x80 lea     rbp, [rsp + 0x80] ... fill stack frame with zeroes lea     rdi, [rbp - 0x40] mov     esi, 0x7e3 mov     edx, 0x1 call    0x114973fe0 (YearMonth..ctor) lea     rdi, [rbp - 0x20] mov     esi, 0x7e2 mov     edx, 0xa call    0x114973fe0 (YearMonth..ctor)) vmovdqu xmm0, xmmword ptr [rbp - 0x40] vmovdqu xmmword ptr [rsp], xmm0 vmovdqu xmm0, xmmword ptr [rbp - 0x30] vmovdqu xmmword ptr [rsp + 0x10], xmm0 vmovdqu xmm0, xmmword ptr [rbp - 0x20] vmovdqu xmmword ptr [rsp + 0x20], xmm0 vmovdqu xmm0, xmmword ptr [rbp - 0x10] vmovdqu xmmword ptr [rsp + 0x30], xmm0 call    0x114972400 (TheMethod) lea     rsp, [rbp] pop     rbp ret Stack growth
 direction rsp
  35. 35. Stack structure int Test() {     var first = new YearMonth(2019, 1);     var second = new YearMonth(2018, 10);     return TheMethod(first, second); } push    rbp sub     rsp, 0x80 lea     rbp, [rsp + 0x80] ... fill stack frame with zeroes lea     rdi, [rbp - 0x40] mov     esi, 0x7e3 mov     edx, 0x1 call    0x114973fe0 (YearMonth..ctor) lea     rdi, [rbp - 0x20] mov     esi, 0x7e2 mov     edx, 0xa call    0x114973fe0 (YearMonth..ctor)) vmovdqu xmm0, xmmword ptr [rbp - 0x40] vmovdqu xmmword ptr [rsp], xmm0 vmovdqu xmm0, xmmword ptr [rbp - 0x30] vmovdqu xmmword ptr [rsp + 0x10], xmm0 vmovdqu xmm0, xmmword ptr [rbp - 0x20] vmovdqu xmmword ptr [rsp + 0x20], xmm0 vmovdqu xmm0, xmmword ptr [rbp - 0x10] vmovdqu xmmword ptr [rsp + 0x30], xmm0 call    0x114972400 (TheMethod) lea     rsp, [rbp] pop     rbp ret Previous stack frame pointer Stack growth
 direction rsp
  36. 36. Stack structure int Test() {     var first = new YearMonth(2019, 1);     var second = new YearMonth(2018, 10);     return TheMethod(first, second); } push    rbp sub     rsp, 0x80 lea     rbp, [rsp + 0x80] ... fill stack frame with zeroes lea     rdi, [rbp - 0x40] mov     esi, 0x7e3 mov     edx, 0x1 call    0x114973fe0 (YearMonth..ctor) lea     rdi, [rbp - 0x20] mov     esi, 0x7e2 mov     edx, 0xa call    0x114973fe0 (YearMonth..ctor)) vmovdqu xmm0, xmmword ptr [rbp - 0x40] vmovdqu xmmword ptr [rsp], xmm0 vmovdqu xmm0, xmmword ptr [rbp - 0x30] vmovdqu xmmword ptr [rsp + 0x10], xmm0 vmovdqu xmm0, xmmword ptr [rbp - 0x20] vmovdqu xmmword ptr [rsp + 0x20], xmm0 vmovdqu xmm0, xmmword ptr [rbp - 0x10] vmovdqu xmmword ptr [rsp + 0x30], xmm0 call    0x114972400 (TheMethod) lea     rsp, [rbp] pop     rbp ret Previous stack frame pointer Stack growth
 direction rsp rbp
  37. 37. Stack structure int Test() {     var first = new YearMonth(2019, 1);     var second = new YearMonth(2018, 10);     return TheMethod(first, second); } push    rbp sub     rsp, 0x80 lea     rbp, [rsp + 0x80] ... fill stack frame with zeroes lea     rdi, [rbp - 0x40] mov     esi, 0x7e3 mov     edx, 0x1 call    0x114973fe0 (YearMonth..ctor) lea     rdi, [rbp - 0x20] mov     esi, 0x7e2 mov     edx, 0xa call    0x114973fe0 (YearMonth..ctor)) vmovdqu xmm0, xmmword ptr [rbp - 0x40] vmovdqu xmmword ptr [rsp], xmm0 vmovdqu xmm0, xmmword ptr [rbp - 0x30] vmovdqu xmmword ptr [rsp + 0x10], xmm0 vmovdqu xmm0, xmmword ptr [rbp - 0x20] vmovdqu xmmword ptr [rsp + 0x20], xmm0 vmovdqu xmm0, xmmword ptr [rbp - 0x10] vmovdqu xmmword ptr [rsp + 0x30], xmm0 call    0x114972400 (TheMethod) lea     rsp, [rbp] pop     rbp ret Previous stack frame pointer Stack growth
 direction rsp rdi rbp
  38. 38. Stack structure int Test() {     var first = new YearMonth(2019, 1);     var second = new YearMonth(2018, 10);     return TheMethod(first, second); } push    rbp sub     rsp, 0x80 lea     rbp, [rsp + 0x80] ... fill stack frame with zeroes lea     rdi, [rbp - 0x40] mov     esi, 0x7e3 mov     edx, 0x1 call    0x114973fe0 (YearMonth..ctor) lea     rdi, [rbp - 0x20] mov     esi, 0x7e2 mov     edx, 0xa call    0x114973fe0 (YearMonth..ctor)) vmovdqu xmm0, xmmword ptr [rbp - 0x40] vmovdqu xmmword ptr [rsp], xmm0 vmovdqu xmm0, xmmword ptr [rbp - 0x30] vmovdqu xmmword ptr [rsp + 0x10], xmm0 vmovdqu xmm0, xmmword ptr [rbp - 0x20] vmovdqu xmmword ptr [rsp + 0x20], xmm0 vmovdqu xmm0, xmmword ptr [rbp - 0x10] vmovdqu xmmword ptr [rsp + 0x30], xmm0 call    0x114972400 (TheMethod) lea     rsp, [rbp] pop     rbp ret Previous stack frame pointer Stack growth
 direction rsp rdi rbp
  39. 39. Stack structure int Test() {     var first = new YearMonth(2019, 1);     var second = new YearMonth(2018, 10);     return TheMethod(first, second); } push    rbp sub     rsp, 0x80 lea     rbp, [rsp + 0x80] ... fill stack frame with zeroes lea     rdi, [rbp - 0x40] mov     esi, 0x7e3 mov     edx, 0x1 call    0x114973fe0 (YearMonth..ctor) lea     rdi, [rbp - 0x20] mov     esi, 0x7e2 mov     edx, 0xa call    0x114973fe0 (YearMonth..ctor)) vmovdqu xmm0, xmmword ptr [rbp - 0x40] vmovdqu xmmword ptr [rsp], xmm0 vmovdqu xmm0, xmmword ptr [rbp - 0x30] vmovdqu xmmword ptr [rsp + 0x10], xmm0 vmovdqu xmm0, xmmword ptr [rbp - 0x20] vmovdqu xmmword ptr [rsp + 0x20], xmm0 vmovdqu xmm0, xmmword ptr [rbp - 0x10] vmovdqu xmmword ptr [rsp + 0x30], xmm0 call    0x114972400 (TheMethod) lea     rsp, [rbp] pop     rbp ret Previous stack frame pointer Stack growth
 direction rsp rdi rbp
  40. 40. Stack structure int Test() {     var first = new YearMonth(2019, 1);     var second = new YearMonth(2018, 10);     return TheMethod(first, second); } push    rbp sub     rsp, 0x80 lea     rbp, [rsp + 0x80] ... fill stack frame with zeroes lea     rdi, [rbp - 0x40] mov     esi, 0x7e3 mov     edx, 0x1 call    0x114973fe0 (YearMonth..ctor) lea     rdi, [rbp - 0x20] mov     esi, 0x7e2 mov     edx, 0xa call    0x114973fe0 (YearMonth..ctor)) vmovdqu xmm0, xmmword ptr [rbp - 0x40] vmovdqu xmmword ptr [rsp], xmm0 vmovdqu xmm0, xmmword ptr [rbp - 0x30] vmovdqu xmmword ptr [rsp + 0x10], xmm0 vmovdqu xmm0, xmmword ptr [rbp - 0x20] vmovdqu xmmword ptr [rsp + 0x20], xmm0 vmovdqu xmm0, xmmword ptr [rbp - 0x10] vmovdqu xmmword ptr [rsp + 0x30], xmm0 call    0x114972400 (TheMethod) lea     rsp, [rbp] pop     rbp ret Previous stack frame pointer Stack growth
 direction rsp rdi first variable rbp
  41. 41. Stack structure int Test() {     var first = new YearMonth(2019, 1);     var second = new YearMonth(2018, 10);     return TheMethod(first, second); } push    rbp sub     rsp, 0x80 lea     rbp, [rsp + 0x80] ... fill stack frame with zeroes lea     rdi, [rbp - 0x40] mov     esi, 0x7e3 mov     edx, 0x1 call    0x114973fe0 (YearMonth..ctor) lea     rdi, [rbp - 0x20] mov     esi, 0x7e2 mov     edx, 0xa call    0x114973fe0 (YearMonth..ctor)) vmovdqu xmm0, xmmword ptr [rbp - 0x40] vmovdqu xmmword ptr [rsp], xmm0 vmovdqu xmm0, xmmword ptr [rbp - 0x30] vmovdqu xmmword ptr [rsp + 0x10], xmm0 vmovdqu xmm0, xmmword ptr [rbp - 0x20] vmovdqu xmmword ptr [rsp + 0x20], xmm0 vmovdqu xmm0, xmmword ptr [rbp - 0x10] vmovdqu xmmword ptr [rsp + 0x30], xmm0 call    0x114972400 (TheMethod) lea     rsp, [rbp] pop     rbp ret Previous stack frame pointer first variable Stack growth
 direction rsp rdi rbp
  42. 42. Stack structure second variable int Test() {     var first = new YearMonth(2019, 1);     var second = new YearMonth(2018, 10);     return TheMethod(first, second); } push    rbp sub     rsp, 0x80 lea     rbp, [rsp + 0x80] ... fill stack frame with zeroes lea     rdi, [rbp - 0x40] mov     esi, 0x7e3 mov     edx, 0x1 call    0x114973fe0 (YearMonth..ctor) lea     rdi, [rbp - 0x20] mov     esi, 0x7e2 mov     edx, 0xa call    0x114973fe0 (YearMonth..ctor)) vmovdqu xmm0, xmmword ptr [rbp - 0x40] vmovdqu xmmword ptr [rsp], xmm0 vmovdqu xmm0, xmmword ptr [rbp - 0x30] vmovdqu xmmword ptr [rsp + 0x10], xmm0 vmovdqu xmm0, xmmword ptr [rbp - 0x20] vmovdqu xmmword ptr [rsp + 0x20], xmm0 vmovdqu xmm0, xmmword ptr [rbp - 0x10] vmovdqu xmmword ptr [rsp + 0x30], xmm0 call    0x114972400 (TheMethod) lea     rsp, [rbp] pop     rbp ret Previous stack frame pointer first variable Stack growth
 direction rsp rdi rbp
  43. 43. Stack structure second variable int Test() {     var first = new YearMonth(2019, 1);     var second = new YearMonth(2018, 10);     return TheMethod(first, second); } push    rbp sub     rsp, 0x80 lea     rbp, [rsp + 0x80] ... fill stack frame with zeroes lea     rdi, [rbp - 0x40] mov     esi, 0x7e3 mov     edx, 0x1 call    0x114973fe0 (YearMonth..ctor) lea     rdi, [rbp - 0x20] mov     esi, 0x7e2 mov     edx, 0xa call    0x114973fe0 (YearMonth..ctor)) vmovdqu xmm0, xmmword ptr [rbp - 0x40] vmovdqu xmmword ptr [rsp], xmm0 vmovdqu xmm0, xmmword ptr [rbp - 0x30] vmovdqu xmmword ptr [rsp + 0x10], xmm0 vmovdqu xmm0, xmmword ptr [rbp - 0x20] vmovdqu xmmword ptr [rsp + 0x20], xmm0 vmovdqu xmm0, xmmword ptr [rbp - 0x10] vmovdqu xmmword ptr [rsp + 0x30], xmm0 call    0x114972400 (TheMethod) lea     rsp, [rbp] pop     rbp ret Previous stack frame pointer first variable Stack growth
 direction rsp rbp
  44. 44. Stack structure second variable int Test() {     var first = new YearMonth(2019, 1);     var second = new YearMonth(2018, 10);     return TheMethod(first, second); } push    rbp sub     rsp, 0x80 lea     rbp, [rsp + 0x80] ... fill stack frame with zeroes lea     rdi, [rbp - 0x40] mov     esi, 0x7e3 mov     edx, 0x1 call    0x114973fe0 (YearMonth..ctor) lea     rdi, [rbp - 0x20] mov     esi, 0x7e2 mov     edx, 0xa call    0x114973fe0 (YearMonth..ctor)) vmovdqu xmm0, xmmword ptr [rbp - 0x40] vmovdqu xmmword ptr [rsp], xmm0 vmovdqu xmm0, xmmword ptr [rbp - 0x30] vmovdqu xmmword ptr [rsp + 0x10], xmm0 vmovdqu xmm0, xmmword ptr [rbp - 0x20] vmovdqu xmmword ptr [rsp + 0x20], xmm0 vmovdqu xmm0, xmmword ptr [rbp - 0x10] vmovdqu xmmword ptr [rsp + 0x30], xmm0 call    0x114972400 (TheMethod) lea     rsp, [rbp] pop     rbp ret Previous stack frame pointer first variable Stack growth
 direction rsp rbp
  45. 45. Stack structure second variable int Test() {     var first = new YearMonth(2019, 1);     var second = new YearMonth(2018, 10);     return TheMethod(first, second); } push    rbp sub     rsp, 0x80 lea     rbp, [rsp + 0x80] ... fill stack frame with zeroes lea     rdi, [rbp - 0x40] mov     esi, 0x7e3 mov     edx, 0x1 call    0x114973fe0 (YearMonth..ctor) lea     rdi, [rbp - 0x20] mov     esi, 0x7e2 mov     edx, 0xa call    0x114973fe0 (YearMonth..ctor)) vmovdqu xmm0, xmmword ptr [rbp - 0x40] vmovdqu xmmword ptr [rsp], xmm0 vmovdqu xmm0, xmmword ptr [rbp - 0x30] vmovdqu xmmword ptr [rsp + 0x10], xmm0 vmovdqu xmm0, xmmword ptr [rbp - 0x20] vmovdqu xmmword ptr [rsp + 0x20], xmm0 vmovdqu xmm0, xmmword ptr [rbp - 0x10] vmovdqu xmmword ptr [rsp + 0x30], xmm0 call    0x114972400 (TheMethod) lea     rsp, [rbp] pop     rbp ret Previous stack frame pointer first variable Stack growth
 direction rsp rbp
  46. 46. Stack structure second variable int Test() {     var first = new YearMonth(2019, 1);     var second = new YearMonth(2018, 10);     return TheMethod(first, second); } push    rbp sub     rsp, 0x80 lea     rbp, [rsp + 0x80] ... fill stack frame with zeroes lea     rdi, [rbp - 0x40] mov     esi, 0x7e3 mov     edx, 0x1 call    0x114973fe0 (YearMonth..ctor) lea     rdi, [rbp - 0x20] mov     esi, 0x7e2 mov     edx, 0xa call    0x114973fe0 (YearMonth..ctor)) vmovdqu xmm0, xmmword ptr [rbp - 0x40] vmovdqu xmmword ptr [rsp], xmm0 vmovdqu xmm0, xmmword ptr [rbp - 0x30] vmovdqu xmmword ptr [rsp + 0x10], xmm0 vmovdqu xmm0, xmmword ptr [rbp - 0x20] vmovdqu xmmword ptr [rsp + 0x20], xmm0 vmovdqu xmm0, xmmword ptr [rbp - 0x10] vmovdqu xmmword ptr [rsp + 0x30], xmm0 call    0x114972400 (TheMethod) lea     rsp, [rbp] pop     rbp ret Previous stack frame pointer first variable Stack growth
 direction first variable copy rsp rbp
  47. 47. Stack structure second variable int Test() {     var first = new YearMonth(2019, 1);     var second = new YearMonth(2018, 10);     return TheMethod(first, second); } push    rbp sub     rsp, 0x80 lea     rbp, [rsp + 0x80] ... fill stack frame with zeroes lea     rdi, [rbp - 0x40] mov     esi, 0x7e3 mov     edx, 0x1 call    0x114973fe0 (YearMonth..ctor) lea     rdi, [rbp - 0x20] mov     esi, 0x7e2 mov     edx, 0xa call    0x114973fe0 (YearMonth..ctor)) vmovdqu xmm0, xmmword ptr [rbp - 0x40] vmovdqu xmmword ptr [rsp], xmm0 vmovdqu xmm0, xmmword ptr [rbp - 0x30] vmovdqu xmmword ptr [rsp + 0x10], xmm0 vmovdqu xmm0, xmmword ptr [rbp - 0x20] vmovdqu xmmword ptr [rsp + 0x20], xmm0 vmovdqu xmm0, xmmword ptr [rbp - 0x10] vmovdqu xmmword ptr [rsp + 0x30], xmm0 call    0x114972400 (TheMethod) lea     rsp, [rbp] pop     rbp ret Previous stack frame pointer first variable Stack growth
 direction second variable copy first variable copy rsp rbp
  48. 48. Stack structure second variable int Test() {     var first = new YearMonth(2019, 1);     var second = new YearMonth(2018, 10);     return TheMethod(first, second); } push    rbp sub     rsp, 0x80 lea     rbp, [rsp + 0x80] ... fill stack frame with zeroes lea     rdi, [rbp - 0x40] mov     esi, 0x7e3 mov     edx, 0x1 call    0x114973fe0 (YearMonth..ctor) lea     rdi, [rbp - 0x20] mov     esi, 0x7e2 mov     edx, 0xa call    0x114973fe0 (YearMonth..ctor)) vmovdqu xmm0, xmmword ptr [rbp - 0x40] vmovdqu xmmword ptr [rsp], xmm0 vmovdqu xmm0, xmmword ptr [rbp - 0x30] vmovdqu xmmword ptr [rsp + 0x10], xmm0 vmovdqu xmm0, xmmword ptr [rbp - 0x20] vmovdqu xmmword ptr [rsp + 0x20], xmm0 vmovdqu xmm0, xmmword ptr [rbp - 0x10] vmovdqu xmmword ptr [rsp + 0x30], xmm0 call    0x114972400 (TheMethod) lea     rsp, [rbp] pop     rbp ret Previous stack frame pointer first variable Stack growth
 direction rsp rbp second variable copy first variable copy
  49. 49. Stack structure second variable int Test() {     var first = new YearMonth(2019, 1);     var second = new YearMonth(2018, 10);     return TheMethod(first, second); } push    rbp sub     rsp, 0x80 lea     rbp, [rsp + 0x80] ... fill stack frame with zeroes lea     rdi, [rbp - 0x40] mov     esi, 0x7e3 mov     edx, 0x1 call    0x114973fe0 (YearMonth..ctor) lea     rdi, [rbp - 0x20] mov     esi, 0x7e2 mov     edx, 0xa call    0x114973fe0 (YearMonth..ctor)) vmovdqu xmm0, xmmword ptr [rbp - 0x40] vmovdqu xmmword ptr [rsp], xmm0 vmovdqu xmm0, xmmword ptr [rbp - 0x30] vmovdqu xmmword ptr [rsp + 0x10], xmm0 vmovdqu xmm0, xmmword ptr [rbp - 0x20] vmovdqu xmmword ptr [rsp + 0x20], xmm0 vmovdqu xmm0, xmmword ptr [rbp - 0x10] vmovdqu xmmword ptr [rsp + 0x30], xmm0 call    0x114972400 (TheMethod) lea     rsp, [rbp] pop     rbp ret Previous stack frame pointer first variable Stack growth
 direction TheMethod argument 2 TheMethod argument 1 rsp rbp
  50. 50. Stack structure second variable int Test() {     var first = new YearMonth(2019, 1);     var second = new YearMonth(2018, 10);     return TheMethod(first, second); } push    rbp sub     rsp, 0x80 lea     rbp, [rsp + 0x80] ... fill stack frame with zeroes lea     rdi, [rbp - 0x40] mov     esi, 0x7e3 mov     edx, 0x1 call    0x114973fe0 (YearMonth..ctor) lea     rdi, [rbp - 0x20] mov     esi, 0x7e2 mov     edx, 0xa call    0x114973fe0 (YearMonth..ctor)) vmovdqu xmm0, xmmword ptr [rbp - 0x40] vmovdqu xmmword ptr [rsp], xmm0 vmovdqu xmm0, xmmword ptr [rbp - 0x30] vmovdqu xmmword ptr [rsp + 0x10], xmm0 vmovdqu xmm0, xmmword ptr [rbp - 0x20] vmovdqu xmmword ptr [rsp + 0x20], xmm0 vmovdqu xmm0, xmmword ptr [rbp - 0x10] vmovdqu xmmword ptr [rsp + 0x30], xmm0 call    0x114972400 (TheMethod) lea     rsp, [rbp] pop     rbp ret Previous stack frame pointer first variable Stack growth
 direction TheMethod argument 2 TheMethod argument 1 Return address rsp rbp
  51. 51. Stack structure second variable int Test() {     var first = new YearMonth(2019, 1);     var second = new YearMonth(2018, 10);     return TheMethod(first, second); } push    rbp sub     rsp, 0x80 lea     rbp, [rsp + 0x80] ... fill stack frame with zeroes lea     rdi, [rbp - 0x40] mov     esi, 0x7e3 mov     edx, 0x1 call    0x114973fe0 (YearMonth..ctor) lea     rdi, [rbp - 0x20] mov     esi, 0x7e2 mov     edx, 0xa call    0x114973fe0 (YearMonth..ctor)) vmovdqu xmm0, xmmword ptr [rbp - 0x40] vmovdqu xmmword ptr [rsp], xmm0 vmovdqu xmm0, xmmword ptr [rbp - 0x30] vmovdqu xmmword ptr [rsp + 0x10], xmm0 vmovdqu xmm0, xmmword ptr [rbp - 0x20] vmovdqu xmmword ptr [rsp + 0x20], xmm0 vmovdqu xmm0, xmmword ptr [rbp - 0x10] vmovdqu xmmword ptr [rsp + 0x30], xmm0 call    0x114972400 (TheMethod) lea     rsp, [rbp] pop     rbp ret Previous stack frame pointer first variable Stack growth
 direction TheMethod argument 2 TheMethod argument 1 Return address TheMethod local data rbp rsp
  52. 52. Minimizing structure size
  53. 53. in parameter modifier namespace System {     public readonly struct Decimal     {         public static double ToDouble(decimal d)         {             return DecCalc.VarR8FromDec(in d);         } private struct DecCalc         {             internal static double VarR8FromDec(in decimal value)             {                 const double ds2to64 = 1.8446744073709552e+019;                 double dbl = ((double)value.Low64 +                     (double)value.High * ds2to64) / s_doublePowers10[value.Scale];                 if (value.IsNegative)                     dbl = -dbl;                 return dbl;             } } } }
  54. 54. ref return namespace System {     public static class Math     {         public static decimal Max(decimal val1, decimal val2)         {             return decimal.Max(val1, val2);         } }     public readonly partial struct Decimal     {         internal static ref readonly decimal Max(in decimal d1, in decimal d2)         {             return ref DecCalc.VarDecCmp(in d1, in d2) >= 0 ? ref d1 : ref d2;         } } }
  55. 55. Virtual calls
  56. 56. Virtual calls Method table Method 1 Method 2 Compiled method 1Compiled method 2 Object instance Some data More data And even more data
  57. 57. Non-virtual call lea rdi, [rbx + 0x8] call 0x114c9f620 (MyStructure.DoAction) public void MyMethod() { _someStructure.DoAction(); }
  58. 58. Virtual table dispatch mov rdi, qword ptr [rdi + 0x8] mov rax, qword ptr [rdi] mov rax, qword ptr [rax + 0x40] call qword ptr [rax + 0x20] public void MyMethod() {     _someObject.VirtualMethod(); } Stuff VTable indirections Chunk 1 Header MethodTable pointer Instance data Object instance Method table Chunk 2 … Stuff Target Method 1 Method 8 Method 7 Method 6 Method 5 Method 4 Method 3 Method 2 VTable indirection
  59. 59. Virtual stub dispatch public void MyMethod() {     _someObject.InterfaceMethod(); } mov rdi, qword ptr [rdi + 0x8] movabs rax, 0x107320848 call qword ptr [rax] movabs rax, 0x10846fd08 cmp qword ptr [rdi], rax movabs rax, 0x1083e6c60 jne 0x1073668a5 jmp rax Lookup stub Indirect cell Caller Dispatch stub Resolve stub Target Generic resolver
  60. 60. Devirtualization Verb devirtualize (computing, transitive) To make no longer virtual.
  61. 61. Devirtualization Benchmark: | Method | Mean | Error | StdDev | |----------- |----------:|---------:|---------:| | NonVirtual | 163.38 us | 1.286 us | 1.203 us | | Virtual | 163.77 us | 1.848 us | 1.729 us | | Interface | 191.27 us | 1.754 us | 1.641 us | public class MyList<T> : IReadOnlyList<T> {     private readonly T[] _data;     public virtual int Count => _data.Length;     public virtual T this[int index] => _data[index]; } public int Test() { var result = 0; var data = ... var length = data.Count; for (var i = 0; i < length; i++) {     result += data[i]; } return result; }
  62. 62. Inlining
  63. 63. Inlining Noun in-line expansion (plural in-line expansions) (software compilation) The replacement by a compiler of a function call with a copy of the entire function body.
  64. 64. Inlining void DoStuff() { var user = GetUser(); Console.WriteLine(user.Name); } class User { private string _name; public string Name { get => _name; set => _name = value; } } DoStuff method: push    rbp mov     rbp, rsp call    0x11849bea0 (GetUser) mov     rdi, rax call    0x11849f650 (User.get_Name) mov     rdi, rax call    0x1184a0b70 (WriteLine) pop     rbp ret User.get_Name method: mov     rax, qword ptr [rdi + 0x8] ret
  65. 65. Inlining void DoStuff() { var user = GetUser(); Console.WriteLine(user.Name); } class User { private string _name; public string Name { get => _name; set => _name = value; } } DoStuff method: push    rbp mov     rbp, rsp call    0x11849bea0 (GetUser) mov     rdi, rax call    0x11849f650 (User.get_Name) mov     rdi, rax call    0x1184a0b70 (WriteLine) pop     rbp ret DoStuff method (with inlining): push    rbp mov     rbp, rsp call    0x12541bea0 (GetUser) mov     rdi, qword ptr [rax + 0x8] call    0x125420b70 (WriteLine) pop     rbp ret User.get_Name method: mov     rax, qword ptr [rdi + 0x8] ret void DoStuff() { Console.WriteLine(GetUser()._name); }
  66. 66. Inlining Benchmark: | Method | Mean | Error | StdDev | |----------- |----------:|---------:|---------:| | Inlining | 58.27 us | 1.140 us | 1.120 us | | NonVirtual | 163.38 us | 1.286 us | 1.203 us | | Virtual | 163.77 us | 1.848 us | 1.729 us | | Interface | 191.27 us | 1.754 us | 1.641 us | public class MyList<T> : IReadOnlyList<T> {     private readonly T[] _data;     public virtual int Count => _data.Length;     public virtual T this[int index] => _data[index]; } public int Test() { var result = 0; var data = ... var length = data.Count; for (var i = 0; i < length; i++) {     result += data[i]; } return result; }
  67. 67. Inlining requirements • Devirtualization / non-virtual call • No recursion • Heuristic: • Inlining is profitable • Stack size is less than 16 bytes • IL code is smaller than 16 bytes • …
  68. 68. Inlining requirements • Devirtualization / non-virtual call • No recursion • Heuristic: • Inlining is profitable • Stack size is less than 16 bytes • IL code is smaller than 16 bytes • … namespace System {     public static class Math     {         [MethodImpl(MethodImplOptions.AggressiveInlining)]         public static decimal Min(decimal val1, decimal val2)         {             return decimal.Min(val1, val2);         }     } }
  69. 69. foreach optimization Benchmark: | Method | Mean | Error | StdDev | |-------- |---------:|----------:|----------:| | List | 2.311 us | 0.0233 us | 0.0218 us | | IList | 4.392 us | 0.0601 us | 0.0562 us | public int Test() {     var result = 0;     IList<int> data = GetData();     foreach (var item in data)     {         result += item;     }     return result; }
  70. 70. .locals init ( [0] int32, [1] valuetype List`1/Enumerator<int32>, ) call GetData() callvirt instance valuetype List`1<int32>::GetEnumerator() stloc.1 br.s loop start: ldloca.s 1 call instance !0 valuetype List`1/Enumerator<int32>::get_Current() ldloc.0 add stloc.0 loop: ldloca.s 1 call instance bool valuetype List`1/Enumerator<int32>::MoveNext() brtrue.s start ldloc.0 ret .locals init ( [0] int32, [1] class IEnumerator`1<int32>, ) call GetData() callvirt instance class IEnumerable`1<int32>::GetEnumerator() stloc.1 br.s loop start: ldloc.1 callvirt instance !0 class IEnumerator::get_Current() ldloc.0 add stloc.0 loop: ldloc.1 callvirt instance bool IEnumerator::MoveNext() brtrue.s start ldloc.0 ret
  71. 71. namespace System.Collections.Generic {     public class List<T> : IList<T>, IList, IReadOnlyList<T>     {         public Enumerator GetEnumerator() => new Enumerator(this);         IEnumerator<T> IEnumerable<T>.GetEnumerator() => new Enumerator(this);         public struct Enumerator : IEnumerator<T>, IEnumerator         {             private readonly List<T> _list;             private int _index;             private T _current;             internal Enumerator(List<T> list)             {                 _list = list;             }             public bool MoveNext()             {                 if (_index < _list._size)                 {                     _current = _list._items[_index];                     _index++;                     return true;                 }                 return false;             }             public T Current => _current;         }     } }
  72. 72. That’s all
  73. 73. Useful links • Егор Богатов — Оптимизации внутри .NET Core
 https://youtu.be/n3-j_sTtGb0 • SIMD + aligning example (corefx repo)
 src/Common/src/CoreLib/System/SpanHelpers.Char.cs • Just complex SIMD usage (corefx repo)
 src/System.Memory/src/System/Buffers/Text/Base64Encoder.cs • Book of the Runtime (a.k.a. BOTR)
 https://github.com/dotnet/coreclr/tree/master/Documentation/botr .NET LEVEL UP .NET CONFERENCE #1 IN UKRAINE KYIV 2019 @NikolayBalakin n@balakin.me

×