SlideShare a Scribd company logo
1 of 26
Download to read offline
A collection of
Micro Optimizations
  by Alex, Gabriel & Michael
Start
● Optimizations from hash.0.c to hash.13.c
● Performance testing:
     gcc -Wall -O3 hash.c -o hash
     perf stat -r 5 -e instructions -e branch-misses hash input input2
     perf stat -r 5 -e cycles hash input input2


● Result:
  Cycles:                     7.292.009.385
  Instructions:               1.063.178.278
  Branch mispredictions:      11.395.359
  Time elapsed:               2.2927 s
Analysis
● Hashtable
  ○   Max. Collisions:   7
  ○   Empty Elements:    363.543
  ○   Amount Elements:   1.048.575
  ○   Input Elements:    724.129


● Good hash table size: ~ 20% of Input
● Brent-Hashing?
● Parallelism?
Convert Linked-Lists to Arrays
● fewer cache misses on frequently used lookup

● overhead due to reorganizing

● struct size reduced from 24 to 16 bytes
  due to removing *next

● faster at large lists
   ○ break even point at HASHSIZE 2^18
Loop peeling: lookup
if(l != NULL) {
    if (keylen == l->keylen && memcmp(keyaddr, l->keyaddr, keylen)==0)
        return l->value;
    l = l->next;
    while (l!=NULL) {
      if (keylen == l->keylen && memcmp(keyaddr, l->keyaddr, keylen)==0)
        return l->value;
      l = l->next;
    }
  }
  return -1;

    Cycles:                       7.255.927.875 (-0,495%)
    Instructions:                 1.067.896.719 (+0,444%)
    Branch mispredictions:        11.464.124 (+0,603%)
    Time elapsed:                 2,1613 s (-5,731%)
Inline
inline   struct block slurp(char *filename)
inline   unsigned long hash(char *addr, size_t len)
inline   void insert(char *keyaddr, size_t keylen, int value)
inline   int lookup(char *keyaddr, size_t keylen)



    Cycles:                         7.265.216.080 (+0,128%)
    Instructions:                   1.067.543.945 (-0,033%)
    Branch mispredictions:          11.541.050 (+0,671%)
    Time elapsed:                   2,1672 s (+0,273%)
Replace loop with macro
#define REPEAT10(x) { x x x x x x x x x x }
REPEAT10 (
     for (p=input2.addr, endp=input2.addr+input2.len; p<endp; ) {
       ...
     }
  );


    Cycles:                       7.313.103.515 (+0,659%)
    Instructions:                 1.062.596.883 (-0,463%)
    Branch mispredictions:        11.423.373 (-1,020%)
    Time elapsed:                 2,1791 s (+0,549%)
Some Minor Changes
● new macro for HASHSIZE-1
● Remove unnecessary Casts


... with no effects
Loop peeling + adjust len
inline unsigned long hash(char *addr, size_t len) {

...

if(len > 7 ) {
    len = len - 7;
    x = (*(unsigned long *)addr)*hashmult;
    for (i=8; i<len; i+=8) {
      w = *(unsigned long *)(addr+i);
      x = (x + w)*hashmult;
    }
    len = len + 7;
  }

...
Loop peeling + adjust len


  Cycles:                  8.271.902.713 (+13,111%)
  Instructions:            1.038.690.398 (-2,250%)
  Branch mispredictions:   11.809.722 (+3,382%)
  Time elapsed:            2,4551 s (+12,668%)


=> probably faster for long strings
=> changes discarded
Pointers instead of indices
uint128_t x;
unsigned long * laddr = (unsigned long *) addr;
unsigned long * end = (unsigned long *) (addr+len);

if(len > 7 ) {
  x = *laddr * hashmult;
  end--;
  for (laddr++; laddr <= end; laddr++) {
    x = (x + *laddr)*hashmult;
  }
  if (laddr < (end+1))
    x = ( x + ((*laddr)<< ( ((char*)laddr - (char*)end)*8)) ) * hashmult;
  return x+(x>>64);
} else if (laddr < end) {
  x = (uint128_t)((*laddr)<<((8-len)*8)) * hashmult;
  return x+(x>>64);
}

return 0;
Pointers instead of indices

  Cycles:                  8.253.559.129 (+12,860%)
  Instructions:            1.021.822.315 (-3,837%)
  Branch mispredictions:   11.825.252 (+3,518%)
  Time elapsed:            2,4558 s (+12,700%)



=> probably faster for long strings
=> changes discarded
Improve loop-layout
for (p=input1.addr, endp=input1.addr+input1.len, i=0; p<endp; i++) {
    nextp=memchr(p, '\n', endp-p);
      if (nextp == NULL)
        break;
      ...
  }
------------------------------------------------
for (p=input.addr, endp=input.addr+input.len, r=0,
     nextp=memchr(p, '\n', endp-p); nextp != NULL;
       r++, nextp=memchr(p, '\n', endp-p)) {
       ...
}
Improve loop-layout
  Cycles:                  7.364.723.755 (+0,705%)
  Instructions:            1072512560 (+0,933%)
  Branch mispredictions:   11606354 (+1,601%)
  Time elapsed:            2,2509 s (+3,294%)


=> "if" and "&&" probably similar instructions
   in this case
Remove unnecessary check
for (p=input.addr, endp=input.addr+input.len, r=0,
     nextp=memchr(p, '\n', endp-p); nextp != NULL;
       r++, nextp=memchr(p, '\n', endp-p)) {
       ...
}




Remove unnecessary variables
struct block input1, input2;                         struct block input;

unsigned int i;                                      unsigned long r=0;
unsigned long r=0;
Remove unnecessary
check & variables


 Cycles:                  7323904385 (-0,554%)
 Instructions:            1064977111 (-0,702%)
 Branch mispredictions:   11734428 (+1,103%)
 Time elapsed:            2,2129 s (-1,688%)
Sentinel with rawmemchr
 ●   Idea:
      ○ replace '\0' with '\n' at the end
      ○ use rawmemchr without length check instead
      ○ saves comparisons

endp=input1.addr+input1.len;
 *endp = '\n';

for (p=input1.addr, i=0, nextp=rawmemchr(p, '\n'); p<endp ; i++) {
  nextp=rawmemchr(p, '\n');
  insert(p, nextp-p, i);
  p = nextp+1;
}
Sentinel self made rawmemchr
endp=input.addr+input.len;
*endp = '\n';
p=input.addr;
nextp = p;

for (r=0; nextp<endp; r++) {
  for(;*nextp ^ '\n'; nextp++);
  insert(p, nextp-p, r);
  nextp++;
  p = nextp;
}

  Cycles:                         7.400.275.087 (+1,042%)
  Instructions:                   1.157.591.866 (+8,696%)
  Branch mispredictions:          11.715.914 (-0,158%)
  Time elapsed:                   2,2064 s (-0,293%)
Faster memcmp
inline int mycmp(char* in1, char* in2, int len){
  do{
    if(*in1 ^ *in2) return 0;
    in1++; in2++; len--;
  }while(len>0);
  return 1;
}

if (keylen == l->keylen && mycmp(l->keyaddr, keyaddr, keylen))


    Cycles:                       5.826.523.410 (-21,266%)
    Instructions:                 1.913.851.749 (+65,330%)
    Branch mispredictions:        14.810.147 (+26,410%)
    Time elapsed:                 1,7366 s (-21,292%)
Faster memcmp with Sentinel
*(keyaddr+keylen) = 0; // FROM INSERT

inline int mycmp(char* in1, char* in2, int len){
  while(*in1 == *in2) {
    in1++; in2++; len--;
  }
  return len;
}

if (keylen==l->keylen && !mycmp(l->keyaddr, keyaddr, keylen)) // FROM LOOKUP


    Cycles:                       5.766.254.891 (-22,080%)
    Instructions:                 1.747.135.165 (+50,928%)
    Branch mispredictions:        14.772.984 (+26,093%)
    Time elapsed:                 1,7182 s (-22,126%)
Caching
int *cache = malloc(size*sizeof(int));       endcache = cache;
int *startcache, *endcache;
startcache = cache;                           REPEAT9 (
endcache = startcache + size;                   cache = startcache;
                                                while (cache < endcache) {
while(nextp<endp) {                               r = r * 2654435761L +
  if (cache >= endcache){                     *cache;
    size = size<<1;                               r = r + (r>>32);
    cache = realloc(cache, size*sizeof(int));     cache++;
  }                                             } );
    for(;*nextp ^ '\n'; nextp++);
    *cache = lookup(p, nextp-p);
    r = r * 2654435761L + *cache;
    r = r + (r>>32);
    cache++; nextp++; p = nextp;
}
Caching + memcmp

 Cycles:                  925.886.063 (-84,109%)
 Instructions:            494.630.615 (-74,155%)
 Branch mispredictions:   2.395.446 (-83,825%)
 Time elapsed:            0,2847 s (-83,603%)
Caching + memcmp with Sentinel

 Cycles:                  925.783.880 (-84,110%)
 Instructions:            475.125.520 (-75,172%)
 Branch mispredictions:   2.418.936 (-83,659%)
 Time elapsed:            0,2839 s (-83,738%)
Approximation of cache size

int size = input.len/6;                       int size = input.len/2;
...
if (cache >= endcache){
  size = size<<1;
  cache = realloc(cache, size*sizeof(int));
}
...


    Cycles:                       930.929.061 (+0,544%)
    Instructions:                 475.676.977 (-3,831%)
    Branch mispredictions:        2.384.999 (-0,436%)
    Time elapsed:                 0,2830 s (-0,586%)
Overall

  Cycles:                  930.929.061 (-87,233%)
  Instructions:            475.676.977 (-55,259%)
  Branch mispredictions:   2.384.999 (-79,070%)
  Time elapsed:            0,2830 s (-87,656%)
Fin
Any Questions?




Code available:
https://github.com/grill/micro-optimisations

More Related Content

What's hot

Use C++ to Manipulate mozSettings in Gecko
Use C++ to Manipulate mozSettings in GeckoUse C++ to Manipulate mozSettings in Gecko
Use C++ to Manipulate mozSettings in GeckoChih-Hsuan Kuo
 
Zone.js 2017
Zone.js 2017Zone.js 2017
Zone.js 2017Jia Li
 
Time Series Analysis for Network Secruity
Time Series Analysis for Network SecruityTime Series Analysis for Network Secruity
Time Series Analysis for Network Secruitymrphilroth
 
PythonScripting
PythonScriptingPythonScripting
PythonScriptingSait Elmas
 
zen and the art of SQL optimization
zen and the art of SQL optimizationzen and the art of SQL optimization
zen and the art of SQL optimizationKaren Morton
 
PostgreSQL query planner's internals
PostgreSQL query planner's internalsPostgreSQL query planner's internals
PostgreSQL query planner's internalsAlexey Ermakov
 
Kubernetes Tutorial
Kubernetes TutorialKubernetes Tutorial
Kubernetes TutorialCi Jie Li
 
The Weather of the Century Part 2: High Performance
The Weather of the Century Part 2: High PerformanceThe Weather of the Century Part 2: High Performance
The Weather of the Century Part 2: High PerformanceMongoDB
 
Detection of errors and potential vulnerabilities in C and C++ code using the...
Detection of errors and potential vulnerabilities in C and C++ code using the...Detection of errors and potential vulnerabilities in C and C++ code using the...
Detection of errors and potential vulnerabilities in C and C++ code using the...Andrey Karpov
 
Effective Modern C++ - Item 35 & 36
Effective Modern C++ - Item 35 & 36Effective Modern C++ - Item 35 & 36
Effective Modern C++ - Item 35 & 36Chih-Hsuan Kuo
 
Arna Friend Controls II Final
Arna Friend Controls II FinalArna Friend Controls II Final
Arna Friend Controls II FinalArna Friend
 
Apply Hammer Directly to Thumb; Avoiding Apache Spark and Cassandra AntiPatt...
 Apply Hammer Directly to Thumb; Avoiding Apache Spark and Cassandra AntiPatt... Apply Hammer Directly to Thumb; Avoiding Apache Spark and Cassandra AntiPatt...
Apply Hammer Directly to Thumb; Avoiding Apache Spark and Cassandra AntiPatt...Databricks
 
Code vectorization for mobile devices
Code vectorization for mobile devicesCode vectorization for mobile devices
Code vectorization for mobile devicesSt1X
 
Bind Peeking - The Endless Tuning Nightmare
Bind Peeking - The Endless Tuning NightmareBind Peeking - The Endless Tuning Nightmare
Bind Peeking - The Endless Tuning NightmareSage Computing Services
 
Exploring Parallel Merging In GPU Based Systems Using CUDA C.
Exploring Parallel Merging In GPU Based Systems Using CUDA C.Exploring Parallel Merging In GPU Based Systems Using CUDA C.
Exploring Parallel Merging In GPU Based Systems Using CUDA C.Rakib Hossain
 
Valerii Vasylkov Erlang. measurements and benefits.
Valerii Vasylkov Erlang. measurements and benefits.Valerii Vasylkov Erlang. measurements and benefits.
Valerii Vasylkov Erlang. measurements and benefits.Аліна Шепшелей
 
Scoped dynamic rewrite rules
Scoped dynamic rewrite rulesScoped dynamic rewrite rules
Scoped dynamic rewrite rulesEelco Visser
 

What's hot (20)

Use C++ to Manipulate mozSettings in Gecko
Use C++ to Manipulate mozSettings in GeckoUse C++ to Manipulate mozSettings in Gecko
Use C++ to Manipulate mozSettings in Gecko
 
Dun ddd
Dun dddDun ddd
Dun ddd
 
Zone.js 2017
Zone.js 2017Zone.js 2017
Zone.js 2017
 
Time Series Analysis for Network Secruity
Time Series Analysis for Network SecruityTime Series Analysis for Network Secruity
Time Series Analysis for Network Secruity
 
PythonScripting
PythonScriptingPythonScripting
PythonScripting
 
R and cpp
R and cppR and cpp
R and cpp
 
zen and the art of SQL optimization
zen and the art of SQL optimizationzen and the art of SQL optimization
zen and the art of SQL optimization
 
PostgreSQL query planner's internals
PostgreSQL query planner's internalsPostgreSQL query planner's internals
PostgreSQL query planner's internals
 
Kubernetes Tutorial
Kubernetes TutorialKubernetes Tutorial
Kubernetes Tutorial
 
The Weather of the Century Part 2: High Performance
The Weather of the Century Part 2: High PerformanceThe Weather of the Century Part 2: High Performance
The Weather of the Century Part 2: High Performance
 
Detection of errors and potential vulnerabilities in C and C++ code using the...
Detection of errors and potential vulnerabilities in C and C++ code using the...Detection of errors and potential vulnerabilities in C and C++ code using the...
Detection of errors and potential vulnerabilities in C and C++ code using the...
 
Effective Modern C++ - Item 35 & 36
Effective Modern C++ - Item 35 & 36Effective Modern C++ - Item 35 & 36
Effective Modern C++ - Item 35 & 36
 
Arna Friend Controls II Final
Arna Friend Controls II FinalArna Friend Controls II Final
Arna Friend Controls II Final
 
Apply Hammer Directly to Thumb; Avoiding Apache Spark and Cassandra AntiPatt...
 Apply Hammer Directly to Thumb; Avoiding Apache Spark and Cassandra AntiPatt... Apply Hammer Directly to Thumb; Avoiding Apache Spark and Cassandra AntiPatt...
Apply Hammer Directly to Thumb; Avoiding Apache Spark and Cassandra AntiPatt...
 
Code vectorization for mobile devices
Code vectorization for mobile devicesCode vectorization for mobile devices
Code vectorization for mobile devices
 
Bind Peeking - The Endless Tuning Nightmare
Bind Peeking - The Endless Tuning NightmareBind Peeking - The Endless Tuning Nightmare
Bind Peeking - The Endless Tuning Nightmare
 
Exploring Parallel Merging In GPU Based Systems Using CUDA C.
Exploring Parallel Merging In GPU Based Systems Using CUDA C.Exploring Parallel Merging In GPU Based Systems Using CUDA C.
Exploring Parallel Merging In GPU Based Systems Using CUDA C.
 
BGP communities and geotags
BGP communities and geotagsBGP communities and geotags
BGP communities and geotags
 
Valerii Vasylkov Erlang. measurements and benefits.
Valerii Vasylkov Erlang. measurements and benefits.Valerii Vasylkov Erlang. measurements and benefits.
Valerii Vasylkov Erlang. measurements and benefits.
 
Scoped dynamic rewrite rules
Scoped dynamic rewrite rulesScoped dynamic rewrite rules
Scoped dynamic rewrite rules
 

Similar to Efficient Programs

Nodejs性能分析优化和分布式设计探讨
Nodejs性能分析优化和分布式设计探讨Nodejs性能分析优化和分布式设计探讨
Nodejs性能分析优化和分布式设计探讨flyinweb
 
Performance tweaks and tools for Linux (Joe Damato)
Performance tweaks and tools for Linux (Joe Damato)Performance tweaks and tools for Linux (Joe Damato)
Performance tweaks and tools for Linux (Joe Damato)Ontico
 
Tracing Parallel Execution (UKOUG 2006)
Tracing Parallel Execution (UKOUG 2006)Tracing Parallel Execution (UKOUG 2006)
Tracing Parallel Execution (UKOUG 2006)Doug Burns
 
Parallel Computing with R
Parallel Computing with RParallel Computing with R
Parallel Computing with RPeter Solymos
 
HW 5-RSAascii2str.mfunction str = ascii2str(ascii) .docx
HW 5-RSAascii2str.mfunction str = ascii2str(ascii)        .docxHW 5-RSAascii2str.mfunction str = ascii2str(ascii)        .docx
HW 5-RSAascii2str.mfunction str = ascii2str(ascii) .docxwellesleyterresa
 
Debugging Ruby
Debugging RubyDebugging Ruby
Debugging RubyAman Gupta
 
Dive into EXPLAIN - PostgreSql
Dive into EXPLAIN  - PostgreSqlDive into EXPLAIN  - PostgreSql
Dive into EXPLAIN - PostgreSqlDmytro Shylovskyi
 
Write Python for Speed
Write Python for SpeedWrite Python for Speed
Write Python for SpeedYung-Yu Chen
 
Debugging Ruby Systems
Debugging Ruby SystemsDebugging Ruby Systems
Debugging Ruby SystemsEngine Yard
 
Parallel R in snow (english after 2nd slide)
Parallel R in snow (english after 2nd slide)Parallel R in snow (english after 2nd slide)
Parallel R in snow (english after 2nd slide)Cdiscount
 
Joker 2015 - Валеев Тагир - Что же мы измеряем?
Joker 2015 - Валеев Тагир - Что же мы измеряем?Joker 2015 - Валеев Тагир - Что же мы измеряем?
Joker 2015 - Валеев Тагир - Что же мы измеряем?tvaleev
 
PVS-Studio team experience: checking various open source projects, or mistake...
PVS-Studio team experience: checking various open source projects, or mistake...PVS-Studio team experience: checking various open source projects, or mistake...
PVS-Studio team experience: checking various open source projects, or mistake...Andrey Karpov
 
Windbg랑 친해지기
Windbg랑 친해지기Windbg랑 친해지기
Windbg랑 친해지기Ji Hun Kim
 
lecture7.ppt
lecture7.pptlecture7.ppt
lecture7.pptEdFeranil
 
Accelerating microbiome research with OpenACC
Accelerating microbiome research with OpenACCAccelerating microbiome research with OpenACC
Accelerating microbiome research with OpenACCIgor Sfiligoi
 
Profiling your Applications using the Linux Perf Tools
Profiling your Applications using the Linux Perf ToolsProfiling your Applications using the Linux Perf Tools
Profiling your Applications using the Linux Perf ToolsemBO_Conference
 

Similar to Efficient Programs (20)

Nodejs性能分析优化和分布式设计探讨
Nodejs性能分析优化和分布式设计探讨Nodejs性能分析优化和分布式设计探讨
Nodejs性能分析优化和分布式设计探讨
 
Performance tweaks and tools for Linux (Joe Damato)
Performance tweaks and tools for Linux (Joe Damato)Performance tweaks and tools for Linux (Joe Damato)
Performance tweaks and tools for Linux (Joe Damato)
 
Tracing Parallel Execution (UKOUG 2006)
Tracing Parallel Execution (UKOUG 2006)Tracing Parallel Execution (UKOUG 2006)
Tracing Parallel Execution (UKOUG 2006)
 
Performance
PerformancePerformance
Performance
 
Parallel Computing with R
Parallel Computing with RParallel Computing with R
Parallel Computing with R
 
HW 5-RSAascii2str.mfunction str = ascii2str(ascii) .docx
HW 5-RSAascii2str.mfunction str = ascii2str(ascii)        .docxHW 5-RSAascii2str.mfunction str = ascii2str(ascii)        .docx
HW 5-RSAascii2str.mfunction str = ascii2str(ascii) .docx
 
Debugging Ruby
Debugging RubyDebugging Ruby
Debugging Ruby
 
Ping to Pong
Ping to PongPing to Pong
Ping to Pong
 
Dive into EXPLAIN - PostgreSql
Dive into EXPLAIN  - PostgreSqlDive into EXPLAIN  - PostgreSql
Dive into EXPLAIN - PostgreSql
 
Write Python for Speed
Write Python for SpeedWrite Python for Speed
Write Python for Speed
 
Debugging Ruby Systems
Debugging Ruby SystemsDebugging Ruby Systems
Debugging Ruby Systems
 
Parallel R in snow (english after 2nd slide)
Parallel R in snow (english after 2nd slide)Parallel R in snow (english after 2nd slide)
Parallel R in snow (english after 2nd slide)
 
Joker 2015 - Валеев Тагир - Что же мы измеряем?
Joker 2015 - Валеев Тагир - Что же мы измеряем?Joker 2015 - Валеев Тагир - Что же мы измеряем?
Joker 2015 - Валеев Тагир - Что же мы измеряем?
 
Rkf
RkfRkf
Rkf
 
PVS-Studio team experience: checking various open source projects, or mistake...
PVS-Studio team experience: checking various open source projects, or mistake...PVS-Studio team experience: checking various open source projects, or mistake...
PVS-Studio team experience: checking various open source projects, or mistake...
 
Windbg랑 친해지기
Windbg랑 친해지기Windbg랑 친해지기
Windbg랑 친해지기
 
Auto
AutoAuto
Auto
 
lecture7.ppt
lecture7.pptlecture7.ppt
lecture7.ppt
 
Accelerating microbiome research with OpenACC
Accelerating microbiome research with OpenACCAccelerating microbiome research with OpenACC
Accelerating microbiome research with OpenACC
 
Profiling your Applications using the Linux Perf Tools
Profiling your Applications using the Linux Perf ToolsProfiling your Applications using the Linux Perf Tools
Profiling your Applications using the Linux Perf Tools
 

Recently uploaded

Raspberry Pi 5: Challenges and Solutions in Bringing up an OpenGL/Vulkan Driv...
Raspberry Pi 5: Challenges and Solutions in Bringing up an OpenGL/Vulkan Driv...Raspberry Pi 5: Challenges and Solutions in Bringing up an OpenGL/Vulkan Driv...
Raspberry Pi 5: Challenges and Solutions in Bringing up an OpenGL/Vulkan Driv...Igalia
 
Boost Fertility New Invention Ups Success Rates.pdf
Boost Fertility New Invention Ups Success Rates.pdfBoost Fertility New Invention Ups Success Rates.pdf
Boost Fertility New Invention Ups Success Rates.pdfsudhanshuwaghmare1
 
Boost PC performance: How more available memory can improve productivity
Boost PC performance: How more available memory can improve productivityBoost PC performance: How more available memory can improve productivity
Boost PC performance: How more available memory can improve productivityPrincipled Technologies
 
Handwritten Text Recognition for manuscripts and early printed texts
Handwritten Text Recognition for manuscripts and early printed textsHandwritten Text Recognition for manuscripts and early printed texts
Handwritten Text Recognition for manuscripts and early printed textsMaria Levchenko
 
What Are The Drone Anti-jamming Systems Technology?
What Are The Drone Anti-jamming Systems Technology?What Are The Drone Anti-jamming Systems Technology?
What Are The Drone Anti-jamming Systems Technology?Antenna Manufacturer Coco
 
How to convert PDF to text with Nanonets
How to convert PDF to text with NanonetsHow to convert PDF to text with Nanonets
How to convert PDF to text with Nanonetsnaman860154
 
08448380779 Call Girls In Civil Lines Women Seeking Men
08448380779 Call Girls In Civil Lines Women Seeking Men08448380779 Call Girls In Civil Lines Women Seeking Men
08448380779 Call Girls In Civil Lines Women Seeking MenDelhi Call girls
 
08448380779 Call Girls In Friends Colony Women Seeking Men
08448380779 Call Girls In Friends Colony Women Seeking Men08448380779 Call Girls In Friends Colony Women Seeking Men
08448380779 Call Girls In Friends Colony Women Seeking MenDelhi Call girls
 
Factors to Consider When Choosing Accounts Payable Services Providers.pptx
Factors to Consider When Choosing Accounts Payable Services Providers.pptxFactors to Consider When Choosing Accounts Payable Services Providers.pptx
Factors to Consider When Choosing Accounts Payable Services Providers.pptxKatpro Technologies
 
GenCyber Cyber Security Day Presentation
GenCyber Cyber Security Day PresentationGenCyber Cyber Security Day Presentation
GenCyber Cyber Security Day PresentationMichael W. Hawkins
 
Workshop - Best of Both Worlds_ Combine KG and Vector search for enhanced R...
Workshop - Best of Both Worlds_ Combine  KG and Vector search for  enhanced R...Workshop - Best of Both Worlds_ Combine  KG and Vector search for  enhanced R...
Workshop - Best of Both Worlds_ Combine KG and Vector search for enhanced R...Neo4j
 
CNv6 Instructor Chapter 6 Quality of Service
CNv6 Instructor Chapter 6 Quality of ServiceCNv6 Instructor Chapter 6 Quality of Service
CNv6 Instructor Chapter 6 Quality of Servicegiselly40
 
The 7 Things I Know About Cyber Security After 25 Years | April 2024
The 7 Things I Know About Cyber Security After 25 Years | April 2024The 7 Things I Know About Cyber Security After 25 Years | April 2024
The 7 Things I Know About Cyber Security After 25 Years | April 2024Rafal Los
 
A Call to Action for Generative AI in 2024
A Call to Action for Generative AI in 2024A Call to Action for Generative AI in 2024
A Call to Action for Generative AI in 2024Results
 
Presentation on how to chat with PDF using ChatGPT code interpreter
Presentation on how to chat with PDF using ChatGPT code interpreterPresentation on how to chat with PDF using ChatGPT code interpreter
Presentation on how to chat with PDF using ChatGPT code interpreternaman860154
 
TrustArc Webinar - Stay Ahead of US State Data Privacy Law Developments
TrustArc Webinar - Stay Ahead of US State Data Privacy Law DevelopmentsTrustArc Webinar - Stay Ahead of US State Data Privacy Law Developments
TrustArc Webinar - Stay Ahead of US State Data Privacy Law DevelopmentsTrustArc
 
Automating Google Workspace (GWS) & more with Apps Script
Automating Google Workspace (GWS) & more with Apps ScriptAutomating Google Workspace (GWS) & more with Apps Script
Automating Google Workspace (GWS) & more with Apps Scriptwesley chun
 
The Role of Taxonomy and Ontology in Semantic Layers - Heather Hedden.pdf
The Role of Taxonomy and Ontology in Semantic Layers - Heather Hedden.pdfThe Role of Taxonomy and Ontology in Semantic Layers - Heather Hedden.pdf
The Role of Taxonomy and Ontology in Semantic Layers - Heather Hedden.pdfEnterprise Knowledge
 
Tata AIG General Insurance Company - Insurer Innovation Award 2024
Tata AIG General Insurance Company - Insurer Innovation Award 2024Tata AIG General Insurance Company - Insurer Innovation Award 2024
Tata AIG General Insurance Company - Insurer Innovation Award 2024The Digital Insurer
 
Breaking the Kubernetes Kill Chain: Host Path Mount
Breaking the Kubernetes Kill Chain: Host Path MountBreaking the Kubernetes Kill Chain: Host Path Mount
Breaking the Kubernetes Kill Chain: Host Path MountPuma Security, LLC
 

Recently uploaded (20)

Raspberry Pi 5: Challenges and Solutions in Bringing up an OpenGL/Vulkan Driv...
Raspberry Pi 5: Challenges and Solutions in Bringing up an OpenGL/Vulkan Driv...Raspberry Pi 5: Challenges and Solutions in Bringing up an OpenGL/Vulkan Driv...
Raspberry Pi 5: Challenges and Solutions in Bringing up an OpenGL/Vulkan Driv...
 
Boost Fertility New Invention Ups Success Rates.pdf
Boost Fertility New Invention Ups Success Rates.pdfBoost Fertility New Invention Ups Success Rates.pdf
Boost Fertility New Invention Ups Success Rates.pdf
 
Boost PC performance: How more available memory can improve productivity
Boost PC performance: How more available memory can improve productivityBoost PC performance: How more available memory can improve productivity
Boost PC performance: How more available memory can improve productivity
 
Handwritten Text Recognition for manuscripts and early printed texts
Handwritten Text Recognition for manuscripts and early printed textsHandwritten Text Recognition for manuscripts and early printed texts
Handwritten Text Recognition for manuscripts and early printed texts
 
What Are The Drone Anti-jamming Systems Technology?
What Are The Drone Anti-jamming Systems Technology?What Are The Drone Anti-jamming Systems Technology?
What Are The Drone Anti-jamming Systems Technology?
 
How to convert PDF to text with Nanonets
How to convert PDF to text with NanonetsHow to convert PDF to text with Nanonets
How to convert PDF to text with Nanonets
 
08448380779 Call Girls In Civil Lines Women Seeking Men
08448380779 Call Girls In Civil Lines Women Seeking Men08448380779 Call Girls In Civil Lines Women Seeking Men
08448380779 Call Girls In Civil Lines Women Seeking Men
 
08448380779 Call Girls In Friends Colony Women Seeking Men
08448380779 Call Girls In Friends Colony Women Seeking Men08448380779 Call Girls In Friends Colony Women Seeking Men
08448380779 Call Girls In Friends Colony Women Seeking Men
 
Factors to Consider When Choosing Accounts Payable Services Providers.pptx
Factors to Consider When Choosing Accounts Payable Services Providers.pptxFactors to Consider When Choosing Accounts Payable Services Providers.pptx
Factors to Consider When Choosing Accounts Payable Services Providers.pptx
 
GenCyber Cyber Security Day Presentation
GenCyber Cyber Security Day PresentationGenCyber Cyber Security Day Presentation
GenCyber Cyber Security Day Presentation
 
Workshop - Best of Both Worlds_ Combine KG and Vector search for enhanced R...
Workshop - Best of Both Worlds_ Combine  KG and Vector search for  enhanced R...Workshop - Best of Both Worlds_ Combine  KG and Vector search for  enhanced R...
Workshop - Best of Both Worlds_ Combine KG and Vector search for enhanced R...
 
CNv6 Instructor Chapter 6 Quality of Service
CNv6 Instructor Chapter 6 Quality of ServiceCNv6 Instructor Chapter 6 Quality of Service
CNv6 Instructor Chapter 6 Quality of Service
 
The 7 Things I Know About Cyber Security After 25 Years | April 2024
The 7 Things I Know About Cyber Security After 25 Years | April 2024The 7 Things I Know About Cyber Security After 25 Years | April 2024
The 7 Things I Know About Cyber Security After 25 Years | April 2024
 
A Call to Action for Generative AI in 2024
A Call to Action for Generative AI in 2024A Call to Action for Generative AI in 2024
A Call to Action for Generative AI in 2024
 
Presentation on how to chat with PDF using ChatGPT code interpreter
Presentation on how to chat with PDF using ChatGPT code interpreterPresentation on how to chat with PDF using ChatGPT code interpreter
Presentation on how to chat with PDF using ChatGPT code interpreter
 
TrustArc Webinar - Stay Ahead of US State Data Privacy Law Developments
TrustArc Webinar - Stay Ahead of US State Data Privacy Law DevelopmentsTrustArc Webinar - Stay Ahead of US State Data Privacy Law Developments
TrustArc Webinar - Stay Ahead of US State Data Privacy Law Developments
 
Automating Google Workspace (GWS) & more with Apps Script
Automating Google Workspace (GWS) & more with Apps ScriptAutomating Google Workspace (GWS) & more with Apps Script
Automating Google Workspace (GWS) & more with Apps Script
 
The Role of Taxonomy and Ontology in Semantic Layers - Heather Hedden.pdf
The Role of Taxonomy and Ontology in Semantic Layers - Heather Hedden.pdfThe Role of Taxonomy and Ontology in Semantic Layers - Heather Hedden.pdf
The Role of Taxonomy and Ontology in Semantic Layers - Heather Hedden.pdf
 
Tata AIG General Insurance Company - Insurer Innovation Award 2024
Tata AIG General Insurance Company - Insurer Innovation Award 2024Tata AIG General Insurance Company - Insurer Innovation Award 2024
Tata AIG General Insurance Company - Insurer Innovation Award 2024
 
Breaking the Kubernetes Kill Chain: Host Path Mount
Breaking the Kubernetes Kill Chain: Host Path MountBreaking the Kubernetes Kill Chain: Host Path Mount
Breaking the Kubernetes Kill Chain: Host Path Mount
 

Efficient Programs

  • 1. A collection of Micro Optimizations by Alex, Gabriel & Michael
  • 2. Start ● Optimizations from hash.0.c to hash.13.c ● Performance testing: gcc -Wall -O3 hash.c -o hash perf stat -r 5 -e instructions -e branch-misses hash input input2 perf stat -r 5 -e cycles hash input input2 ● Result: Cycles: 7.292.009.385 Instructions: 1.063.178.278 Branch mispredictions: 11.395.359 Time elapsed: 2.2927 s
  • 3. Analysis ● Hashtable ○ Max. Collisions: 7 ○ Empty Elements: 363.543 ○ Amount Elements: 1.048.575 ○ Input Elements: 724.129 ● Good hash table size: ~ 20% of Input ● Brent-Hashing? ● Parallelism?
  • 4. Convert Linked-Lists to Arrays ● fewer cache misses on frequently used lookup ● overhead due to reorganizing ● struct size reduced from 24 to 16 bytes due to removing *next ● faster at large lists ○ break even point at HASHSIZE 2^18
  • 5. Loop peeling: lookup if(l != NULL) { if (keylen == l->keylen && memcmp(keyaddr, l->keyaddr, keylen)==0) return l->value; l = l->next; while (l!=NULL) { if (keylen == l->keylen && memcmp(keyaddr, l->keyaddr, keylen)==0) return l->value; l = l->next; } } return -1; Cycles: 7.255.927.875 (-0,495%) Instructions: 1.067.896.719 (+0,444%) Branch mispredictions: 11.464.124 (+0,603%) Time elapsed: 2,1613 s (-5,731%)
  • 6. Inline inline struct block slurp(char *filename) inline unsigned long hash(char *addr, size_t len) inline void insert(char *keyaddr, size_t keylen, int value) inline int lookup(char *keyaddr, size_t keylen) Cycles: 7.265.216.080 (+0,128%) Instructions: 1.067.543.945 (-0,033%) Branch mispredictions: 11.541.050 (+0,671%) Time elapsed: 2,1672 s (+0,273%)
  • 7. Replace loop with macro #define REPEAT10(x) { x x x x x x x x x x } REPEAT10 ( for (p=input2.addr, endp=input2.addr+input2.len; p<endp; ) { ... } ); Cycles: 7.313.103.515 (+0,659%) Instructions: 1.062.596.883 (-0,463%) Branch mispredictions: 11.423.373 (-1,020%) Time elapsed: 2,1791 s (+0,549%)
  • 8. Some Minor Changes ● new macro for HASHSIZE-1 ● Remove unnecessary casts ... with no effect
  • 9. Loop peeling + adjust len inline unsigned long hash(char *addr, size_t len) { ... if(len > 7 ) { len = len - 7; x = (*(unsigned long *)addr)*hashmult; for (i=8; i<len; i+=8) { w = *(unsigned long *)(addr+i); x = (x + w)*hashmult; } len = len + 7; } ...
  • 10. Loop peeling + adjust len Cycles: 8.271.902.713 (+13,111%) Instructions: 1.038.690.398 (-2,250%) Branch mispredictions: 11.809.722 (+3,382%) Time elapsed: 2,4551 s (+12,668%) => probably faster for long strings => changes discarded
  • 11. Pointers instead of indices uint128_t x; unsigned long * laddr = (unsigned long *) addr; unsigned long * end = (unsigned long *) (addr+len); if(len > 7 ) { x = *laddr * hashmult; end--; for (laddr++; laddr <= end; laddr++) { x = (x + *laddr)*hashmult; } if (laddr < (end+1)) x = ( x + ((*laddr)<< ( ((char*)laddr - (char*)end)*8)) ) * hashmult; return x+(x>>64); } else if (laddr < end) { x = (uint128_t)((*laddr)<<((8-len)*8)) * hashmult; return x+(x>>64); } return 0;
  • 12. Pointers instead of indices Cycles: 8.253.559.129 (+12,860%) Instructions: 1.021.822.315 (-3,837%) Branch mispredictions: 11.825.252 (+3,518%) Time elapsed: 2,4558 s (+12,700%) => probably faster for long strings => changes discarded
  • 13. Improve loop-layout for (p=input1.addr, endp=input1.addr+input1.len, i=0; p<endp; i++) { nextp=memchr(p, '\n', endp-p); if (nextp == NULL) break; ... } ------------------------------------------------ for (p=input.addr, endp=input.addr+input.len, r=0, nextp=memchr(p, '\n', endp-p); nextp != NULL; r++, nextp=memchr(p, '\n', endp-p)) { ... }
  • 14. Improve loop-layout Cycles: 7.364.723.755 (+0,705%) Instructions: 1072512560 (+0,933%) Branch mispredictions: 11606354 (+1,601%) Time elapsed: 2,2509 s (+3,294%) => "if" and "&&" probably similar instructions in this case
  • 15. Remove unnecessary check for (p=input.addr, endp=input.addr+input.len, r=0, nextp=memchr(p, '\n', endp-p); nextp != NULL; r++, nextp=memchr(p, '\n', endp-p)) { ... } Remove unnecessary variables struct block input1, input2; struct block input; unsigned int i; unsigned long r=0; unsigned long r=0;
  • 16. Remove unnecessary check & variables Cycles: 7323904385 (-0,554%) Instructions: 1064977111 (-0,702%) Branch mispredictions: 11734428 (+1,103%) Time elapsed: 2,2129 s (-1,688%)
  • 17. Sentinel with rawmemchr ● Idea: ○ replace '\0' with '\n' at the end ○ use rawmemchr without length check instead ○ save compares endp=input1.addr+input1.len; *endp = '\n'; for (p=input1.addr, i=0, nextp=rawmemchr(p, '\n'); p<endp ; i++) { nextp=rawmemchr(p, '\n'); insert(p, nextp-p, i); p = nextp+1; }
  • 18. Sentinel self made rawmemchr endp=input.addr+input.len; *endp = '\n'; p=input.addr; nextp = p; for (r=0; nextp<endp; r++) { for(;*nextp ^ '\n'; nextp++); insert(p, nextp-p, r); nextp++; p = nextp; } Cycles: 7.400.275.087 (+1,042%) Instructions: 1.157.591.866 (+8,696%) Branch mispredictions: 11.715.914 (-0,158%) Time elapsed: 2,2064 s (-0,293%)
  • 19. Faster memcmp inline int mycmp(char* in1, char* in2, int len){ do{ if(*in1 ^ *in2) return 0; in1++; in2++; len--; }while(len>0); return 1; } if (keylen == l->keylen && mycmp(l->keyaddr, keyaddr, keylen)) Cycles: 5.826.523.410 (-21,266%) Instructions: 1.913.851.749 (+65,330%) Branch mispredictions: 14.810.147 (+26,410%) Time elapsed: 1,7366 s (-21,292%)
  • 20. Faster memcmp with Sentinel *(keyaddr+keylen) = 0; // FROM INSERT inline int mycmp(char* in1, char* in2, int len){ while(*in1 == *in2) { in1++; in2++; len--; } return len; } if (keylen==l->keylen && !mycmp(l->keyaddr, keyaddr, keylen)) // FROM LOOKUP Cycles: 5.766.254.891 (-22,080%) Instructions: 1.747.135.165 (+50,928%) Branch mispredictions: 14.772.984 (+26,093%) Time elapsed: 1,7182 s (-22,126%)
  • 21. Caching int *cache = malloc(size*sizeof(int)); int *startcache, *endcache; startcache = cache; endcache = startcache + size; while(nextp<endp) { if (cache >= endcache){ size = size<<1; cache = realloc(cache, size*sizeof(int)); } for(;*nextp ^ '\n'; nextp++); *cache = lookup(p, nextp-p); r = r * 2654435761L + *cache; r = r + (r>>32); cache++; nextp++; p = nextp; } endcache = cache; REPEAT9 ( cache = startcache; while (cache < endcache) { r = r * 2654435761L + *cache; r = r + (r>>32); cache++; } );
  • 22. Caching + memcmp Cycles: 925.886.063 (-84,109%) Instructions: 494.630.615 (-74,155%) Branch mispredictions: 2.395.446 (-83,825%) Time elapsed: 0,2847 s (-83,603%)
  • 23. Caching + memcmp with Sentinel Cycles: 925.783.880 (-84,110%) Instructions: 475.125.520 (-75,172%) Branch mispredictions: 2.418.936 (-83,659%) Time elapsed: 0,2839 s (-83,738%)
  • 24. Approximation of cache size int size = input.len/6; int size = input.len/2; ... if (cache >= endcache){ size = size<<1; cache = realloc(cache, size*sizeof(int)); } ... Cycles: 930.929.061 (+0,544%) Instructions: 475.676.977 (-3,831%) Branch mispredictions: 2.384.999 (-0,436%) Time elapsed: 0,2830 s (-0,586%)
  • 25. Overall Cycles: 930.929.061 (-87,233%) Instructions: 475.676.977 (-55,259%) Branch mispredictions: 2.384.999 (-79,070%) Time elapsed: 0,2830 s (-87,656%)