Efficient Programs

A collection of
Micro Optimizations
by Alex, Gabriel & Michael

Start
● Optimizations from hash.0.c to hash.13.c
● Performance testing:
gcc -Wall -O3 hash.c -o hash
perf stat -r 5 -e instructions -e branch-misses hash input input2
perf stat -r 5 -e cycles hash input input2

● Result:
Cycles: 7.292.009.385
Instructions: 1.063.178.278
Branch mispredictions: 11.395.359
Time elapsed: 2.2927 s

Analysis
● Hashtable
○ Max. Collisions: 7
○ Empty Elements: 363.543
○ Amount Elements: 1.048.575
○ Input Elements: 724.129

● Good hash table size: ~ 20% of Input
● Brent-Hashing?
● Parallelism?

Convert Linked-Lists to Arrays
● fewer cache misses on frequently used lookup

● overhead due to reorganizing

● struct size reduced from 24 to 16 bytes
due to removing *next

● faster at large lists
○ break even point at HASHSIZE 2^18

Loop peeling: lookup
if(l != NULL) {
if (keylen == l->keylen && memcmp(keyaddr, l->keyaddr, keylen)==0)
return l->value;
l = l->next;
while (l!=NULL) {
if (keylen == l->keylen && memcmp(keyaddr, l->keyaddr, keylen)==0)
return l->value;
l = l->next;
}
}
return -1;

Cycles: 7.255.927.875 (-0,495%)
Instructions: 1.067.896.719 (+0,444%)
Branch mispredictions: 11.464.124 (+0,603%)
Time elapsed: 2,1613 s (-5,731%)

Inline
inline struct block slurp(char *filename)
inline unsigned long hash(char *addr, size_t len)
inline void insert(char *keyaddr, size_t keylen, int value)
inline int lookup(char *keyaddr, size_t keylen)

Cycles: 7.265.216.080 (+0,128%)
Instructions: 1.067.543.945 (-0,033%)
Time elapsed: 2,1672 s (+0,273%)

Replace loop with macro
#define REPEAT10(x) { x x x x x x x x x x }
REPEAT10 (
for (p=input2.addr, endp=input2.addr+input2.len; p<endp; ) {
...
}
);

Cycles: 7.313.103.515 (+0,659%)
Instructions: 1.062.596.883 (-0,463%)
Branch mispredictions: 11.423.373 (-1,020%)
Time elapsed: 2,1791 s (+0,549%)

Some Minor Changes
● new Makro HASHSIZE-1
● Remove unnecessary Casts

... with no effects

Loop peeling + adjust len
inline unsigned long hash(char *addr, size_t len) {

...

if(len > 7 ) {
len = len - 7;
x = (*(unsigned long *)addr)*hashmult;
for (i=8; i<len; i+=8) {
w = *(unsigned long *)(addr+i);
x = (x + w)*hashmult;
}
len = len + 7;
}

...

Loop peeling + adjust len

Cycles: 8.271.902.713 (+13,111%)
Instructions: 1.038.690.398 (-2,250%)
Time elapsed: 2,4551 s (+12,668%)

=> probably faster for long strings
=> changes discarded

Pointers instead of indices
uint128_t x;
unsigned long * laddr = (unsigned long *) addr;
unsigned long * end = (unsigned long *) (addr+len);

if(len > 7 ) {
x = *laddr * hashmult;
end--;
for (laddr++; laddr <= end; laddr++) {
x = (x + *laddr)*hashmult;
}
if (laddr < (end+1))
x = ( x + ((*laddr)<< ( ((char*)laddr - (char*)end)*8)) ) * hashmult;
return x+(x>>64);
} else if (laddr < end) {
x = (uint128_t)((*laddr)<<((8-len)*8)) * hashmult;
return x+(x>>64);
}

return 0;

Pointers instead of indices

Cycles: 8.253.559.129 (+12,860%)
Instructions: 1.021.822.315 (-3,837%)
Branch misprecditions: 1.1825.252 (+3,518%)
Time elapsed: 2,4558 s (+12,700%)

=> probably faster for long strings
=> changes discarded

Improve loop-layout
for (p=input1.addr, endp=input1.addr+input1.len, i=0; p<endp; i++) {
nextp=memchr(p, 'n', endp-p);
if (nextp == NULL)
break;
...
}
------------------------------------------------
for (p=input.addr, endp=input.addr+input.len, r=0,
nextp=memchr(p, 'n', endp-p); nextp != NULL;
r++, nextp=memchr(p, 'n', endp-p)) {
...
}

Improve loop-layout
Cycles: 7.364.723.755 (+0,705%)
Instructions: 1072512560 (+0,933%)
Branch mispredictions: 11606354 (+1,601%)
Time elapsed: 2,2509 s (+3,294%)

=> "if" and "&&" probably similar instructions
in this case

Remove unnecessary check
for (p=input.addr, endp=input.addr+input.len, r=0,
nextp=memchr(p, 'n', endp-p); nextp != NULL;
r++, nextp=memchr(p, 'n', endp-p)) {
...
}

Remove unnecessary variables
struct block input1, input2; struct block input;

unsigned int i; unsigned long r=0;
unsigned long r=0;

Remove unnecessary
check & variables

Cycles: 7323904385 (-0,554%)
Instructions: 1064977111 (-0,702%)
Branch mispredictions: 11734428 (+1,103%)
Time elapsed: 2,2129 s (-1,688%)

Sentinel with rawmemchr
● Idea:
○ replace '0' with 'n' at the end
○ use rawmemchr without length check instead
○ safe compares

endp=input1.addr+input1.len;
*endp = 'n';

for (p=input1.addr, i=0, nextp=rawmemchr(p, 'n'); p<endp ; i++) {
nextp=rawmemchr(p, 'n');
insert(p, nextp-p, i);
p = nextp+1;
}

Sentinel self made rawmemchr
endp=input.addr+input.len;
*endp = 'n';
p=input.addr;
nextp = p;

for (r=0; nextp<endp; r++) {
for(;*nextp ^ 'n'; nextp++);
insert(p, nextp-p, r);
nextp++;
p = nextp;
}

Cycles: 7.400.275.087 (+1,042%)
Instructions: 1.157.591.866 (+8,696%)
Time elapsed: 2,2064 s (-0,293%)

Faster memcmp
inline int mycmp(char* in1, char* in2, int len){
do{
if(*in1 ^ *in2) return 0;
in1++; in2++; len--;
}while(len>0);
return 1;
}

if (keylen == l->keylen && mycmp(l->keyaddr, keyaddr, keylen))

Cycles: 5.826.523.410 (-21,266%)
Instructions: 1.913.851.749 (+65,330%)
Time elapsed: 1,7366 s (-21,292%)

Faster memcmp with Sentinel
*(keyaddr+keylen) = 0; // FROM INSERT

inline int mycmp(char* in1, char* in2, int len){
while(*in1 == *in2) {
in1++; in2++; len--;
}
return len;
}

if (keylen==l->keylen && !mycmp(l->keyaddr, keyaddr, keylen)) // FROM
LOOKUP

Cycles: 5.766.254.891 (-22,080%)
Instructions: 1.747.135.165 (+50,928%)
Time elapsed: 1,7182 s (-22,126%)

Caching
int *cache = malloc(size*sizeof(int)); endcache = cache;
int *startcache, *endcache;
startcache = cache; REPEAT9 (
endcache = startcache + size; cache = startcache;
while (cache < endcache) {
while(nextp<endp) { r = r * 2654435761L +
if (cache >= endcache){ *cache;
size = size<<1; r = r + (r>>32);
cache = realloc(cache, size*sizeof(int)); cache++;
} } );
for(;*nextp ^ 'n'; nextp++);
*cache = lookup(p, nextp-p);
r = r * 2654435761L + *cache;
r = r + (r>>32);
cache++; nextp++; p = nextp;
}

Caching + memcmp

Cycles: 925.886.063 (-84,109%)
Instructions: 494.630.615 (-74,155%)
Time elapsed: 0,2847 s (-83,603%)

Caching + memcmp with Sentinel

Cycles: 925.783.880 (-84,110%)
Instructions: 475.125.520 (-75,172%)
Time elapsed: 0,2839 s (-83,738%)

Approximation of cache size

int size = input.len/6; int size = input.len/2;
...
if (cache >= endcache){
size = size<<1;
cache = realloc(cache, size*sizeof(int));
}
...

Cycles: 930.929.061 (+0,544%)
Instructions: 475.676.977 (-3,831%)
Time elapsed: 0,2830 s (-0,586%)

Overall

Cycles: 930.929.061 (-87,233%)
Instructions: 475.676.977 (-55,259%)
Branch mispredictions: 2.384.999 (-79.070%)
Time elapsed: 0,2830 s (-87,656%)

Fin
Any Questions?

Code available:
https://github.com/grill/micro-optimisations

Efficient Programs

Recommended

Recommended

More Related Content

What's hot

What's hot (20)

Similar to Efficient Programs

Similar to Efficient Programs (20)

Recently uploaded

Recently uploaded (20)

Efficient Programs