# Efficient Programs

This is a collection of micro-optimizations done for a course at the Vienna University of Technology.
The code is available at: https://github.com/grill/micro-optimisations


### Efficient Programs

#### 1. A collection of Micro Optimizations

by Alex, Gabriel & Michael
#### 2. Start

- Optimizations from hash.0.c to hash.13.c
- Performance testing:

  ```sh
  gcc -Wall -O3 hash.c -o hash
  perf stat -r 5 -e instructions -e branch-misses hash input input2
  perf stat -r 5 -e cycles hash input input2
  ```

- Baseline result:
  - Cycles: 7.292.009.385
  - Instructions: 1.063.178.278
  - Branch mispredictions: 11.395.359
  - Time elapsed: 2,2927 s
#### 3. Analysis

- Hashtable
  - Max. collisions: 7
  - Empty elements: 363.543
  - Total elements: 1.048.575
  - Input elements: 724.129
- Good hash table size: ~20% of input
- Brent hashing?
- Parallelism?
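The fill statistics above can be computed with a short helper. This is an illustrative sketch (not from the slides): given the chain length of every bucket, it reports the longest collision chain, the number of empty buckets, and the total element count.

```c
#include <stddef.h>

/* Illustrative sketch: summarize the fill state of a chained hash
   table from its per-bucket chain lengths. */
typedef struct {
    size_t max_chain;   /* longest collision chain */
    size_t empty;       /* buckets holding no entries */
    size_t entries;     /* total stored elements */
} hash_stats;

hash_stats table_stats(const size_t *chain_len, size_t nbuckets) {
    hash_stats s = {0, 0, 0};
    for (size_t i = 0; i < nbuckets; i++) {
        if (chain_len[i] == 0)
            s.empty++;
        if (chain_len[i] > s.max_chain)
            s.max_chain = chain_len[i];
        s.entries += chain_len[i];
    }
    return s;
}
```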
#### 4. Convert Linked Lists to Arrays

- Fewer cache misses on the frequently used lookup
- Overhead due to reorganizing
- Struct size reduced from 24 to 16 bytes by removing *next
- Faster for large lists
  - Break-even point at HASHSIZE 2^18
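A minimal sketch of the idea, with assumed names and layout (the slides do not show the authors' exact code): each bucket stores its entries in one contiguous array, so a lookup scans adjacent memory instead of chasing `next` pointers, and dropping the pointer is what shrinks the entry from 24 to 16 bytes on a 64-bit target.

```c
#include <stdlib.h>
#include <string.h>

/* Assumed layout: one growable array of entries per bucket. */
struct entry { char *keyaddr; size_t keylen; int value; };
struct bucket { struct entry *entries; size_t n; };

void bucket_insert(struct bucket *b, char *keyaddr, size_t keylen, int value) {
    /* grows by one for simplicity; a real version would grow geometrically */
    b->entries = realloc(b->entries, (b->n + 1) * sizeof(struct entry));
    b->entries[b->n].keyaddr = keyaddr;
    b->entries[b->n].keylen = keylen;
    b->entries[b->n].value = value;
    b->n++;
}

int bucket_lookup(const struct bucket *b, const char *keyaddr, size_t keylen) {
    for (size_t i = 0; i < b->n; i++)   /* linear scan over adjacent entries */
        if (b->entries[i].keylen == keylen &&
            memcmp(b->entries[i].keyaddr, keyaddr, keylen) == 0)
            return b->entries[i].value;
    return -1;
}
```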
#### 5. Loop peeling: lookup

```c
if (l != NULL) {
    if (keylen == l->keylen && memcmp(keyaddr, l->keyaddr, keylen) == 0)
        return l->value;
    l = l->next;
    while (l != NULL) {
        if (keylen == l->keylen && memcmp(keyaddr, l->keyaddr, keylen) == 0)
            return l->value;
        l = l->next;
    }
}
return -1;
```

- Cycles: 7.255.927.875 (-0,495%)
- Instructions: 1.067.896.719 (+0,444%)
- Branch mispredictions: 11.464.124 (+0,603%)
- Time elapsed: 2,1613 s (-5,731%)
#### 6. Inline

```c
inline struct block slurp(char *filename)
inline unsigned long hash(char *addr, size_t len)
inline void insert(char *keyaddr, size_t keylen, int value)
inline int lookup(char *keyaddr, size_t keylen)
```

- Cycles: 7.265.216.080 (+0,128%)
- Instructions: 1.067.543.945 (-0,033%)
- Branch mispredictions: 11.541.050 (+0,671%)
- Time elapsed: 2,1672 s (+0,273%)
#### 7. Replace loop with macro

```c
#define REPEAT10(x) { x x x x x x x x x x }

REPEAT10 (
    for (p = input2.addr, endp = input2.addr + input2.len; p < endp; ) {
        ...
    }
);
```

- Cycles: 7.313.103.515 (+0,659%)
- Instructions: 1.062.596.883 (-0,463%)
- Branch mispredictions: 11.423.373 (-1,020%)
- Time elapsed: 2,1791 s (+0,549%)
#### 8. Some Minor Changes

- New macro HASHSIZE-1
- Removed unnecessary casts

...with no measurable effect.
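The HASHSIZE-1 macro presumably masks the hash into a power-of-two table. A sketch of that trick, with illustrative macro names (the slides do not show the definition): when the table size is a power of two, the bucket index can be taken with a bit-mask instead of a modulo.

```c
/* Illustrative names; 2^20 buckets matches the ~1M-element table
   from the analysis slide. */
#define HASHBITS 20
#define HASHSIZE (1UL << HASHBITS)
#define HASHMASK (HASHSIZE - 1)

unsigned long bucket_index(unsigned long h) {
    return h & HASHMASK;   /* same result as h % HASHSIZE */
}
```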
#### 9. Loop peeling + adjust len

```c
inline unsigned long hash(char *addr, size_t len) {
    ...
    if (len > 7) {
        len = len - 7;
        x = (*(unsigned long *)addr) * hashmult;
        for (i = 8; i < len; i += 8) {
            w = *(unsigned long *)(addr + i);
            x = (x + w) * hashmult;
        }
        len = len + 7;
    }
    ...
```
#### 10. Loop peeling + adjust len: Results

- Cycles: 8.271.902.713 (+13,111%)
- Instructions: 1.038.690.398 (-2,250%)
- Branch mispredictions: 11.809.722 (+3,382%)
- Time elapsed: 2,4551 s (+12,668%)

=> probably faster for long strings
=> changes discarded
#### 12. Pointers instead of indices

- Cycles: 8.253.559.129 (+12,860%)
- Instructions: 1.021.822.315 (-3,837%)
- Branch mispredictions: 11.825.252 (+3,518%)
- Time elapsed: 2,4558 s (+12,700%)

=> probably faster for long strings
=> changes discarded
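This slide shows no code, so here is an illustrative sketch of the transformation it measures (not the authors' exact code): the indexed loop over the key bytes is rewritten to step a pointer directly, removing the `addr + i` address computation from the body.

```c
#include <stddef.h>

#define HASHMULT 2654435761UL   /* the multiplier used elsewhere in the deck */

/* index-based variant */
unsigned long hash_indexed(const char *addr, size_t len) {
    unsigned long x = 0;
    for (size_t i = 0; i < len; i++)
        x = (x + (unsigned char)addr[i]) * HASHMULT;
    return x;
}

/* pointer-based variant: p replaces addr + i */
unsigned long hash_pointer(const char *addr, size_t len) {
    unsigned long x = 0;
    for (const char *p = addr, *end = addr + len; p < end; p++)
        x = (x + (unsigned char)*p) * HASHMULT;
    return x;
}
```

Both variants compute the same value; only the addressing mode in the generated code differs, which is why the measured effect was small here.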
#### 13. Improve loop-layout

Before:

```c
for (p = input1.addr, endp = input1.addr + input1.len, i = 0; p < endp; i++) {
    nextp = memchr(p, n, endp - p);
    if (nextp == NULL)
        break;
    ...
}
```

After:

```c
for (p = input.addr, endp = input.addr + input.len, r = 0,
         nextp = memchr(p, n, endp - p);
     nextp != NULL;
     r++, nextp = memchr(p, n, endp - p)) {
    ...
}
```
#### 14. Improve loop-layout: Results

- Cycles: 7.364.723.755 (+0,705%)
- Instructions: 1.072.512.560 (+0,933%)
- Branch mispredictions: 11.606.354 (+1,601%)
- Time elapsed: 2,2509 s (+3,294%)

=> "if" and "&&" probably compile to similar instructions in this case
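The loop-layout slides reference the program's `input` blocks, so they are not runnable on their own. A self-contained sketch of the "after" layout, with illustrative names: the `memchr` call sits in the for-header, so the body needs no early `break`.

```c
#include <string.h>
#include <stddef.h>

/* Count separator-terminated records in a buffer using the improved
   loop layout from the slide. */
size_t count_records(const char *buf, size_t len, char n) {
    const char *p = buf, *endp = buf + len, *nextp;
    size_t r = 0;
    for (nextp = memchr(p, n, endp - p);
         nextp != NULL;
         r++, nextp = memchr(p, n, endp - p)) {
        p = nextp + 1;   /* continue scanning after the separator */
    }
    return r;
}
```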
#### 15. Remove unnecessary check

```c
for (p = input.addr, endp = input.addr + input.len, r = 0,
         nextp = memchr(p, n, endp - p);
     nextp != NULL;
     r++, nextp = memchr(p, n, endp - p)) {
    ...
}
```

Remove unnecessary variables:

```c
/* before */                    /* after */
struct block input1, input2;    struct block input;
unsigned int i;                 unsigned long r = 0;
unsigned long r = 0;
```
#### 16. Remove unnecessary check & variables: Results

- Cycles: 7.323.904.385 (-0,554%)
- Instructions: 1.064.977.111 (-0,702%)
- Branch mispredictions: 11.734.428 (+1,103%)
- Time elapsed: 2,2129 s (-1,688%)
#### 17. Sentinel with rawmemchr

- Idea:
  - replace the 0 at the end of the input with n
  - use rawmemchr, which needs no length check
  - saves comparisons

```c
endp = input1.addr + input1.len;
*endp = n;
for (p = input1.addr, i = 0, nextp = rawmemchr(p, n); p < endp; i++) {
    nextp = rawmemchr(p, n);
    insert(p, nextp - p, i);
    p = nextp + 1;
}
```
#### 18. Sentinel with self-made rawmemchr

```c
endp = input.addr + input.len;
*endp = n;
p = input.addr;
nextp = p;
for (r = 0; nextp < endp; r++) {
    for (; *nextp ^ n; nextp++)
        ;
    insert(p, nextp - p, r);
    nextp++;
    p = nextp;
}
```

- Cycles: 7.400.275.087 (+1,042%)
- Instructions: 1.157.591.866 (+8,696%)
- Branch mispredictions: 11.715.914 (-0,158%)
- Time elapsed: 2,2064 s (-0,293%)
#### 19. Faster memcmp

```c
inline int mycmp(char *in1, char *in2, int len) {
    do {
        if (*in1 ^ *in2)
            return 0;
        in1++;
        in2++;
        len--;
    } while (len > 0);
    return 1;
}

if (keylen == l->keylen && mycmp(l->keyaddr, keyaddr, keylen))
```

- Cycles: 5.826.523.410 (-21,266%)
- Instructions: 1.913.851.749 (+65,330%)
- Branch mispredictions: 14.810.147 (+26,410%)
- Time elapsed: 1,7366 s (-21,292%)
#### 20. Faster memcmp with Sentinel

```c
*(keyaddr + keylen) = 0;   /* from insert */

inline int mycmp(char *in1, char *in2, int len) {
    while (*in1 == *in2) {
        in1++;
        in2++;
        len--;
    }
    return len;
}

/* from lookup */
if (keylen == l->keylen && !mycmp(l->keyaddr, keyaddr, keylen))
```

- Cycles: 5.766.254.891 (-22,080%)
- Instructions: 1.747.135.165 (+50,928%)
- Branch mispredictions: 14.772.984 (+26,093%)
- Time elapsed: 1,7182 s (-22,126%)
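The sentinel compare is subtle: it only terminates because a 0 byte is written just past the probe key, so the two buffers are guaranteed to differ at or before position `len` and the loop needs no per-iteration length check. A runnable sketch of that contract (the stored key here is followed by a separator byte, as in the program's input buffer):

```c
/* Returns 0 when the first `len` bytes match. Precondition: the
   buffers differ at some position <= len (the sentinel guarantees
   this), otherwise the loop would run past the end. */
static int mycmp(const char *in1, const char *in2, int len) {
    while (*in1 == *in2) {
        in1++;
        in2++;
        len--;
    }
    return len;   /* 0 => keys of length `len` were equal */
}
```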
#### 21. Caching

```c
int *cache = malloc(size * sizeof(int));
int *startcache, *endcache;
startcache = cache;
endcache = cache + size;

/* first pass: do the lookups and record each result in the cache */
while (nextp < endp) {
    if (cache >= endcache) {
        size = size << 1;
        cache = realloc(cache, size * sizeof(int));
    }
    for (; *nextp ^ n; nextp++)
        ;
    *cache = lookup(p, nextp - p);
    r = r * 2654435761L + *cache;
    r = r + (r >> 32);
    cache++;
    nextp++;
    p = nextp;
}

/* remaining nine passes: replay the cached results */
REPEAT9 (
    endcache = startcache + size;
    cache = startcache;
    while (cache < endcache) {
        r = r * 2654435761L + *cache;
        r = r + (r >> 32);
        cache++;
    }
);
```
#### 22. Caching + memcmp: Results

- Cycles: 925.886.063 (-84,109%)
- Instructions: 494.630.615 (-74,155%)
- Branch mispredictions: 2.395.446 (-83,825%)
- Time elapsed: 0,2847 s (-83,603%)
#### 23. Caching + memcmp with Sentinel: Results

- Cycles: 925.783.880 (-84,110%)
- Instructions: 475.125.520 (-75,172%)
- Branch mispredictions: 2.418.936 (-83,659%)
- Time elapsed: 0,2839 s (-83,738%)
#### 24. Approximation of cache size

```c
/* before */                  /* after */
int size = input.len / 6;     int size = input.len / 2;
...
if (cache >= endcache) {
    size = size << 1;
    cache = realloc(cache, size * sizeof(int));
}
...
```

- Cycles: 930.929.061 (+0,544%)
- Instructions: 475.676.977 (-3,831%)
- Branch mispredictions: 2.384.999 (-0,436%)
- Time elapsed: 0,2830 s (-0,586%)
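A self-contained sketch of the growth policy behind this slide, with illustrative names: start the cache at a fraction of the input length and double it with `realloc` whenever the write cursor reaches the end. Unlike the slide's condensed code, this version keeps the base pointer and the cursor separate, which also avoids the classic pitfall of overwriting the only pointer when `realloc` fails.

```c
#include <stdlib.h>

struct cache { int *base; size_t size, used; };

/* Append one value, doubling the allocation when full. */
int cache_push(struct cache *c, int value) {
    if (c->used >= c->size) {
        size_t nsize = c->size ? c->size * 2 : 1;
        int *nbase = realloc(c->base, nsize * sizeof(int));
        if (nbase == NULL)
            return -1;                /* old buffer stays valid */
        c->base = nbase;
        c->size = nsize;
    }
    c->base[c->used++] = value;
    return 0;
}
```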
#### 25. Overall

- Cycles: 930.929.061 (-87,233%)
- Instructions: 475.676.977 (-55,259%)
- Branch mispredictions: 2.384.999 (-79,070%)
- Time elapsed: 0,2830 s (-87,656%)
#### 26. Fin

Any questions?

Code available: https://github.com/grill/micro-optimisations