Introduction To Cache-Oblivious Algorithms
by Christopher Gilbert
http://www.twitter.com/bigdatadev
http://www.github.com/bigdatadev


http://www.cjgilbert.me
“A cache-oblivious algorithm is not oblivious to
cache memory”
(However, it is oblivious to the size of the cache)
“Cache-oblivious algorithms are
effective on any system, regardless
of memory hierarchy”
“Cache oblivious algorithms do not improve
complexity.”
(But they still improve performance)
Processor
Graphics
Core Core Core Core
Shared L3 Cache
Memory Controller I/O
L1 Cache
L2 Cache
L1 Cache
L2 Cache
L1 Cache
L2 Cache
L1 Cache
L2 Cache
System
Agent &
Memory
Controller
Register
L1 Cache
L2 Cache
L3 Cache
Memory
Disk
Latency Increases
Capacity Decreases
Tall Cache Assumption
Fully Associative
Perfect Eviction Policy
MT(N) = Θ(N/B)
Estimating Memory Transfers
// Plain 16-byte value pair consumed by the pairwise kernel below.
// On a typical 64-byte cache line four items fit per line, so linear
// scans over arrays of item_t exploit spatial locality — presumably the
// point of the example; line size is an assumption, not shown here.
struct item_t {
	 uint64_t x; // first coordinate
	 uint64_t y; // second coordinate
};
// Arithmetic match test on a pair of items: true when
// (i1->x + i2->y) * i2->x equals (i1->y + i2->x) * i2->y.
// All arithmetic is uint64_t, so overflow wraps (well-defined).
bool
predicate(item_t const* i1, item_t const* i2) {
	 uint64_t const lhs = (i1->x + i2->y) * i2->x;
	 uint64_t const rhs = (i1->y + i2->x) * i2->y;
	 return lhs == rhs;
}
// Serial brute-force pair count: returns how many pairs (a, b), with
// a in [begin1, end1) and b in [begin2, end2), satisfy predicate(a, b).
// Both ranges are streamed sequentially, so the access pattern is
// prefetcher-friendly; cost is O(|range1| * |range2|) predicate calls.
size_t
kernel(item_t const* begin1, item_t const* end1,
item_t const* begin2, item_t const* end2) {
	 size_t matches = 0;
	 for (item_t const* lhs = begin1; lhs != end1; ++lhs) {
		for (item_t const* rhs = begin2; rhs != end2; ++rhs) {
			matches += predicate(lhs, rhs) ? 1 : 0;
		}
	 }
	 return matches;
}
// Naive OpenMP parallelisation: each iteration compares one item
// (the half-open range [begin+i, begin+i+1)) against the entire array,
// summing kernel over every (i, j) pair in [0, count)^2.
// NOTE(review): the loop index is a signed int (cast from size_t),
// as classic OpenMP `parallel for` requires a signed induction
// variable — counts above INT_MAX would overflow; verify upstream.
size_t
simple_parallel(item_t const* begin, size_t count,
unsigned thread_count) {
	 size_t total = 0;
#pragma omp parallel for reduction(+:total) num_threads(thread_count)
	 for (int idx = 0; idx < (int)count; idx += 1)
		total += kernel(begin + idx, begin + idx + 1, begin, begin + count);
	 return total;
}
Memory Transfer Estimate
MT(N) = Θ(N²/B)
0
5000
10000
15000
20000
25000
30000
Number Of Threads
Time(ms)
Recursive Approach
// Cache-oblivious divide-and-conquer task (subclass of a TBB-style
// `task`). The (range1 x range2) pair space is recursively split into
// four quadrants until a subproblem is small enough to run the serial
// kernel; small tiles eventually fit in cache at every level of the
// hierarchy, whatever its (unknown) size.
// NOTE(review): the constructor and the _begin1/_count1/_begin2/
// _count2/_result member declarations are not shown in this excerpt —
// presumably elided on the slide; verify against the full listing.
class coba_task : public task {
public:
	 task* execute() {
		// Recurse while the combined tile exceeds the serial cutoff (256).
		if (_count1 + _count2 > 256) {
			// Four quadrants of the pair space: (lo1,lo2), (lo1,hi2),
			// (hi1,hi2), (hi1,lo2). The "count - count/2" forms keep odd
			// counts covered exactly.
			coba_task& a = *new(allocate_child()) coba_task(
				_begin1, _count1 / 2,
				_begin2, _count2 / 2
			);
			coba_task& b = *new(allocate_child()) coba_task(
				_begin1, _count1 / 2,
				_begin2 + _count2 / 2, _count2 - _count2 / 2
			);
			coba_task& c = *new(allocate_child()) coba_task(
				_begin1 + _count1 / 2, _count1 - _count1 / 2,
				_begin2 + _count2 / 2, _count2 - _count2 / 2
			);
			coba_task& d = *new(allocate_child()) coba_task(
				_begin1 + _count1 / 2, _count1 - _count1 / 2,
				_begin2, _count2 / 2
			);
			// 5 = 4 children + 1 for the wait itself (TBB ref-count convention).
			set_ref_count(5);
			spawn(b); spawn(c); spawn(d); spawn_and_wait_for_all(a);
			// Children are done once the wait returns; fold their counts.
			_result = a.result() + b.result() + c.result() + d.result();
		}
		else {
			// Base case: serial brute-force count over the small tile.
			_result = kernel(_begin1, _begin1 + _count1, _begin2, _begin2 + _count2);
		}
		return NULL;
	 }
};
// Cache-oblivious counterpart of simple_parallel: counts predicate
// matches over the full (count x count) pair space by recursive
// quadrant subdivision (see coba_task).
//
// Fix: the root task previously spanned only (first half x second
// half) — (N/2)*(N/2) pairs instead of the N*N that simple_parallel
// computes — and silently dropped the last element when count was odd.
// Rooting the recursion at the full ranges makes the result match
// simple_parallel's; coba_task's own split already handles odd counts.
//
// NOTE(review): thread_count is unused here (as in the original) —
// presumably the TBB scheduler picks the concurrency level; confirm.
size_t
recursive_parallel(item_t const* begin, size_t count,
unsigned thread_count) {
	 coba_task& a = *new(task::allocate_root()) coba_task(
		begin, count,
		begin, count
	 );
	 task::spawn_root_and_wait(a);
	 return a.result();
}
Revised Memory Transfer Estimate
MT(N) = Θ(N²/(CB))
0
5000
10000
15000
20000
25000
30000
Number Of Threads
Time(ms)
Exploit both spatial and temporal locality.
Use recursion.
Optimise your memory transfers.

Introduction to Cache-Oblivious Algorithms

  • 1.
    Introduction To Cache-Oblivious Algorithms by Christopher Gilbert http://www.twitter.com/bigdatadev http://www.github.com/bigdatadev   http://www.cjgilbert.me
  • 2.
    “A cache-oblivious algorithm is not oblivious to cache memory” (However, it is oblivious to the size of the cache)
  • 3.
    “Cache-oblivious algorithms are effective on any system, regardless of memory hierarchy”
  • 4.
    “Cache oblivious algorithms do not improve complexity.” (But they still improve performance)
  • 5.
    Processor Graphics Core Core Core Core Shared L3 Cache Memory Controller I/O L1 Cache L2 Cache L1 Cache L2 Cache L1 Cache L2 Cache L1 Cache L2 Cache System Agent & Memory Controller
  • 6.
    Register L1 Cache L2 Cache L3 Cache Memory Disk — Latency Increases, Capacity Decreases
  • 7.
  • 8.
  • 9.
  • 10.
    MT(N) = Θ(N/B) — Estimating Memory Transfers
  • 11.
    struct item_t { uint64_t x; uint64_t y; }; bool predicate(item_t const* i1, item_t const* i2) { return ((i1->x + i2->y) * i2->x == (i1->y + i2->x) * i2->y); } size_t kernel(item_t const* begin1, item_t const* end1, item_t const* begin2, item_t const* end2) { size_t count = 0; for (item_t const* pos1 = begin1; pos1 != end1; pos1++) { for (item_t const* pos2 = begin2; pos2 != end2; pos2++) { if (predicate(pos1, pos2)) count += 1; } } return count; } size_t simple_parallel(item_t const* begin, size_t count, unsigned thread_count) { size_t res = 0; #pragma omp parallel for reduction(+:res) num_threads(thread_count) for (int i = 0; i < (int)count; i += 1) res += kernel(begin + i, begin + i + 1, begin, begin + count); return res; }
  • 12.
  • 13.
  • 14.
  • 15.
    class coba_task :public task { public: task* execute() { if (_count1 + _count2 > 256) { coba_task& a = *new(allocate_child()) coba_task( _begin1, _count1 / 2, _begin2, _count2 / 2 ); coba_task& b = *new(allocate_child()) coba_task( _begin1, _count1 / 2, _begin2 + _count2 / 2, _count2 - _count2 / 2 ); coba_task& c = *new(allocate_child()) coba_task( _begin1 + _count1 / 2, _count1 - _count1 / 2, _begin2 + _count2 / 2, _count2 - _count2 / 2 ); coba_task& d = *new(allocate_child()) coba_task( _begin1 + _count1 / 2, _count1 - _count1 / 2, _begin2, _count2 / 2 ); set_ref_count(5); spawn(b); spawn(c); spawn(d); spawn_and_wait_for_all(a); _result = a.result() + b.result() + c.result() + d.result(); } else { _result = kernel(_begin1, _begin1 + _count1, _begin2, _begin2 + _count2); } return NULL; } }; size_t recursive_parallel(item_t const* begin, size_t count, unsigned thread_count) { coba_task& a = *new(task::allocate_root()) coba_task( begin, count / 2, begin + count / 2, count / 2 ); task::spawn_root_and_wait(a); return a.result(); }
  • 16.
    Revised Memory Transfer Estimate: MT(N) = Θ(N²/(CB))
  • 17.
  • 18.
    Exploit both spatial and temporal locality.
  • 19.
  • 20.