C++11 multithreading and SIMD Linux code


/*
 * multi.cpp
 *
 * Created on: 13 Mar 2014
 * Author: Russell John Childs.
 */
//=======================================================================================================
// COPYRIGHT NOTICE
// This code sample and the ideas embodied remain the property of Russell John Childs, PhD and have been
// distributed as a representative example of my use of C++11 features.
//=======================================================================================================
//====================================================
// File contains
// (1) Implementation of lock-based thread pool.
// (2) Implementation of lock-free "thread pool".
// (3) Implementation of SIMD, autovectorisation (N.B. This was compiled for an Intel i7-3720QM Ivy Bridge processor)
// Implementation of parallel search (using lock-based thread pool, lock-free event-based pool, SIMD parallel for):
// 1) Split sorted array into <num_threads> equal chunks
// 2) Assign each chunk to a thread.
// 3) Thread returns true iff chunk.begin() <= search_val <= chunk.end()
// 4) Replace array with chunk that returned true and return to step 1
// Complexity:
// t - number of threads ( > 1 )
// Binary = O(log_2(n/t)) = O(log_2(n)-log_2(t))
// Parallel = O(log_t(n))
// (A worked example of these pass counts appears after the listing.)
//
// Compiling this code sample (Linux Mint - g++ 4.8)
//
// Compiler options:
// g++ -O3 -fopenmp -mavx -m64 -g -Wall -c -fmessage-length=0 -fno-omit-frame-pointer
//     -ffast-math -ftree-vectorizer-verbose=3 -std=c++11 -I/opt/intel/vtune_amplifier_xe_2013/include
//     multithreading.cpp
//
// Linker options:
// g++ -fopenmp -L/opt/intel/vtune_amplifier_xe_2013/lib64/ -o "multithreading" $(OBJS) $(USER_OBJS) $(LIBS)
//     -lpthread -latomic -littnotify -ldl
//
//==============================================================
#include <thread>
#include <future>
#include <condition_variable>
#include <atomic>
#include <functional>
#include <deque>
#include <vector>
#include <set>
#include <iostream>
#include <string>
#include <sstream>
#include <cmath>
#include <algorithm>
#include <omp.h>
#include <immintrin.h>
//#include <cilk/cilk.h>

//Uncomment following #define if you do not have the Intel VTune Amplifier XE 2013 performance profiler.
#define INTEL_NO_ITTNOTIFY_API

//include VTune API header iff INTEL_NO_ITTNOTIFY_API is not #defined
#ifndef INTEL_NO_ITTNOTIFY_API
#include "ittnotify.h"
#endif

//Macro to add VTune API call iff INTEL_NO_ITTNOTIFY_API is not #defined
#ifdef INTEL_NO_ITTNOTIFY_API
#define VTUNE(STATEMENT)
#else
#define VTUNE(STATEMENT) STATEMENT
#endif

//Macro to add VTune API call iff INTEL_NO_ITTNOTIFY_API is not #defined
#ifdef INTEL_NO_ITTNOTIFY_API
#define VTUNE_TASK(DOMAIN, FUNC, STATEMENTS) STATEMENTS
#else
#define VTUNE_TASK(DOMAIN, FUNC, STATEMENTS) \
{ \
    auto domain = __itt_domain_create(DOMAIN); \
    __itt_task_begin(domain, __itt_null, __itt_null, __itt_string_handle_create(FUNC)); \
    STATEMENTS \
    __itt_task_end(domain); \
}
#endif

//=================================================================
// Class wrapper for std::packaged_task to make different signatures, e.g. int(void), float(int,int), ...
// storable in an STL container for the thread pool.
//===
// N.B. A simpler mechanism would be std::vector<std::function<void(void)>>; v[i] = std::packaged_task<Type(Type)>(type),
// since packaged_task has void operator()(void). However, there is a problem: std::function
// requires the command object to be copyable and packaged_task has move-only semantics.
// (A stripped-down sketch of this workaround appears after the listing.)
//==================================================================

//============================
// Primary template
//============================
template< typename Out = void, typename In = void >
struct MyPackagedTask
{
    virtual ~MyPackagedTask(void)
    {
    }
};

//============================
// Explicit specialization, acts as base class
//   MyPackagedTask<>& poly = *new MyPackagedTask<MyType(OtherType)>;
//   poly(); --> calls MyPackagedTask<MyType(OtherType)>::operator()
//============================
template<>
struct MyPackagedTask<>
{
    virtual ~MyPackagedTask(void)
    {
    }

    virtual void operator()(void)
    {
    }
};

std::mutex last_return_mutex;

//============================
// Specialization for function signature
//   MyPackagedTask<MyType(OtherType)>
//============================
template< typename Out, typename... In >
struct MyPackagedTask< Out(In...) > : public MyPackagedTask<>
{
    MyPackagedTask(std::function<Out(In...)> func, In... in) :
        m_task(std::bind(func, in...))
    {
    }

    virtual ~MyPackagedTask(void)
    {
    }

    MyPackagedTask(MyPackagedTask&& other) :
        m_task(std::move(other.m_task))
    {
    }

    void operator()(void)
    {
        m_task();
    }

    std::future<Out> get_future(void)
    {
        return m_task.get_future();
    }

private:
    std::packaged_task<Out(void)> m_task;
};

//======================================================================
// Simple thread pool class
// Places tasks onto a common queue
// Allocates a fixed number of threads which pop tasks.
// TODO: Load balancing, cache ping-pong (RFOs), VTune optimisation.
//====================================================================
class ThreadPool
{
public:
    ThreadPool(unsigned max_num_threads = 1U << 31) :
        m_done(false),                       //notice to threads to shut down
        m_print_shutdown_msg(true),          //print or not print shutdown msg
        m_max_num_threads(max_num_threads),  //maximum num threads allowed in pool
        m_num_threads(0),                    //num threads allocated by the pool
        m_processing(0),                     //tasks still running
        m_cancel(false)
    {
    }

    ~ThreadPool(void)
    {
        //Shut down threads iff user has not already called shutdown()
        if (!m_done)
        {
            shutdown();
        }
    }

    //=================
    // Push task onto pool
    //================
    template< typename Out, typename... In >
    std::future<Out> push(std::function<Out(In...)> func, In... in)
    {
        //Create task, store future
        MyPackagedTask<Out(In...)> task(func, in...);
        std::future<Out> ret_val = task.get_future();

        //lock task queue, push the new task onto the queue, notify threads waiting on empty queue, release lock
        if (m_cancel == false)
        {
            {
                std::unique_ptr<MyPackagedTask<>> ptr(new MyPackagedTask<Out(In...)>(std::move(task))); //Base* = &Derived for poly call
                std::lock_guard<std::mutex> lock(m_tasks); //lock queue
                m_pool.push_back(std::move(ptr));          //push task
            } //release lock
            m_condition_variable.notify_all();             //notify waiting threads

            //spawn a thread (async will prevent oversubscription) and store the thread future
            //(to check for thread termination at pool shutdown)
            if ((++m_num_threads <= m_max_num_threads))
            {
                std::unique_lock<std::mutex> lock(m_threads);
                m_thread_list.push_back(std::async(std::launch::async, &ThreadPool::run_tasks, this));
            }
        }

        //return packaged_task future so that caller can wait for result
        return ret_val;
    }

    //=================
    // get number of threads allocated
    //================
    unsigned get_num_threads(void)
    {
        std::unique_lock<std::mutex> lock(m_threads);
        return m_thread_list.size();
    }

    //=================
    // Cancel all tasks but keep threads alive (for reuse by next set of tasks during iteration). Not yet tested.
    //================
    void cancel_tasks(void)
    {
        m_cancel = true;
        while (m_processing != 0);
        {
            std::unique_lock<std::mutex> lock(m_tasks); //lock task queue
            m_pool.clear();
        }
        m_cancel = false;
    }

    //=================
    // Kill all threads and print out shutdown message (iff msg==true)
    //================
    void shutdown(bool msg = true)
    {
        m_print_shutdown_msg = msg;
        {
            if (m_print_shutdown_msg)
            {
                std::unique_lock<std::mutex> lock(m_shutdown);
                std::cout << std::endl
                    << "=================================================================" << std::endl
                    << "Shutting down threads: ";
            }
        }

        cancel_tasks();

        //Notify all threads of thread pool termination
        m_done = true;
        m_condition_variable.notify_all();

        //Loop over all threads and wait for them to terminate
        {
            std::unique_lock<std::mutex> lock(m_threads);
            for (auto& elem : m_thread_list)
            {
                while (!elem.valid());
                elem.get();
            }
        }

        //Clear thread queue
        {
            std::unique_lock<std::mutex> lock(m_threads);
            m_thread_list.clear();
        }

        //Print out shutdown message
        if (m_print_shutdown_msg)
        {
            std::unique_lock<std::mutex> lock(m_shutdown);
            std::cout << std::endl
                << "=================================================================" << std::endl;
        }
    }

private:
    //=================
    // Pop and run tasks in threads.
    //================
    void run_tasks(void)
    {
        //To avoid branch misprediction, use an array to store branch code instead of if-else
        //(see the sketch after the listing)
        std::unique_ptr<MyPackagedTask<>> func;
        std::function<void(void)> branch_true = [&]{ func = std::move(m_pool.front()); m_pool.pop_front(); };
        std::function<void(void)> branch_false = [&]{ func = std::unique_ptr<MyPackagedTask<>>(new MyPackagedTask<>); }; //NOP
        std::function<void(void)> switch_func[2]{ branch_false, branch_true };

        while (!m_done)
        {
            // Only wait if there are still tasks to be processed
            {
                bool empty;                                 //Status of task queue
                std::unique_lock<std::mutex> lock(m_tasks); //lock task queue
                m_condition_variable.wait_for(lock, std::chrono::nanoseconds(100),
                    [&]{ return !(empty = m_pool.empty()) || m_done; }); //wake up if queue is non-empty or on shutdown
                switch_func[!empty && !m_done]();           //only run non-NOP if queue not empty and not shutdown.
            }
            ++m_processing;
            (*func)();
            --m_processing;
        }

        //Print out shutdown msg
        if (m_done & m_print_shutdown_msg)
        {
            std::unique_lock<std::mutex> lock(m_shutdown);
            std::cout << std::this_thread::get_id() << " ";
        }
    }

    std::atomic<bool> m_done;
    std::atomic<bool> m_print_shutdown_msg;
    std::atomic<unsigned> m_max_num_threads;
    std::atomic<unsigned> m_num_threads;
    std::atomic<unsigned> m_processing;
    std::atomic<bool> m_cancel;
    std::deque< std::unique_ptr<MyPackagedTask<>> > m_pool;
    std::vector< std::future<void> > m_thread_list;
    std::mutex m_threads;
    std::mutex m_tasks;
    std::mutex m_shutdown;
    std::condition_variable m_condition_variable;
};

//=====================================
// Simple test class
// Creates a few tasks, pushes them onto the thread pool, gets results
//==================================================================
struct SimpleTest
{
    SimpleTest(void) try
    {
        std::cout << std::endl << "Simple Test......" << std::endl << std::endl;

        //Create thread pool
        ThreadPool thread_pool;
        //create a task
        std::thread::id f1_id;
        std::function< int(int, int) > f1 = [&](int i, int j)
        {
            f1_id = std::this_thread::get_id();
            std::this_thread::sleep_for(std::chrono::milliseconds(1000));
            return i*j;
        };

        //create another task
        std::thread::id f2_id;
        std::function< std::string(void) > f2 = [&](void)
        {
            f2_id = std::this_thread::get_id();
            std::this_thread::sleep_for(std::chrono::milliseconds(1000));
            return std::string("return value of f2");
        };

        //create another task
        std::thread::id f3_id;
        std::string f3_str;
        std::function< void(void) > f3 = [&](void)
        {
            f3_id = std::this_thread::get_id();
            std::this_thread::sleep_for(std::chrono::milliseconds(1000));
            f3_str = "f3 called";
        };

        //push tasks
        auto start = std::chrono::high_resolution_clock::now(); //start timer
        std::future<int> fut_1(std::move(thread_pool.push(f1, 10, 20)));
        std::future<std::string> fut_2 = thread_pool.push(f2);
        int fut_1_res = fut_1.get();
        std::string fut_2_res = fut_2.get();
        auto end = std::chrono::high_resolution_clock::now(); //stop timer

        //std::future<void> fut_3 = thread_pool.push(f3); //TODO - fix compilation error.
        //std::cout << typeid(decltype(thread_pool.push(f3))).name() << std::endl; // gives std::future<void>
        //std::future<void> test_fut;                          //compiles
        //std::future<void> test_fut1 = std::move(test_fut);   //compiles
        //thread_pool.push(f3);                                //doesn't compile
        //std::function< int(int) > f4 = [&](int i){ return ++i; }; //compiles
        //thread_pool.push(f4, 2);                             //compiles
        //std::function< void(int) > f4 = [&](int i){ ++i; };  //compiles
        //thread_pool.push(f4, 2);                             //doesn't compile

        //print num of threads running, thread id for tasks, result sent back by tasks
        std::cout << "num threads=" << thread_pool.get_num_threads() << std::endl;
        std::cout << "f1 thread id=" << f1_id << std::endl;
        std::cout << "f1's result: " << fut_1_res << std::endl;
        std::cout << "f2 thread id=" << f2_id << std::endl;
        std::cout << "f2's result: " << fut_2_res << std::endl;
        //std::cout << "f3 thread id=" << f3_id << std::endl;
        //std::cout << "f3's result: " << f3_str << std::endl;
        std::cout << "thread_pool time = "
            << std::chrono::duration_cast<std::chrono::nanoseconds>(end - start).count() << "ns" << std::endl;

        //cleanup threads
        //thread_pool.shutdown(); //test dtor
    }
    catch (...)
    {
        std::cout << "SimpleTest exception" << std::endl;
    }
};

//==============================================================
// Parallel vs binary search test class
// t - number of threads ( > 1)
// Binary = O(log_2(n/t)) = O(log_2(n)-log_2(t))
// Parallel = O(log_t(n))
//
// Binary = std::find (single threaded for comparison).
// Parallel: (1) Split array into equal chunks, push them onto the thread pool
//           (2) Chunk containing search-val returns true. N.B. predicate simply returns begin() <= val <= end().
//           (3) Chunk returning true replaces array and step (1) is repeated.
// N.B. Parallel search gets the insertion point of the nearest match rather than vector.end() if no match
// Not optimised for RFOs (cache ping-pong), load balancing, VTune or early match (binary is quicker if an early match is found).
// Benchmarks show high overhead of thread pool.
//===============================================================
struct ParallelSearch
{
    //Choose which to run
    bool is_lock_free = false;  //run lock-free lambda (if enabled, should set variable "factor", below, to 100 since the search is time consuming)
    bool is_lock_based = false; //run lock-based lambda (if enabled, should set variable "factor", below, to 100 since the search is time consuming)
    bool is_simd = true;        //run simd-based lambda

    //Choose number of threads
    //unsigned num_threads = 2; //To get thread overhead (parallel/binary = 1 for no overhead)
    //const unsigned num_threads = std::thread::hardware_concurrency()/2;     //undersubscription, should run slower than optimal
    const unsigned num_threads = std::thread::hardware_concurrency();         //Should be optimal choice
    //const unsigned num_threads = 2*std::thread::hardware_concurrency();     //moderate oversubscription, should run slower than optimal
    //const unsigned num_threads = 4*std::thread::hardware_concurrency();     //heavy oversubscription, should run slower than optimal
    //const unsigned num_threads = 128*std::thread::hardware_concurrency();   //massive oversubscription, should run slower than optimal

    ParallelSearch(void) try
    {
        std::atomic<bool> done(false); //flag used in lock-free search to notify of completion

        //Create large, sorted array on heap to avoid seg fault.
        const unsigned size = 2 << 24;
        std::vector<unsigned> my_array(size);
        for (auto& elem : my_array)
        {
            static unsigned i = 0;
            elem = 2 * i; //even numbers
            ++i;
        }

        //double-word atomic containing the address of a matching chunk and the new chunk length (size, size/t, size/t^2 ...)
        //(a lock-freedom check for this type appears after the listing)
        struct DoubleWord
        {
            unsigned* m_address;
            unsigned m_chunk_length;
        };
        std::atomic<DoubleWord> chunk_address_and_length(DoubleWord{ &my_array[0], size / num_threads });

        //val searched for (TODO: binary search faster than parallel search if binary finds early match. Need to terminate parallel search earlier)
        bool even = true;
        unsigned val = my_array[((size >> 1) + ((size >> 1) - 1)*rand() / RAND_MAX)] + (even ? 0 : 1); //even/odd number --> found/not found

        //Variables for found position, passes taken and whether to print out progress (incurs overhead)
        unsigned* ret_val = &my_array[0];
        int passes = 0; //int required by g++ autovectorize
        bool printout = false;

        //SIMD lambda (Proved to be quite difficult getting g++ to autovectorise)
        //(N.B. This was compiled for an Intel i7-3720QM Ivy Bridge processor)
        // 1. Split array into t chunks
        // 2. Allocate chunks to t SIMD lanes
        // 3. Each lane checks chunk.begin() <= search-val <= chunk.end()
        // 4. The SIMD lane getting a match sets array = chunk
        // 5. Steps 1 to 4 repeated until chunk is 1 element long.
        std::function<bool(void)> simd_search = [&]()
        {
            //Alignment (SSE - 16-byte SIMD register, AVX - 32-byte SIMD register)
            const unsigned alignment = 16; //g++ bug with 32-byte (http://gcc.gnu.org/bugzilla/show_bug.cgi?id=56787)

            //Pre-calculate chunk size (size/8, size/64, size/8^3 ... 1 element(s))
            alignas(alignment) int chunk_length[9]{ size >> 3, size >> 6, size >> 9, size >> 12, size >> 15,
                size >> 18, size >> 21, size >> 24, 1 };

            //Pre-calculate lower index for lower <= val <= upper. N.B. This is converted to lower[n]/8, lower[n]/64 ...
            alignas(alignment) int lower_index[8]{ 0, size, 2 * size, 3 * size, 4 * size, 5 * size, 6 * size, 7 * size };

            //Pre-calculate num of SIMD lanes to allocate for the loop to be vectorised
            alignas(alignment) int limits[9]{ 8, 8, 8, 8, 8, 8, 8, 8, 2 };

            //Running tally of start of chunk to be searched
            alignas(alignment) int offset = 0;
            alignas(alignment) int tmp_offset = 0;

            //Loop until chunk length is 1 element
            for (passes = 0; passes < 9; ++passes)
            {
                //Following lambda is a test to see if hotspots marked "LINE X" and "LINE Y", below, are due to memory stalls.
                //It turns out prefetch does eliminate hotspots X, Y, but adds overhead of its own, so this search algorithm is
                //unavoidably memory-bound unless something along the lines of a heap-ordered array (i.e. array is laid out as a
                //breadth-first n-ary tree) is used to convert random access to linear access without need for scatter-gather.
                //#pragma omp parallel for //Adds too much overhead
                [&]() //Sadly, won't vectorise due to function call
                {
                    unsigned *__restrict__ tmp_ret_val = (unsigned*)__builtin_assume_aligned(&my_array[0], alignment); //Requirement for autovectorise.
                    for (int pos = 0; pos < limits[passes]; ++pos) //Loop over SIMD lanes to prefetch data they will use
                    {
                        int tmp = pos*chunk_length[passes]; //Get lower index for chunk interval
                        __builtin_prefetch(&my_array[0] + offset + tmp);                            //See if it removes hotspot from "LINE X", below
                        __builtin_prefetch(&my_array[0] + offset + tmp + chunk_length[passes] - 1); //See if it removes hotspot from "LINE Y", below
                    }
                }();

                //Fork: Assign each chunk to an SIMD lane
                //N.B. Use lambda to force vectorisation of loop. Without it, the loop is unrolled but SLP is not vectorised. This does autovectorise under g++ 4.8
                //N.B. Code has been broken down into painfully simple steps to help the autovectoriser and pinpoint which operations are causing trouble
                //(an isolated example of these vectorisation hints appears after the listing)
                [&]()
                {
                    unsigned *__restrict__ tmp_ret_val = (unsigned*)__builtin_assume_aligned(&my_array[0], alignment); //Requirement for autovectorise.
                    alignas(alignment) int chunk = chunk_length[passes];
                    for (alignas(alignment) int pos = 0; pos < limits[passes]; ++pos) //Loop over SIMD lanes
                    {
                        //Find matching chunk by adding 0 to offset for no-match and chunk address for a match
                        alignas(alignment) int tmp = pos*chunk; //Lower index of chunk range without offset
                        //int tmp = lower_index[pos] >> 3; //g++ bug (http://gcc.gnu.org/bugzilla/show_bug.cgi?id=56787). Can't use 32-byte AVX.
                        alignas(alignment) int lower_ind = offset + tmp;          //Lower index of chunk range
                        alignas(alignment) int upper_ind = lower_ind + chunk - 1; //Upper index of chunk range
                        unsigned lower_val = tmp_ret_val[lower_ind]; //LINE X - Hotspot (eliminated by above prefetch)
                        unsigned upper_val = tmp_ret_val[upper_ind]; //LINE Y - Hotspot (eliminated by above prefetch)
                        alignas(alignment) bool test_lower = lower_val <= val;   //Lower
                        alignas(alignment) bool test_upper = val <= upper_val;   //and upper limit check
                        alignas(alignment) bool test = test_lower && test_upper; //is search-val inside chunk for this SIMD lane?
                        tmp_offset += test*tmp; //Horrible construct to get it to autovec. It masks out SIMD lanes that don't contain search val.

                        //Following fails because it is "not suitable for gather" (whatever that means)
                        //offset += ((tmp_ret_val[offset+tmp] <= val) & (val <= tmp_ret_val[offset+tmp+chunk_length[passes]-1]))*tmp;

                        //Following fails because of "control flow" (Can't see why g++ doesn't autovec it, control flow can be replaced with masked op)
                        //if((tmp_ret_val[offset+tmp] <= val) && (val <= tmp_ret_val[offset+tmp+chunk_length[passes]-1])) tmp_offset = tmp;
                    }
                }(); //Join: end of SIMD

                //Update chunk start address index
                offset = tmp_offset;

                /*std::cout << "offset=" << offset << std::endl;
                std::cout << "passes=" << passes;
                std::cout << ", val=" << val;
                std::cout << ", range=[" << array[offset] << "," << array[offset+1];
                std::cout << ", chunk length=" << chunk_length[passes] << std::endl; */
            }

            //Update final index of search-val
            ret_val = &my_array[0] + offset;
            return true;
        };

        //Lock-free lambda for each thread
        //Operation:
        //1. The array is split into t (num of threads) chunks
        //2. Each thread examines its chunk
        //3. If a match is found in a chunk, the thread changes the array to be that chunk.
        //4. The process is repeated from step 1.
        //t threads continuously monitor the array and process their chunk of the array. Since the array pointer is
        //atomic, when one thread sees a matching chunk and changes the array to be that chunk, this is picked up
        //by all threads. No synchronisation is needed.
        // arg chunk_pos - section of chunk to search (0 - [begin, begin+chunk_length/t], 1 - [begin+chunk_length/t, begin+2*chunk_length/t], ..)
        std::atomic<unsigned> running_threads(0);
        std::atomic<bool> go(false);
        std::function<bool(unsigned)> lock_free = [&](unsigned chunk_pos)
        {
            //Increment running thread count
            ++running_threads;

            //Keep all threads on hold until signalled to begin together (for timings).
            while (!go);

            //Keep searching until a thread notifies completion.
            while (!done)
            {
                //capture chunk address and length
                DoubleWord capture = chunk_address_and_length;

                //Check if search-val between chunk.begin() and chunk.end()
                unsigned *begin = capture.m_address + chunk_pos*capture.m_chunk_length;
                unsigned *end = begin + capture.m_chunk_length - 1;
                unsigned test1 = *begin, test2 = *end;
                if (*begin <= val && val <= *end)
                {
                    //Print out iterations (adds significant overhead)
                    static std::mutex printout_mutex;
                    if (printout)
                    {
                        std::unique_lock<std::mutex> lock(printout_mutex);
                        //Print out iterations (adds significant overhead)
                        std::cout << "Parallel find (pass " << passes << "): Closest match "
                            << *begin << "<=" << val << "<=" << *end
                            << ", chunk length=" << capture.m_chunk_length << std::endl;
                    }

                    //Update parent variables for printouts
                    ret_val = begin;
                    ++passes; //keep count of passes (0 - size, 1 - size/t, 2 - size/t^2 ... with t - num threads)

                    std::function<void(void)> branch_true = [&]() //IF
                    {
                        //Update chunk length and address
                        capture.m_chunk_length = (capture.m_chunk_length >= num_threads ?
                            (capture.m_chunk_length / num_threads) : 1); //divide chunk evenly
                        capture.m_address = begin;                       //point to this chunk
                        chunk_address_and_length = capture;
                    };
                    std::function<void(void)> branch_false = [&]() //ELSE
                    {
                        done = true; //notify parent and sister threads of completion
                    };
                    std::function<void(void)> if_else[2]{ branch_false, branch_true }; //{else, if}
                    if_else[capture.m_chunk_length > 1](); //if-else
                }
                else
                {
                    std::this_thread::sleep_for(std::chrono::nanoseconds(5000));
                }
            }
            return true;
        };

        //Create thread pool for lock-based search
        static ThreadPool thread_pool(num_threads);

        //Notification of completion of lock-based search
        std::condition_variable finished;

        //Lock-based lambda for each thread. It simply tests whether array[pos] <= search_val <= array[pos + chunk_length]
        //and iff true, spawns t threads to narrow down the search, iteratively arriving at the insertion point.
        std::function<bool(unsigned*, unsigned)> lock_based = [&](unsigned* tmp, unsigned chunk_length)
        {
            //Keep all threads on hold until signalled to begin together (for timings).
            while (!go);

            //Check if search-val between chunk.begin() and chunk.end()
            if (*tmp <= val && val <= *(tmp + chunk_length - 1))
            {
                //Print out iterations (adds significant overhead)
                if (printout)
                {
                    std::cout << "Parallel find (pass " << passes << "): Closest match "
                        << *tmp << "<=" << val << "<="
                        << (chunk_length > 1 ? *(tmp + chunk_length - 1) : *tmp == val ? val : *(tmp + 1))
                        << ", chunk length=" << chunk_length << std::endl;
                }

                //Update parent variables for printouts
                ret_val = tmp;
                ++passes; //keep count of passes (0 - size, 1 - size/t, 2 - size/t^2 ... with t - num threads)
                //Spawn new tasks to process this chunk
                //Following peculiar construct is to avoid branch misprediction by using an array of fn ptrs to replace if-else
                //need VTune to test out whether it saves us any mispredictions.
                std::function<void(void)> branch_true = [&]() //IF
                {
                    chunk_length = (chunk_length >= num_threads ? (chunk_length / num_threads) : 1); //divide chunk evenly
                    for (unsigned index = 0; index < num_threads; ++index)
                    {
                        thread_pool.push(lock_based, tmp + index*chunk_length, chunk_length);
                    }
                };
                std::function<void(void)> branch_false = [&]() //ELSE
                {
                    finished.notify_one(); //chunk length is 1, so we are finished dividing-and-conquering
                };
                std::function<void(void)> if_else[2]{ branch_false, branch_true }; //{else, if}
                if_else[chunk_length > 1](); //if-else
            }
            return true;
        };

        std::cout << std::endl << "Parallel vs Binary Search......" << std::endl << std::endl;

        //Obtain position of element (to verify parallel search finds correct position).
        auto pos = std::find(my_array.begin(), my_array.end(), val);

        //Ordinary binary search for timing comparison
        std::cout << std::endl << "==============================================================================" << std::endl;
        std::cout << "Running binary search, please wait a few minutes ..." << std::endl;
        std::cout << "==============================================================================" << std::endl;
        unsigned factor = 10000; //number of times to run search
        auto start_binary = std::chrono::high_resolution_clock::now(); //start timer
        VTUNE(__itt_resume();)
        for (unsigned i = 0; i < factor; ++i) std::binary_search(my_array.begin(), my_array.end(), val); //binary search
        VTUNE(__itt_pause();)
        auto end_binary = std::chrono::high_resolution_clock::now(); //stop timer

        //print out results of binary search
        using std::chrono::duration_cast;
        using std::chrono::nanoseconds;
        std::cout << "clock resolution is: "
            << static_cast<double>(std::chrono::high_resolution_clock::period::num) << " ns" << std::endl;
        std::cout << "std::find: val=" << val << ", element=" << (pos != my_array.end() ? *pos : -1)
            << ", index=" << pos - my_array.begin() << ", found==" << std::boolalpha << (pos != my_array.end())
            << ", time=" << duration_cast<nanoseconds>(end_binary - start_binary).count() << "ns" << std::endl;

        //Parallel searches
        //SIMD search
        if (is_simd)
        {
            std::cout << std::endl << "==============================================================================" << std::endl;
            std::cout << "Running simd parallel search, please wait a few minutes ..." << std::endl;
            std::cout << "==============================================================================" << std::endl;

            //Kick off the search
            auto start = std::chrono::high_resolution_clock::now(); //start timer
            VTUNE(__itt_resume();)
            VTUNE_TASK("Parallel Search", "simd_search()",
                for (unsigned i = 0; i < factor; i++) simd_search();
            )
            //Wait for result and then get the insertion point and number of passes
            VTUNE(__itt_pause();)

            //get execution time
            auto end = std::chrono::high_resolution_clock::now(); //stop timer
            auto parallel_time = std::chrono::duration_cast<nanoseconds>(end - start).count();

            //print results
            double complexity = (passes + 1) / (std::log(size) / std::log(2));
            auto binary_time = duration_cast<nanoseconds>(end_binary - start_binary).count();
            std::cout << "Simd results:" << std::endl;
            std::cout << "Size of array=" << size / 1000000 << " million elements" << std::endl;
            std::cout << "Search repeated " << factor << " times" << std::endl;
            std::cout << "number of threads=" << running_threads << std::endl;
            std::cout << "val=" << val << ", [element_lower,element_upper]=[" << ret_val[0] << "," << ret_val[1] << "]";
            std::cout << ", [index_lower,index_upper]=[" << ret_val - &my_array[0] << "," << ret_val + 1 - &my_array[0] << "]" << std::endl;
            std::cout << "O(n_parallel)/O(n_binary)=" << complexity << std::endl;
            std::cout << "Time(parallel) / Time(binary) = " << parallel_time << "ns" << "/" << binary_time << "ns";
            std::cout << " = ";
            std::cout << parallel_time / binary_time << std::endl;
            std::cout << "threading overhead =" << parallel_time - complexity*binary_time << "ns" << std::endl;
        }

        //Lock-free multithreaded search
        if (is_lock_free)
        {
            std::cout << std::endl << "==============================================================================" << std::endl;
            std::cout << "Running lock-free parallel search, please wait a few minutes ..." << std::endl;
            std::cout << "==============================================================================" << std::endl;

            double parallel_time = 0;
            for (unsigned i = 0; i < factor; ++i)
            {
                //reset passes counter, chunk struct, done flag
                passes = 0;
                chunk_address_and_length = DoubleWord{ &my_array[0], size / num_threads };
                done = false;

                //Kick off the search
                //auto start = std::chrono::high_resolution_clock::now(); //start timer
                std::vector<std::future<bool>> futures;
                go = false;
                futures.push_back(std::move(std::async(std::launch::deferred, lock_free, 0)));
                for (unsigned chunk_pos = 1; chunk_pos < num_threads; ++chunk_pos)
                {
                    futures.push_back(std::move(std::async(std::launch::async, lock_free, chunk_pos)));
                }

                //Wait for result and then get the insertion point and number of passes
                auto start = std::chrono::high_resolution_clock::now(); //start timer
                VTUNE(__itt_resume();)
                VTUNE_TASK("Parallel Search", "lock_free()",
                    go = true;
                    futures[0].get();
                )
                VTUNE(__itt_pause();)

                //get execution time
                auto end = std::chrono::high_resolution_clock::now(); //stop timer
                parallel_time += std::chrono::duration_cast<nanoseconds>(end - start).count();
            }

            //print results
            double complexity = (passes + 1) / (std::log(size) / std::log(2));
            auto binary_time = duration_cast<nanoseconds>(end_binary - start_binary).count();
            std::cout << "Lock free results:" << std::endl
                << "Size of array=" << size / 1000000 << " million elements" << std::endl
                << "Search repeated " << factor << " times" << std::endl
                << "number of threads=" << running_threads << std::endl
                << "val=" << val << ", [element_lower,element_upper]=[" << ret_val[0] << "," << ret_val[1] << "]"
                << ", [index_lower,index_upper]=[" << ret_val - &my_array[0] << "," << ret_val + 1 - &my_array[0] << "]" << std::endl
                << "O(n_parallel)/O(n_binary)=" << complexity << std::endl
                << "Time(parallel) / Time(binary) = " << parallel_time << "ns" << "/" << binary_time << "ns" << " = "
                << parallel_time / binary_time << std::endl
                << "threading overhead =" << parallel_time - complexity*binary_time << "ns" << std::endl;
        }

        //Lock-based multithreaded search
        if (is_lock_based)
        {
            std::cout << std::endl << "==============================================================================" << std::endl;
            std::cout << "Running lock-based parallel search, please wait a few minutes ..." << std::endl;
            std::cout << "==============================================================================" << std::endl;

            double parallel_time = 0;
            for (unsigned i = 0; i < factor; ++i)
            {
                //reset passes counter, chunk struct, done flag
                passes = 0;

                //Kick off the search
                //auto start = std::chrono::high_resolution_clock::now(); //start timer
                go = false;
                auto f = thread_pool.push(lock_based, &my_array[0], size);

                //Wait for result and then get the insertion point and number of passes
                {
                    //wait for completion
                    std::mutex dummy;
                    std::unique_lock<std::mutex> lock(dummy);
                    auto start = std::chrono::high_resolution_clock::now(); //start timer
                    VTUNE(__itt_resume();)
                    VTUNE_TASK("Parallel Search", "lock_based()",
                        go = true;
                        finished.wait(lock);
                    )
                    VTUNE(__itt_pause();)

                    //get execution time
                    auto end = std::chrono::high_resolution_clock::now(); //stop timer
                    parallel_time += std::chrono::duration_cast<nanoseconds>(end - start).count();
                    thread_pool.cancel_tasks();
                    std::this_thread::sleep_for(std::chrono::milliseconds(10));
                }
            }

            //kill thread pool
            thread_pool.shutdown(false);

            //print results
            double complexity = (passes + 1) / (std::log(size) / std::log(2));
            auto binary_time = duration_cast<nanoseconds>(end_binary - start_binary).count();
            std::cout << "Lock based results:" << std::endl
                << "Size of array=" << size / 1000000 << " million elements" << std::endl
                << "Search repeated " << factor << " times" << std::endl
                << "number of threads=" << running_threads << std::endl
                << "val=" << val << ", [element_lower,element_upper]=[" << ret_val[0] << "," << ret_val[1] << "]"
                << ", [index_lower,index_upper]=[" << ret_val - &my_array[0] << "," << ret_val + 1 - &my_array[0] << "]" << std::endl
                << "O(n_parallel)/O(n_binary)=" << complexity << std::endl
                << "Time(parallel) / Time(binary) = " << parallel_time << "ns" << "/" << binary_time << "ns" << " = "
                << parallel_time / binary_time << std::endl
                << "threading overhead =" << parallel_time - complexity*binary_time << "ns" << std::endl;
        }
    }
    catch (...)
    {
        std::cout << "ParallelSearch exception" << std::endl;
    }
};

int main(void)
{
    //SimpleTest simple_test;
    VTUNE(__itt_pause();)
    ParallelSearch parallel_search;

    char c;
    std::cout << "Press any key to exit" << std::endl;
    std::cin >> c; //keep console alive
}
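
The header comment claims roughly log_2(n) comparisons for binary search against log_t(n) passes for the parallel search. The following is a minimal sketch (not part of the listing above) that plugs in the array size actually used by ParallelSearch (n = 2 << 24) and t = 8 lanes per pass; the file name and the choice t = 8 are assumptions made here for illustration.

// complexity_sketch.cpp - pass counts behind the header comment's
// Binary = O(log_2(n)) vs Parallel = O(log_t(n)) claim.
#include <cmath>
#include <iostream>

int main()
{
    const double n = 2u << 24; // array size used in ParallelSearch (33,554,432 elements)
    const double t = 8;        // threads / SIMD lanes per pass (assumed here)

    const double binary_steps   = std::log2(n);              // ~25 comparisons
    const double parallel_steps = std::log(n) / std::log(t); // ~8.3 passes

    std::cout << "binary   ~ " << std::ceil(binary_steps)   << " steps\n";  // 25
    std::cout << "parallel ~ " << std::ceil(parallel_steps) << " passes\n"; // 9, matching the 9 hard-coded SIMD passes
}

Note that the constant factor per parallel pass (thread wake-up, cache traffic) is exactly the overhead the listing's benchmarks measure; the asymptotic win only materialises when that overhead is small relative to a pass.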
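MyPackagedTask exists because std::packaged_task is move-only and therefore cannot be stored in a std::function, which copies its target. The sketch below strips that workaround to its essentials: a type-erased base invoked through a virtual call, with the move-only task held by unique_ptr. The names TaskBase and Task are hypothetical and not taken from the listing; it is a sketch of the idea, not the listing's class.

// type_erasure_sketch.cpp - storing move-only packaged_tasks behind a virtual interface.
#include <deque>
#include <future>
#include <iostream>
#include <memory>

struct TaskBase
{
    virtual ~TaskBase() {}
    virtual void operator()() = 0;
};

template<typename Out>
struct Task : TaskBase
{
    explicit Task(std::packaged_task<Out()> task) : m_task(std::move(task)) {}
    void operator()() override { m_task(); }                      // runs the task, fulfilling its future
    std::future<Out> get_future() { return m_task.get_future(); }

private:
    std::packaged_task<Out()> m_task;                             // move-only member
};

int main()
{
    std::deque<std::unique_ptr<TaskBase>> queue;                  // what ThreadPool::m_pool holds, in spirit

    std::unique_ptr<Task<int>> task(new Task<int>(std::packaged_task<int()>([]{ return 6 * 7; })));
    std::future<int> result = task->get_future();
    queue.push_back(std::move(task));                             // could not be stored in a std::function

    (*queue.front())();                                           // a pool worker would pop and invoke this
    std::cout << result.get() << std::endl;                       // prints 42
}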
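run_tasks(), lock_free and lock_based all replace if-else with a two-element array of std::function indexed by the condition, hoping to trade a conditional branch for an indirect call. A minimal sketch of just that pattern follows; the names and the loop condition are illustrative only, not from the listing.

// branch_array_sketch.cpp - the "array instead of if-else" construct used in the listing.
#include <functional>
#include <iostream>

int main()
{
    int hits = 0, misses = 0;

    std::function<void(void)> branch_false = [&]{ ++misses; };          // ELSE
    std::function<void(void)> branch_true  = [&]{ ++hits;   };          // IF
    std::function<void(void)> if_else[2]{ branch_false, branch_true };  // {else, if}

    for (int i = 0; i < 10; ++i)
        if_else[i % 3 == 0]();       // bool condition converts to index 0 or 1

    std::cout << hits << " hits, " << misses << " misses" << std::endl; // 4 hits, 6 misses
}

Whether this actually saves mispredictions is open: the indirect call through std::function must itself be predicted, which is why the listing's own comment defers the question to VTune measurement.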
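The lock-free search publishes a pointer-plus-length pair (DoubleWord) through a single std::atomic store so that all threads pick up the narrowed chunk without locks. Whether std::atomic of a 16-byte struct is genuinely lock-free depends on the target and compiler (hence -latomic on the original link line); the sketch below, assuming the same DoubleWord layout as the listing, checks this at run time.

// doubleword_sketch.cpp - is the pointer+length atomic actually lock-free on this target?
// Build roughly as in the listing: g++ -std=c++11 doubleword_sketch.cpp -latomic
#include <atomic>
#include <iostream>

struct DoubleWord
{
    unsigned* m_address;      // start of the matching chunk
    unsigned  m_chunk_length; // its length (size, size/t, size/t^2, ...)
};

int main()
{
    unsigned data[8] = { 0, 2, 4, 6, 8, 10, 12, 14 };
    std::atomic<DoubleWord> chunk(DoubleWord{ data, 8 });

    std::cout << "atomic<DoubleWord> is lock-free: " << std::boolalpha
              << chunk.is_lock_free() << std::endl;   // false => operations go through an internal lock

    // Publishing a narrower chunk, as a matching thread does in lock_free:
    DoubleWord next{ data + 4, 2 };
    chunk.store(next);
    DoubleWord seen = chunk.load();
    std::cout << "chunk length now " << seen.m_chunk_length << std::endl; // 2
}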
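simd_search leans on three GCC-specific hints to coax the autovectoriser: alignas storage, __restrict__ pointers and __builtin_assume_aligned. The isolated loop below exercises the same hints on a trivially vectorisable body so the vectoriser report can be inspected without the rest of the listing; the file name, array size and build line are assumptions, and whether the loop vectorises still depends on the g++ version and flags.

// autovec_sketch.cpp - the alignment/aliasing hints used before the SIMD-lane loop.
// Build (GCC): g++ -O3 -mavx -std=c++11 -ftree-vectorizer-verbose=3 autovec_sketch.cpp
#include <iostream>

int main()
{
    const int n = 1024;
    alignas(16) static unsigned a[n];
    alignas(16) static unsigned b[n];
    alignas(16) static unsigned c[n];

    for (int i = 0; i < n; ++i) { a[i] = i; b[i] = 2 * i; }

    // Tell the compiler the pointers do not alias and are 16-byte aligned,
    // the same preconditions the listing asserts via __builtin_assume_aligned.
    unsigned* __restrict__ pa = (unsigned*)__builtin_assume_aligned(a, 16);
    unsigned* __restrict__ pb = (unsigned*)__builtin_assume_aligned(b, 16);
    unsigned* __restrict__ pc = (unsigned*)__builtin_assume_aligned(c, 16);

    for (int i = 0; i < n; ++i)                   // simple, branch-free body
        pc[i] = pa[i] + pb[i];

    std::cout << pc[n - 1] << std::endl;          // 3069
}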
