Successfully reported this slideshow.
We use your LinkedIn profile and activity data to personalize ads and to show you more relevant ads. You can change your ad preferences anytime.

Евгений Крутько, Многопоточные вычисления, современный подход.

484 views

Published on

Многопоточные вычисления, современный подход.

Published in: Software
  • Be the first to comment

  • Be the first to like this

Евгений Крутько, Многопоточные вычисления, современный подход.

  1. 1. C++ User Group Russia, Ekaterinburg, 25.11.2016 Многопоточные вычисления, современный подход Крутько Е.С. НИЦ «Курчатовский институт» e.s.krutko@gmail.com
  2. 2. Цель доклада Современный стандарт C++ позволяет делать многопоточные вычисления легко и удобно. Если приложение или библиотека тратит меньше времени на работу – это хорошо Исходники тестов: https://github.com/eskrut/multithread.git 2
  3. 3. Немного истории. Нативные потоки 3 #include <stdlib.h> #include <pthread.h> #include <assert.h> void* someParallelTask(void *arg) { int *preciousValue = reinterpret_cast<int*>(arg); *preciousValue = 1; } int main(int argc, char**argv) { pthread_t thread; int value, othreValue; pthread_create(&thread, nullptr, someParallelTask, &value); someParallelTask(&othreValue); pthread_join(thread, nullptr); assert(value == othreValue); return 0; } #include <stdlib.h> #include <windows.h> #include <assert.h> DWORD WINAPI someParallelTask(void *arg) { int *preciousValue = reinterpret_cast<int*>(arg); *preciousValue = 1; } int main(int argc, char**argv) { HANDLE thread; int value, othreValue; thread = CreateThread( NULL, 0, someParallelTask, &value, 0, NULL ); someParallelTask(&othreValue); WaitForSingleObject(thread, INFINITE); assert(value == othreValue);
  4. 4. Стандартные потоки. с++11 4 #include <stdlib.h> #include <thread> #include <assert.h> void someParallelTask(int &value) { value = 1; } int main(int argc, char**argv) { std::thread thread; int value, othreValue; thread = std::thread( someParallelTask, std::ref(value)); someParallelTask(othreValue); thread.join(); assert(value == othreValue); return 0; }
  5. 5. Threadpool. с++11 5 #include <stdlib.h> #include <assert.h> //git submodule add https://github.com/progschj/ThreadPool.git #include "ThreadPool/ThreadPool.h" int someExample(); int someExampleParallel(); int main(int argc, char**argv) { someExample(); someExampleParallel(); return 0; }
  6. 6. Threadpool. с++11 6 int someExample() { int value1; //code to evaluate value1 //may require significant amount of time value1 = 3; int value2; //code to evaluate value2 //may require significant amount of time value2 = 3; //Now use some fancy algorythm using values int result = value1 + value2; assert(result == 6); return result; }
  7. 7. Threadpool. с++11 7 int someExampleParallel() { ThreadPool pool(8); auto futureValue1 = pool.enqueue([](){ int value1; //code to evaluate value1 //may require significant amount of time value1 = 3; return value1; }); auto futureValue2 = pool.enqueue([](){ int value2; //code to evaluate value2 //may require significant amount of time value2 = 3; return value2; }); //Now use some fancy algorythm using values int result = futureValue1.get() + futureValue2.get(); assert(result == 6); return result; }
  8. 8. std::async. с++11 8 int someExampleParallel() { auto futureValue1 = std::async([](){ int value1; //code to evaluate value1 //may require significant amount of time value1 = 3; return value1; }); auto futureValue2 = std::async([](){ int value2; //code to evaluate value2 //may require significant amount of time value2 = 3; return value2; }); //Now use some fancy algorythm using values int result = futureValue1.get() + futureValue2.get(); assert(result == 6); return result; }
  9. 9. Пример из жизни 9 void PhotoSortModel::fill(const QString &path) { unsigned numRows = invisibleRootItem()->rowCount(); auto read = [this,path](int id, int start, int stop){ for(int row = start; row < stop; ++row) { auto photo = photoItem(row); readDown(photo, path); QMetaObject::invokeMethod(this, "partialDone", Qt::DirectConnection, Q_ARG(int, id), Q_ARG(int, row-start)); } return 0; }; read(0, 0, numRows); emit(loaded()); for(unsigned row = 0; row < numRows; ++row) itemChanged(photoItem(row)); }
  10. 10. Пример из жизни 10 // read(0, 0, numRows); doneMap_.clear(); doneMap_[-1] = numRows; std::list<std::future<int>> futures; unsigned numThreads = std::thread::hardware_concurrency(); for(unsigned ct = 0; ct < numThreads; ++ct) { doneMap_[ct] = 0; unsigned start = (ct*numRows)/numThreads; unsigned stop = ((ct+1)*numRows)/numThreads; if( (ct + 1) == numThreads && stop > numRows ) stop = numRows; futures.push_back(std::async(read, ct, start, stop)); } for(auto &f : futures) f.get(); Последовательный код Параллельный 365.236 64.5399
  11. 11. Неприятности с параллельностью 11 int dataRaceTarget = 0; int numCycles = 1000; auto taskP = [](int volatile &dataRaceTarget, int numCycles){ for(size_t ct = 0; ct < numCycles; ++ct) dataRaceTarget++; return 0; }; auto taskM = [](int volatile &dataRaceTarget, int numCycles){ for(size_t ct = 0; ct < numCycles; ++ct) dataRaceTarget--; return 0; };
  12. 12. Неприятности с параллельностью 12 std::list<std::thread> threads; for(size_t ct = 0; ct < std::thread::hardware_concurrency(); ++ct) { if(ct % 2) threads.push_back(std::thread(std::bind( taskP, std::ref(dataRaceTarget), numCycles))); else threads.push_back(std::thread(std::bind( taskM, std::ref(dataRaceTarget), numCycles))); } for(auto &t : threads) { t.join(); } std::cout << "result: " << dataRaceTarget << std::endl;
  13. 13. Поиск ошибок Инструменты поиска несинхронного доступа к памяти: •valgrind •clang/gcc (linux only [пока]) 13
  14. 14. valgrind valgrind --num-callers=1 --tool=helgrind ./datarace ==77960== Possible data race during read of size 4 at 0x10480810C by thread #3 ==77960== Locks held: none ==77960== at 0x100002D2C: main::$_0::operator()(int volatile&, int) const (datarace.cpp:15) ==77960== ==77960== This conflicts with a previous write of size 4 by thread #2 ==77960== Locks held: none ==77960== at 0x100003634: main::$_1::operator()(int volatile&, int) const (datarace.cpp:21) ==77960== Address 0x10480810c is on thread #1's stack 14 dataRaceTarget++;
  15. 15. Thread Sanitizer (clang/gcc) option(TUNE_THREAD_SANITIZER "Perform thread error sanitizing" OFF) if(TUNE_THREAD_SANITIZER) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsanitize=thread -fPIC -fPIE" ) set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -fsanitize=thread -pie") endif() Thread T2 (tid=8175, running) created by main thread at: #0 pthread_create <null>:0 (libtsan.so.0+0x000000047f23) #1 std::thread::_M_start_thread(std::shared_ptr<std::thread::_Impl_base>) <null>:0 (libstdc++.so.6+0x0000000b6a90) #2 main /media/psf/Home/Google Drive/work/doc/meetingCpp/2016Ekb/code/parallelTests/datarace/datarace.cpp:30 (datarace+0x000000005d8b) 15 threads.push_back(std::thread(std::bind( taskP, std::ref(dataRaceTarget), numCycles)));
  16. 16. Параллельная [стандартная] библиотека / на примере gcc 16 const size_t length = 0.1 /*Gb*/ * 1024ull /*Mb*/ * 1024 /*Kb*/ * 1024 /*b*/ / sizeof(size_t) /*count*/; std::vector<size_t> vecOrigin; vecOrigin.reserve(length); for(size_t ct = 0; ct < length; ++ct) vecOrigin.push_back(ct); auto vecToSort = vecOrigin; std::shuffle(vecToSort.begin(), vecToSort.end(), std::default_random_engine( hr_clock::now().time_since_epoch().count() ) ); auto vecToSort2 = vecToSort;
  17. 17. Параллельная [стандартная] библиотека / на примере gcc 17 size_t max; sw.start(); max = *std::max_element( vecToSort.begin(), vecToSort.end()); std::cout << sw.stop() << std::endl;
  18. 18. Параллельная [стандартная] библиотека / на примере gcc 18 size_t max; sw.start(); max = *std::max_element( vecToSort.begin(), vecToSort.end()); std::cout << sw.stop() << std::endl; size_t max2; sw.start(); max2 = *std::__parallel::max_element( vecToSort2.begin(), vecToSort2.end()); std::cout << sw.stop() << std::endl; if(max != length-1) throw std::runtime_error("Cant evaluete max with sequential max_element"); if(max2 != length-1) throw std::runtime_error("Cant evaluete max with parallel max_element");
  19. 19. Параллельная [стандартная] библиотека / на примере gcc 19 sw.start(); std::sort(vecToSort.begin(), vecToSort.end()); std::cout << sw.stop() << std::endl; sw.start(); std::__parallel::sort(vecToSort2.begin(), vecToSort2.end()); std::cout << sw.stop() << std::endl;
  20. 20. Параллельная [стандартная] библиотека / на примере gcc 20 const size_t lengthData = 25000; const size_t lengthVector = 6 /*Gb*/ * 1024ull /*Mb*/ * 1024 /*Kb*/ * 1024 /*b*/ / sizeof(size_t) /*count*/ / lengthData; std::vector<std::vector<size_t>> dataBundle; dataBundle.resize(lengthData, std::vector<size_t>(lengthVector, 0)); auto gen = std::default_random_engine( hr_clock::now().time_since_epoch().count() ); for(auto &vec : dataBundle) { for(auto &value : vec) { value = gen(); } }
  21. 21. Параллельная [стандартная] библиотека / на примере gcc 21 std::vector<size_t> maxes(lengthData, 0); sw.start(); std::for_each( dataBundle.cbegin(), dataBundle.cend(), [&dataBundle, &maxes](const std::vector<size_t> &vec) { size_t index = &vec - dataBundle.data(); maxes[index] = *std::max_element(vec.begin(), vec.end()); } ); std::cout << sw.stop() << std::endl; std::vector<size_t> maxes2(lengthData, 0); sw.start(); std::__parallel::for_each(dataBundle.cbegin(), dataBundle.cend(), [&dataBundle, &maxes2](const std::vector<size_t> &vec) { size_t index = &vec - dataBundle.data(); maxes2[index] = *std::max_element(vec.begin(), vec.end()); } ); std::cout << sw.stop() << std::endl;
  22. 22. Параллельная [стандартная] библиотека / на примере gcc 22 Результаты замены std на std::__parallel std std::__parallel max_element 0.015215 0.006585 sort 1.19395 0.264197 for_each 0.970016 0.356471
  23. 23. А если нужна большая гибкость? boost::thread 23 #include <stdlib.h> #include "boost/thread.hpp" #include <assert.h> void someParallelTask() { while(true) { //forewer cycle boost::this_thread::disable_interruption di; //alloc some resources to work //do not interrupt me boost::this_thread::restore_interruption ri(di); //ok check if I should die ( boost::this_thread::interruption_point(); } } int main(int argc, char**argv) { auto thread = boost::thread( someParallelTask ); //do some jod //And now I do not want to wait thread thread.interrupt(); thread.join(); return 0; }
  24. 24. IMHO. Самая простая параллельность. OpenMP 24 int someExample() { int value1; //code to evaluate value1 //may require significant amount of time value1 = 3; int value2; //code to evaluate value2 //may require significant amount of time value2 = 3; //Now use some fancy algorythm using values int result = value1 + value2; assert(result == 6); return result; } #include <omp.h> int someExampleParallel() { int value1; int value2; #pragma omp parallel sections { #pragma omp section { //code to evaluate value1 //may require significant amount of time value1 = 3; } #pragma omp section { //code to evaluate value2 //may require significant amount of time value2 = 3; } }//here we wait for all blocks //Now use some fancy algorythm using values int result = value1 + value2; assert(result == 6); return result; }
  25. 25. Итог При достаточном уровне понимания сути алгоритма работы программы с использованием современного C++ легко внедрять параллельную обработку данных. И этим надо пользоваться ) Я предпочитаю: std::async и std::list<std::future<T>> boost::thread (если нужен менеджер потоков) OpenMP 25
  26. 26. Крутько Евгений e.s.krutko@gmail.com google.com/+ЕвгенийКрутько 26

×