SlideShare a Scribd company logo
1 of 42
Download to read offline
GPU
Programming
on CPUs
Using C++AMP
Miller Lee
Outline
1. Introduction to C++AMP
2. Introduction to Tiling
3. tile_static
4. barrier.wait and solutions
a. C++11 thread
b. setjmp/longjmp
c. ucontext
2
(Homogeneous coordinates)
(0, 0) (0, 1) (0, 2) (0, 3)
(1, 0) (1, 1) (1, 2) (1, 3)
(2, 0) (2, 1) (2, 2) (2, 3)
(3, 0) (3, 1) (3, 2) (3, 3)
X
0
1
2
3
Matrix A b
=
0
1
2
3
result
Computing example
● Simple matrix multiplication
3
C++ Version
1. int A[4][4];
2. int b[4];
3. int result[4];
4. for (int i = 0; i < 4; i++) {
5. result[i] = 0;
6. for (int j = 0; j < 4; j++)
7. result[i] += A[i][j] * b[j];
8. } 4
C++AMP Version
1. array_view<float, 2> A(4, 4);
2. array_view<float, 1> b(4);
3. array_view<float, 1> result(4);
4. extent<1> ext(4);
5. parallel_for_each(ext, [&](index<1> idx) restrict(amp)
6. {
7. result[idx[0]] = 0;
8. for (int i = 0; i < 4; i++)
9. result[idx[0]] += A(idx[0], i) * b(i);
10. });
5
memory access
0 1 2 3
P0 P1 P2 P3
global memory
b
100t
Total access time = 400t 6
shared memory
0 1 2 3
shared memory
10t
100t
Total access time = 130t
b
7
1. array_view<float, 2> A(4, 4);
2. array_view<float, 1> b(4);
3. array_view<float, 1> result(4);
4. extent<1> ext(4);
5. parallel_for_each(ext.tile<4>(), [&](tiled_index<4> tidx)
restrict(amp)
6. {
7. int local = tidx.local[0];
8. int global = tidx.global[0];
9. tile_static int buf[4];
10. buf[local] = b[global];
11. tidx.barrier.wait();
12. result[global] = 0;
13. for (int i = 0; i < 4; i++)
14. result[global] += A[global][i] * buf[i];
15. }); 8
barrier
9
Architecture
source: NVIDIA TESLA: A UNIFIED GRAPHICS AND COMPUTING ARCHITECTURE
shared memory
accessible to all SPs
10
Goal
● Implement all the C++AMP functions on the CPU
instead of the GPU without any compiler
modification.
11
tile_static
● The limitation of C++ syntax leads to the
following choices
○ const, volatile
○ __attribute__(...)
○ static
● Choose static
○ static memory can be shared among all the threads
○ side effect: At most one thread group can be
executed at the same time.
#define tile_static static
12
Barrier.wait
● Threads in the same thread group will
wait at the point where “wait” is called.
● Program can
a. perform real barrier action
b. jump out of current execution context
13
● True threading
○ C++11 thread
● Fake threading(Coroutines)
○ setjmp/longjmp
○ makecontext/getcontext/swapcontext/setcontext
Approaches
14
C++11 thread
● launch hundreds of threads at a time.
● implement my own barrier by using the C++11
mutex library.
→ extremely slow.
→ The data on static memory will be corrupted
15
setjmp/longjmp
● int setjmp(jmp_buf env)
○ setjmp() saves the stack context/environment in env
for later use by longjmp.
○ The stack context will be invalidated if the function
which called setjmp() returns.
● void longjmp(jmp_buf env, int val);
○ longjmp() restores the environment saved by the last
call of setjmp.
16
1. #include <stdio.h>
2. #include <setjmp.h>
3. jmp_buf buf;
4. void wait(void) {
5. printf("wait\n"); // prints
6. longjmp(buf,1);
7. }
8. void first(void) {
9. wait();
10. printf("first\n"); // does not print
11. }
12. int main() {
13. if (!setjmp(buf))
14. first(); // when executed, setjmp returns 0
15. else // when longjmp jumps back, setjmp returns 1
16. printf("main\n"); // prints
17. return 0;
18. }
17
Pseudo code (1)
void entry()
{
while(!finish)
for(t : tasks)
run(t)
}
void fun()
{
…
wait();
...
}
void fun()
{
…
wait();
...
}
void entry()
{
while(!finish)
for(t : tasks)
run(t)
}
void fun()
{
…
wait();
...
}
void fun()
{
…
wait();
...
}
18
Pseudo code (2)
void entry()
{
while(!finish)
for(t : tasks)
run(t)
}
void fun()
{
…
wait();
...
}
void fun()
{
…
wait();
...
}
void entry()
{
while(!finish)
for(t : tasks)
run(t)
}
void fun()
{
…
wait();
...
}
void fun()
{
…
wait();
...
}
19
1. #include <stdio.h>
2. #include <setjmp.h>
3. jmp_buf buf, b;
4. void wait(void) {
5. printf("wait\n");
6. if (setjmp(b) == 0)
7. longjmp(buf,1);
8. }
9. void first(void) {
10. wait();
11. }
12. int main() {
13. if (!setjmp(buf) )
14. first();
15. else {
16. printf("main\n");
17. longjmp(b, 10);
18. }
19. return 0;
20. } 20
1. #include <stdio.h>
2. #include <setjmp.h>
3. jmp_buf buf, b;
4. void wait(void) {
5. printf("wait\n");
6. if (setjmp(b) == 0)
7. longjmp(buf,1);
8. }
9. void first(void) {
10. wait();
11. }
12. int main() {
13. if (!setjmp(buf) )
14. first();
15. else {
16. printf("main\n");
17. longjmp(b, 10);
18. }
19. return 0;
20. }
buf
21
1. #include <stdio.h>
2. #include <setjmp.h>
3. jmp_buf buf, b;
4. void wait(void) {
5. printf("wait\n");
6. if (setjmp(b) == 0)
7. longjmp(buf,1);
8. }
9. void first(void) {
10. wait();
11. }
12. int main() {
13. if (!setjmp(buf) )
14. first();
15. else {
16. printf("main\n");
17. longjmp(b, 10);
18. }
19. return 0;
20. }
ret address
buf
b
22
1. #include <stdio.h>
2. #include <setjmp.h>
3. jmp_buf buf, b;
4. void wait(void) {
5. printf("wait\n");
6. if (setjmp(b) == 0)
7. longjmp(buf,1);
8. }
9. void first(void) {
10. wait();
11. }
12. int main() {
13. if (!setjmp(buf) )
14. first();
15. else {
16. printf("main\n");
17. longjmp(b, 10);
18. }
19. return 0;
20. }
buf
b
23
1. #include <stdio.h>
2. #include <setjmp.h>
3. jmp_buf buf, b;
4. void wait(void) {
5. printf("wait\n");
6. if (setjmp(b) == 0)
7. longjmp(buf,1);
8. }
9. void first(void) {
10. wait();
11. }
12. int main() {
13. if (!setjmp(buf) )
14. first();
15. else {
16. printf("main\n");
17. longjmp(b, 10);
18. }
19. return 0;
20. }
Cannot return
???
???
???
buf
b
24
Problems
● Cannot return
○ return address in the stack is destroyed
● Cannot use too many static variables
○ will lose spilled registers
→ can be solved by using “alloca”
http://www.codemud.net/~thinker/GinGin_CGI.py/show_id_doc/489
25
ucontext.h
● ucontext_t
● getcontext
● makecontext
● swapcontext
● setcontext
26
ucontext_t
typedef struct ucontext {
struct ucontext *uc_link;
sigset_t uc_sigmask;
stack_t uc_stack;
mcontext_t uc_mcontext;
...
} ucontext_t;
● uc_link
○ points to the context that will be resumed when the current context
terminates
● uc_stack
○ the stack used by this context
● uc_mcontext
○ machine-specific representation of the saved context, that includes the
calling thread's machine registers
27
Functions
● int getcontext(ucontext_t *ucp);
○ initializes the structure pointed at by ucp.
● int setcontext(const ucontext_t *ucp);
○ restores the user context pointed at by ucp
● int swapcontext(ucontext_t *oucp, const
ucontext_t *ucp);
○ saves the current context in the structure pointed to
by oucp, and then activates the context pointed to by
ucp.
28
makecontext
● void makecontext(ucontext_t *ucp, void
(*func)(), int argc, ...);
○ glibc (x86_64) passes the arguments in registers,
as the AMD64 ABI specifies, instead of pushing
them on the stack
○ The size of the arguments passed to
makecontext should be no less than sizeof(register)
29
1. #include <stdio.h>
2. #include <ucontext.h>
3. static ucontext_t ctx[2];
4. static void f1 (void) {
5. puts("start f1");
6. swapcontext(&ctx[1], &ctx[0]);
7. puts("finish f1");
8. }
9. int main (void)
10. {
11. char st1[8192];
12. getcontext(&ctx[1]);
13. ctx[1].uc_stack.ss_sp = st1;
14. ctx[1].uc_stack.ss_size = sizeof st1;
15. ctx[1].uc_link = &ctx[0];
16. makecontext(&ctx[1], f1, 0);
17. swapcontext(&ctx[0], &ctx[1]);
18. swapcontext(&ctx[0], &ctx[1]);
19. return 0;
20. } 30
1. #include <stdio.h>
2. #include <ucontext.h>
3. static ucontext_t ctx[3];
4. static void f1 (void) {
5. puts("start f1");
6. swapcontext(&ctx[1], &ctx[0]);
7. puts("finish f1");
8. }
9. static void f2 (void)
10. {
11. puts("start f2");
12. swapcontext(&ctx[2], &ctx[1]);
13. puts("finish f2");
14. }
1. int main (void)
2. {
3. char st1[8192], st2[8192];
4. getcontext(&ctx[1]);
5. ctx[1].uc_stack.ss_sp = st1;
6. ctx[1].uc_stack.ss_size = sizeof st1;
7. ctx[1].uc_link = &ctx[0];
8. makecontext(&ctx[1], f1, 0);
9.
10. getcontext(&ctx[2]);
11. ctx[2].uc_stack.ss_sp = st2;
12. ctx[2].uc_stack.ss_size = sizeof st2;
13. ctx[2].uc_link = &ctx[1];
14. makecontext(&ctx[2], f2, 0);
15. swapcontext(&ctx[0], &ctx[2]);
16. swapcontext(&ctx[0], &ctx[2]);
17. return 0;
18. }
31
Fake threading (yield)
void entry()
{
setup(fun, 2);
while(!finish)
switch_to();
}
void fun()
{
…
wait();
...
}
void fun()
{
…
wait();
...
}
32
void entry()
{
setup(fun, 2);
while(!finish)
switch_to();
}
void fun()
{
…
wait();
...
}
void fun()
{
…
wait();
...
}
Problems
1. How to pass a lambda?
○ makecontext(&ctx,
(void (*)(void))&Kernel::operator(), …);
2. How to pass non-int arguments?
○ What if sizeof(Type) > sizeof(int)?
○ What about complex structures and classes?
33
Pass lambda
1. Use a wrapper function!!
template <typename Ker, typename Arg>
void fun(Ker k, Arg arg)
{
k(arg);
}
template <typename Ker, typename Arg>
void makectx(Ker k, Arg arg)
{
makecontext(&ctx, (void (*)(void))fun<Ker, Arg>, 2, k, arg);
}
34
Pass non-int arguments
2. Pass pointer instead!!
template <typename Ker, typename Arg>
void fun(Ker *k, Arg *arg)
{
(*k)(*arg);
}
template <typename Ker, typename Arg>
void makectx(Ker k, Arg arg)
{
makecontext(&ctx, (void (*)(void))fun<Ker, Arg>, 2, &k, &arg);
}
35
Additional
● Use a counter so that we can spawn
coroutines dynamically
● Can it be multithreaded? Yes
36
true threading
barrier
There are 12 threads in one thread group
37
one thread
barrier
38
multithreading
barrier
Hardware Core = 4
39
barrier
struct bar_t {
unsigned const count;
std::atomic<unsigned> spaces;
std::atomic<unsigned> generation;
bar_t(unsigned count_) :
count(count_), spaces(count_), generation(0)
{}
void wait() noexcept {
unsigned const my_generation = generation;
if (!--spaces) {
spaces = count;
++generation;
} else {
while(generation == my_generation);
}
}
}; source: C++ Concurrency in Action: Practical Multithreading
40
Summary
● It works fine on AMP right now
● The importance of low level knowledge
41
42

More Related Content

What's hot

Story of static code analyzer development
Story of static code analyzer developmentStory of static code analyzer development
Story of static code analyzer developmentAndrey Karpov
 
Histogram dan Segmentasi 2
Histogram dan Segmentasi 2Histogram dan Segmentasi 2
Histogram dan Segmentasi 2Lusiana Diyan
 
개발 과정 최적화 하기 내부툴로 더욱 강력한 개발하기 Stephen kennedy _(11시40분_103호)
개발 과정 최적화 하기 내부툴로 더욱 강력한 개발하기 Stephen kennedy _(11시40분_103호)개발 과정 최적화 하기 내부툴로 더욱 강력한 개발하기 Stephen kennedy _(11시40분_103호)
개발 과정 최적화 하기 내부툴로 더욱 강력한 개발하기 Stephen kennedy _(11시40분_103호)changehee lee
 
Powered by Python - PyCon Germany 2016
Powered by Python - PyCon Germany 2016Powered by Python - PyCon Germany 2016
Powered by Python - PyCon Germany 2016Steffen Wenz
 
4Developers 2018: Evolution of C++ Class Design (Mariusz Łapiński)
4Developers 2018: Evolution of C++ Class Design (Mariusz Łapiński)4Developers 2018: Evolution of C++ Class Design (Mariusz Łapiński)
4Developers 2018: Evolution of C++ Class Design (Mariusz Łapiński)PROIDEA
 
Facebook Glow Compiler のソースコードをグダグダ語る会
Facebook Glow Compiler のソースコードをグダグダ語る会Facebook Glow Compiler のソースコードをグダグダ語る会
Facebook Glow Compiler のソースコードをグダグダ語る会Mr. Vengineer
 
Kirk Shoop, Reactive programming in C++
Kirk Shoop, Reactive programming in C++Kirk Shoop, Reactive programming in C++
Kirk Shoop, Reactive programming in C++Sergey Platonov
 
HKG15-207: Advanced Toolchain Usage Part 3
HKG15-207: Advanced Toolchain Usage Part 3HKG15-207: Advanced Toolchain Usage Part 3
HKG15-207: Advanced Toolchain Usage Part 3Linaro
 
Global Interpreter Lock: Episode I - Break the Seal
Global Interpreter Lock: Episode I - Break the SealGlobal Interpreter Lock: Episode I - Break the Seal
Global Interpreter Lock: Episode I - Break the SealTzung-Bi Shih
 
Cluj.py Meetup: Extending Python in C
Cluj.py Meetup: Extending Python in CCluj.py Meetup: Extending Python in C
Cluj.py Meetup: Extending Python in CSteffen Wenz
 
深入淺出C語言
深入淺出C語言深入淺出C語言
深入淺出C語言Simen Li
 
Google Edge TPUで TensorFlow Liteを使った時に 何をやっているのかを妄想してみる 2 「エッジAIモダン計測制御の世界」オ...
Google Edge TPUで TensorFlow Liteを使った時に 何をやっているのかを妄想してみる 2  「エッジAIモダン計測制御の世界」オ...Google Edge TPUで TensorFlow Liteを使った時に 何をやっているのかを妄想してみる 2  「エッジAIモダン計測制御の世界」オ...
Google Edge TPUで TensorFlow Liteを使った時に 何をやっているのかを妄想してみる 2 「エッジAIモダン計測制御の世界」オ...Mr. Vengineer
 
Open CL For Speedup Workshop
Open CL For Speedup WorkshopOpen CL For Speedup Workshop
Open CL For Speedup WorkshopOfer Rosenberg
 
Windbg랑 친해지기
Windbg랑 친해지기Windbg랑 친해지기
Windbg랑 친해지기Ji Hun Kim
 
Cluj Big Data Meetup - Big Data in Practice
Cluj Big Data Meetup - Big Data in PracticeCluj Big Data Meetup - Big Data in Practice
Cluj Big Data Meetup - Big Data in PracticeSteffen Wenz
 
Pythonによるカスタム可能な高位設計技術 (Design Solution Forum 2016@新横浜)
Pythonによるカスタム可能な高位設計技術 (Design Solution Forum 2016@新横浜)Pythonによるカスタム可能な高位設計技術 (Design Solution Forum 2016@新横浜)
Pythonによるカスタム可能な高位設計技術 (Design Solution Forum 2016@新横浜)Shinya Takamaeda-Y
 

What's hot (20)

Story of static code analyzer development
Story of static code analyzer developmentStory of static code analyzer development
Story of static code analyzer development
 
Histogram dan Segmentasi 2
Histogram dan Segmentasi 2Histogram dan Segmentasi 2
Histogram dan Segmentasi 2
 
개발 과정 최적화 하기 내부툴로 더욱 강력한 개발하기 Stephen kennedy _(11시40분_103호)
개발 과정 최적화 하기 내부툴로 더욱 강력한 개발하기 Stephen kennedy _(11시40분_103호)개발 과정 최적화 하기 내부툴로 더욱 강력한 개발하기 Stephen kennedy _(11시40분_103호)
개발 과정 최적화 하기 내부툴로 더욱 강력한 개발하기 Stephen kennedy _(11시40분_103호)
 
OpenGL SC 2.0 Quick Reference
OpenGL SC 2.0 Quick ReferenceOpenGL SC 2.0 Quick Reference
OpenGL SC 2.0 Quick Reference
 
Vulkan 1.1 Reference Guide
Vulkan 1.1 Reference GuideVulkan 1.1 Reference Guide
Vulkan 1.1 Reference Guide
 
Powered by Python - PyCon Germany 2016
Powered by Python - PyCon Germany 2016Powered by Python - PyCon Germany 2016
Powered by Python - PyCon Germany 2016
 
4Developers 2018: Evolution of C++ Class Design (Mariusz Łapiński)
4Developers 2018: Evolution of C++ Class Design (Mariusz Łapiński)4Developers 2018: Evolution of C++ Class Design (Mariusz Łapiński)
4Developers 2018: Evolution of C++ Class Design (Mariusz Łapiński)
 
Dafunctor
DafunctorDafunctor
Dafunctor
 
Facebook Glow Compiler のソースコードをグダグダ語る会
Facebook Glow Compiler のソースコードをグダグダ語る会Facebook Glow Compiler のソースコードをグダグダ語る会
Facebook Glow Compiler のソースコードをグダグダ語る会
 
Kirk Shoop, Reactive programming in C++
Kirk Shoop, Reactive programming in C++Kirk Shoop, Reactive programming in C++
Kirk Shoop, Reactive programming in C++
 
HKG15-207: Advanced Toolchain Usage Part 3
HKG15-207: Advanced Toolchain Usage Part 3HKG15-207: Advanced Toolchain Usage Part 3
HKG15-207: Advanced Toolchain Usage Part 3
 
Global Interpreter Lock: Episode I - Break the Seal
Global Interpreter Lock: Episode I - Break the SealGlobal Interpreter Lock: Episode I - Break the Seal
Global Interpreter Lock: Episode I - Break the Seal
 
Cluj.py Meetup: Extending Python in C
Cluj.py Meetup: Extending Python in CCluj.py Meetup: Extending Python in C
Cluj.py Meetup: Extending Python in C
 
深入淺出C語言
深入淺出C語言深入淺出C語言
深入淺出C語言
 
Google Edge TPUで TensorFlow Liteを使った時に 何をやっているのかを妄想してみる 2 「エッジAIモダン計測制御の世界」オ...
Google Edge TPUで TensorFlow Liteを使った時に 何をやっているのかを妄想してみる 2  「エッジAIモダン計測制御の世界」オ...Google Edge TPUで TensorFlow Liteを使った時に 何をやっているのかを妄想してみる 2  「エッジAIモダン計測制御の世界」オ...
Google Edge TPUで TensorFlow Liteを使った時に 何をやっているのかを妄想してみる 2 「エッジAIモダン計測制御の世界」オ...
 
Open CL For Speedup Workshop
Open CL For Speedup WorkshopOpen CL For Speedup Workshop
Open CL For Speedup Workshop
 
Interpreter, Compiler, JIT from scratch
Interpreter, Compiler, JIT from scratchInterpreter, Compiler, JIT from scratch
Interpreter, Compiler, JIT from scratch
 
Windbg랑 친해지기
Windbg랑 친해지기Windbg랑 친해지기
Windbg랑 친해지기
 
Cluj Big Data Meetup - Big Data in Practice
Cluj Big Data Meetup - Big Data in PracticeCluj Big Data Meetup - Big Data in Practice
Cluj Big Data Meetup - Big Data in Practice
 
Pythonによるカスタム可能な高位設計技術 (Design Solution Forum 2016@新横浜)
Pythonによるカスタム可能な高位設計技術 (Design Solution Forum 2016@新横浜)Pythonによるカスタム可能な高位設計技術 (Design Solution Forum 2016@新横浜)
Pythonによるカスタム可能な高位設計技術 (Design Solution Forum 2016@新横浜)
 

Similar to GPU Programming on CPU - Using C++AMP

C++ CoreHard Autumn 2018. Concurrency and Parallelism in C++17 and C++20/23 -...
C++ CoreHard Autumn 2018. Concurrency and Parallelism in C++17 and C++20/23 -...C++ CoreHard Autumn 2018. Concurrency and Parallelism in C++17 and C++20/23 -...
C++ CoreHard Autumn 2018. Concurrency and Parallelism in C++17 and C++20/23 -...corehard_by
 
ExperiencesSharingOnEmbeddedSystemDevelopment_20160321
ExperiencesSharingOnEmbeddedSystemDevelopment_20160321ExperiencesSharingOnEmbeddedSystemDevelopment_20160321
ExperiencesSharingOnEmbeddedSystemDevelopment_20160321Teddy Hsiung
 
Whats new in_csharp4
Whats new in_csharp4Whats new in_csharp4
Whats new in_csharp4Abed Bukhari
 
Runtime Code Generation and Data Management for Heterogeneous Computing in Java
Runtime Code Generation and Data Management for Heterogeneous Computing in JavaRuntime Code Generation and Data Management for Heterogeneous Computing in Java
Runtime Code Generation and Data Management for Heterogeneous Computing in JavaJuan Fumero
 
rrxv6 Build a Riscv xv6 Kernel in Rust.pdf
rrxv6 Build a Riscv xv6 Kernel in Rust.pdfrrxv6 Build a Riscv xv6 Kernel in Rust.pdf
rrxv6 Build a Riscv xv6 Kernel in Rust.pdfYodalee
 
CUDA Deep Dive
CUDA Deep DiveCUDA Deep Dive
CUDA Deep Divekrasul
 
Giorgio zoppi cpp11concurrency
Giorgio zoppi cpp11concurrencyGiorgio zoppi cpp11concurrency
Giorgio zoppi cpp11concurrencyGiorgio Zoppi
 
2.1 ### uVision Project, (C) Keil Software .docx
2.1   ### uVision Project, (C) Keil Software    .docx2.1   ### uVision Project, (C) Keil Software    .docx
2.1 ### uVision Project, (C) Keil Software .docxtarifarmarie
 
2011.02.18 marco parenzan - modelli di programmazione per le gpu
2011.02.18   marco parenzan - modelli di programmazione per le gpu2011.02.18   marco parenzan - modelli di programmazione per le gpu
2011.02.18 marco parenzan - modelli di programmazione per le gpuMarco Parenzan
 
Embedded systemsproject_2020
Embedded systemsproject_2020Embedded systemsproject_2020
Embedded systemsproject_2020Nikos Mouzakitis
 
Rust LDN 24 7 19 Oxidising the Command Line
Rust LDN 24 7 19 Oxidising the Command LineRust LDN 24 7 19 Oxidising the Command Line
Rust LDN 24 7 19 Oxidising the Command LineMatt Provost
 
Software transactional memory. pure functional approach
Software transactional memory. pure functional approachSoftware transactional memory. pure functional approach
Software transactional memory. pure functional approachAlexander Granin
 
20140531 serebryany lecture02_find_scary_cpp_bugs
20140531 serebryany lecture02_find_scary_cpp_bugs20140531 serebryany lecture02_find_scary_cpp_bugs
20140531 serebryany lecture02_find_scary_cpp_bugsComputer Science Club
 
Tema3_Introduction_to_CUDA_C.pdf
Tema3_Introduction_to_CUDA_C.pdfTema3_Introduction_to_CUDA_C.pdf
Tema3_Introduction_to_CUDA_C.pdfpepe464163
 
TLPI - 6 Process
TLPI - 6 ProcessTLPI - 6 Process
TLPI - 6 ProcessShu-Yu Fu
 
Embedded JavaScript
Embedded JavaScriptEmbedded JavaScript
Embedded JavaScriptJens Siebert
 
Roll your own toy unix clone os
Roll your own toy unix clone osRoll your own toy unix clone os
Roll your own toy unix clone oseramax
 
Parallel Futures of a Game Engine
Parallel Futures of a Game EngineParallel Futures of a Game Engine
Parallel Futures of a Game EngineJohan Andersson
 

Similar to GPU Programming on CPU - Using C++AMP (20)

C++ CoreHard Autumn 2018. Concurrency and Parallelism in C++17 and C++20/23 -...
C++ CoreHard Autumn 2018. Concurrency and Parallelism in C++17 and C++20/23 -...C++ CoreHard Autumn 2018. Concurrency and Parallelism in C++17 and C++20/23 -...
C++ CoreHard Autumn 2018. Concurrency and Parallelism in C++17 and C++20/23 -...
 
Microkernel Development
Microkernel DevelopmentMicrokernel Development
Microkernel Development
 
ExperiencesSharingOnEmbeddedSystemDevelopment_20160321
ExperiencesSharingOnEmbeddedSystemDevelopment_20160321ExperiencesSharingOnEmbeddedSystemDevelopment_20160321
ExperiencesSharingOnEmbeddedSystemDevelopment_20160321
 
Whats new in_csharp4
Whats new in_csharp4Whats new in_csharp4
Whats new in_csharp4
 
Runtime Code Generation and Data Management for Heterogeneous Computing in Java
Runtime Code Generation and Data Management for Heterogeneous Computing in JavaRuntime Code Generation and Data Management for Heterogeneous Computing in Java
Runtime Code Generation and Data Management for Heterogeneous Computing in Java
 
Quiz 9
Quiz 9Quiz 9
Quiz 9
 
rrxv6 Build a Riscv xv6 Kernel in Rust.pdf
rrxv6 Build a Riscv xv6 Kernel in Rust.pdfrrxv6 Build a Riscv xv6 Kernel in Rust.pdf
rrxv6 Build a Riscv xv6 Kernel in Rust.pdf
 
CUDA Deep Dive
CUDA Deep DiveCUDA Deep Dive
CUDA Deep Dive
 
Giorgio zoppi cpp11concurrency
Giorgio zoppi cpp11concurrencyGiorgio zoppi cpp11concurrency
Giorgio zoppi cpp11concurrency
 
2.1 ### uVision Project, (C) Keil Software .docx
2.1   ### uVision Project, (C) Keil Software    .docx2.1   ### uVision Project, (C) Keil Software    .docx
2.1 ### uVision Project, (C) Keil Software .docx
 
2011.02.18 marco parenzan - modelli di programmazione per le gpu
2011.02.18   marco parenzan - modelli di programmazione per le gpu2011.02.18   marco parenzan - modelli di programmazione per le gpu
2011.02.18 marco parenzan - modelli di programmazione per le gpu
 
Embedded systemsproject_2020
Embedded systemsproject_2020Embedded systemsproject_2020
Embedded systemsproject_2020
 
Rust LDN 24 7 19 Oxidising the Command Line
Rust LDN 24 7 19 Oxidising the Command LineRust LDN 24 7 19 Oxidising the Command Line
Rust LDN 24 7 19 Oxidising the Command Line
 
Software transactional memory. pure functional approach
Software transactional memory. pure functional approachSoftware transactional memory. pure functional approach
Software transactional memory. pure functional approach
 
20140531 serebryany lecture02_find_scary_cpp_bugs
20140531 serebryany lecture02_find_scary_cpp_bugs20140531 serebryany lecture02_find_scary_cpp_bugs
20140531 serebryany lecture02_find_scary_cpp_bugs
 
Tema3_Introduction_to_CUDA_C.pdf
Tema3_Introduction_to_CUDA_C.pdfTema3_Introduction_to_CUDA_C.pdf
Tema3_Introduction_to_CUDA_C.pdf
 
TLPI - 6 Process
TLPI - 6 ProcessTLPI - 6 Process
TLPI - 6 Process
 
Embedded JavaScript
Embedded JavaScriptEmbedded JavaScript
Embedded JavaScript
 
Roll your own toy unix clone os
Roll your own toy unix clone osRoll your own toy unix clone os
Roll your own toy unix clone os
 
Parallel Futures of a Game Engine
Parallel Futures of a Game EngineParallel Futures of a Game Engine
Parallel Futures of a Game Engine
 

Recently uploaded

How To Use Server-Side Rendering with Nuxt.js
How To Use Server-Side Rendering with Nuxt.jsHow To Use Server-Side Rendering with Nuxt.js
How To Use Server-Side Rendering with Nuxt.jsAndolasoft Inc
 
Exploring the Best Video Editing App.pdf
Exploring the Best Video Editing App.pdfExploring the Best Video Editing App.pdf
Exploring the Best Video Editing App.pdfproinshot.com
 
W01_panagenda_Navigating-the-Future-with-The-Hitchhikers-Guide-to-Notes-and-D...
W01_panagenda_Navigating-the-Future-with-The-Hitchhikers-Guide-to-Notes-and-D...W01_panagenda_Navigating-the-Future-with-The-Hitchhikers-Guide-to-Notes-and-D...
W01_panagenda_Navigating-the-Future-with-The-Hitchhikers-Guide-to-Notes-and-D...panagenda
 
+971565801893>>SAFE AND ORIGINAL ABORTION PILLS FOR SALE IN DUBAI AND ABUDHAB...
+971565801893>>SAFE AND ORIGINAL ABORTION PILLS FOR SALE IN DUBAI AND ABUDHAB...+971565801893>>SAFE AND ORIGINAL ABORTION PILLS FOR SALE IN DUBAI AND ABUDHAB...
+971565801893>>SAFE AND ORIGINAL ABORTION PILLS FOR SALE IN DUBAI AND ABUDHAB...Health
 
8257 interfacing 2 in microprocessor for btech students
8257 interfacing 2 in microprocessor for btech students8257 interfacing 2 in microprocessor for btech students
8257 interfacing 2 in microprocessor for btech studentsHimanshiGarg82
 
Tech Tuesday-Harness the Power of Effective Resource Planning with OnePlan’s ...
Tech Tuesday-Harness the Power of Effective Resource Planning with OnePlan’s ...Tech Tuesday-Harness the Power of Effective Resource Planning with OnePlan’s ...
Tech Tuesday-Harness the Power of Effective Resource Planning with OnePlan’s ...OnePlan Solutions
 
The Real-World Challenges of Medical Device Cybersecurity- Mitigating Vulnera...
The Real-World Challenges of Medical Device Cybersecurity- Mitigating Vulnera...The Real-World Challenges of Medical Device Cybersecurity- Mitigating Vulnera...
The Real-World Challenges of Medical Device Cybersecurity- Mitigating Vulnera...ICS
 
Right Money Management App For Your Financial Goals
Right Money Management App For Your Financial GoalsRight Money Management App For Your Financial Goals
Right Money Management App For Your Financial GoalsJhone kinadey
 
Diamond Application Development Crafting Solutions with Precision
Diamond Application Development Crafting Solutions with PrecisionDiamond Application Development Crafting Solutions with Precision
Diamond Application Development Crafting Solutions with PrecisionSolGuruz
 
HR Software Buyers Guide in 2024 - HRSoftware.com
HR Software Buyers Guide in 2024 - HRSoftware.comHR Software Buyers Guide in 2024 - HRSoftware.com
HR Software Buyers Guide in 2024 - HRSoftware.comFatema Valibhai
 
How to Choose the Right Laravel Development Partner in New York City_compress...
How to Choose the Right Laravel Development Partner in New York City_compress...How to Choose the Right Laravel Development Partner in New York City_compress...
How to Choose the Right Laravel Development Partner in New York City_compress...software pro Development
 
Introducing Microsoft’s new Enterprise Work Management (EWM) Solution
Introducing Microsoft’s new Enterprise Work Management (EWM) SolutionIntroducing Microsoft’s new Enterprise Work Management (EWM) Solution
Introducing Microsoft’s new Enterprise Work Management (EWM) SolutionOnePlan Solutions
 
The Ultimate Test Automation Guide_ Best Practices and Tips.pdf
The Ultimate Test Automation Guide_ Best Practices and Tips.pdfThe Ultimate Test Automation Guide_ Best Practices and Tips.pdf
The Ultimate Test Automation Guide_ Best Practices and Tips.pdfkalichargn70th171
 
Direct Style Effect Systems - The Print[A] Example - A Comprehension Aid
Direct Style Effect Systems -The Print[A] Example- A Comprehension AidDirect Style Effect Systems -The Print[A] Example- A Comprehension Aid
Direct Style Effect Systems - The Print[A] Example - A Comprehension AidPhilip Schwarz
 
5 Signs You Need a Fashion PLM Software.pdf
5 Signs You Need a Fashion PLM Software.pdf5 Signs You Need a Fashion PLM Software.pdf
5 Signs You Need a Fashion PLM Software.pdfWave PLM
 
AI Mastery 201: Elevating Your Workflow with Advanced LLM Techniques
AI Mastery 201: Elevating Your Workflow with Advanced LLM TechniquesAI Mastery 201: Elevating Your Workflow with Advanced LLM Techniques
AI Mastery 201: Elevating Your Workflow with Advanced LLM TechniquesVictorSzoltysek
 
Azure_Native_Qumulo_High_Performance_Compute_Benchmarks.pdf
Azure_Native_Qumulo_High_Performance_Compute_Benchmarks.pdfAzure_Native_Qumulo_High_Performance_Compute_Benchmarks.pdf
Azure_Native_Qumulo_High_Performance_Compute_Benchmarks.pdfryanfarris8
 

Recently uploaded (20)

How To Use Server-Side Rendering with Nuxt.js
How To Use Server-Side Rendering with Nuxt.jsHow To Use Server-Side Rendering with Nuxt.js
How To Use Server-Side Rendering with Nuxt.js
 
Exploring the Best Video Editing App.pdf
Exploring the Best Video Editing App.pdfExploring the Best Video Editing App.pdf
Exploring the Best Video Editing App.pdf
 
W01_panagenda_Navigating-the-Future-with-The-Hitchhikers-Guide-to-Notes-and-D...
W01_panagenda_Navigating-the-Future-with-The-Hitchhikers-Guide-to-Notes-and-D...W01_panagenda_Navigating-the-Future-with-The-Hitchhikers-Guide-to-Notes-and-D...
W01_panagenda_Navigating-the-Future-with-The-Hitchhikers-Guide-to-Notes-and-D...
 
Microsoft AI Transformation Partner Playbook.pdf
Microsoft AI Transformation Partner Playbook.pdfMicrosoft AI Transformation Partner Playbook.pdf
Microsoft AI Transformation Partner Playbook.pdf
 
+971565801893>>SAFE AND ORIGINAL ABORTION PILLS FOR SALE IN DUBAI AND ABUDHAB...
+971565801893>>SAFE AND ORIGINAL ABORTION PILLS FOR SALE IN DUBAI AND ABUDHAB...+971565801893>>SAFE AND ORIGINAL ABORTION PILLS FOR SALE IN DUBAI AND ABUDHAB...
+971565801893>>SAFE AND ORIGINAL ABORTION PILLS FOR SALE IN DUBAI AND ABUDHAB...
 
8257 interfacing 2 in microprocessor for btech students
8257 interfacing 2 in microprocessor for btech students8257 interfacing 2 in microprocessor for btech students
8257 interfacing 2 in microprocessor for btech students
 
Tech Tuesday-Harness the Power of Effective Resource Planning with OnePlan’s ...
Tech Tuesday-Harness the Power of Effective Resource Planning with OnePlan’s ...Tech Tuesday-Harness the Power of Effective Resource Planning with OnePlan’s ...
Tech Tuesday-Harness the Power of Effective Resource Planning with OnePlan’s ...
 
The Real-World Challenges of Medical Device Cybersecurity- Mitigating Vulnera...
The Real-World Challenges of Medical Device Cybersecurity- Mitigating Vulnera...The Real-World Challenges of Medical Device Cybersecurity- Mitigating Vulnera...
The Real-World Challenges of Medical Device Cybersecurity- Mitigating Vulnera...
 
Right Money Management App For Your Financial Goals
Right Money Management App For Your Financial GoalsRight Money Management App For Your Financial Goals
Right Money Management App For Your Financial Goals
 
Diamond Application Development Crafting Solutions with Precision
Diamond Application Development Crafting Solutions with PrecisionDiamond Application Development Crafting Solutions with Precision
Diamond Application Development Crafting Solutions with Precision
 
HR Software Buyers Guide in 2024 - HRSoftware.com
HR Software Buyers Guide in 2024 - HRSoftware.comHR Software Buyers Guide in 2024 - HRSoftware.com
HR Software Buyers Guide in 2024 - HRSoftware.com
 
How to Choose the Right Laravel Development Partner in New York City_compress...
How to Choose the Right Laravel Development Partner in New York City_compress...How to Choose the Right Laravel Development Partner in New York City_compress...
How to Choose the Right Laravel Development Partner in New York City_compress...
 
CHEAP Call Girls in Pushp Vihar (-DELHI )🔝 9953056974🔝(=)/CALL GIRLS SERVICE
CHEAP Call Girls in Pushp Vihar (-DELHI )🔝 9953056974🔝(=)/CALL GIRLS SERVICECHEAP Call Girls in Pushp Vihar (-DELHI )🔝 9953056974🔝(=)/CALL GIRLS SERVICE
CHEAP Call Girls in Pushp Vihar (-DELHI )🔝 9953056974🔝(=)/CALL GIRLS SERVICE
 
Introducing Microsoft’s new Enterprise Work Management (EWM) Solution
Introducing Microsoft’s new Enterprise Work Management (EWM) SolutionIntroducing Microsoft’s new Enterprise Work Management (EWM) Solution
Introducing Microsoft’s new Enterprise Work Management (EWM) Solution
 
The Ultimate Test Automation Guide_ Best Practices and Tips.pdf
The Ultimate Test Automation Guide_ Best Practices and Tips.pdfThe Ultimate Test Automation Guide_ Best Practices and Tips.pdf
The Ultimate Test Automation Guide_ Best Practices and Tips.pdf
 
Vip Call Girls Noida ➡️ Delhi ➡️ 9999965857 No Advance 24HRS Live
Vip Call Girls Noida ➡️ Delhi ➡️ 9999965857 No Advance 24HRS LiveVip Call Girls Noida ➡️ Delhi ➡️ 9999965857 No Advance 24HRS Live
Vip Call Girls Noida ➡️ Delhi ➡️ 9999965857 No Advance 24HRS Live
 
Direct Style Effect Systems - The Print[A] Example - A Comprehension Aid
Direct Style Effect Systems -The Print[A] Example- A Comprehension AidDirect Style Effect Systems -The Print[A] Example- A Comprehension Aid
Direct Style Effect Systems - The Print[A] Example - A Comprehension Aid
 
5 Signs You Need a Fashion PLM Software.pdf
5 Signs You Need a Fashion PLM Software.pdf5 Signs You Need a Fashion PLM Software.pdf
5 Signs You Need a Fashion PLM Software.pdf
 
AI Mastery 201: Elevating Your Workflow with Advanced LLM Techniques
AI Mastery 201: Elevating Your Workflow with Advanced LLM TechniquesAI Mastery 201: Elevating Your Workflow with Advanced LLM Techniques
AI Mastery 201: Elevating Your Workflow with Advanced LLM Techniques
 
Azure_Native_Qumulo_High_Performance_Compute_Benchmarks.pdf
Azure_Native_Qumulo_High_Performance_Compute_Benchmarks.pdfAzure_Native_Qumulo_High_Performance_Compute_Benchmarks.pdf
Azure_Native_Qumulo_High_Performance_Compute_Benchmarks.pdf
 

GPU Programming on CPU - Using C++AMP

  • 2. Outline 1. Introduction to C++AMP 2. Introduction to Tiling 3. tile_static 4. barrier.wait and solutions a. C++11 thread b. setjmp/longjmp c. ucontext 2
  • 3. (Homogeneous coordinates) (0, 0) (0, 1) (0, 2) (0, 3) (1, 0) (1, 1) (1, 2) (1, 3) (2, 0) (2, 1) (2, 2) (2, 3) (3, 0) (3, 1) (3, 2) (3, 3) X 0 1 2 3 Matrix A b = 0 1 2 3 result Computing example ● Simple matrix multiplication 3
  • 4. C++ Version 1. int A[4][4]; 2. int b[4]; 3. int result[4]; 4. for (int i = 0; i < 4; i++) { 5. result[i] = 0; 6. for (int j = 0; j < 4; j++) 7. result[i] += A[i][j] * b[j]; 8. } 4
  • 5. C++AMP Version 1. array_view<float, 2> A(4, 4); 2. array_view<float, 1> b(4); 3. array_view<float, 1> result(4); 4. extent<1> ext(4); 5. parallel_for_each(ext, [&](index<1> idx) restrict(amp) 6. { 7. result[idx[0]] = 0; 8. for (int i = 0; i < 4; i++) 9. result[idx[0]] += A(idx[0], i) * b(i); 10. }); 5
  • 6. memory access 0 1 2 3 P0 P1 P2 P3 global memory b 100t Total access time = 400t 6
  • 7. shared memory 0 1 2 3 shared memory 10t 100t Total access time = 130t b 7
  • 8. 1. array_view<float, 2> A(4, 4); 2. array_view<float, 1> b(4); 3. array_view<float, 1> result(4); 4. extent<1> ext(4); 5. parallel_for_each(ext.tile<4>(), [&](tiled_index<4> tidx) restrict(amp) 6. { 7. int local = tidx.local[0]; 8. int global = tidx.global[0]; 9. tile_static int buf[4]; 10. buf[local] = b[global]; 11. tidx.barrier.wait(); 12. result[global] = 0; 13. for (int i = 0; i < 4; i++) 14. result[global] += A[global][i] * buf[i]; 15. }); 8
  • 10. Architecture source: NVIDIA TESLA: A UNIFIED GRAPHICS AND COMPUTING ARCHITECTURE shared memory accessible to all SPs 10
  • 11. Goal ● Implement all the C++AMP functions on the CPU instead of the GPU without any compiler modification. 11
  • 12. tile_static ● The limitation of C++ syntax leads to the following choices ○ const, volatile ○ __attribute__(...) ○ static ● Choose static ○ static memory can be shared among all the threads ○ side effect: At most one thread group can be executed at the same time. #define tile_static static 12
  • 13. Barrier.wait ● Threads in the same thread group will wait at the point where “wait” is called. ● The program can a. perform real barrier action b. jump out of current execution context 13
  • 14. ● True threading ○ C++11 thread ● Fake threading(Coroutines) ○ setjmp/longjmp ○ makecontext/getcontext/swapcontext/setcontext Approaches 14
  • 15. C++11 thread ● launch hundreds of threads at a time. ● implemented my own barrier by using the C++11 mutex library. → extremely slow. → The data in static memory will be corrupted 15
  • 16. setjmp/longjmp ● int setjmp(jmp_buf env) ○ setjmp() saves the stack context/environment in env for later use by longjmp. ○ The stack context will be invalidated if the function which called setjmp() returns. ● void longjmp(jmp_buf env, int val); ○ longjmp() restores the environment saved by the last call of setjmp. 16
  • 17. 1. #include <stdio.h> 2. #include <setjmp.h> 3. jmp_buf buf; 4. void wait(void) { 5. printf("wait\n"); // prints 6. longjmp(buf,1); 7. } 8. void first(void) { 9. wait(); 10. printf("first\n"); // does not print 11. } 12. int main() { 13. if (!setjmp(buf)) 14. first(); // when executed, setjmp returns 0 15. else // when longjmp jumps back, setjmp returns 1 16. printf("main\n"); // prints 17. return 0; 18. } 17
  • 18. Pseudo code (1) void entry() { while(!finish) for(t : tasks) run(t) } void fun() { … wait(); ... } void fun() { … wait(); ... } void entry() { while(!finish) for(t : tasks) run(t) } void fun() { … wait(); ... } void fun() { … wait(); ... } 18
  • 19. Pseudo code (2) void entry() { while(!finish) for(t : tasks) run(t) } void fun() { … wait(); ... } void fun() { … wait(); ... } void entry() { while(!finish) for(t : tasks) run(t) } void fun() { … wait(); ... } void fun() { … wait(); ... } 19
  • 20. 1. #include <stdio.h> 2. #include <setjmp.h> 3. jmp_buf buf, b; 4. void wait(void) { 5. printf("wait\n"); 6. if (setjmp(b) == 0) 7. longjmp(buf,1); 8. } 9. void first(void) { 10. wait(); 11. } 12. int main() { 13. if (!setjmp(buf) ) 14. first(); 15. else { 16. printf("main\n"); 17. longjmp(b, 10); 18. } 19. return 0; 20. } 20
  • 21. 1. #include <stdio.h> 2. #include <setjmp.h> 3. jmp_buf buf, b; 4. void wait(void) { 5. printf("wait\n"); 6. if (setjmp(b) == 0) 7. longjmp(buf,1); 8. } 9. void first(void) { 10. wait(); 11. } 12. int main() { 13. if (!setjmp(buf) ) 14. first(); 15. else { 16. printf("main\n"); 17. longjmp(b, 10); 18. } 19. return 0; 20. } buf 21
  • 22. 1. #include <stdio.h> 2. #include <setjmp.h> 3. jmp_buf buf, b; 4. void wait(void) { 5. printf("wait\n"); 6. if (setjmp(b) == 0) 7. longjmp(buf,1); 8. } 9. void first(void) { 10. wait(); 11. } 12. int main() { 13. if (!setjmp(buf) ) 14. first(); 15. else { 16. printf("main\n"); 17. longjmp(b, 10); 18. } 19. return 0; 20. } ret address buf b 22
  • 23. 1. #include <stdio.h> 2. #include <setjmp.h> 3. jmp_buf buf, b; 4. void wait(void) { 5. printf("wait\n"); 6. if (setjmp(b) == 0) 7. longjmp(buf,1); 8. } 9. void first(void) { 10. wait(); 11. } 12. int main() { 13. if (!setjmp(buf) ) 14. first(); 15. else { 16. printf("main\n"); 17. longjmp(b, 10); 18. } 19. return 0; 20. } buf b 23
  • 24. 1. #include <stdio.h> 2. #include <setjmp.h> 3. jmp_buf buf, b; 4. void wait(void) { 5. printf("wait\n"); 6. if (setjmp(b) == 0) 7. longjmp(buf,1); 8. } 9. void first(void) { 10. wait(); 11. } 12. int main() { 13. if (!setjmp(buf) ) 14. first(); 15. else { 16. printf("main\n"); 17. longjmp(b, 10); 18. } 19. return 0; 20. } Cannot return ??? ??? ??? buf b 24
  • 25. Problems ● Cannot return ○ return address in the stack is destroyed ● Cannot use too many static variables ○ will lose spilled registers → can be solved by using “alloca” http://www.codemud.net/~thinker/GinGin_CGI.py/show_id_doc/489 25
  • 26. ucontext.h ● ucontext_t ● getcontext ● makecontext ● swapcontext ● setcontext 26
  • 27. ucontext_t typedef struct ucontext { struct ucontext *uc_link; sigset_t uc_sigmask; stack_t uc_stack; mcontext_t uc_mcontext; ... } ucontext_t; ● uc_link ○ points to the context that will be resumed when the current context terminates ● uc_stack ○ the stack used by this context ● uc_mcontext ○ machine-specific representation of the saved context, that includes the calling thread's machine registers 27
  • 28. Functions ● int getcontext(ucontext_t *ucp); ○ initializes the structure pointed at by ucp. ● int setcontext(const ucontext_t *ucp); ○ restores the user context pointed at by ucp ● int swapcontext(ucontext_t *oucp, const ucontext_t *ucp); ○ saves the current context in the structure pointed to by oucp, and then activates the context pointed to by ucp. 28
  • 29. makecontext ● void makecontext(ucontext_t *ucp, void (*func)(), int argc, ...); ○ glibc(x86_64) saves the arguments to registers instead of pushing them on stack as AMD64 ABI said ○ The size of the arguments that passed to makecontext should be no less than sizeof(register) 29
  • 30. 1. #include <stdio.h> 2. #include <ucontext.h> 3. static ucontext_t ctx[2]; 4. static void f1 (void) { 5. puts("start f1"); 6. swapcontext(&ctx[1], &ctx[0]); 7. puts("finish f1"); 8. } 9. int main (void) 10. { 11. char st1[8192]; 12. getcontext(&ctx[1]); 13. ctx[1].uc_stack.ss_sp = st1; 14. ctx[1].uc_stack.ss_size = sizeof st1; 15. ctx[1].uc_link = &ctx[0]; 16. makecontext(&ctx[1], f1, 0); 17. swapcontext(&ctx[0], &ctx[1]); 18. swapcontext(&ctx[0], &ctx[1]); 19. return 0; 20. } 30
  • 31. 1. #include <stdio.h> 2. #include <ucontext.h> 3. static ucontext_t ctx[3]; 4. static void f1 (void) { 5. puts("start f1"); 6. swapcontext(&ctx[1], &ctx [0]); 7. puts("finish f1"); 8. } 9. static void f2 (void) 10. { 11. puts("start f2"); 12. swapcontext(&ctx[2], &ctx [1]); 13. puts("finish f2"); 14. } 1. int main (void) 2. { 3. char st1[8192], st2[8192]; 4. getcontext(&ctx[1]); 5. ctx[1].uc_stack.ss_sp = st1; 6. ctx[1].uc_stack.ss_size = sizeof st1; 7. ctx[1].uc_link = &ctx[0]; 8. makecontext(&ctx[1], f1, 0); 9. 10. getcontext(&ctx[2]); 11. ctx[2].uc_stack.ss_sp = st2; 12. ctx[2].uc_stack.ss_size = sizeof st2; 13. ctx[2].uc_link = &ctx[1]; 14. makecontext(&ctx[2], f2, 0); 15. swapcontext(&ctx[0], &ctx[2]); 16. swapcontext(&ctx[0], &ctx[2]); 17. return 0; 18. } 31
  • 32. Fake threading (yield) void entry() { setup(fun, 2); while(!finish) switch_to(); } void fun() { … wait(); ... } void fun() { … wait(); ... } 32 void entry() { setup(fun, 2); while(!finish) switch_to(); } void fun() { … wait(); ... } void fun() { … wait(); ... }
  • 33. Problems 1. How to pass a lambda? ○ makecontext(&ctx, (void (*)(void))&Kernel::operator(), …); 2. How to pass non-int arguments? ○ What if sizeof(Type) > sizeof(int) ○ How about complex structure and class 33
  • 34. Pass lambda 1. Use a wrapper function!! template <typename Ker, typename Arg> void fun(Ker k, Arg arg) { k(arg); } template <typename Ker, typename Arg> void makectx(Ker k, Arg arg) { makecontext(&ctx, (void (*)(void))fun<Ker, Arg>, 2, k, arg); } 34
  • 35. Pass non-int arguments 2. Pass pointer instead!! template <typename Ker, typename Arg> void fun(Ker *k, Arg *arg) { (*k)(*arg); } template <typename Ker, typename Arg> void makectx(Ker k, Arg arg) { makecontext(&ctx, (void (*)(void))fun<Ker, Arg>, 2, &k, &arg); } 35
  • 36. Additional ● Use a counter so that we can spawn coroutines dynamically ● Can it be multithreaded? Yes 36
  • 37. true threading barrier There are 12 threads in one thread group 37
  • 40. barrier struct bar_t { unsigned const count; std::atomic<unsigned> spaces; std::atomic<unsigned> generation; bar_t(unsigned count_) : count(count_), spaces(count_), generation(0) {} void wait() noexcept { unsigned const my_generation = generation; if (!--spaces) { spaces = count; ++generation; } else { while(generation == my_generation); } } }; source: C++ Concurrency in Action: Practical Multithreading 40
  • 41. Summary ● It works fine on AMP right now ● The importance of low level knowledge 41
  • 42. 42