SlideShare a Scribd company logo
C++AMP on Linux 
Miller Lee
About Me 
● Miller Lee 
● Junior student at NCTU CS 
● Interests: C/C++, PL, CA, OS, compiler, 
parallel programming, optimization
Why C++ AMP? 
● GPUs can be 10+X 
faster than CPUs for 
parallel code 
● CUDA and OpenCL 
are still too 
complex/verbose for 
programmers
GPU computing 
requires explicit data transfer
What we need in GPU programming 
1. put data parallel codes into a kernel for GPU 
to execute 
2. pass the arguments to GPU 
○ We cannot pass the arguments via the stack 
3. an index to indicate current thread 
4. move the data between GPU and CPU 
memory
OpenCL as an example
Device code in OpenCL 
/* Naive matrix multiply: one work-item computes one element of C = A * B.
 * A has width wA (the shared/inner dimension); B and therefore C have
 * width wB. */
__kernel void
matrixMul(__global float* C, __global float* A,
          __global float* B, int wA, int wB)
{
    int tx = get_global_id(0);  /* column index into C */
    int ty = get_global_id(1);  /* row index into C */

    float value = 0;
    for (int k = 0; k < wA; ++k)
    {
        float elementA = A[ty * wA + k];  /* row ty of A */
        float elementB = B[k * wB + tx];  /* column tx of B */
        value += elementA * elementB;
    }

    /* BUG FIX: the row stride of C is its own width wB, not wA.
     * The original "C[ty * wA + tx]" only works when wA == wB. */
    C[ty * wB + tx] = value;
}
Host code in OpenCL 1.2 
1. allocate and initialize memory on host side 
2. Initialize OpenCL 
3. allocate device memory and move the data 
4. Load and build device code 
5. Launch kernel 
a. append arguments 
6. move the data back from device
int 
main(int argc, char** argv) 
{ 
// set seed for rand() 
srand(2006); 
// 1. allocate host memory for matrices A and B 
unsigned int size_A = WA * HA; 
unsigned int mem_size_A = sizeof(float) * size_A; 
float* h_A = (float*) malloc(mem_size_A); 
unsigned int size_B = WB * HB; 
unsigned int mem_size_B = sizeof(float) * size_B; 
float* h_B = (float*) malloc(mem_size_B); 
// 2. initialize host memory 
randomInit(h_A, size_A); 
randomInit(h_B, size_B); 
// 4. allocate host memory for the result C 
unsigned int size_C = WC * HC; 
unsigned int mem_size_C = sizeof(float) * size_C; 
float* h_C = (float*) malloc(mem_size_C); 
// 5. Initialize OpenCL 
// OpenCL specific variables 
cl_context clGPUContext; 
cl_command_queue clCommandQue; 
cl_program clProgram; 
size_t dataBytes; 
size_t kernelLength; 
cl_int errcode; 
// OpenCL device memory for matrices 
cl_mem d_A; 
cl_mem d_B; 
cl_mem d_C; 
/*****************************************/ 
/* Initialize OpenCL */ 
/*****************************************/ 
clGPUContext = clCreateContextFromType(0, 
CL_DEVICE_TYPE_GPU, 
NULL, NULL, &errcode); 
shrCheckError(errcode, CL_SUCCESS); 
// get the list of GPU devices associated 
// with context 
errcode = clGetContextInfo(clGPUContext, 
CL_CONTEXT_DEVICES, 0, NULL, 
&dataBytes); 
cl_device_id *clDevices = (cl_device_id *) 
malloc(dataBytes); 
errcode |= clGetContextInfo(clGPUContext, 
CL_CONTEXT_DEVICES, dataBytes, 
clDevices, NULL); 
shrCheckError(errcode, CL_SUCCESS); 
//Create a command-queue 
clCommandQue = clCreateCommandQueue(clGPUContext, 
clDevices[0], 0, &errcode); 
shrCheckError(errcode, CL_SUCCESS); 
// Setup device memory 
d_C = clCreateBuffer(clGPUContext, 
CL_MEM_READ_WRITE, 
mem_size_A, NULL, &errcode); 
d_A = clCreateBuffer(clGPUContext, 
CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, 
mem_size_A, h_A, &errcode); 
d_B = clCreateBuffer(clGPUContext, 
CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, 
mem_size_B, h_B, &errcode); 
// 6. Load and build OpenCL kernel 
char *clMatrixMul = oclLoadProgSource("kernel.cl", 
"// My commentn", 
&kernelLength); 
shrCheckError(clMatrixMul != NULL, shrTRUE); 
clProgram = clCreateProgramWithSource(clGPUContext, 
1, (const char **)&clMatrixMul, 
&kernelLength, &errcode); 
shrCheckError(errcode, CL_SUCCESS); 
errcode = clBuildProgram(clProgram, 0, 
NULL, NULL, NULL, NULL); 
shrCheckError(errcode, CL_SUCCESS); 
clKernel = clCreateKernel(clProgram, 
"matrixMul", &errcode); 
shrCheckError(errcode, CL_SUCCESS); 
// 7. Launch OpenCL kernel 
size_t localWorkSize[2], globalWorkSize[2]; 
int wA = WA; 
int wC = WC; 
errcode = clSetKernelArg(clKernel, 0, 
sizeof(cl_mem), (void *)&d_C); 
errcode |= clSetKernelArg(clKernel, 1, 
sizeof(cl_mem), (void *)&d_A); 
errcode |= clSetKernelArg(clKernel, 2, 
sizeof(cl_mem), (void *)&d_B); 
errcode |= clSetKernelArg(clKernel, 3, 
sizeof(int), (void *)&wA); 
errcode |= clSetKernelArg(clKernel, 4, 
sizeof(int), (void *)&wC); 
shrCheckError(errcode, CL_SUCCESS); 
localWorkSize[0] = 16; 
localWorkSize[1] = 16; 
globalWorkSize[0] = 1024; 
globalWorkSize[1] = 1024; 
errcode = clEnqueueNDRangeKernel(clCommandQue, 
clKernel, 2, NULL, globalWorkSize, 
localWorkSize, 0, NULL, NULL); 
shrCheckError(errcode, CL_SUCCESS); 
// 8. Retrieve result from device 
errcode = clEnqueueReadBuffer(clCommandQue, 
d_C, CL_TRUE, 0, mem_size_C, 
h_C, 0, NULL, NULL); 
shrCheckError(errcode, CL_SUCCESS); 
// 10. clean up memory 
free(h_A); 
free(h_B); 
free(h_C); 
clReleaseMemObject(d_A); 
clReleaseMemObject(d_C); 
clReleaseMemObject(d_B); 
free(clDevices); 
free(clMatrixMul); 
clReleaseContext(clGPUContext); 
clReleaseKernel(clKernel); 
clReleaseProgram(clProgram); 
clReleaseCommandQueue(clCommandQue); 
}
Nearly 200 lines of code
What is C++ AMP 
● C++ Accelerated Massive Parallelism 
○ Designed for data level parallelism 
○ Extension of C++11 proposed by M$ 
○ An open specification with multiple implementations 
aiming at standardization 
■ MS Visual Studio 2013 
■ MCW CLAMP 
● GPU data modeled as C++14-like containers for 
multidimensional arrays 
● GPU kernels modeled as C++11 lambda
Comparisons 
C++AMP Thrust Bolt OpenACC SYCL 
Intro 
simple, elegant, 
performance(?), 
proposed by M$ 
library 
proposed 
by CUDA 
library 
proposed 
by AMD 
Annotation 
and 
pragmas 
proposed 
by SGI 
wrapper for 
OpenCL 
proposed 
by 
Codeplay
Matrix Multiplication in C++AMP 
// Multiplies the (ha x hb) matrix A by the (hb x hc) matrix B, accumulating
// into the (ha x hc) product matrix. One GPU thread per output element.
// NOTE(review): accumulates with += — assumes productMatrix is
// zero-initialized by the caller; confirm at call sites.
void MultiplyWithAMP(int* aMatrix, int* bMatrix, int
*productMatrix,
int ha, int hb, int hc) {
  // array_view wraps the raw host arrays as GPU-visible 2-D containers;
  // data movement to/from the accelerator is implicit.
  array_view<int, 2> a(ha, hb, aMatrix);
  array_view<int, 2> b(hb, hc, bMatrix);
  array_view<int, 2> product(ha, hc, productMatrix);

  parallel_for_each(
    product.extent,
    [=](index<2> idx) restrict(amp) {
      int row = idx[0];
      int col = idx[1];
      // BUG FIX: iterate over the shared dimension hb; the original
      // hard-coded "inner < 2", which is only correct when hb == 2.
      for (int inner = 0; inner < hb; inner++) {
        product[idx] += a(row, inner) * b(inner, col);
      }
    }
  );
  // Copy the result back to productMatrix on the host.
  product.synchronize();
}
clGPUContext = clCreateContextFromType(0, 
CL_DEVICE_TYPE_GPU, 
NULL, NULL, &errcode); 
shrCheckError(errcode, CL_SUCCESS); 
// get the list of GPU devices associated 
// with context 
errcode = clGetContextInfo(clGPUContext, 
__kernel void 
matrixMul(__global float* C, __global float* 
A, 
CL_CONTEXT_DEVICES, 0, NULL, 
&dataBytes); 
__global float* B, int wA, int wB) 
{ 
int tx = get_global_id(0); 
int ty = get_global_id(1); 
float value = 0; 
for (int k = 0; k < wA; ++k) 
{ 
cl_device_id *clDevices = (cl_device_id *) 
malloc(dataBytes); 
errcode |= clGetContextInfo(clGPUContext, 
CL_CONTEXT_DEVICES, dataBytes, 
clDevices, NULL); 
shrCheckError(errcode, CL_SUCCESS); 
//Create a command-queue 
clCommandQue = clCreateCommandQueue 
(clGPUContext, 
float elementA = A[ty * wA + k]; 
float elementB = B[k * wB + tx]; 
value += elementA * elementB; 
} 
C[ty * wA + tx] = value; 
} 
clDevices[0], 0, &errcode); 
shrCheckError(errcode, CL_SUCCESS);
Only 20 lines of code 
but performance?
C++AMP programming model 
// Fixed-size example: multiplies a 3x2 matrix by a 2x3 matrix into a 3x3
// product. The inner-loop bound 2 is the shared dimension of this example.
// NOTE(review): accumulates with += — assumes productMatrix is zero-initialized.
void MultiplyWithAMP(int* aMatrix, int* bMatrix, int *productMatrix) { 
// GPU data modeled as containers: array_view wraps the raw host arrays.
array_view<int, 2> a(3, 2, aMatrix); 
array_view<int, 2> b(2, 3, bMatrix); 
array_view<int, 2> product(3, 3, productMatrix); 
parallel_for_each( 
product.extent, 
[=](index<2> idx) restrict(amp) { 
int row = idx[0]; 
int col = idx[1]; 
for (int inner = 0; inner < 2; inner++) { 
product[idx] += a(row, inner) * b(inner, col); 
} 
} 
); 
// Copy the result back to the host array.
product.synchronize(); 
} 
GPU data modeled 
as data container
C++AMP programming model 
// Same 3x2 * 2x3 example, highlighting the execution interface:
// parallel_for_each marks an implicitly parallel region for GPU execution.
void MultiplyWithAMP(int* aMatrix, int* bMatrix, int *productMatrix) { 
array_view<int, 2> a(3, 2, aMatrix); 
array_view<int, 2> b(2, 3, bMatrix); 
array_view<int, 2> product(3, 3, productMatrix); 
// One GPU thread per index in product.extent (3x3 here).
parallel_for_each( 
product.extent, 
[=](index<2> idx) restrict(amp) { 
int row = idx[0]; 
int col = idx[1]; 
for (int inner = 0; inner < 2; inner++) { 
product[idx] += a(row, inner) * b(inner, col); 
} 
} 
); 
product.synchronize(); 
} 
Execution interface; 
marking an implicitly 
parallel region for GPU 
execution
C++AMP programming model 
// Same 3x2 * 2x3 example, highlighting the kernel: it is a C++11 lambda,
// and its arguments reach the GPU as implicitly captured variables.
void MultiplyWithAMP(int* aMatrix, int* bMatrix, int *productMatrix) { 
array_view<int, 2> a(3, 2, aMatrix); 
array_view<int, 2> b(2, 3, bMatrix); 
array_view<int, 2> product(3, 3, productMatrix); 
parallel_for_each( 
product.extent, 
// [=] capture by value: a, b and product become the kernel's arguments.
[=](index<2> idx) restrict(amp) { 
int row = idx[0]; 
int col = idx[1]; 
for (int inner = 0; inner < 2; inner++) { 
product[idx] += a(row, inner) * b(inner, col); 
} 
} 
); 
product.synchronize(); 
} 
Kernels modeled as 
lambdas; arguments 
are implicitly modeled 
as captured variables
MCW C++AMP (CLAMP) 
● Clang/LLVM-based 
○ translate C++AMP code to OpenCL C code and 
generate OpenCL SPIR file 
○ With some template library 
● Runtime support: gmac/OpenCL/HSA Okra 
● An Open Source project 
○ One of only two C++ AMP implementations recognized 
by the HSA foundation (the other is MSVC) 
○ Microsoft and HSA foundation supported
MCW C++ AMP Compiler 
● Device Path 
○ generate OpenCL C code by 
CBackend 
○ emit kernel function 
● Host Path 
○ preparation to launch the 
code 
C++ AMP 
source 
code 
Clang/LLVM 3.3 
Device 
Code Host Code
Execution process 
C++ AMP 
source 
code 
Clang 
/LLV 
M 3.3 
Device 
Code 
C++ AMP 
source 
code 
Clang 
/LLV 
M 3.3 
Host Code 
gmac 
OpenCL 
Our work
gmac 
● unified virtual address 
space in software 
● Can have high 
overhead sometimes 
● In HSA (AMD Kaveri), 
GMAC is no longer 
needed
Compiling C++AMP to OpenCL 
● C++AMP → LLVM IR → subset of C 
● arguments passing (lambda capture vs 
function calls) 
● explicit V.S. implicit memory transfer 
● The heavy lifting is done by the compiler and 
runtime
lambda capture 
// Hand-written functor equivalent to the lambda "[=](int x) { return a + x; }":
// the captured variable becomes a member, the capture becomes the constructor,
// and the lambda body becomes operator().
struct add { 
int a;                 // captured-by-value state 
add(int a) : a(a) {} 
int operator()(int x) const { 
return a + x; 
} 
}; 
// Demonstrates the lambda that the struct above desugars to:
// "a" is captured by value, so the closure carries a copy of it.
int main(void) 
{ 
int a = 3; 
auto fn = [=] (int x) { return a + x; }; 
int c = fn(3);         // c == 6 
return 0; 
} 
Those arguments should be put 
on the argument lists of OpenCL 
kernel.
What do we need to do? 
● Kernel function 
○ emit the kernel function with required arguments 
● In Host side 
○ a function that recursively traverses the captured object and 
appends its members to the OpenCL argument list. 
● In Device side 
○ reconstruct it in the device code for future use.
Example 
// Example closure with a nested/inherited capture: flattened, the capture
// consists of three ints (A::a, B::b, C::c) that must each become a kernel
// argument. (Slide pseudocode: the statements below would sit inside a function.)
struct A { int a; }; 
struct B : A { int b; }; 
struct C { B b; int c; }; 
struct C c; 
c.c = 100; 
auto fn = [=] () { int qq = c.c; };
Kernel code 
__kernel void(int a, int b, int c) 
{ 
C c(a, b, c); 
... 
}
Deserialization constructor 
// Deserialization constructor: rebuilds the closure on the device from the
// flattened kernel arguments (a and b initialize the nested B, c the int).
struct C
{
    B b;    // declared first, so it is initialized first
    int c;
    // BUG FIX: members are initialized in declaration order regardless of
    // the mem-initializer list; the original listed c before b, which
    // contradicts the actual order (and triggers -Wreorder). List b first.
    C (int a, int b, int c) : b(a, b), c(c) {}
};
Serialization constructor 
// Serialization: walks the object recursively, appending each flattened
// member to the OpenCL kernel-argument list in declaration order (so it
// mirrors the deserialization constructor on the device side).
struct C 
{ 
B b; 
int c; 
// NOTE(review): s is taken by value — presumably Concurrency::Serialize is a
// lightweight handle onto shared argument state; confirm, else it should be
// passed by reference for Append to take effect.
void __cxxamp_serialize(Concurrency::Serialize s) { 
b.__cxxamp_serialize(s);    // serialize the nested base/member first 
s.Append(sizeof(int), &c);  // then append this object's own int 
} 
}; 
Translation 
parallel_for_each(product.extent, 
[=](index<2> idx) restrict(amp) { 
int row = idx[0]; 
int col = idx[1]; 
for (int inner = 0; inner < 2; inner++) { 
product[idx] += a(row, inner) * b(inner, col); 
} 
} 
); 
/* The OpenCL kernel the C++AMP lambda above translates to: one work-item
 * per element of C = A * B. A has width wA (inner dimension), B and C
 * have width wB. */
__kernel void
matrixMul(__global float* C, __global float* A,
          __global float* B, int wA, int wB)
{
    int tx = get_global_id(0);  /* column in C */
    int ty = get_global_id(1);  /* row in C */

    float value = 0;
    for (int k = 0; k < wA; ++k)
    {
        float elementA = A[ty * wA + k];
        float elementB = B[k * wB + tx];
        value += elementA * elementB;
    }

    /* BUG FIX: C's row stride is its own width wB, not wA; the original
     * "C[ty * wA + tx]" is only correct when wA == wB. */
    C[ty * wB + tx] = value;
}
● Append the arguments 
● Set the index 
● emit kernel function 
● implicit memory management
Future work 
● Future work for us 
○ restrict(auto) 
○ HSA related work
Future works for you 
● Try this out!! 
● Many of us get spoiled and don’t want to go 
back to write OpenCL directly anymore :-) 
● related links 
○ Driver 
○ Clang 
○ sandbox
C++ amp on linux

More Related Content

What's hot

TVM VTA (TSIM)
TVM VTA (TSIM) TVM VTA (TSIM)
TVM VTA (TSIM)
Mr. Vengineer
 
Productive OpenCL Programming An Introduction to OpenCL Libraries with Array...
Productive OpenCL Programming An Introduction to OpenCL Libraries  with Array...Productive OpenCL Programming An Introduction to OpenCL Libraries  with Array...
Productive OpenCL Programming An Introduction to OpenCL Libraries with Array...
AMD Developer Central
 
How to make a large C++-code base manageable
How to make a large C++-code base manageableHow to make a large C++-code base manageable
How to make a large C++-code base manageable
corehard_by
 
Евгений Крутько, Многопоточные вычисления, современный подход.
Евгений Крутько, Многопоточные вычисления, современный подход.Евгений Крутько, Многопоточные вычисления, современный подход.
Евгений Крутько, Многопоточные вычисления, современный подход.
Platonov Sergey
 
Joel Falcou, Boost.SIMD
Joel Falcou, Boost.SIMDJoel Falcou, Boost.SIMD
Joel Falcou, Boost.SIMD
Sergey Platonov
 
Facebook Glow Compiler のソースコードをグダグダ語る会
Facebook Glow Compiler のソースコードをグダグダ語る会Facebook Glow Compiler のソースコードをグダグダ語る会
Facebook Glow Compiler のソースコードをグダグダ語る会
Mr. Vengineer
 
C++17 now
C++17 nowC++17 now
C++17 now
corehard_by
 
Evgeniy Muralev, Mark Vince, Working with the compiler, not against it
Evgeniy Muralev, Mark Vince, Working with the compiler, not against itEvgeniy Muralev, Mark Vince, Working with the compiler, not against it
Evgeniy Muralev, Mark Vince, Working with the compiler, not against it
Sergey Platonov
 
개발 과정 최적화 하기 내부툴로 더욱 강력한 개발하기 Stephen kennedy _(11시40분_103호)
개발 과정 최적화 하기 내부툴로 더욱 강력한 개발하기 Stephen kennedy _(11시40분_103호)개발 과정 최적화 하기 내부툴로 더욱 강력한 개발하기 Stephen kennedy _(11시40분_103호)
개발 과정 최적화 하기 내부툴로 더욱 강력한 개발하기 Stephen kennedy _(11시40분_103호)
changehee lee
 
Multithreading done right
Multithreading done rightMultithreading done right
Multithreading done right
Platonov Sergey
 
Threaded Programming
Threaded ProgrammingThreaded Programming
Threaded Programming
Sri Prasanna
 
Introduction to gdb
Introduction to gdbIntroduction to gdb
Introduction to gdb
Owen Hsu
 
ExperiencesSharingOnEmbeddedSystemDevelopment_20160321
ExperiencesSharingOnEmbeddedSystemDevelopment_20160321ExperiencesSharingOnEmbeddedSystemDevelopment_20160321
ExperiencesSharingOnEmbeddedSystemDevelopment_20160321
Teddy Hsiung
 
Работа с реляционными базами данных в C++
Работа с реляционными базами данных в C++Работа с реляционными базами данных в C++
Работа с реляционными базами данных в C++
corehard_by
 
Story of static code analyzer development
Story of static code analyzer developmentStory of static code analyzer development
Story of static code analyzer development
Andrey Karpov
 
Node.js System: The Landing
Node.js System: The LandingNode.js System: The Landing
Node.js System: The Landing
Haci Murat Yaman
 
【論文紹介】Relay: A New IR for Machine Learning Frameworks
【論文紹介】Relay: A New IR for Machine Learning Frameworks【論文紹介】Relay: A New IR for Machine Learning Frameworks
【論文紹介】Relay: A New IR for Machine Learning Frameworks
Takeo Imai
 
Interpreter, Compiler, JIT from scratch
Interpreter, Compiler, JIT from scratchInterpreter, Compiler, JIT from scratch
Interpreter, Compiler, JIT from scratch
National Cheng Kung University
 
Google Edge TPUで TensorFlow Liteを使った時に 何をやっているのかを妄想してみる 2 「エッジAIモダン計測制御の世界」オ...
Google Edge TPUで TensorFlow Liteを使った時に 何をやっているのかを妄想してみる 2  「エッジAIモダン計測制御の世界」オ...Google Edge TPUで TensorFlow Liteを使った時に 何をやっているのかを妄想してみる 2  「エッジAIモダン計測制御の世界」オ...
Google Edge TPUで TensorFlow Liteを使った時に 何をやっているのかを妄想してみる 2 「エッジAIモダン計測制御の世界」オ...
Mr. Vengineer
 
Open CL For Haifa Linux Club
Open CL For Haifa Linux ClubOpen CL For Haifa Linux Club
Open CL For Haifa Linux Club
Ofer Rosenberg
 

What's hot (20)

TVM VTA (TSIM)
TVM VTA (TSIM) TVM VTA (TSIM)
TVM VTA (TSIM)
 
Productive OpenCL Programming An Introduction to OpenCL Libraries with Array...
Productive OpenCL Programming An Introduction to OpenCL Libraries  with Array...Productive OpenCL Programming An Introduction to OpenCL Libraries  with Array...
Productive OpenCL Programming An Introduction to OpenCL Libraries with Array...
 
How to make a large C++-code base manageable
How to make a large C++-code base manageableHow to make a large C++-code base manageable
How to make a large C++-code base manageable
 
Евгений Крутько, Многопоточные вычисления, современный подход.
Евгений Крутько, Многопоточные вычисления, современный подход.Евгений Крутько, Многопоточные вычисления, современный подход.
Евгений Крутько, Многопоточные вычисления, современный подход.
 
Joel Falcou, Boost.SIMD
Joel Falcou, Boost.SIMDJoel Falcou, Boost.SIMD
Joel Falcou, Boost.SIMD
 
Facebook Glow Compiler のソースコードをグダグダ語る会
Facebook Glow Compiler のソースコードをグダグダ語る会Facebook Glow Compiler のソースコードをグダグダ語る会
Facebook Glow Compiler のソースコードをグダグダ語る会
 
C++17 now
C++17 nowC++17 now
C++17 now
 
Evgeniy Muralev, Mark Vince, Working with the compiler, not against it
Evgeniy Muralev, Mark Vince, Working with the compiler, not against itEvgeniy Muralev, Mark Vince, Working with the compiler, not against it
Evgeniy Muralev, Mark Vince, Working with the compiler, not against it
 
개발 과정 최적화 하기 내부툴로 더욱 강력한 개발하기 Stephen kennedy _(11시40분_103호)
개발 과정 최적화 하기 내부툴로 더욱 강력한 개발하기 Stephen kennedy _(11시40분_103호)개발 과정 최적화 하기 내부툴로 더욱 강력한 개발하기 Stephen kennedy _(11시40분_103호)
개발 과정 최적화 하기 내부툴로 더욱 강력한 개발하기 Stephen kennedy _(11시40분_103호)
 
Multithreading done right
Multithreading done rightMultithreading done right
Multithreading done right
 
Threaded Programming
Threaded ProgrammingThreaded Programming
Threaded Programming
 
Introduction to gdb
Introduction to gdbIntroduction to gdb
Introduction to gdb
 
ExperiencesSharingOnEmbeddedSystemDevelopment_20160321
ExperiencesSharingOnEmbeddedSystemDevelopment_20160321ExperiencesSharingOnEmbeddedSystemDevelopment_20160321
ExperiencesSharingOnEmbeddedSystemDevelopment_20160321
 
Работа с реляционными базами данных в C++
Работа с реляционными базами данных в C++Работа с реляционными базами данных в C++
Работа с реляционными базами данных в C++
 
Story of static code analyzer development
Story of static code analyzer developmentStory of static code analyzer development
Story of static code analyzer development
 
Node.js System: The Landing
Node.js System: The LandingNode.js System: The Landing
Node.js System: The Landing
 
【論文紹介】Relay: A New IR for Machine Learning Frameworks
【論文紹介】Relay: A New IR for Machine Learning Frameworks【論文紹介】Relay: A New IR for Machine Learning Frameworks
【論文紹介】Relay: A New IR for Machine Learning Frameworks
 
Interpreter, Compiler, JIT from scratch
Interpreter, Compiler, JIT from scratchInterpreter, Compiler, JIT from scratch
Interpreter, Compiler, JIT from scratch
 
Google Edge TPUで TensorFlow Liteを使った時に 何をやっているのかを妄想してみる 2 「エッジAIモダン計測制御の世界」オ...
Google Edge TPUで TensorFlow Liteを使った時に 何をやっているのかを妄想してみる 2  「エッジAIモダン計測制御の世界」オ...Google Edge TPUで TensorFlow Liteを使った時に 何をやっているのかを妄想してみる 2  「エッジAIモダン計測制御の世界」オ...
Google Edge TPUで TensorFlow Liteを使った時に 何をやっているのかを妄想してみる 2 「エッジAIモダン計測制御の世界」オ...
 
Open CL For Haifa Linux Club
Open CL For Haifa Linux ClubOpen CL For Haifa Linux Club
Open CL For Haifa Linux Club
 

Viewers also liked

C++ AMPを使ってみよう
C++ AMPを使ってみようC++ AMPを使ってみよう
C++ AMPを使ってみよう
Osamu Masutani
 
A Sensing Coverage Analysis of a Route Control Method for Vehicular Crowd Sen...
A Sensing Coverage Analysis of a Route Control Method for Vehicular Crowd Sen...A Sensing Coverage Analysis of a Route Control Method for Vehicular Crowd Sen...
A Sensing Coverage Analysis of a Route Control Method for Vehicular Crowd Sen...
Osamu Masutani
 
Taxi Fare Deep Dive
Taxi Fare Deep DiveTaxi Fare Deep Dive
Taxi Fare Deep Dive
Osamu Masutani
 
Traffic simulation based on space syntax
Traffic simulation based on space syntaxTraffic simulation based on space syntax
Traffic simulation based on space syntax
Osamu Masutani
 
Power BI チュートリアル 導入・初級編
Power BI チュートリアル 導入・初級編Power BI チュートリアル 導入・初級編
Power BI チュートリアル 導入・初級編
Osamu Masutani
 
Linux practicals T.Y.B.ScIT
Linux practicals T.Y.B.ScITLinux practicals T.Y.B.ScIT
Linux practicals T.Y.B.ScIT
vignesh0009
 
Linux System Administration Crash Course
Linux System Administration Crash CourseLinux System Administration Crash Course
Linux System Administration Crash Course
Jason Cannon
 
Matlab distributed computing serverの使い方
Matlab distributed computing serverの使い方Matlab distributed computing serverの使い方
Matlab distributed computing serverの使い方
Osamu Masutani
 
Linux Administration
Linux AdministrationLinux Administration
Linux Administration
Harish1983
 

Viewers also liked (9)

C++ AMPを使ってみよう
C++ AMPを使ってみようC++ AMPを使ってみよう
C++ AMPを使ってみよう
 
A Sensing Coverage Analysis of a Route Control Method for Vehicular Crowd Sen...
A Sensing Coverage Analysis of a Route Control Method for Vehicular Crowd Sen...A Sensing Coverage Analysis of a Route Control Method for Vehicular Crowd Sen...
A Sensing Coverage Analysis of a Route Control Method for Vehicular Crowd Sen...
 
Taxi Fare Deep Dive
Taxi Fare Deep DiveTaxi Fare Deep Dive
Taxi Fare Deep Dive
 
Traffic simulation based on space syntax
Traffic simulation based on space syntaxTraffic simulation based on space syntax
Traffic simulation based on space syntax
 
Power BI チュートリアル 導入・初級編
Power BI チュートリアル 導入・初級編Power BI チュートリアル 導入・初級編
Power BI チュートリアル 導入・初級編
 
Linux practicals T.Y.B.ScIT
Linux practicals T.Y.B.ScITLinux practicals T.Y.B.ScIT
Linux practicals T.Y.B.ScIT
 
Linux System Administration Crash Course
Linux System Administration Crash CourseLinux System Administration Crash Course
Linux System Administration Crash Course
 
Matlab distributed computing serverの使い方
Matlab distributed computing serverの使い方Matlab distributed computing serverの使い方
Matlab distributed computing serverの使い方
 
Linux Administration
Linux AdministrationLinux Administration
Linux Administration
 

Similar to C++ amp on linux

3 Open-Source-SYCL-Intel-Khronos-EVS-Workshop_May19.pdf
3 Open-Source-SYCL-Intel-Khronos-EVS-Workshop_May19.pdf3 Open-Source-SYCL-Intel-Khronos-EVS-Workshop_May19.pdf
3 Open-Source-SYCL-Intel-Khronos-EVS-Workshop_May19.pdf
JunZhao68
 
Exploiting GPU's for Columnar DataFrrames by Kiran Lonikar
Exploiting GPU's for Columnar DataFrrames by Kiran LonikarExploiting GPU's for Columnar DataFrrames by Kiran Lonikar
Exploiting GPU's for Columnar DataFrrames by Kiran Lonikar
Spark Summit
 
Introduction to cuda geek camp singapore 2011
Introduction to cuda   geek camp singapore 2011Introduction to cuda   geek camp singapore 2011
Introduction to cuda geek camp singapore 2011
Raymond Tay
 
2011.02.18 marco parenzan - modelli di programmazione per le gpu
2011.02.18   marco parenzan - modelli di programmazione per le gpu2011.02.18   marco parenzan - modelli di programmazione per le gpu
2011.02.18 marco parenzan - modelli di programmazione per le gpu
Marco Parenzan
 
Runtime Code Generation and Data Management for Heterogeneous Computing in Java
Runtime Code Generation and Data Management for Heterogeneous Computing in JavaRuntime Code Generation and Data Management for Heterogeneous Computing in Java
Runtime Code Generation and Data Management for Heterogeneous Computing in Java
Juan Fumero
 
C++20 the small things - Timur Doumler
C++20 the small things - Timur DoumlerC++20 the small things - Timur Doumler
C++20 the small things - Timur Doumler
corehard_by
 
Introduction to CUDA
Introduction to CUDAIntroduction to CUDA
Introduction to CUDA
Raymond Tay
 
A New Chapter of Data Processing with CDK
A New Chapter of Data Processing with CDKA New Chapter of Data Processing with CDK
A New Chapter of Data Processing with CDK
Shu-Jeng Hsieh
 
Two C++ Tools: Compiler Explorer and Cpp Insights
Two C++ Tools: Compiler Explorer and Cpp InsightsTwo C++ Tools: Compiler Explorer and Cpp Insights
Two C++ Tools: Compiler Explorer and Cpp Insights
Alison Chaiken
 
Gpu workshop cluster universe: scripting cuda
Gpu workshop cluster universe: scripting cudaGpu workshop cluster universe: scripting cuda
Gpu workshop cluster universe: scripting cuda
Ferdinand Jamitzky
 
Tips and tricks for building high performance android apps using native code
Tips and tricks for building high performance android apps using native codeTips and tricks for building high performance android apps using native code
Tips and tricks for building high performance android apps using native code
Kenneth Geisshirt
 
20145-5SumII_CSC407_assign1.htmlCSC 407 Computer Systems II.docx
20145-5SumII_CSC407_assign1.htmlCSC 407 Computer Systems II.docx20145-5SumII_CSC407_assign1.htmlCSC 407 Computer Systems II.docx
20145-5SumII_CSC407_assign1.htmlCSC 407 Computer Systems II.docx
eugeniadean34240
 
JVM code reading -- C2
JVM code reading -- C2JVM code reading -- C2
JVM code reading -- C2
ytoshima
 
Analysis of Haiku Operating System (BeOS Family) by PVS-Studio. Part 2
Analysis of Haiku Operating System (BeOS Family) by PVS-Studio. Part 2Analysis of Haiku Operating System (BeOS Family) by PVS-Studio. Part 2
Analysis of Haiku Operating System (BeOS Family) by PVS-Studio. Part 2
PVS-Studio
 
OpenCL Heterogeneous Parallel Computing
OpenCL Heterogeneous Parallel ComputingOpenCL Heterogeneous Parallel Computing
OpenCL Heterogeneous Parallel Computing
João Paulo Leonidas Fernandes Dias da Silva
 
100 bugs in Open Source C/C++ projects
100 bugs in Open Source C/C++ projects 100 bugs in Open Source C/C++ projects
100 bugs in Open Source C/C++ projects
Andrey Karpov
 
Lrz kurs: gpu and mic programming with r
Lrz kurs: gpu and mic programming with rLrz kurs: gpu and mic programming with r
Lrz kurs: gpu and mic programming with r
Ferdinand Jamitzky
 
C++ AMP 실천 및 적용 전략
C++ AMP 실천 및 적용 전략 C++ AMP 실천 및 적용 전략
C++ AMP 실천 및 적용 전략
명신 김
 
JVM Mechanics: When Does the JVM JIT & Deoptimize?
JVM Mechanics: When Does the JVM JIT & Deoptimize?JVM Mechanics: When Does the JVM JIT & Deoptimize?
JVM Mechanics: When Does the JVM JIT & Deoptimize?
Doug Hawkins
 
Config interface
Config interfaceConfig interface
Config interface
Ryan Boland
 

Similar to C++ amp on linux (20)

3 Open-Source-SYCL-Intel-Khronos-EVS-Workshop_May19.pdf
3 Open-Source-SYCL-Intel-Khronos-EVS-Workshop_May19.pdf3 Open-Source-SYCL-Intel-Khronos-EVS-Workshop_May19.pdf
3 Open-Source-SYCL-Intel-Khronos-EVS-Workshop_May19.pdf
 
Exploiting GPU's for Columnar DataFrrames by Kiran Lonikar
Exploiting GPU's for Columnar DataFrrames by Kiran LonikarExploiting GPU's for Columnar DataFrrames by Kiran Lonikar
Exploiting GPU's for Columnar DataFrrames by Kiran Lonikar
 
Introduction to cuda geek camp singapore 2011
Introduction to cuda   geek camp singapore 2011Introduction to cuda   geek camp singapore 2011
Introduction to cuda geek camp singapore 2011
 
2011.02.18 marco parenzan - modelli di programmazione per le gpu
2011.02.18   marco parenzan - modelli di programmazione per le gpu2011.02.18   marco parenzan - modelli di programmazione per le gpu
2011.02.18 marco parenzan - modelli di programmazione per le gpu
 
Runtime Code Generation and Data Management for Heterogeneous Computing in Java
Runtime Code Generation and Data Management for Heterogeneous Computing in JavaRuntime Code Generation and Data Management for Heterogeneous Computing in Java
Runtime Code Generation and Data Management for Heterogeneous Computing in Java
 
C++20 the small things - Timur Doumler
C++20 the small things - Timur DoumlerC++20 the small things - Timur Doumler
C++20 the small things - Timur Doumler
 
Introduction to CUDA
Introduction to CUDAIntroduction to CUDA
Introduction to CUDA
 
A New Chapter of Data Processing with CDK
A New Chapter of Data Processing with CDKA New Chapter of Data Processing with CDK
A New Chapter of Data Processing with CDK
 
Two C++ Tools: Compiler Explorer and Cpp Insights
Two C++ Tools: Compiler Explorer and Cpp InsightsTwo C++ Tools: Compiler Explorer and Cpp Insights
Two C++ Tools: Compiler Explorer and Cpp Insights
 
Gpu workshop cluster universe: scripting cuda
Gpu workshop cluster universe: scripting cudaGpu workshop cluster universe: scripting cuda
Gpu workshop cluster universe: scripting cuda
 
Tips and tricks for building high performance android apps using native code
Tips and tricks for building high performance android apps using native codeTips and tricks for building high performance android apps using native code
Tips and tricks for building high performance android apps using native code
 
20145-5SumII_CSC407_assign1.htmlCSC 407 Computer Systems II.docx
20145-5SumII_CSC407_assign1.htmlCSC 407 Computer Systems II.docx20145-5SumII_CSC407_assign1.htmlCSC 407 Computer Systems II.docx
20145-5SumII_CSC407_assign1.htmlCSC 407 Computer Systems II.docx
 
JVM code reading -- C2
JVM code reading -- C2JVM code reading -- C2
JVM code reading -- C2
 
Analysis of Haiku Operating System (BeOS Family) by PVS-Studio. Part 2
Analysis of Haiku Operating System (BeOS Family) by PVS-Studio. Part 2Analysis of Haiku Operating System (BeOS Family) by PVS-Studio. Part 2
Analysis of Haiku Operating System (BeOS Family) by PVS-Studio. Part 2
 
OpenCL Heterogeneous Parallel Computing
OpenCL Heterogeneous Parallel ComputingOpenCL Heterogeneous Parallel Computing
OpenCL Heterogeneous Parallel Computing
 
100 bugs in Open Source C/C++ projects
100 bugs in Open Source C/C++ projects 100 bugs in Open Source C/C++ projects
100 bugs in Open Source C/C++ projects
 
Lrz kurs: gpu and mic programming with r
Lrz kurs: gpu and mic programming with rLrz kurs: gpu and mic programming with r
Lrz kurs: gpu and mic programming with r
 
C++ AMP 실천 및 적용 전략
C++ AMP 실천 및 적용 전략 C++ AMP 실천 및 적용 전략
C++ AMP 실천 및 적용 전략
 
JVM Mechanics: When Does the JVM JIT & Deoptimize?
JVM Mechanics: When Does the JVM JIT & Deoptimize?JVM Mechanics: When Does the JVM JIT & Deoptimize?
JVM Mechanics: When Does the JVM JIT & Deoptimize?
 
Config interface
Config interfaceConfig interface
Config interface
 

Recently uploaded

Why Choose Odoo 17 Community & How it differs from Odoo 17 Enterprise Edition
Why Choose Odoo 17 Community & How it differs from Odoo 17 Enterprise EditionWhy Choose Odoo 17 Community & How it differs from Odoo 17 Enterprise Edition
Why Choose Odoo 17 Community & How it differs from Odoo 17 Enterprise Edition
Envertis Software Solutions
 
Introducing Crescat - Event Management Software for Venues, Festivals and Eve...
Introducing Crescat - Event Management Software for Venues, Festivals and Eve...Introducing Crescat - Event Management Software for Venues, Festivals and Eve...
Introducing Crescat - Event Management Software for Venues, Festivals and Eve...
Crescat
 
Top Features to Include in Your Winzo Clone App for Business Growth (4).pptx
Top Features to Include in Your Winzo Clone App for Business Growth (4).pptxTop Features to Include in Your Winzo Clone App for Business Growth (4).pptx
Top Features to Include in Your Winzo Clone App for Business Growth (4).pptx
rickgrimesss22
 
DDS-Security 1.2 - What's New? Stronger security for long-running systems
DDS-Security 1.2 - What's New? Stronger security for long-running systemsDDS-Security 1.2 - What's New? Stronger security for long-running systems
DDS-Security 1.2 - What's New? Stronger security for long-running systems
Gerardo Pardo-Castellote
 
KuberTENes Birthday Bash Guadalajara - Introducción a Argo CD
KuberTENes Birthday Bash Guadalajara - Introducción a Argo CDKuberTENes Birthday Bash Guadalajara - Introducción a Argo CD
KuberTENes Birthday Bash Guadalajara - Introducción a Argo CD
rodomar2
 
openEuler Case Study - The Journey to Supply Chain Security
openEuler Case Study - The Journey to Supply Chain SecurityopenEuler Case Study - The Journey to Supply Chain Security
openEuler Case Study - The Journey to Supply Chain Security
Shane Coughlan
 
Fundamentals of Programming and Language Processors
Fundamentals of Programming and Language ProcessorsFundamentals of Programming and Language Processors
Fundamentals of Programming and Language Processors
Rakesh Kumar R
 
GreenCode-A-VSCode-Plugin--Dario-Jurisic
GreenCode-A-VSCode-Plugin--Dario-JurisicGreenCode-A-VSCode-Plugin--Dario-Jurisic
GreenCode-A-VSCode-Plugin--Dario-Jurisic
Green Software Development
 
APIs for Browser Automation (MoT Meetup 2024)
APIs for Browser Automation (MoT Meetup 2024)APIs for Browser Automation (MoT Meetup 2024)
APIs for Browser Automation (MoT Meetup 2024)
Boni García
 
E-commerce Development Services- Hornet Dynamics
E-commerce Development Services- Hornet DynamicsE-commerce Development Services- Hornet Dynamics
E-commerce Development Services- Hornet Dynamics
Hornet Dynamics
 
AI Fusion Buddy Review: Brand New, Groundbreaking Gemini-Powered AI App
AI Fusion Buddy Review: Brand New, Groundbreaking Gemini-Powered AI AppAI Fusion Buddy Review: Brand New, Groundbreaking Gemini-Powered AI App
AI Fusion Buddy Review: Brand New, Groundbreaking Gemini-Powered AI App
Google
 
Graspan: A Big Data System for Big Code Analysis
Graspan: A Big Data System for Big Code AnalysisGraspan: A Big Data System for Big Code Analysis
Graspan: A Big Data System for Big Code Analysis
Aftab Hussain
 
2024 eCommerceDays Toulouse - Sylius 2.0.pdf
2024 eCommerceDays Toulouse - Sylius 2.0.pdf2024 eCommerceDays Toulouse - Sylius 2.0.pdf
2024 eCommerceDays Toulouse - Sylius 2.0.pdf
Łukasz Chruściel
 
GraphSummit Paris - The art of the possible with Graph Technology
GraphSummit Paris - The art of the possible with Graph TechnologyGraphSummit Paris - The art of the possible with Graph Technology
GraphSummit Paris - The art of the possible with Graph Technology
Neo4j
 
Enterprise Resource Planning System in Telangana
Enterprise Resource Planning System in TelanganaEnterprise Resource Planning System in Telangana
Enterprise Resource Planning System in Telangana
NYGGS Automation Suite
 
Artificia Intellicence and XPath Extension Functions
Artificia Intellicence and XPath Extension FunctionsArtificia Intellicence and XPath Extension Functions
Artificia Intellicence and XPath Extension Functions
Octavian Nadolu
 
OpenMetadata Community Meeting - 5th June 2024
OpenMetadata Community Meeting - 5th June 2024OpenMetadata Community Meeting - 5th June 2024
OpenMetadata Community Meeting - 5th June 2024
OpenMetadata
 
A Study of Variable-Role-based Feature Enrichment in Neural Models of Code
A Study of Variable-Role-based Feature Enrichment in Neural Models of CodeA Study of Variable-Role-based Feature Enrichment in Neural Models of Code
A Study of Variable-Role-based Feature Enrichment in Neural Models of Code
Aftab Hussain
 
Vitthal Shirke Java Microservices Resume.pdf
Vitthal Shirke Java Microservices Resume.pdfVitthal Shirke Java Microservices Resume.pdf
Vitthal Shirke Java Microservices Resume.pdf
Vitthal Shirke
 
How to write a program in any programming language
How to write a program in any programming languageHow to write a program in any programming language
How to write a program in any programming language
Rakesh Kumar R
 

Recently uploaded (20)

Why Choose Odoo 17 Community & How it differs from Odoo 17 Enterprise Edition
Why Choose Odoo 17 Community & How it differs from Odoo 17 Enterprise EditionWhy Choose Odoo 17 Community & How it differs from Odoo 17 Enterprise Edition
Why Choose Odoo 17 Community & How it differs from Odoo 17 Enterprise Edition
 
Introducing Crescat - Event Management Software for Venues, Festivals and Eve...
Introducing Crescat - Event Management Software for Venues, Festivals and Eve...Introducing Crescat - Event Management Software for Venues, Festivals and Eve...
Introducing Crescat - Event Management Software for Venues, Festivals and Eve...
 
Top Features to Include in Your Winzo Clone App for Business Growth (4).pptx
Top Features to Include in Your Winzo Clone App for Business Growth (4).pptxTop Features to Include in Your Winzo Clone App for Business Growth (4).pptx
Top Features to Include in Your Winzo Clone App for Business Growth (4).pptx
 
DDS-Security 1.2 - What's New? Stronger security for long-running systems
DDS-Security 1.2 - What's New? Stronger security for long-running systemsDDS-Security 1.2 - What's New? Stronger security for long-running systems
DDS-Security 1.2 - What's New? Stronger security for long-running systems
 
KuberTENes Birthday Bash Guadalajara - Introducción a Argo CD
KuberTENes Birthday Bash Guadalajara - Introducción a Argo CDKuberTENes Birthday Bash Guadalajara - Introducción a Argo CD
KuberTENes Birthday Bash Guadalajara - Introducción a Argo CD
 
openEuler Case Study - The Journey to Supply Chain Security
openEuler Case Study - The Journey to Supply Chain SecurityopenEuler Case Study - The Journey to Supply Chain Security
openEuler Case Study - The Journey to Supply Chain Security
 
Fundamentals of Programming and Language Processors
Fundamentals of Programming and Language ProcessorsFundamentals of Programming and Language Processors
Fundamentals of Programming and Language Processors
 
GreenCode-A-VSCode-Plugin--Dario-Jurisic
GreenCode-A-VSCode-Plugin--Dario-JurisicGreenCode-A-VSCode-Plugin--Dario-Jurisic
GreenCode-A-VSCode-Plugin--Dario-Jurisic
 
APIs for Browser Automation (MoT Meetup 2024)
APIs for Browser Automation (MoT Meetup 2024)APIs for Browser Automation (MoT Meetup 2024)
APIs for Browser Automation (MoT Meetup 2024)
 
E-commerce Development Services- Hornet Dynamics
E-commerce Development Services- Hornet DynamicsE-commerce Development Services- Hornet Dynamics
E-commerce Development Services- Hornet Dynamics
 
AI Fusion Buddy Review: Brand New, Groundbreaking Gemini-Powered AI App
AI Fusion Buddy Review: Brand New, Groundbreaking Gemini-Powered AI AppAI Fusion Buddy Review: Brand New, Groundbreaking Gemini-Powered AI App
AI Fusion Buddy Review: Brand New, Groundbreaking Gemini-Powered AI App
 
Graspan: A Big Data System for Big Code Analysis
Graspan: A Big Data System for Big Code AnalysisGraspan: A Big Data System for Big Code Analysis
Graspan: A Big Data System for Big Code Analysis
 
2024 eCommerceDays Toulouse - Sylius 2.0.pdf
2024 eCommerceDays Toulouse - Sylius 2.0.pdf2024 eCommerceDays Toulouse - Sylius 2.0.pdf
2024 eCommerceDays Toulouse - Sylius 2.0.pdf
 
GraphSummit Paris - The art of the possible with Graph Technology
GraphSummit Paris - The art of the possible with Graph TechnologyGraphSummit Paris - The art of the possible with Graph Technology
GraphSummit Paris - The art of the possible with Graph Technology
 
Enterprise Resource Planning System in Telangana
Enterprise Resource Planning System in TelanganaEnterprise Resource Planning System in Telangana
Enterprise Resource Planning System in Telangana
 
Artificia Intellicence and XPath Extension Functions
Artificia Intellicence and XPath Extension FunctionsArtificia Intellicence and XPath Extension Functions
Artificia Intellicence and XPath Extension Functions
 
OpenMetadata Community Meeting - 5th June 2024
OpenMetadata Community Meeting - 5th June 2024OpenMetadata Community Meeting - 5th June 2024
OpenMetadata Community Meeting - 5th June 2024
 
A Study of Variable-Role-based Feature Enrichment in Neural Models of Code
A Study of Variable-Role-based Feature Enrichment in Neural Models of CodeA Study of Variable-Role-based Feature Enrichment in Neural Models of Code
A Study of Variable-Role-based Feature Enrichment in Neural Models of Code
 
Vitthal Shirke Java Microservices Resume.pdf
Vitthal Shirke Java Microservices Resume.pdfVitthal Shirke Java Microservices Resume.pdf
Vitthal Shirke Java Microservices Resume.pdf
 
How to write a program in any programming language
How to write a program in any programming languageHow to write a program in any programming language
How to write a program in any programming language
 

C++ amp on linux

  • 1. C++AMP on Linux Miller Lee
  • 2. About Me ● Miller Lee ● Junior student at NCTU CS ● Interests: C/C++, PL, CA, OS, compiler, parallel programming, optimization
  • 3. Why C++ AMP? ● GPUs can be 10+X faster than CPUs for parallel code ● CUDA and OpenCL are still too complex/verbose for programmers
  • 4.
  • 5.
  • 6. GPU computing requires explicit transfer
  • 7. What we need in GPU programming 1. put data-parallel code into a kernel for GPU to execute 2. pass the arguments to GPU ○ We cannot pass the arguments by stack 3. an index to indicate current thread 4. move the data between GPU and CPU memory
  • 8. OpenCL as an example
  • 9. Device code in OpenCL __kernel void matrixMul(__global float* C, __global float* A, __global float* B, int wA, int wB) { int tx = get_global_id(0); int ty = get_global_id(1); float value = 0; for (int k = 0; k < wA; ++k) { float elementA = A[ty * wA + k]; float elementB = B[k * wB + tx]; value += elementA * elementB; } C[ty * wA + tx] = value; }
  • 10. Host code in OpenCL 1.2 1. allocate and initialize memory on host side 2. Initialize OpenCL 3. allocate device memory and move the data 4. Load and build device code 5. Launch kernel a. append arguments 6. move the data back from device
  • 11. int main(int argc, char** argv) { // set seed for rand() srand(2006); // 1. allocate host memory for matrices A and B unsigned int size_A = WA * HA; unsigned int mem_size_A = sizeof(float) * size_A; float* h_A = (float*) malloc(mem_size_A); unsigned int size_B = WB * HB; unsigned int mem_size_B = sizeof(float) * size_B; float* h_B = (float*) malloc(mem_size_B); // 2. initialize host memory randomInit(h_A, size_A); randomInit(h_B, size_B); // 4. allocate host memory for the result C unsigned int size_C = WC * HC; unsigned int mem_size_C = sizeof(float) * size_C; float* h_C = (float*) malloc(mem_size_C); // 5. Initialize OpenCL // OpenCL specific variables cl_context clGPUContext; cl_command_queue clCommandQue; cl_program clProgram; size_t dataBytes; size_t kernelLength; cl_int errcode; // OpenCL device memory for matrices cl_mem d_A; cl_mem d_B; cl_mem d_C; /*****************************************/ /* Initialize OpenCL */ /*****************************************/ clGPUContext = clCreateContextFromType(0, CL_DEVICE_TYPE_GPU, NULL, NULL, &errcode); shrCheckError(errcode, CL_SUCCESS); // get the list of GPU devices associated // with context errcode = clGetContextInfo(clGPUContext, CL_CONTEXT_DEVICES, 0, NULL, &dataBytes); cl_device_id *clDevices = (cl_device_id *) malloc(dataBytes); errcode |= clGetContextInfo(clGPUContext, CL_CONTEXT_DEVICES, dataBytes, clDevices, NULL); shrCheckError(errcode, CL_SUCCESS); //Create a command-queue clCommandQue = clCreateCommandQueue(clGPUContext, clDevices[0], 0, &errcode); shrCheckError(errcode, CL_SUCCESS); // Setup device memory d_C = clCreateBuffer(clGPUContext, CL_MEM_READ_WRITE, mem_size_A, NULL, &errcode); d_A = clCreateBuffer(clGPUContext, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, mem_size_A, h_A, &errcode); d_B = clCreateBuffer(clGPUContext, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, mem_size_B, h_B, &errcode); // 6. 
Load and build OpenCL kernel char *clMatrixMul = oclLoadProgSource("kernel.cl", "// My comment\n", &kernelLength); shrCheckError(clMatrixMul != NULL, shrTRUE); clProgram = clCreateProgramWithSource(clGPUContext, 1, (const char **)&clMatrixMul, &kernelLength, &errcode); shrCheckError(errcode, CL_SUCCESS); errcode = clBuildProgram(clProgram, 0, NULL, NULL, NULL, NULL); shrCheckError(errcode, CL_SUCCESS); clKernel = clCreateKernel(clProgram, "matrixMul", &errcode); shrCheckError(errcode, CL_SUCCESS); // 7. Launch OpenCL kernel size_t localWorkSize[2], globalWorkSize[2]; int wA = WA; int wC = WC; errcode = clSetKernelArg(clKernel, 0, sizeof(cl_mem), (void *)&d_C); errcode |= clSetKernelArg(clKernel, 1, sizeof(cl_mem), (void *)&d_A); errcode |= clSetKernelArg(clKernel, 2, sizeof(cl_mem), (void *)&d_B); errcode |= clSetKernelArg(clKernel, 3, sizeof(int), (void *)&wA); errcode |= clSetKernelArg(clKernel, 4, sizeof(int), (void *)&wC); shrCheckError(errcode, CL_SUCCESS); localWorkSize[0] = 16; localWorkSize[1] = 16; globalWorkSize[0] = 1024; globalWorkSize[1] = 1024; errcode = clEnqueueNDRangeKernel(clCommandQue, clKernel, 2, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL); shrCheckError(errcode, CL_SUCCESS); // 8. Retrieve result from device errcode = clEnqueueReadBuffer(clCommandQue, d_C, CL_TRUE, 0, mem_size_C, h_C, 0, NULL, NULL); shrCheckError(errcode, CL_SUCCESS); // 10. clean up memory free(h_A); free(h_B); free(h_C); clReleaseMemObject(d_A); clReleaseMemObject(d_C); clReleaseMemObject(d_B); free(clDevices); free(clMatrixMul); clReleaseContext(clGPUContext); clReleaseKernel(clKernel); clReleaseProgram(clProgram); clReleaseCommandQueue(clCommandQue); }
  • 12. Nearly 200 lines of code
  • 13. What is C++ AMP ● C++ Accelerated Massive Parallelism ○ Designed for data level parallelism ○ Extension of C++11 proposed by M$ ○ An open specification with multiple implementations aiming at standardization ■ MS Visual Studio 2013 ■ MCW CLAMP ● GPU data modeled as C++14-like containers for multidimensional arrays ● GPU kernels modeled as C++11 lambda
  • 14. Comparisons C++AMP Thrust Bolt OpenACC SYCL Intro simple, elegant, performance(?), proposed by M$ library proposed by CUDA library proposed by AMD Annotation and pragmas proposed by SGI wrapper for OpenCL proposed by Codeplay
  • 15. Matrix Multiplication in C++AMP void MultiplyWithAMP(int* aMatrix, int* bMatrix, int *productMatrix, int ha, int hb, int hc) { array_view<int, 2> a(ha, hb, aMatrix); array_view<int, 2> b(hb, hc, bMatrix); array_view<int, 2> product(ha, hc, productMatrix); parallel_for_each( product.extent, [=](index<2> idx) restrict(amp) { int row = idx[0]; int col = idx[1]; for (int inner = 0; inner < 2; inner++) { product[idx] += a(row, inner) * b(inner, col); } } ); product.synchronize(); } clGPUContext = clCreateContextFromType(0, CL_DEVICE_TYPE_GPU, NULL, NULL, &errcode); shrCheckError(errcode, CL_SUCCESS); // get the list of GPU devices associated // with context errcode = clGetContextInfo(clGPUContext, __kernel void matrixMul(__global float* C, __global float* A, CL_CONTEXT_DEVICES, 0, NULL, &dataBytes); __global float* B, int wA, int wB) { int tx = get_global_id(0); int ty = get_global_id(1); float value = 0; for (int k = 0; k < wA; ++k) { cl_device_id *clDevices = (cl_device_id *) malloc(dataBytes); errcode |= clGetContextInfo(clGPUContext, CL_CONTEXT_DEVICES, dataBytes, clDevices, NULL); shrCheckError(errcode, CL_SUCCESS); //Create a command-queue clCommandQue = clCreateCommandQueue (clGPUContext, float elementA = A[ty * wA + k]; float elementB = B[k * wB + tx]; value += elementA * elementB; } C[ty * wA + tx] = value; } clDevices[0], 0, &errcode); shrCheckError(errcode, CL_SUCCESS);
  • 16. Only 20 lines of code but performance?
  • 17. C++AMP programming model void MultiplyWithAMP(int* aMatrix, int* bMatrix, int *productMatrix) { array_view<int, 2> a(3, 2, aMatrix); array_view<int, 2> b(2, 3, bMatrix); array_view<int, 2> product(3, 3, productMatrix); parallel_for_each( product.extent, [=](index<2> idx) restrict(amp) { int row = idx[0]; int col = idx[1]; for (int inner = 0; inner < 2; inner++) { product[idx] += a(row, inner) * b(inner, col); } } ); product.synchronize(); } GPU data modeled as data container
  • 18. C++AMP programming model void MultiplyWithAMP(int* aMatrix, int* bMatrix, int *productMatrix) { array_view<int, 2> a(3, 2, aMatrix); array_view<int, 2> b(2, 3, bMatrix); array_view<int, 2> product(3, 3, productMatrix); parallel_for_each( product.extent, [=](index<2> idx) restrict(amp) { int row = idx[0]; int col = idx[1]; for (int inner = 0; inner < 2; inner++) { product[idx] += a(row, inner) * b(inner, col); } } ); product.synchronize(); } Execution interface; marking an implicitly parallel region for GPU execution
  • 19. C++AMP programming model void MultiplyWithAMP(int* aMatrix, int* bMatrix, int *productMatrix) { array_view<int, 2> a(3, 2, aMatrix); array_view<int, 2> b(2, 3, bMatrix); array_view<int, 2> product(3, 3, productMatrix); parallel_for_each( product.extent, [=](index<2> idx) restrict(amp) { int row = idx[0]; int col = idx[1]; for (int inner = 0; inner < 2; inner++) { product[idx] += a(row, inner) * b(inner, col); } } ); product.synchronize(); } Kernels modeled as lambdas; arguments are implicitly modeled as captured variables
  • 20. MCW C++AMP (CLAMP) ● Clang/LLVM-based ○ translate C++AMP code to OpenCL C code and generate OpenCL SPIR file ○ With some template library ● Runtime support: gmac/OpenCL/HSA Okra ● An Open Source project ○ The only two C++ AMP implementations recognized by HSA foundation (the other is MSVC) ○ Microsoft and HSA foundation supported
  • 21. MCW C++ AMP Compiler ● Device Path ○ generate OpenCL C code by CBackend ○ emit kernel function ● Host Path ○ preparation to launch the code C++ AMP source code Clang/LLVM 3.3 Device Code Host Code
  • 22. Execution process C++ AMP source code Clang /LLV M 3.3 Device Code C++ AMP source code Clang /LLV M 3.3 Host Code gmac OpenCL Our work
  • 23. gmac ● unified virtual address space in software ● Can have high overhead sometimes ● In HSA (AMD Kaveri), GMAC is no longer needed
  • 24. Compiling C++AMP to OpenCL ● C++AMP → LLVM IR → subset of C ● arguments passing (lambda capture vs function calls) ● explicit V.S. implicit memory transfer ● Heavy works were done by compiler and runtime
  • 25. lambda capture struct add { int a; add(int a) : a(a) {} int operator()(int x) const { return a + x; } }; int main(void) { int a = 3; auto fn = [=] (int x) { return a + x; }; int c = fn(3); return 0; } Those arguments should be put on the argument lists of OpenCL kernel.
  • 26. What we need to do? ● Kernel function ○ emit the kernel function with required arguments ● In Host side ○ a function that recursively traverses the object and appends the arguments to OpenCL stack. ● In Device side ○ reconstruct it in the device code for future use.
  • 27. Example struct A { int a; }; struct B : A { int b; }; struct C { B b; int c; }; struct C c; c.c = 100; auto fn = [=] () { int qq = c.c; };
  • 28. Kernel code __kernel void(int a, int b, int c) { C c(a, b, c); ... }
  • 29. Deserialization constructor struct C { B b; int c; C (int a, int b, int c) : c(c), b(a, b) {} };
  • 30. Serialization constructor struct C { B b; int c; void __cxxamp_serialize(Concurrency::Serialize s) { b.__cxxamp_serialize(s); s.Append(sizeof(int), &c); } };
  • 31. Translation parallel_for_each(product.extent, [=](index<2> idx) restrict(amp) { int row = idx[0]; int col = idx[1]; for (int inner = 0; inner < 2; inner++) { product[idx] += a(row, inner) * b(inner, col); } } ); __kernel void matrixMul(__global float* C, __global float* A, __global float* B, int wA, int wB) { int tx = get_global_id(0); int ty = get_global_id(1); float value = 0; for (int k = 0; k < wA; ++k) { float elementA = A[ty * wA + k]; float elementB = B[k * wB + tx]; value += elementA * elementB; } C[ty * wA + tx] = value; } ● Append the arguments ● Set the index ● emit kernel function ● implicit memory management
  • 32.
  • 33. Future work ● Future work for us ○ restrict(auto) ○ HSA related work
  • 34. Future work for you ● Try this out!! ● Many of us get spoiled and don’t want to go back to writing OpenCL directly anymore :-) ● related links ○ Driver ○ Clang ○ sandbox