More Related Content More from PyCon Italia (19)
PyCuda: Come sfruttare la potenza delle schede video nelle applicazioni python
2. Talk Structure
1. Why a GPU ?
2. How does it work ?
3. How do I Program it ?
4. Can I Use Python ?
PyCon 4 – Florence 2010 – Fabrizio Milo
3. Talk Structure
1. Why a GPU ?
2. How does it work ?
3. How do I Program it ?
4. Can I Use Python ?
PyCon 4 – Florence 2010 – Fabrizio Milo
4. WHY A GPU ?
PyCon 4 β Florence 2010 β Fabrizio Milo
7. Talk Structure
1. Why a GPU ?
2. How does it work ?
3. How do I Program it ?
4. Can I Use Python ?
PyCon 4 – Florence 2010 – Fabrizio Milo
8. How does it work ?
PyCon 4 – Florence 2010 – Fabrizio Milo
9. ALU ALU
Control
ALU ALU
Cache
DRAM
CPU
PyCon 4 β Florence 2010 β Fabrizio Milo
10. DRAM
GPU
PyCon 4 β Florence 2010 β Fabrizio Milo
11. ALU ALU
Control
ALU ALU
Cache
DRAM DRAM
CPU GPU
PyCon 4 β Florence 2010 β Fabrizio Milo
14. CUDA
A Parallel Computing Architecture for NVIDIA GPUs
Direct X
Compute
PyCon 4 β Florence 2010 β Fabrizio Milo
15. Execution Model
CUDA
Device Model
PyCon 4 β Florence 2010 β Fabrizio Milo
17. Thread
Smallest unit of logic
PyCon 4 β Florence 2010 β Fabrizio Milo
18. A Block
A Group of Threads
PyCon 4 β Florence 2010 β Fabrizio Milo
19. A Grid
A Group of Blocks
PyCon 4 β Florence 2010 β Fabrizio Milo
20. One Block can have many threads
PyCon 4 β Florence 2010 β Fabrizio Milo
21. One Grid can have many blocks
PyCon 4 β Florence 2010 β Fabrizio Milo
22. The hardware
DEVICE MODEL
PyCon 4 β Florence 2010 β Fabrizio Milo
30. Real Example: 10-Series Architecture
• 240 Scalar Processor (SP) cores execute kernel threads
• 30 Streaming Multiprocessors (SMs) each contain
  – 8 scalar processors
  – 1 double precision unit
  – Shared memory
PyCon 4 – Florence 2010 – Fabrizio Milo
31. Software Hardware
Scalar
Processor
Thread
Thread
Block Multiprocessor
Grid Device
PyCon 4 β Florence 2010 β Fabrizio Milo
34. RAM
CPU Global Memory
Host - Device
PyCon 4 β Florence 2010 β Fabrizio Milo
35. RAM
CPU
Host β Multi Device
PyCon 4 β Florence 2010 β Fabrizio Milo
36. 1. Why a GPU ?
2. How does it work ?
3. How do I Program it ?
4. Can I Use Python ?
PyCon 4 – Florence 2010 – Fabrizio Milo
37. Software Hardware
Scalar
Processor
Thread
Thread
Block Multiprocessor
Grid Device
PyCon 4 β Florence 2010 β Fabrizio Milo
41. Kernel
__global__ void kernel( … )
{
const int idx =
blockIdx.x * blockDim.x + threadIdx.x;
…
}
Grid
PyCon 4 – Florence 2010 – Fabrizio Milo
42. How do I Program it ?
Main Logic Kernel
GCC
NVCC
CPU .bin .cubin GPU
PyCon 4 β Florence 2010 β Fabrizio Milo
43. How do I Program it ?
Main Logic Kernel
GCC
NVCC
GPU
.bin .cubin
.bin .cubin . CPU
PyCon 4 β Florence 2010 β Fabrizio Milo
44. RAM
CPU Global Memory
Host - Device
PyCon 4 β Florence 2010 β Fabrizio Milo
45. RAM
CPU Global Memory
PyCon 4 β Florence 2010 β Fabrizio Milo
47. Copy to device
cudaMalloc( pointer, size )
cudaMemcpy( dest, src, size, direction)
PyCon 4 β Florence 2010 β Fabrizio Milo
48. Kernel Launch
cudaMalloc( pointer, size )
cudaMemcpy( dest, src, size, direction)
Kernel<<< # blocks, # threads >>> (*params)
PyCon 4 – Florence 2010 – Fabrizio Milo
49. Get Back the Results
cudaMalloc( pointer, size )
cudaMemcpy( dest, src, size, direction)
Kernel<<< # blocks, # threads >>> (*params)
cudaMemcpy( dest, src, size, direction)
PyCon 4 – Florence 2010 – Fabrizio Milo
51. And soon it becomes …
if (cudaMalloc( pointer, size ) != cudaSuccess) {
handle_error()
}
if (cudaMemcpy( dest, src, size, direction ) == cudaSuccess) {}
if (Kernel<<< # blocks, # threads >>> (*params) != cudaSuccess) {
handle_error()
}
if (cudaMemcpy( dest, src, size, direction ) != cudaSuccess) { }
PyCon 4 – Florence 2010 – Fabrizio Milo
52. And soon it becomes …
If(cudaMalloc( pointer, size ) != cudaSuccess){
handle_error() If(cudaMalloc( pointer, size ) != cudaSuccess){
} handle_error()
}
if (cudaMemcpy( dest, src, size, direction ) == cudaSuccess) {}
if (cudaMemcpy( dest, src, size, direction ) == cudaSuccess) {}
If (Kernel<<< # blocks, # threads >> (*params) != cudaSuccess){
handle_error() If (Kernel<<< # blocks, # threads >> (*params) != cudaSuccess){
} handle_error()
}
If( cudaMemcpy( dest, src, size, direction) != cudaSuccess) { }
If( cudaMemcpy( dest, src, size, direction) != cudaSuccess) { }
If(cudaMalloc( pointer, size ) != cudaSuccess){
handle_error() If(cudaMalloc( pointer, size ) != cudaSuccess){
} handle_error()
}
if (cudaMemcpy( dest, src, size, direction ) == cudaSuccess) {}
if (cudaMemcpy( dest, src, size, direction ) == cudaSuccess) {}
If (Kernel<<< # blocks, # threads >> (*params) != cudaSuccess){
handle_error() If (Kernel<<< # blocks, # threads >> (*params) != cudaSuccess){
} handle_error()
}
If( cudaMemcpy( dest, src, size, direction) != cudaSuccess) { }
If( cudaMemcpy( dest, src, size, direction) != cudaSuccess) { }
If(cudaMalloc( pointer, size ) != cudaSuccess){
handle_error() If(cudaMalloc( pointer, size ) != cudaSuccess){
} handle_error()
}
if (cudaMemcpy( dest, src, size, direction ) == cudaSuccess) {}
if (cudaMemcpy( dest, src, size, direction ) == cudaSuccess) {}
If (Kernel<<< # blocks, # threads >> (*params) != cudaSuccess){
handle_error() If (Kernel<<< # blocks, # threads >> (*params) != cudaSuccess){
} handle_error()
}
If( cudaMemcpy( dest, src, size, direction) != cudaSuccess) { }
If( cudaMemcpy( dest, src, size, direction) != cudaSuccess) { }
PyCon 4 β Florence 2010 β Fabrizio Milo
54. 1. Why a GPU ?
2. How does it work ?
3. How do I Program it ?
4. Can I Use Python ?
PyCon 4 – Florence 2010 – Fabrizio Milo
55. +
& ANDREAS KLÖCKNER
= PYCUDA
PyCon 4 – Florence 2010 – Fabrizio Milo
56. PyCuda Philosophy
Provide
Complete
Access
PyCon 4 – Florence 2010 – Fabrizio Milo
57. PyCuda Philosophy
Automatically
Manage
Resources
PyCon 4 – Florence 2010 – Fabrizio Milo
58. PyCuda Philosophy
Check and
Report Errors
PyCon 4 – Florence 2010 – Fabrizio Milo
60. PyCuda Philosophy
Allow
Interactive
Use
PyCon 4 – Florence 2010 – Fabrizio Milo
61. PyCuda Philosophy
NumPy
Integration
PyCon 4 – Florence 2010 – Fabrizio Milo
63. 1 1 1 1 1 1
0 99
import numpy
my_array = numpy.array([1,] * 100)
PyCon 4 β Florence 2010 β Fabrizio Milo
64. 1 1 1 0 1 1
import numpy
my_array = numpy.array([1,] * 100)
my_array[3] = 0
PyCon 4 β Florence 2010 β Fabrizio Milo
69. Memory Copy
gpu_mem = cuda.mem_alloc( size_bytes )
cuda.memcpy_htod( gpu_mem, cpu_mem )
PyCon 4 β Florence 2010 β Fabrizio Milo
70. Kernel
gpu_mem = cuda.mem_alloc( size_bytes )
cuda.memcpy_htod( gpu_mem, cpu_mem )
SourceModule("""
__global__ void multiply_them( float *dest, float *a,
float *b )
{
const int i = threadIdx.x;
dest[i] = a[i] * b[i];
}""")
PyCon 4 – Florence 2010 – Fabrizio Milo
71. Kernel Launch
mod = SourceModule("""
__global__ void multiply_them( float *dest, float *a,
float *b )
{
const int i = threadIdx.x;
dest[i] = a[i] * b[i];
}""")
multiply_them = mod.get_function("multiply_them")
multiply_them ( *args, block=(30, 64, 1))
PyCon 4 – Florence 2010 – Fabrizio Milo
75. Hello Gpu
DEMO
PyCon 4 β Florence 2010 β Fabrizio Milo
78. PyCuda: GpuArray
gpuarray.to_gpu(numpy array)
numpy array = gpuarray.get()
PyCon 4 β Florence 2010 β Fabrizio Milo
79. PyCuda: GpuArray
gpuarray.to_gpu(numpy array)
numpy array = gpuarray.get()
+, -, *, /, fill, sin, exp, rand, basic
indexing, norm, inner product …
PyCon 4 – Florence 2010 – Fabrizio Milo
81. PyCuda: GpuArray: ElementWise
from pycuda.elementwise import ElementwiseKernel
lincomb = ElementwiseKernel(
"float a, float *x, float b, float *y, float *z",
"z[i] = a*x[i] + b*y[i]"
)
PyCon 4 – Florence 2010 – Fabrizio Milo
82. PyCuda: GpuArray: ElementWise
from pycuda.elementwise import ElementwiseKernel
lincomb = ElementwiseKernel(
"float a, float *x, float b, float *y, float *z",
"z[i] = a*x[i] + b*y[i]"
)
c_gpu = gpuarray.empty_like(a_gpu)
lincomb(5, a_gpu, 6, b_gpu, c_gpu)
assert la.norm((c_gpu - (5*a_gpu + 6*b_gpu)).get()) < 1e-5
PyCon 4 – Florence 2010 – Fabrizio Milo
85. Meta-Programming
Generate Source !
PyCon 4 β Florence 2010 β Fabrizio Milo
87. mandelbrot
DEMO
PyCon 4 β Florence 2010 β Fabrizio Milo
90. In the Future …
OPENCL
PyCon 4 – Florence 2010 – Fabrizio Milo
91. THANK YOU & HAVE FUN !
PyCon 4 β Florence 2010 β Fabrizio Milo