IAP09 CUDA@MIT 6.963 - Lecture 02: CUDA Basics #1 (Nicolas Pinto, MIT)

More at http://sites.google.com/site/cudaiap2009 and http://pinto.scripts.mit.edu/Classes/CUDAIAP2009

Note that some slides were borrowed from Matthew Bolitho (Johns Hopkins), Mike Houston (Stanford), and NVIDIA.


  1. 6.963 IAP09 CUDA@MIT. Supercomputing on your desktop: Programming the next generation of cheap and massively parallel hardware using CUDA. Lecture 02: CUDA Basics #1. Nicolas Pinto (MIT)
  2. During this course, we'll try to "adapt" and use existing material for 6.963 ;-)
  3. Today. yey!!
  4. Outline: Intro; GPU?; GPU History; // Analysis; CUDA Overview; CUDA Basics.
  5. Intro
  6. Motivation: "The complexity for minimum component costs has increased at a rate of roughly a factor of two per year ... Certainly over the short term this rate can be expected to continue, if not to increase. Over the longer term, the rate of increase is a bit more uncertain, although there is no reason to believe it will not remain nearly constant for at least 10 years. That means by 1975, the number of components per integrated circuit for minimum cost will be 65,000. I believe that such a large circuit can be built on a single wafer." Gordon Moore, Electronics Magazine, April 19, 1965. slide by Matthew Bolitho
  7. The most economic number of components in an IC will double every year. Historically, GPUs got faster; with hardware reaching frequency limitations, now GPUs get wider. slide by Matthew Bolitho
  8. GPU?
  9.-18. GPUs are REALLY fast (the table is built up one cell at a time across these ten slides). 3D Filterbank Convolution:

                 Performance (gflops)   Development Time (hours)
     Matlab             0.3                      0.5
     C/SSE              9.0                     10.0
     PS3              110.0                     30.0
     GT200            330.0                     10.0

     Nicolas Pinto, James DiCarlo, David Cox (MIT, Harvard)
  19. Designed for math-intensive, parallel problems: more transistors dedicated to ALU than to flow control and data cache. slide by Matthew Bolitho
  20. What are the consequences? Programs must be more predictable: data access coherency and program flow. slide by Matthew Bolitho
  21. Rather than expecting GPUs to get twice as fast, expect to have twice as many! Parallel processing for the masses. Unfortunately, parallel programming is hard: algorithms and data structures must be fundamentally redesigned. slide by Matthew Bolitho
  22. Task vs. Data parallelism.
      Task parallel: independent processes with little communication; easy to use ("free" on modern operating systems with SMP).
      Data parallel: lots of data on which the same computation is being executed; no dependencies between data elements in each step of the computation; can saturate many ALUs; but often requires redesign of traditional algorithms. slide by Mike Houston
  23. CPU vs. GPU.
      CPU: really fast caches (great for data reuse); fine branching granularity; lots of different processes/threads; high performance on a single thread of execution.
      GPU: lots of math units; fast access to onboard memory; runs a program on each fragment/vertex; high throughput on parallel tasks.
      CPUs are great for task parallelism; GPUs are great for data parallelism. slide by Mike Houston
  24. The Importance of Data Parallelism for GPUs.
      GPUs are designed for highly parallel tasks like rendering. GPUs process independent vertices and fragments: temporary registers are zeroed; no shared or static data; no read-modify-write buffers; in short, no communication between vertices or fragments.
      Data-parallel processing: GPU architectures are ALU-heavy (multiple vertex and pixel pipelines, lots of compute power); GPU memory systems are designed to stream data (linear access patterns can be prefetched, hiding memory latency). slide by Mike Houston
  25. GPU History
  26. History: not true!
  27. The traditional model for 3D rendering: the graphics pipeline. Application, Command, Geometry (vertices and primitives, transformations), Rasterization, Texture, Fragment, Display (3D image for display). slide by Matthew Bolitho
  28. Rendering realistic computer-generated scenes is compute-heavy: each frame is complex, and you need 30 frames per second. The answer was dedicated hardware. slide by Matthew Bolitho
  29. CPU + Host: to improve performance, move some work to dedicated hardware. Hardware could process each vertex and each fragment independently: highly parallel. slide by Matthew Bolitho
  30. The pipeline was "fixed function": hardware was hardwired to perform the operations in the pipeline. Eventually, the pipeline became more programmable. slide by Matthew Bolitho
  31. The Texture and Fragment stages became more programmable, combined into the Fragment Unit: programmable via assembly language; memory reads via texture lookups; limited program size; no real branching (thus no looping). slide by Matthew Bolitho
  32. The Geometry stage became the programmable Vertex Unit: programmable via assembly language; no memory reads; limited program size; no real branching (thus no looping). slide by Matthew Bolitho
  33. Things improved over time: the Vertex unit can do memory reads; maximum program size increased; branching support; higher level languages (e.g. HLSL, Cg). But neither the Vertex nor the Fragment units could write to memory, only to the frame buffer; no integer math; no bitwise operators. slide by Matthew Bolitho
  34. (Pipeline diagram: Application, Command, Vertex Unit, Rasterization, Fragment Unit, Display, with Texture Memory feeding the programmable units.) slide by Matthew Bolitho
  35. "Multi-pass" algorithms allowed writes to memory: in pass 1, write to the framebuffer; rebind the framebuffer as a texture; read it in pass 2, etc. But this was inefficient. slide by Matthew Bolitho
  36. Despite its limitations, the GPGPU community grew (GPGPU = General Purpose Computation on the GPU): place data in textures; draw a flat quad (off-screen); write multi-pass algorithms using the Fragment Unit to perform custom processing. slide by Matthew Bolitho
  37. Under-utilized hardware: only the Fragment Unit is used; often memory-bandwidth limited; gather-based algorithms only (no scatter); everything goes through the Graphics API. slide by Matthew Bolitho
  38. (Pipeline diagram repeated: Application, Command, Rasterization, Graphics Hardware, Display, with each programmable unit attached to its own memory.) slide by Matthew Bolitho
  39. The Geometry Unit operates on a primitive and can write back to memory. Changes to the underlying hardware: the ability to write to memory, and unified processing units. slide by Matthew Bolitho
  40. // Analysis
  41. Flynn's taxonomy: Single/Multiple Instruction over Single/Multiple Data gives SISD, SIMD, MISD, MIMD. slide by Matthew Bolitho
  42. A serial algorithm can be made parallel by decomposition: find the fundamental parts of the algorithm that are separable. slide by Matthew Bolitho
  43. Amdahl's law computes the expected speedup from a partial improvement: P is the proportion of the program that is parallel, S the speedup of the parallel portion. slide by Matthew Bolitho
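  For reference, the standard form of the law slide 43 describes (our addition, not shown on the slide) is

      \mathrm{Speedup} = \frac{1}{(1 - P) + P/S}

  so with P = 0.9 and S = 10, the overall speedup is 1 / (0.1 + 0.09), about 5.3: even an infinitely fast parallel portion cannot beat 1/(1-P) = 10x overall.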
  44. Decomposition (Task Decomposition, Data Decomposition) and Dependency Analysis (Group Tasks, Order Tasks, Data Sharing). slide by Matthew Bolitho
  45. Algorithms can be decomposed by both task and data. Task: find groups of instructions that can be executed in parallel. Data: find partitions in the data that can be used (relatively) independently. slide by Matthew Bolitho
  46. (Decomposition diagram repeated, highlighting Task Decomposition.) slide by Matthew Bolitho
  47. Analyze the algorithm and find groups of instructions that are (relatively) independent. E.g. Molecular Dynamics: ComputeVibrationalForces, ComputeRotationalForces, ComputeDihedralForces, ComputeNeighbours, ComputeNonBondingForces, UpdatePositionsAndVelocities. slide by Matthew Bolitho
  48. (Decomposition diagram repeated, highlighting Data Decomposition.) slide by Matthew Bolitho
  49. Analyze the algorithm to find ways to partition the data. E.g. Matrix Multiplication: columns and rows. slide by Matthew Bolitho
  50. E.g. Matrix Multiplication: blocks. slide by Matthew Bolitho
  51. There are many ways to decompose any given algorithm. Sometimes the data decompose easily; sometimes the tasks decompose easily; sometimes both; sometimes neither. slide by Matthew Bolitho
  52. (Decomposition diagram repeated, moving to Dependency Analysis.) slide by Matthew Bolitho
  53. Once the algorithm has been decomposed into data and tasks, analyze interactions. slide by Matthew Bolitho
  54. To ease the management of dependencies, find tasks that are similar and group them; then analyze constraints to determine any necessary order. slide by Matthew Bolitho
  55. E.g. Molecular Dynamics: the task list again (ComputeVibrationalForces, ComputeRotationalForces, ComputeDihedralForces, ComputeNeighbours, ComputeNonBondingForces, UpdatePositionsAndVelocities), now to be grouped. slide by Matthew Bolitho
  56. Grouped: Compute Bonded Forces (vibrational, rotational, dihedral); Compute Neighbours; Compute Non-Bonding Forces; Update Positions and Velocities. slide by Matthew Bolitho
  57. Once groups of tasks are identified, data-flow constraints enforce a partial order: Neighbour List, then Bonded and Non-Bonded Forces, then Update Positions and Velocities. slide by Matthew Bolitho
  58. (Decomposition diagram repeated, moving to Data Sharing.) slide by Matthew Bolitho
  59. Once partially-ordered groups of tasks and partitions of data are identified, analyze the data sharing that occurs. slide by Matthew Bolitho
  60. Data sharing can be categorized as: Read-Only; Effectively Local; Read/Write; Accumulate; Multiple Read/Single Write. slide by Matthew Bolitho
  61. Read-Only: data is read but not written; no consistency problems; can be replicated in a distributed system. slide by Matthew Bolitho
  62. Effectively Local: data is read and written; data is partitioned into subsets, one task per subset; subsets can be distributed. slide by Matthew Bolitho
  63. Read/Write: data is read and written; many tasks access many data; consistency issues; the most difficult to deal with. slide by Matthew Bolitho
  64. Example: Matrix Multiplication. The input matrices are Read-Only; the output is Effectively Local. slide by Matthew Bolitho
  65. Example: Molecular Dynamics. Neighbour List feeds Bonded and Non-Bonded Forces, which feed Update Positions and Velocities. slide by Matthew Bolitho
  66. Example: Molecular Dynamics, with Atomic Coordinates shared across the Neighbour List, the force computations, and Update Positions and Velocities. slide by Matthew Bolitho
  67. CUDA Overview
  68. Legacy GPGPU: trick the GPU into general-purpose computing by casting the problem as graphics. Turn data into images ("texture maps"); turn algorithms into image synthesis ("rendering passes"). Promising results, but: tough learning curve, particularly for non-graphics experts; potentially high overhead of the graphics API; highly constrained memory layout and access model; the need for many passes drives up bandwidth consumption. (High Performance Computing with CUDA)
  69. CUDA: Compute Unified Device Architecture. Created by NVIDIA as a way to perform computation on the GPU. A specification for: a computer architecture; a language; an application interface (API). slide by Matthew Bolitho
  70. CUDA Advantages over Legacy GPGPU.
      Random access to memory: a thread can access any memory location.
      Unlimited access to memory: a thread can read/write as many locations as needed.
      User-managed cache (per block): threads can cooperatively load data into SMEM; any thread can then access any SMEM location.
      Low learning curve: just a few extensions to C; no knowledge of graphics is required.
      No graphics API overhead. © NVIDIA Corporation 2006
  71. Some Design Goals.
      Scale to 100's of cores, 1000's of parallel threads.
      Let programmers focus on parallel algorithms, not on the mechanics of a parallel programming language.
      Enable heterogeneous systems (i.e. CPU + GPU): CPU and GPU are separate devices with separate DRAMs. © 2008 NVIDIA Corporation.
  72. AMD/ATI are trying to make a competing product: Close to the Metal. The first release bombed; the second release was rewritten. Open source, but not yet a viable solution. slide by Matthew Bolitho
  73. (Overview section divider.)
  74. CUDA Installation. A CUDA installation consists of: the Driver; the CUDA Toolkit (compiler, libraries); the CUDA SDK (example codes). © 2008 NVIDIA Corporation.
  75. CUDA Software Development. Integrated CPU + GPU C source code goes to the NVIDIA C compiler, which emits NVIDIA assembly for computing (PTX) plus CPU host code. CUDA optimized libraries (math.h, FFT, BLAS, ...), the CUDA driver and profiler on the GPU side, and the standard C compiler on the CPU side complete the toolchain. © 2008 NVIDIA Corporation.
  76. Compiling CUDA Code. A C/C++ CUDA application goes through NVCC to PTX code (virtual), then through the PTX-to-target compiler (physical) to target code for G80 and other GPUs. © 2008 NVIDIA Corporation.
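  In practice, the flow on slide 76 is driven by a single nvcc invocation; a minimal sketch (file names are hypothetical):

      nvcc -O2 -o increment increment.cu   # split host/device code, compile both, link one binary
      nvcc -ptx increment.cu               # stop at the virtual PTX stage, for inspection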
  77. CUDA Basics
  78. CUDA Kernels and Threads.
      Parallel portions of an application are executed on the device as kernels: one kernel is executed at a time; many threads execute each kernel.
      Differences between CUDA and CPU threads: CUDA threads are extremely lightweight (very little creation overhead, instant switching); CUDA uses 1000s of threads to achieve efficiency, while multi-core CPUs can use only a few.
      Definitions: Device = GPU; Host = CPU; Kernel = function that runs on the device. © 2008 NVIDIA Corporation.
  79. Arrays of Parallel Threads.
      A CUDA kernel is executed by an array of threads: all threads run the same code; each thread has an ID that it uses to compute memory addresses and make control decisions.

      threadID: 0 1 2 3 4 5 6 7 ...

      float x = input[threadID];
      float y = func(x);
      output[threadID] = y;
      © 2008 NVIDIA Corporation.
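  A self-contained version of that fragment might look like the following sketch; the elementwise func and the bounds check are our own additions, and the global-ID arithmetic is covered on slide 106:

      __device__ float func(float x) { return 2.0f * x + 1.0f; }  // hypothetical stand-in

      __global__ void apply_func(const float *input, float *output, int N)
      {
          int threadID = blockIdx.x * blockDim.x + threadIdx.x;  // global thread ID
          if (threadID < N) {               // guard the partial last block
              float x = input[threadID];
              float y = func(x);
              output[threadID] = y;
          }
      }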
  80. Thread Cooperation.
      The missing piece: threads may need to cooperate. Thread cooperation is valuable: share results to avoid redundant computation; share memory accesses for a drastic bandwidth reduction.
      Thread cooperation is a powerful feature of CUDA. Cooperation between a monolithic array of threads is not scalable; cooperation within smaller batches of threads is scalable. © 2008 NVIDIA Corporation.
  81. Thread Batching.
      A kernel launches a grid of thread blocks. Threads within a block cooperate via shared memory and can synchronize; threads in different blocks cannot cooperate. This allows programs to transparently scale to different GPUs. (Grid: Thread Block 0, Thread Block 1, ..., Thread Block N-1, each with its own Shared Memory.) © 2008 NVIDIA Corporation.
  82. Transparent Scalability.
      Hardware is free to schedule thread blocks on any processor; a kernel scales across parallel multiprocessors. (Diagram: the same kernel grid of blocks 0-7, scheduled two at a time on one device and four at a time on another.) © 2008 NVIDIA Corporation.
  83. 8-Series Architecture (G80).
      128 thread processors execute kernel threads; 16 multiprocessors, each containing 8 thread processors; shared memory enables thread cooperation. © 2008 NVIDIA Corporation.
  84. 10-Series Architecture.
      240 thread processors execute kernel threads; 30 multiprocessors, each containing 8 thread processors and one double-precision unit; shared memory enables thread cooperation. © 2008 NVIDIA Corporation.
  85. Kernel Memory Access.
      Per-thread: Registers (on-chip); Local Memory (off-chip, uncached).
      Per-block: Shared Memory (on-chip, small, fast).
      Per-device: Global Memory (off-chip, large, uncached, persistent across kernel launches, kernel I/O). © 2008 NVIDIA Corporation.
  86. (Memory hierarchy diagram: each stream processor has Registers and Local Memory; each multiprocessor has Shared Memory; the device has Global, Constant, and Texture Memory.) slide by Matthew Bolitho
  87. Execution Model (Software / Hardware).
      Threads are executed by thread processors. Thread blocks are executed on multiprocessors; thread blocks do not migrate; several concurrent thread blocks can reside on one multiprocessor, limited by multiprocessor resources (shared memory and register file). A kernel is launched as a grid of thread blocks; only one kernel can execute on a device at one time. © 2008 NVIDIA Corporation.
  88. Key Parallel Abstractions in CUDA.
      Trillions of lightweight threads: simple decomposition model.
      Hierarchy of concurrent threads: simple execution model.
      Lightweight synchronization primitives: simple synchronization model.
      Shared memory model for thread cooperation: simple communication model. © 2008 NVIDIA Corporation.
  89. Managing Memory.
      CPU and GPU have separate memory spaces. Host (CPU) code manages device (GPU) memory: allocate/free; copy data to and from the device. This applies to global device memory (DRAM). © 2008 NVIDIA Corporation.
  90. GPU Memory Allocation / Release.

      cudaMalloc(void ** pointer, size_t nbytes)
      cudaMemset(void * pointer, int value, size_t count)
      cudaFree(void* pointer)

      int n = 1024;
      int nbytes = 1024*sizeof(int);
      int *a_d = 0;
      cudaMalloc( (void**)&a_d, nbytes );
      cudaMemset( a_d, 0, nbytes);
      cudaFree(a_d);
      © 2008 NVIDIA Corporation.
  91. Data Copies.

      cudaMemcpy(void *dst, void *src, size_t nbytes,
                 enum cudaMemcpyKind direction);

      direction specifies the locations (host or device) of src and dst. Blocks the CPU thread: returns after the copy is complete. Doesn't start copying until previous CUDA calls complete.
      enum cudaMemcpyKind: cudaMemcpyHostToDevice, cudaMemcpyDeviceToHost, cudaMemcpyDeviceToDevice. © 2008 NVIDIA Corporation.
  92.-100. Data Movement Example (these nine slides step through this one program, annotating each allocation and copy between Host and Device):

      int main(void)
      {
          float *a_h, *b_h; // host data
          float *a_d, *b_d; // device data
          int N = 14, nBytes, i;

          nBytes = N*sizeof(float);
          a_h = (float *)malloc(nBytes);
          b_h = (float *)malloc(nBytes);
          cudaMalloc((void **) &a_d, nBytes);
          cudaMalloc((void **) &b_d, nBytes);

          for (i=0; i<N; i++) a_h[i] = 100.f + i;

          cudaMemcpy(a_d, a_h, nBytes, cudaMemcpyHostToDevice);
          cudaMemcpy(b_d, a_d, nBytes, cudaMemcpyDeviceToDevice);
          cudaMemcpy(b_h, b_d, nBytes, cudaMemcpyDeviceToHost);

          for (i=0; i<N; i++) assert( a_h[i] == b_h[i] );

          free(a_h); free(b_h);
          cudaFree(a_d); cudaFree(b_d);
          return 0;
      }
      © 2008 NVIDIA Corporation.
  101. Executing Code on the GPU.
      Kernels are C functions with some restrictions: cannot access host memory; must have void return type; no variable number of arguments ("varargs"); not recursive; no static variables.
      Function arguments are automatically copied from host to device. © 2008 NVIDIA Corporation.
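  A sketch of a kernel that satisfies all of these restrictions (kernel name and arguments are hypothetical):

      // void return type, fixed argument list, no recursion, no statics
      __global__ void legal_kernel(float *a, int n)
      {
          int i = blockIdx.x * blockDim.x + threadIdx.x;
          if (i < n) a[i] += 1.0f;  // touches device memory only, never host memory
      }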
  102. Function Qualifiers.
      Kernels are designated by the function qualifier __global__: called from host and executed on device; must return void.
      Other CUDA function qualifiers: __device__ (called from device and run on device; cannot be called from host code); __host__ (called from host and executed on host; the default).
      The __host__ and __device__ qualifiers can be combined to generate both CPU and GPU code. © 2008 NVIDIA Corporation.
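  A sketch of the combined case from the last bullet (the clamp function is a hypothetical example): one definition compiled for both CPU and GPU.

      __host__ __device__ float clamp01(float x)
      {
          return x < 0.0f ? 0.0f : (x > 1.0f ? 1.0f : x);
      }

      __global__ void clamp_kernel(float *a, int n)
      {
          int i = blockIdx.x * blockDim.x + threadIdx.x;
          if (i < n) a[i] = clamp01(a[i]);  // device-side call
      }
      // Host code can also call clamp01() directly, as an ordinary C function.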
  103. Launching Kernels.
      Modified C function call syntax:

      kernel<<<dim3 dG, dim3 dB>>>(...)

      Execution configuration ("<<< >>>"): dG is the dimension and size of the grid in blocks (two-dimensional: x and y; blocks launched in the grid: dG.x * dG.y). dB is the dimension and size of blocks in threads (three-dimensional: x, y, and z; threads per block: dB.x * dB.y * dB.z). Unspecified dim3 fields initialize to 1. © 2008 NVIDIA Corporation.
  104. Execution Configuration Examples.

      dim3 grid, block;
      grid.x = 2; grid.y = 4;
      block.x = 8; block.y = 16;
      kernel<<<grid, block>>>(...);

      dim3 grid(2, 4), block(8,16);  // equivalent assignment using dim3 constructor functions
      kernel<<<grid, block>>>(...);

      kernel<<<32,512>>>(...);
      © 2008 NVIDIA Corporation.
  105. CUDA Built-in Device Variables.
      All __global__ and __device__ functions have access to these automatically defined variables:
      dim3 gridDim;    dimensions of the grid in blocks (at most 2D)
      dim3 blockDim;   dimensions of the block in threads
      dim3 blockIdx;   block index within the grid
      dim3 threadIdx;  thread index within the block
      © 2008 NVIDIA Corporation.
  106. Unique Thread IDs.
      Built-in variables are used to determine unique thread IDs: map from a local thread ID (threadIdx) to a global ID which can be used as an array index. (Diagram: a grid with blockIdx.x = 0, 1, 2 and blockDim.x = 5; the global ID is blockIdx.x*blockDim.x + threadIdx.x, giving 0..14.) © 2008 NVIDIA Corporation.
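  A minimal sketch making the diagram concrete (kernel and array names are ours): each thread writes its own global ID.

      __global__ void write_ids(int *out)
      {
          int gid = blockIdx.x * blockDim.x + threadIdx.x;  // e.g. block 2, thread 3 -> 13
          out[gid] = gid;  // assumes out has gridDim.x * blockDim.x elements
      }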
  107. (Diagram of a 2D grid of thread blocks and their (x, y) indices; garbled in extraction.) slide by Matthew Bolitho
  108. A thread block may have up to 512 threads. All threads in a thread block run on the same multiprocessor, so they can communicate via shared memory and synchronize. The threads of a block are multiplexed onto a multiprocessor as warps. slide by Matthew Bolitho
  109. Minimal Kernels.

      __global__ void minimal( int* a_d, int value)
      {
          *a_d = value;
      }

      __global__ void assign( int* a_d, int value)
      {
          int idx = blockDim.x * blockIdx.x + threadIdx.x;
          a_d[idx] = value;
      }
      © 2008 NVIDIA Corporation.
  110. Increment Array Example.

      CPU program:

      void inc_cpu(int *a, int N)
      {
          int idx;
          for (idx = 0; idx<N; idx++)
              a[idx] = a[idx] + 1;
      }

      int main()
      {
          ...
          inc_cpu(a, N);
      }

      CUDA program:

      __global__ void inc_gpu(int *a, int N)
      {
          int idx = blockIdx.x * blockDim.x + threadIdx.x;
          if (idx < N)
              a[idx] = a[idx] + 1;
      }

      int main()
      {
          ...
          dim3 dimBlock (blocksize);
          dim3 dimGrid( ceil( N / (float)blocksize) );
          inc_gpu<<<dimGrid, dimBlock>>>(a, N);
      }
      © 2008 NVIDIA Corporation.
  111. Host Synchronization.
      All kernel launches are asynchronous: control returns to the CPU immediately; the kernel executes after all previous CUDA calls have completed.
      cudaMemcpy() is synchronous: control returns to the CPU after the copy completes; the copy starts after all previous CUDA calls have completed.
      cudaThreadSynchronize() blocks until all previous CUDA calls complete. © 2008 NVIDIA Corporation.
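  A sketch of where cudaThreadSynchronize() matters, reusing the inc_gpu kernel and launch parameters from slide 110; without it, a CPU-side timer would measure only the launch, not the kernel:

      inc_gpu<<<dimGrid, dimBlock>>>(a_d, N);  // asynchronous: returns at once
      run_cpu_stuff();                         // independent CPU work can overlap here
      cudaThreadSynchronize();                 // block until the kernel has finished
      // now it is safe to stop a timer or fetch results with cudaMemcpy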
  112. Host Synchronization Example.

      // copy data from host to device
      cudaMemcpy(a_d, a_h, numBytes, cudaMemcpyHostToDevice);

      // execute the kernel
      inc_gpu<<<ceil(N/(float)blocksize), blocksize>>>(a_d, N);

      // run independent CPU code
      run_cpu_stuff();

      // copy data from device back to host
      cudaMemcpy(a_h, a_d, numBytes, cudaMemcpyDeviceToHost);
      © 2008 NVIDIA Corporation.
  113. Device Runtime Component: Synchronization Function.

      void __syncthreads();

      Synchronizes all threads in a block: once all threads have reached this point, execution resumes normally. Used to avoid RAW / WAR / WAW hazards when accessing shared memory. Allowed in conditional code only if the conditional is uniform across the entire thread block. © NVIDIA Corporation 2006
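  A sketch of the stage-then-read pattern __syncthreads() exists for (our own example, assuming blocks of exactly 256 threads): reverse each block's elements via shared memory.

      __global__ void reverse_block(int *d_data)
      {
          __shared__ int s[256];                     // one block's worth of data
          int t    = threadIdx.x;
          int base = blockIdx.x * blockDim.x;
          s[t] = d_data[base + t];                   // stage into shared memory
          __syncthreads();                           // all writes visible before any read
          d_data[base + t] = s[blockDim.x - 1 - t];  // read another thread's element safely
      }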
  114. Variable Qualifiers (GPU code).
      __device__: stored in global memory (large, high latency, no cache); allocated with cudaMalloc (the __device__ qualifier is implied); accessible by all threads; lifetime: application.
      __shared__: stored in on-chip shared memory (very low latency); size specified by the execution configuration or at compile time; accessible by all threads in the same thread block; lifetime: thread block.
      Unqualified variables: scalars and built-in vector types are stored in registers; what doesn't fit in registers spills to "local" memory. © 2008 NVIDIA Corporation.
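  A sketch putting the three cases side by side (names and sizes are hypothetical):

      __device__ float g_scale = 2.0f;  // global memory, lives for the whole application

      __global__ void scale_kernel(float *a, int n)  // assumes blockDim.x <= 128
      {
          __shared__ float tile[128];       // on-chip, one copy per thread block
          int i = blockIdx.x * blockDim.x + threadIdx.x;
          float x = (i < n) ? a[i] : 0.0f;  // unqualified local: held in a register
          tile[threadIdx.x] = x * g_scale;
          __syncthreads();
          if (i < n) a[i] = tile[threadIdx.x];
      }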
  115. CUDA Error Reporting to CPU.
      All CUDA calls return an error code of type cudaError_t, except for kernel launches.
      cudaError_t cudaGetLastError(void): returns the code for the last error ("no error" also has a code, cudaSuccess); can be used to get the error from a kernel execution.
      char* cudaGetErrorString(cudaError_t code): returns a null-terminated character string describing the error.

      printf("%s\n", cudaGetErrorString( cudaGetLastError() ) );
      © 2008 NVIDIA Corporation.
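  These two calls are commonly wrapped in a checking macro; a sketch (the macro is our own convention, not part of the CUDA API):

      #include <stdio.h>

      #define CUDA_CHECK(call) do {                                    \
          cudaError_t err = (call);                                    \
          if (err != cudaSuccess)                                      \
              printf("CUDA error: %s\n", cudaGetErrorString(err));    \
      } while (0)

      // Usage:
      //   CUDA_CHECK( cudaMalloc((void**)&a_d, nbytes) );
      //   kernel<<<grid, block>>>(...);
      //   CUDA_CHECK( cudaGetLastError() );   // catch launch errors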
  116. Host Runtime Component: Device Management.
      Device enumeration: cudaGetDeviceCount(), cudaGetDeviceProperties().
      Device selection: cudaChooseDevice(), cudaSetDevice().

      > ~/NVIDIA_CUDA_SDK/bin/linux/release/deviceQuery
      There is 1 device supporting CUDA
      Device 0: "Quadro FX 5600"
        Major revision number:                          1
        Minor revision number:                          0
        Total amount of global memory:                  1609891840 bytes
        Total amount of constant memory:                65536 bytes
        Total amount of shared memory per block:        16384 bytes
        Total number of registers available per block:  8192
        Warp size:                                      32
        Maximum number of threads per block:            512
        Maximum sizes of each dimension of a block:     512 x 512 x 64
        Maximum sizes of each dimension of a grid:      65535 x 65535 x 1
        Maximum memory pitch:                           262144 bytes
        Texture alignment:                              256 bytes
        Clock rate:                                     1350000 kilohertz
      © NVIDIA Corporation 2006
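  A minimal sketch of the enumeration calls named above (the fields printed are a small subset of cudaDeviceProp):

      #include <stdio.h>

      int main(void)
      {
          int count = 0;
          cudaGetDeviceCount(&count);
          for (int i = 0; i < count; i++) {
              cudaDeviceProp prop;
              cudaGetDeviceProperties(&prop, i);
              printf("Device %d: %s, %d multiprocessors\n",
                     i, prop.name, prop.multiProcessorCount);
          }
          cudaSetDevice(0);  // use device 0 for subsequent CUDA calls in this host thread
          return 0;
      }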
  117. Host Runtime Component: Memory Management.
      Two kinds of memory: linear memory, accessed through 32-bit pointers; CUDA arrays, opaque layouts with dimensionality, readable only through texture objects.
      Memory allocation: cudaMalloc(), cudaFree(), cudaMallocPitch(), cudaMallocArray(), cudaFreeArray().
      Memory copy: cudaMemcpy(), cudaMemcpy2D(), cudaMemcpyToArray(), cudaMemcpyFromArray(), etc.; cudaMemcpyToSymbol(), cudaMemcpyFromSymbol().
      Memory addressing: cudaGetSymbolAddress(). © NVIDIA Corporation 2006
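  A sketch of the pitched 2D case from the list above (array shape is hypothetical; the runtime picks the row stride):

      void alloc_2d_image(int width, int height)
      {
          float *d_img = 0;
          size_t pitch = 0;  // row stride in bytes, chosen by the runtime for alignment
          cudaMallocPitch((void**)&d_img, &pitch, width * sizeof(float), height);
          // rows must be indexed with the pitch, not with width:
          //   float *row_k = (float*)((char*)d_img + k * pitch);
          cudaFree(d_img);
      }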
  118. (System diagram: the graphics card sits on the PCI-Express bus attached to the Northbridge; the CPU on the Front Side Bus; DRAM on the Memory Bus; the Southbridge handles SATA and Ethernet.) slide by Matthew Bolitho
  119. PCI-Express replaced AGP: a point-to-point, full-duplex, serial, symmetric bus with bandwidth in each direction and multiple lane configurations; PCI-E x16 uses 16 lanes for 16 times the bandwidth of a single lane. slide by Matthew Bolitho
  120. (Transition slide.)
  121. Back Pocket Slides. slide by David Cox
  122. Code Walkthrough 2: Parallel Reduction
  123. Execution Decomposition.
      Two stages of computation: sum within each block; then sum the partial results from the blocks. Stage 1 uses many blocks; stage 2 uses a single block over the partial sums. For reductions, the code for all levels is the same. © NVIDIA Corporation 2006
  124. Kernel execution.
      (Walkthrough of the shared-memory values for one 16-element block: each step halves the stride, from distance 8 down to 1, with 8, 4, 2, then 1 active threads, until the block's sum, 41, sits in element 0.) © NVIDIA Corporation 2006
  125. Kernel Source Code.

      __global__ void sum_kernel(int *g_input, int *g_output)
      {
          extern __shared__ int s_data[];  // allocated during kernel launch

          // read input into shared memory
          unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
          s_data[threadIdx.x] = g_input[idx];
          __syncthreads();

          // compute sum for the threadblock
          for (int dist = blockDim.x/2; dist > 0; dist /= 2)
          {
              if (threadIdx.x < dist)
                  s_data[threadIdx.x] += s_data[threadIdx.x + dist];
              __syncthreads();
          }

          // write the block's sum to global memory
          if (threadIdx.x == 0)
              g_output[blockIdx.x] = s_data[0];
      }
      © NVIDIA Corporation 2006
  126. Host Source Code (1).

      int main()
      {
          // data set size in elements and bytes
          unsigned int n = 4096;
          unsigned int num_bytes = n*sizeof(int);

          // launch configuration parameters
          unsigned int block_dim = 256;
          unsigned int num_blocks = n / block_dim;
          unsigned int num_smem_bytes = block_dim*sizeof(int);

          // allocate and initialize the data on the CPU
          int *h_a = (int*)malloc(num_bytes);
          for (int i = 0; i < n; i++) h_a[i] = 1;

          // allocate memory on the GPU device
          int *d_a = 0, *d_output = 0;
          cudaMalloc((void**)&d_a, num_bytes);
          cudaMalloc((void**)&d_output, num_blocks*sizeof(int));
          ...
      © NVIDIA Corporation 2006
  127. Host Source Code (2).

          ...
          // copy the input data from CPU to the GPU device
          cudaMemcpy(d_a, h_a, num_bytes, cudaMemcpyHostToDevice);

          // two stages of kernel execution
          sum_kernel<<<num_blocks, block_dim, num_smem_bytes>>>(d_a, d_output);
          sum_kernel<<<1, num_blocks, num_blocks*sizeof(int)>>>(d_output, d_output);

          // copy the output from GPU device to CPU and print
          cudaMemcpy(h_a, d_output, sizeof(int), cudaMemcpyDeviceToHost);
          printf("%d\n", h_a[0]);

          // release resources
          cudaFree(d_a); cudaFree(d_output);
          free(h_a);
          return 0;
      }
      © NVIDIA Corporation 2006
