CUDA – First Programs
“Hello, world” is traditionally the first program we write. We can do the same for CUDA. Here it is:
In file hello.cu:
#include "stdio.h"
int main()
{
printf("Hello, worldn");
return 0;
}
On our Tesla machine, you can compile and run this with:
$ nvcc hello.cu
$ ./a.out
You can change the output file name with the -o flag: nvcc -o hello hello.cu
If you don't want to type the leading ./ (which refers to the current working directory) every time, you can add the current directory to your path by adding
export PATH=$PATH:.
to your .bashrc file. Some would recommend not doing this for security reasons.
You might consider this program to be cheating, since it doesn’t really use any CUDA functionality.
Everything runs on the host. However, the point is that CUDA C programs can do everything a regular C
program can do.
Here is a slightly more interesting (but inefficient and only useful as an example) program that adds two
numbers together using a kernel function:
#include "stdio.h"
__global__ void add(int a, int b, int *c)
{
*c = a + b;
}
int main()
{
int a,b,c;
int *dev_c;
a=3;
b=4;
cudaMalloc((void**)&dev_c, sizeof(int));
add<<<1,1>>>(a,b,dev_c);
cudaMemcpy(&c, dev_c, sizeof(int), cudaMemcpyDeviceToHost);
printf("%d + %d is %dn", a, b, c);
cudaFree(dev_c);
return 0;
}
To do in class: walk through the program; show a similar program in straight C, which runs much faster! Why?
cudaMalloc returns cudaSuccess if it was successful; we could check this return value to ensure that the program will run correctly.
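A minimal sketch of such a check, wrapping the cudaMalloc call from the program above (the error-message format here is just one possible convention):

cudaError_t err = cudaMalloc((void**)&dev_c, sizeof(int));
if (err != cudaSuccess)
{
    printf("cudaMalloc failed: %s\n", cudaGetErrorString(err));
    return 1;
}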
Example: Summing Vectors
This is a simple problem. Given two vectors (i.e. arrays), we would like to add them together in a third
array. For example:
A = {0, 2, 4, 6, 8}
B = {1, 1, 2, 2, 1}
Then A + B =
C = {1, 3, 6, 8, 9}
In this example the arrays are 5 elements long, so our approach will be to create 5 different threads. The
first thread is responsible for computing C[0] = A[0] + B[0]. The second thread is responsible for
computing C[1] = A[1] + B[1], and so forth.
Here is how we can do this with traditional C code:
#include "stdio.h"
#define N 10
void add(int *a, int *b, int *c)
{
int tID = 0;
while (tID < N)
{
c[tID] = a[tID] + b[tID];
tID += 1;
}
}
int main()
{
int a[N], b[N], c[N];
// Fill Arrays
for (int i = 0; i < N; i++)
{
a[i] = i,
b[i] = 1;
}
add (a, b, c);
for (int i = 0; i < N; i++)
{
printf("%d + %d = %dn", a[i], b[i], c[i]);
}
return 0;
}
This is a rather roundabout way to add two arrays, but it will translate more naturally to the CUDA version. To compile and run it we use g++, since declaring the loop variable inside the for statement is a C++ (or C99) feature that older C compilers reject. Here is the CUDA version:
#include "stdio.h"
#define N 10
__global__ void add(int *a, int *b, int *c)
{
int tID = blockIdx.x;
if (tID < N)
{
c[tID] = a[tID] + b[tID];
}
}
int main()
{
int a[N], b[N], c[N];
int *dev_a, *dev_b, *dev_c;
cudaMalloc((void **) &dev_a, N*sizeof(int));
cudaMalloc((void **) &dev_b, N*sizeof(int));
cudaMalloc((void **) &dev_c, N*sizeof(int));
// Fill Arrays
for (int i = 0; i < N; i++)
{
a[i] = i,
b[i] = 1;
}
cudaMemcpy(dev_a, a, N*sizeof(int), cudaMemcpyHostToDevice);
cudaMemcpy(dev_b, b, N*sizeof(int), cudaMemcpyHostToDevice);
add<<<N,1>>>(dev_a, dev_b, dev_c);
cudaMemcpy(c, dev_c, N*sizeof(int), cudaMemcpyDeviceToHost);
for (int i = 0; i < N; i++)
{
printf("%d + %d = %dn", a[i], b[i], c[i]);
}
return 0;
}
blockIdx.x gives us the block ID, which ranges from 0 to N-1. What if we used add<<<1,N>>> instead? Then we would have a single block of N threads, and we would index by the thread ID, which is the variable threadIdx.x.
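A sketch of that thread-indexed variant, assuming the same N and host code as above; only the kernel body and the launch change:

__global__ void add(int *a, int *b, int *c)
{
    int tID = threadIdx.x;    // index of this thread within the single block
    if (tID < N)
    {
        c[tID] = a[tID] + b[tID];
    }
}

with the launch changed to add<<<1,N>>>(dev_a, dev_b, dev_c);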
As another example, let’s add two 2D arrays. We can define a 2D array of ints as follows:
int c[2][3];
The following code illustrates how the 2D array is laid out in memory:
for (int i = 0; i < 2; i++)
    for (int j = 0; j < 3; j++)
        printf("[%d][%d] at %ld\n", i, j, (long)&c[i][j]);
Output:
[0][0] at 140733933298160
[0][1] at 140733933298164
[0][2] at 140733933298168
[1][0] at 140733933298172
[1][1] at 140733933298176
[1][2] at 140733933298180
We can see a layout where the next cell in the j dimension occupies the next sequential int in memory (an int being 4 bytes):

c[0][0] at &c        c[0][1] at &c + 4    c[0][2] at &c + 8
c[1][0] at &c + 12   c[1][1] at &c + 16   c[1][2] at &c + 20
In general, the address of a cell can be computed as:
&c + (sizeof(int) * j-dimension-size * i) + (sizeof(int) * j)
In our example the size of the j dimension is 3, so the address of c[1][1] would be computed as the base address + (4*3*1) + (4*1) = &c + 16.
C will do the addressing for us if we use array notation, so if INDEX = i*WIDTH + j
then we can access the element via c[INDEX].
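A small illustration of this mapping (WIDTH and HEIGHT here are placeholder names for the array dimensions, not identifiers from the programs above):

#define WIDTH 3
#define HEIGHT 2
int c[WIDTH * HEIGHT];            // flat 1D storage for a HEIGHT x WIDTH array
for (int i = 0; i < HEIGHT; i++)
    for (int j = 0; j < WIDTH; j++)
        c[i * WIDTH + j] = 0;     // same cell as c[i][j] in the 2D layout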
CUDA requires that we allocate device memory as a one-dimensional array, so we can use the mapping above to treat that 1D allocation as a 2D array.
To make the mapping a little easier in the kernel function, we can declare the blocks to be in a grid with the same dimensions as the 2D array. This gives us variables blockIdx.x and blockIdx.y that correspond to the column and row of the array.
#include "stdio.h"
#define COLUMNS 3
#define ROWS 2
__global__ void add(int *a, int *b, int *c)
{
int x = blockIdx.x;
int y = blockIdx.y;
int i = (COLUMNS*y) + x;
c[i] = a[i] + b[i];
}
int main()
{
int a[ROWS][COLUMNS], b[ROWS][COLUMNS], c[ROWS][COLUMNS];
int *dev_a, *dev_b, *dev_c;
cudaMalloc((void **) &dev_a, ROWS*COLUMNS*sizeof(int));
cudaMalloc((void **) &dev_b, ROWS*COLUMNS*sizeof(int));
cudaMalloc((void **) &dev_c, ROWS*COLUMNS*sizeof(int));
for (int y = 0; y < ROWS; y++) // Fill Arrays
for (int x = 0; x < COLUMNS; x++)
{
a[y][x] = x;
b[y][x] = y;
}
cudaMemcpy(dev_a, a, ROWS*COLUMNS*sizeof(int),
cudaMemcpyHostToDevice);
cudaMemcpy(dev_b, b, ROWS*COLUMNS*sizeof(int),
cudaMemcpyHostToDevice);
dim3 grid(COLUMNS,ROWS);
add<<<grid,1>>>(dev_a, dev_b, dev_c);
cudaMemcpy(c, dev_c, ROWS*COLUMNS*sizeof(int),
cudaMemcpyDeviceToHost);
for (int y = 0; y < ROWS; y++) // Output Arrays
{
for (int x = 0; x < COLUMNS; x++)
{
printf("[%d][%d]=%d ",y,x,c[y][x]);
}
printf("n");
}
return 0;
}
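As with cudaMalloc earlier, kernel launches report failures through the CUDA error API rather than a return value. A minimal check after a launch might look like the following (a sketch, not part of the original program):

cudaError_t err = cudaGetLastError();    // error from the most recent launch
if (err != cudaSuccess)
{
    printf("Kernel launch failed: %s\n", cudaGetErrorString(err));
}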