CUDA lab slides for the "parallel programming" course, by Shuai Yuan
online version:
http://yszheda.github.io/CUDA-lab
I made the slides as a part-time TA for the lab course.
The slides are generated by the great reveal.js.
6. __global__ void dot( float *a, float *b, float *c )
{
__shared__ float cache[threadsPerBlock];
int cacheIndex = threadIdx.x;
...
// set the cache values
cache[cacheIndex] = temp;
// synchronize threads in this block
__syncthreads();
...
}
int main( void )
{
...
dot<<<blocksPerGrid,threadsPerBlock>>>( d_a, d_b, d_c );
...
}
shared memory
7. • thread cooperation & shared memory are useful for reduction algorithms
• avoid race conditions by using __syncthreads()
• avoid bank conflicts
• every thread in the block needs to call __syncthreads()
keep in mind
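The reduction step elided on slide 6 can be completed along these lines (a minimal sketch in the classic dot-product style; `N` and `threadsPerBlock` are assumed to be defined constants, with `threadsPerBlock` a power of two):

```cuda
__global__ void dot( float *a, float *b, float *c )
{
    __shared__ float cache[threadsPerBlock];
    int tid = blockDim.x * blockIdx.x + threadIdx.x;
    int cacheIndex = threadIdx.x;

    // each thread accumulates a partial dot product
    float temp = 0;
    while (tid < N) {
        temp += a[tid] * b[tid];
        tid += blockDim.x * gridDim.x;
    }

    // set the cache values
    cache[cacheIndex] = temp;
    // synchronize threads in this block
    __syncthreads();

    // tree-style reduction within the block: halve the active
    // threads each step, syncing between steps
    for (int i = blockDim.x / 2; i > 0; i /= 2) {
        if (cacheIndex < i)
            cache[cacheIndex] += cache[cacheIndex + i];
        __syncthreads();
    }

    // thread 0 writes this block's partial sum
    if (cacheIndex == 0)
        c[blockIdx.x] = cache[0];
}
```

Note that `__syncthreads()` sits outside the `if (cacheIndex < i)` branch, so every thread in the block reaches it, as the bullets above require.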
9. __constant__ float constFloat;
__device__ float getConstFloat() { return constFloat; }
__global__ void addConstant(float *vec, int N)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i<N)
vec[i] += getConstFloat();
}
#include <cutil_inline.h>
int main( int argc, char** argv)
{
float constValue = 4.0f;
cutilSafeCall( cudaMemcpyToSymbol(constFloat,
&constValue,
sizeof(float), 0,
cudaMemcpyHostToDevice) );
...
}
constant mem.
10. • read-only, but conserves memory bandwidth
• a single read can be broadcast and cached for additional reads
• painfully slow when each thread reads a different address from constant memory
keep in mind
12. • read-only, like constant memory
• great when memory access exhibits spatial locality, i.e. each thread reads a location near where the next or previous thread reads
• comes in 1-D, 2-D and 3-D versions & is typically used in finite-difference apps
keep in mind
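A minimal 1-D texture sketch to go with these points; it uses the legacy texture-reference API (`texture<>`, `cudaBindTexture`, `tex1Dfetch`), which matches the cutil-era CUDA used in the other examples here (the API is deprecated in modern CUDA in favor of texture objects):

```cuda
// global texture reference bound to linear device memory
texture<float, 1, cudaReadModeElementType> texRef;

__global__ void copyViaTexture(float *out, int n)
{
    int i = blockDim.x * blockIdx.x + threadIdx.x;
    if (i < n)
        out[i] = tex1Dfetch(texRef, i);  // cached, read-only fetch

}

int main(void)
{
    int n = 1024;
    float *d_in, *d_out;
    cudaMalloc((void **)&d_in, n * sizeof(float));
    cudaMalloc((void **)&d_out, n * sizeof(float));
    // bind the device buffer to the texture reference
    cudaBindTexture(NULL, texRef, d_in, n * sizeof(float));
    copyViaTexture<<<(n + 255) / 256, 256>>>(d_out, n);
    cudaUnbindTexture(texRef);
    cudaFree(d_in);
    cudaFree(d_out);
    return 0;
}
```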
13. surface<void, 2> output_surface;
__global__ void surfaceWrite(float* g_idata, int width, int height) {
// calculate surface coordinates
unsigned int x = blockIdx.x*blockDim.x + threadIdx.x;
unsigned int y = blockIdx.y*blockDim.y + threadIdx.y;
// read from global memory and write to cuarray (via surface reference)
surf2Dwrite(g_idata[y*width+x], output_surface, x*4, y, cudaBoundaryModeTrap);
}
int main( int argc, char** argv) {
...
cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(32, 0, 0, 0,
cudaChannelFormatKindFloat);
cudaArray* cu_array;
cutilSafeCall( cudaMallocArray(&cu_array, &channelDesc, width, height,
cudaArraySurfaceLoadStore) );
cutilSafeCall( cudaMemcpy( d_data, h_data, size, cudaMemcpyHostToDevice) );
cutilSafeCall( cudaBindSurfaceToArray(output_surface, cu_array) );
surfaceWrite<<<dimGrid, dimBlock>>>(d_data, width, height);
...
cutilSafeCall( cudaFree(d_data) );
cutilSafeCall( cudaFreeArray(cu_array) );
}
surface mem.
15. // OpenGL Graphics includes
#include <GL/glew.h>
#if defined (__APPLE__) || defined(MACOSX)
#include <GLUT/glut.h>
#else
#include <GL/freeglut.h>
#endif
int main(int argc, char **argv) {
// Initialize GL
glutInit(&argc, argv);
glutInitDisplayMode(GLUT_DOUBLE | GLUT_RGB);
glutInitWindowSize(1000, 1000);
// Create a window with rendering context and all else we need
glutCreateWindow("CUDA Interop.");
// initialize necessary OpenGL extensions
glewInit();
// Select CUDA device with OpenGL interoperability
if (cutCheckCmdLineFlag(argc, (const char**)argv, "device")) {
cutilGLDeviceInit(argc, argv);
}
else {
cudaGLSetGLDevice( cutGetMaxGflopsDeviceId() );
}
}
set device
16. // vbo variables
GLuint vbo;
struct cudaGraphicsResource *cuda_vbo_resource;
void *d_vbo_buffer = NULL;
// create buffer object
glGenBuffers(1, &vbo);
glBindBuffer(GL_ARRAY_BUFFER, vbo);
// initialize buffer object
unsigned int size = mesh_width * mesh_height * 4 * sizeof(float);
glBufferData(GL_ARRAY_BUFFER, size, 0, GL_DYNAMIC_DRAW);
glBindBuffer(GL_ARRAY_BUFFER, 0);
// register this buffer object with CUDA
cutilSafeCall(cudaGraphicsGLRegisterBuffer(&cuda_vbo_resource, vbo,
cudaGraphicsMapFlagsWriteDiscard));
register data with CUDA
17. // map OpenGL buffer object for writing from CUDA
float4 *dptr;
cutilSafeCall( cudaGraphicsMapResources(1, &cuda_vbo_resource, 0) );
size_t num_bytes;
cutilSafeCall( cudaGraphicsResourceGetMappedPointer((void **)&dptr,
&num_bytes,
cuda_vbo_resource) );
// run kernel
kernel<<<blocks,threads>>>(dptr,...);
// unmap buffer object
cutilSafeCall( cudaGraphicsUnmapResources(1, &cuda_vbo_resource, 0) );
pass data via shared buffers
18. • need to tell the CUDA runtime which device we intend to use for CUDA and OpenGL
• initialize OpenGL first and then use the cudaGLSetGLDevice() method
• DirectX interop. is nearly identical
keep in mind
25. • creating and recording events is tricky since some CUDA calls are asynchronous
• all kernel launches are asynchronous
• instruct the CPU to synchronize on an event via cudaEventSynchronize()
keep in mind
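The points above can be sketched with a minimal timing example (`kernel`, `blocks`, `threads`, and `d_data` are placeholder names): record events around the asynchronous launch, then block the CPU with cudaEventSynchronize() before reading the elapsed time.

```cuda
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);

cudaEventRecord(start, 0);
kernel<<<blocks, threads>>>(d_data);   // launch returns immediately
cudaEventRecord(stop, 0);

// wait until the GPU has actually reached the 'stop' event
cudaEventSynchronize(stop);
float elapsedMs;
cudaEventElapsedTime(&elapsedMs, start, stop);

cudaEventDestroy(start);
cudaEventDestroy(stop);
```

Reading the time without the synchronize would race with the still-running kernel, which is exactly the trickiness the bullets warn about.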
29. // Initialize the driver and create a context for the first device.
cuInit(0);
CUdevice device = new CUdevice(); cuDeviceGet(device, 0);
CUcontext context = new CUcontext(); cuCtxCreate(context, 0, device);
// Create the PTX file by calling the NVCC and load it
String ptxFileName = preparePtxFile("JCudaVectorAddKernel.cu");
CUmodule module = new CUmodule(); cuModuleLoad(module, ptxFileName);
// Obtain a function pointer to the "add" function.
CUfunction function = new CUfunction(); cuModuleGetFunction(function, module, "add");
// Allocate the device input data
float hostInputA[] = new float[numElements]; CUdeviceptr deviceInputA = new CUdeviceptr();
cuMemAlloc(deviceInputA, numElements * Sizeof.FLOAT);
cuMemcpyHtoD(deviceInputA, Pointer.to(hostInputA), numElements * Sizeof.FLOAT);
...
// Set up the kernel parameters
Pointer kernelParameters = Pointer.to(Pointer.to(deviceInputA),...);
// Call the kernel function
int blockSizeX = 256; int gridSizeX = (int)Math.ceil((double)numElements / blockSizeX);
cuLaunchKernel(function,
gridSizeX, 1, 1, // Grid dimension
blockSizeX, 1, 1, // Block dimension
0, null, // Shared memory size and stream
kernelParameters, null); // Kernel- and extra parameters
cuCtxSynchronize();
jcuda
33. cublasHandle_t handle;
cublasStatus_t status = cublasCreate(&handle);
float* h_A = (float*)malloc(N * N * sizeof(h_A[0]));
...
/* Fill the matrices with test data */
...
/* Allocate device memory for the matrices */
cudaMalloc((void**)&d_A, N * N * sizeof(d_A[0]));
...
/* Initialize the device matrices with the host matrices */
status = cublasSetVector(N * N, sizeof(h_A[0]), h_A, 1, d_A, 1);
...
/* Performs Sgemm: C <- alpha*A*B + beta*C */
status = cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, N, N, N,
&alpha, d_A, N, d_B, N, &beta, d_C, N);
/* Allocate host mem & read back the result from device mem */
h_C = (float*)malloc(N * N * sizeof(h_C[0]));
status = cublasGetVector(N * N, sizeof(h_C[0]), d_C, 1, h_C, 1);
/* Memory clean up */
cudaFree(d_A);
...
/* Shutdown */
status = cublasDestroy(handle);
cublas
34. cudaSetDevice( cutGetMaxGflopsDeviceId() );
// Allocate & init. host memory for the signal
Complex* h_signal = (Complex*)malloc(sizeof(Complex) * SIGNAL_SIZE);
...
// Pad signal
Complex* h_padded_signal;
...
// Allocate device memory for signal
Complex* d_signal;
cutilSafeCall( cudaMalloc((void**)&d_signal, mem_size) );
// Copy host memory to device
cutilSafeCall( cudaMemcpy(d_signal, h_padded_signal, mem_size,
cudaMemcpyHostToDevice) );
// CUFFT plan
cufftHandle plan;
cufftSafeCall( cufftPlan1d(&plan, new_size, CUFFT_C2C, 1) );
// Transform signal
cufftSafeCall( cufftExecC2C(plan, (cufftComplex *)d_signal,
(cufftComplex *)d_signal, CUFFT_FORWARD) );
// Destroy CUFFT context
cufftSafeCall( cufftDestroy(plan) );
// Cleanup memory
cutilSafeCall( cudaFree(d_signal) );
...
cutilDeviceReset();
cufft
35. cusparseHandle_t handle = 0;
cusparseStatus_t status = cusparseCreate(&handle);
// create a matrix description for the matrix M
cusparseMatDescr_t descrM = 0; status = cusparseCreateMatDescr(&descrM);
cusparseSetMatType ( descrM, CUSPARSE_MATRIX_TYPE_TRIANGULAR );
cusparseSetMatIndexBase ( descrM, CUSPARSE_INDEX_BASE_ZERO );
cusparseSetMatDiagType ( descrM, CUSPARSE_DIAG_TYPE_NON_UNIT );
cusparseSetMatFillMode ( descrM, CUSPARSE_FILL_MODE_LOWER );
// create & perform analysis info for the non-trans & trans case
cusparseSolveAnalysisInfo_t info = 0, infoTrans = 0;
cusparseCreateSolveAnalysisInfo(&info);
cusparseCreateSolveAnalysisInfo(&infoTrans);
cusparseScsrsv_analysis(handle, CUSPARSE_OPERATION_NON_TRANSPOSE, N, descrM,
d_valsICP, d_rowPtrsICP, d_colIndsICP, info);
cusparseScsrsv_analysis(handle, CUSPARSE_OPERATION_TRANSPOSE, N, descrM,
d_valsICP, d_rowPtrsICP, d_colIndsICP, infoTrans);
...
// Solve M z = H H^T z = r by first doing a forward solve: H y = r
cusparseScsrsv_solve(handle, CUSPARSE_OPERATION_NON_TRANSPOSE, N, 1.0, descrM,
d_valsICP, d_rowPtrsICP, d_colIndsICP, info, d_r, d_y);
// and then a back substitution: H^T z = y
cusparseScsrsv_solve(handle, CUSPARSE_OPERATION_TRANSPOSE, N, 1.0, descrM,
d_valsICP, d_rowPtrsICP, d_colIndsICP, infoTrans, d_y, d_z);
...
cusparseDestroy(handle);
cusparse
37. // declare a host image object for an 8-bit grayscale image
npp::ImageCPU_8u_C1 oHostSrc;
// load gray-scale image from disk
npp::loadImage(sFilename, oHostSrc);
// declare a device image and copy from the host image to the device
npp::ImageNPP_8u_C1 oDeviceSrc(oHostSrc);
// create struct with box-filter mask size
NppiSize oMaskSize = {5, 5};
// create struct with ROI size given the current mask
NppiSize oSizeROI = {oDeviceSrc.width() - oMaskSize.width + 1,
oDeviceSrc.height() - oMaskSize.height + 1};
// allocate device image of appropriately reduced size
npp::ImageNPP_8u_C1 oDeviceDst(oSizeROI.width, oSizeROI.height);
// set anchor point inside the mask to (0, 0)
NppiPoint oAnchor = {0, 0};
// run box filter
nppiFilterBox_8u_C1R(oDeviceSrc.data(), oDeviceSrc.pitch(),
oDeviceDst.data(), oDeviceDst.pitch(),
oSizeROI, oMaskSize, oAnchor);
// declare a host image for the result
npp::ImageCPU_8u_C1 oHostDst(oDeviceDst.size());
// and copy the device result data into it
oDeviceDst.copyTo(oHostDst.data(), oHostDst.pitch());
npp
40. // loop over full data, in bite-sized chunks
for (int i=0; i<FULL_DATA_SIZE; i+= N) {
// copy the locked memory to the device, async
cutilSafeCall( cudaMemcpyAsync(dev_a, host_a+i,
N * sizeof(int),
cudaMemcpyHostToDevice,
stream) );
cutilSafeCall( cudaMemcpyAsync(dev_b, host_b+i,
N * sizeof(int),
cudaMemcpyHostToDevice,
stream) );
kernel<<<N/256,256,0,stream>>>(dev_a, dev_b, dev_c);
// copy the data from device to locked memory
cutilSafeCall( cudaMemcpyAsync(host_c+i, dev_c,
N * sizeof(int),
cudaMemcpyDeviceToHost,
stream) );
}
// wait for all operations to finish
cutilSafeCall( cudaStreamSynchronize(stream) );
chunked computation
41. cudaStream_t *streamArray = 0;
streamArray = (cudaStream_t *)malloc(N * sizeof(cudaStream_t));
...
for ( int i = 0; i < N ; i++) {
cudaStreamCreate(&streamArray[i]);
...
}
...
for ( int i = 0; i < N ; i++) {
cublasSetMatrix (..., devPtrA[i], ...);
...
}
...
for ( int i = 0; i < N ; i++) {
cublasSetStream(handle, streamArray[i]);
cublasSgemm(handle, ..., devPtrA[i], devPtrB[i], devPtrC[i], ...);
}
cudaThreadSynchronize();
batched computation
42. • use streams to specify the order in which operations get executed asynchronously
• the idea is to use more than one stream
• requires a new kind of memory copy, which in turn requires pinned (page-locked) memory
• free pinned memory when it is no longer needed
keep in mind
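A minimal sketch of the pinned allocation these bullets refer to, with `dev_a` and `N` assumed allocated/defined as on slide 40: cudaMemcpyAsync needs a page-locked host buffer to actually overlap with other work, so the buffer comes from cudaHostAlloc rather than malloc.

```cuda
int *host_a;
cudaStream_t stream;
cudaStreamCreate(&stream);

// allocate pinned (page-locked) host memory
cudaHostAlloc((void **)&host_a, N * sizeof(int), cudaHostAllocDefault);

// queue an asynchronous host-to-device copy on the stream
cudaMemcpyAsync(dev_a, host_a, N * sizeof(int),
                cudaMemcpyHostToDevice, stream);
cudaStreamSynchronize(stream);

// free pinned memory as soon as it is no longer needed
cudaFreeHost(host_a);
cudaStreamDestroy(stream);
```

Pinned memory is a scarce resource: over-allocating it can degrade overall system performance, hence the advice to free it promptly.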
43. // Allocate resources
for( int i =0; i<STREAM_COUNT; ++i ) {
cudaHostAlloc(&h_data_in[i], memsize, cudaHostAllocDefault);
cudaMalloc(&d_data_in[i], memsize);
...
}
int current_stream = 0;
// Do processing in a loop...
{
int next_stream = (current_stream + 1 ) % STREAM_COUNT;
// Ensure that processing and copying of the last cycle has finished
cudaEventSynchronize(cycleDone[next_stream]);
// Process current frame
kernel<<<grid, block, 0, stream[current_stream]>>>(d_data_out[current_stream],
d_data_in[current_stream],
N, ...);
// Upload next frame
cudaMemcpyAsync(d_data_in[next_stream], ..., cudaMemcpyHostToDevice,
stream[next_stream]);
// Download current frame
cudaMemcpyAsync(h_data_out[current_stream], ..., cudaMemcpyDeviceToHost,
stream[current_stream]);
cudaEventRecord(cycleDone[current_stream], stream[current_stream]);
current_stream = next_stream;
}
overlap kernel exec. & memcpy
44. • devices with CC 1.1 and above can overlap a kernel execution & memcpy as long as they are issued from different streams
• kernels are serialized
• queue work in a way that independent streams can execute in parallel
keep in mind
46. float *a, *d_a;
...
/* Allocate mapped CPU memory. */
cutilSafeCall( cudaHostAlloc((void **)&a, bytes, cudaHostAllocMapped) );
...
/* Initialize the vectors. */
for(n = 0; n < nelem; n++) { a[n] = rand() / (float)RAND_MAX; ... }
/* Get the device pointers for the pinned CPU memory mapped into the GPU
memory space. */
cutilSafeCall( cudaHostGetDevicePointer((void **)&d_a, (void *)a, 0) );
...
/* Call the GPU kernel using the device pointers for the mapped memory. */
...
kernel<<<grid, block>>>(d_a, d_b, d_c, nelem);
...
/* Memory clean up */
cutilSafeCall( cudaFreeHost(a) );
...
zero-copy host memory
47. //Create streams for issuing GPU command asynchronously and allocate memory
for(int i = 0; i < GPU_N; i++) {
cutilSafeCall( cudaStreamCreate(&stream[i]) );
cutilSafeCall( cudaMalloc((void**)&d_Data[i], dataN * sizeof(float)) );
cutilSafeCall( cudaMallocHost((void**)&h_Data[i], dataN * sizeof(float)) );
//init h_Data
}
//Copy data to GPU, launch the kernel and copy data back. All asynchronously
for(int i = 0; i < GPU_N; i++) {
//Set device
cutilSafeCall( cudaSetDevice(i) );
// Copy input data from CPU
cutilSafeCall( cudaMemcpyAsync(d_Data[i], h_Data[i], dataN * sizeof(float),
cudaMemcpyHostToDevice, stream[i]) );
// Perform GPU computations
kernel<<<blocks, threads, 0, stream[i]>>>(...);
// Copy back the result
cutilSafeCall( cudaMemcpyAsync(h_Sum_from_device[i], d_Sum[i],
ACCUM_N * sizeof(float),
cudaMemcpyDeviceToHost, stream[i]) );
}
streams
48. // Process GPU results
for(i = 0; i < GPU_N; i++) {
// Set device
cutilSafeCall( cudaSetDevice(i) );
// Wait for all operations to finish
cudaStreamSynchronize(stream[i]);
// Shut down this GPU
cutilSafeCall( cudaFreeHost(h_Data[i]) );
cutilSafeCall( cudaFree(d_Data[i]) );
cutilSafeCall( cudaStreamDestroy(stream[i]) );
}
// shutdown
for(int i = 0; i < GPU_N; i++) {
cutilSafeCall( cudaSetDevice(i) );
cutilDeviceReset();
}
process the result
49. • can also control each GPU by a separate CPU thread
• need to allocate portable pinned memory if a different thread needs access to one thread's memory
• pass the flag cudaHostAllocPortable to cudaHostAlloc()
keep in mind
50. // Initialize MPI state
MPI_CHECK( MPI_Init(&argc, &argv) );
// Get our MPI node number and node count
int commSize, commRank;
MPI_CHECK( MPI_Comm_size(MPI_COMM_WORLD, &commSize) );
MPI_CHECK( MPI_Comm_rank(MPI_COMM_WORLD, &commRank) );
if(commRank == 0) {// Are we the root node?
//initialize dataRoot...
}
// Allocate a buffer on each node
float * dataNode = new float[dataSizePerNode];
// Dispatch a portion of the input data to each node
MPI_CHECK( MPI_Scatter(dataRoot, dataSizePerNode, MPI_FLOAT, dataNode,
dataSizePerNode, MPI_FLOAT, 0, MPI_COMM_WORLD) );
// if commRank == 0 then free dataRoot...
kernel<<<gridSize, blockSize>>>(dataNode, ...);
// Reduction to the root node
float sumNode = sum(dataNode, dataSizePerNode);
float sumRoot;
MPI_CHECK( MPI_Reduce(&sumNode, &sumRoot, 1, MPI_FLOAT, MPI_SUM, 0,
MPI_COMM_WORLD) );
MPI_CHECK( MPI_Finalize() );
mpi + cuda
51. // Enable peer access
cutilSafeCall(cudaSetDevice(gpuid_tesla[0]));
cutilSafeCall(cudaDeviceEnablePeerAccess(gpuid_tesla[1], gpuid_tesla[0]));
...
// Allocate buffers
cudaSetDevice(gpuid_tesla[0]); cudaMalloc(&g0, buf_size);
cudaSetDevice(gpuid_tesla[1]); cudaMalloc(&g1, buf_size);
// Ping-pong copy between GPUs
cudaMemcpy(g1, g0, buf_size, cudaMemcpyDefault);
// Prepare host buffer and copy to GPU 0
cudaSetDevice(gpuid_tesla[0]); cudaMemcpy(g0, h0, buf_size, cudaMemcpyDefault);
// Run kernel on GPU 1, reading input from the GPU 0 buffer, writing
// output to the GPU 1 buffer: dst[idx] = src[idx] * 2.0f
cudaSetDevice(gpuid_tesla[1]); kernel<<<blocks, threads>>>(g0, g1);
cutilDeviceSynchronize();
// Disable peer access (also unregisters memory for non-UVA cases)
cudaSetDevice(gpuid_tesla[0]); cudaDeviceDisablePeerAccess(gpuid_tesla[1]);
cudaSetDevice(gpuid_tesla[1]); cudaDeviceDisablePeerAccess(gpuid_tesla[0]);
cudaFree(g0);
...
P2P & unified virtual address space