CUDA Deep Dive

A look at the more advanced CUDA APIs and their use.


  1. CUDA Deep Dive - Kashif Rasul (@krasul)
  2. Hello, my name is Kashif
  3. Objective: deeper understanding
  4. Part 1: Prerequisites
  5. blocks & threads

     #include <cutil_inline.h>

     int main( void )
     {
         int N = 50000;
         size_t size = N * sizeof(float);

         cudaSetDevice( cutGetMaxGflopsDeviceId() );
         ...
         cutilSafeCall( cudaMalloc((void**)&d_A, size) );
         cutilSafeCall( cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice) );
         ...
         int threadsPerBlock = 256;
         int blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock;
         add<<<blocksPerGrid, threadsPerBlock>>>(d_A, d_B, d_C, N);
         ...
         cutilSafeCall( cudaMemcpy(h_C, d_C, size, cudaMemcpyDeviceToHost) );
         cudaFree(d_A);
         ...
         cutilDeviceReset();
     }
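     The add kernel launched above is not shown on the slide; a minimal sketch of what
     it could look like (d_A, d_B, d_C are the device arrays of N floats from the slide):

     __global__ void add( float *A, float *B, float *C, int N )
     {
         // one thread per element; the grid is sized so that
         // blocksPerGrid * threadsPerBlock >= N
         int i = blockDim.x * blockIdx.x + threadIdx.x;
         if (i < N)
             C[i] = A[i] + B[i];
     }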
  6. shared memory

     __global__ void dot( float *a, float *b, float *c )
     {
         __shared__ float cache[threadsPerBlock];
         int cacheIndex = threadIdx.x;
         ...
         // set the cache values
         cache[cacheIndex] = temp;

         // synchronize threads in this block
         __syncthreads();
         ...
     }

     int main( void )
     {
         ...
         dot<<<blocksPerGrid,threadsPerBlock>>>( d_a, d_b, d_c );
         ...
     }
  7. keep in mind
     • thread cooperation & shared memory are useful for reduction algorithms
     • avoid race conditions by using __syncthreads()
     • avoid bank conflicts
     • every thread in the block needs to call __syncthreads()
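     The reduction elided in the dot kernel above could look like the following sketch
     (assuming threadsPerBlock is a compile-time power of two and temp holds each
     thread's partial sum, already stored in cache[cacheIndex]):

     // tree-style reduction in shared memory: halve the number of active
     // threads each step, synchronizing the block between steps
     int i = blockDim.x / 2;
     while (i != 0) {
         if (cacheIndex < i)
             cache[cacheIndex] += cache[cacheIndex + i];
         __syncthreads();   // called by every thread in the block, not just the active ones
         i /= 2;
     }

     // thread 0 writes this block's partial result to global memory
     if (cacheIndex == 0)
         c[blockIdx.x] = cache[0];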
  8. Part 2: Memory
  9. constant memory

     __constant__ float constFloat;

     __device__ float getConstFloat() { return constFloat; }

     __global__ void addConstant(float *vec, int N)
     {
         int i = blockDim.x * blockIdx.x + threadIdx.x;
         if (i<N)
             vec[i] += getConstFloat();
     }

     #include <cutil_inline.h>

     int main( int argc, char** argv)
     {
         float constValue = 4.0f;
         cutilSafeCall( cudaMemcpyToSymbol(constFloat, &constValue, sizeof(float),
                                           0, cudaMemcpyHostToDevice) );
         ...
     }
 10. keep in mind
     • read-only, but conserves memory bandwidth
     • a single read can be broadcast and cached for additional reads
     • painfully slow when each thread reads a different address from constant memory
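     A sketch contrasting the two access patterns (coeffs is a hypothetical constant
     array, not from the slides):

     __constant__ float coeffs[256];

     // fast: every thread reads the same address, so one read is
     // broadcast and cached for the others
     __global__ void broadcastRead(float *out, int N)
     {
         int i = blockDim.x * blockIdx.x + threadIdx.x;
         if (i < N) out[i] *= coeffs[0];
     }

     // slow: each thread reads a different address, so the reads
     // are serialized
     __global__ void divergentRead(float *out, int N)
     {
         int i = blockDim.x * blockIdx.x + threadIdx.x;
         if (i < N) out[i] *= coeffs[threadIdx.x % 256];
     }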
 11. texture memory

     // textures containing look-up tables
     texture<uint> edgeTex;
     texture<uint, 2> edge2dTex;

     int main(int argc, char** argv)
     {
         ...
         cutilSafeCall( cudaMalloc((void**) d_edgeTable, 256*sizeof(uint)) );
         cutilSafeCall( cudaMemcpy((void *)*d_edgeTable, (void *)edgeTable,
                                   256*sizeof(uint), cudaMemcpyHostToDevice) );
         cutilSafeCall( cudaBindTexture(0, edgeTex, *d_edgeTable, 256*sizeof(uint)) );

         // run kernel
         kernel<<<blocks, threads>>>(...)

         // cleanup
         cutilSafeCall( cudaUnbindTexture(edgeTex) );
     }

     __global__ void kernel(...)
     {
         ...
         uint edge = tex1Dfetch(edgeTex, index*16 + i);
         ...
     }
 12. keep in mind
     • read-only, as with constant memory
     • great when memory access exhibits spatial locality, i.e. each thread reads a
       location near where the next or previous thread reads
     • comes in 1-D, 2-D and 3-D versions & is typically used in finite difference apps
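     A sketch of the 2-D case in a finite-difference style stencil, using the texture
     reference API of this era (the host side would bind a pitched device buffer with
     cudaBindTexture2D, or a CUDA array with cudaBindTextureToArray; fieldTex is a
     hypothetical name):

     texture<float, 2> fieldTex;

     __global__ void laplacian(float *out, int width, int height)
     {
         int x = blockIdx.x * blockDim.x + threadIdx.x;
         int y = blockIdx.y * blockDim.y + threadIdx.y;
         if (x < width && y < height) {
             // neighbouring fetches hit the texture cache thanks to 2-D locality
             float c = tex2D(fieldTex, x, y);
             float l = tex2D(fieldTex, x - 1, y);
             float r = tex2D(fieldTex, x + 1, y);
             float u = tex2D(fieldTex, x, y - 1);
             float d = tex2D(fieldTex, x, y + 1);
             out[y * width + x] = l + r + u + d - 4.0f * c;
         }
     }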
 13. surface memory

     surface<void, 2> output_surface;

     __global__ void surfaceWrite(float* g_idata, int width, int height)
     {
         // calculate surface coordinates
         unsigned int x = blockIdx.x*blockDim.x + threadIdx.x;
         unsigned int y = blockIdx.y*blockDim.y + threadIdx.y;

         // read from global memory and write to cuarray (via surface reference)
         surf2Dwrite(g_idata[y*width+x], output_surface, x*4, y, cudaBoundaryModeTrap);
     }

     int main( int argc, char** argv)
     {
         ...
         cudaChannelFormatDesc channelDesc =
             cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat);
         cudaArray* cu_array;
         cutilSafeCall( cudaMallocArray(&cu_array, &channelDesc, width, height,
                                        cudaArraySurfaceLoadStore) );

         cutilSafeCall( cudaMemcpy( d_data, h_data, size, cudaMemcpyHostToDevice) );
         cutilSafeCall( cudaBindSurfaceToArray(output_surface, cu_array) );

         surfaceWrite<<<dimGrid, dimBlock>>>(d_data, width, height);
         ...
         cutilSafeCall( cudaFree(d_data) );
         cutilSafeCall( cudaFreeArray(cu_array) );
     }
 14. Part 3: Interop.
 15. set device

     // OpenGL Graphics includes
     #include <GL/glew.h>
     #if defined (__APPLE__) || defined(MACOSX)
     #include <GLUT/glut.h>
     #else
     #include <GL/freeglut.h>
     #endif

     int main(int argc, char **argv)
     {
         // Initialize GL
         glutInit(&argc, argv);
         glutInitDisplayMode(GLUT_DOUBLE | GLUT_RGB);
         glutInitWindowSize(1000, 1000);

         // Create a window with rendering context and all else we need
         glutCreateWindow("CUDA Interop.");

         // initialize necessary OpenGL extensions
         glewInit();

         // Select CUDA device with OpenGL interoperability
         if (cutCheckCmdLineFlag(argc, (const char**)argv, "device")) {
             cutilGLDeviceInit(argc, argv);
         } else {
             cudaGLSetGLDevice( cutGetMaxGflopsDeviceId() );
         }
     }
 16. register data with CUDA

     // vbo variables
     GLuint vbo;
     struct cudaGraphicsResource *cuda_vbo_resource;
     void *d_vbo_buffer = NULL;

     // create buffer object
     glGenBuffers(1, vbo);
     glBindBuffer(GL_ARRAY_BUFFER, *vbo);

     // initialize buffer object
     unsigned int size = mesh_width * mesh_height * 4 * sizeof(float);
     glBufferData(GL_ARRAY_BUFFER, size, 0, GL_DYNAMIC_DRAW);
     glBindBuffer(GL_ARRAY_BUFFER, 0);

     // register this buffer object with CUDA
     cutilSafeCall(cudaGraphicsGLRegisterBuffer(cuda_vbo_resource, *vbo,
                                                cudaGraphicsMapFlagsWriteDiscard));
 17. pass data via shared buffers

     // map OpenGL buffer object for writing from CUDA
     float4 *dptr;
     cutilSafeCall( cudaGraphicsMapResources(1, cuda_vbo_resource, 0) );

     size_t num_bytes;
     cutilSafeCall( cudaGraphicsResourceGetMappedPointer((void **)&dptr, &num_bytes,
                                                         *cuda_vbo_resource) );

     // run kernel
     kernel<<<blocks,threads>>>(dptr,...);

     // unmap buffer object
     cutilSafeCall( cudaGraphicsUnmapResources(1, cuda_vbo_resource, 0) );
 18. keep in mind
     • need to tell the CUDA runtime which device we intend to use for CUDA and OpenGL
     • initialize OpenGL first and then use the cudaGLSetGLDevice() method
     • DirectX interop. is nearly identical
 19. Part 4: Pro Tip
 20. install CMake, glut & glew

     ➜ git clone https://github.com/kashif/cuda-workshop.git
     Cloning into cuda-workshop...
     ...
     ➜ cd cuda-workshop
     ➜ cmake CMakeLists.txt
     -- The C compiler identification is GNU
     ...
     ➜ make
     Scanning dependencies of target cutil
     [  5%] Building CXX object cutil/CMakeFiles/cutil.dir/src/bank_checker.cpp.o
     ...
     [100%] Built target matrixMul
     ➜ ./bin/matrixMul
     [ matrixMul ]
     bin/matrixMul Starting (CUDA and CUBLAS tests)...
     Device 0: "GeForce GTX 480" with Compute 2.0 capability
     ...
 21. great for experimenting

     ➜ ls src/matrixMul
     CMakeLists.txt       matrixMul.cu         matrixMul.h
     matrixMul_gold.cpp   matrixMul_kernel.cu
     ➜ cat src/matrixMul/CMakeLists.txt
     CUDA_ADD_EXECUTABLE( matrixMul
       matrixMul.cu
       matrixMul_gold.cpp
     )
     TARGET_LINK_LIBRARIES( matrixMul
       cutil
       shrutil
       ${CUDA_CUBLAS_LIBRARIES}
     )
     ➜ cmake -G "Visual Studio 10 Win64" CMakeLists.txt
     ...
 22. Part 5: Events & Timers
 23. events: GPU timestamp

     cudaEvent_t start, stop;
     float time;

     // initialize events
     cutilSafeCall( cudaEventCreate(&start) );
     cutilSafeCall( cudaEventCreate(&stop) );

     // warmup to avoid timing startup
     kernel<<<grid, threads>>>(d_odata, d_idata, size_x, size_y, 1);

     // take measurements for loop over kernel launches
     cutilSafeCall( cudaEventRecord(start, 0) );
     for (int i=0; i < NUM_REPS; i++) {
         kernel<<<grid, threads>>>(d_odata, d_idata, size_x, size_y, 1);
         // Ensure no launch failure
         cutilSafeCall( cudaGetLastError() );
     }
     cutilSafeCall( cudaEventRecord(stop, 0) );
     cutilSafeCall( cudaEventSynchronize(stop) );
     cutilSafeCall( cudaEventElapsedTime(&time, start, stop) );

     // report effective bandwidth in GB/s (2.0f due to read + write)
     float bandwidth = 2.0f * mem_size/(1024*1024*1024)/(time/NUM_REPS);

     cutilSafeCall( cudaEventDestroy(stop) );
     cutilSafeCall( cudaEventDestroy(start) );
 24. OS timers

     #include <cutil_inline.h>
     ...
     unsigned int timer_matrixMul = 0;

     // start timing
     cutilCheckError( cutStartTimer(timer_matrixMul) );

     // do some work
     kernel<<<grid, threads, mem_size>>>(d_idata, d_odata);
     cutilDeviceSynchronize();

     // stop timer
     cutilCheckError( cutStopTimer(timer_matrixMul) );

     double dSeconds = cutGetTimerValue(timer_matrixMul)/((double)nIter * 1000.0);
     double dNumOps = 2.0 * (double)uiWA * (double)uiHA * (double)uiWB;
     double gflops = 1.0e-9 * dNumOps/dSeconds;

     // destroy timer
     cutilCheckError( cutDeleteTimer(timer_matrixMul) );
 25. keep in mind
     • creating and recording events is tricky since some CUDA calls are asynchronous
     • all kernel launches are asynchronous
     • instruct the CPU to wait on an event via cudaEventSynchronize(), or on all
       outstanding work via cudaDeviceSynchronize()
 26. Part 6: Bindings
 27. pycuda

     ➜ cat hello_gpu.py
     import pycuda.driver as drv
     import pycuda.tools
     import pycuda.autoinit
     import numpy
     import numpy.linalg as la
     from pycuda.compiler import SourceModule

     mod = SourceModule("""
     __global__ void multiply_them(float *dest, float *a, float *b)
     {
         const int i = threadIdx.x;
         dest[i] = a[i] * b[i];
     }
     """)

     multiply_them = mod.get_function("multiply_them")

     a = numpy.random.randn(400).astype(numpy.float32)
     b = numpy.random.randn(400).astype(numpy.float32)
     dest = numpy.zeros_like(a)

     multiply_them(
         drv.Out(dest), drv.In(a), drv.In(b),
         block=(400,1,1))

     print dest-a*b
 28. ➜ python hello_gpu.py
     [ 0.  0.  0.  0.  0.  0.  0.  0.  ...  0.  0.  0.  0.  0.  0.  0.  0.]
     (400 zeros: dest - a*b is identically zero, so the GPU result matches the CPU)
 29. jcuda

     // Initialize the driver and create a context for the first device.
     cuInit(0);
     CUdevice device = new CUdevice();
     cuDeviceGet(device, 0);
     CUcontext context = new CUcontext();
     cuCtxCreate(context, 0, device);

     // Create the PTX file by calling the NVCC and load it
     String ptxFileName = preparePtxFile("JCudaVectorAddKernel.cu");
     CUmodule module = new CUmodule();
     cuModuleLoad(module, ptxFileName);

     // Obtain a function pointer to the "add" function.
     CUfunction function = new CUfunction();
     cuModuleGetFunction(function, module, "add");

     // Allocate the device input data
     float hostInputA[] = new float[numElements];
     CUdeviceptr deviceInputA = new CUdeviceptr();
     cuMemAlloc(deviceInputA, numElements * Sizeof.FLOAT);
     cuMemcpyHtoD(deviceInputA, Pointer.to(hostInputA), numElements * Sizeof.FLOAT);
     ...

     // Set up the kernel parameters
     Pointer kernelParameters = Pointer.to(Pointer.to(deviceInputA),...);

     // Call the kernel function
     int blockSizeX = 256;
     int gridSizeX = (int)Math.ceil((double)numElements / blockSizeX);
     cuLaunchKernel(function,
         gridSizeX, 1, 1,         // Grid dimension
         blockSizeX, 1, 1,        // Block dimension
         0, null,                 // Shared memory size and stream
         kernelParameters, null); // Kernel- and extra parameters
     cuCtxSynchronize();
 30. ➜ ls
     License.txt                          jcuda-0.4.0-beta1.jar
     jcurand-0.4.0-beta1.jar              libJCublas-apple-x86_64.dylib
     libJCudaRuntime-apple-x86_64.dylib   libJCurand-apple-x86_64.dylib
     jcublas-0.4.0-beta1.jar              jcufft-0.4.0-beta1.jar
     jcusparse-0.4.0-beta1.jar            libJCudaDriver-apple-x86_64.dylib
     libJCufft-apple-x86_64.dylib         libJCusparse-apple-x86_64.dylib
     JCudaVectorAdd.java                  JCudaVectorAddKernel.cu
     ➜ cat JCudaVectorAddKernel.cu
     extern "C"
     __global__ void add(float *a, float *b, float *sum, int n)
     {
         int i = blockIdx.x * blockDim.x + threadIdx.x;
         if (i<n)
         {
             sum[i] = a[i] + b[i];
         }
     }
     ➜ javac -classpath jcuda-0.4.0-beta1.jar JCudaVectorAdd.java
     ➜ java -classpath jcuda-0.4.0-beta1.jar:. JCudaVectorAdd
     Executing
     nvcc -m64 -ptx JCudaVectorAddKernel.cu -o JCudaVectorAddKernel.ptx
     Finished creating PTX file
     Test PASSED
 31. ruby-cuda

     ➜ gem install sgc-ruby-cuda
     Successfully installed sgc-ruby-cuda-0.1.1
     1 gem installed
     ➜ cat vector_add.rb
     ...
     # Prepare and load vadd kernel.
     kernel_lib_file = compile(vadd_kernel_src)
     CudaFunction.load_lib_file(kernel_lib_file.path)

     # Copy input buffers from host memory to device memory.
     memcpy_htod(da, ha, nbytes)
     memcpy_htod(db, hb, nbytes)

     # Invoke vadd kernel.
     nthreads_per_block = 256
     block_dim = Dim3.new(nthreads_per_block, 1, 1)
     grid_dim = Dim3.new((N + nthreads_per_block - 1) / nthreads_per_block, 1, 1)
     CudaFunction.configure(block_dim, grid_dim)
     CudaFunction.setup(da, db, dc, N)
     f = CudaFunction.new("vadd")
     f.launch

     # Copy output buffer from device memory to host memory.
     memcpy_dtoh(hc, dc, nbytes)
     ...
     ➜ ruby vector_add.rb
     Vector Addition
     Verification completed. All matches? YES
 32. Part 7: Libraries
 33. cublas

     cublasHandle_t handle;
     cublasStatus_t status = cublasCreate(&handle);

     float* h_A = (float*)malloc(N * N * sizeof(h_A[0]));
     ...
     /* Fill the matrices with test data */
     ...
     /* Allocate device memory for the matrices */
     cudaMalloc((void**)&d_A, N * N * sizeof(d_A[0]));
     ...
     /* Initialize the device matrices with the host matrices */
     status = cublasSetVector(N * N, sizeof(h_A[0]), h_A, 1, d_A, 1);
     ...
     /* Performs Sgemm: C <- alpha*A*B + beta*C */
     status = cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, N, N, N, &alpha,
                          d_A, N, d_B, N, &beta, d_C, N);

     /* Allocate host mem & read back the result from device mem */
     h_C = (float*)malloc(N * N * sizeof(h_C[0]));
     status = cublasGetVector(N * N, sizeof(h_C[0]), d_C, 1, h_C, 1);

     /* Memory clean up */
     cudaFree(d_A);
     ...
     /* Shutdown */
     status = cublasDestroy(handle);
 34. cufft

     cudaSetDevice( cutGetMaxGflopsDeviceId() );

     // Allocate & init. host memory for the signal
     Complex* h_signal = (Complex*)malloc(sizeof(Complex) * SIGNAL_SIZE);
     ...
     // Pad signal
     Complex* h_padded_signal;
     ...
     // Allocate device memory for signal
     Complex* d_signal;
     cutilSafeCall( cudaMalloc((void**)&d_signal, mem_size) );

     // Copy host memory to device
     cutilSafeCall( cudaMemcpy(d_signal, h_padded_signal, mem_size,
                               cudaMemcpyHostToDevice) );

     // CUFFT plan
     cufftHandle plan;
     cufftSafeCall( cufftPlan1d(&plan, new_size, CUFFT_C2C, 1) );

     // Transform signal
     cufftSafeCall( cufftExecC2C(plan, (cufftComplex *)d_signal,
                                 (cufftComplex *)d_signal, CUFFT_FORWARD) );

     // Destroy CUFFT context
     cufftSafeCall( cufftDestroy(plan) );

     // Cleanup memory
     cutilSafeCall( cudaFree(d_signal) );
     ...
     cutilDeviceReset();
 35. cusparse

     cusparseHandle_t handle = 0;
     cusparseStatus_t status = cusparseCreate(&handle);

     // create a matrix description for the matrix M
     cusparseMatDescr_t descrM = 0;
     status = cusparseCreateMatDescr(&descrM);
     cusparseSetMatType      ( descrM, CUSPARSE_MATRIX_TYPE_TRIANGULAR );
     cusparseSetMatIndexBase ( descrM, CUSPARSE_INDEX_BASE_ZERO );
     cusparseSetMatDiagType  ( descrM, CUSPARSE_DIAG_TYPE_NON_UNIT );
     cusparseSetMatFillMode  ( descrM, CUSPARSE_FILL_MODE_LOWER );

     // create & perform analysis info for the non-trans & trans case
     cusparseSolveAnalysisInfo_t info = 0, infoTrans = 0;
     cusparseCreateSolveAnalysisInfo(&info);
     cusparseCreateSolveAnalysisInfo(&infoTrans);
     cusparseScsrsv_analysis(handle, CUSPARSE_OPERATION_NON_TRANSPOSE, N, descrM,
                             d_valsICP, d_rowPtrsICP, d_colIndsICP, info);
     cusparseScsrsv_analysis(handle, CUSPARSE_OPERATION_TRANSPOSE, N, descrM,
                             d_valsICP, d_rowPtrsICP, d_colIndsICP, infoTrans);
     ...
     // Solve M z = H H^T z = r by first doing a forward solve: H y = r
     cusparseScsrsv_solve(handle, CUSPARSE_OPERATION_NON_TRANSPOSE, N, 1.0, descrM,
                          d_valsICP, d_rowPtrsICP, d_colIndsICP, info, d_r, d_y);

     // and then a back substitution: H^T z = y
     cusparseScsrsv_solve(handle, CUSPARSE_OPERATION_TRANSPOSE, N, 1.0, descrM,
                          d_valsICP, d_rowPtrsICP, d_colIndsICP, infoTrans, d_y, d_z);
     ...
     cusparseDestroy(handle);
 36. curand

     cudaError_t cudaResult = cudaSuccess;

     // Allocate memory for points
     float *d_points = 0;
     cudaResult = cudaMalloc((void **)&d_points, 2 * m_numSims * sizeof(float));

     // Generate random points in unit square
     curandStatus_t curandResult;
     curandGenerator_t qrng;
     curandResult = curandCreateGenerator(&qrng, CURAND_RNG_QUASI_SOBOL32);
     curandResult = curandSetQuasiRandomGeneratorDimensions(qrng, 2);
     curandResult = curandSetGeneratorOrdering(qrng, CURAND_ORDERING_QUASI_DEFAULT);
     curandResult = curandGenerateUniform(qrng, (float *)d_points, 2 * m_numSims);

     // Cleanup
     curandResult = curandDestroyGenerator(qrng);
     cudaFree(d_points);
 37. npp

     // declare a host image object for an 8-bit grayscale image
     npp::ImageCPU_8u_C1 oHostSrc;

     // load gray-scale image from disk
     npp::loadImage(sFilename, oHostSrc);

     // declare a device image and copy from the host image to the device
     npp::ImageNPP_8u_C1 oDeviceSrc(oHostSrc);

     // create struct with box-filter mask size
     NppiSize oMaskSize = {5, 5};

     // create struct with ROI size given the current mask
     NppiSize oSizeROI = {oDeviceSrc.width()  - oMaskSize.width  + 1,
                          oDeviceSrc.height() - oMaskSize.height + 1};

     // allocate device image of appropriately reduced size
     npp::ImageNPP_8u_C1 oDeviceDst(oSizeROI.width, oSizeROI.height);

     // set anchor point inside the mask to (0, 0)
     NppiPoint oAnchor = {0, 0};

     // run box filter
     nppiFilterBox_8u_C1R(oDeviceSrc.data(), oDeviceSrc.pitch(),
                          oDeviceDst.data(), oDeviceDst.pitch(),
                          oSizeROI, oMaskSize, oAnchor);

     // declare a host image for the result
     npp::ImageCPU_8u_C1 oHostDst(oDeviceDst.size());

     // and copy the device result data into it
     oDeviceDst.copyTo(oHostDst.data(), oHostDst.pitch());
 38. Part 8: Streams
 39. pinned memory

     cudaStream_t stream;
     cutilSafeCall( cudaStreamCreate(&stream) );

     // allocate page locked memory
     cutilSafeCall( cudaMallocHost((void**)&a, nbytes, cudaHostAllocDefault) );

     // allocate device memory
     cutilSafeCall( cudaMalloc((void**)&d_a, nbytes) );

     cutilSafeCall( cudaMemcpyAsync(d_a, a, nbytes, cudaMemcpyHostToDevice, stream) );

     // run kernel and copy result back
     cutilSafeCall( cudaEventRecord(start, stream) );
     kernel<<<N,M,0,stream>>>(&d_a, ... );
     cutilSafeCall( cudaMemcpyAsync(a, d_a, nbytes, cudaMemcpyDeviceToHost, stream) );

     // free
     cudaStreamDestroy(stream);
     cudaFreeHost(a);
     cudaFree(d_a);
 40. chunked computation

     // loop over full data, in bite-sized chunks
     for (int i=0; i<FULL_DATA_SIZE; i+= N) {
         // copy the locked memory to the device, async
         cutilSafeCall( cudaMemcpyAsync(dev_a, host_a+i, N * sizeof(int),
                                        cudaMemcpyHostToDevice, stream) );
         cutilSafeCall( cudaMemcpyAsync(dev_b, host_b+i, N * sizeof(int),
                                        cudaMemcpyHostToDevice, stream) );

         kernel<<<N/256,256,0,stream>>>(dev_a, dev_b, dev_c);

         // copy the data from device to locked memory
         cutilSafeCall( cudaMemcpyAsync(host_c+i, dev_c, N * sizeof(int),
                                        cudaMemcpyDeviceToHost, stream) );
     }

     // wait for all operations to finish
     cutilSafeCall( cudaStreamSynchronize(stream) );
 41. batched computation

     cudaStream_t *streamArray = 0;
     streamArray = (cudaStream_t *)malloc(N * sizeof(cudaStream_t *));
     ...
     for ( int i = 0; i < N ; i++) {
         cudaStreamCreate(&streamArray[i]);
         ...
     }
     ...
     for ( int i = 0; i < N ; i++) {
         cublasSetMatrix (..., devPtrA[i], ...);
         ...
     }
     ...
     for ( int i = 0; i < N ; i++) {
         cublasSetStream(handle, streamArray[i]);
         cublasSgemm(handle, ..., devPtrA[i], devPtrB[i], devPtrC[i], ...);
     }
     cudaThreadSynchronize();
 42. keep in mind
     • use streams to specify the order in which asynchronous operations get executed
     • the idea is to use more than one stream
     • requires a new kind of memory copy, which in turn requires pinned (page-locked)
       host memory
     • free pinned memory when it is no longer needed
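     A sketch of the chunked loop from two slides back, rewritten with two streams so
     that the copies queued in one stream can overlap the kernel queued in the other
     (dev_a0/dev_b0/dev_c0 and dev_a1/dev_b1/dev_c1 are hypothetical per-stream device
     buffers):

     cudaStream_t stream0, stream1;
     cutilSafeCall( cudaStreamCreate(&stream0) );
     cutilSafeCall( cudaStreamCreate(&stream1) );

     for (int i = 0; i < FULL_DATA_SIZE; i += 2 * N) {
         // enqueue one chunk into each stream (copies of host_b elided)
         cutilSafeCall( cudaMemcpyAsync(dev_a0, host_a+i, N * sizeof(int),
                                        cudaMemcpyHostToDevice, stream0) );
         cutilSafeCall( cudaMemcpyAsync(dev_a1, host_a+i+N, N * sizeof(int),
                                        cudaMemcpyHostToDevice, stream1) );

         kernel<<<N/256,256,0,stream0>>>(dev_a0, dev_b0, dev_c0);
         kernel<<<N/256,256,0,stream1>>>(dev_a1, dev_b1, dev_c1);

         cutilSafeCall( cudaMemcpyAsync(host_c+i, dev_c0, N * sizeof(int),
                                        cudaMemcpyDeviceToHost, stream0) );
         cutilSafeCall( cudaMemcpyAsync(host_c+i+N, dev_c1, N * sizeof(int),
                                        cudaMemcpyDeviceToHost, stream1) );
     }

     // wait for both streams to drain
     cutilSafeCall( cudaStreamSynchronize(stream0) );
     cutilSafeCall( cudaStreamSynchronize(stream1) );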
 43. overlap kernel exec. & memcpy

     // Allocate resources
     for( int i =0; i<STREAM_COUNT; ++i ) {
         cudaHostAlloc(&h_data_in[i], memsize, cudaHostAllocDefault);
         cudaMalloc(&d_data_in[i], memsize);
         ...
     }

     int current_stream = 0;

     // Do processing in a loop
     ...
     {
         int next_stream = (current_stream + 1 ) % STREAM_COUNT;

         // Ensure that processing and copying of the last cycle has finished
         cudaEventSynchronize(cycleDone[next_stream]);

         // Process current frame
         kernel<<<grid, block, 0, stream[current_stream]>>>(
             d_data_out[current_stream], d_data_in[current_stream], N, ...);

         // Upload next frame
         cudaMemcpyAsync(d_data_in[next_stream], ...,
                         cudaMemcpyHostToDevice, stream[next_stream]);

         // Download current frame
         cudaMemcpyAsync(h_data_out[current_stream], ...,
                         cudaMemcpyDeviceToHost, stream[current_stream]);

         cudaEventRecord(cycleDone[current_stream], stream[current_stream]);

         current_stream = next_stream;
     }
 44. keep in mind
     • devices with compute capability 1.1 and above can overlap a kernel exec. & a
       memcpy as long as they are issued from different streams
     • kernels are serialized
     • queue work in a way that independent streams can execute in parallel
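     Whether a device supports this copy/compute overlap can be checked at runtime; a
     minimal sketch:

     cudaDeviceProp prop;
     int dev;
     cutilSafeCall( cudaGetDevice(&dev) );
     cutilSafeCall( cudaGetDeviceProperties(&prop, dev) );

     if (!prop.deviceOverlap) {
         // copies and kernels will be serialized, so streams give no overlap here
         printf("Device cannot overlap kernel execution and memcpy\n");
     }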
 45. Part 9: Multi-GPU
 46. zero-copy host memory

     float *a, *d_a;
     ...
     /* Allocate mapped CPU memory. */
     cutilSafeCall( cudaHostAlloc((void **)&a, bytes, cudaHostAllocMapped) );
     ...
     /* Initialize the vectors. */
     for(n = 0; n < nelem; n++) {
         a[n] = rand() / (float)RAND_MAX;
         ...
     }

     /* Get the device pointers for the pinned CPU memory mapped into the GPU
        memory space. */
     cutilSafeCall( cudaHostGetDevicePointer((void **)&d_a, (void *)a, 0) );
     ...
     /* Call the GPU kernel using the device pointers for the mapped memory. */
     ...
     kernel<<<grid, block>>>(d_a, d_b, d_c, nelem);
     ...
     /* Memory clean up */
     cutilSafeCall( cudaFreeHost(a) );
     ...
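     Mapped (zero-copy) allocations only work if the device can map host memory and the
     cudaDeviceMapHost flag was set before the CUDA context was created; a minimal
     sketch of that check (dev is a hypothetical device id):

     cudaDeviceProp prop;
     cutilSafeCall( cudaGetDeviceProperties(&prop, dev) );
     if (!prop.canMapHostMemory) {
         // fall back to explicit cudaMemcpy transfers
     }

     // must be called before the CUDA context for this device is created
     cutilSafeCall( cudaSetDeviceFlags(cudaDeviceMapHost) );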
 47. streams

     // Create streams for issuing GPU commands asynchronously and allocate memory
     for(int i = 0; i < GPU_N; i++) {
         cutilSafeCall( cudaStreamCreate(&stream[i]) );
         cutilSafeCall( cudaMalloc((void**)&d_Data[i], dataN * sizeof(float)) );
         cutilSafeCall( cudaMallocHost((void**)&h_Data[i], dataN * sizeof(float)) );
         // init h_Data
     }

     // Copy data to GPU, launch the kernel and copy data back. All asynchronously
     for(int i = 0; i < GPU_N; i++) {
         // Set device
         cutilSafeCall( cudaSetDevice(i) );

         // Copy input data from CPU
         cutilSafeCall( cudaMemcpyAsync(d_Data[i], h_Data[i], dataN * sizeof(float),
                                        cudaMemcpyHostToDevice, stream[i]) );

         // Perform GPU computations
         kernel<<<blocks, threads, 0, stream[i]>>>(...)

         // Copy back the result
         cutilSafeCall( cudaMemcpyAsync(h_Sum_from_device[i], d_Sum[i],
                                        ACCUM_N * sizeof(float),
                                        cudaMemcpyDeviceToHost, stream[i]) );
     }
 48. process the result

     // Process GPU results
     for(i = 0; i < GPU_N; i++) {
         // Set device
         cutilSafeCall( cudaSetDevice(i) );

         // Wait for all operations to finish
         cudaStreamSynchronize(stream[i]);

         // Shut down this GPU
         cutilSafeCall( cudaFreeHost(h_Data[i]) );
         cutilSafeCall( cudaFree(d_Data[i]) );
         cutilSafeCall( cudaStreamDestroy(stream[i]) );
     }

     // shutdown
     for(int i = 0; i < GPU_N; i++) {
         cutilSafeCall( cudaSetDevice(i) );
         cutilDeviceReset();
     }
 49. keep in mind
     • you can also control each GPU from a separate CPU thread
     • need to allocate portable pinned memory if a different thread needs access to
       one thread's memory
     • pass the flag cudaHostAllocPortable to cudaHostAlloc()
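     A minimal sketch of such a portable allocation (h_buf and nbytes are hypothetical
     names):

     float *h_buf;

     // portable pinned memory is treated as pinned by every CUDA context,
     // not just the one belonging to the thread that allocated it
     cutilSafeCall( cudaHostAlloc((void **)&h_buf, nbytes, cudaHostAllocPortable) );
     ...
     cutilSafeCall( cudaFreeHost(h_buf) );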
 50. mpi + cuda

     // Initialize MPI state
     MPI_CHECK( MPI_Init(&argc, &argv) );

     // Get our MPI node number and node count
     int commSize, commRank;
     MPI_CHECK( MPI_Comm_size(MPI_COMM_WORLD, &commSize) );
     MPI_CHECK( MPI_Comm_rank(MPI_COMM_WORLD, &commRank) );

     if(commRank == 0) { // Are we the root node?
         // initialize dataRoot
         ...
     }

     // Allocate a buffer on each node
     float * dataNode = new float[dataSizePerNode];

     // Dispatch a portion of the input data to each node
     MPI_CHECK( MPI_Scatter(dataRoot, dataSizePerNode, MPI_FLOAT,
                            dataNode, dataSizePerNode, MPI_FLOAT,
                            0, MPI_COMM_WORLD) );

     // if commRank == 0 then free dataRoot
     ...
     kernel<<<gridSize, blockSize>>>(dataNode, ...);

     // Reduction to the root node
     float sumNode = sum(dataNode, dataSizePerNode);
     float sumRoot;
     MPI_CHECK( MPI_Reduce(&sumNode, &sumRoot, 1, MPI_FLOAT, MPI_SUM, 0,
                           MPI_COMM_WORLD) );

     MPI_CHECK( MPI_Finalize() );
 51. P2P & unified virtual address space

     // Enable peer access
     cutilSafeCall(cudaSetDevice(gpuid_tesla[0]));
     cutilSafeCall(cudaDeviceEnablePeerAccess(gpuid_tesla[1], gpuid_tesla[0]));
     ...
     // Allocate buffers
     cudaSetDevice(gpuid_tesla[0]); cudaMalloc(&g0, buf_size);
     cudaSetDevice(gpuid_tesla[1]); cudaMalloc(&g1, buf_size);

     // Ping-pong copy between GPUs
     cudaMemcpy(g1, g0, buf_size, cudaMemcpyDefault);

     // Prepare host buffer and copy to GPU 0
     cudaSetDevice(gpuid_tesla[0]); cudaMemcpy(g0, h0, buf_size, cudaMemcpyDefault);

     // Run kernel on GPU 1, reading input from the GPU 0 buffer, writing
     // output to the GPU 1 buffer: dst[idx] = src[idx] * 2.0f
     cudaSetDevice(gpuid_tesla[1]); kernel<<<blocks, threads>>>(g0, g1);
     cutilDeviceSynchronize();

     // Disable peer access (also unregisters memory for non-UVA cases)
     cudaSetDevice(gpuid_tesla[0]); cudaDeviceDisablePeerAccess(gpuid_tesla[1]);
     cudaSetDevice(gpuid_tesla[1]); cudaDeviceDisablePeerAccess(gpuid_tesla[0]);
     cudaFree(g0);
     ...
 52. References
 53. Thank you. Download the slides (2 MB pdf) from http://bit.ly/cuda-deep-dive
