Matrix multiplication using CUDA

Transcript

  • 1. Basic Example: Matrix Multiplication using CUDA. General-Purpose Programming of Massively Parallel Graphics Processors, Shiraz University, Spring 2010. Instructor: Reza Azimi. Some materials/slides are adapted from Andreas Moshovos' course at the University of Toronto and the UIUC course by Wen-Mei Hwu and David Kirk.

    Sequential matrix multiplication on the host (CPU): each element P[i][j] is the dot product of row i of M and column j of N, and all matrices are Width x Width.

        void MatrixMulOnHost(float* M, float* N, float* P, int Width)
        {
          for (int i = 0; i < Width; ++i)
            for (int j = 0; j < Width; ++j) {
              float sum = 0;
              for (int k = 0; k < Width; ++k) {
                float a = M[i * Width + k];
                float b = N[k * Width + j];
                sum += a * b;
              }
              P[i * Width + j] = sum;
            }
        }

    [Slide diagram: square matrices M, N, P of size WIDTH x WIDTH with the indices i, j, k marked on their rows and columns.]

    Adapted from: David Kirk/NVIDIA and Wen-mei W. Hwu, UIUC.
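    As a sketch that is not part of the original slides, a minimal host driver for the function above could look like the following; the matrix size (4) and the initialization values are arbitrary assumptions chosen only to exercise the code.

        #include <stdio.h>
        #include <stdlib.h>

        /* Declaration of the host routine from the slide above. */
        void MatrixMulOnHost(float* M, float* N, float* P, int Width);

        int main(void)
        {
          const int Width = 4;   /* arbitrary small size for illustration */
          float *M = (float*)malloc(Width * Width * sizeof(float));
          float *N = (float*)malloc(Width * Width * sizeof(float));
          float *P = (float*)malloc(Width * Width * sizeof(float));

          /* Fill M with ones and N with the identity matrix, so P should equal M. */
          for (int i = 0; i < Width * Width; ++i) {
            M[i] = 1.0f;
            N[i] = (i / Width == i % Width) ? 1.0f : 0.0f;
          }

          MatrixMulOnHost(M, N, P, Width);

          printf("P[0] = %f (expected 1.0)\n", P[0]);
          free(M); free(N); free(P);
          return 0;
        }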
  • 2. A first CUDA kernel: one thread per output element, using a single thread block. Each thread's (threadIdx.x, threadIdx.y) coordinates select the column and row of d_P it computes.

        __global__ void MatrixMulKernel(float* d_M, float* d_N, float* d_P, int Width)
        {
          int row = threadIdx.y;
          int col = threadIdx.x;

          float P_val = 0;
          for (int k = 0; k < Width; ++k) {
            float M_elem = d_M[row * Width + k];
            float N_elem = d_N[k * Width + col];
            P_val += M_elem * N_elem;
          }
          d_P[row * Width + col] = P_val;
        }

    [Slide diagram: d_M, d_N, d_P of size WIDTH x WIDTH; col = threadIdx.x, row = threadIdx.y.]

    The host-side wrapper allocates device memory and copies the inputs over:

        void MatrixMulOnDevice(float* M, float* N, float* P, int Width)
        {
          int matrix_size = Width * Width * sizeof(float);
          float *d_M, *d_N, *d_P;

          // Allocate and load M and N in device memory
          cudaMalloc((void**)&d_M, matrix_size);
          cudaMemcpy(d_M, M, matrix_size, cudaMemcpyHostToDevice);
          cudaMalloc((void**)&d_N, matrix_size);
          cudaMemcpy(d_N, N, matrix_size, cudaMemcpyHostToDevice);

          // Allocate P on the device
          cudaMalloc((void**)&d_P, matrix_size);

    (continued on the next slide)
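    A hedged aside that is not from the slides: cudaMalloc and cudaMemcpy return a cudaError_t, and the calls above are usually wrapped so that failures are reported instead of silently ignored. A minimal checking macro might look like this; the macro name CUDA_CHECK is an assumption, not something the deck defines.

        #include <stdio.h>
        #include <stdlib.h>
        #include <cuda_runtime.h>

        /* Hypothetical helper macro: report and abort on any CUDA runtime error. */
        #define CUDA_CHECK(call)                                                \
          do {                                                                  \
            cudaError_t err = (call);                                           \
            if (err != cudaSuccess) {                                           \
              fprintf(stderr, "CUDA error %s at %s:%d\n",                       \
                      cudaGetErrorString(err), __FILE__, __LINE__);             \
              exit(EXIT_FAILURE);                                               \
            }                                                                   \
          } while (0)

        /* Usage, mirroring the allocation code on the slide:
             CUDA_CHECK(cudaMalloc((void**)&d_M, matrix_size));
             CUDA_CHECK(cudaMemcpy(d_M, M, matrix_size, cudaMemcpyHostToDevice));  */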
  • 3. Continuation of MatrixMulOnDevice: configure the launch, run the kernel, copy the result back to the host, and free the device memory.

          // Set up the execution configuration
          dim3 dimGrid(1, 1);
          dim3 dimBlock(Width, Width);

          // Launch the device computation threads!
          MatrixMulKernel<<<dimGrid, dimBlock>>>(d_M, d_N, d_P, Width);

          // Copy back the results from device to host
          cudaMemcpy(P, d_P, matrix_size, cudaMemcpyDeviceToHost);

          // Free up the device memory matrices
          cudaFree(d_P);
          cudaFree(d_M);
          cudaFree(d_N);
        }

    © David Kirk/NVIDIA and Wen-mei W. Hwu, 2007-2009, ECE 498AL Spring 2010, University of Illinois, Urbana-Champaign.

    Only One Thread Block Used. One block of threads computes the entire d_P matrix. Each thread:
      - loads a row of matrix d_M,
      - loads a column of matrix d_N,
      - performs one multiply and one addition for each pair of d_M and d_N elements,
      - computes one element of d_P.
    The size of the matrix is therefore limited by the number of threads allowed in a thread block.

    [Slide diagram: Grid 1 containing Block 1; Thread (2, 2) computes one element of d_P from a row of d_M and a column of d_N.]
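    As a sketch that is not part of the original deck, one way to tie the two versions together is to run MatrixMulOnDevice and compare its output against MatrixMulOnHost element by element; the helper name ResultsMatch and the tolerance value are assumptions.

        #include <math.h>
        #include <stdio.h>

        /* Hypothetical helper: compare device and host results within a tolerance. */
        int ResultsMatch(const float* P_dev, const float* P_host, int Width)
        {
          for (int i = 0; i < Width * Width; ++i)
            if (fabsf(P_dev[i] - P_host[i]) > 1e-3f)   /* tolerance chosen arbitrarily */
              return 0;
          return 1;
        }

        /* Usage sketch:
             MatrixMulOnHost(M, N, P_host, Width);
             MatrixMulOnDevice(M, N, P_dev, Width);
             printf("%s\n", ResultsMatch(P_dev, P_host, Width) ? "PASS" : "FAIL");  */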
  • 4. Solution 1: Give Each Thread More Work. Each thread is assigned a TILE_WIDTH x TILE_WIDTH tile of d_P entries instead of a single element, so one block can cover a larger matrix. The drawback: with one block we still utilize only one multiprocessor! A possible launch configuration for this kernel is sketched after the code.

    [Slide diagram: d_P divided into TILE_WIDTH x TILE_WIDTH tiles; (threadIdx.x, threadIdx.y) selects a tile.]

        __global__ void MatrixMulKernel(float* d_M, float* d_N, float* d_P, int Width)
        {
          int start_row = threadIdx.y * TILE_WIDTH;
          int end_row   = start_row + TILE_WIDTH;
          int start_col = threadIdx.x * TILE_WIDTH;
          int end_col   = start_col + TILE_WIDTH;

          for (int row = start_row; row < end_row; row++) {
            for (int col = start_col; col < end_col; col++) {
              float P_val = 0;
              for (int k = 0; k < Width; ++k) {
                float M_elem = d_M[row * Width + k];
                float N_elem = d_N[k * Width + col];
                P_val += M_elem * N_elem;
              }
              d_P[row * Width + col] = P_val;
            }
          }
        }
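    The slide does not show the matching launch configuration. A plausible sketch, assuming TILE_WIDTH divides Width evenly and that a single block is still used (both assumptions on my part), is:

        #define TILE_WIDTH 4   /* assumed value for illustration */

        // One block; each thread covers a TILE_WIDTH x TILE_WIDTH tile of d_P,
        // so Width/TILE_WIDTH threads are needed in each dimension.
        dim3 dimGrid(1, 1);
        dim3 dimBlock(Width / TILE_WIDTH, Width / TILE_WIDTH);
        MatrixMulKernel<<<dimGrid, dimBlock>>>(d_M, d_N, d_P, Width);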
  • 5. Solution 2: Use Multiple Thread Blocks. Each element of d_P is assigned to a thread, and each rectangular region of d_P is assigned to a thread block; the global row and column are computed from the block and thread indices.

    [Slide diagram: d_P divided into blockDim.x x blockDim.y regions; (blockIdx.x, blockIdx.y) selects the region assigned to a thread block, (threadIdx.x, threadIdx.y) the element assigned to a thread.]

        __global__ void MatrixMulKernel(float* d_M, float* d_N, float* d_P, int Width)
        {
          int row = blockIdx.y * blockDim.y + threadIdx.y;
          int col = blockIdx.x * blockDim.x + threadIdx.x;

          float P_val = 0;
          for (int k = 0; k < Width; ++k) {
            float M_elem = d_M[row * Width + k];
            float N_elem = d_N[k * Width + col];
            P_val += M_elem * N_elem;
          }
          d_P[row * Width + col] = P_val;
        }
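    One hedged addition not shown on the slide: if Width is not an exact multiple of the block dimensions, the kernel above reads and writes out of range. A common guard (my addition, with the name MatrixMulKernelGuarded as an assumption) is:

        __global__ void MatrixMulKernelGuarded(float* d_M, float* d_N, float* d_P, int Width)
        {
          int row = blockIdx.y * blockDim.y + threadIdx.y;
          int col = blockIdx.x * blockDim.x + threadIdx.x;

          if (row >= Width || col >= Width)   // skip threads that fall outside the matrix
            return;

          float P_val = 0;
          for (int k = 0; k < Width; ++k)
            P_val += d_M[row * Width + k] * d_N[k * Width + col];
          d_P[row * Width + col] = P_val;
        }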
  • 6. The corresponding execution configuration now uses a grid of blocks:

        int block_size = 64;

        // Set up the execution configuration
        dim3 dimGrid(Width / block_size, Width / block_size);
        dim3 dimBlock(block_size, block_size);

        // Launch the device computation threads!
        MatrixMulKernel<<<dimGrid, dimBlock>>>(d_M, d_N, d_P, Width);
        ...

    Now the size of the matrix is limited by the number of threads allowed on the device:
      - Max number of threads per block: 512
      - Max number of blocks per streaming multiprocessor: 8
      - Number of streaming multiprocessors: 30
      - Total number of threads available = 30 x 8 x 512 = 122,880
    Let me double-check this!
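    The hardware numbers above describe one particular GPU generation. Note also that with block_size = 64, dimBlock(64, 64) asks for 4,096 threads per block, which exceeds the 512-thread-per-block limit quoted on this same slide, so a smaller block_size (such as 16) would be needed in practice. As a hedged sketch that is not part of the slides, the actual limits can be queried at run time:

        #include <stdio.h>
        #include <cuda_runtime.h>

        int main(void)
        {
          cudaDeviceProp prop;
          cudaGetDeviceProperties(&prop, 0);   // properties of device 0

          printf("Max threads per block:     %d\n", prop.maxThreadsPerBlock);
          printf("Streaming multiprocessors: %d\n", prop.multiProcessorCount);
          printf("Max block dimensions:      %d x %d x %d\n",
                 prop.maxThreadsDim[0], prop.maxThreadsDim[1], prop.maxThreadsDim[2]);
          return 0;
        }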
  • 7. Combining the two solutions: multiple thread blocks, and each thread computes a TILE_WIDTH x TILE_WIDTH tile of d_P.

    [Slide diagram: d_P divided among thread blocks via (blockIdx.x, blockIdx.y) and (blockDim.x, blockDim.y); within a block, (threadIdx.x, threadIdx.y) selects a TILE_WIDTH x TILE_WIDTH tile.]

        __global__ void MatrixMulKernel(float* d_M, float* d_N, float* d_P, int Width)
        {
          int start_row = (blockDim.y * blockIdx.y + threadIdx.y) * TILE_WIDTH;
          int end_row   = start_row + TILE_WIDTH;
          int start_col = (blockDim.x * blockIdx.x + threadIdx.x) * TILE_WIDTH;
          int end_col   = start_col + TILE_WIDTH;

          for (int row = start_row; row < end_row; row++) {
            for (int col = start_col; col < end_col; col++) {
              float P_val = 0;
              for (int k = 0; k < Width; ++k) {
                float M_elem = d_M[row * Width + k];
                float N_elem = d_N[k * Width + col];
                P_val += M_elem * N_elem;
              }
              d_P[row * Width + col] = P_val;
            }
          }
        }
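    The deck does not show the execution configuration for this combined kernel. A plausible sketch, assuming Width is an exact multiple of block_size * TILE_WIDTH (both values here are assumptions of mine), would be:

        #define TILE_WIDTH 4      /* assumed tile size */
        int block_size = 16;      /* assumed threads per block dimension */

        // Each block now covers block_size * TILE_WIDTH rows and columns of d_P,
        // so far fewer blocks are needed than in Solution 2.
        dim3 dimGrid(Width / (block_size * TILE_WIDTH), Width / (block_size * TILE_WIDTH));
        dim3 dimBlock(block_size, block_size);
        MatrixMulKernel<<<dimGrid, dimBlock>>>(d_M, d_N, d_P, Width);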
