Matrix multiplication using CUDA

  • Basic Example: Matrix Multiplication using CUDA

    General-purpose Programming of Massively Parallel Graphics Processors
    Shiraz University, Spring 2010
    Instructor: Reza Azimi

    Some materials/slides are adapted from:
    - Andreas Moshovos' course at the University of Toronto
    - the UIUC course by Wen-mei Hwu and David Kirk

  • Matrix Multiplication on the Host (CPU)

    A straightforward sequential version: three nested loops, where the
    innermost loop computes element (i, j) of P as the dot product of
    row i of M and column j of N.

        void MatrixMulOnHost(float* M, float* N, float* P, int Width)
        {
            for (int i = 0; i < Width; ++i)
                for (int j = 0; j < Width; ++j) {
                    float sum = 0;
                    for (int k = 0; k < Width; ++k) {
                        float a = M[i * Width + k];
                        float b = N[k * Width + j];
                        sum += a * b;
                    }
                    P[i * Width + j] = sum;
                }
        }

    [Figure: Width x Width matrices M, N, and P; row i of M and column
    j of N combine into element (i, j) of P.]

    Adapted from: David Kirk/NVIDIA and Wen-mei W. Hwu, UIUC
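    The slides jump straight to the GPU version next; as a quick sanity
    check, here is a minimal host-only driver for the function above.
    It is not part of the original deck, and the 4 x 4 size and the
    constant fill values are arbitrary choices for illustration.

        #include <stdio.h>
        #include <stdlib.h>

        int main(void)
        {
            const int Width = 4;
            float* M = (float*)malloc(Width * Width * sizeof(float));
            float* N = (float*)malloc(Width * Width * sizeof(float));
            float* P = (float*)malloc(Width * Width * sizeof(float));

            for (int i = 0; i < Width * Width; ++i) {
                M[i] = 1.0f;   /* every element of M is 1 */
                N[i] = 2.0f;   /* every element of N is 2 */
            }

            MatrixMulOnHost(M, N, P, Width);

            /* each element of P sums Width products: 4 * (1 * 2) = 8 */
            printf("P[0] = %.1f (expected 8.0)\n", P[0]);

            free(M); free(N); free(P);
            return 0;
        }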
  • A First Kernel: One Thread per d_P Element

    The same computation, with the two outer loops replaced by the
    thread index: thread (threadIdx.x, threadIdx.y) computes the single
    element of d_P at (col, row). The d_ prefix marks pointers into
    device memory.

        __global__ void MatrixMulKernel(float* d_M, float* d_N,
                                        float* d_P, int Width)
        {
            int row = threadIdx.y;
            int col = threadIdx.x;

            float P_val = 0;
            for (int k = 0; k < Width; ++k) {
                float M_elem = d_M[row * Width + k];
                float N_elem = d_N[k * Width + col];
                P_val += M_elem * N_elem;
            }
            d_P[row * Width + col] = P_val;
        }

    [Figure: thread (threadIdx.x, threadIdx.y) reads one row of d_M and
    one column of d_N to produce one element of d_P.]

    Adapted from: David Kirk/NVIDIA and Wen-mei W. Hwu, UIUC

  • Host Code: Allocate Device Memory and Copy the Inputs

        void MatrixMulOnDevice(float* M, float* N, float* P, int Width)
        {
            int matrix_size = Width * Width * sizeof(float);
            float *d_M, *d_N, *d_P;

            // Allocate and load M and N to device memory
            cudaMalloc(&d_M, matrix_size);
            cudaMemcpy(d_M, M, matrix_size, cudaMemcpyHostToDevice);
            cudaMalloc(&d_N, matrix_size);
            cudaMemcpy(d_N, N, matrix_size, cudaMemcpyHostToDevice);

            // Allocate P on the device
            cudaMalloc(&d_P, matrix_size);

            // (continued on the next slide)

    Adapted from: David Kirk/NVIDIA and Wen-mei W. Hwu, UIUC
  • Host Code: Launch the Kernel, Copy Back, Free

            // Setup the execution configuration
            dim3 dimGrid(1, 1);
            dim3 dimBlock(Width, Width);

            // Launch the device computation threads!
            MatrixMulKernel<<<dimGrid, dimBlock>>>(d_M, d_N, d_P, Width);

            // Copy back the results from device to host
            cudaMemcpy(P, d_P, matrix_size, cudaMemcpyDeviceToHost);

            // Free up the device memory matrices
            cudaFree(d_P);
            cudaFree(d_M);
            cudaFree(d_N);
        }

    © David Kirk/NVIDIA and Wen-mei W. Hwu, 2007-2009
    ECE 498AL Spring 2010, University of Illinois, Urbana-Champaign

  • Only One Thread Block Used

    One block of threads computes the matrix d_P. Each thread:
    - loads a row of matrix d_M
    - loads a column of matrix d_N
    - performs one multiply and one addition for each pair of d_M and
      d_N elements
    - computes one element of d_P

    The size of the matrix is limited by the number of threads allowed
    in a thread block.

    [Figure: Grid 1 contains a single Block 1; Thread (2, 2) combines
    row 2 of d_M with column 2 of d_N to produce one element of d_P.]

    Adapted from: David Kirk/NVIDIA and Wen-mei W. Hwu, UIUC
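    None of the host code above checks return values; every CUDA
    runtime call returns a cudaError_t, and a failed allocation or
    launch is otherwise silent. A common wrapper macro, sketched here
    as a suggestion rather than as part of the original deck:

        #include <stdio.h>
        #include <stdlib.h>

        #define CUDA_CHECK(call)                                          \
            do {                                                          \
                cudaError_t err = (call);                                 \
                if (err != cudaSuccess) {                                 \
                    fprintf(stderr, "CUDA error at %s:%d: %s\n",          \
                            __FILE__, __LINE__, cudaGetErrorString(err)); \
                    exit(EXIT_FAILURE);                                   \
                }                                                         \
            } while (0)

        // Usage:
        //   CUDA_CHECK(cudaMalloc(&d_M, matrix_size));
        //   MatrixMulKernel<<<dimGrid, dimBlock>>>(d_M, d_N, d_P, Width);
        //   CUDA_CHECK(cudaGetLastError());       // catches launch errors
        //   CUDA_CHECK(cudaDeviceSynchronize());  // catches async errors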
  • Solution 1: Give Each Thread More Work

    Instead of one element, each thread is assigned a tile of
    TILE_WIDTH x TILE_WIDTH entries of d_P, so a single block can cover
    a matrix larger than the per-block thread limit.

    [Figure: d_P divided into TILE_WIDTH x TILE_WIDTH tiles; thread
    (threadIdx.x, threadIdx.y) owns one tile.]

        __global__ void MatrixMulKernel(float* d_M, float* d_N,
                                        float* d_P, int Width)
        {
            int start_row = threadIdx.y * TILE_WIDTH;
            int end_row   = start_row + TILE_WIDTH;
            int start_col = threadIdx.x * TILE_WIDTH;
            int end_col   = start_col + TILE_WIDTH;

            for (int row = start_row; row < end_row; row++) {
                for (int col = start_col; col < end_col; col++) {
                    float P_val = 0;
                    for (int k = 0; k < Width; ++k) {
                        float M_elem = d_M[row * Width + k];
                        float N_elem = d_N[k * Width + col];
                        P_val += M_elem * N_elem;
                    }
                    d_P[row * Width + col] = P_val;
                }
            }
        }

    With one block we utilize only one multiprocessor!
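    The slides do not show the launch configuration that matches this
    kernel. A sketch, under the assumptions that TILE_WIDTH is a
    compile-time constant and that Width is an exact multiple of it:

        #define TILE_WIDTH 4   // illustrative value, not from the slides

        dim3 dimGrid(1, 1);    // still a single block
        dim3 dimBlock(Width / TILE_WIDTH, Width / TILE_WIDTH);
        MatrixMulKernel<<<dimGrid, dimBlock>>>(d_M, d_N, d_P, Width);

    For example, Width = 64 with TILE_WIDTH = 4 gives a 16 x 16 block
    (256 threads, within the 512-per-block limit), each thread
    computing a 4 x 4 tile of d_P.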
  • Solution 2: Use Multiple Thread Blocks

    Go back to one element per thread, but launch many blocks: each
    blockDim.x x blockDim.y region of d_P is assigned to a thread
    block, and each element within it to one thread.

    [Figure: d_P partitioned into a grid of blocks indexed by
    (blockIdx.x, blockIdx.y); within a block, threads are indexed by
    (threadIdx.x, threadIdx.y).]

        __global__ void MatrixMulKernel(float* d_M, float* d_N,
                                        float* d_P, int Width)
        {
            int row = blockIdx.y * blockDim.y + threadIdx.y;
            int col = blockIdx.x * blockDim.x + threadIdx.x;

            float P_val = 0;
            for (int k = 0; k < Width; ++k) {
                float M_elem = d_M[row * Width + k];
                float N_elem = d_N[k * Width + col];
                P_val += M_elem * N_elem;
            }
            d_P[row * Width + col] = P_val;
        }
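    This kernel assumes Width is an exact multiple of the block
    dimensions. A guarded variant (not in the original deck) handles
    arbitrary sizes by letting out-of-range threads return early:

        __global__ void MatrixMulKernelGuarded(float* d_M, float* d_N,
                                               float* d_P, int Width)
        {
            int row = blockIdx.y * blockDim.y + threadIdx.y;
            int col = blockIdx.x * blockDim.x + threadIdx.x;
            if (row >= Width || col >= Width)
                return;   // thread falls outside the matrix

            float P_val = 0;
            for (int k = 0; k < Width; ++k)
                P_val += d_M[row * Width + k] * d_N[k * Width + col];
            d_P[row * Width + col] = P_val;
        }

    The grid must then be rounded up, e.g.
    dimGrid((Width + block_size - 1) / block_size, ...).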
  • Execution Configuration with Multiple Blocks

        // block_size * block_size threads per block; the product must
        // stay within the per-block limit (512 here), so 16 is safe
        int block_size = 16;

        // Setup the execution configuration
        dim3 dimGrid(Width / block_size, Width / block_size);
        dim3 dimBlock(block_size, block_size);

        // Launch the device computation threads!
        MatrixMulKernel<<<dimGrid, dimBlock>>>(d_M, d_N, d_P, Width);
        …

    The size of the matrix is now limited by the number of threads
    allowed on a device.

  • Device Limits

    - Max number of threads per block: 512
    - Max number of blocks per streaming multiprocessor: 8
    - Number of streaming multiprocessors: 30
    - Total number of threads available: 30 x 8 x 512 = 122,880

    Let me double-check this!
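    One way to actually double-check those numbers (not shown in the
    deck) is to query them at runtime:

        #include <stdio.h>
        #include <cuda_runtime.h>

        int main(void)
        {
            cudaDeviceProp prop;
            cudaGetDeviceProperties(&prop, 0);   // properties of device 0

            printf("Max threads per block:       %d\n", prop.maxThreadsPerBlock);
            printf("Streaming multiprocessors:   %d\n", prop.multiProcessorCount);
            printf("Max threads resident per SM: %d\n",
                   prop.maxThreadsPerMultiProcessor);
            return 0;
        }

    The check matters: on compute-capability-1.3 hardware of this era,
    at most 1,024 threads can be resident per SM, well below 8 blocks x
    512 threads, so 122,880 is an upper bound on schedulable threads
    rather than a count of concurrently resident ones.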
  • Combining Both: Multiple Blocks, a Tile per Thread

    [Figure: d_P partitioned among thread blocks; within each block,
    every thread is assigned a TILE_WIDTH x TILE_WIDTH tile.]

        __global__ void MatrixMulKernel(float* d_M, float* d_N,
                                        float* d_P, int Width)
        {
            // each block covers blockDim * TILE_WIDTH rows and columns,
            // so the global thread index is scaled by the tile width
            int start_row = (blockIdx.y * blockDim.y + threadIdx.y) * TILE_WIDTH;
            int end_row   = start_row + TILE_WIDTH;
            int start_col = (blockIdx.x * blockDim.x + threadIdx.x) * TILE_WIDTH;
            int end_col   = start_col + TILE_WIDTH;

            for (int row = start_row; row < end_row; row++) {
                for (int col = start_col; col < end_col; col++) {
                    float P_val = 0;
                    for (int k = 0; k < Width; ++k) {
                        float M_elem = d_M[row * Width + k];
                        float N_elem = d_N[k * Width + col];
                        P_val += M_elem * N_elem;
                    }
                    d_P[row * Width + col] = P_val;
                }
            }
        }
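    A matching launch configuration, sketched under the same
    assumptions as before (TILE_WIDTH a compile-time constant, Width an
    exact multiple of block_size * TILE_WIDTH):

        int block_size = 16;                 // 16 x 16 = 256 threads per block
        int span = block_size * TILE_WIDTH;  // rows/columns covered per block
        dim3 dimGrid(Width / span, Width / span);
        dim3 dimBlock(block_size, block_size);
        MatrixMulKernel<<<dimGrid, dimBlock>>>(d_M, d_N, d_P, Width);

    Compared to Solution 2, the grid shrinks by a factor of TILE_WIDTH
    in each dimension, because every thread now produces a whole tile
    of d_P instead of a single element.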