Basic Example: Matrix Multiplication using CUDA

General-purpose Programming of Massively Parallel Graphics Processors
Shiraz University, Spring 2010
Instructor: Reza Azimi

Some materials/slides are adapted from:
Andreas Moshovos’ Course at the University of Toronto
UIUC course by Wen-Mei Hwu and David Kirk




A Simple Host Version in C



 void MatrixMulOnHost(float* M, float* N, float* P, int Width) {
   for (int i = 0; i < Width; ++i)
     for (int j = 0; j < Width; ++j) {
       float sum = 0;
       for (int k = 0; k < Width; ++k) {
         float a = M[i * Width + k];
         float b = N[k * Width + j];
         sum += a * b;
       }
       P[i * Width + j] = sum;
     }
 }
[Figure: row i of M is combined with column j of N to produce element (i, j) of P; all matrices are WIDTH x WIDTH]
Adapted From:
David Kirk/NVIDIA and Wen-mei W. Hwu, UIUC
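As a quick sanity check, the host version can be driven with a few lines of setup. A minimal sketch follows; the 4x4 size and the constant test data are my assumptions, not part of the original deck:

  #include <stdio.h>
  #include <stdlib.h>

  int main(void) {
    const int Width = 4;                        // small assumed test size
    float *M = (float*)malloc(Width * Width * sizeof(float));
    float *N = (float*)malloc(Width * Width * sizeof(float));
    float *P = (float*)malloc(Width * Width * sizeof(float));
    for (int i = 0; i < Width * Width; ++i) {   // fill with known values
      M[i] = 1.0f;
      N[i] = 2.0f;
    }
    MatrixMulOnHost(M, N, P, Width);
    printf("P[0] = %f\n", P[0]);                // expect Width * 1 * 2 = 8
    free(M); free(N); free(P);
    return 0;
  }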




A Simple CUDA Kernel




 __global__
 void MatrixMulKernel(float* d_M,
                      float* d_N,
                      float* d_P,
                      int Width) {
   int row = threadIdx.y;
   int col = threadIdx.x;
   float P_val = 0;
   for (int k = 0; k < Width; ++k) {
     float M_elem = d_M[row * Width + k];
     float N_elem = d_N[k * Width + col];
     P_val += M_elem * N_elem;
   }
   d_P[row * Width + col] = P_val;
 }




[Figure: thread (col, row) = (threadIdx.x, threadIdx.y) reads one row of d_M and one column of d_N to compute a single element of d_P; all matrices are WIDTH x WIDTH]
Adapted From:
David Kirk/NVIDIA and Wen-mei W. Hwu, UIUC




Host Code: Allocating Device Memory and Copying Data




   void MatrixMulOnDevice(float* M,
                          float* N,
                          float* P,
                          int Width)
   {
      int matrix_size = Width * Width * sizeof(float);
      float *d_M, *d_N, *d_P;

      // Allocate and load M and N in device memory
      cudaMalloc((void**)&d_M, matrix_size);
      cudaMemcpy(d_M, M, matrix_size, cudaMemcpyHostToDevice);

      cudaMalloc((void**)&d_N, matrix_size);
      cudaMemcpy(d_N, N, matrix_size, cudaMemcpyHostToDevice);

      // Allocate P on the device
      cudaMalloc((void**)&d_P, matrix_size);

Adapted From:
David Kirk/NVIDIA and Wen-mei W. Hwu, UIUC




Host Code: Launching the Kernel




      // Setup the execution configuration
      dim3 dimGrid(1, 1);
      dim3 dimBlock(Width, Width);

      // Launch the device computation threads!
      MatrixMulKernel<<<dimGrid, dimBlock>>>(d_M, d_N, d_P, Width);

         // Copy back the results from device to host
         cudaMemcpy(P, d_P, matrix_size, cudaMemcpyDeviceToHost);

         // Free up the device memory matrices
         cudaFree(d_P);
         cudaFree(d_M);
         cudaFree(d_N);
     © David Kirk/NVIDIA and Wen-mei W. Hwu, 2007-2009
     ECE 498AL Spring 2010, University of Illinois, Urbana-Champaign
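None of the CUDA calls above report failures. A minimal error-checking pattern is sketched below; the CHECK macro is my addition (the name is hypothetical), built only on standard runtime calls:

  #include <stdio.h>
  #include <stdlib.h>
  #include <cuda_runtime.h>

  // Hypothetical helper: abort with a message if a CUDA call fails.
  #define CHECK(call)                                              \
    do {                                                           \
      cudaError_t err = (call);                                    \
      if (err != cudaSuccess) {                                    \
        fprintf(stderr, "CUDA error at %s:%d: %s\n",               \
                __FILE__, __LINE__, cudaGetErrorString(err));      \
        exit(1);                                                   \
      }                                                            \
    } while (0)

  // Usage:
  //   CHECK(cudaMalloc((void**)&d_M, matrix_size));
  //   MatrixMulKernel<<<dimGrid, dimBlock>>>(d_M, d_N, d_P, Width);
  //   CHECK(cudaGetLastError());       // catches bad launch configurations
  //   CHECK(cudaDeviceSynchronize());  // catches errors raised inside the kernel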




    Only One Thread Block Used
- One block of threads computes matrix d_P
- Each thread:
  - Loads a row of matrix d_M
  - Loads a column of matrix d_N
  - Performs one multiply-add for each pair of d_M and d_N elements
  - Computes one element of d_P

[Figure: Grid 1 with a single Block 1; thread (2, 2) combines a row of d_M with a column of d_N to compute one element of d_P]

Size of matrix is limited by the number of threads allowed in a thread block.
Adapted From:
David Kirk/NVIDIA and Wen-mei W. Hwu, UIUC




Assigning a Tile of d_P to Each Thread




[Figure: d_P divided into TILE_WIDTH x TILE_WIDTH tiles; the tile at (threadIdx.x, threadIdx.y) is covered by one thread]

Each thread is assigned a tile of TILE_WIDTH x TILE_WIDTH entries.






Solution 1: Give Each Thread More Work
   __global__ void MatrixMulKernel(float* d_M,
                                   float* d_N,
                                   float* d_P,
                                   int Width) {
     int start_row = threadIdx.y * TILE_WIDTH;
     int end_row = start_row + TILE_WIDTH;
     int start_col = threadIdx.x * TILE_WIDTH;
     int end_col = start_col + TILE_WIDTH;

     for (int row = start_row; row < end_row; row++) {
        for (int col = start_col; col < end_col; col++) {
           float P_val = 0;
           for (int k = 0; k < Width; ++k) {
              float M_elem = d_M[row * Width + k];
              float N_elem = d_N[k * Width + col];
              P_val += M_elem * N_elem;
           }
           d_P[row * Width + col] = P_val;
        }
     }
   }

   With one block we utilize only one multiprocessor!
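The deck does not show the matching launch for Solution 1; a minimal sketch, assuming Width is a multiple of TILE_WIDTH:

  // Still a single block, but each thread now owns a
  // TILE_WIDTH x TILE_WIDTH tile of d_P.
  dim3 dimGrid(1, 1);
  dim3 dimBlock(Width / TILE_WIDTH, Width / TILE_WIDTH);
  MatrixMulKernel<<<dimGrid, dimBlock>>>(d_M, d_N, d_P, Width);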




Partitioning d_P Across Thread Blocks


[Figure: d_P partitioned into a grid of thread blocks; (blockIdx.x, blockIdx.y) select a blockDim.x x blockDim.y sub-matrix assigned to a thread block, and each element inside it is assigned to one thread (threadIdx.x, threadIdx.y)]





Solution 2: Use Multiple Thread Blocks
__global__
void MatrixMulKernel(float* d_M,
                     float* d_N,
                     float* d_P,
                     int Width) {
  int row = blockIdx.y * blockDim.y + threadIdx.y;
  int col = blockIdx.x * blockDim.x + threadIdx.x;
  float P_val = 0;

  for (int k = 0; k < Width; ++k) {
    float M_elem = d_M[row * Width + k];
    float N_elem = d_N[k * Width + col];
    P_val += M_elem * N_elem;
  }
  d_P[row * Width + col] = P_val;
}
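This kernel assumes Width is an exact multiple of the block dimensions. If it is not, some threads fall outside the matrix; a common guard (my addition, not in the original deck) is:

  int row = blockIdx.y * blockDim.y + threadIdx.y;
  int col = blockIdx.x * blockDim.x + threadIdx.x;
  if (row < Width && col < Width) {   // skip threads past the matrix edge
    float P_val = 0;
    for (int k = 0; k < Width; ++k)
      P_val += d_M[row * Width + k] * d_N[k * Width + col];
    d_P[row * Width + col] = P_val;
  }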






Host Code: Launching the Kernel




     int block_size = 16;  // 16 x 16 = 256 threads per block,
                           // within the 512-thread limit below

     // Setup the execution configuration
     dim3 dimGrid(Width/block_size, Width/block_size);
     dim3 dimBlock(block_size, block_size);

     // Launch the device computation threads!
     MatrixMulKernel<<<dimGrid, dimBlock>>>(d_M, d_N, d_P, Width);

     …
Size of matrix is limited by the number of threads allowed on a device.



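To make the configuration concrete, a small worked example with an assumed Width of 1024:

  // Assumed values: Width = 1024, block_size = 16
  // dimGrid  = (1024/16, 1024/16) = (64, 64) -> 4096 blocks
  // dimBlock = (16, 16)                      -> 256 threads per block
  // Total threads = 4096 * 256 = 1,048,576 = 1024 * 1024:
  // exactly one thread per element of d_P.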




How Many Threads Can the Device Run?




- Max number of threads per block: 512
- Max number of blocks per streaming multiprocessor: 8
- Number of streaming multiprocessors: 30
- Total number of threads available = 30 x 8 x 512 = 122880

Let me double-check this!

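Rather than hard-coding these numbers, they can be queried at runtime. A minimal sketch using the standard cudaGetDeviceProperties API (my addition, not from the deck):

  #include <stdio.h>
  #include <cuda_runtime.h>

  int main(void) {
    cudaDeviceProp prop;
    cudaGetDeviceProperties(&prop, 0);   // properties of device 0
    printf("Device:                %s\n", prop.name);
    printf("Max threads per block: %d\n", prop.maxThreadsPerBlock);
    printf("Multiprocessors:       %d\n", prop.multiProcessorCount);
    return 0;
  }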




Solution 3: Multiple Blocks, One Tile per Thread



[Figure: d_P partitioned into a grid of thread blocks (blockIdx.x, blockIdx.y); within a block, thread (threadIdx.x, threadIdx.y) covers one TILE_WIDTH x TILE_WIDTH tile]








Solution 3: Multiple Blocks, One Tile per Thread




__global__ void MatrixMulKernel(float* d_M,
                                float* d_N,
                                float* d_P,
                                int Width) {
  // Each thread's tile starts at (block offset + thread offset) * TILE_WIDTH.
  int start_row = (blockDim.y * blockIdx.y + threadIdx.y) * TILE_WIDTH;
  int end_row = start_row + TILE_WIDTH;
  int start_col = (blockDim.x * blockIdx.x + threadIdx.x) * TILE_WIDTH;
  int end_col = start_col + TILE_WIDTH;

  for (int row = start_row; row < end_row; row++) {
     for (int col = start_col; col < end_col; col++) {
        float P_val = 0;
        for (int k = 0; k < Width; ++k) {
           float M_elem = d_M[row * Width + k];
           float N_elem = d_N[k * Width + col];
           P_val += M_elem * N_elem;
        }
        d_P[row * Width + col] = P_val;
     }
  }
}
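The matching launch is again not shown in the deck; a minimal sketch, assuming Width is a multiple of block_size * TILE_WIDTH:

  // Each block covers a square patch of d_P that is
  // block_size * TILE_WIDTH elements on a side.
  dim3 dimBlock(block_size, block_size);
  dim3 dimGrid(Width / (block_size * TILE_WIDTH),
               Width / (block_size * TILE_WIDTH));
  MatrixMulKernel<<<dimGrid, dimBlock>>>(d_M, d_N, d_P, Width);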




