Basic Example: Matrix Multiplication using CUDA

General-purpose Programming of Massively Parallel Graphics Processors
Shiraz University, Spring 2010
Instructor: Reza Azimi

Some materials/slides are adapted from:
Andreas Moshovos’ Course at the University of Toronto
UIUC course by Wen-Mei Hwu and David Kirk




A Simple Host Version in C



 void MatrixMulOnHost(float* M, float* N, float* P, int Width) {
   for (int i = 0; i < Width; ++i)
     for (int j = 0; j < Width; ++j) {
       float sum = 0;
       for (int k = 0; k < Width; ++k) {
         float a = M[i * Width + k];
         float b = N[k * Width + j];
         sum += a * b;
       }
       P[i * Width + j] = sum;
     }
 }
[Figure: row i of M is combined with column j of N to produce element (i, j) of P; all matrices are WIDTH x WIDTH]
Adapted From:
David Kirk/NVIDIA and Wen-mei W. Hwu, UIUC
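As a quick sanity check, the host version can be driven with a few lines of setup. A minimal sketch follows; the 4x4 size and the constant test data are my assumptions, not part of the original deck:

  #include <stdio.h>
  #include <stdlib.h>

  int main(void) {
    const int Width = 4;                        // small assumed test size
    float *M = (float*)malloc(Width * Width * sizeof(float));
    float *N = (float*)malloc(Width * Width * sizeof(float));
    float *P = (float*)malloc(Width * Width * sizeof(float));
    for (int i = 0; i < Width * Width; ++i) {   // fill with known values
      M[i] = 1.0f;
      N[i] = 2.0f;
    }
    MatrixMulOnHost(M, N, P, Width);
    printf("P[0] = %f\n", P[0]);                // expect Width * 1 * 2 = 8
    free(M); free(N); free(P);
    return 0;
  }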




A Simple CUDA Kernel




 __global__
 void MatrixMulKernel(float* d_M,
                      float* d_N,
                      float* d_P,
                      int Width) {
   int row = threadIdx.y;
   int col = threadIdx.x;
   float P_val = 0;
   for (int k = 0; k < Width; ++k) {
     float M_elem = d_M[row * Width + k];
     float N_elem = d_N[k * Width + col];
     P_val += M_elem * N_elem;
   }
   d_P[row * Width + col] = P_val;
 }




[Figure: thread (col, row) = (threadIdx.x, threadIdx.y) reads one row of d_M and one column of d_N to compute a single element of d_P; all matrices are WIDTH x WIDTH]
Adapted From:
David Kirk/NVIDIA and Wen-mei W. Hwu, UIUC




Host Code: Allocating Device Memory and Copying Data




   void MatrixMulOnDevice(float* M,
                          float* N,
                          float* P,
                          int Width)
   {
      int matrix_size = Width * Width * sizeof(float);
      float *d_M, *d_N, *d_P;

      // Allocate and load M and N in device memory
      cudaMalloc((void**)&d_M, matrix_size);
      cudaMemcpy(d_M, M, matrix_size, cudaMemcpyHostToDevice);

      cudaMalloc((void**)&d_N, matrix_size);
      cudaMemcpy(d_N, N, matrix_size, cudaMemcpyHostToDevice);

      // Allocate P on the device
      cudaMalloc((void**)&d_P, matrix_size);

Adapted From:
David Kirk/NVIDIA and Wen-mei W. Hwu, UIUC




Host Code: Launching the Kernel




      // Setup the execution configuration
      dim3 dimGrid(1, 1);
      dim3 dimBlock(Width, Width);

      // Launch the device computation threads!
      MatrixMulKernel<<<dimGrid, dimBlock>>>(d_M, d_N, d_P, Width);

         // Copy back the results from device to host
         cudaMemcpy(P, d_P, matrix_size, cudaMemcpyDeviceToHost);

         // Free up the device memory matrices
         cudaFree(d_P);
         cudaFree(d_M);
         cudaFree(d_N);
     © David Kirk/NVIDIA and Wen-mei W. Hwu, 2007-2009
     ECE 498AL Spring 2010, University of Illinois, Urbana-Champaign
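None of the CUDA calls above report failures. A minimal error-checking pattern is sketched below; the CHECK macro is my addition (the name is hypothetical), built only on standard runtime calls:

  #include <stdio.h>
  #include <stdlib.h>
  #include <cuda_runtime.h>

  // Hypothetical helper: abort with a message if a CUDA call fails.
  #define CHECK(call)                                              \
    do {                                                           \
      cudaError_t err = (call);                                    \
      if (err != cudaSuccess) {                                    \
        fprintf(stderr, "CUDA error at %s:%d: %s\n",               \
                __FILE__, __LINE__, cudaGetErrorString(err));      \
        exit(1);                                                   \
      }                                                            \
    } while (0)

  // Usage:
  //   CHECK(cudaMalloc((void**)&d_M, matrix_size));
  //   MatrixMulKernel<<<dimGrid, dimBlock>>>(d_M, d_N, d_P, Width);
  //   CHECK(cudaGetLastError());       // catches bad launch configurations
  //   CHECK(cudaDeviceSynchronize());  // catches errors raised inside the kernel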




    Only One Thread Block Used
- One block of threads computes matrix d_P
- Each thread:
  - Loads a row of matrix d_M
  - Loads a column of matrix d_N
  - Performs one multiply-add for each pair of d_M and d_N elements
  - Computes one element of d_P

[Figure: Grid 1 with a single Block 1; thread (2, 2) combines a row of d_M with a column of d_N to compute one element of d_P]

Size of matrix is limited by the number of threads allowed in a thread block.
Adapted From:
David Kirk/NVIDIA and Wen-mei W. Hwu, UIUC




Assigning a Tile of d_P to Each Thread




[Figure: d_P divided into TILE_WIDTH x TILE_WIDTH tiles; the tile at (threadIdx.x, threadIdx.y) is covered by one thread]

Each thread is assigned a tile of TILE_WIDTH x TILE_WIDTH entries.






Solution 1: Give Each Thread More Work
   __global__ void MatrixMulKernel(float* d_M,
                                   float* d_N,
                                   float* d_P,
                                   int Width) {
     int start_row = threadIdx.y * TILE_WIDTH;
     int end_row = start_row + TILE_WIDTH;
     int start_col = threadIdx.x * TILE_WIDTH;
     int end_col = start_col + TILE_WIDTH;

     for (int row = start_row; row < end_row; row++) {
        for (int col = start_col; col < end_col; col++) {
           float P_val = 0;
           for (int k = 0; k < Width; ++k) {
              float M_elem = d_M[row * Width + k];
              float N_elem = d_N[k * Width + col];
              P_val += M_elem * N_elem;
           }
           d_P[row * Width + col] = P_val;
        }
     }
   }

   With one block we utilize only one multiprocessor!
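The deck does not show the matching launch for Solution 1; a minimal sketch, assuming Width is a multiple of TILE_WIDTH:

  // Still a single block, but each thread now owns a
  // TILE_WIDTH x TILE_WIDTH tile of d_P.
  dim3 dimGrid(1, 1);
  dim3 dimBlock(Width / TILE_WIDTH, Width / TILE_WIDTH);
  MatrixMulKernel<<<dimGrid, dimBlock>>>(d_M, d_N, d_P, Width);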




Partitioning d_P Across Thread Blocks


[Figure: d_P partitioned into a grid of thread blocks; (blockIdx.x, blockIdx.y) select a blockDim.x x blockDim.y sub-matrix assigned to a thread block, and each element inside it is assigned to one thread (threadIdx.x, threadIdx.y)]





Solution 2: Use Multiple Thread Blocks
__global__
void MatrixMulKernel(float* d_M,
                     float* d_N,
                     float* d_P,
                     int Width) {
  int row = blockIdx.y * blockDim.y + threadIdx.y;
  int col = blockIdx.x * blockDim.x + threadIdx.x;
  float P_val = 0;

  for (int k = 0; k < Width; ++k) {
    float M_elem = d_M[row * Width + k];
    float N_elem = d_N[k * Width + col];
    P_val += M_elem * N_elem;
  }
  d_P[row * Width + col] = P_val;
}
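This kernel assumes Width is an exact multiple of the block dimensions. If it is not, some threads fall outside the matrix; a common guard (my addition, not in the original deck) is:

  int row = blockIdx.y * blockDim.y + threadIdx.y;
  int col = blockIdx.x * blockDim.x + threadIdx.x;
  if (row < Width && col < Width) {   // skip threads past the matrix edge
    float P_val = 0;
    for (int k = 0; k < Width; ++k)
      P_val += d_M[row * Width + k] * d_N[k * Width + col];
    d_P[row * Width + col] = P_val;
  }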






Host Code: Launching the Kernel




     int block_size = 16;  // 16 x 16 = 256 threads per block,
                           // within the 512-thread limit below

     // Setup the execution configuration
     dim3 dimGrid(Width/block_size, Width/block_size);
     dim3 dimBlock(block_size, block_size);

     // Launch the device computation threads!
     MatrixMulKernel<<<dimGrid, dimBlock>>>(d_M, d_N, d_P, Width);

     …
Size of matrix is limited by the number of threads allowed on a device.



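To make the configuration concrete, a small worked example with an assumed Width of 1024:

  // Assumed values: Width = 1024, block_size = 16
  // dimGrid  = (1024/16, 1024/16) = (64, 64) -> 4096 blocks
  // dimBlock = (16, 16)                      -> 256 threads per block
  // Total threads = 4096 * 256 = 1,048,576 = 1024 * 1024:
  // exactly one thread per element of d_P.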




How Many Threads Can the Device Run?




- Max number of threads per block: 512
- Max number of blocks per streaming multiprocessor: 8
- Number of streaming multiprocessors: 30
- Total number of threads available = 30 x 8 x 512 = 122880

Let me double-check this!

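Rather than hard-coding these numbers, they can be queried at runtime. A minimal sketch using the standard cudaGetDeviceProperties API (my addition, not from the deck):

  #include <stdio.h>
  #include <cuda_runtime.h>

  int main(void) {
    cudaDeviceProp prop;
    cudaGetDeviceProperties(&prop, 0);   // properties of device 0
    printf("Device:                %s\n", prop.name);
    printf("Max threads per block: %d\n", prop.maxThreadsPerBlock);
    printf("Multiprocessors:       %d\n", prop.multiProcessorCount);
    return 0;
  }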




Solution 3: Multiple Blocks, One Tile per Thread



[Figure: d_P partitioned into a grid of thread blocks (blockIdx.x, blockIdx.y); within a block, thread (threadIdx.x, threadIdx.y) covers one TILE_WIDTH x TILE_WIDTH tile]








Solution 3: Multiple Blocks, One Tile per Thread




__global__ void MatrixMulKernel(float* d_M,
                                float* d_N,
                                float* d_P,
                                int Width) {
  // Each thread's tile starts at (block offset + thread offset) * TILE_WIDTH.
  int start_row = (blockDim.y * blockIdx.y + threadIdx.y) * TILE_WIDTH;
  int end_row = start_row + TILE_WIDTH;
  int start_col = (blockDim.x * blockIdx.x + threadIdx.x) * TILE_WIDTH;
  int end_col = start_col + TILE_WIDTH;

  for (int row = start_row; row < end_row; row++) {
     for (int col = start_col; col < end_col; col++) {
        float P_val = 0;
        for (int k = 0; k < Width; ++k) {
           float M_elem = d_M[row * Width + k];
           float N_elem = d_N[k * Width + col];
           P_val += M_elem * N_elem;
        }
        d_P[row * Width + col] = P_val;
     }
  }
}
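The matching launch is again not shown in the deck; a minimal sketch, assuming Width is a multiple of block_size * TILE_WIDTH:

  // Each block covers a square patch of d_P that is
  // block_size * TILE_WIDTH elements on a side.
  dim3 dimBlock(block_size, block_size);
  dim3 dimGrid(Width / (block_size * TILE_WIDTH),
               Width / (block_size * TILE_WIDTH));
  MatrixMulKernel<<<dimGrid, dimBlock>>>(d_M, d_N, d_P, Width);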




