SlideShare a Scribd company logo
1 of 4
#define ROWS_BLOCKDIM_X 16
#define ROWS_BLOCKDIM_Y 4
#define ROWS_RESULT_STEPS 8
#define ROWS_HALO_STEPS 1
__global__ void convolutionRowsKernel(
float *d_Dst,
float *d_Src,
int imageW,
int imageH,
int pitch
){
__shared__ float s_Data[ROWS_BLOCKDIM_Y][(ROWS_RESULT_STEPS + 2 * ROWS_HALO_STEPS) * ROWS_BLOCKDIM_X];
//Offset to the left halo edge
const int baseX = (blockIdx.x * ROWS_RESULT_STEPS - ROWS_HALO_STEPS) * ROWS_BLOCKDIM_X + threadIdx.x;
const int baseY = blockIdx.y * ROWS_BLOCKDIM_Y + threadIdx.y;
d_Src += baseY * pitch + baseX;
d_Dst += baseY * pitch + baseX;
//Load main data
#pragma unroll
for(int i = ROWS_HALO_STEPS; i < ROWS_HALO_STEPS + ROWS_RESULT_STEPS; i++)
s_Data[threadIdx.y][threadIdx.x + i * ROWS_BLOCKDIM_X] = d_Src[i * ROWS_BLOCKDIM_X];
//Load left halo
#pragma unroll
for(int i = 0; i < ROWS_HALO_STEPS; i++)
s_Data[threadIdx.y][threadIdx.x + i * ROWS_BLOCKDIM_X] = (baseX >= -i * ROWS_BLOCKDIM_X ) ? d_Src[i * ROWS_BLOCKDIM_X] : 0;
//Load right halo
#pragma unroll
for(int i = ROWS_HALO_STEPS + ROWS_RESULT_STEPS; i < ROWS_HALO_STEPS + ROWS_RESULT_STEPS + ROWS_HALO_STEPS; i++)
s_Data[threadIdx.y][threadIdx.x + i * ROWS_BLOCKDIM_X] = (imageW - baseX > i * ROWS_BLOCKDIM_X) ? d_Src[i * ROWS_BLOCKDIM_X] : 0;
//Compute and store results
__syncthreads();
#pragma unroll
for(int i = ROWS_HALO_STEPS; i < ROWS_HALO_STEPS + ROWS_RESULT_STEPS; i++){
float sum = 0;
#pragma unroll
for(int j = -KERNEL_RADIUS; j <= KERNEL_RADIUS; j++)
sum += c_Kernel[KERNEL_RADIUS - j] * s_Data[threadIdx.y][threadIdx.x + i * ROWS_BLOCKDIM_X + j];
d_Dst[i * ROWS_BLOCKDIM_X] = sum;
}
}
#define COLUMNS_BLOCKDIM_X 16
#define COLUMNS_BLOCKDIM_Y 8
#define COLUMNS_RESULT_STEPS 8
#define COLUMNS_HALO_STEPS 1
__global__ void convolutionColumnsKernel(
float *d_Dst,
float *d_Src,
int imageW,
int imageH,
int pitch
){
__shared__ float s_Data[COLUMNS_BLOCKDIM_X][(COLUMNS_RESULT_STEPS + 2 * COLUMNS_HALO_STEPS) * COLUMNS_BLOCKDIM_Y + 1];
//Offset to the upper halo edge
const int baseX = blockIdx.x * COLUMNS_BLOCKDIM_X + threadIdx.x;
const int baseY = (blockIdx.y * COLUMNS_RESULT_STEPS - COLUMNS_HALO_STEPS) * COLUMNS_BLOCKDIM_Y + threadIdx.y;
d_Src += baseY * pitch + baseX;
d_Dst += baseY * pitch + baseX;
//Main data
#pragma unroll
for(int i = COLUMNS_HALO_STEPS; i < COLUMNS_HALO_STEPS + COLUMNS_RESULT_STEPS; i++)
s_Data[threadIdx.x][threadIdx.y + i * COLUMNS_BLOCKDIM_Y] = d_Src[i * COLUMNS_BLOCKDIM_Y * pitch];
//Upper halo
#pragma unroll
for(int i = 0; i < COLUMNS_HALO_STEPS; i++)
s_Data[threadIdx.x][threadIdx.y + i * COLUMNS_BLOCKDIM_Y] = (baseY >= -i * COLUMNS_BLOCKDIM_Y) ? d_Src[i * COLUMNS_BLOCKDIM_Y * pitch] : 0;
//Lower halo
#pragma unroll
for(int i = COLUMNS_HALO_STEPS + COLUMNS_RESULT_STEPS; i < COLUMNS_HALO_STEPS + COLUMNS_RESULT_STEPS + COLUMNS_HALO_STEPS; i++)
s_Data[threadIdx.x][threadIdx.y + i * COLUMNS_BLOCKDIM_Y]= (imageH - baseY > i * COLUMNS_BLOCKDIM_Y) ? d_Src[i * COLUMNS_BLOCKDIM_Y * pitch] :
0;
//Compute and store results
__syncthreads();
#pragma unroll
for(int i = COLUMNS_HALO_STEPS; i < COLUMNS_HALO_STEPS + COLUMNS_RESULT_STEPS; i++){
float sum = 0;
#pragma unroll
for(int j = -KERNEL_RADIUS; j <= KERNEL_RADIUS; j++)
sum += c_Kernel[KERNEL_RADIUS - j] * s_Data[threadIdx.x][threadIdx.y + i * COLUMNS_BLOCKDIM_Y + j];
d_Dst[i * COLUMNS_BLOCKDIM_Y * pitch] = sum;
}
}

More Related Content

Similar to Convolution separable by using cuda.docx

Following are the changes mentioned in bold in order to obtain the r.pdf
Following are the changes mentioned in bold in order to obtain the r.pdfFollowing are the changes mentioned in bold in order to obtain the r.pdf
Following are the changes mentioned in bold in order to obtain the r.pdf
anithareadymade
 
Point.h header file #ifndef POINT_H #define POINT_H Po.pdf
Point.h header file #ifndef POINT_H #define POINT_H Po.pdfPoint.h header file #ifndef POINT_H #define POINT_H Po.pdf
Point.h header file #ifndef POINT_H #define POINT_H Po.pdf
ANGELMARKETINGJAIPUR
 
need help with code I wrote. This code is a maze gui, and i need hel.pdf
need help with code I wrote. This code is a maze gui, and i need hel.pdfneed help with code I wrote. This code is a maze gui, and i need hel.pdf
need help with code I wrote. This code is a maze gui, and i need hel.pdf
arcotstarsports
 
เกมจับคู่1
เกมจับคู่1เกมจับคู่1
เกมจับคู่1
JAy YourJust'one
 
Write a program to decipher messages encoded using a prefix code, gi.pdf
Write a program to decipher messages encoded using a prefix code, gi.pdfWrite a program to decipher messages encoded using a prefix code, gi.pdf
Write a program to decipher messages encoded using a prefix code, gi.pdf
footworld1
 
There are a number of errors in the following program- All errors are.docx
There are a number of errors in the following program- All errors are.docxThere are a number of errors in the following program- All errors are.docx
There are a number of errors in the following program- All errors are.docx
clarkjanyce
 
Assignement of programming & problem solving u.s ass.(1)
Assignement of programming & problem solving u.s ass.(1)Assignement of programming & problem solving u.s ass.(1)
Assignement of programming & problem solving u.s ass.(1)
Syed Umair
 
#include iostream #includeData.h #includePerson.h#in.pdf
#include iostream #includeData.h #includePerson.h#in.pdf#include iostream #includeData.h #includePerson.h#in.pdf
#include iostream #includeData.h #includePerson.h#in.pdf
annucommunication1
 

Similar to Convolution separable by using cuda.docx (20)

Following are the changes mentioned in bold in order to obtain the r.pdf
Following are the changes mentioned in bold in order to obtain the r.pdfFollowing are the changes mentioned in bold in order to obtain the r.pdf
Following are the changes mentioned in bold in order to obtain the r.pdf
 
Point.h header file #ifndef POINT_H #define POINT_H Po.pdf
Point.h header file #ifndef POINT_H #define POINT_H Po.pdfPoint.h header file #ifndef POINT_H #define POINT_H Po.pdf
Point.h header file #ifndef POINT_H #define POINT_H Po.pdf
 
need help with code I wrote. This code is a maze gui, and i need hel.pdf
need help with code I wrote. This code is a maze gui, and i need hel.pdfneed help with code I wrote. This code is a maze gui, and i need hel.pdf
need help with code I wrote. This code is a maze gui, and i need hel.pdf
 
เกมจับคู่1
เกมจับคู่1เกมจับคู่1
เกมจับคู่1
 
C++ programs
C++ programsC++ programs
C++ programs
 
SAS codes and tricks Comprehensive all codess
SAS codes and tricks Comprehensive all codessSAS codes and tricks Comprehensive all codess
SAS codes and tricks Comprehensive all codess
 
DesignPatterns-IntroPresentation.pptx
DesignPatterns-IntroPresentation.pptxDesignPatterns-IntroPresentation.pptx
DesignPatterns-IntroPresentation.pptx
 
Write a program to decipher messages encoded using a prefix code, gi.pdf
Write a program to decipher messages encoded using a prefix code, gi.pdfWrite a program to decipher messages encoded using a prefix code, gi.pdf
Write a program to decipher messages encoded using a prefix code, gi.pdf
 
Let’s talk about microbenchmarking
Let’s talk about microbenchmarkingLet’s talk about microbenchmarking
Let’s talk about microbenchmarking
 
There are a number of errors in the following program- All errors are.docx
There are a number of errors in the following program- All errors are.docxThere are a number of errors in the following program- All errors are.docx
There are a number of errors in the following program- All errors are.docx
 
Software Visualization 101+
Software Visualization 101+Software Visualization 101+
Software Visualization 101+
 
SAS codes and tricks Comprehensive all codes
SAS codes and tricks Comprehensive all codesSAS codes and tricks Comprehensive all codes
SAS codes and tricks Comprehensive all codes
 
Assignement of programming & problem solving u.s ass.(1)
Assignement of programming & problem solving u.s ass.(1)Assignement of programming & problem solving u.s ass.(1)
Assignement of programming & problem solving u.s ass.(1)
 
Intro to c programming
Intro to c programmingIntro to c programming
Intro to c programming
 
IFC File Model Understanding -BHTech Dec2017
IFC File Model Understanding -BHTech Dec2017IFC File Model Understanding -BHTech Dec2017
IFC File Model Understanding -BHTech Dec2017
 
Writing MySQL UDFs
Writing MySQL UDFsWriting MySQL UDFs
Writing MySQL UDFs
 
2nd section
2nd section2nd section
2nd section
 
#include stdafx.h using namespace std; #include stdlib.h.docx
#include stdafx.h using namespace std; #include stdlib.h.docx#include stdafx.h using namespace std; #include stdlib.h.docx
#include stdafx.h using namespace std; #include stdlib.h.docx
 
Cquestions
Cquestions Cquestions
Cquestions
 
#include iostream #includeData.h #includePerson.h#in.pdf
#include iostream #includeData.h #includePerson.h#in.pdf#include iostream #includeData.h #includePerson.h#in.pdf
#include iostream #includeData.h #includePerson.h#in.pdf
 

Recently uploaded

ENCODERS & DECODERS - Digital Electronics - diu swe
ENCODERS & DECODERS - Digital Electronics - diu sweENCODERS & DECODERS - Digital Electronics - diu swe
ENCODERS & DECODERS - Digital Electronics - diu swe
MohammadAliNayeem
 
ONLINE VEHICLE RENTAL SYSTEM PROJECT REPORT.pdf
ONLINE VEHICLE RENTAL SYSTEM PROJECT REPORT.pdfONLINE VEHICLE RENTAL SYSTEM PROJECT REPORT.pdf
ONLINE VEHICLE RENTAL SYSTEM PROJECT REPORT.pdf
Kamal Acharya
 
一比一原版(UNK毕业证)内布拉斯加州立大学科尼分校毕业证成绩单
一比一原版(UNK毕业证)内布拉斯加州立大学科尼分校毕业证成绩单一比一原版(UNK毕业证)内布拉斯加州立大学科尼分校毕业证成绩单
一比一原版(UNK毕业证)内布拉斯加州立大学科尼分校毕业证成绩单
tuuww
 
Laundry management system project report.pdf
Laundry management system project report.pdfLaundry management system project report.pdf
Laundry management system project report.pdf
Kamal Acharya
 
DR PROF ING GURUDUTT SAHNI WIKIPEDIA.pdf
DR PROF ING GURUDUTT SAHNI WIKIPEDIA.pdfDR PROF ING GURUDUTT SAHNI WIKIPEDIA.pdf
DR PROF ING GURUDUTT SAHNI WIKIPEDIA.pdf
DrGurudutt
 
Teachers record management system project report..pdf
Teachers record management system project report..pdfTeachers record management system project report..pdf
Teachers record management system project report..pdf
Kamal Acharya
 

Recently uploaded (20)

BRAKING SYSTEM IN INDIAN RAILWAY AutoCAD DRAWING
BRAKING SYSTEM IN INDIAN RAILWAY AutoCAD DRAWINGBRAKING SYSTEM IN INDIAN RAILWAY AutoCAD DRAWING
BRAKING SYSTEM IN INDIAN RAILWAY AutoCAD DRAWING
 
ENCODERS & DECODERS - Digital Electronics - diu swe
ENCODERS & DECODERS - Digital Electronics - diu sweENCODERS & DECODERS - Digital Electronics - diu swe
ENCODERS & DECODERS - Digital Electronics - diu swe
 
An improvement in the safety of big data using blockchain technology
An improvement in the safety of big data using blockchain technologyAn improvement in the safety of big data using blockchain technology
An improvement in the safety of big data using blockchain technology
 
ONLINE VEHICLE RENTAL SYSTEM PROJECT REPORT.pdf
ONLINE VEHICLE RENTAL SYSTEM PROJECT REPORT.pdfONLINE VEHICLE RENTAL SYSTEM PROJECT REPORT.pdf
ONLINE VEHICLE RENTAL SYSTEM PROJECT REPORT.pdf
 
Construction method of steel structure space frame .pptx
Construction method of steel structure space frame .pptxConstruction method of steel structure space frame .pptx
Construction method of steel structure space frame .pptx
 
一比一原版(UNK毕业证)内布拉斯加州立大学科尼分校毕业证成绩单
一比一原版(UNK毕业证)内布拉斯加州立大学科尼分校毕业证成绩单一比一原版(UNK毕业证)内布拉斯加州立大学科尼分校毕业证成绩单
一比一原版(UNK毕业证)内布拉斯加州立大学科尼分校毕业证成绩单
 
Online resume builder management system project report.pdf
Online resume builder management system project report.pdfOnline resume builder management system project report.pdf
Online resume builder management system project report.pdf
 
Lect 2 - Design of slender column-2.pptx
Lect 2 - Design of slender column-2.pptxLect 2 - Design of slender column-2.pptx
Lect 2 - Design of slender column-2.pptx
 
Furniture showroom management system project.pdf
Furniture showroom management system project.pdfFurniture showroom management system project.pdf
Furniture showroom management system project.pdf
 
internship exam ppt.pptx on embedded system and IOT
internship exam ppt.pptx on embedded system and IOTinternship exam ppt.pptx on embedded system and IOT
internship exam ppt.pptx on embedded system and IOT
 
Introduction to Machine Learning Unit-4 Notes for II-II Mechanical Engineering
Introduction to Machine Learning Unit-4 Notes for II-II Mechanical EngineeringIntroduction to Machine Learning Unit-4 Notes for II-II Mechanical Engineering
Introduction to Machine Learning Unit-4 Notes for II-II Mechanical Engineering
 
Laundry management system project report.pdf
Laundry management system project report.pdfLaundry management system project report.pdf
Laundry management system project report.pdf
 
Dairy management system project report..pdf
Dairy management system project report..pdfDairy management system project report..pdf
Dairy management system project report..pdf
 
Attraction and Repulsion type Moving Iron Instruments.pptx
Attraction and Repulsion type Moving Iron Instruments.pptxAttraction and Repulsion type Moving Iron Instruments.pptx
Attraction and Repulsion type Moving Iron Instruments.pptx
 
DR PROF ING GURUDUTT SAHNI WIKIPEDIA.pdf
DR PROF ING GURUDUTT SAHNI WIKIPEDIA.pdfDR PROF ING GURUDUTT SAHNI WIKIPEDIA.pdf
DR PROF ING GURUDUTT SAHNI WIKIPEDIA.pdf
 
Teachers record management system project report..pdf
Teachers record management system project report..pdfTeachers record management system project report..pdf
Teachers record management system project report..pdf
 
Research Methodolgy & Intellectual Property Rights Series 1
Research Methodolgy & Intellectual Property Rights Series 1Research Methodolgy & Intellectual Property Rights Series 1
Research Methodolgy & Intellectual Property Rights Series 1
 
Natalia Rutkowska - BIM School Course in Kraków
Natalia Rutkowska - BIM School Course in KrakówNatalia Rutkowska - BIM School Course in Kraków
Natalia Rutkowska - BIM School Course in Kraków
 
Arduino based vehicle speed tracker project
Arduino based vehicle speed tracker projectArduino based vehicle speed tracker project
Arduino based vehicle speed tracker project
 
Low rpm Generator for efficient energy harnessing from a two stage wind turbine
Low rpm Generator for efficient energy harnessing from a two stage wind turbineLow rpm Generator for efficient energy harnessing from a two stage wind turbine
Low rpm Generator for efficient energy harnessing from a two stage wind turbine
 

Convolution separable by using cuda.docx

  • 1. #define ROWS_BLOCKDIM_X 16 #define ROWS_BLOCKDIM_Y 4 #define ROWS_RESULT_STEPS 8 #define ROWS_HALO_STEPS 1 __global__ void convolutionRowsKernel( float *d_Dst, float *d_Src, int imageW, int imageH, int pitch ){ __shared__ float s_Data[ROWS_BLOCKDIM_Y][(ROWS_RESULT_STEPS + 2 * ROWS_HALO_STEPS) * ROWS_BLOCKDIM_X]; //Offset to the left halo edge const int baseX = (blockIdx.x * ROWS_RESULT_STEPS - ROWS_HALO_STEPS) * ROWS_BLOCKDIM_X + threadIdx.x; const int baseY = blockIdx.y * ROWS_BLOCKDIM_Y + threadIdx.y; d_Src += baseY * pitch + baseX; d_Dst += baseY * pitch + baseX; //Load main data #pragma unroll for(int i = ROWS_HALO_STEPS; i < ROWS_HALO_STEPS + ROWS_RESULT_STEPS; i++) s_Data[threadIdx.y][threadIdx.x + i * ROWS_BLOCKDIM_X] = d_Src[i * ROWS_BLOCKDIM_X];
  • 2. //Load left halo #pragma unroll for(int i = 0; i < ROWS_HALO_STEPS; i++) s_Data[threadIdx.y][threadIdx.x + i * ROWS_BLOCKDIM_X] = (baseX >= -i * ROWS_BLOCKDIM_X ) ? d_Src[i * ROWS_BLOCKDIM_X] : 0; //Load right halo #pragma unroll for(int i = ROWS_HALO_STEPS + ROWS_RESULT_STEPS; i < ROWS_HALO_STEPS + ROWS_RESULT_STEPS + ROWS_HALO_STEPS; i++) s_Data[threadIdx.y][threadIdx.x + i * ROWS_BLOCKDIM_X] = (imageW - baseX > i * ROWS_BLOCKDIM_X) ? d_Src[i * ROWS_BLOCKDIM_X] : 0; //Compute and store results __syncthreads(); #pragma unroll for(int i = ROWS_HALO_STEPS; i < ROWS_HALO_STEPS + ROWS_RESULT_STEPS; i++){ float sum = 0; #pragma unroll for(int j = -KERNEL_RADIUS; j <= KERNEL_RADIUS; j++) sum += c_Kernel[KERNEL_RADIUS - j] * s_Data[threadIdx.y][threadIdx.x + i * ROWS_BLOCKDIM_X + j]; d_Dst[i * ROWS_BLOCKDIM_X] = sum; } }
  • 3. #define COLUMNS_BLOCKDIM_X 16 #define COLUMNS_BLOCKDIM_Y 8 #define COLUMNS_RESULT_STEPS 8 #define COLUMNS_HALO_STEPS 1 __global__ void convolutionColumnsKernel( float *d_Dst, float *d_Src, int imageW, int imageH, int pitch ){ __shared__ float s_Data[COLUMNS_BLOCKDIM_X][(COLUMNS_RESULT_STEPS + 2 * COLUMNS_HALO_STEPS) * COLUMNS_BLOCKDIM_Y + 1]; //Offset to the upper halo edge const int baseX = blockIdx.x * COLUMNS_BLOCKDIM_X + threadIdx.x; const int baseY = (blockIdx.y * COLUMNS_RESULT_STEPS - COLUMNS_HALO_STEPS) * COLUMNS_BLOCKDIM_Y + threadIdx.y; d_Src += baseY * pitch + baseX; d_Dst += baseY * pitch + baseX; //Main data #pragma unroll for(int i = COLUMNS_HALO_STEPS; i < COLUMNS_HALO_STEPS + COLUMNS_RESULT_STEPS; i++) s_Data[threadIdx.x][threadIdx.y + i * COLUMNS_BLOCKDIM_Y] = d_Src[i * COLUMNS_BLOCKDIM_Y * pitch];
  • 4. //Upper halo #pragma unroll for(int i = 0; i < COLUMNS_HALO_STEPS; i++) s_Data[threadIdx.x][threadIdx.y + i * COLUMNS_BLOCKDIM_Y] = (baseY >= -i * COLUMNS_BLOCKDIM_Y) ? d_Src[i * COLUMNS_BLOCKDIM_Y * pitch] : 0; //Lower halo #pragma unroll for(int i = COLUMNS_HALO_STEPS + COLUMNS_RESULT_STEPS; i < COLUMNS_HALO_STEPS + COLUMNS_RESULT_STEPS + COLUMNS_HALO_STEPS; i++) s_Data[threadIdx.x][threadIdx.y + i * COLUMNS_BLOCKDIM_Y]= (imageH - baseY > i * COLUMNS_BLOCKDIM_Y) ? d_Src[i * COLUMNS_BLOCKDIM_Y * pitch] : 0; //Compute and store results __syncthreads(); #pragma unroll for(int i = COLUMNS_HALO_STEPS; i < COLUMNS_HALO_STEPS + COLUMNS_RESULT_STEPS; i++){ float sum = 0; #pragma unroll for(int j = -KERNEL_RADIUS; j <= KERNEL_RADIUS; j++) sum += c_Kernel[KERNEL_RADIUS - j] * s_Data[threadIdx.x][threadIdx.y + i * COLUMNS_BLOCKDIM_Y + j]; d_Dst[i * COLUMNS_BLOCKDIM_Y * pitch] = sum; } }