Upcoming SlideShare
×

# Using Cuda Within Mathematica

7,555

Published on

Published in: Technology, Education
0 Likes
Statistics
Notes
• Full Name
Comment goes here.

Are you sure you want to Yes No
• Be the first to comment

• Be the first to like this

Views
Total Views
7,555
On Slideshare
0
From Embeds
0
Number of Embeds
2
Actions
Shares
0
150
0
Likes
0
Embeds 0
No embeds

No notes for slide

### Using Cuda Within Mathematica

1. 1. Using CUDA within Mathematica Kashif Rasul and Raqibul Hassan l a b s
2. 2. Overview • Intro to Mathematica and its API • CUDA + Mathematica • Some examples
3. 3. Mathematica intro • Mathematica is a modular computational system in which the kernel is separate from the front end which handles the interaction with the user. • The most common way to work is to use interactive documents called notebooks which mix text input and output as well as graphics and other material.
4. 4. Structure of Mathematica • An important aspect of Mathematica is that it can also interact with other applications. • This is achieved through MathLink, a standardised API for two-way communication with the kernel.
5. 5. MathLink • MathLink allows external programs both to call Mathematica, and to be called by Mathematica. • We will use MathLink to let Mathematica call CUDA functions inside an external program.
6. 6. Simple example addtwo.tm :Begin: :Function: addtwo :Pattern: AddTwo[i_Integer,j_Integer] :Arguments: { i, j } :ArgumentTypes: {Integer,Integer} :ReturnType: Integer :End:
7. 7. addtwo.c #include <mathlink.h> int addtwo( int i, int j) { return i+j; } int main(int argc, char* argv[]) { return MLMain(argc, argv); }
10. 10. MathLink Template file • When a MathLink template file is processed, two basic things are done: • :Pattern: & :Arguments: specifications are used to generate a Mathematica definition • :Function:, :ArgumentTypes: & :ReturnType: specifications are used to generate C source code
11. 11. :ArgumentTypes: Mathematica speciﬁcation C speciﬁcation Integer int Real double IntegerList int*, long RealList double*, long String char* Symbol char* Manual void
12. 12. Handling Lists & Arrays :Begin: int sumList(int *a, long alen) :Function: sumList { :Pattern: SumList[a_List] int i, tot=0; :Arguments: {a} :ArgumentTypes:{IntegerList} for(i=0; i<alen; i++) :ReturnType: Integer tot += a[i]; :End: return tot; }
13. 13. Manual ArgumentTypes :Begin: :Function: sumList :Pattern: SumList[a:{___Integer}] :Arguments: {a} :ArgumentTypes:{Manual} :ReturnType: Integer :End: int sumList(void) { int sumList(void) { int n, i; int n; int a[MAX]; int *a; MLCheckFunction(stdlink, "List", &n); MLGetInteger32List(stdlink, &a, &n); ... for (i=0; i<n; i++) MLReleaseInteger32List(stdlink, a, n); MLGetInteger32(stdlink, a+i); ... ... } }
14. 14. Array of arb. depth #include <mathlink.h> /* read an array of double-precision floating-point numbers from a link */ void f(MLINK lp) { double *data; int *dims; char **heads; int d; /* stores the rank of the array */ if(! MLGetRealArray(lp, &data, &dims, &heads, &d)) { /* unable to read the array from lp */ return; } /* ... */ MLReleaseRealArray(lp, data, dims, heads, d); }
15. 15. Handling Complex numbers In[1]:= Head[2 + 3 I] Out[1]= Complex If you pass a list of complex numbers to your external program, then MLGetReal64Array() will create a two-dimensional array containing a sequence of pairs of real and imaginary parts. In this case, heads[0] will be "List" while heads[1] will be "Complex". //get an array of floating-point numbers of any depth MLGetReal64Array(stdlink,double**a,int**dims,char***heads,int*d);
17. 17. Manual ReturnType void bits(int i) { int a[32], k; :Begin: :Function: bits for(k=0; k<32; k++) { :Pattern: ToBits[i_Integer] a[k] = i%2; :Arguments: {i} i >>= 1; :ArgumentTypes:{Integer} if (i==0) break; :ReturnType: Manual } :End: if (k<32) k++; MLPutInteger32List(stdlink, a, k); return; }
18. 18. General array int a[8][16][100]; int dims[] = {8, 16, 100}; MLPutInteger32Array(stdlink, a, dims, NULL, 3); or int ***a; MLPutFunction(stdlink, "List", n1); for (i=0; i<n1; i++) { MLPutFunction(stdlink, "List", n2); for (j=0; j<n2; j++) { MLPutInteger32List(stdlink, a[i][j], n3); } }
19. 19. Unknown length In[10]:= {Sequence[1, Sequence[4, Sequence[]]]} Out[10]= {1, 4} MLPutFunction(stdlink, "List", 1); while( condition ) { /* generate an element */ MLPutFunction(stdlink, "Sequence", 2); MLPutInteger32(stdlink, i ); } MLPutFunction(stdlink, "Sequence", 0);
20. 20. Return Complex numbers // Complex data type typedef float2 Complex; Complex* h_convolved_signal; // Return transformed signal to Mathematica as a Complex List MLPutFunction(stdlink,"List",n); for (long i = 0; i < n; i++) { MLPutFunction(stdlink,"Complex",2); MLPutFloat(stdlink,h_convolved_signal[i].x*norm); MLPutFloat(stdlink,h_convolved_signal[i].y*norm); }
22. 22. Error & Interrupt if(! MLPutInteger(stdlink, 10)) if(! MLPutReal64(stdlink, 3.22)) { { /* check the possible errors */ /* unable to send 3.22 to lp */ switch(MLError(stdlink)) printf("MathLink Error: %s\n", { MLErrorMessage(stdlink)); case MLEDEAD: MLClearError(stdlink); /* the link died unexpectedly */ } break; case MLECLOSED: /* the other side closed the link */ break; case MLEOK: /* no error occurred */ while(len--) break; { default: sum += *list++; /* ... */ /* check for the abort */ } if(MLAbort) return (double)0; } }
24. 24. Mathematica + CUDA #include <cutil_inline.h> int main(int argc, char **argv) { // use command-line specified CUDA device, // otherwise use device with highest Gflops/s if(cutCheckCmdLineFlag(argc, (const char**)argv, "device")) cutilDeviceInit(argc, argv); else cudaSetDevice( cutGetMaxGflopsDeviceId() ); return MLMain(argc, argv); }
25. 25. mathematica_cuda # Add source files here EXECUTABLE := cuFourier # CUDA source files (compiled with cudacc) CUFILES := cuFourier.cu # CUDA dependency files # CU_DEPS := # C/C++ source files (compiled with gcc / c++) # CCFILES := # Additional libraries needed by the project USECUFFT := 1 # MathLink Template files TMFILES := cuFourier.tm ################################################### # Rules and targets include ../../common/common.mk
26. 26. FindCUDA + FindMathLink via CMake • CMake http://www.cmake.org/ • FindCUDA https://gforge.sci.utah.edu/gf/project/findcuda/ • FindMathLink http://github.com/kashif/FindMathLink/tree
28. 28. double to ﬂoat conversion #include <cutil_inline.h> // General check for CUDA GPU SM Capabilities //inline bool cutilDrvCudaCapabilities(int major_version, int minor_version); char **heads; int *dims; int rank; float *h_float; double *h_double; if (cutilDrvCudaCapabilities( 1,3 )) { MLGetReal64Array(stdlink, &h_double, &dims, &heads, &rank); } else { MLGetReal32Array(stdlink, &h_float, &dims, &heads, &rank); }
29. 29. CUBLAS & CUFFT • Follow the usual routine of sending data to the MathLink app • Use CUBLAS or CUFFT • Return result back to Mathematica
30. 30. cuFourier In[1]:= ListLinePlot Abs Fourier RandomReal 1, 200 ^2 0.30 0.25 0.20 Out[1]= 0.15 0.10 0.05 50 100 150 200
31. 31. Clone mathematica_cuda \$ git clone git://github.com/kashif/mathematica_cuda.git \$ cd mathematica_cuda/src \$ mkdir cuFourier \$ mate cuFourier
32. 32. cuFourier.tm :Begin: :Function: cuFourier1D :Pattern: CUFourier1D[ a:{__?NumericQ} ] :Arguments: { a } :ArgumentTypes:{ RealList } :ReturnType: Manual :End:
33. 33. cuFourier.cu // includes system #include <stdlib.h> #include <stdio.h> #include <string.h> #include <math.h> // includes cuda #include <cufft.h> #include <cutil_inline.h> // includes mathlink #include <mathlink.h> // Complex data type typedef float2 Complex; /////////////////////////////////////////////////////////////// // Showing the use of CUFFT for fast convolution using FFT. /////////////////////////////////////////////////////////////// extern "C" void cuFourier1D(double*, long);
34. 34. //////////////////////////////////////////////////////////////////// // Main program //////////////////////////////////////////////////////////////////// int main(int argc, char *argv[]) { // use command-line specified CUDA device, otherwise use device // with highest Gflops/s if( cutCheckCmdLineFlag(argc, (const char**)argv, "device") ) cutilDeviceInit(argc, argv); else cudaSetDevice( cutGetMaxGflopsDeviceId() ); return MLMain(argc, argv); }
35. 35. void cuFourier1D (double *h_A, long n) { double norm = 1.0/sqrt((double) n); long mem_size = sizeof(Complex) * n; // Allocate host memory for the signal Complex* h_signal = (Complex*)malloc(mem_size); // Initialize the memory for the signal for (long i = 0; i < n; ++i) { h_signal[i].x = (float)h_A[i]; h_signal[i].y = 0.0f; } // Allocate device memory for signal Complex* d_signal; cutilSafeCall(cudaMalloc((void**)&d_signal, mem_size)); // Copy host memory to device cutilSafeCall(cudaMemcpy(d_signal, h_signal, mem_size, cudaMemcpyHostToDevice));
36. 36. // CUFFT plan cufftHandle plan; cufftSafeCall(cufftPlan1d(&plan, n, CUFFT_C2C, 1)); // Transform signal cufftSafeCall(cufftExecC2C(plan, (cufftComplex *)d_signal, (cufftComplex *)d_signal, CUFFT_INVERSE)); // Copy device memory to host Complex* h_convolved_signal = h_signal; cutilSafeCall(cudaMemcpy(h_convolved_signal, d_signal, mem_size, cudaMemcpyDeviceToHost)); // Release d_signal cutilSafeCall(cudaFree(d_signal)); // Destroy CUFFT context cufftSafeCall(cufftDestroy(plan));
38. 38. Makefile ################################################################## # # Build script for project # ################################################################## # Add source files here EXECUTABLE := cuFourier # CUDA source files (compiled with cudacc) CUFILES := cuFourier.cu # Additional libraries needed by the project USECUFFT := 1 # MathLink Template files TMFILES := cuFourier.tm ################################################################## # Rules and targets include ../../common/common.mk
39. 39. In[35]:= link Install " Users kashif Dropbox 20090630_NDVI_CUDA mathematica_cuda bin darwin release cuFourier" Out[35]= LinkObject Users kashif Dropbox 20090630_NDVI_CUDA mathematica_cuda bin darwin release cuFourier, 605, 9 In[36]:= LinkPatterns link Out[36]= CUFourier1D a : __ ?NumericQ In[37]:= ListLinePlot Abs CUFourier1D RandomReal 1, 200 ^2 0.4 0.3 Out[37]= 0.2 0.1 50 100 150 200 In[38]:= Uninstall link Out[38]= Users kashif Dropbox 20090630_NDVI_CUDA mathematica_cuda bin darwin release cuFourier
40. 40. Image Deconvolution for Life Sciences • Confocal and Wideﬁeld microscopy 3D or 4D images • Multichannel (3 or more channels) • Comes in a wide variety of formats
41. 41. Bio-Formats Java lib. • Standalone Java library for reading and writing life science image formats • Get both the pixels and metadata • Licensed under GPL • http://www.loci.wisc.edu/ome/formats.html
44. 44. Reading pixel volume LoadJavaClass "loci.common.DataTools" JavaClass loci.common.DataTools, volume Flatten N Table DataTools`makeDataArray reader openBytes z, 0, 0, reader getSizeX , reader getSizeY , bpp, False, True , z, 0, reader getSizeZ 1 ; unflatten e_, d__ ? IntegerQ && Positive & : Fold Partition, e, Take d , 1, 2, 1 ; Length e Times d array unflatten volume, reader getSizeX , reader getSizeY , reader getSizeZ ;
45. 45. View a slice Image array 165, All, All 255
46. 46. Image deconvled Result 165, All, All
47. 47. Wiener Deconv. :Begin: :Function: wienerDeconvolve :Pattern: WienerDeconvolve[nx_Integer, ny_Integer, nz_Integer, epsilon_Real, sigma_Real, inImage:{___Real}] :Arguments: { nx, ny, nz, epsilon, sigma, inImage } :ArgumentTypes: { Integer, Integer, Integer, Real, Real, Manual } :ReturnType: Manual :End: void wienerDeconvolve(int nx, int ny, int nz, double epsilon, double sigma) { float *inImage; int length; if(! MLGetReal32List(stdlink, &inImage, &length)) { return; }
48. 48. amira Projection view ® http://www.amiravis.com
49. 49. Export " home kashif Amira522 data deconv alphalobe MaxLike.raw", result, "Real32" ;
50. 50. Remote Sensing application
51. 51. Reﬂectance
52. 52. Vegetation
53. 53. Landsat TM Data
54. 54. Band 3 & Band 4
55. 55. NDVI = (NIR - R)/(NIR + R)
57. 57. Loading Landsat data in Mathematica In[14]:= LoadJavaClass "loci.formats.FormatTools" Out[14]= JavaClass loci.formats.FormatTools, In[15]:= bpp FormatTools`getBytesPerPixel pixelType Out[15]= 1 In[16]:= reader getSizeX Out[16]= 512 In[17]:= isLittle reader isLittleEndian Out[17]= True In[18]:= reader getSizeY Out[18]= 512 In[19]:= LoadJavaClass "loci.common.DataTools" Out[19]= JavaClass loci.common.DataTools,
58. 58. In[31]:= red DataTools`makeDataArray reader openBytes 2, 0, 0, reader getSizeX , reader getSizeY , bpp, False, True ; In[53]:= Image Partition 100 Normalize red , reader getSizeX
59. 59. In[56]:= NIR DataTools`makeDataArray reader openBytes 3, 0, 0, reader getSizeX , reader getSizeY , bpp, False, True ; In[57]:= Image Partition 100 Normalize NIR , reader getSizeX
60. 60. In[39]:= link Install " Users sabman mathematica_cuda bin darwin emurelease ndvi" Out[39]= LinkObject Users sabman mathematica_cuda bin darwin emurelease ndvi, 41, 10 In[40]:= LinkPatterns link Out[40]= ndvi a_List, b_List In[41]:= NDVI ndvi Partition NIR, reader getSizeX , Partition red, reader getSizeX ; In[42]:= Image Partition NDVI, reader getSizeX
61. 61. ndvi.tm :Begin: :Function: ndvi :Pattern: ndvi[ a_List, b_List ] :Arguments: { a, b } :ArgumentTypes: { Manual } :ReturnType: Manual :End:
62. 62. ndvi.cu void ndvi(void) { short int *h_A, *h_B; float *h_C_GPU; short int *d_A, *d_B; float *d_C; char **heads_A, **heads_B; int *dims_A, *dims_B; int rank_A, rank_B; if(! MLGetInteger16Array(stdlink, &h_A, &dims_A, &heads_A, &rank_A)) { return; } if(! MLGetInteger16Array(stdlink, &h_B, &dims_B, &heads_B, &rank_B)) { return; }
63. 63. //Initializing data h_C_GPU = (float *)malloc(dims_A[0]*dims_A[1]*sizeof(float)); //Allocating GPU memory cutilSafeCall( cudaMalloc((void **)&d_A, dims_A[0]*dims_A[1]*sizeof(short int)) ); cutilSafeCall( cudaMalloc((void **)&d_B, dims_A[0]*dims_A[1]*sizeof(short int)) ); cutilSafeCall( cudaMalloc((void **)&d_C, dims_A[0]*dims_A[1]*sizeof(float)) ); //Copy data to GPU memory for further processing cutilSafeCall( cudaMemcpy(d_A, h_A, dims_A[0]*dims_A[1]*sizeof(short int), cudaMemcpyHostToDevice) ); cutilSafeCall( cudaMemcpy(d_B, h_B, dims_A[0]*dims_A[1]*sizeof(short int), cudaMemcpyHostToDevice) ); cutilSafeCall( cudaThreadSynchronize() ); dim3 grid(ceil((float)dims_A[0]/(float)16.0f), ceil((float) dims_A[1]/32.0f), 1); dim3 threads(ceil( dims_A[0]/(float)grid.x), ceil( dims_A[1]/(float)grid.y), 1); ndviGPU<<<grid, threads>>>(d_C, d_A, d_B, dims_A[0], dims_A[1]); cutilCheckMsg("ndviGPU() execution failed\n"); cutilSafeCall( cudaThreadSynchronize() );
64. 64. //Release d_A and d_B cutilSafeCall( cudaFree(d_B) ); cutilSafeCall( cudaFree(d_A) ); //Read back GPU results into h_C_GPU cutilSafeCall( cudaMemcpy(h_C_GPU, d_C, dims_A[0]*dims_A[1]*sizeof(float), cudaMemcpyDeviceToHost) ); //Release d_C cutilSafeCall( cudaFree(d_C) ); //Return result MLPutReal32List(stdlink, h_C_GPU, dims_A[0]*dims_A[1]); //Release h_A and h_B MLReleaseInteger16Array(stdlink, h_A, dims_A, heads_A, rank_A); MLReleaseInteger16Array(stdlink, h_B, dims_B, heads_B, rank_B); cudaThreadExit();
65. 65. NDVI Kernel /////////////////////////////////////////////////////////////////////////////// // Calculate ndvi of two channels d_A and d_B on GPU and store result in d_C /////////////////////////////////////////////////////////////////////////////// __global__ void ndviGPU( float *d_C, short int *d_A, short int *d_B, int width, int height ){ unsigned int xIndex = blockIdx.x * blockDim.x + threadIdx.x; unsigned int yIndex = blockIdx.y * blockDim.y + threadIdx.y; if(xIndex < width && yIndex < height) { unsigned int i = yIndex * (width) + xIndex; d_C[i] = __fdividef( (float)(d_A[i] - d_B[i]), (float)(d_A[i] + d_B[i]) ); } }
66. 66. NDVI output 0 1 In[64]:= ArrayPlot Partition NDVI, reader getSizeX , ColorFunction "Rainbow"