SlideShare a Scribd company logo
Using CUDA
within
Mathematica

Kashif Rasul
and Raqibul Hassan

l a b s
Overview


• Intro to Mathematica and its API
• CUDA + Mathematica
• Some examples
Mathematica intro
• Mathematica is a modular
  computational system in which the
  kernel is separate from the front end
  which handles the interaction with the
  user.

• The most common way to work is to use
  interactive documents called notebooks
  which mix text input and output as well
  as graphics and other material.
Structure of
      Mathematica
• An important aspect of Mathematica is
  that it can also interact with other
  applications.

• This is achieved through MathLink,
  a standardised API for two-way
  communication with the kernel.
MathLink

• MathLink allows external programs both
  to call Mathematica, and to be called by
  Mathematica.

• We will use MathLink to let Mathematica
  call CUDA functions inside an external
  program.
Simple example
                  addtwo.tm
:Begin:
:Function:        addtwo
:Pattern:         AddTwo[i_Integer,j_Integer]
:Arguments:       { i, j }
:ArgumentTypes:   {Integer,Integer}
:ReturnType:      Integer
:End:
addtwo.c

#include <mathlink.h>

/* Return the sum of the two machine integers i and j.
   Exposed to Mathematica as AddTwo[i, j] through the addtwo.tm template. */
int addtwo( int i, int j)
{
  int sum = i;

  sum += j;
  return sum;
}

/* Program entry point: hands the command line to MLMain and exits with
   whatever status MLMain returns. */
int main(int argc, char* argv[])
{
  int status = MLMain(argc, argv);

  return status;
}
mprep & gcc


$ mprep addtwo.tm -o addtwotm.c

$ gcc -I${INCDIR} addtwotm.c addtwo.c
  -L${LIBDIR} -lMLi3 -lstdc++ -o addtwo
In[3]:=   SetDirectory
            " Applications Mathematica.app SystemFiles Links MathLink DeveloperKit
              PrebuiltExamples"

Out[3]=     Applications Mathematica.app SystemFiles Links MathLink DeveloperKit
             PrebuiltExamples

 In[4]:=   link      Install ". addtwo"
Out[4]=    LinkObject
             Applications Mathematica.app SystemFiles Links MathLink DeveloperKit
              PrebuiltExamples addtwo, 524, 8

 In[5]:=   LinkPatterns link
Out[5]=     AddTwo i_Integer, j_Integer

 In[6]:=   ? AddTwo

           AddTwo x , y gives the sum of two machine integers x and y.


 In[7]:=   AddTwo 2, 3
Out[7]=    5

 In[8]:=   AddTwo[2^31 - 1, 1]
Out[8]=    -2 147 483 648

 In[9]:=   Uninstall link
Out[9]=     Applications Mathematica.app SystemFiles Links MathLink DeveloperKit
             PrebuiltExamples addtwo
MathLink
        Template ļ¬le
ā€¢ When a MathLink template ļ¬le is
  processed, two basic things are done:
 ā€¢   :Pattern:& :Arguments: speciļ¬cations
     are used to generate a Mathematica
     deļ¬nition
 ā€¢   :Function:, :ArgumentTypes:
     & :ReturnType: speciļ¬cations are used
     to generate C source code
:ArgumentTypes:

Mathematica speciļ¬cation   C speciļ¬cation

       Integer                  int
         Real                 double
     IntegerList            int*, long
       RealList            double*, long
        String                 char*
        Symbol                 char*
        Manual                 void
Handling
              Lists & Arrays
:Begin:                          int sumList(int *a, long alen)
:Function:     sumList           {
:Pattern:      SumList[a_List]      int i, tot=0;
:Arguments:    {a}
:ArgumentTypes:{IntegerList}         for(i=0; i<alen; i++)
:ReturnType:   Integer                  tot += a[i];
:End:
                                     return tot;
                                 }
Manual ArgumentTypes
                 :Begin:
                 :Function:     sumList
                 :Pattern:      SumList[a:{___Integer}]
                 :Arguments:    {a}
                 :ArgumentTypes:{Manual}
                 :ReturnType:   Integer
                 :End:

int sumList(void) {                       int sumList(void) {
  int n, i;                                 int n;
  int a[MAX];                               int *a;

  MLCheckFunction(stdlink, "List", &n);       MLGetInteger32List(stdlink, &a, &n);
                                              ...
  for (i=0; i<n; i++)                         MLReleaseInteger32List(stdlink, a, n);
    MLGetInteger32(stdlink, a+i);             ...
...                                       }
}
Array of arb. depth
#include <mathlink.h>

/* Read an array of double-precision floating-point numbers from the link lp
   and, once it has been used, hand its storage back to MathLink. */
void f(MLINK lp)
{
    double *values;    /* element data filled in by MLGetRealArray */
    int *extents;      /* dimensions filled in by MLGetRealArray */
    char **headNames;  /* heads filled in by MLGetRealArray */
    int depth;         /* stores the rank of the array */

    if(MLGetRealArray(lp, &values, &extents, &headNames, &depth))
        {
            /* ... */
            MLReleaseRealArray(lp, values, extents, headNames, depth);
        }
    /* otherwise the array could not be read from lp: nothing to release */
}
Handling Complex
           numbers
                    In[1]:=   Head[2 + 3 I]
                   Out[1]=    Complex

If you pass a list of complex numbers to your external program,
then MLGetReal64Array() will create a two-dimensional array
containing a sequence of pairs of real and imaginary parts. In this
case, heads[0] will be "List" while heads[1] will be "Complex".

    //get an array of floating-point numbers of any depth
    MLGetReal64Array(stdlink,double**a,int**dims,char***heads,int*d);
Summary of API
//get a list of integers, allocating the memory needed to store it
MLGetInteger32List(stdlink,int**a,int*n);
//get a list of floating-point numbers
MLGetReal64List(stdlink,double**a,int*n);
//release the memory associated with a list of integers
MLReleaseInteger32List(stdlink,int*a,int n);
//release the memory associated with a list of floating-point numbers
MLReleaseReal64List(stdlink,double*a,int n);



//get an array of integers of any depth
MLGetInteger32Array(stdlink,int**a,int**dims,char***heads,int*d);
//get an array of floating-point numbers of any depth
MLGetReal32Array(stdlink,float**a,int**dims,char***heads,int*d);
//release memory associated with an integer array
MLReleaseInteger32Array(stdlink,int*a,int*dims,char**heads,int d);
//release memory associated with a floating-point array
MLReleaseReal32Array(stdlink,float*a,int*dims,char**heads,int d);
Manual ReturnType
                                   void bits(int i)
                                   {
                                     int a[32], k;
:Begin:
:Function:     bits                    for(k=0; k<32; k++) {
:Pattern:      ToBits[i_Integer]         a[k] = i%2;
:Arguments:    {i}                       i >>= 1;
:ArgumentTypes:{Integer}                 if (i==0) break;
:ReturnType:   Manual                  }
:End:
                                       if (k<32) k++;

                                       MLPutInteger32List(stdlink,
                                                          a, k);
                                       return;
                                   }
General array
     int a[8][16][100];
     int dims[] = {8, 16, 100};

     MLPutInteger32Array(stdlink, a, dims, NULL, 3);

or
     int ***a;

     MLPutFunction(stdlink, "List", n1);
     for (i=0; i<n1; i++) {
       MLPutFunction(stdlink, "List", n2);
       for (j=0; j<n2; j++) {
         MLPutInteger32List(stdlink, a[i][j], n3);
       }
     }
Unknown length
 In[10]:=     {Sequence[1, Sequence[4, Sequence[]]]}
Out[10]=      {1, 4}


            MLPutFunction(stdlink, "List", 1);

            while( condition )
            {
              /* generate an element */
              MLPutFunction(stdlink, "Sequence", 2);
              MLPutInteger32(stdlink, i );
            }

            MLPutFunction(stdlink, "Sequence", 0);
Return Complex
             numbers
// Complex data type
typedef float2 Complex;

Complex* h_convolved_signal;

// Return transformed signal to Mathematica as a Complex List
MLPutFunction(stdlink,"List",n);
for (long i = 0; i < n; i++) {
    MLPutFunction(stdlink,"Complex",2);
    MLPutFloat(stdlink,h_convolved_signal[i].x*norm);
    MLPutFloat(stdlink,h_convolved_signal[i].y*norm);
}
Return Complex
              numbers
 In[4]:=   list   Table RandomReal   , 12
Out[4]=    0.389421, 0.222396, 0.434636, 0.0886136, 0.233102, 0.941771,
           0.928712, 0.764119, 0.791473, 0.381426, 0.757661, 0.44273

 In[5]:=   Map Function   x , Apply Complex, x   , Partition list, 2


Out[5]=    0.389421   0.222396 , 0.434636   0.0886136 , 0.233102 0.941771 ,
           0.928712   0.764119 , 0.791473   0.381426 , 0.757661 0.44273

           // Return transformed signal to Mathematica as a Complex List
           MLPutFunction(stdlink, "Map", 2);
           MLPutFunction(stdlink, "Function", 2);
           MLPutFunction(stdlink, "List", 1);
           MLPutSymbol(stdlink, "x");
           MLPutFunction(stdlink, "Apply", 2);
           MLPutSymbol(stdlink, "Complex");
           MLPutSymbol(stdlink, "x");
           MLPutFunction(stdlink, "Partition", 2);
           MLPutFunction(stdlink, "Times", 2);
           MLPutReal(stdlink, norm);
           MLPutReal32List(stdlink, (float*)h_convolved_signal, 2*n);
           MLPutInteger(stdlink, 2);
Error & Interrupt
if(! MLPutInteger(stdlink, 10))              if(! MLPutReal64(stdlink, 3.22))
{                                            {
  /* check the possible errors */              /* unable to send 3.22 to lp */
  switch(MLError(stdlink))                     printf("MathLink Error: %sn",
  {                                                   MLErrorMessage(stdlink));
    case MLEDEAD:                              MLClearError(stdlink);
      /* the link died unexpectedly */       }
      break;
    case MLECLOSED:
      /* the other side closed the link */
      break;
    case MLEOK:
      /* no error occurred */                while(len--)
      break;                                 {
    default:                                   sum += *list++;
    /* ... */                                  /* check for the abort */
  }                                            if(MLAbort) return (double)0;
}                                            }
Running on remote
                  computers
           $ ./addtwo -linkcreate -linkprotocol TCPIP
           Link created on: 63166@192.168.1.107,63167@192.168.1.107



 In[5]:=   Install LinkConnect "63166 192.168.1.107,63167 192.168.1.107",
             LinkProtocol "TCPIP"

Out[5]=    LinkObject 63166 192.168.1.107,63167 192.168.1.107, 1110, 8

 In[6]:=   AddTwo 2, 3
Out[6]=    5
Mathematica + CUDA
#include <cutil_inline.h>

int main(int argc, char **argv)
{
    // Select the CUDA device before entering the MathLink main loop:
    // without a "device" command-line flag, fall back to the device
    // with the highest Gflops/s; with one, let cutil initialise it.
    if(!cutCheckCmdLineFlag(argc, (const char**)argv, "device"))
    {
         cudaSetDevice( cutGetMaxGflopsDeviceId() );
    }
    else
    {
         cutilDeviceInit(argc, argv);
    }

    // The process exit status is whatever MLMain returns
    return MLMain(argc, argv);
}
mathematica_cuda
# Add source files here
EXECUTABLE := cuFourier
# CUDA source files (compiled with cudacc)
CUFILES     := cuFourier.cu
# CUDA dependency files
# CU_DEPS       :=
# C/C++ source files (compiled with gcc / c++)
# CCFILES       :=
# Additional libraries needed by the project
USECUFFT        := 1
# MathLink Template files
TMFILES     := cuFourier.tm

###################################################
# Rules and targets

include ../../common/common.mk
FindCUDA +
FindMathLink via CMake

 • CMake http://www.cmake.org/
 • FindCUDA https://gforge.sci.utah.edu/
   gf/project/findcuda/

 • FindMathLink http://github.com/kashif/
   FindMathLink/tree
CMakeLists.txt
set(CUDA_ATTACH_VS_BUILD_RULE_TO_CUDA_FILE OFF)
set(source_files test_bin.cu)
CUDA_COMPILE(CUDA_FILES test_bin.cu)

MathLink_ADD_TM(test.tm)

INCLUDE_DIRECTORIES(
  ${MathLink_INCLUDE_DIR}
  )
LINK_DIRECTORIES(
  ${MathLink_LIBRARY_DIR}
  )

ADD_EXECUTABLE(cuda_compile_example
   ${CUDA_FILES}
   ${source_files}
   test.tm.c
   main.cc
   external_dependency.h
   )
TARGET_LINK_LIBRARIES(cuda_compile_example
 ${MathLink_LIBRARIES}
 ${CUDA_LIBRARIES}
 )
double to float
                 conversion
#include <cutil_inline.h>
// General check for CUDA GPU SM Capabilities
//inline bool cutilDrvCudaCapabilities(int major_version, int minor_version);

char **heads;
int *dims;
int rank;
float *h_float;
double *h_double;

if (cutilDrvCudaCapabilities( 1,3 ))
{
     MLGetReal64Array(stdlink, &h_double, &dims, &heads, &rank);
}
else
{
     MLGetReal32Array(stdlink, &h_float, &dims, &heads, &rank);
}
CUBLAS & CUFFT

• Follow the usual routine of sending data
  to the MathLink app

• Use CUBLAS or CUFFT
• Return result back to Mathematica
cuFourier
In[1]:=   ListLinePlot Abs Fourier RandomReal 1, 200   ^2

          0.30


          0.25


          0.20


Out[1]= 0.15


          0.10


          0.05



                       50          100        150           200
Clone mathematica_cuda


$ git clone
  git://github.com/kashif/mathematica_cuda.git

$ cd mathematica_cuda/src

$ mkdir cuFourier

$ mate cuFourier
cuFourier.tm


:Begin:
:Function:     cuFourier1D
:Pattern:      CUFourier1D[ a:{__?NumericQ} ]
:Arguments:    { a }
:ArgumentTypes:{ RealList }
:ReturnType:   Manual
:End:
cuFourier.cu
// includes system
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>

// includes cuda
#include <cufft.h>
#include <cutil_inline.h>

// includes mathlink
#include <mathlink.h>

// Complex data type
typedef float2 Complex;

///////////////////////////////////////////////////////////////
// Showing the use of CUFFT for fast convolution using FFT.
///////////////////////////////////////////////////////////////
extern "C" void cuFourier1D(double*, long);
////////////////////////////////////////////////////////////////////
// Main program
////////////////////////////////////////////////////////////////////
int main(int argc, char *argv[])
{
    // Was a CUDA device requested on the command line?
    const bool deviceOnCmdLine =
        cutCheckCmdLineFlag(argc, (const char**)argv, "device") != 0;

    if( deviceOnCmdLine )
    {
         // use the command-line specified CUDA device
         cutilDeviceInit(argc, argv);
    }
    else
    {
         // otherwise use the device with the highest Gflops/s
         cudaSetDevice( cutGetMaxGflopsDeviceId() );
    }

    // Serve MathLink requests; the exit status is MLMain's return value
    return MLMain(argc, argv);
}
////////////////////////////////////////////////////////////////////
// cuFourier1D: 1D Fourier transform of a real list, computed with CUFFT.
//
//   h_A  host array holding the n real samples
//   n    number of samples
//
// The samples are converted to single-precision complex values with a
// zero imaginary part, transformed in place on the device with a C2C
// plan in the CUFFT_INVERSE direction, and copied back to the host.
// The result is sent over stdlink (Manual return type) as the expression
//
//   Map[Function[{x}, Apply[Complex, x]],
//       Partition[Times[norm, <2*n floats>], 2]]
//
// with norm = 1/sqrt(n). If the host buffer cannot be allocated, the
// symbol $Failed is sent instead so that the link always gets a result.
////////////////////////////////////////////////////////////////////
void cuFourier1D (double *h_A, long n)
{
    const double norm = 1.0/sqrt((double) n);
    const size_t mem_size = sizeof(Complex) * (size_t)n;

    // Allocate host memory for the signal
    Complex* h_signal = (Complex*)malloc(mem_size);
    if (h_signal == NULL) {
        // A Manual-return function must still put an expression on the
        // link; report the failure instead of dereferencing NULL below.
        MLPutSymbol(stdlink, "$Failed");
        return;
    }

    // Initialize the memory for the signal: real part from the input,
    // imaginary part zero
    for (long i = 0; i < n; ++i) {
        h_signal[i].x = (float)h_A[i];
        h_signal[i].y = 0.0f;
    }

    // Allocate device memory for signal
    Complex* d_signal;
    cutilSafeCall(cudaMalloc((void**)&d_signal, mem_size));
    // Copy host memory to device
    cutilSafeCall(cudaMemcpy(d_signal, h_signal, mem_size,
                             cudaMemcpyHostToDevice));

    // CUFFT plan: a single 1D complex-to-complex transform of length n
    cufftHandle plan;
    cufftSafeCall(cufftPlan1d(&plan, (int)n, CUFFT_C2C, 1));

    // Transform signal in place (input and output are both d_signal)
    cufftSafeCall(cufftExecC2C(plan, (cufftComplex *)d_signal,
                                     (cufftComplex *)d_signal,
                                     CUFFT_INVERSE));

    // Copy device memory to host, reusing the input buffer for the result
    Complex* h_convolved_signal = h_signal;
    cutilSafeCall(cudaMemcpy(h_convolved_signal, d_signal,
                             mem_size, cudaMemcpyDeviceToHost));

    // Release d_signal
    cutilSafeCall(cudaFree(d_signal));

    // Destroy CUFFT context
    cufftSafeCall(cufftDestroy(plan));

    // Return transformed signal to Mathematica as a Complex List.
    // The buffer of n Complex values is sent as 2*n interleaved floats;
    // Mathematica scales them by norm, pairs them up and applies Complex.
    MLPutFunction(stdlink, "Map", 2);
    MLPutFunction(stdlink, "Function", 2);
    MLPutFunction(stdlink, "List", 1);
    MLPutSymbol(stdlink, "x");
    MLPutFunction(stdlink, "Apply", 2);
    MLPutSymbol(stdlink, "Complex");
    MLPutSymbol(stdlink, "x");
    MLPutFunction(stdlink, "Partition", 2);
    MLPutFunction(stdlink, "Times", 2);
    MLPutReal(stdlink, norm);
    MLPutReal32List(stdlink, (float*)h_convolved_signal, (int)(2*n));
    MLPutInteger(stdlink, 2);

    // Cleanup memory
    free(h_signal);

    // NOTE(review): this runs after every call, not once at program exit —
    // confirm that tearing down the CUDA runtime state here is intended.
    cudaThreadExit();
}
Makefile
##################################################################
#
# Build script for project
#
##################################################################

# Add source files here
EXECUTABLE := cuFourier
# CUDA source files (compiled with cudacc)
CUFILES     := cuFourier.cu
# Additional libraries needed by the project
USECUFFT        := 1

# MathLink Template files
TMFILES     := cuFourier.tm

##################################################################
# Rules and targets
include ../../common/common.mk
In[35]:=   link
             Install
              " Users kashif Dropbox 20090630_NDVI_CUDA mathematica_cuda bin darwin
                 release cuFourier"
Out[35]=    LinkObject
              Users kashif Dropbox 20090630_NDVI_CUDA mathematica_cuda bin darwin
               release cuFourier, 605, 9

 In[36]:=   LinkPatterns link
Out[36]=     CUFourier1D a : __ ?NumericQ

 In[37]:=   ListLinePlot Abs CUFourier1D RandomReal 1, 200    ^2



            0.4



            0.3


Out[37]=
            0.2



            0.1




                         50          100        150          200


 In[38]:=   Uninstall link
Out[38]=     Users kashif Dropbox 20090630_NDVI_CUDA mathematica_cuda bin darwin
              release cuFourier
Image Deconvolution
  for Life Sciences

• Confocal and Widefield microscopy
  3D or 4D images

• Multichannel (3 or more channels)
• Comes in a wide variety of formats
Bio-Formats Java lib.

• Standalone Java library for reading and
  writing life science image formats

• Get both the pixels and metadata
• Licensed under GPL
• http://www.loci.wisc.edu/ome/
  formats.html
Java + Mathematica:
             J/Link
Needs "JLink`"

InstallJava
LinkObject
 ' usr local Wolfram Mathematica 7.0 SystemFiles Java Linux x86 64 bin java' classpath
   " usr local Wolfram Mathematica 7.0 SystemFiles Links JLink JLink.jar"
    Xmx256m   Djava.system.class.loader com.wolfram.jlink.JLinkSystemClassLoader
    Djava.util.prefs.PreferencesFactory com.wolfram.jlink.DisabledPreferencesFactory
   com.wolfram.jlink.Install init " tmp m000001207601", 4, 4

ReinstallJava ClassPath   " home kashif Dropbox BioFormats Java loci_tools.jar"
LinkObject
 ' usr local Wolfram Mathematica 7.0 SystemFiles Java Linux x86 64 bin java' classpath
   " usr local Wolfram Mathematica 7.0 SystemFiles Links JLink JLink.jar"
    Xmx256m   Djava.system.class.loader com.wolfram.jlink.JLinkSystemClassLoader
    Djava.util.prefs.PreferencesFactory com.wolfram.jlink.DisabledPreferencesFactory
   com.wolfram.jlink.Install init " tmp m000002207601", 8, 4
Reading LIF images
reader    JavaNew "loci.formats.ImageReader"      LoadJavaClass "loci.formats.FormatTools"

« JavaObject loci.formats.ImageReader »           JavaClass loci.formats.FormatTools,

                                                  bpp   FormatTools`getBytesPerPixel pixelType
reader setId " media cdrom xyz 1ch by2 MT1.lif"
                                                  1
reader getSeriesCount
                                                  reader getSizeX
7
                                                  512
reader setSeries 0
                                                  reader getSizeY
sizeC    reader getSizeC                          512
1
                                                  reader getSizeZ
pixelType    reader getPixelType                  90

1

num     reader getImageCount
90
Reading pixel volume
 LoadJavaClass "loci.common.DataTools"
 JavaClass loci.common.DataTools,

 volume
   Flatten
    N
      Table DataTools`makeDataArray
        reader openBytes z, 0, 0, reader getSizeX   , reader getSizeY   , bpp, False, True ,
        z, 0, reader getSizeZ    1    ;




unflatten e_, d__ ? IntegerQ               && Positive     &   :
 Fold Partition, e, Take d ,              1, 2, 1      ; Length e                Times d

array   unflatten volume, reader getSizeX                    , reader getSizeY         ,
    reader getSizeZ     ;
View a slice
Image array   165, All, All   255
Image deconvled
                   Result
                  165, All, All
Wiener Deconv.
:Begin:
:Function:      wienerDeconvolve
:Pattern:       WienerDeconvolve[nx_Integer, ny_Integer, nz_Integer,
                                 epsilon_Real, sigma_Real, inImage:{___Real}]
:Arguments:     { nx, ny, nz, epsilon, sigma, inImage }
:ArgumentTypes: { Integer, Integer, Integer, Real, Real, Manual }
:ReturnType:    Manual
:End:



void wienerDeconvolve(int nx, int ny, int nz, double epsilon, double sigma)
{
    float *inImage;
    int length;

    if(! MLGetReal32List(stdlink, &inImage, &length))
    {
        return;
    }
amira Projection view
     ®


http://www.amiravis.com
Export " home kashif Amira522 data deconv alphalobe MaxLike.raw",
  result, "Real32" ;
Remote Sensing
  application
Reflectance
Vegetation
Landsat TM Data
Band 3 & Band 4
NDVI = (NIR - R) / (NIR + R)
Reading Landsat Images
   In[4]:=   reader    JavaNew "loci.formats.ImageReader"
  Out[4]=    « JavaObject loci.formats.ImageReader »

   In[5]:=   reader    JavaNew "loci.formats.ChannelSeparator", reader
  Out[5]=    « JavaObject loci.formats.ChannelSeparator »

  In[35]:=   reader setId " Users sabman satellite_images multispectral bhtmref.tif"

   In[7]:=   reader getSeriesCount
  Out[7]=    1

   In[8]:=   sizeC    reader getSizeC
  Out[8]=    6

   In[9]:=   pixelType    reader getPixelType
  Out[9]=    1

  In[11]:=   num     reader getImageCount
 Out[11]=    6

  In[12]:=   pixelType    reader getPixelType
Loading Landsat data
  in Mathematica
    In[14]:=   LoadJavaClass "loci.formats.FormatTools"
   Out[14]=    JavaClass loci.formats.FormatTools,

    In[15]:=   bpp    FormatTools`getBytesPerPixel pixelType
   Out[15]=    1

    In[16]:=   reader getSizeX
   Out[16]=    512

    In[17]:=   isLittle   reader isLittleEndian
   Out[17]=    True

    In[18]:=   reader getSizeY
   Out[18]=    512

    In[19]:=   LoadJavaClass "loci.common.DataTools"
   Out[19]=    JavaClass loci.common.DataTools,
In[31]:=   red      DataTools`makeDataArray
                 reader openBytes 2, 0, 0, reader getSizeX   , reader getSizeY   , bpp, False, True ;

In[53]:=   Image Partition 100 Normalize red , reader getSizeX
In[56]:=   NIR      DataTools`makeDataArray
                 reader openBytes 3, 0, 0, reader getSizeX   , reader getSizeY   , bpp, False, True ;

In[57]:=   Image Partition 100 Normalize NIR , reader getSizeX
In[39]:=   link   Install " Users sabman mathematica_cuda bin darwin emurelease ndvi"
Out[39]=    LinkObject   Users sabman mathematica_cuda bin darwin emurelease ndvi, 41, 10

 In[40]:=   LinkPatterns link
Out[40]=    ndvi a_List, b_List

 In[41]:=   NDVI   ndvi Partition NIR, reader getSizeX    , Partition red, reader getSizeX   ;

 In[42]:=   Image Partition NDVI, reader getSizeX
ndvi.tm


:Begin:
:Function:        ndvi
:Pattern:         ndvi[ a_List, b_List ]
:Arguments:       { a, b }
:ArgumentTypes:   { Manual }
:ReturnType:      Manual
:End:
ndvi.cu
// ndvi: MathLink entry point for ndvi[a, b] (Manual arguments and return).
//
// Reads two Integer16 arrays from stdlink, evaluates ndviGPU on every pair
// of corresponding elements on the GPU, and sends the result back as a flat
// Real32 list of dims_A[0]*dims_A[1] values.
//
// If an array cannot be read, the function returns without sending a
// result. If the two arrays are not both rank 2 with identical, non-zero
// dimensions, or the host result buffer cannot be allocated, the symbol
// $Failed is sent instead of a list.
void ndvi(void)
{
    short int *h_A, *h_B;
    short int *d_A, *d_B;
    float *d_C;

    char **heads_A, **heads_B;
    int *dims_A, *dims_B;
    int rank_A, rank_B;

    if(! MLGetInteger16Array(stdlink, &h_A, &dims_A, &heads_A, &rank_A))
    {
        return;
    }

    if(! MLGetInteger16Array(stdlink, &h_B, &dims_B, &heads_B, &rank_B))
    {
        // The first array was already received: hand it back before leaving
        MLReleaseInteger16Array(stdlink, h_A, dims_A, heads_A, rank_A);
        return;
    }

    // Every copy below is sized from A, so B must have exactly A's shape;
    // zero-sized input would also give an invalid launch configuration.
    const bool shapesOk = rank_A == 2 && rank_B == 2
                       && dims_A[0] > 0 && dims_A[1] > 0
                       && dims_B[0] == dims_A[0] && dims_B[1] == dims_A[1];

    const size_t count      = shapesOk ? (size_t)dims_A[0] * (size_t)dims_A[1] : 0;
    const size_t inBytes    = count * sizeof(short int);
    const size_t outBytes   = count * sizeof(float);

    //Initializing data
    float *h_C_GPU = shapesOk ? (float *)malloc(outBytes) : NULL;
    if (h_C_GPU == NULL)
    {
        MLReleaseInteger16Array(stdlink, h_A, dims_A, heads_A, rank_A);
        MLReleaseInteger16Array(stdlink, h_B, dims_B, heads_B, rank_B);
        // Manual return type: always put an expression on the link
        MLPutSymbol(stdlink, "$Failed");
        return;
    }

    //Allocating GPU memory
    cutilSafeCall( cudaMalloc((void **)&d_A, inBytes) );
    cutilSafeCall( cudaMalloc((void **)&d_B, inBytes) );
    cutilSafeCall( cudaMalloc((void **)&d_C, outBytes) );

    //Copy data to GPU memory for further processing
    cutilSafeCall( cudaMemcpy(d_A, h_A, inBytes, cudaMemcpyHostToDevice) );
    cutilSafeCall( cudaMemcpy(d_B, h_B, inBytes, cudaMemcpyHostToDevice) );

    cutilSafeCall( cudaThreadSynchronize() );

    // Launch configuration (integer ceil-divisions): split dims_A[0] into
    // chunks of at most 16 and dims_A[1] into chunks of at most 32, then
    // size each block so that grid * threads covers the whole array.
    const unsigned int gridX = ((unsigned int)dims_A[0] + 15u) / 16u;
    const unsigned int gridY = ((unsigned int)dims_A[1] + 31u) / 32u;
    dim3 grid(gridX, gridY, 1);
    dim3 threads(((unsigned int)dims_A[0] + gridX - 1u) / gridX,
                 ((unsigned int)dims_A[1] + gridY - 1u) / gridY, 1);

    ndviGPU<<<grid, threads>>>(d_C, d_A, d_B, dims_A[0], dims_A[1]);
    cutilCheckMsg("ndviGPU() execution failed\n");
    cutilSafeCall( cudaThreadSynchronize() );

    //Release d_A and d_B
    cutilSafeCall( cudaFree(d_B) );
    cutilSafeCall( cudaFree(d_A) );

    //Read back GPU results into h_C_GPU
    cutilSafeCall( cudaMemcpy(h_C_GPU, d_C, outBytes, cudaMemcpyDeviceToHost) );

    //Release d_C
    cutilSafeCall( cudaFree(d_C) );

    //Return result
    MLPutReal32List(stdlink, h_C_GPU, (int)count);

    //Release h_A and h_B
    MLReleaseInteger16Array(stdlink, h_A, dims_A, heads_A, rank_A);
    MLReleaseInteger16Array(stdlink, h_B, dims_B, heads_B, rank_B);

    //Release the host result buffer
    free(h_C_GPU);

    // NOTE(review): this runs after every call, not once at program exit —
    // confirm that tearing down the CUDA runtime state here is intended.
    cudaThreadExit();
}
NDVI Kernel
///////////////////////////////////////////////////////////////////////////////
// Calculate ndvi of two channels d_A and d_B on GPU and store result in d_C
//
// Each thread handles one element i = yIndex * width + xIndex and writes
//   d_C[i] = (d_A[i] - d_B[i]) / (d_A[i] + d_B[i])
// using the fast-math division intrinsic __fdividef.
//
// Launch layout: 2D grid of 2D blocks; x indexes [0, width), y indexes
// [0, height). Threads outside that range do nothing, so the grid may
// overshoot the data. No shared memory is used.
//
// Elements whose two samples sum to zero produce 0 rather than the
// NaN/Inf a division by zero would give.
///////////////////////////////////////////////////////////////////////////////

__global__ void ndviGPU(
    float *d_C,
    const short int *d_A,
    const short int *d_B,
    int width,
    int height
){

    const unsigned int xIndex = blockIdx.x * blockDim.x + threadIdx.x;
    const unsigned int yIndex = blockIdx.y * blockDim.y + threadIdx.y;

    // Guard the grid tail; the explicit casts make the unsigned comparison
    // that the implicit conversion performed anyway visible.
    if(xIndex >= (unsigned int)width || yIndex >= (unsigned int)height)
    {
        return;
    }

    const unsigned int i = yIndex * (unsigned int)width + xIndex;

    // Load each sample once; short operands are promoted to int, so the
    // sum and difference cannot overflow.
    const int a = d_A[i];
    const int b = d_B[i];
    const int sum = a + b;

    d_C[i] = (sum != 0) ? __fdividef( (float)(a - b), (float)sum ) : 0.0f;
}
NDVI output




                                 0                     1


In[64]:=   ArrayPlot Partition NDVI, reader getSizeX       , ColorFunction   "Rainbow"
Questions?

http://hpc.nomad-labs.com
    kashif@nomad-labs.com
         twitter krasul

More Related Content

What's hot

Data structure lab manual
Data structure lab manualData structure lab manual
Data structure lab manual
nikshaikh786
Ā 
C Programming - Refresher - Part II
C Programming - Refresher - Part II C Programming - Refresher - Part II
C Programming - Refresher - Part II
Emertxe Information Technologies Pvt Ltd
Ā 
A taste of Functional Programming
A taste of Functional ProgrammingA taste of Functional Programming
A taste of Functional Programming
Jordan Open Source Association
Ā 
Scala is java8.next()
Scala is java8.next()Scala is java8.next()
Scala is java8.next()
daewon jeong
Ā 
Lecture11 standard template-library
Lecture11 standard template-libraryLecture11 standard template-library
Lecture11 standard template-libraryHariz Mustafa
Ā 
Matlab Functions
Matlab FunctionsMatlab Functions
Matlab Functions
Umer Azeem
Ā 
Single linked list
Single linked listSingle linked list
Single linked list
jasbirsingh chauhan
Ā 
Lec 45.46- virtual.functions
Lec 45.46- virtual.functionsLec 45.46- virtual.functions
Lec 45.46- virtual.functionsPrincess Sam
Ā 
Introduction to functional programming using Ocaml
Introduction to functional programming using OcamlIntroduction to functional programming using Ocaml
Introduction to functional programming using Ocaml
pramode_ce
Ā 
C++ Pointers
C++ PointersC++ Pointers
C++ Pointers
Chaand Sheikh
Ā 
C++11 & C++14
C++11 & C++14C++11 & C++14
C++11 & C++14
CyberPlusIndia
Ā 
07. Arrays
07. Arrays07. Arrays
07. Arrays
Intro C# Book
Ā 
Arrays
ArraysArrays
Arrays
archikabhatia
Ā 
Functional Core and Imperative Shell - Game of Life Example - Haskell and Scala
Functional Core and Imperative Shell - Game of Life Example - Haskell and ScalaFunctional Core and Imperative Shell - Game of Life Example - Haskell and Scala
Functional Core and Imperative Shell - Game of Life Example - Haskell and Scala
Philip Schwarz
Ā 
L7 pointers
L7 pointersL7 pointers
L7 pointers
Kathmandu University
Ā 
friends functionToshu
friends functionToshufriends functionToshu
friends functionToshuSidd Singh
Ā 
Pointers [compatibility mode]
Pointers [compatibility mode]Pointers [compatibility mode]
Pointers [compatibility mode]
Kathmandu University
Ā 
Unit 6 pointers
Unit 6   pointersUnit 6   pointers
Unit 6 pointers
George Erfesoglou
Ā 
13. Java text processing
13.  Java text processing13.  Java text processing
13. Java text processing
Intro C# Book
Ā 
C tech questions
C tech questionsC tech questions
C tech questions
vijay00791
Ā 

What's hot (20)

Data structure lab manual
Data structure lab manualData structure lab manual
Data structure lab manual
Ā 
C Programming - Refresher - Part II
C Programming - Refresher - Part II C Programming - Refresher - Part II
C Programming - Refresher - Part II
Ā 
A taste of Functional Programming
A taste of Functional ProgrammingA taste of Functional Programming
A taste of Functional Programming
Ā 
Scala is java8.next()
Scala is java8.next()Scala is java8.next()
Scala is java8.next()
Ā 
Lecture11 standard template-library
Lecture11 standard template-libraryLecture11 standard template-library
Lecture11 standard template-library
Ā 
Matlab Functions
Matlab FunctionsMatlab Functions
Matlab Functions
Ā 
Single linked list
Single linked listSingle linked list
Single linked list
Ā 
Lec 45.46- virtual.functions
Lec 45.46- virtual.functionsLec 45.46- virtual.functions
Lec 45.46- virtual.functions
Ā 
Introduction to functional programming using Ocaml
Introduction to functional programming using OcamlIntroduction to functional programming using Ocaml
Introduction to functional programming using Ocaml
Ā 
C++ Pointers
C++ PointersC++ Pointers
C++ Pointers
Ā 
C++11 & C++14
C++11 & C++14C++11 & C++14
C++11 & C++14
Ā 
07. Arrays
07. Arrays07. Arrays
07. Arrays
Ā 
Arrays
ArraysArrays
Arrays
Ā 
Functional Core and Imperative Shell - Game of Life Example - Haskell and Scala
Functional Core and Imperative Shell - Game of Life Example - Haskell and ScalaFunctional Core and Imperative Shell - Game of Life Example - Haskell and Scala
Functional Core and Imperative Shell - Game of Life Example - Haskell and Scala
Ā 
L7 pointers
L7 pointersL7 pointers
L7 pointers
Ā 
friends functionToshu
friends functionToshufriends functionToshu
friends functionToshu
Ā 
Pointers [compatibility mode]
Pointers [compatibility mode]Pointers [compatibility mode]
Pointers [compatibility mode]
Ā 
Unit 6 pointers
Unit 6   pointersUnit 6   pointers
Unit 6 pointers
Ā 
13. Java text processing
13.  Java text processing13.  Java text processing
13. Java text processing
Ā 
C tech questions
C tech questionsC tech questions
C tech questions
Ā 

Similar to Using Cuda Within Mathematica

Let Us Learn Lambda Using C# 3.0
Let Us Learn Lambda Using C# 3.0Let Us Learn Lambda Using C# 3.0
Let Us Learn Lambda Using C# 3.0
Sheik Uduman Ali
Ā 
Arrays and function basic c programming notes
Arrays and function basic c programming notesArrays and function basic c programming notes
Arrays and function basic c programming notes
GOKULKANNANMMECLECTC
Ā 
Pointer
PointerPointer
Pointer
saeeb12
Ā 
The STL
The STLThe STL
The STL
adil raja
Ā 
operating system ubuntu,linux,MacProgram will work only if you g.pdf
operating system ubuntu,linux,MacProgram will work only if you g.pdfoperating system ubuntu,linux,MacProgram will work only if you g.pdf
operating system ubuntu,linux,MacProgram will work only if you g.pdf
aptcomputerzone
Ā 
C++ Please test your program before you submit the answer.pdf
C++ Please test your program before you submit the answer.pdfC++ Please test your program before you submit the answer.pdf
C++ Please test your program before you submit the answer.pdf
aashisha5
Ā 
Best C++ Programming Homework Help
Best C++ Programming Homework HelpBest C++ Programming Homework Help
Best C++ Programming Homework Help
C++ Homework Help
Ā 
Pointers in C Language
Pointers in C LanguagePointers in C Language
Pointers in C Language
madan reddy
Ā 
Codejunk Ignitesd
Codejunk IgnitesdCodejunk Ignitesd
Codejunk Ignitesd
carlmanaster
Ā 
Operator overloading2
Operator overloading2Operator overloading2
Operator overloading2
zindadili
Ā 
CPP Homework Help
CPP Homework HelpCPP Homework Help
CPP Homework Help
C++ Homework Help
Ā 
Copy on write
Copy on writeCopy on write
Copy on write
Somenath Mukhopadhyay
Ā 
C++ Background Circular Linked List A circular linked list.pdf
C++ Background Circular Linked List A circular linked list.pdfC++ Background Circular Linked List A circular linked list.pdf
C++ Background Circular Linked List A circular linked list.pdf
saradashata
Ā 
An imperative study of c
An imperative study of cAn imperative study of c
An imperative study of c
Tushar B Kute
Ā 
Background Circular Linked List A circular linked list is .pdf
Background Circular Linked List A circular linked list is .pdfBackground Circular Linked List A circular linked list is .pdf
Background Circular Linked List A circular linked list is .pdf
aaseletronics2013
Ā 
C programming
C programmingC programming
C programming
Karthikeyan A K
Ā 
Lecture5
Lecture5Lecture5
Lecture5
Sunil Gupta
Ā 

Similar to Using Cuda Within Mathematica (20)

Let Us Learn Lambda Using C# 3.0
Let Us Learn Lambda Using C# 3.0Let Us Learn Lambda Using C# 3.0
Let Us Learn Lambda Using C# 3.0
Ā 
Arrays and function basic c programming notes
Arrays and function basic c programming notesArrays and function basic c programming notes
Arrays and function basic c programming notes
Ā 
Functional Programming
Functional ProgrammingFunctional Programming
Functional Programming
Ā 
Pointer
PointerPointer
Pointer
Ā 
The STL
The STLThe STL
The STL
Ā 
operating system ubuntu,linux,MacProgram will work only if you g.pdf
operating system ubuntu,linux,MacProgram will work only if you g.pdfoperating system ubuntu,linux,MacProgram will work only if you g.pdf
operating system ubuntu,linux,MacProgram will work only if you g.pdf
Ā 
C++ Please test your program before you submit the answer.pdf
C++ Please test your program before you submit the answer.pdfC++ Please test your program before you submit the answer.pdf
C++ Please test your program before you submit the answer.pdf
Ā 
Best C++ Programming Homework Help
Best C++ Programming Homework HelpBest C++ Programming Homework Help
Best C++ Programming Homework Help
Ā 
TechTalk - Dotnet
TechTalk - DotnetTechTalk - Dotnet
TechTalk - Dotnet
Ā 
Pointers in C Language
Pointers in C LanguagePointers in C Language
Pointers in C Language
Ā 
Codejunk Ignitesd
Codejunk IgnitesdCodejunk Ignitesd
Codejunk Ignitesd
Ā 
Operator overloading2
Operator overloading2Operator overloading2
Operator overloading2
Ā 
CPP Homework Help
CPP Homework HelpCPP Homework Help
CPP Homework Help
Ā 
Copy on write
Copy on writeCopy on write
Copy on write
Ā 
C++ Background Circular Linked List A circular linked list.pdf
C++ Background Circular Linked List A circular linked list.pdfC++ Background Circular Linked List A circular linked list.pdf
C++ Background Circular Linked List A circular linked list.pdf
Ā 
An imperative study of c
An imperative study of cAn imperative study of c
An imperative study of c
Ā 
Background Circular Linked List A circular linked list is .pdf
Background Circular Linked List A circular linked list is .pdfBackground Circular Linked List A circular linked list is .pdf
Background Circular Linked List A circular linked list is .pdf
Ā 
C programming
C programmingC programming
C programming
Ā 
Lecture5
Lecture5Lecture5
Lecture5
Ā 
Lecture5
Lecture5Lecture5
Lecture5
Ā 

More from Shoaib Burq

Async. and Realtime Geo Applications with Node.js
Async. and Realtime Geo Applications with Node.jsAsync. and Realtime Geo Applications with Node.js
Async. and Realtime Geo Applications with Node.jsShoaib Burq
Ā 
Global Random Hacks of Kindness Berlin
Global Random Hacks of Kindness BerlinGlobal Random Hacks of Kindness Berlin
Global Random Hacks of Kindness Berlin
Shoaib Burq
Ā 
OpenStreetMap & Walking-Papers Workflow
OpenStreetMap & Walking-Papers WorkflowOpenStreetMap & Walking-Papers Workflow
OpenStreetMap & Walking-Papers Workflow
Shoaib Burq
Ā 
Ian Batley's MAPS (Spatial@Gov 2009)
Ian Batley's MAPS (Spatial@Gov 2009)Ian Batley's MAPS (Spatial@Gov 2009)
Ian Batley's MAPS (Spatial@Gov 2009)
Shoaib Burq
Ā 
OpenStreetMap Response to Haiti earthquake
OpenStreetMap Response to Haiti earthquake OpenStreetMap Response to Haiti earthquake
OpenStreetMap Response to Haiti earthquake Shoaib Burq
Ā 
Haiti Qake2010 Bar Camp Canberra2010
Haiti Qake2010 Bar Camp Canberra2010Haiti Qake2010 Bar Camp Canberra2010
Haiti Qake2010 Bar Camp Canberra2010
Shoaib Burq
Ā 
Opening of Geographic Data
Opening of Geographic DataOpening of Geographic Data
Opening of Geographic DataShoaib Burq
Ā 
Mapping Multan and beyond with OSM
Mapping Multan and beyond with OSMMapping Multan and beyond with OSM
Mapping Multan and beyond with OSM
Shoaib Burq
Ā 
Where20 2008 Ruby Tutorial
Where20 2008 Ruby TutorialWhere20 2008 Ruby Tutorial
Where20 2008 Ruby Tutorial
Shoaib Burq
Ā 
learning interoperability from web2.0
learning interoperability from web2.0learning interoperability from web2.0
learning interoperability from web2.0
Shoaib Burq
Ā 
Rails Gis Hacks
Rails Gis HacksRails Gis Hacks
Rails Gis Hacks
Shoaib Burq
Ā 

More from Shoaib Burq (11)

Async. and Realtime Geo Applications with Node.js
Async. and Realtime Geo Applications with Node.jsAsync. and Realtime Geo Applications with Node.js
Async. and Realtime Geo Applications with Node.js
Ā 
Global Random Hacks of Kindness Berlin
Global Random Hacks of Kindness BerlinGlobal Random Hacks of Kindness Berlin
Global Random Hacks of Kindness Berlin
Ā 
OpenStreetMap & Walking-Papers Workflow
OpenStreetMap & Walking-Papers WorkflowOpenStreetMap & Walking-Papers Workflow
OpenStreetMap & Walking-Papers Workflow
Ā 
Ian Batley's MAPS (Spatial@Gov 2009)
Ian Batley's MAPS (Spatial@Gov 2009)Ian Batley's MAPS (Spatial@Gov 2009)
Ian Batley's MAPS (Spatial@Gov 2009)
Ā 
OpenStreetMap Response to Haiti earthquake
OpenStreetMap Response to Haiti earthquake OpenStreetMap Response to Haiti earthquake
OpenStreetMap Response to Haiti earthquake
Ā 
Haiti Qake2010 Bar Camp Canberra2010
Haiti Qake2010 Bar Camp Canberra2010Haiti Qake2010 Bar Camp Canberra2010
Haiti Qake2010 Bar Camp Canberra2010
Ā 
Opening of Geographic Data
Opening of Geographic DataOpening of Geographic Data
Opening of Geographic Data
Ā 
Mapping Multan and beyond with OSM
Mapping Multan and beyond with OSMMapping Multan and beyond with OSM
Mapping Multan and beyond with OSM
Ā 
Where20 2008 Ruby Tutorial
Where20 2008 Ruby TutorialWhere20 2008 Ruby Tutorial
Where20 2008 Ruby Tutorial
Ā 
learning interoperability from web2.0
learning interoperability from web2.0learning interoperability from web2.0
learning interoperability from web2.0
Ā 
Rails Gis Hacks
Rails Gis HacksRails Gis Hacks
Rails Gis Hacks
Ā 

Recently uploaded

IOS-PENTESTING-BEGINNERS-PRACTICAL-GUIDE-.pptx
IOS-PENTESTING-BEGINNERS-PRACTICAL-GUIDE-.pptxIOS-PENTESTING-BEGINNERS-PRACTICAL-GUIDE-.pptx
IOS-PENTESTING-BEGINNERS-PRACTICAL-GUIDE-.pptx
Abida Shariff
Ā 
Builder.ai Founder Sachin Dev Duggal's Strategic Approach to Create an Innova...
Builder.ai Founder Sachin Dev Duggal's Strategic Approach to Create an Innova...Builder.ai Founder Sachin Dev Duggal's Strategic Approach to Create an Innova...
Builder.ai Founder Sachin Dev Duggal's Strategic Approach to Create an Innova...
Ramesh Iyer
Ā 
Dev Dives: Train smarter, not harder – active learning and UiPath LLMs for do...
Dev Dives: Train smarter, not harder – active learning and UiPath LLMs for do...Dev Dives: Train smarter, not harder – active learning and UiPath LLMs for do...
Dev Dives: Train smarter, not harder – active learning and UiPath LLMs for do...
UiPathCommunity
Ā 
Knowledge engineering: from people to machines and back
Knowledge engineering: from people to machines and backKnowledge engineering: from people to machines and back
Knowledge engineering: from people to machines and back
Elena Simperl
Ā 
FIDO Alliance Osaka Seminar: The WebAuthn API and Discoverable Credentials.pdf
FIDO Alliance Osaka Seminar: The WebAuthn API and Discoverable Credentials.pdfFIDO Alliance Osaka Seminar: The WebAuthn API and Discoverable Credentials.pdf
FIDO Alliance Osaka Seminar: The WebAuthn API and Discoverable Credentials.pdf
FIDO Alliance
Ā 
Empowering NextGen Mobility via Large Action Model Infrastructure (LAMI): pav...
Empowering NextGen Mobility via Large Action Model Infrastructure (LAMI): pav...Empowering NextGen Mobility via Large Action Model Infrastructure (LAMI): pav...
Empowering NextGen Mobility via Large Action Model Infrastructure (LAMI): pav...
Thierry Lestable
Ā 
Unsubscribed: Combat Subscription Fatigue With a Membership Mentality by Head...
Unsubscribed: Combat Subscription Fatigue With a Membership Mentality by Head...Unsubscribed: Combat Subscription Fatigue With a Membership Mentality by Head...
Unsubscribed: Combat Subscription Fatigue With a Membership Mentality by Head...
Product School
Ā 
Search and Society: Reimagining Information Access for Radical Futures
Search and Society: Reimagining Information Access for Radical FuturesSearch and Society: Reimagining Information Access for Radical Futures
Search and Society: Reimagining Information Access for Radical Futures
Bhaskar Mitra
Ā 
Epistemic Interaction - tuning interfaces to provide information for AI support
Epistemic Interaction - tuning interfaces to provide information for AI supportEpistemic Interaction - tuning interfaces to provide information for AI support
Epistemic Interaction - tuning interfaces to provide information for AI support
Alan Dix
Ā 
State of ICS and IoT Cyber Threat Landscape Report 2024 preview
State of ICS and IoT Cyber Threat Landscape Report 2024 previewState of ICS and IoT Cyber Threat Landscape Report 2024 preview
State of ICS and IoT Cyber Threat Landscape Report 2024 preview
Prayukth K V
Ā 
Connector Corner: Automate dynamic content and events by pushing a button
Connector Corner: Automate dynamic content and events by pushing a buttonConnector Corner: Automate dynamic content and events by pushing a button
Connector Corner: Automate dynamic content and events by pushing a button
DianaGray10
Ā 
Kubernetes & AI - Beauty and the Beast !?! @KCD Istanbul 2024
Kubernetes & AI - Beauty and the Beast !?! @KCD Istanbul 2024Kubernetes & AI - Beauty and the Beast !?! @KCD Istanbul 2024
Kubernetes & AI - Beauty and the Beast !?! @KCD Istanbul 2024
Tobias Schneck
Ā 
Essentials of Automations: Optimizing FME Workflows with Parameters
Essentials of Automations: Optimizing FME Workflows with ParametersEssentials of Automations: Optimizing FME Workflows with Parameters
Essentials of Automations: Optimizing FME Workflows with Parameters
Safe Software
Ā 
"Impact of front-end architecture on development cost", Viktor Turskyi
"Impact of front-end architecture on development cost", Viktor Turskyi"Impact of front-end architecture on development cost", Viktor Turskyi
"Impact of front-end architecture on development cost", Viktor Turskyi
Fwdays
Ā 
GraphRAG is All You need? LLM & Knowledge Graph
GraphRAG is All You need? LLM & Knowledge GraphGraphRAG is All You need? LLM & Knowledge Graph
GraphRAG is All You need? LLM & Knowledge Graph
Guy Korland
Ā 
Leading Change strategies and insights for effective change management pdf 1.pdf
Leading Change strategies and insights for effective change management pdf 1.pdfLeading Change strategies and insights for effective change management pdf 1.pdf
Leading Change strategies and insights for effective change management pdf 1.pdf
OnBoard
Ā 
UiPath Test Automation using UiPath Test Suite series, part 4
UiPath Test Automation using UiPath Test Suite series, part 4UiPath Test Automation using UiPath Test Suite series, part 4
UiPath Test Automation using UiPath Test Suite series, part 4
DianaGray10
Ā 
DevOps and Testing slides at DASA Connect
DevOps and Testing slides at DASA ConnectDevOps and Testing slides at DASA Connect
DevOps and Testing slides at DASA Connect
Kari Kakkonen
Ā 
FIDO Alliance Osaka Seminar: Overview.pdf
FIDO Alliance Osaka Seminar: Overview.pdfFIDO Alliance Osaka Seminar: Overview.pdf
FIDO Alliance Osaka Seminar: Overview.pdf
FIDO Alliance
Ā 
De-mystifying Zero to One: Design Informed Techniques for Greenfield Innovati...
De-mystifying Zero to One: Design Informed Techniques for Greenfield Innovati...De-mystifying Zero to One: Design Informed Techniques for Greenfield Innovati...
De-mystifying Zero to One: Design Informed Techniques for Greenfield Innovati...
Product School
Ā 

Recently uploaded (20)

IOS-PENTESTING-BEGINNERS-PRACTICAL-GUIDE-.pptx
IOS-PENTESTING-BEGINNERS-PRACTICAL-GUIDE-.pptxIOS-PENTESTING-BEGINNERS-PRACTICAL-GUIDE-.pptx
IOS-PENTESTING-BEGINNERS-PRACTICAL-GUIDE-.pptx
Ā 
Builder.ai Founder Sachin Dev Duggal's Strategic Approach to Create an Innova...
Builder.ai Founder Sachin Dev Duggal's Strategic Approach to Create an Innova...Builder.ai Founder Sachin Dev Duggal's Strategic Approach to Create an Innova...
Builder.ai Founder Sachin Dev Duggal's Strategic Approach to Create an Innova...
Ā 
Dev Dives: Train smarter, not harder – active learning and UiPath LLMs for do...
Dev Dives: Train smarter, not harder – active learning and UiPath LLMs for do...Dev Dives: Train smarter, not harder – active learning and UiPath LLMs for do...
Dev Dives: Train smarter, not harder – active learning and UiPath LLMs for do...
Ā 
Knowledge engineering: from people to machines and back
Knowledge engineering: from people to machines and backKnowledge engineering: from people to machines and back
Knowledge engineering: from people to machines and back
Ā 
FIDO Alliance Osaka Seminar: The WebAuthn API and Discoverable Credentials.pdf
FIDO Alliance Osaka Seminar: The WebAuthn API and Discoverable Credentials.pdfFIDO Alliance Osaka Seminar: The WebAuthn API and Discoverable Credentials.pdf
FIDO Alliance Osaka Seminar: The WebAuthn API and Discoverable Credentials.pdf
Ā 
Empowering NextGen Mobility via Large Action Model Infrastructure (LAMI): pav...
Empowering NextGen Mobility via Large Action Model Infrastructure (LAMI): pav...Empowering NextGen Mobility via Large Action Model Infrastructure (LAMI): pav...
Empowering NextGen Mobility via Large Action Model Infrastructure (LAMI): pav...
Ā 
Unsubscribed: Combat Subscription Fatigue With a Membership Mentality by Head...
Unsubscribed: Combat Subscription Fatigue With a Membership Mentality by Head...Unsubscribed: Combat Subscription Fatigue With a Membership Mentality by Head...
Unsubscribed: Combat Subscription Fatigue With a Membership Mentality by Head...
Ā 
Search and Society: Reimagining Information Access for Radical Futures
Search and Society: Reimagining Information Access for Radical FuturesSearch and Society: Reimagining Information Access for Radical Futures
Search and Society: Reimagining Information Access for Radical Futures
Ā 
Epistemic Interaction - tuning interfaces to provide information for AI support
Epistemic Interaction - tuning interfaces to provide information for AI supportEpistemic Interaction - tuning interfaces to provide information for AI support
Epistemic Interaction - tuning interfaces to provide information for AI support
Ā 
State of ICS and IoT Cyber Threat Landscape Report 2024 preview
State of ICS and IoT Cyber Threat Landscape Report 2024 previewState of ICS and IoT Cyber Threat Landscape Report 2024 preview
State of ICS and IoT Cyber Threat Landscape Report 2024 preview
Ā 
Connector Corner: Automate dynamic content and events by pushing a button
Connector Corner: Automate dynamic content and events by pushing a buttonConnector Corner: Automate dynamic content and events by pushing a button
Connector Corner: Automate dynamic content and events by pushing a button
Ā 
Kubernetes & AI - Beauty and the Beast !?! @KCD Istanbul 2024
Kubernetes & AI - Beauty and the Beast !?! @KCD Istanbul 2024Kubernetes & AI - Beauty and the Beast !?! @KCD Istanbul 2024
Kubernetes & AI - Beauty and the Beast !?! @KCD Istanbul 2024
Ā 
Essentials of Automations: Optimizing FME Workflows with Parameters
Essentials of Automations: Optimizing FME Workflows with ParametersEssentials of Automations: Optimizing FME Workflows with Parameters
Essentials of Automations: Optimizing FME Workflows with Parameters
Ā 
"Impact of front-end architecture on development cost", Viktor Turskyi
"Impact of front-end architecture on development cost", Viktor Turskyi"Impact of front-end architecture on development cost", Viktor Turskyi
"Impact of front-end architecture on development cost", Viktor Turskyi
Ā 
GraphRAG is All You need? LLM & Knowledge Graph
GraphRAG is All You need? LLM & Knowledge GraphGraphRAG is All You need? LLM & Knowledge Graph
GraphRAG is All You need? LLM & Knowledge Graph
Ā 
Leading Change strategies and insights for effective change management pdf 1.pdf
Leading Change strategies and insights for effective change management pdf 1.pdfLeading Change strategies and insights for effective change management pdf 1.pdf
Leading Change strategies and insights for effective change management pdf 1.pdf
Ā 
UiPath Test Automation using UiPath Test Suite series, part 4
UiPath Test Automation using UiPath Test Suite series, part 4UiPath Test Automation using UiPath Test Suite series, part 4
UiPath Test Automation using UiPath Test Suite series, part 4
Ā 
DevOps and Testing slides at DASA Connect
DevOps and Testing slides at DASA ConnectDevOps and Testing slides at DASA Connect
DevOps and Testing slides at DASA Connect
Ā 
FIDO Alliance Osaka Seminar: Overview.pdf
FIDO Alliance Osaka Seminar: Overview.pdfFIDO Alliance Osaka Seminar: Overview.pdf
FIDO Alliance Osaka Seminar: Overview.pdf
Ā 
De-mystifying Zero to One: Design Informed Techniques for Greenfield Innovati...
De-mystifying Zero to One: Design Informed Techniques for Greenfield Innovati...De-mystifying Zero to One: Design Informed Techniques for Greenfield Innovati...
De-mystifying Zero to One: Design Informed Techniques for Greenfield Innovati...
Ā 

Using Cuda Within Mathematica

  • 2. Overview • Intro to Mathematica and its API • CUDA + Mathematica • Some examples
  • 3. Mathematica intro • Mathematica is a modular computational system in which the kernel is separate from the front end which handles the interaction with the user. • The most common way to work is to use interactive documents called notebooks which mix text input and output as well as graphics and other material.
  • 4. Structure of Mathematica • An important aspect of Mathematica is that it can also interact with other applications. • This is achieved through MathLink, a standardised API for two-way communication with the kernel.
  • 5. MathLink • MathLink allows external programs both to call Mathematica, and to be called by Mathematica. • We will use MathLink to let Mathematica call CUDA functions inside an external program.
  • 6. Simple example addtwo.tm :Begin: :Function: addtwo :Pattern: AddTwo[i_Integer,j_Integer] :Arguments: { i, j } :ArgumentTypes: {Integer,Integer} :ReturnType: Integer :End:
  • 7. addtwo.c #include <mathlink.h> int addtwo( int i, int j) { return i+j; } int main(int argc, char* argv[]) { return MLMain(argc, argv); }
  • 8. mprep & gcc $ mprep addtwo.tm -o addtwotm.c $ gcc -I${INCDIR} addtwotm.c addtwo.c -L${LIBDIR} -lMLi3 -lstdc++ -o addtwo
  • 9. In[3]:= SetDirectory " Applications Mathematica.app SystemFiles Links MathLink DeveloperKit PrebuiltExamples" Out[3]= Applications Mathematica.app SystemFiles Links MathLink DeveloperKit PrebuiltExamples In[4]:= link Install ". addtwo" Out[4]= LinkObject Applications Mathematica.app SystemFiles Links MathLink DeveloperKit PrebuiltExamples addtwo, 524, 8 In[5]:= LinkPatterns link Out[5]= AddTwo i_Integer, j_Integer In[6]:= ? AddTwo AddTwo x , y gives the sum of two machine integers x and y. In[7]:= AddTwo 2, 3 Out[7]= 5 In[8]:= AddTwo 2^31 1, 1 Out[8]= 2 147 483 648 In[9]:= Uninstall link Out[9]= Applications Mathematica.app SystemFiles Links MathLink DeveloperKit PrebuiltExamples addtwo
  • 10. MathLink Template file • When a MathLink template file is processed, two basic things are done: • :Pattern: & :Arguments: specifications are used to generate a Mathematica definition • :Function:, :ArgumentTypes: & :ReturnType: specifications are used to generate C source code
  • 11. :ArgumentTypes: Mathematica specification C specification Integer int Real double IntegerList int*, long RealList double*, long String char* Symbol char* Manual void
  • 12. Handling Lists & Arrays :Begin: int sumList(int *a, long alen) :Function: sumList { :Pattern: SumList[a_List] int i, tot=0; :Arguments: {a} :ArgumentTypes:{IntegerList} for(i=0; i<alen; i++) :ReturnType: Integer tot += a[i]; :End: return tot; }
  • 13. Manual ArgumentTypes :Begin: :Function: sumList :Pattern: SumList[a:{___Integer}] :Arguments: {a} :ArgumentTypes:{Manual} :ReturnType: Integer :End: int sumList(void) { int sumList(void) { int n, i; int n; int a[MAX]; int *a; MLCheckFunction(stdlink, "List", &n); MLGetInteger32List(stdlink, &a, &n); ... for (i=0; i<n; i++) MLReleaseInteger32List(stdlink, a, n); MLGetInteger32(stdlink, a+i); ... ... } }
  • 14. Array of arb. depth #include <mathlink.h> /* read an array of double-precision floating-point numbers from a link */ void f(MLINK lp) { double *data; int *dims; char **heads; int d; /* stores the rank of the array */ if(! MLGetRealArray(lp, &data, &dims, &heads, &d)) { /* unable to read the array from lp */ return; } /* ... */ MLReleaseRealArray(lp, data, dims, heads, d); }
  • 15. Handling Complex numbers In[1]:= Head 2 3 Out[1]= Complex If you pass a list of complex numbers to your external program, then MLGetReal64Array() will create a two-dimensional array containing a sequence of pairs of real and imaginary parts. In this case, heads[0] will be "List" while heads[1] will be "Complex". //get an array of floating-point numbers of any depth MLGetReal64Array(stdlink,double**a,int**dims,char***heads,int*d);
  • 16. Summary of API //get a list of integers, allocating the memory needed to store it MLGetInteger32List(stdlink,int**a,int*n); //get a list of floating-point numbers MLGetReal64List(stdlink,double**a,int*n); //release the memory associated with a list of integers MLReleaseInteger32List(stdlink,int*a,int n); //release the memory associated with a list of floating-point numbers MLReleaseReal64List(stdlink,double*a,int n); //get an array of integers of any depth MLGetInteger32Array(stdlink,int**a,int**dims,char***heads,int*d); //get an array of floating-point numbers of any depth MLGetReal32Array(stdlink,float**a,int**dims,char***heads,int*d); //release memory associated with an integer array MLReleaseInteger32Array(stdlink,int*a,int*dims,char**heads,int d); //release memory associated with a floating-point array MLReleaseReal32Array(stdlink,float*a,int*dims,char**heads,int d);
  • 17. Manual ReturnType void bits(int i) { int a[32], k; :Begin: :Function: bits for(k=0; k<32; k++) { :Pattern: ToBits[i_Integer] a[k] = i%2; :Arguments: {i} i >>= 1; :ArgumentTypes:{Integer} if (i==0) break; :ReturnType: Manual } :End: if (k<32) k++; MLPutInteger32List(stdlink, a, k); return; }
  • 18. General array int a[8][16][100]; int dims[] = {8, 16, 100}; MLPutInteger32Array(stdlink, a, dims, NULL, 3); or int ***a; MLPutFunction(stdlink, "List", n1); for (i=0; i<n1; i++) { MLPutFunction(stdlink, "List", n2); for (j=0; j<n2; j++) { MLPutInteger32List(stdlink, a[i][j], n3); } }
  • 19. Unknown length In[10]:= Sequence 1, Sequence 4, Sequence Out[10]= 1, 4 MLPutFunction(stdlink, "List", 1); while( condition ) { /* generate an element */ MLPutFunction(stdlink, "Sequence", 2); MLPutInteger32(stdlink, i ); } MLPutFunction(stdlink, "Sequence", 0);
  • 20. Return Complex numbers // Complex data type typedef float2 Complex; Complex* h_convolved_signal; // Return transformed signal to Mathematica as a Complex List MLPutFunction(stdlink,"List",n); for (long i = 0; i < n; i++) { MLPutFunction(stdlink,"Complex",2); MLPutFloat(stdlink,h_convolved_signal[i].x*norm); MLPutFloat(stdlink,h_convolved_signal[i].y*norm); }
  • 21. Return Complex numbers In[4]:= list Table RandomReal , 12 Out[4]= 0.389421, 0.222396, 0.434636, 0.0886136, 0.233102, 0.941771, 0.928712, 0.764119, 0.791473, 0.381426, 0.757661, 0.44273 In[5]:= Map Function x , Apply Complex, x , Partition list, 2 Out[5]= 0.389421 0.222396 , 0.434636 0.0886136 , 0.233102 0.941771 , 0.928712 0.764119 , 0.791473 0.381426 , 0.757661 0.44273 // Return transformed signal to Mathematica as a Complex List MLPutFunction(stdlink, "Map", 2); MLPutFunction(stdlink, "Function", 2); MLPutFunction(stdlink, "List", 1); MLPutSymbol(stdlink, "x"); MLPutFunction(stdlink, "Apply", 2); MLPutSymbol(stdlink, "Complex"); MLPutSymbol(stdlink, "x"); MLPutFunction(stdlink, "Partition", 2); MLPutFunction(stdlink, "Times", 2); MLPutReal(stdlink, norm); MLPutReal32List(stdlink, (float*)h_convolved_signal, 2*n); MLPutInteger(stdlink, 2);
  • 22. Error & Interrupt if(! MLPutInteger(stdlink, 10)) if(! MLPutReal64(stdlink, 3.22)) { { /* check the possible errors */ /* unable to send 3.22 to lp */ switch(MLError(stdlink)) printf("MathLink Error: %sn", { MLErrorMessage(stdlink)); case MLEDEAD: MLClearError(stdlink); /* the link died unexpectedly */ } break; case MLECLOSED: /* the other side closed the link */ break; case MLEOK: /* no error occurred */ while(len--) break; { default: sum += *list++; /* ... */ /* check for the abort */ } if(MLAbort) return (double)0; } }
  • 23. Running on remote computers $ ./addtwo -linkcreate -linkprotocol TCPIP Link created on: 63166@192.168.1.107,63167@192.168.1.107 In[5]:= Install LinkConnect "63166 192.168.1.107,63167 192.168.1.107", LinkProtocol "TCPIP" Out[5]= LinkObject 63166 192.168.1.107,63167 192.168.1.107, 1110, 8 In[6]:= AddTwo 2, 3 Out[6]= 5
  • 24. Mathematica + CUDA #include <cutil_inline.h> int main(int argc, char **argv) { // use command-line specified CUDA device, // otherwise use device with highest Gflops/s if(cutCheckCmdLineFlag(argc, (const char**)argv, "device")) cutilDeviceInit(argc, argv); else cudaSetDevice( cutGetMaxGflopsDeviceId() ); return MLMain(argc, argv); }
  • 25. mathematica_cuda # Add source files here EXECUTABLE := cuFourier # CUDA source files (compiled with cudacc) CUFILES := cuFourier.cu # CUDA dependency files # CU_DEPS := # C/C++ source files (compiled with gcc / c++) # CCFILES := # Additional libraries needed by the project USECUFFT := 1 # MathLink Template files TMFILES := cuFourier.tm ################################################### # Rules and targets include ../../common/common.mk
  • 26. FindCUDA + FindMathLink via CMake • CMake http://www.cmake.org/ • FindCUDA https://gforge.sci.utah.edu/gf/project/findcuda/ • FindMathLink http://github.com/kashif/FindMathLink/tree
  • 27. CMakeLists.txt set(CUDA_ATTACH_VS_BUILD_RULE_TO_CUDA_FILE OFF) set(source_files test_bin.cu) CUDA_COMPILE(CUDA_FILES test_bin.cu) MathLink_ADD_TM(test.tm) INCLUDE_DIRECTORIES( ${MathLink_INCLUDE_DIR} ) LINK_DIRECTORIES( ${MathLink_LIBRARY_DIR} ) ADD_EXECUTABLE(cuda_compile_example ${CUDA_FILES} ${source_files} test.tm.c main.cc external_dependency.h ) TARGET_LINK_LIBRARIES(cuda_compile_example ${MathLink_LIBRARIES} ${CUDA_LIBRARIES} )
  • 28. double to float conversion #include <cutil_inline.h> // General check for CUDA GPU SM Capabilities //inline bool cutilDrvCudaCapabilities(int major_version, int minor_version); char **heads; int *dims; int rank; float *h_float; double *h_double; if (cutilDrvCudaCapabilities( 1,3 )) { MLGetReal64Array(stdlink, &h_double, &dims, &heads, &rank); } else { MLGetReal32Array(stdlink, &h_float, &dims, &heads, &rank); }
  • 29. CUBLAS & CUFFT • Follow the usual routine of sending data to the MathLink app • Use CUBLAS or CUFFT • Return result back to Mathematica
  • 30. cuFourier In[1]:= ListLinePlot Abs Fourier RandomReal 1, 200 ^2 0.30 0.25 0.20 Out[1]= 0.15 0.10 0.05 50 100 150 200
  • 31. Clone mathematica_cuda $ git clone git://github.com/kashif/mathematica_cuda.git $ cd mathematica_cuda/src $ mkdir cuFourier $ mate cuFourier
  • 32. cuFourier.tm :Begin: :Function: cuFourier1D :Pattern: CUFourier1D[ a:{__?NumericQ} ] :Arguments: { a } :ArgumentTypes:{ RealList } :ReturnType: Manual :End:
  • 33. cuFourier.cu // includes system #include <stdlib.h> #include <stdio.h> #include <string.h> #include <math.h> // includes cuda #include <cufft.h> #include <cutil_inline.h> // includes mathlink #include <mathlink.h> // Complex data type typedef float2 Complex; /////////////////////////////////////////////////////////////// // Showing the use of CUFFT for fast convolution using FFT. /////////////////////////////////////////////////////////////// extern "C" void cuFourier1D(double*, long);
  • 34. //////////////////////////////////////////////////////////////////// // Main program //////////////////////////////////////////////////////////////////// int main(int argc, char *argv[]) { // use command-line specified CUDA device, otherwise use device // with highest Gflops/s if( cutCheckCmdLineFlag(argc, (const char**)argv, "device") ) cutilDeviceInit(argc, argv); else cudaSetDevice( cutGetMaxGflopsDeviceId() ); return MLMain(argc, argv); }
  • 35. void cuFourier1D (double *h_A, long n) { double norm = 1.0/sqrt((double) n); long mem_size = sizeof(Complex) * n; // Allocate host memory for the signal Complex* h_signal = (Complex*)malloc(mem_size); // Initalize the memory for the signal for (long i = 0; i < n; ++i) { h_signal[i].x = (float)h_A[i]; h_signal[i].y = 0.0f; } // Allocate device memory for signal Complex* d_signal; cutilSafeCall(cudaMalloc((void**)&d_signal, mem_size)); // Copy host memory to device cutilSafeCall(cudaMemcpy(d_signal, h_signal, mem_size, cudaMemcpyHostToDevice));
  • 36. // CUFFT plan cufftHandle plan; cufftSafeCall(cufftPlan1d(&plan, n, CUFFT_C2C, 1)); // Transform signal cufftSafeCall(cufftExecC2C(plan, (cufftComplex *)d_signal, (cufftComplex *)d_signal, CUFFT_INVERSE)); // Copy device memory to host Complex* h_convolved_signal = h_signal; cutilSafeCall(cudaMemcpy(h_convolved_signal, d_signal, mem_size, cudaMemcpyDeviceToHost)); // Release d_signal cutilSafeCall(cudaFree(d_signal)); // Destroy CUFFT context cufftSafeCall(cufftDestroy(plan));
  • 37. // Return transformed signal to Mathematica as a Complex List MLPutFunction(stdlink, "Map", 2); MLPutFunction(stdlink, "Function", 2); MLPutFunction(stdlink, "List", 1); MLPutSymbol(stdlink, "x"); MLPutFunction(stdlink, "Apply", 2); MLPutSymbol(stdlink, "Complex"); MLPutSymbol(stdlink, "x"); MLPutFunction(stdlink, "Partition", 2); MLPutFunction(stdlink, "Times", 2); MLPutReal(stdlink, norm); MLPutReal32List(stdlink, (float*)h_convolved_signal, 2*n); MLPutInteger(stdlink, 2); // Cleanup memory free(h_signal); cudaThreadExit(); }
  • 38. Makefile ################################################################## # # Build script for project # ################################################################## # Add source files here EXECUTABLE := cuFourier # CUDA source files (compiled with cudacc) CUFILES := cuFourier.cu # Additional libraries needed by the project USECUFFT := 1 # MathLink Template files TMFILES := cuFourier.tm ################################################################## # Rules and targets include ../../common/common.mk
  • 39. In[35]:= link Install " Users kashif Dropbox 20090630_NDVI_CUDA mathematica_cuda bin darwin release cuFourier" Out[35]= LinkObject Users kashif Dropbox 20090630_NDVI_CUDA mathematica_cuda bin darwin release cuFourier, 605, 9 In[36]:= LinkPatterns link Out[36]= CUFourier1D a : __ ?NumericQ In[37]:= ListLinePlot Abs CUFourier1D RandomReal 1, 200 ^2 0.4 0.3 Out[37]= 0.2 0.1 50 100 150 200 In[38]:= Uninstall link Out[38]= Users kashif Dropbox 20090630_NDVI_CUDA mathematica_cuda bin darwin release cuFourier
  • 40. Image Deconvolution for Life Sciences • Confocal and Widefield microscopy 3D or 4D images • Multichannel (3 or more channels) • Comes in a wide variety of formats
  • 41. Bio-Formats Java lib. • Standalone Java library for reading and writing life science image formats • Get both the pixels and metadata • Licensed under GPL • http://www.loci.wisc.edu/ome/formats.html
  • 42. Java + Mathematica: J/Link Needs "JLink`" InstallJava LinkObject ' usr local Wolfram Mathematica 7.0 SystemFiles Java Linux x86 64 bin java' classpath " usr local Wolfram Mathematica 7.0 SystemFiles Links JLink JLink.jar" Xmx256m Djava.system.class.loader com.wolfram.jlink.JLinkSystemClassLoader Djava.util.prefs.PreferencesFactory com.wolfram.jlink.DisabledPreferencesFactory com.wolfram.jlink.Install init " tmp m000001207601", 4, 4 ReinstallJava ClassPath " home kashif Dropbox BioFormats Java loci_tools.jar" LinkObject ' usr local Wolfram Mathematica 7.0 SystemFiles Java Linux x86 64 bin java' classpath " usr local Wolfram Mathematica 7.0 SystemFiles Links JLink JLink.jar" Xmx256m Djava.system.class.loader com.wolfram.jlink.JLinkSystemClassLoader Djava.util.prefs.PreferencesFactory com.wolfram.jlink.DisabledPreferencesFactory com.wolfram.jlink.Install init " tmp m000002207601", 8, 4
  • 43. Reading LIF images reader JavaNew "loci.formats.ImageReader" LoadJavaClass "loci.formats.FormatTools" « JavaObject loci.formats.ImageReader » JavaClass loci.formats.FormatTools, bpp FormatTools`getBytesPerPixel pixelType reader setId " media cdrom xyz 1ch by2 MT1.lif" 1 reader getSeriesCount reader getSizeX 7 512 reader setSeries 0 reader getSizeY sizeC reader getSizeC 512 1 reader getSizeZ pixelType reader getPixelType 90 1 num reader getImageCount 90
  • 44. Reading pixel volume LoadJavaClass "loci.common.DataTools" JavaClass loci.common.DataTools, volume Flatten N Table DataTools`makeDataArray reader openBytes z, 0, 0, reader getSizeX , reader getSizeY , bpp, False, True , z, 0, reader getSizeZ 1 ; unflatten e_, d__ ? IntegerQ && Positive & : Fold Partition, e, Take d , 1, 2, 1 ; Length e Times d array unflatten volume, reader getSizeX , reader getSizeY , reader getSizeZ ;
  • 45. View a slice Image array 165, All, All 255
  • 46. Image deconvled Result 165, All, All
  • 47. Wiener Deconv. :Begin: :Function: wienerDeconvolve :Pattern: WienerDeconvolve[nx_Integer, ny_Integer, nz_Integer, epsilon_Real, sigma_Real, inImage:{___Real}] :Arguments: { nx, ny, nz, epsilon, sigma, inImage } :ArgumentTypes: { Integer, Integer, Integer, Real, Real, Manual } :ReturnType: Manual :End: void wienerDeconvolve(int nx, int ny, int nz, double epsilon, double sigma) { float *inImage; int length; if(! MLGetReal32List(stdlink, &inImage, &length)) { return; }
  • 48. amira Projection view ® http://www.amiravis.com
  • 49. Export " home kashif Amira522 data deconv alphalobe MaxLike.raw", result, "Real32" ;
  • 50. Remote Sensing application
  • 54. Band 3 & Band 4
  • 56. Reading Landsat Images In[4]:= reader JavaNew "loci.formats.ImageReader" Out[4]= « JavaObject loci.formats.ImageReader » In[5]:= reader JavaNew "loci.formats.ChannelSeparator", reader Out[5]= « JavaObject loci.formats.ChannelSeparator » In[35]:= reader setId " Users sabman satellite_images multispectral bhtmref.tif" In[7]:= reader getSeriesCount Out[7]= 1 In[8]:= sizeC reader getSizeC Out[8]= 6 In[9]:= pixelType reader getPixelType Out[9]= 1 In[11]:= num reader getImageCount Out[11]= 6 In[12]:= pixelType reader getPixelType
  • 57. Loading Landsat data in Mathematica In[14]:= LoadJavaClass "loci.formats.FormatTools" Out[14]= JavaClass loci.formats.FormatTools, In[15]:= bpp FormatTools`getBytesPerPixel pixelType Out[15]= 1 In[16]:= reader getSizeX Out[16]= 512 In[17]:= isLittle reader isLittleEndian Out[17]= True In[18]:= reader getSizeY Out[18]= 512 In[19]:= LoadJavaClass "loci.common.DataTools" Out[19]= JavaClass loci.common.DataTools,
  • 58. In[31]:= red DataTools`makeDataArray reader openBytes 2, 0, 0, reader getSizeX , reader getSizeY , bpp, False, True ; In[53]:= Image Partition 100 Normalize red , reader getSizeX
  • 59. In[56]:= NIR DataTools`makeDataArray reader openBytes 3, 0, 0, reader getSizeX , reader getSizeY , bpp, False, True ; In[57]:= Image Partition 100 Normalize NIR , reader getSizeX
  • 60. In[39]:= link Install " Users sabman mathematica_cuda bin darwin emurelease ndvi" Out[39]= LinkObject Users sabman mathematica_cuda bin darwin emurelease ndvi, 41, 10 In[40]:= LinkPatterns link Out[40]= ndvi a_List, b_List In[41]:= NDVI ndvi Partition NIR, reader getSizeX , Partition red, reader getSizeX ; In[42]:= Image Partition NDVI, reader getSizeX
  • 61. ndvi.tm :Begin: :Function: ndvi :Pattern: ndvi[ a_List, b_List ] :Arguments: { a, b } :ArgumentTypes: { Manual } :ReturnType: Manual :End:
  • 62. ndvi.cu void ndvi(void) { short int *h_A, *h_B; float *h_C_GPU; short int *d_A, *d_B; float *d_C; char **heads_A, **heads_B; int *dims_A, *dims_B; int rank_A, rank_B; if(! MLGetInteger16Array(stdlink, &h_A, &dims_A, &heads_A, &rank_A)) { return; } if(! MLGetInteger16Array(stdlink, &h_B, &dims_B, &heads_B, &rank_B)) { return; }
  • 63. //Initializing data h_C_GPU = (float *)malloc(dims_A[0]*dims_A[1]*sizeof(float)); //Allocating GPU memory cutilSafeCall( cudaMalloc((void **)&d_A, dims_A[0]*dims_A[1]*sizeof(short int)) ); cutilSafeCall( cudaMalloc((void **)&d_B, dims_A[0]*dims_A[1]*sizeof(short int)) ); cutilSafeCall( cudaMalloc((void **)&d_C, dims_A[0]*dims_A[1]*sizeof(float)) ); //Copy data to GPU memory for further processing cutilSafeCall( cudaMemcpy(d_A, h_A, dims_A[0]*dims_A[1]*sizeof(short int), cudaMemcpyHostToDevice) ); cutilSafeCall( cudaMemcpy(d_B, h_B, dims_A[0]*dims_A[1]*sizeof(short int), cudaMemcpyHostToDevice) ); cutilSafeCall( cudaThreadSynchronize() ); dim3 grid(ceil((float)dims_A[0]/(float)16.0f), ceil((float) dims_A[1]/32.0f), 1); dim3 threads(ceil( dims_A[0]/(float)grid.x), ceil( dims_A[1]/(float)grid.y), 1); ndviGPU<<<grid, threads>>>(d_C, d_A, d_B, dims_A[0], dims_A[1]); cutilCheckMsg("ndviGPU() execution failedn"); cutilSafeCall( cudaThreadSynchronize() );
  • 64. //Release d_A and d_B cutilSafeCall( cudaFree(d_B) ); cutilSafeCall( cudaFree(d_A) ); //Read back GPU results into h_C_GPU cutilSafeCall( cudaMemcpy(h_C_GPU, d_C, dims_A[0]*dims_A[1]*sizeof(float), cudaMemcpyDeviceToHost) ); //Release d_C cutilSafeCall( cudaFree(d_C) ); //Return result MLPutReal32List(stdlink, h_C_GPU, dims_A[0]*dims_A[1]); //Release h_A and h_B MLReleaseInteger16Array(stdlink, h_A, dims_A, heads_A, rank_A); MLReleaseInteger16Array(stdlink, h_B, dims_B, heads_B, rank_B); cudaThreadExit();
  • 65. NDVI Kernel /////////////////////////////////////////////////////////////////////////////// // Calculate ndvi of two channels d_A and d_B on GPU and store result in d_C /////////////////////////////////////////////////////////////////////////////// __global__ void ndviGPU( float *d_C, short int *d_A, short int *d_B, int width, int height ){ unsigned int xIndex = blockIdx.x * blockDim.x + threadIdx.x; unsigned int yIndex = blockIdx.y * blockDim.y + threadIdx.y; if(xIndex < width && yIndex < height) { unsigned int i = yIndex * (width) + xIndex; d_C[i] = __fdividef( (float)(d_A[i] - d_B[i]), (float)(d_A[i] + d_B[i]) ); } }
  • 66. NDVI output 0 1 In[64]:= ArrayPlot Partition NDVI, reader getSizeX , ColorFunction "Rainbow"
  • 67. Questions? http://hpc.nomad-labs.com kashif@nomad-labs.com twitter krasul