This is a reupload of the talk I delivered at the Spark London Meetup in November 2016, where I shared observations and best practices. Original link to the event: https://www.meetup.com/Spark-London/events/235626954/
54. CUDA4J sample, part 1 of 3

import com.ibm.cuda.*;
import com.ibm.cuda.CudaKernel.*;

public class Sample {
    private static final boolean PRINT_DATA = false;
    private static int numElements;
    private static int[] myData;
    private static CudaBuffer buffer1;
    private static CudaDevice device = new CudaDevice(0);
    private static CudaModule module;
    private static CudaKernel kernel;
    private static CudaStream stream;

    public static void main(String[] args) {
        try {
            module = new Loader().loadModule("AdamDoubler.fatbin", device);
            kernel = new CudaKernel(module, "Cuda_cuda4j_AdamDoubler_Strider");
            stream = new CudaStream(device);
            doSmallProblem();
            doMediumProblem();
            doChunkingProblem();
        } catch (CudaException e) {
            e.printStackTrace();
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    private static void doSmallProblem() throws Exception {
        System.out.println("Doing the small sized problem");
        numElements = 100;
        myData = new int[numElements];
        Util.fillWithInts(myData);
        CudaGrid grid = Util.makeGrid(numElements, stream);
        System.out.println("Kernel grid: <<<" + grid.gridDimX + ", " + grid.blockDimX + ">>>");
        buffer1 = new CudaBuffer(device, numElements * Integer.BYTES);
        buffer1.copyFrom(myData);
        Parameters kernelParams = new Parameters(2).set(0, buffer1).set(1, numElements);
        kernel.launch(grid, kernelParams);
        int[] originalArrayCopy = new int[myData.length];
        System.arraycopy(myData, 0, originalArrayCopy, 0, myData.length);
        buffer1.copyTo(myData);
        Util.checkArrayResultsDoubler(myData, originalArrayCopy);
    }
55. CUDA4J sample, part 2 of 3

    private static void doMediumProblem() throws Exception {
        System.out.println("Doing the medium sized problem");
        numElements = 5_000_000;
        myData = new int[numElements];
        Util.fillWithInts(myData);
        // Chunking is only needed when handling more than max blocks * max threads per kernel
        // Grid dim is the number of blocks in the grid
        // Block dim is the number of threads in a block
        // buffer1 is how we'll use our data on the GPU
        buffer1 = new CudaBuffer(device, numElements * Integer.BYTES);
        // myData is on the CPU, transfer it
        buffer1.copyFrom(myData);
        // Our stream executes the kernel; many streams can be launched at once
        CudaGrid grid = Util.makeGrid(numElements, stream);
        System.out.println("Kernel grid: <<<" + grid.gridDimX + ", " + grid.blockDimX + ">>>");
        Parameters kernelParams = new Parameters(2).set(0, buffer1).set(1, numElements);
        kernel.launch(grid, kernelParams);
        int[] originalArrayCopy = new int[myData.length];
        System.arraycopy(myData, 0, originalArrayCopy, 0, myData.length);
        buffer1.copyTo(myData);
        Util.checkArrayResultsDoubler(myData, originalArrayCopy);
    }
56. CUDA4J sample, part 3 of 3

    private static void doChunkingProblem() throws Exception {
        // 5m elements don't require chunking on the GPU, but this size does
        System.out.println("Doing the too big to handle in one kernel problem");
        numElements = 70_000_000;
        myData = new int[numElements];
        Util.fillWithInts(myData);
        buffer1 = new CudaBuffer(device, numElements * Integer.BYTES);
        buffer1.copyFrom(myData);
        CudaGrid grid = Util.makeGrid(numElements, stream);
        System.out.println("Kernel grid: <<<" + grid.gridDimX + ", " + grid.blockDimX + ">>>");
        // Check we can actually launch a kernel with this grid size
        try {
            Parameters kernelParams = new Parameters(2).set(0, buffer1).set(1, numElements);
            kernel.launch(grid, kernelParams);
            int[] originalArrayCopy = new int[numElements];
            System.arraycopy(myData, 0, originalArrayCopy, 0, numElements);
            buffer1.copyTo(myData);
            Util.checkArrayResultsDoubler(myData, originalArrayCopy);
        } catch (CudaException ce) {
            if (ce.getMessage().equals("invalid argument")) {
                System.out.println("it was invalid argument, too big!");
                int maxThreadsPerBlockX = device.getAttribute(CudaDevice.ATTRIBUTE_MAX_BLOCK_DIM_X);
                // NB: this queries the smaller Y grid-dimension limit (65,535)
                int maxBlocksPerGridX = device.getAttribute(CudaDevice.ATTRIBUTE_MAX_GRID_DIM_Y);
                // Cast first so the multiplication happens in long arithmetic
                long maxThreadsPerGrid = (long) maxThreadsPerBlockX * maxBlocksPerGridX;
                // 67,107,840 on my Windows box
                System.out.println("Max threads per grid: " + maxThreadsPerGrid);
                long numElementsAtOnce = maxThreadsPerGrid;
                long elementsDone = 0;
                grid = new CudaGrid(maxBlocksPerGridX, maxThreadsPerBlockX, stream);
                System.out.println("Kernel grid: <<<" + grid.gridDimX + ", " + grid.blockDimX + ">>>");
                while (elementsDone < numElements) {
                    if ((elementsDone + numElementsAtOnce) > numElements) {
                        numElementsAtOnce = numElements - elementsDone; // Just do the remainder
                    }
                    long toOffset = numElementsAtOnce + elementsDone;
                    // slice() takes byte offsets, not element index offsets
                    CudaBuffer slicedSection = buffer1.slice(elementsDone * Integer.BYTES, toOffset * Integer.BYTES);
                    // The kernel expects an int element count, so narrow the long here
                    Parameters kernelParams = new Parameters(2).set(0, slicedSection).set(1, (int) numElementsAtOnce);
                    kernel.launch(grid, kernelParams);
                    elementsDone += numElementsAtOnce;
                }
                int[] originalArrayCopy = new int[myData.length];
                System.arraycopy(myData, 0, originalArrayCopy, 0, myData.length);
                buffer1.copyTo(myData);
                Util.checkArrayResultsDoubler(myData, originalArrayCopy);
            } else {
                System.out.println(ce.getMessage());
            }
        }
    }
}
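To make the slicing arithmetic concrete, here is a minimal standalone sketch of the same chunk-boundary calculation. The 67,107,840 figure comes from the slide above; the ChunkMath class name is my illustration, not part of the sample.

public class ChunkMath {
    public static void main(String[] args) {
        long numElements = 70_000_000L;
        long chunk = 67_107_840L; // max threads per grid reported above
        long done = 0;
        while (done < numElements) {
            long thisChunk = Math.min(chunk, numElements - done);
            // slice() works in byte offsets, so scale element indices by Integer.BYTES
            long fromByte = done * Integer.BYTES;
            long toByte = (done + thisChunk) * Integer.BYTES;
            System.out.println("elements [" + done + ", " + (done + thisChunk)
                + ") -> bytes [" + fromByte + ", " + toByte + ")");
            done += thisChunk;
        }
    }
}

Running it prints two chunks: one full 67,107,840-element slice and a 2,892,160-element remainder, matching what the loop in doChunkingProblem launches.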
57. CUDA4J kernel

#include <stdint.h>
#include <stdio.h>

/**
 * 2D grid so we can have 1024 threads and many blocks
 * Remember: 1 grid has blocks/threads, and one kernel runs on one grid
 * In CUDA 6.5 we have cudaOccupancyMaxPotentialBlockSize which helps
 *
 * Let's say we have 100 ints to double, keeping it simple,
 * and we run with 512 threads at once.
 * For this size our kernel will be set up as follows:
 * 1 grid, 1 block, 512 threads.
 * blockDim.x is going to be 1, threadIdx.x will remain at 0,
 * and threadIdx.y will range from 0 to 511.
 * So the index runs from 0 to 511 and we limit access to
 * however many elements we actually have.
 */
extern "C" __global__ void Cuda_cuda4j_AdamDoubler(int* toDouble, int numElements) {
    int index = blockDim.x * threadIdx.x + threadIdx.y;
    if (index < numElements) { // Don't go out of bounds
        toDouble[index] *= 2;  // Just double it
    }
}

extern "C" __global__ void Cuda_cuda4j_AdamDoubler_Strider(int* toDouble, int numElements) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < numElements) { // Don't go overboard
        toDouble[i] *= 2;
    }
}
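If the index arithmetic in the strided kernel is unfamiliar, here is a plain-Java mock of the same calculation (the StriderOnCpu class is my illustration, not part of the deck): each (blockIdx, threadIdx) pair maps to one array element, and the bounds check discards the surplus threads in the last block.

public class StriderOnCpu {
    // CPU mock of Cuda_cuda4j_AdamDoubler_Strider: same index math, no GPU.
    static void doubleAll(int[] toDouble, int gridDimX, int blockDimX) {
        for (int blockIdx = 0; blockIdx < gridDimX; blockIdx++) {
            for (int threadIdx = 0; threadIdx < blockDimX; threadIdx++) {
                int i = blockIdx * blockDimX + threadIdx; // same formula as the kernel
                if (i < toDouble.length) {                // same bounds check
                    toDouble[i] *= 2;
                }
            }
        }
    }

    public static void main(String[] args) {
        int[] data = {1, 2, 3, 4, 5};
        doubleAll(data, 1, 512); // 1 block of 512 threads covers 5 elements
        System.out.println(java.util.Arrays.toString(data)); // [2, 4, 6, 8, 10]
    }
}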
58. Lambda example, part 1 of 2

import java.util.stream.IntStream;

public class Lambda {
    private static long startTime = 0;

    // -Xjit:enableGPU is our JVM option
    public static void main(String[] args) {
        boolean timeIt = true;
        int numElements = 500_000_000;
        int[] toDouble = new int[numElements];
        Util.fillWithInts(toDouble);
        myDoublerWithALambda(toDouble, timeIt);
        double[] toHalf = new double[numElements];
        Util.fillWithDoubles(toHalf);
        myHalverWithALambda(toHalf, timeIt);
        double[] toRandomFunc = new double[numElements];
        Util.fillWithDoubles(toRandomFunc);
        myRandomFuncWithALambda(toRandomFunc, timeIt);
    }

    private static void myDoublerWithALambda(int[] myArray, boolean timeIt) {
        if (timeIt) startTime = System.currentTimeMillis();
        IntStream.range(0, myArray.length).parallel().forEach(i -> {
            myArray[i] = myArray[i] * 2; // Done on GPU for us
        });
        if (timeIt) {
            System.out.println("Done doubling with a lambda, time taken: " +
                (System.currentTimeMillis() - startTime) + " milliseconds");
        }
    }
59. Lambda example, part 2 of 2

    private static void myHalverWithALambda(double[] myArray, boolean timeIt) {
        if (timeIt) startTime = System.currentTimeMillis();
        IntStream.range(0, myArray.length).parallel().forEach(i -> {
            myArray[i] = myArray[i] / 2; // Again on GPU
        });
        if (timeIt) {
            System.out.println("Done halving with a lambda, time taken: " +
                (System.currentTimeMillis() - startTime) + " milliseconds");
        }
    }

    private static void myRandomFuncWithALambda(double[] myArray, boolean timeIt) {
        if (timeIt) startTime = System.currentTimeMillis();
        IntStream.range(0, myArray.length).parallel().forEach(i -> {
            myArray[i] = myArray[i] * 3.142; // Doubles so we don't lose precision
        });
        if (timeIt) {
            System.out.println("Done with the random func with a lambda, time taken: " +
                (System.currentTimeMillis() - startTime) + " milliseconds");
        }
    }
}
60. Utility methods, part 1 of 2

import com.ibm.cuda.*;

public class Util {
    protected static void fillWithInts(int[] toFill) {
        for (int i = 0; i < toFill.length; i++) {
            toFill[i] = i;
        }
    }

    protected static void fillWithDoubles(double[] toFill) {
        for (int i = 0; i < toFill.length; i++) {
            toFill[i] = i;
        }
    }

    protected static void printArray(int[] toPrint) {
        System.out.println();
        for (int i = 0; i < toPrint.length; i++) {
            if (i == toPrint.length - 1) {
                System.out.print(toPrint[i] + ".");
            } else {
                System.out.print(toPrint[i] + ", ");
            }
        }
        System.out.println();
    }

    protected static CudaGrid makeGrid(int numElements, CudaStream stream) {
        int numThreads = 512;
        int numBlocks = (numElements + (numThreads - 1)) / numThreads;
        return new CudaGrid(numBlocks, numThreads, stream);
    }
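makeGrid uses ceiling division so the last, partially full block still gets scheduled. A quick standalone check of the numbers for the problem sizes above (GridMath is my name for the sketch, not part of the deck):

public class GridMath {
    public static void main(String[] args) {
        int numThreads = 512;
        for (int numElements : new int[] {100, 5_000_000}) {
            int numBlocks = (numElements + (numThreads - 1)) / numThreads; // ceiling division
            long threadsLaunched = (long) numBlocks * numThreads;
            System.out.println(numElements + " elements -> " + numBlocks
                + " blocks of " + numThreads + " = " + threadsLaunched + " threads");
        }
        // 100 elements -> 1 block (512 threads); 5,000,000 -> 9766 blocks (5,000,192 threads).
        // The kernel's bounds check simply ignores the surplus threads.
    }
}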
61. Utility methods, part 2 of 2

    /*
     * Array will have been doubled at this point
     */
    protected static void checkArrayResultsDoubler(int[] toCheck, int[] originalArray) {
        long errorCount = 0;
        // Check the result; the data has been copied back here
        if (toCheck.length != originalArray.length) {
            System.err.println("Something's gone horribly wrong, different array length");
        }
        for (int i = 0; i < originalArray.length; i++) {
            if (toCheck[i] != (originalArray[i] * 2)) {
                errorCount++;
                /*
                System.err.println("Got an error, " + originalArray[i] +
                    " is incorrect: wasn't doubled correctly!" +
                    " Got " + toCheck[i] + " but should be " + originalArray[i] * 2);
                */
            } else {
                /*
                System.out.println("Correct, doubled " + originalArray[i] +
                    " and it became " + toCheck[i]);
                */
            }
        }
        System.err.println("Incorrect results: " + errorCount);
    }
}
62. CUDA4J module loader

import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import com.ibm.cuda.CudaDevice;
import com.ibm.cuda.CudaException;
import com.ibm.cuda.CudaModule;

public class Loader {
    private final CudaModule.Cache moduleCache = new CudaModule.Cache();

    CudaModule loadModule(String moduleName, CudaDevice device) throws CudaException, IOException {
        CudaModule module = moduleCache.get(device, moduleName);
        if (module == null) {
            try (InputStream stream = getClass().getResourceAsStream(moduleName)) {
                if (stream == null) {
                    throw new FileNotFoundException(moduleName);
                }
                module = new CudaModule(device, stream);
                moduleCache.put(device, moduleName, module);
            }
        }
        return module;
    }
}
64. Set the PATH to include the CUDA library. For example, set
PATH=<CUDA_LIBRARY_PATH>;%PATH%, where the <CUDA_LIBRARY_PATH> variable is the full
path to the CUDA library. The <CUDA_LIBRARY_PATH> variable is
C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v7.5\bin, which assumes CUDA is
installed to the default directory.

Note: If you are using Just-In-Time Compiler (JIT) based GPU support, you must also
include paths to the NVIDIA Virtual Machine (NVVM) library and to the NVIDIA
Management Library (NVML). For example, the <CUDA_LIBRARY_PATH> variable is
C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v7.5\bin;<NVVM_LIBRARY_PATH>;<NVML_LIBRARY_PATH>.

If the NVVM library is installed to the default directory, the <NVVM_LIBRARY_PATH>
variable is C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v7.5\nvvm\bin. You can
find the NVML library in your NVIDIA drivers directory. The default location of this
directory is C:\Program Files\NVIDIA Corporation\NVSMI.

From IBM's Java 8 docs
Environment example, see the docs for details
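As a quick sanity check before launching the samples, a minimal Java sketch can confirm the CUDA bin directory is actually on PATH. The EnvCheck class and the path literal are my illustration, assuming the default CUDA 7.5 install location above.

public class EnvCheck {
    public static void main(String[] args) {
        // Path literal assumes the default CUDA 7.5 install location above
        String cudaBin = "C:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v7.5\\bin";
        String path = System.getenv("PATH");
        System.out.println("CUDA bin on PATH: "
            + (path != null && path.contains(cudaBin)));
    }
}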
66. Notices and Disclaimers (cont'd)
Information concerning non-IBM products was obtained from the suppliers of those products, their published
announcements or other publicly available sources. IBM has not tested those products in connection with this publication
and cannot confirm the accuracy of performance, compatibility or any other claims related to non-IBM products.
Questions on the capabilities of non-IBM products should be addressed to the suppliers of those products. IBM does not
warrant the quality of any third-party products, or the ability of any such third-party products to interoperate with IBM’s
products. IBM expressly disclaims all warranties, expressed or implied, including but not limited to, the implied warranties
of merchantability and fitness for a particular purpose.
The provision of the information contained herein is not intended to, and does not, grant any right or license under any
IBM patents, copyrights, trademarks or other intellectual property right.
IBM, the IBM logo, ibm.com, Bluemix, Blueworks Live, CICS, Clearcase, DOORS®, Enterprise Document
Management System™, Global Business Services ®, Global Technology Services ®, Information on Demand, ILOG,
LinuxONE™, Maximo®, MQIntegrator®, MQSeries®, Netcool®, OMEGAMON, OpenPower, PureAnalytics™,
PureApplication®, pureCluster™, PureCoverage®, PureData®, PureExperience®, PureFlex®, pureQuery®,
pureScale®, PureSystems®, QRadar®, Rational®, Rhapsody®, SoDA, SPSS, StoredIQ, Tivoli®, Trusteer®,
urban{code}®, Watson, WebSphere®, Worklight®, X-Force®, System z® and z/OS are trademarks of International
Business Machines Corporation, registered in many jurisdictions worldwide. Other product and service names might
be trademarks of IBM or other companies. A current list of IBM trademarks is available on the Web at "Copyright and
trademark information" at: www.ibm.com/legal/copytrade.shtml.
Oracle and Java are registered trademarks of Oracle and/or its affiliates. Other names may be trademarks of their
respective owners.
Databricks is a registered trademark of Databricks, Inc.
Apache Spark, Apache Cassandra, Apache Hadoop, Apache Maven, Spark, Apache, any other Apache project
mentioned here, and the Apache product logos including the Spark logo are trademarks of The Apache Software
Foundation.