jcuda.driver.CUdeviceptr Java Examples

The following examples show how to use jcuda.driver.CUdeviceptr. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.

Example #1

Source File: JCudaTestUtils.java From jcuda with MIT License

6 votes

/**
 * Checks whether two device pointers carry the same native pointer
 * value.<br>
 * <br>
 * <b>NOTE:</b><br>
 * <br>
 * This is NOT a general-purpose way of comparing arbitrary pointers.
 * Pointer equality is a subtle concept, and the pointer classes do NOT
 * implement it by default. This helper exists SOLELY for the test
 * cases that call it.
 * 
 * @param p0 The first pointer
 * @param p1 The second pointer
 * @return Whether both pointers report the same native pointer value
 */
static boolean equal(CUdeviceptr p0, CUdeviceptr p1)
{
    // Local subclass used to read the native pointer value of a copy
    // of each argument from within this method.
    class AddressView extends CUdeviceptr
    {
        AddressView(CUdeviceptr source)
        {
            super(source);
        }

        @Override
        public long getNativePointer()
        {
            return super.getNativePointer();
        }
    }
    AddressView view0 = new AddressView(p0);
    AddressView view1 = new AddressView(p1);
    long address0 = view0.getNativePointer();
    long address1 = view1.getNativePointer();
    return address0 == address1;
}

Example #2

Source File: GPUHelper.java From Juicebox with MIT License

5 votes

/**
 * Allocates device memory for the given host array and copies the array
 * contents into it.
 *
 * @param input The host data to copy to the device
 * @return The device pointer that received the data. It is not freed
 *         by this method.
 */
public static CUdeviceptr allocateInput(float[] input) {
    Pointer ptr = Pointer.to(input);
    // Compute the byte count once, in long arithmetic: the allocation and
    // the copy are then guaranteed to agree, and arrays with more than
    // Integer.MAX_VALUE / Sizeof.FLOAT elements no longer overflow int.
    long numBytes = (long) input.length * Sizeof.FLOAT;
    CUdeviceptr dInput = new CUdeviceptr();
    cuMemAlloc(dInput, numBytes);
    cuMemcpyHtoD(dInput, ptr, numBytes);
    return dInput;
}

Example #3

Source File: JCudaDriverBasicGraphExample.java From jcuda-samples with MIT License

5 votes

/**
 * Create device data containing the given float value, the given number
 * of times. The values are first written to a host array of the same
 * length and then copied to newly allocated device memory.
 * 
 * @param numElements The number of elements
 * @param value The value of the elements
 * @return The pointer to the data. It is not freed by this method.
 */
private static CUdeviceptr createDeviceData(int numElements, float value)
{
    float hostData[] = new float[numElements];
    for (int i = 0; i < numElements; i++)
    {
        hostData[i] = value;
    }
    // Use long arithmetic for the byte count so that large element
    // counts do not overflow int before reaching the driver calls.
    long numBytes = (long) numElements * Sizeof.FLOAT;
    CUdeviceptr deviceData = new CUdeviceptr();
    cuMemAlloc(deviceData, numBytes);
    cuMemcpyHtoD(deviceData, Pointer.to(hostData), numBytes);
    return deviceData;
}

Example #4

Source File: JCudaReduction.java From jcuda-samples with MIT License

5 votes

/**
 * Prepares the state used by this sample: the context, the module, the
 * "reduce" function handle and the temporary device buffer.
 */
private static void init()
{
    // Bring up the driver API and create a context on device 0
    cuInit(0);
    CUdevice firstDevice = new CUdevice();
    cuDeviceGet(firstDevice, 0);
    context = new CUcontext();
    cuCtxCreate(context, 0, firstDevice);

    // Have the kernel source turned into a PTX file (by calling the NVCC)
    String kernelSource =
        "src/main/resources/kernels/JCudaReductionKernel.cu";
    String ptxPath = JCudaSamplesUtils.preparePtxFile(kernelSource);

    // Load that PTX file as a module
    module = new CUmodule();
    cuModuleLoad(module, ptxPath);

    // Look up the "reduce" function inside the module
    function = new CUfunction();
    cuModuleGetFunction(function, module, "reduce");

    // Temporary device memory; it has to hold at least
    // numberOfBlocks * Sizeof.FLOAT bytes
    int scratchBytes = 1024 * Sizeof.FLOAT;
    deviceBuffer = new CUdeviceptr();
    cuMemAlloc(deviceBuffer, scratchBytes);
}

Example #5

Source File: GPUHelper.java From JuiceboxLegacy with MIT License

5 votes

/**
 * Allocates device memory for the given host array and copies the array
 * contents into it.
 *
 * @param input The host data to copy to the device
 * @return The device pointer that received the data. It is not freed
 *         by this method.
 */
public static CUdeviceptr allocateInput(float[] input) {
    Pointer ptr = Pointer.to(input);
    // Compute the byte count once, in long arithmetic: the allocation and
    // the copy are then guaranteed to agree, and arrays with more than
    // Integer.MAX_VALUE / Sizeof.FLOAT elements no longer overflow int.
    long numBytes = (long) input.length * Sizeof.FLOAT;
    CUdeviceptr dInput = new CUdeviceptr();
    cuMemAlloc(dInput, numBytes);
    cuMemcpyHtoD(dInput, ptr, numBytes);
    return dInput;
}

Example #6

Source File: CUDAInnerLoop.java From ocular with GNU General Public License v3.0

5 votes

/**
 * Stores the template configuration in this object and allocates the
 * device buffers {@code d_Ow}, {@code d_Ob} and {@code d_scores}, plus one
 * pair of template buffers ({@code d_Tw}/{@code d_Tb}) per template width
 * that has at least one index. Template data is copied to the device
 * immediately after its buffer is allocated.
 *
 * @param whiteTemplates white template data, indexed by (template width - minTemplateWidth)
 * @param blackTemplates black template data, indexed by (template width - minTemplateWidth)
 * @param templateNumIndices per-width counts; widths whose count is not positive get no device buffers
 * @param templateIndicesOffsets stored as-is; not otherwise used here
 * @param minTemplateWidth smallest template width
 * @param maxTemplateWidth largest template width (inclusive)
 * @param maxSequenceLength sequence length used to size the device buffers
 * @param totalTemplateNumIndices multiplied with maxSequenceLength to size the score buffer
 */
public void startup(float[][] whiteTemplates, float[][] blackTemplates, int[] templateNumIndices, int[] templateIndicesOffsets, int minTemplateWidth, int maxTemplateWidth, int maxSequenceLength, int totalTemplateNumIndices) {
	this.whiteTemplates = whiteTemplates;
	this.blackTemplates = blackTemplates;
	this.templateNumIndices = templateNumIndices;
	this.templateIndicesOffsets = templateIndicesOffsets;
	this.maxTemplateWidth = maxTemplateWidth;
	this.minTemplateWidth = minTemplateWidth;
	this.totalTemplateNumIndices = totalTemplateNumIndices;
	
	int numTemplateWidths = (maxTemplateWidth-minTemplateWidth)+1;
	// Round the sequence length up to a whole multiple of BLOCK_SIZE_X*ROLL_X
	int extendedMaxSeqLength = (BLOCK_SIZE_X*ROLL_X) * (int) Math.ceil(((double) maxSequenceLength) / (BLOCK_SIZE_X*ROLL_X));

	// All byte counts are computed in long arithmetic: the products below
	// (in particular maxSequenceLength*totalTemplateNumIndices*Sizeof.FLOAT)
	// can exceed the int range and would otherwise wrap around silently.
	long observationBytes = ((long) extendedMaxSeqLength+maxTemplateWidth-1) * CharacterTemplate.LINE_HEIGHT * Sizeof.FLOAT;
	long scoreBytes = (long) maxSequenceLength * totalTemplateNumIndices * Sizeof.FLOAT;

	this.d_Ow = new CUdeviceptr();
	cuMemAlloc(d_Ow, observationBytes);
	this.d_Ob = new CUdeviceptr();
	cuMemAlloc(d_Ob, observationBytes);
	this.d_scores = new CUdeviceptr();
	cuMemAlloc(d_scores, scoreBytes);
	this.d_Tw = new CUdeviceptr[numTemplateWidths];
	this.d_Tb = new CUdeviceptr[numTemplateWidths];
	for (int tw=minTemplateWidth; tw<=maxTemplateWidth; ++tw) {
		// Position of this template width in the per-width arrays
		int widthIndex = tw-minTemplateWidth;
		if (templateNumIndices[widthIndex] > 0) {
			float[] white = whiteTemplates[widthIndex];
			long whiteBytes = (long) white.length * Sizeof.FLOAT;
			d_Tw[widthIndex] = new CUdeviceptr();
			cuMemAlloc(d_Tw[widthIndex], whiteBytes);
			cuMemcpyHtoD(d_Tw[widthIndex], Pointer.to(white), whiteBytes);
			
			float[] black = blackTemplates[widthIndex];
			long blackBytes = (long) black.length * Sizeof.FLOAT;
			d_Tb[widthIndex] = new CUdeviceptr();
			cuMemAlloc(d_Tb[widthIndex], blackBytes);
			cuMemcpyHtoD(d_Tb[widthIndex], Pointer.to(black), blackBytes);
		}
	}
}

Example #7

Source File: Context.java From OSPREY3 with GNU General Public License v2.0

4 votes

/**
 * Releases the given device buffer by passing it to
 * {@code JCudaDriver.cuMemFree}.
 *
 * @param pdBuf the device pointer to free
 */
public void free(CUdeviceptr pdBuf) {
	JCudaDriver.cuMemFree(pdBuf);
}

Example #8

Source File: JCudaDriverMemRangeTest.java From jcuda with MIT License

4 votes

/**
 * Queries four memory range attributes of a managed allocation, one
 * {@code cuMemRangeGetAttribute} call per attribute. The test makes no
 * assertions on the returned values; with exceptions enabled it only
 * verifies that the calls go through. The values can optionally be
 * printed by flipping {@code printResults}.
 */
@Test
public void testMemRangeAttribute()
{
    JCudaDriver.setExceptionsEnabled(true);
    
    cuInit(0);
    CUcontext context = new CUcontext();
    CUdevice device = new CUdevice();
    cuDeviceGet(device, 0);
    cuCtxCreate(context, 0, device);
    
    int size = 64;
    CUdeviceptr deviceData = new CUdeviceptr();
    cuMemAllocManaged(deviceData, size, CU_MEM_ATTACH_HOST);
    
    // The output arrays are pre-filled with a recognizable dummy value
    int readMostly[] = { 12345 };
    int lastPrefetchLocation[] = { 12345 };
    int preferredLocation[] = { 12345 };
    int accessedBy[] = { 12345, 12345, 12345 };
    
    cuMemRangeGetAttribute(Pointer.to(readMostly), Sizeof.INT, 
        CU_MEM_RANGE_ATTRIBUTE_READ_MOSTLY, deviceData, size);

    cuMemRangeGetAttribute(Pointer.to(lastPrefetchLocation), Sizeof.INT, 
        CU_MEM_RANGE_ATTRIBUTE_LAST_PREFETCH_LOCATION, deviceData, size);

    cuMemRangeGetAttribute(Pointer.to(preferredLocation), Sizeof.INT, 
        CU_MEM_RANGE_ATTRIBUTE_PREFERRED_LOCATION, deviceData, size);

    cuMemRangeGetAttribute(
        Pointer.to(accessedBy), Sizeof.INT * accessedBy.length, 
        CU_MEM_RANGE_ATTRIBUTE_ACCESSED_BY, deviceData, size);

    boolean printResults = false;
    //printResults = true;
    if (printResults)
    {
        // Each label prints its own array (the "readMostly" line used to
        // print lastPrefetchLocation by mistake)
        System.out.println("readMostly          : " + 
            Arrays.toString(readMostly));
        System.out.println("lastPrefetchLocation: " + 
            Arrays.toString(lastPrefetchLocation));
        System.out.println("preferredLocation   : " + 
            Arrays.toString(preferredLocation));
        System.out.println("accessedBy          : " + 
            Arrays.toString(accessedBy));
    }
}

Example #9

Source File: JCudaDriverMemRangeTest.java From jcuda with MIT License

4 votes

/**
 * Queries four memory range attributes of a managed allocation with a
 * single {@code cuMemRangeGetAttributes} call. The test makes no
 * assertions on the returned values; with exceptions enabled it only
 * verifies that the call goes through. The values can optionally be
 * printed by flipping {@code printResults}.
 */
@Test
public void testMemRangeAttributes()
{
    JCudaDriver.setExceptionsEnabled(true);
    
    cuInit(0);
    CUcontext context = new CUcontext();
    CUdevice device = new CUdevice();
    cuDeviceGet(device, 0);
    cuCtxCreate(context, 0, device);
    
    int size = 64;
    CUdeviceptr deviceData = new CUdeviceptr();
    cuMemAllocManaged(deviceData, size, CU_MEM_ATTACH_HOST);
    
    // The output arrays are pre-filled with a recognizable dummy value
    int readMostly[] = { 12345 };
    int lastPrefetchLocation[] = { 12345 };
    int preferredLocation[] = { 12345 };
    int accessedBy[] = { 12345, 12345, 12345 };
    
    // The three arrays below are parallel: entry i of each one describes
    // the output pointer, output size and attribute of query i
    Pointer data[] =  
    {
        Pointer.to(readMostly),
        Pointer.to(lastPrefetchLocation),
        Pointer.to(preferredLocation),
        Pointer.to(accessedBy) 
    };
    long dataSizes[] = 
    {
        Sizeof.INT, 
        Sizeof.INT, 
        Sizeof.INT, 
        Sizeof.INT * accessedBy.length
    };
    int attributes[] =  
    {
        CU_MEM_RANGE_ATTRIBUTE_READ_MOSTLY,
        CU_MEM_RANGE_ATTRIBUTE_LAST_PREFETCH_LOCATION,
        CU_MEM_RANGE_ATTRIBUTE_PREFERRED_LOCATION,
        CU_MEM_RANGE_ATTRIBUTE_ACCESSED_BY,
    };
    cuMemRangeGetAttributes(data, dataSizes, 
        attributes, attributes.length, deviceData, size);
    
    boolean printResults = false;
    //printResults = true;
    if (printResults)
    {
        // Each label prints its own array (the "readMostly" line used to
        // print lastPrefetchLocation by mistake)
        System.out.println("readMostly          : " + 
            Arrays.toString(readMostly));
        System.out.println("lastPrefetchLocation: " + 
            Arrays.toString(lastPrefetchLocation));
        System.out.println("preferredLocation   : " + 
            Arrays.toString(preferredLocation));
        System.out.println("accessedBy          : " + 
            Arrays.toString(accessedBy));
    }
}

Example #10

Source File: GPUTesting.java From JuiceboxLegacy with MIT License

4 votes

/**
 * Compiles a small vector-addition kernel from source, runs it on two
 * ten-element input arrays and prints the result that is copied back
 * from the device.
 */
public static void test() {
    JCudaDriver.setExceptionsEnabled(true);

    // Kernel source: each thread adds one pair of elements
    String sourceCode =
            "extern \"C\"\n"
            + "__global__ void add(float *result, float *a, float *b)\n"
            + "{\n"
            + "    int i = threadIdx.x;\n"
            + "    result[i] = a[i] + b[i];\n"
            + "}";

    // Compile the kernel
    System.out.println("Preparing the KernelLauncher...");
    KernelLauncher launcher = KernelLauncher.compile(sourceCode, "add");

    // Host-side data: both inputs hold 0, 1, 2, ...
    System.out.println("Creating input data...");
    int count = 10;
    float[] hostResult = new float[count];
    float[] lhs = new float[count];
    float[] rhs = new float[count];
    for (int index = 0; index < count; index++) {
        lhs[index] = index;
        rhs[index] = index;
    }

    // Device-side buffers; the inputs are copied over as part of
    // their allocation
    System.out.println("Initializing device memory...");
    CUdeviceptr deviceResult = GPUHelper.allocateOutput(count, Sizeof.FLOAT);
    CUdeviceptr deviceLhs = GPUHelper.allocateInput(lhs);
    CUdeviceptr deviceRhs = GPUHelper.allocateInput(rhs);

    // Launch with a block size of count x 1 x 1
    System.out.println("Calling the kernel...");
    launcher.setBlockSize(count, 1, 1);
    launcher.call(deviceResult, deviceLhs, deviceRhs);

    // Fetch the sums back into host memory and show them
    System.out.println("Obtaining results...");
    int resultBytes = count * Sizeof.FLOAT;
    cuMemcpyDtoH(Pointer.to(hostResult), deviceResult, resultBytes);
    System.out.println("Result: " + Arrays.toString(hostResult));

    // Release the device buffers
    cuMemFree(deviceLhs);
    cuMemFree(deviceRhs);
    cuMemFree(deviceResult);
}

Example #11

Source File: GPUHelper.java From JuiceboxLegacy with MIT License

4 votes

/**
 * Allocates device memory for {@code size} elements of {@code typeSize}
 * bytes each. Nothing is copied into the allocated memory here.
 *
 * @param size The number of elements
 * @param typeSize The size of one element, in bytes
 * @return The device pointer of the allocation. It is not freed by this
 *         method.
 */
public static CUdeviceptr allocateOutput(int size, int typeSize) {
    // Multiply in long arithmetic so that large requests do not
    // overflow int before reaching the driver call.
    long numBytes = (long) size * typeSize;
    CUdeviceptr dOutput = new CUdeviceptr();
    cuMemAlloc(dOutput, numBytes);
    return dOutput;
}

Example #12

Source File: GPUHelper.java From JuiceboxLegacy with MIT License

4 votes

/**
 * Passes every device pointer in the given array to {@code cuMemFree},
 * in array order.
 *
 * @param pointers The device pointers to free
 */
public static void freeUpMemory(CUdeviceptr[] pointers) {
    final int count = pointers.length;
    for (int index = 0; index < count; index++) {
        cuMemFree(pointers[index]);
    }
}

Example #13

Source File: Context.java From OSPREY3 with GNU General Public License v2.0

4 votes

/**
 * Allocates a device buffer of the requested size.
 *
 * @param numBytes the byte count passed to {@code JCudaDriver.cuMemAlloc}
 * @return the device pointer that was handed to {@code JCudaDriver.cuMemAlloc}
 */
public CUdeviceptr malloc(long numBytes) {
	final CUdeviceptr devicePointer = new CUdeviceptr();
	JCudaDriver.cuMemAlloc(devicePointer, numBytes);
	return devicePointer;
}

Example #14

Source File: JCudaDriverTextureTest.java From jcuda with MIT License

4 votes

/**
 * Exercises 3D texture access with four float channels: the host input
 * is copied into a 3D device array, the array is bound to the
 * "texture_float4_3D" texture reference, and the "test_float4_3D"
 * kernel is launched with a single thread.
 * 
 * @return Whether the kernel output exactly equals the expected values
 */
private boolean test_float4_3D()
{
    // Describe and create the 3D device array
    CUDA_ARRAY3D_DESCRIPTOR descriptor = new CUDA_ARRAY3D_DESCRIPTOR();
    descriptor.Format = CU_AD_FORMAT_FLOAT;
    descriptor.Width = sizeX;
    descriptor.Height = sizeY;
    descriptor.Depth = sizeZ;
    descriptor.NumChannels = 4;
    CUarray textureArray = new CUarray();
    cuArray3DCreate(textureArray, descriptor);

    // Host -> array copy of the input data
    CUDA_MEMCPY3D copyParams = new CUDA_MEMCPY3D();
    copyParams.srcMemoryType = CUmemorytype.CU_MEMORYTYPE_HOST;
    copyParams.srcHost = Pointer.to(input_float4_3D);
    copyParams.srcPitch = sizeX * Sizeof.FLOAT * 4;
    copyParams.srcHeight = sizeY;
    copyParams.dstMemoryType = CUmemorytype.CU_MEMORYTYPE_ARRAY;
    copyParams.dstArray = textureArray;
    // NOTE(review): dstHeight is taken from sizeX while srcHeight uses
    // sizeY - confirm that this is intended
    copyParams.dstHeight = sizeX;
    copyParams.WidthInBytes = sizeX * Sizeof.FLOAT * 4;
    copyParams.Height = sizeY;
    copyParams.Depth = sizeZ;
    cuMemcpy3D(copyParams);

    // Configure the texture reference: linear filtering, clamped
    // addressing in all three dimensions, normalized coordinates
    CUtexref textureRef = new CUtexref();
    cuModuleGetTexRef(textureRef, module, "texture_float4_3D");
    cuTexRefSetFilterMode(textureRef, CU_TR_FILTER_MODE_LINEAR);
    for (int dim = 0; dim < 3; dim++)
    {
        cuTexRefSetAddressMode(textureRef, dim, CU_TR_ADDRESS_MODE_CLAMP);
    }
    cuTexRefSetFlags(textureRef, CU_TRSF_NORMALIZED_COORDINATES);
    cuTexRefSetFormat(textureRef, CU_AD_FORMAT_FLOAT, 4);
    cuTexRefSetArray(textureRef, textureArray, CU_TRSA_OVERRIDE_FORMAT);

    // Device memory receiving the four output floats
    int outputBytes = Sizeof.FLOAT * 4;
    CUdeviceptr deviceOutput = new CUdeviceptr();
    cuMemAlloc(deviceOutput, outputBytes);

    // Look up the kernel
    CUfunction kernel = new CUfunction();
    cuModuleGetFunction(kernel, module, "test_float4_3D");

    // Kernel arguments: output pointer followed by the lookup position
    Pointer outputArg = Pointer.to(deviceOutput);
    Pointer posXArg = Pointer.to(new float[]{ posX });
    Pointer posYArg = Pointer.to(new float[]{ posY });
    Pointer posZArg = Pointer.to(new float[]{ posZ });
    Pointer kernelArgs = Pointer.to(outputArg, posXArg, posYArg, posZArg);

    // Launch one block containing one thread, then wait for completion
    cuLaunchKernel(kernel, 1, 1, 1, 
        1, 1, 1, 0, null, kernelArgs, null);
    cuCtxSynchronize();

    // Copy the output back to the host
    float[] hostOutput = new float[4];
    cuMemcpyDtoH(Pointer.to(hostOutput), deviceOutput, outputBytes);

    // Report the result and compare it against the expected values
    log("Result float4 3D " + Arrays.toString(hostOutput));
    float[] expectedOutput = { 3.5f, 3.5f, 3.5f, 3.5f };
    boolean passed = Arrays.equals(hostOutput, expectedOutput);
    String verdict;
    if (passed)
    {
        verdict = "PASSED";
    }
    else
    {
        verdict = "FAILED";
    }
    log("Test   float4 3D " + verdict);

    // Release the device resources
    cuArrayDestroy(textureArray);
    cuMemFree(deviceOutput);

    return passed;
}

Example #15

Source File: Context.java From OSPREY3 with GNU General Public License v2.0

4 votes

/**
 * Issues a host-to-device copy via {@code JCudaDriver.cuMemcpyHtoDAsync}
 * on the stream obtained from {@code stream.getStream()}.
 *
 * @param pdBuf the destination device buffer
 * @param phBuf the source host buffer
 * @param numBytes the number of bytes to copy
 * @param stream the stream wrapper whose underlying stream is used for the copy
 */
public void uploadAsync(CUdeviceptr pdBuf, Pointer phBuf, long numBytes, GpuStream stream) {
	JCudaDriver.cuMemcpyHtoDAsync(pdBuf, phBuf, numBytes, stream.getStream());
}

Example #16

Source File: Context.java From OSPREY3 with GNU General Public License v2.0

4 votes

/**
 * Issues a device-to-host copy via {@code JCudaDriver.cuMemcpyDtoHAsync}
 * on the stream obtained from {@code stream.getStream()}.
 *
 * @param phBuf the destination host buffer
 * @param pdBuf the source device buffer
 * @param numBytes the number of bytes to copy
 * @param stream the stream wrapper whose underlying stream is used for the copy
 */
public void downloadAsync(Pointer phBuf, CUdeviceptr pdBuf, long numBytes, GpuStream stream) {
	JCudaDriver.cuMemcpyDtoHAsync(phBuf, pdBuf, numBytes, stream.getStream());
}

Example #17

Source File: GPUTesting.java From Juicebox with MIT License

4 votes

/**
 * Compiles a small vector-addition kernel from source, runs it on two
 * ten-element input arrays and prints the result that is copied back
 * from the device.
 */
public static void test() {
    JCudaDriver.setExceptionsEnabled(true);

    // Kernel source: each thread adds one pair of elements
    String sourceCode =
            "extern \"C\"\n"
            + "__global__ void add(float *result, float *a, float *b)\n"
            + "{\n"
            + "    int i = threadIdx.x;\n"
            + "    result[i] = a[i] + b[i];\n"
            + "}";

    // Compile the kernel
    System.out.println("Preparing the KernelLauncher...");
    KernelLauncher launcher = KernelLauncher.compile(sourceCode, "add");

    // Host-side data: both inputs hold 0, 1, 2, ...
    System.out.println("Creating input data...");
    int count = 10;
    float[] hostResult = new float[count];
    float[] lhs = new float[count];
    float[] rhs = new float[count];
    for (int index = 0; index < count; index++) {
        lhs[index] = index;
        rhs[index] = index;
    }

    // Device-side buffers; the inputs are copied over as part of
    // their allocation
    System.out.println("Initializing device memory...");
    CUdeviceptr deviceResult = GPUHelper.allocateOutput(count, Sizeof.FLOAT);
    CUdeviceptr deviceLhs = GPUHelper.allocateInput(lhs);
    CUdeviceptr deviceRhs = GPUHelper.allocateInput(rhs);

    // Launch with a block size of count x 1 x 1
    System.out.println("Calling the kernel...");
    launcher.setBlockSize(count, 1, 1);
    launcher.call(deviceResult, deviceLhs, deviceRhs);

    // Fetch the sums back into host memory and show them
    System.out.println("Obtaining results...");
    int resultBytes = count * Sizeof.FLOAT;
    cuMemcpyDtoH(Pointer.to(hostResult), deviceResult, resultBytes);
    System.out.println("Result: " + Arrays.toString(hostResult));

    // Release the device buffers
    cuMemFree(deviceLhs);
    cuMemFree(deviceRhs);
    cuMemFree(deviceResult);
}

Example #18

Source File: GPUHelper.java From Juicebox with MIT License

4 votes

/**
 * Allocates device memory for {@code size} elements of {@code typeSize}
 * bytes each. Nothing is copied into the allocated memory here.
 *
 * @param size The number of elements
 * @param typeSize The size of one element, in bytes
 * @return The device pointer of the allocation. It is not freed by this
 *         method.
 */
public static CUdeviceptr allocateOutput(int size, int typeSize) {
    // Multiply in long arithmetic so that large requests do not
    // overflow int before reaching the driver call.
    long numBytes = (long) size * typeSize;
    CUdeviceptr dOutput = new CUdeviceptr();
    cuMemAlloc(dOutput, numBytes);
    return dOutput;
}

Example #19

Source File: GPUHelper.java From Juicebox with MIT License

4 votes

/**
 * Passes every device pointer in the given array to {@code cuMemFree},
 * in array order.
 *
 * @param pointers The device pointers to free
 */
public static void freeUpMemory(CUdeviceptr[] pointers) {
    final int count = pointers.length;
    for (int index = 0; index < count; index++) {
        cuMemFree(pointers[index]);
    }
}

Example #20

Source File: VecFloatSample.java From jcuda-samples with MIT License

4 votes

/**
 * Entry point of the sample: computes cos(x)^2 + sin(y)^2 for 50000
 * elements with the VecFloat library, compares the device result with
 * a host computation and prints whether the test passed.
 * 
 * @param args Not used
 */
public static void main(String[] args)
{
    // With exceptions enabled, no further error checks are written out
    JCudaDriver.setExceptionsEnabled(true);

    // Bring up the driver and create a context on device 0
    cuInit(0);
    CUdevice firstDevice = new CUdevice();
    cuDeviceGet(firstDevice, 0);
    CUcontext cudaContext = new CUcontext();
    cuCtxCreate(cudaContext, 0, firstDevice);

    // The vector library is initialized afterwards, so that it
    // attaches to the context that is current now
    VecFloat.init();

    // Host input: x[i] = y[i] = i
    final int n = 50000;
    final int numBytes = n * Sizeof.FLOAT;
    float[] hostX = new float[n];
    float[] hostY = new float[n];
    for (int index = 0; index < n; index++)
    {
        hostX[index] = (float)index;
        hostY[index] = (float)index;
    }

    // Device copies of both inputs
    CUdeviceptr deviceX = new CUdeviceptr();
    cuMemAlloc(deviceX, numBytes);
    cuMemcpyHtoD(deviceX, Pointer.to(hostX), numBytes);

    CUdeviceptr deviceY = new CUdeviceptr();
    cuMemAlloc(deviceY, numBytes);
    cuMemcpyHtoD(deviceY, Pointer.to(hostY), numBytes);

    // Device memory for the result
    CUdeviceptr deviceResult = new CUdeviceptr();
    cuMemAlloc(deviceResult, numBytes);

    // The vector operations
    VecFloat.cos(n, deviceX, deviceX);               // x = cos(x)
    VecFloat.mul(n, deviceX, deviceX, deviceX);      // x = x*x
    VecFloat.sin(n, deviceY, deviceY);               // y = sin(y)
    VecFloat.mul(n, deviceY, deviceY, deviceY);      // y = y*y
    VecFloat.add(n, deviceResult, deviceX, deviceY); // result = x+y

    // Bring the device result back into host memory
    float[] hostResult = new float[n];
    cuMemcpyDtoH(Pointer.to(hostResult), deviceResult, numBytes);

    // Compare against the host computation; stop at the first mismatch
    boolean passed = true;
    for (int index = 0; index < n && passed; index++)
    {
        float expected = (float)(
            Math.cos(hostX[index])*Math.cos(hostX[index])+
            Math.sin(hostY[index])*Math.sin(hostY[index]));
        float difference = hostResult[index] - expected;
        if (Math.abs(difference) > 1e-5)
        {
            System.out.println(
                "At index "+index+ " found "+hostResult[index]+
                " but expected "+expected);
            passed = false;
        }
    }
    String verdict = passed ? "PASSED" : "FAILED";
    System.out.println("Test "+verdict);

    // Release the device memory and shut the vector library down
    cuMemFree(deviceX);
    cuMemFree(deviceY);
    cuMemFree(deviceResult);
    VecFloat.shutdown();
}

Example #21

Source File: JCudaDriverTextureTest.java From jcuda with MIT License

4 votes

/**
 * Exercises 2D texture access with four float channels: the host input
 * is copied into a 2D device array, the array is bound to the
 * "texture_float4_2D" texture reference, and the "test_float4_2D"
 * kernel is launched with a single thread.
 * 
 * @return Whether the kernel output exactly equals the expected values
 */
private boolean test_float4_2D()
{
    // Describe and create the 2D device array
    CUDA_ARRAY_DESCRIPTOR descriptor = new CUDA_ARRAY_DESCRIPTOR();
    descriptor.Format = CU_AD_FORMAT_FLOAT;
    descriptor.Width = sizeX;
    descriptor.Height = sizeY;
    descriptor.NumChannels = 4;
    CUarray textureArray = new CUarray();
    cuArrayCreate(textureArray, descriptor);

    // Host -> array copy of the input data
    CUDA_MEMCPY2D copyParams = new CUDA_MEMCPY2D();
    copyParams.srcMemoryType = CUmemorytype.CU_MEMORYTYPE_HOST;
    copyParams.srcHost = Pointer.to(input_float4_2D);
    copyParams.srcPitch = sizeX * Sizeof.FLOAT * 4;
    copyParams.dstMemoryType = CUmemorytype.CU_MEMORYTYPE_ARRAY;
    copyParams.dstArray = textureArray;
    copyParams.WidthInBytes = sizeX * Sizeof.FLOAT * 4;
    copyParams.Height = sizeY;
    cuMemcpy2D(copyParams);

    // Configure the texture reference: linear filtering, clamped
    // addressing in both dimensions, normalized coordinates
    CUtexref textureRef = new CUtexref();
    cuModuleGetTexRef(textureRef, module, "texture_float4_2D");
    cuTexRefSetFilterMode(textureRef, CU_TR_FILTER_MODE_LINEAR);
    for (int dim = 0; dim < 2; dim++)
    {
        cuTexRefSetAddressMode(textureRef, dim, CU_TR_ADDRESS_MODE_CLAMP);
    }
    cuTexRefSetFlags(textureRef, CU_TRSF_NORMALIZED_COORDINATES);
    cuTexRefSetFormat(textureRef, CU_AD_FORMAT_FLOAT, 4);
    cuTexRefSetArray(textureRef, textureArray, CU_TRSA_OVERRIDE_FORMAT);

    // Device memory receiving the four output floats
    int outputBytes = Sizeof.FLOAT * 4;
    CUdeviceptr deviceOutput = new CUdeviceptr();
    cuMemAlloc(deviceOutput, outputBytes);

    // Look up the kernel
    CUfunction kernel = new CUfunction();
    cuModuleGetFunction(kernel, module, "test_float4_2D");

    // Kernel arguments: output pointer followed by the lookup position
    Pointer outputArg = Pointer.to(deviceOutput);
    Pointer posXArg = Pointer.to(new float[]{ posX });
    Pointer posYArg = Pointer.to(new float[]{ posY });
    Pointer kernelArgs = Pointer.to(outputArg, posXArg, posYArg);

    // Launch one block containing one thread, then wait for completion
    cuLaunchKernel(kernel, 1, 1, 1, 
        1, 1, 1, 0, null, kernelArgs, null);
    cuCtxSynchronize();

    // Copy the output back to the host
    float[] hostOutput = new float[4];
    cuMemcpyDtoH(Pointer.to(hostOutput), deviceOutput, outputBytes);

    // Report the result and compare it against the expected values
    log("Result float4 2D " + Arrays.toString(hostOutput));
    float[] expectedOutput = { 1.5f, 1.5f, 1.5f, 1.5f };
    boolean passed = Arrays.equals(hostOutput, expectedOutput);
    String verdict;
    if (passed)
    {
        verdict = "PASSED";
    }
    else
    {
        verdict = "FAILED";
    }
    log("Test   float4 2D " + verdict);

    // Release the device resources
    cuArrayDestroy(textureArray);
    cuMemFree(deviceOutput);

    return passed;
}

Example #22

Source File: JCudaDriverTextureTest.java From jcuda with MIT License

4 votes

/**
 * Exercises 1D texture access with four float channels: the host input
 * is copied into a device array of height 1, the array is bound to the
 * "texture_float4_1D" texture reference, and the "test_float4_1D"
 * kernel is launched with a single thread.
 * 
 * @return Whether the kernel output exactly equals the expected values
 */
private boolean test_float4_1D()
{
    // Describe and create the device array (a single row)
    CUDA_ARRAY_DESCRIPTOR descriptor = new CUDA_ARRAY_DESCRIPTOR();
    descriptor.Format = CU_AD_FORMAT_FLOAT;
    descriptor.Width = sizeX;
    descriptor.Height = 1;
    descriptor.NumChannels = 4;
    CUarray textureArray = new CUarray();
    cuArrayCreate(textureArray, descriptor);

    // Host -> array copy of the input data, starting at offset 0
    Pointer hostInput = Pointer.to(input_float4_1D);
    cuMemcpyHtoA(textureArray, 0, hostInput, sizeX * Sizeof.FLOAT * 4);

    // Configure the texture reference: linear filtering, clamped
    // addressing, normalized coordinates
    CUtexref textureRef = new CUtexref();
    cuModuleGetTexRef(textureRef, module, "texture_float4_1D");
    cuTexRefSetFilterMode(textureRef, CU_TR_FILTER_MODE_LINEAR);
    cuTexRefSetAddressMode(textureRef, 0, CU_TR_ADDRESS_MODE_CLAMP);
    cuTexRefSetFlags(textureRef, CU_TRSF_NORMALIZED_COORDINATES);
    cuTexRefSetFormat(textureRef, CU_AD_FORMAT_FLOAT, 4);
    cuTexRefSetArray(textureRef, textureArray, CU_TRSA_OVERRIDE_FORMAT);

    // Device memory receiving the four output floats
    int outputBytes = Sizeof.FLOAT * 4;
    CUdeviceptr deviceOutput = new CUdeviceptr();
    cuMemAlloc(deviceOutput, outputBytes);

    // Look up the kernel
    CUfunction kernel = new CUfunction();
    cuModuleGetFunction(kernel, module, "test_float4_1D");

    // Kernel arguments: output pointer followed by the lookup position
    Pointer outputArg = Pointer.to(deviceOutput);
    Pointer posXArg = Pointer.to(new float[]{ posX });
    Pointer kernelArgs = Pointer.to(outputArg, posXArg);

    // Launch one block containing one thread, then wait for completion
    cuLaunchKernel(kernel, 1, 1, 1, 
        1, 1, 1, 0, null, kernelArgs, null);
    cuCtxSynchronize();

    // Copy the output back to the host
    float[] hostOutput = new float[4];
    cuMemcpyDtoH(Pointer.to(hostOutput), deviceOutput, outputBytes);

    // Report the result and compare it against the expected values
    log("Result float4 1D " + Arrays.toString(hostOutput));
    float[] expectedOutput = { 0.5f, 0.5f, 0.5f, 0.5f };
    boolean passed = Arrays.equals(hostOutput, expectedOutput);
    String verdict;
    if (passed)
    {
        verdict = "PASSED";
    }
    else
    {
        verdict = "FAILED";
    }
    log("Test   float4 1D " + verdict);

    // Release the device resources
    cuArrayDestroy(textureArray);
    cuMemFree(deviceOutput);

    return passed;
}

Example #23

Source File: JCudaDriverTextureTest.java From jcuda with MIT License

4 votes

/**
 * Exercises 3D texture access with a single float channel: the host
 * input is copied into a 3D device array, the array is bound to the
 * "texture_float_3D" texture reference, and the "test_float_3D" kernel
 * is launched with a single thread.
 * 
 * @return Whether the kernel output exactly equals the expected value
 */
private boolean test_float_3D()
{
    // Describe and create the 3D device array
    CUDA_ARRAY3D_DESCRIPTOR descriptor = new CUDA_ARRAY3D_DESCRIPTOR();
    descriptor.Format = CU_AD_FORMAT_FLOAT;
    descriptor.Width = sizeX;
    descriptor.Height = sizeY;
    descriptor.Depth = sizeZ;
    descriptor.NumChannels = 1;
    CUarray textureArray = new CUarray();
    cuArray3DCreate(textureArray, descriptor);

    // Host -> array copy of the input data
    CUDA_MEMCPY3D copyParams = new CUDA_MEMCPY3D();
    copyParams.srcMemoryType = CUmemorytype.CU_MEMORYTYPE_HOST;
    copyParams.srcHost = Pointer.to(input_float_3D);
    copyParams.srcPitch = sizeX * Sizeof.FLOAT;
    copyParams.srcHeight = sizeY;
    copyParams.dstMemoryType = CUmemorytype.CU_MEMORYTYPE_ARRAY;
    copyParams.dstArray = textureArray;
    // NOTE(review): dstHeight is taken from sizeX while srcHeight uses
    // sizeY - confirm that this is intended
    copyParams.dstHeight = sizeX;
    copyParams.WidthInBytes = sizeX * Sizeof.FLOAT;
    copyParams.Height = sizeY;
    copyParams.Depth = sizeZ;
    cuMemcpy3D(copyParams);

    // Configure the texture reference: linear filtering, clamped
    // addressing in all three dimensions, normalized coordinates
    CUtexref textureRef = new CUtexref();
    cuModuleGetTexRef(textureRef, module, "texture_float_3D");
    cuTexRefSetFilterMode(textureRef, CU_TR_FILTER_MODE_LINEAR);
    for (int dim = 0; dim < 3; dim++)
    {
        cuTexRefSetAddressMode(textureRef, dim, CU_TR_ADDRESS_MODE_CLAMP);
    }
    cuTexRefSetFlags(textureRef, CU_TRSF_NORMALIZED_COORDINATES);
    cuTexRefSetFormat(textureRef, CU_AD_FORMAT_FLOAT, 1);
    cuTexRefSetArray(textureRef, textureArray, CU_TRSA_OVERRIDE_FORMAT);

    // Device memory receiving the single output float
    int outputBytes = Sizeof.FLOAT * 1;
    CUdeviceptr deviceOutput = new CUdeviceptr();
    cuMemAlloc(deviceOutput, outputBytes);

    // Look up the kernel
    CUfunction kernel = new CUfunction();
    cuModuleGetFunction(kernel, module, "test_float_3D");

    // Kernel arguments: output pointer followed by the lookup position
    Pointer outputArg = Pointer.to(deviceOutput);
    Pointer posXArg = Pointer.to(new float[]{ posX });
    Pointer posYArg = Pointer.to(new float[]{ posY });
    Pointer posZArg = Pointer.to(new float[]{ posZ });
    Pointer kernelArgs = Pointer.to(outputArg, posXArg, posYArg, posZArg);

    // Launch one block containing one thread, then wait for completion
    cuLaunchKernel(kernel, 1, 1, 1, 
        1, 1, 1, 0, null, kernelArgs, null);
    cuCtxSynchronize();

    // Copy the output back to the host
    float[] hostOutput = new float[1];
    cuMemcpyDtoH(Pointer.to(hostOutput), deviceOutput, outputBytes);

    // Report the result and compare it against the expected value
    log("Result float  3D " + Arrays.toString(hostOutput));
    float[] expectedOutput = { 3.5f };
    boolean passed = Arrays.equals(hostOutput, expectedOutput);
    String verdict;
    if (passed)
    {
        verdict = "PASSED";
    }
    else
    {
        verdict = "FAILED";
    }
    log("Test   float  3D " + verdict);

    // Release the device resources
    cuArrayDestroy(textureArray);
    cuMemFree(deviceOutput);

    return passed;
}

Example #24

Source File: JCudaDriverTextureTest.java From jcuda with MIT License

4 votes

/**
 * Exercises 2D texture access with a single float channel: the host
 * input is copied into a 2D device array, the array is bound to the
 * "texture_float_2D" texture reference, and the "test_float_2D" kernel
 * is launched with a single thread.
 * 
 * @return Whether the kernel output exactly equals the expected value
 */
private boolean test_float_2D()
{
    // Describe and create the 2D device array
    CUDA_ARRAY_DESCRIPTOR descriptor = new CUDA_ARRAY_DESCRIPTOR();
    descriptor.Format = CU_AD_FORMAT_FLOAT;
    descriptor.Width = sizeX;
    descriptor.Height = sizeY;
    descriptor.NumChannels = 1;
    CUarray textureArray = new CUarray();
    cuArrayCreate(textureArray, descriptor);

    // Host -> array copy of the input data
    CUDA_MEMCPY2D copyParams = new CUDA_MEMCPY2D();
    copyParams.srcMemoryType = CUmemorytype.CU_MEMORYTYPE_HOST;
    copyParams.srcHost = Pointer.to(input_float_2D);
    copyParams.srcPitch = sizeX * Sizeof.FLOAT;
    copyParams.dstMemoryType = CUmemorytype.CU_MEMORYTYPE_ARRAY;
    copyParams.dstArray = textureArray;
    copyParams.WidthInBytes = sizeX * Sizeof.FLOAT;
    copyParams.Height = sizeY;
    cuMemcpy2D(copyParams);

    // Configure the texture reference: linear filtering, clamped
    // addressing in both dimensions, normalized coordinates
    CUtexref textureRef = new CUtexref();
    cuModuleGetTexRef(textureRef, module, "texture_float_2D");
    cuTexRefSetFilterMode(textureRef, CU_TR_FILTER_MODE_LINEAR);
    for (int dim = 0; dim < 2; dim++)
    {
        cuTexRefSetAddressMode(textureRef, dim, CU_TR_ADDRESS_MODE_CLAMP);
    }
    cuTexRefSetFlags(textureRef, CU_TRSF_NORMALIZED_COORDINATES);
    cuTexRefSetFormat(textureRef, CU_AD_FORMAT_FLOAT, 1);
    cuTexRefSetArray(textureRef, textureArray, CU_TRSA_OVERRIDE_FORMAT);

    // Device memory receiving the single output float
    int outputBytes = Sizeof.FLOAT * 1;
    CUdeviceptr deviceOutput = new CUdeviceptr();
    cuMemAlloc(deviceOutput, outputBytes);

    // Look up the kernel
    CUfunction kernel = new CUfunction();
    cuModuleGetFunction(kernel, module, "test_float_2D");

    // Kernel arguments: output pointer followed by the lookup position
    Pointer outputArg = Pointer.to(deviceOutput);
    Pointer posXArg = Pointer.to(new float[]{ posX });
    Pointer posYArg = Pointer.to(new float[]{ posY });
    Pointer kernelArgs = Pointer.to(outputArg, posXArg, posYArg);

    // Launch one block containing one thread, then wait for completion
    cuLaunchKernel(kernel, 1, 1, 1, 
        1, 1, 1, 0, null, kernelArgs, null);
    cuCtxSynchronize();

    // Copy the output back to the host
    float[] hostOutput = new float[1];
    cuMemcpyDtoH(Pointer.to(hostOutput), deviceOutput, outputBytes);

    // Report the result and compare it against the expected value
    log("Result float  2D " + Arrays.toString(hostOutput));
    float[] expectedOutput = { 1.5f };
    boolean passed = Arrays.equals(hostOutput, expectedOutput);
    String verdict;
    if (passed)
    {
        verdict = "PASSED";
    }
    else
    {
        verdict = "FAILED";
    }
    log("Test   float  2D " + verdict);

    // Release the device resources
    cuArrayDestroy(textureArray);
    cuMemFree(deviceOutput);

    return passed;
}

Example #25

Source File: JCudaDriverTextureTest.java From jcuda with MIT License

4 votes

/**
 * Exercises 1D texture access with a single float channel: the host
 * input is copied into a device array of height 1, the array is bound
 * to the "texture_float_1D" texture reference, and the "test_float_1D"
 * kernel is launched with a single thread.
 * 
 * @return Whether the kernel output exactly equals the expected value
 */
private boolean test_float_1D()
{
    // Describe and create the device array (a single row)
    CUDA_ARRAY_DESCRIPTOR descriptor = new CUDA_ARRAY_DESCRIPTOR();
    descriptor.Format = CU_AD_FORMAT_FLOAT;
    descriptor.Width = sizeX;
    descriptor.Height = 1;
    descriptor.NumChannels = 1;
    CUarray textureArray = new CUarray();
    cuArrayCreate(textureArray, descriptor);

    // Host -> array copy of the input data, starting at offset 0
    Pointer hostInput = Pointer.to(input_float_1D);
    cuMemcpyHtoA(textureArray, 0, hostInput, sizeX * Sizeof.FLOAT);

    // Configure the texture reference: linear filtering, clamped
    // addressing, normalized coordinates
    CUtexref textureRef = new CUtexref();
    cuModuleGetTexRef(textureRef, module, "texture_float_1D");
    cuTexRefSetFilterMode(textureRef, CU_TR_FILTER_MODE_LINEAR);
    cuTexRefSetAddressMode(textureRef, 0, CU_TR_ADDRESS_MODE_CLAMP);
    cuTexRefSetFlags(textureRef, CU_TRSF_NORMALIZED_COORDINATES);
    cuTexRefSetFormat(textureRef, CU_AD_FORMAT_FLOAT, 1);
    cuTexRefSetArray(textureRef, textureArray, CU_TRSA_OVERRIDE_FORMAT);

    // Device memory receiving the single output float
    int outputBytes = Sizeof.FLOAT * 1;
    CUdeviceptr deviceOutput = new CUdeviceptr();
    cuMemAlloc(deviceOutput, outputBytes);

    // Look up the kernel
    CUfunction kernel = new CUfunction();
    cuModuleGetFunction(kernel, module, "test_float_1D");

    // Kernel arguments: output pointer followed by the lookup position
    Pointer outputArg = Pointer.to(deviceOutput);
    Pointer posXArg = Pointer.to(new float[]{ posX });
    Pointer kernelArgs = Pointer.to(outputArg, posXArg);

    // Launch one block containing one thread, then wait for completion
    cuLaunchKernel(kernel, 1, 1, 1, 
        1, 1, 1, 0, null, kernelArgs, null);
    cuCtxSynchronize();

    // Copy the output back to the host
    float[] hostOutput = new float[1];
    cuMemcpyDtoH(Pointer.to(hostOutput), deviceOutput, outputBytes);

    // Report the result and compare it against the expected value
    log("Result float  1D " + Arrays.toString(hostOutput));
    float[] expectedOutput = { 0.5f };
    boolean passed = Arrays.equals(hostOutput, expectedOutput);
    String verdict;
    if (passed)
    {
        verdict = "PASSED";
    }
    else
    {
        verdict = "FAILED";
    }
    log("Test   float  1D " + verdict);

    // Release the device resources
    cuArrayDestroy(textureArray);
    cuMemFree(deviceOutput);

    return passed;
}

Example #26

Source File: JCudaConstantMemoryExample.java From jcuda-samples with MIT License

4 votes

/**
 * Entry point of the constant memory sample. Fills the module symbol
 * "constantMemoryData" with ascending float values, launches
 * "constantMemoryKernel" with an output buffer of the same byte size,
 * and reports whether the buffer content that is read back equals the
 * values that were uploaded.
 *
 * @param args Not used
 * @throws IOException Declared by this method; presumably raised when
 * the PTX file cannot be prepared
 */
public static void main(String[] args) throws IOException
{
    // With exceptions enabled, no return code is checked below
    JCudaDriver.setExceptionsEnabled(true);

    // Bring up the driver API and create a context on device 0
    cuInit(0);
    CUdevice firstDevice = new CUdevice();
    cuDeviceGet(firstDevice, 0);
    CUcontext ctx = new CUcontext();
    cuCtxCreate(ctx, 0, firstDevice);

    // Let NVCC create the PTX file, then load it as a module
    String ptxPath = JCudaSamplesUtils.preparePtxFile(
        "src/main/resources/kernels/JCudaConstantMemoryKernel.cu");
    CUmodule ptxModule = new CUmodule();
    cuModuleLoad(ptxModule, ptxPath);

    // Query address and byte size of the "constantMemoryData" symbol
    long symbolBytes[] = { 0 };
    CUdeviceptr symbolAddress = new CUdeviceptr();
    cuModuleGetGlobal(
        symbolAddress, symbolBytes, ptxModule, "constantMemoryData");
    int byteCount = (int)symbolBytes[0];

    System.out.println("constantMemoryPointer: " + symbolAddress);
    System.out.println("constantMemorySize: " + byteCount);

    // Upload the values 0, 1, 2, ... to the symbol
    int count = byteCount / Sizeof.FLOAT;
    float uploaded[] = new float[count];
    int k = 0;
    while (k < count)
    {
        uploaded[k] = k;
        k++;
    }
    cuMemcpyHtoD(symbolAddress, Pointer.to(uploaded), byteCount);

    // Look up the kernel that is launched below
    CUfunction constantMemoryKernel = new CUfunction();
    cuModuleGetFunction(
        constantMemoryKernel, ptxModule, "constantMemoryKernel");

    // Device buffer that is passed to the kernel and read back afterwards
    CUdeviceptr output = new CUdeviceptr();
    cuMemAlloc(output, byteCount);

    // The kernel receives the output buffer and the element count
    Pointer outputArgument = Pointer.to(output);
    Pointer countArgument = Pointer.to(new int[]{ count });
    Pointer arguments = Pointer.to(outputArgument, countArgument);

    // One block with one thread per element
    int threadsPerBlock = count;
    int blocks = 1;
    cuLaunchKernel(constantMemoryKernel,
        blocks, 1, 1,
        threadsPerBlock, 1, 1,
        0, null,
        arguments, null);
    cuCtxSynchronize();

    // Read the buffer back and compare it with the uploaded values
    float downloaded[] = new float[count];
    cuMemcpyDtoH(Pointer.to(downloaded), output, byteCount);

    String verdict;
    if (Arrays.equals(uploaded, downloaded))
    {
        verdict = "PASSED";
    }
    else
    {
        verdict = "FAILED";
    }
    System.out.println("Test " + verdict);
}

Example #27

Source File: JCudaDriverUnifiedMemory.java From jcuda-samples with MIT License

4 votes

/**
 * Entry point of the unified memory sample. Allocates managed memory,
 * fills it through a host-side buffer view with the values 0 to 9,
 * attaches it globally, and prints the dot product of the data with
 * itself as computed by a cublasSdot call.
 *
 * @param args Not used
 */
public static void main(String[] args)
{
    // NOTE(review): cublasCreate/cublasSdot are called with a cublasHandle
    // below, which looks like the JCublas2 API. Confirm that enabling
    // exceptions on JCublas also covers those calls.
    JCudaDriver.setExceptionsEnabled(true);
    JCublas.setExceptionsEnabled(true);

    // Bring up the driver API and create a context on device 0
    cuInit(0);
    CUdevice firstDevice = new CUdevice();
    cuDeviceGet(firstDevice, 0);
    CUcontext ctx = new CUcontext();
    cuCtxCreate(ctx, 0, firstDevice);

    // Bail out when the device reports no managed memory support
    int attributeValue[] = { 0 };
    cuDeviceGetAttribute(
        attributeValue, CU_DEVICE_ATTRIBUTE_MANAGED_MEMORY, firstDevice);
    boolean managedMemoryAvailable = attributeValue[0] != 0;
    if (!managedMemoryAvailable)
    {
        System.err.println("Device does not support managed memory");
        return;
    }

    // Managed allocation, initially attached to the host
    int elementCount = 10;
    long byteCount = elementCount * Sizeof.FLOAT;
    CUdeviceptr managed = new CUdeviceptr();
    cuMemAllocManaged(managed, byteCount, CU_MEM_ATTACH_HOST);

    // View the allocation as a byte buffer. This is only supported for
    // memory that was allocated to be accessible on the host.
    ByteBuffer hostView = managed.getByteBuffer(0, byteCount);

    System.out.println("Buffer on host side: " + hostView);

    // Write 0, 1, 2, ... through a float view in native byte order
    hostView.order(ByteOrder.nativeOrder());
    FloatBuffer floats = hostView.asFloatBuffer();
    for (int index = 0; index < elementCount; index++)
    {
        floats.put(index, (float)index);
    }

    // Attach the memory globally and wait for the attachment to finish
    cuStreamAttachMemAsync(null, managed, 0,  CU_MEM_ATTACH_GLOBAL);
    cuStreamSynchronize(null);

    // Use the same pointer for both vectors of a dot product, so the
    // values written by the host are consumed by the library call
    cublasHandle blas = new cublasHandle();
    cublasCreate(blas);
    float dot[] = { -1.0f };
    Pointer dotTarget = Pointer.to(dot);
    cublasSdot(blas, elementCount, managed, 1, managed, 1, dotTarget);
    System.out.println("Result: " + dot[0]);
}

Example #28

Source File: JCudaDynamicParallelism.java From jcuda-samples with MIT License

4 votes

/**
 * Entry point of the dynamic parallelism sample. Loads the CUBIN that
 * contains "parentKernel", launches it with {@code numParentThreads}
 * threads, copies the resulting device data back to the host and
 * compares it with reference values computed on the host.
 *
 * @param args Not used
 */
public static void main(String[] args)
{
    // Let failing driver calls throw instead of returning error codes
    JCudaDriver.setExceptionsEnabled(true);

    // Initialize a context for the first device
    cuInit(0);
    CUcontext context = new CUcontext();
    CUdevice device = new CUdevice();
    cuDeviceGet(device, 0);
    cuCtxCreate(context, 0, device);

    // Create the CUBIN file by calling the NVCC. 
    // See the prepareDefaultCubinFile method for the details about
    // the NVCC parameters that are used here. 
    String cubinFileName = JCudaSamplesUtils.prepareDefaultCubinFile(
        "src/main/resources/kernels/JCudaDynamicParallelismKernel.cu");

    // Load the CUBIN file 
    CUmodule module = new CUmodule();
    cuModuleLoad(module, cubinFileName);

    // Obtain a function pointer to the "parentKernel" function.
    CUfunction function = new CUfunction();
    cuModuleGetFunction(function, module, "parentKernel");

    // Define the nesting structure. 
    // 
    // NOTE: The number of child threads MUST match the value that 
    // is used in the kernel, for the childKernel<<<1, 8>>> call!
    // 
    int numParentThreads = 8;
    int numChildThreads = 8;

    // Allocate the device data that will be filled by the kernel
    int numElements = numParentThreads * numChildThreads;
    CUdeviceptr deviceData = new CUdeviceptr();
    cuMemAlloc(deviceData, numElements * Sizeof.FLOAT);

    // Set up the kernel parameters: A pointer to an array
    // of pointers which point to the actual values.
    Pointer kernelParameters = Pointer.to(
        Pointer.to(new int[] { numElements }),
        Pointer.to(deviceData)
    );

    // Call the kernel function.
    //
    // The reference data below has one row per parent thread, so exactly
    // numParentThreads parent threads are needed. The grid size is that
    // count rounded up to whole blocks. The previous expression,
    // (numElements + numElements - 1) / blockSizeX, was a mistyped
    // ceil-division and launched 15 blocks instead of 1.
    int blockSizeX = numParentThreads;
    int gridSizeX = (numParentThreads + blockSizeX - 1) / blockSizeX;
    cuLaunchKernel(function,
        gridSizeX,  1, 1,      // Grid dimension
        blockSizeX, 1, 1,      // Block dimension
        0, null,               // Shared memory size and stream
        kernelParameters, null // Kernel- and extra parameters
    );
    cuCtxSynchronize();

    // Copy the device data to the host. The host array is pre-filled
    // with its own indices first (presumably so that a copy that wrote
    // nothing does not leave a plausible all-zero result).
    float hostData[] = new float[numElements];
    for(int i = 0; i < numElements; i++)
    {
        hostData[i] = i;
    }
    cuMemcpyDtoH(Pointer.to(hostData), 
        deviceData, numElements * Sizeof.FLOAT);

    // Compare the host data with the expected values: element j of
    // parent row i is expected to be i + 0.1f * j
    float hostDataRef[] = new float[numElements];
    for(int i = 0; i < numParentThreads; i++)
    {
        for (int j=0; j < numChildThreads; j++)
        {
            hostDataRef[i * numChildThreads + j] = i + 0.1f * j;
        }
    }
    System.out.println("Result: "+Arrays.toString(hostData));
    boolean passed = Arrays.equals(hostData, hostDataRef);
    System.out.println(passed ? "PASSED" : "FAILED");

    // Clean up.
    cuMemFree(deviceData);
}

Example #29

Source File: JCudaDriverStreamCallbacks.java From jcuda-samples with MIT License

4 votes

/**
 * Builds one Workload and enqueues all CUDA commands for it on a stream
 * of its own. According to the sample, this method is invoked from
 * several host threads.
 * 
 * @param index The index of the workload, used as a prefix of the
 * console messages 
 * @param executor The executor service that receives the task which
 * finishes the workload once the stream callback was invoked 
 */
private static void createWorkloadOnHost(
    final int index, final ExecutorService executor)
{
    // The calling thread needs the CUDA context to be current
    cuCtxSetCurrent(context);

    // Workload object and its stream

    System.out.println(index + ": Initializing workload");
    final Workload workload = new Workload();
    workload.index = index;
    workload.stream = new CUstream();
    cuStreamCreate(workload.stream, 0);


    // Host memory holding the values 0, 1, 2, ..., and a device
    // buffer of the same size

    System.out.println(index + ": Create host data");
    final int numBytes = WORKLOAD_SIZE * Sizeof.INT;
    workload.hostData = new Pointer();
    cuMemHostAlloc(workload.hostData, numBytes, 0);
    IntBuffer hostValues = workload.hostData
        .getByteBuffer(0, numBytes)
        .order(ByteOrder.nativeOrder())
        .asIntBuffer();
    int position = 0;
    while (position < WORKLOAD_SIZE)
    {
        hostValues.put(position, position);
        position++;
    }
    workload.deviceData = new CUdeviceptr();
    cuMemAlloc(workload.deviceData, numBytes);


    // Enqueue on the workload's stream, in this order:
    // 1. host-to-device copy
    // 2. kernel launch
    // 3. device-to-host copy

    System.out.println(index + ": Execute CUDA commands");

    cuMemcpyHtoDAsync(
        workload.deviceData, workload.hostData, numBytes, workload.stream);

    Pointer sizeArgument = Pointer.to(new int[]{WORKLOAD_SIZE});
    Pointer dataArgument = Pointer.to(workload.deviceData);
    Pointer kernelParameters = Pointer.to(sizeArgument, dataArgument);
    int threadsPerBlock = 256;
    int numBlocks = (WORKLOAD_SIZE + threadsPerBlock - 1) / threadsPerBlock;
    cuLaunchKernel(function,
        numBlocks, 1, 1,
        threadsPerBlock, 1, 1,
        0, workload.stream,
        kernelParameters, null);

    cuMemcpyDtoHAsync(
        workload.hostData, workload.deviceData, numBytes, workload.stream);


    // Register a callback on the stream, to be called when the commands
    // on the stream have finished. The callback does not finish the
    // workload itself; it submits a task to the executor that passes the
    // user data to "finishWorkloadOnHost".
    CUstreamCallback streamFinished = new CUstreamCallback()
    {
        @Override
        public void call(
            CUstream hStream, int status, final Object userData)
        {
            System.out.println(index + ": Callback was called");
            executor.submit(new Runnable()
            {
                @Override
                public void run()
                {
                    finishWorkloadOnHost(userData);
                }
            });
        }
    };
    cuStreamAddCallback(workload.stream, streamFinished, workload, 0);
}

Example #30

Source File: JCudaReduction.java From jcuda-samples with MIT License

4 votes

/**
 * Entry point of the reduction sample. For a series of input sizes,
 * starting at 100000 and doubling while the size is at most 26500000,
 * the same random input is reduced once with CUDA and once on the host.
 * The timings and results are printed, and the test passes when every
 * pair of results agrees within a relative tolerance of 1e-5.
 *
 * @param args Not used
 */
public static void main(String args[])
{
    // With exceptions enabled, no return code is checked below
    JCudaDriver.setExceptionsEnabled(true);

    init();
    boolean passed = true;
    int n = 100000;
    while (n <= 26500000)
    {
        float hostInput[] = createRandomArray(n);

        // Timed: device allocation plus host-to-device copy
        long copyStartNs = System.nanoTime();
        CUdeviceptr deviceInput = new CUdeviceptr();
        cuMemAlloc(deviceInput, hostInput.length * Sizeof.FLOAT);
        cuMemcpyHtoD(deviceInput, Pointer.to(hostInput), 
            hostInput.length * Sizeof.FLOAT);
        long durationCopyNs = System.nanoTime() - copyStartNs;

        // Timed: the reduction on the device
        long compStartNs = System.nanoTime();
        float resultJCuda = reduce(deviceInput, hostInput.length);
        long durationCompNs = System.nanoTime() - compStartNs;

        // Releasing the device input is not part of any timing
        cuMemFree(deviceInput);

        // Timed: the reference reduction on the host
        long javaStartNs = System.nanoTime();
        float resultJava = reduceHost(hostInput);
        long durationJavaNs = System.nanoTime() - javaStartNs;

        System.out.println("Reduction of " + n + " elements");
        System.out.printf(Locale.ENGLISH,
            "  JCuda: %7.3f ms, result: %f " +
            "(copy: %7.3f ms, comp: %7.3f ms)\n",
            (durationCopyNs + durationCompNs) / 1e6, resultJCuda, 
            durationCopyNs / 1e6, durationCompNs / 1e6);
        System.out.printf(Locale.ENGLISH,
            "  Java : %7.3f ms, result: %f\n", 
            durationJavaNs / 1e6, resultJava);

        // The difference must be below 1e-5 times the host result
        float difference = Math.abs(resultJCuda - resultJava);
        boolean withinTolerance = difference < resultJava * 1e-5;
        passed = passed & withinTolerance;

        n *= 2;
    }
    System.out.println("Test " + (passed ? "PASSED" : "FAILED"));

    shutdown();
}