jcuda.driver.CUfunction Java Examples

The following examples show how to use jcuda.driver.CUfunction. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: JCudaDriverSimpleJOGL.java    From jcuda-samples with MIT License 6 votes vote down vote up
/**
 * Sets up the JCudaDriver: creates a context on device 0, loads the
 * module with the kernel and looks up the kernel function. Note that
 * this has to be called from the same thread that will later use the
 * JCudaDriver API.
 */
private void initJCuda()
{
    // Enable exceptions for the JCudaDriver calls
    JCudaDriver.setExceptionsEnabled(true);

    // Initialize the driver and open a context on the device with index 0
    cuInit(0);
    CUdevice firstDevice = new CUdevice();
    cuDeviceGet(firstDevice, 0);
    CUcontext deviceContext = new CUcontext();
    cuCtxCreate(deviceContext, 0, firstDevice);

    // Obtain the name of the PTX file that belongs to the kernel source
    String kernelPtxFile = JCudaSamplesUtils.preparePtxFile(
        "src/main/resources/kernels/JCudaDriverSimpleGLKernel.cu");

    // Load that PTX file as a module
    CUmodule kernelModule = new CUmodule();
    cuModuleLoad(kernelModule, kernelPtxFile);

    // Look up the kernel function in the module. It is kept in
    // 'function' so that it can be called later, during the animation.
    function = new CUfunction();
    cuModuleGetFunction(function, kernelModule, "simple_vbo_kernel");
}
 
Example #2
Source File: JCudaDriverBasicGraphExample.java    From jcuda-samples with MIT License 6 votes vote down vote up
/**
 * Create a CUDA kernel function by compiling the given code using the
 * NVRTC, and obtaining the function with the given name.
 * <p>
 * A non-empty compilation log is printed to the standard error stream.
 * The NVRTC program is destroyed on every path, including the case that
 * compiling, reading the log or obtaining the PTX throws an exception.
 * 
 * @param name The name of the function
 * @param code The source code
 * @return The CUDA function
 */
private static CUfunction createFunction(String name, String code)
{
    nvrtcProgram program = new nvrtcProgram();
    nvrtcCreateProgram(program, code, null, 0, null, null);

    String[] ptx = new String[1];
    try
    {
        nvrtcCompileProgram(program, 0, null);

        String[] programLog = new String[1];
        nvrtcGetProgramLog(program, programLog);
        // Treat a log entry that was not filled in like an empty log
        String log = programLog[0] == null ? "" : programLog[0].trim();
        if (!log.isEmpty())
        {
            System.err.println("Compilation log for " + name + ":\n" + log);
        }

        nvrtcGetPTX(program, ptx);
    }
    finally
    {
        // Release the NVRTC program even when one of the calls above failed
        nvrtcDestroyProgram(program);
    }

    // Load the PTX as a module and look up the requested function
    CUmodule module = new CUmodule();
    cuModuleLoadData(module, ptx[0]);
    CUfunction function = new CUfunction();
    cuModuleGetFunction(function, module, name);

    return function;
}
 
Example #3
Source File: JCudaDriverSimpleLWJGL.java    From jcuda-samples with MIT License 6 votes vote down vote up
/**
 * Prepares the JCudaDriver for this sample. Note that this has to be
 * called from the same thread that will later use the JCudaDriver API.
 * After this call, 'function' refers to the "simple_vbo_kernel" kernel.
 */
private void initJCuda()
{
    // Enable exceptions for the JCudaDriver calls
    JCudaDriver.setExceptionsEnabled(true);

    // Driver initialization, then a context for the device with index 0
    cuInit(0);
    CUdevice cudaDevice = new CUdevice();
    cuDeviceGet(cudaDevice, 0);
    CUcontext cudaContext = new CUcontext();
    cuCtxCreate(cudaContext, 0, cudaDevice);

    // Name of the PTX file that belongs to the kernel source file
    String ptxPath = JCudaSamplesUtils.preparePtxFile(
        "src/main/resources/kernels/JCudaDriverSimpleGLKernel.cu");

    // The module is loaded from that PTX file
    CUmodule ptxModule = new CUmodule();
    cuModuleLoad(ptxModule, ptxPath);

    // Fetch the kernel function from the module and keep it, so that
    // it can be called later, during the animation.
    function = new CUfunction();
    cuModuleGetFunction(function, ptxModule, "simple_vbo_kernel");
}
 
Example #4
Source File: CUDAInnerLoop.java    From ocular with GNU General Public License v3.0 6 votes vote down vote up
/**
 * Uploads the zero-extended white and black observations to the device,
 * launches one "compute_emissions_&lt;width&gt;" kernel for each template
 * width that has at least one template index, and finally copies
 * {@code sequenceLength*totalTemplateNumIndices} floats from the device
 * score buffer into {@code scores}.
 *
 * @param scores host array that receives the scores
 * @param whiteObservations host data uploaded (zero-extended) to d_Ow
 * @param blackObservations host data uploaded (zero-extended) to d_Ob
 * @param sequenceLength sequence length; also passed to every kernel
 */
public void compute(final float[] scores, final float[] whiteObservations, final float[] blackObservations, final int sequenceLength) {
	// Number of blocks in x, and the sequence length rounded up to a
	// whole multiple of BLOCK_SIZE_X*ROLL_X
	int gridSizeX = (int) Math.ceil(((double) sequenceLength) / (BLOCK_SIZE_X*ROLL_X));
	int extendedSeqLength = gridSizeX * (BLOCK_SIZE_X*ROLL_X);

	// Upload both observation arrays, extended with zeros
	cuMemcpyHtoD(
			d_Ow,
			Pointer.to(CudaUtil.extendWithZeros(whiteObservations, (extendedSeqLength+maxTemplateWidth-1)*CharacterTemplate.LINE_HEIGHT)),
			(extendedSeqLength+maxTemplateWidth-1)*CharacterTemplate.LINE_HEIGHT * Sizeof.FLOAT);
	cuMemcpyHtoD(
			d_Ob,
			Pointer.to(CudaUtil.extendWithZeros(blackObservations, (extendedSeqLength+maxTemplateWidth-1)*CharacterTemplate.LINE_HEIGHT)),
			(extendedSeqLength+maxTemplateWidth-1)*CharacterTemplate.LINE_HEIGHT * Sizeof.FLOAT);

	for (int tw=minTemplateWidth; tw<=maxTemplateWidth; ++tw) {
		// Position of this width in the per-width arrays
		int widthIndex = tw-minTemplateWidth;
		if (templateNumIndices[widthIndex] <= 0) {
			// No template indices for this width: no kernel launch
			continue;
		}

		// Each width has its own kernel, selected by name
		CUfunction function = new CUfunction();
		cuModuleGetFunction(function, cudaModule, "compute_emissions_"+tw);
		JCudaDriver.cuFuncSetCacheConfig(function, CUfunc_cache.CU_FUNC_CACHE_PREFER_SHARED);
		JCudaDriver.cuFuncSetSharedMemConfig(function, CUsharedconfig.CU_SHARED_MEM_CONFIG_FOUR_BYTE_BANK_SIZE);

		Pointer kernelParameters = Pointer.to(
				Pointer.to(new int[] {templateIndicesOffsets[widthIndex]*sequenceLength}),
				Pointer.to(new int[] {sequenceLength}),
				Pointer.to(new int[] {templateNumIndices[widthIndex]}),
				Pointer.to(d_Tw[widthIndex]),
				Pointer.to(d_Tb[widthIndex]),
				Pointer.to(d_Ow),
				Pointer.to(d_Ob),
				Pointer.to(d_scores));

		// Enough blocks in y to cover all template indices of this width
		int gridSizeY = (int) Math.ceil(((double) templateNumIndices[widthIndex]) / BLOCK_SIZE_Y);
		cuLaunchKernel(function,
				gridSizeX, gridSizeY, 1,        // Grid dimension
				BLOCK_SIZE_X, BLOCK_SIZE_Y, 1,  // Block dimension
				0, null,                        // Shared memory size and stream
				kernelParameters, null          // Kernel- and extra parameters
				);
	}

	// Download the scores into the caller's array
	cuMemcpyDtoH(Pointer.to(scores), d_scores, sequenceLength*totalTemplateNumIndices * Sizeof.FLOAT);
}
 
Example #5
Source File: JCudaAbstractKernelTest.java    From jcuda with MIT License 6 votes vote down vote up
/**
 * Prepares a PTX file for the given .CU file, loads it as a module
 * and returns the function with the given name from that module.
 * A context for the device with index 0 is created along the way.
 * 
 * @param cuFileName The .CU file name
 * @param functionName The kernel function name
 * @return The function
 * @throws CudaException If an error occurs
 */
protected final CUfunction initialize(
    String cuFileName, String functionName)
{
    // With exceptions enabled, no explicit error checks are done below
    JCudaDriver.setExceptionsEnabled(true);

    // Driver initialization, and a context for the first device
    cuInit(0);
    CUdevice firstDevice = new CUdevice();
    cuDeviceGet(firstDevice, 0);
    CUcontext deviceContext = new CUcontext();
    cuCtxCreate(deviceContext, 0, firstDevice);

    // Obtain the PTX file for the .CU file and load it as a module
    String ptxPath = JCudaTestUtils.preparePtxFile(cuFileName);
    CUmodule ptxModule = new CUmodule();
    cuModuleLoad(ptxModule, ptxPath);

    // Look up the requested kernel function in the module
    CUfunction kernel = new CUfunction();
    cuModuleGetFunction(kernel, ptxModule, functionName);
    return kernel;
}
 
Example #6
Source File: Context.java    From OSPREY3 with GNU General Public License v2.0 5 votes vote down vote up
/**
 * Launches the given kernel function with a grid of
 * {@code gridBlocks x 1 x 1} blocks of {@code blockThreads x 1 x 1}
 * threads each, on the stream obtained from the given GpuStream.
 *
 * @param func the kernel function to launch
 * @param gridBlocks grid size in x (y and z are fixed to 1)
 * @param blockThreads block size in x (y and z are fixed to 1)
 * @param sharedMemBytes shared memory size passed to the launch
 * @param pArgs the kernel parameters
 * @param stream provides the stream that is passed to the launch
 */
public void launchKernel(CUfunction func, int gridBlocks, int blockThreads, int sharedMemBytes, Pointer pArgs, GpuStream stream) {
	JCudaDriver.cuLaunchKernel(func,
		gridBlocks, 1, 1,                    // grid dimensions
		blockThreads, 1, 1,                  // block dimensions
		sharedMemBytes, stream.getStream(),  // shared memory size and stream
		pArgs, null                          // kernel parameters, no extra parameters
	);
}
 
Example #7
Source File: Kernel.java    From OSPREY3 with GNU General Public License v2.0 5 votes vote down vote up
/**
 * Creates a function handle for the kernel with the given name, looked
 * up in 'module'. The launch settings start out as: no arguments, one
 * block, one thread per block, and a SharedMemCalculator.None.
 *
 * @param name the name of the kernel function in the module
 */
public Function(String name) {
	// Initial launch settings
	pArgs = null;
	numBlocks = 1;
	blockThreads = 1;
	sharedMemCalc = new SharedMemCalculator.None();

	// Resolve the kernel function by name
	func = new CUfunction();
	JCudaDriver.cuModuleGetFunction(func, module, name);
}
 
Example #8
Source File: CublasUtil.java    From murphy with Apache License 2.0 5 votes vote down vote up
/**
 * Launches the "vectorSqrt" kernel of the helper module for all
 * {@code A.rows*A.cols} elements. The kernel receives the device data
 * of A, the device data of B and the element count, in this order.
 * Returns without launching anything when the element count is 0,
 * because a launch with zero-sized grid and block dimensions is invalid.
 *
 * @param A matrix whose size determines the element count; its device data is the first kernel argument
 * @param B matrix whose device data is the second kernel argument
 */
private static void sqrt(Matrix A, Matrix B) {
	int n = A.rows*A.cols;
	if (n == 0) {
		// Empty matrix: blockSize and gridSizeX would both be 0
		return;
	}
	CUfunction function = new CUfunction();
	cuModuleGetFunction(function, helperModule, "vectorSqrt");
	Pointer kernelParameters = Pointer.to(
			Pointer.to(A.data_d),
			Pointer.to(B.data_d),
			Pointer.to(new int[] {n}));
	// One thread per element, at most BLOCK_SIZE threads per block
	int blockSize = Math.min(n, BLOCK_SIZE);
	int gridSizeX = (int) Math.ceil((double) n / blockSize);
	cuLaunchKernel(function,
			gridSizeX, 1, 1,       // Grid dimension
			blockSize, 1, 1,       // Block dimension
			0, null,               // Shared memory size and stream
			kernelParameters, null // Kernel- and extra parameters
			);
	if (DEBUG_SYNC) {
		JCudaDriver.cuCtxSynchronize();
	}
}
 
Example #9
Source File: CublasUtil.java    From murphy with Apache License 2.0 5 votes vote down vote up
/**
 * Launches the "vectorSqr" kernel of the helper module for all
 * {@code A.rows*A.cols} elements. The kernel receives the device data
 * of A, the device data of B and the element count, in this order.
 * Returns without launching anything when the element count is 0,
 * because a launch with zero-sized grid and block dimensions is invalid.
 *
 * @param A matrix whose size determines the element count; its device data is the first kernel argument
 * @param B matrix whose device data is the second kernel argument
 */
private static void sqr(Matrix A, Matrix B) {
	int n = A.rows*A.cols;
	if (n == 0) {
		// Empty matrix: blockSize and gridSizeX would both be 0
		return;
	}
	CUfunction function = new CUfunction();
	cuModuleGetFunction(function, helperModule, "vectorSqr");
	Pointer kernelParameters = Pointer.to(
			Pointer.to(A.data_d),
			Pointer.to(B.data_d),
			Pointer.to(new int[] {n}));
	// One thread per element, at most BLOCK_SIZE threads per block
	int blockSize = Math.min(n, BLOCK_SIZE);
	int gridSizeX = (int) Math.ceil((double) n / blockSize);
	cuLaunchKernel(function,
			gridSizeX, 1, 1,       // Grid dimension
			blockSize, 1, 1,       // Block dimension
			0, null,               // Shared memory size and stream
			kernelParameters, null // Kernel- and extra parameters
			);
	if (DEBUG_SYNC) {
		JCudaDriver.cuCtxSynchronize();
	}
}
 
Example #10
Source File: CublasUtil.java    From murphy with Apache License 2.0 5 votes vote down vote up
/**
 * Launches the "vectorPow" kernel of the helper module for all
 * {@code A.rows*A.cols} elements. The kernel receives the device data
 * of A, the device data of B, the value and the element count, in this
 * order. Returns without launching anything when the element count is
 * 0, because a launch with zero-sized grid and block dimensions is
 * invalid.
 *
 * @param A matrix whose size determines the element count; its device data is the first kernel argument
 * @param B matrix whose device data is the second kernel argument
 * @param val scalar passed to the kernel as third argument
 */
private static void pow(Matrix A, Matrix B, float val) {
	int n = A.rows*A.cols;
	if (n == 0) {
		// Empty matrix: blockSize and gridSizeX would both be 0
		return;
	}
	CUfunction function = new CUfunction();
	cuModuleGetFunction(function, helperModule, "vectorPow");
	Pointer kernelParameters = Pointer.to(
			Pointer.to(A.data_d),
			Pointer.to(B.data_d),
			Pointer.to(new float[] {val}),
			Pointer.to(new int[] {n}));
	// One thread per element, at most BLOCK_SIZE threads per block
	int blockSize = Math.min(n, BLOCK_SIZE);
	int gridSizeX = (int) Math.ceil((double) n / blockSize);
	cuLaunchKernel(function,
			gridSizeX, 1, 1,       // Grid dimension
			blockSize, 1, 1,       // Block dimension
			0, null,               // Shared memory size and stream
			kernelParameters, null // Kernel- and extra parameters
			);
	if (DEBUG_SYNC) {
		JCudaDriver.cuCtxSynchronize();
	}
}
 
Example #11
Source File: CublasUtil.java    From murphy with Apache License 2.0 5 votes vote down vote up
/**
 * Launches the "vectorMin" kernel of the helper module for all
 * {@code A.rows*A.cols} elements. The kernel receives the device data
 * of A, the device data of B, the value and the element count, in this
 * order. Returns without launching anything when the element count is
 * 0, because a launch with zero-sized grid and block dimensions is
 * invalid.
 *
 * @param A matrix whose size determines the element count; its device data is the first kernel argument
 * @param B matrix whose device data is the second kernel argument
 * @param val scalar passed to the kernel as third argument
 */
private static void min(Matrix A, Matrix B, float val) {
	int n = A.rows*A.cols;
	if (n == 0) {
		// Empty matrix: blockSize and gridSizeX would both be 0
		return;
	}
	CUfunction function = new CUfunction();
	cuModuleGetFunction(function, helperModule, "vectorMin");
	Pointer kernelParameters = Pointer.to(
			Pointer.to(A.data_d),
			Pointer.to(B.data_d),
			Pointer.to(new float[] {val}),
			Pointer.to(new int[] {n}));
	// One thread per element, at most BLOCK_SIZE threads per block
	int blockSize = Math.min(n, BLOCK_SIZE);
	int gridSizeX = (int) Math.ceil((double) n / blockSize);
	cuLaunchKernel(function,
			gridSizeX, 1, 1,       // Grid dimension
			blockSize, 1, 1,       // Block dimension
			0, null,               // Shared memory size and stream
			kernelParameters, null // Kernel- and extra parameters
			);
	if (DEBUG_SYNC) {
		JCudaDriver.cuCtxSynchronize();
	}
}
 
Example #12
Source File: CublasUtil.java    From murphy with Apache License 2.0 5 votes vote down vote up
/**
 * Launches the "vectorMax" kernel of the helper module for all
 * {@code A.rows*A.cols} elements. The kernel receives the device data
 * of A, the device data of B, the value and the element count, in this
 * order. Returns without launching anything when the element count is
 * 0, because a launch with zero-sized grid and block dimensions is
 * invalid.
 *
 * @param A matrix whose size determines the element count; its device data is the first kernel argument
 * @param B matrix whose device data is the second kernel argument
 * @param val scalar passed to the kernel as third argument
 */
private static void max(Matrix A, Matrix B, float val) {
	int n = A.rows*A.cols;
	if (n == 0) {
		// Empty matrix: blockSize and gridSizeX would both be 0
		return;
	}
	CUfunction function = new CUfunction();
	cuModuleGetFunction(function, helperModule, "vectorMax");
	Pointer kernelParameters = Pointer.to(
			Pointer.to(A.data_d),
			Pointer.to(B.data_d),
			Pointer.to(new float[] {val}),
			Pointer.to(new int[] {n}));
	// One thread per element, at most BLOCK_SIZE threads per block
	int blockSize = Math.min(n, BLOCK_SIZE);
	int gridSizeX = (int) Math.ceil((double) n / blockSize);
	cuLaunchKernel(function,
			gridSizeX, 1, 1,       // Grid dimension
			blockSize, 1, 1,       // Block dimension
			0, null,               // Shared memory size and stream
			kernelParameters, null // Kernel- and extra parameters
			);
	if (DEBUG_SYNC) {
		JCudaDriver.cuCtxSynchronize();
	}
}
 
Example #13
Source File: CublasUtil.java    From murphy with Apache License 2.0 5 votes vote down vote up
/**
 * Launches the "vectorMul" kernel of the helper module for all
 * {@code A.rows*A.cols} elements. The kernel receives the device data
 * of A, B and C and the element count, in this order. Returns without
 * launching anything when the element count is 0, because a launch
 * with zero-sized grid and block dimensions is invalid.
 *
 * @param A matrix whose size determines the element count; its device data is the first kernel argument
 * @param B matrix whose device data is the second kernel argument
 * @param C matrix whose device data is the third kernel argument
 */
private static void mul(Matrix A, Matrix B, Matrix C) {
	int n = A.rows*A.cols;
	if (n == 0) {
		// Empty matrix: blockSize and gridSizeX would both be 0
		return;
	}
	CUfunction function = new CUfunction();
	cuModuleGetFunction(function, helperModule, "vectorMul");
	Pointer kernelParameters = Pointer.to(
			Pointer.to(A.data_d),
			Pointer.to(B.data_d),
			Pointer.to(C.data_d),
			Pointer.to(new int[] {n}));
	// One thread per element, at most BLOCK_SIZE threads per block
	int blockSize = Math.min(n, BLOCK_SIZE);
	int gridSizeX = (int) Math.ceil((double) n / blockSize);
	cuLaunchKernel(function,
			gridSizeX, 1, 1,       // Grid dimension
			blockSize, 1, 1,       // Block dimension
			0, null,               // Shared memory size and stream
			kernelParameters, null // Kernel- and extra parameters
			);
	if (DEBUG_SYNC) {
		JCudaDriver.cuCtxSynchronize();
	}
}
 
Example #14
Source File: CublasUtil.java    From murphy with Apache License 2.0 5 votes vote down vote up
/**
 * Launches the "vectorAbs" kernel of the helper module for all
 * {@code A.rows*A.cols} elements. The kernel receives the device data
 * of A, the device data of B and the element count, in this order.
 * Returns without launching anything when the element count is 0,
 * because a launch with zero-sized grid and block dimensions is invalid.
 *
 * @param A matrix whose size determines the element count; its device data is the first kernel argument
 * @param B matrix whose device data is the second kernel argument
 */
private static void abs(Matrix A, Matrix B) {
	int n = A.rows*A.cols;
	if (n == 0) {
		// Empty matrix: blockSize and gridSizeX would both be 0
		return;
	}
	CUfunction function = new CUfunction();
	cuModuleGetFunction(function, helperModule, "vectorAbs");
	Pointer kernelParameters = Pointer.to(
			Pointer.to(A.data_d),
			Pointer.to(B.data_d),
			Pointer.to(new int[] {n}));
	// One thread per element, at most BLOCK_SIZE threads per block
	int blockSize = Math.min(n, BLOCK_SIZE);
	int gridSizeX = (int) Math.ceil((double) n / blockSize);
	cuLaunchKernel(function,
			gridSizeX, 1, 1,       // Grid dimension
			blockSize, 1, 1,       // Block dimension
			0, null,               // Shared memory size and stream
			kernelParameters, null // Kernel- and extra parameters
			);
	if (DEBUG_SYNC) {
		JCudaDriver.cuCtxSynchronize();
	}
}
 
Example #15
Source File: JCudaReduction.java    From jcuda-samples with MIT License 5 votes vote down vote up
/**
 * Sets up everything this sample needs: the driver API, a context for
 * the first device, the module with the reduction kernel, the "reduce"
 * function and a temporary device buffer.
 */
private static void init()
{
    // Driver initialization, and a context for the device with index 0
    cuInit(0);
    CUdevice firstDevice = new CUdevice();
    cuDeviceGet(firstDevice, 0);
    context = new CUcontext();
    cuCtxCreate(context, 0, firstDevice);

    // Obtain the name of the PTX file for the kernel source file
    String reductionPtxFile = JCudaSamplesUtils.preparePtxFile(
        "src/main/resources/kernels/JCudaReductionKernel.cu");

    // The module is loaded from that PTX file
    module = new CUmodule();
    cuModuleLoad(module, reductionPtxFile);

    // Look up the "reduce" function in the module
    function = new CUfunction();
    cuModuleGetFunction(function, module, "reduce");

    // Temporary device memory with room for 1024 floats (it must be
    // at least numberOfBlocks * Sizeof.FLOAT bytes large)
    deviceBuffer = new CUdeviceptr();
    cuMemAlloc(deviceBuffer, 1024 * Sizeof.FLOAT);
}
 
Example #16
Source File: JCudaDriverStreamCallbacks.java    From jcuda-samples with MIT License 5 votes vote down vote up
/**
 * Initialize the driver API, the {@link #context} and the 
 * kernel {@link #function}.
 * <p>
 * The kernel is compiled from {@code programSourceCode} using the NVRTC.
 * Exceptions are enabled for both libraries here, so compiling or 
 * obtaining the PTX may throw; the NVRTC program is destroyed in that
 * case as well.
 */
private static void initialize()
{
    System.out.println("Initializing...");
    
    JCudaDriver.setExceptionsEnabled(true);
    JNvrtc.setExceptionsEnabled(true);

    // Initialize the driver and create a context for the first device
    cuInit(0);
    CUdevice device = new CUdevice();
    cuDeviceGet(device, 0);
    context = new CUcontext();
    cuCtxCreate(context, 0, device);

    // Compile the source code to PTX
    nvrtcProgram program = new nvrtcProgram();
    nvrtcCreateProgram(
        program, programSourceCode, null, 0, null, null);
    String[] ptx = new String[1];
    try
    {
        nvrtcCompileProgram(program, 0, null);
        nvrtcGetPTX(program, ptx);
    }
    finally
    {
        // Release the NVRTC program even when one of the calls above threw
        nvrtcDestroyProgram(program);
    }

    // Load the PTX as a module and look up the kernel function
    CUmodule module = new CUmodule();
    cuModuleLoadData(module, ptx[0]);

    function = new CUfunction();
    cuModuleGetFunction(function, module, "example");
    
    System.out.println("Initializing DONE");
}
 
Example #17
Source File: CublasUtil.java    From murphy with Apache License 2.0 5 votes vote down vote up
/**
 * Launches the "vectorScalarSet" kernel of the helper module for all
 * {@code A.rows*A.cols} elements. The kernel receives the device data
 * of A, the scalar and the element count, in this order. Returns
 * without launching anything when the element count is 0, because a
 * launch with zero-sized grid and block dimensions is invalid.
 *
 * @param A matrix whose size determines the element count; its device data is the first kernel argument
 * @param alpha scalar passed to the kernel as second argument
 */
private static void scalarSet(Matrix A, float alpha) {
	int n = A.rows*A.cols;
	if (n == 0) {
		// Empty matrix: blockSize and gridSizeX would both be 0
		return;
	}
	CUfunction function = new CUfunction();
	cuModuleGetFunction(function, helperModule, "vectorScalarSet");
	Pointer kernelParameters = Pointer.to(
			Pointer.to(A.data_d),
			Pointer.to(new float[] {alpha}),
			Pointer.to(new int[] {n}));
	// One thread per element, at most BLOCK_SIZE threads per block
	int blockSize = Math.min(n, BLOCK_SIZE);
	int gridSizeX = (int) Math.ceil((double) n / blockSize);
	cuLaunchKernel(function,
			gridSizeX, 1, 1,       // Grid dimension
			blockSize, 1, 1,       // Block dimension
			0, null,               // Shared memory size and stream
			kernelParameters, null // Kernel- and extra parameters
			);
	if (DEBUG_SYNC) {
		JCudaDriver.cuCtxSynchronize();
	}
}
 
Example #18
Source File: CublasUtil.java    From murphy with Apache License 2.0 5 votes vote down vote up
/**
 * Launches the "vectorScalarAdd" kernel of the helper module for all
 * {@code A.rows*A.cols} elements. The kernel receives the device data
 * of A, the device data of B, the scalar and the element count, in this
 * order. Returns without launching anything when the element count is
 * 0, because a launch with zero-sized grid and block dimensions is
 * invalid.
 *
 * @param A matrix whose size determines the element count; its device data is the first kernel argument
 * @param alpha scalar passed to the kernel as third argument
 * @param B matrix whose device data is the second kernel argument
 */
private static void scalarAdd(Matrix A, float alpha, Matrix B) {
	int n = A.rows*A.cols;
	if (n == 0) {
		// Empty matrix: blockSize and gridSizeX would both be 0
		return;
	}
	CUfunction function = new CUfunction();
	cuModuleGetFunction(function, helperModule, "vectorScalarAdd");
	Pointer kernelParameters = Pointer.to(
			Pointer.to(A.data_d),
			Pointer.to(B.data_d),
			Pointer.to(new float[] {alpha}),
			Pointer.to(new int[] {n}));
	// One thread per element, at most BLOCK_SIZE threads per block
	int blockSize = Math.min(n, BLOCK_SIZE);
	int gridSizeX = (int) Math.ceil((double) n / blockSize);
	cuLaunchKernel(function,
			gridSizeX, 1, 1,       // Grid dimension
			blockSize, 1, 1,       // Block dimension
			0, null,               // Shared memory size and stream
			kernelParameters, null // Kernel- and extra parameters
			);
	if (DEBUG_SYNC) {
		JCudaDriver.cuCtxSynchronize();
	}
}
 
Example #19
Source File: CublasUtil.java    From murphy with Apache License 2.0 5 votes vote down vote up
/**
 * Launches the "vectorLog" kernel of the helper module for all
 * {@code A.rows*A.cols} elements. The kernel receives the device data
 * of A, the device data of B and the element count, in this order.
 * Returns without launching anything when the element count is 0,
 * because a launch with zero-sized grid and block dimensions is invalid.
 *
 * @param A matrix whose size determines the element count; its device data is the first kernel argument
 * @param B matrix whose device data is the second kernel argument
 */
private static void log(Matrix A, Matrix B) {
	int n = A.rows*A.cols;
	if (n == 0) {
		// Empty matrix: blockSize and gridSizeX would both be 0
		return;
	}
	CUfunction function = new CUfunction();
	cuModuleGetFunction(function, helperModule, "vectorLog");
	Pointer kernelParameters = Pointer.to(
			Pointer.to(A.data_d),
			Pointer.to(B.data_d),
			Pointer.to(new int[] {n}));
	// One thread per element, at most BLOCK_SIZE threads per block
	int blockSize = Math.min(n, BLOCK_SIZE);
	int gridSizeX = (int) Math.ceil((double) n / blockSize);
	cuLaunchKernel(function,
			gridSizeX, 1, 1,       // Grid dimension
			blockSize, 1, 1,       // Block dimension
			0, null,               // Shared memory size and stream
			kernelParameters, null // Kernel- and extra parameters
			);
	if (DEBUG_SYNC) {
		JCudaDriver.cuCtxSynchronize();
	}
}
 
Example #20
Source File: CublasUtil.java    From murphy with Apache License 2.0 5 votes vote down vote up
/**
 * Launches the "vectorExp" kernel of the helper module for all
 * {@code A.rows*A.cols} elements. The kernel receives the device data
 * of A, the device data of B and the element count, in this order.
 * Returns without launching anything when the element count is 0,
 * because a launch with zero-sized grid and block dimensions is invalid.
 *
 * @param A matrix whose size determines the element count; its device data is the first kernel argument
 * @param B matrix whose device data is the second kernel argument
 */
private static void exp(Matrix A, Matrix B) {
	int n = A.rows*A.cols;
	if (n == 0) {
		// Empty matrix: blockSize and gridSizeX would both be 0
		return;
	}
	CUfunction function = new CUfunction();
	cuModuleGetFunction(function, helperModule, "vectorExp");
	Pointer kernelParameters = Pointer.to(
			Pointer.to(A.data_d),
			Pointer.to(B.data_d),
			Pointer.to(new int[] {n}));
	// One thread per element, at most BLOCK_SIZE threads per block
	int blockSize = Math.min(n, BLOCK_SIZE);
	int gridSizeX = (int) Math.ceil((double) n / blockSize);
	cuLaunchKernel(function,
			gridSizeX, 1, 1,       // Grid dimension
			blockSize, 1, 1,       // Block dimension
			0, null,               // Shared memory size and stream
			kernelParameters, null // Kernel- and extra parameters
			);
	if (DEBUG_SYNC) {
		JCudaDriver.cuCtxSynchronize();
	}
}
 
Example #21
Source File: CublasUtil.java    From murphy with Apache License 2.0 5 votes vote down vote up
/**
 * Launches the "vectorSign" kernel of the helper module for all
 * {@code A.rows*A.cols} elements. The kernel receives the device data
 * of A, the device data of B and the element count, in this order.
 * Returns without launching anything when the element count is 0,
 * because a launch with zero-sized grid and block dimensions is invalid.
 *
 * @param A matrix whose size determines the element count; its device data is the first kernel argument
 * @param B matrix whose device data is the second kernel argument
 */
private static void sign(Matrix A, Matrix B) {
	int n = A.rows*A.cols;
	if (n == 0) {
		// Empty matrix: blockSize and gridSizeX would both be 0
		return;
	}
	CUfunction function = new CUfunction();
	cuModuleGetFunction(function, helperModule, "vectorSign");
	Pointer kernelParameters = Pointer.to(
			Pointer.to(A.data_d),
			Pointer.to(B.data_d),
			Pointer.to(new int[] {n}));
	// One thread per element, at most BLOCK_SIZE threads per block
	int blockSize = Math.min(n, BLOCK_SIZE);
	int gridSizeX = (int) Math.ceil((double) n / blockSize);
	cuLaunchKernel(function,
			gridSizeX, 1, 1,       // Grid dimension
			blockSize, 1, 1,       // Block dimension
			0, null,               // Shared memory size and stream
			kernelParameters, null // Kernel- and extra parameters
			);
	if (DEBUG_SYNC) {
		JCudaDriver.cuCtxSynchronize();
	}
}
 
Example #22
Source File: CublasUtil.java    From murphy with Apache License 2.0 5 votes vote down vote up
/**
 * Launches the "vectorDiv" kernel of the helper module for all
 * {@code A.rows*A.cols} elements. The kernel receives the device data
 * of A, B and C and the element count, in this order. Returns without
 * launching anything when the element count is 0, because a launch
 * with zero-sized grid and block dimensions is invalid.
 *
 * @param A matrix whose size determines the element count; its device data is the first kernel argument
 * @param B matrix whose device data is the second kernel argument
 * @param C matrix whose device data is the third kernel argument
 */
private static void div(Matrix A, Matrix B, Matrix C) {
	int n = A.rows*A.cols;
	if (n == 0) {
		// Empty matrix: blockSize and gridSizeX would both be 0
		return;
	}
	CUfunction function = new CUfunction();
	cuModuleGetFunction(function, helperModule, "vectorDiv");
	Pointer kernelParameters = Pointer.to(
			Pointer.to(A.data_d),
			Pointer.to(B.data_d),
			Pointer.to(C.data_d),
			Pointer.to(new int[] {n}));
	// One thread per element, at most BLOCK_SIZE threads per block
	int blockSize = Math.min(n, BLOCK_SIZE);
	int gridSizeX = (int) Math.ceil((double) n / blockSize);
	cuLaunchKernel(function,
			gridSizeX, 1, 1,       // Grid dimension
			blockSize, 1, 1,       // Block dimension
			0, null,               // Shared memory size and stream
			kernelParameters, null // Kernel- and extra parameters
			);
	if (DEBUG_SYNC) {
		JCudaDriver.cuCtxSynchronize();
	}
}
 
Example #23
Source File: JCudaDriverTextureTest.java    From jcuda with MIT License 4 votes vote down vote up
/**
 * Test the 1D float texture access: the host input is copied into a
 * CUDA array that is bound to the "texture_float_1D" texture reference,
 * the "test_float_1D" kernel is run with a single thread, and its
 * output is compared to the expected value.
 *
 * @return Whether the output was equal to the expected value
 */
private boolean test_float_1D()
{
    // Describe and create a float array with one channel on the device
    CUDA_ARRAY_DESCRIPTOR descriptor = new CUDA_ARRAY_DESCRIPTOR();
    descriptor.Format = CU_AD_FORMAT_FLOAT;
    descriptor.NumChannels = 1;
    descriptor.Width = sizeX;
    descriptor.Height = 1;
    CUarray textureArray = new CUarray();
    cuArrayCreate(textureArray, descriptor);

    // Fill the array with the host input
    cuMemcpyHtoA(textureArray, 0,
        Pointer.to(input_float_1D), sizeX * Sizeof.FLOAT);

    // Configure the texture reference and attach the array to it
    CUtexref textureRef = new CUtexref();
    cuModuleGetTexRef(textureRef, module, "texture_float_1D");
    cuTexRefSetFilterMode(textureRef, CU_TR_FILTER_MODE_LINEAR);
    cuTexRefSetAddressMode(textureRef, 0, CU_TR_ADDRESS_MODE_CLAMP);
    cuTexRefSetFlags(textureRef, CU_TRSF_NORMALIZED_COORDINATES);
    cuTexRefSetFormat(textureRef, CU_AD_FORMAT_FLOAT, 1);
    cuTexRefSetArray(textureRef, textureArray, CU_TRSA_OVERRIDE_FORMAT);

    // Device memory for the single output value
    CUdeviceptr deviceResult = new CUdeviceptr();
    cuMemAlloc(deviceResult, Sizeof.FLOAT * 1);

    // Look up the kernel that performs the texture access
    CUfunction kernel = new CUfunction();
    cuModuleGetFunction(kernel, module, "test_float_1D");

    // The kernel receives the output pointer and the lookup position
    Pointer kernelArgs = Pointer.to(
        Pointer.to(deviceResult),
        Pointer.to(new float[]{ posX })
    );

    // Run the kernel with one block of one thread, and wait for it
    cuLaunchKernel(kernel,
        1, 1, 1,
        1, 1, 1,
        0, null, kernelArgs, null);
    cuCtxSynchronize();

    // Copy the output back to the host
    float[] hostResult = new float[1];
    cuMemcpyDtoH(Pointer.to(hostResult), deviceResult, Sizeof.FLOAT * 1);

    // Report the output and whether it equals the expected value
    log("Result float  1D " + Arrays.toString(hostResult));
    float[] expectedResult = { 0.5f };
    boolean passed = Arrays.equals(hostResult, expectedResult);
    String verdict = passed ? "PASSED" : "FAILED";
    log("Test   float  1D " + verdict);

    // Release the device resources
    cuArrayDestroy(textureArray);
    cuMemFree(deviceResult);

    return passed;
}
 
Example #24
Source File: JCudaDriverTextureTest.java    From jcuda with MIT License 4 votes vote down vote up
/**
 * Test the 2D float texture access: the host input is copied into a
 * CUDA array that is bound to the "texture_float_2D" texture reference,
 * the "test_float_2D" kernel is run with a single thread, and its
 * output is compared to the expected value.
 *
 * @return Whether the output was equal to the expected value
 */
private boolean test_float_2D()
{
    // Describe and create a float array with one channel on the device
    CUDA_ARRAY_DESCRIPTOR descriptor = new CUDA_ARRAY_DESCRIPTOR();
    descriptor.Format = CU_AD_FORMAT_FLOAT;
    descriptor.NumChannels = 1;
    descriptor.Width = sizeX;
    descriptor.Height = sizeY;
    CUarray textureArray = new CUarray();
    cuArrayCreate(textureArray, descriptor);

    // Fill the array with the host input, using a 2D copy
    CUDA_MEMCPY2D hostToArray = new CUDA_MEMCPY2D();
    hostToArray.srcMemoryType = CUmemorytype.CU_MEMORYTYPE_HOST;
    hostToArray.srcHost = Pointer.to(input_float_2D);
    hostToArray.srcPitch = sizeX * Sizeof.FLOAT;
    hostToArray.dstMemoryType = CUmemorytype.CU_MEMORYTYPE_ARRAY;
    hostToArray.dstArray = textureArray;
    hostToArray.WidthInBytes = sizeX * Sizeof.FLOAT;
    hostToArray.Height = sizeY;
    cuMemcpy2D(hostToArray);

    // Configure the texture reference and attach the array to it
    CUtexref textureRef = new CUtexref();
    cuModuleGetTexRef(textureRef, module, "texture_float_2D");
    cuTexRefSetFilterMode(textureRef, CU_TR_FILTER_MODE_LINEAR);
    cuTexRefSetAddressMode(textureRef, 0, CU_TR_ADDRESS_MODE_CLAMP);
    cuTexRefSetAddressMode(textureRef, 1, CU_TR_ADDRESS_MODE_CLAMP);
    cuTexRefSetFlags(textureRef, CU_TRSF_NORMALIZED_COORDINATES);
    cuTexRefSetFormat(textureRef, CU_AD_FORMAT_FLOAT, 1);
    cuTexRefSetArray(textureRef, textureArray, CU_TRSA_OVERRIDE_FORMAT);

    // Device memory for the single output value
    CUdeviceptr deviceResult = new CUdeviceptr();
    cuMemAlloc(deviceResult, Sizeof.FLOAT * 1);

    // Look up the kernel that performs the texture access
    CUfunction kernel = new CUfunction();
    cuModuleGetFunction(kernel, module, "test_float_2D");

    // The kernel receives the output pointer and the lookup position
    Pointer kernelArgs = Pointer.to(
        Pointer.to(deviceResult),
        Pointer.to(new float[]{ posX }),
        Pointer.to(new float[]{ posY })
    );

    // Run the kernel with one block of one thread, and wait for it
    cuLaunchKernel(kernel,
        1, 1, 1,
        1, 1, 1,
        0, null, kernelArgs, null);
    cuCtxSynchronize();

    // Copy the output back to the host
    float[] hostResult = new float[1];
    cuMemcpyDtoH(Pointer.to(hostResult), deviceResult, Sizeof.FLOAT * 1);

    // Report the output and whether it equals the expected value
    log("Result float  2D " + Arrays.toString(hostResult));
    float[] expectedResult = { 1.5f };
    boolean passed = Arrays.equals(hostResult, expectedResult);
    String verdict = passed ? "PASSED" : "FAILED";
    log("Test   float  2D " + verdict);

    // Release the device resources
    cuArrayDestroy(textureArray);
    cuMemFree(deviceResult);

    return passed;
}
 
Example #25
Source File: JCudaDriverTextureTest.java    From jcuda with MIT License 4 votes vote down vote up
/**
 * Test the 3D float texture access: the host input is copied into a
 * 3D CUDA array that is bound to the "texture_float_3D" texture 
 * reference, the "test_float_3D" kernel is run with a single thread, 
 * and its output is compared to the expected value.
 *
 * @return Whether the output was equal to the expected value
 */
private boolean test_float_3D()
{
    // Create the array on the device
    CUarray array = new CUarray();
    CUDA_ARRAY3D_DESCRIPTOR ad = new CUDA_ARRAY3D_DESCRIPTOR();
    ad.Format = CU_AD_FORMAT_FLOAT;
    ad.Width = sizeX;
    ad.Height = sizeY;
    ad.Depth = sizeZ;
    ad.NumChannels = 1;
    cuArray3DCreate(array, ad);

    // Copy the host input to the array
    CUDA_MEMCPY3D copy = new CUDA_MEMCPY3D();
    copy.srcMemoryType = CUmemorytype.CU_MEMORYTYPE_HOST;
    copy.srcHost = Pointer.to(input_float_3D);
    copy.srcPitch = sizeX * Sizeof.FLOAT;
    copy.srcHeight = sizeY;
    copy.dstMemoryType = CUmemorytype.CU_MEMORYTYPE_ARRAY;
    copy.dstArray = array;
    // A height is the extent in y. (The CUDA driver API documents 
    // dstHeight as ignored for array destinations, so this only 
    // keeps the descriptor consistent with srcHeight.)
    copy.dstHeight = sizeY;
    copy.WidthInBytes = sizeX * Sizeof.FLOAT;
    copy.Height = sizeY;
    copy.Depth = sizeZ;
    cuMemcpy3D(copy);

    // Set up the texture reference
    CUtexref texref = new CUtexref();
    cuModuleGetTexRef(texref, module, "texture_float_3D");
    cuTexRefSetFilterMode(texref, CU_TR_FILTER_MODE_LINEAR);
    cuTexRefSetAddressMode(texref, 0, CU_TR_ADDRESS_MODE_CLAMP);
    cuTexRefSetAddressMode(texref, 1, CU_TR_ADDRESS_MODE_CLAMP);
    cuTexRefSetAddressMode(texref, 2, CU_TR_ADDRESS_MODE_CLAMP);
    cuTexRefSetFlags(texref, CU_TRSF_NORMALIZED_COORDINATES);
    cuTexRefSetFormat(texref, CU_AD_FORMAT_FLOAT, 1);
    cuTexRefSetArray(texref, array, CU_TRSA_OVERRIDE_FORMAT);

    // Prepare the output device memory
    CUdeviceptr dOutput = new CUdeviceptr();
    cuMemAlloc(dOutput, Sizeof.FLOAT * 1);

    // Obtain the test function
    CUfunction function = new CUfunction();
    cuModuleGetFunction(function, module, "test_float_3D");

    // Set up the kernel parameters: output pointer and lookup position
    Pointer kernelParameters = Pointer.to(
        Pointer.to(dOutput),
        Pointer.to(new float[]{ posX }),
        Pointer.to(new float[]{ posY }),
        Pointer.to(new float[]{ posZ })
    );

    // Call the kernel function with one block of one thread
    cuLaunchKernel(function, 1, 1, 1, 
        1, 1, 1, 0, null, kernelParameters, null);
    cuCtxSynchronize();

    // Obtain the output on the host
    float[] hOutput = new float[1];
    cuMemcpyDtoH(Pointer.to(hOutput), dOutput, Sizeof.FLOAT * 1);

    // Print the results
    log("Result float  3D " + Arrays.toString(hOutput));
    float[] expected = new float[]{ 3.5f };
    boolean passed = Arrays.equals(hOutput, expected);
    log("Test   float  3D " + (passed ? "PASSED" : "FAILED"));

    // Clean up
    cuArrayDestroy(array);
    cuMemFree(dOutput);

    return passed;
}
 
Example #26
Source File: JCudaDriverTextureTest.java    From jcuda with MIT License 4 votes vote down vote up
/**
 * Test the 1D float4 texture access: the host input is copied into a
 * four-channel CUDA array that is bound to the "texture_float4_1D" 
 * texture reference, the "test_float4_1D" kernel is run with a single 
 * thread, and its four output values are compared to the expected ones.
 *
 * @return Whether the output was equal to the expected values
 */
private boolean test_float4_1D()
{
    // Describe and create a float array with four channels on the device
    CUDA_ARRAY_DESCRIPTOR descriptor = new CUDA_ARRAY_DESCRIPTOR();
    descriptor.Format = CU_AD_FORMAT_FLOAT;
    descriptor.NumChannels = 4;
    descriptor.Width = sizeX;
    descriptor.Height = 1;
    CUarray textureArray = new CUarray();
    cuArrayCreate(textureArray, descriptor);

    // Fill the array with the host input (four floats per element)
    cuMemcpyHtoA(textureArray, 0,
        Pointer.to(input_float4_1D), sizeX * Sizeof.FLOAT * 4);

    // Configure the texture reference and attach the array to it
    CUtexref textureRef = new CUtexref();
    cuModuleGetTexRef(textureRef, module, "texture_float4_1D");
    cuTexRefSetFilterMode(textureRef, CU_TR_FILTER_MODE_LINEAR);
    cuTexRefSetAddressMode(textureRef, 0, CU_TR_ADDRESS_MODE_CLAMP);
    cuTexRefSetFlags(textureRef, CU_TRSF_NORMALIZED_COORDINATES);
    cuTexRefSetFormat(textureRef, CU_AD_FORMAT_FLOAT, 4);
    cuTexRefSetArray(textureRef, textureArray, CU_TRSA_OVERRIDE_FORMAT);

    // Device memory for the four output values
    CUdeviceptr deviceResult = new CUdeviceptr();
    cuMemAlloc(deviceResult, Sizeof.FLOAT * 4);

    // Look up the kernel that performs the texture access
    CUfunction kernel = new CUfunction();
    cuModuleGetFunction(kernel, module, "test_float4_1D");

    // The kernel receives the output pointer and the lookup position
    Pointer kernelArgs = Pointer.to(
        Pointer.to(deviceResult),
        Pointer.to(new float[]{ posX })
    );

    // Run the kernel with one block of one thread, and wait for it
    cuLaunchKernel(kernel,
        1, 1, 1,
        1, 1, 1,
        0, null, kernelArgs, null);
    cuCtxSynchronize();

    // Copy the output back to the host
    float[] hostResult = new float[4];
    cuMemcpyDtoH(Pointer.to(hostResult), deviceResult, Sizeof.FLOAT * 4);

    // Report the output and whether it equals the expected values
    log("Result float4 1D " + Arrays.toString(hostResult));
    float[] expectedResult = { 0.5f, 0.5f, 0.5f, 0.5f };
    boolean passed = Arrays.equals(hostResult, expectedResult);
    String verdict = passed ? "PASSED" : "FAILED";
    log("Test   float4 1D " + verdict);

    // Release the device resources
    cuArrayDestroy(textureArray);
    cuMemFree(deviceResult);

    return passed;
}
 
Example #27
Source File: JCudaDriverTextureTest.java    From jcuda with MIT License 4 votes vote down vote up
/**
 * Test the 2D float4 texture access: the host input is copied into a
 * four-channel CUDA array that is bound to the "texture_float4_2D" 
 * texture reference, the "test_float4_2D" kernel is run with a single 
 * thread, and its four output values are compared to the expected ones.
 *
 * @return Whether the output was equal to the expected values
 */
private boolean test_float4_2D()
{
    // Describe and create a float array with four channels on the device
    CUDA_ARRAY_DESCRIPTOR descriptor = new CUDA_ARRAY_DESCRIPTOR();
    descriptor.Format = CU_AD_FORMAT_FLOAT;
    descriptor.NumChannels = 4;
    descriptor.Width = sizeX;
    descriptor.Height = sizeY;
    CUarray textureArray = new CUarray();
    cuArrayCreate(textureArray, descriptor);

    // Fill the array with the host input, using a 2D copy
    // (four floats per element)
    CUDA_MEMCPY2D hostToArray = new CUDA_MEMCPY2D();
    hostToArray.srcMemoryType = CUmemorytype.CU_MEMORYTYPE_HOST;
    hostToArray.srcHost = Pointer.to(input_float4_2D);
    hostToArray.srcPitch = sizeX * Sizeof.FLOAT * 4;
    hostToArray.dstMemoryType = CUmemorytype.CU_MEMORYTYPE_ARRAY;
    hostToArray.dstArray = textureArray;
    hostToArray.WidthInBytes = sizeX * Sizeof.FLOAT * 4;
    hostToArray.Height = sizeY;
    cuMemcpy2D(hostToArray);

    // Configure the texture reference and attach the array to it
    CUtexref textureRef = new CUtexref();
    cuModuleGetTexRef(textureRef, module, "texture_float4_2D");
    cuTexRefSetFilterMode(textureRef, CU_TR_FILTER_MODE_LINEAR);
    cuTexRefSetAddressMode(textureRef, 0, CU_TR_ADDRESS_MODE_CLAMP);
    cuTexRefSetAddressMode(textureRef, 1, CU_TR_ADDRESS_MODE_CLAMP);
    cuTexRefSetFlags(textureRef, CU_TRSF_NORMALIZED_COORDINATES);
    cuTexRefSetFormat(textureRef, CU_AD_FORMAT_FLOAT, 4);
    cuTexRefSetArray(textureRef, textureArray, CU_TRSA_OVERRIDE_FORMAT);

    // Device memory for the four output values
    CUdeviceptr deviceResult = new CUdeviceptr();
    cuMemAlloc(deviceResult, Sizeof.FLOAT * 4);

    // Look up the kernel that performs the texture access
    CUfunction kernel = new CUfunction();
    cuModuleGetFunction(kernel, module, "test_float4_2D");

    // The kernel receives the output pointer and the lookup position
    Pointer kernelArgs = Pointer.to(
        Pointer.to(deviceResult),
        Pointer.to(new float[]{ posX }),
        Pointer.to(new float[]{ posY })
    );

    // Run the kernel with one block of one thread, and wait for it
    cuLaunchKernel(kernel,
        1, 1, 1,
        1, 1, 1,
        0, null, kernelArgs, null);
    cuCtxSynchronize();

    // Copy the output back to the host
    float[] hostResult = new float[4];
    cuMemcpyDtoH(Pointer.to(hostResult), deviceResult, Sizeof.FLOAT * 4);

    // Report the output and whether it equals the expected values
    log("Result float4 2D " + Arrays.toString(hostResult));
    float[] expectedResult = { 1.5f, 1.5f, 1.5f, 1.5f };
    boolean passed = Arrays.equals(hostResult, expectedResult);
    String verdict = passed ? "PASSED" : "FAILED";
    log("Test   float4 2D " + verdict);

    // Release the device resources
    cuArrayDestroy(textureArray);
    cuMemFree(deviceResult);

    return passed;
}
 
Example #28
Source File: JCudaDriverTextureTest.java    From jcuda with MIT License 4 votes vote down vote up
/**
 * Test the 3D float4 texture access.
 * <p>
 * Copies the host data <code>input_float4_3D</code> into a newly created
 * 3D array with 4 float channels, binds that array to the texture
 * reference "texture_float4_3D" of the module, and launches the
 * "test_float4_3D" kernel with one block of one thread, passing the
 * output pointer and the lookup position (posX, posY, posZ). The four
 * floats that are read back from the output memory are compared for
 * exact equality with the expected values.
 * <p>
 * The device array and the output memory are released even when one
 * of the calls in between throws.
 *
 * @return Whether the result matched the expected values
 */
private boolean test_float4_3D()
{
    // Create the array on the device
    CUarray array = new CUarray();
    CUDA_ARRAY3D_DESCRIPTOR ad = new CUDA_ARRAY3D_DESCRIPTOR();
    ad.Format = CU_AD_FORMAT_FLOAT;
    ad.Width = sizeX;
    ad.Height = sizeY;
    ad.Depth = sizeZ;
    ad.NumChannels = 4;
    cuArray3DCreate(array, ad);
    try
    {
        // Copy the host input to the array. One row consists of
        // sizeX elements with 4 floats each.
        int rowSizeInBytes = sizeX * Sizeof.FLOAT * 4;
        CUDA_MEMCPY3D copy = new CUDA_MEMCPY3D();
        copy.srcMemoryType = CUmemorytype.CU_MEMORYTYPE_HOST;
        copy.srcHost = Pointer.to(input_float4_3D);
        copy.srcPitch = rowSizeInBytes;
        copy.srcHeight = sizeY;
        copy.dstMemoryType = CUmemorytype.CU_MEMORYTYPE_ARRAY;
        copy.dstArray = array;
        // The CUDA driver API documents dstHeight as ignored for array
        // destinations; it is set to the array height only so that it
        // agrees with srcHeight and Height.
        copy.dstHeight = sizeY;
        copy.WidthInBytes = rowSizeInBytes;
        copy.Height = sizeY;
        copy.Depth = sizeZ;
        cuMemcpy3D(copy);

        // Set up the texture reference: linear filtering, clamped
        // addressing in all three dimensions, normalized coordinates
        CUtexref texref = new CUtexref();
        cuModuleGetTexRef(texref, module, "texture_float4_3D");
        cuTexRefSetFilterMode(texref, CU_TR_FILTER_MODE_LINEAR);
        cuTexRefSetAddressMode(texref, 0, CU_TR_ADDRESS_MODE_CLAMP);
        cuTexRefSetAddressMode(texref, 1, CU_TR_ADDRESS_MODE_CLAMP);
        cuTexRefSetAddressMode(texref, 2, CU_TR_ADDRESS_MODE_CLAMP);
        cuTexRefSetFlags(texref, CU_TRSF_NORMALIZED_COORDINATES);
        cuTexRefSetFormat(texref, CU_AD_FORMAT_FLOAT, 4);
        cuTexRefSetArray(texref, array, CU_TRSA_OVERRIDE_FORMAT);

        // Prepare the output device memory (one float4)
        CUdeviceptr dOutput = new CUdeviceptr();
        cuMemAlloc(dOutput, Sizeof.FLOAT * 4);
        try
        {
            // Obtain the test function
            CUfunction function = new CUfunction();
            cuModuleGetFunction(function, module, "test_float4_3D");

            // Set up the kernel parameters: the output pointer,
            // followed by the position of the texture lookup
            Pointer kernelParameters = Pointer.to(
                Pointer.to(dOutput),
                Pointer.to(new float[]{ posX }),
                Pointer.to(new float[]{ posY }),
                Pointer.to(new float[]{ posZ })
            );

            // Call the kernel function with a single thread, and
            // wait until it has finished
            cuLaunchKernel(function, 1, 1, 1,
                1, 1, 1, 0, null, kernelParameters, null);
            cuCtxSynchronize();

            // Obtain the output on the host
            float hOutput[] = new float[4];
            cuMemcpyDtoH(Pointer.to(hOutput), dOutput, Sizeof.FLOAT * 4);

            // Print the results
            log("Result float4 3D " + Arrays.toString(hOutput));
            float expected[] = new float[]{ 3.5f, 3.5f, 3.5f, 3.5f };
            boolean passed = Arrays.equals(hOutput, expected);
            log("Test   float4 3D " + (passed ? "PASSED" : "FAILED"));

            return passed;
        }
        finally
        {
            // Clean up the output memory
            cuMemFree(dOutput);
        }
    }
    finally
    {
        // Clean up the array
        cuArrayDestroy(array);
    }
}
 
Example #29
Source File: JCudaConstantMemoryExample.java    From jcuda-samples with MIT License 4 votes vote down vote up
/**
 * Entry point of the constant memory sample.
 * <p>
 * Loads the module that is compiled from the
 * "JCudaConstantMemoryKernel.cu" file, looks up the global symbol
 * "constantMemoryData" in it, fills it with the values 0, 1, 2, ...
 * from the host, and launches the "constantMemoryKernel" with one
 * thread per element. The data that the kernel wrote into a separate
 * device buffer is copied back and compared with the data that was
 * written to the constant memory.
 *
 * @param args Not used
 * @throws IOException Declared by the signature. NOTE(review):
 * presumably refers to the preparation of the PTX file - confirm
 */
public static void main(String[] args) throws IOException 
{
    // Enable exceptions and omit all subsequent error checks
    JCudaDriver.setExceptionsEnabled(true);

    // Initialize the driver and create a context for the first device.
    cuInit(0);
    CUdevice device = new CUdevice();
    cuDeviceGet(device, 0);
    CUcontext context = new CUcontext();
    cuCtxCreate(context, 0, device);

    // Create the PTX file by calling the NVCC
    String ptxFileName = JCudaSamplesUtils.preparePtxFile(
        "src/main/resources/kernels/JCudaConstantMemoryKernel.cu");

    // Load the PTX file.
    CUmodule module = new CUmodule();
    cuModuleLoad(module, ptxFileName);

    // Obtain the pointer to the constant memory, and print some info.
    // The size (in bytes) is returned in the first array element.
    CUdeviceptr constantMemoryPointer = new CUdeviceptr();
    long constantMemorySizeArray[] = { 0 };
    cuModuleGetGlobal(constantMemoryPointer, constantMemorySizeArray, 
        module, "constantMemoryData");
    int constantMemorySize = (int)constantMemorySizeArray[0];
    
    System.out.println("constantMemoryPointer: " + constantMemoryPointer);
    System.out.println("constantMemorySize: " + constantMemorySize);

    // Copy some host data to the constant memory
    int numElements = constantMemorySize / Sizeof.FLOAT;
    float hostData[] = new float[numElements];
    for (int i = 0; i < numElements; i++)
    {
        hostData[i] = i;
    }
    cuMemcpyHtoD(constantMemoryPointer, 
        Pointer.to(hostData), constantMemorySize);
    
    // Now use the constant memory in the kernel call:
    
    // Obtain a function pointer to the "constantMemoryKernel" function.
    CUfunction kernel = new CUfunction();
    cuModuleGetFunction(kernel, module, "constantMemoryKernel");

    // Allocate some device memory that receives the kernel output
    CUdeviceptr deviceData = new CUdeviceptr();
    cuMemAlloc(deviceData, constantMemorySize);
    try
    {
        // Set up the kernel parameters
        Pointer kernelParameters = Pointer.to(
            Pointer.to(deviceData),
            Pointer.to(new int[]{numElements})
        );
        
        // Launch the kernel with a single block that contains
        // one thread for each element
        int blockSizeX = numElements;
        int gridSizeX = 1;
        cuLaunchKernel(kernel,
            gridSizeX,  1, 1, 
            blockSizeX, 1, 1,
            0, null,         
            kernelParameters, null 
        );
        cuCtxSynchronize();
        
        // Copy the result back to the host, and verify that it is
        // the same that was copied to the constant memory
        float hostResult[] = new float[numElements];
        cuMemcpyDtoH(Pointer.to(hostResult), 
            deviceData, constantMemorySize);
        
        boolean passed = Arrays.equals(hostData,  hostResult);
        System.out.println("Test " + (passed ? "PASSED" : "FAILED"));
    }
    finally
    {
        // Clean up: release the device memory that was allocated above,
        // regardless of whether the launch or the copy succeeded
        JCudaDriver.cuMemFree(deviceData);
    }
}
 
Example #30
Source File: JCudaDynamicParallelism.java    From jcuda-samples with MIT License 4 votes vote down vote up
/**
 * Entry point of the dynamic parallelism sample.
 * <p>
 * Loads the CUBIN file that is compiled from the
 * "JCudaDynamicParallelismKernel.cu" file, launches its "parentKernel"
 * function on a device buffer of numParentThreads * numChildThreads
 * floats, copies the buffer back to the host, and compares it with
 * reference values that are computed on the host.
 *
 * @param args Not used
 */
public static void main(String[] args)
{
    JCudaDriver.setExceptionsEnabled(true);

    // Initialize a context for the first device
    cuInit(0);
    CUcontext context = new CUcontext();
    CUdevice device = new CUdevice();
    cuDeviceGet(device, 0);
    cuCtxCreate(context, 0, device);

    // Create the CUBIN file by calling the NVCC. 
    // See the prepareDefaultCubinFile method for the details about
    // the NVCC parameters that are used here. 
    String cubinFileName = JCudaSamplesUtils.prepareDefaultCubinFile(
        "src/main/resources/kernels/JCudaDynamicParallelismKernel.cu");

    // Load the CUBIN file 
    CUmodule module = new CUmodule();
    cuModuleLoad(module, cubinFileName);

    // Obtain a function pointer to the "parentKernel" function.
    CUfunction function = new CUfunction();
    cuModuleGetFunction(function, module, "parentKernel");

    // Define the nesting structure. 
    // 
    // NOTE: The number of child threads MUST match the value that 
    // is used in the kernel, for the childKernel<<<1, 8>>> call!
    // 
    int numParentThreads = 8;
    int numChildThreads = 8;

    // Allocate the device data that will be filled by the kernel
    int numElements = numParentThreads * numChildThreads;
    CUdeviceptr deviceData = new CUdeviceptr();
    cuMemAlloc(deviceData, numElements * Sizeof.FLOAT);
    try
    {
        // Set up the kernel parameters: A pointer to an array
        // of pointers which point to the actual values.
        Pointer kernelParameters = Pointer.to(
            Pointer.to(new int[] { numElements }),
            Pointer.to(deviceData)
        );

        // Call the kernel function. The grid size is the ceiling of
        // numElements / blockSizeX, which is the smallest number of
        // blocks for which gridSizeX * blockSizeX >= numElements.
        // NOTE(review): the kernel source is not visible here - confirm
        // that it expects the launch to cover numElements threads.
        int blockSizeX = numParentThreads;
        int gridSizeX = (numElements + blockSizeX - 1) / blockSizeX;
        cuLaunchKernel(function,
            gridSizeX,  1, 1,      // Grid dimension
            blockSizeX, 1, 1,      // Block dimension
            0, null,               // Shared memory size and stream
            kernelParameters, null // Kernel- and extra parameters
        );
        cuCtxSynchronize();

        // Copy the device data to the host. The host array is 
        // initialized with the element indices first; these values
        // are overwritten by the copy.
        float hostData[] = new float[numElements];
        for (int i = 0; i < numElements; i++)
        {
            hostData[i] = i;
        }
        cuMemcpyDtoH(Pointer.to(hostData), 
            deviceData, numElements * Sizeof.FLOAT);

        // Compare the host data with the expected values: The element
        // with index (i * numChildThreads + j) is expected to be
        // i + 0.1f * j
        float hostDataRef[] = new float[numElements];
        for (int i = 0; i < numParentThreads; i++)
        {
            for (int j = 0; j < numChildThreads; j++)
            {
                hostDataRef[i * numChildThreads + j] = i + 0.1f * j;
            }
        }
        System.out.println("Result: "+Arrays.toString(hostData));
        boolean passed = Arrays.equals(hostData, hostDataRef);
        System.out.println(passed ? "PASSED" : "FAILED");
    }
    finally
    {
        // Clean up, also when one of the calls above has thrown
        cuMemFree(deviceData);
    }
}