Python pycuda.driver.Stream() Examples
The following are 30 code examples of pycuda.driver.Stream(), collected from open-source projects. Each example notes the original project and source file it comes from.
You may also want to check out all available functions and classes of the pycuda.driver module.
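pycuda.driver.Stream() wraps a CUDA stream: an ordered queue of GPU work that lets memory transfers and kernel launches run asynchronously with respect to the host. A minimal self-contained sketch of the usual pattern (the buffer size and variable names are illustrative, not taken from any example below):

import numpy as np
import pycuda.autoinit          # creates a CUDA context on the default device
import pycuda.driver as cuda

stream = cuda.Stream()

# Page-locked (pinned) host memory is required for truly asynchronous copies.
h_data = cuda.pagelocked_empty(1024, dtype=np.float32)
h_data[:] = np.arange(1024, dtype=np.float32)
d_data = cuda.mem_alloc(h_data.nbytes)

# These calls only enqueue work on the stream and return immediately.
cuda.memcpy_htod_async(d_data, h_data, stream)
cuda.memcpy_dtoh_async(h_data, d_data, stream)

# Block the host until everything queued on this stream has finished.
stream.synchronize()

Most of the examples below follow exactly this copy-in, execute, copy-out, synchronize pattern, with TensorRT supplying the execution step.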
Example #1
Source File: mnist_api.py From iAI with MIT License
def infer(context, input_img, output_size, batch_size):
    # Load engine
    engine = context.get_engine()
    assert(engine.get_nb_bindings() == 2)
    # Convert input data to float32
    input_img = input_img.astype(np.float32)
    # Create host buffer to receive data
    output = np.empty(output_size, dtype=np.float32)
    # Allocate device memory
    d_input = cuda.mem_alloc(batch_size * input_img.size * input_img.dtype.itemsize)
    d_output = cuda.mem_alloc(batch_size * output.size * output.dtype.itemsize)
    bindings = [int(d_input), int(d_output)]
    stream = cuda.Stream()
    # Transfer input data to device
    cuda.memcpy_htod_async(d_input, input_img, stream)
    # Execute model
    context.enqueue(batch_size, bindings, stream.handle, None)
    # Transfer predictions back
    cuda.memcpy_dtoh_async(output, d_output, stream)
    # Synchronize the stream before reading the host buffer
    stream.synchronize()
    # Return predictions
    return output
Example #2
Source File: ssd.py From tensorrt_demos with MIT License
def __init__(self, model, input_shape, output_layout=7):
    """Initialize TensorRT plugins, engine and context."""
    self.model = model
    self.input_shape = input_shape
    self.output_layout = output_layout
    self.trt_logger = trt.Logger(trt.Logger.INFO)
    self._load_plugins()
    self.engine = self._load_engine()

    self.host_inputs = []
    self.cuda_inputs = []
    self.host_outputs = []
    self.cuda_outputs = []
    self.bindings = []
    self.stream = cuda.Stream()
    self.context = self._create_context()
Example #3
Source File: yolov3.py From tensorrt_demos with MIT License
def allocate_buffers(engine):
    """Allocates all host/device in/out buffers required for an engine."""
    inputs = []
    outputs = []
    bindings = []
    stream = cuda.Stream()
    for binding in engine:
        size = trt.volume(engine.get_binding_shape(binding)) * \
               engine.max_batch_size
        dtype = trt.nptype(engine.get_binding_dtype(binding))
        # Allocate host and device buffers
        host_mem = cuda.pagelocked_empty(size, dtype)
        device_mem = cuda.mem_alloc(host_mem.nbytes)
        # Append the device buffer to device bindings.
        bindings.append(int(device_mem))
        # Append to the appropriate list.
        if engine.binding_is_input(binding):
            inputs.append(HostDeviceMem(host_mem, device_mem))
        else:
            outputs.append(HostDeviceMem(host_mem, device_mem))
    return inputs, outputs, bindings, stream
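This function (and several later ones) returns lists of HostDeviceMem objects, a helper defined elsewhere in these projects. A minimal sketch of that helper, modeled on NVIDIA's TensorRT samples (the exact definition is an assumption, since it is not shown in the excerpt):

class HostDeviceMem(object):
    """Pairs a page-locked host buffer with its matching device allocation."""

    def __init__(self, host_mem, device_mem):
        self.host = host_mem      # numpy array backed by page-locked memory
        self.device = device_mem  # pycuda.driver.DeviceAllocation

    def __str__(self):
        return 'Host:\n' + str(self.host) + '\nDevice:\n' + str(self.device)

    def __repr__(self):
        return self.__str__()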
Example #4
Source File: deep_neural_network.py From Hands-On-GPU-Programming-with-Python-and-CUDA with MIT License
def __init__(self, layers=None, delta=None, stream=None,
             max_batch_size=32, max_streams=10, epochs=10):
    self.network = []
    self.network_summary = []
    self.network_mem = []

    if stream is not None:
        self.stream = stream
    else:
        self.stream = drv.Stream()

    if delta is None:
        delta = 0.0001
    self.delta = delta

    self.max_batch_size = max_batch_size
    self.max_streams = max_streams
    self.epochs = epochs

    if layers is not None:
        for layer in layers:
            add_layer(self, layer)
Example #5
Source File: common.py From yolov3-tensorrt with MIT License
def allocate_buffers(engine):
    inputs = []
    outputs = []
    bindings = []
    stream = cuda.Stream()
    for binding in engine:
        size = trt.volume(engine.get_binding_shape(binding)) * engine.max_batch_size
        dtype = trt.nptype(engine.get_binding_dtype(binding))
        # Allocate host and device buffers
        host_mem = cuda.pagelocked_empty(size, dtype)
        device_mem = cuda.mem_alloc(host_mem.nbytes)
        # Append the device buffer to device bindings.
        bindings.append(int(device_mem))
        # Append to the appropriate list.
        if engine.binding_is_input(binding):
            inputs.append(HostDeviceMem(host_mem, device_mem))
        else:
            outputs.append(HostDeviceMem(host_mem, device_mem))
    return inputs, outputs, bindings, stream

# This function is generalized for multiple inputs/outputs.
# inputs and outputs are expected to be lists of HostDeviceMem objects.
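The trailing comment refers to a companion do_inference helper in the same common.py, which is not part of this excerpt. A sketch of what that helper typically looks like in NVIDIA's TensorRT samples (treat the exact signature as an assumption):

def do_inference(context, bindings, inputs, outputs, stream, batch_size=1):
    # Transfer input data to the GPU.
    for inp in inputs:
        cuda.memcpy_htod_async(inp.device, inp.host, stream)
    # Run inference asynchronously on the same stream.
    context.execute_async(batch_size=batch_size, bindings=bindings,
                          stream_handle=stream.handle)
    # Transfer predictions back from the GPU.
    for out in outputs:
        cuda.memcpy_dtoh_async(out.host, out.device, stream)
    # Synchronize the stream before the host buffers are read.
    stream.synchronize()
    return [out.host for out in outputs]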
Example #6
Source File: inference.py From torch2trt with MIT License
def __init__(self, context: trt.IExecutionContext, stream=None, device=None,
             cuda_device=None, cuda_context=None):
    self.engine = context.engine
    if device is None:
        self.torch_device = torch.device("cuda:0")
    else:
        self.torch_device = device
    inputs, outputs, bindings = allocate_buffers_torch(self.engine, self.torch_device)
    self.context = context
    self.inputs = inputs
    self.outputs = outputs
    self.bindings = bindings
    self.input_dict = {mem.name: mem for mem in inputs}
    self.output_dict = {mem.name: mem for mem in outputs}
    # Fall back to a fresh stream when the caller does not supply one.
    if stream is None:
        stream = cuda.Stream()
    self.stream = stream
    self._batch_size = None
    self.cuda_device = cuda_device
    self.cuda_context = cuda_context
Example #7
Source File: engine.py From dragon with BSD 2-Clause "Simplified" License
def get_async(self, stream):
    """Copy and return the host buffer data.

    Parameters
    ----------
    stream : pycuda.driver.Stream
        The CUDA stream used for the copy.

    Returns
    -------
    numpy.ndarray
        The numpy array receiving the data.

    """
    src = self.device_buffer
    dst = self.host_buffer
    src.get_async(stream, dst)
    return dst
Example #8
Source File: inference.py From torch2trt with MIT License
def __init__(self, context: trt.IExecutionContext, stream=None,
             cuda_device=None, cuda_context=None):
    self.engine = context.engine
    inputs, outputs, bindings = allocate_buffers(self.engine)
    self.context = context
    self.inputs = inputs
    self.outputs = outputs
    self.bindings = bindings
    self.input_dict = {mem.name: mem for mem in inputs}
    self.output_dict = {mem.name: mem for mem in outputs}
    # Fall back to a fresh stream when the caller does not supply one.
    if stream is None:
        stream = cuda.Stream()
    self.stream = stream
    self._batch_size = None
    self.cuda_device = cuda_device
    self.cuda_context = cuda_context
Example #9
Source File: tensorrt_runner.py From NeMo with Apache License 2.0
def __enter__(self):
    """
    Vars:
        engine (trt.ICudaEngine): The engine tracked by this runner. The TensorRTRunnerV2 OWNS the
            engine it manages, and is therefore responsible for its destruction. Do not free the
            engine outside of the runner, or it will result in a double free.
        context (trt.IExecutionContext): The context used for inference.
        stream (pycuda.driver.Stream): The CUDA stream that this runner will use for inference.
    """
    return self
Example #10
Source File: tensorrt_runner.py From NeMo with Apache License 2.0
def __init__(self, model_loader=None, plugins=None, name=None):
    """
    Creates a runner that manages a single TensorRT engine.

    Args:
        model_loader (Callable() -> trt.ICudaEngine): A callable that can supply a TensorRT engine.

    Optional Args:
        plugins (List[str]): A list of paths to plugin libraries to load before inference.
        name (str): The human-readable name to use for this runner.
    """
    set_trt_logging_level(logging.getEffectiveLevel())

    def load_plugins():
        import ctypes
        for plugin in plugins:
            path = os.path.abspath(plugin)
            logging.info("Loading plugin library: {:}".format(path))
            ctypes.CDLL(path)

    # Load any user-supplied plugin libraries. This must happen before
    # everything else, including engine deserialization.
    if plugins:
        load_plugins()

    # Choose a unique name for this runner.
    super().__init__(default_value(name, "trt-v2-runner-{:}".format(TensorRTRunnerV2.total_runners)))
    TensorRTRunnerV2.total_runners += 1
    logging.debug("Creating {:}".format(self.name))

    self.model_loader = model_loader
    self.engine = self.model_loader()
    if not self.engine:
        logging.critical("Invalid Engine. Please ensure the engine was built correctly.")

    self.buffers = Buffers.from_engine(self.engine)
    self.stream = cuda.Stream()
    self.context = self.engine.create_execution_context()
Example #11
Source File: predict_image.py From keras_imagenet with MIT License
def infer_with_trt(img, model):
    """Run inference on the image with a TensorRT engine."""
    import pycuda.autoinit
    import pycuda.driver as cuda
    import tensorrt as trt

    TRT_LOGGER = trt.Logger(trt.Logger.INFO)
    with open(model, 'rb') as f, trt.Runtime(TRT_LOGGER) as runtime:
        engine = runtime.deserialize_cuda_engine(f.read())
    assert len(engine) == 2, 'ERROR: bad number of bindings'
    host_input, cuda_input, host_output, cuda_output = init_trt_buffers(
        cuda, trt, engine)
    stream = cuda.Stream()
    context = engine.create_execution_context()
    context.set_binding_shape(0, (1, 224, 224, 3))
    np.copyto(host_input, img.ravel())
    cuda.memcpy_htod_async(cuda_input, host_input, stream)
    if trt.__version__[0] >= '7':
        context.execute_async_v2(bindings=[int(cuda_input), int(cuda_output)],
                                 stream_handle=stream.handle)
    else:
        context.execute_async(bindings=[int(cuda_input), int(cuda_output)],
                              stream_handle=stream.handle)
    cuda.memcpy_dtoh_async(host_output, cuda_output, stream)
    stream.synchronize()
    return host_output
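infer_with_trt depends on an init_trt_buffers helper defined elsewhere in predict_image.py; its body is not shown here. A plausible sketch for a single-input, single-output engine (the abs() calls guard against dynamic dimensions reported as -1 and are an assumption, not confirmed from the source):

def init_trt_buffers(cuda, trt, engine):
    """Allocate page-locked host buffers and device buffers for bindings 0 and 1."""
    size_in = abs(trt.volume(engine.get_binding_shape(0)))
    size_out = abs(trt.volume(engine.get_binding_shape(1)))
    host_input = cuda.pagelocked_empty(size_in, np.float32)
    cuda_input = cuda.mem_alloc(host_input.nbytes)
    host_output = cuda.pagelocked_empty(size_out, np.float32)
    cuda_output = cuda.mem_alloc(host_output.nbytes)
    return host_input, cuda_input, host_output, cuda_output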
Example #12
Source File: speed_gpu.py From Real-time-GesRec with MIT License
def alloc_buf(engine):
    # Host (CPU) memory
    h_in_size = trt.volume(engine.get_binding_shape(0))
    h_out_size = trt.volume(engine.get_binding_shape(1))
    h_in_dtype = trt.nptype(engine.get_binding_dtype(0))
    h_out_dtype = trt.nptype(engine.get_binding_dtype(1))
    in_cpu = cuda.pagelocked_empty(h_in_size, h_in_dtype)
    out_cpu = cuda.pagelocked_empty(h_out_size, h_out_dtype)
    # Allocate GPU memory
    in_gpu = cuda.mem_alloc(in_cpu.nbytes)
    out_gpu = cuda.mem_alloc(out_cpu.nbytes)
    stream = cuda.Stream()
    return in_cpu, out_cpu, in_gpu, out_gpu, stream
Example #13
Source File: caffe_mnist.py From iAI with MIT License
def infer(context, input_img, output_size, batch_size):
    # Load engine
    engine = context.get_engine()
    assert(engine.get_nb_bindings() == 2)
    # Convert input data to float32
    input_img = input_img.astype(np.float32)
    # Create output array to receive data
    output = np.empty(output_size, dtype=np.float32)
    # Allocate device memory
    d_input = cuda.mem_alloc(batch_size * input_img.size * input_img.dtype.itemsize)
    d_output = cuda.mem_alloc(batch_size * output.size * output.dtype.itemsize)
    bindings = [int(d_input), int(d_output)]
    stream = cuda.Stream()
    # Transfer input data to device
    cuda.memcpy_htod_async(d_input, input_img, stream)
    # Execute model
    context.enqueue(batch_size, bindings, stream.handle, None)
    # Transfer predictions back
    cuda.memcpy_dtoh_async(output, d_output, stream)
    # Synchronize the stream before reading the host buffer
    stream.synchronize()
    # Return predictions
    return output
Example #14
Source File: engine.py From dragon with BSD 2-Clause "Simplified" License
def __init__(self, cuda_engine, device_id=0):
    """Create an ``Engine``.

    Parameters
    ----------
    cuda_engine : tensorrt.ICudaEngine
        The built cuda engine.
    device_id : int, optional, default=0
        The index of executing device.

    """
    # Create executing resources.
    self._cuda_engine = cuda_engine
    self._device_id = device_id
    self._context = cuda_engine.create_execution_context()
    self._stream = driver.Stream(0)

    # Create bindings.
    num_binding = self._cuda_engine.num_bindings
    self._bindings = [Binding(cuda_engine, self._context, i, device_id)
                      for i in range(num_binding)]
    self._inputs = [b for b in self._bindings if b.is_input]
    self._outputs = [b for b in self._bindings if not b.is_input]

    # Report the engine info.
    logging.info('TensorRT engine built.')
    binding_info = 'InputInfo: {\n'
    for b in self._inputs:
        binding_info += ' * Binding("{}", shape={}, dtype={})\n' \
            .format(b.name, b.shape, b.dtype)
    logging.info(binding_info + '}')
    binding_info = 'OutputInfo: {\n'
    for b in self._outputs:
        binding_info += ' * Binding("{}", shape={}, dtype={})\n' \
            .format(b.name, b.shape, b.dtype)
    logging.info(binding_info + '}')
Example #15
Source File: main.py From Pytorch-Model-to-TensorRT with GNU General Public License v3.0
def infer(context, input_img, output_size, batch_size):
    # Load engine
    engine = context.get_engine()
    assert (engine.get_nb_bindings() == 2)
    # Convert input data to float32
    input_img = input_img.astype(np.float32)
    # Create output array to receive data
    output = np.empty(output_size, dtype=np.float32)
    # Allocate device memory
    d_input = cuda.mem_alloc(batch_size * input_img.nbytes)
    d_output = cuda.mem_alloc(batch_size * output.nbytes)
    bindings = [int(d_input), int(d_output)]
    stream = cuda.Stream()
    # Transfer input data to device
    cuda.memcpy_htod_async(d_input, input_img, stream)
    # Execute model
    context.enqueue(batch_size, bindings, stream.handle, None)
    # Transfer predictions back
    cuda.memcpy_dtoh_async(output, d_output, stream)
    # Synchronize the stream so the async copy finishes before output is read
    stream.synchronize()
    # Return predictions
    return output
Example #16
Source File: tensor_ops.py From ngraph-python with Apache License 2.0
def __init__(self, transformer, comm, op):
    super(CudaAllReduceKernel, self).__init__(transformer)
    self.op = op
    self.tensor = op.tensor_description()
    self.device_id = int(transformer.device_id)
    self.device_ids = list(map(int, self.op.device_ids))
    self.event = drv.Event(flags=event_flags.INTERPROCESS |
                                 event_flags.DISABLE_TIMING)
    self.stream = drv.Stream()
    self.output_buff_dict = {}
    self.scratch_buff_dict = {}
    self.event_buff_dict = {}
    self.comm = comm
    self.init_buffers()
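This example pairs its stream with a CUDA event (flagged for interprocess use) to coordinate the all-reduce across devices. The basic stream/event handshake in pycuda, independent of ngraph's buffer management, looks like this minimal sketch:

import pycuda.autoinit
import pycuda.driver as drv

stream_a = drv.Stream()
stream_b = drv.Stream()
event = drv.Event()

# ... enqueue work on stream_a ...
event.record(stream_a)           # mark a point in stream_a's work queue
stream_b.wait_for_event(event)   # stream_b will not run past this point
                                 # until the recorded work has completed
# ... enqueue dependent work on stream_b ...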
Example #17
Source File: ssd_trt_detection.py From object-detection with MIT License
def __init__(self):
    self.colors = np.random.uniform(0, 255, size=(100, 3))
    self.input_shape = INPUT_HW
    self.trt_logger = trt.Logger(trt.Logger.INFO)
    self._load_plugins()
    self.engine = self._load_engine()

    self.host_inputs = []
    self.cuda_inputs = []
    self.host_outputs = []
    self.cuda_outputs = []
    self.bindings = []
    self.stream = cuda.Stream()
    self.context = self._create_context()
Example #18
Source File: common.py From iAI with MIT License
def allocate_buffers(engine):
    inputs = []
    outputs = []
    bindings = []
    stream = cuda.Stream()
    for binding in engine:
        size = trt.volume(engine.get_binding_shape(binding)) * engine.max_batch_size
        dtype = trt.nptype(engine.get_binding_dtype(binding))
        # Allocate host and device buffers
        host_mem = cuda.pagelocked_empty(size, dtype)
        device_mem = cuda.mem_alloc(host_mem.nbytes)
        # Append the device buffer to device bindings.
        bindings.append(int(device_mem))
        # Append to the appropriate list.
        if engine.binding_is_input(binding):
            inputs.append(HostDeviceMem(host_mem, device_mem))
        else:
            outputs.append(HostDeviceMem(host_mem, device_mem))
    return inputs, outputs, bindings, stream

# This function is generalized for multiple inputs/outputs.
# inputs and outputs are expected to be lists of HostDeviceMem objects.
Example #19
Source File: uff_resnet50.py From iAI with MIT License
def allocate_buffers(engine):
    # Determine dimensions and create page-locked memory buffers
    # (i.e. won't be swapped to disk) to hold host inputs/outputs.
    h_input = cuda.pagelocked_empty(trt.volume(engine.get_binding_shape(0)),
                                    dtype=trt.nptype(ModelData.DTYPE))
    h_output = cuda.pagelocked_empty(trt.volume(engine.get_binding_shape(1)),
                                     dtype=trt.nptype(ModelData.DTYPE))
    # Allocate device memory for inputs and outputs.
    d_input = cuda.mem_alloc(h_input.nbytes)
    d_output = cuda.mem_alloc(h_output.nbytes)
    # Create a stream in which to copy inputs/outputs and run inference.
    stream = cuda.Stream()
    return h_input, d_input, h_output, d_output, stream
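In the sample, this allocate_buffers is paired with a do_inference helper that reuses the same stream for both copies and execution. That helper is not part of the excerpt; a sketch of the usual pairing (based on the layout of NVIDIA's ResNet-50 sample, so treat the signature as an assumption):

def do_inference(context, h_input, d_input, h_output, d_output, stream):
    # Transfer input data to the GPU.
    cuda.memcpy_htod_async(d_input, h_input, stream)
    # Run inference on the same stream.
    context.execute_async(bindings=[int(d_input), int(d_output)],
                          stream_handle=stream.handle)
    # Transfer predictions back from the GPU.
    cuda.memcpy_dtoh_async(h_output, d_output, stream)
    # Synchronize the stream before h_output is read.
    stream.synchronize()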
Example #20
Source File: custom_layers.py From iAI with MIT License
def infer(context, input_img, output_size, batch_size):
    # Load engine
    engine = context.get_engine()
    assert(engine.get_nb_bindings() == 2)
    # Convert input data to float32
    input_img = input_img.astype(np.float32)
    # Create output array to receive data
    output = np.empty(output_size, dtype=np.float32)
    # Allocate device memory
    d_input = cuda.mem_alloc(batch_size * input_img.size * input_img.dtype.itemsize)
    d_output = cuda.mem_alloc(batch_size * output.size * output.dtype.itemsize)
    bindings = [int(d_input), int(d_output)]
    stream = cuda.Stream()
    # Transfer input data to device
    cuda.memcpy_htod_async(d_input, input_img, stream)
    # Execute model
    context.enqueue(batch_size, bindings, stream.handle, None)
    # Transfer predictions back
    cuda.memcpy_dtoh_async(output, d_output, stream)
    # Synchronize the stream so the async copy finishes before output is read
    stream.synchronize()
    # Return predictions
    return output
Example #21
Source File: uff_mnist.py From iAI with MIT License
def infer(engine, input_img, batch_size):
    # Create an execution context for the engine
    context = engine.create_execution_context()
    assert(engine.get_nb_bindings() == 2)
    # Determine the output size to receive data
    dims = engine.get_binding_dimensions(1).to_DimsCHW()
    elt_count = dims.C() * dims.H() * dims.W() * batch_size
    # Allocate page-locked host memory for the output
    output = cuda.pagelocked_empty(elt_count, dtype=np.float32)
    # Allocate device memory
    d_input = cuda.mem_alloc(batch_size * input_img.size * input_img.dtype.itemsize)
    d_output = cuda.mem_alloc(batch_size * output.size * output.dtype.itemsize)
    bindings = [int(d_input), int(d_output)]
    stream = cuda.Stream()
    # Transfer input data to device
    cuda.memcpy_htod_async(d_input, input_img, stream)
    # Execute model
    context.enqueue(batch_size, bindings, stream.handle, None)
    # Transfer predictions back
    cuda.memcpy_dtoh_async(output, d_output, stream)
    # Synchronize the stream so the async copy finishes before output is read
    stream.synchronize()
    # Return predictions
    return output
Example #22
Source File: tf_to_trt.py From iAI with MIT License
def infer(context, input_img, batch_size):
    # Load engine
    engine = context.get_engine()
    assert(engine.get_nb_bindings() == 2)
    # Determine the output size to receive data
    dims = engine.get_binding_dimensions(1).to_DimsCHW()
    elt_count = dims.C() * dims.H() * dims.W() * batch_size
    # Convert input data to float32
    input_img = input_img.astype(np.float32)
    # Allocate page-locked host memory for the output
    output = cuda.pagelocked_empty(elt_count, dtype=np.float32)
    # Allocate device memory
    d_input = cuda.mem_alloc(batch_size * input_img.size * input_img.dtype.itemsize)
    d_output = cuda.mem_alloc(batch_size * output.size * output.dtype.itemsize)
    bindings = [int(d_input), int(d_output)]
    stream = cuda.Stream()
    # Transfer input data to device
    cuda.memcpy_htod_async(d_input, input_img, stream)
    # Execute model
    context.enqueue(batch_size, bindings, stream.handle, None)
    # Transfer predictions back
    cuda.memcpy_dtoh_async(output, d_output, stream)
    # Synchronize the stream so the async copy finishes before output is read
    stream.synchronize()
    # Return predictions
    return output
Example #23
Source File: sample_onnx.py From iAI with MIT License
def inference_image(context, input_img, batch_size):
    # Load engine
    engine = context.get_engine()
    assert(engine.get_nb_bindings() == 2)
    inp_dims = engine.get_binding_dimensions(0).to_DimsCHW()
    out_dims = engine.get_binding_dimensions(1).to_DimsCHW()
    # Output vector size
    output_size = 1000
    # Create output array
    output = np.empty(output_size, dtype=np.float32)
    # Allocate device memory
    d_input = cuda.mem_alloc(batch_size * input_img.size * input_img.dtype.itemsize)
    d_output = cuda.mem_alloc(batch_size * output.size * output.dtype.itemsize)
    # Create input/output bindings
    bindings = [int(d_input), int(d_output)]
    stream = cuda.Stream()
    # Transfer input data to device
    cuda.memcpy_htod_async(d_input, input_img, stream)
    # Execute model
    context.enqueue(batch_size, bindings, stream.handle, None)
    # Transfer predictions back
    cuda.memcpy_dtoh_async(output, d_output, stream)
    # Synchronize the stream before reading the host buffer
    stream.synchronize()
    return output
Example #24
Source File: onnx_mnist.py From iAI with MIT License
def infer(engine, input_img, batch_size):
    # Create an execution context for the engine
    context = engine.create_execution_context()
    assert(engine.get_nb_bindings() == 2)
    # Determine the output size to receive data
    dims = engine.get_binding_dimensions(1).to_DimsCHW()
    elt_count = dims.C() * dims.H() * dims.W() * batch_size
    # Allocate page-locked host memory for the output
    output = cuda.pagelocked_empty(elt_count, dtype=np.float32)
    # Allocate device memory
    d_input = cuda.mem_alloc(batch_size * input_img.size * input_img.dtype.itemsize)
    d_output = cuda.mem_alloc(batch_size * output.size * output.dtype.itemsize)
    bindings = [int(d_input), int(d_output)]
    stream = cuda.Stream()
    # Transfer input data to device
    cuda.memcpy_htod_async(d_input, input_img, stream)
    # Execute model
    context.enqueue(batch_size, bindings, stream.handle, None)
    # Transfer predictions back
    cuda.memcpy_dtoh_async(output, d_output, stream)
    # Synchronize the stream so the async copy finishes before output is read
    stream.synchronize()
    # Return predictions
    return output
Example #25
Source File: custom_layers.py From iAI with MIT License
def infer(context, input_img, output_size, batch_size):
    # Load engine
    engine = context.get_engine()
    assert(engine.get_nb_bindings() == 2)
    # Convert input data to float32
    input_img = input_img.astype(np.float32)
    # Create output array to receive data
    output = np.empty(output_size, dtype=np.float32)
    # Allocate device memory
    d_input = cuda.mem_alloc(batch_size * input_img.size * input_img.dtype.itemsize)
    d_output = cuda.mem_alloc(batch_size * output.size * output.dtype.itemsize)
    bindings = [int(d_input), int(d_output)]
    stream = cuda.Stream()
    # Transfer input data to device
    cuda.memcpy_htod_async(d_input, input_img, stream)
    # Execute model
    context.enqueue(batch_size, bindings, stream.handle, None)
    # Transfer predictions back
    cuda.memcpy_dtoh_async(output, d_output, stream)
    # Synchronize the stream so the async copy finishes before output is read
    stream.synchronize()
    # Return predictions
    return output
Example #26
Source File: caffe_mnist.py From iAI with MIT License
def infer(context, input_img, output_size, batch_size):
    # Load engine
    engine = context.get_engine()
    assert(engine.get_nb_bindings() == 2)
    # Convert input data to float32
    input_img = input_img.astype(np.float32)
    # Create output array to receive data
    output = np.empty(output_size, dtype=np.float32)
    # Allocate device memory
    d_input = cuda.mem_alloc(batch_size * input_img.size * input_img.dtype.itemsize)
    d_output = cuda.mem_alloc(batch_size * output.size * output.dtype.itemsize)
    bindings = [int(d_input), int(d_output)]
    stream = cuda.Stream()
    # Transfer input data to device
    cuda.memcpy_htod_async(d_input, input_img, stream)
    # Execute model
    context.enqueue(batch_size, bindings, stream.handle, None)
    # Transfer predictions back
    cuda.memcpy_dtoh_async(output, d_output, stream)
    # Synchronize the stream before reading the host buffer
    stream.synchronize()
    # Return predictions
    return output
Example #27
Source File: mnist_api.py From iAI with MIT License
def infer(context, input_img, output_size, batch_size):
    # Load engine
    engine = context.get_engine()
    assert(engine.get_nb_bindings() == 2)
    # Convert input data to float32
    input_img = input_img.astype(np.float32)
    # Create output array to receive data
    output = np.empty(output_size, dtype=np.float32)
    # Allocate device memory
    d_input = cuda.mem_alloc(batch_size * input_img.size * input_img.dtype.itemsize)
    d_output = cuda.mem_alloc(batch_size * output.size * output.dtype.itemsize)
    bindings = [int(d_input), int(d_output)]
    stream = cuda.Stream()
    # Transfer input data to device
    cuda.memcpy_htod_async(d_input, input_img, stream)
    # Execute model
    context.enqueue(batch_size, bindings, stream.handle, None)
    # Transfer predictions back
    cuda.memcpy_dtoh_async(output, d_output, stream)
    # Synchronize the stream so the async copy finishes before output is read
    stream.synchronize()
    # Return predictions
    return output
Example #28
Source File: uff_mnist.py From iAI with MIT License
def infer(engine, input_img, batch_size):
    # Create an execution context for the engine
    context = engine.create_execution_context()
    assert(engine.get_nb_bindings() == 2)
    # Determine the output size to receive data
    dims = engine.get_binding_dimensions(1).to_DimsCHW()
    elt_count = dims.C() * dims.H() * dims.W() * batch_size
    # Allocate page-locked host memory for the output
    output = cuda.pagelocked_empty(elt_count, dtype=np.float32)
    # Allocate device memory
    d_input = cuda.mem_alloc(batch_size * input_img.size * input_img.dtype.itemsize)
    d_output = cuda.mem_alloc(batch_size * output.size * output.dtype.itemsize)
    bindings = [int(d_input), int(d_output)]
    stream = cuda.Stream()
    # Transfer input data to device
    cuda.memcpy_htod_async(d_input, input_img, stream)
    # Execute model
    context.enqueue(batch_size, bindings, stream.handle, None)
    # Transfer predictions back
    cuda.memcpy_dtoh_async(output, d_output, stream)
    # Synchronize the stream so the async copy finishes before output is read
    stream.synchronize()
    # Return predictions
    return output
Example #29
Source File: onnx_resnet50.py From iAI with MIT License
def allocate_buffers(engine):
    # Determine dimensions and create page-locked memory buffers
    # (i.e. won't be swapped to disk) to hold host inputs/outputs.
    h_input = cuda.pagelocked_empty(trt.volume(engine.get_binding_shape(0)),
                                    dtype=trt.nptype(ModelData.DTYPE))
    h_output = cuda.pagelocked_empty(trt.volume(engine.get_binding_shape(1)),
                                     dtype=trt.nptype(ModelData.DTYPE))
    # Allocate device memory for inputs and outputs.
    d_input = cuda.mem_alloc(h_input.nbytes)
    d_output = cuda.mem_alloc(h_output.nbytes)
    # Create a stream in which to copy inputs/outputs and run inference.
    stream = cuda.Stream()
    return h_input, d_input, h_output, d_output, stream
Example #30
Source File: tf_to_trt.py From iAI with MIT License
def infer(context, input_img, batch_size):
    # Load engine
    engine = context.get_engine()
    assert(engine.get_nb_bindings() == 2)
    # Determine the output size to receive data
    dims = engine.get_binding_dimensions(1).to_DimsCHW()
    elt_count = dims.C() * dims.H() * dims.W() * batch_size
    # Convert input data to float32
    input_img = input_img.astype(np.float32)
    # Allocate page-locked host memory for the output
    output = cuda.pagelocked_empty(elt_count, dtype=np.float32)
    # Allocate device memory
    d_input = cuda.mem_alloc(batch_size * input_img.size * input_img.dtype.itemsize)
    d_output = cuda.mem_alloc(batch_size * output.size * output.dtype.itemsize)
    bindings = [int(d_input), int(d_output)]
    stream = cuda.Stream()
    # Transfer input data to device
    cuda.memcpy_htod_async(d_input, input_img, stream)
    # Execute model
    context.enqueue(batch_size, bindings, stream.handle, None)
    # Transfer predictions back
    cuda.memcpy_dtoh_async(output, d_output, stream)
    # Synchronize the stream so the async copy finishes before output is read
    stream.synchronize()
    # Return predictions
    return output