Python pycuda.driver.Stream() Examples

The following are 30 code examples of pycuda.driver.Stream(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module pycuda.driver, or try the search function.
Example #1
Source File:    From iAI with MIT License 6 votes vote down vote up
def infer(context, input_img, output_size, batch_size):
    """Run one inference pass and return the predictions.

    Args:
        context: Execution context; must provide ``get_engine()`` and
            ``enqueue(batch_size, bindings, stream_handle, event)``.
        input_img: Input array; converted to ``np.float32`` before upload.
        output_size: Size/shape handed to ``np.empty`` for the host output.
        batch_size: Batch size passed to ``enqueue`` and used to scale the
            device allocations.

    Returns:
        The ``np.float32`` host array filled by the device-to-host copy.

    Raises:
        AssertionError: If the engine does not report exactly two bindings.
    """
    # Load engine
    engine = context.get_engine()
    assert(engine.get_nb_bindings() == 2)
    # Convert input data to float32
    input_img = input_img.astype(np.float32)
    # Create host buffer to receive data
    output = np.empty(output_size, dtype=np.float32)
    # Allocate device memory
    d_input = cuda.mem_alloc(batch_size * input_img.size * input_img.dtype.itemsize)
    d_output = cuda.mem_alloc(batch_size * output.size * output.dtype.itemsize)

    bindings = [int(d_input), int(d_output)]
    stream = cuda.Stream()
    # Transfer input data to device
    cuda.memcpy_htod_async(d_input, input_img, stream)
    # Execute model
    context.enqueue(batch_size, bindings, stream.handle, None)
    # Transfer predictions back
    cuda.memcpy_dtoh_async(output, d_output, stream)
    # Synchronize threads: everything above was only queued on the stream,
    # so wait for it to drain before handing ``output`` to the caller.
    stream.synchronize()
    # Return predictions
    return output
Example #2
Source File:    From tensorrt_demos with MIT License 6 votes vote down vote up
def __init__(self, model, input_shape, output_layout=7):
    """Initialize TensorRT plugins, engine and context.

    Args:
        model: Stored as ``self.model``.
        input_shape: Stored as ``self.input_shape``.
        output_layout: Stored as ``self.output_layout`` (default 7).
    """
    self.model = model
    self.input_shape = input_shape
    self.output_layout = output_layout
    self.trt_logger = trt.Logger(trt.Logger.INFO)
    # NOTE(review): the summary mentions plugins, but no plugin-loading call
    # is visible here -- confirm against the original project.
    self.engine = self._load_engine()

    self.host_inputs = []
    self.cuda_inputs = []
    self.host_outputs = []
    self.cuda_outputs = []
    # The source had ``self.bindings = [] = cuda.Stream()``, which binds the
    # stream to ``bindings`` and then fails unpacking it into ``[]``.  Split
    # it back into the two assignments it was evidently meant to be.
    self.bindings = []
    self.stream = cuda.Stream()
    self.context = self._create_context()
Example #3
Source File:    From tensorrt_demos with MIT License 6 votes vote down vote up
def allocate_buffers(engine):
    """Allocates all host/device in/out buffers required for an engine.

    Args:
        engine: Iterable of bindings that also provides
            ``get_binding_shape``, ``get_binding_dtype``,
            ``binding_is_input`` and ``max_batch_size``.

    Returns:
        Tuple ``(inputs, outputs, bindings, stream)``: two lists of
        ``HostDeviceMem`` pairs, the list of integer device addresses in
        binding order, and a new CUDA stream.
    """
    inputs = []
    outputs = []
    bindings = []
    stream = cuda.Stream()
    for binding in engine:
        # NOTE(review): the right-hand operand after the line continuation
        # was lost; ``engine.max_batch_size`` matches the sibling
        # ``allocate_buffers`` examples -- confirm.
        size = trt.volume(engine.get_binding_shape(binding)) * \
            engine.max_batch_size
        dtype = trt.nptype(engine.get_binding_dtype(binding))
        # Allocate host and device buffers
        host_mem = cuda.pagelocked_empty(size, dtype)
        device_mem = cuda.mem_alloc(host_mem.nbytes)
        # Append the device buffer to device bindings.
        bindings.append(int(device_mem))
        # Append to the appropriate list.
        if engine.binding_is_input(binding):
            inputs.append(HostDeviceMem(host_mem, device_mem))
        else:
            outputs.append(HostDeviceMem(host_mem, device_mem))
    return inputs, outputs, bindings, stream
Example #4
Source File:    From Hands-On-GPU-Programming-with-Python-and-CUDA with MIT License 6 votes vote down vote up
def __init__(self, layers=None, delta=None, stream = None, max_batch_size=32, max_streams=10, epochs = 10):
        = []
        self.network_summary = []
        self.network_mem = []
        if stream is not None:
   = stream
   = drv.Stream()
        if delta is None:
            delta = 0.0001
            = delta
        self.max_streams = max_streams
        self.epochs = epochs
        if layers is not None:
            for layer in layers:
                add_layer(self, layer) 
Example #5
Source File:    From yolov3-tensorrt with MIT License 6 votes vote down vote up
def allocate_buffers(engine):
    """Allocate a host/device buffer pair for every binding of ``engine``.

    Args:
        engine: Iterable of bindings that also provides
            ``get_binding_shape``, ``get_binding_dtype``,
            ``binding_is_input`` and ``max_batch_size``.

    Returns:
        Tuple ``(inputs, outputs, bindings, stream)``: two lists of
        ``HostDeviceMem`` pairs, the list of integer device addresses in
        binding order, and a new CUDA stream.
    """
    inputs = []
    outputs = []
    bindings = []
    stream = cuda.Stream()
    for binding in engine:
        size = trt.volume(engine.get_binding_shape(binding)) * engine.max_batch_size
        dtype = trt.nptype(engine.get_binding_dtype(binding))
        # Allocate host and device buffers
        host_mem = cuda.pagelocked_empty(size, dtype)
        device_mem = cuda.mem_alloc(host_mem.nbytes)
        # Append the device buffer to device bindings.  (The statement was
        # missing, so ``bindings`` was always returned empty.)
        bindings.append(int(device_mem))
        # Append to the appropriate list.  Without the ``else`` every input
        # was also recorded as an output and real outputs were dropped.
        if engine.binding_is_input(binding):
            inputs.append(HostDeviceMem(host_mem, device_mem))
        else:
            outputs.append(HostDeviceMem(host_mem, device_mem))
    return inputs, outputs, bindings, stream

# This function is generalized for multiple inputs/outputs.
# inputs and outputs are expected to be lists of HostDeviceMem objects. 
Example #6
Source File:    From torch2trt with MIT License 6 votes vote down vote up
def __init__(self, context: trt.IExecutionContext, stream=None, device=None, cuda_device=None, cuda_context=None):
    """Wrap an execution context together with torch-backed I/O buffers.

    Args:
        context: Execution context; its ``engine`` attribute is stored.
        stream: CUDA stream to use; a new ``cuda.Stream()`` when ``None``.
        device: Torch device for the buffers; ``cuda:0`` when ``None``.
        cuda_device: Stored as ``self.cuda_device``.
        cuda_context: Stored as ``self.cuda_context``.
    """
    self.engine = context.engine
    # The ``else`` was missing, so the default device was always overwritten
    # by ``device`` (i.e. ``None`` when the caller passed nothing).
    if device is None:
        self.torch_device = torch.device("cuda:0")
    else:
        self.torch_device = device
    inputs, outputs, bindings = allocate_buffers_torch(self.engine, self.torch_device)
    self.context = context
    self.inputs = inputs
    self.outputs = outputs
    self.bindings = bindings
    # NOTE(review): the source built *sets* here (``{ mem for mem in ...}``)
    # despite the ``_dict`` names; keying by ``mem.name`` is a reconstruction
    # -- confirm the buffer objects expose ``name``.
    self.input_dict = {mem.name: mem for mem in inputs}
    self.output_dict = {mem.name: mem for mem in outputs}
    # Honour a caller-supplied stream instead of leaving the attribute unset.
    self.stream = cuda.Stream() if stream is None else stream
    self._batch_size = None
    self.cuda_device = cuda_device
    self.cuda_context = cuda_context
Example #7
Source File:    From dragon with BSD 2-Clause "Simplified" License 6 votes vote down vote up
def get_async(self, stream):
    """Copy and return the host buffer data.

    Parameters
    ----------
    stream : pycuda.driver.Stream
        The cuda stream to copy data.

    Returns
    -------
    numpy.ndarray
        The numpy array taking the data.

    """
    src = self.device_buffer
    dst = self.host_buffer
    # NOTE(review): presumably only queues the copy on ``stream``; callers
    # likely need to synchronize the stream before reading ``dst`` -- confirm.
    src.get_async(stream, dst)
    return dst
Example #8
Source File:    From torch2trt with MIT License 5 votes vote down vote up
def __init__(self, context: trt.IExecutionContext, stream=None, cuda_device=None, cuda_context=None):
    """Wrap an execution context together with its I/O buffers.

    Args:
        context: Execution context; its ``engine`` attribute is stored.
        stream: CUDA stream to use; a new ``cuda.Stream()`` when ``None``.
        cuda_device: Stored as ``self.cuda_device``.
        cuda_context: Stored as ``self.cuda_context``.
    """
    self.engine = context.engine
    inputs, outputs, bindings = allocate_buffers(self.engine)
    self.context = context
    self.inputs = inputs
    self.outputs = outputs
    self.bindings = bindings
    # NOTE(review): the source built *sets* here (``{ mem for mem in ...}``)
    # despite the ``_dict`` names; keying by ``mem.name`` is a reconstruction
    # -- confirm the buffer objects expose ``name``.
    self.input_dict = {mem.name: mem for mem in inputs}
    self.output_dict = {mem.name: mem for mem in outputs}
    # The assignment target was stripped in the source; also honour a
    # caller-supplied stream instead of leaving the attribute unset.
    self.stream = cuda.Stream() if stream is None else stream
    self._batch_size = None
    self.cuda_device = cuda_device
    self.cuda_context = cuda_context
Example #9
Source File:    From NeMo with Apache License 2.0 5 votes vote down vote up
def __enter__(self):
    """Enter the runner's context and return the runner itself.

    Attributes described by the original documentation:
        engine (trt.ICudaEngine): The engine tracked by this runner. The
            TensorRTRunnerV2 OWNS the engine it manages, and therefore is
            responsible for its destruction. Do not free the engine outside
            of the runner, or it will result in a double free.
        context (trt.IExecutionContext): The context used for inference.
        stream (pycuda.driver.Stream): The CUDA stream that this runner will
            use for inference.
    """
    return self
Example #10
Source File:    From NeMo with Apache License 2.0 5 votes vote down vote up
def __init__(self, model_loader=None, plugins=None, name=None):
    """
    Creates a runner that manages a single TensorRT engine.

    Args:
        model_loader (Callable() -> trt.ICudaEngine): A callable that can supply a TensorRT engine.

    Optional Args:
        max_workspace_size (int): The maximum workspace size in bytes.
            (Listed in the original text, but not accepted by this signature.)
        plugins (List[str]): A list of paths to plugin libraries to load before inference.
        name (str): The human-readable name to use for this runner.
    """
    def load_plugins():
        import ctypes

        for plugin in plugins:
            path = os.path.abspath(plugin)
            # NOTE(review): the callee of this log line and the statement
            # that actually loads the library were stripped in the source;
            # ``logging.info`` and ``ctypes.CDLL`` are reconstructions.
            logging.info("Loading plugin library: {:}".format(path))
            ctypes.CDLL(path)

    # Load any user-supplied plugin libraries. This must happen before everything else, including engine deserialization.
    if plugins:
        load_plugins()

    # Choose a unique name for this runner.
    super().__init__(default_value(name, "trt-v2-runner-{:}".format(TensorRTRunnerV2.total_runners)))
    TensorRTRunnerV2.total_runners += 1
    # NOTE(review): the format argument was stripped; ``self.name`` is
    # presumably set by the base-class initializer -- confirm.
    logging.debug("Creating {:}".format(self.name))

    self.model_loader = model_loader

    self.engine = self.model_loader()
    if not self.engine:
        logging.critical("Invalid Engine. Please ensure the engine was built correctly.")

    # The source chained these as ``... = Buffers.from_engine(...) =
    # cuda.Stream()``, which is not valid Python; split into two attributes.
    self.buffers = Buffers.from_engine(self.engine)
    self.stream = cuda.Stream()

    self.context = self.engine.create_execution_context()
Example #11
Source File:    From keras_imagenet with MIT License 5 votes vote down vote up
def infer_with_trt(img, model):
    """Inference the image with TensorRT engine.

    Args:
        img: Image array; flattened with ``ravel()`` into the host input
            buffer.  assumes it matches the (1, 224, 224, 3) binding shape
            set below -- TODO confirm.
        model: Path of the serialized engine file.

    Returns:
        The host output buffer after the stream has completed.

    Raises:
        AssertionError: If the engine does not have exactly two bindings.
    """
    import pycuda.autoinit
    import pycuda.driver as cuda
    import tensorrt as trt

    TRT_LOGGER = trt.Logger(trt.Logger.INFO)
    with open(model, 'rb') as f, trt.Runtime(TRT_LOGGER) as runtime:
        # NOTE(review): the argument was truncated in the source; reading
        # the whole engine file is the evident intent.
        engine = runtime.deserialize_cuda_engine(f.read())
    assert len(engine) == 2, 'ERROR: bad number of bindings'
    host_input, cuda_input, host_output, cuda_output = init_trt_buffers(
        cuda, trt, engine)
    stream = cuda.Stream()
    context = engine.create_execution_context()
    context.set_binding_shape(0, (1, 224, 224, 3))
    np.copyto(host_input, img.ravel())
    cuda.memcpy_htod_async(cuda_input, host_input, stream)
    bindings = [int(cuda_input), int(cuda_output)]
    # Compare the major version numerically: the original string comparison
    # (``trt.__version__[0] >= '7'``) misclassifies versions 10 and above.
    if int(trt.__version__.split('.')[0]) >= 7:
        context.execute_async_v2(bindings=bindings,
                                 stream_handle=stream.handle)
    else:
        context.execute_async(bindings=bindings,
                              stream_handle=stream.handle)
    cuda.memcpy_dtoh_async(host_output, cuda_output, stream)
    # Wait for the queued copy/execute/copy sequence before returning.
    stream.synchronize()
    return host_output
Example #12
Source File:    From Real-time-GesRec with MIT License 5 votes vote down vote up
def alloc_buf(engine):
    """Allocate host and device buffers for bindings 0 and 1 of ``engine``.

    Returns:
        Tuple ``(in_cpu, out_cpu, in_gpu, out_gpu, stream)``: page-locked
        host arrays, matching device allocations, and a new CUDA stream.
    """
    binding_ids = (0, 1)
    # Element counts and numpy dtypes, queried per binding.
    sizes = [trt.volume(engine.get_binding_shape(idx)) for idx in binding_ids]
    dtypes = [trt.nptype(engine.get_binding_dtype(idx)) for idx in binding_ids]
    # Page-locked host memory first, then device memory of the same size.
    host_in, host_out = (cuda.pagelocked_empty(count, kind)
                         for count, kind in zip(sizes, dtypes))
    dev_in, dev_out = (cuda.mem_alloc(buf.nbytes) for buf in (host_in, host_out))
    return host_in, host_out, dev_in, dev_out, cuda.Stream()
Example #13
Source File:    From iAI with MIT License 5 votes vote down vote up
def infer(context, input_img, output_size, batch_size):
    """Run one inference pass and return the predictions.

    Args:
        context: Execution context; must provide ``get_engine()`` and
            ``enqueue(batch_size, bindings, stream_handle, event)``.
        input_img: Input array; converted to ``np.float32`` before upload.
        output_size: Size/shape handed to ``np.empty`` for the host output.
        batch_size: Batch size passed to ``enqueue`` and used to scale the
            device allocations.

    Returns:
        The ``np.float32`` host array filled by the device-to-host copy.

    Raises:
        AssertionError: If the engine does not report exactly two bindings.
    """
    # load engine
    engine = context.get_engine()
    assert(engine.get_nb_bindings() == 2)
    # convert input data to Float32
    input_img = input_img.astype(np.float32)
    # create output array to receive data
    output = np.empty(output_size, dtype=np.float32)

    # allocate device memory
    d_input = cuda.mem_alloc(batch_size * input_img.size * input_img.dtype.itemsize)
    d_output = cuda.mem_alloc(batch_size * output.size * output.dtype.itemsize)

    bindings = [int(d_input), int(d_output)]

    stream = cuda.Stream()

    # transfer input data to device
    cuda.memcpy_htod_async(d_input, input_img, stream)
    # execute model
    context.enqueue(batch_size, bindings, stream.handle, None)
    # transfer predictions back
    cuda.memcpy_dtoh_async(output, d_output, stream)

    # synchronize threads: the work above is only queued, so block until the
    # stream has finished before ``output`` is read by the caller.
    stream.synchronize()

    # return predictions
    return output
Example #14
Source File:    From dragon with BSD 2-Clause "Simplified" License 5 votes vote down vote up
def __init__(self, cuda_engine, device_id=0):
    """Create an ``Engine``.

    Parameters
    ----------
    cuda_engine : tensorrt.ICudaEngine
        The built cuda engine.
    device_id : int, optional, default=0
        The index of executing device.

    """
    # Create executing resources.
    self._cuda_engine = cuda_engine
    self._device_id = device_id
    self._context = cuda_engine.create_execution_context()
    self._stream = driver.Stream(0)

    # Create bindings.
    num_binding = self._cuda_engine.num_bindings
    self._bindings = [Binding(cuda_engine, self._context, i, device_id)
                      for i in range(num_binding)]
    self._inputs = [b for b in self._bindings if b.is_input]
    self._outputs = [b for b in self._bindings if not b.is_input]

    # Report the engine info.
    # NOTE(review): the logging callee and the first ``format`` argument
    # were stripped in the source; ``logging.info`` and ``b.name`` are
    # reconstructions -- confirm against the original project.
    logging.info('TensorRT engine built.')
    binding_info = 'InputInfo: {\n'
    for b in self._inputs:
        binding_info += '  * Binding("{}", shape={}, dtype={})\n' \
                        .format(b.name, b.shape, b.dtype)
    logging.info(binding_info + '}')
    binding_info = 'OutputInfo: {\n'
    for b in self._outputs:
        binding_info += '  * Binding("{}", shape={}, dtype={})\n' \
                        .format(b.name, b.shape, b.dtype)
    logging.info(binding_info + '}')
Example #15
Source File:    From Pytorch-Model-to-TensorRT with GNU General Public License v3.0 5 votes vote down vote up
def infer(context, input_img, output_size, batch_size):
    """Run one inference pass and return the predictions.

    Args:
        context: Execution context; must provide ``get_engine()`` and
            ``enqueue(batch_size, bindings, stream_handle, event)``.
        input_img: Input array; converted to ``np.float32`` before upload.
        output_size: Size/shape handed to ``np.empty`` for the host output.
        batch_size: Batch size passed to ``enqueue`` and used to scale the
            device allocations.

    Returns:
        The ``np.float32`` host array filled by the device-to-host copy.

    Raises:
        AssertionError: If the engine does not report exactly two bindings.
    """
    # Load engine
    engine = context.get_engine()
    assert (engine.get_nb_bindings() == 2)
    # Convert input data to Float32
    input_img = input_img.astype(np.float32)
    # Create output array to receive data
    output = np.empty(output_size, dtype=np.float32)

    # Allocate device memory
    d_input = cuda.mem_alloc(batch_size * input_img.nbytes)
    d_output = cuda.mem_alloc(batch_size * output.nbytes)

    bindings = [int(d_input), int(d_output)]

    stream = cuda.Stream()

    # Transfer input data to device
    cuda.memcpy_htod_async(d_input, input_img, stream)
    # Execute model
    context.enqueue(batch_size, bindings, stream.handle, None)
    # Transfer predictions back
    cuda.memcpy_dtoh_async(output, d_output, stream)

    # The three calls above are only queued on the stream; wait for them so
    # the caller never sees a partially written ``output``.
    stream.synchronize()

    # Return predictions
    return output
Example #16
Source File:    From ngraph-python with Apache License 2.0 5 votes vote down vote up
def __init__(self, transformer, comm, op):
    """Set up the event, stream and buffer tables for an all-reduce op.

    Args:
        transformer: Passed to the base initializer; its ``device_id`` is
            stored as an int.
        comm: Stored as ``self.comm``.
        op: The operation; provides ``tensor_description()`` and
            ``device_ids``.
    """
    super(CudaAllReduceKernel, self).__init__(transformer)
    self.op = op
    self.tensor = op.tensor_description()
    self.device_id = int(transformer.device_id)
    self.device_ids = list(map(int, self.op.device_ids))
    # The source chained ``self.event = drv.Event(...) = drv.Stream()``,
    # which is a syntax error (assignment to a call).  Restored as two
    # attributes; ``self.stream`` is the evident stripped target.
    self.event = drv.Event(flags=event_flags.INTERPROCESS | event_flags.DISABLE_TIMING)
    self.stream = drv.Stream()
    self.output_buff_dict = {}
    self.scratch_buff_dict = {}
    self.event_buff_dict = {}
    self.comm = comm
Example #17
Source File:    From object-detection with MIT License 5 votes vote down vote up
def __init__(self):
    """Load the engine and prepare empty buffer lists, a stream and a context."""
    self.colors = np.random.uniform(0, 255, size=(100, 3))
    self.input_shape = INPUT_HW
    self.trt_logger = trt.Logger(trt.Logger.INFO)
    self.engine = self._load_engine()

    self.host_inputs = []
    self.cuda_inputs = []
    self.host_outputs = []
    self.cuda_outputs = []
    # The source had ``self.bindings = [] = cuda.Stream()``, which binds the
    # stream to ``bindings`` and then fails unpacking it into ``[]``.  Split
    # it back into the two assignments it was evidently meant to be.
    self.bindings = []
    self.stream = cuda.Stream()
    self.context = self._create_context()
Example #18
Source File:    From iAI with MIT License 5 votes vote down vote up
def allocate_buffers(engine):
    """Allocate a host/device buffer pair for every binding of ``engine``.

    Args:
        engine: Iterable of bindings that also provides
            ``get_binding_shape``, ``get_binding_dtype``,
            ``binding_is_input`` and ``max_batch_size``.

    Returns:
        Tuple ``(inputs, outputs, bindings, stream)``: two lists of
        ``HostDeviceMem`` pairs, the list of integer device addresses in
        binding order, and a new CUDA stream.
    """
    inputs = []
    outputs = []
    bindings = []
    stream = cuda.Stream()
    for binding in engine:
        size = trt.volume(engine.get_binding_shape(binding)) * engine.max_batch_size
        dtype = trt.nptype(engine.get_binding_dtype(binding))
        # Allocate host and device buffers
        host_mem = cuda.pagelocked_empty(size, dtype)
        device_mem = cuda.mem_alloc(host_mem.nbytes)
        # Append the device buffer to device bindings.  (The statement was
        # missing, so ``bindings`` was always returned empty.)
        bindings.append(int(device_mem))
        # Append to the appropriate list.  Without the ``else`` every input
        # was also recorded as an output and real outputs were dropped.
        if engine.binding_is_input(binding):
            inputs.append(HostDeviceMem(host_mem, device_mem))
        else:
            outputs.append(HostDeviceMem(host_mem, device_mem))
    return inputs, outputs, bindings, stream

# This function is generalized for multiple inputs/outputs.
# inputs and outputs are expected to be lists of HostDeviceMem objects. 
Example #19
Source File:    From iAI with MIT License 5 votes vote down vote up
def allocate_buffers(engine):
    """Create host/device buffers for bindings 0 and 1 plus a CUDA stream.

    Returns:
        Tuple ``(h_input, d_input, h_output, d_output, stream)``.
    """
    def _host_buffer(index):
        # Page-locked host memory (i.e. won't be swapped to disk), sized
        # from the binding's shape and typed by ``ModelData.DTYPE``.
        return cuda.pagelocked_empty(
            trt.volume(engine.get_binding_shape(index)),
            dtype=trt.nptype(ModelData.DTYPE),
        )

    host_in = _host_buffer(0)
    host_out = _host_buffer(1)
    # Device allocations mirror the host buffers byte for byte.
    dev_in = cuda.mem_alloc(host_in.nbytes)
    dev_out = cuda.mem_alloc(host_out.nbytes)
    # Stream in which to copy inputs/outputs and run inference.
    return host_in, dev_in, host_out, dev_out, cuda.Stream()
Example #20
Source File:    From iAI with MIT License 5 votes vote down vote up
def infer(context, input_img, output_size, batch_size):
    """Run one inference pass and return the predictions.

    Args:
        context: Execution context; must provide ``get_engine()`` and
            ``enqueue(batch_size, bindings, stream_handle, event)``.
        input_img: Input array; converted to ``np.float32`` before upload.
        output_size: Size/shape handed to ``np.empty`` for the host output.
        batch_size: Batch size passed to ``enqueue`` and used to scale the
            device allocations.

    Returns:
        The ``np.float32`` host array filled by the device-to-host copy.

    Raises:
        AssertionError: If the engine does not report exactly two bindings.
    """
    # load engine
    engine = context.get_engine()
    assert(engine.get_nb_bindings() == 2)
    # convert input data to Float32
    input_img = input_img.astype(np.float32)
    # create output array to receive data
    output = np.empty(output_size, dtype=np.float32)

    # allocate device memory
    d_input = cuda.mem_alloc(batch_size * input_img.size * input_img.dtype.itemsize)
    d_output = cuda.mem_alloc(batch_size * output.size * output.dtype.itemsize)

    bindings = [int(d_input), int(d_output)]

    stream = cuda.Stream()

    # transfer input data to device
    cuda.memcpy_htod_async(d_input, input_img, stream)
    # execute model
    context.enqueue(batch_size, bindings, stream.handle, None)
    # transfer predictions back
    cuda.memcpy_dtoh_async(output, d_output, stream)

    # The calls above only queue work on the stream; wait for completion so
    # ``output`` is fully written before it is returned.
    stream.synchronize()

    # return predictions
    return output
Example #21
Source File:    From iAI with MIT License 5 votes vote down vote up
def infer(engine, input_img, batch_size):
    """Create an execution context on ``engine``, run it once, return output.

    Args:
        engine: Engine providing ``create_execution_context()``,
            ``get_nb_bindings()`` and ``get_binding_dimensions()``.
        input_img: Input array uploaded as-is (no dtype conversion is done
            here -- assumes it already matches the engine input; TODO confirm).
        batch_size: Batch size passed to ``enqueue`` and used to size buffers.

    Returns:
        Page-locked ``np.float32`` array of ``C * H * W * batch_size``
        elements holding the predictions.

    Raises:
        AssertionError: If the engine does not report exactly two bindings.
    """
    # load engine
    context = engine.create_execution_context()
    assert(engine.get_nb_bindings() == 2)
    # create output array to receive data
    dims = engine.get_binding_dimensions(1).to_DimsCHW()
    elt_count = dims.C() * dims.H() * dims.W() * batch_size
    # Allocate pagelocked memory
    output = cuda.pagelocked_empty(elt_count, dtype=np.float32)

    # allocate device memory
    d_input = cuda.mem_alloc(batch_size * input_img.size * input_img.dtype.itemsize)
    d_output = cuda.mem_alloc(batch_size * output.size * output.dtype.itemsize)

    bindings = [int(d_input), int(d_output)]

    stream = cuda.Stream()

    # transfer input data to device
    cuda.memcpy_htod_async(d_input, input_img, stream)
    # execute model
    context.enqueue(batch_size, bindings, stream.handle, None)
    # transfer predictions back
    cuda.memcpy_dtoh_async(output, d_output, stream)

    # The calls above only queue work on the stream; wait for completion so
    # ``output`` is fully written before it is returned.
    stream.synchronize()

    # return predictions
    return output
Example #22
Source File:    From iAI with MIT License 5 votes vote down vote up
def infer(context, input_img, batch_size):
    """Run one inference pass on ``context`` and return the predictions.

    Args:
        context: Execution context; must provide ``get_engine()`` and
            ``enqueue(batch_size, bindings, stream_handle, event)``.
        input_img: Input array; converted to ``np.float32`` before upload.
        batch_size: Batch size passed to ``enqueue`` and used to size buffers.

    Returns:
        Page-locked ``np.float32`` array of ``C * H * W * batch_size``
        elements holding the predictions.

    Raises:
        AssertionError: If the engine does not report exactly two bindings.
    """
    # load engine
    engine = context.get_engine()
    assert(engine.get_nb_bindings() == 2)
    # create output array to receive data
    dims = engine.get_binding_dimensions(1).to_DimsCHW()
    elt_count = dims.C() * dims.H() * dims.W() * batch_size
    # convert input data to Float32
    input_img = input_img.astype(np.float32)
    # Allocate pagelocked memory
    output = cuda.pagelocked_empty(elt_count, dtype=np.float32)

    # allocate device memory
    d_input = cuda.mem_alloc(batch_size * input_img.size * input_img.dtype.itemsize)
    d_output = cuda.mem_alloc(batch_size * output.size * output.dtype.itemsize)

    bindings = [int(d_input), int(d_output)]

    stream = cuda.Stream()

    # transfer input data to device
    cuda.memcpy_htod_async(d_input, input_img, stream)
    # execute model
    context.enqueue(batch_size, bindings, stream.handle, None)
    # transfer predictions back
    cuda.memcpy_dtoh_async(output, d_output, stream)

    # The calls above only queue work on the stream; wait for completion so
    # ``output`` is fully written before it is returned.
    stream.synchronize()

    # return predictions
    return output
Example #23
Source File:    From iAI with MIT License 5 votes vote down vote up
def inference_image(context, input_img, batch_size):
    """Run one inference pass and return a 1000-element prediction vector.

    Args:
        context: Execution context; must provide ``get_engine()`` and
            ``enqueue(batch_size, bindings, stream_handle, event)``.
        input_img: Input array uploaded as-is (no dtype conversion is done
            here -- assumes it already matches the engine input; TODO confirm).
        batch_size: Batch size passed to ``enqueue`` and used to size buffers.

    Returns:
        ``np.float32`` array of 1000 elements filled by the device-to-host
        copy.

    Raises:
        AssertionError: If the engine does not report exactly two bindings.
    """
    # load engine
    engine = context.get_engine()
    assert(engine.get_nb_bindings() == 2)
    # Queried as in the source; neither value is used further below.
    inp_dims = engine.get_binding_dimensions(0).to_DimsCHW()
    out_dims = engine.get_binding_dimensions(1).to_DimsCHW()
    # output vector size
    output_size = 1000
    # create output array
    output = np.empty(output_size, dtype=np.float32)
    # allocate device memory
    d_input = cuda.mem_alloc(batch_size * input_img.size * input_img.dtype.itemsize)
    d_output = cuda.mem_alloc(batch_size * output.size * output.dtype.itemsize)
    # create input/output bindings
    bindings = [int(d_input), int(d_output)]
    stream = cuda.Stream()
    # transfer input data to device
    cuda.memcpy_htod_async(d_input, input_img, stream)
    # execute model
    context.enqueue(batch_size, bindings, stream.handle, None)
    # transfer predictions
    cuda.memcpy_dtoh_async(output, d_output, stream)
    # synchronize threads: the work above is only queued, so block until the
    # stream has finished before ``output`` is read by the caller.
    stream.synchronize()
    return output
Example #24
Source File:    From iAI with MIT License 5 votes vote down vote up
def infer(engine, input_img, batch_size):
    """Create an execution context on ``engine``, run it once, return output.

    Args:
        engine: Engine providing ``create_execution_context()``,
            ``get_nb_bindings()`` and ``get_binding_dimensions()``.
        input_img: Input array uploaded as-is (no dtype conversion is done
            here -- assumes it already matches the engine input; TODO confirm).
        batch_size: Batch size passed to ``enqueue`` and used to size buffers.

    Returns:
        Page-locked ``np.float32`` array of ``C * H * W * batch_size``
        elements holding the predictions.

    Raises:
        AssertionError: If the engine does not report exactly two bindings.
    """
    # load engine
    context = engine.create_execution_context()
    assert(engine.get_nb_bindings() == 2)

    # create output array to receive data
    dims = engine.get_binding_dimensions(1).to_DimsCHW()
    elt_count = dims.C() * dims.H() * dims.W() * batch_size

    # Allocate pagelocked memory
    output = cuda.pagelocked_empty(elt_count, dtype=np.float32)

    # allocate device memory
    d_input = cuda.mem_alloc(batch_size * input_img.size * input_img.dtype.itemsize)
    d_output = cuda.mem_alloc(batch_size * output.size * output.dtype.itemsize)

    bindings = [int(d_input), int(d_output)]

    stream = cuda.Stream()

    # transfer input data to device
    cuda.memcpy_htod_async(d_input, input_img, stream)
    # execute model
    context.enqueue(batch_size, bindings, stream.handle, None)
    # transfer predictions back
    cuda.memcpy_dtoh_async(output, d_output, stream)

    # The calls above only queue work on the stream; wait for completion so
    # ``output`` is fully written before it is returned.
    stream.synchronize()

    # return predictions
    return output
Example #25
Source File:    From iAI with MIT License 5 votes vote down vote up
def infer(context, input_img, output_size, batch_size):
    """Run one inference pass and return the predictions.

    Args:
        context: Execution context; must provide ``get_engine()`` and
            ``enqueue(batch_size, bindings, stream_handle, event)``.
        input_img: Input array; converted to ``np.float32`` before upload.
        output_size: Size/shape handed to ``np.empty`` for the host output.
        batch_size: Batch size passed to ``enqueue`` and used to scale the
            device allocations.

    Returns:
        The ``np.float32`` host array filled by the device-to-host copy.

    Raises:
        AssertionError: If the engine does not report exactly two bindings.
    """
    # Load engine
    engine = context.get_engine()
    assert(engine.get_nb_bindings() == 2)
    # Convert input data to Float32
    input_img = input_img.astype(np.float32)
    # Create output array to receive data
    output = np.empty(output_size, dtype=np.float32)

    # Allocate device memory
    d_input = cuda.mem_alloc(batch_size * input_img.size * input_img.dtype.itemsize)
    d_output = cuda.mem_alloc(batch_size * output.size * output.dtype.itemsize)

    bindings = [int(d_input), int(d_output)]

    stream = cuda.Stream()

    # Transfer input data to device
    cuda.memcpy_htod_async(d_input, input_img, stream)
    # Execute model
    context.enqueue(batch_size, bindings, stream.handle, None)
    # Transfer predictions back
    cuda.memcpy_dtoh_async(output, d_output, stream)

    # The calls above only queue work on the stream; wait for completion so
    # ``output`` is fully written before it is returned.
    stream.synchronize()

    # Return predictions
    return output
Example #26
Source File:    From iAI with MIT License 5 votes vote down vote up
def infer(context, input_img, output_size, batch_size):
    """Run one inference pass and return the predictions.

    Args:
        context: Execution context; must provide ``get_engine()`` and
            ``enqueue(batch_size, bindings, stream_handle, event)``.
        input_img: Input array; converted to ``np.float32`` before upload.
        output_size: Size/shape handed to ``np.empty`` for the host output.
        batch_size: Batch size passed to ``enqueue`` and used to scale the
            device allocations.

    Returns:
        The ``np.float32`` host array filled by the device-to-host copy.

    Raises:
        AssertionError: If the engine does not report exactly two bindings.
    """
    # load engine
    engine = context.get_engine()
    assert(engine.get_nb_bindings() == 2)
    # convert input data to Float32
    input_img = input_img.astype(np.float32)
    # create output array to receive data
    output = np.empty(output_size, dtype=np.float32)

    # allocate device memory
    d_input = cuda.mem_alloc(batch_size * input_img.size * input_img.dtype.itemsize)
    d_output = cuda.mem_alloc(batch_size * output.size * output.dtype.itemsize)

    bindings = [int(d_input), int(d_output)]

    stream = cuda.Stream()

    # transfer input data to device
    cuda.memcpy_htod_async(d_input, input_img, stream)
    # execute model
    context.enqueue(batch_size, bindings, stream.handle, None)
    # transfer predictions back
    cuda.memcpy_dtoh_async(output, d_output, stream)

    # The calls above only queue work on the stream; wait for completion so
    # ``output`` is fully written before it is returned.
    stream.synchronize()

    # return predictions
    return output
Example #27
Source File:    From iAI with MIT License 5 votes vote down vote up
def infer(context, input_img, output_size, batch_size):
    """Run one inference pass and return the predictions.

    Args:
        context: Execution context; must provide ``get_engine()`` and
            ``enqueue(batch_size, bindings, stream_handle, event)``.
        input_img: Input array; converted to ``np.float32`` before upload.
        output_size: Size/shape handed to ``np.empty`` for the host output.
        batch_size: Batch size passed to ``enqueue`` and used to scale the
            device allocations.

    Returns:
        The ``np.float32`` host array filled by the device-to-host copy.

    Raises:
        AssertionError: If the engine does not report exactly two bindings.
    """
    # load engine
    engine = context.get_engine()
    assert(engine.get_nb_bindings() == 2)
    # convert input data to Float32
    input_img = input_img.astype(np.float32)
    # create output array to receive data
    output = np.empty(output_size, dtype=np.float32)

    # allocate device memory
    d_input = cuda.mem_alloc(batch_size * input_img.size * input_img.dtype.itemsize)
    d_output = cuda.mem_alloc(batch_size * output.size * output.dtype.itemsize)

    bindings = [int(d_input), int(d_output)]

    stream = cuda.Stream()

    # transfer input data to device
    cuda.memcpy_htod_async(d_input, input_img, stream)
    # execute model
    context.enqueue(batch_size, bindings, stream.handle, None)
    # transfer predictions back
    cuda.memcpy_dtoh_async(output, d_output, stream)

    # The calls above only queue work on the stream; wait for completion so
    # ``output`` is fully written before it is returned.
    stream.synchronize()

    # return predictions
    return output
Example #28
Source File:    From iAI with MIT License 5 votes vote down vote up
def infer(engine, input_img, batch_size):
    """Create an execution context on ``engine``, run it once, return output.

    Args:
        engine: Engine providing ``create_execution_context()``,
            ``get_nb_bindings()`` and ``get_binding_dimensions()``.
        input_img: Input array uploaded as-is (no dtype conversion is done
            here -- assumes it already matches the engine input; TODO confirm).
        batch_size: Batch size passed to ``enqueue`` and used to size buffers.

    Returns:
        Page-locked ``np.float32`` array of ``C * H * W * batch_size``
        elements holding the predictions.

    Raises:
        AssertionError: If the engine does not report exactly two bindings.
    """
    # load engine
    context = engine.create_execution_context()
    assert(engine.get_nb_bindings() == 2)
    # create output array to receive data
    dims = engine.get_binding_dimensions(1).to_DimsCHW()
    elt_count = dims.C() * dims.H() * dims.W() * batch_size
    # Allocate pagelocked memory
    output = cuda.pagelocked_empty(elt_count, dtype=np.float32)

    # allocate device memory
    d_input = cuda.mem_alloc(batch_size * input_img.size * input_img.dtype.itemsize)
    d_output = cuda.mem_alloc(batch_size * output.size * output.dtype.itemsize)

    bindings = [int(d_input), int(d_output)]

    stream = cuda.Stream()

    # transfer input data to device
    cuda.memcpy_htod_async(d_input, input_img, stream)
    # execute model
    context.enqueue(batch_size, bindings, stream.handle, None)
    # transfer predictions back
    cuda.memcpy_dtoh_async(output, d_output, stream)

    # The calls above only queue work on the stream; wait for completion so
    # ``output`` is fully written before it is returned.
    stream.synchronize()

    # return predictions
    return output
Example #29
Source File:    From iAI with MIT License 5 votes vote down vote up
def allocate_buffers(engine):
    """Build the host and device buffers for bindings 0 and 1, plus a stream.

    Returns:
        Tuple ``(h_input, d_input, h_output, d_output, stream)``.
    """
    def _pinned_for(binding_index):
        # Page-locked host array (i.e. won't be swapped to disk) with one
        # element per entry of the binding's shape.
        element_count = trt.volume(engine.get_binding_shape(binding_index))
        return cuda.pagelocked_empty(element_count, dtype=trt.nptype(ModelData.DTYPE))

    pinned_in = _pinned_for(0)
    pinned_out = _pinned_for(1)
    # One device allocation per host buffer, same byte size.
    gpu_in = cuda.mem_alloc(pinned_in.nbytes)
    gpu_out = cuda.mem_alloc(pinned_out.nbytes)
    # Stream used for copying inputs/outputs and running inference.
    work_stream = cuda.Stream()
    return pinned_in, gpu_in, pinned_out, gpu_out, work_stream
Example #30
Source File:    From iAI with MIT License 5 votes vote down vote up
def infer(context, input_img, batch_size):
    """Run one inference pass on ``context`` and return the predictions.

    Args:
        context: Execution context; must provide ``get_engine()`` and
            ``enqueue(batch_size, bindings, stream_handle, event)``.
        input_img: Input array; converted to ``np.float32`` before upload.
        batch_size: Batch size passed to ``enqueue`` and used to size buffers.

    Returns:
        Page-locked ``np.float32`` array of ``C * H * W * batch_size``
        elements holding the predictions.

    Raises:
        AssertionError: If the engine does not report exactly two bindings.
    """
    # load engine
    engine = context.get_engine()
    assert(engine.get_nb_bindings() == 2)
    # create output array to receive data
    dims = engine.get_binding_dimensions(1).to_DimsCHW()
    elt_count = dims.C() * dims.H() * dims.W() * batch_size
    # convert input data to Float32
    input_img = input_img.astype(np.float32)
    # Allocate pagelocked memory
    output = cuda.pagelocked_empty(elt_count, dtype=np.float32)

    # allocate device memory
    d_input = cuda.mem_alloc(batch_size * input_img.size * input_img.dtype.itemsize)
    d_output = cuda.mem_alloc(batch_size * output.size * output.dtype.itemsize)

    bindings = [int(d_input), int(d_output)]

    stream = cuda.Stream()

    # transfer input data to device
    cuda.memcpy_htod_async(d_input, input_img, stream)
    # execute model
    context.enqueue(batch_size, bindings, stream.handle, None)
    # transfer predictions back
    cuda.memcpy_dtoh_async(output, d_output, stream)

    # The calls above only queue work on the stream; wait for completion so
    # ``output`` is fully written before it is returned.
    stream.synchronize()

    # return predictions
    return output