Example inference code to run from a TensorRT engine file (version 10.0.x)
I have recently been trying to use TensorRT 10.0 to optimize the CLIP model. I found that the Python API of TensorRT 10.0 has changed significantly and is not compatible with version 8, and it is not easy to find a complete example in the official documentation. Therefore, I am sharing a piece of Python code that runs successfully for me.
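In case it is useful: the engine file loaded below can be built from an ONNX export of the model. Here is a minimal sketch with the TensorRT 10 Python API; the file names are placeholders, and you may want to add precision flags or an optimization profile to the builder config for your own model.

import tensorrt as trt

def build_engine(onnx_path, engine_path):
    logger = trt.Logger(trt.Logger.WARNING)
    builder = trt.Builder(logger)
    network = builder.create_network(0)  # explicit batch is the default in TensorRT 10
    parser = trt.OnnxParser(network, logger)
    with open(onnx_path, "rb") as f:
        if not parser.parse(f.read()):
            raise RuntimeError(parser.get_error(0))
    config = builder.create_builder_config()
    serialized_engine = builder.build_serialized_network(network, config)
    with open(engine_path, "wb") as f:
        f.write(serialized_engine)

The inference code itself: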
import numpy as np
import pycuda.driver as cuda
import pycuda.autoinit
import tensorrt as trt
import time
import torch
from PIL import Image
class TensorRTInference:
    def __init__(self, engine_path):
        self.logger = trt.Logger(trt.Logger.ERROR)
        self.runtime = trt.Runtime(self.logger)
        self.engine = self.load_engine(engine_path)
        self.context = self.engine.create_execution_context()
        # Allocate buffers
        self.inputs, self.outputs, self.bindings, self.stream = self.allocate_buffers(self.engine)
    def load_engine(self, engine_path):
        with open(engine_path, "rb") as f:
            engine = self.runtime.deserialize_cuda_engine(f.read())
        return engine
    class HostDeviceMem:
        def __init__(self, host_mem, device_mem):
            self.host = host_mem
            self.device = device_mem
    def allocate_buffers(self, engine):
        inputs, outputs, bindings = [], [], []
        stream = cuda.Stream()
        for i in range(engine.num_io_tensors):
            tensor_name = engine.get_tensor_name(i)
            size = trt.volume(engine.get_tensor_shape(tensor_name))
            dtype = trt.nptype(engine.get_tensor_dtype(tensor_name))
            # Allocate host and device buffers
            host_mem = cuda.pagelocked_empty(size, dtype)
            device_mem = cuda.mem_alloc(host_mem.nbytes)
            # Append the device buffer address to device bindings
            bindings.append(int(device_mem))
            # Append to the appropriate input/output list
            if engine.get_tensor_mode(tensor_name) == trt.TensorIOMode.INPUT:
                inputs.append(self.HostDeviceMem(host_mem, device_mem))
            else:
                outputs.append(self.HostDeviceMem(host_mem, device_mem))
        return inputs, outputs, bindings, stream
    def infer(self, input_data):
        # Transfer input data to device
        np.copyto(self.inputs[0].host, input_data.ravel())
        cuda.memcpy_htod_async(self.inputs[0].device, self.inputs[0].host, self.stream)
        # Set tensor address
        for i in range(self.engine.num_io_tensors):
            self.context.set_tensor_address(self.engine.get_tensor_name(i), self.bindings[i])
        # Run inference
        self.context.execute_async_v3(stream_handle=self.stream.handle)
        # Transfer predictions back
        cuda.memcpy_dtoh_async(self.outputs[0].host, self.outputs[0].device, self.stream)
        # Synchronize the stream
        self.stream.synchronize()
        return self.outputs[0].host
if __name__ == "__main__":
    engine_path = "FILE_PATH_TO_YOUR_TRT_ENGINE"
    trt_inference = TensorRTInference(engine_path)
    img = Image.open("FILE_PATH_TO_YOUR_INPUT_IMAGE")
    img_array = np.array(img)
    inputs = img_array.transpose(2, 0, 1)  # (3, 224, 224)
    # Run inference
    output_data = trt_inference.infer(inputs)
    # output_data in (1280,) shape
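One thing to note: the snippet above passes the raw image array to the engine. If the exported CLIP vision encoder does not embed its own preprocessing, you will likely need to apply the usual CLIP preprocessing before calling infer(). A rough sketch, assuming a 224x224 input resolution and the OpenAI CLIP normalization constants (adjust these to whatever your checkpoint was trained with):

def preprocess(image_path):
    # Resize to the model's input resolution and convert to normalized CHW float32.
    img = Image.open(image_path).convert("RGB").resize((224, 224), Image.BICUBIC)
    x = np.asarray(img, dtype=np.float32) / 255.0  # HWC, scaled to [0, 1]
    mean = np.array([0.48145466, 0.4578275, 0.40821073], dtype=np.float32)
    std = np.array([0.26862954, 0.26130258, 0.27577711], dtype=np.float32)
    x = (x - mean) / std  # per-channel normalization
    return x.transpose(2, 0, 1)  # (3, 224, 224)

# output_data = trt_inference.infer(preprocess("FILE_PATH_TO_YOUR_INPUT_IMAGE"))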