Model Conversion to TensorRT Format

We design and deploy artificial intelligence systems: from prototype to production-ready solutions. Our team combines expertise in machine learning, data engineering and MLOps to make AI work not in the lab, but in real business.
Showing 1 of 1 servicesAll 1566 services
Model Conversion to TensorRT Format
Medium
from 1 business day to 3 business days
FAQ
AI Development Areas
AI Solution Development Stages
Latest works
  • image_website-b2b-advance_0.png
    B2B ADVANCE company website development
    1212
  • image_web-applications_feedme_466_0.webp
    Development of a web application for FEEDME
    1161
  • image_websites_belfingroup_462_0.webp
    Website development for BELFINGROUP
    852
  • image_ecommerce_furnoro_435_0.webp
    Development of an online store for the company FURNORO
    1041
  • image_logo-advance_0.png
    B2B Advance company logo design
    561
  • image_crm_enviok_479_0.webp
    Development of a web application for Enviok
    822

Converting models to TensorRT

TensorRT is an NVIDIA SDK for optimizing neural network models on NVIDIA GPUs. It compiles the model into an optimized plan tailored to the specific GPU: it combines operations into kernels, selects optimal algorithms, and applies precision optimizations. The result: 2–8x speedup compared to FP32 PyTorch.

Converting from ONNX to TensorRT

import tensorrt as trt

TRT_LOGGER = trt.Logger(trt.Logger.WARNING)

def build_engine(onnx_path: str, engine_path: str, fp16: bool = True, int8: bool = False):
    builder = trt.Builder(TRT_LOGGER)
    network = builder.create_network(
        1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
    )
    parser = trt.OnnxParser(network, TRT_LOGGER)

    with open(onnx_path, "rb") as f:
        if not parser.parse(f.read()):
            for i in range(parser.num_errors):
                print(f"ONNX parse error: {parser.get_error(i)}")
            raise RuntimeError("Failed to parse ONNX")

    config = builder.create_builder_config()
    config.set_memory_pool_limit(trt.MemoryPoolType.WORKSPACE, 4 * 1024 ** 3)  # 4 GB

    if fp16 and builder.platform_has_fast_fp16:
        config.set_flag(trt.BuilderFlag.FP16)

    if int8:
        config.set_flag(trt.BuilderFlag.INT8)
        # Нужен calibrator для INT8
        config.int8_calibrator = MyCalibrator(calibration_data)

    # Dynamic shapes — важно для переменной длины входа
    profile = builder.create_optimization_profile()
    profile.set_shape(
        "input_ids",
        min=(1, 1),          # минимальный размер
        opt=(8, 128),        # оптимальный (для тюнинга kernels)
        max=(32, 512)        # максимальный
    )
    config.add_optimization_profile(profile)

    # Компиляция (долго: 5-30 минут)
    serialized_engine = builder.build_serialized_network(network, config)

    with open(engine_path, "wb") as f:
        f.write(serialized_engine)

    print(f"Engine saved to {engine_path}")

Inference with TensorRT

import tensorrt as trt
import numpy as np
import pycuda.driver as cuda
import pycuda.autoinit

class TRTInferenceSession:
    def __init__(self, engine_path: str):
        runtime = trt.Runtime(TRT_LOGGER)
        with open(engine_path, "rb") as f:
            self.engine = runtime.deserialize_cuda_engine(f.read())
        self.context = self.engine.create_execution_context()

        # Выделение GPU буферов
        self.inputs = []
        self.outputs = []
        self.bindings = []
        for binding in self.engine:
            shape = self.engine.get_tensor_shape(binding)
            size = trt.volume(shape) * np.dtype(np.float32).itemsize
            device_mem = cuda.mem_alloc(size)
            self.bindings.append(int(device_mem))
            if self.engine.get_tensor_mode(binding) == trt.TensorIOMode.INPUT:
                self.inputs.append({"name": binding, "mem": device_mem, "shape": shape})
            else:
                self.outputs.append({"name": binding, "mem": device_mem, "shape": shape})

        self.stream = cuda.Stream()

    def infer(self, inputs: dict[str, np.ndarray]) -> dict[str, np.ndarray]:
        # Копирование входов на GPU
        for inp in self.inputs:
            data = inputs[inp["name"]].astype(np.float32).ravel()
            cuda.memcpy_htod_async(inp["mem"], data, self.stream)

        # Запуск inference
        self.context.execute_async_v2(self.bindings, self.stream.handle)

        # Копирование результатов с GPU
        results = {}
        for out in self.outputs:
            output = np.empty(out["shape"], dtype=np.float32)
            cuda.memcpy_dtoh_async(output, out["mem"], self.stream)
            results[out["name"]] = output

        self.stream.synchronize()
        return results

INT8 Calibration

INT8 requires calibration data to determine the value ranges:

class BertCalibrator(trt.IInt8EntropyCalibrator2):
    def __init__(self, calibration_texts: list[str], cache_file: str = "calibration.cache"):
        super().__init__()
        self.tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
        self.cache_file = cache_file
        self.data = iter(
            self.tokenizer(calibration_texts, padding="max_length",
                           truncation=True, max_length=128, return_tensors="np")
        )
        self.device_input = cuda.mem_alloc(128 * 4)  # input_ids buffer

    def get_batch_size(self) -> int:
        return 16

    def get_batch(self, names: list[str]) -> list | None:
        try:
            batch = {k: next(self.data) for k in ["input_ids", "attention_mask"]}
            cuda.memcpy_htod(self.device_input, batch["input_ids"].astype(np.int32).ravel())
            return [int(self.device_input)]
        except StopIteration:
            return None

    def read_calibration_cache(self) -> bytes | None:
        if os.path.exists(self.cache_file):
            with open(self.cache_file, "rb") as f:
                return f.read()
        return None

    def write_calibration_cache(self, cache: bytes) -> None:
        with open(self.cache_file, "wb") as f:
            f.write(cache)

Using torch2trt and torch-tensorrt

A simpler way for PyTorch models:

import torch_tensorrt

# Компиляция torch модели напрямую
trt_model = torch_tensorrt.compile(
    model,
    inputs=[
        torch_tensorrt.Input(
            min_shape=[1, 1],
            opt_shape=[8, 128],
            max_shape=[32, 512],
            dtype=torch.int32
        )
    ],
    enabled_precisions={torch.float16},
    workspace_size=4 * 1024 ** 3,
    truncate_long_and_double=True
)

# Сохранение
torch.jit.save(trt_model, "bert_trt.ts")

Typical performance gains

On T4 GPU, BERT-base, batch=8, seq=128:

Mode Latency Speedup
PyTorch FP32 12.3 ms 1x
PyTorch FP16 6.8 ms 1.8x
TensorRT FP16 2.9 ms 4.2x
TensorRT INT8 1.8 ms 6.8x