Converting models to TensorRT
TensorRT is an NVIDIA SDK for optimizing neural network models on NVIDIA GPUs. It compiles the model into an optimized plan tailored to the specific GPU: it combines operations into kernels, selects optimal algorithms, and applies precision optimizations. The result: 2–8x speedup compared to FP32 PyTorch.
Converting from ONNX to TensorRT
import tensorrt as trt
# Module-level logger shared by the builder, ONNX parser and runtime below.
# WARNING level suppresses verbose build logs while keeping real problems
# (precision fallbacks, tactic failures) visible.
TRT_LOGGER = trt.Logger(trt.Logger.WARNING)
def build_engine(onnx_path: str, engine_path: str, fp16: bool = True, int8: bool = False):
builder = trt.Builder(TRT_LOGGER)
network = builder.create_network(
1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
)
parser = trt.OnnxParser(network, TRT_LOGGER)
with open(onnx_path, "rb") as f:
if not parser.parse(f.read()):
for i in range(parser.num_errors):
print(f"ONNX parse error: {parser.get_error(i)}")
raise RuntimeError("Failed to parse ONNX")
config = builder.create_builder_config()
config.set_memory_pool_limit(trt.MemoryPoolType.WORKSPACE, 4 * 1024 ** 3) # 4 GB
if fp16 and builder.platform_has_fast_fp16:
config.set_flag(trt.BuilderFlag.FP16)
if int8:
config.set_flag(trt.BuilderFlag.INT8)
# Нужен calibrator для INT8
config.int8_calibrator = MyCalibrator(calibration_data)
# Dynamic shapes — важно для переменной длины входа
profile = builder.create_optimization_profile()
profile.set_shape(
"input_ids",
min=(1, 1), # минимальный размер
opt=(8, 128), # оптимальный (для тюнинга kernels)
max=(32, 512) # максимальный
)
config.add_optimization_profile(profile)
# Компиляция (долго: 5-30 минут)
serialized_engine = builder.build_serialized_network(network, config)
with open(engine_path, "wb") as f:
f.write(serialized_engine)
print(f"Engine saved to {engine_path}")
Inference with TensorRT
import tensorrt as trt
import numpy as np
import pycuda.driver as cuda
import pycuda.autoinit
class TRTInferenceSession:
    """Minimal inference wrapper around a serialized TensorRT engine.

    Loads the engine, pre-allocates one device buffer per I/O tensor, and
    exposes a dict-in / dict-out ``infer`` call.

    NOTE(review): buffer sizes come from ``engine.get_tensor_shape`` at load
    time, so this assumes all shapes are static — for engines built with
    dynamic shapes the context shape must be set first; confirm before use.
    """

    def __init__(self, engine_path: str):
        runtime = trt.Runtime(TRT_LOGGER)
        with open(engine_path, "rb") as f:
            self.engine = runtime.deserialize_cuda_engine(f.read())
        self.context = self.engine.create_execution_context()

        # Allocate GPU buffers for every I/O tensor up front.
        self.inputs = []
        self.outputs = []
        self.bindings = []
        for name in self.engine:
            shape = self.engine.get_tensor_shape(name)
            # Use the tensor's declared dtype (e.g. int32 for input_ids)
            # instead of assuming float32 for every binding — a float32
            # assumption both mis-sizes buffers and corrupts integer inputs.
            dtype = trt.nptype(self.engine.get_tensor_dtype(name))
            nbytes = trt.volume(shape) * np.dtype(dtype).itemsize
            device_mem = cuda.mem_alloc(nbytes)
            self.bindings.append(int(device_mem))
            entry = {"name": name, "mem": device_mem, "shape": shape, "dtype": dtype}
            if self.engine.get_tensor_mode(name) == trt.TensorIOMode.INPUT:
                self.inputs.append(entry)
            else:
                self.outputs.append(entry)
        self.stream = cuda.Stream()

    def infer(self, inputs: dict[str, np.ndarray]) -> dict[str, np.ndarray]:
        """Run one inference pass.

        Args:
            inputs: mapping of engine input names to host arrays; each array
                is converted to the binding's dtype before the copy.

        Returns:
            Mapping of engine output names to freshly-allocated host arrays.
        """
        # Host -> device copies for all inputs (async on our stream).
        for inp in self.inputs:
            host = np.ascontiguousarray(inputs[inp["name"]].astype(inp["dtype"]).ravel())
            cuda.memcpy_htod_async(inp["mem"], host, self.stream)
        # Enqueue execution on the same stream as the copies.
        self.context.execute_async_v2(self.bindings, self.stream.handle)
        # Device -> host copies for all outputs.
        results = {}
        for out in self.outputs:
            host_out = np.empty(out["shape"], dtype=out["dtype"])
            cuda.memcpy_dtoh_async(host_out, out["mem"], self.stream)
            results[out["name"]] = host_out
        # Block until the kernel and all copies have finished.
        self.stream.synchronize()
        return results
INT8 Calibration
INT8 requires calibration data to determine the value ranges:
class BertCalibrator(trt.IInt8EntropyCalibrator2):
    """Supplies tokenized text batches to TensorRT's INT8 entropy calibrator.

    Texts are tokenized once up front; ``get_batch`` then serves consecutive
    slices of ``BATCH_SIZE`` sequences until the data runs out.
    """

    BATCH_SIZE = 16
    SEQ_LEN = 128

    def __init__(self, calibration_texts: list[str], cache_file: str = "calibration.cache"):
        super().__init__()
        self.tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
        self.cache_file = cache_file
        # Tokenize everything once: input_ids has shape (n_texts, SEQ_LEN).
        # (The original iterated the BatchEncoding itself, which yields its
        # KEYS — strings — not arrays, so every batch was garbage.)
        encodings = self.tokenizer(
            calibration_texts, padding="max_length",
            truncation=True, max_length=self.SEQ_LEN, return_tensors="np",
        )
        self.input_ids = encodings["input_ids"].astype(np.int32)
        self.batch_idx = 0
        # Device buffer must hold a FULL batch of int32 ids
        # (BATCH_SIZE * SEQ_LEN * 4 bytes), not a single sequence.
        self.device_input = cuda.mem_alloc(self.BATCH_SIZE * self.SEQ_LEN * 4)

    def get_batch_size(self) -> int:
        # TensorRT calls this once; every get_batch() must match this size.
        return self.BATCH_SIZE

    def get_batch(self, names: list[str]) -> list | None:
        """Return device pointers for the next batch, or None when exhausted."""
        start = self.batch_idx * self.BATCH_SIZE
        batch = self.input_ids[start:start + self.BATCH_SIZE]
        if batch.shape[0] < self.BATCH_SIZE:
            return None  # no full batch left -> calibration is done
        self.batch_idx += 1
        cuda.memcpy_htod(self.device_input, np.ascontiguousarray(batch.ravel()))
        return [int(self.device_input)]

    def read_calibration_cache(self) -> bytes | None:
        """Reuse a previous calibration run if its cache file exists."""
        if os.path.exists(self.cache_file):
            with open(self.cache_file, "rb") as f:
                return f.read()
        return None

    def write_calibration_cache(self, cache: bytes) -> None:
        """Persist the computed dynamic ranges for future builds."""
        with open(self.cache_file, "wb") as f:
            f.write(cache)
Using torch2trt and torch-tensorrt
A simpler route for PyTorch models is to compile them directly, skipping the explicit ONNX export step. The example below uses torch-tensorrt (torch2trt offers a similar, older API):
import torch_tensorrt

# Compile the torch model directly — no explicit ONNX export step.
trt_model = torch_tensorrt.compile(
    model,
    inputs=[
        torch_tensorrt.Input(
            min_shape=[1, 1],    # smallest accepted (batch, seq_len)
            opt_shape=[8, 128],  # shape the kernels are tuned for
            max_shape=[32, 512], # largest accepted (batch, seq_len)
            dtype=torch.int32
        )
    ],
    enabled_precisions={torch.float16},   # allow FP16 kernels
    workspace_size=4 * 1024 ** 3,         # 4 GB build workspace
    # Demote int64/float64 weights to int32/float32 (TensorRT has no 64-bit kernels).
    truncate_long_and_double=True
)

# Save as a TorchScript module.
torch.jit.save(trt_model, "bert_trt.ts")
Typical performance gains
On T4 GPU, BERT-base, batch=8, seq=128:
| Mode | Latency | Speedup |
|---|---|---|
| PyTorch FP32 | 12.3 ms | 1x |
| PyTorch FP16 | 6.8 ms | 1.8x |
| TensorRT FP16 | 2.9 ms | 4.2x |
| TensorRT INT8 | 1.8 ms | 6.8x |







