Converting models to ONNX format
ONNX (Open Neural Network Exchange) is an open format for representing ML models, not tied to a specific framework. Converted models can be run on ONNX Runtime, TensorRT, OpenVINO, and CoreML—on CPUs, GPUs, NPUs, and mobile devices.
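For example, ONNX Runtime selects the first available backend from a providers list, so the same .onnx file can target TensorRT, CUDA, or plain CPU without changes. A minimal sketch (model.onnx is a placeholder path):

import onnxruntime as ort

# Providers are tried in order; onnxruntime falls back to the next
# entry if a backend is not available on this machine.
session = ort.InferenceSession(
    "model.onnx",  # placeholder path
    providers=[
        "TensorrtExecutionProvider",  # NVIDIA TensorRT
        "CUDAExecutionProvider",      # plain CUDA GPU
        "CPUExecutionProvider",       # always-available fallback
    ],
)
print(session.get_providers())  # providers actually in use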
Converting a Hugging Face model via Optimum
# Install
pip install optimum[onnxruntime]

# Export via the CLI
optimum-cli export onnx \
  --model bert-base-uncased \
  --task text-classification \
  --opset 17 \
  --device cuda \
  --fp16 \
  ./bert-onnx/
from optimum.onnxruntime import ORTModelForSequenceClassification
from transformers import AutoTokenizer

# Load and convert automatically
model = ORTModelForSequenceClassification.from_pretrained(
    "cardiffnlp/twitter-roberta-base-sentiment",
    export=True,
    provider="CUDAExecutionProvider",
)
tokenizer = AutoTokenizer.from_pretrained("cardiffnlp/twitter-roberta-base-sentiment")

# Usage is identical to a regular HF model
inputs = tokenizer("Great product!", return_tensors="pt")
outputs = model(**inputs)
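The converted model can be persisted with save_pretrained so the export happens only once, and it drops straight into a transformers pipeline. A short sketch continuing the snippet above (the output directory name is arbitrary):

from transformers import pipeline

# Persist the ONNX weights so export=True is not needed next time
model.save_pretrained("./roberta-onnx")      # arbitrary directory name
tokenizer.save_pretrained("./roberta-onnx")

# ORT models are drop-in replacements inside transformers pipelines
classifier = pipeline("text-classification", model=model, tokenizer=tokenizer)
print(classifier("Great product!"))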
Converting a PyTorch model directly
import torch

class TextClassifier(torch.nn.Module):
    def __init__(self, vocab_size, num_classes):
        super().__init__()
        self.embedding = torch.nn.EmbeddingBag(vocab_size, 128)
        self.fc = torch.nn.Linear(128, num_classes)

    def forward(self, input_ids, offsets):
        x = self.embedding(input_ids, offsets)
        return self.fc(x)

model = TextClassifier(10000, 3)
model.eval()

dummy_input = (
    torch.randint(0, 10000, (32,)),  # input_ids: 32 tokens in one flat tensor
    torch.tensor([0, 16])            # offsets: two bags starting at 0 and 16
)

torch.onnx.export(
    model,
    dummy_input,
    "text_classifier.onnx",
    export_params=True,
    opset_version=17,
    do_constant_folding=True,  # fold constant expressions at export time
    input_names=["input_ids", "offsets"],
    output_names=["logits"],
    dynamic_axes={
        "input_ids": {0: "num_tokens"},
        "offsets": {0: "batch_size"},   # without this, batch size is baked to 2
        "logits": {0: "batch_size"},
    },
)
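A quick smoke test with shapes different from the dummy input confirms that the dynamic axes actually work. A sketch running the exported file with onnxruntime directly:

import numpy as np
import onnxruntime as ort

sess = ort.InferenceSession("text_classifier.onnx")

# 50 tokens split into 3 bags — both shapes differ from the dummy input
feed = {
    "input_ids": np.random.randint(0, 10000, size=(50,), dtype=np.int64),
    "offsets": np.array([0, 10, 35], dtype=np.int64),
}
logits = sess.run(["logits"], feed)[0]
print(logits.shape)  # (3, 3): batch of 3 bags, num_classes = 3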
ONNX graph optimization
from onnxruntime.transformers import optimizer
from onnxruntime.transformers.fusion_options import FusionOptions

# Automatic optimization for transformer models
opt_options = FusionOptions("bert")
opt_options.enable_gelu = True
opt_options.enable_layer_norm = True
opt_options.enable_attention = True
opt_options.enable_skip_layer_norm = True

optimized_model = optimizer.optimize_model(
    "bert.onnx",
    model_type="bert",
    num_heads=12,
    hidden_size=768,
    optimization_options=opt_options,
    opt_level=2,  # 0=none, 1=basic, 2=extended, 99=all
    use_gpu=True,
    only_onnxruntime=False
)
optimized_model.save_model_to_file("bert_optimized.onnx")
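Offline optimization is not the only route: ONNX Runtime can also optimize the graph when the session is created and cache the result to disk. A sketch:

import onnxruntime as ort

so = ort.SessionOptions()
so.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL
# Serialize the optimized graph so the work is done once, not per process
so.optimized_model_filepath = "bert_ort_optimized.onnx"

session = ort.InferenceSession("bert.onnx", sess_options=so)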
Verifying the correctness of the conversion
import onnx
import onnxruntime as ort
import numpy as np
import torch

# Validate the model structure
model = onnx.load("bert_optimized.onnx")
onnx.checker.check_model(model)

# Compare outputs against the original PyTorch model
# (pt_model and inputs come from the earlier model/tokenizer setup)
pt_model.eval()
with torch.no_grad():
    pt_output = pt_model(**inputs).logits.numpy()

ort_session = ort.InferenceSession("bert_optimized.onnx")
ort_output = ort_session.run(None, {k: v.numpy() for k, v in inputs.items()})[0]

# Check numerical agreement
np.testing.assert_allclose(pt_output, ort_output, rtol=1e-3, atol=1e-4)
print("✓ ONNX output matches PyTorch output")
Common conversion problems
Dynamic control flow: if len(x) > 0: inside forward — tracing-based export records only the branch taken by the dummy input, so the other branch silently disappears from the graph. Solution: rewrite branches as masking (e.g. torch.where) or export through torch.jit.script, which preserves control flow; see the sketch after this list.
Custom operators: operators with no ONNX equivalent. Solution: register a custom operator or rewrite the layer using standard operators.
Dynamic shapes: some operations require static dimensions. Solution: specify dynamic_axes correctly, or fix the dimensions and pad inputs to match.
Numerical precision: error accumulation in long chains of operations at FP16. Solution: export in FP32 first, then quantize or cast to FP16 as a separate, validated step.
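As a concrete illustration of the first problem, here is a minimal sketch (module and file names are made up for the example) where torch.where keeps both branches in the exported graph, whereas a traced if would bake in only one:

import torch

class Branchy(torch.nn.Module):
    # A traced `if x.sum() > 0: ...` would export only the branch
    # taken by the dummy input; torch.where keeps both paths.
    def forward(self, x):
        return torch.where(x.sum() > 0, x * 2, x - 1)

torch.onnx.export(
    Branchy().eval(),
    (torch.randn(4),),
    "branchy.onnx",  # hypothetical output path
    opset_version=17,
)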