Development of a versioning system for AI agents
AI agent versioning is the management of changes to prompts, code, configuration, and agent dependencies. Without versioning, it's impossible to roll back in the event of degradation, conduct A/B testing, or ensure reproducibility.
What is versioned in an AI agent?
The agent consists of several artifacts, each requiring versioning:
- System prompt — main prompt with agent instructions
- Tool definitions — descriptions of tools (changing them changes the agent's behavior)
- Few-shot examples — if any
- Configuration — model, temperature, max_tokens, timeouts
- Code — agent logic, orchestration, postprocessing
- Dependencies — external APIs, library versions
Agent version structure
from pydantic import BaseModel
from datetime import datetime
class AgentVersion(BaseModel):
    """Snapshot of the artifacts and metadata that make up one agent version.

    The optional metadata fields (``base_version`` and
    ``evaluation_results``) default to ``None`` so that a first version, or a
    version that has not been evaluated yet, can be created without passing
    them explicitly. Under pydantic v2 an ``X | None`` annotation without a
    default is a *required* field, so the defaults are spelled out.
    """

    agent_name: str
    version: str  # semver: 1.2.3
    created_at: datetime
    created_by: str
    change_description: str

    # Artifacts
    # NOTE(review): nothing in this model checks that the stored hashes match
    # the stored content; presumably the caller computes them - confirm.
    system_prompt_hash: str  # SHA-256 of the prompt
    system_prompt: str
    tool_definitions_hash: str
    tool_definitions: list[dict]
    config: dict  # model, temperature, etc.

    # Metadata
    base_version: str | None = None  # version this one was derived from
    tags: list[str]  # prod, staging, experiment
    evaluation_results: dict | None = None  # quality metrics of this version
class AgentRegistry:
    """Stores agent versions in a database and manages their tags."""

    def __init__(self, db: Database):
        """Keep the database handle that every registry operation goes through."""
        self.db = db

    def register(self, version: AgentVersion) -> str:
        """Store a new agent version and return its version string.

        Raises:
            ValueError: if the agent already has a version with the same
                version string.
        """
        # The version string must be unique per agent.
        if self.db.exists(agent_name=version.agent_name, version=version.version):
            raise ValueError(f"Version {version.version} already exists")
        self.db.insert(version)
        return version.version

    def get(self, agent_name: str, version: str = "latest") -> AgentVersion:
        """Return the requested version of an agent.

        The special value ``"latest"`` is resolved through
        ``db.get_latest_tagged(agent_name, tag="prod")``; any other value is
        looked up as an exact version string.
        """
        if version == "latest":
            return self.db.get_latest_tagged(agent_name, tag="prod")
        return self.db.get(agent_name=agent_name, version=version)

    def promote(self, agent_name: str, version: str, from_tag: str, to_tag: str):
        """Move a tag onto the given version (e.g. staging -> prod).

        ``from_tag`` is only used in the log message; the tag itself is not
        removed from the version.

        Raises:
            ValueError: if ``version`` is not registered for ``agent_name``.
                The check runs before any tag is touched, so a mistyped
                version cannot leave the agent without a ``to_tag`` version.
        """
        if not self.db.exists(agent_name=agent_name, version=version):
            raise ValueError(
                f"Version {version} of {agent_name} does not exist; "
                f"tag {to_tag} left unchanged"
            )
        self.db.remove_tag(agent_name, to_tag)  # drop the tag from its current holder
        self.db.add_tag(agent_name, version, to_tag)
        # NOTE(review): `logger` is not defined in this snippet; presumably a
        # module-level logger - confirm.
        logger.info(f"Promoted {agent_name} v{version} from {from_tag} to {to_tag}")
Git-based versioning of prompts
Prompts are stored in a Git repository, which provides change history, code review, and blame:
agents/
├── customer_support/
│ ├── v1.0.0/
│ │ ├── system_prompt.md
│ │ ├── tools.json
│ │ └── config.yaml
│ ├── v1.1.0/
│ │ ├── system_prompt.md # changes relative to v1.0.0
│ │ ├── tools.json
│ │ └── config.yaml
│ └── CHANGELOG.md
# config.yaml
# Configuration stored next to system_prompt.md and tools.json in each
# version directory.
model: gpt-4o
temperature: 0.3
max_tokens: 1024
max_retries: 3
timeout_seconds: 30
# NOTE(review): presumably the names of tools described in tools.json - confirm.
tools:
- crm_lookup
- ticket_create
- kb_search
Loading the agent from the registry
class VersionedAgent:
    """Agent whose model and tools are taken from a registered version."""

    def __init__(self, agent_name: str, version: str = "latest"):
        """Load the requested version and build the LLM client and tools from it.

        Args:
            agent_name: name of the agent to load from the registry.
            version: version string to load; ``"latest"`` is passed through
                to ``AgentRegistry.get`` unchanged.
        """
        # NOTE(review): `db` is not defined in this snippet; presumably a
        # module-level database handle - confirm.
        registry = AgentRegistry(db)
        self.version_info = registry.get(agent_name, version)
        # Only the "model" key of the config is used here.
        self.llm = LLMClient(model=self.version_info.config["model"])
        self.tools = ToolRegistry.load(self.version_info.tool_definitions)

    async def run(self, task: str, context: dict | None = None) -> AgentResult:
        """Execute a task and return the result of ``self._execute``.

        Args:
            task: the task text handed to ``self._execute``.
            context: optional extra data handed to ``self._execute``;
                ``None`` when the caller has none.
        """
        # Every call is logged with its version id for auditing.
        with agent_monitor.track_task(self.version_info.agent_name, version=self.version_info.version):
            return await self._execute(task, context)
Diff between versions
def diff_versions(agent_name: str, v1: str, v2: str) -> VersionDiff:
    """Compare two versions of an agent.

    Args:
        agent_name: name of the agent whose versions are compared.
        v1: the "old" version string.
        v2: the "new" version string.

    Returns:
        A ``VersionDiff`` with the prompt diff, the changed config keys, the
        tool changes and the comparison of evaluation results.
        ``config_changes`` maps each changed key to ``(old_value, new_value)``;
        a key missing on one side is reported with ``None`` on that side.
    """
    # NOTE(review): `registry` is not defined in this snippet; presumably a
    # module-level AgentRegistry - confirm.
    ver1 = registry.get(agent_name, v1)
    ver2 = registry.get(agent_name, v2)

    old_config, new_config = ver1.config, ver2.config
    # Keys of the new config first (original reporting order), then keys that
    # exist only in the old config so that removed settings are reported too.
    all_keys = [*new_config, *(k for k in old_config if k not in new_config)]
    config_changes = {
        k: (old_config.get(k), new_config.get(k))
        for k in all_keys
        # Presence is compared as well, so a key added or removed with an
        # explicit None value still counts as a change.
        if (k in old_config) != (k in new_config)
        or old_config.get(k) != new_config.get(k)
    }

    return VersionDiff(
        prompt_diff=unified_diff(ver1.system_prompt, ver2.system_prompt),
        config_changes=config_changes,
        tool_changes=compute_tool_diff(ver1.tool_definitions, ver2.tool_definitions),
        evaluation_comparison=compare_evaluations(ver1.evaluation_results, ver2.evaluation_results),
    )
Rollback procedure
# CLI for managing versions
agent-ctl versions list customer_support
# v1.3.2 prod 2024-12-01 ← current
# v1.3.1 staging 2024-11-28
# v1.3.0 archived 2024-11-15
# Roll back when metrics degrade
agent-ctl rollback customer_support --to v1.3.1
# → removes the prod tag from v1.3.2
# → puts the prod tag on v1.3.1
# → notifies on-call







