Developing a canary deployment system for AI agents
Canary deployment for AI agents gradually shifts traffic to a new version and automatically rolls back when metrics degrade. This reduces the risk of rolling out defective prompts or models to all users at once.
Canary pipeline
v1.3.1 (100% of traffic)
↓ deploy v1.3.2
v1.3.1 (95%) + v1.3.2 (5%) — observe for 30 min
↓ all OK
v1.3.1 (75%) + v1.3.2 (25%) — observe for 1 hour
↓ all OK
v1.3.1 (50%) + v1.3.2 (50%) — observe for 2 hours
↓ all OK
v1.3.2 (100%) — full rollout
↓ degradation at any stage
automatic rollback to v1.3.1
Implementation of a canary controller
@dataclass
class CanaryDeployment:
    """Mutable state of one canary rollout of an agent version.

    The controller updates ``current_stage_index`` and ``status`` in place as
    the rollout moves through ``stages``.
    """

    deployment_id: str
    agent_name: str
    stable_version: str
    canary_version: str
    # Ordered rollout plan, e.g. [(5%, 30min), (25%, 60min), (50%, 120min), (100%, 0)].
    # The element type is a quoted forward reference because CanaryStage is
    # declared after this class; an unquoted name would raise NameError when
    # the class body is evaluated.
    stages: list["CanaryStage"]
    # Index into ``stages`` of the stage currently being observed.
    current_stage_index: int = 0
    # "in_progress" until the controller changes it (e.g. to "rolled_back").
    status: str = "in_progress"
@dataclass
class CanaryStage:
canary_traffic_pct: float
observation_minutes: int
started_at: datetime | None = None
class CanaryController:
    """Drives a canary deployment through its stages.

    The controller is stateless apart from its collaborators: ``router``
    applies traffic splits and ``analyzer`` compares the stable and canary
    versions. All rollout state lives on the ``CanaryDeployment`` passed in.
    """

    def __init__(self, router: "ExperimentRouter", analyzer: "MetricsAnalyzer"):
        self.router = router
        self.analyzer = analyzer

    async def advance_canary(self, deployment: "CanaryDeployment"):
        """Check the current stage and advance the canary when it is due.

        Intended to be called on a schedule. Each call does at most one thing:
        start the observation clock, keep waiting, roll back, move to the next
        stage, or complete the rollout.

        Args:
            deployment: Rollout state; mutated in place.
        """
        # A rolled-back (or otherwise finished) deployment must not be
        # re-evaluated by later scheduler ticks, otherwise it could be rolled
        # back repeatedly or even advanced again.
        if deployment.status != "in_progress":
            return

        current_stage = deployment.stages[deployment.current_stage_index]
        # NOTE(review): timestamps are naive UTC, matching how started_at is
        # stored and passed to the analyzer; switch all of them together if
        # moving to timezone-aware datetimes.
        now = datetime.utcnow()

        # First tick for this stage: only start the observation clock.
        if not current_stage.started_at:
            current_stage.started_at = now
            return

        elapsed = (now - current_stage.started_at).total_seconds() / 60
        if elapsed < current_stage.observation_minutes:
            return  # still observing

        # Compare metrics collected over the observation window.
        health = await self.analyzer.compare_versions(
            deployment.agent_name,
            deployment.stable_version,
            deployment.canary_version,
            since=current_stage.started_at,
        )
        if health.canary_is_unhealthy:
            await self.rollback(deployment, reason=health.degradation_reason)
            return

        next_index = deployment.current_stage_index + 1
        if next_index >= len(deployment.stages):
            await self.complete_rollout(deployment)
            return

        next_stage = deployment.stages[next_index]
        await self.router.update_traffic_split(
            deployment.agent_name,
            stable_pct=100 - next_stage.canary_traffic_pct,
            canary_pct=next_stage.canary_traffic_pct,
            canary_version=deployment.canary_version,
        )
        # Commit the stage change only after the router accepted the split, and
        # start the observation window at the moment traffic actually shifted
        # so the next analysis covers the whole stage.
        deployment.current_stage_index = next_index
        next_stage.started_at = datetime.utcnow()
        logger.info(f"Canary advanced to {next_stage.canary_traffic_pct}% for {deployment.agent_name}")

    async def complete_rollout(self, deployment: "CanaryDeployment"):
        """Finish the rollout: send all traffic to the canary and close the deployment.

        Args:
            deployment: Rollout state; ``status`` is set to ``"completed"``.
        """
        await self.router.update_traffic_split(
            deployment.agent_name,
            stable_pct=0,
            canary_pct=100,
            canary_version=deployment.canary_version,
        )
        # NOTE(review): promoting the canary version to "stable" in the router
        # or registry is not done here; add it if the router exposes such a call.
        deployment.status = "completed"
        logger.info(f"Canary rollout completed: {deployment.agent_name} v{deployment.canary_version}")

    async def rollback(self, deployment: "CanaryDeployment", reason: str):
        """Return all traffic to the stable version and mark the deployment rolled back.

        Args:
            deployment: Rollout state; ``status`` is set to ``"rolled_back"``.
            reason: Human-readable cause, included in the team notification.
        """
        await self.router.update_traffic_split(
            deployment.agent_name,
            stable_pct=100,
            canary_pct=0,
            canary_version=deployment.canary_version,
        )
        deployment.status = "rolled_back"
        # Log before notifying so the record exists even if the notification fails.
        logger.error(f"Canary rolled back: {deployment.agent_name} v{deployment.canary_version} → v{deployment.stable_version}")
        await notify_team(f"Canary rollback for {deployment.agent_name}: {reason}")
Metrics for Canary Health Check
class CanaryHealthChecker:
    """Compares canary metrics against the stable version using fixed thresholds.

    Threshold keys understood per metric:
        max_absolute: canary value must not exceed this.
        min_absolute: canary value must not fall below this.
        max_relative_increase: canary / stable ratio must not exceed this
            (2.0 means "at most twice the stable value").
        max_relative_decrease: fractional drop (stable - canary) / stable must
            not exceed this (0.1 means "at most 10% lower than stable").
    """

    THRESHOLDS = {
        "error_rate": {"max_absolute": 0.05, "max_relative_increase": 2.0},
        "p99_latency_ms": {"max_relative_increase": 1.5},
        "task_success_rate": {"min_absolute": 0.90, "max_relative_decrease": 0.1},
        "quality_score": {"max_relative_decrease": 0.05},
    }

    def is_healthy(self, stable_metrics: dict, canary_metrics: dict) -> HealthCheckResult:
        """Evaluate every configured threshold and collect the violations.

        Args:
            stable_metrics: Metric name -> value for the stable version.
            canary_metrics: Metric name -> value for the canary version.

        Returns:
            HealthCheckResult with ``is_healthy`` true when no threshold is
            violated and ``issues`` listing one message per violation.
        """
        issues = []
        for metric, thresholds in self.THRESHOLDS.items():
            canary_val = canary_metrics.get(metric)
            # A metric the canary did not report cannot be judged; treating it
            # as 0 would falsely trip the lower-bound checks.
            if canary_val is None:
                continue
            stable_val = stable_metrics.get(metric) or 0

            if "max_absolute" in thresholds and canary_val > thresholds["max_absolute"]:
                issues.append(f"{metric} too high: {canary_val:.3f} > {thresholds['max_absolute']}")
            if "min_absolute" in thresholds and canary_val < thresholds["min_absolute"]:
                issues.append(f"{metric} too low: {canary_val:.3f} < {thresholds['min_absolute']}")

            # Relative checks need a positive baseline to divide by.
            if stable_val <= 0:
                continue
            if "max_relative_increase" in thresholds:
                relative = canary_val / stable_val
                if relative > thresholds["max_relative_increase"]:
                    issues.append(f"{metric} increased {relative:.1f}x vs stable")
            if "max_relative_decrease" in thresholds:
                drop = (stable_val - canary_val) / stable_val
                if drop > thresholds["max_relative_decrease"]:
                    issues.append(f"{metric} decreased {drop:.1%} vs stable")
        return HealthCheckResult(is_healthy=not issues, issues=issues)
Kubernetes integration
# Flagger (progressive delivery controller) for K8s.
# Canary resource that shifts traffic to a new vllm-agent Deployment in steps
# and rolls back when the metric checks below keep failing.
apiVersion: flagger.app/v1beta1
kind: Canary
metadata:
  name: vllm-agent
  namespace: ai-serving
spec:
  # Workload whose new revisions are rolled out progressively.
  targetRef:
    apiVersion: apps/v1
    kind: Deployment
    name: vllm-agent
  progressDeadlineSeconds: 3600
  service:
    port: 8000
  analysis:
    interval: 5m
    threshold: 5  # max failures before rollback
    maxWeight: 100
    stepWeight: 10  # +10% every 5 minutes
    metrics:
      # Flagger built-in check: share of successful requests, in percent.
      - name: request-success-rate
        thresholdRange:
          min: 99
        interval: 1m
      # Flagger built-in check: request duration, in milliseconds.
      - name: request-duration
        thresholdRange:
          max: 5000
        interval: 1m







