Development of an AI system for monitoring the performance of AI agents
AI agent performance monitoring is the systematic monitoring of the quality, speed, and reliability of AI agents in production. It differs from monitoring regular services: quality metrics (accuracy, relevance) matter more than latency metrics.
AI agent performance metrics
Three groups of metrics:
Technical metrics: latency (p50/p95/p99), throughput (tasks/hour), error rate, availability, cost per task (tokens × price).
Qualitative metrics: task completion rate (the proportion of tasks successfully completed), accuracy (the correctness of the result), hallucination rate, human override rate (how often a human corrects the agent).
Business metrics: ROI (hours saved × hourly rate), customer satisfaction (if the agent interacts with clients), SLA compliance.
Metrics collection system
from dataclasses import dataclass, field
from datetime import datetime
import uuid
@dataclass
class AgentTaskMetrics:
    """Per-task telemetry record for one agent task execution.

    Technical fields are filled by the tracking context manager; quality
    fields are filled post-hoc (auto-eval or human review).
    """

    # Identity / lifecycle
    task_id: str = field(default_factory=lambda: str(uuid.uuid4()))
    agent_id: str = ""
    task_type: str = ""
    # NOTE(review): datetime.utcnow() is naive and deprecated since 3.12;
    # switching to datetime.now(timezone.utc) must be done together with
    # every producer/consumer of these timestamps — confirm before changing.
    started_at: datetime = field(default_factory=datetime.utcnow)
    completed_at: datetime | None = None
    # Technical
    latency_ms: float | None = None
    input_tokens: int = 0
    output_tokens: int = 0
    cost_usd: float = 0.0
    retries: int = 0
    # Quality (filled post-hoc or by auto-eval)
    task_completed: bool | None = None
    quality_score: float | None = None  # 0-1, auto-eval or human
    human_override: bool = False
    error_type: str | None = None
class AgentMonitor:
    """Facade for tracking agent tasks through a pluggable metrics backend."""

    def __init__(self, metrics_backend: MetricsBackend):
        # The backend receives every finished AgentTaskMetrics record.
        self.backend = metrics_backend

    def track_task(self, agent_id: str, task_type: str):
        """Context manager for tracking a single task execution."""
        return AgentTaskTracker(agent_id, task_type, self.backend)
class AgentTaskTracker:
    """Context manager that times one agent task and records its metrics.

    On exit it stamps the completion time, computes latency, captures the
    exception type (if any), and forwards the record to the backend.
    Exceptions are NOT suppressed — __exit__ returns a falsy value.
    """

    def __init__(self, agent_id: str, task_type: str, backend: "MetricsBackend"):
        # BUG FIX: __init__ was missing entirely, although
        # AgentMonitor.track_task constructs the tracker with these three
        # arguments and __enter__/__exit__ read them from self.
        self.agent_id = agent_id
        self.task_type = task_type
        self.backend = backend

    def __enter__(self) -> "AgentTaskMetrics":
        # Fresh record per task; started_at is stamped by the field default.
        self.metrics = AgentTaskMetrics(agent_id=self.agent_id, task_type=self.task_type)
        return self.metrics

    def __exit__(self, exc_type, exc_val, exc_tb):
        # utcnow() kept (naive) to match AgentTaskMetrics.started_at's
        # default — mixing naive and aware datetimes would break the
        # subtraction below.
        self.metrics.completed_at = datetime.utcnow()
        self.metrics.latency_ms = (
            self.metrics.completed_at - self.metrics.started_at
        ).total_seconds() * 1000
        if exc_type:
            self.metrics.error_type = exc_type.__name__
        # Record failed tasks too, so error rate stays observable.
        self.backend.record(self.metrics)
        # Implicit None return: the exception (if any) propagates.
Automatic quality assessment
For most agent tasks, human verification of every single result is impractical, so quality is estimated automatically (auto-eval):
def parse_eval_score(raw: str) -> float:
    """Parse an LLM judge reply into a score clamped to [0.0, 1.0].

    Tolerates surrounding text and trailing punctuation (e.g. "Score: 0.8"
    or "0.8.") and a decimal comma. Raises ValueError if no number is
    present at all, mirroring the original float() failure mode but with
    a clearer message.
    """
    import re  # local import: keeps this helper self-contained

    match = re.search(r"-?\d+(?:[.,]\d+)?", raw)
    if match is None:
        raise ValueError(f"LLM judge returned no numeric score: {raw!r}")
    score = float(match.group(0).replace(",", "."))
    return min(max(score, 0.0), 1.0)


def auto_evaluate_task(task: "AgentTask", result: "AgentResult") -> float:
    """Score a task result with an LLM judge; returns a value in [0, 1]."""
    eval_prompt = f"""Оцени качество выполнения задачи агентом.
Задача: {task.description}
Ожидаемый результат: {task.expected_outcome}
Фактический результат: {result.output}
Оцени от 0 до 1, где:
1.0 — задача выполнена полностью и корректно
0.5 — частичное выполнение или незначительные ошибки
0.0 — задача не выполнена или критические ошибки
Ответь только числом."""
    # BUG FIX: a bare float(...) crashed whenever the judge replied with
    # anything but a clean number; parse_eval_score extracts and clamps it.
    return parse_eval_score(eval_llm.generate(eval_prompt, max_tokens=10).strip())
Agent Monitoring Dashboard
Key panels: SLA compliance (% of tasks completed within the SLA), quality by task type, cost dynamics (a cost increase means either token growth or more errors triggering retries), human override rate (an upward trend means the agent is degrading), error taxonomy.
Degradation alerts
class DegradationDetector:
    """Compares 7-day metric windows against 30-day baselines and raises alerts."""

    # Thresholds — tune per deployment.
    ERROR_RATE_GROWTH = 1.5    # 7d error rate must exceed 1.5x the 30d baseline
    QUALITY_DROP = 0.1         # absolute drop in average quality score
    MAX_OVERRIDE_RATE = 0.15   # > 15% of tasks redone by a human

    def check(self, metrics: "AgentMetricsSummary") -> "list[Alert]":
        """Return zero or more alerts describing detected degradation."""
        alerts: list = []
        baseline = metrics.error_rate_30d
        # BUG FIX: guard against a zero 30-day baseline, which previously
        # raised ZeroDivisionError while formatting the growth factor
        # (0 * 1.5 == 0, so any nonzero 7d rate entered the branch).
        if baseline > 0 and metrics.error_rate_7d > baseline * self.ERROR_RATE_GROWTH:
            alerts.append(Alert(
                severity="warning",
                message=f"Error rate grew by {metrics.error_rate_7d / baseline:.1f}x over 7 days"
            ))
        elif baseline == 0 and metrics.error_rate_7d > 0:
            # Errors appeared where the baseline had none at all.
            alerts.append(Alert(
                severity="warning",
                message=f"Error rate rose from zero to {metrics.error_rate_7d:.1%} over 7 days"
            ))
        if metrics.avg_quality_score_7d < metrics.avg_quality_score_30d - self.QUALITY_DROP:
            alerts.append(Alert(
                severity="warning",
                message=f"Quality score dropped from {metrics.avg_quality_score_30d:.2f} to {metrics.avg_quality_score_7d:.2f}"
            ))
        if metrics.human_override_rate_7d > self.MAX_OVERRIDE_RATE:
            alerts.append(Alert(
                severity="critical",
                message=f"Human override rate too high: {metrics.human_override_rate_7d:.1%}"
            ))
        return alerts







