Building a system for automatic prompt quality assessment
Automatic prompt evaluation measures how well a prompt performs a task without manual labeling. The key approaches are reference-based metrics (ROUGE, BLEU, BERTScore), LLM-as-judge, and task-specific checks; each is implemented below.
Evaluation system architecture
import json
from abc import ABC, abstractmethod
from dataclasses import dataclass

import numpy as np

@dataclass
class EvalResult:
    score: float   # normalized to 0-1, higher is better
    passed: bool
    details: dict

class BaseEvaluator(ABC):
    @abstractmethod
    def evaluate(self, input: str, output: str, expected: str | None = None) -> EvalResult:
        pass
class LLMJudgeEvaluator(BaseEvaluator):
    """LLM-as-judge for subjective tasks."""

    def __init__(self, judge_model: str = "gpt-4o",
                 criteria: list[str] | None = None, llm_client=None):
        self.model = judge_model
        self.criteria = criteria or ["accuracy", "relevance", "conciseness"]
        # llm_client must expose complete(prompt) -> str
        self.llm_client = llm_client

    def evaluate(self, input: str, output: str, expected: str | None = None) -> EvalResult:
        criteria_str = "\n".join(f"- {c}" for c in self.criteria)
        prompt = f"""Evaluate the following AI response on these criteria:
{criteria_str}

User input: {input}
AI response: {output}
{f'Expected answer: {expected}' if expected else ''}

For each criterion, provide a score 1-5 and brief reasoning.
Respond with JSON: {{"scores": {{"criterion": score}}, "overall": 0-1, "reasoning": "..."}}"""
        response = self.llm_client.complete(prompt)
        result = json.loads(response)
        return EvalResult(
            score=result['overall'],
            passed=result['overall'] >= 0.7,
            details=result,
        )
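The class expects an llm_client with a complete(prompt) method, which the snippet above never constructs. A minimal sketch of such an adapter over the OpenAI SDK (the OpenAIJudgeClient name is hypothetical; any client with the same interface works):

from openai import OpenAI

class OpenAIJudgeClient:
    """Hypothetical adapter exposing the complete() interface the evaluator expects."""
    def __init__(self, model: str = "gpt-4o"):
        self._client = OpenAI()  # reads OPENAI_API_KEY from the environment
        self._model = model

    def complete(self, prompt: str) -> str:
        resp = self._client.chat.completions.create(
            model=self._model,
            messages=[{"role": "user", "content": prompt}],
            response_format={"type": "json_object"},  # ask for parseable JSON
        )
        return resp.choices[0].message.content

judge = LLMJudgeEvaluator(llm_client=OpenAIJudgeClient(), criteria=["accuracy"])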
class RougeEvaluator(BaseEvaluator):
    """Reference-based ROUGE metric."""

    def evaluate(self, input: str, output: str, expected: str) -> EvalResult:
        from rouge_score import rouge_scorer  # pip install rouge-score
        scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'])
        scores = scorer.score(expected, output)  # (target, prediction)
        rouge_l = scores['rougeL'].fmeasure
        return EvalResult(
            score=rouge_l,
            passed=rouge_l >= 0.4,
            details={"rouge1": scores['rouge1'].fmeasure,
                     "rouge2": scores['rouge2'].fmeasure,
                     "rougeL": rouge_l},
        )
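For intuition about the 0.4 threshold: ROUGE-L's F-measure rewards longest-common-subsequence overlap between reference and output. A quick standalone check with the same library:

from rouge_score import rouge_scorer

scorer = rouge_scorer.RougeScorer(['rougeL'])
s = scorer.score("the cat sat on the mat", "a cat was on the mat")
print(s['rougeL'].fmeasure)  # ~0.67: heavy word overlap despite different phrasing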
class BERTScoreEvaluator(BaseEvaluator):
    def evaluate(self, input: str, output: str, expected: str) -> EvalResult:
        from bert_score import score  # pip install bert-score
        P, R, F1 = score([output], [expected], lang='en',
                         model_type='microsoft/deberta-xlarge-mnli')
        bert_f1 = float(F1[0])
        return EvalResult(score=bert_f1, passed=bert_f1 >= 0.85, details={"f1": bert_f1})
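The intro also mentions task-specific metrics; these are the cheapest checks to run and often the most decisive. A minimal sketch for a structured-output task (the JSONValidityEvaluator name and scoring scheme are illustrative, not from the original system):

class JSONValidityEvaluator(BaseEvaluator):
    """Task-specific check: does the output parse as JSON with the required keys?"""
    def __init__(self, required_keys: list[str] | None = None):
        self.required_keys = required_keys or []

    def evaluate(self, input: str, output: str, expected: str | None = None) -> EvalResult:
        try:
            parsed = json.loads(output)
        except json.JSONDecodeError:
            return EvalResult(score=0.0, passed=False, details={"error": "invalid JSON"})
        missing = [k for k in self.required_keys if k not in parsed]
        score = 1.0 if not missing else 0.5  # parseable but incomplete
        return EvalResult(score=score, passed=not missing, details={"missing_keys": missing})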
Composite evaluator
class CompositeEvaluator:
    def __init__(self, evaluators: list[tuple[BaseEvaluator, float]]):
        """evaluators: [(evaluator, weight), ...]; weights should sum to 1."""
        self.evaluators = evaluators

    def evaluate_prompt(self, prompt_version: str,
                        test_cases: list[dict]) -> dict:
        results = []
        for case in test_cases:
            # render_prompt and llm_call are assumed helpers from the surrounding
            # codebase: template rendering and a call to the model under test
            rendered = render_prompt(prompt_version, case['input_variables'])
            output = llm_call(rendered)
            case_scores = {}
            for evaluator, weight in self.evaluators:
                result = evaluator.evaluate(
                    input=case.get('input', ''),
                    output=output,
                    expected=case.get('expected')
                )
                case_scores[type(evaluator).__name__] = {
                    'score': result.score,
                    'weight': weight,
                    'passed': result.passed
                }
            weighted_score = sum(
                v['score'] * v['weight'] for v in case_scores.values()
            )
            results.append({'case': case, 'output': output,
                            'scores': case_scores, 'weighted': weighted_score})
        return {
            'mean_score': np.mean([r['weighted'] for r in results]),
            'pass_rate': np.mean([all(s['passed'] for s in r['scores'].values())
                                  for r in results]),
            'results': results
        }
# Usage
evaluator = CompositeEvaluator([
    (LLMJudgeEvaluator(criteria=["accuracy", "helpfulness"]), 0.5),
    (RougeEvaluator(), 0.3),
    (BERTScoreEvaluator(), 0.2),
])
score = evaluator.evaluate_prompt("summarization-v3", test_cases)
print(f"Overall score: {score['mean_score']:.3f}, Pass rate: {score['pass_rate']:.2%}")
Regression testing
Each prompt change is automatically compared to the baseline:
def check_for_regression(new_score: float, baseline_score: float,
                         threshold: float = 0.05) -> bool:
    """Returns True if a regression is detected (assumes baseline_score > 0)."""
    relative_change = (new_score - baseline_score) / baseline_score
    if relative_change < -threshold:
        print(f"REGRESSION: score dropped {abs(relative_change):.1%}")
        return True
    return False
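In CI this typically becomes a gate: run the suite against the candidate prompt, compare to the stored baseline, and fail the build on regression. A sketch under assumptions (the "summarization-v4" version name, baseline.json layout, and exit-code convention are all hypothetical):

import sys

new = evaluator.evaluate_prompt("summarization-v4", test_cases)['mean_score']
with open("baseline.json") as f:
    baseline = json.load(f)["summarization"]["mean_score"]

if check_for_regression(new, baseline):
    sys.exit(1)  # fail the pipeline before the prompt ships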
The system allows the team to confidently iterate on prompts, knowing that automated tests will catch degradation before it reaches production.