AI Automated Prompt Quality Evaluation System

We design and deploy artificial intelligence systems: from prototype to production-ready solutions. Our team combines expertise in machine learning, data engineering and MLOps to make AI work not in the lab, but in real business.

Development of a system for automatic prompt quality assessment

Automatic prompt evaluation makes it possible to measure how well a prompt performs its task without manual labeling. Key approaches include reference-based metrics (ROUGE, BLEU, BERTScore), LLM-as-judge, and task-specific checks.

Evaluation system architecture

from abc import ABC, abstractmethod
from dataclasses import dataclass
import json

@dataclass
class EvalResult:
    score: float  # normalized to 0-1
    passed: bool
    details: dict

class BaseEvaluator(ABC):
    @abstractmethod
    def evaluate(self, input: str, output: str, expected: str | None = None) -> EvalResult:
        pass

class LLMJudgeEvaluator(BaseEvaluator):
    """LLM-as-judge for subjective tasks"""

    def __init__(self, judge_model: str = "gpt-4o",
                 criteria: list[str] | None = None, llm_client=None):
        self.model = judge_model
        self.criteria = criteria or ["accuracy", "relevance", "conciseness"]
        # llm_client is any wrapper exposing .complete(prompt) -> str
        self.llm_client = llm_client

    def evaluate(self, input: str, output: str, expected: str | None = None) -> EvalResult:
        criteria_str = "\n".join(f"- {c}" for c in self.criteria)

        prompt = f"""Evaluate the following AI response on these criteria:
{criteria_str}

User input: {input}
AI response: {output}
{f'Expected answer: {expected}' if expected else ''}

For each criterion, provide a score 1-5 and brief reasoning.
Respond with JSON: {{"scores": {{"criterion": score}}, "overall": 0-1, "reasoning": "..."}}"""

        response = self.llm_client.complete(prompt)
        result = json.loads(response)

        return EvalResult(
            score=result['overall'],
            passed=result['overall'] >= 0.7,
            details=result
        )

class RougeEvaluator(BaseEvaluator):
    """Reference-based метрика ROUGE"""

    def evaluate(self, input: str, output: str, expected: str) -> EvalResult:
        from rouge_score import rouge_scorer
        scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'])
        scores = scorer.score(expected, output)

        rouge_l = scores['rougeL'].fmeasure
        return EvalResult(
            score=rouge_l,
            passed=rouge_l >= 0.4,
            details={"rouge1": scores['rouge1'].fmeasure,
                    "rouge2": scores['rouge2'].fmeasure,
                    "rougeL": rouge_l}
        )

class BERTScoreEvaluator(BaseEvaluator):
    """Semantic similarity to the reference via BERTScore"""

    def evaluate(self, input: str, output: str, expected: str) -> EvalResult:
        from bert_score import score
        P, R, F1 = score([output], [expected], lang='en', model_type='microsoft/deberta-xlarge-mnli')
        bert_f1 = float(F1[0])
        return EvalResult(score=bert_f1, passed=bert_f1 >= 0.85, details={"f1": bert_f1})
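
The intro also mentions task-specific checks. As a hedged sketch of one such evaluator (not part of the original system), a deterministic check for structured-output prompts can simply verify that the response parses as JSON and contains the required fields; the required_keys parameter below is illustrative:

class JSONFormatEvaluator(BaseEvaluator):
    """Task-specific check: output must be valid JSON with the required keys.
    The required_keys are illustrative, not part of the original system."""

    def __init__(self, required_keys: list[str] | None = None):
        self.required_keys = required_keys or []

    def evaluate(self, input: str, output: str, expected: str | None = None) -> EvalResult:
        try:
            data = json.loads(output)
        except (json.JSONDecodeError, TypeError):
            return EvalResult(score=0.0, passed=False, details={"error": "invalid JSON"})

        missing = [k for k in self.required_keys if k not in data]
        score = 1.0 if not missing else 1.0 - len(missing) / max(len(self.required_keys), 1)
        return EvalResult(score=score, passed=not missing, details={"missing_keys": missing})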

Composite evaluator

import numpy as np

class CompositeEvaluator:
    def __init__(self, evaluators: list[tuple[BaseEvaluator, float]]):
        """evaluators: [(evaluator, weight), ...]; weights should sum to 1"""
        self.evaluators = evaluators

    def evaluate_prompt(self, prompt_version: str,
                        test_cases: list[dict]) -> dict:
        results = []
        for case in test_cases:
            # render_prompt and llm_call are the project's own helpers:
            # fill the stored prompt template and call the target model
            rendered = render_prompt(prompt_version, case['input_variables'])
            output = llm_call(rendered)

            case_scores = {}
            for evaluator, weight in self.evaluators:
                result = evaluator.evaluate(
                    input=case.get('input', ''),
                    output=output,
                    expected=case.get('expected')
                )
                case_scores[type(evaluator).__name__] = {
                    'score': result.score,
                    'weight': weight,
                    'passed': result.passed
                }

            weighted_score = sum(
                v['score'] * v['weight'] for v in case_scores.values()
            )
            results.append({'case': case, 'output': output,
                           'scores': case_scores, 'weighted': weighted_score})

        return {
            'mean_score': np.mean([r['weighted'] for r in results]),
            'pass_rate': np.mean([all(s['passed'] for s in r['scores'].values())
                                  for r in results]),
            'results': results
        }

# Usage
evaluator = CompositeEvaluator([
    (LLMJudgeEvaluator(criteria=["accuracy", "helpfulness"]), 0.5),
    (RougeEvaluator(), 0.3),
    (BERTScoreEvaluator(), 0.2),
])

score = evaluator.evaluate_prompt("summarization-v3", test_cases)
print(f"Overall score: {score['mean_score']:.3f}, Pass rate: {score['pass_rate']:.2%}")

Regression testing

Each prompt change is automatically compared to the baseline:

def check_for_regression(new_score: float, baseline_score: float,
                          threshold: float = 0.05) -> bool:
    """Возвращает True если регрессия обнаружена"""
    relative_change = (new_score - baseline_score) / baseline_score
    if relative_change < -threshold:
        print(f"REGRESSION: score dropped {abs(relative_change):.1%}")
        return True
    return False

The system allows the team to confidently iterate on prompts, knowing that automated tests will catch degradation before it reaches production.
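
In practice, the check can gate a CI pipeline. A minimal sketch, assuming baseline scores and test cases are stored in JSON files in the repository (the file names and paths here are illustrative):

import json
import sys

def run_ci_gate(prompt_version: str = "summarization-v3") -> None:
    # Load the stored baseline score and test suite (paths are illustrative)
    with open("eval_baselines.json") as f:
        baseline = json.load(f)[prompt_version]
    with open("test_cases.json") as f:
        test_cases = json.load(f)[prompt_version]

    result = evaluator.evaluate_prompt(prompt_version, test_cases)
    new_score = result['mean_score']

    if check_for_regression(new_score, baseline):
        sys.exit(1)  # fail the pipeline before the prompt reaches production

    print(f"OK: {new_score:.3f} vs baseline {baseline:.3f}")

if __name__ == "__main__":
    run_ci_gate()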