Implementation of a prompt management platform
The Prompt Management Platform is a centralized system for storing, versioning, testing, and deploying LLM prompts. When working with dozens of prompts in production, the need for such a platform becomes obvious: prompts change, their impact needs to be measured, and rollback to a previous version must be instantaneous.
Problems without prompt management
- Prompts are hardcoded in the code - changes require deployment
- There is no history: it is unclear what changed and when
- No A/B testing: it's unclear which version of the prompt is better
- Different environments (dev/staging/prod) end up running different prompt versions inconsistently, with no guarantee of which version is live where
- There are no quality metrics for each version of the prompt
Prompt Registry Architecture
import hashlib
import re
import uuid
from dataclasses import dataclass
from datetime import datetime
from typing import Optional
@dataclass
class PromptVersion:
    """Immutable record describing one version of a named prompt.

    Attributes are populated by ``PromptRegistry.register``; ``hash`` is
    always derived from ``content`` in ``__post_init__``.
    """

    id: str
    name: str
    version: int
    content: str
    variables: list[str]  # Variable names referenced in the prompt as {{variable}}
    model: str
    temperature: float
    max_tokens: int
    created_by: str
    created_at: datetime
    metadata: dict
    # Short content fingerprint. Recomputed unconditionally in __post_init__,
    # so any caller-supplied value is overwritten. (Was annotated `str` with a
    # None default — fixed to Optional[str].)
    hash: Optional[str] = None

    def __post_init__(self):
        # First 8 hex chars of SHA-256 of the content: enough to spot
        # content changes between versions at a glance.
        self.hash = hashlib.sha256(self.content.encode()).hexdigest()[:8]
class PromptRegistry:
def __init__(self, db_connection, cache):
self.db = db_connection
self.cache = cache
def register(self, name: str, content: str, model: str = "gpt-4o",
temperature: float = 0.0, **kwargs) -> PromptVersion:
"""Регистрация новой версии промпта"""
last_version = self.db.get_latest_version(name)
version_num = (last_version.version + 1) if last_version else 1
variables = self._extract_variables(content) # {{var}} → ['var']
prompt = PromptVersion(
id=str(uuid.uuid4()),
name=name,
version=version_num,
content=content,
variables=variables,
model=model,
temperature=temperature,
max_tokens=kwargs.get('max_tokens', 1000),
created_by=kwargs.get('created_by', 'system'),
created_at=datetime.utcnow(),
metadata=kwargs.get('metadata', {})
)
self.db.save(prompt)
return prompt
def get(self, name: str, version: str = "latest",
environment: str = "production") -> PromptVersion:
"""Получение промпта по имени и версии"""
cache_key = f"prompt:{name}:{version}:{environment}"
cached = self.cache.get(cache_key)
if cached:
return cached
if version == "latest":
prompt = self.db.get_latest_deployed(name, environment)
else:
prompt = self.db.get_by_version(name, int(version))
self.cache.set(cache_key, prompt, ttl=300)
return prompt
def render(self, name: str, variables: dict, **kwargs) -> str:
"""Получение и рендеринг промпта"""
prompt = self.get(name, **kwargs)
rendered = prompt.content
for var, value in variables.items():
rendered = rendered.replace(f"{{{{{var}}}}}", str(value))
# Проверка: все переменные заполнены?
missing = [v for v in prompt.variables if f"{{{{{v}}}}}" in rendered]
if missing:
raise ValueError(f"Missing variables: {missing}")
return rendered
Deploying prompts by environment
class PromptDeploymentManager:
    """Promotes specific prompt versions into target environments."""

    def deploy(self, prompt_name: str, version: int,
               environment: str, require_review: bool = True):
        """Deploy one prompt version to *environment*.

        Raises:
            ValueError: when review is enforced and the prompt has not
                been reviewed.
        """
        prompt = self.registry.get_by_version(prompt_name, version)

        # Guard clause: unreviewed prompts must not ship while review
        # enforcement is on.
        if require_review and not prompt.is_reviewed:
            raise ValueError("Prompt requires review before deployment to production")

        # Persist an audit record of who deployed what, where, and when.
        self.db.create_deployment(
            prompt_id=prompt.id,
            deployed_by=current_user(),
            environment=environment,
            deployed_at=datetime.utcnow(),
        )

        # Invalidate the cached "latest" entry so readers pick up this
        # version immediately.
        stale_key = f"prompt:{prompt_name}:latest:{environment}"
        self.cache.delete(stale_key)

        # Webhook notification to the team channel.
        self.notify_team(
            f"Prompt '{prompt_name}' v{version} deployed to {environment}"
        )
Prompt Quality Metrics
For each prompt and version, you need to measure: latency (response time), token usage (cost), output quality score (automatic assessment via LLM-judge or human evaluation), task-specific metrics (precision@k for RAG, helpfulness score for chatbots).
Integration with LangSmith, Weights & Biases, or a custom tracker allows you to compare prompt versions using these metrics and make data-driven decisions about updates.







