Developing a pay-per-inference billing system for AI models
Pay-per-inference billing is the primary monetization model for AI APIs. The system must measure usage accurately, prevent fraud, keep customer balances up to date in real time, and generate transparent invoices.
Units for measuring usage
- Requests: the simplest metric, but does not reflect complexity
- Tokens (input + output): the standard for LLMs
- GPU seconds: for resource-intensive models (image generation, video)
- Compute units: a normalized metric aggregating CPU/GPU time
Billing architecture
import asyncio
import uuid
from datetime import datetime, timezone
from decimal import Decimal
class InferenceBillingSystem:
    """Prepaid billing for inference calls: hold funds up front, settle afterwards.

    Balances are kept in Redis under ``balance:{customer_id}`` and holds under
    ``reservation:{reservation_id}``. Every amount written to or read from
    Redis by this class is in the same unit as the prices used for cost
    calculation (the insufficient-balance message formats it with ``$``).
    """

    # Lifetime of a reservation key in Redis, in seconds.
    RESERVATION_TTL_SECONDS = 300

    def __init__(self, redis_client, db, stripe_client):
        """Store the Redis client, database handle and Stripe client."""
        self.redis = redis_client
        self.db = db
        self.stripe = stripe_client

    @staticmethod
    def _to_decimal(raw) -> Decimal:
        """Convert a Redis reply (``str``, ``bytes`` or a number) to ``Decimal``."""
        # Clients that do not decode responses hand back bytes, which
        # ``Decimal`` refuses to parse directly.
        if isinstance(raw, (bytes, bytearray)):
            raw = raw.decode()
        return Decimal(str(raw))

    async def pre_authorize(self, customer_id: str,
                            model_id: str, request: dict) -> str:
        """Check the balance and place a hold on funds before inference.

        Args:
            customer_id: Customer whose balance is checked and debited.
            model_id: Model whose pricing is used for the estimate.
            request: Inference request passed to ``estimate_cost``.

        Returns:
            The id of the newly created reservation.

        Raises:
            InsufficientBalanceError: If the balance is below the estimate.
        """
        model_pricing = await self.get_pricing(model_id)
        estimated_cost = self.estimate_cost(model_pricing, request)

        # Balance check against Redis (fast path).
        balance_key = f"balance:{customer_id}"
        balance = self._to_decimal(await self.redis.get(balance_key) or '0')
        if balance < estimated_cost:
            raise InsufficientBalanceError(
                f"Balance ${balance} < estimated ${estimated_cost}"
            )

        # Temporary hold on the funds. The hold is taken in the same unit the
        # balance is compared and refunded in; debiting cents here while
        # refunding dollars in ``settle`` would corrupt the balance.
        # NOTE(review): the check above and the debit below are not atomic, so
        # concurrent requests can overdraw the balance.
        # NOTE(review): INCRBYFLOAT uses floating point; confirm the precision
        # is acceptable for the balance store.
        reservation_id = str(uuid.uuid4())
        pipe = self.redis.pipeline()
        pipe.incrbyfloat(balance_key, -float(estimated_cost))
        pipe.setex(
            f"reservation:{reservation_id}",
            self.RESERVATION_TTL_SECONDS,
            str(estimated_cost),
        )
        await pipe.execute()
        return reservation_id

    async def settle(self, reservation_id: str, model_id: str,
                     actual_usage: dict):
        """Settle a reservation against the actual usage.

        The difference between the held estimate and the actual cost is
        applied to the balance in both directions: an over-estimate is
        refunded and an under-estimate is charged. A usage record is then
        written to the database and the reservation key is removed.

        Args:
            reservation_id: Id returned by ``pre_authorize``.
            model_id: Model whose pricing is used for the final cost.
            actual_usage: Measured usage passed to ``compute_actual_cost``.

        Raises:
            LookupError: If the reservation key is missing (unknown or expired).
        """
        reservation_key = f"reservation:{reservation_id}"
        held_raw = await self.redis.get(reservation_key)
        if held_raw is None:
            # NOTE(review): nothing in this class returns the held funds when a
            # reservation key expires; that case needs separate reconciliation.
            raise LookupError(
                f"Reservation {reservation_id} not found or expired"
            )
        estimated_cost = self._to_decimal(held_raw)

        model_pricing = await self.get_pricing(model_id)
        actual_cost = self.compute_actual_cost(model_pricing, actual_usage)

        # Refund the overpayment (delta > 0) or charge the shortfall (delta < 0).
        delta = estimated_cost - actual_cost
        customer_id = await self.get_customer_by_reservation(reservation_id)
        if delta != 0:
            await self.redis.incrbyfloat(f"balance:{customer_id}", float(delta))

        # Audit record in the database.
        await self.db.insert_usage_record({
            'customer_id': customer_id,
            'model_id': model_id,
            'reservation_id': reservation_id,
            'actual_cost': float(actual_cost),
            'usage_details': actual_usage,
            # Naive UTC timestamp, as before, without the deprecated utcnow().
            'timestamp': datetime.now(timezone.utc).replace(tzinfo=None),
        })
        await self.redis.delete(reservation_key)

    def compute_actual_cost(self, pricing: dict, usage: dict) -> Decimal:
        """Return the cost of ``usage`` under ``pricing``.

        Token prices are applied per 1,000,000 tokens and the GPU price per
        second. Usage keys that are absent contribute nothing.
        """
        # (usage key, pricing key, units covered by one price)
        components = (
            ('input_tokens', 'input_token_price', 1_000_000),
            ('output_tokens', 'output_token_price', 1_000_000),
            ('gpu_seconds', 'gpu_second_price', 1),
        )
        cost = Decimal('0')
        for usage_key, price_key, units_per_price in components:
            if usage_key in usage:
                quantity = Decimal(str(usage[usage_key]))
                unit_price = Decimal(str(pricing[price_key]))
                cost += quantity / units_per_price * unit_price
        return cost
Volume Discounts and Pricing Tiers
# Volume-discount table per pricing key. Tiers are listed in ascending order.
# ``up_to_tokens`` is the cumulative token count at which a tier ends (``None``
# marks the open-ended last tier); prices are per 1,000,000 tokens.
PRICING_TIERS = {
    "gpt4-equivalent": {
        "tiers": [
            {"up_to_tokens": 1_000_000, "input_price": 10.0, "output_price": 30.0},
            {"up_to_tokens": 10_000_000, "input_price": 8.0, "output_price": 24.0},
            {"up_to_tokens": 100_000_000, "input_price": 6.0, "output_price": 18.0},
            {"up_to_tokens": None, "input_price": 5.0, "output_price": 15.0},
        ]
    }
}


def compute_tiered_price(total_tokens: int, pricing_key: str,
                         price_field: str = "input_price") -> Decimal:
    """Compute the price of ``total_tokens`` with volume discounts applied.

    Each tier's rate applies only to the tokens that fall between the previous
    tier's ceiling and its own ``up_to_tokens`` ceiling. If the table has no
    open-ended tier, tokens above the last ceiling are not billed.

    Args:
        total_tokens: Number of tokens to price; must not be negative.
        pricing_key: Key into ``PRICING_TIERS``.
        price_field: Which per-tier rate to apply, ``"input_price"`` (the
            default) or ``"output_price"``.

    Returns:
        The total price as a ``Decimal``.

    Raises:
        ValueError: If ``total_tokens`` is negative.
        KeyError: If ``pricing_key`` or ``price_field`` is unknown.
    """
    if total_tokens < 0:
        raise ValueError(f"total_tokens must be non-negative, got {total_tokens}")

    tiers = PRICING_TIERS[pricing_key]['tiers']
    total_cost = Decimal('0')
    band_start = 0  # cumulative token count where the current tier begins
    for tier in tiers:
        ceiling = tier['up_to_tokens']
        is_final_band = ceiling is None or total_tokens <= ceiling
        band_end = total_tokens if is_final_band else ceiling
        # Bill only this tier's band; the ceilings are cumulative, so the band
        # width is the distance from the previous ceiling, not the ceiling itself.
        total_cost += Decimal(str(band_end - band_start)) / 1_000_000 * \
            Decimal(str(tier[price_field]))
        if is_final_band:
            break
        band_start = ceiling
    return total_cost
Fraud Prevention
class FraudDetector:
    """Blocks requests from customers whose recent usage spikes abnormally."""

    async def check_request(self, customer_id: str, model_id: str) -> bool:
        """Return ``True`` if the request may proceed, ``False`` if it is blocked.

        A spike is recent usage (looked up with ``minutes=5``) that is more
        than 10x the average usage (looked up with ``hours=24``) and also
        above 1000. On a spike the customer is flagged for review with the
        reason ``"usage_spike"`` and security is notified.
        NOTE(review): the units of both usage figures come from helpers not
        shown here — confirm they are comparable.
        """
        last_minutes_usage = await self.get_recent_usage(customer_id, minutes=5)
        baseline_usage = await self.get_average_usage(customer_id, hours=24)

        exceeds_baseline = last_minutes_usage > baseline_usage * 10
        is_spike = exceeds_baseline and last_minutes_usage > 1000
        if not is_spike:
            return True

        await self.flag_for_review(customer_id, "usage_spike")
        await self.notify_security(customer_id, last_minutes_usage)
        # Blocked until the review is done.
        return False
The system automatically generates monthly invoices via Stripe and supports prepaid balances, with automatic credit-card recharge when the balance drops to a minimum threshold.







