AI Model Pay-Per-Inference Billing System

We design and deploy artificial intelligence systems, from prototypes to production-ready solutions. Our team combines expertise in machine learning, data engineering, and MLOps to make AI work in real business, not just in the lab.
Showing 1 of 1 services · All 1566 services
AI Model Pay-Per-Inference Billing System
Medium
~2-4 weeks
FAQ
AI Development Areas
AI Solution Development Stages
Latest works
  • image_website-b2b-advance_0.png
    B2B ADVANCE company website development
    1212
  • image_web-applications_feedme_466_0.webp
    Development of a web application for FEEDME
    1161
  • image_websites_belfingroup_462_0.webp
    Website development for BELFINGROUP
    852
  • image_ecommerce_furnoro_435_0.webp
    Development of an online store for the company FURNORO
    1041
  • image_logo-advance_0.png
    B2B Advance company logo design
    561
  • image_crm_enviok_479_0.webp
    Development of a web application for Enviok
    822

Developing a pay-per-inference billing system for AI models

Pay-per-inference billing is the primary monetization model for AI APIs. The system must measure usage accurately, prevent fraud, keep customer balances up to date in real time, and generate transparent invoices.

Usage measurement units

  • Requests: the simplest metric, but it does not reflect the complexity of an individual request
  • Tokens (input + output): standard for LLM
  • GPU seconds: for resource-intensive models (image generation, video)
  • Compute units: a normalized metric aggregating CPU/GPU time

Billing architecture

import asyncio
import uuid
from datetime import datetime, timezone
from decimal import Decimal

class InferenceBillingSystem:
    """Pay-per-inference billing: reserve funds before a call, settle after it.

    Balances are kept in Redis under ``balance:{customer_id}`` and are read,
    debited and credited as decimal currency amounts (the same unit the
    pricing helpers return).  Each reservation is stored under
    ``reservation:{reservation_id}`` with a TTL and holds the reserved amount.

    ``get_pricing``, ``estimate_cost``, ``get_customer_by_reservation`` and
    ``InsufficientBalanceError`` are used here but defined outside this block.

    NOTE(review): if a reservation expires before ``settle`` runs, nothing in
    this class returns the reserved amount to the customer - confirm that a
    reconciliation job covers that case.
    """

    # Lifetime of a reservation key in Redis, in seconds.
    RESERVATION_TTL_SECONDS = 300

    def __init__(self, redis_client, db, stripe_client):
        self.redis = redis_client
        self.db = db
        self.stripe = stripe_client

    @staticmethod
    def _as_text(raw):
        """Return a Redis reply as ``str`` (clients may hand back ``bytes``)."""
        if isinstance(raw, (bytes, bytearray)):
            return raw.decode()
        return raw

    async def pre_authorize(self, customer_id: str,
                            model_id: str, request: dict) -> str:
        """Check the balance and reserve the estimated cost before inference.

        Args:
            customer_id: Customer whose balance is debited.
            model_id: Model whose pricing is used for the estimate.
            request: Inference request passed to ``estimate_cost``.

        Returns:
            The id of the created reservation; pass it to ``settle``.

        Raises:
            InsufficientBalanceError: The balance is below the estimated cost.
        """
        model_pricing = await self.get_pricing(model_id)
        estimated_cost = self.estimate_cost(model_pricing, request)

        # Fast balance check in Redis.
        # NOTE(review): the check and the debit below are separate round-trips,
        # so two concurrent requests can both pass the check.  Use a Lua script
        # or WATCH/MULTI if overdraft must be impossible.
        balance_key = f"balance:{customer_id}"
        balance = Decimal(self._as_text(await self.redis.get(balance_key)) or '0')

        if balance < estimated_cost:
            raise InsufficientBalanceError(
                f"Balance ${balance} < estimated ${estimated_cost}"
            )

        # Hold the funds.  The debit uses the same unit as the balance check
        # above and the refund in settle(); debiting integer cents here would
        # over-charge by 100x and truncate sub-cent costs to zero.
        reservation_id = str(uuid.uuid4())
        pipe = self.redis.pipeline()
        pipe.incrbyfloat(balance_key, -float(estimated_cost))
        pipe.setex(f"reservation:{reservation_id}",
                   self.RESERVATION_TTL_SECONDS, str(estimated_cost))
        await pipe.execute()

        return reservation_id

    async def settle(self, reservation_id: str, model_id: str,
                     actual_usage: dict):
        """Finalize billing for a reservation using the actual usage.

        Refunds the difference when the estimate was too high and debits the
        shortfall when it was too low (the balance may go negative in that
        case), writes an audit record to the database, then removes the
        reservation.

        Args:
            reservation_id: Id returned by ``pre_authorize``.
            model_id: Model whose pricing is applied.
            actual_usage: Measured usage; see ``compute_actual_cost`` for keys.

        Raises:
            LookupError: The reservation does not exist (expired or already
                settled), so the reserved amount is unknown.
        """
        reservation_key = f"reservation:{reservation_id}"
        model_pricing = await self.get_pricing(model_id)
        actual_cost = self.compute_actual_cost(model_pricing, actual_usage)

        reserved = self._as_text(await self.redis.get(reservation_key))
        if reserved is None:
            raise LookupError(
                f"Reservation {reservation_id} not found or expired"
            )
        estimated_cost = Decimal(reserved)

        # Positive delta: refund the over-reservation.
        # Negative delta: charge the part the estimate did not cover.
        delta = estimated_cost - actual_cost
        customer_id = await self.get_customer_by_reservation(reservation_id)

        if delta != 0:
            await self.redis.incrbyfloat(f"balance:{customer_id}", float(delta))

        # Audit record in the database.
        await self.db.insert_usage_record({
            'customer_id': customer_id,
            'model_id': model_id,
            'reservation_id': reservation_id,
            'actual_cost': float(actual_cost),
            'usage_details': actual_usage,
            # Naive UTC timestamp, same value datetime.utcnow() produced,
            # without relying on the deprecated call.
            'timestamp': datetime.now(timezone.utc).replace(tzinfo=None)
        })

        await self.redis.delete(reservation_key)

    def compute_actual_cost(self, pricing: dict, usage: dict) -> Decimal:
        """Price the measured usage.

        Token prices are applied per 1,000,000 tokens; the GPU price is
        applied per second.  Only the usage keys that are present
        (``input_tokens``, ``output_tokens``, ``gpu_seconds``) contribute.

        Args:
            pricing: Mapping with ``input_token_price``, ``output_token_price``
                and/or ``gpu_second_price`` for the usage keys supplied.
            usage: Measured usage for one inference call.

        Returns:
            The total cost as a ``Decimal``.

        Raises:
            KeyError: A usage key is present but its price is missing.
        """
        cost = Decimal('0')

        if 'input_tokens' in usage:
            cost += Decimal(str(usage['input_tokens'])) / 1_000_000 * \
                    Decimal(str(pricing['input_token_price']))

        if 'output_tokens' in usage:
            cost += Decimal(str(usage['output_tokens'])) / 1_000_000 * \
                    Decimal(str(pricing['output_token_price']))

        if 'gpu_seconds' in usage:
            cost += Decimal(str(usage['gpu_seconds'])) * \
                    Decimal(str(pricing['gpu_second_price']))

        return cost

Volume Discounts and Pricing Tiers

# Volume-discount price table, keyed by pricing plan.
# Each tier carries an "up_to_tokens" bound (None marks the open-ended last
# tier) plus an input and an output price; compute_tiered_price applies the
# price per 1,000,000 tokens.
PRICING_TIERS = {
    "gpt4-equivalent": {
        "tiers": [
            {
                "up_to_tokens": token_bound,
                "input_price": price_in,
                "output_price": price_out,
            }
            for token_bound, price_in, price_out in (
                (1_000_000, 10.0, 30.0),
                (10_000_000, 8.0, 24.0),
                (100_000_000, 6.0, 18.0),
                (None, 5.0, 15.0),
            )
        ]
    }
}

def compute_tiered_price(total_tokens: int, pricing_key: str, *,
                         price_field: str = "input_price",
                         pricing_tiers: "dict | None" = None) -> Decimal:
    """Compute the price of ``total_tokens`` with volume discounts applied.

    Tiers are walked in order.  ``up_to_tokens`` is treated as a cumulative
    upper bound, so a tier covers the tokens between the previous tier's bound
    and its own (e.g. with bounds 1M and 10M the second tier covers 9M
    tokens).  ``None`` marks an open-ended tier.  Prices are per 1,000,000
    tokens.

    Args:
        total_tokens: Number of tokens to price; must not be negative.
        pricing_key: Plan name to look up in the pricing table.
        price_field: Which per-tier price to apply, ``"input_price"``
            (default, the original behaviour) or ``"output_price"``.
        pricing_tiers: Optional pricing table with the same shape as
            ``PRICING_TIERS``; defaults to ``PRICING_TIERS``.

    Returns:
        The total cost as a ``Decimal``.

    Raises:
        ValueError: ``total_tokens`` is negative.
        KeyError: ``pricing_key`` or ``price_field`` is unknown.

    Note:
        The table is expected to end with an open-ended tier; tokens beyond a
        finite last bound are not priced.
    """
    if total_tokens < 0:
        raise ValueError(f"total_tokens must be >= 0, got {total_tokens}")

    table = PRICING_TIERS if pricing_tiers is None else pricing_tiers
    tiers = table[pricing_key]['tiers']

    total_cost = Decimal('0')
    billed = 0  # tokens priced so far == lower bound of the current tier

    for tier in tiers:
        upper_bound = tier['up_to_tokens']
        if upper_bound is None or total_tokens <= upper_bound:
            tokens_in_tier = total_tokens - billed
        else:
            # Only the span between the previous bound and this one belongs
            # to this tier; the bound itself is cumulative, not a width.
            tokens_in_tier = upper_bound - billed

        total_cost += Decimal(str(tokens_in_tier)) / 1_000_000 * \
                      Decimal(str(tier[price_field]))
        billed += tokens_in_tier

        if billed >= total_tokens:
            break

    return total_cost

Fraud Prevention

class FraudDetector:
    """Blocks requests from customers whose recent usage spikes abnormally.

    ``get_recent_usage``, ``get_average_usage``, ``flag_for_review`` and
    ``notify_security`` are used here but defined outside this block.
    """

    async def check_request(self, customer_id: str, model_id: str) -> bool:
        """Return True when the request may proceed, False when it is blocked.

        A request is blocked when usage over the last 5 minutes is more than
        10x the 24-hour average and also above 1000; the customer is then
        flagged for review and security is notified.
        """
        last_five_min = await self.get_recent_usage(customer_id, minutes=5)
        daily_average = await self.get_average_usage(customer_id, hours=24)

        is_spike = last_five_min > daily_average * 10 and last_five_min > 1000
        if not is_spike:
            return True

        # Anomalous spike: block until the account has been reviewed.
        await self.flag_for_review(customer_id, "usage_spike")
        await self.notify_security(customer_id, last_five_min)
        return False

The system automatically generates monthly invoices via Stripe, supports prepaid balances, and can automatically recharge the balance from a credit card when it falls below a minimum threshold.