LLM Token Counting and Request Budget Management Implementation

We design and deploy artificial intelligence systems: from prototype to production-ready solutions. Our team combines expertise in machine learning, data engineering and MLOps to make AI work not in the lab, but in real business.
Showing 1 of 1 services. All 1566 services.
LLM Token Counting and Request Budget Management Implementation
Simple
~2-3 business days
FAQ
AI Development Areas
AI Solution Development Stages
Latest works
  • image_website-b2b-advance_0.png
    B2B ADVANCE company website development
    1212
  • image_web-applications_feedme_466_0.webp
    Development of a web application for FEEDME
    1161
  • image_websites_belfingroup_462_0.webp
    Website development for BELFINGROUP
    852
  • image_ecommerce_furnoro_435_0.webp
    Development of an online store for the company FURNORO
    1041
  • image_logo-advance_0.png
    B2B Advance company logo design
    561
  • image_crm_enviok_479_0.webp
    Development of a web application for Enviok
    822

Implementation of Token Counting and LLM Request Budgeting

With intensive use of LLM (GPT-4, Claude, Gemini), API costs quickly become a significant budget item. Token counting and budgeting is a system for tracking, forecasting, and controlling LLM costs at the level of individual functions, users, and teams.

Counting tokens before sending a request

import tiktoken
from anthropic import Anthropic

# OpenAI / GPT-4
def count_tokens_openai(text: str, model: str = "gpt-4") -> int:
    """Return the number of tokens ``text`` encodes to for ``model``.

    Args:
        text: The prompt text to tokenize.
        model: OpenAI model name used to select the tokenizer.

    Returns:
        Token count as an ``int``.
    """
    try:
        enc = tiktoken.encoding_for_model(model)
    except KeyError:
        # Unknown/brand-new model names are not in tiktoken's mapping;
        # the OpenAI cookbook recommends falling back to cl100k_base
        # rather than letting the KeyError propagate.
        enc = tiktoken.get_encoding("cl100k_base")
    return len(enc.encode(text))

def estimate_request_cost(prompt: str, max_completion: int = 1000,
                          model: str = "gpt-4-turbo") -> dict:
    """Estimate the worst-case USD cost of one LLM request.

    The output side is priced at ``max_completion`` tokens (upper bound),
    so the returned cost is a conservative pre-flight estimate suitable
    for budget reservation.

    Args:
        prompt: The full prompt text to be sent.
        max_completion: Maximum completion tokens the request may generate.
        model: Model name used for both tokenization and price lookup.

    Returns:
        Dict with ``input_tokens``, ``max_output_tokens`` and
        ``estimated_cost_usd``.
    """
    input_tokens = count_tokens_openai(prompt, model)
    total_tokens = input_tokens + max_completion

    # Prices per 1M tokens (accurate at the time of writing).
    prices = {
        "gpt-4-turbo": {"input": 10.0, "output": 30.0},
        "gpt-4o": {"input": 5.0, "output": 15.0},
        "gpt-4o-mini": {"input": 0.15, "output": 0.60},
        "claude-3-5-sonnet": {"input": 3.0, "output": 15.0},
    }

    # Unknown models fall back to the most expensive tier (gpt-4-turbo)
    # so the estimate stays conservative; reference the table entry
    # instead of duplicating the numbers.
    price = prices.get(model, prices["gpt-4-turbo"])
    estimated_cost = (input_tokens / 1_000_000 * price["input"] +
                      max_completion / 1_000_000 * price["output"])

    return {
        "input_tokens": input_tokens,
        "max_output_tokens": max_completion,
        "estimated_cost_usd": estimated_cost
    }

Budgeting system

from dataclasses import dataclass, field
import threading

@dataclass
class TokenBudget:
    """Spending limits and running totals for LLM API usage (amounts in USD)."""
    daily_limit_usd: float
    monthly_limit_usd: float
    per_user_daily_limit_usd: float = 1.0
    # Running totals; plain defaults suffice for immutable float values.
    spent_today: float = 0.0
    spent_month: float = 0.0
    # Excluded from __repr__/__eq__: locks compare by identity, so leaving
    # the lock in the generated __eq__ made two value-equal budgets never
    # compare equal, and it cluttered the repr with a lock object.
    _lock: threading.Lock = field(default_factory=threading.Lock,
                                  repr=False, compare=False)

class LLMBudgetManager:
    """Enforces daily LLM spending limits (global and per-user) via Redis counters."""

    # Counters are daily; keys expire after 24h.
    DAILY_TTL_SECONDS = 86400

    def __init__(self, redis_client, budget: TokenBudget):
        self.redis = redis_client
        self.budget = budget

    def check_and_reserve(self, user_id: str, estimated_cost: float) -> bool:
        """Atomically reserve ``estimated_cost`` against the daily budgets.

        Reserves first (atomic INCRBYFLOAT in a pipeline) and checks the
        limits on the returned values; on overflow the reservation is
        rolled back before raising. This closes the check-then-act race
        the previous get/compare/increment sequence had under concurrency.

        Raises:
            BudgetExceededError: if the global or per-user daily limit
                would be exceeded.
        """
        daily_key = "budget:daily"
        user_key = f"budget:user:{user_id}:daily"

        pipe = self.redis.pipeline()
        pipe.incrbyfloat(daily_key, estimated_cost)
        pipe.expire(daily_key, self.DAILY_TTL_SECONDS)
        pipe.incrbyfloat(user_key, estimated_cost)
        pipe.expire(user_key, self.DAILY_TTL_SECONDS)
        results = pipe.execute()
        # Pipeline results: [incr, expire, incr, expire].
        daily_spent = float(results[0])
        user_spent = float(results[2])

        if daily_spent > self.budget.daily_limit_usd:
            self._rollback(daily_key, user_key, estimated_cost)
            raise BudgetExceededError(
                f"Daily budget ${self.budget.daily_limit_usd} exceeded"
            )
        if user_spent > self.budget.per_user_daily_limit_usd:
            self._rollback(daily_key, user_key, estimated_cost)
            raise BudgetExceededError(
                f"User daily budget ${self.budget.per_user_daily_limit_usd} exceeded"
            )
        return True

    def _rollback(self, daily_key: str, user_key: str, amount: float) -> None:
        """Undo a failed reservation on both counters."""
        pipe = self.redis.pipeline()
        pipe.incrbyfloat(daily_key, -amount)
        pipe.incrbyfloat(user_key, -amount)
        pipe.execute()

    def record_actual_cost(self, user_id: str, actual_cost: float,
                           estimated_cost: float):
        """Correct the counters once the real usage is known.

        Applies the delta between actual and estimated cost to BOTH the
        global and the per-user counter (the previous version only
        corrected the global one, so user budgets drifted).
        """
        correction = actual_cost - estimated_cost
        if abs(correction) > 0.001:
            pipe = self.redis.pipeline()
            pipe.incrbyfloat("budget:daily", correction)
            pipe.incrbyfloat(f"budget:user:{user_id}:daily", correction)
            pipe.execute()

Middleware for automatic accounting

import functools

def track_llm_cost(model: str = "gpt-4o"):
    """Decorator factory: records cost/token analytics for an async LLM call.

    The wrapped coroutine's result is inspected for an OpenAI-style
    ``usage`` attribute; when present, the call's cost is computed and
    sent to analytics. The response is returned unchanged either way.

    Args:
        model: Model name used for cost computation and analytics tagging.
    """
    def decorator(func):
        @functools.wraps(func)
        async def wrapper(*args, **kwargs):
            result = await func(*args, **kwargs)
            # Extract usage from the response. getattr with a default also
            # covers responses where `usage` exists but is None (e.g.
            # streaming responses), which the old hasattr check let through
            # and then crashed on attribute access.
            usage = getattr(result, 'usage', None)
            if usage is not None:
                cost = compute_cost(
                    usage.prompt_tokens,
                    usage.completion_tokens,
                    model
                )
                analytics.record(
                    function=func.__name__,
                    model=model,
                    cost=cost,
                    input_tokens=usage.prompt_tokens,
                    output_tokens=usage.completion_tokens
                )
            return result
        return wrapper
    return decorator

Typical implementation result: a 20–40% reduction in LLM costs, achieved by identifying inefficient queries (overly long system prompts, duplicate queries sent without caching) and by selecting the right model for each task.