Implementation of Token Counting and LLM Request Budgeting
With intensive use of LLMs (GPT-4, Claude, Gemini), API costs quickly become a significant budget item. Token counting and budgeting is a system for tracking, forecasting, and controlling LLM costs at the level of individual functions, users, and teams.
Counting tokens before sending a request
import tiktoken
from anthropic import Anthropic
# OpenAI / GPT-4
def count_tokens_openai(text: str, model: str = "gpt-4") -> int:
    """Return the number of tokens ``text`` encodes to for ``model``.

    Args:
        text: The text to tokenize.
        model: Model name used to select the tiktoken encoding.

    Returns:
        The length of the token sequence produced by the encoding.

    Note:
        For model names tiktoken cannot map to an encoding, the
        ``cl100k_base`` encoding is used instead of raising, so the count
        is only an approximation for those models.
    """
    try:
        enc = tiktoken.encoding_for_model(model)
    except KeyError:
        # encoding_for_model raises KeyError for names it does not know
        # (for example non-OpenAI models); fall back to a general-purpose
        # encoding so that cost estimation can still proceed.
        enc = tiktoken.get_encoding("cl100k_base")
    return len(enc.encode(text))
def estimate_request_cost(prompt: str, max_completion: int = 1000,
                          model: str = "gpt-4-turbo") -> dict:
    """Estimate the upper-bound USD cost of a request before sending it.

    The prompt is tokenized with ``count_tokens_openai`` and the completion
    is assumed to use all ``max_completion`` tokens.

    Args:
        prompt: The prompt text that will be sent.
        max_completion: Maximum number of completion tokens to budget for.
        model: Model name; selects both the tokenizer and the price row.

    Returns:
        A dict with keys ``input_tokens``, ``max_output_tokens`` and
        ``estimated_cost_usd``.
    """
    input_tokens = count_tokens_openai(prompt, model)

    # Prices in USD per 1M tokens (current at the time of writing).
    prices = {
        "gpt-4-turbo": {"input": 10.0, "output": 30.0},
        "gpt-4o": {"input": 5.0, "output": 15.0},
        "gpt-4o-mini": {"input": 0.15, "output": 0.60},
        "claude-3-5-sonnet": {"input": 3.0, "output": 15.0},
    }
    # Models missing from the table are priced at the gpt-4-turbo rate
    # (the most expensive row) rather than raising.
    price = prices.get(model, {"input": 10.0, "output": 30.0})

    estimated_cost = (input_tokens / 1_000_000 * price["input"] +
                      max_completion / 1_000_000 * price["output"])
    return {
        "input_tokens": input_tokens,
        "max_output_tokens": max_completion,
        "estimated_cost_usd": estimated_cost,
    }
Budgeting system
from dataclasses import dataclass, field
import threading
@dataclass
class TokenBudget:
    """Spending limits (in USD) together with running spend totals.

    The lock is excluded from the generated ``__eq__`` and ``__repr__``:
    every instance owns a distinct lock object, so including it would make
    two budgets with identical values compare unequal.
    """

    # Global spending cap per day, in USD.
    daily_limit_usd: float
    # Global spending cap per month, in USD.
    monthly_limit_usd: float
    # Spending cap per user per day, in USD.
    per_user_daily_limit_usd: float = 1.0
    # Running totals; nothing in this class updates them.
    spent_today: float = 0.0
    spent_month: float = 0.0
    # Per-instance lock, presumably meant to guard the running totals;
    # still accepted by __init__ for backward compatibility.
    _lock: threading.Lock = field(
        default_factory=threading.Lock, repr=False, compare=False
    )
class LLMBudgetManager:
    """Enforce daily spending limits using counters kept in Redis.

    Two counters are used: ``budget:daily`` for the global total and
    ``budget:user:{user_id}:daily`` for each user.

    NOTE(review): ``budget.monthly_limit_usd`` is never checked here, and
    the 24-hour TTL is refreshed on every reservation, so a counter only
    expires after 24 hours without any reservation. Confirm whether that
    is intended.
    """

    def __init__(self, redis_client, budget: TokenBudget):
        """Store the Redis client and the budget whose limits are enforced."""
        self.redis = redis_client
        self.budget = budget

    def check_and_reserve(self, user_id: str, estimated_cost: float) -> bool:
        """Check the budget before a request and reserve the estimated cost.

        Args:
            user_id: Identifier of the user issuing the request.
            estimated_cost: Estimated request cost in USD.

        Returns:
            ``True`` once the reservation has been recorded.

        Raises:
            BudgetExceededError: If the reservation would push the global
                or the per-user daily counter above its limit.
        """
        daily_key = "budget:daily"
        user_key = f"budget:user:{user_id}:daily"

        # Global daily limit.
        daily_spent = float(self.redis.get(daily_key) or 0)
        if daily_spent + estimated_cost > self.budget.daily_limit_usd:
            raise BudgetExceededError(
                f"Daily budget ${self.budget.daily_limit_usd} exceeded"
            )

        # Per-user daily limit.
        user_spent = float(self.redis.get(user_key) or 0)
        if user_spent + estimated_cost > self.budget.per_user_daily_limit_usd:
            raise BudgetExceededError(
                f"User daily budget ${self.budget.per_user_daily_limit_usd} exceeded"
            )

        # Reservation: both counters are bumped in one pipeline.
        # NOTE(review): the reads above and this write are separate
        # round trips, so concurrent callers can both pass the checks
        # before either reservation lands.
        pipe = self.redis.pipeline()
        pipe.incrbyfloat(daily_key, estimated_cost)
        pipe.expire(daily_key, 86400)
        pipe.incrbyfloat(user_key, estimated_cost)
        pipe.expire(user_key, 86400)
        pipe.execute()
        return True

    def record_actual_cost(self, user_id: str, actual_cost: float,
                           estimated_cost: float):
        """Correct the reserved amount once the real usage is known.

        The difference between actual and estimated cost is applied to
        both the global and the per-user counter, mirroring the two
        counters that ``check_and_reserve`` incremented.

        Args:
            user_id: Identifier of the user the reservation was made for.
            actual_cost: Real cost of the request in USD.
            estimated_cost: Amount previously reserved in USD.
        """
        correction = actual_cost - estimated_cost
        # Differences of at most 0.001 USD are ignored.
        if abs(correction) > 0.001:
            pipe = self.redis.pipeline()
            pipe.incrbyfloat("budget:daily", correction)
            pipe.incrbyfloat(f"budget:user:{user_id}:daily", correction)
            pipe.execute()
Middleware for automatic accounting
import functools
def track_llm_cost(model: str = "gpt-4o"):
    """Build a decorator that records the cost of an async LLM call.

    After the wrapped coroutine returns, the wrapper reads
    ``prompt_tokens`` and ``completion_tokens`` from ``result.usage``,
    converts them to a cost with ``compute_cost`` and reports it through
    ``analytics.record``. Results without usage data are passed through
    without any accounting.

    Args:
        model: Model name passed to ``compute_cost`` and recorded with
            the analytics entry.

    Returns:
        A decorator for ``async`` functions; the wrapped function's
        result is returned unchanged.
    """
    def decorator(func):
        @functools.wraps(func)
        async def wrapper(*args, **kwargs):
            result = await func(*args, **kwargs)

            # Extract usage from the response. The attribute may be
            # absent or present but None; skip accounting in both cases
            # instead of failing after the call already succeeded.
            usage = getattr(result, "usage", None)
            if usage is not None:
                cost = compute_cost(
                    usage.prompt_tokens,
                    usage.completion_tokens,
                    model,
                )
                analytics.record(
                    function=func.__name__,
                    model=model,
                    cost=cost,
                    input_tokens=usage.prompt_tokens,
                    output_tokens=usage.completion_tokens,
                )
            return result
        return wrapper
    return decorator
Typical result of implementing this system: LLM costs fall by 20–40%, because tracking exposes inefficient queries (long, unnecessary system prompts; duplicate queries that are not cached) and makes it possible to choose the right model for each task.







