LLM Response Caching Exact and Semantic Cache Implementation

We design and deploy artificial intelligence systems: from prototype to production-ready solutions. Our team combines expertise in machine learning, data engineering and MLOps to make AI work not in the lab, but in real business.
Showing 1 of 1 services. All 1566 services
LLM Response Caching Exact and Semantic Cache Implementation
Medium
~2-3 business days
FAQ
AI Development Areas
AI Solution Development Stages
Latest works
  • image_website-b2b-advance_0.png
    B2B ADVANCE company website development
    1212
  • image_web-applications_feedme_466_0.webp
    Development of a web application for FEEDME
    1161
  • image_websites_belfingroup_462_0.webp
    Website development for BELFINGROUP
    852
  • image_ecommerce_furnoro_435_0.webp
    Development of an online store for the company FURNORO
    1041
  • image_logo-advance_0.png
    B2B Advance company logo design
    561
  • image_crm_enviok_479_0.webp
    Development of a web application for Enviok
    822

Реализация кэширования LLM-ответов: Exact и Semantic Cache

LLM-запросы дорогие и медленные. Кэширование — самый дешёвый способ снизить оба показателя. Два подхода: Exact Cache (хэш промпта) — для строго одинаковых запросов; Semantic Cache (эмбеддинги + векторный поиск) — для семантически похожих запросов. В типичном приложении 30–40% запросов можно закрыть кешем.

Exact Cache

import hashlib
import json
import redis
from typing import Optional
from functools import wraps

class ExactLLMCache:
    """Exact-match LLM response cache backed by Redis.

    The cache key is the SHA-256 digest of the canonicalized request
    (messages + model + temperature), so only byte-identical requests hit.
    """

    def __init__(self, redis_url: str = "redis://localhost:6379", ttl: int = 3600):
        """
        Args:
            redis_url: connection URL for the Redis instance.
            ttl: lifetime of a cached response, in seconds.
        """
        self.redis = redis.from_url(redis_url)
        self.ttl = ttl

    def _make_key(self, messages: list[dict], model: str, temperature: float) -> str:
        """Build a deterministic cache key from the request parameters."""
        cache_input = {
            "messages": messages,
            "model": model,
            "temperature": temperature,
        }
        # sort_keys canonicalizes the JSON so dict ordering cannot change the key.
        content = json.dumps(cache_input, sort_keys=True, ensure_ascii=False)
        return f"llm:exact:{hashlib.sha256(content.encode()).hexdigest()}"

    def get(self, messages: list[dict], model: str, temperature: float = 0) -> Optional[str]:
        """Return the cached response, or None on a cache miss."""
        key = self._make_key(messages, model, temperature)
        cached = self.redis.get(key)
        # Fix: use `is not None`, not truthiness — a cached empty-string
        # response is a valid hit and must not be treated as a miss.
        if cached is not None:
            return cached.decode()
        return None

    def set(self, messages: list[dict], model: str, temperature: float, response: str):
        """Store a response under the request's key with the configured TTL."""
        key = self._make_key(messages, model, temperature)
        self.redis.setex(key, self.ttl, response.encode())

    def cached_complete(self, complete_fn):
        """Decorator that wraps an LLM completion function with this cache."""
        @wraps(complete_fn)
        def wrapper(messages, model="gpt-4o", temperature=0, **kwargs):
            cached = self.get(messages, model, temperature)
            # Same fix as get(): an empty cached completion is still a hit.
            if cached is not None:
                return cached

            result = complete_fn(messages, model=model, temperature=temperature, **kwargs)
            self.set(messages, model, temperature, result)
            return result
        return wrapper

Semantic Cache с векторным поиском

from openai import OpenAI
import numpy as np
from dataclasses import dataclass

@dataclass
class CachedEntry:
    """One semantic-cache record: a question, its embedding, and the cached answer."""
    query_embedding: list[float]  # embedding vector of `question`
    question: str                 # original user question
    answer: str                   # cached LLM answer
    model: str                    # model that produced `answer`
    created_at: float             # Unix timestamp (time.time()) when the entry was added

class SemanticLLMCache:
    """Cache based on semantic similarity of questions.

    A lookup embeds the incoming question and linearly scans the stored
    entries for the most similar one; a hit requires cosine similarity
    at or above the configured threshold.
    """

    def __init__(
        self,
        similarity_threshold: float = 0.92,
        max_entries: int = 10000,
    ):
        """
        Args:
            similarity_threshold: minimum cosine similarity for a cache hit.
            max_entries: upper bound on stored entries; oldest are evicted.
        """
        self.openai = OpenAI()
        self.threshold = similarity_threshold
        # Fix: max_entries was previously ignored — set() hard-coded 10000.
        self.max_entries = max_entries
        self.entries: list[CachedEntry] = []

    def _get_embedding(self, text: str) -> list[float]:
        """Embed `text` with the text-embedding-3-small model."""
        response = self.openai.embeddings.create(
            model="text-embedding-3-small",
            input=text,
        )
        return response.data[0].embedding

    def _cosine_similarity(self, a: list[float], b: list[float]) -> float:
        """Cosine similarity of two vectors (assumes non-zero norms)."""
        a_arr = np.array(a)
        b_arr = np.array(b)
        return np.dot(a_arr, b_arr) / (np.linalg.norm(a_arr) * np.linalg.norm(b_arr))

    def get(self, question: str, model: Optional[str] = None) -> Optional[str]:
        """Return the answer of the most similar cached question, or None.

        If `model` is given, only entries produced by that model are
        considered. Each lookup costs one embedding API call plus an
        O(n) scan over the cache.
        """
        if not self.entries:
            return None

        query_embedding = self._get_embedding(question)

        best_similarity = 0.0
        best_answer = None

        for entry in self.entries:
            if model and entry.model != model:
                continue

            similarity = self._cosine_similarity(query_embedding, entry.query_embedding)
            if similarity > best_similarity:
                best_similarity = similarity
                best_answer = entry.answer

        if best_similarity >= self.threshold:
            return best_answer
        return None

    def set(self, question: str, answer: str, model: str):
        """Add a question/answer pair to the cache, evicting the oldest on overflow."""
        import time
        embedding = self._get_embedding(question)
        entry = CachedEntry(
            query_embedding=embedding,
            question=question,
            answer=answer,
            model=model,
            created_at=time.time(),
        )
        self.entries.append(entry)

        # Enforce the configured bound (was a hard-coded 10000 before),
        # keeping the most recently created entries.
        if len(self.entries) > self.max_entries:
            self.entries = sorted(self.entries, key=lambda e: e.created_at)[-self.max_entries:]

Комбинированный кэш с Redis + векторным хранилищем

import chromadb
import time

class ProductionSemanticCache:
    """Production-ready cache: Redis for exact hits, Chroma for semantic hits."""

    def __init__(self):
        self.redis = redis.from_url("redis://localhost:6379")
        self.chroma = chromadb.HttpClient(host="localhost", port=8000)
        self.collection = self.chroma.get_or_create_collection("llm_cache")
        self.openai = OpenAI()
        self.similarity_threshold = 0.93
        self.exact_ttl = 3600
        # TODO(review): semantic_ttl is declared but never enforced — Chroma
        # entries are kept forever; add periodic cleanup by `created_at`.
        self.semantic_ttl = 86400  # 24 hours

    @staticmethod
    def _exact_key(question: str, model: str) -> str:
        """Deterministic Redis key for an exact question+model match."""
        return f"llm:exact:{hashlib.md5(f'{question}:{model}'.encode()).hexdigest()}"

    def get(self, question: str, model: str) -> Optional[dict]:
        """Look up `question`: exact Redis match first, then semantic Chroma match.

        Returns {"answer", "cache_type"} (plus "similarity" for semantic
        hits), or None on a total miss.
        """
        # 1. Exact match first (fast: no embedding API call)
        exact_hit = self.redis.get(self._exact_key(question, model))
        if exact_hit:
            return {"answer": exact_hit.decode(), "cache_type": "exact"}

        # 2. Semantic match (one embedding call + vector query)
        embedding = self.openai.embeddings.create(
            model="text-embedding-3-small",
            input=question,
        ).data[0].embedding

        results = self.collection.query(
            query_embeddings=[embedding],
            n_results=1,
            where={"model": model},
        )

        if results["distances"] and results["distances"][0]:
            distance = results["distances"][0][0]
            # NOTE(review): assumes the collection's metric is cosine distance,
            # so similarity = 1 - distance — confirm the collection config.
            similarity = 1 - distance

            if similarity >= self.similarity_threshold:
                answer = results["documents"][0][0]
                return {"answer": answer, "cache_type": "semantic", "similarity": similarity}

        return None

    def set(self, question: str, answer: str, model: str):
        """Store `answer` in both the exact (Redis) and semantic (Chroma) caches."""
        # Exact cache in Redis, with TTL
        self.redis.setex(self._exact_key(question, model), self.exact_ttl, answer.encode())

        # Semantic cache in Chroma
        embedding = self.openai.embeddings.create(
            model="text-embedding-3-small",
            input=question,
        ).data[0].embedding

        # Fix: the id previously used builtin hash(question), which is
        # randomized per process (PYTHONHASHSEED), so ids were not
        # reproducible across restarts. Use a deterministic digest instead.
        question_digest = hashlib.sha256(question.encode()).hexdigest()[:16]
        self.collection.add(
            ids=[f"{int(time.time())}_{question_digest}"],
            embeddings=[embedding],
            documents=[answer],
            metadatas=[{"model": model, "question": question, "created_at": time.time()}],
        )

Практический кейс: FAQ-бот

Профиль: 5000 вопросов/день, 70% повторяющихся (FAQ про продукт).

До кэша: все запросы → GPT-4o = $180/мес, p95 latency 2.3 сек.

После кэша:

  • Exact cache hit: 35% запросов, latency < 5 мс
  • Semantic cache hit: 28% запросов, latency ~50 мс (embedding + поиск)
  • LLM запросы: 37% от исходного объёма = $67/мес (-63%)
  • Средняя latency: 2.3 сек → 0.4 сек

Сроки

  • Exact cache (Redis): 0.5–1 день
  • Semantic cache (Chroma + embeddings): 2–3 дня
  • Production с метриками hit rate: 1 неделя