Setting up load balancing for GPU instances
Load balancing between GPU instances with LLM has its own nuances compared to regular web servers: stateful KV cache, long-running requests (streaming), and varying request costs (from one to thousands of tokens).
Balancing algorithms for LLM
Round-robin: simple, ignores the current load. Suboptimal: one long request overloads the server while others are idle.
Least connections: Routes to the server with the fewest active connections. Better than round-robin, but doesn't take request length into account.
Least pending tokens: directs to the server with the fewest tokens in the generation queue. Most effective for LLMs. Implemented via a custom load balancer (the sample below approximates this with the waiting-request count exposed by the server's metrics).
Nginx upstream with health checks
upstream vllm_cluster {
# Least connections — baseline strategy: route to the backend with the
# fewest active connections (does not account for request length).
least_conn;
server 10.0.1.10:8000 max_fails=3 fail_timeout=30s weight=1;
server 10.0.1.11:8000 max_fails=3 fail_timeout=30s weight=1;
server 10.0.1.12:8000 max_fails=3 fail_timeout=30s weight=1;
server 10.0.1.13:8000 max_fails=3 fail_timeout=30s weight=1;
keepalive 100; # persistent connections to the backends (avoids per-request TCP handshakes)
keepalive_requests 1000;
keepalive_timeout 60s;
}
server {
listen 443 ssl http2;
server_name llm-api.internal;
location /v1/ {
proxy_pass http://vllm_cluster;
proxy_http_version 1.1;
proxy_set_header Connection "";
# Generous timeouts for long streaming responses
proxy_read_timeout 600s;
proxy_send_timeout 600s;
proxy_connect_timeout 5s;
# Streaming: disable buffering so tokens reach the client as generated
proxy_buffering off;
proxy_cache off;
chunked_transfer_encoding on;
# Circuit breaker: retry the next backend on errors/timeouts
proxy_next_upstream error timeout http_500 http_502 http_503;
proxy_next_upstream_tries 2;
proxy_next_upstream_timeout 10s;
}
# Active health check (nginx plus) or via a separate endpoint.
# NOTE(review): this proxies /health to whichever backend least_conn
# picks — it reports one backend's health, not the cluster's; confirm
# that is the intent.
location /health {
proxy_pass http://vllm_cluster/health;
}
}
Custom least-pending load balancer
from fastapi import FastAPI, Request
import httpx
import asyncio
class LLMLeastPendingBalancer:
    """Least-pending load balancer for LLM backends.

    Before each request, polls every healthy backend's Prometheus
    ``/metrics`` endpoint and routes the request to the backend with the
    fewest queued requests (``vllm:num_requests_waiting``).
    """

    def __init__(self, backends: list[str]):
        # Per-backend state: in-flight request count and a health flag.
        # NOTE(review): "healthy" is never set to False anywhere in this
        # snippet — active health checking still needs to be wired up.
        self.backends = {url: {"pending": 0, "healthy": True} for url in backends}
        self.client = httpx.AsyncClient(timeout=300)

    async def get_backend(self) -> str:
        """Pick the backend with the fewest waiting requests.

        Raises:
            RuntimeError: if no backend is healthy, or metrics could not
                be fetched from any healthy backend.
        """
        healthy = {url: info for url, info in self.backends.items() if info["healthy"]}
        if not healthy:
            raise RuntimeError("No healthy backends")
        # Fetch fresh queue depths from every healthy backend.
        metrics = await self._fetch_metrics(list(healthy.keys()))
        # BUG FIX: if every fetch failed, metrics is empty and min() would
        # raise an opaque ValueError; fail with an explicit error instead.
        if not metrics:
            raise RuntimeError("No healthy backends")
        best = min(metrics.items(), key=lambda x: x[1].get("vllm_num_requests_waiting", 0))
        return best[0]

    async def _fetch_metrics(self, backends: list[str]) -> dict:
        """Fetch queue metrics concurrently; drop backends whose fetch failed."""
        tasks = [self._get_backend_queue(url) for url in backends]
        results = await asyncio.gather(*tasks, return_exceptions=True)
        return {url: result for url, result in zip(backends, results)
                if not isinstance(result, Exception)}

    async def _get_backend_queue(self, url: str) -> dict:
        """Scrape one backend's Prometheus metrics for its waiting-queue depth."""
        response = await self.client.get(f"{url}/metrics")
        # Parse the Prometheus text exposition format line by line.
        for line in response.text.split('\n'):
            if line.startswith('vllm:num_requests_waiting'):
                return {"vllm_num_requests_waiting": float(line.split()[-1])}
        # Metric absent — treat as an empty queue.
        return {"vllm_num_requests_waiting": 0}

    async def forward(self, request: Request) -> httpx.Response:
        """Proxy one incoming request to the least-loaded backend."""
        backend = await self.get_backend()
        url = f"{backend}{request.url.path}"
        # BUG FIX: drop the incoming Host header so httpx derives it from
        # the backend URL; forwarding the balancer's Host breaks backends
        # that use virtual hosting.
        headers = {k: v for k, v in request.headers.items() if k.lower() != "host"}
        self.backends[backend]["pending"] += 1
        try:
            return await self.client.request(
                method=request.method,
                url=url,
                content=await request.body(),
                headers=headers,
            )
        finally:
            # Always decrement, even if the backend request raised.
            self.backends[backend]["pending"] -= 1
# Wire the balancer into a FastAPI catch-all proxy application.
app = FastAPI()

GPU_BACKENDS = ["http://gpu1:8000", "http://gpu2:8000", "http://gpu3:8000"]
balancer = LLMLeastPendingBalancer(GPU_BACKENDS)


@app.api_route("/v1/{path:path}", methods=["GET", "POST"])
async def proxy(path: str, request: Request):
    """Forward every /v1/* request through the least-pending balancer."""
    return await balancer.forward(request)
Sticky sessions for context-heavy requests
If the LLM server uses KV-cache prefix reuse (e.g., a shared system prompt), it is useful to route requests with the same prefix to the same server:
def get_backend_by_prefix(prompt: str, backends: list[str]) -> str:
    """Map a prompt to a backend by hashing its prefix.

    Requests sharing the same leading 256 characters (typically the
    system prompt) land on the same backend, maximizing KV-cache hits.

    Args:
        prompt: Full prompt text; only the first 256 chars are hashed.
        backends: Non-empty list of backend base URLs.

    Returns:
        The backend URL chosen for this prefix.

    Raises:
        ValueError: if ``backends`` is empty.
    """
    # BUG FIX: hashlib was used but never imported in the original snippet.
    import hashlib

    # BUG FIX: an empty backend list previously crashed with ZeroDivisionError.
    if not backends:
        raise ValueError("backends must be non-empty")
    # MD5 is fine here: we need a stable, uniform hash, not security.
    prefix_hash = hashlib.md5(prompt[:256].encode()).hexdigest()
    # Same prefix -> same index -> same backend.
    # NOTE(review): plain modulo reshuffles most keys when the backend
    # list changes size; a hash ring would limit that churn.
    return backends[int(prefix_hash, 16) % len(backends)]
Load distribution monitoring
Key metrics: RPS distribution across backends (should be uniform), queue depth for each backend, error rate for each backend. Alert: one backend is receiving > 80% of traffic while other backends are healthy.







