Deploying LLM on AWS
AWS offers several services for LLM deployment: Amazon SageMaker (managed ML service), EC2 G/P instances (GPU VMs), and Amazon Bedrock (managed LLM API). The choice depends on control requirements, cost, and operational overhead.
Amazon SageMaker Endpoints
SageMaker is a managed service: autoscaling, A/B testing, and monitoring out of the box. Pay for compute, and AWS manages the infrastructure.
import os

import boto3
import json
from sagemaker.huggingface import HuggingFaceModel

# Deploy Llama-3-8B via the SageMaker Hugging Face TGI (Large Model Inference)
# container. The `hub` dict becomes container environment variables that
# configure the TGI server running inside the endpoint.
hub = {
    # Canonical gated repo id on the Hugging Face Hub (the original
    # "meta-llama/Llama-3-8b-instruct" does not exist and deploy would fail).
    "HF_MODEL_ID": "meta-llama/Meta-Llama-3-8B-Instruct",
    # Never hard-code access tokens in source; the gated-model token comes
    # from the environment (raises KeyError early if it is missing).
    "HF_TOKEN": os.environ["HF_TOKEN"],
    "SM_NUM_GPUS": "1",  # tensor-parallel degree = number of GPUs on the instance
    "MAX_INPUT_LENGTH": "4096",
    "MAX_TOTAL_TOKENS": "8192",  # prompt + generated tokens
    "MAX_BATCH_PREFILL_TOKENS": "32768",
    "HF_MODEL_QUANTIZE": "bitsandbytes",  # 4-bit quantization to fit 24GB VRAM
}

model = HuggingFaceModel(
    # Region-specific TGI inference image published by AWS.
    image_uri="763104351884.dkr.ecr.us-east-1.amazonaws.com/huggingface-pytorch-tgi-inference:2.1.1-tgi1.4.2-gpu-py310-cu121-ubuntu22.04",
    env=hub,
    role="arn:aws:iam::123456789012:role/SageMakerRole",  # SageMaker execution role
)

predictor = model.deploy(
    initial_instance_count=1,
    instance_type="ml.g5.2xlarge",  # 1x A10G 24GB
    endpoint_name="llama3-8b-prod",
    # Model download + weight loading is slow; give the container up to 15 min
    # before the health check fails the deployment.
    container_startup_health_check_timeout=900,
)
SageMaker Endpoint Invocation
import boto3
import json

# Module-level client so the HTTP connection pool is reused across calls.
runtime = boto3.client("sagemaker-runtime", region_name="us-east-1")

# Endpoint created by the deployment step above; kept in one place.
DEFAULT_ENDPOINT = "llama3-8b-prod"


def invoke_llm(prompt: str, max_tokens: int = 512,
               endpoint_name: str = DEFAULT_ENDPOINT) -> str:
    """Invoke the TGI SageMaker endpoint and return the generated text.

    Args:
        prompt: Raw prompt string sent to the model.
        max_tokens: Upper bound on newly generated tokens.
        endpoint_name: SageMaker endpoint to call (defaults to the
            endpoint deployed above).

    Returns:
        The model's generated text.

    Raises:
        KeyError: If the response payload has no "generated_text" field.
        botocore.exceptions.ClientError: On endpoint or permission errors.
    """
    payload = {
        "inputs": prompt,
        "parameters": {
            "max_new_tokens": max_tokens,
            "temperature": 0.7,
            "top_p": 0.9,
            "repetition_penalty": 1.1,
        },
    }
    response = runtime.invoke_endpoint(
        EndpointName=endpoint_name,
        ContentType="application/json",
        Body=json.dumps(payload),
    )
    body = json.loads(response["Body"].read())
    # TGI containers return either {"generated_text": ...} or a one-element
    # list of such dicts depending on container version — accept both
    # (indexing the list form with a string key would raise TypeError).
    if isinstance(body, list):
        body = body[0]
    return body["generated_text"]
SageMaker Autoscaling
client = boto3.client("application-autoscaling", region_name="us-east-1")

# Register the endpoint's production variant as a scalable target (1-10 instances).
client.register_scalable_target(
    ServiceNamespace="sagemaker",
    ResourceId="endpoint/llama3-8b-prod/variant/AllTraffic",
    ScalableDimension="sagemaker:variant:DesiredInstanceCount",
    MinCapacity=1,
    MaxCapacity=10,
)

# Target-tracking policy on invocations per instance.
client.put_scaling_policy(
    PolicyName="gpu-invocations-scaling",
    ServiceNamespace="sagemaker",
    ResourceId="endpoint/llama3-8b-prod/variant/AllTraffic",
    ScalableDimension="sagemaker:variant:DesiredInstanceCount",
    PolicyType="TargetTrackingScaling",
    TargetTrackingScalingPolicyConfiguration={
        "TargetValue": 5.0,  # target: 5 invocations per instance per minute
        # Use the AWS predefined per-variant metric. A custom
        # AWS/SageMaker InvocationsPerInstance spec would also need the
        # VariantName dimension to resolve correctly; the predefined type
        # handles both dimensions for us.
        "PredefinedMetricSpecification": {
            "PredefinedMetricType": "SageMakerVariantInvocationsPerInstance",
        },
        "ScaleInCooldown": 300,  # conservative scale-in to avoid thrashing
        "ScaleOutCooldown": 60,  # aggressive scale-out for traffic spikes
    },
)
EC2 G5 / P4 deployment
For complete control, use EC2 GPU instances with a Deep Learning AMI:
# Launch a g5.2xlarge (1x A10G 24GB) from the Deep Learning AMI (GPU, PyTorch 2.x).
# NOTE: the AMI description must live on its own line — a "# comment" after a
# line-continuation backslash breaks the continuation and truncates the command.
aws ec2 run-instances \
  --image-id ami-0abcdef1234567890 \
  --instance-type g5.2xlarge \
  --key-name my-key \
  --security-group-ids sg-xxx \
  --subnet-id subnet-xxx \
  --iam-instance-profile Name=EC2LLMRole \
  --block-device-mappings '[{"DeviceName":"/dev/sda1","Ebs":{"VolumeSize":200,"VolumeType":"gp3"}}]' \
  --user-data file://startup.sh
Spot Instances for batch inference
Spot Instances: up to 70% savings, but with the risk of interruption. Suitable for batch inference (queue processing), not for synchronous APIs.
# SageMaker managed spot training: up to ~70% cheaper; checkpoints in S3 let
# an interrupted job resume instead of restarting from scratch.
from sagemaker.estimator import Estimator

estimator = Estimator(
    image_uri="...",
    # `role` is a required Estimator argument; reuse the execution role
    # from the deployment snippet for consistency.
    role="arn:aws:iam::123456789012:role/SageMakerRole",
    instance_type="ml.p3.2xlarge",
    instance_count=1,
    use_spot_instances=True,
    max_wait=3600,  # max total wait incl. time spent waiting for spot capacity; must be >= max_run
    max_run=1800,   # max training runtime in seconds
    checkpoint_s3_uri="s3://bucket/checkpoints/",  # enables resume after spot interruption
)
Amazon Bedrock for managed LLM
If you don't need to self-host models, Bedrock provides Claude, Llama, and Mistral as managed APIs:
# Bedrock runtime client — fully managed model APIs, no endpoints to operate.
bedrock = boto3.client("bedrock-runtime", region_name="us-east-1")

# Request body in the Anthropic Messages format that Bedrock requires
# for Claude models.
request_body = {
    "anthropic_version": "bedrock-2023-05-31",
    "max_tokens": 1024,
    "messages": [{"role": "user", "content": "Hello"}],
}

response = bedrock.invoke_model(
    modelId="anthropic.claude-3-5-sonnet-20241022-v2:0",
    contentType="application/json",
    accept="application/json",
    body=json.dumps(request_body),
)
Bedrock: no infrastructure costs — you pay per token. Suitable for intermittent load.







