Deploying LLMs on Microsoft Azure
Azure provides Azure Machine Learning (AML) for managed ML deployment, Azure OpenAI Service for hosted OpenAI models (GPT-4o, GPT-4, GPT-3.5-turbo), Azure Container Apps for containerized deployment, and the NC/ND series of GPU-powered VMs. It integrates tightly with enterprise infrastructure (Microsoft Entra ID / Active Directory, Azure DevOps).
Azure Machine Learning Online Endpoints
AML endpoints provide managed deployment with autoscaling, monitoring, and traffic splitting:
from azure.ai.ml import MLClient
from azure.ai.ml.entities import (
ManagedOnlineEndpoint, ManagedOnlineDeployment,
Environment, CodeConfiguration
)
from azure.identity import DefaultAzureCredential
ml_client = MLClient(
credential=DefaultAzureCredential(),
subscription_id="xxx",
resource_group_name="rg-llm",
workspace_name="ws-llm-prod"
)
# Create the endpoint
endpoint = ManagedOnlineEndpoint(
name="llama3-8b-endpoint",
description="LLaMA-3-8B inference endpoint",
auth_mode="key",
)
ml_client.online_endpoints.begin_create_or_update(endpoint).result()
# Create the deployment
deployment = ManagedOnlineDeployment(
name="llama3-8b-v1",
endpoint_name="llama3-8b-endpoint",
environment=Environment(
image="mcr.microsoft.com/azureml/curated/acft-hf-nlp-gpu:latest",
conda_file="conda_env.yaml"
),
code_configuration=CodeConfiguration(
code="./inference_code/",
scoring_script="score.py"
),
instance_type="Standard_NC24ads_A100_v4", # 1x A100 80GB
instance_count=1,
    # Note: managed online deployments scale via instance_count; a
    # target-utilization policy (e.g. 1-8 instances at ~70% utilization,
    # 60 s polling interval, 300 s cooldown) is configured through Azure
    # Monitor autoscale rules on the deployment, not as an inline dict.
)
ml_client.online_deployments.begin_create_or_update(deployment).result()
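A new deployment receives no requests until traffic is routed to it; assigning percentages across several deployments is how blue/green rollouts and traffic splitting work. A minimal smoke test, where sample_request.json is a hypothetical local file matching the scoring contract shown below:
# Route 100% of endpoint traffic to the new deployment
endpoint.traffic = {"llama3-8b-v1": 100}
ml_client.online_endpoints.begin_create_or_update(endpoint).result()

# Invoke the endpoint with a local JSON request file
response = ml_client.online_endpoints.invoke(
    endpoint_name="llama3-8b-endpoint",
    deployment_name="llama3-8b-v1",
    request_file="sample_request.json",  # hypothetical sample payload
)
print(response)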
Scoring script
# inference_code/score.py
import os
import json
from vllm import LLM, SamplingParams
def init():
    """Runs once when the container starts: load the model onto the GPU."""
    global llm
    # AZUREML_MODEL_DIR points at the model assets registered with the deployment
    model_path = os.path.join(os.getenv("AZUREML_MODEL_DIR"), "llama3-8b")
    llm = LLM(
        model=model_path,
        max_model_len=8192,
        gpu_memory_utilization=0.90,
        tensor_parallel_size=1  # single A100 80GB, no tensor parallelism
    )

def run(raw_data: str) -> str:
    """Runs once per request: accepts a single prompt or a list of prompts."""
    data = json.loads(raw_data)
    inputs = data.get("inputs")
    prompts = inputs if isinstance(inputs, list) else [inputs]
    params_dict = data.get("parameters", {})
    sampling_params = SamplingParams(
        max_tokens=params_dict.get("max_new_tokens", 512),
        temperature=params_dict.get("temperature", 0.7),
        top_p=params_dict.get("top_p", 0.9),
    )
    outputs = llm.generate(prompts, sampling_params)
    results = [{"generated_text": o.outputs[0].text} for o in outputs]
    # Single-prompt requests get a single object back, batches get a list
    return json.dumps(results if len(results) > 1 else results[0])
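Clients reach the endpoint over plain HTTPS with the endpoint key. A minimal sketch, assuming the requests package is installed; the payload shape matches the run() contract above:
import requests

# Retrieve connection details from the workspace
endpoint = ml_client.online_endpoints.get("llama3-8b-endpoint")
keys = ml_client.online_endpoints.get_keys("llama3-8b-endpoint")

resp = requests.post(
    endpoint.scoring_uri,
    headers={
        "Authorization": f"Bearer {keys.primary_key}",
        "Content-Type": "application/json",
    },
    json={
        "inputs": "Explain tensor parallelism in one paragraph.",
        "parameters": {"max_new_tokens": 256, "temperature": 0.2},
    },
    timeout=120,
)
resp.raise_for_status()
print(resp.json())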
Azure OpenAI Service
Managed access to OpenAI models (GPT-4o, GPT-4, GPT-3.5-turbo) with an SLA and enterprise compliance guarantees:
import os

from openai import AzureOpenAI

client = AzureOpenAI(
    azure_endpoint="https://company-openai.openai.azure.com/",
    api_key=os.getenv("AZURE_OPENAI_KEY"),
    api_version="2024-02-01"
)
response = client.chat.completions.create(
model="gpt-4o", # deployment name в Azure OpenAI
messages=[
{"role": "system", "content": "You are a helpful assistant."},
{"role": "user", "content": "Explain Azure Machine Learning"}
],
max_tokens=1024,
temperature=0.7
)
# PTU (Provisioned Throughput Units) for predictable performance:
# capacity is purchased up front and guarantees a fixed TPM without throttling
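On standard pay-as-you-go deployments (without PTU), requests are throttled with HTTP 429 once the TPM quota is exhausted, so production clients typically retry with exponential backoff. A minimal sketch reusing the client from above:
import time

from openai import RateLimitError

def chat_with_backoff(messages, max_retries=5):
    """Retry on 429s with exponential backoff; PTU deployments rarely need this."""
    for attempt in range(max_retries):
        try:
            return client.chat.completions.create(
                model="gpt-4o", messages=messages, max_tokens=1024
            )
        except RateLimitError:
            time.sleep(2 ** attempt)  # wait 1s, 2s, 4s, ... between attempts
    raise RuntimeError("Exhausted retries against Azure OpenAI")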
Managed Compute for batch inference
from azure.ai.ml import Input, Output, command
from azure.ai.ml.constants import AssetTypes, InputOutputModes
batch_job = command(
inputs={
"input_data": Input(
type=AssetTypes.URI_FOLDER,
path="azureml://datastores/workspaceblobstore/paths/batch_input/",
mode=InputOutputModes.RO_MOUNT
)
},
outputs={
"output_data": Output(
type=AssetTypes.URI_FOLDER,
path="azureml://datastores/workspaceblobstore/paths/batch_output/",
mode=InputOutputModes.RW_MOUNT
)
},
code="./batch_inference/",
command="python run_batch.py --input-dir ${{inputs.input_data}} --output-dir ${{outputs.output_data}}",
environment="azureml:curated-acft-hf-nlp-gpu:latest",
compute="gpu-cluster",
    instance_count=4,  # 4 GPU VMs processing input shards in parallel
)
ml_client.jobs.create_or_update(batch_job)
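The job references a run_batch.py entry script in ./batch_inference/ that the section doesn't show. A minimal hypothetical sketch, assuming one prompt per line in *.txt input files, vLLM available in the environment, and MODEL_PATH as an assumed environment variable:
# batch_inference/run_batch.py (hypothetical sketch)
import argparse
import json
import os
from pathlib import Path

from vllm import LLM, SamplingParams

def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--input-dir", required=True)
    parser.add_argument("--output-dir", required=True)
    args = parser.parse_args()

    # MODEL_PATH is an assumption; point it at your model assets
    llm = LLM(model=os.getenv("MODEL_PATH", "meta-llama/Meta-Llama-3-8B-Instruct"))
    params = SamplingParams(max_tokens=512, temperature=0.7)

    # Process each input file and write one JSONL result file per input
    for input_file in Path(args.input_dir).glob("*.txt"):
        prompts = input_file.read_text().splitlines()
        outputs = llm.generate(prompts, params)
        out_path = Path(args.output_dir) / f"{input_file.stem}.jsonl"
        with out_path.open("w") as f:
            for prompt, out in zip(prompts, outputs):
                f.write(json.dumps({"prompt": prompt,
                                    "generated_text": out.outputs[0].text}) + "\n")

if __name__ == "__main__":
    main()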
Monitoring via Azure Monitor
AML online endpoints automatically send metrics to Azure Monitor: RequestsPerMinute, request latency, and, via the Model Data Collector, captured inputs/outputs for data-drift detection. They also integrate with Azure Application Insights for per-request tracing.
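The same metrics can be pulled programmatically, e.g. for dashboards or autoscale tuning, using the azure-monitor-query package. A minimal sketch; the endpoint's ARM resource ID is a placeholder to fill in, and the metric names follow those listed above:
from datetime import timedelta

from azure.identity import DefaultAzureCredential
from azure.monitor.query import MetricsQueryClient

# Full ARM ID of the online endpoint (placeholder - substitute your own)
ENDPOINT_RESOURCE_ID = (
    "/subscriptions/xxx/resourceGroups/rg-llm/providers/"
    "Microsoft.MachineLearningServices/workspaces/ws-llm-prod/"
    "onlineEndpoints/llama3-8b-endpoint"
)

metrics_client = MetricsQueryClient(DefaultAzureCredential())
result = metrics_client.query_resource(
    ENDPOINT_RESOURCE_ID,
    metric_names=["RequestsPerMinute", "RequestLatency"],
    timespan=timedelta(hours=1),
)
for metric in result.metrics:
    for ts in metric.timeseries:
        for point in ts.data:
            print(metric.name, point.timestamp, point.average)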
Private Endpoints and VNet Integration
For enterprise deployments, all services sit on a private virtual network and public access is blocked:
from azure.ai.ml.entities import IdentityConfiguration

# Endpoint with private network access only (reached via Private Link)
endpoint = ManagedOnlineEndpoint(
    name="llama3-private",
    public_network_access="Disabled",
    identity=IdentityConfiguration(type="system_assigned"),
)
ml_client.online_endpoints.begin_create_or_update(endpoint).result()