# ml-platform/examples/21-llm-serving/00-inference-service-config.yaml
service:
  service_id: "qwen2.5-7b-k8s-chat-prod"
  display_name: "K8s Platform Q&A Assistant"
  model_version: "qwen2.5-7b-k8s-sft-lora-v1"
  service_version: "svc-2026-04-13-awq-canary"
  endpoint: "https://chat.k8s-lab.internal/v1"
  stage: "production"
model_source:
  runtime_package_uri: "s3://ml-assets/runtime/qwen2.5-7b-k8s-sft-v1-awq"
  runtime_manifest: "/models/runtime/runtime-manifest.yaml"
  tokenizer_source: "/models/runtime/tokenizer.json"
  chat_template_source: "/models/runtime/tokenizer_config.json"
engine:
  name: "vllm"
  served_model_name: "qwen2.5-7b-k8s-chat"
  tensor_parallel_size: 2
  pipeline_parallel_size: 1
  dtype: "float16"
  quantization: "awq"
  max_model_len: 8192
  max_num_seqs: 32
  max_num_batched_tokens: 16384
  gpu_memory_utilization: 0.92
  enable_prefix_caching: true
  swap_space_gb: 16
  scheduling_policy: "continuous_batching"
kv_cache:
  block_size: 16
  dtype: "fp16"
  eviction_policy: "request_end_release"
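
# A minimal sketch of how the engine/kv_cache fields above could map onto
# vLLM server flags; the CLI entrypoint and the local model path are
# assumptions for illustration, not part of this config:
#
#   vllm serve /models/runtime \
#     --served-model-name qwen2.5-7b-k8s-chat \
#     --tensor-parallel-size 2 --pipeline-parallel-size 1 \
#     --dtype float16 --quantization awq \
#     --max-model-len 8192 --max-num-seqs 32 \
#     --max-num-batched-tokens 16384 --gpu-memory-utilization 0.92 \
#     --enable-prefix-caching --swap-space 16 --block-size 16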
traffic:
  api_style: "openai-compatible"
  streaming_enabled: true
  request_timeout_ms: 45000
  max_concurrent_requests_per_pod: 128
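
# A hedged request sketch against the endpoint above; /v1/chat/completions is
# the standard OpenAI-compatible route vLLM exposes, and the question text is
# illustrative only:
#
#   curl -N https://chat.k8s-lab.internal/v1/chat/completions \
#     -H "Content-Type: application/json" \
#     -d '{"model": "qwen2.5-7b-k8s-chat",
#          "stream": true,
#          "messages": [{"role": "user",
#                        "content": "Why is my Pod stuck in CrashLoopBackOff?"}]}'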
resources:
  replicas: 2
  per_pod:
    cpu: "8"
    memory: "64Gi"
    nvidia.com/gpu: 2
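  # Consistency note: the per-pod GPU count must cover tensor_parallel_size x
  # pipeline_parallel_size (here 2 x 1 = 2), since each replica hosts one
  # full tensor/pipeline-parallel group.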
observability:
  metrics:
    - "ttft_ms_p95"
    - "itl_ms_p95"
    - "tokens_per_second"
    - "queue_depth"
    - "kv_cache_usage_ratio"
    - "gpu_memory_used_bytes"
  alerts:
    - "ttft_ms_p95 > 2000 for 10m"
    - "kv_cache_usage_ratio > 0.9 for 5m"
    - "request_error_rate > 0.02 for 5m"
notes:
  - "The runtime package is responsible for making the model loadable; the service config is responsible for making it runnable."
  - "The same model_version can go live under different service_versions with different inference parameters (see the sketch below)."