# ml-platform/examples/21-llm-serving/06-serving-deployment.yaml
apiVersion: apps/v1
kind: Deployment
metadata:
  name: qwen2.5-7b-k8s-chat
  namespace: llm-serving
spec:
  replicas: 2
  selector:
    matchLabels:
      app: qwen2.5-7b-k8s-chat
  template:
    metadata:
      labels:
        app: qwen2.5-7b-k8s-chat
    spec:
      nodeSelector:
        accelerator: nvidia-a100
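      # Scheduling assumptions (not defined in this example): GPU nodes are
      # labeled accelerator=nvidia-a100, and the NVIDIA device plugin is
      # installed so that nvidia.com/gpu below is a schedulable resource.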
      containers:
        - name: vllm-server
          image: registry.example.com/llm/vllm-openai:0.6.x-cu124
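          # The flags below configure vLLM's OpenAI-compatible server. Two
          # couplings to keep in mind: --quantization awq assumes the
          # checkpoint mounted at /models/runtime is already AWQ-quantized,
          # and --tensor-parallel-size must match the per-pod nvidia.com/gpu
          # request (here, 2 GPUs shard a single model replica).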
          args:
            - "--model"
            - "/models/runtime"
            - "--served-model-name"
            - "qwen2.5-7b-k8s-chat"
            - "--tensor-parallel-size"
            - "2"
            - "--max-model-len"
            - "8192"
            - "--max-num-seqs"
            - "32"
            - "--gpu-memory-utilization"
            - "0.92"
            - "--quantization"
            - "awq"
            - "--enable-prefix-caching"
            - "--port"
            - "8000"
          ports:
            - containerPort: 8000
              name: http
          resources:
            requests:
              cpu: "8"
              memory: "64Gi"
              nvidia.com/gpu: "2"
            limits:
              cpu: "8"
              memory: "64Gi"
              nvidia.com/gpu: "2"
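          # vLLM's server exposes /health once the engine is up; readiness
          # keeps the pod out of the Service until the model has loaded.
          # Loading a 7B checkpoint from a PVC can take longer than these
          # delays, so treat the values below as starting points to tune per
          # environment (a startupProbe is another option for slow loads).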
          readinessProbe:
            httpGet:
              path: /health
              port: http
            initialDelaySeconds: 20
            periodSeconds: 10
          livenessProbe:
            httpGet:
              path: /health
              port: http
            initialDelaySeconds: 30
            periodSeconds: 15
          volumeMounts:
            - name: runtime-package
              mountPath: /models/runtime
      volumes:
        - name: runtime-package
          persistentVolumeClaim:
            claimName: llm-runtime-package-pvc
---
apiVersion: v1
kind: Service
metadata:
  name: qwen2.5-7b-k8s-chat
  namespace: llm-serving
spec:
  selector:
    app: qwen2.5-7b-k8s-chat
  ports:
    - name: http
      port: 80
      targetPort: 8000
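# No `type` is set, so this defaults to a ClusterIP Service: the model is only
# reachable inside the cluster (or via port-forward / an ingress layer on top).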
# What this manifest is meant to highlight:
# - A production serving Deployment is about replicas, GPUs, probes, and mounts.
# - It is not a training artifact, and it is not a model-registry entry.
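
# A minimal smoke test after `kubectl apply -f` of this file (the local port
# 8080 is an arbitrary choice; service name and namespace come from above):
#
#   kubectl -n llm-serving port-forward svc/qwen2.5-7b-k8s-chat 8080:80
#   curl http://localhost:8080/v1/models
#   curl http://localhost:8080/v1/chat/completions \
#     -H "Content-Type: application/json" \
#     -d '{"model": "qwen2.5-7b-k8s-chat", "messages": [{"role": "user", "content": "ping"}]}'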