# Source: Repository Reading Site
# File: 04-quantization-plan.yaml
# Path: ml-platform/examples/21-llm-serving/04-quantization-plan.yaml
---
plan:
  service: "qwen2.5-7b-k8s-chat-prod"
  baseline_runtime: "fp16"
  target_runtime: "awq-int4"
  objective:
    - "单机可容纳更多并发请求"
    - "为长上下文请求留出更多 KV cache 空间"
    - "在业务质量可接受前提下降低单 token 成本"
  candidates:
    - name: "fp16-baseline"
      precision: "fp16"
      expected_gain: "质量基线"
      risk: "显存占用高,吞吐空间有限"
    - name: "awq-int4"
      precision: "int4"
      expected_gain: "权重更小,吞吐和容量更友好"
      risk: "某些复杂推理任务可能退化"
    - name: "fp8-runtime"
      precision: "fp8"
      expected_gain: "可能在特定硬件上有更好速度"
      risk: "依赖硬件和引擎支持,验证成本更高"
  acceptance_gates:
    offline_eval:
      domain_accuracy_drop_max: 0.02
      hallucination_rate_increase_max: 0.01
    online_slo:
      ttft_p95_ms_max: 1800
      itl_p95_ms_max: 35
      error_rate_max: 0.02
    business_guardrail:
      user_abort_rate_increase_max: 0.03
  decision:
    selected: "awq-int4"
    reason:
      - "离线任务精度下降在阈值内"
      - "峰值并发下 TTFT 和 GPU 显存占用明显改善"
      - "当前引擎对 AWQ 方案支持成熟"