# Source: Repository Reading Site
# File: 04-quantization-plan.yaml
# Path: ml-platform/examples/21-llm-serving/04-quantization-plan.yaml
---
plan:
  service: "qwen2.5-7b-k8s-chat-prod"
  baseline_runtime: "fp16"
  target_runtime: "awq-int4"
  objective:
    - "单机可容纳更多并发请求"
    - "为长上下文请求留出更多 KV cache 空间"
    - "在业务质量可接受前提下降低单 token 成本"
  candidates:
    - name: "fp16-baseline"
      precision: "fp16"
      expected_gain: "质量基线"
      risk: "显存占用高,吞吐空间有限"
    - name: "awq-int4"
      precision: "int4"
      expected_gain: "权重更小,吞吐和容量更友好"
      risk: "某些复杂推理任务可能退化"
    - name: "fp8-runtime"
      precision: "fp8"
      expected_gain: "可能在特定硬件上有更好速度"
      risk: "依赖硬件和引擎支持,验证成本更高"
  acceptance_gates:
    offline_eval:
      domain_accuracy_drop_max: 0.02
      hallucination_rate_increase_max: 0.01
    online_slo:
      ttft_p95_ms_max: 1800
      itl_p95_ms_max: 35
      error_rate_max: 0.02
    business_guardrail:
      user_abort_rate_increase_max: 0.03
  decision:
    selected: "awq-int4"
    reason:
      - "离线任务精度下降在阈值内"
      - "峰值并发下 TTFT 和 GPU 显存占用明显改善"
      - "当前引擎对 AWQ 方案支持成熟"