# Identifies the service being rolled out and the two releases involved.
# NOTE(review): the release-name suffixes look like YYYYMMDD date stamps -
# confirm the naming convention.
rollout:
  service: "qwen2.5-7b-k8s-chat-prod"
  # Baseline release; the rollback action in this file routes traffic back here.
  stable_release: "qwen2.5-7b-k8s-chat-prod-20260406"
  # Presumably the release that receives the staged traffic share - confirm.
  candidate_release: "qwen2.5-7b-k8s-chat-prod-20260413"

# Rollout stages, listed in increasing traffic share. Each entry carries a
# traffic percentage, a hold time in minutes, and gate expressions in pass_if.
# NOTE(review): presumably traffic_percent is the share routed to
# candidate_release, and every pass_if entry must hold for hold_minutes before
# the next stage starts - confirm against the tool that reads this file.
# NOTE(review): metric names suggest ttft = time to first token and
# itl = inter-token latency, both p95 in milliseconds - confirm.
stages:
  # Stage 1: gates on first-token latency, error rate, and user aborts.
  - traffic_percent: 10
    hold_minutes: 30
    pass_if:
      - "ttft_p95_ms < 1800"
      - "error_rate < 0.02"
      # "delta" is not defined in this file; presumably candidate minus stable.
      - "user_abort_rate delta < 0.03"
  # Stage 2: keeps the ttft gate, adds an itl gate and a business metric that
  # is compared against the stable release.
  # NOTE(review): error_rate is not gated from this stage onward - confirm
  # that this is intentional.
  - traffic_percent: 25
    hold_minutes: 60
    pass_if:
      - "ttft_p95_ms < 1800"
      - "itl_p95_ms < 35"
      - "business_success_rate >= stable - 0.01"
  # Stage 3: longest hold; gates only on queue depth and GPU memory.
  # NOTE(review): no latency, error, or business gate here - confirm intended.
  - traffic_percent: 50
    hold_minutes: 120
    pass_if:
      # Written as "queue_depth p95", unlike the "_p95_" infix used by the
      # latency metrics. NOTE(review): confirm the consumer accepts both forms.
      - "queue_depth p95 < 20"
      # "limit" is not defined in this file; presumably the GPU memory limit.
      - "gpu_memory_used_bytes < limit * 0.95"
  # Final stage: full traffic, zero hold time.
  - traffic_percent: 100
    hold_minutes: 0
    pass_if:
      # Free-text condition rather than a metric comparison.
      - "all previous gates passed"

# Conditions that abort the rollout, and the action taken when one fires.
# NOTE(review): presumably any single trigger is sufficient - confirm.
rollback:
  trigger_if:
    # Both thresholds are looser than the stage gates in this file
    # (error_rate < 0.02, ttft_p95_ms < 1800), and each carries a "for" window.
    - "error_rate >= 0.03 for 5m"
    - "ttft_p95_ms >= 2500 for 10m"
    # Unlike the two triggers above, this one has no "for <duration>" window.
    # NOTE(review): confirm whether a single sample should trigger rollback.
    - "business_success_rate < stable - 0.03"
  # Free-text action; refers to rollout.stable_release.
  action: "route all traffic back to stable_release"

# Free-form notes kept as data. English translations are given in the comments;
# the values themselves are left in their original language.
notes:
  # "A canary (gray) release is judged on service metrics, not only on offline
  # evaluation."
  - "灰度发布看的是服务指标，不只是离线评测。"
  # "The same model version run with different engine parameters should also be
  # governed as a distinct service version."
  - "同一个 model version 在不同引擎参数下，也应该被当作不同服务版本治理。"
