service:
  service_id: "qwen2.5-7b-k8s-chat-prod"
  display_name: "K8s 平台问答助手"
  model_version: "qwen2.5-7b-k8s-sft-lora-v1"
  service_version: "svc-2026-04-13-awq-canary"
  endpoint: "https://chat.k8s-lab.internal/v1"
  stage: "production"

model_source:
  runtime_package_uri: "s3://ml-assets/runtime/qwen2.5-7b-k8s-sft-v1-awq"
  runtime_manifest: "/models/runtime/runtime-manifest.yaml"
  tokenizer_source: "/models/runtime/tokenizer.json"
  chat_template_source: "/models/runtime/tokenizer_config.json"
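  # vLLM's OpenAI-compatible server picks up the chat template from
  # tokenizer_config.json when handling /v1/chat/completions, so the template
  # shipped in the runtime package is the one applied to every production prompt.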

engine:
  name: "vllm"
  served_model_name: "qwen2.5-7b-k8s-chat"
  tensor_parallel_size: 2
  pipeline_parallel_size: 1
  dtype: "float16"
  quantization: "awq"
  max_model_len: 8192
  max_num_seqs: 32
  max_num_batched_tokens: 16384
  gpu_memory_utilization: 0.92
  enable_prefix_caching: true
  swap_space_gb: 16
  scheduling_policy: "continuous_batching"
  kv_cache:
    block_size: 16
    dtype: "fp16"
    eviction_policy: "request_end_release"
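  # A minimal sketch of the `vllm serve` invocation these parameters map to;
  # the local model path is an assumption (the runtime package must be synced
  # from s3 first), and continuous batching is vLLM's default execution model
  # rather than a separate flag:
  #
  #   vllm serve /models/runtime \
  #     --served-model-name qwen2.5-7b-k8s-chat \
  #     --tensor-parallel-size 2 \
  #     --pipeline-parallel-size 1 \
  #     --dtype float16 \
  #     --quantization awq \
  #     --max-model-len 8192 \
  #     --max-num-seqs 32 \
  #     --max-num-batched-tokens 16384 \
  #     --gpu-memory-utilization 0.92 \
  #     --enable-prefix-caching \
  #     --swap-space 16 \
  #     --block-size 16 \
  #     --kv-cache-dtype auto   # "auto" keeps the model dtype, i.e. fp16 here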

traffic:
  api_style: "openai-compatible"
  streaming_enabled: true
  request_timeout_ms: 45000
  max_concurrent_requests_per_pod: 128
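  # A minimal sketch of a streaming request against the OpenAI-compatible API
  # (the Authorization header is an assumption; it depends on how the gateway
  # in front of this endpoint handles auth):
  #
  #   curl -N https://chat.k8s-lab.internal/v1/chat/completions \
  #     -H "Content-Type: application/json" \
  #     -H "Authorization: Bearer $API_KEY" \
  #     -d '{
  #           "model": "qwen2.5-7b-k8s-chat",
  #           "messages": [{"role": "user", "content": "How do I drain a node?"}],
  #           "stream": true
  #         }'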

resources:
  replicas: 2
  per_pod:
    cpu: "8"
    memory: "64Gi"
    nvidia.com/gpu: 2
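  # Rough KV-cache budget, assuming Qwen2.5-7B's published architecture
  # (28 layers, 4 KV heads under GQA, head_dim 128; worth re-verifying):
  #   per token: 2 (K+V) * 28 layers * 4 heads * 128 dim * 2 B (fp16) = 56 KiB
  #   one full 8192-token sequence: 8192 * 56 KiB ≈ 448 MiB of KV cache,
  #   sharded across the 2 GPUs by tensor parallelism.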

observability:
  metrics:
    - "ttft_ms_p95"
    - "itl_ms_p95"
    - "tokens_per_second"
    - "queue_depth"
    - "kv_cache_usage_ratio"
    - "gpu_memory_used_bytes"
  alerts:
    - "ttft_ms_p95 > 2000 for 10m"
    - "kv_cache_usage_ratio > 0.9 for 5m"
    - "request_error_rate > 0.02 for 5m"

notes:
  - "runtime package 负责可加载，service config 负责可运行。"
  - "同一个 model_version 可以在不同 service_version 下用不同推理参数上线。"
