Repository Reading Site
trainer_state.json
ml-platform/examples/20-llm-training/02-checkpoint/checkpoint-000120/trainer_state.json
{
"best_metric": 1.82,
"best_model_checkpoint": "/mnt/models/runs/sft-qwen2.5-7b-k8s-001/checkpoint-000120",
"epoch": 0.40,
"global_step": 120,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"step": 10,
"loss": 2.91,
"learning_rate": 0.00006,
"epoch": 0.03
},
{
"step": 60,
"loss": 2.14,
"learning_rate": 0.00018,
"epoch": 0.20
},
{
"step": 120,
"eval_loss": 1.82,
"epoch": 0.40
}
],
"save_steps": 120,
"logging_steps": 10,
"train_runtime": 842.31,
"train_samples_per_second": 21.7
}