Repository Reading Site
06-dataset-card.yaml
ml-platform/examples/19-llm-data/06-dataset-card.yaml
# Dataset card for the llm-data-samples-v1 sample dataset.
name: llm-data-samples-v1
# Quoted so loaders keep the version a string instead of resolving it to a date.
version: "2026-04-10"
owner: yuanjun

purpose:
  - explain_data_shapes
  - teach_dataset_governance

# NOTE(review): values look like per-split record counts — confirm.
splits:
  raw: 4
  cleaned: 3
  sft: 3
  preference: 2
  eval: 3
  rag_docs: 3

sources:
  - public-web
  - internal-wiki
  - forum-export
  - code-repo
  - manual-labeling
  - expert-authored
  - human-ranking
  - security-policy

license_notes:
  - teaching_only
  - not_for_production_training

# NOTE(review): review_required is assumed to belong to pii_policy; the
# original indentation was lost — confirm against the upstream file.
pii_policy:
  raw_contains_pii: true
  cleaned_contains_pii: false
  review_required: true

quality_checks:
  - dedup
  - pii_masking
  - language_tagging
  - task_schema_validation
  - safety_review

# Values are string labels. "yes" is quoted because a bare yes is read as
# boolean true by YAML 1.1 loaders, which would not match the sibling
# "partial" strings.
lineage:
  raw_to_cleaned: "yes"
  cleaned_to_sft: partial
  cleaned_to_rag: partial