# content-moderation/config.yaml

# Directory locations used by the project: data, generated output, splits,
# cache, and models.
# NOTE(review): the *_subdir entries are presumably resolved beneath data_dir,
# and relative paths against the project root — confirm against the loader.
paths:
  data_dir: 'data'
  generated_subdir: 'generated'
  splits_subdir: 'splits'
  cache_dir: 'cache/generated'
  models_dir: 'models'

# Settings for the Claude engine: which model to use, plus its
# max_concurrent and batch_size values.
# NOTE(review): 'haiku' is presumably an alias resolved by the client
# code — confirm.
claude_engine:
  model: 'haiku'
  max_concurrent: 10
  batch_size: 25

# Settings for the local engine, which is reached through the model-boss
# coordinator.
local_engine:
  # model-boss coordinator (manages GPU leases + model lifecycle)
  base_url: 'http://localhost:8210'
  # Model ID from the model-boss manifest
  model: 'ministral-14b-reasoning'
  max_concurrent: 1
  batch_size: 5
  temperature: 0.95
  # model-boss priority, one of: urgent | high | normal | low | batch
  priority: 'normal'
  # NOTE(review): presumably the categories handled by this engine — confirm.
  categories:
    - 'csam'
    - 'bestiality'
    - 'necrophilia'
    - 'snuff'
    - 'scat'
    - 'extreme_gore'
    - 'self_harm'

inference:
  # Set true to surface anti_trans and other optional categories.
  include_optional_categories: false

# Category definitions (severity, subtypes, overlaps, seeds) are not set
# here; they live in CATEGORY_SPECS (category_specs.py). To filter
# generation to specific categories, use the --categories CLI flag.
generation:
  positives_per_category: 550
  hard_negatives_per_category: 600
  innocuous_count: 3000

# Training caps: generated files grow indefinitely; these caps control
# how many examples per category are used in each training run.
# Exp 32/33 finding: tier-based downsampling of T5 categories (550→350) regressed
# T2/T3 category precision by removing safe-adult-content calibration examples.
# Reverted to flat caps with only empirically validated per-category overrides.
# Lookup order: per-category override > global default (0 = no cap).
training_caps:
  # Tier-based caps are disabled — a flat distribution is better calibrated.
  by_tier: {}
  # Per-category overrides (empirically validated across Exp 25-31)
  positives: {}
  hard_negatives:
    # Sweet spot: 750 causes precision collapse via harassment overlap.
    predatory_behavior: 400
    # 18 seeds; 600 is the validated sweet spot.
    harassment: 600
    # 22 seeds, all needed for the snuff/gore/medical boundary.
    extreme_gore: 700