# Copy to configs/llm.local.yaml and edit. Used for the Phase-3/4 local
# transformers + LoRA path (LocalHFLogitsProvider). For an OpenAI-compatible
# API or local OpenAI-compat servers (vLLM, Ollama, LM Studio), use llm.yaml.

provider: local_hf
model: Qwen/Qwen3-8B

# Optional: path to a LoRA adapter trained by scripts/train_lora.py.
lora_adapter: null  # e.g. reports/training/v1/lora_final

# One of bf16 / fp16 / fp32. bf16 is the recommended default on A100.
dtype: bf16

# Set to "cuda" to put the whole model on one GPU, or "auto" to let HF
# accelerate build a device map across two A100 cards. For 8B + LoRA + bf16
# a single A100 40GB is enough.
device_map: auto

# First-token classification protocol: the tokens whose logits are read.
# The score is a softmax over (yes_token_logit, no_token_logit) at decode
# step 0. The values are quoted so they stay strings and are never coerced
# to booleans by YAML 1.1 parsers.
yes_tokens: ["Yes", " Yes", "YES"]
no_tokens: ["No", " No", "NO"]

# How many extra new tokens after the first to record (for prompt audit only;
# scoring does not depend on them).
trace_max_new_tokens: 4

# Used by NodeTextSummarizer / MetapathTextSummarizer (Phase 2).
# The summarizer uses the SAME backbone as the top-level `model` unless
# `summarizer.model` is set.
# NOTE(review): the nesting below was reconstructed from a flattened file;
# confirm that max_input_tokens belongs under `summarizer`.
summarizer:
  model: null  # null = reuse `model`
  b_node: 10
  b_meta: 10
  cache_dir: reports/cache/text_summary
  task_agnostic_prompt: "Summarize the text within {budget} tokens."
  max_input_tokens: 4096

# Embedder used by MarkovDiffusionTrimmer (Phase 2).
embedder:
  model: sentence-transformers/all-MiniLM-L6-v2
  device: cuda
  cache_dir: reports/cache/embeddings