Initial commit: ER-TP-DGP research prototype

Event-Reified Temporal Provenance Dual-Granularity Prompting for LLM-based APT detection on DARPA provenance datasets. Includes phase 0-14 method spec, IR/graph/metapath/trimming/prompt modules, scripts for THEIA candidate universe, landmark CSG construction, hybrid prompting, and LLM inference. Excludes data/, reports/, and local LLM config from version control.
2026-05-15 16:53:57 +08:00
commit b86ae87b75
88 changed files with 18570 additions and 0 deletions
--- a/configs/llm.example.yaml
+++ b/configs/llm.example.yaml
@@ -0,0 +1,25 @@
+# Copy this file to configs/llm.yaml and edit local values.
+# Do not commit real API keys.
+
+provider: local  # local or api
+base_url: http://localhost:8000/v1
+model: your-local-model
+
+# For remote API, prefer api_key_env instead of api_key.
+api_key_env: OPENAI_COMPAT_API_KEY
+# api_key: null
+
+timeout_seconds: 120
+temperature: 0.0
+max_tokens: 512
+# top_p: 1.0
+
+# Some self-hosted gateways sit behind WAF/CDN rules that reject Python's
+# default User-Agent signature. The preferred fix is a server-side allow
+# rule for this client. If that is not possible, the browser-like
+# User-Agent below can help, but only against basic User-Agent
+# filtering.
+user_agent: Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0 Safari/537.36
+extra_headers: {}
+
+extra_body: {}
--- a/configs/llm.local.example.yaml
+++ b/configs/llm.local.example.yaml
@@ -0,0 +1,41 @@
+# Copy to configs/llm.local.yaml and edit. Used for the Phase-3/4 local
+# transformers + LoRA path (LocalHFLogitsProvider). For an OpenAI-compatible
+# API or local OpenAI-compat server (vLLM, Ollama, LM Studio), use llm.yaml.
+
+provider: local_hf
+model: Qwen/Qwen3-8B
+# Optional: path to a LoRA adapter trained by scripts/train_lora.py
+lora_adapter: null  # e.g. reports/training/v1/lora_final
+
+# bf16 / fp16 / fp32. bf16 is the recommended default on A100.
+dtype: bf16
+
+# Set to "cuda" to put the whole model on GPU; "auto" lets HF accelerate
+# build a device map across the available GPUs (e.g. two A100 cards). For
+# 8B + LoRA + bf16 a single A100 40GB is enough.
+device_map: auto
+
+# First-token classification protocol. Tokens to read logits for.
+# The score is softmax over (yes_token_logit, no_token_logit) at decode step 0.
+yes_tokens: ["Yes", " Yes", "YES"]
+no_tokens: ["No", " No", "NO"]
+
+# How many extra new tokens after the first to record (for prompt audit only;
+# scoring does not depend on them).
+trace_max_new_tokens: 4
+
+# Used by NodeTextSummarizer / MetapathTextSummarizer (Phase 2).
+# The summarizer uses the SAME backbone unless summarizer.model is set.
+summarizer:
+  model: null              # null = reuse `model`
+  b_node: 10
+  b_meta: 10
+  cache_dir: reports/cache/text_summary
+  task_agnostic_prompt: "Summarize the text within {budget} tokens."
+  max_input_tokens: 4096
+
+# Embedder used by MarkovDiffusionTrimmer (Phase 2).
+embedder:
+  model: sentence-transformers/all-MiniLM-L6-v2
+  device: cuda
+  cache_dir: reports/cache/embeddings