Initial commit: ER-TP-DGP research prototype

Event-Reified Temporal Provenance Dual-Granularity Prompting for LLM-based APT detection on DARPA provenance datasets. Includes phase 0-14 method spec, IR/graph/metapath/trimming/prompt modules, scripts for THEIA candidate universe, landmark CSG construction, hybrid prompting, and LLM inference. Excludes data/, reports/, and local LLM config from version control.
2026-05-15 16:53:57 +08:00
commit b86ae87b75
88 changed files with 18570 additions and 0 deletions
--- a/scripts/anchor_coverage_audit.py
+++ b/scripts/anchor_coverage_audit.py
@@ -0,0 +1,310 @@
+#!/usr/bin/env python3
+"""Quantify the gap between oracle (GT-derived) and end-to-end anchor selection.
+
+For each ground-truth-malicious process target, compare:
+  - oracle anchor: the anchor recorded in the oracle labeled-targets JSONL
+    (from `import_orthrus_ground_truth.py` or the GT-event-match builder).
+    The oracle path picks anchors using ground-truth event matches and so
+    cannot be deployed in production.
+  - end-to-end anchor: produced from raw-log weak signals only, using
+    ``select_anchor_for_candidate`` over the candidate universe. This is
+    deployable.
+
+Outputs per-subject rows and an aggregate report:
+  - end-to-end anchor recall under a fixed lookback/lookahead window:
+    fraction of GT-malicious subjects for which the end-to-end window
+    [t_e2e - L, t_e2e + L] contains the oracle anchor's timestamp (a
+    proxy for "would the LLM see at least one ground-truth attack
+    event in its window?")
+  - delta_seconds distribution: |t_oracle - t_e2e|
+  - reasons for fallback / failure (no weak signal recorded, no events
+    indexed, etc.)
+
+The number this audit publishes is the ceiling on end-to-end LLM recall
+for the current weak-signal anchor strategy.
+"""
+
+from __future__ import annotations
+
+import argparse
+import json
+import statistics
+import sys
+from pathlib import Path
+from typing import Any
+
+# Allow running as a standalone script without `pip install -e .`.
+SRC = Path(__file__).resolve().parent.parent / "src"
+if str(SRC) not in sys.path:
+    sys.path.insert(0, str(SRC))
+
+from er_tp_dgp.candidate_universe import select_anchor_for_candidate  # noqa: E402
+
+
+def main() -> int:
+    parser = argparse.ArgumentParser(
+        description=(
+            "Compare oracle (GT-derived) vs end-to-end (weak-signal) anchor "
+            "selection. Reports the recall ceiling for the deployable anchor."
+        )
+    )
+    parser.add_argument(
+        "--oracle-targets",
+        required=True,
+        help=(
+            "Path to oracle labeled_targets JSONL (e.g., the orthrus output). "
+            "Must contain target_id, anchor_event_id, anchor_timestamp_nanos, label."
+        ),
+    )
+    parser.add_argument(
+        "--candidate-universe",
+        required=True,
+        help="Path to candidate-universe JSONL (with weak_signal_events field).",
+    )
+    parser.add_argument(
+        "--anchor-strategy",
+        default="first_weak_signal",
+        choices=("first_weak_signal", "first_event"),
+    )
+    parser.add_argument(
+        "--lookback-seconds",
+        type=float,
+        default=300.0,
+        help="Window half-width used to score 'oracle event inside e2e window'.",
+    )
+    parser.add_argument(
+        "--lookahead-seconds",
+        type=float,
+        default=300.0,
+    )
+    parser.add_argument(
+        "--out-jsonl",
+        required=True,
+        help="Per-subject comparison rows.",
+    )
+    parser.add_argument(
+        "--out-markdown",
+        required=True,
+        help="Aggregate audit report.",
+    )
+    args = parser.parse_args()
+
+    oracle_rows = _read_jsonl(args.oracle_targets)
+    universe_index = _index_universe(args.candidate_universe)
+
+    rows: list[dict[str, Any]] = []
+    for oracle in oracle_rows:
+        if oracle.get("label") != "malicious":
+            continue
+        target_id = oracle.get("target_id")
+        if not target_id:
+            continue
+        oracle_ts = oracle.get("anchor_timestamp_nanos")
+        oracle_event_id = oracle.get("anchor_event_id")
+        if not isinstance(oracle_ts, int) or not oracle_event_id:
+            continue
+
+        profile_row = universe_index.get(target_id)
+        if profile_row is None:
+            rows.append(
+                {
+                    "target_id": target_id,
+                    "in_candidate_universe": False,
+                    "oracle_anchor_event_id": oracle_event_id,
+                    "oracle_anchor_timestamp_nanos": oracle_ts,
+                    "e2e_anchor_event_id": None,
+                    "e2e_anchor_timestamp_nanos": None,
+                    "delta_seconds": None,
+                    "oracle_inside_e2e_window": False,
+                    "fallback_used": None,
+                    "anchor_strategy": args.anchor_strategy,
+                    "reason": "candidate_not_in_universe",
+                    "atom_id": oracle.get("atom_id"),
+                    "process_path": oracle.get("process_path"),
+                }
+            )
+            continue
+
+        anchor = select_anchor_for_candidate(profile_row, strategy=args.anchor_strategy)
+        e2e_event_id = anchor.anchor_event_id
+        e2e_ts = anchor.anchor_timestamp_nanos
+        delta_seconds: float | None = None
+        inside = False
+        if isinstance(e2e_ts, int):
+            delta_ns = oracle_ts - e2e_ts
+            delta_seconds = delta_ns / 1_000_000_000
+            window_start = e2e_ts - int(args.lookback_seconds * 1_000_000_000)
+            window_end = e2e_ts + int(args.lookahead_seconds * 1_000_000_000)
+            inside = window_start <= oracle_ts <= window_end
+
+        rows.append(
+            {
+                "target_id": target_id,
+                "in_candidate_universe": True,
+                "oracle_anchor_event_id": oracle_event_id,
+                "oracle_anchor_timestamp_nanos": oracle_ts,
+                "e2e_anchor_event_id": e2e_event_id,
+                "e2e_anchor_timestamp_nanos": e2e_ts,
+                "delta_seconds": delta_seconds,
+                "oracle_inside_e2e_window": inside,
+                "fallback_used": anchor.fallback_used,
+                "anchor_strategy": anchor.strategy,
+                "triggering_signals": list(anchor.triggering_signals),
+                "weak_signal_events_count": len(profile_row.get("weak_signal_events") or []),
+                "weak_signal_events_truncated": bool(profile_row.get("weak_signal_events_truncated")),
+                "reason": anchor.reason,
+                "atom_id": oracle.get("atom_id"),
+                "process_path": oracle.get("process_path"),
+                "weak_signal_score": profile_row.get("weak_signal_score"),
+            }
+        )
+
+    Path(args.out_jsonl).parent.mkdir(parents=True, exist_ok=True)
+    with Path(args.out_jsonl).open("w", encoding="utf-8") as out:
+        for row in rows:
+            out.write(json.dumps(row, ensure_ascii=False, sort_keys=True) + "\n")
+
+    Path(args.out_markdown).parent.mkdir(parents=True, exist_ok=True)
+    Path(args.out_markdown).write_text(
+        _render_markdown(rows, args), encoding="utf-8"
+    )
+
+    summary = _summarize(rows)
+    print(
+        "[anchor-coverage] subjects={total} in_universe={in_u} "
+        "anchor_inside_window={inside} fallback={fb} no_anchor={no_anchor}".format(
+            total=summary["total"],
+            in_u=summary["in_universe"],
+            inside=summary["inside_window"],
+            fb=summary["fallback_used"],
+            no_anchor=summary["no_e2e_anchor"],
+        )
+    )
+    return 0
+
+
+def _index_universe(path: str) -> dict[str, dict[str, Any]]:
+    index: dict[str, dict[str, Any]] = {}
+    with Path(path).open("r", encoding="utf-8") as handle:
+        for line in handle:
+            if not line.strip():
+                continue
+            row = json.loads(line)
+            cid = row.get("candidate_id") or row.get("target_id")
+            if cid:
+                index[str(cid)] = row
+    return index
+
+
+def _read_jsonl(path: str) -> list[dict[str, Any]]:
+    rows: list[dict[str, Any]] = []
+    with Path(path).open("r", encoding="utf-8") as handle:
+        for line in handle:
+            if line.strip():
+                rows.append(json.loads(line))
+    return rows
+
+
+def _summarize(rows: list[dict[str, Any]]) -> dict[str, Any]:
+    total = len(rows)
+    in_universe = sum(1 for r in rows if r["in_candidate_universe"])
+    inside_window = sum(1 for r in rows if r.get("oracle_inside_e2e_window"))
+    fallback_used = sum(1 for r in rows if r.get("fallback_used"))
+    no_e2e_anchor = sum(1 for r in rows if r.get("e2e_anchor_event_id") is None)
+    deltas = [r["delta_seconds"] for r in rows if isinstance(r.get("delta_seconds"), (int, float))]
+    abs_deltas = [abs(d) for d in deltas]
+    summary = {
+        "total": total,
+        "in_universe": in_universe,
+        "inside_window": inside_window,
+        "fallback_used": fallback_used,
+        "no_e2e_anchor": no_e2e_anchor,
+        "anchor_recall_at_window": (inside_window / total) if total else None,
+        "abs_delta_seconds_median": statistics.median(abs_deltas) if abs_deltas else None,
+        "abs_delta_seconds_p90": _percentile(abs_deltas, 0.9) if abs_deltas else None,
+        "abs_delta_seconds_p99": _percentile(abs_deltas, 0.99) if abs_deltas else None,
+        "abs_delta_seconds_max": max(abs_deltas) if abs_deltas else None,
+    }
+    return summary
+
+
+def _percentile(values: list[float], q: float) -> float:
+    if not values:
+        return float("nan")
+    ordered = sorted(values)
+    k = max(0, min(len(ordered) - 1, int(round(q * (len(ordered) - 1)))))
+    return ordered[k]
+
+
+def _render_markdown(rows: list[dict[str, Any]], args: argparse.Namespace) -> str:
+    summary = _summarize(rows)
+    lines = [
+        "# Anchor Coverage Audit",
+        "",
+        "This audit measures the deployable anchor strategy against the GT-derived",
+        "oracle anchor. The headline number is `anchor_recall_at_window` — the",
+        "ceiling on end-to-end LLM recall under the chosen anchor strategy and",
+        "lookback/lookahead window.",
+        "",
+        f"- oracle_targets: `{args.oracle_targets}`",
+        f"- candidate_universe: `{args.candidate_universe}`",
+        f"- anchor_strategy: `{args.anchor_strategy}`",
+        f"- lookback_seconds: {args.lookback_seconds}",
+        f"- lookahead_seconds: {args.lookahead_seconds}",
+        "",
+        "## Aggregate",
+        "",
+        f"- ground_truth_positive_subjects: {summary['total']}",
+        f"- in_candidate_universe: {summary['in_universe']}",
+        f"- end_to_end_anchor_resolved: {summary['total'] - summary['no_e2e_anchor']}",
+        f"- end_to_end_anchor_used_fallback: {summary['fallback_used']}",
+        f"- oracle_anchor_inside_e2e_window: {summary['inside_window']}",
+        (
+            "- **anchor_recall_at_window**: "
+            f"{summary['anchor_recall_at_window']:.3f}"
+            if summary["anchor_recall_at_window"] is not None
+            else "- anchor_recall_at_window: n/a"
+        ),
+        "",
+        "## |delta_seconds| distribution (oracle_ts - e2e_ts)",
+        "",
+        f"- median: {summary['abs_delta_seconds_median']}",
+        f"- p90:    {summary['abs_delta_seconds_p90']}",
+        f"- p99:    {summary['abs_delta_seconds_p99']}",
+        f"- max:    {summary['abs_delta_seconds_max']}",
+        "",
+        "## Failure breakdown",
+        "",
+    ]
+    failures = [r for r in rows if not r.get("oracle_inside_e2e_window")]
+    if not failures:
+        lines.append("- (none)")
+    else:
+        reasons: dict[str, int] = {}
+        for r in failures:
+            key = r.get("reason") or "unknown"
+            reasons[key] = reasons.get(key, 0) + 1
+        for reason, count in sorted(reasons.items(), key=lambda kv: -kv[1]):
+            lines.append(f"- {reason}: {count}")
+    lines.extend(
+        [
+            "",
+            "## Interpretation",
+            "",
+            "- If `anchor_recall_at_window` is well below 1.0, the anchor strategy",
+            "  is the bottleneck — even a perfect LLM cannot exceed this number.",
+            "  Either widen the window, switch to multi-anchor lifecycle tiling",
+            "  (`select_anchors_for_lifecycle`), or expand the weak-signal set.",
+            "- If `fallback_used` is high, many candidates have no weak-signal",
+            "  trigger at all; consider whether they should be filtered out of",
+            "  the candidate universe or treated as low-priority.",
+            "- The oracle column shows what the GT-coupled pipeline was",
+            "  effectively assuming — any AUPRC delta between GT-anchored runs",
+            "  and end-to-end runs lower-bounds the oracle leakage.",
+        ]
+    )
+    return "\n".join(lines) + "\n"
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
--- a/scripts/build_hybrid_community_prompts.py
+++ b/scripts/build_hybrid_community_prompts.py
@@ -0,0 +1,348 @@
+#!/usr/bin/env python3
+"""Render one hybrid (community + v0.1 fine-grained) prompt per landmark community.
+
+Hybrid pipeline:
+  1) read landmark communities from Phase 14 output;
+  2) re-stream the raw THEIA corpus once and demux each event into
+     per-community fine-grained subgraphs (community_to_subgraph);
+  3) on each subgraph, run v0.1 APT metapath extraction +
+     temporal-security-aware trimming;
+  4) compose a layered prompt: community overview + landmark skeleton
+     + landmark bridges + per-metapath blocks (DGP path summary +
+     numerical aggregate + APT stats + evidence path ids).
+
+Writes:
+  - prompts/<community_id>.txt
+  - prompt_metadata.jsonl  — one row per prompt with label + community
+    summary + subgraph stats (entity / event counts, truncation flag,
+    metapath hits). Labels are attached to metadata only, never enter
+    the prompt body.
+"""
+
+from __future__ import annotations
+
+import argparse
+import json
+import sys
+from pathlib import Path
+
+SRC = Path(__file__).resolve().parent.parent / "src"
+if str(SRC) not in sys.path:
+    sys.path.insert(0, str(SRC))
+
+from er_tp_dgp.community_to_subgraph import build_community_subgraphs  # noqa: E402
+from er_tp_dgp.hybrid_prompt import (  # noqa: E402
+    HybridCommunityPromptBuilder,
+    HybridPromptSwitches,
+)
+from er_tp_dgp.landmark import (  # noqa: E402
+    LandmarkEdge,
+    LandmarkEvent,
+    read_communities_jsonl,
+)
+from er_tp_dgp.theia import discover_theia_json_files  # noqa: E402
+
+
+def _stream_filter_landmarks(
+    path: Path, allowed_ids: set[str]
+) -> dict[str, LandmarkEvent]:
+    """Stream-read landmarks.jsonl and keep only rows whose event_id is in allowed_ids.
+
+    The landmarks file is multi-GB on real datasets — a full ``read_landmarks_jsonl``
+    eats hundreds of GB of RAM and minutes of wall time. We need only the
+    landmarks referenced by the selected communities.
+    """
+    out: dict[str, LandmarkEvent] = {}
+    if not allowed_ids:
+        return out
+    needed = set(allowed_ids)
+    with path.open("r", encoding="utf-8") as handle:
+        for line in handle:
+            if not line.strip():
+                continue
+            r = json.loads(line)
+            event_id = r.get("event_id")
+            if event_id not in needed:
+                continue
+            out[event_id] = LandmarkEvent(
+                event_id=event_id,
+                timestamp_nanos=r["timestamp_nanos"],
+                host_id=r.get("host_id"),
+                actor_subject_id=r["actor_subject_id"],
+                actor_path=r.get("actor_path"),
+                object_id=r.get("object_id"),
+                object_type=r.get("object_type"),
+                object_summary=r.get("object_summary"),
+                canonical_action=r["canonical_action"],
+                raw_event_type=r["raw_event_type"],
+                signals=tuple(r.get("signals") or ()),
+                metapath_hints=tuple(r.get("metapath_hints") or ()),
+                landmark_classes=tuple(r.get("landmark_classes") or ()),
+            )
+            if len(out) == len(needed):
+                break
+    return out
+
+
+def _stream_filter_edges(
+    path: Path, allowed_ids: set[str]
+) -> dict[str, LandmarkEdge]:
+    """Stream-read landmark_edges.jsonl with allowed_ids filter."""
+    out: dict[str, LandmarkEdge] = {}
+    if not allowed_ids:
+        return out
+    needed = set(allowed_ids)
+    with path.open("r", encoding="utf-8") as handle:
+        for line in handle:
+            if not line.strip():
+                continue
+            r = json.loads(line)
+            edge_id = r.get("edge_id")
+            if edge_id not in needed:
+                continue
+            out[edge_id] = LandmarkEdge(
+                edge_id=edge_id,
+                src_event_id=r["src_event_id"],
+                dst_event_id=r["dst_event_id"],
+                host_id=r.get("host_id"),
+                delta_nanos=r["delta_nanos"],
+                bridge_hops=r["bridge_hops"],
+                bridge_summary=r["bridge_summary"],
+            )
+            if len(out) == len(needed):
+                break
+    return out
+
+
+def main() -> int:
+    parser = argparse.ArgumentParser(description=__doc__)
+    parser.add_argument("--communities", required=True)
+    parser.add_argument("--landmarks", required=True)
+    parser.add_argument("--landmark-edges", required=True)
+    parser.add_argument(
+        "--labeled-communities",
+        default=None,
+        help="Optional. Adds label/atom_id to prompt_metadata.jsonl, never the prompt body.",
+    )
+    parser.add_argument("--data-dir", default="data/raw/e3_theia_json")
+    parser.add_argument("--input-file", action="append", default=None)
+    parser.add_argument("--output-dir", required=True)
+    parser.add_argument("--margin-seconds", type=float, default=60.0)
+    parser.add_argument("--max-events-per-community", type=int, default=5000)
+    parser.add_argument("--max-landmarks-in-prompt", type=int, default=60)
+    parser.add_argument("--max-edges-in-prompt", type=int, default=80)
+    parser.add_argument("--top-m-per-metapath", type=int, default=5)
+    parser.add_argument("--max-prompts", type=int, default=None)
+    parser.add_argument("--progress-every", type=int, default=2_000_000)
+    parser.add_argument("--max-lines", type=int, default=None)
+    parser.add_argument("--max-lines-per-file", type=int, default=None)
+    parser.add_argument(
+        "--include-only",
+        choices=("all", "malicious", "balanced"),
+        default="balanced",
+        help=(
+            "Which communities to render. 'malicious' = only GT-malicious, "
+            "'balanced' = all malicious + a random benign sample (--benign-per-malicious)."
+        ),
+    )
+    parser.add_argument(
+        "--benign-per-malicious",
+        type=int,
+        default=24,
+        help="When --include-only=balanced, sample this many benign per malicious.",
+    )
+    parser.add_argument("--seed", type=int, default=7)
+    args = parser.parse_args()
+
+    paths = (
+        [Path(p) for p in args.input_file]
+        if args.input_file
+        else discover_theia_json_files(args.data_dir)
+    )
+    if not paths:
+        raise SystemExit(f"No THEIA JSON files found at {args.data_dir}")
+
+    print("[hybrid] reading communities...", flush=True)
+    communities = read_communities_jsonl(args.communities)
+    print(f"[hybrid] communities loaded: {len(communities)}", flush=True)
+
+    label_index: dict[str, dict] = {}
+    if args.labeled_communities:
+        with Path(args.labeled_communities).open("r", encoding="utf-8") as handle:
+            for line in handle:
+                if not line.strip():
+                    continue
+                row = json.loads(line)
+                label_index[row["community_id"]] = row
+
+    # --- selection ---------------------------------------------------- #
+    if args.include_only != "all":
+        if not label_index:
+            raise SystemExit("--include-only != all requires --labeled-communities")
+        if args.include_only == "malicious":
+            communities = [
+                c for c in communities
+                if label_index.get(c.community_id, {}).get("label") == "malicious"
+            ]
+        elif args.include_only == "balanced":
+            import random
+
+            rng = random.Random(args.seed)
+            mal = [
+                c for c in communities
+                if label_index.get(c.community_id, {}).get("label") == "malicious"
+            ]
+            ben = [
+                c for c in communities
+                if label_index.get(c.community_id, {}).get("label") == "benign"
+            ]
+            rng.shuffle(ben)
+            target_ben = max(1, args.benign_per_malicious * max(1, len(mal)))
+            communities = mal + ben[:target_ben]
+            communities.sort(
+                key=lambda c: (-len(c.landmark_event_ids), c.start_timestamp_nanos, c.community_id)
+            )
+
+    if args.max_prompts is not None:
+        communities = communities[: args.max_prompts]
+
+    print(
+        f"[hybrid] selected {len(communities)} communities "
+        f"({sum(1 for c in communities if label_index.get(c.community_id, {}).get('label') == 'malicious')} malicious)",
+        flush=True,
+    )
+
+    # --- stream-filtered loads of landmarks + edges ------------------- #
+    needed_landmark_ids: set[str] = set()
+    needed_edge_ids: set[str] = set()
+    for c in communities:
+        needed_landmark_ids.update(c.landmark_event_ids)
+        needed_edge_ids.update(c.edge_ids)
+    print(
+        f"[hybrid] need {len(needed_landmark_ids)} landmark rows / "
+        f"{len(needed_edge_ids)} edge rows from disk",
+        flush=True,
+    )
+    print("[hybrid] stream-loading landmarks...", flush=True)
+    landmarks_by_id = _stream_filter_landmarks(Path(args.landmarks), needed_landmark_ids)
+    print(f"[hybrid] landmarks loaded: {len(landmarks_by_id)}", flush=True)
+    print("[hybrid] stream-loading edges...", flush=True)
+    edges_by_id = _stream_filter_edges(Path(args.landmark_edges), needed_edge_ids)
+    print(f"[hybrid] edges loaded: {len(edges_by_id)}", flush=True)
+
+    # --- materialize fine-grained subgraphs (single THEIA pass) ------- #
+    print(f"[hybrid] streaming THEIA from {len(paths)} files to build subgraphs...", flush=True)
+    subgraphs = build_community_subgraphs(
+        communities,
+        paths,
+        margin_seconds=args.margin_seconds,
+        max_events_per_community=args.max_events_per_community,
+        max_lines=args.max_lines,
+        max_lines_per_file=args.max_lines_per_file,
+        progress_every=args.progress_every,
+    )
+    truncated = sum(1 for s in subgraphs.values() if s.truncated)
+    total_events = sum(len(s.events) for s in subgraphs.values())
+    total_entities = sum(len(s.entities) for s in subgraphs.values())
+    print(
+        f"[hybrid] subgraphs ready: communities={len(subgraphs)} "
+        f"truncated={truncated} total_events={total_events} total_entities={total_entities}",
+        flush=True,
+    )
+
+    # --- build hybrid prompts ----------------------------------------- #
+    output_dir = Path(args.output_dir)
+    prompts_dir = output_dir / "prompts"
+    prompts_dir.mkdir(parents=True, exist_ok=True)
+    metadata_path = output_dir / "prompt_metadata.jsonl"
+
+    builder = HybridCommunityPromptBuilder(
+        landmarks_by_id=landmarks_by_id,
+        edges_by_id=edges_by_id,
+        # No NodeText / PathSumm summarizers — keeps the experiment
+        # cost-bounded and removes a confounder. Switches below disable them.
+        node_summarizer=None,
+        path_summarizer=None,
+        switches=HybridPromptSwitches(
+            use_text_summarization=False,
+            use_path_summarization_llm=False,
+            use_numerical_aggregation_dgp=True,
+            use_apt_numerical_stats=True,
+            include_evidence_ids=True,
+            include_landmark_skeleton=True,
+            include_landmark_bridges=True,
+            max_landmarks_in_prompt=args.max_landmarks_in_prompt,
+            max_edges_in_prompt=args.max_edges_in_prompt,
+            top_m_per_metapath=args.top_m_per_metapath,
+        ),
+    )
+
+    written = 0
+    with metadata_path.open("w", encoding="utf-8") as meta_out:
+        for community in communities:
+            sub = subgraphs.get(community.community_id)
+            if sub is None:
+                # Stream filter produced nothing for this community — emit
+                # a stub prompt with empty metapath blocks rather than
+                # silently dropping it (we want to count this in metrics).
+                continue
+            bundle = builder.build(community, sub)
+            (prompts_dir / f"{community.community_id}.txt").write_text(
+                bundle.prompt_text, encoding="utf-8"
+            )
+            label_row = label_index.get(community.community_id) or {}
+            meta_out.write(
+                json.dumps(
+                    {
+                        "community_id": community.community_id,
+                        "host_id": community.host_id,
+                        "label": label_row.get("label", "unlabeled"),
+                        "label_source": label_row.get(
+                            "label_source", "no_ground_truth_join"
+                        ),
+                        "gt_atoms_hit": label_row.get("gt_atoms_hit") or [],
+                        "gt_subjects_hit": label_row.get("gt_subjects_hit") or [],
+                        "span_seconds": community.span_seconds,
+                        "subjects_in_community": len(community.subjects),
+                        "num_landmarks_total": len(community.landmark_event_ids),
+                        "num_landmarks_in_prompt": bundle.metadata[
+                            "num_landmarks_in_prompt"
+                        ],
+                        "num_edges_total": len(community.edge_ids),
+                        "num_edges_in_prompt": bundle.metadata["num_edges_in_prompt"],
+                        "subgraph_entities_count": bundle.metadata[
+                            "subgraph_entities_count"
+                        ],
+                        "subgraph_events_count": bundle.metadata[
+                            "subgraph_events_count"
+                        ],
+                        "subgraph_truncated": bundle.metadata["subgraph_truncated"],
+                        "metapath_paths_extracted": bundle.metadata[
+                            "metapath_paths_extracted"
+                        ],
+                        "metapath_paths_after_trim": bundle.metadata[
+                            "metapath_paths_after_trim"
+                        ],
+                        "selected_landmark_ids": list(bundle.selected_landmark_ids),
+                        "evidence_path_ids": list(bundle.evidence_path_ids),
+                        "prompt_path": str(
+                            (prompts_dir / f"{community.community_id}.txt").resolve()
+                        ),
+                        "prompt_char_length": len(bundle.prompt_text),
+                    },
+                    ensure_ascii=False,
+                    sort_keys=True,
+                )
+                + "\n"
+            )
+            written += 1
+    print(
+        f"[hybrid] wrote {written} prompts to {prompts_dir} "
+        f"and metadata to {metadata_path}",
+        flush=True,
+    )
+    return 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
--- a/scripts/build_hybrid_labeled_targets.py
+++ b/scripts/build_hybrid_labeled_targets.py
@@ -0,0 +1,59 @@
+#!/usr/bin/env python3
+"""Convert hybrid prompt_metadata.jsonl → labeled_targets.jsonl for run_evaluation.py.
+
+Hybrid prompts use ``community_id`` as the prompt id; the v0.1 evaluator
+expects ``target_id``. This script does the rename and emits a minimal
+labeled_targets.jsonl with the fields the evaluator needs.
+"""
+
+from __future__ import annotations
+
+import argparse
+import json
+from pathlib import Path
+
+
+def main() -> int:
+    parser = argparse.ArgumentParser(description=__doc__)
+    parser.add_argument("--prompt-metadata", required=True)
+    parser.add_argument("--output", required=True)
+    args = parser.parse_args()
+
+    out = Path(args.output)
+    out.parent.mkdir(parents=True, exist_ok=True)
+    written = 0
+    with Path(args.prompt_metadata).open("r", encoding="utf-8") as inp, out.open(
+        "w", encoding="utf-8"
+    ) as outf:
+        for line in inp:
+            line = line.strip()
+            if not line:
+                continue
+            row = json.loads(line)
+            label = row.get("label", "unlabeled")
+            if label not in {"malicious", "benign"}:
+                continue  # skip unlabeled communities
+            payload = {
+                "target_id": row["community_id"],
+                "target_type": "COMMUNITY_SUBGRAPH",
+                "label": label,
+                "label_confidence": "high" if label == "malicious" else "default",
+                "label_source": row.get("label_source", "no_ground_truth_join"),
+                "anchor_event_id": row.get("selected_landmark_ids", [""])[0]
+                if row.get("selected_landmark_ids")
+                else "",
+                "host_id": row.get("host_id"),
+                "span_seconds": row.get("span_seconds"),
+                "subjects_in_community": row.get("subjects_in_community"),
+                "num_landmarks_total": row.get("num_landmarks_total"),
+                "subgraph_events_count": row.get("subgraph_events_count"),
+                "gt_atoms_hit": row.get("gt_atoms_hit") or [],
+            }
+            outf.write(json.dumps(payload, ensure_ascii=False, sort_keys=True) + "\n")
+            written += 1
+    print(f"wrote {written} labeled targets to {out}")
+    return 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
--- a/scripts/build_labeled_eval_batch.py
+++ b/scripts/build_labeled_eval_batch.py
@@ -0,0 +1,62 @@
+#!/usr/bin/env python3
+"""Build a protocol-based labeled target batch for prompt generation."""
+
+from __future__ import annotations
+
+import argparse
+from pathlib import Path
+
+from er_tp_dgp.evaluation_batch import build_evaluation_batch
+
+
+def main() -> None:
+    parser = argparse.ArgumentParser(
+        description="Build labeled target metadata. Labels remain evaluation-only and are not prompt input."
+    )
+    parser.add_argument(
+        "--positive-process-labels",
+        default="reports/ground_truth/e3_mapping_ioc_files_time/process_labels_high_plus.jsonl",
+    )
+    parser.add_argument(
+        "--positive-event-matches",
+        default="reports/ground_truth/e3_mapping_ioc_files_time/event_matches_high_plus.jsonl",
+    )
+    parser.add_argument(
+        "--all-mapped-process-labels",
+        default="reports/ground_truth/e3_mapping_ioc_files_time/process_labels.jsonl",
+    )
+    parser.add_argument(
+        "--candidate-universe",
+        default="reports/theia_candidate_universe_ioc_files/candidate_universe.jsonl",
+    )
+    parser.add_argument("--output-dir", default="reports/evaluation/e3_theia_v0_1")
+    parser.add_argument("--num-positives", type=int, default=8)
+    parser.add_argument("--num-hard-negative-proxies", type=int, default=8)
+    parser.add_argument("--max-hard-negative-events", type=int, default=1000)
+    parser.add_argument("--seed", type=int, default=7)
+    args = parser.parse_args()
+
+    batch = build_evaluation_batch(
+        positive_process_labels_path=args.positive_process_labels,
+        positive_event_matches_path=args.positive_event_matches,
+        candidate_universe_path=args.candidate_universe,
+        all_mapped_process_labels_path=args.all_mapped_process_labels,
+        num_positives=args.num_positives,
+        num_hard_negative_proxies=args.num_hard_negative_proxies,
+        max_hard_negative_events=args.max_hard_negative_events,
+        seed=args.seed,
+    )
+
+    output_dir = Path(args.output_dir)
+    output_dir.mkdir(parents=True, exist_ok=True)
+    targets_path = output_dir / "labeled_targets.jsonl"
+    report_path = output_dir / "labeled_targets.md"
+    batch.write_jsonl(targets_path)
+    report_path.write_text(batch.to_markdown() + "\n", encoding="utf-8")
+    print(f"targets={len(batch.targets)}")
+    print(f"wrote {targets_path}")
+    print(f"wrote {report_path}")
+
+
+if __name__ == "__main__":
+    main()
--- a/scripts/build_landmark_graph.py
+++ b/scripts/build_landmark_graph.py
@@ -0,0 +1,184 @@
+#!/usr/bin/env python3
+"""Stream the THEIA corpus once and emit the Landmark-Bridged Causal Story Graph.
+
+Outputs:
+  - landmarks.jsonl       — one row per landmark event
+  - landmark_edges.jsonl  — one row per landmark→landmark causal bridge
+  - landmark_communities.jsonl — one row per detection unit (subgraph)
+  - landmark_stats.json   — corpus-level counts and class histogram
+
+This script is the construction phase of Phase 14. Detection (per-community
+LLM prompting) is a separate step (`build_landmark_prompts.py`).
+"""
+
+from __future__ import annotations
+
+import argparse
+import json
+import sys
+from pathlib import Path
+
+SRC = Path(__file__).resolve().parent.parent / "src"
+if str(SRC) not in sys.path:
+    sys.path.insert(0, str(SRC))
+
+from er_tp_dgp.landmark import (  # noqa: E402
+    StreamingLandmarkGraphBuilder,
+    compute_landmark_communities,
+    write_communities_jsonl,
+    write_edges_jsonl,
+    write_landmarks_jsonl,
+)
+from er_tp_dgp.theia import discover_theia_json_files, iter_theia_records  # noqa: E402
+
+
+def main() -> int:
+    parser = argparse.ArgumentParser(description=__doc__)
+    parser.add_argument("--data-dir", default="data/raw/e3_theia_json")
+    parser.add_argument("--input-file", action="append", default=None)
+    parser.add_argument("--output-dir", default="reports/landmark_csg")
+    parser.add_argument("--progress-every", type=int, default=1_000_000)
+    parser.add_argument(
+        "--k-ancestors",
+        type=int,
+        default=8,
+        help="Per-entity ancestor cache size. Bigger = denser landmark edges.",
+    )
+    parser.add_argument(
+        "--max-bridge-seconds",
+        type=float,
+        default=600.0,
+        help="Drop ancestor→landmark edges whose time delta exceeds this.",
+    )
+    parser.add_argument(
+        "--max-edges-per-landmark-in",
+        type=int,
+        default=16,
+        help="Cap inbound edges per landmark to keep the graph sparse.",
+    )
+    parser.add_argument(
+        "--silence-split-seconds",
+        type=float,
+        default=300.0,
+        help="Inside a connected component, split on landmark gaps wider than this.",
+    )
+    parser.add_argument(
+        "--min-community-landmarks",
+        type=int,
+        default=2,
+        help="Drop communities smaller than this (singletons are not stories).",
+    )
+    parser.add_argument("--max-lines", type=int, default=None)
+    parser.add_argument("--max-lines-per-file", type=int, default=None)
+    args = parser.parse_args()
+
+    paths = (
+        [Path(p) for p in args.input_file]
+        if args.input_file
+        else discover_theia_json_files(args.data_dir)
+    )
+    if not paths:
+        raise SystemExit(f"no THEIA JSON files found under {args.data_dir}")
+
+    output_dir = Path(args.output_dir)
+    output_dir.mkdir(parents=True, exist_ok=True)
+
+    print(
+        f"[start] paths={len(paths)} k_ancestors={args.k_ancestors} "
+        f"max_bridge_seconds={args.max_bridge_seconds} "
+        f"max_edges_per_landmark_in={args.max_edges_per_landmark_in}",
+        flush=True,
+    )
+
+    builder = StreamingLandmarkGraphBuilder(
+        k_ancestors_per_entity=args.k_ancestors,
+        max_bridge_nanos=int(args.max_bridge_seconds * 1_000_000_000),
+        max_edges_per_landmark_in=args.max_edges_per_landmark_in,
+    )
+    builder.feed_iterable(
+        iter_theia_records(
+            paths,
+            max_lines=args.max_lines,
+            max_lines_per_file=args.max_lines_per_file,
+        ),
+        progress_every=args.progress_every,
+    )
+    landmarks, edges, stats = builder.finalize()
+
+    print(
+        f"[built] records={stats.records_seen} events={stats.events_seen} "
+        f"landmarks={stats.landmarks} edges={stats.edges} "
+        f"edges_skipped_time={stats.edges_skipped_time} "
+        f"edges_skipped_self={stats.edges_skipped_self}",
+        flush=True,
+    )
+
+    print("[community] computing weakly connected components + temporal split", flush=True)
+    communities = compute_landmark_communities(
+        landmarks,
+        edges,
+        min_landmarks=args.min_community_landmarks,
+        silence_split_seconds=args.silence_split_seconds,
+    )
+    print(f"[community] {len(communities)} communities produced", flush=True)
+
+    landmarks_path = output_dir / "landmarks.jsonl"
+    edges_path = output_dir / "landmark_edges.jsonl"
+    communities_path = output_dir / "landmark_communities.jsonl"
+    stats_path = output_dir / "landmark_stats.json"
+
+    write_landmarks_jsonl(landmarks, landmarks_path)
+    write_edges_jsonl(edges, edges_path)
+    write_communities_jsonl(communities, communities_path)
+
+    summary = {
+        "records_seen": stats.records_seen,
+        "events_seen": stats.events_seen,
+        "landmarks": stats.landmarks,
+        "edges": stats.edges,
+        "edges_skipped_time": stats.edges_skipped_time,
+        "edges_skipped_self": stats.edges_skipped_self,
+        "landmarks_by_class": dict(stats.landmarks_by_class),
+        "communities": len(communities),
+        "community_size_min": min((len(c.landmark_event_ids) for c in communities), default=0),
+        "community_size_max": max((len(c.landmark_event_ids) for c in communities), default=0),
+        "community_size_p50": _percentile(
+            [len(c.landmark_event_ids) for c in communities], 0.5
+        ),
+        "community_size_p90": _percentile(
+            [len(c.landmark_event_ids) for c in communities], 0.9
+        ),
+        "community_size_p99": _percentile(
+            [len(c.landmark_event_ids) for c in communities], 0.99
+        ),
+        "config": {
+            "k_ancestors": args.k_ancestors,
+            "max_bridge_seconds": args.max_bridge_seconds,
+            "max_edges_per_landmark_in": args.max_edges_per_landmark_in,
+            "silence_split_seconds": args.silence_split_seconds,
+            "min_community_landmarks": args.min_community_landmarks,
+        },
+        "files": {
+            "landmarks": str(landmarks_path),
+            "landmark_edges": str(edges_path),
+            "landmark_communities": str(communities_path),
+        },
+    }
+    stats_path.write_text(json.dumps(summary, indent=2, sort_keys=True), encoding="utf-8")
+    print(f"[write] {landmarks_path}", flush=True)
+    print(f"[write] {edges_path}", flush=True)
+    print(f"[write] {communities_path}", flush=True)
+    print(f"[write] {stats_path}", flush=True)
+    return 0
+
+
+def _percentile(values: list[int], q: float) -> int | None:
+    if not values:
+        return None
+    ordered = sorted(values)
+    k = max(0, min(len(ordered) - 1, int(round(q * (len(ordered) - 1)))))
+    return ordered[k]
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
--- a/scripts/build_landmark_prompts.py
+++ b/scripts/build_landmark_prompts.py
@@ -0,0 +1,156 @@
+#!/usr/bin/env python3
+"""Render one LLM prompt per landmark community.
+
+Reads:
+  - communities (output of build_landmark_graph.py)
+  - landmarks (output of build_landmark_graph.py)
+  - landmark_edges (output of build_landmark_graph.py)
+  - labeled_communities (output of evaluate_landmark_detection.py) — labels
+    are *only* attached to per-prompt metadata for downstream evaluation;
+    they never enter the prompt body.
+
+Writes:
+  - prompts/<community_id>.txt
+  - prompt_metadata.jsonl    — one row per prompt with label + community
+    summary, suitable for downstream LLM-runner + AUPRC computation.
+"""
+
+from __future__ import annotations
+
+import argparse
+import json
+import sys
+from pathlib import Path
+
+SRC = Path(__file__).resolve().parent.parent / "src"
+if str(SRC) not in sys.path:
+    sys.path.insert(0, str(SRC))
+
+from er_tp_dgp.landmark import (  # noqa: E402
+    read_communities_jsonl,
+    read_edges_jsonl,
+    read_landmarks_jsonl,
+)
+from er_tp_dgp.landmark_prompt import (  # noqa: E402
+    CommunityPromptSwitches,
+    LandmarkCommunityPromptBuilder,
+)
+
+
+def main() -> int:
+    parser = argparse.ArgumentParser(description=__doc__)
+    parser.add_argument("--communities", required=True)
+    parser.add_argument("--landmarks", required=True)
+    parser.add_argument("--landmark-edges", required=True)
+    parser.add_argument(
+        "--labeled-communities",
+        default=None,
+        help="Optional. Adds label/atom_id to prompt_metadata.jsonl, never the prompt body.",
+    )
+    parser.add_argument("--output-dir", required=True)
+    parser.add_argument("--max-landmarks-in-prompt", type=int, default=60)
+    parser.add_argument("--max-edges-in-prompt", type=int, default=80)
+    parser.add_argument("--max-prompts", type=int, default=None)
+    parser.add_argument(
+        "--include-only",
+        choices=("all", "malicious", "balanced"),
+        default="all",
+        help=(
+            "Which communities to render. 'malicious' = only GT-malicious, "
+            "'balanced' = all malicious + an equal-sized random benign sample."
+        ),
+    )
+    parser.add_argument("--seed", type=int, default=7)
+    args = parser.parse_args()
+
+    communities = read_communities_jsonl(args.communities)
+    landmarks = read_landmarks_jsonl(args.landmarks)
+    edges = read_edges_jsonl(args.landmark_edges)
+    landmarks_by_id = {lm.event_id: lm for lm in landmarks}
+    edges_by_id = {edge.edge_id: edge for edge in edges}
+
+    label_index: dict[str, dict] = {}
+    if args.labeled_communities:
+        with Path(args.labeled_communities).open("r", encoding="utf-8") as handle:
+            for line in handle:
+                if not line.strip():
+                    continue
+                row = json.loads(line)
+                label_index[row["community_id"]] = row
+
+    if args.include_only != "all":
+        if not label_index:
+            raise SystemExit(
+                "--include-only != all requires --labeled-communities"
+            )
+        if args.include_only == "malicious":
+            communities = [c for c in communities if label_index.get(c.community_id, {}).get("label") == "malicious"]
+        elif args.include_only == "balanced":
+            import random
+
+            rng = random.Random(args.seed)
+            mal = [c for c in communities if label_index.get(c.community_id, {}).get("label") == "malicious"]
+            ben = [c for c in communities if label_index.get(c.community_id, {}).get("label") == "benign"]
+            rng.shuffle(ben)
+            communities = mal + ben[: len(mal)]
+            communities.sort(
+                key=lambda c: (-len(c.landmark_event_ids), c.start_timestamp_nanos, c.community_id)
+            )
+
+    if args.max_prompts is not None:
+        communities = communities[: args.max_prompts]
+
+    output_dir = Path(args.output_dir)
+    prompts_dir = output_dir / "prompts"
+    prompts_dir.mkdir(parents=True, exist_ok=True)
+    metadata_path = output_dir / "prompt_metadata.jsonl"
+
+    builder = LandmarkCommunityPromptBuilder(
+        landmarks_by_id=landmarks_by_id,
+        edges_by_id=edges_by_id,
+        switches=CommunityPromptSwitches(
+            max_landmarks_in_prompt=args.max_landmarks_in_prompt,
+            max_edges_in_prompt=args.max_edges_in_prompt,
+        ),
+    )
+
+    with metadata_path.open("w", encoding="utf-8") as meta_out:
+        for community in communities:
+            bundle = builder.build(community)
+            (prompts_dir / f"{community.community_id}.txt").write_text(
+                bundle.prompt_text, encoding="utf-8"
+            )
+            label_row = label_index.get(community.community_id) or {}
+            meta_out.write(
+                json.dumps(
+                    {
+                        "community_id": community.community_id,
+                        "host_id": community.host_id,
+                        "label": label_row.get("label", "unlabeled"),
+                        "label_source": label_row.get("label_source", "no_ground_truth_join"),
+                        "gt_atoms_hit": label_row.get("gt_atoms_hit") or [],
+                        "gt_subjects_hit": label_row.get("gt_subjects_hit") or [],
+                        "num_landmarks_total": len(community.landmark_event_ids),
+                        "num_landmarks_in_prompt": bundle.metadata["num_landmarks_in_prompt"],
+                        "num_edges_total": len(community.edge_ids),
+                        "num_edges_in_prompt": bundle.metadata["num_edges_in_prompt"],
+                        "span_seconds": community.span_seconds,
+                        "subjects_in_community": len(community.subjects),
+                        "selected_landmark_ids": list(bundle.selected_landmark_ids),
+                        "prompt_path": str((prompts_dir / f"{community.community_id}.txt").resolve()),
+                    },
+                    ensure_ascii=False,
+                    sort_keys=True,
+                )
+                + "\n"
+            )
+    print(
+        f"[prompts] wrote {len(communities)} prompts to {prompts_dir} "
+        f"and metadata to {metadata_path}",
+        flush=True,
+    )
+    return 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
--- a/scripts/build_landmark_prompts_for_ids.py
+++ b/scripts/build_landmark_prompts_for_ids.py
@@ -0,0 +1,204 @@
+#!/usr/bin/env python3
+"""Render Phase 14 raw landmark prompts for a specific list of community IDs.
+
+For head-to-head comparison with the hybrid pipeline: feed in the same
+community_ids the hybrid pipeline rendered, get a parallel set of raw
+landmark-only prompts on the same set.
+"""
+
+from __future__ import annotations
+
+import argparse
+import json
+import sys
+from pathlib import Path
+
+SRC = Path(__file__).resolve().parent.parent / "src"
+if str(SRC) not in sys.path:
+    sys.path.insert(0, str(SRC))
+
+from er_tp_dgp.landmark import (  # noqa: E402
+    LandmarkEdge,
+    LandmarkEvent,
+    read_communities_jsonl,
+)
+from er_tp_dgp.landmark_prompt import (  # noqa: E402
+    CommunityPromptSwitches,
+    LandmarkCommunityPromptBuilder,
+)
+
+
+def _stream_filter_landmarks(path: Path, allowed_ids: set[str]) -> dict[str, LandmarkEvent]:
+    out: dict[str, LandmarkEvent] = {}
+    if not allowed_ids:
+        return out
+    needed = set(allowed_ids)
+    with path.open("r", encoding="utf-8") as handle:
+        for line in handle:
+            if not line.strip():
+                continue
+            r = json.loads(line)
+            event_id = r.get("event_id")
+            if event_id not in needed:
+                continue
+            out[event_id] = LandmarkEvent(
+                event_id=event_id,
+                timestamp_nanos=r["timestamp_nanos"],
+                host_id=r.get("host_id"),
+                actor_subject_id=r["actor_subject_id"],
+                actor_path=r.get("actor_path"),
+                object_id=r.get("object_id"),
+                object_type=r.get("object_type"),
+                object_summary=r.get("object_summary"),
+                canonical_action=r["canonical_action"],
+                raw_event_type=r["raw_event_type"],
+                signals=tuple(r.get("signals") or ()),
+                metapath_hints=tuple(r.get("metapath_hints") or ()),
+                landmark_classes=tuple(r.get("landmark_classes") or ()),
+            )
+            if len(out) == len(needed):
+                break
+    return out
+
+
+def _stream_filter_edges(path: Path, allowed_ids: set[str]) -> dict[str, LandmarkEdge]:
+    out: dict[str, LandmarkEdge] = {}
+    if not allowed_ids:
+        return out
+    needed = set(allowed_ids)
+    with path.open("r", encoding="utf-8") as handle:
+        for line in handle:
+            if not line.strip():
+                continue
+            r = json.loads(line)
+            edge_id = r.get("edge_id")
+            if edge_id not in needed:
+                continue
+            out[edge_id] = LandmarkEdge(
+                edge_id=edge_id,
+                src_event_id=r["src_event_id"],
+                dst_event_id=r["dst_event_id"],
+                host_id=r.get("host_id"),
+                delta_nanos=r["delta_nanos"],
+                bridge_hops=r["bridge_hops"],
+                bridge_summary=r["bridge_summary"],
+            )
+            if len(out) == len(needed):
+                break
+    return out
+
+
+def main() -> int:
+    parser = argparse.ArgumentParser(description=__doc__)
+    parser.add_argument("--communities", required=True)
+    parser.add_argument("--landmarks", required=True)
+    parser.add_argument("--landmark-edges", required=True)
+    parser.add_argument(
+        "--ids-from-metadata",
+        required=True,
+        help="prompt_metadata.jsonl from the hybrid run; we replicate its community_id set.",
+    )
+    parser.add_argument("--labeled-communities", default=None)
+    parser.add_argument("--output-dir", required=True)
+    parser.add_argument("--max-landmarks-in-prompt", type=int, default=60)
+    parser.add_argument("--max-edges-in-prompt", type=int, default=80)
+    args = parser.parse_args()
+
+    target_ids: set[str] = set()
+    with Path(args.ids_from_metadata).open("r", encoding="utf-8") as handle:
+        for line in handle:
+            if not line.strip():
+                continue
+            row = json.loads(line)
+            target_ids.add(row["community_id"])
+    print(f"[raw] target community ids: {len(target_ids)}", flush=True)
+
+    print("[raw] reading communities...", flush=True)
+    communities = read_communities_jsonl(args.communities)
+    communities = [c for c in communities if c.community_id in target_ids]
+    print(f"[raw] communities matched: {len(communities)}", flush=True)
+
+    label_index: dict[str, dict] = {}
+    if args.labeled_communities:
+        with Path(args.labeled_communities).open("r", encoding="utf-8") as handle:
+            for line in handle:
+                if not line.strip():
+                    continue
+                r = json.loads(line)
+                label_index[r["community_id"]] = r
+
+    needed_lm_ids: set[str] = set()
+    needed_edge_ids: set[str] = set()
+    for c in communities:
+        needed_lm_ids.update(c.landmark_event_ids)
+        needed_edge_ids.update(c.edge_ids)
+    print(
+        f"[raw] need {len(needed_lm_ids)} landmark rows / {len(needed_edge_ids)} edge rows",
+        flush=True,
+    )
+
+    print("[raw] stream-loading landmarks...", flush=True)
+    landmarks_by_id = _stream_filter_landmarks(Path(args.landmarks), needed_lm_ids)
+    print(f"[raw] landmarks loaded: {len(landmarks_by_id)}", flush=True)
+    print("[raw] stream-loading edges...", flush=True)
+    edges_by_id = _stream_filter_edges(Path(args.landmark_edges), needed_edge_ids)
+    print(f"[raw] edges loaded: {len(edges_by_id)}", flush=True)
+
+    out_dir = Path(args.output_dir)
+    prompts_dir = out_dir / "prompts"
+    prompts_dir.mkdir(parents=True, exist_ok=True)
+    metadata_path = out_dir / "prompt_metadata.jsonl"
+
+    builder = LandmarkCommunityPromptBuilder(
+        landmarks_by_id=landmarks_by_id,
+        edges_by_id=edges_by_id,
+        switches=CommunityPromptSwitches(
+            max_landmarks_in_prompt=args.max_landmarks_in_prompt,
+            max_edges_in_prompt=args.max_edges_in_prompt,
+        ),
+    )
+
+    written = 0
+    with metadata_path.open("w", encoding="utf-8") as meta_out:
+        for community in communities:
+            bundle = builder.build(community)
+            (prompts_dir / f"{community.community_id}.txt").write_text(
+                bundle.prompt_text, encoding="utf-8"
+            )
+            label_row = label_index.get(community.community_id) or {}
+            meta_out.write(
+                json.dumps(
+                    {
+                        "community_id": community.community_id,
+                        "host_id": community.host_id,
+                        "label": label_row.get("label", "unlabeled"),
+                        "label_source": label_row.get(
+                            "label_source", "no_ground_truth_join"
+                        ),
+                        "gt_atoms_hit": label_row.get("gt_atoms_hit") or [],
+                        "num_landmarks_total": len(community.landmark_event_ids),
+                        "num_landmarks_in_prompt": bundle.metadata[
+                            "num_landmarks_in_prompt"
+                        ],
+                        "num_edges_total": len(community.edge_ids),
+                        "num_edges_in_prompt": bundle.metadata["num_edges_in_prompt"],
+                        "span_seconds": community.span_seconds,
+                        "subjects_in_community": len(community.subjects),
+                        "selected_landmark_ids": list(bundle.selected_landmark_ids),
+                        "prompt_path": str(
+                            (prompts_dir / f"{community.community_id}.txt").resolve()
+                        ),
+                        "prompt_char_length": len(bundle.prompt_text),
+                    },
+                    ensure_ascii=False,
+                    sort_keys=True,
+                )
+                + "\n"
+            )
+            written += 1
+    print(f"[raw] wrote {written} prompts to {prompts_dir}", flush=True)
+    return 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
--- a/scripts/build_theia_prompt_batch.py
+++ b/scripts/build_theia_prompt_batch.py
@@ -0,0 +1,486 @@
+#!/usr/bin/env python3
+"""Generate ER-TP-DGP prompts for a labeled THEIA evaluation batch."""
+
+from __future__ import annotations
+
+import argparse
+import json
+import re
+from pathlib import Path
+
+from er_tp_dgp.experiments import default_method_registry
+from er_tp_dgp.metapaths import APTMetapathExtractor
+from er_tp_dgp.numerical_aggregator import NumericalAggregator
+from er_tp_dgp.prompt import PromptBuilder, PromptComponentSwitches
+from er_tp_dgp.theia import (
+    build_cached_theia_window_ir,
+    build_multi_target_window_irs,
+    discover_theia_json_files,
+)
+from er_tp_dgp.trimming import TemporalSecurityAwareTrimmer
+from er_tp_dgp.validation import validate_evidence_paths, validate_graph, validate_ir
+
+
+def main() -> None:
+    parser = argparse.ArgumentParser(
+        description=(
+            "Build graph-enhanced ER-TP-DGP prompts from labeled target metadata. "
+            "Labels are written only to metadata, never into prompt text."
+        )
+    )
+    parser.add_argument("--targets", default="reports/evaluation/e3_theia_v0_1/labeled_targets.jsonl")
+    parser.add_argument("--data-dir", default="data/raw/e3_theia_json")
+    parser.add_argument(
+        "--input-file",
+        action="append",
+        default=None,
+        help="Specific THEIA JSON file to scan. Can be repeated. Overrides --data-dir discovery.",
+    )
+    parser.add_argument("--output-dir", default="reports/evaluation/e3_theia_v0_1/prompts_graph_dgp_full")
+    parser.add_argument("--lookback-seconds", type=float, default=300.0)
+    parser.add_argument("--lookahead-seconds", type=float, default=300.0)
+    parser.add_argument("--top-m-per-metapath", type=int, default=5)
+    parser.add_argument(
+        "--max-window-events",
+        type=int,
+        default=50000,
+        help=(
+            "Soft audit threshold: windows above this size are recorded in "
+            "prompt_size_audit.jsonl but still proceed to prompt construction "
+            "(trimming controls actual prompt size, not raw window event count)."
+        ),
+    )
+    parser.add_argument(
+        "--hard-skip-window-events",
+        type=int,
+        default=None,
+        help=(
+            "If set, hard-skip targets whose window exceeds this size. Default "
+            "is no hard skip; only soft audit applies."
+        ),
+    )
+    parser.add_argument(
+        "--cache-dir",
+        default="reports/cache/theia_window_ir",
+        help="Directory for compressed window-IR snapshots. Pass empty to disable.",
+    )
+    parser.add_argument("--max-targets", type=int, default=None)
+    parser.add_argument(
+        "--include-cohort",
+        action="append",
+        default=None,
+        help="Only include this cohort. Can be repeated.",
+    )
+    parser.add_argument(
+        "--max-per-cohort",
+        type=int,
+        default=None,
+        help="Maximum targets to keep from each cohort after filtering.",
+    )
+    parser.add_argument(
+        "--method-variant",
+        default="graph_dgp",
+        help=(
+            "Method variant from experiments.default_method_registry(). "
+            "Drives prompt component switches (TextSumm / MDK / PathSumm / "
+            "NumSumm / TempTrim / SecAware / EvidenceIDs)."
+        ),
+    )
+    parser.add_argument(
+        "--summarizer-config",
+        default=None,
+        help=(
+            "Path to summarizer LLM config (YAML). Required if method variant "
+            "enables DGP TextSumm or PathSumm."
+        ),
+    )
+    parser.add_argument(
+        "--summarizer-workers",
+        type=int,
+        default=8,
+        help=(
+            "Concurrency for batched LLM summarization (ThreadPoolExecutor). "
+            "Higher values shorten first-cold-cache batches; bound by your "
+            "endpoint's per-key rate limit."
+        ),
+    )
+    parser.add_argument(
+        "--skip-multi-anchor-prewarm",
+        action="store_true",
+        help=(
+            "Skip the one-time multi-anchor IR prewarm and let the per-target "
+            "loop scan the corpus once per target. Only useful for debugging."
+        ),
+    )
+    args = parser.parse_args()
+
+    paths = [Path(path) for path in args.input_file] if args.input_file else discover_theia_json_files(args.data_dir)
+    if not paths:
+        raise SystemExit("no THEIA JSON files found")
+    targets = _read_jsonl(args.targets)
+    targets = _filter_targets(targets, args.include_cohort, args.max_per_cohort)
+    if args.max_targets is not None:
+        targets = targets[: args.max_targets]
+
+    output_dir = Path(args.output_dir)
+    prompts_dir = output_dir / "prompt_text"
+    validations_dir = output_dir / "validations"
+    prompts_dir.mkdir(parents=True, exist_ok=True)
+    validations_dir.mkdir(parents=True, exist_ok=True)
+    metadata_path = output_dir / "prompt_metadata.jsonl"
+    failures_path = output_dir / "prompt_failures.jsonl"
+    audit_path = output_dir / "prompt_size_audit.jsonl"
+    cache_dir = args.cache_dir or None
+
+    registry = default_method_registry()
+    if args.method_variant not in registry:
+        raise SystemExit(
+            f"unknown method variant: {args.method_variant}; "
+            f"choose from {sorted(registry)}"
+        )
+    method = registry[args.method_variant]
+    switches = PromptComponentSwitches(
+        use_text_summarization=method.uses_dgp_text_summarization,
+        use_path_summarization_llm=method.uses_dgp_path_summarization_llm,
+        use_numerical_aggregation_dgp=method.uses_dgp_numerical_aggregation,
+        use_apt_numerical_stats=method.uses_numerical_summary,
+        include_evidence_ids=method.uses_evidence_ids,
+        include_local_one_hop_context=method.uses_local_context,
+    )
+
+    summarizer_pair = _maybe_build_summarizers(
+        switches=switches,
+        summarizer_config_path=args.summarizer_config,
+        max_workers=args.summarizer_workers,
+    )
+
+    # Pre-warm the THEIA window-IR cache for *all* targets in one two-pass scan,
+    # so the per-target loop below hits cache instead of scanning the 80 GB
+    # corpus once per target. For 16 targets this is 16x less disk IO.
+    if cache_dir and not args.skip_multi_anchor_prewarm and len(targets) > 1:
+        anchors = [
+            {
+                "anchor_event_uuid": t["anchor_event_id"],
+                "lookback_seconds": args.lookback_seconds,
+                "lookahead_seconds": args.lookahead_seconds,
+            }
+            for t in targets
+        ]
+        from time import time as _now
+        prewarm_started = _now()
+        print(
+            f"[multi-anchor prewarm] {len(anchors)} anchors, lookback={args.lookback_seconds}s, "
+            f"lookahead={args.lookahead_seconds}s, cache={cache_dir}"
+        )
+        prewarm_results = build_multi_target_window_irs(
+            paths,
+            anchors=anchors,
+            cache_dir=cache_dir,
+        )
+        prewarm_elapsed = _now() - prewarm_started
+        print(
+            f"[multi-anchor prewarm] populated {len(prewarm_results)}/{len(anchors)} anchors "
+            f"in {prewarm_elapsed:.1f}s"
+        )
+
+    metadata_rows: list[dict[str, object]] = []
+    failure_rows: list[dict[str, object]] = []
+    audit_rows: list[dict[str, object]] = []
+    for index, target in enumerate(targets, start=1):
+        target_id = target["target_id"]
+        anchor_event_id = target["anchor_event_id"]
+        try:
+            window = build_cached_theia_window_ir(
+                paths,
+                target_event_uuid=anchor_event_id,
+                lookback_seconds=args.lookback_seconds,
+                lookahead_seconds=args.lookahead_seconds,
+                cache_dir=cache_dir,
+            )
+            graph = window.to_graph()
+            graph_target_id = window.target_subject_id or window.target_event_id
+            if graph_target_id != target_id:
+                raise ValueError(
+                    f"anchor event subject mismatch: expected {target_id}, got {graph_target_id}"
+                )
+            if (
+                args.hard_skip_window_events is not None
+                and len(window.events) > args.hard_skip_window_events
+            ):
+                raise ValueError(
+                    f"window too large for direct prompt construction: "
+                    f"{len(window.events)} events > {args.hard_skip_window_events}; "
+                    "consider narrower lookback/lookahead or remove --hard-skip-window-events."
+                )
+            window_oversize = len(window.events) > args.max_window_events
+            if window_oversize:
+                audit_rows.append(
+                    {
+                        "target_id": target_id,
+                        "anchor_event_id": anchor_event_id,
+                        "cohort": target.get("cohort"),
+                        "events": len(window.events),
+                        "audit_threshold": args.max_window_events,
+                        "note": (
+                            "Window exceeded soft threshold; prompt was still "
+                            "constructed because trimming controls prompt size."
+                        ),
+                    }
+                )
+
+            ir_report = validate_ir(list(window.entities), list(window.events))
+            graph_report = validate_graph(graph)
+            paths_all = APTMetapathExtractor(graph).extract_for_target(graph_target_id)
+            selected = _select_paths(
+                graph=graph,
+                graph_target_id=graph_target_id,
+                paths_all=paths_all,
+                method_variant=method,
+                top_m_per_metapath=args.top_m_per_metapath,
+            )
+            evidence_report = validate_evidence_paths(graph, selected)
+            node_summarizer, path_summarizer = summarizer_pair
+            prompt = PromptBuilder(
+                graph,
+                node_summarizer=node_summarizer,
+                path_summarizer=path_summarizer,
+                numerical_aggregator=NumericalAggregator(graph),
+                switches=switches,
+            ).build(graph_target_id, selected)
+
+            safe_name = f"{index:04d}_{_safe_id(target_id)}"
+            prompt_path = prompts_dir / f"{safe_name}.txt"
+            prompt_path.write_text(prompt.prompt_text, encoding="utf-8")
+            (validations_dir / f"{safe_name}_ir.md").write_text(ir_report.to_markdown(), encoding="utf-8")
+            (validations_dir / f"{safe_name}_graph.md").write_text(graph_report.to_markdown(), encoding="utf-8")
+            (validations_dir / f"{safe_name}_evidence.md").write_text(evidence_report.to_markdown(), encoding="utf-8")
+
+            metadata_rows.append(
+                {
+                    "target_id": target_id,
+                    "target_type": target["target_type"],
+                    "label": target["label"],
+                    "label_confidence": target["label_confidence"],
+                    "cohort": target["cohort"],
+                    "anchor_event_id": anchor_event_id,
+                    "prompt_path": str(prompt_path),
+                    "prompt_chars": len(prompt.prompt_text),
+                    "prompt_estimated_tokens": int(len(prompt.prompt_text) / 4),
+                    "entities": len(window.entities),
+                    "events": len(window.events),
+                    "extracted_evidence_paths": len(paths_all),
+                    "selected_evidence_paths": len(selected),
+                    "evidence_path_ids": list(prompt.evidence_path_ids),
+                    "ir_ok": ir_report.ok,
+                    "graph_ok": graph_report.ok,
+                    "evidence_ok": evidence_report.ok,
+                    "schema_gaps": list(window.schema_gaps),
+                    "label_fields_excluded_from_prompt": True,
+                    "method_variant": method.name,
+                    "window_exceeded_soft_threshold": window_oversize,
+                }
+            )
+            tag = " (oversize-window)" if window_oversize else ""
+            print(
+                f"[{index}/{len(targets)}] built {target_id} "
+                f"events={len(window.events)} selected={len(selected)}{tag}"
+            )
+        except Exception as exc:
+            failure_rows.append(
+                {
+                    "target_id": target_id,
+                    "anchor_event_id": anchor_event_id,
+                    "cohort": target.get("cohort"),
+                    "error": str(exc),
+                }
+            )
+            print(f"[{index}/{len(targets)}] failed {target_id}: {exc}")
+
+    _write_jsonl(metadata_path, metadata_rows)
+    _write_jsonl(failures_path, failure_rows)
+    _write_jsonl(audit_path, audit_rows)
+    summary = _summary_markdown(metadata_rows, failure_rows, audit_rows, args)
+    (output_dir / "prompt_batch.md").write_text(summary + "\n", encoding="utf-8")
+
+    print(f"built={len(metadata_rows)} failed={len(failure_rows)} oversize_audited={len(audit_rows)}")
+    print(f"wrote {metadata_path}")
+    print(f"wrote {failures_path}")
+    print(f"wrote {audit_path}")
+
+
+def _read_jsonl(path: str | Path) -> list[dict[str, object]]:
+    rows: list[dict[str, object]] = []
+    with Path(path).open("r", encoding="utf-8") as handle:
+        for line in handle:
+            if line.strip():
+                rows.append(json.loads(line))
+    return rows
+
+
+def _filter_targets(
+    targets: list[dict[str, object]],
+    include_cohorts: list[str] | None,
+    max_per_cohort: int | None,
+) -> list[dict[str, object]]:
+    if include_cohorts:
+        allowed = set(include_cohorts)
+        targets = [target for target in targets if target.get("cohort") in allowed]
+    if max_per_cohort is None:
+        return targets
+    counts: dict[str, int] = {}
+    selected: list[dict[str, object]] = []
+    for target in targets:
+        cohort = str(target.get("cohort"))
+        if counts.get(cohort, 0) >= max_per_cohort:
+            continue
+        selected.append(target)
+        counts[cohort] = counts.get(cohort, 0) + 1
+    return selected
+
+
+def _write_jsonl(path: str | Path, rows: list[dict[str, object]]) -> None:
+    destination = Path(path)
+    destination.parent.mkdir(parents=True, exist_ok=True)
+    with destination.open("w", encoding="utf-8") as handle:
+        for row in rows:
+            handle.write(json.dumps(row, ensure_ascii=False, sort_keys=True) + "\n")
+
+
+def _summary_markdown(
+    metadata_rows: list[dict[str, object]],
+    failure_rows: list[dict[str, object]],
+    audit_rows: list[dict[str, object]],
+    args: argparse.Namespace,
+) -> str:
+    cohorts: dict[str, int] = {}
+    for row in metadata_rows:
+        cohort = str(row.get("cohort"))
+        cohorts[cohort] = cohorts.get(cohort, 0) + 1
+    lines = [
+        "# ER-TP-DGP Prompt Batch",
+        "",
+        "Labels are metadata only and are excluded from prompt text.",
+        "",
+        f"- method_variant: {args.method_variant}",
+        f"- built: {len(metadata_rows)}",
+        f"- failed: {len(failure_rows)}",
+        f"- oversize_audited: {len(audit_rows)}",
+        f"- lookback_seconds: {args.lookback_seconds}",
+        f"- lookahead_seconds: {args.lookahead_seconds}",
+        f"- top_m_per_metapath: {args.top_m_per_metapath}",
+        f"- max_window_events_soft: {args.max_window_events}",
+        f"- hard_skip_window_events: {args.hard_skip_window_events}",
+        f"- cache_dir: {args.cache_dir}",
+        "",
+        "## Cohorts",
+        "",
+    ]
+    lines.extend([f"- {key}: {value}" for key, value in sorted(cohorts.items())] or ["- none"])
+    lines.extend(["", "## Prompt Size", ""])
+    if metadata_rows:
+        token_values = [int(row["prompt_estimated_tokens"]) for row in metadata_rows]
+        lines.extend(
+            [
+                f"- min_estimated_tokens: {min(token_values)}",
+                f"- max_estimated_tokens: {max(token_values)}",
+                f"- avg_estimated_tokens: {sum(token_values) / len(token_values):.1f}",
+            ]
+        )
+    else:
+        lines.append("- none")
+    if failure_rows:
+        lines.extend(["", "## Failures", ""])
+        for row in failure_rows:
+            lines.append(f"- target={row['target_id']} error={row['error']}")
+    return "\n".join(lines)
+
+
+def _safe_id(value: str) -> str:
+    return re.sub(r"[^A-Za-z0-9_.-]+", "_", value)[:120]
+
+
+def _select_paths(*, graph, graph_target_id, paths_all, method_variant, top_m_per_metapath):
+    """Pick the trimmer that matches the method variant's switches.
+
+    Four regimes (in order of preference):
+      0. No graph at all: ``uses_event_reified_graph=False`` → return [].
+         Used by target_only_llm and similar non-graph baselines so the prompt
+         really contains zero metapath context.
+      1. DGP MDK: ``uses_dgp_diffusion_trimming=True`` → MDK trimmer.
+      2. APT rule trimmer: MDK off but TempTrim/SecAware still on.
+      3. No trimming at all: return paths_all (w/o TempTrim ablation when
+         MDK is also off, but graph still present).
+    """
+    if not method_variant.uses_event_reified_graph:
+        return []
+    if method_variant.uses_dgp_diffusion_trimming:
+        try:
+            from er_tp_dgp.diffusion_trimmer import (
+                HashingEmbedder,
+                MarkovDiffusionTrimmer,
+                MDKConfig,
+            )
+        except RuntimeError:
+            print("WARNING: numpy unavailable; falling back to rule trimmer for MDK request.")
+        else:
+            embedder = HashingEmbedder(dim=64)
+            return MarkovDiffusionTrimmer(
+                graph,
+                embedder=embedder,
+                config=MDKConfig(k_hops=3, top_m=top_m_per_metapath),
+            ).trim(graph_target_id, paths_all)
+
+    if method_variant.uses_temporal_trimming or method_variant.uses_security_aware_trimming:
+        return TemporalSecurityAwareTrimmer(
+            graph,
+            top_m_per_metapath=top_m_per_metapath,
+        ).trim(graph_target_id, paths_all)
+
+    # No trimming.
+    return paths_all
+
+
+def _maybe_build_summarizers(*, switches, summarizer_config_path, max_workers):
+    """Build NodeTextSummarizer / MetapathTextSummarizer iff DGP TextSumm/PathSumm enabled.
+
+    Returns ``(None, None)`` when summarization is disabled.
+    """
+    needs_node = switches.use_text_summarization
+    needs_path = switches.use_path_summarization_llm
+    if not (needs_node or needs_path):
+        return None, None
+    if not summarizer_config_path:
+        print(
+            "WARNING: method variant requests TextSumm/PathSumm but "
+            "--summarizer-config was not provided; falling back to truncation-only summaries."
+        )
+        from er_tp_dgp.text_summarizer import (
+            MetapathTextSummarizer,
+            NodeTextSummarizer,
+            SummarizerConfig,
+            _NullLLM,
+        )
+
+        cfg = SummarizerConfig(model_name="null-fallback", max_workers=max_workers)
+        node = NodeTextSummarizer(llm=_NullLLM(), config=cfg) if needs_node else None
+        path = MetapathTextSummarizer(llm=_NullLLM(), config=cfg) if needs_path else None
+        return node, path
+
+    from er_tp_dgp.llm import OpenAICompatibleHTTPProvider
+    from er_tp_dgp.llm_config import load_llm_config
+    from er_tp_dgp.text_summarizer import (
+        MetapathTextSummarizer,
+        NodeTextSummarizer,
+        SummarizerConfig,
+    )
+
+    llm_config = load_llm_config(summarizer_config_path)
+    provider = OpenAICompatibleHTTPProvider(llm_config)
+    cfg = SummarizerConfig(model_name=llm_config.model, max_workers=max_workers)
+    node = NodeTextSummarizer(llm=provider, config=cfg) if needs_node else None
+    path = MetapathTextSummarizer(llm=provider, config=cfg) if needs_path else None
+    return node, path
+
+
+if __name__ == "__main__":
+    main()
--- a/scripts/evaluate_landmark_detection.py
+++ b/scripts/evaluate_landmark_detection.py
@@ -0,0 +1,284 @@
+#!/usr/bin/env python3
+"""Join ORTHRUS ground truth onto landmark communities and report coverage.
+
+The CSG construction is GT-free. This script is the evaluation phase: it
+reads the constructed communities and asks two questions:
+
+1. **Subject coverage** — for each GT-malicious subject, is it touched by
+   at least one community? Lower bounds detection recall.
+2. **Community-level GT join** — for each community, is any of its landmark
+   events a GT-malicious-subject event? Communities flagged this way are
+   the positive class for downstream LLM evaluation.
+
+The output of this script is the labeled-community manifest fed to LLM
+prompting + AUPRC computation.
+"""
+
+from __future__ import annotations
+
+import argparse
+import json
+import sys
+from collections import Counter, defaultdict
+from pathlib import Path
+from typing import Any
+
+SRC = Path(__file__).resolve().parent.parent / "src"
+if str(SRC) not in sys.path:
+    sys.path.insert(0, str(SRC))
+
+
+def main() -> int:
+    parser = argparse.ArgumentParser(description=__doc__)
+    parser.add_argument("--communities", required=True)
+    parser.add_argument("--landmarks", required=True)
+    parser.add_argument(
+        "--oracle-targets",
+        required=True,
+        help="ORTHRUS labeled targets (target_id is the malicious subject UUID).",
+    )
+    parser.add_argument(
+        "--out-labeled-communities",
+        required=True,
+        help="Per-community manifest with label/atom_id joined for evaluation only.",
+    )
+    parser.add_argument(
+        "--out-markdown",
+        required=True,
+        help="Aggregate evaluation report.",
+    )
+    args = parser.parse_args()
+
+    communities = _read_jsonl(args.communities)
+    landmarks_by_id = {row["event_id"]: row for row in _read_jsonl(args.landmarks)}
+    oracle_rows = _read_jsonl(args.oracle_targets)
+
+    # Build subject → atom_id lookup over ground-truth-malicious subjects.
+    gt_subject_to_atom: dict[str, str | None] = {}
+    for row in oracle_rows:
+        if row.get("label") == "malicious":
+            gt_subject_to_atom[row["target_id"]] = row.get("atom_id")
+    print(f"[gt] malicious subjects in oracle: {len(gt_subject_to_atom)}", flush=True)
+
+    # Index communities by subject for "did we cover this GT subject?".
+    communities_by_subject: dict[str, list[dict[str, Any]]] = defaultdict(list)
+    for c in communities:
+        for sid in c.get("subjects") or ():
+            communities_by_subject[sid].append(c)
+
+    # Per-GT-subject coverage report.
+    covered = 0
+    coverage_rows: list[dict[str, Any]] = []
+    for sid, atom in gt_subject_to_atom.items():
+        cs = communities_by_subject.get(sid, [])
+        if cs:
+            covered += 1
+            coverage_rows.append(
+                {
+                    "subject_uuid": sid,
+                    "atom_id": atom,
+                    "covered": True,
+                    "communities": [c["community_id"] for c in cs],
+                    "max_community_landmarks": max(len(c["landmark_event_ids"]) for c in cs),
+                }
+            )
+        else:
+            coverage_rows.append(
+                {
+                    "subject_uuid": sid,
+                    "atom_id": atom,
+                    "covered": False,
+                    "communities": [],
+                    "max_community_landmarks": 0,
+                }
+            )
+
+    # Per-community label join: a community is malicious if any of its
+    # landmarks' actor_subject_id is in the GT-malicious set.
+    labeled: list[dict[str, Any]] = []
+    malicious_community_count = 0
+    benign_community_count = 0
+    for c in communities:
+        gt_subjects_hit: list[str] = []
+        gt_atoms_hit: set[str] = set()
+        for eid in c["landmark_event_ids"]:
+            lm = landmarks_by_id.get(eid)
+            if not lm:
+                continue
+            sid = lm.get("actor_subject_id")
+            if sid in gt_subject_to_atom:
+                gt_subjects_hit.append(sid)
+                atom = gt_subject_to_atom[sid]
+                if atom:
+                    gt_atoms_hit.add(atom)
+        is_malicious = bool(gt_subjects_hit)
+        if is_malicious:
+            malicious_community_count += 1
+        else:
+            benign_community_count += 1
+        labeled.append(
+            {
+                "community_id": c["community_id"],
+                "host_id": c.get("host_id"),
+                "label": "malicious" if is_malicious else "benign",
+                "label_source": (
+                    "orthrus_subject_membership" if is_malicious else "no_gt_subject_overlap"
+                ),
+                "gt_subjects_hit": sorted(set(gt_subjects_hit)),
+                "gt_atoms_hit": sorted(gt_atoms_hit),
+                "num_landmarks": len(c["landmark_event_ids"]),
+                "num_edges": len(c.get("edge_ids") or ()),
+                "subjects_in_community": len(c.get("subjects") or ()),
+                "span_seconds": c["span_seconds"],
+                "start_timestamp_nanos": c["start_timestamp_nanos"],
+                "landmark_class_counts": c.get("landmark_class_counts") or {},
+            }
+        )
+
+    Path(args.out_labeled_communities).parent.mkdir(parents=True, exist_ok=True)
+    with Path(args.out_labeled_communities).open("w", encoding="utf-8") as out:
+        for row in labeled:
+            out.write(json.dumps(row, ensure_ascii=False, sort_keys=True) + "\n")
+
+    md = _render_markdown(
+        coverage_rows=coverage_rows,
+        labeled=labeled,
+        gt_subjects=gt_subject_to_atom,
+        communities=communities,
+        config={
+            "communities_path": args.communities,
+            "landmarks_path": args.landmarks,
+            "oracle_targets_path": args.oracle_targets,
+            "out_labeled_communities": args.out_labeled_communities,
+        },
+        coverage=covered,
+    )
+    Path(args.out_markdown).parent.mkdir(parents=True, exist_ok=True)
+    Path(args.out_markdown).write_text(md, encoding="utf-8")
+
+    print(
+        f"[eval] gt_subjects={len(gt_subject_to_atom)} covered={covered} "
+        f"communities={len(communities)} malicious={malicious_community_count} "
+        f"benign={benign_community_count}",
+        flush=True,
+    )
+    print(f"[eval] wrote {args.out_labeled_communities}", flush=True)
+    print(f"[eval] wrote {args.out_markdown}", flush=True)
+    return 0
+
+
+def _read_jsonl(path: str) -> list[dict[str, Any]]:
+    rows: list[dict[str, Any]] = []
+    with Path(path).open("r", encoding="utf-8") as handle:
+        for line in handle:
+            if not line.strip():
+                continue
+            rows.append(json.loads(line))
+    return rows
+
+
+def _render_markdown(
+    *,
+    coverage_rows: list[dict[str, Any]],
+    labeled: list[dict[str, Any]],
+    gt_subjects: dict[str, str | None],
+    communities: list[dict[str, Any]],
+    config: dict[str, Any],
+    coverage: int,
+) -> str:
+    total_subjects = len(gt_subjects)
+    malicious_communities = sum(1 for r in labeled if r["label"] == "malicious")
+    benign_communities = sum(1 for r in labeled if r["label"] == "benign")
+    atoms_with_at_least_one_community: set[str] = set()
+    for row in labeled:
+        if row["label"] == "malicious":
+            atoms_with_at_least_one_community.update(row.get("gt_atoms_hit") or [])
+    total_atoms = {atom for atom in gt_subjects.values() if atom}
+
+    sizes = [len(c["landmark_event_ids"]) for c in communities]
+    sizes_malicious = [r["num_landmarks"] for r in labeled if r["label"] == "malicious"]
+    sizes_benign = [r["num_landmarks"] for r in labeled if r["label"] == "benign"]
+
+    failures = [
+        (row["atom_id"] or "(no atom)", row["subject_uuid"])
+        for row in coverage_rows
+        if not row["covered"]
+    ]
+    failure_atoms = Counter(atom for atom, _ in failures)
+
+    lines = [
+        "# Landmark CSG Detection Coverage",
+        "",
+        "Construction is GT-free. This report joins GT only for evaluation.",
+        "",
+        "## Inputs",
+        "",
+        f"- communities: `{config['communities_path']}`",
+        f"- landmarks: `{config['landmarks_path']}`",
+        f"- oracle: `{config['oracle_targets_path']}`",
+        f"- output (labeled communities): `{config['out_labeled_communities']}`",
+        "",
+        "## Subject coverage",
+        "",
+        f"- GT-malicious subjects: {total_subjects}",
+        f"- subjects touched by at least one community: {coverage}",
+        (
+            f"- **subject_coverage_recall**: {coverage / total_subjects:.3f}"
+            if total_subjects
+            else "- subject_coverage_recall: n/a"
+        ),
+        "",
+        "## Community-level join",
+        "",
+        f"- communities total: {len(communities)}",
+        f"- malicious communities: {malicious_communities}",
+        f"- benign communities: {benign_communities}",
+        (
+            f"- malicious_share: {malicious_communities / len(communities):.4f}"
+            if communities
+            else "- malicious_share: n/a"
+        ),
+        f"- distinct GT atoms with ≥1 community: {len(atoms_with_at_least_one_community)} / {len(total_atoms)}",
+        "",
+        "## Community size",
+        "",
+        f"- all (n={len(sizes)}): min={min(sizes, default=0)} median={_pct(sizes, 0.5)} p90={_pct(sizes, 0.9)} max={max(sizes, default=0)}",
+        f"- malicious (n={len(sizes_malicious)}): median={_pct(sizes_malicious, 0.5)} p90={_pct(sizes_malicious, 0.9)} max={max(sizes_malicious, default=0)}",
+        f"- benign (n={len(sizes_benign)}): median={_pct(sizes_benign, 0.5)} p90={_pct(sizes_benign, 0.9)} max={max(sizes_benign, default=0)}",
+        "",
+        "## Failure breakdown (uncovered GT subjects)",
+        "",
+    ]
+    if not failures:
+        lines.append("- (none)")
+    else:
+        for atom, n in failure_atoms.most_common(20):
+            lines.append(f"- {atom}: {n}")
+    lines.extend(
+        [
+            "",
+            "## Interpretation",
+            "",
+            "- `subject_coverage_recall` is the upper bound on detection recall:",
+            "  any subject NOT touched by a community cannot be flagged.",
+            "- `malicious_share` is the inverse of the LLM's class imbalance — too low",
+            "  means LLM faces an extreme imbalance; too high means the construction is",
+            "  over-clustering benign and malicious into shared communities.",
+            "- Median malicious community size vs benign indicates whether attack",
+            "  stories naturally form longer chains than benign noise.",
+            "",
+        ]
+    )
+    return "\n".join(lines)
+
+
+def _pct(values: list[int], q: float) -> int | None:
+    if not values:
+        return None
+    ordered = sorted(values)
+    k = max(0, min(len(ordered) - 1, int(round(q * (len(ordered) - 1)))))
+    return ordered[k]
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
--- a/scripts/extract_e3_ground_truth_atoms.py
+++ b/scripts/extract_e3_ground_truth_atoms.py
@@ -0,0 +1,49 @@
+#!/usr/bin/env python3
+"""Extract label-only structured atoms from the E3 ground-truth PDF."""
+
+from __future__ import annotations
+
+import argparse
+import subprocess
+from pathlib import Path
+
+from er_tp_dgp.ground_truth import write_ground_truth_atom_report
+
+
+def main() -> None:
+    parser = argparse.ArgumentParser(
+        description="Extract label-only E3 ground-truth atoms. Output must not be used in prompts."
+    )
+    parser.add_argument(
+        "--pdf",
+        default="data/ground_truth/e3/TC_Ground_Truth_Report_E3_Update.pdf",
+    )
+    parser.add_argument("--output-dir", default="reports/ground_truth/e3")
+    parser.add_argument("--target-filter", default="THEIA")
+    args = parser.parse_args()
+
+    pdf_path = Path(args.pdf)
+    if not pdf_path.exists():
+        raise SystemExit(f"missing PDF: {pdf_path}")
+
+    result = subprocess.run(
+        ["pdftotext", "-layout", str(pdf_path), "-"],
+        check=True,
+        capture_output=True,
+        text=True,
+    )
+    output_dir = Path(args.output_dir)
+    output_dir.mkdir(parents=True, exist_ok=True)
+    report = write_ground_truth_atom_report(
+        result.stdout,
+        jsonl_path=output_dir / "ground_truth_atoms.jsonl",
+        markdown_path=output_dir / "ground_truth_atoms.md",
+        target_filter=None if args.target_filter.lower() == "all" else args.target_filter,
+    )
+    print(f"atoms={len(report.atoms)} lines_seen={report.lines_seen}")
+    print(f"wrote {output_dir / 'ground_truth_atoms.jsonl'}")
+    print(f"wrote {output_dir / 'ground_truth_atoms.md'}")
+
+
+if __name__ == "__main__":
+    main()
--- a/scripts/freeze_method_version.py
+++ b/scripts/freeze_method_version.py
@@ -0,0 +1,43 @@
+#!/usr/bin/env python3
+"""Freeze an auditable ER-TP-DGP method-version manifest."""
+
+from __future__ import annotations
+
+import argparse
+from pathlib import Path
+
+from er_tp_dgp.versioning import write_method_version_manifest
+
+
+def main() -> None:
+    parser = argparse.ArgumentParser(
+        description="Write a sanitized, hash-based ER-TP-DGP method manifest."
+    )
+    parser.add_argument(
+        "--output",
+        default="reports/method_versions/ER-TP-DGP-v0.1.json",
+        help="Destination JSON path.",
+    )
+    parser.add_argument("--version", default="ER-TP-DGP-v0.1")
+    parser.add_argument("--repo-root", default=".")
+    parser.add_argument(
+        "--llm-config",
+        default="configs/llm.yaml",
+        help="LLM YAML to sanitize and include. Use 'none' to skip.",
+    )
+    args = parser.parse_args()
+
+    llm_config_path = None if args.llm_config.lower() == "none" else args.llm_config
+    manifest = write_method_version_manifest(
+        args.output,
+        repo_root=args.repo_root,
+        version=args.version,
+        llm_config_path=llm_config_path,
+    )
+    print(f"wrote {Path(args.output)}")
+    print(f"method={manifest.method_name} version={manifest.version}")
+    print(f"components={len(manifest.components)}")
+
+
+if __name__ == "__main__":
+    main()
--- a/scripts/import_orthrus_ground_truth.py
+++ b/scripts/import_orthrus_ground_truth.py
@@ -0,0 +1,331 @@
+#!/usr/bin/env python3
+"""Import ORTHRUS (USENIX Sec 2025) ground truth into ER-TP-DGP labeled_targets.jsonl format.
+
+ORTHRUS publishes manually curated, attack-graph-aligned ground truth for
+DARPA TC E3 + E5 (12 attack scenarios across 6 sub-datasets) on Zenodo
+(record 14641608, https://github.com/ubc-provenance/ground-truth). It is the
+most conservative and reproducible labeling currently available — see
+ORTHRUS Appendix C "Ground Truth Construction" for methodology.
+
+This script:
+  1. Reads ORTHRUS CSV files (UUID, attrs, index_id) per attack scenario
+  2. Filters to subject (process) entities — those are the ones our pipeline
+     can score as targets. Files / netflows are kept as evidence of attack
+     scope but excluded from the target list.
+  3. Produces labeled_targets.jsonl rows with:
+       label = "malicious"
+       atom_id = ORTHRUS scenario name (e.g. e3-theia-Browser_Extension_Drakon_Dropper)
+       process_path = parsed from ORTHRUS attributes['subject']
+       cohort = "positive_high_confidence_orthrus"
+  4. Optionally augments with hard_negative_proxy from candidate_universe.
+
+NOTE: each malicious target needs an ``anchor_event_id``. ORTHRUS labels
+entities, not events — so for each subject UUID we pick the FIRST event in
+the THEIA log where that subject appears as actor (i.e. its earliest action).
+This requires scanning the corpus once to map subject_uuid → first
+event_uuid, which is built lazily and cached.
+"""
+
+from __future__ import annotations
+
+import argparse
+import ast
+import csv
+import json
+import re
+from collections import defaultdict
+from pathlib import Path
+
+from er_tp_dgp.theia import discover_theia_json_files, iter_theia_records
+from er_tp_dgp.theia import _unwrap_uuid
+
+
+def main() -> int:
+    parser = argparse.ArgumentParser(description=__doc__.split("\n", 1)[0])
+    parser.add_argument(
+        "--orthrus-dir",
+        default="data/ground_truth/orthrus/ubc-provenance-ground-truth-ff65bc7/darpa",
+        help="Root of the unpacked ORTHRUS ground-truth zip.",
+    )
+    parser.add_argument(
+        "--sub-dataset",
+        default="E3-THEIA",
+        choices=["E3-CADETS", "E3-CLEARSCOPE", "E3-THEIA", "E5-CADETS", "E5-CLEARSCOPE", "E5-THEIA"],
+    )
+    parser.add_argument(
+        "--data-dir",
+        default="data/raw/e3_theia_json",
+        help="Raw THEIA JSON corpus to scan for first-event-per-subject anchors.",
+    )
+    parser.add_argument(
+        "--out-jsonl",
+        required=True,
+        help="Output labeled_targets.jsonl path (will be written; parent dirs created).",
+    )
+    parser.add_argument(
+        "--anchor-cache",
+        default="reports/cache/orthrus_subject_first_event_e3_theia.jsonl",
+        help="Cache mapping subject UUID -> first event UUID (built once).",
+    )
+    parser.add_argument(
+        "--include-non-subject",
+        action="store_true",
+        help=(
+            "Include ORTHRUS-labeled file/netflow entities as separate targets. "
+            "Default: only subject (process) entities."
+        ),
+    )
+    parser.add_argument(
+        "--candidate-universe",
+        default="reports/theia_candidate_universe/candidate_universe.jsonl",
+        help="Used to draw diverse benign cohort.",
+    )
+    parser.add_argument(
+        "--num-benign",
+        type=int,
+        default=0,
+        help="Number of hard_negative_proxy candidates to draw from candidate_universe.",
+    )
+    parser.add_argument(
+        "--benign-process-paths",
+        nargs="*",
+        default=None,
+        help=(
+            "Optional list of process paths to include in benign cohort. If unset, "
+            "stratify by unique process_path to maximize cohort diversity."
+        ),
+    )
+    args = parser.parse_args()
+
+    orthrus_root = Path(args.orthrus_dir) / args.sub_dataset
+    if not orthrus_root.exists():
+        raise SystemExit(f"missing ORTHRUS dir: {orthrus_root}")
+    csv_files = sorted(orthrus_root.glob("node_*.csv"))
+    if not csv_files:
+        raise SystemExit(f"no node_*.csv under {orthrus_root}")
+
+    # Step 1: load all ORTHRUS-labeled UUIDs + their attributes per scenario.
+    malicious_records: list[dict] = []
+    for f in csv_files:
+        scenario = f.stem.replace("node_", "")
+        with f.open() as handle:
+            for row in csv.reader(handle):
+                if not row or len(row) < 3:
+                    continue
+                uuid, attrs_str, _idx = row[0], row[1], row[2]
+                try:
+                    attrs = ast.literal_eval(attrs_str)
+                except (SyntaxError, ValueError):
+                    attrs = {}
+                if not isinstance(attrs, dict) or not attrs:
+                    continue
+                attr_type = next(iter(attrs.keys()))
+                attr_value = attrs[attr_type]
+                if not args.include_non_subject and attr_type != "subject":
+                    continue
+                process_path, command_line = _parse_subject_attr(attr_value) if attr_type == "subject" else (None, None)
+                malicious_records.append({
+                    "target_id": uuid,
+                    "target_type": "PROCESS" if attr_type == "subject" else attr_type.upper(),
+                    "atom_id": f"{args.sub_dataset.lower()}-{scenario}",
+                    "label": "malicious",
+                    "label_confidence": "high",
+                    "label_source": "orthrus_manual_curated",
+                    "cohort": "positive_high_confidence_orthrus",
+                    "process_path": process_path,
+                    "command_line": command_line,
+                    "attrs_raw": attrs,
+                })
+    print(f"ORTHRUS {args.sub_dataset}: scenarios={len(csv_files)} records={len(malicious_records)} (subjects only={not args.include_non_subject})")
+
+    # Step 2: anchor mapping. We need a target_event_uuid for each subject so
+    # build_theia_window_ir can pick a time window. Build subject_uuid →
+    # first_event_uuid via one corpus scan, with on-disk cache.
+    anchor_cache_path = Path(args.anchor_cache)
+    subject_to_anchor: dict[str, dict] = {}
+    if anchor_cache_path.exists():
+        with anchor_cache_path.open() as handle:
+            for line in handle:
+                if line.strip():
+                    row = json.loads(line)
+                    subject_to_anchor[row["subject_uuid"]] = row
+        print(f"loaded {len(subject_to_anchor)} subject→event anchors from {anchor_cache_path}")
+    else:
+        wanted = {r["target_id"] for r in malicious_records if r["target_type"] == "PROCESS"}
+        print(f"scanning corpus for first event per subject (n={len(wanted)} subjects)... this may take ~5 min for 80 GB E3-THEIA")
+        paths = discover_theia_json_files(args.data_dir)
+        for record in iter_theia_records(paths):
+            if record.record_type != "Event":
+                continue
+            payload = record.payload
+            sid = _unwrap_uuid(payload.get("subject"))
+            if sid not in wanted or sid in subject_to_anchor:
+                continue
+            ts = payload.get("timestampNanos")
+            if not isinstance(ts, int):
+                continue
+            evid = payload.get("uuid")
+            if not evid:
+                continue
+            subject_to_anchor[sid] = {
+                "subject_uuid": sid,
+                "anchor_event_id": evid,
+                "anchor_event_type": payload.get("type"),
+                "anchor_timestamp_nanos": ts,
+            }
+            if len(subject_to_anchor) >= len(wanted):
+                break
+        anchor_cache_path.parent.mkdir(parents=True, exist_ok=True)
+        with anchor_cache_path.open("w") as out:
+            for row in subject_to_anchor.values():
+                out.write(json.dumps(row, ensure_ascii=False, sort_keys=True) + "\n")
+        print(f"cached {len(subject_to_anchor)} subject→event anchors → {anchor_cache_path}")
+
+    # Step 3: emit labeled_targets.jsonl rows.
+    out_rows: list[dict] = []
+    skipped = 0
+    for r in malicious_records:
+        anchor = subject_to_anchor.get(r["target_id"])
+        if r["target_type"] == "PROCESS" and not anchor:
+            skipped += 1
+            continue
+        out_rows.append({
+            "target_id": r["target_id"],
+            "target_type": r["target_type"],
+            "label": r["label"],
+            "label_confidence": r["label_confidence"],
+            "cohort": r["cohort"],
+            "anchor_event_id": (anchor or {}).get("anchor_event_id"),
+            "anchor_timestamp_nanos": (anchor or {}).get("anchor_timestamp_nanos"),
+            "atom_id": r["atom_id"],
+            "label_source": r["label_source"],
+            "matched_event_count": 0,
+            "weak_signal_score": None,
+            "candidate_total_events": None,
+            "candidate_estimated_prompt_tokens": None,
+            "process_path": r["process_path"],
+            "command_line": r["command_line"],
+            "prompt_allowed_label_fields": False,
+            "notes": [
+                "Ground truth from ORTHRUS USENIX Sec 2025 (Zenodo 14641608).",
+                "Manually curated, conservative attack-graph-aligned labels.",
+                f"Attack scenario: {r['atom_id']}.",
+            ],
+        })
+    print(f"emitted {len(out_rows)} malicious targets ({skipped} skipped due to missing anchor)")
+
+    # Step 4: optional benign cohort.
+    if args.num_benign > 0:
+        cu_path = Path(args.candidate_universe)
+        if not cu_path.exists():
+            print(f"WARNING: candidate_universe missing at {cu_path}; skipping benign cohort.")
+        else:
+            benign_rows = _select_diverse_benign(
+                cu_path,
+                num=args.num_benign,
+                exclude_uuids={r["target_id"] for r in out_rows},
+                allowed_paths=set(args.benign_process_paths) if args.benign_process_paths else None,
+            )
+            out_rows.extend(benign_rows)
+            print(f"appended {len(benign_rows)} hard_negative_proxy targets")
+
+    out_path = Path(args.out_jsonl)
+    out_path.parent.mkdir(parents=True, exist_ok=True)
+    with out_path.open("w", encoding="utf-8") as handle:
+        for row in out_rows:
+            handle.write(json.dumps(row, ensure_ascii=False, sort_keys=True) + "\n")
+    print(f"wrote {len(out_rows)} targets → {out_path}")
+    return 0
+
+
+def _parse_subject_attr(value: str) -> tuple[str | None, str | None]:
+    """ORTHRUS subject attrs look like: '/usr/bin/firefox firefox-bin -P default -e'.
+
+    Heuristic: the FIRST whitespace-separated token that starts with `/` or
+    contains a slash is the path; everything else is the command line.
+    """
+    if not isinstance(value, str) or not value.strip():
+        return None, None
+    tokens = value.strip().split()
+    path = None
+    for tok in tokens:
+        if tok.startswith("/") or "/" in tok:
+            path = tok
+            break
+    return path, value.strip()
+
+
+def _select_diverse_benign(
+    candidate_universe_path: Path,
+    *,
+    num: int,
+    exclude_uuids: set[str],
+    allowed_paths: set[str] | None,
+) -> list[dict]:
+    rows: list[dict] = []
+    by_path: dict[str, list[dict]] = defaultdict(list)
+    with candidate_universe_path.open() as handle:
+        for line in handle:
+            if not line.strip():
+                continue
+            r = json.loads(line)
+            cid = r.get("candidate_id")
+            if not cid or cid in exclude_uuids:
+                continue
+            sample_events = r.get("sample_raw_event_ids") or []
+            if not sample_events:
+                continue
+            path = r.get("process_path") or "unknown"
+            if allowed_paths is not None and path not in allowed_paths:
+                continue
+            by_path[path].append(r)
+
+    # Stratify: round-robin over distinct process_paths to maximize diversity.
+    paths_sorted = sorted(by_path.keys(), key=lambda p: (-len(by_path[p]), p))
+    picked: list[dict] = []
+    while len(picked) < num and paths_sorted:
+        for p in list(paths_sorted):
+            if not by_path[p]:
+                paths_sorted.remove(p)
+                continue
+            picked.append(by_path[p].pop(0))
+            if len(picked) >= num:
+                break
+
+    for r in picked:
+        rows.append({
+            "target_id": r["candidate_id"],
+            "target_type": "PROCESS",
+            "label": "benign_proxy",
+            "label_confidence": "unverified",
+            "cohort": "hard_negative_proxy",
+            "anchor_event_id": str((r.get("sample_raw_event_ids") or [None])[0]),
+            "atom_id": None,
+            "label_source": "candidate_not_in_orthrus_ground_truth",
+            "matched_event_count": 0,
+            "weak_signal_score": _safe_float(r.get("weak_signal_score")),
+            "candidate_total_events": _safe_int(r.get("total_events")),
+            "candidate_estimated_prompt_tokens": _safe_int(r.get("estimated_prompt_tokens")),
+            "process_path": r.get("process_path"),
+            "command_line": r.get("command_line"),
+            "prompt_allowed_label_fields": False,
+            "notes": [
+                "Hard negative proxy: process not in ORTHRUS ground truth and not matching any attack atom.",
+                "Diversity-stratified across process paths from candidate_universe.",
+            ],
+        })
+    return rows
+
+
+def _safe_float(value):
+    try: return float(value)
+    except (TypeError, ValueError): return None
+
+
+def _safe_int(value):
+    try: return int(value)
+    except (TypeError, ValueError): return None
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
--- a/scripts/map_theia_ground_truth.py
+++ b/scripts/map_theia_ground_truth.py
@@ -0,0 +1,144 @@
+#!/usr/bin/env python3
+"""Map E3 THEIA ground-truth atoms to THEIA events/processes."""
+
+from __future__ import annotations
+
+import argparse
+import json
+from pathlib import Path
+
+from er_tp_dgp.ground_truth_mapping import (
+    evaluate_candidate_recall,
+    match_theia_ground_truth_atoms,
+    read_ground_truth_atoms_jsonl,
+)
+from er_tp_dgp.theia import discover_theia_json_files
+
+
+def main() -> None:
+    parser = argparse.ArgumentParser(
+        description=(
+            "Map label-only E3 ground-truth atoms to THEIA events/processes. "
+            "Outputs are forbidden from prompt construction."
+        )
+    )
+    parser.add_argument("--data-dir", default="data/raw/e3_theia_json")
+    parser.add_argument(
+        "--input-file",
+        action="append",
+        default=None,
+        help="Specific THEIA JSON file to scan. Can be repeated. Overrides --data-dir discovery.",
+    )
+    parser.add_argument("--atoms", default="reports/ground_truth/e3/ground_truth_atoms.jsonl")
+    parser.add_argument("--candidate-jsonl", default="reports/theia_candidate_universe/candidate_universe.jsonl")
+    parser.add_argument("--output-dir", default="reports/ground_truth/e3_mapping")
+    parser.add_argument("--max-lines", type=int, default=None)
+    parser.add_argument("--max-lines-per-file", type=int, default=None)
+    parser.add_argument("--min-score", type=float, default=3.0)
+    parser.add_argument("--include-term-only", action="store_true")
+    parser.add_argument("--require-time-window", action="store_true")
+    parser.add_argument("--time-window-hours", type=float, default=6.0)
+    parser.add_argument(
+        "--recall-min-confidence",
+        choices=("low", "medium", "high"),
+        default="high",
+        help="Minimum mapped label confidence used for candidate recall.",
+    )
+    parser.add_argument(
+        "--timezone-offsets-hours",
+        default="0",
+        help="Comma-separated local offsets to try when interpreting ground-truth times.",
+    )
+    parser.add_argument(
+        "--include-target-network-ips",
+        action="store_true",
+        help="Allow 128.55.12.* target network addresses to act as hard match indicators.",
+    )
+    args = parser.parse_args()
+
+    paths = [Path(path) for path in args.input_file] if args.input_file else discover_theia_json_files(args.data_dir)
+    if not paths:
+        raise SystemExit(f"no THEIA JSON files found under {args.data_dir}")
+    atoms = read_ground_truth_atoms_jsonl(args.atoms)
+    offsets = tuple(int(value) for value in args.timezone_offsets_hours.split(",") if value.strip())
+
+    report = match_theia_ground_truth_atoms(
+        paths,
+        atoms,
+        max_lines=args.max_lines,
+        max_lines_per_file=args.max_lines_per_file,
+        min_score=args.min_score,
+        include_term_only=args.include_term_only,
+        require_time_window=args.require_time_window,
+        time_window_hours=args.time_window_hours,
+        timezone_offsets_hours=offsets or (0,),
+        ignore_target_network_prefixes=() if args.include_target_network_ips else ("128.55.12.",),
+    )
+
+    output_dir = Path(args.output_dir)
+    output_dir.mkdir(parents=True, exist_ok=True)
+    event_path = output_dir / "event_matches.jsonl"
+    process_path = output_dir / "process_labels.jsonl"
+    report_path = output_dir / "mapping_report.md"
+    report.write_event_jsonl(event_path)
+    report.write_process_jsonl(process_path)
+    report_path.write_text(report.to_markdown() + "\n", encoding="utf-8")
+
+    filtered_event_path = output_dir / f"event_matches_{args.recall_min_confidence}_plus.jsonl"
+    filtered_process_path = output_dir / f"process_labels_{args.recall_min_confidence}_plus.jsonl"
+    _write_filtered_event_matches(filtered_event_path, report.event_matches, args.recall_min_confidence)
+    _write_filtered_process_labels(filtered_process_path, report.process_labels, args.recall_min_confidence)
+
+    recall = evaluate_candidate_recall(
+        args.candidate_jsonl,
+        report.process_labels,
+        report.event_matches,
+        min_confidence=args.recall_min_confidence,
+    )
+    recall_json = output_dir / "candidate_recall.json"
+    recall_md = output_dir / "candidate_recall.md"
+    recall_json.write_text(
+        json.dumps(recall.to_json_dict(), indent=2, sort_keys=True, ensure_ascii=False) + "\n",
+        encoding="utf-8",
+    )
+    recall_md.write_text(recall.to_markdown() + "\n", encoding="utf-8")
+
+    print(
+        f"atoms={report.atoms_seen} lines_seen={report.lines_seen} "
+        f"events_seen={report.events_seen} event_matches={len(report.event_matches)} "
+        f"process_labels={len(report.process_labels)}"
+    )
+    print(f"candidate_process_recall={recall.process_recall}")
+    print(f"event_subject_recall={recall.event_subject_recall}")
+    print(f"wrote {event_path}")
+    print(f"wrote {process_path}")
+    print(f"wrote {filtered_event_path}")
+    print(f"wrote {filtered_process_path}")
+    print(f"wrote {report_path}")
+    print(f"wrote {recall_json}")
+
+
+def _write_filtered_event_matches(path, matches, min_confidence):
+    path = Path(path)
+    path.parent.mkdir(parents=True, exist_ok=True)
+    with path.open("w", encoding="utf-8") as handle:
+        for match in matches:
+            if _confidence_rank(match.confidence) >= _confidence_rank(min_confidence):
+                handle.write(json.dumps(match.to_json_dict(), ensure_ascii=False, sort_keys=True) + "\n")
+
+
+def _write_filtered_process_labels(path, labels, min_confidence):
+    path = Path(path)
+    path.parent.mkdir(parents=True, exist_ok=True)
+    with path.open("w", encoding="utf-8") as handle:
+        for label in labels:
+            if _confidence_rank(label.confidence) >= _confidence_rank(min_confidence):
+                handle.write(json.dumps(label.to_json_dict(), ensure_ascii=False, sort_keys=True) + "\n")
+
+
+def _confidence_rank(value):
+    return {"low": 0, "medium": 1, "high": 2}.get(value, -1)
+
+
+if __name__ == "__main__":
+    main()
--- a/scripts/retry_skipped_llm.py
+++ b/scripts/retry_skipped_llm.py
@@ -0,0 +1,127 @@
+#!/usr/bin/env python3
+"""Retry LLM inference for prompts that were skipped due to transient errors.
+
+Reads predictions_jsonl, identifies rows with ``skipped: true``, looks up
+the corresponding prompt files, and re-runs LLM inference with retries.
+Successful retries replace the skipped row; persistent failures keep
+the original skip row.
+
+Adds in-process exponential-backoff retry on the API ``no choices``
+response — the proxy hche3637.com returns transient empty bodies that
+look like HTTP-200 but lack ``choices``.
+"""
+
+from __future__ import annotations
+
+import argparse
+import json
+import time
+from pathlib import Path
+from typing import Any
+
+from er_tp_dgp.llm import OpenAICompatibleHTTPProvider
+from er_tp_dgp.llm_config import load_llm_config
+
+
+def _read_predictions(path: Path) -> list[dict[str, Any]]:
+    rows: list[dict[str, Any]] = []
+    with path.open("r", encoding="utf-8") as handle:
+        for line in handle:
+            line = line.strip()
+            if not line:
+                continue
+            rows.append(json.loads(line))
+    return rows
+
+
+def _retry_inference(
+    provider: OpenAICompatibleHTTPProvider,
+    target_id: str,
+    prompt_text: str,
+    *,
+    max_attempts: int,
+    backoff_seconds: float,
+) -> tuple[dict[str, Any] | None, str | None]:
+    """Try up to ``max_attempts`` times with exponential backoff. Returns
+    (payload, error_str). On success, payload is the to_json_dict() result."""
+    last_error: str | None = None
+    for attempt in range(1, max_attempts + 1):
+        try:
+            result = provider.classify(target_id=target_id, prompt_text=prompt_text)
+            return result.to_json_dict(), None
+        except Exception as exc:  # noqa: BLE001
+            last_error = f"{type(exc).__name__}: {str(exc)[:200]}"
+            if attempt < max_attempts:
+                time.sleep(backoff_seconds * (2 ** (attempt - 1)))
+    return None, last_error
+
+
+def main() -> int:
+    parser = argparse.ArgumentParser(description=__doc__)
+    parser.add_argument("--predictions-jsonl", required=True)
+    parser.add_argument("--prompt-dir", required=True)
+    parser.add_argument("--config", default="configs/llm.yaml")
+    parser.add_argument("--max-attempts", type=int, default=4)
+    parser.add_argument("--backoff-seconds", type=float, default=2.0)
+    parser.add_argument("--output-jsonl", default=None,
+                        help="Defaults to predictions-jsonl (in-place).")
+    args = parser.parse_args()
+
+    config = load_llm_config(args.config)
+    # Honor request_logprobs from upstream config — does NOT enable here
+    # by default since the proxy seems to ignore it anyway.
+    provider = OpenAICompatibleHTTPProvider(config)
+
+    predictions_path = Path(args.predictions_jsonl)
+    prompt_dir = Path(args.prompt_dir)
+    rows = _read_predictions(predictions_path)
+    skipped = [r for r in rows if r.get("skipped")]
+    print(f"[retry] total rows: {len(rows)}, skipped: {len(skipped)}", flush=True)
+
+    successes = 0
+    persistent_failures = 0
+    for row in rows:
+        if not row.get("skipped"):
+            continue
+        target_id = row.get("target_id")
+        prompt_file = prompt_dir / f"{target_id}.txt"
+        if not prompt_file.exists():
+            print(f"[retry] {target_id}: prompt file missing, keeping skip", flush=True)
+            persistent_failures += 1
+            continue
+        prompt_text = prompt_file.read_text(encoding="utf-8")
+        payload, error = _retry_inference(
+            provider,
+            target_id=target_id,
+            prompt_text=prompt_text,
+            max_attempts=args.max_attempts,
+            backoff_seconds=args.backoff_seconds,
+        )
+        if payload is None:
+            print(f"[retry] {target_id}: persistent failure: {error}", flush=True)
+            row["skip_reason"] = f"after {args.max_attempts} retries: {error}"
+            persistent_failures += 1
+            continue
+        # Replace the skipped row with the successful payload.
+        payload["prompt_file"] = str(prompt_file)
+        row.clear()
+        row.update(payload)
+        successes += 1
+        print(f"[retry] {target_id}: SUCCESS {payload.get('output', {}).get('first_token_label')}",
+              flush=True)
+
+    output_path = Path(args.output_jsonl) if args.output_jsonl else predictions_path
+    output_path.parent.mkdir(parents=True, exist_ok=True)
+    with output_path.open("w", encoding="utf-8") as handle:
+        for row in rows:
+            handle.write(json.dumps(row, ensure_ascii=False, sort_keys=True) + "\n")
+    print(
+        f"[retry] DONE successes={successes} persistent_failures={persistent_failures} "
+        f"wrote={output_path}",
+        flush=True,
+    )
+    return 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
--- a/scripts/run_evaluation.py
+++ b/scripts/run_evaluation.py
@@ -0,0 +1,226 @@
+#!/usr/bin/env python3
+"""End-to-end evaluation: join LLM predictions with labels and aggregate metrics.
+
+Inputs:
+    --predictions-jsonl   One file per method variant, produced by
+                          run_llm_inference.py. The file's basename is used as
+                          the method name in the metrics table.
+    --labeled-targets     evaluation_batch jsonl (target_id, label, ...)
+
+Output:
+    --output-dir/metrics.md       Paper-Table-2-style table:
+                                    method | AUPRC | AUROC | Macro-F1 |
+                                    Recall@10 | FPR@0.9 | avg_tokens |
+                                    avg_latency | evidence_path_hit_rate
+    --output-dir/metrics.json     Machine-readable equivalent.
+
+Each row uses the calibrated first-token softmax score from
+``LLMInferenceResult.first_token_score`` (DGP paper formula 14). If a row's
+score is missing, it is excluded from the metrics with a warning.
+"""
+
+from __future__ import annotations
+
+import argparse
+import json
+import logging
+from pathlib import Path
+
+from er_tp_dgp.metrics import PredictionRecord, evaluate_classification
+
+
+_log = logging.getLogger("run_evaluation")
+
+
+def main() -> int:
+    parser = argparse.ArgumentParser(description=__doc__.split("\n", 1)[0])
+    parser.add_argument(
+        "--predictions-jsonl",
+        action="append",
+        required=True,
+        help="Repeat once per method variant. Filename stem is used as method name.",
+    )
+    parser.add_argument("--labeled-targets", required=True)
+    parser.add_argument("--output-dir", required=True)
+    parser.add_argument(
+        "--k-values",
+        type=int,
+        nargs="+",
+        default=[1, 5, 10],
+    )
+    parser.add_argument(
+        "--recall-levels",
+        type=float,
+        nargs="+",
+        default=[0.8, 0.9],
+    )
+    args = parser.parse_args()
+
+    output_dir = Path(args.output_dir)
+    output_dir.mkdir(parents=True, exist_ok=True)
+
+    labels = _index_labels(Path(args.labeled_targets))
+
+    method_metrics: dict[str, dict] = {}
+    for path in args.predictions_jsonl:
+        prediction_path = Path(path)
+        method_name = prediction_path.stem
+        records = _build_prediction_records(prediction_path, labels)
+        if not records:
+            _log.warning("No usable predictions in %s; skipping.", prediction_path)
+            continue
+        metrics = evaluate_classification(
+            records, k_values=args.k_values, recall_levels=args.recall_levels
+        )
+        method_metrics[method_name] = {
+            "metrics": metrics.to_dict(),
+            "num_records_used": len(records),
+            "predictions_path": str(prediction_path),
+        }
+
+    (output_dir / "metrics.json").write_text(
+        json.dumps(method_metrics, ensure_ascii=False, sort_keys=True, indent=2),
+        encoding="utf-8",
+    )
+    (output_dir / "metrics.md").write_text(_render_markdown_table(method_metrics), encoding="utf-8")
+    print(f"wrote {output_dir/'metrics.md'}")
+    print(f"wrote {output_dir/'metrics.json'}")
+    return 0
+
+
+def _index_labels(path: Path) -> dict[str, dict]:
+    labels: dict[str, dict] = {}
+    with path.open("r", encoding="utf-8") as handle:
+        for line in handle:
+            line = line.strip()
+            if not line:
+                continue
+            row = json.loads(line)
+            target_id = row.get("target_id")
+            if target_id:
+                labels[target_id] = row
+    return labels
+
+
+def _build_prediction_records(
+    predictions_path: Path, labels: dict[str, dict]
+) -> list[PredictionRecord]:
+    records: list[PredictionRecord] = []
+    with predictions_path.open("r", encoding="utf-8") as handle:
+        for line in handle:
+            line = line.strip()
+            if not line:
+                continue
+            payload = json.loads(line)
+            target_id = payload.get("target_id")
+            output = payload.get("output") or {}
+            score = (
+                payload.get("first_token_score")
+                if payload.get("first_token_score") is not None
+                else output.get("score")
+            )
+            if score is None:
+                # Fallback: many OpenAI-compatible endpoints don't honor logprobs.
+                # Derive a degraded binary score from the first-token label so the
+                # row is still usable (Macro-F1 / Precision@K stay valid; AUROC
+                # collapses but AUPRC still works on rank order).
+                first_label = (output.get("first_token_label") or "").upper()
+                predicted_upper = str(output.get("predicted_label") or "").upper()
+                if first_label == "MALICIOUS" or predicted_upper == "MALICIOUS":
+                    score = 1.0
+                elif first_label == "BENIGN" or predicted_upper == "BENIGN":
+                    score = 0.0
+                else:
+                    _log.warning(
+                        "missing first-token score AND no usable label for %s; skipping",
+                        target_id,
+                    )
+                    continue
+            # Prompt-batch filenames carry an "NNNN_<uuid>" prefix (see
+            # build_theia_prompt_batch.py:_safe_id). Recover the bare UUID
+            # so that labeled_targets.jsonl lookups succeed.
+            label_row = labels.get(target_id)
+            if not label_row and isinstance(target_id, str) and "_" in target_id:
+                bare = target_id.split("_", 1)[1]
+                label_row = labels.get(bare)
+                if label_row:
+                    target_id = bare
+            if not label_row:
+                _log.warning("no label for %s; skipping", target_id)
+                continue
+            true_label = "malicious" if label_row.get("label") == "malicious" else "benign"
+            predicted = output.get("predicted_label", "BENIGN")
+            predicted_label = "malicious" if str(predicted).upper() == "MALICIOUS" else "benign"
+            records.append(
+                PredictionRecord(
+                    target_id=target_id,
+                    target_type=label_row.get("target_type", "PROCESS"),
+                    score=float(max(0.0, min(1.0, score))),
+                    predicted_label=predicted_label,
+                    true_label=true_label,
+                    timestamp=label_row.get("anchor_timestamp"),
+                    evidence_path_ids=tuple(output.get("evidence_path_ids") or ()),
+                    prompt_tokens=payload.get("prompt_tokens"),
+                    inference_cost=None,
+                    prompt_construction_time=None,
+                )
+            )
+    return records
+
+
+def _render_markdown_table(method_metrics: dict[str, dict]) -> str:
+    if not method_metrics:
+        return "# ER-TP-DGP Evaluation\n\nNo method metrics produced.\n"
+    headers = [
+        "method",
+        "n",
+        "n+",
+        "AUPRC",
+        "AUROC",
+        "Macro-F1",
+        "Recall@10",
+        "FPR@0.9",
+        "avg_tokens",
+        "evidence_hit",
+    ]
+    rows: list[list[str]] = []
+    for method_name, payload in sorted(method_metrics.items()):
+        m = payload["metrics"]
+        rows.append(
+            [
+                method_name,
+                str(m["num_examples"]),
+                str(m["num_positive"]),
+                _fmt(m["auprc"]),
+                _fmt(m["auroc"]),
+                _fmt(m["macro_f1"]),
+                _fmt(m["recall_at_k"].get(10)),
+                _fmt(m["fpr_at_recall"].get(0.9)),
+                _fmt(m["avg_prompt_tokens"]),
+                _fmt(m["evidence_path_hit_rate"]),
+            ]
+        )
+    lines = [
+        "# ER-TP-DGP Evaluation",
+        "",
+        "Per-method metrics. Score column is calibrated first-token softmax over (Yes, No)",
+        "(DGP paper formula 14). Records missing logprobs are excluded with a warning.",
+        "",
+        "| " + " | ".join(headers) + " |",
+        "|" + "|".join(["---"] * len(headers)) + "|",
+    ]
+    for row in rows:
+        lines.append("| " + " | ".join(row) + " |")
+    return "\n".join(lines) + "\n"
+
+
+def _fmt(value) -> str:
+    if isinstance(value, float):
+        return f"{value:.4f}"
+    if value is None:
+        return "n/a"
+    return str(value)
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
--- a/scripts/run_hybrid_experiment.sh
+++ b/scripts/run_hybrid_experiment.sh
@@ -0,0 +1,103 @@
+#!/usr/bin/env bash
+# End-to-end hybrid (community + v0.1 fine-grained) experiment driver.
+#
+# Steps:
+#   1) Build hybrid prompts on a balanced set of communities (Phase 14 +
+#      v0.1 fine-grained re-injection).
+#   2) Build a parallel set of Phase 14 raw landmark-only prompts on the
+#      SAME communities (head-to-head ablation).
+#   3) Convert prompt metadata → labeled_targets.jsonl.
+#   4) Run LLM inference on both prompt sets.
+#   5) Run evaluation, write metrics.md.
+#
+# Usage:
+#   bash scripts/run_hybrid_experiment.sh [BENIGN_PER_MALICIOUS]
+#
+# Defaults to BENIGN_PER_MALICIOUS=24 → 6 mal + 144 ben = 150 communities,
+# matching the v0.1 evaluation scale of n=146.
+
+set -euo pipefail
+
+BENIGN_PER_MAL=${1:-24}
+OUT_ROOT="reports/hybrid_v0_3"
+PROMPTS_HYBRID="${OUT_ROOT}/prompts_hybrid"
+PROMPTS_RAW="${OUT_ROOT}/prompts_landmark_raw"
+LABELED_TARGETS="${OUT_ROOT}/labeled_targets.jsonl"
+PRED_HYBRID="${OUT_ROOT}/predictions_hybrid.jsonl"
+PRED_RAW="${OUT_ROOT}/predictions_landmark_raw.jsonl"
+METRICS_DIR="${OUT_ROOT}/metrics"
+
+mkdir -p "${OUT_ROOT}" "${METRICS_DIR}"
+
+LANDMARK_DIR="reports/landmark_csg"
+COMMUNITIES="${LANDMARK_DIR}/landmark_communities.jsonl"
+LANDMARKS="${LANDMARK_DIR}/landmarks.jsonl"
+EDGES="${LANDMARK_DIR}/landmark_edges.jsonl"
+LABELED_COMMUNITIES="${LANDMARK_DIR}/labeled_communities.jsonl"
+
+echo "=== STEP 1: build hybrid prompts (community + v0.1 fine-grained) ==="
+.venv/bin/python -u scripts/build_hybrid_community_prompts.py \
+    --communities "${COMMUNITIES}" \
+    --landmarks "${LANDMARKS}" \
+    --landmark-edges "${EDGES}" \
+    --labeled-communities "${LABELED_COMMUNITIES}" \
+    --output-dir "${PROMPTS_HYBRID}" \
+    --include-only balanced \
+    --benign-per-malicious "${BENIGN_PER_MAL}" \
+    --margin-seconds 60 \
+    --max-events-per-community 5000 \
+    --max-landmarks-in-prompt 60 \
+    --max-edges-in-prompt 80 \
+    --top-m-per-metapath 5 \
+    --progress-every 2000000
+
+echo "=== STEP 2: build Phase 14 raw landmark prompts on the SAME communities ==="
+.venv/bin/python -u scripts/build_landmark_prompts_for_ids.py \
+    --communities "${COMMUNITIES}" \
+    --landmarks "${LANDMARKS}" \
+    --landmark-edges "${EDGES}" \
+    --labeled-communities "${LABELED_COMMUNITIES}" \
+    --ids-from-metadata "${PROMPTS_HYBRID}/prompt_metadata.jsonl" \
+    --output-dir "${PROMPTS_RAW}" \
+    --max-landmarks-in-prompt 60 \
+    --max-edges-in-prompt 80
+
+echo "=== STEP 3: build labeled_targets.jsonl from hybrid metadata ==="
+.venv/bin/python -u scripts/build_hybrid_labeled_targets.py \
+    --prompt-metadata "${PROMPTS_HYBRID}/prompt_metadata.jsonl" \
+    --output "${LABELED_TARGETS}"
+
+echo "=== STEP 4a: LLM inference on hybrid prompts ==="
+.venv/bin/python -u scripts/run_llm_inference.py \
+    --config configs/llm.yaml \
+    --prompt-dir "${PROMPTS_HYBRID}/prompts" \
+    --output-jsonl "${PRED_HYBRID}" \
+    --request-logprobs \
+    --max-prompt-chars 200000
+
+echo "=== STEP 4b: LLM inference on Phase 14 raw landmark prompts (same set) ==="
+.venv/bin/python -u scripts/run_llm_inference.py \
+    --config configs/llm.yaml \
+    --prompt-dir "${PROMPTS_RAW}/prompts" \
+    --output-jsonl "${PRED_RAW}" \
+    --request-logprobs \
+    --max-prompt-chars 200000
+
+echo "=== STEP 5: aggregate metrics ==="
+.venv/bin/python -u scripts/run_evaluation.py \
+    --predictions-jsonl "${PRED_HYBRID}" \
+    --predictions-jsonl "${PRED_RAW}" \
+    --labeled-targets "${LABELED_TARGETS}" \
+    --output-dir "${METRICS_DIR}"
+
+echo "=== STEP 6: cross-compare with v0.1/v0.2 baselines ==="
+.venv/bin/python -u scripts/summarize_hybrid_experiment.py \
+    --hybrid-metrics "${METRICS_DIR}/metrics.json" \
+    --output "${OUT_ROOT}/summary.md"
+
+echo "=== ALL STAGES COMPLETE ==="
+echo "Metrics:"
+cat "${METRICS_DIR}/metrics.md"
+echo
+echo "Summary:"
+cat "${OUT_ROOT}/summary.md"
--- a/scripts/run_hybrid_inference_local.sh
+++ b/scripts/run_hybrid_inference_local.sh
@@ -0,0 +1,81 @@
+#!/usr/bin/env bash
+# Continuation of run_hybrid_experiment.sh — resumes from STEP 4 onward
+# using local_hf (HuggingFace transformers) provider instead of the API.
+# Steps 1-3 (prompt build + labeled targets) already completed; reuse their outputs.
+#
+# Usage:
+#   bash scripts/run_hybrid_inference_local.sh [MODEL]
+# Default MODEL = Qwen/Qwen3.5-27B (matches v0.1/v0.2 baselines).
+
+set -euo pipefail
+
+MODEL=${1:-Qwen/Qwen3.5-27B}
+MAX_GIB=${HYBRID_MAX_GIB:-30}
+
+OUT_ROOT="reports/hybrid_v0_3"
+PROMPTS_HYBRID="${OUT_ROOT}/prompts_hybrid"
+PROMPTS_RAW="${OUT_ROOT}/prompts_landmark_raw"
+LABELED_TARGETS="${OUT_ROOT}/labeled_targets.jsonl"
+PRED_HYBRID="${OUT_ROOT}/predictions_hybrid_local.jsonl"
+PRED_RAW="${OUT_ROOT}/predictions_landmark_raw_local.jsonl"
+METRICS_DIR="${OUT_ROOT}/metrics_local"
+
+mkdir -p "${OUT_ROOT}" "${METRICS_DIR}"
+
+# Step 2 redo: ensure raw landmark prompts cover the SAME 150 community ids.
+# (The first run hit a bash-mid-execution cache miss and used the legacy
+# build_landmark_prompts.py which only produced 12 prompts.)
+RAW_COUNT=$(ls "${PROMPTS_RAW}/prompts" 2>/dev/null | wc -l | tr -d ' ')
+if [[ "${RAW_COUNT}" != "150" ]]; then
+    echo "=== STEP 2 (redo): build raw landmark prompts for the same 150 ids ==="
+    .venv/bin/python -u scripts/build_landmark_prompts_for_ids.py \
+        --communities reports/landmark_csg/landmark_communities.jsonl \
+        --landmarks reports/landmark_csg/landmarks.jsonl \
+        --landmark-edges reports/landmark_csg/landmark_edges.jsonl \
+        --labeled-communities reports/landmark_csg/labeled_communities.jsonl \
+        --ids-from-metadata "${PROMPTS_HYBRID}/prompt_metadata.jsonl" \
+        --output-dir "${PROMPTS_RAW}" \
+        --max-landmarks-in-prompt 60 \
+        --max-edges-in-prompt 80
+fi
+
+echo "=== STEP 4a: LLM inference on hybrid prompts (local_hf, ${MODEL}) ==="
+.venv/bin/python -u scripts/run_llm_inference.py \
+    --provider local_hf \
+    --model "${MODEL}" \
+    --dtype bf16 \
+    --device-map auto \
+    --max-memory-per-gpu-gib "${MAX_GIB}" \
+    --prompt-dir "${PROMPTS_HYBRID}/prompts" \
+    --output-jsonl "${PRED_HYBRID}" \
+    --max-prompt-chars 200000
+
+echo "=== STEP 4b: LLM inference on Phase 14 raw landmark prompts (same set) ==="
+.venv/bin/python -u scripts/run_llm_inference.py \
+    --provider local_hf \
+    --model "${MODEL}" \
+    --dtype bf16 \
+    --device-map auto \
+    --max-memory-per-gpu-gib "${MAX_GIB}" \
+    --prompt-dir "${PROMPTS_RAW}/prompts" \
+    --output-jsonl "${PRED_RAW}" \
+    --max-prompt-chars 200000
+
+echo "=== STEP 5: aggregate metrics ==="
+.venv/bin/python -u scripts/run_evaluation.py \
+    --predictions-jsonl "${PRED_HYBRID}" \
+    --predictions-jsonl "${PRED_RAW}" \
+    --labeled-targets "${LABELED_TARGETS}" \
+    --output-dir "${METRICS_DIR}"
+
+echo "=== STEP 6: cross-compare with v0.1/v0.2 baselines ==="
+.venv/bin/python -u scripts/summarize_hybrid_experiment.py \
+    --hybrid-metrics "${METRICS_DIR}/metrics.json" \
+    --output "${OUT_ROOT}/summary_local.md"
+
+echo "=== ALL STAGES COMPLETE ==="
+echo "Metrics:"
+cat "${METRICS_DIR}/metrics.md"
+echo
+echo "Summary:"
+cat "${OUT_ROOT}/summary_local.md"
--- a/scripts/run_llm_inference.py
+++ b/scripts/run_llm_inference.py
@@ -0,0 +1,207 @@
+#!/usr/bin/env python3
+"""Run OpenAI-compatible LLM inference for saved ER-TP-DGP prompts."""
+
+from __future__ import annotations
+
+import argparse
+import json
+from pathlib import Path
+
+from er_tp_dgp.llm import LocalHFLogitsProvider, OpenAICompatibleHTTPProvider
+from er_tp_dgp.llm_config import load_llm_config, merge_llm_config
+
+
+def main() -> int:
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--config", help="YAML LLM config file, e.g. configs/llm.yaml")
+    parser.add_argument("--provider", choices=["api", "local", "local_hf"])
+    parser.add_argument("--base-url")
+    parser.add_argument("--model")
+    parser.add_argument("--prompt-file", action="append", default=[])
+    parser.add_argument("--prompt-dir")
+    parser.add_argument("--output-jsonl", default="reports/llm_predictions.jsonl")
+    parser.add_argument("--api-key-env", default=None)
+    parser.add_argument("--timeout-seconds", type=float)
+    parser.add_argument("--temperature", type=float)
+    parser.add_argument("--max-tokens", type=int)
+    parser.add_argument(
+        "--request-logprobs",
+        action="store_true",
+        help="(API/local-OpenAI) Ask server for first-token top_logprobs and "
+        "compute calibrated softmax score (DGP formula 14).",
+    )
+    parser.add_argument("--lora-adapter", default=None, help="(local_hf) path to LoRA adapter.")
+    parser.add_argument("--dtype", default="bf16", choices=["bf16", "fp16", "fp32"])
+    parser.add_argument("--device-map", default="auto")
+    parser.add_argument(
+        "--model-class",
+        default="auto",
+        choices=["auto", "causal_lm", "image_text_to_text", "seq2seq"],
+        help=(
+            "(local_hf) HF AutoModelFor* class. 'auto' inspects "
+            "config.architectures (multimodal Qwen3.5-27B → image_text_to_text)."
+        ),
+    )
+    parser.add_argument(
+        "--max-memory-per-gpu-gib",
+        type=float,
+        default=None,
+        help=(
+            "(local_hf) Cap per-GPU memory so accelerate balances across cards "
+            "instead of filling GPU 0. Use ~30 for 2x A100 40GB on Qwen3.5-27B."
+        ),
+    )
+    parser.add_argument(
+        "--max-prompt-chars",
+        type=int,
+        default=None,
+        help=(
+            "Skip any prompt larger than this (chars). Outliers (e.g. firefox "
+            "30s windows producing 1M+ tokens) trigger attention OOM even with "
+            "SDPA. The skipped target gets first_token_score=None and is "
+            "excluded by the metrics aggregator."
+        ),
+    )
+    args = parser.parse_args()
+
+    prompt_files = [Path(path) for path in args.prompt_file]
+    if args.prompt_dir:
+        prompt_files.extend(sorted(Path(args.prompt_dir).glob("*.txt")))
+    if not prompt_files:
+        raise SystemExit("No prompt files provided. Use --prompt-file or --prompt-dir.")
+
+    if args.provider == "local_hf":
+        if not args.model:
+            raise SystemExit("local_hf requires --model (HF model id, e.g. Qwen/Qwen3-8B).")
+        provider = LocalHFLogitsProvider(
+            base_model=args.model,
+            lora_adapter=args.lora_adapter,
+            dtype=args.dtype,
+            device_map=args.device_map,
+            model_class=args.model_class,
+            max_memory_per_gpu_gib=args.max_memory_per_gpu_gib,
+        )
+    else:
+        if args.config:
+            config = load_llm_config(args.config)
+            config = merge_llm_config(
+                config,
+                provider=args.provider,
+                base_url=args.base_url,
+                model=args.model,
+                api_key_env=args.api_key_env,
+                timeout_seconds=args.timeout_seconds,
+                temperature=args.temperature,
+                max_tokens=args.max_tokens,
+            )
+        else:
+            missing = [
+                name
+                for name, value in (
+                    ("--provider", args.provider),
+                    ("--base-url", args.base_url),
+                    ("--model", args.model),
+                )
+                if not value
+            ]
+            if missing:
+                raise SystemExit(
+                    f"Missing required arguments without --config: {', '.join(missing)}"
+                )
+            config = merge_llm_config(
+                load_default_inline_config(args.provider, args.base_url, args.model),
+                api_key_env=args.api_key_env,
+                timeout_seconds=args.timeout_seconds,
+                temperature=args.temperature,
+                max_tokens=args.max_tokens,
+            )
+        if args.request_logprobs:
+            config = LLMRequestConfig_with_logprobs(config)
+        provider = OpenAICompatibleHTTPProvider(config)
+
+    output_path = Path(args.output_jsonl)
+    output_path.parent.mkdir(parents=True, exist_ok=True)
+    skipped = 0
+    with output_path.open("w", encoding="utf-8") as handle:
+        for idx, prompt_file in enumerate(prompt_files, start=1):
+            prompt_text = prompt_file.read_text(encoding="utf-8")
+            target_id = prompt_file.stem
+            if args.max_prompt_chars is not None and len(prompt_text) > args.max_prompt_chars:
+                payload = {
+                    "target_id": target_id,
+                    "prompt_file": str(prompt_file),
+                    "skipped": True,
+                    "skip_reason": f"prompt size {len(prompt_text)} > --max-prompt-chars {args.max_prompt_chars}",
+                    "first_token_score": None,
+                    "first_token_yes_logprob": None,
+                    "first_token_no_logprob": None,
+                    "output": {"first_token_label": None, "score": None, "predicted_label": None,
+                               "evidence_path_ids": []},
+                }
+                handle.write(json.dumps(payload, ensure_ascii=False, sort_keys=True) + "\n")
+                handle.flush()
+                skipped += 1
+                print(f"[{idx}/{len(prompt_files)}] {prompt_file}: SKIP ({len(prompt_text)} chars > cap)")
+                continue
+            try:
+                result = provider.classify(target_id=target_id, prompt_text=prompt_text)
+            except Exception as exc:  # noqa: BLE001 - any GPU/inference error → skip, keep batch alive
+                payload = {
+                    "target_id": target_id,
+                    "prompt_file": str(prompt_file),
+                    "skipped": True,
+                    "skip_reason": f"inference error: {type(exc).__name__}: {str(exc)[:200]}",
+                    "first_token_score": None,
+                    "first_token_yes_logprob": None,
+                    "first_token_no_logprob": None,
+                    "output": {"first_token_label": None, "score": None, "predicted_label": None,
+                               "evidence_path_ids": []},
+                }
+                handle.write(json.dumps(payload, ensure_ascii=False, sort_keys=True) + "\n")
+                handle.flush()
+                skipped += 1
+                print(f"[{idx}/{len(prompt_files)}] {prompt_file}: ERROR {type(exc).__name__} (continuing)")
+                # Free CUDA cache before next prompt to avoid cascading OOM.
+                try:
+                    import torch
+                    if torch.cuda.is_available():
+                        torch.cuda.empty_cache()
+                except Exception:
+                    pass
+                continue
+            payload = result.to_json_dict()
+            payload["prompt_file"] = str(prompt_file)
+            handle.write(json.dumps(payload, ensure_ascii=False, sort_keys=True) + "\n")
+            score = (
+                result.first_token_score
+                if result.first_token_score is not None
+                else result.output.score
+            )
+            print(
+                f"{prompt_file}: {result.output.first_token_label} "
+                f"score={score} latency={result.latency_seconds:.2f}s"
+            )
+    print(f"wrote={output_path}")
+    return 0
+
+
+def load_default_inline_config(provider: str, base_url: str, model: str):
+    from er_tp_dgp.llm import LLMRequestConfig
+
+    return LLMRequestConfig(
+        provider_type=provider,
+        base_url=base_url,
+        model=model,
+        api_key_env="OPENAI_COMPAT_API_KEY" if provider == "api" else None,
+    )
+
+
+def LLMRequestConfig_with_logprobs(config):
+    """Return a copy of `config` with logprobs/top_logprobs requested."""
+    from dataclasses import replace as _replace
+
+    return _replace(config, request_logprobs=True, top_logprobs=20)
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
--- a/scripts/run_multiround_inference.py
+++ b/scripts/run_multiround_inference.py
@@ -0,0 +1,255 @@
+#!/usr/bin/env python3
+"""Causal Graph-of-Thought (CGoT) multi-round inference.
+
+Loads the same prompt batch produced by build_theia_prompt_batch.py BUT also
+needs the underlying provenance graph (from the cached THEIA window IR) and
+the labeled_targets to know each target's anchor + UUID. Round prompts are
+constructed live from the graph; the per-target prompt_text/*.txt files are
+NOT used here.
+
+Output format: one JSONL line per target with:
+    target_id, score (final round softmax), yes_logprob, no_logprob,
+    intermediate_findings (list of {round_id, metapath_type, observation}),
+    rounds_run, total_latency_seconds.
+
+The output is shaped so that scripts/run_evaluation.py can ingest it like any
+other predictions file (first_token_score / first_token_yes_logprob /
+first_token_no_logprob fields are populated identically).
+"""
+
+from __future__ import annotations
+
+import argparse
+import json
+import time
+from dataclasses import asdict
+from pathlib import Path
+
+from er_tp_dgp.llm import LocalHFLogitsProvider
+from er_tp_dgp.metapaths import APTMetapathExtractor
+from er_tp_dgp.multiround import MultiRoundPromptBuilder
+from er_tp_dgp.numerical_aggregator import NumericalAggregator
+from er_tp_dgp.prompt import PromptComponentSwitches
+from er_tp_dgp.scoring import score_from_hf_logits
+from er_tp_dgp.text_summarizer import (
+    MetapathTextSummarizer,
+    NodeTextSummarizer,
+    SummarizerConfig,
+    _NullLLM,
+)
+from er_tp_dgp.theia import build_cached_theia_window_ir, discover_theia_json_files
+from er_tp_dgp.trimming import TemporalSecurityAwareTrimmer
+
+
+def main() -> int:
+    parser = argparse.ArgumentParser(description=__doc__.split("\n", 1)[0])
+    parser.add_argument("--labeled-targets", required=True)
+    parser.add_argument("--data-dir", default="data/raw/e3_theia_json")
+    parser.add_argument(
+        "--cache-dir",
+        default="reports/cache/theia_window_ir",
+        help="Where pre-warmed window-IR snapshots live.",
+    )
+    parser.add_argument("--lookback-seconds", type=float, default=30.0)
+    parser.add_argument("--lookahead-seconds", type=float, default=30.0)
+    parser.add_argument("--top-m-per-metapath", type=int, default=5)
+    parser.add_argument("--model", required=True, help="HF model id, e.g. Qwen/Qwen3-1.7B")
+    parser.add_argument("--dtype", default="bf16", choices=["bf16", "fp16", "fp32"])
+    parser.add_argument("--device-map", default="auto")
+    parser.add_argument("--max-memory-per-gpu-gib", type=float, default=None)
+    parser.add_argument("--lora-adapter", default=None)
+    parser.add_argument("--output-jsonl", required=True)
+    parser.add_argument(
+        "--intermediate-max-tokens",
+        type=int,
+        default=80,
+        help="Max new tokens for non-final rounds (short observations).",
+    )
+    parser.add_argument(
+        "--use-llm-summarizer",
+        action="store_true",
+        help=(
+            "Use a remote OpenAI-compat config for TextSumm/PathSumm "
+            "(via --summarizer-config). Default: NullSummarizer (truncation only)."
+        ),
+    )
+    parser.add_argument("--summarizer-config", default=None)
+    parser.add_argument("--summarizer-workers", type=int, default=8)
+    parser.add_argument("--max-targets", type=int, default=None)
+    args = parser.parse_args()
+
+    paths = discover_theia_json_files(args.data_dir)
+    if not paths:
+        raise SystemExit(f"no THEIA JSON files in {args.data_dir}")
+
+    targets = _read_jsonl(Path(args.labeled_targets))
+    if args.max_targets is not None:
+        targets = targets[: args.max_targets]
+
+    provider = LocalHFLogitsProvider(
+        base_model=args.model,
+        lora_adapter=args.lora_adapter,
+        dtype=args.dtype,
+        device_map=args.device_map,
+        max_memory_per_gpu_gib=args.max_memory_per_gpu_gib,
+    )
+
+    node_summ, path_summ = _build_summarizers(
+        use_llm=args.use_llm_summarizer,
+        config_path=args.summarizer_config,
+        workers=args.summarizer_workers,
+    )
+
+    output_path = Path(args.output_jsonl)
+    output_path.parent.mkdir(parents=True, exist_ok=True)
+
+    with output_path.open("w", encoding="utf-8") as out:
+        for index, target in enumerate(targets, start=1):
+            target_id = target["target_id"]
+            anchor_event_id = target["anchor_event_id"]
+            print(f"[{index}/{len(targets)}] target={target_id} anchor={anchor_event_id}", flush=True)
+
+            window = build_cached_theia_window_ir(
+                paths,
+                target_event_uuid=anchor_event_id,
+                lookback_seconds=args.lookback_seconds,
+                lookahead_seconds=args.lookahead_seconds,
+                cache_dir=args.cache_dir,
+            )
+            graph = window.to_graph()
+            graph_target_id = window.target_subject_id or window.target_event_id
+            evidence_paths = APTMetapathExtractor(graph).extract_for_target(graph_target_id)
+            selected = TemporalSecurityAwareTrimmer(
+                graph, top_m_per_metapath=args.top_m_per_metapath
+            ).trim(graph_target_id, evidence_paths)
+
+            switches = PromptComponentSwitches(
+                use_text_summarization=(node_summ is not None),
+                use_path_summarization_llm=(path_summ is not None),
+            )
+            builder = MultiRoundPromptBuilder(
+                graph,
+                node_summarizer=node_summ,
+                path_summarizer=path_summ,
+                numerical_aggregator=NumericalAggregator(graph),
+                switches=switches,
+            )
+            plan = builder.build(graph_target_id, selected)
+
+            result = _run_plan(
+                provider=provider,
+                plan=plan,
+                intermediate_max_tokens=args.intermediate_max_tokens,
+            )
+            payload = {
+                "target_id": graph_target_id,
+                "anchor_event_id": anchor_event_id,
+                "rounds_run": len(plan.rounds),
+                "intermediate_findings": result["intermediate_findings"],
+                "raw_text": result["final_text"],
+                "first_token_score": result["score"],
+                "first_token_yes_logprob": result["yes_logprob"],
+                "first_token_no_logprob": result["no_logprob"],
+                "output": {
+                    "first_token_label": "MALICIOUS" if (result["score"] or 0.0) >= 0.5 else "BENIGN",
+                    "score": result["score"],
+                    "predicted_label": "MALICIOUS" if (result["score"] or 0.0) >= 0.5 else "BENIGN",
+                    "evidence_path_ids": list(plan.evidence_path_ids),
+                },
+                "latency_seconds": result["total_latency"],
+            }
+            out.write(json.dumps(payload, ensure_ascii=False, sort_keys=True) + "\n")
+            out.flush()
+            print(
+                f"  rounds={len(plan.rounds)} score={result['score']} "
+                f"latency={result['total_latency']:.1f}s",
+                flush=True,
+            )
+
+    print(f"wrote {output_path}", flush=True)
+    return 0
+
+
+def _run_plan(*, provider: LocalHFLogitsProvider, plan, intermediate_max_tokens: int) -> dict:
+    intermediate: list[dict] = []
+    started = time.time()
+
+    def _format(prompt_template: str) -> str:
+        prior_block = "\n".join(
+            f"- {entry['round_id']} ({entry.get('metapath_type') or '-'}): {entry['observation']}"
+            for entry in intermediate
+        )
+        if "{prior_findings}" in prompt_template:
+            return prompt_template.replace(
+                "{prior_findings}",
+                f"Prior reasoning:\n{prior_block}" if prior_block else "Prior reasoning: (none yet)",
+            )
+        return prompt_template
+
+    score = None
+    yes_lp = None
+    no_lp = None
+    final_text = ""
+
+    for round_prompt in plan.rounds:
+        prompt = _format(round_prompt.prompt_text)
+        if round_prompt.is_final:
+            # Final round: classify, read first-token Yes/No softmax.
+            r = provider.classify(target_id=plan.target_id, prompt_text=prompt)
+            score = r.first_token_score
+            yes_lp = r.first_token_yes_logprob
+            no_lp = r.first_token_no_logprob
+            final_text = r.raw_text
+        else:
+            # Intermediate round: short text generation.
+            obs = provider.complete(prompt, max_tokens=intermediate_max_tokens)
+            # Trim the observation aggressively: take up to first newline.
+            short = obs.split("\n", 1)[0].strip()[:280]
+            intermediate.append(
+                {
+                    "round_id": round_prompt.round_id,
+                    "metapath_type": round_prompt.metapath_type,
+                    "observation": short,
+                }
+            )
+
+    return {
+        "intermediate_findings": intermediate,
+        "final_text": final_text,
+        "score": score,
+        "yes_logprob": yes_lp,
+        "no_logprob": no_lp,
+        "total_latency": time.time() - started,
+    }
+
+
+def _build_summarizers(
+    *, use_llm: bool, config_path: str | None, workers: int
+) -> tuple[NodeTextSummarizer | None, MetapathTextSummarizer | None]:
+    if not use_llm:
+        return None, None
+    if not config_path:
+        cfg = SummarizerConfig(model_name="null-fallback", max_workers=workers)
+        return NodeTextSummarizer(llm=_NullLLM(), config=cfg), MetapathTextSummarizer(
+            llm=_NullLLM(), config=cfg
+        )
+    from er_tp_dgp.llm import OpenAICompatibleHTTPProvider
+    from er_tp_dgp.llm_config import load_llm_config
+
+    llm_cfg = load_llm_config(config_path)
+    provider = OpenAICompatibleHTTPProvider(llm_cfg)
+    cfg = SummarizerConfig(model_name=llm_cfg.model, max_workers=workers)
+    return NodeTextSummarizer(llm=provider, config=cfg), MetapathTextSummarizer(llm=provider, config=cfg)
+
+
+def _read_jsonl(path: Path) -> list[dict]:
+    rows: list[dict] = []
+    with path.open("r", encoding="utf-8") as handle:
+        for line in handle:
+            if line.strip():
+                rows.append(json.loads(line))
+    return rows
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
--- a/scripts/summarize_hybrid_experiment.py
+++ b/scripts/summarize_hybrid_experiment.py
@@ -0,0 +1,124 @@
+#!/usr/bin/env python3
+"""Summarize the hybrid v0.3 experiment + cross-compare with v0.1 v0.2 baselines.
+
+Reads:
+  - reports/hybrid_v0_3/metrics/metrics.json (this experiment)
+  - reports/evaluation/e3_theia_v0_2/metrics_n146_4methods/metrics.json (v0.1 baseline)
+
+Writes:
+  - reports/hybrid_v0_3/summary.md  — head-to-head comparison table
+
+The two experiments use different target populations (v0.1 = per-process
+n=146, hybrid = per-community n=150) so this is NOT a direct AUPRC
+comparison — it's a "how does the new method compare in absolute
+detection capability" snapshot. The within-experiment row comparison
+(hybrid vs Phase 14 raw landmarks on the SAME 150 communities) IS direct.
+"""
+
+from __future__ import annotations
+
+import argparse
+import json
+from pathlib import Path
+
+
+def _load(path: Path) -> dict:
+    if not path.exists():
+        return {}
+    return json.loads(path.read_text(encoding="utf-8"))
+
+
+def _row(name: str, m: dict) -> str:
+    metrics = m.get("metrics") if "metrics" in m else m
+    n = metrics.get("num_examples", "?")
+    n_pos = metrics.get("num_positive", "?")
+    return (
+        f"| {name} | {n} | {n_pos} | "
+        f"{_fmt(metrics.get('auprc'))} | {_fmt(metrics.get('auroc'))} | "
+        f"{_fmt(metrics.get('macro_f1'))} | "
+        f"{_fmt((metrics.get('recall_at_k') or {}).get('10') or (metrics.get('recall_at_k') or {}).get(10))} | "
+        f"{_fmt((metrics.get('fpr_at_recall') or {}).get('0.9') or (metrics.get('fpr_at_recall') or {}).get(0.9))} | "
+        f"{_fmt(metrics.get('avg_prompt_tokens'))} | "
+        f"{_fmt(metrics.get('evidence_path_hit_rate'))} |"
+    )
+
+
+def _fmt(value) -> str:
+    if isinstance(value, (int, float)):
+        return f"{value:.4f}" if isinstance(value, float) else str(value)
+    if value is None:
+        return "n/a"
+    return str(value)
+
+
+def main() -> int:
+    parser = argparse.ArgumentParser(description=__doc__)
+    parser.add_argument(
+        "--hybrid-metrics",
+        default="reports/hybrid_v0_3/metrics/metrics.json",
+    )
+    parser.add_argument(
+        "--baseline-metrics",
+        default="reports/evaluation/e3_theia_v0_2/metrics_n146_4methods/metrics.json",
+    )
+    parser.add_argument("--output", default="reports/hybrid_v0_3/summary.md")
+    args = parser.parse_args()
+
+    hybrid_data = _load(Path(args.hybrid_metrics))
+    baseline_data = _load(Path(args.baseline_metrics))
+
+    rows: list[tuple[str, dict]] = []
+    for name, payload in sorted(hybrid_data.items()):
+        rows.append((f"hybrid_v0_3 / {name}", payload))
+    for name, payload in sorted(baseline_data.items()):
+        rows.append((f"baseline_v0_2 / {name}", payload))
+
+    headers = [
+        "method",
+        "n",
+        "n+",
+        "AUPRC",
+        "AUROC",
+        "Macro-F1",
+        "Recall@10",
+        "FPR@0.9",
+        "avg_tokens",
+        "evidence_hit",
+    ]
+    lines = [
+        "# ER-TP-DGP Hybrid v0.3 — Head-to-Head Summary",
+        "",
+        "## Comparison axes",
+        "",
+        "- **hybrid_v0_3 / predictions_hybrid** — Phase 14 community detection unit + ",
+        "  v0.1 fine-grained subgraph re-injection + DGP-12 layered prompt.",
+        "- **hybrid_v0_3 / predictions_landmark_raw** — Phase 14 raw landmark-only ",
+        "  prompts on the SAME 150 communities (head-to-head ablation).",
+        "- **baseline_v0_2 / predictions_graph_dgp_*** — v0.1 graph_dgp pipeline ",
+        "  on n=146 per-process targets (different population, included as scale reference).",
+        "- **baseline_v0_2 / predictions_target_only_*** — v0.1 target-only baseline ",
+        "  on n=146 per-process targets.",
+        "",
+        "## Metrics",
+        "",
+        "| " + " | ".join(headers) + " |",
+        "|" + "|".join(["---"] * len(headers)) + "|",
+    ]
+    for name, payload in rows:
+        lines.append(_row(name, payload))
+    lines.append("")
+    lines.append(
+        "Score column is calibrated first-token softmax over (Yes, No) "
+        "(DGP paper formula 14). Rows missing logprobs are excluded with a warning."
+    )
+    out_path = Path(args.output)
+    out_path.parent.mkdir(parents=True, exist_ok=True)
+    out_path.write_text("\n".join(lines) + "\n", encoding="utf-8")
+    print(f"wrote {out_path}")
+    print()
+    print("\n".join(lines))
+    return 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
--- a/scripts/theia_candidate_universe.py
+++ b/scripts/theia_candidate_universe.py
@@ -0,0 +1,99 @@
+#!/usr/bin/env python3
+"""Build a label-free THEIA candidate universe and QA sampling frame."""
+
+from __future__ import annotations
+
+import argparse
+from pathlib import Path
+
+from er_tp_dgp.candidate_universe import (
+    build_theia_candidate_universe,
+    write_stratified_sample_jsonl,
+)
+from er_tp_dgp.theia import discover_theia_json_files
+
+
+def main() -> None:
+    parser = argparse.ArgumentParser(
+        description=(
+            "Build protocol-based process candidates from THEIA JSON. "
+            "This is label-free candidate generation, not detection evaluation."
+        )
+    )
+    parser.add_argument("--data-dir", default="data/raw/e3_theia_json")
+    parser.add_argument(
+        "--input-file",
+        action="append",
+        default=None,
+        help="Specific THEIA JSON file to scan. Can be repeated. Overrides --data-dir discovery.",
+    )
+    parser.add_argument("--output-dir", default="reports/theia_candidate_universe")
+    parser.add_argument("--dataset-name", default="DARPA_TC_E3_THEIA")
+    parser.add_argument("--max-lines", type=int, default=None)
+    parser.add_argument("--max-lines-per-file", type=int, default=None)
+    parser.add_argument(
+        "--progress-every",
+        type=int,
+        default=None,
+        help="Emit '[progress] lines=...' every N records. Useful for long full-corpus scans.",
+    )
+    parser.add_argument("--min-score", type=float, default=1.0)
+    parser.add_argument("--min-events", type=int, default=1)
+    parser.add_argument("--per-stratum", type=int, default=5)
+    parser.add_argument("--seed", type=int, default=7)
+    parser.add_argument("--report-limit", type=int, default=40)
+    args = parser.parse_args()
+
+    paths = [Path(path) for path in args.input_file] if args.input_file else discover_theia_json_files(args.data_dir)
+    if not paths:
+        raise SystemExit(f"no THEIA JSON files found under {args.data_dir}")
+
+    output_dir = Path(args.output_dir)
+    output_dir.mkdir(parents=True, exist_ok=True)
+
+    universe = build_theia_candidate_universe(
+        paths,
+        dataset_name=args.dataset_name,
+        max_lines=args.max_lines,
+        max_lines_per_file=args.max_lines_per_file,
+        progress_every=args.progress_every,
+    )
+    candidates = universe.candidate_profiles(
+        min_score=args.min_score,
+        min_events=args.min_events,
+    )
+
+    universe_path = output_dir / "candidate_universe.jsonl"
+    report_path = output_dir / "candidate_universe.md"
+    sample_path = output_dir / "qa_stratified_sample.jsonl"
+
+    universe.write_jsonl(
+        universe_path,
+        min_score=args.min_score,
+        min_events=args.min_events,
+    )
+    report_path.write_text(
+        universe.to_markdown(
+            min_score=args.min_score,
+            min_events=args.min_events,
+            limit=args.report_limit,
+        )
+        + "\n",
+        encoding="utf-8",
+    )
+    sample = write_stratified_sample_jsonl(
+        candidates,
+        sample_path,
+        per_stratum=args.per_stratum,
+        seed=args.seed,
+    )
+
+    print(f"files={len(paths)} lines_seen={universe.lines_seen} events_seen={universe.events_seen}")
+    print(f"profiles={len(universe.profiles)} candidates={len(candidates)} qa_sample={len(sample)}")
+    print(f"wrote {universe_path}")
+    print(f"wrote {report_path}")
+    print(f"wrote {sample_path}")
+
+
+if __name__ == "__main__":
+    main()
--- a/scripts/theia_idea_validate.py
+++ b/scripts/theia_idea_validate.py
@@ -0,0 +1,108 @@
+#!/usr/bin/env python3
+"""Build one real THEIA E3 ER-TP-DGP prompt for idea validation."""
+
+from __future__ import annotations
+
+import argparse
+from pathlib import Path
+
+from er_tp_dgp.metapaths import APTMetapathExtractor
+from er_tp_dgp.prompt import PromptBuilder
+from er_tp_dgp.theia import build_theia_window_ir, discover_theia_json_files
+from er_tp_dgp.trimming import TemporalSecurityAwareTrimmer
+from er_tp_dgp.validation import validate_evidence_paths, validate_graph, validate_ir
+
+
+def main() -> int:
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--data-dir", default="data/raw/e3_theia_json")
+    parser.add_argument("--output-dir", default="reports/theia_e3_idea")
+    parser.add_argument(
+        "--target-event",
+        default="86E0FB61-B300-2215-3C6D-8F0000000010",
+        help="Raw THEIA event UUID to use as target anchor.",
+    )
+    parser.add_argument("--lookback-seconds", type=float, default=120.0)
+    parser.add_argument("--lookahead-seconds", type=float, default=120.0)
+    parser.add_argument("--max-lines", type=int, default=1_250_000)
+    parser.add_argument("--max-lines-per-file", type=int, default=50_000)
+    parser.add_argument("--top-m-per-metapath", type=int, default=5)
+    args = parser.parse_args()
+
+    files = discover_theia_json_files(args.data_dir)
+    output_dir = Path(args.output_dir)
+    output_dir.mkdir(parents=True, exist_ok=True)
+
+    window = build_theia_window_ir(
+        files,
+        target_event_uuid=args.target_event,
+        lookback_seconds=args.lookback_seconds,
+        lookahead_seconds=args.lookahead_seconds,
+        max_lines=args.max_lines,
+        max_lines_per_file=args.max_lines_per_file,
+    )
+    graph = window.to_graph()
+    target_id = window.target_subject_id or window.target_event_id
+
+    ir_report = validate_ir(list(window.entities), list(window.events))
+    graph_report = validate_graph(graph)
+    paths = APTMetapathExtractor(graph).extract_for_target(target_id)
+    selected = TemporalSecurityAwareTrimmer(
+        graph,
+        top_m_per_metapath=args.top_m_per_metapath,
+    ).trim(target_id, paths)
+    evidence_report = validate_evidence_paths(graph, selected)
+    prompt = PromptBuilder(graph).build(target_id, selected)
+
+    summary = [
+        "# THEIA E3 ER-TP-DGP Idea Validation",
+        "",
+        "This is a method plumbing validation on a real THEIA E3 window. It is not a detection-performance result.",
+        "",
+        f"- target_event_id: {window.target_event_id}",
+        f"- target_subject_id: {window.target_subject_id}",
+        f"- window_start_nanos: {window.start_timestamp_nanos}",
+        f"- window_end_nanos: {window.end_timestamp_nanos}",
+        f"- entities: {len(window.entities)}",
+        f"- events: {len(window.events)}",
+        f"- extracted_evidence_paths: {len(paths)}",
+        f"- selected_evidence_paths: {len(selected)}",
+        f"- schema_gaps: {list(window.schema_gaps)}",
+        "",
+        "## Validation",
+        "",
+        f"- ir_ok: {ir_report.ok}",
+        f"- graph_ok: {graph_report.ok}",
+        f"- evidence_ok: {evidence_report.ok}",
+        "",
+        "## Selected Evidence Paths",
+        "",
+    ]
+    for path in selected:
+        summary.append(
+            "- "
+            f"{path.path_id} metapath={path.metapath_type} score={path.trimming_score:.3f} "
+            f"events={list(path.ordered_event_ids)} reason={path.selected_reason}"
+        )
+    if not selected:
+        summary.append("- none")
+
+    (output_dir / "idea_validation.md").write_text("\n".join(summary), encoding="utf-8")
+    (output_dir / "prompt.txt").write_text(prompt.prompt_text, encoding="utf-8")
+    (output_dir / "ir_validation.md").write_text(ir_report.to_markdown(), encoding="utf-8")
+    (output_dir / "graph_validation.md").write_text(graph_report.to_markdown(), encoding="utf-8")
+    (output_dir / "evidence_validation.md").write_text(evidence_report.to_markdown(), encoding="utf-8")
+
+    print(f"target_subject_id={window.target_subject_id}")
+    print(f"entities={len(window.entities)}")
+    print(f"events={len(window.events)}")
+    print(f"paths={len(paths)}")
+    print(f"selected={len(selected)}")
+    print(f"schema_gaps={list(window.schema_gaps)}")
+    print(f"wrote={output_dir / 'idea_validation.md'}")
+    print(f"wrote={output_dir / 'prompt.txt'}")
+    return 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
--- a/scripts/theia_preliminary.py
+++ b/scripts/theia_preliminary.py
@@ -0,0 +1,54 @@
+#!/usr/bin/env python3
+"""Run THEIA E3 schema audit and debugging-only preliminary scan."""
+
+from __future__ import annotations
+
+import argparse
+from pathlib import Path
+
+from er_tp_dgp.theia import audit_theia_files, discover_theia_json_files, preliminary_scan_theia_files
+
+
+def main() -> int:
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--data-dir", default="data/raw/e3_theia_json")
+    parser.add_argument("--output-dir", default="reports/theia_e3")
+    parser.add_argument("--max-lines", type=int, default=250_000)
+    parser.add_argument("--max-lines-per-file", type=int, default=None)
+    parser.add_argument("--max-candidates", type=int, default=200)
+    args = parser.parse_args()
+
+    data_dir = Path(args.data_dir)
+    output_dir = Path(args.output_dir)
+    output_dir.mkdir(parents=True, exist_ok=True)
+
+    files = discover_theia_json_files(data_dir)
+    if not files:
+        raise SystemExit(f"No THEIA JSON files found in {data_dir}")
+
+    profile = audit_theia_files(
+        files,
+        max_lines=args.max_lines,
+        max_lines_per_file=args.max_lines_per_file,
+    )
+    scan = preliminary_scan_theia_files(
+        files,
+        max_lines=args.max_lines,
+        max_lines_per_file=args.max_lines_per_file,
+        max_candidates=args.max_candidates,
+    )
+
+    (output_dir / "schema_profile.md").write_text(profile.to_markdown(), encoding="utf-8")
+    (output_dir / "preliminary_candidates.md").write_text(scan.to_markdown(), encoding="utf-8")
+
+    print(f"files={len(files)}")
+    print(f"schema_lines={profile.lines_seen}")
+    print(f"scan_lines={scan.lines_seen}")
+    print(f"candidates={len(scan.candidates)}")
+    print(f"wrote={output_dir / 'schema_profile.md'}")
+    print(f"wrote={output_dir / 'preliminary_candidates.md'}")
+    return 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
--- a/scripts/train_lora.py
+++ b/scripts/train_lora.py
@@ -0,0 +1,196 @@
+#!/usr/bin/env python3
+"""LoRA fine-tune Qwen3-8B (or compatible) on ER-TP-DGP prompt batches.
+
+Inputs:
+    --prompt-batch-dir  Directory produced by build_theia_prompt_batch.py.
+                        Expected files inside:
+                          - prompt_metadata.jsonl
+                          - prompt_text/<NNNN_targetid>.txt
+    --labeled-targets   Path to evaluation_batch.jsonl with `label` field.
+    --train-until / --val-until  Time-based split timestamps (paper-aligned
+                        anti-leakage protocol; see splits.time_based_split).
+
+Outputs:
+    --output-dir/lora_final  PEFT adapter directory + tokenizer
+    --output-dir/splits.json Train/val/test target ID lists
+    --output-dir/leakage_audit.md  splits.check_leakage report
+
+Implements paper formula 13: CE on first generated token Yes/No, computed
+under the standard transformers Trainer with label_ids = -100 except at the
+target token position. Adapter loadable later via LocalHFLogitsProvider.
+"""
+
+from __future__ import annotations
+
+import argparse
+import json
+from dataclasses import asdict
+from pathlib import Path
+
+from er_tp_dgp.splits import TargetMetadata, check_leakage, time_based_split
+from er_tp_dgp.training import LoRAConfig, TrainConfig, TrainExample, train_lora
+
+
+def main() -> int:
+    parser = argparse.ArgumentParser(description=__doc__.split("\n", 1)[0])
+    parser.add_argument("--prompt-batch-dir", required=True)
+    parser.add_argument("--labeled-targets", required=True)
+    parser.add_argument("--output-dir", default="reports/training/v1")
+    parser.add_argument("--base-model", default="Qwen/Qwen3-8B")
+    parser.add_argument("--epochs", type=int, default=3)
+    parser.add_argument("--learning-rate", type=float, default=2e-4)
+    parser.add_argument("--per-device-batch-size", type=int, default=2)
+    parser.add_argument("--gradient-accumulation-steps", type=int, default=8)
+    parser.add_argument("--max-seq-length", type=int, default=8192)
+    parser.add_argument("--lora-r", type=int, default=16)
+    parser.add_argument("--lora-alpha", type=int, default=32)
+    parser.add_argument(
+        "--train-until",
+        type=float,
+        required=True,
+        help="Targets with timestamp <= train_until go to train split.",
+    )
+    parser.add_argument("--val-until", type=float, required=True)
+    parser.add_argument("--seed", type=int, default=7)
+    args = parser.parse_args()
+
+    output_dir = Path(args.output_dir)
+    output_dir.mkdir(parents=True, exist_ok=True)
+
+    examples, target_meta = _load_prompt_batch_examples(
+        prompt_batch_dir=Path(args.prompt_batch_dir),
+        labeled_targets=Path(args.labeled_targets),
+    )
+    if not examples:
+        raise SystemExit("No prompt examples found; check --prompt-batch-dir/--labeled-targets.")
+
+    assignment = time_based_split(
+        target_meta,
+        train_until=args.train_until,
+        validation_until=args.val_until,
+    )
+    leakage = check_leakage(target_meta, assignment)
+    (output_dir / "leakage_audit.md").write_text(leakage.to_markdown(), encoding="utf-8")
+    if not leakage.ok:
+        # Don't abort; the audit file is the artifact. Operator decides.
+        print(
+            f"WARNING: leakage audit reported {len(leakage.findings)} findings; "
+            f"see {output_dir/'leakage_audit.md'}"
+        )
+
+    splits_payload: dict[str, list[str]] = {"train": [], "val": [], "test": []}
+    train_examples: list[TrainExample] = []
+    val_examples: list[TrainExample] = []
+    for example, meta in zip(examples, target_meta, strict=True):
+        split = assignment.split_by_target[meta.target_id].value
+        if split == "train":
+            train_examples.append(example)
+            splits_payload["train"].append(meta.target_id)
+        elif split == "validation":
+            val_examples.append(example)
+            splits_payload["val"].append(meta.target_id)
+        else:
+            splits_payload["test"].append(meta.target_id)
+    (output_dir / "splits.json").write_text(
+        json.dumps(splits_payload, ensure_ascii=False, sort_keys=True, indent=2),
+        encoding="utf-8",
+    )
+
+    print(
+        f"train={len(train_examples)} val={len(val_examples)} "
+        f"test={len(splits_payload['test'])}"
+    )
+
+    train_cfg = TrainConfig(
+        base_model=args.base_model,
+        output_dir=output_dir,
+        epochs=args.epochs,
+        learning_rate=args.learning_rate,
+        per_device_batch_size=args.per_device_batch_size,
+        gradient_accumulation_steps=args.gradient_accumulation_steps,
+        max_seq_length=args.max_seq_length,
+        seed=args.seed,
+    )
+    lora_cfg = LoRAConfig(r=args.lora_r, alpha=args.lora_alpha)
+    final_dir = train_lora(train_examples, val_examples, train_config=train_cfg, lora_config=lora_cfg)
+
+    manifest = {
+        "base_model": args.base_model,
+        "lora_r": args.lora_r,
+        "lora_alpha": args.lora_alpha,
+        "epochs": args.epochs,
+        "learning_rate": args.learning_rate,
+        "train_until": args.train_until,
+        "val_until": args.val_until,
+        "train_size": len(train_examples),
+        "val_size": len(val_examples),
+        "test_size": len(splits_payload["test"]),
+        "adapter_path": str(final_dir),
+        "splits_path": str(output_dir / "splits.json"),
+        "leakage_audit_path": str(output_dir / "leakage_audit.md"),
+    }
+    (output_dir / "train_manifest.json").write_text(
+        json.dumps(manifest, ensure_ascii=False, sort_keys=True, indent=2), encoding="utf-8"
+    )
+    print(f"adapter saved to: {final_dir}")
+    print(f"manifest: {output_dir/'train_manifest.json'}")
+    return 0
+
+
+def _load_prompt_batch_examples(
+    *, prompt_batch_dir: Path, labeled_targets: Path
+) -> tuple[list[TrainExample], list[TargetMetadata]]:
+    """Cross-reference prompt files with labeled_targets for supervised pairs."""
+    metadata_path = prompt_batch_dir / "prompt_metadata.jsonl"
+    if not metadata_path.exists():
+        raise SystemExit(f"missing {metadata_path}")
+    label_by_id: dict[str, dict] = {}
+    for row in _read_jsonl(labeled_targets):
+        label_by_id[row["target_id"]] = row
+
+    examples: list[TrainExample] = []
+    metas: list[TargetMetadata] = []
+    for row in _read_jsonl(metadata_path):
+        target_id = row["target_id"]
+        prompt_path = Path(row["prompt_path"])
+        label_row = label_by_id.get(target_id)
+        if not prompt_path.exists() or not label_row:
+            continue
+        label_value = label_row.get("label")
+        if label_value not in {"malicious", "benign", "benign_proxy"}:
+            continue
+        prompt_text = prompt_path.read_text(encoding="utf-8")
+        examples.append(
+            TrainExample(
+                prompt_text=prompt_text,
+                label="Yes" if label_value == "malicious" else "No",
+            )
+        )
+        metas.append(
+            TargetMetadata(
+                target_id=target_id,
+                target_type=str(label_row.get("target_type", "PROCESS")),
+                timestamp=float(row.get("anchor_timestamp") or label_row.get("anchor_timestamp") or 0.0),
+                host=label_row.get("host"),
+                campaign_id=label_row.get("atom_id"),
+                prompt_text=prompt_text,
+                raw_event_ids=tuple(row.get("evidence_path_ids") or ()),
+                process_ids=(target_id,) if label_row.get("target_type") == "PROCESS" else (),
+                file_paths=tuple([label_row["process_path"]] if label_row.get("process_path") else ()),
+            )
+        )
+    return examples, metas
+
+
+def _read_jsonl(path: Path) -> list[dict]:
+    rows: list[dict] = []
+    with path.open("r", encoding="utf-8") as handle:
+        for line in handle:
+            line = line.strip()
+            if line:
+                rows.append(json.loads(line))
+    return rows
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())