Initial commit: ER-TP-DGP research prototype

Event-Reified Temporal Provenance Dual-Granularity Prompting for
LLM-based APT detection on DARPA provenance datasets.

Includes phase 0-14 method spec, IR/graph/metapath/trimming/prompt
modules, scripts for THEIA candidate universe, landmark CSG construction,
hybrid prompting, and LLM inference. Excludes data/, reports/, and
local LLM config from version control.
This commit is contained in:
BattleTag
2026-05-15 16:53:57 +08:00
commit b86ae87b75
88 changed files with 18570 additions and 0 deletions

View File

@@ -0,0 +1,54 @@
#!/usr/bin/env python3
"""Run THEIA E3 schema audit and debugging-only preliminary scan."""
from __future__ import annotations
import argparse
from pathlib import Path
from er_tp_dgp.theia import audit_theia_files, discover_theia_json_files, preliminary_scan_theia_files
def main(argv: list[str] | None = None) -> int:
    """Audit the THEIA E3 schema and run the preliminary candidate scan.

    Discovers THEIA JSON files under ``--data-dir``, hands them to
    ``audit_theia_files`` and ``preliminary_scan_theia_files`` together with
    the limits given on the command line, writes each result's Markdown
    rendering into ``--output-dir`` and prints a short ``key=value`` summary.

    Args:
        argv: Command-line arguments without the program name. ``None``
            (the default) lets argparse read ``sys.argv[1:]``, which is what
            the script entry point relies on.

    Returns:
        ``0`` once both reports have been written.

    Raises:
        SystemExit: If no THEIA JSON files are found in the data directory,
            or if argparse rejects the arguments (or handles ``--help``).
    """
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument(
        "--data-dir",
        default="data/raw/e3_theia_json",
        help="directory searched for THEIA JSON files (default: %(default)s)",
    )
    parser.add_argument(
        "--output-dir",
        default="reports/theia_e3",
        help="directory that receives the Markdown reports (default: %(default)s)",
    )
    parser.add_argument(
        "--max-lines",
        type=int,
        default=250_000,
        help="max_lines limit passed to the audit and the scan (default: %(default)s)",
    )
    parser.add_argument(
        "--max-lines-per-file",
        type=int,
        default=None,
        help="max_lines_per_file limit passed to the audit and the scan "
        "(default: %(default)s)",
    )
    parser.add_argument(
        "--max-candidates",
        type=int,
        default=200,
        help="max_candidates limit passed to the scan (default: %(default)s)",
    )
    args = parser.parse_args(argv)

    data_dir = Path(args.data_dir)
    output_dir = Path(args.output_dir)

    files = discover_theia_json_files(data_dir)
    if not files:
        raise SystemExit(f"No THEIA JSON files found in {data_dir}")

    # Create the report directory only after the input check, so a run that
    # aborts for lack of input does not leave an empty directory behind.
    output_dir.mkdir(parents=True, exist_ok=True)

    profile = audit_theia_files(
        files,
        max_lines=args.max_lines,
        max_lines_per_file=args.max_lines_per_file,
    )
    scan = preliminary_scan_theia_files(
        files,
        max_lines=args.max_lines,
        max_lines_per_file=args.max_lines_per_file,
        max_candidates=args.max_candidates,
    )

    # Build each report path once; it is used for writing and for the summary.
    schema_path = output_dir / "schema_profile.md"
    candidates_path = output_dir / "preliminary_candidates.md"
    schema_path.write_text(profile.to_markdown(), encoding="utf-8")
    candidates_path.write_text(scan.to_markdown(), encoding="utf-8")

    print(f"files={len(files)}")
    print(f"schema_lines={profile.lines_seen}")
    print(f"scan_lines={scan.lines_seen}")
    print(f"candidates={len(scan.candidates)}")
    print(f"wrote={schema_path}")
    print(f"wrote={candidates_path}")
    return 0
# Script entry point: run main() only when executed directly (not on import)
# and hand its integer return value to the interpreter as the exit status.
if __name__ == "__main__":
    raise SystemExit(main())