Event-Reified Temporal Provenance Dual-Granularity Prompting for LLM-based APT detection on DARPA provenance datasets. Includes phase 0-14 method spec, IR/graph/metapath/trimming/prompt modules, scripts for THEIA candidate universe, landmark CSG construction, hybrid prompting, and LLM inference. Excludes data/, reports/, and local LLM config from version control.
55 lines
1.8 KiB
Python
55 lines
1.8 KiB
Python
#!/usr/bin/env python3
|
|
"""Run THEIA E3 schema audit and debugging-only preliminary scan."""
|
|
|
|
from __future__ import annotations
|
|
|
|
import argparse
|
|
from pathlib import Path
|
|
|
|
from er_tp_dgp.theia import audit_theia_files, discover_theia_json_files, preliminary_scan_theia_files
|
|
|
|
|
|
def main(argv: list[str] | None = None) -> int:
    """Run the THEIA E3 schema audit and preliminary scan, then write both reports.

    Discovers THEIA JSON files under ``--data-dir``, passes them to
    ``audit_theia_files`` and ``preliminary_scan_theia_files`` with the
    line/candidate limits given on the command line, and writes each result's
    ``to_markdown()`` output into ``--output-dir`` as ``schema_profile.md``
    and ``preliminary_candidates.md``. A short ``key=value`` summary is
    printed to stdout.

    Args:
        argv: Command-line arguments, excluding the program name. ``None``
            (the default) makes argparse read ``sys.argv[1:]``, so existing
            ``main()`` callers behave exactly as before.

    Returns:
        ``0`` once both reports have been written.

    Raises:
        SystemExit: If argparse rejects the arguments (or ``--help`` is
            requested), or if no THEIA JSON files are found in the data
            directory.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--data-dir", default="data/raw/e3_theia_json")
    parser.add_argument("--output-dir", default="reports/theia_e3")
    parser.add_argument("--max-lines", type=int, default=250_000)
    parser.add_argument("--max-lines-per-file", type=int, default=None)
    parser.add_argument("--max-candidates", type=int, default=200)
    args = parser.parse_args(argv)

    data_dir = Path(args.data_dir)
    output_dir = Path(args.output_dir)

    files = discover_theia_json_files(data_dir)
    if not files:
        raise SystemExit(f"No THEIA JSON files found in {data_dir}")

    # Create the report directory only after the input check, so an aborted
    # run leaves no empty directory behind, but before the audit and scan so
    # an unusable output location is reported before they run.
    output_dir.mkdir(parents=True, exist_ok=True)

    profile = audit_theia_files(
        files,
        max_lines=args.max_lines,
        max_lines_per_file=args.max_lines_per_file,
    )
    scan = preliminary_scan_theia_files(
        files,
        max_lines=args.max_lines,
        max_lines_per_file=args.max_lines_per_file,
        max_candidates=args.max_candidates,
    )

    # Bind each report path once so the file written and the path printed in
    # the summary cannot drift apart.
    schema_path = output_dir / "schema_profile.md"
    candidates_path = output_dir / "preliminary_candidates.md"
    schema_path.write_text(profile.to_markdown(), encoding="utf-8")
    candidates_path.write_text(scan.to_markdown(), encoding="utf-8")

    print(f"files={len(files)}")
    print(f"schema_lines={profile.lines_seen}")
    print(f"scan_lines={scan.lines_seen}")
    print(f"candidates={len(scan.candidates)}")
    print(f"wrote={schema_path}")
    print(f"wrote={candidates_path}")
    return 0
|
|
|
|
|
|
if __name__ == "__main__":
    # Propagate main()'s integer result as the process exit status.
    exit_status = main()
    raise SystemExit(exit_status)
|