Initial commit: ER-TP-DGP research prototype
Event-Reified Temporal Provenance Dual-Granularity Prompting (ER-TP-DGP) for LLM-based APT detection on DARPA provenance datasets. Includes the phase 0-14 method spec; the IR, graph, metapath, trimming, and prompt modules; and scripts for the THEIA candidate universe, landmark CSG construction, hybrid prompting, and LLM inference. Excludes data/, reports/, and the local LLM config from version control.
This commit is contained in:
54
scripts/theia_preliminary.py
Normal file
54
scripts/theia_preliminary.py
Normal file
@@ -0,0 +1,54 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Run THEIA E3 schema audit and debugging-only preliminary scan."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
from pathlib import Path
|
||||
|
||||
from er_tp_dgp.theia import audit_theia_files, discover_theia_json_files, preliminary_scan_theia_files
|
||||
|
||||
|
||||
def main() -> int:
    """Run the THEIA schema audit and the preliminary scan from the command line.

    Parses the CLI options, discovers THEIA JSON files under ``--data-dir``,
    passes them to ``audit_theia_files`` and ``preliminary_scan_theia_files``
    with the requested limits, writes each result's Markdown rendering into
    ``--output-dir``, and prints a short ``key=value`` summary.

    Returns:
        ``0`` after both reports have been written.

    Raises:
        SystemExit: If file discovery yields nothing for the data directory.
    """
    cli = argparse.ArgumentParser()
    cli.add_argument("--data-dir", default="data/raw/e3_theia_json")
    cli.add_argument("--output-dir", default="reports/theia_e3")
    cli.add_argument("--max-lines", type=int, default=250_000)
    cli.add_argument("--max-lines-per-file", type=int, default=None)
    cli.add_argument("--max-candidates", type=int, default=200)
    options = cli.parse_args()

    source_dir = Path(options.data_dir)
    report_dir = Path(options.output_dir)
    # The report directory is created up front, before discovery runs.
    report_dir.mkdir(parents=True, exist_ok=True)

    theia_files = discover_theia_json_files(source_dir)
    if not theia_files:
        raise SystemExit(f"No THEIA JSON files found in {source_dir}")

    # Both passes receive the same line limits.
    line_limits = {
        "max_lines": options.max_lines,
        "max_lines_per_file": options.max_lines_per_file,
    }
    profile = audit_theia_files(theia_files, **line_limits)
    scan = preliminary_scan_theia_files(
        theia_files,
        max_candidates=options.max_candidates,
        **line_limits,
    )

    schema_report = report_dir / "schema_profile.md"
    candidates_report = report_dir / "preliminary_candidates.md"
    schema_report.write_text(profile.to_markdown(), encoding="utf-8")
    candidates_report.write_text(scan.to_markdown(), encoding="utf-8")

    print(f"files={len(theia_files)}")
    print(f"schema_lines={profile.lines_seen}")
    print(f"scan_lines={scan.lines_seen}")
    print(f"candidates={len(scan.candidates)}")
    print(f"wrote={schema_report}")
    print(f"wrote={candidates_report}")
    return 0
|
||||
|
||||
|
||||
# Script entry point: propagate main()'s return value as the process exit status.
if __name__ == "__main__":
    exit_status = main()
    raise SystemExit(exit_status)
|
||||
Reference in New Issue
Block a user