#!/usr/bin/env python3 """Run THEIA E3 schema audit and debugging-only preliminary scan.""" from __future__ import annotations import argparse from pathlib import Path from er_tp_dgp.theia import audit_theia_files, discover_theia_json_files, preliminary_scan_theia_files def main() -> int: parser = argparse.ArgumentParser() parser.add_argument("--data-dir", default="data/raw/e3_theia_json") parser.add_argument("--output-dir", default="reports/theia_e3") parser.add_argument("--max-lines", type=int, default=250_000) parser.add_argument("--max-lines-per-file", type=int, default=None) parser.add_argument("--max-candidates", type=int, default=200) args = parser.parse_args() data_dir = Path(args.data_dir) output_dir = Path(args.output_dir) output_dir.mkdir(parents=True, exist_ok=True) files = discover_theia_json_files(data_dir) if not files: raise SystemExit(f"No THEIA JSON files found in {data_dir}") profile = audit_theia_files( files, max_lines=args.max_lines, max_lines_per_file=args.max_lines_per_file, ) scan = preliminary_scan_theia_files( files, max_lines=args.max_lines, max_lines_per_file=args.max_lines_per_file, max_candidates=args.max_candidates, ) (output_dir / "schema_profile.md").write_text(profile.to_markdown(), encoding="utf-8") (output_dir / "preliminary_candidates.md").write_text(scan.to_markdown(), encoding="utf-8") print(f"files={len(files)}") print(f"schema_lines={profile.lines_seen}") print(f"scan_lines={scan.lines_seen}") print(f"candidates={len(scan.candidates)}") print(f"wrote={output_dir / 'schema_profile.md'}") print(f"wrote={output_dir / 'preliminary_candidates.md'}") return 0 if __name__ == "__main__": raise SystemExit(main())