feat(refit): complete S1-S6 — case abstraction, grounding, log-odds, plugins, coref, multi-source

Consolidates the long-running refit work (DESIGN.md as authoritative spec) into a single baseline commit. Six stages landed together:

S1: Case + EvidenceSource abstraction; tools parameterised by source_id (case.py, main.py multi-source bootstrap, .bin extension support).
S2: Grounding gateway in add_phenomenon: verified_facts cite real ToolInvocation ids; substring / normalised match enforced; agent + task scope checked. Phenomenon.description split into verified_facts (grounded) + interpretation (free text). [invocation: inv-xxx] prefix on every wrapped tool result so the LLM can cite.
S3: Confidence as additive log-odds: edge_type → log10(LR) calibration table; commutative updates; supported / refuted thresholds derived from log_odds; hypothesis × evidence matrix view.
S4: iOS plugin: unzip_archive + parse_plist / sqlite_tables / sqlite_query / parse_ios_keychain / read_idevice_info; IOSArtifactAgent; SOURCE_TYPE_AGENTS routing.
S5: Cross-source entity resolution: typed identifiers on Entity, observe_identity gateway, auto coref hypothesis with shared / conflicting strong/weak LR edges, reversible same_as edges, actor_clusters() view.
S6: Android partition probe + AndroidArtifactAgent; MediaAgent with OCR fallback; orchestrator Phase 1 iterates every analysable source; platform-aware get_triage_agent_type; ReportAgent renders actor clusters + per-source breakdown.

142 unit tests / 1 skipped — full coverage of the new gateway, log-odds math, coref hypothesis fall-out, and orchestrator multi-source dispatch.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-21 02:12:10 -10:00
parent 444d58726a
commit 81ade8f7ac
24 changed files with 5137 additions and 244 deletions
--- a/main.py
+++ b/main.py
@@ -15,17 +15,21 @@ from pathlib import Path
 import yaml

 from agent_factory import AgentFactory
+from case import (
+    DISK_IMAGE_EXTS, Case, EvidenceSource, load_case, single_source_case,
+)
 from evidence_graph import EvidenceGraph
 from llm_client import LLMClient
 from log_config import setup_logging
 from orchestrator import AnalysisAborted, Orchestrator
 from tool_registry import register_all_tools
+from tools.archive import unzip_archive_sync

 RUNS_DIR = Path("runs")
 IMAGE_DIR = Path("image")
-
-# Common forensic image extensions (only first segment / single-file formats)
-_IMAGE_GLOBS = ["*.001", "*.dd", "*.raw", "*.img", "*.E01", "*.iso"]
+# Persistent unpack cache for tree-mode sources (zip extractions). Lives
+# at project root so multiple runs can reuse the same unpacked tree.
+SOURCE_CACHE_DIR = Path(".cache/sources")


 def load_config(path: str = "config.yaml") -> dict:
@@ -38,11 +42,13 @@ def load_config(path: str = "config.yaml") -> dict:
 # ---------------------------------------------------------------------------

 def _discover_images(search_dir: Path = IMAGE_DIR) -> list[Path]:
-    """Find forensic disk image files under *search_dir*."""
-    images: set[Path] = set()
-    for glob in _IMAGE_GLOBS:
-        images.update(search_dir.glob(glob))
-    return sorted(images)
+    """List disk-image files directly in *search_dir*, sorted (case-insensitive ext)."""
+    if not search_dir.is_dir():
+        return []  # missing or non-directory search_dir: nothing to discover
+    return sorted(
+        p for p in search_dir.iterdir()
+        if p.is_file() and p.suffix.lower() in DISK_IMAGE_EXTS
+    )


 def _parse_mmls(output: str) -> list[dict]:
@@ -110,7 +116,7 @@ def select_image_interactive(image_dir: Path | None = None) -> tuple[str, int]:
    images = _discover_images(image_dir)
    if not images:
        print(f"No disk images found in {image_dir}/")
-        print("Supported formats: " + ", ".join(_IMAGE_GLOBS))
+        print("Supported extensions: " + ", ".join(sorted(DISK_IMAGE_EXTS)))
        sys.exit(1)

    if len(images) == 1:
@@ -153,6 +159,118 @@ def select_image_interactive(image_dir: Path | None = None) -> tuple[str, int]:
        print("Invalid choice.")


+def resolve_case() -> Case:
+    """Resolve the Case to analyze.
+
+    Priority: a .yaml/.yml case file given as argv[1], then ./case.yaml in the
+    working directory, then legacy interactive selection (argv[1] = image dir).
+    """
+    # 1. Explicit case file passed on the command line (exit 1 if it fails to load)
+    if len(sys.argv) > 1 and sys.argv[1].lower().endswith((".yaml", ".yml")):
+        case = load_case(sys.argv[1])
+        if case is None:
+            print(f"Error: could not load case file {sys.argv[1]}")
+            sys.exit(1)
+        print(f"Loaded case: {case.name} ({len(case.sources)} sources)")
+        return case
+
+    # 2. ./case.yaml in the working directory (load_case() default; None → fall through)
+    case = load_case()
+    if case is not None:
+        print(f"Loaded case: {case.name} ({len(case.sources)} sources)")
+        return case
+
+    # 3. Legacy interactive single-image selection; argv[1], if any, is the image dir
+    cli_dir = Path(sys.argv[1]) if len(sys.argv) > 1 else None
+    image_path, partition_offset = select_image_interactive(cli_dir)
+    return single_source_case(image_path, partition_offset)
+
+
+def _is_analysable(src: EvidenceSource) -> bool:
+    """A source is analysable when it has a path AND its mode has tooling.
+
+    Image-mode sources always qualify; tree-mode sources qualify only when their
+    type is "mobile_extraction" or "archive". Media collections: TODO confirm post-S6.
+    """
+    if not src.path:
+        return False
+    if src.access_mode == "image":
+        return True
+    if src.access_mode == "tree" and src.type in ("mobile_extraction", "archive"):
+        return True
+    return False
+
+
+def list_analysable_sources(case: Case) -> list[EvidenceSource]:
+    """Return every analysable source in the case (orchestrator iterates them).
+
+    Pre-S6 main.py used to force-choose one source here; the multi-source
+    orchestrator (Phase 1 per-source triage) now consumes the full list.
+    Prints skipped sources for visibility; exits with status 1 if none is analysable.
+    """
+    analysable = [s for s in case.sources if _is_analysable(s)]
+    skipped = [s for s in case.sources if not _is_analysable(s)]
+    if skipped:
+        print(
+            f"Note: {len(skipped)} source(s) not analysable in this build: "
+            + ", ".join(f"{s.label} ({s.type})" for s in skipped)
+        )
+    if not analysable:
+        print("No analysable sources in this case.")
+        sys.exit(1)
+    print(f"Analysing {len(analysable)} source(s) — orchestrator will triage each in Phase 1:")
+    for s in analysable:
+        print(f"  - {s.summary()}")
+    return analysable
+
+
+def prepare_source(src: EvidenceSource) -> EvidenceSource:
+    """Materialise a tree-mode source for analysis (mutates and returns *src*).
+
+    Mobile / archive sources arrive as .zip files. We unpack once into a
+    project-level cache (``.cache/sources/<src.id>/``) and rewrite
+    ``src.path`` to point at the unpacked directory. Idempotent — a
+    second run with the cache present is a no-op (unzip_archive_sync
+    skips files that already exist with the matching size).
+
+    Disk-image and already-tree sources pass through unchanged.
+    """
+    if src.access_mode != "tree":
+        return src
+    p = Path(src.path)
+    if p.is_dir():
+        return src  # already a directory, nothing to do
+    if not p.is_file():
+        print(f"Warning: source path {src.path} does not exist; leaving as-is.")
+        return src
+    if p.suffix.lower() != ".zip":
+        # Other archive types (tar, 7z, ...) — not handled yet.
+        print(f"Warning: tree-mode source {src.id} is not a .zip "
+                f"({p.suffix}); leaving as-is.")
+        return src
+
+    dest = SOURCE_CACHE_DIR / src.id
+    dest.mkdir(parents=True, exist_ok=True)
+    # Password-protected zips (e.g. CTF artefacts) carry their key in
+    # case.yaml's meta.password — never logged, never persisted.
+    password = (src.meta or {}).get("password")
+    pw_note = " (password from meta)" if password else ""
+    print(f"Unpacking {p.name} → {dest}{pw_note} (idempotent) ...")
+    result = unzip_archive_sync(str(p), str(dest), password=password)
+    first_line = result.split("\n", 1)[0]
+    print("  " + first_line)
+    if first_line.startswith("Error:"):
+        # Surface the multi-line guidance from _do_extract verbatim.
+        for extra in result.split("\n")[1:]:
+            print("  " + extra)
+        print(f"  Source {src.id} stays unanalysable until this is resolved.")
+        # NOTE(review): path stays the .zip; nothing here flags src as unanalysable.
+        return src
+    src.path = str(dest)
+    src.access_mode = "tree"  # no-op: non-tree sources already returned above
+    return src
+
+
 def find_resumable_run() -> Path | None:
    """Find the most recent incomplete run with a saved graph state."""
    if not RUNS_DIR.exists():
@@ -225,22 +343,30 @@ async def async_main() -> None:

    # Initialize evidence graph
    if graph is None:
-        # CLI arg takes priority, otherwise interactive prompt
-        cli_dir = Path(sys.argv[1]) if len(sys.argv) > 1 else None
-        image_path, partition_offset = select_image_interactive(cli_dir)
+        case = resolve_case()
+        # case_info derived from THIS case's meta (case.yaml), not from
+        # config.yaml's legacy `cfreds_hacking_case` block. Without this,
+        # the old CFReDS evidence MD5s would be embedded in reports for
+        # every subsequent unrelated case.
        graph = EvidenceGraph(
-            case_info=config.get("cfreds_hacking_case", {}),
+            case_info=dict(case.meta or {}),
            persist_path=run_dir / "graph_state.json",
-            edge_weights=config.get("hypothesis_edge_weights"),
+            edge_log_lr=config.get("hypothesis_log_lr"),
        )
-        graph.image_path = image_path
-        graph.partition_offset = partition_offset
+        graph.case = case
        graph.extracted_dir = str(run_dir / "extracted")
+        analysable = list_analysable_sources(case)
+        # Prepare every analysable source up front (unzip tree-mode zips,
+        # etc.). Idempotent on cache hits — second run is a no-op.
+        prepared = [prepare_source(s) for s in analysable]
+        # Seed the active source so tools that resolve lazily have a target
+        # before Phase 1 begins; the orchestrator resets it per source.
+        graph.set_active_source(prepared[0])
    else:
        graph._persist_path = run_dir / "graph_state.json"

-    # Register all tools with bound image path
-    register_all_tools(graph.image_path, graph.partition_offset, graph, graph.extracted_dir)
+    # Register all tools — they resolve the active evidence source at call time
+    register_all_tools(graph)

    # Create agent factory
    factory = AgentFactory(llm, graph)