feat(refit): complete S1-S6 — case abstraction, grounding, log-odds, plugins, coref, multi-source

Consolidates the long-running refit work (DESIGN.md as authoritative spec) into a single baseline commit. Six stages landed together:

S1 Case + EvidenceSource abstraction; tools parameterised by source_id (case.py, main.py multi-source bootstrap, .bin extension support).
S2 Grounding gateway in add_phenomenon: verified_facts cite real ToolInvocation ids; substring / normalised match enforced; agent + task scope checked. Phenomenon.description split into verified_facts (grounded) + interpretation (free text). [invocation: inv-xxx] prefix on every wrapped tool result so the LLM can cite.
S3 Confidence as additive log-odds: edge_type → log10(LR) calibration table; commutative updates; supported / refuted thresholds derived from log_odds; hypothesis × evidence matrix view.
S4 iOS plugin: unzip_archive + parse_plist / sqlite_tables / sqlite_query / parse_ios_keychain / read_idevice_info; IOSArtifactAgent; SOURCE_TYPE_AGENTS routing.
S5 Cross-source entity resolution: typed identifiers on Entity, observe_identity gateway, auto coref hypothesis with shared / conflicting strong/weak LR edges, reversible same_as edges, actor_clusters() view.
S6 Android partition probe + AndroidArtifactAgent; MediaAgent with OCR fallback; orchestrator Phase 1 iterates every analysable source; platform-aware get_triage_agent_type; ReportAgent renders actor clusters + per-source breakdown.

142 unit tests / 1 skipped — full coverage of the new gateway, log-odds math, coref hypothesis fall-out, and orchestrator multi-source dispatch.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-21 02:12:10 -10:00
parent 444d58726a
commit 81ade8f7ac
24 changed files with 5137 additions and 244 deletions
--- a/tools/archive.py
+++ b/tools/archive.py
@@ -0,0 +1,156 @@
+"""Archive extraction tools — generic unzip for tree-mode evidence sources.
+
+Mobile extractions (iOS / Android backups), archive sources, and shared
+work products all arrive as .zip files. The forensic agents work on the
+unpacked tree; this module is the single entry point for safely turning
+an archive into a directory.
+
+Stdlib-only. No graph dependency.
+"""
+
+from __future__ import annotations
+
+import logging
+import os
+import zipfile
+from pathlib import Path
+
+logger = logging.getLogger(__name__)
+
+
+def _is_within(base: Path, target: Path) -> bool:
+    """Report whether *target* lies at or below *base* after resolution.
+
+    Both paths go through ``Path.resolve()``, so symlinks are followed
+    before the containment test. An ``OSError`` raised while resolving is
+    treated as "not inside".
+    """
+    try:
+        anchor, candidate = base.resolve(), target.resolve()
+    except OSError:
+        return False
+    # Same outcome as ``candidate.relative_to(anchor)`` succeeding: the two
+    # are equal, or the anchor is one of the candidate's ancestors.
+    return candidate == anchor or anchor in candidate.parents
+
+
+def _is_zip_encrypted(zf: zipfile.ZipFile) -> bool:
+    """Tell whether at least one member of *zf* carries the encrypted flag.
+
+    The check looks at bit 0 of each member's ``flag_bits``.
+    """
+    for member in zf.infolist():
+        if member.flag_bits & 0x1:
+            return True
+    return False
+
+
+def _do_extract(
+    zip_path: str,
+    dest_dir: str,
+    password: str | None = None,
+) -> str:
+    """Shared core for unzip_archive (async) and unzip_archive_sync.
+
+    Pure stdlib + filesystem I/O — no asyncio. Idempotent on rerun (files
+    whose target already exists at the matching size are skipped).
+
+    Args:
+        zip_path: Path of the archive to unpack.
+        dest_dir: Directory to unpack into; created if missing.
+        password: Optional archive password, encoded as UTF-8 for zipfile.
+
+    Returns:
+        A multi-line summary the agent can read directly, or a single
+        ``Error: ...`` line. Problems hit while reading the archive are
+        reported in the return value rather than raised.
+    """
+    import shutil
+    import stat
+
+    zp = Path(zip_path)
+    if not zp.is_file():
+        return f"Error: {zip_path} is not a file."
+
+    dest = Path(dest_dir)
+    dest.mkdir(parents=True, exist_ok=True)
+
+    extracted = 0
+    skipped: list[str] = []
+    total_bytes = 0
+    pwd_bytes = password.encode("utf-8") if password else None
+
+    try:
+        with zipfile.ZipFile(zp, "r") as zf:
+            encrypted = _is_zip_encrypted(zf)
+            if encrypted and pwd_bytes is None:
+                return (
+                    f"Error: {zip_path} is password-protected. "
+                    f"Provide the password via case.yaml's "
+                    f"meta.password on this source, or pass `password=` "
+                    f"explicitly. Stdlib zipfile only supports the legacy "
+                    f"ZipCrypto algorithm — AES-encrypted zips (created by "
+                    f"7-Zip / WinZip) need an external tool like 7z."
+                )
+            for info in zf.infolist():
+                name = info.filename
+                # Block absolute paths and parent-escape attempts up front.
+                if name.startswith(("/", "\\")) or ".." in Path(name).parts:
+                    skipped.append(f"escape: {name}")
+                    continue
+                target = dest / name
+                if not _is_within(dest, target):
+                    skipped.append(f"escape: {name}")
+                    continue
+                # Symlink entries — skip rather than risk traversing out.
+                # The Unix mode lives in the high 16 bits of external_attr;
+                # S_ISLNK compares the whole file-type field, not a bit mask.
+                if stat.S_ISLNK(info.external_attr >> 16):
+                    skipped.append(f"symlink: {name}")
+                    continue
+                if info.is_dir():
+                    target.mkdir(parents=True, exist_ok=True)
+                    continue
+                # Skip if already extracted with matching size (idempotent rerun).
+                if target.exists() and target.stat().st_size == info.file_size:
+                    continue
+                target.parent.mkdir(parents=True, exist_ok=True)
+                try:
+                    with zf.open(info, "r", pwd=pwd_bytes) as src:
+                        try:
+                            with open(target, "wb") as out:
+                                shutil.copyfileobj(src, out, 65536)
+                        except Exception:
+                            # Never leave a truncated member behind: later
+                            # tools would read it as genuine evidence.
+                            try:
+                                target.unlink()
+                            except OSError:
+                                pass
+                            raise
+                except NotImplementedError as e:
+                    # Raised for compression/encryption methods zipfile cannot
+                    # handle, which is how AES-encrypted members surface.
+                    # Must precede RuntimeError (it is a subclass of it).
+                    return (
+                        f"Error: {zip_path} uses a compression or encryption "
+                        f"method stdlib zipfile does not support ({e}). "
+                        f"If the zip is AES-encrypted (7-Zip/WinZip), stdlib "
+                        f"cannot decrypt it — use `7z x -p<pwd> ...` "
+                        f"externally and point the source path at the result."
+                    )
+                except RuntimeError as e:
+                    # zipfile raises RuntimeError for a bad or missing password.
+                    msg = str(e)
+                    if "Bad password" in msg or "password required" in msg:
+                        return (
+                            f"Error: bad or missing password for {zip_path}. "
+                            f"If the zip is AES-encrypted (7-Zip/WinZip), stdlib "
+                            f"cannot decrypt it — use `7z x -p<pwd> ...` "
+                            f"externally and point the source path at the result."
+                        )
+                    raise
+                extracted += 1
+                total_bytes += info.file_size
+    except zipfile.BadZipFile as e:
+        return f"Error: {zip_path} is not a valid zip archive: {e}"
+    except Exception as e:
+        return f"Error extracting {zip_path}: {e}"
+
+    parts = [
+        f"Extracted {extracted} file(s), {total_bytes} bytes, into {dest}",
+    ]
+    if skipped:
+        parts.append(f"Skipped {len(skipped)} unsafe entries:")
+        for s in skipped[:10]:
+            parts.append(f"  - {s}")
+        if len(skipped) > 10:
+            parts.append(f"  ... ({len(skipped) - 10} more)")
+    return "\n".join(parts)
+
+
+async def unzip_archive(
+    zip_path: str, dest_dir: str, password: str | None = None,
+) -> str:
+    """Unpack *zip_path* into *dest_dir* and return the extraction summary.
+
+    Coroutine front-end over ``_do_extract``; the call is made inline and
+    nothing is awaited. Safe to rerun. Entries with absolute paths, ``..``
+    components, or a resolved location outside *dest_dir* (zip-slip) are
+    rejected, and symlink entries are skipped. Encrypted archives need
+    *password* (or ``meta.password`` on the source in case.yaml); stdlib
+    ``zipfile`` only understands the legacy ZipCrypto scheme.
+    """
+    summary = _do_extract(zip_path, dest_dir, password)
+    return summary
+
+
+def unzip_archive_sync(
+    zip_path: str, dest_dir: str, password: str | None = None,
+) -> str:
+    """Blocking twin of :func:`unzip_archive`, for startup-time prepare_source.
+
+    Delegates to the same ``_do_extract`` core with the same arguments, so
+    the result is identical; it exists so callers running before the event
+    loop starts can unpack a zip without creating one.
+    """
+    summary = _do_extract(zip_path, dest_dir, password)
+    return summary
--- a/tools/media.py
+++ b/tools/media.py
@@ -0,0 +1,87 @@
+"""Media plugin — OCR for image evidence.
+
+DESIGN.md §4.7: the model backend (DeepSeek) has no vision, so we MUST run
+OCR locally for any image-bearing evidence. Tesseract via pytesseract is
+the default; if the runtime is missing those packages, the tool returns a
+clear install hint rather than failing silently.
+"""
+
+from __future__ import annotations
+
+import logging
+import os
+from pathlib import Path
+
+logger = logging.getLogger(__name__)
+
+MAX_OUTPUT = 8000
+
+_INSTALL_HINT = (
+    "Error: OCR runtime not available. Install with:\n"
+    "  pip install pytesseract pillow\n"
+    "  sudo apt install tesseract-ocr tesseract-ocr-chi-sim tesseract-ocr-chi-tra\n"
+    "(or the equivalent for your distribution). Then retry."
+)
+
+
+def _has_ocr_runtime() -> tuple[bool, str]:
+    """Probe for the OCR stack and return ``(available, reason)``.
+
+    *reason* is the empty string when everything is present; otherwise it
+    names the Python package that failed to import or notes that the
+    ``tesseract`` executable is not on PATH.
+    """
+    import shutil
+
+    try:
+        import pytesseract  # noqa: F401
+        from PIL import Image  # noqa: F401
+    except ImportError as exc:
+        return False, f"missing python package: {exc.name}"
+    # The Python wrappers are useless without the tesseract binary itself.
+    binary = shutil.which("tesseract")
+    if binary is not None:
+        return True, ""
+    return False, "tesseract binary not on PATH"
+
+
+async def ocr_image(file_path: str, lang: str = "eng+chi_sim+chi_tra") -> str:
+    """Extract text from an image via tesseract.
+
+    *lang* defaults to English + Simplified + Traditional Chinese, matching
+    the multi-language artefacts the current case involves. Pass a single
+    language code (e.g. ``"eng"``) to skip language packs that aren't
+    installed.
+
+    Returns:
+        A one-line header followed by the recognised text, clipped so the
+        header plus text stay within ``MAX_OUTPUT`` characters (a
+        ``[truncated]`` marker follows the cut). A missing file, a missing
+        OCR runtime, an unreadable image and tesseract failures all come
+        back as ``Error ...`` strings.
+    """
+    p = Path(file_path)
+    if not p.is_file():
+        return f"Error: {file_path} is not a file."
+    available, reason = _has_ocr_runtime()
+    if not available:
+        return f"{_INSTALL_HINT}\n[detail: {reason}]"
+
+    import pytesseract
+    from PIL import Image
+
+    try:
+        img = Image.open(p)
+    except Exception as e:
+        return f"Error: could not open image {file_path}: {e}"
+
+    try:
+        # Image.open keeps the underlying file handle; the context manager
+        # releases it once OCR has finished (or failed).
+        with img:
+            text = pytesseract.image_to_string(img, lang=lang)
+    except pytesseract.TesseractError as e:
+        msg = str(e)
+        if "Failed loading language" in msg or "Error opening data file" in msg:
+            return (
+                f"Error: tesseract is installed but missing language pack(s) for {lang!r}. "
+                f"Install the language data (e.g. tesseract-ocr-chi-sim) or pass a "
+                f"different `lang`. Detail: {msg}"
+            )
+        return f"Error running tesseract: {msg}"
+    except Exception as e:
+        return f"Error during OCR: {e}"
+
+    size = p.stat().st_size
+    header = (
+        f"ocr: {file_path} ({size} bytes, lang={lang}, "
+        f"{len(text.splitlines())} line(s))\n"
+    )
+    # Clamp at zero: a negative budget would make the slice count from the
+    # end of the text instead of cutting it short.
+    budget = max(0, MAX_OUTPUT - len(header))
+    if len(text) > budget:
+        body = text[:budget] + "\n[truncated]"
+    else:
+        body = text
+    return header + body
--- a/tools/mobile_android.py
+++ b/tools/mobile_android.py
@@ -0,0 +1,160 @@
+"""Android plugin tools — partition survey + sector translation.
+
+DESIGN.md §4.7 (Android, "安卓"): ``mmls`` partitions → per-partition image-mode source;
+``fsstat`` per partition to classify ext4/F2FS/raw/encrypted. The shared TSK
+toolchain already handles ext4/F2FS reads, so once the agent picks a partition
+offset the standard list_directory / extract_file / search_strings tools work.
+
+Quirk: Samsung dumps (e.g. ``blk0_sda.bin``) use 4096-byte image sectors but
+TSK tool flags accept 512-byte sectors by default. ``probe_android_partitions``
+emits BOTH unit systems so the agent can plug the right ``partition_offset``
+value into ``set_active_partition``.
+"""
+
+from __future__ import annotations
+
+import asyncio
+import logging
+import re
+from pathlib import Path
+
+logger = logging.getLogger(__name__)
+
+MAX_OUTPUT = 8000
+
+# Partitions worth flagging when we encounter them — informs the agent's
+# strategy. Not exhaustive; just opinionated hints.
+# Keys are upper-case partition names; each value is the free-text hint
+# printed in the last column of the partition survey table.
+_PARTITION_HINTS: dict[str, str] = {
+    "EFS":      "modem firmware area; often contains IMEI / MAC / serial",
+    "PARAM":    "boot parameters; cmdline + flags",
+    "BOOT":     "kernel + initramfs (raw image)",
+    "RECOVERY": "recovery image (raw)",
+    "SYSTEM":   "Android /system — read-only OS partition (ext4)",
+    "CACHE":    "downloaded OTA payloads; usually transient",
+    "USERDATA": "/data — user apps, dbs, accounts; FBE-encrypted on modern devices",
+    "PERSISTENT": "Samsung persistent partition; carrier/device flags",
+    "STEADY":   "Samsung steady-state config",
+    "HIDDEN":   "Samsung hidden partition; check before assuming empty",
+    "CP_DEBUG": "modem debug logs",
+    "TOMBSTONES": "userland crash dumps",
+}
+
+
+def _parse_mmls_with_unit(output: str) -> tuple[int, list[dict]]:
+    """Turn raw mmls text into ``(sector_size_bytes, partitions)``.
+
+    The sector size comes from the ``Units are in N-byte sectors`` line and
+    falls back to 512 when that line is absent. Each table row becomes a
+    dict holding the slot, the start/end/length in the image's native
+    sector units, and the description; ``Meta`` rows and rows whose slot
+    starts with ``---`` are left out.
+    """
+    unit_match = re.search(r"Units are in (\d+)-byte sectors", output)
+    sector_size = int(unit_match.group(1)) if unit_match else 512
+
+    row_re = re.compile(r"\s*(\d{3}):\s+(\S+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(.*)")
+    partitions: list[dict] = []
+    for row in output.splitlines():
+        hit = row_re.match(row)
+        if hit is None:
+            continue
+        slot, start, end, length, desc = hit.groups()[1:]
+        if slot != "Meta" and not slot.startswith("---"):
+            partitions.append({
+                "slot": slot,
+                "start_native": int(start),
+                "end_native": int(end),
+                "length_native": int(length),
+                "description": desc.strip(),
+            })
+    return sector_size, partitions
+
+
+async def _run(cmd: list[str], timeout: int = 30) -> tuple[int, str, str]:
+    """Run *cmd* without a shell and return ``(returncode, stdout, stderr)``.
+
+    Output is decoded as UTF-8 with undecodable bytes replaced. Two
+    failure modes are folded into the return value instead of raising:
+
+    * the program cannot be started (e.g. not installed) → rc 127 with the
+      OS error in stderr;
+    * the program exceeds *timeout* seconds → it is killed and rc 124 is
+      returned.
+    """
+    try:
+        proc = await asyncio.create_subprocess_exec(
+            *cmd,
+            stdout=asyncio.subprocess.PIPE,
+            stderr=asyncio.subprocess.PIPE,
+        )
+    except OSError as e:
+        # Missing or non-executable binary: report it like any other failed
+        # command so callers' ``rc != 0`` handling applies.
+        return 127, "", f"could not start {cmd[0]}: {e}"
+    try:
+        stdout, stderr = await asyncio.wait_for(proc.communicate(), timeout=timeout)
+    except asyncio.TimeoutError:
+        try:
+            proc.kill()
+        except ProcessLookupError:
+            # The process exited between the timeout firing and the kill.
+            pass
+        # Reap the child so it does not linger as a zombie.
+        await proc.wait()
+        return 124, "", f"timeout after {timeout}s"
+    return proc.returncode or 0, stdout.decode("utf-8", "replace"), stderr.decode("utf-8", "replace")
+
+
+_FS_TYPE_RE = re.compile(r"File System Type:\s*(\S+)", re.IGNORECASE)
+
+
+async def _classify_partition(image_path: str, sector_offset_512: int) -> str:
+    """Ask fsstat what file system sits at *sector_offset_512* in the image.
+
+    Returns the token fsstat prints after ``File System Type:`` (for
+    example ``Ext4``), or ``'unknown'`` when fsstat exits non-zero or its
+    output has no such line. An unrecognised file system typically means
+    a raw image (BOOT/RECOVERY/RADIO/…) or encrypted data (modern userdata
+    under FBE).
+    """
+    fsstat_cmd = ["fsstat", "-o", str(sector_offset_512), image_path]
+    rc, out, _err = await _run(fsstat_cmd, timeout=15)
+    found = _FS_TYPE_RE.search(out) if rc == 0 else None
+    return found.group(1) if found else "unknown"
+
+
+async def probe_android_partitions(image_path: str) -> str:
+    """Survey every partition on an Android disk dump and return a table.
+
+    The agent reads this once to plan its work: which partitions are
+    Ext4/F2FS (use TSK), which are raw (extract image / strings only),
+    which are encrypted (skip until decrypted).
+
+    Returns:
+        A markdown table with one row per non-meta partition: slot, name,
+        start in native and in 512-byte sectors, size, fsstat's file
+        system type and a hint. An ``Error: ...`` line is returned when
+        *image_path* is not a file or mmls fails. Output is capped at
+        ``MAX_OUTPUT`` characters.
+    """
+    p = Path(image_path)
+    if not p.is_file():
+        return f"Error: {image_path} is not a file."
+
+    rc, out, err = await _run(["mmls", str(p)], timeout=30)
+    if rc != 0:
+        return f"Error: mmls failed (rc={rc}): {err.strip() or out.strip()}"
+
+    sector_size, parts = _parse_mmls_with_unit(out)
+    if not parts:
+        return f"No partitions detected in {image_path}."
+
+    lines = [
+        f"Android partition survey: {image_path}",
+        f"  mmls reports {sector_size}-byte sectors (TSK -o expects 512-byte sectors)",
+        f"  {len(parts)} data partitions",
+        "",
+        "| slot | name | start (native) | start (512-sector) | size | fs_type | hint |",
+        "|---|---|---:|---:|---|---|---|",
+    ]
+    for prt in parts:
+        # Translate the image-native start into the 512-byte units that
+        # the TSK ``-o`` flag takes.
+        sector_512 = prt["start_native"] * sector_size // 512
+        bytes_size = prt["length_native"] * sector_size
+        # human-readable size
+        if bytes_size >= 1 << 30:
+            size_h = f"{bytes_size / (1 << 30):.1f} GB"
+        elif bytes_size >= 1 << 20:
+            size_h = f"{bytes_size / (1 << 20):.1f} MB"
+        else:
+            size_h = f"{bytes_size // 1024} KB"
+        fs_type = await _classify_partition(str(p), sector_512)
+        # Try to extract a friendly partition name from the description
+        # (mmls description often includes the partition name uppercase).
+        name_match = re.search(r"[A-Z][A-Z0-9_]{2,}", prt["description"])
+        pname = name_match.group(0) if name_match else prt["description"][:20]
+        # The hint table is keyed in upper case; normalise so lower-case
+        # names such as "userdata" still find their hint.
+        hint = _PARTITION_HINTS.get(pname.strip().upper(), "")
+        lines.append(
+            f"| {prt['slot']} | {pname} | {prt['start_native']} | "
+            f"{sector_512} | {size_h} | {fs_type} | {hint} |"
+        )
+
+    body = "\n".join(lines)
+    if len(body) > MAX_OUTPUT:
+        body = body[:MAX_OUTPUT] + "\n\n[truncated]"
+    return body
--- a/tools/mobile_ios.py
+++ b/tools/mobile_ios.py
@@ -0,0 +1,274 @@
+"""iOS extraction parsers — plist / sqlite / keychain / iDevice info.
+
+DESIGN.md §4.7 iOS plugin tools. All tree-mode, path-based — no Sleuth
+Kit, no graph dependency. Stdlib + sqlite3 only.
+
+iOS extractions typically arrive as a zip containing domain-rooted trees
+(HomeDomain, AppDomain, etc.) with a flat ``iDevice_info.txt`` summary,
+binary/XML plists, and several SQLite databases (sms.db, AddressBook,
+keychain-2.db, app-specific stores like WhatsApp's ChatStorage.sqlite).
+"""
+
+from __future__ import annotations
+
+import asyncio
+import json
+import logging
+import os
+import plistlib
+import re
+import sqlite3
+from pathlib import Path
+
+logger = logging.getLogger(__name__)
+
+# Output cap (chars) — keeps a single tool result under the LLM context budget.
+MAX_OUTPUT = 8000
+
+
+def _trunc(text: str, limit: int = MAX_OUTPUT) -> str:
+    """Clip *text* to *limit* characters, appending a note with the full length.
+
+    Text that already fits is returned unchanged.
+    """
+    total = len(text)
+    if total > limit:
+        return f"{text[:limit]}\n\n[Output truncated: {total} chars total]"
+    return text
+
+
+# ---------------------------------------------------------------------------
+# plist
+# ---------------------------------------------------------------------------
+
+def _to_jsonable(obj):
+    """Recursively convert a parsed plist value into JSON-friendly data.
+
+    Dicts get string keys, lists and tuples become lists, datetimes become
+    ISO-8601 strings, and bytes become a hex dict (full hex up to 64
+    bytes, otherwise a 64-byte preview plus the total length). Any other
+    value is returned as-is.
+    """
+    import datetime
+
+    if isinstance(obj, dict):
+        return {str(key): _to_jsonable(val) for key, val in obj.items()}
+    if isinstance(obj, (list, tuple)):
+        return [_to_jsonable(item) for item in obj]
+    if isinstance(obj, datetime.datetime):
+        return obj.isoformat()
+    if not isinstance(obj, bytes):
+        return obj
+    if len(obj) > 64:
+        return {"_bytes_hex_preview": obj[:64].hex(), "_total_bytes": len(obj)}
+    return {"_bytes_hex": obj.hex()}
+
+
+async def parse_plist(file_path: str) -> str:
+    """Load a .plist file and render its contents as indented JSON.
+
+    ``plistlib.load`` reads both the XML and the binary format. The result
+    is a header line with the path and size followed by the JSON, clipped
+    by ``_trunc``. A missing file or a parse failure is returned as an
+    ``Error ...`` string.
+    """
+    plist_file = Path(file_path)
+    if not plist_file.is_file():
+        return f"Error: {file_path} is not a file."
+    try:
+        with plist_file.open("rb") as handle:
+            parsed = plistlib.load(handle)
+    except plistlib.InvalidFileException as e:
+        return f"Error: {file_path} is not a valid plist ({e})"
+    except Exception as e:
+        return f"Error parsing plist {file_path}: {e}"
+
+    as_json = json.dumps(
+        _to_jsonable(parsed), ensure_ascii=False, indent=2, default=str,
+    )
+    size = plist_file.stat().st_size
+    return f"plist: {file_path} ({size} bytes)\n" + _trunc(as_json)
+
+
+# ---------------------------------------------------------------------------
+# sqlite
+# ---------------------------------------------------------------------------
+
+_SELECT_RE = re.compile(r"^\s*SELECT\b", re.IGNORECASE)
+
+
+async def sqlite_tables(db_path: str) -> str:
+    """List user tables in a sqlite file with row counts and column names.
+
+    The database is opened read-only. Tables whose name starts with
+    ``sqlite_`` are left out.
+
+    Returns:
+        One line per table (``name: N row(s); cols: ...``) under a header,
+        clipped by ``_trunc``. A missing file, a file that cannot be opened
+        and a file that is not a readable database are all reported as
+        ``Error ...`` strings rather than raised.
+    """
+    p = Path(db_path)
+    if not p.is_file():
+        return f"Error: {db_path} is not a file."
+    # as_uri() percent-encodes characters such as '?', '#' and '%' that
+    # would otherwise be read as URI syntax and corrupt the file name.
+    uri = f"{p.resolve().as_uri()}?mode=ro"
+    try:
+        conn = sqlite3.connect(uri, uri=True)
+    except sqlite3.Error as e:
+        return f"Error opening {db_path} (read-only): {e}"
+    try:
+        cur = conn.cursor()
+        try:
+            cur.execute(
+                "SELECT name FROM sqlite_master "
+                "WHERE type='table' AND name NOT LIKE 'sqlite_%' ORDER BY name"
+            )
+            tables = [r[0] for r in cur.fetchall()]
+        except sqlite3.DatabaseError as e:
+            # The file content is only read on the first query, so a
+            # non-database or corrupt file fails here, not at connect().
+            return f"Error reading {db_path}: {e}"
+        if not tables:
+            return f"No user tables in {db_path}."
+        lines = [f"sqlite: {db_path} ({len(tables)} tables)"]
+        for name in tables:
+            # Table names come from the evidence file; double any embedded
+            # quote so the identifier stays a single quoted token.
+            quoted = '"' + name.replace('"', '""') + '"'
+            try:
+                cur.execute(f"SELECT COUNT(*) FROM {quoted}")
+                count = cur.fetchone()[0]
+            except sqlite3.DatabaseError as e:
+                count = f"(count failed: {e})"
+            try:
+                cur.execute(f"PRAGMA table_info({quoted})")
+                cols = [r[1] for r in cur.fetchall()]
+            except sqlite3.DatabaseError:
+                cols = []
+            lines.append(f"  {name}: {count} row(s); cols: {', '.join(cols)}")
+        return _trunc("\n".join(lines))
+    finally:
+        conn.close()
+
+
+async def sqlite_query(
+    db_path: str,
+    query: str,
+    max_rows: int = 100,
+) -> str:
+    """Run a single read-only SELECT against a sqlite file.
+
+    Multi-statement queries and anything other than a SELECT are rejected
+    (we open the database in read-only mode anyway, so writes would fail
+    too — but the explicit check keeps the agent honest). Trailing
+    semicolons and whitespace are tolerated.
+
+    Args:
+        db_path: Path of the sqlite file.
+        query: The SELECT statement to run.
+        max_rows: Upper bound on returned rows; values below 1 are raised
+            to 1.
+
+    Returns:
+        A header, the column names and one `` | ``-joined line per row
+        (bytes shown as hex), clipped by ``_trunc``; or an ``Error ...``
+        string.
+    """
+    statement = query.strip()
+    if not _SELECT_RE.match(statement):
+        return "Error: only single SELECT statements are allowed."
+    # Drop the statement terminator together with any whitespace around
+    # it, so "SELECT 1;\n" is not mistaken for two statements.
+    statement = statement.rstrip("; \t\r\n")
+    if ";" in statement:
+        return "Error: multi-statement queries are not allowed."
+
+    p = Path(db_path)
+    if not p.is_file():
+        return f"Error: {db_path} is not a file."
+    # as_uri() percent-encodes characters such as '?', '#' and '%' that
+    # would otherwise be read as URI syntax and corrupt the file name.
+    uri = f"{p.resolve().as_uri()}?mode=ro"
+    try:
+        conn = sqlite3.connect(uri, uri=True)
+    except sqlite3.Error as e:
+        return f"Error opening {db_path} (read-only): {e}"
+
+    try:
+        cur = conn.cursor()
+        limit = max(1, int(max_rows))
+        try:
+            cur.execute(statement)
+            cols = [d[0] for d in cur.description] if cur.description else []
+            # Fetching can fail too (corrupt pages, undecodable text).
+            rows = cur.fetchmany(limit)
+        except sqlite3.Error as e:
+            return f"Error executing query: {e}"
+        lines = [
+            f"sqlite query: {db_path}",
+            f"columns: {cols}",
+            f"rows ({len(rows)}, capped at {limit}):",
+        ]
+        for row in rows:
+            rendered = [
+                (v.hex() if isinstance(v, bytes) else str(v))
+                for v in row
+            ]
+            lines.append("  " + " | ".join(rendered))
+        return _trunc("\n".join(lines))
+    finally:
+        conn.close()
+
+
+# ---------------------------------------------------------------------------
+# iOS keychain (keychain-2.db)
+# ---------------------------------------------------------------------------
+
+# Standard iOS keychain tables. genp = generic passwords, inet = internet
+# passwords, cert = certificates, keys = key material. Forensic extractions
+# of locked keychains have ``data`` columns NULL but accounting metadata
+# (agrp, acct, svce) intact — already useful for attribution work.
+_KEYCHAIN_TABLES = ("genp", "inet", "cert", "keys")
+
+
+async def parse_ios_keychain(keychain_root: str) -> str:
+    """Locate and summarize iOS keychain entries under *keychain_root*.
+
+    *keychain_root* may be a path to ``keychain-2.db`` directly or to a
+    directory that contains it (e.g. ``.../var/keychains``).
+
+    Returns:
+        For each recognised table present: its row count and up to 30 rows
+        of the accounting columns (or the first five columns when none of
+        those exist), clipped by ``_trunc``. A missing keychain, an
+        unopenable file and an unreadable database are reported as text
+        rather than raised.
+    """
+    root = Path(keychain_root)
+    db: Path | None = None
+    if root.is_file() and root.name == "keychain-2.db":
+        db = root
+    elif root.is_dir():
+        candidate = root / "keychain-2.db"
+        if candidate.is_file():
+            db = candidate
+        else:
+            # Fall back to a recursive search; the first hit wins.
+            for found in root.rglob("keychain-2.db"):
+                db = found
+                break
+    if db is None:
+        return f"No keychain-2.db found under {keychain_root}."
+
+    # as_uri() percent-encodes characters such as '?', '#' and '%' that
+    # would otherwise be read as URI syntax and corrupt the file name.
+    uri = f"{db.resolve().as_uri()}?mode=ro"
+    try:
+        conn = sqlite3.connect(uri, uri=True)
+    except sqlite3.Error as e:
+        return f"Error opening {db}: {e}"
+
+    try:
+        cur = conn.cursor()
+        try:
+            cur.execute(
+                "SELECT name FROM sqlite_master "
+                "WHERE type='table' AND name IN ({})".format(
+                    ",".join("?" * len(_KEYCHAIN_TABLES))
+                ),
+                _KEYCHAIN_TABLES,
+            )
+            present = [r[0] for r in cur.fetchall()]
+        except sqlite3.DatabaseError as e:
+            # The file content is only read on the first query, so a
+            # non-database or corrupt file fails here, not at connect().
+            return f"Error reading {db}: {e}"
+        if not present:
+            return f"keychain-2.db at {db} has no recognised tables."
+
+        lines = [f"keychain: {db}"]
+        for name in present:
+            try:
+                cur.execute(f"SELECT COUNT(*) FROM \"{name}\"")
+                count = cur.fetchone()[0]
+                lines.append(f"\n[{name}] {count} row(s)")
+                cur.execute(f"PRAGMA table_info(\"{name}\")")
+                cols = [r[1] for r in cur.fetchall()]
+                # Pick a useful subset of accounting columns when present.
+                preferred = [
+                    c for c in ("agrp", "acct", "svce", "labl", "desc", "atyp", "srvr")
+                    if c in cols
+                ]
+                if not preferred:
+                    preferred = cols[:5]
+                sel = ", ".join(f'"{c}"' for c in preferred)
+                cur.execute(f"SELECT {sel} FROM \"{name}\" LIMIT 30")
+                for row in cur.fetchall():
+                    lines.append("  " + " | ".join(
+                        (v.hex() if isinstance(v, bytes) else str(v))
+                        for v in row
+                    ))
+            except sqlite3.DatabaseError as e:
+                # One damaged table should not hide the others.
+                lines.append(f"\n[{name}] unreadable: {e}")
+        return _trunc("\n".join(lines))
+    finally:
+        conn.close()
+
+
+# ---------------------------------------------------------------------------
+# iDevice_info.txt
+# ---------------------------------------------------------------------------
+
+async def read_idevice_info(file_path: str, max_chars: int = 6000) -> str:
+    """Read the standard iDevice_info.txt summary at the root of an iOS extraction.
+
+    The file is a flat ``Key: value`` dump from libimobiledevice / native
+    extraction tools. We surface the first *max_chars* of content verbatim
+    — the agent can search/extract specific keys via search_text_file if
+    the head isn't enough.
+
+    Args:
+        file_path: The info file itself, or a directory holding
+            ``iDevice_info.txt``.
+        max_chars: Maximum number of decoded characters to return;
+            negative values are treated as 0.
+
+    Returns:
+        A header line (path and byte size) followed by the content, with a
+        ``[Truncated: ...]`` note only when content was actually cut; or
+        an ``Error ...`` string.
+    """
+    p = Path(file_path)
+    if p.is_dir():
+        # Be helpful: if the agent passed the extraction root, find the file.
+        candidate = p / "iDevice_info.txt"
+        if candidate.is_file():
+            p = candidate
+    if not p.is_file():
+        return f"Error: {file_path} is not a file."
+    limit = max(0, int(max_chars))
+    try:
+        with open(p, "r", encoding="utf-8", errors="replace") as f:
+            # Read one character past the limit: that is the only reliable
+            # truncation signal, because the byte size on disk overstates
+            # the character count for multi-byte UTF-8 text.
+            content = f.read(limit + 1)
+        size = p.stat().st_size
+        header = f"iDevice_info: {p} ({size} bytes)\n"
+        if len(content) > limit:
+            content = content[:limit] + (
+                f"\n\n[Truncated: file is {size} bytes, showing first {limit} chars]"
+            )
+        return header + content
+    except Exception as e:
+        return f"Error reading {file_path}: {e}"
--- a/tools/parsers.py
+++ b/tools/parsers.py
@@ -215,20 +215,178 @@ async def parse_prefetch(file_path: str) -> str:
        return f"[Error parsing Prefetch: {e}]"


-async def list_extracted_dir(dir_path: str) -> str:
-    """List files in an extracted directory."""
+async def list_extracted_dir(dir_path: str, max_entries: int = 200) -> str:
+    """Smart summary of a (potentially huge) extracted tree.
+
+    Earlier versions dumped up to 200 random entries then truncated — that
+    leaves the agent blind on 10k+-file iOS extractions. The new layout
+    returns a compact summary that scales: total counts, extension
+    breakdown, top-level directories with their sizes, and the largest
+    files. For targeted lookups (e.g. find every ``*.sqlite`` under the
+    tree) the agent should use ``find_files`` instead.
+
+    Args:
+        dir_path: Root of the extracted tree to summarise.
+        max_entries: Not read anywhere in the body below.
+
+    Returns:
+        The multi-line summary, or a bracketed ``[Error ...]`` string when
+        *dir_path* is not a directory or the walk raises.
+    """
+    if not os.path.isdir(dir_path):
+        return f"[Error: {dir_path} is not a directory]"
+
    try:
-        entries = []
-        for root, dirs, files in os.walk(dir_path):
+        total_files = 0
+        total_bytes = 0
+        ext_counts: dict[str, int] = {}
+        ext_bytes: dict[str, int] = {}
+        top_level_dirs: dict[str, dict] = {}
+        biggest: list[tuple[int, str]] = []   # (size, relpath)
+
+        dir_path_abs = os.path.abspath(dir_path)
+        for root, dirs, files in os.walk(dir_path_abs):
+            # Track top-level directory aggregates (cheap; no per-entry cost
+            # beyond the walk we're already doing).
+            rel_root = os.path.relpath(root, dir_path_abs)
+            if rel_root == ".":
+                top_dirs = {d: {"files": 0, "bytes": 0} for d in dirs}
+                top_level_dirs.update(top_dirs)
+                # Files sitting directly in the root are counted in the
+                # totals but attributed to no top-level directory.
+                top_key = None
+            else:
+                top_key = rel_root.split(os.sep, 1)[0]
+                if top_key not in top_level_dirs:
+                    top_level_dirs[top_key] = {"files": 0, "bytes": 0}
+
            for f in files:
                full = os.path.join(root, f)
-                rel = os.path.relpath(full, dir_path)
-                size = os.path.getsize(full)
-                entries.append(f"  {rel} ({size} bytes)")
-            if len(entries) > 200:
-                entries.append(f"  ... (truncated)")
-                break
+                try:
+                    size = os.path.getsize(full)
+                except OSError:
+                    # Entries whose size cannot be read are left out of
+                    # every count.
+                    continue
+                total_files += 1
+                total_bytes += size
+                ext = os.path.splitext(f)[1].lower() or "(no ext)"
+                ext_counts[ext] = ext_counts.get(ext, 0) + 1
+                ext_bytes[ext] = ext_bytes.get(ext, 0) + size
+                if top_key is not None:
+                    top_level_dirs[top_key]["files"] += 1
+                    top_level_dirs[top_key]["bytes"] += size
+                # Maintain a top-10 largest list cheaply (bounded insertion).
+                if len(biggest) < 10:
+                    biggest.append((size, os.path.relpath(full, dir_path_abs)))
+                    biggest.sort(reverse=True)
+                elif size > biggest[-1][0]:
+                    biggest[-1] = (size, os.path.relpath(full, dir_path_abs))
+                    biggest.sort(reverse=True)

-        return f"Directory: {dir_path}\nFiles ({len(entries)}):\n" + "\n".join(entries)
+        # Render a byte count with 1024-based units, from B up to TB.
+        def _human(n: int) -> str:
+            for unit in ("B", "KB", "MB", "GB"):
+                if n < 1024:
+                    return f"{n:.1f}{unit}" if unit != "B" else f"{n}B"
+                n /= 1024
+            return f"{n:.1f}TB"
+
+        lines = [
+            f"Directory: {dir_path}",
+            f"  Total: {total_files} file(s), {_human(total_bytes)}",
+        ]
+
+        # Top-level directory layout (immediate children, sorted by file count).
+        if top_level_dirs:
+            lines.append(f"\nTop-level layout ({len(top_level_dirs)} dirs at root):")
+            sorted_tlds = sorted(
+                top_level_dirs.items(), key=lambda kv: -kv[1]["files"],
+            )[:15]
+            for d, stats in sorted_tlds:
+                lines.append(
+                    f"  {d}/  ({stats['files']} files, {_human(stats['bytes'])})"
+                )
+            if len(top_level_dirs) > 15:
+                lines.append(f"  ... ({len(top_level_dirs) - 15} more top-level dirs)")
+
+        # Extension breakdown.
+        if ext_counts:
+            lines.append(f"\nExtension breakdown (top 15):")
+            for ext, count in sorted(ext_counts.items(), key=lambda kv: -kv[1])[:15]:
+                lines.append(
+                    f"  {ext}: {count} files, {_human(ext_bytes.get(ext, 0))}"
+                )
+
+        # Largest files (often the highest-value forensic targets).
+        if biggest:
+            lines.append("\nLargest files:")
+            for size, rel in biggest:
+                lines.append(f"  {rel} ({_human(size)})")
+
+        lines.append(
+            f"\nNext step: call find_files with a pattern like "
+            f"'**/*.plist' or '**/keychain-2.db' to locate specific artefacts."
+        )
+
+        return "\n".join(lines)
    except Exception as e:
        return f"[Error listing {dir_path}: {e}]"
+
+
+async def find_files(
+    root: str,
+    pattern: str,
+    max_results: int = 500,
+) -> str:
+    """Recursively find files under *root* whose path matches *pattern*.
+
+    Uses fnmatch-style globs against the *full relative path* and against
+    the bare file name; ``**`` is treated as "any number of path segments,
+    including none" (so ``**/*.plist`` finds every plist no matter how
+    deep, and also one sitting directly in *root*). Examples:
+
+      - ``**/sms.db``               — iOS SMS database
+      - ``**/keychain-2.db``        — iOS keychain
+      - ``**/ChatStorage.sqlite``   — WhatsApp app store
+      - ``HomeDomain/Library/**``   — anchor at a known iOS domain root
+
+    Brace alternation such as ``*.{plist,sqlite,db}`` is NOT supported by
+    fnmatch; issue one call per extension instead.
+
+    Results are sorted by size descending — the biggest hits usually
+    matter most. Capped at *max_results* (values below 1 are raised to 1)
+    to keep the LLM context bounded; the walk itself stops after four
+    times that many matches.
+
+    Returns:
+        A header, a match count and one ``relpath (N bytes)`` line per
+        hit, or a bracketed ``[Error ...]`` string.
+    """
+    import fnmatch
+
+    if not os.path.isdir(root):
+        return f"[Error: {root} is not a directory]"
+
+    root_abs = os.path.abspath(root)
+    limit = max(1, int(max_results))
+    # Convert ``**`` (any-depth) to fnmatch's ``*`` (any chars including /).
+    # fnmatch doesn't natively distinguish segment vs path; expanding ``**``
+    # to ``*`` and letting fnmatch match the full relpath is good enough for
+    # forensic lookups.
+    fn_pattern = pattern.replace("**", "*")
+    # ``*/name`` demands at least one directory level, so also try the
+    # pattern with each ``**/`` dropped to cover the zero-depth case.
+    zero_depth = pattern.replace("**/", "").replace("**", "*")
+    patterns = tuple(dict.fromkeys((fn_pattern, zero_depth)))
+
+    hits: list[tuple[int, str]] = []
+    truncated = False
+    try:
+        for dirpath, _dirs, files in os.walk(root_abs):
+            for f in files:
+                full = os.path.join(dirpath, f)
+                rel = os.path.relpath(full, root_abs)
+                if any(
+                    fnmatch.fnmatch(rel, pat) or fnmatch.fnmatch(f, pat)
+                    for pat in patterns
+                ):
+                    try:
+                        size = os.path.getsize(full)
+                    except OSError:
+                        size = 0
+                    hits.append((size, rel))
+                    if len(hits) >= limit * 4:
+                        # Hard upper bound to keep the walk cheap on huge trees.
+                        truncated = True
+                        break
+            if truncated:
+                break
+    except Exception as e:
+        return f"[Error searching {root}: {e}]"
+
+    hits.sort(reverse=True)
+    if len(hits) > limit:
+        truncated = True
+        hits = hits[:limit]
+
+    lines = [
+        f"find_files: pattern={pattern!r} under {root}",
+        f"  matches: {len(hits)}" + (" (truncated)" if truncated else ""),
+    ]
+    if not hits:
+        lines.append("  (no matches)")
+    else:
+        for size, rel in hits:
+            lines.append(f"  {rel} ({size} bytes)")
+    return "\n".join(lines)