feat(refit): complete S1-S6 — case abstraction, grounding, log-odds, plugins, coref, multi-source

Consolidates the long-running refit work (DESIGN.md as authoritative spec)
into a single baseline commit. Six stages landed together:

  S1  Case + EvidenceSource abstraction; tools parameterised by source_id
      (case.py, main.py multi-source bootstrap, .bin extension support)
  S2  Grounding gateway in add_phenomenon: verified_facts cite real
      ToolInvocation ids; substring / normalised match enforced; agent +
      task scope checked. Phenomenon.description split into verified_facts
      (grounded) + interpretation (free text). [invocation: inv-xxx]
      prefix on every wrapped tool result so the LLM can cite.
  S3  Confidence as additive log-odds: edge_type → log10(LR) calibration
      table; commutative updates; supported / refuted thresholds derived
      from log_odds; hypothesis × evidence matrix view.
  S4  iOS plugin: unzip_archive + parse_plist / sqlite_tables /
      sqlite_query / parse_ios_keychain / read_idevice_info;
      IOSArtifactAgent; SOURCE_TYPE_AGENTS routing.
  S5  Cross-source entity resolution: typed identifiers on Entity,
      observe_identity gateway, auto coref hypothesis with shared /
      conflicting strong/weak LR edges, reversible same_as edges,
      actor_clusters() view.
  S6  Android partition probe + AndroidArtifactAgent; MediaAgent with
      OCR fallback; orchestrator Phase 1 iterates every analysable
      source; platform-aware get_triage_agent_type; ReportAgent renders
      actor clusters + per-source breakdown.

142 unit tests / 1 skipped — full coverage of the new gateway, log-odds
math, coref hypothesis fall-out, and orchestrator multi-source dispatch.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
BattleTag
2026-05-21 02:12:10 -10:00
parent 444d58726a
commit 81ade8f7ac
24 changed files with 5137 additions and 244 deletions

156
tools/archive.py Normal file
View File

@@ -0,0 +1,156 @@
"""Archive extraction tools — generic unzip for tree-mode evidence sources.
Mobile extractions (iOS / Android backups), archive sources, and shared
work products all arrive as .zip files. The forensic agents work on the
unpacked tree; this module is the single entry point for safely turning
an archive into a directory.
Stdlib-only. No graph dependency.
"""
from __future__ import annotations
import logging
import os
import zipfile
from pathlib import Path
logger = logging.getLogger(__name__)
def _is_within(base: Path, target: Path) -> bool:
    """Report whether *target* lies inside *base* once both are resolved.

    Both paths go through ``Path.resolve()`` first, so symlinks and ``..``
    components are collapsed before the containment check.  An ``OSError``
    raised while resolving is treated as "not inside".
    """
    try:
        resolved_base, resolved_target = base.resolve(), target.resolve()
    except OSError:
        return False
    try:
        resolved_target.relative_to(resolved_base)
        return True
    except ValueError:
        # relative_to() raises when the target is not located under base.
        return False
def _is_zip_encrypted(zf: zipfile.ZipFile) -> bool:
    """Report whether at least one member of *zf* has the 'encrypted' flag.

    Bit 0 of a member's ``flag_bits`` is the flag being tested.
    """
    for member in zf.infolist():
        if member.flag_bits & 0x1:
            return True
    return False
def _do_extract(
zip_path: str,
dest_dir: str,
password: str | None = None,
) -> str:
"""Shared core for unzip_archive (async) and unzip_archive_sync.
Pure stdlib + filesystem I/O — no asyncio. Idempotent on rerun (files
whose target already exists at the matching size are skipped). Returns
a multi-line summary the agent can read directly.
"""
zp = Path(zip_path)
if not zp.is_file():
return f"Error: {zip_path} is not a file."
dest = Path(dest_dir)
dest.mkdir(parents=True, exist_ok=True)
extracted = 0
skipped: list[str] = []
total_bytes = 0
pwd_bytes = password.encode("utf-8") if password else None
try:
with zipfile.ZipFile(zp, "r") as zf:
encrypted = _is_zip_encrypted(zf)
if encrypted and pwd_bytes is None:
return (
f"Error: {zip_path} is password-protected. "
f"Provide the password via case.yaml's "
f"meta.password on this source, or pass `password=` "
f"explicitly. Stdlib zipfile only supports the legacy "
f"ZipCrypto algorithm — AES-encrypted zips (created by "
f"7-Zip / WinZip) need an external tool like 7z."
)
for info in zf.infolist():
name = info.filename
# Block absolute paths and parent-escape attempts up front.
if name.startswith(("/", "\\")) or ".." in Path(name).parts:
skipped.append(f"escape: {name}")
continue
target = dest / name
if not _is_within(dest, target):
skipped.append(f"escape: {name}")
continue
# Symlink entries — skip rather than risk traversing out.
if info.external_attr >> 16 & 0o120000 == 0o120000:
skipped.append(f"symlink: {name}")
continue
if info.is_dir():
target.mkdir(parents=True, exist_ok=True)
continue
# Skip if already extracted with matching size (idempotent rerun).
if target.exists() and target.stat().st_size == info.file_size:
continue
target.parent.mkdir(parents=True, exist_ok=True)
try:
with zf.open(info, "r", pwd=pwd_bytes) as src, open(target, "wb") as out:
while True:
chunk = src.read(65536)
if not chunk:
break
out.write(chunk)
except RuntimeError as e:
# zipfile raises RuntimeError for bad-password / AES-encrypted.
msg = str(e)
if "Bad password" in msg or "password required" in msg:
return (
f"Error: bad or missing password for {zip_path}. "
f"If the zip is AES-encrypted (7-Zip/WinZip), stdlib "
f"cannot decrypt it — use `7z x -p<pwd> ...` "
f"externally and point the source path at the result."
)
raise
extracted += 1
total_bytes += info.file_size
except zipfile.BadZipFile as e:
return f"Error: {zip_path} is not a valid zip archive: {e}"
except Exception as e:
return f"Error extracting {zip_path}: {e}"
parts = [
f"Extracted {extracted} file(s), {total_bytes} bytes, into {dest}",
]
if skipped:
parts.append(f"Skipped {len(skipped)} unsafe entries:")
for s in skipped[:10]:
parts.append(f" - {s}")
if len(skipped) > 10:
parts.append(f" ... ({len(skipped) - 10} more)")
return "\n".join(parts)
async def unzip_archive(
    zip_path: str, dest_dir: str, password: str | None = None,
) -> str:
    """Extract *zip_path* into *dest_dir*. Idempotent on rerun.

    Defensive: rejects entries with absolute paths, leading '..', or that
    would resolve outside *dest_dir* (the classic zip-slip vector). Symlink
    entries are skipped (we never follow symlinks into the host filesystem).

    Password-protected zips need the password argument (or
    ``meta.password`` on the source in case.yaml) — stdlib ``zipfile``
    only handles the legacy ZipCrypto algorithm.

    The extraction itself is blocking file I/O, so it runs in a worker
    thread; other tasks on the event loop keep running while a large
    archive is unpacked.

    Returns:
        The summary (or ``Error: ...`` line) produced by ``_do_extract``.
    """
    import asyncio

    return await asyncio.to_thread(_do_extract, zip_path, dest_dir, password)
def unzip_archive_sync(
    zip_path: str, dest_dir: str, password: str | None = None,
) -> str:
    """Blocking twin of :func:`unzip_archive` for startup-time prepare_source.

    Delegates straight to ``_do_extract`` and returns its summary string, so
    callers that run before the event loop exists do not need to create one
    just to unpack an archive.
    """
    summary = _do_extract(zip_path, dest_dir, password)
    return summary

87
tools/media.py Normal file
View File

@@ -0,0 +1,87 @@
"""Media plugin — OCR for image evidence.
DESIGN.md §4.7: the model backend (DeepSeek) has no vision, so we MUST run
OCR locally for any image-bearing evidence. Tesseract via pytesseract is
the default; if the runtime is missing those packages, the tool returns a
clear install hint rather than failing silently.
"""
from __future__ import annotations
import logging
import os
from pathlib import Path
logger = logging.getLogger(__name__)
MAX_OUTPUT = 8000
_INSTALL_HINT = (
"Error: OCR runtime not available. Install with:\n"
" pip install pytesseract pillow\n"
" sudo apt install tesseract-ocr tesseract-ocr-chi-sim tesseract-ocr-chi-tra\n"
"(or the equivalent for your distribution). Then retry."
)
def _has_ocr_runtime() -> tuple[bool, str]:
    """Probe for the OCR stack and return ``(available, reason)``.

    Checks that ``pytesseract`` and ``PIL`` import, then that a
    ``tesseract`` executable is on PATH.  *reason* is the empty string when
    everything is present, otherwise a short description of what is missing.
    """
    import shutil

    try:
        import pytesseract  # noqa: F401
        from PIL import Image  # noqa: F401
    except ImportError as exc:
        return False, f"missing python package: {exc.name}"

    # The Python bindings are useless without the tesseract binary itself.
    if shutil.which("tesseract") is not None:
        return True, ""
    return False, "tesseract binary not on PATH"
async def ocr_image(file_path: str, lang: str = "eng+chi_sim+chi_tra") -> str:
    """Extract text from an image via tesseract.

    *lang* defaults to English + Simplified + Traditional Chinese, matching
    the multi-language artefacts the current case involves. Pass a single
    language code (e.g. ``"eng"``) to skip language packs that aren't
    installed.

    Returns:
        A one-line header (path, size, language, line count) followed by the
        recognised text, truncated so the whole result stays within
        ``MAX_OUTPUT`` characters plus a ``[truncated]`` marker.  All failures
        (missing file, missing OCR runtime, unreadable image, tesseract
        errors) are returned as ``Error...`` strings rather than raised.
    """
    p = Path(file_path)
    if not p.is_file():
        return f"Error: {file_path} is not a file."
    available, reason = _has_ocr_runtime()
    if not available:
        return f"{_INSTALL_HINT}\n[detail: {reason}]"

    import pytesseract
    from PIL import Image

    try:
        img = Image.open(p)
    except Exception as e:
        return f"Error: could not open image {file_path}: {e}"

    try:
        # Use the image as a context manager so its underlying file handle
        # is released as soon as OCR finishes (or fails).
        with img:
            text = pytesseract.image_to_string(img, lang=lang)
    except pytesseract.TesseractError as e:
        msg = str(e)
        if "Failed loading language" in msg or "Error opening data file" in msg:
            return (
                f"Error: tesseract is installed but missing language pack(s) for {lang!r}. "
                f"Install the language data (e.g. tesseract-ocr-chi-sim) or pass a "
                f"different `lang`. Detail: {msg}"
            )
        return f"Error running tesseract: {msg}"
    except Exception as e:
        return f"Error during OCR: {e}"

    size = p.stat().st_size
    header = (
        f"ocr: {file_path} ({size} bytes, lang={lang}, "
        f"{len(text.splitlines())} line(s))\n"
    )
    # Clamp at zero: with a very long header a negative budget would turn
    # the slice below into "everything except the last N characters".
    budget = max(0, MAX_OUTPUT - len(header))
    if len(text) > budget:
        body = text[:budget] + "\n[truncated]"
    else:
        body = text
    return header + body

160
tools/mobile_android.py Normal file
View File

@@ -0,0 +1,160 @@
"""Android plugin tools — partition survey + sector translation.
DESIGN.md §4.7 安卓: ``mmls`` partitions → per-partition image-mode source;
``fsstat`` per partition to classify ext4/F2FS/raw/encrypted. The shared TSK
toolchain already handles ext4/F2FS reads, so once the agent picks a partition
offset the standard list_directory / extract_file / search_strings tools work.
Quirk: Samsung dumps (e.g. ``blk0_sda.bin``) use 4096-byte image sectors but
TSK tool flags accept 512-byte sectors by default. ``probe_android_partitions``
emits BOTH unit systems so the agent can plug the right ``partition_offset``
value into ``set_active_partition``.
"""
from __future__ import annotations
import asyncio
import logging
import re
from pathlib import Path
logger = logging.getLogger(__name__)
MAX_OUTPUT = 8000
# Partitions worth flagging when we encounter them — informs the agent's
# strategy. Not exhaustive; just opinionated hints.
_PARTITION_HINTS: dict[str, str] = {
"EFS": "modem firmware area; often contains IMEI / MAC / serial",
"PARAM": "boot parameters; cmdline + flags",
"BOOT": "kernel + initramfs (raw image)",
"RECOVERY": "recovery image (raw)",
"SYSTEM": "Android /system — read-only OS partition (ext4)",
"CACHE": "downloaded OTA payloads; usually transient",
"USERDATA": "/data — user apps, dbs, accounts; FBE-encrypted on modern devices",
"PERSISTENT": "Samsung persistent partition; carrier/device flags",
"STEADY": "Samsung steady-state config",
"HIDDEN": "Samsung hidden partition; check before assuming empty",
"CP_DEBUG": "modem debug logs",
"TOMBSTONES": "userland crash dumps",
}
def _parse_mmls_with_unit(output: str) -> tuple[int, list[dict]]:
    """Turn raw ``mmls`` text into ``(sector_size_bytes, partitions)``.

    The sector size comes from the ``Units are in N-byte sectors`` banner and
    falls back to 512 when that banner is absent.  Each table row becomes a
    dict with ``slot``, ``start_native``, ``end_native``, ``length_native``
    (all in the image's own sector units) and ``description``.  Rows whose
    slot is ``Meta`` or starts with ``---`` are left out.
    """
    row_re = re.compile(r"\s*(\d{3}):\s+(\S+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(.*)")

    unit = re.search(r"Units are in (\d+)-byte sectors", output)
    sector_size = int(unit.group(1)) if unit else 512

    partitions: list[dict] = []
    for row in map(row_re.match, output.splitlines()):
        if row is None:
            continue
        slot, first, last, count, label = row.group(2, 3, 4, 5, 6)
        if slot == "Meta" or slot.startswith("---"):
            continue
        partitions.append({
            "slot": slot,
            "start_native": int(first),
            "end_native": int(last),
            "length_native": int(count),
            "description": label.strip(),
        })
    return sector_size, partitions
async def _run(cmd: list[str], timeout: int = 30) -> tuple[int, str, str]:
    """Run *cmd* without a shell and capture its output.

    Args:
        cmd: Program and arguments, passed straight to
            ``asyncio.create_subprocess_exec``.
        timeout: Seconds to wait for the process to finish.

    Returns:
        ``(returncode, stdout, stderr)`` with both streams decoded as UTF-8
        (undecodable bytes replaced).  Two synthetic results are used so
        callers only need to check for a non-zero return code:

        * ``(127, "", "<reason>")`` when the program cannot be started
          (e.g. the binary is not installed);
        * ``(124, "", "timeout after Ns")`` when the timeout expires.
    """
    try:
        proc = await asyncio.create_subprocess_exec(
            *cmd,
            stdout=asyncio.subprocess.PIPE,
            stderr=asyncio.subprocess.PIPE,
        )
    except OSError as e:
        # Missing or non-executable binary: report it like a failed command
        # instead of letting the exception escape the tool.
        return 127, "", f"could not start {cmd[0]}: {e}"
    try:
        stdout, stderr = await asyncio.wait_for(proc.communicate(), timeout=timeout)
    except asyncio.TimeoutError:
        try:
            proc.kill()
        except ProcessLookupError:
            # The process exited between the timeout and the kill.
            pass
        # Reap the child so it does not linger as a zombie.
        await proc.wait()
        return 124, "", f"timeout after {timeout}s"
    return proc.returncode or 0, stdout.decode("utf-8", "replace"), stderr.decode("utf-8", "replace")
_FS_TYPE_RE = re.compile(r"File System Type:\s*(\S+)", re.IGNORECASE)


async def _classify_partition(image_path: str, sector_offset_512: int) -> str:
    """Ask ``fsstat`` what file system lives at *sector_offset_512*.

    Returns the first token following ``File System Type:`` in fsstat's
    output, or ``'unknown'`` when fsstat exits non-zero or prints no such
    line.  An ``'unknown'`` result typically means a raw image
    (BOOT/RECOVERY/RADIO/…) or encrypted data (modern userdata under FBE).
    """
    argv = ["fsstat", "-o", str(sector_offset_512), image_path]
    rc, out, _err = await _run(argv, timeout=15)
    found = _FS_TYPE_RE.search(out) if rc == 0 else None
    return found.group(1) if found else "unknown"
async def probe_android_partitions(image_path: str) -> str:
    """Survey every partition on an Android disk dump and return a table.

    The agent reads this once to plan its work: which partitions are
    Ext4/F2FS (use TSK), which are raw (extract image / strings only),
    which are encrypted (skip until decrypted).

    Runs ``mmls`` on the image, then ``fsstat`` once per partition, and
    renders a Markdown table listing each partition's start in both the
    image's native sector units and 512-byte sectors.

    Returns:
        The table (truncated to ``MAX_OUTPUT`` characters), or an
        ``Error: ...`` / ``No partitions detected`` line.
    """
    p = Path(image_path)
    if not p.is_file():
        return f"Error: {image_path} is not a file."

    rc, out, err = await _run(["mmls", str(p)], timeout=30)
    if rc != 0:
        return f"Error: mmls failed (rc={rc}): {err.strip() or out.strip()}"

    sector_size, parts = _parse_mmls_with_unit(out)
    if not parts:
        return f"No partitions detected in {image_path}."

    lines = [
        f"Android partition survey: {image_path}",
        f"  mmls reports {sector_size}-byte sectors (TSK -o expects 512-byte sectors)",
        f"  {len(parts)} data partitions",
        "",
        "| slot | name | start (native) | start (512-sector) | size | fs_type | hint |",
        "|---|---|---:|---:|---|---|---|",
    ]
    # Compiled once: the same pattern is applied to every partition row.
    name_re = re.compile(r"[A-Z][A-Z0-9_]{2,}")
    for prt in parts:
        sector_512 = prt["start_native"] * sector_size // 512
        bytes_size = prt["length_native"] * sector_size
        # human-readable size
        if bytes_size >= 1 << 30:
            size_h = f"{bytes_size / (1 << 30):.1f} GB"
        elif bytes_size >= 1 << 20:
            size_h = f"{bytes_size / (1 << 20):.1f} MB"
        else:
            size_h = f"{bytes_size // 1024} KB"
        fs_type = await _classify_partition(str(p), sector_512)
        # Try to extract a friendly partition name from the description
        # (mmls description often includes the partition name uppercase).
        name_match = name_re.search(prt["description"])
        pname = name_match.group(0) if name_match else prt["description"][:20]
        # The hint table is keyed in upper case; normalise the lookup key so
        # lower-case names such as "userdata" still get their hint.
        hint = _PARTITION_HINTS.get(pname.strip().upper(), "")
        lines.append(
            f"| {prt['slot']} | {pname} | {prt['start_native']} | "
            f"{sector_512} | {size_h} | {fs_type} | {hint} |"
        )

    body = "\n".join(lines)
    if len(body) > MAX_OUTPUT:
        body = body[:MAX_OUTPUT] + "\n\n[truncated]"
    return body

274
tools/mobile_ios.py Normal file
View File

@@ -0,0 +1,274 @@
"""iOS extraction parsers — plist / sqlite / keychain / iDevice info.
DESIGN.md §4.7 iOS plugin tools. All tree-mode, path-based — no Sleuth
Kit, no graph dependency. Stdlib + sqlite3 only.
iOS extractions typically arrive as a zip containing domain-rooted trees
(HomeDomain, AppDomain, etc.) with a flat ``iDevice_info.txt`` summary,
binary/XML plists, and several SQLite databases (sms.db, AddressBook,
keychain-2.db, app-specific stores like WhatsApp's ChatStorage.sqlite).
"""
from __future__ import annotations
import asyncio
import json
import logging
import os
import plistlib
import re
import sqlite3
from pathlib import Path
logger = logging.getLogger(__name__)
# Output cap (chars) — keeps a single tool result under the LLM context budget.
MAX_OUTPUT = 8000


def _trunc(text: str, limit: int = MAX_OUTPUT) -> str:
    """Clip *text* to *limit* characters, appending a note with the full length.

    Text that already fits is returned unchanged.
    """
    total = len(text)
    if total > limit:
        return text[:limit] + f"\n\n[Output truncated: {total} chars total]"
    return text
# ---------------------------------------------------------------------------
# plist
# ---------------------------------------------------------------------------
def _to_jsonable(obj):
    """Recursively convert a parsed plist value into JSON-friendly data.

    * ``bytes`` of up to 64 bytes become ``{"_bytes_hex": ...}``; longer
      blobs become a 64-byte hex preview plus their total length.
    * ``datetime.datetime`` values become ISO-8601 strings.
    * dicts get string keys; lists and tuples become lists.
    * anything else is returned untouched.
    """
    from datetime import datetime as _datetime

    if isinstance(obj, dict):
        return {str(key): _to_jsonable(val) for key, val in obj.items()}
    if isinstance(obj, (list, tuple)):
        return [_to_jsonable(item) for item in obj]
    if isinstance(obj, _datetime):
        return obj.isoformat()
    if isinstance(obj, bytes):
        if len(obj) > 64:
            return {"_bytes_hex_preview": obj[:64].hex(), "_total_bytes": len(obj)}
        return {"_bytes_hex": obj.hex()}
    return obj
async def parse_plist(file_path: str) -> str:
    """Load a property list (XML or binary) and render it as indented JSON.

    ``plistlib.load`` is given the open file and handles both encodings.
    The result is a one-line header with the path and byte size followed by
    the JSON text, clipped by ``_trunc``.  A missing file or a parse failure
    is reported as an ``Error...`` string.
    """
    plist_file = Path(file_path)
    if not plist_file.is_file():
        return f"Error: {file_path} is not a file."

    try:
        with plist_file.open("rb") as handle:
            parsed = plistlib.load(handle)
    except plistlib.InvalidFileException as exc:
        return f"Error: {file_path} is not a valid plist ({exc})"
    except Exception as exc:
        return f"Error parsing plist {file_path}: {exc}"

    body = json.dumps(
        _to_jsonable(parsed), ensure_ascii=False, indent=2, default=str,
    )
    return f"plist: {file_path} ({plist_file.stat().st_size} bytes)\n" + _trunc(body)
# ---------------------------------------------------------------------------
# sqlite
# ---------------------------------------------------------------------------
_SELECT_RE = re.compile(r"^\s*SELECT\b", re.IGNORECASE)
async def sqlite_tables(db_path: str) -> str:
    """List user tables in a sqlite file with row counts and column names.

    The database is opened read-only through a ``file:`` URI.  The URI is
    built with ``Path.as_uri()`` so that characters with special meaning in
    URIs (``#``, ``?``, ``%``) inside the path are percent-encoded instead
    of being misread as a fragment or query string.

    Returns:
        One line per table (``name: N row(s); cols: ...``) clipped by
        ``_trunc``, or an ``Error...`` / ``No user tables`` line.  Database
        errors are reported in the return value rather than raised.
    """
    p = Path(db_path)
    if not p.is_file():
        return f"Error: {db_path} is not a file."
    uri = p.resolve().as_uri() + "?mode=ro"
    try:
        conn = sqlite3.connect(uri, uri=True)
    except sqlite3.OperationalError as e:
        return f"Error opening {db_path} (read-only): {e}"
    try:
        cur = conn.cursor()
        try:
            cur.execute(
                "SELECT name FROM sqlite_master "
                "WHERE type='table' AND name NOT LIKE 'sqlite_%' ORDER BY name"
            )
            tables = [r[0] for r in cur.fetchall()]
        except sqlite3.DatabaseError as e:
            # sqlite only validates the file on first read, so "file is not
            # a database" shows up here rather than at connect time.
            return f"Error reading {db_path}: {e}"
        if not tables:
            return f"No user tables in {db_path}."
        lines = [f"sqlite: {db_path} ({len(tables)} tables)"]
        for name in tables:
            # Table names come from the evidence file; double any embedded
            # quote so the identifier stays a single quoted token.
            quoted = '"' + name.replace('"', '""') + '"'
            try:
                cur.execute(f"SELECT COUNT(*) FROM {quoted}")
                count = cur.fetchone()[0]
            except sqlite3.DatabaseError as e:
                count = f"(count failed: {e})"
            try:
                cur.execute(f"PRAGMA table_info({quoted})")
                cols = [r[1] for r in cur.fetchall()]
            except sqlite3.DatabaseError:
                cols = []
            lines.append(f"  {name}: {count} row(s); cols: {', '.join(cols)}")
        return _trunc("\n".join(lines))
    finally:
        conn.close()
async def sqlite_query(
    db_path: str,
    query: str,
    max_rows: int = 100,
) -> str:
    """Run a single read-only SELECT against a sqlite file.

    Multi-statement queries and anything other than a SELECT are rejected
    (we open the database in read-only mode anyway, so writes would fail
    too — but the explicit check keeps the agent honest).

    Surrounding whitespace and trailing semicolons are ignored, so
    ``"SELECT 1;\\n"`` is accepted.  Any other ``;`` — including one inside a
    string literal — is treated as a statement separator and rejected.

    Args:
        db_path: Path to the sqlite file.
        query: The SELECT statement to run.
        max_rows: Maximum number of rows to return; values below 1 are
            raised to 1.

    Returns:
        Column names plus up to *max_rows* rows (``bytes`` values shown as
        hex) clipped by ``_trunc``, or an ``Error...`` line.
    """
    # Normalise first so a trailing newline after ';' is not mistaken for a
    # second statement.
    statement = query.strip().rstrip("; \t\r\n")
    if not _SELECT_RE.match(statement):
        return "Error: only single SELECT statements are allowed."
    if ";" in statement:
        return "Error: multi-statement queries are not allowed."
    try:
        row_cap = max(1, int(max_rows))
    except (TypeError, ValueError):
        return f"Error: max_rows must be an integer, got {max_rows!r}."
    p = Path(db_path)
    if not p.is_file():
        return f"Error: {db_path} is not a file."
    # as_uri() percent-encodes '#', '?' and '%' so they are not parsed as
    # URI syntax by sqlite.
    uri = p.resolve().as_uri() + "?mode=ro"
    try:
        conn = sqlite3.connect(uri, uri=True)
    except sqlite3.OperationalError as e:
        return f"Error opening {db_path} (read-only): {e}"
    try:
        cur = conn.cursor()
        try:
            cur.execute(statement)
            cols = [d[0] for d in cur.description] if cur.description else []
            # Rows are produced lazily, so a damaged page can still raise
            # while fetching.
            rows = cur.fetchmany(row_cap)
        except sqlite3.DatabaseError as e:
            return f"Error executing query: {e}"
        lines = [
            f"sqlite query: {db_path}",
            f"columns: {cols}",
            f"rows ({len(rows)}, capped at {row_cap}):",
        ]
        for row in rows:
            rendered = [
                (v.hex() if isinstance(v, bytes) else str(v))
                for v in row
            ]
            lines.append("  " + " | ".join(rendered))
        return _trunc("\n".join(lines))
    finally:
        conn.close()
# ---------------------------------------------------------------------------
# iOS keychain (keychain-2.db)
# ---------------------------------------------------------------------------
# Standard iOS keychain tables. genp = generic passwords, inet = internet
# passwords, cert = certificates, keys = key material. Forensic extractions
# of locked keychains have ``data`` columns NULL but accounting metadata
# (agrp, acct, svce) intact — already useful for attribution work.
_KEYCHAIN_TABLES = ("genp", "inet", "cert", "keys")


async def parse_ios_keychain(keychain_root: str) -> str:
    """Locate and summarize iOS keychain entries under *keychain_root*.

    *keychain_root* may be a path to ``keychain-2.db`` directly or to a
    directory that contains it (e.g. ``.../var/keychains``).  When the
    directory does not hold the file itself, the whole tree below it is
    searched and the first ``keychain-2.db`` found is used.

    Returns:
        For each recognised table: its row count followed by up to 30 rows
        of accounting columns (``bytes`` values shown as hex), clipped by
        ``_trunc``.  Problems (no database found, cannot open, unreadable
        database) are reported as a single-line message rather than raised.
    """
    root = Path(keychain_root)
    db: Path | None = None
    if root.is_file() and root.name == "keychain-2.db":
        db = root
    elif root.is_dir():
        candidate = root / "keychain-2.db"
        if candidate.is_file():
            db = candidate
        else:
            # Recursive search at any depth; the first hit wins.
            db = next(iter(root.rglob("keychain-2.db")), None)
    if db is None:
        return f"No keychain-2.db found under {keychain_root}."

    # as_uri() percent-encodes '#', '?' and '%' so sqlite does not read them
    # as URI syntax.
    uri = db.resolve().as_uri() + "?mode=ro"
    try:
        conn = sqlite3.connect(uri, uri=True)
    except sqlite3.OperationalError as e:
        return f"Error opening {db}: {e}"
    try:
        cur = conn.cursor()
        cur.execute(
            "SELECT name FROM sqlite_master "
            "WHERE type='table' AND name IN ({})".format(
                ",".join("?" * len(_KEYCHAIN_TABLES))
            ),
            _KEYCHAIN_TABLES,
        )
        present = [r[0] for r in cur.fetchall()]
        if not present:
            return f"keychain-2.db at {db} has no recognised tables."
        lines = [f"keychain: {db}"]
        for name in present:
            cur.execute(f"SELECT COUNT(*) FROM \"{name}\"")
            count = cur.fetchone()[0]
            lines.append(f"\n[{name}] {count} row(s)")
            cur.execute(f"PRAGMA table_info(\"{name}\")")
            cols = [r[1] for r in cur.fetchall()]
            # Pick a useful subset of accounting columns when present.
            preferred = [
                c for c in ("agrp", "acct", "svce", "labl", "desc", "atyp", "srvr")
                if c in cols
            ]
            if not preferred:
                preferred = cols[:5]
            # Column names may come from the evidence file; double embedded
            # quotes so each stays a single quoted identifier.
            sel = ", ".join('"' + c.replace('"', '""') + '"' for c in preferred)
            cur.execute(f"SELECT {sel} FROM \"{name}\" LIMIT 30")
            for row in cur.fetchall():
                lines.append("  " + " | ".join(
                    (v.hex() if isinstance(v, bytes) else str(v))
                    for v in row
                ))
        return _trunc("\n".join(lines))
    except sqlite3.DatabaseError as e:
        # Corrupt, encrypted or non-sqlite files only fail once queried.
        return f"Error reading keychain {db}: {e}"
    finally:
        conn.close()
# ---------------------------------------------------------------------------
# iDevice_info.txt
# ---------------------------------------------------------------------------
async def read_idevice_info(file_path: str, max_chars: int = 6000) -> str:
    """Read the standard iDevice_info.txt summary at the root of an iOS extraction.

    The file is a flat ``Key: value`` dump from libimobiledevice / native
    extraction tools. We surface the first *max_chars* of content verbatim
    — the agent can search/extract specific keys via search_text_file if
    the head isn't enough.

    Args:
        file_path: The file itself, or a directory that directly contains
            ``iDevice_info.txt``.
        max_chars: Maximum number of characters of content to return;
            negative values are treated as 0.

    Returns:
        A header line (resolved path and byte size) followed by the content.
        A ``[Truncated: ...]`` note is appended only when the file really
        holds more than *max_chars* characters.  Failures are returned as
        ``Error...`` strings.
    """
    p = Path(file_path)
    if p.is_dir():
        # Be helpful: if the agent passed the extraction root, find the file.
        candidate = p / "iDevice_info.txt"
        if candidate.is_file():
            p = candidate
    if not p.is_file():
        return f"Error: {file_path} is not a file."
    try:
        limit = max(0, int(max_chars))
        with open(p, "r", encoding="utf-8", errors="replace") as f:
            # Read one character past the limit: that tells us whether more
            # content exists.  Comparing the byte size against a character
            # count would misreport multi-byte text as truncated.
            content = f.read(limit + 1)
        size = p.stat().st_size
        header = f"iDevice_info: {p} ({size} bytes)\n"
        if len(content) > limit:
            content = content[:limit]
            content += f"\n\n[Truncated: file is {size} bytes, showing first {max_chars}]"
        return header + content
    except Exception as e:
        return f"Error reading {file_path}: {e}"

View File

@@ -215,20 +215,178 @@ async def parse_prefetch(file_path: str) -> str:
return f"[Error parsing Prefetch: {e}]"
async def list_extracted_dir(dir_path: str, max_entries: int = 200) -> str:
    """Smart summary of a (potentially huge) extracted tree.

    Earlier versions dumped up to 200 random entries then truncated — that
    leaves the agent blind on 10k+-file iOS extractions. The new layout
    returns a compact summary that scales: total counts, extension
    breakdown, top-level directories with their sizes, and the largest
    files. For targeted lookups (e.g. find every ``*.sqlite`` under the
    tree) the agent should use ``find_files`` instead.

    Args:
        dir_path: Root of the extracted tree.
        max_entries: Accepted but not consulted by the summary layout; the
            section sizes are fixed (15 directories, 15 extensions, 10
            largest files).

    Returns:
        The multi-line summary, or a bracketed ``[Error ...]`` line.  Files
        whose size cannot be read are left out of all counts.
    """
    if not os.path.isdir(dir_path):
        return f"[Error: {dir_path} is not a directory]"

    def _human(n: float) -> str:
        """Render a byte count with a B/KB/MB/GB/TB suffix."""
        for unit in ("B", "KB", "MB", "GB"):
            if n < 1024:
                return f"{n:.1f}{unit}" if unit != "B" else f"{n}B"
            n /= 1024
        return f"{n:.1f}TB"

    try:
        total_files = 0
        total_bytes = 0
        ext_counts: dict[str, int] = {}
        ext_bytes: dict[str, int] = {}
        top_level_dirs: dict[str, dict] = {}
        biggest: list[tuple[int, str]] = []  # (size, relpath)

        dir_path_abs = os.path.abspath(dir_path)
        for root, dirs, files in os.walk(dir_path_abs):
            # Track top-level directory aggregates (cheap; no per-entry cost
            # beyond the walk we're already doing).
            rel_root = os.path.relpath(root, dir_path_abs)
            if rel_root == ".":
                # Register every immediate child so empty directories still
                # show up in the layout section.
                top_level_dirs.update(
                    {d: {"files": 0, "bytes": 0} for d in dirs}
                )
                top_key = None
            else:
                top_key = rel_root.split(os.sep, 1)[0]
                if top_key not in top_level_dirs:
                    top_level_dirs[top_key] = {"files": 0, "bytes": 0}

            for f in files:
                full = os.path.join(root, f)
                try:
                    size = os.path.getsize(full)
                except OSError:
                    continue
                total_files += 1
                total_bytes += size
                ext = os.path.splitext(f)[1].lower() or "(no ext)"
                ext_counts[ext] = ext_counts.get(ext, 0) + 1
                ext_bytes[ext] = ext_bytes.get(ext, 0) + size
                if top_key is not None:
                    top_level_dirs[top_key]["files"] += 1
                    top_level_dirs[top_key]["bytes"] += size
                # Maintain a top-10 largest list cheaply (bounded insertion).
                if len(biggest) < 10:
                    biggest.append((size, os.path.relpath(full, dir_path_abs)))
                    biggest.sort(reverse=True)
                elif size > biggest[-1][0]:
                    biggest[-1] = (size, os.path.relpath(full, dir_path_abs))
                    biggest.sort(reverse=True)

        lines = [
            f"Directory: {dir_path}",
            f"  Total: {total_files} file(s), {_human(total_bytes)}",
        ]

        # Top-level directory layout (immediate children, sorted by file count).
        if top_level_dirs:
            lines.append(f"\nTop-level layout ({len(top_level_dirs)} dirs at root):")
            sorted_tlds = sorted(
                top_level_dirs.items(), key=lambda kv: -kv[1]["files"],
            )[:15]
            for d, stats in sorted_tlds:
                lines.append(
                    f"  {d}/  ({stats['files']} files, {_human(stats['bytes'])})"
                )
            if len(top_level_dirs) > 15:
                lines.append(f"  ... ({len(top_level_dirs) - 15} more top-level dirs)")

        # Extension breakdown.
        if ext_counts:
            lines.append("\nExtension breakdown (top 15):")
            for ext, count in sorted(ext_counts.items(), key=lambda kv: -kv[1])[:15]:
                lines.append(
                    f"  {ext}: {count} files, {_human(ext_bytes.get(ext, 0))}"
                )

        # Largest files (often the highest-value forensic targets).
        if biggest:
            lines.append("\nLargest files:")
            for size, rel in biggest:
                lines.append(f"  {rel}  ({_human(size)})")

        lines.append(
            "\nNext step: call find_files with a pattern like "
            "'**/*.plist' or '**/keychain-2.db' to locate specific artefacts."
        )
        return "\n".join(lines)
    except Exception as e:
        return f"[Error listing {dir_path}: {e}]"
async def find_files(
    root: str,
    pattern: str,
    max_results: int = 500,
) -> str:
    """Recursively find files under *root* whose path matches *pattern*.

    Uses fnmatch-style globs against the *full relative path*; ``**`` is
    treated as "any number of path segments" — including none, so
    ``**/sms.db`` also finds an ``sms.db`` sitting directly in *root*.
    A pattern is also tried against the bare file name. Examples:

      - ``**/sms.db``              — iOS SMS database
      - ``**/keychain-2.db``       — iOS keychain
      - ``**/ChatStorage.sqlite``  — WhatsApp app store
      - ``HomeDomain/Library/**``  — anchor at a known iOS domain root
      - ``**/*.plist``             — every plist no matter how deep

    Brace alternation such as ``**/*.{plist,sqlite,db}`` is NOT supported;
    issue one call per extension instead.

    Results are sorted by size descending — the biggest hits usually
    matter most. Capped at *max_results* to keep the LLM context bounded;
    the walk itself stops after ``max_results * 4`` hits, so on very large
    trees the reported set may not contain the globally largest files.

    Returns:
        A header, a match count (flagged ``(truncated)`` when capped) and
        one ``relative/path (N bytes)`` line per hit, or a bracketed
        ``[Error ...]`` line.
    """
    import fnmatch
    import re

    if not os.path.isdir(root):
        return f"[Error: {root} is not a directory]"
    root_abs = os.path.abspath(root)

    # Convert ``**`` (any-depth) to fnmatch's ``*`` (any chars including /).
    # fnmatch doesn't natively distinguish segment vs path; expanding ``**``
    # to ``*`` and letting fnmatch match the full relpath is good enough for
    # forensic lookups.
    fn_pattern = pattern.replace("**", "*")
    # ``*/name`` needs at least one directory, so on its own it would miss a
    # match at depth zero.  Add a variant with every leading-of-segment
    # ``**/`` removed to cover "zero directories".
    collapsed = re.sub(r"(?:^|(?<=/))\*\*/", "", pattern).replace("**", "*")
    rel_patterns = [fn_pattern]
    if collapsed != fn_pattern:
        rel_patterns.append(collapsed)

    hits: list[tuple[int, str]] = []
    truncated = False
    try:
        for dirpath, _dirs, files in os.walk(root_abs):
            for f in files:
                full = os.path.join(dirpath, f)
                rel = os.path.relpath(full, root_abs)
                matched = fnmatch.fnmatch(f, fn_pattern) or any(
                    fnmatch.fnmatch(rel, pat) for pat in rel_patterns
                )
                if not matched:
                    continue
                try:
                    size = os.path.getsize(full)
                except OSError:
                    size = 0
                hits.append((size, rel))
                if len(hits) >= max_results * 4:
                    # Hard upper bound to keep the walk cheap on huge trees.
                    truncated = True
                    break
            if truncated:
                break
    except Exception as e:
        return f"[Error searching {root}: {e}]"

    hits.sort(reverse=True)
    if len(hits) > max_results:
        truncated = True
        hits = hits[:max_results]

    lines = [
        f"find_files: pattern={pattern!r} under {root}",
        f"  matches: {len(hits)}" + (" (truncated)" if truncated else ""),
    ]
    if not hits:
        lines.append("  (no matches)")
    else:
        lines.extend(f"  {rel}  ({size} bytes)" for size, rel in hits)
    return "\n".join(lines)