feat(refit): complete S1-S6 — case abstraction, grounding, log-odds, plugins, coref, multi-source
Consolidates the long-running refit work (DESIGN.md as authoritative spec)
into a single baseline commit. Six stages landed together:
S1 Case + EvidenceSource abstraction; tools parameterised by source_id
(case.py, main.py multi-source bootstrap, .bin extension support)
S2 Grounding gateway in add_phenomenon: verified_facts cite real
ToolInvocation ids; substring / normalised match enforced; agent +
task scope checked. Phenomenon.description split into verified_facts
(grounded) + interpretation (free text). [invocation: inv-xxx]
prefix on every wrapped tool result so the LLM can cite.
S3 Confidence as additive log-odds: edge_type → log10(LR) calibration
table; commutative updates; supported / refuted thresholds derived
from log_odds; hypothesis × evidence matrix view.
S4 iOS plugin: unzip_archive + parse_plist / sqlite_tables /
sqlite_query / parse_ios_keychain / read_idevice_info;
IOSArtifactAgent; SOURCE_TYPE_AGENTS routing.
S5 Cross-source entity resolution: typed identifiers on Entity,
observe_identity gateway, auto coref hypothesis with shared /
conflicting strong/weak LR edges, reversible same_as edges,
actor_clusters() view.
S6 Android partition probe + AndroidArtifactAgent; MediaAgent with
OCR fallback; orchestrator Phase 1 iterates every analysable
source; platform-aware get_triage_agent_type; ReportAgent renders
actor clusters + per-source breakdown.
142 unit tests / 1 skipped — full coverage of the new gateway, log-odds
math, coref hypothesis fall-out, and orchestrator multi-source dispatch.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
156
tools/archive.py
Normal file
156
tools/archive.py
Normal file
@@ -0,0 +1,156 @@
|
||||
"""Archive extraction tools — generic unzip for tree-mode evidence sources.
|
||||
|
||||
Mobile extractions (iOS / Android backups), archive sources, and shared
|
||||
work products all arrive as .zip files. The forensic agents work on the
|
||||
unpacked tree; this module is the single entry point for safely turning
|
||||
an archive into a directory.
|
||||
|
||||
Stdlib-only. No graph dependency.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
import os
|
||||
import zipfile
|
||||
from pathlib import Path
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def _is_within(base: Path, target: Path) -> bool:
|
||||
"""True when *target* resolves to a path inside *base* — symlink-safe."""
|
||||
try:
|
||||
base_r = base.resolve()
|
||||
target_r = target.resolve()
|
||||
except OSError:
|
||||
return False
|
||||
try:
|
||||
target_r.relative_to(base_r)
|
||||
except ValueError:
|
||||
return False
|
||||
return True
|
||||
|
||||
|
||||
def _is_zip_encrypted(zf: zipfile.ZipFile) -> bool:
|
||||
"""True when any entry has the zip 'encrypted' flag bit set."""
|
||||
return any(info.flag_bits & 0x1 for info in zf.infolist())
|
||||
|
||||
|
||||
def _do_extract(
|
||||
zip_path: str,
|
||||
dest_dir: str,
|
||||
password: str | None = None,
|
||||
) -> str:
|
||||
"""Shared core for unzip_archive (async) and unzip_archive_sync.
|
||||
|
||||
Pure stdlib + filesystem I/O — no asyncio. Idempotent on rerun (files
|
||||
whose target already exists at the matching size are skipped). Returns
|
||||
a multi-line summary the agent can read directly.
|
||||
"""
|
||||
zp = Path(zip_path)
|
||||
if not zp.is_file():
|
||||
return f"Error: {zip_path} is not a file."
|
||||
|
||||
dest = Path(dest_dir)
|
||||
dest.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
extracted = 0
|
||||
skipped: list[str] = []
|
||||
total_bytes = 0
|
||||
pwd_bytes = password.encode("utf-8") if password else None
|
||||
|
||||
try:
|
||||
with zipfile.ZipFile(zp, "r") as zf:
|
||||
encrypted = _is_zip_encrypted(zf)
|
||||
if encrypted and pwd_bytes is None:
|
||||
return (
|
||||
f"Error: {zip_path} is password-protected. "
|
||||
f"Provide the password via case.yaml's "
|
||||
f"meta.password on this source, or pass `password=` "
|
||||
f"explicitly. Stdlib zipfile only supports the legacy "
|
||||
f"ZipCrypto algorithm — AES-encrypted zips (created by "
|
||||
f"7-Zip / WinZip) need an external tool like 7z."
|
||||
)
|
||||
for info in zf.infolist():
|
||||
name = info.filename
|
||||
# Block absolute paths and parent-escape attempts up front.
|
||||
if name.startswith(("/", "\\")) or ".." in Path(name).parts:
|
||||
skipped.append(f"escape: {name}")
|
||||
continue
|
||||
target = dest / name
|
||||
if not _is_within(dest, target):
|
||||
skipped.append(f"escape: {name}")
|
||||
continue
|
||||
# Symlink entries — skip rather than risk traversing out.
|
||||
if info.external_attr >> 16 & 0o120000 == 0o120000:
|
||||
skipped.append(f"symlink: {name}")
|
||||
continue
|
||||
if info.is_dir():
|
||||
target.mkdir(parents=True, exist_ok=True)
|
||||
continue
|
||||
# Skip if already extracted with matching size (idempotent rerun).
|
||||
if target.exists() and target.stat().st_size == info.file_size:
|
||||
continue
|
||||
target.parent.mkdir(parents=True, exist_ok=True)
|
||||
try:
|
||||
with zf.open(info, "r", pwd=pwd_bytes) as src, open(target, "wb") as out:
|
||||
while True:
|
||||
chunk = src.read(65536)
|
||||
if not chunk:
|
||||
break
|
||||
out.write(chunk)
|
||||
except RuntimeError as e:
|
||||
# zipfile raises RuntimeError for bad-password / AES-encrypted.
|
||||
msg = str(e)
|
||||
if "Bad password" in msg or "password required" in msg:
|
||||
return (
|
||||
f"Error: bad or missing password for {zip_path}. "
|
||||
f"If the zip is AES-encrypted (7-Zip/WinZip), stdlib "
|
||||
f"cannot decrypt it — use `7z x -p<pwd> ...` "
|
||||
f"externally and point the source path at the result."
|
||||
)
|
||||
raise
|
||||
extracted += 1
|
||||
total_bytes += info.file_size
|
||||
except zipfile.BadZipFile as e:
|
||||
return f"Error: {zip_path} is not a valid zip archive: {e}"
|
||||
except Exception as e:
|
||||
return f"Error extracting {zip_path}: {e}"
|
||||
|
||||
parts = [
|
||||
f"Extracted {extracted} file(s), {total_bytes} bytes, into {dest}",
|
||||
]
|
||||
if skipped:
|
||||
parts.append(f"Skipped {len(skipped)} unsafe entries:")
|
||||
for s in skipped[:10]:
|
||||
parts.append(f" - {s}")
|
||||
if len(skipped) > 10:
|
||||
parts.append(f" ... ({len(skipped) - 10} more)")
|
||||
return "\n".join(parts)
|
||||
|
||||
|
||||
async def unzip_archive(
    zip_path: str, dest_dir: str, password: str | None = None,
) -> str:
    """Unpack *zip_path* into *dest_dir* and return a text summary.

    Thin coroutine wrapper: all work is delegated to ``_do_extract``, which
    also implements the safety rules (absolute / ``..`` / out-of-tree
    entries and symlink entries are skipped) and the skip-if-same-size
    behaviour that makes reruns idempotent.

    Password-protected zips need the *password* argument (or
    ``meta.password`` on the source in case.yaml) — stdlib ``zipfile``
    only handles the legacy ZipCrypto algorithm.
    """
    summary = _do_extract(zip_path, dest_dir, password=password)
    return summary
|
||||
|
||||
|
||||
def unzip_archive_sync(
    zip_path: str, dest_dir: str, password: str | None = None,
) -> str:
    """Blocking twin of :func:`unzip_archive` for startup-time prepare_source.

    Delegates to the same ``_do_extract`` core and returns the same summary
    string; intended for use before the event loop starts, so no loop has
    to be spun up just to unpack a zip.
    """
    summary = _do_extract(zip_path, dest_dir, password=password)
    return summary
|
||||
87
tools/media.py
Normal file
87
tools/media.py
Normal file
@@ -0,0 +1,87 @@
|
||||
"""Media plugin — OCR for image evidence.
|
||||
|
||||
DESIGN.md §4.7: the model backend (DeepSeek) has no vision, so we MUST run
|
||||
OCR locally for any image-bearing evidence. Tesseract via pytesseract is
|
||||
the default; if the runtime is missing those packages, the tool returns a
|
||||
clear install hint rather than failing silently.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
import os
|
||||
from pathlib import Path
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
MAX_OUTPUT = 8000
|
||||
|
||||
_INSTALL_HINT = (
|
||||
"Error: OCR runtime not available. Install with:\n"
|
||||
" pip install pytesseract pillow\n"
|
||||
" sudo apt install tesseract-ocr tesseract-ocr-chi-sim tesseract-ocr-chi-tra\n"
|
||||
"(or the equivalent for your distribution). Then retry."
|
||||
)
|
||||
|
||||
|
||||
def _has_ocr_runtime() -> tuple[bool, str]:
|
||||
"""Return (available, reason). reason is empty when available."""
|
||||
try:
|
||||
import pytesseract # noqa: F401
|
||||
from PIL import Image # noqa: F401
|
||||
except ImportError as e:
|
||||
return False, f"missing python package: {e.name}"
|
||||
# Check the tesseract binary too.
|
||||
import shutil
|
||||
if shutil.which("tesseract") is None:
|
||||
return False, "tesseract binary not on PATH"
|
||||
return True, ""
|
||||
|
||||
|
||||
async def ocr_image(file_path: str, lang: str = "eng+chi_sim+chi_tra") -> str:
    """Extract text from an image via tesseract.

    *lang* defaults to English + Simplified + Traditional Chinese, matching
    the multi-language artefacts the current case involves. Pass a single
    language code (e.g. ``"eng"``) to skip language packs that aren't
    installed.

    Returns:
        A one-line header (path, size, language, line count) followed by
        the recognised text, cut so header + text stay within
        ``MAX_OUTPUT`` characters (a ``[truncated]`` marker is appended
        when cut). Every failure is returned as an ``Error...`` string
        rather than raised.
    """
    p = Path(file_path)
    if not p.is_file():
        return f"Error: {file_path} is not a file."
    available, reason = _has_ocr_runtime()
    if not available:
        return f"{_INSTALL_HINT}\n[detail: {reason}]"

    import pytesseract
    from PIL import Image

    try:
        img = Image.open(p)
    except Exception as e:
        return f"Error: could not open image {file_path}: {e}"

    try:
        # Use the image as a context manager so its file handle is released
        # even when tesseract fails.
        with img:
            text = pytesseract.image_to_string(img, lang=lang)
    except pytesseract.TesseractError as e:
        msg = str(e)
        if "Failed loading language" in msg or "Error opening data file" in msg:
            return (
                f"Error: tesseract is installed but missing language pack(s) for {lang!r}. "
                f"Install the language data (e.g. tesseract-ocr-chi-sim) or pass a "
                f"different `lang`. Detail: {msg}"
            )
        return f"Error running tesseract: {msg}"
    except Exception as e:
        return f"Error during OCR: {e}"

    size = p.stat().st_size
    header = (
        f"ocr: {file_path} ({size} bytes, lang={lang}, "
        f"{len(text.splitlines())} line(s))\n"
    )
    # The header counts against the output budget too.
    if len(text) > MAX_OUTPUT - len(header):
        body = text[:MAX_OUTPUT - len(header)] + "\n[truncated]"
    else:
        body = text
    return header + body
|
||||
160
tools/mobile_android.py
Normal file
160
tools/mobile_android.py
Normal file
@@ -0,0 +1,160 @@
|
||||
"""Android plugin tools — partition survey + sector translation.
|
||||
|
||||
DESIGN.md §4.7 安卓: ``mmls`` partitions → per-partition image-mode source;
|
||||
``fsstat`` per partition to classify ext4/F2FS/raw/encrypted. The shared TSK
|
||||
toolchain already handles ext4/F2FS reads, so once the agent picks a partition
|
||||
offset the standard list_directory / extract_file / search_strings tools work.
|
||||
|
||||
Quirk: Samsung dumps (e.g. ``blk0_sda.bin``) use 4096-byte image sectors but
|
||||
TSK tool flags accept 512-byte sectors by default. ``probe_android_partitions``
|
||||
emits BOTH unit systems so the agent can plug the right ``partition_offset``
|
||||
value into ``set_active_partition``.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
import logging
|
||||
import re
|
||||
from pathlib import Path
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
MAX_OUTPUT = 8000
|
||||
|
||||
# Partitions worth flagging when we encounter them — informs the agent's
|
||||
# strategy. Not exhaustive; just opinionated hints.
|
||||
_PARTITION_HINTS: dict[str, str] = {
|
||||
"EFS": "modem firmware area; often contains IMEI / MAC / serial",
|
||||
"PARAM": "boot parameters; cmdline + flags",
|
||||
"BOOT": "kernel + initramfs (raw image)",
|
||||
"RECOVERY": "recovery image (raw)",
|
||||
"SYSTEM": "Android /system — read-only OS partition (ext4)",
|
||||
"CACHE": "downloaded OTA payloads; usually transient",
|
||||
"USERDATA": "/data — user apps, dbs, accounts; FBE-encrypted on modern devices",
|
||||
"PERSISTENT": "Samsung persistent partition; carrier/device flags",
|
||||
"STEADY": "Samsung steady-state config",
|
||||
"HIDDEN": "Samsung hidden partition; check before assuming empty",
|
||||
"CP_DEBUG": "modem debug logs",
|
||||
"TOMBSTONES": "userland crash dumps",
|
||||
}
|
||||
|
||||
|
||||
def _parse_mmls_with_unit(output: str) -> tuple[int, list[dict]]:
|
||||
"""Parse mmls output, returning (sector_size_bytes, partitions).
|
||||
|
||||
mmls states ``Units are in N-byte sectors`` near the top; we extract N
|
||||
to translate between image-native units and the 512-byte units TSK
|
||||
tools accept via ``-o``.
|
||||
"""
|
||||
sector_size = 512
|
||||
m = re.search(r"Units are in (\d+)-byte sectors", output)
|
||||
if m:
|
||||
sector_size = int(m.group(1))
|
||||
|
||||
parts: list[dict] = []
|
||||
for line in output.splitlines():
|
||||
m = re.match(
|
||||
r"\s*(\d{3}):\s+(\S+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(.*)",
|
||||
line,
|
||||
)
|
||||
if not m:
|
||||
continue
|
||||
_row, slot, start, end, length, desc = m.groups()
|
||||
if slot == "Meta" or slot.startswith("---"):
|
||||
continue
|
||||
parts.append({
|
||||
"slot": slot,
|
||||
"start_native": int(start),
|
||||
"end_native": int(end),
|
||||
"length_native": int(length),
|
||||
"description": desc.strip(),
|
||||
})
|
||||
return sector_size, parts
|
||||
|
||||
|
||||
async def _run(cmd: list[str], timeout: int = 30) -> tuple[int, str, str]:
|
||||
proc = await asyncio.create_subprocess_exec(
|
||||
*cmd,
|
||||
stdout=asyncio.subprocess.PIPE,
|
||||
stderr=asyncio.subprocess.PIPE,
|
||||
)
|
||||
try:
|
||||
stdout, stderr = await asyncio.wait_for(proc.communicate(), timeout=timeout)
|
||||
except asyncio.TimeoutError:
|
||||
proc.kill()
|
||||
return 124, "", f"timeout after {timeout}s"
|
||||
return proc.returncode or 0, stdout.decode("utf-8", "replace"), stderr.decode("utf-8", "replace")
|
||||
|
||||
|
||||
_FS_TYPE_RE = re.compile(r"File System Type:\s*(\S+)", re.IGNORECASE)
|
||||
|
||||
|
||||
async def _classify_partition(image_path: str, sector_offset_512: int) -> str:
    """Ask ``fsstat`` what file system starts at *sector_offset_512*.

    Returns the token following ``File System Type:`` in fsstat's output
    (e.g. ``'Ext4'``), or ``'unknown'`` when fsstat exits non-zero or
    prints no such line. An ``'unknown'`` result typically means a raw
    image (BOOT/RECOVERY/RADIO/…) or encrypted data (modern userdata
    under FBE).
    """
    exit_code, report, _ = await _run(
        ["fsstat", "-o", str(sector_offset_512), image_path], timeout=15,
    )
    found = _FS_TYPE_RE.search(report) if exit_code == 0 else None
    return found.group(1) if found else "unknown"
|
||||
|
||||
|
||||
async def probe_android_partitions(image_path: str) -> str:
    """Build a one-shot partition table for an Android disk dump.

    Runs ``mmls`` on the image, then ``fsstat`` on each listed partition,
    and renders a Markdown table with the start offset in both the image's
    native sector units and 512-byte units, a human-readable size, the
    detected file-system type and a hint from ``_PARTITION_HINTS``.
    The agent reads this once to plan which partitions to open with TSK,
    which to treat as raw, and which to skip. The result is cut to
    ``MAX_OUTPUT`` characters; failures come back as text.
    """
    image = Path(image_path)
    if not image.is_file():
        return f"Error: {image_path} is not a file."

    rc, out, err = await _run(["mmls", str(image)], timeout=30)
    if rc != 0:
        return f"Error: mmls failed (rc={rc}): {err.strip() or out.strip()}"

    sector_size, parts = _parse_mmls_with_unit(out)
    if not parts:
        return f"No partitions detected in {image_path}."

    def _pretty_size(num_bytes: int) -> str:
        # GB / MB with one decimal; anything smaller as whole KB.
        if num_bytes >= 1 << 30:
            return f"{num_bytes / (1 << 30):.1f} GB"
        if num_bytes >= 1 << 20:
            return f"{num_bytes / (1 << 20):.1f} MB"
        return f"{num_bytes // 1024} KB"

    table = [
        f"Android partition survey: {image_path}",
        f" mmls reports {sector_size}-byte sectors (TSK -o expects 512-byte sectors)",
        f" {len(parts)} data partitions",
        "",
        "| slot | name | start (native) | start (512-sector) | size | fs_type | hint |",
        "|---|---|---:|---:|---|---|---|",
    ]
    for entry in parts:
        # Translate the image-native start sector into 512-byte units.
        offset_512 = entry["start_native"] * sector_size // 512
        size_label = _pretty_size(entry["length_native"] * sector_size)
        fs_type = await _classify_partition(str(image), offset_512)
        # Use the first upper-case token of the description as the friendly
        # name; fall back to the first 20 characters of the description.
        label = entry["description"]
        token = re.search(r"[A-Z][A-Z0-9_]{2,}", label)
        pname = token.group(0) if token else label[:20]
        hint = _PARTITION_HINTS.get(pname, "")
        table.append(
            f"| {entry['slot']} | {pname} | {entry['start_native']} | "
            f"{offset_512} | {size_label} | {fs_type} | {hint} |"
        )

    report = "\n".join(table)
    if len(report) <= MAX_OUTPUT:
        return report
    return report[:MAX_OUTPUT] + "\n\n[truncated]"
|
||||
274
tools/mobile_ios.py
Normal file
274
tools/mobile_ios.py
Normal file
@@ -0,0 +1,274 @@
|
||||
"""iOS extraction parsers — plist / sqlite / keychain / iDevice info.
|
||||
|
||||
DESIGN.md §4.7 iOS plugin tools. All tree-mode, path-based — no Sleuth
|
||||
Kit, no graph dependency. Stdlib + sqlite3 only.
|
||||
|
||||
iOS extractions typically arrive as a zip containing domain-rooted trees
|
||||
(HomeDomain, AppDomain, etc.) with a flat ``iDevice_info.txt`` summary,
|
||||
binary/XML plists, and several SQLite databases (sms.db, AddressBook,
|
||||
keychain-2.db, app-specific stores like WhatsApp's ChatStorage.sqlite).
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import plistlib
|
||||
import re
|
||||
import sqlite3
|
||||
from pathlib import Path
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Output cap (chars) — keeps a single tool result under the LLM context budget.
|
||||
MAX_OUTPUT = 8000
|
||||
|
||||
|
||||
def _trunc(text: str, limit: int = MAX_OUTPUT) -> str:
|
||||
if len(text) <= limit:
|
||||
return text
|
||||
return text[:limit] + f"\n\n[Output truncated: {len(text)} chars total]"
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# plist
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def _to_jsonable(obj):
|
||||
"""Make plist values JSON-serializable: bytes → hex preview, dates → iso."""
|
||||
import datetime
|
||||
if isinstance(obj, bytes):
|
||||
if len(obj) <= 64:
|
||||
return {"_bytes_hex": obj.hex()}
|
||||
return {"_bytes_hex_preview": obj[:64].hex(), "_total_bytes": len(obj)}
|
||||
if isinstance(obj, datetime.datetime):
|
||||
return obj.isoformat()
|
||||
if isinstance(obj, dict):
|
||||
return {str(k): _to_jsonable(v) for k, v in obj.items()}
|
||||
if isinstance(obj, (list, tuple)):
|
||||
return [_to_jsonable(v) for v in obj]
|
||||
return obj
|
||||
|
||||
|
||||
async def parse_plist(file_path: str) -> str:
    """Load a property list and render it as indented JSON text.

    ``plistlib.load`` accepts both the XML and the binary encoding, so no
    format sniffing is done here. The output is a one-line header (path
    and byte size) followed by the JSON, truncated by ``_trunc``. Problems
    are reported as ``Error...`` strings rather than raised.
    """
    plist_file = Path(file_path)
    if not plist_file.is_file():
        return f"Error: {file_path} is not a file."

    try:
        with plist_file.open("rb") as handle:
            parsed = plistlib.load(handle)
    except plistlib.InvalidFileException as exc:
        return f"Error: {file_path} is not a valid plist ({exc})"
    except Exception as exc:
        return f"Error parsing plist {file_path}: {exc}"

    # default=str is the last-resort renderer for anything _to_jsonable
    # passes through untouched.
    as_json = json.dumps(
        _to_jsonable(parsed), ensure_ascii=False, indent=2, default=str,
    )
    return f"plist: {file_path} ({plist_file.stat().st_size} bytes)\n" + _trunc(as_json)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# sqlite
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
_SELECT_RE = re.compile(r"^\s*SELECT\b", re.IGNORECASE)
|
||||
|
||||
|
||||
async def sqlite_tables(db_path: str) -> str:
    """List user tables in a sqlite file with row counts and column names.

    The database is opened read-only. Tables whose name starts with
    ``sqlite_`` are omitted. Returns one line per table, truncated by
    ``_trunc``; every failure (missing file, not a database, unreadable
    table) is reported as text rather than raised.
    """
    p = Path(db_path)
    if not p.is_file():
        return f"Error: {db_path} is not a file."
    # Build the URI with as_uri() so characters that are special in URIs
    # ('#', '?', '%') in evidence paths are percent-encoded instead of
    # being parsed as a fragment or query string.
    uri = f"{p.resolve().as_uri()}?mode=ro"
    try:
        conn = sqlite3.connect(uri, uri=True)
    except sqlite3.Error as e:
        return f"Error opening {db_path} (read-only): {e}"
    try:
        cur = conn.cursor()
        try:
            # sqlite3 opens lazily: a file that is not a database only
            # fails here, on the first real read.
            cur.execute(
                "SELECT name FROM sqlite_master "
                "WHERE type='table' AND name NOT LIKE 'sqlite_%' ORDER BY name"
            )
            tables = [r[0] for r in cur.fetchall()]
        except sqlite3.DatabaseError as e:
            return f"Error reading {db_path}: {e}"
        if not tables:
            return f"No user tables in {db_path}."
        lines = [f"sqlite: {db_path} ({len(tables)} tables)"]
        for name in tables:
            # Table names come from the evidence file; double any embedded
            # quote so the identifier cannot break out of its quoting.
            quoted = '"' + name.replace('"', '""') + '"'
            try:
                cur.execute(f"SELECT COUNT(*) FROM {quoted}")
                count = cur.fetchone()[0]
            except sqlite3.DatabaseError as e:
                count = f"(count failed: {e})"
            try:
                cur.execute(f"PRAGMA table_info({quoted})")
                cols = [r[1] for r in cur.fetchall()]
            except sqlite3.DatabaseError:
                cols = []
            lines.append(f" {name}: {count} row(s); cols: {', '.join(cols)}")
        return _trunc("\n".join(lines))
    finally:
        conn.close()
|
||||
|
||||
|
||||
async def sqlite_query(
    db_path: str,
    query: str,
    max_rows: int = 100,
) -> str:
    """Run a single read-only SELECT against a sqlite file.

    Multi-statement queries and anything other than a SELECT are rejected
    (we open the database in read-only mode anyway, so writes would fail
    too — but the explicit check keeps the agent honest).

    Args:
        db_path: Path to the sqlite file.
        query: One SELECT statement; a single trailing ``;`` (optionally
            followed by whitespace) is tolerated.
        max_rows: Maximum rows to return; values below 1 are raised to 1.

    Returns:
        Column names and the rendered rows (bytes values shown as hex),
        truncated by ``_trunc``, or an ``Error...`` string.
    """
    if not _SELECT_RE.match(query):
        return "Error: only single SELECT statements are allowed."
    # Strip surrounding whitespace first so "SELECT 1;\n" is not mistaken
    # for a multi-statement query.
    if ";" in query.strip().rstrip(";"):
        return "Error: multi-statement queries are not allowed."

    p = Path(db_path)
    if not p.is_file():
        return f"Error: {db_path} is not a file."
    row_cap = max(1, int(max_rows))
    # as_uri() percent-encodes '#', '?' and '%' so such paths are not
    # misread as a URI fragment or query string.
    uri = f"{p.resolve().as_uri()}?mode=ro"
    try:
        conn = sqlite3.connect(uri, uri=True)
    except sqlite3.Error as e:
        return f"Error opening {db_path} (read-only): {e}"

    try:
        cur = conn.cursor()
        try:
            cur.execute(query)
            rows = cur.fetchmany(row_cap)
        except sqlite3.DatabaseError as e:
            return f"Error executing query: {e}"
        cols = [d[0] for d in cur.description] if cur.description else []
        lines = [
            f"sqlite query: {db_path}",
            f"columns: {cols}",
            # Report the cap that was actually applied.
            f"rows ({len(rows)}, capped at {row_cap}):",
        ]
        for row in rows:
            rendered = [
                (v.hex() if isinstance(v, bytes) else str(v))
                for v in row
            ]
            lines.append(" " + " | ".join(rendered))
        return _trunc("\n".join(lines))
    finally:
        conn.close()
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# iOS keychain (keychain-2.db)
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
# Standard iOS keychain tables. genp = generic passwords, inet = internet
|
||||
# passwords, cert = certificates, keys = key material. Forensic extractions
|
||||
# of locked keychains have ``data`` columns NULL but accounting metadata
|
||||
# (agrp, acct, svce) intact — already useful for attribution work.
|
||||
_KEYCHAIN_TABLES = ("genp", "inet", "cert", "keys")
|
||||
|
||||
|
||||
async def parse_ios_keychain(keychain_root: str) -> str:
    """Locate and summarize iOS keychain entries under *keychain_root*.

    *keychain_root* may be a path to ``keychain-2.db`` directly or to a
    directory that contains it (e.g. ``.../var/keychains``); a directory
    is searched recursively when the file is not directly inside it.

    Returns:
        For each table named in ``_KEYCHAIN_TABLES`` that exists: its row
        count and up to 30 rows of accounting columns (bytes shown as hex),
        truncated by ``_trunc``. Failures are reported as text; a table
        that cannot be read is noted and the remaining tables are still
        listed.
    """
    root = Path(keychain_root)
    db: Path | None = None
    if root.is_file() and root.name == "keychain-2.db":
        db = root
    elif root.is_dir():
        candidate = root / "keychain-2.db"
        if candidate.is_file():
            db = candidate
        else:
            # Fall back to a recursive search; take the first hit.
            db = next(root.rglob("keychain-2.db"), None)
    if db is None:
        return f"No keychain-2.db found under {keychain_root}."

    # as_uri() percent-encodes '#', '?' and '%' so such paths are not
    # misread as a URI fragment or query string.
    uri = f"{db.resolve().as_uri()}?mode=ro"
    try:
        conn = sqlite3.connect(uri, uri=True)
    except sqlite3.Error as e:
        return f"Error opening {db}: {e}"

    try:
        cur = conn.cursor()
        try:
            # sqlite3 opens lazily, so a corrupt or non-database file only
            # fails on this first read.
            cur.execute(
                "SELECT name FROM sqlite_master "
                "WHERE type='table' AND name IN ({})".format(
                    ",".join("?" * len(_KEYCHAIN_TABLES))
                ),
                _KEYCHAIN_TABLES,
            )
            present = [r[0] for r in cur.fetchall()]
        except sqlite3.DatabaseError as e:
            return f"Error reading {db}: {e}"
        if not present:
            return f"keychain-2.db at {db} has no recognised tables."

        lines = [f"keychain: {db}"]
        for name in present:
            try:
                cur.execute(f"SELECT COUNT(*) FROM \"{name}\"")
                count = cur.fetchone()[0]
                lines.append(f"\n[{name}] {count} row(s)")
                cur.execute(f"PRAGMA table_info(\"{name}\")")
                cols = [r[1] for r in cur.fetchall()]
                # Pick a useful subset of accounting columns when present.
                preferred = [
                    c for c in ("agrp", "acct", "svce", "labl", "desc", "atyp", "srvr")
                    if c in cols
                ]
                if not preferred:
                    preferred = cols[:5]
                # Column names come from the evidence file; double any
                # embedded quote so the identifier stays quoted.
                sel = ", ".join('"' + c.replace('"', '""') + '"' for c in preferred)
                cur.execute(f"SELECT {sel} FROM \"{name}\" LIMIT 30")
                for row in cur.fetchall():
                    lines.append(" " + " | ".join(
                        (v.hex() if isinstance(v, bytes) else str(v))
                        for v in row
                    ))
            except sqlite3.DatabaseError as e:
                # Keep what was already gathered; note the failure and
                # carry on with the next table.
                lines.append(f"\n[{name}] (query failed: {e})")
        return _trunc("\n".join(lines))
    finally:
        conn.close()
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# iDevice_info.txt
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
async def read_idevice_info(file_path: str, max_chars: int = 6000) -> str:
    """Read the standard iDevice_info.txt summary at the root of an iOS extraction.

    The file is a flat ``Key: value`` dump from libimobiledevice / native
    extraction tools. We surface the first *max_chars* characters verbatim
    — the agent can search/extract specific keys via search_text_file if
    the head isn't enough.

    Args:
        file_path: The file itself, or a directory that directly contains
            ``iDevice_info.txt``.
        max_chars: Number of characters (not bytes) to return.

    Returns:
        A header line with the resolved path and byte size, then the
        content; a ``[Truncated: ...]`` note is appended only when the file
        really holds more than *max_chars* characters. Failures are
        returned as ``Error...`` strings.
    """
    p = Path(file_path)
    if p.is_dir():
        # Be helpful: if the agent passed the extraction root, find the file.
        candidate = p / "iDevice_info.txt"
        if candidate.is_file():
            p = candidate
    if not p.is_file():
        return f"Error: {file_path} is not a file."
    try:
        with open(p, "r", encoding="utf-8", errors="replace") as f:
            content = f.read(max_chars)
            # Probe one more character: the byte size cannot tell us whether
            # text was cut, because multi-byte characters make
            # bytes > characters.
            has_more = bool(f.read(1))
        size = p.stat().st_size
        header = f"iDevice_info: {p} ({size} bytes)\n"
        if has_more:
            content += (
                f"\n\n[Truncated: file is {size} bytes, "
                f"showing first {max_chars} chars]"
            )
        return header + content
    except Exception as e:
        return f"Error reading {file_path}: {e}"
|
||||
180
tools/parsers.py
180
tools/parsers.py
@@ -215,20 +215,178 @@ async def parse_prefetch(file_path: str) -> str:
|
||||
return f"[Error parsing Prefetch: {e}]"
|
||||
|
||||
|
||||
async def list_extracted_dir(dir_path: str, max_entries: int = 200) -> str:
    """Smart summary of a (potentially huge) extracted tree.

    Earlier versions dumped up to 200 entries then truncated — that leaves
    the agent blind on 10k+-file iOS extractions. This layout returns a
    compact summary that scales: total counts, top-level directories with
    their sizes, an extension breakdown, and the largest files. For
    targeted lookups (e.g. find every ``*.sqlite`` under the tree) the
    agent should use ``find_files`` instead.

    Args:
        dir_path: Root of the extracted tree.
        max_entries: Accepted for backward compatibility with the old
            per-file listing; the summary layout does not use it.

    Returns:
        The multi-line summary, or a bracketed ``[Error ...]`` string.
    """
    if not os.path.isdir(dir_path):
        return f"[Error: {dir_path} is not a directory]"

    try:
        total_files = 0
        total_bytes = 0
        ext_counts: dict[str, int] = {}
        ext_bytes: dict[str, int] = {}
        top_level_dirs: dict[str, dict] = {}
        biggest: list[tuple[int, str]] = []  # (size, relpath), largest first

        dir_path_abs = os.path.abspath(dir_path)
        for root, dirs, files in os.walk(dir_path_abs):
            # Track top-level directory aggregates (cheap; no per-entry cost
            # beyond the walk we're already doing).
            rel_root = os.path.relpath(root, dir_path_abs)
            if rel_root == ".":
                # Register every immediate child so empty ones still show up.
                top_level_dirs.update({d: {"files": 0, "bytes": 0} for d in dirs})
                top_key = None
            else:
                top_key = rel_root.split(os.sep, 1)[0]
                if top_key not in top_level_dirs:
                    top_level_dirs[top_key] = {"files": 0, "bytes": 0}

            for f in files:
                full = os.path.join(root, f)
                try:
                    size = os.path.getsize(full)
                except OSError:
                    # Unreadable entry (e.g. dangling link): leave it out.
                    continue
                total_files += 1
                total_bytes += size
                ext = os.path.splitext(f)[1].lower() or "(no ext)"
                ext_counts[ext] = ext_counts.get(ext, 0) + 1
                ext_bytes[ext] = ext_bytes.get(ext, 0) + size
                if top_key is not None:
                    top_level_dirs[top_key]["files"] += 1
                    top_level_dirs[top_key]["bytes"] += size
                # Maintain a top-10 largest list cheaply (bounded insertion).
                if len(biggest) < 10:
                    biggest.append((size, os.path.relpath(full, dir_path_abs)))
                    biggest.sort(reverse=True)
                elif size > biggest[-1][0]:
                    biggest[-1] = (size, os.path.relpath(full, dir_path_abs))
                    biggest.sort(reverse=True)

        def _human(n: int) -> str:
            for unit in ("B", "KB", "MB", "GB"):
                if n < 1024:
                    return f"{n:.1f}{unit}" if unit != "B" else f"{n}B"
                n /= 1024
            return f"{n:.1f}TB"

        lines = [
            f"Directory: {dir_path}",
            f" Total: {total_files} file(s), {_human(total_bytes)}",
        ]

        # Top-level directory layout (immediate children, sorted by file count).
        if top_level_dirs:
            lines.append(f"\nTop-level layout ({len(top_level_dirs)} dirs at root):")
            sorted_tlds = sorted(
                top_level_dirs.items(), key=lambda kv: -kv[1]["files"],
            )[:15]
            for d, stats in sorted_tlds:
                lines.append(
                    f" {d}/ ({stats['files']} files, {_human(stats['bytes'])})"
                )
            if len(top_level_dirs) > 15:
                lines.append(f" ... ({len(top_level_dirs) - 15} more top-level dirs)")

        # Extension breakdown.
        if ext_counts:
            lines.append(f"\nExtension breakdown (top 15):")
            for ext, count in sorted(ext_counts.items(), key=lambda kv: -kv[1])[:15]:
                lines.append(
                    f" {ext}: {count} files, {_human(ext_bytes.get(ext, 0))}"
                )

        # Largest files (often the highest-value forensic targets).
        if biggest:
            lines.append("\nLargest files:")
            for size, rel in biggest:
                lines.append(f" {rel} ({_human(size)})")

        lines.append(
            f"\nNext step: call find_files with a pattern like "
            f"'**/*.plist' or '**/keychain-2.db' to locate specific artefacts."
        )

        return "\n".join(lines)
    except Exception as e:
        return f"[Error listing {dir_path}: {e}]"
|
||||
|
||||
|
||||
async def find_files(
    root: str,
    pattern: str,
    max_results: int = 500,
) -> str:
    """Recursively find files under *root* whose path matches *pattern*.

    Uses fnmatch-style globs against the *full relative path* (and, as a
    convenience, against the bare file name). ``**/`` means "any number of
    path segments, including none", so ``**/*.plist`` finds every plist no
    matter how deep — including ones directly under *root*. Note that a
    plain ``*`` also crosses ``/`` here, because fnmatch does not treat the
    separator specially. Examples:

    - ``**/sms.db`` — iOS SMS database
    - ``**/keychain-2.db`` — iOS keychain
    - ``**/ChatStorage.sqlite`` — WhatsApp app store
    - ``HomeDomain/Library/**`` — anchor at a known iOS domain root

    Brace alternatives such as ``*.{plist,sqlite,db}`` are NOT expanded;
    issue one call per extension instead.

    Results are sorted by size descending — the biggest hits usually
    matter most. Capped at *max_results* to keep the LLM context bounded;
    the walk itself stops after ``max_results * 4`` hits.
    """
    import fnmatch

    if not os.path.isdir(root):
        return f"[Error: {root} is not a directory]"

    root_abs = os.path.abspath(root)
    # Convert ``**`` (any-depth) to fnmatch's ``*`` (any chars including /).
    fn_pattern = pattern.replace("**", "*")

    # ``**/x`` rewritten to ``*/x`` demands at least one directory, so a
    # file sitting directly under *root* would be missed. Generate the
    # variants where each ``**/`` matches zero segments as well.
    segments = pattern.split("**/")
    variants = {fn_pattern}
    if len(segments) <= 7:  # at most 2**6 variants; beyond that keep the plain form
        expanded = {segments[0]}
        for seg in segments[1:]:
            expanded = {head + glue + seg for head in expanded for glue in ("*/", "")}
        variants.update(v.replace("**", "*") for v in expanded)
    rel_patterns = sorted(variants)

    hits: list[tuple[int, str]] = []
    truncated = False
    try:
        for dirpath, _dirs, files in os.walk(root_abs):
            for f in files:
                full = os.path.join(dirpath, f)
                rel = os.path.relpath(full, root_abs)
                matched = fnmatch.fnmatch(f, fn_pattern) or any(
                    fnmatch.fnmatch(rel, candidate) for candidate in rel_patterns
                )
                if not matched:
                    continue
                try:
                    size = os.path.getsize(full)
                except OSError:
                    size = 0
                hits.append((size, rel))
                if len(hits) >= max_results * 4:
                    # Hard upper bound to keep the walk cheap on huge trees.
                    truncated = True
                    break
            if truncated:
                break
    except Exception as e:
        return f"[Error searching {root}: {e}]"

    hits.sort(reverse=True)
    if len(hits) > max_results:
        truncated = True
        hits = hits[:max_results]

    lines = [
        f"find_files: pattern={pattern!r} under {root}",
        f" matches: {len(hits)}" + (" (truncated)" if truncated else ""),
    ]
    if not hits:
        lines.append(" (no matches)")
    else:
        for size, rel in hits:
            lines.append(f" {rel} ({size} bytes)")
    return "\n".join(lines)
|
||||
|
||||
Reference in New Issue
Block a user