Consolidates the long-running refit work (DESIGN.md as authoritative spec)
into a single baseline commit. Six stages landed together:
S1 Case + EvidenceSource abstraction; tools parameterised by source_id
(case.py, main.py multi-source bootstrap, .bin extension support)
S2 Grounding gateway in add_phenomenon: verified_facts cite real
ToolInvocation ids; substring / normalised match enforced; agent +
task scope checked. Phenomenon.description split into verified_facts
(grounded) + interpretation (free text). [invocation: inv-xxx]
prefix on every wrapped tool result so the LLM can cite.
S3 Confidence as additive log-odds: edge_type → log10(LR) calibration
table; commutative updates; supported / refuted thresholds derived
from log_odds; hypothesis × evidence matrix view.
S4 iOS plugin: unzip_archive + parse_plist / sqlite_tables /
sqlite_query / parse_ios_keychain / read_idevice_info;
IOSArtifactAgent; SOURCE_TYPE_AGENTS routing.
S5 Cross-source entity resolution: typed identifiers on Entity,
observe_identity gateway, auto coref hypothesis with shared /
conflicting strong/weak LR edges, reversible same_as edges,
actor_clusters() view.
S6 Android partition probe + AndroidArtifactAgent; MediaAgent with
OCR fallback; orchestrator Phase 1 iterates every analysable
source; platform-aware get_triage_agent_type; ReportAgent renders
actor clusters + per-source breakdown.
142 unit tests / 1 skipped — full coverage of the new gateway, log-odds
math, coref hypothesis fall-out, and orchestrator multi-source dispatch.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
157 lines
5.9 KiB
Python
157 lines
5.9 KiB
Python
"""Archive extraction tools — generic unzip for tree-mode evidence sources.

Mobile extractions (iOS / Android backups), archive sources, and shared
work products all arrive as .zip files. The forensic agents work on the
unpacked tree; this module is the single entry point for safely turning
an archive into a directory.

Stdlib-only. No graph dependency.
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import logging
|
|
import os
|
|
import zipfile
|
|
from pathlib import Path
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
def _is_within(base: Path, target: Path) -> bool:
    """Report whether *target* lies at or beneath *base* once both are resolved.

    Both paths go through ``Path.resolve()`` before the comparison, so
    symlinks that already exist on disk are followed first and ``..``
    components are collapsed.

    Args:
        base: Directory that is supposed to contain *target*.
        target: Candidate path to check.

    Returns:
        ``True`` when the resolved *target* equals the resolved *base* or is
        located below it; ``False`` otherwise, including when resolving
        either path raises ``OSError``.
    """
    try:
        root = base.resolve()
        candidate = target.resolve()
    except OSError:
        contained = False
    else:
        # relative_to() raises ValueError exactly when candidate is not
        # located under root.
        try:
            candidate.relative_to(root)
        except ValueError:
            contained = False
        else:
            contained = True
    return contained
|
|
|
|
|
|
def _is_zip_encrypted(zf: zipfile.ZipFile) -> bool:
    """Tell whether at least one member of *zf* is flagged as encrypted.

    Args:
        zf: An already opened archive.

    Returns:
        ``True`` as soon as one entry has the lowest bit of ``flag_bits``
        (the zip 'encrypted' flag) set; ``False`` when no entry does, which
        includes an archive with no entries.
    """
    for member in zf.infolist():
        if member.flag_bits & 0x1:
            return True
    return False
|
|
|
|
|
|
def _copy_member(
    zf: zipfile.ZipFile,
    info: zipfile.ZipInfo,
    target: Path,
    pwd_bytes: bytes | None,
) -> None:
    """Stream one archive member to *target*, removing it if the copy fails.

    zipfile only reports a CRC-32 mismatch once the member's stream has been
    read to its end, i.e. after every byte has already been written. Without
    cleanup such a file would sit on disk at exactly the declared size and a
    later size-based "already extracted" check would accept it as good.

    Args:
        zf: The open archive.
        info: The member to extract.
        target: Destination file; its parent directory must already exist.
        pwd_bytes: Password for encrypted members, or ``None``.

    Raises:
        Whatever ``zf.open``, ``open`` or the read/write calls raise; the
        exception is propagated unchanged after cleanup.
    """
    with zf.open(info, "r", pwd=pwd_bytes) as src:
        # Opened before the guarded region on purpose: if opening the target
        # itself fails, nothing was written and nothing may be deleted.
        out = open(target, "wb")
        finished = False
        try:
            with out:
                while True:
                    chunk = src.read(65536)
                    if not chunk:
                        break
                    out.write(chunk)
            finished = True
        finally:
            if not finished:
                # Best-effort removal of the truncated / corrupt output; the
                # original exception is already propagating and must win.
                try:
                    os.unlink(target)
                except OSError:
                    pass


def _do_extract(
    zip_path: str,
    dest_dir: str,
    password: str | None = None,
) -> str:
    """Shared core for unzip_archive (async) and unzip_archive_sync.

    Pure stdlib + filesystem I/O — no asyncio. Idempotent on rerun (files
    whose target already exists at the matching size are skipped). A member
    whose copy fails part-way (bad CRC, decompression error, I/O error) is
    removed again, so the size-based rerun check cannot mistake a corrupt
    file for a finished one.

    Args:
        zip_path: Path of the archive to unpack.
        dest_dir: Directory to unpack into; created if missing.
        password: Optional archive password, encoded as UTF-8. An empty
            string is treated like ``None``.

    Returns:
        A multi-line summary the agent can read directly: the number of
        files and bytes written, followed by up to ten skipped unsafe
        entries. Every failure is reported as a string starting with
        ``"Error"``; no exception escapes this function.
    """
    zp = Path(zip_path)
    if not zp.is_file():
        return f"Error: {zip_path} is not a file."

    dest = Path(dest_dir)
    dest.mkdir(parents=True, exist_ok=True)

    extracted = 0
    skipped: list[str] = []
    total_bytes = 0
    pwd_bytes = password.encode("utf-8") if password else None

    try:
        with zipfile.ZipFile(zp, "r") as zf:
            encrypted = _is_zip_encrypted(zf)
            if encrypted and pwd_bytes is None:
                return (
                    f"Error: {zip_path} is password-protected. "
                    "Provide the password via case.yaml's "
                    "meta.password on this source, or pass `password=` "
                    "explicitly. Stdlib zipfile only supports the legacy "
                    "ZipCrypto algorithm — AES-encrypted zips (created by "
                    "7-Zip / WinZip) need an external tool like 7z."
                )
            for info in zf.infolist():
                name = info.filename
                # Block absolute paths and parent-escape attempts up front.
                if name.startswith(("/", "\\")) or ".." in Path(name).parts:
                    skipped.append(f"escape: {name}")
                    continue
                target = dest / name
                # Second line of defence: resolve against what is actually
                # on disk, so an existing symlink cannot redirect the write.
                if not _is_within(dest, target):
                    skipped.append(f"escape: {name}")
                    continue
                # Symlink entries — skip rather than risk traversing out.
                # The upper 16 bits of external_attr hold the Unix mode.
                if ((info.external_attr >> 16) & 0o120000) == 0o120000:
                    skipped.append(f"symlink: {name}")
                    continue
                if info.is_dir():
                    target.mkdir(parents=True, exist_ok=True)
                    continue
                # Skip if already extracted with matching size (idempotent rerun).
                if target.exists() and target.stat().st_size == info.file_size:
                    continue
                target.parent.mkdir(parents=True, exist_ok=True)
                try:
                    _copy_member(zf, info, target, pwd_bytes)
                except RuntimeError as e:
                    # zipfile raises RuntimeError for bad-password / AES-encrypted.
                    msg = str(e)
                    if "Bad password" in msg or "password required" in msg:
                        return (
                            f"Error: bad or missing password for {zip_path}. "
                            "If the zip is AES-encrypted (7-Zip/WinZip), stdlib "
                            "cannot decrypt it — use `7z x -p<pwd> ...` "
                            "externally and point the source path at the result."
                        )
                    raise
                extracted += 1
                total_bytes += info.file_size
    except zipfile.BadZipFile as e:
        return f"Error: {zip_path} is not a valid zip archive: {e}"
    except Exception as e:
        # Tool boundary: the caller expects a readable string, not a traceback.
        return f"Error extracting {zip_path}: {e}"

    parts = [
        f"Extracted {extracted} file(s), {total_bytes} bytes, into {dest}",
    ]
    if skipped:
        parts.append(f"Skipped {len(skipped)} unsafe entries:")
        parts.extend(f"  - {s}" for s in skipped[:10])
        if len(skipped) > 10:
            parts.append(f"  ... ({len(skipped) - 10} more)")
    return "\n".join(parts)
|
|
|
|
|
|
async def unzip_archive(
    zip_path: str, dest_dir: str, password: str | None = None,
) -> str:
    """Unpack the archive at *zip_path* under *dest_dir*; safe to run again.

    Coroutine front-end over ``_do_extract``. The extraction is a direct,
    non-awaited call, so this coroutine does not yield while unpacking.

    Safety rules applied to every entry: absolute paths, names containing a
    ``..`` component, and names that would resolve outside *dest_dir* (the
    classic zip-slip vector) are rejected; symlink entries are skipped so
    nothing is ever followed into the host filesystem.

    Encrypted archives require *password* (typically taken from
    ``meta.password`` on the source in case.yaml). Stdlib ``zipfile`` only
    handles the legacy ZipCrypto algorithm.

    Returns:
        The human-readable summary or ``"Error..."`` message produced by
        ``_do_extract``.
    """
    summary = _do_extract(zip_path=zip_path, dest_dir=dest_dir, password=password)
    return summary
|
|
|
|
|
|
def unzip_archive_sync(
    zip_path: str, dest_dir: str, password: str | None = None,
) -> str:
    """Blocking counterpart of :func:`unzip_archive` for startup-time prepare_source.

    Delegates to the same ``_do_extract`` core with the same arguments, so
    the result is identical. It exists so code that runs before the event
    loop starts can unpack a zip without having to spin a loop up.

    Returns:
        The human-readable summary or ``"Error..."`` message produced by
        ``_do_extract``.
    """
    summary = _do_extract(zip_path=zip_path, dest_dir=dest_dir, password=password)
    return summary
|