Files
MASForensic/tools/archive.py
BattleTag 81ade8f7ac feat(refit): complete S1-S6 — case abstraction, grounding, log-odds, plugins, coref, multi-source
Consolidates the long-running refit work (DESIGN.md as authoritative spec)
into a single baseline commit. Six stages landed together:

  S1  Case + EvidenceSource abstraction; tools parameterised by source_id
      (case.py, main.py multi-source bootstrap, .bin extension support)
  S2  Grounding gateway in add_phenomenon: verified_facts cite real
      ToolInvocation ids; substring / normalised match enforced; agent +
      task scope checked. Phenomenon.description split into verified_facts
      (grounded) + interpretation (free text). [invocation: inv-xxx]
      prefix on every wrapped tool result so the LLM can cite.
  S3  Confidence as additive log-odds: edge_type → log10(LR) calibration
      table; commutative updates; supported / refuted thresholds derived
      from log_odds; hypothesis × evidence matrix view.
  S4  iOS plugin: unzip_archive + parse_plist / sqlite_tables /
      sqlite_query / parse_ios_keychain / read_idevice_info;
      IOSArtifactAgent; SOURCE_TYPE_AGENTS routing.
  S5  Cross-source entity resolution: typed identifiers on Entity,
      observe_identity gateway, auto coref hypothesis with shared /
      conflicting strong/weak LR edges, reversible same_as edges,
      actor_clusters() view.
  S6  Android partition probe + AndroidArtifactAgent; MediaAgent with
      OCR fallback; orchestrator Phase 1 iterates every analysable
      source; platform-aware get_triage_agent_type; ReportAgent renders
      actor clusters + per-source breakdown.

142 unit tests / 1 skipped — full coverage of the new gateway, log-odds
math, coref hypothesis fall-out, and orchestrator multi-source dispatch.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-21 02:12:10 -10:00

157 lines
5.9 KiB
Python

"""Archive extraction tools — generic unzip for tree-mode evidence sources.
Mobile extractions (iOS / Android backups), archive sources, and shared
work products all arrive as .zip files. The forensic agents work on the
unpacked tree; this module is the single entry point for safely turning
an archive into a directory.
Stdlib-only. No graph dependency.
"""
from __future__ import annotations

import asyncio
import logging
import os
import zipfile
from pathlib import Path
logger = logging.getLogger(__name__)
def _is_within(base: Path, target: Path) -> bool:
    """Report whether *target* lies at or below *base* once both are resolved.

    Both paths are passed through ``Path.resolve()`` first, so ``..``
    components and symlinks are collapsed before the comparison. If either
    path cannot be resolved (``OSError``) the answer is ``False``.
    """
    try:
        root, candidate = base.resolve(), target.resolve()
    except OSError:
        return False
    # Containment means: the very same path, or *root* is one of its ancestors.
    return candidate == root or root in candidate.parents
def _is_zip_encrypted(zf: zipfile.ZipFile) -> bool:
    """Report whether at least one entry of *zf* carries the 'encrypted' flag.

    Bit 0 of an entry's general-purpose ``flag_bits`` marks it as encrypted;
    the scan stops at the first entry that has it set.
    """
    for entry in zf.infolist():
        if entry.flag_bits & 0x1:
            return True
    return False
def _write_entry(
    zf: zipfile.ZipFile,
    info: zipfile.ZipInfo,
    target: Path,
    pwd_bytes: bytes | None,
) -> None:
    """Stream one archive entry into *target*, removing *target* if the copy fails."""
    with zf.open(info, "r", pwd=pwd_bytes) as src:
        try:
            with open(target, "wb") as out:
                for chunk in iter(lambda: src.read(65536), b""):
                    out.write(chunk)
        except BaseException:
            # The copy died part-way (CRC mismatch, disk full, interrupt...).
            # Whatever was written is truncated; drop it so nothing downstream
            # can mistake it for a genuine extracted file.
            try:
                target.unlink()
            except OSError:
                pass
            raise


def _do_extract(
    zip_path: str,
    dest_dir: str,
    password: str | None = None,
) -> str:
    """Shared core for unzip_archive (async) and unzip_archive_sync.

    Pure stdlib + filesystem I/O — no asyncio. Extracts *zip_path* into
    *dest_dir* (created if missing) and returns a multi-line summary the
    agent can read directly. Failures are reported as a string starting with
    ``"Error"`` rather than raised.

    Per-entry rules:

    * Names that are absolute, contain a ``..`` component, or resolve outside
      *dest_dir* are skipped and listed as ``escape`` (zip-slip protection).
    * Entries whose Unix mode marks them as symlinks are skipped and listed
      as ``symlink``.
    * A file whose target already exists at the matching size is left
      untouched (idempotent rerun) and counted in the "Already present" line.
    * If writing an entry fails part-way, the partially written target is
      removed before the error is reported.

    *password* is encoded as UTF-8; an empty string is treated like ``None``.
    When any entry is flagged encrypted and no password is available, nothing
    is extracted and an explanatory error is returned.
    """
    archive = Path(zip_path)
    if not archive.is_file():
        return f"Error: {zip_path} is not a file."

    dest = Path(dest_dir)
    dest.mkdir(parents=True, exist_ok=True)

    extracted = 0
    already_present = 0
    total_bytes = 0
    skipped: list[str] = []
    pwd_bytes = password.encode("utf-8") if password else None

    try:
        with zipfile.ZipFile(archive, "r") as zf:
            if pwd_bytes is None and _is_zip_encrypted(zf):
                return (
                    f"Error: {zip_path} is password-protected. "
                    f"Provide the password via case.yaml's "
                    f"meta.password on this source, or pass `password=` "
                    f"explicitly. Stdlib zipfile only supports the legacy "
                    f"ZipCrypto algorithm — AES-encrypted zips (created by "
                    f"7-Zip / WinZip) need an external tool like 7z."
                )

            for info in zf.infolist():
                name = info.filename

                # Block absolute paths and parent-escape attempts up front.
                if name.startswith(("/", "\\")) or ".." in Path(name).parts:
                    skipped.append(f"escape: {name}")
                    continue
                target = dest / name
                if not _is_within(dest, target):
                    skipped.append(f"escape: {name}")
                    continue

                # Symlink entries — skip rather than risk traversing out.
                # The high 16 bits of external_attr hold the Unix st_mode;
                # isolate the file-type field (S_IFMT) and compare it to
                # S_IFLNK exactly so no other type nibble matches.
                unix_mode = info.external_attr >> 16
                if (unix_mode & 0o170000) == 0o120000:
                    skipped.append(f"symlink: {name}")
                    continue

                if info.is_dir():
                    target.mkdir(parents=True, exist_ok=True)
                    continue

                # Skip if already extracted with matching size (idempotent rerun).
                if target.exists() and target.stat().st_size == info.file_size:
                    already_present += 1
                    continue

                target.parent.mkdir(parents=True, exist_ok=True)
                try:
                    _write_entry(zf, info, target, pwd_bytes)
                except RuntimeError as e:
                    # zipfile raises RuntimeError for bad-password / AES-encrypted.
                    msg = str(e)
                    if "Bad password" in msg or "password required" in msg:
                        return (
                            f"Error: bad or missing password for {zip_path}. "
                            f"If the zip is AES-encrypted (7-Zip/WinZip), stdlib "
                            f"cannot decrypt it — use `7z x -p<pwd> ...` "
                            f"externally and point the source path at the result."
                        )
                    raise
                extracted += 1
                total_bytes += info.file_size
    except zipfile.BadZipFile as e:
        return f"Error: {zip_path} is not a valid zip archive: {e}"
    except Exception as e:
        # Tool boundary: the caller expects a readable string, not a traceback.
        return f"Error extracting {zip_path}: {e}"

    parts = [
        f"Extracted {extracted} file(s), {total_bytes} bytes, into {dest}",
    ]
    if already_present:
        # Without this line a rerun reads as "0 files", i.e. an empty archive.
        parts.append(f"Already present (unchanged): {already_present} file(s)")
    if skipped:
        parts.append(f"Skipped {len(skipped)} unsafe entries:")
        parts.extend(f" - {s}" for s in skipped[:10])
        if len(skipped) > 10:
            parts.append(f" ... ({len(skipped) - 10} more)")
    return "\n".join(parts)
async def unzip_archive(
    zip_path: str, dest_dir: str, password: str | None = None,
) -> str:
    """Extract *zip_path* into *dest_dir*. Idempotent on rerun.

    Defensive: rejects entries with absolute paths, leading '..', or that
    would resolve outside *dest_dir* (the classic zip-slip vector). Symlink
    entries are skipped (we never follow symlinks into the host filesystem).

    Password-protected zips need the password argument (or
    ``meta.password`` on the source in case.yaml) — stdlib ``zipfile``
    only handles the legacy ZipCrypto algorithm.

    The extraction itself is blocking file I/O, so it is handed to the event
    loop's default executor instead of running on the loop thread; other
    coroutines keep making progress while a large archive unpacks. Concurrent
    extractions into the same *dest_dir* are not coordinated with each other.

    Returns the same multi-line summary (or ``"Error..."`` string) as
    ``_do_extract``.
    """
    loop = asyncio.get_running_loop()
    return await loop.run_in_executor(
        None, _do_extract, zip_path, dest_dir, password,
    )
def unzip_archive_sync(
    zip_path: str, dest_dir: str, password: str | None = None,
) -> str:
    """Blocking counterpart of :func:`unzip_archive`.

    Meant for startup-time prepare_source, which runs before the event loop
    exists — calling this avoids spinning a loop up just to unpack a zip.
    The arguments and the returned summary string are exactly those of the
    shared extraction core this delegates to.
    """
    summary = _do_extract(
        zip_path=zip_path,
        dest_dir=dest_dir,
        password=password,
    )
    return summary