feat(refit): complete S1-S6 — case abstraction, grounding, log-odds, plugins, coref, multi-source
Consolidates the long-running refit work (DESIGN.md as authoritative spec)
into a single baseline commit. Six stages landed together:
S1 Case + EvidenceSource abstraction; tools parameterised by source_id
(case.py, main.py multi-source bootstrap, .bin extension support)
S2 Grounding gateway in add_phenomenon: verified_facts cite real
ToolInvocation ids; substring / normalised match enforced; agent +
task scope checked. Phenomenon.description split into verified_facts
(grounded) + interpretation (free text). [invocation: inv-xxx]
prefix on every wrapped tool result so the LLM can cite.
S3 Confidence as additive log-odds: edge_type → log10(LR) calibration
table; commutative updates; supported / refuted thresholds derived
from log_odds; hypothesis × evidence matrix view.
S4 iOS plugin: unzip_archive + parse_plist / sqlite_tables /
sqlite_query / parse_ios_keychain / read_idevice_info;
IOSArtifactAgent; SOURCE_TYPE_AGENTS routing.
S5 Cross-source entity resolution: typed identifiers on Entity,
observe_identity gateway, auto coref hypothesis with shared /
conflicting strong/weak LR edges, reversible same_as edges,
actor_clusters() view.
S6 Android partition probe + AndroidArtifactAgent; MediaAgent with
OCR fallback; orchestrator Phase 1 iterates every analysable
source; platform-aware get_triage_agent_type; ReportAgent renders
actor clusters + per-source breakdown.
142 unit tests / 1 skipped — full coverage of the new gateway, log-odds
math, coref hypothesis fall-out, and orchestrator multi-source dispatch.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
156
tools/archive.py
Normal file
156
tools/archive.py
Normal file
@@ -0,0 +1,156 @@
|
||||
"""Archive extraction tools — generic unzip for tree-mode evidence sources.
|
||||
|
||||
Mobile extractions (iOS / Android backups), archive sources, and shared
|
||||
work products all arrive as .zip files. The forensic agents work on the
|
||||
unpacked tree; this module is the single entry point for safely turning
|
||||
an archive into a directory.
|
||||
|
||||
Stdlib-only. No graph dependency.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
import os
|
||||
import zipfile
|
||||
from pathlib import Path
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def _is_within(base: Path, target: Path) -> bool:
    """Return True when *target* resolves to a location inside *base*.

    Both paths go through ``Path.resolve()`` first, so symlinks and ``..``
    components are collapsed before the containment test. *base* itself
    counts as inside.

    Any failure to resolve is treated as "not inside". That covers
    ``OSError`` and also ``RuntimeError``, which ``Path.resolve()`` raises
    for symlink loops on Python versions before 3.13 — without catching it
    a single looped path would abort the caller instead of being rejected.
    """
    try:
        base_r = base.resolve()
        target_r = target.resolve()
    except (OSError, RuntimeError):
        return False
    try:
        # relative_to() raises ValueError when target_r is not under base_r.
        target_r.relative_to(base_r)
    except ValueError:
        return False
    return True
|
||||
|
||||
|
||||
def _is_zip_encrypted(zf: zipfile.ZipFile) -> bool:
    """Report whether at least one member of *zf* is flagged as encrypted.

    Bit 0 of each entry's general-purpose flag field is tested; the scan
    stops at the first entry that has it set.
    """
    for member in zf.infolist():
        if member.flag_bits & 0x1:
            return True
    return False
|
||||
|
||||
|
||||
def _copy_member(
    zf: zipfile.ZipFile,
    info: zipfile.ZipInfo,
    target: Path,
    pwd_bytes: bytes | None,
) -> None:
    """Stream one archive member to *target*; delete the output if the copy fails."""
    with zf.open(info, "r", pwd=pwd_bytes) as src:
        try:
            with open(target, "wb") as out:
                for chunk in iter(lambda: src.read(65536), b""):
                    out.write(chunk)
        except BaseException:
            # A copy that dies mid-stream (CRC mismatch, disk full, interrupt)
            # must not leave a truncated file in the destination tree where
            # it could later be read as if it were a complete artefact.
            try:
                target.unlink()
            except OSError:
                pass
            raise


def _do_extract(
    zip_path: str,
    dest_dir: str,
    password: str | None = None,
) -> str:
    """Shared core for unzip_archive (async) and unzip_archive_sync.

    Pure stdlib + filesystem I/O — no asyncio. Idempotent on rerun: a member
    whose target already exists with the same size is left alone (and is not
    counted in the summary).

    Args:
        zip_path: Path of the archive to unpack.
        dest_dir: Directory to unpack into; created if missing.
        password: Optional password, encoded as UTF-8. An empty string is
            treated the same as ``None``.

    Returns:
        A multi-line, human-readable summary (file count, byte count,
        destination, and up to ten skipped entries), or a single
        ``"Error..."`` string when the archive is missing, invalid,
        password-protected without a usable password, or extraction fails.

    Safety rules applied to every member:
        * names that start with ``/`` or ``\\`` or contain a ``..`` component
          are skipped;
        * names whose resolved target falls outside *dest_dir* are skipped;
        * symlink entries are skipped rather than recreated;
        * a member whose copy fails part-way does not leave a truncated
          file behind.
    """
    zp = Path(zip_path)
    if not zp.is_file():
        return f"Error: {zip_path} is not a file."

    dest = Path(dest_dir)
    dest.mkdir(parents=True, exist_ok=True)

    extracted = 0
    skipped: list[str] = []
    total_bytes = 0
    pwd_bytes = password.encode("utf-8") if password else None

    try:
        with zipfile.ZipFile(zp, "r") as zf:
            if _is_zip_encrypted(zf) and pwd_bytes is None:
                return (
                    f"Error: {zip_path} is password-protected. "
                    f"Provide the password via case.yaml's "
                    f"meta.password on this source, or pass `password=` "
                    f"explicitly. Stdlib zipfile only supports the legacy "
                    f"ZipCrypto algorithm — AES-encrypted zips (created by "
                    f"7-Zip / WinZip) need an external tool like 7z."
                )
            for info in zf.infolist():
                name = info.filename
                # Block absolute paths and parent-escape attempts up front.
                if name.startswith(("/", "\\")) or ".." in Path(name).parts:
                    skipped.append(f"escape: {name}")
                    continue
                target = dest / name
                if not _is_within(dest, target):
                    skipped.append(f"escape: {name}")
                    continue
                # Symlink entries — skip rather than risk traversing out.
                # The upper 16 bits of external_attr are read as a Unix mode;
                # 0o120000 is the symlink file-type value.
                if ((info.external_attr >> 16) & 0o120000) == 0o120000:
                    skipped.append(f"symlink: {name}")
                    continue
                if info.is_dir():
                    target.mkdir(parents=True, exist_ok=True)
                    continue
                # Skip if already extracted with matching size (idempotent rerun).
                if target.exists() and target.stat().st_size == info.file_size:
                    continue
                target.parent.mkdir(parents=True, exist_ok=True)
                try:
                    _copy_member(zf, info, target, pwd_bytes)
                except NotImplementedError as e:
                    # NotImplementedError subclasses RuntimeError, so it has to
                    # be handled first. zipfile raises it for methods it cannot
                    # decode; on an encrypted member that most likely means
                    # AES, which the bad-password branch below never sees.
                    if info.flag_bits & 0x1:
                        return (
                            f"Error: {zip_path} is encrypted with a method "
                            f"stdlib zipfile cannot handle ({e}). "
                            f"If the zip is AES-encrypted (7-Zip/WinZip), "
                            f"use `7z x -p<pwd> ...` externally and point "
                            f"the source path at the result."
                        )
                    raise
                except RuntimeError as e:
                    # zipfile raises RuntimeError for a wrong or missing password.
                    msg = str(e)
                    if "Bad password" in msg or "password required" in msg:
                        return (
                            f"Error: bad or missing password for {zip_path}. "
                            f"If the zip is AES-encrypted (7-Zip/WinZip), stdlib "
                            f"cannot decrypt it — use `7z x -p<pwd> ...` "
                            f"externally and point the source path at the result."
                        )
                    raise
                extracted += 1
                total_bytes += info.file_size
    except zipfile.BadZipFile as e:
        return f"Error: {zip_path} is not a valid zip archive: {e}"
    except Exception as e:
        # Tool boundary: the caller gets an error string, not a traceback.
        return f"Error extracting {zip_path}: {e}"

    parts = [
        f"Extracted {extracted} file(s), {total_bytes} bytes, into {dest}",
    ]
    if skipped:
        parts.append(f"Skipped {len(skipped)} unsafe entries:")
        parts.extend(f"  - {s}" for s in skipped[:10])
        if len(skipped) > 10:
            parts.append(f"  ... ({len(skipped) - 10} more)")
    return "\n".join(parts)
|
||||
|
||||
|
||||
async def unzip_archive(
    zip_path: str, dest_dir: str, password: str | None = None,
) -> str:
    """Extract *zip_path* into *dest_dir* without blocking the event loop.

    The extraction itself is synchronous file I/O, so it is handed to the
    loop's default executor; awaiting this coroutine lets other tasks keep
    running while a large archive unpacks. Idempotent on rerun.

    Defensive: entries with absolute paths, any ``..`` component, or a
    target that would resolve outside *dest_dir* (the classic zip-slip
    vector) are skipped, as are symlink entries.

    Password-protected zips need the *password* argument (or
    ``meta.password`` on the source in case.yaml) — stdlib ``zipfile``
    only handles the legacy ZipCrypto algorithm.

    Returns:
        The summary or ``"Error..."`` string produced by the shared
        extraction core.
    """
    # Imported here so this function does not rely on a module-level import.
    import asyncio

    loop = asyncio.get_running_loop()
    return await loop.run_in_executor(
        None, _do_extract, zip_path, dest_dir, password,
    )
|
||||
|
||||
|
||||
def unzip_archive_sync(
    zip_path: str, dest_dir: str, password: str | None = None,
) -> str:
    """Blocking counterpart of :func:`unzip_archive`.

    Hands the three arguments to the shared extraction core and returns its
    summary (or error) string unchanged. Per the original author's note it
    is meant for startup-time ``prepare_source``, before an event loop
    exists, so no loop has to be created just to unpack a zip.
    """
    summary = _do_extract(zip_path, dest_dir, password)
    return summary
|
||||
Reference in New Issue
Block a user