"""Archive extraction tools — generic unzip for tree-mode evidence sources.

Mobile extractions (iOS / Android backups), archive sources, and shared
work products all arrive as .zip files. The forensic agents work on the
unpacked tree; this module is the single entry point for safely turning
an archive into a directory.

Stdlib-only. No graph dependency.
"""

from __future__ import annotations

import contextlib
import logging
import os
import shutil
import stat
import zipfile
from pathlib import Path

logger = logging.getLogger(__name__)

# Buffer size used when streaming a member from the archive to disk.
_CHUNK_SIZE = 65536

# General-purpose flag bit 0 of a zip entry: the member is encrypted.
_ENCRYPTED_FLAG = 0x1


def _is_within(base: Path, target: Path) -> bool:
    """True when *target* resolves to a path inside *base* — symlink-safe.

    Both paths are resolved first, so a symlink already present under
    *base* that points elsewhere makes the check fail. Any resolution
    error is treated as "not within" (fail closed).
    """
    try:
        base_r = base.resolve()
        target_r = target.resolve()
    except (OSError, RuntimeError):
        # Before Python 3.13, Path.resolve() reports symlink loops with
        # RuntimeError rather than OSError.
        return False
    try:
        target_r.relative_to(base_r)
    except ValueError:
        return False
    return True


def _is_zip_encrypted(zf: zipfile.ZipFile) -> bool:
    """True when any entry has the zip 'encrypted' flag bit set."""
    return any(info.flag_bits & _ENCRYPTED_FLAG for info in zf.infolist())


def _is_symlink_entry(info: zipfile.ZipInfo) -> bool:
    """True when the entry's stored Unix mode marks it as a symbolic link.

    The Unix mode lives in the high 16 bits of ``external_attr``.
    """
    return stat.S_ISLNK(info.external_attr >> 16)


def _write_entry(
    zf: zipfile.ZipFile,
    info: zipfile.ZipInfo,
    target: Path,
    pwd: bytes | None,
) -> None:
    """Stream one archive member into *target*.

    If reading or writing fails part-way (bad CRC, corrupt compressed
    data, disk error), the partially written *target* is removed before
    the exception propagates, so no truncated file is left in the tree.
    Errors raised while opening the member or *target* leave the
    filesystem untouched by this cleanup.
    """
    with zf.open(info, "r", pwd=pwd) as src, open(target, "wb") as out:
        try:
            shutil.copyfileobj(src, out, _CHUNK_SIZE)
        except BaseException:
            # Close before unlinking: an open file cannot be removed on
            # Windows. Cleanup is best-effort; the original error wins.
            with contextlib.suppress(OSError):
                out.close()
            with contextlib.suppress(OSError):
                target.unlink()
            raise


def _do_extract(
    zip_path: str,
    dest_dir: str,
    password: str | None = None,
) -> str:
    """Shared core for unzip_archive (async) and unzip_archive_sync.

    Pure stdlib + filesystem I/O — no asyncio. Idempotent on rerun (files
    whose target already exists at the matching size are skipped).

    Entries with absolute paths, ``..`` components, or that resolve outside
    *dest_dir* are skipped, as are symlink entries. A member that fails
    while being written does not leave a partial file behind.

    Args:
        zip_path: Path of the archive to unpack.
        dest_dir: Directory to unpack into; created if missing.
        password: Optional archive password (encoded as UTF-8).

    Returns:
        A multi-line summary the agent can read directly, or a single
        ``Error: ...`` / ``Error extracting ...`` line on failure.
    """
    zp = Path(zip_path)
    if not zp.is_file():
        return f"Error: {zip_path} is not a file."

    dest = Path(dest_dir)
    dest.mkdir(parents=True, exist_ok=True)

    extracted = 0
    skipped: list[str] = []
    total_bytes = 0
    pwd_bytes = password.encode("utf-8") if password else None

    try:
        with zipfile.ZipFile(zp, "r") as zf:
            if pwd_bytes is None and _is_zip_encrypted(zf):
                return (
                    f"Error: {zip_path} is password-protected. "
                    f"Provide the password via case.yaml's "
                    f"meta.password on this source, or pass `password=` "
                    f"explicitly. Stdlib zipfile only supports the legacy "
                    f"ZipCrypto algorithm — AES-encrypted zips (created by "
                    f"7-Zip / WinZip) need an external tool like 7z."
                )

            for info in zf.infolist():
                name = info.filename

                # Block absolute paths and parent-escape attempts up front.
                if name.startswith(("/", "\\")) or ".." in Path(name).parts:
                    skipped.append(f"escape: {name}")
                    continue

                target = dest / name
                if not _is_within(dest, target):
                    skipped.append(f"escape: {name}")
                    continue

                # Symlink entries — skip rather than risk traversing out.
                if _is_symlink_entry(info):
                    skipped.append(f"symlink: {name}")
                    continue

                if info.is_dir():
                    target.mkdir(parents=True, exist_ok=True)
                    continue

                # Skip if already extracted with matching size (idempotent
                # rerun). Only a regular file counts as "already extracted".
                if target.is_file() and target.stat().st_size == info.file_size:
                    continue

                target.parent.mkdir(parents=True, exist_ok=True)
                try:
                    _write_entry(zf, info, target, pwd_bytes)
                except RuntimeError as e:
                    # zipfile raises RuntimeError for a bad or missing
                    # password, and NotImplementedError (a RuntimeError
                    # subclass) for methods it cannot decode — which is how
                    # AES-encrypted members surface.
                    msg = str(e)
                    unsupported_encryption = isinstance(
                        e, NotImplementedError
                    ) and bool(info.flag_bits & _ENCRYPTED_FLAG)
                    if (
                        unsupported_encryption
                        or "Bad password" in msg
                        or "password required" in msg
                    ):
                        return (
                            f"Error: bad or missing password for {zip_path}. "
                            f"If the zip is AES-encrypted (7-Zip/WinZip), stdlib "
                            f"cannot decrypt it — use `7z x -p ...` "
                            f"externally and point the source path at the result."
                        )
                    raise

                extracted += 1
                total_bytes += info.file_size
    except zipfile.BadZipFile as e:
        return f"Error: {zip_path} is not a valid zip archive: {e}"
    except Exception as e:
        # Boundary handler: callers get a readable string, the log keeps
        # the traceback.
        logger.exception("Failed to extract %s", zip_path)
        return f"Error extracting {zip_path}: {e}"

    parts = [
        f"Extracted {extracted} file(s), {total_bytes} bytes, into {dest}",
    ]
    if skipped:
        parts.append(f"Skipped {len(skipped)} unsafe entries:")
        parts.extend(f" - {s}" for s in skipped[:10])
        if len(skipped) > 10:
            parts.append(f" ... ({len(skipped) - 10} more)")
    return "\n".join(parts)


async def unzip_archive(
    zip_path: str,
    dest_dir: str,
    password: str | None = None,
) -> str:
    """Extract *zip_path* into *dest_dir*. Idempotent on rerun.

    Defensive: rejects entries with absolute paths, leading '..', or that
    would resolve outside *dest_dir* (the classic zip-slip vector). Symlink
    entries are skipped (we never follow symlinks into the host filesystem).

    Password-protected zips need the password argument (or
    ``meta.password`` on the source in case.yaml) — stdlib ``zipfile`` only
    handles the legacy ZipCrypto algorithm.

    NOTE: the extraction itself runs synchronously on the calling thread;
    this coroutine does not yield while unpacking.
    """
    return _do_extract(zip_path, dest_dir, password)


def unzip_archive_sync(
    zip_path: str,
    dest_dir: str,
    password: str | None = None,
) -> str:
    """Synchronous variant of :func:`unzip_archive` for startup-time prepare_source.

    Same behaviour, just no async wrapping — used before the event loop
    starts so we don't have to spin one up just to unpack a zip.
    """
    return _do_extract(zip_path, dest_dir, password)