Files
MASForensic/tools/media.py
BattleTag 81ade8f7ac feat(refit): complete S1-S6 — case abstraction, grounding, log-odds, plugins, coref, multi-source
Consolidates the long-running refit work (DESIGN.md as authoritative spec)
into a single baseline commit. Six stages landed together:

  S1  Case + EvidenceSource abstraction; tools parameterised by source_id
      (case.py, main.py multi-source bootstrap, .bin extension support)
  S2  Grounding gateway in add_phenomenon: verified_facts cite real
      ToolInvocation ids; substring / normalised match enforced; agent +
      task scope checked. Phenomenon.description split into verified_facts
      (grounded) + interpretation (free text). [invocation: inv-xxx]
      prefix on every wrapped tool result so the LLM can cite.
  S3  Confidence as additive log-odds: edge_type → log10(LR) calibration
      table; commutative updates; supported / refuted thresholds derived
      from log_odds; hypothesis × evidence matrix view.
  S4  iOS plugin: unzip_archive + parse_plist / sqlite_tables /
      sqlite_query / parse_ios_keychain / read_idevice_info;
      IOSArtifactAgent; SOURCE_TYPE_AGENTS routing.
  S5  Cross-source entity resolution: typed identifiers on Entity,
      observe_identity gateway, auto coref hypothesis with shared /
      conflicting strong/weak LR edges, reversible same_as edges,
      actor_clusters() view.
  S6  Android partition probe + AndroidArtifactAgent; MediaAgent with
      OCR fallback; orchestrator Phase 1 iterates every analysable
      source; platform-aware get_triage_agent_type; ReportAgent renders
      actor clusters + per-source breakdown.

142 unit tests / 1 skipped — full coverage of the new gateway, log-odds
math, coref hypothesis fall-out, and orchestrator multi-source dispatch.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-21 02:12:10 -10:00

88 lines
2.8 KiB
Python

"""Media plugin — OCR for image evidence.
DESIGN.md §4.7: the model backend (DeepSeek) has no vision, so we MUST run
OCR locally for any image-bearing evidence. Tesseract via pytesseract is
the default; if the runtime is missing those packages, the tool returns a
clear install hint rather than failing silently.
"""
from __future__ import annotations
import logging
import os
from pathlib import Path
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Character budget shared by the header line and the OCR text in the string
# returned by ocr_image; text beyond the budget is cut and marked as truncated.
MAX_OUTPUT = 8000

# Returned (with a detail suffix) when the Python packages or the tesseract
# binary cannot be found, so the caller sees how to fix the environment.
_INSTALL_HINT = "\n".join(
    (
        "Error: OCR runtime not available. Install with:",
        " pip install pytesseract pillow",
        " sudo apt install tesseract-ocr tesseract-ocr-chi-sim tesseract-ocr-chi-tra",
        "(or the equivalent for your distribution). Then retry.",
    )
)
def _has_ocr_runtime() -> tuple[bool, str]:
    """Report whether local OCR can run.

    Checks that ``pytesseract`` and Pillow are importable and that the
    tesseract executable pytesseract will invoke can be located.

    Returns:
        ``(available, reason)``. *reason* is the empty string when
        *available* is true; otherwise it names the missing piece.
    """
    import shutil

    try:
        import pytesseract
        from PIL import Image  # noqa: F401
    except ImportError as e:
        # ImportError.name can be None (e.g. raised manually inside a
        # package); fall back to the exception text so the detail is useful.
        return False, f"missing python package: {e.name or e}"

    # pytesseract runs whatever ``pytesseract.pytesseract.tesseract_cmd``
    # names (default "tesseract"), and deployments may point it at a binary
    # outside PATH. Probe that command rather than a hard-coded name.
    # shutil.which resolves bare names via PATH and checks explicit paths
    # directly.
    wrapper = getattr(pytesseract, "pytesseract", None)
    cmd = str(getattr(wrapper, "tesseract_cmd", None) or "tesseract")
    if shutil.which(cmd) is None:
        if cmd == "tesseract":
            return False, "tesseract binary not on PATH"
        return False, f"tesseract binary not found: {cmd}"
    return True, ""
async def ocr_image(file_path: str, lang: str = "eng+chi_sim+chi_tra") -> str:
    """Extract text from an image via tesseract.

    *lang* defaults to English + Simplified + Traditional Chinese, matching
    the multi-language artefacts the current case involves. Pass a single
    language code (e.g. ``"eng"``) to skip language packs that aren't
    installed.

    Args:
        file_path: Path of the image file to OCR.
        lang: Tesseract language spec; multiple packs are joined with ``+``.

    Returns:
        On success, a header line (path, size in bytes, language, number of
        text lines) followed by the recognised text. The text is cut to the
        ``MAX_OUTPUT`` budget left after the header and suffixed with
        ``[truncated]`` when it does not fit. Expected failures (not a file,
        OCR runtime missing, unreadable image, tesseract errors) are
        reported as a string beginning with ``"Error"`` instead of raising.
    """
    p = Path(file_path)
    if not p.is_file():
        return f"Error: {file_path} is not a file."

    available, reason = _has_ocr_runtime()
    if not available:
        return f"{_INSTALL_HINT}\n[detail: {reason}]"

    # Imported lazily so the module loads without the OCR stack installed.
    import asyncio
    import functools

    import pytesseract
    from PIL import Image

    try:
        img = Image.open(p)
    except Exception as e:
        return f"Error: could not open image {file_path}: {e}"

    try:
        # Image.open is lazy and keeps the file handle open; the context
        # manager closes it whether or not OCR succeeds.
        with img:
            # image_to_string blocks until tesseract finishes; run it in the
            # default executor so other coroutines keep running meanwhile.
            loop = asyncio.get_running_loop()
            text = await loop.run_in_executor(
                None,
                functools.partial(pytesseract.image_to_string, img, lang=lang),
            )
    except pytesseract.TesseractError as e:
        msg = str(e)
        if "Failed loading language" in msg or "Error opening data file" in msg:
            return (
                f"Error: tesseract is installed but missing language pack(s) for {lang!r}. "
                f"Install the language data (e.g. tesseract-ocr-chi-sim) or pass a "
                f"different `lang`. Detail: {msg}"
            )
        return f"Error running tesseract: {msg}"
    except Exception as e:
        return f"Error during OCR: {e}"

    size = p.stat().st_size
    header = (
        f"ocr: {file_path} ({size} bytes, lang={lang}, "
        f"{len(text.splitlines())} line(s))\n"
    )
    # Clamp at zero: with a very long path the header alone can exceed
    # MAX_OUTPUT, and a negative slice bound would drop the tail of the text
    # instead of truncating it.
    budget = max(0, MAX_OUTPUT - len(header))
    if len(text) > budget:
        body = text[:budget] + "\n[truncated]"
    else:
        body = text
    return header + body