Consolidates the long-running refit work (DESIGN.md as authoritative spec)
into a single baseline commit. Six stages landed together:
S1 Case + EvidenceSource abstraction; tools parameterised by source_id
(case.py, main.py multi-source bootstrap, .bin extension support)
S2 Grounding gateway in add_phenomenon: verified_facts cite real
ToolInvocation ids; substring / normalised match enforced; agent +
task scope checked. Phenomenon.description split into verified_facts
(grounded) + interpretation (free text). [invocation: inv-xxx]
prefix on every wrapped tool result so the LLM can cite.
S3 Confidence as additive log-odds: edge_type → log10(LR) calibration
table; commutative updates; supported / refuted thresholds derived
from log_odds; hypothesis × evidence matrix view.
S4 iOS plugin: unzip_archive + parse_plist / sqlite_tables /
sqlite_query / parse_ios_keychain / read_idevice_info;
IOSArtifactAgent; SOURCE_TYPE_AGENTS routing.
S5 Cross-source entity resolution: typed identifiers on Entity,
observe_identity gateway, auto coref hypothesis with shared /
conflicting strong/weak LR edges, reversible same_as edges,
actor_clusters() view.
S6 Android partition probe + AndroidArtifactAgent; MediaAgent with
OCR fallback; orchestrator Phase 1 iterates every analysable
source; platform-aware get_triage_agent_type; ReportAgent renders
actor clusters + per-source breakdown.
142 unit tests / 1 skipped — full coverage of the new gateway, log-odds
math, coref hypothesis fall-out, and orchestrator multi-source dispatch.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
88 lines
2.8 KiB
Python
88 lines
2.8 KiB
Python
"""Media plugin — OCR for image evidence.
|
|
|
|
DESIGN.md §4.7: the model backend (DeepSeek) has no vision, so we MUST run
|
|
OCR locally for any image-bearing evidence. Tesseract via pytesseract is
|
|
the default; if the runtime is missing those packages, the tool returns a
|
|
clear install hint rather than failing silently.
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import logging
|
|
import os
|
|
from pathlib import Path
|
|
|
|
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Character budget for the header plus OCR text returned by ``ocr_image``;
# text beyond the budget is cut and a "[truncated]" marker is appended.
MAX_OUTPUT = 8_000

# Returned by ``ocr_image`` (followed by a detail line) when the OCR runtime
# check fails, so the caller sees how to install the missing pieces.
_INSTALL_HINT = "".join(
    [
        "Error: OCR runtime not available. Install with:\n",
        " pip install pytesseract pillow\n",
        " sudo apt install tesseract-ocr tesseract-ocr-chi-sim tesseract-ocr-chi-tra\n",
        "(or the equivalent for your distribution). Then retry.",
    ]
)
|
|
|
|
|
|
def _has_ocr_runtime() -> tuple[bool, str]:
    """Probe for a usable OCR stack.

    Returns ``(True, "")`` when ``pytesseract``, Pillow and a ``tesseract``
    executable on PATH are all present. Otherwise returns ``(False, reason)``,
    where *reason* names the first missing piece.
    """
    from shutil import which

    try:
        import pytesseract  # noqa: F401
        from PIL import Image  # noqa: F401
    except ImportError as missing:
        return False, f"missing python package: {missing.name}"

    # The Python packages only wrap the command-line tool, so the binary
    # itself has to be discoverable as well.
    binary_found = which("tesseract") is not None
    if binary_found:
        return True, ""
    return False, "tesseract binary not on PATH"
|
|
|
|
|
|
def _run_ocr_blocking(image_path: Path, file_path: str, lang: str) -> str:
    """Run tesseract on *image_path* and format the tool result (blocking).

    Synchronous worker for ``ocr_image``. The caller is expected to have
    verified that the OCR runtime is importable. Open and OCR failures are
    reported as ``Error...`` strings rather than raised.

    Args:
        image_path: Path object for the image to read.
        file_path: The path exactly as the caller supplied it; used verbatim
            in messages and in the result header.
        lang: Tesseract language spec, e.g. ``"eng"`` or ``"eng+chi_sim"``.

    Returns:
        ``"ocr: <path> (<size> bytes, lang=<lang>, <n> line(s))\\n<text>"`` on
        success, otherwise an error message.
    """
    import pytesseract
    from PIL import Image

    try:
        img = Image.open(image_path)
    except Exception as e:  # tool boundary: report the failure, don't raise
        return f"Error: could not open image {file_path}: {e}"

    try:
        # Image.open() keeps the file open for lazy decoding; the context
        # manager closes it once OCR is done, on success and on failure.
        with img:
            text = pytesseract.image_to_string(img, lang=lang)
    except pytesseract.TesseractError as e:
        msg = str(e)
        if "Failed loading language" in msg or "Error opening data file" in msg:
            return (
                f"Error: tesseract is installed but missing language pack(s) for {lang!r}. "
                f"Install the language data (e.g. tesseract-ocr-chi-sim) or pass a "
                f"different `lang`. Detail: {msg}"
            )
        return f"Error running tesseract: {msg}"
    except Exception as e:
        return f"Error during OCR: {e}"

    size = image_path.stat().st_size
    header = (
        f"ocr: {file_path} ({size} bytes, lang={lang}, "
        f"{len(text.splitlines())} line(s))\n"
    )
    # Clamp at zero: with a very long header the budget would go negative and
    # a negative slice bound would keep almost all of the text instead of
    # truncating it.
    budget = max(0, MAX_OUTPUT - len(header))
    if len(text) > budget:
        body = text[:budget] + "\n[truncated]"
    else:
        body = text
    return header + body


async def ocr_image(file_path: str, lang: str = "eng+chi_sim+chi_tra") -> str:
    """Extract text from an image via tesseract.

    *lang* defaults to English + Simplified + Traditional Chinese, matching
    the multi-language artefacts the current case involves. Pass a single
    language code (e.g. ``"eng"``) to skip language packs that aren't
    installed.

    The OCR call itself is blocking, so it is run in the event loop's default
    executor to keep the loop responsive while tesseract works.

    Args:
        file_path: Path to the image file.
        lang: Tesseract language spec.

    Returns:
        A header line followed by the recognised text (truncated to the
        ``MAX_OUTPUT`` budget), or an ``Error...`` message when the path is
        not a file, the OCR runtime is unavailable, or OCR fails.
    """
    p = Path(file_path)
    if not p.is_file():
        return f"Error: {file_path} is not a file."

    available, reason = _has_ocr_runtime()
    if not available:
        return f"{_INSTALL_HINT}\n[detail: {reason}]"

    import asyncio

    loop = asyncio.get_running_loop()
    return await loop.run_in_executor(None, _run_ocr_blocking, p, file_path, lang)
|