"""Media plugin — OCR for image evidence. DESIGN.md §4.7: the model backend (DeepSeek) has no vision, so we MUST run OCR locally for any image-bearing evidence. Tesseract via pytesseract is the default; if the runtime is missing those packages, the tool returns a clear install hint rather than failing silently. """ from __future__ import annotations import logging import os from pathlib import Path logger = logging.getLogger(__name__) MAX_OUTPUT = 8000 _INSTALL_HINT = ( "Error: OCR runtime not available. Install with:\n" " pip install pytesseract pillow\n" " sudo apt install tesseract-ocr tesseract-ocr-chi-sim tesseract-ocr-chi-tra\n" "(or the equivalent for your distribution). Then retry." ) def _has_ocr_runtime() -> tuple[bool, str]: """Return (available, reason). reason is empty when available.""" try: import pytesseract # noqa: F401 from PIL import Image # noqa: F401 except ImportError as e: return False, f"missing python package: {e.name}" # Check the tesseract binary too. import shutil if shutil.which("tesseract") is None: return False, "tesseract binary not on PATH" return True, "" async def ocr_image(file_path: str, lang: str = "eng+chi_sim+chi_tra") -> str: """Extract text from an image via tesseract. *lang* defaults to English + Simplified + Traditional Chinese, matching the multi-language artefacts the current case involves. Pass a single language code (e.g. ``"eng"``) to skip language packs that aren't installed. """ p = Path(file_path) if not p.is_file(): return f"Error: {file_path} is not a file." available, reason = _has_ocr_runtime() if not available: return f"{_INSTALL_HINT}\n[detail: {reason}]" import pytesseract from PIL import Image try: img = Image.open(p) except Exception as e: return f"Error: could not open image {file_path}: {e}" try: text = pytesseract.image_to_string(img, lang=lang) except pytesseract.TesseractError as e: msg = str(e) if "Failed loading language" in msg or "Error opening data file" in msg: return ( f"Error: tesseract is installed but missing language pack(s) for {lang!r}. " f"Install the language data (e.g. tesseract-ocr-chi-sim) or pass a " f"different `lang`. Detail: {msg}" ) return f"Error running tesseract: {msg}" except Exception as e: return f"Error during OCR: {e}" size = p.stat().st_size header = ( f"ocr: {file_path} ({size} bytes, lang={lang}, " f"{len(text.splitlines())} line(s))\n" ) if len(text) > MAX_OUTPUT - len(header): body = text[:MAX_OUTPUT - len(header)] + "\n[truncated]" else: body = text return header + body