# MASForensic/tools/media.py

"""Media plugin — OCR for image evidence.

DESIGN.md §4.7: the model backend (DeepSeek) has no vision, so we MUST run
OCR locally for any image-bearing evidence. Tesseract via pytesseract is
the default; if the runtime is missing those packages, the tool returns a
clear install hint rather than failing silently.
"""

from __future__ import annotations

import logging
import os
from pathlib import Path

# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Character budget used by ocr_image when deciding whether to cut the OCR
# text short. The header line counts against this budget; the "[truncated]"
# marker is appended on top of it.
MAX_OUTPUT = 8000

# Message returned by ocr_image (instead of raising) when the OCR runtime
# check fails; the specific reason is appended by the caller.
_INSTALL_HINT = (
    "Error: OCR runtime not available. Install with:\n"
    "  pip install pytesseract pillow\n"
    "  sudo apt install tesseract-ocr tesseract-ocr-chi-sim tesseract-ocr-chi-tra\n"
    "(or the equivalent for your distribution). Then retry."
)


def _has_ocr_runtime() -> tuple[bool, str]:
    """Return (available, reason). reason is empty when available."""
    try:
        import pytesseract  # noqa: F401
        from PIL import Image  # noqa: F401
    except ImportError as e:
        return False, f"missing python package: {e.name}"
    # Check the tesseract binary too.
    import shutil
    if shutil.which("tesseract") is None:
        return False, "tesseract binary not on PATH"
    return True, ""


async def ocr_image(file_path: str, lang: str = "eng+chi_sim+chi_tra") -> str:
    """Extract text from an image via tesseract.

    *lang* defaults to English + Simplified + Traditional Chinese, matching
    the multi-language artefacts the current case involves. Pass a single
    language code (e.g. ``"eng"``) to skip language packs that aren't
    installed.

    Args:
        file_path: Path of the image file to read.
        lang: Tesseract language spec; several codes are joined with ``+``.

    Returns:
        A one-line header (path, size in bytes, language, line count)
        followed by the recognised text. The text is cut short and a
        ``[truncated]`` marker appended when header plus text would exceed
        ``MAX_OUTPUT`` characters. The handled failure cases (not a file,
        missing runtime, unreadable image, tesseract errors) are reported
        as a returned string starting with ``"Error"`` rather than raised.
    """
    # Local imports: only needed to move the blocking OCR call off the
    # event loop.
    import asyncio
    import functools

    p = Path(file_path)
    if not p.is_file():
        return f"Error: {file_path} is not a file."
    available, reason = _has_ocr_runtime()
    if not available:
        return f"{_INSTALL_HINT}\n[detail: {reason}]"

    # Imported lazily so the module loads even when the OCR stack is absent.
    import pytesseract
    from PIL import Image

    try:
        img = Image.open(p)
    except Exception as e:
        return f"Error: could not open image {file_path}: {e}"

    try:
        # Image.open leaves the underlying file open; the context manager
        # releases it whether OCR succeeds or fails.
        with img:
            # image_to_string blocks until tesseract finishes, so run it in
            # the default executor instead of stalling the event loop.
            loop = asyncio.get_running_loop()
            text = await loop.run_in_executor(
                None,
                functools.partial(pytesseract.image_to_string, img, lang=lang),
            )
    except pytesseract.TesseractError as e:
        msg = str(e)
        if "Failed loading language" in msg or "Error opening data file" in msg:
            return (
                f"Error: tesseract is installed but missing language pack(s) for {lang!r}. "
                f"Install the language data (e.g. tesseract-ocr-chi-sim) or pass a "
                f"different `lang`. Detail: {msg}"
            )
        return f"Error running tesseract: {msg}"
    except Exception as e:
        return f"Error during OCR: {e}"

    size = p.stat().st_size
    header = (
        f"ocr: {file_path} ({size} bytes, lang={lang}, "
        f"{len(text.splitlines())} line(s))\n"
    )
    # Clamp at zero: a negative slice bound would count from the end of the
    # text and silently skip truncation.
    budget = max(0, MAX_OUTPUT - len(header))
    if len(text) > budget:
        body = text[:budget] + "\n[truncated]"
    else:
        body = text
    return header + body