# MASForensic/case.py

"""Case and evidence-source model — the foundation for multi-evidence analysis.

A :class:`Case` is a collection of :class:`EvidenceSource` entries. Each source
has a *type* (disk image, mobile extraction, archive, ...) and an *access mode*
that determines how forensic tools reach its contents:

  - ``"image"`` — a block device / disk image, navigated by The Sleuth Kit via
    inode addressing (raw, E01, dd, ...).
  - ``"tree"``  — an already-mounted filesystem or unpacked extraction,
    navigated by ordinary filesystem paths.

This module is pure data model + loading. Partition probing and interactive
selection live in ``main.py``.
"""

from __future__ import annotations

import logging
import re
from dataclasses import asdict, dataclass, field
from pathlib import Path

# Logger for this module, named after the module itself.
logger = logging.getLogger(__name__)

# Accepted values for a source's ``type`` field.
SOURCE_TYPES = {
    "disk_image",
    "mobile_extraction",
    "archive",
    "media_collection",
}

# Accepted values for a source's ``access_mode`` field.
ACCESS_MODES = {
    "image",
    "tree",
}

# File suffixes recognised as disk images during interactive discovery.
# ``.bin``, ``.vmdk`` and ``.vhd`` were added by the P6 fix: matching on
# extension alone used to overlook raw block-device dumps such as
# ``blk0_sda.bin``.
DISK_IMAGE_EXTS = {
    ".001",
    ".dd",
    ".raw",
    ".img",
    ".bin",
    ".e01",
    ".iso",
    ".vmdk",
    ".vhd",
}

# Access mode assumed for each source type when none is given explicitly.
_DEFAULT_ACCESS_MODE = dict(
    disk_image="image",
    mobile_extraction="tree",
    archive="tree",
    media_collection="tree",
)


def slugify(text: str) -> str:
    """Turn *text* into an ID-friendly slug.

    The text is lower-cased and cut at every run of characters other than
    ``a-z`` and ``0-9``; the remaining pieces are joined with single hyphens.
    When nothing remains, the fallback slug ``"src"`` is returned.
    """
    pieces = re.split(r"[^a-z0-9]+", text.lower())
    # Leading/trailing separator runs leave empty pieces behind; dropping them
    # keeps hyphens off the ends of the slug.
    joined = "-".join(piece for piece in pieces if piece)
    if not joined:
        return "src"
    return joined


@dataclass
class EvidenceSource:
    """A single evidence item belonging to a :class:`Case`."""

    # Identifier of the form "src-<slug>".
    id: str
    # Human-readable name.
    label: str
    # One of SOURCE_TYPES.
    type: str
    # Filesystem path to the evidence.
    path: str
    # Either "image" or "tree".
    access_mode: str
    # Associated person, if known.
    owner: str = ""
    # Sector offset; only meaningful for image-mode sources.
    partition_offset: int = 0
    # Free-form extra metadata.
    meta: dict = field(default_factory=dict)

    def to_dict(self) -> dict:
        """Return the fields of this source as a plain dict."""
        return asdict(self)

    @classmethod
    def from_dict(cls, d: dict) -> EvidenceSource:
        """Build a source from *d*, dropping keys that are not fields.

        Unknown keys are skipped so that dicts carrying extra entries
        (forward-compatible data) still load.
        """
        field_names = cls.__dataclass_fields__
        accepted = {}
        for key, value in d.items():
            if key in field_names:
                accepted[key] = value
        return cls(**accepted)

    def summary(self) -> str:
        """Return a one-line description of this source.

        The partition offset is shown (as ``@<offset>``) only for image-mode
        sources with a non-zero offset; the owner only when one is set.
        """
        kind = f"{self.type}/{self.access_mode}"
        if self.access_mode == "image" and self.partition_offset:
            kind += f"@{self.partition_offset}"
        text = f"[{self.id}] {self.label} ({kind})"
        if self.owner:
            text += f" owner={self.owner}"
        return text


@dataclass
class Case:
    """A forensic case: a set of evidence sources plus metadata."""

    case_id: str
    name: str
    sources: list[EvidenceSource] = field(default_factory=list)
    meta: dict = field(default_factory=dict)

    def to_dict(self) -> dict:
        """Return a plain-dict representation of the case.

        Each source is converted with its own ``to_dict``; ``meta`` is
        shallow-copied so the result does not alias this instance's dict.
        """
        return {
            "case_id": self.case_id,
            "name": self.name,
            "sources": [s.to_dict() for s in self.sources],
            "meta": dict(self.meta),
        }

    @classmethod
    def from_dict(cls, d: dict) -> Case:
        """Reconstruct a case from a dict such as the one ``to_dict`` produces.

        Missing ``case_id`` / ``name`` default to ``""``. A missing *or null*
        ``sources`` / ``meta`` entry is treated as empty. ``meta`` is copied
        so that later changes to the case do not leak back into *d*.
        """
        return cls(
            case_id=d.get("case_id", ""),
            name=d.get("name", ""),
            # ``or`` also covers a key that is present but set to None.
            sources=[EvidenceSource.from_dict(s) for s in d.get("sources") or []],
            meta=dict(d.get("meta") or {}),
        )

    def get_source(self, source_id: str) -> EvidenceSource | None:
        """Return the first source whose ``id`` equals *source_id*, else None."""
        return next((s for s in self.sources if s.id == source_id), None)


# ---------------------------------------------------------------------------
# case.yaml loading
# ---------------------------------------------------------------------------

def _build_source(raw: dict, base_dir: Path, index: int) -> EvidenceSource:
    """Validate and normalise one source entry from case.yaml.

    Missing ``id`` is derived from the label; missing ``access_mode`` defaults
    by type; relative paths are resolved against *base_dir* (the case file's
    directory).

    A key that is present but empty/null (``owner:`` with no value parses to
    ``None``) is treated exactly like a missing key, so it never turns into
    the literal string ``"None"``. Unknown ``type`` / ``access_mode`` values
    and a non-integer ``partition_offset`` are logged and replaced by their
    defaults instead of raising.

    Args:
        raw: One mapping from the ``sources`` list of case.yaml.
        base_dir: Directory against which relative paths are resolved.
        index: Position of the entry in the list; used for the fallback label.

    Returns:
        The normalised :class:`EvidenceSource`.
    """
    label = str(raw.get("label") or raw.get("id") or f"source-{index}")

    # ``x or default`` (rather than ``.get(key, default)``) also covers keys
    # that exist with a null value.
    src_type = str(raw.get("type") or "disk_image")
    if src_type not in SOURCE_TYPES:
        logger.warning("Unknown source type %r for %r — treating as disk_image",
                        src_type, label)
        src_type = "disk_image"

    default_mode = _DEFAULT_ACCESS_MODE.get(src_type, "tree")
    access_mode = str(raw.get("access_mode") or default_mode)
    if access_mode not in ACCESS_MODES:
        logger.warning("Unknown access_mode %r for %r — defaulting", access_mode, label)
        access_mode = default_mode

    src_id = str(raw.get("id") or f"src-{slugify(label)}")
    if not src_id.startswith("src-"):
        src_id = f"src-{slugify(src_id)}"

    # A null path must stay empty; stringifying None would otherwise yield a
    # bogus "<base_dir>/None" path that looks analysable.
    raw_path = str(raw.get("path") or "").strip()
    path = raw_path
    if raw_path:
        p = Path(raw_path).expanduser()
        if not p.is_absolute():
            p = base_dir / p
        path = str(p)

    raw_offset = raw.get("partition_offset") or 0
    try:
        partition_offset = int(raw_offset)
    except (TypeError, ValueError):
        logger.warning("Invalid partition_offset %r for %r — using 0",
                        raw_offset, label)
        partition_offset = 0

    return EvidenceSource(
        id=src_id,
        label=label,
        type=src_type,
        path=path,
        access_mode=access_mode,
        owner=str(raw.get("owner") or ""),
        partition_offset=partition_offset,
        # Copy so the source does not alias the parsed YAML structure.
        meta=dict(raw.get("meta") or {}),
    )


def build_case(data: dict, base_dir: Path | None = None) -> Case:
    """Build a validated :class:`Case` from a loosely-typed case.yaml dict.

    Non-mapping entries in ``sources`` are skipped with a warning. Source IDs
    are made unique: a repeated ID gets ``-<n>`` appended, starting with the
    entry's index and counting up until the result is unused. Sources without
    a path are kept but reported.

    Null ``sources`` / ``meta`` are treated as empty, and a null ``case_id``
    or ``name`` falls back to its default instead of becoming ``"None"``.

    Args:
        data: Top-level mapping parsed from case.yaml.
        base_dir: Directory for resolving relative source paths; defaults to
            the current working directory.

    Returns:
        The assembled :class:`Case`.
    """
    base_dir = base_dir or Path.cwd()
    sources: list[EvidenceSource] = []
    seen_ids: set[str] = set()
    for i, raw in enumerate(data.get("sources") or []):
        if not isinstance(raw, dict):
            logger.warning("Skipping malformed source entry #%d", i)
            continue
        src = _build_source(raw, base_dir, i)
        if src.id in seen_ids:
            # The index-suffixed ID may itself already be taken (e.g. an
            # earlier entry explicitly named "src-a-2"), so keep counting.
            base_id = src.id
            suffix = i
            while f"{base_id}-{suffix}" in seen_ids:
                suffix += 1
            src.id = f"{base_id}-{suffix}"
        seen_ids.add(src.id)
        if not src.path:
            logger.warning("Source %r has no path — keeping but it is not analysable",
                            src.label)
        sources.append(src)

    case_id = data.get("case_id")
    name = data.get("name")
    return Case(
        # Only None falls back; other values (including 0) are kept.
        case_id="case" if case_id is None else str(case_id),
        name="Untitled case" if name is None else str(name),
        sources=sources,
        meta=dict(data.get("meta") or {}),
    )


def load_case(path: str | Path = "case.yaml") -> Case | None:
    """Load a :class:`Case` from a case.yaml file.

    The file is read as UTF-8 rather than the platform's locale encoding, so
    non-ASCII labels and owners load the same way on every system.

    Args:
        path: Location of the case file.

    Returns:
        The loaded case, or ``None`` when the file does not exist, cannot be
        read or parsed (logged as an error), or does not contain a mapping at
        the top level.
    """
    case_path = Path(path)
    if not case_path.exists():
        return None
    # Imported lazily so the module can be imported without PyYAML as long as
    # no case file is actually loaded.
    import yaml

    try:
        data = yaml.safe_load(case_path.read_text(encoding="utf-8")) or {}
    except Exception as e:
        # Boundary handler: any read/decode/parse failure means "no usable
        # case file"; report it and let the caller fall back.
        logger.error("Failed to parse %s: %s", case_path, e)
        return None
    if not isinstance(data, dict):
        logger.error("%s is not a YAML mapping", case_path)
        return None

    # Relative source paths are resolved against the case file's directory.
    case = build_case(data, base_dir=case_path.resolve().parent)
    logger.info("Loaded case %r with %d source(s) from %s",
                case.name, len(case.sources), case_path)
    return case


def single_source_case(
    image_path: str,
    partition_offset: int = 0,
    label: str | None = None,
) -> Case:
    """Build an ad-hoc :class:`Case` that holds exactly one disk image.

    Used as the interactive fallback. The case and its source are named
    *label*, or the image's file name when no label is given; the source ID
    is derived from the file name without its final suffix.
    """
    image = Path(image_path)
    display_name = label if label else image.name
    only_source = EvidenceSource(
        id=f"src-{slugify(image.stem)}",
        label=display_name,
        type="disk_image",
        path=image_path,
        access_mode="image",
        partition_offset=partition_offset,
    )
    adhoc_case = Case(case_id="adhoc", name=display_name, sources=[only_source])
    return adhoc_case