# MASForensic/evidence_graph.py

"""Evidence Knowledge Graph for multi-agent forensic analysis.

Replaces the flat Blackboard with a graph-based evidence store.
Nodes: Phenomenon (observable artifacts), Hypothesis (interpretive claims), Entity (recurring objects).
Edges: typed relationships with predefined weights for hypothesis confidence computation.
"""

from __future__ import annotations

import asyncio
import contextvars
import hashlib
import json
import logging
import re
import uuid
from dataclasses import asdict, dataclass, field
from datetime import datetime
from pathlib import Path

# Per-asyncio-task scoped values for "which agent is currently running" and
# "which task scope does that agent's grounding live in". Backed by
# ContextVars so concurrent agent runs (Phase 3's _dispatch_leads_parallel)
# don't clobber each other — asyncio.create_task / asyncio.gather copies
# the parent context per child task, and writes inside one task stay there.
# Pre-P0 these were plain attributes on EvidenceGraph; the last setter won
# under concurrency, tagging tool invocations with the WRONG agent and
# making the grounding gateway falsely reject legitimate facts.
# Name of the agent running in the current context; "" when nothing has set it.
_current_agent_ctx: contextvars.ContextVar[str] = contextvars.ContextVar(
    "masforensics_current_agent", default="",
)
# Task-scope id paired with the agent above; "" when nothing has set it.
_current_task_id_ctx: contextvars.ContextVar[str] = contextvars.ContextVar(
    "masforensics_current_task_id", default="",
)

from case import Case, EvidenceSource, single_source_case

logger = logging.getLogger(__name__)

# ---------------------------------------------------------------------------
# Per-edge-type log₁₀(LR) — the calibration table backing hypothesis
# confidence updates (DESIGN.md §4.5).
#
# The LLM only picks the *category* (direct_evidence, supports, …); the
# numerical contribution is looked up here. Updates use the additive,
# order-independent log-odds form
#     L_post = L_prior + Σ log10(LR_i)
#     confidence = 1 / (1 + 10^(−L_post))
# which fixes the pre-S3 delta-update bug whose result depended on the
# order edges arrived in.
#
# Override per-graph via EvidenceGraph(edge_log_lr=...) or config.yaml's
# `hypothesis_log_lr` section.
# ---------------------------------------------------------------------------
# Default calibration table: edge-type name → log10 likelihood ratio.
_DEFAULT_LOG_LR: dict[str, float] = dict(
    # Evidence edges between a phenomenon and a hypothesis.
    direct_evidence=2.0,
    supports=1.0,
    consequence_observed=1.0,
    prerequisite_met=0.5,
    weakens=-0.5,
    contradicts=-2.0,
    # S5 cross-source coref (DESIGN.md §4.6), on the same scale: one shared
    # strong identifier (email, phone, wallet, IMEI…) is near-decisive, weak
    # identifiers (nickname) add up slowly, and a conflicting strong
    # identifier counts as strong negative evidence.
    shared_strong_identifier=2.0,
    shared_weak_identifier=0.5,
    conflicting_strong_identifier=-2.0,
)


# DESIGN.md §4.6 identifier taxonomy. Strong identifiers approximate
# global uniqueness — sharing one is high-confidence coref evidence.
# Weak identifiers are nicknames / display names — accumulate via Bayes.
STRONG_IDENTIFIER_TYPES: set[str] = {
    # Account-level identifiers
    "email", "apple_id", "icloud_id", "google_account",
    # Telephony / subscriber identifiers
    "phone_number", "imei", "imsi",
    # Device-level identifiers
    "udid", "mac_address", "device_serial",
    # Financial identifiers
    "wallet_address",
}

# Names a person chooses for themselves; not unique on their own.
WEAK_IDENTIFIER_TYPES: set[str] = {
    "username", "nickname", "display_name", "screen_name",
}


def is_strong_identifier(identifier_type: str) -> bool:
    """Report whether *identifier_type* is listed in ``STRONG_IDENTIFIER_TYPES``.

    Strong types are the ones treated as unique enough to justify a strong
    likelihood-ratio edge.
    """
    is_strong = identifier_type in STRONG_IDENTIFIER_TYPES
    return is_strong


def _normalize_identifier(identifier_type: str, value: str) -> str:
    """Return the canonical matching form of an identifier value.

    Rules, applied in order:

    1. A falsy *value* becomes ``""``; surrounding whitespace is stripped.
    2. Case-insensitive identifier types (email, account ids, MAC, wallet,
       UDID, IMEI, IMSI, device serial) are lower-cased.
    3. Phone numbers keep only their digits, which also removes a leading
       ``+`` and any separators.

    Every other type (nicknames and similar free-form strings) is returned
    stripped but otherwise unchanged.
    """
    case_insensitive_types = {
        "email", "apple_id", "icloud_id", "google_account",
        "mac_address", "wallet_address", "udid",
        "imei", "imsi", "device_serial",
    }
    cleaned = "" if not value else value
    cleaned = cleaned.strip()
    if identifier_type in case_insensitive_types:
        cleaned = cleaned.lower()
    if identifier_type == "phone_number":
        # Drop everything that is not a digit.
        cleaned = re.sub(r"\D", "", cleaned)
    return cleaned


def prob_to_log_odds(p: float) -> float:
    """Convert a probability into base-10 log-odds.

    The input is clamped to ``[1e-9, 1 - 1e-9]`` first so the result stays
    finite (roughly within ±9) even for ``p`` equal to 0 or 1.
    """
    from math import log10

    floor = 1e-9
    ceiling = 1 - 1e-9
    clamped = max(floor, min(ceiling, float(p)))
    odds = clamped / (1.0 - clamped)
    return log10(odds)


def log_odds_to_prob(log_odds: float) -> float:
    """Inverse of :func:`prob_to_log_odds`: 1 / (1 + 10^(−L)).

    Args:
        log_odds: Base-10 log-odds; any finite or infinite float.

    Returns:
        The corresponding probability in ``[0.0, 1.0]``.

    Extremely negative log-odds (below roughly −308) make ``10 ** (−L)``
    overflow a float, which Python reports as ``OverflowError`` rather than
    ``inf``. The probability is 0.0 in that regime, so return that instead of
    crashing.
    """
    try:
        inverse_odds = 10.0 ** (-float(log_odds))
    except OverflowError:
        return 0.0
    return 1.0 / (1.0 + inverse_odds)


# Matches one or more consecutive whitespace characters (newlines included).
_WS_RUN = re.compile(r"\s+")


def _normalize_for_grounding(s: str) -> str:
    """Put a string into the canonical form used by loose fact grounding.

    An exact ``value in inv.output`` test turned away genuine evidence,
    because the LLM often tidies tool output when quoting it:
      - hex changes case (``89 50 4e 47`` vs ``89 50 4E 47``)
      - path separators get flipped (``Sunny\\foo.exe`` vs ``Sunny/foo.exe``)
      - whitespace spanning newlines is collapsed (``AppleID:\\n  alice@x.com``
        vs ``AppleID: alice@x.com``)

    Those are formatting choices, not hallucinations.  Applying the same
    canonicalisation to both the quoted value and the tool output makes
    them comparable:
      - everything is lower-cased (hex, email and MAC case differences)
      - every ``\\`` becomes ``/``
      - each whitespace run becomes one space, and the ends are trimmed

    A fabricated value is still rejected: if it is absent from the output
    in every form, its normalised text is not a substring of the
    normalised output either.

    A falsy input yields ``""``.
    """
    if not s:
        return ""
    lowered = s.lower()
    forward_slashed = lowered.replace("\\", "/")
    single_spaced = _WS_RUN.sub(" ", forward_slashed)
    return single_spaced.strip()

class GroundingError(ValueError):
    """Signals that the add_phenomenon gateway rejected a phenomenon.

    Raised when at least one verified fact fails grounding: the
    invocation_id is missing or wrong, the invocation belongs to another
    agent or task, or fact.value does not appear in the cited tool output.

    The rejected facts are kept on ``failures`` so a caller (BaseAgent) can
    feed them back to the LLM and ask for a corrected retry.
    """

    def __init__(self, message: str, failures: list[dict]) -> None:
        # Keep the structured failure list next to the readable message.
        self.failures = failures
        super().__init__(message)


# All valid edge types across the graph.
VALID_EDGE_TYPES: set[str] = {
    # Phenomenon → Hypothesis
    "direct_evidence",
    "supports",
    "prerequisite_met",
    "consequence_observed",
    "contradicts",
    "weakens",
    # Phenomenon → Hypothesis, S5 coref only: links an identifier
    # observation to the "Entity A ≡ Entity B" coref hypothesis.
    "shared_strong_identifier",
    "shared_weak_identifier",
    "conflicting_strong_identifier",
    # Phenomenon → Phenomenon
    "temporal",
    "causal",
    "input_to",
    "modifies",
    "co_located",
    "corroborates",
    # Phenomenon → Entity
    "created_by",
    "executed_by",
    "owned_by",
    "targets",
    "associated_with",
    "found_on",
    "used_by",
    # Hypothesis → Hypothesis
    "refines",
    "conflicts",
    "depends_on",
    # Entity → Entity (S5; backed by a coref hypothesis ≥ threshold)
    "same_as",
}


# ---------------------------------------------------------------------------
# Graph node types
# ---------------------------------------------------------------------------

def _compute_quality_score(
    source_tool: str,
    timestamp: str | None,
    raw_data: dict,
    interpretation: str,
    verified_facts: list[dict],
    related_ids: list[str],
) -> float:
    """Score how complete a piece of evidence is, from 0.0 to 1.0.

    Each component that is present adds a fixed amount:

    - source tool named: 0.20
    - timestamp given (not None): 0.15
    - raw data attached: 0.15
    - verified facts: 0.05 each, at most 0.25 in total
    - interpretation of 50 or more characters: 0.10
    - related ids present: 0.15

    Grounded facts outweigh a long narrative on purpose: facts carry
    provenance, free-text interpretation does not.
    """
    total = 0.0
    total += 0.20 if source_tool else 0.0
    total += 0.15 if timestamp is not None else 0.0
    total += 0.15 if raw_data else 0.0
    if verified_facts:
        per_fact_credit = 0.05 * len(verified_facts)
        total += min(0.25, per_fact_credit)
    total += 0.10 if len(interpretation) >= 50 else 0.0
    total += 0.15 if related_ids else 0.0
    return min(1.0, total)


def _jaccard_similarity(a: str, b: str) -> float:
    """Jaccard index of the lower-cased, whitespace-split tokens of two strings.

    Returns 0.0 if either string has no tokens; otherwise
    ``|A ∩ B| / |A ∪ B|``.
    """
    left = set(a.lower().split())
    right = set(b.lower().split())
    if not (left and right):
        return 0.0
    shared = left.intersection(right)
    combined = left.union(right)
    return len(shared) / len(combined)


@dataclass
class Phenomenon:
    """An observable artifact recovered from the evidence.

    Following DESIGN.md §4.4, a phenomenon has two parts: *facts*, which are
    bound to provenance and hard-validated by the gateway against the tool
    invocation each one cites, and a free-text *interpretation*, which is
    the agent's own narrative. The final report presents the interpretation
    as "agent analysis", not as established truth.
    """

    id: str                     # "ph-{uuid8}"
    source_agent: str
    category: str               # filesystem, registry, email, network, timeline
    title: str
    # The agent's reasoning in free text. Never verified.
    interpretation: str = ""
    # Grounded atoms, each shaped {type, value, invocation_id}, with
    # type ∈ {path, timestamp, inode, hash, identifier, count, raw, ...}
    verified_facts: list[dict] = field(default_factory=list)
    raw_data: dict = field(default_factory=dict)
    timestamp: str | None = None
    confidence: float = 1.0
    source_tool: str = ""
    source_id: str = ""         # id of the EvidenceSource this finding came from
    corroborating_agents: list[str] = field(default_factory=list)
    from_lead_id: str | None = None
    created_at: str = ""

    def to_dict(self) -> dict:
        """Return every field as a plain dict (via ``dataclasses.asdict``)."""
        return asdict(self)

    @classmethod
    def from_dict(cls, d: dict) -> Phenomenon:
        """Build a Phenomenon from a dict, tolerating older layouts.

        Earlier runs stored the free text under ``description``. That key is
        removed and its value becomes ``interpretation`` unless the dict
        already has one, so old graph_state.json files still load. A missing
        ``verified_facts`` defaults to an empty list, and keys that are not
        dataclass fields are dropped. The input dict is not modified.
        """
        payload = dict(d)
        if "description" in payload:
            legacy_text = payload.pop("description") or ""
            if "interpretation" not in payload:
                payload["interpretation"] = legacy_text
        if "verified_facts" not in payload:
            payload["verified_facts"] = []
        field_names = set(cls.__dataclass_fields__)
        accepted = {
            name: payload[name] for name in payload if name in field_names
        }
        return cls(**accepted)

    def summary(self) -> str:
        """One-line label: id, category, title, optional time, confidence, fact count."""
        when = "" if not self.timestamp else f" @ {self.timestamp}"
        fact_count = len(self.verified_facts)
        facts_note = f" facts={fact_count}" if fact_count else ""
        head = f"[{self.id}] [{self.category}] {self.title}{when} "
        tail = f"(conf={self.confidence:.2f}{facts_note})"
        return head + tail


@dataclass
class Hypothesis:
    """A claim about what took place on the system.

    Per DESIGN.md §4.5, ``log_odds`` is the canonical state: each
    Phenomenon→Hypothesis edge adds its log₁₀(LR) to it. ``confidence`` is
    only a projection, ``1 / (1 + 10^(−log_odds))``, kept in sync for
    display and for threshold checks (≥0.8 supported / ≤0.2 refuted).

    ``prior_prob`` seeds the initial log_odds (the default 0.5 maps to 0.0).
    """

    id: str                     # "hyp-{uuid8}"
    title: str
    description: str
    prior_prob: float = 0.5
    log_odds: float = 0.0
    confidence: float = 0.5     # derived from log_odds — kept in sync on update
    status: str = "active"      # active, supported, refuted, inconclusive
    parent_id: str | None = None
    created_by: str = ""        # "manual", "hypothesis_agent", agent name
    created_at: str = ""
    confidence_log: list[dict] = field(default_factory=list)
    # S5 coref only: the two entity ids this hypothesis claims are one actor.
    # Lets update_hypothesis_confidence keep the backing ``same_as`` edge in
    # step automatically when contradicting evidence shows up.
    coref_entity_pair: list[str] = field(default_factory=list)

    def to_dict(self) -> dict:
        """Return every field as a plain dict (via ``dataclasses.asdict``)."""
        return asdict(self)

    @classmethod
    def from_dict(cls, d: dict) -> Hypothesis:
        """Build a Hypothesis from a dict, upgrading pre-S3 records.

        Records that carry only ``confidence`` get ``log_odds`` from the
        logit transform (0.5 is assumed if confidence is absent too).
        ``prior_prob`` defaults to 0.5. ``confidence`` is always recomputed
        from ``log_odds`` so drift in old files is corrected. Keys that are
        not dataclass fields are dropped; the input dict is not modified.
        """
        record = dict(d)
        if "log_odds" not in record:
            stored_confidence = record.get("confidence", 0.5)
            record["log_odds"] = prob_to_log_odds(stored_confidence)
        if "prior_prob" not in record:
            record["prior_prob"] = 0.5
        # log_odds is authoritative: whatever confidence was stored is replaced.
        record["confidence"] = log_odds_to_prob(record["log_odds"])
        field_names = set(cls.__dataclass_fields__)
        accepted = {
            name: record[name] for name in record if name in field_names
        }
        return cls(**accepted)

    def summary(self) -> str:
        """One-line label: id, title, confidence, signed log-odds and status."""
        head = f"[{self.id}] {self.title} "
        stats = f"(conf={self.confidence:.2f}, L={self.log_odds:+.2f}, {self.status})"
        return head + stats


@dataclass
class Entity:
    """An actor or object that appears across several phenomena.

    Per DESIGN.md §4.6, typed identifiers sit directly on the entity so
    coref blocking lookups are fast. Each identifier entry is a dict:
        {type, value, normalized, strong, invocation_id, phenomenon_id, observed_at}
    ``normalized`` is the canonical form used for matching (lower-cased
    email, digits-only phone, …).
    """

    id: str                     # "ent-{uuid8}"
    name: str
    entity_type: str            # person, program, file, host, ip_address
    description: str = ""
    identifiers: list[dict] = field(default_factory=list)
    created_at: str = ""

    def to_dict(self) -> dict:
        """Return every field as a plain dict (via ``dataclasses.asdict``)."""
        return asdict(self)

    @classmethod
    def from_dict(cls, d: dict) -> Entity:
        """Build an Entity from a dict.

        ``identifiers`` defaults to an empty list and keys that are not
        dataclass fields are dropped. The input dict is not modified.
        """
        record = dict(d)
        if "identifiers" not in record:
            record["identifiers"] = []
        field_names = set(cls.__dataclass_fields__)
        accepted = {
            name: record[name] for name in record if name in field_names
        }
        return cls(**accepted)

    def has_identifier(self, identifier_type: str, normalized_value: str) -> bool:
        """True if some identifier matches both the type and the normalized value."""
        for ident in self.identifiers:
            if (
                ident.get("type") == identifier_type
                and ident.get("normalized") == normalized_value
            ):
                return True
        return False

    def summary(self) -> str:
        """One-line label: id, type, name and a preview of up to three identifiers."""
        if not self.identifiers:
            ident_note = ""
        else:
            shown = self.identifiers[:3]
            preview = ", ".join(
                f"{ident.get('type')}={ident.get('value')}" for ident in shown
            )
            hidden = len(self.identifiers) - 3
            extra = f" (+{hidden} more)" if hidden > 0 else ""
            ident_note = f" [{preview}{extra}]"
        return f"[{self.id}] {self.entity_type}: {self.name}{ident_note}"


@dataclass
class Edge:
    """Directed edge in the evidence graph.

    ``source_id`` and ``target_id`` are node ids, ``edge_type`` names the
    relationship, and ``metadata`` carries optional per-edge details.
    """

    id: str                     # "edge-{uuid8}"
    source_id: str
    target_id: str
    edge_type: str
    metadata: dict = field(default_factory=dict)
    created_by: str = ""
    created_at: str = ""

    def to_dict(self) -> dict:
        """Return every field as a plain dict (via ``dataclasses.asdict``)."""
        return asdict(self)

    @classmethod
    def from_dict(cls, d: dict) -> Edge:
        """Build an Edge from a dict, ignoring keys that are not fields.

        Unknown keys are dropped (as the node classes' ``from_dict`` already
        do) so a state file with extra fields still loads instead of
        raising ``TypeError``. Missing required fields still raise
        ``TypeError``.
        """
        known = set(cls.__dataclass_fields__)
        return cls(**{k: v for k, v in d.items() if k in known})


@dataclass
class Lead:
    """An investigative lead that should be followed up by an agent."""

    id: str
    target_agent: str
    description: str
    priority: int = 5           # 1 (highest) - 10 (lowest)
    context: dict = field(default_factory=dict)
    status: str = "pending"     # pending, assigned, completed, failed
    hypothesis_id: str | None = None

    def to_dict(self) -> dict:
        """Return every field as a plain dict (via ``dataclasses.asdict``)."""
        return asdict(self)

    @classmethod
    def from_dict(cls, d: dict) -> Lead:
        """Build a Lead from a dict, ignoring keys that are not fields.

        Unknown keys are dropped (as the node classes' ``from_dict`` already
        do) so a state file with extra fields still loads instead of
        raising ``TypeError``. Missing required fields still raise
        ``TypeError``.
        """
        known = set(cls.__dataclass_fields__)
        return cls(**{k: v for k, v in d.items() if k in known})


@dataclass
class InvestigationArea:
    """An area to investigate to confirm/refute one or more hypotheses.

    Derived by the orchestrator from active hypotheses after Phase 2; also
    seeded from config.yaml:investigation_areas as an optional manual
    override. Each area carries its own keywords + expected tools so the
    gap-analysis coverage check is generic, not tied to hard-coded constants.
    """

    id: str                                     # "area-{slug}"
    area: str                                   # snake_case slug (dedupe key)
    description: str
    suggested_agent: str                        # filesystem / registry / communication / network / timeline
    expected_keywords: list[str] = field(default_factory=list)
    expected_tools: list[str] = field(default_factory=list)
    priority: int = 5                           # 1 (highest) - 10 (lowest)
    motivating_hypothesis_ids: list[str] = field(default_factory=list)
    created_by: str = ""                        # "manual" | "llm_derive" | "fallback"
    created_at: str = ""

    def to_dict(self) -> dict:
        """Return every field as a plain dict (via ``dataclasses.asdict``)."""
        return asdict(self)

    @classmethod
    def from_dict(cls, d: dict) -> InvestigationArea:
        """Build an InvestigationArea from a dict, ignoring non-field keys.

        Unknown keys are dropped (as the node classes' ``from_dict`` already
        do) so a state file with extra fields still loads instead of
        raising ``TypeError``. Missing required fields still raise
        ``TypeError``.
        """
        known = set(cls.__dataclass_fields__)
        return cls(**{k: v for k, v in d.items() if k in known})

    def summary(self) -> str:
        """One-line label: slug, priority, suggested agent, motivating-hypothesis count."""
        return (
            f"[{self.area}] P{self.priority} agent={self.suggested_agent} "
            f"(motivating: {len(self.motivating_hypothesis_ids)})"
        )


@dataclass
class ExtractedAsset:
    """A file extracted from the disk image and tracked in the asset library."""

    id: str                     # "asset-{uuid8}"
    inode: str                  # e.g. "334-128-4"
    original_path: str          # disk image path from ffind
    local_path: str             # "extracted/SYSTEM"
    category: str               # registry_hive, chat_log, prefetch, ...
    filename: str               # "SYSTEM"
    size_bytes: int
    extracted_by: str           # agent name
    extracted_at: str           # ISO timestamp

    def to_dict(self) -> dict:
        """Return every field as a plain dict (via ``dataclasses.asdict``)."""
        return asdict(self)

    @classmethod
    def from_dict(cls, d: dict) -> ExtractedAsset:
        """Build an ExtractedAsset from a dict, ignoring non-field keys.

        Unknown keys are dropped (as the node classes' ``from_dict`` already
        do) so a state file with extra fields still loads instead of
        raising ``TypeError``. Every field is required, so a missing one
        still raises ``TypeError``.
        """
        known = set(cls.__dataclass_fields__)
        return cls(**{k: v for k, v in d.items() if k in known})

    def summary(self) -> str:
        """One-line label: id, filename, category, size in KB, local path, inode."""
        size_kb = self.size_bytes / 1024
        return (
            f"[{self.id}] {self.filename} ({self.category}) "
            f"— {size_kb:.1f}KB @ {self.local_path} [inode:{self.inode}]"
        )


@dataclass
class ToolInvocation:
    """One recorded tool call — the provenance unit for grounded facts.

    Every wrapped tool executor records a ToolInvocation when it runs. The
    grounding gateway looks these up by id when validating that a fact in
    an ``add_phenomenon`` call traces back to a real tool output. Persisted
    with the graph so a re-loaded run can still verify provenance.
    """

    id: str                     # "inv-{uuid8}"
    tool: str                   # tool name as registered in TOOL_CATALOG
    args: dict                  # kwargs passed to the executor
    output: str                 # the raw output string the tool produced
    output_sha256: str          # hexdigest — tamper-evident hash of output
    agent: str                  # agent that issued the call
    task_id: str                # agent run scope (graph._current_task_id at call time)
    source_id: str              # active evidence source at call time
    created_at: str             # ISO timestamp
    cached: bool = False        # served from result cache without re-running

    def to_dict(self) -> dict:
        """Return every field as a plain dict (via ``dataclasses.asdict``)."""
        return asdict(self)

    @classmethod
    def from_dict(cls, d: dict) -> ToolInvocation:
        """Build a ToolInvocation from a dict, ignoring non-field keys.

        Unknown keys are dropped (as the node classes' ``from_dict`` already
        do) so a state file with extra fields still loads instead of
        raising ``TypeError``. Missing required fields still raise
        ``TypeError``.
        """
        known = set(cls.__dataclass_fields__)
        return cls(**{k: v for k, v in d.items() if k in known})

    def summary(self) -> str:
        """One-line label: id, tool with JSON-encoded args, time, agent, cache flag."""
        return (
            f"[{self.id}] {self.tool}({json.dumps(self.args, ensure_ascii=False)}) "
            f"@{self.created_at} agent={self.agent} cached={self.cached}"
        )


# ---------------------------------------------------------------------------
# Evidence Graph
# ---------------------------------------------------------------------------

class EvidenceGraph:
    """Graph-based evidence store for multi-agent forensic analysis.

    Agents interact with the graph via query tools (list_phenomena,
    get_phenomenon, search_graph, get_related) rather than reading
    a full dump in the system prompt.
    """

    def __init__(
        self,
        case_info: dict | None = None,
        persist_path: Path | None = None,
        edge_log_lr: dict[str, float] | None = None,
    ) -> None:
        self.case_info: dict = case_info or {}
        # log₁₀(LR) per edge type — calibration table for confidence updates.
        # Renamed from edge_weights (S3): the values are no longer deltas in
        # confidence space, they are log-likelihood ratios in odds space.
        self.edge_log_lr: dict[str, float] = (
            dict(edge_log_lr) if edge_log_lr else dict(_DEFAULT_LOG_LR)
        )
        self.image_path: str = ""
        self.partition_offset: int = 0
        self.extracted_dir: str = "extracted"

        # Multi-evidence: the case and the source tools/phenomena bind to.
        # image_path / partition_offset above mirror active_source for
        # backward-compatible readers; set_active_source keeps them in sync.
        self.case: Case | None = None
        self.active_source: EvidenceSource | None = None

        # Graph storage
        self.phenomena: dict[str, Phenomenon] = {}
        self.hypotheses: dict[str, Hypothesis] = {}
        self.entities: dict[str, Entity] = {}
        self.edges: list[Edge] = []

        # Adjacency index for fast traversal
        self._adj: dict[str, list[Edge]] = {}       # node_id → outgoing edges
        self._adj_rev: dict[str, list[Edge]] = {}    # node_id → incoming edges

        # Lead / status management (carried over from Blackboard)
        self.leads: list[Lead] = []
        self.agent_status: dict[str, str] = {}

        # Asset library — tracks all files extracted from the disk image
        self.asset_library: dict[str, ExtractedAsset] = {}
        self._inode_index: dict[str, str] = {}    # inode → asset_id

        # Investigation areas — derived from hypotheses (LLM) and/or seeded
        # from config.yaml:investigation_areas (manual override). Drives the
        # gap-analysis coverage check.
        self.investigation_areas: dict[str, InvestigationArea] = {}

        # Tool invocations — provenance log for grounded facts. Every wrapped
        # tool executor records one entry; add_phenomenon's grounding gateway
        # looks them up to validate cited invocation_ids and substring-match
        # claimed fact values against real tool outputs.
        self.tool_invocations: dict[str, ToolInvocation] = {}

        # _current_agent / _current_task_id are exposed as @property below,
        # backed by module-level ContextVars (race-free under asyncio.gather).

        self._lock = asyncio.Lock()
        self._persist_path: Path | None = persist_path

    # ---- Per-asyncio-task scoped state ---------------------------------------
    #
    # Reads/writes through these properties hit ContextVars rather than
    # instance attributes. Concurrent agent runs (Phase 3 parallel
    # dispatch) each have their own task-local context, so writes inside
    # one agent's run() are invisible to siblings — which means
    # ``record_tool_invocation`` always tags an invocation with the agent
    # and task scope that actually issued it.

    @property
    def _current_agent(self) -> str:
        """Agent name held in the ``_current_agent_ctx`` ContextVar.

        Returns "" when nothing has set it in the current context.
        """
        return _current_agent_ctx.get()

    @_current_agent.setter
    def _current_agent(self, value: str) -> None:
        # Falsy values (None, "") are stored as "" so the getter always
        # returns a str. The value lives in a module-level ContextVar, not
        # on this instance.
        _current_agent_ctx.set(value or "")

    @property
    def _current_task_id(self) -> str:
        """Task-scope id held in the ``_current_task_id_ctx`` ContextVar.

        Returns "" when nothing has set it in the current context.
        """
        return _current_task_id_ctx.get()

    @_current_task_id.setter
    def _current_task_id(self, value: str) -> None:
        # Falsy values (None, "") are stored as "" so the getter always
        # returns a str. The value lives in a module-level ContextVar, not
        # on this instance.
        _current_task_id_ctx.set(value or "")

    # ---- Persistence -------------------------------------------------------

    def _auto_save(self) -> None:
        """Persist full state to disk. Must be called inside _lock."""
        if self._persist_path is None:
            return
        try:
            state = {
                "case_info": self.case_info,
                "case": self.case.to_dict() if self.case else None,
                "active_source_id": (
                    self.active_source.id if self.active_source else ""
                ),
                "image_path": self.image_path,
                "partition_offset": self.partition_offset,
                "extracted_dir": self.extracted_dir,
                "phenomena": {pid: p.to_dict() for pid, p in self.phenomena.items()},
                "hypotheses": {hid: h.to_dict() for hid, h in self.hypotheses.items()},
                "entities": {eid: e.to_dict() for eid, e in self.entities.items()},
                "edges": [e.to_dict() for e in self.edges],
                "leads": [l.to_dict() for l in self.leads],
                "agent_status": dict(self.agent_status),
                "asset_library": {aid: a.to_dict() for aid, a in self.asset_library.items()},
                "investigation_areas": {
                    aid: a.to_dict() for aid, a in self.investigation_areas.items()
                },
                "tool_invocations": {
                    iid: inv.to_dict() for iid, inv in self.tool_invocations.items()
                },
                "saved_at": datetime.now().isoformat(),
            }
            tmp = self._persist_path.with_suffix(".tmp")
            tmp.write_text(json.dumps(state, ensure_ascii=False, indent=2))
            tmp.replace(self._persist_path)
        except Exception as e:
            logger.error("EvidenceGraph auto-save failed: %s", e)

    def save_state(self, path: Path) -> None:
        """Explicitly save state to the given path."""
        old = self._persist_path
        self._persist_path = path
        self._auto_save()
        self._persist_path = old

    @classmethod
    def load_state(
        cls,
        path: Path,
        edge_log_lr: dict[str, float] | None = None,
    ) -> EvidenceGraph:
        """Rebuild an EvidenceGraph from a JSON state file.

        The returned graph keeps *path* as its persist path. *edge_log_lr*
        is passed to the constructor unchanged. Once every collection has
        been restored, the adjacency indexes are rebuilt from the edge list.
        """
        saved = json.loads(path.read_text())
        restored = cls(
            case_info=saved.get("case_info", {}),
            persist_path=path,
            edge_log_lr=edge_log_lr,
        )
        restored.image_path = saved.get("image_path", "")
        restored.partition_offset = saved.get("partition_offset", 0)
        restored.extracted_dir = saved.get("extracted_dir", "extracted")

        # Evidence-source model. Files written before the Case model have
        # only image_path/partition_offset, which is wrapped as a
        # single-source case.
        serialized_case = saved.get("case")
        if serialized_case:
            restored.case = Case.from_dict(serialized_case)
        elif restored.image_path:
            restored.case = single_source_case(
                restored.image_path, restored.partition_offset
            )
        if restored.case and restored.case.sources:
            wanted = restored.case.get_source(saved.get("active_source_id", ""))
            # Use the first source if the saved active id no longer resolves.
            restored.set_active_source(wanted or restored.case.sources[0])

        # Node stores.
        phenomena: dict[str, Phenomenon] = {}
        for pid, raw in saved.get("phenomena", {}).items():
            phenomena[pid] = Phenomenon.from_dict(raw)
        restored.phenomena = phenomena

        hypotheses: dict[str, Hypothesis] = {}
        for hid, raw in saved.get("hypotheses", {}).items():
            hypotheses[hid] = Hypothesis.from_dict(raw)
        restored.hypotheses = hypotheses

        entities: dict[str, Entity] = {}
        for eid, raw in saved.get("entities", {}).items():
            entities[eid] = Entity.from_dict(raw)
        restored.entities = entities

        # Edges, leads and agent status.
        restored.edges = [Edge.from_dict(raw) for raw in saved.get("edges", [])]
        restored.leads = [Lead.from_dict(raw) for raw in saved.get("leads", [])]
        restored.agent_status = saved.get("agent_status", {})

        # Asset library; the inode → asset id index is filled alongside it.
        for asset_id, raw in saved.get("asset_library", {}).items():
            asset = ExtractedAsset.from_dict(raw)
            restored.asset_library[asset_id] = asset
            restored._inode_index[asset.inode] = asset_id

        areas: dict[str, InvestigationArea] = {}
        for area_id, raw in saved.get("investigation_areas", {}).items():
            areas[area_id] = InvestigationArea.from_dict(raw)
        restored.investigation_areas = areas

        invocations: dict[str, ToolInvocation] = {}
        for inv_id, raw in saved.get("tool_invocations", {}).items():
            invocations[inv_id] = ToolInvocation.from_dict(raw)
        restored.tool_invocations = invocations

        restored._rebuild_adjacency()
        logger.info(
            "EvidenceGraph restored: %d phenomena, %d hypotheses, %d entities, "
            "%d edges, %d assets",
            len(restored.phenomena), len(restored.hypotheses),
            len(restored.entities), len(restored.edges),
            len(restored.asset_library),
        )
        return restored

    def _rebuild_adjacency(self) -> None:
        """Rebuild adjacency index from edges list."""
        self._adj.clear()
        self._adj_rev.clear()
        for edge in self.edges:
            self._adj.setdefault(edge.source_id, []).append(edge)
            self._adj_rev.setdefault(edge.target_id, []).append(edge)

    # ---- Evidence source ----------------------------------------------------

    def set_active_source(self, source: EvidenceSource | None) -> None:
        """Bind tools and newly recorded phenomena to *source*.

        Syncs the legacy image_path / partition_offset fields so existing
        readers (orchestrator logs, report naming, agent prompts) keep
        working unchanged. The orchestrator calls this before dispatching an
        agent; single-source runs call it once at startup.
        """
        self.active_source = source
        if source is not None:
            self.image_path = source.path
            self.partition_offset = source.partition_offset

    # ---- Node helpers -------------------------------------------------------

    def _node_exists(self, node_id: str) -> bool:
        if node_id.startswith("ph-"):
            return node_id in self.phenomena
        if node_id.startswith("hyp-"):
            return node_id in self.hypotheses
        if node_id.startswith("ent-"):
            return node_id in self.entities
        return False

    def get_node(self, node_id: str) -> Phenomenon | Hypothesis | Entity | None:
        if node_id.startswith("ph-"):
            return self.phenomena.get(node_id)
        if node_id.startswith("hyp-"):
            return self.hypotheses.get(node_id)
        if node_id.startswith("ent-"):
            return self.entities.get(node_id)
        return None

    # ---- Similarity merging (Phenomenon only) --------------------------------

    def _find_similar_phenomenon(
        self, title: str, interpretation: str, category: str,
    ) -> Phenomenon | None:
        """Return the closest existing phenomenon, or None if none qualifies.

        A candidate must be in the same category, with title Jaccard
        similarity above 0.6 and Jaccard similarity above 0.4 on the first
        200 characters of the interpretations. Qualifying candidates are
        ranked by ``0.6 * title_sim + 0.4 * interpretation_sim``; on a tie
        the one encountered first wins.
        """
        winner: Phenomenon | None = None
        winner_score = 0.0
        for candidate in self.phenomena.values():
            if candidate.category == category:
                title_overlap = _jaccard_similarity(candidate.title, title)
                if title_overlap > 0.6:
                    text_overlap = _jaccard_similarity(
                        candidate.interpretation[:200], interpretation[:200],
                    )
                    if text_overlap > 0.4:
                        blended = title_overlap * 0.6 + text_overlap * 0.4
                        # Strictly greater: an equal score keeps the earlier match.
                        if blended > winner_score:
                            winner_score = blended
                            winner = candidate
        return winner

    # ---- Mutation methods (async, under lock) --------------------------------

    async def add_phenomenon(
        self,
        source_agent: str,
        category: str,
        title: str,
        interpretation: str = "",
        verified_facts: list[dict] | None = None,
        raw_data: dict | None = None,
        timestamp: str | None = None,
        source_tool: str = "",
        from_lead_id: str | None = None,
        task_id: str | None = None,
        # Legacy alias: pre-S2 callers supplied the analysis text as
        # ``description``. It is still accepted (and used only when
        # ``interpretation`` is empty) so old tests / in-flight tool-call
        # messages keep working. It is not part of the LLM-facing schema.
        description: str | None = None,
    ) -> tuple[str, bool]:
        """Store a phenomenon after every fact passes the grounding gateway.

        Every entry of ``verified_facts`` is checked with
        :meth:`validate_fact_grounding` against ``source_agent`` and the
        effective task scope (``task_id`` if given, otherwise the graph's
        current task id). If any fact fails, nothing is written and
        :class:`GroundingError` is raised listing all failures
        (DESIGN.md §4.4).

        When a similar phenomenon already exists (see
        ``_find_similar_phenomenon``) the call merges into it: confidence
        is raised by 0.15 (capped at 1.0), the agent is recorded as
        corroborating, missing ``raw_data`` keys and unseen facts are
        added, and ``from_lead_id`` is filled in if it was unset.

        Returns:
            ``(phenomenon_id, was_merged)``.

        Raises:
            GroundingError: one or more facts are not grounded.
        """
        if not interpretation and description:
            interpretation = description
        facts = list(verified_facts) if verified_facts else []
        scope = self._current_task_id if task_id is None else task_id

        # Gateway check runs before the lock is taken: it only reads the
        # invocation log.
        rejected: list[dict] = []
        for candidate_fact in facts:
            grounded, why = self.validate_fact_grounding(
                candidate_fact, agent=source_agent, task_id=scope or "",
            )
            if not grounded:
                rejected.append({"fact": candidate_fact, "reason": why})
        if rejected:
            detail = "\n".join(
                f"  - {item['reason']}: {json.dumps(item['fact'], ensure_ascii=False)}"
                for item in rejected
            )
            raise GroundingError(
                "Phenomenon rejected — one or more facts are not grounded:\n"
                + detail,
                failures=rejected,
            )

        def _fact_key(entry: dict) -> tuple:
            # Identity of a fact for de-duplication during a merge.
            return (
                entry.get("type"), entry.get("value"), entry.get("invocation_id"),
            )

        async with self._lock:
            match = self._find_similar_phenomenon(title, interpretation, category)

            if match is None:
                # No near-duplicate: create a brand-new node.
                new_id = f"ph-{uuid.uuid4().hex[:8]}"
                score = _compute_quality_score(
                    source_tool, timestamp, raw_data or {},
                    interpretation, facts, [],
                )
                self.phenomena[new_id] = Phenomenon(
                    id=new_id,
                    source_agent=source_agent,
                    category=category,
                    title=title,
                    interpretation=interpretation,
                    verified_facts=facts,
                    raw_data=raw_data or {},
                    timestamp=timestamp,
                    confidence=score,
                    source_tool=source_tool,
                    source_id=self.active_source.id if self.active_source else "",
                    from_lead_id=from_lead_id,
                    created_at=datetime.now().isoformat(),
                )
                self._auto_save()
                return new_id, False

            # Near-duplicate found: fold this report into the existing node.
            match.confidence = min(1.0, match.confidence + 0.15)
            if source_agent not in match.corroborating_agents:
                match.corroborating_agents.append(source_agent)
            if raw_data:
                # Existing keys win; only previously-absent keys are added.
                for key, val in raw_data.items():
                    match.raw_data.setdefault(key, val)
            if facts:
                known = {_fact_key(entry) for entry in match.verified_facts}
                for entry in facts:
                    signature = _fact_key(entry)
                    if signature in known:
                        continue
                    match.verified_facts.append(entry)
                    known.add(signature)
            if from_lead_id and match.from_lead_id is None:
                match.from_lead_id = from_lead_id
            self._auto_save()
            return match.id, True

    async def add_hypothesis(
        self,
        title: str,
        description: str,
        created_by: str = "",
        parent_id: str | None = None,
        prior_prob: float = 0.5,
    ) -> str:
        """Register a new hypothesis node and return its id.

        The hypothesis starts in status ``"active"``. Its initial
        ``log_odds`` is ``prob_to_log_odds(prior_prob)`` and its initial
        ``confidence`` is that value mapped back through
        ``log_odds_to_prob``. Supply a non-default ``prior_prob`` when
        base-rate knowledge is available (e.g. 0.1 for an unusual claim,
        0.9 for a strong default).
        """
        async with self._lock:
            new_id = f"hyp-{uuid.uuid4().hex[:8]}"
            starting_log_odds = prob_to_log_odds(prior_prob)
            self.hypotheses[new_id] = Hypothesis(
                id=new_id,
                title=title,
                description=description,
                prior_prob=prior_prob,
                log_odds=starting_log_odds,
                confidence=log_odds_to_prob(starting_log_odds),
                status="active",
                parent_id=parent_id,
                created_by=created_by,
                created_at=datetime.now().isoformat(),
            )
            self._auto_save()
            return new_id

    async def add_entity(
        self,
        name: str,
        entity_type: str,
        description: str = "",
    ) -> tuple[str, bool]:
        """Get-or-create an entity keyed on ``(name, entity_type)``.

        Returns ``(entity_id, was_existing)``. When a matching entity is
        already stored it is returned untouched (its description is not
        updated and nothing is saved).
        """
        async with self._lock:
            already = next(
                (
                    known for known in self.entities.values()
                    if known.name == name and known.entity_type == entity_type
                ),
                None,
            )
            if already is not None:
                return already.id, True

            new_id = f"ent-{uuid.uuid4().hex[:8]}"
            created = Entity(
                id=new_id,
                name=name,
                entity_type=entity_type,
                description=description,
                created_at=datetime.now().isoformat(),
            )
            self.entities[new_id] = created
            self._auto_save()
            return new_id, False

    async def add_edge(
        self,
        source_id: str,
        target_id: str,
        edge_type: str,
        metadata: dict | None = None,
        created_by: str = "",
    ) -> str:
        """Insert a directed edge and return its id.

        Raises:
            ValueError: the source node is unknown, the target node is
                unknown, or ``edge_type`` is not in ``VALID_EDGE_TYPES``
                (checked in that order).
        """
        async with self._lock:
            # Endpoints are validated before the edge type.
            for label, node_id in (("Source", source_id), ("Target", target_id)):
                if not self._node_exists(node_id):
                    raise ValueError(f"{label} node not found: {node_id}")
            if edge_type not in VALID_EDGE_TYPES:
                raise ValueError(f"Invalid edge type: {edge_type}")

            new_id = f"edge-{uuid.uuid4().hex[:8]}"
            link = Edge(
                id=new_id,
                source_id=source_id,
                target_id=target_id,
                edge_type=edge_type,
                metadata=metadata or {},
                created_by=created_by,
                created_at=datetime.now().isoformat(),
            )
            # Register in the flat list and in both adjacency indexes.
            self.edges.append(link)
            self._adj.setdefault(source_id, []).append(link)
            self._adj_rev.setdefault(target_id, []).append(link)
            self._auto_save()
            return new_id

    async def update_hypothesis_confidence(
        self,
        hyp_id: str,
        phenomenon_id: str,
        edge_type: str,
        reason: str = "",
    ) -> float:
        """Apply one phenomenon→hypothesis edge as an additive log_odds update.

        DESIGN.md §4.5: edge_type → log₁₀(LR) is looked up in
        ``self.edge_log_lr`` (LLM never emits the number). The update is
        ``L_post = L_prior + log_lr`` and ``confidence = sigmoid(L_post)``
        — commutative and order-independent, fixing the pre-S3 ordering
        bug. Status flips at ≥0.8 → supported / ≤0.2 → refuted.

        **Idempotency**: if a ``(phenomenon, hypothesis, edge_type)`` edge
        already exists, this is a no-op — the same agent re-recording the
        same link (or two agents linking via the orchestrator's batch
        judge and a manual override) does not double-count. Independent
        evidence — *different* phenomena pointing the same way — still
        accumulates fully.

        Args:
            hyp_id: id of the hypothesis to update.
            phenomenon_id: id of the node supplying the evidence.
            edge_type: key into ``self.edge_log_lr``.
            reason: free-text justification stored in the confidence log
                and in the new edge's metadata.

        Returns:
            The hypothesis confidence after the update (or the unchanged
            current confidence when the idempotency check short-circuits).

        Raises:
            ValueError: ``edge_type`` is not a key of ``self.edge_log_lr``,
                ``phenomenon_id`` is not an existing node, or ``hyp_id``
                is not a known hypothesis.
        """
        # Edge-type validation needs no graph state, so it runs before the
        # lock is taken.
        if edge_type not in self.edge_log_lr:
            raise ValueError(
                f"Invalid hypothesis edge type: {edge_type}. "
                f"Must be one of: {list(self.edge_log_lr.keys())}"
            )

        async with self._lock:
            # NOTE(review): _node_exists presumably accepts any node kind,
            # not only phenomena — confirm against its definition.
            if not self._node_exists(phenomenon_id):
                raise ValueError(f"Phenomenon not found: {phenomenon_id}")
            hyp = self.hypotheses.get(hyp_id)
            if hyp is None:
                raise ValueError(f"Hypothesis not found: {hyp_id}")

            # Idempotency check — same (ph, hyp, edge_type) already on graph.
            # This early return also skips the save and the same_as sync below.
            for existing in self._adj.get(phenomenon_id, []):
                if (
                    existing.target_id == hyp_id
                    and existing.edge_type == edge_type
                ):
                    return hyp.confidence

            log_lr = self.edge_log_lr[edge_type]
            old_log_odds = hyp.log_odds
            old_conf = hyp.confidence
            new_log_odds = old_log_odds + log_lr
            new_conf = log_odds_to_prob(new_log_odds)

            hyp.log_odds = new_log_odds
            hyp.confidence = new_conf

            # Status is recomputed from scratch on every update, so a
            # hypothesis can move back to "active" from either extreme.
            if new_conf >= 0.8:
                hyp.status = "supported"
            elif new_conf <= 0.2:
                hyp.status = "refuted"
            else:
                hyp.status = "active"

            # Audit trail entry; values are rounded to 4 decimals for the
            # log only — the hypothesis itself keeps full precision.
            hyp.confidence_log.append({
                "timestamp": datetime.now().isoformat(),
                "phenomenon_id": phenomenon_id,
                "edge_type": edge_type,
                "log_lr": log_lr,
                "old_log_odds": round(old_log_odds, 4),
                "new_log_odds": round(new_log_odds, 4),
                "old_confidence": round(old_conf, 4),
                "new_confidence": round(new_conf, 4),
                "reason": reason,
            })

            # Also create the edge in the graph. Its presence in self._adj
            # is what the idempotency check above looks for on later calls.
            eid = f"edge-{uuid.uuid4().hex[:8]}"
            edge = Edge(
                id=eid,
                source_id=phenomenon_id,
                target_id=hyp_id,
                edge_type=edge_type,
                metadata={"reason": reason, "log_lr": log_lr},
                created_by="hypothesis_engine",
                created_at=datetime.now().isoformat(),
            )
            self.edges.append(edge)
            self._adj.setdefault(phenomenon_id, []).append(edge)
            self._adj_rev.setdefault(hyp_id, []).append(edge)

            self._auto_save()

        # If this is a coref hypothesis, mirror the new confidence into the
        # entity-level same_as edge. Done OUTSIDE the lock — _sync_same_as_edge
        # re-acquires it internally — so we avoid reentrant locking.
        if hyp.coref_entity_pair and len(hyp.coref_entity_pair) == 2:
            await self._sync_same_as_edge(
                hyp.coref_entity_pair[0],
                hyp.coref_entity_pair[1],
                hyp_id,
            )
        return new_conf

    # ---- Cross-source entity coreference (DESIGN.md §4.6) -------------------

    @staticmethod
    def _coref_hypothesis_id(eid_a: str, eid_b: str) -> str:
        """Deterministic id for the coref hypothesis between an entity pair.

        Same pair (regardless of arg order) always maps to the same id so
        repeated observations augment the existing hypothesis rather than
        spawning duplicates.
        """
        pair = "|".join(sorted([eid_a, eid_b]))
        return f"hyp-coref-{hashlib.sha256(pair.encode()).hexdigest()[:10]}"

    async def get_or_create_coref_hypothesis(
        self, eid_a: str, eid_b: str,
    ) -> tuple[str, bool]:
        """Look up (or insert) the coreference hypothesis for an entity pair.

        Uses a low prior (``prior_prob=0.1``) — saying any two entities are
        the same actor is a strong claim, so the default should be
        skeptical and let evidence move the needle.
        """
        hid = self._coref_hypothesis_id(eid_a, eid_b)
        async with self._lock:
            if hid in self.hypotheses:
                return hid, False
            ea = self.entities.get(eid_a)
            eb = self.entities.get(eid_b)
            if ea is None or eb is None:
                raise ValueError(f"Unknown entity in coref pair: {eid_a}, {eid_b}")
            l_prior = prob_to_log_odds(0.1)
            self.hypotheses[hid] = Hypothesis(
                id=hid,
                title=f"Coreference: {ea.name} ≡ {eb.name}",
                description=(
                    f"Hypothesis that {ea.id} ({ea.name}, {ea.entity_type}) "
                    f"and {eb.id} ({eb.name}, {eb.entity_type}) refer to "
                    f"the same actor across evidence sources."
                ),
                prior_prob=0.1,
                log_odds=l_prior,
                confidence=log_odds_to_prob(l_prior),
                status="active",
                created_by="coref_engine",
                created_at=datetime.now().isoformat(),
                coref_entity_pair=sorted([eid_a, eid_b]),
            )
            self._auto_save()
        return hid, True

    async def _sync_same_as_edge(
        self, eid_a: str, eid_b: str, hyp_id: str,
    ) -> None:
        """Mirror coref hypothesis confidence into a ``same_as`` entity edge.

        - Confidence ≥ 0.8 → ensure an active ``same_as`` edge exists.
        - Confidence  < 0.8 → mark any existing edge inactive (audit, not delete).
        Idempotent on both transitions.
        """
        hyp = self.hypotheses.get(hyp_id)
        if hyp is None:
            return
        active = hyp.confidence >= 0.8
        async with self._lock:
            existing = None
            for edge in self.edges:
                if (edge.edge_type == "same_as"
                    and {edge.source_id, edge.target_id} == {eid_a, eid_b}):
                    existing = edge
                    break
            if active:
                if existing is None:
                    eid = f"edge-{uuid.uuid4().hex[:8]}"
                    edge = Edge(
                        id=eid,
                        source_id=eid_a,
                        target_id=eid_b,
                        edge_type="same_as",
                        metadata={
                            "backed_by": hyp_id,
                            "active": True,
                            "confidence_at_creation": hyp.confidence,
                        },
                        created_by="coref_engine",
                        created_at=datetime.now().isoformat(),
                    )
                    self.edges.append(edge)
                    self._adj.setdefault(eid_a, []).append(edge)
                    self._adj_rev.setdefault(eid_b, []).append(edge)
                elif not existing.metadata.get("active"):
                    existing.metadata["active"] = True
                    existing.metadata["reactivated_at"] = datetime.now().isoformat()
            else:
                if existing is not None and existing.metadata.get("active"):
                    existing.metadata["active"] = False
                    existing.metadata["deactivated_at"] = datetime.now().isoformat()
            self._auto_save()

    async def observe_identity(
        self,
        entity_name: str,
        entity_type: str,
        identifier_type: str,
        value: str,
        source_agent: str,
        invocation_id: str,
        source_tool: str = "",
        task_id: str | None = None,
    ) -> dict:
        """Record a typed identifier for an entity through the grounding gateway.

        DESIGN.md §4.6. Steps:

        1. Record an ``identity_observation`` phenomenon carrying the
           identifier as its sole verified fact. ``add_phenomenon``
           validates ``invocation_id`` + ``value`` through the grounding
           gateway and raises :class:`GroundingError` on failure — this
           happens BEFORE anything else is written, so a rejected
           observation leaves no stray entity behind.
        2. Get-or-create the entity.
        3. Attach the identifier to the entity (idempotent by
           ``(type, normalized_value)``).
        4. If the attachment is new, scan other entities for shared
           identifiers (strong / weak) and any conflicting strong
           identifiers, then propose / strengthen / weaken the coref
           hypothesis between each candidate pair. ``same_as`` edges are
           kept in sync with the hypothesis confidence.

        Args:
            entity_name: name of the entity the identifier belongs to.
            entity_type: type of that entity (part of its dedupe key).
            identifier_type: must be in ``STRONG_IDENTIFIER_TYPES`` or
                ``WEAK_IDENTIFIER_TYPES``.
            value: the raw identifier value; must be non-empty.
            source_agent: agent making the observation.
            invocation_id: tool invocation the value is cited from.
            source_tool: tool name stored on the observation phenomenon.
            task_id: task scope for grounding; defaults to the graph's
                current task id.

        Returns:
            A dict with ``entity_id``, ``phenomenon_id``,
            ``new_identifier`` and ``coref_proposals``.

        Raises:
            ValueError: unknown ``identifier_type`` or empty ``value``.
            GroundingError: the identifier fact is not grounded (raised by
                ``add_phenomenon``).
        """
        if identifier_type not in (STRONG_IDENTIFIER_TYPES | WEAK_IDENTIFIER_TYPES):
            raise ValueError(
                f"Unknown identifier_type: {identifier_type}. "
                f"Strong: {sorted(STRONG_IDENTIFIER_TYPES)}; "
                f"Weak: {sorted(WEAK_IDENTIFIER_TYPES)}."
            )
        if not value:
            raise ValueError("identifier value must be non-empty")

        # add_phenomenon enforces the grounding contract for the fact below.
        active_task = task_id if task_id is not None else self._current_task_id
        fact = {"type": identifier_type, "value": value, "invocation_id": invocation_id}

        norm = _normalize_identifier(identifier_type, value)
        title = f"{identifier_type}={value} on {entity_name}"

        # Record the observation first: the phenomenon does not need the
        # entity id, and running the gateway before add_entity guarantees
        # that an ungrounded call mutates nothing.
        pid, _merged = await self.add_phenomenon(
            source_agent=source_agent,
            category="identity_observation",
            title=title,
            interpretation=(
                f"Agent attributed identifier {identifier_type}={value} "
                f"(normalized={norm}) to entity {entity_name} ({entity_type})."
            ),
            verified_facts=[fact],
            source_tool=source_tool,
            task_id=active_task,
        )

        # Grounding passed — now it is safe to get-or-create the entity.
        eid, _existed = await self.add_entity(entity_name, entity_type)

        # Attach identifier to entity (idempotent on type + normalized value).
        new_identifier = False
        async with self._lock:
            ent = self.entities[eid]
            if not ent.has_identifier(identifier_type, norm):
                ent.identifiers.append({
                    "type": identifier_type,
                    "value": value,
                    "normalized": norm,
                    "strong": is_strong_identifier(identifier_type),
                    "invocation_id": invocation_id,
                    "phenomenon_id": pid,
                    "observed_at": datetime.now().isoformat(),
                })
                new_identifier = True
            self._auto_save()

        # Only a genuinely new identifier can change coreference evidence.
        coref_proposals: list[dict] = []
        if new_identifier:
            coref_proposals = await self._propose_coref_for_new_identifier(
                new_eid=eid,
                new_type=identifier_type,
                new_norm=norm,
                new_phenomenon_id=pid,
            )

        return {
            "entity_id": eid,
            "phenomenon_id": pid,
            "new_identifier": new_identifier,
            "coref_proposals": coref_proposals,
        }

    async def _propose_coref_for_new_identifier(
        self,
        new_eid: str,
        new_type: str,
        new_norm: str,
        new_phenomenon_id: str,
    ) -> list[dict]:
        """Blocking + propose: find candidate entities that share this
        identifier with ``new_eid``, register / strengthen the coref
        hypothesis for each pair, and emit conflicting-identifier edges
        where the two entities have *different* values for the same
        strong identifier type. O(|entities| × identifiers) — blocking
        is implicit in the fact that the new identifier is fixed.
        """
        new_ent = self.entities.get(new_eid)
        if new_ent is None:
            return []

        is_strong_new = is_strong_identifier(new_type)
        match_edge = "shared_strong_identifier" if is_strong_new else "shared_weak_identifier"

        proposals: list[dict] = []

        for other_eid, other_ent in list(self.entities.items()):
            if other_eid == new_eid:
                continue

            # Match: other entity carries the same (type, normalized).
            if not other_ent.has_identifier(new_type, new_norm):
                continue

            # Collect conflicting strong identifiers between the pair —
            # they'll fire negative-LR edges on the same coref hypothesis.
            conflicts: list[dict] = []
            for a_ident in new_ent.identifiers:
                if not a_ident.get("strong"):
                    continue
                for b_ident in other_ent.identifiers:
                    if (b_ident.get("type") == a_ident.get("type")
                        and b_ident.get("strong")
                        and b_ident.get("normalized") != a_ident.get("normalized")):
                        conflicts.append({
                            "type": a_ident.get("type"),
                            "new_value": a_ident.get("value"),
                            "other_value": b_ident.get("value"),
                            "new_phenomenon_id": a_ident.get("phenomenon_id"),
                        })

            hid, _created = await self.get_or_create_coref_hypothesis(
                new_eid, other_eid,
            )

            # +shared identifier edge (one per identifier, anchored to the
            # newly recorded observation phenomenon). update_hypothesis_
            # confidence is idempotent on (ph, hyp, edge_type), so re-running
            # the same observation does not double-count.
            await self.update_hypothesis_confidence(
                hid, new_phenomenon_id, match_edge,
                reason=f"shared {new_type}={new_norm}",
            )

            # −conflicting strong identifier edges — one per conflict, anchored
            # to the *new* entity's observation phenomenon for that identifier.
            for c in conflicts:
                ph_src = c["new_phenomenon_id"]
                if not ph_src:
                    continue
                await self.update_hypothesis_confidence(
                    hid, ph_src, "conflicting_strong_identifier",
                    reason=(
                        f"conflict {c['type']}: "
                        f"{c['new_value']} vs {c['other_value']}"
                    ),
                )

            await self._sync_same_as_edge(new_eid, other_eid, hid)

            proposals.append({
                "hypothesis_id": hid,
                "other_entity_id": other_eid,
                "match": {"type": new_type, "normalized": new_norm,
                          "edge_type": match_edge},
                "conflicts": conflicts,
                "confidence": self.hypotheses[hid].confidence,
            })

        return proposals

    # ---- Cross-source entity cluster queries (DESIGN.md §4.6) ----------------

    def _active_same_as_neighbors(self, entity_id: str) -> set[str]:
        """Neighbours of *entity_id* via ``same_as`` edges that are still active.

        ``same_as`` edges are non-destructive: a coref hypothesis that drops
        below threshold marks ``metadata['active']=False`` rather than
        deleting, so the audit trail survives. Cluster queries respect that.
        """
        out: set[str] = set()
        for edge in self.edges:
            if edge.edge_type != "same_as":
                continue
            if not edge.metadata.get("active", True):
                continue
            if edge.source_id == entity_id:
                out.add(edge.target_id)
            elif edge.target_id == entity_id:
                out.add(edge.source_id)
        return out

    def resolve_actor_cluster(self, entity_id: str) -> set[str]:
        """Return every entity id reachable from *entity_id* over active ``same_as`` edges.

        The result is the connected component containing *entity_id*
        (always including the entity itself), i.e. the entities current
        coref evidence treats as one actor. An unknown *entity_id* yields
        an empty set. Because only active edges are followed, a
        deactivated ``same_as`` edge splits the component, so the answer
        tracks the graph's present state.
        """
        if entity_id not in self.entities:
            return set()

        component: set[str] = {entity_id}
        pending: list[str] = [entity_id]
        while pending:
            # Expand one node; keep only neighbours not reached before.
            fresh = self._active_same_as_neighbors(pending.pop()) - component
            component |= fresh
            pending.extend(fresh)
        return component

    def actor_clusters(self) -> list[dict]:
        """Group all entities into actor clusters via active ``same_as``.

        Returns a list of ``{members: [...], identifiers: [...], coref_hypotheses: [...]}``
        for the report agent and the orchestrator's cross-source views.

        The output is deterministic: clusters appear in the order in which
        their first member was inserted into ``self.entities``, and
        identifiers are aggregated while walking the members in sorted
        order. Picking start nodes and members by iterating a ``set`` of
        string ids instead would make the cluster order — and which
        member's ``value`` / ``on_entity`` survives identifier
        de-duplication — vary between runs.

        Each cluster dict contains:
            members: sorted entity ids in the cluster.
            identifiers: one entry per distinct ``(type, normalized)``
                identifier, taken from the first member (in sorted order)
                that carries it.
            coref_hypotheses: sorted, non-empty ``backed_by`` ids of the
                active ``same_as`` edges touching the cluster.
        """
        clusters: list[dict] = []
        assigned: set[str] = set()
        for start in self.entities:
            if start in assigned:
                continue
            members = self.resolve_actor_cluster(start)
            assigned |= members
            ordered_members = sorted(members)

            # Aggregate identifiers across the cluster (deduped on type+normalized).
            ident_seen: set[tuple[str, str]] = set()
            idents: list[dict] = []
            for eid in ordered_members:
                for ident in self.entities[eid].identifiers:
                    key = (ident.get("type"), ident.get("normalized"))
                    if key in ident_seen:
                        continue
                    ident_seen.add(key)
                    idents.append({
                        "type": ident.get("type"),
                        "value": ident.get("value"),
                        "strong": ident.get("strong"),
                        "on_entity": eid,
                    })

            # Hypotheses backing the live same_as edges that touch this
            # cluster; edges without a ``backed_by`` entry are ignored.
            coref_hyps = sorted({
                e.metadata.get("backed_by", "")
                for e in self.edges
                if e.edge_type == "same_as"
                and e.metadata.get("active", True)
                and (e.source_id in members or e.target_id in members)
            } - {""})
            clusters.append({
                "members": ordered_members,
                "identifiers": idents,
                "coref_hypotheses": coref_hyps,
            })
        return clusters

    # ---- Lead management (same as old Blackboard) ----------------------------

    async def add_lead(
        self,
        target_agent: str,
        description: str,
        priority: int = 5,
        context: dict | None = None,
        hypothesis_id: str | None = None,
    ) -> str:
        """Queue a new lead for ``target_agent`` and return its id.

        A missing ``context`` is stored as an empty dict. The lead is
        appended to ``self.leads`` and the graph is saved.
        """
        async with self._lock:
            lead_id = f"lead-{uuid.uuid4().hex[:8]}"
            new_lead = Lead(
                id=lead_id,
                target_agent=target_agent,
                description=description,
                priority=priority,
                context=context or {},
                hypothesis_id=hypothesis_id,
            )
            self.leads.append(new_lead)
            self._auto_save()
            return lead_id

    async def get_pending_leads(self, agent_type: str | None = None) -> list[Lead]:
        async with self._lock:
            leads = [l for l in self.leads if l.status == "pending"]
            if agent_type:
                leads = [l for l in leads if l.target_agent == agent_type]
            return sorted(leads, key=lambda l: l.priority)

    async def mark_lead_completed(self, lead_id: str) -> None:
        async with self._lock:
            for lead in self.leads:
                if lead.id == lead_id:
                    lead.status = "completed"
                    break
            self._auto_save()

    async def mark_lead_failed(self, lead_id: str, error: str = "") -> None:
        async with self._lock:
            for lead in self.leads:
                if lead.id == lead_id:
                    lead.status = "failed"
                    lead.context["failure_reason"] = error
                    break
            self._auto_save()

    # ---- Investigation areas -------------------------------------------------

    async def add_investigation_area(
        self,
        area: str,
        description: str,
        suggested_agent: str,
        expected_keywords: list[str] | None = None,
        expected_tools: list[str] | None = None,
        priority: int = 5,
        motivating_hypothesis_ids: list[str] | None = None,
        created_by: str = "",
    ) -> tuple[str, bool]:
        """Create an investigation area, or merge into the one with the same slug.

        Areas are de-duplicated on ``area``. When the slug already exists
        only the three list fields (keywords, tools, motivating hypothesis
        ids) are extended with values not yet present; ``description``,
        ``suggested_agent`` and ``priority`` stay as the first writer set
        them (a manual seed wins over an LLM-derived duplicate).

        Returns:
            ``(area_id, was_existing)``.
        """
        async with self._lock:
            current = next(
                (known for known in self.investigation_areas.values()
                 if known.area == area),
                None,
            )
            if current is not None:
                # Order-preserving union of each list field.
                merges = (
                    (expected_keywords, current.expected_keywords),
                    (expected_tools, current.expected_tools),
                    (motivating_hypothesis_ids, current.motivating_hypothesis_ids),
                )
                for incoming, stored in merges:
                    for item in incoming or []:
                        if item not in stored:
                            stored.append(item)
                self._auto_save()
                return current.id, True

            area_id = f"area-{area}"
            # Lists are copied so later merges never mutate the caller's lists.
            self.investigation_areas[area_id] = InvestigationArea(
                id=area_id,
                area=area,
                description=description,
                suggested_agent=suggested_agent,
                expected_keywords=list(expected_keywords or []),
                expected_tools=list(expected_tools or []),
                priority=priority,
                motivating_hypothesis_ids=list(motivating_hypothesis_ids or []),
                created_by=created_by,
                created_at=datetime.now().isoformat(),
            )
            self._auto_save()
            return area_id, False

    # ---- Tool invocation log -------------------------------------------------

    async def record_tool_invocation(
        self,
        tool: str,
        args: dict,
        output: str,
        cached: bool = False,
    ) -> str:
        """Log a single tool call and return its invocation id.

        The executing agent, task id and active source are taken from the
        graph's current run context rather than passed in, which lets
        tool executors remain stateless. The agent falls back to
        ``"unknown"`` and the task / source ids to ``""`` when unset. A
        SHA-256 of the UTF-8 encoded output is stored next to the output.
        """
        invocation_id = f"inv-{uuid.uuid4().hex[:8]}"
        source = self.active_source
        source_ref = source.id if source else ""
        output_digest = hashlib.sha256(
            output.encode("utf-8", errors="replace")
        ).hexdigest()
        record = ToolInvocation(
            id=invocation_id,
            tool=tool,
            args=dict(args),  # shallow copy: later caller edits must not leak in
            output=output,
            output_sha256=output_digest,
            agent=self._current_agent or "unknown",
            task_id=self._current_task_id or "",
            source_id=source_ref,
            created_at=datetime.now().isoformat(),
            cached=cached,
        )
        async with self._lock:
            # Deliberately no auto-save: rewriting the state file on every
            # tool call would be too noisy. The next phenomenon write
            # persists the accumulated log.
            self.tool_invocations[invocation_id] = record
        return invocation_id

    def validate_fact_grounding(
        self,
        fact: dict,
        agent: str,
        task_id: str,
    ) -> tuple[bool, str]:
        """Decide whether one ``verified_fact`` is grounded in a tool result.

        Checks, in order (DESIGN.md §4.4, refined after the first
        end-to-end run):

          1. ``fact["invocation_id"]`` is present and found in
             ``self.tool_invocations``.
          2. That invocation was made by ``agent``; and, when both the
             given ``task_id`` and the invocation's task id are non-empty,
             they are equal.
          3. ``fact["value"]`` is a non-empty string that occurs in the
             invocation output — as a literal substring, or failing that,
             after both sides pass through
             :func:`_normalize_for_grounding` (case-folded,
             whitespace-collapsed, path-sep unified).

        The normalised fallback tolerates the LLM's usual presentation
        changes without opening the door to fabrication: a value absent
        in every form still fails.

        Returns:
            ``(ok, reason)`` — ``reason`` is ``"ok"`` for a literal match,
            ``"ok-normalized"`` for a fallback match, otherwise a message
            explaining the rejection.
        """
        invocation_ref = fact.get("invocation_id", "")
        claimed = fact.get("value", "")

        if not invocation_ref:
            return False, "missing invocation_id"

        record = self.tool_invocations.get(invocation_ref)
        if record is None:
            return False, f"invocation_id {invocation_ref} not found in invocation log"

        # Ownership: an agent may only cite its own tool calls.
        if record.agent != agent:
            return False, (
                f"invocation {invocation_ref} was made by agent '{record.agent}', "
                f"not '{agent}' — cannot be cited by a different agent"
            )

        # Scope is enforced only when both sides actually carry a task id.
        if task_id and record.task_id and record.task_id != task_id:
            return False, (
                f"invocation {invocation_ref} was made in a different task scope "
                f"({record.task_id}) — cite only invocations from your current task"
            )

        if not (isinstance(claimed, str) and claimed):
            return False, "fact.value must be a non-empty string"

        if claimed in record.output:
            return True, "ok"

        # Loose fallback: compare normalised forms of value and output.
        normalized_output = _normalize_for_grounding(record.output)
        if _normalize_for_grounding(claimed) in normalized_output:
            return True, "ok-normalized"

        return False, (
            f"fact.value not found in invocation {invocation_ref} output — even after "
            f"case/whitespace/path-sep normalisation. Copy a literal substring "
            f"from that tool's result; if the content is a guess (device model, "
            f"constructed path, label-joined value), move it into `interpretation` "
            f"instead of `verified_facts`."
        )

    # ---- Asset library -------------------------------------------------------

    async def register_asset(
        self,
        inode: str,
        original_path: str,
        local_path: str,
        category: str,
        filename: str,
        size_bytes: int,
        extracted_by: str,
    ) -> tuple[str, bool]:
        """Add an extracted file to the asset library, keyed by inode.

        Runs entirely under ``self._lock``. If ``inode`` is already in the
        inode index, nothing is written and the existing asset id comes
        back with ``True``. Otherwise a new ``asset-<8 hex>`` id is
        generated, the asset is stored and indexed, ``_auto_save()`` is
        called, and the new id comes back with ``False``.

        Returns:
            ``(asset_id, already_existed)``.
        """
        async with self._lock:
            inode_index = self._inode_index
            if inode in inode_index:
                # Same inode seen before: reuse the earlier registration.
                return inode_index[inode], True

            new_id = f"asset-{uuid.uuid4().hex[:8]}"
            record = ExtractedAsset(
                id=new_id,
                inode=inode,
                original_path=original_path,
                local_path=local_path,
                category=category,
                filename=filename,
                size_bytes=size_bytes,
                extracted_by=extracted_by,
                extracted_at=datetime.now().isoformat(),
            )
            # Store first, then index, then persist.
            self.asset_library[new_id] = record
            inode_index[inode] = new_id
            self._auto_save()
            return new_id, False

    def lookup_asset_by_inode(self, inode: str) -> ExtractedAsset | None:
        """Resolve an inode to its extracted asset, or ``None`` if unknown.

        Plain synchronous read: the inode index is consulted first, then
        the asset library. The graph lock is not taken.
        """
        asset_id = self._inode_index.get(inode)
        if not asset_id:
            return None
        return self.asset_library.get(asset_id)

    def list_assets(self, category: str | None = None) -> list[str]:
        """Summarise every asset in the library, one ``summary()`` per asset.

        When ``category`` is truthy only assets whose ``category`` equals it
        are included; otherwise all assets are listed. Library iteration
        order is kept.
        """
        return [
            asset.summary()
            for asset in self.asset_library.values()
            if not category or asset.category == category
        ]

    def query_assets(
        self,
        category: str | None = None,
        filename_pattern: str | None = None,
    ) -> list[ExtractedAsset]:
        """Return the asset objects that pass every supplied filter.

        ``category`` (when truthy) must equal the asset's category exactly.
        ``filename_pattern`` (when truthy) must occur in the asset filename,
        compared case-insensitively as a plain substring. A filter that is
        ``None`` or empty is ignored.
        """

        def _passes(asset) -> bool:
            if category and asset.category != category:
                return False
            if filename_pattern and filename_pattern.lower() not in asset.filename.lower():
                return False
            return True

        return [asset for asset in self.asset_library.values() if _passes(asset)]

    # ---- Query methods (for agent tools) ------------------------------------

    def list_phenomena(self, category: str | None = None) -> list[str]:
        """Summarise the stored phenomena, one ``summary()`` string each.

        A truthy ``category`` restricts the listing to phenomena whose
        ``category`` equals it; ``None`` or an empty string lists them all.
        """
        candidates = self.phenomena.values()
        if category:
            candidates = (ph for ph in candidates if ph.category == category)
        return [ph.summary() for ph in candidates]

    def get_phenomenon(self, ph_id: str) -> dict | None:
        """Fetch one phenomenon by id as its ``to_dict()`` form.

        Returns ``None`` when ``ph_id`` is not in ``self.phenomena``.
        """
        found = self.phenomena.get(ph_id)
        if not found:
            return None
        return found.to_dict()

    def search_graph(self, keyword: str) -> list[str]:
        """Case-insensitive substring search over phenomena, hypotheses, entities.

        A phenomenon matches when the lower-cased keyword occurs in the
        space-joined text of its title, its interpretation and the
        ``value`` of each verified fact. A hypothesis matches on title or
        description, an entity on name or description. Results are
        ``summary()`` strings, phenomena first, then hypotheses, then
        entities.
        """
        needle = keyword.lower()
        hits: list[str] = []

        for ph in self.phenomena.values():
            fact_text = " ".join(
                str(fact.get("value", "")).lower() for fact in ph.verified_facts
            )
            # Title, interpretation and facts are searched as one string.
            searchable = " ".join(
                (ph.title.lower(), ph.interpretation.lower(), fact_text)
            )
            if needle in searchable:
                hits.append(ph.summary())

        hits.extend(
            hyp.summary()
            for hyp in self.hypotheses.values()
            if any(needle in text.lower() for text in (hyp.title, hyp.description))
        )
        hits.extend(
            ent.summary()
            for ent in self.entities.values()
            if any(needle in text.lower() for text in (ent.name, ent.description))
        )
        return hits

    def get_related(
        self,
        node_id: str,
        edge_type: str | None = None,
        direction: str = "both",
    ) -> list[dict]:
        """List the neighbours of ``node_id`` together with the linking edge.

        ``direction`` selects which adjacency is walked: ``"out"`` follows
        edges leaving the node, ``"in"`` follows edges arriving at it, and
        ``"both"`` does outgoing first, then incoming. Any other value
        yields an empty list. A truthy ``edge_type`` keeps only edges of
        that type. Edges whose far endpoint ``get_node`` cannot resolve are
        dropped.

        Each result is a dict with keys ``node`` (the neighbour's
        ``summary()``), ``edge_type``, ``direction`` (``"outgoing"`` or
        ``"incoming"``) and ``metadata`` (the edge's metadata).
        """
        # (adjacency map, attribute naming the far endpoint, direction label)
        passes = []
        if direction in ("out", "both"):
            passes.append((self._adj, "target_id", "outgoing"))
        if direction in ("in", "both"):
            passes.append((self._adj_rev, "source_id", "incoming"))

        neighbours = []
        for adjacency, far_end, label in passes:
            for edge in adjacency.get(node_id, []):
                if edge_type and edge.edge_type != edge_type:
                    continue
                other = self.get_node(getattr(edge, far_end))
                if not other:
                    continue
                neighbours.append({
                    "node": other.summary(),
                    "edge_type": edge.edge_type,
                    "direction": label,
                    "metadata": edge.metadata,
                })
        return neighbours

    def get_hypothesis_status(self) -> list[str]:
        """Collect the ``summary()`` string of every hypothesis, in stored order."""
        summaries = []
        for hyp in self.hypotheses.values():
            summaries.append(hyp.summary())
        return summaries

    def get_phenomena_by_category(self, category: str) -> list[Phenomenon]:
        """Return the phenomenon objects whose ``category`` equals ``category``."""
        in_category = filter(
            lambda ph: ph.category == category,
            self.phenomena.values(),
        )
        return list(in_category)

    def hypotheses_converged(self) -> bool:
        """Report whether every hypothesis has left the ``"active"`` status.

        Also ``True`` when the graph holds no hypotheses at all.
        """
        still_active = any(
            hyp.status == "active" for hyp in self.hypotheses.values()
        )
        return not still_active

    def mark_remaining_inconclusive(self) -> None:
        """Flip every hypothesis still in status ``"active"`` to ``"inconclusive"``.

        Hypotheses in any other status are left untouched. Mutates the
        hypothesis objects in place and returns nothing.
        """
        for hyp in self.hypotheses.values():
            if hyp.status != "active":
                continue
            hyp.status = "inconclusive"

    # ---- Hypothesis × Evidence matrix (DESIGN.md §4.5) -----------------------

    def hypothesis_evidence_matrix(self) -> dict:
        """Pivot the Phenomenon→Hypothesis edges into a report-friendly dict.

        Only edges that run from a ``ph-…`` id to a ``hyp-…`` id and whose
        edge_type is a key of ``self.edge_log_lr`` are counted.

        Returns a dict with four keys:

        * ``"hypotheses"`` — one ``{id, title, confidence, log_odds,
          status}`` dict per stored hypothesis.
        * ``"phenomena"`` — ``{id, title, category}`` for each stored
          phenomenon that appears in at least one counted edge.
        * ``"cells"`` — maps ``"<hyp_id>|<ph_id>"`` to the *list* of
          edge_type strings linking that pair (a pair can be linked more
          than once).
        * ``"counts_by_edge_type"`` — ``{hyp_id: {edge_type: count}}``;
          every stored hypothesis has an entry, possibly empty.
        """
        scored_types = self.edge_log_lr
        links: dict[tuple[str, str], list[str]] = {}
        tallies: dict[str, dict[str, int]] = {hyp_id: {} for hyp_id in self.hypotheses}

        for edge in self.edges:
            # Skip anything that is not a scored phenomenon→hypothesis link.
            if not edge.source_id.startswith("ph-"):
                continue
            if not edge.target_id.startswith("hyp-"):
                continue
            if edge.edge_type not in scored_types:
                continue
            hyp_id, ph_id, kind = edge.target_id, edge.source_id, edge.edge_type
            links.setdefault((hyp_id, ph_id), []).append(kind)
            per_type = tallies.setdefault(hyp_id, {})
            per_type[kind] = per_type.get(kind, 0) + 1

        hypothesis_rows = []
        for hyp in self.hypotheses.values():
            hypothesis_rows.append({
                "id": hyp.id,
                "title": hyp.title,
                "confidence": hyp.confidence,
                "log_odds": hyp.log_odds,
                "status": hyp.status,
            })

        linked_ph_ids = {ph_id for (_, ph_id) in links}
        phenomenon_rows = [
            {"id": ph.id, "title": ph.title, "category": ph.category}
            for ph in self.phenomena.values()
            if ph.id in linked_ph_ids
        ]

        # Tuple keys are flattened to "hyp|ph" strings in the returned dict.
        flat_cells = {
            f"{hyp_id}|{ph_id}": kinds for (hyp_id, ph_id), kinds in links.items()
        }
        return {
            "hypotheses": hypothesis_rows,
            "phenomena": phenomenon_rows,
            "cells": flat_cells,
            "counts_by_edge_type": tallies,
        }

    def hypothesis_evidence_matrix_markdown(self) -> str:
        """Render the matrix as a compact markdown pivot.

        One row per hypothesis. Columns are the hypothesis title, one
        right-aligned count column per edge type in ``self.edge_log_lr``
        (sorted by name), then log_odds, confidence and status — enough
        for the report agent to ground every hypothesis in its supporting
        and contradicting evidence at a glance.

        Hypothesis titles are free text, so before a title is placed in a
        cell any pipe character is backslash-escaped and line breaks are
        flattened to single spaces; otherwise one such title would shift
        or truncate the row. Rows are assembled from cell lists, so the
        table stays well-formed even when there are no edge types.

        Returns:
            The table as a newline-joined string, or ``"(no hypotheses)"``
            when the graph holds no hypotheses.
        """
        if not self.hypotheses:
            return "(no hypotheses)"

        def _cell(text: object) -> str:
            # A raw "|" opens a new column and a line break ends the row,
            # so neutralise both before embedding free text in the table.
            return " ".join(str(text).splitlines()).replace("|", "\\|")

        matrix = self.hypothesis_evidence_matrix()
        edge_types = sorted(self.edge_log_lr.keys())

        header_cells = ["Hypothesis", *edge_types, "log_odds", "conf", "status"]
        # Title and status are left-aligned; every numeric column is right-aligned.
        align_cells = ["---", *(["---:"] * len(edge_types)), "---:", "---:", "---"]
        rows = [
            "| " + " | ".join(header_cells) + " |",
            "|" + "|".join(align_cells) + "|",
        ]
        for h in matrix["hypotheses"]:
            # Hypotheses without any counted edge get a row of zeros.
            counts = matrix["counts_by_edge_type"].get(h["id"], {})
            row_cells = [
                _cell(h["title"]),
                *(str(counts.get(et, 0)) for et in edge_types),
                f"{h['log_odds']:+.2f}",
                f"{h['confidence']:.2f}",
                f"{h['status']}",
            ]
            rows.append("| " + " | ".join(row_cells) + " |")
        return "\n".join(rows)

    # ---- Summary (lightweight, for system prompt) ----------------------------

    def stats_summary(self) -> str:
        """Build the one-line graph statistics string used in the system prompt.

        Reports the number of phenomena, hypotheses (with how many are in
        status ``"active"``), entities, edges, extracted assets, and leads
        whose status is ``"pending"``.
        """
        n_phenomena = len(self.phenomena)
        n_hypotheses = len(self.hypotheses)
        n_active = len([h for h in self.hypotheses.values() if h.status == "active"])
        n_entities = len(self.entities)
        n_edges = len(self.edges)
        n_assets = len(self.asset_library)
        n_pending = len([lead for lead in self.leads if lead.status == "pending"])
        return (
            f"Graph: {n_phenomena} phenomena, "
            f"{n_hypotheses} hypotheses ({n_active} active), "
            f"{n_entities} entities, {n_edges} edges. "
            f"Asset library: {n_assets} extracted files. "
            f"Pending leads: {n_pending}."
        )