Files
MASForensic/evidence_graph.py
BattleTag 81ade8f7ac feat(refit): complete S1-S6 — case abstraction, grounding, log-odds, plugins, coref, multi-source
Consolidates the long-running refit work (DESIGN.md as authoritative spec)
into a single baseline commit. Six stages landed together:

  S1  Case + EvidenceSource abstraction; tools parameterised by source_id
      (case.py, main.py multi-source bootstrap, .bin extension support)
  S2  Grounding gateway in add_phenomenon: verified_facts cite real
      ToolInvocation ids; substring / normalised match enforced; agent +
      task scope checked. Phenomenon.description split into verified_facts
      (grounded) + interpretation (free text). [invocation: inv-xxx]
      prefix on every wrapped tool result so the LLM can cite.
  S3  Confidence as additive log-odds: edge_type → log10(LR) calibration
      table; commutative updates; supported / refuted thresholds derived
      from log_odds; hypothesis × evidence matrix view.
  S4  iOS plugin: unzip_archive + parse_plist / sqlite_tables /
      sqlite_query / parse_ios_keychain / read_idevice_info;
      IOSArtifactAgent; SOURCE_TYPE_AGENTS routing.
  S5  Cross-source entity resolution: typed identifiers on Entity,
      observe_identity gateway, auto coref hypothesis with shared /
      conflicting strong/weak LR edges, reversible same_as edges,
      actor_clusters() view.
  S6  Android partition probe + AndroidArtifactAgent; MediaAgent with
      OCR fallback; orchestrator Phase 1 iterates every analysable
      source; platform-aware get_triage_agent_type; ReportAgent renders
      actor clusters + per-source breakdown.

142 unit tests / 1 skipped — full coverage of the new gateway, log-odds
math, coref hypothesis fall-out, and orchestrator multi-source dispatch.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-21 02:12:10 -10:00

1896 lines
74 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""Evidence Knowledge Graph for multi-agent forensic analysis.
Replaces the flat Blackboard with a graph-based evidence store.
Nodes: Phenomenon (observable artifacts), Hypothesis (interpretive claims), Entity (recurring objects).
Edges: typed relationships with predefined weights for hypothesis confidence computation.
"""
from __future__ import annotations
import asyncio
import contextvars
import hashlib
import json
import logging
import re
import uuid
from dataclasses import asdict, dataclass, field
from datetime import datetime
from pathlib import Path
# Per-asyncio-task scoped values for "which agent is currently running" and
# "which task scope does that agent's grounding live in". Backed by
# ContextVars so concurrent agent runs (Phase 3's _dispatch_leads_parallel)
# don't clobber each other — asyncio.create_task / asyncio.gather copies
# the parent context per child task, and writes inside one task stay there.
# Pre-P0 these were plain attributes on EvidenceGraph; the last setter won
# under concurrency, tagging tool invocations with the WRONG agent and
# making the grounding gateway falsely reject legitimate facts.
#
# Both variables default to the empty string, meaning "no agent / no task
# scope has been set in this context".
_current_agent_ctx: contextvars.ContextVar[str] = contextvars.ContextVar(
    "masforensics_current_agent", default="",
)
_current_task_id_ctx: contextvars.ContextVar[str] = contextvars.ContextVar(
    "masforensics_current_task_id", default="",
)
from case import Case, EvidenceSource, single_source_case
logger = logging.getLogger(__name__)
# ---------------------------------------------------------------------------
# Per-edge-type log₁₀(LR) — the calibration table backing hypothesis
# confidence updates (DESIGN.md §4.5).
#
# The LLM only picks the *category* (direct_evidence, supports, …); the
# numerical contribution is looked up here. Updates use the additive,
# order-independent log-odds form
# L_post = L_prior + Σ log10(LR_i)
# confidence = 1 / (1 + 10^(-L_post))
# which fixes the pre-S3 delta-update bug whose result depended on the
# order edges arrived in.
#
# Override per-graph via EvidenceGraph(edge_log_lr=...) or config.yaml's
# `hypothesis_log_lr` section.
# ---------------------------------------------------------------------------
_DEFAULT_LOG_LR: dict[str, float] = {
    # Positive values push a hypothesis towards "supported", negative
    # values towards "refuted"; magnitudes are log10 likelihood ratios.
    "direct_evidence": +2.0,
    "supports": +1.0,
    "consequence_observed": +1.0,
    "prerequisite_met": +0.5,
    "weakens": -0.5,
    "contradicts": -2.0,
    # S5 cross-source coref (DESIGN.md §4.6) — same calibration scale.
    # A single shared strong identifier (email, phone, wallet, IMEI…) is
    # near-decisive; weak identifiers (nickname) accumulate slowly; a
    # conflicting strong identifier is strong negative evidence.
    "shared_strong_identifier": +2.0,
    "shared_weak_identifier": +0.5,
    "conflicting_strong_identifier": -2.0,
}
# DESIGN.md §4.6 identifier taxonomy. Strong identifiers approximate
# global uniqueness — sharing one is high-confidence coref evidence.
# Weak identifiers are nicknames / display names — accumulate via Bayes.
STRONG_IDENTIFIER_TYPES: set[str] = {
    "email",
    "phone_number",
    "imei",
    "imsi",
    "apple_id",
    "icloud_id",
    "google_account",
    "wallet_address",
    "udid",
    "mac_address",
    "device_serial",
}
# Free-form, human-chosen labels; any type absent from the strong set is
# treated as not strong by is_strong_identifier().
WEAK_IDENTIFIER_TYPES: set[str] = {
    "nickname",
    "display_name",
    "username",
    "screen_name",
}
def is_strong_identifier(identifier_type: str) -> bool:
    """Report whether *identifier_type* is listed in STRONG_IDENTIFIER_TYPES.

    Strong types are the ones unique enough to justify a strong LR edge.
    """
    strong = identifier_type in STRONG_IDENTIFIER_TYPES
    return strong
def _normalize_identifier(identifier_type: str, value: str) -> str:
    """Canonicalise an identifier value so trivial spelling variants match.

    - Surrounding whitespace is always stripped.
    - Case-insensitive identifier types (email, account ids, MAC, wallet,
      UDID, IMEI, IMSI, device serial) are lower-cased.
    - Phone numbers keep only their digits, which also drops the leading
      '+' of international dialling and any punctuation.
    - Everything else (nicknames, display names, …) passes through
      unchanged apart from the strip.

    ``value`` is normally a string, but ``None`` (→ ``""``) and non-string
    scalars such as a numeric IMEI or phone number are accepted too; they
    are converted with ``str()`` instead of failing on ``.strip()``.
    """
    raw = value or ""
    if not isinstance(raw, str):
        # Numeric identifiers can arrive as int/float when they come out
        # of JSON; treat them as their decimal text.
        raw = str(raw)
    v = raw.strip()
    if identifier_type in {"email", "apple_id", "icloud_id", "google_account",
                           "mac_address", "wallet_address", "udid",
                           "imei", "imsi", "device_serial"}:
        v = v.lower()
    if identifier_type == "phone_number":
        # Uses the module-level ``re`` import; no local re-import needed.
        v = re.sub(r"\D", "", v)
    return v
def prob_to_log_odds(p: float) -> float:
    """Return the base-10 logit of *p*.

    The probability is first clamped into [1e-9, 1 - 1e-9] so the result
    is always finite.
    """
    import math

    clipped = float(p)
    clipped = min(1 - 1e-9, clipped)
    clipped = max(1e-9, clipped)
    odds = clipped / (1.0 - clipped)
    return math.log10(odds)
def log_odds_to_prob(log_odds: float) -> float:
    """Inverse of :func:`prob_to_log_odds`: ``1 / (1 + 10^(-L))``.

    Returns a probability in [0.0, 1.0]. Extremely negative log-odds,
    where ``10^(-L)`` would overflow a float, saturate to ``0.0`` instead
    of raising ``OverflowError``; extremely positive log-odds underflow
    naturally to ``1.0``.
    """
    try:
        return 1.0 / (1.0 + 10.0 ** (-float(log_odds)))
    except OverflowError:
        # 10 ** (very large exponent) is out of float range; the limit of
        # the expression as L → -∞ is 0.
        return 0.0
_WS_RUN = re.compile(r"\s+")


def _normalize_for_grounding(s: str) -> str:
    """Reduce *s* to the canonical form used by loose fact grounding.

    Quoted tool output often differs from the raw output only in
    presentation: letter case (hex dumps, e-mail addresses, MACs), the
    direction of path separators, or whitespace that was reflowed across
    line breaks. Applying the same normalisation to both the claimed value
    and the tool output lets those variants match:

    - everything is lower-cased;
    - every backslash becomes a forward slash;
    - each run of whitespace becomes one space, and the ends are trimmed.

    A value that is absent from the output in every form still fails the
    substring test after normalisation. Falsy input yields ``""``.
    """
    if s:
        folded = s.lower()
        folded = folded.replace("\\", "/")
        squeezed = _WS_RUN.sub(" ", folded)
        return squeezed.strip()
    return ""
class GroundingError(ValueError):
    """Signals that the add_phenomenon gateway rejected a submission.

    Raised when at least one entry of ``verified_facts`` fails grounding
    (missing or wrong invocation_id, wrong agent or task, or the fact's
    value is absent from the cited tool output).

    Attributes:
        failures: the rejected facts, kept so a caller can report them
            back to the LLM for a corrective retry.
    """

    def __init__(self, message: str, failures: list[dict]) -> None:
        # Keep the rejected facts on the instance before handing the
        # human-readable message to ValueError.
        self.failures = failures
        super().__init__(message)
# All valid edge types across the graph, grouped by the kinds of node they
# connect. EvidenceGraph.add_edge rejects any edge_type not listed here.
VALID_EDGE_TYPES: set[str] = {
    # Phenomenon → Hypothesis
    "direct_evidence", "supports", "prerequisite_met",
    "consequence_observed", "contradicts", "weakens",
    # Phenomenon → Hypothesis (S5 coref-specific — used between identifier
    # observation phenomena and the "Entity A ≡ Entity B" coref hypothesis)
    "shared_strong_identifier", "shared_weak_identifier",
    "conflicting_strong_identifier",
    # Phenomenon → Phenomenon
    "temporal", "causal", "input_to", "modifies", "co_located", "corroborates",
    # Phenomenon → Entity
    "created_by", "executed_by", "owned_by", "targets",
    "associated_with", "found_on", "used_by",
    # Hypothesis → Hypothesis
    "refines", "conflicts", "depends_on",
    # Entity → Entity (S5 — backed by a coref hypothesis ≥ threshold)
    "same_as",
}
# ---------------------------------------------------------------------------
# Graph node types
# ---------------------------------------------------------------------------
def _compute_quality_score(
    source_tool: str,
    timestamp: str | None,
    raw_data: dict,
    interpretation: str,
    verified_facts: list[dict],
    related_ids: list[str],
) -> float:
    """Score how complete a piece of evidence is, on a 0.0-1.0 scale.

    Contributions: a named source tool 0.20, a timestamp 0.15, raw data
    0.15, grounded facts 0.05 each (capped at 0.25), an interpretation of
    at least 50 characters 0.10, related ids 0.15. Grounded facts weigh
    more than free text because they carry provenance.
    """
    # Each term is added in a fixed order so the float result is stable.
    total = 0.0
    total += 0.20 if source_tool else 0.0
    total += 0.15 if timestamp is not None else 0.0
    total += 0.15 if raw_data else 0.0
    total += min(0.25, 0.05 * len(verified_facts)) if verified_facts else 0.0
    total += 0.10 if len(interpretation) >= 50 else 0.0
    total += 0.15 if related_ids else 0.0
    return min(1.0, total)
def _jaccard_similarity(a: str, b: str) -> float:
    """Return the Jaccard index of the lower-cased, whitespace-split tokens.

    Yields 0.0 when either string contains no tokens.
    """
    left = set(a.lower().split())
    right = set(b.lower().split())
    if left and right:
        shared = left & right
        combined = left | right
        return len(shared) / len(combined)
    return 0.0
@dataclass
class Phenomenon:
    """An observable artifact recovered from the evidence.

    Following DESIGN.md §4.4, a phenomenon separates provenance-bound
    *facts* from the agent's free-text *interpretation*. Facts are
    hard-validated by the gateway against the tool invocation they cite;
    the interpretation is narrative and is presented in the final report
    as "agent analysis" rather than as truth.
    """

    id: str  # "ph-{uuid8}"
    source_agent: str
    category: str  # filesystem, registry, email, network, timeline
    title: str
    # The agent's reasoning in free text. NOT verified.
    interpretation: str = ""
    # Grounded atoms, each shaped {type, value, invocation_id} with
    # type ∈ {path, timestamp, inode, hash, identifier, count, raw, ...}
    verified_facts: list[dict] = field(default_factory=list)
    raw_data: dict = field(default_factory=dict)
    timestamp: str | None = None
    confidence: float = 1.0
    source_tool: str = ""
    source_id: str = ""  # id of the EvidenceSource this finding came from
    corroborating_agents: list[str] = field(default_factory=list)
    from_lead_id: str | None = None
    created_at: str = ""

    def to_dict(self) -> dict:
        """Return the dataclass fields as a plain dict."""
        return asdict(self)

    @classmethod
    def from_dict(cls, d: dict) -> Phenomenon:
        """Build a Phenomenon from *d*, accepting the legacy layout.

        Earlier runs stored the free text under ``description``; that key
        is mapped onto ``interpretation`` (unless one is already present)
        so old graph_state.json files still load. Keys that are not
        dataclass fields are dropped. *d* itself is not modified.
        """
        payload = dict(d)
        if "description" in payload:
            legacy_text = payload.pop("description")
            payload.setdefault("interpretation", legacy_text or "")
        payload.setdefault("verified_facts", [])
        allowed = set(cls.__dataclass_fields__)
        kwargs = {}
        for key, value in payload.items():
            if key in allowed:
                kwargs[key] = value
        return cls(**kwargs)

    def summary(self) -> str:
        """Return a one-line label: id, category, title, time, confidence."""
        when = "" if not self.timestamp else f" @ {self.timestamp}"
        fact_count = len(self.verified_facts)
        facts_note = f" facts={fact_count}" if fact_count else ""
        head = f"[{self.id}] [{self.category}] {self.title}{when} "
        tail = f"(conf={self.confidence:.2f}{facts_note})"
        return head + tail
@dataclass
class Hypothesis:
    """Interpretive claim about what happened on the system.

    Confidence is a *derived* projection of ``log_odds`` (DESIGN.md §4.5):
    every Phenomenon→Hypothesis edge contributes log₁₀(LR) to ``log_odds``,
    and ``confidence = 1 / (1 + 10^(-log_odds))``. ``log_odds`` is the
    canonical state; ``confidence`` is kept in sync for display and
    threshold checks (≥0.8 supported / ≤0.2 refuted).
    ``prior_prob`` seeds the starting log_odds (default 0.5 → 0.0).
    """

    id: str  # "hyp-{uuid8}"
    title: str
    description: str
    prior_prob: float = 0.5
    log_odds: float = 0.0
    confidence: float = 0.5  # derived from log_odds — kept in sync on update
    status: str = "active"  # active, supported, refuted, inconclusive
    parent_id: str | None = None
    created_by: str = ""  # "manual", "hypothesis_agent", agent name
    created_at: str = ""
    confidence_log: list[dict] = field(default_factory=list)
    # S5 coref-specific: pair of entity ids this hypothesis claims are the
    # same actor. Lets update_hypothesis_confidence sync the backing
    # ``same_as`` edge automatically when contradicting evidence arrives.
    coref_entity_pair: list[str] = field(default_factory=list)

    def to_dict(self) -> dict:
        """Return the dataclass fields as a plain dict."""
        return asdict(self)

    @classmethod
    def from_dict(cls, d: dict) -> Hypothesis:
        """Reconstruct from a dict. Migrates pre-S3 records that only had
        ``confidence`` by deriving ``log_odds`` via the logit transform.

        The input dict is copied, not mutated. ``confidence`` is always
        recomputed from ``log_odds``, and keys that are not dataclass
        fields are ignored.
        """
        d = dict(d)
        if "log_odds" not in d:
            d["log_odds"] = prob_to_log_odds(d.get("confidence", 0.5))
        d.setdefault("prior_prob", 0.5)
        # Re-sync confidence from log_odds in case of drift in old files.
        d["confidence"] = log_odds_to_prob(d["log_odds"])
        known = set(cls.__dataclass_fields__)
        return cls(**{k: v for k, v in d.items() if k in known})

    def summary(self) -> str:
        """Return a one-line label: id, title, confidence, log-odds, status."""
        return (
            f"[{self.id}] {self.title} "
            f"(conf={self.confidence:.2f}, L={self.log_odds:+.2f}, {self.status})"
        )
@dataclass
class Entity:
    """A recurring actor or object referenced by phenomena.

    Per DESIGN.md §4.6 the typed identifiers live directly on the entity
    so coref can do fast blocking lookups. Each identifier entry has the
    shape
    {type, value, normalized, strong, invocation_id, phenomenon_id, observed_at}
    where ``normalized`` is the canonical form used for matching
    (lower-cased email, digits-only phone, …).
    """

    id: str  # "ent-{uuid8}"
    name: str
    entity_type: str  # person, program, file, host, ip_address
    description: str = ""
    identifiers: list[dict] = field(default_factory=list)
    created_at: str = ""

    def to_dict(self) -> dict:
        """Return the dataclass fields as a plain dict."""
        return asdict(self)

    @classmethod
    def from_dict(cls, d: dict) -> Entity:
        """Build an Entity from *d*; a missing ``identifiers`` becomes ``[]``.

        Keys that are not dataclass fields are dropped; *d* is not mutated.
        """
        payload = dict(d)
        payload.setdefault("identifiers", [])
        allowed = set(cls.__dataclass_fields__)
        kwargs = {}
        for key, value in payload.items():
            if key in allowed:
                kwargs[key] = value
        return cls(**kwargs)

    def has_identifier(self, identifier_type: str, normalized_value: str) -> bool:
        """Tell whether an identifier of this type and normalised value exists."""
        for ident in self.identifiers:
            same_type = ident.get("type") == identifier_type
            if same_type and ident.get("normalized") == normalized_value:
                return True
        return False

    def summary(self) -> str:
        """Return a one-line label, previewing at most three identifiers."""
        label = f"[{self.id}] {self.entity_type}: {self.name}"
        if not self.identifiers:
            return label
        shown = self.identifiers[:3]
        preview = ", ".join(f"{i.get('type')}={i.get('value')}" for i in shown)
        if len(self.identifiers) > 3:
            extra = f" (+{len(self.identifiers) - 3} more)"
        else:
            extra = ""
        return f"{label} [{preview}{extra}]"
@dataclass
class Edge:
    """Directed edge in the evidence graph.

    ``source_id`` and ``target_id`` are node ids (``ph-…``, ``hyp-…`` or
    ``ent-…``); ``edge_type`` is one of the graph's valid edge types.
    """

    id: str  # "edge-{uuid8}"
    source_id: str
    target_id: str
    edge_type: str
    metadata: dict = field(default_factory=dict)
    created_by: str = ""
    created_at: str = ""

    def to_dict(self) -> dict:
        """Return the dataclass fields as a plain dict."""
        return asdict(self)

    @classmethod
    def from_dict(cls, d: dict) -> Edge:
        """Reconstruct an Edge from a dict.

        Keys that are not dataclass fields are ignored, matching the node
        types' ``from_dict``, so a state file carrying extra or retired
        fields still loads. Missing required fields still raise
        ``TypeError``.
        """
        known = set(cls.__dataclass_fields__)
        return cls(**{k: v for k, v in d.items() if k in known})
@dataclass
class Lead:
    """An investigative lead that should be followed up by an agent."""

    id: str
    target_agent: str
    description: str
    priority: int = 5  # 1 (highest) - 10 (lowest)
    context: dict = field(default_factory=dict)
    status: str = "pending"  # pending, assigned, completed, failed
    hypothesis_id: str | None = None

    def to_dict(self) -> dict:
        """Return the dataclass fields as a plain dict."""
        return asdict(self)

    @classmethod
    def from_dict(cls, d: dict) -> Lead:
        """Reconstruct a Lead from a dict.

        Keys that are not dataclass fields are ignored, matching the node
        types' ``from_dict``, so a state file carrying extra or retired
        fields still loads. Missing required fields still raise
        ``TypeError``.
        """
        known = set(cls.__dataclass_fields__)
        return cls(**{k: v for k, v in d.items() if k in known})
@dataclass
class InvestigationArea:
    """An area to investigate to confirm/refute one or more hypotheses.

    Derived by the orchestrator from active hypotheses after Phase 2; also
    seeded from config.yaml:investigation_areas as an optional manual
    override. Each area carries its own keywords + expected tools so the
    gap-analysis coverage check is generic, not tied to hard-coded constants.
    """

    id: str  # "area-{slug}"
    area: str  # snake_case slug (dedupe key)
    description: str
    suggested_agent: str  # filesystem / registry / communication / network / timeline
    expected_keywords: list[str] = field(default_factory=list)
    expected_tools: list[str] = field(default_factory=list)
    priority: int = 5  # 1 (highest) - 10 (lowest)
    motivating_hypothesis_ids: list[str] = field(default_factory=list)
    created_by: str = ""  # "manual" | "llm_derive" | "fallback"
    created_at: str = ""

    def to_dict(self) -> dict:
        """Return the dataclass fields as a plain dict."""
        return asdict(self)

    @classmethod
    def from_dict(cls, d: dict) -> InvestigationArea:
        """Reconstruct an InvestigationArea from a dict.

        Keys that are not dataclass fields are ignored, matching the node
        types' ``from_dict``, so a state file carrying extra or retired
        fields still loads. Missing required fields still raise
        ``TypeError``.
        """
        known = set(cls.__dataclass_fields__)
        return cls(**{k: v for k, v in d.items() if k in known})

    def summary(self) -> str:
        """Return a one-line label: slug, priority, agent, motivation count."""
        return (
            f"[{self.area}] P{self.priority} agent={self.suggested_agent} "
            f"(motivating: {len(self.motivating_hypothesis_ids)})"
        )
@dataclass
class ExtractedAsset:
    """A file extracted from the disk image and tracked in the asset library."""

    id: str  # "asset-{uuid8}"
    inode: str  # e.g. "334-128-4"
    original_path: str  # disk image path from ffind
    local_path: str  # "extracted/SYSTEM"
    category: str  # registry_hive, chat_log, prefetch, ...
    filename: str  # "SYSTEM"
    size_bytes: int
    extracted_by: str  # agent name
    extracted_at: str  # ISO timestamp

    def to_dict(self) -> dict:
        """Return the dataclass fields as a plain dict."""
        return asdict(self)

    @classmethod
    def from_dict(cls, d: dict) -> ExtractedAsset:
        """Reconstruct an ExtractedAsset from a dict.

        Keys that are not dataclass fields are ignored, matching the node
        types' ``from_dict``, so a state file carrying extra or retired
        fields still loads. Missing required fields still raise
        ``TypeError``.
        """
        known = set(cls.__dataclass_fields__)
        return cls(**{k: v for k, v in d.items() if k in known})

    def summary(self) -> str:
        """Return a one-line label with the size shown in KiB (1024 bytes)."""
        size_kb = self.size_bytes / 1024
        return (
            f"[{self.id}] {self.filename} ({self.category}) "
            f"{size_kb:.1f}KB @ {self.local_path} [inode:{self.inode}]"
        )
@dataclass
class ToolInvocation:
    """One recorded tool call — the provenance unit for grounded facts.

    Every wrapped tool executor records a ToolInvocation when it runs. The
    grounding gateway looks these up by id when validating that a fact in
    an ``add_phenomenon`` call traces back to a real tool output. Persisted
    with the graph so a re-loaded run can still verify provenance.
    """

    id: str  # "inv-{uuid8}"
    tool: str  # tool name as registered in TOOL_CATALOG
    args: dict  # kwargs passed to the executor
    output: str  # the raw output string the tool produced
    output_sha256: str  # hexdigest — tamper-evident hash of output
    agent: str  # agent that issued the call
    task_id: str  # agent run scope (graph._current_task_id at call time)
    source_id: str  # active evidence source at call time
    created_at: str  # ISO timestamp
    cached: bool = False  # served from result cache without re-running

    def to_dict(self) -> dict:
        """Return the dataclass fields as a plain dict."""
        return asdict(self)

    @classmethod
    def from_dict(cls, d: dict) -> ToolInvocation:
        """Reconstruct a ToolInvocation from a dict.

        Keys that are not dataclass fields are ignored, matching the node
        types' ``from_dict``, so a state file carrying extra or retired
        fields still loads. Missing required fields still raise
        ``TypeError``.
        """
        known = set(cls.__dataclass_fields__)
        return cls(**{k: v for k, v in d.items() if k in known})

    def summary(self) -> str:
        """Return a one-line label: id, call with JSON args, time, agent, cached."""
        return (
            f"[{self.id}] {self.tool}({json.dumps(self.args, ensure_ascii=False)}) "
            f"@{self.created_at} agent={self.agent} cached={self.cached}"
        )
# ---------------------------------------------------------------------------
# Evidence Graph
# ---------------------------------------------------------------------------
class EvidenceGraph:
"""Graph-based evidence store for multi-agent forensic analysis.
Agents interact with the graph via query tools (list_phenomena,
get_phenomenon, search_graph, get_related) rather than reading
a full dump in the system prompt.
"""
def __init__(
    self,
    case_info: dict | None = None,
    persist_path: Path | None = None,
    edge_log_lr: dict[str, float] | None = None,
) -> None:
    """Create an empty graph.

    Args:
        case_info: free-form case metadata; a falsy value becomes ``{}``.
        persist_path: file that auto-save writes to; ``None`` disables it.
        edge_log_lr: per-edge-type log10(LR) table; a falsy value selects
            a copy of the module default table.
    """
    self.case_info: dict = case_info if case_info else {}
    self._persist_path: Path | None = persist_path
    self._lock = asyncio.Lock()

    # Calibration table for confidence updates: log10(LR) per edge type.
    # Called edge_weights before S3; the values are log-likelihood ratios
    # in odds space, not deltas in confidence space. Always a private copy.
    self.edge_log_lr: dict[str, float] = dict(
        edge_log_lr if edge_log_lr else _DEFAULT_LOG_LR
    )

    # Evidence-source model. image_path / partition_offset mirror the
    # active source for backward-compatible readers and are kept in sync
    # by set_active_source.
    self.case: Case | None = None
    self.active_source: EvidenceSource | None = None
    self.image_path: str = ""
    self.partition_offset: int = 0
    self.extracted_dir: str = "extracted"

    # Node and edge storage.
    self.phenomena: dict[str, Phenomenon] = {}
    self.hypotheses: dict[str, Hypothesis] = {}
    self.entities: dict[str, Entity] = {}
    self.edges: list[Edge] = []

    # Adjacency indexes for traversal.
    self._adj: dict[str, list[Edge]] = {}  # node_id → outgoing edges
    self._adj_rev: dict[str, list[Edge]] = {}  # node_id → incoming edges

    # Lead / status bookkeeping (carried over from Blackboard).
    self.leads: list[Lead] = []
    self.agent_status: dict[str, str] = {}

    # Asset library: every file extracted from the disk image.
    self.asset_library: dict[str, ExtractedAsset] = {}
    self._inode_index: dict[str, str] = {}  # inode → asset_id

    # Investigation areas, derived from hypotheses (LLM) and/or seeded
    # from config.yaml:investigation_areas; they drive the gap-analysis
    # coverage check.
    self.investigation_areas: dict[str, InvestigationArea] = {}

    # Provenance log for grounded facts: one entry per wrapped tool call,
    # looked up by add_phenomenon's grounding gateway.
    self.tool_invocations: dict[str, ToolInvocation] = {}

    # _current_agent / _current_task_id are properties backed by
    # module-level ContextVars, so they are not initialised here.
# ---- Per-asyncio-task scoped state ---------------------------------------
#
# Reads/writes through these properties hit ContextVars rather than
# instance attributes. Concurrent agent runs (Phase 3 parallel
# dispatch) each have their own task-local context, so writes inside
# one agent's run() are invisible to siblings — which means
# ``record_tool_invocation`` always tags an invocation with the agent
# and task scope that actually issued it.
@property
def _current_agent(self) -> str:
    """Name of the agent running in the current context ("" if unset)."""
    agent = _current_agent_ctx.get()
    return agent

@_current_agent.setter
def _current_agent(self, value: str) -> None:
    # Falsy values (None, "") are stored as the empty string.
    _current_agent_ctx.set(value if value else "")
@property
def _current_task_id(self) -> str:
    """Task scope id active in the current context ("" if unset)."""
    task_id = _current_task_id_ctx.get()
    return task_id

@_current_task_id.setter
def _current_task_id(self, value: str) -> None:
    # Falsy values (None, "") are stored as the empty string.
    _current_task_id_ctx.set(value if value else "")
# ---- Persistence -------------------------------------------------------
def _auto_save(self) -> None:
    """Persist full state to disk. Must be called inside _lock.

    Does nothing when no persist path is configured. The JSON is written
    as UTF-8 to a sibling ``.tmp`` file that then replaces the target.
    The encoding is pinned because the payload is dumped with
    ``ensure_ascii=False``, so non-ASCII evidence text would otherwise
    depend on (and could fail under) the platform's locale encoding.

    Best-effort: any failure is logged and swallowed so a persistence
    problem never aborts the mutation that triggered it.
    """
    if self._persist_path is None:
        return
    try:
        state = {
            "case_info": self.case_info,
            "case": self.case.to_dict() if self.case else None,
            "active_source_id": (
                self.active_source.id if self.active_source else ""
            ),
            "image_path": self.image_path,
            "partition_offset": self.partition_offset,
            "extracted_dir": self.extracted_dir,
            "phenomena": {pid: p.to_dict() for pid, p in self.phenomena.items()},
            "hypotheses": {hid: h.to_dict() for hid, h in self.hypotheses.items()},
            "entities": {eid: e.to_dict() for eid, e in self.entities.items()},
            "edges": [e.to_dict() for e in self.edges],
            "leads": [lead.to_dict() for lead in self.leads],
            "agent_status": dict(self.agent_status),
            "asset_library": {aid: a.to_dict() for aid, a in self.asset_library.items()},
            "investigation_areas": {
                aid: a.to_dict() for aid, a in self.investigation_areas.items()
            },
            "tool_invocations": {
                iid: inv.to_dict() for iid, inv in self.tool_invocations.items()
            },
            "saved_at": datetime.now().isoformat(),
        }
        tmp = self._persist_path.with_suffix(".tmp")
        tmp.write_text(
            json.dumps(state, ensure_ascii=False, indent=2),
            encoding="utf-8",
        )
        tmp.replace(self._persist_path)
    except Exception as e:
        logger.error("EvidenceGraph auto-save failed: %s", e)

def save_state(self, path: Path) -> None:
    """Explicitly save state to the given path.

    Temporarily points the persist path at *path*; the previous value is
    restored even if the save is interrupted.
    """
    old = self._persist_path
    self._persist_path = path
    try:
        self._auto_save()
    finally:
        self._persist_path = old

@classmethod
def load_state(
    cls,
    path: Path,
    edge_log_lr: dict[str, float] | None = None,
) -> EvidenceGraph:
    """Restore an EvidenceGraph from a saved JSON state file.

    Args:
        path: state file to read; it also becomes the graph's persist path.
        edge_log_lr: optional calibration table passed to the constructor.

    The file is read as UTF-8. If that fails to decode, it is re-read with
    the platform default encoding, so files written before the encoding
    was pinned still load.
    """
    try:
        raw = path.read_text(encoding="utf-8")
    except UnicodeDecodeError:
        # Legacy file written with the locale encoding.
        raw = path.read_text()
    data = json.loads(raw)
    graph = cls(
        case_info=data.get("case_info", {}),
        persist_path=path,
        edge_log_lr=edge_log_lr,
    )
    graph.image_path = data.get("image_path", "")
    graph.partition_offset = data.get("partition_offset", 0)
    graph.extracted_dir = data.get("extracted_dir", "extracted")
    # Restore the evidence-source model. State files predating the Case
    # model carry only image_path/partition_offset → wrap as one source.
    case_data = data.get("case")
    if case_data:
        graph.case = Case.from_dict(case_data)
    elif graph.image_path:
        graph.case = single_source_case(
            graph.image_path, graph.partition_offset
        )
    if graph.case and graph.case.sources:
        active = graph.case.get_source(data.get("active_source_id", ""))
        # Fall back to the first source when the saved id is not found.
        graph.set_active_source(active or graph.case.sources[0])
    graph.phenomena = {
        pid: Phenomenon.from_dict(p)
        for pid, p in data.get("phenomena", {}).items()
    }
    graph.hypotheses = {
        hid: Hypothesis.from_dict(h)
        for hid, h in data.get("hypotheses", {}).items()
    }
    graph.entities = {
        eid: Entity.from_dict(e)
        for eid, e in data.get("entities", {}).items()
    }
    graph.edges = [Edge.from_dict(e) for e in data.get("edges", [])]
    graph.leads = [Lead.from_dict(item) for item in data.get("leads", [])]
    graph.agent_status = data.get("agent_status", {})
    for aid, a_data in data.get("asset_library", {}).items():
        asset = ExtractedAsset.from_dict(a_data)
        graph.asset_library[aid] = asset
        # Keep the inode → asset id index in step with the library.
        graph._inode_index[asset.inode] = aid
    graph.investigation_areas = {
        aid: InvestigationArea.from_dict(a)
        for aid, a in data.get("investigation_areas", {}).items()
    }
    graph.tool_invocations = {
        iid: ToolInvocation.from_dict(inv)
        for iid, inv in data.get("tool_invocations", {}).items()
    }
    graph._rebuild_adjacency()
    logger.info(
        "EvidenceGraph restored: %d phenomena, %d hypotheses, %d entities, "
        "%d edges, %d assets",
        len(graph.phenomena), len(graph.hypotheses),
        len(graph.entities), len(graph.edges), len(graph.asset_library),
    )
    return graph
def _rebuild_adjacency(self) -> None:
    """Recompute both adjacency indexes from ``self.edges``.

    The existing index dicts are emptied in place and refilled, so each
    edge ends up under its source id (outgoing) and target id (incoming).
    """
    outgoing = self._adj
    incoming = self._adj_rev
    outgoing.clear()
    incoming.clear()
    for link in self.edges:
        if link.source_id not in outgoing:
            outgoing[link.source_id] = []
        outgoing[link.source_id].append(link)
        if link.target_id not in incoming:
            incoming[link.target_id] = []
        incoming[link.target_id].append(link)
# ---- Evidence source ----------------------------------------------------
def set_active_source(self, source: EvidenceSource | None) -> None:
    """Make *source* the evidence source that tools and new phenomena use.

    The legacy ``image_path`` / ``partition_offset`` fields are updated
    from the source so existing readers (orchestrator logs, report naming,
    agent prompts) keep working. Passing ``None`` clears the active source
    but leaves those legacy fields as they were.
    """
    self.active_source = source
    if source is None:
        return
    self.image_path = source.path
    self.partition_offset = source.partition_offset
# ---- Node helpers -------------------------------------------------------
def _node_exists(self, node_id: str) -> bool:
    """Tell whether *node_id* names a stored node.

    The id prefix selects the store: ``ph-`` phenomena, ``hyp-``
    hypotheses, ``ent-`` entities. Any other prefix yields False.
    """
    stores = (
        ("ph-", self.phenomena),
        ("hyp-", self.hypotheses),
        ("ent-", self.entities),
    )
    for prefix, store in stores:
        if node_id.startswith(prefix):
            return node_id in store
    return False
def get_node(self, node_id: str) -> Phenomenon | Hypothesis | Entity | None:
    """Return the node stored under *node_id*, or None if there is none.

    The id prefix selects the store: ``ph-`` phenomena, ``hyp-``
    hypotheses, ``ent-`` entities. An unrecognised prefix yields None.
    """
    stores = (
        ("ph-", self.phenomena),
        ("hyp-", self.hypotheses),
        ("ent-", self.entities),
    )
    for prefix, store in stores:
        if node_id.startswith(prefix):
            return store.get(node_id)
    return None
# ---- Similarity merging (Phenomenon only) --------------------------------
def _find_similar_phenomenon(
    self, title: str, interpretation: str, category: str,
) -> Phenomenon | None:
    """Find the stored phenomenon most similar to the given description.

    A candidate must share *category*, have a title Jaccard similarity
    above 0.6, and an interpretation similarity (first 200 characters of
    each side) above 0.4. Among those, the highest ``0.6 * title + 0.4 *
    interpretation`` score wins; the earliest candidate wins a tie.
    Returns None when nothing qualifies.
    """
    winner: Phenomenon | None = None
    winning_score = 0.0
    probe = interpretation[:200]
    for candidate in self.phenomena.values():
        if candidate.category == category:
            title_score = _jaccard_similarity(candidate.title, title)
            if title_score > 0.6:
                text_score = _jaccard_similarity(
                    candidate.interpretation[:200], probe,
                )
                if text_score > 0.4:
                    blended = title_score * 0.6 + text_score * 0.4
                    if blended > winning_score:
                        winner, winning_score = candidate, blended
    return winner
# ---- Mutation methods (async, under lock) --------------------------------
async def add_phenomenon(
    self,
    source_agent: str,
    category: str,
    title: str,
    interpretation: str = "",
    verified_facts: list[dict] | None = None,
    raw_data: dict | None = None,
    timestamp: str | None = None,
    source_tool: str = "",
    from_lead_id: str | None = None,
    task_id: str | None = None,
    # Pre-S2 callers passed analysis text as ``description``. Accept it
    # as an alias for ``interpretation`` so legacy tests and any in-flight
    # tool-call messages don't break. Not advertised in the LLM-facing
    # tool schema — BaseAgent's add_phenomenon advertises the new fields.
    description: str | None = None,
) -> tuple[str, bool]:
    """Add a phenomenon under the grounding gateway. Returns (id, was_merged).

    Each fact in ``verified_facts`` must point at a real ToolInvocation
    made by this agent within ``task_id`` (defaults to the graph's
    current task scope). Any fact failing grounding raises
    :class:`GroundingError` — the whole call is rejected; the caller
    must fix and retry. This is the code-level enforcement of
    DESIGN.md §4.4.

    When a similar phenomenon already exists (same category, similar
    title and interpretation) nothing new is created. Instead the
    existing one gains 0.15 confidence (capped at 1.0), records
    ``source_agent`` as corroborating, and absorbs any raw_data keys and
    facts it does not already hold; its id is returned with
    ``was_merged=True``. Otherwise a new phenomenon is stored, bound to
    the active evidence source, with confidence set from the
    evidence-completeness score.

    Raises:
        GroundingError: if one or more facts fail validation; its
            ``failures`` attribute lists each fact with the reason.
    """
    # ``description`` is only used when no interpretation was supplied.
    if description and not interpretation:
        interpretation = description
    facts = list(verified_facts or [])
    active_task_id = task_id if task_id is not None else self._current_task_id
    # Grounding gateway — validate every fact BEFORE acquiring the lock
    # (read-only check; lookup uses dict access which is thread-safe).
    failures: list[dict] = []
    for fact in facts:
        ok, reason = self.validate_fact_grounding(
            fact, agent=source_agent, task_id=active_task_id or "",
        )
        if not ok:
            failures.append({"fact": fact, "reason": reason})
    if failures:
        # All failures are reported together so the caller can correct
        # every bad fact in a single retry.
        raise GroundingError(
            "Phenomenon rejected — one or more facts are not grounded:\n"
            + "\n".join(
                f" - {f['reason']}: {json.dumps(f['fact'], ensure_ascii=False)}"
                for f in failures
            ),
            failures=failures,
        )
    async with self._lock:
        similar = self._find_similar_phenomenon(title, interpretation, category)
        if similar is not None:
            # Merge path: treat the report as corroboration of an
            # existing phenomenon rather than a new node.
            similar.confidence = min(1.0, similar.confidence + 0.15)
            if source_agent not in similar.corroborating_agents:
                similar.corroborating_agents.append(source_agent)
            if raw_data:
                # Existing keys win; only new keys are added.
                for k, v in raw_data.items():
                    if k not in similar.raw_data:
                        similar.raw_data[k] = v
            # Merge any new facts whose (type, value, invocation_id)
            # tuple isn't already on the existing phenomenon.
            if facts:
                seen = {
                    (f.get("type"), f.get("value"), f.get("invocation_id"))
                    for f in similar.verified_facts
                }
                for f in facts:
                    key = (f.get("type"), f.get("value"), f.get("invocation_id"))
                    if key not in seen:
                        similar.verified_facts.append(f)
                        seen.add(key)
            if from_lead_id and similar.from_lead_id is None:
                similar.from_lead_id = from_lead_id
            self._auto_save()
            return similar.id, True
        # New-node path.
        pid = f"ph-{uuid.uuid4().hex[:8]}"
        # related_ids is passed as [] here, so that part of the score
        # never contributes for a freshly created phenomenon.
        confidence = _compute_quality_score(
            source_tool, timestamp, raw_data or {},
            interpretation, facts, [],
        )
        ph = Phenomenon(
            id=pid,
            source_agent=source_agent,
            category=category,
            title=title,
            interpretation=interpretation,
            verified_facts=facts,
            raw_data=raw_data or {},
            timestamp=timestamp,
            confidence=confidence,
            source_tool=source_tool,
            source_id=self.active_source.id if self.active_source else "",
            from_lead_id=from_lead_id,
            created_at=datetime.now().isoformat(),
        )
        self.phenomena[pid] = ph
        self._auto_save()
        return pid, False
async def add_hypothesis(
    self,
    title: str,
    description: str,
    created_by: str = "",
    parent_id: str | None = None,
    prior_prob: float = 0.5,
) -> str:
    """Create an active hypothesis and return its id.

    ``prior_prob`` fixes the starting log-odds (the default 0.5 maps to
    0.0). Choose another prior when base-rate knowledge is available,
    e.g. 0.1 for an unusual claim or 0.9 for a strong default.
    """
    async with self._lock:
        new_id = f"hyp-{uuid.uuid4().hex[:8]}"
        starting_log_odds = prob_to_log_odds(prior_prob)
        # Confidence is derived from the log-odds, never set directly.
        starting_confidence = log_odds_to_prob(starting_log_odds)
        self.hypotheses[new_id] = Hypothesis(
            id=new_id,
            title=title,
            description=description,
            prior_prob=prior_prob,
            log_odds=starting_log_odds,
            confidence=starting_confidence,
            status="active",
            parent_id=parent_id,
            created_by=created_by,
            created_at=datetime.now().isoformat(),
        )
        self._auto_save()
        return new_id
async def add_entity(
    self,
    name: str,
    entity_type: str,
    description: str = "",
) -> tuple[str, bool]:
    """Get-or-create an entity keyed on ``(name, entity_type)``.

    Returns ``(entity_id, was_existing)``. When a matching entity is
    already present nothing is written (the supplied ``description`` is
    ignored) and the graph is not saved.
    """
    async with self._lock:
        match = next(
            (
                candidate
                for candidate in self.entities.values()
                if candidate.name == name and candidate.entity_type == entity_type
            ),
            None,
        )
        if match is not None:
            return match.id, True

        new_id = f"ent-{uuid.uuid4().hex[:8]}"
        self.entities[new_id] = Entity(
            id=new_id,
            name=name,
            entity_type=entity_type,
            description=description,
            created_at=datetime.now().isoformat(),
        )
        self._auto_save()
        return new_id, False
async def add_edge(
    self,
    source_id: str,
    target_id: str,
    edge_type: str,
    metadata: dict | None = None,
    created_by: str = "",
) -> str:
    """Insert a directed edge ``source_id -> target_id`` and return its id.

    Raises ``ValueError`` when either endpoint is unknown to
    ``_node_exists`` or when ``edge_type`` is not in ``VALID_EDGE_TYPES``.
    The edge is appended to the flat edge list and to both adjacency
    indexes before the graph is saved.
    """
    async with self._lock:
        # Validation order matters for which error the caller sees first.
        if not self._node_exists(source_id):
            raise ValueError(f"Source node not found: {source_id}")
        if not self._node_exists(target_id):
            raise ValueError(f"Target node not found: {target_id}")
        if edge_type not in VALID_EDGE_TYPES:
            raise ValueError(f"Invalid edge type: {edge_type}")

        edge_id = f"edge-{uuid.uuid4().hex[:8]}"
        new_edge = Edge(
            id=edge_id,
            source_id=source_id,
            target_id=target_id,
            edge_type=edge_type,
            metadata=metadata or {},
            created_by=created_by,
            created_at=datetime.now().isoformat(),
        )
        self.edges.append(new_edge)
        outgoing = self._adj.setdefault(source_id, [])
        outgoing.append(new_edge)
        incoming = self._adj_rev.setdefault(target_id, [])
        incoming.append(new_edge)
        self._auto_save()
        return edge_id
async def update_hypothesis_confidence(
    self,
    hyp_id: str,
    phenomenon_id: str,
    edge_type: str,
    reason: str = "",
) -> float:
    """Apply one phenomenon→hypothesis edge as an additive log_odds update.

    DESIGN.md §4.5: the log-likelihood-ratio for ``edge_type`` is looked up
    in ``self.edge_log_lr`` (the caller never supplies the number). The
    update is ``L_post = L_prior + log_lr`` and
    ``confidence = log_odds_to_prob(L_post)``; because it is a plain sum,
    the result does not depend on the order in which edges are applied.
    Status becomes ``supported`` at confidence ≥ 0.8, ``refuted`` at
    ≤ 0.2, and ``active`` otherwise.

    **Idempotency**: if an edge ``phenomenon_id -> hyp_id`` with the same
    ``edge_type`` is already in ``self._adj``, the call is a no-op and the
    current confidence is returned — re-recording the same link does not
    double-count. *Different* phenomena pointing the same way still
    accumulate.

    Side effects on a real update: appends an audit entry to
    ``hyp.confidence_log``, creates the graph edge (``created_by`` is
    ``"hypothesis_engine"``), saves, and — for hypotheses carrying a
    two-element ``coref_entity_pair`` — re-syncs the ``same_as`` edge.

    Returns the hypothesis confidence after the call.

    Raises ``ValueError`` for an unknown ``edge_type``, phenomenon, or
    hypothesis.
    """
    # Checked before taking the lock: no graph state is needed to reject
    # an edge type that has no calibration entry.
    if edge_type not in self.edge_log_lr:
        raise ValueError(
            f"Invalid hypothesis edge type: {edge_type}. "
            f"Must be one of: {list(self.edge_log_lr.keys())}"
        )
    async with self._lock:
        if not self._node_exists(phenomenon_id):
            raise ValueError(f"Phenomenon not found: {phenomenon_id}")
        hyp = self.hypotheses.get(hyp_id)
        if hyp is None:
            raise ValueError(f"Hypothesis not found: {hyp_id}")
        # Idempotency check — same (ph, hyp, edge_type) already on graph.
        # Early return happens before any mutation, save, or same_as sync.
        for existing in self._adj.get(phenomenon_id, []):
            if (
                existing.target_id == hyp_id
                and existing.edge_type == edge_type
            ):
                return hyp.confidence
        log_lr = self.edge_log_lr[edge_type]
        old_log_odds = hyp.log_odds
        old_conf = hyp.confidence
        new_log_odds = old_log_odds + log_lr
        new_conf = log_odds_to_prob(new_log_odds)
        hyp.log_odds = new_log_odds
        hyp.confidence = new_conf
        # Status is recomputed from scratch on every update, so a
        # hypothesis can move back to "active" if evidence cancels out.
        if new_conf >= 0.8:
            hyp.status = "supported"
        elif new_conf <= 0.2:
            hyp.status = "refuted"
        else:
            hyp.status = "active"
        # Audit trail entry; values are rounded for readability only —
        # the unrounded numbers live on the hypothesis itself.
        hyp.confidence_log.append({
            "timestamp": datetime.now().isoformat(),
            "phenomenon_id": phenomenon_id,
            "edge_type": edge_type,
            "log_lr": log_lr,
            "old_log_odds": round(old_log_odds, 4),
            "new_log_odds": round(new_log_odds, 4),
            "old_confidence": round(old_conf, 4),
            "new_confidence": round(new_conf, 4),
            "reason": reason,
        })
        # Also create the edge in the graph (this is what the idempotency
        # check above looks for on later calls).
        eid = f"edge-{uuid.uuid4().hex[:8]}"
        edge = Edge(
            id=eid,
            source_id=phenomenon_id,
            target_id=hyp_id,
            edge_type=edge_type,
            metadata={"reason": reason, "log_lr": log_lr},
            created_by="hypothesis_engine",
            created_at=datetime.now().isoformat(),
        )
        self.edges.append(edge)
        self._adj.setdefault(phenomenon_id, []).append(edge)
        self._adj_rev.setdefault(hyp_id, []).append(edge)
        self._auto_save()
    # If this is a coref hypothesis, mirror the new confidence into the
    # entity-level same_as edge. Done OUTSIDE the lock — _sync_same_as_edge
    # re-acquires it internally — so we avoid reentrant locking.
    if hyp.coref_entity_pair and len(hyp.coref_entity_pair) == 2:
        await self._sync_same_as_edge(
            hyp.coref_entity_pair[0],
            hyp.coref_entity_pair[1],
            hyp_id,
        )
    return new_conf
# ---- Cross-source entity coreference (DESIGN.md §4.6) -------------------
@staticmethod
def _coref_hypothesis_id(eid_a: str, eid_b: str) -> str:
    """Return the deterministic coref-hypothesis id for an entity pair.

    The two ids are sorted before hashing, so argument order is
    irrelevant: the same pair always yields the same id and repeated
    observations land on one hypothesis instead of spawning duplicates.
    """
    ordered_pair = sorted((eid_a, eid_b))
    digest = hashlib.sha256("|".join(ordered_pair).encode()).hexdigest()
    return f"hyp-coref-{digest[:10]}"
async def get_or_create_coref_hypothesis(
self, eid_a: str, eid_b: str,
) -> tuple[str, bool]:
"""Look up (or insert) the coreference hypothesis for an entity pair.
Uses a low prior (``prior_prob=0.1``) — saying any two entities are
the same actor is a strong claim, so the default should be
skeptical and let evidence move the needle.
"""
hid = self._coref_hypothesis_id(eid_a, eid_b)
async with self._lock:
if hid in self.hypotheses:
return hid, False
ea = self.entities.get(eid_a)
eb = self.entities.get(eid_b)
if ea is None or eb is None:
raise ValueError(f"Unknown entity in coref pair: {eid_a}, {eid_b}")
l_prior = prob_to_log_odds(0.1)
self.hypotheses[hid] = Hypothesis(
id=hid,
title=f"Coreference: {ea.name}{eb.name}",
description=(
f"Hypothesis that {ea.id} ({ea.name}, {ea.entity_type}) "
f"and {eb.id} ({eb.name}, {eb.entity_type}) refer to "
f"the same actor across evidence sources."
),
prior_prob=0.1,
log_odds=l_prior,
confidence=log_odds_to_prob(l_prior),
status="active",
created_by="coref_engine",
created_at=datetime.now().isoformat(),
coref_entity_pair=sorted([eid_a, eid_b]),
)
self._auto_save()
return hid, True
async def _sync_same_as_edge(
    self, eid_a: str, eid_b: str, hyp_id: str,
) -> None:
    """Bring the ``same_as`` edge for an entity pair in line with its hypothesis.

    - Confidence ≥ 0.8 → an active ``same_as`` edge must exist (created, or
      re-activated if it was previously switched off).
    - Confidence < 0.8 → an existing active edge is flagged inactive; it is
      kept for the audit trail rather than deleted.

    Does nothing when ``hyp_id`` is unknown. Repeating the call in the
    same state makes no further changes to the edge.
    """
    hypothesis = self.hypotheses.get(hyp_id)
    if hypothesis is None:
        return
    should_link = hypothesis.confidence >= 0.8
    async with self._lock:
        # The edge is matched irrespective of direction; first hit wins.
        prior = next(
            (
                candidate
                for candidate in self.edges
                if candidate.edge_type == "same_as"
                and {candidate.source_id, candidate.target_id} == {eid_a, eid_b}
            ),
            None,
        )
        if should_link and prior is None:
            new_id = f"edge-{uuid.uuid4().hex[:8]}"
            link = Edge(
                id=new_id,
                source_id=eid_a,
                target_id=eid_b,
                edge_type="same_as",
                metadata={
                    "backed_by": hyp_id,
                    "active": True,
                    "confidence_at_creation": hypothesis.confidence,
                },
                created_by="coref_engine",
                created_at=datetime.now().isoformat(),
            )
            self.edges.append(link)
            self._adj.setdefault(eid_a, []).append(link)
            self._adj_rev.setdefault(eid_b, []).append(link)
        elif should_link:
            if not prior.metadata.get("active"):
                prior.metadata["active"] = True
                prior.metadata["reactivated_at"] = datetime.now().isoformat()
        elif prior is not None and prior.metadata.get("active"):
            prior.metadata["active"] = False
            prior.metadata["deactivated_at"] = datetime.now().isoformat()
        self._auto_save()
async def observe_identity(
    self,
    entity_name: str,
    entity_type: str,
    identifier_type: str,
    value: str,
    source_agent: str,
    invocation_id: str,
    source_tool: str = "",
    task_id: str | None = None,
) -> dict:
    """Record a typed identifier for an entity through the grounding gateway.

    DESIGN.md §4.6. Steps, in the order the code performs them:

    1. Reject unknown ``identifier_type`` values and empty ``value``
       (``ValueError``).
    2. Get-or-create the entity via ``add_entity``.
    3. Record an ``identity_observation`` phenomenon via
       ``add_phenomenon``, carrying the identifier as its sole verified
       fact; grounding of ``invocation_id`` + ``value`` is delegated to
       that call.
    4. Attach the identifier to the entity (idempotent by
       ``(type, normalized_value)``).
    5. If the attachment is new, call
       ``_propose_coref_for_new_identifier`` to propose / strengthen /
       weaken coref hypotheses against other entities.

    NOTE(review): the entity is created in step 2 *before* the grounding
    check in step 3 runs, so an observation rejected by ``add_phenomenon``
    would leave a newly created entity behind — confirm this is intended.

    Returns a dict with keys ``entity_id``, ``phenomenon_id``,
    ``new_identifier`` (bool) and ``coref_proposals`` (list, empty when the
    identifier was already attached).
    """
    if identifier_type not in (STRONG_IDENTIFIER_TYPES | WEAK_IDENTIFIER_TYPES):
        raise ValueError(
            f"Unknown identifier_type: {identifier_type}. "
            f"Strong: {sorted(STRONG_IDENTIFIER_TYPES)}; "
            f"Weak: {sorted(WEAK_IDENTIFIER_TYPES)}."
        )
    if not value:
        raise ValueError("identifier value must be non-empty")
    # add_phenomenon enforces the grounding contract for the fact below.
    # An explicit task_id wins; otherwise fall back to the graph's current
    # run context.
    active_task = task_id if task_id is not None else self._current_task_id
    fact = {"type": identifier_type, "value": value, "invocation_id": invocation_id}
    # Get-or-create entity first so we can attribute the observation.
    eid, _existed = await self.add_entity(entity_name, entity_type)
    norm = _normalize_identifier(identifier_type, value)
    title = f"{identifier_type}={value} on {entity_name}"
    # pid is whatever id add_phenomenon returns (first element of its
    # result tuple); the second element is unused here.
    pid, _merged = await self.add_phenomenon(
        source_agent=source_agent,
        category="identity_observation",
        title=title,
        interpretation=(
            f"Agent attributed identifier {identifier_type}={value} "
            f"(normalized={norm}) to entity {entity_name} ({entity_type})."
        ),
        verified_facts=[fact],
        source_tool=source_tool,
        task_id=active_task,
    )
    # Attach identifier to entity (idempotent on type + normalized value).
    new_identifier = False
    async with self._lock:
        ent = self.entities[eid]
        if not ent.has_identifier(identifier_type, norm):
            # Both the raw and the normalized value are stored: the raw
            # one for display, the normalized one for matching.
            ent.identifiers.append({
                "type": identifier_type,
                "value": value,
                "normalized": norm,
                "strong": is_strong_identifier(identifier_type),
                "invocation_id": invocation_id,
                "phenomenon_id": pid,
                "observed_at": datetime.now().isoformat(),
            })
            new_identifier = True
            self._auto_save()
    # Coref proposal runs outside the lock; the helpers it awaits take the
    # lock themselves.
    coref_proposals: list[dict] = []
    if new_identifier:
        coref_proposals = await self._propose_coref_for_new_identifier(
            new_eid=eid,
            new_type=identifier_type,
            new_norm=norm,
            new_phenomenon_id=pid,
        )
    return {
        "entity_id": eid,
        "phenomenon_id": pid,
        "new_identifier": new_identifier,
        "coref_proposals": coref_proposals,
    }
async def _propose_coref_for_new_identifier(
    self,
    new_eid: str,
    new_type: str,
    new_norm: str,
    new_phenomenon_id: str,
) -> list[dict]:
    """Propose coref hypotheses for entities sharing a newly attached identifier.

    For every *other* entity that already carries ``(new_type, new_norm)``:

    1. Collect conflicts — strong identifiers of the same type whose
       normalized values differ between the two entities.
    2. Get-or-create the pair's coref hypothesis.
    3. Apply one ``shared_strong_identifier`` / ``shared_weak_identifier``
       edge anchored to ``new_phenomenon_id``.
    4. Apply one ``conflicting_strong_identifier`` edge per conflict,
       anchored to the phenomenon that recorded the new entity's side of
       the conflict (conflicts without a phenomenon id are skipped).
    5. Re-sync the pair's ``same_as`` edge.

    Candidate selection is a single pass over a snapshot of all entities;
    the blocking key is the new identifier itself.

    Returns one summary dict per candidate pair (``hypothesis_id``,
    ``other_entity_id``, ``match``, ``conflicts``, ``confidence``), or an
    empty list if ``new_eid`` is unknown.
    """
    new_ent = self.entities.get(new_eid)
    if new_ent is None:
        return []
    is_strong_new = is_strong_identifier(new_type)
    match_edge = "shared_strong_identifier" if is_strong_new else "shared_weak_identifier"
    proposals: list[dict] = []
    # list(...) snapshots the entity table so awaits inside the loop cannot
    # invalidate the iteration if entities are added meanwhile.
    for other_eid, other_ent in list(self.entities.items()):
        if other_eid == new_eid:
            continue
        # Match: other entity carries the same (type, normalized).
        if not other_ent.has_identifier(new_type, new_norm):
            continue
        # Collect conflicting strong identifiers between the pair —
        # they'll fire negative-LR edges on the same coref hypothesis.
        conflicts: list[dict] = []
        for a_ident in new_ent.identifiers:
            if not a_ident.get("strong"):
                continue
            for b_ident in other_ent.identifiers:
                if (b_ident.get("type") == a_ident.get("type")
                        and b_ident.get("strong")
                        and b_ident.get("normalized") != a_ident.get("normalized")):
                    conflicts.append({
                        "type": a_ident.get("type"),
                        "new_value": a_ident.get("value"),
                        "other_value": b_ident.get("value"),
                        "new_phenomenon_id": a_ident.get("phenomenon_id"),
                    })
        hid, _created = await self.get_or_create_coref_hypothesis(
            new_eid, other_eid,
        )
        # +shared identifier edge (one per identifier, anchored to the
        # newly recorded observation phenomenon). update_hypothesis_
        # confidence is idempotent on (ph, hyp, edge_type), so re-running
        # the same observation does not double-count.
        await self.update_hypothesis_confidence(
            hid, new_phenomenon_id, match_edge,
            reason=f"shared {new_type}={new_norm}",
        )
        # conflicting strong identifier edges — one per conflict, anchored
        # to the *new* entity's observation phenomenon for that identifier.
        # Because of the (ph, hyp, edge_type) idempotency, several
        # conflicts sharing one anchor phenomenon count only once.
        for c in conflicts:
            ph_src = c["new_phenomenon_id"]
            if not ph_src:
                continue
            await self.update_hypothesis_confidence(
                hid, ph_src, "conflicting_strong_identifier",
                reason=(
                    f"conflict {c['type']}: "
                    f"{c['new_value']} vs {c['other_value']}"
                ),
            )
        # update_hypothesis_confidence already syncs on a real update; this
        # explicit call also covers the case where every update above was
        # an idempotent no-op.
        await self._sync_same_as_edge(new_eid, other_eid, hid)
        proposals.append({
            "hypothesis_id": hid,
            "other_entity_id": other_eid,
            "match": {"type": new_type, "normalized": new_norm,
                      "edge_type": match_edge},
            "conflicts": conflicts,
            "confidence": self.hypotheses[hid].confidence,
        })
    return proposals
# ---- Cross-source entity cluster queries (DESIGN.md §4.6) ----------------
def _active_same_as_neighbors(self, entity_id: str) -> set[str]:
    """Collect the ids linked to *entity_id* by a live ``same_as`` edge.

    ``same_as`` edges are never deleted: an edge whose backing hypothesis
    fell below threshold carries ``metadata['active'] = False`` and is
    ignored here. Edges without an ``active`` key count as active. The
    edge direction is irrelevant.
    """
    neighbours: set[str] = set()
    live_links = (
        link
        for link in self.edges
        if link.edge_type == "same_as" and link.metadata.get("active", True)
    )
    for link in live_links:
        if link.source_id == entity_id:
            neighbours.add(link.target_id)
        elif link.target_id == entity_id:
            neighbours.add(link.source_id)
    return neighbours
def resolve_actor_cluster(self, entity_id: str) -> set[str]:
    """Return every entity id reachable from *entity_id* over active ``same_as``.

    The result is the connected component that current coref evidence
    treats as one actor; it always contains *entity_id* itself. Unknown
    ids yield an empty set. Since inactive edges are skipped, the answer
    tracks the present state of the graph and a deactivated link splits
    the component again.
    """
    if entity_id not in self.entities:
        return set()

    component: set[str] = {entity_id}
    pending: list[str] = [entity_id]
    while pending:
        current = pending.pop()
        fresh = self._active_same_as_neighbors(current) - component
        component |= fresh
        pending.extend(fresh)
    return component
def actor_clusters(self) -> list[dict]:
    """Group all entities into actor clusters via active ``same_as`` edges.

    Returns a list of
    ``{members: [...], identifiers: [...], coref_hypotheses: [...]}`` dicts
    for the report agent and the orchestrator's cross-source views.

    The output is deterministic for a given graph: clusters appear in the
    order their first member was inserted into ``self.entities``, and
    identifiers are gathered member-by-member in sorted id order, so when
    two members share an identifier the ``value`` / ``on_entity`` reported
    is always that of the lowest entity id. (Driving this from set
    iteration would let the result vary between interpreter runs.)
    """
    # Scan the edge list once instead of once per cluster.
    live_links = [
        edge
        for edge in self.edges
        if edge.edge_type == "same_as" and edge.metadata.get("active", True)
    ]

    assigned: set[str] = set()
    clusters: list[dict] = []
    for start in self.entities:
        if start in assigned:
            continue
        members = self.resolve_actor_cluster(start)
        assigned |= members
        ordered_members = sorted(members)

        # Aggregate identifiers across the cluster, deduped on
        # (type, normalized); first occurrence in sorted-member order wins.
        ident_seen: set[tuple[str, str]] = set()
        idents: list[dict] = []
        for eid in ordered_members:
            for ident in self.entities[eid].identifiers:
                key = (ident.get("type"), ident.get("normalized"))
                if key in ident_seen:
                    continue
                ident_seen.add(key)
                idents.append({
                    "type": ident.get("type"),
                    "value": ident.get("value"),
                    "strong": ident.get("strong"),
                    "on_entity": eid,
                })

        # Hypotheses backing any live same_as edge that touches the
        # cluster; edges without a "backed_by" entry are dropped.
        backing = {
            edge.metadata.get("backed_by", "")
            for edge in live_links
            if edge.source_id in members or edge.target_id in members
        }
        backing.discard("")

        clusters.append({
            "members": ordered_members,
            "identifiers": idents,
            "coref_hypotheses": sorted(backing),
        })
    return clusters
# ---- Lead management (same as old Blackboard) ----------------------------
async def add_lead(
    self,
    target_agent: str,
    description: str,
    priority: int = 5,
    context: dict | None = None,
    hypothesis_id: str | None = None,
) -> str:
    """Queue a new lead for ``target_agent`` and return its generated id.

    A missing or empty ``context`` is stored as a fresh empty dict.
    """
    async with self._lock:
        lead_id = f"lead-{uuid.uuid4().hex[:8]}"
        new_lead = Lead(
            id=lead_id,
            target_agent=target_agent,
            description=description,
            priority=priority,
            context=context or {},
            hypothesis_id=hypothesis_id,
        )
        self.leads.append(new_lead)
        self._auto_save()
        return lead_id
async def get_pending_leads(self, agent_type: str | None = None) -> list[Lead]:
    """Return pending leads ordered by ascending ``priority`` value.

    When ``agent_type`` is given (and non-empty) only leads addressed to
    that agent are returned. Leads with equal priority keep their
    insertion order.
    """
    async with self._lock:
        selected = [
            lead
            for lead in self.leads
            if lead.status == "pending"
            and (not agent_type or lead.target_agent == agent_type)
        ]
        selected.sort(key=lambda lead: lead.priority)
        return selected
async def mark_lead_completed(self, lead_id: str) -> None:
    """Set the first lead with id ``lead_id`` to ``completed`` and save.

    An unknown id changes nothing, but the graph is still saved.
    """
    async with self._lock:
        target = next((item for item in self.leads if item.id == lead_id), None)
        if target is not None:
            target.status = "completed"
        self._auto_save()
async def mark_lead_failed(self, lead_id: str, error: str = "") -> None:
    """Set the first lead with id ``lead_id`` to ``failed`` and save.

    ``error`` is stored under ``context['failure_reason']`` of that lead.
    An unknown id changes nothing, but the graph is still saved.
    """
    async with self._lock:
        target = next((item for item in self.leads if item.id == lead_id), None)
        if target is not None:
            target.status = "failed"
            target.context["failure_reason"] = error
        self._auto_save()
# ---- Investigation areas -------------------------------------------------
async def add_investigation_area(
    self,
    area: str,
    description: str,
    suggested_agent: str,
    expected_keywords: list[str] | None = None,
    expected_tools: list[str] | None = None,
    priority: int = 5,
    motivating_hypothesis_ids: list[str] | None = None,
    created_by: str = "",
) -> tuple[str, bool]:
    """Create an investigation area, or merge into the one with the same slug.

    The dedupe key is ``area``. On a collision only the three list fields
    (keywords, tools, motivating hypothesis ids) grow, by appending values
    not already present; ``description``, ``suggested_agent`` and
    ``priority`` stay as the first writer set them (a manual seed wins
    over a later LLM-derived entry).

    Returns ``(area_id, was_existing)``.
    """

    def _append_missing(destination: list[str], incoming: list[str] | None) -> None:
        # Order-preserving union; also drops repeats within ``incoming``.
        for item in incoming or []:
            if item not in destination:
                destination.append(item)

    async with self._lock:
        current = next(
            (
                record
                for record in self.investigation_areas.values()
                if record.area == area
            ),
            None,
        )
        if current is not None:
            _append_missing(current.expected_keywords, expected_keywords)
            _append_missing(current.expected_tools, expected_tools)
            _append_missing(
                current.motivating_hypothesis_ids, motivating_hypothesis_ids,
            )
            self._auto_save()
            return current.id, True

        area_id = f"area-{area}"
        self.investigation_areas[area_id] = InvestigationArea(
            id=area_id,
            area=area,
            description=description,
            suggested_agent=suggested_agent,
            expected_keywords=list(expected_keywords or []),
            expected_tools=list(expected_tools or []),
            priority=priority,
            motivating_hypothesis_ids=list(motivating_hypothesis_ids or []),
            created_by=created_by,
            created_at=datetime.now().isoformat(),
        )
        self._auto_save()
        return area_id, False
# ---- Tool invocation log -------------------------------------------------
async def record_tool_invocation(
    self,
    tool: str,
    args: dict,
    output: str,
    cached: bool = False,
) -> str:
    """Log one tool call and return its invocation id.

    Agent, task id and source id are taken from the graph's current run
    context rather than from the caller, so executors can stay stateless.
    The record keeps a shallow copy of ``args`` and a SHA-256 of the
    UTF-8 encoded output.
    """
    invocation_id = f"inv-{uuid.uuid4().hex[:8]}"
    source = self.active_source
    source_id = source.id if source else ""
    digest = hashlib.sha256(output.encode("utf-8", errors="replace")).hexdigest()
    record = ToolInvocation(
        id=invocation_id,
        tool=tool,
        args=dict(args),
        output=output,
        output_sha256=digest,
        agent=self._current_agent or "unknown",
        task_id=self._current_task_id or "",
        source_id=source_id,
        created_at=datetime.now().isoformat(),
        cached=cached,
    )
    async with self._lock:
        self.tool_invocations[invocation_id] = record
    # Deliberately no auto-save: rewriting the state file on every tool
    # call would be too noisy, and the next phenomenon write flushes it.
    return invocation_id
def validate_fact_grounding(
    self,
    fact: dict,
    agent: str,
    task_id: str,
) -> tuple[bool, str]:
    """Check a single verified_fact's grounding. Returns ``(ok, reason)``.

    Rules (DESIGN.md §4.4), checked in this order:

    1. ``fact['invocation_id']`` must be present and exist in
       ``self.tool_invocations``.
    2. The invocation must have been made by ``agent``; when both the
       invocation and the caller carry a task id, they must match.
    3. ``fact['value']`` must be a string with at least one
       non-whitespace character.
    4. The value must appear in the invocation output — either as a strict
       substring (reason ``"ok"``), or after both sides pass through
       ``_normalize_for_grounding`` (reason ``"ok-normalized"``).

    Rule 3 and the non-empty check on the normalised needle exist because
    the empty string is a substring of every string: without them a
    whitespace-only value, or one that normalises to nothing, would be
    accepted as "grounded" even though it cites no content at all.
    """
    inv_id = fact.get("invocation_id", "")
    value = fact.get("value", "")
    if not inv_id:
        return False, "missing invocation_id"
    inv = self.tool_invocations.get(inv_id)
    if inv is None:
        return False, f"invocation_id {inv_id} not found in invocation log"
    if inv.agent != agent:
        return False, (
            f"invocation {inv_id} was made by agent '{inv.agent}', "
            f"not '{agent}' — cannot be cited by a different agent"
        )
    # Task scope is only enforced when both sides actually have a task id.
    if task_id and inv.task_id and inv.task_id != task_id:
        return False, (
            f"invocation {inv_id} was made in a different task scope "
            f"({inv.task_id}) — cite only invocations from your current task"
        )
    if not isinstance(value, str) or not value.strip():
        return False, "fact.value must be a non-empty string"
    if value in inv.output:
        return True, "ok"
    # Loose fallback: normalised comparison absorbs presentation
    # differences, but a genuinely absent value still fails. An empty
    # needle is refused explicitly — it would match any output.
    needle = _normalize_for_grounding(value)
    if needle and needle in _normalize_for_grounding(inv.output):
        return True, "ok-normalized"
    return False, (
        f"fact.value not found in invocation {inv_id} output — even after "
        f"case/whitespace/path-sep normalisation. Copy a literal substring "
        f"from that tool's result; if the content is a guess (device model, "
        f"constructed path, label-joined value), move it into `interpretation` "
        f"instead of `verified_facts`."
    )
# ---- Asset library -------------------------------------------------------
async def register_asset(
    self,
    inode: str,
    original_path: str,
    local_path: str,
    category: str,
    filename: str,
    size_bytes: int,
    extracted_by: str,
) -> tuple[str, bool]:
    """Add an extracted file to the asset library, deduplicated by inode.

    Returns ``(asset_id, already_existed)``. For an inode that is already
    indexed, the stored id is returned and nothing is written or saved.
    """
    async with self._lock:
        index = self._inode_index
        if inode in index:
            return index[inode], True

        asset_id = f"asset-{uuid.uuid4().hex[:8]}"
        record = ExtractedAsset(
            id=asset_id,
            inode=inode,
            original_path=original_path,
            local_path=local_path,
            category=category,
            filename=filename,
            size_bytes=size_bytes,
            extracted_by=extracted_by,
            extracted_at=datetime.now().isoformat(),
        )
        self.asset_library[asset_id] = record
        index[inode] = asset_id
        self._auto_save()
        return asset_id, False
def lookup_asset_by_inode(self, inode: str) -> ExtractedAsset | None:
    """Return the extracted asset registered for ``inode``, or ``None``.

    Synchronous read that does not take the graph lock.
    """
    asset_id = self._inode_index.get(inode)
    if not asset_id:
        return None
    return self.asset_library.get(asset_id)
def list_assets(self, category: str | None = None) -> list[str]:
    """Return the ``summary()`` line of every asset, in library order.

    A non-empty ``category`` restricts the listing to assets of exactly
    that category.
    """
    return [
        asset.summary()
        for asset in self.asset_library.values()
        if not category or asset.category == category
    ]
def query_assets(
    self,
    category: str | None = None,
    filename_pattern: str | None = None,
) -> list[ExtractedAsset]:
    """Return assets matching every filter that was supplied.

    ``category`` is an exact match; ``filename_pattern`` is a
    case-insensitive substring of the asset's filename. Filters that are
    ``None`` or empty are ignored.
    """
    needle = filename_pattern.lower() if filename_pattern else None

    def _matches(asset) -> bool:
        if category and asset.category != category:
            return False
        if needle is not None and needle not in asset.filename.lower():
            return False
        return True

    return [asset for asset in self.asset_library.values() if _matches(asset)]
# ---- Query methods (for agent tools) ------------------------------------
def list_phenomena(self, category: str | None = None) -> list[str]:
    """Return the ``summary()`` line of every phenomenon, in graph order.

    A non-empty ``category`` restricts the listing to phenomena of exactly
    that category.
    """
    return [
        phenomenon.summary()
        for phenomenon in self.phenomena.values()
        if not category or phenomenon.category == category
    ]
def get_phenomenon(self, ph_id: str) -> dict | None:
    """Return the ``to_dict()`` form of a phenomenon, or ``None`` if unknown."""
    phenomenon = self.phenomena.get(ph_id)
    if not phenomenon:
        return None
    return phenomenon.to_dict()
def search_graph(self, keyword: str) -> list[str]:
    """Case-insensitive substring search over phenomena, hypotheses, entities.

    Phenomena are matched on title, interpretation and the stringified
    ``value`` of each verified fact; hypotheses on title and description;
    entities on name and description. Returns node summaries grouped in
    that order.
    """
    needle = keyword.lower()
    hits: list[str] = []

    for phenomenon in self.phenomena.values():
        fact_text = " ".join(
            str(fact.get("value", "")).lower()
            for fact in phenomenon.verified_facts
        )
        searchable = " ".join(
            (phenomenon.title.lower(), phenomenon.interpretation.lower(), fact_text)
        )
        if needle in searchable:
            hits.append(phenomenon.summary())

    hits.extend(
        hypothesis.summary()
        for hypothesis in self.hypotheses.values()
        if needle in hypothesis.title.lower()
        or needle in hypothesis.description.lower()
    )
    hits.extend(
        entity.summary()
        for entity in self.entities.values()
        if needle in entity.name.lower() or needle in entity.description.lower()
    )
    return hits
def get_related(
    self,
    node_id: str,
    edge_type: str | None = None,
    direction: str = "both",
) -> list[dict]:
    """List the nodes connected to ``node_id``.

    ``direction`` is ``"out"``, ``"in"`` or ``"both"``; any other value
    yields an empty list. A non-empty ``edge_type`` keeps only edges of
    that type. Each entry is a dict with ``node`` (the neighbour's
    summary), ``edge_type``, ``direction`` (``"outgoing"`` /
    ``"incoming"``) and the edge ``metadata``. Edges whose far end cannot
    be resolved by ``get_node`` are skipped. Outgoing entries precede
    incoming ones.
    """
    # (selector, adjacency index, attribute naming the far end, label)
    passes = (
        ("out", self._adj, "target_id", "outgoing"),
        ("in", self._adj_rev, "source_id", "incoming"),
    )
    related: list[dict] = []
    for selector, index, far_end, label in passes:
        if direction not in (selector, "both"):
            continue
        for link in index.get(node_id, []):
            if edge_type and link.edge_type != edge_type:
                continue
            neighbour = self.get_node(getattr(link, far_end))
            if not neighbour:
                continue
            related.append({
                "node": neighbour.summary(),
                "edge_type": link.edge_type,
                "direction": label,
                "metadata": link.metadata,
            })
    return related
def get_hypothesis_status(self) -> list[str]:
    """Return the ``summary()`` line of every hypothesis, in graph order."""
    summaries: list[str] = []
    for hypothesis in self.hypotheses.values():
        summaries.append(hypothesis.summary())
    return summaries
def get_phenomena_by_category(self, category: str) -> list[Phenomenon]:
    """Return the phenomena whose ``category`` equals ``category`` exactly."""
    matching = filter(
        lambda phenomenon: phenomenon.category == category,
        self.phenomena.values(),
    )
    return list(matching)
def hypotheses_converged(self) -> bool:
    """Report whether no hypothesis is still ``active``.

    A graph without any hypotheses counts as converged.
    """
    return not any(
        hypothesis.status == "active" for hypothesis in self.hypotheses.values()
    )
def mark_remaining_inconclusive(self) -> None:
    """Flip every hypothesis that is still ``active`` to ``inconclusive``.

    Hypotheses in any other status are left untouched.
    """
    still_open = [
        hypothesis
        for hypothesis in self.hypotheses.values()
        if hypothesis.status == "active"
    ]
    for hypothesis in still_open:
        hypothesis.status = "inconclusive"
# ---- Hypothesis × Evidence matrix (DESIGN.md §4.5) -----------------------
def hypothesis_evidence_matrix(self) -> dict:
    """Pivot every phenomenon→hypothesis edge into a structured matrix.

    Only edges that run from a ``ph-`` id to a ``hyp-`` id and whose type
    is a key of ``self.edge_log_lr`` are considered. The result has four
    keys:

    - ``hypotheses``: id / title / confidence / log_odds / status for
      every hypothesis.
    - ``phenomena``: id / title / category for each phenomenon referenced
      by at least one cell.
    - ``cells``: ``"<hyp_id>|<ph_id>"`` → list of edge types (a single
      phenomenon may link to a hypothesis through several edge types).
    - ``counts_by_edge_type``: ``hyp_id`` → ``{edge_type: count}``.
    """
    pair_types: dict[tuple[str, str], list[str]] = {}
    tally: dict[str, dict[str, int]] = {hyp_id: {} for hyp_id in self.hypotheses}

    for link in self.edges:
        is_evidence_link = (
            link.source_id.startswith("ph-")
            and link.target_id.startswith("hyp-")
            and link.edge_type in self.edge_log_lr
        )
        if not is_evidence_link:
            continue
        pair_types.setdefault((link.target_id, link.source_id), []).append(
            link.edge_type
        )
        per_hypothesis = tally.setdefault(link.target_id, {})
        per_hypothesis[link.edge_type] = per_hypothesis.get(link.edge_type, 0) + 1

    hypothesis_rows = []
    for hypothesis in self.hypotheses.values():
        hypothesis_rows.append({
            "id": hypothesis.id,
            "title": hypothesis.title,
            "confidence": hypothesis.confidence,
            "log_odds": hypothesis.log_odds,
            "status": hypothesis.status,
        })

    cited = {ph_id for _hyp_id, ph_id in pair_types}
    phenomenon_rows = [
        {"id": item.id, "title": item.title, "category": item.category}
        for item in self.phenomena.values()
        if item.id in cited
    ]

    flat_cells = {
        f"{hyp_id}|{ph_id}": kinds for (hyp_id, ph_id), kinds in pair_types.items()
    }
    return {
        "hypotheses": hypothesis_rows,
        "phenomena": phenomenon_rows,
        "cells": flat_cells,
        "counts_by_edge_type": tally,
    }
def hypothesis_evidence_matrix_markdown(self) -> str:
    """Render the hypothesis × evidence matrix as a compact markdown table.

    One row per hypothesis. Columns are the hypothesis title, one count
    column per edge type (sorted by name), then ``log_odds``, ``conf`` and
    ``status``. Returns ``"(no hypotheses)"`` when the graph has none.

    Hypothesis titles are free text, so they are made table-safe before
    rendering: line breaks become spaces and ``|`` is escaped as ``\\|``.
    Without that, a single title containing a pipe or newline would shift
    or split the columns of its row.
    """
    if not self.hypotheses:
        return "(no hypotheses)"

    matrix = self.hypothesis_evidence_matrix()
    edge_types = sorted(self.edge_log_lr.keys())

    def _row(cells: list[str]) -> str:
        # Every line is assembled from one list of cells so the header,
        # separator and data rows cannot disagree on the column count,
        # even when there are no edge types at all.
        return "| " + " | ".join(cells) + " |"

    def _table_safe(text: object) -> str:
        single_line = " ".join(str(text).splitlines())
        return single_line.replace("|", "\\|")

    alignment = ["---", *(["---:"] * len(edge_types)), "---:", "---:", "---"]
    lines = [
        _row(["Hypothesis", *edge_types, "log_odds", "conf", "status"]),
        "|" + "|".join(alignment) + "|",
    ]
    for hyp in matrix["hypotheses"]:
        per_type = matrix["counts_by_edge_type"].get(hyp["id"], {})
        lines.append(_row([
            _table_safe(hyp["title"]),
            *(str(per_type.get(edge_type, 0)) for edge_type in edge_types),
            f"{hyp['log_odds']:+.2f}",
            f"{hyp['confidence']:.2f}",
            str(hyp["status"]),
        ]))
    return "\n".join(lines)
# ---- Summary (lightweight, for system prompt) ----------------------------
def stats_summary(self) -> str:
    """Return a short counts-only description of the graph.

    Kept deliberately small so it can be embedded in a system prompt:
    phenomena, hypotheses (with the active count), entities, edges,
    extracted assets and pending leads.
    """
    active_count = len(
        [h for h in self.hypotheses.values() if h.status == "active"]
    )
    pending_count = len([lead for lead in self.leads if lead.status == "pending"])
    graph_part = (
        f"Graph: {len(self.phenomena)} phenomena, "
        f"{len(self.hypotheses)} hypotheses ({active_count} active), "
        f"{len(self.entities)} entities, {len(self.edges)} edges. "
    )
    asset_part = f"Asset library: {len(self.asset_library)} extracted files. "
    lead_part = f"Pending leads: {pending_count}."
    return graph_part + asset_part + lead_part