DESIGN_STRATEGIST.md §1. Foundation for the Phase 3 strategist loop. Lead now carries four annotations that let the orchestrator measure marginal yield per lead and dedupe strategist proposals: - proposed_by (agent that proposed it: "strategist", "filesystem", …) - motivating_hypothesis (hyp-id the lead is meant to corroborate/refute) - expected_evidence_type (edge type the lead's worker should produce) - round_number (0 = Phase 1 lead, ≥1 = strategist-proposed) add_lead idempotently dedupes strategist proposals on (motivating_hypothesis, expected_evidence_type, target_agent, source_id) to prevent the "strategist loops on the same lead" failure mode. New InvestigationRound dataclass records per-round provenance: before/after hypothesis status snapshots, phenomena + edge count deltas, and the strategist's decision_rationale. ``new_phenomena_count``, ``new_edges_count``, ``status_flips`` are derived properties that the marginal_yield tool will use. start_investigation_round / complete_investigation_round / get_investigation_round / latest_round / leads_from_round complete the lifecycle. complete is idempotent on already-closed rounds. Lead.from_dict is forward-compat for state files written before this commit. InvestigationRound persists as a top-level list in graph_state.json (auto-save + load_state both wired). EvidenceGraph also gains graph.budgets and graph.run_start_monotonic fields that the budget_status view (S2) will read; orchestrator populates them in S5. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2183 lines
87 KiB
Python
2183 lines
87 KiB
Python
"""Evidence Knowledge Graph for multi-agent forensic analysis.
|
||
|
||
Replaces the flat Blackboard with a graph-based evidence store.
|
||
Nodes: Phenomenon (observable artifacts), Hypothesis (interpretive claims), Entity (recurring objects).
|
||
Edges: typed relationships with predefined weights for hypothesis confidence computation.
|
||
"""
|
||
|
||
from __future__ import annotations
|
||
|
||
import asyncio
|
||
import contextvars
|
||
import hashlib
|
||
import json
|
||
import logging
|
||
import re
|
||
import uuid
|
||
from dataclasses import asdict, dataclass, field
|
||
from datetime import datetime
|
||
from pathlib import Path
|
||
|
||
# Per-asyncio-task scoped values for "which agent is currently running" and
|
||
# "which task scope does that agent's grounding live in". Backed by
|
||
# ContextVars so concurrent agent runs (Phase 3's _dispatch_leads_parallel)
|
||
# don't clobber each other — asyncio.create_task / asyncio.gather copies
|
||
# the parent context per child task, and writes inside one task stay there.
|
||
# Pre-P0 these were plain attributes on EvidenceGraph; the last setter won
|
||
# under concurrency, tagging tool invocations with the WRONG agent and
|
||
# making the grounding gateway falsely reject legitimate facts.
|
||
_current_agent_ctx: contextvars.ContextVar[str] = contextvars.ContextVar(
|
||
"masforensics_current_agent", default="",
|
||
)
|
||
_current_task_id_ctx: contextvars.ContextVar[str] = contextvars.ContextVar(
|
||
"masforensics_current_task_id", default="",
|
||
)
|
||
|
||
from case import Case, EvidenceSource, single_source_case
|
||
|
||
logger = logging.getLogger(__name__)
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# Per-edge-type log₁₀(LR) — the calibration table backing hypothesis
|
||
# confidence updates (DESIGN.md §4.5).
|
||
#
|
||
# The LLM only picks the *category* (direct_evidence, supports, …); the
|
||
# numerical contribution is looked up here. Updates use the additive,
|
||
# order-independent log-odds form
|
||
# L_post = L_prior + Σ log10(LR_i)
|
||
# confidence = 1 / (1 + 10^(−L_post))
|
||
# which fixes the pre-S3 delta-update bug whose result depended on the
|
||
# order edges arrived in.
|
||
#
|
||
# Override per-graph via EvidenceGraph(edge_log_lr=...) or config.yaml's
|
||
# `hypothesis_log_lr` section.
|
||
# ---------------------------------------------------------------------------
|
||
_DEFAULT_LOG_LR: dict[str, float] = {
|
||
"direct_evidence": +2.0,
|
||
"supports": +1.0,
|
||
"consequence_observed": +1.0,
|
||
"prerequisite_met": +0.5,
|
||
"weakens": -0.5,
|
||
"contradicts": -2.0,
|
||
# S5 cross-source coref (DESIGN.md §4.6) — same calibration scale.
|
||
# A single shared strong identifier (email, phone, wallet, IMEI…) is
|
||
# near-decisive; weak identifiers (nickname) accumulate slowly; a
|
||
# conflicting strong identifier is strong negative evidence.
|
||
"shared_strong_identifier": +2.0,
|
||
"shared_weak_identifier": +0.5,
|
||
"conflicting_strong_identifier": -2.0,
|
||
}
|
||
|
||
|
||
# DESIGN.md §4.6 identifier taxonomy. Strong identifiers approximate
|
||
# global uniqueness — sharing one is high-confidence coref evidence.
|
||
# Weak identifiers are nicknames / display names — accumulate via Bayes.
|
||
# Identifier kinds treated as (approximately) globally unique.
STRONG_IDENTIFIER_TYPES: set[str] = {
    "email", "phone_number",
    "imei", "imsi",
    "apple_id", "icloud_id", "google_account",
    "wallet_address",
    "udid", "mac_address", "device_serial",
}

# Identifier kinds that are free-form names / handles.
WEAK_IDENTIFIER_TYPES: set[str] = {
    "nickname", "display_name", "username", "screen_name",
}


def is_strong_identifier(identifier_type: str) -> bool:
    """Report whether *identifier_type* is listed in STRONG_IDENTIFIER_TYPES."""
    strong = identifier_type in STRONG_IDENTIFIER_TYPES
    return strong
|
||
|
||
|
||
def _normalize_identifier(identifier_type: str, value: str) -> str:
|
||
"""Canonicalise an identifier value so trivial spelling variants match.
|
||
|
||
- Lowercase for case-insensitive identifiers (email, hostnames, hex).
|
||
- Strip whitespace and the leading '+' on phone numbers / international
|
||
dialling, then keep only digits for phone matching.
|
||
- Pass-through for free-form strings (nicknames).
|
||
"""
|
||
v = (value or "").strip()
|
||
if identifier_type in {"email", "apple_id", "icloud_id", "google_account",
|
||
"mac_address", "wallet_address", "udid",
|
||
"imei", "imsi", "device_serial"}:
|
||
v = v.lower()
|
||
if identifier_type == "phone_number":
|
||
import re as _re
|
||
v = _re.sub(r"\D", "", v)
|
||
return v
|
||
|
||
|
||
def prob_to_log_odds(p: float) -> float:
    """Return the base-10 logit of *p*.

    *p* is first clipped into ``[1e-9, 1 - 1e-9]`` so the result is always
    finite.
    """
    import math

    capped_above = min(1 - 1e-9, float(p))
    clipped = max(1e-9, capped_above)
    odds = clipped / (1.0 - clipped)
    return math.log10(odds)
|
||
|
||
|
||
def log_odds_to_prob(log_odds: float) -> float:
    """Inverse of :func:`prob_to_log_odds`: 1 / (1 + 10^(−L)).

    Args:
        log_odds: Base-10 log-odds ``L``; coerced with ``float()``.

    Returns:
        The probability in ``[0.0, 1.0]``. Extremely negative ``L`` returns
        ``0.0`` rather than raising.
    """
    try:
        return 1.0 / (1.0 + 10.0 ** (-float(log_odds)))
    except OverflowError:
        # 10.0 ** x overflows a float for very negative log-odds (roughly
        # L < -308). The mathematical limit is 0, mirroring how very
        # positive L already underflows to a result of 1.0.
        return 0.0
|
||
|
||
|
||
_WS_RUN = re.compile(r"\s+")
|
||
|
||
|
||
def _normalize_for_grounding(s: str) -> str:
|
||
"""Canonicalise a string for the loose-match branch of fact grounding.
|
||
|
||
Strict ``value in inv.output`` rejected real evidence because the LLM
|
||
routinely normalises tool output before quoting:
|
||
- case-folds hex (``89 50 4e 47`` → ``89 50 4E 47``)
|
||
- flips path separators (``Sunny\\foo.exe`` → ``Sunny/foo.exe``)
|
||
- collapses whitespace across newlines (``AppleID:\\n alice@x.com``
|
||
→ ``AppleID: alice@x.com``)
|
||
|
||
None of those are hallucinations — they're presentation choices. This
|
||
normaliser does the inverse so both sides line up:
|
||
- lowercase everything (handles hex case + email case + MAC case)
|
||
- collapse any run of whitespace to a single space
|
||
- replace ``\\`` with ``/`` (path-sep flip)
|
||
|
||
Genuine fabrications still fail: a value that doesn't appear (in any
|
||
form) inside the output normalises to a string that isn't a substring
|
||
of the normalised output, and the gateway rejects exactly as before.
|
||
"""
|
||
if not s:
|
||
return ""
|
||
s = s.lower().replace("\\", "/")
|
||
s = _WS_RUN.sub(" ", s)
|
||
return s.strip()
|
||
|
||
class GroundingError(ValueError):
    """Grounding failure raised by the add_phenomenon gateway.

    Signals that at least one verified_fact did not pass the grounding
    check: its invocation_id was missing or wrong, it belonged to another
    agent or task, or fact.value was not found in the cited tool output.

    The rejected facts are kept on ``failures`` so a caller (BaseAgent) can
    feed them back to the LLM for a corrective retry.
    """

    def __init__(self, message: str, failures: list[dict]) -> None:
        # Attach the failing facts before delegating the message upward.
        self.failures = failures
        super().__init__(message)
|
||
|
||
|
||
# All valid edge types across the graph.
|
||
VALID_EDGE_TYPES: set[str] = {
    # Phenomenon → Hypothesis
    "direct_evidence",
    "supports",
    "prerequisite_met",
    "consequence_observed",
    "contradicts",
    "weakens",
    # Phenomenon → Hypothesis (S5 coref-specific: links identifier
    # observation phenomena to the "Entity A ≡ Entity B" coref hypothesis)
    "shared_strong_identifier",
    "shared_weak_identifier",
    "conflicting_strong_identifier",
    # Phenomenon → Phenomenon
    "temporal",
    "causal",
    "input_to",
    "modifies",
    "co_located",
    "corroborates",
    # Phenomenon → Entity
    "created_by",
    "executed_by",
    "owned_by",
    "targets",
    "associated_with",
    "found_on",
    "used_by",
    # Hypothesis → Hypothesis
    "refines",
    "conflicts",
    "depends_on",
    # Entity → Entity (S5 — backed by a coref hypothesis ≥ threshold)
    "same_as",
}
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# Graph node types
|
||
# ---------------------------------------------------------------------------
|
||
|
||
def _compute_quality_score(
|
||
source_tool: str,
|
||
timestamp: str | None,
|
||
raw_data: dict,
|
||
interpretation: str,
|
||
verified_facts: list[dict],
|
||
related_ids: list[str],
|
||
) -> float:
|
||
"""Compute a quality score (0.0-1.0) based on evidence completeness.
|
||
|
||
A grounded phenomenon (any verified_facts) outweighs a long free-text
|
||
interpretation: the facts carry provenance, the interpretation doesn't.
|
||
"""
|
||
score = 0.0
|
||
if source_tool:
|
||
score += 0.20
|
||
if timestamp is not None:
|
||
score += 0.15
|
||
if raw_data:
|
||
score += 0.15
|
||
if verified_facts:
|
||
# Capped contribution: 0.05 per fact up to 0.25.
|
||
score += min(0.25, 0.05 * len(verified_facts))
|
||
if len(interpretation) >= 50:
|
||
score += 0.10
|
||
if related_ids:
|
||
score += 0.15
|
||
return min(1.0, score)
|
||
|
||
|
||
def _jaccard_similarity(a: str, b: str) -> float:
|
||
"""Token-level Jaccard similarity between two strings."""
|
||
tokens_a = set(a.lower().split())
|
||
tokens_b = set(b.lower().split())
|
||
if not tokens_a or not tokens_b:
|
||
return 0.0
|
||
return len(tokens_a & tokens_b) / len(tokens_a | tokens_b)
|
||
|
||
|
||
@dataclass
class Phenomenon:
    """Raw observable artifact found on disk.

    Per DESIGN.md §4.4 a phenomenon has two parts: provenance-bound *facts*
    and a free-text *interpretation*. The gateway hard-validates each fact
    against the recorded tool invocation it cites. The interpretation is
    the agent's own narrative and is shown in the final report as "agent
    analysis", not as truth.
    """

    id: str  # "ph-{uuid8}"
    source_agent: str
    category: str  # filesystem, registry, email, network, timeline
    title: str
    # Agent reasoning in free form; never verified.
    interpretation: str = ""
    # Grounded atoms, each shaped {type, value, invocation_id}, with
    # type ∈ {path, timestamp, inode, hash, identifier, count, raw, ...}.
    verified_facts: list[dict] = field(default_factory=list)
    raw_data: dict = field(default_factory=dict)
    timestamp: str | None = None
    confidence: float = 1.0
    source_tool: str = ""
    source_id: str = ""  # id of the EvidenceSource this finding came from
    corroborating_agents: list[str] = field(default_factory=list)
    from_lead_id: str | None = None
    created_at: str = ""

    def to_dict(self) -> dict:
        """Return the dataclass fields as a plain dict."""
        return asdict(self)

    @classmethod
    def from_dict(cls, d: dict) -> Phenomenon:
        """Build a Phenomenon from a dict, migrating the old ``description``.

        Earlier runs stored the free text under ``description``. It is moved
        to ``interpretation`` (unless that key is already present) so old
        graph_state.json files still load. Keys that are not dataclass
        fields are dropped.
        """
        payload = dict(d)
        if "description" in payload:
            legacy_text = payload.pop("description")
            if "interpretation" not in payload:
                payload["interpretation"] = legacy_text or ""
        if "verified_facts" not in payload:
            payload["verified_facts"] = []
        field_names = set(cls.__dataclass_fields__)
        kwargs = {}
        for key, val in payload.items():
            if key in field_names:
                kwargs[key] = val
        return cls(**kwargs)

    def summary(self) -> str:
        """One-line description: id, category, title, timestamp, confidence."""
        when = f" @ {self.timestamp}" if self.timestamp else ""
        fact_total = len(self.verified_facts)
        facts_note = f" facts={fact_total}" if fact_total else ""
        head = f"[{self.id}] [{self.category}] {self.title}{when} "
        tail = f"(conf={self.confidence:.2f}{facts_note})"
        return head + tail
|
||
|
||
|
||
@dataclass
class Hypothesis:
    """Interpretive claim about what happened on the system.

    ``log_odds`` is the canonical belief state (DESIGN.md §4.5): each
    Phenomenon→Hypothesis edge adds its log₁₀(LR) to it. ``confidence`` is
    the derived projection ``1 / (1 + 10^(−log_odds))``, kept in sync for
    display and for the threshold checks (≥0.8 supported / ≤0.2 refuted).

    ``prior_prob`` seeds the starting log_odds (default 0.5 → 0.0).
    """

    id: str  # "hyp-{uuid8}"
    title: str
    description: str
    prior_prob: float = 0.5
    log_odds: float = 0.0
    confidence: float = 0.5  # derived from log_odds — kept in sync on update
    status: str = "active"  # active, supported, refuted, inconclusive
    parent_id: str | None = None
    created_by: str = ""  # "manual", "hypothesis_agent", agent name
    created_at: str = ""
    confidence_log: list[dict] = field(default_factory=list)
    # S5 coref only: the two entity ids this hypothesis claims are the same
    # actor, so update_hypothesis_confidence can sync the backing
    # ``same_as`` edge automatically when contradicting evidence arrives.
    coref_entity_pair: list[str] = field(default_factory=list)

    def to_dict(self) -> dict:
        """Return the dataclass fields as a plain dict."""
        return asdict(self)

    @classmethod
    def from_dict(cls, d: dict) -> Hypothesis:
        """Build a Hypothesis from a dict.

        Pre-S3 records carry only ``confidence``; their ``log_odds`` is
        derived with the logit transform. ``confidence`` is then always
        recomputed from ``log_odds`` to remove any drift in old files.
        Keys that are not dataclass fields are dropped.
        """
        payload = dict(d)
        if "log_odds" not in payload:
            stored_confidence = payload.get("confidence", 0.5)
            payload["log_odds"] = prob_to_log_odds(stored_confidence)
        if "prior_prob" not in payload:
            payload["prior_prob"] = 0.5
        payload["confidence"] = log_odds_to_prob(payload["log_odds"])
        field_names = set(cls.__dataclass_fields__)
        kwargs = {}
        for key, val in payload.items():
            if key in field_names:
                kwargs[key] = val
        return cls(**kwargs)

    def summary(self) -> str:
        """One-line description: id, title, confidence, log-odds, status."""
        belief = f"conf={self.confidence:.2f}, L={self.log_odds:+.2f}, {self.status}"
        return f"[{self.id}] {self.title} ({belief})"
|
||
|
||
|
||
@dataclass
class Entity:
    """Recurring actor or object across phenomena.

    Per DESIGN.md §4.6 typed identifiers live directly on the entity so
    coref blocking lookups are fast. Each identifier entry is shaped
    {type, value, normalized, strong, invocation_id, phenomenon_id, observed_at},
    where ``normalized`` is the canonical form used for matching
    (lower-cased email, digits-only phone, …).
    """

    id: str  # "ent-{uuid8}"
    name: str
    entity_type: str  # person, program, file, host, ip_address
    description: str = ""
    identifiers: list[dict] = field(default_factory=list)
    created_at: str = ""

    def to_dict(self) -> dict:
        """Return the dataclass fields as a plain dict."""
        return asdict(self)

    @classmethod
    def from_dict(cls, d: dict) -> Entity:
        """Build an Entity from a dict, defaulting ``identifiers`` to [].

        Keys that are not dataclass fields are dropped.
        """
        payload = dict(d)
        if "identifiers" not in payload:
            payload["identifiers"] = []
        field_names = set(cls.__dataclass_fields__)
        kwargs = {}
        for key, val in payload.items():
            if key in field_names:
                kwargs[key] = val
        return cls(**kwargs)

    def has_identifier(self, identifier_type: str, normalized_value: str) -> bool:
        """True if an identifier matches both the type and normalized value."""
        for entry in self.identifiers:
            if entry.get("type") != identifier_type:
                continue
            if entry.get("normalized") == normalized_value:
                return True
        return False

    def summary(self) -> str:
        """One-line description listing at most the first three identifiers."""
        base = f"[{self.id}] {self.entity_type}: {self.name}"
        if not self.identifiers:
            return base
        shown = [
            f"{entry.get('type')}={entry.get('value')}"
            for entry in self.identifiers[:3]
        ]
        hidden = len(self.identifiers) - 3
        more = f" (+{hidden} more)" if len(self.identifiers) > 3 else ""
        return f"{base} [{', '.join(shown)}{more}]"
|
||
|
||
|
||
@dataclass
class Edge:
    """Directed edge in the evidence graph."""

    id: str  # "edge-{uuid8}"
    source_id: str
    target_id: str
    edge_type: str
    metadata: dict = field(default_factory=dict)
    created_by: str = ""
    created_at: str = ""

    def to_dict(self) -> dict:
        """Return the dataclass fields as a plain dict."""
        return asdict(self)

    @classmethod
    def from_dict(cls, d: dict) -> Edge:
        """Build an Edge from a dict, ignoring keys that are not fields.

        Unknown keys are dropped instead of raising ``TypeError``, in line
        with the ``from_dict`` of Phenomenon, Hypothesis, Entity and Lead,
        so a state file carrying extra edge keys still loads. A missing
        required field still raises ``TypeError``.
        """
        known = set(cls.__dataclass_fields__)
        return cls(**{k: v for k, v in d.items() if k in known})
|
||
|
||
|
||
@dataclass
class Lead:
    """An investigative lead that should be followed up by an agent.

    Phase 1 agents raise leads for things outside their own scope that are
    still worth chasing. The strategist (DESIGN_STRATEGIST.md) raises leads
    too, and tags each with the hypothesis it should corroborate or refute
    and the kind of edge it is expected to yield, so the orchestrator can
    later measure whether the lead actually changed any belief.
    """

    id: str
    target_agent: str
    description: str
    priority: int = 5  # 1 (highest) - 10 (lowest)
    context: dict = field(default_factory=dict)
    status: str = "pending"  # pending, assigned, completed, failed
    hypothesis_id: str | None = None
    # Strategist-loop annotations:
    #   proposed_by            — agent that created the lead
    #                            ("filesystem", "strategist", ...)
    #   motivating_hypothesis, — let the orchestrator measure marginal
    #   expected_evidence_type   yield
    #   round_number           — 0 for Phase 1 leads, ≥1 for
    #                            strategist-produced leads
    proposed_by: str = ""
    motivating_hypothesis: str = ""
    expected_evidence_type: str = ""
    round_number: int = 0

    def to_dict(self) -> dict:
        """Return the dataclass fields as a plain dict."""
        return asdict(self)

    @classmethod
    def from_dict(cls, d: dict) -> Lead:
        """Build a Lead from a dict, dropping keys that are not fields.

        State files written before the strategist annotations existed lack
        those keys; the dataclass defaults fill them in.
        """
        field_names = set(cls.__dataclass_fields__)
        kwargs = {}
        for key, val in d.items():
            if key in field_names:
                kwargs[key] = val
        return cls(**kwargs)
|
||
|
||
|
||
@dataclass
class InvestigationRound:
    """One round of strategist-driven investigation.

    DESIGN_STRATEGIST.md §1.2: provenance for the strategist's decisions.
    A round stores the hypothesis statuses before and after, the leads
    proposed, the leads actually executed, and the phenomena/edge counts on
    either side. ``marginal_yield`` over recent rounds is what the
    strategist consults when deciding whether to keep digging or declare
    the investigation complete.
    """

    id: str  # "round-{nnn}"
    round_number: int
    started_at: str
    completed_at: str = ""
    strategist_action: str = ""  # "propose_leads" | "declare_complete"
    leads_proposed: list[str] = field(default_factory=list)
    leads_executed: list[str] = field(default_factory=list)
    hypothesis_status_snapshot_before: dict = field(default_factory=dict)
    hypothesis_status_snapshot_after: dict = field(default_factory=dict)
    phenomena_count_before: int = 0
    phenomena_count_after: int = 0
    edges_count_before: int = 0
    edges_count_after: int = 0
    decision_rationale: str = ""

    @property
    def new_phenomena_count(self) -> int:
        """Phenomena gained during the round; never negative."""
        gained = self.phenomena_count_after - self.phenomena_count_before
        return max(0, gained)

    @property
    def new_edges_count(self) -> int:
        """Edges gained during the round; never negative."""
        gained = self.edges_count_after - self.edges_count_before
        return max(0, gained)

    @property
    def status_flips(self) -> int:
        """Number of hypotheses whose status differs before vs. after.

        A hypothesis counts only if it has a truthy status in the "before"
        snapshot; ones that first appear in "after" are not flips.
        """
        earlier = self.hypothesis_status_snapshot_before
        later = self.hypothesis_status_snapshot_after
        return sum(
            1
            for hyp_id, status_now in later.items()
            if earlier.get(hyp_id) and earlier.get(hyp_id) != status_now
        )

    def to_dict(self) -> dict:
        """Return the dataclass fields as a plain dict (properties excluded)."""
        return asdict(self)

    @classmethod
    def from_dict(cls, d: dict) -> InvestigationRound:
        """Build a round from a dict, dropping keys that are not fields."""
        field_names = set(cls.__dataclass_fields__)
        kwargs = {}
        for key, val in d.items():
            if key in field_names:
                kwargs[key] = val
        return cls(**kwargs)
|
||
|
||
|
||
@dataclass
class InvestigationArea:
    """An area to investigate to confirm/refute one or more hypotheses.

    Derived by the orchestrator from active hypotheses after Phase 2; also
    seeded from config.yaml:investigation_areas as an optional manual
    override. Each area carries its own keywords + expected tools so the
    gap-analysis coverage check is generic, not tied to hard-coded constants.
    """

    id: str  # "area-{slug}"
    area: str  # snake_case slug (dedupe key)
    description: str
    suggested_agent: str  # filesystem / registry / communication / network / timeline
    expected_keywords: list[str] = field(default_factory=list)
    expected_tools: list[str] = field(default_factory=list)
    priority: int = 5  # 1 (highest) - 10 (lowest)
    motivating_hypothesis_ids: list[str] = field(default_factory=list)
    created_by: str = ""  # "manual" | "llm_derive" | "fallback"
    created_at: str = ""

    def to_dict(self) -> dict:
        """Return the dataclass fields as a plain dict."""
        return asdict(self)

    @classmethod
    def from_dict(cls, d: dict) -> InvestigationArea:
        """Build an area from a dict, ignoring keys that are not fields.

        Unknown keys are dropped instead of raising ``TypeError``, in line
        with the ``from_dict`` of Lead and InvestigationRound. A missing
        required field still raises ``TypeError``.
        """
        known = set(cls.__dataclass_fields__)
        return cls(**{k: v for k, v in d.items() if k in known})

    def summary(self) -> str:
        """One-line description: slug, priority, agent, hypothesis count."""
        return (
            f"[{self.area}] P{self.priority} agent={self.suggested_agent} "
            f"(motivating: {len(self.motivating_hypothesis_ids)})"
        )
|
||
|
||
|
||
@dataclass
class ExtractedAsset:
    """A file extracted from the disk image and tracked in the asset library."""

    id: str  # "asset-{uuid8}"
    inode: str  # e.g. "334-128-4"
    original_path: str  # disk image path from ffind
    local_path: str  # "extracted/SYSTEM"
    category: str  # registry_hive, chat_log, prefetch, ...
    filename: str  # "SYSTEM"
    size_bytes: int
    extracted_by: str  # agent name
    extracted_at: str  # ISO timestamp

    def to_dict(self) -> dict:
        """Return the dataclass fields as a plain dict."""
        return asdict(self)

    @classmethod
    def from_dict(cls, d: dict) -> ExtractedAsset:
        """Build an asset from a dict, ignoring keys that are not fields.

        Unknown keys are dropped instead of raising ``TypeError``, in line
        with the other ``from_dict`` constructors in this module. Every
        field is required, so a missing one still raises ``TypeError``.
        """
        known = set(cls.__dataclass_fields__)
        return cls(**{k: v for k, v in d.items() if k in known})

    def summary(self) -> str:
        """One-line description with the size shown in KB (bytes / 1024)."""
        size_kb = self.size_bytes / 1024
        return (
            f"[{self.id}] {self.filename} ({self.category}) "
            f"— {size_kb:.1f}KB @ {self.local_path} [inode:{self.inode}]"
        )
|
||
|
||
|
||
@dataclass
class ToolInvocation:
    """One recorded tool call — the provenance unit for grounded facts.

    Every wrapped tool executor records a ToolInvocation when it runs. The
    grounding gateway looks these up by id when validating that a fact in
    an ``add_phenomenon`` call traces back to a real tool output. Persisted
    with the graph so a re-loaded run can still verify provenance.
    """

    id: str  # "inv-{uuid8}"
    tool: str  # tool name as registered in TOOL_CATALOG
    args: dict  # kwargs passed to the executor
    output: str  # the raw output string the tool produced
    output_sha256: str  # hexdigest — tamper-evident hash of output
    agent: str  # agent that issued the call
    task_id: str  # agent run scope (graph._current_task_id at call time)
    source_id: str  # active evidence source at call time
    created_at: str  # ISO timestamp
    cached: bool = False  # served from result cache without re-running

    def to_dict(self) -> dict:
        """Return the dataclass fields as a plain dict."""
        return asdict(self)

    @classmethod
    def from_dict(cls, d: dict) -> ToolInvocation:
        """Build an invocation from a dict, ignoring keys that are not fields.

        Unknown keys are dropped instead of raising ``TypeError``, in line
        with the other ``from_dict`` constructors in this module. A missing
        required field still raises ``TypeError``.
        """
        known = set(cls.__dataclass_fields__)
        return cls(**{k: v for k, v in d.items() if k in known})

    def summary(self) -> str:
        """One-line description: id, tool call with JSON args, time, agent."""
        return (
            f"[{self.id}] {self.tool}({json.dumps(self.args, ensure_ascii=False)}) "
            f"@{self.created_at} agent={self.agent} cached={self.cached}"
        )
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# Evidence Graph
|
||
# ---------------------------------------------------------------------------
|
||
|
||
class EvidenceGraph:
|
||
"""Graph-based evidence store for multi-agent forensic analysis.
|
||
|
||
Agents interact with the graph via query tools (list_phenomena,
|
||
get_phenomenon, search_graph, get_related) rather than reading
|
||
a full dump in the system prompt.
|
||
"""
|
||
|
||
def __init__(
    self,
    case_info: dict | None = None,
    persist_path: Path | None = None,
    edge_log_lr: dict[str, float] | None = None,
) -> None:
    """Create an empty graph.

    Args:
        case_info: Free-form case metadata; a falsy value becomes ``{}``.
        persist_path: Path stored as ``_persist_path`` for auto-saving;
            ``None`` leaves persistence off.
        edge_log_lr: Per-edge-type log₁₀(LR) table. A copy is stored; a
            falsy value falls back to a copy of ``_DEFAULT_LOG_LR``.
    """
    self.case_info: dict = case_info if case_info else {}

    # Calibration table for confidence updates. Since S3 (when it was
    # renamed from edge_weights) the values are log-likelihood ratios in
    # odds space rather than deltas in confidence space.
    self.edge_log_lr: dict[str, float] = dict(edge_log_lr or _DEFAULT_LOG_LR)

    # Legacy single-image fields. They mirror active_source for
    # backward-compatible readers; set_active_source keeps them in sync.
    self.image_path: str = ""
    self.partition_offset: int = 0
    self.extracted_dir: str = "extracted"

    # Multi-evidence model: the case and the source that tools and
    # phenomena bind to.
    self.case: Case | None = None
    self.active_source: EvidenceSource | None = None

    # Node and edge storage.
    self.phenomena: dict[str, Phenomenon] = {}
    self.hypotheses: dict[str, Hypothesis] = {}
    self.entities: dict[str, Entity] = {}
    self.edges: list[Edge] = []

    # Adjacency index for fast traversal.
    self._adj: dict[str, list[Edge]] = {}  # node_id → outgoing edges
    self._adj_rev: dict[str, list[Edge]] = {}  # node_id → incoming edges

    # Lead / status bookkeeping (carried over from Blackboard).
    self.leads: list[Lead] = []
    self.agent_status: dict[str, str] = {}

    # Asset library: every file extracted from the disk image.
    self.asset_library: dict[str, ExtractedAsset] = {}
    self._inode_index: dict[str, str] = {}  # inode → asset_id

    # Investigation areas, derived from hypotheses (LLM) and/or seeded from
    # config.yaml:investigation_areas (manual override). They drive the
    # gap-analysis coverage check.
    self.investigation_areas: dict[str, InvestigationArea] = {}

    # Provenance log for grounded facts. Each wrapped tool executor records
    # one entry; add_phenomenon's grounding gateway looks entries up to
    # validate cited invocation_ids and to substring-match claimed fact
    # values against real tool outputs.
    self.tool_invocations: dict[str, ToolInvocation] = {}

    # Per-round provenance for the strategist (DESIGN_STRATEGIST.md). Stays
    # empty for runs that never reach Phase 3 or that disable the
    # strategist via config.
    self.investigation_rounds: list[InvestigationRound] = []

    # Budget config and run-start monotonic clock, set by the orchestrator
    # when it boots and read by the budget_status strategy tool. They start
    # as an empty dict and None respectively.
    self.budgets: dict[str, int] = {}
    self.run_start_monotonic: float | None = None

    # _current_agent / _current_task_id are not stored here: they are
    # properties backed by module-level ContextVars.

    self._lock = asyncio.Lock()
    self._persist_path: Path | None = persist_path
|
||
|
||
# ---- Per-asyncio-task scoped state ---------------------------------------
|
||
#
|
||
# Reads/writes through these properties hit ContextVars rather than
|
||
# instance attributes. Concurrent agent runs (Phase 3 parallel
|
||
# dispatch) each have their own task-local context, so writes inside
|
||
# one agent's run() are invisible to siblings — which means
|
||
# ``record_tool_invocation`` always tags an invocation with the agent
|
||
# and task scope that actually issued it.
|
||
|
||
@property
def _current_agent(self) -> str:
    """Agent name held in ``_current_agent_ctx`` for the current context."""
    agent_name = _current_agent_ctx.get()
    return agent_name

@_current_agent.setter
def _current_agent(self, value: str) -> None:
    """Store *value* in ``_current_agent_ctx``; falsy values become ""."""
    _current_agent_ctx.set(value if value else "")
|
||
|
||
@property
def _current_task_id(self) -> str:
    """Task id held in ``_current_task_id_ctx`` for the current context."""
    task_id = _current_task_id_ctx.get()
    return task_id

@_current_task_id.setter
def _current_task_id(self, value: str) -> None:
    """Store *value* in ``_current_task_id_ctx``; falsy values become ""."""
    _current_task_id_ctx.set(value if value else "")
|
||
|
||
# ---- Persistence -------------------------------------------------------
|
||
|
||
def _auto_save(self) -> None:
    """Persist full state to disk. Must be called inside _lock.

    Does nothing when no persist path is configured. The JSON is written
    to a ``.tmp`` sibling first and then moved over the real file. Any
    ``Exception`` raised along the way is logged and swallowed.
    """
    target = self._persist_path
    if target is None:
        return
    try:
        active = self.active_source
        snapshot = {
            "case_info": self.case_info,
            "case": self.case.to_dict() if self.case else None,
            "active_source_id": active.id if active else "",
            "image_path": self.image_path,
            "partition_offset": self.partition_offset,
            "extracted_dir": self.extracted_dir,
            "phenomena": {
                key: node.to_dict() for key, node in self.phenomena.items()
            },
            "hypotheses": {
                key: node.to_dict() for key, node in self.hypotheses.items()
            },
            "entities": {
                key: node.to_dict() for key, node in self.entities.items()
            },
            "edges": [edge.to_dict() for edge in self.edges],
            "leads": [lead.to_dict() for lead in self.leads],
            "agent_status": dict(self.agent_status),
            "asset_library": {
                key: asset.to_dict() for key, asset in self.asset_library.items()
            },
            "investigation_areas": {
                key: area.to_dict()
                for key, area in self.investigation_areas.items()
            },
            "tool_invocations": {
                key: call.to_dict()
                for key, call in self.tool_invocations.items()
            },
            "investigation_rounds": [
                rnd.to_dict() for rnd in self.investigation_rounds
            ],
            "saved_at": datetime.now().isoformat(),
        }
        serialized = json.dumps(snapshot, ensure_ascii=False, indent=2)
        # Write beside the target, then swap it into place.
        scratch = target.with_suffix(".tmp")
        scratch.write_text(serialized)
        scratch.replace(target)
    except Exception as e:
        logger.error("EvidenceGraph auto-save failed: %s", e)
|
||
|
||
def save_state(self, path: Path) -> None:
    """Explicitly save state to the given path.

    Temporarily points ``_persist_path`` at *path*, runs ``_auto_save``,
    and then restores the previous path.

    Args:
        path: Destination file for this one save.
    """
    old = self._persist_path
    self._persist_path = path
    try:
        self._auto_save()
    finally:
        # Restore even if the save raises, so later auto-saves do not keep
        # writing to the one-off path.
        self._persist_path = old
|
||
|
||
@classmethod
def load_state(
    cls,
    path: Path,
    edge_log_lr: dict[str, float] | None = None,
) -> EvidenceGraph:
    """Restore an EvidenceGraph from a saved JSON state file.

    *path* is also stored as the restored graph's persist path, and
    *edge_log_lr* is passed through to the constructor.
    """
    payload = json.loads(path.read_text())
    restored = cls(
        case_info=payload.get("case_info", {}),
        persist_path=path,
        edge_log_lr=edge_log_lr,
    )
    restored.image_path = payload.get("image_path", "")
    restored.partition_offset = payload.get("partition_offset", 0)
    restored.extracted_dir = payload.get("extracted_dir", "extracted")

    # Evidence-source model. State files older than the Case model carry
    # only image_path/partition_offset, which get wrapped as one source.
    saved_case = payload.get("case")
    if saved_case:
        restored.case = Case.from_dict(saved_case)
    elif restored.image_path:
        restored.case = single_source_case(
            restored.image_path, restored.partition_offset
        )
    if restored.case and restored.case.sources:
        wanted = restored.case.get_source(payload.get("active_source_id", ""))
        # Fall back to the first source when the saved id does not resolve.
        restored.set_active_source(wanted or restored.case.sources[0])

    restored.phenomena = {
        key: Phenomenon.from_dict(raw)
        for key, raw in payload.get("phenomena", {}).items()
    }
    restored.hypotheses = {
        key: Hypothesis.from_dict(raw)
        for key, raw in payload.get("hypotheses", {}).items()
    }
    restored.entities = {
        key: Entity.from_dict(raw)
        for key, raw in payload.get("entities", {}).items()
    }
    restored.edges = [Edge.from_dict(raw) for raw in payload.get("edges", [])]
    restored.leads = [Lead.from_dict(raw) for raw in payload.get("leads", [])]
    restored.agent_status = payload.get("agent_status", {})

    # Refill the asset library together with its inode lookup index.
    for asset_id, raw in payload.get("asset_library", {}).items():
        asset = ExtractedAsset.from_dict(raw)
        restored.asset_library[asset_id] = asset
        restored._inode_index[asset.inode] = asset_id

    restored.investigation_areas = {
        key: InvestigationArea.from_dict(raw)
        for key, raw in payload.get("investigation_areas", {}).items()
    }
    restored.tool_invocations = {
        key: ToolInvocation.from_dict(raw)
        for key, raw in payload.get("tool_invocations", {}).items()
    }
    restored.investigation_rounds = [
        InvestigationRound.from_dict(raw)
        for raw in payload.get("investigation_rounds", [])
    ]

    restored._rebuild_adjacency()
    logger.info(
        "EvidenceGraph restored: %d phenomena, %d hypotheses, %d entities, "
        "%d edges, %d assets",
        len(restored.phenomena), len(restored.hypotheses),
        len(restored.entities), len(restored.edges),
        len(restored.asset_library),
    )
    return restored
|
||
|
||
def _rebuild_adjacency(self) -> None:
|
||
"""Rebuild adjacency index from edges list."""
|
||
self._adj.clear()
|
||
self._adj_rev.clear()
|
||
for edge in self.edges:
|
||
self._adj.setdefault(edge.source_id, []).append(edge)
|
||
self._adj_rev.setdefault(edge.target_id, []).append(edge)
|
||
|
||
# ---- Evidence source ----------------------------------------------------
|
||
|
||
def set_active_source(self, source: EvidenceSource | None) -> None:
|
||
"""Bind tools and newly recorded phenomena to *source*.
|
||
|
||
Syncs the legacy image_path / partition_offset fields so existing
|
||
readers (orchestrator logs, report naming, agent prompts) keep
|
||
working unchanged. The orchestrator calls this before dispatching an
|
||
agent; single-source runs call it once at startup.
|
||
"""
|
||
self.active_source = source
|
||
if source is not None:
|
||
self.image_path = source.path
|
||
self.partition_offset = source.partition_offset
|
||
|
||
# ---- Node helpers -------------------------------------------------------
|
||
|
||
def _node_exists(self, node_id: str) -> bool:
|
||
if node_id.startswith("ph-"):
|
||
return node_id in self.phenomena
|
||
if node_id.startswith("hyp-"):
|
||
return node_id in self.hypotheses
|
||
if node_id.startswith("ent-"):
|
||
return node_id in self.entities
|
||
return False
|
||
|
||
def get_node(self, node_id: str) -> Phenomenon | Hypothesis | Entity | None:
|
||
if node_id.startswith("ph-"):
|
||
return self.phenomena.get(node_id)
|
||
if node_id.startswith("hyp-"):
|
||
return self.hypotheses.get(node_id)
|
||
if node_id.startswith("ent-"):
|
||
return self.entities.get(node_id)
|
||
return None
|
||
|
||
# ---- Similarity merging (Phenomenon only) --------------------------------
|
||
|
||
def _find_similar_phenomenon(
|
||
self, title: str, interpretation: str, category: str,
|
||
) -> Phenomenon | None:
|
||
best_match: Phenomenon | None = None
|
||
best_score = 0.0
|
||
for ph in self.phenomena.values():
|
||
if ph.category != category:
|
||
continue
|
||
title_sim = _jaccard_similarity(ph.title, title)
|
||
if title_sim <= 0.6:
|
||
continue
|
||
desc_sim = _jaccard_similarity(
|
||
ph.interpretation[:200], interpretation[:200],
|
||
)
|
||
if desc_sim <= 0.4:
|
||
continue
|
||
combined = title_sim * 0.6 + desc_sim * 0.4
|
||
if combined > best_score:
|
||
best_score = combined
|
||
best_match = ph
|
||
return best_match
|
||
|
||
# ---- Mutation methods (async, under lock) --------------------------------
|
||
|
||
async def add_phenomenon(
|
||
self,
|
||
source_agent: str,
|
||
category: str,
|
||
title: str,
|
||
interpretation: str = "",
|
||
verified_facts: list[dict] | None = None,
|
||
raw_data: dict | None = None,
|
||
timestamp: str | None = None,
|
||
source_tool: str = "",
|
||
from_lead_id: str | None = None,
|
||
task_id: str | None = None,
|
||
# Pre-S2 callers passed analysis text as ``description``. Accept it
|
||
# as an alias for ``interpretation`` so legacy tests and any in-flight
|
||
# tool-call messages don't break. Not advertised in the LLM-facing
|
||
# tool schema — BaseAgent's add_phenomenon advertises the new fields.
|
||
description: str | None = None,
|
||
) -> tuple[str, bool]:
|
||
"""Add a phenomenon under the grounding gateway. Returns (id, was_merged).
|
||
|
||
Each fact in ``verified_facts`` must point at a real ToolInvocation
|
||
made by this agent within ``task_id`` (defaults to the graph's
|
||
current task scope). Any fact failing grounding raises
|
||
:class:`GroundingError` — the whole call is rejected; the caller
|
||
must fix and retry. This is the code-level enforcement of
|
||
DESIGN.md §4.4.
|
||
"""
|
||
if description and not interpretation:
|
||
interpretation = description
|
||
facts = list(verified_facts or [])
|
||
active_task_id = task_id if task_id is not None else self._current_task_id
|
||
|
||
# Grounding gateway — validate every fact BEFORE acquiring the lock
|
||
# (read-only check; lookup uses dict access which is thread-safe).
|
||
failures: list[dict] = []
|
||
for fact in facts:
|
||
ok, reason = self.validate_fact_grounding(
|
||
fact, agent=source_agent, task_id=active_task_id or "",
|
||
)
|
||
if not ok:
|
||
failures.append({"fact": fact, "reason": reason})
|
||
if failures:
|
||
raise GroundingError(
|
||
"Phenomenon rejected — one or more facts are not grounded:\n"
|
||
+ "\n".join(
|
||
f" - {f['reason']}: {json.dumps(f['fact'], ensure_ascii=False)}"
|
||
for f in failures
|
||
)
|
||
+ self._format_recent_invocations(
|
||
source_agent, active_task_id or "",
|
||
),
|
||
failures=failures,
|
||
)
|
||
|
||
async with self._lock:
|
||
similar = self._find_similar_phenomenon(title, interpretation, category)
|
||
if similar is not None:
|
||
similar.confidence = min(1.0, similar.confidence + 0.15)
|
||
if source_agent not in similar.corroborating_agents:
|
||
similar.corroborating_agents.append(source_agent)
|
||
if raw_data:
|
||
for k, v in raw_data.items():
|
||
if k not in similar.raw_data:
|
||
similar.raw_data[k] = v
|
||
# Merge any new facts whose (type, value, invocation_id)
|
||
# tuple isn't already on the existing phenomenon.
|
||
if facts:
|
||
seen = {
|
||
(f.get("type"), f.get("value"), f.get("invocation_id"))
|
||
for f in similar.verified_facts
|
||
}
|
||
for f in facts:
|
||
key = (f.get("type"), f.get("value"), f.get("invocation_id"))
|
||
if key not in seen:
|
||
similar.verified_facts.append(f)
|
||
seen.add(key)
|
||
if from_lead_id and similar.from_lead_id is None:
|
||
similar.from_lead_id = from_lead_id
|
||
self._auto_save()
|
||
return similar.id, True
|
||
|
||
pid = f"ph-{uuid.uuid4().hex[:8]}"
|
||
confidence = _compute_quality_score(
|
||
source_tool, timestamp, raw_data or {},
|
||
interpretation, facts, [],
|
||
)
|
||
ph = Phenomenon(
|
||
id=pid,
|
||
source_agent=source_agent,
|
||
category=category,
|
||
title=title,
|
||
interpretation=interpretation,
|
||
verified_facts=facts,
|
||
raw_data=raw_data or {},
|
||
timestamp=timestamp,
|
||
confidence=confidence,
|
||
source_tool=source_tool,
|
||
source_id=self.active_source.id if self.active_source else "",
|
||
from_lead_id=from_lead_id,
|
||
created_at=datetime.now().isoformat(),
|
||
)
|
||
self.phenomena[pid] = ph
|
||
self._auto_save()
|
||
return pid, False
|
||
|
||
async def add_hypothesis(
    self,
    title: str,
    description: str,
    created_by: str = "",
    parent_id: str | None = None,
    prior_prob: float = 0.5,
) -> str:
    """Create a new hypothesis in ``active`` status and return its id.

    ``prior_prob`` is converted to the starting log-odds (the default 0.5
    maps to 0.0) and the initial confidence is derived back from that
    log-odds value. Choose a different prior when base-rate knowledge is
    available — e.g. 0.1 for an unusual claim, 0.9 for a strong default.
    """
    async with self._lock:
        new_id = f"hyp-{uuid.uuid4().hex[:8]}"
        starting_log_odds = prob_to_log_odds(prior_prob)
        self.hypotheses[new_id] = Hypothesis(
            id=new_id,
            title=title,
            description=description,
            prior_prob=prior_prob,
            log_odds=starting_log_odds,
            confidence=log_odds_to_prob(starting_log_odds),
            status="active",
            parent_id=parent_id,
            created_by=created_by,
            created_at=datetime.now().isoformat(),
        )
        self._auto_save()
        return new_id
|
||
|
||
async def add_entity(
|
||
self,
|
||
name: str,
|
||
entity_type: str,
|
||
description: str = "",
|
||
) -> tuple[str, bool]:
|
||
"""Add an entity. Deduplicates on (name, entity_type). Returns (id, was_existing)."""
|
||
async with self._lock:
|
||
for ent in self.entities.values():
|
||
if ent.name == name and ent.entity_type == entity_type:
|
||
return ent.id, True
|
||
|
||
eid = f"ent-{uuid.uuid4().hex[:8]}"
|
||
self.entities[eid] = Entity(
|
||
id=eid,
|
||
name=name,
|
||
entity_type=entity_type,
|
||
description=description,
|
||
created_at=datetime.now().isoformat(),
|
||
)
|
||
self._auto_save()
|
||
return eid, False
|
||
|
||
async def add_edge(
|
||
self,
|
||
source_id: str,
|
||
target_id: str,
|
||
edge_type: str,
|
||
metadata: dict | None = None,
|
||
created_by: str = "",
|
||
) -> str:
|
||
"""Add a directed edge. Validates nodes exist and edge type is valid."""
|
||
async with self._lock:
|
||
if not self._node_exists(source_id):
|
||
raise ValueError(f"Source node not found: {source_id}")
|
||
if not self._node_exists(target_id):
|
||
raise ValueError(f"Target node not found: {target_id}")
|
||
if edge_type not in VALID_EDGE_TYPES:
|
||
raise ValueError(f"Invalid edge type: {edge_type}")
|
||
|
||
eid = f"edge-{uuid.uuid4().hex[:8]}"
|
||
edge = Edge(
|
||
id=eid,
|
||
source_id=source_id,
|
||
target_id=target_id,
|
||
edge_type=edge_type,
|
||
metadata=metadata or {},
|
||
created_by=created_by,
|
||
created_at=datetime.now().isoformat(),
|
||
)
|
||
self.edges.append(edge)
|
||
self._adj.setdefault(source_id, []).append(edge)
|
||
self._adj_rev.setdefault(target_id, []).append(edge)
|
||
self._auto_save()
|
||
return eid
|
||
|
||
async def update_hypothesis_confidence(
|
||
self,
|
||
hyp_id: str,
|
||
phenomenon_id: str,
|
||
edge_type: str,
|
||
reason: str = "",
|
||
) -> float:
|
||
"""Apply one phenomenon→hypothesis edge as an additive log_odds update.
|
||
|
||
DESIGN.md §4.5: edge_type → log₁₀(LR) is looked up in
|
||
``self.edge_log_lr`` (LLM never emits the number). The update is
|
||
``L_post = L_prior + log_lr`` and ``confidence = sigmoid(L_post)``
|
||
— commutative and order-independent, fixing the pre-S3 ordering
|
||
bug. Status flips at ≥0.8 → supported / ≤0.2 → refuted.
|
||
|
||
**Idempotency**: if a ``(phenomenon, hypothesis, edge_type)`` edge
|
||
already exists, this is a no-op — the same agent re-recording the
|
||
same link (or two agents linking via the orchestrator's batch
|
||
judge and a manual override) does not double-count.
|
||
|
||
**Harmonic damping of repeated same-direction evidence** (added
|
||
post first full-case run, 2026-05-20): independent evidence —
|
||
different phenomena pointing the same way — still accumulates,
|
||
but with diminishing returns: the k-th edge of the same
|
||
``(hyp_id, edge_type)`` contributes ``log_lr_base / k``. After N
|
||
same-direction edges the cumulative contribution is
|
||
``log_lr_base · H_N`` (harmonic sum, grows as ln N). This
|
||
formalises the naive-Bayes-breakdown DESIGN.md §4.5 calls out:
|
||
"同一发现被多 agent 重复入图". Single-edge hypotheses are
|
||
unaffected (k=1, damping = 1.0). Cross-direction edges (supports
|
||
vs contradicts) keep their own independent counts so a strong
|
||
contradicting fact still bites against piled-on supports.
|
||
"""
|
||
if edge_type not in self.edge_log_lr:
|
||
raise ValueError(
|
||
f"Invalid hypothesis edge type: {edge_type}. "
|
||
f"Must be one of: {list(self.edge_log_lr.keys())}"
|
||
)
|
||
|
||
async with self._lock:
|
||
if not self._node_exists(phenomenon_id):
|
||
raise ValueError(f"Phenomenon not found: {phenomenon_id}")
|
||
hyp = self.hypotheses.get(hyp_id)
|
||
if hyp is None:
|
||
raise ValueError(f"Hypothesis not found: {hyp_id}")
|
||
|
||
# Idempotency check — same (ph, hyp, edge_type) already on graph.
|
||
for existing in self._adj.get(phenomenon_id, []):
|
||
if (
|
||
existing.target_id == hyp_id
|
||
and existing.edge_type == edge_type
|
||
):
|
||
return hyp.confidence
|
||
|
||
# Harmonic damping rank: count existing edges of the SAME
|
||
# edge_type already incident on this hypothesis. The new edge
|
||
# becomes the (rank+1)-th of its kind. _adj_rev is keyed by
|
||
# target so this is O(in-degree(hyp)) without scanning all edges.
|
||
existing_same_type = sum(
|
||
1 for e in self._adj_rev.get(hyp_id, [])
|
||
if e.edge_type == edge_type
|
||
)
|
||
rank = existing_same_type + 1
|
||
log_lr_base = self.edge_log_lr[edge_type]
|
||
log_lr = log_lr_base / rank
|
||
old_log_odds = hyp.log_odds
|
||
old_conf = hyp.confidence
|
||
new_log_odds = old_log_odds + log_lr
|
||
new_conf = log_odds_to_prob(new_log_odds)
|
||
|
||
hyp.log_odds = new_log_odds
|
||
hyp.confidence = new_conf
|
||
|
||
if new_conf >= 0.8:
|
||
hyp.status = "supported"
|
||
elif new_conf <= 0.2:
|
||
hyp.status = "refuted"
|
||
else:
|
||
hyp.status = "active"
|
||
|
||
hyp.confidence_log.append({
|
||
"timestamp": datetime.now().isoformat(),
|
||
"phenomenon_id": phenomenon_id,
|
||
"edge_type": edge_type,
|
||
"log_lr_base": log_lr_base,
|
||
"rank": rank,
|
||
"log_lr": round(log_lr, 4),
|
||
"old_log_odds": round(old_log_odds, 4),
|
||
"new_log_odds": round(new_log_odds, 4),
|
||
"old_confidence": round(old_conf, 4),
|
||
"new_confidence": round(new_conf, 4),
|
||
"reason": reason,
|
||
})
|
||
|
||
# Also create the edge in the graph
|
||
eid = f"edge-{uuid.uuid4().hex[:8]}"
|
||
edge = Edge(
|
||
id=eid,
|
||
source_id=phenomenon_id,
|
||
target_id=hyp_id,
|
||
edge_type=edge_type,
|
||
metadata={"reason": reason, "log_lr": log_lr},
|
||
created_by="hypothesis_engine",
|
||
created_at=datetime.now().isoformat(),
|
||
)
|
||
self.edges.append(edge)
|
||
self._adj.setdefault(phenomenon_id, []).append(edge)
|
||
self._adj_rev.setdefault(hyp_id, []).append(edge)
|
||
|
||
self._auto_save()
|
||
|
||
# If this is a coref hypothesis, mirror the new confidence into the
|
||
# entity-level same_as edge. Done OUTSIDE the lock — _sync_same_as_edge
|
||
# re-acquires it internally — so we avoid reentrant locking.
|
||
if hyp.coref_entity_pair and len(hyp.coref_entity_pair) == 2:
|
||
await self._sync_same_as_edge(
|
||
hyp.coref_entity_pair[0],
|
||
hyp.coref_entity_pair[1],
|
||
hyp_id,
|
||
)
|
||
return new_conf
|
||
|
||
# ---- Cross-source entity coreference (DESIGN.md §4.6) -------------------
|
||
|
||
@staticmethod
|
||
def _coref_hypothesis_id(eid_a: str, eid_b: str) -> str:
|
||
"""Deterministic id for the coref hypothesis between an entity pair.
|
||
|
||
Same pair (regardless of arg order) always maps to the same id so
|
||
repeated observations augment the existing hypothesis rather than
|
||
spawning duplicates.
|
||
"""
|
||
pair = "|".join(sorted([eid_a, eid_b]))
|
||
return f"hyp-coref-{hashlib.sha256(pair.encode()).hexdigest()[:10]}"
|
||
|
||
async def get_or_create_coref_hypothesis(
|
||
self, eid_a: str, eid_b: str,
|
||
) -> tuple[str, bool]:
|
||
"""Look up (or insert) the coreference hypothesis for an entity pair.
|
||
|
||
Uses a low prior (``prior_prob=0.1``) — saying any two entities are
|
||
the same actor is a strong claim, so the default should be
|
||
skeptical and let evidence move the needle.
|
||
"""
|
||
hid = self._coref_hypothesis_id(eid_a, eid_b)
|
||
async with self._lock:
|
||
if hid in self.hypotheses:
|
||
return hid, False
|
||
ea = self.entities.get(eid_a)
|
||
eb = self.entities.get(eid_b)
|
||
if ea is None or eb is None:
|
||
raise ValueError(f"Unknown entity in coref pair: {eid_a}, {eid_b}")
|
||
l_prior = prob_to_log_odds(0.1)
|
||
self.hypotheses[hid] = Hypothesis(
|
||
id=hid,
|
||
title=f"Coreference: {ea.name} ≡ {eb.name}",
|
||
description=(
|
||
f"Hypothesis that {ea.id} ({ea.name}, {ea.entity_type}) "
|
||
f"and {eb.id} ({eb.name}, {eb.entity_type}) refer to "
|
||
f"the same actor across evidence sources."
|
||
),
|
||
prior_prob=0.1,
|
||
log_odds=l_prior,
|
||
confidence=log_odds_to_prob(l_prior),
|
||
status="active",
|
||
created_by="coref_engine",
|
||
created_at=datetime.now().isoformat(),
|
||
coref_entity_pair=sorted([eid_a, eid_b]),
|
||
)
|
||
self._auto_save()
|
||
return hid, True
|
||
|
||
async def _sync_same_as_edge(
|
||
self, eid_a: str, eid_b: str, hyp_id: str,
|
||
) -> None:
|
||
"""Mirror coref hypothesis confidence into a ``same_as`` entity edge.
|
||
|
||
- Confidence ≥ 0.8 → ensure an active ``same_as`` edge exists.
|
||
- Confidence < 0.8 → mark any existing edge inactive (audit, not delete).
|
||
Idempotent on both transitions.
|
||
"""
|
||
hyp = self.hypotheses.get(hyp_id)
|
||
if hyp is None:
|
||
return
|
||
active = hyp.confidence >= 0.8
|
||
async with self._lock:
|
||
existing = None
|
||
for edge in self.edges:
|
||
if (edge.edge_type == "same_as"
|
||
and {edge.source_id, edge.target_id} == {eid_a, eid_b}):
|
||
existing = edge
|
||
break
|
||
if active:
|
||
if existing is None:
|
||
eid = f"edge-{uuid.uuid4().hex[:8]}"
|
||
edge = Edge(
|
||
id=eid,
|
||
source_id=eid_a,
|
||
target_id=eid_b,
|
||
edge_type="same_as",
|
||
metadata={
|
||
"backed_by": hyp_id,
|
||
"active": True,
|
||
"confidence_at_creation": hyp.confidence,
|
||
},
|
||
created_by="coref_engine",
|
||
created_at=datetime.now().isoformat(),
|
||
)
|
||
self.edges.append(edge)
|
||
self._adj.setdefault(eid_a, []).append(edge)
|
||
self._adj_rev.setdefault(eid_b, []).append(edge)
|
||
elif not existing.metadata.get("active"):
|
||
existing.metadata["active"] = True
|
||
existing.metadata["reactivated_at"] = datetime.now().isoformat()
|
||
else:
|
||
if existing is not None and existing.metadata.get("active"):
|
||
existing.metadata["active"] = False
|
||
existing.metadata["deactivated_at"] = datetime.now().isoformat()
|
||
self._auto_save()
|
||
|
||
async def observe_identity(
    self,
    entity_name: str,
    entity_type: str,
    identifier_type: str,
    value: str,
    source_agent: str,
    invocation_id: str,
    source_tool: str = "",
    task_id: str | None = None,
) -> dict:
    """Record a typed identifier for an entity through the grounding gateway.

    DESIGN.md §4.6. Steps, in execution order:

    1. Reject unknown identifier types and empty values.
    2. Record an ``identity_observation`` phenomenon carrying the
       identifier as its sole verified fact. ``add_phenomenon`` runs the
       grounding gateway on ``invocation_id`` + ``value`` and raises
       :class:`GroundingError` on failure — before anything else is
       written, so an ungrounded observation leaves no entity behind.
    3. Get-or-create the entity.
    4. Attach the identifier to the entity (idempotent by
       ``(type, normalized_value)``).
    5. If the attachment is new, scan other entities for shared
       identifiers (strong / weak) and any conflicting strong
       identifiers, then propose / strengthen / weaken the coref
       hypothesis between each candidate pair. ``same_as`` edges are
       kept in sync with the hypothesis confidence.

    Returns:
        A dict with ``entity_id``, ``phenomenon_id``, ``new_identifier``
        (whether the identifier was newly attached) and
        ``coref_proposals`` (the proposals fired, empty when none).

    Raises:
        ValueError: unknown *identifier_type* or empty *value*.
        GroundingError: the identifier fact is not grounded.
    """
    if identifier_type not in (STRONG_IDENTIFIER_TYPES | WEAK_IDENTIFIER_TYPES):
        raise ValueError(
            f"Unknown identifier_type: {identifier_type}. "
            f"Strong: {sorted(STRONG_IDENTIFIER_TYPES)}; "
            f"Weak: {sorted(WEAK_IDENTIFIER_TYPES)}."
        )
    if not value:
        raise ValueError("identifier value must be non-empty")

    active_task = task_id if task_id is not None else self._current_task_id
    fact = {"type": identifier_type, "value": value, "invocation_id": invocation_id}
    norm = _normalize_identifier(identifier_type, value)
    title = f"{identifier_type}={value} on {entity_name}"

    # Gateway first: add_phenomenon enforces the grounding contract for the
    # fact. The phenomenon only needs the entity's name and type, not its
    # id, so the entity is created afterwards — a rejected observation must
    # not leave an orphan entity in the graph.
    pid, _merged = await self.add_phenomenon(
        source_agent=source_agent,
        category="identity_observation",
        title=title,
        interpretation=(
            f"Agent attributed identifier {identifier_type}={value} "
            f"(normalized={norm}) to entity {entity_name} ({entity_type})."
        ),
        verified_facts=[fact],
        source_tool=source_tool,
        task_id=active_task,
    )

    eid, _existed = await self.add_entity(entity_name, entity_type)

    # Attach identifier to entity (idempotent on type + normalized value).
    new_identifier = False
    async with self._lock:
        ent = self.entities[eid]
        if not ent.has_identifier(identifier_type, norm):
            ent.identifiers.append({
                "type": identifier_type,
                "value": value,
                "normalized": norm,
                "strong": is_strong_identifier(identifier_type),
                "invocation_id": invocation_id,
                "phenomenon_id": pid,
                "observed_at": datetime.now().isoformat(),
            })
            new_identifier = True
            self._auto_save()

    # Only a newly attached identifier can create new coref evidence.
    coref_proposals: list[dict] = []
    if new_identifier:
        coref_proposals = await self._propose_coref_for_new_identifier(
            new_eid=eid,
            new_type=identifier_type,
            new_norm=norm,
            new_phenomenon_id=pid,
        )

    return {
        "entity_id": eid,
        "phenomenon_id": pid,
        "new_identifier": new_identifier,
        "coref_proposals": coref_proposals,
    }
|
||
|
||
async def _propose_coref_for_new_identifier(
|
||
self,
|
||
new_eid: str,
|
||
new_type: str,
|
||
new_norm: str,
|
||
new_phenomenon_id: str,
|
||
) -> list[dict]:
|
||
"""Blocking + propose: find candidate entities that share this
|
||
identifier with ``new_eid``, register / strengthen the coref
|
||
hypothesis for each pair, and emit conflicting-identifier edges
|
||
where the two entities have *different* values for the same
|
||
strong identifier type. O(|entities| × identifiers) — blocking
|
||
is implicit in the fact that the new identifier is fixed.
|
||
"""
|
||
new_ent = self.entities.get(new_eid)
|
||
if new_ent is None:
|
||
return []
|
||
|
||
is_strong_new = is_strong_identifier(new_type)
|
||
match_edge = "shared_strong_identifier" if is_strong_new else "shared_weak_identifier"
|
||
|
||
proposals: list[dict] = []
|
||
|
||
for other_eid, other_ent in list(self.entities.items()):
|
||
if other_eid == new_eid:
|
||
continue
|
||
|
||
# Match: other entity carries the same (type, normalized).
|
||
if not other_ent.has_identifier(new_type, new_norm):
|
||
continue
|
||
|
||
# Collect conflicting strong identifiers between the pair —
|
||
# they'll fire negative-LR edges on the same coref hypothesis.
|
||
conflicts: list[dict] = []
|
||
for a_ident in new_ent.identifiers:
|
||
if not a_ident.get("strong"):
|
||
continue
|
||
for b_ident in other_ent.identifiers:
|
||
if (b_ident.get("type") == a_ident.get("type")
|
||
and b_ident.get("strong")
|
||
and b_ident.get("normalized") != a_ident.get("normalized")):
|
||
conflicts.append({
|
||
"type": a_ident.get("type"),
|
||
"new_value": a_ident.get("value"),
|
||
"other_value": b_ident.get("value"),
|
||
"new_phenomenon_id": a_ident.get("phenomenon_id"),
|
||
})
|
||
|
||
hid, _created = await self.get_or_create_coref_hypothesis(
|
||
new_eid, other_eid,
|
||
)
|
||
|
||
# +shared identifier edge (one per identifier, anchored to the
|
||
# newly recorded observation phenomenon). update_hypothesis_
|
||
# confidence is idempotent on (ph, hyp, edge_type), so re-running
|
||
# the same observation does not double-count.
|
||
await self.update_hypothesis_confidence(
|
||
hid, new_phenomenon_id, match_edge,
|
||
reason=f"shared {new_type}={new_norm}",
|
||
)
|
||
|
||
# −conflicting strong identifier edges — one per conflict, anchored
|
||
# to the *new* entity's observation phenomenon for that identifier.
|
||
for c in conflicts:
|
||
ph_src = c["new_phenomenon_id"]
|
||
if not ph_src:
|
||
continue
|
||
await self.update_hypothesis_confidence(
|
||
hid, ph_src, "conflicting_strong_identifier",
|
||
reason=(
|
||
f"conflict {c['type']}: "
|
||
f"{c['new_value']} vs {c['other_value']}"
|
||
),
|
||
)
|
||
|
||
await self._sync_same_as_edge(new_eid, other_eid, hid)
|
||
|
||
proposals.append({
|
||
"hypothesis_id": hid,
|
||
"other_entity_id": other_eid,
|
||
"match": {"type": new_type, "normalized": new_norm,
|
||
"edge_type": match_edge},
|
||
"conflicts": conflicts,
|
||
"confidence": self.hypotheses[hid].confidence,
|
||
})
|
||
|
||
return proposals
|
||
|
||
# ---- Cross-source entity cluster queries (DESIGN.md §4.6) ----------------
|
||
|
||
def _active_same_as_neighbors(self, entity_id: str) -> set[str]:
|
||
"""Neighbours of *entity_id* via ``same_as`` edges that are still active.
|
||
|
||
``same_as`` edges are non-destructive: a coref hypothesis that drops
|
||
below threshold marks ``metadata['active']=False`` rather than
|
||
deleting, so the audit trail survives. Cluster queries respect that.
|
||
"""
|
||
out: set[str] = set()
|
||
for edge in self.edges:
|
||
if edge.edge_type != "same_as":
|
||
continue
|
||
if not edge.metadata.get("active", True):
|
||
continue
|
||
if edge.source_id == entity_id:
|
||
out.add(edge.target_id)
|
||
elif edge.target_id == entity_id:
|
||
out.add(edge.source_id)
|
||
return out
|
||
|
||
def resolve_actor_cluster(self, entity_id: str) -> set[str]:
|
||
"""Return the connected component containing *entity_id* via active
|
||
``same_as`` edges — the set of entity ids that current coref evidence
|
||
treats as the same actor.
|
||
|
||
Reversible: deactivating a ``same_as`` edge (because the backing
|
||
coref hypothesis drops below 0.8) breaks the component, so this
|
||
always reflects the *current* state of the graph.
|
||
"""
|
||
if entity_id not in self.entities:
|
||
return set()
|
||
seen: set[str] = {entity_id}
|
||
frontier: list[str] = [entity_id]
|
||
while frontier:
|
||
cur = frontier.pop()
|
||
for nbr in self._active_same_as_neighbors(cur):
|
||
if nbr not in seen:
|
||
seen.add(nbr)
|
||
frontier.append(nbr)
|
||
return seen
|
||
|
||
def actor_clusters(self) -> list[dict]:
|
||
"""Group all entities into actor clusters via active ``same_as``.
|
||
|
||
Returns a list of ``{members: [...], identifiers: [...], coref_hypotheses: [...]}``
|
||
for the report agent and the orchestrator's cross-source views.
|
||
"""
|
||
unseen = set(self.entities.keys())
|
||
clusters: list[dict] = []
|
||
while unseen:
|
||
start = next(iter(unseen))
|
||
members = self.resolve_actor_cluster(start)
|
||
unseen -= members
|
||
# Aggregate identifiers across the cluster (deduped on type+normalized).
|
||
ident_seen: set[tuple[str, str]] = set()
|
||
idents: list[dict] = []
|
||
for eid in members:
|
||
for ident in self.entities[eid].identifiers:
|
||
key = (ident.get("type"), ident.get("normalized"))
|
||
if key in ident_seen:
|
||
continue
|
||
ident_seen.add(key)
|
||
idents.append({
|
||
"type": ident.get("type"),
|
||
"value": ident.get("value"),
|
||
"strong": ident.get("strong"),
|
||
"on_entity": eid,
|
||
})
|
||
coref_hyps = sorted({
|
||
e.metadata.get("backed_by", "")
|
||
for e in self.edges
|
||
if e.edge_type == "same_as"
|
||
and e.metadata.get("active", True)
|
||
and (e.source_id in members or e.target_id in members)
|
||
} - {""})
|
||
clusters.append({
|
||
"members": sorted(members),
|
||
"identifiers": idents,
|
||
"coref_hypotheses": coref_hyps,
|
||
})
|
||
return clusters
|
||
|
||
# ---- Lead management (same as old Blackboard) ----------------------------
|
||
|
||
async def add_lead(
|
||
self,
|
||
target_agent: str,
|
||
description: str,
|
||
priority: int = 5,
|
||
context: dict | None = None,
|
||
hypothesis_id: str | None = None,
|
||
proposed_by: str = "",
|
||
motivating_hypothesis: str = "",
|
||
expected_evidence_type: str = "",
|
||
round_number: int = 0,
|
||
) -> str:
|
||
async with self._lock:
|
||
lid = f"lead-{uuid.uuid4().hex[:8]}"
|
||
# Idempotency for strategist proposals: identical
|
||
# (motivating_hypothesis, expected_evidence_type, target_agent,
|
||
# source_id) triple should not be created twice — this guards
|
||
# against the "strategist loops on the same lead" failure mode.
|
||
if motivating_hypothesis and proposed_by == "strategist":
|
||
source_id = (context or {}).get("source_id", "")
|
||
for existing in self.leads:
|
||
if (
|
||
existing.proposed_by == "strategist"
|
||
and existing.motivating_hypothesis == motivating_hypothesis
|
||
and existing.expected_evidence_type == expected_evidence_type
|
||
and existing.target_agent == target_agent
|
||
and (existing.context or {}).get("source_id", "") == source_id
|
||
and existing.status in ("pending", "assigned")
|
||
):
|
||
return existing.id
|
||
self.leads.append(Lead(
|
||
id=lid,
|
||
target_agent=target_agent,
|
||
description=description,
|
||
priority=priority,
|
||
context=context or {},
|
||
hypothesis_id=hypothesis_id,
|
||
proposed_by=proposed_by,
|
||
motivating_hypothesis=motivating_hypothesis,
|
||
expected_evidence_type=expected_evidence_type,
|
||
round_number=round_number,
|
||
))
|
||
self._auto_save()
|
||
return lid
|
||
|
||
# ---- Investigation rounds (strategist loop) ----------------------------
|
||
|
||
async def start_investigation_round(
    self, round_number: int,
) -> str:
    """Open investigation round *round_number* and record its starting snapshot.

    Called by the orchestrator at the top of each strategist iteration.
    The new record stores the current status of every hypothesis plus the
    current phenomena and edge counts, so that
    ``complete_investigation_round`` can later compute the round's yield
    deltas.

    Returns:
        The round id, ``round-`` followed by the zero-padded 3-digit
        round number.
    """
    async with self._lock:
        round_id = f"round-{round_number:03d}"
        statuses_now = {}
        for hyp_id, hyp in self.hypotheses.items():
            statuses_now[hyp_id] = hyp.status
        record = InvestigationRound(
            id=round_id,
            round_number=round_number,
            started_at=datetime.now().isoformat(),
            hypothesis_status_snapshot_before=statuses_now,
            phenomena_count_before=len(self.phenomena),
            edges_count_before=len(self.edges),
        )
        self.investigation_rounds.append(record)
        self._auto_save()
        return round_id
|
||
|
||
async def complete_investigation_round(
|
||
self,
|
||
round_id: str,
|
||
strategist_action: str = "propose_leads",
|
||
leads_executed: list[str] | None = None,
|
||
decision_rationale: str = "",
|
||
) -> InvestigationRound | None:
|
||
"""Close a round, recording after-snapshot + which leads got executed.
|
||
|
||
Idempotent on already-closed rounds (returns the existing record).
|
||
"""
|
||
async with self._lock:
|
||
for r in self.investigation_rounds:
|
||
if r.id != round_id:
|
||
continue
|
||
if r.completed_at:
|
||
return r
|
||
r.completed_at = datetime.now().isoformat()
|
||
r.strategist_action = strategist_action
|
||
r.leads_executed = list(leads_executed or [])
|
||
r.leads_proposed = [
|
||
l.id for l in self.leads
|
||
if l.round_number == r.round_number
|
||
and l.proposed_by == "strategist"
|
||
]
|
||
r.hypothesis_status_snapshot_after = {
|
||
hid: h.status for hid, h in self.hypotheses.items()
|
||
}
|
||
r.phenomena_count_after = len(self.phenomena)
|
||
r.edges_count_after = len(self.edges)
|
||
r.decision_rationale = decision_rationale
|
||
self._auto_save()
|
||
return r
|
||
return None
|
||
|
||
def get_investigation_round(self, round_id: str) -> InvestigationRound | None:
|
||
for r in self.investigation_rounds:
|
||
if r.id == round_id:
|
||
return r
|
||
return None
|
||
|
||
def latest_round(self) -> InvestigationRound | None:
|
||
return self.investigation_rounds[-1] if self.investigation_rounds else None
|
||
|
||
def leads_from_round(self, round_number: int) -> list[Lead]:
|
||
return [l for l in self.leads if l.round_number == round_number]
|
||
|
||
async def get_pending_leads(self, agent_type: str | None = None) -> list[Lead]:
|
||
async with self._lock:
|
||
leads = [l for l in self.leads if l.status == "pending"]
|
||
if agent_type:
|
||
leads = [l for l in leads if l.target_agent == agent_type]
|
||
return sorted(leads, key=lambda l: l.priority)
|
||
|
||
async def mark_lead_completed(self, lead_id: str) -> None:
|
||
async with self._lock:
|
||
for lead in self.leads:
|
||
if lead.id == lead_id:
|
||
lead.status = "completed"
|
||
break
|
||
self._auto_save()
|
||
|
||
async def mark_lead_failed(self, lead_id: str, error: str = "") -> None:
|
||
async with self._lock:
|
||
for lead in self.leads:
|
||
if lead.id == lead_id:
|
||
lead.status = "failed"
|
||
lead.context["failure_reason"] = error
|
||
break
|
||
self._auto_save()
|
||
|
||
# ---- Investigation areas -------------------------------------------------
|
||
|
||
async def add_investigation_area(
|
||
self,
|
||
area: str,
|
||
description: str,
|
||
suggested_agent: str,
|
||
expected_keywords: list[str] | None = None,
|
||
expected_tools: list[str] | None = None,
|
||
priority: int = 5,
|
||
motivating_hypothesis_ids: list[str] | None = None,
|
||
created_by: str = "",
|
||
) -> tuple[str, bool]:
|
||
"""Add or merge an investigation area. Dedupe key is the `area` slug.
|
||
|
||
On collision, union the three list fields (keywords / tools /
|
||
motivating_hypothesis_ids); description / suggested_agent / priority
|
||
are preserved from the first writer (manual seed wins over LLM derive).
|
||
Returns (id, was_existing).
|
||
"""
|
||
async with self._lock:
|
||
for existing in self.investigation_areas.values():
|
||
if existing.area == area:
|
||
for kw in (expected_keywords or []):
|
||
if kw not in existing.expected_keywords:
|
||
existing.expected_keywords.append(kw)
|
||
for t in (expected_tools or []):
|
||
if t not in existing.expected_tools:
|
||
existing.expected_tools.append(t)
|
||
for hid in (motivating_hypothesis_ids or []):
|
||
if hid not in existing.motivating_hypothesis_ids:
|
||
existing.motivating_hypothesis_ids.append(hid)
|
||
self._auto_save()
|
||
return existing.id, True
|
||
|
||
aid = f"area-{area}"
|
||
self.investigation_areas[aid] = InvestigationArea(
|
||
id=aid,
|
||
area=area,
|
||
description=description,
|
||
suggested_agent=suggested_agent,
|
||
expected_keywords=list(expected_keywords or []),
|
||
expected_tools=list(expected_tools or []),
|
||
priority=priority,
|
||
motivating_hypothesis_ids=list(motivating_hypothesis_ids or []),
|
||
created_by=created_by,
|
||
created_at=datetime.now().isoformat(),
|
||
)
|
||
self._auto_save()
|
||
return aid, False
|
||
|
||
# ---- Tool invocation log -------------------------------------------------
|
||
|
||
async def record_tool_invocation(
    self,
    tool: str,
    args: dict,
    output: str,
    cached: bool = False,
) -> str:
    """Log a single tool call and return its generated invocation id.

    Source, agent and task id are taken from the graph's current run
    context (set by BaseAgent.run and set_active_source), which lets
    executors stay stateless. The record stores a shallow copy of ``args``
    and the SHA-256 hex digest of ``output`` encoded as UTF-8.
    """
    iid = f"inv-{uuid.uuid4().hex[:8]}"
    source = self.active_source
    source_id = source.id if source else ""
    args_copy = dict(args)
    digest = hashlib.sha256(output.encode("utf-8", errors="replace")).hexdigest()
    record = ToolInvocation(
        id=iid,
        tool=tool,
        args=args_copy,
        output=output,
        output_sha256=digest,
        agent=self._current_agent or "unknown",
        task_id=self._current_task_id or "",
        source_id=source_id,
        created_at=datetime.now().isoformat(),
        cached=cached,
    )
    async with self._lock:
        self.tool_invocations[iid] = record
        # Deliberately no auto-save here: rewriting the state file on every
        # tool call is too noisy; the next phenomenon write will flush.
    return iid
|
||
|
||
def validate_fact_grounding(
    self,
    fact: dict,
    agent: str,
    task_id: str,
) -> tuple[bool, str]:
    """Check a single verified_fact's grounding. Returns (ok, reason).

    Rules (DESIGN.md §4.4, refined after first end-to-end run):
      1. invocation_id must exist in self.tool_invocations.
      2. The invocation must have been made by `agent` within `task_id`.
      3. fact.value must appear in invocation.output — either as a
         strict substring, OR (loose-match fallback) once both sides
         are normalised via :func:`_normalize_for_grounding`
         (case-folded, whitespace-collapsed, path-sep unified).

    Auto-rescue (post first full-case run, 2026-05-20): when the cited
    invocation_id doesn't exist BUT `fact.value` does appear in exactly
    one of this agent/task's real invocations, the fact's
    ``invocation_id`` is silently rewritten to that real id. This heals
    the LLM's routine "I know which tool I read this from, I just
    mis-typed the inv id" failure mode without expanding what content
    can land grounded — the value still has to be present in a real
    invocation output. Multi-match is NOT auto-rescued: the list of
    candidate ids goes back to the LLM so it picks the right one.

    A non-string ``invocation_id`` (e.g. a list or dict emitted by the
    LLM) is treated like any other unknown id, so it goes through the
    same auto-rescue / retry-message path instead of raising
    ``TypeError`` on the dictionary lookup.

    Args:
        fact: The verified_fact dict; ``invocation_id`` and ``value`` are
            read, and ``invocation_id`` is rewritten in place on auto-rescue.
        agent: Name of the agent citing the invocation.
        task_id: Task scope of the citing agent; the scope check is
            skipped when either side's task id is empty.

    Returns:
        ``(True, "ok")``, ``(True, "ok-normalized")`` or
        ``(True, "ok-auto-rescued")`` on success; otherwise
        ``(False, reason)`` with a message intended for the retry prompt.
    """
    inv_id = fact.get("invocation_id", "")
    value = fact.get("value", "")
    if not isinstance(value, str) or not value:
        return False, "fact.value must be a non-empty string"
    if not inv_id:
        return False, "missing invocation_id"

    # Only string ids can be real keys; guarding the lookup keeps an
    # unhashable id (list/dict) from raising out of a validation routine.
    inv = self.tool_invocations.get(inv_id) if isinstance(inv_id, str) else None
    if inv is None:
        # Look for an auto-rescue candidate: same agent, same task,
        # whose output contains `value` (strict or normalised).
        candidates = self._find_grounding_candidates(value, agent, task_id)
        if len(candidates) == 1:
            real_id = candidates[0].id
            fact["invocation_id"] = real_id  # heal in place
            logger.info(
                "grounding auto-rescued for agent=%s: bogus %s rewritten to %s "
                "(value matches that invocation's output)",
                agent, inv_id, real_id,
            )
            return True, "ok-auto-rescued"
        if len(candidates) > 1:
            shown = ", ".join(c.id for c in candidates[:5])
            more = "" if len(candidates) <= 5 else f" (+{len(candidates)-5} more)"
            return False, (
                f"invocation_id {inv_id} not found in invocation log. "
                f"The value {value!r} appears in {len(candidates)} of your "
                f"invocations: {shown}{more} — cite the specific one in retry."
            )
        return False, f"invocation_id {inv_id} not found in invocation log"

    if inv.agent != agent:
        return False, (
            f"invocation {inv_id} was made by agent '{inv.agent}', "
            f"not '{agent}' — cannot be cited by a different agent"
        )
    if task_id and inv.task_id and inv.task_id != task_id:
        return False, (
            f"invocation {inv_id} was made in a different task scope "
            f"({inv.task_id}) — cite only invocations from your current task"
        )
    if value in inv.output:
        return True, "ok"
    # Loose fallback: normalised comparison absorbs case / whitespace /
    # path-sep differences but a genuinely absent value still fails.
    if _normalize_for_grounding(value) in _normalize_for_grounding(inv.output):
        return True, "ok-normalized"
    return False, (
        f"fact.value not found in invocation {inv_id} output — even after "
        "case/whitespace/path-sep normalisation. Copy a literal substring "
        "from that tool's result; if the content is a guess (device model, "
        "constructed path, label-joined value), move it into `interpretation` "
        "instead of `verified_facts`."
    )
|
||
|
||
def _find_grounding_candidates(
    self, value: str, agent: str, task_id: str,
) -> list[ToolInvocation]:
    """Collect this agent/task's invocations whose output contains `value`.

    Feeds grounding auto-rescue (exactly one match) and the informative
    retry message (several matches). An invocation qualifies when it was
    made by `agent`, is not from a different non-empty task scope, and its
    output contains `value` either as a strict substring or after both
    sides go through `_normalize_for_grounding` — the same two-step match
    `validate_fact_grounding` applies.
    """
    normalised_value = _normalize_for_grounding(value)

    def _in_scope(record) -> bool:
        if record.agent != agent:
            return False
        # The scope check only applies when both task ids are non-empty.
        return not (task_id and record.task_id and record.task_id != task_id)

    def _mentions_value(record) -> bool:
        if value in record.output:
            return True
        return normalised_value in _normalize_for_grounding(record.output)

    return [
        record
        for record in self.tool_invocations.values()
        if _in_scope(record) and _mentions_value(record)
    ]
|
||
|
||
def _format_recent_invocations(
    self, agent: str, task_id: str, limit: int = 8,
) -> str:
    """Build a citation menu of this agent/task's most recent invocations.

    The text is appended to GroundingError messages so the LLM sees the
    real ids on its retry attempt. Invocations are ordered newest first
    by ``created_at`` and capped at ``limit``; each entry shows the id,
    the tool name and the JSON-encoded args (cut to 140 characters).
    Returns an empty string when the agent has no matching invocations.
    """
    def _belongs(record) -> bool:
        if record.agent != agent:
            return False
        return not task_id or not record.task_id or record.task_id == task_id

    recent = sorted(
        (rec for rec in self.tool_invocations.values() if _belongs(rec)),
        key=lambda rec: rec.created_at,
        reverse=True,
    )
    if not recent:
        return ""

    menu = []
    for rec in recent[:limit]:
        rendered_args = json.dumps(rec.args, ensure_ascii=False)
        if len(rendered_args) > 140:
            rendered_args = rendered_args[:137] + "..."
        menu.append(f" - {rec.id} {rec.tool}({rendered_args})")

    heading = (
        "\nYour recent invocations in this task (cite one of these ids "
        "in `invocation_id`):\n"
    )
    return heading + "\n".join(menu)
|
||
|
||
# ---- Asset library -------------------------------------------------------
|
||
|
||
async def register_asset(
    self,
    inode: str,
    original_path: str,
    local_path: str,
    category: str,
    filename: str,
    size_bytes: int,
    extracted_by: str,
) -> tuple[str, bool]:
    """Register an extracted file, deduplicating by inode.

    Returns:
        ``(id, already_existed)``. When the inode is already indexed the
        stored id is returned with ``True`` and nothing is written;
        otherwise a new asset is stored, indexed by inode, auto-saved,
        and its id is returned with ``False``.
    """
    async with self._lock:
        known_id = self._inode_index.get(inode)
        if known_id is not None:
            return known_id, True

        aid = f"asset-{uuid.uuid4().hex[:8]}"
        self.asset_library[aid] = ExtractedAsset(
            id=aid,
            inode=inode,
            original_path=original_path,
            local_path=local_path,
            category=category,
            filename=filename,
            size_bytes=size_bytes,
            extracted_by=extracted_by,
            extracted_at=datetime.now().isoformat(),
        )
        self._inode_index[inode] = aid
        self._auto_save()
        return aid, False
|
||
|
||
def lookup_asset_by_inode(self, inode: str) -> ExtractedAsset | None:
    """Return the extracted asset registered for ``inode``, or ``None``.

    Synchronous read: the inode index is consulted first, then the asset
    library; no lock is taken.
    """
    asset_id = self._inode_index.get(inode)
    if not asset_id:
        return None
    return self.asset_library.get(asset_id)
|
||
|
||
def list_assets(self, category: str | None = None) -> list[str]:
    """Return ``summary()`` lines for the assets in the library.

    A truthy ``category`` restricts the result to assets whose
    ``category`` equals it; otherwise every asset is listed.
    """
    return [
        asset.summary()
        for asset in self.asset_library.values()
        if not category or asset.category == category
    ]
|
||
|
||
def query_assets(
    self,
    category: str | None = None,
    filename_pattern: str | None = None,
) -> list[ExtractedAsset]:
    """Return the assets that pass the optional filters.

    ``category`` must equal the asset's category exactly;
    ``filename_pattern`` is a case-insensitive substring of the asset's
    filename. A falsy filter is ignored.
    """
    needle = filename_pattern.lower() if filename_pattern else None

    def _accepts(asset) -> bool:
        if category and asset.category != category:
            return False
        if needle is not None and needle not in asset.filename.lower():
            return False
        return True

    return [asset for asset in self.asset_library.values() if _accepts(asset)]
|
||
|
||
# ---- Query methods (for agent tools) ------------------------------------
|
||
|
||
def list_phenomena(self, category: str | None = None) -> list[str]:
    """Return ``summary()`` lines for the phenomena in the graph.

    A truthy ``category`` restricts the result to phenomena whose
    ``category`` equals it; otherwise every phenomenon is listed.
    """
    return [
        phenomenon.summary()
        for phenomenon in self.phenomena.values()
        if not category or phenomenon.category == category
    ]
|
||
|
||
def get_phenomenon(self, ph_id: str) -> dict | None:
    """Return ``to_dict()`` of the phenomenon stored under ``ph_id``, or ``None`` if absent."""
    phenomenon = self.phenomena.get(ph_id)
    if not phenomenon:
        return None
    return phenomenon.to_dict()
|
||
|
||
def search_graph(self, keyword: str) -> list[str]:
    """Case-insensitive keyword search over phenomena, hypotheses and entities.

    A phenomenon matches when the keyword occurs in its title,
    interpretation and verified-fact values joined by single spaces; a
    hypothesis or entity matches on its title/name or description.
    Returns the ``summary()`` of every match — phenomena first, then
    hypotheses, then entities.
    """
    needle = keyword.lower()
    hits = []

    for ph in self.phenomena.values():
        fact_values = " ".join(
            str(fact.get("value", "")).lower() for fact in ph.verified_facts
        )
        searchable = " ".join(
            (ph.title.lower(), ph.interpretation.lower(), fact_values)
        )
        if needle in searchable:
            hits.append(ph.summary())

    hits.extend(
        hyp.summary()
        for hyp in self.hypotheses.values()
        if needle in hyp.title.lower() or needle in hyp.description.lower()
    )
    hits.extend(
        ent.summary()
        for ent in self.entities.values()
        if needle in ent.name.lower() or needle in ent.description.lower()
    )
    return hits
|
||
|
||
def get_related(
    self,
    node_id: str,
    edge_type: str | None = None,
    direction: str = "both",
) -> list[dict]:
    """List the nodes connected to ``node_id``.

    Args:
        node_id: The node whose neighbours are wanted.
        edge_type: When truthy, only edges of this type are followed.
        direction: ``"out"`` follows edges leaving the node, ``"in"``
            follows edges arriving at it, ``"both"`` does both (outgoing
            first). Any other value yields an empty list.

    Returns:
        One dict per resolvable neighbour with keys ``node`` (the
        neighbour's summary), ``edge_type``, ``direction``
        (``"outgoing"`` / ``"incoming"``) and ``metadata``. Edges whose
        far end cannot be resolved by ``self.get_node`` are skipped.
    """
    # (adjacency map, attribute naming the far end of the edge, label)
    sweeps = []
    if direction in ("out", "both"):
        sweeps.append((self._adj, "target_id", "outgoing"))
    if direction in ("in", "both"):
        sweeps.append((self._adj_rev, "source_id", "incoming"))

    related = []
    for adjacency, far_end, label in sweeps:
        for edge in adjacency.get(node_id, []):
            if edge_type and edge.edge_type != edge_type:
                continue
            neighbour = self.get_node(getattr(edge, far_end))
            if not neighbour:
                continue
            related.append({
                "node": neighbour.summary(),
                "edge_type": edge.edge_type,
                "direction": label,
                "metadata": edge.metadata,
            })
    return related
|
||
|
||
def get_hypothesis_status(self) -> list[str]:
    """Return the ``summary()`` of every hypothesis, in stored order."""
    summaries = []
    for hypothesis in self.hypotheses.values():
        summaries.append(hypothesis.summary())
    return summaries
|
||
|
||
def get_phenomena_by_category(self, category: str) -> list[Phenomenon]:
    """Return the phenomena whose ``category`` equals ``category``, in stored order."""
    selected = []
    for phenomenon in self.phenomena.values():
        if phenomenon.category == category:
            selected.append(phenomenon)
    return selected
|
||
|
||
def hypotheses_converged(self) -> bool:
    """Return ``True`` when no hypothesis has status ``"active"`` (also when there are none)."""
    for hypothesis in self.hypotheses.values():
        if hypothesis.status == "active":
            return False
    return True
|
||
|
||
def mark_remaining_inconclusive(self) -> None:
    """Switch every hypothesis still in status ``"active"`` to ``"inconclusive"``.

    Hypotheses in any other status are left untouched.
    """
    still_active = [h for h in self.hypotheses.values() if h.status == "active"]
    for hypothesis in still_active:
        hypothesis.status = "inconclusive"
|
||
|
||
# ---- Hypothesis × Evidence matrix (DESIGN.md §4.5) -----------------------
|
||
|
||
def hypothesis_evidence_matrix(self) -> dict:
    """Pivot every Phenomenon→Hypothesis edge into a structured matrix.

    Only edges running from a ``ph-`` id to a ``hyp-`` id whose type is a
    key of ``self.edge_log_lr`` are considered.

    Returns a dict with four keys:
      * ``hypotheses`` — id / title / confidence / log_odds / status for
        every hypothesis.
      * ``phenomena`` — id / title / category for each phenomenon that
        appears in at least one cell.
      * ``cells`` — maps ``"<hyp_id>|<ph_id>"`` to a *list* of edge_type
        strings (one phenomenon may link through several edge types,
        e.g. a manual override plus an LLM judge call).
      * ``counts_by_edge_type`` — ``{hyp_id: {edge_type: count}}``, with
        an empty dict for hypotheses that have no qualifying edges.

    Drives report rendering and gap selection.
    """
    pair_types = {}
    tallies = {hyp_id: {} for hyp_id in self.hypotheses}

    for edge in self.edges:
        if not edge.source_id.startswith("ph-"):
            continue
        if not edge.target_id.startswith("hyp-"):
            continue
        if edge.edge_type not in self.edge_log_lr:
            continue
        pair_types.setdefault((edge.target_id, edge.source_id), []).append(
            edge.edge_type
        )
        per_hypothesis = tallies.setdefault(edge.target_id, {})
        per_hypothesis[edge.edge_type] = per_hypothesis.get(edge.edge_type, 0) + 1

    hypothesis_rows = []
    for hyp in self.hypotheses.values():
        hypothesis_rows.append({
            "id": hyp.id,
            "title": hyp.title,
            "confidence": hyp.confidence,
            "log_odds": hyp.log_odds,
            "status": hyp.status,
        })

    linked_ids = {ph_id for _hyp_id, ph_id in pair_types}
    phenomenon_rows = []
    for ph in self.phenomena.values():
        if ph.id in linked_ids:
            phenomenon_rows.append(
                {"id": ph.id, "title": ph.title, "category": ph.category}
            )

    flat_cells = {}
    for (hyp_id, ph_id), types in pair_types.items():
        flat_cells[f"{hyp_id}|{ph_id}"] = types

    return {
        "hypotheses": hypothesis_rows,
        "phenomena": phenomenon_rows,
        "cells": flat_cells,
        "counts_by_edge_type": tallies,
    }
|
||
|
||
def hypothesis_evidence_matrix_markdown(self) -> str:
    """Render the hypothesis × evidence matrix as a compact markdown table.

    One row per hypothesis. Columns are the edge types from
    ``self.edge_log_lr`` in sorted order (each cell is the edge count),
    followed by log_odds, confidence and status — enough for the report
    agent to ground every hypothesis in its supporting and contradicting
    evidence at a glance.

    Hypothesis titles are made table-safe before rendering: line breaks
    are flattened to single spaces and ``|`` is escaped as ``\\|``, so a
    title can no longer split a row into extra columns or lines.

    Returns:
        The markdown table, or ``"(no hypotheses)"`` when the graph holds
        no hypotheses.
    """
    if not self.hypotheses:
        return "(no hypotheses)"
    matrix = self.hypothesis_evidence_matrix()
    edge_types = sorted(self.edge_log_lr.keys())
    header = (
        "| Hypothesis | "
        + " | ".join(edge_types)
        + " | log_odds | conf | status |"
    )
    sep = (
        "|---|"
        + "|".join(["---:"] * len(edge_types))
        + "|---:|---:|---|"
    )
    rows = [header, sep]
    for h in matrix["hypotheses"]:
        counts = matrix["counts_by_edge_type"].get(h["id"], {})
        cells = [str(counts.get(et, 0)) for et in edge_types]
        # A raw pipe or newline in the title would break the table row.
        title = " ".join(str(h["title"]).splitlines()).replace("|", "\\|")
        rows.append(
            f"| {title} | "
            + " | ".join(cells)
            + f" | {h['log_odds']:+.2f} | {h['confidence']:.2f} | {h['status']} |"
        )
    return "\n".join(rows)
|
||
|
||
# ---- Summary (lightweight, for system prompt) ----------------------------
|
||
|
||
def stats_summary(self) -> str:
    """Return a one-line count summary of the graph for the system prompt.

    Covers phenomena, hypotheses (with the number still ``"active"``),
    entities, edges, extracted assets and pending leads.
    """
    active_count = len(
        [hyp for hyp in self.hypotheses.values() if hyp.status == "active"]
    )
    pending_count = len([lead for lead in self.leads if lead.status == "pending"])
    graph_part = (
        f"Graph: {len(self.phenomena)} phenomena, "
        f"{len(self.hypotheses)} hypotheses ({active_count} active), "
        f"{len(self.entities)} entities, {len(self.edges)} edges. "
    )
    asset_part = f"Asset library: {len(self.asset_library)} extracted files. "
    lead_part = f"Pending leads: {pending_count}."
    return graph_part + asset_part + lead_part
|