feat(refit): complete S1-S6 — case abstraction, grounding, log-odds, plugins, coref, multi-source
Consolidates the long-running refit work (DESIGN.md as authoritative spec)
into a single baseline commit. Six stages landed together:
S1 Case + EvidenceSource abstraction; tools parameterised by source_id
(case.py, main.py multi-source bootstrap, .bin extension support)
S2 Grounding gateway in add_phenomenon: verified_facts cite real
ToolInvocation ids; substring / normalised match enforced; agent +
task scope checked. Phenomenon.description split into verified_facts
(grounded) + interpretation (free text). An [invocation: inv-xxx]
prefix is added to every wrapped tool result so the LLM can cite it.
S3 Confidence as additive log-odds: edge_type → log10(LR) calibration
table; commutative updates; supported / refuted thresholds derived
from log_odds; hypothesis × evidence matrix view.
S4 iOS plugin: unzip_archive + parse_plist / sqlite_tables /
sqlite_query / parse_ios_keychain / read_idevice_info;
IOSArtifactAgent; SOURCE_TYPE_AGENTS routing.
S5 Cross-source entity resolution: typed identifiers on Entity,
observe_identity gateway, auto coref hypothesis with shared /
conflicting strong/weak LR edges, reversible same_as edges,
actor_clusters() view.
S6 Android partition probe + AndroidArtifactAgent; MediaAgent with
OCR fallback; orchestrator Phase 1 iterates every analysable
source; platform-aware get_triage_agent_type; ReportAgent renders
actor clusters + per-source breakdown.
142 unit tests / 1 skipped — full coverage of the new gateway, log-odds
math, coref hypothesis fall-out, and orchestrator multi-source dispatch.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
235
base_agent.py
235
base_agent.py
@@ -5,6 +5,7 @@ from __future__ import annotations
|
||||
import json
|
||||
import logging
|
||||
import time
|
||||
import uuid
|
||||
from typing import Any
|
||||
|
||||
from evidence_graph import EvidenceGraph
|
||||
@@ -36,7 +37,9 @@ class BaseAgent:
|
||||
# forced retry with an explicit "you forgot to record" instruction.
|
||||
# Subclasses override to declare their own recording responsibility
|
||||
# (timeline → add_temporal_edge, hypothesis → add_hypothesis, report → save_report).
|
||||
mandatory_record_tools: tuple[str, ...] = ("add_phenomenon",)
|
||||
# observe_identity (S5) counts as a recording too — it writes through the
|
||||
# same grounding gateway and produces an identity_observation phenomenon.
|
||||
mandatory_record_tools: tuple[str, ...] = ("add_phenomenon", "observe_identity")
|
||||
|
||||
# Tools whose invocation ends the run immediately. After any terminal tool
|
||||
# is called, tool_call_loop returns with that tool's result text as
|
||||
@@ -110,8 +113,23 @@ class BaseAgent:
|
||||
f" Call investigation tools (list_directory, parse_registry_key, etc.) to gather data.\n"
|
||||
f" Only extract_file for forensically relevant files (user data, logs, configs, hives) — NOT system DLLs or OS files.\n"
|
||||
f" Create add_lead for anything outside your expertise.\n\n"
|
||||
f"Phase B — RECORD PHENOMENA:\n"
|
||||
f" For EACH significant finding from Phase A, call add_phenomenon.\n"
|
||||
f"Phase B — RECORD PHENOMENA (GROUNDED):\n"
|
||||
f" For EACH significant finding from Phase A, call add_phenomenon with:\n"
|
||||
f" * interpretation: your analysis — free text, NOT verified.\n"
|
||||
f" * verified_facts: one entry per concrete atom (path, timestamp,\n"
|
||||
f" inode, hash, identifier, count) you want recorded as truth.\n"
|
||||
f" Each entry MUST have:\n"
|
||||
f" - type: e.g. 'path', 'timestamp', 'inode', 'hash', 'identifier', 'count'\n"
|
||||
f" - value: a VERBATIM substring from the tool output\n"
|
||||
f" - invocation_id: the inv-xxx ID from the '[invocation: inv-xxx]'\n"
|
||||
f" header at the top of the tool result that produced this value\n"
|
||||
f" IDENTIFIERS — call observe_identity (in ADDITION to add_phenomenon)\n"
|
||||
f" whenever you see an email, phone number, Apple ID, IMEI, wallet\n"
|
||||
f" address, MAC, UDID, persistent nickname, or display name. Same\n"
|
||||
f" grounding contract: value must be verbatim in the cited tool\n"
|
||||
f" output. This is HOW cross-source attribution gets built — without\n"
|
||||
f" it, we can't tell whether the Apple ID in keychain belongs to the\n"
|
||||
f" same person as the Windows account on the USB.\n"
|
||||
f" Do NOT call link_to_entity yet — just record all phenomena first.\n\n"
|
||||
f"Phase C — LINK ENTITIES:\n"
|
||||
f" FIRST call list_phenomena to get the current IDs — do NOT rely on memory.\n"
|
||||
@@ -125,20 +143,22 @@ class BaseAgent:
|
||||
f"- You MUST call add_phenomenon for EVERY significant finding BEFORE you stop.\n"
|
||||
f"- NEGATIVE findings count too. If you searched X (a directory, a pattern, "
|
||||
f"a registry key) and found NOTHING, that absence IS evidence — call "
|
||||
f"add_phenomenon with a 'No matches for X' title and the search scope in "
|
||||
f"raw_data. Negative findings constrain the hypothesis space and prevent "
|
||||
f"the next agent from wasting time re-searching.\n"
|
||||
f"add_phenomenon with a 'No matches for X' title, the search scope in "
|
||||
f"raw_data, and cite the search tool's invocation_id (verified_facts may "
|
||||
f"be empty for a true negative; the cited invocation in source_tool still "
|
||||
f"anchors it). Negative findings constrain the hypothesis space.\n"
|
||||
f"- If you stop without having called add_phenomenon at least once, the task "
|
||||
f"is FAILED and a forced retry will fire.\n"
|
||||
f"- Include exact file paths, inode numbers, timestamps, and the source_tool "
|
||||
f"that produced each finding.\n\n"
|
||||
f"ANTI-HALLUCINATION RULES — STRICTLY ENFORCED:\n"
|
||||
f"- ONLY record findings that appear VERBATIM in tool results you received\n"
|
||||
f"- NEVER invent or guess timestamps, file paths, inode numbers, or program names\n"
|
||||
f"- If tool output was truncated, state '[truncated]' — do NOT fill in the missing data\n"
|
||||
f"- If you are unsure whether something exists, call a tool to verify or create a lead — do NOT assume\n"
|
||||
f"- Quote exact strings from tool output when recording evidence descriptions\n"
|
||||
f"- Do NOT fabricate execution timestamps — only report timestamps returned by tools"
|
||||
f"is FAILED and a forced retry will fire.\n\n"
|
||||
f"GROUNDING GATEWAY — STRUCTURALLY ENFORCED:\n"
|
||||
f"- Every tool result begins with '[invocation: inv-xxxxxxxx]' — that ID\n"
|
||||
f" is what you cite in each fact's invocation_id.\n"
|
||||
f"- fact.value must be a substring of the cited invocation's output.\n"
|
||||
f" Case, whitespace, and path-separator (/ ↔ \\) variants are tolerated;\n"
|
||||
f" anything else fabricated is REJECTED with a per-fact reason.\n"
|
||||
f"- On REJECTED: quote the literal text from the output (or drop the\n"
|
||||
f" fact), and put guesses / inferred paths / model names in\n"
|
||||
f" `interpretation` instead. Then call add_phenomenon again.\n"
|
||||
f"- You may cite ONLY invocations made within THIS task."
|
||||
)
|
||||
|
||||
async def run(self, task: str, lead_id: str | None = None) -> str:
|
||||
@@ -146,6 +166,11 @@ class BaseAgent:
|
||||
_log(task, event="agent_start", agent=self.name)
|
||||
self.graph.agent_status[self.name] = "running"
|
||||
self.graph._current_agent = self.name
|
||||
# Fresh task scope per agent run. Used by the grounding gateway to
|
||||
# check that facts in add_phenomenon cite invocations made *within
|
||||
# this run* — preventing the agent from forwarding stale IDs from
|
||||
# earlier work or another agent.
|
||||
self.graph._current_task_id = f"task-{uuid.uuid4().hex[:8]}"
|
||||
self._current_lead_id = lead_id
|
||||
|
||||
self._register_graph_tools()
|
||||
@@ -350,20 +375,67 @@ class BaseAgent:
|
||||
self.register_tool(
|
||||
name="add_phenomenon",
|
||||
description=(
|
||||
"Record a forensic finding (phenomenon) on the evidence graph. "
|
||||
"You MUST specify source_tool: the name of the tool call that produced this finding."
|
||||
"Record a forensic finding on the evidence graph. The finding is "
|
||||
"split into provenance-bound atoms (verified_facts) and free-form "
|
||||
"analysis (interpretation). Each fact MUST cite the invocation_id "
|
||||
"of a tool call you made in THIS task — the gateway checks every "
|
||||
"fact's value against that call's real output, byte-for-byte. "
|
||||
"Any fact that fails grounding causes the whole record to be "
|
||||
"rejected with a list of failures; fix the facts and call again."
|
||||
),
|
||||
input_schema={
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"category": {"type": "string", "description": "Category of the finding."},
|
||||
"title": {"type": "string", "description": "Short title."},
|
||||
"description": {"type": "string", "description": "Detailed description. Quote exact data from tool output."},
|
||||
"interpretation": {
|
||||
"type": "string",
|
||||
"description": (
|
||||
"Free-form analysis text — your reasoning, why this "
|
||||
"matters, what it implies. NOT verified by the gateway. "
|
||||
"Rendered in reports as 'agent analysis', not truth."
|
||||
),
|
||||
},
|
||||
"verified_facts": {
|
||||
"type": "array",
|
||||
"description": (
|
||||
"Atoms you want preserved as ground truth. Each must "
|
||||
"appear verbatim in the cited tool output."
|
||||
),
|
||||
"items": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"type": {
|
||||
"type": "string",
|
||||
"description": (
|
||||
"Kind of fact: path, timestamp, inode, "
|
||||
"hash, identifier, count, raw, ..."
|
||||
),
|
||||
},
|
||||
"value": {
|
||||
"type": "string",
|
||||
"description": (
|
||||
"Verbatim substring from the cited tool "
|
||||
"output. The gateway does a literal "
|
||||
"string-in-string check — no paraphrasing."
|
||||
),
|
||||
},
|
||||
"invocation_id": {
|
||||
"type": "string",
|
||||
"description": (
|
||||
"ID from the '[invocation: inv-xxx]' header "
|
||||
"of the tool call that produced this value."
|
||||
),
|
||||
},
|
||||
},
|
||||
"required": ["type", "value", "invocation_id"],
|
||||
},
|
||||
},
|
||||
"raw_data": {"type": "object", "description": "Structured raw data supporting this finding."},
|
||||
"timestamp": {"type": "string", "description": "Timestamp if any. ONLY use timestamps from tool output."},
|
||||
"source_tool": {"type": "string", "description": "Name of the tool that produced this (e.g. 'list_directory')."},
|
||||
},
|
||||
"required": ["category", "title", "description", "source_tool"],
|
||||
"required": ["category", "title", "source_tool"],
|
||||
},
|
||||
executor=self._add_phenomenon,
|
||||
)
|
||||
@@ -414,6 +486,67 @@ class BaseAgent:
|
||||
executor=self._link_to_entity,
|
||||
)
|
||||
|
||||
self.register_tool(
|
||||
name="observe_identity",
|
||||
description=(
|
||||
"Record a typed identifier (email / phone / Apple ID / IMEI / "
|
||||
"wallet address / nickname / display name / …) for an entity. "
|
||||
"Goes through the same grounding gateway as add_phenomenon — "
|
||||
"value MUST be a verbatim substring of the cited tool output. "
|
||||
"After attachment, the engine automatically proposes / "
|
||||
"strengthens / weakens cross-source coreference hypotheses "
|
||||
"between this entity and any others carrying the same or "
|
||||
"conflicting identifiers. This is how 'is the Apple ID in iOS "
|
||||
"keychain the same person as the Windows login name?' gets "
|
||||
"answered. Call this in ADDITION to add_phenomenon for "
|
||||
"identifier-bearing findings."
|
||||
),
|
||||
input_schema={
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"entity_name": {"type": "string", "description": "Human-readable entity name (e.g. 'LEUNG YL', 'alice@example.com')."},
|
||||
"entity_type": {
|
||||
"type": "string",
|
||||
"enum": ["person", "program", "file", "host", "ip_address"],
|
||||
"description": "Kind of entity this identifier belongs to (usually 'person').",
|
||||
},
|
||||
"identifier_type": {
|
||||
"type": "string",
|
||||
"description": (
|
||||
"Strong (near-unique): email, phone_number, imei, "
|
||||
"imsi, apple_id, icloud_id, google_account, "
|
||||
"wallet_address, udid, mac_address, device_serial. "
|
||||
"Weak (free-form, may collide): nickname, "
|
||||
"display_name, username, screen_name."
|
||||
),
|
||||
},
|
||||
"value": {
|
||||
"type": "string",
|
||||
"description": (
|
||||
"The identifier value, quoted VERBATIM from the "
|
||||
"tool output you cite in invocation_id."
|
||||
),
|
||||
},
|
||||
"invocation_id": {
|
||||
"type": "string",
|
||||
"description": (
|
||||
"ID from the '[invocation: inv-xxx]' header of "
|
||||
"the tool call that surfaced this identifier."
|
||||
),
|
||||
},
|
||||
"source_tool": {
|
||||
"type": "string",
|
||||
"description": "Name of the tool that produced the identifier.",
|
||||
},
|
||||
},
|
||||
"required": [
|
||||
"entity_name", "entity_type", "identifier_type",
|
||||
"value", "invocation_id",
|
||||
],
|
||||
},
|
||||
executor=self._observe_identity,
|
||||
)
|
||||
|
||||
# ---- Tool executors -----------------------------------------------------
|
||||
|
||||
async def _list_phenomena(self, category: str | None = None) -> str:
|
||||
@@ -453,16 +586,29 @@ class BaseAgent:
|
||||
self,
|
||||
category: str,
|
||||
title: str,
|
||||
description: str,
|
||||
interpretation: str = "",
|
||||
verified_facts: list[dict] | None = None,
|
||||
raw_data: dict | None = None,
|
||||
timestamp: str | None = None,
|
||||
source_tool: str = "",
|
||||
# Back-compat: older prompts (and accidental LLM emissions) may pass
|
||||
# ``description``; treat it as ``interpretation`` rather than failing.
|
||||
description: str | None = None,
|
||||
) -> str:
|
||||
if description and not interpretation:
|
||||
interpretation = description
|
||||
# GroundingError propagates: llm_client._execute_single_tool turns
|
||||
# raised exceptions into "Error executing add_phenomenon: <msg>" tool
|
||||
# results the LLM sees, and _wrap_record_executor does NOT increment
|
||||
# the mandatory-record counter (the increment only runs after a
|
||||
# successful return), so the forced-retry mechanism still fires if
|
||||
# the agent never lands a grounded phenomenon.
|
||||
pid, merged = await self.graph.add_phenomenon(
|
||||
source_agent=self.name,
|
||||
category=category,
|
||||
title=title,
|
||||
description=description,
|
||||
interpretation=interpretation,
|
||||
verified_facts=verified_facts,
|
||||
raw_data=raw_data,
|
||||
timestamp=timestamp,
|
||||
source_tool=source_tool,
|
||||
@@ -508,6 +654,51 @@ class BaseAgent:
|
||||
status = "linked to existing" if existing else "created and linked"
|
||||
return f"Entity {status}: {entity_name} ({entity_type}) ←[{edge_type}]— {phenomenon_id}"
|
||||
|
||||
async def _observe_identity(
    self,
    entity_name: str,
    entity_type: str,
    identifier_type: str,
    value: str,
    invocation_id: str,
    source_tool: str = "",
) -> str:
    """Record a typed identifier on an entity and summarise the outcome.

    Forwards the observation to ``self.graph.observe_identity`` (tagging it
    with ``self.name`` as the source agent) and renders the returned dict
    as a newline-joined text summary.

    Args:
        entity_name: Human-readable name of the entity.
        entity_type: Kind of entity the identifier belongs to.
        identifier_type: Kind of identifier being recorded.
        value: The identifier value.
        invocation_id: ID of the tool invocation cited as the source.
        source_tool: Name of the tool that produced the identifier.

    Returns:
        A multi-line summary: the observed identifier and entity id, then
        either the observation phenomenon id (when the result flags
        ``new_identifier``) or an "already recorded" note, followed by one
        line per entry in ``coref_proposals`` and one line per conflict
        listed on that proposal.
    """
    # GroundingError / ValueError propagate to llm_client's per-tool
    # exception handler, which formats them back to the LLM. That keeps
    # the mandatory-record counter honest — only a successful return
    # triggers the increment in _wrap_record_executor.
    result = await self.graph.observe_identity(
        entity_name=entity_name,
        entity_type=entity_type,
        identifier_type=identifier_type,
        value=value,
        source_agent=self.name,
        source_tool=source_tool,
        invocation_id=invocation_id,
    )

    lines = [
        f"Identity observed: {identifier_type}={value} "
        f"on entity {result['entity_id']} ({entity_name})."
    ]

    if result.get("new_identifier"):
        lines.append(
            f" Observation phenomenon: {result['phenomenon_id']}"
        )
    else:
        lines.append(" (identifier already recorded on this entity — idempotent)")

    # Each proposal is reported on its own line, followed by any conflicts
    # attached to that proposal.
    for prop in result.get("coref_proposals", []):
        lines.append(
            f" → Coref candidate: {prop['other_entity_id']} via "
            f"{prop['match']['edge_type']} (conf={prop['confidence']:.2f}, "
            f"hypothesis={prop['hypothesis_id']})"
        )
        for c in prop.get("conflicts", []):
            lines.append(
                f" ⚠ conflict on {c['type']}: "
                f"{c['new_value']} vs {c['other_value']}"
            )

    return "\n".join(lines)
|
||||
|
||||
async def _list_assets(self, category: str | None = None) -> str:
|
||||
results = self.graph.list_assets(category)
|
||||
if not results:
|
||||
|
||||
Reference in New Issue
Block a user