refactor: native tool calling + generic forced-retry + terminal exit

- llm_client: switch tool_call_loop from text-based <tool_call> regex
  to OpenAI-native tools=[...] / structured tool_calls field; accumulate
  delta.reasoning_content for DeepSeek thinking-mode echo-back; fold
  preserves system msg and aligns boundary to never orphan role:tool
- base_agent: generic forced-retry via mandatory_record_tools class attr
  (filesystem -> add_phenomenon, timeline -> add_temporal_edge,
  hypothesis -> add_hypothesis, report -> save_report); count via
  executor wrapper
- terminal_tools class attr + loop short-circuit: when a terminal tool
  is called, loop exits with its raw return as final_text. ReportAgent
  declares save_report as terminal - replaces the <answer>-tag stop
  signal that native tool calling broke
- _execute_*: return (raw, formatted) - terminal exit uses untruncated
  raw, conversation history uses 3000-char-capped formatted
- evidence_graph + orchestrator: LLM-derived InvestigationArea support
  (hypothesis-driven coverage check, replaces hardcoded _AREA_KEYWORDS /
  _AREA_TOOLS); manual yaml block kept as optional seed
- strip <answer> references from agent prompts (no longer load-bearing)

Verified on CFReDS image across 4 smoke runs: 0 JSON parse failures
(was 3); 22 temporal edges from Phase 4 (was 0); ReportAgent exits via
save_report (was max_iterations regression). 78/78 unit tests pass.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
BattleTag
2026-05-13 13:51:19 +08:00
parent 0a2b344c84
commit 444d58726a
9 changed files with 1356 additions and 298 deletions

View File

@@ -197,6 +197,41 @@ class Lead:
return cls(**d)
@dataclass
class InvestigationArea:
    """One line of inquiry used to confirm or refute one or more hypotheses.

    Areas are derived by the orchestrator from the active hypotheses after
    Phase 2, and may additionally be seeded from
    ``config.yaml:investigation_areas`` as an optional manual override.
    Because every area brings its own keywords and expected tools, the
    gap-analysis coverage check can stay generic instead of relying on
    hard-coded constants.
    """

    # Identifier of the form "area-{slug}".
    id: str
    # snake_case slug; also serves as the dedupe key.
    area: str
    description: str
    # filesystem / registry / communication / network / timeline
    suggested_agent: str
    expected_keywords: list[str] = field(default_factory=list)
    expected_tools: list[str] = field(default_factory=list)
    # 1 (highest) - 10 (lowest)
    priority: int = 5
    motivating_hypothesis_ids: list[str] = field(default_factory=list)
    # Provenance tag: "manual" | "llm_derive" | "fallback"
    created_by: str = ""
    created_at: str = ""

    def to_dict(self) -> dict:
        """Return all fields as a plain dict (via ``dataclasses.asdict``)."""
        return asdict(self)

    @classmethod
    def from_dict(cls, d: dict) -> InvestigationArea:
        """Build an instance from a mapping keyed by field name."""
        return cls(**d)

    def summary(self) -> str:
        """Return a one-line digest: slug, priority, agent, hypothesis count."""
        motivating_count = len(self.motivating_hypothesis_ids)
        head = f"[{self.area}] P{self.priority} agent={self.suggested_agent} "
        return head + f"(motivating: {motivating_count})"
@dataclass
class ExtractedAsset:
"""A file extracted from the disk image and tracked in the asset library."""
@@ -270,6 +305,11 @@ class EvidenceGraph:
self.asset_library: dict[str, ExtractedAsset] = {}
self._inode_index: dict[str, str] = {} # inode → asset_id
# Investigation areas — derived from hypotheses (LLM) and/or seeded
# from config.yaml:investigation_areas (manual override). Drives the
# gap-analysis coverage check.
self.investigation_areas: dict[str, InvestigationArea] = {}
# Set by BaseAgent.run() before each agent execution
self._current_agent: str = ""
@@ -295,6 +335,9 @@ class EvidenceGraph:
"leads": [l.to_dict() for l in self.leads],
"agent_status": dict(self.agent_status),
"asset_library": {aid: a.to_dict() for aid, a in self.asset_library.items()},
"investigation_areas": {
aid: a.to_dict() for aid, a in self.investigation_areas.items()
},
"saved_at": datetime.now().isoformat(),
}
tmp = self._persist_path.with_suffix(".tmp")
@@ -345,6 +388,10 @@ class EvidenceGraph:
asset = ExtractedAsset.from_dict(a_data)
graph.asset_library[aid] = asset
graph._inode_index[asset.inode] = aid
graph.investigation_areas = {
aid: InvestigationArea.from_dict(a)
for aid, a in data.get("investigation_areas", {}).items()
}
graph._rebuild_adjacency()
logger.info(
"EvidenceGraph restored: %d phenomena, %d hypotheses, %d entities, "
@@ -656,6 +703,57 @@ class EvidenceGraph:
break
self._auto_save()
# ---- Investigation areas -------------------------------------------------
async def add_investigation_area(
    self,
    area: str,
    description: str,
    suggested_agent: str,
    expected_keywords: list[str] | None = None,
    expected_tools: list[str] | None = None,
    priority: int = 5,
    motivating_hypothesis_ids: list[str] | None = None,
    created_by: str = "",
) -> tuple[str, bool]:
    """Register an investigation area, merging into an existing one by slug.

    The ``area`` slug is the dedupe key. If an area with the same slug is
    already stored, only its three list fields (keywords, tools, motivating
    hypothesis ids) are extended with the not-yet-present incoming values;
    its description, suggested agent and priority are left exactly as the
    first writer set them (so a manual seed wins over a later LLM-derived
    entry). Otherwise a new area is stored under the id ``area-{slug}``.

    Returns:
        ``(id, was_existing)`` - the id of the stored area and whether the
        call merged into a pre-existing entry.
    """
    incoming_keywords = expected_keywords or []
    incoming_tools = expected_tools or []
    incoming_hypotheses = motivating_hypothesis_ids or []

    def _extend_unique(target: list[str], additions: list[str]) -> None:
        # Order-preserving union: append only values not already present.
        for value in additions:
            if value not in target:
                target.append(value)

    async with self._lock:
        # First stored area whose slug matches, if any.
        match = next(
            (
                candidate
                for candidate in self.investigation_areas.values()
                if candidate.area == area
            ),
            None,
        )

        if match is not None:
            _extend_unique(match.expected_keywords, incoming_keywords)
            _extend_unique(match.expected_tools, incoming_tools)
            _extend_unique(match.motivating_hypothesis_ids, incoming_hypotheses)
            self._auto_save()
            return match.id, True

        new_id = f"area-{area}"
        # Copy the incoming lists so the stored area never aliases the
        # caller's list objects.
        self.investigation_areas[new_id] = InvestigationArea(
            id=new_id,
            area=area,
            description=description,
            suggested_agent=suggested_agent,
            expected_keywords=list(incoming_keywords),
            expected_tools=list(incoming_tools),
            priority=priority,
            motivating_hypothesis_ids=list(incoming_hypotheses),
            created_by=created_by,
            created_at=datetime.now().isoformat(),
        )
        self._auto_save()
        return new_id, False
# ---- Asset library -------------------------------------------------------
async def register_asset(