refactor: native tool calling + generic forced-retry + terminal exit

- llm_client: switch tool_call_loop from text-based <tool_call> regex to OpenAI-native tools=[...] / structured tool_calls field; accumulate delta.reasoning_content for DeepSeek thinking-mode echo-back; fold preserves the system message and aligns its boundary so it never orphans a role:tool message
- base_agent: generic forced-retry via mandatory_record_tools class attr (filesystem -> add_phenomenon, timeline -> add_temporal_edge, hypothesis -> add_hypothesis, report -> save_report); count via executor wrapper
- terminal_tools class attr + loop short-circuit: when a terminal tool is called, the loop exits with its raw return as final_text. ReportAgent declares save_report as terminal — this replaces the <answer>-tag stop signal that native tool calling broke
- _execute_*: return (raw, formatted) — terminal exit uses the untruncated raw value, conversation history uses the 3000-char-capped formatted value
- evidence_graph + orchestrator: LLM-derived InvestigationArea support (hypothesis-driven coverage check, replaces hardcoded _AREA_KEYWORDS / _AREA_TOOLS); manual yaml block kept as optional seed
- strip <answer> references from agent prompts (no longer load-bearing)

Verified on CFReDS image across 4 smoke runs: 0 JSON parse failures (was 3); 22 temporal edges from Phase 4 (was 0); ReportAgent exits via save_report (was max_iterations regression). 78/78 unit tests pass.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-13 13:51:19 +08:00
parent 0a2b344c84
commit 444d58726a
9 changed files with 1356 additions and 298 deletions
--- a/evidence_graph.py
+++ b/evidence_graph.py
@@ -197,6 +197,41 @@ class Lead:
        return cls(**d)


+@dataclass
+class InvestigationArea:
+    """An area to investigate to confirm/refute one or more hypotheses.
+
+    Derived by the orchestrator from active hypotheses after Phase 2; also
+    seeded from config.yaml:investigation_areas as an optional manual
+    override. Each area carries its own keywords + expected tools so the
+    gap-analysis coverage check is generic, not tied to hard-coded constants.
+    """
+
+    id: str                                     # "area-{slug}"
+    area: str                                   # snake_case slug (dedupe key)
+    description: str
+    suggested_agent: str                        # filesystem / registry / communication / network / timeline
+    expected_keywords: list[str] = field(default_factory=list)  # order-preserving union on slug collision
+    expected_tools: list[str] = field(default_factory=list)  # order-preserving union on slug collision
+    priority: int = 5                           # 1 (highest) - 10 (lowest)
+    motivating_hypothesis_ids: list[str] = field(default_factory=list)  # order-preserving union on slug collision
+    created_by: str = ""                        # "manual" | "llm_derive" | "fallback"
+    created_at: str = ""                        # ISO-8601 string; "" until set by the creator
+
+    def to_dict(self) -> dict:
+        return asdict(self)  # plain dict of every field; stored under "investigation_areas" on save
+
+    @classmethod
+    def from_dict(cls, d: dict) -> InvestigationArea:
+        return cls(**d)  # keys must match the field names; unknown or missing required keys raise TypeError
+
+    def summary(self) -> str:
+        return (  # one-line digest: slug, priority, suggested agent, motivating-hypothesis count
+            f"[{self.area}] P{self.priority} agent={self.suggested_agent} "
+            f"(motivating: {len(self.motivating_hypothesis_ids)})"
+        )
+
+
@dataclass
 class ExtractedAsset:
    """A file extracted from the disk image and tracked in the asset library."""
@@ -270,6 +305,11 @@ class EvidenceGraph:
        self.asset_library: dict[str, ExtractedAsset] = {}
        self._inode_index: dict[str, str] = {}    # inode → asset_id

+        # Investigation areas — derived from hypotheses (LLM) and/or seeded
+        # from config.yaml:investigation_areas (manual override). Drives the
+        # gap-analysis coverage check.
+        self.investigation_areas: dict[str, InvestigationArea] = {}
+
        # Set by BaseAgent.run() before each agent execution
        self._current_agent: str = ""

@@ -295,6 +335,9 @@ class EvidenceGraph:
                "leads": [l.to_dict() for l in self.leads],
                "agent_status": dict(self.agent_status),
                "asset_library": {aid: a.to_dict() for aid, a in self.asset_library.items()},
+                "investigation_areas": {
+                    aid: a.to_dict() for aid, a in self.investigation_areas.items()
+                },
                "saved_at": datetime.now().isoformat(),
            }
            tmp = self._persist_path.with_suffix(".tmp")
@@ -345,6 +388,10 @@ class EvidenceGraph:
            asset = ExtractedAsset.from_dict(a_data)
            graph.asset_library[aid] = asset
            graph._inode_index[asset.inode] = aid
+        graph.investigation_areas = {
+            aid: InvestigationArea.from_dict(a)
+            for aid, a in data.get("investigation_areas", {}).items()
+        }
        graph._rebuild_adjacency()
        logger.info(
            "EvidenceGraph restored: %d phenomena, %d hypotheses, %d entities, "
@@ -656,6 +703,57 @@ class EvidenceGraph:
                    break
            self._auto_save()

+    # ---- Investigation areas -------------------------------------------------
+
+    async def add_investigation_area(
+        self,
+        area: str,
+        description: str,
+        suggested_agent: str,
+        expected_keywords: list[str] | None = None,
+        expected_tools: list[str] | None = None,
+        priority: int = 5,
+        motivating_hypothesis_ids: list[str] | None = None,
+        created_by: str = "",
+    ) -> tuple[str, bool]:
+        """Add or merge an investigation area. Dedupe key is the `area` slug.
+
+        On collision, union the three list fields (keywords / tools /
+        motivating_hypothesis_ids); description / suggested_agent / priority
+        are preserved from the first writer (manual seed wins over LLM derive).
+        Returns (id, was_existing).
+        """
+        async with self._lock:  # held across lookup + insert; presumably an asyncio lock — confirm in __init__
+            for existing in self.investigation_areas.values():  # linear scan: matches on the slug, not the dict key
+                if existing.area == area:
+                    for kw in (expected_keywords or []):  # None is treated as "nothing to merge"
+                        if kw not in existing.expected_keywords:
+                            existing.expected_keywords.append(kw)  # append keeps first-seen order
+                    for t in (expected_tools or []):
+                        if t not in existing.expected_tools:
+                            existing.expected_tools.append(t)
+                    for hid in (motivating_hypothesis_ids or []):
+                        if hid not in existing.motivating_hypothesis_ids:
+                            existing.motivating_hypothesis_ids.append(hid)
+                    self._auto_save()  # runs even when the merge added nothing new
+                    return existing.id, True  # merged into the existing area
+
+            aid = f"area-{area}"  # id is derived directly from the slug
+            self.investigation_areas[aid] = InvestigationArea(
+                id=aid,
+                area=area,
+                description=description,
+                suggested_agent=suggested_agent,
+                expected_keywords=list(expected_keywords or []),  # copy so the caller's list is not aliased
+                expected_tools=list(expected_tools or []),  # copy, same reason
+                priority=priority,
+                motivating_hypothesis_ids=list(motivating_hypothesis_ids or []),  # copy, same reason
+                created_by=created_by,
+                created_at=datetime.now().isoformat(),  # no tz argument, so a naive local-time stamp
+            )
+            self._auto_save()
+            return aid, False  # newly created
+
    # ---- Asset library -------------------------------------------------------

    async def register_asset(