From 65745d21dc9a88e3cc4e721738f562aa2c2b8cbe Mon Sep 17 00:00:00 2001 From: BattleTag Date: Thu, 21 May 2026 02:22:05 -1000 Subject: [PATCH] feat(strategist) S4: InvestigationStrategist agent MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit DESIGN_STRATEGIST.md §3. The smallest possible agent — its entire output per round is one decision: propose 1-3 leads (each citing a real hypothesis it expects to move) OR declare the investigation complete with a reason. Constraint surface: mandatory_record_tools = ("propose_lead", "declare_investigation_complete") terminal_tools = ("declare_investigation_complete",) The agent inherits the BaseAgent forced-retry mechanism: if it returns without calling either action tool, the orchestrator force-prompts a RECORD-only retry. declare_complete being terminal means the tool_call_loop short-circuits the moment the strategist decides we're done. _register_graph_tools overrides BaseAgent's default to skip _register_graph_write_tools entirely — the strategist NEVER writes phenomena, entities, edges, or hypotheses directly. All graph mutations come from the workers it dispatches via leads. This keeps the planning agent's responsibility surface narrow: read the graph, choose what to do next, that's it. Prompt walks through the workflow (call graph_overview / marginal_yield / budget_status / source_coverage first, then take exactly one terminal action) with decision criteria for propose vs stop. Registered in agent_factory._AGENT_CLASSES["strategist"]. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- agent_factory.py | 2 + agents/strategist.py | 134 ++++++++++++++++++++++++++++++++++++ tests/test_optimizations.py | 40 +++++++++++ 3 files changed, 176 insertions(+) create mode 100644 agents/strategist.py diff --git a/agent_factory.py b/agent_factory.py index b7a1492..37f86bc 100644 --- a/agent_factory.py +++ b/agent_factory.py @@ -33,6 +33,7 @@ def _load_agent_classes() -> None: from agents.network import NetworkAgent from agents.registry import RegistryAgent from agents.report import ReportAgent + from agents.strategist import InvestigationStrategist from agents.timeline import TimelineAgent _AGENT_CLASSES["filesystem"] = FileSystemAgent _AGENT_CLASSES["registry"] = RegistryAgent @@ -44,6 +45,7 @@ def _load_agent_classes() -> None: _AGENT_CLASSES["ios_artifact"] = IOSArtifactAgent _AGENT_CLASSES["android_artifact"] = AndroidArtifactAgent _AGENT_CLASSES["media"] = MediaAgent + _AGENT_CLASSES["strategist"] = InvestigationStrategist # Triage agent per (source.type, platform). disk_image is ambiguous on its diff --git a/agents/strategist.py b/agents/strategist.py new file mode 100644 index 0000000..d77ae53 --- /dev/null +++ b/agents/strategist.py @@ -0,0 +1,134 @@ +"""InvestigationStrategist — the LLM that decides depth vs breadth. + +DESIGN_STRATEGIST.md §3. + +The strategist does NOT run forensic tools. Its job per round is exactly one +decision: propose 1-3 leads that would move an active hypothesis, OR declare +the investigation complete. It reads the graph through four read-only views +(graph_overview / source_coverage / marginal_yield / budget_status) and +expresses its decision through two write tools (propose_lead / +declare_investigation_complete). + +This is the smallest possible agent in the system — the entire point is that +strategy decisions live in one agent so they're auditable and the rest of the +codebase doesn't carry implicit depth/breadth policy. 
+""" + +from __future__ import annotations + +import logging + +from base_agent import BaseAgent +from evidence_graph import EvidenceGraph +from llm_client import LLMClient +from tool_registry import TOOL_CATALOG + +logger = logging.getLogger(__name__) + + +class InvestigationStrategist(BaseAgent): + name = "strategist" + role = ( + "Investigation strategist. You do not run forensic tools yourself. " + "Each round you take ONE decision: propose 1-3 new investigation leads " + "that would materially affect an active hypothesis, OR declare the " + "investigation complete. Your judgment is grounded in the graph " + "(hypotheses, sources, coverage, marginal yield, budget) — never in " + "speculation." + ) + # At least one of these must be called every round, otherwise BaseAgent's + # forced RECORD retry kicks in and re-prompts the strategist to take a + # documented decision. + mandatory_record_tools = ("propose_lead", "declare_investigation_complete") + # declare_complete is terminal — calling it short-circuits the tool loop, + # which is what we want (strategist returns immediately on "done"). + terminal_tools = ("declare_investigation_complete",) + + # Strategist-specific tools, plus the read-only graph queries inherited + # from BaseAgent. NO graph write tools (no add_phenomenon / link_to_entity + # / observe_identity); the strategist must NOT mutate evidence directly. + _STRATEGY_TOOLS = ( + "graph_overview", + "source_coverage", + "marginal_yield", + "budget_status", + "propose_lead", + "declare_investigation_complete", + ) + + def _register_graph_tools(self) -> None: + """Strategist gets read-only graph queries + the six strategy tools. + + It does NOT get write tools (no add_phenomenon, observe_identity, + link_to_entity, add_temporal_edge). Every graph mutation must come + from a dispatched worker, not from the planner. 
+ """ + self._register_graph_read_tools() + for tool_name in self._STRATEGY_TOOLS: + td = TOOL_CATALOG.get(tool_name) + if td is None: + logger.warning( + "Strategist could not find tool %s in TOOL_CATALOG — " + "register_all_tools must run before agent instantiation.", + tool_name, + ) + continue + self.register_tool(td.name, td.description, td.input_schema, td.executor) + + def _build_system_prompt(self, task: str) -> str: + """Strategist-specific prompt. Replaces the BaseAgent default which + walks an INVESTIGATE→RECORD→LINK workflow that is wrong for a + planner agent. + """ + return ( + f"You are {self.name}, the investigation strategist.\n" + f"Role: {self.role}\n\n" + f"Your task: {task}\n\n" + f"WORKFLOW (do this exactly):\n" + f" 1. Call graph_overview FIRST. Look at: which hypotheses are\n" + f" active (conf 0.2-0.8) vs already supported/refuted; which\n" + f" ones have many edges but only 1 distinct_source; which had\n" + f" a recent_flip vs none in two rounds.\n" + f" 2. Call marginal_yield to see if the last rounds produced anything.\n" + f" 3. Call budget_status to know your runway.\n" + f" 4. For each candidate lead direction, call source_coverage on\n" + f" the relevant source to see what's been touched.\n" + f" 5. Take exactly ONE of these terminal actions:\n" + f" (a) Call propose_lead 1-3 times for leads that would\n" + f" materially move an active hypothesis. STOP after this.\n" + f" (b) Call declare_investigation_complete with a specific\n" + f" reason. STOP after this.\n" + f"\n" + f"DECISION CRITERIA — when to propose vs when to stop:\n" + f" PROPOSE when:\n" + f" - A hypothesis is supported only by ONE source — get\n" + f" cross-source corroboration. 
Same-source repeats are\n" + f" cheap (harmonic damping).\n" + f" - A hypothesis is in the active band (0.2 < conf < 0.8) —\n" + f" it needs the deciding evidence.\n" + f" - A high-value artefact is ✗ on source_coverage AND an\n" + f" active hypothesis depends on the kind of evidence that\n" + f" artefact would produce.\n" + f" STOP (declare_complete) when:\n" + f" - marginal_yield shows zero across 2+ rounds.\n" + f" - budget_status warns ≥90% on tool_calls or rounds.\n" + f" - all active hypotheses are resolved (supported or refuted).\n" + f" - coverage saturation: every ✗ on every source is irrelevant\n" + f" to active hypotheses.\n" + f"\n" + f"HARD RULES:\n" + f" - You CANNOT call investigation tools (list_directory,\n" + f" sqlite_query, parse_registry_key, extract_file, etc.) — your\n" + f" job is to direct workers, not to investigate yourself.\n" + f" - You CANNOT call write tools (add_phenomenon, observe_identity,\n" + f" link_to_entity, add_hypothesis, add_temporal_edge). All\n" + f" evidence mutations come from the workers you dispatch.\n" + f" - Every propose_lead MUST cite a real hyp-id from\n" + f" graph_overview's table — fabricated ids will be rejected.\n" + f" - Don't propose more than 3 leads in one round. Quality over\n" + f" quantity — a 4th lead almost always means you're not really\n" + f" sure what would move the graph.\n" + f" - Don't re-propose a lead that's already pending. The system\n" + f" deduplicates (motivating_hyp, expected_type, agent, source)\n" + f" so duplicates silently no-op, but they waste your budget." 
+ ) diff --git a/tests/test_optimizations.py b/tests/test_optimizations.py index a7b0f0b..f10592a 100644 --- a/tests/test_optimizations.py +++ b/tests/test_optimizations.py @@ -3271,6 +3271,46 @@ class TestInvestigationRound: ) assert "≥ 90%" in bs2 # already over 90% (1 of 1 tool calls used) + @pytest.mark.asyncio + async def test_strategist_agent_registers_correct_toolset(self): + """Strategist gets read-only graph queries + the 6 strategy tools; + crucially NO graph-write tools (no add_phenomenon, observe_identity, + link_to_entity, add_hypothesis, add_temporal_edge). + """ + from tool_registry import register_all_tools + from agent_factory import AgentFactory + from llm_client import LLMClient + + graph = EvidenceGraph() + register_all_tools(graph) + llm = LLMClient.__new__(LLMClient) + factory = AgentFactory(llm, graph) + agent = factory.get_or_create_agent("strategist") + agent._register_graph_tools() + + registered = set(agent._tools.keys()) + assert { + "graph_overview", "source_coverage", "marginal_yield", + "budget_status", "propose_lead", "declare_investigation_complete", + } <= registered + assert {"list_phenomena", "get_phenomenon", "search_graph"} <= registered + forbidden = { + "add_phenomenon", "observe_identity", "link_to_entity", + "add_hypothesis", "add_temporal_edge", "add_lead", + } + leaked = registered & forbidden + assert not leaked, f"Strategist must not have write tools: {leaked}" + + def test_strategist_terminal_tool_is_declare_complete(self): + """The strategist class declares declare_investigation_complete as + its terminal tool — the tool_call_loop must short-circuit on that + call (verified at the LLM client level by an existing test). 
+ """ + from agents.strategist import InvestigationStrategist + assert InvestigationStrategist.terminal_tools == ("declare_investigation_complete",) + assert "propose_lead" in InvestigationStrategist.mandatory_record_tools + assert "declare_investigation_complete" in InvestigationStrategist.mandatory_record_tools + @pytest.mark.asyncio async def test_propose_lead_validates_hypothesis_id(self): """propose_lead must reject leads whose motivating_hypothesis isn't