From 65745d21dc9a88e3cc4e721738f562aa2c2b8cbe Mon Sep 17 00:00:00 2001
From: BattleTag <hychen3637.com>
Date: Thu, 21 May 2026 02:22:05 -1000
Subject: [PATCH] feat(strategist) S4: InvestigationStrategist agent
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

DESIGN_STRATEGIST.md §3. The smallest possible agent — its entire
output per round is one decision: propose 1-3 leads (each citing a
real hypothesis it expects to move) OR declare the investigation
complete with a reason.

Constraint surface:
  mandatory_record_tools = ("propose_lead", "declare_investigation_complete")
  terminal_tools         = ("declare_investigation_complete",)

The agent inherits the BaseAgent forced-retry mechanism: if it returns
without calling either action tool, the orchestrator force-prompts a
RECORD-only retry. Because declare_investigation_complete is terminal,
the tool_call_loop short-circuits the moment the strategist decides
we're done.

_register_graph_tools overrides BaseAgent's default to skip
_register_graph_write_tools entirely — the strategist NEVER writes
phenomena, entities, edges, or hypotheses directly. All graph
mutations come from the workers it dispatches via leads. This keeps
the planning agent's responsibility surface narrow: read the graph,
choose what to do next, that's it.

Prompt walks through the workflow (call graph_overview /
marginal_yield / budget_status / source_coverage first, then take
exactly one terminal action) with decision criteria for propose vs stop.

Registered in agent_factory._AGENT_CLASSES["strategist"].

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 agent_factory.py            |   2 +
 agents/strategist.py        | 134 ++++++++++++++++++++++++++++++++++++
 tests/test_optimizations.py |  40 +++++++++++
 3 files changed, 176 insertions(+)
 create mode 100644 agents/strategist.py

diff --git a/agent_factory.py b/agent_factory.py
index b7a1492..37f86bc 100644
--- a/agent_factory.py
+++ b/agent_factory.py
@@ -33,6 +33,7 @@ def _load_agent_classes() -> None:
     from agents.network import NetworkAgent
     from agents.registry import RegistryAgent
     from agents.report import ReportAgent
+    from agents.strategist import InvestigationStrategist
     from agents.timeline import TimelineAgent
     _AGENT_CLASSES["filesystem"] = FileSystemAgent
     _AGENT_CLASSES["registry"] = RegistryAgent
@@ -44,6 +45,7 @@ def _load_agent_classes() -> None:
     _AGENT_CLASSES["ios_artifact"] = IOSArtifactAgent
     _AGENT_CLASSES["android_artifact"] = AndroidArtifactAgent
     _AGENT_CLASSES["media"] = MediaAgent
+    _AGENT_CLASSES["strategist"] = InvestigationStrategist
 
 
 # Triage agent per (source.type, platform). disk_image is ambiguous on its
diff --git a/agents/strategist.py b/agents/strategist.py
new file mode 100644
index 0000000..d77ae53
--- /dev/null
+++ b/agents/strategist.py
@@ -0,0 +1,134 @@
+"""InvestigationStrategist — the LLM that decides depth vs breadth.
+
+DESIGN_STRATEGIST.md §3.
+
+The strategist does NOT run forensic tools. Its job per round is exactly one
+decision: propose 1-3 leads that would move an active hypothesis, OR declare
+the investigation complete. It reads the graph through four read-only views
+(graph_overview / source_coverage / marginal_yield / budget_status) and
+expresses its decision through two write tools (propose_lead /
+declare_investigation_complete).
+
+This is the smallest possible agent in the system — the entire point is that
+strategy decisions live in one agent so they're auditable and the rest of the
+codebase doesn't carry implicit depth/breadth policy.
+"""
+
+from __future__ import annotations
+
+import logging
+
+from base_agent import BaseAgent
+from evidence_graph import EvidenceGraph
+from llm_client import LLMClient
+from tool_registry import TOOL_CATALOG
+
+logger = logging.getLogger(__name__)
+
+
+class InvestigationStrategist(BaseAgent):
+    name = "strategist"
+    role = (
+        "Investigation strategist. You do not run forensic tools yourself. "
+        "Each round you take ONE decision: propose 1-3 new investigation leads "
+        "that would materially affect an active hypothesis, OR declare the "
+        "investigation complete. Your judgment is grounded in the graph "
+        "(hypotheses, sources, coverage, marginal yield, budget) — never in "
+        "speculation."
+    )
+    # At least one of these must be called every round, otherwise BaseAgent's
+    # forced RECORD retry kicks in and re-prompts the strategist to take a
+    # documented decision.
+    mandatory_record_tools = ("propose_lead", "declare_investigation_complete")
+    # declare_investigation_complete is terminal — calling it short-circuits
+    # the tool loop, which is what we want (strategist returns on "done").
+    terminal_tools = ("declare_investigation_complete",)
+
+    # Strategy tools looked up in TOOL_CATALOG by _register_graph_tools (which
+    # also adds the read-only graph queries). NO graph write tools such as
+    # add_phenomenon / link_to_entity / observe_identity: no direct mutation.
+    _STRATEGY_TOOLS = (
+        "graph_overview",
+        "source_coverage",
+        "marginal_yield",
+        "budget_status",
+        "propose_lead",
+        "declare_investigation_complete",
+    )
+
+    def _register_graph_tools(self) -> None:
+        """Register read-only graph queries + the six _STRATEGY_TOOLS.
+
+        No write tools (add_phenomenon, observe_identity, link_to_entity,
+        add_temporal_edge): every graph mutation must come from a dispatched
+        worker. A name missing from TOOL_CATALOG is logged and skipped.
+        """
+        self._register_graph_read_tools()
+        for tool_name in self._STRATEGY_TOOLS:
+            td = TOOL_CATALOG.get(tool_name)
+            if td is None:
+                logger.warning(
+                    "Strategist could not find tool %s in TOOL_CATALOG — "
+                    "register_all_tools must run before agent instantiation.",
+                    tool_name,
+                )
+                continue
+            self.register_tool(td.name, td.description, td.input_schema, td.executor)
+
+    def _build_system_prompt(self, task: str) -> str:
+        """Return the strategist prompt embedding self.name, self.role and
+        *task*. Replaces the BaseAgent default, whose INVESTIGATE→RECORD→LINK
+        workflow is wrong for a planner agent.
+        """
+        return (
+            f"You are {self.name}, the investigation strategist.\n"
+            f"Role: {self.role}\n\n"
+            f"Your task: {task}\n\n"
+            f"WORKFLOW (do this exactly):\n"
+            f"  1. Call graph_overview FIRST. Look at: which hypotheses are\n"
+            f"     active (conf 0.2-0.8) vs already supported/refuted; which\n"
+            f"     ones have many edges but only 1 distinct_source; which had\n"
+            f"     a recent_flip vs none in two rounds.\n"
+            f"  2. Call marginal_yield to see if the last rounds produced anything.\n"
+            f"  3. Call budget_status to know your runway.\n"
+            f"  4. For each candidate lead direction, call source_coverage on\n"
+            f"     the relevant source to see what's been touched.\n"
+            f"  5. Take exactly ONE of these terminal actions:\n"
+            f"     (a) Call propose_lead 1-3 times for leads that would\n"
+            f"         materially move an active hypothesis. STOP after this.\n"
+            f"     (b) Call declare_investigation_complete with a specific\n"
+            f"         reason. STOP after this.\n"
+            f"\n"
+            f"DECISION CRITERIA — when to propose vs when to stop:\n"
+            f"  PROPOSE when:\n"
+            f"    - A hypothesis is supported only by ONE source — get\n"
+            f"      cross-source corroboration. Same-source repeats are\n"
+            f"      cheap (harmonic damping).\n"
+            f"    - A hypothesis is in the active band (0.2 < conf < 0.8) —\n"
+            f"      it needs the deciding evidence.\n"
+            f"    - A high-value artefact is ✗ on source_coverage AND an\n"
+            f"      active hypothesis depends on the kind of evidence that\n"
+            f"      artefact would produce.\n"
+            f"  STOP (declare_complete) when:\n"
+            f"    - marginal_yield shows zero across 2+ rounds.\n"
+            f"    - budget_status warns ≥90% on tool_calls or rounds.\n"
+            f"    - all active hypotheses are resolved (supported or refuted).\n"
+            f"    - coverage saturation: every ✗ on every source is irrelevant\n"
+            f"      to active hypotheses.\n"
+            f"\n"
+            f"HARD RULES:\n"
+            f"  - You CANNOT call investigation tools (list_directory,\n"
+            f"    sqlite_query, parse_registry_key, extract_file, etc.) — your\n"
+            f"    job is to direct workers, not to investigate yourself.\n"
+            f"  - You CANNOT call write tools (add_phenomenon, observe_identity,\n"
+            f"    link_to_entity, add_hypothesis, add_temporal_edge). All\n"
+            f"    evidence mutations come from the workers you dispatch.\n"
+            f"  - Every propose_lead MUST cite a real hyp-id from\n"
+            f"    graph_overview's table — fabricated ids will be rejected.\n"
+            f"  - Don't propose more than 3 leads in one round. Quality over\n"
+            f"    quantity — a 4th lead almost always means you're not really\n"
+            f"    sure what would move the graph.\n"
+            f"  - Don't re-propose a lead that's already pending. The system\n"
+            f"    deduplicates (motivating_hyp, expected_type, agent, source)\n"
+            f"    so duplicates silently no-op, but they waste your budget."
+        )
diff --git a/tests/test_optimizations.py b/tests/test_optimizations.py
index a7b0f0b..f10592a 100644
--- a/tests/test_optimizations.py
+++ b/tests/test_optimizations.py
@@ -3271,6 +3271,46 @@ class TestInvestigationRound:
         )
         assert "≥ 90%" in bs2  # already over 90% (1 of 1 tool calls used)
 
+    @pytest.mark.asyncio
+    async def test_strategist_agent_registers_correct_toolset(self):
+        """Strategist gets read-only graph queries + the 6 strategy tools;
+        crucially NO graph-write tools (no add_phenomenon, observe_identity,
+        link_to_entity, add_hypothesis, add_temporal_edge, add_lead).
+        """
+        from tool_registry import register_all_tools
+        from agent_factory import AgentFactory
+        from llm_client import LLMClient
+
+        graph = EvidenceGraph()
+        register_all_tools(graph)
+        llm = LLMClient.__new__(LLMClient)
+        factory = AgentFactory(llm, graph)
+        agent = factory.get_or_create_agent("strategist")
+        agent._register_graph_tools()
+
+        registered = set(agent._tools.keys())
+        assert {
+            "graph_overview", "source_coverage", "marginal_yield",
+            "budget_status", "propose_lead", "declare_investigation_complete",
+        } <= registered
+        assert {"list_phenomena", "get_phenomenon", "search_graph"} <= registered
+        forbidden = {
+            "add_phenomenon", "observe_identity", "link_to_entity",
+            "add_hypothesis", "add_temporal_edge", "add_lead",
+        }
+        leaked = registered & forbidden
+        assert not leaked, f"Strategist must not have write tools: {leaked}"
+
+    def test_strategist_terminal_tool_is_declare_complete(self):
+        """InvestigationStrategist declares declare_investigation_complete as
+        its only terminal tool and both decision tools as mandatory record
+        tools (loop short-circuit is verified by an existing LLM-client test).
+        """
+        from agents.strategist import InvestigationStrategist
+        assert InvestigationStrategist.terminal_tools == ("declare_investigation_complete",)
+        assert "propose_lead" in InvestigationStrategist.mandatory_record_tools
+        assert "declare_investigation_complete" in InvestigationStrategist.mandatory_record_tools
+
     @pytest.mark.asyncio
     async def test_propose_lead_validates_hypothesis_id(self):
         """propose_lead must reject leads whose motivating_hypothesis isn't