From f04ccd4bc785826512e25ae08e254fd094aedd43 Mon Sep 17 00:00:00 2001 From: BattleTag Date: Thu, 21 May 2026 02:15:08 -1000 Subject: [PATCH] =?UTF-8?q?fix(base=5Fagent):=20forced-retry=20iter=20cap?= =?UTF-8?q?=2010=E2=86=9230=20+=20narrow=20tools=20to=20record+read?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Timeline agent on the 2026-05-20 full run produced 0 phenomena: initial round hit max_iterations=60 cap before recording, forced retry then hit max_iterations=10 cap because every grounding-rejected call burns one iteration in the new gateway. Two changes restore depth without re-introducing the original "agent wanders off and never records" failure: 1. Raise retry cap 10 → 30. With grounding auto-rescue (prev commit) most rejections heal on the first retry, but some still need 2-3 turns; 10 is empirically too tight, 30 leaves headroom. 2. Narrow the retry tool surface to RECORD + graph-write + read-only-graph-query tools. Investigation tools (list_directory, sqlite_query, parse_registry_key) are dropped on retry so the agent can't restart its search loop — the retry is explicitly "record what you already found, then stop". Co-Authored-By: Claude Opus 4.7 (1M context) --- base_agent.py | 19 +++++++++-- tests/test_optimizations.py | 63 +++++++++++++++++++++++++++++++++++++ 2 files changed, 80 insertions(+), 2 deletions(-) diff --git a/base_agent.py b/base_agent.py index 307887d..82c4a54 100644 --- a/base_agent.py +++ b/base_agent.py @@ -228,12 +228,27 @@ class BaseAgent: f"what you already found. Then end." ), }) + # Narrow the retry tool surface so the agent can't wander off + # to investigate again — only RECORD and read-only graph + # query tools survive. Each grounding-rejected call burns one + # iteration, so the cap is 30 (not the original 10): a + # Timeline agent writing ~10 temporal edges with one rejection + # apiece needs ~20 turns under the rewritten gateway. 
+ retry_tool_names = set(registered_mandatory) | { + "list_phenomena", "list_assets", "search_graph", + "add_temporal_edge", "link_to_entity", "add_lead", + "add_hypothesis", "save_report", + } + retry_tools = [ + td for td in self.get_tool_definitions() + if td["name"] in retry_tool_names + ] final_text, _ = await self.llm.tool_call_loop( messages=conversation, - tools=self.get_tool_definitions(), + tools=retry_tools, tool_executor=self._executors, system=system, - max_iterations=10, + max_iterations=30, terminal_tools=self.terminal_tools, ) diff --git a/tests/test_optimizations.py b/tests/test_optimizations.py index 2504285..33b6d2b 100644 --- a/tests/test_optimizations.py +++ b/tests/test_optimizations.py @@ -1332,6 +1332,69 @@ class TestInvestigationAreaDerivation: assert edge_added["n"] == 1 assert agent._record_call_counts["add_temporal_edge"] == 1 + @pytest.mark.asyncio + async def test_forced_retry_uses_higher_cap_and_narrowed_tools(self): + """The forced RECORD retry must (a) get a generous iter cap so that + grounding-rejected retries don't blow the budget, and (b) hand the + LLM a tool surface restricted to RECORD + read-only graph tools so + it can't wander back into investigation. + """ + from unittest.mock import AsyncMock + from base_agent import BaseAgent + + graph = EvidenceGraph() + llm = AsyncMock() + + # Capture per-call kwargs of tool_call_loop so we can assert what + # the retry round received. + call_kwargs: list[dict] = [] + + async def real_add_edge(**kw): + return None + + class TimelineLike(BaseAgent): + mandatory_record_tools = ("add_temporal_edge",) + + agent = TimelineLike(llm, graph) + agent.name = "timeline_like" + agent._register_graph_tools = lambda: None + agent.register_tool("add_temporal_edge", "", {}, real_add_edge) + # An investigation-style tool the retry must NOT expose. 
+ async def real_inv(**kw): return "" + agent.register_tool("list_directory", "", {}, real_inv) + # A read-only graph query — should remain available in retry. + async def real_ro(**kw): return "" + agent.register_tool("list_phenomena", "", {}, real_ro) + + async def fake_tool_call_loop(messages, tools, tool_executor, system, max_iterations=40, terminal_tools=()): + call_kwargs.append({ + "tools": [t["name"] for t in tools], + "max_iterations": max_iterations, + }) + already_retrying = any( + "STOP." in (m.get("content", "") if isinstance(m, dict) else "") + for m in messages + ) + if not already_retrying: + return "no record", list(messages) + await tool_executor["add_temporal_edge"]() + return "recorded.", [] + + llm.tool_call_loop = fake_tool_call_loop + await agent.run("build timeline") + + assert len(call_kwargs) == 2 + first_call, retry_call = call_kwargs + # First call: full tool surface, default iter cap. + assert "list_directory" in first_call["tools"] + # Retry call: investigation tool dropped, mandatory + read-only kept. + assert "list_directory" not in retry_call["tools"] + assert "add_temporal_edge" in retry_call["tools"] + assert "list_phenomena" in retry_call["tools"] + # Iter cap on the retry is now generous — 10 was empirically too tight + # because grounding-rejected calls burn iterations. + assert retry_call["max_iterations"] >= 30 + # ---- terminal_tools: real LLMClient.tool_call_loop short-circuit ----- @pytest.mark.asyncio