feat(strategist) S5: Phase 3 strategist loop in orchestrator

DESIGN_STRATEGIST.md §4. Replace the fixed-round hypothesis-directed
loop with a belief-driven strategist loop that runs the strategist
agent once per round and dispatches the leads it proposes.

New helpers on Orchestrator:
  _budget_exceeded()              hard budget caps (budgets.tool_calls_total,
                                  budgets.wall_clock_minutes_max),
                                  complementing strategist self-throttling.
  _execute_strategist_lead(lead)  dispatch one lead serially; the
                                  next strategist round sees the
                                  cumulative effect of this lead's
                                  graph mutations.
  _phase3_strategist_loop()       main loop. Open round, run strategist,
                                  exit on declare_complete or empty
                                  proposals, otherwise dispatch each
                                  lead, judge new phenomena, close round,
                                  apply yield/budget checks.
  _phase3_legacy_loop()           fallback when strategist.enabled is
                                  false. Identical to the
                                  pre-DESIGN_STRATEGIST behaviour.

The run() entry point branches on strategist_cfg.enabled (default
true) and always follows up with _retry_failed_leads() + Gap
Analysis + mark_remaining_inconclusive() regardless of variant.

Orchestrator.__init__ also wires graph.budgets and
graph.run_start_monotonic from config so the budget_status tool
sees real numbers.

Integration tests use a mock strategist + mock workers to verify
declare_complete, propose_lead -> worker dispatch, zero-yield-streak
hard stop, and budget-cap-stops-the-loop.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
BattleTag
2026-05-21 02:25:04 -10:00
parent 65745d21dc
commit a103c17bdb
2 changed files with 459 additions and 28 deletions

View File

@@ -119,6 +119,11 @@ class Orchestrator:
self._failure_count = 0
self._max_failures = 3
self._start_time = datetime.now()
# Make budgets visible to strategy tools via the graph object. The
# budget_status tool reads graph.budgets / graph.run_start_monotonic
# directly so it does not need a back-reference to the orchestrator.
self.graph.budgets = dict(self.config.get("budgets", {}) or {})
self.graph.run_start_monotonic = time.monotonic()
def _resolve_agent_type(self, agent_type: str) -> str:
    """Translate an agent-type alias into the name registered in AGENT_ALIASES.

    Names that have no alias entry are returned unchanged.
    """
    canonical = AGENT_ALIASES.get(agent_type, agent_type)
    return canonical
@@ -195,6 +200,249 @@ class Orchestrator:
lead.context["retry"] = True
await self._dispatch_leads_parallel(failed)
# ---- Phase 3: strategist loop (DESIGN_STRATEGIST.md §4) ------------------
def _budget_exceeded(self) -> bool:
"""Hard budget enforcement, complementing strategist self-throttling.
Any of these triggers an immediate Phase 3 exit even if the
strategist hasn't called declare_investigation_complete. Each cap
is optional — leave it out of config to make it unbounded.
"""
b = self.graph.budgets or {}
tc_cap = b.get("tool_calls_total")
if tc_cap and len(self.graph.tool_invocations) >= tc_cap:
return True
wc_cap = b.get("wall_clock_minutes_max")
if wc_cap and self.graph.run_start_monotonic is not None:
elapsed_min = (time.monotonic() - self.graph.run_start_monotonic) / 60.0
if elapsed_min >= wc_cap:
return True
return False
async def _execute_strategist_lead(self, lead, round_num: int) -> None:
    """Dispatch one strategist-proposed lead to its target worker.

    Unlike the legacy bulk dispatcher this runs leads serially so each
    worker run reads a graph that includes prior leads' findings — the
    strategist's next round can see the cumulative effect of this round.

    Args:
        lead: The lead to execute. Its ``status`` is updated in place
            ("assigned", then "completed" or "failed"); on failure the
            reason is stored under ``context["failure_reason"]``.
        round_num: Current strategist round, used only in the log line.

    Worker exceptions are logged and recorded on the lead rather than
    re-raised, so one failing lead does not abort the round.
    """
    # The context is read defensively below, but the failure paths write
    # into it. Normalise a missing context once so those writes cannot
    # raise TypeError while we are already handling a failure.
    if lead.context is None:
        lead.context = {}
    context = lead.context

    agent_type = self._resolve_agent_type(lead.target_agent)
    worker = self.factory.get_or_create_agent(agent_type)
    if worker is None:
        logger.warning(
            "No worker registered for lead %s: target_agent=%s",
            lead.id, agent_type,
        )
        lead.status = "failed"
        context["failure_reason"] = f"no worker for agent type '{agent_type}'"
        self.graph._auto_save()
        return

    # Point the graph at the lead's evidence source, when one is named
    # and the case knows about it.
    source_id = context.get("source_id", "")
    if source_id and self.graph.case is not None:
        src = self.graph.case.get_source(source_id)
        if src:
            self.graph.set_active_source(src)

    rationale = context.get("rationale", "")
    worker_task = (
        "Investigate this specific lead from the strategist:\n\n"
        f"REQUEST: {lead.description}\n"
        f"MOTIVATING HYPOTHESIS: {lead.motivating_hypothesis or '(unspecified)'}\n"
        f"EXPECTED EVIDENCE TYPE: {lead.expected_evidence_type or '(unspecified)'}\n"
        f"RATIONALE: {rationale or '(unspecified)'}\n\n"
        "After investigating, record findings via add_phenomenon AND "
        "link relevant phenomena to "
        f"{lead.motivating_hypothesis or 'the motivating hypothesis'} via the "
        "appropriate edge_type. If your investigation produces no relevant "
        "finding, record that as a negative phenomenon so the strategist "
        "can see the gap was probed."
    )
    _log(
        f"Round {round_num} dispatching: {lead.description[:80]}",
        event="dispatch", agent=agent_type, lead=lead.id,
    )

    # Persist the "assigned" state before the worker starts running.
    lead.status = "assigned"
    self.graph._auto_save()
    try:
        await worker.run(worker_task, lead_id=lead.id)
        lead.status = "completed"
    except Exception as e:
        logger.error("Strategist lead %s failed: %s", lead.id, e, exc_info=True)
        lead.status = "failed"
        context["failure_reason"] = str(e)
    finally:
        # Save whichever terminal status was reached.
        self.graph._auto_save()
async def _phase3_strategist_loop(self) -> None:
    """Belief-driven investigation: strategist proposes, workers execute,
    repeat. Replaces the legacy fixed-round investigation loop.

    Each round opens an investigation round on the graph and runs the
    strategist once. The loop then stops if the strategist crashed,
    declared the investigation complete, or proposed no pending leads.
    Otherwise it dispatches this round's leads serially, judges new
    phenomena, closes the round, and applies the zero-yield and
    hard-budget stop checks.

    Config read from ``self.config["strategist"]``:
        max_rounds: upper bound on rounds (default 10).
        hard_stop_marginal_yield_zero_rounds: number of consecutive
            zero-yield rounds that forces a stop (default 3).

    Falls back to ``_phase3_legacy_loop`` when the factory returns no
    "strategist" agent.
    """
    _log("Phase 3: Strategist-Driven Investigation", event="phase")
    strategist_cfg = self.config.get("strategist", {}) or {}
    max_rounds = int(strategist_cfg.get("max_rounds", 10))
    zero_yield_cap = int(strategist_cfg.get("hard_stop_marginal_yield_zero_rounds", 3))

    strategist = self.factory.get_or_create_agent("strategist")
    if strategist is None:
        logger.error(
            "InvestigationStrategist agent not registered — falling back "
            "to legacy Phase 3 loop. Check agent_factory._AGENT_CLASSES."
        )
        await self._phase3_legacy_loop()
        return

    zero_yield_streak = 0
    try:
        for round_num in range(1, max_rounds + 1):
            # Reset per-round flags so a previous round's declare_complete
            # doesn't leak across iterations (defensive — strategist also
            # only sets True, never False).
            self.graph.strategist_complete_requested = False
            self.graph.current_strategist_round = round_num
            rid = await self.graph.start_investigation_round(round_num)
            _log(
                f"Strategist Round {round_num}/{max_rounds}", event="phase",
                round=round_num,
            )
            t0 = time.monotonic()
            try:
                await strategist.run(
                    f"Review the current investigation state and decide the "
                    f"next action. This is round {round_num}/{max_rounds}. "
                    f"Use graph_overview / marginal_yield / budget_status / "
                    f"source_coverage to ground your decision, then call "
                    f"propose_lead 1-3 times OR declare_investigation_complete."
                )
            except Exception as e:
                logger.error("Strategist round %d failed: %s", round_num, e, exc_info=True)
                await self.graph.complete_investigation_round(
                    rid, decision_rationale=f"strategist crashed: {e}",
                )
                break

            # Strategist declared complete → no leads execute, exit loop.
            if self.graph.strategist_complete_requested:
                _log(
                    f"Strategist declared complete at round {round_num}",
                    event="progress", elapsed=time.monotonic() - t0,
                )
                await self.graph.complete_investigation_round(
                    rid, strategist_action="declare_complete",
                    decision_rationale="strategist declare_investigation_complete",
                )
                break

            # Collect this round's leads (proposed_by=strategist + matching round).
            new_leads = [
                lead for lead in self.graph.leads
                if lead.round_number == round_num
                and lead.proposed_by == "strategist"
                and lead.status == "pending"
            ]
            if not new_leads:
                _log(
                    f"Round {round_num}: strategist proposed no new leads — exiting loop",
                    event="progress", elapsed=time.monotonic() - t0,
                )
                await self.graph.complete_investigation_round(
                    rid, strategist_action="no_leads",
                    decision_rationale="strategist proposed no new leads",
                )
                break

            # Dispatch each lead to its worker, one at a time.
            for lead in new_leads:
                await self._execute_strategist_lead(lead, round_num)

            # After workers run, judge any new phenomena against existing
            # hypotheses (so confidence updates happen before the next round
            # of strategist reasoning).
            if self.graph.phenomena and self.graph.hypotheses:
                await self._judge_new_phenomena()

            closed = await self.graph.complete_investigation_round(
                rid, strategist_action="propose_leads",
                leads_executed=[lead.id for lead in new_leads],
            )

            # Show round outcome.
            for h in self.graph.hypotheses.values():
                _log(f" {h.summary()}", event="hypothesis")
            yield_note = (
                f" (yield: +{closed.new_phenomena_count}ph, "
                f"+{closed.new_edges_count}edges, {closed.status_flips}flips)"
            )
            _log(
                _progress_summary(self.graph) + yield_note,
                event="progress", elapsed=time.monotonic() - t0,
            )

            # Marginal-yield hard stop. Distinct from strategist self-throttle:
            # if the strategist insists on continuing through repeated dry
            # rounds, force-stop. This protects against an over-eager
            # strategist + a confused worker that produces no edges.
            yield_total = (
                closed.new_phenomena_count
                + closed.new_edges_count
                + closed.status_flips
            )
            if yield_total == 0:
                zero_yield_streak += 1
                if zero_yield_streak >= zero_yield_cap:
                    _log(
                        f"Hard stop: {zero_yield_streak} consecutive "
                        f"zero-yield rounds (cap {zero_yield_cap})",
                        event="progress",
                    )
                    break
            else:
                zero_yield_streak = 0

            if self._budget_exceeded():
                _log(
                    f"Budget exhausted after round {round_num} — exiting Phase 3",
                    event="progress",
                )
                break
        else:
            # Reached only when the loop ran all rounds without a break.
            _log(
                f"Strategist max_rounds={max_rounds} reached", event="progress",
            )
    finally:
        # Always reset the round counter on exit so subsequent runs don't
        # inherit the last value. Doing it in ``finally`` also covers an
        # exception escaping from a graph call or the judge step.
        self.graph.current_strategist_round = 0
async def _phase3_legacy_loop(self) -> None:
    """Run the legacy fixed-round Phase 3 investigation.

    Used when config sets ``strategist.enabled: false`` or when no
    strategist agent is registered. Runs at most
    ``max_investigation_rounds`` rounds (default 5). Each round generates
    hypothesis-derived leads, dispatches the pending ones in parallel and
    judges the resulting phenomena. It stops early once all hypotheses
    have converged or no leads are pending.
    """
    round_limit = self.config.get("max_investigation_rounds", 5)
    for round_num in range(round_limit):
        _log(f"Phase 3: Investigation Round {round_num}", event="phase")
        round_started = time.monotonic()

        if self.graph.hypotheses_converged():
            _log("All hypotheses converged — stopping", event="progress")
            break

        await self._generate_hypothesis_leads()
        queued = await self.graph.get_pending_leads()
        if not queued:
            _log("No pending leads — round complete", event="progress")
            break

        await self._dispatch_leads_parallel(queued)
        await self._judge_new_phenomena()

        # Report where each hypothesis stands after this round.
        for hypothesis in self.graph.hypotheses.values():
            _log(f" {hypothesis.summary()}", event="hypothesis")
        _log(
            _progress_summary(self.graph),
            event="progress",
            elapsed=time.monotonic() - round_started,
        )
# ---- Hypothesis generation -----------------------------------------------
async def _generate_hypotheses_manual(self, hypotheses_config: list[dict]) -> None:
@@ -881,39 +1129,26 @@ class Orchestrator:
event="progress", elapsed=time.monotonic() - t0,
)
# Phase 3: Hypothesis-directed investigation (iterative)
# Phase 3: Strategist-driven investigation (DESIGN_STRATEGIST.md)
if resume_phase <= 3:
max_rounds = self.config.get("max_investigation_rounds", 5)
for round_num in range(max_rounds):
_log(f"Phase 3: Investigation Round {round_num}", event="phase")
t0 = time.monotonic()
strategist_cfg = self.config.get("strategist", {}) or {}
strategist_enabled = strategist_cfg.get("enabled", True)
if strategist_enabled:
await self._phase3_strategist_loop()
else:
# Legacy fallback — keep the old hypothesis-directed
# iterative loop available for runs that explicitly
# disable the strategist (debugging, regression
# comparison, or environments without the strategist
# agent registered).
await self._phase3_legacy_loop()
if self.graph.hypotheses_converged():
_log("All hypotheses converged — stopping", event="progress")
break
await self._generate_hypothesis_leads()
pending = await self.graph.get_pending_leads()
if not pending:
_log("No pending leads — round complete", event="progress")
break
await self._dispatch_leads_parallel(pending)
await self._judge_new_phenomena()
# Show hypothesis status update
for h in self.graph.hypotheses.values():
_log(f" {h.summary()}", event="hypothesis")
_log(_progress_summary(self.graph), event="progress", elapsed=time.monotonic() - t0)
# Retry failed leads
# Retry failed leads + Gap Analysis run regardless of which
# Phase 3 variant was used — they operate on the leads/
# hypothesis graph the strategist loop leaves behind.
await self._retry_failed_leads()
# Gap analysis
_log("Phase 3: Gap Analysis", event="phase")
await self._run_gap_analysis()
self.graph.mark_remaining_inconclusive()
# Phase 4: Timeline construction

View File

@@ -3407,6 +3407,202 @@ class TestInvestigationRound:
assert "not in" in result
assert graph.strategist_complete_requested is False
@pytest.mark.asyncio
async def test_strategist_loop_exits_on_declare_complete(self):
    """Mock strategist that declares complete in round 1 — orchestrator
    must exit the Phase 3 loop without dispatching any worker."""
    from unittest.mock import AsyncMock
    from orchestrator import Orchestrator

    graph = EvidenceGraph()
    llm = AsyncMock()
    worker_runs: list[str] = []

    class FakeStrategist:
        name = "strategist"

        async def run(self, task, lead_id=None):
            graph.strategist_complete_requested = True
            return "complete"

    class FakeWorker:
        # Records every dispatch, so the "no worker ran" assertion below
        # can fail if the loop wrongly dispatches after declare_complete.
        name = "filesystem"

        async def run(self, task, lead_id=None):
            worker_runs.append(lead_id)
            return "unexpected"

    class FakeFactory:
        def __init__(self):
            self._strategist = FakeStrategist()
            self._worker = FakeWorker()

        def get_or_create_agent(self, name):
            return self._strategist if name == "strategist" else self._worker

    orch = Orchestrator(llm, graph, FakeFactory(), config={
        "strategist": {"enabled": True, "max_rounds": 5},
    })
    await orch._phase3_strategist_loop()

    assert len(graph.investigation_rounds) == 1
    r = graph.investigation_rounds[0]
    assert r.strategist_action == "declare_complete"
    assert r.completed_at != ""
    assert worker_runs == []
@pytest.mark.asyncio
async def test_strategist_loop_dispatches_lead_then_completes(self):
    """Round 1 proposes a single lead, round 2 declares complete.

    The loop must hand the lead to its worker exactly once and then exit
    cleanly with two recorded rounds.
    """
    from unittest.mock import AsyncMock
    from orchestrator import Orchestrator
    from case import Case, EvidenceSource

    graph = EvidenceGraph()
    source = EvidenceSource(
        id="src-A", label="A", type="disk_image",
        access_mode="image", path="/tmp/x",
    )
    graph.case = Case(case_id="c", name="n", sources=[source])
    graph.set_active_source(source)
    hypothesis_id = await graph.add_hypothesis("h", "d")
    llm = AsyncMock()
    dispatched: list[tuple[str, str]] = []

    class FakeStrategist:
        name = "strategist"

        def __init__(self):
            self.round = 0

        async def run(self, task, lead_id=None):
            self.round += 1
            if self.round != 1:
                # Every round after the first: ask the loop to stop.
                graph.strategist_complete_requested = True
                return "ok"
            await graph.add_lead(
                target_agent="filesystem",
                description="probe X",
                proposed_by="strategist",
                motivating_hypothesis=hypothesis_id,
                expected_evidence_type="supports",
                round_number=graph.current_strategist_round,
            )
            return "ok"

    class FakeWorker:
        name = "filesystem"

        async def run(self, task, lead_id=None):
            dispatched.append((self.name, lead_id))
            return "did the thing"

    class FakeFactory:
        def __init__(self):
            self._agents = {"strategist": FakeStrategist()}
            self._worker = FakeWorker()

        def get_or_create_agent(self, name):
            # Any non-strategist name resolves to the single fake worker.
            return self._agents.get(name, self._worker)

    config = {
        "strategist": {
            "enabled": True,
            "max_rounds": 5,
            "hard_stop_marginal_yield_zero_rounds": 99,
        },
    }
    orch = Orchestrator(llm, graph, FakeFactory(), config=config)
    await orch._phase3_strategist_loop()

    rounds = graph.investigation_rounds
    assert len(rounds) == 2
    assert rounds[0].strategist_action == "propose_leads"
    assert rounds[1].strategist_action == "declare_complete"

    assert len(dispatched) == 1
    assert dispatched[0][0] == "filesystem"

    strategist_leads = [
        lead for lead in graph.leads if lead.proposed_by == "strategist"
    ]
    assert len(strategist_leads) == 1
    assert strategist_leads[0].status == "completed"
@pytest.mark.asyncio
async def test_strategist_loop_hard_stop_on_zero_yield(self):
    """A strategist that keeps proposing leads while every round yields
    nothing must be force-stopped once the zero-yield cap is reached."""
    from unittest.mock import AsyncMock
    from orchestrator import Orchestrator

    graph = EvidenceGraph()
    llm = AsyncMock()

    class FakeStrategist:
        name = "strategist"

        async def run(self, task, lead_id=None):
            # Always propose one more lead; never declare complete.
            known = list(graph.hypotheses or ())
            motivating = known[0] if known else None
            await graph.add_lead(
                target_agent="filesystem", description="probe",
                proposed_by="strategist",
                motivating_hypothesis=motivating or "",
                expected_evidence_type="supports",
                round_number=graph.current_strategist_round,
            )

    class FakeWorker:
        name = "filesystem"

        async def run(self, task, lead_id=None):
            # Produces no phenomena or edges, so each round is zero-yield.
            return ""

    class FakeFactory:
        def __init__(self):
            self._strategist = FakeStrategist()
            self._worker = FakeWorker()

        def get_or_create_agent(self, name):
            if name == "strategist":
                return self._strategist
            return self._worker

    await graph.add_hypothesis("h", "d")
    config = {
        "strategist": {
            "enabled": True,
            "max_rounds": 20,
            "hard_stop_marginal_yield_zero_rounds": 2,
        },
    }
    orch = Orchestrator(llm, graph, FakeFactory(), config=config)
    await orch._phase3_strategist_loop()

    # Cap of 2 zero-yield rounds means exactly two rounds were recorded.
    assert len(graph.investigation_rounds) == 2
@pytest.mark.asyncio
async def test_strategist_loop_budget_exhaustion_stops_loop(self):
    """The hard cap on tool_calls_total ends the loop even though the
    strategist keeps proposing leads."""
    from unittest.mock import AsyncMock
    from orchestrator import Orchestrator

    graph = EvidenceGraph()
    llm = AsyncMock()

    # Seed the invocation log so the cap of 2 is already reached before
    # the first round runs.
    for seeded_output in ("x", "y"):
        await graph.record_tool_invocation(
            tool="probe", args={}, output=seeded_output,
        )

    class FakeStrategist:
        name = "strategist"

        async def run(self, task, lead_id=None):
            known = list(graph.hypotheses or ())
            motivating = known[0] if known else ""
            await graph.add_lead(
                target_agent="filesystem", description="x",
                proposed_by="strategist",
                motivating_hypothesis=motivating,
                expected_evidence_type="supports",
                round_number=graph.current_strategist_round,
            )

    class FakeWorker:
        name = "filesystem"

        async def run(self, task, lead_id=None):
            await graph.record_tool_invocation(
                tool="probe", args={}, output="z",
            )

    class FakeFactory:
        def __init__(self):
            self._strategist = FakeStrategist()
            self._worker = FakeWorker()

        def get_or_create_agent(self, name):
            if name == "strategist":
                return self._strategist
            return self._worker

    await graph.add_hypothesis("h", "d")
    config = {
        "strategist": {
            "enabled": True,
            "max_rounds": 99,
            "hard_stop_marginal_yield_zero_rounds": 99,
        },
        "budgets": {"tool_calls_total": 2},
    }
    orch = Orchestrator(llm, graph, FakeFactory(), config=config)
    await orch._phase3_strategist_loop()

    # The budget is checked after a round closes, so exactly one round runs.
    assert len(graph.investigation_rounds) == 1
@pytest.mark.asyncio
async def test_marginal_yield_after_two_rounds(self):
"""Verify marginal_yield captures phenomena/edge/status deltas."""