diff --git a/evidence_graph.py b/evidence_graph.py index 4735b8d..424b97f 100644 --- a/evidence_graph.py +++ b/evidence_graph.py @@ -855,6 +855,9 @@ class EvidenceGraph: + "\n".join( f" - {f['reason']}: {json.dumps(f['fact'], ensure_ascii=False)}" for f in failures + ) + + self._format_recent_invocations( + source_agent, active_task_id or "", ), failures=failures, ) @@ -1612,17 +1615,44 @@ class EvidenceGraph: are normalised via :func:`_normalize_for_grounding` (case-folded, whitespace-collapsed, path-sep unified). - The loose match catches the LLM's routine presentation - normalisations (case-folded hex, slash-flipped paths, collapsed - multi-line labels) without enabling fabrication: a string that - isn't present in ANY form still fails the normalised check. + Auto-rescue (post first full-case run, 2026-05-20): when the cited + invocation_id doesn't exist BUT `fact.value` does appear in exactly + one of this agent/task's real invocations, the fact's + ``invocation_id`` is silently rewritten to that real id. This heals + the LLM's routine "I know which tool I read this from, I just + mis-typed the inv id" failure mode without expanding what content + can land grounded — the value still has to be present in a real + invocation output. Multi-match is NOT auto-rescued: the list + of candidate ids goes back to the LLM so it picks the right one. """ inv_id = fact.get("invocation_id", "") value = fact.get("value", "") + if not isinstance(value, str) or not value: + return False, "fact.value must be a non-empty string" if not inv_id: return False, "missing invocation_id" inv = self.tool_invocations.get(inv_id) if inv is None: + # Look for an auto-rescue candidate: same agent, same task, + # whose output contains `value` (strict or normalised). 
+ candidates = self._find_grounding_candidates(value, agent, task_id) + if len(candidates) == 1: + real_id = candidates[0].id + fact["invocation_id"] = real_id # heal in place + logger.info( + "grounding auto-rescued for agent=%s: bogus %s rewritten to %s " + "(value matches that invocation's output)", + agent, inv_id, real_id, + ) + return True, "ok-auto-rescued" + if len(candidates) > 1: + shown = ", ".join(c.id for c in candidates[:5]) + more = "" if len(candidates) <= 5 else f" (+{len(candidates)-5} more)" + return False, ( + f"invocation_id {inv_id} not found in invocation log. " + f"The value {value!r} appears in {len(candidates)} of your " + f"invocations: {shown}{more} — cite the specific one in retry." + ) return False, f"invocation_id {inv_id} not found in invocation log" if inv.agent != agent: return False, ( @@ -1634,8 +1664,6 @@ class EvidenceGraph: f"invocation {inv_id} was made in a different task scope " f"({inv.task_id}) — cite only invocations from your current task" ) - if not isinstance(value, str) or not value: - return False, "fact.value must be a non-empty string" if value in inv.output: return True, "ok" # Loose fallback: normalised comparison absorbs case / whitespace / @@ -1650,6 +1678,54 @@ class EvidenceGraph: f"instead of `verified_facts`." ) + def _find_grounding_candidates( + self, value: str, agent: str, task_id: str, + ) -> list[ToolInvocation]: + """Return this agent/task's invocations whose output contains `value`. + + Used for grounding auto-rescue (single match) and informative retry + messages (multi-match). Strict substring first, then normalised + fallback — order preserves the same semantics as `validate_fact_grounding`. 
+ """ + norm_value = _normalize_for_grounding(value) + out: list[ToolInvocation] = [] + for inv in self.tool_invocations.values(): + if inv.agent != agent: + continue + if task_id and inv.task_id and inv.task_id != task_id: + continue + if value in inv.output or norm_value in _normalize_for_grounding(inv.output): + out.append(inv) + return out + + def _format_recent_invocations( + self, agent: str, task_id: str, limit: int = 8, + ) -> str: + """Render this agent/task's most recent invocations as a citation menu. + + Appended to GroundingError messages so the LLM has the real ids in + front of it on the retry attempt. Falls back to an empty string when + the agent has no invocations on record yet. + """ + invs = [ + inv for inv in self.tool_invocations.values() + if inv.agent == agent + and (not task_id or not inv.task_id or inv.task_id == task_id) + ] + if not invs: + return "" + invs.sort(key=lambda iv: iv.created_at, reverse=True) + lines = [] + for inv in invs[:limit]: + args_str = json.dumps(inv.args, ensure_ascii=False) + if len(args_str) > 140: + args_str = args_str[:137] + "..." + lines.append(f" - {inv.id} {inv.tool}({args_str})") + return ( + "\nYour recent invocations in this task (cite one of these ids " + "in `invocation_id`):\n" + "\n".join(lines) + ) + # ---- Asset library ------------------------------------------------------- async def register_asset( diff --git a/tests/test_optimizations.py b/tests/test_optimizations.py index bf139ec..2504285 100644 --- a/tests/test_optimizations.py +++ b/tests/test_optimizations.py @@ -1779,6 +1779,103 @@ class TestGroundingGateway: ) assert "not found in invocation log" in str(exc.value) + @pytest.mark.asyncio + async def test_auto_rescue_single_match_rewrites_invocation_id(self, graph): + """Layer A: agent cites a bogus inv id but the fact value is uniquely + present in one of its real invocations → silently heal the citation + and accept the fact. 
+ """ + real_inv = await graph.record_tool_invocation( + tool="sqlite_query", + args={"db": "AddressBook.sqlitedb"}, + output="Hogan | +852 5497 4406 | whoishogan@gmail.com", + ) + bogus_id = "inv-deadbeef" + facts = [{"type": "identifier", "value": "+852 5497 4406", "invocation_id": bogus_id}] + pid, _ = await graph.add_phenomenon( + source_agent="fs", category="identity", + title="phone for Hogan", + verified_facts=facts, + source_tool="sqlite_query", + ) + ph = graph.phenomena[pid] + assert ph.verified_facts[0]["invocation_id"] == real_inv + assert ph.verified_facts[0]["invocation_id"] != bogus_id + + @pytest.mark.asyncio + async def test_auto_rescue_skips_when_value_matches_multiple_invocations(self, graph): + """Layer A safety: ambiguous match (value present in >1 invocation) + is NOT silently rewritten — the LLM gets the candidate list back so + it picks the right id on retry. + """ + inv_a = await graph.record_tool_invocation( + tool="list_directory", args={"dir": "1"}, + output="d/d 33-128-1: secret.txt\nfound on disk", + ) + inv_b = await graph.record_tool_invocation( + tool="list_directory", args={"dir": "2"}, + output="d/d 99-128-1: vault.txt\nfound on disk", + ) + with pytest.raises(GroundingError) as exc: + await graph.add_phenomenon( + source_agent="fs", category="filesystem", title="dup", + verified_facts=[ + {"type": "raw", "value": "found on disk", "invocation_id": "inv-nope"}, + ], + source_tool="list_directory", + ) + msg = str(exc.value) + assert inv_a in msg + assert inv_b in msg + assert "2 of your invocations" in msg + + @pytest.mark.asyncio + async def test_grounding_error_lists_recent_invocations(self, graph): + """Layer B: on rejection, the GroundingError message appends the + agent's recent real invocation ids so the LLM can cite a valid one + on retry instead of fabricating again. 
+ """ + inv_one = await graph.record_tool_invocation( + tool="fls", args={"offset": 614400}, output="some output A", + ) + inv_two = await graph.record_tool_invocation( + tool="icat", args={"inode": "33"}, output="some output B", + ) + with pytest.raises(GroundingError) as exc: + await graph.add_phenomenon( + source_agent="fs", category="filesystem", title="bogus", + verified_facts=[ + {"type": "raw", "value": "totally absent string", + "invocation_id": "inv-bogus"}, + ], + source_tool="fls", + ) + msg = str(exc.value) + assert "Your recent invocations in this task" in msg + assert inv_one in msg + assert inv_two in msg + + @pytest.mark.asyncio + async def test_auto_rescue_respects_agent_scope(self, graph): + """Layer A invariant: rescue candidates must be from the SAME agent. + A value present only in another agent's invocation must NOT trigger + auto-rescue across agents. + """ + graph._current_agent = "registry" + await graph.record_tool_invocation( + tool="parse_registry_key", args={}, output="REG_VALUE_xyz", + ) + graph._current_agent = "fs" + with pytest.raises(GroundingError): + await graph.add_phenomenon( + source_agent="fs", category="filesystem", title="cross-agent leak", + verified_facts=[ + {"type": "raw", "value": "REG_VALUE_xyz", + "invocation_id": "inv-anything"}, + ], + source_tool="parse_registry_key", + ) + @pytest.mark.asyncio async def test_empty_verified_facts_allowed_for_negative_findings(self, graph): # A negative finding ("searched X, found nothing") is permitted —