diff --git a/evidence_graph.py b/evidence_graph.py index 424b97f..a25d420 100644 --- a/evidence_graph.py +++ b/evidence_graph.py @@ -1020,9 +1020,20 @@ class EvidenceGraph: **Idempotency**: if a ``(phenomenon, hypothesis, edge_type)`` edge already exists, this is a no-op — the same agent re-recording the same link (or two agents linking via the orchestrator's batch - judge and a manual override) does not double-count. Independent - evidence — *different* phenomena pointing the same way — still - accumulates fully. + judge and a manual override) does not double-count. + + **Harmonic damping of repeated same-direction evidence** (added + post first full-case run, 2026-05-20): independent evidence — + different phenomena pointing the same way — still accumulates, + but with diminishing returns: the k-th edge of the same + ``(hyp_id, edge_type)`` contributes ``log_lr_base / k``. After N + edges of the same type the cumulative contribution is + ``log_lr_base · H_N`` (harmonic sum, grows as ln N). This + formalises the naive-Bayes breakdown that DESIGN.md §4.5 calls out: + "同一发现被多 agent 重复入图" ("the same finding is added to the graph repeatedly by multiple agents"). Single-edge hypotheses are + unaffected (k=1, damping = 1.0). Counts are kept per edge type, so cross-direction edges (supports + vs contradicts) are damped independently and a strong + contradicting fact still bites against piled-on supports. """ if edge_type not in self.edge_log_lr: raise ValueError( @@ -1045,7 +1056,17 @@ class EvidenceGraph: ): return hyp.confidence - log_lr = self.edge_log_lr[edge_type] + # Harmonic damping rank: count existing edges of the SAME + # edge_type already incident on this hypothesis. The new edge + # becomes the (count+1)-th of its kind, which is its rank. _adj_rev is keyed by + # target so this is O(in-degree(hyp)) without scanning all edges.
+ existing_same_type = sum( + 1 for e in self._adj_rev.get(hyp_id, []) + if e.edge_type == edge_type + ) + rank = existing_same_type + 1 + log_lr_base = self.edge_log_lr[edge_type] + log_lr = log_lr_base / rank old_log_odds = hyp.log_odds old_conf = hyp.confidence new_log_odds = old_log_odds + log_lr @@ -1065,7 +1086,9 @@ class EvidenceGraph: "timestamp": datetime.now().isoformat(), "phenomenon_id": phenomenon_id, "edge_type": edge_type, - "log_lr": log_lr, + "log_lr_base": log_lr_base, + "rank": rank, + "log_lr": round(log_lr, 4), "old_log_odds": round(old_log_odds, 4), "new_log_odds": round(new_log_odds, 4), "old_confidence": round(old_conf, 4), diff --git a/tests/test_optimizations.py b/tests/test_optimizations.py index 33b6d2b..5a58a16 100644 --- a/tests/test_optimizations.py +++ b/tests/test_optimizations.py @@ -2020,8 +2020,9 @@ class TestLogOddsConfidence: # All three orderings must agree exactly. assert confs[0] == pytest.approx(confs[1]) assert confs[1] == pytest.approx(confs[2]) - # And the value should be 1 + 1 − 0.5 = 1.5 → sigmoid ≈ 0.9694 - assert confs[0] == pytest.approx(0.9694, abs=1e-3) + # With harmonic damping: 2 supports → 1.0 + 0.5 = +1.5, 1 weakens + # → −0.5, net log_odds = +1.0 → sigmoid 1/(1+10^-1) ≈ 0.9091. + assert confs[0] == pytest.approx(0.9091, abs=1e-3) @pytest.mark.asyncio async def test_each_edge_type_calibrated(self, graph): @@ -2082,17 +2083,91 @@ class TestLogOddsConfidence: assert sum(1 for e in graph.edges if e.edge_type == "supports") == first_edges @pytest.mark.asyncio - async def test_independent_evidence_accumulates(self, graph): - """Distinct phenomena with same edge_type DO accumulate (independent).""" + async def test_repeated_same_direction_evidence_dampens_harmonically(self, graph): + """Distinct phenomena with same edge_type accumulate WITH harmonic + damping (post 2026-05-20 full-case-run fix). 
The k-th edge of a + given (hyp, edge_type) contributes log_lr_base/k; after 3 supports + edges the total is 1.0 · (1 + 1/2 + 1/3) ≈ 1.833, not 3.0. This + formalises the breakdown of the naive-Bayes independence + assumption when multiple agents pile on the same finding. + """ hid = await graph.add_hypothesis("h", "d") for i in range(3): pid, _ = await graph.add_phenomenon( "fs", "filesystem", f"ph {i}", f"d {i}", source_tool="t", ) await graph.update_hypothesis_confidence(hid, pid, "supports", "") - # 3 × +1.0 = +3.0 log_odds → conf ≈ 0.999 - assert graph.hypotheses[hid].log_odds == pytest.approx(3.0) - assert graph.hypotheses[hid].confidence > 0.99 + # 1.0 · H_3 = 1.0 + 0.5 + 0.3333... = 1.8333 + expected = 1.0 + 0.5 + (1.0 / 3.0) + assert graph.hypotheses[hid].log_odds == pytest.approx(expected, abs=1e-6) + # Still above the 0.8 supported threshold (good — 3 independent + # observations remain strong evidence) but well below the runaway + # confidence the pre-fix code produced. + assert graph.hypotheses[hid].status == "supported" + + @pytest.mark.asyncio + async def test_first_edge_undamped(self, graph): + """The first edge has rank 1, so it is divided by 1 and left undamped.
A hypothesis with a single + edge must contribute the full calibrated log_lr value (otherwise the + whole calibration table would be off by a factor).""" + hid = await graph.add_hypothesis("h", "d") + pid, _ = await graph.add_phenomenon( + "fs", "filesystem", "lone", "d", source_tool="t", + ) + await graph.update_hypothesis_confidence(hid, pid, "direct_evidence", "") + assert graph.hypotheses[hid].log_odds == pytest.approx(2.0) + + @pytest.mark.asyncio + async def test_damping_independent_per_edge_type(self, graph): + """Damping rank is keyed per (hyp, edge_type) — supports' counter + does NOT advance direct_evidence's counter.""" + hid = await graph.add_hypothesis("h", "d") + for i in range(5): + pid, _ = await graph.add_phenomenon( + "fs", "filesystem", f"s{i}", "d", source_tool="t", + ) + await graph.update_hypothesis_confidence(hid, pid, "supports", "") + log_after_supports = graph.hypotheses[hid].log_odds + # H_5 = 1 + 1/2 + 1/3 + 1/4 + 1/5 = 2.2833 + assert log_after_supports == pytest.approx(2.2833, abs=1e-3) + + pid_de, _ = await graph.add_phenomenon( + "fs", "filesystem", "de", "d", source_tool="t", + ) + await graph.update_hypothesis_confidence(hid, pid_de, "direct_evidence", "") + delta = graph.hypotheses[hid].log_odds - log_after_supports + assert delta == pytest.approx(2.0, abs=1e-6) + + @pytest.mark.asyncio + async def test_damping_independent_per_hypothesis(self, graph): + """The rank counter is per (hyp, edge_type), so two hypotheses + each receiving 'supports' edges accumulate independently.""" + h1 = await graph.add_hypothesis("h1", "d") + h2 = await graph.add_hypothesis("h2", "d") + for i in range(3): + pid, _ = await graph.add_phenomenon( + "fs", "filesystem", f"to-h1 {i}", "d", source_tool="t", + ) + await graph.update_hypothesis_confidence(h1, pid, "supports", "") + pid_h2, _ = await graph.add_phenomenon( + "fs", "filesystem", "to-h2", "d", source_tool="t", + ) + await graph.update_hypothesis_confidence(h2, pid_h2, "supports", "") + assert 
graph.hypotheses[h2].log_odds == pytest.approx(1.0) + + @pytest.mark.asyncio + async def test_damping_rank_persists_in_confidence_log(self, graph): + """The rank used for damping must be recorded so the math is + auditable from the persisted graph (no need to recompute).""" + hid = await graph.add_hypothesis("h", "d") + for i in range(2): + pid, _ = await graph.add_phenomenon( + "fs", "filesystem", f"p {i}", "d", source_tool="t", + ) + await graph.update_hypothesis_confidence(hid, pid, "supports", "") + entries = graph.hypotheses[hid].confidence_log + assert entries[0]["rank"] == 1 and entries[0]["log_lr"] == pytest.approx(1.0) + assert entries[1]["rank"] == 2 and entries[1]["log_lr"] == pytest.approx(0.5) @pytest.mark.asyncio async def test_prior_prob_shifts_starting_log_odds(self, graph): # prior 0.9 → log_odds ≈ +0.954