fix(graph): harmonic damping for repeated same-edge_type evidence

First full-case run (runs/2026-05-20T20-15-04/) produced hypotheses with log_odds +31 (8 direct_evidence + 15 supports). That is the naive-Bayes independence assumption breaking down: 15 different phenomena all "supporting" the same hypothesis from one source are not 15 independent pieces of evidence; they are highly correlated. DESIGN.md §4.5 last bullet flagged this as a "未实施旋钮" ("unimplemented knob") — this commit implements it. Rule: the k-th edge of a given (hyp_id, edge_type) contributes log_lr_base / k instead of log_lr_base. The cumulative contribution after N such edges is log_lr_base · H_N, where H_N is the N-th harmonic number; it grows only logarithmically (H_N ≈ ln N + 0.577, and H_N ≤ 1 + ln N). Single-edge hypotheses are unaffected (k=1 → /1 → no change). Replaying the 2026-05-20 graph's 108 edges under the new rule pulls the top hypothesis from +31.0 (8 × 2.0 + 15 × 1.0) → +8.75 (2.0 · H_8 + 1.0 · H_15); the smallest active hypothesis from +4.0 → +2.08. Also adds rank + log_lr_base to confidence_log entries so the math is auditable from the persisted graph. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-21 02:16:37 -10:00
parent f04ccd4bc7
commit 8020c24776
2 changed files with 110 additions and 12 deletions
--- a/evidence_graph.py
+++ b/evidence_graph.py
@@ -1020,9 +1020,20 @@ class EvidenceGraph:
        **Idempotency**: if a ``(phenomenon, hypothesis, edge_type)`` edge
        already exists, this is a no-op — the same agent re-recording the
        same link (or two agents linking via the orchestrator's batch
-        judge and a manual override) does not double-count. Independent
-        evidence — *different* phenomena pointing the same way — still
-        accumulates fully.
+        judge and a manual override) does not double-count.
+
+        **Harmonic damping of repeated same-direction evidence** (added
+        post first full-case run, 2026-05-20): independent evidence —
+        different phenomena pointing the same way — still accumulates,
+        but with diminishing returns: the k-th edge of the same
+        ``(hyp_id, edge_type)`` contributes ``log_lr_base / k``. After N
+        same-direction edges the cumulative contribution is
+        ``log_lr_base · H_N`` (harmonic sum, grows as ln N). This
+        formalises the naive-Bayes-breakdown DESIGN.md §4.5 calls out:
+        "同一发现被多 agent 重复入图". Single-edge hypotheses are
+        unaffected (k=1, damping = 1.0). Cross-direction edges (supports
+        vs contradicts) keep their own independent counts so a strong
+        contradicting fact still bites against piled-on supports.
        """
        if edge_type not in self.edge_log_lr:
            raise ValueError(
@@ -1045,7 +1056,17 @@ class EvidenceGraph:
                ):
                    return hyp.confidence

-            log_lr = self.edge_log_lr[edge_type]
+            # Harmonic damping rank: count existing edges of the SAME
+            # edge_type already incident on this hypothesis. The new edge
+            # becomes the (rank+1)-th of its kind. _adj_rev is keyed by
+            # target so this is O(in-degree(hyp)) without scanning all edges.
+            existing_same_type = sum(
+                1 for e in self._adj_rev.get(hyp_id, [])
+                if e.edge_type == edge_type
+            )
+            rank = existing_same_type + 1
+            log_lr_base = self.edge_log_lr[edge_type]
+            log_lr = log_lr_base / rank
            old_log_odds = hyp.log_odds
            old_conf = hyp.confidence
            new_log_odds = old_log_odds + log_lr
@@ -1065,7 +1086,9 @@ class EvidenceGraph:
                "timestamp": datetime.now().isoformat(),
                "phenomenon_id": phenomenon_id,
                "edge_type": edge_type,
-                "log_lr": log_lr,
+                "log_lr_base": log_lr_base,
+                "rank": rank,
+                "log_lr": round(log_lr, 4),
                "old_log_odds": round(old_log_odds, 4),
                "new_log_odds": round(new_log_odds, 4),
                "old_confidence": round(old_conf, 4),
--- a/tests/test_optimizations.py
+++ b/tests/test_optimizations.py
@@ -2020,8 +2020,9 @@ class TestLogOddsConfidence:
        # All three orderings must agree exactly.
        assert confs[0] == pytest.approx(confs[1])
        assert confs[1] == pytest.approx(confs[2])
-        # And the value should be 1 + 1 − 0.5 = 1.5 → sigmoid ≈ 0.9694
-        assert confs[0] == pytest.approx(0.9694, abs=1e-3)
+        # With harmonic damping: 2 supports → 1.0 + 0.5 = +1.5, 1 weakens
+        # → −0.5, net log_odds = +1.0 → sigmoid 1/(1+10^-1) ≈ 0.9091.
+        assert confs[0] == pytest.approx(0.9091, abs=1e-3)

    @pytest.mark.asyncio
    async def test_each_edge_type_calibrated(self, graph):
@@ -2082,17 +2083,91 @@ class TestLogOddsConfidence:
        assert sum(1 for e in graph.edges if e.edge_type == "supports") == first_edges

    @pytest.mark.asyncio
-    async def test_independent_evidence_accumulates(self, graph):
-        """Distinct phenomena with same edge_type DO accumulate (independent)."""
+    async def test_repeated_same_direction_evidence_dampens_harmonically(self, graph):
+        """Distinct phenomena with same edge_type accumulate WITH harmonic
+        damping (post 2026-05-20 full-case-run fix). The k-th edge of a
+        given (hyp, edge_type) contributes log_lr_base/k; after 3 supports
+        edges the total is 1.0 · (1 + 1/2 + 1/3) ≈ 1.833, not 3.0. This
+        formalises the breakdown of the naive-Bayes independence
+        assumption when multiple agents pile on the same finding.
+        """
        hid = await graph.add_hypothesis("h", "d")
        for i in range(3):
            pid, _ = await graph.add_phenomenon(
                "fs", "filesystem", f"ph {i}", f"d {i}", source_tool="t",
            )
            await graph.update_hypothesis_confidence(hid, pid, "supports", "")
-        # 3 × +1.0 = +3.0 log_odds → conf ≈ 0.999
-        assert graph.hypotheses[hid].log_odds == pytest.approx(3.0)
-        assert graph.hypotheses[hid].confidence > 0.99
+        # 1.0 · H_3 = 1.0 + 0.5 + 0.3333... = 1.8333
+        expected = 1.0 + 0.5 + (1.0 / 3.0)
+        assert graph.hypotheses[hid].log_odds == pytest.approx(expected, abs=1e-6)
+        # Still above the 0.8 supported threshold (good — 3 independent
+        # observations remain strong evidence) but well below the runaway
+        # confidence pre-fix produced.
+        assert graph.hypotheses[hid].status == "supported"
+
+    @pytest.mark.asyncio
+    async def test_first_edge_undamped(self, graph):
+        """Damping is rank-1 → /1 → no change. A hypothesis with a single
+        edge must contribute the full calibrated log_lr value (otherwise the
+        whole calibration table would be off by a factor)."""
+        hid = await graph.add_hypothesis("h", "d")
+        pid, _ = await graph.add_phenomenon(
+            "fs", "filesystem", "lone", "d", source_tool="t",
+        )
+        await graph.update_hypothesis_confidence(hid, pid, "direct_evidence", "")
+        assert graph.hypotheses[hid].log_odds == pytest.approx(2.0)
+
+    @pytest.mark.asyncio
+    async def test_damping_independent_per_edge_type(self, graph):
+        """Damping rank is keyed per (hyp, edge_type) — supports' counter
+        does NOT advance direct_evidence's counter."""
+        hid = await graph.add_hypothesis("h", "d")
+        for i in range(5):
+            pid, _ = await graph.add_phenomenon(
+                "fs", "filesystem", f"s{i}", "d", source_tool="t",
+            )
+            await graph.update_hypothesis_confidence(hid, pid, "supports", "")
+        log_after_supports = graph.hypotheses[hid].log_odds
+        # H_5 = 1 + 1/2 + 1/3 + 1/4 + 1/5 = 2.2833
+        assert log_after_supports == pytest.approx(2.2833, abs=1e-3)
+
+        pid_de, _ = await graph.add_phenomenon(
+            "fs", "filesystem", "de", "d", source_tool="t",
+        )
+        await graph.update_hypothesis_confidence(hid, pid_de, "direct_evidence", "")
+        delta = graph.hypotheses[hid].log_odds - log_after_supports
+        assert delta == pytest.approx(2.0, abs=1e-6)
+
+    @pytest.mark.asyncio
+    async def test_damping_independent_per_hypothesis(self, graph):
+        """The rank counter is per (hyp, edge_type), so two hypotheses
+        each receiving 'supports' edges accumulate independently."""
+        h1 = await graph.add_hypothesis("h1", "d")
+        h2 = await graph.add_hypothesis("h2", "d")
+        for i in range(3):
+            pid, _ = await graph.add_phenomenon(
+                "fs", "filesystem", f"to-h1 {i}", "d", source_tool="t",
+            )
+            await graph.update_hypothesis_confidence(h1, pid, "supports", "")
+        pid_h2, _ = await graph.add_phenomenon(
+            "fs", "filesystem", "to-h2", "d", source_tool="t",
+        )
+        await graph.update_hypothesis_confidence(h2, pid_h2, "supports", "")
+        assert graph.hypotheses[h2].log_odds == pytest.approx(1.0)
+
+    @pytest.mark.asyncio
+    async def test_damping_rank_persists_in_confidence_log(self, graph):
+        """The rank used for damping must be recorded so the math is
+        auditable from the persisted graph (no need to recompute)."""
+        hid = await graph.add_hypothesis("h", "d")
+        for i in range(2):
+            pid, _ = await graph.add_phenomenon(
+                "fs", "filesystem", f"p {i}", "d", source_tool="t",
+            )
+            await graph.update_hypothesis_confidence(hid, pid, "supports", "")
+        entries = graph.hypotheses[hid].confidence_log
+        assert entries[0]["rank"] == 1 and entries[0]["log_lr"] == pytest.approx(1.0)
+        assert entries[1]["rank"] == 2 and entries[1]["log_lr"] == pytest.approx(0.5)
    @pytest.mark.asyncio
    async def test_prior_prob_shifts_starting_log_odds(self, graph):
        # prior 0.9 → log_odds ≈ +0.954