fix: dampen agreement factor by sample size in trend confidence to prevent low-evidence inflation

With only 1-2 signals, the agreement factor was inflating confidence to paper-eligible levels (0.575), even for low-credibility sources. Added a log2-based dampener, min(1, log2(n+1)/log2(8)), that scales the agreement contribution by the unique source count n and saturates at n=7. A single signal now caps at 0.39 confidence and two signals at 0.49 — both correctly below the paper threshold (0.50).
2026-04-17 03:41:39 +00:00
parent d80d44e2fc
commit e21f162e48
2 changed files with 19 additions and 10 deletions
@@ -10,6 +10,7 @@ from __future__ import annotations

 import json
 import logging
+import math
 import time
 import uuid as _uuid
 from dataclasses import dataclass
@@ -582,7 +583,9 @@ def compute_trend_confidence(
    Confidence is based on:
    - Number of UNIQUE source documents (not raw signal count)
    - Average extraction confidence of contributing signals
-    - Signal agreement (what fraction point the same direction)
+    - Signal agreement (what fraction point the same direction),
+      dampened by sample size so that 1-2 signals agreeing doesn't
+      inflate confidence the same way 10+ signals agreeing does
    - Contradiction penalty (high contradiction lowers confidence)

    Returns a value in [0, 1].
@@ -611,6 +614,12 @@ def compute_trend_confidence(
    else:
        agreement = 0.5

+    # Dampen agreement by sample size: agreement among 1-2 sources is far
+    # less meaningful than agreement among 7+.  log2(n+1)/log2(8) grows from
+    # 0 at n=0 to exactly 1.0 at n=7 unique sources; min() clamps it there.
+    agreement_dampener = min(1.0, math.log2(unique_sources + 1) / math.log2(8))
+    agreement *= agreement_dampener
+
    # Contradiction penalty
    contradiction_penalty = contradiction_score * 0.4

@@ -426,11 +426,11 @@ class TestRecommendationDrivenOrders:
        impacts = [
            ImpactRow(
                document_id="doc-weak-1",
-                confidence=0.20,
-                novelty_score=0.1,
-                source_credibility=0.2,
+                confidence=0.40,
+                novelty_score=0.3,
+                source_credibility=0.5,
                sentiment="positive",
-                impact_score=0.1,
+                impact_score=0.3,
                catalyst_type="other",
                key_facts=["Minor update"],
                risks=[],
@@ -438,11 +438,11 @@ class TestRecommendationDrivenOrders:
            ),
            ImpactRow(
                document_id="doc-weak-2",
-                confidence=0.15,
-                novelty_score=0.1,
-                source_credibility=0.2,
-                sentiment="negative",
-                impact_score=0.1,
+                confidence=0.35,
+                novelty_score=0.2,
+                source_credibility=0.4,
+                sentiment="positive",
+                impact_score=0.25,
                catalyst_type="other",
                key_facts=["Routine filing"],
                risks=[],