fix: dampen agreement factor by sample size in trend confidence to prevent low-evidence inflation
Agreement among only 1-2 signals was inflating confidence to paper-eligible levels (0.575), even with low-credibility sources. Added a log2-based dampener, min(1.0, log2(n + 1) / log2(8)), that scales the agreement contribution by the unique source count n and saturates at n=7. A single signal now caps at 0.39 confidence and 2 signals at 0.49 — both correctly below the paper threshold (0.50).
This commit is contained in:
@@ -10,6 +10,7 @@ from __future__ import annotations
|
|||||||
|
|
||||||
import json
|
import json
|
||||||
import logging
|
import logging
|
||||||
|
import math
|
||||||
import time
|
import time
|
||||||
import uuid as _uuid
|
import uuid as _uuid
|
||||||
from dataclasses import dataclass
|
from dataclasses import dataclass
|
||||||
@@ -582,7 +583,9 @@ def compute_trend_confidence(
|
|||||||
Confidence is based on:
|
Confidence is based on:
|
||||||
- Number of UNIQUE source documents (not raw signal count)
|
- Number of UNIQUE source documents (not raw signal count)
|
||||||
- Average extraction confidence of contributing signals
|
- Average extraction confidence of contributing signals
|
||||||
- Signal agreement (what fraction point the same direction)
|
- Signal agreement (what fraction point the same direction),
|
||||||
|
dampened by sample size so that 1-2 signals agreeing doesn't
|
||||||
|
inflate confidence the same way 10+ signals agreeing does
|
||||||
- Contradiction penalty (high contradiction lowers confidence)
|
- Contradiction penalty (high contradiction lowers confidence)
|
||||||
|
|
||||||
Returns a value in [0, 1].
|
Returns a value in [0, 1].
|
||||||
@@ -611,6 +614,12 @@ def compute_trend_confidence(
|
|||||||
else:
|
else:
|
||||||
agreement = 0.5
|
agreement = 0.5
|
||||||
|
|
||||||
|
# Dampen agreement by sample size: 1-2 signals agreeing is far less
|
||||||
|
# meaningful than 7+ signals agreeing. Uses log2(n+1)/log2(8) so the
|
||||||
|
# dampener reaches exactly 1.0 at n=7 unique sources and stays there.
|
||||||
|
agreement_dampener = min(1.0, math.log2(unique_sources + 1) / math.log2(8))
|
||||||
|
agreement *= agreement_dampener
|
||||||
|
|
||||||
# Contradiction penalty
|
# Contradiction penalty
|
||||||
contradiction_penalty = contradiction_score * 0.4
|
contradiction_penalty = contradiction_score * 0.4
|
||||||
|
|
||||||
|
|||||||
@@ -426,11 +426,11 @@ class TestRecommendationDrivenOrders:
|
|||||||
impacts = [
|
impacts = [
|
||||||
ImpactRow(
|
ImpactRow(
|
||||||
document_id="doc-weak-1",
|
document_id="doc-weak-1",
|
||||||
confidence=0.20,
|
confidence=0.40,
|
||||||
novelty_score=0.1,
|
novelty_score=0.3,
|
||||||
source_credibility=0.2,
|
source_credibility=0.5,
|
||||||
sentiment="positive",
|
sentiment="positive",
|
||||||
impact_score=0.1,
|
impact_score=0.3,
|
||||||
catalyst_type="other",
|
catalyst_type="other",
|
||||||
key_facts=["Minor update"],
|
key_facts=["Minor update"],
|
||||||
risks=[],
|
risks=[],
|
||||||
@@ -438,11 +438,11 @@ class TestRecommendationDrivenOrders:
|
|||||||
),
|
),
|
||||||
ImpactRow(
|
ImpactRow(
|
||||||
document_id="doc-weak-2",
|
document_id="doc-weak-2",
|
||||||
confidence=0.15,
|
confidence=0.35,
|
||||||
novelty_score=0.1,
|
novelty_score=0.2,
|
||||||
source_credibility=0.2,
|
source_credibility=0.4,
|
||||||
sentiment="negative",
|
sentiment="positive",
|
||||||
impact_score=0.1,
|
impact_score=0.25,
|
||||||
catalyst_type="other",
|
catalyst_type="other",
|
||||||
key_facts=["Routine filing"],
|
key_facts=["Routine filing"],
|
||||||
risks=[],
|
risks=[],
|
||||||
|
|||||||
Reference in New Issue
Block a user