diff --git a/services/aggregation/worker.py b/services/aggregation/worker.py index 929acc2..4e3f66d 100644 --- a/services/aggregation/worker.py +++ b/services/aggregation/worker.py @@ -10,6 +10,7 @@ from __future__ import annotations import json import logging +import math import time import uuid as _uuid from dataclasses import dataclass @@ -582,7 +583,9 @@ def compute_trend_confidence( Confidence is based on: - Number of UNIQUE source documents (not raw signal count) - Average extraction confidence of contributing signals - - Signal agreement (what fraction point the same direction) + - Signal agreement (what fraction point the same direction), + dampened by unique-source count so that 1-2 sources agreeing doesn't + inflate confidence the same way 7+ sources agreeing does - Contradiction penalty (high contradiction lowers confidence) Returns a value in [0, 1]. @@ -611,6 +614,12 @@ def compute_trend_confidence( else: agreement = 0.5 + # Dampen agreement by unique-source count: 1-2 sources agreeing is far less + # meaningful than 7+ sources agreeing. Uses log2(n+1)/log2(8) so the + # dampener reaches 1.0 at exactly n=7 unique sources. 
+ agreement_dampener = min(1.0, math.log2(unique_sources + 1) / math.log2(8)) + agreement *= agreement_dampener + # Contradiction penalty contradiction_penalty = contradiction_score * 0.4 diff --git a/tests/test_paper_trading_simulation.py b/tests/test_paper_trading_simulation.py index b0c5020..9037805 100644 --- a/tests/test_paper_trading_simulation.py +++ b/tests/test_paper_trading_simulation.py @@ -426,11 +426,11 @@ class TestRecommendationDrivenOrders: impacts = [ ImpactRow( document_id="doc-weak-1", - confidence=0.20, - novelty_score=0.1, - source_credibility=0.2, + confidence=0.40, + novelty_score=0.3, + source_credibility=0.5, sentiment="positive", - impact_score=0.1, + impact_score=0.3, catalyst_type="other", key_facts=["Minor update"], risks=[], @@ -438,11 +438,11 @@ class TestRecommendationDrivenOrders: ), ImpactRow( document_id="doc-weak-2", - confidence=0.15, - novelty_score=0.1, - source_credibility=0.2, - sentiment="negative", - impact_score=0.1, + confidence=0.35, + novelty_score=0.2, + source_credibility=0.4, + sentiment="positive", + impact_score=0.25, catalyst_type="other", key_facts=["Routine filing"], risks=[],