# stonks-oracle/services/aggregation/evidence.py

"""Evidence ranking for supporting and opposing documents.

Ranks document signals by a composite score that considers multiple
factors beyond raw weight, producing explainable evidence lists for
trend summaries.

Requirements: 6.5
"""
from __future__ import annotations

from dataclasses import dataclass

from services.aggregation.scoring import WeightedSignal


@dataclass(frozen=True)
class EvidenceRankConfig:
    """Immutable tuning knobs for the composite evidence ranking score.

    Each ``*_factor`` is the multiplier applied to one component of a
    signal when the rank score is computed; the default factors add up
    to 1.0. ``max_refs`` limits how many documents are kept on each side
    (supporting / opposing) of the ranking.
    """

    # Multiplier for the signal's combined weight
    # (recency * credibility * novelty * market).
    weight_factor: float = 0.4
    # Multiplier for the document's impact score.
    impact_factor: float = 0.3
    # Multiplier for recency on its own, so fresh evidence ranks higher.
    recency_factor: float = 0.2
    # Multiplier for extraction confidence.
    confidence_factor: float = 0.1
    # Upper bound on evidence references returned per side.
    max_refs: int = 10


# Shared default configuration, used as the default ``config`` argument of the
# ranking functions in this module. EvidenceRankConfig is a frozen dataclass,
# so this single instance can be reused without risk of mutation.
DEFAULT_RANK_CONFIG = EvidenceRankConfig()


@dataclass
class RankedEvidence:
    """Ranking result for one document: the total score and its breakdown.

    The four ``*_component`` fields are the individual contributions that
    make up ``rank_score``, kept separately so the ranking can be explained.
    """

    # Identifier of the document the signal was derived from.
    document_id: str
    # Composite score used to order evidence (higher ranks first).
    rank_score: float
    # Contribution of the signal's combined weight.
    weight_component: float
    # Contribution of the document's impact score.
    impact_component: float
    # Contribution of recency alone.
    recency_component: float
    # Contribution of extraction confidence.
    confidence_component: float
    # Sentiment of the underlying signal: +1 / -1 / 0.
    sentiment_value: float


def compute_evidence_rank(
    signal: WeightedSignal,
    config: EvidenceRankConfig = DEFAULT_RANK_CONFIG,
) -> RankedEvidence:
    """Score a single signal and report how each factor contributed.

    Four contributions are added together, each being one attribute of
    the signal multiplied by the matching factor in ``config``:

    - ``signal.weight.combined``    * ``weight_factor``
    - ``signal.impact_score``       * ``impact_factor``
    - ``signal.weight.recency``     * ``recency_factor`` (extra freshness boost)
    - ``signal.weight.credibility`` * ``confidence_factor`` (stands in for
      extraction confidence)

    Inputs are not validated here. If every attribute lies in [0, 1], the
    score cannot exceed the sum of the factors.

    Args:
        signal: The weighted signal to score.
        config: Factors used for the blend.

    Returns:
        A ``RankedEvidence`` carrying the document id, the signal's
        sentiment value, and the score plus its four contributions, each
        rounded to 6 decimal places.
    """
    precision = 6  # decimal places kept on every reported number
    weights = signal.weight

    from_weight = weights.combined * config.weight_factor
    from_impact = signal.impact_score * config.impact_factor
    from_recency = weights.recency * config.recency_factor
    from_confidence = weights.credibility * config.confidence_factor

    # Sum the unrounded contributions and round once, so the total is not
    # affected by the rounding of the individual parts.
    total = from_weight + from_impact + from_recency + from_confidence

    return RankedEvidence(
        document_id=signal.document_id,
        rank_score=round(total, precision),
        weight_component=round(from_weight, precision),
        impact_component=round(from_impact, precision),
        recency_component=round(from_recency, precision),
        confidence_component=round(from_confidence, precision),
        sentiment_value=signal.sentiment_value,
    )


def rank_evidence(
    signals: list[WeightedSignal],
    config: EvidenceRankConfig = DEFAULT_RANK_CONFIG,
) -> tuple[list[str], list[str]]:
    """Rank signals into top supporting and opposing document ID lists.

    Supporting = positive sentiment, Opposing = negative sentiment.
    Signals whose sentiment value is exactly 0 are excluded.

    This is a thin projection over ``rank_evidence_detailed``: the
    partitioning, ordering and capping live there, so both entry points
    always agree on which documents are selected and in what order.

    Args:
        signals: Weighted signals to rank.
        config: Ranking factors and the per-side cap (``max_refs``).

    Returns:
        ``(supporting_ids, opposing_ids)``, each ordered by descending
        rank score and capped at ``config.max_refs``.
    """
    supporting, opposing = rank_evidence_detailed(signals, config)
    return (
        [evidence.document_id for evidence in supporting],
        [evidence.document_id for evidence in opposing],
    )


def rank_evidence_detailed(
    signals: list[WeightedSignal],
    config: EvidenceRankConfig = DEFAULT_RANK_CONFIG,
) -> tuple[list[RankedEvidence], list[RankedEvidence]]:
    """Like rank_evidence but returns full RankedEvidence objects.

    Useful when callers need the score breakdown for explainability.

    Signals with a positive sentiment value are treated as supporting,
    all other non-zero ones as opposing; signals whose sentiment value
    is exactly 0 are skipped.

    Args:
        signals: Weighted signals to rank.
        config: Ranking factors and the per-side cap (``max_refs``).

    Returns:
        ``(supporting, opposing)``, each ordered by descending
        ``rank_score`` (ties keep their input order) and holding at most
        ``config.max_refs`` entries. A ``max_refs`` of zero or less
        yields empty lists.
    """
    supporting: list[RankedEvidence] = []
    opposing: list[RankedEvidence] = []

    for sig in signals:
        sentiment = sig.sentiment_value
        if sentiment == 0.0:
            continue
        side = supporting if sentiment > 0 else opposing
        side.append(compute_evidence_rank(sig, config))

    # A negative stop index would slice from the tail (dropping only the
    # lowest-ranked entries) instead of capping, so clamp the limit at zero.
    limit = max(config.max_refs, 0)

    def _top(entries: list[RankedEvidence]) -> list[RankedEvidence]:
        # sorted() is stable, including with reverse=True, so equal scores
        # stay in the order the signals were supplied.
        return sorted(entries, key=lambda r: r.rank_score, reverse=True)[:limit]

    return _top(supporting), _top(opposing)