# stonks-oracle/services/aggregation/contradiction.py

"""Contradiction detection and disagreement representation.

Analyses weighted signals to detect and represent disagreement explicitly,
rather than collapsing contradictory evidence into a single unsupported
conclusion.

Requirements: 6.4, 6.5, 15.1–15.7
"""
from __future__ import annotations

import math
from dataclasses import dataclass

from services.aggregation.scoring import WeightedSignal
from services.shared.schemas import DisagreementDetail


@dataclass
class CatalystEntry:
    """Minimal (document, catalyst type) pairing used by contradiction detection.

    Defined locally so that this module does not need to import ImpactRow,
    which would create a circular dependency with worker.py.
    """

    # Document identifier; matched against WeightedSignal.document_id.
    document_id: str
    # Catalyst label; entries sharing a label are analysed as one group.
    catalyst_type: str


@dataclass
class ContradictionResult:
    """Outcome of a contradiction analysis: one overall score plus evidence."""

    # Overall contradiction score (0-1, same semantics as the existing
    # compute_contradiction_score).
    score: float
    # Per-dimension disagreement records backing the score.
    details: list[DisagreementDetail]


def detect_contradictions(
    signals: list[WeightedSignal],
    catalyst_entries: list[CatalystEntry] | None = None,
    *,
    probabilistic: bool = False,
    w_threshold: float = 5.0,
) -> ContradictionResult:
    """Analyse weighted signals for disagreement along several dimensions.

    Two dimensions are inspected:

    1. Sentiment — positive signals versus negative signals overall.
    2. Catalyst — opposing sentiment within the same catalyst type (only
       when ``catalyst_entries`` is supplied and non-empty).

    The overall score is chosen by ``probabilistic``: True selects the
    weighted disagreement entropy (Req 15.1–15.7); False keeps the
    minority/majority weight ratio unchanged.

    Args:
        signals: Weighted signals to analyse.
        catalyst_entries: Optional catalyst metadata enabling the
            per-catalyst analysis.
        probabilistic: Use the entropy-based score when True.
        w_threshold: Evidence mass threshold used by the entropy score
            (default 5.0); ignored when ``probabilistic`` is False.

    Returns:
        A ContradictionResult holding the overall score and the list of
        per-dimension disagreement details (sentiment first, then catalysts).
    """
    findings: list[DisagreementDetail] = []

    # Dimension 1: the overall positive-vs-negative split.
    sentiment_split = _detect_sentiment_disagreement(signals)
    if sentiment_split is not None:
        findings.append(sentiment_split)

    # Dimension 2: per-catalyst splits, only when metadata was provided.
    if catalyst_entries:
        findings += _detect_catalyst_disagreement(signals, catalyst_entries)

    overall = (
        _compute_entropy_score(signals, w_threshold)
        if probabilistic
        else _compute_overall_score(signals)
    )
    return ContradictionResult(score=overall, details=findings)


def _compute_overall_score(signals: list[WeightedSignal]) -> float:
    """Return the minority share of directional weight (backward-compatible).

    Each signal contributes ``weight.combined * impact_score`` to the side
    matching the sign of its ``sentiment_value``; neutral signals are
    ignored.  The result is ``min(pos, neg) / (pos + neg)`` rounded to four
    decimals, or 0.0 when there are no signals or no directional weight.
    """
    if not signals:
        return 0.0

    bullish, bearish = 0.0, 0.0
    for signal in signals:
        mass = signal.weight.combined * signal.impact_score
        direction = signal.sentiment_value
        if direction > 0:
            bullish += mass
        elif direction < 0:
            bearish += mass

    combined = bullish + bearish
    if combined == 0.0:
        # Nothing but neutral (or zero-weight) signals: no contradiction.
        return 0.0
    return round(min(bullish, bearish) / combined, 4)


def _compute_entropy_score(
    signals: list[WeightedSignal],
    w_threshold: float = 5.0,
) -> float:
    """Weighted disagreement entropy — probabilistic contradiction score.

    Computes Shannon entropy over the positive/negative weight distribution,
    scaled by evidence mass relative to a configurable threshold.

    Formula:
        f_pos = W_pos / (W_pos + W_neg)
        f_neg = W_neg / (W_pos + W_neg)
        H = -f_pos·log₂(f_pos) - f_neg·log₂(f_neg)   (in [0, 1])
        score = H · min(1.0, (W_pos + W_neg) / W_threshold)

    Args:
        signals: Weighted signals; each contributes
            ``weight.combined * impact_score`` to the side matching the sign
            of its ``sentiment_value`` (neutral signals are ignored).
        w_threshold: Evidence mass at which the entropy counts in full.
            A non-positive threshold disables the scaling (factor 1.0).

    Returns:
        The score rounded to four decimals.  0.0 when there are no signals,
        when only one direction carries weight (no disagreement), or when
        one side is so small that its weight fraction underflows to zero.

    Requirements: 15.1–15.7
    """
    if not signals:
        return 0.0

    pos_weight = 0.0
    neg_weight = 0.0
    for sig in signals:
        w = sig.weight.combined * sig.impact_score
        if sig.sentiment_value > 0:
            pos_weight += w
        elif sig.sentiment_value < 0:
            neg_weight += w

    # No disagreement when only one direction exists (Req 15.5)
    if pos_weight <= 0.0 or neg_weight <= 0.0:
        return 0.0

    total = pos_weight + neg_weight

    # Compute weight fractions (Req 15.2)
    f_pos = pos_weight / total
    f_neg = neg_weight / total  # = 1 - f_pos

    # Positive weights do not guarantee positive fractions: when one side is
    # vanishingly small relative to the other, the division underflows to
    # exactly 0.0 and math.log2(0.0) would raise ValueError.  By the usual
    # convention 0·log₂(0) = 0, so the entropy — and the score — is zero.
    if f_pos == 0.0 or f_neg == 0.0:
        return 0.0

    # Shannon entropy H = -f_pos·log₂(f_pos) - f_neg·log₂(f_neg) (Req 15.3)
    h_contradiction = -f_pos * math.log2(f_pos) - f_neg * math.log2(f_neg)

    # Weight by evidence mass (Req 15.4)
    evidence_factor = min(1.0, total / w_threshold) if w_threshold > 0.0 else 1.0
    score = h_contradiction * evidence_factor

    return round(score, 4)


def _detect_sentiment_disagreement(
    signals: list[WeightedSignal],
) -> DisagreementDetail | None:
    """Report the overall positive-vs-negative split, if there is one.

    Signals whose effective weight (``weight.combined * impact_score``) is
    zero or negative are ignored, as are neutral signals.  Returns None
    unless at least one positive and one negative signal remain; otherwise
    returns a "sentiment" DisagreementDetail listing the documents and
    total weight on each side.
    """
    bullish_docs: list[str] = []
    bearish_docs: list[str] = []
    bullish_mass, bearish_mass = 0.0, 0.0

    for signal in signals:
        mass = signal.weight.combined * signal.impact_score
        if mass <= 0:
            continue  # carries no evidence
        direction = signal.sentiment_value
        if direction > 0:
            bullish_docs.append(signal.document_id)
            bullish_mass += mass
        elif direction < 0:
            bearish_docs.append(signal.document_id)
            bearish_mass += mass

    # Both sides must be represented for there to be a disagreement.
    if not (bullish_docs and bearish_docs):
        return None

    combined = bullish_mass + bearish_mass
    if combined > 0:
        minority_share = min(bullish_mass, bearish_mass) / combined
    else:
        minority_share = 0.0

    summary = (
        f"Sentiment split: {len(bullish_docs)} positive vs {len(bearish_docs)} negative signals "
        f"(minority weight ratio {minority_share:.0%})"
    )
    return DisagreementDetail(
        dimension="sentiment",
        positive_doc_ids=bullish_docs,
        negative_doc_ids=bearish_docs,
        positive_weight=round(bullish_mass, 4),
        negative_weight=round(bearish_mass, 4),
        description=summary,
    )


def _detect_catalyst_disagreement(
    signals: list[WeightedSignal],
    catalyst_entries: list[CatalystEntry],
) -> list[DisagreementDetail]:
    """Find catalyst types whose documents pull in opposite directions.

    Each catalyst entry is joined to its signal by ``document_id``.  Signals
    with non-positive effective weight (``weight.combined * impact_score``)
    and signals with zero sentiment are left out.  A DisagreementDetail
    with dimension ``catalyst:<type>`` is produced for every catalyst type
    that ends up with at least one positive and one negative document.
    """
    # document_id -> (sentiment_value, effective weight); a later signal for
    # the same document replaces an earlier one.
    weighted_by_doc: dict[str, tuple[float, float]] = {}
    for signal in signals:
        mass = signal.weight.combined * signal.impact_score
        if mass > 0:
            weighted_by_doc[signal.document_id] = (signal.sentiment_value, mass)

    # catalyst_type -> [(document_id, sentiment_value, weight), ...]
    by_catalyst: dict[str, list[tuple[str, float, float]]] = {}
    for entry in catalyst_entries:
        hit = weighted_by_doc.get(entry.document_id)
        if hit is None:
            continue
        direction, mass = hit
        if direction != 0.0:
            by_catalyst.setdefault(entry.catalyst_type, []).append(
                (entry.document_id, direction, mass)
            )

    found: list[DisagreementDetail] = []
    for catalyst, members in by_catalyst.items():
        bullish = [(doc_id, mass) for doc_id, direction, mass in members if direction > 0]
        bearish = [(doc_id, mass) for doc_id, direction, mass in members if direction < 0]
        if not (bullish and bearish):
            continue  # everyone agrees on this catalyst

        found.append(DisagreementDetail(
            dimension=f"catalyst:{catalyst}",
            positive_doc_ids=[doc_id for doc_id, _ in bullish],
            negative_doc_ids=[doc_id for doc_id, _ in bearish],
            positive_weight=round(sum(mass for _, mass in bullish), 4),
            negative_weight=round(sum(mass for _, mass in bearish), 4),
            description=(
                f"Catalyst '{catalyst}' has {len(bullish)} positive and "
                f"{len(bearish)} negative signals"
            ),
        ))

    return found