# stonks-oracle/services/aggregation/pattern_matcher.py

"""Historical pattern mining for competitive intelligence.

Queries document_impact_records joined with trend_windows to find how
similar catalyst types resolved historically for a company or its
competitors.  Produces HistoricalPattern objects consumed by the signal
propagation engine and the aggregation worker.

Requirements: 3.1, 3.2, 3.3, 3.4, 3.5, 11.1, 11.2, 11.3, 11.5
"""
from __future__ import annotations

import logging
from dataclasses import dataclass
from datetime import datetime, timedelta, timezone
from typing import Optional

import asyncpg

from services.shared.config import CompetitiveConfig
from services.shared.schemas import MAJOR_DECISION_CATALYSTS

# Module-level logger named after this module's import path.
logger = logging.getLogger(__name__)

# Horizons queried when a caller passes no (or an empty) ``horizons`` list.
# Each label is looked up in _HORIZON_INTERVALS by the find_* functions;
# labels missing from that mapping are skipped with a warning.
DEFAULT_HORIZONS = ["1d", "7d", "30d"]


# ---------------------------------------------------------------------------
# Data classes
# ---------------------------------------------------------------------------

@dataclass
class HistoricalPattern:
    """Aggregated outcome statistics for one catalyst type over one horizon.

    Attributes:
        source_ticker: Ticker whose documents carried the catalyst.
        target_ticker: Ticker whose trend windows were examined (the same
            as ``source_ticker`` for self patterns).
        catalyst_type: Catalyst label the documents were matched on.
        time_horizon: Horizon label, one of ``1d``, ``7d`` or ``30d``.
        sample_count: Number of distinct impact records behind the stats.
        bullish_pct: Share of samples that resolved bullish, in [0, 1].
        bearish_pct: Share of samples that resolved bearish, in [0, 1].
        avg_strength: Mean trend strength, in [0, 1].
        avg_time_to_resolution: Mean delay, in days, between publication
            and the trend window used for the sample.
        pattern_confidence: Trust score for the pattern, in [0, 1].
        data_start: Publication time of the oldest sample.
        data_end: Publication time of the newest sample.
        tier: ``major_corporate_decision`` or ``routine_signal``.
        insufficient_data: True when the sample count is below the
            configured minimum.
    """

    source_ticker: str
    target_ticker: str
    catalyst_type: str
    time_horizon: str
    sample_count: int
    bullish_pct: float
    bearish_pct: float
    avg_strength: float
    avg_time_to_resolution: float
    pattern_confidence: float
    data_start: datetime
    data_end: datetime
    tier: str
    insufficient_data: bool


# ---------------------------------------------------------------------------
# Catalyst tier classification  (Req 11.1)
# ---------------------------------------------------------------------------

def classify_catalyst_tier(catalyst_type: str) -> str:
    """Map a catalyst type onto its tier label.

    The result depends only on membership in MAJOR_DECISION_CATALYSTS:
    members yield ``"major_corporate_decision"``; every other value yields
    ``"routine_signal"``.
    """
    is_major = catalyst_type in MAJOR_DECISION_CATALYSTS
    return "major_corporate_decision" if is_major else "routine_signal"


# ---------------------------------------------------------------------------
# Pattern confidence  (Req 3.3, 11.2)
# ---------------------------------------------------------------------------

def compute_pattern_confidence(
    sample_count: int,
    outcome_consistency: float,
    data_recency_days: float,
    tier: str,
    config: Optional[CompetitiveConfig] = None,
) -> float:
    """Score how far a historical pattern can be trusted.

    The base score is a weighted blend:
        40% sample size (saturating at 20 samples)
        40% outcome consistency (caller passes max(bullish_pct, bearish_pct))
        20% recency (1.0 / 0.7 / 0.4 by staleness tier)

    Adjustments are then applied in this order:
        1. ``major_corporate_decision`` tier multiplies the score by the
           configured major-decision multiplier (Req 11.2).
        2. The score is clamped to [0, 1].
        3. Below the configured minimum sample count the score is capped
           at 0.25 (Req 3.4).
        4. Data older than the staleness window is multiplied by the
           configured staleness decay penalty (Req 9.2).

    A default ``CompetitiveConfig`` is built when *config* is not supplied.
    """
    settings = config if config else CompetitiveConfig()

    # Sample size contributes linearly until it saturates at 20 samples.
    volume_weight = min(sample_count / 20.0, 1.0)

    # Recency tiers, checked from freshest to stalest.
    if data_recency_days <= settings.staleness_recent_days:
        freshness_weight = 1.0
    else:
        freshness_weight = (
            0.7 if data_recency_days <= settings.staleness_window_days else 0.4
        )

    score = (
        volume_weight * 0.4
        + outcome_consistency * 0.4
        + freshness_weight * 0.2
    )

    if tier == "major_corporate_decision":
        score *= settings.major_decision_weight_multiplier

    # Keep the (possibly boosted) score inside [0, 1].
    if score < 0.0:
        score = 0.0
    elif score > 1.0:
        score = 1.0

    # Too few samples: never report more than 0.25.
    if sample_count < settings.min_pattern_samples and score > 0.25:
        score = 0.25

    # Stale data is penalised once more, after the cap.
    if data_recency_days > settings.staleness_window_days:
        score *= settings.staleness_decay_penalty

    return score


# ---------------------------------------------------------------------------
# Lookback helper
# ---------------------------------------------------------------------------

def _lookback_days(tier: str, config: Optional[CompetitiveConfig] = None) -> int:
    """Pick the history window, in days, that applies to *tier*.

    ``major_corporate_decision`` uses the major-decision lookback from the
    config; any other tier uses the routine lookback.  A default
    ``CompetitiveConfig`` is built when *config* is not supplied.
    """
    settings = config if config else CompetitiveConfig()
    is_major = tier == "major_corporate_decision"
    return (
        settings.major_decision_lookback_days
        if is_major
        else settings.routine_lookback_days
    )


# ---------------------------------------------------------------------------
# SQL: self-company pattern query
# ---------------------------------------------------------------------------

# Joins a company's impact records for one catalyst type to that same
# company's trend windows generated shortly after each document.
#
# Placeholders:
#   $1  ticker; matched against dir.ticker and tw.entity_id
#   $2  catalyst type
#   $3  earliest published_at to include (inclusive)
#   $4  latest published_at to include (inclusive)
#   $5  trend window label compared with tw."window"
#   $6  horizon length; cast to interval and added to published_at to form
#       the inclusive upper bound on tw.generated_at
#
# Only documents whose intelligence is 'valid' and whose status is not
# 'rejected' are considered.  A document with several trend_windows rows in
# range produces several result rows; ORDER BY sorts on published_at only,
# so the relative order of those rows is not specified by this query.
#
# NOTE(review): asyncpg documents PostgreSQL interval <-> datetime.timedelta;
# confirm the value bound to $6 is of a type the driver accepts.
_SELF_PATTERN_QUERY = """
WITH matched_docs AS (
    SELECT
        dir.id AS dir_id,
        d.published_at,
        dir.sentiment
    FROM document_impact_records dir
    JOIN document_intelligence di ON di.id = dir.intelligence_id
    JOIN documents d ON d.id = di.document_id
    WHERE dir.ticker = $1
      AND dir.catalyst_type = $2
      AND di.validation_status = 'valid'
      AND d.status != 'rejected'
      AND d.published_at >= $3
      AND d.published_at <= $4
)
SELECT
    md.dir_id,
    md.published_at,
    md.sentiment,
    tw.trend_direction,
    tw.trend_strength,
    tw.generated_at,
    tw."window" AS tw_window
FROM matched_docs md
JOIN trend_windows tw
    ON tw.entity_type = 'company'
   AND tw.entity_id = $1
   AND tw."window" = $5
   AND tw.generated_at >= md.published_at
   AND tw.generated_at <= md.published_at + $6::interval
ORDER BY md.published_at DESC
"""


# ---------------------------------------------------------------------------
# SQL: cross-company pattern query
# ---------------------------------------------------------------------------

# Joins one company's impact records for a catalyst type to a *different*
# entity's trend windows generated shortly after each document.
#
# Placeholders:
#   $1  source ticker; matched against dir.ticker
#   $2  catalyst type
#   $3  earliest published_at to include (inclusive)
#   $4  latest published_at to include (inclusive)
#   $5  target entity id; matched against tw.entity_id
#   $6  trend window label compared with tw."window"
#   $7  horizon length; cast to interval and added to published_at to form
#       the inclusive upper bound on tw.generated_at
#
# Only documents whose intelligence is 'valid' and whose status is not
# 'rejected' are considered.  A document with several trend_windows rows in
# range produces several result rows; ORDER BY sorts on published_at only,
# so the relative order of those rows is not specified by this query.
#
# NOTE(review): asyncpg documents PostgreSQL interval <-> datetime.timedelta;
# confirm the value bound to $7 is of a type the driver accepts.
_CROSS_PATTERN_QUERY = """
WITH matched_docs AS (
    SELECT
        dir.id AS dir_id,
        d.published_at,
        dir.sentiment
    FROM document_impact_records dir
    JOIN document_intelligence di ON di.id = dir.intelligence_id
    JOIN documents d ON d.id = di.document_id
    WHERE dir.ticker = $1
      AND dir.catalyst_type = $2
      AND di.validation_status = 'valid'
      AND d.status != 'rejected'
      AND d.published_at >= $3
      AND d.published_at <= $4
)
SELECT
    md.dir_id,
    md.published_at,
    md.sentiment,
    tw.trend_direction,
    tw.trend_strength,
    tw.generated_at,
    tw."window" AS tw_window
FROM matched_docs md
JOIN trend_windows tw
    ON tw.entity_type = 'company'
   AND tw.entity_id = $5
   AND tw."window" = $6
   AND tw.generated_at >= md.published_at
   AND tw.generated_at <= md.published_at + $7::interval
ORDER BY md.published_at DESC
"""


# ---------------------------------------------------------------------------
# Horizon → interval mapping
# ---------------------------------------------------------------------------

# Maps a horizon label to the length of the post-publication window in which
# trend windows are matched.  The value is bound to the ``$n::interval``
# placeholder of the pattern queries.
#
# Values are ``timedelta`` objects rather than strings such as "7 days":
# asyncpg encodes PostgreSQL ``interval`` parameters from
# ``datetime.timedelta`` and raises DataError for a ``str``, which would make
# every pattern query fail before reaching the server.
_HORIZON_INTERVALS: dict[str, timedelta] = {
    "1d": timedelta(days=1),
    "7d": timedelta(days=7),
    "30d": timedelta(days=30),
}


# ---------------------------------------------------------------------------
# Build HistoricalPattern from query rows
# ---------------------------------------------------------------------------

def _build_pattern(
    rows: list[asyncpg.Record],
    source_ticker: str,
    target_ticker: str,
    catalyst_type: str,
    horizon: str,
    tier: str,
    config: Optional[CompetitiveConfig] = None,
) -> Optional[HistoricalPattern]:
    """Aggregate joined document/trend-window rows into one HistoricalPattern.

    Args:
        rows: Records exposing ``dir_id``, ``published_at``, ``generated_at``,
            ``trend_direction`` and ``trend_strength``.  Several rows may
            share a ``dir_id`` (one per matching trend window); row order
            does not matter.
        source_ticker: Ticker the matched documents are about.
        target_ticker: Ticker whose trend windows were joined.
        catalyst_type: Catalyst label the documents were matched on.
        horizon: Horizon label stored on the result as ``time_horizon``.
        tier: Catalyst tier, forwarded to the confidence computation.
        config: Thresholds to use; a default ``CompetitiveConfig`` is built
            when omitted.

    Returns:
        ``None`` when *rows* is empty, otherwise a ``HistoricalPattern``
        summarising one sample per distinct ``dir_id``.  If no sample has a
        ``published_at`` value, ``data_start`` and ``data_end`` are ``None``
        and the data is treated as 999 days old.
    """
    if not rows:
        return None

    cfg = config or CompetitiveConfig()

    # One sample per impact record: keep the trend window generated soonest
    # after publication.  The choice is made explicitly here because the
    # queries only sort by published_at, so the order of several windows
    # belonging to the same document is arbitrary.
    closest: dict[str, asyncpg.Record] = {}
    for row in rows:
        key = str(row["dir_id"])
        kept = closest.get(key)
        if kept is None:
            closest[key] = row
            continue
        generated = row["generated_at"]
        kept_generated = kept["generated_at"]
        if generated is not None and (
            kept_generated is None or generated < kept_generated
        ):
            # Re-assigning an existing key keeps its original position, so
            # samples stay in first-seen order.
            closest[key] = row
    unique_rows = list(closest.values())
    sample_count = len(unique_rows)

    # Directions other than bullish/bearish count toward neither share.
    bullish = sum(1 for r in unique_rows if r["trend_direction"] == "bullish")
    bearish = sum(1 for r in unique_rows if r["trend_direction"] == "bearish")
    bullish_pct = bullish / sample_count
    bearish_pct = bearish / sample_count

    strengths = [
        float(r["trend_strength"])
        for r in unique_rows
        if r["trend_strength"] is not None
    ]
    avg_strength = sum(strengths) / len(strengths) if strengths else 0.0

    # Average delay, in days, between publication and the chosen trend
    # window; negative gaps are floored at zero.
    resolutions: list[float] = []
    for r in unique_rows:
        pub = r["published_at"]
        gen = r["generated_at"]
        if pub is not None and gen is not None:
            delta = (gen - pub).total_seconds() / 86400.0
            resolutions.append(max(delta, 0.0))
    avg_time_to_resolution = (
        sum(resolutions) / len(resolutions) if resolutions else 0.0
    )

    # Date range of the samples.  ``default=None`` avoids a ValueError when
    # no row carries a publication date.
    published_dates = [
        r["published_at"] for r in unique_rows if r["published_at"] is not None
    ]
    data_start = min(published_dates, default=None)
    data_end = max(published_dates, default=None)

    # Recency: days since the most recent sample, or a large sentinel when
    # the date is unknown so the pattern is scored as stale.
    now = datetime.now(timezone.utc)
    if data_end is not None:
        data_recency_days = (now - data_end).total_seconds() / 86400.0
    else:
        data_recency_days = 999.0

    outcome_consistency = max(bullish_pct, bearish_pct)
    confidence = compute_pattern_confidence(
        sample_count, outcome_consistency, data_recency_days, tier, cfg,
    )

    insufficient_data = sample_count < cfg.min_pattern_samples

    return HistoricalPattern(
        source_ticker=source_ticker,
        target_ticker=target_ticker,
        catalyst_type=catalyst_type,
        time_horizon=horizon,
        sample_count=sample_count,
        bullish_pct=bullish_pct,
        bearish_pct=bearish_pct,
        avg_strength=min(max(avg_strength, 0.0), 1.0),
        avg_time_to_resolution=avg_time_to_resolution,
        pattern_confidence=confidence,
        data_start=data_start,
        data_end=data_end,
        tier=tier,
        insufficient_data=insufficient_data,
    )


# ---------------------------------------------------------------------------
# Public API
# ---------------------------------------------------------------------------

async def find_self_patterns(
    pool: asyncpg.Pool,
    ticker: str,
    catalyst_type: str,
    horizons: Optional[list[str]] = None,
    config: Optional[CompetitiveConfig] = None,
) -> list[HistoricalPattern]:
    """Summarise how *catalyst_type* has resolved for *ticker* itself.

    For each requested horizon, runs the self-pattern query on one pooled
    connection and aggregates the rows into a ``HistoricalPattern`` whose
    source and target are both *ticker*.  The lookback window depends on
    the catalyst's tier.

    Horizons missing from ``_HORIZON_INTERVALS`` are skipped with a warning.
    A query failure is logged and that horizon is skipped.  Horizons that
    return no rows contribute nothing to the result.

    Requirements: 3.1, 3.2, 3.5, 11.3
    """
    settings = config or CompetitiveConfig()
    requested = horizons if horizons else DEFAULT_HORIZONS
    tier = classify_catalyst_tier(catalyst_type)

    window_end = datetime.now(timezone.utc)
    window_start = window_end - timedelta(days=_lookback_days(tier, settings))

    results: list[HistoricalPattern] = []
    async with pool.acquire() as conn:
        for label in requested:
            span = _HORIZON_INTERVALS.get(label)
            if span is None:
                logger.warning("Unknown horizon %s, skipping", label)
                continue

            # Positional arguments map onto $1..$6 of the query.
            params = (
                ticker,
                catalyst_type,
                window_start,
                window_end,
                label,
                span,
            )
            try:
                records = await conn.fetch(_SELF_PATTERN_QUERY, *params)
            except Exception:
                logger.exception(
                    "Error querying self-patterns for %s/%s/%s",
                    ticker, catalyst_type, label,
                )
            else:
                summary = _build_pattern(
                    records, ticker, ticker, catalyst_type, label, tier,
                    settings,
                )
                if summary is not None:
                    results.append(summary)

    return results


async def find_cross_company_patterns(
    pool: asyncpg.Pool,
    source_ticker: str,
    target_ticker: str,
    catalyst_type: str,
    horizons: Optional[list[str]] = None,
    config: Optional[CompetitiveConfig] = None,
) -> list[HistoricalPattern]:
    """Summarise how *target_ticker* trended after *source_ticker* catalysts.

    For each requested horizon, runs the cross-company query on one pooled
    connection: documents about *source_ticker* with the given catalyst
    type are paired with *target_ticker* trend windows generated within
    the horizon after publication, then aggregated into a
    ``HistoricalPattern``.  The lookback window depends on the catalyst's
    tier.

    Horizons missing from ``_HORIZON_INTERVALS`` are skipped with a warning.
    A query failure is logged and that horizon is skipped.  Horizons that
    return no rows contribute nothing to the result.

    Requirements: 4.2, 11.5
    """
    settings = config or CompetitiveConfig()
    requested = horizons if horizons else DEFAULT_HORIZONS
    tier = classify_catalyst_tier(catalyst_type)

    window_end = datetime.now(timezone.utc)
    window_start = window_end - timedelta(days=_lookback_days(tier, settings))

    results: list[HistoricalPattern] = []
    async with pool.acquire() as conn:
        for label in requested:
            span = _HORIZON_INTERVALS.get(label)
            if span is None:
                logger.warning("Unknown horizon %s, skipping", label)
                continue

            # Positional arguments map onto $1..$7 of the query.
            params = (
                source_ticker,
                catalyst_type,
                window_start,
                window_end,
                target_ticker,
                label,
                span,
            )
            try:
                records = await conn.fetch(_CROSS_PATTERN_QUERY, *params)
            except Exception:
                logger.exception(
                    "Error querying cross-patterns for %s→%s/%s/%s",
                    source_ticker, target_ticker, catalyst_type, label,
                )
            else:
                summary = _build_pattern(
                    records, source_ticker, target_ticker, catalyst_type,
                    label, tier, settings,
                )
                if summary is not None:
                    results.append(summary)

    return results