"""Historical pattern mining for competitive intelligence.

Queries document_impact_records joined with trend_windows to find how
similar catalyst types resolved historically for a company or its
competitors.  Produces HistoricalPattern objects consumed by the signal
propagation engine and the aggregation worker.

Requirements: 3.1, 3.2, 3.3, 3.4, 3.5, 11.1, 11.2, 11.3, 11.5
"""

from __future__ import annotations

import logging
from dataclasses import dataclass
from datetime import datetime, timedelta, timezone
from typing import Optional

import asyncpg

from services.shared.config import CompetitiveConfig
from services.shared.schemas import MAJOR_DECISION_CATALYSTS

logger = logging.getLogger(__name__)

# Horizons mined when the caller does not pass an explicit list.
DEFAULT_HORIZONS = ["1d", "7d", "30d"]


# ---------------------------------------------------------------------------
# Data classes
# ---------------------------------------------------------------------------


@dataclass
class HistoricalPattern:
    """Statistical summary of how a catalyst type resolved historically."""

    source_ticker: str
    target_ticker: str
    catalyst_type: str
    time_horizon: str  # 1d | 7d | 30d
    sample_count: int
    bullish_pct: float  # [0, 1]
    bearish_pct: float  # [0, 1]
    avg_strength: float  # [0, 1]
    avg_time_to_resolution: float  # days
    pattern_confidence: float  # [0, 1]
    data_start: datetime
    data_end: datetime
    tier: str  # major_corporate_decision | routine_signal
    insufficient_data: bool


# ---------------------------------------------------------------------------
# Catalyst tier classification (Req 11.1)
# ---------------------------------------------------------------------------


def classify_catalyst_tier(catalyst_type: str) -> str:
    """Deterministic mapping of catalyst_type to tier.

    Returns ``"major_corporate_decision"`` for catalyst types in
    MAJOR_DECISION_CATALYSTS, otherwise ``"routine_signal"``.
    """
    if catalyst_type in MAJOR_DECISION_CATALYSTS:
        return "major_corporate_decision"
    return "routine_signal"


# ---------------------------------------------------------------------------
# Pattern confidence (Req 3.3, 11.2)
# ---------------------------------------------------------------------------


def compute_pattern_confidence(
    sample_count: int,
    outcome_consistency: float,
    data_recency_days: float,
    tier: str,
    config: Optional[CompetitiveConfig] = None,
) -> float:
    """Compute pattern confidence score in [0, 1].

    Formula::

        sample_factor * 0.4 + consistency * 0.4 + recency_factor * 0.2

    The base score is then scaled by ``major_decision_weight_multiplier``
    for the ``major_corporate_decision`` tier (described as a 1.3×
    multiplier in the original notes; the effective value comes from
    *config*), clamped, capped when there are too few samples, and
    decayed when the data is stale.

    Args:
        sample_count: Number of distinct historical samples.
        outcome_consistency: Share of samples agreeing on the dominant
            direction, i.e. ``max(bullish_pct, bearish_pct)``.
        data_recency_days: Age in days of the most recent sample.
        tier: ``"major_corporate_decision"`` or ``"routine_signal"``.
        config: Thresholds and weights; a default ``CompetitiveConfig``
            is created when omitted.

    Returns:
        The confidence score, guaranteed to lie in [0, 1].
    """
    cfg = config if config is not None else CompetitiveConfig()

    # --- component factors ---
    # The sample factor saturates once 20 samples are available.
    sample_factor = min(sample_count / 20.0, 1.0)
    consistency = outcome_consistency  # already max(bullish_pct, bearish_pct)

    if data_recency_days <= cfg.staleness_recent_days:
        recency_factor = 1.0
    elif data_recency_days <= cfg.staleness_window_days:
        recency_factor = 0.7
    else:
        recency_factor = 0.4

    confidence = sample_factor * 0.4 + consistency * 0.4 + recency_factor * 0.2

    # Major-decision multiplier (Req 11.2)
    if tier == "major_corporate_decision":
        confidence *= cfg.major_decision_weight_multiplier

    # Clamp to [0, 1] before the cap so the multiplier cannot overshoot.
    confidence = min(max(confidence, 0.0), 1.0)

    # Insufficient data cap (Req 3.4)
    if sample_count < cfg.min_pattern_samples:
        confidence = min(confidence, 0.25)

    # Staleness decay (Req 9.2)
    if data_recency_days > cfg.staleness_window_days:
        confidence *= cfg.staleness_decay_penalty

    # The penalty is configuration-supplied and applied after the first
    # clamp, so clamp once more to honour the documented [0, 1] range.
    return min(max(confidence, 0.0), 1.0)


# ---------------------------------------------------------------------------
# Lookback helper
# ---------------------------------------------------------------------------


def _lookback_days(tier: str, config: Optional[CompetitiveConfig] = None) -> int:
    """Return the lookback window in days for the given tier."""
    cfg = config if config is not None else CompetitiveConfig()
    if tier == "major_corporate_decision":
        return cfg.major_decision_lookback_days
    return cfg.routine_lookback_days


# ---------------------------------------------------------------------------
# SQL: self-company pattern query
# ---------------------------------------------------------------------------

# Parameters: $1 ticker, $2 catalyst_type, $3/$4 published_at bounds,
# $5 trend window label, $6 horizon length (cast to interval).
# Rows are ordered newest document first and, within one document, by the
# earliest trend window first, so the first row per dir_id is the closest
# trend window to publication.
_SELF_PATTERN_QUERY = """
WITH matched_docs AS (
    SELECT
        dir.id AS dir_id,
        d.published_at,
        dir.sentiment
    FROM document_impact_records dir
    JOIN document_intelligence di ON di.id = dir.intelligence_id
    JOIN documents d ON d.id = di.document_id
    WHERE dir.ticker = $1
      AND dir.catalyst_type = $2
      AND di.validation_status = 'valid'
      AND d.status != 'rejected'
      AND d.published_at >= $3
      AND d.published_at <= $4
)
SELECT
    md.dir_id,
    md.published_at,
    md.sentiment,
    tw.trend_direction,
    tw.trend_strength,
    tw.generated_at,
    tw."window" AS tw_window
FROM matched_docs md
JOIN trend_windows tw
    ON tw.entity_type = 'company'
   AND tw.entity_id = $1
   AND tw."window" = $5
   AND tw.generated_at >= md.published_at
   AND tw.generated_at <= md.published_at + $6::interval
ORDER BY md.published_at DESC, tw.generated_at ASC
"""

# ---------------------------------------------------------------------------
# SQL: cross-company pattern query
# ---------------------------------------------------------------------------

# Parameters: $1 source ticker, $2 catalyst_type, $3/$4 published_at bounds,
# $5 target ticker, $6 trend window label, $7 horizon length (cast to
# interval).  Ordering matches the self-company query.
_CROSS_PATTERN_QUERY = """
WITH matched_docs AS (
    SELECT
        dir.id AS dir_id,
        d.published_at,
        dir.sentiment
    FROM document_impact_records dir
    JOIN document_intelligence di ON di.id = dir.intelligence_id
    JOIN documents d ON d.id = di.document_id
    WHERE dir.ticker = $1
      AND dir.catalyst_type = $2
      AND di.validation_status = 'valid'
      AND d.status != 'rejected'
      AND d.published_at >= $3
      AND d.published_at <= $4
)
SELECT
    md.dir_id,
    md.published_at,
    md.sentiment,
    tw.trend_direction,
    tw.trend_strength,
    tw.generated_at,
    tw."window" AS tw_window
FROM matched_docs md
JOIN trend_windows tw
    ON tw.entity_type = 'company'
   AND tw.entity_id = $5
   AND tw."window" = $6
   AND tw.generated_at >= md.published_at
   AND tw.generated_at <= md.published_at + $7::interval
ORDER BY md.published_at DESC, tw.generated_at ASC
"""
# ---------------------------------------------------------------------------
# Horizon → interval mapping
# ---------------------------------------------------------------------------

# Textual PostgreSQL interval for each supported horizon label.
_HORIZON_INTERVALS: dict[str, str] = {
    "1d": "1 day",
    "7d": "7 days",
    "30d": "30 days",
}

# The same horizons as ``timedelta`` values.  asyncpg encodes ``interval``
# query parameters from ``datetime.timedelta``; binding the textual form to a
# ``$n::interval`` placeholder is rejected with a DataError, so the queries
# are always given these values.  Keep the keys in sync with
# ``_HORIZON_INTERVALS``.
_HORIZON_DELTAS: dict[str, timedelta] = {
    "1d": timedelta(days=1),
    "7d": timedelta(days=7),
    "30d": timedelta(days=30),
}


# ---------------------------------------------------------------------------
# Build HistoricalPattern from query rows
# ---------------------------------------------------------------------------


def _closest_row_per_doc(rows: list[asyncpg.Record]) -> list[asyncpg.Record]:
    """Keep one row per ``dir_id``: the one with the earliest ``generated_at``.

    Documents keep the order in which they first appear in *rows*.  The
    choice within a document does not depend on row order; a row whose
    ``generated_at`` is missing never replaces one that has a timestamp.
    """
    closest: dict[str, asyncpg.Record] = {}
    for row in rows:
        key = str(row["dir_id"])
        incumbent = closest.get(key)
        if incumbent is None:
            closest[key] = row
            continue
        candidate_ts = row["generated_at"]
        incumbent_ts = incumbent["generated_at"]
        if candidate_ts is None:
            continue
        if incumbent_ts is None or candidate_ts < incumbent_ts:
            closest[key] = row
    return list(closest.values())


def _build_pattern(
    rows: list[asyncpg.Record],
    source_ticker: str,
    target_ticker: str,
    catalyst_type: str,
    horizon: str,
    tier: str,
    config: Optional[CompetitiveConfig] = None,
) -> Optional[HistoricalPattern]:
    """Aggregate query rows into a single HistoricalPattern.

    Args:
        rows: Rows exposing ``dir_id``, ``published_at``,
            ``trend_direction``, ``trend_strength`` and ``generated_at``.
        source_ticker: Ticker the matched documents are about.
        target_ticker: Ticker whose trend windows were joined.
        catalyst_type: Catalyst type the documents were filtered on.
        horizon: Horizon label stored on the pattern (e.g. ``"7d"``).
        tier: Catalyst tier, forwarded to the confidence computation.
        config: Thresholds; a default ``CompetitiveConfig`` is created
            when omitted.

    Returns:
        The aggregated pattern, or ``None`` when *rows* is empty or no row
        carries a publication timestamp (the date range and recency cannot
        be derived in that case).
    """
    if not rows:
        return None

    cfg = config if config is not None else CompetitiveConfig()

    # One sample per document, using its closest trend window.
    unique_rows = _closest_row_per_doc(rows)

    # Date range of the underlying documents.
    published_dates = [
        r["published_at"] for r in unique_rows if r["published_at"] is not None
    ]
    if not published_dates:
        return None
    data_start = min(published_dates)
    data_end = max(published_dates)

    sample_count = len(unique_rows)
    bullish = sum(1 for r in unique_rows if r["trend_direction"] == "bullish")
    bearish = sum(1 for r in unique_rows if r["trend_direction"] == "bearish")
    bullish_pct = bullish / sample_count
    bearish_pct = bearish / sample_count

    strengths = [
        float(r["trend_strength"])
        for r in unique_rows
        if r["trend_strength"] is not None
    ]
    avg_strength = sum(strengths) / len(strengths) if strengths else 0.0

    # avg_time_to_resolution: average days between published_at and
    # generated_at, with negative gaps counted as zero.
    resolutions = [
        max((r["generated_at"] - r["published_at"]).total_seconds() / 86400.0, 0.0)
        for r in unique_rows
        if r["published_at"] is not None and r["generated_at"] is not None
    ]
    avg_time_to_resolution = (
        sum(resolutions) / len(resolutions) if resolutions else 0.0
    )

    # Recency: days since the most recent data point.
    now = datetime.now(timezone.utc)
    data_recency_days = (now - data_end).total_seconds() / 86400.0

    confidence = compute_pattern_confidence(
        sample_count,
        max(bullish_pct, bearish_pct),
        data_recency_days,
        tier,
        cfg,
    )

    return HistoricalPattern(
        source_ticker=source_ticker,
        target_ticker=target_ticker,
        catalyst_type=catalyst_type,
        time_horizon=horizon,
        sample_count=sample_count,
        bullish_pct=bullish_pct,
        bearish_pct=bearish_pct,
        avg_strength=min(max(avg_strength, 0.0), 1.0),
        avg_time_to_resolution=avg_time_to_resolution,
        pattern_confidence=confidence,
        data_start=data_start,
        data_end=data_end,
        tier=tier,
        insufficient_data=sample_count < cfg.min_pattern_samples,
    )


# ---------------------------------------------------------------------------
# Public API
# ---------------------------------------------------------------------------


async def _collect_patterns(
    pool: asyncpg.Pool,
    query: str,
    source_ticker: str,
    target_ticker: str,
    catalyst_type: str,
    horizons: Optional[list[str]],
    config: Optional[CompetitiveConfig],
    *,
    extra_args: tuple,
    kind: str,
    subject: str,
) -> list[HistoricalPattern]:
    """Run *query* once per horizon and build a pattern from each result.

    The query receives ``source_ticker, catalyst_type, cutoff, now``,
    then *extra_args*, then the horizon label and the horizon length as a
    ``timedelta``.  *kind* and *subject* only appear in the error log line.
    A horizon that is unknown or whose query fails is logged and skipped.
    """
    cfg = config if config is not None else CompetitiveConfig()
    tier = classify_catalyst_tier(catalyst_type)
    now = datetime.now(timezone.utc)
    cutoff = now - timedelta(days=_lookback_days(tier, cfg))

    patterns: list[HistoricalPattern] = []
    async with pool.acquire() as conn:
        for horizon in horizons or DEFAULT_HORIZONS:
            window = _HORIZON_DELTAS.get(horizon)
            if window is None:
                logger.warning("Unknown horizon %s, skipping", horizon)
                continue

            try:
                rows = await conn.fetch(
                    query,
                    source_ticker,
                    catalyst_type,
                    cutoff,
                    now,
                    *extra_args,
                    horizon,
                    window,
                )
            except Exception:
                # Best effort: one failing horizon must not discard the
                # patterns already mined for the others.
                logger.exception(
                    "Error querying %s for %s/%s/%s",
                    kind, subject, catalyst_type, horizon,
                )
                continue

            pattern = _build_pattern(
                rows, source_ticker, target_ticker, catalyst_type, horizon, tier, cfg,
            )
            if pattern is not None:
                patterns.append(pattern)

    return patterns


async def find_self_patterns(
    pool: asyncpg.Pool,
    ticker: str,
    catalyst_type: str,
    horizons: Optional[list[str]] = None,
    config: Optional[CompetitiveConfig] = None,
) -> list[HistoricalPattern]:
    """Find historical patterns for the same company-catalyst pair.

    Queries document_impact_records joined with trend_windows for the
    given ticker and catalyst_type across configurable time horizons.

    Args:
        pool: Connection pool used for the queries.
        ticker: Company whose documents and trend windows are examined.
        catalyst_type: Catalyst type to match.
        horizons: Horizon labels to mine; defaults to ``DEFAULT_HORIZONS``.
        config: Thresholds; a default ``CompetitiveConfig`` when omitted.

    Returns:
        One pattern per horizon that produced data, in horizon order.

    Requirements: 3.1, 3.2, 3.5, 11.3
    """
    return await _collect_patterns(
        pool,
        _SELF_PATTERN_QUERY,
        ticker,
        ticker,
        catalyst_type,
        horizons,
        config,
        extra_args=(),
        kind="self-patterns",
        subject=ticker,
    )


async def find_cross_company_patterns(
    pool: asyncpg.Pool,
    source_ticker: str,
    target_ticker: str,
    catalyst_type: str,
    horizons: Optional[list[str]] = None,
    config: Optional[CompetitiveConfig] = None,
) -> list[HistoricalPattern]:
    """Find cross-company historical patterns.

    Queries documents about *source_ticker* with the given catalyst_type,
    then looks at trend_windows for *target_ticker* within each horizon
    after the document was published.

    Args:
        pool: Connection pool used for the queries.
        source_ticker: Company the matched documents are about.
        target_ticker: Company whose trend windows are examined.
        catalyst_type: Catalyst type to match.
        horizons: Horizon labels to mine; defaults to ``DEFAULT_HORIZONS``.
        config: Thresholds; a default ``CompetitiveConfig`` when omitted.

    Returns:
        One pattern per horizon that produced data, in horizon order.

    Requirements: 4.2, 11.5
    """
    return await _collect_patterns(
        pool,
        _CROSS_PATTERN_QUERY,
        source_ticker,
        target_ticker,
        catalyst_type,
        horizons,
        config,
        extra_args=(target_ticker,),
        kind="cross-patterns",
        subject=f"{source_ticker}→{target_ticker}",
    )