feat: competitive intelligence & historical pattern matching layer

2026-04-14 19:42:48 +00:00
parent b478022ba3
commit f7a11d14ea
203 changed files with 20155 additions and 97 deletions
@@ -0,0 +1,414 @@
+"""Historical pattern mining for competitive intelligence.
+
+Queries document_impact_records joined with trend_windows to find how
+similar catalyst types resolved historically for a company or its
+competitors.  Produces HistoricalPattern objects consumed by the signal
+propagation engine and the aggregation worker.
+
+Requirements: 3.1, 3.2, 3.3, 3.4, 3.5, 11.1, 11.2, 11.3, 11.5
+"""
+from __future__ import annotations
+
+import logging
+from dataclasses import dataclass
+from datetime import datetime, timedelta, timezone
+from typing import Optional
+
+import asyncpg
+
+from services.shared.config import CompetitiveConfig
+from services.shared.schemas import MAJOR_DECISION_CATALYSTS
+
+logger = logging.getLogger(__name__)
+
+# Horizons queried when a caller passes no (or an empty) ``horizons`` list.
+# Each entry is looked up in _HORIZON_INTERVALS; unknown entries are skipped
+# with a warning by the find_* functions.
+DEFAULT_HORIZONS = ["1d", "7d", "30d"]
+
+
+# ---------------------------------------------------------------------------
+# Data classes
+# ---------------------------------------------------------------------------
+
+@dataclass
+class HistoricalPattern:
+    """Aggregate outcome statistics for one catalyst type over one horizon.
+
+    Attributes:
+        source_ticker: Ticker whose documents supplied the catalyst events.
+        target_ticker: Ticker whose trend windows were observed (equal to
+            ``source_ticker`` for self patterns).
+        catalyst_type: Catalyst type the samples were filtered on.
+        time_horizon: Horizon label: ``1d``, ``7d`` or ``30d``.
+        sample_count: Number of distinct impact records aggregated.
+        bullish_pct: Fraction of samples with a bullish trend, in [0, 1].
+        bearish_pct: Fraction of samples with a bearish trend, in [0, 1].
+        avg_strength: Mean trend strength, in [0, 1].
+        avg_time_to_resolution: Mean publication-to-trend delay, in days.
+        pattern_confidence: Confidence score, in [0, 1].
+        data_start: Earliest ``published_at`` among the samples.
+        data_end: Latest ``published_at`` among the samples.
+        tier: ``major_corporate_decision`` or ``routine_signal``.
+        insufficient_data: True when ``sample_count`` is below the
+            configured minimum number of samples.
+    """
+
+    # Identity of the pattern
+    source_ticker: str
+    target_ticker: str
+    catalyst_type: str
+    time_horizon: str
+
+    # Aggregated outcome statistics
+    sample_count: int
+    bullish_pct: float
+    bearish_pct: float
+    avg_strength: float
+    avg_time_to_resolution: float
+    pattern_confidence: float
+
+    # Provenance and quality flags
+    data_start: datetime
+    data_end: datetime
+    tier: str
+    insufficient_data: bool
+
+
+# ---------------------------------------------------------------------------
+# Catalyst tier classification  (Req 11.1)
+# ---------------------------------------------------------------------------
+
+def classify_catalyst_tier(catalyst_type: str) -> str:
+    """Map a catalyst type onto its tier label.
+
+    The mapping is a pure membership test: catalyst types listed in
+    MAJOR_DECISION_CATALYSTS yield ``"major_corporate_decision"``; every
+    other value yields ``"routine_signal"``.
+    """
+    is_major = catalyst_type in MAJOR_DECISION_CATALYSTS
+    return "major_corporate_decision" if is_major else "routine_signal"
+
+
+# ---------------------------------------------------------------------------
+# Pattern confidence  (Req 3.3, 11.2)
+# ---------------------------------------------------------------------------
+
+def compute_pattern_confidence(
+    sample_count: int,
+    outcome_consistency: float,
+    data_recency_days: float,
+    tier: str,
+    config: Optional[CompetitiveConfig] = None,
+) -> float:
+    """Score how much a historical pattern can be trusted.
+
+    The base score is a weighted sum::
+
+        sample_factor * 0.4 + consistency * 0.4 + recency_factor * 0.2
+
+    where ``sample_factor`` saturates at 20 samples and ``recency_factor``
+    steps down 1.0 -> 0.7 -> 0.4 as the newest data point ages past the
+    configured "recent" and "window" thresholds.  The score is then, in
+    order: multiplied for the ``major_corporate_decision`` tier, clamped
+    to [0, 1], capped at 0.25 when there are too few samples, and scaled
+    by the staleness penalty when the data is older than the window.
+
+    Args:
+        sample_count: Number of samples behind the pattern.
+        outcome_consistency: Share of the dominant outcome, i.e.
+            ``max(bullish_pct, bearish_pct)``.
+        data_recency_days: Age in days of the most recent sample.
+        tier: Catalyst tier label.
+        config: Thresholds and multipliers; a default CompetitiveConfig
+            is built when omitted.
+
+    Returns:
+        The confidence score.
+    """
+    settings = config or CompetitiveConfig()
+
+    # Weighted components.
+    volume_part = min(sample_count / 20.0, 1.0)
+    recency_part = (
+        1.0
+        if data_recency_days <= settings.staleness_recent_days
+        else 0.7
+        if data_recency_days <= settings.staleness_window_days
+        else 0.4
+    )
+    score = volume_part * 0.4 + outcome_consistency * 0.4 + recency_part * 0.2
+
+    # Req 11.2: major corporate decisions weigh more.
+    if tier == "major_corporate_decision":
+        score = score * settings.major_decision_weight_multiplier
+
+    # Keep the boosted score inside [0, 1] before caps and penalties.
+    score = min(max(score, 0.0), 1.0)
+
+    # Req 3.4: thin evidence can never exceed 0.25.
+    too_few_samples = sample_count < settings.min_pattern_samples
+    if too_few_samples:
+        score = min(score, 0.25)
+
+    # Req 9.2: data older than the staleness window is penalised again.
+    beyond_window = data_recency_days > settings.staleness_window_days
+    if beyond_window:
+        score = score * settings.staleness_decay_penalty
+
+    return score
+
+
+# ---------------------------------------------------------------------------
+# Lookback helper
+# ---------------------------------------------------------------------------
+
+def _lookback_days(tier: str, config: Optional[CompetitiveConfig] = None) -> int:
+    """Pick the history window, in days, that applies to ``tier``.
+
+    The ``major_corporate_decision`` tier uses the major-decision lookback
+    from the config; every other tier uses the routine lookback.  A default
+    CompetitiveConfig is built when ``config`` is omitted.
+    """
+    settings = config or CompetitiveConfig()
+    return (
+        settings.major_decision_lookback_days
+        if tier == "major_corporate_decision"
+        else settings.routine_lookback_days
+    )
+
+
+# ---------------------------------------------------------------------------
+# SQL: self-company pattern query
+# ---------------------------------------------------------------------------
+
+# Bind parameters, in the order find_self_patterns supplies them:
+#   $1  ticker; filters document_impact_records.ticker AND
+#       trend_windows.entity_id (same company on both sides)
+#   $2  catalyst_type
+#   $3  lower bound on documents.published_at (lookback cutoff)
+#   $4  upper bound on documents.published_at
+#   $5  trend_windows."window" label (the horizon string)
+#   $6  horizon length; the ::interval cast makes it an interval-typed
+#       parameter that bounds generated_at after publication
+#
+# Only valid intelligence on non-rejected documents is considered.  The
+# result holds one row per (impact record, trend window) pair, so a single
+# dir_id can appear several times.  Rows are ordered by published_at only;
+# the relative order of trend windows sharing a dir_id is unspecified.
+_SELF_PATTERN_QUERY = """
+WITH matched_docs AS (
+    SELECT
+        dir.id AS dir_id,
+        d.published_at,
+        dir.sentiment
+    FROM document_impact_records dir
+    JOIN document_intelligence di ON di.id = dir.intelligence_id
+    JOIN documents d ON d.id = di.document_id
+    WHERE dir.ticker = $1
+      AND dir.catalyst_type = $2
+      AND di.validation_status = 'valid'
+      AND d.status != 'rejected'
+      AND d.published_at >= $3
+      AND d.published_at <= $4
+)
+SELECT
+    md.dir_id,
+    md.published_at,
+    md.sentiment,
+    tw.trend_direction,
+    tw.trend_strength,
+    tw.generated_at,
+    tw."window" AS tw_window
+FROM matched_docs md
+JOIN trend_windows tw
+    ON tw.entity_type = 'company'
+   AND tw.entity_id = $1
+   AND tw."window" = $5
+   AND tw.generated_at >= md.published_at
+   AND tw.generated_at <= md.published_at + $6::interval
+ORDER BY md.published_at DESC
+"""
+
+
+# ---------------------------------------------------------------------------
+# SQL: cross-company pattern query
+# ---------------------------------------------------------------------------
+
+# Bind parameters, in the order find_cross_company_patterns supplies them:
+#   $1  source ticker; filters document_impact_records.ticker
+#   $2  catalyst_type
+#   $3  lower bound on documents.published_at (lookback cutoff)
+#   $4  upper bound on documents.published_at
+#   $5  target ticker; matched against trend_windows.entity_id
+#   $6  trend_windows."window" label (the horizon string)
+#   $7  horizon length; the ::interval cast makes it an interval-typed
+#       parameter that bounds generated_at after publication
+#
+# Same shape as the self-pattern query except that the trend windows belong
+# to a different company ($5) than the documents ($1).  The result holds one
+# row per (impact record, trend window) pair, ordered by published_at only;
+# the relative order of trend windows sharing a dir_id is unspecified.
+_CROSS_PATTERN_QUERY = """
+WITH matched_docs AS (
+    SELECT
+        dir.id AS dir_id,
+        d.published_at,
+        dir.sentiment
+    FROM document_impact_records dir
+    JOIN document_intelligence di ON di.id = dir.intelligence_id
+    JOIN documents d ON d.id = di.document_id
+    WHERE dir.ticker = $1
+      AND dir.catalyst_type = $2
+      AND di.validation_status = 'valid'
+      AND d.status != 'rejected'
+      AND d.published_at >= $3
+      AND d.published_at <= $4
+)
+SELECT
+    md.dir_id,
+    md.published_at,
+    md.sentiment,
+    tw.trend_direction,
+    tw.trend_strength,
+    tw.generated_at,
+    tw."window" AS tw_window
+FROM matched_docs md
+JOIN trend_windows tw
+    ON tw.entity_type = 'company'
+   AND tw.entity_id = $5
+   AND tw."window" = $6
+   AND tw.generated_at >= md.published_at
+   AND tw.generated_at <= md.published_at + $7::interval
+ORDER BY md.published_at DESC
+"""
+
+
+# ---------------------------------------------------------------------------
+# Horizon → interval mapping
+# ---------------------------------------------------------------------------
+
+# Horizon label -> length of the observation window after publication.
+#
+# The values are bound to the ``$N::interval`` placeholder of the pattern
+# queries.  asyncpg encodes interval-typed parameters from
+# ``datetime.timedelta`` objects; a string such as "1 day" is rejected with
+# a DataError before the query runs, so the mapping stores timedeltas that
+# can be passed to ``conn.fetch`` unchanged.
+_HORIZON_INTERVALS: dict[str, timedelta] = {
+    "1d": timedelta(days=1),
+    "7d": timedelta(days=7),
+    "30d": timedelta(days=30),
+}
+
+
+# ---------------------------------------------------------------------------
+# Build HistoricalPattern from query rows
+# ---------------------------------------------------------------------------
+
+def _build_pattern(
+    rows: list[asyncpg.Record],
+    source_ticker: str,
+    target_ticker: str,
+    catalyst_type: str,
+    horizon: str,
+    tier: str,
+    config: Optional[CompetitiveConfig] = None,
+) -> Optional[HistoricalPattern]:
+    """Aggregate query rows into a single HistoricalPattern.
+
+    Args:
+        rows: Rows returned by one of the pattern queries.  Each row must
+            expose ``dir_id``, ``published_at``, ``trend_direction``,
+            ``trend_strength`` and ``generated_at``.
+        source_ticker: Ticker the catalyst documents are about.
+        target_ticker: Ticker whose trend windows were joined.
+        catalyst_type: Catalyst type the rows were filtered on.
+        horizon: Horizon label stored on the resulting pattern.
+        tier: Catalyst tier, forwarded to the confidence computation.
+        config: Thresholds; a default CompetitiveConfig is built when
+            omitted.
+
+    Returns:
+        The aggregated pattern, or ``None`` when ``rows`` is empty or no
+        row carries a ``published_at`` (no date range can be derived).
+    """
+    if not rows:
+        return None
+
+    cfg = config or CompetitiveConfig()
+
+    # One sample per impact record.  The queries order rows by published_at
+    # only, so the trend window that happens to come first for a dir_id is
+    # arbitrary; pick the earliest generated_at explicitly so the "closest
+    # window" choice is deterministic.
+    closest: dict[str, asyncpg.Record] = {}
+    for row in rows:
+        key = str(row["dir_id"])
+        kept = closest.get(key)
+        if kept is None:
+            closest[key] = row
+            continue
+        candidate_gen = row["generated_at"]
+        kept_gen = kept["generated_at"]
+        if candidate_gen is not None and (
+            kept_gen is None or candidate_gen < kept_gen
+        ):
+            closest[key] = row
+    unique_rows = list(closest.values())
+
+    # Without a single publication date there is no data range and no
+    # recency to score; min()/max() below would raise on an empty list.
+    published_dates = [
+        r["published_at"] for r in unique_rows if r["published_at"] is not None
+    ]
+    if not published_dates:
+        return None
+    data_start = min(published_dates)
+    data_end = max(published_dates)
+
+    sample_count = len(unique_rows)
+
+    # Rows with any other trend_direction count toward neither share, so
+    # the two percentages need not sum to 1.
+    bullish = sum(1 for r in unique_rows if r["trend_direction"] == "bullish")
+    bearish = sum(1 for r in unique_rows if r["trend_direction"] == "bearish")
+    bullish_pct = bullish / sample_count
+    bearish_pct = bearish / sample_count
+
+    strengths = [
+        float(r["trend_strength"])
+        for r in unique_rows
+        if r["trend_strength"] is not None
+    ]
+    avg_strength = sum(strengths) / len(strengths) if strengths else 0.0
+
+    # avg_time_to_resolution: mean days between published_at and
+    # generated_at; negative gaps are floored at zero.
+    resolutions = [
+        max((r["generated_at"] - r["published_at"]).total_seconds() / 86400.0, 0.0)
+        for r in unique_rows
+        if r["published_at"] is not None and r["generated_at"] is not None
+    ]
+    avg_time_to_resolution = (
+        sum(resolutions) / len(resolutions) if resolutions else 0.0
+    )
+
+    # Recency: days since the most recent data point.
+    now = datetime.now(timezone.utc)
+    data_recency_days = (now - data_end).total_seconds() / 86400.0
+
+    outcome_consistency = max(bullish_pct, bearish_pct)
+    confidence = compute_pattern_confidence(
+        sample_count, outcome_consistency, data_recency_days, tier, cfg,
+    )
+
+    return HistoricalPattern(
+        source_ticker=source_ticker,
+        target_ticker=target_ticker,
+        catalyst_type=catalyst_type,
+        time_horizon=horizon,
+        sample_count=sample_count,
+        bullish_pct=bullish_pct,
+        bearish_pct=bearish_pct,
+        avg_strength=min(max(avg_strength, 0.0), 1.0),
+        avg_time_to_resolution=avg_time_to_resolution,
+        pattern_confidence=confidence,
+        data_start=data_start,
+        data_end=data_end,
+        tier=tier,
+        insufficient_data=sample_count < cfg.min_pattern_samples,
+    )
+
+
+# ---------------------------------------------------------------------------
+# Public API
+# ---------------------------------------------------------------------------
+
+async def find_self_patterns(
+    pool: asyncpg.Pool,
+    ticker: str,
+    catalyst_type: str,
+    horizons: Optional[list[str]] = None,
+    config: Optional[CompetitiveConfig] = None,
+) -> list[HistoricalPattern]:
+    """Mine how ``catalyst_type`` resolved historically for ``ticker`` itself.
+
+    For every requested horizon the self-pattern query is run over the
+    tier-specific lookback window ending now, and its rows are aggregated
+    into one HistoricalPattern.  Unknown horizons are skipped with a
+    warning; a failing query is logged and skipped so the remaining
+    horizons are still processed.
+
+    Args:
+        pool: asyncpg pool a single connection is acquired from.
+        ticker: Company used both for the documents and the trend windows.
+        catalyst_type: Catalyst type to match.
+        horizons: Horizon labels to evaluate; DEFAULT_HORIZONS when omitted
+            or empty.
+        config: Thresholds; a default CompetitiveConfig is built when
+            omitted.
+
+    Returns:
+        One pattern per horizon that produced rows, in horizon order.
+
+    Requirements: 3.1, 3.2, 3.5, 11.3
+    """
+    settings = config or CompetitiveConfig()
+    requested = horizons or DEFAULT_HORIZONS
+    tier = classify_catalyst_tier(catalyst_type)
+    lookback = _lookback_days(tier, settings)
+
+    window_end = datetime.now(timezone.utc)
+    window_start = window_end - timedelta(days=lookback)
+
+    found: list[HistoricalPattern] = []
+    async with pool.acquire() as conn:
+        for horizon in requested:
+            span = _HORIZON_INTERVALS.get(horizon)
+            if span is None:
+                logger.warning("Unknown horizon %s, skipping", horizon)
+                continue
+
+            # Positional arguments map onto $1..$6 of the query.
+            params = (
+                ticker, catalyst_type, window_start, window_end, horizon, span,
+            )
+            try:
+                rows = await conn.fetch(_SELF_PATTERN_QUERY, *params)
+            except Exception:
+                logger.exception(
+                    "Error querying self-patterns for %s/%s/%s",
+                    ticker, catalyst_type, horizon,
+                )
+                continue
+
+            # Source and target are the same company for self patterns.
+            result = _build_pattern(
+                rows, ticker, ticker, catalyst_type, horizon, tier, settings,
+            )
+            if result is not None:
+                found.append(result)
+
+    return found
+
+
+async def find_cross_company_patterns(
+    pool: asyncpg.Pool,
+    source_ticker: str,
+    target_ticker: str,
+    catalyst_type: str,
+    horizons: Optional[list[str]] = None,
+    config: Optional[CompetitiveConfig] = None,
+) -> list[HistoricalPattern]:
+    """Mine how a catalyst at one company played out for another company.
+
+    Documents about *source_ticker* with the given catalyst_type are
+    matched against trend_windows of *target_ticker* generated within each
+    horizon after publication.  Unknown horizons are skipped with a
+    warning; a failing query is logged and skipped so the remaining
+    horizons are still processed.
+
+    Args:
+        pool: asyncpg pool a single connection is acquired from.
+        source_ticker: Company the catalyst documents are about.
+        target_ticker: Company whose trend windows are inspected.
+        catalyst_type: Catalyst type to match.
+        horizons: Horizon labels to evaluate; DEFAULT_HORIZONS when omitted
+            or empty.
+        config: Thresholds; a default CompetitiveConfig is built when
+            omitted.
+
+    Returns:
+        One pattern per horizon that produced rows, in horizon order.
+
+    Requirements: 4.2, 11.5
+    """
+    settings = config or CompetitiveConfig()
+    requested = horizons or DEFAULT_HORIZONS
+    tier = classify_catalyst_tier(catalyst_type)
+    lookback = _lookback_days(tier, settings)
+
+    window_end = datetime.now(timezone.utc)
+    window_start = window_end - timedelta(days=lookback)
+
+    found: list[HistoricalPattern] = []
+    async with pool.acquire() as conn:
+        for horizon in requested:
+            span = _HORIZON_INTERVALS.get(horizon)
+            if span is None:
+                logger.warning("Unknown horizon %s, skipping", horizon)
+                continue
+
+            # Positional arguments map onto $1..$7 of the query.
+            params = (
+                source_ticker, catalyst_type, window_start, window_end,
+                target_ticker, horizon, span,
+            )
+            try:
+                rows = await conn.fetch(_CROSS_PATTERN_QUERY, *params)
+            except Exception:
+                logger.exception(
+                    "Error querying cross-patterns for %s→%s/%s/%s",
+                    source_ticker, target_ticker, catalyst_type, horizon,
+                )
+                continue
+
+            result = _build_pattern(
+                rows, source_ticker, target_ticker, catalyst_type,
+                horizon, tier, settings,
+            )
+            if result is not None:
+                found.append(result)
+
+    return found