# Listing metadata: 415 lines, 13 KiB, Python.
"""Historical pattern mining for competitive intelligence.
|
||
|
||
Queries document_impact_records joined with trend_windows to find how
|
||
similar catalyst types resolved historically for a company or its
|
||
competitors. Produces HistoricalPattern objects consumed by the signal
|
||
propagation engine and the aggregation worker.
|
||
|
||
Requirements: 3.1, 3.2, 3.3, 3.4, 3.5, 11.1, 11.2, 11.3, 11.5
|
||
"""
|
||
from __future__ import annotations
|
||
|
||
import logging
|
||
from dataclasses import dataclass
|
||
from datetime import datetime, timedelta, timezone
|
||
from typing import Optional
|
||
|
||
import asyncpg
|
||
|
||
from services.shared.config import CompetitiveConfig
|
||
from services.shared.schemas import MAJOR_DECISION_CATALYSTS
|
||
|
||
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Horizons evaluated when a caller does not supply its own ``horizons`` list.
DEFAULT_HORIZONS = ["1d", "7d", "30d"]
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# Data classes
|
||
# ---------------------------------------------------------------------------
|
||
|
||
@dataclass
class HistoricalPattern:
    """Aggregated outcome statistics for one catalyst type over one horizon.

    Field order is part of the interface: instances may be built
    positionally, so new fields must only ever be appended.
    """

    # --- What the pattern is about -------------------------------------
    source_ticker: str  # company the matched documents are about
    target_ticker: str  # company whose trend windows were inspected
    catalyst_type: str
    time_horizon: str  # one of "1d", "7d", "30d"

    # --- Observed outcomes ---------------------------------------------
    sample_count: int  # number of distinct impact records aggregated
    bullish_pct: float  # fraction of samples, in [0, 1]
    bearish_pct: float  # fraction of samples, in [0, 1]
    avg_strength: float  # mean trend strength, in [0, 1]
    avg_time_to_resolution: float  # mean delay in days

    # --- Reliability of the summary ------------------------------------
    pattern_confidence: float  # score in [0, 1]
    data_start: datetime  # earliest publication date among the samples
    data_end: datetime  # latest publication date among the samples
    tier: str  # "major_corporate_decision" or "routine_signal"
    insufficient_data: bool  # True when too few samples were available
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# Catalyst tier classification (Req 11.1)
|
||
# ---------------------------------------------------------------------------
|
||
|
||
def classify_catalyst_tier(catalyst_type: str) -> str:
    """Map a catalyst type onto its tier label.

    The mapping is a pure membership test against
    ``MAJOR_DECISION_CATALYSTS``: members yield
    ``"major_corporate_decision"``, everything else yields
    ``"routine_signal"``.
    """
    is_major = catalyst_type in MAJOR_DECISION_CATALYSTS
    return "major_corporate_decision" if is_major else "routine_signal"
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# Pattern confidence (Req 3.3, 11.2)
|
||
# ---------------------------------------------------------------------------
|
||
|
||
def compute_pattern_confidence(
    sample_count: int,
    outcome_consistency: float,
    data_recency_days: float,
    tier: str,
    config: Optional[CompetitiveConfig] = None,
) -> float:
    """Score how trustworthy a historical pattern is.

    The base score blends three components::

        sample_factor * 0.4 + consistency * 0.4 + recency_factor * 0.2

    It is then adjusted, in this order:

    1. multiplied by ``major_decision_weight_multiplier`` when *tier* is
       ``"major_corporate_decision"``;
    2. clamped to [0, 1];
    3. capped at 0.25 when *sample_count* is below ``min_pattern_samples``;
    4. multiplied by ``staleness_decay_penalty`` when the data is older
       than ``staleness_window_days``.

    Args:
        sample_count: Number of samples behind the pattern; 20 or more
            earns the full sample factor.
        outcome_consistency: Share of samples agreeing on the dominant
            direction (callers pass ``max(bullish_pct, bearish_pct)``).
        data_recency_days: Age in days of the most recent sample.
        tier: Catalyst tier label.
        config: Thresholds and multipliers; a default
            ``CompetitiveConfig`` is built when omitted.

    Returns:
        The adjusted confidence score.
    """
    settings = config or CompetitiveConfig()

    # Sample size saturates at 20 observations.
    sample_factor = min(sample_count / 20.0, 1.0)

    # Three-step recency ladder: recent, inside the window, beyond it.
    if data_recency_days <= settings.staleness_recent_days:
        recency_factor = 1.0
    else:
        within_window = data_recency_days <= settings.staleness_window_days
        recency_factor = 0.7 if within_window else 0.4

    score = sample_factor * 0.4 + outcome_consistency * 0.4 + recency_factor * 0.2

    # Major corporate decisions carry extra weight (Req 11.2).
    if tier == "major_corporate_decision":
        score = score * settings.major_decision_weight_multiplier

    # Keep the boosted score inside [0, 1] before applying caps.
    score = min(max(score, 0.0), 1.0)

    # Too few samples: never report more than 0.25 (Req 3.4).
    if sample_count < settings.min_pattern_samples:
        score = min(score, 0.25)

    # Data beyond the staleness window is penalised once more (Req 9.2).
    if data_recency_days > settings.staleness_window_days:
        return score * settings.staleness_decay_penalty
    return score
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# Lookback helper
|
||
# ---------------------------------------------------------------------------
|
||
|
||
def _lookback_days(tier: str, config: Optional[CompetitiveConfig] = None) -> int:
    """Pick the history window, in days, that applies to *tier*.

    ``"major_corporate_decision"`` uses ``major_decision_lookback_days``;
    any other tier uses ``routine_lookback_days``. A default
    ``CompetitiveConfig`` is built when *config* is omitted.
    """
    settings = config or CompetitiveConfig()
    return (
        settings.major_decision_lookback_days
        if tier == "major_corporate_decision"
        else settings.routine_lookback_days
    )
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# SQL: self-company pattern query
|
||
# ---------------------------------------------------------------------------
|
||
|
||
# Self-company pattern query.
#
# Parameters:
#   $1 ticker (also the trend_windows entity_id)
#   $2 catalyst_type
#   $3 earliest published_at to consider
#   $4 latest published_at to consider
#   $5 trend window label (e.g. "7d")
#   $6 horizon length as an interval
#
# Rows come back newest document first and, within a document, nearest
# trend window first. The secondary sort key matters: the Python side keeps
# the first row it sees per dir_id as the "closest" window, and without it
# the surviving window would depend on the planner's row order.
_SELF_PATTERN_QUERY = """
WITH matched_docs AS (
    SELECT
        dir.id AS dir_id,
        d.published_at,
        dir.sentiment
    FROM document_impact_records dir
    JOIN document_intelligence di ON di.id = dir.intelligence_id
    JOIN documents d ON d.id = di.document_id
    WHERE dir.ticker = $1
      AND dir.catalyst_type = $2
      AND di.validation_status = 'valid'
      AND d.status != 'rejected'
      AND d.published_at >= $3
      AND d.published_at <= $4
)
SELECT
    md.dir_id,
    md.published_at,
    md.sentiment,
    tw.trend_direction,
    tw.trend_strength,
    tw.generated_at,
    tw."window" AS tw_window
FROM matched_docs md
JOIN trend_windows tw
    ON tw.entity_type = 'company'
   AND tw.entity_id = $1
   AND tw."window" = $5
   AND tw.generated_at >= md.published_at
   AND tw.generated_at <= md.published_at + $6::interval
ORDER BY md.published_at DESC, tw.generated_at ASC
"""
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# SQL: cross-company pattern query
|
||
# ---------------------------------------------------------------------------
|
||
|
||
# Cross-company pattern query.
#
# Parameters:
#   $1 source ticker (documents are matched on this company)
#   $2 catalyst_type
#   $3 earliest published_at to consider
#   $4 latest published_at to consider
#   $5 target ticker (trend_windows entity_id)
#   $6 trend window label (e.g. "7d")
#   $7 horizon length as an interval
#
# Rows come back newest document first and, within a document, nearest
# trend window first. The secondary sort key matters: the Python side keeps
# the first row it sees per dir_id as the "closest" window, and without it
# the surviving window would depend on the planner's row order.
_CROSS_PATTERN_QUERY = """
WITH matched_docs AS (
    SELECT
        dir.id AS dir_id,
        d.published_at,
        dir.sentiment
    FROM document_impact_records dir
    JOIN document_intelligence di ON di.id = dir.intelligence_id
    JOIN documents d ON d.id = di.document_id
    WHERE dir.ticker = $1
      AND dir.catalyst_type = $2
      AND di.validation_status = 'valid'
      AND d.status != 'rejected'
      AND d.published_at >= $3
      AND d.published_at <= $4
)
SELECT
    md.dir_id,
    md.published_at,
    md.sentiment,
    tw.trend_direction,
    tw.trend_strength,
    tw.generated_at,
    tw."window" AS tw_window
FROM matched_docs md
JOIN trend_windows tw
    ON tw.entity_type = 'company'
   AND tw.entity_id = $5
   AND tw."window" = $6
   AND tw.generated_at >= md.published_at
   AND tw.generated_at <= md.published_at + $7::interval
ORDER BY md.published_at DESC, tw.generated_at ASC
"""
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# Horizon → interval mapping
|
||
# ---------------------------------------------------------------------------
|
||
|
||
# Supported horizon labels ("<N>d") mapped to the time span each one denotes.
_HORIZON_INTERVALS: dict[str, timedelta] = {
    f"{days}d": timedelta(days=days) for days in (1, 7, 30)
}
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# Build HistoricalPattern from query rows
|
||
# ---------------------------------------------------------------------------
|
||
|
||
def _build_pattern(
    rows: list[asyncpg.Record],
    source_ticker: str,
    target_ticker: str,
    catalyst_type: str,
    horizon: str,
    tier: str,
    config: Optional[CompetitiveConfig] = None,
) -> Optional[HistoricalPattern]:
    """Aggregate query rows into a single HistoricalPattern.

    Each row pairs one impact record (``dir_id``) with one trend window.
    An impact record may match several trend windows, so rows are first
    reduced to one per ``dir_id`` — the one whose ``generated_at`` is
    earliest, i.e. closest to publication. The choice is made here
    explicitly and does not depend on the order in which rows arrive.

    Args:
        rows: Mapping-like records exposing ``dir_id``, ``published_at``,
            ``trend_direction``, ``trend_strength`` and ``generated_at``.
        source_ticker: Ticker the matched documents are about.
        target_ticker: Ticker whose trend windows were joined.
        catalyst_type: Catalyst type the rows were filtered on.
        horizon: Horizon label the rows were queried for.
        tier: Catalyst tier label, forwarded to the confidence score.
        config: Thresholds; a default ``CompetitiveConfig`` is built
            when omitted.

    Returns:
        The aggregated pattern, or ``None`` when *rows* is empty or no row
        carries a ``published_at`` (the date range would be undefined).
    """
    if not rows:
        return None

    # Resolve the configuration once; it is needed both for the confidence
    # score and for the insufficient-data flag.
    cfg = config or CompetitiveConfig()

    # One sample per impact record: keep the trend window generated soonest
    # after publication. Re-assigning an existing key keeps its original
    # position, so samples stay in first-seen order.
    closest: dict[str, asyncpg.Record] = {}
    for row in rows:
        key = str(row["dir_id"])
        kept = closest.get(key)
        if kept is None:
            closest[key] = row
            continue
        candidate_gen = row["generated_at"]
        kept_gen = kept["generated_at"]
        if candidate_gen is not None and (kept_gen is None or candidate_gen < kept_gen):
            closest[key] = row
    unique_rows = list(closest.values())

    # Without any publication date there is no data range to report.
    published_dates = [
        r["published_at"] for r in unique_rows if r["published_at"] is not None
    ]
    if not published_dates:
        return None
    data_start = min(published_dates)
    data_end = max(published_dates)

    sample_count = len(unique_rows)

    bullish = sum(1 for r in unique_rows if r["trend_direction"] == "bullish")
    bearish = sum(1 for r in unique_rows if r["trend_direction"] == "bearish")
    bullish_pct = bullish / sample_count
    bearish_pct = bearish / sample_count

    strengths = [
        float(r["trend_strength"])
        for r in unique_rows
        if r["trend_strength"] is not None
    ]
    avg_strength = sum(strengths) / len(strengths) if strengths else 0.0

    # Days between publication and the kept trend window; negative gaps are
    # floored at zero.
    resolutions = [
        max((r["generated_at"] - r["published_at"]).total_seconds() / 86400.0, 0.0)
        for r in unique_rows
        if r["published_at"] is not None and r["generated_at"] is not None
    ]
    avg_time_to_resolution = (
        sum(resolutions) / len(resolutions) if resolutions else 0.0
    )

    # Recency: days since the most recent data point.
    now = datetime.now(timezone.utc)
    data_recency_days = (now - data_end).total_seconds() / 86400.0

    outcome_consistency = max(bullish_pct, bearish_pct)
    confidence = compute_pattern_confidence(
        sample_count, outcome_consistency, data_recency_days, tier, cfg,
    )

    return HistoricalPattern(
        source_ticker=source_ticker,
        target_ticker=target_ticker,
        catalyst_type=catalyst_type,
        time_horizon=horizon,
        sample_count=sample_count,
        bullish_pct=bullish_pct,
        bearish_pct=bearish_pct,
        avg_strength=min(max(avg_strength, 0.0), 1.0),
        avg_time_to_resolution=avg_time_to_resolution,
        pattern_confidence=confidence,
        data_start=data_start,
        data_end=data_end,
        tier=tier,
        insufficient_data=sample_count < cfg.min_pattern_samples,
    )
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# Public API
|
||
# ---------------------------------------------------------------------------
|
||
|
||
async def find_self_patterns(
    pool: asyncpg.Pool,
    ticker: str,
    catalyst_type: str,
    horizons: Optional[list[str]] = None,
    config: Optional[CompetitiveConfig] = None,
) -> list[HistoricalPattern]:
    """Collect historical patterns for a company's own catalyst history.

    For each requested horizon, documents about *ticker* carrying
    *catalyst_type* inside the tier's lookback window are joined with that
    same company's trend windows and aggregated into one pattern.

    Horizons that are not in ``_HORIZON_INTERVALS`` are logged and skipped.
    A query that raises is logged and only that horizon is skipped.
    Horizons whose query returns no rows contribute nothing.

    Args:
        pool: Connection pool; one connection is held for all horizons.
        ticker: Company to mine.
        catalyst_type: Catalyst type to match.
        horizons: Horizon labels to evaluate; ``DEFAULT_HORIZONS`` when
            omitted or empty.
        config: Thresholds; a default ``CompetitiveConfig`` is built when
            omitted.

    Returns:
        One pattern per horizon that produced data, in horizon order.

    Requirements: 3.1, 3.2, 3.5, 11.3
    """
    settings = config or CompetitiveConfig()
    requested = horizons or DEFAULT_HORIZONS
    tier = classify_catalyst_tier(catalyst_type)
    lookback = _lookback_days(tier, settings)

    window_end = datetime.now(timezone.utc)
    window_start = window_end - timedelta(days=lookback)

    results: list[HistoricalPattern] = []
    async with pool.acquire() as conn:
        for horizon in requested:
            if horizon not in _HORIZON_INTERVALS:
                logger.warning("Unknown horizon %s, skipping", horizon)
                continue

            # Positional order mirrors $1..$6 in _SELF_PATTERN_QUERY.
            query_args = (
                ticker,
                catalyst_type,
                window_start,
                window_end,
                horizon,
                _HORIZON_INTERVALS[horizon],
            )
            try:
                rows = await conn.fetch(_SELF_PATTERN_QUERY, *query_args)
            except Exception:
                logger.exception(
                    "Error querying self-patterns for %s/%s/%s",
                    ticker, catalyst_type, horizon,
                )
                continue

            # Source and target are the same company for self-patterns.
            pattern = _build_pattern(
                rows, ticker, ticker, catalyst_type, horizon, tier, settings,
            )
            if pattern is None:
                continue
            results.append(pattern)

    return results
|
||
|
||
|
||
async def find_cross_company_patterns(
    pool: asyncpg.Pool,
    source_ticker: str,
    target_ticker: str,
    catalyst_type: str,
    horizons: Optional[list[str]] = None,
    config: Optional[CompetitiveConfig] = None,
) -> list[HistoricalPattern]:
    """Collect patterns linking one company's catalysts to another's trends.

    For each requested horizon, documents about *source_ticker* carrying
    *catalyst_type* inside the tier's lookback window are joined with the
    trend windows of *target_ticker* generated within that horizon after
    publication, then aggregated into one pattern.

    Horizons that are not in ``_HORIZON_INTERVALS`` are logged and skipped.
    A query that raises is logged and only that horizon is skipped.
    Horizons whose query returns no rows contribute nothing.

    Args:
        pool: Connection pool; one connection is held for all horizons.
        source_ticker: Company the documents are about.
        target_ticker: Company whose trend windows are inspected.
        catalyst_type: Catalyst type to match.
        horizons: Horizon labels to evaluate; ``DEFAULT_HORIZONS`` when
            omitted or empty.
        config: Thresholds; a default ``CompetitiveConfig`` is built when
            omitted.

    Returns:
        One pattern per horizon that produced data, in horizon order.

    Requirements: 4.2, 11.5
    """
    settings = config or CompetitiveConfig()
    requested = horizons or DEFAULT_HORIZONS
    tier = classify_catalyst_tier(catalyst_type)
    lookback = _lookback_days(tier, settings)

    window_end = datetime.now(timezone.utc)
    window_start = window_end - timedelta(days=lookback)

    results: list[HistoricalPattern] = []
    async with pool.acquire() as conn:
        for horizon in requested:
            if horizon not in _HORIZON_INTERVALS:
                logger.warning("Unknown horizon %s, skipping", horizon)
                continue

            # Positional order mirrors $1..$7 in _CROSS_PATTERN_QUERY.
            query_args = (
                source_ticker,
                catalyst_type,
                window_start,
                window_end,
                target_ticker,
                horizon,
                _HORIZON_INTERVALS[horizon],
            )
            try:
                rows = await conn.fetch(_CROSS_PATTERN_QUERY, *query_args)
            except Exception:
                logger.exception(
                    "Error querying cross-patterns for %s→%s/%s/%s",
                    source_ticker, target_ticker, catalyst_type, horizon,
                )
                continue

            pattern = _build_pattern(
                rows, source_ticker, target_ticker, catalyst_type,
                horizon, tier, settings,
            )
            if pattern is None:
                continue
            results.append(pattern)

    return results
|