feat: competitive intelligence & historical pattern matching layer

This commit is contained in:
Celes Renata
2026-04-14 19:42:48 +00:00
parent b478022ba3
commit f7a11d14ea
203 changed files with 20155 additions and 97 deletions
+414
View File
@@ -0,0 +1,414 @@
"""Historical pattern mining for competitive intelligence.
Queries document_impact_records joined with trend_windows to find how
similar catalyst types resolved historically for a company or its
competitors. Produces HistoricalPattern objects consumed by the signal
propagation engine and the aggregation worker.
Requirements: 3.1, 3.2, 3.3, 3.4, 3.5, 11.1, 11.2, 11.3, 11.5
"""
from __future__ import annotations
import logging
from dataclasses import dataclass
from datetime import datetime, timedelta, timezone
from typing import Optional
import asyncpg
from services.shared.config import CompetitiveConfig
from services.shared.schemas import MAJOR_DECISION_CATALYSTS
# Module-level logger; used by the public query functions below to report
# unknown horizons and failed queries.
logger = logging.getLogger(__name__)
# Horizon labels evaluated when a caller passes no (or an empty) ``horizons``
# list to find_self_patterns / find_cross_company_patterns.
DEFAULT_HORIZONS = ["1d", "7d", "30d"]
# ---------------------------------------------------------------------------
# Data classes
# ---------------------------------------------------------------------------
@dataclass
class HistoricalPattern:
    """Aggregated outcome statistics for one catalyst type over one horizon.

    Each instance summarises the trend windows that followed documents about
    ``source_ticker`` carrying ``catalyst_type``, measured on
    ``target_ticker`` (the same ticker for self-patterns).
    """

    # --- what the pattern is about ---
    source_ticker: str
    target_ticker: str
    catalyst_type: str
    time_horizon: str  # one of "1d", "7d", "30d"

    # --- outcome statistics ---
    sample_count: int  # number of distinct impact records aggregated
    bullish_pct: float  # share of samples with a bullish trend, in [0, 1]
    bearish_pct: float  # share of samples with a bearish trend, in [0, 1]
    avg_strength: float  # mean trend strength, in [0, 1]
    avg_time_to_resolution: float  # mean days from publication to trend window
    pattern_confidence: float  # confidence score, in [0, 1]

    # --- provenance ---
    data_start: datetime  # earliest publication timestamp among the samples
    data_end: datetime  # latest publication timestamp among the samples
    tier: str  # "major_corporate_decision" or "routine_signal"
    insufficient_data: bool  # True when sample_count is below the configured minimum
# ---------------------------------------------------------------------------
# Catalyst tier classification (Req 11.1)
# ---------------------------------------------------------------------------
def classify_catalyst_tier(catalyst_type: str) -> str:
    """Map a catalyst type onto its tier label.

    The mapping is a pure membership test, so the same input always yields
    the same tier.

    Args:
        catalyst_type: Catalyst identifier to classify.

    Returns:
        ``"major_corporate_decision"`` when *catalyst_type* is a member of
        ``MAJOR_DECISION_CATALYSTS``; ``"routine_signal"`` for anything else.
    """
    is_major = catalyst_type in MAJOR_DECISION_CATALYSTS
    return "major_corporate_decision" if is_major else "routine_signal"
# ---------------------------------------------------------------------------
# Pattern confidence (Req 3.3, 11.2)
# ---------------------------------------------------------------------------
def compute_pattern_confidence(
    sample_count: int,
    outcome_consistency: float,
    data_recency_days: float,
    tier: str,
    config: Optional[CompetitiveConfig] = None,
) -> float:
    """Compute pattern confidence score in [0, 1].

    Formula:
        sample_factor * 0.4 + consistency * 0.4 + recency_factor * 0.2

    The base score is then adjusted, in this order: multiplied by
    ``major_decision_weight_multiplier`` for the ``major_corporate_decision``
    tier, clamped to [0, 1], capped at 0.25 when there are fewer than
    ``min_pattern_samples`` samples, and multiplied by
    ``staleness_decay_penalty`` when the data is older than
    ``staleness_window_days``.

    Args:
        sample_count: Number of historical samples behind the pattern.
        outcome_consistency: Share of samples agreeing on the dominant
            direction (the caller passes ``max(bullish_pct, bearish_pct)``).
        data_recency_days: Age in days of the most recent sample.
        tier: Catalyst tier label.
        config: Thresholds and multipliers; a default ``CompetitiveConfig``
            is created when omitted.

    Returns:
        The confidence score, always within [0, 1].
    """
    cfg = config or CompetitiveConfig()

    # --- component factors ---
    # Sample factor saturates at 20 samples.
    sample_factor = min(sample_count / 20.0, 1.0)

    if data_recency_days <= cfg.staleness_recent_days:
        recency_factor = 1.0
    elif data_recency_days <= cfg.staleness_window_days:
        recency_factor = 0.7
    else:
        recency_factor = 0.4

    confidence = (
        sample_factor * 0.4
        + outcome_consistency * 0.4
        + recency_factor * 0.2
    )

    # Major-decision multiplier (Req 11.2)
    if tier == "major_corporate_decision":
        confidence *= cfg.major_decision_weight_multiplier

    # Clamp before the cap and decay so the multiplier cannot push a score
    # past 1.0 and thereby soften the penalties applied below.
    confidence = min(max(confidence, 0.0), 1.0)

    # Insufficient data cap (Req 3.4)
    if sample_count < cfg.min_pattern_samples:
        confidence = min(confidence, 0.25)

    # Staleness decay (Req 9.2). Stale data is penalised twice by design:
    # once through the lower recency factor and once through this multiplier.
    if data_recency_days > cfg.staleness_window_days:
        confidence *= cfg.staleness_decay_penalty

    # The decay multiplier comes from configuration; clamp once more so an
    # out-of-range value (> 1 or negative) cannot break the [0, 1] contract.
    return min(max(confidence, 0.0), 1.0)
# ---------------------------------------------------------------------------
# Lookback helper
# ---------------------------------------------------------------------------
def _lookback_days(tier: str, config: Optional[CompetitiveConfig] = None) -> int:
    """Pick the history window, in days, that applies to *tier*.

    Args:
        tier: Catalyst tier label.
        config: Source of the two lookback settings; a default
            ``CompetitiveConfig`` is created when omitted.

    Returns:
        ``major_decision_lookback_days`` for the
        ``"major_corporate_decision"`` tier, ``routine_lookback_days`` for
        every other tier.
    """
    settings = config or CompetitiveConfig()
    is_major = tier == "major_corporate_decision"
    return (
        settings.major_decision_lookback_days
        if is_major
        else settings.routine_lookback_days
    )
# ---------------------------------------------------------------------------
# SQL: self-company pattern query
# ---------------------------------------------------------------------------
# Selects valid, non-rejected documents about one ticker with one catalyst
# type, and pairs each with the trend windows generated for that same ticker
# between publication and publication + horizon.
#
# Parameters:
#   $1  ticker (matched against dir.ticker and tw.entity_id)
#   $2  catalyst type
#   $3  earliest published_at (inclusive)
#   $4  latest published_at (inclusive)
#   $5  trend window label
#   $6  horizon length, cast to interval; asyncpg binds interval parameters
#       from datetime.timedelta values
#
# A document can match several trend windows. Rows are ordered so that, for
# each document, the window generated soonest after publication comes first;
# consumers that keep the first row per dir_id therefore get the closest one.
_SELF_PATTERN_QUERY = """
WITH matched_docs AS (
SELECT
dir.id AS dir_id,
d.published_at,
dir.sentiment
FROM document_impact_records dir
JOIN document_intelligence di ON di.id = dir.intelligence_id
JOIN documents d ON d.id = di.document_id
WHERE dir.ticker = $1
AND dir.catalyst_type = $2
AND di.validation_status = 'valid'
AND d.status != 'rejected'
AND d.published_at >= $3
AND d.published_at <= $4
)
SELECT
md.dir_id,
md.published_at,
md.sentiment,
tw.trend_direction,
tw.trend_strength,
tw.generated_at,
tw."window" AS tw_window
FROM matched_docs md
JOIN trend_windows tw
ON tw.entity_type = 'company'
AND tw.entity_id = $1
AND tw."window" = $5
AND tw.generated_at >= md.published_at
AND tw.generated_at <= md.published_at + $6::interval
ORDER BY md.published_at DESC, md.dir_id, tw.generated_at ASC
"""
# ---------------------------------------------------------------------------
# SQL: cross-company pattern query
# ---------------------------------------------------------------------------
# Selects valid, non-rejected documents about a source ticker with one
# catalyst type, and pairs each with the trend windows generated for a
# *different* (target) ticker between publication and publication + horizon.
#
# Parameters:
#   $1  source ticker (matched against dir.ticker)
#   $2  catalyst type
#   $3  earliest published_at (inclusive)
#   $4  latest published_at (inclusive)
#   $5  target ticker (matched against tw.entity_id)
#   $6  trend window label
#   $7  horizon length, cast to interval; asyncpg binds interval parameters
#       from datetime.timedelta values
#
# A document can match several trend windows. Rows are ordered so that, for
# each document, the window generated soonest after publication comes first;
# consumers that keep the first row per dir_id therefore get the closest one.
_CROSS_PATTERN_QUERY = """
WITH matched_docs AS (
SELECT
dir.id AS dir_id,
d.published_at,
dir.sentiment
FROM document_impact_records dir
JOIN document_intelligence di ON di.id = dir.intelligence_id
JOIN documents d ON d.id = di.document_id
WHERE dir.ticker = $1
AND dir.catalyst_type = $2
AND di.validation_status = 'valid'
AND d.status != 'rejected'
AND d.published_at >= $3
AND d.published_at <= $4
)
SELECT
md.dir_id,
md.published_at,
md.sentiment,
tw.trend_direction,
tw.trend_strength,
tw.generated_at,
tw."window" AS tw_window
FROM matched_docs md
JOIN trend_windows tw
ON tw.entity_type = 'company'
AND tw.entity_id = $5
AND tw."window" = $6
AND tw.generated_at >= md.published_at
AND tw.generated_at <= md.published_at + $7::interval
ORDER BY md.published_at DESC, md.dir_id, tw.generated_at ASC
"""
# ---------------------------------------------------------------------------
# Horizon → interval mapping
# ---------------------------------------------------------------------------
# Maps each supported horizon label to the text of the equivalent time span.
# The lookup doubles as the whitelist of horizons: the public query functions
# log a warning and skip any label that is not a key here.
_HORIZON_INTERVALS: dict[str, str] = {
"1d": "1 day",
"7d": "7 days",
"30d": "30 days",
}
# ---------------------------------------------------------------------------
# Build HistoricalPattern from query rows
# ---------------------------------------------------------------------------
def _build_pattern(
    rows: list[asyncpg.Record],
    source_ticker: str,
    target_ticker: str,
    catalyst_type: str,
    horizon: str,
    tier: str,
    config: Optional[CompetitiveConfig] = None,
) -> Optional[HistoricalPattern]:
    """Aggregate query rows into a single HistoricalPattern.

    Args:
        rows: Result rows exposing ``dir_id``, ``published_at``,
            ``trend_direction``, ``trend_strength`` and ``generated_at``.
            Several rows may share a ``dir_id`` (one per matching trend
            window).
        source_ticker: Ticker the documents are about.
        target_ticker: Ticker whose trend windows were matched.
        catalyst_type: Catalyst type the rows were selected for.
        horizon: Horizon label stored on the resulting pattern.
        tier: Catalyst tier label, forwarded to the confidence computation.
        config: Thresholds used for confidence and the insufficient-data
            flag; a default ``CompetitiveConfig`` is created when omitted.

    Returns:
        The aggregated pattern, or ``None`` when *rows* is empty or none of
        the samples carries a publication timestamp.
    """
    if not rows:
        return None

    cfg = config or CompetitiveConfig()

    # One sample per impact record. When a record matched several trend
    # windows, keep the one generated soonest after publication. Comparing
    # timestamps explicitly (instead of trusting row order) keeps the choice
    # deterministic however the rows were sorted.
    closest: dict[str, asyncpg.Record] = {}
    for row in rows:
        key = str(row["dir_id"])
        kept = closest.get(key)
        if kept is None:
            closest[key] = row
            continue
        candidate_at = row["generated_at"]
        kept_at = kept["generated_at"]
        if candidate_at is not None and (kept_at is None or candidate_at < kept_at):
            closest[key] = row
    unique_rows = list(closest.values())

    # The date range is mandatory on HistoricalPattern; without any
    # publication timestamp there is nothing meaningful to report, and
    # min()/max() on an empty list would raise.
    published_dates = [
        r["published_at"] for r in unique_rows if r["published_at"] is not None
    ]
    if not published_dates:
        return None
    data_start = min(published_dates)
    data_end = max(published_dates)

    sample_count = len(unique_rows)
    bullish = sum(1 for r in unique_rows if r["trend_direction"] == "bullish")
    bearish = sum(1 for r in unique_rows if r["trend_direction"] == "bearish")
    # Directions other than bullish/bearish still count as samples, so the
    # two percentages need not add up to 1.
    bullish_pct = bullish / sample_count
    bearish_pct = bearish / sample_count

    strengths = [
        float(r["trend_strength"])
        for r in unique_rows
        if r["trend_strength"] is not None
    ]
    avg_strength = sum(strengths) / len(strengths) if strengths else 0.0

    # avg_time_to_resolution: average days between published_at and
    # generated_at, with negative gaps floored at zero.
    resolutions: list[float] = []
    for r in unique_rows:
        pub = r["published_at"]
        gen = r["generated_at"]
        if pub is not None and gen is not None:
            delta = (gen - pub).total_seconds() / 86400.0
            resolutions.append(max(delta, 0.0))
    avg_time_to_resolution = (
        sum(resolutions) / len(resolutions) if resolutions else 0.0
    )

    # Recency: days since the most recent data point.
    now = datetime.now(timezone.utc)
    data_recency_days = (now - data_end).total_seconds() / 86400.0

    outcome_consistency = max(bullish_pct, bearish_pct)
    confidence = compute_pattern_confidence(
        sample_count, outcome_consistency, data_recency_days, tier, cfg,
    )

    return HistoricalPattern(
        source_ticker=source_ticker,
        target_ticker=target_ticker,
        catalyst_type=catalyst_type,
        time_horizon=horizon,
        sample_count=sample_count,
        bullish_pct=bullish_pct,
        bearish_pct=bearish_pct,
        avg_strength=min(max(avg_strength, 0.0), 1.0),
        avg_time_to_resolution=avg_time_to_resolution,
        pattern_confidence=confidence,
        data_start=data_start,
        data_end=data_end,
        tier=tier,
        insufficient_data=sample_count < cfg.min_pattern_samples,
    )
# ---------------------------------------------------------------------------
# Public API
# ---------------------------------------------------------------------------
async def find_self_patterns(
    pool: asyncpg.Pool,
    ticker: str,
    catalyst_type: str,
    horizons: Optional[list[str]] = None,
    config: Optional[CompetitiveConfig] = None,
) -> list[HistoricalPattern]:
    """Find historical patterns for the same company-catalyst pair.

    Queries document_impact_records joined with trend_windows for the
    given ticker and catalyst_type across configurable time horizons.

    Args:
        pool: Connection pool; one connection is held for all horizons.
        ticker: Company whose documents and trend windows are examined.
        catalyst_type: Catalyst type to match; it also determines the tier
            and therefore the lookback window.
        horizons: Horizon labels to evaluate; ``DEFAULT_HORIZONS`` when
            omitted or empty. Unknown labels are logged and skipped.
        config: Lookback and confidence settings; a default
            ``CompetitiveConfig`` is created when omitted.

    Returns:
        One pattern per horizon that produced rows. A horizon whose query
        fails is logged and skipped rather than aborting the whole call.

    Requirements: 3.1, 3.2, 3.5, 11.3
    """
    cfg = config or CompetitiveConfig()
    horizons = horizons or DEFAULT_HORIZONS
    tier = classify_catalyst_tier(catalyst_type)
    lookback = _lookback_days(tier, cfg)
    now = datetime.now(timezone.utc)
    cutoff = now - timedelta(days=lookback)

    patterns: list[HistoricalPattern] = []
    async with pool.acquire() as conn:
        for horizon in horizons:
            interval = _HORIZON_INTERVALS.get(horizon)
            if interval is None:
                logger.warning("Unknown horizon %s, skipping", horizon)
                continue

            # asyncpg encodes parameters in binary form and accepts only a
            # datetime.timedelta for an ``interval`` placeholder; a string
            # such as "7 days" is rejected with DataError, which the handler
            # below would swallow, leaving every horizon empty. All mapped
            # values have the form "<n> day(s)", so the leading integer is
            # the span in days.
            window_span = (
                interval
                if isinstance(interval, timedelta)
                else timedelta(days=int(str(interval).split()[0]))
            )

            try:
                rows = await conn.fetch(
                    _SELF_PATTERN_QUERY,
                    ticker,         # $1
                    catalyst_type,  # $2
                    cutoff,         # $3
                    now,            # $4
                    horizon,        # $5
                    window_span,    # $6
                )
            except Exception:
                # Best effort: one failing horizon must not discard the
                # patterns already collected for the others.
                logger.exception(
                    "Error querying self-patterns for %s/%s/%s",
                    ticker, catalyst_type, horizon,
                )
                continue

            pattern = _build_pattern(
                rows, ticker, ticker, catalyst_type, horizon, tier, cfg,
            )
            if pattern is not None:
                patterns.append(pattern)
    return patterns
async def find_cross_company_patterns(
    pool: asyncpg.Pool,
    source_ticker: str,
    target_ticker: str,
    catalyst_type: str,
    horizons: Optional[list[str]] = None,
    config: Optional[CompetitiveConfig] = None,
) -> list[HistoricalPattern]:
    """Find cross-company historical patterns.

    Queries documents about *source_ticker* with the given catalyst_type,
    then looks at trend_windows for *target_ticker* within each horizon
    after the document was published.

    Args:
        pool: Connection pool; one connection is held for all horizons.
        source_ticker: Company the matched documents are about.
        target_ticker: Company whose trend windows are examined.
        catalyst_type: Catalyst type to match; it also determines the tier
            and therefore the lookback window.
        horizons: Horizon labels to evaluate; ``DEFAULT_HORIZONS`` when
            omitted or empty. Unknown labels are logged and skipped.
        config: Lookback and confidence settings; a default
            ``CompetitiveConfig`` is created when omitted.

    Returns:
        One pattern per horizon that produced rows. A horizon whose query
        fails is logged and skipped rather than aborting the whole call.

    Requirements: 4.2, 11.5
    """
    cfg = config or CompetitiveConfig()
    horizons = horizons or DEFAULT_HORIZONS
    tier = classify_catalyst_tier(catalyst_type)
    lookback = _lookback_days(tier, cfg)
    now = datetime.now(timezone.utc)
    cutoff = now - timedelta(days=lookback)

    patterns: list[HistoricalPattern] = []
    async with pool.acquire() as conn:
        for horizon in horizons:
            interval = _HORIZON_INTERVALS.get(horizon)
            if interval is None:
                logger.warning("Unknown horizon %s, skipping", horizon)
                continue

            # asyncpg encodes parameters in binary form and accepts only a
            # datetime.timedelta for an ``interval`` placeholder; a string
            # such as "7 days" is rejected with DataError, which the handler
            # below would swallow, leaving every horizon empty. All mapped
            # values have the form "<n> day(s)", so the leading integer is
            # the span in days.
            window_span = (
                interval
                if isinstance(interval, timedelta)
                else timedelta(days=int(str(interval).split()[0]))
            )

            try:
                rows = await conn.fetch(
                    _CROSS_PATTERN_QUERY,
                    source_ticker,  # $1
                    catalyst_type,  # $2
                    cutoff,         # $3
                    now,            # $4
                    target_ticker,  # $5
                    horizon,        # $6
                    window_span,    # $7
                )
            except Exception:
                # Best effort: one failing horizon must not discard the
                # patterns already collected for the others. The two tickers
                # are separated so the log line stays readable.
                logger.exception(
                    "Error querying cross-patterns for %s->%s/%s/%s",
                    source_ticker, target_ticker, catalyst_type, horizon,
                )
                continue

            pattern = _build_pattern(
                rows, source_ticker, target_ticker, catalyst_type,
                horizon, tier, cfg,
            )
            if pattern is not None:
                patterns.append(pattern)
    return patterns