Files
stonks-oracle/services/aggregation/pattern_matcher.py
T

415 lines
13 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""Historical pattern mining for competitive intelligence.
Queries document_impact_records joined with trend_windows to find how
similar catalyst types resolved historically for a company or its
competitors. Produces HistoricalPattern objects consumed by the signal
propagation engine and the aggregation worker.
Requirements: 3.1, 3.2, 3.3, 3.4, 3.5, 11.1, 11.2, 11.3, 11.5
"""
from __future__ import annotations
import logging
from dataclasses import dataclass
from datetime import datetime, timedelta, timezone
from typing import Optional
import asyncpg
from services.shared.config import CompetitiveConfig
from services.shared.schemas import MAJOR_DECISION_CATALYSTS
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
# Horizons the find_* functions fall back to when the caller passes none.
DEFAULT_HORIZONS = ["1d", "7d", "30d"]
# ---------------------------------------------------------------------------
# Data classes
# ---------------------------------------------------------------------------
@dataclass
class HistoricalPattern:
    """Aggregate view of how one catalyst type played out in the past.

    Attributes:
        source_ticker: Ticker the matched documents were recorded for.
        target_ticker: Ticker whose trend windows were observed.
        catalyst_type: Catalyst type the statistics are grouped by.
        time_horizon: Horizon label, one of ``1d``, ``7d`` or ``30d``.
        sample_count: Number of samples behind the statistics.
        bullish_pct: Share of bullish outcomes, in [0, 1].
        bearish_pct: Share of bearish outcomes, in [0, 1].
        avg_strength: Mean trend strength, in [0, 1].
        avg_time_to_resolution: Mean time to resolution, in days.
        pattern_confidence: Confidence score, in [0, 1].
        data_start: Start of the date range covered by the samples.
        data_end: End of the date range covered by the samples.
        tier: ``major_corporate_decision`` or ``routine_signal``.
        insufficient_data: Flag marking a pattern built from too few samples.
    """

    source_ticker: str
    target_ticker: str
    catalyst_type: str
    time_horizon: str
    sample_count: int
    bullish_pct: float
    bearish_pct: float
    avg_strength: float
    avg_time_to_resolution: float
    pattern_confidence: float
    data_start: datetime
    data_end: datetime
    tier: str
    insufficient_data: bool
# ---------------------------------------------------------------------------
# Catalyst tier classification (Req 11.1)
# ---------------------------------------------------------------------------
def classify_catalyst_tier(catalyst_type: str) -> str:
    """Map a catalyst type onto its tier.

    The result depends only on membership in ``MAJOR_DECISION_CATALYSTS``:
    members are ``"major_corporate_decision"``, everything else is
    ``"routine_signal"``.
    """
    is_major = catalyst_type in MAJOR_DECISION_CATALYSTS
    return "major_corporate_decision" if is_major else "routine_signal"
# ---------------------------------------------------------------------------
# Pattern confidence (Req 3.3, 11.2)
# ---------------------------------------------------------------------------
def compute_pattern_confidence(
    sample_count: int,
    outcome_consistency: float,
    data_recency_days: float,
    tier: str,
    config: Optional[CompetitiveConfig] = None,
) -> float:
    """Score how much a historical pattern can be trusted.

    The base score is a weighted blend::

        sample_term * 0.4 + outcome_consistency * 0.4 + recency_term * 0.2

    It is then adjusted in this order: multiplied by the configured
    major-decision weight for the ``major_corporate_decision`` tier,
    clamped to [0, 1], capped at 0.25 when there are fewer samples than
    ``min_pattern_samples``, and finally scaled by the staleness penalty
    when the data is older than ``staleness_window_days``.

    Args:
        sample_count: Number of samples behind the pattern.
        outcome_consistency: Share of the dominant outcome; callers pass
            ``max(bullish_pct, bearish_pct)``.
        data_recency_days: Age in days of the most recent sample.
        tier: Catalyst tier of the pattern.
        config: Thresholds and multipliers; a default ``CompetitiveConfig``
            is built when omitted.

    Returns:
        The adjusted confidence score.
    """
    settings = config or CompetitiveConfig()

    # Sample size stops adding confidence once 20 samples are reached.
    sample_term = min(sample_count / 20.0, 1.0)

    # Recency is bucketed into three steps rather than decayed continuously.
    if data_recency_days <= settings.staleness_recent_days:
        recency_term = 1.0
    elif data_recency_days <= settings.staleness_window_days:
        recency_term = 0.7
    else:
        recency_term = 0.4

    score = sample_term * 0.4 + outcome_consistency * 0.4 + recency_term * 0.2

    # Major corporate decisions are weighted up before clamping (Req 11.2).
    if tier == "major_corporate_decision":
        score *= settings.major_decision_weight_multiplier

    # Keep the score inside [0, 1].
    if score < 0.0:
        score = 0.0
    elif score > 1.0:
        score = 1.0

    # Too few samples: never report more than 0.25 (Req 3.4).
    if sample_count < settings.min_pattern_samples and score > 0.25:
        score = 0.25

    # Data older than the staleness window is penalised further (Req 9.2).
    if data_recency_days > settings.staleness_window_days:
        score *= settings.staleness_decay_penalty

    return score
# ---------------------------------------------------------------------------
# Lookback helper
# ---------------------------------------------------------------------------
def _lookback_days(tier: str, config: Optional[CompetitiveConfig] = None) -> int:
    """Pick the history window, in days, that applies to *tier*.

    The ``major_corporate_decision`` tier uses
    ``major_decision_lookback_days``; every other tier uses
    ``routine_lookback_days``. A default ``CompetitiveConfig`` is built
    when *config* is omitted.
    """
    settings = config or CompetitiveConfig()
    is_major = tier == "major_corporate_decision"
    return (
        settings.major_decision_lookback_days
        if is_major
        else settings.routine_lookback_days
    )
# ---------------------------------------------------------------------------
# SQL: self-company pattern query
# ---------------------------------------------------------------------------
# Selects valid, non-rejected impact records for one ticker/catalyst pair and
# joins each to the trend windows generated for that same ticker between the
# document's publication and the end of the horizon.
#
# Parameters:
#   $1  ticker (also used as the trend_windows entity_id)
#   $2  catalyst_type
#   $3  inclusive lower bound on documents.published_at
#   $4  inclusive upper bound on documents.published_at
#   $5  trend window label, e.g. "7d"
#   $6  horizon length as interval text, e.g. "7 days"
#
# $6 is cast through text on purpose: a bare ``$6::interval`` makes the
# parameter an interval, for which asyncpg expects a datetime.timedelta and
# rejects the str the callers bind.
#
# Rows for one document are ordered by generated_at ascending so that the
# first row per dir_id is the window closest to publication.
_SELF_PATTERN_QUERY = """
WITH matched_docs AS (
SELECT
dir.id AS dir_id,
d.published_at,
dir.sentiment
FROM document_impact_records dir
JOIN document_intelligence di ON di.id = dir.intelligence_id
JOIN documents d ON d.id = di.document_id
WHERE dir.ticker = $1
AND dir.catalyst_type = $2
AND di.validation_status = 'valid'
AND d.status != 'rejected'
AND d.published_at >= $3
AND d.published_at <= $4
)
SELECT
md.dir_id,
md.published_at,
md.sentiment,
tw.trend_direction,
tw.trend_strength,
tw.generated_at,
tw."window" AS tw_window
FROM matched_docs md
JOIN trend_windows tw
ON tw.entity_type = 'company'
AND tw.entity_id = $1
AND tw."window" = $5
AND tw.generated_at >= md.published_at
AND tw.generated_at <= md.published_at + $6::text::interval
ORDER BY md.published_at DESC, tw.generated_at ASC
"""
# ---------------------------------------------------------------------------
# SQL: cross-company pattern query
# ---------------------------------------------------------------------------
# Selects valid, non-rejected impact records for the source ticker/catalyst
# pair and joins each to the trend windows generated for the target ticker
# between the document's publication and the end of the horizon.
#
# Parameters:
#   $1  source ticker (documents are matched on this)
#   $2  catalyst_type
#   $3  inclusive lower bound on documents.published_at
#   $4  inclusive upper bound on documents.published_at
#   $5  target ticker (trend_windows entity_id)
#   $6  trend window label, e.g. "7d"
#   $7  horizon length as interval text, e.g. "7 days"
#
# $7 is cast through text on purpose: a bare ``$7::interval`` makes the
# parameter an interval, for which asyncpg expects a datetime.timedelta and
# rejects the str the callers bind.
#
# Rows for one document are ordered by generated_at ascending so that the
# first row per dir_id is the window closest to publication.
_CROSS_PATTERN_QUERY = """
WITH matched_docs AS (
SELECT
dir.id AS dir_id,
d.published_at,
dir.sentiment
FROM document_impact_records dir
JOIN document_intelligence di ON di.id = dir.intelligence_id
JOIN documents d ON d.id = di.document_id
WHERE dir.ticker = $1
AND dir.catalyst_type = $2
AND di.validation_status = 'valid'
AND d.status != 'rejected'
AND d.published_at >= $3
AND d.published_at <= $4
)
SELECT
md.dir_id,
md.published_at,
md.sentiment,
tw.trend_direction,
tw.trend_strength,
tw.generated_at,
tw."window" AS tw_window
FROM matched_docs md
JOIN trend_windows tw
ON tw.entity_type = 'company'
AND tw.entity_id = $5
AND tw."window" = $6
AND tw.generated_at >= md.published_at
AND tw.generated_at <= md.published_at + $7::text::interval
ORDER BY md.published_at DESC, tw.generated_at ASC
"""
# ---------------------------------------------------------------------------
# Horizon → interval mapping
# ---------------------------------------------------------------------------
# Maps each supported horizon label to the PostgreSQL interval text that the
# pattern queries add to published_at to bound the trend-window search.
# Labels missing from this mapping are skipped by the find_* functions.
_HORIZON_INTERVALS: dict[str, str] = {
    "1d": "1 day",
    "7d": "7 days",
    "30d": "30 days",
}
# ---------------------------------------------------------------------------
# Build HistoricalPattern from query rows
# ---------------------------------------------------------------------------
def _build_pattern(
    rows: list[asyncpg.Record],
    source_ticker: str,
    target_ticker: str,
    catalyst_type: str,
    horizon: str,
    tier: str,
    config: Optional[CompetitiveConfig] = None,
) -> Optional[HistoricalPattern]:
    """Aggregate query rows into a single HistoricalPattern.

    Each row pairs one impact record (``dir_id``, ``published_at``) with one
    trend window (``trend_direction``, ``trend_strength``, ``generated_at``).
    A record matched by several windows counts once, represented by the
    window with the earliest ``generated_at``. That is the closest window,
    because the pattern queries only join windows generated at or after
    publication.

    Args:
        rows: Rows returned by one of the pattern queries.
        source_ticker: Ticker the documents were matched on.
        target_ticker: Ticker whose trend windows were joined.
        catalyst_type: Catalyst type the rows were filtered by.
        horizon: Horizon label stored on the pattern.
        tier: Catalyst tier, forwarded to the confidence computation.
        config: Thresholds; a default ``CompetitiveConfig`` is built when
            omitted.

    Returns:
        The aggregated pattern, or ``None`` when *rows* is empty or no row
        has a ``published_at`` from which to derive the data range.
    """
    if not rows:
        return None

    # One sample per impact record. The choice is made here instead of
    # trusting the incoming row order, so it holds for any row ordering.
    closest: dict[str, asyncpg.Record] = {}
    for row in rows:
        key = str(row["dir_id"])
        kept = closest.get(key)
        if kept is None:
            closest[key] = row
            continue
        candidate_at = row["generated_at"]
        kept_at = kept["generated_at"]
        if candidate_at is not None and (kept_at is None or candidate_at < kept_at):
            closest[key] = row
    unique_rows = list(closest.values())

    # Without any publication date there is no data range to report, and
    # min()/max() on an empty list would raise; treat it as "no pattern".
    published_dates = [
        r["published_at"] for r in unique_rows if r["published_at"] is not None
    ]
    if not published_dates:
        return None
    data_start = min(published_dates)
    data_end = max(published_dates)

    cfg = config or CompetitiveConfig()
    sample_count = len(unique_rows)

    bullish = sum(1 for r in unique_rows if r["trend_direction"] == "bullish")
    bearish = sum(1 for r in unique_rows if r["trend_direction"] == "bearish")
    bullish_pct = bullish / sample_count
    bearish_pct = bearish / sample_count

    strengths = [
        float(r["trend_strength"])
        for r in unique_rows
        if r["trend_strength"] is not None
    ]
    avg_strength = sum(strengths) / len(strengths) if strengths else 0.0

    # Days between publication and the chosen trend window; negative gaps
    # are floored at zero.
    resolutions = [
        max((r["generated_at"] - r["published_at"]).total_seconds() / 86400.0, 0.0)
        for r in unique_rows
        if r["published_at"] and r["generated_at"]
    ]
    avg_time_to_resolution = (
        sum(resolutions) / len(resolutions) if resolutions else 0.0
    )

    # Recency: days since the most recent matched document.
    now = datetime.now(timezone.utc)
    data_recency_days = (now - data_end).total_seconds() / 86400.0

    outcome_consistency = max(bullish_pct, bearish_pct)
    confidence = compute_pattern_confidence(
        sample_count, outcome_consistency, data_recency_days, tier, cfg,
    )

    return HistoricalPattern(
        source_ticker=source_ticker,
        target_ticker=target_ticker,
        catalyst_type=catalyst_type,
        time_horizon=horizon,
        sample_count=sample_count,
        bullish_pct=bullish_pct,
        bearish_pct=bearish_pct,
        avg_strength=min(max(avg_strength, 0.0), 1.0),
        avg_time_to_resolution=avg_time_to_resolution,
        pattern_confidence=confidence,
        data_start=data_start,
        data_end=data_end,
        tier=tier,
        insufficient_data=sample_count < cfg.min_pattern_samples,
    )
# ---------------------------------------------------------------------------
# Public API
# ---------------------------------------------------------------------------
async def find_self_patterns(
    pool: asyncpg.Pool,
    ticker: str,
    catalyst_type: str,
    horizons: Optional[list[str]] = None,
    config: Optional[CompetitiveConfig] = None,
) -> list[HistoricalPattern]:
    """Mine a company's own history for one catalyst type.

    For every requested horizon, the self-pattern query is run for
    *ticker* and *catalyst_type* over the tier's lookback window and the
    rows are aggregated into a pattern. Horizons without an interval
    mapping are skipped with a warning; a failing query is logged and that
    horizon is skipped.

    Args:
        pool: Connection pool; one connection is held for all horizons.
        ticker: Company whose documents and trend windows are examined.
        catalyst_type: Catalyst type to match.
        horizons: Horizon labels; ``DEFAULT_HORIZONS`` when omitted or empty.
        config: Settings; a default ``CompetitiveConfig`` when omitted.

    Returns:
        One pattern per horizon that produced rows.

    Requirements: 3.1, 3.2, 3.5, 11.3
    """
    settings = config or CompetitiveConfig()
    requested_horizons = horizons or DEFAULT_HORIZONS
    tier = classify_catalyst_tier(catalyst_type)

    # Documents are matched between (now - lookback) and now.
    window_days = _lookback_days(tier, settings)
    window_end = datetime.now(timezone.utc)
    window_start = window_end - timedelta(days=window_days)

    found: list[HistoricalPattern] = []
    async with pool.acquire() as conn:
        for horizon in requested_horizons:
            pg_interval = _HORIZON_INTERVALS.get(horizon)
            if pg_interval is None:
                logger.warning("Unknown horizon %s, skipping", horizon)
                continue

            # Positional arguments map to $1..$6 of the query.
            query_args = (
                ticker,
                catalyst_type,
                window_start,
                window_end,
                horizon,
                pg_interval,
            )
            try:
                rows = await conn.fetch(_SELF_PATTERN_QUERY, *query_args)
            except Exception:
                logger.exception(
                    "Error querying self-patterns for %s/%s/%s",
                    ticker, catalyst_type, horizon,
                )
            else:
                # Source and target are the same company here.
                pattern = _build_pattern(
                    rows, ticker, ticker, catalyst_type, horizon, tier, settings,
                )
                if pattern is not None:
                    found.append(pattern)
    return found
async def find_cross_company_patterns(
    pool: asyncpg.Pool,
    source_ticker: str,
    target_ticker: str,
    catalyst_type: str,
    horizons: Optional[list[str]] = None,
    config: Optional[CompetitiveConfig] = None,
) -> list[HistoricalPattern]:
    """Find cross-company historical patterns.

    Queries documents about *source_ticker* with the given catalyst_type,
    then looks at trend_windows for *target_ticker* within each horizon
    after the document was published. Horizons without an interval mapping
    are skipped with a warning; a failing query is logged and that horizon
    is skipped.

    Args:
        pool: Connection pool; one connection is held for all horizons.
        source_ticker: Company the matched documents are about.
        target_ticker: Company whose trend windows are examined.
        catalyst_type: Catalyst type to match.
        horizons: Horizon labels; ``DEFAULT_HORIZONS`` when omitted or empty.
        config: Settings; a default ``CompetitiveConfig`` when omitted.

    Returns:
        One pattern per horizon that produced rows.

    Requirements: 4.2, 11.5
    """
    cfg = config or CompetitiveConfig()
    horizons = horizons or DEFAULT_HORIZONS
    tier = classify_catalyst_tier(catalyst_type)
    lookback = _lookback_days(tier, cfg)
    now = datetime.now(timezone.utc)
    cutoff = now - timedelta(days=lookback)

    patterns: list[HistoricalPattern] = []
    async with pool.acquire() as conn:
        for horizon in horizons:
            interval = _HORIZON_INTERVALS.get(horizon)
            if interval is None:
                logger.warning("Unknown horizon %s, skipping", horizon)
                continue
            try:
                rows = await conn.fetch(
                    _CROSS_PATTERN_QUERY,
                    source_ticker,  # $1
                    catalyst_type,  # $2
                    cutoff,  # $3
                    now,  # $4
                    target_ticker,  # $5
                    horizon,  # $6
                    interval,  # $7
                )
            except Exception:
                # Best-effort: one failing horizon must not hide the others.
                # The "->" keeps source and target tickers distinguishable
                # in the log line.
                logger.exception(
                    "Error querying cross-patterns for %s->%s/%s/%s",
                    source_ticker, target_ticker, catalyst_type, horizon,
                )
                continue
            pattern = _build_pattern(
                rows, source_ticker, target_ticker, catalyst_type,
                horizon, tier, cfg,
            )
            if pattern is not None:
                patterns.append(pattern)
    return patterns