Files
stonks-oracle/services/aggregation/worker.py
T

653 lines
21 KiB
Python

"""Aggregation worker - company-level rolling window trend summaries.
Queries document intelligence and market context for a given ticker,
computes weighted signal scores, and produces TrendSummary objects
persisted to the trend_windows table.
Requirements: 6.1, 6.2, 6.5
"""
from __future__ import annotations
import json
import logging
import time
from dataclasses import dataclass
from datetime import datetime, timedelta, timezone
from typing import Any
import asyncpg
from services.aggregation.contradiction import CatalystEntry, detect_contradictions
from services.aggregation.evidence import (
EvidenceRankConfig,
RankedEvidence,
rank_evidence_detailed,
)
from services.aggregation.evidence import (
rank_evidence as _rank_evidence_composite,
)
from services.aggregation.market_context import fetch_market_context
from services.aggregation.scoring import (
ScoringConfig,
WeightedSignal,
compute_signal_weight,
sentiment_to_numeric,
weighted_sentiment_average,
)
from services.shared.metrics import (
AGGREGATION_CONTRADICTION_SCORE,
AGGREGATION_DURATION,
AGGREGATION_SIGNALS_PROCESSED,
AGGREGATION_WINDOWS_COMPUTED,
)
from services.shared.schemas import TrendDirection, TrendSummary, TrendWindow
logger = logging.getLogger(__name__)
# Map TrendWindow values to lookback durations.
WINDOW_DURATIONS: dict[str, timedelta] = {
TrendWindow.INTRADAY.value: timedelta(hours=12),
TrendWindow.ONE_DAY.value: timedelta(days=1),
TrendWindow.SEVEN_DAY.value: timedelta(days=7),
TrendWindow.THIRTY_DAY.value: timedelta(days=30),
TrendWindow.NINETY_DAY.value: timedelta(days=90),
}
# How many evidence document IDs to keep in supporting/opposing lists.
MAX_EVIDENCE_REFS = 10
@dataclass
class AggregationConfig:
"""Controls which windows to compute and scoring parameters."""
windows: list[str] | None = None # None = all windows
scoring: ScoringConfig | None = None
max_evidence: int = MAX_EVIDENCE_REFS
def effective_windows(self) -> list[str]:
if self.windows:
return self.windows
return [w.value for w in TrendWindow]
def effective_scoring(self) -> ScoringConfig:
return self.scoring or ScoringConfig()
# ---------------------------------------------------------------------------
# Fetch impact records for a ticker within a time window
# ---------------------------------------------------------------------------
_IMPACT_QUERY = """
SELECT
di.document_id,
di.confidence,
di.novelty_score,
di.source_credibility,
dir.sentiment,
dir.impact_score,
dir.catalyst_type,
dir.key_facts,
dir.risks,
d.published_at
FROM document_impact_records dir
JOIN document_intelligence di ON di.id = dir.intelligence_id
JOIN documents d ON d.id = di.document_id
WHERE dir.ticker = $1
AND d.published_at >= $2
AND d.published_at <= $3
AND di.validation_status = 'valid'
AND d.status != 'rejected'
ORDER BY d.published_at DESC
"""
@dataclass
class ImpactRow:
"""Parsed row from the impact query."""
document_id: str
confidence: float
novelty_score: float
source_credibility: float
sentiment: str
impact_score: float
catalyst_type: str
key_facts: list[str]
risks: list[str]
published_at: datetime
def _parse_impact_row(row: Any) -> ImpactRow:
"""Convert an asyncpg Record to an ImpactRow."""
key_facts = row["key_facts"]
if isinstance(key_facts, str):
key_facts = json.loads(key_facts)
risks = row["risks"]
if isinstance(risks, str):
risks = json.loads(risks)
return ImpactRow(
document_id=str(row["document_id"]),
confidence=float(row["confidence"] or 0.5),
novelty_score=float(row["novelty_score"] or 0.5),
source_credibility=float(row["source_credibility"] or 0.5),
sentiment=row["sentiment"] or "neutral",
impact_score=float(row["impact_score"] or 0.0),
catalyst_type=row["catalyst_type"] or "other",
key_facts=key_facts if isinstance(key_facts, list) else [],
risks=risks if isinstance(risks, list) else [],
published_at=row["published_at"],
)
async def fetch_impact_records(
pool: asyncpg.Pool,
ticker: str,
window_start: datetime,
window_end: datetime,
) -> list[ImpactRow]:
"""Fetch validated document impact records for a ticker in a time range."""
rows = await pool.fetch(_IMPACT_QUERY, ticker, window_start, window_end)
return [_parse_impact_row(r) for r in rows]
# ---------------------------------------------------------------------------
# Build weighted signals from impact records
# ---------------------------------------------------------------------------
def build_weighted_signals(
impacts: list[ImpactRow],
reference_time: datetime,
window: str,
market_ctx: Any | None = None,
config: ScoringConfig | None = None,
) -> list[WeightedSignal]:
"""Convert impact records into WeightedSignal objects using the scoring module."""
cfg = config or ScoringConfig()
signals: list[WeightedSignal] = []
for imp in impacts:
sw = compute_signal_weight(
published_at=imp.published_at,
reference_time=reference_time,
window=window,
source_credibility=imp.source_credibility,
novelty_score=imp.novelty_score,
extraction_confidence=imp.confidence,
market_ctx=market_ctx,
config=cfg,
)
signals.append(
WeightedSignal(
document_id=imp.document_id,
weight=sw,
sentiment_value=sentiment_to_numeric(imp.sentiment),
impact_score=imp.impact_score,
)
)
return signals
# ---------------------------------------------------------------------------
# Derive trend direction from weighted sentiment
# ---------------------------------------------------------------------------
# Thresholds for mapping numeric sentiment to direction.
BULLISH_THRESHOLD = 0.15
BEARISH_THRESHOLD = -0.15
MIXED_THRESHOLD = 0.10 # contradiction score above this → mixed
def derive_trend_direction(
avg_sentiment: float,
contradiction_score: float = 0.0,
) -> TrendDirection:
"""Map a weighted average sentiment to a TrendDirection.
If contradiction is high, the direction is MIXED regardless of
the average sentiment value.
"""
if contradiction_score > MIXED_THRESHOLD and abs(avg_sentiment) < 0.3:
return TrendDirection.MIXED
if avg_sentiment >= BULLISH_THRESHOLD:
return TrendDirection.BULLISH
if avg_sentiment <= BEARISH_THRESHOLD:
return TrendDirection.BEARISH
return TrendDirection.NEUTRAL
# ---------------------------------------------------------------------------
# Compute contradiction score
# ---------------------------------------------------------------------------
def compute_contradiction_score(signals: list[WeightedSignal]) -> float:
"""Measure how much disagreement exists among weighted signals.
Returns a value in [0, 1] where 0 means full agreement and 1 means
equal-weight positive and negative signals.
The formula computes the ratio of the minority-side total weight to
the majority-side total weight.
"""
if not signals:
return 0.0
pos_weight = 0.0
neg_weight = 0.0
for sig in signals:
w = sig.weight.combined * sig.impact_score
if sig.sentiment_value > 0:
pos_weight += w
elif sig.sentiment_value < 0:
neg_weight += w
total = pos_weight + neg_weight
if total == 0.0:
return 0.0
minority = min(pos_weight, neg_weight)
return round(minority / total, 4)
# ---------------------------------------------------------------------------
# Rank evidence (supporting vs opposing)
# ---------------------------------------------------------------------------
def rank_evidence(
signals: list[WeightedSignal],
max_refs: int = MAX_EVIDENCE_REFS,
) -> tuple[list[str], list[str]]:
"""Return top supporting and opposing document IDs ranked by composite score.
Delegates to the evidence ranking module which considers multiple
factors (weight, impact, recency, confidence) rather than raw weight alone.
Supporting = positive sentiment, Opposing = negative sentiment.
Neutral/mixed signals are excluded from evidence lists.
"""
config = EvidenceRankConfig(max_refs=max_refs)
return _rank_evidence_composite(signals, config)
# ---------------------------------------------------------------------------
# Extract dominant catalysts and material risks
# ---------------------------------------------------------------------------
def extract_catalysts_and_risks(
impacts: list[ImpactRow],
signals: list[WeightedSignal],
) -> tuple[list[str], list[str]]:
"""Return dominant catalyst types and material risks weighted by signal strength.
Catalysts are ranked by cumulative weight. Risks are deduplicated and
ordered by the weight of the signal that surfaced them.
"""
catalyst_weights: dict[str, float] = {}
risk_entries: list[tuple[float, str]] = []
# Build a lookup from document_id to combined weight
weight_by_doc = {s.document_id: s.weight.combined * s.impact_score for s in signals}
for imp in impacts:
w = weight_by_doc.get(imp.document_id, 0.0)
if w <= 0.0:
continue
catalyst_weights[imp.catalyst_type] = catalyst_weights.get(imp.catalyst_type, 0.0) + w
for risk in imp.risks:
risk_entries.append((w, risk))
# Top catalysts by cumulative weight
sorted_catalysts = sorted(catalyst_weights.items(), key=lambda x: x[1], reverse=True)
catalysts = [cat for cat, _ in sorted_catalysts[:5]]
# Deduplicated risks ordered by weight
seen_risks: set[str] = set()
risks: list[str] = []
risk_entries.sort(key=lambda x: x[0], reverse=True)
for _, risk_text in risk_entries:
normalized = risk_text.strip().lower()
if normalized not in seen_risks:
seen_risks.add(normalized)
risks.append(risk_text.strip())
if len(risks) >= 5:
break
return catalysts, risks
# ---------------------------------------------------------------------------
# Compute trend confidence
# ---------------------------------------------------------------------------
def compute_trend_confidence(
signals: list[WeightedSignal],
contradiction_score: float,
) -> float:
"""Derive an overall confidence for the trend summary.
Confidence is based on:
- Number of contributing signals (more = higher base)
- Average extraction confidence of contributing signals
- Contradiction penalty (high contradiction lowers confidence)
Returns a value in [0, 1].
"""
if not signals:
return 0.0
active = [s for s in signals if s.weight.combined > 0]
if not active:
return 0.0
# Base confidence from signal count (diminishing returns)
count_factor = min(len(active) / 20.0, 1.0)
# Average extraction confidence (from the confidence_gate — if gated,
# the signal wouldn't be in active list, so we use the raw confidence
# from the weight breakdown).
avg_conf = sum(s.weight.credibility for s in active) / len(active)
# Contradiction penalty
contradiction_penalty = contradiction_score * 0.4
confidence = (0.4 * count_factor + 0.6 * avg_conf) - contradiction_penalty
return round(max(0.0, min(1.0, confidence)), 4)
# ---------------------------------------------------------------------------
# Assemble a TrendSummary from components
# ---------------------------------------------------------------------------
@dataclass
class AssembledTrend:
"""A trend summary paired with its detailed evidence rankings."""
summary: TrendSummary
supporting_evidence: list[RankedEvidence]
opposing_evidence: list[RankedEvidence]
def assemble_trend_summary(
ticker: str,
window: str,
signals: list[WeightedSignal],
impacts: list[ImpactRow],
market_ctx: Any | None = None,
max_evidence: int = MAX_EVIDENCE_REFS,
reference_time: datetime | None = None,
) -> TrendSummary:
"""Build a complete TrendSummary from weighted signals and impact records."""
result = assemble_trend_with_evidence(
ticker, window, signals, impacts, market_ctx, max_evidence, reference_time,
)
return result.summary
def assemble_trend_with_evidence(
ticker: str,
window: str,
signals: list[WeightedSignal],
impacts: list[ImpactRow],
market_ctx: Any | None = None,
max_evidence: int = MAX_EVIDENCE_REFS,
reference_time: datetime | None = None,
) -> AssembledTrend:
"""Build a TrendSummary and return detailed evidence rankings for persistence."""
if reference_time is None:
reference_time = datetime.now(timezone.utc)
avg_sentiment = weighted_sentiment_average(signals)
# Run full contradiction detection (Requirement 6.4)
catalyst_entries = [
CatalystEntry(document_id=imp.document_id, catalyst_type=imp.catalyst_type)
for imp in impacts
]
contradiction_result = detect_contradictions(signals, catalyst_entries)
contradiction = contradiction_result.score
direction = derive_trend_direction(avg_sentiment, contradiction)
confidence = compute_trend_confidence(signals, contradiction)
# Get detailed evidence rankings for persistence
config = EvidenceRankConfig(max_refs=max_evidence)
supporting_ranked, opposing_ranked = rank_evidence_detailed(signals, config)
supporting = [r.document_id for r in supporting_ranked]
opposing = [r.document_id for r in opposing_ranked]
catalysts, risks = extract_catalysts_and_risks(impacts, signals)
# Trend strength: absolute value of weighted sentiment, clamped to [0, 1]
strength = round(min(abs(avg_sentiment), 1.0), 4)
summary = TrendSummary(
entity_type="company",
entity_id=ticker,
window=TrendWindow(window),
trend_direction=direction,
trend_strength=strength,
confidence=confidence,
top_supporting_evidence=supporting,
top_opposing_evidence=opposing,
dominant_catalysts=catalysts,
material_risks=risks,
contradiction_score=contradiction,
disagreement_details=contradiction_result.details,
market_context=market_ctx,
generated_at=reference_time,
)
return AssembledTrend(
summary=summary,
supporting_evidence=supporting_ranked,
opposing_evidence=opposing_ranked,
)
# ---------------------------------------------------------------------------
# Persist trend summary to PostgreSQL
# ---------------------------------------------------------------------------
_UPSERT_TREND = """
INSERT INTO trend_windows (
entity_type, entity_id, window, trend_direction, trend_strength,
confidence, top_supporting_evidence, top_opposing_evidence,
dominant_catalysts, material_risks, contradiction_score,
disagreement_details, market_context, generated_at
) VALUES (
$1, $2, $3, $4, $5,
$6, $7::jsonb, $8::jsonb,
$9::jsonb, $10::jsonb, $11,
$12::jsonb, $13::jsonb, $14
)
RETURNING id
"""
async def persist_trend_summary(
pool: asyncpg.Pool,
summary: TrendSummary,
) -> str:
"""Insert a trend summary row and return its UUID."""
row = await pool.fetchrow(
_UPSERT_TREND,
summary.entity_type,
summary.entity_id,
summary.window.value,
summary.trend_direction.value,
summary.trend_strength,
summary.confidence,
json.dumps(summary.top_supporting_evidence),
json.dumps(summary.top_opposing_evidence),
json.dumps(summary.dominant_catalysts),
json.dumps(summary.material_risks),
summary.contradiction_score,
json.dumps([d.model_dump() for d in summary.disagreement_details]),
json.dumps(summary.market_context.model_dump() if summary.market_context else {}, default=str),
summary.generated_at,
)
return str(row["id"])
# ---------------------------------------------------------------------------
# Persist evidence mappings to trend_evidence table
# ---------------------------------------------------------------------------
_INSERT_EVIDENCE = """
INSERT INTO trend_evidence (
trend_window_id, document_id, evidence_type,
rank_score, weight_component, impact_component,
recency_component, confidence_component, sentiment_value
) VALUES (
$1, $2::uuid, $3,
$4, $5, $6,
$7, $8, $9
)
"""
async def persist_trend_evidence(
pool: asyncpg.Pool,
trend_window_id: str,
supporting: list[RankedEvidence],
opposing: list[RankedEvidence],
) -> int:
"""Insert evidence mapping rows for a trend window. Returns count inserted."""
rows: list[tuple[str, str, str, float, float, float, float, float, float]] = []
for ev in supporting:
rows.append((
trend_window_id, ev.document_id, "supporting",
ev.rank_score, ev.weight_component, ev.impact_component,
ev.recency_component, ev.confidence_component, ev.sentiment_value,
))
for ev in opposing:
rows.append((
trend_window_id, ev.document_id, "opposing",
ev.rank_score, ev.weight_component, ev.impact_component,
ev.recency_component, ev.confidence_component, ev.sentiment_value,
))
if not rows:
return 0
await pool.executemany(_INSERT_EVIDENCE, rows)
return len(rows)
# ---------------------------------------------------------------------------
# Main aggregation entry point for a single ticker + window
# ---------------------------------------------------------------------------
async def aggregate_company_window(
pool: asyncpg.Pool,
ticker: str,
window: str,
reference_time: datetime | None = None,
config: AggregationConfig | None = None,
) -> TrendSummary:
"""Compute and persist a trend summary for one ticker and one window.
Steps:
1. Determine the time range for the window.
2. Fetch document impact records from PostgreSQL.
3. Fetch market context for the ticker.
4. Build weighted signals using the scoring module.
5. Assemble the TrendSummary.
6. Persist to trend_windows table.
Returns the assembled TrendSummary.
"""
cfg = config or AggregationConfig()
scoring_cfg = cfg.effective_scoring()
if reference_time is None:
reference_time = datetime.now(timezone.utc)
_agg_start = time.monotonic()
duration = WINDOW_DURATIONS.get(window, timedelta(days=7))
window_start = reference_time - duration
# 1. Fetch impact records
impacts = await fetch_impact_records(pool, ticker, window_start, reference_time)
# 2. Fetch market context
market_ctx = await fetch_market_context(pool, ticker, window, reference_time)
# 3. Build weighted signals
signals = build_weighted_signals(
impacts, reference_time, window, market_ctx, scoring_cfg,
)
# 4. Assemble trend summary with evidence details
assembled = assemble_trend_with_evidence(
ticker=ticker,
window=window,
signals=signals,
impacts=impacts,
market_ctx=market_ctx if market_ctx.has_data else None,
max_evidence=cfg.max_evidence,
reference_time=reference_time,
)
summary = assembled.summary
# 5. Persist trend window
trend_id = await persist_trend_summary(pool, summary)
# 6. Persist evidence mappings
evidence_count = await persist_trend_evidence(
pool, trend_id,
assembled.supporting_evidence,
assembled.opposing_evidence,
)
logger.info(
"Persisted trend %s for %s/%s: direction=%s strength=%.3f confidence=%.3f signals=%d evidence=%d",
trend_id, ticker, window, summary.trend_direction.value,
summary.trend_strength, summary.confidence, len(signals), evidence_count,
)
# Prometheus metrics
AGGREGATION_WINDOWS_COMPUTED.labels(window=window).inc()
AGGREGATION_SIGNALS_PROCESSED.labels(window=window).inc(len(signals))
AGGREGATION_CONTRADICTION_SCORE.observe(summary.contradiction_score)
AGGREGATION_DURATION.labels(window=window).observe(time.monotonic() - _agg_start)
return summary
# ---------------------------------------------------------------------------
# Aggregate all windows for a single ticker
# ---------------------------------------------------------------------------
async def aggregate_company(
pool: asyncpg.Pool,
ticker: str,
reference_time: datetime | None = None,
config: AggregationConfig | None = None,
) -> list[TrendSummary]:
"""Compute trend summaries for all configured windows for a ticker."""
cfg = config or AggregationConfig()
if reference_time is None:
reference_time = datetime.now(timezone.utc)
summaries: list[TrendSummary] = []
for window in cfg.effective_windows():
summary = await aggregate_company_window(
pool, ticker, window, reference_time, cfg,
)
summaries.append(summary)
return summaries