phase 14-15: docker build validation and helm deployment

This commit is contained in:
Celes Renata
2026-04-11 11:59:45 -07:00
parent 7394d241c9
commit ce10afa034
179 changed files with 32559 additions and 576 deletions
+169
View File
@@ -0,0 +1,169 @@
"""Contradiction detection and disagreement representation.
Analyses weighted signals to detect and represent disagreement explicitly,
rather than collapsing contradictory evidence into a single unsupported
conclusion.
Requirements: 6.4, 6.5
"""
from __future__ import annotations
from dataclasses import dataclass
from services.aggregation.scoring import WeightedSignal
from services.shared.schemas import DisagreementDetail
@dataclass
class CatalystEntry:
    """Pairing of a document with the catalyst type attributed to it.

    Kept deliberately small so contradiction detection does not need to
    import ImpactRow, which would create a circular dependency with
    worker.py.
    """

    # Identifier of the document; matched against signal document IDs.
    document_id: str
    # Catalyst label used to group documents during detection.
    catalyst_type: str
@dataclass
class ContradictionResult:
    """Outcome of a full contradiction analysis."""

    # Overall contradiction score in the 0-1 range; same semantics as the
    # existing compute_contradiction_score.
    score: float
    # One entry per dimension on which disagreement was detected.
    details: list[DisagreementDetail]
def detect_contradictions(
    signals: list[WeightedSignal],
    catalyst_entries: list[CatalystEntry] | None = None,
) -> ContradictionResult:
    """Analyse *signals* for disagreement along several dimensions.

    Dimensions examined:
      1. Sentiment — positive and negative signals are both present.
      2. Catalyst — one catalyst type carries opposing sentiment (only
         evaluated when *catalyst_entries* is non-empty).

    Returns:
        A ContradictionResult holding the overall score together with the
        per-dimension disagreement details.
    """
    sentiment_detail = _detect_sentiment_disagreement(signals)
    details: list[DisagreementDetail] = (
        [] if sentiment_detail is None else [sentiment_detail]
    )

    if catalyst_entries:
        details += _detect_catalyst_disagreement(signals, catalyst_entries)

    return ContradictionResult(
        score=_compute_overall_score(signals),
        details=details,
    )
def _compute_overall_score(signals: list[WeightedSignal]) -> float:
"""Minority/majority weight ratio — backward-compatible formula."""
if not signals:
return 0.0
pos_weight = 0.0
neg_weight = 0.0
for sig in signals:
w = sig.weight.combined * sig.impact_score
if sig.sentiment_value > 0:
pos_weight += w
elif sig.sentiment_value < 0:
neg_weight += w
total = pos_weight + neg_weight
if total == 0.0:
return 0.0
minority = min(pos_weight, neg_weight)
return round(minority / total, 4)
def _detect_sentiment_disagreement(
signals: list[WeightedSignal],
) -> DisagreementDetail | None:
"""Detect when both positive and negative sentiment signals exist."""
pos_ids: list[str] = []
neg_ids: list[str] = []
pos_weight = 0.0
neg_weight = 0.0
for sig in signals:
w = sig.weight.combined * sig.impact_score
if w <= 0:
continue
if sig.sentiment_value > 0:
pos_ids.append(sig.document_id)
pos_weight += w
elif sig.sentiment_value < 0:
neg_ids.append(sig.document_id)
neg_weight += w
if not pos_ids or not neg_ids:
return None
total = pos_weight + neg_weight
minority_pct = min(pos_weight, neg_weight) / total if total > 0 else 0.0
return DisagreementDetail(
dimension="sentiment",
positive_doc_ids=pos_ids,
negative_doc_ids=neg_ids,
positive_weight=round(pos_weight, 4),
negative_weight=round(neg_weight, 4),
description=(
f"Sentiment split: {len(pos_ids)} positive vs {len(neg_ids)} negative signals "
f"(minority weight ratio {minority_pct:.0%})"
),
)
def _detect_catalyst_disagreement(
signals: list[WeightedSignal],
catalyst_entries: list[CatalystEntry],
) -> list[DisagreementDetail]:
"""Detect when the same catalyst type has both positive and negative signals."""
# Build lookup: document_id → (sentiment_value, combined_weight)
sig_lookup: dict[str, tuple[float, float]] = {}
for sig in signals:
w = sig.weight.combined * sig.impact_score
if w > 0:
sig_lookup[sig.document_id] = (sig.sentiment_value, w)
# Group by catalyst type
from collections import defaultdict
catalyst_groups: dict[str, list[tuple[str, float, float]]] = defaultdict(list)
for entry in catalyst_entries:
if entry.document_id in sig_lookup:
sent_val, weight = sig_lookup[entry.document_id]
if sent_val != 0.0:
catalyst_groups[entry.catalyst_type].append(
(entry.document_id, sent_val, weight)
)
details: list[DisagreementDetail] = []
for catalyst, entries in catalyst_groups.items():
pos_ids = [doc_id for doc_id, sv, _ in entries if sv > 0]
neg_ids = [doc_id for doc_id, sv, _ in entries if sv < 0]
if not pos_ids or not neg_ids:
continue
pos_w = sum(w for _, sv, w in entries if sv > 0)
neg_w = sum(w for _, sv, w in entries if sv < 0)
details.append(DisagreementDetail(
dimension=f"catalyst:{catalyst}",
positive_doc_ids=pos_ids,
negative_doc_ids=neg_ids,
positive_weight=round(pos_w, 4),
negative_weight=round(neg_w, 4),
description=(
f"Catalyst '{catalyst}' has {len(pos_ids)} positive and "
f"{len(neg_ids)} negative signals"
),
))
return details
+141
View File
@@ -0,0 +1,141 @@
"""Evidence ranking for supporting and opposing documents.
Ranks document signals by a composite score that considers multiple
factors beyond raw weight, producing explainable evidence lists for
trend summaries.
Requirements: 6.5
"""
from __future__ import annotations
from dataclasses import dataclass
from services.aggregation.scoring import WeightedSignal
@dataclass(frozen=True)
class EvidenceRankConfig:
    """Tunable factors for the composite evidence ranking score."""

    # Share given to the combined signal weight
    # (recency * credibility * novelty * market).
    weight_factor: float = 0.40
    # Share given to the document's impact score.
    impact_factor: float = 0.30
    # Share given to recency on its own, favouring fresh evidence.
    recency_factor: float = 0.20
    # Share given to extraction confidence.
    confidence_factor: float = 0.10
    # Cap on evidence refs returned per side (supporting / opposing).
    max_refs: int = 10
# Shared default configuration; used as the default ``config`` argument of the
# ranking functions in this module.
DEFAULT_RANK_CONFIG = EvidenceRankConfig()
@dataclass
class RankedEvidence:
    """Composite ranking score for one document plus its per-factor breakdown."""

    document_id: str
    # Composite of the four components below.
    rank_score: float
    weight_component: float
    impact_component: float
    recency_component: float
    confidence_component: float
    # +1 / -1 / 0
    sentiment_value: float
def compute_evidence_rank(
    signal: WeightedSignal,
    config: EvidenceRankConfig = DEFAULT_RANK_CONFIG,
) -> RankedEvidence:
    """Score one signal for evidence ranking.

    The composite is the sum of four weighted terms:
      - the combined signal weight (recency decay, credibility, novelty,
        market context),
      - the raw impact score,
      - the recency weight on its own (an extra freshness boost),
      - extraction confidence, taken from the credibility component of
        the weight.

    All components are in [0, 1] so the composite is bounded by the sum
    of the factor weights. Every value in the result is rounded to six
    decimals.
    """
    weight = signal.weight
    components = {
        "weight_component": weight.combined * config.weight_factor,
        "impact_component": signal.impact_score * config.impact_factor,
        "recency_component": weight.recency * config.recency_factor,
        "confidence_component": weight.credibility * config.confidence_factor,
    }
    # Sum the unrounded components; rounding is applied only on output.
    composite = (
        components["weight_component"]
        + components["impact_component"]
        + components["recency_component"]
        + components["confidence_component"]
    )

    return RankedEvidence(
        document_id=signal.document_id,
        rank_score=round(composite, 6),
        sentiment_value=signal.sentiment_value,
        **{name: round(value, 6) for name, value in components.items()},
    )
def rank_evidence(
    signals: list[WeightedSignal],
    config: EvidenceRankConfig = DEFAULT_RANK_CONFIG,
) -> tuple[list[str], list[str]]:
    """Rank signals into top supporting and opposing document ID lists.

    Supporting = positive sentiment, Opposing = negative sentiment.
    Neutral/mixed signals are excluded.

    Delegates the filtering, scoring, sorting and capping to
    ``rank_evidence_detailed`` so both entry points always agree, then
    keeps only the document IDs.

    Returns:
        (supporting_ids, opposing_ids), each ordered by descending rank
        score and capped at ``config.max_refs``.
    """
    supporting, opposing = rank_evidence_detailed(signals, config)
    return (
        [ranked.document_id for ranked in supporting],
        [ranked.document_id for ranked in opposing],
    )
def rank_evidence_detailed(
    signals: list[WeightedSignal],
    config: EvidenceRankConfig = DEFAULT_RANK_CONFIG,
) -> tuple[list[RankedEvidence], list[RankedEvidence]]:
    """Rank signals and return the full RankedEvidence objects per side.

    Same selection as rank_evidence, but the score breakdown is kept for
    callers that need explainability. Signals with zero sentiment are
    dropped; each side is ordered by descending rank score and capped at
    ``config.max_refs``.
    """
    supporting: list[RankedEvidence] = []
    opposing: list[RankedEvidence] = []

    for signal in signals:
        if signal.sentiment_value == 0.0:
            continue
        scored = compute_evidence_rank(signal, config)
        bucket = supporting if signal.sentiment_value > 0 else opposing
        bucket.append(scored)

    limit = config.max_refs

    def _top(items: list[RankedEvidence]) -> list[RankedEvidence]:
        return sorted(items, key=lambda item: item.rank_score, reverse=True)[:limit]

    return (_top(supporting), _top(opposing))
+57
View File
@@ -0,0 +1,57 @@
"""Aggregation worker entrypoint - polls Redis for aggregation jobs."""
from __future__ import annotations
import asyncio
import json
import logging
import asyncpg
from services.aggregation.worker import aggregate_company
from services.shared.config import load_config
from services.shared.logging import setup_logging
from services.shared.redis_keys import QUEUE_AGGREGATION, queue_key
logger = logging.getLogger("aggregation_main")
def _parse_job_ticker(raw: object) -> str | None:
    """Return the ticker named by a raw queue payload, or None if it is unusable."""
    try:
        job = json.loads(raw)  # type: ignore[arg-type]
    except (TypeError, ValueError):
        # ValueError covers both JSONDecodeError and UnicodeDecodeError.
        return None
    if not isinstance(job, dict):
        return None
    ticker = job.get("ticker")
    if not isinstance(ticker, str) or not ticker:
        return None
    return ticker


async def main() -> None:
    """Poll the aggregation queue and run one company aggregation per job.

    Jobs are JSON objects popped from the Redis list named by
    ``queue_key(QUEUE_AGGREGATION)``; each must carry a non-empty string
    ``ticker``. Malformed payloads are logged and skipped so a single bad
    message cannot stop the worker. A failure inside ``aggregate_company``
    is logged and the loop carries on with the next job. The PostgreSQL
    pool and the Redis client are closed when the loop exits for any
    reason.
    """
    config = load_config()
    setup_logging("aggregation", level=config.log_level, json_output=config.json_logs)

    import redis.asyncio as aioredis

    pool = await asyncpg.create_pool(dsn=config.postgres.dsn, min_size=2, max_size=8)
    redis_client = None
    try:
        # Created inside the try so the pool is still closed if this fails.
        redis_client = aioredis.from_url(config.redis.url)
        queue = queue_key(QUEUE_AGGREGATION)
        logger.info("Aggregation worker started, polling %s", queue)

        while True:
            raw = await redis_client.lpop(queue)
            if raw is None:
                # Queue is empty: back off briefly before polling again.
                await asyncio.sleep(1)
                continue

            ticker = _parse_job_ticker(raw)
            if ticker is None:
                logger.warning("Skipping malformed aggregation job: %r", raw)
                continue

            logger.info("Processing aggregation job for %s", ticker)
            try:
                summaries = await aggregate_company(pool, ticker)
                logger.info(
                    "Aggregation complete for %s: %d windows",
                    ticker, len(summaries),
                )
            except Exception:
                # Top-level boundary: record the failure and keep serving jobs.
                logger.exception("Aggregation failed for %s", ticker)
    finally:
        try:
            await pool.close()
        finally:
            # Runs even if closing the pool raised.
            if redis_client is not None:
                await redis_client.close()
# Run the polling worker when this module is executed as a script.
if __name__ == "__main__":
    asyncio.run(main())
+150
View File
@@ -0,0 +1,150 @@
"""Market context feature computation for aggregation windows.
Fetches recent market snapshots from PostgreSQL and computes context
features (price change, volume trend, volatility) that enrich trend
summaries and modulate signal weighting.
Requirements: 6.1, 6.2
"""
from __future__ import annotations
import math
from datetime import datetime, timedelta, timezone
from typing import Any
import asyncpg
from services.shared.schemas import MarketContext, TrendWindow
# Map TrendWindow values to lookback durations in days. Every window except
# intraday looks back further than its nominal length (e.g. 8 days for the
# seven-day window). fetch_market_context falls back to 8 days for a window
# that is not listed here.
WINDOW_LOOKBACK_DAYS: dict[str, int] = {
    TrendWindow.INTRADAY.value: 1,
    TrendWindow.ONE_DAY.value: 2,
    TrendWindow.SEVEN_DAY.value: 8,
    TrendWindow.THIRTY_DAY.value: 35,
    TrendWindow.NINETY_DAY.value: 95,
}
async def fetch_market_context(
    pool: asyncpg.Pool,
    ticker: str,
    window: str,
    reference_time: datetime | None = None,
) -> MarketContext:
    """Load recent market bars for *ticker* and summarise them for *window*.

    Rows are read from ``market_snapshots`` between the window's lookback
    start and *reference_time* (the current UTC time when omitted), oldest
    first. The resulting context carries:
      - price_change_pct: (last_close - first_close) / first_close
      - avg_volume: mean volume across bars
      - volume_change_pct: second-half avg volume vs first-half avg volume
      - volatility: std-dev of close prices
      - latest_close / latest_bar_at

    Returns:
        A MarketContext built from the bars, or ``MarketContext(ticker=ticker)``
        when no rows or no usable bars are found.
    """
    window_end = (
        reference_time if reference_time is not None else datetime.now(timezone.utc)
    )
    window_start = window_end - timedelta(days=WINDOW_LOOKBACK_DAYS.get(window, 8))

    rows = await pool.fetch(
        """
SELECT data, captured_at
FROM market_snapshots
WHERE ticker = $1
AND captured_at >= $2
AND captured_at <= $3
ORDER BY captured_at ASC
""",
        ticker,
        window_start,
        window_end,
    )

    # Skip bar extraction entirely when the query returned nothing.
    bars = _extract_bars(rows) if rows else []
    if not bars:
        return MarketContext(ticker=ticker)
    return _compute_context(ticker, bars)
def _extract_bars(rows: list[Any]) -> list[dict[str, Any]]:
"""Extract OHLCV bar dicts from market_snapshot rows.
The ``data`` column is JSONB. Polygon prev-day bars store fields like
``o``, ``h``, ``l``, ``c``, ``v``, ``t``. We normalise to a common
dict with ``close``, ``volume``, ``captured_at``.
"""
bars: list[dict[str, Any]] = []
for row in rows:
data = row["data"]
if isinstance(data, str):
import json
data = json.loads(data)
# Polygon-style single bar or list of bars
items = data if isinstance(data, list) else [data]
for item in items:
close = item.get("c") or item.get("close")
volume = item.get("v") or item.get("volume")
if close is not None:
bars.append({
"close": float(close),
"volume": float(volume) if volume is not None else 0.0,
"captured_at": row["captured_at"],
})
return bars
def _compute_context(ticker: str, bars: list[dict[str, Any]]) -> MarketContext:
"""Derive market context features from a sorted list of bar dicts."""
closes = [b["close"] for b in bars]
volumes = [b["volume"] for b in bars]
first_close = closes[0]
last_close = closes[-1]
price_change_pct = (
((last_close - first_close) / first_close * 100.0)
if first_close != 0
else 0.0
)
avg_volume = sum(volumes) / len(volumes) if volumes else 0.0
# Volume trend: compare second half to first half
mid = len(volumes) // 2
if mid > 0:
first_half_avg = sum(volumes[:mid]) / mid
second_half_avg = sum(volumes[mid:]) / len(volumes[mid:])
volume_change_pct = (
((second_half_avg - first_half_avg) / first_half_avg * 100.0)
if first_half_avg > 0
else 0.0
)
else:
volume_change_pct = 0.0
# Volatility: std dev of closes
if len(closes) > 1:
mean_close = sum(closes) / len(closes)
variance = sum((c - mean_close) ** 2 for c in closes) / len(closes)
volatility = math.sqrt(variance)
else:
volatility = 0.0
return MarketContext(
ticker=ticker,
price_change_pct=round(price_change_pct, 4),
avg_volume=round(avg_volume, 2),
volume_change_pct=round(volume_change_pct, 4),
volatility=round(volatility, 6),
latest_close=last_close,
latest_bar_at=bars[-1]["captured_at"],
bars_available=len(bars),
)
+439
View File
@@ -0,0 +1,439 @@
"""Sector and market-level rollup aggregation.
Aggregates company-level trend summaries into sector and market-level
summaries, enabling top-down views of sentiment and risk across the
portfolio.
Requirements: 6.3, 6.4, 6.5
"""
from __future__ import annotations
import json
import logging
from dataclasses import dataclass
from datetime import datetime, timedelta, timezone
import asyncpg
from services.shared.schemas import (
DisagreementDetail,
TrendDirection,
TrendSummary,
TrendWindow,
)
logger = logging.getLogger(__name__)
@dataclass
class CompanyTrendRow:
    """One company-level trend summary, as loaded from the DB for rollup."""

    # Ticker of the company.
    entity_id: str
    sector: str
    window: str
    trend_direction: str
    trend_strength: float
    # Also used as the company's weight when trends are rolled up.
    confidence: float
    contradiction_score: float
    dominant_catalysts: list[str]
    material_risks: list[str]
    top_supporting_evidence: list[str]
    top_opposing_evidence: list[str]
# ---------------------------------------------------------------------------
# Fetch latest company trends for a given window
# ---------------------------------------------------------------------------
# Newest trend row per company for one window: DISTINCT ON (tw.entity_id)
# together with ORDER BY tw.entity_id, tw.generated_at DESC keeps only the
# most recent row for each ticker. $1 is the window, $2 the earliest
# generated_at accepted. Only companies with active = TRUE are joined in.
_LATEST_COMPANY_TRENDS_QUERY = """
SELECT DISTINCT ON (tw.entity_id)
tw.entity_id,
c.sector,
tw.window,
tw.trend_direction,
tw.trend_strength,
tw.confidence,
tw.contradiction_score,
tw.dominant_catalysts,
tw.material_risks,
tw.top_supporting_evidence,
tw.top_opposing_evidence
FROM trend_windows tw
JOIN companies c ON c.ticker = tw.entity_id AND c.active = TRUE
WHERE tw.entity_type = 'company'
AND tw.window = $1
AND tw.generated_at >= $2
ORDER BY tw.entity_id, tw.generated_at DESC
"""
def _parse_jsonb_list(val: object) -> list[str]:
"""Safely parse a JSONB column that should be a list of strings."""
if isinstance(val, list):
return [str(v) for v in val]
if isinstance(val, str):
parsed = json.loads(val)
if isinstance(parsed, list):
return [str(v) for v in parsed]
return []
def _parse_company_trend_row(row: object) -> CompanyTrendRow:
    """Build a CompanyTrendRow from a mapping-like database row.

    NULL text columns become an empty string ("Unknown" for the sector,
    which also replaces an empty sector), NULL numeric columns become 0.0,
    and the JSONB list columns go through ``_parse_jsonb_list``.

    Raises:
        TypeError: If *row* does not support item access.
    """
    # Rows are untyped here, so item access is resolved dynamically.
    lookup = getattr(row, "__getitem__", None)
    if lookup is None:
        raise TypeError(f"Expected a mapping-like row, got {type(row)}")

    def text(column: str, fallback: str = "") -> str:
        raw = lookup(column)
        return fallback if raw is None else str(raw)

    def number(column: str) -> float:
        raw = lookup(column)
        return 0.0 if raw is None else float(raw)

    def str_list(column: str) -> list[str]:
        return _parse_jsonb_list(lookup(column))

    return CompanyTrendRow(
        entity_id=text("entity_id"),
        sector=text("sector", "Unknown") or "Unknown",
        window=text("window"),
        trend_direction=text("trend_direction"),
        trend_strength=number("trend_strength"),
        confidence=number("confidence"),
        contradiction_score=number("contradiction_score"),
        dominant_catalysts=str_list("dominant_catalysts"),
        material_risks=str_list("material_risks"),
        top_supporting_evidence=str_list("top_supporting_evidence"),
        top_opposing_evidence=str_list("top_opposing_evidence"),
    )
async def fetch_latest_company_trends(
    pool: asyncpg.Pool,
    window: str,
    since: datetime,
) -> list[CompanyTrendRow]:
    """Load the newest company-level trend per ticker for *window*.

    Only trends generated at or after *since* are considered.
    """
    records = await pool.fetch(_LATEST_COMPANY_TRENDS_QUERY, window, since)
    return list(map(_parse_company_trend_row, records))
# ---------------------------------------------------------------------------
# Pure rollup logic
# ---------------------------------------------------------------------------
# Direction mapping for numeric aggregation: bullish and bearish pull the
# weighted average towards +1 / -1, while mixed and neutral contribute 0.
_DIRECTION_VALUES = {
    TrendDirection.BULLISH.value: 1.0,
    TrendDirection.BEARISH.value: -1.0,
    TrendDirection.MIXED.value: 0.0,
    TrendDirection.NEUTRAL.value: 0.0,
}
# Cut-offs applied to the averaged direction value by _derive_rollup_direction:
# at or above BULLISH_THRESHOLD is bullish, at or below BEARISH_THRESHOLD is
# bearish.
BULLISH_THRESHOLD = 0.15
BEARISH_THRESHOLD = -0.15
def rollup_trends(
    trends: list[CompanyTrendRow],
    entity_type: str,
    entity_id: str,
    window: str,
    reference_time: datetime,
) -> TrendSummary:
    """Combine company-level trends into one rollup summary.

    Every company is weighted by its confidence, giving confidence-weighted
    averages of direction, strength and contradiction. The rollup's own
    confidence is the mean company confidence. Catalysts are ranked by
    accumulated confidence; risks are de-duplicated case-insensitively and
    ranked by the highest confidence that reported them. At most five of
    each are kept, and at most ten de-duplicated evidence IDs per side.

    Returns:
        The rollup TrendSummary, or a neutral zero-strength summary when
        *trends* is empty or carries no confidence at all.
    """

    def _neutral_summary() -> TrendSummary:
        return TrendSummary(
            entity_type=entity_type,
            entity_id=entity_id,
            window=TrendWindow(window),
            trend_direction=TrendDirection.NEUTRAL,
            trend_strength=0.0,
            confidence=0.0,
            generated_at=reference_time,
        )

    def _top_keys(scores: dict[str, float], limit: int = 5) -> list[str]:
        ranked = sorted(scores.items(), key=lambda pair: pair[1], reverse=True)
        return [name for name, _ in ranked[:limit]]

    if not trends:
        return _neutral_summary()

    confidence_sum = 0.0
    direction_sum = 0.0
    strength_sum = 0.0
    contradiction_sum = 0.0
    catalyst_scores: dict[str, float] = {}
    risk_scores: dict[str, float] = {}
    supporting_docs: list[str] = []
    opposing_docs: list[str] = []

    for trend in trends:
        conf = trend.confidence
        confidence_sum += conf

        direction_sum += conf * _DIRECTION_VALUES.get(trend.trend_direction, 0.0)
        strength_sum += conf * trend.trend_strength
        contradiction_sum += conf * trend.contradiction_score

        for catalyst in trend.dominant_catalysts:
            catalyst_scores[catalyst] = catalyst_scores.get(catalyst, 0.0) + conf

        for risk in trend.material_risks:
            key = risk.strip().lower()
            # Keep the highest confidence seen for each normalised risk.
            if key in risk_scores:
                risk_scores[key] = max(risk_scores[key], conf)
            else:
                risk_scores[key] = conf

        supporting_docs += trend.top_supporting_evidence
        opposing_docs += trend.top_opposing_evidence

    if confidence_sum == 0.0:
        return _neutral_summary()

    avg_direction = direction_sum / confidence_sum
    avg_strength = strength_sum / confidence_sum
    avg_contradiction = contradiction_sum / confidence_sum
    avg_confidence = confidence_sum / len(trends)

    direction = _derive_rollup_direction(avg_direction, avg_contradiction)
    catalysts = _top_keys(catalyst_scores)
    risks = _top_keys(risk_scores)
    disagreement = _build_rollup_disagreement(trends, entity_id)

    return TrendSummary(
        entity_type=entity_type,
        entity_id=entity_id,
        window=TrendWindow(window),
        trend_direction=direction,
        trend_strength=round(min(abs(avg_strength), 1.0), 4),
        confidence=round(max(0.0, min(avg_confidence, 1.0)), 4),
        # dict.fromkeys de-duplicates while preserving first-seen order.
        top_supporting_evidence=list(dict.fromkeys(supporting_docs))[:10],
        top_opposing_evidence=list(dict.fromkeys(opposing_docs))[:10],
        dominant_catalysts=catalysts,
        material_risks=risks,
        contradiction_score=round(max(0.0, min(avg_contradiction, 1.0)), 4),
        disagreement_details=disagreement,
        generated_at=reference_time,
    )
def _derive_rollup_direction(
    avg_direction: float,
    avg_contradiction: float,
) -> TrendDirection:
    """Classify an averaged direction value as a TrendDirection.

    A weak direction (|value| < 0.3) combined with contradiction above
    0.10 is reported as MIXED. Otherwise the bullish/bearish thresholds
    decide, and anything in between is NEUTRAL.
    """
    contested = avg_contradiction > 0.10 and abs(avg_direction) < 0.3
    if contested:
        return TrendDirection.MIXED

    if avg_direction >= BULLISH_THRESHOLD:
        verdict = TrendDirection.BULLISH
    elif avg_direction <= BEARISH_THRESHOLD:
        verdict = TrendDirection.BEARISH
    else:
        verdict = TrendDirection.NEUTRAL
    return verdict
def _build_rollup_disagreement(
    trends: list[CompanyTrendRow],
    entity_id: str,
) -> list[DisagreementDetail]:
    """Describe the bullish-versus-bearish company split inside a rollup.

    Returns a single-element list when at least one company is bullish and
    at least one is bearish, otherwise an empty list. Each side's weight is
    the sum of its companies' confidence.
    """
    bullish_label = TrendDirection.BULLISH.value
    bearish_label = TrendDirection.BEARISH.value

    bullish = [t for t in trends if t.trend_direction == bullish_label]
    bearish = [t for t in trends if t.trend_direction == bearish_label]
    if not (bullish and bearish):
        return []

    bullish_ids = [t.entity_id for t in bullish]
    bearish_ids = [t.entity_id for t in bearish]
    bullish_weight = sum((t.confidence for t in bullish), 0.0)
    bearish_weight = sum((t.confidence for t in bearish), 0.0)

    detail = DisagreementDetail(
        dimension="company_direction",
        positive_doc_ids=bullish_ids,
        negative_doc_ids=bearish_ids,
        positive_weight=round(bullish_weight, 4),
        negative_weight=round(bearish_weight, 4),
        description=(
            f"{entity_id}: {len(bullish_ids)} bullish vs "
            f"{len(bearish_ids)} bearish companies"
        ),
    )
    return [detail]
# ---------------------------------------------------------------------------
# Persist rollup (reuses the same trend_windows table)
# ---------------------------------------------------------------------------
# Insert statement for one trend_windows row; returns the new row's id.
# NOTE: despite the name this is a plain INSERT - there is no ON CONFLICT
# clause, so each call adds a new row. Parameters $7-$10, $12 and $13 are
# cast to jsonb and are therefore passed as JSON text.
_UPSERT_TREND = """
INSERT INTO trend_windows (
entity_type, entity_id, window, trend_direction, trend_strength,
confidence, top_supporting_evidence, top_opposing_evidence,
dominant_catalysts, material_risks, contradiction_score,
disagreement_details, market_context, generated_at
) VALUES (
$1, $2, $3, $4, $5,
$6, $7::jsonb, $8::jsonb,
$9::jsonb, $10::jsonb, $11,
$12::jsonb, $13::jsonb, $14
)
RETURNING id
"""
async def persist_rollup(
    pool: asyncpg.Pool,
    summary: TrendSummary,
) -> str:
    """Store a rollup trend summary and return the new row's id as a string.

    List fields and disagreement details are serialised to JSON text; the
    market context column is always written as an empty JSON object.
    """
    disagreement_payload = [d.model_dump() for d in summary.disagreement_details]
    params = (
        summary.entity_type,
        summary.entity_id,
        summary.window.value,
        summary.trend_direction.value,
        summary.trend_strength,
        summary.confidence,
        json.dumps(summary.top_supporting_evidence),
        json.dumps(summary.top_opposing_evidence),
        json.dumps(summary.dominant_catalysts),
        json.dumps(summary.material_risks),
        summary.contradiction_score,
        json.dumps(disagreement_payload),
        json.dumps({}),
        summary.generated_at,
    )
    record = await pool.fetchrow(_UPSERT_TREND, *params)
    return str(record["id"])  # type: ignore[index]
# ---------------------------------------------------------------------------
# High-level rollup entry points
# ---------------------------------------------------------------------------
async def aggregate_sector(
    pool: asyncpg.Pool,
    sector: str,
    window: str,
    reference_time: datetime | None = None,
    since: datetime | None = None,
) -> TrendSummary:
    """Build, and when data exists persist, the rollup for one sector and window.

    The latest company trends for *window* are loaded, narrowed to
    *sector*, and rolled up. *reference_time* defaults to the current UTC
    time and *since* to the window's lookback before it. Nothing is
    persisted when the sector has no company trends; the neutral summary
    is still returned.
    """
    now = datetime.now(timezone.utc) if reference_time is None else reference_time
    cutoff = (now - _window_lookback(window)) if since is None else since

    company_trends = await fetch_latest_company_trends(pool, window, cutoff)
    sector_trends = [trend for trend in company_trends if trend.sector == sector]
    summary = rollup_trends(sector_trends, "sector", sector, window, now)

    if not sector_trends:
        return summary

    rollup_id = await persist_rollup(pool, summary)
    logger.info(
        "Persisted sector rollup %s for %s/%s: direction=%s strength=%.3f companies=%d",
        rollup_id, sector, window, summary.trend_direction.value,
        summary.trend_strength, len(sector_trends),
    )
    return summary
async def aggregate_market(
    pool: asyncpg.Pool,
    window: str,
    reference_time: datetime | None = None,
    since: datetime | None = None,
) -> TrendSummary:
    """Build, and when data exists persist, the market-wide rollup for one window.

    All latest company trends are rolled up together regardless of sector
    under the entity id ``"all"``. *reference_time* defaults to the current
    UTC time and *since* to the window's lookback before it. Nothing is
    persisted when there are no company trends.
    """
    now = datetime.now(timezone.utc) if reference_time is None else reference_time
    cutoff = (now - _window_lookback(window)) if since is None else since

    company_trends = await fetch_latest_company_trends(pool, window, cutoff)
    summary = rollup_trends(company_trends, "market", "all", window, now)

    if not company_trends:
        return summary

    rollup_id = await persist_rollup(pool, summary)
    logger.info(
        "Persisted market rollup %s for %s: direction=%s strength=%.3f companies=%d",
        rollup_id, window, summary.trend_direction.value,
        summary.trend_strength, len(company_trends),
    )
    return summary
async def aggregate_all_sectors(
    pool: asyncpg.Pool,
    window: str,
    reference_time: datetime | None = None,
    since: datetime | None = None,
) -> list[TrendSummary]:
    """Compute and persist a sector rollup for every sector that has company trends.

    Args:
        pool: Connection pool used for both the read and the inserts.
        window: Trend window identifier.
        reference_time: Anchor time stamped on the rollups; defaults to
            the current UTC time.
        since: Earliest company trend to consider; defaults to the
            window's lookback before *reference_time*.

    Returns:
        One persisted TrendSummary per sector, in first-seen sector order.
        Empty when no company trends were found.
    """
    if reference_time is None:
        reference_time = datetime.now(timezone.utc)
    if since is None:
        since = reference_time - _window_lookback(window)

    all_trends = await fetch_latest_company_trends(pool, window, since)

    # Group by sector. Every group holds at least one trend by construction,
    # so each sector below is always persisted.
    sectors: dict[str, list[CompanyTrendRow]] = {}
    for t in all_trends:
        sectors.setdefault(t.sector, []).append(t)

    summaries: list[TrendSummary] = []
    for sector, trends in sectors.items():
        summary = rollup_trends(trends, "sector", sector, window, reference_time)
        rollup_id = await persist_rollup(pool, summary)
        # Same log line as aggregate_sector, so both paths are traceable.
        logger.info(
            "Persisted sector rollup %s for %s/%s: direction=%s strength=%.3f companies=%d",
            rollup_id, sector, window, summary.trend_direction.value,
            summary.trend_strength, len(trends),
        )
        summaries.append(summary)
    return summaries
def _window_lookback(window: str) -> timedelta:
    """Return how far back to search for recent company trends for *window*.

    Unrecognised windows fall back to eight days.
    """
    lookbacks = {
        member.value: span
        for member, span in (
            (TrendWindow.INTRADAY, timedelta(hours=24)),
            (TrendWindow.ONE_DAY, timedelta(days=2)),
            (TrendWindow.SEVEN_DAY, timedelta(days=8)),
            (TrendWindow.THIRTY_DAY, timedelta(days=35)),
            (TrendWindow.NINETY_DAY, timedelta(days=95)),
        )
    }
    try:
        return lookbacks[window]
    except KeyError:
        return timedelta(days=8)
+285
View File
@@ -0,0 +1,285 @@
"""Recency decay, source credibility weighting, and market context
integration for aggregation.
Provides scoring functions used by the aggregation engine to weight
document intelligence signals when computing trend summaries.
Requirements: 6.1, 6.2, 6.5
"""
from __future__ import annotations
import math
from dataclasses import dataclass, field
from datetime import datetime, timezone
from services.shared.schemas import MarketContext
@dataclass(frozen=True)
class ScoringConfig:
    """Knobs that control how document signals are weighted."""

    # --- Recency decay ---
    # Exponential half-life, in hours, for each trend window. A document
    # that is one half-life old has a recency weight of 0.5.
    half_life_hours: dict[str, float] = field(
        default_factory=lambda: {
            "intraday": 2.0,
            "1d": 12.0,
            "7d": 72.0,
            "30d": 240.0,
            "90d": 720.0,
        }
    )
    # Floor for the recency weight, so very old documents still contribute
    # a trace-level signal instead of being zeroed out.
    min_recency_weight: float = 0.01

    # --- Source credibility ---
    # Credibility scores are clamped into [floor, ceiling] before weighting.
    credibility_floor: float = 0.1
    credibility_ceiling: float = 1.0
    # Exponent applied to the clamped credibility. Above 1 punishes
    # low-credibility sources harder; below 1 flattens the curve.
    credibility_exponent: float = 1.0

    # --- Novelty ---
    # Largest bonus added on top of the base weight. A novelty_score of
    # 1.0 earns the full bonus, 0.0 earns none.
    novelty_bonus_max: float = 0.25

    # --- Confidence gate ---
    # Documents whose extraction confidence is below this value get zero
    # weight: they are too unreliable to aggregate.
    confidence_floor: float = 0.2

    # --- Market context modulation ---
    # Above this volatility (in price units) recency signals are amplified,
    # since fresh data matters more in fast-moving markets.
    volatility_recency_boost_threshold: float = 1.0
    # Upper bound on the extra multiplier from volatility.
    volatility_recency_boost_max: float = 0.30
    # A volume change above this percentage earns signals a small boost,
    # because high-volume moves carry more conviction.
    volume_surge_threshold_pct: float = 50.0
    volume_surge_boost: float = 0.15
# Singleton default config; used as the default ``config`` argument of the
# scoring functions in this module.
DEFAULT_CONFIG = ScoringConfig()
# ---------------------------------------------------------------------------
# Recency decay
# ---------------------------------------------------------------------------
def recency_weight(
    published_at: datetime,
    reference_time: datetime,
    window: str,
    config: ScoringConfig = DEFAULT_CONFIG,
) -> float:
    """Return the exponential recency decay weight of a document.

    The weight is ``2 ** (-age_hours / half_life)``, where the half-life
    is looked up per window (72 hours for an unknown window).

    Args:
        published_at: Publication time of the document.
        reference_time: The "now" anchor of the aggregation window.
        window: One of the TrendWindow values (e.g. "7d").
        config: Scoring parameters.

    Returns:
        A weight in [config.min_recency_weight, 1.0]; exactly 1.0 for a
        document published at or after *reference_time*.
    """

    def _assume_utc(moment: datetime) -> datetime:
        # Naive datetimes are interpreted as UTC.
        if moment.tzinfo is None:
            return moment.replace(tzinfo=timezone.utc)
        return moment

    elapsed_seconds = (
        _assume_utc(reference_time) - _assume_utc(published_at)
    ).total_seconds()
    if elapsed_seconds <= 0:
        return 1.0

    elapsed_hours = elapsed_seconds / 3600.0
    half_life = config.half_life_hours.get(window, 72.0)
    decayed = math.pow(2.0, -elapsed_hours / half_life)
    return max(decayed, config.min_recency_weight)
# ---------------------------------------------------------------------------
# Source credibility weighting
# ---------------------------------------------------------------------------
def credibility_weight(
    source_credibility: float,
    config: ScoringConfig = DEFAULT_CONFIG,
) -> float:
    """Turn a source credibility score into a weight.

    The score is first clamped into ``[credibility_floor,
    credibility_ceiling]`` and then raised to ``credibility_exponent``.

    Args:
        source_credibility: Credibility score (0-1) from the source or
            the document intelligence record.
        config: Scoring parameters.

    Returns:
        A weight in [floor^exp, ceiling^exp].
    """
    lower = config.credibility_floor
    upper = config.credibility_ceiling
    bounded = max(lower, min(source_credibility, upper))
    return math.pow(bounded, config.credibility_exponent)
# ---------------------------------------------------------------------------
# Market context adjustment
# ---------------------------------------------------------------------------
def market_context_multiplier(
    market_ctx: MarketContext | None,
    config: ScoringConfig = DEFAULT_CONFIG,
) -> float:
    """Return the multiplier (>= 1.0) that market conditions apply to signal weights.

    Two independent boosts can add up:
      - volatility above the configured threshold, scaled logarithmically
        and capped at ``volatility_recency_boost_max``;
      - a volume change above ``volume_surge_threshold_pct``, worth a flat
        ``volume_surge_boost``.

    Returns 1.0 when there is no market context or it holds no data.
    """
    if market_ctx is None or not market_ctx.has_data:
        return 1.0

    boost = 0.0

    # More volatile markets make recent signals more valuable.
    volatility = market_ctx.volatility
    vol_threshold = config.volatility_recency_boost_threshold
    if volatility is not None and volatility > vol_threshold:
        excess = volatility - vol_threshold
        # log1p keeps extreme volatility from blowing up the weight.
        scaled = math.log1p(excess) * 0.15
        boost += min(scaled, config.volatility_recency_boost_max)

    volume_change = market_ctx.volume_change_pct
    if volume_change is not None and volume_change > config.volume_surge_threshold_pct:
        boost += config.volume_surge_boost

    return 1.0 + boost
# ---------------------------------------------------------------------------
# Combined document signal weight
# ---------------------------------------------------------------------------
@dataclass
class SignalWeight:
    """Component-by-component record of one document's aggregation weight.

    Attributes:
        recency: Recency factor of the document.
        credibility: Source-credibility factor.
        novelty_bonus: Additive novelty bonus (applied as ``1 + bonus``).
        confidence_gate: ``1.0`` when the document passes the extraction
            confidence floor, ``0.0`` when it is gated out.
        market_ctx_multiplier: Market-context amplification factor.
        combined: Final weight obtained from the components above.
    """

    recency: float
    credibility: float
    novelty_bonus: float
    confidence_gate: float  # either 0.0 (gated out) or 1.0
    market_ctx_multiplier: float  # expected to be >= 1.0
    combined: float
def compute_signal_weight(
    published_at: datetime,
    reference_time: datetime,
    window: str,
    source_credibility: float,
    novelty_score: float = 0.5,
    extraction_confidence: float = 0.5,
    market_ctx: MarketContext | None = None,
    config: ScoringConfig = DEFAULT_CONFIG,
) -> SignalWeight:
    """Build the full weight breakdown for one document signal.

    The combined weight is the product::

        confidence_gate * recency * credibility
            * (1 + novelty_bonus) * market_ctx_multiplier

    with ``novelty_bonus = novelty_score * config.novelty_bonus_max``.
    A document whose ``extraction_confidence`` is below
    ``config.confidence_floor`` gets a gate of ``0.0`` and therefore a
    combined weight of ``0.0``; the other components are still reported.

    Args:
        published_at: Document publication time.
        reference_time: Aggregation anchor time.
        window: Trend window identifier.
        source_credibility: Source credibility score.
        novelty_score: Document novelty score.
        extraction_confidence: Extraction confidence reported by the model.
        market_ctx: Optional market context features for the symbol.
        config: Scoring parameters.

    Returns:
        A ``SignalWeight`` holding each component and the combined weight.
    """
    passes_gate = extraction_confidence >= config.confidence_floor
    gate = 1.0 if passes_gate else 0.0

    recency = recency_weight(published_at, reference_time, window, config)
    credibility = credibility_weight(source_credibility, config)
    novelty_bonus = novelty_score * config.novelty_bonus_max
    market_multiplier = market_context_multiplier(market_ctx, config)

    return SignalWeight(
        recency=recency,
        credibility=credibility,
        novelty_bonus=novelty_bonus,
        confidence_gate=gate,
        market_ctx_multiplier=market_multiplier,
        combined=gate * recency * credibility * (1.0 + novelty_bonus) * market_multiplier,
    )
# ---------------------------------------------------------------------------
# Batch helpers
# ---------------------------------------------------------------------------
@dataclass
class WeightedSignal:
    """One document's signal together with the weight computed for it.

    Attributes:
        document_id: Identifier of the document the signal came from.
        weight: Weight breakdown computed for the document.
        sentiment_value: Numeric sentiment: +1 positive, -1 negative,
            0 neutral or mixed.
        impact_score: Impact score of the document for the entity.
    """

    document_id: str
    weight: SignalWeight
    sentiment_value: float  # +1 positive, -1 negative, 0 neutral/mixed
    impact_score: float
def sentiment_to_numeric(sentiment: str) -> float:
    """Map a sentiment label to a signed numeric value.

    Matching ignores case and surrounding whitespace, so a label such as
    ``" Positive\\n"`` is scored as positive instead of silently falling
    through to neutral.

    Args:
        sentiment: Sentiment label, e.g. ``"positive"`` or ``"negative"``.

    Returns:
        ``1.0`` for positive, ``-1.0`` for negative and ``0.0`` for
        neutral, mixed or any unrecognised label.
    """
    mapping = {
        "positive": 1.0,
        "negative": -1.0,
        "neutral": 0.0,
        "mixed": 0.0,
    }
    # Normalise before the lookup: stray whitespace would otherwise turn a
    # directional label into a neutral 0.0.
    return mapping.get(sentiment.strip().lower(), 0.0)
def weighted_sentiment_average(signals: list[WeightedSignal]) -> float:
    """Average the signals' sentiment, weighting each by its strength.

    A signal's effective weight is ``weight.combined * impact_score``.

    Args:
        signals: Weighted signals to average.

    Returns:
        The weighted mean of ``sentiment_value``, or ``0.0`` when the total
        effective weight is zero (including an empty list). The result lies
        in [-1, 1] when effective weights are non-negative and sentiment
        values lie in [-1, 1].
    """
    numerator = 0.0
    denominator = 0.0
    for signal in signals:
        effective_weight = signal.weight.combined * signal.impact_score
        numerator += effective_weight * signal.sentiment_value
        denominator += effective_weight
    # A zero denominator means nothing contributed; report neutral.
    return numerator / denominator if denominator != 0.0 else 0.0
+650 -1
View File
@@ -1 +1,650 @@
"""Aggregation worker - rolling trend summaries, contradiction detection, evidence ranking."""
"""Aggregation worker - company-level rolling window trend summaries.
Queries document intelligence and market context for a given ticker,
computes weighted signal scores, and produces TrendSummary objects
persisted to the trend_windows table.
Requirements: 6.1, 6.2, 6.5
"""
from __future__ import annotations
import json
import logging
import time
from dataclasses import dataclass
from datetime import datetime, timedelta, timezone
from typing import Any
import asyncpg
from services.aggregation.contradiction import CatalystEntry, detect_contradictions
from services.aggregation.evidence import (
EvidenceRankConfig,
RankedEvidence,
rank_evidence as _rank_evidence_composite,
rank_evidence_detailed,
)
from services.aggregation.market_context import fetch_market_context
from services.aggregation.scoring import (
ScoringConfig,
WeightedSignal,
compute_signal_weight,
sentiment_to_numeric,
weighted_sentiment_average,
)
from services.shared.schemas import TrendDirection, TrendSummary, TrendWindow
from services.shared.metrics import (
AGGREGATION_CONTRADICTION_SCORE,
AGGREGATION_DURATION,
AGGREGATION_SIGNALS_PROCESSED,
AGGREGATION_WINDOWS_COMPUTED,
)
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
# Map TrendWindow values to lookback durations.
# aggregate_company_window subtracts the duration from its reference time to
# obtain the start of the query range, and falls back to 7 days for a key
# that is missing from this table.
WINDOW_DURATIONS: dict[str, timedelta] = {
    TrendWindow.INTRADAY.value: timedelta(hours=12),
    TrendWindow.ONE_DAY.value: timedelta(days=1),
    TrendWindow.SEVEN_DAY.value: timedelta(days=7),
    TrendWindow.THIRTY_DAY.value: timedelta(days=30),
    TrendWindow.NINETY_DAY.value: timedelta(days=90),
}
# How many evidence document IDs to keep in supporting/opposing lists.
# Serves as the default for AggregationConfig.max_evidence and for the
# max_refs / max_evidence parameters of the ranking and assembly helpers.
MAX_EVIDENCE_REFS = 10
@dataclass
class AggregationConfig:
    """Settings for one aggregation run: windows, scoring and evidence cap.

    Attributes:
        windows: Window identifiers to compute; ``None`` or an empty list
            selects every ``TrendWindow`` member.
        scoring: Scoring parameters; when unset a default ``ScoringConfig()``
            is created on demand.
        max_evidence: Maximum number of evidence references to keep.
    """

    windows: list[str] | None = None  # None = all windows
    scoring: ScoringConfig | None = None
    max_evidence: int = MAX_EVIDENCE_REFS

    def effective_windows(self) -> list[str]:
        """Return the configured windows, or all ``TrendWindow`` values."""
        if not self.windows:
            return [member.value for member in TrendWindow]
        return self.windows

    def effective_scoring(self) -> ScoringConfig:
        """Return the configured scoring, or a fresh ``ScoringConfig()``."""
        if self.scoring:
            return self.scoring
        return ScoringConfig()
# ---------------------------------------------------------------------------
# Fetch impact records for a ticker within a time window
# ---------------------------------------------------------------------------
# Parameterised query executed by fetch_impact_records.
#   $1 = ticker
#   $2 = start of the published_at range (inclusive)
#   $3 = end of the published_at range (inclusive)
# Joins impact records to their document intelligence row and document,
# keeps only intelligence rows whose validation_status is 'valid' and
# documents whose status is not 'rejected', and returns newest first.
# The selected columns are the ones _parse_impact_row reads.
_IMPACT_QUERY = """
SELECT
di.document_id,
di.confidence,
di.novelty_score,
di.source_credibility,
dir.sentiment,
dir.impact_score,
dir.catalyst_type,
dir.key_facts,
dir.risks,
d.published_at
FROM document_impact_records dir
JOIN document_intelligence di ON di.id = dir.intelligence_id
JOIN documents d ON d.id = di.document_id
WHERE dir.ticker = $1
AND d.published_at >= $2
AND d.published_at <= $3
AND di.validation_status = 'valid'
AND d.status != 'rejected'
ORDER BY d.published_at DESC
"""
@dataclass
class ImpactRow:
    """Typed view of one row returned by the impact query.

    Attributes:
        document_id: Document identifier, as a string.
        confidence: Extraction confidence of the intelligence record.
        novelty_score: Novelty score of the document.
        source_credibility: Credibility score of the source.
        sentiment: Sentiment label for the ticker.
        impact_score: Impact score for the ticker.
        catalyst_type: Catalyst category label.
        key_facts: Key facts extracted from the document.
        risks: Risk statements extracted from the document.
        published_at: Publication time of the document.
    """

    document_id: str
    confidence: float
    novelty_score: float
    source_credibility: float
    sentiment: str
    impact_score: float
    catalyst_type: str
    key_facts: list[str]
    risks: list[str]
    published_at: datetime
def _parse_impact_row(row: Any) -> ImpactRow:
    """Convert an asyncpg Record to an ImpactRow.

    Numeric columns fall back to a default only when the stored value is
    ``None``. A stored ``0.0`` is kept as ``0.0``: treating it as missing
    (as ``value or default`` would) turns a zero-confidence document into a
    0.5-confidence one, which lets it pass the confidence gate.

    ``key_facts`` and ``risks`` may arrive either as decoded lists or as
    JSON text; text is decoded with ``json.loads``, and anything that is
    not a list afterwards becomes an empty list.

    Args:
        row: Mapping-like record exposing the columns selected by the
            impact query.

    Returns:
        The populated ``ImpactRow``.

    Raises:
        json.JSONDecodeError: If ``key_facts`` or ``risks`` is a string
            that is not valid JSON.
    """

    def _float_or(column: str, default: float) -> float:
        # Only NULL means "missing"; 0.0 is a legitimate stored value.
        value = row[column]
        return default if value is None else float(value)

    def _json_list(column: str) -> list[str]:
        # Accept both JSON text and already-decoded values.
        value = row[column]
        if isinstance(value, str):
            value = json.loads(value)
        return value if isinstance(value, list) else []

    return ImpactRow(
        document_id=str(row["document_id"]),
        confidence=_float_or("confidence", 0.5),
        novelty_score=_float_or("novelty_score", 0.5),
        source_credibility=_float_or("source_credibility", 0.5),
        sentiment=row["sentiment"] or "neutral",
        impact_score=_float_or("impact_score", 0.0),
        catalyst_type=row["catalyst_type"] or "other",
        key_facts=_json_list("key_facts"),
        risks=_json_list("risks"),
        published_at=row["published_at"],
    )
async def fetch_impact_records(
    pool: asyncpg.Pool,
    ticker: str,
    window_start: datetime,
    window_end: datetime,
) -> list[ImpactRow]:
    """Load and parse the impact records of a ticker for a time range.

    Args:
        pool: Connection pool used to run the impact query.
        ticker: Ticker symbol to filter on.
        window_start: Earliest publication time to include.
        window_end: Latest publication time to include.

    Returns:
        One ``ImpactRow`` per row returned by the query, in query order.
    """
    records = await pool.fetch(_IMPACT_QUERY, ticker, window_start, window_end)
    return list(map(_parse_impact_row, records))
# ---------------------------------------------------------------------------
# Build weighted signals from impact records
# ---------------------------------------------------------------------------
def build_weighted_signals(
    impacts: list[ImpactRow],
    reference_time: datetime,
    window: str,
    market_ctx: Any | None = None,
    config: ScoringConfig | None = None,
) -> list[WeightedSignal]:
    """Score every impact record and wrap it as a ``WeightedSignal``.

    Args:
        impacts: Parsed impact records to score.
        reference_time: Aggregation anchor time passed to the scorer.
        window: Trend window identifier passed to the scorer.
        market_ctx: Optional market context passed to the scorer.
        config: Scoring parameters; a default ``ScoringConfig()`` is used
            when omitted.

    Returns:
        One ``WeightedSignal`` per impact record, in input order.
    """
    scoring = config or ScoringConfig()
    return [
        WeightedSignal(
            document_id=impact.document_id,
            weight=compute_signal_weight(
                published_at=impact.published_at,
                reference_time=reference_time,
                window=window,
                source_credibility=impact.source_credibility,
                novelty_score=impact.novelty_score,
                extraction_confidence=impact.confidence,
                market_ctx=market_ctx,
                config=scoring,
            ),
            sentiment_value=sentiment_to_numeric(impact.sentiment),
            impact_score=impact.impact_score,
        )
        for impact in impacts
    ]
# ---------------------------------------------------------------------------
# Derive trend direction from weighted sentiment
# ---------------------------------------------------------------------------
# Thresholds used by derive_trend_direction to map the weighted average
# sentiment (and the contradiction score) onto a TrendDirection.
BULLISH_THRESHOLD = 0.15  # average sentiment at or above this → bullish
BEARISH_THRESHOLD = -0.15  # average sentiment at or below this → bearish
MIXED_THRESHOLD = 0.10  # contradiction score above this, with |avg sentiment| < 0.3 → mixed
def derive_trend_direction(
    avg_sentiment: float,
    contradiction_score: float = 0.0,
) -> TrendDirection:
    """Classify a weighted average sentiment as a ``TrendDirection``.

    The direction is MIXED only when both conditions hold: the
    contradiction score exceeds ``MIXED_THRESHOLD`` and the average
    sentiment is weak (absolute value below 0.3). A strong average
    sentiment is therefore classified as bullish or bearish even when the
    contradiction score is high.

    Args:
        avg_sentiment: Weighted average sentiment.
        contradiction_score: Disagreement score for the same signals.

    Returns:
        MIXED, BULLISH, BEARISH or NEUTRAL, checked in that order.
    """
    if contradiction_score > MIXED_THRESHOLD:
        # Disagreement only overrides the direction when no side clearly
        # dominates the average.
        if abs(avg_sentiment) < 0.3:
            return TrendDirection.MIXED

    if avg_sentiment >= BULLISH_THRESHOLD:
        return TrendDirection.BULLISH
    if avg_sentiment <= BEARISH_THRESHOLD:
        return TrendDirection.BEARISH
    return TrendDirection.NEUTRAL
# ---------------------------------------------------------------------------
# Compute contradiction score
# ---------------------------------------------------------------------------
def compute_contradiction_score(signals: list[WeightedSignal]) -> float:
    """Measure how evenly positive and negative signals are split.

    Each signal contributes ``weight.combined * impact_score`` to the
    positive or the negative side according to the sign of its
    ``sentiment_value``; signals with a zero sentiment are ignored. The
    score is the smaller side's share of the two sides combined, rounded
    to four decimals.

    NOTE(review): an earlier description of this function promised a
    minority-to-majority ratio peaking at 1.0. The computation divides by
    the combined total, so a perfectly even split yields 0.5 — confirm
    which scale downstream thresholds expect.

    Args:
        signals: Weighted signals to analyse.

    Returns:
        ``0.0`` when there are no directional signals or only one side
        carries weight, rising to ``0.5`` for an even split.
    """
    bullish_total = 0.0
    bearish_total = 0.0
    for signal in signals:
        effective_weight = signal.weight.combined * signal.impact_score
        direction = signal.sentiment_value
        if direction > 0:
            bullish_total += effective_weight
        elif direction < 0:
            bearish_total += effective_weight

    combined_total = bullish_total + bearish_total
    if combined_total == 0.0:
        return 0.0
    return round(min(bullish_total, bearish_total) / combined_total, 4)
# ---------------------------------------------------------------------------
# Rank evidence (supporting vs opposing)
# ---------------------------------------------------------------------------
def rank_evidence(
    signals: list[WeightedSignal],
    max_refs: int = MAX_EVIDENCE_REFS,
) -> tuple[list[str], list[str]]:
    """Return the top supporting and opposing document IDs.

    Thin wrapper around the evidence module's composite ranking: it builds
    an ``EvidenceRankConfig`` carrying ``max_refs`` and forwards the
    signals unchanged. How signals are split into supporting and opposing
    and how they are scored is decided by that module.

    Args:
        signals: Weighted signals to rank.
        max_refs: Maximum number of references requested per list.

    Returns:
        A ``(supporting_ids, opposing_ids)`` pair.
    """
    return _rank_evidence_composite(signals, EvidenceRankConfig(max_refs=max_refs))
# ---------------------------------------------------------------------------
# Extract dominant catalysts and material risks
# ---------------------------------------------------------------------------
def extract_catalysts_and_risks(
    impacts: list[ImpactRow],
    signals: list[WeightedSignal],
    *,
    max_catalysts: int = 5,
    max_risks: int = 5,
) -> tuple[list[str], list[str]]:
    """Return dominant catalyst types and material risks by signal strength.

    A document's strength is ``weight.combined * impact_score`` of its
    signal; documents without a signal, or with a non-positive strength,
    contribute nothing. Catalysts are ranked by cumulative strength. Risks
    are ordered by the strength of the document that surfaced them and
    de-duplicated case-insensitively after trimming whitespace. Blank risk
    strings are skipped so they cannot occupy a slot in the result.

    Args:
        impacts: Parsed impact records.
        signals: Weighted signals for the same documents.
        max_catalysts: Maximum number of catalyst types to return.
        max_risks: Maximum number of risks to return.

    Returns:
        A ``(catalysts, risks)`` pair of lists, strongest first.
    """
    # Strength per document; later signals for the same document replace
    # earlier ones.
    strength_by_doc = {
        sig.document_id: sig.weight.combined * sig.impact_score for sig in signals
    }

    catalyst_strength: dict[str, float] = {}
    weighted_risks: list[tuple[float, str]] = []
    for impact in impacts:
        strength = strength_by_doc.get(impact.document_id, 0.0)
        if strength <= 0.0:
            continue
        catalyst_strength[impact.catalyst_type] = (
            catalyst_strength.get(impact.catalyst_type, 0.0) + strength
        )
        weighted_risks.extend((strength, risk) for risk in impact.risks)

    ranked_catalysts = sorted(
        catalyst_strength.items(), key=lambda item: item[1], reverse=True
    )
    catalysts = [name for name, _ in ranked_catalysts[: max(max_catalysts, 0)]]

    # The sort is stable, so risks of equal strength keep input order.
    weighted_risks.sort(key=lambda entry: entry[0], reverse=True)
    seen: set[str] = set()
    risks: list[str] = []
    for _, risk_text in weighted_risks:
        if len(risks) >= max_risks:
            break
        cleaned = risk_text.strip()
        if not cleaned:
            # An empty string is not a risk statement.
            continue
        key = cleaned.lower()
        if key in seen:
            continue
        seen.add(key)
        risks.append(cleaned)

    return catalysts, risks
# ---------------------------------------------------------------------------
# Compute trend confidence
# ---------------------------------------------------------------------------
def compute_trend_confidence(
    signals: list[WeightedSignal],
    contradiction_score: float,
) -> float:
    """Derive an overall confidence for a trend summary.

    Only signals with a positive combined weight count. The result blends:

    - a coverage factor, ``min(active_count / 20, 1)``, weighted 0.4;
    - the mean ``weight.credibility`` of the active signals, weighted 0.6;
    - a penalty of ``0.4 * contradiction_score``.

    NOTE(review): the quality term averages the credibility component of
    the weight breakdown, not the model's extraction confidence — the
    breakdown only keeps extraction confidence as a 0/1 gate. Confirm that
    credibility is the intended input.

    Args:
        signals: Weighted signals behind the trend.
        contradiction_score: Disagreement score for the same signals.

    Returns:
        The blended value clamped to [0, 1] and rounded to four decimals;
        ``0.0`` when no signal has a positive combined weight.
    """
    contributing = [sig for sig in signals if sig.weight.combined > 0]
    if not contributing:
        return 0.0

    # Diminishing returns: coverage saturates at 20 contributing signals.
    coverage = min(len(contributing) / 20.0, 1.0)
    mean_credibility = sum(sig.weight.credibility for sig in contributing) / len(contributing)
    penalty = contradiction_score * 0.4

    blended = (0.4 * coverage + 0.6 * mean_credibility) - penalty
    return round(max(0.0, min(1.0, blended)), 4)
# ---------------------------------------------------------------------------
# Assemble a TrendSummary from components
# ---------------------------------------------------------------------------
@dataclass
class AssembledTrend:
    """Bundle of a trend summary and the ranked evidence behind it.

    Attributes:
        summary: The assembled trend summary.
        supporting_evidence: Ranked supporting evidence entries.
        opposing_evidence: Ranked opposing evidence entries.
    """

    summary: TrendSummary
    supporting_evidence: list[RankedEvidence]
    opposing_evidence: list[RankedEvidence]
def assemble_trend_summary(
    ticker: str,
    window: str,
    signals: list[WeightedSignal],
    impacts: list[ImpactRow],
    market_ctx: Any | None = None,
    max_evidence: int = MAX_EVIDENCE_REFS,
    reference_time: datetime | None = None,
) -> TrendSummary:
    """Build a ``TrendSummary`` from weighted signals and impact records.

    Convenience wrapper around ``assemble_trend_with_evidence`` that takes
    the same arguments and discards the detailed evidence rankings.

    Returns:
        The ``summary`` of the assembled trend.
    """
    return assemble_trend_with_evidence(
        ticker=ticker,
        window=window,
        signals=signals,
        impacts=impacts,
        market_ctx=market_ctx,
        max_evidence=max_evidence,
        reference_time=reference_time,
    ).summary
def assemble_trend_with_evidence(
    ticker: str,
    window: str,
    signals: list[WeightedSignal],
    impacts: list[ImpactRow],
    market_ctx: Any | None = None,
    max_evidence: int = MAX_EVIDENCE_REFS,
    reference_time: datetime | None = None,
) -> AssembledTrend:
    """Assemble a ``TrendSummary`` plus the ranked evidence behind it.

    Args:
        ticker: Ticker symbol; stored as the summary's ``entity_id``.
        window: Trend window identifier; must be a valid ``TrendWindow``
            value because it is converted with ``TrendWindow(window)``.
        signals: Weighted signals for the ticker and window.
        impacts: Impact records the signals were built from.
        market_ctx: Optional market context stored on the summary.
        max_evidence: Maximum evidence references requested per side.
        reference_time: Timestamp stored as ``generated_at``; the current
            UTC time is used when omitted.

    Returns:
        An ``AssembledTrend`` with the summary and both ranked evidence
        lists.
    """
    generated_at = datetime.now(timezone.utc) if reference_time is None else reference_time

    mean_sentiment = weighted_sentiment_average(signals)

    # Full contradiction detection (Requirement 6.4); catalyst entries are
    # passed as light carriers of each document's catalyst type.
    contradictions = detect_contradictions(
        signals,
        [
            CatalystEntry(document_id=impact.document_id, catalyst_type=impact.catalyst_type)
            for impact in impacts
        ],
    )
    contradiction_score = contradictions.score

    direction = derive_trend_direction(mean_sentiment, contradiction_score)
    confidence = compute_trend_confidence(signals, contradiction_score)

    # Keep the detailed rankings so the caller can persist them.
    supporting_ranked, opposing_ranked = rank_evidence_detailed(
        signals, EvidenceRankConfig(max_refs=max_evidence)
    )

    catalysts, risks = extract_catalysts_and_risks(impacts, signals)

    summary = TrendSummary(
        entity_type="company",
        entity_id=ticker,
        window=TrendWindow(window),
        trend_direction=direction,
        # Strength is the magnitude of the average sentiment, capped at 1.
        trend_strength=round(min(abs(mean_sentiment), 1.0), 4),
        confidence=confidence,
        top_supporting_evidence=[item.document_id for item in supporting_ranked],
        top_opposing_evidence=[item.document_id for item in opposing_ranked],
        dominant_catalysts=catalysts,
        material_risks=risks,
        contradiction_score=contradiction_score,
        disagreement_details=contradictions.details,
        market_context=market_ctx,
        generated_at=generated_at,
    )
    return AssembledTrend(
        summary=summary,
        supporting_evidence=supporting_ranked,
        opposing_evidence=opposing_ranked,
    )
# ---------------------------------------------------------------------------
# Persist trend summary to PostgreSQL
# ---------------------------------------------------------------------------
# Statement executed by persist_trend_summary; returns the new row's id.
# NOTE: despite the name this is a plain INSERT — there is no ON CONFLICT
# clause, so every execution adds a new trend_windows row.
# Parameters $7-$10, $12 and $13 are cast to jsonb; persist_trend_summary
# passes them as JSON text produced with json.dumps.
_UPSERT_TREND = """
INSERT INTO trend_windows (
entity_type, entity_id, window, trend_direction, trend_strength,
confidence, top_supporting_evidence, top_opposing_evidence,
dominant_catalysts, material_risks, contradiction_score,
disagreement_details, market_context, generated_at
) VALUES (
$1, $2, $3, $4, $5,
$6, $7::jsonb, $8::jsonb,
$9::jsonb, $10::jsonb, $11,
$12::jsonb, $13::jsonb, $14
)
RETURNING id
"""
async def persist_trend_summary(
    pool: asyncpg.Pool,
    summary: TrendSummary,
) -> str:
    """Insert one trend summary row and return its id as a string.

    List and object fields are serialised to JSON text before being bound;
    a missing market context is stored as an empty JSON object.

    Args:
        pool: Connection pool used to run the insert.
        summary: Trend summary to store.

    Returns:
        The ``id`` returned by the insert, converted with ``str``.
    """
    params = (
        summary.entity_type,
        summary.entity_id,
        summary.window.value,
        summary.trend_direction.value,
        summary.trend_strength,
        summary.confidence,
        json.dumps(summary.top_supporting_evidence),
        json.dumps(summary.top_opposing_evidence),
        json.dumps(summary.dominant_catalysts),
        json.dumps(summary.material_risks),
        summary.contradiction_score,
        json.dumps([detail.model_dump() for detail in summary.disagreement_details]),
        json.dumps(summary.market_context.model_dump() if summary.market_context else {}),
        summary.generated_at,
    )
    inserted = await pool.fetchrow(_UPSERT_TREND, *params)
    return str(inserted["id"])
# ---------------------------------------------------------------------------
# Persist evidence mappings to trend_evidence table
# ---------------------------------------------------------------------------
# Statement executed once per evidence item by persist_trend_evidence.
# The parameter order matches the tuples built there:
#   $1 trend window id, $2 document id (cast to uuid), $3 evidence type
#   ("supporting" or "opposing"), $4 rank score, $5-$8 the weight, impact,
#   recency and confidence components, $9 sentiment value.
_INSERT_EVIDENCE = """
INSERT INTO trend_evidence (
trend_window_id, document_id, evidence_type,
rank_score, weight_component, impact_component,
recency_component, confidence_component, sentiment_value
) VALUES (
$1, $2::uuid, $3,
$4, $5, $6,
$7, $8, $9
)
"""
async def persist_trend_evidence(
    pool: asyncpg.Pool,
    trend_window_id: str,
    supporting: list[RankedEvidence],
    opposing: list[RankedEvidence],
) -> int:
    """Store the evidence rows of a trend window.

    Supporting items are written first, then opposing items, each tagged
    with its evidence type. Nothing is sent to the database when both
    lists are empty.

    Args:
        pool: Connection pool used to run the inserts.
        trend_window_id: Id of the trend window the evidence belongs to.
        supporting: Ranked supporting evidence.
        opposing: Ranked opposing evidence.

    Returns:
        The number of rows submitted for insertion.
    """

    def _as_row(
        evidence_type: str, ev: RankedEvidence
    ) -> tuple[str, str, str, float, float, float, float, float, float]:
        # Column order must match the placeholders of _INSERT_EVIDENCE.
        return (
            trend_window_id, ev.document_id, evidence_type,
            ev.rank_score, ev.weight_component, ev.impact_component,
            ev.recency_component, ev.confidence_component, ev.sentiment_value,
        )

    rows = [_as_row("supporting", ev) for ev in supporting]
    rows += [_as_row("opposing", ev) for ev in opposing]
    if not rows:
        return 0

    await pool.executemany(_INSERT_EVIDENCE, rows)
    return len(rows)
# ---------------------------------------------------------------------------
# Main aggregation entry point for a single ticker + window
# ---------------------------------------------------------------------------
async def aggregate_company_window(
    pool: asyncpg.Pool,
    ticker: str,
    window: str,
    reference_time: datetime | None = None,
    config: AggregationConfig | None = None,
) -> TrendSummary:
    """Compute and persist the trend summary of one ticker and window.

    Steps:
        1. Resolve the lookback range ending at the reference time
           (windows missing from ``WINDOW_DURATIONS`` use seven days).
        2. Fetch the ticker's impact records for that range.
        3. Fetch market context for the ticker.
        4. Build weighted signals with the scoring module.
        5. Assemble the trend summary and its ranked evidence.
        6. Persist the summary, then its evidence rows.
        7. Log the result and record Prometheus metrics.

    Args:
        pool: Connection pool used for all reads and writes.
        ticker: Ticker symbol to aggregate.
        window: Trend window identifier.
        reference_time: Anchor time of the window; the current UTC time is
            used when omitted.
        config: Aggregation settings; defaults are used when omitted.

    Returns:
        The assembled ``TrendSummary``.
    """
    agg_config = config or AggregationConfig()
    scoring = agg_config.effective_scoring()
    anchor = reference_time if reference_time is not None else datetime.now(timezone.utc)
    started = time.monotonic()

    # Lookback range for the window.
    lookback = WINDOW_DURATIONS.get(window, timedelta(days=7))
    range_start = anchor - lookback

    # Inputs: impact records and market context.
    impacts = await fetch_impact_records(pool, ticker, range_start, anchor)
    market_ctx = await fetch_market_context(pool, ticker, window, anchor)

    # Scoring uses the market context as fetched; the summary only stores
    # it when it actually carries data.
    signals = build_weighted_signals(impacts, anchor, window, market_ctx, scoring)
    assembled = assemble_trend_with_evidence(
        ticker=ticker,
        window=window,
        signals=signals,
        impacts=impacts,
        market_ctx=market_ctx if market_ctx.has_data else None,
        max_evidence=agg_config.max_evidence,
        reference_time=anchor,
    )
    summary = assembled.summary

    # Persistence: the trend row first, then the evidence rows that
    # reference its id.
    trend_id = await persist_trend_summary(pool, summary)
    evidence_count = await persist_trend_evidence(
        pool,
        trend_id,
        assembled.supporting_evidence,
        assembled.opposing_evidence,
    )

    logger.info(
        "Persisted trend %s for %s/%s: direction=%s strength=%.3f confidence=%.3f signals=%d evidence=%d",
        trend_id,
        ticker,
        window,
        summary.trend_direction.value,
        summary.trend_strength,
        summary.confidence,
        len(signals),
        evidence_count,
    )

    # Prometheus metrics; the duration covers fetch, scoring and persistence.
    AGGREGATION_WINDOWS_COMPUTED.labels(window=window).inc()
    AGGREGATION_SIGNALS_PROCESSED.labels(window=window).inc(len(signals))
    AGGREGATION_CONTRADICTION_SCORE.observe(summary.contradiction_score)
    AGGREGATION_DURATION.labels(window=window).observe(time.monotonic() - started)
    return summary
# ---------------------------------------------------------------------------
# Aggregate all windows for a single ticker
# ---------------------------------------------------------------------------
async def aggregate_company(
    pool: asyncpg.Pool,
    ticker: str,
    reference_time: datetime | None = None,
    config: AggregationConfig | None = None,
) -> list[TrendSummary]:
    """Compute trend summaries for every configured window of a ticker.

    Windows are processed one after another, all anchored at the same
    reference time.

    Args:
        pool: Connection pool used for all reads and writes.
        ticker: Ticker symbol to aggregate.
        reference_time: Shared anchor time; the current UTC time is used
            when omitted.
        config: Aggregation settings; defaults are used when omitted.

    Returns:
        One ``TrendSummary`` per window, in the order of
        ``config.effective_windows()``.
    """
    agg_config = config or AggregationConfig()
    # Resolve the anchor once so every window shares the same instant.
    anchor = datetime.now(timezone.utc) if reference_time is None else reference_time
    return [
        await aggregate_company_window(pool, ticker, window, anchor, agg_config)
        for window in agg_config.effective_windows()
    ]