feat: model validation, calibration, and signal quality layer
ci/woodpecker/push/test Pipeline failed
ci/woodpecker/push/build-1 unknown status
ci/woodpecker/push/build-3 unknown status
ci/woodpecker/push/build-2 unknown status
ci/woodpecker/push/finalize unknown status
Build and Push / lint-and-test (push) Has been cancelled
Build and Push / build-services (map[cmd:python -m services.adapters.broker_adapter name:broker-adapter]) (push) Has been cancelled
Build and Push / build-services (map[cmd:python -m services.aggregation.worker name:aggregation]) (push) Has been cancelled
Build and Push / build-services (map[cmd:python -m services.extractor.worker name:extractor]) (push) Has been cancelled
Build and Push / build-services (map[cmd:python -m services.ingestion.worker name:ingestion]) (push) Has been cancelled
Build and Push / build-services (map[cmd:python -m services.lake_publisher.worker name:lake-publisher]) (push) Has been cancelled
Build and Push / build-services (map[cmd:python -m services.parser.worker name:parser]) (push) Has been cancelled
Build and Push / build-services (map[cmd:python -m services.recommendation.worker name:recommendation]) (push) Has been cancelled
Build and Push / build-services (map[cmd:python -m services.scheduler.app name:scheduler]) (push) Has been cancelled
Build and Push / build-services (map[cmd:uvicorn services.api.app:app --host 0.0.0.0 --port 8000 name:query-api]) (push) Has been cancelled
Build and Push / build-services (map[cmd:uvicorn services.risk.app:app --host 0.0.0.0 --port 8000 name:risk]) (push) Has been cancelled
Build and Push / build-services (map[cmd:uvicorn services.symbol_registry.app:app --host 0.0.0.0 --port 8000 name:symbol-registry]) (push) Has been cancelled
Build and Push / build-services (map[cmd:uvicorn services.trading.app:app --host 0.0.0.0 --port 8000 name:trading-engine]) (push) Has been cancelled
Build and Push / build-dashboard (push) Has been cancelled
Build and Push / build-superset (push) Has been cancelled
Build and Push / integration-test (push) Has been cancelled
Build and Push / beta-gate (push) Has been cancelled

- Migration 035: prediction_snapshots, prediction_outcomes, signal_evidence_links, model_metric_snapshots tables + SQL views
- Prediction snapshot writer with canonical evidence keys, duplicate detection, contribution scores
- Outcome evaluator across 5 horizons (1h, 6h, 1d, 7d, 30d)
- Metrics engine: ECE, Brier score, IC, Rank IC, benchmark comparison
- Attribution engine: per-source, per-catalyst, per-layer performance
- Calibration engine: Bayesian shrinkage source reliability
- Quality gate for live trading eligibility with configurable thresholds
- 7 new /api/validation/* endpoints
- Upgraded OpsModel dashboard with validation tab
- Enhanced recommendation display with calibration context
- Backtest replay validation mode
- 86 Python tests (unit + property-based), 179 frontend tests passing
This commit is contained in:
Celes Renata
2026-05-01 03:04:58 +00:00
parent 5d2ffd9163
commit 7fcc8a6c07
23 changed files with 7554 additions and 9 deletions
+1
View File
@@ -0,0 +1 @@
+591
View File
@@ -0,0 +1,591 @@
"""Attribution Engine — per-source, per-catalyst, and per-layer performance.
Joins signal evidence links with prediction outcomes to compute attribution
metrics that identify which sources, catalyst types, and signal layers
contribute most to accurate predictions.
Requirements: 7.1, 7.2, 7.3, 7.4, 7.5, 7.6, 7.7
"""
from __future__ import annotations
import logging
import math
from dataclasses import dataclass
from datetime import datetime, timedelta
import asyncpg
logger = logging.getLogger(__name__)
# ---------------------------------------------------------------------------
# Dataclasses
# ---------------------------------------------------------------------------
@dataclass
class SourceAttribution:
    """Outcome statistics aggregated over the evidence rows of one source.

    Attributes:
        source: Name of the source the rows were grouped by.
        source_type: Source-type label reported for the source.
        prediction_count: Number of evidence rows in the group.
        avg_weight: Mean evidence weight.
        avg_contribution_score: Mean contribution score.
        win_rate: Fraction of evaluated rows whose direction was correct.
        avg_future_return: Mean future return.
        avg_excess_return_vs_spy: Mean excess return versus SPY.
        information_coefficient: Correlation between contribution score and
            future return, or None when it could not be computed.
        duplicate_rate: Fraction of rows flagged as duplicates.
    """

    source: str
    source_type: str
    prediction_count: int
    avg_weight: float
    avg_contribution_score: float
    win_rate: float
    avg_future_return: float
    avg_excess_return_vs_spy: float
    information_coefficient: float | None
    duplicate_rate: float
@dataclass
class CatalystAttribution:
    """Outcome statistics aggregated over the evidence rows of one catalyst type.

    Attributes:
        catalyst_type: Catalyst label the rows were grouped by.
        prediction_count: Number of evidence rows in the group.
        win_rate: Fraction of evaluated rows whose direction was correct.
        avg_future_return: Mean future return.
        avg_excess_return_vs_spy: Mean excess return versus SPY.
        information_coefficient: Correlation between contribution score and
            future return, or None when it could not be computed.
    """

    catalyst_type: str
    prediction_count: int
    win_rate: float
    avg_future_return: float
    avg_excess_return_vs_spy: float
    information_coefficient: float | None
@dataclass
class LayerAttribution:
    """Outcome statistics for one signal layer.

    Attributes:
        layer: Layer name: company, macro, or competitive.
        avg_contribution_pct: Mean share of the total score that comes from
            this layer.
        dominant_win_rate: Win rate over predictions where this layer
            contributes more than 30% of the total score.
        dominant_ic: IC over predictions where this layer contributes more
            than 30% of the total score, or None when it could not be
            computed.
    """

    layer: str
    avg_contribution_pct: float
    dominant_win_rate: float
    dominant_ic: float | None
# ---------------------------------------------------------------------------
# Pure computation helpers
# ---------------------------------------------------------------------------
def _pearson_correlation(xs: list[float], ys: list[float]) -> float | None:
"""Compute Pearson correlation coefficient between two lists.
Returns None if the lists have fewer than 2 elements or if either
has zero variance. Guards against NaN/infinity.
"""
n = len(xs)
if n < 2:
return None
mean_x = sum(xs) / n
mean_y = sum(ys) / n
cov = 0.0
var_x = 0.0
var_y = 0.0
for x, y in zip(xs, ys):
dx = x - mean_x
dy = y - mean_y
cov += dx * dy
var_x += dx * dx
var_y += dy * dy
if var_x == 0.0 or var_y == 0.0:
return None
r = cov / math.sqrt(var_x * var_y)
if math.isnan(r) or math.isinf(r):
return None
return max(-1.0, min(1.0, r))
def _compute_ic(
contribution_scores: list[float],
future_returns: list[float],
) -> float | None:
"""Compute IC (Pearson correlation) between contribution scores and returns.
Returns None when fewer than 30 data points.
"""
if len(contribution_scores) < 30 or len(future_returns) < 30:
return None
n = min(len(contribution_scores), len(future_returns))
return _pearson_correlation(contribution_scores[:n], future_returns[:n])
# ---------------------------------------------------------------------------
# SQL queries — source attribution via v_source_performance
# ---------------------------------------------------------------------------
# Per-evidence rows from v_source_performance for one horizon, restricted to
# rows generated at or after a cutoff.
# Parameters: $1 = horizon, $2 = cutoff timestamp.
_SOURCE_ATTRIBUTION_SQL = """
SELECT
source,
source_type,
weight,
contribution_score,
is_duplicate,
direction_correct,
future_return,
excess_return_vs_spy
FROM v_source_performance
WHERE horizon = $1
AND generated_at >= $2
"""
# Same projection without the generated_at cutoff. Parameters: $1 = horizon.
# NOTE(review): not referenced by any function visible in this section —
# confirm it is used elsewhere.
_SOURCE_ATTRIBUTION_ALL_SQL = """
SELECT
source,
source_type,
weight,
contribution_score,
is_duplicate,
direction_correct,
future_return,
excess_return_vs_spy
FROM v_source_performance
WHERE horizon = $1
"""
# ---------------------------------------------------------------------------
# SQL queries — catalyst attribution via v_source_performance
# ---------------------------------------------------------------------------
# Per-evidence rows from v_source_performance carrying the catalyst type, for
# one horizon, restricted to rows generated at or after a cutoff.
# Parameters: $1 = horizon, $2 = cutoff timestamp.
_CATALYST_ATTRIBUTION_SQL = """
SELECT
catalyst_type,
weight,
contribution_score,
direction_correct,
future_return,
excess_return_vs_spy
FROM v_source_performance
WHERE horizon = $1
AND generated_at >= $2
"""
# Same projection without the generated_at cutoff. Parameters: $1 = horizon.
# NOTE(review): not referenced by any function visible in this section —
# confirm it is used elsewhere.
_CATALYST_ATTRIBUTION_ALL_SQL = """
SELECT
catalyst_type,
weight,
contribution_score,
direction_correct,
future_return,
excess_return_vs_spy
FROM v_source_performance
WHERE horizon = $1
"""
# ---------------------------------------------------------------------------
# SQL queries — layer attribution via prediction_snapshots + outcomes
# ---------------------------------------------------------------------------
# Layer scores of each prediction snapshot joined to its outcome for one
# horizon, restricted to snapshots generated at or after a cutoff.
# Parameters: $1 = horizon, $2 = cutoff timestamp.
_LAYER_ATTRIBUTION_SQL = """
SELECT
ps.score_company,
ps.score_macro,
ps.score_competitive,
po.direction_correct,
po.future_return
FROM prediction_snapshots ps
JOIN prediction_outcomes po ON po.prediction_id = ps.id
WHERE po.horizon = $1
AND ps.generated_at >= $2
"""
# Same join without the generated_at cutoff. Parameters: $1 = horizon.
# NOTE(review): not referenced by any function visible in this section —
# confirm it is used elsewhere.
_LAYER_ATTRIBUTION_ALL_SQL = """
SELECT
ps.score_company,
ps.score_macro,
ps.score_competitive,
po.direction_correct,
po.future_return
FROM prediction_snapshots ps
JOIN prediction_outcomes po ON po.prediction_id = ps.id
WHERE po.horizon = $1
"""
# ---------------------------------------------------------------------------
# Source attribution (Requirements 7.1, 7.2, 7.7)
# ---------------------------------------------------------------------------
async def compute_source_attribution(
    pool: asyncpg.Pool,
    lookback_days: int = 30,
    horizon: str = "7d",
) -> list[SourceAttribution]:
    """Compute per-source performance metrics from ``v_source_performance``.

    Rows for ``horizon`` generated within the last ``lookback_days`` days are
    grouped by source (rows without a source are grouped under ``"unknown"``).
    For each group this computes prediction count, average weight, average
    contribution score, win rate, average future return, average excess
    return vs SPY, IC, and duplicate rate. Averages and the win rate skip
    NULL values and fall back to 0.0 when nothing is left to average.

    Args:
        pool: Connection pool used to run the attribution query.
        lookback_days: Size of the lookback window in days.
        horizon: Outcome horizon to filter on, e.g. ``"7d"``.

    Returns:
        One SourceAttribution per source, sorted by prediction count
        descending. Empty when the query fails (the error is logged) or
        returns no rows.
    """
    cutoff = datetime.now().astimezone() - timedelta(days=lookback_days)
    try:
        rows = await pool.fetch(_SOURCE_ATTRIBUTION_SQL, horizon, cutoff)
    except Exception:
        # Best-effort: attribution is reporting, so a failed query yields an
        # empty result instead of propagating.
        logger.exception(
            "Failed to query source attribution for horizon=%s lookback=%dd",
            horizon,
            lookback_days,
        )
        return []
    if not rows:
        return []

    def _present(group: list[dict], column: str) -> list:
        """Return the non-NULL values of ``column`` within ``group``."""
        return [r[column] for r in group if r.get(column) is not None]

    def _mean(values: list) -> float:
        """Return the arithmetic mean, or 0.0 for an empty list."""
        return sum(values) / len(values) if values else 0.0

    # Group rows by source
    source_groups: dict[str, list[dict]] = {}
    for row in rows:
        r = dict(row)
        source_groups.setdefault(r.get("source") or "unknown", []).append(r)

    results: list[SourceAttribution] = []
    for source, group in source_groups.items():
        count = len(group)

        # Source type — the most common label in the group; ties go to the
        # label seen first (max() keeps the first maximum in dict order).
        type_counts: dict[str, int] = {}
        for r in group:
            label = r.get("source_type") or "unknown"
            type_counts[label] = type_counts.get(label, 0) + 1
        source_type = max(type_counts, key=type_counts.get)

        # Win rate over rows that have an evaluated direction
        directions = _present(group, "direction_correct")
        win_count = sum(1 for flag in directions if flag is True)
        win_rate = win_count / len(directions) if directions else 0.0

        # IC: pair up score and return only where both are present
        ic_pairs = [
            (r["contribution_score"], r["future_return"])
            for r in group
            if r.get("contribution_score") is not None
            and r.get("future_return") is not None
        ]
        ic = _compute_ic(
            [score for score, _ in ic_pairs],
            [ret for _, ret in ic_pairs],
        )

        # Duplicate rate: is_duplicate=true / total
        dup_count = sum(1 for r in group if r.get("is_duplicate") is True)

        results.append(
            SourceAttribution(
                source=source,
                source_type=source_type,
                prediction_count=count,
                avg_weight=_mean(_present(group, "weight")),
                avg_contribution_score=_mean(_present(group, "contribution_score")),
                win_rate=win_rate,
                avg_future_return=_mean(_present(group, "future_return")),
                avg_excess_return_vs_spy=_mean(
                    _present(group, "excess_return_vs_spy")
                ),
                information_coefficient=ic,
                duplicate_rate=dup_count / count,
            )
        )

    # Sort by prediction count descending
    results.sort(key=lambda a: a.prediction_count, reverse=True)
    logger.info(
        "Computed source attribution for %d sources (horizon=%s, lookback=%dd)",
        len(results),
        horizon,
        lookback_days,
    )
    return results
# ---------------------------------------------------------------------------
# Catalyst attribution (Requirements 7.3, 7.4)
# ---------------------------------------------------------------------------
async def compute_catalyst_attribution(
    pool: asyncpg.Pool,
    lookback_days: int = 30,
    horizon: str = "7d",
) -> list[CatalystAttribution]:
    """Compute per-catalyst-type performance metrics.

    Reads v_source_performance rows for ``horizon`` generated within the last
    ``lookback_days`` days, groups them by catalyst_type (rows without one go
    under "unknown"), and derives prediction count, win rate, average future
    return, average excess return vs SPY, and IC for each group.

    Returns the CatalystAttribution list ordered by prediction count,
    largest first. A failed query is logged and yields an empty list, as
    does an empty result set.
    """
    window_start = datetime.now().astimezone() - timedelta(days=lookback_days)
    try:
        fetched = await pool.fetch(_CATALYST_ATTRIBUTION_SQL, horizon, window_start)
    except Exception:
        logger.exception(
            "Failed to query catalyst attribution for horizon=%s lookback=%dd",
            horizon,
            lookback_days,
        )
        return []
    if not fetched:
        return []

    # Bucket the rows by catalyst label, keeping first-seen order.
    by_catalyst: dict[str, list[dict]] = {}
    for record in map(dict, fetched):
        label = record.get("catalyst_type") or "unknown"
        if label not in by_catalyst:
            by_catalyst[label] = []
        by_catalyst[label].append(record)

    attributions: list[CatalystAttribution] = []
    for label, members in by_catalyst.items():
        # Win rate: only rows with an evaluated direction take part.
        judged = [
            m["direction_correct"]
            for m in members
            if m.get("direction_correct") is not None
        ]
        hits = sum(1 for flag in judged if flag is True)
        hit_rate = hits / len(judged) if judged else 0.0

        # Mean future return over rows that have one.
        realised = [
            m["future_return"] for m in members if m.get("future_return") is not None
        ]
        mean_return = sum(realised) / len(realised) if realised else 0.0

        # Mean excess return vs SPY over rows that have one.
        excess = [
            m["excess_return_vs_spy"]
            for m in members
            if m.get("excess_return_vs_spy") is not None
        ]
        mean_excess = sum(excess) / len(excess) if excess else 0.0

        # IC inputs: rows where both the score and the return are present.
        scored = [
            (m["contribution_score"], m["future_return"])
            for m in members
            if m.get("contribution_score") is not None
            and m.get("future_return") is not None
        ]
        coefficient = _compute_ic(
            [score for score, _ in scored],
            [ret for _, ret in scored],
        )

        attributions.append(
            CatalystAttribution(
                catalyst_type=label,
                prediction_count=len(members),
                win_rate=hit_rate,
                avg_future_return=mean_return,
                avg_excess_return_vs_spy=mean_excess,
                information_coefficient=coefficient,
            )
        )

    # Largest groups first; the sort is stable, so ties keep first-seen order.
    ranked = sorted(attributions, key=lambda a: a.prediction_count, reverse=True)
    logger.info(
        "Computed catalyst attribution for %d catalyst types "
        "(horizon=%s, lookback=%dd)",
        len(ranked),
        horizon,
        lookback_days,
    )
    return ranked
# ---------------------------------------------------------------------------
# Layer attribution (Requirements 7.5, 7.6)
# ---------------------------------------------------------------------------
async def compute_layer_attribution(
    pool: asyncpg.Pool,
    lookback_days: int = 30,
    horizon: str = "7d",
) -> list[LayerAttribution]:
    """Compute per-layer (company, macro, competitive) performance metrics.

    Queries prediction_snapshots joined with prediction_outcomes to get
    score_company, score_macro, score_competitive alongside outcomes.
    Only predictions whose three layer scores sum to more than 0 take part.
    For each layer this computes:

    - avg_contribution_pct: average of layer_score / total_score
    - dominant_win_rate: win rate for predictions where the layer contributes
      more than 30% of the total score
    - dominant_ic: IC (Pearson correlation between layer score and future
      return) for predictions where the layer contributes more than 30%

    Args:
        pool: Connection pool used to run the attribution query.
        lookback_days: Size of the lookback window in days.
        horizon: Outcome horizon to filter on, e.g. ``"7d"``.

    Returns:
        Three LayerAttribution objects (company, macro, competitive); all
        metrics are zero/None when the query returns no rows. An empty list
        when the query fails (the error is logged).
    """
    layer_fields = (
        ("company", "score_company"),
        ("macro", "score_macro"),
        ("competitive", "score_competitive"),
    )
    # A layer is "dominant" for a prediction when its share exceeds this.
    dominance_threshold = 0.30

    cutoff = datetime.now().astimezone() - timedelta(days=lookback_days)
    try:
        rows = await pool.fetch(_LAYER_ATTRIBUTION_SQL, horizon, cutoff)
    except Exception:
        logger.exception(
            "Failed to query layer attribution for horizon=%s lookback=%dd",
            horizon,
            lookback_days,
        )
        return []

    if not rows:
        return [
            LayerAttribution(
                layer=layer_name,
                avg_contribution_pct=0.0,
                dominant_win_rate=0.0,
                dominant_ic=None,
            )
            for layer_name, _ in layer_fields
        ]

    # The total score of a row does not depend on the layer being examined,
    # so compute it once per row and drop rows without a positive total.
    scored_rows: list[tuple[dict, float]] = []
    for row in rows:
        r = dict(row)
        total = (
            (r.get("score_company") or 0.0)
            + (r.get("score_macro") or 0.0)
            + (r.get("score_competitive") or 0.0)
        )
        if total > 0.0:
            scored_rows.append((r, total))

    results: list[LayerAttribution] = []
    for layer_name, score_field in layer_fields:
        # One pass yields both the contribution shares and the dominant rows.
        contribution_pcts: list[float] = []
        dominant_rows: list[dict] = []
        for r, total in scored_rows:
            share = (r.get(score_field) or 0.0) / total
            contribution_pcts.append(share)
            if share > dominance_threshold:
                dominant_rows.append(r)

        avg_contribution_pct = (
            sum(contribution_pcts) / len(contribution_pcts)
            if contribution_pcts
            else 0.0
        )

        # Dominant win rate over rows that have an evaluated direction
        judged = [r for r in dominant_rows if r.get("direction_correct") is not None]
        win_count = sum(1 for r in judged if r["direction_correct"] is True)
        dominant_win_rate = win_count / len(judged) if judged else 0.0

        # Dominant IC: correlation between layer score and future return
        with_return = [r for r in dominant_rows if r.get("future_return") is not None]
        dominant_ic = _compute_ic(
            [r.get(score_field) or 0.0 for r in with_return],
            [r["future_return"] for r in with_return],
        )

        results.append(
            LayerAttribution(
                layer=layer_name,
                avg_contribution_pct=avg_contribution_pct,
                dominant_win_rate=dominant_win_rate,
                dominant_ic=dominant_ic,
            )
        )

    logger.info(
        "Computed layer attribution for 3 layers (horizon=%s, lookback=%dd)",
        horizon,
        lookback_days,
    )
    return results
+135
View File
@@ -0,0 +1,135 @@
"""Calibration Engine — Bayesian shrinkage source reliability and weight adjustment.
Computes source reliability scores using Bayesian shrinkage from historical
prediction outcomes, and adjusts evidence weights based on source performance.
Updates the existing source_accuracy table with reliability scores.
Requirements: 8.1, 8.2, 8.3, 8.4, 8.5
"""
from __future__ import annotations
import logging
import asyncpg
logger = logging.getLogger(__name__)
# ---------------------------------------------------------------------------
# Pure functions — testable without a database
# ---------------------------------------------------------------------------
def compute_source_reliability(
    observed_win_rate: float,
    sample_count: int,
    prior_strength: int = 30,
) -> float:
    """Return the Bayesian-shrinkage reliability of a source.

    reliability = 0.5 + (n / (n + prior_strength)) * (observed_win_rate - 0.5)

    When n <= 0 the prior mean 0.5 is returned; as n grows the result
    approaches ``observed_win_rate``. A NaN win rate carries no information,
    so it also yields the prior mean rather than being clamped to an extreme.

    Args:
        observed_win_rate: Fraction of directionally correct predictions.
        sample_count: Number of evaluated predictions (n).
        prior_strength: Pseudo-sample weight of the 0.5 prior. Assumed to be
            non-negative; this function does not validate it.

    Returns:
        Reliability clamped to [0.0, 1.0].
    """
    import math  # local: this module does not import math at file level

    if sample_count <= 0 or math.isnan(observed_win_rate):
        return 0.5
    shrinkage = sample_count / (sample_count + prior_strength)
    reliability = 0.5 + shrinkage * (observed_win_rate - 0.5)
    # Clamp for safety: already in range when observed_win_rate is in
    # [0.0, 1.0], but out-of-range inputs must not leak through.
    return max(0.0, min(1.0, reliability))
def compute_adjusted_evidence_weight(
    base_weight: float,
    reliability: float,
) -> float:
    """Scale ``base_weight`` by ``0.5 + reliability`` and clamp to [0.1, 2.0]."""
    multiplier = 0.5 + reliability
    capped = min(2.0, base_weight * multiplier)
    return max(0.1, capped)
# ---------------------------------------------------------------------------
# SQL queries
# ---------------------------------------------------------------------------
# Query v_source_performance to get per-source win rates and sample counts.
# Groups by source, counting evaluated rows (direction_correct IS NOT NULL)
# and, among them, the directional wins. Takes no parameters.
# NOTE(review): there is no horizon or time filter, so rows from every
# horizon appear to be pooled per source — confirm that is intended.
_SOURCE_PERFORMANCE_SQL = """
SELECT
source,
COUNT(*) AS sample_count,
COUNT(*) FILTER (WHERE direction_correct = TRUE) AS win_count
FROM v_source_performance
WHERE direction_correct IS NOT NULL
GROUP BY source
"""
# Upsert into source_accuracy: update accuracy_ratio and sample_count
# for existing sources, insert new ones; last_updated is set to NOW().
# Parameters: $1 = source_id, $2 = accuracy_ratio, $3 = sample_count.
_UPSERT_SOURCE_ACCURACY_SQL = """
INSERT INTO source_accuracy (source_id, accuracy_ratio, sample_count, last_updated)
VALUES ($1, $2, $3, NOW())
ON CONFLICT (source_id)
DO UPDATE SET
accuracy_ratio = EXCLUDED.accuracy_ratio,
sample_count = EXCLUDED.sample_count,
last_updated = NOW()
"""
# ---------------------------------------------------------------------------
# Database-backed function
# ---------------------------------------------------------------------------
async def update_source_reliabilities(
    pool: asyncpg.Pool,
) -> int:
    """Recompute every source's reliability score and persist it.

    Steps:
      1. Fetch per-source sample and win counts from v_source_performance.
      2. Turn each win rate into a Bayesian-shrinkage reliability.
      3. Upsert the reliability into source_accuracy as accuracy_ratio.

    A failed fetch is logged and reported as 0 updates. A failed upsert is
    logged and skipped, so the remaining sources are still processed.

    Returns the number of sources successfully upserted.
    """
    try:
        performance_rows = await pool.fetch(_SOURCE_PERFORMANCE_SQL)
    except Exception:
        logger.exception("Failed to query source performance for reliability update")
        return 0

    if not performance_rows:
        logger.info("No source performance data available for reliability update")
        return 0

    stored = 0
    for record in performance_rows:
        source = record["source"]
        samples = record["sample_count"]
        wins = record["win_count"]

        # Without samples there is no observed rate; fall back to 0.5.
        if samples > 0:
            win_rate = wins / samples
        else:
            win_rate = 0.5
        score = compute_source_reliability(win_rate, samples)

        try:
            await pool.execute(_UPSERT_SOURCE_ACCURACY_SQL, source, score, samples)
        except Exception:
            logger.exception(
                "Failed to upsert source reliability for source=%s", source
            )
        else:
            stored += 1

    logger.info("Updated source reliabilities for %d sources", stored)
    return stored
+637
View File
@@ -0,0 +1,637 @@
"""Metrics Engine — computes calibration, IC, Brier, and benchmark metrics.
Aggregates model quality metrics across configurable lookback windows and
prediction horizons. Stores periodic snapshots for time-series analysis
of model performance trends.
Requirements: 5.1, 5.2, 5.3, 5.4, 5.5, 5.6, 6.1, 6.2, 6.3, 6.4, 6.5,
9.1, 9.2, 9.3, 9.4, 10.1, 10.2, 10.3, 10.4, 10.5
"""
from __future__ import annotations
import json
import logging
import math
import uuid
from dataclasses import dataclass, field
from datetime import datetime, timedelta
import asyncpg
logger = logging.getLogger(__name__)
# ---------------------------------------------------------------------------
# Constants
# ---------------------------------------------------------------------------
# (low, high) bounds of the confidence buckets used for calibration.
CONFIDENCE_BUCKETS: list[tuple[float, float]] = [
    (0.50, 0.60),
    (0.60, 0.70),
    (0.70, 0.80),
    (0.80, 0.90),
    (0.90, 1.00),
]

# Lookback window label -> duration; None means no time restriction.
LOOKBACK_DURATIONS: dict[str, timedelta | None] = {
    "7d": timedelta(days=7),
    "30d": timedelta(days=30),
    "90d": timedelta(days=90),
    "all": None,
}

# Window labels, in the insertion order of LOOKBACK_DURATIONS.
LOOKBACK_WINDOWS: list[str] = list(LOOKBACK_DURATIONS)

# Prediction horizons.
EVALUATION_HORIZONS: list[str] = ["1h", "6h", "1d", "7d", "30d"]
# ---------------------------------------------------------------------------
# Dataclasses
# ---------------------------------------------------------------------------
@dataclass
class CalibrationBucket:
    """Calibration statistics of one confidence bucket.

    Attributes:
        bucket_low: Lower confidence bound of the bucket.
        bucket_high: Upper confidence bound of the bucket.
        avg_confidence: Mean confidence of the predictions in the bucket.
        observed_win_rate: Fraction of those predictions that were wins.
        prediction_count: Number of predictions in the bucket.
        miscalibrated: True when |avg_confidence - win_rate| > 0.15.
    """

    bucket_low: float
    bucket_high: float
    avg_confidence: float
    observed_win_rate: float
    prediction_count: int
    miscalibrated: bool
@dataclass
class ModelMetricSnapshot:
    """Model quality metrics aggregated for one lookback/horizon combination.

    Attributes:
        id: Snapshot identifier.
        generated_at: When the snapshot was computed.
        lookback_window: Lookback window label.
        horizon: Prediction horizon label.
        prediction_count: Number of predictions aggregated.
        win_rate: Fraction of predictions with a correct direction.
        directional_accuracy: Directional accuracy of the predictions.
        information_coefficient: IC, or None when not computed.
        rank_information_coefficient: Rank IC, or None when not computed.
        avg_return: Mean future return.
        avg_excess_return_vs_spy: Mean excess return versus SPY.
        avg_excess_return_vs_sector: Mean excess return versus the sector.
        calibration_error: Expected calibration error (ECE).
        brier_score: Brier score.
        buy_win_rate: Win rate of buy actions.
        sell_win_rate: Win rate of sell actions.
        hold_win_rate: Win rate of hold actions.
        metadata: Free-form extra data; a fresh empty dict per instance by
            default.
    """

    id: str
    generated_at: datetime
    lookback_window: str
    horizon: str
    prediction_count: int
    win_rate: float
    directional_accuracy: float
    information_coefficient: float | None
    rank_information_coefficient: float | None
    avg_return: float
    avg_excess_return_vs_spy: float
    avg_excess_return_vs_sector: float
    calibration_error: float
    brier_score: float
    buy_win_rate: float
    sell_win_rate: float
    hold_win_rate: float
    metadata: dict = field(default_factory=dict)
# ---------------------------------------------------------------------------
# Pure computation functions
# ---------------------------------------------------------------------------
def compute_calibration_error(
    confidences: list[float],
    outcomes: list[bool],
) -> tuple[float, list[CalibrationBucket]]:
    """Compute the expected calibration error (ECE) and per-bucket statistics.

    ECE = Σ (n_b / N) * |avg_conf_b - win_rate_b|

    Predictions are grouped into the buckets of CONFIDENCE_BUCKETS. Each
    bucket is half-open ``[low, high)`` except the last, which also includes
    its upper bound. N is the number of (confidence, outcome) pairs, so
    inputs of unequal length are weighted by the pairs actually evaluated.
    Confidences that fall in no bucket still count towards N.

    Args:
        confidences: Predicted confidence of each prediction.
        outcomes: Whether each prediction was a win, paired with
            ``confidences`` by position.

    Returns:
        ``(ece, buckets)`` with one CalibrationBucket per configured bucket.
        Empty buckets are reported with zeroed statistics and do not
        contribute to the ECE; a bucket is flagged miscalibrated when
        |avg_confidence - observed_win_rate| > 0.15. Returns ``(0.0, [])``
        when either input is empty.
    """
    if not confidences or not outcomes:
        return 0.0, []

    pairs = list(zip(confidences, outcomes))
    n = len(pairs)
    buckets: list[CalibrationBucket] = []
    ece = 0.0

    for low, high in CONFIDENCE_BUCKETS:
        # Last bucket is inclusive on the right: [0.90, 1.00]
        closed_right = high == 1.00
        members = [
            (conf, outcome)
            for conf, outcome in pairs
            if low <= conf and (conf <= high if closed_right else conf < high)
        ]
        count = len(members)

        if count == 0:
            # Empty bucket — exclude from ECE, still record it
            buckets.append(
                CalibrationBucket(
                    bucket_low=low,
                    bucket_high=high,
                    avg_confidence=0.0,
                    observed_win_rate=0.0,
                    prediction_count=0,
                    miscalibrated=False,
                )
            )
            continue

        avg_conf = sum(conf for conf, _ in members) / count
        win_rate = sum(1.0 for _, outcome in members if outcome) / count
        diff = abs(avg_conf - win_rate)
        buckets.append(
            CalibrationBucket(
                bucket_low=low,
                bucket_high=high,
                avg_confidence=avg_conf,
                observed_win_rate=win_rate,
                prediction_count=count,
                miscalibrated=diff > 0.15,
            )
        )
        ece += (count / n) * diff

    return ece, buckets
def compute_brier_score(
    p_bulls: list[float],
    outcomes: list[bool],
) -> float:
    """Return the Brier score: mean((p_bull - outcome)^2).

    Each outcome counts as 1.0 when truthy and 0.0 otherwise. The mean is
    taken over the (p_bull, outcome) pairs actually evaluated, so inputs of
    unequal length are averaged over their common prefix.

    NOTE(review): the score is only meaningful when ``outcomes`` describes
    the event ``p_bulls`` is the probability of — confirm against callers.

    Args:
        p_bulls: Predicted probabilities.
        outcomes: Realised outcomes, paired with ``p_bulls`` by position.

    Returns:
        The mean squared error, in [0.0, 1.0] for probabilities in
        [0.0, 1.0]. Returns 0.0 when either input is empty.
    """
    if not p_bulls or not outcomes:
        return 0.0
    total = 0.0
    pair_count = 0
    for p, o in zip(p_bulls, outcomes):
        actual = 1.0 if o else 0.0
        total += (p - actual) ** 2
        pair_count += 1
    # Both inputs are non-empty here, so at least one pair was counted.
    return total / pair_count
def _pearson_correlation(xs: list[float], ys: list[float]) -> float | None:
"""Compute Pearson correlation coefficient between two lists.
Returns None if the lists have fewer than 2 elements or if either
has zero variance. Guards against NaN/infinity.
"""
n = len(xs)
if n < 2:
return None
mean_x = sum(xs) / n
mean_y = sum(ys) / n
cov = 0.0
var_x = 0.0
var_y = 0.0
for x, y in zip(xs, ys):
dx = x - mean_x
dy = y - mean_y
cov += dx * dy
var_x += dx * dx
var_y += dy * dy
if var_x == 0.0 or var_y == 0.0:
return None
r = cov / math.sqrt(var_x * var_y)
# Guard against floating-point drift
if math.isnan(r) or math.isinf(r):
return None
# Clamp to [-1.0, 1.0]
return max(-1.0, min(1.0, r))
def _rank_data(values: list[float]) -> list[float]:
"""Compute fractional ranks for a list of values (average tie-breaking)."""
n = len(values)
indexed = sorted(range(n), key=lambda i: values[i])
ranks = [0.0] * n
i = 0
while i < n:
# Find the end of the tie group
j = i + 1
while j < n and values[indexed[j]] == values[indexed[i]]:
j += 1
# Average rank for the tie group (1-based)
avg_rank = (i + j + 1) / 2.0
for k in range(i, j):
ranks[indexed[k]] = avg_rank
i = j
return ranks
def compute_information_coefficient(
    scores: list[float],
    returns: list[float],
) -> float | None:
    """Return the Pearson correlation of prediction scores and future returns.

    The inputs are truncated to their common length first. The result lies
    in [-1.0, 1.0]; None is returned when either input holds fewer than 30
    data points.
    """
    sample_size = min(len(scores), len(returns))
    if sample_size < 30:
        return None
    return _pearson_correlation(scores[:sample_size], returns[:sample_size])
def compute_rank_information_coefficient(
    scores: list[float],
    returns: list[float],
) -> float | None:
    """Return the Spearman rank correlation of scores and future returns.

    Both inputs are truncated to their common length, converted to ranks,
    and the Pearson correlation of the ranks is returned. The result lies in
    [-1.0, 1.0]; None is returned when either input holds fewer than 30 data
    points.
    """
    sample_size = min(len(scores), len(returns))
    if sample_size < 30:
        return None
    score_ranks = _rank_data(scores[:sample_size])
    return_ranks = _rank_data(returns[:sample_size])
    return _pearson_correlation(score_ranks, return_ranks)
def compute_contribution_scores(
    weights: list[float],
) -> list[float]:
    """Normalise document weights into contribution scores.

    Each score is weight_i / sum(weights). When the weights sum to zero,
    every document receives an equal share of 1 / len(weights). An empty
    input yields an empty list.
    """
    if not weights:
        return []
    denominator = sum(weights)
    if denominator == 0.0:
        # Nothing to apportion by weight; split evenly.
        equal_share = 1.0 / len(weights)
        return [equal_share for _ in weights]
    return [weight / denominator for weight in weights]
def compute_hit_rate_improvement(win_rate: float) -> float:
    """Return the relative improvement of ``win_rate`` over a 50/50 baseline.

    Computed as (win_rate - 0.5) / 0.5.
    """
    baseline = 0.5
    edge = win_rate - baseline
    return edge / baseline
# ---------------------------------------------------------------------------
# SQL queries for v_prediction_performance view
# ---------------------------------------------------------------------------
# Prediction/outcome rows from v_prediction_performance for one horizon, with
# no time restriction. Parameters: $1 = horizon.
_PERFORMANCE_DATA_SQL = """
SELECT
ticker,
direction,
action,
confidence,
strength,
p_bull,
score_company,
score_macro,
score_competitive,
future_return,
excess_return_vs_spy,
excess_return_vs_sector,
direction_correct,
profitable,
horizon,
generated_at
FROM v_prediction_performance
WHERE horizon = $1
"""
# Same projection restricted to rows generated at or after a lower bound.
# Parameters: $1 = horizon, $2 = lower bound compared against generated_at.
_PERFORMANCE_DATA_WITH_LOOKBACK_SQL = """
SELECT
ticker,
direction,
action,
confidence,
strength,
p_bull,
score_company,
score_macro,
score_competitive,
future_return,
excess_return_vs_spy,
excess_return_vs_sector,
direction_correct,
profitable,
horizon,
generated_at
FROM v_prediction_performance
WHERE horizon = $1
AND generated_at >= $2
"""
# Inserts one row into model_metric_snapshots. The 18 placeholders follow the
# column list in order; $1 is cast to uuid and $18 to jsonb.
_INSERT_METRIC_SNAPSHOT_SQL = """
INSERT INTO model_metric_snapshots (
id, generated_at, lookback_window, horizon,
prediction_count, win_rate, directional_accuracy,
information_coefficient, rank_information_coefficient,
avg_return, avg_excess_return_vs_spy, avg_excess_return_vs_sector,
calibration_error, brier_score,
buy_win_rate, sell_win_rate, hold_win_rate,
metadata
) VALUES (
$1::uuid, $2, $3, $4,
$5, $6, $7,
$8, $9,
$10, $11, $12,
$13, $14,
$15, $16, $17,
$18::jsonb
)
"""
# ---------------------------------------------------------------------------
# Metric computation from raw rows
# ---------------------------------------------------------------------------
def _compute_metrics_from_rows(
    rows: list[dict],
    lookback_window: str,
    horizon: str,
) -> ModelMetricSnapshot:
    """Aggregate prediction-performance rows into one ``ModelMetricSnapshot``.

    Args:
        rows: Row dicts. The keys read here are ``action``, ``confidence``,
            ``strength``, ``p_bull``, ``future_return``,
            ``excess_return_vs_spy``, ``excess_return_vs_sector`` and
            ``direction_correct``; missing keys are treated like NULLs.
        lookback_window: Lookback label, stored on the snapshot unchanged.
        horizon: Horizon label, stored on the snapshot unchanged.

    Returns:
        A snapshot carrying a fresh UUID and the current local-timezone
        timestamp. With no rows every rate/average is 0.0, both information
        coefficients are ``None`` and ``metadata`` is empty.
    """
    generated_at = datetime.now().astimezone()
    snapshot_id = str(uuid.uuid4())
    prediction_count = len(rows)

    if not prediction_count:
        return ModelMetricSnapshot(
            id=snapshot_id,
            generated_at=generated_at,
            lookback_window=lookback_window,
            horizon=horizon,
            prediction_count=0,
            win_rate=0.0,
            directional_accuracy=0.0,
            information_coefficient=None,
            rank_information_coefficient=None,
            avg_return=0.0,
            avg_excess_return_vs_spy=0.0,
            avg_excess_return_vs_sector=0.0,
            calibration_error=0.0,
            brier_score=0.0,
            buy_win_rate=0.0,
            sell_win_rate=0.0,
            hold_win_rate=0.0,
            metadata={},
        )

    def _hit(row: dict) -> bool:
        # Only a literal True counts; None/missing/False are all misses.
        return row.get("direction_correct") is True

    def _hit_rate(subset: list[dict]) -> float:
        if not subset:
            return 0.0
        return sum(1 for row in subset if _hit(row)) / len(subset)

    def _with_action(label: str) -> list[dict]:
        return [row for row in rows if (row.get("action") or "").lower() == label]

    def _non_null(column: str) -> list:
        return [row[column] for row in rows if row.get(column) is not None]

    def _mean(values: list) -> float:
        return sum(values) / len(values) if values else 0.0

    # --- Win rate and directional accuracy (same number, two names) ---
    win_rate = _hit_rate(rows)
    directional_accuracy = win_rate

    # --- Per-action win rates ---
    buy_rows = _with_action("buy")
    sell_rows = _with_action("sell")
    hold_rows = _with_action("hold")

    # --- Average raw and excess returns (Requirements 9.1, 9.2) ---
    returns_list = _non_null("future_return")
    excess_spy_list = _non_null("excess_return_vs_spy")
    excess_sector_list = _non_null("excess_return_vs_sector")

    # --- Calibration error (ECE) (Requirements 5.1, 5.2, 5.3, 5.5) ---
    # Confidence and outcome lists stay aligned because both are derived
    # from the same filtered row list.
    rows_with_confidence = [
        row for row in rows if row.get("confidence") is not None
    ]
    ece, _buckets = compute_calibration_error(
        [row["confidence"] for row in rows_with_confidence],
        [_hit(row) for row in rows_with_confidence],
    )

    # --- Brier score (Requirement 5.4) ---
    # NOTE(review): p_bull is scored against direction_correct rather than
    # against "the price went up" — confirm this is the intended outcome
    # definition for bearish predictions.
    rows_with_p_bull = [row for row in rows if row.get("p_bull") is not None]
    brier = compute_brier_score(
        [row["p_bull"] for row in rows_with_p_bull],
        [_hit(row) for row in rows_with_p_bull],
    )

    # --- Information coefficients (Requirements 6.1, 6.2, 6.5) ---
    # Only rows that have both a strength and a realized return take part.
    ic_pairs = [
        (row["strength"], row["future_return"])
        for row in rows
        if row.get("strength") is not None
        and row.get("future_return") is not None
    ]
    ic_scores = [strength for strength, _ in ic_pairs]
    ic_returns = [realized for _, realized in ic_pairs]
    ic = compute_information_coefficient(ic_scores, ic_returns)
    rank_ic = compute_rank_information_coefficient(ic_scores, ic_returns)

    # --- Metadata (Requirements 9.4, 10.5) ---
    metadata: dict = {
        "hit_rate_improvement": compute_hit_rate_improvement(win_rate),
        "buy_count": len(buy_rows),
        "sell_count": len(sell_rows),
        "hold_count": len(hold_rows),
        "returns_count": len(returns_list),
        "excess_spy_count": len(excess_spy_list),
        "excess_sector_count": len(excess_sector_list),
    }

    return ModelMetricSnapshot(
        id=snapshot_id,
        generated_at=generated_at,
        lookback_window=lookback_window,
        horizon=horizon,
        prediction_count=prediction_count,
        win_rate=win_rate,
        directional_accuracy=directional_accuracy,
        information_coefficient=ic,
        rank_information_coefficient=rank_ic,
        avg_return=_mean(returns_list),
        avg_excess_return_vs_spy=_mean(excess_spy_list),
        avg_excess_return_vs_sector=_mean(excess_sector_list),
        calibration_error=ece,
        brier_score=brier,
        buy_win_rate=_hit_rate(buy_rows),
        sell_win_rate=_hit_rate(sell_rows),
        hold_win_rate=_hit_rate(hold_rows),
        metadata=metadata,
    )
# ---------------------------------------------------------------------------
# Main entry point (Requirements 10.1, 10.2, 10.3, 10.4, 10.5)
# ---------------------------------------------------------------------------
async def compute_and_store_metric_snapshots(
    pool: asyncpg.Pool,
) -> list[ModelMetricSnapshot]:
    """Compute and persist one metric snapshot per lookback/horizon pair.

    Iterates every combination of ``LOOKBACK_WINDOWS`` and
    ``EVALUATION_HORIZONS``. For each pair it queries
    ``v_prediction_performance`` (bounded by ``generated_at`` unless the
    lookback has no duration), computes the metrics and inserts a row into
    ``model_metric_snapshots``.

    A failure in any single combination is logged and skipped, so the
    returned list only contains the snapshots that were stored successfully.
    """
    # One reference time for the whole run keeps all cutoffs consistent.
    reference_time = datetime.now().astimezone()
    stored: list[ModelMetricSnapshot] = []

    for lookback in LOOKBACK_WINDOWS:
        window_length = LOOKBACK_DURATIONS[lookback]
        for horizon in EVALUATION_HORIZONS:
            try:
                if window_length is None:
                    records = await pool.fetch(_PERFORMANCE_DATA_SQL, horizon)
                else:
                    records = await pool.fetch(
                        _PERFORMANCE_DATA_WITH_LOOKBACK_SQL,
                        horizon,
                        reference_time - window_length,
                    )

                # asyncpg Records are converted to plain dicts before use.
                snapshot = _compute_metrics_from_rows(
                    [dict(record) for record in records], lookback, horizon
                )

                insert_args = (
                    snapshot.id,
                    snapshot.generated_at,
                    snapshot.lookback_window,
                    snapshot.horizon,
                    snapshot.prediction_count,
                    snapshot.win_rate,
                    snapshot.directional_accuracy,
                    snapshot.information_coefficient,
                    snapshot.rank_information_coefficient,
                    snapshot.avg_return,
                    snapshot.avg_excess_return_vs_spy,
                    snapshot.avg_excess_return_vs_sector,
                    snapshot.calibration_error,
                    snapshot.brier_score,
                    snapshot.buy_win_rate,
                    snapshot.sell_win_rate,
                    snapshot.hold_win_rate,
                    json.dumps(snapshot.metadata),
                )
                await pool.execute(_INSERT_METRIC_SNAPSHOT_SQL, *insert_args)
            except Exception:
                logger.exception(
                    "Failed to compute metrics for lookback=%s horizon=%s",
                    lookback,
                    horizon,
                )
            else:
                stored.append(snapshot)

    logger.info(
        "Computed %d metric snapshots across %d lookback/horizon combinations",
        len(stored),
        len(LOOKBACK_WINDOWS) * len(EVALUATION_HORIZONS),
    )
    return stored
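# ---------------------------------------------------------------------------
# [diff boundary] A new file begins here (+414 lines, hunk @@ -0,0 +1,414 @@).
# ---------------------------------------------------------------------------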
"""Outcome Evaluator — matches predictions with realized market outcomes.
Runs periodically to evaluate prediction snapshots whose horizon has elapsed.
For each snapshot, fetches future prices at the horizon endpoint and computes
returns, excess returns, directional accuracy, and profitability across all
five evaluation horizons (1h, 6h, 1d, 7d, 30d).
Requirements: 4.1, 4.2, 4.3, 4.4, 4.5, 4.6, 4.7, 4.8, 4.9, 4.10
"""
from __future__ import annotations
import json
import logging
import uuid
from dataclasses import dataclass, field
from datetime import datetime, timedelta
import asyncpg
logger = logging.getLogger(__name__)
# ---------------------------------------------------------------------------
# Constants
# ---------------------------------------------------------------------------
# Maps each evaluation-horizon label to the elapsed time it stands for.
# evaluate_matured_predictions iterates this mapping and passes the label as
# the stored `horizon` value and the timedelta as the SQL interval, while
# evaluate_single_prediction adds the timedelta to a snapshot's generated_at
# to obtain the time at which the future price is read.
HORIZON_DURATIONS: dict[str, timedelta] = {
"1h": timedelta(hours=1),
"6h": timedelta(hours=6),
"1d": timedelta(days=1),
"7d": timedelta(days=7),
"30d": timedelta(days=30),
}
# ---------------------------------------------------------------------------
# Dataclasses
# ---------------------------------------------------------------------------
@dataclass
class PredictionOutcome:
    """What actually happened to one prediction at one evaluation horizon.

    The ticker's own ``future_price`` / ``future_return`` are always present.
    The SPY and sector-ETF prices, returns and the two excess-return fields
    are ``None`` whenever the corresponding benchmark could not be evaluated.
    """

    # Identity
    id: str  # UUID of this outcome row
    prediction_id: str  # id of the prediction snapshot being evaluated
    evaluated_at: datetime
    horizon: str  # one of: 1h, 6h, 1d, 7d, 30d

    # Ticker outcome
    future_price: float
    future_return: float

    # Benchmarks (optional)
    spy_future_price: float | None
    spy_return: float | None
    sector_etf_future_price: float | None
    sector_etf_return: float | None
    excess_return_vs_spy: float | None
    excess_return_vs_sector: float | None

    # Verdicts
    direction_correct: bool
    profitable: bool

    # Free-form extras; a fresh dict per instance
    metadata: dict = field(default_factory=dict)
# ---------------------------------------------------------------------------
# SQL statements
# ---------------------------------------------------------------------------
# Find matured predictions: snapshots where generated_at + horizon_duration <= NOW()
# and no outcome has been recorded yet for that (prediction_id, horizon) pair.
# We evaluate ALL 5 horizons for each snapshot, not just the snapshot's own horizon.
# Parameters: $1 = horizon duration (cast to interval), $2 = horizon label
# used to check prediction_outcomes for an already-recorded outcome.
# The snapshot's own horizon column is returned as snapshot_horizon but is
# not part of the filter.
_MATURED_PREDICTIONS_SQL = """
SELECT
ps.id,
ps.generated_at,
ps.ticker,
ps.horizon AS snapshot_horizon,
ps.direction,
ps.action,
ps.price_at_prediction,
ps.spy_price_at_prediction,
ps.sector_etf_price_at_prediction
FROM prediction_snapshots ps
WHERE ps.generated_at + $1::interval <= NOW()
AND NOT EXISTS (
SELECT 1 FROM prediction_outcomes po
WHERE po.prediction_id = ps.id AND po.horizon = $2
)
"""
# Fetch the close price for a ticker at or before a specific time.
# Uses the closest bar before or at the target time.
# Parameters: $1 = ticker, $2 = target timestamp (inclusive upper bound).
# Only 'bar' snapshots whose JSON payload has a 'c' (close) field qualify.
# NOTE(review): there is no lower bound on captured_at, so when no bar exists
# near the target time an arbitrarily old bar (possibly older than the
# prediction itself) is returned instead of "no data" — confirm whether the
# staleness of the matched bar should be limited.
_CLOSE_AT_TIME_SQL = """
SELECT (data->>'c')::float AS close
FROM market_snapshots
WHERE ticker = $1
AND snapshot_type = 'bar'
AND data->>'c' IS NOT NULL
AND captured_at <= $2
ORDER BY captured_at DESC
LIMIT 1
"""
# Inserts one row into prediction_outcomes.
# Placeholders $1..$15 are positional and follow the column list order;
# $1 and $2 are cast to uuid and $15 (metadata) is cast to jsonb, so ids are
# passed as text and metadata as a JSON string.
_INSERT_OUTCOME_SQL = """
INSERT INTO prediction_outcomes (
id, prediction_id, evaluated_at, horizon,
future_price, future_return,
spy_future_price, spy_return,
sector_etf_future_price, sector_etf_return,
excess_return_vs_spy, excess_return_vs_sector,
direction_correct, profitable,
metadata
) VALUES (
$1::uuid, $2::uuid, $3, $4,
$5, $6,
$7, $8,
$9, $10,
$11, $12,
$13, $14,
$15::jsonb
)
"""
# ---------------------------------------------------------------------------
# Price fetching at a specific time
# ---------------------------------------------------------------------------
async def _fetch_close_at_time(
    pool: asyncpg.Pool,
    ticker: str,
    target_time: datetime,
) -> float | None:
    """Return the close of the latest bar captured at or before ``target_time``.

    Runs ``_CLOSE_AT_TIME_SQL`` for ``ticker``. The result is ``None`` when the
    query matches no row.
    """
    latest_bar = await pool.fetchrow(_CLOSE_AT_TIME_SQL, ticker, target_time)
    return latest_bar["close"] if latest_bar is not None else None
# ---------------------------------------------------------------------------
# Sector ETF lookup (reuse pattern from prediction_snapshot)
# ---------------------------------------------------------------------------
# Maps a company's sector name (as stored in companies.sector) to the ticker
# of the sector ETF used as its benchmark. Lookups are exact-match via
# dict.get, so a sector that is not listed here yields no benchmark.
_SECTOR_ETF_MAP: dict[str, str] = {
"Technology": "XLK",
"Consumer Cyclical": "XLY",
"Financial Services": "XLF",
"Healthcare": "XLV",
"Energy": "XLE",
"Communication Services": "XLC",
"Industrials": "XLI",
"Consumer Defensive": "XLP",
"Real Estate": "XLRE",
"Utilities": "XLU",
}
# Fetches the sector of the active company row for a ticker.
# Parameters: $1 = company ticker. At most one row is returned.
_COMPANY_SECTOR_SQL = """
SELECT sector FROM companies WHERE ticker = $1 AND active = TRUE LIMIT 1
"""
async def _fetch_sector_etf_ticker(pool: asyncpg.Pool, ticker: str) -> str | None:
    """Resolve the benchmark sector-ETF ticker for a company ticker.

    Returns ``None`` when no active company row exists, when its sector is
    NULL, or when the sector has no entry in ``_SECTOR_ETF_MAP``.
    """
    company = await pool.fetchrow(_COMPANY_SECTOR_SQL, ticker)
    sector = company["sector"] if company is not None else None
    if sector is None:
        return None
    return _SECTOR_ETF_MAP.get(sector)
# ---------------------------------------------------------------------------
# Return computation helpers
# ---------------------------------------------------------------------------
def _compute_return(current_price: float, future_price: float) -> float:
"""Compute simple return: (future - current) / current."""
if current_price == 0.0:
return 0.0
return (future_price - current_price) / current_price
def _is_direction_correct(direction: str, future_return: float) -> bool:
"""Determine if the predicted direction matches the realized return.
bullish + positive return = True
bearish + negative return = True
All other combinations = False
"""
direction_lower = direction.lower()
if direction_lower == "bullish" and future_return > 0.0:
return True
if direction_lower == "bearish" and future_return < 0.0:
return True
return False
def _is_profitable(action: str, future_return: float) -> bool:
"""Determine if the predicted action would have been profitable.
buy + positive return = True
sell + negative return = True
All other combinations = False
"""
action_lower = action.lower()
if action_lower == "buy" and future_return > 0.0:
return True
if action_lower == "sell" and future_return < 0.0:
return True
return False
# ---------------------------------------------------------------------------
# Single prediction evaluation (Requirements 4.2-4.7)
# ---------------------------------------------------------------------------
async def evaluate_single_prediction(
    pool: asyncpg.Pool,
    snapshot: dict,
    horizon: str,
) -> PredictionOutcome | None:
    """Evaluate one prediction snapshot at one horizon.

    Reads the close price at ``generated_at + HORIZON_DURATIONS[horizon]`` for
    the ticker, SPY and the ticker's sector ETF, then derives returns, excess
    returns, direction correctness and profitability.

    Args:
        pool: asyncpg connection pool used for the price and sector lookups.
        snapshot: Row dict with the keys ``id``, ``generated_at``, ``ticker``,
            ``direction``, ``action``, ``price_at_prediction``,
            ``spy_price_at_prediction`` and ``sector_etf_price_at_prediction``.
        horizon: A key of ``HORIZON_DURATIONS``.

    Returns:
        The computed outcome, or ``None`` when the snapshot has no usable
        ``price_at_prediction`` (NULL or zero) or when no future price is
        available for the ticker (Requirement 4.10). Benchmark fields are left
        as ``None`` when their baseline or future price is missing.

    Raises:
        KeyError: If ``horizon`` is not in ``HORIZON_DURATIONS`` or a required
            snapshot key is absent.
    """
    ticker = snapshot["ticker"]
    target_time = snapshot["generated_at"] + HORIZON_DURATIONS[horizon]

    # Check the baseline before touching the database: a snapshot without a
    # usable baseline never gets an outcome row, so it is selected again on
    # every run and would otherwise cost a price query each time.
    price_at_prediction = snapshot["price_at_prediction"]
    if price_at_prediction is None or price_at_prediction == 0.0:
        logger.warning(
            "Price at prediction is NULL or zero for snapshot %s, skipping horizon %s",
            snapshot["id"],
            horizon,
        )
        return None

    # The ticker's own future price is mandatory — skip if unavailable.
    future_price = await _fetch_close_at_time(pool, ticker, target_time)
    if future_price is None:
        logger.debug(
            "Future price unavailable for %s at horizon %s (target %s), skipping",
            ticker,
            horizon,
            target_time,
        )
        return None

    # Ticker return (Requirement 4.2)
    future_return = _compute_return(price_at_prediction, future_price)

    # SPY return (Requirement 4.3) — only when a non-zero baseline was stored.
    spy_future_price: float | None = None
    spy_return: float | None = None
    spy_price_at_prediction = snapshot["spy_price_at_prediction"]
    if spy_price_at_prediction is not None and spy_price_at_prediction != 0.0:
        spy_future_price = await _fetch_close_at_time(pool, "SPY", target_time)
        if spy_future_price is not None:
            spy_return = _compute_return(spy_price_at_prediction, spy_future_price)

    # Sector ETF return (Requirement 4.4).
    # NOTE(review): the ETF ticker is resolved from the companies table at
    # evaluation time; presumably it matches the ETF whose price was stored
    # at prediction time — confirm the sector mapping cannot change between.
    sector_etf_future_price: float | None = None
    sector_etf_return: float | None = None
    sector_etf_price_at_prediction = snapshot["sector_etf_price_at_prediction"]
    if (
        sector_etf_price_at_prediction is not None
        and sector_etf_price_at_prediction != 0.0
    ):
        sector_etf_ticker = await _fetch_sector_etf_ticker(pool, ticker)
        if sector_etf_ticker is not None:
            sector_etf_future_price = await _fetch_close_at_time(
                pool, sector_etf_ticker, target_time
            )
            if sector_etf_future_price is not None:
                sector_etf_return = _compute_return(
                    sector_etf_price_at_prediction, sector_etf_future_price
                )

    # Excess returns (Requirement 4.5). future_return is always a float at
    # this point, so only the benchmark return decides availability.
    excess_return_vs_spy: float | None = None
    if spy_return is not None:
        excess_return_vs_spy = future_return - spy_return

    excess_return_vs_sector: float | None = None
    if sector_etf_return is not None:
        excess_return_vs_sector = future_return - sector_etf_return

    return PredictionOutcome(
        id=str(uuid.uuid4()),
        prediction_id=str(snapshot["id"]),
        evaluated_at=datetime.now().astimezone(),
        horizon=horizon,
        future_price=future_price,
        future_return=future_return,
        spy_future_price=spy_future_price,
        spy_return=spy_return,
        sector_etf_future_price=sector_etf_future_price,
        sector_etf_return=sector_etf_return,
        excess_return_vs_spy=excess_return_vs_spy,
        excess_return_vs_sector=excess_return_vs_sector,
        # Requirements 4.6 and 4.7
        direction_correct=_is_direction_correct(snapshot["direction"], future_return),
        profitable=_is_profitable(snapshot["action"], future_return),
        metadata={
            "ticker": ticker,
            "horizon": horizon,
            "price_at_prediction": price_at_prediction,
            "future_price": future_price,
        },
    )
# ---------------------------------------------------------------------------
# Store outcome (Requirement 4.9)
# ---------------------------------------------------------------------------
async def _store_outcome(
    conn: asyncpg.Connection,
    outcome: PredictionOutcome,
) -> None:
    """Insert one prediction outcome row using the given connection.

    Values are passed in the positional order expected by
    ``_INSERT_OUTCOME_SQL``; ``metadata`` is serialized to a JSON string.
    """
    insert_args = (
        outcome.id,
        outcome.prediction_id,
        outcome.evaluated_at,
        outcome.horizon,
        outcome.future_price,
        outcome.future_return,
        outcome.spy_future_price,
        outcome.spy_return,
        outcome.sector_etf_future_price,
        outcome.sector_etf_return,
        outcome.excess_return_vs_spy,
        outcome.excess_return_vs_sector,
        outcome.direction_correct,
        outcome.profitable,
        json.dumps(outcome.metadata),
    )
    await conn.execute(_INSERT_OUTCOME_SQL, *insert_args)
# ---------------------------------------------------------------------------
# Main entry point (Requirements 4.1, 4.8, 4.9, 4.10)
# ---------------------------------------------------------------------------
async def evaluate_matured_predictions(
    pool: asyncpg.Pool,
) -> int:
    """Evaluate every matured snapshot/horizon pair that has no outcome yet.

    For each entry of ``HORIZON_DURATIONS`` the matured snapshots are
    selected with ``_MATURED_PREDICTIONS_SQL`` and evaluated one by one. A
    pair whose evaluation returns ``None`` is skipped without recording
    anything, so it is picked up again on the next run (Requirement 4.10).
    An exception for a single snapshot is logged and does not stop the run.

    Returns:
        The number of outcomes that were stored.
    """
    recorded = 0

    for horizon, duration in HORIZON_DURATIONS.items():
        matured = await pool.fetch(_MATURED_PREDICTIONS_SQL, duration, horizon)
        if matured:
            logger.info(
                "Found %d matured predictions for horizon %s", len(matured), horizon
            )

        for record in matured:
            snapshot = dict(record)
            try:
                outcome = await evaluate_single_prediction(pool, snapshot, horizon)
                if outcome is not None:
                    # Each outcome is written in its own transaction.
                    async with pool.acquire() as conn, conn.transaction():
                        await _store_outcome(conn, outcome)
                    recorded += 1
            except Exception:
                logger.exception(
                    "Failed to evaluate snapshot %s at horizon %s",
                    snapshot["id"],
                    horizon,
                )

    logger.info("Outcome evaluation complete: %d outcomes recorded", recorded)
    return recorded
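# ---------------------------------------------------------------------------
# [diff boundary] A new file begins here (+540 lines, hunk @@ -0,0 +1,540 @@).
# ---------------------------------------------------------------------------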
"""Prediction Snapshot Writer — captures immutable prediction state at generation time.
Creates frozen records of every recommendation with prices, evidence links,
duplicate detection, and contribution scores so that predictions can be
evaluated against future outcomes without hindsight bias.
Requirements: 1.1, 1.2, 1.3, 1.4, 1.5, 1.6, 1.7, 2.1, 2.2, 2.3, 2.4, 2.5, 2.6, 3.1, 3.2, 3.3, 3.4
"""
from __future__ import annotations
import hashlib
import json
import logging
import urllib.parse
import uuid
from dataclasses import dataclass, field
from datetime import datetime
import asyncpg
from services.shared.schemas import Recommendation, TrendSummary
logger = logging.getLogger(__name__)
# ---------------------------------------------------------------------------
# Constants
# ---------------------------------------------------------------------------
# Maps a company's sector name (as stored in companies.sector) to the ticker
# of the sector ETF whose price is captured alongside the prediction.
# Lookups are exact-match via dict.get; unlisted sectors have no ETF.
SECTOR_ETF_MAP: dict[str, str] = {
"Technology": "XLK",
"Consumer Cyclical": "XLY",
"Financial Services": "XLF",
"Healthcare": "XLV",
"Energy": "XLE",
"Communication Services": "XLC",
"Industrials": "XLI",
"Consumer Defensive": "XLP",
"Real Estate": "XLRE",
"Utilities": "XLU",
}
# Labels of the horizons at which predictions are evaluated.
EVALUATION_HORIZONS: list[str] = ["1h", "6h", "1d", "7d", "30d"]
# Upper bound applied (via min) to each evidence signal's weight before
# contribution scores are computed in create_prediction_snapshot.
MAX_SINGLE_DOCUMENT_WEIGHT: float = 1.0
# ---------------------------------------------------------------------------
# Dataclasses
# ---------------------------------------------------------------------------
@dataclass
class PredictionSnapshot:
    """State of a prediction as captured at generation time.

    Intended to be treated as immutable once written; note that the
    dataclass itself is not declared frozen, so this is a convention only.
    The three price fields are ``None`` when no market data was available.
    """

    # Identity and scope
    id: str  # UUID
    generated_at: datetime
    ticker: str
    window: str
    horizon: str

    # The call that was made
    direction: str  # bullish/bearish/mixed/neutral
    action: str  # buy/sell/hold/watch
    mode: str  # informational/paper_eligible/live_eligible

    # Signal quality
    strength: float
    confidence: float
    contradiction: float
    p_bull: float | None
    p_bear: float | None

    # Share of evidence weight per layer
    score_company: float
    score_macro: float
    score_competitive: float

    # Evidence bookkeeping
    evidence_count: int
    unique_source_count: int
    duplicate_evidence_count: int

    # Baseline prices at generation time
    price_at_prediction: float | None
    spy_price_at_prediction: float | None
    sector_etf_price_at_prediction: float | None

    # Free-form extras; a fresh dict per instance
    metadata: dict = field(default_factory=dict)
@dataclass
class SignalEvidenceLink:
    """One piece of evidence attached to a prediction snapshot.

    Each link ties a prediction to a document/signal pair and records how
    much that evidence counted towards the prediction.
    """

    # Identity
    id: str  # UUID
    prediction_id: str
    document_id: str
    signal_id: str
    ticker: str

    # Signal description
    source: str
    source_type: str
    catalyst_type: str
    sentiment: str
    impact: float
    extraction_confidence: float

    # Weighting and de-duplication
    weight: float  # capped at MAX_SINGLE_DOCUMENT_WEIGHT
    is_duplicate: bool
    canonical_evidence_key: str
    contribution_score: float  # weight / total_weight, sums to 1.0

    # Free-form extras; a fresh dict per instance
    metadata: dict = field(default_factory=dict)
# ---------------------------------------------------------------------------
# Canonical evidence key computation (Requirements 2.3, 17.4)
# ---------------------------------------------------------------------------
def compute_canonical_evidence_key(title: str, url: str) -> str:
    """Return the SHA-256 hex digest identifying a piece of evidence.

    The digest is taken over the normalized title immediately followed by
    the normalized URL (UTF-8 encoded):
    - title: surrounding whitespace removed, then lowercased
    - URL: lowercased, then reduced to scheme, host and path — params,
      query string and fragment are dropped
    """
    clean_title = title.strip().lower()
    scheme, netloc, path, *_dropped = urllib.parse.urlparse(url.lower())
    bare_url = urllib.parse.urlunparse((scheme, netloc, path, "", "", ""))
    fingerprint_source = f"{clean_title}{bare_url}"
    return hashlib.sha256(fingerprint_source.encode("utf-8")).hexdigest()
# ---------------------------------------------------------------------------
# Contribution score computation (Requirements 2.5, 17.7)
# ---------------------------------------------------------------------------
def compute_contribution_scores(weights: list[float]) -> list[float]:
    """Turn document weights into shares of the total weight.

    Each score is ``weight_i / sum(weights)``. When the weights sum to zero
    every entry receives an equal share ``1 / len(weights)`` instead. An
    empty input produces an empty list.
    """
    if not weights:
        return []

    weight_sum = sum(weights)
    if weight_sum == 0.0:
        # Nothing to apportion by — fall back to an even split.
        even_share = 1.0 / len(weights)
        return [even_share for _ in weights]

    return [weight / weight_sum for weight in weights]
# ---------------------------------------------------------------------------
# Price fetching (Requirements 1.2, 1.3, 1.4, 1.5)
# ---------------------------------------------------------------------------
# Reads the close ('c' field of the JSON payload) of the most recently
# captured 'bar' snapshot for a ticker.
# Parameters: $1 = ticker. At most one row is returned.
_LATEST_CLOSE_SQL = """
SELECT (data->>'c')::float AS close
FROM market_snapshots
WHERE ticker = $1 AND snapshot_type = 'bar' AND data->>'c' IS NOT NULL
ORDER BY captured_at DESC
LIMIT 1
"""
async def fetch_latest_close_price(
    pool: asyncpg.Pool,
    ticker: str,
) -> float | None:
    """Return the close of the newest bar stored for ``ticker``.

    Runs ``_LATEST_CLOSE_SQL``. The result is ``None`` when the query matches
    no row.
    """
    newest_bar = await pool.fetchrow(_LATEST_CLOSE_SQL, ticker)
    return newest_bar["close"] if newest_bar is not None else None
# ---------------------------------------------------------------------------
# Sector ETF lookup
# ---------------------------------------------------------------------------
# Fetches the sector of the active company row for a ticker.
# Parameters: $1 = company ticker. At most one row is returned.
_COMPANY_SECTOR_SQL = """
SELECT sector FROM companies WHERE ticker = $1 AND active = TRUE LIMIT 1
"""
async def _fetch_sector_etf_ticker(pool: asyncpg.Pool, ticker: str) -> str | None:
    """Resolve the sector-ETF ticker for a company ticker.

    Returns ``None`` when no active company row exists, when its sector is
    NULL, or when the sector has no entry in ``SECTOR_ETF_MAP``.
    """
    company = await pool.fetchrow(_COMPANY_SECTOR_SQL, ticker)
    sector = company["sector"] if company is not None else None
    if sector is None:
        return None
    return SECTOR_ETF_MAP.get(sector)
# ---------------------------------------------------------------------------
# Layer score computation
# ---------------------------------------------------------------------------
def _compute_layer_scores(
evidence_signals: list[dict],
) -> tuple[float, float, float]:
"""Compute company, macro, and competitive layer scores from evidence signals.
Each signal's source_type determines its layer:
- company: news_api, filings_api, web_scrape
- macro: macro events (source_type containing 'macro')
- competitive: competitive signals (source_type containing 'competitive' or 'pattern')
Returns (score_company, score_macro, score_competitive) as fractions summing to 1.0.
"""
company_weight = 0.0
macro_weight = 0.0
competitive_weight = 0.0
for sig in evidence_signals:
w = sig.get("weight", 0.0)
source_type = sig.get("source_type", "").lower()
catalyst_type = sig.get("catalyst_type", "").lower()
if "macro" in source_type or catalyst_type == "macro":
macro_weight += w
elif "competitive" in source_type or "pattern" in source_type:
competitive_weight += w
else:
company_weight += w
total = company_weight + macro_weight + competitive_weight
if total == 0.0:
return (0.0, 0.0, 0.0)
return (
round(company_weight / total, 6),
round(macro_weight / total, 6),
round(competitive_weight / total, 6),
)
# ---------------------------------------------------------------------------
# SQL statements
# ---------------------------------------------------------------------------
_INSERT_SNAPSHOT_SQL = """
INSERT INTO prediction_snapshots (
id, generated_at, ticker, window, horizon, direction, action, mode,
strength, confidence, contradiction, p_bull, p_bear,
score_company, score_macro, score_competitive,
evidence_count, unique_source_count, duplicate_evidence_count,
price_at_prediction, spy_price_at_prediction, sector_etf_price_at_prediction,
metadata
) VALUES (
$1::uuid, $2, $3, $4, $5, $6, $7, $8,
$9, $10, $11, $12, $13,
$14, $15, $16,
$17, $18, $19,
$20, $21, $22,
$23::jsonb
)
"""
# Inserts one row into signal_evidence_links.
# Placeholders $1..$16 are positional and follow the column list order;
# $1 and $2 are cast to uuid and $16 (metadata) is cast to jsonb.
_INSERT_EVIDENCE_LINK_SQL = """
INSERT INTO signal_evidence_links (
id, prediction_id, document_id, signal_id, ticker,
source, source_type, catalyst_type, sentiment,
impact, extraction_confidence, weight,
is_duplicate, canonical_evidence_key, contribution_score,
metadata
) VALUES (
$1::uuid, $2::uuid, $3, $4, $5,
$6, $7, $8, $9,
$10, $11, $12,
$13, $14, $15,
$16::jsonb
)
"""
# ---------------------------------------------------------------------------
# Main entry point (Requirements 1.1-1.7, 2.1-2.6, 3.1-3.4)
# ---------------------------------------------------------------------------
async def create_prediction_snapshot(
    pool: asyncpg.Pool,
    recommendation: Recommendation,
    trend_summary: TrendSummary,
    evidence_signals: list[dict],
    evidence_docs: list[dict],
) -> PredictionSnapshot:
    """Create and persist a prediction snapshot together with its evidence links.

    Steps:
    1. Fetch current prices (ticker, SPY, sector ETF) from market_snapshots
    2. Compute canonical evidence keys and detect duplicates
    3. Clamp individual document weights to MAX_SINGLE_DOCUMENT_WEIGHT
    4. Compute contribution scores (one-vote-per-canonical-key dedup)
    5. Persist snapshot and evidence links in a single transaction

    Signals whose document has neither a title nor a URL (including signals
    whose document is absent from ``evidence_docs``) are keyed by their
    ``document_id`` — falling back to ``signal_id``, then to the link's own
    id — so that unrelated signals are not mistaken for duplicates of each
    other merely because their metadata is missing.

    Args:
        pool: asyncpg connection pool.
        recommendation: The generated Recommendation object.
        trend_summary: The TrendSummary used to generate the recommendation.
        evidence_signals: List of dicts with signal fields (source, source_type,
            catalyst_type, sentiment, impact, extraction_confidence, weight,
            document_id, signal_id, ticker).
        evidence_docs: List of dicts with document metadata (title, url,
            document_id).

    Returns:
        The persisted PredictionSnapshot. Price fields are ``None`` when no
        market data was available (Requirement 1.5).
    """
    ticker = recommendation.ticker

    # 1. Fetch prices — missing data is logged and stored as NULL.
    ticker_price = await fetch_latest_close_price(pool, ticker)
    if ticker_price is None:
        logger.warning("No market price available for %s at snapshot time", ticker)

    spy_price = await fetch_latest_close_price(pool, "SPY")
    if spy_price is None:
        logger.warning("No SPY price available at snapshot time")

    sector_etf_ticker = await _fetch_sector_etf_ticker(pool, ticker)
    sector_etf_price: float | None = None
    if sector_etf_ticker is not None:
        sector_etf_price = await fetch_latest_close_price(pool, sector_etf_ticker)
        if sector_etf_price is None:
            logger.warning(
                "No sector ETF price available for %s (%s) at snapshot time",
                sector_etf_ticker,
                ticker,
            )
    else:
        logger.warning("No sector ETF mapping found for ticker %s", ticker)

    # 2. Index document metadata by document id (later entries win).
    doc_lookup: dict[str, dict] = {
        doc.get("document_id", ""): doc for doc in evidence_docs
    }

    # 3. Process evidence signals: canonical keys, duplicates, clamped weights.
    processed_links: list[dict] = []
    seen_canonical_keys: set[str] = set()

    for sig in evidence_signals:
        link_id = str(uuid.uuid4())
        doc_id = sig.get("document_id", "")
        doc_meta = doc_lookup.get(doc_id, {})
        # `or ""` also covers metadata stored as an explicit None.
        title = doc_meta.get("title") or ""
        url = doc_meta.get("url") or ""

        if title.strip() or url.strip():
            canonical_key = compute_canonical_evidence_key(title, url)
        else:
            # Hashing two empty strings would hand every metadata-less signal
            # the same key and flag all but the first as duplicates. Key on
            # the best identity available instead.
            fallback_identity = doc_id or sig.get("signal_id") or link_id
            canonical_key = compute_canonical_evidence_key(
                f"unidentified:{fallback_identity}", ""
            )

        # Duplicate = canonical key already seen within this prediction.
        is_duplicate = canonical_key in seen_canonical_keys
        seen_canonical_keys.add(canonical_key)

        # Clamp weight to MAX_SINGLE_DOCUMENT_WEIGHT (Requirement 3.3).
        raw_weight = sig.get("weight") or 0.0
        clamped_weight = min(raw_weight, MAX_SINGLE_DOCUMENT_WEIGHT)

        processed_links.append({
            "id": link_id,
            "document_id": doc_id,
            "signal_id": sig.get("signal_id", ""),
            "ticker": sig.get("ticker", ticker),
            "source": sig.get("source", ""),
            "source_type": sig.get("source_type", ""),
            "catalyst_type": sig.get("catalyst_type", ""),
            "sentiment": sig.get("sentiment", ""),
            "impact": sig.get("impact", 0.0),
            "extraction_confidence": sig.get("extraction_confidence", 0.0),
            "weight": clamped_weight,
            "is_duplicate": is_duplicate,
            "canonical_evidence_key": canonical_key,
            # Duplicates keep 0.0; unique links are overwritten in step 4.
            "contribution_score": 0.0,
        })

    # 4. Contribution scores — one vote per canonical key (Requirement 3.4):
    #    only non-duplicate links share the weight pool.
    unique_links = [link for link in processed_links if not link["is_duplicate"]]
    unique_scores = compute_contribution_scores(
        [link["weight"] for link in unique_links]
    )
    for link, score in zip(unique_links, unique_scores):
        link["contribution_score"] = score

    # 5. Deduplication quality metrics (Requirements 3.1, 3.2).
    unique_source_count = len({link["source"] for link in unique_links})
    duplicate_evidence_count = len(processed_links) - len(unique_links)

    # 6. Layer scores.
    # NOTE(review): these are computed from the raw signals, i.e. with
    # unclamped weights and duplicates included — confirm that is intended.
    score_company, score_macro, score_competitive = _compute_layer_scores(
        evidence_signals
    )

    # 7. Metadata from the trend summary context (Requirement 1.7).
    metadata: dict = {}
    market_context = trend_summary.market_context
    if market_context is not None:
        metadata["market_context"] = {
            "ticker": market_context.ticker,
            "price_change_pct": market_context.price_change_pct,
            "avg_volume": market_context.avg_volume,
            "volume_change_pct": market_context.volume_change_pct,
            "volatility": market_context.volatility,
            "latest_close": market_context.latest_close,
            "bars_available": market_context.bars_available,
        }
    if sector_etf_ticker is not None:
        metadata["sector_etf_ticker"] = sector_etf_ticker

    # 8. Build the snapshot.
    snapshot_id = str(uuid.uuid4())
    p_bull = trend_summary.p_bull
    snapshot = PredictionSnapshot(
        id=snapshot_id,
        generated_at=recommendation.generated_at,
        ticker=ticker,
        window=trend_summary.window.value,
        horizon=recommendation.time_horizon,
        direction=trend_summary.trend_direction.value,
        action=recommendation.action.value,
        mode=recommendation.mode.value,
        strength=trend_summary.trend_strength,
        confidence=recommendation.confidence,
        contradiction=trend_summary.contradiction_score,
        p_bull=p_bull,
        p_bear=1.0 - p_bull if p_bull is not None else None,
        score_company=score_company,
        score_macro=score_macro,
        score_competitive=score_competitive,
        evidence_count=len(processed_links),
        unique_source_count=unique_source_count,
        duplicate_evidence_count=duplicate_evidence_count,
        price_at_prediction=ticker_price,
        spy_price_at_prediction=spy_price,
        sector_etf_price_at_prediction=sector_etf_price,
        metadata=metadata,
    )

    # 9. Build evidence link objects.
    evidence_link_objects = [
        SignalEvidenceLink(prediction_id=snapshot_id, **link)
        for link in processed_links
    ]

    # 10. Persist snapshot and links atomically (Requirements 1.6, 2.6).
    async with pool.acquire() as conn:
        async with conn.transaction():
            await conn.execute(
                _INSERT_SNAPSHOT_SQL,
                snapshot.id,
                snapshot.generated_at,
                snapshot.ticker,
                snapshot.window,
                snapshot.horizon,
                snapshot.direction,
                snapshot.action,
                snapshot.mode,
                snapshot.strength,
                snapshot.confidence,
                snapshot.contradiction,
                snapshot.p_bull,
                snapshot.p_bear,
                snapshot.score_company,
                snapshot.score_macro,
                snapshot.score_competitive,
                snapshot.evidence_count,
                snapshot.unique_source_count,
                snapshot.duplicate_evidence_count,
                snapshot.price_at_prediction,
                snapshot.spy_price_at_prediction,
                snapshot.sector_etf_price_at_prediction,
                json.dumps(snapshot.metadata),
            )
            for link in evidence_link_objects:
                await conn.execute(
                    _INSERT_EVIDENCE_LINK_SQL,
                    link.id,
                    link.prediction_id,
                    link.document_id,
                    link.signal_id,
                    link.ticker,
                    link.source,
                    link.source_type,
                    link.catalyst_type,
                    link.sentiment,
                    link.impact,
                    link.extraction_confidence,
                    link.weight,
                    link.is_duplicate,
                    link.canonical_evidence_key,
                    link.contribution_score,
                    json.dumps(link.metadata),
                )

    logger.info(
        "Created prediction snapshot %s for %s: %d evidence links "
        "(%d unique sources, %d duplicates), prices: ticker=%s spy=%s sector_etf=%s",
        snapshot_id,
        ticker,
        len(evidence_link_objects),
        unique_source_count,
        duplicate_evidence_count,
        ticker_price,
        spy_price,
        sector_etf_price,
    )
    return snapshot