feat: model validation, calibration, and signal quality layer

- Migration 035: prediction_snapshots, prediction_outcomes, signal_evidence_links, model_metric_snapshots tables + SQL views
- Prediction snapshot writer with canonical evidence keys, duplicate detection, contribution scores
- Outcome evaluator across 5 horizons (1h, 6h, 1d, 7d, 30d)
- Metrics engine: ECE, Brier score, IC, Rank IC, benchmark comparison
- Attribution engine: per-source, per-catalyst, per-layer performance
- Calibration engine: Bayesian shrinkage source reliability
- Quality gate for live trading eligibility with configurable thresholds
- 7 new /api/validation/* endpoints
- Upgraded OpsModel dashboard with validation tab
- Enhanced recommendation display with calibration context
- Backtest replay validation mode
- 86 Python tests (unit + property-based), 179 frontend tests passing
2026-05-01 03:04:58 +00:00
parent 5d2ffd9163
commit 7fcc8a6c07
23 changed files with 7554 additions and 9 deletions
@@ -0,0 +1,135 @@
+"""Calibration Engine — Bayesian shrinkage source reliability and weight adjustment.
+
+Computes source reliability scores using Bayesian shrinkage from historical
+prediction outcomes, and adjusts evidence weights based on source performance.
+Updates the existing source_accuracy table with reliability scores.
+
+Requirements: 8.1, 8.2, 8.3, 8.4, 8.5
+"""
+from __future__ import annotations
+
+import logging
+
+import asyncpg
+
+logger = logging.getLogger(__name__)
+
+
+# ---------------------------------------------------------------------------
+# Pure functions — testable without a database
+# ---------------------------------------------------------------------------
+
+
def compute_source_reliability(
    observed_win_rate: float,
    sample_count: int,
    prior_strength: int = 30,
) -> float:
    """Return a source's reliability, shrunk toward a neutral 0.5 prior.

    The estimate is::

        reliability = 0.5 + (n / (n + prior_strength)) * (observed_win_rate - 0.5)

    With few samples the result stays close to 0.5; as n grows it approaches
    the observed win rate.

    Args:
        observed_win_rate: Fraction of correct predictions, nominally in
            [0.0, 1.0].
        sample_count: Number of evaluated predictions (n).
        prior_strength: Weight of the 0.5 prior, expressed in pseudo-samples.
            Expected to be non-negative.

    Returns:
        The reliability clamped to [0.0, 1.0]. Returns 0.5 (the prior mean)
        when ``sample_count <= 0`` or when the estimate is undefined (NaN).
    """
    import math

    if sample_count <= 0:
        return 0.5

    shrinkage = sample_count / (sample_count + prior_strength)
    reliability = 0.5 + shrinkage * (observed_win_rate - 0.5)

    # NaN compares false against everything, so max(0.0, min(1.0, nan))
    # would evaluate to 1.0 and reward bad data with a perfect score.
    # Fall back to the prior instead.
    if math.isnan(reliability):
        return 0.5

    # Finite (or infinite) out-of-range inputs are clamped into [0.0, 1.0].
    return max(0.0, min(1.0, reliability))
+
+
def compute_adjusted_evidence_weight(
    base_weight: float,
    reliability: float,
) -> float:
    """Scale an evidence weight by its source's reliability.

    The adjusted weight is ``base_weight * (0.5 + reliability)``, so a
    reliability of 0.5 leaves the base weight unchanged, higher reliability
    boosts it and lower reliability dampens it.

    Args:
        base_weight: Weight of the evidence before adjustment.
        reliability: Reliability score of the evidence's source.

    Returns:
        The adjusted weight clamped to [0.1, 2.0]. When the product is
        undefined (NaN) the lower bound 0.1 is returned.
    """
    import math

    lower_bound, upper_bound = 0.1, 2.0
    adjusted = base_weight * (0.5 + reliability)

    # NaN compares false against everything, so the plain clamp
    # max(0.1, min(2.0, nan)) would evaluate to 2.0 and hand the *maximum*
    # weight to evidence whose weight cannot be computed. Use the floor.
    if math.isnan(adjusted):
        return lower_bound

    return max(lower_bound, min(upper_bound, adjusted))
+
+
+# ---------------------------------------------------------------------------
+# SQL queries
+# ---------------------------------------------------------------------------
+
+# Per-source tallies read from the v_source_performance view.
+# Returns one row per source with:
+#   sample_count - number of rows for that source
+#   win_count    - number of those rows where direction_correct is TRUE
+# Rows whose direction_correct is NULL are filtered out before grouping, so
+# they count toward neither column. Takes no bind parameters.
+_SOURCE_PERFORMANCE_SQL = """
+SELECT
+    source,
+    COUNT(*) AS sample_count,
+    COUNT(*) FILTER (WHERE direction_correct = TRUE) AS win_count
+FROM v_source_performance
+WHERE direction_correct IS NOT NULL
+GROUP BY source
+"""
+
+# Upsert one row into source_accuracy, keyed on source_id.
+# Bind parameters: $1 = source_id, $2 = accuracy_ratio, $3 = sample_count.
+# A new source is inserted; on a source_id conflict the existing row's
+# accuracy_ratio and sample_count are overwritten with the new values.
+# last_updated is set to NOW() in both cases.
+_UPSERT_SOURCE_ACCURACY_SQL = """
+INSERT INTO source_accuracy (source_id, accuracy_ratio, sample_count, last_updated)
+VALUES ($1, $2, $3, NOW())
+ON CONFLICT (source_id)
+DO UPDATE SET
+    accuracy_ratio = EXCLUDED.accuracy_ratio,
+    sample_count = EXCLUDED.sample_count,
+    last_updated = NOW()
+"""
+
+
+# ---------------------------------------------------------------------------
+# Database-backed function
+# ---------------------------------------------------------------------------
+
+
async def update_source_reliabilities(
    pool: asyncpg.Pool,
) -> int:
    """Refresh the stored reliability score of every source with outcomes.

    Steps:
      1. Fetch per-source sample and win counts via _SOURCE_PERFORMANCE_SQL.
      2. Turn each source's win rate into a shrunk reliability score with
         compute_source_reliability.
      3. Upsert the score (as accuracy_ratio) and the sample count into
         source_accuracy via _UPSERT_SOURCE_ACCURACY_SQL.

    Failures are logged rather than raised: a failed fetch returns 0, and a
    failed upsert skips only that source.

    Args:
        pool: Connection pool used for the fetch and for each upsert.

    Returns:
        Number of sources whose upsert succeeded.
    """
    try:
        performance_rows = await pool.fetch(_SOURCE_PERFORMANCE_SQL)
    except Exception:
        logger.exception("Failed to query source performance for reliability update")
        return 0

    if not performance_rows:
        logger.info("No source performance data available for reliability update")
        return 0

    succeeded = 0

    for record in performance_rows:
        source_id = record["source"]
        n_samples = record["sample_count"]
        n_wins = record["win_count"]

        # A source with no samples gets the neutral win rate.
        if n_samples > 0:
            win_rate = n_wins / n_samples
        else:
            win_rate = 0.5
        score = compute_source_reliability(win_rate, n_samples)

        try:
            await pool.execute(
                _UPSERT_SOURCE_ACCURACY_SQL, source_id, score, n_samples
            )
        except Exception:
            # Best effort: one bad source must not block the others.
            logger.exception(
                "Failed to upsert source reliability for source=%s", source_id
            )
            continue

        succeeded += 1

    logger.info("Updated source reliabilities for %d sources", succeeded)
    return succeeded