feat: model validation, calibration, and signal quality layer

- Migration 035: prediction_snapshots, prediction_outcomes, signal_evidence_links, model_metric_snapshots tables + SQL views
- Prediction snapshot writer with canonical evidence keys, duplicate detection, contribution scores
- Outcome evaluator across 5 horizons (1h, 6h, 1d, 7d, 30d)
- Metrics engine: ECE, Brier score, IC, Rank IC, benchmark comparison
- Attribution engine: per-source, per-catalyst, per-layer performance
- Calibration engine: Bayesian shrinkage source reliability
- Quality gate for live trading eligibility with configurable thresholds
- 7 new /api/validation/* endpoints
- Upgraded OpsModel dashboard with validation tab
- Enhanced recommendation display with calibration context
- Backtest replay validation mode
- 86 Python tests (unit + property-based), 179 frontend tests passing
2026-05-01 03:04:58 +00:00
parent 5d2ffd9163
commit 7fcc8a6c07
23 changed files with 7554 additions and 9 deletions
@@ -43,6 +43,11 @@ from services.shared.db import get_pg_pool, get_redis
 from services.shared.logging import new_trace_id, set_trace_context, setup_logging
 from services.shared.redis_keys import PIPELINE_ENABLED_KEY, QUEUE_BROKER, QUEUE_PREFIX, queue_key
 from services.shared.schemas import MAJOR_DECISION_CATALYSTS
+from services.validation.attribution import (
+    compute_catalyst_attribution,
+    compute_layer_attribution,
+    compute_source_attribution,
+)

 logger = logging.getLogger("query_api")

@@ -3769,3 +3774,336 @@ async def get_variant_performance_history(
        agent_id, variant_id, hours,
    )
    return [_row_to_dict(r) for r in rows]
+
+
+# ---------------------------------------------------------------------------
+# Model Validation Dashboard  (Requirements 12.1, 12.2, 12.3, 12.7)
+# ---------------------------------------------------------------------------
+
+# Accepted values for the ``lookback`` query parameter of the validation endpoints.
+_VALID_LOOKBACKS = {"7d", "30d", "90d", "all"}
+# Accepted values for the ``horizon`` query parameter of the validation endpoints.
+_VALID_HORIZONS = {"1h", "6h", "1d", "7d", "30d"}
+
+
+@app.get("/api/validation/summary")
+async def get_validation_summary(
+    lookback: str = Query(default="30d"),
+    horizon: str = Query(default="7d"),
+):
+    """Return the newest metric snapshot together with the quality gate config.
+
+    Looks up the most recently generated ``model_metric_snapshots`` row for
+    the requested lookback/horizon pair and the ``model_quality_gate`` entry
+    of ``risk_configs``.  Either part of the response is ``None`` when the
+    corresponding row does not exist.
+
+    Raises:
+        HTTPException: 400 when ``lookback`` or ``horizon`` is not one of
+            the accepted values.
+
+    Requirement 12.1
+    """
+    if lookback not in _VALID_LOOKBACKS:
+        raise HTTPException(400, f"Invalid lookback: {lookback}. Must be one of {sorted(_VALID_LOOKBACKS)}")
+    if horizon not in _VALID_HORIZONS:
+        raise HTTPException(400, f"Invalid horizon: {horizon}. Must be one of {sorted(_VALID_HORIZONS)}")
+
+    # Newest snapshot row for this lookback/horizon pair (if any).
+    latest_metrics = await pool.fetchrow(
+        """SELECT id, generated_at, lookback_window, horizon,
+                  prediction_count, win_rate, directional_accuracy,
+                  information_coefficient, rank_information_coefficient,
+                  avg_return, avg_excess_return_vs_spy, avg_excess_return_vs_sector,
+                  calibration_error, brier_score,
+                  buy_win_rate, sell_win_rate, hold_win_rate,
+                  metadata
+           FROM model_metric_snapshots
+           WHERE lookback_window = $1 AND horizon = $2
+           ORDER BY generated_at DESC
+           LIMIT 1""",
+        lookback, horizon,
+    )
+
+    if latest_metrics:
+        snapshot = _row_to_dict(latest_metrics)
+        # The metadata column is run through the JSONB parser before returning.
+        snapshot["metadata"] = _parse_jsonb(snapshot.get("metadata"))
+    else:
+        snapshot = None
+
+    # Stored quality gate entry from risk_configs.
+    gate_config = await pool.fetchrow(
+        "SELECT config, updated_at FROM risk_configs WHERE name = 'model_quality_gate'",
+    )
+    gate_status = _parse_jsonb(gate_config["config"]) if gate_config else None
+
+    return {"snapshot": snapshot, "gate_status": gate_status}
+
+
+@app.get("/api/validation/calibration")
+async def get_validation_calibration(
+    lookback: str = Query(default="30d"),
+    horizon: str = Query(default="7d"),
+):
+    """Calibration table with confidence buckets.
+
+    Reads ``confidence`` and ``direction_correct`` from
+    ``v_prediction_performance`` for the given horizon (restricted to the
+    lookback window unless ``lookback`` is ``"all"``) and groups the rows
+    into five confidence buckets covering 0.50-1.00.  Confidences outside
+    that range are not reported in any bucket.
+
+    Each bucket reports ``avg_confidence``, ``observed_win_rate``,
+    ``prediction_count`` and a ``miscalibrated`` flag that is set when the
+    average confidence and the observed win rate differ by more than 0.15.
+    Empty buckets report zeros and ``miscalibrated = False``.
+
+    Raises:
+        HTTPException: 400 when ``lookback`` or ``horizon`` is not one of
+            the accepted values.
+
+    Requirement 12.2
+    """
+    if lookback not in _VALID_LOOKBACKS:
+        raise HTTPException(400, f"Invalid lookback: {lookback}. Must be one of {sorted(_VALID_LOOKBACKS)}")
+    if horizon not in _VALID_HORIZONS:
+        raise HTTPException(400, f"Invalid horizon: {horizon}. Must be one of {sorted(_VALID_HORIZONS)}")
+
+    # Half-open [low, high) confidence buckets; the last one also includes 1.00.
+    bucket_bounds = (
+        (0.50, 0.60),
+        (0.60, 0.70),
+        (0.70, 0.80),
+        (0.80, 0.90),
+        (0.90, 1.00),
+    )
+    # A bucket is flagged when |avg confidence - observed win rate| exceeds this.
+    miscalibration_threshold = 0.15
+
+    sql = (
+        "SELECT confidence, direction_correct"
+        " FROM v_prediction_performance"
+        " WHERE horizon = $1"
+        " AND confidence IS NOT NULL"
+    )
+    params: list[Any] = [horizon]
+    if lookback != "all":
+        # "all" means no time filter; every other value is a day count taken
+        # from the module-level mapping the attribution endpoints also use.
+        sql += " AND generated_at >= NOW() - make_interval(days => $2)"
+        params.append(_LOOKBACK_TO_DAYS[lookback])
+
+    rows = await pool.fetch(sql, *params)
+
+    # Single pass over the rows; one [count, confidence_sum, wins] per bucket.
+    stats = [[0, 0.0, 0] for _ in bucket_bounds]
+    last_index = len(bucket_bounds) - 1
+    for row in rows:
+        conf = float(row["confidence"])
+        for i, (low, high) in enumerate(bucket_bounds):
+            if low <= conf < high or (i == last_index and conf == high):
+                stats[i][0] += 1
+                stats[i][1] += conf
+                # NOTE(review): a NULL direction_correct is counted as a
+                # non-win here -- confirm the view only exposes resolved
+                # outcomes, otherwise pending predictions depress win rates.
+                if row["direction_correct"] is True:
+                    stats[i][2] += 1
+                break
+
+    buckets = []
+    for (low, high), (count, conf_sum, wins) in zip(bucket_bounds, stats):
+        avg_conf = conf_sum / count if count else 0.0
+        win_rate = wins / count if count else 0.0
+        buckets.append({
+            "bucket_low": low,
+            "bucket_high": high,
+            "avg_confidence": round(avg_conf, 4),
+            "observed_win_rate": round(win_rate, 4),
+            "prediction_count": count,
+            # Empty buckets are never flagged.
+            "miscalibrated": count > 0 and abs(avg_conf - win_rate) > miscalibration_threshold,
+        })
+
+    return {"buckets": buckets, "lookback": lookback, "horizon": horizon}
+
+
+@app.get("/api/validation/ic-by-horizon")
+async def get_validation_ic_by_horizon(
+    lookback: str = Query(default="30d"),
+):
+    """Report IC and Rank IC for every horizon that has a snapshot.
+
+    For the given lookback, selects the most recently generated
+    ``model_metric_snapshots`` row per horizon and returns its IC, Rank IC,
+    prediction count and generation timestamp.  Entries are ordered
+    1h, 6h, 1d, 7d, 30d; any other horizon value sorts last.
+
+    Raises:
+        HTTPException: 400 when ``lookback`` is not one of the accepted
+            values.
+
+    Requirement 12.3
+    """
+    if lookback not in _VALID_LOOKBACKS:
+        raise HTTPException(400, f"Invalid lookback: {lookback}. Must be one of {sorted(_VALID_LOOKBACKS)}")
+
+    def _float_or_none(value):
+        # NULL metrics stay None instead of being coerced.
+        return float(value) if value is not None else None
+
+    latest_per_horizon = await pool.fetch(
+        """SELECT DISTINCT ON (horizon)
+                  horizon,
+                  information_coefficient,
+                  rank_information_coefficient,
+                  prediction_count,
+                  generated_at
+           FROM model_metric_snapshots
+           WHERE lookback_window = $1
+           ORDER BY horizon, generated_at DESC""",
+        lookback,
+    )
+
+    entries = [
+        {
+            "horizon": row["horizon"],
+            "information_coefficient": _float_or_none(row["information_coefficient"]),
+            "rank_information_coefficient": _float_or_none(row["rank_information_coefficient"]),
+            "prediction_count": row["prediction_count"],
+            "generated_at": row["generated_at"].isoformat() if row["generated_at"] else None,
+        }
+        for row in latest_per_horizon
+    ]
+
+    # SQL orders horizons as text; re-rank them into their canonical order.
+    canonical_rank = {"1h": 0, "6h": 1, "1d": 2, "7d": 3, "30d": 4}
+    ordered = sorted(entries, key=lambda entry: canonical_rank.get(entry["horizon"], 99))
+
+    return {"horizons": ordered, "lookback": lookback}
+
+
+@app.get("/api/validation/gate-status")
+async def get_validation_gate_status():
+    """Return the stored quality gate evaluation.
+
+    Reads the ``risk_configs`` row whose ``name`` is ``'model_quality_gate'``
+    and returns its parsed ``config`` together with the row's ``updated_at``
+    timestamp in ISO format.  When no such row exists, ``gate_status`` is
+    ``None`` and an explanatory ``message`` is returned instead.
+
+    Requirement 12.7
+    """
+    stored_gate = await pool.fetchrow(
+        "SELECT config, updated_at FROM risk_configs WHERE name = 'model_quality_gate'",
+    )
+
+    if stored_gate:
+        evaluation = _parse_jsonb(stored_gate["config"])
+        stamp = stored_gate.get("updated_at")
+        return {
+            "gate_status": evaluation,
+            "updated_at": stamp.isoformat() if stamp else None,
+        }
+
+    # No gate row has been written yet.
+    return {
+        "gate_status": None,
+        "message": "No gate evaluation found. Model metrics may not have been computed yet.",
+    }
+
+
+# ---------------------------------------------------------------------------
+# Attribution Endpoints  (Requirements 12.4, 12.5, 12.6)
+# ---------------------------------------------------------------------------
+
+
+# Maps each accepted ``lookback`` value to the ``lookback_days`` argument passed
+# to the attribution computations.  Note that "all" is a fixed 3650-day window
+# here, not an unbounded one.
+_LOOKBACK_TO_DAYS: dict[str, int] = {
+    "7d": 7,
+    "30d": 30,
+    "90d": 90,
+    "all": 3650,
+}
+
+
+@app.get("/api/validation/attribution/sources")
+async def get_validation_attribution_sources(
+    lookback: str = Query(default="30d"),
+    horizon: str = Query(default="7d"),
+):
+    """Per-source performance metrics.
+
+    Delegates to ``compute_source_attribution`` for the given lookback
+    window (converted to days) and prediction horizon, and returns each
+    result converted with ``asdict`` under the ``sources`` key.
+
+    Raises:
+        HTTPException: 400 when ``lookback`` or ``horizon`` is not one of
+            the accepted values; 500 when the attribution computation
+            fails (the failure is logged with its traceback).
+
+    Requirement 12.4
+    """
+    if lookback not in _VALID_LOOKBACKS:
+        raise HTTPException(400, f"Invalid lookback: {lookback}. Must be one of {sorted(_VALID_LOOKBACKS)}")
+    if horizon not in _VALID_HORIZONS:
+        raise HTTPException(400, f"Invalid horizon: {horizon}. Must be one of {sorted(_VALID_HORIZONS)}")
+
+    lookback_days = _LOOKBACK_TO_DAYS[lookback]
+
+    try:
+        results = await compute_source_attribution(pool, lookback_days=lookback_days, horizon=horizon)
+    except Exception as exc:
+        logger.exception("Failed to compute source attribution")
+        # Chain explicitly so the original failure stays attached as __cause__.
+        raise HTTPException(500, "Failed to compute source attribution") from exc
+
+    return {
+        "sources": [asdict(r) for r in results],
+        "lookback": lookback,
+        "horizon": horizon,
+    }
+
+
+@app.get("/api/validation/attribution/catalysts")
+async def get_validation_attribution_catalysts(
+    lookback: str = Query(default="30d"),
+    horizon: str = Query(default="7d"),
+):
+    """Per-catalyst-type performance metrics.
+
+    Delegates to ``compute_catalyst_attribution`` for the given lookback
+    window (converted to days) and prediction horizon, and returns each
+    result converted with ``asdict`` under the ``catalysts`` key.
+
+    Raises:
+        HTTPException: 400 when ``lookback`` or ``horizon`` is not one of
+            the accepted values; 500 when the attribution computation
+            fails (the failure is logged with its traceback).
+
+    Requirement 12.5
+    """
+    if lookback not in _VALID_LOOKBACKS:
+        raise HTTPException(400, f"Invalid lookback: {lookback}. Must be one of {sorted(_VALID_LOOKBACKS)}")
+    if horizon not in _VALID_HORIZONS:
+        raise HTTPException(400, f"Invalid horizon: {horizon}. Must be one of {sorted(_VALID_HORIZONS)}")
+
+    lookback_days = _LOOKBACK_TO_DAYS[lookback]
+
+    try:
+        results = await compute_catalyst_attribution(pool, lookback_days=lookback_days, horizon=horizon)
+    except Exception as exc:
+        logger.exception("Failed to compute catalyst attribution")
+        # Chain explicitly so the original failure stays attached as __cause__.
+        raise HTTPException(500, "Failed to compute catalyst attribution") from exc
+
+    return {
+        "catalysts": [asdict(r) for r in results],
+        "lookback": lookback,
+        "horizon": horizon,
+    }
+
+
+@app.get("/api/validation/attribution/layers")
+async def get_validation_attribution_layers(
+    lookback: str = Query(default="30d"),
+    horizon: str = Query(default="7d"),
+):
+    """Per-signal-layer (company, macro, competitive) performance metrics.
+
+    Delegates to ``compute_layer_attribution`` for the given lookback
+    window (converted to days) and prediction horizon, and returns each
+    result converted with ``asdict`` under the ``layers`` key.
+
+    Raises:
+        HTTPException: 400 when ``lookback`` or ``horizon`` is not one of
+            the accepted values; 500 when the attribution computation
+            fails (the failure is logged with its traceback).
+
+    Requirement 12.6
+    """
+    if lookback not in _VALID_LOOKBACKS:
+        raise HTTPException(400, f"Invalid lookback: {lookback}. Must be one of {sorted(_VALID_LOOKBACKS)}")
+    if horizon not in _VALID_HORIZONS:
+        raise HTTPException(400, f"Invalid horizon: {horizon}. Must be one of {sorted(_VALID_HORIZONS)}")
+
+    lookback_days = _LOOKBACK_TO_DAYS[lookback]
+
+    try:
+        results = await compute_layer_attribution(pool, lookback_days=lookback_days, horizon=horizon)
+    except Exception as exc:
+        logger.exception("Failed to compute layer attribution")
+        # Chain explicitly so the original failure stays attached as __cause__.
+        raise HTTPException(500, "Failed to compute layer attribution") from exc
+
+    return {
+        "layers": [asdict(r) for r in results],
+        "lookback": lookback,
+        "horizon": horizon,
+    }