feat: model validation, calibration, and signal quality layer

- Migration 035: prediction_snapshots, prediction_outcomes, signal_evidence_links, model_metric_snapshots tables + SQL views
- Prediction snapshot writer with canonical evidence keys, duplicate detection, contribution scores
- Outcome evaluator across 5 horizons (1h, 6h, 1d, 7d, 30d)
- Metrics engine: ECE, Brier score, IC, Rank IC, benchmark comparison
- Attribution engine: per-source, per-catalyst, per-layer performance
- Calibration engine: Bayesian shrinkage source reliability
- Quality gate for live trading eligibility with configurable thresholds
- 7 new /api/validation/* endpoints
- Upgraded OpsModel dashboard with validation tab
- Enhanced recommendation display with calibration context
- Backtest replay validation mode
- 86 Python tests (unit + property-based), 179 frontend tests passing
2026-05-01 03:04:58 +00:00
parent 5d2ffd9163
commit 7fcc8a6c07
23 changed files with 7554 additions and 9 deletions
@@ -4,6 +4,10 @@ Task 32: Fetches historical recommendations from the database, simulates
 the decision logic chronologically using evaluate_recommendation(), tracks
 simulated positions and equity curve, and persists results to backtest_runs
 and backtest_trades tables.
+
+Supports a validation mode (Requirements 15.1–15.5) that generates prediction
+snapshots and evaluates outcomes using only data available at each historical
+point in time, preventing future data leakage.
 """

 from __future__ import annotations
@@ -39,12 +43,22 @@ class BacktestReplay:
        self.pool = pool
        self._perf = PerformanceComputer()

-    async def run(self, config: BacktestConfig, backtest_id: str | None = None) -> BacktestResult:
+    async def run(
+        self,
+        config: BacktestConfig,
+        backtest_id: str | None = None,
+        validation_mode: bool = False,
+    ) -> BacktestResult:
        """Execute a full backtest replay.

        Args:
            config: Backtest configuration (date range, capital, risk tier).
            backtest_id: Optional pre-generated ID. If not provided, one is generated.
+            validation_mode: When True, creates prediction snapshots for each
+                historical recommendation using only data available at that point
+                in time, evaluates outcomes, and computes model metrics over the
+                backtest period. Snapshots are tagged with the backtest_id.
+                (Requirements 15.1–15.5)

        Returns:
            BacktestResult with metrics, trade log, and equity curve.
@@ -87,6 +101,7 @@ class BacktestReplay:
            daily_returns: list[float] = []
            prev_value = config.initial_capital
            trade_log: list[dict] = []
+            validation_snapshot_ids: list[str] = []  # track snapshot IDs for validation mode

            # Pre-load company sectors and latest prices for enrichment
            company_sectors: dict[str, str] = {}
@@ -172,6 +187,25 @@ class BacktestReplay:
                        now=sim_time,
                    )

+                    # --- Validation mode: create prediction snapshot (Req 15.1, 15.2, 15.4) ---
+                    if validation_mode and self.pool is not None:
+                        try:
+                            snapshot_id = await self._create_validation_snapshot(
+                                rec=rec,
+                                sim_time=sim_time,
+                                backtest_id=backtest_id,
+                                company_sectors=company_sectors,
+                            )
+                            if snapshot_id is not None:
+                                validation_snapshot_ids.append(snapshot_id)
+                        except Exception:
+                            logger.warning(
+                                "Validation snapshot failed for %s at %s, continuing backtest",
+                                rec.get("ticker", "?"),
+                                sim_time,
+                                exc_info=True,
+                            )
+
                    if decision.decision == "act":
                        act_count += 1
                        ticker = decision.ticker
@@ -348,6 +382,10 @@ class BacktestReplay:
            # Persist results
            await self._persist_results(result, closed_trades)

+            # --- Validation mode: evaluate outcomes and compute metrics (Req 15.3, 15.5) ---
+            if validation_mode and self.pool is not None and validation_snapshot_ids:
+                await self._run_validation_evaluation(backtest_id)
+
            return result

        except Exception as exc:
@@ -356,6 +394,210 @@ class BacktestReplay:
            await self._persist_failed_run(backtest_id, config, str(exc))
            raise

+    # ------------------------------------------------------------------
+    # Validation mode helpers (Requirements 15.1–15.5)
+    # ------------------------------------------------------------------
+
+    # Point-in-time price lookup: returns the close of the most recent 'bar'
+    # snapshot for a ticker ($1) captured at or before a timestamp ($2).
+    # The ``captured_at <= $2`` bound is what prevents future data leakage —
+    # rows captured after the requested time are never considered.
+    _CLOSE_AT_TIME_SQL = """
+    SELECT (data->>'c')::float AS close
+    FROM market_snapshots
+    WHERE ticker = $1
+      AND snapshot_type = 'bar'
+      AND data->>'c' IS NOT NULL
+      AND captured_at <= $2
+    ORDER BY captured_at DESC
+    LIMIT 1
+    """
+
+    # Looks up the sector of an active company by ticker ($1).
+    # NOTE(review): not referenced anywhere in the visible part of this
+    # change — confirm it is used elsewhere or remove it.
+    _COMPANY_SECTOR_SQL = """
+    SELECT sector FROM companies WHERE ticker = $1 AND active = TRUE LIMIT 1
+    """
+
+    # Sector name -> sector ETF ticker.
+    # NOTE(review): _create_validation_snapshot imports SECTOR_ETF_MAP from
+    # services.validation.prediction_snapshot instead of reading this table;
+    # confirm the two are meant to coexist and stay in sync.
+    _SECTOR_ETF_MAP: dict[str, str] = {
+        "Technology": "XLK",
+        "Consumer Cyclical": "XLY",
+        "Financial Services": "XLF",
+        "Healthcare": "XLV",
+        "Energy": "XLE",
+        "Communication Services": "XLC",
+        "Industrials": "XLI",
+        "Consumer Defensive": "XLP",
+        "Real Estate": "XLRE",
+        "Utilities": "XLU",
+    }
+
+    async def _fetch_close_at_time(
+        self,
+        ticker: str,
+        target_time: datetime,
+    ) -> float | None:
+        """Return the latest bar close for *ticker* as of *target_time*.
+
+        Only rows with ``captured_at <= target_time`` are eligible, so a
+        replay never sees prices from its own future (Requirement 15.4).
+
+        Returns:
+            The close price, or ``None`` when no pool is configured or no
+            matching bar exists.
+        """
+        pool = self.pool
+        if pool is None:
+            return None
+        latest_bar = await pool.fetchrow(self._CLOSE_AT_TIME_SQL, ticker, target_time)
+        return None if latest_bar is None else latest_bar["close"]
+
+    async def _create_validation_snapshot(
+        self,
+        rec: dict,
+        sim_time: datetime,
+        backtest_id: str,
+        company_sectors: dict[str, str],
+    ) -> str | None:
+        """Create a prediction snapshot using only data available at *sim_time*.
+
+        Fetches ticker, SPY, and sector ETF prices as of *sim_time* to prevent
+        future data leakage (Requirements 15.1, 15.2, 15.4).  The snapshot is
+        tagged with *backtest_id* in its metadata field (Requirement 15.5).
+
+        Recommendation fields that are absent **or** ``None`` fall back to the
+        same defaults; a ``None`` value is never passed to ``float()``/``int()``
+        and never stored as the literal string ``"None"``.
+
+        Args:
+            rec: Historical recommendation row being replayed.
+            sim_time: Simulated "now"; used as ``generated_at`` and as the
+                upper bound for every price lookup.
+            backtest_id: Backtest run identifier written into the metadata.
+            company_sectors: Ticker -> sector name, used to pick a sector ETF.
+
+        Returns:
+            The new snapshot UUID as a string, or ``None`` when no database
+            pool is configured or the recommendation has no ticker.
+
+        Raises:
+            Exception: Errors from the price lookups or the insert are not
+                caught here; they propagate to the caller.
+        """
+        from services.validation.prediction_snapshot import (
+            SECTOR_ETF_MAP,
+        )
+
+        # Without a pool the insert below cannot run; treat it as "no snapshot".
+        if self.pool is None:
+            return None
+
+        ticker = rec.get("ticker", "")
+        if not ticker:
+            return None
+
+        def _pick(keys: tuple[str, ...], default):
+            """Return the first non-None value among *keys* in *rec*, else *default*."""
+            for key in keys:
+                value = rec.get(key)
+                if value is not None:
+                    return value
+            return default
+
+        # Fetch prices using only data available at sim_time (Req 15.4)
+        ticker_price = await self._fetch_close_at_time(ticker, sim_time)
+        spy_price = await self._fetch_close_at_time("SPY", sim_time)
+
+        # Sector ETF price (left as None when the sector is unknown or unmapped)
+        sector = company_sectors.get(ticker)
+        sector_etf_ticker = SECTOR_ETF_MAP.get(sector) if sector else None
+        sector_etf_price: float | None = None
+        if sector_etf_ticker is not None:
+            sector_etf_price = await self._fetch_close_at_time(
+                sector_etf_ticker, sim_time
+            )
+
+        snapshot_id = str(uuid.uuid4())
+
+        # Build metadata tagged with backtest_id (Req 15.5)
+        metadata: dict = {
+            "backtest_id": backtest_id,
+            "source": "backtest_validation",
+        }
+
+        # Map recommendation fields to snapshot columns.  Each tuple lists the
+        # accepted key names in priority order.
+        direction = _pick(("direction", "trend_direction"), "neutral")
+        action = _pick(("action",), "watch")
+        mode = _pick(("mode",), "informational")
+        confidence = float(_pick(("confidence",), 0.5))
+        strength = float(_pick(("strength", "trend_strength"), 0.5))
+        contradiction = float(_pick(("contradiction", "contradiction_score"), 0.0))
+        # p_bull is optional; p_bear is derived as its complement when present.
+        p_bull = rec.get("p_bull")
+        if p_bull is not None:
+            p_bull = float(p_bull)
+        p_bear = (1.0 - p_bull) if p_bull is not None else None
+        window = _pick(("window", "trend_window"), "7d")
+        horizon = _pick(("time_horizon", "horizon"), "7d")
+
+        # Insert the snapshot directly — we bypass create_prediction_snapshot()
+        # because that function fetches *latest* prices (not point-in-time).
+        insert_sql = """
+        INSERT INTO prediction_snapshots (
+            id, generated_at, ticker, window, horizon, direction, action, mode,
+            strength, confidence, contradiction, p_bull, p_bear,
+            score_company, score_macro, score_competitive,
+            evidence_count, unique_source_count, duplicate_evidence_count,
+            price_at_prediction, spy_price_at_prediction,
+            sector_etf_price_at_prediction, metadata
+        ) VALUES (
+            $1::uuid, $2, $3, $4, $5, $6, $7, $8,
+            $9, $10, $11, $12, $13,
+            $14, $15, $16,
+            $17, $18, $19,
+            $20, $21, $22,
+            $23::jsonb
+        )
+        """
+        await self.pool.execute(
+            insert_sql,
+            snapshot_id,
+            sim_time,
+            ticker,
+            str(window),
+            str(horizon),
+            str(direction),
+            str(action),
+            str(mode),
+            strength,
+            confidence,
+            contradiction,
+            p_bull,
+            p_bear,
+            float(_pick(("score_company",), 0.0)),
+            float(_pick(("score_macro",), 0.0)),
+            float(_pick(("score_competitive",), 0.0)),
+            int(_pick(("evidence_count",), 0)),
+            int(_pick(("unique_source_count",), 0)),
+            int(_pick(("duplicate_evidence_count",), 0)),
+            ticker_price,
+            spy_price,
+            sector_etf_price,
+            json.dumps(metadata),
+        )
+
+        logger.debug(
+            "Validation snapshot %s created for %s at %s (backtest %s)",
+            snapshot_id,
+            ticker,
+            sim_time,
+            backtest_id,
+        )
+        return snapshot_id
+
+    async def _run_validation_evaluation(self, backtest_id: str) -> None:
+        """Evaluate prediction outcomes and compute metrics for the backtest.
+
+        Calls the outcome evaluator and metrics engine after the backtest
+        completes (Requirements 15.3, 15.5).  Failures are logged but do
+        not block the backtest result.  The two steps are independent: a
+        failure in one (including a failure to import its module) does not
+        prevent the other from running.
+
+        Args:
+            backtest_id: Backtest run identifier, used here for log messages.
+
+        NOTE(review): both calls below receive only the pool, so from this
+        method they do not appear to be scoped to *backtest_id* — confirm
+        against the evaluator and metrics implementations.
+        """
+        # Step 1: Evaluate matured predictions (Req 15.3)
+        try:
+            # Imported inside the try so an ImportError is logged like any
+            # other failure instead of escaping and failing the backtest.
+            from services.validation.outcome_evaluator import (
+                evaluate_matured_predictions,
+            )
+
+            outcomes_count = await evaluate_matured_predictions(self.pool)
+            logger.info(
+                "Backtest %s validation: %d prediction outcomes evaluated",
+                backtest_id,
+                outcomes_count,
+            )
+        except Exception:
+            logger.warning(
+                "Backtest %s: outcome evaluation failed, continuing",
+                backtest_id,
+                exc_info=True,
+            )
+
+        # Step 2: Compute and store metric snapshots (Req 15.5)
+        try:
+            from services.validation.metrics import (
+                compute_and_store_metric_snapshots,
+            )
+
+            snapshots = await compute_and_store_metric_snapshots(self.pool)
+            logger.info(
+                "Backtest %s validation: %d metric snapshots computed",
+                backtest_id,
+                len(snapshots),
+            )
+        except Exception:
+            logger.warning(
+                "Backtest %s: metric snapshot computation failed, continuing",
+                backtest_id,
+                exc_info=True,
+            )
+
    # ------------------------------------------------------------------
    # Database helpers
    # ------------------------------------------------------------------