feat: model validation, calibration, and signal quality layer
ci/woodpecker/push/test Pipeline failed
ci/woodpecker/push/build-1 unknown status
ci/woodpecker/push/build-3 unknown status
ci/woodpecker/push/build-2 unknown status
ci/woodpecker/push/finalize unknown status
Build and Push / lint-and-test (push) Has been cancelled
Build and Push / build-services (map[cmd:python -m services.adapters.broker_adapter name:broker-adapter]) (push) Has been cancelled
Build and Push / build-services (map[cmd:python -m services.aggregation.worker name:aggregation]) (push) Has been cancelled
Build and Push / build-services (map[cmd:python -m services.extractor.worker name:extractor]) (push) Has been cancelled
Build and Push / build-services (map[cmd:python -m services.ingestion.worker name:ingestion]) (push) Has been cancelled
Build and Push / build-services (map[cmd:python -m services.lake_publisher.worker name:lake-publisher]) (push) Has been cancelled
Build and Push / build-services (map[cmd:python -m services.parser.worker name:parser]) (push) Has been cancelled
Build and Push / build-services (map[cmd:python -m services.recommendation.worker name:recommendation]) (push) Has been cancelled
Build and Push / build-services (map[cmd:python -m services.scheduler.app name:scheduler]) (push) Has been cancelled
Build and Push / build-services (map[cmd:uvicorn services.api.app:app --host 0.0.0.0 --port 8000 name:query-api]) (push) Has been cancelled
Build and Push / build-services (map[cmd:uvicorn services.risk.app:app --host 0.0.0.0 --port 8000 name:risk]) (push) Has been cancelled
Build and Push / build-services (map[cmd:uvicorn services.symbol_registry.app:app --host 0.0.0.0 --port 8000 name:symbol-registry]) (push) Has been cancelled
Build and Push / build-services (map[cmd:uvicorn services.trading.app:app --host 0.0.0.0 --port 8000 name:trading-engine]) (push) Has been cancelled
Build and Push / build-dashboard (push) Has been cancelled
Build and Push / build-superset (push) Has been cancelled
Build and Push / integration-test (push) Has been cancelled
Build and Push / beta-gate (push) Has been cancelled

- Migration 035: prediction_snapshots, prediction_outcomes, signal_evidence_links, model_metric_snapshots tables + SQL views
- Prediction snapshot writer with canonical evidence keys, duplicate detection, contribution scores
- Outcome evaluator across 5 horizons (1h, 6h, 1d, 7d, 30d)
- Metrics engine: ECE, Brier score, IC, Rank IC, benchmark comparison
- Attribution engine: per-source, per-catalyst, per-layer performance
- Calibration engine: Bayesian shrinkage source reliability
- Quality gate for live trading eligibility with configurable thresholds
- 7 new /api/validation/* endpoints
- Upgraded OpsModel dashboard with validation tab
- Enhanced recommendation display with calibration context
- Backtest replay validation mode
- 86 Python tests (unit + property-based), 179 frontend tests passing
This commit is contained in:
Celes Renata
2026-05-01 03:04:58 +00:00
parent 5d2ffd9163
commit 7fcc8a6c07
23 changed files with 7554 additions and 9 deletions
+243 -1
View File
@@ -4,6 +4,10 @@ Task 32: Fetches historical recommendations from the database, simulates
the decision logic chronologically using evaluate_recommendation(), tracks
simulated positions and equity curve, and persists results to backtest_runs
and backtest_trades tables.
Supports a validation mode (Requirements 15.1–15.5) that generates prediction
snapshots and evaluates outcomes using only data available at each historical
point in time, preventing future data leakage.
"""
from __future__ import annotations
@@ -39,12 +43,22 @@ class BacktestReplay:
self.pool = pool
self._perf = PerformanceComputer()
async def run(self, config: BacktestConfig, backtest_id: str | None = None) -> BacktestResult:
async def run(
self,
config: BacktestConfig,
backtest_id: str | None = None,
validation_mode: bool = False,
) -> BacktestResult:
"""Execute a full backtest replay.
Args:
config: Backtest configuration (date range, capital, risk tier).
backtest_id: Optional pre-generated ID. If not provided, one is generated.
validation_mode: When True, creates prediction snapshots for each
historical recommendation using only data available at that point
in time, evaluates outcomes, and computes model metrics over the
backtest period. Snapshots are tagged with the backtest_id.
(Requirements 15.1–15.5)
Returns:
BacktestResult with metrics, trade log, and equity curve.
@@ -87,6 +101,7 @@ class BacktestReplay:
daily_returns: list[float] = []
prev_value = config.initial_capital
trade_log: list[dict] = []
validation_snapshot_ids: list[str] = [] # track snapshot IDs for validation mode
# Pre-load company sectors and latest prices for enrichment
company_sectors: dict[str, str] = {}
@@ -172,6 +187,25 @@ class BacktestReplay:
now=sim_time,
)
# --- Validation mode: create prediction snapshot (Req 15.1, 15.2, 15.4) ---
if validation_mode and self.pool is not None:
try:
snapshot_id = await self._create_validation_snapshot(
rec=rec,
sim_time=sim_time,
backtest_id=backtest_id,
company_sectors=company_sectors,
)
if snapshot_id is not None:
validation_snapshot_ids.append(snapshot_id)
except Exception:
logger.warning(
"Validation snapshot failed for %s at %s, continuing backtest",
rec.get("ticker", "?"),
sim_time,
exc_info=True,
)
if decision.decision == "act":
act_count += 1
ticker = decision.ticker
@@ -348,6 +382,10 @@ class BacktestReplay:
# Persist results
await self._persist_results(result, closed_trades)
# --- Validation mode: evaluate outcomes and compute metrics (Req 15.3, 15.5) ---
if validation_mode and self.pool is not None and validation_snapshot_ids:
await self._run_validation_evaluation(backtest_id)
return result
except Exception as exc:
@@ -356,6 +394,210 @@ class BacktestReplay:
await self._persist_failed_run(backtest_id, config, str(exc))
raise
# ------------------------------------------------------------------
# Validation mode helpers (Requirements 15.1–15.5)
# ------------------------------------------------------------------
# SQL to fetch the close price at or before a specific time — prevents
# future data leakage by only returning data available at that point.
# Parameters: $1 = ticker, $2 = upper bound on captured_at (inclusive).
# Only 'bar' snapshots with a non-null 'c' field are considered; the most
# recent matching row wins (ORDER BY captured_at DESC LIMIT 1).
_CLOSE_AT_TIME_SQL = """
SELECT (data->>'c')::float AS close
FROM market_snapshots
WHERE ticker = $1
AND snapshot_type = 'bar'
AND data->>'c' IS NOT NULL
AND captured_at <= $2
ORDER BY captured_at DESC
LIMIT 1
"""
# SQL to look up the sector of an active company. Parameter: $1 = ticker.
# NOTE(review): not referenced by the validation helpers visible in this
# section (they receive a pre-loaded company_sectors dict) — confirm it is
# used elsewhere or remove it.
_COMPANY_SECTOR_SQL = """
SELECT sector FROM companies WHERE ticker = $1 AND active = TRUE LIMIT 1
"""
# Maps a company sector name to the ticker of its sector ETF.
# NOTE(review): _create_validation_snapshot() imports SECTOR_ETF_MAP from
# services.validation.prediction_snapshot instead of reading this attribute,
# so this copy appears unused here and may drift from that one — confirm
# which mapping is authoritative.
_SECTOR_ETF_MAP: dict[str, str] = {
"Technology": "XLK",
"Consumer Cyclical": "XLY",
"Financial Services": "XLF",
"Healthcare": "XLV",
"Energy": "XLE",
"Communication Services": "XLC",
"Industrials": "XLI",
"Consumer Defensive": "XLP",
"Real Estate": "XLRE",
"Utilities": "XLU",
}
async def _fetch_close_at_time(
self,
ticker: str,
target_time: datetime,
) -> float | None:
"""Fetch the close price for *ticker* at or before *target_time*.
Ensures no future data leakage — only market data with
``captured_at <= target_time`` is considered (Requirement 15.4).
"""
if self.pool is None:
return None
row = await self.pool.fetchrow(self._CLOSE_AT_TIME_SQL, ticker, target_time)
if row is None:
return None
return row["close"]
async def _create_validation_snapshot(
    self,
    rec: dict,
    sim_time: datetime,
    backtest_id: str,
    company_sectors: dict[str, str],
) -> str | None:
    """Create a prediction snapshot using only data available at *sim_time*.

    Fetches ticker, SPY, and sector ETF prices as of *sim_time* to prevent
    future data leakage (Requirements 15.1, 15.2, 15.4). The snapshot is
    tagged with *backtest_id* in its metadata field (Requirement 15.5).

    Args:
        rec: Historical recommendation row. Fields are read with fallbacks
            (e.g. ``direction`` then ``trend_direction``); a field that is
            missing *or* ``None`` falls through to the next alias and then
            to the default.
        sim_time: Simulated "now"; used as ``generated_at`` and as the
            upper bound for every price lookup.
        backtest_id: Identifier of the enclosing backtest run.
        company_sectors: Ticker -> sector name, used to pick the sector ETF.

    Returns:
        The new snapshot UUID as a string, or ``None`` when no pool is
        configured or *rec* has no ticker.

    Raises:
        Exception: Database errors from the insert propagate to the caller,
            as do ``ValueError``/``TypeError`` from non-numeric field values.
    """
    from services.validation.prediction_snapshot import (
        SECTOR_ETF_MAP,
    )

    def pick(default, *keys):
        """Return the first non-None value of *rec* among *keys*, else *default*."""
        for key in keys:
            value = rec.get(key)
            if value is not None:
                return value
        return default

    # Mirror _fetch_close_at_time(): without a pool there is nothing to write.
    if self.pool is None:
        return None

    ticker = rec.get("ticker", "")
    if not ticker:
        return None

    # Fetch prices using only data available at sim_time (Req 15.4)
    ticker_price = await self._fetch_close_at_time(ticker, sim_time)
    spy_price = await self._fetch_close_at_time("SPY", sim_time)

    # Sector ETF price (only when the ticker's sector maps to an ETF)
    sector = company_sectors.get(ticker)
    sector_etf_ticker = SECTOR_ETF_MAP.get(sector) if sector else None
    sector_etf_price: float | None = None
    if sector_etf_ticker is not None:
        sector_etf_price = await self._fetch_close_at_time(
            sector_etf_ticker, sim_time
        )

    snapshot_id = str(uuid.uuid4())

    # Build metadata tagged with backtest_id (Req 15.5)
    metadata: dict = {
        "backtest_id": backtest_id,
        "source": "backtest_validation",
    }

    # Map recommendation fields to snapshot columns. pick() treats a NULL
    # column the same as an absent key, so float()/int() never see None and
    # str() never stores the literal text "None".
    direction = pick("neutral", "direction", "trend_direction")
    action = pick("watch", "action")
    mode = pick("informational", "mode")
    confidence = float(pick(0.5, "confidence"))
    strength = float(pick(0.5, "strength", "trend_strength"))
    contradiction = float(pick(0.0, "contradiction", "contradiction_score"))
    p_bull = rec.get("p_bull")
    if p_bull is not None:
        p_bull = float(p_bull)
    # p_bear is derived as the complement of p_bull.
    p_bear = (1.0 - p_bull) if p_bull is not None else None
    window = pick("7d", "window", "trend_window")
    horizon = pick("7d", "time_horizon", "horizon")

    # Insert the snapshot directly — we bypass create_prediction_snapshot()
    # because that function fetches *latest* prices (not point-in-time).
    # "window" is double-quoted because WINDOW is a reserved keyword in
    # PostgreSQL; unquoted, the statement fails to parse.
    insert_sql = """
        INSERT INTO prediction_snapshots (
            id, generated_at, ticker, "window", horizon, direction, action, mode,
            strength, confidence, contradiction, p_bull, p_bear,
            score_company, score_macro, score_competitive,
            evidence_count, unique_source_count, duplicate_evidence_count,
            price_at_prediction, spy_price_at_prediction,
            sector_etf_price_at_prediction, metadata
        ) VALUES (
            $1::uuid, $2, $3, $4, $5, $6, $7, $8,
            $9, $10, $11, $12, $13,
            $14, $15, $16,
            $17, $18, $19,
            $20, $21, $22,
            $23::jsonb
        )
    """
    await self.pool.execute(
        insert_sql,
        snapshot_id,
        sim_time,
        ticker,
        str(window),
        str(horizon),
        str(direction),
        str(action),
        str(mode),
        strength,
        confidence,
        contradiction,
        p_bull,
        p_bear,
        float(pick(0.0, "score_company")),
        float(pick(0.0, "score_macro")),
        float(pick(0.0, "score_competitive")),
        int(pick(0, "evidence_count")),
        int(pick(0, "unique_source_count")),
        int(pick(0, "duplicate_evidence_count")),
        ticker_price,
        spy_price,
        sector_etf_price,
        json.dumps(metadata),
    )

    logger.debug(
        "Validation snapshot %s created for %s at %s (backtest %s)",
        snapshot_id,
        ticker,
        sim_time,
        backtest_id,
    )
    return snapshot_id
async def _run_validation_evaluation(self, backtest_id: str) -> None:
    """Evaluate prediction outcomes and compute metrics for the backtest.

    Calls the outcome evaluator and then the metrics engine after the
    backtest completes (Requirements 15.3, 15.5). Each step is isolated:
    any failure — including a failure to import the validation modules —
    is logged as a warning and does not block the backtest result or the
    other step.

    Args:
        backtest_id: Identifier of the backtest run; used for log context.

    NOTE(review): both helpers are called with only the pool, so nothing
    visible here restricts them to snapshots tagged with *backtest_id* —
    confirm the intended scope.
    """
    # Step 1: Evaluate matured predictions (Req 15.3)
    try:
        # Imported inside the try so an import error is handled like any
        # other step failure instead of failing the whole backtest run.
        from services.validation.outcome_evaluator import evaluate_matured_predictions

        outcomes_count = await evaluate_matured_predictions(self.pool)
        logger.info(
            "Backtest %s validation: %d prediction outcomes evaluated",
            backtest_id,
            outcomes_count,
        )
    except Exception:
        logger.warning(
            "Backtest %s: outcome evaluation failed, continuing",
            backtest_id,
            exc_info=True,
        )

    # Step 2: Compute and store metric snapshots (Req 15.5)
    try:
        from services.validation.metrics import compute_and_store_metric_snapshots

        snapshots = await compute_and_store_metric_snapshots(self.pool)
        logger.info(
            "Backtest %s validation: %d metric snapshots computed",
            backtest_id,
            len(snapshots),
        )
    except Exception:
        logger.warning(
            "Backtest %s: metric snapshot computation failed, continuing",
            backtest_id,
            exc_info=True,
        )
# ------------------------------------------------------------------
# Database helpers
# ------------------------------------------------------------------