feat: model validation, calibration, and signal quality layer
ci/woodpecker/push/test Pipeline failed
ci/woodpecker/push/build-1 unknown status
ci/woodpecker/push/build-3 unknown status
ci/woodpecker/push/build-2 unknown status
ci/woodpecker/push/finalize unknown status
Build and Push / lint-and-test (push) Has been cancelled
Build and Push / build-services (map[cmd:python -m services.adapters.broker_adapter name:broker-adapter]) (push) Has been cancelled
Build and Push / build-services (map[cmd:python -m services.aggregation.worker name:aggregation]) (push) Has been cancelled
Build and Push / build-services (map[cmd:python -m services.extractor.worker name:extractor]) (push) Has been cancelled
Build and Push / build-services (map[cmd:python -m services.ingestion.worker name:ingestion]) (push) Has been cancelled
Build and Push / build-services (map[cmd:python -m services.lake_publisher.worker name:lake-publisher]) (push) Has been cancelled
Build and Push / build-services (map[cmd:python -m services.parser.worker name:parser]) (push) Has been cancelled
Build and Push / build-services (map[cmd:python -m services.recommendation.worker name:recommendation]) (push) Has been cancelled
Build and Push / build-services (map[cmd:python -m services.scheduler.app name:scheduler]) (push) Has been cancelled
Build and Push / build-services (map[cmd:uvicorn services.api.app:app --host 0.0.0.0 --port 8000 name:query-api]) (push) Has been cancelled
Build and Push / build-services (map[cmd:uvicorn services.risk.app:app --host 0.0.0.0 --port 8000 name:risk]) (push) Has been cancelled
Build and Push / build-services (map[cmd:uvicorn services.symbol_registry.app:app --host 0.0.0.0 --port 8000 name:symbol-registry]) (push) Has been cancelled
Build and Push / build-services (map[cmd:uvicorn services.trading.app:app --host 0.0.0.0 --port 8000 name:trading-engine]) (push) Has been cancelled
Build and Push / build-dashboard (push) Has been cancelled
Build and Push / build-superset (push) Has been cancelled
Build and Push / integration-test (push) Has been cancelled
Build and Push / beta-gate (push) Has been cancelled

- Migration 035: prediction_snapshots, prediction_outcomes, signal_evidence_links, model_metric_snapshots tables + SQL views
- Prediction snapshot writer with canonical evidence keys, duplicate detection, contribution scores
- Outcome evaluator across 5 horizons (1h, 6h, 1d, 7d, 30d)
- Metrics engine: ECE, Brier score, IC, Rank IC, benchmark comparison
- Attribution engine: per-source, per-catalyst, per-layer performance
- Calibration engine: Bayesian shrinkage source reliability
- Quality gate for live trading eligibility with configurable thresholds
- 7 new /api/validation/* endpoints
- Upgraded OpsModel dashboard with validation tab
- Enhanced recommendation display with calibration context
- Backtest replay validation mode
- 86 Python tests (unit + property-based), 179 frontend tests passing
This commit is contained in:
Celes Renata
2026-05-01 03:04:58 +00:00
parent 5d2ffd9163
commit 7fcc8a6c07
23 changed files with 7554 additions and 9 deletions
+338
View File
@@ -43,6 +43,11 @@ from services.shared.db import get_pg_pool, get_redis
from services.shared.logging import new_trace_id, set_trace_context, setup_logging
from services.shared.redis_keys import PIPELINE_ENABLED_KEY, QUEUE_BROKER, QUEUE_PREFIX, queue_key
from services.shared.schemas import MAJOR_DECISION_CATALYSTS
from services.validation.attribution import (
compute_catalyst_attribution,
compute_layer_attribution,
compute_source_attribution,
)
# Logger for this module, registered under the name "query_api".
logger = logging.getLogger("query_api")
@@ -3769,3 +3774,336 @@ async def get_variant_performance_history(
agent_id, variant_id, hours,
)
return [_row_to_dict(r) for r in rows]
# ---------------------------------------------------------------------------
# Model Validation Dashboard (Requirements 12.1, 12.2, 12.3, 12.7)
# ---------------------------------------------------------------------------
# Accepted values for the `lookback` query parameter of the
# /api/validation/* endpoints; anything else is rejected with HTTP 400.
_VALID_LOOKBACKS: set[str] = {"7d", "30d", "90d", "all"}
# Accepted values for the `horizon` query parameter of the
# /api/validation/* endpoints; anything else is rejected with HTTP 400.
_VALID_HORIZONS: set[str] = {"1h", "6h", "1d", "7d", "30d"}
@app.get("/api/validation/summary")
async def get_validation_summary(
    lookback: str = Query(default="30d"),
    horizon: str = Query(default="7d"),
):
    """Return the newest metric snapshot together with the quality gate config.

    Reads the most recent ``model_metric_snapshots`` row matching the
    requested lookback window and horizon, plus the ``config`` stored in
    ``risk_configs`` under the name ``model_quality_gate``.

    Args:
        lookback: Lookback window; must be a member of ``_VALID_LOOKBACKS``.
        horizon: Prediction horizon; must be a member of ``_VALID_HORIZONS``.

    Returns:
        A dict with two keys: ``snapshot`` (the row converted by
        ``_row_to_dict`` with its ``metadata`` passed through
        ``_parse_jsonb``, or ``None`` when no row matches) and
        ``gate_status`` (the gate config passed through ``_parse_jsonb``,
        or ``None`` when no gate row exists).

    Raises:
        HTTPException: 400 when ``lookback`` or ``horizon`` is not accepted.

    Requirement 12.1
    """
    if lookback not in _VALID_LOOKBACKS:
        raise HTTPException(400, f"Invalid lookback: {lookback}. Must be one of {sorted(_VALID_LOOKBACKS)}")
    if horizon not in _VALID_HORIZONS:
        raise HTTPException(400, f"Invalid horizon: {horizon}. Must be one of {sorted(_VALID_HORIZONS)}")

    # Newest snapshot for this lookback/horizon pair.
    latest_snapshot_sql = """SELECT id, generated_at, lookback_window, horizon,
        prediction_count, win_rate, directional_accuracy,
        information_coefficient, rank_information_coefficient,
        avg_return, avg_excess_return_vs_spy, avg_excess_return_vs_sector,
        calibration_error, brier_score,
        buy_win_rate, sell_win_rate, hold_win_rate,
        metadata
        FROM model_metric_snapshots
        WHERE lookback_window = $1 AND horizon = $2
        ORDER BY generated_at DESC
        LIMIT 1"""
    metrics_record = await pool.fetchrow(latest_snapshot_sql, lookback, horizon)
    if metrics_record:
        snapshot = _row_to_dict(metrics_record)
        snapshot["metadata"] = _parse_jsonb(snapshot.get("metadata"))
    else:
        snapshot = None

    # The gate evaluation lives in risk_configs rather than in the snapshot.
    gate_record = await pool.fetchrow(
        "SELECT config, updated_at FROM risk_configs WHERE name = 'model_quality_gate'",
    )
    gate_status = _parse_jsonb(gate_record["config"]) if gate_record else None

    return {"snapshot": snapshot, "gate_status": gate_status}
def _summarize_calibration_buckets(rows: list[Any]) -> list[dict[str, Any]]:
    """Group prediction rows into fixed confidence buckets and score each one.

    Args:
        rows: Records exposing ``confidence`` (convertible with ``float``)
            and ``direction_correct`` by key.

    Returns:
        One dict per bucket, in ascending bucket order, with the keys
        ``bucket_low``, ``bucket_high``, ``avg_confidence``,
        ``observed_win_rate``, ``prediction_count`` and ``miscalibrated``.
        Empty buckets report zeros and ``miscalibrated`` False.
    """
    bounds = (
        (0.50, 0.60),
        (0.60, 0.70),
        (0.70, 0.80),
        (0.80, 0.90),
        (0.90, 1.00),
    )
    top = len(bounds) - 1
    confidences: list[list[float]] = [[] for _ in bounds]
    wins = [0] * len(bounds)

    # Single pass: each row is converted once and lands in at most one bucket.
    # Confidences below 0.50 or above 1.00 match no bucket and are ignored.
    for row in rows:
        conf = float(row["confidence"])
        for pos, (low, high) in enumerate(bounds):
            # Buckets are half-open [low, high); only the last one also
            # includes its upper bound so that 1.00 is counted.
            if low <= conf < high or (pos == top and conf == high):
                confidences[pos].append(conf)
                # NOTE(review): a NULL direction_correct counts as a miss
                # here; confirm the view only exposes resolved outcomes.
                if row["direction_correct"] is True:
                    wins[pos] += 1
                break

    summary: list[dict[str, Any]] = []
    for (low, high), confs, win_count in zip(bounds, confidences, wins):
        count = len(confs)
        if not count:
            summary.append({
                "bucket_low": low,
                "bucket_high": high,
                "avg_confidence": 0.0,
                "observed_win_rate": 0.0,
                "prediction_count": 0,
                "miscalibrated": False,
            })
            continue
        avg_conf = sum(confs) / count
        win_rate = win_count / count
        summary.append({
            "bucket_low": low,
            "bucket_high": high,
            "avg_confidence": round(avg_conf, 4),
            "observed_win_rate": round(win_rate, 4),
            "prediction_count": count,
            # Flag buckets whose stated confidence and observed win rate
            # differ by more than 15 percentage points.
            "miscalibrated": abs(avg_conf - win_rate) > 0.15,
        })
    return summary


@app.get("/api/validation/calibration")
async def get_validation_calibration(
    lookback: str = Query(default="30d"),
    horizon: str = Query(default="7d"),
):
    """Calibration table with confidence buckets.

    Queries ``v_prediction_performance`` for the given horizon (restricted
    to the lookback window unless ``lookback`` is ``"all"``), then reports
    average confidence, observed win rate, count and a miscalibrated flag
    for each confidence bucket.

    Args:
        lookback: Lookback window; must be a member of ``_VALID_LOOKBACKS``.
        horizon: Prediction horizon; must be a member of ``_VALID_HORIZONS``.

    Returns:
        A dict with ``buckets`` (list of per-bucket dicts), ``lookback``
        and ``horizon``.

    Raises:
        HTTPException: 400 when ``lookback`` or ``horizon`` is not accepted.

    Requirement 12.2
    """
    if lookback not in _VALID_LOOKBACKS:
        raise HTTPException(400, f"Invalid lookback: {lookback}. Must be one of {sorted(_VALID_LOOKBACKS)}")
    if horizon not in _VALID_HORIZONS:
        raise HTTPException(400, f"Invalid horizon: {horizon}. Must be one of {sorted(_VALID_HORIZONS)}")

    # $1 is always the horizon; $2 (the day count) exists only when the
    # lookback is bounded. Only this fixed fragment is interpolated into the
    # SQL text, never user input.
    params: list[Any] = [horizon]
    lookback_condition = ""
    if lookback != "all":
        lookback_condition = "AND generated_at >= NOW() - make_interval(days => $2)"
        params.append({"7d": 7, "30d": 30, "90d": 90}[lookback])

    rows = await pool.fetch(
        f"""SELECT confidence, direction_correct
        FROM v_prediction_performance
        WHERE horizon = $1
        {lookback_condition}
        AND confidence IS NOT NULL""",
        *params,
    )

    return {
        "buckets": _summarize_calibration_buckets(rows),
        "lookback": lookback,
        "horizon": horizon,
    }
@app.get("/api/validation/ic-by-horizon")
async def get_validation_ic_by_horizon(
    lookback: str = Query(default="30d"),
):
    """IC and Rank IC per prediction horizon.

    For the given lookback window, picks the most recent
    ``model_metric_snapshots`` row of every horizon that has one and
    returns its IC, Rank IC, prediction count and generation time.

    Args:
        lookback: Lookback window; must be a member of ``_VALID_LOOKBACKS``.

    Returns:
        A dict with ``horizons`` (one entry per horizon found, ordered
        1h, 6h, 1d, 7d, 30d, with any other horizon value last) and
        ``lookback``.

    Raises:
        HTTPException: 400 when ``lookback`` is not accepted.

    Requirement 12.3
    """
    if lookback not in _VALID_LOOKBACKS:
        raise HTTPException(400, f"Invalid lookback: {lookback}. Must be one of {sorted(_VALID_LOOKBACKS)}")

    # DISTINCT ON + ORDER BY keeps only the newest snapshot of each horizon.
    records = await pool.fetch(
        """SELECT DISTINCT ON (horizon)
        horizon,
        information_coefficient,
        rank_information_coefficient,
        prediction_count,
        generated_at
        FROM model_metric_snapshots
        WHERE lookback_window = $1
        ORDER BY horizon, generated_at DESC""",
        lookback,
    )

    def _float_or_none(value):
        # NULL metrics stay None instead of failing the float conversion.
        return None if value is None else float(value)

    entries = [
        {
            "horizon": rec["horizon"],
            "information_coefficient": _float_or_none(rec["information_coefficient"]),
            "rank_information_coefficient": _float_or_none(rec["rank_information_coefficient"]),
            "prediction_count": rec["prediction_count"],
            "generated_at": rec["generated_at"].isoformat() if rec["generated_at"] else None,
        }
        for rec in records
    ]

    # The SQL orders horizons alphabetically; present them shortest first.
    canonical_rank = {"1h": 0, "6h": 1, "1d": 2, "7d": 3, "30d": 4}
    ordered = sorted(entries, key=lambda entry: canonical_rank.get(entry["horizon"], 99))
    return {"horizons": ordered, "lookback": lookback}
@app.get("/api/validation/gate-status")
async def get_validation_gate_status():
    """Quality gate evaluation detail.

    Reads the ``risk_configs`` row whose ``name`` is
    ``'model_quality_gate'``.

    Returns:
        When the row exists: a dict with ``gate_status`` (its ``config``
        passed through ``_parse_jsonb``) and ``updated_at`` (ISO-8601
        string, or ``None`` when the column is empty). Otherwise a dict
        with ``gate_status`` set to ``None`` and an explanatory
        ``message``.

    Requirement 12.7
    """
    record = await pool.fetchrow(
        "SELECT config, updated_at FROM risk_configs WHERE name = 'model_quality_gate'",
    )
    if record:
        gate_data = _parse_jsonb(record["config"])
        stamp = record.get("updated_at")
        return {
            "gate_status": gate_data,
            "updated_at": stamp.isoformat() if stamp else None,
        }

    # No row: the gate has not been evaluated (or stored) yet.
    return {
        "gate_status": None,
        "message": "No gate evaluation found. Model metrics may not have been computed yet.",
    }
# ---------------------------------------------------------------------------
# Attribution Endpoints (Requirements 12.4, 12.5, 12.6)
# ---------------------------------------------------------------------------
# Day count passed as `lookback_days` to the attribution functions for each
# accepted `lookback` value. "all" is not unbounded: it is approximated by
# 3650 days (about ten years).
_LOOKBACK_TO_DAYS: dict[str, int] = {
    "7d": 7,
    "30d": 30,
    "90d": 90,
    "all": 3650,
}
@app.get("/api/validation/attribution/sources")
async def get_validation_attribution_sources(
    lookback: str = Query(default="30d"),
    horizon: str = Query(default="7d"),
):
    """Per-source performance metrics.

    Delegates to ``compute_source_attribution`` for the given lookback
    window and prediction horizon and serializes each result with
    ``asdict``. The fields of each entry are whatever that result type
    defines.

    Args:
        lookback: Lookback window; must be a member of ``_VALID_LOOKBACKS``.
            Converted to a day count through ``_LOOKBACK_TO_DAYS``.
        horizon: Prediction horizon; must be a member of ``_VALID_HORIZONS``.

    Returns:
        A dict with ``sources`` (list of per-source dicts), ``lookback``
        and ``horizon``.

    Raises:
        HTTPException: 400 when ``lookback`` or ``horizon`` is not
            accepted; 500 when the attribution computation fails.

    Requirement 12.4
    """
    if lookback not in _VALID_LOOKBACKS:
        raise HTTPException(400, f"Invalid lookback: {lookback}. Must be one of {sorted(_VALID_LOOKBACKS)}")
    if horizon not in _VALID_HORIZONS:
        raise HTTPException(400, f"Invalid horizon: {horizon}. Must be one of {sorted(_VALID_HORIZONS)}")

    lookback_days = _LOOKBACK_TO_DAYS[lookback]
    try:
        results = await compute_source_attribution(pool, lookback_days=lookback_days, horizon=horizon)
    except Exception as exc:
        # API boundary: keep the traceback in the server log and hand the
        # client a generic 500, chained to the original error.
        logger.exception("Failed to compute source attribution")
        raise HTTPException(500, "Failed to compute source attribution") from exc

    return {
        "sources": [asdict(r) for r in results],
        "lookback": lookback,
        "horizon": horizon,
    }
@app.get("/api/validation/attribution/catalysts")
async def get_validation_attribution_catalysts(
    lookback: str = Query(default="30d"),
    horizon: str = Query(default="7d"),
):
    """Per-catalyst-type performance metrics.

    Delegates to ``compute_catalyst_attribution`` for the given lookback
    window and prediction horizon and serializes each result with
    ``asdict``. The fields of each entry are whatever that result type
    defines.

    Args:
        lookback: Lookback window; must be a member of ``_VALID_LOOKBACKS``.
            Converted to a day count through ``_LOOKBACK_TO_DAYS``.
        horizon: Prediction horizon; must be a member of ``_VALID_HORIZONS``.

    Returns:
        A dict with ``catalysts`` (list of per-catalyst dicts),
        ``lookback`` and ``horizon``.

    Raises:
        HTTPException: 400 when ``lookback`` or ``horizon`` is not
            accepted; 500 when the attribution computation fails.

    Requirement 12.5
    """
    if lookback not in _VALID_LOOKBACKS:
        raise HTTPException(400, f"Invalid lookback: {lookback}. Must be one of {sorted(_VALID_LOOKBACKS)}")
    if horizon not in _VALID_HORIZONS:
        raise HTTPException(400, f"Invalid horizon: {horizon}. Must be one of {sorted(_VALID_HORIZONS)}")

    lookback_days = _LOOKBACK_TO_DAYS[lookback]
    try:
        results = await compute_catalyst_attribution(pool, lookback_days=lookback_days, horizon=horizon)
    except Exception as exc:
        # API boundary: keep the traceback in the server log and hand the
        # client a generic 500, chained to the original error.
        logger.exception("Failed to compute catalyst attribution")
        raise HTTPException(500, "Failed to compute catalyst attribution") from exc

    return {
        "catalysts": [asdict(r) for r in results],
        "lookback": lookback,
        "horizon": horizon,
    }
@app.get("/api/validation/attribution/layers")
async def get_validation_attribution_layers(
    lookback: str = Query(default="30d"),
    horizon: str = Query(default="7d"),
):
    """Per-signal-layer (company, macro, competitive) performance metrics.

    Delegates to ``compute_layer_attribution`` for the given lookback
    window and prediction horizon and serializes each result with
    ``asdict``. The fields of each entry are whatever that result type
    defines.

    Args:
        lookback: Lookback window; must be a member of ``_VALID_LOOKBACKS``.
            Converted to a day count through ``_LOOKBACK_TO_DAYS``.
        horizon: Prediction horizon; must be a member of ``_VALID_HORIZONS``.

    Returns:
        A dict with ``layers`` (list of per-layer dicts), ``lookback``
        and ``horizon``.

    Raises:
        HTTPException: 400 when ``lookback`` or ``horizon`` is not
            accepted; 500 when the attribution computation fails.

    Requirement 12.6
    """
    if lookback not in _VALID_LOOKBACKS:
        raise HTTPException(400, f"Invalid lookback: {lookback}. Must be one of {sorted(_VALID_LOOKBACKS)}")
    if horizon not in _VALID_HORIZONS:
        raise HTTPException(400, f"Invalid horizon: {horizon}. Must be one of {sorted(_VALID_HORIZONS)}")

    lookback_days = _LOOKBACK_TO_DAYS[lookback]
    try:
        results = await compute_layer_attribution(pool, lookback_days=lookback_days, horizon=horizon)
    except Exception as exc:
        # API boundary: keep the traceback in the server log and hand the
        # client a generic 500, chained to the original error.
        logger.exception("Failed to compute layer attribution")
        raise HTTPException(500, "Failed to compute layer attribution") from exc

    return {
        "layers": [asdict(r) for r in results],
        "lookback": lookback,
        "horizon": horizon,
    }