Files
stonks-oracle/services/trading/model_quality_gate.py
T
Celes Renata 7fcc8a6c07
ci/woodpecker/push/test Pipeline failed
ci/woodpecker/push/build-1 unknown status
ci/woodpecker/push/build-3 unknown status
ci/woodpecker/push/build-2 unknown status
ci/woodpecker/push/finalize unknown status
Build and Push / lint-and-test (push) Has been cancelled
Build and Push / build-services (map[cmd:python -m services.adapters.broker_adapter name:broker-adapter]) (push) Has been cancelled
Build and Push / build-services (map[cmd:python -m services.aggregation.worker name:aggregation]) (push) Has been cancelled
Build and Push / build-services (map[cmd:python -m services.extractor.worker name:extractor]) (push) Has been cancelled
Build and Push / build-services (map[cmd:python -m services.ingestion.worker name:ingestion]) (push) Has been cancelled
Build and Push / build-services (map[cmd:python -m services.lake_publisher.worker name:lake-publisher]) (push) Has been cancelled
Build and Push / build-services (map[cmd:python -m services.parser.worker name:parser]) (push) Has been cancelled
Build and Push / build-services (map[cmd:python -m services.recommendation.worker name:recommendation]) (push) Has been cancelled
Build and Push / build-services (map[cmd:python -m services.scheduler.app name:scheduler]) (push) Has been cancelled
Build and Push / build-services (map[cmd:uvicorn services.api.app:app --host 0.0.0.0 --port 8000 name:query-api]) (push) Has been cancelled
Build and Push / build-services (map[cmd:uvicorn services.risk.app:app --host 0.0.0.0 --port 8000 name:risk]) (push) Has been cancelled
Build and Push / build-services (map[cmd:uvicorn services.symbol_registry.app:app --host 0.0.0.0 --port 8000 name:symbol-registry]) (push) Has been cancelled
Build and Push / build-services (map[cmd:uvicorn services.trading.app:app --host 0.0.0.0 --port 8000 name:trading-engine]) (push) Has been cancelled
Build and Push / build-dashboard (push) Has been cancelled
Build and Push / build-superset (push) Has been cancelled
Build and Push / integration-test (push) Has been cancelled
Build and Push / beta-gate (push) Has been cancelled
feat: model validation, calibration, and signal quality layer
- Migration 035: prediction_snapshots, prediction_outcomes, signal_evidence_links, model_metric_snapshots tables + SQL views
- Prediction snapshot writer with canonical evidence keys, duplicate detection, contribution scores
- Outcome evaluator across 5 horizons (1h, 6h, 1d, 7d, 30d)
- Metrics engine: ECE, Brier score, IC, Rank IC, benchmark comparison
- Attribution engine: per-source, per-catalyst, per-layer performance
- Calibration engine: Bayesian shrinkage source reliability
- Quality gate for live trading eligibility with configurable thresholds
- 7 new /api/validation/* endpoints
- Upgraded OpsModel dashboard with validation tab
- Enhanced recommendation display with calibration context
- Backtest replay validation mode
- 86 Python tests (unit + property-based), 179 frontend tests passing
2026-05-01 03:04:58 +00:00

330 lines
10 KiB
Python

"""Quality gate for live trading eligibility.
Evaluates aggregate model metrics against configurable thresholds and
determines whether the system meets minimum quality standards for live
trading. When any threshold is not met, the gate forces all
recommendations to paper mode (fail-safe).
Requirements: 11.1, 11.2, 11.3, 11.4, 11.5, 11.6, 11.7
"""
from __future__ import annotations
import json
import logging
from dataclasses import asdict, dataclass, field
from datetime import datetime, timezone
import asyncpg
logger = logging.getLogger("trading_engine.quality_gate")
# ---------------------------------------------------------------------------
# Data classes
# ---------------------------------------------------------------------------
@dataclass
class QualityGateConfig:
    """Thresholds that decide whether live trading is allowed.

    Every ``min_*`` value is a lower bound and ``max_*`` an upper bound;
    both kinds of bound are inclusive when checked by
    ``_evaluate_thresholds``.
    """

    # Smallest acceptable ``prediction_count`` in the metric snapshot.
    min_prediction_count: int = 100
    # Lower bound for the snapshot's ``information_coefficient``.
    min_ic: float = 0.03
    # Lower bound for the snapshot's ``win_rate``.
    min_win_rate: float = 0.53
    # Upper bound for the snapshot's ``calibration_error``.
    max_ece: float = 0.15
    # Lower bound for the snapshot's ``avg_excess_return_vs_spy``.
    min_excess_return_vs_spy: float = 0.0
    # A snapshot older than this many hours is treated as stale.
    max_snapshot_age_hours: int = 24
@dataclass
class GateThresholdResult:
    """Outcome of comparing one snapshot metric against its threshold."""

    # Identifier of the check, e.g. ``"min_ic"`` or ``"max_ece"``.
    name: str
    # Configured bound the metric was compared against.
    threshold: float
    # Metric value used for the comparison.
    actual: float
    # True when the metric satisfied the bound.
    passed: bool
@dataclass
class QualityGateResult:
    """Complete outcome of one quality-gate evaluation."""

    # Overall verdict; False means recommendations are forced to paper mode.
    passed: bool
    # Timestamp at which the gate was evaluated.
    evaluated_at: datetime
    # Per-threshold outcomes; left empty when the gate fails before any
    # threshold is checked (no snapshot, or a stale snapshot).
    threshold_results: list[GateThresholdResult] = field(default_factory=list)
    # Human-readable explanation of the verdict.
    reason: str = ""
    # Stringified id of the metric snapshot that was evaluated, if any.
    snapshot_id: str | None = None
    # Thresholds that were in force for this evaluation.
    config: QualityGateConfig = field(default_factory=QualityGateConfig)
# ---------------------------------------------------------------------------
# Threshold evaluation helpers
# ---------------------------------------------------------------------------
def _evaluate_thresholds(
    snapshot: dict,
    config: QualityGateConfig,
) -> list[GateThresholdResult]:
    """Evaluate each threshold against snapshot metric values.

    A metric that is absent from ``snapshot`` (or ``None``) always fails
    its check, whatever the configured threshold is. Previously a missing
    value was replaced by a placeholder and then compared, so e.g. a
    missing ``avg_excess_return_vs_spy`` became ``0.0`` and *passed* the
    default ``min_excess_return_vs_spy`` of ``0.0``. The placeholder is
    still reported as ``actual`` so the result stays JSON-serialisable.

    NaN metric values fail as well, because every comparison against NaN
    is false.

    Args:
        snapshot: Mapping of metric name to value for one metric snapshot.
        config: Thresholds to check the snapshot against.

    Returns:
        One ``GateThresholdResult`` per threshold, in a fixed order:
        prediction count, IC, win rate, ECE, excess return vs SPY.
    """
    # (result name, snapshot key, threshold, placeholder reported when the
    #  metric is missing, True when larger values are better)
    checks = (
        ("min_prediction_count", "prediction_count",
         config.min_prediction_count, 0.0, True),
        ("min_ic", "information_coefficient", config.min_ic, 0.0, True),
        ("min_win_rate", "win_rate", config.min_win_rate, 0.0, True),
        # 1.0 is the worst-case calibration error, reported when missing.
        ("max_ece", "calibration_error", config.max_ece, 1.0, False),
        ("min_excess_return_vs_spy", "avg_excess_return_vs_spy",
         config.min_excess_return_vs_spy, 0.0, True),
    )

    results: list[GateThresholdResult] = []
    for name, key, threshold, missing_value, higher_is_better in checks:
        raw = snapshot.get(key)
        present = raw is not None
        if not present:
            logger.warning(
                "Quality gate metric %s missing from snapshot; failing %s",
                key,
                name,
            )
        actual = float(raw) if present else missing_value
        bound = float(threshold)
        within = actual >= bound if higher_is_better else actual <= bound
        results.append(
            GateThresholdResult(
                name=name,
                threshold=bound,
                actual=actual,
                # Fail-safe: a metric we could not read never passes.
                passed=present and within,
            )
        )
    return results
# ---------------------------------------------------------------------------
# Public API
# ---------------------------------------------------------------------------
async def _fail_safe_result(
    pool: asyncpg.Pool,
    config: QualityGateConfig,
    evaluated_at: datetime,
    reason: str,
    snapshot_id: str | None = None,
) -> QualityGateResult:
    """Build, log and persist a failed (paper-only) gate result."""
    result = QualityGateResult(
        passed=False,
        evaluated_at=evaluated_at,
        threshold_results=[],
        reason=reason,
        snapshot_id=snapshot_id,
        config=config,
    )
    logger.warning("Quality gate: %s", result.reason)
    await _store_gate_result(pool, result)
    return result


async def evaluate_quality_gate(
    pool: asyncpg.Pool,
    config: QualityGateConfig | None = None,
) -> QualityGateResult:
    """Evaluate model quality gate from latest metric snapshot.

    Reads the most recent ``model_metric_snapshots`` row for the 30d
    lookback and 7d horizon (the primary evaluation window) and checks it
    against ``config``.

    The gate fails closed (paper-only) when the snapshot query fails, no
    snapshot exists, the snapshot has no ``generated_at`` timestamp, or the
    snapshot is older than ``config.max_snapshot_age_hours``.

    The result is stored in ``risk_configs`` under the
    ``'model_quality_gate'`` key via ``_store_gate_result``.

    Args:
        pool: Connection pool used for the snapshot query and the store.
        config: Thresholds to apply; loaded with
            ``load_gate_config_from_db`` when omitted.

    Returns:
        The gate verdict together with the per-threshold results.
    """
    if config is None:
        config = await load_gate_config_from_db(pool)
    now = datetime.now(tz=timezone.utc)

    # Fetch the most recent metric snapshot for 30d lookback / 7d horizon.
    query = (
        "SELECT id, generated_at, prediction_count, win_rate,\n"
        "directional_accuracy, information_coefficient,\n"
        "rank_information_coefficient, avg_return,\n"
        "avg_excess_return_vs_spy, avg_excess_return_vs_sector,\n"
        "calibration_error, brier_score,\n"
        "buy_win_rate, sell_win_rate, hold_win_rate\n"
        "FROM model_metric_snapshots\n"
        "WHERE lookback_window = '30d' AND horizon = '7d'\n"
        "ORDER BY generated_at DESC\n"
        "LIMIT 1"
    )
    try:
        row = await pool.fetchrow(query)
    except Exception:
        # Top-level boundary: any query failure must fail closed, not crash.
        logger.exception("Failed to query model_metric_snapshots")
        row = None

    # Fail-safe: no snapshot exists (or it could not be read).
    if row is None:
        return await _fail_safe_result(
            pool,
            config,
            now,
            "no model metric snapshot available — defaulting to paper-only",
        )

    snapshot = dict(row)
    snapshot_id = str(snapshot["id"])
    generated_at = snapshot["generated_at"]

    # Fail-safe: without a timestamp the snapshot's freshness is unknown.
    if generated_at is None:
        return await _fail_safe_result(
            pool,
            config,
            now,
            "most recent snapshot has no generated_at timestamp "
            "— defaulting to paper-only",
            snapshot_id,
        )

    # Subtracting a naive datetime from the aware ``now`` raises TypeError.
    # NOTE(review): naive values are assumed to be UTC — confirm against the
    # column type of model_metric_snapshots.generated_at.
    if generated_at.tzinfo is None:
        generated_at = generated_at.replace(tzinfo=timezone.utc)

    # Fail-safe: stale snapshot.
    age_hours = (now - generated_at).total_seconds() / 3600.0
    if age_hours > config.max_snapshot_age_hours:
        return await _fail_safe_result(
            pool,
            config,
            now,
            f"most recent snapshot is {age_hours:.1f}h old "
            f"(max {config.max_snapshot_age_hours}h) — defaulting to paper-only",
            snapshot_id,
        )

    # Evaluate thresholds; a single failed check fails the whole gate.
    threshold_results = _evaluate_thresholds(snapshot, config)
    failed = [r for r in threshold_results if not r.passed]
    passed = not failed
    if passed:
        reason = "all thresholds met"
    else:
        failed_names = ", ".join(
            f"{r.name}(actual={r.actual:.4f}, threshold={r.threshold:.4f})"
            for r in failed
        )
        reason = f"failed: {failed_names}"

    result = QualityGateResult(
        passed=passed,
        evaluated_at=now,
        threshold_results=threshold_results,
        reason=reason,
        snapshot_id=snapshot_id,
        config=config,
    )

    # Log details
    for tr in threshold_results:
        logger.info(
            "Quality gate threshold %s: actual=%.4f threshold=%.4f %s",
            tr.name,
            tr.actual,
            tr.threshold,
            "PASS" if tr.passed else "FAIL",
        )
    # The separator keeps the verdict and the reason from running together.
    logger.info(
        "Quality gate result: %s — %s", "PASS" if passed else "FAIL", reason
    )
    await _store_gate_result(pool, result)
    return result
async def load_gate_config_from_db(
    pool: asyncpg.Pool,
) -> QualityGateConfig:
    """Load gate thresholds from risk_configs, with defaults.

    Looks for a ``risk_configs`` row with ``name = 'model_quality_gate_config'``.
    If found, merges stored thresholds over the defaults; a key that is
    absent or ``null`` keeps its default. Returns the default config when
    the query fails, no row exists, the stored JSON is invalid or is not an
    object, or any stored threshold cannot be converted to a number.

    Args:
        pool: Connection pool used to read ``risk_configs``.

    Returns:
        The effective thresholds; this function never raises for bad stored
        data.
    """
    defaults = QualityGateConfig()
    # NOTE(review): the upsert in _store_gate_result targets rows
    # ``WHERE active = TRUE``; this lookup does not filter on ``active`` —
    # confirm an inactive config row can never be picked up here.
    try:
        row = await pool.fetchrow(
            "SELECT config FROM risk_configs WHERE name = 'model_quality_gate_config'",
        )
    except Exception:
        logger.warning(
            "Failed to load gate config from risk_configs — using defaults",
            exc_info=True,
        )
        return defaults
    if row is None:
        return defaults

    try:
        raw = row["config"]
        cfg = raw if isinstance(raw, dict) else json.loads(raw)
    except (ValueError, TypeError):
        # ValueError covers json.JSONDecodeError and undecodable bytes.
        logger.warning("Invalid gate config JSON in risk_configs — using defaults")
        return defaults
    if not isinstance(cfg, dict):
        # Valid JSON, but a list/string/number has no thresholds to merge.
        logger.warning("Invalid gate config JSON in risk_configs — using defaults")
        return defaults

    def _pick(key, cast):
        """Return the stored value for *key* via *cast*, or its default."""
        value = cfg.get(key)
        return getattr(defaults, key) if value is None else cast(value)

    try:
        return QualityGateConfig(
            min_prediction_count=_pick("min_prediction_count", int),
            min_ic=_pick("min_ic", float),
            min_win_rate=_pick("min_win_rate", float),
            max_ece=_pick("max_ece", float),
            min_excess_return_vs_spy=_pick("min_excess_return_vs_spy", float),
            max_snapshot_age_hours=_pick("max_snapshot_age_hours", int),
        )
    except (TypeError, ValueError, OverflowError):
        # e.g. "abc", a nested object, or int(inf): fall back as documented.
        logger.warning("Invalid gate config JSON in risk_configs — using defaults")
        return defaults
# ---------------------------------------------------------------------------
# Internal helpers
# ---------------------------------------------------------------------------
def _gate_result_to_json(result: QualityGateResult) -> str:
"""Serialize a QualityGateResult to JSON for storage in risk_configs."""
payload = {
"passed": result.passed,
"evaluated_at": result.evaluated_at.isoformat(),
"reason": result.reason,
"snapshot_id": result.snapshot_id,
"config": asdict(result.config),
"threshold_results": [asdict(tr) for tr in result.threshold_results],
}
return json.dumps(payload, default=str)
async def _store_gate_result(pool: asyncpg.Pool, result: QualityGateResult) -> None:
    """Persist the gate result in risk_configs as 'model_quality_gate'.

    Runs an INSERT ... ON CONFLICT DO UPDATE so the row is created or
    overwritten. Storage is best-effort: a database error is logged with
    its traceback and not re-raised.
    """
    upsert_sql = (
        "INSERT INTO risk_configs (name, config, updated_at)\n"
        "VALUES ('model_quality_gate', $1::jsonb, NOW())\n"
        "ON CONFLICT (name) WHERE active = TRUE\n"
        "DO UPDATE SET config = $1::jsonb, updated_at = NOW()"
    )
    # Serialization happens outside the try block, so its errors propagate.
    serialized = _gate_result_to_json(result)
    try:
        await pool.execute(upsert_sql, serialized)
    except Exception:
        logger.exception("Failed to store quality gate result in risk_configs")