feat: model validation, calibration, and signal quality layer
ci/woodpecker/push/test Pipeline failed
ci/woodpecker/push/build-1 unknown status
ci/woodpecker/push/build-3 unknown status
ci/woodpecker/push/build-2 unknown status
ci/woodpecker/push/finalize unknown status
Build and Push / lint-and-test (push) Has been cancelled
Build and Push / build-services (map[cmd:python -m services.adapters.broker_adapter name:broker-adapter]) (push) Has been cancelled
Build and Push / build-services (map[cmd:python -m services.aggregation.worker name:aggregation]) (push) Has been cancelled
Build and Push / build-services (map[cmd:python -m services.extractor.worker name:extractor]) (push) Has been cancelled
Build and Push / build-services (map[cmd:python -m services.ingestion.worker name:ingestion]) (push) Has been cancelled
Build and Push / build-services (map[cmd:python -m services.lake_publisher.worker name:lake-publisher]) (push) Has been cancelled
Build and Push / build-services (map[cmd:python -m services.parser.worker name:parser]) (push) Has been cancelled
Build and Push / build-services (map[cmd:python -m services.recommendation.worker name:recommendation]) (push) Has been cancelled
Build and Push / build-services (map[cmd:python -m services.scheduler.app name:scheduler]) (push) Has been cancelled
Build and Push / build-services (map[cmd:uvicorn services.api.app:app --host 0.0.0.0 --port 8000 name:query-api]) (push) Has been cancelled
Build and Push / build-services (map[cmd:uvicorn services.risk.app:app --host 0.0.0.0 --port 8000 name:risk]) (push) Has been cancelled
Build and Push / build-services (map[cmd:uvicorn services.symbol_registry.app:app --host 0.0.0.0 --port 8000 name:symbol-registry]) (push) Has been cancelled
Build and Push / build-services (map[cmd:uvicorn services.trading.app:app --host 0.0.0.0 --port 8000 name:trading-engine]) (push) Has been cancelled
Build and Push / build-dashboard (push) Has been cancelled
Build and Push / build-superset (push) Has been cancelled
Build and Push / integration-test (push) Has been cancelled
Build and Push / beta-gate (push) Has been cancelled

- Migration 035: prediction_snapshots, prediction_outcomes, signal_evidence_links, model_metric_snapshots tables + SQL views
- Prediction snapshot writer with canonical evidence keys, duplicate detection, contribution scores
- Outcome evaluator across 5 horizons (1h, 6h, 1d, 7d, 30d)
- Metrics engine: ECE, Brier score, IC, Rank IC, benchmark comparison
- Attribution engine: per-source, per-catalyst, per-layer performance
- Calibration engine: Bayesian shrinkage source reliability
- Quality gate for live trading eligibility with configurable thresholds
- 7 new /api/validation/* endpoints
- Upgraded OpsModel dashboard with validation tab
- Enhanced recommendation display with calibration context
- Backtest replay validation mode
- 86 Python tests (unit + property-based), 179 frontend tests passing
This commit is contained in:
Celes Renata
2026-05-01 03:04:58 +00:00
parent 5d2ffd9163
commit 7fcc8a6c07
23 changed files with 7554 additions and 9 deletions
+690
View File
@@ -0,0 +1,690 @@
"""Unit tests for model validation, calibration, and signal quality modules.
Covers prediction snapshot writer, outcome evaluator, metrics engine,
calibration engine, and quality gate — all pure-function / deterministic tests.
Requirements: 1.1, 2.3, 2.4, 2.5, 3.3, 4.2, 4.5, 4.6, 4.7,
5.3, 5.4, 6.1, 6.2, 6.5, 8.1, 8.2, 8.3, 11.1, 11.6
"""
from __future__ import annotations
import hashlib
import pytest
# -- Prediction Snapshot Writer --
from services.validation.prediction_snapshot import (
MAX_SINGLE_DOCUMENT_WEIGHT,
compute_canonical_evidence_key,
compute_contribution_scores,
)
# -- Outcome Evaluator --
from services.validation.outcome_evaluator import (
_compute_return,
_is_direction_correct,
_is_profitable,
)
# -- Metrics Engine --
from services.validation.metrics import (
compute_brier_score,
compute_calibration_error,
compute_information_coefficient,
compute_rank_information_coefficient,
)
# -- Calibration Engine --
from services.validation.calibration import (
compute_adjusted_evidence_weight,
compute_source_reliability,
)
# -- Quality Gate --
from services.trading.model_quality_gate import (
QualityGateConfig,
_evaluate_thresholds,
)
# ===================================================================
# 8.2 — Prediction Snapshot Writer unit tests
# Requirements: 1.1, 2.3, 2.4, 2.5, 3.3
# ===================================================================
class TestCanonicalEvidenceKey:
    """Pinned-digest and normalization checks for compute_canonical_evidence_key."""

    def test_known_title_url_produces_expected_sha256(self):
        """A fixed title/URL pair maps to a pinned SHA256 hex digest."""
        pinned = "abd5818d51579a7af51cd06861289c7f1fdc97c0f522e8ba13ce9b4aad01cb6f"
        digest = compute_canonical_evidence_key(
            "Test Article",
            "https://example.com/article?ref=123",
        )
        assert digest == pinned

    def test_empty_inputs(self):
        """Empty title and empty URL hash to the SHA256 of zero bytes."""
        empty_digest = hashlib.sha256(b"").hexdigest()
        assert compute_canonical_evidence_key("", "") == empty_digest

    def test_unicode_inputs(self):
        """Non-ASCII title and URL map to a pinned digest."""
        pinned = "553553928bb4e36abdf283ff3c52df0695fca09809159650a9bdcb4fb2c5f62b"
        digest = compute_canonical_evidence_key(
            "日本語テスト",
            "https://example.com/日本語",
        )
        assert digest == pinned

    def test_normalization_case_insensitive(self):
        """Upper- and lower-case spellings of the same pair yield one key."""
        from_upper = compute_canonical_evidence_key(
            "TEST ARTICLE", "HTTPS://EXAMPLE.COM/PATH"
        )
        from_lower = compute_canonical_evidence_key(
            "test article", "https://example.com/path"
        )
        assert from_lower == from_upper

    def test_normalization_strips_query_params(self):
        """A URL with query parameters keys the same as the bare URL."""
        bare = compute_canonical_evidence_key(
            "title", "https://example.com/article"
        )
        tracked = compute_canonical_evidence_key(
            "title", "https://example.com/article?utm_source=twitter&ref=123"
        )
        assert tracked == bare

    def test_normalization_strips_whitespace(self):
        """Surrounding whitespace in the title does not change the key."""
        url = "https://example.com"
        padded = compute_canonical_evidence_key(" test ", url)
        trimmed = compute_canonical_evidence_key("test", url)
        assert trimmed == padded
class TestDuplicateDetection:
    """Duplicate flagging driven by canonical evidence keys."""

    def test_three_docs_two_sharing_key_one_duplicate(self):
        """Of three docs, the second repeats the first's key and is the only duplicate."""
        # This re-creates a first-seen-wins scan locally; per the original
        # comment it mirrors the logic in create_prediction_snapshot.
        docs = [
            {"title": "Breaking News", "url": "https://news.com/article"},
            {"title": "breaking news", "url": "https://news.com/article?ref=1"},
            {"title": "Other Story", "url": "https://other.com/story"},
        ]
        already_seen: set[str] = set()
        flags: list[bool] = []
        for entry in docs:
            digest = compute_canonical_evidence_key(entry["title"], entry["url"])
            # A document is a duplicate when an earlier one produced the same key.
            flags.append(digest in already_seen)
            already_seen.add(digest)

        assert flags == [False, True, False]
        assert sum(flags) == 1
class TestContributionScores:
    """Normalization behaviour of compute_contribution_scores."""

    def test_known_weights(self):
        """Weights that already total 1.0 come back unchanged."""
        weights = [0.5, 0.3, 0.2]
        result = compute_contribution_scores(weights)
        assert result == pytest.approx([0.5, 0.3, 0.2])
        assert sum(result) == pytest.approx(1.0)

    def test_single_doc(self):
        """A lone document receives the whole contribution (1.0)."""
        assert compute_contribution_scores([0.7]) == pytest.approx([1.0])

    def test_empty_input(self):
        """No weights in, no scores out."""
        assert compute_contribution_scores([]) == []

    def test_all_zero_weights(self):
        """Three zero weights are split evenly, one third each."""
        result = compute_contribution_scores([0.0, 0.0, 0.0])
        assert len(result) == 3
        one_third = 1.0 / 3.0
        for value in result:
            assert value == pytest.approx(one_third)

    def test_scores_sum_to_one(self):
        """Weights 1..4 normalize to tenths and total 1.0."""
        result = compute_contribution_scores([1.0, 2.0, 3.0, 4.0])
        assert sum(result) == pytest.approx(1.0)
        assert result == pytest.approx([0.1, 0.2, 0.3, 0.4])
class TestWeightClamping:
    """Clamping of raw weights against MAX_SINGLE_DOCUMENT_WEIGHT.

    NOTE(review): these cases apply the builtin ``min`` to the constant
    directly rather than calling a project clamping function, so they
    effectively pin the constant's value — confirm that is the intent.
    """

    def test_weight_above_max_clamped(self):
        """A raw weight of 1.5 is capped and the result equals 1.0."""
        assert min(1.5, MAX_SINGLE_DOCUMENT_WEIGHT) == 1.0

    def test_weight_at_max_unchanged(self):
        """A raw weight of 1.0 comes through as 1.0."""
        assert min(1.0, MAX_SINGLE_DOCUMENT_WEIGHT) == 1.0

    def test_weight_below_max_unchanged(self):
        """A raw weight of 0.5 is left as 0.5."""
        assert min(0.5, MAX_SINGLE_DOCUMENT_WEIGHT) == 0.5
# ===================================================================
# 8.3 — Outcome Evaluator unit tests
# Requirements: 4.2, 4.5, 4.6, 4.7
# ===================================================================
class TestComputeReturn:
    """Fractional price-change results from _compute_return."""

    def test_positive_return(self):
        """Moving from 100 to 110 is a +10% return."""
        gain = _compute_return(100.0, 110.0)
        assert gain == pytest.approx(0.10)

    def test_negative_return(self):
        """Moving from 100 to 90 is a -10% return."""
        loss = _compute_return(100.0, 90.0)
        assert loss == pytest.approx(-0.10)

    def test_zero_return(self):
        """An unchanged price gives a 0.0 return."""
        flat = _compute_return(100.0, 100.0)
        assert flat == pytest.approx(0.0)

    def test_zero_current_price(self):
        """A starting price of 0 yields exactly 0.0 instead of dividing by zero."""
        guarded = _compute_return(0.0, 110.0)
        assert guarded == 0.0
class TestDirectionCorrect:
    """Truth table for _is_direction_correct(direction, realized_return)."""

    def test_bullish_positive_return(self):
        """A bullish call with a gain counts as correct."""
        verdict = _is_direction_correct("bullish", 0.05)
        assert verdict is True

    def test_bullish_negative_return(self):
        """A bullish call with a loss counts as wrong."""
        verdict = _is_direction_correct("bullish", -0.05)
        assert verdict is False

    def test_bearish_negative_return(self):
        """A bearish call with a loss counts as correct."""
        verdict = _is_direction_correct("bearish", -0.05)
        assert verdict is True

    def test_bearish_positive_return(self):
        """A bearish call with a gain counts as wrong."""
        verdict = _is_direction_correct("bearish", 0.05)
        assert verdict is False

    def test_bullish_zero_return(self):
        """A flat return does not satisfy a bullish call."""
        verdict = _is_direction_correct("bullish", 0.0)
        assert verdict is False

    def test_bearish_zero_return(self):
        """A flat return does not satisfy a bearish call."""
        verdict = _is_direction_correct("bearish", 0.0)
        assert verdict is False

    def test_mixed_direction(self):
        """A "mixed" call is never correct, whatever the sign of the return."""
        for realized in (0.05, -0.05):
            assert _is_direction_correct("mixed", realized) is False

    def test_case_insensitive(self):
        """Capitalised and upper-case direction labels are accepted."""
        for label, realized in (("Bullish", 0.05), ("BEARISH", -0.05)):
            assert _is_direction_correct(label, realized) is True
class TestIsProfitable:
    """Truth table for _is_profitable(action, realized_return)."""

    def test_buy_positive_return(self):
        """Buying ahead of a gain is profitable."""
        verdict = _is_profitable("buy", 0.05)
        assert verdict is True

    def test_buy_negative_return(self):
        """Buying ahead of a loss is not profitable."""
        verdict = _is_profitable("buy", -0.05)
        assert verdict is False

    def test_sell_negative_return(self):
        """Selling ahead of a loss is profitable."""
        verdict = _is_profitable("sell", -0.05)
        assert verdict is True

    def test_sell_positive_return(self):
        """Selling ahead of a gain is not profitable."""
        verdict = _is_profitable("sell", 0.05)
        assert verdict is False

    def test_hold_any_return(self):
        """A "hold" action is never profitable, for gains or losses."""
        for realized in (0.05, -0.05):
            assert _is_profitable("hold", realized) is False

    def test_case_insensitive(self):
        """Capitalised and upper-case action labels are accepted."""
        for label, realized in (("Buy", 0.05), ("SELL", -0.05)):
            assert _is_profitable(label, realized) is True
class TestExcessReturn:
    """Excess return, computed here as ticker return minus benchmark return."""

    def test_excess_return_vs_spy(self):
        """A 10% ticker move against a 5% SPY move leaves +5% excess."""
        benchmark = _compute_return(100.0, 105.0)  # 0.05
        ticker = _compute_return(100.0, 110.0)  # 0.10
        assert ticker - benchmark == pytest.approx(0.05)

    def test_negative_excess_return(self):
        """A 3% ticker move against a 5% SPY move leaves -2% excess."""
        benchmark = _compute_return(100.0, 105.0)  # 0.05
        ticker = _compute_return(100.0, 103.0)  # 0.03
        assert ticker - benchmark == pytest.approx(-0.02)

    def test_zero_excess_return(self):
        """Identical ticker and SPY moves leave no excess."""
        benchmark = _compute_return(100.0, 110.0)
        ticker = _compute_return(100.0, 110.0)
        assert ticker - benchmark == pytest.approx(0.0)
# ===================================================================
# 8.4 — Metrics Engine unit tests
# Requirements: 5.3, 5.4, 6.1, 6.2, 6.5
# ===================================================================
class TestCalibrationError:
    """Tests for compute_calibration_error (ECE).

    The function is called with parallel lists of confidences and boolean
    outcomes and returns an ``(ece, buckets)`` pair.
    """

    def test_perfect_calibration_ece_zero(self):
        """Perfect calibration → ECE = 0.0.

        All predictions sit at 0.75 confidence and exactly 75% of them win,
        so observed win rate and average confidence coincide.
        """
        confidences = [0.75] * 100
        outcomes = [True] * 75 + [False] * 25
        # The bucket list is not inspected in this case.
        ece, _ = compute_calibration_error(confidences, outcomes)
        assert ece == pytest.approx(0.0, abs=1e-9)

    def test_all_overconfident_positive_ece(self):
        """All overconfident (high confidence, low win rate) → positive ECE."""
        # Every prediction is at 0.95 confidence but only half of them win.
        confidences = [0.95] * 100
        outcomes = [True] * 50 + [False] * 50
        ece, _ = compute_calibration_error(confidences, outcomes)
        assert ece > 0.0
        # Expected gap: |0.95 - 0.50| = 0.45
        assert ece == pytest.approx(0.45, abs=0.01)

    def test_empty_input_returns_zero(self):
        """Empty input → ECE = 0.0 and an empty bucket list."""
        ece, buckets = compute_calibration_error([], [])
        assert ece == 0.0
        assert buckets == []

    def test_miscalibrated_flag(self):
        """A bucket whose confidence is far from its win rate is flagged.

        The original docstring puts the flag threshold at a gap above 0.15;
        this case only exercises a gap of 0.95 (0.95 confidence, 0% wins).
        """
        confidences = [0.95] * 20
        outcomes = [False] * 20
        _ece, buckets = compute_calibration_error(confidences, outcomes)
        # Locate the bucket starting at 0.90. The edge is compared with a
        # tolerance because a bucket boundary built by float arithmetic
        # (for example 0.6 + 3 * 0.1) need not equal the literal 0.90.
        high_bucket = [b for b in buckets if b.bucket_low == pytest.approx(0.90)]
        assert len(high_bucket) == 1
        assert high_bucket[0].miscalibrated is True

    def test_ece_in_valid_range(self):
        """ECE stays within [0.0, 1.0] for a spread of confidences."""
        confidences = [0.55, 0.65, 0.75, 0.85, 0.95]
        outcomes = [False, True, False, True, False]
        ece, _ = compute_calibration_error(confidences, outcomes)
        assert 0.0 <= ece <= 1.0
class TestBrierScore:
    """Boundary and sanity checks for compute_brier_score."""

    def test_all_correct_at_p1(self):
        """Ten bullish-certain predictions that all come true score 0.0."""
        score = compute_brier_score([1.0] * 10, [True] * 10)
        assert score == pytest.approx(0.0)

    def test_all_wrong_at_p1(self):
        """Ten bullish-certain predictions that all fail score 1.0."""
        score = compute_brier_score([1.0] * 10, [False] * 10)
        assert score == pytest.approx(1.0)

    def test_all_correct_at_p0(self):
        """Ten p=0.0 predictions with all-False outcomes score 0.0."""
        score = compute_brier_score([0.0] * 10, [False] * 10)
        assert score == pytest.approx(0.0)

    def test_empty_input(self):
        """With no predictions the score is exactly 0.0."""
        score = compute_brier_score([], [])
        assert score == 0.0

    def test_mixed_predictions(self):
        """A mix of partly right probabilities lands strictly between 0 and 1."""
        probabilities = [0.8, 0.6, 0.3]
        realized = [True, False, True]
        score = compute_brier_score(probabilities, realized)
        assert 0.0 < score < 1.0
class TestInformationCoefficient:
    """Checks on compute_information_coefficient (Pearson IC per the original docs)."""

    def test_perfect_positive_correlation(self):
        """Returns that are an increasing linear function of score give IC = 1.0."""
        signal = list(range(30))
        realized = [value * 2.0 + 1.0 for value in signal]  # y = 2x + 1
        coefficient = compute_information_coefficient(signal, realized)
        assert coefficient is not None
        assert coefficient == pytest.approx(1.0, abs=1e-9)

    def test_perfect_negative_correlation(self):
        """Returns that are a decreasing linear function of score give IC = -1.0."""
        signal = list(range(30))
        realized = [-value * 2.0 for value in signal]
        coefficient = compute_information_coefficient(signal, realized)
        assert coefficient is not None
        assert coefficient == pytest.approx(-1.0, abs=1e-9)

    def test_fewer_than_30_returns_none(self):
        """With only 29 observations the function returns None."""
        signal = list(range(29))
        realized = list(range(29))
        assert compute_information_coefficient(signal, realized) is None

    def test_ic_in_valid_range(self):
        """For 50 periodic, non-constant samples the IC lies in [-1.0, 1.0]."""
        signal = [float(index % 7) for index in range(50)]
        realized = [float(index % 5) for index in range(50)]
        coefficient = compute_information_coefficient(signal, realized)
        assert coefficient is not None
        assert -1.0 <= coefficient <= 1.0
class TestRankInformationCoefficient:
    """Checks on compute_rank_information_coefficient (Spearman per the original docs)."""

    def test_perfect_rank_correlation(self):
        """Identical orderings of scores and returns give Rank IC = 1.0."""
        signal = list(range(30))
        realized = list(range(30))  # same ordering as the scores
        coefficient = compute_rank_information_coefficient(signal, realized)
        assert coefficient is not None
        assert coefficient == pytest.approx(1.0, abs=1e-9)

    def test_perfect_anti_rank_correlation(self):
        """Exactly reversed orderings give Rank IC = -1.0."""
        signal = list(range(30))
        realized = list(range(29, -1, -1))  # 29 down to 0
        coefficient = compute_rank_information_coefficient(signal, realized)
        assert coefficient is not None
        assert coefficient == pytest.approx(-1.0, abs=1e-9)

    def test_fewer_than_30_returns_none(self):
        """With only 29 observations the function returns None."""
        signal = list(range(29))
        realized = list(range(29))
        assert compute_rank_information_coefficient(signal, realized) is None
# ===================================================================
# 8.5 — Calibration Engine unit tests
# Requirements: 8.1, 8.2, 8.3
# ===================================================================
class TestSourceReliability:
    """Checks on compute_source_reliability(win_rate, sample_count).

    The original docs describe it as Bayesian shrinkage toward a 0.5 prior.
    """

    def test_zero_samples_returns_prior(self):
        """With no samples the observed win rate is ignored and 0.5 is returned."""
        prior_only = compute_source_reliability(0.8, 0)
        assert prior_only == 0.5

    def test_large_sample_approaches_observed(self):
        """With n=1000 and win rate 0.8 the result is pinned just below 0.8."""
        shrunk = compute_source_reliability(0.8, 1000)
        assert shrunk == pytest.approx(0.7912621359223302)
        # Near the observed 0.8, yet still pulled slightly toward the prior.
        assert abs(shrunk - 0.8) < 0.02

    def test_moderate_sample(self):
        """With n=30 and win rate 0.7 the result is 0.6.

        Worked example from the original test:
        0.5 + (30/60) * (0.7 - 0.5) = 0.5 + 0.5 * 0.2 = 0.6
        """
        shrunk = compute_source_reliability(0.7, 30)
        assert shrunk == pytest.approx(0.6)

    def test_reliability_in_range(self):
        """Extreme win rates and tiny samples still land in [0.0, 1.0]."""
        for win_rate, samples in ((0.0, 100), (1.0, 100), (0.5, 1)):
            assert 0.0 <= compute_source_reliability(win_rate, samples) <= 1.0

    def test_negative_sample_count_returns_prior(self):
        """A negative sample count yields the 0.5 prior, like zero samples."""
        prior_only = compute_source_reliability(0.8, -5)
        assert prior_only == 0.5
class TestAdjustedEvidenceWeight:
    """Checks on compute_adjusted_evidence_weight(base_weight, reliability)."""

    def test_reliability_half_gives_base_weight(self):
        """Reliability 0.5 is neutral: 1.0 * (0.5 + 0.5) = 1.0."""
        adjusted = compute_adjusted_evidence_weight(1.0, 0.5)
        assert adjusted == pytest.approx(1.0)

    def test_high_reliability_increases_weight(self):
        """Reliability 1.0 scales a base weight of 1.0 up to 1.5."""
        adjusted = compute_adjusted_evidence_weight(1.0, 1.0)
        assert adjusted == pytest.approx(1.5)

    def test_low_reliability_decreases_weight(self):
        """Reliability 0.0 scales a base weight of 1.0 down to 0.5."""
        adjusted = compute_adjusted_evidence_weight(1.0, 0.0)
        assert adjusted == pytest.approx(0.5)

    def test_clamped_to_upper_bound(self):
        """A base weight of 3.0 at reliability 1.0 is capped at exactly 2.0."""
        assert compute_adjusted_evidence_weight(3.0, 1.0) == 2.0

    def test_clamped_to_lower_bound(self):
        """A base weight of 0.1 at reliability 0.0 is floored at exactly 0.1."""
        assert compute_adjusted_evidence_weight(0.1, 0.0) == 0.1

    def test_mid_range_not_clamped(self):
        """An ordinary input passes through without hitting either bound."""
        adjusted = compute_adjusted_evidence_weight(0.8, 0.6)
        # 0.8 * (0.5 + 0.6) = 0.8 * 1.1 = 0.88
        assert adjusted == pytest.approx(0.88)
        assert 0.1 <= adjusted <= 2.0
# ===================================================================
# 8.6 — Quality Gate unit tests
# Requirements: 11.1, 11.6
# ===================================================================
class TestQualityGate:
    """Checks on _evaluate_thresholds driven by QualityGateConfig."""

    def _make_passing_snapshot(self) -> dict:
        """Build a metric snapshot intended to clear every default threshold."""
        snapshot = {}
        snapshot["prediction_count"] = 200
        snapshot["information_coefficient"] = 0.10
        snapshot["win_rate"] = 0.60
        snapshot["calibration_error"] = 0.08
        snapshot["avg_excess_return_vs_spy"] = 0.02
        return snapshot

    def test_all_thresholds_met_pass(self):
        """A healthy snapshot produces five results, all passed."""
        results = _evaluate_thresholds(
            self._make_passing_snapshot(), QualityGateConfig()
        )
        assert len(results) == 5
        assert all(r.passed for r in results), (
            f"Expected all thresholds to pass, but got: "
            f"{[(r.name, r.passed) for r in results]}"
        )

    def test_one_threshold_failed_ic_below_min(self):
        """Dropping only the IC below its minimum fails only the min_ic check."""
        metrics = self._make_passing_snapshot()
        metrics["information_coefficient"] = 0.01  # below min_ic=0.03
        results = _evaluate_thresholds(metrics, QualityGateConfig())
        by_name = {r.name: r for r in results}

        ic_check = by_name["min_ic"]
        assert ic_check.passed is False
        assert ic_check.actual == pytest.approx(0.01)
        assert ic_check.threshold == pytest.approx(0.03)

        # Every remaining threshold is untouched and must still pass.
        for name, result in by_name.items():
            if name == "min_ic":
                continue
            assert result.passed is True, f"{name} should pass but didn't"

    def test_all_thresholds_below_all_fail(self):
        """A snapshot that is bad on every metric fails all five checks."""
        bad_metrics = {
            "prediction_count": 10,  # below 100
            "information_coefficient": 0.0,  # below 0.03
            "win_rate": 0.40,  # below 0.53
            "calibration_error": 0.50,  # above 0.15
            "avg_excess_return_vs_spy": -0.05,  # below 0.0
        }
        results = _evaluate_thresholds(bad_metrics, QualityGateConfig())
        assert len(results) == 5
        assert all(not r.passed for r in results), (
            f"Expected all thresholds to fail, but got: "
            f"{[(r.name, r.passed) for r in results]}"
        )

    def test_failsafe_none_values_treated_as_worst_case(self):
        """None metric values are replaced by defaults before comparison.

        As asserted below, None becomes 0 for the minimum-style checks and
        1.0 for max_ece. Four of the five checks therefore fail. The
        excess-return check passes, because its substituted 0.0 equals the
        default minimum of 0.0.

        NOTE(review): the excess-return default is therefore not a failing
        value on its own — confirm this is acceptable for the fail-safe.
        """
        metric_names = (
            "prediction_count",
            "information_coefficient",
            "win_rate",
            "calibration_error",
            "avg_excess_return_vs_spy",
        )
        # Every metric is present but unset.
        missing = dict.fromkeys(metric_names, None)
        results = _evaluate_thresholds(missing, QualityGateConfig())
        by_name = {r.name: r for r in results}

        # prediction_count: None → 0, below 100 → fail
        count_check = by_name["min_prediction_count"]
        assert count_check.passed is False
        assert count_check.actual == 0.0

        # IC: None → 0.0, below 0.03 → fail
        ic_check = by_name["min_ic"]
        assert ic_check.passed is False
        assert ic_check.actual == 0.0

        # win_rate: None → 0.0, below 0.53 → fail
        win_check = by_name["min_win_rate"]
        assert win_check.passed is False
        assert win_check.actual == 0.0

        # calibration_error: None → 1.0, above 0.15 → fail
        ece_check = by_name["max_ece"]
        assert ece_check.passed is False
        assert ece_check.actual == 1.0

        # excess_return: None → 0.0, equal to the 0.0 minimum → pass (>=)
        excess_check = by_name["min_excess_return_vs_spy"]
        assert excess_check.passed is True
        assert excess_check.actual == 0.0

    def test_stale_snapshot_age_exceeds_max(self):
        """A 30-hour age exceeds a configured 24-hour maximum snapshot age.

        NOTE(review): this only compares a literal against the config field;
        it does not call the gate itself. The original docstring says the
        async evaluate_quality_gate performs the real age check — that
        function is not exercised here.
        """
        gate_config = QualityGateConfig(max_snapshot_age_hours=24)
        snapshot_age_hours = 30.0  # older than the 24h limit
        assert snapshot_age_hours > gate_config.max_snapshot_age_hours

    def test_threshold_boundary_exact_values(self):
        """Metrics sitting exactly on each default threshold all pass.

        Minimum checks are inclusive (>=) and the maximum check is
        inclusive (<=), per the original docstring.
        """
        on_the_line = {
            "prediction_count": 100,  # exactly min_prediction_count
            "information_coefficient": 0.03,  # exactly min_ic
            "win_rate": 0.53,  # exactly min_win_rate
            "calibration_error": 0.15,  # exactly max_ece
            "avg_excess_return_vs_spy": 0.0,  # exactly min_excess_return
        }
        results = _evaluate_thresholds(on_the_line, QualityGateConfig())
        assert all(r.passed for r in results), (
            f"Boundary values should pass, but got: "
            f"{[(r.name, r.passed, r.actual, r.threshold) for r in results]}"
        )

    def test_custom_config_thresholds(self):
        """Thresholds supplied to QualityGateConfig are the ones applied."""
        relaxed = QualityGateConfig(
            min_prediction_count=50,
            min_ic=0.01,
            min_win_rate=0.51,
            max_ece=0.20,
            min_excess_return_vs_spy=-0.01,
        )
        # Each metric clears its relaxed threshold.
        metrics = {
            "prediction_count": 60,
            "information_coefficient": 0.02,
            "win_rate": 0.52,
            "calibration_error": 0.18,
            "avg_excess_return_vs_spy": -0.005,
        }
        results = _evaluate_thresholds(metrics, relaxed)
        assert all(r.passed for r in results), (
            f"Custom thresholds should pass, but got: "
            f"{[(r.name, r.passed) for r in results]}"
        )
+662
View File
@@ -0,0 +1,662 @@
"""Property-based tests for model validation, calibration, and signal quality.
Feature: model-validation-calibration
Tests correctness properties from the design specification covering
canonical evidence key determinism/idempotence, contribution score
invariants, calibration error bounds, Brier score bounds, information
coefficient bounds, source reliability shrinkage, and quality gate
determinism.
"""
from __future__ import annotations
import urllib.parse
from hypothesis import given, settings
from hypothesis import strategies as st
from services.validation.prediction_snapshot import (
compute_canonical_evidence_key,
compute_contribution_scores,
)
# ---------------------------------------------------------------------------
# Strategies
# ---------------------------------------------------------------------------
# Titles: arbitrary text (including whitespace, unicode)
title_strategy = st.text(max_size=200, min_size=0)


def _assemble_url(scheme, host, path, query):
    """Join generated parts into a URL, leaving params and fragment empty."""
    return urllib.parse.urlunparse((scheme, host, path, "", query, ""))


# URLs: http/https scheme, a simple dotted host, up to four path segments,
# and an optional query string of one to four key=value pairs.
url_strategy = st.builds(
    _assemble_url,
    scheme=st.sampled_from(["http", "https"]),
    host=st.from_regex(r"[a-z0-9]{1,20}\.[a-z]{2,6}", fullmatch=True),
    path=st.from_regex(r"(/[a-z0-9\-]{0,15}){0,4}", fullmatch=True),
    query=st.from_regex(
        r"([a-z]{1,8}=[a-z0-9]{1,8}(&[a-z]{1,8}=[a-z0-9]{1,8}){0,3})?",
        fullmatch=True,
    ),
)
# ---------------------------------------------------------------------------
# Property 4: Canonical Evidence Key Determinism and Normalization Idempotence
# Validates: Requirements 2.3, 17.4
# ---------------------------------------------------------------------------
@given(title=title_strategy, url=url_strategy)
@settings(max_examples=100)
def test_canonical_evidence_key_determinism(title: str, url: str) -> None:
    """**Validates: Requirements 2.3, 17.4**

    Hashing the same (title, url) pair twice SHALL give the same key, and
    that key is a 64-character lowercase hex string.
    """
    first = compute_canonical_evidence_key(title, url)
    second = compute_canonical_evidence_key(title, url)
    assert first == second, (
        f"Determinism violated: same inputs produced different keys: "
        f"{first!r} != {second!r}"
    )

    # Shape of a SHA256 hex digest: 64 characters drawn from 0-9a-f.
    assert len(first) == 64, f"Expected 64-char hex digest, got {len(first)}"
    hex_alphabet = set("0123456789abcdef")
    assert set(first) <= hex_alphabet, (
        f"Key contains non-hex characters: {first!r}"
    )
@given(title=title_strategy, url=url_strategy)
@settings(max_examples=100)
def test_canonical_evidence_key_normalization_idempotence(title: str, url: str) -> None:
    """**Validates: Requirements 2.3, 17.4**

    Feeding pre-normalized inputs to the key function SHALL give the same
    key as feeding the raw inputs (idempotence).

    Normalization applied by this test before the second call:
    - Title: strip leading/trailing whitespace, then lowercase
    - URL: lowercase, then keep only scheme, netloc and path
    """
    # Normalize by hand first; the test presumes this mirrors what the
    # function does internally.
    url_parts = urllib.parse.urlparse(url.lower())
    clean_url = urllib.parse.urlunparse(
        (url_parts.scheme, url_parts.netloc, url_parts.path, "", "", "")
    )
    clean_title = title.strip().lower()

    key_from_normalized = compute_canonical_evidence_key(clean_title, clean_url)
    key_original = compute_canonical_evidence_key(title, url)

    assert key_original == key_from_normalized, (
        f"Idempotence violated: key from original inputs ({key_original!r}) "
        f"differs from key from pre-normalized inputs ({key_from_normalized!r}). "
        f"title={title!r}, url={url!r}"
    )
# ---------------------------------------------------------------------------
# Strategies for contribution score tests
# ---------------------------------------------------------------------------
# Lists of 1 to 50 finite weights, each between 0.01 and 1000.0.
positive_weights_strategy = st.lists(
    st.floats(
        min_value=0.01,
        max_value=1000.0,
        allow_infinity=False,
        allow_nan=False,
    ),
    max_size=50,
    min_size=1,
)
# ---------------------------------------------------------------------------
# Property 7: Contribution Score Sum-to-One and Range
# Validates: Requirements 2.5, 17.7
# ---------------------------------------------------------------------------
@given(weights=positive_weights_strategy)
@settings(max_examples=100)
def test_contribution_scores_sum_to_one_and_range(weights: list[float]) -> None:
    """**Validates: Requirements 2.5, 17.7**

    Given a non-empty list of positive weights, there SHALL be one score
    per weight, each score within [0.0, 1.0], and the scores SHALL total
    1.0 to within 1e-9.
    """
    scores = compute_contribution_scores(weights)

    # One score per input weight.
    assert len(scores) == len(weights), (
        f"Expected {len(weights)} scores, got {len(scores)}"
    )

    # Range check on each individual score.
    for position, value in enumerate(scores):
        assert 0.0 <= value <= 1.0, (
            f"Score at index {position} is {value}, expected in [0.0, 1.0]. "
            f"weights={weights}"
        )

    # The scores form a distribution over the documents.
    total = sum(scores)
    assert abs(total - 1.0) < 1e-9, (
        f"Scores sum to {total}, expected 1.0 within 1e-9 tolerance. "
        f"weights={weights}"
    )
def test_contribution_scores_empty_input() -> None:
    """**Validates: Requirements 2.5, 17.7**

    An empty weight list SHALL produce an empty score list.
    """
    no_weights: list[float] = []
    scores = compute_contribution_scores(no_weights)
    assert scores == [], f"Expected empty list for empty input, got {scores}"
# ---------------------------------------------------------------------------
# Strategies for calibration error tests
# ---------------------------------------------------------------------------
# Confidence values: finite floats in [0.50, 1.00].
confidence_strategy = st.floats(
    min_value=0.50,
    max_value=1.00,
    allow_infinity=False,
    allow_nan=False,
)

# Outcomes: whether the prediction turned out to be right.
outcome_strategy = st.booleans()

# Between 1 and 100 (confidence, outcome) pairs per generated example.
prediction_pairs_strategy = st.lists(
    st.tuples(confidence_strategy, outcome_strategy),
    max_size=100,
    min_size=1,
)
# Import metric functions
from services.validation.metrics import (
compute_brier_score,
compute_calibration_error,
compute_information_coefficient,
)
# ---------------------------------------------------------------------------
# Property 1: Calibration Error Range and Round-Trip
# Validates: Requirements 5.1, 5.3, 17.1
# ---------------------------------------------------------------------------
@given(pairs=prediction_pairs_strategy)
@settings(max_examples=100)
def test_calibration_error_range(pairs: list[tuple[float, bool]]) -> None:
    """**Validates: Requirements 5.1, 5.3, 17.1**

    For predictions with confidences in [0.50, 1.00] and boolean outcomes,
    the Expected Calibration Error (ECE) SHALL lie in [0.0, 1.0], and so
    SHALL the average confidence and observed win rate of each populated
    bucket.
    """
    # Split the generated pairs into the two parallel lists the function takes.
    confidences: list[float] = []
    outcomes: list[bool] = []
    for confidence, outcome in pairs:
        confidences.append(confidence)
        outcomes.append(outcome)

    ece, buckets = compute_calibration_error(confidences, outcomes)
    assert 0.0 <= ece <= 1.0, (
        f"ECE {ece} is outside [0.0, 1.0]. "
        f"confidences={confidences}, outcomes={outcomes}"
    )

    # Buckets that received no predictions are not checked.
    for bucket in buckets:
        if bucket.prediction_count <= 0:
            continue
        assert 0.0 <= bucket.avg_confidence <= 1.0, (
            f"Bucket [{bucket.bucket_low}, {bucket.bucket_high}) has "
            f"avg_confidence={bucket.avg_confidence} outside [0.0, 1.0]"
        )
        assert 0.0 <= bucket.observed_win_rate <= 1.0, (
            f"Bucket [{bucket.bucket_low}, {bucket.bucket_high}) has "
            f"observed_win_rate={bucket.observed_win_rate} outside [0.0, 1.0]"
        )
def test_calibration_error_zero_when_perfectly_calibrated() -> None:
    """**Validates: Requirements 5.1, 5.3, 17.1**

    When every bucket's observed win rate matches its average confidence,
    ECE SHALL be 0.0.

    The scenario places 100 predictions at each of five confidence values
    (0.55, 0.65, 0.75, 0.85, 0.95) and marks ``round(100 * confidence)`` of
    them as True, so the fraction of True outcomes equals the confidence
    shared by those predictions.

    Comparisons use an absolute tolerance of 1e-9 instead of ``==``. The
    mean of 100 copies of a value such as 0.55 is not guaranteed to be
    bit-identical to 55 / 100 in binary floating point, so exact equality
    would make the test depend on the summation method rather than on
    calibration correctness.
    """
    import math

    tolerance = 1e-9

    # For each midpoint, win_rate is constructed to equal the confidence:
    # 100 predictions at that confidence, round(100 * midpoint) of them True.
    bucket_midpoints = [0.55, 0.65, 0.75, 0.85, 0.95]
    n_per_bucket = 100
    confidences: list[float] = []
    outcomes: list[bool] = []
    for midpoint in bucket_midpoints:
        n_true = round(n_per_bucket * midpoint)
        n_false = n_per_bucket - n_true
        confidences.extend([midpoint] * n_per_bucket)
        outcomes.extend([True] * n_true + [False] * n_false)

    ece, buckets = compute_calibration_error(confidences, outcomes)

    assert math.isclose(ece, 0.0, abs_tol=tolerance), (
        f"ECE should be 0.0 for perfectly calibrated predictions, got {ece}. "
        f"Buckets: {[(b.avg_confidence, b.observed_win_rate, b.prediction_count) for b in buckets]}"
    )

    # Each populated bucket must agree on avg_confidence and win_rate
    # (within tolerance) and must not be flagged as miscalibrated.
    for bucket in buckets:
        if bucket.prediction_count > 0:
            assert math.isclose(
                bucket.avg_confidence,
                bucket.observed_win_rate,
                abs_tol=tolerance,
            ), (
                f"Bucket [{bucket.bucket_low}, {bucket.bucket_high}) has "
                f"avg_confidence={bucket.avg_confidence} != "
                f"observed_win_rate={bucket.observed_win_rate}"
            )
            assert not bucket.miscalibrated, (
                f"Bucket [{bucket.bucket_low}, {bucket.bucket_high}) should not "
                f"be flagged as miscalibrated when perfectly calibrated"
            )
# ---------------------------------------------------------------------------
# Strategies for Brier score tests
# ---------------------------------------------------------------------------
# Finite p_bull values drawn from the closed interval [0.0, 1.0].
p_bull_strategy = st.floats(
    min_value=0.0,
    max_value=1.0,
    allow_nan=False,
    allow_infinity=False,
)
# Boolean outcome paired with each p_bull value.
brier_outcome_strategy = st.booleans()
# Lists of 1 to 100 (p_bull, outcome) tuples.
brier_pairs_strategy = st.lists(
    st.tuples(p_bull_strategy, brier_outcome_strategy),
    min_size=1,
    max_size=100,
)
# ---------------------------------------------------------------------------
# Property 2: Brier Score Range and Perfect Prediction
# Validates: Requirements 5.4, 17.2
# ---------------------------------------------------------------------------
@given(pairs=brier_pairs_strategy)
@settings(max_examples=100)
def test_brier_score_range(pairs: list[tuple[float, bool]]) -> None:
    """**Validates: Requirements 5.4, 17.2**

    Property: for (p_bull, outcome) pairs with p_bull in [0.0, 1.0] and a
    boolean outcome, the Brier score SHALL be in [0.0, 1.0].
    """
    # Split the generated pairs into the two parallel lists the metric takes.
    p_bulls = [pair[0] for pair in pairs]
    outcomes = [pair[1] for pair in pairs]

    brier = compute_brier_score(p_bulls, outcomes)

    brier_in_range = 0.0 <= brier <= 1.0
    assert brier_in_range, (
        f"Brier score {brier} is outside [0.0, 1.0]. "
        f"p_bulls={p_bulls}, outcomes={outcomes}"
    )
@given(n=st.integers(min_value=1, max_value=100))
@settings(max_examples=100)
def test_brier_score_perfect_prediction(n: int) -> None:
    """**Validates: Requirements 5.4, 17.2**

    When every prediction is perfectly correct (p_bull = 1.0 with
    outcome = True, or p_bull = 0.0 with outcome = False), the Brier score
    SHALL be 0.0. Both cases are checked for ``n`` predictions.
    """
    # Case 1: n fully confident bullish calls that all came true.
    brier_bull = compute_brier_score([1.0] * n, [True] * n)
    assert brier_bull == 0.0, (
        f"Brier score should be 0.0 for perfect bullish predictions, "
        f"got {brier_bull} with n={n}"
    )

    # Case 2: n fully confident bearish calls that all came true.
    brier_bear = compute_brier_score([0.0] * n, [False] * n)
    assert brier_bear == 0.0, (
        f"Brier score should be 0.0 for perfect bearish predictions, "
        f"got {brier_bear} with n={n}"
    )
# ---------------------------------------------------------------------------
# Strategies for Information Coefficient tests
# ---------------------------------------------------------------------------
# Finite floats in [-100.0, 100.0]; used for both scores and returns.
ic_score_strategy = st.floats(
    min_value=-100.0,
    max_value=100.0,
    allow_nan=False,
    allow_infinity=False,
)
# Lists of 30 to 100 (score, return) tuples.
ic_pairs_strategy = st.lists(
    st.tuples(ic_score_strategy, ic_score_strategy),
    min_size=30,
    max_size=100,
)
# ---------------------------------------------------------------------------
# Property 3: Information Coefficient Range and Perfect Correlation
# Validates: Requirements 6.1, 6.2, 17.3
# ---------------------------------------------------------------------------
@given(pairs=ic_pairs_strategy)
@settings(max_examples=100)
def test_information_coefficient_range(pairs: list[tuple[float, float]]) -> None:
    """**Validates: Requirements 6.1, 6.2, 17.3**

    Property: for at least 30 (score, return) pairs of finite floats, the
    Information Coefficient (Pearson correlation) SHALL be in [-1.0, 1.0],
    or None (when variance is zero).
    """
    scores = [pair[0] for pair in pairs]
    returns = [pair[1] for pair in pairs]

    ic = compute_information_coefficient(scores, returns)

    # None is an accepted result (zero variance in either list); nothing to bound.
    if ic is None:
        return

    assert -1.0 <= ic <= 1.0, (
        f"IC {ic} is outside [-1.0, 1.0]. "
        f"scores={scores[:5]}..., returns={returns[:5]}..."
    )
@given(
    scores=st.lists(
        st.floats(
            min_value=-100.0,
            max_value=100.0,
            allow_nan=False,
            allow_infinity=False,
        ),
        min_size=30,
        max_size=100,
    ).filter(lambda xs: max(xs) - min(xs) > 1e-6),
    a=st.floats(
        min_value=0.01,
        max_value=100.0,
        allow_nan=False,
        allow_infinity=False,
    ),
    b=st.floats(
        min_value=-100.0,
        max_value=100.0,
        allow_nan=False,
        allow_infinity=False,
    ),
)
@settings(max_examples=100)
def test_information_coefficient_perfect_positive_correlation(
    scores: list[float], a: float, b: float
) -> None:
    """**Validates: Requirements 6.1, 6.2, 17.3**

    When returns are an increasing linear function of scores
    (returns = a * scores + b with a > 0), IC SHALL be 1.0 within
    floating-point tolerance.

    The ``scores`` strategy filters out lists whose spread (max - min) is
    not above 1e-6, so the generated scores are never all (nearly) equal.
    """
    returns = [a * score + b for score in scores]

    ic = compute_information_coefficient(scores, returns)

    assert ic is not None, (
        f"IC should not be None for perfectly correlated data with variance. "
        f"a={a}, b={b}, scores={scores[:5]}..."
    )

    deviation = abs(ic - 1.0)
    assert deviation < 1e-6, (
        f"IC should be 1.0 for perfectly positively correlated data, "
        f"got {ic}. a={a}, b={b}"
    )
# ---------------------------------------------------------------------------
# Strategies for source reliability tests
# ---------------------------------------------------------------------------
from services.validation.calibration import compute_source_reliability
# Finite win rates drawn from the closed interval [0.0, 1.0].
observed_win_rate_strategy = st.floats(
    min_value=0.0,
    max_value=1.0,
    allow_nan=False,
    allow_infinity=False,
)
# Non-negative sample counts, from 0 up to 100,000.
sample_count_strategy = st.integers(min_value=0, max_value=100_000)
# ---------------------------------------------------------------------------
# Property 5: Source Reliability Bayesian Shrinkage Bounds and Convergence
# Validates: Requirements 8.1, 8.2, 17.5
# ---------------------------------------------------------------------------
@given(
    observed_win_rate=observed_win_rate_strategy,
    sample_count=sample_count_strategy,
)
@settings(max_examples=100)
def test_source_reliability_range(observed_win_rate: float, sample_count: int) -> None:
    """**Validates: Requirements 8.1, 8.2, 17.5**

    Property: for any observed_win_rate in [0.0, 1.0] and any
    sample_count >= 0, the source reliability computed via Bayesian
    shrinkage SHALL be in [0.0, 1.0].
    """
    reliability = compute_source_reliability(observed_win_rate, sample_count)

    reliability_in_range = 0.0 <= reliability <= 1.0
    assert reliability_in_range, (
        f"Reliability {reliability} is outside [0.0, 1.0]. "
        f"observed_win_rate={observed_win_rate}, sample_count={sample_count}"
    )
def test_source_reliability_zero_samples() -> None:
    """**Validates: Requirements 8.1, 8.2, 17.5**

    With sample_count = 0, reliability SHALL be exactly 0.5 (the prior
    mean), whatever the observed win rate is.
    """
    # Single spot check with an arbitrary non-neutral win rate.
    reliability = compute_source_reliability(observed_win_rate=0.8, sample_count=0)
    assert reliability == 0.5, (
        f"Reliability should be 0.5 when sample_count=0, got {reliability}"
    )

    # Sweep across the win-rate range, endpoints included.
    for wr in (0.0, 0.25, 0.5, 0.75, 1.0):
        r = compute_source_reliability(observed_win_rate=wr, sample_count=0)
        assert r == 0.5, (
            f"Reliability should be 0.5 when sample_count=0 regardless of "
            f"observed_win_rate={wr}, got {r}"
        )
@given(
    observed_win_rate=st.floats(
        min_value=0.0,
        max_value=1.0,
        allow_nan=False,
        allow_infinity=False,
    ),
)
@settings(max_examples=100)
def test_source_reliability_convergence(observed_win_rate: float) -> None:
    """**Validates: Requirements 8.1, 8.2, 17.5**

    As sample_count grows, reliability SHALL approach the
    observed_win_rate. This is checked at sample_count = 10000, where
    reliability must be within 0.01 of observed_win_rate.
    """
    reliability = compute_source_reliability(observed_win_rate, sample_count=10_000)

    gap = abs(reliability - observed_win_rate)
    assert gap < 0.01, (
        f"Reliability {reliability} should be within 0.01 of "
        f"observed_win_rate {observed_win_rate} when sample_count=10000. "
        f"Difference: {abs(reliability - observed_win_rate)}"
    )
# ---------------------------------------------------------------------------
# Strategies for quality gate tests
# ---------------------------------------------------------------------------
from services.trading.model_quality_gate import (
GateThresholdResult,
QualityGateConfig,
_evaluate_thresholds,
)
# Metric snapshot fed to _evaluate_thresholds by the quality gate tests:
# a dict with a fixed set of keys, each value drawn independently from
# the range given for it.
snapshot_strategy = st.fixed_dictionaries(
    {
        "prediction_count": st.integers(min_value=0, max_value=10_000),
        "information_coefficient": st.floats(
            min_value=-1.0,
            max_value=1.0,
            allow_nan=False,
            allow_infinity=False,
        ),
        "win_rate": st.floats(
            min_value=0.0,
            max_value=1.0,
            allow_nan=False,
            allow_infinity=False,
        ),
        "calibration_error": st.floats(
            min_value=0.0,
            max_value=1.0,
            allow_nan=False,
            allow_infinity=False,
        ),
        "avg_excess_return_vs_spy": st.floats(
            min_value=-1.0,
            max_value=1.0,
            allow_nan=False,
            allow_infinity=False,
        ),
    }
)
# QualityGateConfig instances whose thresholds are each drawn
# independently from the range given for them.
gate_config_strategy = st.builds(
    QualityGateConfig,
    min_prediction_count=st.integers(min_value=0, max_value=10_000),
    min_ic=st.floats(
        min_value=-1.0,
        max_value=1.0,
        allow_nan=False,
        allow_infinity=False,
    ),
    min_win_rate=st.floats(
        min_value=0.0,
        max_value=1.0,
        allow_nan=False,
        allow_infinity=False,
    ),
    max_ece=st.floats(
        min_value=0.0,
        max_value=1.0,
        allow_nan=False,
        allow_infinity=False,
    ),
    min_excess_return_vs_spy=st.floats(
        min_value=-1.0,
        max_value=1.0,
        allow_nan=False,
        allow_infinity=False,
    ),
)
# ---------------------------------------------------------------------------
# Property 6: Quality Gate Determinism and Threshold Monotonicity
# Validates: Requirements 11.1, 17.6
# ---------------------------------------------------------------------------
@given(snapshot=snapshot_strategy, config=gate_config_strategy)
@settings(max_examples=100)
def test_quality_gate_determinism(
    snapshot: dict, config: QualityGateConfig
) -> None:
    """**Validates: Requirements 11.1, 17.6**

    Property: evaluating the same metric snapshot against the same quality
    gate configuration twice SHALL give the same result for every
    threshold (name, threshold value, actual value and pass/fail), and so
    the same overall gate verdict.
    """
    results1 = _evaluate_thresholds(snapshot, config)
    results2 = _evaluate_thresholds(snapshot, config)

    same_length = len(results1) == len(results2)
    assert same_length, (
        f"Different number of threshold results: {len(results1)} vs {len(results2)}"
    )

    # Compare the two runs position by position, field by field.
    for r1, r2 in zip(results1, results2):
        assert r1.name == r2.name, (
            f"Threshold name mismatch: {r1.name!r} vs {r2.name!r}"
        )
        assert r1.threshold == r2.threshold, (
            f"Threshold value mismatch for {r1.name}: "
            f"{r1.threshold} vs {r2.threshold}"
        )
        assert r1.actual == r2.actual, (
            f"Actual value mismatch for {r1.name}: "
            f"{r1.actual} vs {r2.actual}"
        )
        assert r1.passed == r2.passed, (
            f"Determinism violated for threshold {r1.name}: "
            f"first call passed={r1.passed}, second call passed={r2.passed}. "
            f"actual={r1.actual}, threshold={r1.threshold}"
        )

    # The aggregate verdict (all thresholds passed) must agree as well.
    all_passed_1 = all(result.passed for result in results1)
    all_passed_2 = all(result.passed for result in results2)
    assert all_passed_1 == all_passed_2, (
        f"Overall gate determinism violated: "
        f"first call passed={all_passed_1}, second call passed={all_passed_2}"
    )
@given(
    snapshot=snapshot_strategy,
    config=gate_config_strategy,
    relax_amount=st.floats(
        min_value=0.0,
        max_value=1.0,
        allow_nan=False,
        allow_infinity=False,
    ),
    threshold_to_relax=st.sampled_from(
        [
            "min_prediction_count",
            "min_ic",
            "min_win_rate",
            "max_ece",
            "min_excess_return_vs_spy",
        ]
    ),
)
@settings(max_examples=100)
def test_quality_gate_threshold_monotonicity(
    snapshot: dict,
    config: QualityGateConfig,
    relax_amount: float,
    threshold_to_relax: str,
) -> None:
    """**Validates: Requirements 11.1, 17.6**

    Property: if the gate passes under a configuration, relaxing any one
    threshold (lowering a ``min_*`` value or raising ``max_ece``) SHALL NOT
    make it fail (monotonicity).

    Examples where the gate does not pass under the original configuration
    are skipped, because the property only constrains passing gates.
    """
    original_results = _evaluate_thresholds(snapshot, config)
    if not all(result.passed for result in original_results):
        # The gate failed to begin with, so there is nothing to check.
        return

    from dataclasses import replace

    # Work out the single field override that relaxes the chosen threshold.
    if threshold_to_relax == "min_prediction_count":
        # Integer threshold: lower it by up to 1000, never below zero.
        overrides = {
            "min_prediction_count": max(
                0, config.min_prediction_count - int(relax_amount * 1000)
            ),
        }
    elif threshold_to_relax in ("min_ic", "min_win_rate", "min_excess_return_vs_spy"):
        # Float minimums: a lower value is easier to satisfy.
        overrides = {
            threshold_to_relax: getattr(config, threshold_to_relax) - relax_amount,
        }
    elif threshold_to_relax == "max_ece":
        # The only maximum: a higher value is easier to satisfy.
        overrides = {"max_ece": config.max_ece + relax_amount}
    else:
        return  # pragma: no cover

    relaxed_config = replace(config, **overrides)

    relaxed_results = _evaluate_thresholds(snapshot, config=relaxed_config)
    relaxed_passed = all(result.passed for result in relaxed_results)
    assert relaxed_passed, (
        f"Monotonicity violated: gate passed with original config but failed "
        f"after relaxing {threshold_to_relax}. "
        f"Original config: min_prediction_count={config.min_prediction_count}, "
        f"min_ic={config.min_ic}, min_win_rate={config.min_win_rate}, "
        f"max_ece={config.max_ece}, "
        f"min_excess_return_vs_spy={config.min_excess_return_vs_spy}. "
        f"Relaxed threshold: {threshold_to_relax} by {relax_amount}. "
        f"Failed thresholds: "
        f"{[(r.name, r.actual, r.threshold) for r in relaxed_results if not r.passed]}"
    )