feat: model validation, calibration, and signal quality layer

- Migration 035: prediction_snapshots, prediction_outcomes, signal_evidence_links, model_metric_snapshots tables + SQL views
- Prediction snapshot writer with canonical evidence keys, duplicate detection, contribution scores
- Outcome evaluator across 5 horizons (1h, 6h, 1d, 7d, 30d)
- Metrics engine: ECE, Brier score, IC, Rank IC, benchmark comparison
- Attribution engine: per-source, per-catalyst, per-layer performance
- Calibration engine: Bayesian shrinkage source reliability
- Quality gate for live trading eligibility with configurable thresholds
- 7 new /api/validation/* endpoints
- Upgraded OpsModel dashboard with validation tab
- Enhanced recommendation display with calibration context
- Backtest replay validation mode
- 86 Python tests (unit + property-based), 179 frontend tests passing
2026-05-01 03:04:58 +00:00
parent 5d2ffd9163
commit 7fcc8a6c07
23 changed files with 7554 additions and 9 deletions
@@ -0,0 +1,690 @@
+"""Unit tests for model validation, calibration, and signal quality modules.
+
+Covers prediction snapshot writer, outcome evaluator, metrics engine,
+calibration engine, and quality gate — all pure-function / deterministic tests.
+
+Requirements: 1.1, 2.3, 2.4, 2.5, 3.3, 4.2, 4.5, 4.6, 4.7,
+              5.3, 5.4, 6.1, 6.2, 6.5, 8.1, 8.2, 8.3, 11.1, 11.6
+"""
+from __future__ import annotations
+
+import hashlib
+
+import pytest
+
+# -- Prediction Snapshot Writer --
+from services.validation.prediction_snapshot import (
+    MAX_SINGLE_DOCUMENT_WEIGHT,
+    compute_canonical_evidence_key,
+    compute_contribution_scores,
+)
+
+# -- Outcome Evaluator --
+from services.validation.outcome_evaluator import (
+    _compute_return,
+    _is_direction_correct,
+    _is_profitable,
+)
+
+# -- Metrics Engine --
+from services.validation.metrics import (
+    compute_brier_score,
+    compute_calibration_error,
+    compute_information_coefficient,
+    compute_rank_information_coefficient,
+)
+
+# -- Calibration Engine --
+from services.validation.calibration import (
+    compute_adjusted_evidence_weight,
+    compute_source_reliability,
+)
+
+# -- Quality Gate --
+from services.trading.model_quality_gate import (
+    QualityGateConfig,
+    _evaluate_thresholds,
+)
+
+
+# ===================================================================
+# 8.2 — Prediction Snapshot Writer unit tests
+# Requirements: 1.1, 2.3, 2.4, 2.5, 3.3
+# ===================================================================
+
+
+class TestCanonicalEvidenceKey:
+    """Tests for compute_canonical_evidence_key."""
+
+    def test_known_title_url_produces_expected_sha256(self):
+        """Known title/URL pair produces a deterministic SHA256 hash."""
+        key = compute_canonical_evidence_key(
+            "Test Article", "https://example.com/article?ref=123"
+        )
+        assert key == "abd5818d51579a7af51cd06861289c7f1fdc97c0f522e8ba13ce9b4aad01cb6f"
+
+    def test_empty_inputs(self):
+        """Empty title and URL produce SHA256 of empty string."""
+        key = compute_canonical_evidence_key("", "")
+        expected = hashlib.sha256(b"").hexdigest()
+        assert key == expected
+
+    def test_unicode_inputs(self):
+        """Unicode title and URL are handled correctly."""
+        key = compute_canonical_evidence_key(
+            "日本語テスト", "https://example.com/日本語"
+        )
+        assert key == "553553928bb4e36abdf283ff3c52df0695fca09809159650a9bdcb4fb2c5f62b"
+
+    def test_normalization_case_insensitive(self):
+        """Title and URL are lowercased before hashing."""
+        key_lower = compute_canonical_evidence_key(
+            "test article", "https://example.com/path"
+        )
+        key_upper = compute_canonical_evidence_key(
+            "TEST ARTICLE", "HTTPS://EXAMPLE.COM/PATH"
+        )
+        assert key_lower == key_upper
+
+    def test_normalization_strips_query_params(self):
+        """URL query parameters are stripped before hashing."""
+        key_with_params = compute_canonical_evidence_key(
+            "title", "https://example.com/article?utm_source=twitter&ref=123"
+        )
+        key_without_params = compute_canonical_evidence_key(
+            "title", "https://example.com/article"
+        )
+        assert key_with_params == key_without_params
+
+    def test_normalization_strips_whitespace(self):
+        """Leading/trailing whitespace in title is stripped."""
+        key_trimmed = compute_canonical_evidence_key(
+            "test", "https://example.com"
+        )
+        key_padded = compute_canonical_evidence_key(
+            "  test  ", "https://example.com"
+        )
+        assert key_trimmed == key_padded
+
+
+class TestDuplicateDetection:
+    """Tests for duplicate detection via canonical evidence keys."""
+
+    def test_three_docs_two_sharing_key_one_duplicate(self):
+        """3 docs where 2 share a canonical key → 1 marked duplicate."""
+        # Simulate the duplicate detection logic from create_prediction_snapshot
+        docs = [
+            {"title": "Breaking News", "url": "https://news.com/article"},
+            {"title": "breaking news", "url": "https://news.com/article?ref=1"},
+            {"title": "Other Story", "url": "https://other.com/story"},
+        ]
+
+        seen_keys: dict[str, int] = {}
+        duplicates: list[bool] = []
+
+        for doc in docs:
+            key = compute_canonical_evidence_key(doc["title"], doc["url"])
+            is_dup = key in seen_keys
+            if not is_dup:
+                seen_keys[key] = len(duplicates)
+            duplicates.append(is_dup)
+
+        assert duplicates == [False, True, False]
+        assert sum(duplicates) == 1
+
+
+class TestContributionScores:
+    """Tests for compute_contribution_scores."""
+
+    def test_known_weights(self):
+        """[0.5, 0.3, 0.2] → [0.5, 0.3, 0.2] (already sums to 1.0)."""
+        scores = compute_contribution_scores([0.5, 0.3, 0.2])
+        assert scores == pytest.approx([0.5, 0.3, 0.2])
+        assert sum(scores) == pytest.approx(1.0)
+
+    def test_single_doc(self):
+        """Single document → contribution score of 1.0."""
+        scores = compute_contribution_scores([0.7])
+        assert scores == pytest.approx([1.0])
+
+    def test_empty_input(self):
+        """Empty input → empty list."""
+        scores = compute_contribution_scores([])
+        assert scores == []
+
+    def test_all_zero_weights(self):
+        """All-zero weights → equal distribution."""
+        scores = compute_contribution_scores([0.0, 0.0, 0.0])
+        assert len(scores) == 3
+        assert all(s == pytest.approx(1.0 / 3.0) for s in scores)
+
+    def test_scores_sum_to_one(self):
+        """Arbitrary weights sum to 1.0."""
+        scores = compute_contribution_scores([1.0, 2.0, 3.0, 4.0])
+        assert sum(scores) == pytest.approx(1.0)
+        assert scores == pytest.approx([0.1, 0.2, 0.3, 0.4])
+
+
+class TestWeightClamping:
+    """Tests for MAX_SINGLE_DOCUMENT_WEIGHT clamping."""
+
+    def test_weight_above_max_clamped(self):
+        """Weight 1.5 → clamped to MAX_SINGLE_DOCUMENT_WEIGHT (1.0)."""
+        raw_weight = 1.5
+        clamped = min(raw_weight, MAX_SINGLE_DOCUMENT_WEIGHT)
+        assert clamped == 1.0
+
+    def test_weight_at_max_unchanged(self):
+        """Weight exactly at MAX stays unchanged."""
+        raw_weight = 1.0
+        clamped = min(raw_weight, MAX_SINGLE_DOCUMENT_WEIGHT)
+        assert clamped == 1.0
+
+    def test_weight_below_max_unchanged(self):
+        """Weight below MAX stays unchanged."""
+        raw_weight = 0.5
+        clamped = min(raw_weight, MAX_SINGLE_DOCUMENT_WEIGHT)
+        assert clamped == 0.5
+
+
+# ===================================================================
+# 8.3 — Outcome Evaluator unit tests
+# Requirements: 4.2, 4.5, 4.6, 4.7
+# ===================================================================
+
+
+class TestComputeReturn:
+    """Tests for _compute_return."""
+
+    def test_positive_return(self):
+        """Price 100 → 110 → return 0.10."""
+        assert _compute_return(100.0, 110.0) == pytest.approx(0.10)
+
+    def test_negative_return(self):
+        """Price 100 → 90 → return -0.10."""
+        assert _compute_return(100.0, 90.0) == pytest.approx(-0.10)
+
+    def test_zero_return(self):
+        """Price unchanged → return 0.0."""
+        assert _compute_return(100.0, 100.0) == pytest.approx(0.0)
+
+    def test_zero_current_price(self):
+        """Current price 0 → return 0.0 (guard against division by zero)."""
+        assert _compute_return(0.0, 110.0) == 0.0
+
+
+class TestDirectionCorrect:
+    """Tests for _is_direction_correct."""
+
+    def test_bullish_positive_return(self):
+        """Bullish + positive return → True."""
+        assert _is_direction_correct("bullish", 0.05) is True
+
+    def test_bullish_negative_return(self):
+        """Bullish + negative return → False."""
+        assert _is_direction_correct("bullish", -0.05) is False
+
+    def test_bearish_negative_return(self):
+        """Bearish + negative return → True."""
+        assert _is_direction_correct("bearish", -0.05) is True
+
+    def test_bearish_positive_return(self):
+        """Bearish + positive return → False."""
+        assert _is_direction_correct("bearish", 0.05) is False
+
+    def test_bullish_zero_return(self):
+        """Bullish + zero return → False (not strictly positive)."""
+        assert _is_direction_correct("bullish", 0.0) is False
+
+    def test_bearish_zero_return(self):
+        """Bearish + zero return → False (not strictly negative)."""
+        assert _is_direction_correct("bearish", 0.0) is False
+
+    def test_mixed_direction(self):
+        """Mixed direction → always False."""
+        assert _is_direction_correct("mixed", 0.05) is False
+        assert _is_direction_correct("mixed", -0.05) is False
+
+    def test_case_insensitive(self):
+        """Direction matching is case-insensitive."""
+        assert _is_direction_correct("Bullish", 0.05) is True
+        assert _is_direction_correct("BEARISH", -0.05) is True
+
+
+class TestIsProfitable:
+    """Tests for _is_profitable."""
+
+    def test_buy_positive_return(self):
+        """Buy + positive return → True."""
+        assert _is_profitable("buy", 0.05) is True
+
+    def test_buy_negative_return(self):
+        """Buy + negative return → False."""
+        assert _is_profitable("buy", -0.05) is False
+
+    def test_sell_negative_return(self):
+        """Sell + negative return → True."""
+        assert _is_profitable("sell", -0.05) is True
+
+    def test_sell_positive_return(self):
+        """Sell + positive return → False."""
+        assert _is_profitable("sell", 0.05) is False
+
+    def test_hold_any_return(self):
+        """Hold → always False."""
+        assert _is_profitable("hold", 0.05) is False
+        assert _is_profitable("hold", -0.05) is False
+
+    def test_case_insensitive(self):
+        """Action matching is case-insensitive."""
+        assert _is_profitable("Buy", 0.05) is True
+        assert _is_profitable("SELL", -0.05) is True
+
+
+class TestExcessReturn:
+    """Tests for excess return computation (ticker return - benchmark return)."""
+
+    def test_excess_return_vs_spy(self):
+        """Ticker 10%, SPY 5% → excess 5%."""
+        ticker_return = _compute_return(100.0, 110.0)  # 0.10
+        spy_return = _compute_return(100.0, 105.0)  # 0.05
+        excess = ticker_return - spy_return
+        assert excess == pytest.approx(0.05)
+
+    def test_negative_excess_return(self):
+        """Ticker 3%, SPY 5% → excess -2%."""
+        ticker_return = _compute_return(100.0, 103.0)  # 0.03
+        spy_return = _compute_return(100.0, 105.0)  # 0.05
+        excess = ticker_return - spy_return
+        assert excess == pytest.approx(-0.02)
+
+    def test_zero_excess_return(self):
+        """Same return → excess 0%."""
+        ticker_return = _compute_return(100.0, 110.0)
+        spy_return = _compute_return(100.0, 110.0)
+        excess = ticker_return - spy_return
+        assert excess == pytest.approx(0.0)
+
+
+# ===================================================================
+# 8.4 — Metrics Engine unit tests
+# Requirements: 5.3, 5.4, 6.1, 6.2, 6.5
+# ===================================================================
+
+
+class TestCalibrationError:
+    """Tests for compute_calibration_error (ECE)."""
+
+    def test_perfect_calibration_ece_zero(self):
+        """Perfect calibration → ECE = 0.0.
+
+        All predictions in [0.70, 0.80) bucket with 75% win rate
+        matching ~0.75 avg confidence.
+        """
+        confidences = [0.75] * 100
+        outcomes = [True] * 75 + [False] * 25
+        ece, buckets = compute_calibration_error(confidences, outcomes)
+        assert ece == pytest.approx(0.0, abs=1e-9)
+
+    def test_all_overconfident_positive_ece(self):
+        """All overconfident (high confidence, low win rate) → positive ECE."""
+        # All predictions at 0.95 confidence but only 50% win rate
+        confidences = [0.95] * 100
+        outcomes = [True] * 50 + [False] * 50
+        ece, buckets = compute_calibration_error(confidences, outcomes)
+        assert ece > 0.0
+        # ECE should be |0.95 - 0.50| = 0.45
+        assert ece == pytest.approx(0.45, abs=0.01)
+
+    def test_empty_input_returns_zero(self):
+        """Empty input → ECE = 0.0, empty buckets."""
+        ece, buckets = compute_calibration_error([], [])
+        assert ece == 0.0
+        assert buckets == []
+
+    def test_miscalibrated_flag(self):
+        """Buckets with |avg_conf - win_rate| > 0.15 are flagged."""
+        # All in [0.90, 1.00] bucket with 0% win rate → diff = 0.95
+        confidences = [0.95] * 20
+        outcomes = [False] * 20
+        _ece, buckets = compute_calibration_error(confidences, outcomes)
+        # Find the [0.90, 1.00] bucket
+        high_bucket = [b for b in buckets if b.bucket_low == 0.90]
+        assert len(high_bucket) == 1
+        assert high_bucket[0].miscalibrated is True
+
+    def test_ece_in_valid_range(self):
+        """ECE is always in [0.0, 1.0]."""
+        confidences = [0.55, 0.65, 0.75, 0.85, 0.95]
+        outcomes = [False, True, False, True, False]
+        ece, _ = compute_calibration_error(confidences, outcomes)
+        assert 0.0 <= ece <= 1.0
+
+
+class TestBrierScore:
+    """Tests for compute_brier_score."""
+
+    def test_all_correct_at_p1(self):
+        """All correct at p=1.0 → Brier = 0.0."""
+        p_bulls = [1.0] * 10
+        outcomes = [True] * 10
+        assert compute_brier_score(p_bulls, outcomes) == pytest.approx(0.0)
+
+    def test_all_wrong_at_p1(self):
+        """All wrong at p=1.0 → Brier = 1.0."""
+        p_bulls = [1.0] * 10
+        outcomes = [False] * 10
+        assert compute_brier_score(p_bulls, outcomes) == pytest.approx(1.0)
+
+    def test_all_correct_at_p0(self):
+        """All correct at p=0.0 (bearish correct) → Brier = 0.0."""
+        p_bulls = [0.0] * 10
+        outcomes = [False] * 10
+        assert compute_brier_score(p_bulls, outcomes) == pytest.approx(0.0)
+
+    def test_empty_input(self):
+        """Empty input → Brier = 0.0."""
+        assert compute_brier_score([], []) == 0.0
+
+    def test_mixed_predictions(self):
+        """Mixed predictions produce a value in (0, 1)."""
+        p_bulls = [0.8, 0.6, 0.3]
+        outcomes = [True, False, True]
+        brier = compute_brier_score(p_bulls, outcomes)
+        assert 0.0 < brier < 1.0
+
+
+class TestInformationCoefficient:
+    """Tests for compute_information_coefficient (Pearson IC)."""
+
+    def test_perfect_positive_correlation(self):
+        """Perfectly correlated scores and returns → IC = 1.0."""
+        scores = list(range(30))
+        returns = [s * 2.0 + 1.0 for s in scores]  # linear: y = 2x + 1
+        ic = compute_information_coefficient(scores, returns)
+        assert ic is not None
+        assert ic == pytest.approx(1.0, abs=1e-9)
+
+    def test_perfect_negative_correlation(self):
+        """Anti-correlated scores and returns → IC = -1.0."""
+        scores = list(range(30))
+        returns = [-s * 2.0 for s in scores]
+        ic = compute_information_coefficient(scores, returns)
+        assert ic is not None
+        assert ic == pytest.approx(-1.0, abs=1e-9)
+
+    def test_fewer_than_30_returns_none(self):
+        """Fewer than 30 data points → None."""
+        scores = list(range(29))
+        returns = list(range(29))
+        ic = compute_information_coefficient(scores, returns)
+        assert ic is None
+
+    def test_ic_in_valid_range(self):
+        """IC is always in [-1.0, 1.0] for valid data."""
+        scores = [float(i % 7) for i in range(50)]
+        returns = [float(i % 5) for i in range(50)]
+        ic = compute_information_coefficient(scores, returns)
+        assert ic is not None
+        assert -1.0 <= ic <= 1.0
+
+
+class TestRankInformationCoefficient:
+    """Tests for compute_rank_information_coefficient (Spearman Rank IC)."""
+
+    def test_perfect_rank_correlation(self):
+        """Perfectly rank-correlated → Rank IC = 1.0."""
+        scores = list(range(30))
+        returns = list(range(30))  # same ordering
+        rank_ic = compute_rank_information_coefficient(scores, returns)
+        assert rank_ic is not None
+        assert rank_ic == pytest.approx(1.0, abs=1e-9)
+
+    def test_perfect_anti_rank_correlation(self):
+        """Perfectly anti-rank-correlated → Rank IC = -1.0."""
+        scores = list(range(30))
+        returns = list(range(29, -1, -1))  # reversed ordering
+        rank_ic = compute_rank_information_coefficient(scores, returns)
+        assert rank_ic is not None
+        assert rank_ic == pytest.approx(-1.0, abs=1e-9)
+
+    def test_fewer_than_30_returns_none(self):
+        """Fewer than 30 data points → None."""
+        scores = list(range(29))
+        returns = list(range(29))
+        rank_ic = compute_rank_information_coefficient(scores, returns)
+        assert rank_ic is None
+
+
+# ===================================================================
+# 8.5 — Calibration Engine unit tests
+# Requirements: 8.1, 8.2, 8.3
+# ===================================================================
+
+
+class TestSourceReliability:
+    """Tests for compute_source_reliability (Bayesian shrinkage)."""
+
+    def test_zero_samples_returns_prior(self):
+        """n=0 → reliability = 0.5 (prior mean)."""
+        assert compute_source_reliability(0.8, 0) == 0.5
+
+    def test_large_sample_approaches_observed(self):
+        """n=1000 with wr=0.8 → ≈0.8 (close to observed win rate)."""
+        reliability = compute_source_reliability(0.8, 1000)
+        assert reliability == pytest.approx(0.7912621359223302)
+        # Should be close to 0.8 but not exactly
+        assert abs(reliability - 0.8) < 0.02
+
+    def test_moderate_sample(self):
+        """n=30 with wr=0.7 → 0.6 exactly.
+
+        0.5 + (30/60) * (0.7 - 0.5) = 0.5 + 0.5 * 0.2 = 0.6
+        """
+        assert compute_source_reliability(0.7, 30) == pytest.approx(0.6)
+
+    def test_reliability_in_range(self):
+        """Reliability is always in [0.0, 1.0]."""
+        # Extreme win rates
+        assert 0.0 <= compute_source_reliability(0.0, 100) <= 1.0
+        assert 0.0 <= compute_source_reliability(1.0, 100) <= 1.0
+        assert 0.0 <= compute_source_reliability(0.5, 1) <= 1.0
+
+    def test_negative_sample_count_returns_prior(self):
+        """Negative sample count → treated as 0, returns 0.5."""
+        assert compute_source_reliability(0.8, -5) == 0.5
+
+
+class TestAdjustedEvidenceWeight:
+    """Tests for compute_adjusted_evidence_weight."""
+
+    def test_reliability_half_gives_base_weight(self):
+        """reliability=0.5 → adjusted = base * (0.5 + 0.5) = base * 1.0."""
+        assert compute_adjusted_evidence_weight(1.0, 0.5) == pytest.approx(1.0)
+
+    def test_high_reliability_increases_weight(self):
+        """reliability=1.0 → adjusted = base * 1.5."""
+        assert compute_adjusted_evidence_weight(1.0, 1.0) == pytest.approx(1.5)
+
+    def test_low_reliability_decreases_weight(self):
+        """reliability=0.0 → adjusted = base * 0.5."""
+        assert compute_adjusted_evidence_weight(1.0, 0.0) == pytest.approx(0.5)
+
+    def test_clamped_to_upper_bound(self):
+        """Large base_weight * high reliability → clamped to 2.0."""
+        result = compute_adjusted_evidence_weight(3.0, 1.0)
+        assert result == 2.0
+
+    def test_clamped_to_lower_bound(self):
+        """Small base_weight * low reliability → clamped to 0.1."""
+        result = compute_adjusted_evidence_weight(0.1, 0.0)
+        assert result == 0.1
+
+    def test_mid_range_not_clamped(self):
+        """Normal values stay within bounds without clamping."""
+        result = compute_adjusted_evidence_weight(0.8, 0.6)
+        # 0.8 * (0.5 + 0.6) = 0.8 * 1.1 = 0.88
+        assert result == pytest.approx(0.88)
+        assert 0.1 <= result <= 2.0
+
+
+# ===================================================================
+# 8.6 — Quality Gate unit tests
+# Requirements: 11.1, 11.6
+# ===================================================================
+
+
+class TestQualityGate:
+    """Tests for _evaluate_thresholds and QualityGateConfig.
+
+    Each test builds a metric-snapshot dict, evaluates it against a
+    QualityGateConfig, and inspects the per-threshold results through their
+    ``name``, ``passed``, ``actual`` and ``threshold`` attributes.
+    """
+
+    def _make_passing_snapshot(self) -> dict:
+        """Return a metric snapshot dict that meets all default thresholds."""
+        return {
+            "prediction_count": 200,
+            "information_coefficient": 0.10,
+            "win_rate": 0.60,
+            "calibration_error": 0.08,
+            "avg_excess_return_vs_spy": 0.02,
+        }
+
+    def test_all_thresholds_met_pass(self):
+        """All thresholds met → every result is passed=True."""
+        config = QualityGateConfig()
+        snapshot = self._make_passing_snapshot()
+
+        results = _evaluate_thresholds(snapshot, config)
+
+        assert len(results) == 5
+        assert all(r.passed for r in results), (
+            f"Expected all thresholds to pass, but got: "
+            f"{[(r.name, r.passed) for r in results]}"
+        )
+
+    def test_one_threshold_failed_ic_below_min(self):
+        """IC below min_ic → that threshold fails, others pass."""
+        config = QualityGateConfig()
+        snapshot = self._make_passing_snapshot()
+        snapshot["information_coefficient"] = 0.01  # below min_ic=0.03
+
+        results = _evaluate_thresholds(snapshot, config)
+
+        results_by_name = {r.name: r for r in results}
+        assert results_by_name["min_ic"].passed is False
+        assert results_by_name["min_ic"].actual == pytest.approx(0.01)
+        assert results_by_name["min_ic"].threshold == pytest.approx(0.03)
+        # All other thresholds should still pass
+        for name, result in results_by_name.items():
+            if name != "min_ic":
+                assert result.passed is True, f"{name} should pass but didn't"
+
+    def test_all_thresholds_below_all_fail(self):
+        """All metric values on the failing side of their thresholds → all fail."""
+        config = QualityGateConfig()
+        snapshot = {
+            "prediction_count": 10,           # below 100
+            "information_coefficient": 0.0,   # below 0.03
+            "win_rate": 0.40,                 # below 0.53
+            "calibration_error": 0.50,        # above 0.15
+            "avg_excess_return_vs_spy": -0.05, # below 0.0
+        }
+
+        results = _evaluate_thresholds(snapshot, config)
+
+        assert len(results) == 5
+        assert all(not r.passed for r in results), (
+            f"Expected all thresholds to fail, but got: "
+            f"{[(r.name, r.passed) for r in results]}"
+        )
+
+    def test_failsafe_none_values_treated_as_worst_case(self):
+        """Missing (None) metric values fall back to default numbers.
+
+        As asserted below, None becomes 0 for prediction count, IC and win
+        rate, and 1.0 for calibration error, so those four thresholds fail.
+        A None excess return becomes 0.0, which equals the 0.0 minimum and
+        therefore passes — so not every threshold fails for an all-None
+        snapshot.
+
+        NOTE(review): the earlier wording of this docstring said all
+        thresholds fail; confirm whether a missing excess return is meant
+        to pass or should map to a failing value instead.
+        """
+        config = QualityGateConfig()
+        snapshot = {
+            "prediction_count": None,
+            "information_coefficient": None,
+            "win_rate": None,
+            "calibration_error": None,
+            "avg_excess_return_vs_spy": None,
+        }
+
+        results = _evaluate_thresholds(snapshot, config)
+
+        results_by_name = {r.name: r for r in results}
+        # prediction_count: None → 0, below 100 → fail
+        assert results_by_name["min_prediction_count"].passed is False
+        assert results_by_name["min_prediction_count"].actual == 0.0
+        # IC: None → 0.0, below 0.03 → fail
+        assert results_by_name["min_ic"].passed is False
+        assert results_by_name["min_ic"].actual == 0.0
+        # win_rate: None → 0.0, below 0.53 → fail
+        assert results_by_name["min_win_rate"].passed is False
+        assert results_by_name["min_win_rate"].actual == 0.0
+        # calibration_error: None → 1.0 (worst-case), above 0.15 → fail
+        assert results_by_name["max_ece"].passed is False
+        assert results_by_name["max_ece"].actual == 1.0
+        # excess_return: None → 0.0, equal to min 0.0 → pass (>= 0.0)
+        assert results_by_name["min_excess_return_vs_spy"].passed is True
+        assert results_by_name["min_excess_return_vs_spy"].actual == 0.0
+
+    def test_stale_snapshot_age_exceeds_max(self):
+        """A 30-hour age is greater than a configured 24-hour maximum.
+
+        This only checks that QualityGateConfig accepts and exposes
+        max_snapshot_age_hours; it compares a literal age against that
+        field and does not call any gate-evaluation function.
+
+        NOTE(review): staleness enforcement is presumably done by the async
+        evaluate_quality_gate before thresholds are evaluated, but that
+        function is not exercised here — confirm it is covered elsewhere.
+        """
+        config = QualityGateConfig(max_snapshot_age_hours=24)
+        age_hours = 30.0  # 30 hours old, exceeds 24h max
+
+        assert age_hours > config.max_snapshot_age_hours
+
+    def test_threshold_boundary_exact_values(self):
+        """Metric values exactly at threshold boundaries → pass.
+
+        Passing at equality means the min thresholds behave as >= and the
+        max threshold as <=.
+        """
+        config = QualityGateConfig()
+        snapshot = {
+            "prediction_count": 100,          # exactly min_prediction_count
+            "information_coefficient": 0.03,  # exactly min_ic
+            "win_rate": 0.53,                 # exactly min_win_rate
+            "calibration_error": 0.15,        # exactly max_ece
+            "avg_excess_return_vs_spy": 0.0,  # exactly min_excess_return
+        }
+
+        results = _evaluate_thresholds(snapshot, config)
+
+        assert all(r.passed for r in results), (
+            f"Boundary values should pass, but got: "
+            f"{[(r.name, r.passed, r.actual, r.threshold) for r in results]}"
+        )
+
+    def test_custom_config_thresholds(self):
+        """Custom QualityGateConfig thresholds are respected.
+
+        Every metric here would fail the default thresholds quoted in the
+        other tests (100 / 0.03 / 0.53 / 0.15 / 0.0) but clears the relaxed
+        values supplied to the config.
+        """
+        config = QualityGateConfig(
+            min_prediction_count=50,
+            min_ic=0.01,
+            min_win_rate=0.51,
+            max_ece=0.20,
+            min_excess_return_vs_spy=-0.01,
+        )
+        snapshot = {
+            "prediction_count": 60,
+            "information_coefficient": 0.02,
+            "win_rate": 0.52,
+            "calibration_error": 0.18,
+            "avg_excess_return_vs_spy": -0.005,
+        }
+
+        results = _evaluate_thresholds(snapshot, config)
+
+        assert all(r.passed for r in results), (
+            f"Custom thresholds should pass, but got: "
+            f"{[(r.name, r.passed) for r in results]}"
+        )
@@ -0,0 +1,662 @@
+"""Property-based tests for model validation, calibration, and signal quality.
+
+Feature: model-validation-calibration
+
+Tests correctness properties from the design specification covering
+canonical evidence key determinism/idempotence, contribution score
+invariants, calibration error bounds, Brier score bounds, information
+coefficient bounds, source reliability shrinkage, and quality gate
+determinism.
+"""
+from __future__ import annotations
+
+import urllib.parse
+
+from hypothesis import given, settings
+from hypothesis import strategies as st
+
+from services.validation.prediction_snapshot import (
+    compute_canonical_evidence_key,
+    compute_contribution_scores,
+)
+
+# ---------------------------------------------------------------------------
+# Strategies
+# ---------------------------------------------------------------------------
+
+# Titles: any text of up to 200 characters (whitespace and unicode included)
+title_strategy = st.text(max_size=200)
+
+# URLs: scheme, host, path and an optional query string, joined by urlunparse.
+# The params and fragment slots of the 6-tuple are always empty.
+url_strategy = st.tuples(
+    st.sampled_from(["http", "https"]),
+    st.from_regex(r"[a-z0-9]{1,20}\.[a-z]{2,6}", fullmatch=True),
+    st.from_regex(r"(/[a-z0-9\-]{0,15}){0,4}", fullmatch=True),
+    st.just(""),
+    st.from_regex(r"([a-z]{1,8}=[a-z0-9]{1,8}(&[a-z]{1,8}=[a-z0-9]{1,8}){0,3})?", fullmatch=True),
+    st.just(""),
+).map(urllib.parse.urlunparse)
+
+
+# ---------------------------------------------------------------------------
+# Property 4: Canonical Evidence Key Determinism and Normalization Idempotence
+# Validates: Requirements 2.3, 17.4
+# ---------------------------------------------------------------------------
+
+
+@given(title=title_strategy, url=url_strategy)
+@settings(max_examples=100)
+def test_canonical_evidence_key_determinism(title: str, url: str) -> None:
+    """**Validates: Requirements 2.3, 17.4**
+
+    Hashing the same (title, url) pair twice SHALL yield identical keys,
+    and the key SHALL look like a lowercase SHA256 hex digest.
+    """
+    key1, key2 = (compute_canonical_evidence_key(title, url) for _ in range(2))
+    assert key1 == key2, (
+        f"Determinism violated: same inputs produced different keys: "
+        f"{key1!r} != {key2!r}"
+    )
+    # A SHA256 hex digest is 64 characters drawn from 0-9 and a-f.
+    digest_length = len(key1)
+    assert digest_length == 64, f"Expected 64-char hex digest, got {digest_length}"
+    assert set(key1) <= set("0123456789abcdef"), (
+        f"Key contains non-hex characters: {key1!r}"
+    )
+
+
+@given(title=title_strategy, url=url_strategy)
+@settings(max_examples=100)
+def test_canonical_evidence_key_normalization_idempotence(title: str, url: str) -> None:
+    """**Validates: Requirements 2.3, 17.4**
+
+    Feeding pre-normalized inputs to the key function SHALL give the same
+    key as feeding it the raw inputs (idempotence).
+
+    Pre-normalization applied by this test:
+    - Title: strip leading/trailing whitespace, then lowercase
+    - URL: lowercase, then keep only scheme, netloc and path
+    """
+    # Build the normalized forms on the test side.
+    scheme, netloc, path, *_ = urllib.parse.urlparse(url.lower())
+    normalized_url = urllib.parse.urlunparse(
+        (scheme, netloc, path, "", "", "")
+    )
+    normalized_title = title.strip().lower()
+
+    # Key the raw inputs and the normalized inputs separately.
+    key_original = compute_canonical_evidence_key(title, url)
+    key_from_normalized = compute_canonical_evidence_key(
+        normalized_title, normalized_url
+    )
+
+    assert key_original == key_from_normalized, (
+        f"Idempotence violated: key from original inputs ({key_original!r}) "
+        f"differs from key from pre-normalized inputs ({key_from_normalized!r}). "
+        f"title={title!r}, url={url!r}"
+    )
+
+
+# ---------------------------------------------------------------------------
+# Strategies for contribution score tests
+# ---------------------------------------------------------------------------
+
+# Between 1 and 50 finite document weights, each within [0.01, 1000.0].
+_document_weight = st.floats(
+    min_value=0.01, max_value=1000.0, allow_nan=False, allow_infinity=False
+)
+positive_weights_strategy = st.lists(_document_weight, min_size=1, max_size=50)
+
+
+# ---------------------------------------------------------------------------
+# Property 7: Contribution Score Sum-to-One and Range
+# Validates: Requirements 2.5, 17.7
+# ---------------------------------------------------------------------------
+
+
+@given(weights=positive_weights_strategy)
+@settings(max_examples=100)
+def test_contribution_scores_sum_to_one_and_range(weights: list[float]) -> None:
+    """**Validates: Requirements 2.5, 17.7**
+
+    Given one or more positive document weights, every contribution score
+    SHALL lie in [0.0, 1.0] and the scores SHALL total 1.0 to within 1e-9.
+    """
+    scores = compute_contribution_scores(weights)
+    # One score per input weight.
+    assert len(scores) == len(weights), (
+        f"Expected {len(weights)} scores, got {len(scores)}"
+    )
+
+    # Locate the first score, if any, that falls outside the unit interval.
+    i, score = next(
+        ((i, s) for i, s in enumerate(scores) if not 0.0 <= s <= 1.0), (None, None)
+    )
+    assert i is None, (
+        f"Score at index {i} is {score}, expected in [0.0, 1.0]. "
+        f"weights={weights}"
+    )
+
+    # The scores are meant to form a distribution over the documents.
+    total = sum(scores)
+    assert abs(total - 1.0) < 1e-9, (
+        f"Scores sum to {total}, expected 1.0 within 1e-9 tolerance. "
+        f"weights={weights}"
+    )
+
+
+def test_contribution_scores_empty_input() -> None:
+    """**Validates: Requirements 2.5, 17.7**
+
+    An empty weight list SHALL map to an empty score list.
+    """
+    scores = compute_contribution_scores(list())
+    assert scores == [], f"Expected empty list for empty input, got {scores}"
+
+
+# ---------------------------------------------------------------------------
+# Strategies for calibration error tests
+# ---------------------------------------------------------------------------
+
+# Confidence is drawn from [0.50, 1.00]; the outcome is a plain boolean.
+confidence_strategy = st.floats(
+    min_value=0.50, max_value=1.00, allow_nan=False, allow_infinity=False
+)
+outcome_strategy = st.booleans()
+
+# 1 to 100 (confidence, outcome) pairs per generated example.
+_prediction_pair = st.tuples(confidence_strategy, outcome_strategy)
+prediction_pairs_strategy = st.lists(_prediction_pair, min_size=1, max_size=100)
+
+# Import metric functions
+from services.validation.metrics import (
+    compute_brier_score,
+    compute_calibration_error,
+    compute_information_coefficient,
+)
+
+
+# ---------------------------------------------------------------------------
+# Property 1: Calibration Error Range and Round-Trip
+# Validates: Requirements 5.1, 5.3, 17.1
+# ---------------------------------------------------------------------------
+
+
+@given(pairs=prediction_pairs_strategy)
+@settings(max_examples=100)
+def test_calibration_error_range(pairs: list[tuple[float, bool]]) -> None:
+    """**Validates: Requirements 5.1, 5.3, 17.1**
+
+    Whatever mix of confidences in [0.50, 1.00] and boolean outcomes is
+    supplied, the Expected Calibration Error (ECE) SHALL stay within
+    [0.0, 1.0].
+    """
+    confidences, outcomes = (list(column) for column in zip(*pairs))
+
+    ece, buckets = compute_calibration_error(confidences, outcomes)
+
+    assert 0.0 <= ece <= 1.0, (
+        f"ECE {ece} is outside [0.0, 1.0]. "
+        f"confidences={confidences}, outcomes={outcomes}"
+    )
+
+    # Buckets that received predictions must report an average confidence
+    # and an observed win rate that are both valid proportions.
+    populated = (b for b in buckets if b.prediction_count > 0)
+    for bucket in populated:
+        assert 0.0 <= bucket.avg_confidence <= 1.0, (
+            f"Bucket [{bucket.bucket_low}, {bucket.bucket_high}) has "
+            f"avg_confidence={bucket.avg_confidence} outside [0.0, 1.0]"
+        )
+        assert 0.0 <= bucket.observed_win_rate <= 1.0, (
+            f"Bucket [{bucket.bucket_low}, {bucket.bucket_high}) has "
+            f"observed_win_rate={bucket.observed_win_rate} outside [0.0, 1.0]"
+        )
+
+
+def test_calibration_error_zero_when_perfectly_calibrated() -> None:
+    """**Validates: Requirements 5.1, 5.3, 17.1**
+
+    When every bucket's observed win rate matches its average confidence,
+    ECE SHALL be 0.0 (compared within a floating-point tolerance of 1e-9).
+
+    Builds 100 predictions per bucket, all at the bucket midpoint
+    confidence, and marks ``round(100 * midpoint)`` of them True so the
+    fraction of True outcomes equals that confidence.
+    """
+    # Averaging 100 copies of a value such as 0.55 is not guaranteed to
+    # reproduce 0.55 bit-for-bit, so compare with a tolerance, not ``==``.
+    tolerance = 1e-9
+    bucket_midpoints = [0.55, 0.65, 0.75, 0.85, 0.95]
+    n_per_bucket = 100
+
+    confidences: list[float] = []
+    outcomes: list[bool] = []
+
+    for midpoint in bucket_midpoints:
+        n_true = round(n_per_bucket * midpoint)
+        n_false = n_per_bucket - n_true
+
+        confidences.extend([midpoint] * n_per_bucket)
+        outcomes.extend([True] * n_true + [False] * n_false)
+
+    ece, buckets = compute_calibration_error(confidences, outcomes)
+
+    assert abs(ece) < tolerance, (
+        f"ECE should be 0.0 for perfectly calibrated predictions, got {ece}. "
+        f"Buckets: {[(b.avg_confidence, b.observed_win_rate, b.prediction_count) for b in buckets]}"
+    )
+
+    # Verify each non-empty bucket has matching avg_confidence and win_rate
+    for bucket in buckets:
+        if bucket.prediction_count > 0:
+            assert abs(bucket.avg_confidence - bucket.observed_win_rate) < tolerance, (
+                f"Bucket [{bucket.bucket_low}, {bucket.bucket_high}) has "
+                f"avg_confidence={bucket.avg_confidence} != "
+                f"observed_win_rate={bucket.observed_win_rate}"
+            )
+            assert not bucket.miscalibrated, (
+                f"Bucket [{bucket.bucket_low}, {bucket.bucket_high}) should not "
+                f"be flagged as miscalibrated when perfectly calibrated"
+            )
+
+
+# ---------------------------------------------------------------------------
+# Strategies for Brier score tests
+# ---------------------------------------------------------------------------
+
+# Bullish probability anywhere in [0.0, 1.0], paired with a boolean outcome.
+p_bull_strategy = st.floats(
+    min_value=0.0, max_value=1.0, allow_nan=False, allow_infinity=False
+)
+brier_outcome_strategy = st.booleans()
+
+# 1 to 100 (p_bull, outcome) pairs per generated example.
+_brier_pair = st.tuples(p_bull_strategy, brier_outcome_strategy)
+brier_pairs_strategy = st.lists(_brier_pair, min_size=1, max_size=100)
+
+
+# ---------------------------------------------------------------------------
+# Property 2: Brier Score Range and Perfect Prediction
+# Validates: Requirements 5.4, 17.2
+# ---------------------------------------------------------------------------
+
+
+@given(pairs=brier_pairs_strategy)
+@settings(max_examples=100)
+def test_brier_score_range(pairs: list[tuple[float, bool]]) -> None:
+    """**Validates: Requirements 5.4, 17.2**
+
+    Given p_bull values in [0.0, 1.0] paired with boolean outcomes, the
+    Brier score SHALL never leave [0.0, 1.0].
+    """
+    p_bulls, outcomes = (list(column) for column in zip(*pairs))
+
+    brier = compute_brier_score(p_bulls, outcomes)
+    within_unit_interval = 0.0 <= brier <= 1.0
+
+    assert within_unit_interval, (
+        f"Brier score {brier} is outside [0.0, 1.0]. "
+        f"p_bulls={p_bulls}, outcomes={outcomes}"
+    )
+
+
+@given(n=st.integers(min_value=1, max_value=100))
+@settings(max_examples=100)
+def test_brier_score_perfect_prediction(n: int) -> None:
+    """**Validates: Requirements 5.4, 17.2**
+
+    Forecasts that are both certain and right — p_bull = 1.0 on every True
+    outcome, or p_bull = 0.0 on every False outcome — SHALL produce a
+    Brier score of exactly 0.0.
+    """
+    # n certain-bullish forecasts, every one followed by a True outcome.
+    brier_bull = compute_brier_score(
+        [1.0] * n, [True] * n
+    )
+    assert brier_bull == 0.0, (
+        f"Brier score should be 0.0 for perfect bullish predictions, "
+        f"got {brier_bull} with n={n}"
+    )
+
+    # n certain-bearish forecasts, every one followed by a False outcome.
+    brier_bear = compute_brier_score(
+        [0.0] * n, [False] * n
+    )
+    assert brier_bear == 0.0, (
+        f"Brier score should be 0.0 for perfect bearish predictions, "
+        f"got {brier_bear} with n={n}"
+    )
+
+
+# ---------------------------------------------------------------------------
+# Strategies for Information Coefficient tests
+# ---------------------------------------------------------------------------
+
+# Finite values in [-100, 100]; used for both scores and returns.
+ic_score_strategy = st.floats(
+    min_value=-100.0, max_value=100.0, allow_nan=False, allow_infinity=False
+)
+
+# Between 30 and 100 (score, return) pairs per generated example.
+_score_return_pair = st.tuples(ic_score_strategy, ic_score_strategy)
+ic_pairs_strategy = st.lists(
+    _score_return_pair, min_size=30, max_size=100
+)
+
+
+# ---------------------------------------------------------------------------
+# Property 3: Information Coefficient Range and Perfect Correlation
+# Validates: Requirements 6.1, 6.2, 17.3
+# ---------------------------------------------------------------------------
+
+
+@given(pairs=ic_pairs_strategy)
+@settings(max_examples=100)
+def test_information_coefficient_range(pairs: list[tuple[float, float]]) -> None:
+    """**Validates: Requirements 6.1, 6.2, 17.3**
+
+    For 30 or more (score, return) pairs of finite floats, the
+    Information Coefficient (Pearson correlation) SHALL either be None
+    (when variance is zero) or fall within [-1.0, 1.0].
+    """
+    scores, returns = (list(column) for column in zip(*pairs))
+
+    ic = compute_information_coefficient(scores, returns)
+
+    # A None result carries no value to range-check, so stop here.
+    if ic is None:
+        return
+
+    assert -1.0 <= ic <= 1.0, (
+        f"IC {ic} is outside [-1.0, 1.0]. "
+        f"scores={scores[:5]}..., returns={returns[:5]}..."
+    )
+
+
+@given(
+    scores=st.lists(ic_score_strategy, min_size=30, max_size=100).filter(
+        lambda values: max(values) - min(values) > 1e-6
+    ),
+    a=st.floats(min_value=0.01, max_value=100.0, allow_nan=False, allow_infinity=False),
+    b=ic_score_strategy,
+)
+@settings(max_examples=100)
+def test_information_coefficient_perfect_positive_correlation(
+    scores: list[float], a: float, b: float
+) -> None:
+    """**Validates: Requirements 6.1, 6.2, 17.3**
+
+    If returns are an increasing linear function of scores
+    (returns = a * scores + b with a > 0), IC SHALL equal 1.0 to within
+    a tolerance of 1e-6.
+    """
+    # Derive the returns from the scores so the relationship is exactly linear.
+    returns = [a * score + b for score in scores]
+    ic = compute_information_coefficient(scores, returns)
+
+    assert ic is not None, (
+        f"IC should not be None for perfectly correlated data with variance. "
+        f"a={a}, b={b}, scores={scores[:5]}..."
+    )
+
+    deviation = abs(ic - 1.0)
+    assert deviation < 1e-6, (
+        f"IC should be 1.0 for perfectly positively correlated data, "
+        f"got {ic}. a={a}, b={b}"
+    )
+
+
+# ---------------------------------------------------------------------------
+# Strategies for source reliability tests
+# ---------------------------------------------------------------------------
+
+from services.validation.calibration import compute_source_reliability
+
+sample_count_strategy = st.integers(min_value=0, max_value=100_000)  # 0 allowed
+observed_win_rate_strategy = st.floats(
+    min_value=0.0, max_value=1.0, allow_infinity=False, allow_nan=False
+)  # finite value in [0.0, 1.0]
+
+
+# ---------------------------------------------------------------------------
+# Property 5: Source Reliability Bayesian Shrinkage Bounds and Convergence
+# Validates: Requirements 8.1, 8.2, 17.5
+# ---------------------------------------------------------------------------
+
+
+@given(observed_win_rate=observed_win_rate_strategy, sample_count=sample_count_strategy)
+@settings(max_examples=100)
+def test_source_reliability_range(observed_win_rate: float, sample_count: int) -> None:
+    """**Validates: Requirements 8.1, 8.2, 17.5**
+
+    Whatever win rate in [0.0, 1.0] and sample count >= 0 are supplied,
+    the reliability returned by compute_source_reliability SHALL itself
+    lie in [0.0, 1.0].
+    """
+    reliability = compute_source_reliability(observed_win_rate, sample_count)
+
+    # Reliability is treated as a proportion, so it must stay in the unit interval.
+    in_bounds = 0.0 <= reliability <= 1.0
+
+    assert in_bounds, (
+        f"Reliability {reliability} is outside [0.0, 1.0]. "
+        f"observed_win_rate={observed_win_rate}, sample_count={sample_count}"
+    )
+
+
+def test_source_reliability_zero_samples() -> None:
+    """**Validates: Requirements 8.1, 8.2, 17.5**
+
+    With sample_count = 0, reliability SHALL be exactly 0.5 for any win rate.
+    """
+    reliability = compute_source_reliability(observed_win_rate=0.8, sample_count=0)
+    assert reliability == 0.5, (
+        f"Reliability should be 0.5 when sample_count=0, got {reliability}"
+    )
+
+    # Sweep the win rate across its range; the result must not move.
+    for wr in (0.0, 0.25, 0.5, 0.75, 1.0):
+        r = compute_source_reliability(sample_count=0, observed_win_rate=wr)
+        assert r == 0.5, (
+            f"Reliability should be 0.5 when sample_count=0 regardless of "
+            f"observed_win_rate={wr}, got {r}"
+        )
+
+
+@given(observed_win_rate=observed_win_rate_strategy)
+@settings(max_examples=100)
+def test_source_reliability_convergence(observed_win_rate: float) -> None:
+    """**Validates: Requirements 8.1, 8.2, 17.5**
+
+    Reliability SHALL approach the observed win rate as the sample count
+    grows. The check is made at sample_count=10000, where the two values
+    must differ by less than 0.01.
+    """
+    # 10,000 samples stands in for "large" here; the 0.01 bound below is
+    # the closeness this test requires at that sample size.
+    reliability = compute_source_reliability(observed_win_rate, sample_count=10_000)
+
+    gap = abs(reliability - observed_win_rate)
+
+    assert gap < 0.01, (
+        f"Reliability {reliability} should be within 0.01 of "
+        f"observed_win_rate {observed_win_rate} when sample_count=10000. "
+        f"Difference: {abs(reliability - observed_win_rate)}"
+    )
+
+
+# ---------------------------------------------------------------------------
+# Strategies for quality gate tests
+# ---------------------------------------------------------------------------
+
+from services.trading.model_quality_gate import (
+    GateThresholdResult,
+    QualityGateConfig,
+    _evaluate_thresholds,
+)
+
+# Building blocks shared by the snapshot and config strategies below.
+# Finite floats over [-1.0, 1.0], used for IC and excess-return values.
+_gate_signed_unit = st.floats(
+    min_value=-1.0, max_value=1.0, allow_nan=False, allow_infinity=False
+)
+
+# Finite floats over [0.0, 1.0], used for win-rate and ECE values.
+_gate_unit = st.floats(
+    min_value=0.0, max_value=1.0, allow_nan=False, allow_infinity=False
+)
+
+# Integers over [0, 10000], used for prediction counts.
+_gate_count = st.integers(min_value=0, max_value=10_000)
+
+# Snapshot dict strategy: one generated value per metric key, each
+# drawn from a reasonable range for that metric.
+snapshot_strategy = st.fixed_dictionaries({
+    "prediction_count": _gate_count,
+    "information_coefficient": _gate_signed_unit,
+    "win_rate": _gate_unit,
+    "calibration_error": _gate_unit,
+    "avg_excess_return_vs_spy": _gate_signed_unit,
+})
+
+# Config strategy: one generated value per threshold field, drawn from
+# the same range as the metric that threshold is compared against.
+gate_config_strategy = st.builds(
+    QualityGateConfig,
+    min_prediction_count=_gate_count,
+    min_ic=_gate_signed_unit,
+    min_win_rate=_gate_unit,
+    max_ece=_gate_unit,
+    min_excess_return_vs_spy=_gate_signed_unit,
+)
+
+
+# ---------------------------------------------------------------------------
+# Property 6: Quality Gate Determinism and Threshold Monotonicity
+# Validates: Requirements 11.1, 17.6
+# ---------------------------------------------------------------------------
+
+
+@given(snapshot=snapshot_strategy, config=gate_config_strategy)
+@settings(max_examples=100)
+def test_quality_gate_determinism(snapshot: dict, config: QualityGateConfig) -> None:
+    """**Validates: Requirements 11.1, 17.6**
+
+    Evaluating the same metric snapshot against the same quality gate
+    configuration twice SHALL give identical results: the same thresholds,
+    in the same order, with the same values and the same pass/fail
+    verdicts (determinism).
+    """
+    results1, results2 = (_evaluate_thresholds(snapshot, config) for _ in range(2))
+
+    assert len(results1) == len(results2), (
+        f"Different number of threshold results: {len(results1)} vs {len(results2)}"
+    )
+
+    # Compare the two runs position by position.
+    for r1, r2 in zip(results1, results2):
+        assert r1.name == r2.name, (
+            f"Threshold name mismatch: {r1.name!r} vs {r2.name!r}"
+        )
+        assert r1.threshold == r2.threshold, (
+            f"Threshold value mismatch for {r1.name}: "
+            f"{r1.threshold} vs {r2.threshold}"
+        )
+        assert r1.actual == r2.actual, (
+            f"Actual value mismatch for {r1.name}: "
+            f"{r1.actual} vs {r2.actual}"
+        )
+        assert r1.passed == r2.passed, (
+            f"Determinism violated for threshold {r1.name}: "
+            f"first call passed={r1.passed}, second call passed={r2.passed}. "
+            f"actual={r1.actual}, threshold={r1.threshold}"
+        )
+
+    # Overall gate pass/fail should also be deterministic
+    all_passed_1, all_passed_2 = (
+        all(r.passed for r in run) for run in (results1, results2)
+    )
+    assert all_passed_1 == all_passed_2, (
+        f"Overall gate determinism violated: "
+        f"first call passed={all_passed_1}, second call passed={all_passed_2}"
+    )
+
+
+@given(
+    snapshot=snapshot_strategy,
+    config=gate_config_strategy,
+    relax_amount=st.floats(
+        min_value=0.0, max_value=1.0, allow_nan=False, allow_infinity=False
+    ),
+    threshold_to_relax=st.sampled_from([
+        "min_prediction_count",
+        "min_ic",
+        "min_win_rate",
+        "max_ece",
+        "min_excess_return_vs_spy",
+    ]),
+)
+@settings(max_examples=100)
+def test_quality_gate_threshold_monotonicity(
+    snapshot: dict,
+    config: QualityGateConfig,
+    relax_amount: float,
+    threshold_to_relax: str,
+) -> None:
+    """**Validates: Requirements 11.1, 17.6**
+
+    If the gate passes under some configuration, loosening any one
+    threshold (lowering a minimum or raising a maximum) SHALL NOT make
+    the gate fail (monotonicity).
+
+    Examples where the gate does not pass to begin with are skipped.
+    """
+    from dataclasses import replace
+
+    # Monotonicity only says something when the untouched config passes.
+    if not all(r.passed for r in _evaluate_thresholds(snapshot, config)):
+        return
+
+    # The value each threshold would take once loosened by relax_amount.
+    relaxed_values = {
+        # Decrease min → easier to satisfy (scaled to a count, floored at 0)
+        "min_prediction_count": max(
+            0, config.min_prediction_count - int(relax_amount * 1000)
+        ),
+        # Decrease min → easier to satisfy
+        "min_ic": config.min_ic - relax_amount,
+        # Decrease min → easier to satisfy
+        "min_win_rate": config.min_win_rate - relax_amount,
+        # Increase max → easier to satisfy
+        "max_ece": config.max_ece + relax_amount,
+        # Decrease min → easier to satisfy
+        "min_excess_return_vs_spy": (
+            config.min_excess_return_vs_spy - relax_amount
+        ),
+    }
+
+    if threshold_to_relax not in relaxed_values:
+        return  # pragma: no cover
+
+    # Loosen exactly one threshold; every other field keeps its value.
+    relaxed_config = replace(
+        config, **{threshold_to_relax: relaxed_values[threshold_to_relax]}
+    )
+
+    # Evaluate with relaxed config
+    relaxed_results = _evaluate_thresholds(snapshot, config=relaxed_config)
+    relaxed_passed = all(r.passed for r in relaxed_results)
+
+    assert relaxed_passed, (
+        f"Monotonicity violated: gate passed with original config but failed "
+        f"after relaxing {threshold_to_relax}. "
+        f"Original config: min_prediction_count={config.min_prediction_count}, "
+        f"min_ic={config.min_ic}, min_win_rate={config.min_win_rate}, "
+        f"max_ece={config.max_ece}, "
+        f"min_excess_return_vs_spy={config.min_excess_return_vs_spy}. "
+        f"Relaxed threshold: {threshold_to_relax} by {relax_amount}. "
+        f"Failed thresholds: "
+        f"{[(r.name, r.actual, r.threshold) for r in relaxed_results if not r.passed]}"
+    )