"""Unit tests for model validation, calibration, and signal quality modules.

Covers prediction snapshot writer, outcome evaluator, metrics engine,
calibration engine, and quality gate — all pure-function / deterministic tests.

Requirements: 1.1, 2.3, 2.4, 2.5, 3.3, 4.2, 4.5, 4.6, 4.7, 5.3, 5.4,
              6.1, 6.2, 6.5, 8.1, 8.2, 8.3, 11.1, 11.6
"""

from __future__ import annotations

import hashlib

import pytest

# -- Quality Gate --
from services.trading.model_quality_gate import (
    QualityGateConfig,
    _evaluate_thresholds,
)

# -- Calibration Engine --
from services.validation.calibration import (
    compute_adjusted_evidence_weight,
    compute_source_reliability,
)

# -- Metrics Engine --
from services.validation.metrics import (
    compute_brier_score,
    compute_calibration_error,
    compute_information_coefficient,
    compute_rank_information_coefficient,
)

# -- Outcome Evaluator --
from services.validation.outcome_evaluator import (
    _compute_return,
    _is_direction_correct,
    _is_profitable,
)

# -- Prediction Snapshot Writer --
from services.validation.prediction_snapshot import (
    MAX_SINGLE_DOCUMENT_WEIGHT,
    compute_canonical_evidence_key,
    compute_contribution_scores,
)


# ===================================================================
# 8.2 — Prediction Snapshot Writer unit tests
# Requirements: 1.1, 2.3, 2.4, 2.5, 3.3
# ===================================================================


class TestCanonicalEvidenceKey:
    """Tests for compute_canonical_evidence_key."""

    def test_known_title_url_produces_expected_sha256(self):
        """Known title/URL pair produces a deterministic SHA256 hash."""
        key = compute_canonical_evidence_key(
            "Test Article", "https://example.com/article?ref=123"
        )
        assert key == (
            "abd5818d51579a7af51cd06861289c7f1fdc97c0f522e8ba13ce9b4aad01cb6f"
        )

    def test_empty_inputs(self):
        """Empty title and URL produce SHA256 of empty string."""
        key = compute_canonical_evidence_key("", "")
        expected = hashlib.sha256(b"").hexdigest()
        assert key == expected

    def test_unicode_inputs(self):
        """Unicode title and URL are handled correctly."""
        key = compute_canonical_evidence_key(
            "日本語テスト", "https://example.com/日本語"
        )
        assert key == (
            "553553928bb4e36abdf283ff3c52df0695fca09809159650a9bdcb4fb2c5f62b"
        )

    def test_normalization_case_insensitive(self):
        """Title and URL are lowercased before hashing."""
        key_lower = compute_canonical_evidence_key(
            "test article", "https://example.com/path"
        )
        key_upper = compute_canonical_evidence_key(
            "TEST ARTICLE", "HTTPS://EXAMPLE.COM/PATH"
        )
        assert key_lower == key_upper

    def test_normalization_strips_query_params(self):
        """URL query parameters are stripped before hashing."""
        key_with_params = compute_canonical_evidence_key(
            "title", "https://example.com/article?utm_source=twitter&ref=123"
        )
        key_without_params = compute_canonical_evidence_key(
            "title", "https://example.com/article"
        )
        assert key_with_params == key_without_params

    def test_normalization_strips_whitespace(self):
        """Leading/trailing whitespace in title is stripped."""
        key_trimmed = compute_canonical_evidence_key(
            "test", "https://example.com"
        )
        key_padded = compute_canonical_evidence_key(
            " test ", "https://example.com"
        )
        assert key_trimmed == key_padded


class TestDuplicateDetection:
    """Tests for duplicate detection via canonical evidence keys."""

    def test_three_docs_two_sharing_key_one_duplicate(self):
        """3 docs where 2 share a canonical key → 1 marked duplicate."""
        # Simulate the duplicate detection logic from create_prediction_snapshot
        docs = [
            {"title": "Breaking News", "url": "https://news.com/article"},
            {"title": "breaking news", "url": "https://news.com/article?ref=1"},
            {"title": "Other Story", "url": "https://other.com/story"},
        ]

        # Only membership matters here, so a set of already-seen keys suffices.
        seen_keys: set[str] = set()
        duplicates: list[bool] = []
        for doc in docs:
            key = compute_canonical_evidence_key(doc["title"], doc["url"])
            duplicates.append(key in seen_keys)
            seen_keys.add(key)

        assert duplicates == [False, True, False]
        assert sum(duplicates) == 1


class TestContributionScores:
    """Tests for compute_contribution_scores."""

    def test_known_weights(self):
        """[0.5, 0.3, 0.2] → [0.5, 0.3, 0.2] (already sums to 1.0)."""
        scores = compute_contribution_scores([0.5, 0.3, 0.2])
        assert scores == pytest.approx([0.5, 0.3, 0.2])
        assert sum(scores) == pytest.approx(1.0)

    def test_single_doc(self):
        """Single document → contribution score of 1.0."""
        scores = compute_contribution_scores([0.7])
        assert scores == pytest.approx([1.0])

    def test_empty_input(self):
        """Empty input → empty list."""
        scores = compute_contribution_scores([])
        assert scores == []

    def test_all_zero_weights(self):
        """All-zero weights → equal distribution."""
        scores = compute_contribution_scores([0.0, 0.0, 0.0])
        assert len(scores) == 3
        # Whole-list comparison reports the offending element on failure.
        assert scores == pytest.approx([1.0 / 3.0] * 3)

    def test_scores_sum_to_one(self):
        """Arbitrary weights sum to 1.0."""
        scores = compute_contribution_scores([1.0, 2.0, 3.0, 4.0])
        assert sum(scores) == pytest.approx(1.0)
        assert scores == pytest.approx([0.1, 0.2, 0.3, 0.4])


class TestWeightClamping:
    """Tests for MAX_SINGLE_DOCUMENT_WEIGHT clamping.

    NOTE(review): these tests apply ``min`` locally rather than calling a
    production clamping function, so they pin the value of the constant
    (the assertions only hold if it is 1.0), not the snapshot writer's
    behaviour.
    """

    def test_weight_above_max_clamped(self):
        """Weight 1.5 → clamped to MAX_SINGLE_DOCUMENT_WEIGHT (1.0)."""
        raw_weight = 1.5
        clamped = min(raw_weight, MAX_SINGLE_DOCUMENT_WEIGHT)
        assert clamped == 1.0

    def test_weight_at_max_unchanged(self):
        """Weight exactly at MAX stays unchanged."""
        raw_weight = 1.0
        clamped = min(raw_weight, MAX_SINGLE_DOCUMENT_WEIGHT)
        assert clamped == 1.0

    def test_weight_below_max_unchanged(self):
        """Weight below MAX stays unchanged."""
        raw_weight = 0.5
        clamped = min(raw_weight, MAX_SINGLE_DOCUMENT_WEIGHT)
        assert clamped == 0.5


# ===================================================================
# 8.3 — Outcome Evaluator unit tests
# Requirements: 4.2, 4.5, 4.6, 4.7
# ===================================================================


class TestComputeReturn:
    """Tests for _compute_return."""

    def test_positive_return(self):
        """Price 100 → 110 → return 0.10."""
        assert _compute_return(100.0, 110.0) == pytest.approx(0.10)

    def test_negative_return(self):
        """Price 100 → 90 → return -0.10."""
        assert _compute_return(100.0, 90.0) == pytest.approx(-0.10)

    def test_zero_return(self):
        """Price unchanged → return 0.0."""
        assert _compute_return(100.0, 100.0) == pytest.approx(0.0)

    def test_zero_current_price(self):
        """Current price 0 → return 0.0 (guard against division by zero)."""
        assert _compute_return(0.0, 110.0) == 0.0


class TestDirectionCorrect:
    """Tests for _is_direction_correct."""

    def test_bullish_positive_return(self):
        """Bullish + positive return → True."""
        assert _is_direction_correct("bullish", 0.05) is True

    def test_bullish_negative_return(self):
        """Bullish + negative return → False."""
        assert _is_direction_correct("bullish", -0.05) is False

    def test_bearish_negative_return(self):
        """Bearish + negative return → True."""
        assert _is_direction_correct("bearish", -0.05) is True

    def test_bearish_positive_return(self):
        """Bearish + positive return → False."""
        assert _is_direction_correct("bearish", 0.05) is False

    def test_bullish_zero_return(self):
        """Bullish + zero return → False (not strictly positive)."""
        assert _is_direction_correct("bullish", 0.0) is False

    def test_bearish_zero_return(self):
        """Bearish + zero return → False (not strictly negative)."""
        assert _is_direction_correct("bearish", 0.0) is False

    def test_mixed_direction(self):
        """Mixed direction → always False."""
        assert _is_direction_correct("mixed", 0.05) is False
        assert _is_direction_correct("mixed", -0.05) is False

    def test_case_insensitive(self):
        """Direction matching is case-insensitive."""
        assert _is_direction_correct("Bullish", 0.05) is True
        assert _is_direction_correct("BEARISH", -0.05) is True


class TestIsProfitable:
    """Tests for _is_profitable."""

    def test_buy_positive_return(self):
        """Buy + positive return → True."""
        assert _is_profitable("buy", 0.05) is True

    def test_buy_negative_return(self):
        """Buy + negative return → False."""
        assert _is_profitable("buy", -0.05) is False

    def test_sell_negative_return(self):
        """Sell + negative return → True."""
        assert _is_profitable("sell", -0.05) is True

    def test_sell_positive_return(self):
        """Sell + positive return → False."""
        assert _is_profitable("sell", 0.05) is False

    def test_hold_any_return(self):
        """Hold → always False."""
        assert _is_profitable("hold", 0.05) is False
        assert _is_profitable("hold", -0.05) is False

    def test_case_insensitive(self):
        """Action matching is case-insensitive."""
        assert _is_profitable("Buy", 0.05) is True
        assert _is_profitable("SELL", -0.05) is True


class TestExcessReturn:
    """Tests for excess return computation (ticker return - benchmark return)."""

    def test_excess_return_vs_spy(self):
        """Ticker 10%, SPY 5% → excess 5%."""
        ticker_return = _compute_return(100.0, 110.0)  # 0.10
        spy_return = _compute_return(100.0, 105.0)  # 0.05
        excess = ticker_return - spy_return
        assert excess == pytest.approx(0.05)

    def test_negative_excess_return(self):
        """Ticker 3%, SPY 5% → excess -2%."""
        ticker_return = _compute_return(100.0, 103.0)  # 0.03
        spy_return = _compute_return(100.0, 105.0)  # 0.05
        excess = ticker_return - spy_return
        assert excess == pytest.approx(-0.02)

    def test_zero_excess_return(self):
        """Same return → excess 0%."""
        ticker_return = _compute_return(100.0, 110.0)
        spy_return = _compute_return(100.0, 110.0)
        excess = ticker_return - spy_return
        assert excess == pytest.approx(0.0)


# ===================================================================
# 8.4 — Metrics Engine unit tests
# Requirements: 5.3, 5.4, 6.1, 6.2, 6.5
# ===================================================================


class TestCalibrationError:
    """Tests for compute_calibration_error (ECE)."""

    def test_perfect_calibration_ece_zero(self):
        """Perfect calibration → ECE = 0.0.

        All predictions in [0.70, 0.80) bucket with 75% win rate
        matching ~0.75 avg confidence.
        """
        confidences = [0.75] * 100
        outcomes = [True] * 75 + [False] * 25
        ece, _ = compute_calibration_error(confidences, outcomes)
        assert ece == pytest.approx(0.0, abs=1e-9)

    def test_all_overconfident_positive_ece(self):
        """All overconfident (high confidence, low win rate) → positive ECE."""
        # All predictions at 0.95 confidence but only 50% win rate
        confidences = [0.95] * 100
        outcomes = [True] * 50 + [False] * 50
        ece, _ = compute_calibration_error(confidences, outcomes)
        assert ece > 0.0
        # ECE should be |0.95 - 0.50| = 0.45
        assert ece == pytest.approx(0.45, abs=0.01)

    def test_empty_input_returns_zero(self):
        """Empty input → ECE = 0.0, empty buckets."""
        ece, buckets = compute_calibration_error([], [])
        assert ece == 0.0
        assert buckets == []

    def test_miscalibrated_flag(self):
        """Buckets with |avg_conf - win_rate| > 0.15 are flagged."""
        # All in [0.90, 1.00] bucket with 0% win rate → diff = 0.95
        confidences = [0.95] * 20
        outcomes = [False] * 20
        _ece, buckets = compute_calibration_error(confidences, outcomes)
        # Find the [0.90, 1.00] bucket. Compare approximately so the lookup
        # does not depend on how the lower edge was computed in floating point.
        high_bucket = [
            b for b in buckets if b.bucket_low == pytest.approx(0.90)
        ]
        assert len(high_bucket) == 1
        assert high_bucket[0].miscalibrated is True

    def test_ece_in_valid_range(self):
        """ECE is always in [0.0, 1.0]."""
        confidences = [0.55, 0.65, 0.75, 0.85, 0.95]
        outcomes = [False, True, False, True, False]
        ece, _ = compute_calibration_error(confidences, outcomes)
        assert 0.0 <= ece <= 1.0


class TestBrierScore:
    """Tests for compute_brier_score."""

    def test_all_correct_at_p1(self):
        """All correct at p=1.0 → Brier = 0.0."""
        p_bulls = [1.0] * 10
        outcomes = [True] * 10
        assert compute_brier_score(p_bulls, outcomes) == pytest.approx(0.0)

    def test_all_wrong_at_p1(self):
        """All wrong at p=1.0 → Brier = 1.0."""
        p_bulls = [1.0] * 10
        outcomes = [False] * 10
        assert compute_brier_score(p_bulls, outcomes) == pytest.approx(1.0)

    def test_all_correct_at_p0(self):
        """All correct at p=0.0 (bearish correct) → Brier = 0.0."""
        p_bulls = [0.0] * 10
        outcomes = [False] * 10
        assert compute_brier_score(p_bulls, outcomes) == pytest.approx(0.0)

    def test_empty_input(self):
        """Empty input → Brier = 0.0."""
        assert compute_brier_score([], []) == 0.0

    def test_mixed_predictions(self):
        """Mixed predictions produce a value in (0, 1)."""
        p_bulls = [0.8, 0.6, 0.3]
        outcomes = [True, False, True]
        brier = compute_brier_score(p_bulls, outcomes)
        assert 0.0 < brier < 1.0


class TestInformationCoefficient:
    """Tests for compute_information_coefficient (Pearson IC)."""

    def test_perfect_positive_correlation(self):
        """Perfectly correlated scores and returns → IC = 1.0."""
        scores = list(range(30))
        returns = [s * 2.0 + 1.0 for s in scores]  # linear: y = 2x + 1
        ic = compute_information_coefficient(scores, returns)
        assert ic is not None
        assert ic == pytest.approx(1.0, abs=1e-9)

    def test_perfect_negative_correlation(self):
        """Anti-correlated scores and returns → IC = -1.0."""
        scores = list(range(30))
        returns = [-s * 2.0 for s in scores]
        ic = compute_information_coefficient(scores, returns)
        assert ic is not None
        assert ic == pytest.approx(-1.0, abs=1e-9)

    def test_fewer_than_30_returns_none(self):
        """Fewer than 30 data points → None."""
        scores = list(range(29))
        returns = list(range(29))
        ic = compute_information_coefficient(scores, returns)
        assert ic is None

    def test_ic_in_valid_range(self):
        """IC is always in [-1.0, 1.0] for valid data."""
        scores = [float(i % 7) for i in range(50)]
        returns = [float(i % 5) for i in range(50)]
        ic = compute_information_coefficient(scores, returns)
        assert ic is not None
        assert -1.0 <= ic <= 1.0


class TestRankInformationCoefficient:
    """Tests for compute_rank_information_coefficient (Spearman Rank IC)."""

    def test_perfect_rank_correlation(self):
        """Perfectly rank-correlated → Rank IC = 1.0."""
        scores = list(range(30))
        returns = list(range(30))  # same ordering
        rank_ic = compute_rank_information_coefficient(scores, returns)
        assert rank_ic is not None
        assert rank_ic == pytest.approx(1.0, abs=1e-9)

    def test_perfect_anti_rank_correlation(self):
        """Perfectly anti-rank-correlated → Rank IC = -1.0."""
        scores = list(range(30))
        returns = list(range(29, -1, -1))  # reversed ordering
        rank_ic = compute_rank_information_coefficient(scores, returns)
        assert rank_ic is not None
        assert rank_ic == pytest.approx(-1.0, abs=1e-9)

    def test_fewer_than_30_returns_none(self):
        """Fewer than 30 data points → None."""
        scores = list(range(29))
        returns = list(range(29))
        rank_ic = compute_rank_information_coefficient(scores, returns)
        assert rank_ic is None


# ===================================================================
# 8.5 — Calibration Engine unit tests
# Requirements: 8.1, 8.2, 8.3
# ===================================================================


class TestSourceReliability:
    """Tests for compute_source_reliability (Bayesian shrinkage)."""

    def test_zero_samples_returns_prior(self):
        """n=0 → reliability = 0.5 (prior mean)."""
        assert compute_source_reliability(0.8, 0) == 0.5

    def test_large_sample_approaches_observed(self):
        """n=1000 with wr=0.8 → ≈0.8 (close to observed win rate)."""
        reliability = compute_source_reliability(0.8, 1000)
        assert reliability == pytest.approx(0.7912621359223302)
        # Should be close to 0.8 but not exactly
        assert abs(reliability - 0.8) < 0.02

    def test_moderate_sample(self):
        """n=30 with wr=0.7 → 0.6 exactly.

        0.5 + (30/60) * (0.7 - 0.5) = 0.5 + 0.5 * 0.2 = 0.6
        """
        assert compute_source_reliability(0.7, 30) == pytest.approx(0.6)

    def test_reliability_in_range(self):
        """Reliability is always in [0.0, 1.0]."""
        # Extreme win rates
        assert 0.0 <= compute_source_reliability(0.0, 100) <= 1.0
        assert 0.0 <= compute_source_reliability(1.0, 100) <= 1.0
        assert 0.0 <= compute_source_reliability(0.5, 1) <= 1.0

    def test_negative_sample_count_returns_prior(self):
        """Negative sample count → treated as 0, returns 0.5."""
        assert compute_source_reliability(0.8, -5) == 0.5


class TestAdjustedEvidenceWeight:
    """Tests for compute_adjusted_evidence_weight."""

    def test_reliability_half_gives_base_weight(self):
        """reliability=0.5 → adjusted = base * (0.5 + 0.5) = base * 1.0."""
        assert compute_adjusted_evidence_weight(1.0, 0.5) == pytest.approx(1.0)

    def test_high_reliability_increases_weight(self):
        """reliability=1.0 → adjusted = base * 1.5."""
        assert compute_adjusted_evidence_weight(1.0, 1.0) == pytest.approx(1.5)

    def test_low_reliability_decreases_weight(self):
        """reliability=0.0 → adjusted = base * 0.5."""
        assert compute_adjusted_evidence_weight(1.0, 0.0) == pytest.approx(0.5)

    def test_clamped_to_upper_bound(self):
        """Large base_weight * high reliability → clamped to 2.0."""
        result = compute_adjusted_evidence_weight(3.0, 1.0)
        assert result == 2.0

    def test_clamped_to_lower_bound(self):
        """Small base_weight * low reliability → clamped to 0.1."""
        result = compute_adjusted_evidence_weight(0.1, 0.0)
        assert result == 0.1

    def test_mid_range_not_clamped(self):
        """Normal values stay within bounds without clamping."""
        result = compute_adjusted_evidence_weight(0.8, 0.6)
        # 0.8 * (0.5 + 0.6) = 0.8 * 1.1 = 0.88
        assert result == pytest.approx(0.88)
        assert 0.1 <= result <= 2.0


# ===================================================================
# 8.6 — Quality Gate unit tests
# Requirements: 11.1, 11.6
# ===================================================================


class TestQualityGate:
    """Tests for _evaluate_thresholds and QualityGateConfig."""

    def _make_passing_snapshot(self) -> dict:
        """Return a metric snapshot dict that meets all default thresholds."""
        return {
            "prediction_count": 200,
            "information_coefficient": 0.10,
            "win_rate": 0.60,
            "calibration_error": 0.08,
            "avg_excess_return_vs_spy": 0.02,
        }

    def test_all_thresholds_met_pass(self):
        """All thresholds met → every result is passed=True."""
        config = QualityGateConfig()
        snapshot = self._make_passing_snapshot()
        results = _evaluate_thresholds(snapshot, config)

        assert len(results) == 5
        assert all(r.passed for r in results), (
            f"Expected all thresholds to pass, but got: "
            f"{[(r.name, r.passed) for r in results]}"
        )

    def test_one_threshold_failed_ic_below_min(self):
        """IC below min_ic → that threshold fails, others pass."""
        config = QualityGateConfig()
        snapshot = self._make_passing_snapshot()
        snapshot["information_coefficient"] = 0.01  # below min_ic=0.03
        results = _evaluate_thresholds(snapshot, config)
        results_by_name = {r.name: r for r in results}

        assert results_by_name["min_ic"].passed is False
        assert results_by_name["min_ic"].actual == pytest.approx(0.01)
        assert results_by_name["min_ic"].threshold == pytest.approx(0.03)

        # All other thresholds should still pass
        for name, result in results_by_name.items():
            if name != "min_ic":
                assert result.passed is True, f"{name} should pass but didn't"

    def test_all_thresholds_below_all_fail(self):
        """All metric values below thresholds → all results are passed=False."""
        config = QualityGateConfig()
        snapshot = {
            "prediction_count": 10,  # below 100
            "information_coefficient": 0.0,  # below 0.03
            "win_rate": 0.40,  # below 0.53
            "calibration_error": 0.50,  # above 0.15
            "avg_excess_return_vs_spy": -0.05,  # below 0.0
        }
        results = _evaluate_thresholds(snapshot, config)

        assert len(results) == 5
        assert all(not r.passed for r in results), (
            f"Expected all thresholds to fail, but got: "
            f"{[(r.name, r.passed) for r in results]}"
        )

    def test_failsafe_none_values_treated_as_worst_case(self):
        """Missing (None) metric values fall back to fail-safe defaults.

        When no snapshots exist, the snapshot dict would have None values.
        _evaluate_thresholds treats None as 0 for min-thresholds and 1.0 for
        max_ece. With the default config this fails every threshold except
        min_excess_return_vs_spy: its 0.0 fallback equals the default minimum
        of 0.0 and the comparison is >=, so that one check still passes.
        The four failing checks are what keep the gate closed (presumably
        paper-only; the overall decision is made outside this helper).
        """
        config = QualityGateConfig()
        snapshot = {
            "prediction_count": None,
            "information_coefficient": None,
            "win_rate": None,
            "calibration_error": None,
            "avg_excess_return_vs_spy": None,
        }
        results = _evaluate_thresholds(snapshot, config)
        results_by_name = {r.name: r for r in results}

        # prediction_count: None → 0, below 100 → fail
        assert results_by_name["min_prediction_count"].passed is False
        assert results_by_name["min_prediction_count"].actual == 0.0

        # IC: None → 0.0, below 0.03 → fail
        assert results_by_name["min_ic"].passed is False
        assert results_by_name["min_ic"].actual == 0.0

        # win_rate: None → 0.0, below 0.53 → fail
        assert results_by_name["min_win_rate"].passed is False
        assert results_by_name["min_win_rate"].actual == 0.0

        # calibration_error: None → 1.0 (worst-case), above 0.15 → fail
        assert results_by_name["max_ece"].passed is False
        assert results_by_name["max_ece"].actual == 1.0

        # excess_return: None → 0.0, equal to min 0.0 → pass (>= 0.0)
        assert results_by_name["min_excess_return_vs_spy"].passed is True
        assert results_by_name["min_excess_return_vs_spy"].actual == 0.0

    def test_stale_snapshot_age_exceeds_max(self):
        """Snapshot age exceeding max_snapshot_age_hours causes gate failure.

        The evaluate_quality_gate async function checks snapshot age before
        calling _evaluate_thresholds. Here we verify the config field is
        respected by testing the age comparison logic directly.
        """
        # NOTE(review): this only checks that the configured limit is readable
        # and comparable; it does not call evaluate_quality_gate itself.
        config = QualityGateConfig(max_snapshot_age_hours=24)
        age_hours = 30.0  # 30 hours old, exceeds 24h max
        assert age_hours > config.max_snapshot_age_hours

    def test_threshold_boundary_exact_values(self):
        """Metric values exactly at threshold boundaries → pass.

        min thresholds use >=, max thresholds use <=.
        """
        config = QualityGateConfig()
        snapshot = {
            "prediction_count": 100,  # exactly min_prediction_count
            "information_coefficient": 0.03,  # exactly min_ic
            "win_rate": 0.53,  # exactly min_win_rate
            "calibration_error": 0.15,  # exactly max_ece
            "avg_excess_return_vs_spy": 0.0,  # exactly min_excess_return
        }
        results = _evaluate_thresholds(snapshot, config)

        assert all(r.passed for r in results), (
            f"Boundary values should pass, but got: "
            f"{[(r.name, r.passed, r.actual, r.threshold) for r in results]}"
        )

    def test_custom_config_thresholds(self):
        """Custom QualityGateConfig thresholds are respected."""
        config = QualityGateConfig(
            min_prediction_count=50,
            min_ic=0.01,
            min_win_rate=0.51,
            max_ece=0.20,
            min_excess_return_vs_spy=-0.01,
        )
        snapshot = {
            "prediction_count": 60,
            "information_coefficient": 0.02,
            "win_rate": 0.52,
            "calibration_error": 0.18,
            "avg_excess_return_vs_spy": -0.005,
        }
        results = _evaluate_thresholds(snapshot, config)

        assert all(r.passed for r in results), (
            f"Custom thresholds should pass, but got: "
            f"{[(r.name, r.passed) for r in results]}"
        )