32d290bea7
ci/woodpecker/push/test Pipeline was successful
ci/woodpecker/push/build-1 Pipeline was successful
ci/woodpecker/push/build-2 Pipeline was successful
ci/woodpecker/push/build-3 Pipeline was successful
ci/woodpecker/push/finalize Pipeline was successful
Build and Push / lint-and-test (push) Has been cancelled
Build and Push / build-services (map[cmd:python -m services.adapters.broker_adapter name:broker-adapter]) (push) Has been cancelled
Build and Push / build-services (map[cmd:python -m services.aggregation.worker name:aggregation]) (push) Has been cancelled
Build and Push / build-services (map[cmd:python -m services.extractor.worker name:extractor]) (push) Has been cancelled
Build and Push / build-services (map[cmd:python -m services.ingestion.worker name:ingestion]) (push) Has been cancelled
Build and Push / build-services (map[cmd:python -m services.lake_publisher.worker name:lake-publisher]) (push) Has been cancelled
Build and Push / build-services (map[cmd:python -m services.parser.worker name:parser]) (push) Has been cancelled
Build and Push / build-services (map[cmd:python -m services.recommendation.worker name:recommendation]) (push) Has been cancelled
Build and Push / build-services (map[cmd:python -m services.scheduler.app name:scheduler]) (push) Has been cancelled
Build and Push / build-services (map[cmd:uvicorn services.api.app:app --host 0.0.0.0 --port 8000 name:query-api]) (push) Has been cancelled
Build and Push / build-services (map[cmd:uvicorn services.risk.app:app --host 0.0.0.0 --port 8000 name:risk]) (push) Has been cancelled
Build and Push / build-services (map[cmd:uvicorn services.symbol_registry.app:app --host 0.0.0.0 --port 8000 name:symbol-registry]) (push) Has been cancelled
Build and Push / build-services (map[cmd:uvicorn services.trading.app:app --host 0.0.0.0 --port 8000 name:trading-engine]) (push) Has been cancelled
Build and Push / build-dashboard (push) Has been cancelled
Build and Push / build-superset (push) Has been cancelled
Build and Push / integration-test (push) Has been cancelled
Build and Push / beta-gate (push) Has been cancelled
690 lines
26 KiB
Python
690 lines
26 KiB
Python
"""Unit tests for model validation, calibration, and signal quality modules.
|
|
|
|
Covers prediction snapshot writer, outcome evaluator, metrics engine,
|
|
calibration engine, and quality gate — all pure-function / deterministic tests.
|
|
|
|
Requirements: 1.1, 2.3, 2.4, 2.5, 3.3, 4.2, 4.5, 4.6, 4.7,
|
|
5.3, 5.4, 6.1, 6.2, 6.5, 8.1, 8.2, 8.3, 11.1, 11.6
|
|
"""
|
|
from __future__ import annotations
|
|
|
|
import hashlib
|
|
|
|
import pytest
|
|
|
|
# -- Quality Gate --
|
|
from services.trading.model_quality_gate import (
|
|
QualityGateConfig,
|
|
_evaluate_thresholds,
|
|
)
|
|
|
|
# -- Calibration Engine --
|
|
from services.validation.calibration import (
|
|
compute_adjusted_evidence_weight,
|
|
compute_source_reliability,
|
|
)
|
|
|
|
# -- Metrics Engine --
|
|
from services.validation.metrics import (
|
|
compute_brier_score,
|
|
compute_calibration_error,
|
|
compute_information_coefficient,
|
|
compute_rank_information_coefficient,
|
|
)
|
|
|
|
# -- Outcome Evaluator --
|
|
from services.validation.outcome_evaluator import (
|
|
_compute_return,
|
|
_is_direction_correct,
|
|
_is_profitable,
|
|
)
|
|
|
|
# -- Prediction Snapshot Writer --
|
|
from services.validation.prediction_snapshot import (
|
|
MAX_SINGLE_DOCUMENT_WEIGHT,
|
|
compute_canonical_evidence_key,
|
|
compute_contribution_scores,
|
|
)
|
|
|
|
# ===================================================================
|
|
# 8.2 — Prediction Snapshot Writer unit tests
|
|
# Requirements: 1.1, 2.3, 2.4, 2.5, 3.3
|
|
# ===================================================================
|
|
|
|
|
|
class TestCanonicalEvidenceKey:
    """Checks on compute_canonical_evidence_key: pinned digests and normalisation."""

    def test_known_title_url_produces_expected_sha256(self):
        """A fixed title/URL pair should map to a pinned hex digest."""
        pinned_digest = (
            "abd5818d51579a7af51cd06861289c7f1fdc97c0f522e8ba13ce9b4aad01cb6f"
        )
        title, url = "Test Article", "https://example.com/article?ref=123"
        assert compute_canonical_evidence_key(title, url) == pinned_digest

    def test_empty_inputs(self):
        """Empty title and URL should hash like the empty byte string."""
        digest_of_nothing = hashlib.sha256(b"").hexdigest()
        assert compute_canonical_evidence_key("", "") == digest_of_nothing

    def test_unicode_inputs(self):
        """Non-ASCII title and URL should produce a pinned hex digest."""
        pinned_digest = (
            "553553928bb4e36abdf283ff3c52df0695fca09809159650a9bdcb4fb2c5f62b"
        )
        title, url = "日本語テスト", "https://example.com/日本語"
        assert compute_canonical_evidence_key(title, url) == pinned_digest

    def test_normalization_case_insensitive(self):
        """Upper- and lower-case spellings should yield the same key."""
        from_upper = compute_canonical_evidence_key(
            "TEST ARTICLE", "HTTPS://EXAMPLE.COM/PATH"
        )
        from_lower = compute_canonical_evidence_key(
            "test article", "https://example.com/path"
        )
        assert from_lower == from_upper

    def test_normalization_strips_query_params(self):
        """A URL with and without a query string should yield the same key."""
        bare_url = "https://example.com/article"
        tracked_url = "https://example.com/article?utm_source=twitter&ref=123"
        with_query = compute_canonical_evidence_key("title", tracked_url)
        without_query = compute_canonical_evidence_key("title", bare_url)
        assert with_query == without_query

    def test_normalization_strips_whitespace(self):
        """Padding around the title should not change the key."""
        url = "https://example.com"
        plain = compute_canonical_evidence_key("test", url)
        padded = compute_canonical_evidence_key(" test ", url)
        assert plain == padded
|
|
|
|
|
|
class TestDuplicateDetection:
    """Duplicate flagging driven by canonical evidence keys."""

    def test_three_docs_two_sharing_key_one_duplicate(self):
        """Of three docs, the second repeats the first's key and is the only duplicate."""
        # Local re-enactment of a first-seen-wins duplicate scan; the
        # production snapshot code is not invoked here.
        documents = [
            {"title": "Breaking News", "url": "https://news.com/article"},
            {"title": "breaking news", "url": "https://news.com/article?ref=1"},
            {"title": "Other Story", "url": "https://other.com/story"},
        ]

        already_seen: set[str] = set()
        flags: list[bool] = []

        for entry in documents:
            evidence_key = compute_canonical_evidence_key(
                entry["title"], entry["url"]
            )
            # Only the first occurrence of a key counts as original.
            flags.append(evidence_key in already_seen)
            already_seen.add(evidence_key)

        assert flags == [False, True, False]
        assert sum(flags) == 1
|
|
|
|
|
|
class TestContributionScores:
    """Checks on compute_contribution_scores normalisation."""

    def test_known_weights(self):
        """Weights that already sum to 1.0 should come back unchanged."""
        weights = [0.5, 0.3, 0.2]
        result = compute_contribution_scores(weights)
        assert result == pytest.approx([0.5, 0.3, 0.2])
        assert sum(result) == pytest.approx(1.0)

    def test_single_doc(self):
        """A lone document should receive the entire contribution (1.0)."""
        assert compute_contribution_scores([0.7]) == pytest.approx([1.0])

    def test_empty_input(self):
        """No weights in should mean no scores out."""
        assert compute_contribution_scores([]) == []

    def test_all_zero_weights(self):
        """All-zero weights should be split evenly across the documents."""
        result = compute_contribution_scores([0.0, 0.0, 0.0])
        assert len(result) == 3
        equal_share = 1.0 / 3.0
        assert all(share == pytest.approx(equal_share) for share in result)

    def test_scores_sum_to_one(self):
        """Arbitrary positive weights should be rescaled to sum to 1.0."""
        result = compute_contribution_scores([1.0, 2.0, 3.0, 4.0])
        assert sum(result) == pytest.approx(1.0)
        assert result == pytest.approx([0.1, 0.2, 0.3, 0.4])
|
|
|
|
|
|
class TestWeightClamping:
    """Clamping of a raw weight against MAX_SINGLE_DOCUMENT_WEIGHT."""

    def test_weight_above_max_clamped(self):
        """A weight of 1.5 should be cut down to the cap, expected to be 1.0."""
        assert min(1.5, MAX_SINGLE_DOCUMENT_WEIGHT) == 1.0

    def test_weight_at_max_unchanged(self):
        """A weight of 1.0 should survive clamping untouched."""
        assert min(1.0, MAX_SINGLE_DOCUMENT_WEIGHT) == 1.0

    def test_weight_below_max_unchanged(self):
        """A weight of 0.5 is under the cap and should be left alone."""
        assert min(0.5, MAX_SINGLE_DOCUMENT_WEIGHT) == 0.5
|
|
|
|
|
|
# ===================================================================
|
|
# 8.3 — Outcome Evaluator unit tests
|
|
# Requirements: 4.2, 4.5, 4.6, 4.7
|
|
# ===================================================================
|
|
|
|
|
|
class TestComputeReturn:
    """Checks on _compute_return for gains, losses, flat prices and a zero base."""

    def test_positive_return(self):
        """Going from 100 to 110 should be a +10% return."""
        gain = _compute_return(100.0, 110.0)
        assert gain == pytest.approx(0.10)

    def test_negative_return(self):
        """Going from 100 to 90 should be a -10% return."""
        loss = _compute_return(100.0, 90.0)
        assert loss == pytest.approx(-0.10)

    def test_zero_return(self):
        """An unchanged price should give a return of 0.0."""
        flat = _compute_return(100.0, 100.0)
        assert flat == pytest.approx(0.0)

    def test_zero_current_price(self):
        """A zero starting price should yield exactly 0.0 rather than dividing by zero."""
        guarded = _compute_return(0.0, 110.0)
        assert guarded == 0.0
|
|
|
|
|
|
class TestDirectionCorrect:
    """Checks on _is_direction_correct across directions and return signs."""

    def test_bullish_positive_return(self):
        """A bullish call with a gain should count as correct."""
        verdict = _is_direction_correct("bullish", 0.05)
        assert verdict is True

    def test_bullish_negative_return(self):
        """A bullish call with a loss should count as wrong."""
        verdict = _is_direction_correct("bullish", -0.05)
        assert verdict is False

    def test_bearish_negative_return(self):
        """A bearish call with a loss should count as correct."""
        verdict = _is_direction_correct("bearish", -0.05)
        assert verdict is True

    def test_bearish_positive_return(self):
        """A bearish call with a gain should count as wrong."""
        verdict = _is_direction_correct("bearish", 0.05)
        assert verdict is False

    def test_bullish_zero_return(self):
        """A bullish call with a flat return should be wrong (gain must be strict)."""
        verdict = _is_direction_correct("bullish", 0.0)
        assert verdict is False

    def test_bearish_zero_return(self):
        """A bearish call with a flat return should be wrong (loss must be strict)."""
        verdict = _is_direction_correct("bearish", 0.0)
        assert verdict is False

    def test_mixed_direction(self):
        """A mixed call should never count as correct, whatever the return sign."""
        for realised in (0.05, -0.05):
            assert _is_direction_correct("mixed", realised) is False

    def test_case_insensitive(self):
        """Capitalised and upper-case direction labels should be accepted."""
        assert _is_direction_correct("Bullish", 0.05) is True
        assert _is_direction_correct("BEARISH", -0.05) is True
|
|
|
|
|
|
class TestIsProfitable:
    """Checks on _is_profitable across actions and return signs."""

    def test_buy_positive_return(self):
        """Buying ahead of a gain should be profitable."""
        outcome = _is_profitable("buy", 0.05)
        assert outcome is True

    def test_buy_negative_return(self):
        """Buying ahead of a loss should not be profitable."""
        outcome = _is_profitable("buy", -0.05)
        assert outcome is False

    def test_sell_negative_return(self):
        """Selling ahead of a loss should be profitable."""
        outcome = _is_profitable("sell", -0.05)
        assert outcome is True

    def test_sell_positive_return(self):
        """Selling ahead of a gain should not be profitable."""
        outcome = _is_profitable("sell", 0.05)
        assert outcome is False

    def test_hold_any_return(self):
        """Holding should never count as profitable, whatever the return sign."""
        for realised in (0.05, -0.05):
            assert _is_profitable("hold", realised) is False

    def test_case_insensitive(self):
        """Capitalised and upper-case action labels should be accepted."""
        assert _is_profitable("Buy", 0.05) is True
        assert _is_profitable("SELL", -0.05) is True
|
|
|
|
|
|
class TestExcessReturn:
    """Excess return, computed here as ticker return minus benchmark return."""

    def test_excess_return_vs_spy(self):
        """Ticker +10% against benchmark +5% should leave +5% excess."""
        benchmark = _compute_return(100.0, 105.0)  # 0.05
        ticker = _compute_return(100.0, 110.0)  # 0.10
        assert ticker - benchmark == pytest.approx(0.05)

    def test_negative_excess_return(self):
        """Ticker +3% against benchmark +5% should leave -2% excess."""
        benchmark = _compute_return(100.0, 105.0)  # 0.05
        ticker = _compute_return(100.0, 103.0)  # 0.03
        assert ticker - benchmark == pytest.approx(-0.02)

    def test_zero_excess_return(self):
        """Identical ticker and benchmark returns should leave no excess."""
        benchmark = _compute_return(100.0, 110.0)
        ticker = _compute_return(100.0, 110.0)
        assert ticker - benchmark == pytest.approx(0.0)
|
|
|
|
|
|
# ===================================================================
|
|
# 8.4 — Metrics Engine unit tests
|
|
# Requirements: 5.3, 5.4, 6.1, 6.2, 6.5
|
|
# ===================================================================
|
|
|
|
|
|
class TestCalibrationError:
    """Tests for compute_calibration_error (ECE).

    The function is called with a list of confidences and a parallel list of
    boolean outcomes, and is unpacked here as an ``(ece, buckets)`` pair.
    """

    def test_perfect_calibration_ece_zero(self):
        """Perfect calibration → ECE = 0.0.

        Every prediction has confidence 0.75 and exactly 75% of the outcomes
        are wins, so confidence and win rate agree.
        """
        confidences = [0.75] * 100
        outcomes = [True] * 75 + [False] * 25
        # Only the scalar ECE matters here; the bucket list is ignored.
        ece, _ = compute_calibration_error(confidences, outcomes)
        assert ece == pytest.approx(0.0, abs=1e-9)

    def test_all_overconfident_positive_ece(self):
        """All overconfident (high confidence, low win rate) → positive ECE."""
        # All predictions at 0.95 confidence but only 50% win rate
        confidences = [0.95] * 100
        outcomes = [True] * 50 + [False] * 50
        ece, _ = compute_calibration_error(confidences, outcomes)
        assert ece > 0.0
        # ECE should be |0.95 - 0.50| = 0.45
        assert ece == pytest.approx(0.45, abs=0.01)

    def test_empty_input_returns_zero(self):
        """Empty input → ECE = 0.0, empty buckets."""
        ece, buckets = compute_calibration_error([], [])
        assert ece == 0.0
        assert buckets == []

    def test_miscalibrated_flag(self):
        """A bucket whose confidence is far from its win rate is flagged."""
        # Confidence 0.95 with a 0% win rate → gap of 0.95
        confidences = [0.95] * 20
        outcomes = [False] * 20
        _ece, buckets = compute_calibration_error(confidences, outcomes)
        # Locate the bucket starting at 0.90. The lower bound is a float, so
        # compare with a tolerance instead of exact equality.
        high_bucket = [
            b for b in buckets if b.bucket_low == pytest.approx(0.90)
        ]
        assert len(high_bucket) == 1
        assert high_bucket[0].miscalibrated is True

    def test_ece_in_valid_range(self):
        """ECE stays within [0.0, 1.0] for a spread of confidences."""
        confidences = [0.55, 0.65, 0.75, 0.85, 0.95]
        outcomes = [False, True, False, True, False]
        ece, _ = compute_calibration_error(confidences, outcomes)
        assert 0.0 <= ece <= 1.0
|
|
|
|
|
|
class TestBrierScore:
    """Checks on compute_brier_score at its extremes and in between."""

    def test_all_correct_at_p1(self):
        """Certain-bullish forecasts that all come true should score 0.0."""
        score = compute_brier_score([1.0] * 10, [True] * 10)
        assert score == pytest.approx(0.0)

    def test_all_wrong_at_p1(self):
        """Certain-bullish forecasts that all fail should score 1.0."""
        score = compute_brier_score([1.0] * 10, [False] * 10)
        assert score == pytest.approx(1.0)

    def test_all_correct_at_p0(self):
        """Certain-bearish forecasts (p=0.0) that all hold should score 0.0."""
        score = compute_brier_score([0.0] * 10, [False] * 10)
        assert score == pytest.approx(0.0)

    def test_empty_input(self):
        """With nothing to score, the result should be exactly 0.0."""
        score = compute_brier_score([], [])
        assert score == 0.0

    def test_mixed_predictions(self):
        """Partly right forecasts should score strictly between 0 and 1."""
        probabilities = [0.8, 0.6, 0.3]
        realised = [True, False, True]
        score = compute_brier_score(probabilities, realised)
        assert 0.0 < score < 1.0
|
|
|
|
|
|
class TestInformationCoefficient:
    """Checks on compute_information_coefficient (Pearson IC)."""

    def test_perfect_positive_correlation(self):
        """Returns that rise linearly with the scores should give IC = 1.0."""
        signal = list(range(30))
        realised = [s * 2.0 + 1.0 for s in signal]  # y = 2x + 1
        coefficient = compute_information_coefficient(signal, realised)
        assert coefficient is not None
        assert coefficient == pytest.approx(1.0, abs=1e-9)

    def test_perfect_negative_correlation(self):
        """Returns that fall linearly with the scores should give IC = -1.0."""
        signal = list(range(30))
        realised = [-s * 2.0 for s in signal]
        coefficient = compute_information_coefficient(signal, realised)
        assert coefficient is not None
        assert coefficient == pytest.approx(-1.0, abs=1e-9)

    def test_fewer_than_30_returns_none(self):
        """With only 29 observations the result should be None."""
        signal = list(range(29))
        realised = list(range(29))
        assert compute_information_coefficient(signal, realised) is None

    def test_ic_in_valid_range(self):
        """For 50 loosely related points the IC should lie in [-1.0, 1.0]."""
        indices = range(50)
        signal = [float(i % 7) for i in indices]
        realised = [float(i % 5) for i in indices]
        coefficient = compute_information_coefficient(signal, realised)
        assert coefficient is not None
        assert -1.0 <= coefficient <= 1.0
|
|
|
|
|
|
class TestRankInformationCoefficient:
    """Checks on compute_rank_information_coefficient (Spearman rank IC)."""

    def test_perfect_rank_correlation(self):
        """Identical orderings should give a rank IC of 1.0."""
        signal = list(range(30))
        realised = list(range(30))  # same ordering as the scores
        coefficient = compute_rank_information_coefficient(signal, realised)
        assert coefficient is not None
        assert coefficient == pytest.approx(1.0, abs=1e-9)

    def test_perfect_anti_rank_correlation(self):
        """Exactly reversed orderings should give a rank IC of -1.0."""
        signal = list(range(30))
        realised = list(reversed(range(30)))  # 29, 28, ..., 0
        coefficient = compute_rank_information_coefficient(signal, realised)
        assert coefficient is not None
        assert coefficient == pytest.approx(-1.0, abs=1e-9)

    def test_fewer_than_30_returns_none(self):
        """With only 29 observations the result should be None."""
        signal = list(range(29))
        realised = list(range(29))
        assert compute_rank_information_coefficient(signal, realised) is None
|
|
|
|
|
|
# ===================================================================
|
|
# 8.5 — Calibration Engine unit tests
|
|
# Requirements: 8.1, 8.2, 8.3
|
|
# ===================================================================
|
|
|
|
|
|
class TestSourceReliability:
    """Checks on compute_source_reliability (win rate shrunk towards 0.5)."""

    def test_zero_samples_returns_prior(self):
        """With no samples the result should be exactly the 0.5 prior."""
        reliability = compute_source_reliability(0.8, 0)
        assert reliability == 0.5

    def test_large_sample_approaches_observed(self):
        """With 1000 samples at a 0.8 win rate the result should sit just under 0.8."""
        reliability = compute_source_reliability(0.8, 1000)
        assert reliability == pytest.approx(0.7912621359223302)
        # Near the observed win rate, but not equal to it.
        assert abs(reliability - 0.8) < 0.02

    def test_moderate_sample(self):
        """With 30 samples at a 0.7 win rate the result should be 0.6.

        Worked example: 0.5 + (30/60) * (0.7 - 0.5) = 0.5 + 0.5 * 0.2 = 0.6
        """
        reliability = compute_source_reliability(0.7, 30)
        assert reliability == pytest.approx(0.6)

    def test_reliability_in_range(self):
        """Extreme win rates and tiny samples should stay within [0.0, 1.0]."""
        extreme_cases = ((0.0, 100), (1.0, 100), (0.5, 1))
        for win_rate, sample_count in extreme_cases:
            assert 0.0 <= compute_source_reliability(win_rate, sample_count) <= 1.0

    def test_negative_sample_count_returns_prior(self):
        """A negative sample count should fall back to the 0.5 prior."""
        reliability = compute_source_reliability(0.8, -5)
        assert reliability == 0.5
|
|
|
|
|
|
class TestAdjustedEvidenceWeight:
    """Checks on compute_adjusted_evidence_weight scaling and clamping."""

    def test_reliability_half_gives_base_weight(self):
        """At reliability 0.5 the multiplier is 1.0, so the base weight is kept."""
        adjusted = compute_adjusted_evidence_weight(1.0, 0.5)
        assert adjusted == pytest.approx(1.0)

    def test_high_reliability_increases_weight(self):
        """At reliability 1.0 a base weight of 1.0 should become 1.5."""
        adjusted = compute_adjusted_evidence_weight(1.0, 1.0)
        assert adjusted == pytest.approx(1.5)

    def test_low_reliability_decreases_weight(self):
        """At reliability 0.0 a base weight of 1.0 should become 0.5."""
        adjusted = compute_adjusted_evidence_weight(1.0, 0.0)
        assert adjusted == pytest.approx(0.5)

    def test_clamped_to_upper_bound(self):
        """A product above the ceiling should be cut to exactly 2.0."""
        adjusted = compute_adjusted_evidence_weight(3.0, 1.0)
        assert adjusted == 2.0

    def test_clamped_to_lower_bound(self):
        """A product below the floor should be raised to exactly 0.1."""
        adjusted = compute_adjusted_evidence_weight(0.1, 0.0)
        assert adjusted == 0.1

    def test_mid_range_not_clamped(self):
        """An in-range product should pass through without clamping."""
        adjusted = compute_adjusted_evidence_weight(0.8, 0.6)
        # 0.8 * (0.5 + 0.6) = 0.8 * 1.1 = 0.88
        assert adjusted == pytest.approx(0.88)
        assert 0.1 <= adjusted <= 2.0
|
|
|
|
|
|
# ===================================================================
|
|
# 8.6 — Quality Gate unit tests
|
|
# Requirements: 11.1, 11.6
|
|
# ===================================================================
|
|
|
|
|
|
class TestQualityGate:
    """Checks on _evaluate_thresholds driven by QualityGateConfig."""

    def _make_passing_snapshot(self) -> dict:
        """Build a metric snapshot intended to clear every default threshold."""
        snapshot = {
            "prediction_count": 200,
            "information_coefficient": 0.10,
            "win_rate": 0.60,
            "calibration_error": 0.08,
            "avg_excess_return_vs_spy": 0.02,
        }
        return snapshot

    def test_all_thresholds_met_pass(self):
        """A healthy snapshot should yield five results, all passing."""
        results = _evaluate_thresholds(
            self._make_passing_snapshot(), QualityGateConfig()
        )

        assert len(results) == 5
        assert all(r.passed for r in results), (
            "Expected all thresholds to pass, but got: "
            f"{[(r.name, r.passed) for r in results]}"
        )

    def test_one_threshold_failed_ic_below_min(self):
        """Dropping only the IC below its minimum should fail only min_ic."""
        snapshot = self._make_passing_snapshot()
        snapshot["information_coefficient"] = 0.01  # under min_ic=0.03

        results = _evaluate_thresholds(snapshot, QualityGateConfig())
        by_name = {r.name: r for r in results}

        ic_result = by_name["min_ic"]
        assert ic_result.passed is False
        assert ic_result.actual == pytest.approx(0.01)
        assert ic_result.threshold == pytest.approx(0.03)

        # Every other check must be unaffected.
        for name, result in by_name.items():
            if name == "min_ic":
                continue
            assert result.passed is True, f"{name} should pass but didn't"

    def test_all_thresholds_below_all_fail(self):
        """A snapshot that misses every threshold should yield five failures."""
        snapshot = {
            "prediction_count": 10,  # below 100
            "information_coefficient": 0.0,  # below 0.03
            "win_rate": 0.40,  # below 0.53
            "calibration_error": 0.50,  # above 0.15
            "avg_excess_return_vs_spy": -0.05,  # below 0.0
        }

        results = _evaluate_thresholds(snapshot, QualityGateConfig())

        assert len(results) == 5
        assert all(not r.passed for r in results), (
            "Expected all thresholds to fail, but got: "
            f"{[(r.name, r.passed) for r in results]}"
        )

    def test_failsafe_none_values_treated_as_worst_case(self):
        """None metrics should be replaced by fail-safe fallback values.

        The assertions expect None to become 0.0 for the minimum-style
        checks and 1.0 for max_ece. With the default config that fails the
        prediction-count, IC, win-rate and ECE checks. The excess-return
        check still passes, because its fallback of 0.0 meets the default
        minimum of 0.0.
        """
        metric_keys = (
            "prediction_count",
            "information_coefficient",
            "win_rate",
            "calibration_error",
            "avg_excess_return_vs_spy",
        )
        snapshot = dict.fromkeys(metric_keys)  # every metric is None

        results = _evaluate_thresholds(snapshot, QualityGateConfig())
        by_name = {r.name: r for r in results}

        # prediction_count: None → 0, below 100 → fail
        count_check = by_name["min_prediction_count"]
        assert count_check.passed is False
        assert count_check.actual == 0.0

        # IC: None → 0.0, below 0.03 → fail
        ic_check = by_name["min_ic"]
        assert ic_check.passed is False
        assert ic_check.actual == 0.0

        # win_rate: None → 0.0, below 0.53 → fail
        win_rate_check = by_name["min_win_rate"]
        assert win_rate_check.passed is False
        assert win_rate_check.actual == 0.0

        # calibration_error: None → 1.0 (worst case), above 0.15 → fail
        ece_check = by_name["max_ece"]
        assert ece_check.passed is False
        assert ece_check.actual == 1.0

        # excess_return: None → 0.0, equal to the 0.0 minimum → pass (>=)
        excess_check = by_name["min_excess_return_vs_spy"]
        assert excess_check.passed is True
        assert excess_check.actual == 0.0

    def test_stale_snapshot_age_exceeds_max(self):
        """A 30-hour-old snapshot should exceed a 24-hour configured limit.

        Only the comparison against ``max_snapshot_age_hours`` is checked
        here; the gate function that would apply it is not called.
        """
        config = QualityGateConfig(max_snapshot_age_hours=24)
        snapshot_age_hours = 30.0  # older than the 24h limit

        assert snapshot_age_hours > config.max_snapshot_age_hours

    def test_threshold_boundary_exact_values(self):
        """Metrics sitting exactly on each default threshold should pass.

        This relies on minimum checks being inclusive (>=) and the maximum
        check being inclusive (<=).
        """
        snapshot = {
            "prediction_count": 100,  # exactly min_prediction_count
            "information_coefficient": 0.03,  # exactly min_ic
            "win_rate": 0.53,  # exactly min_win_rate
            "calibration_error": 0.15,  # exactly max_ece
            "avg_excess_return_vs_spy": 0.0,  # exactly min_excess_return
        }

        results = _evaluate_thresholds(snapshot, QualityGateConfig())

        assert all(r.passed for r in results), (
            "Boundary values should pass, but got: "
            f"{[(r.name, r.passed, r.actual, r.threshold) for r in results]}"
        )

    def test_custom_config_thresholds(self):
        """Overridden thresholds should be the ones the evaluation uses."""
        relaxed_config = QualityGateConfig(
            min_prediction_count=50,
            min_ic=0.01,
            min_win_rate=0.51,
            max_ece=0.20,
            min_excess_return_vs_spy=-0.01,
        )
        # Each metric clears its relaxed threshold.
        snapshot = {
            "prediction_count": 60,
            "information_coefficient": 0.02,
            "win_rate": 0.52,
            "calibration_error": 0.18,
            "avg_excess_return_vs_spy": -0.005,
        }

        results = _evaluate_thresholds(snapshot, relaxed_config)

        assert all(r.passed for r in results), (
            "Custom thresholds should pass, but got: "
            f"{[(r.name, r.passed) for r in results]}"
        )
|