Files
stonks-oracle/tests/test_extractor_metrics.py

169 lines
5.2 KiB
Python

"""Tests for extraction model performance metrics collection.

Validates that collect_metrics correctly computes metrics from
ExtractionResponse objects for both successful and failed extractions.

Requirements: 5.2, 5.4, 12.1, 12.2
"""
from __future__ import annotations
from services.extractor.client import ExtractionAttempt, ExtractionResponse
from services.extractor.metrics import collect_metrics
from services.extractor.schemas import ExtractionResult, ValidationReport
def _make_valid_result() -> ExtractionResult:
    """Build the ExtractionResult fixture shared by the success-path tests.

    The payload describes a single company (AAPL) with a confidence of 0.85
    and is passed through ``ExtractionResult.model_validate``.
    """
    apple_entry = {
        "ticker": "AAPL",
        "company_name": "Apple Inc.",
        "relevance": 0.95,
        "sentiment": "positive",
        "impact_score": 0.7,
        "impact_horizon": "1d_30d",
        "catalyst_type": "earnings",
        "key_facts": ["Revenue up 12%"],
        "risks": [],
        "evidence_spans": ["Apple beat expectations"],
    }
    payload = {
        "summary": "Apple beat earnings expectations.",
        "companies": [apple_entry],
        "macro_themes": ["ai_capex"],
        "novelty_score": 0.6,
        "confidence": 0.85,
        "extraction_warnings": [],
    }
    return ExtractionResult.model_validate(payload)
def _make_success_response() -> ExtractionResponse:
    """Return a successful ExtractionResponse fixture with exactly one attempt.

    The single attempt carries a passing ValidationReport with one warning
    ("low_novelty") and a 500 ms duration, equal to the response total.
    """
    parsed = _make_valid_result()
    report = ValidationReport(
        valid=True,
        errors=[],
        warnings=["low_novelty"],
        parsed=parsed,
    )
    only_attempt = ExtractionAttempt(
        raw_output=parsed.model_dump_json(),
        validation=report,
        error=None,
        duration_ms=500,
        model="test-model",
    )
    metadata = {"prompt_version": "document-intel-v1", "schema_version": "2.0.0"}
    return ExtractionResponse(
        success=True,
        result=parsed,
        attempts=[only_attempt],
        prompt_metadata=metadata,
        model="test-model",
        total_duration_ms=500,
    )
def _make_failed_response_with_retries() -> ExtractionResponse:
    """Return a failed ExtractionResponse fixture made of two attempts.

    The first attempt (200 ms) has no validation report and an
    "invalid_json" error; the second (300 ms) carries a failing
    ValidationReport with two errors and one warning.
    """
    unparseable_attempt = ExtractionAttempt(
        raw_output="bad json",
        validation=None,
        error="invalid_json",
        duration_ms=200,
        model="test-model",
    )

    failing_report = ValidationReport(
        valid=False,
        errors=["schema_fail", "missing_companies"],
        warnings=["truncated"],
    )
    schema_invalid_attempt = ExtractionAttempt(
        raw_output="still bad output here",
        validation=failing_report,
        error="schema_fail; missing_companies",
        duration_ms=300,
        model="test-model",
    )

    metadata = {"prompt_version": "document-intel-v1", "schema_version": "2.0.0"}
    return ExtractionResponse(
        success=False,
        result=None,
        attempts=[unparseable_attempt, schema_invalid_attempt],
        prompt_metadata=metadata,
        model="test-model",
        total_duration_ms=500,
    )
def test_collect_metrics_success():
    """A successful single-attempt response yields the expected metrics."""
    metrics = collect_metrics(
        _make_success_response(),
        document_id="doc-1",
        ticker="AAPL",
        document_text_length=4000,
    )

    # Identifiers and provenance taken from the call and the response.
    assert metrics.document_id == "doc-1"
    assert metrics.ticker == "AAPL"
    assert metrics.model_name == "test-model"
    assert metrics.prompt_version == "document-intel-v1"
    assert metrics.schema_version == "2.0.0"

    # Outcome and timing: one attempt, so first and final durations coincide.
    assert metrics.success is True
    assert metrics.attempt_count == 1
    assert metrics.total_duration_ms == 500
    assert metrics.first_attempt_duration_ms == 500
    assert metrics.final_attempt_duration_ms == 500

    # Quality signals from the parsed result and its validation report.
    assert metrics.confidence == 0.85
    assert metrics.validation_status == "valid"
    assert metrics.validation_error_count == 0
    assert metrics.validation_warning_count == 1
    assert metrics.retry_count == 0

    # Size estimates: the expected input estimate is 4000 / 4.
    assert metrics.input_token_estimate == 1000
    assert metrics.output_token_estimate > 0
    assert metrics.company_count == 1
def test_collect_metrics_failed_with_retries():
    """A failed two-attempt response yields the expected retry and error metrics."""
    metrics = collect_metrics(
        _make_failed_response_with_retries(),
        document_id="doc-2",
        ticker="MSFT",
        document_text_length=2000,
    )

    # Outcome: two attempts means one retry.
    assert metrics.success is False
    assert metrics.attempt_count == 2
    assert metrics.retry_count == 1

    # Timing comes from the first (200 ms) and last (300 ms) attempts.
    assert metrics.first_attempt_duration_ms == 200
    assert metrics.final_attempt_duration_ms == 300
    assert metrics.total_duration_ms == 500

    # Validation details match the second attempt's failing report.
    assert metrics.validation_status == "failed"
    assert metrics.validation_error_count == 2
    assert metrics.validation_warning_count == 1
    assert "schema_fail" in metrics.validation_errors

    # With no parsed result, result-derived fields are zeroed.
    assert metrics.confidence == 0.0
    assert metrics.company_count == 0

    # The expected input estimate is 2000 / 4.
    assert metrics.input_token_estimate == 500
def test_collect_metrics_empty_attempts():
    """A response with an empty attempts list yields safe default metrics."""
    attemptless = ExtractionResponse(
        success=False,
        result=None,
        attempts=[],
        prompt_metadata={},
        model="test-model",
        total_duration_ms=0,
    )

    metrics = collect_metrics(attemptless, document_id="doc-3")

    # Counts and durations are expected to be zero when nothing was attempted.
    assert metrics.attempt_count == 0
    assert metrics.retry_count == 0
    assert metrics.first_attempt_duration_ms == 0
    assert metrics.final_attempt_duration_ms == 0

    # No validation report is available to derive a status from.
    assert metrics.validation_status == "unknown"
    assert metrics.confidence == 0.0
def test_collect_metrics_no_document_text_length():
    """A document text length of zero yields an input token estimate of zero."""
    metrics = collect_metrics(_make_success_response(), document_text_length=0)

    assert metrics.input_token_estimate == 0