169 lines
5.2 KiB
Python
169 lines
5.2 KiB
Python
"""Tests for extraction model performance metrics collection.

Validates that collect_metrics correctly computes metrics from
ExtractionResponse objects for both successful and failed extractions.

Requirements: 5.2, 5.4, 12.1, 12.2
"""
|
|
from __future__ import annotations
|
|
|
|
from services.extractor.client import ExtractionAttempt, ExtractionResponse
|
|
from services.extractor.metrics import collect_metrics
|
|
from services.extractor.schemas import ExtractionResult, ValidationReport
|
|
|
|
|
|
def _make_valid_result() -> ExtractionResult:
    """Build an ExtractionResult fixture describing a single company (AAPL).

    The raw payload is passed through ``ExtractionResult.model_validate``,
    so the returned object is whatever the schema produces for this input.
    """
    apple_entry = {
        "ticker": "AAPL",
        "company_name": "Apple Inc.",
        "relevance": 0.95,
        "sentiment": "positive",
        "impact_score": 0.7,
        "impact_horizon": "1d_30d",
        "catalyst_type": "earnings",
        "key_facts": ["Revenue up 12%"],
        "risks": [],
        "evidence_spans": ["Apple beat expectations"],
    }
    payload = {
        "summary": "Apple beat earnings expectations.",
        "companies": [apple_entry],
        "macro_themes": ["ai_capex"],
        "novelty_score": 0.6,
        "confidence": 0.85,
        "extraction_warnings": [],
    }
    return ExtractionResult.model_validate(payload)
|
|
|
|
|
|
def _make_success_response() -> ExtractionResponse:
    """Build a successful ExtractionResponse fixture with exactly one attempt.

    The single attempt carries the JSON dump of the valid result, a passing
    validation report with one warning, and a 500 ms duration that matches
    the response's total duration.
    """
    model_name = "test-model"
    elapsed_ms = 500

    parsed = _make_valid_result()
    report = ValidationReport(
        valid=True,
        errors=[],
        warnings=["low_novelty"],
        parsed=parsed,
    )
    sole_attempt = ExtractionAttempt(
        raw_output=parsed.model_dump_json(),
        validation=report,
        error=None,
        duration_ms=elapsed_ms,
        model=model_name,
    )
    metadata = {"prompt_version": "document-intel-v1", "schema_version": "2.0.0"}

    return ExtractionResponse(
        success=True,
        result=parsed,
        attempts=[sole_attempt],
        prompt_metadata=metadata,
        model=model_name,
        total_duration_ms=elapsed_ms,
    )
|
|
|
|
|
|
def _make_failed_response_with_retries() -> ExtractionResponse:
    """Build a failed ExtractionResponse fixture containing two attempts.

    The first attempt (200 ms) has no validation report and an
    ``invalid_json`` error. The second attempt (300 ms) has a failing
    validation report with two errors and one warning. The response itself
    has no result and a total duration of 500 ms.
    """
    model_name = "test-model"

    unparseable_attempt = ExtractionAttempt(
        raw_output="bad json",
        validation=None,
        error="invalid_json",
        duration_ms=200,
        model=model_name,
    )

    failing_report = ValidationReport(
        valid=False,
        errors=["schema_fail", "missing_companies"],
        warnings=["truncated"],
    )
    rejected_attempt = ExtractionAttempt(
        raw_output="still bad output here",
        validation=failing_report,
        error="schema_fail; missing_companies",
        duration_ms=300,
        model=model_name,
    )

    metadata = {"prompt_version": "document-intel-v1", "schema_version": "2.0.0"}
    return ExtractionResponse(
        success=False,
        result=None,
        attempts=[unparseable_attempt, rejected_attempt],
        prompt_metadata=metadata,
        model=model_name,
        total_duration_ms=500,
    )
|
|
|
|
|
|
def test_collect_metrics_success():
    """collect_metrics reports the expected values for a one-attempt success."""
    response = _make_success_response()
    metrics = collect_metrics(
        response,
        document_id="doc-1",
        ticker="AAPL",
        document_text_length=4000,
    )

    # Identity and provenance fields.
    assert metrics.document_id == "doc-1"
    assert metrics.ticker == "AAPL"
    assert metrics.model_name == "test-model"
    assert metrics.prompt_version == "document-intel-v1"
    assert metrics.schema_version == "2.0.0"

    # Outcome and timing: with one attempt, first and final durations match.
    assert metrics.success is True
    assert metrics.attempt_count == 1
    assert metrics.total_duration_ms == 500
    assert metrics.first_attempt_duration_ms == 500
    assert metrics.final_attempt_duration_ms == 500

    # Quality and validation fields.
    assert metrics.confidence == 0.85
    assert metrics.validation_status == "valid"
    assert metrics.validation_error_count == 0
    assert metrics.validation_warning_count == 1
    assert metrics.retry_count == 0

    # Size estimates.
    assert metrics.input_token_estimate == 1000  # 4000 / 4
    assert metrics.output_token_estimate > 0
    assert metrics.company_count == 1
|
|
|
|
|
|
def test_collect_metrics_failed_with_retries():
    """collect_metrics reports the expected values for a two-attempt failure."""
    response = _make_failed_response_with_retries()
    metrics = collect_metrics(
        response,
        document_id="doc-2",
        ticker="MSFT",
        document_text_length=2000,
    )

    # Outcome and attempt accounting.
    assert metrics.success is False
    assert metrics.attempt_count == 2
    assert metrics.retry_count == 1

    # Timing: first and final attempts differ; total comes from the response.
    assert metrics.first_attempt_duration_ms == 200
    assert metrics.final_attempt_duration_ms == 300
    assert metrics.total_duration_ms == 500

    # Validation details match the second attempt's report in the fixture.
    assert metrics.validation_status == "failed"
    assert metrics.validation_error_count == 2
    assert metrics.validation_warning_count == 1
    assert "schema_fail" in metrics.validation_errors

    # No result is available, so result-derived fields are zero.
    assert metrics.confidence == 0.0
    assert metrics.company_count == 0
    assert metrics.input_token_estimate == 500  # 2000 / 4
|
|
|
|
|
|
def test_collect_metrics_empty_attempts():
    """collect_metrics returns safe defaults when the response has no attempts."""
    empty_response = ExtractionResponse(
        success=False,
        result=None,
        attempts=[],
        prompt_metadata={},
        model="test-model",
        total_duration_ms=0,
    )

    metrics = collect_metrics(empty_response, document_id="doc-3")

    # Attempt-derived counters and durations are all zero.
    assert metrics.attempt_count == 0
    assert metrics.retry_count == 0
    assert metrics.first_attempt_duration_ms == 0
    assert metrics.final_attempt_duration_ms == 0

    # With nothing to inspect, status is "unknown" and confidence is zero.
    assert metrics.validation_status == "unknown"
    assert metrics.confidence == 0.0
|
|
|
|
|
|
def test_collect_metrics_no_document_text_length():
    """A document_text_length of 0 gives an input token estimate of 0."""
    response = _make_success_response()

    metrics = collect_metrics(response, document_text_length=0)

    assert metrics.input_token_estimate == 0
|