phase 17: add extraction output normalization — clamp scores to 0-1, map impact_horizon alternatives
This commit is contained in:
@@ -166,6 +166,9 @@ def validate_extraction(
|
||||
if not isinstance(data, dict):
|
||||
return ValidationReport(valid=False, errors=["Expected a JSON object at top level."])
|
||||
|
||||
# --- Normalize common model output issues before validation ---
|
||||
data = _normalize_extraction_data(data)
|
||||
|
||||
# --- Pydantic structural validation ---
|
||||
try:
|
||||
result = ExtractionResult.model_validate(data)
|
||||
@@ -187,6 +190,53 @@ def validate_extraction(
|
||||
)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Normalize model output before validation
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
_HORIZON_MAP: dict[str, str] = {
|
||||
"long-term": "90d_plus",
|
||||
"long": "90d_plus",
|
||||
"longterm": "90d_plus",
|
||||
"medium-term": "30d_90d",
|
||||
"medium": "1d_30d",
|
||||
"short-term": "1d_7d",
|
||||
"short": "1d",
|
||||
"immediate": "intraday",
|
||||
"near-term": "1d_7d",
|
||||
"mid-term": "1d_30d",
|
||||
}
|
||||
|
||||
|
||||
def _normalize_extraction_data(data: dict[str, Any]) -> dict[str, Any]:
|
||||
"""Fix common model output issues before Pydantic validation."""
|
||||
# Clamp novelty_score and confidence to [0, 1]
|
||||
for field in ("novelty_score", "confidence"):
|
||||
val = data.get(field)
|
||||
if isinstance(val, (int, float)):
|
||||
data[field] = max(0.0, min(1.0, float(val)))
|
||||
|
||||
# Normalize company entries
|
||||
companies = data.get("companies", [])
|
||||
if isinstance(companies, list):
|
||||
for comp in companies:
|
||||
if not isinstance(comp, dict):
|
||||
continue
|
||||
# Clamp numeric fields
|
||||
for f in ("relevance", "impact_score"):
|
||||
v = comp.get(f)
|
||||
if isinstance(v, (int, float)):
|
||||
comp[f] = max(0.0, min(1.0, float(v)))
|
||||
# Map impact_horizon alternatives
|
||||
horizon = comp.get("impact_horizon", "")
|
||||
if isinstance(horizon, str):
|
||||
mapped = _HORIZON_MAP.get(horizon.lower().strip())
|
||||
if mapped:
|
||||
comp["impact_horizon"] = mapped
|
||||
|
||||
return data
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Known valid impact horizons
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
Reference in New Issue
Block a user