phase 17: add extraction output normalization — clamp scores to 0-1, map impact_horizon alternatives

This commit is contained in:
Celes Renata
2026-04-12 10:15:38 -07:00
parent 608ccc8b68
commit 1993bfdf3e
+50
View File
@@ -166,6 +166,9 @@ def validate_extraction(
if not isinstance(data, dict):
return ValidationReport(valid=False, errors=["Expected a JSON object at top level."])
# --- Normalize common model output issues before validation ---
data = _normalize_extraction_data(data)
# --- Pydantic structural validation ---
try:
result = ExtractionResult.model_validate(data)
@@ -187,6 +190,53 @@ def validate_extraction(
)
# ---------------------------------------------------------------------------
# Normalize model output before validation
# ---------------------------------------------------------------------------
# Free-form horizon labels that models tend to emit, keyed by the exact
# lower-cased, whitespace-stripped spelling, mapped to the horizon code that
# replaces them.  Entries are grouped by target code for easier auditing.
#
# NOTE(review): "medium" and "mid-term" resolve to "1d_30d" while
# "medium-term" resolves to "30d_90d", and "short" resolves to "1d" while
# "short-term" / "near-term" resolve to "1d_7d".  Confirm these asymmetries
# are intended and that every target is an accepted horizon value.
_HORIZON_MAP: dict[str, str] = {
    # -> intraday
    "immediate": "intraday",
    # -> 1d
    "short": "1d",
    # -> 1d_7d
    "short-term": "1d_7d",
    "near-term": "1d_7d",
    # -> 1d_30d
    "medium": "1d_30d",
    "mid-term": "1d_30d",
    # -> 30d_90d
    "medium-term": "30d_90d",
    # -> 90d_plus
    "long": "90d_plus",
    "long-term": "90d_plus",
    "longterm": "90d_plus",
}
def _clamp_unit_interval(value: float) -> float:
    """Return *value* as a float limited to the closed interval [0.0, 1.0].

    The bounds are checked *before* converting to ``float`` so that an
    arbitrarily large integer clamps cleanly instead of raising
    ``OverflowError`` during the conversion.

    NaN is returned unchanged: every ordering comparison with NaN is false,
    so it falls through both guards.  (``max(0.0, min(1.0, nan))`` would
    instead yield ``1.0`` and fabricate a maximal score.)
    """
    if value <= 0:
        return 0.0
    if value >= 1:
        return 1.0
    return float(value)


def _clamp_score_fields(record: dict[str, Any], names: tuple[str, ...]) -> None:
    """Clamp, in place, each numeric entry of *record* listed in *names*."""
    for name in names:
        value = record.get(name)
        # Missing keys and non-numeric values (e.g. strings) are left as-is.
        # bool is a subclass of int, so True/False become 1.0/0.0.
        if isinstance(value, (int, float)):
            record[name] = _clamp_unit_interval(value)


def _normalize_extraction_data(data: dict[str, Any]) -> dict[str, Any]:
    """Fix common model output issues before Pydantic validation.

    The following adjustments are applied **in place**:

    * top-level ``novelty_score`` and ``confidence`` are clamped to [0, 1];
    * for every dict in the ``companies`` list, ``relevance`` and
      ``impact_score`` are clamped to [0, 1];
    * a string ``impact_horizon`` that, once lower-cased and stripped,
      matches a key of ``_HORIZON_MAP`` is replaced by the mapped value.

    Values of an unexpected type, NaN scores, unknown horizon labels and
    non-dict company entries are left untouched.

    Args:
        data: Parsed extraction payload; mutated by this call.

    Returns:
        The same ``data`` object that was passed in.
    """
    _clamp_score_fields(data, ("novelty_score", "confidence"))

    companies = data.get("companies", [])
    if not isinstance(companies, list):
        return data

    for company in companies:
        if not isinstance(company, dict):
            continue

        _clamp_score_fields(company, ("relevance", "impact_score"))

        # Map impact_horizon alternatives onto their replacement code.
        horizon = company.get("impact_horizon", "")
        if isinstance(horizon, str):
            mapped = _HORIZON_MAP.get(horizon.lower().strip())
            if mapped:
                company["impact_horizon"] = mapped

    return data
# ---------------------------------------------------------------------------
# Known valid impact horizons
# ---------------------------------------------------------------------------