diff --git a/services/extractor/schemas.py b/services/extractor/schemas.py index 75ff407..547f2c6 100644 --- a/services/extractor/schemas.py +++ b/services/extractor/schemas.py @@ -166,6 +166,9 @@ def validate_extraction( if not isinstance(data, dict): return ValidationReport(valid=False, errors=["Expected a JSON object at top level."]) + # --- Normalize common model output issues before validation --- + data = _normalize_extraction_data(data) + # --- Pydantic structural validation --- try: result = ExtractionResult.model_validate(data) @@ -187,6 +190,53 @@ def validate_extraction( ) +# --------------------------------------------------------------------------- +# Normalize model output before validation +# --------------------------------------------------------------------------- + +_HORIZON_MAP: dict[str, str] = { + "long-term": "90d_plus", + "long": "90d_plus", + "longterm": "90d_plus", + "medium-term": "30d_90d", + "medium": "1d_30d", + "short-term": "1d_7d", + "short": "1d", + "immediate": "intraday", + "near-term": "1d_7d", + "mid-term": "1d_30d", +} + + +def _normalize_extraction_data(data: dict[str, Any]) -> dict[str, Any]: + """Fix common model output issues before Pydantic validation.""" + # Clamp novelty_score and confidence to [0, 1] + for field in ("novelty_score", "confidence"): + val = data.get(field) + if isinstance(val, (int, float)): + data[field] = max(0.0, min(1.0, float(val))) + + # Normalize company entries + companies = data.get("companies", []) + if isinstance(companies, list): + for comp in companies: + if not isinstance(comp, dict): + continue + # Clamp numeric fields + for f in ("relevance", "impact_score"): + v = comp.get(f) + if isinstance(v, (int, float)): + comp[f] = max(0.0, min(1.0, float(v))) + # Map impact_horizon alternatives + horizon = comp.get("impact_horizon", "") + if isinstance(horizon, str): + mapped = _HORIZON_MAP.get(horizon.lower().strip()) + if mapped: + comp["impact_horizon"] = mapped + + return data + + # --------------------------------------------------------------------------- # Known valid impact horizons # ---------------------------------------------------------------------------