fix(extractor): streaming with guardrails + catalyst_type normalization

- Switch Ollama calls from non-streaming to streaming with early termination
- Add loop detection, max token limit, and stall timeout guards
- Add catalyst_type alias normalizer to handle model hallucinations
- Add explicit enum values in extraction prompt for catalyst_type
- Add streaming config knobs to OllamaConfig
This commit is contained in:
Celes Renata
2026-04-12 15:28:20 -07:00
parent 527be42f82
commit cd782d1552
4 changed files with 116 additions and 14 deletions
+38
View File
@@ -194,6 +194,39 @@ def validate_extraction(
# Normalize model output before validation
# ---------------------------------------------------------------------------
# Lookup table from free-form catalyst labels (lower-cased, stripped) to the
# canonical catalyst_type values. Aliases are grouped under the canonical value
# they resolve to; the comprehension flattens the groups into alias -> canonical.
# The explicit "other" group records known cases only: the normalizer already
# falls back to "other" for any label that is missing from this table.
_CATALYST_ALIASES: dict[str, str] = {
    alias: canonical
    for canonical, aliases in (
        (
            "other",
            ("strategic pivot", "strategic", "restructuring", "partnership"),
        ),
        ("m_and_a", ("acquisition", "merger", "buyout")),
        ("legal", ("lawsuit", "regulation", "regulatory")),
        ("rating_change", ("upgrade", "downgrade", "price target")),
        (
            "macro",
            ("inflation", "interest rate", "interest rates", "tariff", "tariffs"),
        ),
        ("product", ("launch", "product launch")),
        ("earnings", ("revenue", "profit", "guidance")),
        ("supply_chain", ("supply", "shortage")),
    )
    for alias in aliases
}
# Canonical catalyst_type values. The normalizer leaves a value alone when its
# lower-cased, stripped form is in this set; anything else goes through the
# alias table and ultimately falls back to "other".
_VALID_CATALYSTS = frozenset(
    (
        "earnings",
        "product",
        "legal",
        "macro",
        "supply_chain",
        "m_and_a",
        "rating_change",
        "other",
    )
)
_HORIZON_MAP: dict[str, str] = {
"long-term": "90d_plus",
"long": "90d_plus",
@@ -233,6 +266,11 @@ def _normalize_extraction_data(data: dict[str, Any]) -> dict[str, Any]:
mapped = _HORIZON_MAP.get(horizon.lower().strip())
if mapped:
comp["impact_horizon"] = mapped
# Map catalyst_type alternatives
cat = comp.get("catalyst_type", "")
if isinstance(cat, str) and cat.lower().strip() not in _VALID_CATALYSTS:
mapped_cat = _CATALYST_ALIASES.get(cat.lower().strip(), "other")
comp["catalyst_type"] = mapped_cat
return data