fix(extractor): streaming with guardrails + catalyst_type normalization
- Switch Ollama calls from non-streaming to streaming with early termination
- Add loop detection, max token limit, and stall timeout guards
- Add catalyst_type alias normalizer to handle model hallucinations
- Add explicit enum values in extraction prompt for catalyst_type
- Add streaming config knobs to OllamaConfig
This commit is contained in:
@@ -194,6 +194,39 @@ def validate_extraction(
|
||||
# Normalize model output before validation
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
_CATALYST_ALIASES: dict[str, str] = {
|
||||
"strategic pivot": "other",
|
||||
"strategic": "other",
|
||||
"restructuring": "other",
|
||||
"partnership": "other",
|
||||
"acquisition": "m_and_a",
|
||||
"merger": "m_and_a",
|
||||
"buyout": "m_and_a",
|
||||
"lawsuit": "legal",
|
||||
"regulation": "legal",
|
||||
"regulatory": "legal",
|
||||
"upgrade": "rating_change",
|
||||
"downgrade": "rating_change",
|
||||
"price target": "rating_change",
|
||||
"inflation": "macro",
|
||||
"interest rate": "macro",
|
||||
"interest rates": "macro",
|
||||
"tariff": "macro",
|
||||
"tariffs": "macro",
|
||||
"launch": "product",
|
||||
"product launch": "product",
|
||||
"revenue": "earnings",
|
||||
"profit": "earnings",
|
||||
"guidance": "earnings",
|
||||
"supply": "supply_chain",
|
||||
"shortage": "supply_chain",
|
||||
}
|
||||
|
||||
_VALID_CATALYSTS = frozenset({
|
||||
"earnings", "product", "legal", "macro",
|
||||
"supply_chain", "m_and_a", "rating_change", "other",
|
||||
})
|
||||
|
||||
_HORIZON_MAP: dict[str, str] = {
|
||||
"long-term": "90d_plus",
|
||||
"long": "90d_plus",
|
||||
@@ -233,6 +266,11 @@ def _normalize_extraction_data(data: dict[str, Any]) -> dict[str, Any]:
|
||||
mapped = _HORIZON_MAP.get(horizon.lower().strip())
|
||||
if mapped:
|
||||
comp["impact_horizon"] = mapped
|
||||
# Map catalyst_type alternatives
|
||||
cat = comp.get("catalyst_type", "")
|
||||
if isinstance(cat, str) and cat.lower().strip() not in _VALID_CATALYSTS:
|
||||
mapped_cat = _CATALYST_ALIASES.get(cat.lower().strip(), "other")
|
||||
comp["catalyst_type"] = mapped_cat
|
||||
|
||||
return data
|
||||
|
||||
|
||||
Reference in New Issue
Block a user