fix(extractor): streaming with guardrails + catalyst_type normalization

- Switch Ollama calls from non-streaming to streaming with early termination
- Add stream guards: repetition-loop detection, a max-token limit, and a stall timeout
- Add catalyst_type alias normalizer to handle model hallucinations
- Add explicit enum values in extraction prompt for catalyst_type
- Add streaming config knobs to OllamaConfig
This commit is contained in:
Celes Renata
2026-04-12 15:28:20 -07:00
parent 527be42f82
commit cd782d1552
4 changed files with 116 additions and 14 deletions
+72 -14
View File
@@ -117,7 +117,9 @@ class OllamaClient:
self._max_delay = config.retry_max_delay
self._backoff_multiplier = config.retry_backoff_multiplier
self._owns_client = http_client is None
self._http = http_client or httpx.AsyncClient(timeout=config.timeout)
self._http = http_client or httpx.AsyncClient(
timeout=httpx.Timeout(config.timeout, read=config.timeout),
)
async def close(self) -> None:
"""Close the underlying HTTP client if we own it."""
@@ -208,7 +210,13 @@ class OllamaClient:
json_schema: dict[str, object],
document_text: str = "",
) -> ExtractionAttempt:
"""Make a single call to the Ollama /api/chat endpoint."""
"""Make a streaming call to Ollama with early-termination guardrails.
Aborts the stream if:
- Total generated tokens exceed ``max_tokens``
- No new chunk arrives within ``stall_timeout`` seconds
- Repetition loop detected in the last ``loop_window`` tokens
"""
attempt = ExtractionAttempt(model=self._config.model)
start = time.monotonic()
@@ -219,19 +227,20 @@ class OllamaClient:
{"role": "user", "content": prompts["user"]},
],
"format": json_schema,
"stream": False,
"stream": True,
"think": False,
}
url = f"{self._config.base_url}/api/chat"
logger.info(
"Ollama POST %s model=%s input_chars=%d",
"Ollama POST %s model=%s input_chars=%d (streaming)",
url, self._config.model, len(prompts.get("user", "")),
)
try:
resp = await self._http.post(url, json=payload)
_ = resp.raise_for_status()
req = self._http.build_request("POST", url, json=payload)
resp = await self._http.send(req, stream=True)
resp.raise_for_status()
except httpx.TimeoutException:
attempt.error = "timeout"
attempt.duration_ms = int((time.monotonic() - start) * 1000)
@@ -246,18 +255,67 @@ class OllamaClient:
attempt.duration_ms = int((time.monotonic() - start) * 1000)
return attempt
# Stream and accumulate with guardrails
chunks: list[str] = []
token_count = 0
last_chunk_time = time.monotonic()
abort_reason: str | None = None
try:
async for line in resp.aiter_lines():
if not line:
continue
try:
frame = json.loads(line)
except json.JSONDecodeError:
continue
if frame.get("done"):
break
msg = frame.get("message", {})
token = msg.get("content", "") if isinstance(msg, dict) else ""
if not token:
continue
chunks.append(token)
token_count += 1
last_chunk_time = time.monotonic()
# Guard: max tokens
if token_count > self._config.max_tokens:
abort_reason = f"max_tokens_exceeded ({token_count})"
break
# Guard: repetition loop detection
if token_count >= self._config.loop_window:
window = chunks[-self._config.loop_window:]
unique_ratio = len(set(window)) / len(window)
if unique_ratio < self._config.loop_threshold:
abort_reason = f"repetition_loop (unique_ratio={unique_ratio:.2f})"
break
# Guard: stall detection (check between chunks)
elapsed_since_last = time.monotonic() - last_chunk_time
if elapsed_since_last > self._config.stall_timeout:
abort_reason = "stall_timeout"
break
except httpx.ReadTimeout:
abort_reason = "read_timeout"
finally:
await resp.aclose()
attempt.duration_ms = int((time.monotonic() - start) * 1000)
# Parse the Ollama response envelope
try:
body: dict[str, object] = resp.json()
except json.JSONDecodeError:
attempt.error = "invalid_response_json"
attempt.raw_output = resp.text
if abort_reason:
logger.warning(
"Stream aborted after %d tokens: %s", token_count, abort_reason,
)
attempt.error = abort_reason
attempt.raw_output = "".join(chunks)
return attempt
msg = body.get("message")
content: str = msg.get("content", "") if isinstance(msg, dict) else ""
content = "".join(chunks)
attempt.raw_output = content
if not content: