From 46b069a7481220a29788f39b4a27f7d1736a55d7 Mon Sep 17 00:00:00 2001 From: Celes Renata Date: Wed, 15 Apr 2026 01:19:17 +0000 Subject: [PATCH] =?UTF-8?q?fix:=20switch=20to=20non-streaming=20Ollama=20c?= =?UTF-8?q?alls=20=E2=80=94=20streaming=20breaks=20thinking=20mode?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- services/extractor/client.py | 89 ++++-------------------------------- 1 file changed, 9 insertions(+), 80 deletions(-) diff --git a/services/extractor/client.py b/services/extractor/client.py index 728c9ec..64efd99 100644 --- a/services/extractor/client.py +++ b/services/extractor/client.py @@ -12,7 +12,6 @@ Requirements: 5.1, 5.2, 5.4 from __future__ import annotations import asyncio -import json import logging import time from dataclasses import dataclass, field @@ -227,30 +226,17 @@ class OllamaClient: {"role": "user", "content": prompts["user"]}, ], "format": json_schema, - "stream": True, - # NOTE: Do NOT set "think": False here. Ollama has a known bug - # (issues #14645, #15260) where think=false silently disables - # the format constraint for qwen3.5 and gemma4 models, causing - # the model to output plain text instead of valid JSON. - # Omitting "think" lets the model use thinking mode (slightly - # slower but structured output actually works). - "options": { - # Ollama defaults num_predict to 4096 which is consumed - # entirely by thinking tokens, leaving nothing for content. - # Set high enough for thinking + JSON output. - "num_predict": self._config.max_tokens, - }, + "stream": False, } url = f"{self._config.base_url}/api/chat" logger.info( - "Ollama POST %s model=%s input_chars=%d (streaming)", + "Ollama POST %s model=%s input_chars=%d", url, self._config.model, len(prompts.get("user", "")), ) try: - req = self._http.build_request("POST", url, json=payload) - resp = await self._http.send(req, stream=True) + resp = await self._http.post(url, json=payload) resp.raise_for_status() except httpx.TimeoutException: attempt.error = "timeout" @@ -266,73 +252,16 @@ class OllamaClient: attempt.duration_ms = int((time.monotonic() - start) * 1000) return attempt - # Stream and accumulate with guardrails - chunks: list[str] = [] - token_count = 0 - last_chunk_time = time.monotonic() - abort_reason: str | None = None - - try: - async for line in resp.aiter_lines(): - if not line: - continue - try: - frame = json.loads(line) - except json.JSONDecodeError: - continue - - if frame.get("done"): - break - - msg = frame.get("message", {}) - token = msg.get("content", "") if isinstance(msg, dict) else "" - # During thinking mode, the model emits tokens in msg.thinking - # before msg.content. We don't accumulate thinking tokens but - # must update last_chunk_time so the stall guard doesn't fire. - thinking_token = msg.get("thinking", "") if isinstance(msg, dict) else "" - if thinking_token: - last_chunk_time = time.monotonic() - if not token: - continue - - chunks.append(token) - token_count += 1 - last_chunk_time = time.monotonic() - - # Guard: max tokens - if token_count > self._config.max_tokens: - abort_reason = f"max_tokens_exceeded ({token_count})" - break - - # Guard: repetition loop detection - if token_count >= self._config.loop_window: - window = chunks[-self._config.loop_window:] - unique_ratio = len(set(window)) / len(window) - if unique_ratio < self._config.loop_threshold: - abort_reason = f"repetition_loop (unique_ratio={unique_ratio:.2f})" - break - - # Guard: stall detection (check between chunks) - elapsed_since_last = time.monotonic() - last_chunk_time - if elapsed_since_last > self._config.stall_timeout: - abort_reason = "stall_timeout" - break - except httpx.ReadTimeout: - abort_reason = "read_timeout" - finally: - await resp.aclose() - attempt.duration_ms = int((time.monotonic() - start) * 1000) - if abort_reason: - logger.warning( - "Stream aborted after %d tokens: %s", token_count, abort_reason, - ) - attempt.error = abort_reason - attempt.raw_output = "".join(chunks) + try: + data = resp.json() + except Exception: + attempt.error = "invalid_response_json" + attempt.raw_output = resp.text[:2000] return attempt - content = "".join(chunks) + content = data.get("message", {}).get("content", "") attempt.raw_output = content if not content: