From 46b069a7481220a29788f39b4a27f7d1736a55d7 Mon Sep 17 00:00:00 2001
From: Celes Renata <celes@frameshift.net>
Date: Wed, 15 Apr 2026 01:19:17 +0000
Subject: [PATCH] =?UTF-8?q?fix:=20switch=20to=20non-streaming=20Ollama=20c?=
 =?UTF-8?q?alls=20=E2=80=94=20streaming=20breaks=20thinking=20mode?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 services/extractor/client.py | 89 ++++--------------------------------
 1 file changed, 9 insertions(+), 80 deletions(-)

diff --git a/services/extractor/client.py b/services/extractor/client.py
index 728c9ec..64efd99 100644
--- a/services/extractor/client.py
+++ b/services/extractor/client.py
@@ -12,7 +12,6 @@ Requirements: 5.1, 5.2, 5.4
 from __future__ import annotations
 
 import asyncio
-import json
 import logging
 import time
 from dataclasses import dataclass, field
@@ -227,30 +226,17 @@ class OllamaClient:
                 {"role": "user", "content": prompts["user"]},
             ],
             "format": json_schema,
-            "stream": True,
-            # NOTE: Do NOT set "think": False here. Ollama has a known bug
-            # (issues #14645, #15260) where think=false silently disables
-            # the format constraint for qwen3.5 and gemma4 models, causing
-            # the model to output plain text instead of valid JSON.
-            # Omitting "think" lets the model use thinking mode (slightly
-            # slower but structured output actually works).
-            "options": {
-                # Ollama defaults num_predict to 4096 which is consumed
-                # entirely by thinking tokens, leaving nothing for content.
-                # Set high enough for thinking + JSON output.
-                "num_predict": self._config.max_tokens,
-            },
+            "stream": False,
         }
 
         url = f"{self._config.base_url}/api/chat"
         logger.info(
-            "Ollama POST %s model=%s input_chars=%d (streaming)",
+            "Ollama POST %s model=%s input_chars=%d",
             url, self._config.model, len(prompts.get("user", "")),
         )
 
         try:
-            req = self._http.build_request("POST", url, json=payload)
-            resp = await self._http.send(req, stream=True)
+            resp = await self._http.post(url, json=payload)
             resp.raise_for_status()
         except httpx.TimeoutException:
             attempt.error = "timeout"
@@ -266,73 +252,16 @@ class OllamaClient:
             attempt.duration_ms = int((time.monotonic() - start) * 1000)
             return attempt
 
-        # Stream and accumulate with guardrails
-        chunks: list[str] = []
-        token_count = 0
-        last_chunk_time = time.monotonic()
-        abort_reason: str | None = None
-
-        try:
-            async for line in resp.aiter_lines():
-                if not line:
-                    continue
-                try:
-                    frame = json.loads(line)
-                except json.JSONDecodeError:
-                    continue
-
-                if frame.get("done"):
-                    break
-
-                msg = frame.get("message", {})
-                token = msg.get("content", "") if isinstance(msg, dict) else ""
-                # During thinking mode, the model emits tokens in msg.thinking
-                # before msg.content. We don't accumulate thinking tokens but
-                # must update last_chunk_time so the stall guard doesn't fire.
-                thinking_token = msg.get("thinking", "") if isinstance(msg, dict) else ""
-                if thinking_token:
-                    last_chunk_time = time.monotonic()
-                if not token:
-                    continue
-
-                chunks.append(token)
-                token_count += 1
-                last_chunk_time = time.monotonic()
-
-                # Guard: max tokens
-                if token_count > self._config.max_tokens:
-                    abort_reason = f"max_tokens_exceeded ({token_count})"
-                    break
-
-                # Guard: repetition loop detection
-                if token_count >= self._config.loop_window:
-                    window = chunks[-self._config.loop_window:]
-                    unique_ratio = len(set(window)) / len(window)
-                    if unique_ratio < self._config.loop_threshold:
-                        abort_reason = f"repetition_loop (unique_ratio={unique_ratio:.2f})"
-                        break
-
-                # Guard: stall detection (check between chunks)
-                elapsed_since_last = time.monotonic() - last_chunk_time
-                if elapsed_since_last > self._config.stall_timeout:
-                    abort_reason = "stall_timeout"
-                    break
-        except httpx.ReadTimeout:
-            abort_reason = "read_timeout"
-        finally:
-            await resp.aclose()
-
         attempt.duration_ms = int((time.monotonic() - start) * 1000)
 
-        if abort_reason:
-            logger.warning(
-                "Stream aborted after %d tokens: %s", token_count, abort_reason,
-            )
-            attempt.error = abort_reason
-            attempt.raw_output = "".join(chunks)
+        try:
+            data = resp.json()
+        except Exception:
+            attempt.error = "invalid_response_json"
+            attempt.raw_output = resp.text[:2000]
             return attempt
 
-        content = "".join(chunks)
+        content = data.get("message", {}).get("content", "")
         attempt.raw_output = content
 
         if not content: