fix: switch to non-streaming Ollama calls — streaming breaks thinking mode

2026-04-15 01:19:17 +00:00
parent ffe19eb23a
commit 46b069a748
1 changed files with 9 additions and 80 deletions
@@ -12,7 +12,6 @@ Requirements: 5.1, 5.2, 5.4
 from __future__ import annotations

 import asyncio
-import json
 import logging
 import time
 from dataclasses import dataclass, field
@@ -227,30 +226,17 @@ class OllamaClient:
                {"role": "user", "content": prompts["user"]},
            ],
            "format": json_schema,
-            "stream": True,
-            # NOTE: Do NOT set "think": False here. Ollama has a known bug
-            # (issues #14645, #15260) where think=false silently disables
-            # the format constraint for qwen3.5 and gemma4 models, causing
-            # the model to output plain text instead of valid JSON.
-            # Omitting "think" lets the model use thinking mode (slightly
-            # slower but structured output actually works).
-            "options": {
-                # Ollama defaults num_predict to 4096 which is consumed
-                # entirely by thinking tokens, leaving nothing for content.
-                # Set high enough for thinking + JSON output.
-                "num_predict": self._config.max_tokens,
-            },
+            "stream": False,
        }

        url = f"{self._config.base_url}/api/chat"
        logger.info(
-            "Ollama POST %s model=%s input_chars=%d (streaming)",
+            "Ollama POST %s model=%s input_chars=%d",
            url, self._config.model, len(prompts.get("user", "")),
        )

        try:
-            req = self._http.build_request("POST", url, json=payload)
-            resp = await self._http.send(req, stream=True)
+            resp = await self._http.post(url, json=payload)
            resp.raise_for_status()
        except httpx.TimeoutException:
            attempt.error = "timeout"
@@ -266,73 +252,16 @@ class OllamaClient:
            attempt.duration_ms = int((time.monotonic() - start) * 1000)
            return attempt

-        # Stream and accumulate with guardrails
-        chunks: list[str] = []
-        token_count = 0
-        last_chunk_time = time.monotonic()
-        abort_reason: str | None = None
-
-        try:
-            async for line in resp.aiter_lines():
-                if not line:
-                    continue
-                try:
-                    frame = json.loads(line)
-                except json.JSONDecodeError:
-                    continue
-
-                if frame.get("done"):
-                    break
-
-                msg = frame.get("message", {})
-                token = msg.get("content", "") if isinstance(msg, dict) else ""
-                # During thinking mode, the model emits tokens in msg.thinking
-                # before msg.content. We don't accumulate thinking tokens but
-                # must update last_chunk_time so the stall guard doesn't fire.
-                thinking_token = msg.get("thinking", "") if isinstance(msg, dict) else ""
-                if thinking_token:
-                    last_chunk_time = time.monotonic()
-                if not token:
-                    continue
-
-                chunks.append(token)
-                token_count += 1
-                last_chunk_time = time.monotonic()
-
-                # Guard: max tokens
-                if token_count > self._config.max_tokens:
-                    abort_reason = f"max_tokens_exceeded ({token_count})"
-                    break
-
-                # Guard: repetition loop detection
-                if token_count >= self._config.loop_window:
-                    window = chunks[-self._config.loop_window:]
-                    unique_ratio = len(set(window)) / len(window)
-                    if unique_ratio < self._config.loop_threshold:
-                        abort_reason = f"repetition_loop (unique_ratio={unique_ratio:.2f})"
-                        break
-
-                # Guard: stall detection (check between chunks)
-                elapsed_since_last = time.monotonic() - last_chunk_time
-                if elapsed_since_last > self._config.stall_timeout:
-                    abort_reason = "stall_timeout"
-                    break
-        except httpx.ReadTimeout:
-            abort_reason = "read_timeout"
-        finally:
-            await resp.aclose()
-
        attempt.duration_ms = int((time.monotonic() - start) * 1000)

-        if abort_reason:
-            logger.warning(
-                "Stream aborted after %d tokens: %s", token_count, abort_reason,
-            )
-            attempt.error = abort_reason
-            attempt.raw_output = "".join(chunks)
+        try:
+            data = resp.json()
+        except Exception:
+            attempt.error = "invalid_response_json"
+            attempt.raw_output = resp.text[:2000]
            return attempt

-        content = "".join(chunks)
+        content = data.get("message", {}).get("content", "")
        attempt.raw_output = content

        if not content: