fix: switch to non-streaming Ollama calls — streaming breaks thinking mode

2026-04-15 01:19:17 +00:00
parent ffe19eb23a
commit 46b069a748
1 changed file with 9 additions and 80 deletions
@@ -12,7 +12,6 @@ Requirements: 5.1, 5.2, 5.4
 from __future__ import annotations
 import asyncio
 import json
 import logging
 import time
 from dataclasses import dataclass, field
@@ -227,30 +226,17 @@ class OllamaClient:
                {"role": "user", "content": prompts["user"]},
            ],
            "format": json_schema,
-            "stream": True,
+            "stream": False,
            # NOTE: Do NOT set "think": False here. Ollama has a known bug
            # (issues #14645, #15260) where think=false silently disables
            # the format constraint for qwen3.5 and gemma4 models, causing
            # the model to output plain text instead of valid JSON.
            # Omitting "think" lets the model use thinking mode (slightly
            # slower but structured output actually works).
            "options": {
                # Ollama defaults num_predict to 4096 which is consumed
                # entirely by thinking tokens, leaving nothing for content.
                # Set high enough for thinking + JSON output.
                "num_predict": self._config.max_tokens,
            },
        }
        url = f"{self._config.base_url}/api/chat"
        logger.info(
-            "Ollama POST %s model=%s input_chars=%d (streaming)",
+            "Ollama POST %s model=%s input_chars=%d",
            url, self._config.model, len(prompts.get("user", "")),
        )
        try:
-            req = self._http.build_request("POST", url, json=payload)
+            resp = await self._http.post(url, json=payload)
            resp = await self._http.send(req, stream=True)
            resp.raise_for_status()
        except httpx.TimeoutException:
            attempt.error = "timeout"
@@ -266,73 +252,16 @@ class OllamaClient:
            attempt.duration_ms = int((time.monotonic() - start) * 1000)
            return attempt
        # Stream and accumulate with guardrails
        chunks: list[str] = []
        token_count = 0
        last_chunk_time = time.monotonic()
        abort_reason: str | None = None
        try:
            async for line in resp.aiter_lines():
                if not line:
                    continue
                try:
                    frame = json.loads(line)
                except json.JSONDecodeError:
                    continue
                if frame.get("done"):
                    break
                msg = frame.get("message", {})
                token = msg.get("content", "") if isinstance(msg, dict) else ""
                # During thinking mode, the model emits tokens in msg.thinking
                # before msg.content. We don't accumulate thinking tokens but
                # must update last_chunk_time so the stall guard doesn't fire.
                thinking_token = msg.get("thinking", "") if isinstance(msg, dict) else ""
                if thinking_token:
                    last_chunk_time = time.monotonic()
                if not token:
                    continue
                chunks.append(token)
                token_count += 1
                last_chunk_time = time.monotonic()
                # Guard: max tokens
                if token_count > self._config.max_tokens:
                    abort_reason = f"max_tokens_exceeded ({token_count})"
                    break
                # Guard: repetition loop detection
                if token_count >= self._config.loop_window:
                    window = chunks[-self._config.loop_window:]
                    unique_ratio = len(set(window)) / len(window)
                    if unique_ratio < self._config.loop_threshold:
                        abort_reason = f"repetition_loop (unique_ratio={unique_ratio:.2f})"
                        break
                # Guard: stall detection (check between chunks)
                elapsed_since_last = time.monotonic() - last_chunk_time
                if elapsed_since_last > self._config.stall_timeout:
                    abort_reason = "stall_timeout"
                    break
        except httpx.ReadTimeout:
            abort_reason = "read_timeout"
        finally:
            await resp.aclose()
        attempt.duration_ms = int((time.monotonic() - start) * 1000)
-        if abort_reason:
+        try:
-            logger.warning(
+            data = resp.json()
-                "Stream aborted after %d tokens: %s", token_count, abort_reason,
+        except Exception:
-            )
+            attempt.error = "invalid_response_json"
-            attempt.error = abort_reason
+            attempt.raw_output = resp.text[:2000]
            attempt.raw_output = "".join(chunks)
            return attempt
-        content = "".join(chunks)
+        content = data.get("message", {}).get("content", "")
        attempt.raw_output = content
        if not content: