fix: switch to non-streaming Ollama calls — streaming breaks thinking mode
This commit is contained in:
@@ -12,7 +12,6 @@ Requirements: 5.1, 5.2, 5.4
|
|||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
import asyncio
|
import asyncio
|
||||||
import json
|
|
||||||
import logging
|
import logging
|
||||||
import time
|
import time
|
||||||
from dataclasses import dataclass, field
|
from dataclasses import dataclass, field
|
||||||
@@ -227,30 +226,17 @@ class OllamaClient:
|
|||||||
{"role": "user", "content": prompts["user"]},
|
{"role": "user", "content": prompts["user"]},
|
||||||
],
|
],
|
||||||
"format": json_schema,
|
"format": json_schema,
|
||||||
"stream": True,
|
"stream": False,
|
||||||
# NOTE: Do NOT set "think": False here. Ollama has a known bug
|
|
||||||
# (issues #14645, #15260) where think=false silently disables
|
|
||||||
# the format constraint for qwen3.5 and gemma4 models, causing
|
|
||||||
# the model to output plain text instead of valid JSON.
|
|
||||||
# Omitting "think" lets the model use thinking mode (slightly
|
|
||||||
# slower but structured output actually works).
|
|
||||||
"options": {
|
|
||||||
# Ollama defaults num_predict to 4096, which is consumed
|
|
||||||
# entirely by thinking tokens, leaving nothing for content.
|
|
||||||
# Set high enough for thinking + JSON output.
|
|
||||||
"num_predict": self._config.max_tokens,
|
|
||||||
},
|
|
||||||
}
|
}
|
||||||
|
|
||||||
url = f"{self._config.base_url}/api/chat"
|
url = f"{self._config.base_url}/api/chat"
|
||||||
logger.info(
|
logger.info(
|
||||||
"Ollama POST %s model=%s input_chars=%d (streaming)",
|
"Ollama POST %s model=%s input_chars=%d",
|
||||||
url, self._config.model, len(prompts.get("user", "")),
|
url, self._config.model, len(prompts.get("user", "")),
|
||||||
)
|
)
|
||||||
|
|
||||||
try:
|
try:
|
||||||
req = self._http.build_request("POST", url, json=payload)
|
resp = await self._http.post(url, json=payload)
|
||||||
resp = await self._http.send(req, stream=True)
|
|
||||||
resp.raise_for_status()
|
resp.raise_for_status()
|
||||||
except httpx.TimeoutException:
|
except httpx.TimeoutException:
|
||||||
attempt.error = "timeout"
|
attempt.error = "timeout"
|
||||||
@@ -266,73 +252,16 @@ class OllamaClient:
|
|||||||
attempt.duration_ms = int((time.monotonic() - start) * 1000)
|
attempt.duration_ms = int((time.monotonic() - start) * 1000)
|
||||||
return attempt
|
return attempt
|
||||||
|
|
||||||
# Stream and accumulate with guardrails
|
|
||||||
chunks: list[str] = []
|
|
||||||
token_count = 0
|
|
||||||
last_chunk_time = time.monotonic()
|
|
||||||
abort_reason: str | None = None
|
|
||||||
|
|
||||||
try:
|
|
||||||
async for line in resp.aiter_lines():
|
|
||||||
if not line:
|
|
||||||
continue
|
|
||||||
try:
|
|
||||||
frame = json.loads(line)
|
|
||||||
except json.JSONDecodeError:
|
|
||||||
continue
|
|
||||||
|
|
||||||
if frame.get("done"):
|
|
||||||
break
|
|
||||||
|
|
||||||
msg = frame.get("message", {})
|
|
||||||
token = msg.get("content", "") if isinstance(msg, dict) else ""
|
|
||||||
# During thinking mode, the model emits tokens in msg.thinking
|
|
||||||
# before msg.content. We don't accumulate thinking tokens but
|
|
||||||
# must update last_chunk_time so the stall guard doesn't fire.
|
|
||||||
thinking_token = msg.get("thinking", "") if isinstance(msg, dict) else ""
|
|
||||||
if thinking_token:
|
|
||||||
last_chunk_time = time.monotonic()
|
|
||||||
if not token:
|
|
||||||
continue
|
|
||||||
|
|
||||||
chunks.append(token)
|
|
||||||
token_count += 1
|
|
||||||
last_chunk_time = time.monotonic()
|
|
||||||
|
|
||||||
# Guard: max tokens
|
|
||||||
if token_count > self._config.max_tokens:
|
|
||||||
abort_reason = f"max_tokens_exceeded ({token_count})"
|
|
||||||
break
|
|
||||||
|
|
||||||
# Guard: repetition loop detection
|
|
||||||
if token_count >= self._config.loop_window:
|
|
||||||
window = chunks[-self._config.loop_window:]
|
|
||||||
unique_ratio = len(set(window)) / len(window)
|
|
||||||
if unique_ratio < self._config.loop_threshold:
|
|
||||||
abort_reason = f"repetition_loop (unique_ratio={unique_ratio:.2f})"
|
|
||||||
break
|
|
||||||
|
|
||||||
# Guard: stall detection (check between chunks)
|
|
||||||
elapsed_since_last = time.monotonic() - last_chunk_time
|
|
||||||
if elapsed_since_last > self._config.stall_timeout:
|
|
||||||
abort_reason = "stall_timeout"
|
|
||||||
break
|
|
||||||
except httpx.ReadTimeout:
|
|
||||||
abort_reason = "read_timeout"
|
|
||||||
finally:
|
|
||||||
await resp.aclose()
|
|
||||||
|
|
||||||
attempt.duration_ms = int((time.monotonic() - start) * 1000)
|
attempt.duration_ms = int((time.monotonic() - start) * 1000)
|
||||||
|
|
||||||
if abort_reason:
|
try:
|
||||||
logger.warning(
|
data = resp.json()
|
||||||
"Stream aborted after %d tokens: %s", token_count, abort_reason,
|
except Exception:
|
||||||
)
|
attempt.error = "invalid_response_json"
|
||||||
attempt.error = abort_reason
|
attempt.raw_output = resp.text[:2000]
|
||||||
attempt.raw_output = "".join(chunks)
|
|
||||||
return attempt
|
return attempt
|
||||||
|
|
||||||
content = "".join(chunks)
|
content = data.get("message", {}).get("content", "")
|
||||||
attempt.raw_output = content
|
attempt.raw_output = content
|
||||||
|
|
||||||
if not content:
|
if not content:
|
||||||
|
|||||||
Reference in New Issue
Block a user