fix: remove think=false (Ollama bug #14645), bump max_tokens to 32k

2026-04-14 23:50:28 +00:00
parent b0fc207c1a
commit 01726af360
2 changed files with 7 additions and 2 deletions
@@ -228,7 +228,12 @@ class OllamaClient:
            ],
            "format": json_schema,
            "stream": True,
-            "think": False,
+            # NOTE: Do NOT set "think": False here. Ollama has a known bug
+            # (issues #14645, #15260) where think=false silently disables
+            # the format constraint for qwen3.5 and gemma4 models, causing
+            # the model to output plain text instead of valid JSON.
+            # Omitting "think" lets the model use thinking mode (slightly
+            # slower but structured output actually works).
        }

        url = f"{self._config.base_url}/api/chat"
@@ -47,7 +47,7 @@ class OllamaConfig:
    retry_base_delay: float = 1.0
    retry_max_delay: float = 10.0
    retry_backoff_multiplier: float = 2.0
-    max_tokens: int = 4096
+    max_tokens: int = 32768
    stall_timeout: float = 30.0
    loop_window: int = 64
    loop_threshold: float = 0.5