From 01726af3604ce6e1245b88f761a1de5bf62e79ac Mon Sep 17 00:00:00 2001
From: Celes Renata <celes@frameshift.net>
Date: Tue, 14 Apr 2026 23:50:28 +0000
Subject: [PATCH] fix: remove think=false (Ollama bug #14645), bump max_tokens
 to 32k

---
 services/extractor/client.py | 7 ++++++-
 services/shared/config.py    | 2 +-
 2 files changed, 7 insertions(+), 2 deletions(-)

diff --git a/services/extractor/client.py b/services/extractor/client.py
index 3d7737c..fb27cd4 100644
--- a/services/extractor/client.py
+++ b/services/extractor/client.py
@@ -228,7 +228,12 @@ class OllamaClient:
             ],
             "format": json_schema,
             "stream": True,
-            "think": False,
+            # NOTE: Do NOT add "think": False to this dict. Ollama has a
+            # known bug (issues #14645, #15260) where think=false silently
+            # disables the "format" constraint for qwen3.5 and gemma4 models,
+            # causing the model to output plain text instead of valid JSON.
+            # Omitting "think" lets the model use thinking mode: slightly
+            # slower, but structured output actually works.
         }
 
         url = f"{self._config.base_url}/api/chat"
diff --git a/services/shared/config.py b/services/shared/config.py
index 5de1620..480e3c3 100644
--- a/services/shared/config.py
+++ b/services/shared/config.py
@@ -47,7 +47,7 @@ class OllamaConfig:
     retry_base_delay: float = 1.0
     retry_max_delay: float = 10.0
     retry_backoff_multiplier: float = 2.0
-    max_tokens: int = 4096
+    max_tokens: int = 32768
     stall_timeout: float = 30.0
     loop_window: int = 64
     loop_threshold: float = 0.5