From 01726af3604ce6e1245b88f761a1de5bf62e79ac Mon Sep 17 00:00:00 2001 From: Celes Renata Date: Tue, 14 Apr 2026 23:50:28 +0000 Subject: [PATCH] fix: remove think=false (Ollama bug #14645), bump max_tokens to 32k --- services/extractor/client.py | 7 ++++++- services/shared/config.py | 2 +- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/services/extractor/client.py b/services/extractor/client.py index 3d7737c..fb27cd4 100644 --- a/services/extractor/client.py +++ b/services/extractor/client.py @@ -228,7 +228,12 @@ class OllamaClient: ], "format": json_schema, "stream": True, - "think": False, + # NOTE: Do NOT set "think": False here. Ollama has a known bug + # (issues #14645, #15260) where think=false silently disables + # the format constraint for qwen3.5 and gemma4 models, causing + # the model to output plain text instead of valid JSON. + # Omitting "think" lets the model use thinking mode (slightly + # slower but structured output actually works). } url = f"{self._config.base_url}/api/chat" diff --git a/services/shared/config.py b/services/shared/config.py index 5de1620..480e3c3 100644 --- a/services/shared/config.py +++ b/services/shared/config.py @@ -47,7 +47,7 @@ class OllamaConfig: retry_base_delay: float = 1.0 retry_max_delay: float = 10.0 retry_backoff_multiplier: float = 2.0 - max_tokens: int = 4096 + max_tokens: int = 32768 stall_timeout: float = 30.0 loop_window: int = 64 loop_threshold: float = 0.5