From eead4f1381a6a2d7647d460c12b99eabfcc25a1e Mon Sep 17 00:00:00 2001 From: Celes Renata Date: Wed, 29 Apr 2026 16:04:04 +0000 Subject: [PATCH] fix: disable thinking mode on vLLM path with chat_template_kwargs The thesis rewriter uses vLLM (not Ollama) in production. The previous fix only added think=False to the Ollama payload. For vLLM's OpenAI-compatible API with Qwen3 models, thinking mode is disabled via chat_template_kwargs: {enable_thinking: false} in the request body. --- services/recommendation/thesis_llm.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/services/recommendation/thesis_llm.py b/services/recommendation/thesis_llm.py index 218e32e..1ff0248 100644 --- a/services/recommendation/thesis_llm.py +++ b/services/recommendation/thesis_llm.py @@ -405,6 +405,8 @@ async def _call_vllm_thesis( "max_tokens": config.max_tokens, "temperature": config.temperature, "stream": False, + # Disable thinking/reasoning mode for Qwen3 models on vLLM + "chat_template_kwargs": {"enable_thinking": False}, } headers: dict[str, str] = {"Content-Type": "application/json"}