From eead4f1381a6a2d7647d460c12b99eabfcc25a1e Mon Sep 17 00:00:00 2001
From: Celes Renata <celes@frameshift.net>
Date: Wed, 29 Apr 2026 16:04:04 +0000
Subject: [PATCH] fix: disable thinking mode on vLLM path with
 chat_template_kwargs

The thesis rewriter uses vLLM (not Ollama) in production. The previous
fix only added think=False to the Ollama payload, so the vLLM request
path was left unchanged. For Qwen3 models served through vLLM's
OpenAI-compatible API, thinking mode is disabled by sending
"chat_template_kwargs": {"enable_thinking": false} in the request body.
---
 services/recommendation/thesis_llm.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/services/recommendation/thesis_llm.py b/services/recommendation/thesis_llm.py
index 218e32e..1ff0248 100644
--- a/services/recommendation/thesis_llm.py
+++ b/services/recommendation/thesis_llm.py
@@ -405,6 +405,8 @@ async def _call_vllm_thesis(
         "max_tokens": config.max_tokens,
         "temperature": config.temperature,
         "stream": False,
+        # Disable thinking/reasoning mode for Qwen3 models on vLLM
+        "chat_template_kwargs": {"enable_thinking": False},
     }
 
     headers: dict[str, str] = {"Content-Type": "application/json"}