From f9ee1532dcd97911eb3fec79e6744691f615a694 Mon Sep 17 00:00:00 2001
From: Celes Renata <celes@frameshift.net>
Date: Wed, 29 Apr 2026 15:25:04 +0000
Subject: [PATCH] fix: strip <think> reasoning blocks from thesis LLM output

Qwen3.5 in thinking mode emits <think>...</think> chain-of-thought
before the actual response. The thesis rewriter was returning the raw
output including the entire reasoning block. The whole block (tags and
contents) is now stripped in both the Ollama and vLLM response paths.
---
 services/recommendation/thesis_llm.py | 19 +++++++++++++++++--
 1 file changed, 17 insertions(+), 2 deletions(-)
diff --git a/services/recommendation/thesis_llm.py b/services/recommendation/thesis_llm.py
index fc55917..1865e86 100644
--- a/services/recommendation/thesis_llm.py
+++ b/services/recommendation/thesis_llm.py
@@ -336,7 +336,22 @@ async def _call_ollama_thesis(
         len(content),
     )
 
-    return content.strip()
+    return _strip_thinking_block(content.strip())
+
+
+def _strip_thinking_block(text: str) -> str:
+    """Remove <think>...</think> reasoning blocks from model output.
+
+    Some models (e.g. Qwen) emit chain-of-thought in <think> tags before
+    the actual response. Every closed block is removed wherever it occurs;
+    an unclosed <think> drops everything from the tag to the end of text.
+    """
+    import re
+    # Remove each <think>...</think> block (non-greedy; DOTALL spans newlines)
+    cleaned = re.sub(r"<think>.*?</think>\s*", "", text, flags=re.DOTALL)
+    # Also handle unclosed <think> tag (model cut off mid-thought)
+    cleaned = re.sub(r"<think>.*", "", cleaned, flags=re.DOTALL)
+    return cleaned.strip()
 
 
 async def _call_vllm_thesis(
@@ -388,4 +403,4 @@ async def _call_vllm_thesis(
         len(content),
     )
 
-    return content.strip()
+    return _strip_thinking_block(content.strip())