diff --git a/infra/helm/stonks-oracle/values.yaml b/infra/helm/stonks-oracle/values.yaml
index 0e97ad5..da22e22 100644
--- a/infra/helm/stonks-oracle/values.yaml
+++ b/infra/helm/stonks-oracle/values.yaml
@@ -174,7 +174,7 @@ config:
   REDIS_DB: "0"
   MINIO_ENDPOINT: "minio.minio-service.svc.cluster.local:80"
   MINIO_SECURE: "false"
-  OLLAMA_BASE_URL: "http://ollama.ollama-service.svc.cluster.local:11434"
+  OLLAMA_BASE_URL: "http://10.1.1.12:2701"
   OLLAMA_MODEL: "qwen3.5:9b-fast"
   OLLAMA_TIMEOUT: "240"
   OLLAMA_MAX_RETRIES: "2"
diff --git a/services/recommendation/thesis_llm.py b/services/recommendation/thesis_llm.py
index 605df79..fc55917 100644
--- a/services/recommendation/thesis_llm.py
+++ b/services/recommendation/thesis_llm.py
@@ -20,7 +20,7 @@ import asyncpg
 import httpx
 
 from services.shared.agent_config import AgentConfigResolver, ResolvedAgentConfig
-from services.shared.config import OllamaConfig
+from services.shared.config import OllamaConfig, VLLMConfig
 from services.shared.schemas import TrendSummary
 
 logger = logging.getLogger(__name__)
@@ -115,26 +115,42 @@ async def rewrite_thesis_with_llm(
 
     # Resolve thesis-rewriter config from DB for variant override
     resolved: ResolvedAgentConfig | None = None
-    effective_config = config
+    effective_config: OllamaConfig | VLLMConfig = config
+    use_vllm = False
     if pool is not None:
         try:
             resolver = AgentConfigResolver(pool, ttl_seconds=60)
             resolved = await resolver.resolve("thesis-rewriter")
             if resolved is not None:
-                effective_config = OllamaConfig(
-                    base_url=config.base_url,
-                    model=resolved.model_name,
-                    timeout=resolved.timeout_seconds,
-                    max_retries=resolved.max_retries,
-                    retry_base_delay=config.retry_base_delay,
-                    retry_max_delay=config.retry_max_delay,
-                    retry_backoff_multiplier=config.retry_backoff_multiplier,
-                    max_tokens=resolved.max_tokens,
-                    context_window=resolved.context_window,
-                )
+                provider = (resolved.model_provider or "").strip().lower()
+                if provider == "vllm":
+                    use_vllm = True
+                    # Import load_config to get vllm base_url from env
+                    from services.shared.config import load_config as _load_config
+                    _cfg = _load_config()
+                    effective_config = VLLMConfig(
+                        base_url=_cfg.vllm.base_url,
+                        model=resolved.model_name,
+                        timeout=resolved.timeout_seconds,
+                        max_retries=resolved.max_retries,
+                        max_tokens=resolved.max_tokens,
+                        temperature=0.0,
+                    )
+                else:
+                    effective_config = OllamaConfig(
+                        base_url=config.base_url,
+                        model=resolved.model_name,
+                        timeout=resolved.timeout_seconds,
+                        max_retries=resolved.max_retries,
+                        retry_base_delay=config.retry_base_delay,
+                        retry_max_delay=config.retry_max_delay,
+                        retry_backoff_multiplier=config.retry_backoff_multiplier,
+                        max_tokens=resolved.max_tokens,
+                        context_window=resolved.context_window,
+                    )
                 logger.info(
-                    "Thesis rewriter using resolved config: model=%s variant=%s",
-                    resolved.model_name, resolved.variant_id,
+                    "Thesis rewriter using resolved config: model=%s variant=%s provider=%s",
+                    resolved.model_name, resolved.variant_id, provider or "ollama",
                 )
         except Exception:
             logger.warning(
@@ -177,7 +193,10 @@ async def rewrite_thesis_with_llm(
     client = http_client or httpx.AsyncClient(timeout=effective_config.timeout)
 
     try:
-        rewritten = await _call_ollama_thesis(client, effective_config, prompts)
+        if use_vllm:
+            rewritten = await _call_vllm_thesis(client, effective_config, prompts)  # type: ignore[arg-type]
+        else:
+            rewritten = await _call_ollama_thesis(client, effective_config, prompts)  # type: ignore[arg-type]
         duration_ms = int((time.monotonic() - start_time) * 1000)
 
         if rewritten:
@@ -318,3 +337,55 @@ async def _call_ollama_thesis(
     )
 
     return content.strip()
+
+
+async def _call_vllm_thesis(
+    client: httpx.AsyncClient,
+    config: VLLMConfig,
+    prompts: dict[str, str],
+) -> str:
+    """Make a vLLM chat completion call for thesis rewriting.
+
+    POSTs to the OpenAI-compatible ``/v1/chat/completions`` endpoint and returns
+    the first choice's stripped message text ("" when there is none).
+
+    Raises:
+        httpx.HTTPStatusError: if the server answers with an error status.
+    """
+    start = time.monotonic()
+    payload: dict[str, object] = {
+        "model": config.model,
+        "messages": [
+            {"role": "system", "content": prompts["system"]},
+            {"role": "user", "content": prompts["user"]},
+        ],
+        "max_tokens": config.max_tokens,
+        "temperature": config.temperature,
+        "stream": False,
+    }
+    headers: dict[str, str] = {"Content-Type": "application/json"}
+    if config.api_key:
+        headers["Authorization"] = f"Bearer {config.api_key}"
+
+    resp = await client.post(
+        f"{config.base_url}/v1/chat/completions",
+        json=payload,
+        headers=headers,
+    )
+    _ = resp.raise_for_status()
+    duration_ms = int((time.monotonic() - start) * 1000)
+
+    # "choices" may be missing/empty and "content" may be JSON null -> "".
+    body: dict[str, object] = resp.json()
+    choices = body.get("choices")
+    first = choices[0] if isinstance(choices, list) and choices else None
+    msg = first.get("message") if isinstance(first, dict) else None
+    raw = msg.get("content") if isinstance(msg, dict) else None
+    content: str = raw if isinstance(raw, str) else ""
+
+    logger.debug(
+        "vLLM thesis call completed in %dms, response length=%d",
+        duration_ms,
+        len(content),
+    )
+    return content.strip()