fix: thesis rewriter now routes to vLLM when provider is vllm

- thesis_llm.py: add _call_vllm_thesis() using /v1/chat/completions
- thesis_llm.py: check resolved model_provider and route accordingly
- values.yaml: set OLLAMA_BASE_URL to http://10.1.1.12:2701
2026-04-29 05:41:57 +00:00
parent a36702e5f3
commit f264e924f0
2 changed files with 88 additions and 17 deletions
@@ -174,7 +174,7 @@ config:
  REDIS_DB: "0"
  MINIO_ENDPOINT: "minio.minio-service.svc.cluster.local:80"
  MINIO_SECURE: "false"
-  OLLAMA_BASE_URL: "http://ollama.ollama-service.svc.cluster.local:11434"
+  OLLAMA_BASE_URL: "http://10.1.1.12:2701"
  OLLAMA_MODEL: "qwen3.5:9b-fast"
  OLLAMA_TIMEOUT: "240"
  OLLAMA_MAX_RETRIES: "2"
@@ -20,7 +20,7 @@ import asyncpg
 import httpx
 from services.shared.agent_config import AgentConfigResolver, ResolvedAgentConfig
-from services.shared.config import OllamaConfig
+from services.shared.config import OllamaConfig, VLLMConfig
 from services.shared.schemas import TrendSummary
 logger = logging.getLogger(__name__)
@@ -115,26 +115,42 @@ async def rewrite_thesis_with_llm(
    # Resolve thesis-rewriter config from DB for variant override
    resolved: ResolvedAgentConfig | None = None
-    effective_config = config
+    effective_config: OllamaConfig | VLLMConfig = config
    use_vllm = False
    if pool is not None:
        try:
            resolver = AgentConfigResolver(pool, ttl_seconds=60)
            resolved = await resolver.resolve("thesis-rewriter")
            if resolved is not None:
-                effective_config = OllamaConfig(
+                provider = (resolved.model_provider or "").strip().lower()
-                    base_url=config.base_url,
+                if provider == "vllm":
-                    model=resolved.model_name,
+                    use_vllm = True
-                    timeout=resolved.timeout_seconds,
+                    # Import load_config to get vllm base_url from env
-                    max_retries=resolved.max_retries,
+                    from services.shared.config import load_config as _load_config
-                    retry_base_delay=config.retry_base_delay,
+                    _cfg = _load_config()
-                    retry_max_delay=config.retry_max_delay,
+                    effective_config = VLLMConfig(
-                    retry_backoff_multiplier=config.retry_backoff_multiplier,
+                        base_url=_cfg.vllm.base_url,
-                    max_tokens=resolved.max_tokens,
+                        model=resolved.model_name,
-                    context_window=resolved.context_window,
+                        timeout=resolved.timeout_seconds,
-                )
+                        max_retries=resolved.max_retries,
                        max_tokens=resolved.max_tokens,
                        temperature=0.0,
                    )
                else:
                    effective_config = OllamaConfig(
                        base_url=config.base_url,
                        model=resolved.model_name,
                        timeout=resolved.timeout_seconds,
                        max_retries=resolved.max_retries,
                        retry_base_delay=config.retry_base_delay,
                        retry_max_delay=config.retry_max_delay,
                        retry_backoff_multiplier=config.retry_backoff_multiplier,
                        max_tokens=resolved.max_tokens,
                        context_window=resolved.context_window,
                    )
                logger.info(
-                    "Thesis rewriter using resolved config: model=%s variant=%s",
+                    "Thesis rewriter using resolved config: model=%s variant=%s provider=%s",
-                    resolved.model_name, resolved.variant_id,
+                    resolved.model_name, resolved.variant_id, provider or "ollama",
                )
        except Exception:
            logger.warning(
@@ -177,7 +193,10 @@ async def rewrite_thesis_with_llm(
    client = http_client or httpx.AsyncClient(timeout=effective_config.timeout)
    try:
-        rewritten = await _call_ollama_thesis(client, effective_config, prompts)
+        if use_vllm:
            rewritten = await _call_vllm_thesis(client, effective_config, prompts)  # type: ignore[arg-type]
        else:
            rewritten = await _call_ollama_thesis(client, effective_config, prompts)  # type: ignore[arg-type]
        duration_ms = int((time.monotonic() - start_time) * 1000)
        if rewritten:
@@ -318,3 +337,55 @@ async def _call_ollama_thesis(
    )
    return content.strip()
 async def _call_vllm_thesis(
    client: httpx.AsyncClient,
    config: VLLMConfig,
    prompts: dict[str, str],
 ) -> str:
    """Make a vLLM chat completion call for thesis rewriting.

    Sends the system and user prompts as a single non-streaming request to
    the OpenAI-compatible ``/v1/chat/completions`` endpoint under
    ``config.base_url``.

    Args:
        client: HTTP client used to send the request.
        config: Supplies ``base_url``, ``model``, ``max_tokens``,
            ``temperature`` and ``api_key`` for the call.
        prompts: Must contain the keys ``"system"`` and ``"user"``.

    Returns:
        The ``message.content`` of the first choice with surrounding
        whitespace stripped. An empty string is returned when the response
        carries no usable text (no choices, an unexpected body shape, or a
        null / non-string ``content``).

    Raises:
        httpx.HTTPStatusError: Propagated from ``raise_for_status()`` when the
            server answers with an error status. Errors raised by
            ``client.post`` itself are not caught here either.
        KeyError: If ``prompts`` lacks ``"system"`` or ``"user"``.
    """
    start = time.monotonic()

    payload: dict[str, object] = {
        "model": config.model,
        "messages": [
            {"role": "system", "content": prompts["system"]},
            {"role": "user", "content": prompts["user"]},
        ],
        "max_tokens": config.max_tokens,
        "temperature": config.temperature,
        "stream": False,
    }

    headers: dict[str, str] = {"Content-Type": "application/json"}
    if config.api_key:
        headers["Authorization"] = f"Bearer {config.api_key}"

    # Drop trailing slashes so a base URL configured as "http://host/" does
    # not produce "http://host//v1/chat/completions".
    base_url = str(config.base_url).rstrip("/")
    resp = await client.post(
        f"{base_url}/v1/chat/completions",
        json=payload,
        headers=headers,
    )
    _ = resp.raise_for_status()
    duration_ms = int((time.monotonic() - start) * 1000)

    # Walk the response defensively: each level is only dereferenced once its
    # type is confirmed, so a malformed body yields "" instead of raising.
    body: object = resp.json()
    choices = body.get("choices") if isinstance(body, dict) else None
    first_choice = choices[0] if isinstance(choices, list) and choices else None
    msg = first_choice.get("message") if isinstance(first_choice, dict) else None
    raw_content = msg.get("content") if isinstance(msg, dict) else None
    # "content" may be present but null; treat anything that is not a string
    # as an empty reply so len() and strip() below are always safe.
    content: str = raw_content if isinstance(raw_content, str) else ""

    logger.debug(
        "vLLM thesis call completed in %dms, response length=%d",
        duration_ms,
        len(content),
    )
    return content.strip()