diff --git a/infra/helm/stonks-oracle/values.yaml b/infra/helm/stonks-oracle/values.yaml index 0e97ad5..da22e22 100644 --- a/infra/helm/stonks-oracle/values.yaml +++ b/infra/helm/stonks-oracle/values.yaml @@ -174,7 +174,7 @@ config: REDIS_DB: "0" MINIO_ENDPOINT: "minio.minio-service.svc.cluster.local:80" MINIO_SECURE: "false" - OLLAMA_BASE_URL: "http://ollama.ollama-service.svc.cluster.local:11434" + OLLAMA_BASE_URL: "http://10.1.1.12:2701" OLLAMA_MODEL: "qwen3.5:9b-fast" OLLAMA_TIMEOUT: "240" OLLAMA_MAX_RETRIES: "2" diff --git a/services/recommendation/thesis_llm.py b/services/recommendation/thesis_llm.py index 605df79..fc55917 100644 --- a/services/recommendation/thesis_llm.py +++ b/services/recommendation/thesis_llm.py @@ -20,7 +20,7 @@ import asyncpg import httpx from services.shared.agent_config import AgentConfigResolver, ResolvedAgentConfig -from services.shared.config import OllamaConfig +from services.shared.config import OllamaConfig, VLLMConfig from services.shared.schemas import TrendSummary logger = logging.getLogger(__name__) @@ -115,26 +115,42 @@ async def rewrite_thesis_with_llm( # Resolve thesis-rewriter config from DB for variant override resolved: ResolvedAgentConfig | None = None - effective_config = config + effective_config: OllamaConfig | VLLMConfig = config + use_vllm = False if pool is not None: try: resolver = AgentConfigResolver(pool, ttl_seconds=60) resolved = await resolver.resolve("thesis-rewriter") if resolved is not None: - effective_config = OllamaConfig( - base_url=config.base_url, - model=resolved.model_name, - timeout=resolved.timeout_seconds, - max_retries=resolved.max_retries, - retry_base_delay=config.retry_base_delay, - retry_max_delay=config.retry_max_delay, - retry_backoff_multiplier=config.retry_backoff_multiplier, - max_tokens=resolved.max_tokens, - context_window=resolved.context_window, - ) + provider = (resolved.model_provider or "").strip().lower() + if provider == "vllm": + use_vllm = True + # Import load_config to 
async def _call_vllm_thesis(
    client: httpx.AsyncClient,
    config: VLLMConfig,
    prompts: dict[str, str],
) -> str:
    """Make a vLLM chat completion call for thesis rewriting.

    Sends a single non-streaming request to the OpenAI-compatible
    ``/v1/chat/completions`` endpoint and extracts the first choice's text.

    Args:
        client: HTTP client used to send the request.
        config: vLLM settings; ``base_url``, ``model``, ``max_tokens``,
            ``temperature`` and ``api_key`` are read.
        prompts: Prompt texts; the ``"system"`` and ``"user"`` keys are
            required.

    Returns:
        The stripped text of the first choice's message, or an empty string
        when the response carries no usable text (non-object body, missing or
        empty ``choices``, malformed choice, or a null/non-string
        ``content``).

    Raises:
        httpx.HTTPStatusError: If the server answers with an error status
            (raised by ``raise_for_status``).
        KeyError: If ``prompts`` lacks the ``"system"`` or ``"user"`` key.
    """
    start = time.monotonic()

    payload: dict[str, object] = {
        "model": config.model,
        "messages": [
            {"role": "system", "content": prompts["system"]},
            {"role": "user", "content": prompts["user"]},
        ],
        "max_tokens": config.max_tokens,
        "temperature": config.temperature,
        "stream": False,
    }

    headers: dict[str, str] = {"Content-Type": "application/json"}
    if config.api_key:
        headers["Authorization"] = f"Bearer {config.api_key}"

    # A base URL configured with a trailing slash would otherwise yield
    # ".../​/v1/chat/completions"; normalise so both spellings work.
    base_url = config.base_url.rstrip("/")
    resp = await client.post(
        f"{base_url}/v1/chat/completions",
        json=payload,
        headers=headers,
    )
    _ = resp.raise_for_status()

    duration_ms = int((time.monotonic() - start) * 1000)

    # Walk the response defensively: every level is checked before it is
    # indexed so an unexpected shape degrades to "" instead of raising.
    body: object = resp.json()
    content = ""
    choices = body.get("choices") if isinstance(body, dict) else None
    if isinstance(choices, list) and choices:
        first = choices[0]
        msg = first.get("message") if isinstance(first, dict) else None
        raw = msg.get("content") if isinstance(msg, dict) else None
        # The OpenAI-compatible schema allows ``content`` to be null; a
        # ``.get(..., "")`` default does not cover a key that is present
        # with a null value, so only accept real strings here.
        if isinstance(raw, str):
            content = raw

    logger.debug(
        "vLLM thesis call completed in %dms, response length=%d",
        duration_ms,
        len(content),
    )

    return content.strip()