fix: thesis rewriter now routes to vLLM when provider is vllm
ci/woodpecker/push/test Pipeline was successful
ci/woodpecker/push/build-1 Pipeline was successful
ci/woodpecker/push/build-2 Pipeline was successful
ci/woodpecker/push/build-3 Pipeline was successful
ci/woodpecker/push/finalize Pipeline was successful
Build and Push / lint-and-test (push) Has been cancelled
Build and Push / build-services (map[cmd:python -m services.adapters.broker_adapter name:broker-adapter]) (push) Has been cancelled
Build and Push / build-services (map[cmd:python -m services.aggregation.worker name:aggregation]) (push) Has been cancelled
Build and Push / build-services (map[cmd:python -m services.extractor.worker name:extractor]) (push) Has been cancelled
Build and Push / build-services (map[cmd:python -m services.ingestion.worker name:ingestion]) (push) Has been cancelled
Build and Push / build-services (map[cmd:python -m services.lake_publisher.worker name:lake-publisher]) (push) Has been cancelled
Build and Push / build-services (map[cmd:python -m services.parser.worker name:parser]) (push) Has been cancelled
Build and Push / build-services (map[cmd:python -m services.recommendation.worker name:recommendation]) (push) Has been cancelled
Build and Push / build-services (map[cmd:python -m services.scheduler.app name:scheduler]) (push) Has been cancelled
Build and Push / build-services (map[cmd:uvicorn services.api.app:app --host 0.0.0.0 --port 8000 name:query-api]) (push) Has been cancelled
Build and Push / build-services (map[cmd:uvicorn services.risk.app:app --host 0.0.0.0 --port 8000 name:risk]) (push) Has been cancelled
Build and Push / build-services (map[cmd:uvicorn services.symbol_registry.app:app --host 0.0.0.0 --port 8000 name:symbol-registry]) (push) Has been cancelled
Build and Push / build-services (map[cmd:uvicorn services.trading.app:app --host 0.0.0.0 --port 8000 name:trading-engine]) (push) Has been cancelled
Build and Push / build-dashboard (push) Has been cancelled
Build and Push / build-superset (push) Has been cancelled
Build and Push / integration-test (push) Has been cancelled
Build and Push / beta-gate (push) Has been cancelled

- thesis_llm.py: add _call_vllm_thesis() using /v1/chat/completions
- thesis_llm.py: check resolved model_provider and route accordingly
- values.yaml: set OLLAMA_BASE_URL to http://10.1.1.12:2701
This commit is contained in:
Celes Renata
2026-04-29 05:41:57 +00:00
parent a36702e5f3
commit f264e924f0
2 changed files with 88 additions and 17 deletions
+87 -16
View File
@@ -20,7 +20,7 @@ import asyncpg
import httpx
from services.shared.agent_config import AgentConfigResolver, ResolvedAgentConfig
from services.shared.config import OllamaConfig
from services.shared.config import OllamaConfig, VLLMConfig
from services.shared.schemas import TrendSummary
logger = logging.getLogger(__name__)
@@ -115,26 +115,42 @@ async def rewrite_thesis_with_llm(
# Resolve thesis-rewriter config from DB for variant override
resolved: ResolvedAgentConfig | None = None
effective_config = config
effective_config: OllamaConfig | VLLMConfig = config
use_vllm = False
if pool is not None:
try:
resolver = AgentConfigResolver(pool, ttl_seconds=60)
resolved = await resolver.resolve("thesis-rewriter")
if resolved is not None:
effective_config = OllamaConfig(
base_url=config.base_url,
model=resolved.model_name,
timeout=resolved.timeout_seconds,
max_retries=resolved.max_retries,
retry_base_delay=config.retry_base_delay,
retry_max_delay=config.retry_max_delay,
retry_backoff_multiplier=config.retry_backoff_multiplier,
max_tokens=resolved.max_tokens,
context_window=resolved.context_window,
)
provider = (resolved.model_provider or "").strip().lower()
if provider == "vllm":
use_vllm = True
# Import load_config to get vllm base_url from env
from services.shared.config import load_config as _load_config
_cfg = _load_config()
effective_config = VLLMConfig(
base_url=_cfg.vllm.base_url,
model=resolved.model_name,
timeout=resolved.timeout_seconds,
max_retries=resolved.max_retries,
max_tokens=resolved.max_tokens,
temperature=0.0,
)
else:
effective_config = OllamaConfig(
base_url=config.base_url,
model=resolved.model_name,
timeout=resolved.timeout_seconds,
max_retries=resolved.max_retries,
retry_base_delay=config.retry_base_delay,
retry_max_delay=config.retry_max_delay,
retry_backoff_multiplier=config.retry_backoff_multiplier,
max_tokens=resolved.max_tokens,
context_window=resolved.context_window,
)
logger.info(
"Thesis rewriter using resolved config: model=%s variant=%s",
resolved.model_name, resolved.variant_id,
"Thesis rewriter using resolved config: model=%s variant=%s provider=%s",
resolved.model_name, resolved.variant_id, provider or "ollama",
)
except Exception:
logger.warning(
@@ -177,7 +193,10 @@ async def rewrite_thesis_with_llm(
client = http_client or httpx.AsyncClient(timeout=effective_config.timeout)
try:
rewritten = await _call_ollama_thesis(client, effective_config, prompts)
if use_vllm:
rewritten = await _call_vllm_thesis(client, effective_config, prompts) # type: ignore[arg-type]
else:
rewritten = await _call_ollama_thesis(client, effective_config, prompts) # type: ignore[arg-type]
duration_ms = int((time.monotonic() - start_time) * 1000)
if rewritten:
@@ -318,3 +337,55 @@ async def _call_ollama_thesis(
)
return content.strip()
async def _call_vllm_thesis(
    client: httpx.AsyncClient,
    config: VLLMConfig,
    prompts: dict[str, str],
) -> str:
    """Make a vLLM chat completion call for thesis rewriting.

    Uses the OpenAI-compatible /v1/chat/completions endpoint and sends a
    single non-streaming request built from the system and user prompts.

    Args:
        client: HTTP client used to send the request.
        config: vLLM settings; ``base_url``, ``model``, ``max_tokens``,
            ``temperature`` and ``api_key`` are read from it.
        prompts: Prompt texts; the ``"system"`` and ``"user"`` keys are
            required.

    Returns:
        The stripped text of the first choice's message, or an empty string
        when the response carries no usable text (no choices, a malformed
        choice, or a null/non-string ``content``).

    Raises:
        KeyError: If ``prompts`` lacks ``"system"`` or ``"user"``.
        Exception: Transport errors from ``client.post`` and the error
            raised by ``raise_for_status()`` on a non-success status are
            not caught here; they propagate to the caller.
    """
    start = time.monotonic()

    payload: dict[str, object] = {
        "model": config.model,
        "messages": [
            {"role": "system", "content": prompts["system"]},
            {"role": "user", "content": prompts["user"]},
        ],
        "max_tokens": config.max_tokens,
        "temperature": config.temperature,
        "stream": False,
    }

    headers: dict[str, str] = {"Content-Type": "application/json"}
    if config.api_key:
        headers["Authorization"] = f"Bearer {config.api_key}"

    # Tolerate a configured base URL that ends in "/" so the request path
    # never becomes "//v1/chat/completions".
    endpoint = f"{str(config.base_url).rstrip('/')}/v1/chat/completions"

    resp = await client.post(endpoint, json=payload, headers=headers)
    _ = resp.raise_for_status()

    duration_ms = int((time.monotonic() - start) * 1000)

    # Walk the response defensively: every level may be missing or have an
    # unexpected type, and "content" may legitimately be null.
    body: object = resp.json()
    choices = body.get("choices") if isinstance(body, dict) else None

    content = ""
    if isinstance(choices, list) and choices:
        first_choice = choices[0]
        msg = first_choice.get("message") if isinstance(first_choice, dict) else None
        raw_content = msg.get("content") if isinstance(msg, dict) else None
        if isinstance(raw_content, str):
            content = raw_content

    logger.debug(
        "vLLM thesis call completed in %dms, response length=%d",
        duration_ms,
        len(content),
    )

    return content.strip()