feat: add remote vLLM support with provider abstraction layer
- LLMClient Protocol for provider-agnostic inference
- VLLMClient for OpenAI-compatible /v1/chat/completions API
- LLM client factory with provider routing (ollama/vllm)
- VLLMConfig with VLLM_* environment variable loading
- Updated extractor worker with health check and provider switching
- Updated event classifier to use LLMClient protocol
- Helm values for vLLM configuration
- 18 unit tests + 6 property-based tests
- Full backward compatibility preserved
This commit is contained in:
@@ -54,6 +54,24 @@ class OllamaConfig:
|
||||
context_window: int = 0 # Ollama num_ctx; 0 = use model default
|
||||
|
||||
|
||||
@dataclass
class VLLMConfig:
    """Configuration for the remote vLLM inference server.

    Every field has a default, so ``VLLMConfig()`` is usable without
    arguments. Field order is part of the positional constructor
    signature and must not be rearranged.

    Requirements: 3.1, 3.2
    """

    # Endpoint and model identifier of the vLLM deployment.
    base_url: str = "http://192.168.42.254:8000"
    model: str = "RedHatAI/Qwen3.6-35B-A3B-NVFP4"

    # Request timeout and retry/backoff tuning.
    # NOTE(review): units are presumably seconds; confirm against the client
    # that consumes these values.
    timeout: int = 120
    max_retries: int = 2
    retry_base_delay: float = 1.0
    retry_max_delay: float = 10.0
    retry_backoff_multiplier: float = 2.0

    # Generation parameters.
    max_tokens: int = 32768
    temperature: float = 0.7

    # Optional, for authenticated vLLM deployments.
    # repr=False keeps the credential out of the dataclass-generated
    # __repr__, so logging or printing the config cannot leak it. The
    # attribute, constructor argument, and equality comparison are unchanged.
    api_key: str = field(default="", repr=False)
|
||||
|
||||
|
||||
@dataclass
|
||||
class TrinoConfig:
|
||||
host: str = "localhost"
|
||||
@@ -217,6 +235,7 @@ class AppConfig:
|
||||
redis: RedisConfig = field(default_factory=RedisConfig)
|
||||
minio: MinioConfig = field(default_factory=MinioConfig)
|
||||
ollama: OllamaConfig = field(default_factory=OllamaConfig)
|
||||
vllm: VLLMConfig = field(default_factory=VLLMConfig)
|
||||
trino: TrinoConfig = field(default_factory=TrinoConfig)
|
||||
market_data: MarketDataConfig = field(default_factory=MarketDataConfig)
|
||||
broker: BrokerConfig = field(default_factory=BrokerConfig)
|
||||
@@ -260,6 +279,18 @@ def load_config() -> AppConfig:
|
||||
retry_max_delay=float(os.getenv("OLLAMA_RETRY_MAX_DELAY", "10.0")),
|
||||
retry_backoff_multiplier=float(os.getenv("OLLAMA_RETRY_BACKOFF_MULTIPLIER", "2.0")),
|
||||
),
|
||||
vllm=VLLMConfig(
|
||||
base_url=os.getenv("VLLM_BASE_URL", "http://192.168.42.254:8000"),
|
||||
model=os.getenv("VLLM_MODEL", "RedHatAI/Qwen3.6-35B-A3B-NVFP4"),
|
||||
timeout=int(os.getenv("VLLM_TIMEOUT", "120")),
|
||||
max_retries=int(os.getenv("VLLM_MAX_RETRIES", "2")),
|
||||
retry_base_delay=float(os.getenv("VLLM_RETRY_BASE_DELAY", "1.0")),
|
||||
retry_max_delay=float(os.getenv("VLLM_RETRY_MAX_DELAY", "10.0")),
|
||||
retry_backoff_multiplier=float(os.getenv("VLLM_RETRY_BACKOFF_MULTIPLIER", "2.0")),
|
||||
max_tokens=int(os.getenv("VLLM_MAX_TOKENS", "32768")),
|
||||
temperature=float(os.getenv("VLLM_TEMPERATURE", "0.7")),
|
||||
api_key=os.getenv("VLLM_API_KEY", ""),
|
||||
),
|
||||
trino=TrinoConfig(
|
||||
host=os.getenv("TRINO_HOST", "localhost"),
|
||||
port=int(os.getenv("TRINO_PORT", "8080")),
|
||||
|
||||
@@ -0,0 +1,44 @@
|
||||
"""LLM client protocol for provider abstraction.
|
||||
|
||||
Defines the structural interface that both OllamaClient and VLLMClient
|
||||
must satisfy, using typing.Protocol for duck-typing compatibility.
|
||||
|
||||
Requirements: 1.1, 1.2
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import TYPE_CHECKING, Protocol, runtime_checkable
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from services.extractor.client import ExtractionAttempt
|
||||
|
||||
|
||||
@runtime_checkable
class LLMClient(Protocol):
    """Structural interface shared by the LLM inference clients.

    OllamaClient and VLLMClient are expected to conform to this protocol
    by shape alone; neither needs to subclass it. Because the protocol is
    ``runtime_checkable``, ``isinstance(obj, LLMClient)`` verifies only that
    the methods below exist, not that their signatures match.
    """

    async def call_llm(
        self,
        prompts: dict[str, str],
        json_schema: dict[str, object],
        document_text: str = "",
    ) -> ExtractionAttempt:
        """Issue one chat completion request and report the outcome.

        Args:
            prompts: Mapping holding the 'system' and 'user' prompt strings.
            json_schema: JSON schema hint describing the structured output.
            document_text: Raw document text supplied as optional context.

        Returns:
            An ExtractionAttempt carrying the raw output, validation
            result, and error information.
        """
        ...

    async def close(self) -> None:
        """Release the client's underlying HTTP resources."""
        ...
|
||||
Reference in New Issue
Block a user