feat: add remote vLLM support with provider abstraction layer

- LLMClient Protocol for provider-agnostic inference
- VLLMClient for OpenAI-compatible /v1/chat/completions API
- LLM client factory with provider routing (ollama/vllm)
- VLLMConfig with VLLM_* environment variable loading
- Updated extractor worker with health check and provider switching
- Updated event classifier to use LLMClient protocol
- Helm values for vLLM configuration
- 18 unit tests + 6 property-based tests
- Full backward compatibility preserved
This commit is contained in:
Celes Renata
2026-04-23 08:17:23 +00:00
parent 63e4fb96ea
commit 117b693b19
15 changed files with 1876 additions and 77 deletions
+31
View File
@@ -54,6 +54,24 @@ class OllamaConfig:
context_window: int = 0 # Ollama num_ctx; 0 = use model default
@dataclass
class VLLMConfig:
    """Configuration for the remote vLLM inference server.

    Every field has a default, so ``VLLMConfig()`` is usable as-is;
    ``load_config`` overrides each field from the matching ``VLLM_*``
    environment variable.

    Requirements: 3.1, 3.2
    """

    # Root URL of the vLLM server and the model identifier to request.
    base_url: str = "http://192.168.42.254:8000"
    model: str = "RedHatAI/Qwen3.6-35B-A3B-NVFP4"

    # NOTE(review): presumably a per-request timeout in seconds -- confirm
    # against the client that consumes this config.
    timeout: int = 120

    # Retry tuning. NOTE(review): the retry loop itself is not part of this
    # class; the delay fields are presumably seconds -- confirm in the client.
    max_retries: int = 2
    retry_base_delay: float = 1.0
    retry_max_delay: float = 10.0
    retry_backoff_multiplier: float = 2.0

    # Generation parameters.
    max_tokens: int = 32768
    temperature: float = 0.7

    # Optional, for authenticated vLLM deployments. Excluded from the
    # generated __repr__ so that logging or printing the config (or a
    # traceback that includes it) never exposes the credential.
    api_key: str = field(default="", repr=False)
@dataclass
class TrinoConfig:
host: str = "localhost"
@@ -217,6 +235,7 @@ class AppConfig:
redis: RedisConfig = field(default_factory=RedisConfig)
minio: MinioConfig = field(default_factory=MinioConfig)
ollama: OllamaConfig = field(default_factory=OllamaConfig)
vllm: VLLMConfig = field(default_factory=VLLMConfig)
trino: TrinoConfig = field(default_factory=TrinoConfig)
market_data: MarketDataConfig = field(default_factory=MarketDataConfig)
broker: BrokerConfig = field(default_factory=BrokerConfig)
@@ -260,6 +279,18 @@ def load_config() -> AppConfig:
retry_max_delay=float(os.getenv("OLLAMA_RETRY_MAX_DELAY", "10.0")),
retry_backoff_multiplier=float(os.getenv("OLLAMA_RETRY_BACKOFF_MULTIPLIER", "2.0")),
),
vllm=VLLMConfig(
base_url=os.getenv("VLLM_BASE_URL", "http://192.168.42.254:8000"),
model=os.getenv("VLLM_MODEL", "RedHatAI/Qwen3.6-35B-A3B-NVFP4"),
timeout=int(os.getenv("VLLM_TIMEOUT", "120")),
max_retries=int(os.getenv("VLLM_MAX_RETRIES", "2")),
retry_base_delay=float(os.getenv("VLLM_RETRY_BASE_DELAY", "1.0")),
retry_max_delay=float(os.getenv("VLLM_RETRY_MAX_DELAY", "10.0")),
retry_backoff_multiplier=float(os.getenv("VLLM_RETRY_BACKOFF_MULTIPLIER", "2.0")),
max_tokens=int(os.getenv("VLLM_MAX_TOKENS", "32768")),
temperature=float(os.getenv("VLLM_TEMPERATURE", "0.7")),
api_key=os.getenv("VLLM_API_KEY", ""),
),
trino=TrinoConfig(
host=os.getenv("TRINO_HOST", "localhost"),
port=int(os.getenv("TRINO_PORT", "8080")),