feat: add remote vLLM support with provider abstraction layer

- LLMClient Protocol for provider-agnostic inference
- VLLMClient for OpenAI-compatible /v1/chat/completions API
- LLM client factory with provider routing (ollama/vllm)
- VLLMConfig with VLLM_* environment variable loading
- Updated extractor worker with health check and provider switching
- Updated event classifier to use LLMClient protocol
- Helm values for vLLM configuration
- 18 unit tests + 6 property-based tests
- Full backward compatibility preserved
2026-04-23 08:17:23 +00:00
parent 63e4fb96ea
commit 117b693b19
15 changed files with 1876 additions and 77 deletions
@@ -54,6 +54,24 @@ class OllamaConfig:
    context_window: int = 0  # Ollama num_ctx; 0 = use model default


+@dataclass
+class VLLMConfig:
+    """Configuration for the remote vLLM inference server.
+
+    Requirements: 3.1, 3.2
+    """
+    base_url: str = "http://192.168.42.254:8000"  # env: VLLM_BASE_URL; default is a private-network (192.168.x.x) address
+    model: str = "RedHatAI/Qwen3.6-35B-A3B-NVFP4"  # env: VLLM_MODEL
+    timeout: int = 120  # env: VLLM_TIMEOUT; presumably seconds -- TODO confirm against the client
+    max_retries: int = 2  # env: VLLM_MAX_RETRIES
+    retry_base_delay: float = 1.0  # env: VLLM_RETRY_BASE_DELAY; presumably seconds -- TODO confirm
+    retry_max_delay: float = 10.0  # env: VLLM_RETRY_MAX_DELAY; presumably seconds -- TODO confirm
+    retry_backoff_multiplier: float = 2.0  # env: VLLM_RETRY_BACKOFF_MULTIPLIER
+    max_tokens: int = 32768  # env: VLLM_MAX_TOKENS
+    temperature: float = 0.7  # env: VLLM_TEMPERATURE
+    api_key: str = ""  # env: VLLM_API_KEY; optional, for authenticated vLLM deployments
+
+
@dataclass
 class TrinoConfig:
    host: str = "localhost"
@@ -217,6 +235,7 @@ class AppConfig:
    redis: RedisConfig = field(default_factory=RedisConfig)
    minio: MinioConfig = field(default_factory=MinioConfig)
    ollama: OllamaConfig = field(default_factory=OllamaConfig)
+    vllm: VLLMConfig = field(default_factory=VLLMConfig)
    trino: TrinoConfig = field(default_factory=TrinoConfig)
    market_data: MarketDataConfig = field(default_factory=MarketDataConfig)
    broker: BrokerConfig = field(default_factory=BrokerConfig)
@@ -260,6 +279,18 @@ def load_config() -> AppConfig:
            retry_max_delay=float(os.getenv("OLLAMA_RETRY_MAX_DELAY", "10.0")),
            retry_backoff_multiplier=float(os.getenv("OLLAMA_RETRY_BACKOFF_MULTIPLIER", "2.0")),
        ),
+        vllm=VLLMConfig(
+            base_url=os.getenv("VLLM_BASE_URL", "http://192.168.42.254:8000"),
+            model=os.getenv("VLLM_MODEL", "RedHatAI/Qwen3.6-35B-A3B-NVFP4"),
+            timeout=int(os.getenv("VLLM_TIMEOUT", "120")),
+            max_retries=int(os.getenv("VLLM_MAX_RETRIES", "2")),
+            retry_base_delay=float(os.getenv("VLLM_RETRY_BASE_DELAY", "1.0")),
+            retry_max_delay=float(os.getenv("VLLM_RETRY_MAX_DELAY", "10.0")),
+            retry_backoff_multiplier=float(os.getenv("VLLM_RETRY_BACKOFF_MULTIPLIER", "2.0")),
+            max_tokens=int(os.getenv("VLLM_MAX_TOKENS", "32768")),
+            temperature=float(os.getenv("VLLM_TEMPERATURE", "0.7")),
+            api_key=os.getenv("VLLM_API_KEY", ""),
+        ),
        trino=TrinoConfig(
            host=os.getenv("TRINO_HOST", "localhost"),
            port=int(os.getenv("TRINO_PORT", "8080")),