feat: add remote vLLM support with provider abstraction layer

- LLMClient Protocol for provider-agnostic inference
- VLLMClient for OpenAI-compatible /v1/chat/completions API
- LLM client factory with provider routing (ollama/vllm)
- VLLMConfig with VLLM_* environment variable loading
- Updated extractor worker with health check and provider switching
- Updated event classifier to use LLMClient protocol
- Helm values for vLLM configuration
- 18 unit tests + 6 property-based tests
- Full backward compatibility preserved
2026-04-23 08:17:23 +00:00
parent 63e4fb96ea
commit 117b693b19
15 changed files with 1876 additions and 77 deletions
@@ -54,6 +54,24 @@ class OllamaConfig:
    context_window: int = 0  # Ollama num_ctx; 0 = use model default


+@dataclass
+class VLLMConfig:
+    """Configuration for the remote vLLM inference server.
+
+    Requirements: 3.1, 3.2
+    """
+    base_url: str = "http://192.168.42.254:8000"  # private-network address; load_config() reads VLLM_BASE_URL
+    model: str = "RedHatAI/Qwen3.6-35B-A3B-NVFP4"  # load_config() reads VLLM_MODEL
+    timeout: int = 120  # VLLM_TIMEOUT; presumably seconds -- confirm against the consuming client
+    max_retries: int = 2  # VLLM_MAX_RETRIES
+    retry_base_delay: float = 1.0  # VLLM_RETRY_BASE_DELAY; presumably seconds -- confirm
+    retry_max_delay: float = 10.0  # VLLM_RETRY_MAX_DELAY; presumably an upper bound on the delay -- confirm
+    retry_backoff_multiplier: float = 2.0  # VLLM_RETRY_BACKOFF_MULTIPLIER; how it is applied is defined by the client
+    max_tokens: int = 32768  # VLLM_MAX_TOKENS
+    temperature: float = 0.7  # VLLM_TEMPERATURE
+    api_key: str = ""  # Optional, for authenticated vLLM deployments. NOTE(review): shown in the dataclass repr; consider field(repr=False)
+
+
@dataclass
 class TrinoConfig:
    host: str = "localhost"
@@ -217,6 +235,7 @@ class AppConfig:
    redis: RedisConfig = field(default_factory=RedisConfig)
    minio: MinioConfig = field(default_factory=MinioConfig)
    ollama: OllamaConfig = field(default_factory=OllamaConfig)
+    vllm: VLLMConfig = field(default_factory=VLLMConfig)
    trino: TrinoConfig = field(default_factory=TrinoConfig)
    market_data: MarketDataConfig = field(default_factory=MarketDataConfig)
    broker: BrokerConfig = field(default_factory=BrokerConfig)
@@ -260,6 +279,18 @@ def load_config() -> AppConfig:
            retry_max_delay=float(os.getenv("OLLAMA_RETRY_MAX_DELAY", "10.0")),
            retry_backoff_multiplier=float(os.getenv("OLLAMA_RETRY_BACKOFF_MULTIPLIER", "2.0")),
        ),
+        vllm=VLLMConfig(
+            base_url=os.getenv("VLLM_BASE_URL", "http://192.168.42.254:8000"),
+            model=os.getenv("VLLM_MODEL", "RedHatAI/Qwen3.6-35B-A3B-NVFP4"),
+            timeout=int(os.getenv("VLLM_TIMEOUT", "120")),
+            max_retries=int(os.getenv("VLLM_MAX_RETRIES", "2")),
+            retry_base_delay=float(os.getenv("VLLM_RETRY_BASE_DELAY", "1.0")),
+            retry_max_delay=float(os.getenv("VLLM_RETRY_MAX_DELAY", "10.0")),
+            retry_backoff_multiplier=float(os.getenv("VLLM_RETRY_BACKOFF_MULTIPLIER", "2.0")),
+            max_tokens=int(os.getenv("VLLM_MAX_TOKENS", "32768")),
+            temperature=float(os.getenv("VLLM_TEMPERATURE", "0.7")),
+            api_key=os.getenv("VLLM_API_KEY", ""),
+        ),
        trino=TrinoConfig(
            host=os.getenv("TRINO_HOST", "localhost"),
            port=int(os.getenv("TRINO_PORT", "8080")),
@@ -0,0 +1,44 @@
+"""LLM client protocol for provider abstraction.
+
+Defines the structural interface that both OllamaClient and VLLMClient
+must satisfy, using typing.Protocol for duck-typing compatibility.
+
+Requirements: 1.1, 1.2
+"""
+from __future__ import annotations
+
+from typing import TYPE_CHECKING, Protocol, runtime_checkable
+
+if TYPE_CHECKING:
+    from services.extractor.client import ExtractionAttempt
+
+
+@runtime_checkable  # isinstance() checks only verify that the methods exist, not their signatures
+class LLMClient(Protocol):
+    """Protocol defining the contract for LLM inference clients.
+
+    Both OllamaClient and VLLMClient satisfy this protocol via
+    structural subtyping — no inheritance required.
+    """
+
+    async def call_llm(
+        self,
+        prompts: dict[str, str],
+        json_schema: dict[str, object],
+        document_text: str = "",
+    ) -> ExtractionAttempt:
+        """Send a chat completion request and return an extraction attempt.
+
+        Args:
+            prompts: Dict with 'system' and 'user' prompt strings.
+            json_schema: JSON schema hint for structured output.
+            document_text: Optional raw document text for context.
+
+        Returns:
+            An ExtractionAttempt with raw output, validation, and error info.
+        """
+        ...  # protocol stub: conforming clients supply the implementation
+
+    async def close(self) -> None:
+        """Release underlying HTTP resources."""
+        ...  # protocol stub: conforming clients supply the implementation