feat: add remote vLLM support with provider abstraction layer
- LLMClient Protocol for provider-agnostic inference
- VLLMClient for OpenAI-compatible /v1/chat/completions API
- LLM client factory with provider routing (ollama/vllm)
- VLLMConfig with VLLM_* environment variable loading
- Updated extractor worker with health check and provider switching
- Updated event classifier to use LLMClient protocol
- Helm values for vLLM configuration
- 18 unit tests + 6 property-based tests
- Full backward compatibility preserved
This commit is contained in:
@@ -54,6 +54,24 @@ class OllamaConfig:
|
||||
context_window: int = 0 # Ollama num_ctx; 0 = use model default
|
||||
|
||||
|
||||
@dataclass
class VLLMConfig:
    """Configuration for the remote vLLM inference server.

    The defaults declared here match the fallback values ``load_config``
    passes when the corresponding ``VLLM_*`` environment variable is unset.

    Requirements: 3.1, 3.2
    """

    base_url: str = "http://192.168.42.254:8000"
    model: str = "RedHatAI/Qwen3.6-35B-A3B-NVFP4"
    timeout: int = 120  # NOTE(review): presumably seconds — confirm against the client
    max_retries: int = 2
    retry_base_delay: float = 1.0
    retry_max_delay: float = 10.0
    retry_backoff_multiplier: float = 2.0
    max_tokens: int = 32768
    temperature: float = 0.7
    # Optional, for authenticated vLLM deployments.
    # Excluded from the generated __repr__ so the credential does not end up
    # in logs or tracebacks when this config (or a dataclass containing it)
    # is printed. It still participates in __init__ and __eq__.
    api_key: str = field(default="", repr=False)
|
||||
|
||||
|
||||
@dataclass
|
||||
class TrinoConfig:
|
||||
host: str = "localhost"
|
||||
@@ -217,6 +235,7 @@ class AppConfig:
|
||||
redis: RedisConfig = field(default_factory=RedisConfig)
|
||||
minio: MinioConfig = field(default_factory=MinioConfig)
|
||||
ollama: OllamaConfig = field(default_factory=OllamaConfig)
|
||||
vllm: VLLMConfig = field(default_factory=VLLMConfig)
|
||||
trino: TrinoConfig = field(default_factory=TrinoConfig)
|
||||
market_data: MarketDataConfig = field(default_factory=MarketDataConfig)
|
||||
broker: BrokerConfig = field(default_factory=BrokerConfig)
|
||||
@@ -260,6 +279,18 @@ def load_config() -> AppConfig:
|
||||
retry_max_delay=float(os.getenv("OLLAMA_RETRY_MAX_DELAY", "10.0")),
|
||||
retry_backoff_multiplier=float(os.getenv("OLLAMA_RETRY_BACKOFF_MULTIPLIER", "2.0")),
|
||||
),
|
||||
vllm=VLLMConfig(
|
||||
base_url=os.getenv("VLLM_BASE_URL", "http://192.168.42.254:8000"),
|
||||
model=os.getenv("VLLM_MODEL", "RedHatAI/Qwen3.6-35B-A3B-NVFP4"),
|
||||
timeout=int(os.getenv("VLLM_TIMEOUT", "120")),
|
||||
max_retries=int(os.getenv("VLLM_MAX_RETRIES", "2")),
|
||||
retry_base_delay=float(os.getenv("VLLM_RETRY_BASE_DELAY", "1.0")),
|
||||
retry_max_delay=float(os.getenv("VLLM_RETRY_MAX_DELAY", "10.0")),
|
||||
retry_backoff_multiplier=float(os.getenv("VLLM_RETRY_BACKOFF_MULTIPLIER", "2.0")),
|
||||
max_tokens=int(os.getenv("VLLM_MAX_TOKENS", "32768")),
|
||||
temperature=float(os.getenv("VLLM_TEMPERATURE", "0.7")),
|
||||
api_key=os.getenv("VLLM_API_KEY", ""),
|
||||
),
|
||||
trino=TrinoConfig(
|
||||
host=os.getenv("TRINO_HOST", "localhost"),
|
||||
port=int(os.getenv("TRINO_PORT", "8080")),
|
||||
|
||||
Reference in New Issue
Block a user