fix: reduce vLLM default max_tokens to 4096, update model to AxionML/Qwen3.5-9B-NVFP4
The model's max_model_len is 16384, so requesting 32768 output tokens exceeded the model's context length and caused vLLM to reject the request with HTTP 400. 4096 is a safe default for extraction output.
This commit is contained in:
@@ -61,13 +61,13 @@ class VLLMConfig:
|
||||
Requirements: 3.1, 3.2
|
||||
"""
|
||||
base_url: str = "http://192.168.42.254:8000"
|
||||
-model: str = "RedHatAI/Qwen3.6-35B-A3B-NVFP4"
|
||||
+model: str = "AxionML/Qwen3.5-9B-NVFP4"
|
||||
timeout: int = 120
|
||||
max_retries: int = 2
|
||||
retry_base_delay: float = 1.0
|
||||
retry_max_delay: float = 10.0
|
||||
retry_backoff_multiplier: float = 2.0
|
||||
-max_tokens: int = 32768
|
||||
+max_tokens: int = 4096
|
||||
temperature: float = 0.7
|
||||
api_key: str = "" # Optional, for authenticated vLLM deployments
|
||||
|
||||
@@ -287,7 +287,7 @@ def load_config() -> AppConfig:
|
||||
retry_base_delay=float(os.getenv("VLLM_RETRY_BASE_DELAY", "1.0")),
|
||||
retry_max_delay=float(os.getenv("VLLM_RETRY_MAX_DELAY", "10.0")),
|
||||
retry_backoff_multiplier=float(os.getenv("VLLM_RETRY_BACKOFF_MULTIPLIER", "2.0")),
|
||||
-max_tokens=int(os.getenv("VLLM_MAX_TOKENS", "32768")),
|
||||
+max_tokens=int(os.getenv("VLLM_MAX_TOKENS", "4096")),
|
||||
temperature=float(os.getenv("VLLM_TEMPERATURE", "0.7")),
|
||||
api_key=os.getenv("VLLM_API_KEY", ""),
|
||||
),
|
||||
|
||||
Reference in New Issue
Block a user