fix: reduce vLLM default max_tokens to 4096, update model to AxionML/Qwen3.5-9B-NVFP4

The model's max_model_len is 16384, so requesting 32768 output tokens
exceeded the context window and vLLM rejected the request with HTTP 400.
4096 is a safe default for extraction output.
This commit is contained in:
Celes Renata
2026-04-23 19:49:34 +00:00
parent f7ae34ef3b
commit 0437943863
2 changed files with 5 additions and 5 deletions
+3 -3
View File
@@ -61,13 +61,13 @@ class VLLMConfig:
Requirements: 3.1, 3.2
"""
base_url: str = "http://192.168.42.254:8000"
model: str = "RedHatAI/Qwen3.6-35B-A3B-NVFP4"
model: str = "AxionML/Qwen3.5-9B-NVFP4"
timeout: int = 120
max_retries: int = 2
retry_base_delay: float = 1.0
retry_max_delay: float = 10.0
retry_backoff_multiplier: float = 2.0
max_tokens: int = 32768
max_tokens: int = 4096
temperature: float = 0.7
api_key: str = "" # Optional, for authenticated vLLM deployments
@@ -287,7 +287,7 @@ def load_config() -> AppConfig:
retry_base_delay=float(os.getenv("VLLM_RETRY_BASE_DELAY", "1.0")),
retry_max_delay=float(os.getenv("VLLM_RETRY_MAX_DELAY", "10.0")),
retry_backoff_multiplier=float(os.getenv("VLLM_RETRY_BACKOFF_MULTIPLIER", "2.0")),
max_tokens=int(os.getenv("VLLM_MAX_TOKENS", "32768")),
max_tokens=int(os.getenv("VLLM_MAX_TOKENS", "4096")),
temperature=float(os.getenv("VLLM_TEMPERATURE", "0.7")),
api_key=os.getenv("VLLM_API_KEY", ""),
),