fix: reduce vLLM default max_tokens to 4096, update model to AxionML/Qwen3.5-9B-NVFP4
The model's max_model_len is 16384 — requesting 32768 output tokens caused HTTP 400 from vLLM. 4096 is a safe default for extraction output.
This commit is contained in:
@@ -61,13 +61,13 @@ class VLLMConfig:
|
|||||||
Requirements: 3.1, 3.2
|
Requirements: 3.1, 3.2
|
||||||
"""
|
"""
|
||||||
base_url: str = "http://192.168.42.254:8000"
|
base_url: str = "http://192.168.42.254:8000"
|
||||||
model: str = "RedHatAI/Qwen3.6-35B-A3B-NVFP4"
|
model: str = "AxionML/Qwen3.5-9B-NVFP4"
|
||||||
timeout: int = 120
|
timeout: int = 120
|
||||||
max_retries: int = 2
|
max_retries: int = 2
|
||||||
retry_base_delay: float = 1.0
|
retry_base_delay: float = 1.0
|
||||||
retry_max_delay: float = 10.0
|
retry_max_delay: float = 10.0
|
||||||
retry_backoff_multiplier: float = 2.0
|
retry_backoff_multiplier: float = 2.0
|
||||||
max_tokens: int = 32768
|
max_tokens: int = 4096
|
||||||
temperature: float = 0.7
|
temperature: float = 0.7
|
||||||
api_key: str = "" # Optional, for authenticated vLLM deployments
|
api_key: str = "" # Optional, for authenticated vLLM deployments
|
||||||
|
|
||||||
@@ -287,7 +287,7 @@ def load_config() -> AppConfig:
|
|||||||
retry_base_delay=float(os.getenv("VLLM_RETRY_BASE_DELAY", "1.0")),
|
retry_base_delay=float(os.getenv("VLLM_RETRY_BASE_DELAY", "1.0")),
|
||||||
retry_max_delay=float(os.getenv("VLLM_RETRY_MAX_DELAY", "10.0")),
|
retry_max_delay=float(os.getenv("VLLM_RETRY_MAX_DELAY", "10.0")),
|
||||||
retry_backoff_multiplier=float(os.getenv("VLLM_RETRY_BACKOFF_MULTIPLIER", "2.0")),
|
retry_backoff_multiplier=float(os.getenv("VLLM_RETRY_BACKOFF_MULTIPLIER", "2.0")),
|
||||||
max_tokens=int(os.getenv("VLLM_MAX_TOKENS", "32768")),
|
max_tokens=int(os.getenv("VLLM_MAX_TOKENS", "4096")),
|
||||||
temperature=float(os.getenv("VLLM_TEMPERATURE", "0.7")),
|
temperature=float(os.getenv("VLLM_TEMPERATURE", "0.7")),
|
||||||
api_key=os.getenv("VLLM_API_KEY", ""),
|
api_key=os.getenv("VLLM_API_KEY", ""),
|
||||||
),
|
),
|
||||||
|
|||||||
@@ -386,11 +386,11 @@ def test_appconfig_vllm_defaults():
|
|||||||
assert hasattr(cfg, "vllm")
|
assert hasattr(cfg, "vllm")
|
||||||
assert isinstance(cfg.vllm, VLLMConfig)
|
assert isinstance(cfg.vllm, VLLMConfig)
|
||||||
assert cfg.vllm.base_url == "http://192.168.42.254:8000"
|
assert cfg.vllm.base_url == "http://192.168.42.254:8000"
|
||||||
assert cfg.vllm.model == "RedHatAI/Qwen3.6-35B-A3B-NVFP4"
|
assert cfg.vllm.model == "AxionML/Qwen3.5-9B-NVFP4"
|
||||||
assert cfg.vllm.timeout == 120
|
assert cfg.vllm.timeout == 120
|
||||||
assert cfg.vllm.max_retries == 2
|
assert cfg.vllm.max_retries == 2
|
||||||
assert cfg.vllm.temperature == 0.7
|
assert cfg.vllm.temperature == 0.7
|
||||||
assert cfg.vllm.max_tokens == 32768
|
assert cfg.vllm.max_tokens == 4096
|
||||||
assert cfg.vllm.api_key == ""
|
assert cfg.vllm.api_key == ""
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user