fix: reduce vLLM default max_tokens to 4096, update model to AxionML/Qwen3.5-9B-NVFP4
The model's max_model_len is 16384 — requesting 32768 output tokens caused HTTP 400 from vLLM. 4096 is a safe default for extraction output.
This commit is contained in:
@@ -386,11 +386,11 @@ def test_appconfig_vllm_defaults():
|
||||
assert hasattr(cfg, "vllm")
|
||||
assert isinstance(cfg.vllm, VLLMConfig)
|
||||
assert cfg.vllm.base_url == "http://192.168.42.254:8000"
|
||||
- assert cfg.vllm.model == "RedHatAI/Qwen3.6-35B-A3B-NVFP4"
|
||||
+ assert cfg.vllm.model == "AxionML/Qwen3.5-9B-NVFP4"
|
||||
assert cfg.vllm.timeout == 120
|
||||
assert cfg.vllm.max_retries == 2
|
||||
assert cfg.vllm.temperature == 0.7
|
||||
- assert cfg.vllm.max_tokens == 32768
|
||||
+ assert cfg.vllm.max_tokens == 4096
|
||||
assert cfg.vllm.api_key == ""
|
||||
|
||||
|
||||
|
||||
Reference in New Issue
Block a user