fix: reduce vLLM default max_tokens to 4096, update model to AxionML/Qwen3.5-9B-NVFP4

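The model's max_model_len is 16384, and vLLM rejects any request whose prompt tokens plus max_tokens exceed that limit — so requesting 32768 output tokens always failed with HTTP 400. 4096 is a safe default for extraction output and leaves room in the context window for the prompt.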
2026-04-23 19:49:34 +00:00
parent f7ae34ef3b
commit 0437943863
2 changed files with 5 additions and 5 deletions
@@ -61,13 +61,13 @@ class VLLMConfig:
    Requirements: 3.1, 3.2
    """
    base_url: str = "http://192.168.42.254:8000"
-    model: str = "RedHatAI/Qwen3.6-35B-A3B-NVFP4"
+    model: str = "AxionML/Qwen3.5-9B-NVFP4"
    timeout: int = 120
    max_retries: int = 2
    retry_base_delay: float = 1.0
    retry_max_delay: float = 10.0
    retry_backoff_multiplier: float = 2.0
-    max_tokens: int = 32768
+    max_tokens: int = 4096
    temperature: float = 0.7
    api_key: str = ""  # Optional, for authenticated vLLM deployments

@@ -287,7 +287,7 @@ def load_config() -> AppConfig:
            retry_base_delay=float(os.getenv("VLLM_RETRY_BASE_DELAY", "1.0")),
            retry_max_delay=float(os.getenv("VLLM_RETRY_MAX_DELAY", "10.0")),
            retry_backoff_multiplier=float(os.getenv("VLLM_RETRY_BACKOFF_MULTIPLIER", "2.0")),
-            max_tokens=int(os.getenv("VLLM_MAX_TOKENS", "32768")),
+            max_tokens=int(os.getenv("VLLM_MAX_TOKENS", "4096")),
            temperature=float(os.getenv("VLLM_TEMPERATURE", "0.7")),
            api_key=os.getenv("VLLM_API_KEY", ""),
        ),