fix: switch to think=false with json-repair — 20x faster extraction

2026-04-15 02:54:39 +00:00
parent 4f2ae23d42
commit 00044af993
2 changed files with 49 additions and 12 deletions
@@ -13,6 +13,9 @@ minio>=7.2.0
 # HTTP client
 httpx>=0.27.0
 # JSON repair for LLM output
 json-repair>=0.59.0
 # Web scraping
 beautifulsoup4>=4.12.0
 requests>=2.31.0
@@ -1,8 +1,9 @@
-"""Ollama client wrapper using structured output format.
+"""Ollama client wrapper for document intelligence extraction.
 Sends documents to a local Ollama instance via the /api/chat endpoint
-with the ``format`` parameter set to the extraction JSON schema, ensuring
+with think=false for speed. Uses json-repair to fix common JSON syntax
-the model returns schema-compliant JSON.
+issues in model output since the Ollama format constraint is broken
 with think=false on qwen3.5 models (Ollama bug #14645).
 Includes retry logic for invalid or incomplete model responses with
 exponential backoff, error classification, and full audit preservation.
@@ -12,11 +13,14 @@ Requirements: 5.1, 5.2, 5.4
 from __future__ import annotations
 import asyncio
 import json
 import logging
 import re
 import time
 from dataclasses import dataclass, field
 import httpx
 from json_repair import repair_json
 from services.extractor.prompts import (
    build_extraction_prompt,
@@ -45,6 +49,32 @@ def _is_retryable(error: str | None) -> bool:
    return error not in _NON_RETRYABLE_ERRORS
 # Matches text that consists of exactly one fenced block, optionally tagged
 # ``json``; group 1 is the fence payload. DOTALL lets the payload span lines.
 _FENCE_RE = re.compile(r"^```(?:json)?\s*\n?(.*?)\n?\s*```\s*$", re.DOTALL)
 def _strip_markdown_fences(text: str) -> str:
    """Return the payload of a Markdown code fence that wraps *text*.

    The fence is only recognised when the whitespace-stripped input is a
    single fenced block (optionally tagged ``json``) from start to end.
    Any other input, including a fence preceded by other text, is
    returned exactly as it was passed in.
    """
    fenced = _FENCE_RE.match(text.strip())
    if fenced is None:
        return text
    return fenced.group(1)
 def _repair_json(text: str) -> str:
    """Return *text* as JSON, repairing it with json-repair only when needed.

    ``text`` is returned untouched when ``json.loads`` already accepts it.
    Otherwise it is handed to ``repair_json`` and the repaired string is
    returned, provided it is non-empty. If the repair raises, or comes
    back empty, the original text is returned instead so that whatever
    consumes the result still sees what the model actually produced.

    This function never raises on malformed JSON; it is best-effort.
    """
    try:
        json.loads(text)
    except ValueError:  # json.JSONDecodeError is a ValueError subclass
        pass
    else:
        return text  # already valid, nothing to repair

    try:
        repaired = repair_json(text, return_objects=False)
    except Exception:
        # Deliberately broad: a failing repair must not abort the extraction.
        # exc_info keeps the traceback so the failure is still diagnosable.
        logger.warning("JSON repair failed, returning original text", exc_info=True)
        return text

    if not repaired:
        # An empty result means nothing could be salvaged; reporting that as
        # a successful repair would hide the original output from later steps.
        logger.warning("JSON repair produced no output, returning original text")
        return text

    logger.info("JSON repaired successfully (%d -> %d chars)", len(text), len(repaired))
    return repaired
@dataclass
 class ExtractionAttempt:
    """Record of a single extraction attempt for audit."""
@@ -209,12 +239,13 @@ class OllamaClient:
        json_schema: dict[str, object],
        document_text: str = "",
    ) -> ExtractionAttempt:
-        """Make a streaming call to Ollama with early-termination guardrails.
+        """Call Ollama with think=false for speed, then repair any malformed JSON.
-        Aborts the stream if:
+        Uses think=false to avoid the 2-4 minute thinking overhead.
-        - Total generated tokens exceed ``max_tokens``
+        Does NOT use the format parameter (Ollama bug #14645 silently
-        - No new chunk arrives within ``stall_timeout`` seconds
+        ignores format when think=false on qwen3.5 models).
-        - Repetition loop detected in the last ``loop_window`` tokens
+        Instead, relies on the prompt to produce JSON and repairs
        common syntax issues with json-repair.
        """
        attempt = ExtractionAttempt(model=self._config.model)
        start = time.monotonic()
@@ -225,11 +256,8 @@ class OllamaClient:
                {"role": "system", "content": prompts["system"]},
                {"role": "user", "content": prompts["user"]},
            ],
            "format": json_schema,
            "stream": False,
-            "options": {
+            "think": False,
                "num_predict": 16384,
            },
        }
        url = f"{self._config.base_url}/api/chat"
@@ -271,6 +299,12 @@ class OllamaClient:
            attempt.error = "empty_model_response"
            return attempt
        # Strip markdown fences if present (model sometimes wraps in ```json ... ```)
        content = _strip_markdown_fences(content)
        # Try json.loads first; if it fails, attempt repair
        content = _repair_json(content)
        # Validate against extraction schema
        attempt.validation = validate_extraction(content, document_text=document_text)
        if not attempt.validation.valid: