fix: switch to think=false with json-repair — 20x faster extraction

2026-04-15 02:54:39 +00:00
parent 4f2ae23d42
commit 00044af993
2 changed files with 49 additions and 12 deletions
@@ -13,6 +13,9 @@ minio>=7.2.0
 # HTTP client
 httpx>=0.27.0

+# JSON repair for LLM output
+json-repair>=0.59.0
+
 # Web scraping
 beautifulsoup4>=4.12.0
 requests>=2.31.0
@@ -1,8 +1,9 @@
-"""Ollama client wrapper using structured output format.
+"""Ollama client wrapper for document intelligence extraction.

 Sends documents to a local Ollama instance via the /api/chat endpoint
-with the ``format`` parameter set to the extraction JSON schema, ensuring
-the model returns schema-compliant JSON.
+with think=false for speed. Uses json-repair to fix common JSON syntax
+issues in model output since the Ollama format constraint is broken
+with think=false on qwen3.5 models (Ollama bug #14645).

 Includes retry logic for invalid or incomplete model responses with
 exponential backoff, error classification, and full audit preservation.
@@ -12,11 +13,14 @@ Requirements: 5.1, 5.2, 5.4
 from __future__ import annotations

 import asyncio
+import json
 import logging
+import re
 import time
 from dataclasses import dataclass, field

 import httpx
+from json_repair import repair_json

 from services.extractor.prompts import (
    build_extraction_prompt,
@@ -45,6 +49,32 @@ def _is_retryable(error: str | None) -> bool:
    return error not in _NON_RETRYABLE_ERRORS


+# Matches a reply that is wrapped, in its entirety, in a Markdown code fence.
+# The optional "json" language tag is matched in any letter case so that a
+# "```JSON" opener does not leak the tag into the captured payload (group 1).
+_FENCE_RE = re.compile(
+    r"^```(?:json)?\s*\n?(.*?)\n?\s*```\s*$",
+    re.DOTALL | re.IGNORECASE,
+)
+
+
+def _strip_markdown_fences(text: str) -> str:
+    """Remove a surrounding ```json ... ``` wrapper if present.
+
+    The pattern is applied to a whitespace-stripped copy of *text* and must
+    cover it completely, so fences embedded in other prose are left alone.
+
+    Args:
+        text: Raw model output.
+
+    Returns:
+        The fenced payload when the whole reply is a single fenced block;
+        otherwise *text* exactly as it was passed in (not stripped).
+    """
+    match = _FENCE_RE.match(text.strip())
+    return match.group(1) if match else text
+
+
+def _repair_json(text: str) -> str:
+    """Return *text* as JSON, repairing it with json-repair when needed.
+
+    Args:
+        text: Candidate JSON produced by the model.
+
+    Returns:
+        *text* unchanged when ``json.loads`` already accepts it; otherwise
+        the string produced by ``repair_json``. If the repair raises or
+        yields nothing usable, the original *text* is returned so the caller
+        still has the raw model output to validate and record.
+    """
+    try:
+        json.loads(text)
+    except ValueError:  # json.JSONDecodeError is a ValueError subclass
+        pass
+    else:
+        return text  # already valid
+
+    try:
+        repaired = repair_json(text, return_objects=False)
+    except Exception:
+        # Best-effort step: a repair failure must not abort the extraction,
+        # but keep the traceback so the cause is visible in the logs.
+        logger.warning("JSON repair failed, returning original text", exc_info=True)
+        return text
+
+    # json-repair reports "nothing repairable" by returning an empty string
+    # rather than raising; treat that as a failure instead of replacing the
+    # model output with "".
+    if not isinstance(repaired, str) or not repaired.strip():
+        logger.warning("JSON repair failed, returning original text")
+        return text
+
+    logger.info("JSON repaired successfully (%d -> %d chars)", len(text), len(repaired))
+    return repaired
+
+
@dataclass
 class ExtractionAttempt:
    """Record of a single extraction attempt for audit."""
@@ -209,12 +239,13 @@ class OllamaClient:
        json_schema: dict[str, object],
        document_text: str = "",
    ) -> ExtractionAttempt:
-        """Make a streaming call to Ollama with early-termination guardrails.
+        """Call Ollama with think=false for speed, then repair any malformed JSON.

-        Aborts the stream if:
-        - Total generated tokens exceed ``max_tokens``
-        - No new chunk arrives within ``stall_timeout`` seconds
-        - Repetition loop detected in the last ``loop_window`` tokens
+        Uses think=false to avoid the 2-4 minute thinking overhead.
+        Does NOT use the format parameter (Ollama bug #14645 silently
+        ignores format when think=false on qwen3.5 models).
+        Instead, relies on the prompt to produce JSON and repairs
+        common syntax issues with json-repair.
        """
        attempt = ExtractionAttempt(model=self._config.model)
        start = time.monotonic()
@@ -225,11 +256,8 @@ class OllamaClient:
                {"role": "system", "content": prompts["system"]},
                {"role": "user", "content": prompts["user"]},
            ],
-            "format": json_schema,
            "stream": False,
-            "options": {
-                "num_predict": 16384,
-            },
+            "think": False,
        }

        url = f"{self._config.base_url}/api/chat"
@@ -271,6 +299,12 @@ class OllamaClient:
            attempt.error = "empty_model_response"
            return attempt

+        # Strip markdown fences if present (model sometimes wraps in ```json ... ```)
+        content = _strip_markdown_fences(content)
+
+        # Try json.loads first; if it fails, attempt repair
+        content = _repair_json(content)
+
        # Validate against extraction schema
        attempt.validation = validate_extraction(content, document_text=document_text)
        if not attempt.validation.valid: