fix: switch to think=false with json-repair — 20x faster extraction

This commit is contained in:
Celes Renata
2026-04-15 02:54:39 +00:00
parent 4f2ae23d42
commit 00044af993
2 changed files with 49 additions and 12 deletions
+46 -12
View File
@@ -1,8 +1,9 @@
"""Ollama client wrapper using structured output format.
"""Ollama client wrapper for document intelligence extraction.
Sends documents to a local Ollama instance via the /api/chat endpoint
with the ``format`` parameter set to the extraction JSON schema, ensuring
the model returns schema-compliant JSON.
with think=false for speed. Uses json-repair to fix common JSON syntax
issues in model output since the Ollama format constraint is broken
with think=false on qwen3.5 models (Ollama bug #14645).
Includes retry logic for invalid or incomplete model responses with
exponential backoff, error classification, and full audit preservation.
@@ -12,11 +13,14 @@ Requirements: 5.1, 5.2, 5.4
from __future__ import annotations
import asyncio
import json
import logging
import re
import time
from dataclasses import dataclass, field
import httpx
from json_repair import repair_json
from services.extractor.prompts import (
build_extraction_prompt,
@@ -45,6 +49,32 @@ def _is_retryable(error: str | None) -> bool:
return error not in _NON_RETRYABLE_ERRORS
_FENCE_RE = re.compile(r"^```(?:json)?\s*\n?(.*?)\n?\s*```\s*$", re.DOTALL)
def _strip_markdown_fences(text: str) -> str:
"""Remove ```json ... ``` wrappers if present."""
m = _FENCE_RE.match(text.strip())
return m.group(1) if m else text
def _repair_json(text: str) -> str:
"""Try json.loads first; if it fails, repair with json-repair."""
try:
json.loads(text)
return text # already valid
except (json.JSONDecodeError, ValueError):
pass
try:
repaired = repair_json(text, return_objects=False)
logger.info("JSON repaired successfully (%d -> %d chars)", len(text), len(repaired))
return repaired
except Exception:
logger.warning("JSON repair failed, returning original text")
return text
@dataclass
class ExtractionAttempt:
"""Record of a single extraction attempt for audit."""
@@ -209,12 +239,13 @@ class OllamaClient:
json_schema: dict[str, object],
document_text: str = "",
) -> ExtractionAttempt:
"""Make a streaming call to Ollama with early-termination guardrails.
"""Call Ollama with think=false for speed, then repair any malformed JSON.
Aborts the stream if:
- Total generated tokens exceed ``max_tokens``
- No new chunk arrives within ``stall_timeout`` seconds
- Repetition loop detected in the last ``loop_window`` tokens
Uses think=false to avoid the 2-4 minute thinking overhead.
Does NOT use the format parameter (Ollama bug #14645 silently
ignores format when think=false on qwen3.5 models).
Instead, relies on the prompt to produce JSON and repairs
common syntax issues with json-repair.
"""
attempt = ExtractionAttempt(model=self._config.model)
start = time.monotonic()
@@ -225,11 +256,8 @@ class OllamaClient:
{"role": "system", "content": prompts["system"]},
{"role": "user", "content": prompts["user"]},
],
"format": json_schema,
"stream": False,
"options": {
"num_predict": 16384,
},
"think": False,
}
url = f"{self._config.base_url}/api/chat"
@@ -271,6 +299,12 @@ class OllamaClient:
attempt.error = "empty_model_response"
return attempt
# Strip markdown fences if present (model sometimes wraps in ```json ... ```)
content = _strip_markdown_fences(content)
# Try json.loads first; if it fails, attempt repair
content = _repair_json(content)
# Validate against extraction schema
attempt.validation = validate_extraction(content, document_text=document_text)
if not attempt.validation.valid: