fix: switch to think=false with json-repair — 20x faster extraction

This commit is contained in:
Celes Renata
2026-04-15 02:54:39 +00:00
parent 4f2ae23d42
commit 00044af993
2 changed files with 49 additions and 12 deletions
+3
View File
@@ -13,6 +13,9 @@ minio>=7.2.0
# HTTP client # HTTP client
httpx>=0.27.0 httpx>=0.27.0
# JSON repair for LLM output
json-repair>=0.59.0
# Web scraping # Web scraping
beautifulsoup4>=4.12.0 beautifulsoup4>=4.12.0
requests>=2.31.0 requests>=2.31.0
+46 -12
View File
@@ -1,8 +1,9 @@
"""Ollama client wrapper using structured output format. """Ollama client wrapper for document intelligence extraction.
Sends documents to a local Ollama instance via the /api/chat endpoint Sends documents to a local Ollama instance via the /api/chat endpoint
with the ``format`` parameter set to the extraction JSON schema, ensuring with think=false for speed. Uses json-repair to fix common JSON syntax
the model returns schema-compliant JSON. issues in model output since the Ollama format constraint is broken
with think=false on qwen3.5 models (Ollama bug #14645).
Includes retry logic for invalid or incomplete model responses with Includes retry logic for invalid or incomplete model responses with
exponential backoff, error classification, and full audit preservation. exponential backoff, error classification, and full audit preservation.
@@ -12,11 +13,14 @@ Requirements: 5.1, 5.2, 5.4
from __future__ import annotations from __future__ import annotations
import asyncio import asyncio
import json
import logging import logging
import re
import time import time
from dataclasses import dataclass, field from dataclasses import dataclass, field
import httpx import httpx
from json_repair import repair_json
from services.extractor.prompts import ( from services.extractor.prompts import (
build_extraction_prompt, build_extraction_prompt,
@@ -45,6 +49,32 @@ def _is_retryable(error: str | None) -> bool:
return error not in _NON_RETRYABLE_ERRORS return error not in _NON_RETRYABLE_ERRORS
_FENCE_RE = re.compile(r"^```(?:json)?\s*\n?(.*?)\n?\s*```\s*$", re.DOTALL)
def _strip_markdown_fences(text: str) -> str:
"""Remove ```json ... ``` wrappers if present."""
m = _FENCE_RE.match(text.strip())
return m.group(1) if m else text
def _repair_json(text: str) -> str:
"""Try json.loads first; if it fails, repair with json-repair."""
try:
json.loads(text)
return text # already valid
except (json.JSONDecodeError, ValueError):
pass
try:
repaired = repair_json(text, return_objects=False)
logger.info("JSON repaired successfully (%d -> %d chars)", len(text), len(repaired))
return repaired
except Exception:
logger.warning("JSON repair failed, returning original text")
return text
@dataclass @dataclass
class ExtractionAttempt: class ExtractionAttempt:
"""Record of a single extraction attempt for audit.""" """Record of a single extraction attempt for audit."""
@@ -209,12 +239,13 @@ class OllamaClient:
json_schema: dict[str, object], json_schema: dict[str, object],
document_text: str = "", document_text: str = "",
) -> ExtractionAttempt: ) -> ExtractionAttempt:
"""Make a streaming call to Ollama with early-termination guardrails. """Call Ollama with think=false for speed, then repair any malformed JSON.
Aborts the stream if: Uses think=false to avoid the 2-4 minute thinking overhead.
- Total generated tokens exceed ``max_tokens`` Does NOT use the format parameter (Ollama bug #14645 silently
- No new chunk arrives within ``stall_timeout`` seconds ignores format when think=false on qwen3.5 models).
- Repetition loop detected in the last ``loop_window`` tokens Instead, relies on the prompt to produce JSON and repairs
common syntax issues with json-repair.
""" """
attempt = ExtractionAttempt(model=self._config.model) attempt = ExtractionAttempt(model=self._config.model)
start = time.monotonic() start = time.monotonic()
@@ -225,11 +256,8 @@ class OllamaClient:
{"role": "system", "content": prompts["system"]}, {"role": "system", "content": prompts["system"]},
{"role": "user", "content": prompts["user"]}, {"role": "user", "content": prompts["user"]},
], ],
"format": json_schema,
"stream": False, "stream": False,
"options": { "think": False,
"num_predict": 16384,
},
} }
url = f"{self._config.base_url}/api/chat" url = f"{self._config.base_url}/api/chat"
@@ -271,6 +299,12 @@ class OllamaClient:
attempt.error = "empty_model_response" attempt.error = "empty_model_response"
return attempt return attempt
# Strip markdown fences if present (model sometimes wraps in ```json ... ```)
content = _strip_markdown_fences(content)
# Try json.loads first; if it fails, attempt repair
content = _repair_json(content)
# Validate against extraction schema # Validate against extraction schema
attempt.validation = validate_extraction(content, document_text=document_text) attempt.validation = validate_extraction(content, document_text=document_text)
if not attempt.validation.valid: if not attempt.validation.valid: