fix: switch to think=false with json-repair — 20x faster extraction
This commit is contained in:
@@ -13,6 +13,9 @@ minio>=7.2.0
|
||||
# HTTP client
|
||||
httpx>=0.27.0
|
||||
|
||||
# JSON repair for LLM output
|
||||
json-repair>=0.59.0
|
||||
|
||||
# Web scraping
|
||||
beautifulsoup4>=4.12.0
|
||||
requests>=2.31.0
|
||||
|
||||
@@ -1,8 +1,9 @@
|
||||
"""Ollama client wrapper using structured output format.
|
||||
"""Ollama client wrapper for document intelligence extraction.
|
||||
|
||||
Sends documents to a local Ollama instance via the /api/chat endpoint
|
||||
with the ``format`` parameter set to the extraction JSON schema, ensuring
|
||||
the model returns schema-compliant JSON.
|
||||
with think=false for speed. Uses json-repair to fix common JSON syntax
|
||||
issues in model output since the Ollama format constraint is broken
|
||||
with think=false on qwen3.5 models (Ollama bug #14645).
|
||||
|
||||
Includes retry logic for invalid or incomplete model responses with
|
||||
exponential backoff, error classification, and full audit preservation.
|
||||
@@ -12,11 +13,14 @@ Requirements: 5.1, 5.2, 5.4
|
||||
from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
import json
|
||||
import logging
|
||||
import re
|
||||
import time
|
||||
from dataclasses import dataclass, field
|
||||
|
||||
import httpx
|
||||
from json_repair import repair_json
|
||||
|
||||
from services.extractor.prompts import (
|
||||
build_extraction_prompt,
|
||||
@@ -45,6 +49,32 @@ def _is_retryable(error: str | None) -> bool:
|
||||
return error not in _NON_RETRYABLE_ERRORS
|
||||
|
||||
|
||||
_FENCE_RE = re.compile(r"^```(?:json)?\s*\n?(.*?)\n?\s*```\s*$", re.DOTALL)
|
||||
|
||||
|
||||
def _strip_markdown_fences(text: str) -> str:
|
||||
"""Remove ```json ... ``` wrappers if present."""
|
||||
m = _FENCE_RE.match(text.strip())
|
||||
return m.group(1) if m else text
|
||||
|
||||
|
||||
def _repair_json(text: str) -> str:
|
||||
"""Try json.loads first; if it fails, repair with json-repair."""
|
||||
try:
|
||||
json.loads(text)
|
||||
return text # already valid
|
||||
except (json.JSONDecodeError, ValueError):
|
||||
pass
|
||||
|
||||
try:
|
||||
repaired = repair_json(text, return_objects=False)
|
||||
logger.info("JSON repaired successfully (%d -> %d chars)", len(text), len(repaired))
|
||||
return repaired
|
||||
except Exception:
|
||||
logger.warning("JSON repair failed, returning original text")
|
||||
return text
|
||||
|
||||
|
||||
@dataclass
|
||||
class ExtractionAttempt:
|
||||
"""Record of a single extraction attempt for audit."""
|
||||
@@ -209,12 +239,13 @@ class OllamaClient:
|
||||
json_schema: dict[str, object],
|
||||
document_text: str = "",
|
||||
) -> ExtractionAttempt:
|
||||
"""Make a streaming call to Ollama with early-termination guardrails.
|
||||
"""Call Ollama with think=false for speed, then repair any malformed JSON.
|
||||
|
||||
Aborts the stream if:
|
||||
- Total generated tokens exceed ``max_tokens``
|
||||
- No new chunk arrives within ``stall_timeout`` seconds
|
||||
- Repetition loop detected in the last ``loop_window`` tokens
|
||||
Uses think=false to avoid the 2-4 minute thinking overhead.
|
||||
Does NOT use the format parameter (Ollama bug #14645 silently
|
||||
ignores format when think=false on qwen3.5 models).
|
||||
Instead, relies on the prompt to produce JSON and repairs
|
||||
common syntax issues with json-repair.
|
||||
"""
|
||||
attempt = ExtractionAttempt(model=self._config.model)
|
||||
start = time.monotonic()
|
||||
@@ -225,11 +256,8 @@ class OllamaClient:
|
||||
{"role": "system", "content": prompts["system"]},
|
||||
{"role": "user", "content": prompts["user"]},
|
||||
],
|
||||
"format": json_schema,
|
||||
"stream": False,
|
||||
"options": {
|
||||
"num_predict": 16384,
|
||||
},
|
||||
"think": False,
|
||||
}
|
||||
|
||||
url = f"{self._config.base_url}/api/chat"
|
||||
@@ -271,6 +299,12 @@ class OllamaClient:
|
||||
attempt.error = "empty_model_response"
|
||||
return attempt
|
||||
|
||||
# Strip markdown fences if present (model sometimes wraps in ```json ... ```)
|
||||
content = _strip_markdown_fences(content)
|
||||
|
||||
# Try json.loads first; if it fails, attempt repair
|
||||
content = _repair_json(content)
|
||||
|
||||
# Validate against extraction schema
|
||||
attempt.validation = validate_extraction(content, document_text=document_text)
|
||||
if not attempt.validation.valid:
|
||||
|
||||
Reference in New Issue
Block a user