fix: switch to think=false with json-repair — 20x faster extraction
This commit is contained in:
@@ -13,6 +13,9 @@ minio>=7.2.0
|
|||||||
# HTTP client
|
# HTTP client
|
||||||
httpx>=0.27.0
|
httpx>=0.27.0
|
||||||
|
|
||||||
|
# JSON repair for LLM output
|
||||||
|
json-repair>=0.59.0
|
||||||
|
|
||||||
# Web scraping
|
# Web scraping
|
||||||
beautifulsoup4>=4.12.0
|
beautifulsoup4>=4.12.0
|
||||||
requests>=2.31.0
|
requests>=2.31.0
|
||||||
|
|||||||
@@ -1,8 +1,9 @@
|
|||||||
"""Ollama client wrapper using structured output format.
|
"""Ollama client wrapper for document intelligence extraction.
|
||||||
|
|
||||||
Sends documents to a local Ollama instance via the /api/chat endpoint
|
Sends documents to a local Ollama instance via the /api/chat endpoint
|
||||||
with the ``format`` parameter set to the extraction JSON schema, ensuring
|
with think=false for speed. Uses json-repair to fix common JSON syntax
|
||||||
the model returns schema-compliant JSON.
|
issues in model output since the Ollama format constraint is broken
|
||||||
|
with think=false on qwen3.5 models (Ollama bug #14645).
|
||||||
|
|
||||||
Includes retry logic for invalid or incomplete model responses with
|
Includes retry logic for invalid or incomplete model responses with
|
||||||
exponential backoff, error classification, and full audit preservation.
|
exponential backoff, error classification, and full audit preservation.
|
||||||
@@ -12,11 +13,14 @@ Requirements: 5.1, 5.2, 5.4
|
|||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
import asyncio
|
import asyncio
|
||||||
|
import json
|
||||||
import logging
|
import logging
|
||||||
|
import re
|
||||||
import time
|
import time
|
||||||
from dataclasses import dataclass, field
|
from dataclasses import dataclass, field
|
||||||
|
|
||||||
import httpx
|
import httpx
|
||||||
|
from json_repair import repair_json
|
||||||
|
|
||||||
from services.extractor.prompts import (
|
from services.extractor.prompts import (
|
||||||
build_extraction_prompt,
|
build_extraction_prompt,
|
||||||
@@ -45,6 +49,32 @@ def _is_retryable(error: str | None) -> bool:
|
|||||||
return error not in _NON_RETRYABLE_ERRORS
|
return error not in _NON_RETRYABLE_ERRORS
|
||||||
|
|
||||||
|
|
||||||
|
_FENCE_RE = re.compile(r"^```(?:json)?\s*\n?(.*?)\n?\s*```\s*$", re.DOTALL)
|
||||||
|
|
||||||
|
|
||||||
|
def _strip_markdown_fences(text: str) -> str:
|
||||||
|
"""Remove ```json ... ``` wrappers if present."""
|
||||||
|
m = _FENCE_RE.match(text.strip())
|
||||||
|
return m.group(1) if m else text
|
||||||
|
|
||||||
|
|
||||||
|
def _repair_json(text: str) -> str:
|
||||||
|
"""Try json.loads first; if it fails, repair with json-repair."""
|
||||||
|
try:
|
||||||
|
json.loads(text)
|
||||||
|
return text # already valid
|
||||||
|
except (json.JSONDecodeError, ValueError):
|
||||||
|
pass
|
||||||
|
|
||||||
|
try:
|
||||||
|
repaired = repair_json(text, return_objects=False)
|
||||||
|
logger.info("JSON repaired successfully (%d -> %d chars)", len(text), len(repaired))
|
||||||
|
return repaired
|
||||||
|
except Exception:
|
||||||
|
logger.warning("JSON repair failed, returning original text")
|
||||||
|
return text
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
class ExtractionAttempt:
|
class ExtractionAttempt:
|
||||||
"""Record of a single extraction attempt for audit."""
|
"""Record of a single extraction attempt for audit."""
|
||||||
@@ -209,12 +239,13 @@ class OllamaClient:
|
|||||||
json_schema: dict[str, object],
|
json_schema: dict[str, object],
|
||||||
document_text: str = "",
|
document_text: str = "",
|
||||||
) -> ExtractionAttempt:
|
) -> ExtractionAttempt:
|
||||||
"""Make a streaming call to Ollama with early-termination guardrails.
|
"""Call Ollama with think=false for speed, then repair any malformed JSON.
|
||||||
|
|
||||||
Aborts the stream if:
|
Uses think=false to avoid the 2-4 minute thinking overhead.
|
||||||
- Total generated tokens exceed ``max_tokens``
|
Does NOT use the format parameter (Ollama bug #14645 silently
|
||||||
- No new chunk arrives within ``stall_timeout`` seconds
|
ignores format when think=false on qwen3.5 models).
|
||||||
- Repetition loop detected in the last ``loop_window`` tokens
|
Instead, relies on the prompt to produce JSON and repairs
|
||||||
|
common syntax issues with json-repair.
|
||||||
"""
|
"""
|
||||||
attempt = ExtractionAttempt(model=self._config.model)
|
attempt = ExtractionAttempt(model=self._config.model)
|
||||||
start = time.monotonic()
|
start = time.monotonic()
|
||||||
@@ -225,11 +256,8 @@ class OllamaClient:
|
|||||||
{"role": "system", "content": prompts["system"]},
|
{"role": "system", "content": prompts["system"]},
|
||||||
{"role": "user", "content": prompts["user"]},
|
{"role": "user", "content": prompts["user"]},
|
||||||
],
|
],
|
||||||
"format": json_schema,
|
|
||||||
"stream": False,
|
"stream": False,
|
||||||
"options": {
|
"think": False,
|
||||||
"num_predict": 16384,
|
|
||||||
},
|
|
||||||
}
|
}
|
||||||
|
|
||||||
url = f"{self._config.base_url}/api/chat"
|
url = f"{self._config.base_url}/api/chat"
|
||||||
@@ -271,6 +299,12 @@ class OllamaClient:
|
|||||||
attempt.error = "empty_model_response"
|
attempt.error = "empty_model_response"
|
||||||
return attempt
|
return attempt
|
||||||
|
|
||||||
|
# Strip markdown fences if present (model sometimes wraps in ```json ... ```)
|
||||||
|
content = _strip_markdown_fences(content)
|
||||||
|
|
||||||
|
# Try json.loads first; if it fails, attempt repair
|
||||||
|
content = _repair_json(content)
|
||||||
|
|
||||||
# Validate against extraction schema
|
# Validate against extraction schema
|
||||||
attempt.validation = validate_extraction(content, document_text=document_text)
|
attempt.validation = validate_extraction(content, document_text=document_text)
|
||||||
if not attempt.validation.valid:
|
if not attempt.validation.valid:
|
||||||
|
|||||||
Reference in New Issue
Block a user