diff --git a/requirements.txt b/requirements.txt
index 5108fdf..6ede7b7 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -13,6 +13,9 @@ minio>=7.2.0
 # HTTP client
 httpx>=0.27.0
 
+# JSON repair for LLM output
+json-repair>=0.59.0
+
 # Web scraping
 beautifulsoup4>=4.12.0
 requests>=2.31.0
diff --git a/services/extractor/client.py b/services/extractor/client.py
index 21a57c0..9f8c2bc 100644
--- a/services/extractor/client.py
+++ b/services/extractor/client.py
@@ -1,8 +1,9 @@
-"""Ollama client wrapper using structured output format.
+"""Ollama client wrapper for document intelligence extraction.
 
 Sends documents to a local Ollama instance via the /api/chat endpoint
-with the ``format`` parameter set to the extraction JSON schema, ensuring
-the model returns schema-compliant JSON.
+with think=false for speed. Uses json-repair to fix common JSON syntax
+issues in model output since the Ollama format constraint is broken
+with think=false on qwen3.5 models (Ollama bug #14645).
 
 Includes retry logic for invalid or incomplete model responses with
 exponential backoff, error classification, and full audit preservation.
@@ -12,11 +13,14 @@ Requirements: 5.1, 5.2, 5.4
 from __future__ import annotations
 
 import asyncio
+import json
 import logging
+import re
 import time
 from dataclasses import dataclass, field
 
 import httpx
+from json_repair import repair_json
 
 from services.extractor.prompts import (
     build_extraction_prompt,
@@ -45,6 +49,44 @@ def _is_retryable(error: str | None) -> bool:
     return error not in _NON_RETRYABLE_ERRORS
 
 
+_FENCE_RE = re.compile(r"^```(?:json)?\s*\n?(.*?)\n?\s*```\s*$", re.DOTALL)
+
+
+def _strip_markdown_fences(text: str) -> str:
+    """Remove ```json ... ``` wrappers if present."""
+    m = _FENCE_RE.match(text.strip())
+    return m.group(1) if m else text
+
+
+def _repair_json(text: str) -> str:
+    """Return *text* as valid JSON, repairing it with json-repair if needed.
+
+    Valid input is returned untouched. If repair raises or salvages nothing,
+    the original text is returned so downstream validation sees the raw
+    model output instead of an empty string.
+    """
+    try:
+        json.loads(text)
+        return text  # already valid
+    except ValueError:  # json.JSONDecodeError subclasses ValueError
+        pass
+
+    try:
+        repaired = repair_json(text, return_objects=False)
+    except Exception:
+        logger.warning("JSON repair failed, returning original text", exc_info=True)
+        return text
+
+    # json-repair yields an empty result when nothing could be salvaged;
+    # keep the original text rather than reporting a bogus success.
+    if repaired.strip() in ("", '""'):
+        logger.warning("JSON repair failed, returning original text")
+        return text
+
+    logger.info("JSON repaired successfully (%d -> %d chars)", len(text), len(repaired))
+    return repaired
+
+
 @dataclass
 class ExtractionAttempt:
     """Record of a single extraction attempt for audit."""
@@ -209,12 +251,13 @@ class OllamaClient:
         json_schema: dict[str, object],
         document_text: str = "",
     ) -> ExtractionAttempt:
-        """Make a streaming call to Ollama with early-termination guardrails.
+        """Call Ollama with think=false for speed, then repair any malformed JSON.
 
-        Aborts the stream if:
-        - Total generated tokens exceed ``max_tokens``
-        - No new chunk arrives within ``stall_timeout`` seconds
-        - Repetition loop detected in the last ``loop_window`` tokens
+        Uses think=false to avoid the 2-4 minute thinking overhead.
+        Does NOT use the format parameter (Ollama bug #14645 silently
+        ignores format when think=false on qwen3.5 models).
+        Instead, relies on the prompt to produce JSON and repairs
+        common syntax issues with json-repair.
         """
         attempt = ExtractionAttempt(model=self._config.model)
         start = time.monotonic()
@@ -225,11 +268,8 @@ class OllamaClient:
                 {"role": "system", "content": prompts["system"]},
                 {"role": "user", "content": prompts["user"]},
             ],
-            "format": json_schema,
             "stream": False,
-            "options": {
-                "num_predict": 16384,
-            },
+            "think": False,
         }
 
         url = f"{self._config.base_url}/api/chat"
@@ -271,6 +311,12 @@ class OllamaClient:
                 attempt.error = "empty_model_response"
                 return attempt
 
+            # Strip markdown fences if present (model sometimes wraps in ```json ... ```)
+            content = _strip_markdown_fences(content)
+
+            # Try json.loads first; if it fails, attempt repair
+            content = _repair_json(content)
+
             # Validate against extraction schema
             attempt.validation = validate_extraction(content, document_text=document_text)
             if not attempt.validation.valid: