diff --git a/services/extractor/event_classifier.py b/services/extractor/event_classifier.py index 17bc4a4..52c8552 100644 --- a/services/extractor/event_classifier.py +++ b/services/extractor/event_classifier.py @@ -291,13 +291,30 @@ def _parse_classification_response( cleaned = _strip_markdown_fences(raw_json) cleaned = _repair_json(cleaned) + + # DEBUG: log raw vs cleaned to diagnose persistent list issue + logger.info( + "Classification parse debug doc=%s raw_len=%d cleaned_len=%d raw_start=%s cleaned_start=%s", + document_id, len(raw_json), len(cleaned), + repr(raw_json[:300]), repr(cleaned[:300]), + ) + data = json.loads(cleaned) - # Model sometimes wraps the object in a list — unwrap it - if isinstance(data, list) and len(data) == 1 and isinstance(data[0], dict): - data = data[0] + # Model sometimes wraps the object in a single-element list — unwrap it + if isinstance(data, list): + if len(data) == 1 and isinstance(data[0], dict): + data = data[0] + elif len(data) == 0: + raise ValueError( + f"Empty list from model for document {document_id}. " + f"Raw output ({len(raw_json)} chars): {raw_json[:500]}" + ) if not isinstance(data, dict): - raise ValueError(f"Expected a JSON object, got {type(data).__name__}") + raise ValueError( + f"Expected a JSON object, got {type(data).__name__} for document {document_id}. " + f"Raw output ({len(raw_json)} chars): {raw_json[:500]}" + ) confidence = data.get("confidence", 0.5) if isinstance(confidence, (int, float)): @@ -613,7 +630,7 @@ async def classify_global_event( return event - except (json.JSONDecodeError, KeyError, TypeError) as exc: + except (json.JSONDecodeError, KeyError, TypeError, ValueError) as exc: last_error = f"parse_error: {exc}" logger.warning( "Classification parse error for doc %s attempt %d: %s",