fix: data quality query and suppression fallback in recommendation worker

- Fix _DATA_QUALITY_QUERY: remove nonexistent d.source_id/s.source_class,
  use d.source_type directly
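- Fix LIMIT 1 being applied after jsonb expansion (so it limited the
  expanded evidence rows rather than selecting the latest trend window)
  by restructuring the query as a CTE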
- Fix the fallback build_quality_context_from_summary returning empty
  source_types, which always triggered LOW_SOURCE_DIVERSITY suppression
- Update test to reflect corrected fallback behavior
This commit is contained in:
Celes Renata
2026-04-14 06:57:46 +00:00
parent 4fbddc307a
commit b478022ba3
3 changed files with 22 additions and 20 deletions
+3 -2
View File
@@ -107,7 +107,8 @@ def build_quality_context_from_summary(
This is a fallback when full document-level quality metrics aren't This is a fallback when full document-level quality metrics aren't
available. It uses the trend summary's evidence counts and confidence available. It uses the trend summary's evidence counts and confidence
as proxies. as proxies. We assume at least one source type contributed so that
the fallback does not automatically trigger LOW_SOURCE_DIVERSITY.
""" """
total = len(summary.top_supporting_evidence) + len(summary.top_opposing_evidence) total = len(summary.top_supporting_evidence) + len(summary.top_opposing_evidence)
return DataQualityContext( return DataQualityContext(
@@ -116,7 +117,7 @@ def build_quality_context_from_summary(
failed_documents=0, failed_documents=0,
avg_extraction_confidence=summary.confidence, avg_extraction_confidence=summary.confidence,
newest_evidence_at=summary.generated_at, newest_evidence_at=summary.generated_at,
source_types=set(), source_types={"unknown"},
) )
+16 -12
View File
@@ -58,26 +58,30 @@ logger = logging.getLogger(__name__)
# --------------------------------------------------------------------------- # ---------------------------------------------------------------------------
_DATA_QUALITY_QUERY = """ _DATA_QUALITY_QUERY = """
WITH latest_trend AS (
SELECT top_supporting_evidence, top_opposing_evidence
FROM trend_windows
WHERE entity_id = $1 AND "window" = $2
ORDER BY generated_at DESC
LIMIT 1
),
evidence_ids AS (
SELECT jsonb_array_elements_text(
COALESCE(lt.top_supporting_evidence, '[]'::jsonb)
|| COALESCE(lt.top_opposing_evidence, '[]'::jsonb)
) AS eid
FROM latest_trend lt
)
SELECT SELECT
COUNT(*) AS total_documents, COUNT(*) AS total_documents,
COUNT(*) FILTER (WHERE di.validation_status = 'valid') AS valid_documents, COUNT(*) FILTER (WHERE di.validation_status = 'valid') AS valid_documents,
COUNT(*) FILTER (WHERE di.validation_status = 'failed') AS failed_documents, COUNT(*) FILTER (WHERE di.validation_status = 'failed') AS failed_documents,
AVG(di.confidence) FILTER (WHERE di.validation_status = 'valid') AS avg_extraction_confidence, AVG(di.confidence) FILTER (WHERE di.validation_status = 'valid') AS avg_extraction_confidence,
MAX(d.published_at) AS newest_evidence_at, MAX(d.published_at) AS newest_evidence_at,
ARRAY_AGG(DISTINCT s.source_class) FILTER (WHERE s.source_class IS NOT NULL) AS source_types ARRAY_AGG(DISTINCT d.source_type) FILTER (WHERE d.source_type IS NOT NULL) AS source_types
FROM documents d FROM documents d
JOIN document_intelligence di ON di.document_id = d.id JOIN document_intelligence di ON di.document_id = d.id
LEFT JOIN sources s ON d.source_id = s.id WHERE d.id::text IN (SELECT eid FROM evidence_ids)
WHERE d.id = ANY(
SELECT UNNEST(
COALESCE(tw.top_supporting_evidence, '[]'::jsonb)
|| COALESCE(tw.top_opposing_evidence, '[]'::jsonb)
)::uuid
FROM trend_windows tw
WHERE tw.entity_id = $1 AND tw."window" = $2
ORDER BY tw.generated_at DESC
LIMIT 1
)
""" """
+3 -6
View File
@@ -128,12 +128,9 @@ def test_fallback_context_from_summary():
def test_no_suppression_with_summary_fallback(): def test_no_suppression_with_summary_fallback():
"""When no quality context is provided, summary-based fallback is used.""" """When no quality context is provided, summary-based fallback is used."""
summary = _make_summary(confidence=0.7) summary = _make_summary(confidence=0.7)
# Default config has min_source_types=1, but fallback has empty source_types. # Default config has min_source_types=1. The fallback now returns
# With min_source_types=1 and empty source_types, LOW_SOURCE_DIVERSITY fires # source_types={"unknown"} so LOW_SOURCE_DIVERSITY does not fire.
# only when total_documents > 0. But default min_source_types is 1 and result = evaluate_suppression(summary, config=SuppressionConfig(), reference_time=NOW)
# len(set()) = 0 < 1, so it would fire. Let's use a config that relaxes this.
config = SuppressionConfig(min_source_types=0)
result = evaluate_suppression(summary, config=config, reference_time=NOW)
assert result.suppressed is False assert result.suppressed is False