fix: data quality query and suppression fallback in recommendation worker

- Fix _DATA_QUALITY_QUERY: remove nonexistent d.source_id/s.source_class, use d.source_type directly
- Fix LIMIT 1 applied after jsonb expansion by restructuring as CTE
- Fix fallback build_quality_context_from_summary returning empty source_types, which always triggered LOW_SOURCE_DIVERSITY suppression
- Update test to reflect corrected fallback behavior
2026-04-14 06:57:46 +00:00
parent 4fbddc307a
commit b478022ba3
3 changed files with 22 additions and 20 deletions
@@ -107,7 +107,8 @@ def build_quality_context_from_summary(
    This is a fallback when full document-level quality metrics aren't
    available. It uses the trend summary's evidence counts and confidence
-    as proxies.
+    as proxies.  We assume at least one source type contributed so that
+    the fallback does not automatically trigger LOW_SOURCE_DIVERSITY.
    """
    total = len(summary.top_supporting_evidence) + len(summary.top_opposing_evidence)
    return DataQualityContext(
@@ -116,7 +117,7 @@ def build_quality_context_from_summary(
        failed_documents=0,
        avg_extraction_confidence=summary.confidence,
        newest_evidence_at=summary.generated_at,
-        source_types=set(),
+        source_types={"unknown"},
    )
@@ -58,26 +58,30 @@ logger = logging.getLogger(__name__)
 # ---------------------------------------------------------------------------
 _DATA_QUALITY_QUERY = """
+WITH latest_trend AS (
+    SELECT top_supporting_evidence, top_opposing_evidence
+    FROM trend_windows
+    WHERE entity_id = $1 AND "window" = $2
+    ORDER BY generated_at DESC
+    LIMIT 1
+),
+evidence_ids AS (
+    SELECT jsonb_array_elements_text(
+        COALESCE(lt.top_supporting_evidence, '[]'::jsonb)
+        || COALESCE(lt.top_opposing_evidence, '[]'::jsonb)
+    ) AS eid
+    FROM latest_trend lt
+)
 SELECT
    COUNT(*) AS total_documents,
    COUNT(*) FILTER (WHERE di.validation_status = 'valid') AS valid_documents,
    COUNT(*) FILTER (WHERE di.validation_status = 'failed') AS failed_documents,
    AVG(di.confidence) FILTER (WHERE di.validation_status = 'valid') AS avg_extraction_confidence,
    MAX(d.published_at) AS newest_evidence_at,
-    ARRAY_AGG(DISTINCT s.source_class) FILTER (WHERE s.source_class IS NOT NULL) AS source_types
+    ARRAY_AGG(DISTINCT d.source_type) FILTER (WHERE d.source_type IS NOT NULL) AS source_types
 FROM documents d
 JOIN document_intelligence di ON di.document_id = d.id
-LEFT JOIN sources s ON d.source_id = s.id
+WHERE d.id::text IN (SELECT eid FROM evidence_ids)
-WHERE d.id = ANY(
-    SELECT UNNEST(
-        COALESCE(tw.top_supporting_evidence, '[]'::jsonb)
-        || COALESCE(tw.top_opposing_evidence, '[]'::jsonb)
-    )::uuid
-    FROM trend_windows tw
-    WHERE tw.entity_id = $1 AND tw."window" = $2
-    ORDER BY tw.generated_at DESC
-    LIMIT 1
-)
 """
@@ -128,12 +128,9 @@ def test_fallback_context_from_summary():
 def test_no_suppression_with_summary_fallback():
    """When no quality context is provided, summary-based fallback is used."""
    summary = _make_summary(confidence=0.7)
-    # Default config has min_source_types=1, but fallback has empty source_types.
+    # Default config has min_source_types=1. The fallback now returns
-    # With min_source_types=1 and empty source_types, LOW_SOURCE_DIVERSITY fires
+    # source_types={"unknown"} so LOW_SOURCE_DIVERSITY does not fire.
-    # only when total_documents > 0. But default min_source_types is 1 and
+    result = evaluate_suppression(summary, config=SuppressionConfig(), reference_time=NOW)
-    # len(set()) = 0 < 1, so it would fire. Let's use a config that relaxes this.
-    config = SuppressionConfig(min_source_types=0)
-    result = evaluate_suppression(summary, config=config, reference_time=NOW)
    assert result.suppressed is False