fix: data quality query and suppression fallback in recommendation worker

- Fix _DATA_QUALITY_QUERY: remove nonexistent d.source_id/s.source_class,
  use d.source_type directly
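- Fix LIMIT 1 being applied after the jsonb expansion (limiting the
  evidence ids rather than the trend window) by first selecting the
  latest trend window in a CTE and expanding its evidence afterwards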
- Fix the build_quality_context_from_summary fallback returning an empty
  source_types set, which always triggered LOW_SOURCE_DIVERSITY suppression
- Update test to reflect corrected fallback behavior
This commit is contained in:
Celes Renata
2026-04-14 06:57:46 +00:00
parent 4fbddc307a
commit b478022ba3
3 changed files with 22 additions and 20 deletions
+3 -2
View File
@@ -107,7 +107,8 @@ def build_quality_context_from_summary(
This is a fallback when full document-level quality metrics aren't
available. It uses the trend summary's evidence counts and confidence
as proxies.
as proxies. We assume at least one source type contributed so that
the fallback does not automatically trigger LOW_SOURCE_DIVERSITY.
"""
total = len(summary.top_supporting_evidence) + len(summary.top_opposing_evidence)
return DataQualityContext(
@@ -116,7 +117,7 @@ def build_quality_context_from_summary(
failed_documents=0,
avg_extraction_confidence=summary.confidence,
newest_evidence_at=summary.generated_at,
source_types=set(),
source_types={"unknown"},
)
+16 -12
View File
@@ -58,26 +58,30 @@ logger = logging.getLogger(__name__)
# ---------------------------------------------------------------------------
_DATA_QUALITY_QUERY = """
WITH latest_trend AS (
SELECT top_supporting_evidence, top_opposing_evidence
FROM trend_windows
WHERE entity_id = $1 AND "window" = $2
ORDER BY generated_at DESC
LIMIT 1
),
evidence_ids AS (
SELECT jsonb_array_elements_text(
COALESCE(lt.top_supporting_evidence, '[]'::jsonb)
|| COALESCE(lt.top_opposing_evidence, '[]'::jsonb)
) AS eid
FROM latest_trend lt
)
SELECT
COUNT(*) AS total_documents,
COUNT(*) FILTER (WHERE di.validation_status = 'valid') AS valid_documents,
COUNT(*) FILTER (WHERE di.validation_status = 'failed') AS failed_documents,
AVG(di.confidence) FILTER (WHERE di.validation_status = 'valid') AS avg_extraction_confidence,
MAX(d.published_at) AS newest_evidence_at,
ARRAY_AGG(DISTINCT s.source_class) FILTER (WHERE s.source_class IS NOT NULL) AS source_types
ARRAY_AGG(DISTINCT d.source_type) FILTER (WHERE d.source_type IS NOT NULL) AS source_types
FROM documents d
JOIN document_intelligence di ON di.document_id = d.id
LEFT JOIN sources s ON d.source_id = s.id
WHERE d.id = ANY(
SELECT UNNEST(
COALESCE(tw.top_supporting_evidence, '[]'::jsonb)
|| COALESCE(tw.top_opposing_evidence, '[]'::jsonb)
)::uuid
FROM trend_windows tw
WHERE tw.entity_id = $1 AND tw."window" = $2
ORDER BY tw.generated_at DESC
LIMIT 1
)
WHERE d.id::text IN (SELECT eid FROM evidence_ids)
"""
+3 -6
View File
@@ -128,12 +128,9 @@ def test_fallback_context_from_summary():
def test_no_suppression_with_summary_fallback():
"""When no quality context is provided, summary-based fallback is used."""
summary = _make_summary(confidence=0.7)
# Default config has min_source_types=1, but fallback has empty source_types.
# With min_source_types=1 and empty source_types, LOW_SOURCE_DIVERSITY fires
# only when total_documents > 0. But default min_source_types is 1 and
# len(set()) = 0 < 1, so it would fire. Let's use a config that relaxes this.
config = SuppressionConfig(min_source_types=0)
result = evaluate_suppression(summary, config=config, reference_time=NOW)
# Default config has min_source_types=1. The fallback now returns
# source_types={"unknown"} so LOW_SOURCE_DIVERSITY does not fire.
result = evaluate_suppression(summary, config=SuppressionConfig(), reference_time=NOW)
assert result.suppressed is False