# Review notes. Text placed before the first "diff --git" header is treated as
# leading commentary by patch tools and is not part of any hunk.
#
# NOTE(review): the line breaks of this patch appear to have been collapsed;
# the hunks below will presumably not apply until they are restored.
#
# What the visible hunks change:
#   * services/recommendation/suppression.py
#     build_quality_context_from_summary now passes source_types={"unknown"}
#     instead of set(). Per the updated docstring, this keeps the summary-based
#     fallback from automatically triggering LOW_SOURCE_DIVERSITY.
#   * services/recommendation/worker.py
#     _DATA_QUALITY_QUERY now resolves evidence ids through two CTEs
#     (latest_trend, evidence_ids) using jsonb_array_elements_text, replacing
#     the UNNEST(...)::uuid subquery. source_types is aggregated from
#     d.source_type instead of s.source_class, and the LEFT JOIN on sources is
#     removed.
#   * tests/test_suppression.py
#     test_no_suppression_with_summary_fallback now runs with the default
#     SuppressionConfig() instead of SuppressionConfig(min_source_types=0).
#
# NOTE(review): `d.id::text IN (SELECT eid ...)` casts the column side of the
# comparison. Presumably documents.id is a uuid (the removed query cast to
# ::uuid) - confirm the stored evidence ids use the same canonical text form
# and that the cast does not prevent index use on documents.id.
# NOTE(review): the "unknown" placeholder makes the fallback report one source
# type regardless of the evidence - confirm consumers of source_types accept it.
diff --git a/services/recommendation/suppression.py b/services/recommendation/suppression.py index d508e87..bea913c 100644 --- a/services/recommendation/suppression.py +++ b/services/recommendation/suppression.py @@ -107,7 +107,8 @@ def build_quality_context_from_summary( This is a fallback when full document-level quality metrics aren't available. It uses the trend summary's evidence counts and confidence - as proxies. + as proxies. We assume at least one source type contributed so that + the fallback does not automatically trigger LOW_SOURCE_DIVERSITY. """ total = len(summary.top_supporting_evidence) + len(summary.top_opposing_evidence) return DataQualityContext( @@ -116,7 +117,7 @@ def build_quality_context_from_summary( failed_documents=0, avg_extraction_confidence=summary.confidence, newest_evidence_at=summary.generated_at, - source_types=set(), + source_types={"unknown"}, ) diff --git a/services/recommendation/worker.py b/services/recommendation/worker.py index dfca497..a8b0cf0 100644 --- a/services/recommendation/worker.py +++ b/services/recommendation/worker.py @@ -58,26 +58,30 @@ logger = logging.getLogger(__name__) # --------------------------------------------------------------------------- _DATA_QUALITY_QUERY = """ +WITH latest_trend AS ( + SELECT top_supporting_evidence, top_opposing_evidence + FROM trend_windows + WHERE entity_id = $1 AND "window" = $2 + ORDER BY generated_at DESC + LIMIT 1 +), +evidence_ids AS ( + SELECT jsonb_array_elements_text( + COALESCE(lt.top_supporting_evidence, '[]'::jsonb) + || COALESCE(lt.top_opposing_evidence, '[]'::jsonb) + ) AS eid + FROM latest_trend lt +) SELECT COUNT(*) AS total_documents, COUNT(*) FILTER (WHERE di.validation_status = 'valid') AS valid_documents, COUNT(*) FILTER (WHERE di.validation_status = 'failed') AS failed_documents, AVG(di.confidence) FILTER (WHERE di.validation_status = 'valid') AS avg_extraction_confidence, MAX(d.published_at) AS newest_evidence_at, - ARRAY_AGG(DISTINCT s.source_class) FILTER 
(WHERE s.source_class IS NOT NULL) AS source_types + ARRAY_AGG(DISTINCT d.source_type) FILTER (WHERE d.source_type IS NOT NULL) AS source_types FROM documents d JOIN document_intelligence di ON di.document_id = d.id -LEFT JOIN sources s ON d.source_id = s.id -WHERE d.id = ANY( - SELECT UNNEST( - COALESCE(tw.top_supporting_evidence, '[]'::jsonb) - || COALESCE(tw.top_opposing_evidence, '[]'::jsonb) - )::uuid - FROM trend_windows tw - WHERE tw.entity_id = $1 AND tw."window" = $2 - ORDER BY tw.generated_at DESC - LIMIT 1 -) +WHERE d.id::text IN (SELECT eid FROM evidence_ids) """ diff --git a/tests/test_suppression.py b/tests/test_suppression.py index 2442374..d881164 100644 --- a/tests/test_suppression.py +++ b/tests/test_suppression.py @@ -128,12 +128,9 @@ def test_fallback_context_from_summary(): def test_no_suppression_with_summary_fallback(): """When no quality context is provided, summary-based fallback is used.""" summary = _make_summary(confidence=0.7) - # Default config has min_source_types=1, but fallback has empty source_types. - # With min_source_types=1 and empty source_types, LOW_SOURCE_DIVERSITY fires - # only when total_documents > 0. But default min_source_types is 1 and - # len(set()) = 0 < 1, so it would fire. Let's use a config that relaxes this. - config = SuppressionConfig(min_source_types=0) - result = evaluate_suppression(summary, config=config, reference_time=NOW) + # Default config has min_source_types=1. The fallback now returns + # source_types={"unknown"} so LOW_SOURCE_DIVERSITY does not fire. + result = evaluate_suppression(summary, config=SuppressionConfig(), reference_time=NOW) assert result.suppressed is False