fix: data quality query and suppression fallback in recommendation worker
- Fix _DATA_QUALITY_QUERY: remove the nonexistent d.source_id / s.source_class references and use d.source_type directly.
- Fix LIMIT 1 being applied after the jsonb expansion (it limited the expanded evidence rows rather than selecting the latest trend window) by restructuring the query as a CTE.
- Fix the fallback build_quality_context_from_summary returning an empty source_types set, which always triggered LOW_SOURCE_DIVERSITY suppression.
- Update the test to reflect the corrected fallback behavior.
This commit is contained in:
@@ -107,7 +107,8 @@ def build_quality_context_from_summary(
|
|||||||
|
|
||||||
This is a fallback when full document-level quality metrics aren't
|
This is a fallback when full document-level quality metrics aren't
|
||||||
available. It uses the trend summary's evidence counts and confidence
|
available. It uses the trend summary's evidence counts and confidence
|
||||||
as proxies.
|
as proxies. We assume at least one source type contributed so that
|
||||||
|
the fallback does not automatically trigger LOW_SOURCE_DIVERSITY.
|
||||||
"""
|
"""
|
||||||
total = len(summary.top_supporting_evidence) + len(summary.top_opposing_evidence)
|
total = len(summary.top_supporting_evidence) + len(summary.top_opposing_evidence)
|
||||||
return DataQualityContext(
|
return DataQualityContext(
|
||||||
@@ -116,7 +117,7 @@ def build_quality_context_from_summary(
|
|||||||
failed_documents=0,
|
failed_documents=0,
|
||||||
avg_extraction_confidence=summary.confidence,
|
avg_extraction_confidence=summary.confidence,
|
||||||
newest_evidence_at=summary.generated_at,
|
newest_evidence_at=summary.generated_at,
|
||||||
source_types=set(),
|
source_types={"unknown"},
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -58,26 +58,30 @@ logger = logging.getLogger(__name__)
|
|||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
_DATA_QUALITY_QUERY = """
|
_DATA_QUALITY_QUERY = """
|
||||||
|
WITH latest_trend AS (
|
||||||
|
SELECT top_supporting_evidence, top_opposing_evidence
|
||||||
|
FROM trend_windows
|
||||||
|
WHERE entity_id = $1 AND "window" = $2
|
||||||
|
ORDER BY generated_at DESC
|
||||||
|
LIMIT 1
|
||||||
|
),
|
||||||
|
evidence_ids AS (
|
||||||
|
SELECT jsonb_array_elements_text(
|
||||||
|
COALESCE(lt.top_supporting_evidence, '[]'::jsonb)
|
||||||
|
|| COALESCE(lt.top_opposing_evidence, '[]'::jsonb)
|
||||||
|
) AS eid
|
||||||
|
FROM latest_trend lt
|
||||||
|
)
|
||||||
SELECT
|
SELECT
|
||||||
COUNT(*) AS total_documents,
|
COUNT(*) AS total_documents,
|
||||||
COUNT(*) FILTER (WHERE di.validation_status = 'valid') AS valid_documents,
|
COUNT(*) FILTER (WHERE di.validation_status = 'valid') AS valid_documents,
|
||||||
COUNT(*) FILTER (WHERE di.validation_status = 'failed') AS failed_documents,
|
COUNT(*) FILTER (WHERE di.validation_status = 'failed') AS failed_documents,
|
||||||
AVG(di.confidence) FILTER (WHERE di.validation_status = 'valid') AS avg_extraction_confidence,
|
AVG(di.confidence) FILTER (WHERE di.validation_status = 'valid') AS avg_extraction_confidence,
|
||||||
MAX(d.published_at) AS newest_evidence_at,
|
MAX(d.published_at) AS newest_evidence_at,
|
||||||
ARRAY_AGG(DISTINCT s.source_class) FILTER (WHERE s.source_class IS NOT NULL) AS source_types
|
ARRAY_AGG(DISTINCT d.source_type) FILTER (WHERE d.source_type IS NOT NULL) AS source_types
|
||||||
FROM documents d
|
FROM documents d
|
||||||
JOIN document_intelligence di ON di.document_id = d.id
|
JOIN document_intelligence di ON di.document_id = d.id
|
||||||
LEFT JOIN sources s ON d.source_id = s.id
|
WHERE d.id::text IN (SELECT eid FROM evidence_ids)
|
||||||
WHERE d.id = ANY(
|
|
||||||
SELECT UNNEST(
|
|
||||||
COALESCE(tw.top_supporting_evidence, '[]'::jsonb)
|
|
||||||
|| COALESCE(tw.top_opposing_evidence, '[]'::jsonb)
|
|
||||||
)::uuid
|
|
||||||
FROM trend_windows tw
|
|
||||||
WHERE tw.entity_id = $1 AND tw."window" = $2
|
|
||||||
ORDER BY tw.generated_at DESC
|
|
||||||
LIMIT 1
|
|
||||||
)
|
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -128,12 +128,9 @@ def test_fallback_context_from_summary():
|
|||||||
def test_no_suppression_with_summary_fallback():
|
def test_no_suppression_with_summary_fallback():
|
||||||
"""When no quality context is provided, summary-based fallback is used."""
|
"""When no quality context is provided, summary-based fallback is used."""
|
||||||
summary = _make_summary(confidence=0.7)
|
summary = _make_summary(confidence=0.7)
|
||||||
# Default config has min_source_types=1, but fallback has empty source_types.
|
# Default config has min_source_types=1. The fallback now returns
|
||||||
# With min_source_types=1 and empty source_types, LOW_SOURCE_DIVERSITY fires
|
# source_types={"unknown"} so LOW_SOURCE_DIVERSITY does not fire.
|
||||||
# only when total_documents > 0. But default min_source_types is 1 and
|
result = evaluate_suppression(summary, config=SuppressionConfig(), reference_time=NOW)
|
||||||
# len(set()) = 0 < 1, so it would fire. Let's use a config that relaxes this.
|
|
||||||
config = SuppressionConfig(min_source_types=0)
|
|
||||||
result = evaluate_suppression(summary, config=config, reference_time=NOW)
|
|
||||||
assert result.suppressed is False
|
assert result.suppressed is False
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user