188 lines
5.8 KiB
Python
188 lines
5.8 KiB
Python
"""Tests for cross-source deduplication logic.
|
|
|
|
Validates the pure functions and key-building helpers in services.shared.dedupe.
Async functions that require Redis/PostgreSQL are tested with lightweight fakes.

Requirements: 3.2, 3.3
|
|
"""
|
|
from __future__ import annotations
|
|
|
|
from unittest.mock import AsyncMock
|
|
|
|
import pytest
|
|
|
|
from services.shared.dedupe import (
|
|
DedupeResult,
|
|
_hash_dedupe_key,
|
|
_url_dedupe_key,
|
|
check_duplicate,
|
|
dedupe_items,
|
|
mark_as_seen,
|
|
)
|
|
from services.shared.redis_keys import DEDUPE_PREFIX
|
|
|
|
|
|
class TestDedupeKeyBuilders:
    """Checks on the key strings produced by the dedupe key helpers."""

    def test_hash_dedupe_key_format(self):
        """The hash key is the dedupe prefix, a colon, then the content hash."""
        expected = f"{DEDUPE_PREFIX}:abc123"
        assert _hash_dedupe_key("abc123") == expected

    def test_url_dedupe_key_is_hashed(self):
        """The URL key starts with the ``url`` namespace and is repeatable."""
        article_url = "https://example.com/article"
        first = _url_dedupe_key(article_url)
        assert first.startswith(f"{DEDUPE_PREFIX}:url:")
        # Building the key again from the same URL must yield the same key.
        assert first == _url_dedupe_key(article_url)

    def test_url_dedupe_key_differs_for_different_urls(self):
        """Two distinct URLs must not map to the same key."""
        keys = [
            _url_dedupe_key(candidate)
            for candidate in ("https://a.com/1", "https://b.com/2")
        ]
        assert keys[0] != keys[1]
|
|
|
|
|
|
class TestDedupeResult:
    """Checks on the fields exposed by ``DedupeResult``."""

    def test_not_duplicate(self):
        """With only ``is_duplicate=False`` given, the other fields read as None."""
        r = DedupeResult(is_duplicate=False)
        assert not r.is_duplicate
        assert r.existing_document_id is None
        assert r.match_type is None

    def test_duplicate_with_details(self):
        """Every value passed to the constructor is readable from the result."""
        r = DedupeResult(
            is_duplicate=True,
            existing_document_id="doc-123",
            match_type="canonical_url",
        )
        assert r.is_duplicate
        assert r.existing_document_id == "doc-123"
        # match_type is supplied above, so it must be verified as well;
        # otherwise a regression on that field would go unnoticed.
        assert r.match_type == "canonical_url"
|
|
|
|
|
|
class FakeRedis:
    """Minimal async Redis fake for dedupe tests.

    Values live in a plain in-memory dict exposed as ``_data`` so tests can
    seed it up front and inspect it afterwards.
    """

    def __init__(self, data: dict[str, str] | None = None):
        """Create the fake, optionally on top of an existing mapping.

        Args:
            data: Initial key/value store. The dict is used as-is (not
                copied); ``None`` starts with an empty store.
        """
        # Compare against None instead of relying on truthiness: ``data or {}``
        # would silently swap a caller-supplied empty dict for a new one,
        # while a non-empty dict would be kept.
        self._data: dict[str, str] = {} if data is None else data

    async def get(self, key: str) -> str | None:
        """Return the value stored under ``key``, or None when it is absent."""
        return self._data.get(key)

    async def set(self, key: str, value: str, ex: int | None = None) -> None:
        """Store ``value`` under ``key``.

        ``ex`` is accepted but ignored; entries in this fake never expire.
        """
        self._data[key] = value
|
|
|
|
|
|
class FakePool:
    """Minimal async PG pool fake backed by a dict of canned rows.

    ``fetchrow`` ignores the SQL text and looks the row up by the string form
    of its first positional argument; keys with no canned row yield None.
    """

    def __init__(self, rows: dict[str, dict | None] | None = None):
        """Create the fake.

        Args:
            rows: Mapping from lookup value (as passed in the first query
                argument) to the row to return. The dict is used as-is (not
                copied); ``None`` means no rows at all.
        """
        # Compare against None instead of relying on truthiness so that a
        # caller-supplied empty dict is kept rather than replaced.
        self._rows = {} if rows is None else rows

    async def fetchrow(self, query: str, *args) -> dict | None:
        """Return the canned row for the first query argument, if any.

        Args:
            query: SQL text; not inspected by this fake.
            *args: Query parameters. Only the first one is used, converted
                with ``str`` before the lookup.

        Returns:
            The stored row, or None when no arguments were given or no row
            is registered under that key.
        """
        if not args:
            return None
        # Match on the first arg (content_hash or canonical_url).
        return self._rows.get(str(args[0]))
|
|
|
|
|
|
@pytest.mark.asyncio
async def test_check_duplicate_no_match():
    """With empty Redis and PG fakes, an unseen hash and URL is not a duplicate."""
    redis_fake = FakeRedis()
    pg_fake = FakePool()
    outcome = await check_duplicate(
        pg_fake,
        redis_fake,
        content_hash="newhash",
        url="https://example.com/new",
    )
    assert not outcome.is_duplicate
|
|
|
|
|
|
@pytest.mark.asyncio
async def test_check_duplicate_redis_hash_hit():
    """A content hash already seeded in the Redis fake is reported as a duplicate."""
    seeded = {_hash_dedupe_key("existinghash"): "doc-abc"}
    redis_fake = FakeRedis(seeded)
    pg_fake = FakePool()

    outcome = await check_duplicate(pg_fake, redis_fake, content_hash="existinghash")

    assert outcome.is_duplicate
    assert outcome.existing_document_id == "doc-abc"
    assert outcome.match_type == "content_hash"
|
|
|
|
|
|
@pytest.mark.asyncio
async def test_check_duplicate_redis_url_hit():
    """A canonical URL seeded in the Redis fake matches even when the hash is new."""
    article_url = "https://example.com/article"
    redis_fake = FakeRedis({_url_dedupe_key(article_url): "doc-xyz"})
    pg_fake = FakePool()

    outcome = await check_duplicate(
        pg_fake,
        redis_fake,
        content_hash="newhash",
        canonical_url=article_url,
    )

    assert outcome.is_duplicate
    assert outcome.existing_document_id == "doc-xyz"
    assert outcome.match_type == "canonical_url"
|
|
|
|
|
|
@pytest.mark.asyncio
async def test_check_duplicate_pg_hash_fallback():
    """A hash found only in the PG fake is a duplicate and gets cached in Redis."""
    redis_fake = FakeRedis()
    pg_fake = FakePool({"pghash": {"id": "doc-pg1"}})

    outcome = await check_duplicate(pg_fake, redis_fake, content_hash="pghash")

    assert outcome.is_duplicate
    assert outcome.existing_document_id == "doc-pg1"
    assert outcome.match_type == "content_hash"
    # The PG hit should have been written back into the Redis fake.
    warmed_key = _hash_dedupe_key("pghash")
    assert redis_fake._data.get(warmed_key) == "doc-pg1"
|
|
|
|
|
|
@pytest.mark.asyncio
async def test_check_duplicate_pg_url_fallback():
    """A canonical URL found only in the PG fake is reported as a duplicate."""
    filing_url = "https://example.com/filing"
    pg_fake = FakePool({filing_url: {"id": "doc-pg2"}})
    redis_fake = FakeRedis()

    outcome = await check_duplicate(
        pg_fake,
        redis_fake,
        content_hash="nomatch",
        canonical_url=filing_url,
    )

    assert outcome.is_duplicate
    assert outcome.existing_document_id == "doc-pg2"
    assert outcome.match_type == "canonical_url"
|
|
|
|
|
|
@pytest.mark.asyncio
async def test_dedupe_items_partitions_correctly():
    """dedupe_items should split items into new and duplicate groups."""
    known_hash = "existinghash"
    redis_fake = FakeRedis({_hash_dedupe_key(known_hash): "doc-old"})
    pg_fake = FakePool()

    # (title, content_hash, url) for each item; only the middle one is seeded.
    specs = (
        ("New Article", "newhash", "https://a.com/1"),
        ("Dup Article", known_hash, "https://b.com/2"),
        ("Another New", "anothernew", "https://c.com/3"),
    )
    batch = [
        {"title": title, "content_hash": digest, "url": link}
        for title, digest, link in specs
    ]

    fresh, repeats = await dedupe_items(pg_fake, redis_fake, batch)

    assert len(fresh) == 2
    assert len(repeats) == 1
    assert repeats[0]["title"] == "Dup Article"
    assert repeats[0]["_dedupe_existing_id"] == "doc-old"
|
|
|
|
|
|
@pytest.mark.asyncio
async def test_mark_as_seen_sets_redis_keys():
    """mark_as_seen stores the document id under both the hash key and the URL key."""
    page_url = "https://example.com/page"
    redis_fake = FakeRedis()

    await mark_as_seen(
        redis_fake,
        content_hash="hash123",
        canonical_url=page_url,
        document_id="doc-new",
    )

    stored = redis_fake._data
    assert stored[_hash_dedupe_key("hash123")] == "doc-new"
    assert stored[_url_dedupe_key(page_url)] == "doc-new"
|
|
|
|
|
|
@pytest.mark.asyncio
async def test_mark_as_seen_handles_none_url():
    """With ``canonical_url=None`` only the hash key is written."""
    redis_fake = FakeRedis()

    await mark_as_seen(
        redis_fake,
        content_hash="hash456",
        canonical_url=None,
        document_id="doc-x",
    )

    stored = redis_fake._data
    assert stored[_hash_dedupe_key("hash456")] == "doc-x"
    # No URL key should be set, so the hash entry is the only one.
    assert len(stored) == 1
|