Files
stonks-oracle/tests/test_dedupe.py
T
Celes Renata c85c0068a2 fix: clean up utcnow deprecation warnings, fix 12 failing tests, add CI/CD pipeline manifests
- Replace all datetime.utcnow() with datetime.now(tz=timezone.utc) across 8 files
- Fix 12 failing tests to match current implementation behavior
- Fix pytest_plugins in non-top-level conftest (moved to root conftest.py)
- Auto-fix 189 lint issues (import sorting, unused imports)
- Add CI/CD pipeline infrastructure (ARC, ArgoCD, Kargo manifests)
- Add values-beta.yaml and values-paper.yaml for staged deployments
- Update GitHub Actions workflow to use self-hosted-gremlin runners
- Add integration-test job to CI pipeline

Result: 1596 passed, 0 failed, 0 warnings
2026-04-18 03:59:28 +00:00

186 lines
5.7 KiB
Python

"""Tests for cross-source deduplication logic.
Validates the pure functions and key-building helpers in services.shared.dedupe.
Async functions that require Redis/PostgreSQL are tested with lightweight fakes.
Requirements: 3.2, 3.3
"""
from __future__ import annotations
import pytest
from services.shared.dedupe import (
DedupeResult,
_hash_dedupe_key,
_url_dedupe_key,
check_duplicate,
dedupe_items,
mark_as_seen,
)
from services.shared.redis_keys import DEDUPE_PREFIX
class TestDedupeKeyBuilders:
    """Checks on the Redis key helpers ``_hash_dedupe_key`` and ``_url_dedupe_key``."""

    def test_hash_dedupe_key_format(self):
        """The hash key is the dedupe prefix and the raw hash joined by a colon."""
        built = _hash_dedupe_key("abc123")
        wanted = f"{DEDUPE_PREFIX}:abc123"
        assert built == wanted

    def test_url_dedupe_key_is_hashed(self):
        """The URL key lives under the ``url:`` namespace and is repeatable."""
        article = "https://example.com/article"
        built = _url_dedupe_key(article)
        namespace = f"{DEDUPE_PREFIX}:url:"
        assert built.startswith(namespace)
        # Building the key again from the same URL must give the same value.
        assert built == _url_dedupe_key(article)

    def test_url_dedupe_key_differs_for_different_urls(self):
        """Two distinct URLs must not map to the same key."""
        first_key = _url_dedupe_key("https://a.com/1")
        second_key = _url_dedupe_key("https://b.com/2")
        assert first_key != second_key
class TestDedupeResult:
    """Checks on the fields exposed by ``DedupeResult``."""

    def test_not_duplicate(self):
        """A non-duplicate result leaves the optional detail fields as ``None``."""
        r = DedupeResult(is_duplicate=False)
        assert not r.is_duplicate
        assert r.existing_document_id is None
        assert r.match_type is None

    def test_duplicate_with_details(self):
        """A duplicate result exposes every detail it was constructed with."""
        r = DedupeResult(
            is_duplicate=True,
            existing_document_id="doc-123",
            match_type="canonical_url",
        )
        assert r.is_duplicate
        assert r.existing_document_id == "doc-123"
        # match_type is passed in above, so verify it round-trips as well;
        # otherwise a regression in that field would go unnoticed here.
        assert r.match_type == "canonical_url"
class FakeRedis:
"""Minimal async Redis fake for dedupe tests."""
def __init__(self, data: dict[str, str] | None = None):
self._data: dict[str, str] = data or {}
async def get(self, key: str) -> str | None:
return self._data.get(key)
async def set(self, key: str, value: str, ex: int | None = None) -> None:
self._data[key] = value
class FakePool:
"""Minimal async PG pool fake that returns None for all queries."""
def __init__(self, rows: dict[str, dict | None] | None = None):
self._rows = rows or {}
async def fetchrow(self, query: str, *args) -> dict | None:
# Match on the first arg (content_hash or canonical_url)
if args:
return self._rows.get(str(args[0]))
return None
@pytest.mark.asyncio
async def test_check_duplicate_no_match():
    """With an empty Redis fake and an empty pool fake, nothing is a duplicate."""
    cache = FakeRedis()
    db = FakePool()
    outcome = await check_duplicate(
        db,
        cache,
        content_hash="newhash",
        url="https://example.com/new",
    )
    assert not outcome.is_duplicate
@pytest.mark.asyncio
async def test_check_duplicate_redis_hash_hit():
    """A content hash already seeded in the Redis fake is reported as a duplicate."""
    seeded = {_hash_dedupe_key("existinghash"): "doc-abc"}
    cache = FakeRedis(seeded)
    db = FakePool()

    outcome = await check_duplicate(db, cache, content_hash="existinghash")

    assert outcome.is_duplicate
    assert outcome.existing_document_id == "doc-abc"
    assert outcome.match_type == "content_hash"
@pytest.mark.asyncio
async def test_check_duplicate_redis_url_hit():
    """A canonical URL seeded in the Redis fake matches even when the hash is new."""
    article_url = "https://example.com/article"
    cache = FakeRedis({_url_dedupe_key(article_url): "doc-xyz"})
    db = FakePool()

    outcome = await check_duplicate(
        db,
        cache,
        content_hash="newhash",
        canonical_url=article_url,
    )

    assert outcome.is_duplicate
    assert outcome.existing_document_id == "doc-xyz"
    assert outcome.match_type == "canonical_url"
@pytest.mark.asyncio
async def test_check_duplicate_pg_hash_fallback():
    """A hash absent from Redis but present in the pool fake is still a duplicate."""
    cache = FakeRedis()
    db = FakePool({"pghash": {"id": "doc-pg1"}})

    outcome = await check_duplicate(db, cache, content_hash="pghash")

    assert outcome.is_duplicate
    assert outcome.existing_document_id == "doc-pg1"
    assert outcome.match_type == "content_hash"
    # The hit from the pool should also have been written back into Redis.
    warmed_key = _hash_dedupe_key("pghash")
    assert cache._data.get(warmed_key) == "doc-pg1"
@pytest.mark.asyncio
async def test_check_duplicate_pg_url_fallback():
    """A canonical URL known only to the pool fake is reported as a duplicate."""
    filing_url = "https://example.com/filing"
    cache = FakeRedis()
    # The pool fake keys rows by the first query argument, here the URL itself.
    db = FakePool({filing_url: {"id": "doc-pg2"}})

    outcome = await check_duplicate(
        db,
        cache,
        content_hash="nomatch",
        canonical_url=filing_url,
    )

    assert outcome.is_duplicate
    assert outcome.existing_document_id == "doc-pg2"
    assert outcome.match_type == "canonical_url"
@pytest.mark.asyncio
async def test_dedupe_items_partitions_correctly():
    """dedupe_items should split items into new and duplicate groups."""
    known_hash = "existinghash"
    cache = FakeRedis({_hash_dedupe_key(known_hash): "doc-old"})
    db = FakePool()

    # (title, content_hash, url) triples; only the middle one is already known.
    specs = [
        ("New Article", "newhash", "https://a.com/1"),
        ("Dup Article", known_hash, "https://b.com/2"),
        ("Another New", "anothernew", "https://c.com/3"),
    ]
    batch = [
        {"title": title, "content_hash": digest, "url": link}
        for title, digest, link in specs
    ]

    fresh, repeated = await dedupe_items(db, cache, batch)

    assert len(fresh) == 2
    assert len(repeated) == 1
    assert repeated[0]["title"] == "Dup Article"
    assert repeated[0]["_dedupe_existing_id"] == "doc-old"
@pytest.mark.asyncio
async def test_mark_as_seen_sets_redis_keys():
    """mark_as_seen stores the document id under both the hash key and the URL key."""
    cache = FakeRedis()
    page_url = "https://example.com/page"

    await mark_as_seen(
        cache,
        content_hash="hash123",
        canonical_url=page_url,
        document_id="doc-new",
    )

    stored = cache._data
    assert stored[_hash_dedupe_key("hash123")] == "doc-new"
    assert stored[_url_dedupe_key(page_url)] == "doc-new"
@pytest.mark.asyncio
async def test_mark_as_seen_handles_none_url():
    """With no canonical URL, mark_as_seen stores only the hash key."""
    cache = FakeRedis()

    await mark_as_seen(
        cache,
        content_hash="hash456",
        canonical_url=None,
        document_id="doc-x",
    )

    stored = cache._data
    assert stored[_hash_dedupe_key("hash456")] == "doc-x"
    # Exactly one entry means no URL key was written.
    assert len(stored) == 1