phase 14-15: docker build validation and helm deployment

2026-04-11 11:59:45 -07:00
parent 7394d241c9
commit ce10afa034
179 changed files with 32559 additions and 576 deletions
@@ -0,0 +1,198 @@
+"""Cross-source deduplication for articles and filings.
+
+Detects duplicate documents across different source types (news_api,
+filings_api, web_scrape) using a layered approach:
+
+1. Redis fast-path: check content_hash and canonical_url markers for
+   recently-seen documents (TTL-bounded, cheap).
+2. PostgreSQL fallback: query the documents table by canonical_url or
+   content_hash for durable cross-source matching.
+
+When a duplicate is detected the caller receives the existing document_id
+so it can link additional company mentions without re-inserting the document.
+
+Requirements: 3.2, 3.3
+"""
+from __future__ import annotations
+
+import logging
+from dataclasses import dataclass
+from typing import Any
+
+import asyncpg
+import redis.asyncio as aioredis
+
+from services.shared.content import content_hash_str, normalize_url
+from services.shared.redis_keys import DEDUPE_PREFIX
+
+# Module logger, registered under the fixed name "dedupe".
+logger = logging.getLogger("dedupe")
+
+# Redis TTL for dedupe markers, in seconds (24 hours). Used as the ``ex``
+# expiry whenever a marker is written, so a Redis-only match can exist for
+# at most this long after the marker was last set.
+DEDUPE_TTL_SECONDS: int = 86400
+
+
+def _url_dedupe_key(canonical_url: str) -> str:
+    """Build a Redis key for URL-based deduplication."""
+    return f"{DEDUPE_PREFIX}:url:{content_hash_str(canonical_url)}"
+
+
+def _hash_dedupe_key(content_hash: str) -> str:
+    """Build a Redis key for content-hash-based deduplication."""
+    return f"{DEDUPE_PREFIX}:{content_hash}"
+
+
+@dataclass
+class DedupeResult:
+    """Result of a deduplication check.
+
+    Only ``is_duplicate`` is required; the other two fields default to
+    ``None``, which is how a "not a duplicate" result is represented.
+    """
+
+    # True when an existing document matched the one being checked.
+    is_duplicate: bool
+    # Identifier of the matching document as a string; None when nothing matched.
+    existing_document_id: str | None = None
+    # Which lookup produced the match.
+    match_type: str | None = None  # "content_hash" | "canonical_url" | None
+
+
+async def check_duplicate(
+    pool: asyncpg.Pool,
+    rds: aioredis.Redis,
+    *,
+    content_hash: str,
+    url: str | None = None,
+    canonical_url: str | None = None,
+) -> DedupeResult:
+    """Check whether a document is a duplicate across all source types.
+
+    Checks in order of cost:
+    1. Redis content_hash marker (fast path)
+    2. Redis canonical_url marker (fast path)
+    3. PostgreSQL documents.content_hash (durable)
+    4. PostgreSQL documents.canonical_url (cross-source)
+
+    Returns a DedupeResult indicating whether the document already exists.
+    """
+    # Resolve canonical URL if only raw URL provided
+    resolved_canonical = canonical_url or (normalize_url(url) if url else None)
+
+    # --- Redis fast path: content hash ---
+    if content_hash:
+        redis_key = _hash_dedupe_key(content_hash)
+        cached_id = await rds.get(redis_key)
+        if cached_id:
+            logger.debug("Dedupe hit (redis content_hash) for %s", content_hash[:16])
+            return DedupeResult(
+                is_duplicate=True,
+                existing_document_id=str(cached_id),
+                match_type="content_hash",
+            )
+
+    # --- Redis fast path: canonical URL ---
+    if resolved_canonical:
+        url_key = _url_dedupe_key(resolved_canonical)
+        cached_id = await rds.get(url_key)
+        if cached_id:
+            logger.debug("Dedupe hit (redis canonical_url) for %s", resolved_canonical[:60])
+            return DedupeResult(
+                is_duplicate=True,
+                existing_document_id=str(cached_id),
+                match_type="canonical_url",
+            )
+
+    # --- PostgreSQL fallback: content hash ---
+    if content_hash:
+        row = await pool.fetchrow(
+            "SELECT id FROM documents WHERE content_hash = $1 LIMIT 1",
+            content_hash,
+        )
+        if row:
+            doc_id = str(row["id"])
+            # Warm the Redis cache for future checks
+            await _set_dedupe_markers(rds, content_hash, resolved_canonical, doc_id)
+            logger.debug("Dedupe hit (pg content_hash) for %s", content_hash[:16])
+            return DedupeResult(
+                is_duplicate=True,
+                existing_document_id=doc_id,
+                match_type="content_hash",
+            )
+
+    # --- PostgreSQL fallback: canonical URL ---
+    if resolved_canonical:
+        row = await pool.fetchrow(
+            "SELECT id FROM documents WHERE canonical_url = $1 LIMIT 1",
+            resolved_canonical,
+        )
+        if row:
+            doc_id = str(row["id"])
+            await _set_dedupe_markers(rds, content_hash, resolved_canonical, doc_id)
+            logger.debug("Dedupe hit (pg canonical_url) for %s", resolved_canonical[:60])
+            return DedupeResult(
+                is_duplicate=True,
+                existing_document_id=doc_id,
+                match_type="canonical_url",
+            )
+
+    return DedupeResult(is_duplicate=False)
+
+
+async def mark_as_seen(
+    rds: aioredis.Redis,
+    *,
+    content_hash: str,
+    canonical_url: str | None,
+    document_id: str,
+) -> None:
+    """Mark a newly-persisted document in Redis for fast future dedupe checks."""
+    await _set_dedupe_markers(rds, content_hash, canonical_url, document_id)
+
+
+async def _set_dedupe_markers(
+    rds: aioredis.Redis,
+    content_hash: str | None,
+    canonical_url: str | None,
+    document_id: str,
+) -> None:
+    """Set Redis dedupe markers for both content hash and canonical URL."""
+    if content_hash:
+        await rds.set(
+            _hash_dedupe_key(content_hash), document_id, ex=DEDUPE_TTL_SECONDS
+        )
+    if canonical_url:
+        await rds.set(
+            _url_dedupe_key(canonical_url), document_id, ex=DEDUPE_TTL_SECONDS
+        )
+
+
+async def dedupe_items(
+    pool: asyncpg.Pool,
+    rds: aioredis.Redis,
+    items: list[dict[str, Any]],
+) -> tuple[list[dict[str, Any]], list[dict[str, Any]]]:
+    """Partition a list of ingestion items into new and duplicate groups.
+
+    Each item is expected to have at least one of:
+    - content_hash: SHA-256 of the raw content
+    - url / canonical_url: the document URL
+
+    Returns (new_items, duplicate_items).
+    """
+    new_items: list[dict[str, Any]] = []
+    dup_items: list[dict[str, Any]] = []
+
+    for item in items:
+        item_hash = item.get("content_hash", "")
+        item_url = item.get("url") or item.get("link")
+        item_canonical = item.get("canonical_url")
+
+        result = await check_duplicate(
+            pool,
+            rds,
+            content_hash=item_hash,
+            url=item_url,
+            canonical_url=item_canonical,
+        )
+
+        if result.is_duplicate:
+            item["_dedupe_match_type"] = result.match_type
+            item["_dedupe_existing_id"] = result.existing_document_id
+            dup_items.append(item)
+        else:
+            new_items.append(item)
+
+    return new_items, dup_items