Files
stonks-oracle/services/shared/content.py
T

44 lines
1.4 KiB
Python

"""Canonical URL normalization and content hashing utilities.
Provides consistent URL canonicalization and SHA-256 content hashing
across all ingestion adapters and pipeline stages.
Requirements: 3.2, 3.3
"""
import hashlib
from urllib.parse import parse_qsl, urlencode, urlparse
def normalize_url(url: str) -> str:
    """Return a canonical form of *url* for deterministic comparison.

    Normalization steps:

    - Lowercases scheme and host
    - Strips fragments
    - Strips trailing slashes from path (preserves root "/")
    - Strips the port only when it is the default for the URL's own scheme
      (80 for http, 443 for https); any other port is kept
    - Re-brackets IPv6 host literals so the result is still a valid URL
    - Sorts query parameters for deterministic comparison
    - Defaults scheme to https if missing

    Notes:
        Userinfo (``user:pass@``) is not carried over, because the authority
        is rebuilt from the hostname and port only. Query parameters with a
        blank value are dropped (``parse_qsl`` default behaviour). For input
        without ``//`` the host is parsed as part of the path and is therefore
        not lowercased.

    Args:
        url: The URL to canonicalize.

    Returns:
        The normalized URL string.

    Raises:
        ValueError: If the URL contains a port that is not a valid number
            (raised by ``urllib.parse`` when the port is read).
    """
    parsed = urlparse(url)
    scheme = (parsed.scheme or "https").lower()

    host = (parsed.hostname or "").lower()
    # ``hostname`` strips the brackets from IPv6 literals; put them back so
    # the host cannot be confused with the ":port" suffix.
    if ":" in host:
        host = f"[{host}]"

    # A port is only redundant when it is the default for *this* scheme:
    # https://host:80 and http://host:443 are not the default endpoints.
    default_port = {"http": 80, "https": 443}.get(scheme)
    port = parsed.port
    netloc = f"{host}:{port}" if port and port != default_port else host

    path = parsed.path.rstrip("/") or "/"

    # Sort query params for deterministic ordering
    query = urlencode(sorted(parse_qsl(parsed.query)))

    normalized = f"{scheme}://{netloc}{path}"
    if query:
        normalized = f"{normalized}?{query}"
    return normalized
def content_hash(data: bytes) -> str:
    """Return the SHA-256 digest of *data* as a lowercase hex string."""
    hasher = hashlib.sha256()
    hasher.update(data)
    return hasher.hexdigest()
def content_hash_str(text: str, encoding: str = "utf-8") -> str:
    """Return the SHA-256 hex digest of *text* after encoding it.

    Args:
        text: The string to hash.
        encoding: Codec used to turn *text* into bytes before hashing.

    Returns:
        The digest as a lowercase hexadecimal string.
    """
    raw_bytes = text.encode(encoding)
    digest = hashlib.sha256(raw_bytes)
    return digest.hexdigest()