"""Canonical URL normalization and content hashing utilities. Provides consistent URL canonicalization and SHA-256 content hashing across all ingestion adapters and pipeline stages. Requirements: 3.2, 3.3 """ import hashlib from urllib.parse import parse_qsl, urlencode, urlparse def normalize_url(url: str) -> str: """Canonical URL normalization. - Lowercases scheme and host - Strips fragments - Strips trailing slashes from path (preserves root "/") - Strips default ports (80, 443) - Sorts query parameters for deterministic comparison - Defaults scheme to https if missing """ parsed = urlparse(url) scheme = (parsed.scheme or "https").lower() netloc = (parsed.hostname or "").lower() if parsed.port and parsed.port not in (80, 443): netloc = f"{netloc}:{parsed.port}" path = parsed.path.rstrip("/") or "/" # Sort query params for deterministic ordering query = urlencode(sorted(parse_qsl(parsed.query))) normalized = f"{scheme}://{netloc}{path}" if query: normalized = f"{normalized}?{query}" return normalized def content_hash(data: bytes) -> str: """Compute a stable SHA-256 hex digest for raw content bytes.""" return hashlib.sha256(data).hexdigest() def content_hash_str(text: str, encoding: str = "utf-8") -> str: """Compute a stable SHA-256 hex digest for a text string.""" return hashlib.sha256(text.encode(encoding)).hexdigest()