phase 14-15: docker build validation and helm deployment
This commit is contained in:
@@ -0,0 +1,43 @@
|
||||
"""Canonical URL normalization and content hashing utilities.
|
||||
|
||||
Provides consistent URL canonicalization and SHA-256 content hashing
|
||||
across all ingestion adapters and pipeline stages.
|
||||
|
||||
Requirements: 3.2, 3.3
|
||||
"""
|
||||
import hashlib
|
||||
from urllib.parse import parse_qsl, urlencode, urlparse
|
||||
|
||||
|
||||
# Ports implied by the scheme; only these are redundant in a URL.
_DEFAULT_PORTS = {"http": 80, "https": 443}


def normalize_url(url: str) -> str:
    """Return a canonical form of *url* for comparison and de-duplication.

    Normalization steps:

    - Lowercases scheme and host
    - Defaults scheme to https if missing; for a scheme-less ``host/path``
      string the first path segment is treated as the host
    - Strips fragments
    - Strips trailing slashes from path (preserves root "/")
    - Strips the port only when it is the default for the scheme
      (80 for http, 443 for https)
    - Keeps the brackets around IPv6 literal hosts
    - Sorts query parameters for deterministic comparison, keeping
      parameters whose value is empty

    Userinfo (``user:pass@``) and ``;params`` on the last path segment are
    not carried into the result. Query parameters are decoded and
    re-encoded, so their percent-encoding may change form.

    A bare ``host:port/path`` string without a scheme is not reliably
    recognised, because ``urlparse`` may read the text before the colon as
    a scheme.

    Args:
        url: Absolute URL, scheme-relative URL (``//host/path``) or bare
            ``host/path`` string.

    Returns:
        The normalized URL string.

    Raises:
        ValueError: Propagated from ``urllib.parse`` when the authority or
            port is malformed.
    """
    parsed = urlparse(url)

    # Without a scheme or a leading "//", urlparse puts the host into the
    # path. Re-parse as a scheme-relative URL so the host is lowercased and
    # the path/port rules apply to it like any other URL.
    if (
        not parsed.scheme
        and not parsed.netloc
        and parsed.path
        and not parsed.path.startswith("/")
    ):
        parsed = urlparse(f"//{url}")

    scheme = (parsed.scheme or "https").lower()

    host = (parsed.hostname or "").lower()
    if ":" in host:
        # ``hostname`` drops the brackets of an IPv6 literal; they are
        # required to keep the address distinguishable from the port.
        host = f"[{host}]"

    # A port is redundant only if it is the default for *this* scheme;
    # e.g. http on 443 is a different origin from http on 80.
    port = parsed.port
    if port and port != _DEFAULT_PORTS.get(scheme):
        netloc = f"{host}:{port}"
    else:
        netloc = host

    path = parsed.path.rstrip("/") or "/"

    # Sort query params for deterministic ordering. Blank values are kept
    # so that "?a=&b=1" does not silently lose the "a" parameter.
    query = urlencode(sorted(parse_qsl(parsed.query, keep_blank_values=True)))

    normalized = f"{scheme}://{netloc}{path}"
    if query:
        normalized = f"{normalized}?{query}"
    return normalized
|
||||
|
||||
|
||||
def content_hash(data: bytes) -> str:
    """Return the SHA-256 digest of *data* as a lowercase hex string.

    Args:
        data: Raw content bytes to hash.

    Returns:
        The 64-character hexadecimal SHA-256 digest.
    """
    hasher = hashlib.sha256()
    hasher.update(data)
    return hasher.hexdigest()
|
||||
|
||||
|
||||
def content_hash_str(text: str, encoding: str = "utf-8") -> str:
    """Return the SHA-256 hex digest of *text* after encoding it to bytes.

    Args:
        text: The string to hash.
        encoding: Codec used to turn *text* into bytes before hashing.

    Returns:
        The 64-character hexadecimal SHA-256 digest of the encoded text.
    """
    encoded = text.encode(encoding)
    digest = hashlib.sha256(encoded)
    return digest.hexdigest()
|
||||
Reference in New Issue
Block a user