"""Canonical URL normalization and content hashing utilities.

Provides consistent URL canonicalization and SHA-256 content hashing
across all ingestion adapters and pipeline stages.

Requirements: 3.2, 3.3
"""
import hashlib
from urllib.parse import parse_qsl, urlencode, urlparse


def normalize_url(url: str) -> str:
    """Return a canonical form of *url* for deterministic comparison.

    Normalization steps:

    - Lowercases scheme and host
    - Defaults scheme to https if missing; scheme-less input such as
      ``example.com/path`` is treated as host + path
    - Strips fragments
    - Strips userinfo (``user:pass@``), since only the hostname is kept
    - Strips trailing slashes from path (preserves root "/")
    - Strips the port only when it is the default for the scheme
      (80 for http, 443 for https)
    - Re-brackets IPv6 host literals
    - Sorts query parameters, keeping parameters with blank values

    Args:
        url: The URL to normalize.

    Returns:
        The normalized URL string.

    Raises:
        ValueError: If the URL contains an invalid port (raised by
            ``urllib.parse`` when the port is read).
    """
    parsed = urlparse(url)
    if not parsed.scheme and not parsed.netloc:
        # Without a scheme or leading "//", urlparse puts the host into the
        # path ("example.com/a"), so it would never be lowercased. Re-parse
        # as a network-path reference so the host is recognized.
        parsed = urlparse(f"//{url}")

    scheme = (parsed.scheme or "https").lower()

    host = (parsed.hostname or "").lower()
    if ":" in host:
        # .hostname strips the brackets from IPv6 literals; restore them so
        # the host stays distinguishable from the port separator.
        host = f"[{host}]"

    # A port is only redundant when it is the default for *this* scheme;
    # http://host:443 is not the same resource as http://host.
    default_port = {"http": 80, "https": 443}.get(scheme)
    port = parsed.port
    if port is None or port == default_port:
        netloc = host
    else:
        netloc = f"{host}:{port}"

    path = parsed.path
    if parsed.params:
        # urlparse splits ";params" off the last path segment; re-attach it
        # so URLs differing only in path parameters are not conflated.
        path = f"{path};{parsed.params}"
    path = path.rstrip("/") or "/"

    # Sort query params for deterministic ordering. keep_blank_values=True
    # stops parameters like "a=" from being silently dropped.
    query = urlencode(sorted(parse_qsl(parsed.query, keep_blank_values=True)))

    normalized = f"{scheme}://{netloc}{path}"
    if query:
        normalized = f"{normalized}?{query}"
    return normalized


def content_hash(data: bytes) -> str:
    """Return the SHA-256 digest of *data* as a lowercase hex string.

    The same bytes always produce the same digest, so the result can be
    used as a stable content identifier.
    """
    hasher = hashlib.sha256()
    hasher.update(data)
    return hasher.hexdigest()


def content_hash_str(text: str, encoding: str = "utf-8") -> str:
    """Return the SHA-256 hex digest of *text* after encoding it to bytes.

    Args:
        text: The string to hash.
        encoding: Codec used to turn *text* into bytes before hashing;
            different encodings of the same text give different digests.

    Returns:
        The digest as a lowercase hex string.
    """
    encoded = text.encode(encoding)
    digest = hashlib.sha256(encoded)
    return digest.hexdigest()