"""Tests for shared canonical URL normalization and content hashing. Validates normalize_url, content_hash, and content_hash_str from services.shared.content. Requirements: 3.2, 3.3 """ import hashlib from services.shared.content import content_hash, content_hash_str, normalize_url class TestNormalizeUrl: def test_lowercases_scheme_and_host(self): assert normalize_url("HTTPS://Example.COM/path") == "https://example.com/path" def test_strips_trailing_slash(self): assert normalize_url("https://example.com/path/") == "https://example.com/path" def test_strips_fragment(self): result = normalize_url("https://example.com/path#section") assert "#" not in result assert result == "https://example.com/path" def test_preserves_query(self): assert normalize_url("https://example.com/path?q=test") == "https://example.com/path?q=test" def test_sorts_query_params(self): result = normalize_url("https://example.com/path?z=1&a=2") assert result == "https://example.com/path?a=2&z=1" def test_preserves_non_standard_port(self): result = normalize_url("https://example.com:8443/path") assert ":8443" in result def test_strips_default_port_443(self): result = normalize_url("https://example.com:443/path") assert ":443" not in result def test_strips_default_port_80(self): result = normalize_url("http://example.com:80/path") assert ":80" not in result def test_root_path(self): assert normalize_url("https://example.com") == "https://example.com/" def test_defaults_scheme_to_https(self): result = normalize_url("//example.com/path") assert result.startswith("https://") def test_deterministic_for_same_input(self): url = "https://example.com/article?b=2&a=1#frag" assert normalize_url(url) == normalize_url(url) class TestContentHash: def test_returns_sha256_hex(self): data = b"hello world" expected = hashlib.sha256(data).hexdigest() assert content_hash(data) == expected def test_deterministic(self): data = b"test content" assert content_hash(data) == content_hash(data) def test_different_content_different_hash(self): assert content_hash(b"aaa") != content_hash(b"bbb") def test_empty_bytes(self): result = content_hash(b"") assert len(result) == 64 # SHA-256 hex length class TestContentHashStr: def test_matches_manual_sha256(self): text = "hello world" expected = hashlib.sha256(text.encode("utf-8")).hexdigest() assert content_hash_str(text) == expected def test_deterministic(self): assert content_hash_str("test") == content_hash_str("test") def test_different_text_different_hash(self): assert content_hash_str("aaa") != content_hash_str("bbb")