"""Tests for the web scrape adapter. Validates URL normalization, HTML metadata extraction, body text extraction, and adapter result construction. """ import pytest from services.adapters.web_scrape_adapter import ( WebScrapeAdapter, extract_body_text, extract_metadata_from_html, ) from services.shared.content import normalize_url SAMPLE_HTML = """ Apple Q2 Earnings Beat Expectations

Apple Q2 Earnings Beat Expectations

Apple Inc. reported quarterly revenue of $95 billion, exceeding analyst estimates.

The company saw strong growth in its services division and iPhone sales.

""" MINIMAL_HTML = """

Short content.

""" class TestNormalizeUrl: def test_basic_normalization(self): assert normalize_url("HTTPS://Example.COM/path") == "https://example.com/path" def test_strips_trailing_slash(self): assert normalize_url("https://example.com/path/") == "https://example.com/path" def test_strips_fragment(self): result = normalize_url("https://example.com/path#section") assert "#" not in result def test_preserves_query(self): result = normalize_url("https://example.com/path?q=test") assert result == "https://example.com/path?q=test" def test_preserves_non_standard_port(self): result = normalize_url("https://example.com:8443/path") assert ":8443" in result def test_root_path(self): result = normalize_url("https://example.com") assert result == "https://example.com/" class TestExtractMetadataFromHtml: def test_extracts_title(self): meta = extract_metadata_from_html(SAMPLE_HTML, "https://technews.example.com/article") assert meta["title"] == "Apple Q2 Earnings Beat Expectations" def test_extracts_author(self): meta = extract_metadata_from_html(SAMPLE_HTML, "https://technews.example.com/article") assert meta["author"] == "Jane Reporter" def test_extracts_publisher(self): meta = extract_metadata_from_html(SAMPLE_HTML, "https://technews.example.com/article") assert meta["publisher"] == "TechNews" def test_extracts_published_at(self): meta = extract_metadata_from_html(SAMPLE_HTML, "https://technews.example.com/article") assert meta["published_at"] == "2026-04-10T14:00:00Z" def test_extracts_canonical_url(self): meta = extract_metadata_from_html(SAMPLE_HTML, "https://technews.example.com/article") assert meta["canonical_url"] == "https://technews.example.com/apple-q2-earnings" def test_extracts_language(self): meta = extract_metadata_from_html(SAMPLE_HTML, "https://technews.example.com/article") assert meta["language"] == "en" def test_fallback_publisher_from_hostname(self): meta = extract_metadata_from_html(MINIMAL_HTML, "https://example.com/page") assert meta["publisher"] == "example.com" def test_fallback_title_empty(self): meta = extract_metadata_from_html(MINIMAL_HTML, "https://example.com/page") assert meta["title"] == "" class TestExtractBodyText: def test_extracts_article_content(self): text = extract_body_text(SAMPLE_HTML) assert "Apple Inc. reported quarterly revenue" in text assert "strong growth" in text def test_strips_nav_and_footer(self): text = extract_body_text(SAMPLE_HTML) assert "Navigation links here" not in text assert "Copyright 2026" not in text def test_strips_script_and_style(self): html = "

Content

" text = extract_body_text(html) assert "alert" not in text assert "Content" in text def test_minimal_html(self): text = extract_body_text(MINIMAL_HTML) assert "Short content." in text class TestWebScrapeAdapterSourceType: def test_source_type(self): adapter = WebScrapeAdapter() assert adapter.source_type() == "web_scrape" def test_bucket_name(self): adapter = WebScrapeAdapter() assert adapter.bucket_name() == "stonks-raw-news" class TestWebScrapeAdapterErrorResult: def test_error_on_no_urls(self): adapter = WebScrapeAdapter() result = adapter._error_result("AAPL", "No URLs configured", 0) assert not result.ok assert result.error == "No URLs configured" assert result.source_type == "web_scrape" assert result.ticker == "AAPL" @pytest.mark.asyncio async def test_fetch_no_urls_configured(): adapter = WebScrapeAdapter() result = await adapter.fetch("AAPL", {}) assert not result.ok assert result.error is not None assert "No URLs configured" in result.error