"""Tests for the web scrape adapter.

Validates URL normalization, HTML metadata extraction, body text extraction,
and adapter result construction.
"""

import pytest

from services.adapters.web_scrape_adapter import (
    WebScrapeAdapter,
    extract_body_text,
    extract_metadata_from_html,
)
from services.shared.content import normalize_url

# NOTE(review): the markup of both fixtures below was reconstructed from the
# values the tests assert on (title, author, publisher, publish date,
# canonical URL, language, nav/footer text). The specific tags/attributes
# chosen here are an assumption about what the extractor reads -- confirm
# against extract_metadata_from_html / extract_body_text and adjust if the
# adapter looks at different tags.
SAMPLE_HTML = """
<html lang="en">
<head>
  <title>Apple Q2 Earnings Beat Expectations</title>
  <meta name="author" content="Jane Reporter">
  <meta property="og:site_name" content="TechNews">
  <meta property="article:published_time" content="2026-04-10T14:00:00Z">
  <link rel="canonical" href="https://technews.example.com/apple-q2-earnings">
</head>
<body>
  <nav>Navigation links here</nav>
  <article>
    <p>Apple Inc. reported quarterly revenue of $95 billion, exceeding analyst estimates.</p>
    <p>The company saw strong growth in its services division and iPhone sales.</p>
  </article>
  <footer>Copyright 2026</footer>
</body>
</html>
"""

# Deliberately carries no <title> and no publisher metadata so the fallback
# tests (empty title, hostname as publisher) have something to exercise.
MINIMAL_HTML = """
<html>
<body>
  <p>Short content.</p>
</body>
</html>
"""


class TestNormalizeUrl:
    """Checks for ``normalize_url``."""

    def test_basic_normalization(self):
        """Scheme and host are lowercased."""
        assert normalize_url("HTTPS://Example.COM/path") == "https://example.com/path"

    def test_strips_trailing_slash(self):
        """A trailing slash on a non-root path is removed."""
        assert normalize_url("https://example.com/path/") == "https://example.com/path"

    def test_strips_fragment(self):
        """The ``#fragment`` part is dropped."""
        result = normalize_url("https://example.com/path#section")
        assert "#" not in result

    def test_preserves_query(self):
        """The query string is kept unchanged."""
        result = normalize_url("https://example.com/path?q=test")
        assert result == "https://example.com/path?q=test"

    def test_preserves_non_standard_port(self):
        """An explicit non-default port survives normalization."""
        result = normalize_url("https://example.com:8443/path")
        assert ":8443" in result

    def test_root_path(self):
        """A URL without a path normalizes to the root path ``/``."""
        result = normalize_url("https://example.com")
        assert result == "https://example.com/"


class TestExtractMetadataFromHtml:
    """Checks for ``extract_metadata_from_html``."""

    def test_extracts_title(self):
        """The ``title`` key holds the page title."""
        meta = extract_metadata_from_html(SAMPLE_HTML, "https://technews.example.com/article")
        assert meta["title"] == "Apple Q2 Earnings Beat Expectations"

    def test_extracts_author(self):
        """The ``author`` key holds the article author."""
        meta = extract_metadata_from_html(SAMPLE_HTML, "https://technews.example.com/article")
        assert meta["author"] == "Jane Reporter"

    def test_extracts_publisher(self):
        """The ``publisher`` key holds the site name declared in the page."""
        meta = extract_metadata_from_html(SAMPLE_HTML, "https://technews.example.com/article")
        assert meta["publisher"] == "TechNews"

    def test_extracts_published_at(self):
        """The ``published_at`` key holds the declared publication timestamp."""
        meta = extract_metadata_from_html(SAMPLE_HTML, "https://technews.example.com/article")
        assert meta["published_at"] == "2026-04-10T14:00:00Z"

    def test_extracts_canonical_url(self):
        """The ``canonical_url`` key holds the declared canonical URL, not the fetch URL."""
        meta = extract_metadata_from_html(SAMPLE_HTML, "https://technews.example.com/article")
        assert meta["canonical_url"] == "https://technews.example.com/apple-q2-earnings"

    def test_extracts_language(self):
        """The ``language`` key holds the document language."""
        meta = extract_metadata_from_html(SAMPLE_HTML, "https://technews.example.com/article")
        assert meta["language"] == "en"

    def test_fallback_publisher_from_hostname(self):
        """Without publisher metadata, the URL hostname is used as publisher."""
        meta = extract_metadata_from_html(MINIMAL_HTML, "https://example.com/page")
        assert meta["publisher"] == "example.com"

    def test_fallback_title_empty(self):
        """Without a title in the page, ``title`` is the empty string."""
        meta = extract_metadata_from_html(MINIMAL_HTML, "https://example.com/page")
        assert meta["title"] == ""


class TestExtractBodyText:
    """Checks for ``extract_body_text``."""

    def test_extracts_article_content(self):
        """Article paragraphs appear in the extracted text."""
        text = extract_body_text(SAMPLE_HTML)
        assert "Apple Inc. reported quarterly revenue" in text
        assert "strong growth" in text

    def test_strips_nav_and_footer(self):
        """Navigation and footer text is excluded from the extracted text."""
        text = extract_body_text(SAMPLE_HTML)
        assert "Navigation links here" not in text
        assert "Copyright 2026" not in text

    def test_strips_script_and_style(self):
        """Script and style contents are excluded; visible text is kept."""
        # NOTE(review): inline markup reconstructed from the assertions below
        # (visible "Content", a script containing "alert") -- confirm.
        html = "<p>Content</p><script>alert('x')</script><style>p { color: red; }</style>"
        text = extract_body_text(html)
        assert "alert" not in text
        assert "Content" in text

    def test_minimal_html(self):
        """A bare page still yields its paragraph text."""
        text = extract_body_text(MINIMAL_HTML)
        assert "Short content." in text


class TestWebScrapeAdapterSourceType:
    """Checks the adapter's identifying constants."""

    def test_source_type(self):
        """``source_type()`` returns ``"web_scrape"``."""
        adapter = WebScrapeAdapter()
        assert adapter.source_type() == "web_scrape"

    def test_bucket_name(self):
        """``bucket_name()`` returns ``"stonks-raw-news"``."""
        adapter = WebScrapeAdapter()
        assert adapter.bucket_name() == "stonks-raw-news"


class TestWebScrapeAdapterErrorResult:
    """Checks the result object built by ``_error_result``."""

    def test_error_on_no_urls(self):
        """The error result is not ok and echoes ticker, message and source type."""
        adapter = WebScrapeAdapter()
        result = adapter._error_result("AAPL", "No URLs configured", 0)
        assert not result.ok
        assert result.error == "No URLs configured"
        assert result.source_type == "web_scrape"
        assert result.ticker == "AAPL"


@pytest.mark.asyncio
async def test_fetch_no_urls_configured():
    """``fetch`` with an empty config returns a failed result naming the cause."""
    adapter = WebScrapeAdapter()
    result = await adapter.fetch("AAPL", {})
    assert not result.ok
    assert result.error is not None
    assert "No URLs configured" in result.error