phase 14-15: docker build validation and helm deployment

2026-04-11 11:59:45 -07:00
parent 7394d241c9
commit ce10afa034
179 changed files with 32559 additions and 576 deletions
@@ -0,0 +1,147 @@
+"""Tests for the web scrape adapter.
+
+Validates URL normalization, HTML metadata extraction, body text extraction,
+and adapter result construction.
+"""
+import pytest
+
+from services.adapters.web_scrape_adapter import (
+    WebScrapeAdapter,
+    extract_body_text,
+    extract_metadata_from_html,
+)
+from services.shared.content import normalize_url
+
+
+SAMPLE_HTML = """<!DOCTYPE html>
+<html lang="en">
+<head>
+    <title>Apple Q2 Earnings Beat Expectations</title>
+    <meta property="og:title" content="Apple Q2 Earnings Beat Expectations" />
+    <meta property="og:site_name" content="TechNews" />
+    <meta property="og:description" content="Apple reported strong Q2 results." />
+    <meta name="author" content="Jane Reporter" />
+    <meta property="article:published_time" content="2026-04-10T14:00:00Z" />
+    <link rel="canonical" href="https://technews.example.com/apple-q2-earnings" />
+</head>
+<body>
+    <nav>Navigation links here</nav>
+    <article>
+        <h1>Apple Q2 Earnings Beat Expectations</h1>
+        <p>Apple Inc. reported quarterly revenue of $95 billion, exceeding analyst estimates.</p>
+        <p>The company saw strong growth in its services division and iPhone sales.</p>
+    </article>
+    <footer>Copyright 2026 TechNews</footer>
+</body>
+</html>"""  # Article fixture: title, OpenGraph, author, publish-time and canonical tags, with nav/footer around the <article>.
+
+MINIMAL_HTML = """<html><body><p>Short content.</p></body></html>"""  # Bare fixture: no <head>, no title, no metadata tags.
+
+
+class TestNormalizeUrl:
+    def test_basic_normalization(self):
+        assert normalize_url("HTTPS://Example.COM/path") == "https://example.com/path"
+
+    def test_strips_trailing_slash(self):
+        assert normalize_url("https://example.com/path/") == "https://example.com/path"
+
+    def test_strips_fragment(self):
+        result = normalize_url("https://example.com/path#section")
+        assert "#" not in result
+
+    def test_preserves_query(self):
+        result = normalize_url("https://example.com/path?q=test")
+        assert result == "https://example.com/path?q=test"
+
+    def test_preserves_non_standard_port(self):
+        result = normalize_url("https://example.com:8443/path")
+        assert ":8443" in result
+
+    def test_root_path(self):
+        result = normalize_url("https://example.com")
+        assert result == "https://example.com/"
+
+
+class TestExtractMetadataFromHtml:
+    def test_extracts_title(self):
+        meta = extract_metadata_from_html(SAMPLE_HTML, "https://technews.example.com/article")
+        assert meta["title"] == "Apple Q2 Earnings Beat Expectations"
+
+    def test_extracts_author(self):
+        meta = extract_metadata_from_html(SAMPLE_HTML, "https://technews.example.com/article")
+        assert meta["author"] == "Jane Reporter"
+
+    def test_extracts_publisher(self):
+        meta = extract_metadata_from_html(SAMPLE_HTML, "https://technews.example.com/article")
+        assert meta["publisher"] == "TechNews"
+
+    def test_extracts_published_at(self):
+        meta = extract_metadata_from_html(SAMPLE_HTML, "https://technews.example.com/article")
+        assert meta["published_at"] == "2026-04-10T14:00:00Z"
+
+    def test_extracts_canonical_url(self):
+        meta = extract_metadata_from_html(SAMPLE_HTML, "https://technews.example.com/article")
+        assert meta["canonical_url"] == "https://technews.example.com/apple-q2-earnings"
+
+    def test_extracts_language(self):
+        meta = extract_metadata_from_html(SAMPLE_HTML, "https://technews.example.com/article")
+        assert meta["language"] == "en"
+
+    def test_fallback_publisher_from_hostname(self):
+        meta = extract_metadata_from_html(MINIMAL_HTML, "https://example.com/page")
+        assert meta["publisher"] == "example.com"
+
+    def test_fallback_title_empty(self):
+        meta = extract_metadata_from_html(MINIMAL_HTML, "https://example.com/page")
+        assert meta["title"] == ""
+
+
+class TestExtractBodyText:
+    def test_extracts_article_content(self):
+        text = extract_body_text(SAMPLE_HTML)
+        assert "Apple Inc. reported quarterly revenue" in text
+        assert "strong growth" in text
+
+    def test_strips_nav_and_footer(self):
+        text = extract_body_text(SAMPLE_HTML)
+        assert "Navigation links here" not in text
+        assert "Copyright 2026" not in text
+
+    def test_strips_script_and_style(self):
+        html = "<html><body><script>alert('x')</script><style>.x{}</style><p>Content</p></body></html>"
+        text = extract_body_text(html)
+        assert "alert" not in text
+        assert "Content" in text
+
+    def test_minimal_html(self):
+        text = extract_body_text(MINIMAL_HTML)
+        assert "Short content." in text
+
+
+class TestWebScrapeAdapterSourceType:
+    def test_source_type(self):
+        adapter = WebScrapeAdapter()
+        assert adapter.source_type() == "web_scrape"
+
+    def test_bucket_name(self):
+        adapter = WebScrapeAdapter()
+        assert adapter.bucket_name() == "stonks-raw-news"
+
+
+class TestWebScrapeAdapterErrorResult:
+    def test_error_on_no_urls(self):
+        adapter = WebScrapeAdapter()
+        result = adapter._error_result("AAPL", "No URLs configured", 0)
+        assert not result.ok
+        assert result.error == "No URLs configured"
+        assert result.source_type == "web_scrape"
+        assert result.ticker == "AAPL"
+
+
+@pytest.mark.asyncio
+async def test_fetch_no_urls_configured():
+    adapter = WebScrapeAdapter()
+    result = await adapter.fetch("AAPL", {})
+    assert not result.ok
+    assert result.error is not None
+    assert "No URLs configured" in result.error