phase 14-15: docker build validation and helm deployment

This commit is contained in:
Celes Renata
2026-04-11 11:59:45 -07:00
parent 7394d241c9
commit ce10afa034
179 changed files with 32559 additions and 576 deletions
+147
View File
@@ -0,0 +1,147 @@
"""Tests for the web scrape adapter.
Validates URL normalization, HTML metadata extraction, body text extraction,
and adapter result construction.
"""
import pytest
from services.adapters.web_scrape_adapter import (
WebScrapeAdapter,
extract_body_text,
extract_metadata_from_html,
)
from services.shared.content import normalize_url
# Rich fixture page: the <head> carries a <title>, Open Graph title /
# site_name / description tags, an author meta tag, an
# article:published_time tag and a canonical link; the <body> wraps an
# <article> between <nav> and <footer> boilerplate.
SAMPLE_HTML = """<!DOCTYPE html>
<html lang="en">
<head>
<title>Apple Q2 Earnings Beat Expectations</title>
<meta property="og:title" content="Apple Q2 Earnings Beat Expectations" />
<meta property="og:site_name" content="TechNews" />
<meta property="og:description" content="Apple reported strong Q2 results." />
<meta name="author" content="Jane Reporter" />
<meta property="article:published_time" content="2026-04-10T14:00:00Z" />
<link rel="canonical" href="https://technews.example.com/apple-q2-earnings" />
</head>
<body>
<nav>Navigation links here</nav>
<article>
<h1>Apple Q2 Earnings Beat Expectations</h1>
<p>Apple Inc. reported quarterly revenue of $95 billion, exceeding analyst estimates.</p>
<p>The company saw strong growth in its services division and iPhone sales.</p>
</article>
<footer>Copyright 2026 TechNews</footer>
</body>
</html>"""
# Bare fixture page: no <head>, no metadata tags, a single paragraph.
# Used by the tests below to exercise fallback behaviour.
MINIMAL_HTML = """<html><body><p>Short content.</p></body></html>"""
class TestNormalizeUrl:
    """Expectations for ``normalize_url`` on a handful of URL shapes."""

    def test_basic_normalization(self):
        """Upper-case scheme and mixed-case host are expected in lower case."""
        normalized = normalize_url("HTTPS://Example.COM/path")
        assert normalized == "https://example.com/path"

    def test_strips_trailing_slash(self):
        """A trailing slash on a non-root path is expected to be removed."""
        normalized = normalize_url("https://example.com/path/")
        assert normalized == "https://example.com/path"

    def test_strips_fragment(self):
        """No ``#`` may remain once a URL with a fragment is normalized."""
        normalized = normalize_url("https://example.com/path#section")
        assert "#" not in normalized

    def test_preserves_query(self):
        """The query string is expected to come back unchanged."""
        original = "https://example.com/path?q=test"
        normalized = normalize_url(original)
        assert normalized == "https://example.com/path?q=test"

    def test_preserves_non_standard_port(self):
        """An explicit port of 8443 must still appear in the output."""
        normalized = normalize_url("https://example.com:8443/path")
        assert ":8443" in normalized

    def test_root_path(self):
        """A URL without any path is expected to gain a single ``/``."""
        normalized = normalize_url("https://example.com")
        assert normalized == "https://example.com/"
class TestExtractMetadataFromHtml:
    """Expectations for ``extract_metadata_from_html`` on both fixtures."""

    # Page URLs handed to the extractor alongside each fixture document.
    _ARTICLE_URL = "https://technews.example.com/article"
    _MINIMAL_URL = "https://example.com/page"

    def _article_field(self, key):
        """Run a fresh extraction on SAMPLE_HTML and return the *key* entry."""
        extracted = extract_metadata_from_html(SAMPLE_HTML, self._ARTICLE_URL)
        return extracted[key]

    def _minimal_field(self, key):
        """Run a fresh extraction on MINIMAL_HTML and return the *key* entry."""
        extracted = extract_metadata_from_html(MINIMAL_HTML, self._MINIMAL_URL)
        return extracted[key]

    def test_extracts_title(self):
        """The ``title`` entry must equal the article headline."""
        assert self._article_field("title") == "Apple Q2 Earnings Beat Expectations"

    def test_extracts_author(self):
        """The ``author`` entry must equal the author meta tag content."""
        assert self._article_field("author") == "Jane Reporter"

    def test_extracts_publisher(self):
        """The ``publisher`` entry must equal the og:site_name content."""
        assert self._article_field("publisher") == "TechNews"

    def test_extracts_published_at(self):
        """The ``published_at`` entry must equal the published_time content."""
        assert self._article_field("published_at") == "2026-04-10T14:00:00Z"

    def test_extracts_canonical_url(self):
        """The ``canonical_url`` entry must equal the canonical link href."""
        expected = "https://technews.example.com/apple-q2-earnings"
        assert self._article_field("canonical_url") == expected

    def test_extracts_language(self):
        """The ``language`` entry must equal the ``lang`` attribute of <html>."""
        assert self._article_field("language") == "en"

    def test_fallback_publisher_from_hostname(self):
        """Without publisher metadata, the page URL's hostname is expected."""
        assert self._minimal_field("publisher") == "example.com"

    def test_fallback_title_empty(self):
        """Without any title markup, an empty string is expected."""
        assert self._minimal_field("title") == ""
class TestExtractBodyText:
    """Expectations for ``extract_body_text`` on fixture and inline HTML."""

    def test_extracts_article_content(self):
        """Both article paragraphs must show up in the extracted text."""
        body = extract_body_text(SAMPLE_HTML)
        for fragment in ("Apple Inc. reported quarterly revenue", "strong growth"):
            assert fragment in body

    def test_strips_nav_and_footer(self):
        """Text from the <nav> and <footer> elements must be absent."""
        body = extract_body_text(SAMPLE_HTML)
        for boilerplate in ("Navigation links here", "Copyright 2026"):
            assert boilerplate not in body

    def test_strips_script_and_style(self):
        """Script content must be dropped while paragraph text is kept."""
        document = (
            "<html><body><script>alert('x')</script><style>.x{}</style><p>Content</p></body></html>"
        )
        body = extract_body_text(document)
        assert "alert" not in body
        assert "Content" in body

    def test_minimal_html(self):
        """The lone paragraph of MINIMAL_HTML must be extracted."""
        body = extract_body_text(MINIMAL_HTML)
        assert "Short content." in body
class TestWebScrapeAdapterSourceType:
    """Checks the identifying strings reported by ``WebScrapeAdapter``."""

    def test_source_type(self):
        """``source_type()`` must report ``"web_scrape"``."""
        assert WebScrapeAdapter().source_type() == "web_scrape"

    def test_bucket_name(self):
        """``bucket_name()`` must report ``"stonks-raw-news"``."""
        assert WebScrapeAdapter().bucket_name() == "stonks-raw-news"
class TestWebScrapeAdapterErrorResult:
    """Checks the result object built by ``WebScrapeAdapter._error_result``."""

    def test_error_on_no_urls(self):
        """The error result must be not-ok and echo ticker, message and source type."""
        scraper = WebScrapeAdapter()
        failure = scraper._error_result("AAPL", "No URLs configured", 0)
        assert not failure.ok
        assert failure.error == "No URLs configured"
        assert failure.source_type == "web_scrape"
        assert failure.ticker == "AAPL"
@pytest.mark.asyncio
async def test_fetch_no_urls_configured():
    """``fetch`` called with an empty mapping must fail with a "No URLs configured" error.

    The second argument is presumably the adapter configuration; here it is
    simply an empty dict.
    """
    outcome = await WebScrapeAdapter().fetch("AAPL", {})
    assert not outcome.ok
    message = outcome.error
    assert message is not None
    assert "No URLs configured" in message