phase 14-15: docker build validation and helm deployment
This commit is contained in:
@@ -0,0 +1,147 @@
|
||||
"""Tests for the web scrape adapter.
|
||||
|
||||
Validates URL normalization, HTML metadata extraction, body text extraction,
|
||||
and adapter result construction.
|
||||
"""
|
||||
import pytest
|
||||
|
||||
from services.adapters.web_scrape_adapter import (
|
||||
WebScrapeAdapter,
|
||||
extract_body_text,
|
||||
extract_metadata_from_html,
|
||||
)
|
||||
from services.shared.content import normalize_url
|
||||
|
||||
|
||||
# Full article page used as the primary fixture. It contains a <title>, Open
# Graph tags, an author <meta>, a publication timestamp, a canonical <link>,
# a lang attribute, and <nav>/<footer> boilerplate around the <article> body.
# The literal must contain only the HTML document itself: any stray separator
# lines inside the triple quotes would become part of the parsed page.
SAMPLE_HTML = """<!DOCTYPE html>
<html lang="en">
<head>
<title>Apple Q2 Earnings Beat Expectations</title>
<meta property="og:title" content="Apple Q2 Earnings Beat Expectations" />
<meta property="og:site_name" content="TechNews" />
<meta property="og:description" content="Apple reported strong Q2 results." />
<meta name="author" content="Jane Reporter" />
<meta property="article:published_time" content="2026-04-10T14:00:00Z" />
<link rel="canonical" href="https://technews.example.com/apple-q2-earnings" />
</head>
<body>
<nav>Navigation links here</nav>
<article>
<h1>Apple Q2 Earnings Beat Expectations</h1>
<p>Apple Inc. reported quarterly revenue of $95 billion, exceeding analyst estimates.</p>
<p>The company saw strong growth in its services division and iPhone sales.</p>
</article>
<footer>Copyright 2026 TechNews</footer>
</body>
</html>"""

# Page with no <head> and no metadata tags, only a single paragraph. The
# fallback tests in this module use it.
MINIMAL_HTML = """<html><body><p>Short content.</p></body></html>"""
|
||||
|
||||
|
||||
class TestNormalizeUrl:
    """Checks for ``normalize_url`` imported from ``services.shared.content``."""

    def test_basic_normalization(self):
        """An upper-case scheme and host are expected back in lower case."""
        normalized = normalize_url("HTTPS://Example.COM/path")
        assert normalized == "https://example.com/path"

    def test_strips_trailing_slash(self):
        """A trailing slash on a non-root path is expected to be removed."""
        normalized = normalize_url("https://example.com/path/")
        assert normalized == "https://example.com/path"

    def test_strips_fragment(self):
        """No ``#`` may remain in the normalized URL."""
        assert "#" not in normalize_url("https://example.com/path#section")

    def test_preserves_query(self):
        """The query string is expected to pass through unchanged."""
        normalized = normalize_url("https://example.com/path?q=test")
        assert normalized == "https://example.com/path?q=test"

    def test_preserves_non_standard_port(self):
        """An explicit ``:8443`` port must still appear in the output."""
        assert ":8443" in normalize_url("https://example.com:8443/path")

    def test_root_path(self):
        """A bare host is expected to come back with a ``/`` path."""
        normalized = normalize_url("https://example.com")
        assert normalized == "https://example.com/"
|
||||
|
||||
|
||||
class TestExtractMetadataFromHtml:
    """Checks the dict returned by ``extract_metadata_from_html``.

    Most tests parse ``SAMPLE_HTML``; the fallback tests parse
    ``MINIMAL_HTML``, which has no metadata tags.
    """

    # Page URLs handed to the extractor alongside the HTML.
    _SAMPLE_URL = "https://technews.example.com/article"
    _MINIMAL_URL = "https://example.com/page"

    def test_extracts_title(self):
        """``title`` matches the sample page's headline."""
        fields = extract_metadata_from_html(SAMPLE_HTML, self._SAMPLE_URL)
        assert fields["title"] == "Apple Q2 Earnings Beat Expectations"

    def test_extracts_author(self):
        """``author`` matches the sample page's author."""
        fields = extract_metadata_from_html(SAMPLE_HTML, self._SAMPLE_URL)
        assert fields["author"] == "Jane Reporter"

    def test_extracts_publisher(self):
        """``publisher`` is expected to be ``TechNews`` for the sample page."""
        fields = extract_metadata_from_html(SAMPLE_HTML, self._SAMPLE_URL)
        assert fields["publisher"] == "TechNews"

    def test_extracts_published_at(self):
        """``published_at`` carries the sample page's timestamp string."""
        fields = extract_metadata_from_html(SAMPLE_HTML, self._SAMPLE_URL)
        assert fields["published_at"] == "2026-04-10T14:00:00Z"

    def test_extracts_canonical_url(self):
        """``canonical_url`` is the canonical link, not the URL passed in."""
        fields = extract_metadata_from_html(SAMPLE_HTML, self._SAMPLE_URL)
        expected = "https://technews.example.com/apple-q2-earnings"
        assert fields["canonical_url"] == expected

    def test_extracts_language(self):
        """``language`` is expected to be ``en`` for the sample page."""
        fields = extract_metadata_from_html(SAMPLE_HTML, self._SAMPLE_URL)
        assert fields["language"] == "en"

    def test_fallback_publisher_from_hostname(self):
        """Without metadata, ``publisher`` should equal the URL's host."""
        fields = extract_metadata_from_html(MINIMAL_HTML, self._MINIMAL_URL)
        assert fields["publisher"] == "example.com"

    def test_fallback_title_empty(self):
        """Without a title in the page, ``title`` should be an empty string."""
        fields = extract_metadata_from_html(MINIMAL_HTML, self._MINIMAL_URL)
        assert fields["title"] == ""
|
||||
|
||||
|
||||
class TestExtractBodyText:
    """Checks which parts of a page ``extract_body_text`` keeps or drops."""

    def test_extracts_article_content(self):
        """Both article paragraphs must appear in the extracted text."""
        extracted = extract_body_text(SAMPLE_HTML)
        for fragment in ("Apple Inc. reported quarterly revenue", "strong growth"):
            assert fragment in extracted

    def test_strips_nav_and_footer(self):
        """Navigation and footer boilerplate must not leak into the text."""
        extracted = extract_body_text(SAMPLE_HTML)
        for boilerplate in ("Navigation links here", "Copyright 2026"):
            assert boilerplate not in extracted

    def test_strips_script_and_style(self):
        """Script source is dropped while the paragraph text survives."""
        markup = "<html><body><script>alert('x')</script><style>.x{}</style><p>Content</p></body></html>"
        extracted = extract_body_text(markup)
        assert "alert" not in extracted
        assert "Content" in extracted

    def test_minimal_html(self):
        """A page with a single paragraph yields that paragraph's text."""
        assert "Short content." in extract_body_text(MINIMAL_HTML)
|
||||
|
||||
|
||||
class TestWebScrapeAdapterSourceType:
    """Checks the identifiers reported by a default ``WebScrapeAdapter``."""

    def test_source_type(self):
        """``source_type()`` is expected to return ``web_scrape``."""
        assert WebScrapeAdapter().source_type() == "web_scrape"

    def test_bucket_name(self):
        """``bucket_name()`` is expected to return ``stonks-raw-news``."""
        assert WebScrapeAdapter().bucket_name() == "stonks-raw-news"
|
||||
|
||||
|
||||
class TestWebScrapeAdapterErrorResult:
    """Checks the result object built by ``WebScrapeAdapter._error_result``."""

    def test_error_on_no_urls(self):
        """The error result is not ok and echoes the ticker and message."""
        outcome = WebScrapeAdapter()._error_result("AAPL", "No URLs configured", 0)

        assert not outcome.ok
        assert outcome.error == "No URLs configured"
        assert outcome.source_type == "web_scrape"
        assert outcome.ticker == "AAPL"
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_fetch_no_urls_configured():
    """Fetching with an empty config yields a failed result naming the cause."""
    scraper = WebScrapeAdapter()
    outcome = await scraper.fetch("AAPL", {})

    assert not outcome.ok
    # Check for None first so the substring check below fails with a clear
    # assertion rather than a TypeError.
    assert outcome.error is not None
    assert "No URLs configured" in outcome.error
|
||||
Reference in New Issue
Block a user