c85c0068a2
- Replace all datetime.utcnow() with datetime.now(tz=timezone.utc) across 8 files - Fix 12 failing tests to match current implementation behavior - Fix pytest_plugins in non-top-level conftest (moved to root conftest.py) - Auto-fix 189 lint issues (import sorting, unused imports) - Add CI/CD pipeline infrastructure (ARC, ArgoCD, Kargo manifests) - Add values-beta.yaml and values-paper.yaml for staged deployments - Update GitHub Actions workflow to use self-hosted-gremlin runners - Add integration-test job to CI pipeline Result: 1596 passed, 0 failed, 0 warnings
582 lines
25 KiB
Python
582 lines
25 KiB
Python
"""Tests for the HTML-to-text parsing pipeline.
|
|
|
|
Validates body extraction, metadata extraction, boilerplate removal,
|
|
quality scoring, link extraction, document type inference, and company
|
|
mention detection.
|
|
|
|
Requirements: 4.1, 4.2, 4.3
|
|
"""
|
|
from services.parser.html_parser import (
|
|
CompanyMention,
|
|
ParsedDocument,
|
|
QualitySignals,
|
|
_block_score,
|
|
_collapse_whitespace,
|
|
_link_density,
|
|
_remove_short_orphan_lines,
|
|
_text_density,
|
|
detect_company_mentions,
|
|
extract_body_text,
|
|
extract_metadata,
|
|
extract_outbound_links,
|
|
infer_document_type,
|
|
parse_html,
|
|
score_parse_quality,
|
|
score_quality,
|
|
)
|
|
|
|
# Full-featured article page: Open Graph / author / keyword / canonical
# metadata in <head>, an <article> with two paragraphs and two links (one
# external, one same-host), surrounded by nav, footer, sidebar and newsletter
# chrome that sits outside the <article> element.
RICH_HTML = """<!DOCTYPE html>
<html lang="en">
<head>
<title>Apple Q2 Earnings Beat Expectations</title>
<meta property="og:title" content="Apple Q2 Earnings Beat" />
<meta property="og:site_name" content="TechNews" />
<meta property="og:description" content="Apple reported strong Q2 results." />
<meta name="author" content="Jane Reporter" />
<meta name="keywords" content="apple, earnings, tech" />
<meta property="article:published_time" content="2026-04-10T14:00:00Z" />
<link rel="canonical" href="https://technews.example.com/apple-q2-earnings" />
</head>
<body>
<nav>Navigation links here</nav>
<article>
<h1>Apple Q2 Earnings Beat Expectations</h1>
<p>Apple Inc. reported quarterly revenue of $95 billion, exceeding analyst estimates.
The company saw strong growth in its services division and iPhone sales across all
major markets worldwide. Revenue from the App Store and iCloud subscriptions
continued to climb, contributing significantly to the overall results.</p>
<p>CEO Tim Cook highlighted the company's commitment to innovation and expanding
its ecosystem. The services segment alone generated over $20 billion in revenue,
marking a new quarterly record for the division.</p>
<a href="https://other-site.com/analysis">External analysis</a>
<a href="https://technews.example.com/related">Related article</a>
</article>
<footer>Copyright 2026 TechNews. All rights reserved. Privacy policy applies.</footer>
<div class="sidebar">Sidebar content</div>
<div class="newsletter">Subscribe to our newsletter for updates</div>
</body>
</html>"""
|
|
|
|
# Smallest useful page: one short paragraph, no <head>, no metadata, no links.
MINIMAL_HTML = "<html><body><p>Short.</p></body></html>"
|
|
|
|
# Page without an <article> tag: the real content lives in a
# ``div.article-body`` and is surrounded by nav, sidebar, advertisement and
# copyright-footer blocks.
BOILERPLATE_HTML = """<html><body>
<nav>Menu items</nav>
<div class="article-body">
<p>The actual article content is here with enough words to pass quality checks.
This paragraph discusses important market developments and financial results
that are relevant to investors and analysts tracking the technology sector.</p>
</div>
<aside class="sidebar">Related links</aside>
<div class="advertisement">Buy stuff</div>
<footer>Copyright © 2026. All rights reserved. Terms of service apply.</footer>
</body></html>"""
|
|
|
|
|
|
class TestExtractBodyText:
    """Checks that ``extract_body_text`` keeps article prose and drops page chrome."""

    def test_extracts_article_content(self):
        text = extract_body_text(RICH_HTML)
        assert "Apple Inc. reported quarterly revenue" in text
        assert "strong growth" in text

    def test_strips_nav_footer_sidebar(self):
        text = extract_body_text(RICH_HTML)
        assert "Navigation links here" not in text
        # The footer is part of this test's name, so assert on it as well.
        assert "All rights reserved" not in text
        assert "Sidebar content" not in text

    def test_strips_boilerplate_text(self):
        # The newsletter call-to-action exists only in RICH_HTML; checking it
        # against BOILERPLATE_HTML (which never contains the phrase) could
        # not fail, so it is checked against the fixture that has it.
        assert "Subscribe to our newsletter" not in extract_body_text(RICH_HTML)
        assert "Copyright ©" not in extract_body_text(BOILERPLATE_HTML)

    def test_finds_article_body_class(self):
        text = extract_body_text(BOILERPLATE_HTML)
        assert "actual article content" in text

    def test_minimal_html_returns_text(self):
        text = extract_body_text(MINIMAL_HTML)
        assert "Short." in text

    def test_strips_script_and_style(self):
        html = "<html><body><script>alert('x')</script><style>.x{color:red}</style><p>Real content here</p></body></html>"
        text = extract_body_text(html)
        assert "alert" not in text
        assert "color:red" not in text
        assert "Real content here" in text

    def test_empty_html(self):
        text = extract_body_text("")
        assert text == ""
|
|
|
|
|
|
class TestExtractMetadata:
    """Field-by-field checks of the mapping returned by ``extract_metadata``."""

    @staticmethod
    def _rich_metadata():
        """Return the metadata extracted from RICH_HTML at the article URL."""
        return extract_metadata(RICH_HTML, "https://technews.example.com/article")

    def test_extracts_title(self):
        # Expected value matches the og:title content in the fixture, which
        # is shorter than the <title> element's text.
        assert self._rich_metadata()["title"] == "Apple Q2 Earnings Beat"

    def test_extracts_author(self):
        assert self._rich_metadata()["author"] == "Jane Reporter"

    def test_extracts_publisher(self):
        assert self._rich_metadata()["publisher"] == "TechNews"

    def test_extracts_published_at(self):
        assert self._rich_metadata()["published_at"] == "2026-04-10T14:00:00Z"

    def test_extracts_canonical_url(self):
        expected = "https://technews.example.com/apple-q2-earnings"
        assert self._rich_metadata()["canonical_url"] == expected

    def test_extracts_language(self):
        assert self._rich_metadata()["language"] == "en"

    def test_extracts_keywords(self):
        tags = self._rich_metadata()["tags"]
        assert tags is not None
        assert "apple" in str(tags)

    def test_fallback_publisher_from_hostname(self):
        # MINIMAL_HTML carries no site-name metadata at all.
        metadata = extract_metadata(MINIMAL_HTML, "https://example.com/page")
        assert metadata["publisher"] == "example.com"

    def test_no_url_publisher_empty(self):
        metadata = extract_metadata(MINIMAL_HTML, "")
        assert metadata["publisher"] == ""
|
|
|
|
|
|
class TestExtractOutboundLinks:
    """Checks which hrefs ``extract_outbound_links`` reports for a page URL."""

    def test_finds_external_links(self):
        found = extract_outbound_links(RICH_HTML, "https://technews.example.com/article")
        assert "https://other-site.com/analysis" in found

    def test_excludes_same_host_links(self):
        found = extract_outbound_links(RICH_HTML, "https://technews.example.com/article")
        assert not any("technews.example.com" in href for href in found)

    def test_deduplicates_links(self):
        # Two anchors pointing at the same external URL.
        page = '<html><body><a href="https://ext.com/a">1</a><a href="https://ext.com/a">2</a></body></html>'
        found = extract_outbound_links(page, "https://example.com")
        assert found.count("https://ext.com/a") == 1

    def test_ignores_fragment_and_javascript(self):
        page = '<html><body><a href="#top">top</a><a href="javascript:void(0)">js</a></body></html>'
        found = extract_outbound_links(page, "https://example.com")
        assert found == []
|
|
|
|
|
|
class TestScoreQuality:
    """Exercises the ``(score, confidence)`` pair returned by ``score_quality``."""

    def test_very_short_text_low(self):
        quality, confidence = score_quality("hello world")
        assert quality < 0.5
        # With default body_found=True, very short text lands in medium
        assert confidence in ("low", "medium")

    def test_medium_text(self):
        # One 100-token "sentence" of distinct words, terminated by a period.
        text = " ".join(f"word{i}" for i in range(100)) + "."
        _quality, confidence = score_quality(text)
        # 100 diverse words with sentence structure scores well
        assert confidence in ("medium", "high")

    def test_long_diverse_text_high(self):
        # 300 distinct words grouped into thirty 10-word sentences.
        sentences = [
            " ".join(f"word{i}" for i in range(start, start + 10))
            for start in range(0, 300, 10)
        ]
        text = ". ".join(sentences) + "."
        quality, confidence = score_quality(text)
        assert confidence == "high"
        assert quality >= 0.65

    def test_empty_text_low(self):
        quality, confidence = score_quality("")
        assert confidence == "low"
        assert quality < 0.35
|
|
|
|
|
|
class TestScoreParseQuality:
    """Tests for the multi-signal quality scoring function."""

    def test_returns_four_tuple(self):
        result = score_parse_quality("hello world")
        quality, confidence, signals, warnings = result
        assert isinstance(quality, float)
        assert confidence in ("low", "medium", "high")
        assert isinstance(signals, QualitySignals)
        assert isinstance(warnings, list)

    def test_empty_text_is_low(self):
        _quality, confidence, _signals, warnings = score_parse_quality("")
        assert confidence == "low"
        assert "very_short_text" in warnings

    def test_short_text_warns(self):
        # Thirty repetitions of the same token.
        thirty_words = " ".join(["word"] * 30)
        warnings = score_parse_quality(thirty_words)[3]
        assert "short_text" in warnings

    def test_body_not_found_warns(self):
        text = " ".join(f"word{i}" for i in range(100)) + "."
        _quality, _confidence, signals, warnings = score_parse_quality(
            text, body_found=False
        )
        assert "no_article_body_found" in warnings
        assert signals.body_found_signal < 0.5

    def test_metadata_boosts_score(self):
        # 200 distinct words grouped into twenty 10-word sentences.
        sentences = [
            " ".join(f"word{i}" for i in range(j, j + 10))
            for j in range(0, 200, 10)
        ]
        text = ". ".join(sentences) + "."
        bare_score = score_parse_quality(text)[0]
        enriched_score = score_parse_quality(
            text,
            has_title=True,
            has_author=True,
            has_publisher=True,
            has_published_at=True,
        )[0]
        assert enriched_score > bare_score

    def test_signals_as_dict(self):
        signals = score_parse_quality("hello world")[2]
        as_mapping = signals.as_dict()
        for key in ("word_count", "diversity", "body_found"):
            assert key in as_mapping

    def test_well_structured_article_scores_high(self):
        # Five paragraphs of four sentences each, separated by blank lines.
        paragraphs = [
            ". ".join(
                f"Sentence {j} of paragraph {i} with diverse vocabulary"
                for j in range(4)
            )
            + "."
            for i in range(5)
        ]
        text = "\n\n".join(paragraphs)
        quality, confidence, signals, _warnings = score_parse_quality(
            text,
            body_found=True,
            has_title=True,
            has_author=True,
            has_publisher=True,
            has_published_at=True,
        )
        assert confidence == "high"
        assert quality >= 0.7
        assert signals.paragraph_signal == 1.0
        assert signals.body_found_signal == 1.0
|
|
|
|
|
|
class TestInferDocumentType:
    """Checks the type label ``infer_document_type`` derives from a URL alone.

    Every case passes an empty string as the first argument, so only the URL
    can drive the result.
    """

    def test_filing_from_url(self):
        doc_type = infer_document_type("", "https://sec.gov/filing/10-k")
        assert doc_type == "filing"

    def test_transcript_from_url(self):
        doc_type = infer_document_type("", "https://example.com/earnings-call-transcript")
        assert doc_type == "transcript"

    def test_press_release_from_url(self):
        doc_type = infer_document_type("", "https://example.com/press-release/q2")
        assert doc_type == "press_release"

    def test_default_article(self):
        doc_type = infer_document_type("", "https://example.com/news/story")
        assert doc_type == "article"
|
|
|
|
|
|
class TestDetectCompanyMentions:
    """Checks alias matching, confidence values and de-duplication in
    ``detect_company_mentions``."""

    @staticmethod
    def _alias(company_id, alias, alias_type, ticker):
        """Build one alias record in the dict shape the detector is given."""
        return {
            "company_id": company_id,
            "alias": alias,
            "alias_type": alias_type,
            "ticker": ticker,
        }

    def test_detects_ticker(self):
        aliases = [self._alias("1", "AAPL", "ticker", "AAPL")]
        found = detect_company_mentions("Shares of AAPL rose 5% today", aliases)
        assert len(found) == 1
        assert found[0]["ticker"] == "AAPL"
        assert found[0]["confidence"] == 0.9  # ticker confidence

    def test_detects_company_name(self):
        aliases = [self._alias("1", "Apple Inc.", "legal_name", "AAPL")]
        found = detect_company_mentions("Apple Inc. reported strong earnings", aliases)
        assert len(found) == 1
        assert found[0]["confidence"] == 0.85  # legal_name confidence

    def test_no_false_positive_short_ticker(self):
        # The single-letter ticker "A" must not match the article "a".
        aliases = [self._alias("1", "A", "ticker", "A")]
        found = detect_company_mentions("This is a sentence about nothing", aliases)
        assert len(found) == 0

    def test_short_ticker_case_sensitive(self):
        aliases = [self._alias("1", "AI", "ticker", "AI")]
        # "AI" as a word should match case-sensitively
        upper_hits = detect_company_mentions("The AI revolution is here", aliases)
        assert len(upper_hits) == 1
        # Lowercase "ai" should not match
        lower_hits = detect_company_mentions("the ai revolution is here", aliases)
        assert len(lower_hits) == 0

    def test_deduplicates_by_company(self):
        aliases = [
            self._alias("1", "AAPL", "ticker", "AAPL"),
            self._alias("1", "Apple Inc.", "legal_name", "AAPL"),
        ]
        found = detect_company_mentions("AAPL Apple Inc. reported earnings", aliases)
        assert len(found) == 1
        # Should keep the higher confidence (ticker=0.9 > legal_name=0.85)
        assert found[0]["confidence"] == 0.9

    def test_match_count_accumulated(self):
        aliases = [self._alias("1", "AAPL", "ticker", "AAPL")]
        found = detect_company_mentions("AAPL beat estimates. AAPL shares rose.", aliases)
        assert len(found) == 1
        assert found[0]["match_count"] == 2

    def test_multiple_companies(self):
        aliases = [
            self._alias("1", "AAPL", "ticker", "AAPL"),
            self._alias("2", "MSFT", "ticker", "MSFT"),
        ]
        found = detect_company_mentions("AAPL and MSFT both reported earnings", aliases)
        assert len(found) == 2
        assert {entry["ticker"] for entry in found} == {"AAPL", "MSFT"}

    def test_brand_alias(self):
        aliases = [self._alias("1", "iPhone", "brand", "AAPL")]
        found = detect_company_mentions("The new iPhone sales exceeded expectations", aliases)
        assert len(found) == 1
        assert found[0]["confidence"] == 0.6  # brand confidence

    def test_empty_text(self):
        aliases = [self._alias("1", "AAPL", "ticker", "AAPL")]
        assert detect_company_mentions("", aliases) == []

    def test_empty_aliases(self):
        assert detect_company_mentions("Some text about stocks", []) == []

    def test_case_insensitive_name_match(self):
        aliases = [self._alias("1", "Apple Inc.", "legal_name", "AAPL")]
        found = detect_company_mentions("APPLE INC. reported earnings", aliases)
        assert len(found) == 1
|
|
|
|
|
|
class TestParseHtml:
    """End-to-end checks of the ``ParsedDocument`` produced by ``parse_html``."""

    @staticmethod
    def _parse_rich(**kwargs):
        """Parse RICH_HTML at the article URL, forwarding any keyword arguments."""
        return parse_html(RICH_HTML, "https://technews.example.com/article", **kwargs)

    def test_returns_parsed_document(self):
        assert isinstance(self._parse_rich(), ParsedDocument)

    def test_body_text_populated(self):
        doc = self._parse_rich()
        assert "Apple Inc." in doc.body_text
        assert doc.word_count > 0

    def test_metadata_populated(self):
        doc = self._parse_rich()
        assert doc.title == "Apple Q2 Earnings Beat"
        assert doc.author == "Jane Reporter"
        assert doc.publisher == "TechNews"

    def test_quality_scoring(self):
        doc = self._parse_rich()
        assert doc.quality_score > 0
        assert doc.confidence in ("low", "medium", "high")

    def test_quality_signals_populated(self):
        signals = self._parse_rich().quality_signals
        assert isinstance(signals, QualitySignals)
        assert signals.body_found_signal == 1.0
        assert signals.metadata_signal > 0

    def test_low_quality_flag_on_minimal(self):
        doc = parse_html(MINIMAL_HTML, "")
        assert doc.low_quality_flag is True
        assert doc.confidence == "low"

    def test_rich_html_not_low_quality(self):
        assert self._parse_rich().low_quality_flag is False

    def test_quality_warnings_list(self):
        doc = parse_html(MINIMAL_HTML, "")
        assert isinstance(doc.quality_warnings, list)

    def test_tags_extracted(self):
        assert "apple" in self._parse_rich().tags

    def test_document_type_inferred(self):
        assert self._parse_rich().document_type == "article"

    def test_outbound_links(self):
        links = self._parse_rich().outbound_links
        assert any("other-site.com" in href for href in links)

    def test_mentioned_companies_with_aliases(self):
        # Two aliases for the same company_id should yield a single mention.
        aliases = [
            {"company_id": "1", "alias": "AAPL", "alias_type": "ticker", "ticker": "AAPL"},
            {"company_id": "1", "alias": "Apple Inc.", "alias_type": "legal_name", "ticker": "AAPL"},
        ]
        companies = self._parse_rich(aliases=aliases).mentioned_companies
        assert len(companies) == 1
        assert companies[0].ticker == "AAPL"
        assert isinstance(companies[0], CompanyMention)

    def test_no_mentions_without_aliases(self):
        assert self._parse_rich().mentioned_companies == []
|
|
|
|
# --- HTML fixtures for boilerplate reduction tests ---
|
|
|
|
# Page with no <article>/<nav>/<footer> tags at all: a link-only top bar, a
# ``div.main-content`` holding three prose paragraphs, and a link-only list.
NO_SEMANTIC_HTML = """<html><body>
<div class="top-bar"><a href="/">Home</a> <a href="/about">About</a> <a href="/contact">Contact</a></div>
<div class="main-content">
<p>The Federal Reserve announced a 25 basis point rate cut on Wednesday,
surprising markets that had expected rates to remain unchanged. Bond yields
fell sharply across the curve, with the 10-year Treasury dropping to 3.8 percent.
Equity markets rallied on the news, with the S&P 500 gaining 1.2 percent by close.</p>
<p>Analysts noted that the decision reflects growing concerns about slowing economic
growth and weakening labor market data. Several Fed governors had signaled openness
to easing in recent speeches, but the timing caught many off guard.</p>
<p>Market participants are now pricing in additional cuts at the next two meetings,
with futures indicating a 70 percent probability of another reduction in September.</p>
</div>
<div class="link-list"><a href="/1">Story 1</a><a href="/2">Story 2</a><a href="/3">Story 3</a><a href="/4">Story 4</a><a href="/5">Story 5</a></div>
</body></html>"""
|
|
|
|
# Two-paragraph <article> buried in boilerplate: cookie banner, signup form,
# menu nav, social-share bar, related posts, ad container and copyright footer.
HEAVY_BOILERPLATE_HTML = """<html><body>
<div class="cookie-banner">We use cookies. Accept all cookies.</div>
<div class="signup-form">Sign up for free alerts</div>
<nav class="menu">Home | Markets | Tech | Opinion</nav>
<article>
<p>Tesla reported record deliveries in Q1 2026, shipping over 500,000 vehicles
globally. The company attributed the strong performance to expanded production
capacity at its Berlin and Austin gigafactories, as well as growing demand for
the refreshed Model Y across European and Asian markets.</p>
<p>Revenue for the quarter came in at $28 billion, beating consensus estimates
by roughly 4 percent. Automotive gross margins improved to 19.5 percent,
reversing a trend of compression seen throughout 2025.</p>
</article>
<div class="social-share">Share this article on Twitter Facebook LinkedIn</div>
<div class="related-posts">You may also like: Story A, Story B</div>
<div class="ad-container">Sponsored content here</div>
<footer>Copyright © 2026 FinanceDaily. All rights reserved. Terms of service. Privacy policy.</footer>
</body></html>"""
|
|
|
|
# Article in which the same "developing story" paragraph appears twice,
# interleaved with two unique content paragraphs.
REPEATED_BLOCKS_HTML = """<html><body>
<article>
<p>Apple announced a new partnership with Samsung to develop next-generation
display technology for future iPhone models. The collaboration is expected to
yield OLED panels with improved brightness and energy efficiency.</p>
<p>This is a developing story. Check back for updates as more information becomes available.</p>
<p>Industry analysts view the partnership as a strategic move to secure supply
chain advantages ahead of the 2027 product cycle. Display costs represent a
significant portion of iPhone bill of materials.</p>
<p>This is a developing story. Check back for updates as more information becomes available.</p>
</article>
</body></html>"""
|
|
|
|
|
|
class TestTextDensityScoring:
    """Tests for text-density-based block scoring heuristics."""

    @staticmethod
    def _first_div(html):
        """Parse *html* with ``html.parser`` and return its first ``<div>`` tag."""
        # bs4 is imported lazily, as each test previously did on its own.
        from bs4 import BeautifulSoup

        return BeautifulSoup(html, "html.parser").find("div")

    def test_content_rich_div_has_high_density(self):
        # Local import keeps this test self-contained instead of depending on
        # a module-level name that is bound only after the class definitions.
        from services.parser.html_parser import _MIN_TEXT_DENSITY

        tag = self._first_div(
            "<div><p>This is a substantial paragraph with real content about markets.</p></div>"
        )
        assert _text_density(tag) > _MIN_TEXT_DENSITY

    def test_link_heavy_div_has_high_link_density(self):
        # Every visible word in this div sits inside an anchor.
        tag = self._first_div(
            '<div><a href="/a">Link one</a> <a href="/b">Link two</a> <a href="/c">Link three</a></div>'
        )
        assert _link_density(tag) > 0.8

    def test_article_div_has_low_link_density(self):
        # No anchors at all in this div.
        tag = self._first_div(
            "<div><p>A long paragraph of article text that discusses important financial results and market movements in detail.</p></div>"
        )
        assert _link_density(tag) < 0.1

    def test_block_score_prefers_content_over_nav(self):
        content_html = (
            "<div>"
            + "<p>Substantial article paragraph with real content about markets and earnings.</p>" * 3
            + "</div>"
        )
        nav_html = '<div><a href="/a">Link</a><a href="/b">Link</a><a href="/c">Link</a><a href="/d">Link</a></div>'
        content_score = _block_score(self._first_div(content_html))
        nav_score = _block_score(self._first_div(nav_html))
        assert content_score > nav_score
|
|
|
|
|
|
class TestBoilerplateReduction:
    """Tests for enhanced boilerplate reduction pipeline."""

    @staticmethod
    def _extracted():
        """Return the body text extracted from HEAVY_BOILERPLATE_HTML."""
        return extract_body_text(HEAVY_BOILERPLATE_HTML)

    def test_strips_cookie_banner(self):
        # Case-insensitive: the banner contains both "cookies" and "Accept all cookies".
        assert "cookie" not in self._extracted().lower()

    def test_strips_signup_form(self):
        assert "Sign up for free" not in self._extracted()

    def test_strips_social_share(self):
        assert "Share this article" not in self._extracted()

    def test_strips_ad_container(self):
        assert "Sponsored content" not in self._extracted()

    def test_strips_related_posts(self):
        assert "You may also like" not in self._extracted()

    def test_preserves_article_content(self):
        body = self._extracted()
        assert "Tesla reported record deliveries" in body
        assert "Revenue for the quarter" in body

    def test_strips_copyright_footer(self):
        assert "Copyright ©" not in self._extracted()
|
|
|
|
|
|
class TestBodyExtractionFallback:
    """Tests for text-density fallback when no semantic selector matches."""

    def test_finds_content_without_article_tag(self):
        body = extract_body_text(NO_SEMANTIC_HTML)
        for phrase in ("Federal Reserve announced", "25 basis point rate cut"):
            assert phrase in body

    def test_prefers_content_over_nav_links(self):
        body = extract_body_text(NO_SEMANTIC_HTML)
        # The nav-like link list should not dominate the output
        # NOTE(review): this fails only when link text is present AND the
        # article text is missing; link text alongside article text passes.
        links_without_article = "Story 1" in body and "Federal Reserve" not in body
        assert not links_without_article
|
|
|
|
|
|
class TestRepeatedBlockDetection:
    """Tests for repeated/template text detection."""

    def test_collapses_repeated_template_text(self):
        # The fixture contains the "developing story" paragraph twice.
        body = extract_body_text(REPEATED_BLOCKS_HTML)
        assert body.count("This is a developing story") <= 1

    def test_preserves_unique_content(self):
        body = extract_body_text(REPEATED_BLOCKS_HTML)
        for phrase in ("Apple announced a new partnership", "Industry analysts view"):
            assert phrase in body
|
|
|
|
|
|
class TestOrphanLineRemoval:
    """Tests for short orphan line removal."""

    def test_removes_short_fragments(self):
        # Two-character fragments surround one full sentence.
        raw = "OK\nThis is a real sentence about markets.\nHi"
        cleaned = _remove_short_orphan_lines(raw)
        assert "OK" not in cleaned
        assert "Hi" not in cleaned
        assert "real sentence" in cleaned

    def test_keeps_short_lines_with_punctuation(self):
        raw = "Breaking news.\nDetails follow in the article."
        cleaned = _remove_short_orphan_lines(raw)
        assert "Breaking news." in cleaned
|
|
|
|
|
|
class TestCollapseWhitespace:
    """Tests for whitespace collapsing."""

    def test_collapses_multiple_blank_lines(self):
        # Four consecutive newlines between the two lines of input.
        collapsed = _collapse_whitespace("Line one.\n\n\n\nLine two.")
        assert "\n\n\n" not in collapsed
        for line in ("Line one.", "Line two."):
            assert line in collapsed

    def test_strips_leading_trailing(self):
        collapsed = _collapse_whitespace("\n\n Hello world. \n\n")
        assert collapsed == "Hello world."
|
|
|
|
|
|
# Import the constant for use in density tests
|
|
from services.parser.html_parser import _MIN_TEXT_DENSITY
|