# stonks-oracle/tests/test_html_parser.py

"""Tests for the HTML-to-text parsing pipeline.

Validates body extraction, metadata extraction, boilerplate removal,
quality scoring, link extraction, document type inference, and company
mention detection.

Requirements: 4.1, 4.2, 4.3
"""
from services.parser.html_parser import (
    CompanyMention,
    ParsedDocument,
    QualitySignals,
    _block_score,
    _collapse_whitespace,
    _link_density,
    _remove_short_orphan_lines,
    _text_density,
    detect_company_mentions,
    extract_body_text,
    extract_metadata,
    extract_outbound_links,
    infer_document_type,
    parse_html,
    score_parse_quality,
    score_quality,
)

# Full news-article page used by most tests in this module.
# <head> carries <title>, Open Graph tags (og:title, og:site_name,
# og:description), author, keywords, article:published_time and a canonical
# link. Note that og:title ("Apple Q2 Earnings Beat") is shorter than <title>.
# <body> wraps the story in <article> (an <h1>, two paragraphs, one external
# link and one same-host link) and surrounds it with a <nav>, a <footer>, a
# "sidebar" div and a "newsletter" div.
RICH_HTML = """<!DOCTYPE html>
<html lang="en">
<head>
    <title>Apple Q2 Earnings Beat Expectations</title>
    <meta property="og:title" content="Apple Q2 Earnings Beat" />
    <meta property="og:site_name" content="TechNews" />
    <meta property="og:description" content="Apple reported strong Q2 results." />
    <meta name="author" content="Jane Reporter" />
    <meta name="keywords" content="apple, earnings, tech" />
    <meta property="article:published_time" content="2026-04-10T14:00:00Z" />
    <link rel="canonical" href="https://technews.example.com/apple-q2-earnings" />
</head>
<body>
    <nav>Navigation links here</nav>
    <article>
        <h1>Apple Q2 Earnings Beat Expectations</h1>
        <p>Apple Inc. reported quarterly revenue of $95 billion, exceeding analyst estimates.
        The company saw strong growth in its services division and iPhone sales across all
        major markets worldwide. Revenue from the App Store and iCloud subscriptions
        continued to climb, contributing significantly to the overall results.</p>
        <p>CEO Tim Cook highlighted the company's commitment to innovation and expanding
        its ecosystem. The services segment alone generated over $20 billion in revenue,
        marking a new quarterly record for the division.</p>
        <a href="https://other-site.com/analysis">External analysis</a>
        <a href="https://technews.example.com/related">Related article</a>
    </article>
    <footer>Copyright 2026 TechNews. All rights reserved. Privacy policy applies.</footer>
    <div class="sidebar">Sidebar content</div>
    <div class="newsletter">Subscribe to our newsletter for updates</div>
</body>
</html>"""

# Smallest useful page: a single one-word paragraph and no <head> metadata.
MINIMAL_HTML = "<html><body><p>Short.</p></body></html>"

# Page with no <article> tag: the story sits in a div with class
# "article-body", surrounded by a <nav>, an <aside class="sidebar">, an
# "advertisement" div and a copyright <footer>.
BOILERPLATE_HTML = """<html><body>
<nav>Menu items</nav>
<div class="article-body">
    <p>The actual article content is here with enough words to pass quality checks.
    This paragraph discusses important market developments and financial results
    that are relevant to investors and analysts tracking the technology sector.</p>
</div>
<aside class="sidebar">Related links</aside>
<div class="advertisement">Buy stuff</div>
<footer>Copyright © 2026. All rights reserved. Terms of service apply.</footer>
</body></html>"""


class TestExtractBodyText:
    """Tests for ``extract_body_text``.

    Checks that article text survives extraction while page chrome
    (nav, footer, sidebar, newsletter prompts, scripts, styles) does not.
    """

    def test_extracts_article_content(self):
        text = extract_body_text(RICH_HTML)
        assert "Apple Inc. reported quarterly revenue" in text
        assert "strong growth" in text

    def test_strips_nav_footer_sidebar(self):
        text = extract_body_text(RICH_HTML)
        assert "Navigation links here" not in text
        # The <footer> of RICH_HTML starts with this phrase; without this
        # check the "footer" part of the test name was never exercised.
        assert "Copyright 2026 TechNews" not in text
        assert "Sidebar content" not in text

    def test_strips_boilerplate_text(self):
        # The newsletter prompt only exists in RICH_HTML, so it has to be
        # checked against that fixture; asserting its absence from
        # BOILERPLATE_HTML can never fail.
        rich_text = extract_body_text(RICH_HTML)
        assert "Subscribe to our newsletter" not in rich_text

        # The copyright footer (with the © sign) lives in BOILERPLATE_HTML.
        text = extract_body_text(BOILERPLATE_HTML)
        assert "Copyright ©" not in text

    def test_finds_article_body_class(self):
        text = extract_body_text(BOILERPLATE_HTML)
        assert "actual article content" in text

    def test_minimal_html_returns_text(self):
        text = extract_body_text(MINIMAL_HTML)
        assert "Short." in text

    def test_strips_script_and_style(self):
        html = "<html><body><script>alert('x')</script><style>.x{color:red}</style><p>Real content here</p></body></html>"
        text = extract_body_text(html)
        assert "alert" not in text
        assert "color:red" not in text
        assert "Real content here" in text

    def test_empty_html(self):
        # An empty document must yield an empty string.
        text = extract_body_text("")
        assert text == ""


class TestExtractMetadata:
    """Tests for ``extract_metadata`` against the rich and minimal fixtures."""

    # URL passed alongside RICH_HTML in every rich-page test below.
    ARTICLE_URL = "https://technews.example.com/article"

    def _rich_meta(self):
        """Return the metadata mapping extracted from RICH_HTML."""
        return extract_metadata(RICH_HTML, self.ARTICLE_URL)

    def test_extracts_title(self):
        # The og:title value is expected, not the longer <title> text.
        assert self._rich_meta()["title"] == "Apple Q2 Earnings Beat"

    def test_extracts_author(self):
        assert self._rich_meta()["author"] == "Jane Reporter"

    def test_extracts_publisher(self):
        assert self._rich_meta()["publisher"] == "TechNews"

    def test_extracts_published_at(self):
        assert self._rich_meta()["published_at"] == "2026-04-10T14:00:00Z"

    def test_extracts_canonical_url(self):
        expected = "https://technews.example.com/apple-q2-earnings"
        assert self._rich_meta()["canonical_url"] == expected

    def test_extracts_language(self):
        assert self._rich_meta()["language"] == "en"

    def test_extracts_keywords(self):
        tags = self._rich_meta()["tags"]
        assert tags is not None
        assert "apple" in str(tags)

    def test_fallback_publisher_from_hostname(self):
        # MINIMAL_HTML has no og:site_name, so the URL host is expected.
        publisher = extract_metadata(MINIMAL_HTML, "https://example.com/page")["publisher"]
        assert publisher == "example.com"

    def test_no_url_publisher_empty(self):
        publisher = extract_metadata(MINIMAL_HTML, "")["publisher"]
        assert publisher == ""


class TestExtractOutboundLinks:
    """Tests for ``extract_outbound_links``: external-only, de-duplicated links."""

    def test_finds_external_links(self):
        found = extract_outbound_links(RICH_HTML, "https://technews.example.com/article")
        assert "https://other-site.com/analysis" in found

    def test_excludes_same_host_links(self):
        found = extract_outbound_links(RICH_HTML, "https://technews.example.com/article")
        # No returned link may mention the page's own host.
        assert not any("technews.example.com" in url for url in found)

    def test_deduplicates_links(self):
        page = '<html><body><a href="https://ext.com/a">1</a><a href="https://ext.com/a">2</a></body></html>'
        found = extract_outbound_links(page, "https://example.com")
        assert found.count("https://ext.com/a") == 1

    def test_ignores_fragment_and_javascript(self):
        page = '<html><body><a href="#top">top</a><a href="javascript:void(0)">js</a></body></html>'
        found = extract_outbound_links(page, "https://example.com")
        assert found == []


class TestScoreQuality:
    """Tests for ``score_quality``, which returns a ``(score, confidence)`` pair."""

    def test_very_short_text_low(self):
        quality, level = score_quality("hello world")
        assert quality < 0.5
        # Called with defaults only; per the original author's note, very
        # short text may land in "medium" when body_found defaults to True.
        assert level in ("low", "medium")

    def test_medium_text(self):
        # 100 distinct tokens closed by a single full stop.
        text = " ".join(f"word{i}" for i in range(100)) + "."
        score, level = score_quality(text)
        assert level in ("medium", "high")

    def test_long_diverse_text_high(self):
        # 300 distinct tokens arranged as 30 ten-word sentences.
        vocab = [f"word{i}" for i in range(300)]
        sentences = [" ".join(vocab[start:start + 10]) for start in range(0, 300, 10)]
        text = ". ".join(sentences) + "."
        quality, level = score_quality(text)
        assert level == "high"
        assert quality >= 0.65

    def test_empty_text_low(self):
        quality, level = score_quality("")
        assert level == "low"
        assert quality < 0.35


class TestScoreParseQuality:
    """Tests for ``score_parse_quality``.

    The function is expected to return ``(score, confidence, signals,
    warnings)``; every test unpacks exactly four values.
    """

    def test_returns_four_tuple(self):
        quality, level, sig, notes = score_parse_quality("hello world")
        assert isinstance(quality, float)
        assert level in ("low", "medium", "high")
        assert isinstance(sig, QualitySignals)
        assert isinstance(notes, list)

    def test_empty_text_is_low(self):
        quality, level, sig, notes = score_parse_quality("")
        assert level == "low"
        assert "very_short_text" in notes

    def test_short_text_warns(self):
        # Thirty repetitions of the same token.
        text = " ".join("word" for _ in range(30))
        _quality, _level, _sig, notes = score_parse_quality(text)
        assert "short_text" in notes

    def test_body_not_found_warns(self):
        text = " ".join(f"word{i}" for i in range(100)) + "."
        _quality, _level, sig, notes = score_parse_quality(text, body_found=False)
        assert "no_article_body_found" in notes
        assert sig.body_found_signal < 0.5

    def test_metadata_boosts_score(self):
        # 200 distinct tokens arranged as 20 ten-word sentences.
        sentences = [
            " ".join(f"word{i}" for i in range(start, start + 10))
            for start in range(0, 200, 10)
        ]
        text = ". ".join(sentences) + "."
        bare_score, _, _, _ = score_parse_quality(text)
        rich_score, _, _, _ = score_parse_quality(
            text,
            has_title=True,
            has_author=True,
            has_publisher=True,
            has_published_at=True,
        )
        assert rich_score > bare_score

    def test_signals_as_dict(self):
        _, _, sig, _ = score_parse_quality("hello world")
        exported = sig.as_dict()
        assert "word_count" in exported
        assert "diversity" in exported
        assert "body_found" in exported

    def test_well_structured_article_scores_high(self):
        # Five paragraphs of four sentences each, separated by blank lines.
        paragraphs = [
            ". ".join(f"Sentence {j} of paragraph {i} with diverse vocabulary" for j in range(4)) + "."
            for i in range(5)
        ]
        text = "\n\n".join(paragraphs)
        quality, level, sig, notes = score_parse_quality(
            text,
            body_found=True,
            has_title=True,
            has_author=True,
            has_publisher=True,
            has_published_at=True,
        )
        assert level == "high"
        assert quality >= 0.7
        assert sig.paragraph_signal == 1.0
        assert sig.body_found_signal == 1.0


class TestInferDocumentType:
    """Tests for ``infer_document_type`` driven by the URL alone (empty text)."""

    def test_filing_from_url(self):
        doc_type = infer_document_type("", "https://sec.gov/filing/10-k")
        assert doc_type == "filing"

    def test_transcript_from_url(self):
        doc_type = infer_document_type("", "https://example.com/earnings-call-transcript")
        assert doc_type == "transcript"

    def test_press_release_from_url(self):
        doc_type = infer_document_type("", "https://example.com/press-release/q2")
        assert doc_type == "press_release"

    def test_default_article(self):
        # A URL with none of the special markers falls back to "article".
        doc_type = infer_document_type("", "https://example.com/news/story")
        assert doc_type == "article"


class TestDetectCompanyMentions:
    """Tests for ``detect_company_mentions`` over ticker, legal-name and brand aliases."""

    @staticmethod
    def _alias(alias, alias_type, ticker, company_id="1"):
        """Build one alias record in the dict shape the detector is given."""
        return {
            "company_id": company_id,
            "alias": alias,
            "alias_type": alias_type,
            "ticker": ticker,
        }

    def test_detects_ticker(self):
        aliases = [self._alias("AAPL", "ticker", "AAPL")]
        found = detect_company_mentions("Shares of AAPL rose 5% today", aliases)
        assert len(found) == 1
        assert found[0]["ticker"] == "AAPL"
        assert found[0]["confidence"] == 0.9  # ticker confidence

    def test_detects_company_name(self):
        aliases = [self._alias("Apple Inc.", "legal_name", "AAPL")]
        found = detect_company_mentions("Apple Inc. reported strong earnings", aliases)
        assert len(found) == 1
        assert found[0]["confidence"] == 0.85  # legal_name confidence

    def test_no_false_positive_short_ticker(self):
        # The single-letter ticker "A" must not match the article "a".
        aliases = [self._alias("A", "ticker", "A")]
        found = detect_company_mentions("This is a sentence about nothing", aliases)
        assert len(found) == 0

    def test_short_ticker_case_sensitive(self):
        aliases = [self._alias("AI", "ticker", "AI")]
        # Upper-case "AI" as a standalone word is a hit.
        upper_hits = detect_company_mentions("The AI revolution is here", aliases)
        assert len(upper_hits) == 1
        # Lower-case "ai" must not be treated as the ticker.
        lower_hits = detect_company_mentions("the ai revolution is here", aliases)
        assert len(lower_hits) == 0

    def test_deduplicates_by_company(self):
        aliases = [
            self._alias("AAPL", "ticker", "AAPL"),
            self._alias("Apple Inc.", "legal_name", "AAPL"),
        ]
        found = detect_company_mentions("AAPL Apple Inc. reported earnings", aliases)
        assert len(found) == 1
        # The higher confidence wins (ticker=0.9 > legal_name=0.85).
        assert found[0]["confidence"] == 0.9

    def test_match_count_accumulated(self):
        aliases = [self._alias("AAPL", "ticker", "AAPL")]
        found = detect_company_mentions("AAPL beat estimates. AAPL shares rose.", aliases)
        assert len(found) == 1
        assert found[0]["match_count"] == 2

    def test_multiple_companies(self):
        aliases = [
            self._alias("AAPL", "ticker", "AAPL", company_id="1"),
            self._alias("MSFT", "ticker", "MSFT", company_id="2"),
        ]
        found = detect_company_mentions("AAPL and MSFT both reported earnings", aliases)
        assert len(found) == 2
        assert {hit["ticker"] for hit in found} == {"AAPL", "MSFT"}

    def test_brand_alias(self):
        aliases = [self._alias("iPhone", "brand", "AAPL")]
        found = detect_company_mentions("The new iPhone sales exceeded expectations", aliases)
        assert len(found) == 1
        assert found[0]["confidence"] == 0.6  # brand confidence

    def test_empty_text(self):
        aliases = [self._alias("AAPL", "ticker", "AAPL")]
        assert detect_company_mentions("", aliases) == []

    def test_empty_aliases(self):
        assert detect_company_mentions("Some text about stocks", []) == []

    def test_case_insensitive_name_match(self):
        # Legal names match regardless of letter case.
        aliases = [self._alias("Apple Inc.", "legal_name", "AAPL")]
        found = detect_company_mentions("APPLE INC. reported earnings", aliases)
        assert len(found) == 1


class TestParseHtml:
    """End-to-end tests for ``parse_html`` and the ``ParsedDocument`` it returns."""

    # URL passed alongside RICH_HTML in every rich-page test below.
    ARTICLE_URL = "https://technews.example.com/article"

    def _parse_rich(self, **kwargs):
        """Parse RICH_HTML at ARTICLE_URL, forwarding any keyword arguments."""
        return parse_html(RICH_HTML, self.ARTICLE_URL, **kwargs)

    def test_returns_parsed_document(self):
        assert isinstance(self._parse_rich(), ParsedDocument)

    def test_body_text_populated(self):
        doc = self._parse_rich()
        assert "Apple Inc." in doc.body_text
        assert doc.word_count > 0

    def test_metadata_populated(self):
        doc = self._parse_rich()
        assert doc.title == "Apple Q2 Earnings Beat"
        assert doc.author == "Jane Reporter"
        assert doc.publisher == "TechNews"

    def test_quality_scoring(self):
        doc = self._parse_rich()
        assert doc.quality_score > 0
        assert doc.confidence in ("low", "medium", "high")

    def test_quality_signals_populated(self):
        signals = self._parse_rich().quality_signals
        assert isinstance(signals, QualitySignals)
        assert signals.body_found_signal == 1.0
        assert signals.metadata_signal > 0

    def test_low_quality_flag_on_minimal(self):
        doc = parse_html(MINIMAL_HTML, "")
        assert doc.low_quality_flag is True
        assert doc.confidence == "low"

    def test_rich_html_not_low_quality(self):
        assert self._parse_rich().low_quality_flag is False

    def test_quality_warnings_list(self):
        doc = parse_html(MINIMAL_HTML, "")
        assert isinstance(doc.quality_warnings, list)

    def test_tags_extracted(self):
        assert "apple" in self._parse_rich().tags

    def test_document_type_inferred(self):
        assert self._parse_rich().document_type == "article"

    def test_outbound_links(self):
        links = self._parse_rich().outbound_links
        assert any("other-site.com" in url for url in links)

    def test_mentioned_companies_with_aliases(self):
        # Two aliases for the same company must collapse into one mention.
        aliases = [
            {"company_id": "1", "alias": "AAPL", "alias_type": "ticker", "ticker": "AAPL"},
            {"company_id": "1", "alias": "Apple Inc.", "alias_type": "legal_name", "ticker": "AAPL"},
        ]
        companies = self._parse_rich(aliases=aliases).mentioned_companies
        assert len(companies) == 1
        assert companies[0].ticker == "AAPL"
        assert isinstance(companies[0], CompanyMention)

    def test_no_mentions_without_aliases(self):
        assert self._parse_rich().mentioned_companies == []

# --- HTML fixtures for boilerplate reduction tests ---

# Page with no <article>/<nav>/<footer> tags at all: a "top-bar" div of three
# links, a "main-content" div holding three story paragraphs, and a
# "link-list" div made up of five bare links.
NO_SEMANTIC_HTML = """<html><body>
<div class="top-bar"><a href="/">Home</a> <a href="/about">About</a> <a href="/contact">Contact</a></div>
<div class="main-content">
    <p>The Federal Reserve announced a 25 basis point rate cut on Wednesday,
    surprising markets that had expected rates to remain unchanged. Bond yields
    fell sharply across the curve, with the 10-year Treasury dropping to 3.8 percent.
    Equity markets rallied on the news, with the S&P 500 gaining 1.2 percent by close.</p>
    <p>Analysts noted that the decision reflects growing concerns about slowing economic
    growth and weakening labor market data. Several Fed governors had signaled openness
    to easing in recent speeches, but the timing caught many off guard.</p>
    <p>Market participants are now pricing in additional cuts at the next two meetings,
    with futures indicating a 70 percent probability of another reduction in September.</p>
</div>
<div class="link-list"><a href="/1">Story 1</a><a href="/2">Story 2</a><a href="/3">Story 3</a><a href="/4">Story 4</a><a href="/5">Story 5</a></div>
</body></html>"""

# Two-paragraph <article> buried in page chrome: cookie banner, signup form,
# nav menu, social-share bar, related-posts box, ad container and a
# copyright <footer>.
HEAVY_BOILERPLATE_HTML = """<html><body>
<div class="cookie-banner">We use cookies. Accept all cookies.</div>
<div class="signup-form">Sign up for free alerts</div>
<nav class="menu">Home | Markets | Tech | Opinion</nav>
<article>
    <p>Tesla reported record deliveries in Q1 2026, shipping over 500,000 vehicles
    globally. The company attributed the strong performance to expanded production
    capacity at its Berlin and Austin gigafactories, as well as growing demand for
    the refreshed Model Y across European and Asian markets.</p>
    <p>Revenue for the quarter came in at $28 billion, beating consensus estimates
    by roughly 4 percent. Automotive gross margins improved to 19.5 percent,
    reversing a trend of compression seen throughout 2025.</p>
</article>
<div class="social-share">Share this article on Twitter Facebook LinkedIn</div>
<div class="related-posts">You may also like: Story A, Story B</div>
<div class="ad-container">Sponsored content here</div>
<footer>Copyright © 2026 FinanceDaily. All rights reserved. Terms of service. Privacy policy.</footer>
</body></html>"""

# <article> whose second and fourth paragraphs are the same "developing
# story" sentence, interleaved with two unique paragraphs.
REPEATED_BLOCKS_HTML = """<html><body>
<article>
    <p>Apple announced a new partnership with Samsung to develop next-generation
    display technology for future iPhone models. The collaboration is expected to
    yield OLED panels with improved brightness and energy efficiency.</p>
    <p>This is a developing story. Check back for updates as more information becomes available.</p>
    <p>Industry analysts view the partnership as a strategic move to secure supply
    chain advantages ahead of the 2027 product cycle. Display costs represent a
    significant portion of iPhone bill of materials.</p>
    <p>This is a developing story. Check back for updates as more information becomes available.</p>
</article>
</body></html>"""


class TestTextDensityScoring:
    """Tests for text-density-based block scoring heuristics."""

    @staticmethod
    def _first_div(html):
        """Parse *html* with BeautifulSoup's ``html.parser`` and return its first ``<div>``."""
        # Imported lazily, matching how this class has always pulled in bs4.
        from bs4 import BeautifulSoup

        return BeautifulSoup(html, "html.parser").find("div")

    def test_content_rich_div_has_high_density(self):
        # Imported here so the test does not depend on an import statement
        # placed after this class at the very end of the module.
        from services.parser.html_parser import _MIN_TEXT_DENSITY

        tag = self._first_div(
            "<div><p>This is a substantial paragraph with real content about markets.</p></div>"
        )
        assert _text_density(tag) > _MIN_TEXT_DENSITY

    def test_link_heavy_div_has_high_link_density(self):
        # All visible text sits inside anchors, apart from separating spaces.
        tag = self._first_div(
            '<div><a href="/a">Link one</a> <a href="/b">Link two</a> <a href="/c">Link three</a></div>'
        )
        assert _link_density(tag) > 0.8

    def test_article_div_has_low_link_density(self):
        # No anchors at all in this block.
        tag = self._first_div(
            "<div><p>A long paragraph of article text that discusses important financial results and market movements in detail.</p></div>"
        )
        assert _link_density(tag) < 0.1

    def test_block_score_prefers_content_over_nav(self):
        content_html = "<div>" + "<p>Substantial article paragraph with real content about markets and earnings.</p>" * 3 + "</div>"
        nav_html = '<div><a href="/a">Link</a><a href="/b">Link</a><a href="/c">Link</a><a href="/d">Link</a></div>'
        content_score = _block_score(self._first_div(content_html))
        nav_score = _block_score(self._first_div(nav_html))
        assert content_score > nav_score


class TestBoilerplateReduction:
    """Tests that each kind of chrome in HEAVY_BOILERPLATE_HTML is removed."""

    @staticmethod
    def _extracted():
        """Return the body text extracted from HEAVY_BOILERPLATE_HTML."""
        return extract_body_text(HEAVY_BOILERPLATE_HTML)

    def test_strips_cookie_banner(self):
        # Case-insensitive: no trace of the word "cookie" may remain.
        assert "cookie" not in self._extracted().lower()

    def test_strips_signup_form(self):
        assert "Sign up for free" not in self._extracted()

    def test_strips_social_share(self):
        assert "Share this article" not in self._extracted()

    def test_strips_ad_container(self):
        assert "Sponsored content" not in self._extracted()

    def test_strips_related_posts(self):
        assert "You may also like" not in self._extracted()

    def test_preserves_article_content(self):
        body = self._extracted()
        assert "Tesla reported record deliveries" in body
        assert "Revenue for the quarter" in body

    def test_strips_copyright_footer(self):
        assert "Copyright ©" not in self._extracted()


class TestBodyExtractionFallback:
    """Tests for body extraction on a page with no semantic container tags."""

    def test_finds_content_without_article_tag(self):
        body = extract_body_text(NO_SEMANTIC_HTML)
        assert "Federal Reserve announced" in body
        assert "25 basis point rate cut" in body

    def test_prefers_content_over_nav_links(self):
        body = extract_body_text(NO_SEMANTIC_HTML)
        link_list_present = "Story 1" in body
        article_present = "Federal Reserve" in body
        # Lenient by design: this fails only when the link list made it into
        # the output while the article text did not.
        assert not link_list_present or article_present


class TestRepeatedBlockDetection:
    """Tests that duplicated template paragraphs are collapsed during extraction."""

    def test_collapses_repeated_template_text(self):
        body = extract_body_text(REPEATED_BLOCKS_HTML)
        # The fixture contains the sentence twice; at most one copy may survive.
        occurrences = body.count("This is a developing story")
        assert occurrences <= 1

    def test_preserves_unique_content(self):
        body = extract_body_text(REPEATED_BLOCKS_HTML)
        assert "Apple announced a new partnership" in body
        assert "Industry analysts view" in body


class TestOrphanLineRemoval:
    """Tests for ``_remove_short_orphan_lines``."""

    def test_removes_short_fragments(self):
        raw = "OK\nThis is a real sentence about markets.\nHi"
        cleaned = _remove_short_orphan_lines(raw)
        assert "OK" not in cleaned
        assert "Hi" not in cleaned
        assert "real sentence" in cleaned

    def test_keeps_short_lines_with_punctuation(self):
        # A short line that ends in a full stop is expected to survive.
        raw = "Breaking news.\nDetails follow in the article."
        cleaned = _remove_short_orphan_lines(raw)
        assert "Breaking news." in cleaned


class TestCollapseWhitespace:
    """Tests for ``_collapse_whitespace``."""

    def test_collapses_multiple_blank_lines(self):
        collapsed = _collapse_whitespace("Line one.\n\n\n\nLine two.")
        # No run of three or more newlines may remain.
        assert "\n\n\n" not in collapsed
        assert "Line one." in collapsed
        assert "Line two." in collapsed

    def test_strips_leading_trailing(self):
        collapsed = _collapse_whitespace("\n\n  Hello world.  \n\n")
        assert collapsed == "Hello world."


# Import the constant for use in density tests
from services.parser.html_parser import _MIN_TEXT_DENSITY