"""Tests for parser worker helper functions. Validates build_parser_output_json produces the expected structure from ParsedDocument and mention data. Requirements: 4.1, 4.2, 4.3, 9.1 """ from services.parser.html_parser import ParsedDocument, QualitySignals from services.parser.worker import build_parser_output_json class TestBuildParserOutputJson: def test_includes_all_metadata_fields(self): parsed = ParsedDocument( body_text="Apple reported strong earnings.", title="Apple Earnings", author="Jane Reporter", publisher="TechNews", published_at="2026-04-10T14:00:00Z", canonical_url="https://technews.example.com/apple", language="en", description="Apple Q2 results.", document_type="article", word_count=5, outbound_links=["https://other.com/analysis"], tags=["apple", "earnings"], quality_score=0.75, confidence="high", low_quality_flag=False, quality_warnings=[], quality_signals=QualitySignals( word_count_signal=0.8, diversity_signal=0.9, sentence_signal=1.0, paragraph_signal=0.5, body_found_signal=1.0, metadata_signal=1.0, ), ) mentions = [ {"company_id": "1", "ticker": "AAPL", "mention_type": "ticker", "confidence": 0.9, "match_count": 2}, ] result = build_parser_output_json(parsed, mentions) assert result["title"] == "Apple Earnings" assert result["author"] == "Jane Reporter" assert result["publisher"] == "TechNews" assert result["published_at"] == "2026-04-10T14:00:00Z" assert result["canonical_url"] == "https://technews.example.com/apple" assert result["language"] == "en" assert result["description"] == "Apple Q2 results." assert result["document_type"] == "article" assert result["word_count"] == 5 assert result["outbound_links"] == ["https://other.com/analysis"] assert result["tags"] == ["apple", "earnings"] assert result["quality_score"] == 0.75 assert result["confidence"] == "high" assert result["low_quality_flag"] is False assert result["quality_warnings"] == [] assert result["mentioned_companies"] == mentions def test_quality_signals_serialized(self): parsed = ParsedDocument( quality_signals=QualitySignals( word_count_signal=0.3, diversity_signal=0.5, ), ) result = build_parser_output_json(parsed, []) signals = result["quality_signals"] assert signals["word_count"] == 0.3 assert signals["diversity"] == 0.5 def test_empty_parsed_document(self): parsed = ParsedDocument() result = build_parser_output_json(parsed, []) assert result["title"] == "" assert "body_text" not in result # body text stored separately in MinIO assert result["mentioned_companies"] == [] assert result["confidence"] == "low"