81 lines
3.1 KiB
Python
81 lines
3.1 KiB
Python
"""Tests for parser worker helper functions.

Validates that build_parser_output_json produces the expected structure
from ParsedDocument and mention data.

Requirements: 4.1, 4.2, 4.3, 9.1
"""
|
|
from services.parser.html_parser import ParsedDocument, QualitySignals
|
|
from services.parser.worker import build_parser_output_json
|
|
|
|
|
|
class TestBuildParserOutputJson:
    """Checks the dictionary returned by ``build_parser_output_json``."""

    def test_includes_all_metadata_fields(self):
        """Metadata set on the document and the mention list appear in the output."""
        signals = QualitySignals(
            word_count_signal=0.8,
            diversity_signal=0.9,
            sentence_signal=1.0,
            paragraph_signal=0.5,
            body_found_signal=1.0,
            metadata_signal=1.0,
        )
        document = ParsedDocument(
            body_text="Apple reported strong earnings.",
            title="Apple Earnings",
            author="Jane Reporter",
            publisher="TechNews",
            published_at="2026-04-10T14:00:00Z",
            canonical_url="https://technews.example.com/apple",
            language="en",
            description="Apple Q2 results.",
            document_type="article",
            word_count=5,
            outbound_links=["https://other.com/analysis"],
            tags=["apple", "earnings"],
            quality_score=0.75,
            confidence="high",
            low_quality_flag=False,
            quality_warnings=[],
            quality_signals=signals,
        )
        company_mentions = [
            {
                "company_id": "1",
                "ticker": "AAPL",
                "mention_type": "ticker",
                "confidence": 0.9,
                "match_count": 2,
            },
        ]

        output = build_parser_output_json(document, company_mentions)

        # Fresh literals (not the objects handed to ParsedDocument), checked
        # in insertion order with plain equality.
        expected_by_key = {
            "title": "Apple Earnings",
            "author": "Jane Reporter",
            "publisher": "TechNews",
            "published_at": "2026-04-10T14:00:00Z",
            "canonical_url": "https://technews.example.com/apple",
            "language": "en",
            "description": "Apple Q2 results.",
            "document_type": "article",
            "word_count": 5,
            "outbound_links": ["https://other.com/analysis"],
            "tags": ["apple", "earnings"],
            "quality_score": 0.75,
            "confidence": "high",
        }
        for key, expected in expected_by_key.items():
            assert output[key] == expected

        # The flag is checked by identity, so it stays out of the table above.
        assert output["low_quality_flag"] is False
        assert output["quality_warnings"] == []
        assert output["mentioned_companies"] == company_mentions

    def test_quality_signals_serialized(self):
        """Quality signals are exposed under ``quality_signals`` with short key names."""
        partial_signals = QualitySignals(
            word_count_signal=0.3,
            diversity_signal=0.5,
        )
        document = ParsedDocument(quality_signals=partial_signals)

        output = build_parser_output_json(document, [])

        serialized = output["quality_signals"]
        assert serialized["word_count"] == 0.3
        assert serialized["diversity"] == 0.5

    def test_empty_parsed_document(self):
        """A default-constructed document with no mentions yields the asserted defaults."""
        output = build_parser_output_json(ParsedDocument(), [])

        assert output["title"] == ""
        # body text stored separately in MinIO
        assert "body_text" not in output
        assert output["mentioned_companies"] == []
        assert output["confidence"] == "low"
|