Files
stonks-oracle/tests/test_parser_worker.py
T

81 lines
3.1 KiB
Python

"""Tests for parser worker helper functions.
Validates that build_parser_output_json produces the expected structure
from a ParsedDocument and mention data.
Requirements: 4.1, 4.2, 4.3, 9.1
"""
from services.parser.html_parser import ParsedDocument, QualitySignals
from services.parser.worker import build_parser_output_json
class TestBuildParserOutputJson:
    """Exercise build_parser_output_json with full, partial and default documents."""

    def test_includes_all_metadata_fields(self):
        """A fully populated document has its metadata and mentions echoed in the output."""
        signals = QualitySignals(
            word_count_signal=0.8,
            diversity_signal=0.9,
            sentence_signal=1.0,
            paragraph_signal=0.5,
            body_found_signal=1.0,
            metadata_signal=1.0,
        )
        document = ParsedDocument(
            body_text="Apple reported strong earnings.",
            title="Apple Earnings",
            author="Jane Reporter",
            publisher="TechNews",
            published_at="2026-04-10T14:00:00Z",
            canonical_url="https://technews.example.com/apple",
            language="en",
            description="Apple Q2 results.",
            document_type="article",
            word_count=5,
            outbound_links=["https://other.com/analysis"],
            tags=["apple", "earnings"],
            quality_score=0.75,
            confidence="high",
            low_quality_flag=False,
            quality_warnings=[],
            quality_signals=signals,
        )
        company_mentions = [
            {
                "company_id": "1",
                "ticker": "AAPL",
                "mention_type": "ticker",
                "confidence": 0.9,
                "match_count": 2,
            },
        ]

        output = build_parser_output_json(document, company_mentions)

        # Fresh literals (not the objects handed to ParsedDocument) so the
        # comparison is against independent expected values.
        expected_by_key = {
            "title": "Apple Earnings",
            "author": "Jane Reporter",
            "publisher": "TechNews",
            "published_at": "2026-04-10T14:00:00Z",
            "canonical_url": "https://technews.example.com/apple",
            "language": "en",
            "description": "Apple Q2 results.",
            "document_type": "article",
            "word_count": 5,
            "outbound_links": ["https://other.com/analysis"],
            "tags": ["apple", "earnings"],
            "quality_score": 0.75,
            "confidence": "high",
        }
        for key, expected in expected_by_key.items():
            assert output[key] == expected

        # Identity check: the flag must be the bool False, not merely falsy.
        assert output["low_quality_flag"] is False
        assert output["quality_warnings"] == []
        assert output["mentioned_companies"] == company_mentions

    def test_quality_signals_serialized(self):
        """Signals are expected under "quality_signals" keyed without the "_signal" suffix."""
        partial_signals = QualitySignals(
            word_count_signal=0.3,
            diversity_signal=0.5,
        )
        document = ParsedDocument(quality_signals=partial_signals)

        serialized = build_parser_output_json(document, [])["quality_signals"]

        assert serialized["word_count"] == 0.3
        assert serialized["diversity"] == 0.5

    def test_empty_parsed_document(self):
        """A default-constructed document gives an empty title, no mentions and "low" confidence."""
        output = build_parser_output_json(ParsedDocument(), [])

        assert output["title"] == ""
        assert "body_text" not in output  # body text stored separately in MinIO
        assert output["mentioned_companies"] == []
        assert output["confidence"] == "low"