phase 14-15: docker build validation and helm deployment
This commit is contained in:
@@ -0,0 +1,858 @@
|
||||
"""HTML-to-text parsing pipeline using BeautifulSoup.
|
||||
|
||||
Provides structured HTML parsing with boilerplate removal, metadata extraction,
|
||||
outbound link extraction, and quality scoring. Inspired by Noctipede crawler
|
||||
patterns: BeautifulSoup + content hashing, boilerplate stripping, quality scoring.
|
||||
|
||||
Requirements: 4.1, 4.2, 4.3
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import logging
|
||||
import math
|
||||
import re
|
||||
from dataclasses import dataclass, field
|
||||
from urllib.parse import urlparse
|
||||
|
||||
from bs4 import BeautifulSoup, Tag
|
||||
|
||||
logger = logging.getLogger("html_parser")
|
||||
|
||||
# Element names whose whole subtree is discarded before text extraction:
# scripts, styling, site chrome and interactive widgets.
STRIP_TAGS = [
    "script",
    "style",
    "nav",
    "footer",
    "header",
    "aside",
    "iframe",
    "noscript",
    "svg",
    "form",
    "button",
]

# Fragments searched for (as plain substrings) in an element's lower-cased
# class/id text to recognise boilerplate containers.
BOILERPLATE_SIGNALS = [
    "sidebar",
    "widget",
    "advert",
    "promo",
    "newsletter",
    "social-share",
    "share-bar",
    "related-posts",
    "comment",
    "cookie",
    "popup",
    "modal",
    "banner",
    "breadcrumb",
    "pagination",
    "nav-",
    "menu",
    "toolbar",
    "signup",
    "subscribe",
    "follow-us",
    "social-media",
    "share-button",
    "ad-slot",
    "ad-container",
    "sponsored",
]
|
||||
|
||||
# Regex patterns for residual boilerplate in extracted text.
#
# The text these run against is multi-line (one text node per line), so every
# pattern that anchors on a line start/end carries the MULTILINE flag; without
# it ``^`` / ``$`` only match at the very start / end of the whole document.
# Line-anchored patterns use ``[ \t]*`` rather than ``\s*`` next to the anchor
# so they never swallow neighbouring blank lines.
BOILERPLATE_TEXT_PATTERNS = [
    re.compile(r"(?i)subscribe to our newsletter.*?(?:\n|$)"),
    re.compile(r"(?i)click here to read more.*?(?:\n|$)"),
    re.compile(r"(?i)advertisement\s*\n?"),
    re.compile(r"(?i)copyright ©.*?(?:\n|$)"),
    re.compile(r"(?i)all rights reserved.*?(?:\n|$)"),
    re.compile(r"(?i)terms of (use|service).*?(?:\n|$)"),
    re.compile(r"(?i)privacy policy.*?(?:\n|$)"),
    # Bracketed ad markers such as "[ad]", "[Ads by X]" or "[advertisement]".
    # The match is confined to a single bracket pair and requires "ad" as a
    # word, so "[1] ... traded [2]" or "[read more]" are left alone.
    re.compile(
        r"\s*\[[^\[\]\n]*\bad(?:s|vert[a-z]*)?\b[^\[\]\n]*\]\s*",
        re.IGNORECASE,
    ),
    re.compile(r"(?i)sign up for .*?(?:\n|$)"),
    re.compile(r"(?i)follow us on .*?(?:\n|$)"),
    re.compile(r"(?i)share this (article|story|post).*?(?:\n|$)"),
    re.compile(r"(?im)read more:?[ \t]*$"),
    re.compile(r"(?i)recommended for you.*?(?:\n|$)"),
    re.compile(r"(?i)you may also like.*?(?:\n|$)"),
    re.compile(r"(?i)trending now.*?(?:\n|$)"),
    re.compile(r"(?i)most (popular|read).*?(?:\n|$)"),
    re.compile(r"(?im)^tags:[ \t]*$"),
    re.compile(r"(?im)^[ \t]*photo\s*:.*?(?:\n|$)"),
    re.compile(r"(?im)^[ \t]*image\s*(credit|source|courtesy)\s*:.*?(?:\n|$)"),
]
|
||||
|
||||
# Selectors for article body candidates, in priority order
|
||||
ARTICLE_SELECTORS = [
|
||||
"article",
|
||||
"[role='main']",
|
||||
".article-body",
|
||||
".post-content",
|
||||
".entry-content",
|
||||
".story-body",
|
||||
".article-content",
|
||||
"#article-body",
|
||||
"#story-body",
|
||||
".article-text",
|
||||
".post-body",
|
||||
".content-body",
|
||||
"main",
|
||||
]
|
||||
|
||||
# Minimum text density (text chars / total chars including markup) for a block
|
||||
# to be considered content-rich rather than boilerplate
|
||||
_MIN_TEXT_DENSITY = 0.25
|
||||
|
||||
# Minimum word count for a block to be a viable body candidate
|
||||
_MIN_BLOCK_WORDS = 20
|
||||
|
||||
|
||||
@dataclass
class QualitySignals:
    """Per-dimension scores that feed the overall parse-quality score.

    Every field is a float in [0, 1]; higher means the parsed content did
    better on that dimension.

    Requirements: 4.3
    """

    word_count_signal: float = 0.0
    diversity_signal: float = 0.0
    sentence_signal: float = 0.0
    paragraph_signal: float = 0.0
    body_found_signal: float = 0.0
    metadata_signal: float = 0.0

    def as_dict(self) -> dict[str, float]:
        """Return the signals keyed by their short name (no ``_signal`` suffix)."""
        short_names = (
            "word_count",
            "diversity",
            "sentence",
            "paragraph",
            "body_found",
            "metadata",
        )
        return {name: getattr(self, f"{name}_signal") for name in short_names}
|
||||
|
||||
|
||||
@dataclass
class CompanyMention:
    """One company found in parsed text, with how and how often it matched.

    Requirements: 1.3, 4.1
    """

    company_id: str
    ticker: str
    # One of: ticker, legal_name, alias, brand
    mention_type: str
    confidence: float
    match_count: int = 1
|
||||
|
||||
|
||||
@dataclass
class ParsedDocument:
    """Everything the HTML-to-text pipeline produces for one document."""

    # Cleaned text and head metadata
    body_text: str = ""
    title: str = ""
    author: str = ""
    publisher: str = ""
    published_at: str | None = None
    canonical_url: str | None = None
    language: str = "en"
    description: str = ""
    document_type: str = "article"

    # Collections (each instance gets its own list)
    outbound_links: list[str] = field(default_factory=list)
    tags: list[str] = field(default_factory=list)
    mentioned_companies: list[CompanyMention] = field(default_factory=list)

    # Quality assessment
    quality_score: float = 0.0
    confidence: str = "low"
    word_count: int = 0
    quality_signals: QualitySignals = field(default_factory=QualitySignals)
    low_quality_flag: bool = False
    quality_warnings: list[str] = field(default_factory=list)
|
||||
|
||||
|
||||
def _attr_str(tag: Tag, attr: str) -> str:
|
||||
"""Safely get a tag attribute as a joined string."""
|
||||
val = tag.get(attr, "")
|
||||
if isinstance(val, list):
|
||||
return " ".join(val)
|
||||
return str(val) if val else ""
|
||||
|
||||
|
||||
def _is_boilerplate_container(tag: Tag) -> bool:
    """Return True when the tag's class or id contains a boilerplate signal."""
    haystack = " ".join(
        (_attr_str(tag, "class").lower(), _attr_str(tag, "id").lower())
    )
    for signal in BOILERPLATE_SIGNALS:
        if signal in haystack:
            return True
    return False
|
||||
|
||||
|
||||
def _strip_boilerplate_tags(soup: BeautifulSoup) -> None:
    """Remove non-content tags and boilerplate containers from *soup* in place.

    Two passes:

    1. every element named in ``STRIP_TAGS`` is decomposed;
    2. every remaining element whose class/id matches a boilerplate signal
       is decomposed.

    ``find_all`` returns a snapshot, so it can still hand back elements that
    were already destroyed together with an ancestor. Those are skipped:
    reading attributes of a decomposed tag fails. The document roots
    (``<html>``, ``<body>``) are never removed even when their class list
    happens to contain a signal (e.g. ``no-sidebar``), since that would
    discard the entire page.
    """
    for tag in soup.find_all(STRIP_TAGS):
        if not getattr(tag, "decomposed", False):
            tag.decompose()

    protected = ("html", "body")
    for tag in soup.find_all(True):
        # Already removed as part of an enclosing boilerplate container.
        if getattr(tag, "decomposed", False):
            continue
        if tag.name in protected:
            continue
        if _is_boilerplate_container(tag):
            tag.decompose()
|
||||
|
||||
|
||||
def _reduce_boilerplate_text(text: str) -> str:
    """Delete every BOILERPLATE_TEXT_PATTERNS match from *text*, then trim it.

    Patterns are applied one after another, each on the previous result.
    """
    cleaned = text
    for rx in BOILERPLATE_TEXT_PATTERNS:
        cleaned = rx.sub("", cleaned)
    return cleaned.strip()
|
||||
|
||||
|
||||
def _text_density(tag: Tag) -> float:
|
||||
"""Compute text density for a tag: ratio of text length to total markup length.
|
||||
|
||||
Higher density means more actual text relative to HTML structure,
|
||||
which is a strong signal for content blocks vs boilerplate.
|
||||
|
||||
Requirements: 4.2
|
||||
"""
|
||||
markup_len = len(str(tag))
|
||||
if markup_len == 0:
|
||||
return 0.0
|
||||
text_len = len(tag.get_text(strip=True))
|
||||
return text_len / markup_len
|
||||
|
||||
|
||||
def _link_density(tag: Tag) -> float:
|
||||
"""Compute link density: ratio of text inside <a> tags to total text.
|
||||
|
||||
High link density signals navigation/boilerplate blocks (menus, sidebars).
|
||||
Low link density signals content paragraphs.
|
||||
|
||||
Requirements: 4.2
|
||||
"""
|
||||
total_text = len(tag.get_text(strip=True))
|
||||
if total_text == 0:
|
||||
return 1.0
|
||||
link_text = sum(len(a.get_text(strip=True)) for a in tag.find_all("a"))
|
||||
return link_text / total_text
|
||||
|
||||
|
||||
def _block_score(tag: Tag) -> float:
    """Score *tag* as an article-body candidate; higher means more likely.

    Blocks with fewer than ``_MIN_BLOCK_WORDS`` words score 0.0. Otherwise
    the score is the sum of:

    * text density scaled by (1 - link density),
    * 0.1 per ``<p>`` descendant (capped at 10) when there are at least two,
    * 0.05 * ln(word count).

    Requirements: 4.2
    """
    n_words = len(tag.get_text(strip=True).split())
    if n_words < _MIN_BLOCK_WORDS:
        return 0.0

    density_term = _text_density(tag) * (1.0 - _link_density(tag))

    n_paragraphs = len(tag.find_all("p"))
    paragraph_bonus = 0.1 * min(n_paragraphs, 10) if n_paragraphs >= 2 else 0.0

    # Log scaling keeps very long blocks from dominating the score.
    length_bonus = 0.05 * math.log(max(n_words, 1))

    return density_term + paragraph_bonus + length_bonus
|
||||
|
||||
|
||||
def _find_article_body(soup: BeautifulSoup) -> Tag | None:
    """Return the element most likely to hold the article body, or None.

    The selectors in ``ARTICLE_SELECTORS`` are tried in order; the first hit
    containing at least ``_MIN_BLOCK_WORDS`` words wins. If none qualifies,
    every ``div`` / ``section`` / ``td`` is scored with ``_block_score`` and
    the highest positive scorer is returned (the earliest one on ties).

    Requirements: 4.2
    """
    # Stage 1: semantic selectors
    for css in ARTICLE_SELECTORS:
        hit = soup.select_one(css)
        if not hit:
            continue
        if len(hit.get_text(strip=True).split()) >= _MIN_BLOCK_WORDS:
            return hit

    # Stage 2: density scoring over generic block containers
    winner: Tag | None = None
    top_score = 0.0
    for block in soup.find_all(["div", "section", "td"]):
        block_score = _block_score(block)
        # Strict comparison keeps the first block among equal scores and
        # ignores blocks that scored zero.
        if block_score > top_score:
            top_score = block_score
            winner = block
    return winner
|
||||
|
||||
|
||||
def _collapse_whitespace(text: str) -> str:
|
||||
"""Collapse runs of blank lines into single separators."""
|
||||
lines = [line.strip() for line in text.splitlines()]
|
||||
result: list[str] = []
|
||||
prev_blank = False
|
||||
for line in lines:
|
||||
if not line:
|
||||
if not prev_blank:
|
||||
result.append("")
|
||||
prev_blank = True
|
||||
else:
|
||||
result.append(line)
|
||||
prev_blank = False
|
||||
return "\n".join(result).strip()
|
||||
|
||||
|
||||
def _remove_short_orphan_lines(text: str, min_words: int = 3) -> str:
|
||||
"""Remove very short orphan lines that are likely UI fragments or captions.
|
||||
|
||||
Lines shorter than min_words that don't end with sentence punctuation
|
||||
are stripped. This catches leftover button labels, image captions,
|
||||
and navigation fragments.
|
||||
|
||||
Requirements: 4.2
|
||||
"""
|
||||
lines = text.splitlines()
|
||||
kept: list[str] = []
|
||||
for line in lines:
|
||||
stripped = line.strip()
|
||||
words = stripped.split()
|
||||
if len(words) < min_words and not stripped.endswith((".", "!", "?", ":")):
|
||||
continue
|
||||
kept.append(line)
|
||||
return "\n".join(kept)
|
||||
|
||||
|
||||
def _detect_repeated_blocks(text: str, min_len: int = 40) -> str:
|
||||
"""Remove repeated text blocks that appear more than once.
|
||||
|
||||
Template text (disclaimers, repeated footers) often appears verbatim
|
||||
in multiple places. This strips exact duplicate blocks.
|
||||
|
||||
Requirements: 4.2
|
||||
"""
|
||||
lines = text.splitlines()
|
||||
seen: dict[str, int] = {}
|
||||
for line in lines:
|
||||
stripped = line.strip()
|
||||
if len(stripped) >= min_len:
|
||||
seen[stripped] = seen.get(stripped, 0) + 1
|
||||
|
||||
duplicates = {k for k, v in seen.items() if v > 1}
|
||||
if not duplicates:
|
||||
return text
|
||||
|
||||
kept: list[str] = []
|
||||
emitted: set[str] = set()
|
||||
for line in lines:
|
||||
stripped = line.strip()
|
||||
if stripped in duplicates:
|
||||
if stripped not in emitted:
|
||||
kept.append(line)
|
||||
emitted.add(stripped)
|
||||
# Skip subsequent duplicates
|
||||
else:
|
||||
kept.append(line)
|
||||
return "\n".join(kept)
|
||||
|
||||
|
||||
def extract_body_text(html: str) -> str:
    """Return the cleaned main body text of an HTML document.

    Steps:
      1. Parse the markup and strip non-content tags and boilerplate
         containers.
      2. Pick the article body (semantic selectors, then density scoring);
         fall back to ``<body>``, then to the whole document.
      3. Take the text with one text node per line.
      4. Clean it: regex boilerplate removal, short-orphan-line removal,
         repeated-block removal, whitespace collapsing.

    Requirements: 4.1, 4.2
    """
    soup = BeautifulSoup(html, "html.parser")
    _strip_boilerplate_tags(soup)

    container = _find_article_body(soup) or soup.find("body") or soup
    cleaned = container.get_text(separator="\n", strip=True)

    # Cleaning stages, applied in this order.
    stages = (
        _reduce_boilerplate_text,
        _remove_short_orphan_lines,
        _detect_repeated_blocks,
        _collapse_whitespace,
    )
    for stage in stages:
        cleaned = stage(cleaned)
    return cleaned
|
||||
|
||||
|
||||
def extract_metadata(html: str, url: str = "") -> dict[str, str | None]:
    """Collect document metadata from the HTML head.

    Returned keys: ``title``, ``author``, ``publisher``, ``published_at``,
    ``canonical_url``, ``language``, ``description`` and ``tags`` (the raw,
    comma-separated keywords string).

    Requirements: 4.1
    """
    soup = BeautifulSoup(html, "html.parser")

    def _content(tag) -> str | None:
        """Trimmed ``content`` of a meta tag.

        None when the tag or its content is missing/empty; "" when the
        content is not a plain string.
        """
        if not (tag and tag.get("content")):
            return None
        value = tag["content"]
        return value.strip() if isinstance(value, str) else ""

    # Title: og:title wins over <title>.
    title = _content(soup.find("meta", property="og:title"))
    if title is None:
        if soup.title and soup.title.string:
            title = soup.title.string.strip()
        else:
            title = ""

    # Author
    author = _content(soup.find("meta", attrs={"name": "author"}))
    if author is None:
        author = ""

    # Publisher: og:site_name, else the host of the given URL.
    publisher = _content(soup.find("meta", property="og:site_name"))
    if publisher is None:
        publisher = (urlparse(url).hostname or "") if url else ""

    # Published date: article:published_time, else JSON-LD datePublished.
    time_tag = soup.find("meta", property="article:published_time")
    if time_tag and time_tag.get("content"):
        stamp = time_tag["content"]
        published_at = stamp.strip() if isinstance(stamp, str) else None
    else:
        published_at = _extract_jsonld_date(soup)

    # Canonical URL: <link rel=canonical>, then og:url, then the given URL.
    link_tag = soup.find("link", rel="canonical")
    og_url_tag = None
    if not (link_tag and link_tag.get("href")):
        og_url_tag = soup.find("meta", property="og:url")
    if link_tag and link_tag.get("href"):
        canonical_url = str(link_tag["href"])
    elif og_url_tag and og_url_tag.get("content"):
        canonical_url = str(og_url_tag["content"])
    else:
        canonical_url = url or None

    # Language: <html lang>, cut to five characters; "en" when absent.
    root = soup.find("html")
    if root and root.get("lang"):
        language = str(root["lang"])[:5]
    else:
        language = "en"

    # Description: og:description wins over <meta name=description>.
    description = _content(
        soup.find("meta", property="og:description")
        or soup.find("meta", attrs={"name": "description"})
    )
    if description is None:
        description = ""

    # Tags / keywords, left as the comma-separated string.
    tags = _content(soup.find("meta", attrs={"name": "keywords"}))
    if tags is None:
        tags = ""

    return {
        "title": title,
        "author": author,
        "publisher": publisher,
        "published_at": published_at,
        "canonical_url": canonical_url,
        "language": language,
        "description": description,
        "tags": tags,
    }
|
||||
|
||||
|
||||
def _extract_jsonld_date(soup: BeautifulSoup) -> str | None:
|
||||
"""Try to extract datePublished from JSON-LD script tags."""
|
||||
for script in soup.find_all("script", type="application/ld+json"):
|
||||
if script.string and "datePublished" in script.string:
|
||||
try:
|
||||
ld = json.loads(script.string)
|
||||
if isinstance(ld, dict) and "datePublished" in ld:
|
||||
return str(ld["datePublished"])
|
||||
if isinstance(ld, list):
|
||||
for item in ld:
|
||||
if isinstance(item, dict) and "datePublished" in item:
|
||||
return str(item["datePublished"])
|
||||
except (json.JSONDecodeError, TypeError):
|
||||
pass
|
||||
return None
|
||||
|
||||
|
||||
def extract_outbound_links(html: str, base_url: str = "") -> list[str]:
    """Return the distinct external links of a page, in document order.

    A link is kept when its ``href`` is an absolute ``http``/``https`` URL
    whose host differs from the host of *base_url*. Fragment-only and
    ``javascript:`` hrefs are ignored, as are hrefs that cannot be parsed as
    a URL (``urlparse`` raises ``ValueError`` for e.g. a broken IPv6
    literal; one bad link must not abort the whole document).

    Args:
        html: Raw HTML markup.
        base_url: URL of the page itself; used to filter self-references.

    Returns:
        De-duplicated hrefs, first occurrence first.

    Requirements: 4.1
    """
    soup = BeautifulSoup(html, "html.parser")
    try:
        base_host = (urlparse(base_url).hostname or "") if base_url else ""
    except ValueError:
        # Unparseable base URL: nothing can be recognised as a self-link.
        base_host = ""

    # dict keys give order-preserving de-duplication.
    found: dict[str, None] = {}
    for a_tag in soup.find_all("a", href=True):
        href = str(a_tag["href"]).strip()
        if not href or href.startswith(("#", "javascript:")):
            continue
        try:
            parsed = urlparse(href)
            if parsed.scheme not in ("http", "https"):
                continue
            host = parsed.hostname
        except ValueError:
            continue
        if host and host != base_host:
            found.setdefault(href, None)
    return list(found)
|
||||
|
||||
|
||||
def _count_sentences(text: str) -> int:
|
||||
"""Count approximate sentence count by terminal punctuation."""
|
||||
return len(re.findall(r"[.!?]+(?:\s|$)", text))
|
||||
|
||||
|
||||
def _count_paragraphs(text: str) -> int:
|
||||
"""Count non-empty paragraph blocks separated by blank lines."""
|
||||
blocks = re.split(r"\n\s*\n", text.strip())
|
||||
return sum(1 for b in blocks if len(b.strip().split()) >= 5)
|
||||
|
||||
|
||||
def score_parse_quality(
    text: str,
    *,
    body_found: bool = True,
    has_title: bool = False,
    has_author: bool = False,
    has_publisher: bool = False,
    has_published_at: bool = False,
) -> tuple[float, str, QualitySignals, list[str]]:
    """Rate a parse result from content and metadata signals.

    Signals:
      - word_count_signal: amount of extracted text
      - diversity_signal: unique words / total words (case-insensitive)
      - sentence_signal: number of sentence terminators
      - paragraph_signal: number of multi-word paragraphs
      - body_found_signal: whether an article body element was located
      - metadata_signal: fraction of title/author/publisher/date present

    Returns:
        ``(score, confidence_label, signals, warnings)`` where *score* is the
        weighted sum of the signals, capped at 0.95 and rounded to two
        decimals, and *confidence_label* is ``low`` / ``medium`` / ``high``.

    Requirements: 4.3
    """
    notes: list[str] = []
    tokens = text.split()
    total = len(tokens)

    # --- word count ---
    if total >= 300:
        length_sig = 1.0
    elif total >= 150:
        length_sig = 0.8
    elif total >= 50:
        length_sig = 0.6
    elif total >= 20:
        length_sig = 0.3
        notes.append("short_text")
    else:
        length_sig = 0.1
        notes.append("very_short_text")

    # --- vocabulary diversity ---
    ratio = len({token.lower() for token in tokens}) / total if total > 0 else 0.0
    if ratio >= 0.4:
        variety_sig = 1.0
    elif ratio >= 0.2:
        variety_sig = 0.5
    else:
        variety_sig = 0.2
        # Very short texts already carry a length warning.
        if total >= 20:
            notes.append("low_vocabulary_diversity")

    # --- sentence structure ---
    n_sentences = _count_sentences(text)
    if n_sentences >= 3:
        sentence_sig = 1.0
    elif n_sentences > 0:
        sentence_sig = 0.5
    else:
        sentence_sig = 0.1
        if total >= 20:
            notes.append("no_sentence_structure")

    # --- paragraph structure ---
    paragraph_sig = {0: 0.2, 1: 0.5}.get(_count_paragraphs(text), 1.0)

    # --- article body located? ---
    if body_found:
        body_sig = 1.0
    else:
        body_sig = 0.3
        notes.append("no_article_body_found")

    # --- metadata completeness ---
    metadata_sig = sum((has_title, has_author, has_publisher, has_published_at)) / 4.0

    signals = QualitySignals(
        word_count_signal=length_sig,
        diversity_signal=variety_sig,
        sentence_signal=sentence_sig,
        paragraph_signal=paragraph_sig,
        body_found_signal=body_sig,
        metadata_signal=metadata_sig,
    )

    # Weighted composite, capped below 1.0.
    weighted = (
        0.30 * length_sig
        + 0.15 * variety_sig
        + 0.15 * sentence_sig
        + 0.10 * paragraph_sig
        + 0.20 * body_sig
        + 0.10 * metadata_sig
    )
    final = round(min(weighted, 0.95), 2)

    label = "low" if final < 0.35 else ("medium" if final < 0.65 else "high")
    return final, label, signals, notes
|
||||
|
||||
|
||||
def score_quality(text: str) -> tuple[float, str]:
    """Return ``(score, confidence_label)`` for *text*.

    Backward-compatible shortcut for ``score_parse_quality`` called with its
    default keyword arguments; the signals and warnings are discarded.

    Requirements: 4.3
    """
    result = score_parse_quality(text)
    return result[0], result[1]
|
||||
|
||||
|
||||
def infer_document_type(html: str, url: str = "") -> str:
    """Classify a document from keywords in its URL.

    Returns ``filing``, ``transcript`` or ``press_release`` for the first
    rule (in that order) with a keyword contained in the lower-cased URL,
    and ``article`` otherwise. *html* is accepted but not inspected; it is
    reserved for future content-based inference.

    Requirements: 4.1
    """
    rules = (
        ("filing", ("sec.gov", "edgar", "filing", "10-k", "10-q", "8-k")),
        ("transcript", ("transcript", "earnings-call", "earnings_call")),
        ("press_release", ("press-release", "press_release", "newsroom")),
    )
    needle_space = url.lower()
    for doc_type, keywords in rules:
        for keyword in keywords:
            if keyword in needle_space:
                return doc_type
    return "article"
|
||||
|
||||
|
||||
def parse_html(html: str, url: str = "", aliases: list[dict[str, str]] | None = None) -> ParsedDocument:
    """Run the complete HTML-to-text pipeline and return a ParsedDocument.

    Covers body extraction and cleaning, metadata and outbound-link
    extraction, document-type inference, quality scoring and - when
    *aliases* is given and the body is non-empty - company mention
    detection over "title + body".

    Requirements: 1.3, 4.1, 4.2, 4.3
    """
    soup = BeautifulSoup(html, "html.parser")
    _strip_boilerplate_tags(soup)

    # Body: best article candidate, else <body>, else the whole document.
    article = _find_article_body(soup)
    container = article or soup.find("body") or soup
    body = container.get_text(separator="\n", strip=True)

    # Multi-stage cleaning, in this order.
    for stage in (
        _reduce_boilerplate_text,
        _remove_short_orphan_lines,
        _detect_repeated_blocks,
        _collapse_whitespace,
    ):
        body = stage(body)

    metadata = extract_metadata(html, url)
    links = extract_outbound_links(html, url)
    kind = infer_document_type(html, url)

    # Keywords arrive as one comma-separated string.
    keyword_blob = metadata.get("tags", "") or ""
    keywords = [part.strip() for part in keyword_blob.split(",") if part.strip()]

    score, label, signals, notes = score_parse_quality(
        body,
        body_found=article is not None,
        has_title=bool(metadata.get("title")),
        has_author=bool(metadata.get("author")),
        has_publisher=bool(metadata.get("publisher")),
        has_published_at=bool(metadata.get("published_at")),
    )

    # Company mentions, searched in title + body.
    companies: list[CompanyMention] = []
    if aliases and body:
        search_text = f"{metadata.get('title', '')} {body}"
        companies = [
            CompanyMention(
                company_id=str(hit["company_id"]),
                ticker=str(hit["ticker"]),
                mention_type=str(hit["mention_type"]),
                confidence=float(hit["confidence"]),
                match_count=int(hit["match_count"]),
            )
            for hit in detect_company_mentions(search_text, aliases)
        ]

    return ParsedDocument(
        body_text=body,
        title=metadata.get("title", "") or "",
        author=metadata.get("author", "") or "",
        publisher=metadata.get("publisher", "") or "",
        published_at=metadata.get("published_at"),
        canonical_url=metadata.get("canonical_url"),
        language=metadata.get("language", "en") or "en",
        description=metadata.get("description", "") or "",
        document_type=kind,
        outbound_links=links,
        tags=keywords,
        mentioned_companies=companies,
        quality_score=score,
        confidence=label,
        word_count=len(body.split()),
        quality_signals=signals,
        low_quality_flag=label == "low",
        quality_warnings=notes,
    )
|
||||
|
||||
|
||||
|
||||
@dataclass
class AliasEntry:
    """One searchable name for a company, used by mention detection."""

    company_id: str
    # The text to look for in documents
    alias: str
    # Category of the alias; looked up in _CONFIDENCE_BY_TYPE
    alias_type: str = "alias"
    ticker: str = ""
|
||||
|
||||
|
||||
# Confidence by alias type — tickers are most precise, brands least
|
||||
_CONFIDENCE_BY_TYPE: dict[str, float] = {
|
||||
"ticker": 0.9,
|
||||
"legal_name": 0.85,
|
||||
"alias": 0.7,
|
||||
"brand": 0.6,
|
||||
}
|
||||
|
||||
|
||||
def _build_alias_entries(aliases: list[dict[str, str]]) -> list[AliasEntry]:
    """Turn raw alias dicts into AliasEntry objects.

    Dicts without a non-empty ``alias`` value are skipped; the other keys
    fall back to the defaults ``company_id=""``, ``alias_type="alias"`` and
    ``ticker=""``.
    """
    return [
        AliasEntry(
            company_id=raw.get("company_id", ""),
            alias=raw["alias"],
            alias_type=raw.get("alias_type", "alias"),
            ticker=raw.get("ticker", ""),
        )
        for raw in aliases
        if raw.get("alias", "")
    ]
|
||||
|
||||
|
||||
def _count_matches(text: str, pattern: re.Pattern[str]) -> int:
|
||||
"""Count non-overlapping matches of pattern in text."""
|
||||
return len(pattern.findall(text))
|
||||
|
||||
|
||||
def detect_company_mentions(
    text: str,
    aliases: list[dict[str, str]],
) -> list[dict[str, str | float | int]]:
    """Find which companies are mentioned in *text*.

    How an alias is matched depends on its length:
      - 1-2 chars: whole word, case-sensitive (so "A" does not match "a")
      - 3-4 chars: whole word, case-insensitive (typical tickers)
      - 5+ chars: plain substring, case-insensitive (names, brands)

    Each company appears once in the result. Its confidence, ticker and
    mention type come from the matching alias with the highest confidence
    (the earliest one on ties); ``match_count`` is the sum over all of its
    matching aliases.

    NOTE(review): the 5+ character rule is a substring test, so an alias can
    also hit inside a longer word - confirm this is intended.

    Requirements: 1.3, 4.1
    """
    if not text:
        return []

    candidates = _build_alias_entries(aliases)
    upper_text = text.upper()

    # company_id -> (confidence, ticker, mention_type, total matches)
    winners: dict[str, tuple[float, str, str, int]] = {}

    for candidate in candidates:
        name = candidate.alias
        kind = candidate.alias_type
        kind_confidence = _CONFIDENCE_BY_TYPE.get(kind, 0.7)

        size = len(name)
        if size > 4:
            hits = upper_text.count(name.upper())
        elif size > 2:
            word = re.compile(r"\b" + re.escape(name.upper()) + r"\b")
            hits = _count_matches(upper_text, word)
        else:
            word = re.compile(r"\b" + re.escape(name) + r"\b")
            hits = _count_matches(text, word)

        if hits == 0:
            continue

        company = candidate.company_id
        current = winners.get(company)
        if current is None:
            winners[company] = (kind_confidence, candidate.ticker, kind, hits)
            continue

        running_total = current[3] + hits
        if kind_confidence > current[0]:
            # A more precise alias takes over the descriptive fields.
            winners[company] = (kind_confidence, candidate.ticker, kind, running_total)
        else:
            winners[company] = (current[0], current[1], current[2], running_total)

    return [
        {
            "company_id": company,
            "ticker": ticker,
            "mention_type": mention_type,
            "confidence": confidence,
            "match_count": count,
        }
        for company, (confidence, ticker, mention_type, count) in winners.items()
    ]
|
||||
+108
-107
@@ -1,84 +1,41 @@
|
||||
"""Parser worker - HTML-to-text, boilerplate reduction, quality scoring."""
|
||||
"""Parser worker - HTML-to-text, boilerplate reduction, quality scoring.
|
||||
|
||||
Uses BeautifulSoup-based parsing pipeline for structured HTML extraction,
|
||||
metadata extraction, outbound link extraction, and quality scoring.
|
||||
Persists normalized text and structured parser output to MinIO,
|
||||
and updates document metadata in PostgreSQL.
|
||||
|
||||
Requirements: 4.1, 4.2, 4.3, 9.1, 9.2
|
||||
"""
|
||||
import asyncio
|
||||
import io
|
||||
import json
|
||||
import logging
|
||||
import re
|
||||
from datetime import datetime
|
||||
from typing import List, Optional, Tuple
|
||||
import time
|
||||
from datetime import datetime, timezone
|
||||
from typing import Any, Optional
|
||||
|
||||
import asyncpg
|
||||
import httpx
|
||||
import redis.asyncio as aioredis
|
||||
from minio import Minio
|
||||
|
||||
from services.parser.html_parser import ParsedDocument, detect_company_mentions, parse_html
|
||||
from services.shared.config import load_config
|
||||
from services.shared.db import get_minio, get_pg_pool, get_redis
|
||||
from services.shared.logging import Span, extract_trace_context, inject_trace_context, new_trace_id, set_trace_context, setup_logging
|
||||
from services.shared.metrics import (
|
||||
ACTIVE_JOBS,
|
||||
PARSE_DURATION,
|
||||
PARSE_JOBS_TOTAL,
|
||||
PARSE_LOW_QUALITY_TOTAL,
|
||||
PARSE_QUALITY_SCORE,
|
||||
)
|
||||
from services.shared.metadata import update_document_parse_results
|
||||
from services.shared.redis_keys import QUEUE_EXTRACTION, QUEUE_PARSING, queue_key
|
||||
from services.shared.storage import upload_normalized_text, upload_parser_output
|
||||
|
||||
logging.basicConfig(level=logging.INFO)
|
||||
logger = logging.getLogger("parser_worker")
|
||||
|
||||
# Residual boilerplate phrases to strip from extracted text.
# Each phrase pattern removes from the phrase up to the next newline (or the
# end of the text when no newline follows).
BOILERPLATE_PATTERNS = [
    re.compile(r"(?i)subscribe to our newsletter.*?(?:\n|$)"),
    re.compile(r"(?i)click here to read more.*?(?:\n|$)"),
    re.compile(r"(?i)advertisement\s*\n"),
    re.compile(r"(?i)copyright ©.*?(?:\n|$)"),
    re.compile(r"(?i)all rights reserved.*?(?:\n|$)"),
    re.compile(r"(?i)terms of (use|service).*?(?:\n|$)"),
    re.compile(r"(?i)privacy policy.*?(?:\n|$)"),
    # Bracketed ad markers such as "[Ad]" or "[ads by X]".
    # "ad"/"ads" must appear as a whole word so that "[NASDAQ: AAPL]" or
    # "[read more]" are left alone, and the match may not cross another
    # bracket, so prose between two unrelated bracket pairs is never deleted.
    re.compile(r"\s*\[[^\[\]]*\bads?\b[^\[\]]*\]\s*", re.IGNORECASE),
]
|
||||
|
||||
|
||||
def strip_html_tags(html: str) -> str:
    """Convert an HTML string to whitespace-normalized plain text.

    Steps, in order:
      1. Drop ``<script>`` and ``<style>`` elements together with their content.
      2. Replace every remaining tag with a single space.
      3. Decode character references (named, decimal and hex) in one pass.
      4. Collapse all whitespace runs to one space and trim the ends.

    Args:
        html: Raw HTML markup.

    Returns:
        The plain-text content on a single line.
    """
    # Imported locally under an alias: the ``html`` parameter shadows the
    # stdlib module of the same name inside this function.
    from html import unescape as _unescape

    text = re.sub(r"<script[^>]*>.*?</script>", "", html, flags=re.DOTALL | re.IGNORECASE)
    text = re.sub(r"<style[^>]*>.*?</style>", "", text, flags=re.DOTALL | re.IGNORECASE)
    text = re.sub(r"<[^>]+>", " ", text)
    # Decode entities only after tags are gone, so escaped markup such as
    # "&lt;b&gt;" survives as literal text instead of being stripped as a tag.
    # A single unescape pass also avoids double-decoding "&amp;lt;".
    text = _unescape(text)
    # \s matches the non-breaking space produced by "&nbsp;" as well.
    return re.sub(r"\s+", " ", text).strip()
|
||||
|
||||
|
||||
def reduce_boilerplate(text: str) -> str:
    """Remove every match of each BOILERPLATE_PATTERNS entry from *text*.

    Patterns are applied one after another, in list order, each on the
    result of the previous one. The final text is stripped of leading and
    trailing whitespace.
    """
    cleaned = text
    for boilerplate_re in BOILERPLATE_PATTERNS:
        cleaned = re.sub(boilerplate_re, "", cleaned)
    cleaned = cleaned.strip()
    return cleaned
|
||||
|
||||
|
||||
def score_quality(text: str) -> Tuple[float, str]:
    """Rate parse quality from the whitespace-delimited word count of *text*.

    Returns:
        A ``(score, confidence_label)`` pair:
        fewer than 20 words -> (0.1, "low"); fewer than 50 -> (0.3, "low");
        fewer than 150 -> (0.6, "medium"); otherwise (0.85, "high").
    """
    n_words = len(text.split())
    # (exclusive upper bound on word count, score, label), ascending.
    tiers = (
        (20, 0.1, "low"),
        (50, 0.3, "low"),
        (150, 0.6, "medium"),
    )
    for upper_bound, score, label in tiers:
        if n_words < upper_bound:
            return score, label
    return 0.85, "high"
|
||||
|
||||
|
||||
def detect_company_mentions(text: str, aliases: List[dict]) -> List[dict]:
    """Detect company mentions using ticker, alias, and name matching.

    Matching is case-insensitive and requires the alias to appear as a whole
    token: it must not be directly preceded or followed by a word character.
    This keeps short tickers such as "AI" from matching inside unrelated
    words like "train". Blank or missing aliases are ignored, since an empty
    string would otherwise match every text.

    Args:
        text: Normalized document text to scan.
        aliases: Dicts with at least ``alias`` and ``company_id`` keys;
            ``ticker`` and ``alias_type`` are optional.

    Returns:
        One mention dict per matching alias, in input order, with keys
        ``company_id``, ``ticker``, ``mention_type`` and ``confidence``.
    """
    mentions: List[dict] = []
    if not text:
        return mentions

    text_upper = text.upper()
    for alias_info in aliases:
        alias = (alias_info.get("alias") or "").strip()
        if not alias:
            continue
        # Lookarounds rather than \b so aliases that start or end with
        # punctuation (e.g. "Apple Inc.") still match as whole tokens.
        token_re = r"(?<!\w)" + re.escape(alias.upper()) + r"(?!\w)"
        if re.search(token_re, text_upper):
            mentions.append({
                "company_id": alias_info["company_id"],
                "ticker": alias_info.get("ticker", ""),
                "mention_type": alias_info.get("alias_type", "alias"),
                "confidence": 0.7,
            })
    return mentions
|
||||
|
||||
|
||||
async def fetch_html(url: str) -> Optional[str]:
|
||||
"""Fetch article HTML for scraping."""
|
||||
@@ -94,48 +51,65 @@ async def fetch_html(url: str) -> Optional[str]:
|
||||
return None
|
||||
|
||||
|
||||
def build_parser_output_json(parsed: ParsedDocument, mentions: list[dict[str, Any]]) -> dict[str, Any]:
    """Assemble the structured parser-output record for one document.

    The record carries the parsed document's metadata, outbound links, tags
    and quality fields copied as-is from *parsed*, followed by the quality
    signals (via ``parsed.quality_signals.as_dict()``) and the detected
    company mentions, so the full parser result can be stored for audit and
    downstream use.
    """
    # Attributes copied verbatim from the parsed document, in output key order.
    passthrough_fields = (
        "title",
        "author",
        "publisher",
        "published_at",
        "canonical_url",
        "language",
        "description",
        "document_type",
        "word_count",
        "outbound_links",
        "tags",
        "quality_score",
        "confidence",
        "low_quality_flag",
        "quality_warnings",
    )
    payload: dict[str, Any] = {name: getattr(parsed, name) for name in passthrough_fields}
    payload["quality_signals"] = parsed.quality_signals.as_dict()
    payload["mentioned_companies"] = mentions
    return payload
|
||||
|
||||
|
||||
async def process_job(
|
||||
job: dict,
|
||||
job: dict[str, Any],
|
||||
pool: asyncpg.Pool,
|
||||
rds: aioredis.Redis,
|
||||
minio_client: Minio,
|
||||
):
|
||||
) -> None:
|
||||
doc_id = job["document_id"]
|
||||
ticker = job["ticker"]
|
||||
url = job.get("url", "")
|
||||
now = datetime.now(timezone.utc)
|
||||
_parse_start = time.monotonic()
|
||||
|
||||
set_trace_context(trace_id=job.get("_trace_id") or new_trace_id())
|
||||
|
||||
# Fetch HTML if we have a URL
|
||||
html = await fetch_html(url) if url else None
|
||||
|
||||
if html:
|
||||
# Store raw HTML
|
||||
html_bytes = html.encode("utf-8")
|
||||
now = datetime.utcnow()
|
||||
html_path = f"scrape/{ticker}/{now.year}/{now.month:02d}/{now.day:02d}/{doc_id}/raw.html"
|
||||
minio_client.put_object(
|
||||
"stonks-raw-news", html_path, io.BytesIO(html_bytes), len(html_bytes),
|
||||
content_type="text/html",
|
||||
)
|
||||
|
||||
# Parse
|
||||
text = strip_html_tags(html)
|
||||
text = reduce_boilerplate(text)
|
||||
# Parse using BeautifulSoup pipeline
|
||||
parsed = parse_html(html, url)
|
||||
else:
|
||||
text = ""
|
||||
parsed = ParsedDocument()
|
||||
|
||||
quality_score, confidence = score_quality(text)
|
||||
text = parsed.body_text
|
||||
|
||||
# Store normalized text
|
||||
# Upload normalized text to MinIO
|
||||
norm_ref: str | None = None
|
||||
if text:
|
||||
text_bytes = text.encode("utf-8")
|
||||
now = datetime.utcnow()
|
||||
norm_path = f"parsed/{ticker}/{now.year}/{now.month:02d}/{now.day:02d}/{doc_id}/normalized.txt"
|
||||
minio_client.put_object(
|
||||
"stonks-normalized", norm_path, io.BytesIO(text_bytes), len(text_bytes),
|
||||
content_type="text/plain",
|
||||
norm_ref = upload_normalized_text(
|
||||
minio_client, ticker, doc_id,
|
||||
text.encode("utf-8"), timestamp=now,
|
||||
)
|
||||
else:
|
||||
norm_path = None
|
||||
|
||||
# Detect company mentions
|
||||
aliases = await pool.fetch(
|
||||
@@ -150,14 +124,24 @@ async def process_job(
|
||||
)
|
||||
mentions = detect_company_mentions(text, [dict(a) for a in aliases]) if text else []
|
||||
|
||||
# Update document
|
||||
status = "parsed" if confidence != "low" else "low_quality"
|
||||
await pool.execute(
|
||||
"""UPDATE documents SET
|
||||
normalized_storage_ref=$2, parse_quality_score=$3, parse_confidence=$4, status=$5, updated_at=NOW()
|
||||
WHERE id=$1""",
|
||||
doc_id, f"s3://stonks-normalized/{norm_path}" if norm_path else None,
|
||||
quality_score, confidence, status,
|
||||
# Build and upload structured parser output JSON
|
||||
output_json = build_parser_output_json(parsed, mentions)
|
||||
output_bytes = json.dumps(output_json, default=str, indent=2).encode("utf-8")
|
||||
parser_output_ref = upload_parser_output(
|
||||
minio_client, ticker, doc_id,
|
||||
output_bytes, timestamp=now,
|
||||
)
|
||||
|
||||
# Update document in PostgreSQL
|
||||
status = "parsed" if parsed.confidence != "low" else "low_quality"
|
||||
await update_document_parse_results(
|
||||
pool,
|
||||
document_id=doc_id,
|
||||
normalized_storage_ref=norm_ref,
|
||||
parser_output_ref=parser_output_ref,
|
||||
parse_quality_score=parsed.quality_score,
|
||||
parse_confidence=parsed.confidence,
|
||||
status=status,
|
||||
)
|
||||
|
||||
# Insert company mentions
|
||||
@@ -169,19 +153,36 @@ async def process_job(
|
||||
)
|
||||
|
||||
# Only enqueue for extraction if quality is acceptable
|
||||
if confidence != "low":
|
||||
await rds.rpush(queue_key(QUEUE_EXTRACTION), json.dumps({
|
||||
if parsed.confidence != "low":
|
||||
await rds.rpush(queue_key(QUEUE_EXTRACTION), json.dumps(inject_trace_context({
|
||||
"document_id": doc_id,
|
||||
"ticker": ticker,
|
||||
"normalized_text": text[:8000], # Truncate for prompt
|
||||
}))
|
||||
logger.info(f"Parsed doc {doc_id} for {ticker}: quality={quality_score:.2f}, confidence={confidence}")
|
||||
"normalized_text": text[:8000],
|
||||
})))
|
||||
PARSE_JOBS_TOTAL.labels(status="parsed").inc()
|
||||
PARSE_QUALITY_SCORE.observe(parsed.quality_score)
|
||||
PARSE_DURATION.observe(time.monotonic() - _parse_start)
|
||||
logger.info(
|
||||
"Parsed doc %s for %s: quality=%.2f, confidence=%s",
|
||||
doc_id, ticker, parsed.quality_score, parsed.confidence,
|
||||
extra={"ticker": ticker, "document_id": doc_id},
|
||||
)
|
||||
else:
|
||||
logger.warning(f"Low quality parse for doc {doc_id}, skipping extraction")
|
||||
PARSE_JOBS_TOTAL.labels(status="low_quality").inc()
|
||||
PARSE_LOW_QUALITY_TOTAL.inc()
|
||||
PARSE_QUALITY_SCORE.observe(parsed.quality_score)
|
||||
PARSE_DURATION.observe(time.monotonic() - _parse_start)
|
||||
logger.warning(
|
||||
"Low quality parse for doc %s, skipping extraction",
|
||||
doc_id,
|
||||
extra={"ticker": ticker, "document_id": doc_id},
|
||||
)
|
||||
|
||||
|
||||
async def main():
|
||||
async def main() -> None:
|
||||
config = load_config()
|
||||
setup_logging("parser_worker", level=config.log_level, json_output=config.json_logs)
|
||||
|
||||
pool = await get_pg_pool(config)
|
||||
rds = get_redis(config)
|
||||
minio_client = get_minio(config)
|
||||
@@ -197,7 +198,7 @@ async def main():
|
||||
try:
|
||||
await process_job(job, pool, rds, minio_client)
|
||||
except Exception as e:
|
||||
logger.error(f"Parse error: {e}")
|
||||
logger.error("Parse error: %s", e, exc_info=True)
|
||||
else:
|
||||
await asyncio.sleep(2)
|
||||
finally:
|
||||
|
||||
Reference in New Issue
Block a user