phase 14-15: docker build validation and helm deployment
This commit is contained in:
+108
-107
@@ -1,84 +1,41 @@
|
||||
"""Parser worker - HTML-to-text, boilerplate reduction, quality scoring."""
|
||||
"""Parser worker - HTML-to-text, boilerplate reduction, quality scoring.
|
||||
|
||||
Uses BeautifulSoup-based parsing pipeline for structured HTML extraction,
|
||||
metadata extraction, outbound link extraction, and quality scoring.
|
||||
Persists normalized text and structured parser output to MinIO,
|
||||
and updates document metadata in PostgreSQL.
|
||||
|
||||
Requirements: 4.1, 4.2, 4.3, 9.1, 9.2
|
||||
"""
|
||||
import asyncio
|
||||
import io
|
||||
import json
|
||||
import logging
|
||||
import re
|
||||
from datetime import datetime
|
||||
from typing import List, Optional, Tuple
|
||||
import time
|
||||
from datetime import datetime, timezone
|
||||
from typing import Any, Optional
|
||||
|
||||
import asyncpg
|
||||
import httpx
|
||||
import redis.asyncio as aioredis
|
||||
from minio import Minio
|
||||
|
||||
from services.parser.html_parser import ParsedDocument, detect_company_mentions, parse_html
|
||||
from services.shared.config import load_config
|
||||
from services.shared.db import get_minio, get_pg_pool, get_redis
|
||||
from services.shared.logging import Span, extract_trace_context, inject_trace_context, new_trace_id, set_trace_context, setup_logging
|
||||
from services.shared.metrics import (
|
||||
ACTIVE_JOBS,
|
||||
PARSE_DURATION,
|
||||
PARSE_JOBS_TOTAL,
|
||||
PARSE_LOW_QUALITY_TOTAL,
|
||||
PARSE_QUALITY_SCORE,
|
||||
)
|
||||
from services.shared.metadata import update_document_parse_results
|
||||
from services.shared.redis_keys import QUEUE_EXTRACTION, QUEUE_PARSING, queue_key
|
||||
from services.shared.storage import upload_normalized_text, upload_parser_output
|
||||
|
||||
logging.basicConfig(level=logging.INFO)
|
||||
logger = logging.getLogger("parser_worker")
|
||||
|
||||
# Boilerplate patterns stripped from extracted text, applied in list order.
# Most entries remove everything from the trigger phrase up to the next
# newline, or up to the end of the text when no newline follows.
BOILERPLATE_PATTERNS = [
    re.compile(r"(?i)subscribe to our newsletter.*?(?:\n|$)"),
    re.compile(r"(?i)click here to read more.*?(?:\n|$)"),
    re.compile(r"(?i)advertisement\s*\n"),
    re.compile(r"(?i)copyright ©.*?(?:\n|$)"),
    re.compile(r"(?i)all rights reserved.*?(?:\n|$)"),
    re.compile(r"(?i)terms of (use|service).*?(?:\n|$)"),
    re.compile(r"(?i)privacy policy.*?(?:\n|$)"),
    # Bracketed ad markers such as "[Ad]" or "[Advertisement]". The word
    # boundaries keep ordinary bracketed text that merely contains the letters
    # "ad" (e.g. "[read more]", "[Trade desk]") from being stripped, and the
    # negated character classes stop the match at the first closing bracket.
    re.compile(
        r"\s*\[[^\]\n]*\b(?:ads?|advert(?:isement)?s?)\b[^\]\n]*\]\s*",
        re.IGNORECASE,
    ),
]
|
||||
|
||||
|
||||
def strip_html_tags(html: str) -> str:
    """Return *html* reduced to plain text with a regex-based tag stripper.

    Steps, in order:

    1. ``<script>`` and ``<style>`` elements are removed together with
       their contents.
    2. Every remaining tag is replaced by a single space.
    3. The named entities ``&nbsp;``, ``&lt;`` and ``&gt;`` are decoded.
    4. Numeric entities of the form ``&#123;`` are dropped.
    5. ``&amp;`` is decoded.
    6. Runs of whitespace are collapsed to one space and the result is
       stripped.

    Args:
        html: Raw HTML markup.

    Returns:
        The whitespace-normalized text content.
    """
    text = re.sub(r"<script[^>]*>.*?</script>", "", html, flags=re.DOTALL | re.IGNORECASE)
    text = re.sub(r"<style[^>]*>.*?</style>", "", text, flags=re.DOTALL | re.IGNORECASE)
    text = re.sub(r"<[^>]+>", " ", text)

    # Decode entities only after tag removal, so a literal "&lt;b&gt;" in the
    # page text is not mistaken for markup and stripped.
    text = text.replace("&nbsp;", " ")
    text = text.replace("&lt;", "<")
    text = text.replace("&gt;", ">")
    text = re.sub(r"&#\d+;", "", text)
    # "&amp;" goes last: decoding it earlier would turn an escaped entity such
    # as "&amp;lt;" into "&lt;" and then decode (or drop) it a second time.
    text = text.replace("&amp;", "&")

    return re.sub(r"\s+", " ", text).strip()
|
||||
|
||||
|
||||
def reduce_boilerplate(text: str) -> str:
    """Delete every match of each ``BOILERPLATE_PATTERNS`` entry from *text*.

    The patterns are applied one after another in list order, each to the
    output of the previous one. The final result has leading and trailing
    whitespace stripped.
    """
    cleaned = text
    for boilerplate_re in BOILERPLATE_PATTERNS:
        cleaned = boilerplate_re.sub("", cleaned)
    return cleaned.strip()
|
||||
|
||||
|
||||
def score_quality(text: str) -> Tuple[float, str]:
    """Rate parse quality from the whitespace-delimited word count of *text*.

    Returns:
        A ``(score, confidence_label)`` pair chosen by word count:

        * fewer than 20 words  -> ``(0.1, "low")``
        * fewer than 50 words  -> ``(0.3, "low")``
        * fewer than 150 words -> ``(0.6, "medium")``
        * 150 words or more    -> ``(0.85, "high")``
    """
    n_words = len(text.split())
    # (exclusive upper bound on word count, score, label), ascending.
    tiers = (
        (20, 0.1, "low"),
        (50, 0.3, "low"),
        (150, 0.6, "medium"),
    )
    for upper_bound, score, label in tiers:
        if n_words < upper_bound:
            return score, label
    return 0.85, "high"
|
||||
|
||||
|
||||
def detect_company_mentions(text: str, aliases: List[dict]) -> List[dict]:
    """Find which company aliases occur in *text* as whole words.

    Matching is case-insensitive (both sides are upper-cased) and requires
    that the alias is not directly preceded or followed by a word character,
    so a short ticker such as ``"AI"`` does not match inside ``"MAIN"``.
    Entries whose alias is missing, ``None`` or blank are skipped, because an
    empty alias would otherwise match every document.

    Args:
        text: Text to scan.
        aliases: Dicts with an ``"alias"`` and a ``"company_id"`` key, and
            optional ``"ticker"`` and ``"alias_type"`` keys.

    Returns:
        One dict per matching alias entry, in input order, with the keys
        ``company_id``, ``ticker`` (default ``""``), ``mention_type``
        (default ``"alias"``) and a fixed ``confidence`` of ``0.7``.
    """
    mentions: List[dict] = []
    if not text:
        return mentions

    text_upper = text.upper()
    for alias_info in aliases:
        alias = (alias_info.get("alias") or "").strip()
        if not alias:
            continue
        # Lookarounds rather than \b, so aliases that start or end with a
        # non-word character (e.g. "AT&T", "Yahoo!") still match correctly.
        whole_word = r"(?<!\w)" + re.escape(alias.upper()) + r"(?!\w)"
        if re.search(whole_word, text_upper):
            mentions.append({
                "company_id": alias_info["company_id"],
                "ticker": alias_info.get("ticker", ""),
                "mention_type": alias_info.get("alias_type", "alias"),
                "confidence": 0.7,
            })
    return mentions
|
||||
|
||||
|
||||
async def fetch_html(url: str) -> Optional[str]:
|
||||
"""Fetch article HTML for scraping."""
|
||||
@@ -94,48 +51,65 @@ async def fetch_html(url: str) -> Optional[str]:
|
||||
return None
|
||||
|
||||
|
||||
def build_parser_output_json(parsed: ParsedDocument, mentions: list[dict[str, Any]]) -> dict[str, Any]:
    """Flatten a ParsedDocument and its detected mentions into a plain dict.

    The result carries the document's metadata fields, quality fields,
    warnings, outbound links and tags, each copied verbatim from *parsed*.
    It also carries ``quality_signals`` (the output of
    ``parsed.quality_signals.as_dict()``) and ``mentioned_companies``
    (the *mentions* list as given).
    """
    # Attributes copied one-to-one; the order here is the key order of the output.
    copied_fields = (
        "title",
        "author",
        "publisher",
        "published_at",
        "canonical_url",
        "language",
        "description",
        "document_type",
        "word_count",
        "outbound_links",
        "tags",
        "quality_score",
        "confidence",
        "low_quality_flag",
        "quality_warnings",
    )
    payload: dict[str, Any] = {name: getattr(parsed, name) for name in copied_fields}
    payload["quality_signals"] = parsed.quality_signals.as_dict()
    payload["mentioned_companies"] = mentions
    return payload
|
||||
|
||||
|
||||
async def process_job(
|
||||
job: dict,
|
||||
job: dict[str, Any],
|
||||
pool: asyncpg.Pool,
|
||||
rds: aioredis.Redis,
|
||||
minio_client: Minio,
|
||||
):
|
||||
) -> None:
|
||||
doc_id = job["document_id"]
|
||||
ticker = job["ticker"]
|
||||
url = job.get("url", "")
|
||||
now = datetime.now(timezone.utc)
|
||||
_parse_start = time.monotonic()
|
||||
|
||||
set_trace_context(trace_id=job.get("_trace_id") or new_trace_id())
|
||||
|
||||
# Fetch HTML if we have a URL
|
||||
html = await fetch_html(url) if url else None
|
||||
|
||||
if html:
|
||||
# Store raw HTML
|
||||
html_bytes = html.encode("utf-8")
|
||||
now = datetime.utcnow()
|
||||
html_path = f"scrape/{ticker}/{now.year}/{now.month:02d}/{now.day:02d}/{doc_id}/raw.html"
|
||||
minio_client.put_object(
|
||||
"stonks-raw-news", html_path, io.BytesIO(html_bytes), len(html_bytes),
|
||||
content_type="text/html",
|
||||
)
|
||||
|
||||
# Parse
|
||||
text = strip_html_tags(html)
|
||||
text = reduce_boilerplate(text)
|
||||
# Parse using BeautifulSoup pipeline
|
||||
parsed = parse_html(html, url)
|
||||
else:
|
||||
text = ""
|
||||
parsed = ParsedDocument()
|
||||
|
||||
quality_score, confidence = score_quality(text)
|
||||
text = parsed.body_text
|
||||
|
||||
# Store normalized text
|
||||
# Upload normalized text to MinIO
|
||||
norm_ref: str | None = None
|
||||
if text:
|
||||
text_bytes = text.encode("utf-8")
|
||||
now = datetime.utcnow()
|
||||
norm_path = f"parsed/{ticker}/{now.year}/{now.month:02d}/{now.day:02d}/{doc_id}/normalized.txt"
|
||||
minio_client.put_object(
|
||||
"stonks-normalized", norm_path, io.BytesIO(text_bytes), len(text_bytes),
|
||||
content_type="text/plain",
|
||||
norm_ref = upload_normalized_text(
|
||||
minio_client, ticker, doc_id,
|
||||
text.encode("utf-8"), timestamp=now,
|
||||
)
|
||||
else:
|
||||
norm_path = None
|
||||
|
||||
# Detect company mentions
|
||||
aliases = await pool.fetch(
|
||||
@@ -150,14 +124,24 @@ async def process_job(
|
||||
)
|
||||
mentions = detect_company_mentions(text, [dict(a) for a in aliases]) if text else []
|
||||
|
||||
# Update document
|
||||
status = "parsed" if confidence != "low" else "low_quality"
|
||||
await pool.execute(
|
||||
"""UPDATE documents SET
|
||||
normalized_storage_ref=$2, parse_quality_score=$3, parse_confidence=$4, status=$5, updated_at=NOW()
|
||||
WHERE id=$1""",
|
||||
doc_id, f"s3://stonks-normalized/{norm_path}" if norm_path else None,
|
||||
quality_score, confidence, status,
|
||||
# Build and upload structured parser output JSON
|
||||
output_json = build_parser_output_json(parsed, mentions)
|
||||
output_bytes = json.dumps(output_json, default=str, indent=2).encode("utf-8")
|
||||
parser_output_ref = upload_parser_output(
|
||||
minio_client, ticker, doc_id,
|
||||
output_bytes, timestamp=now,
|
||||
)
|
||||
|
||||
# Update document in PostgreSQL
|
||||
status = "parsed" if parsed.confidence != "low" else "low_quality"
|
||||
await update_document_parse_results(
|
||||
pool,
|
||||
document_id=doc_id,
|
||||
normalized_storage_ref=norm_ref,
|
||||
parser_output_ref=parser_output_ref,
|
||||
parse_quality_score=parsed.quality_score,
|
||||
parse_confidence=parsed.confidence,
|
||||
status=status,
|
||||
)
|
||||
|
||||
# Insert company mentions
|
||||
@@ -169,19 +153,36 @@ async def process_job(
|
||||
)
|
||||
|
||||
# Only enqueue for extraction if quality is acceptable
|
||||
if confidence != "low":
|
||||
await rds.rpush(queue_key(QUEUE_EXTRACTION), json.dumps({
|
||||
if parsed.confidence != "low":
|
||||
await rds.rpush(queue_key(QUEUE_EXTRACTION), json.dumps(inject_trace_context({
|
||||
"document_id": doc_id,
|
||||
"ticker": ticker,
|
||||
"normalized_text": text[:8000], # Truncate for prompt
|
||||
}))
|
||||
logger.info(f"Parsed doc {doc_id} for {ticker}: quality={quality_score:.2f}, confidence={confidence}")
|
||||
"normalized_text": text[:8000],
|
||||
})))
|
||||
PARSE_JOBS_TOTAL.labels(status="parsed").inc()
|
||||
PARSE_QUALITY_SCORE.observe(parsed.quality_score)
|
||||
PARSE_DURATION.observe(time.monotonic() - _parse_start)
|
||||
logger.info(
|
||||
"Parsed doc %s for %s: quality=%.2f, confidence=%s",
|
||||
doc_id, ticker, parsed.quality_score, parsed.confidence,
|
||||
extra={"ticker": ticker, "document_id": doc_id},
|
||||
)
|
||||
else:
|
||||
logger.warning(f"Low quality parse for doc {doc_id}, skipping extraction")
|
||||
PARSE_JOBS_TOTAL.labels(status="low_quality").inc()
|
||||
PARSE_LOW_QUALITY_TOTAL.inc()
|
||||
PARSE_QUALITY_SCORE.observe(parsed.quality_score)
|
||||
PARSE_DURATION.observe(time.monotonic() - _parse_start)
|
||||
logger.warning(
|
||||
"Low quality parse for doc %s, skipping extraction",
|
||||
doc_id,
|
||||
extra={"ticker": ticker, "document_id": doc_id},
|
||||
)
|
||||
|
||||
|
||||
async def main():
|
||||
async def main() -> None:
|
||||
config = load_config()
|
||||
setup_logging("parser_worker", level=config.log_level, json_output=config.json_logs)
|
||||
|
||||
pool = await get_pg_pool(config)
|
||||
rds = get_redis(config)
|
||||
minio_client = get_minio(config)
|
||||
@@ -197,7 +198,7 @@ async def main():
|
||||
try:
|
||||
await process_job(job, pool, rds, minio_client)
|
||||
except Exception as e:
|
||||
logger.error(f"Parse error: {e}")
|
||||
logger.error("Parse error: %s", e, exc_info=True)
|
||||
else:
|
||||
await asyncio.sleep(2)
|
||||
finally:
|
||||
|
||||
Reference in New Issue
Block a user