phase 14-15: docker build validation and helm deployment

This commit is contained in:
Celes Renata
2026-04-11 11:59:45 -07:00
parent 7394d241c9
commit ce10afa034
179 changed files with 32559 additions and 576 deletions
+108 -107
View File
@@ -1,84 +1,41 @@
"""Parser worker - HTML-to-text, boilerplate reduction, quality scoring."""
"""Parser worker - HTML-to-text, boilerplate reduction, quality scoring.
Uses BeautifulSoup-based parsing pipeline for structured HTML extraction,
metadata extraction, outbound link extraction, and quality scoring.
Persists normalized text and structured parser output to MinIO,
and updates document metadata in PostgreSQL.
Requirements: 4.1, 4.2, 4.3, 9.1, 9.2
"""
import asyncio
import io
import json
import logging
import re
from datetime import datetime
from typing import List, Optional, Tuple
import time
from datetime import datetime, timezone
from typing import Any, Optional
import asyncpg
import httpx
import redis.asyncio as aioredis
from minio import Minio
from services.parser.html_parser import ParsedDocument, detect_company_mentions, parse_html
from services.shared.config import load_config
from services.shared.db import get_minio, get_pg_pool, get_redis
from services.shared.logging import Span, extract_trace_context, inject_trace_context, new_trace_id, set_trace_context, setup_logging
from services.shared.metrics import (
ACTIVE_JOBS,
PARSE_DURATION,
PARSE_JOBS_TOTAL,
PARSE_LOW_QUALITY_TOTAL,
PARSE_QUALITY_SCORE,
)
from services.shared.metadata import update_document_parse_results
from services.shared.redis_keys import QUEUE_EXTRACTION, QUEUE_PARSING, queue_key
from services.shared.storage import upload_normalized_text, upload_parser_output
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("parser_worker")
# Boilerplate fragments stripped from extracted article text.
#
# Every pattern is case-insensitive.  The ``.*?(?:\n|$)`` tail removes the
# rest of the line on which the phrase occurs (or the rest of the string when
# no newline follows).
# NOTE(review): strip_html_tags() collapses all whitespace to single spaces,
# so on its output no newline survives and a match removes everything up to
# the end of the text -- confirm this is intended.
BOILERPLATE_PATTERNS = [
    re.compile(r"(?i)subscribe to our newsletter.*?(?:\n|$)"),
    re.compile(r"(?i)click here to read more.*?(?:\n|$)"),
    re.compile(r"(?i)advertisement\s*\n"),
    re.compile(r"(?i)copyright ©.*?(?:\n|$)"),
    re.compile(r"(?i)all rights reserved.*?(?:\n|$)"),
    re.compile(r"(?i)terms of (use|service).*?(?:\n|$)"),
    re.compile(r"(?i)privacy policy.*?(?:\n|$)"),
    # Bracketed ad markers such as "[ad]" or "[Advertisement]".  The match is
    # confined to a single bracket pair and requires "ad"/"advert..." as a
    # whole word, so "[read more]" or "[1] bad thing [2]" are left alone.
    # Only leading whitespace is consumed so the neighbouring words are not
    # glued together once the marker is removed.
    re.compile(r"(?i)\s*\[[^\[\]]*\b(?:ads?|advert(?:isement)?s?)\b[^\[\]]*\]"),
]
def strip_html_tags(html: str) -> str:
    """Convert an HTML fragment to plain text using regex-based tag removal.

    ``<script>`` and ``<style>`` elements are dropped together with their
    contents, every remaining tag is replaced by a space, character
    references are decoded, and runs of whitespace are collapsed to a single
    space.

    Args:
        html: Raw HTML markup.

    Returns:
        The visible text with leading and trailing whitespace removed.
    """
    # The ``html`` parameter shadows the stdlib module of the same name, so
    # only the function that is needed is imported here.
    from html import unescape

    text = re.sub(r"<script[^>]*>.*?</script\s*>", "", html, flags=re.DOTALL | re.IGNORECASE)
    text = re.sub(r"<style[^>]*>.*?</style\s*>", "", text, flags=re.DOTALL | re.IGNORECASE)
    text = re.sub(r"<[^>]+>", " ", text)
    # Decode entities in one pass, after the tags are gone: a single pass
    # keeps "&amp;lt;" as the literal text "&lt;" instead of double-decoding
    # it to "<", and numeric/named references are decoded rather than dropped.
    text = unescape(text)
    # "\s" also matches the non-breaking space produced by "&nbsp;".
    return re.sub(r"\s+", " ", text).strip()
def reduce_boilerplate(text: str) -> str:
    """Remove every BOILERPLATE_PATTERNS match from *text*.

    The patterns are applied one after another, each on the result of the
    previous one, and the final string is stripped of surrounding whitespace.
    """
    cleaned = text
    for boilerplate_re in BOILERPLATE_PATTERNS:
        cleaned = boilerplate_re.sub("", cleaned)
    return cleaned.strip()
def score_quality(text: str) -> Tuple[float, str]:
    """Rate parsed text by its whitespace-separated word count.

    Returns:
        A ``(score, confidence_label)`` pair: fewer than 20 words gives
        ``(0.1, "low")``, fewer than 50 gives ``(0.3, "low")``, fewer than
        150 gives ``(0.6, "medium")``, anything longer ``(0.85, "high")``.
    """
    n_words = len(text.split())
    # (exclusive upper bound on the word count, score, label), ascending.
    bands = (
        (20, 0.1, "low"),
        (50, 0.3, "low"),
        (150, 0.6, "medium"),
    )
    for limit, score, label in bands:
        if n_words < limit:
            return score, label
    return 0.85, "high"
def detect_company_mentions(text: str, aliases: List[dict]) -> List[dict]:
    """Find which company aliases occur in *text*.

    Matching is a case-insensitive substring test of each alias against the
    text; it is not word-bounded, so a short alias also matches inside longer
    words.

    Args:
        text: Document text to scan.
        aliases: Rows carrying ``alias`` and ``company_id`` plus optional
            ``ticker`` and ``alias_type`` keys.

    Returns:
        One dict per matching alias row with ``company_id``, ``ticker``
        (``""`` when absent), ``mention_type`` (``"alias"`` when the row has
        no ``alias_type``) and a fixed ``confidence`` of ``0.7``.
    """
    haystack = text.upper()
    mentions = []
    for alias_info in aliases:
        needle = (alias_info.get("alias") or "").strip().upper()
        # A blank or missing alias would match every document, because the
        # empty string is a substring of any text; skip such rows.
        if not needle:
            continue
        if needle in haystack:
            mentions.append({
                "company_id": alias_info["company_id"],
                "ticker": alias_info.get("ticker", ""),
                "mention_type": alias_info.get("alias_type", "alias"),
                "confidence": 0.7,
            })
    return mentions
async def fetch_html(url: str) -> Optional[str]:
"""Fetch article HTML for scraping."""
@@ -94,48 +51,65 @@ async def fetch_html(url: str) -> Optional[str]:
return None
def build_parser_output_json(parsed: ParsedDocument, mentions: list[dict[str, Any]]) -> dict[str, Any]:
    """Assemble the JSON-ready record describing one parser run.

    The record carries the document metadata, quality score and flags,
    quality warnings, outbound links and tags read from *parsed*, the
    ``as_dict()`` form of its quality signals, and the detected *mentions*
    under ``mentioned_companies``.
    """
    # Attributes copied verbatim from the parsed document; the order here is
    # the key order of the resulting dict.
    copied_fields = (
        "title",
        "author",
        "publisher",
        "published_at",
        "canonical_url",
        "language",
        "description",
        "document_type",
        "word_count",
        "outbound_links",
        "tags",
        "quality_score",
        "confidence",
        "low_quality_flag",
        "quality_warnings",
    )
    payload: dict[str, Any] = {name: getattr(parsed, name) for name in copied_fields}
    payload["quality_signals"] = parsed.quality_signals.as_dict()
    payload["mentioned_companies"] = mentions
    return payload
async def process_job(
job: dict,
job: dict[str, Any],
pool: asyncpg.Pool,
rds: aioredis.Redis,
minio_client: Minio,
):
) -> None:
doc_id = job["document_id"]
ticker = job["ticker"]
url = job.get("url", "")
now = datetime.now(timezone.utc)
_parse_start = time.monotonic()
set_trace_context(trace_id=job.get("_trace_id") or new_trace_id())
# Fetch HTML if we have a URL
html = await fetch_html(url) if url else None
if html:
# Store raw HTML
html_bytes = html.encode("utf-8")
now = datetime.utcnow()
html_path = f"scrape/{ticker}/{now.year}/{now.month:02d}/{now.day:02d}/{doc_id}/raw.html"
minio_client.put_object(
"stonks-raw-news", html_path, io.BytesIO(html_bytes), len(html_bytes),
content_type="text/html",
)
# Parse
text = strip_html_tags(html)
text = reduce_boilerplate(text)
# Parse using BeautifulSoup pipeline
parsed = parse_html(html, url)
else:
text = ""
parsed = ParsedDocument()
quality_score, confidence = score_quality(text)
text = parsed.body_text
# Store normalized text
# Upload normalized text to MinIO
norm_ref: str | None = None
if text:
text_bytes = text.encode("utf-8")
now = datetime.utcnow()
norm_path = f"parsed/{ticker}/{now.year}/{now.month:02d}/{now.day:02d}/{doc_id}/normalized.txt"
minio_client.put_object(
"stonks-normalized", norm_path, io.BytesIO(text_bytes), len(text_bytes),
content_type="text/plain",
norm_ref = upload_normalized_text(
minio_client, ticker, doc_id,
text.encode("utf-8"), timestamp=now,
)
else:
norm_path = None
# Detect company mentions
aliases = await pool.fetch(
@@ -150,14 +124,24 @@ async def process_job(
)
mentions = detect_company_mentions(text, [dict(a) for a in aliases]) if text else []
# Update document
status = "parsed" if confidence != "low" else "low_quality"
await pool.execute(
"""UPDATE documents SET
normalized_storage_ref=$2, parse_quality_score=$3, parse_confidence=$4, status=$5, updated_at=NOW()
WHERE id=$1""",
doc_id, f"s3://stonks-normalized/{norm_path}" if norm_path else None,
quality_score, confidence, status,
# Build and upload structured parser output JSON
output_json = build_parser_output_json(parsed, mentions)
output_bytes = json.dumps(output_json, default=str, indent=2).encode("utf-8")
parser_output_ref = upload_parser_output(
minio_client, ticker, doc_id,
output_bytes, timestamp=now,
)
# Update document in PostgreSQL
status = "parsed" if parsed.confidence != "low" else "low_quality"
await update_document_parse_results(
pool,
document_id=doc_id,
normalized_storage_ref=norm_ref,
parser_output_ref=parser_output_ref,
parse_quality_score=parsed.quality_score,
parse_confidence=parsed.confidence,
status=status,
)
# Insert company mentions
@@ -169,19 +153,36 @@ async def process_job(
)
# Only enqueue for extraction if quality is acceptable
if confidence != "low":
await rds.rpush(queue_key(QUEUE_EXTRACTION), json.dumps({
if parsed.confidence != "low":
await rds.rpush(queue_key(QUEUE_EXTRACTION), json.dumps(inject_trace_context({
"document_id": doc_id,
"ticker": ticker,
"normalized_text": text[:8000], # Truncate for prompt
}))
logger.info(f"Parsed doc {doc_id} for {ticker}: quality={quality_score:.2f}, confidence={confidence}")
"normalized_text": text[:8000],
})))
PARSE_JOBS_TOTAL.labels(status="parsed").inc()
PARSE_QUALITY_SCORE.observe(parsed.quality_score)
PARSE_DURATION.observe(time.monotonic() - _parse_start)
logger.info(
"Parsed doc %s for %s: quality=%.2f, confidence=%s",
doc_id, ticker, parsed.quality_score, parsed.confidence,
extra={"ticker": ticker, "document_id": doc_id},
)
else:
logger.warning(f"Low quality parse for doc {doc_id}, skipping extraction")
PARSE_JOBS_TOTAL.labels(status="low_quality").inc()
PARSE_LOW_QUALITY_TOTAL.inc()
PARSE_QUALITY_SCORE.observe(parsed.quality_score)
PARSE_DURATION.observe(time.monotonic() - _parse_start)
logger.warning(
"Low quality parse for doc %s, skipping extraction",
doc_id,
extra={"ticker": ticker, "document_id": doc_id},
)
async def main():
async def main() -> None:
config = load_config()
setup_logging("parser_worker", level=config.log_level, json_output=config.json_logs)
pool = await get_pg_pool(config)
rds = get_redis(config)
minio_client = get_minio(config)
@@ -197,7 +198,7 @@ async def main():
try:
await process_job(job, pool, rds, minio_client)
except Exception as e:
logger.error(f"Parse error: {e}")
logger.error("Parse error: %s", e, exc_info=True)
else:
await asyncio.sleep(2)
finally: