phase 14-15: docker build validation and helm deployment

2026-04-11 11:59:45 -07:00
parent 7394d241c9
commit ce10afa034
179 changed files with 32559 additions and 576 deletions
@@ -1,84 +1,41 @@
-"""Parser worker - HTML-to-text, boilerplate reduction, quality scoring."""
+"""Parser worker - HTML-to-text, boilerplate reduction, quality scoring.
+
+Uses a BeautifulSoup-based parsing pipeline for structured HTML extraction,
+metadata extraction, outbound link extraction, and quality scoring.
+Persists normalized text and structured parser output to MinIO,
+and updates document metadata in PostgreSQL.
+
+Requirements: 4.1, 4.2, 4.3, 9.1, 9.2
+"""
 import asyncio
-import io
 import json
 import logging
-import re
-from datetime import datetime
-from typing import List, Optional, Tuple
+import time
+from datetime import datetime, timezone
+from typing import Any, Optional

 import asyncpg
 import httpx
 import redis.asyncio as aioredis
 from minio import Minio

+from services.parser.html_parser import ParsedDocument, detect_company_mentions, parse_html
 from services.shared.config import load_config
 from services.shared.db import get_minio, get_pg_pool, get_redis
+from services.shared.logging import Span, extract_trace_context, inject_trace_context, new_trace_id, set_trace_context, setup_logging
+from services.shared.metrics import (
+    ACTIVE_JOBS,
+    PARSE_DURATION,
+    PARSE_JOBS_TOTAL,
+    PARSE_LOW_QUALITY_TOTAL,
+    PARSE_QUALITY_SCORE,
+)
+from services.shared.metadata import update_document_parse_results
 from services.shared.redis_keys import QUEUE_EXTRACTION, QUEUE_PARSING, queue_key
+from services.shared.storage import upload_normalized_text, upload_parser_output

-logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger("parser_worker")

-# Simple boilerplate patterns to strip
-BOILERPLATE_PATTERNS = [
-    re.compile(r"(?i)subscribe to our newsletter.*?(?:\n|$)"),
-    re.compile(r"(?i)click here to read more.*?(?:\n|$)"),
-    re.compile(r"(?i)advertisement\s*\n"),
-    re.compile(r"(?i)copyright ©.*?(?:\n|$)"),
-    re.compile(r"(?i)all rights reserved.*?(?:\n|$)"),
-    re.compile(r"(?i)terms of (use|service).*?(?:\n|$)"),
-    re.compile(r"(?i)privacy policy.*?(?:\n|$)"),
-    re.compile(r"\s*\[.*?ad.*?\]\s*", re.IGNORECASE),
-]
-
-
-def strip_html_tags(html: str) -> str:
-    """Basic HTML tag removal."""
-    text = re.sub(r"<script[^>]*>.*?</script>", "", html, flags=re.DOTALL | re.IGNORECASE)
-    text = re.sub(r"<style[^>]*>.*?</style>", "", text, flags=re.DOTALL | re.IGNORECASE)
-    text = re.sub(r"<[^>]+>", " ", text)
-    text = re.sub(r"&nbsp;", " ", text)
-    text = re.sub(r"&amp;", "&", text)
-    text = re.sub(r"&lt;", "<", text)
-    text = re.sub(r"&gt;", ">", text)
-    text = re.sub(r"&#\d+;", "", text)
-    text = re.sub(r"\s+", " ", text).strip()
-    return text
-
-
-def reduce_boilerplate(text: str) -> str:
-    for pattern in BOILERPLATE_PATTERNS:
-        text = pattern.sub("", text)
-    return text.strip()
-
-
-def score_quality(text: str) -> Tuple[float, str]:
-    """Score parse quality. Returns (score, confidence_label)."""
-    word_count = len(text.split())
-    if word_count < 20:
-        return 0.1, "low"
-    if word_count < 50:
-        return 0.3, "low"
-    if word_count < 150:
-        return 0.6, "medium"
-    return 0.85, "high"
-
-
-def detect_company_mentions(text: str, aliases: List[dict]) -> List[dict]:
-    """Detect company mentions using ticker, alias, and name matching."""
-    mentions = []
-    text_upper = text.upper()
-    for alias_info in aliases:
-        alias = alias_info["alias"]
-        if alias.upper() in text_upper:
-            mentions.append({
-                "company_id": alias_info["company_id"],
-                "ticker": alias_info.get("ticker", ""),
-                "mention_type": alias_info.get("alias_type", "alias"),
-                "confidence": 0.7,
-            })
-    return mentions
-

 async def fetch_html(url: str) -> Optional[str]:
    """Fetch article HTML for scraping."""
@@ -94,48 +51,65 @@ async def fetch_html(url: str) -> Optional[str]:
            return None


+def build_parser_output_json(parsed: ParsedDocument, mentions: list[dict[str, Any]]) -> dict[str, Any]:
+    """Assemble the full parser output (kept for audit and downstream use) as a dict.
+
+    Copies attributes of ``parsed`` under same-named keys (``quality_signals`` via
+    ``as_dict()``) and stores ``mentions`` unchanged as ``"mentioned_companies"``.
+    """
+    return {
+        "title": parsed.title,
+        "author": parsed.author,
+        "publisher": parsed.publisher,
+        "published_at": parsed.published_at,
+        "canonical_url": parsed.canonical_url,
+        "language": parsed.language,
+        "description": parsed.description,
+        "document_type": parsed.document_type,
+        "word_count": parsed.word_count,
+        "outbound_links": parsed.outbound_links,
+        "tags": parsed.tags,
+        "quality_score": parsed.quality_score,
+        "confidence": parsed.confidence,
+        "low_quality_flag": parsed.low_quality_flag,
+        "quality_warnings": parsed.quality_warnings,
+        "quality_signals": parsed.quality_signals.as_dict(),
+        "mentioned_companies": mentions,
+    }
+
+
 async def process_job(
-    job: dict,
+    job: dict[str, Any],
    pool: asyncpg.Pool,
    rds: aioredis.Redis,
    minio_client: Minio,
-):
+) -> None:
    doc_id = job["document_id"]
    ticker = job["ticker"]
    url = job.get("url", "")
+    now = datetime.now(timezone.utc)
+    _parse_start = time.monotonic()
+
+    set_trace_context(trace_id=job.get("_trace_id") or new_trace_id())

    # Fetch HTML if we have a URL
    html = await fetch_html(url) if url else None

    if html:
-        # Store raw HTML
-        html_bytes = html.encode("utf-8")
-        now = datetime.utcnow()
-        html_path = f"scrape/{ticker}/{now.year}/{now.month:02d}/{now.day:02d}/{doc_id}/raw.html"
-        minio_client.put_object(
-            "stonks-raw-news", html_path, io.BytesIO(html_bytes), len(html_bytes),
-            content_type="text/html",
-        )
-
-        # Parse
-        text = strip_html_tags(html)
-        text = reduce_boilerplate(text)
+        # Parse using BeautifulSoup pipeline
+        parsed = parse_html(html, url)
    else:
-        text = ""
+        parsed = ParsedDocument()

-    quality_score, confidence = score_quality(text)
+    text = parsed.body_text

-    # Store normalized text
+    # Upload normalized text to MinIO
+    norm_ref: str | None = None
    if text:
-        text_bytes = text.encode("utf-8")
-        now = datetime.utcnow()
-        norm_path = f"parsed/{ticker}/{now.year}/{now.month:02d}/{now.day:02d}/{doc_id}/normalized.txt"
-        minio_client.put_object(
-            "stonks-normalized", norm_path, io.BytesIO(text_bytes), len(text_bytes),
-            content_type="text/plain",
+        norm_ref = upload_normalized_text(
+            minio_client, ticker, doc_id,
+            text.encode("utf-8"), timestamp=now,
        )
-    else:
-        norm_path = None

    # Detect company mentions
    aliases = await pool.fetch(
@@ -150,14 +124,24 @@ async def process_job(
    )
    mentions = detect_company_mentions(text, [dict(a) for a in aliases]) if text else []

-    # Update document
-    status = "parsed" if confidence != "low" else "low_quality"
-    await pool.execute(
-        """UPDATE documents SET
-           normalized_storage_ref=$2, parse_quality_score=$3, parse_confidence=$4, status=$5, updated_at=NOW()
-           WHERE id=$1""",
-        doc_id, f"s3://stonks-normalized/{norm_path}" if norm_path else None,
-        quality_score, confidence, status,
+    # Build and upload structured parser output JSON
+    output_json = build_parser_output_json(parsed, mentions)
+    output_bytes = json.dumps(output_json, default=str, indent=2).encode("utf-8")
+    parser_output_ref = upload_parser_output(
+        minio_client, ticker, doc_id,
+        output_bytes, timestamp=now,
+    )
+
+    # Update document in PostgreSQL
+    status = "parsed" if parsed.confidence != "low" else "low_quality"
+    await update_document_parse_results(
+        pool,
+        document_id=doc_id,
+        normalized_storage_ref=norm_ref,
+        parser_output_ref=parser_output_ref,
+        parse_quality_score=parsed.quality_score,
+        parse_confidence=parsed.confidence,
+        status=status,
    )

    # Insert company mentions
@@ -169,19 +153,36 @@ async def process_job(
        )

    # Only enqueue for extraction if quality is acceptable
-    if confidence != "low":
-        await rds.rpush(queue_key(QUEUE_EXTRACTION), json.dumps({
+    if parsed.confidence != "low":
+        await rds.rpush(queue_key(QUEUE_EXTRACTION), json.dumps(inject_trace_context({
            "document_id": doc_id,
            "ticker": ticker,
-            "normalized_text": text[:8000],  # Truncate for prompt
-        }))
-        logger.info(f"Parsed doc {doc_id} for {ticker}: quality={quality_score:.2f}, confidence={confidence}")
+            "normalized_text": text[:8000],
+        })))
+        PARSE_JOBS_TOTAL.labels(status="parsed").inc()
+        PARSE_QUALITY_SCORE.observe(parsed.quality_score)
+        PARSE_DURATION.observe(time.monotonic() - _parse_start)
+        logger.info(
+            "Parsed doc %s for %s: quality=%.2f, confidence=%s",
+            doc_id, ticker, parsed.quality_score, parsed.confidence,
+            extra={"ticker": ticker, "document_id": doc_id},
+        )
    else:
-        logger.warning(f"Low quality parse for doc {doc_id}, skipping extraction")
+        PARSE_JOBS_TOTAL.labels(status="low_quality").inc()
+        PARSE_LOW_QUALITY_TOTAL.inc()
+        PARSE_QUALITY_SCORE.observe(parsed.quality_score)
+        PARSE_DURATION.observe(time.monotonic() - _parse_start)
+        logger.warning(
+            "Low quality parse for doc %s, skipping extraction",
+            doc_id,
+            extra={"ticker": ticker, "document_id": doc_id},
+        )


-async def main():
+async def main() -> None:
    config = load_config()
+    setup_logging("parser_worker", level=config.log_level, json_output=config.json_logs)
+
    pool = await get_pg_pool(config)
    rds = get_redis(config)
    minio_client = get_minio(config)
@@ -197,7 +198,7 @@ async def main():
                try:
                    await process_job(job, pool, rds, minio_client)
                except Exception as e:
-                    logger.error(f"Parse error: {e}")
+                    logger.error("Parse error: %s", e, exc_info=True)
            else:
                await asyncio.sleep(2)
    finally: