"""Parser worker - HTML-to-text, boilerplate reduction, quality scoring.""" import asyncio import io import json import logging import re from datetime import datetime from typing import List, Optional, Tuple import asyncpg import httpx import redis.asyncio as aioredis from minio import Minio from services.shared.config import load_config from services.shared.db import get_minio, get_pg_pool, get_redis from services.shared.redis_keys import QUEUE_EXTRACTION, QUEUE_PARSING, queue_key logging.basicConfig(level=logging.INFO) logger = logging.getLogger("parser_worker") # Simple boilerplate patterns to strip BOILERPLATE_PATTERNS = [ re.compile(r"(?i)subscribe to our newsletter.*?(?:\n|$)"), re.compile(r"(?i)click here to read more.*?(?:\n|$)"), re.compile(r"(?i)advertisement\s*\n"), re.compile(r"(?i)copyright ©.*?(?:\n|$)"), re.compile(r"(?i)all rights reserved.*?(?:\n|$)"), re.compile(r"(?i)terms of (use|service).*?(?:\n|$)"), re.compile(r"(?i)privacy policy.*?(?:\n|$)"), re.compile(r"\s*\[.*?ad.*?\]\s*", re.IGNORECASE), ] def strip_html_tags(html: str) -> str: """Basic HTML tag removal.""" text = re.sub(r"]*>.*?", "", html, flags=re.DOTALL | re.IGNORECASE) text = re.sub(r"]*>.*?", "", text, flags=re.DOTALL | re.IGNORECASE) text = re.sub(r"<[^>]+>", " ", text) text = re.sub(r" ", " ", text) text = re.sub(r"&", "&", text) text = re.sub(r"<", "<", text) text = re.sub(r">", ">", text) text = re.sub(r"&#\d+;", "", text) text = re.sub(r"\s+", " ", text).strip() return text def reduce_boilerplate(text: str) -> str: for pattern in BOILERPLATE_PATTERNS: text = pattern.sub("", text) return text.strip() def score_quality(text: str) -> Tuple[float, str]: """Score parse quality. Returns (score, confidence_label).""" word_count = len(text.split()) if word_count < 20: return 0.1, "low" if word_count < 50: return 0.3, "low" if word_count < 150: return 0.6, "medium" return 0.85, "high" def detect_company_mentions(text: str, aliases: List[dict]) -> List[dict]: """Detect company mentions using ticker, alias, and name matching.""" mentions = [] text_upper = text.upper() for alias_info in aliases: alias = alias_info["alias"] if alias.upper() in text_upper: mentions.append({ "company_id": alias_info["company_id"], "ticker": alias_info.get("ticker", ""), "mention_type": alias_info.get("alias_type", "alias"), "confidence": 0.7, }) return mentions async def fetch_html(url: str) -> Optional[str]: """Fetch article HTML for scraping.""" if not url: return None async with httpx.AsyncClient(timeout=30, follow_redirects=True) as client: try: resp = await client.get(url, headers={"User-Agent": "StonksOracle/1.0"}) resp.raise_for_status() return resp.text except Exception as e: logger.warning(f"Failed to fetch {url}: {e}") return None async def process_job( job: dict, pool: asyncpg.Pool, rds: aioredis.Redis, minio_client: Minio, ): doc_id = job["document_id"] ticker = job["ticker"] url = job.get("url", "") # Fetch HTML if we have a URL html = await fetch_html(url) if url else None if html: # Store raw HTML html_bytes = html.encode("utf-8") now = datetime.utcnow() html_path = f"scrape/{ticker}/{now.year}/{now.month:02d}/{now.day:02d}/{doc_id}/raw.html" minio_client.put_object( "stonks-raw-news", html_path, io.BytesIO(html_bytes), len(html_bytes), content_type="text/html", ) # Parse text = strip_html_tags(html) text = reduce_boilerplate(text) else: text = "" quality_score, confidence = score_quality(text) # Store normalized text if text: text_bytes = text.encode("utf-8") now = datetime.utcnow() norm_path = f"parsed/{ticker}/{now.year}/{now.month:02d}/{now.day:02d}/{doc_id}/normalized.txt" minio_client.put_object( "stonks-normalized", norm_path, io.BytesIO(text_bytes), len(text_bytes), content_type="text/plain", ) else: norm_path = None # Detect company mentions aliases = await pool.fetch( """SELECT ca.company_id::text, ca.alias, ca.alias_type, c.ticker FROM company_aliases ca JOIN companies c ON ca.company_id = c.id UNION ALL SELECT c.id::text as company_id, c.ticker as alias, 'ticker' as alias_type, c.ticker FROM companies c UNION ALL SELECT c.id::text as company_id, c.legal_name as alias, 'legal_name' as alias_type, c.ticker FROM companies c""" ) mentions = detect_company_mentions(text, [dict(a) for a in aliases]) if text else [] # Update document status = "parsed" if confidence != "low" else "low_quality" await pool.execute( """UPDATE documents SET normalized_storage_ref=$2, parse_quality_score=$3, parse_confidence=$4, status=$5, updated_at=NOW() WHERE id=$1""", doc_id, f"s3://stonks-normalized/{norm_path}" if norm_path else None, quality_score, confidence, status, ) # Insert company mentions for m in mentions: await pool.execute( """INSERT INTO document_company_mentions (document_id, company_id, ticker, mention_type, confidence) VALUES ($1, $2, $3, $4, $5) ON CONFLICT DO NOTHING""", doc_id, m["company_id"], m["ticker"], m["mention_type"], m["confidence"], ) # Only enqueue for extraction if quality is acceptable if confidence != "low": await rds.rpush(queue_key(QUEUE_EXTRACTION), json.dumps({ "document_id": doc_id, "ticker": ticker, "normalized_text": text[:8000], # Truncate for prompt })) logger.info(f"Parsed doc {doc_id} for {ticker}: quality={quality_score:.2f}, confidence={confidence}") else: logger.warning(f"Low quality parse for doc {doc_id}, skipping extraction") async def main(): config = load_config() pool = await get_pg_pool(config) rds = get_redis(config) minio_client = get_minio(config) logger.info("Parser worker started") queue = queue_key(QUEUE_PARSING) try: while True: raw = await rds.lpop(queue) if raw: job = json.loads(raw) try: await process_job(job, pool, rds, minio_client) except Exception as e: logger.error(f"Parse error: {e}") else: await asyncio.sleep(2) finally: await pool.close() await rds.close() if __name__ == "__main__": asyncio.run(main())