ebea70573b
- Repository structure for all services, infra, lakehouse, dashboards - K8s manifests targeting stonks-oracle namespace with GHCR images - Ingress via Traefik with ca-issuer TLS for internal services - ConfigMap wired to existing cluster services (pg, redis, minio, ollama) - GitHub Actions workflow for lint, test, multi-service container builds - Dockerfile with build-arg CMD per service - Makefile for local build/push/deploy - Steering rules for TDD workflow, K8s conventions, project context - Agent hooks for lint-on-save, test-on-save, k8s-validate, phase-commit - Ruff linter config, all lint issues fixed - 14 passing tests for schemas, config, redis keys - PostgreSQL migrations, Trino catalogs, Superset config, MinIO lifecycle
210 lines
6.9 KiB
Python
210 lines
6.9 KiB
Python
"""Parser worker - HTML-to-text, boilerplate reduction, quality scoring."""
|
|
import asyncio
|
|
import io
|
|
import json
|
|
import logging
|
|
import re
|
|
from datetime import datetime
|
|
from typing import List, Optional, Tuple
|
|
|
|
import asyncpg
|
|
import httpx
|
|
import redis.asyncio as aioredis
|
|
from minio import Minio
|
|
|
|
from services.shared.config import load_config
|
|
from services.shared.db import get_minio, get_pg_pool, get_redis
|
|
from services.shared.redis_keys import QUEUE_EXTRACTION, QUEUE_PARSING, queue_key
|
|
|
|
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("parser_worker")

# Tail shared by the phrase-based patterns below: consume up to the end of the
# current sentence (punctuation followed by whitespace or end of text) or the
# end of the current line, whichever comes first.
#
# The tail must not be a plain ``.*?(?:\n|$)``: process_job runs
# strip_html_tags() first, which collapses every newline into a space, so an
# "until end of line" match would delete everything from the phrase to the end
# of the document.
_BOILERPLATE_TAIL = r".*?(?:[.!?](?=\s|$)|\n|$)"

# Simple boilerplate patterns to strip
BOILERPLATE_PATTERNS = [
    re.compile(r"(?i)subscribe to our newsletter" + _BOILERPLATE_TAIL),
    re.compile(r"(?i)click here to read more" + _BOILERPLATE_TAIL),
    # Only matches a line that ends right after the word, so ordinary prose
    # that merely uses the word "advertisement" is left alone.
    re.compile(r"(?i)advertisement\s*\n"),
    re.compile(r"(?i)copyright ©" + _BOILERPLATE_TAIL),
    re.compile(r"(?i)all rights reserved" + _BOILERPLATE_TAIL),
    re.compile(r"(?i)terms of (use|service)" + _BOILERPLATE_TAIL),
    re.compile(r"(?i)privacy policy" + _BOILERPLATE_TAIL),
    # Bracketed ad markers such as "[ad]" or "[Advertisement]".
    # - "ad" must be a whole word, so "[read more]" or "[Nasdaq: X]" survive.
    # - The match stays inside one pair of brackets, so it cannot swallow the
    #   text between two unrelated bracket groups.
    # - Only leading whitespace is consumed, so the words on either side of
    #   the marker stay separated by a space.
    re.compile(
        r"\s*\[[^\[\]]*\bad(?:s|vert(?:isement)?s?)?\b[^\[\]]*\]",
        re.IGNORECASE,
    ),
]
|
|
|
|
|
|
def strip_html_tags(html: str) -> str:
    """Convert an HTML document to whitespace-normalized plain text.

    ``<script>`` and ``<style>`` elements are dropped together with their
    contents, every remaining tag is replaced by a space, character
    references are decoded, and runs of whitespace (including newlines and
    non-breaking spaces) are collapsed to a single space.

    Args:
        html: Raw HTML markup.

    Returns:
        The visible text with no leading or trailing whitespace.
    """
    # Function-scope import of the one name needed; the ``html`` parameter
    # shadows the stdlib module of the same name inside this function.
    from html import unescape

    text = re.sub(r"<script[^>]*>.*?</script>", "", html, flags=re.DOTALL | re.IGNORECASE)
    text = re.sub(r"<style[^>]*>.*?</style>", "", text, flags=re.DOTALL | re.IGNORECASE)
    text = re.sub(r"<[^>]+>", " ", text)
    # Decode entities only after tags are gone, so an escaped "&lt;b&gt;" in
    # the article body is kept as literal text rather than stripped as a tag.
    # A single unescape pass handles named, decimal and hex references and
    # avoids double-decoding sequences such as "&amp;lt;".
    text = unescape(text)
    # "&nbsp;" decodes to U+00A0, which \s matches, so it collapses here too.
    text = re.sub(r"\s+", " ", text).strip()
    return text
|
|
|
|
|
|
def reduce_boilerplate(text: str) -> str:
    """Remove every BOILERPLATE_PATTERNS match from ``text``.

    The patterns are applied one after another, each on the output of the
    previous one, and the final result is stripped of surrounding whitespace.
    """
    cleaned = text
    for boilerplate_re in BOILERPLATE_PATTERNS:
        cleaned = boilerplate_re.sub("", cleaned)
    cleaned = cleaned.strip()
    return cleaned
|
|
|
|
|
|
def score_quality(text: str) -> Tuple[float, str]:
    """Rate a parse by the number of whitespace-separated words in ``text``.

    Returns:
        ``(score, confidence_label)``:
        fewer than 20 words -> ``(0.1, "low")``,
        fewer than 50 words -> ``(0.3, "low")``,
        fewer than 150 words -> ``(0.6, "medium")``,
        otherwise ``(0.85, "high")``.
    """
    # (exclusive upper bound on word count, score, label), in ascending order.
    tiers = (
        (20, 0.1, "low"),
        (50, 0.3, "low"),
        (150, 0.6, "medium"),
    )
    n_words = len(text.split())
    for upper_bound, score, label in tiers:
        if n_words < upper_bound:
            return score, label
    return 0.85, "high"
|
|
|
|
|
|
def detect_company_mentions(text: str, aliases: List[dict]) -> List[dict]:
    """Detect company mentions using ticker, alias, and name matching.

    Matching is case-insensitive and requires the alias to appear as a whole
    token: it must not be directly preceded or followed by a word character.
    This keeps short tickers such as "AT" from matching inside "THAT".

    Args:
        text: Text to scan.
        aliases: Dicts with ``company_id`` and ``alias`` keys, plus optional
            ``ticker`` and ``alias_type`` keys. Entries whose alias is
            ``None``, empty or whitespace-only are ignored.

    Returns:
        One dict per matching alias entry, in input order, with keys
        ``company_id``, ``ticker``, ``mention_type`` and ``confidence``
        (always 0.7). A company with several matching aliases appears once
        per alias.
    """
    mentions = []
    if not text:
        return mentions

    for alias_info in aliases:
        alias = (alias_info["alias"] or "").strip()
        if not alias:
            # An empty alias would "match" every document.
            continue
        # Lookarounds rather than \b so aliases that begin or end with a
        # non-word character (e.g. "Yahoo!") still match.
        pattern = r"(?<!\w)" + re.escape(alias) + r"(?!\w)"
        if re.search(pattern, text, flags=re.IGNORECASE):
            mentions.append({
                "company_id": alias_info["company_id"],
                "ticker": alias_info.get("ticker", ""),
                "mention_type": alias_info.get("alias_type", "alias"),
                "confidence": 0.7,
            })
    return mentions
|
|
|
|
|
|
async def fetch_html(url: str) -> Optional[str]:
    """Download the page at ``url`` and return its body as text.

    Redirects are followed and the request uses a 30 second timeout. Any
    failure (network error, non-success status, ...) is logged as a warning
    and reported as ``None``; a falsy ``url`` returns ``None`` without
    issuing a request.
    """
    if url:
        request_headers = {"User-Agent": "StonksOracle/1.0"}
        async with httpx.AsyncClient(timeout=30, follow_redirects=True) as client:
            try:
                response = await client.get(url, headers=request_headers)
                response.raise_for_status()
                body = response.text
            except Exception as e:
                logger.warning(f"Failed to fetch {url}: {e}")
                body = None
            return body
    return None
|
|
|
|
|
|
async def process_job(
    job: dict,
    pool: asyncpg.Pool,
    rds: aioredis.Redis,
    minio_client: Minio,
):
    """Parse one queued document and persist the results.

    Steps:
      1. Fetch the article HTML from ``job["url"]`` (if any) and store it in
         the ``stonks-raw-news`` bucket.
      2. Strip tags and boilerplate, then score the resulting text.
      3. Store non-empty text in the ``stonks-normalized`` bucket.
      4. Detect company mentions and insert them into
         ``document_company_mentions``.
      5. Update the ``documents`` row with storage ref, quality and status
         (``"parsed"`` or ``"low_quality"``).
      6. Push the document onto the extraction queue unless confidence is
         ``"low"``.

    Args:
        job: Queue payload; must contain ``document_id`` and ``ticker``,
            may contain ``url``.
        pool: PostgreSQL connection pool.
        rds: Redis client used for the extraction queue.
        minio_client: Object-storage client.

    Raises:
        KeyError: If ``job`` lacks ``document_id`` or ``ticker``.
        Exception: Storage, database and Redis errors are not handled here
            and propagate to the caller.
    """
    doc_id = job["document_id"]
    ticker = job["ticker"]
    url = job.get("url", "")

    # Fetch HTML if we have a URL
    html = await fetch_html(url) if url else None

    # Take the timestamp once so the raw and normalized objects of one
    # document always land in the same date partition, even when the job
    # straddles midnight.
    now = datetime.utcnow()
    dated_key = f"{ticker}/{now.year}/{now.month:02d}/{now.day:02d}/{doc_id}"

    # NOTE(review): put_object is called without await, so it presumably
    # blocks the event loop while uploading - confirm against the MinIO
    # client in use.
    if html:
        # Store raw HTML
        html_bytes = html.encode("utf-8")
        html_path = f"scrape/{dated_key}/raw.html"
        minio_client.put_object(
            "stonks-raw-news", html_path, io.BytesIO(html_bytes), len(html_bytes),
            content_type="text/html",
        )

        # Parse
        text = strip_html_tags(html)
        text = reduce_boilerplate(text)
    else:
        text = ""

    quality_score, confidence = score_quality(text)

    norm_path = None
    mentions = []
    if text:
        # Store normalized text
        text_bytes = text.encode("utf-8")
        norm_path = f"parsed/{dated_key}/normalized.txt"
        minio_client.put_object(
            "stonks-normalized", norm_path, io.BytesIO(text_bytes), len(text_bytes),
            content_type="text/plain",
        )

        # Detect company mentions. The alias table is only queried when
        # there is text to scan; an empty document cannot mention anything.
        aliases = await pool.fetch(
            """SELECT ca.company_id::text, ca.alias, ca.alias_type, c.ticker
            FROM company_aliases ca JOIN companies c ON ca.company_id = c.id
            UNION ALL
            SELECT c.id::text as company_id, c.ticker as alias, 'ticker' as alias_type, c.ticker
            FROM companies c
            UNION ALL
            SELECT c.id::text as company_id, c.legal_name as alias, 'legal_name' as alias_type, c.ticker
            FROM companies c"""
        )
        mentions = detect_company_mentions(text, [dict(a) for a in aliases])

    # Update document
    status = "parsed" if confidence != "low" else "low_quality"
    await pool.execute(
        """UPDATE documents SET
        normalized_storage_ref=$2, parse_quality_score=$3, parse_confidence=$4, status=$5, updated_at=NOW()
        WHERE id=$1""",
        doc_id, f"s3://stonks-normalized/{norm_path}" if norm_path else None,
        quality_score, confidence, status,
    )

    # Insert company mentions
    for m in mentions:
        await pool.execute(
            """INSERT INTO document_company_mentions (document_id, company_id, ticker, mention_type, confidence)
            VALUES ($1, $2, $3, $4, $5) ON CONFLICT DO NOTHING""",
            doc_id, m["company_id"], m["ticker"], m["mention_type"], m["confidence"],
        )

    # Only enqueue for extraction if quality is acceptable
    if confidence != "low":
        await rds.rpush(queue_key(QUEUE_EXTRACTION), json.dumps({
            "document_id": doc_id,
            "ticker": ticker,
            "normalized_text": text[:8000],  # Truncate for prompt
        }))
        logger.info(f"Parsed doc {doc_id} for {ticker}: quality={quality_score:.2f}, confidence={confidence}")
    else:
        logger.warning(f"Low quality parse for doc {doc_id}, skipping extraction")
|
|
|
|
|
|
async def main():
    """Run the parser worker until cancelled.

    Pops JSON jobs from the parsing queue and hands each one to
    ``process_job``. When the queue is empty the loop sleeps for two seconds
    before polling again. A failing job is logged and skipped so the worker
    keeps running. The PostgreSQL pool and the Redis client are closed when
    the loop exits.
    """
    config = load_config()
    pool = await get_pg_pool(config)
    rds = get_redis(config)
    minio_client = get_minio(config)

    logger.info("Parser worker started")
    queue = queue_key(QUEUE_PARSING)

    try:
        while True:
            raw = await rds.lpop(queue)
            if not raw:
                await asyncio.sleep(2)
                continue
            try:
                # Decoding sits inside the try so that one malformed payload
                # is logged and dropped instead of terminating the worker.
                job = json.loads(raw)
                await process_job(job, pool, rds, minio_client)
            except Exception as e:
                # logger.exception keeps the traceback alongside the message.
                logger.exception("Parse error: %s", e)
    finally:
        await pool.close()
        await rds.close()


if __name__ == "__main__":
    asyncio.run(main())
|