phase 0+1: project scaffold, k8s manifests, CI pipeline, steering, hooks, tests

- Repository structure for all services, infra, lakehouse, dashboards
- K8s manifests targeting stonks-oracle namespace with GHCR images
- Ingress via Traefik with ca-issuer TLS for internal services
- ConfigMap wired to existing cluster services (pg, redis, minio, ollama)
- GitHub Actions workflow for lint, test, multi-service container builds
- Dockerfile with build-arg CMD per service
- Makefile for local build/push/deploy
- Steering rules for TDD workflow, K8s conventions, project context
- Agent hooks for lint-on-save, test-on-save, k8s-validate, phase-commit
- Ruff linter config, all lint issues fixed
- 14 passing tests for schemas, config, redis keys
- PostgreSQL migrations, Trino catalogs, Superset config, MinIO lifecycle
2026-04-11 03:25:08 -07:00
parent 8cfc4f423b
commit ebea70573b
90 changed files with 3590 additions and 19 deletions
@@ -0,0 +1 @@
+# Scraper / Parser Service
@@ -0,0 +1,209 @@
+"""Parser worker - HTML-to-text, boilerplate reduction, quality scoring."""
+import asyncio
+import io
+import json
+import logging
+import re
+from datetime import datetime
+from typing import List, Optional, Tuple
+
+import asyncpg
+import httpx
+import redis.asyncio as aioredis
+from minio import Minio
+
+from services.shared.config import load_config
+from services.shared.db import get_minio, get_pg_pool, get_redis
+from services.shared.redis_keys import QUEUE_EXTRACTION, QUEUE_PARSING, queue_key
+
+# Configure the root logger at INFO level and create the logger used
+# throughout this module, named "parser_worker".
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger("parser_worker")
+
+# Simple boilerplate patterns to strip
+BOILERPLATE_PATTERNS = [
+    re.compile(r"(?i)subscribe to our newsletter.*?(?:\n|$)"),
+    re.compile(r"(?i)click here to read more.*?(?:\n|$)"),
+    re.compile(r"(?i)advertisement\s*\n"),
+    re.compile(r"(?i)copyright ©.*?(?:\n|$)"),
+    re.compile(r"(?i)all rights reserved.*?(?:\n|$)"),
+    re.compile(r"(?i)terms of (use|service).*?(?:\n|$)"),
+    re.compile(r"(?i)privacy policy.*?(?:\n|$)"),
+    re.compile(r"\s*\[.*?ad.*?\]\s*", re.IGNORECASE),
+]
+
+
+def strip_html_tags(html: str) -> str:
+    """Basic HTML tag removal."""
+    text = re.sub(r"<script[^>]*>.*?</script>", "", html, flags=re.DOTALL | re.IGNORECASE)
+    text = re.sub(r"<style[^>]*>.*?</style>", "", text, flags=re.DOTALL | re.IGNORECASE)
+    text = re.sub(r"<[^>]+>", " ", text)
+    text = re.sub(r"&nbsp;", " ", text)
+    text = re.sub(r"&amp;", "&", text)
+    text = re.sub(r"&lt;", "<", text)
+    text = re.sub(r"&gt;", ">", text)
+    text = re.sub(r"&#\d+;", "", text)
+    text = re.sub(r"\s+", " ", text).strip()
+    return text
+
+
+def reduce_boilerplate(text: str) -> str:
+    for pattern in BOILERPLATE_PATTERNS:
+        text = pattern.sub("", text)
+    return text.strip()
+
+
+def score_quality(text: str) -> Tuple[float, str]:
+    """Score parse quality. Returns (score, confidence_label)."""
+    word_count = len(text.split())
+    if word_count < 20:
+        return 0.1, "low"
+    if word_count < 50:
+        return 0.3, "low"
+    if word_count < 150:
+        return 0.6, "medium"
+    return 0.85, "high"
+
+
+def detect_company_mentions(text: str, aliases: List[dict]) -> List[dict]:
+    """Detect company mentions using ticker, alias, and name matching."""
+    mentions = []
+    text_upper = text.upper()
+    for alias_info in aliases:
+        alias = alias_info["alias"]
+        if alias.upper() in text_upper:
+            mentions.append({
+                "company_id": alias_info["company_id"],
+                "ticker": alias_info.get("ticker", ""),
+                "mention_type": alias_info.get("alias_type", "alias"),
+                "confidence": 0.7,
+            })
+    return mentions
+
+
+async def fetch_html(url: str) -> Optional[str]:
+    """Fetch article HTML for scraping."""
+    if not url:
+        return None
+    async with httpx.AsyncClient(timeout=30, follow_redirects=True) as client:
+        try:
+            resp = await client.get(url, headers={"User-Agent": "StonksOracle/1.0"})
+            resp.raise_for_status()
+            return resp.text
+        except Exception as e:
+            logger.warning(f"Failed to fetch {url}: {e}")
+            return None
+
+
+async def process_job(
+    job: dict,
+    pool: asyncpg.Pool,
+    rds: aioredis.Redis,
+    minio_client: Minio,
+):
+    doc_id = job["document_id"]
+    ticker = job["ticker"]
+    url = job.get("url", "")
+
+    # Fetch HTML if we have a URL
+    html = await fetch_html(url) if url else None
+
+    if html:
+        # Store raw HTML
+        html_bytes = html.encode("utf-8")
+        now = datetime.utcnow()
+        html_path = f"scrape/{ticker}/{now.year}/{now.month:02d}/{now.day:02d}/{doc_id}/raw.html"
+        minio_client.put_object(
+            "stonks-raw-news", html_path, io.BytesIO(html_bytes), len(html_bytes),
+            content_type="text/html",
+        )
+
+        # Parse
+        text = strip_html_tags(html)
+        text = reduce_boilerplate(text)
+    else:
+        text = ""
+
+    quality_score, confidence = score_quality(text)
+
+    # Store normalized text
+    if text:
+        text_bytes = text.encode("utf-8")
+        now = datetime.utcnow()
+        norm_path = f"parsed/{ticker}/{now.year}/{now.month:02d}/{now.day:02d}/{doc_id}/normalized.txt"
+        minio_client.put_object(
+            "stonks-normalized", norm_path, io.BytesIO(text_bytes), len(text_bytes),
+            content_type="text/plain",
+        )
+    else:
+        norm_path = None
+
+    # Detect company mentions
+    aliases = await pool.fetch(
+        """SELECT ca.company_id::text, ca.alias, ca.alias_type, c.ticker
+           FROM company_aliases ca JOIN companies c ON ca.company_id = c.id
+           UNION ALL
+           SELECT c.id::text as company_id, c.ticker as alias, 'ticker' as alias_type, c.ticker
+           FROM companies c
+           UNION ALL
+           SELECT c.id::text as company_id, c.legal_name as alias, 'legal_name' as alias_type, c.ticker
+           FROM companies c"""
+    )
+    mentions = detect_company_mentions(text, [dict(a) for a in aliases]) if text else []
+
+    # Update document
+    status = "parsed" if confidence != "low" else "low_quality"
+    await pool.execute(
+        """UPDATE documents SET
+           normalized_storage_ref=$2, parse_quality_score=$3, parse_confidence=$4, status=$5, updated_at=NOW()
+           WHERE id=$1""",
+        doc_id, f"s3://stonks-normalized/{norm_path}" if norm_path else None,
+        quality_score, confidence, status,
+    )
+
+    # Insert company mentions
+    for m in mentions:
+        await pool.execute(
+            """INSERT INTO document_company_mentions (document_id, company_id, ticker, mention_type, confidence)
+               VALUES ($1, $2, $3, $4, $5) ON CONFLICT DO NOTHING""",
+            doc_id, m["company_id"], m["ticker"], m["mention_type"], m["confidence"],
+        )
+
+    # Only enqueue for extraction if quality is acceptable
+    if confidence != "low":
+        await rds.rpush(queue_key(QUEUE_EXTRACTION), json.dumps({
+            "document_id": doc_id,
+            "ticker": ticker,
+            "normalized_text": text[:8000],  # Truncate for prompt
+        }))
+        logger.info(f"Parsed doc {doc_id} for {ticker}: quality={quality_score:.2f}, confidence={confidence}")
+    else:
+        logger.warning(f"Low quality parse for doc {doc_id}, skipping extraction")
+
+
+async def main():
+    config = load_config()
+    pool = await get_pg_pool(config)
+    rds = get_redis(config)
+    minio_client = get_minio(config)
+
+    logger.info("Parser worker started")
+    queue = queue_key(QUEUE_PARSING)
+
+    try:
+        while True:
+            raw = await rds.lpop(queue)
+            if raw:
+                job = json.loads(raw)
+                try:
+                    await process_job(job, pool, rds, minio_client)
+                except Exception as e:
+                    logger.error(f"Parse error: {e}")
+            else:
+                await asyncio.sleep(2)
+    finally:
+        await pool.close()
+        await rds.close()
+
+
+# Start the parser worker when this module is executed as a script.
+if __name__ == "__main__":
+    asyncio.run(main())