phase 0+1: project scaffold, k8s manifests, CI pipeline, steering, hooks, tests

- Repository structure for all services, infra, lakehouse, dashboards
- K8s manifests targeting stonks-oracle namespace with GHCR images
- Ingress via Traefik with ca-issuer TLS for internal services
- ConfigMap wired to existing cluster services (pg, redis, minio, ollama)
- GitHub Actions workflow for lint, test, multi-service container builds
- Dockerfile with build-arg CMD per service
- Makefile for local build/push/deploy
- Steering rules for TDD workflow, K8s conventions, project context
- Agent hooks for lint-on-save, test-on-save, k8s-validate, phase-commit
- Ruff linter config, all lint issues fixed
- 14 passing tests for schemas, config, redis keys
- PostgreSQL migrations, Trino catalogs, Superset config, MinIO lifecycle
This commit is contained in:
Celes Renata
2026-04-11 03:25:08 -07:00
parent 8cfc4f423b
commit ebea70573b
90 changed files with 3590 additions and 19 deletions
+209
View File
@@ -0,0 +1,209 @@
"""Parser worker - HTML-to-text, boilerplate reduction, quality scoring."""
import asyncio
import io
import json
import logging
import re
from datetime import datetime
from typing import List, Optional, Tuple
import asyncpg
import httpx
import redis.asyncio as aioredis
from minio import Minio
from services.shared.config import load_config
from services.shared.db import get_minio, get_pg_pool, get_redis
from services.shared.redis_keys import QUEUE_EXTRACTION, QUEUE_PARSING, queue_key
# Logging is configured at import time with INFO as the root level; the
# worker's own messages go through the named "parser_worker" logger.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("parser_worker")
# Simple boilerplate patterns to strip.
#
# The phrase patterns are line-oriented: each removes everything from the
# trigger phrase up to the next newline (or the end of the text when no newline
# follows). All patterns are case-insensitive.
BOILERPLATE_PATTERNS = [
    re.compile(r"(?i)subscribe to our newsletter.*?(?:\n|$)"),
    re.compile(r"(?i)click here to read more.*?(?:\n|$)"),
    re.compile(r"(?i)advertisement\s*\n"),
    re.compile(r"(?i)copyright ©.*?(?:\n|$)"),
    re.compile(r"(?i)all rights reserved.*?(?:\n|$)"),
    re.compile(r"(?i)terms of (use|service).*?(?:\n|$)"),
    re.compile(r"(?i)privacy policy.*?(?:\n|$)"),
    # Bracketed ad markers such as "[Ad]" or "[ad slot]".
    # The character classes exclude "[" and "]" so a match stays inside ONE
    # bracket pair; with a plain "." the match could start at "[1]" and run on
    # to a later "]" after any word containing "ad", deleting real prose.
    # Only leading whitespace is consumed, so the words on either side of the
    # marker stay separated once it is replaced with "".
    re.compile(r"\s*\[[^\[\]]*ad[^\[\]]*\]", re.IGNORECASE),
]
# Tags that end a line of rendered text. They are turned into newlines so the
# extracted text keeps one line per block element.
_BLOCK_TAG_RE = re.compile(
    r"</?(?:br|p|div|li|ul|ol|dl|dt|dd|tr|table|h[1-6]|hr|pre|blockquote"
    r"|section|article|header|footer|nav|aside|figure|figcaption|form)\b[^>]*>",
    re.IGNORECASE,
)


def strip_html_tags(html: str) -> str:
    """Convert an HTML document to plain text, one line per block element.

    Processing steps:
      1. Drop ``<script>`` and ``<style>`` elements together with their content.
      2. Collapse whitespace from the HTML source (it carries no meaning).
      3. Replace block-level tags and ``<br>`` with newlines, and every other
         tag with a space.
      4. Decode character references in a single pass.
      5. Collapse runs of whitespace inside each line and drop empty lines.

    Line breaks are preserved because the patterns in ``BOILERPLATE_PATTERNS``
    are line-anchored (``.*?(?:\\n|$)``): on text flattened to a single line
    they would delete everything from the trigger phrase to the end of the
    document.

    Args:
        html: Raw HTML markup. An empty string yields an empty string.

    Returns:
        The visible text, with lines separated by ``"\\n"`` and no leading or
        trailing whitespace.
    """
    # Imported locally, and by name, because the parameter shadows the module.
    from html import unescape

    if not html:
        return ""

    text = re.sub(r"<script[^>]*>.*?</script>", "", html, flags=re.DOTALL | re.IGNORECASE)
    text = re.sub(r"<style[^>]*>.*?</style>", "", text, flags=re.DOTALL | re.IGNORECASE)
    # Source newlines/indentation are layout only; real line breaks come from tags.
    text = re.sub(r"\s+", " ", text)
    text = _BLOCK_TAG_RE.sub("\n", text)
    text = re.sub(r"<[^>]+>", " ", text)
    # Decode entities only after tags are gone, and only once: chained
    # replacements would turn "&amp;lt;" into "<" instead of "&lt;". This also
    # decodes numeric references (e.g. "&#8217;") instead of discarding them.
    text = unescape(text)

    # str.split() with no argument also treats the non-breaking space as whitespace.
    lines = (" ".join(line.split()) for line in text.split("\n"))
    return "\n".join(line for line in lines if line)
def reduce_boilerplate(text: str) -> str:
    """Remove every match of each ``BOILERPLATE_PATTERNS`` entry from *text*.

    Patterns are applied one after another, in list order, each to the output
    of the previous one. The result is stripped of leading and trailing
    whitespace.
    """
    cleaned = text
    for boilerplate_re in BOILERPLATE_PATTERNS:
        cleaned = boilerplate_re.sub("", cleaned)
    return cleaned.strip()
def score_quality(text: str) -> Tuple[float, str]:
    """Rate a parsed document by its whitespace-separated word count.

    Returns a ``(score, confidence_label)`` pair:

    * fewer than 20 words  -> ``(0.1, "low")``
    * fewer than 50 words  -> ``(0.3, "low")``
    * fewer than 150 words -> ``(0.6, "medium")``
    * 150 words or more    -> ``(0.85, "high")``
    """
    # (exclusive upper bound on word count, score, label), in ascending order.
    tiers = (
        (20, 0.1, "low"),
        (50, 0.3, "low"),
        (150, 0.6, "medium"),
    )
    n_words = len(text.split())
    for upper_bound, score, label in tiers:
        if n_words < upper_bound:
            return score, label
    return 0.85, "high"
def detect_company_mentions(text: str, aliases: List[dict]) -> List[dict]:
    """Detect company mentions using ticker, alias, and name matching.

    An alias counts as mentioned when it occurs in *text* case-insensitively
    as a whole token, i.e. not directly preceded or followed by a word
    character. Plain substring matching would make short tickers fire inside
    unrelated words (ticker ``AI`` inside ``said``).

    Args:
        text: Normalized document text to scan.
        aliases: Dicts with ``alias`` and ``company_id`` keys and optional
            ``ticker`` and ``alias_type`` keys. Entries whose alias is missing,
            ``None``, not a string, or blank are skipped, because an empty
            alias would otherwise match every document.

    Returns:
        One dict per matching alias entry, in input order, with keys
        ``company_id``, ``ticker``, ``mention_type`` and ``confidence``
        (always ``0.7``). A company with several matching aliases yields
        several entries.
    """
    if not text:
        return []

    mentions = []
    text_upper = text.upper()
    for alias_info in aliases:
        alias = alias_info.get("alias")
        needle = alias.strip().upper() if isinstance(alias, str) else ""
        # Cheap substring test first; the regex only runs for candidates.
        if not needle or needle not in text_upper:
            continue
        # Look-arounds rather than \b so aliases ending in punctuation
        # ("Apple Inc.") still match.
        if not re.search(rf"(?<!\w){re.escape(needle)}(?!\w)", text_upper):
            continue
        mentions.append({
            "company_id": alias_info["company_id"],
            "ticker": alias_info.get("ticker", ""),
            "mention_type": alias_info.get("alias_type", "alias"),
            "confidence": 0.7,
        })
    return mentions
async def fetch_html(url: str) -> Optional[str]:
    """Download the page at *url* and return its body text.

    Best-effort: returns ``None`` when *url* is empty, and logs a warning and
    returns ``None`` when the request, the status check, or reading the body
    raises. Redirects are followed and the request uses a 30 second timeout.
    """
    if not url:
        return None

    request_headers = {"User-Agent": "StonksOracle/1.0"}
    async with httpx.AsyncClient(timeout=30, follow_redirects=True) as client:
        try:
            response = await client.get(url, headers=request_headers)
            response.raise_for_status()
            page = response.text
        except Exception as exc:
            logger.warning(f"Failed to fetch {url}: {exc}")
            page = None
    return page
async def process_job(
    job: dict,
    pool: asyncpg.Pool,
    rds: aioredis.Redis,
    minio_client: Minio,
) -> None:
    """Fetch, parse, score and persist one document from the parsing queue.

    Steps:
      1. Fetch the article HTML (when the job has a URL) and store the raw
         markup in the ``stonks-raw-news`` bucket.
      2. Strip tags and boilerplate, then score the resulting text.
      3. Store non-empty text in the ``stonks-normalized`` bucket.
      4. Detect company mentions, update the ``documents`` row and insert the
         mentions.
      5. Enqueue the document for extraction unless its confidence is "low".

    Args:
        job: Queue payload; must contain ``document_id`` and ``ticker`` and
            may contain ``url``.
        pool: PostgreSQL pool used for the alias lookup and the writes.
        rds: Redis client used to push the extraction job.
        minio_client: Object-store client for the raw and normalized bodies.

    Raises:
        KeyError: If ``document_id`` or ``ticker`` is missing from *job*.
        Exception: Storage, database and Redis errors propagate to the caller.
    """
    # Local import: the file's import block only brings in ``datetime`` itself.
    from datetime import timezone

    doc_id = job["document_id"]
    ticker = job["ticker"]
    url = job.get("url", "")

    # Fetch HTML if we have a URL
    html = await fetch_html(url) if url else None

    # One timezone-aware timestamp for the whole job, so the raw and normalized
    # objects always share a date prefix (two separate clock reads could
    # straddle midnight). Replaces the deprecated, naive datetime.utcnow().
    now = datetime.now(timezone.utc)
    date_prefix = f"{ticker}/{now.year}/{now.month:02d}/{now.day:02d}/{doc_id}"

    if html:
        # Store raw HTML
        # NOTE(review): put_object looks like a synchronous call, which would
        # block the event loop while it uploads - confirm.
        html_bytes = html.encode("utf-8")
        minio_client.put_object(
            "stonks-raw-news", f"scrape/{date_prefix}/raw.html",
            io.BytesIO(html_bytes), len(html_bytes),
            content_type="text/html",
        )
        # Parse
        text = reduce_boilerplate(strip_html_tags(html))
    else:
        text = ""

    quality_score, confidence = score_quality(text)

    # Store normalized text
    norm_path = None
    if text:
        text_bytes = text.encode("utf-8")
        norm_path = f"parsed/{date_prefix}/normalized.txt"
        minio_client.put_object(
            "stonks-normalized", norm_path, io.BytesIO(text_bytes), len(text_bytes),
            content_type="text/plain",
        )

    # Detect company mentions. The alias lookup is skipped when there is no
    # text, because nothing could match and the query result would be unused.
    mentions: List[dict] = []
    if text:
        alias_rows = await pool.fetch(
            """SELECT ca.company_id::text, ca.alias, ca.alias_type, c.ticker
               FROM company_aliases ca JOIN companies c ON ca.company_id = c.id
               UNION ALL
               SELECT c.id::text as company_id, c.ticker as alias, 'ticker' as alias_type, c.ticker
               FROM companies c
               UNION ALL
               SELECT c.id::text as company_id, c.legal_name as alias, 'legal_name' as alias_type, c.ticker
               FROM companies c"""
        )
        mentions = detect_company_mentions(text, [dict(row) for row in alias_rows])

    # Update document
    status = "parsed" if confidence != "low" else "low_quality"
    await pool.execute(
        """UPDATE documents SET
           normalized_storage_ref=$2, parse_quality_score=$3, parse_confidence=$4, status=$5, updated_at=NOW()
           WHERE id=$1""",
        doc_id, f"s3://stonks-normalized/{norm_path}" if norm_path else None,
        quality_score, confidence, status,
    )

    # Insert company mentions
    for m in mentions:
        await pool.execute(
            """INSERT INTO document_company_mentions (document_id, company_id, ticker, mention_type, confidence)
               VALUES ($1, $2, $3, $4, $5) ON CONFLICT DO NOTHING""",
            doc_id, m["company_id"], m["ticker"], m["mention_type"], m["confidence"],
        )

    # Only enqueue for extraction if quality is acceptable
    if confidence != "low":
        await rds.rpush(queue_key(QUEUE_EXTRACTION), json.dumps({
            "document_id": doc_id,
            "ticker": ticker,
            "normalized_text": text[:8000],  # Truncate for prompt
        }))
        logger.info(f"Parsed doc {doc_id} for {ticker}: quality={quality_score:.2f}, confidence={confidence}")
    else:
        logger.warning(f"Low quality parse for doc {doc_id}, skipping extraction")
async def main() -> None:
    """Run the parser worker loop.

    Pops jobs from the parsing queue one at a time and hands each to
    ``process_job``. When the queue is empty the loop sleeps for two seconds
    before polling again. A failure on a single job, including a payload that
    is not valid JSON, is logged with its traceback and the loop moves on to
    the next job. The PostgreSQL pool and the Redis client are closed when the
    loop exits.
    """
    config = load_config()
    pool = await get_pg_pool(config)
    rds = get_redis(config)
    minio_client = get_minio(config)
    logger.info("Parser worker started")
    queue = queue_key(QUEUE_PARSING)
    try:
        while True:
            raw = await rds.lpop(queue)
            if not raw:
                await asyncio.sleep(2)
                continue
            try:
                # Decoding happens inside the guard so one malformed queue
                # entry cannot terminate the worker.
                job = json.loads(raw)
                await process_job(job, pool, rds, minio_client)
            except Exception as e:
                # logger.exception keeps the traceback alongside the message.
                logger.exception("Parse error: %s", e)
    finally:
        # Close Redis even if closing the pool raises.
        try:
            await pool.close()
        finally:
            await rds.close()
# Script entry point: start the asyncio event loop and run the worker loop.
if __name__ == "__main__":
    asyncio.run(main())