Phase 14-15: Docker build validation and Helm deployment

2026-04-11 11:59:45 -07:00
parent 7394d241c9
commit ce10afa034
179 changed files with 32559 additions and 576 deletions
@@ -1,47 +1,50 @@
 """Ingestion worker - processes jobs from the ingestion queue."""
 import asyncio
-import hashlib
-import io
 import json
 import logging
-from datetime import datetime

 import asyncpg
 import redis.asyncio as aioredis
 from minio import Minio

 from services.adapters.base import AdapterResult
-from services.adapters.filings_adapter import FilingsAdapter
-from services.adapters.market_adapter import MarketDataAdapter
-from services.adapters.news_adapter import NewsApiAdapter
+from services.adapters.broker_adapter import AlpacaBrokerAdapter, TradingMode
+from services.adapters.filings_adapter import SECEdgarAdapter
+from services.adapters.market_adapter import PolygonMarketAdapter
+from services.adapters.news_adapter import PolygonNewsAdapter
+from services.adapters.web_scrape_adapter import WebScrapeAdapter
 from services.shared.config import load_config
 from services.shared.db import get_minio, get_pg_pool, get_redis
+from services.shared.dedupe import dedupe_items, mark_as_seen
+from services.shared.metadata import (
+    persist_ingestion_items,
+    record_retrieval_failure,
+    reset_source_retry_state,
+)
 from services.shared.redis_keys import (
    QUEUE_INGESTION,
    QUEUE_PARSING,
    dedupe_key,
    queue_key,
 )
+from services.shared.logging import Span, extract_trace_context, inject_trace_context, new_trace_id, set_trace_context, setup_logging
+from services.shared.metrics import (
+    ACTIVE_JOBS,
+    INGESTION_ADAPTER_DURATION,
+    INGESTION_ERRORS,
+    INGESTION_ITEMS_DEDUPED,
+    INGESTION_ITEMS_FETCHED,
+    INGESTION_ITEMS_NEW,
+    INGESTION_JOBS_TOTAL,
+)
+from services.shared.storage import (
+    bucket_for_source,
+    ensure_buckets,
+    upload_raw_artifact,
+)

-logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger("ingestion_worker")

-BUCKET_MAP = {
-    "market_api": "stonks-raw-market",
-    "news_api": "stonks-raw-news",
-    "filings_api": "stonks-raw-filings",
-    "broker": "stonks-raw-market",
-}
-
-
-def build_storage_path(source_type: str, ticker: str, doc_id: str) -> str:
-    now = datetime.utcnow()
-    return f"{source_type}/{ticker}/{now.year}/{now.month:02d}/{now.day:02d}/{doc_id}/raw.json"
-
-
-async def store_raw_artifact(minio_client: Minio, bucket: str, path: str, data: bytes):
-    minio_client.put_object(bucket, path, io.BytesIO(data), len(data), content_type="application/json")
-

 async def process_job(
    job: dict,
@@ -55,9 +58,11 @@ async def process_job(
    source_id = job["source_id"]
    config = job.get("config", {})

+    set_trace_context(trace_id=job.get("_trace_id") or new_trace_id())
+
    adapter = adapters.get(source_type)
    if not adapter:
-        logger.warning(f"No adapter for source_type={source_type}")
+        logger.warning("No adapter for source_type=%s", source_type)
        return

    # Record ingestion run
@@ -68,25 +73,37 @@ async def process_job(
    )

    try:
-        result: AdapterResult = await adapter.fetch(ticker, config)
+        with Span("adapter_fetch", ticker=ticker, source_type=source_type):
+            with INGESTION_ADAPTER_DURATION.labels(source_type=source_type).time():
+                result: AdapterResult = await adapter.fetch(ticker, config)

        if result.error:
-            await pool.execute(
-                "UPDATE ingestion_runs SET status='failed', error_message=$2, completed_at=NOW() WHERE id=$1",
-                run_id, result.error,
+            INGESTION_JOBS_TOTAL.labels(source_type=source_type, status="error").inc()
+            await record_retrieval_failure(
+                pool,
+                run_id=str(run_id),
+                source_id=source_id,
+                error_message=result.error,
            )
            return

-        # Store raw payload
-        bucket = BUCKET_MAP.get(source_type, "stonks-raw-market")
-        storage_path = build_storage_path(source_type, ticker, str(run_id))
-        await store_raw_artifact(minio_client, bucket, storage_path, result.raw_payload)
+        # Store raw payload in MinIO
+        bucket = bucket_for_source(source_type)
+        artifact_type = "raw_html" if source_type == "web_scrape" else "raw_json"
+        storage_uri = upload_raw_artifact(
+            minio_client,
+            source_type=source_type,
+            ticker=ticker,
+            document_id=str(run_id),
+            data=result.raw_payload,
+            artifact_type=artifact_type,
+        )

-        # Dedupe check
+        # Dedupe check on the overall payload hash
        if result.content_hash:
            already_seen = await rds.get(dedupe_key(result.content_hash))
            if already_seen:
-                logger.info(f"Duplicate content for {ticker}, skipping")
+                logger.info("Duplicate content for %s, skipping", ticker)
                await pool.execute(
                    "UPDATE ingestion_runs SET status='completed', items_fetched=$2, items_new=0, completed_at=NOW() WHERE id=$1",
                    run_id, len(result.items),
@@ -94,72 +111,126 @@ async def process_job(
                return
            await rds.set(dedupe_key(result.content_hash), "1", ex=86400)

-        new_items = 0
-        for item in result.items:
-            item_json = json.dumps(item)
-            item_hash = hashlib.sha256(item_json.encode()).hexdigest()
+        # Cross-source dedupe on individual document items (news, filings, web_scrape)
+        items_to_persist = result.items
+        deduped_count = 0
+        if source_type not in ("market_api", "broker"):
+            items_to_persist, dup_items = await dedupe_items(pool, rds, result.items)
+            deduped_count = len(dup_items)
+            if deduped_count:
+                INGESTION_ITEMS_DEDUPED.labels(source_type=source_type).inc(deduped_count)
+                logger.info(
+                    "Deduped %d/%d items for %s/%s",
+                    deduped_count, len(result.items), ticker, source_type,
+                )

-            # Check if document already exists
-            exists = await pool.fetchval("SELECT 1 FROM documents WHERE content_hash = $1", item_hash)
-            if exists:
-                continue
+        # Persist metadata via the unified metadata module
+        new_items, new_ids = await persist_ingestion_items(
+            pool,
+            source_type=source_type,
+            ticker=ticker,
+            company_id=job.get("company_id"),
+            items=items_to_persist,
+            storage_ref=storage_uri,
+            adapter_metadata=result.metadata,
+            content_hash=result.content_hash,
+        )

-            title = item.get("title", item.get("name", ""))
-            url = item.get("url", item.get("link", ""))
-            published = item.get("publishedAt", item.get("published_at"))
+        # Enqueue new document items for parsing (not market/broker)
+        if source_type not in ("market_api", "broker"):
+            for doc_id in new_ids:
+                await rds.rpush(queue_key(QUEUE_PARSING), json.dumps(inject_trace_context({
+                    "document_id": doc_id,
+                    "ticker": ticker,
+                    "source_type": source_type,
+                })))

-            doc_id = await pool.fetchval(
-                """INSERT INTO documents (document_type, source_type, publisher, url, title, published_at, content_hash, raw_storage_ref, status)
-                   VALUES ($1, $2, $3, $4, $5, $6, $7, $8, 'ingested')
-                   RETURNING id""",
-                "article" if source_type == "news_api" else "filing" if source_type == "filings_api" else "article",
-                source_type,
-                item.get("source", {}).get("name", "") if isinstance(item.get("source"), dict) else str(item.get("source", "")),
-                url, title,
-                datetime.fromisoformat(published.replace("Z", "+00:00")) if published else None,
-                item_hash,
-                f"s3://{bucket}/{storage_path}",
-            )
+            # Mark newly persisted documents in Redis for fast future dedupe
+            for item, doc_id in zip(items_to_persist, new_ids):
+                await mark_as_seen(
+                    rds,
+                    content_hash=item.get("content_hash", ""),
+                    canonical_url=item.get("canonical_url"),
+                    document_id=doc_id,
+                )

-            # Enqueue for parsing
-            await rds.rpush(queue_key(QUEUE_PARSING), json.dumps({
-                "document_id": str(doc_id),
-                "ticker": ticker,
-                "source_type": source_type,
-                "url": url,
-            }))
-            new_items += 1
+            # Link duplicate documents to this company if not already linked
+            company_id = job.get("company_id")
+            if company_id and deduped_count:
+                from services.shared.metadata import persist_document_company_mention
+                for dup in dup_items:
+                    existing_id = dup.get("_dedupe_existing_id")
+                    if existing_id:
+                        try:
+                            await persist_document_company_mention(
+                                pool,
+                                document_id=existing_id,
+                                company_id=company_id,
+                                ticker=ticker,
+                                mention_type="cross_source",
+                            )
+                        except Exception:
+                            # Duplicate mention link — safe to ignore
+                            pass

        await pool.execute(
            "UPDATE ingestion_runs SET status='completed', items_fetched=$2, items_new=$3, completed_at=NOW() WHERE id=$1",
            run_id, len(result.items), new_items,
        )
-        logger.info(f"Ingested {ticker}/{source_type}: {len(result.items)} fetched, {new_items} new")
+        # Clear any accumulated retry backoff after success
+        await reset_source_retry_state(pool, source_id)
+        INGESTION_ITEMS_FETCHED.labels(source_type=source_type).inc(len(result.items))
+        INGESTION_ITEMS_NEW.labels(source_type=source_type).inc(new_items)
+        INGESTION_JOBS_TOTAL.labels(source_type=source_type, status="success").inc()
+        logger.info(
+            "Ingested %s/%s: %d fetched, %d new",
+            ticker, source_type, len(result.items), new_items,
+            extra={"ticker": ticker, "source_type": source_type, "count": new_items},
+        )

    except Exception as e:
-        logger.error(f"Ingestion error for {ticker}: {e}")
-        await pool.execute(
-            "UPDATE ingestion_runs SET status='failed', error_message=$2, completed_at=NOW() WHERE id=$1",
-            run_id, str(e),
+        INGESTION_ERRORS.labels(source_type=source_type).inc()
+        INGESTION_JOBS_TOTAL.labels(source_type=source_type, status="error").inc()
+        logger.error(
+            "Ingestion error for %s: %s", ticker, e,
+            extra={"ticker": ticker, "source_type": source_type, "error": str(e)},
+        )
+        await record_retrieval_failure(
+            pool,
+            run_id=str(run_id),
+            source_id=source_id,
+            error_message=str(e),
        )


 async def main():
-    config = load_config()
-    pool = await get_pg_pool(config)
-    rds = get_redis(config)
-    minio_client = get_minio(config)
+    cfg = load_config()
+    setup_logging("ingestion_worker", level=cfg.log_level, json_output=cfg.json_logs)
+
+    pool = await get_pg_pool(cfg)
+    rds = get_redis(cfg)
+    minio_client = get_minio(cfg)
+
+    # Ensure all required buckets exist
+    ensure_buckets(minio_client)

    adapters = {
-        "market_api": MarketDataAdapter(
-            api_key=config.broker.api_key or "",
+        "market_api": PolygonMarketAdapter(
+            api_key=cfg.market_data.api_key,
+            base_url=cfg.market_data.base_url,
+        ),
+        "news_api": PolygonNewsAdapter(
+            api_key=cfg.market_data.api_key,
            base_url="https://api.polygon.io",
        ),
-        "news_api": NewsApiAdapter(
-            api_key="",
-            base_url="https://newsapi.org",
+        "filings_api": SECEdgarAdapter(),
+        "web_scrape": WebScrapeAdapter(),
+        "broker": AlpacaBrokerAdapter(
+            api_key=cfg.broker.api_key or "",
+            api_secret=cfg.broker.api_secret or "",
+            mode=TradingMode.LIVE if cfg.broker.mode == "live" else TradingMode.PAPER,
+            base_url=cfg.broker.base_url,
        ),
-        "filings_api": FilingsAdapter(),
    }

    logger.info("Ingestion worker started")