phase 14-15: docker build validation and helm deployment

2026-04-11 11:59:45 -07:00
parent 7394d241c9
commit ce10afa034
179 changed files with 32559 additions and 576 deletions
@@ -0,0 +1,696 @@
+"""Metadata persistence for market payloads, documents, and broker events.
+
+Persists structured metadata records to PostgreSQL for all ingested artifacts.
+Each source type has its own persistence path:
+- market_api  → market_snapshots table
+- news_api / filings_api / web_scrape → documents + document_company_mentions
+- broker → order_events or market_snapshots (for position/account snapshots)
+
+Requirements: 3.3, 3.4, 8.3, 9.2
+"""
+from __future__ import annotations
+
+import json
+import logging
+from datetime import datetime, timedelta, timezone
+from typing import Any
+
+import asyncpg
+
+from services.shared.content import content_hash_str, normalize_url
+
+logger = logging.getLogger("metadata")
+
+
async def persist_market_snapshot(
    pool: asyncpg.Pool,
    *,
    company_id: str | None,
    ticker: str,
    snapshot_type: str,
    data: dict[str, Any],
    source_provider: str,
    storage_ref: str,
    content_hash: str,
    captured_at: datetime | None = None,
) -> str:
    """Insert one row into ``market_snapshots`` and return its id.

    ``data`` is serialised with ``json.dumps`` and bound through a ``jsonb``
    cast. When ``captured_at`` is omitted, the current UTC time is stored.

    Returns the value produced by ``RETURNING id``, converted with ``str``.
    """
    insert_sql = """INSERT INTO market_snapshots
               (company_id, ticker, snapshot_type, data, source_provider,
                captured_at, storage_ref, content_hash)
           VALUES ($1, $2, $3, $4::jsonb, $5, $6, $7, $8)
           RETURNING id"""
    captured = captured_at if captured_at else datetime.now(timezone.utc)
    params = (
        company_id,
        ticker,
        snapshot_type,
        json.dumps(data),
        source_provider,
        captured,
        storage_ref,
        content_hash,
    )
    snapshot_id = await pool.fetchval(insert_sql, *params)
    logger.debug("Persisted market snapshot %s for %s", snapshot_id, ticker)
    return str(snapshot_id)
+
+
async def persist_document(
    pool: asyncpg.Pool,
    *,
    document_type: str,
    source_type: str,
    publisher: str,
    url: str | None,
    canonical_url: str | None,
    title: str,
    published_at: datetime | None,
    content_hash: str,
    storage_ref: str,
    language: str = "en",
) -> str | None:
    """Insert a row into ``documents`` unless its content hash is already known.

    A lookup on ``content_hash`` runs first; if it finds a row, nothing is
    written and ``None`` is returned. Otherwise the row is inserted with
    status ``'ingested'`` and ``storage_ref`` stored as ``raw_storage_ref``.
    The lookup and the insert are two separate statements.

    Returns the new row id as a string, or ``None`` for a duplicate hash.
    """
    already_stored = await pool.fetchval(
        "SELECT 1 FROM documents WHERE content_hash = $1", content_hash
    )
    if already_stored:
        return None

    insert_sql = """INSERT INTO documents
               (document_type, source_type, publisher, url, canonical_url,
                title, published_at, content_hash, raw_storage_ref,
                language, status)
           VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, 'ingested')
           RETURNING id"""
    params = (
        document_type,
        source_type,
        publisher,
        url,
        canonical_url,
        title,
        published_at,
        content_hash,
        storage_ref,
        language,
    )
    new_id = await pool.fetchval(insert_sql, *params)

    # Only a short prefix of the title goes into the log line.
    title_preview = title[:60] if title else ""
    logger.debug("Persisted document %s (%s)", new_id, title_preview)
    return str(new_id)
+
+
async def update_document_parse_results(
    pool: asyncpg.Pool,
    *,
    document_id: str,
    normalized_storage_ref: str | None,
    parser_output_ref: str | None,
    parse_quality_score: float,
    parse_confidence: str,
    status: str,
) -> None:
    """Write parser output references and quality fields onto a document row.

    Sets ``normalized_storage_ref``, ``parser_output_ref``,
    ``parse_quality_score``, ``parse_confidence`` and ``status`` for the row
    whose ``id`` equals ``document_id``, and bumps ``updated_at`` to ``NOW()``.

    Requirements: 4.1, 4.3, 9.1
    """
    update_sql = """UPDATE documents SET
               normalized_storage_ref = $2,
               parser_output_ref = $3,
               parse_quality_score = $4,
               parse_confidence = $5,
               status = $6,
               updated_at = NOW()
           WHERE id = $1"""
    params = (
        document_id,
        normalized_storage_ref,
        parser_output_ref,
        parse_quality_score,
        parse_confidence,
        status,
    )
    await pool.execute(update_sql, *params)
    logger.debug(
        "Updated document %s parse results: quality=%.2f confidence=%s status=%s",
        document_id,
        parse_quality_score,
        parse_confidence,
        status,
    )
+
+
async def persist_document_company_mention(
    pool: asyncpg.Pool,
    *,
    document_id: str,
    company_id: str,
    ticker: str,
    mention_type: str = "direct",
    confidence: float = 1.0,
) -> str:
    """Insert a ``document_company_mentions`` row tying a document to a company.

    ``document_id`` and ``company_id`` are bound with ``::uuid`` casts.

    Returns the new mention row id as a string.
    """
    insert_sql = """INSERT INTO document_company_mentions
               (document_id, company_id, ticker, mention_type, confidence)
           VALUES ($1::uuid, $2::uuid, $3, $4, $5)
           RETURNING id"""
    params = (document_id, company_id, ticker, mention_type, confidence)
    return str(await pool.fetchval(insert_sql, *params))
+
+
async def persist_broker_event(
    pool: asyncpg.Pool,
    *,
    ticker: str,
    event_type: str,
    data: dict[str, Any],
    source_provider: str,
    storage_ref: str,
    content_hash: str,
    captured_at: datetime | None = None,
) -> str:
    """Store a broker payload as a ``market_snapshots`` row.

    The row's ``snapshot_type`` is ``event_type`` with a ``broker_`` prefix
    (e.g. ``broker_positions``). ``data`` is serialised with ``json.dumps``;
    when ``captured_at`` is omitted, the current UTC time is stored. No
    ``company_id`` is written by this path.

    Returns the new snapshot row id as a string.
    """
    insert_sql = """INSERT INTO market_snapshots
               (ticker, snapshot_type, data, source_provider,
                captured_at, storage_ref, content_hash)
           VALUES ($1, $2, $3::jsonb, $4, $5, $6, $7)
           RETURNING id"""
    captured = captured_at if captured_at else datetime.now(timezone.utc)
    params = (
        ticker,
        f"broker_{event_type}",
        json.dumps(data),
        source_provider,
        captured,
        storage_ref,
        content_hash,
    )
    snapshot_id = await pool.fetchval(insert_sql, *params)
    logger.debug("Persisted broker event %s for %s", snapshot_id, ticker)
    return str(snapshot_id)
+
+
+def _resolve_document_type(source_type: str) -> str:
+    """Map source_type to a document_type value."""
+    mapping = {
+        "news_api": "article",
+        "filings_api": "filing",
+        "web_scrape": "press_release",
+    }
+    return mapping.get(source_type, "article")
+
+
+def _extract_publisher(item: dict[str, Any]) -> str:
+    """Extract publisher name from an adapter item dict."""
+    if item.get("publisher"):
+        return str(item["publisher"])
+    source = item.get("source")
+    if isinstance(source, dict):
+        return source.get("name", "")
+    if source:
+        return str(source)
+    return ""
+
+
+def _parse_published_at(item: dict[str, Any]) -> datetime | None:
+    """Parse published_at from various adapter item formats."""
+    raw = item.get("publishedAt") or item.get("published_at")
+    if not raw:
+        return None
+    if isinstance(raw, datetime):
+        return raw
+    try:
+        return datetime.fromisoformat(str(raw).replace("Z", "+00:00"))
+    except (ValueError, TypeError):
+        return None
+
+
async def persist_ingestion_items(
    pool: asyncpg.Pool,
    *,
    source_type: str,
    ticker: str,
    company_id: str | None,
    items: list[dict[str, Any]],
    storage_ref: str,
    adapter_metadata: dict[str, Any],
    content_hash: str,
) -> tuple[int, list[str]]:
    """Dispatch a batch of ingestion items to the matching persistence helper.

    ``market_api`` goes to the market-snapshot path and ``broker`` to the
    broker path; both read ``provider`` (default ``"unknown"``) from
    ``adapter_metadata``, and the broker path also reads ``endpoint``
    (default ``"positions"``). Every other ``source_type`` is handled as a
    document source.

    Returns ``(new_item_count, list_of_new_ids)``.
    """
    if source_type in ("market_api", "broker"):
        provider = adapter_metadata.get("provider", "unknown")
        if source_type == "market_api":
            return await _persist_market_items(
                pool,
                ticker=ticker,
                company_id=company_id,
                items=items,
                storage_ref=storage_ref,
                provider=provider,
                content_hash=content_hash,
            )
        return await _persist_broker_items(
            pool,
            ticker=ticker,
            items=items,
            storage_ref=storage_ref,
            provider=provider,
            endpoint=adapter_metadata.get("endpoint", "positions"),
            content_hash=content_hash,
        )

    # Everything else (news_api, filings_api, web_scrape, ...) is a document.
    return await _persist_document_items(
        pool,
        source_type=source_type,
        ticker=ticker,
        company_id=company_id,
        items=items,
        storage_ref=storage_ref,
    )
+
+
async def _persist_market_items(
    pool: asyncpg.Pool,
    *,
    ticker: str,
    company_id: str | None,
    items: list[dict[str, Any]],
    storage_ref: str,
    provider: str,
    content_hash: str,
) -> tuple[int, list[str]]:
    """Store each market item as its own ``market_snapshots`` row.

    Every item is hashed from its key-sorted JSON form; items whose hash is
    already present in ``market_snapshots`` are skipped. The ``content_hash``
    argument is accepted but the per-item hash is what gets stored.

    Returns ``(count, ids)`` for the rows that were inserted.
    """
    inserted: list[str] = []
    for entry in items:
        entry_hash = content_hash_str(json.dumps(entry, sort_keys=True))
        seen = await pool.fetchval(
            "SELECT 1 FROM market_snapshots WHERE content_hash = $1", entry_hash
        )
        if not seen:
            new_id = await persist_market_snapshot(
                pool,
                company_id=company_id,
                ticker=ticker,
                snapshot_type=_infer_market_snapshot_type(entry),
                data=entry,
                source_provider=provider,
                storage_ref=storage_ref,
                content_hash=entry_hash,
            )
            inserted.append(new_id)
    return len(inserted), inserted
+
+
+def _infer_market_snapshot_type(item: dict[str, Any]) -> str:
+    """Infer snapshot_type from market data item fields."""
+    # Polygon aggregate bars have 'o', 'h', 'l', 'c' fields
+    if all(k in item for k in ("o", "h", "l", "c")):
+        return "bar"
+    # Ticker details have 'market_cap' or 'sic_code'
+    if "market_cap" in item or "sic_code" in item:
+        return "ticker_details"
+    # Quote snapshots
+    if "ask" in item or "bid" in item:
+        return "quote"
+    return "snapshot"
+
+
async def _persist_broker_items(
    pool: asyncpg.Pool,
    *,
    ticker: str,
    items: list[dict[str, Any]],
    storage_ref: str,
    provider: str,
    endpoint: str,
    content_hash: str,
) -> tuple[int, list[str]]:
    """Store each broker item as a ``broker_``-prefixed market snapshot.

    Every item is hashed from its key-sorted JSON form; items whose hash is
    already present in ``market_snapshots`` are skipped. ``endpoint`` is
    passed on as the event type. The ``content_hash`` argument is accepted
    but the per-item hash is what gets stored.

    Returns ``(count, ids)`` for the rows that were inserted.
    """
    inserted: list[str] = []
    for entry in items:
        entry_hash = content_hash_str(json.dumps(entry, sort_keys=True))
        seen = await pool.fetchval(
            "SELECT 1 FROM market_snapshots WHERE content_hash = $1", entry_hash
        )
        if not seen:
            new_id = await persist_broker_event(
                pool,
                ticker=ticker,
                event_type=endpoint,
                data=entry,
                source_provider=provider,
                storage_ref=storage_ref,
                content_hash=entry_hash,
            )
            inserted.append(new_id)
    return len(inserted), inserted
+
+
async def _persist_document_items(
    pool: asyncpg.Pool,
    *,
    source_type: str,
    ticker: str,
    company_id: str | None,
    items: list[dict[str, Any]],
    storage_ref: str,
) -> tuple[int, list[str]]:
    """Persist document items (news, filings, web scrape) to the documents table.

    For each item this derives a content hash (the item's own ``content_hash``
    or a hash of its key-sorted JSON), a title (``title`` then ``name``), a
    URL (``url`` then ``link``), a canonical URL, a publication time and a
    publisher, then calls ``persist_document``. Items reported as duplicates
    (``None`` result) are skipped. When ``company_id`` is truthy, a company
    mention row is added for each new document.

    Returns ``(count, ids)`` for the documents that were inserted.
    """
    doc_type = _resolve_document_type(source_type)
    ids: list[str] = []

    for item in items:
        item_hash = item.get("content_hash") or content_hash_str(
            json.dumps(item, sort_keys=True)
        )
        # ``dict.get(key, default)`` only uses the default when the key is
        # absent, so a present-but-None/empty "title" or "url" would hide a
        # usable "name"/"link". ``or`` chains fall back on any falsy value
        # and guarantee ``title`` is never None.
        title = item.get("title") or item.get("name") or ""
        url = item.get("url") or item.get("link") or ""
        canonical_url = item.get("canonical_url") or (
            normalize_url(url) if url else None
        )
        published_at = _parse_published_at(item)
        publisher = _extract_publisher(item)

        doc_id = await persist_document(
            pool,
            document_type=doc_type,
            source_type=source_type,
            publisher=publisher,
            url=url or None,
            canonical_url=canonical_url,
            title=title,
            published_at=published_at,
            content_hash=item_hash,
            storage_ref=storage_ref,
        )
        if doc_id is None:
            # Duplicate content hash: nothing new to link or report.
            continue

        # Link document to company if we have a company_id.
        # NOTE(review): the document insert and the mention insert are
        # separate statements; if the mention insert fails, the document row
        # remains and a retry will treat the item as a duplicate.
        if company_id:
            await persist_document_company_mention(
                pool,
                document_id=doc_id,
                company_id=company_id,
                ticker=ticker,
            )

        ids.append(doc_id)

    return len(ids), ids
+
+
+# --- Retry and failure tracking (Requirement 3.4) ---
+
# Retry policy defaults (seconds for the delays); the original note says
# these match the scheduler defaults for consistency.
RETRY_BACKOFF_BASE: int = 60
RETRY_BACKOFF_MAX: int = 3600
RETRY_MAX_COUNT: int = 10


def compute_next_retry_at(
    retry_count: int,
    now: datetime | None = None,
    base: int = RETRY_BACKOFF_BASE,
    cap: int = RETRY_BACKOFF_MAX,
) -> datetime:
    """Return the earliest time a retry may run, using exponential backoff.

    The delay is ``base * 2 ** min(retry_count, 8)`` seconds, limited to
    ``cap`` seconds, and is added to the reference time.

    Args:
        retry_count: Current retry count (before incrementing).
        now: Reference timestamp; the current UTC time when omitted.
        base: Base delay in seconds.
        cap: Maximum delay in seconds.

    Returns:
        Datetime of the next eligible retry.
    """
    reference = now if now else datetime.now(timezone.utc)
    # The exponent is clamped at 8 before the cap is applied.
    exponent = retry_count if retry_count < 8 else 8
    uncapped = base * (2 ** exponent)
    delay_seconds = uncapped if uncapped < cap else cap
    return reference + timedelta(seconds=delay_seconds)
+
+
async def get_source_retry_count(
    pool: asyncpg.Pool,
    source_id: str,
    *,
    exclude_run_id: str | None = None,
) -> int:
    """Return the retry count from the most recent failed run for a source.

    The newest ``ingestion_runs`` row for ``source_id`` (by ``started_at``)
    is inspected. If that row's status is ``'failed'``, its ``retry_count``
    is returned (``NULL`` counts as 0); otherwise, or when no row exists,
    the result is 0.

    Args:
        pool: Connection pool used for the lookup.
        source_id: Source whose run history is inspected.
        exclude_run_id: Optional run id to leave out of the lookup. Pass the
            id of the run currently being recorded so that it is not
            mistaken for the "previous" run.
    """
    if exclude_run_id is None:
        row = await pool.fetchrow(
            """SELECT status, retry_count
           FROM ingestion_runs
           WHERE source_id = $1::uuid
           ORDER BY started_at DESC
           LIMIT 1""",
            source_id,
        )
    else:
        row = await pool.fetchrow(
            """SELECT status, retry_count
           FROM ingestion_runs
           WHERE source_id = $1::uuid
             AND id <> $2
           ORDER BY started_at DESC
           LIMIT 1""",
            source_id,
            exclude_run_id,
        )
    if row and row["status"] == "failed":
        return row["retry_count"] or 0
    return 0


async def record_retrieval_failure(
    pool: asyncpg.Pool,
    run_id: str,
    source_id: str,
    error_message: str,
    retry_count: int | None = None,
    now: datetime | None = None,
) -> dict[str, Any]:
    """Record a source retrieval failure with retry policy state.

    Updates the ``ingestion_runs`` row identified by ``run_id`` with:
    - status: 'failed'
    - error_message: the failure reason
    - retry_count: the previous count plus one
    - next_retry_at: computed via exponential backoff from the previous count
    - completed_at: the reference timestamp

    If ``retry_count`` is not provided, the previous count is looked up from
    the most recent *other* run of the same source. The run being recorded
    is excluded from that lookup: it already exists and is normally the
    newest row, so including it would always yield a previous count of 0
    and the backoff would never escalate.

    Args:
        pool: Connection pool used for the lookup and the update.
        run_id: Id of the run row to mark as failed.
        source_id: Source the run belongs to.
        error_message: Failure reason to store and log.
        retry_count: Previous retry count, if the caller already knows it.
        now: Reference timestamp; the current UTC time when omitted.

    Returns:
        A dict with ``run_id``, ``source_id``, ``retry_count``,
        ``next_retry_at`` (ISO 8601 string), ``exhausted`` and
        ``error_message`` for observability.

    Requirement 3.4
    """
    ts = now or datetime.now(timezone.utc)

    if retry_count is None:
        previous_count = await get_source_retry_count(
            pool, source_id, exclude_run_id=run_id
        )
    else:
        previous_count = retry_count
    retry_count = previous_count + 1

    # Backoff is keyed on the count *before* this failure, so the first
    # failure waits the base delay.
    next_retry = compute_next_retry_at(previous_count, now=ts)
    exhausted = retry_count >= RETRY_MAX_COUNT

    await pool.execute(
        """UPDATE ingestion_runs
           SET status = 'failed',
               error_message = $2,
               retry_count = $3,
               next_retry_at = $4,
               completed_at = $5
           WHERE id = $1""",
        run_id,
        error_message,
        retry_count,
        next_retry,
        ts,
    )

    state = {
        "run_id": run_id,
        "source_id": source_id,
        "retry_count": retry_count,
        "next_retry_at": next_retry.isoformat(),
        "exhausted": exhausted,
        "error_message": error_message,
    }

    if exhausted:
        logger.warning(
            "Source %s exhausted retries (%d/%d): %s",
            source_id, retry_count, RETRY_MAX_COUNT, error_message,
        )
    else:
        logger.info(
            "Source %s failed (retry %d/%d), next retry at %s: %s",
            source_id, retry_count, RETRY_MAX_COUNT,
            next_retry.isoformat(), error_message,
        )

    return state
+
+
async def persist_document_intelligence(
    pool: asyncpg.Pool,
    *,
    document_id: str,
    summary: str,
    macro_themes: list[str],
    novelty_score: float,
    source_credibility: float,
    extraction_warnings: list[str],
    confidence: float,
    model_provider: str,
    model_name: str,
    prompt_version: str,
    schema_version: str,
    raw_output_ref: str | None = None,
    prompt_ref: str | None = None,
    validation_status: str = "valid",
    validation_errors: list[str] | None = None,
    retry_count: int = 0,
) -> str:
    """Insert a ``document_intelligence`` row for a document.

    ``macro_themes``, ``extraction_warnings`` and ``validation_errors`` are
    serialised with ``json.dumps`` and bound through ``jsonb`` casts; a
    missing ``validation_errors`` is stored as an empty list.

    Returns the new intelligence row id as a string.

    Requirements: 5.3, 5.4, 9.2
    """
    insert_sql = """INSERT INTO document_intelligence
               (document_id, summary, macro_themes, novelty_score,
                source_credibility, extraction_warnings, confidence,
                model_provider, model_name, prompt_version, schema_version,
                raw_output_ref, prompt_ref, validation_status,
                validation_errors, retry_count)
           VALUES ($1::uuid, $2, $3::jsonb, $4, $5, $6::jsonb, $7,
                   $8, $9, $10, $11, $12, $13, $14, $15::jsonb, $16)
           RETURNING id"""
    params = (
        document_id,
        summary,
        json.dumps(macro_themes),
        novelty_score,
        source_credibility,
        json.dumps(extraction_warnings),
        confidence,
        model_provider,
        model_name,
        prompt_version,
        schema_version,
        raw_output_ref,
        prompt_ref,
        validation_status,
        json.dumps(validation_errors or []),
        retry_count,
    )
    new_id = await pool.fetchval(insert_sql, *params)
    logger.debug("Persisted document intelligence %s for doc %s", new_id, document_id)
    return str(new_id)
+
+
async def persist_document_impact(
    pool: asyncpg.Pool,
    *,
    intelligence_id: str,
    company_id: str,
    ticker: str,
    relevance: float,
    sentiment: str,
    impact_score: float,
    impact_horizon: str,
    catalyst_type: str,
    key_facts: list[str],
    risks: list[str],
    evidence_spans: list[str],
) -> str:
    """Insert a per-company ``document_impact_records`` row.

    The row references ``intelligence_id`` and ``company_id`` (both bound
    with ``::uuid`` casts). ``key_facts``, ``risks`` and ``evidence_spans``
    are serialised with ``json.dumps`` and bound through ``jsonb`` casts.

    Returns the new impact record id as a string.

    Requirements: 5.3, 5.5, 9.2
    """
    insert_sql = """INSERT INTO document_impact_records
               (intelligence_id, company_id, ticker, relevance, sentiment,
                impact_score, impact_horizon, catalyst_type,
                key_facts, risks, evidence_spans)
           VALUES ($1::uuid, $2::uuid, $3, $4, $5, $6, $7, $8,
                   $9::jsonb, $10::jsonb, $11::jsonb)
           RETURNING id"""
    params = (
        intelligence_id,
        company_id,
        ticker,
        relevance,
        sentiment,
        impact_score,
        impact_horizon,
        catalyst_type,
        json.dumps(key_facts),
        json.dumps(risks),
        json.dumps(evidence_spans),
    )
    new_id = await pool.fetchval(insert_sql, *params)
    logger.debug("Persisted impact record %s for %s", new_id, ticker)
    return str(new_id)
+
+
async def update_document_status(
    pool: asyncpg.Pool,
    *,
    document_id: str,
    status: str,
) -> None:
    """Set ``status`` on one document row and bump ``updated_at`` to ``NOW()``.

    Used to advance documents through the pipeline:
    ingested → parsed → extracted → failed.

    Requirements: 5.4
    """
    update_sql = (
        "UPDATE documents SET status = $2, updated_at = NOW() WHERE id = $1::uuid"
    )
    await pool.execute(update_sql, document_id, status)
    logger.debug("Updated document %s status to %s", document_id, status)
+
+
async def reset_source_retry_state(
    pool: asyncpg.Pool,
    source_id: str,
) -> None:
    """Clear the retry state on a source's most recent run.

    Sets ``retry_count`` to 0 and ``next_retry_at`` to ``NULL`` on the
    ``ingestion_runs`` row with the latest ``started_at`` for ``source_id``.
    Called after a successful ingestion to clear any accumulated backoff.
    """
    reset_sql = """UPDATE ingestion_runs
           SET retry_count = 0, next_retry_at = NULL
           WHERE id = (
               SELECT id FROM ingestion_runs
               WHERE source_id = $1::uuid
               ORDER BY started_at DESC
               LIMIT 1
           )"""
    await pool.execute(reset_sql, source_id)