phase 14-15: docker build validation and helm deployment

2026-04-11 11:59:45 -07:00
parent 7394d241c9
commit ce10afa034
179 changed files with 32559 additions and 576 deletions
@@ -0,0 +1,342 @@
+"""Operational alerting for Stonks Oracle pipeline health.
+
+Evaluates alert rules against PostgreSQL operational state and emits
+structured log events and Prometheus metrics when thresholds are breached.
+
+Alert rules:
+- source_failures: sustained source retrieval failures per source
+- schema_failure_spike: extraction validation failure rate exceeds threshold
+- analytical_lag: lake publication has not completed within threshold
+- broker_issues: consecutive broker submission errors
+
+Requirements: 12.3
+Design: Section 12 (Observability and Operations)
+"""
+from __future__ import annotations
+
+import logging
+from dataclasses import dataclass, field
+from datetime import datetime, timezone
+from typing import Any
+
+import asyncpg
+
+from services.shared.config import AlertingConfig
+from services.shared.metrics import (
+    ALERT_ACTIVE,
+    ALERT_CHECK_DURATION,
+    ALERTS_FIRED,
+    ALERTS_RESOLVED,
+)
+
+logger = logging.getLogger("alerting")
+
+
+@dataclass
+class Alert:
+    """A single alert instance produced by one rule evaluation.
+
+    ``details`` may carry a ``"key"`` entry; ``AlertState`` combines it with
+    ``rule`` to tell apart several alerts raised by the same rule.
+    """
+
+    rule: str  # name of the rule that raised the alert, e.g. "source_failures"
+    severity: str  # "warning" | "critical"
+    summary: str  # human-readable one-line description
+    details: dict[str, Any] = field(default_factory=dict)  # structured context; a fresh dict per instance
+    fired_at: datetime = field(default_factory=lambda: datetime.now(timezone.utc))  # UTC time the instance was created
+
+
+@dataclass
+class AlertState:
+    """In-memory registry of the alerts that are currently firing.
+
+    Entries are keyed by ``"<rule>:<key>"`` so that a single rule can have
+    several independent alerts active at the same time.
+    """
+
+    active: dict[str, Alert] = field(default_factory=dict)
+
+    @staticmethod
+    def _compose_key(rule: str, key: str = "") -> str:
+        """Build the registry key for a rule and its optional sub-key."""
+        return f"{rule}:{key}"
+
+    def fire(self, alert: Alert) -> bool:
+        """Store ``alert`` and report whether it was not active before.
+
+        The stored entry is always replaced by the alert passed in.
+        """
+        registry_key = self._compose_key(alert.rule, alert.details.get("key", ""))
+        already_active = registry_key in self.active
+        self.active[registry_key] = alert
+        return not already_active
+
+    def resolve(self, rule: str, key: str = "") -> bool:
+        """Remove the alert for ``rule``/``key``; return True if one was active."""
+        try:
+            del self.active[self._compose_key(rule, key)]
+        except KeyError:
+            return False
+        return True
+
+    def is_firing(self, rule: str, key: str = "") -> bool:
+        """Return True while an alert for ``rule``/``key`` is registered."""
+        return self._compose_key(rule, key) in self.active
+
+
+async def check_source_failures(
+    pool: asyncpg.Pool,
+    config: AlertingConfig,
+) -> list[Alert]:
+    """Raise a warning for every source whose most recent runs all failed.
+
+    Only runs started within ``config.source_failure_window_hours`` are
+    considered. A source is reported when its latest
+    ``config.source_failure_threshold`` runs in that window all have status
+    ``'failed'``. Sources are inner-joined to ``companies``, so a source
+    without a matching company row is not reported.
+
+    Returns one ``Alert`` per failing source, keyed by the source id.
+    """
+    failing_sources = await pool.fetch(
+        """WITH recent_runs AS (
+            SELECT source_id, status,
+                   ROW_NUMBER() OVER (PARTITION BY source_id ORDER BY started_at DESC) AS rn
+            FROM ingestion_runs
+            WHERE started_at >= NOW() - INTERVAL '1 hour' * $1
+        ),
+        failure_streaks AS (
+            SELECT source_id,
+                   COUNT(*) FILTER (WHERE status = 'failed') AS consecutive_failures,
+                   COUNT(*) AS total_runs
+            FROM recent_runs
+            WHERE rn <= $2
+            GROUP BY source_id
+            HAVING COUNT(*) FILTER (WHERE status = 'failed') = COUNT(*)
+               AND COUNT(*) >= $2
+        )
+        SELECT fs.source_id, fs.consecutive_failures,
+               s.source_type, s.source_name, c.ticker
+        FROM failure_streaks fs
+        JOIN sources s ON s.id = fs.source_id
+        JOIN companies c ON c.id = s.company_id""",
+        config.source_failure_window_hours,
+        config.source_failure_threshold,
+    )
+
+    def _to_alert(record: Any) -> Alert:
+        # The source id doubles as the per-rule key used for state tracking.
+        source_id = str(record["source_id"])
+        description = (
+            f"Source {record['source_name']} ({record['source_type']}) for "
+            f"{record['ticker']} has {record['consecutive_failures']} consecutive failures"
+        )
+        context = {
+            "key": source_id,
+            "source_id": source_id,
+            "source_type": record["source_type"],
+            "source_name": record["source_name"],
+            "ticker": record["ticker"],
+            "consecutive_failures": record["consecutive_failures"],
+        }
+        return Alert(
+            rule="source_failures",
+            severity="warning",
+            summary=description,
+            details=context,
+        )
+
+    return [_to_alert(record) for record in failing_sources]
+
+
+async def check_schema_failure_spike(
+    pool: asyncpg.Pool,
+    config: AlertingConfig,
+) -> list[Alert]:
+    """Alert when the recent extraction failure rate reaches the threshold.
+
+    Counts rows of ``model_performance_metrics`` recorded within
+    ``config.schema_failure_window_hours`` and compares the share of
+    unsuccessful rows with ``config.schema_failure_rate_threshold``.
+
+    Returns a single-element list (key ``"global"``) when the threshold is
+    reached, otherwise an empty list. Severity is ``"critical"`` from a 50%
+    failure rate upwards and ``"warning"`` below that.
+    """
+    counts = await pool.fetchrow(
+        """SELECT
+            COUNT(*) AS total,
+            COUNT(*) FILTER (WHERE NOT success) AS failed
+        FROM model_performance_metrics
+        WHERE recorded_at >= NOW() - INTERVAL '1 hour' * $1""",
+        config.schema_failure_window_hours,
+    )
+
+    # No rows in the window means there is no rate to evaluate.
+    if not counts or counts["total"] == 0:
+        return []
+
+    total, failed = counts["total"], counts["failed"]
+    failure_rate = failed / total
+    threshold = config.schema_failure_rate_threshold
+
+    if not failure_rate >= threshold:
+        return []
+
+    if failure_rate >= 0.5:
+        severity = "critical"
+    else:
+        severity = "warning"
+
+    description = (
+        f"Extraction schema failure rate is {failure_rate:.1%} "
+        f"({failed}/{total}) in the last {config.schema_failure_window_hours}h"
+    )
+    context = {
+        "key": "global",
+        "total_extractions": total,
+        "failed_extractions": failed,
+        "failure_rate": round(failure_rate, 4),
+        "threshold": threshold,
+        "window_hours": config.schema_failure_window_hours,
+    }
+    return [
+        Alert(
+            rule="schema_failure_spike",
+            severity=severity,
+            summary=description,
+            details=context,
+        )
+    ]
+
+
+async def check_analytical_lag(
+    pool: asyncpg.Pool,
+    config: AlertingConfig,
+) -> list[Alert]:
+    """Warn about lake tables whose last successful publish is too old.
+
+    Reads ``audit_events`` rows of type ``lake_publish`` with status
+    ``success``, groups them by table name and keeps the tables whose newest
+    publish is older than ``config.lake_lag_threshold_minutes``. A table with
+    no successful publish event at all produces no row and so no alert.
+
+    Returns one ``Alert`` per stale table, keyed by the table name.
+    """
+    stale_tables = await pool.fetch(
+        """SELECT
+            details->>'table_name' AS table_name,
+            MAX(created_at) AS last_publish
+        FROM audit_events
+        WHERE event_type = 'lake_publish'
+          AND details->>'status' = 'success'
+          AND details->>'table_name' IS NOT NULL
+        GROUP BY details->>'table_name'
+        HAVING MAX(created_at) < NOW() - INTERVAL '1 minute' * $1""",
+        config.lake_lag_threshold_minutes,
+    )
+
+    # One reference time for every row so the reported lags are comparable.
+    reference_time = datetime.now(timezone.utc)
+    threshold = config.lake_lag_threshold_minutes
+
+    def _to_alert(record: Any) -> Alert:
+        name = record["table_name"]
+        published_at = record["last_publish"]
+        # Naive timestamps are treated as UTC so the subtraction is valid.
+        if published_at.tzinfo is None:
+            published_at = published_at.replace(tzinfo=timezone.utc)
+        lag_minutes = (reference_time - published_at).total_seconds() / 60
+        return Alert(
+            rule="analytical_lag",
+            severity="warning",
+            summary=(
+                f"Lake table '{name}' last published {lag_minutes:.0f}m ago "
+                f"(threshold: {threshold}m)"
+            ),
+            details={
+                "key": name,
+                "table_name": name,
+                "last_publish": published_at.isoformat(),
+                "lag_minutes": round(lag_minutes, 1),
+                "threshold_minutes": threshold,
+            },
+        )
+
+    return [_to_alert(record) for record in stale_tables]
+
+
+async def check_broker_issues(
+    pool: asyncpg.Pool,
+    config: AlertingConfig,
+) -> list[Alert]:
+    """Alert when too many broker error events occurred in the window.
+
+    Counts ``order_events`` rows of type ``broker_error``, ``broker_timeout``
+    or ``connection_failed`` created within
+    ``config.broker_error_window_hours``. Successful events between the
+    errors are not inspected, so this is a count of errors in the window
+    rather than a strict "consecutive" streak.
+
+    Returns a single critical ``Alert`` (key ``"global"``) when the count
+    reaches ``config.broker_error_threshold``, otherwise an empty list.
+    """
+    # Count every matching event. Capping the count at the threshold would
+    # make the reported error_count equal the threshold no matter how many
+    # errors actually happened.
+    rows = await pool.fetch(
+        """SELECT COUNT(*) AS error_count
+        FROM order_events
+        WHERE created_at >= NOW() - INTERVAL '1 hour' * $1
+          AND event_type IN ('broker_error', 'broker_timeout', 'connection_failed')""",
+        config.broker_error_window_hours,
+    )
+
+    # An aggregate query yields one row; the guard keeps stubbed pools safe.
+    if not rows:
+        return []
+
+    error_count = rows[0]["error_count"]
+    if error_count < config.broker_error_threshold:
+        return []
+
+    return [Alert(
+        rule="broker_issues",
+        severity="critical",
+        summary=(
+            f"{error_count} broker errors in the last "
+            f"{config.broker_error_window_hours}h"
+        ),
+        details={
+            "key": "global",
+            "error_count": error_count,
+            "threshold": config.broker_error_threshold,
+            "window_hours": config.broker_error_window_hours,
+        },
+    )]
+
+
+async def evaluate_alerts(
+    pool: asyncpg.Pool,
+    config: AlertingConfig,
+    state: AlertState,
+) -> list[Alert]:
+    """Run all alert rules and return newly fired alerts.
+
+    Updates ``state`` to track firing/resolved transitions and emits a log
+    event plus Prometheus metric updates for each transition.
+
+    A rule whose check raises is logged and treated as "unknown" for this
+    pass: its previously active alerts are left untouched instead of being
+    reported as resolved, because a failed query says nothing about whether
+    the underlying condition has cleared.
+
+    Args:
+        pool: Connection pool handed to every rule check.
+        config: Thresholds handed to every rule check.
+        state: Registry of active alerts; mutated in place.
+
+    Returns:
+        The alerts that were not active before this evaluation.
+    """
+    # (rule name, check coroutine, log message used when the check raises)
+    rule_checks = (
+        ("source_failures", check_source_failures, "Error checking source failures"),
+        ("schema_failure_spike", check_schema_failure_spike, "Error checking schema failure spike"),
+        ("analytical_lag", check_analytical_lag, "Error checking analytical lag"),
+        ("broker_issues", check_broker_issues, "Error checking broker issues"),
+    )
+
+    all_alerts: list[Alert] = []
+    failed_rules: set[str] = set()
+
+    with ALERT_CHECK_DURATION.time():
+        # Each rule is isolated so one failing query cannot block the others.
+        for rule_name, check, error_message in rule_checks:
+            try:
+                all_alerts.extend(await check(pool, config))
+            except Exception:
+                failed_rules.add(rule_name)
+                logger.exception(error_message)
+
+    # Track which rule+key combos are currently firing
+    current_keys: set[str] = set()
+    newly_fired: list[Alert] = []
+
+    for alert in all_alerts:
+        key = f"{alert.rule}:{alert.details.get('key', '')}"
+        current_keys.add(key)
+
+        if state.fire(alert):
+            # New alert firing
+            ALERTS_FIRED.labels(rule=alert.rule, severity=alert.severity).inc()
+            ALERT_ACTIVE.labels(rule=alert.rule).set(1)
+            newly_fired.append(alert)
+            logger.warning(
+                "ALERT FIRING: [%s] %s",
+                alert.rule,
+                alert.summary,
+                extra={
+                    "alert_rule": alert.rule,
+                    "alert_severity": alert.severity,
+                    "alert_details": alert.details,
+                },
+            )
+
+    # Anything active that was not reported in this pass has cleared, unless
+    # its rule could not be evaluated at all.
+    for key in set(state.active) - current_keys:
+        rule, _, detail_key = key.partition(":")
+        if rule in failed_rules:
+            continue
+        if state.resolve(rule, detail_key):
+            ALERTS_RESOLVED.labels(rule=rule).inc()
+            # Only set gauge to 0 if no more alerts for this rule
+            still_firing = any(k.startswith(f"{rule}:") for k in state.active)
+            if not still_firing:
+                ALERT_ACTIVE.labels(rule=rule).set(0)
+            logger.info(
+                "ALERT RESOLVED: [%s] key=%s",
+                rule,
+                detail_key,
+            )
+
+    return newly_fired
@@ -0,0 +1,493 @@
+"""Execution audit trail - records every step from recommendation to market outcome.
+
+Writes structured audit events to the audit_events table so the full
+decision chain is traceable: recommendation → risk evaluation → order
+submission → broker response → fill/rejection/cancellation.
+
+Each event captures the entity type, entity ID, event type, actor,
+and a JSONB data payload with stage-specific details.
+
+Requirements: 8.3, 11.3
+Design: Section 4.9 (Broker Adapter), Section 6.1 (PostgreSQL audit_events)
+"""
+from __future__ import annotations
+
+import json
+import logging
+import uuid
+from datetime import datetime, timezone
+from typing import Any
+
+import asyncpg
+
+logger = logging.getLogger("audit")
+
+
+# ---------------------------------------------------------------------------
+# Event type constants
+# ---------------------------------------------------------------------------
+
+# Event type identifiers use a dotted "<domain>.<action>" form and are the
+# values passed as event_type when an audit event is recorded.
+
+# Recommendation stage
+AUDIT_RECOMMENDATION_GENERATED = "recommendation.generated"
+AUDIT_RECOMMENDATION_SUPPRESSED = "recommendation.suppressed"
+
+# Risk evaluation stage
+AUDIT_RISK_EVALUATED = "risk.evaluated"
+AUDIT_RISK_REJECTED = "risk.rejected"
+
+# Order lifecycle
+AUDIT_ORDER_SUBMITTED = "order.submitted"
+AUDIT_ORDER_ACCEPTED = "order.accepted"
+AUDIT_ORDER_FILLED = "order.filled"
+AUDIT_ORDER_REJECTED = "order.rejected"
+AUDIT_ORDER_CANCELLED = "order.cancelled"
+AUDIT_ORDER_DUPLICATE = "order.duplicate_prevented"
+
+# Position changes
+AUDIT_POSITION_OPENED = "position.opened"
+AUDIT_POSITION_CLOSED = "position.closed"
+AUDIT_POSITION_UPDATED = "position.updated"
+
+# Trading mode changes
+AUDIT_TRADING_MODE_CHANGED = "trading.mode_changed"
+
+# Operator approval workflow
+AUDIT_APPROVAL_REQUESTED = "approval.requested"
+AUDIT_APPROVAL_APPROVED = "approval.approved"
+AUDIT_APPROVAL_REJECTED = "approval.rejected"
+AUDIT_APPROVAL_EXPIRED = "approval.expired"
+
+
+# ---------------------------------------------------------------------------
+# Core audit writer
+# ---------------------------------------------------------------------------
+
+# Parameterized insert for one audit row. Placeholders, in order: event id
+# (cast to uuid), event type, entity type, entity id (cast to uuid), actor,
+# JSON payload (cast to jsonb), creation timestamp.
+_INSERT_AUDIT_EVENT = """
+INSERT INTO audit_events (id, event_type, entity_type, entity_id, actor, data, created_at)
+VALUES ($1::uuid, $2, $3, $4::uuid, $5, $6::jsonb, $7)
+"""
+
+
+async def record_audit_event(
+    pool: asyncpg.Pool,
+    event_type: str,
+    entity_type: str,
+    entity_id: str,
+    data: dict[str, Any],
+    actor: str = "system",
+    timestamp: datetime | None = None,
+) -> str:
+    """Insert one audit event row and return its generated UUID.
+
+    The write is best-effort: any exception raised while serializing
+    ``data`` or executing the insert is logged as a warning and an empty
+    string is returned instead of the id.
+
+    Args:
+        pool: Pool used to execute the insert.
+        event_type: Event type identifier stored with the row.
+        entity_type: Kind of entity the event refers to.
+        entity_id: Id of that entity; the insert casts it to ``uuid``.
+        data: Payload serialized to JSON; values JSON cannot encode natively
+            are converted with ``str``.
+        actor: Who caused the event.
+        timestamp: Event time; the current UTC time is used when omitted.
+    """
+    audit_id = str(uuid.uuid4())
+    recorded_at = timestamp if timestamp else datetime.now(timezone.utc)
+
+    try:
+        payload_json = json.dumps(data, default=str)
+        await pool.execute(
+            _INSERT_AUDIT_EVENT,
+            audit_id,
+            event_type,
+            entity_type,
+            entity_id,
+            actor,
+            payload_json,
+            recorded_at,
+        )
+    except Exception:
+        logger.warning(
+            "Failed to write audit event %s for %s/%s",
+            event_type, entity_type, entity_id,
+            exc_info=True,
+        )
+        return ""
+    else:
+        return audit_id
+
+
+# ---------------------------------------------------------------------------
+# Convenience helpers for each execution stage
+# ---------------------------------------------------------------------------
+
+
+async def audit_recommendation_generated(
+    pool: asyncpg.Pool,
+    recommendation_id: str,
+    ticker: str,
+    action: str,
+    mode: str,
+    confidence: float,
+    evidence_count: int,
+    suppressed: bool = False,
+) -> str:
+    """Audit a generated recommendation, or a suppressed one.
+
+    Uses the "suppressed" event type when ``suppressed`` is true and the
+    "generated" type otherwise. Returns whatever ``record_audit_event``
+    returns.
+    """
+    if suppressed:
+        event_type = AUDIT_RECOMMENDATION_SUPPRESSED
+    else:
+        event_type = AUDIT_RECOMMENDATION_GENERATED
+
+    payload = {
+        "ticker": ticker,
+        "action": action,
+        "mode": mode,
+        "confidence": confidence,
+        "evidence_count": evidence_count,
+        "suppressed": suppressed,
+    }
+    return await record_audit_event(
+        pool,
+        event_type=event_type,
+        entity_type="recommendation",
+        entity_id=recommendation_id,
+        data=payload,
+        actor="recommendation_worker",
+    )
+
+
+async def audit_risk_evaluated(
+    pool: asyncpg.Pool,
+    evaluation_id: str,
+    recommendation_id: str | None,
+    ticker: str,
+    eligible: bool,
+    allowed_mode: str,
+    rejection_reasons: list[str],
+    check_count: int,
+) -> str:
+    """Audit the outcome of a risk evaluation.
+
+    An eligible result is stored as "risk.evaluated", an ineligible one as
+    "risk.rejected". Returns whatever ``record_audit_event`` returns.
+    """
+    if eligible:
+        event_type = AUDIT_RISK_EVALUATED
+    else:
+        event_type = AUDIT_RISK_REJECTED
+
+    payload = {
+        "recommendation_id": recommendation_id,
+        "ticker": ticker,
+        "eligible": eligible,
+        "allowed_mode": allowed_mode,
+        "rejection_reasons": rejection_reasons,
+        "check_count": check_count,
+    }
+    return await record_audit_event(
+        pool,
+        event_type=event_type,
+        entity_type="risk_evaluation",
+        entity_id=evaluation_id,
+        data=payload,
+        actor="risk_engine",
+    )
+
+
+async def audit_order_submitted(
+    pool: asyncpg.Pool,
+    order_id: str,
+    ticker: str,
+    side: str,
+    quantity: float,
+    order_type: str,
+    idempotency_key: str,
+    recommendation_id: str | None = None,
+    evaluation_id: str | None = None,
+) -> str:
+    """Audit the submission of an order to the broker.
+
+    The optional recommendation and evaluation ids are stored in the payload
+    so the order can be linked back to the decision that produced it.
+    Returns whatever ``record_audit_event`` returns.
+    """
+    payload = {
+        "ticker": ticker,
+        "side": side,
+        "quantity": quantity,
+        "order_type": order_type,
+        "idempotency_key": idempotency_key,
+        "recommendation_id": recommendation_id,
+        "evaluation_id": evaluation_id,
+    }
+    return await record_audit_event(
+        pool,
+        event_type=AUDIT_ORDER_SUBMITTED,
+        entity_type="order",
+        entity_id=order_id,
+        data=payload,
+        actor="broker_service",
+    )
+
+
+async def audit_order_filled(
+    pool: asyncpg.Pool,
+    order_id: str,
+    ticker: str,
+    side: str,
+    fill_quantity: float,
+    fill_price: float | None,
+    broker_order_id: str,
+) -> str:
+    """Audit a broker fill for an order.
+
+    Returns whatever ``record_audit_event`` returns.
+    """
+    payload = {
+        "ticker": ticker,
+        "side": side,
+        "fill_quantity": fill_quantity,
+        "fill_price": fill_price,
+        "broker_order_id": broker_order_id,
+    }
+    return await record_audit_event(
+        pool,
+        event_type=AUDIT_ORDER_FILLED,
+        entity_type="order",
+        entity_id=order_id,
+        data=payload,
+        actor="broker_service",
+    )
+
+
+async def audit_order_rejected(
+    pool: asyncpg.Pool,
+    order_id: str,
+    ticker: str,
+    reason: str,
+    source: str = "broker",
+) -> str:
+    """Audit the rejection of an order.
+
+    ``source`` names who rejected the order and is stored in the payload as
+    ``rejection_source``. Returns whatever ``record_audit_event`` returns.
+    """
+    payload = {
+        "ticker": ticker,
+        "reason": reason,
+        "rejection_source": source,
+    }
+    return await record_audit_event(
+        pool,
+        event_type=AUDIT_ORDER_REJECTED,
+        entity_type="order",
+        entity_id=order_id,
+        data=payload,
+        actor="broker_service",
+    )
+
+
+async def audit_order_cancelled(
+    pool: asyncpg.Pool,
+    order_id: str,
+    ticker: str,
+    broker_order_id: str,
+) -> str:
+    """Audit the cancellation of an order.
+
+    Returns whatever ``record_audit_event`` returns.
+    """
+    payload = {
+        "ticker": ticker,
+        "broker_order_id": broker_order_id,
+    }
+    return await record_audit_event(
+        pool,
+        event_type=AUDIT_ORDER_CANCELLED,
+        entity_type="order",
+        entity_id=order_id,
+        data=payload,
+        actor="broker_service",
+    )
+
+
+async def audit_duplicate_prevented(
+    pool: asyncpg.Pool,
+    order_id: str,
+    ticker: str,
+    idempotency_key: str,
+    detected_via: str,
+) -> str:
+    """Audit that a duplicate order submission was blocked.
+
+    ``detected_via`` is stored verbatim in the payload. Returns whatever
+    ``record_audit_event`` returns.
+    """
+    payload = {
+        "ticker": ticker,
+        "idempotency_key": idempotency_key,
+        "detected_via": detected_via,
+    }
+    return await record_audit_event(
+        pool,
+        event_type=AUDIT_ORDER_DUPLICATE,
+        entity_type="order",
+        entity_id=order_id,
+        data=payload,
+        actor="broker_service",
+    )
+
+
+async def audit_position_change(
+    pool: asyncpg.Pool,
+    order_id: str,
+    ticker: str,
+    side: str,
+    quantity_before: float,
+    quantity_after: float,
+    avg_entry_before: float,
+    avg_entry_after: float,
+) -> str:
+    """Record a position change resulting from a fill.
+
+    The event type is derived from the quantities:
+
+    - flat before, non-zero after: "position.opened". This includes a
+      negative ``quantity_after``, so a short opened from flat is an open
+      rather than an update.
+    - zero after: "position.closed"
+    - anything else: "position.updated"
+
+    The event is stored with entity type "position" but uses ``order_id`` as
+    its entity id, so it is found when looking up the order's events.
+
+    Returns whatever ``record_audit_event`` returns.
+    """
+    if quantity_before == 0 and quantity_after != 0:
+        event_type = AUDIT_POSITION_OPENED
+    elif quantity_after == 0:
+        event_type = AUDIT_POSITION_CLOSED
+    else:
+        event_type = AUDIT_POSITION_UPDATED
+
+    return await record_audit_event(
+        pool,
+        event_type=event_type,
+        entity_type="position",
+        entity_id=order_id,
+        data={
+            "ticker": ticker,
+            "side": side,
+            "quantity_before": quantity_before,
+            "quantity_after": quantity_after,
+            "avg_entry_before": avg_entry_before,
+            "avg_entry_after": avg_entry_after,
+        },
+        actor="broker_service",
+    )
+
+
+async def audit_approval_requested(
+    pool: asyncpg.Pool,
+    approval_id: str,
+    ticker: str,
+    side: str,
+    quantity: float,
+    estimated_value: float,
+    recommendation_id: str | None = None,
+    expires_at: str | None = None,
+) -> str:
+    """Audit a request for operator approval of a live order.
+
+    Returns whatever ``record_audit_event`` returns.
+    """
+    payload = {
+        "ticker": ticker,
+        "side": side,
+        "quantity": quantity,
+        "estimated_value": estimated_value,
+        "recommendation_id": recommendation_id,
+        "expires_at": expires_at,
+    }
+    return await record_audit_event(
+        pool,
+        event_type=AUDIT_APPROVAL_REQUESTED,
+        entity_type="approval",
+        entity_id=approval_id,
+        data=payload,
+        actor="broker_service",
+    )
+
+
+async def audit_approval_reviewed(
+    pool: asyncpg.Pool,
+    approval_id: str,
+    ticker: str,
+    approved: bool,
+    reviewed_by: str = "operator",
+    review_note: str = "",
+) -> str:
+    """Audit an operator's decision on an approval request.
+
+    Stored as "approval.approved" or "approval.rejected" depending on
+    ``approved``; ``reviewed_by`` is used as the event's actor as well as a
+    payload field. Returns whatever ``record_audit_event`` returns.
+    """
+    if approved:
+        event_type = AUDIT_APPROVAL_APPROVED
+    else:
+        event_type = AUDIT_APPROVAL_REJECTED
+
+    payload = {
+        "ticker": ticker,
+        "approved": approved,
+        "reviewed_by": reviewed_by,
+        "review_note": review_note,
+    }
+    return await record_audit_event(
+        pool,
+        event_type=event_type,
+        entity_type="approval",
+        entity_id=approval_id,
+        data=payload,
+        actor=reviewed_by,
+    )
+
+
+async def audit_approval_expired(
+    pool: asyncpg.Pool,
+    approval_id: str,
+    ticker: str,
+) -> str:
+    """Audit an approval request that expired without being reviewed.
+
+    Returns whatever ``record_audit_event`` returns.
+    """
+    payload = {"ticker": ticker}
+    return await record_audit_event(
+        pool,
+        event_type=AUDIT_APPROVAL_EXPIRED,
+        entity_type="approval",
+        entity_id=approval_id,
+        data=payload,
+        actor="system",
+    )
+
+
+async def audit_trading_mode_changed(
+    pool: asyncpg.Pool,
+    config_id: str,
+    old_mode: str,
+    new_mode: str,
+    actor: str = "operator",
+) -> str:
+    """Audit a switch of the trading mode.
+
+    The event is attached to the "risk_config" entity identified by
+    ``config_id``. Returns whatever ``record_audit_event`` returns.
+    """
+    payload = {
+        "old_mode": old_mode,
+        "new_mode": new_mode,
+    }
+    return await record_audit_event(
+        pool,
+        event_type=AUDIT_TRADING_MODE_CHANGED,
+        entity_type="risk_config",
+        entity_id=config_id,
+        data=payload,
+        actor=actor,
+    )
+
+
+# ---------------------------------------------------------------------------
+# Query helpers for audit trail retrieval (Requirement 11.3)
+# ---------------------------------------------------------------------------
+
+# Events for one order, oldest first. $1 is matched against entity_id (cast
+# to uuid); $2 is a single text value compared against both
+# data->>'recommendation_id' and data->>'order_id'.
+_FETCH_AUDIT_TRAIL_FOR_ORDER = """
+SELECT id, event_type, entity_type, entity_id, actor, data, created_at
+FROM audit_events
+WHERE entity_id = $1::uuid
+   OR data->>'recommendation_id' = $2
+   OR data->>'order_id' = $2
+ORDER BY created_at ASC
+"""
+
+# Events for one entity, oldest first. $1 is the entity type, $2 the entity
+# id (cast to uuid).
+_FETCH_AUDIT_TRAIL_BY_ENTITY = """
+SELECT id, event_type, entity_type, entity_id, actor, data, created_at
+FROM audit_events
+WHERE entity_type = $1 AND entity_id = $2::uuid
+ORDER BY created_at ASC
+"""
+
+# Events for entity $1 plus every event of any entity that has at least one
+# event whose payload references recommendation id $2, oldest first.
+# NOTE(review): not referenced by the helpers visible in this part of the
+# file — confirm whether it is used elsewhere.
+_FETCH_FULL_EXECUTION_TRAIL = """
+SELECT id, event_type, entity_type, entity_id, actor, data, created_at
+FROM audit_events
+WHERE entity_id = $1::uuid
+   OR entity_id IN (
+       SELECT entity_id FROM audit_events
+       WHERE data->>'recommendation_id' = $2
+   )
+ORDER BY created_at ASC
+"""
+
+
+async def get_order_audit_trail(
+    pool: asyncpg.Pool,
+    order_id: str,
+    recommendation_id: str | None = None,
+) -> list[dict[str, Any]]:
+    """Return the audit events related to an order, oldest first.
+
+    Events match when their entity id equals ``order_id`` or when their
+    payload's ``recommendation_id`` or ``order_id`` equals the reference id.
+    The reference id is ``recommendation_id`` when given, else ``order_id``.
+
+    Each event is returned as a plain dict with stringified ids, a decoded
+    ``data`` payload and an ISO-8601 ``created_at`` (``None`` when missing).
+    """
+    ref_id = recommendation_id if recommendation_id else order_id
+    records = await pool.fetch(_FETCH_AUDIT_TRAIL_FOR_ORDER, order_id, ref_id)
+
+    trail: list[dict[str, Any]] = []
+    for record in records:
+        event: dict[str, Any] = {
+            "id": str(record["id"]),
+            "event_type": record["event_type"],
+            "entity_type": record["entity_type"],
+            "entity_id": str(record["entity_id"]),
+            "actor": record["actor"],
+        }
+        # The payload may arrive already decoded or as a JSON string.
+        payload = record["data"]
+        event["data"] = payload if isinstance(payload, dict) else json.loads(payload)
+        created_at = record["created_at"]
+        event["created_at"] = created_at.isoformat() if created_at else None
+        trail.append(event)
+    return trail
+
+
+async def get_entity_audit_trail(
+    pool: asyncpg.Pool,
+    entity_type: str,
+    entity_id: str,
+) -> list[dict[str, Any]]:
+    """Return every audit event of one entity, oldest first.
+
+    Each event is returned as a plain dict with stringified ids, a decoded
+    ``data`` payload and an ISO-8601 ``created_at`` (``None`` when missing).
+    """
+    records = await pool.fetch(_FETCH_AUDIT_TRAIL_BY_ENTITY, entity_type, entity_id)
+
+    trail: list[dict[str, Any]] = []
+    for record in records:
+        event: dict[str, Any] = {
+            "id": str(record["id"]),
+            "event_type": record["event_type"],
+            "entity_type": record["entity_type"],
+            "entity_id": str(record["entity_id"]),
+            "actor": record["actor"],
+        }
+        # The payload may arrive already decoded or as a JSON string.
+        payload = record["data"]
+        event["data"] = payload if isinstance(payload, dict) else json.loads(payload)
+        created_at = record["created_at"]
+        event["created_at"] = created_at.isoformat() if created_at else None
+        trail.append(event)
+    return trail
@@ -43,6 +43,10 @@ class OllamaConfig:
    base_url: str = "http://localhost:11434"
    model: str = "llama3.1:8b"
    timeout: int = 120
+    max_retries: int = 2
+    retry_base_delay: float = 1.0
+    retry_max_delay: float = 10.0
+    retry_backoff_multiplier: float = 2.0


@dataclass
@@ -51,16 +55,82 @@ class TrinoConfig:
    port: int = 8080
    catalog: str = "lakehouse"
    schema: str = "stonks"
+    iceberg_catalog: str = "iceberg"
+
+
+@dataclass
+class MarketDataConfig:
+    """Connection settings for the market data provider.
+
+    The defaults point at Polygon (``https://api.polygon.io``).
+    """
+    api_key: str = ""  # empty by default; must be supplied for authenticated access — TODO confirm how an empty key is handled
+    base_url: str = "https://api.polygon.io"
+    provider: str = "polygon"  # provider identifier


@dataclass
 class BrokerConfig:
    mode: str = "paper"  # paper | live
+    provider: str = "alpaca"  # broker identifier; presumably selects the adapter — verify against caller
    api_key: Optional[str] = None
    api_secret: Optional[str] = None
    base_url: Optional[str] = None


+@dataclass
+class RetentionConfig:
+    """Default retention periods (days) per bucket class.
+
+    These can be overridden per-bucket via the retention_policies DB table.
+    The cleanup_interval_hours controls how often the retention worker runs.
+    """
+    # Retention periods, in days, one per storage bucket class.
+    raw_market_days: int = 90
+    raw_news_days: int = 180
+    raw_filings_days: int = 365
+    normalized_days: int = 180
+    llm_prompts_days: int = 365
+    llm_results_days: int = 365
+    lakehouse_days: int = 730
+    audit_days: int = 730
+    # Worker scheduling and batching.
+    cleanup_interval_hours: int = 24
+    batch_size: int = 1000  # presumably objects processed per cleanup batch — TODO confirm against the worker
+
+
+# Map bucket names to RetentionConfig field names. Every value is the name
+# of an ``*_days`` attribute on RetentionConfig, so the default retention for
+# a bucket can be read with getattr(config, BUCKET_RETENTION_FIELDS[bucket]).
+BUCKET_RETENTION_FIELDS: dict[str, str] = {
+    "stonks-raw-market": "raw_market_days",
+    "stonks-raw-news": "raw_news_days",
+    "stonks-raw-filings": "raw_filings_days",
+    "stonks-normalized": "normalized_days",
+    "stonks-llm-prompts": "llm_prompts_days",
+    "stonks-llm-results": "llm_results_days",
+    "stonks-lakehouse": "lakehouse_days",
+    "stonks-audit": "audit_days",
+}
+
+
+@dataclass
+class AlertingConfig:
+    """Thresholds for operational alerting rules.
+
+    Window fields bound how far back each rule looks; threshold fields set
+    the level at which the rule fires.
+
+    Requirements: 12.3
+    """
+    # Source failure alerting
+    source_failure_threshold: int = 3  # consecutive failures before alert
+    source_failure_window_hours: int = 6  # lookback window
+
+    # Schema/extraction failure spike
+    schema_failure_rate_threshold: float = 0.3  # 30% failure rate triggers alert
+    schema_failure_window_hours: int = 1  # lookback window
+
+    # Analytical (lake publication) lag
+    lake_lag_threshold_minutes: int = 60  # minutes since last successful publish
+
+    # Broker issues
+    broker_error_threshold: int = 3  # broker error events within the window that trigger the alert
+    broker_error_window_hours: int = 1  # lookback window
+
+    # Evaluation interval
+    check_interval_seconds: int = 120  # seconds between alert evaluations
+
+
@dataclass
 class AppConfig:
    postgres: PostgresConfig = field(default_factory=PostgresConfig)
@@ -68,8 +138,12 @@ class AppConfig:
    minio: MinioConfig = field(default_factory=MinioConfig)
    ollama: OllamaConfig = field(default_factory=OllamaConfig)
    trino: TrinoConfig = field(default_factory=TrinoConfig)
+    market_data: MarketDataConfig = field(default_factory=MarketDataConfig)
    broker: BrokerConfig = field(default_factory=BrokerConfig)
+    retention: RetentionConfig = field(default_factory=RetentionConfig)
+    alerting: AlertingConfig = field(default_factory=AlertingConfig)
    log_level: str = "INFO"
+    json_logs: bool = True


 def load_config() -> AppConfig:
@@ -98,18 +172,52 @@ def load_config() -> AppConfig:
            base_url=os.getenv("OLLAMA_BASE_URL", "http://localhost:11434"),
            model=os.getenv("OLLAMA_MODEL", "llama3.1:8b"),
            timeout=int(os.getenv("OLLAMA_TIMEOUT", "120")),
+            max_retries=int(os.getenv("OLLAMA_MAX_RETRIES", "2")),
+            retry_base_delay=float(os.getenv("OLLAMA_RETRY_BASE_DELAY", "1.0")),
+            retry_max_delay=float(os.getenv("OLLAMA_RETRY_MAX_DELAY", "10.0")),
+            retry_backoff_multiplier=float(os.getenv("OLLAMA_RETRY_BACKOFF_MULTIPLIER", "2.0")),
        ),
        trino=TrinoConfig(
            host=os.getenv("TRINO_HOST", "localhost"),
            port=int(os.getenv("TRINO_PORT", "8080")),
            catalog=os.getenv("TRINO_CATALOG", "lakehouse"),
            schema=os.getenv("TRINO_SCHEMA", "stonks"),
+            iceberg_catalog=os.getenv("TRINO_ICEBERG_CATALOG", "iceberg"),
+        ),
+        market_data=MarketDataConfig(
+            api_key=os.getenv("MARKET_DATA_API_KEY", ""),
+            base_url=os.getenv("MARKET_DATA_BASE_URL", "https://api.polygon.io"),
+            provider=os.getenv("MARKET_DATA_PROVIDER", "polygon"),
        ),
        broker=BrokerConfig(
            mode=os.getenv("BROKER_MODE", "paper"),
+            provider=os.getenv("BROKER_PROVIDER", "alpaca"),
            api_key=os.getenv("BROKER_API_KEY", None),
            api_secret=os.getenv("BROKER_API_SECRET", None),
            base_url=os.getenv("BROKER_BASE_URL", None),
        ),
+        retention=RetentionConfig(
+            raw_market_days=int(os.getenv("RETENTION_RAW_MARKET_DAYS", "90")),
+            raw_news_days=int(os.getenv("RETENTION_RAW_NEWS_DAYS", "180")),
+            raw_filings_days=int(os.getenv("RETENTION_RAW_FILINGS_DAYS", "365")),
+            normalized_days=int(os.getenv("RETENTION_NORMALIZED_DAYS", "180")),
+            llm_prompts_days=int(os.getenv("RETENTION_LLM_PROMPTS_DAYS", "365")),
+            llm_results_days=int(os.getenv("RETENTION_LLM_RESULTS_DAYS", "365")),
+            lakehouse_days=int(os.getenv("RETENTION_LAKEHOUSE_DAYS", "730")),
+            audit_days=int(os.getenv("RETENTION_AUDIT_DAYS", "730")),
+            cleanup_interval_hours=int(os.getenv("RETENTION_CLEANUP_INTERVAL_HOURS", "24")),
+            batch_size=int(os.getenv("RETENTION_BATCH_SIZE", "1000")),
+        ),
+        alerting=AlertingConfig(
+            source_failure_threshold=int(os.getenv("ALERT_SOURCE_FAILURE_THRESHOLD", "3")),
+            source_failure_window_hours=int(os.getenv("ALERT_SOURCE_FAILURE_WINDOW_HOURS", "6")),
+            schema_failure_rate_threshold=float(os.getenv("ALERT_SCHEMA_FAILURE_RATE_THRESHOLD", "0.3")),
+            schema_failure_window_hours=int(os.getenv("ALERT_SCHEMA_FAILURE_WINDOW_HOURS", "1")),
+            lake_lag_threshold_minutes=int(os.getenv("ALERT_LAKE_LAG_THRESHOLD_MINUTES", "60")),
+            broker_error_threshold=int(os.getenv("ALERT_BROKER_ERROR_THRESHOLD", "3")),
+            broker_error_window_hours=int(os.getenv("ALERT_BROKER_ERROR_WINDOW_HOURS", "1")),
+            check_interval_seconds=int(os.getenv("ALERT_CHECK_INTERVAL_SECONDS", "120")),
+        ),
        log_level=os.getenv("LOG_LEVEL", "INFO"),
+        json_logs=os.getenv("JSON_LOGS", "true").lower() == "true",
    )
@@ -0,0 +1,43 @@
+"""Canonical URL normalization and content hashing utilities.
+
+Provides consistent URL canonicalization and SHA-256 content hashing
+across all ingestion adapters and pipeline stages.
+
+Requirements: 3.2, 3.3
+"""
+import hashlib
+from urllib.parse import parse_qsl, urlencode, urlparse
+
+
+def normalize_url(url: str) -> str:
+    """Canonical URL normalization.
+
+    - Lowercases scheme and host
+    - Strips fragments
+    - Strips trailing slashes from path (preserves root "/")
+    - Strips default ports (80, 443)
+    - Sorts query parameters for deterministic comparison
+    - Defaults scheme to https if missing
+    """
+    parsed = urlparse(url)
+    scheme = (parsed.scheme or "https").lower()
+    netloc = (parsed.hostname or "").lower()
+    if parsed.port and parsed.port not in (80, 443):
+        netloc = f"{netloc}:{parsed.port}"
+    path = parsed.path.rstrip("/") or "/"
+    # Sort query params for deterministic ordering
+    query = urlencode(sorted(parse_qsl(parsed.query)))
+    normalized = f"{scheme}://{netloc}{path}"
+    if query:
+        normalized = f"{normalized}?{query}"
+    return normalized
+
+
+def content_hash(data: bytes) -> str:
+    """Compute a stable SHA-256 hex digest for raw content bytes."""
+    return hashlib.sha256(data).hexdigest()
+
+
+def content_hash_str(text: str, encoding: str = "utf-8") -> str:
+    """Compute a stable SHA-256 hex digest for a text string."""
+    return hashlib.sha256(text.encode(encoding)).hexdigest()
@@ -0,0 +1,134 @@
+"""Dead-letter queue (DLQ) support and replay tooling.
+
+When a worker fails to process a job after exhausting retries, the job
+is pushed to a per-queue dead-letter list in Redis.  Each DLQ entry
+wraps the original payload with failure metadata (error message,
+timestamp, attempt count) so operators can inspect and replay later.
+
+Replay moves items from the DLQ back to the source queue for
+reprocessing.
+
+Requirements: 12.1 (observability), design section 8 (data flows)
+"""
+from __future__ import annotations
+
+import json
+import logging
+from datetime import datetime, timezone
+from typing import Any
+
+import redis.asyncio as aioredis
+
+from services.shared.redis_keys import dlq_key, queue_key
+
+logger = logging.getLogger(__name__)
+
+# Default max attempts before a job is dead-lettered
+DEFAULT_MAX_ATTEMPTS = 3
+
+
+def _now_iso() -> str:
+    return datetime.now(timezone.utc).isoformat()
+
+
+def wrap_dlq_entry(
+    original_payload: dict[str, Any],
+    queue_name: str,
+    error: str,
+    attempt: int = 1,
+    worker: str = "",
+) -> dict[str, Any]:
+    """Wrap an original job payload with DLQ metadata."""
+    return {
+        "original_payload": original_payload,
+        "queue": queue_name,
+        "error": error,
+        "attempt": attempt,
+        "worker": worker,
+        "dead_lettered_at": _now_iso(),
+    }
+
+
+async def send_to_dlq(
+    rds: aioredis.Redis,
+    queue_name: str,
+    original_payload: dict[str, Any],
+    error: str,
+    attempt: int = 1,
+    worker: str = "",
+) -> None:
+    """Push a failed job to the dead-letter queue for *queue_name*."""
+    entry = wrap_dlq_entry(original_payload, queue_name, error, attempt, worker)
+    await rds.rpush(dlq_key(queue_name), json.dumps(entry, default=str))
+    logger.warning(
+        "Dead-lettered job on %s after %d attempts: %s",
+        queue_name, attempt, error,
+        extra={"queue": queue_name, "attempt": attempt},
+    )
+
+
+async def dlq_length(rds: aioredis.Redis, queue_name: str) -> int:
+    """Return the number of items in the DLQ for *queue_name*."""
+    return await rds.llen(dlq_key(queue_name))
+
+
+async def peek_dlq(
+    rds: aioredis.Redis,
+    queue_name: str,
+    start: int = 0,
+    count: int = 10,
+) -> list[dict[str, Any]]:
+    """Return DLQ entries without removing them (for inspection)."""
+    raw_items = await rds.lrange(dlq_key(queue_name), start, start + count - 1)
+    return [json.loads(item) for item in raw_items]
+
+
+async def replay_one(rds: aioredis.Redis, queue_name: str) -> dict[str, Any] | None:
+    """Pop the oldest DLQ entry and re-enqueue its original payload.
+
+    Returns the replayed DLQ entry, or None if the DLQ is empty.
+    """
+    raw = await rds.lpop(dlq_key(queue_name))
+    if raw is None:
+        return None
+    entry = json.loads(raw)
+    original = entry.get("original_payload", entry)
+    await rds.rpush(queue_key(queue_name), json.dumps(original, default=str))
+    logger.info("Replayed 1 job from DLQ back to %s", queue_name)
+    return entry
+
+
+async def replay_all(rds: aioredis.Redis, queue_name: str) -> int:
+    """Replay every item in the DLQ back to the source queue.
+
+    Returns the number of items replayed.
+    """
+    count = 0
+    while True:
+        raw = await rds.lpop(dlq_key(queue_name))
+        if raw is None:
+            break
+        entry = json.loads(raw)
+        original = entry.get("original_payload", entry)
+        await rds.rpush(queue_key(queue_name), json.dumps(original, default=str))
+        count += 1
+    if count:
+        logger.info("Replayed %d jobs from DLQ back to %s", count, queue_name)
+    return count
+
+
+async def purge_dlq(rds: aioredis.Redis, queue_name: str) -> int:
+    """Delete all items from the DLQ for *queue_name*. Returns count removed."""
+    key = dlq_key(queue_name)
+    length = await rds.llen(key)
+    if length:
+        await rds.delete(key)
+    return length
+
+
+async def dlq_summary(rds: aioredis.Redis, queue_names: list[str]) -> dict[str, int]:
+    """Return a mapping of queue_name -> DLQ depth for the given queues."""
+    result: dict[str, int] = {}
+    for name in queue_names:
+        result[name] = await rds.llen(dlq_key(name))
+    return result
@@ -0,0 +1,198 @@
+"""Cross-source deduplication for articles and filings.
+
+Detects duplicate documents across different source types (news_api,
+filings_api, web_scrape) using a layered approach:
+
+1. Redis fast-path: check content_hash and canonical_url markers for
+   recently-seen documents (TTL-bounded, cheap).
+2. PostgreSQL fallback: query the documents table by canonical_url or
+   content_hash for durable cross-source matching.
+
+When a duplicate is detected the caller receives the existing document_id
+so it can link additional company mentions without re-inserting the document.
+
+Requirements: 3.2, 3.3
+"""
+from __future__ import annotations
+
+import logging
+from dataclasses import dataclass
+from typing import Any
+
+import asyncpg
+import redis.asyncio as aioredis
+
+from services.shared.content import content_hash_str, normalize_url
+from services.shared.redis_keys import DEDUPE_PREFIX
+
+logger = logging.getLogger("dedupe")
+
+# Redis TTL for dedupe markers (24 hours)
+DEDUPE_TTL_SECONDS: int = 86400
+
+
+def _url_dedupe_key(canonical_url: str) -> str:
+    """Build a Redis key for URL-based deduplication."""
+    return f"{DEDUPE_PREFIX}:url:{content_hash_str(canonical_url)}"
+
+
+def _hash_dedupe_key(content_hash: str) -> str:
+    """Build a Redis key for content-hash-based deduplication."""
+    return f"{DEDUPE_PREFIX}:{content_hash}"
+
+
+@dataclass
+class DedupeResult:
+    """Result of a deduplication check.
+
+    Returned by ``check_duplicate``. When no match is found only
+    ``is_duplicate`` is set (to False) and the other fields keep their
+    ``None`` defaults.
+    """
+
+    # True when an existing document matched by content hash or URL.
+    is_duplicate: bool
+    # ID of the matching document, as read from Redis or PostgreSQL.
+    existing_document_id: str | None = None
+    # Which identifier produced the match.
+    match_type: str | None = None  # "content_hash" | "canonical_url" | None
+
+
+async def check_duplicate(
+    pool: asyncpg.Pool,
+    rds: aioredis.Redis,
+    *,
+    content_hash: str,
+    url: str | None = None,
+    canonical_url: str | None = None,
+) -> DedupeResult:
+    """Check whether a document is a duplicate across all source types.
+
+    Checks in order of cost:
+    1. Redis content_hash marker (fast path)
+    2. Redis canonical_url marker (fast path)
+    3. PostgreSQL documents.content_hash (durable)
+    4. PostgreSQL documents.canonical_url (cross-source)
+
+    Returns a DedupeResult indicating whether the document already exists.
+    """
+    # Resolve canonical URL if only raw URL provided
+    resolved_canonical = canonical_url or (normalize_url(url) if url else None)
+
+    # --- Redis fast path: content hash ---
+    if content_hash:
+        redis_key = _hash_dedupe_key(content_hash)
+        cached_id = await rds.get(redis_key)
+        if cached_id:
+            logger.debug("Dedupe hit (redis content_hash) for %s", content_hash[:16])
+            return DedupeResult(
+                is_duplicate=True,
+                existing_document_id=str(cached_id),
+                match_type="content_hash",
+            )
+
+    # --- Redis fast path: canonical URL ---
+    if resolved_canonical:
+        url_key = _url_dedupe_key(resolved_canonical)
+        cached_id = await rds.get(url_key)
+        if cached_id:
+            logger.debug("Dedupe hit (redis canonical_url) for %s", resolved_canonical[:60])
+            return DedupeResult(
+                is_duplicate=True,
+                existing_document_id=str(cached_id),
+                match_type="canonical_url",
+            )
+
+    # --- PostgreSQL fallback: content hash ---
+    if content_hash:
+        row = await pool.fetchrow(
+            "SELECT id FROM documents WHERE content_hash = $1 LIMIT 1",
+            content_hash,
+        )
+        if row:
+            doc_id = str(row["id"])
+            # Warm the Redis cache for future checks
+            await _set_dedupe_markers(rds, content_hash, resolved_canonical, doc_id)
+            logger.debug("Dedupe hit (pg content_hash) for %s", content_hash[:16])
+            return DedupeResult(
+                is_duplicate=True,
+                existing_document_id=doc_id,
+                match_type="content_hash",
+            )
+
+    # --- PostgreSQL fallback: canonical URL ---
+    if resolved_canonical:
+        row = await pool.fetchrow(
+            "SELECT id FROM documents WHERE canonical_url = $1 LIMIT 1",
+            resolved_canonical,
+        )
+        if row:
+            doc_id = str(row["id"])
+            await _set_dedupe_markers(rds, content_hash, resolved_canonical, doc_id)
+            logger.debug("Dedupe hit (pg canonical_url) for %s", resolved_canonical[:60])
+            return DedupeResult(
+                is_duplicate=True,
+                existing_document_id=doc_id,
+                match_type="canonical_url",
+            )
+
+    return DedupeResult(is_duplicate=False)
+
+
+async def mark_as_seen(
+    rds: aioredis.Redis,
+    *,
+    content_hash: str,
+    canonical_url: str | None,
+    document_id: str,
+) -> None:
+    """Mark a newly-persisted document in Redis for fast future dedupe checks."""
+    await _set_dedupe_markers(rds, content_hash, canonical_url, document_id)
+
+
+async def _set_dedupe_markers(
+    rds: aioredis.Redis,
+    content_hash: str | None,
+    canonical_url: str | None,
+    document_id: str,
+) -> None:
+    """Set Redis dedupe markers for both content hash and canonical URL."""
+    if content_hash:
+        await rds.set(
+            _hash_dedupe_key(content_hash), document_id, ex=DEDUPE_TTL_SECONDS
+        )
+    if canonical_url:
+        await rds.set(
+            _url_dedupe_key(canonical_url), document_id, ex=DEDUPE_TTL_SECONDS
+        )
+
+
+async def dedupe_items(
+    pool: asyncpg.Pool,
+    rds: aioredis.Redis,
+    items: list[dict[str, Any]],
+) -> tuple[list[dict[str, Any]], list[dict[str, Any]]]:
+    """Partition a list of ingestion items into new and duplicate groups.
+
+    Each item is expected to have at least one of:
+    - content_hash: SHA-256 of the raw content
+    - url / canonical_url: the document URL
+
+    Returns (new_items, duplicate_items).
+    """
+    new_items: list[dict[str, Any]] = []
+    dup_items: list[dict[str, Any]] = []
+
+    for item in items:
+        item_hash = item.get("content_hash", "")
+        item_url = item.get("url") or item.get("link")
+        item_canonical = item.get("canonical_url")
+
+        result = await check_duplicate(
+            pool,
+            rds,
+            content_hash=item_hash,
+            url=item_url,
+            canonical_url=item_canonical,
+        )
+
+        if result.is_duplicate:
+            item["_dedupe_match_type"] = result.match_type
+            item["_dedupe_existing_id"] = result.existing_document_id
+            dup_items.append(item)
+        else:
+            new_items.append(item)
+
+    return new_items, dup_items
@@ -0,0 +1,224 @@
+"""Structured logging and distributed tracing for all Stonks Oracle services.
+
+Provides:
+- JSON-formatted structured log output for machine-parseable log aggregation
+- Trace context (trace_id, span_id, service) propagated through log records
+- Context manager for creating trace spans within a service
+- Helper to configure logging for any service worker or API
+
+Requirements: 12.1
+Design: Section 12 (Observability and Operations)
+"""
+from __future__ import annotations
+
+import json
+import logging
+import time
+import uuid
+from contextvars import ContextVar
+from datetime import datetime, timezone
+from typing import Any
+
+# ---------------------------------------------------------------------------
+# Trace context stored in contextvars for async-safe propagation
+# ---------------------------------------------------------------------------
+
+# Current trace identifier; the empty string means no trace is active.
+_trace_id: ContextVar[str] = ContextVar("trace_id", default="")
+# Identifier of the span currently in progress; empty when there is none.
+_span_id: ContextVar[str] = ContextVar("span_id", default="")
+# Name of the running service; "unknown" until it is set explicitly.
+_service_name: ContextVar[str] = ContextVar("service_name", default="unknown")
+
+
+def get_trace_id() -> str:
+    """Return the trace id of the current context ("" if none is set)."""
+    return _trace_id.get()
+
+
+def get_span_id() -> str:
+    """Return the span id of the current context ("" if none is set)."""
+    return _span_id.get()
+
+
+def get_service_name() -> str:
+    """Return the service name of the current context ("unknown" by default)."""
+    return _service_name.get()
+
+
+def set_trace_context(
+    trace_id: str | None = None,
+    span_id: str | None = None,
+    service: str | None = None,
+) -> None:
+    """Set trace context for the current async task / thread."""
+    if trace_id is not None:
+        _trace_id.set(trace_id)
+    if span_id is not None:
+        _span_id.set(span_id)
+    if service is not None:
+        _service_name.set(service)
+
+
+def new_trace_id() -> str:
+    return uuid.uuid4().hex[:16]
+
+
+def new_span_id() -> str:
+    return uuid.uuid4().hex[:8]
+
+
+# ---------------------------------------------------------------------------
+# Span context manager for tracing within a service
+# ---------------------------------------------------------------------------
+
+
+class Span:
+    """Lightweight span for distributed tracing.
+
+    Usage::
+
+        with Span("process_document", ticker="AAPL") as span:
+            # ... do work ...
+            span.set_attribute("doc_count", 5)
+
+    On exit the span logs its duration and attributes as a structured event.
+    """
+
+    def __init__(self, operation: str, **attributes: Any) -> None:
+        self.operation = operation
+        self.parent_span_id = get_span_id()
+        self.span_id = new_span_id()
+        self.trace_id = get_trace_id() or new_trace_id()
+        self.attributes: dict[str, Any] = dict(attributes)
+        self.start_time: float = 0.0
+        self.duration_ms: float = 0.0
+        self._token_trace: Any = None
+        self._token_span: Any = None
+        self._logger = logging.getLogger(get_service_name() or "tracing")
+
+    def set_attribute(self, key: str, value: Any) -> None:
+        self.attributes[key] = value
+
+    def __enter__(self) -> Span:
+        self.start_time = time.monotonic()
+        self._token_trace = _trace_id.set(self.trace_id)
+        self._token_span = _span_id.set(self.span_id)
+        return self
+
+    def __exit__(self, exc_type: Any, exc_val: Any, exc_tb: Any) -> None:
+        self.duration_ms = (time.monotonic() - self.start_time) * 1000
+        status = "error" if exc_type else "ok"
+
+        self._logger.info(
+            "span.end",
+            extra={
+                "span_operation": self.operation,
+                "span_status": status,
+                "span_duration_ms": round(self.duration_ms, 2),
+                "span_parent_id": self.parent_span_id,
+                "span_attributes": self.attributes,
+            },
+        )
+
+        # Restore parent span context
+        if self._token_span is not None:
+            _span_id.reset(self._token_span)
+        if self._token_trace is not None:
+            _trace_id.reset(self._token_trace)
+
+
+# ---------------------------------------------------------------------------
+# JSON log formatter
+# ---------------------------------------------------------------------------
+
+
+class JSONFormatter(logging.Formatter):
+    """Emit each log record as a single JSON line with trace context."""
+
+    def format(self, record: logging.LogRecord) -> str:
+        log_entry: dict[str, Any] = {
+            "timestamp": datetime.fromtimestamp(record.created, tz=timezone.utc).isoformat(),
+            "level": record.levelname,
+            "logger": record.name,
+            "message": record.getMessage(),
+            "service": get_service_name(),
+            "trace_id": get_trace_id(),
+            "span_id": get_span_id(),
+        }
+
+        # Merge extra fields from Span or manual extra={} usage
+        for key in (
+            "span_operation", "span_status", "span_duration_ms",
+            "span_parent_id", "span_attributes",
+            "ticker", "document_id", "source_type", "job_id",
+            "duration_ms", "error", "count",
+        ):
+            val = getattr(record, key, None)
+            if val is not None:
+                log_entry[key] = val
+
+        if record.exc_info and record.exc_info[1]:
+            log_entry["exception"] = self.formatException(record.exc_info)
+
+        return json.dumps(log_entry, default=str)
+
+
+# ---------------------------------------------------------------------------
+# Setup helper
+# ---------------------------------------------------------------------------
+
+
+def setup_logging(
+    service_name: str,
+    level: str = "INFO",
+    json_output: bool = True,
+) -> None:
+    """Configure structured logging for a service.
+
+    Call this once at service startup (before any log calls).
+
+    Args:
+        service_name: Identifies this service in log output (e.g. "ingestion_worker").
+        level: Log level string (DEBUG, INFO, WARNING, ERROR).
+        json_output: If True, emit JSON lines. If False, use a human-readable format.
+    """
+    _service_name.set(service_name)
+
+    root = logging.getLogger()
+    root.setLevel(getattr(logging, level.upper(), logging.INFO))
+
+    # Remove existing handlers to avoid duplicate output
+    root.handlers.clear()
+
+    handler = logging.StreamHandler()
+    if json_output:
+        handler.setFormatter(JSONFormatter())
+    else:
+        handler.setFormatter(logging.Formatter(
+            "%(asctime)s [%(levelname)s] %(name)s (%(service)s) "
+            "trace=%(trace_id)s span=%(span_id)s — %(message)s",
+            defaults={"service": service_name, "trace_id": "", "span_id": ""},
+        ))
+    root.addHandler(handler)
+
+
+# ---------------------------------------------------------------------------
+# Trace context propagation through job payloads
+# ---------------------------------------------------------------------------
+
+
+def inject_trace_context(payload: dict[str, Any]) -> dict[str, Any]:
+    """Inject current trace context into a job payload dict.
+
+    Call this before enqueuing a job to Redis so the downstream
+    worker can continue the same trace.
+    """
+    trace_id = get_trace_id()
+    if trace_id:
+        payload["_trace_id"] = trace_id
+    return payload
+
+
+def extract_trace_context(payload: dict[str, Any]) -> None:
+    """Extract and set trace context from an incoming job payload.
+
+    Call this at the start of job processing. If no trace context
+    is present, generates a new trace_id.
+    """
+    trace_id = payload.get("_trace_id") or new_trace_id()
+    set_trace_context(trace_id=trace_id, span_id=new_span_id())
@@ -0,0 +1,696 @@
+"""Metadata persistence for market payloads, documents, and broker events.
+
+Persists structured metadata records to PostgreSQL for all ingested artifacts.
+Each source type has its own persistence path:
+- market_api  → market_snapshots table
+- news_api / filings_api / web_scrape → documents + document_company_mentions
+- broker → order_events or market_snapshots (for position/account snapshots)
+
+Requirements: 3.3, 3.4, 8.3, 9.2
+"""
+from __future__ import annotations
+
+import json
+import logging
+from datetime import datetime, timedelta, timezone
+from typing import Any
+
+import asyncpg
+
+from services.shared.content import content_hash_str, normalize_url
+
+logger = logging.getLogger("metadata")
+
+
+async def persist_market_snapshot(
+    pool: asyncpg.Pool,
+    *,
+    company_id: str | None,
+    ticker: str,
+    snapshot_type: str,
+    data: dict[str, Any],
+    source_provider: str,
+    storage_ref: str,
+    content_hash: str,
+    captured_at: datetime | None = None,
+) -> str:
+    """Persist a market data snapshot to PostgreSQL.
+
+    Returns the snapshot row UUID.
+    """
+    ts = captured_at or datetime.now(timezone.utc)
+    row_id = await pool.fetchval(
+        """INSERT INTO market_snapshots
+               (company_id, ticker, snapshot_type, data, source_provider,
+                captured_at, storage_ref, content_hash)
+           VALUES ($1, $2, $3, $4::jsonb, $5, $6, $7, $8)
+           RETURNING id""",
+        company_id,
+        ticker,
+        snapshot_type,
+        json.dumps(data),
+        source_provider,
+        ts,
+        storage_ref,
+        content_hash,
+    )
+    logger.debug("Persisted market snapshot %s for %s", row_id, ticker)
+    return str(row_id)
+
+
+async def persist_document(
+    pool: asyncpg.Pool,
+    *,
+    document_type: str,
+    source_type: str,
+    publisher: str,
+    url: str | None,
+    canonical_url: str | None,
+    title: str,
+    published_at: datetime | None,
+    content_hash: str,
+    storage_ref: str,
+    language: str = "en",
+) -> str | None:
+    """Persist a document metadata record to PostgreSQL.
+
+    Returns the document row UUID, or None if a duplicate content_hash exists.
+    """
+    exists = await pool.fetchval(
+        "SELECT 1 FROM documents WHERE content_hash = $1", content_hash
+    )
+    if exists:
+        return None
+
+    doc_id = await pool.fetchval(
+        """INSERT INTO documents
+               (document_type, source_type, publisher, url, canonical_url,
+                title, published_at, content_hash, raw_storage_ref,
+                language, status)
+           VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, 'ingested')
+           RETURNING id""",
+        document_type,
+        source_type,
+        publisher,
+        url,
+        canonical_url,
+        title,
+        published_at,
+        content_hash,
+        storage_ref,
+        language,
+    )
+    logger.debug("Persisted document %s (%s)", doc_id, title[:60] if title else "")
+    return str(doc_id)
+
+
+async def update_document_parse_results(
+    pool: asyncpg.Pool,
+    *,
+    document_id: str,
+    normalized_storage_ref: str | None,
+    parser_output_ref: str | None,
+    parse_quality_score: float,
+    parse_confidence: str,
+    status: str,
+) -> None:
+    """Update a document row with parser output references and quality scores.
+
+    Called after the parsing stage to persist normalized text location,
+    structured parser output location, quality score, and confidence.
+
+    Requirements: 4.1, 4.3, 9.1
+    """
+    await pool.execute(
+        """UPDATE documents SET
+               normalized_storage_ref = $2,
+               parser_output_ref = $3,
+               parse_quality_score = $4,
+               parse_confidence = $5,
+               status = $6,
+               updated_at = NOW()
+           WHERE id = $1""",
+        document_id,
+        normalized_storage_ref,
+        parser_output_ref,
+        parse_quality_score,
+        parse_confidence,
+        status,
+    )
+    logger.debug(
+        "Updated document %s parse results: quality=%.2f confidence=%s status=%s",
+        document_id, parse_quality_score, parse_confidence, status,
+    )
+
+
+async def persist_document_company_mention(
+    pool: asyncpg.Pool,
+    *,
+    document_id: str,
+    company_id: str,
+    ticker: str,
+    mention_type: str = "direct",
+    confidence: float = 1.0,
+) -> str:
+    """Link a document to a company via document_company_mentions.
+
+    Returns the mention row UUID.
+    """
+    mention_id = await pool.fetchval(
+        """INSERT INTO document_company_mentions
+               (document_id, company_id, ticker, mention_type, confidence)
+           VALUES ($1::uuid, $2::uuid, $3, $4, $5)
+           RETURNING id""",
+        document_id,
+        company_id,
+        ticker,
+        mention_type,
+        confidence,
+    )
+    return str(mention_id)
+
+
+async def persist_broker_event(
+    pool: asyncpg.Pool,
+    *,
+    ticker: str,
+    event_type: str,
+    data: dict[str, Any],
+    source_provider: str,
+    storage_ref: str,
+    content_hash: str,
+    captured_at: datetime | None = None,
+) -> str:
+    """Persist a broker event snapshot to market_snapshots.
+
+    Broker position/account snapshots are stored as market_snapshots
+    with snapshot_type prefixed by 'broker_' (e.g. broker_positions,
+    broker_account, broker_orders).
+
+    Returns the snapshot row UUID.
+    """
+    ts = captured_at or datetime.now(timezone.utc)
+    row_id = await pool.fetchval(
+        """INSERT INTO market_snapshots
+               (ticker, snapshot_type, data, source_provider,
+                captured_at, storage_ref, content_hash)
+           VALUES ($1, $2, $3::jsonb, $4, $5, $6, $7)
+           RETURNING id""",
+        ticker,
+        f"broker_{event_type}",
+        json.dumps(data),
+        source_provider,
+        ts,
+        storage_ref,
+        content_hash,
+    )
+    logger.debug("Persisted broker event %s for %s", row_id, ticker)
+    return str(row_id)
+
+
+def _resolve_document_type(source_type: str) -> str:
+    """Map source_type to a document_type value."""
+    mapping = {
+        "news_api": "article",
+        "filings_api": "filing",
+        "web_scrape": "press_release",
+    }
+    return mapping.get(source_type, "article")
+
+
+def _extract_publisher(item: dict[str, Any]) -> str:
+    """Extract publisher name from an adapter item dict."""
+    if item.get("publisher"):
+        return str(item["publisher"])
+    source = item.get("source")
+    if isinstance(source, dict):
+        return source.get("name", "")
+    if source:
+        return str(source)
+    return ""
+
+
+def _parse_published_at(item: dict[str, Any]) -> datetime | None:
+    """Parse published_at from various adapter item formats."""
+    raw = item.get("publishedAt") or item.get("published_at")
+    if not raw:
+        return None
+    if isinstance(raw, datetime):
+        return raw
+    try:
+        return datetime.fromisoformat(str(raw).replace("Z", "+00:00"))
+    except (ValueError, TypeError):
+        return None
+
+
+async def persist_ingestion_items(
+    pool: asyncpg.Pool,
+    *,
+    source_type: str,
+    ticker: str,
+    company_id: str | None,
+    items: list[dict[str, Any]],
+    storage_ref: str,
+    adapter_metadata: dict[str, Any],
+    content_hash: str,
+) -> tuple[int, list[str]]:
+    """Route ingestion items to the correct persistence path.
+
+    Returns (new_item_count, list_of_new_ids).
+    """
+    if source_type == "market_api":
+        return await _persist_market_items(
+            pool,
+            ticker=ticker,
+            company_id=company_id,
+            items=items,
+            storage_ref=storage_ref,
+            provider=adapter_metadata.get("provider", "unknown"),
+            content_hash=content_hash,
+        )
+
+    if source_type == "broker":
+        return await _persist_broker_items(
+            pool,
+            ticker=ticker,
+            items=items,
+            storage_ref=storage_ref,
+            provider=adapter_metadata.get("provider", "unknown"),
+            endpoint=adapter_metadata.get("endpoint", "positions"),
+            content_hash=content_hash,
+        )
+
+    # Document types: news_api, filings_api, web_scrape
+    return await _persist_document_items(
+        pool,
+        source_type=source_type,
+        ticker=ticker,
+        company_id=company_id,
+        items=items,
+        storage_ref=storage_ref,
+    )
+
+
+async def _persist_market_items(
+    pool: asyncpg.Pool,
+    *,
+    ticker: str,
+    company_id: str | None,
+    items: list[dict[str, Any]],
+    storage_ref: str,
+    provider: str,
+    content_hash: str,
+) -> tuple[int, list[str]]:
+    """Persist market data items as market_snapshots rows."""
+    ids: list[str] = []
+    for item in items:
+        item_hash = content_hash_str(json.dumps(item, sort_keys=True))
+        # Skip duplicates
+        exists = await pool.fetchval(
+            "SELECT 1 FROM market_snapshots WHERE content_hash = $1", item_hash
+        )
+        if exists:
+            continue
+
+        snapshot_type = _infer_market_snapshot_type(item)
+        row_id = await persist_market_snapshot(
+            pool,
+            company_id=company_id,
+            ticker=ticker,
+            snapshot_type=snapshot_type,
+            data=item,
+            source_provider=provider,
+            storage_ref=storage_ref,
+            content_hash=item_hash,
+        )
+        ids.append(row_id)
+    return len(ids), ids
+
+
+def _infer_market_snapshot_type(item: dict[str, Any]) -> str:
+    """Infer snapshot_type from market data item fields."""
+    # Polygon aggregate bars have 'o', 'h', 'l', 'c' fields
+    if all(k in item for k in ("o", "h", "l", "c")):
+        return "bar"
+    # Ticker details have 'market_cap' or 'sic_code'
+    if "market_cap" in item or "sic_code" in item:
+        return "ticker_details"
+    # Quote snapshots
+    if "ask" in item or "bid" in item:
+        return "quote"
+    return "snapshot"
+
+
+async def _persist_broker_items(
+    pool: asyncpg.Pool,
+    *,
+    ticker: str,
+    items: list[dict[str, Any]],
+    storage_ref: str,
+    provider: str,
+    endpoint: str,
+    content_hash: str,
+) -> tuple[int, list[str]]:
+    """Persist broker fetch items as market_snapshots with broker_ prefix."""
+    ids: list[str] = []
+    for item in items:
+        item_hash = content_hash_str(json.dumps(item, sort_keys=True))
+        exists = await pool.fetchval(
+            "SELECT 1 FROM market_snapshots WHERE content_hash = $1", item_hash
+        )
+        if exists:
+            continue
+
+        row_id = await persist_broker_event(
+            pool,
+            ticker=ticker,
+            event_type=endpoint,
+            data=item,
+            source_provider=provider,
+            storage_ref=storage_ref,
+            content_hash=item_hash,
+        )
+        ids.append(row_id)
+    return len(ids), ids
+
+
+async def _persist_document_items(
+    pool: asyncpg.Pool,
+    *,
+    source_type: str,
+    ticker: str,
+    company_id: str | None,
+    items: list[dict[str, Any]],
+    storage_ref: str,
+) -> tuple[int, list[str]]:
+    """Write fetched document items through ``persist_document``.
+
+    For every item the title falls back from ``title`` to ``name``, the URL
+    from ``url`` to ``link``, and the canonical URL from ``canonical_url`` to
+    ``normalize_url(url)`` (or ``None`` when there is no URL).  Items for
+    which ``persist_document`` returns ``None`` are skipped.  When a
+    ``company_id`` is given, each stored document is also linked to it via
+    ``persist_document_company_mention``.
+
+    Returns:
+        ``(count, ids)`` for the documents stored by this call.
+    """
+    document_type = _resolve_document_type(source_type)
+    stored_ids: list[str] = []
+
+    for entry in items:
+        # Prefer a hash supplied with the item; otherwise hash its sorted JSON.
+        fingerprint = entry.get("content_hash") or content_hash_str(
+            json.dumps(entry, sort_keys=True)
+        )
+        fallback_title = entry.get("name", "")
+        headline = entry.get("title", fallback_title)
+        fallback_link = entry.get("link", "")
+        link = entry.get("url", fallback_link)
+        canonical = entry.get("canonical_url")
+        if not canonical:
+            canonical = normalize_url(link) if link else None
+        when_published = _parse_published_at(entry)
+        source_name = _extract_publisher(entry)
+
+        new_doc_id = await persist_document(
+            pool,
+            document_type=document_type,
+            source_type=source_type,
+            publisher=source_name,
+            url=link or None,
+            canonical_url=canonical,
+            title=headline,
+            published_at=when_published,
+            content_hash=fingerprint,
+            storage_ref=storage_ref,
+        )
+        if new_doc_id is not None:
+            # Link document to company if we have a company_id
+            if company_id:
+                await persist_document_company_mention(
+                    pool,
+                    document_id=new_doc_id,
+                    company_id=company_id,
+                    ticker=ticker,
+                )
+            stored_ids.append(new_doc_id)
+
+    return len(stored_ids), stored_ids
+
+
+# --- Retry and failure tracking (Requirement 3.4) ---
+
+# Backoff constants — match scheduler defaults for consistency
+RETRY_BACKOFF_BASE: int = 60
+RETRY_BACKOFF_MAX: int = 3600
+RETRY_MAX_COUNT: int = 10
+
+
+def compute_next_retry_at(
+    retry_count: int,
+    now: datetime | None = None,
+    base: int = RETRY_BACKOFF_BASE,
+    cap: int = RETRY_BACKOFF_MAX,
+) -> datetime:
+    """Return when a source may next be retried, using exponential backoff.
+
+    The delay is ``base * 2 ** min(retry_count, 8)`` seconds, limited to
+    ``cap`` seconds.
+
+    Args:
+        retry_count: Current retry count (before incrementing).
+        now: Reference timestamp (defaults to UTC now).
+        base: Base delay in seconds.
+        cap: Maximum delay in seconds.
+
+    Returns:
+        The reference timestamp shifted forward by the computed delay.
+    """
+    reference = now or datetime.now(timezone.utc)
+    # The exponent is clamped at 8 so the multiplier never exceeds 256.
+    exponent = min(retry_count, 8)
+    wait_seconds = base * (2 ** exponent)
+    if wait_seconds > cap:
+        wait_seconds = cap
+    return reference + timedelta(seconds=wait_seconds)
+
+
+async def get_source_retry_count(
+    pool: asyncpg.Pool,
+    source_id: str,
+) -> int:
+    """Look up the retry count carried by a source's latest ingestion run.
+
+    Only the single newest ``ingestion_runs`` row (by ``started_at``) is
+    inspected.  Its ``retry_count`` is returned when that row's status is
+    ``'failed'``; in every other case (no rows, a different status, or a
+    NULL/zero count) the result is 0.
+    """
+    latest = await pool.fetchrow(
+        """SELECT status, retry_count
+           FROM ingestion_runs
+           WHERE source_id = $1::uuid
+           ORDER BY started_at DESC
+           LIMIT 1""",
+        source_id,
+    )
+    if not latest or latest["status"] != "failed":
+        return 0
+    return latest["retry_count"] or 0
+
+
+async def record_retrieval_failure(
+    pool: asyncpg.Pool,
+    run_id: str,
+    source_id: str,
+    error_message: str,
+    retry_count: int | None = None,
+    now: datetime | None = None,
+) -> dict[str, Any]:
+    """Mark an ingestion run as failed and store its retry bookkeeping.
+
+    The ``ingestion_runs`` row identified by ``run_id`` is updated with
+    status ``'failed'``, the error message, the new retry count, the next
+    retry time (exponential backoff) and a completion timestamp.
+
+    Args:
+        pool: Database pool used for the lookup and the update.
+        run_id: Identifier of the run row to update.
+        source_id: Source the run belongs to.
+        error_message: Failure reason to store and log.
+        retry_count: Count *before* this failure.  It is incremented by one
+            here; when omitted, the count is looked up through
+            ``get_source_retry_count`` and then incremented.
+        now: Reference timestamp (defaults to UTC now).
+
+    Returns:
+        A dict with ``run_id``, ``source_id``, ``retry_count``,
+        ``next_retry_at`` (ISO string), ``exhausted`` and ``error_message``.
+
+    Requirement 3.4
+    """
+    failed_at = now or datetime.now(timezone.utc)
+
+    # NOTE(review): the lookup reads the newest run for the source; if the row
+    # for ``run_id`` already exists with a non-failed status it would be the
+    # one inspected and the count would restart at 1 — confirm with callers.
+    previous = (
+        await get_source_retry_count(pool, source_id)
+        if retry_count is None
+        else retry_count
+    )
+    attempt = previous + 1
+
+    # Backoff is derived from the count before this failure was added.
+    next_retry = compute_next_retry_at(attempt - 1, now=failed_at)
+    exhausted = attempt >= RETRY_MAX_COUNT
+
+    update_sql = """UPDATE ingestion_runs
+           SET status = 'failed',
+               error_message = $2,
+               retry_count = $3,
+               next_retry_at = $4,
+               completed_at = $5
+           WHERE id = $1"""
+    await pool.execute(
+        update_sql, run_id, error_message, attempt, next_retry, failed_at
+    )
+
+    next_retry_iso = next_retry.isoformat()
+    if exhausted:
+        logger.warning(
+            "Source %s exhausted retries (%d/%d): %s",
+            source_id, attempt, RETRY_MAX_COUNT, error_message,
+        )
+    else:
+        logger.info(
+            "Source %s failed (retry %d/%d), next retry at %s: %s",
+            source_id, attempt, RETRY_MAX_COUNT,
+            next_retry_iso, error_message,
+        )
+
+    return {
+        "run_id": run_id,
+        "source_id": source_id,
+        "retry_count": attempt,
+        "next_retry_at": next_retry_iso,
+        "exhausted": exhausted,
+        "error_message": error_message,
+    }
+
+
+async def persist_document_intelligence(
+    pool: asyncpg.Pool,
+    *,
+    document_id: str,
+    summary: str,
+    macro_themes: list[str],
+    novelty_score: float,
+    source_credibility: float,
+    extraction_warnings: list[str],
+    confidence: float,
+    model_provider: str,
+    model_name: str,
+    prompt_version: str,
+    schema_version: str,
+    raw_output_ref: str | None = None,
+    prompt_ref: str | None = None,
+    validation_status: str = "valid",
+    validation_errors: list[str] | None = None,
+    retry_count: int = 0,
+) -> str:
+    """Insert one ``document_intelligence`` row and return its id as a string.
+
+    The list-valued arguments (``macro_themes``, ``extraction_warnings``,
+    ``validation_errors``) are JSON-encoded and cast to ``jsonb`` in the
+    statement; a missing ``validation_errors`` is stored as an empty list.
+
+    Requirements: 5.3, 5.4, 9.2
+    """
+    insert_sql = """INSERT INTO document_intelligence
+               (document_id, summary, macro_themes, novelty_score,
+                source_credibility, extraction_warnings, confidence,
+                model_provider, model_name, prompt_version, schema_version,
+                raw_output_ref, prompt_ref, validation_status,
+                validation_errors, retry_count)
+           VALUES ($1::uuid, $2, $3::jsonb, $4, $5, $6::jsonb, $7,
+                   $8, $9, $10, $11, $12, $13, $14, $15::jsonb, $16)
+           RETURNING id"""
+    # Positional order below must match the $1..$16 placeholders.
+    params = (
+        document_id,
+        summary,
+        json.dumps(macro_themes),
+        novelty_score,
+        source_credibility,
+        json.dumps(extraction_warnings),
+        confidence,
+        model_provider,
+        model_name,
+        prompt_version,
+        schema_version,
+        raw_output_ref,
+        prompt_ref,
+        validation_status,
+        json.dumps(validation_errors or []),
+        retry_count,
+    )
+    new_id = await pool.fetchval(insert_sql, *params)
+    logger.debug("Persisted document intelligence %s for doc %s", new_id, document_id)
+    return str(new_id)
+
+
+async def persist_document_impact(
+    pool: asyncpg.Pool,
+    *,
+    intelligence_id: str,
+    company_id: str,
+    ticker: str,
+    relevance: float,
+    sentiment: str,
+    impact_score: float,
+    impact_horizon: str,
+    catalyst_type: str,
+    key_facts: list[str],
+    risks: list[str],
+    evidence_spans: list[str],
+) -> str:
+    """Insert one ``document_impact_records`` row and return its id as a string.
+
+    The row references the intelligence record given by ``intelligence_id``.
+    ``key_facts``, ``risks`` and ``evidence_spans`` are JSON-encoded and cast
+    to ``jsonb`` in the statement.
+
+    Requirements: 5.3, 5.5, 9.2
+    """
+    insert_sql = """INSERT INTO document_impact_records
+               (intelligence_id, company_id, ticker, relevance, sentiment,
+                impact_score, impact_horizon, catalyst_type,
+                key_facts, risks, evidence_spans)
+           VALUES ($1::uuid, $2::uuid, $3, $4, $5, $6, $7, $8,
+                   $9::jsonb, $10::jsonb, $11::jsonb)
+           RETURNING id"""
+    # Positional order below must match the $1..$11 placeholders.
+    params = (
+        intelligence_id,
+        company_id,
+        ticker,
+        relevance,
+        sentiment,
+        impact_score,
+        impact_horizon,
+        catalyst_type,
+        json.dumps(key_facts),
+        json.dumps(risks),
+        json.dumps(evidence_spans),
+    )
+    new_id = await pool.fetchval(insert_sql, *params)
+    logger.debug("Persisted impact record %s for %s", new_id, ticker)
+    return str(new_id)
+
+
+async def update_document_status(
+    pool: asyncpg.Pool,
+    *,
+    document_id: str,
+    status: str,
+) -> None:
+    """Set ``status`` (and refresh ``updated_at``) on one ``documents`` row.
+
+    Used to advance documents through the pipeline: ingested → parsed → extracted → failed.
+    The status value is written as given; it is not validated here.
+
+    Requirements: 5.4
+    """
+    update_sql = """UPDATE documents SET status = $2, updated_at = NOW() WHERE id = $1::uuid"""
+    await pool.execute(update_sql, document_id, status)
+    logger.debug("Updated document %s status to %s", document_id, status)
+
+
+async def reset_source_retry_state(
+    pool: asyncpg.Pool,
+    source_id: str,
+) -> None:
+    """Clear the retry bookkeeping on a source's newest ingestion run.
+
+    The run with the latest ``started_at`` for ``source_id`` gets
+    ``retry_count = 0`` and ``next_retry_at = NULL``; older runs are left
+    untouched.  Intended to be called after a successful ingestion so that
+    accumulated backoff does not carry over.
+    """
+    reset_sql = """UPDATE ingestion_runs
+           SET retry_count = 0, next_retry_at = NULL
+           WHERE id = (
+               SELECT id FROM ingestion_runs
+               WHERE source_id = $1::uuid
+               ORDER BY started_at DESC
+               LIMIT 1
+           )"""
+    await pool.execute(reset_sql, source_id)
@@ -0,0 +1,317 @@
+"""Prometheus metrics for all Stonks Oracle pipeline stages.
+
+Provides counters, histograms, and gauges covering:
+- Ingestion: items fetched, new items, errors, adapter latency
+- Parsing: documents parsed, quality scores, low-quality flags
+- Extraction: attempts, successes, failures, latency, confidence, retries
+- Aggregation: trend windows computed, signal counts, contradiction scores
+- Lake publication: facts published per table, write latency
+- Trading: orders submitted, rejected, filled, risk evaluations
+
+Requirements: 12.1, 12.2
+Design: Section 12 (Observability and Operations)
+"""
+from __future__ import annotations
+
+from prometheus_client import Counter, Gauge, Histogram, Info
+
+# ---------------------------------------------------------------------------
+# Service info
+# ---------------------------------------------------------------------------
+
+# Info-type metric for service metadata; no key/value pairs are attached at
+# definition time, so they must be supplied elsewhere.
+SERVICE_INFO = Info("stonks_oracle", "Stonks Oracle service metadata")
+
+# ---------------------------------------------------------------------------
+# Ingestion metrics
+# ---------------------------------------------------------------------------
+
+# Every ingestion metric is labelled by source_type; INGESTION_JOBS_TOTAL
+# additionally carries a status label.  Fetched / new / deduped are separate
+# counters rather than one counter with an outcome label.
+INGESTION_JOBS_TOTAL = Counter(
+    "stonks_ingestion_jobs_total",
+    "Total ingestion jobs processed",
+    ["source_type", "status"],
+)
+
+INGESTION_ITEMS_FETCHED = Counter(
+    "stonks_ingestion_items_fetched_total",
+    "Total items fetched from external sources",
+    ["source_type"],
+)
+
+INGESTION_ITEMS_NEW = Counter(
+    "stonks_ingestion_items_new_total",
+    "New (non-duplicate) items ingested",
+    ["source_type"],
+)
+
+INGESTION_ITEMS_DEDUPED = Counter(
+    "stonks_ingestion_items_deduped_total",
+    "Items skipped due to deduplication",
+    ["source_type"],
+)
+
+INGESTION_ERRORS = Counter(
+    "stonks_ingestion_errors_total",
+    "Ingestion errors by source type",
+    ["source_type"],
+)
+
+# Explicit latency buckets from 0.1 s up to 60 s.
+INGESTION_ADAPTER_DURATION = Histogram(
+    "stonks_ingestion_adapter_duration_seconds",
+    "Adapter fetch latency in seconds",
+    ["source_type"],
+    buckets=(0.1, 0.5, 1, 2, 5, 10, 30, 60),
+)
+
+# ---------------------------------------------------------------------------
+# Parsing metrics
+# ---------------------------------------------------------------------------
+
+# Only the job counter is labelled (by status); the quality histogram, the
+# low-quality counter and the duration histogram are label-free.
+PARSE_JOBS_TOTAL = Counter(
+    "stonks_parse_jobs_total",
+    "Total parse jobs processed",
+    ["status"],
+)
+
+# Ten evenly spaced buckets from 0.1 to 1.0.
+PARSE_QUALITY_SCORE = Histogram(
+    "stonks_parse_quality_score",
+    "Distribution of parser quality scores",
+    buckets=(0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0),
+)
+
+PARSE_LOW_QUALITY_TOTAL = Counter(
+    "stonks_parse_low_quality_total",
+    "Documents flagged as low quality by the parser",
+)
+
+# Latency buckets from 50 ms up to 10 s.
+PARSE_DURATION = Histogram(
+    "stonks_parse_duration_seconds",
+    "Parse job duration in seconds",
+    buckets=(0.05, 0.1, 0.25, 0.5, 1, 2, 5, 10),
+)
+
+# ---------------------------------------------------------------------------
+# Extraction metrics
+# ---------------------------------------------------------------------------
+
+# Extraction metrics.  Jobs are labelled by status and token usage by
+# direction; the remaining metrics in this section are label-free.
+EXTRACTION_JOBS_TOTAL = Counter(
+    "stonks_extraction_jobs_total",
+    "Total extraction jobs processed",
+    ["status"],
+)
+
+# Attempts include retries (per the help text); retries are also counted
+# separately below.
+EXTRACTION_ATTEMPTS = Counter(
+    "stonks_extraction_attempts_total",
+    "Total Ollama extraction attempts (including retries)",
+)
+
+EXTRACTION_RETRIES = Counter(
+    "stonks_extraction_retries_total",
+    "Extraction retry count",
+)
+
+# Latency buckets from 1 s up to 120 s.
+EXTRACTION_DURATION = Histogram(
+    "stonks_extraction_duration_seconds",
+    "Extraction total duration in seconds",
+    buckets=(1, 2, 5, 10, 20, 30, 60, 120),
+)
+
+# Ten evenly spaced buckets from 0.1 to 1.0.
+EXTRACTION_CONFIDENCE = Histogram(
+    "stonks_extraction_confidence",
+    "Distribution of extraction confidence scores",
+    buckets=(0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0),
+)
+
+EXTRACTION_VALIDATION_ERRORS = Counter(
+    "stonks_extraction_validation_errors_total",
+    "Total validation errors across extractions",
+)
+
+# NOTE(review): the accepted values of the direction label are not defined
+# in this module — confirm them at the call sites.
+EXTRACTION_TOKEN_ESTIMATE = Counter(
+    "stonks_extraction_tokens_total",
+    "Estimated token usage",
+    ["direction"],
+)
+
+# ---------------------------------------------------------------------------
+# Aggregation metrics
+# ---------------------------------------------------------------------------
+
+# Aggregation metrics are labelled by window, except the contradiction-score
+# histogram, which is label-free.
+AGGREGATION_WINDOWS_COMPUTED = Counter(
+    "stonks_aggregation_windows_total",
+    "Trend windows computed",
+    ["window"],
+)
+
+AGGREGATION_SIGNALS_PROCESSED = Counter(
+    "stonks_aggregation_signals_total",
+    "Signals processed during aggregation",
+    ["window"],
+)
+
+# Buckets are finer below 0.2 and include an explicit 0.0 bound.
+AGGREGATION_CONTRADICTION_SCORE = Histogram(
+    "stonks_aggregation_contradiction_score",
+    "Distribution of contradiction scores in trend windows",
+    buckets=(0.0, 0.05, 0.1, 0.15, 0.2, 0.3, 0.4, 0.5, 0.75, 1.0),
+)
+
+# Latency buckets from 50 ms up to 10 s.
+AGGREGATION_DURATION = Histogram(
+    "stonks_aggregation_duration_seconds",
+    "Aggregation job duration in seconds",
+    ["window"],
+    buckets=(0.05, 0.1, 0.25, 0.5, 1, 2, 5, 10),
+)
+
+# ---------------------------------------------------------------------------
+# Recommendation metrics
+# ---------------------------------------------------------------------------
+
+# Generated recommendations are labelled by action and mode; suppressed
+# recommendations and the confidence histogram are label-free.
+RECOMMENDATION_GENERATED = Counter(
+    "stonks_recommendations_total",
+    "Recommendations generated",
+    ["action", "mode"],
+)
+
+RECOMMENDATION_SUPPRESSED = Counter(
+    "stonks_recommendations_suppressed_total",
+    "Recommendations suppressed due to low data quality",
+)
+
+# Ten evenly spaced buckets from 0.1 to 1.0.
+RECOMMENDATION_CONFIDENCE = Histogram(
+    "stonks_recommendation_confidence",
+    "Distribution of recommendation confidence scores",
+    buckets=(0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0),
+)
+
+# ---------------------------------------------------------------------------
+# Lake publication metrics
+# ---------------------------------------------------------------------------
+
+# All lake publication metrics share a single table_name label.
+LAKE_FACTS_PUBLISHED = Counter(
+    "stonks_lake_facts_published_total",
+    "Analytical facts published to the lakehouse",
+    ["table_name"],
+)
+
+# Latency buckets from 10 ms up to 5 s.
+LAKE_PUBLISH_DURATION = Histogram(
+    "stonks_lake_publish_duration_seconds",
+    "Lake publication write latency in seconds",
+    ["table_name"],
+    buckets=(0.01, 0.05, 0.1, 0.25, 0.5, 1, 2, 5),
+)
+
+LAKE_PUBLISH_ERRORS = Counter(
+    "stonks_lake_publish_errors_total",
+    "Lake publication errors",
+    ["table_name"],
+)
+
+LAKE_PUBLISH_BYTES = Counter(
+    "stonks_lake_publish_bytes_total",
+    "Total bytes written to the lakehouse",
+    ["table_name"],
+)
+
+# ---------------------------------------------------------------------------
+# Trading / broker metrics
+# ---------------------------------------------------------------------------
+
+# Order lifecycle and risk counters.  Label sets differ per metric:
+# submitted (side, order_type, mode), rejected (reason_category),
+# filled (side), duplicates prevented (detected_via).
+ORDERS_SUBMITTED = Counter(
+    "stonks_orders_submitted_total",
+    "Orders submitted to broker",
+    ["side", "order_type", "mode"],
+)
+
+# Per the help text, this counts rejections that happen before broker
+# submission.
+ORDERS_REJECTED = Counter(
+    "stonks_orders_rejected_total",
+    "Orders rejected before broker submission",
+    ["reason_category"],
+)
+
+ORDERS_FILLED = Counter(
+    "stonks_orders_filled_total",
+    "Orders filled by broker",
+    ["side"],
+)
+
+ORDERS_DUPLICATES_PREVENTED = Counter(
+    "stonks_orders_duplicates_prevented_total",
+    "Duplicate orders prevented by idempotency checks",
+    ["detected_via"],
+)
+
+# Whole evaluations are labelled by result; individual failing checks are
+# counted separately, labelled by check_name.
+RISK_EVALUATIONS_TOTAL = Counter(
+    "stonks_risk_evaluations_total",
+    "Risk evaluations performed",
+    ["result"],
+)
+
+RISK_CHECK_FAILURES = Counter(
+    "stonks_risk_check_failures_total",
+    "Individual risk check failures",
+    ["check_name"],
+)
+
+POSITIONS_SYNCED = Counter(
+    "stonks_positions_synced_total",
+    "Position sync operations completed",
+)
+
+# ---------------------------------------------------------------------------
+# Active gauges
+# ---------------------------------------------------------------------------
+
+# Gauge of in-flight jobs, one series per pipeline stage label.
+# NOTE(review): the stage label values are not defined in this module.
+ACTIVE_JOBS = Gauge(
+    "stonks_active_jobs",
+    "Currently processing jobs by stage",
+    ["stage"],
+)
+
+# ---------------------------------------------------------------------------
+# Alerting metrics
+# ---------------------------------------------------------------------------
+
+# Alerting metrics.  Firings are labelled by rule and severity; resolutions
+# and the active gauge are labelled by rule only.
+ALERTS_FIRED = Counter(
+    "stonks_alerts_fired_total",
+    "Total alerts fired by rule",
+    ["rule", "severity"],
+)
+
+ALERTS_RESOLVED = Counter(
+    "stonks_alerts_resolved_total",
+    "Total alerts resolved by rule",
+    ["rule"],
+)
+
+# Label-free histogram; buckets from 10 ms up to 5 s.
+ALERT_CHECK_DURATION = Histogram(
+    "stonks_alert_check_duration_seconds",
+    "Duration of alert evaluation cycle",
+    buckets=(0.01, 0.05, 0.1, 0.25, 0.5, 1, 2, 5),
+)
+
+# Per the help text the value is 1 while a rule fires and 0 once resolved.
+ALERT_ACTIVE = Gauge(
+    "stonks_alert_active",
+    "Whether an alert rule is currently firing (1) or resolved (0)",
+    ["rule"],
+)
+
+# ---------------------------------------------------------------------------
+# Dead-letter queue metrics
+# ---------------------------------------------------------------------------
+
+# Dead-letter queue metrics, all labelled by queue: two counters (items sent,
+# items replayed) and one gauge for the current depth.
+DLQ_ITEMS_TOTAL = Counter(
+    "stonks_dlq_items_total",
+    "Jobs sent to dead-letter queues",
+    ["queue"],
+)
+
+DLQ_REPLAYED_TOTAL = Counter(
+    "stonks_dlq_replayed_total",
+    "Jobs replayed from dead-letter queues",
+    ["queue"],
+)
+
+DLQ_DEPTH = Gauge(
+    "stonks_dlq_depth",
+    "Current dead-letter queue depth",
+    ["queue"],
+)
@@ -46,6 +46,15 @@ def retry_key(job_id: str) -> str:
    return f"{RETRY_PREFIX}:{job_id}"


+# Dead-letter queues: every DLQ key lives under "<PREFIX>:dlq".
+DLQ_PREFIX = f"{PREFIX}:dlq"
+
+
+def dlq_key(queue_name: str) -> str:
+    """Build the dead-letter key for ``queue_name`` as ``<DLQ_PREFIX>:<queue_name>``."""
+    return "{}:{}".format(DLQ_PREFIX, queue_name)
+
+
 # --- Queue names ---
 QUEUE_INGESTION = "ingestion"
 QUEUE_PARSING = "parsing"
@@ -54,3 +63,4 @@ QUEUE_AGGREGATION = "aggregation"
 QUEUE_RECOMMENDATION = "recommendation"
 QUEUE_LAKE_PUBLISH = "lake_publish"
 QUEUE_TRADE = "trade"
+QUEUE_BROKER = "broker_orders"
@@ -0,0 +1,306 @@
+"""Data retention and lifecycle controls for raw and derived artifacts.
+
+Provides configurable per-bucket retention policies, expired object cleanup
+from MinIO, and expired metadata cleanup from PostgreSQL.
+
+Requirements: N3 (preserve source metadata, access policy, and retention policy)
+Design ref: Section 5.2 (MinIO bucket layout), Section 10 (Reliability and Safety)
+"""
+from __future__ import annotations
+
+import logging
+from dataclasses import dataclass
+from datetime import datetime, timedelta, timezone
+
+import asyncpg
+from minio import Minio
+
+from services.shared.config import BUCKET_RETENTION_FIELDS, RetentionConfig
+from services.shared.storage import ALL_BUCKETS
+
+logger = logging.getLogger("retention")
+
+
+@dataclass
+class RetentionPolicy:
+    """Resolved retention policy for a single bucket."""
+    # Bucket the policy applies to.
+    bucket_name: str
+    # Age in days; cutoff_date() subtracts this from the reference time.
+    retention_days: int
+    # Populated from the retention_policies table by load_db_policies();
+    # nothing in this module reads the flag.
+    archive_before_delete: bool = False
+
+
+@dataclass
+class CleanupResult:
+    """Result of a single bucket cleanup run."""
+    bucket_name: str
+    # cleanup_bucket() sets this to the number of expired objects it listed.
+    objects_scanned: int = 0
+    # Number of objects removed successfully.
+    objects_deleted: int = 0
+    # Stored by record_retention_run(); cleanup_bucket() leaves these two at
+    # their defaults.
+    bytes_freed: int = 0
+    db_rows_deleted: int = 0
+
+
+def default_retention_days(bucket: str, config: RetentionConfig) -> int:
+    """Look up a bucket's configured retention, falling back to 365 days.
+
+    ``BUCKET_RETENTION_FIELDS`` maps a bucket name to the config attribute
+    holding its retention.  The fallback applies when the bucket has no
+    mapping or the config object lacks the mapped attribute.
+    """
+    attribute = BUCKET_RETENTION_FIELDS.get(bucket)
+    if not attribute:
+        return 365
+    return getattr(config, attribute, 365)
+
+
+def resolve_policies(config: RetentionConfig) -> list[RetentionPolicy]:
+    """Create one config-default policy per entry of ``ALL_BUCKETS``, in order."""
+    defaults: list[RetentionPolicy] = []
+    for bucket in ALL_BUCKETS:
+        days = default_retention_days(bucket, config)
+        defaults.append(RetentionPolicy(bucket_name=bucket, retention_days=days))
+    return defaults
+
+
+async def load_db_policies(pool: asyncpg.Pool) -> dict[str, RetentionPolicy]:
+    """Fetch active retention overrides from the ``retention_policies`` table.
+
+    Only rows with ``active = TRUE`` and ``artifact_class = 'default'`` are
+    read.  The result is keyed by bucket name; if several rows share a
+    bucket, the last one returned by the query is kept.
+    """
+    rows = await pool.fetch(
+        """SELECT bucket_name, retention_days, archive_before_delete
+           FROM retention_policies
+           WHERE active = TRUE AND artifact_class = 'default'"""
+    )
+    overrides: dict[str, RetentionPolicy] = {}
+    for record in rows:
+        bucket = record["bucket_name"]
+        overrides[bucket] = RetentionPolicy(
+            bucket_name=record["bucket_name"],
+            retention_days=record["retention_days"],
+            archive_before_delete=record["archive_before_delete"],
+        )
+    return overrides
+
+
+def merge_policies(
+    config_policies: list[RetentionPolicy],
+    db_policies: dict[str, RetentionPolicy],
+) -> list[RetentionPolicy]:
+    """Overlay DB overrides onto the config defaults.
+
+    The result has one entry per config policy, in the same order; an entry
+    is replaced by the DB policy registered under the same bucket name.
+    DB policies for buckets without a config policy are not included.
+    """
+    return [
+        db_policies.get(default.bucket_name, default)
+        for default in config_policies
+    ]
+
+
+def cutoff_date(retention_days: int, now: datetime | None = None) -> datetime:
+    """Return the expiry threshold: ``now`` (default UTC now) minus ``retention_days``."""
+    reference = now if now else datetime.now(timezone.utc)
+    retention_window = timedelta(days=retention_days)
+    return reference - retention_window
+
+
+def list_expired_objects(
+    client: Minio,
+    bucket: str,
+    retention_days: int,
+    batch_size: int = 1000,
+    now: datetime | None = None,
+) -> list[str]:
+    """Collect names of objects whose ``last_modified`` precedes the cutoff.
+
+    The bucket is listed recursively.  Objects without a ``last_modified``
+    value or without a name are ignored, and collection stops once
+    ``batch_size`` names have been gathered.  A listing error is logged and
+    whatever was collected up to that point is returned.
+    """
+    threshold = cutoff_date(retention_days, now)
+    names: list[str] = []
+
+    try:
+        for entry in client.list_objects(bucket, recursive=True):
+            modified = entry.last_modified
+            if not modified or not (modified < threshold):
+                continue
+            if entry.object_name:
+                names.append(entry.object_name)
+            if len(names) >= batch_size:
+                break
+    except Exception:
+        logger.exception("Error listing objects in bucket %s", bucket)
+
+    return names
+
+
+def delete_expired_objects(
+    client: Minio,
+    bucket: str,
+    object_names: list[str],
+) -> int:
+    """Remove the named objects from ``bucket`` one at a time.
+
+    A failed removal is logged as a warning (with traceback) and does not
+    stop the remaining removals.
+
+    Returns:
+        How many objects were removed without an error.
+    """
+    removed = 0
+    for object_name in object_names:
+        try:
+            client.remove_object(bucket, object_name)
+        except Exception:
+            logger.warning("Failed to delete %s/%s", bucket, object_name, exc_info=True)
+        else:
+            removed += 1
+    return removed
+
+
+def cleanup_bucket(
+    client: Minio,
+    policy: RetentionPolicy,
+    batch_size: int = 1000,
+    now: datetime | None = None,
+) -> CleanupResult:
+    """Delete one batch of expired objects from the policy's bucket.
+
+    At most one batch (see ``list_expired_objects``) is handled per call.
+    In the returned result ``objects_scanned`` is the number of expired
+    objects found and ``objects_deleted`` the number actually removed; the
+    other counters keep their defaults.
+    """
+    bucket = policy.bucket_name
+    outcome = CleanupResult(bucket_name=bucket)
+
+    expired_names = list_expired_objects(
+        client, bucket, policy.retention_days,
+        batch_size=batch_size, now=now,
+    )
+    outcome.objects_scanned = len(expired_names)
+
+    if not expired_names:
+        logger.debug("Bucket %s: no expired objects (retention=%dd)",
+                      policy.bucket_name, policy.retention_days)
+        return outcome
+
+    outcome.objects_deleted = delete_expired_objects(client, bucket, expired_names)
+    logger.info(
+        "Bucket %s: scanned=%d deleted=%d (retention=%dd)",
+        policy.bucket_name, outcome.objects_scanned,
+        outcome.objects_deleted, policy.retention_days,
+    )
+    return outcome
+
+
+# --- PostgreSQL metadata cleanup ---
+
+# (table name, DELETE statement) pairs run by cleanup_expired_db_records().
+# Each statement takes the cutoff timestamp as $1 and filters on the table's
+# own time column: started_at for ingestion_runs, captured_at for
+# market_snapshots.  The table name is used for logging only.
+DB_CLEANUP_QUERIES: list[tuple[str, str]] = [
+    (
+        "ingestion_runs",
+        "DELETE FROM ingestion_runs WHERE started_at < $1",
+    ),
+    (
+        "market_snapshots",
+        "DELETE FROM market_snapshots WHERE captured_at < $1",
+    ),
+]
+
+
+async def cleanup_expired_db_records(
+    pool: asyncpg.Pool,
+    retention_days: int,
+    now: datetime | None = None,
+) -> int:
+    """Run every statement in ``DB_CLEANUP_QUERIES`` against one cutoff.
+
+    The cutoff is ``now`` (default UTC now) minus ``retention_days``.  All
+    statements run on a single pooled connection.  A failing statement is
+    logged and skipped, so the remaining tables are still cleaned.
+
+    Returns:
+        The total number of rows deleted across all tables.
+    """
+    threshold = cutoff_date(retention_days, now)
+    rows_removed = 0
+
+    async with pool.acquire() as conn:
+        for table_name, statement in DB_CLEANUP_QUERIES:
+            try:
+                status_tag = await conn.execute(statement, threshold)
+                # asyncpg returns "DELETE N"
+                if status_tag:
+                    affected = int(status_tag.split()[-1])
+                else:
+                    affected = 0
+                rows_removed += affected
+                if affected > 0:
+                    logger.info("Cleaned %d expired rows from %s (cutoff=%s)",
+                                affected, table_name, threshold.isoformat())
+            except Exception:
+                logger.exception("Error cleaning table %s", table_name)
+
+    return rows_removed
+
+
+async def record_retention_run(
+    pool: asyncpg.Pool,
+    bucket_name: str,
+    result: CleanupResult,
+    status: str = "completed",
+    error_message: str | None = None,
+) -> None:
+    """Insert one ``retention_runs`` row describing a bucket cleanup.
+
+    The counters are taken from ``result``; ``completed_at`` is set by the
+    database via ``NOW()``.
+    """
+    insert_sql = """INSERT INTO retention_runs
+           (bucket_name, objects_scanned, objects_deleted, bytes_freed,
+            db_rows_deleted, completed_at, status, error_message)
+           VALUES ($1, $2, $3, $4, $5, NOW(), $6, $7)"""
+    # Positional order below must match the $1..$7 placeholders.
+    params = (
+        bucket_name,
+        result.objects_scanned,
+        result.objects_deleted,
+        result.bytes_freed,
+        result.db_rows_deleted,
+        status,
+        error_message,
+    )
+    await pool.execute(insert_sql, *params)
+
+
+async def run_retention_cleanup(
+    minio_client: Minio,
+    pool: asyncpg.Pool,
+    config: RetentionConfig,
+    now: datetime | None = None,
+) -> list[CleanupResult]:
+    """Run the full retention cleanup cycle.
+
+    1. Resolve policies from config defaults + DB overrides
+    2. Clean up expired MinIO objects per bucket
+    3. Record each bucket run for observability
+    4. Clean up expired PostgreSQL metadata
+
+    Every stage is best-effort: a failure is logged and the cycle continues
+    with the next bucket or stage.  In particular, a failure to *record* a
+    run neither marks a successful bucket cleanup as failed nor aborts the
+    remaining buckets.
+
+    Args:
+        minio_client: Client used to list and delete objects.
+        pool: Database pool for policy overrides, run records and metadata
+            cleanup.
+        config: Retention configuration (per-bucket defaults, batch size).
+        now: Reference timestamp for cutoffs (defaults to UTC now).
+
+    Returns:
+        Exactly one CleanupResult per resolved policy, in policy order.
+        A bucket whose cleanup raised contributes an all-zero result.
+    """
+    # Resolve policies
+    config_policies = resolve_policies(config)
+    try:
+        db_policies = await load_db_policies(pool)
+    except Exception:
+        # Keep the traceback so the reason for the fallback is visible.
+        logger.warning(
+            "Could not load DB retention policies, using config defaults",
+            exc_info=True,
+        )
+        db_policies = {}
+
+    policies = merge_policies(config_policies, db_policies)
+    results: list[CleanupResult] = []
+
+    # Clean up MinIO objects per bucket
+    for policy in policies:
+        status = "completed"
+        error_message: str | None = None
+        try:
+            result = cleanup_bucket(
+                minio_client, policy,
+                batch_size=config.batch_size, now=now,
+            )
+        except Exception:
+            logger.exception("Retention cleanup failed for bucket %s", policy.bucket_name)
+            result = CleanupResult(bucket_name=policy.bucket_name)
+            status, error_message = "failed", "See logs"
+        results.append(result)
+
+        # Recording is handled separately from the cleanup itself so that a
+        # DB error here cannot add a second result for the same bucket or
+        # stop the loop.
+        try:
+            await record_retention_run(
+                pool, policy.bucket_name, result,
+                status=status, error_message=error_message,
+            )
+        except Exception:
+            logger.exception(
+                "Could not record retention run for bucket %s", policy.bucket_name
+            )
+
+    if not policies:
+        # No retention period to apply; min() below would raise on empty input.
+        return results
+
+    # Clean up expired DB records using the shortest retention period across
+    # all resolved policies
+    min_retention = min(p.retention_days for p in policies)
+    try:
+        db_deleted = await cleanup_expired_db_records(pool, min_retention, now=now)
+        if db_deleted > 0:
+            logger.info("Total DB rows cleaned: %d", db_deleted)
+    except Exception:
+        logger.exception("DB retention cleanup failed")
+
+    return results
@@ -108,6 +108,41 @@ class DocumentIntelligence(BaseModel):

 # --- Trend Summary ---

+class MarketContext(BaseModel):
+    """Recent market data features for a symbol, used to enrich aggregation.
+
+    Every numeric feature is optional and defaults to ``None``;
+    ``bars_available`` defaults to 0, so a default-constructed instance
+    reports ``has_data == False``.
+    """
+
+    ticker: str = ""
+    price_change_pct: Optional[float] = None  # % change over the window
+    avg_volume: Optional[float] = None  # average daily volume
+    volume_change_pct: Optional[float] = None  # volume vs prior period
+    volatility: Optional[float] = None  # intra-window price std dev
+    latest_close: Optional[float] = None
+    latest_bar_at: Optional[datetime] = None
+    bars_available: int = 0
+
+    @property
+    def has_data(self) -> bool:
+        """Return True when at least one bar is available (``bars_available > 0``)."""
+        return self.bars_available > 0
+
+
+class DisagreementDetail(BaseModel):
+    """Represents an explicit disagreement between document signals.
+
+    Rather than collapsing contradictory signals into a single score,
+    this captures the nature of the disagreement so downstream consumers
+    can inspect *why* signals conflict.
+
+    All fields have defaults, so an empty instance is valid.
+
+    Requirements: 6.4
+    """
+
+    dimension: str = ""  # e.g. "sentiment", "catalyst", "impact_horizon"
+    # NOTE(review): presumably the IDs of the documents on each side of the
+    # disagreement — confirm against the code that builds these records.
+    positive_doc_ids: List[str] = Field(default_factory=list)
+    negative_doc_ids: List[str] = Field(default_factory=list)
+    # NOTE(review): presumably the aggregate weight of each side; scale and
+    # units are not defined here — confirm.
+    positive_weight: float = 0.0
+    negative_weight: float = 0.0
+    # Free-text field; empty by default.
+    description: str = ""
+
+
 class TrendSummary(BaseModel):
    entity_type: str = "company"
    entity_id: str = ""
@@ -120,6 +155,8 @@ class TrendSummary(BaseModel):
    dominant_catalysts: List[str] = Field(default_factory=list)
    material_risks: List[str] = Field(default_factory=list)
    contradiction_score: float = Field(ge=0, le=1, default=0.0)
+    disagreement_details: List[DisagreementDetail] = Field(default_factory=list)
+    market_context: Optional[MarketContext] = None
    generated_at: datetime = Field(default_factory=datetime.utcnow)


@@ -0,0 +1,352 @@
+"""Raw artifact upload to MinIO.
+
+Provides a reusable storage layer for uploading raw artifacts (API payloads,
+HTML, normalized text, model outputs) to MinIO with consistent path conventions,
+bucket management, and content-type handling.
+
+Bucket layout follows the design spec:
+  - stonks-raw-market    — raw market API payloads
+  - stonks-raw-news      — raw news API payloads and article HTML
+  - stonks-raw-filings   — raw filings and issuer event payloads
+  - stonks-normalized    — cleaned text and parser outputs
+  - stonks-llm-prompts   — prompts and schemas used
+  - stonks-llm-results   — raw model outputs and validation reports
+  - stonks-lakehouse     — partitioned analytical datasets and table metadata
+  - stonks-audit         — execution traces and exported reports
+
+Object path pattern (object key relative to the bucket, no leading slash):
+  {stage}/{symbol}/{yyyy}/{mm}/{dd}/{document_id}/{artifact_name}.{ext}
+
+Requirements: 3.1, 3.2, 3.3, 9.1
+"""
+import io
+import logging
+from datetime import datetime, timezone
+from typing import Mapping
+
+from minio import Minio
+from minio.error import S3Error
+
+logger = logging.getLogger("storage")
+
+# All known buckets the platform uses.
+# ensure_buckets() falls back to this list when no explicit list is given.
+ALL_BUCKETS = [
+    "stonks-raw-market",
+    "stonks-raw-news",
+    "stonks-raw-filings",
+    "stonks-normalized",
+    "stonks-llm-prompts",
+    "stonks-llm-results",
+    "stonks-lakehouse",
+    "stonks-audit",
+]
+
+# Map source_type to the correct raw bucket.
+# Note that several source types share a bucket: "web_scrape" lands in the
+# news bucket and "broker" in the market bucket. bucket_for_source() returns
+# "stonks-raw-market" for any source type that is not listed here.
+SOURCE_BUCKET_MAP: dict[str, str] = {
+    "market_api": "stonks-raw-market",
+    "news_api": "stonks-raw-news",
+    "filings_api": "stonks-raw-filings",
+    "web_scrape": "stonks-raw-news",
+    "broker": "stonks-raw-market",
+}
+
+# Map artifact type to content type and file extension.
+# Each value is a (MIME content type, file extension without the dot) pair.
+# upload_raw_artifact() falls back to ("application/octet-stream", "bin")
+# for artifact types that are not listed here.
+ARTIFACT_CONTENT_TYPES: dict[str, tuple[str, str]] = {
+    "raw_json": ("application/json", "json"),
+    "raw_html": ("text/html", "html"),
+    "raw_text": ("text/plain", "txt"),
+    "raw_payload": ("application/octet-stream", "bin"),
+}
+
+
+def bucket_for_source(source_type: str) -> str:
+    """Resolve the raw-artifact bucket that stores payloads of ``source_type``.
+
+    Source types missing from ``SOURCE_BUCKET_MAP`` fall back to
+    ``stonks-raw-market``.
+    """
+    if source_type in SOURCE_BUCKET_MAP:
+        return SOURCE_BUCKET_MAP[source_type]
+    return "stonks-raw-market"
+
+
+def build_artifact_path(
+    source_type: str,
+    ticker: str,
+    document_id: str,
+    artifact_name: str = "raw",
+    ext: str = "json",
+    timestamp: datetime | None = None,
+) -> str:
+    """Compose the object key for an artifact according to the design convention.
+
+    Layout: {source_type}/{ticker}/{yyyy}/{mm}/{dd}/{document_id}/{artifact_name}.{ext}
+
+    The date segments come from ``timestamp`` exactly as given (no timezone
+    conversion is applied); when it is omitted the current UTC time is used.
+    Month and day are zero-padded to two digits. The returned key has no
+    leading slash.
+    """
+    when = timestamp if timestamp else datetime.now(timezone.utc)
+    date_dir = f"{when.year}/{when.month:02d}/{when.day:02d}"
+    file_name = f"{artifact_name}.{ext}"
+    return f"{source_type}/{ticker}/{date_dir}/{document_id}/{file_name}"
+
+
+def storage_ref(bucket: str, path: str) -> str:
+    """Return the ``s3://{bucket}/{path}`` URI that identifies a stored artifact."""
+    return "s3://{}/{}".format(bucket, path)
+
+
+def ensure_buckets(client: Minio, buckets: list[str] | None = None) -> list[str]:
+    """Create any missing buckets and report which ones were created.
+
+    Args:
+        client: MinIO client instance.
+        buckets: Bucket names to ensure. ``None`` (the default) means every
+            bucket in ``ALL_BUCKETS``; an explicitly empty list means there is
+            nothing to do and no bucket is created.
+
+    Returns:
+        Names of the buckets that did not exist and were created by this call,
+        in the order they were processed.
+
+    Raises:
+        S3Error: If checking or creating a bucket fails. The failure is logged
+            and re-raised, so buckets later in the list are not processed.
+    """
+    # Compare against None instead of relying on truthiness: the previous
+    # ``buckets or ALL_BUCKETS`` turned an empty caller-supplied list into
+    # "create every known bucket".
+    target_buckets = ALL_BUCKETS if buckets is None else buckets
+    created: list[str] = []
+    for bucket in target_buckets:
+        try:
+            if not client.bucket_exists(bucket):
+                client.make_bucket(bucket)
+                created.append(bucket)
+                logger.info("Created bucket: %s", bucket)
+        except S3Error as e:
+            logger.error("Failed to ensure bucket %s: %s", bucket, e)
+            raise
+    return created
+
+
+def upload_artifact(
+    client: Minio,
+    bucket: str,
+    path: str,
+    data: bytes,
+    content_type: str = "application/json",
+    metadata: Mapping[str, str] | None = None,
+) -> str:
+    """Upload raw bytes to MinIO and return the s3:// storage reference.
+
+    The payload is wrapped in an in-memory stream and sent with its exact
+    length. Errors raised by ``client.put_object`` are not caught here and
+    propagate to the caller.
+
+    Args:
+        client: MinIO client instance.
+        bucket: Target bucket name.
+        path: Object path within the bucket.
+        data: Raw bytes to upload.
+        content_type: MIME type for the object.
+        metadata: Optional user metadata to attach to the object.
+
+    Returns:
+        s3:// URI pointing to the uploaded object.
+    """
+    # The write result returned by put_object is not needed: the storage
+    # reference is derived purely from bucket and path.
+    client.put_object(
+        bucket,
+        path,
+        io.BytesIO(data),
+        length=len(data),
+        content_type=content_type,
+        metadata=metadata,
+    )
+    ref = storage_ref(bucket, path)
+    logger.debug("Uploaded %d bytes to %s", len(data), ref)
+    return ref
+
+
+def upload_raw_artifact(
+    client: Minio,
+    source_type: str,
+    ticker: str,
+    document_id: str,
+    data: bytes,
+    artifact_type: str = "raw_json",
+    timestamp: datetime | None = None,
+    metadata: Mapping[str, str] | None = None,
+) -> str:
+    """Store a raw payload using the standard bucket, path, and content-type rules.
+
+    Main entry point for ingestion workers. The bucket is derived from
+    ``source_type``, the content type and file extension from
+    ``artifact_type``, and the object is always named ``raw.{ext}``.
+
+    Args:
+        client: MinIO client instance.
+        source_type: One of market_api, news_api, filings_api, web_scrape, broker.
+        ticker: Company ticker symbol.
+        document_id: Unique document or run identifier.
+        data: Raw bytes to upload.
+        artifact_type: One of raw_json, raw_html, raw_text, raw_payload. Any
+            other value is stored as ``application/octet-stream`` with a
+            ``bin`` extension.
+        timestamp: Override timestamp for path generation (defaults to now UTC).
+        metadata: Optional user metadata dict.
+
+    Returns:
+        s3:// URI pointing to the uploaded object.
+    """
+    target_bucket = bucket_for_source(source_type)
+    fallback = ("application/octet-stream", "bin")
+    mime_type, extension = ARTIFACT_CONTENT_TYPES.get(artifact_type, fallback)
+    object_path = build_artifact_path(
+        source_type, ticker, document_id, "raw", extension, timestamp,
+    )
+    return upload_artifact(
+        client,
+        target_bucket,
+        object_path,
+        data,
+        content_type=mime_type,
+        metadata=metadata,
+    )
+
+
+def upload_html_artifact(
+    client: Minio,
+    ticker: str,
+    document_id: str,
+    html_bytes: bytes,
+    timestamp: datetime | None = None,
+    metadata: Mapping[str, str] | None = None,
+) -> str:
+    """Store the raw HTML of a scraped web page and return its s3:// reference.
+
+    The object goes to the bucket mapped to the ``web_scrape`` source type,
+    under web_scrape/{ticker}/{yyyy}/{mm}/{dd}/{document_id}/raw.html, with
+    content type ``text/html``.
+    """
+    scrape_source = "web_scrape"
+    object_path = build_artifact_path(
+        scrape_source, ticker, document_id, "raw", "html", timestamp,
+    )
+    return upload_artifact(
+        client,
+        bucket_for_source(scrape_source),
+        object_path,
+        html_bytes,
+        content_type="text/html",
+        metadata=metadata,
+    )
+
+
+def upload_normalized_text(
+    client: Minio,
+    ticker: str,
+    document_id: str,
+    text_bytes: bytes,
+    timestamp: datetime | None = None,
+    metadata: Mapping[str, str] | None = None,
+) -> str:
+    """Store normalized (parsed) text in the stonks-normalized bucket.
+
+    Object key: parsed/{ticker}/{yyyy}/{mm}/{dd}/{document_id}/normalized.txt
+    (dated from ``timestamp``, or the current UTC time when omitted).
+    Returns the s3:// reference of the stored object.
+    """
+    object_path = build_artifact_path(
+        source_type="parsed",
+        ticker=ticker,
+        document_id=document_id,
+        artifact_name="normalized",
+        ext="txt",
+        timestamp=timestamp,
+    )
+    return upload_artifact(
+        client,
+        "stonks-normalized",
+        object_path,
+        text_bytes,
+        content_type="text/plain",
+        metadata=metadata,
+    )
+
+
+def upload_parser_output(
+    client: Minio,
+    ticker: str,
+    document_id: str,
+    output_bytes: bytes,
+    timestamp: datetime | None = None,
+    metadata: Mapping[str, str] | None = None,
+) -> str:
+    """Store structured parser output JSON in the stonks-normalized bucket.
+
+    Object key: parsed/{ticker}/{yyyy}/{mm}/{dd}/{document_id}/parser_output.json
+    (dated from ``timestamp``, or the current UTC time when omitted).
+    Returns the s3:// reference of the stored object.
+    """
+    object_path = build_artifact_path(
+        source_type="parsed",
+        ticker=ticker,
+        document_id=document_id,
+        artifact_name="parser_output",
+        ext="json",
+        timestamp=timestamp,
+    )
+    return upload_artifact(
+        client,
+        "stonks-normalized",
+        object_path,
+        output_bytes,
+        content_type="application/json",
+        metadata=metadata,
+    )
+
+
+def upload_extraction_prompt(
+    client: Minio,
+    ticker: str,
+    document_id: str,
+    prompt_data: bytes,
+    timestamp: datetime | None = None,
+    metadata: Mapping[str, str] | None = None,
+) -> str:
+    """Store the extraction prompt and schema in the stonks-llm-prompts bucket.
+
+    Object key: extraction/{ticker}/{yyyy}/{mm}/{dd}/{document_id}/prompt.json
+    (dated from ``timestamp``, or the current UTC time when omitted).
+    Returns the s3:// reference of the stored object.
+    """
+    object_path = build_artifact_path(
+        source_type="extraction",
+        ticker=ticker,
+        document_id=document_id,
+        artifact_name="prompt",
+        ext="json",
+        timestamp=timestamp,
+    )
+    return upload_artifact(
+        client,
+        "stonks-llm-prompts",
+        object_path,
+        prompt_data,
+        content_type="application/json",
+        metadata=metadata,
+    )
+
+
+def upload_extraction_raw_output(
+    client: Minio,
+    ticker: str,
+    document_id: str,
+    output_data: bytes,
+    attempt_index: int = 0,
+    timestamp: datetime | None = None,
+    metadata: Mapping[str, str] | None = None,
+) -> str:
+    """Store one raw model output in the stonks-llm-results bucket.
+
+    Object key:
+    extraction/{ticker}/{yyyy}/{mm}/{dd}/{document_id}/raw_output_{attempt}.json
+    where ``{attempt}`` is ``attempt_index``, so each attempt gets its own
+    object. Dated from ``timestamp``, or the current UTC time when omitted.
+    Returns the s3:// reference of the stored object.
+    """
+    object_path = build_artifact_path(
+        source_type="extraction",
+        ticker=ticker,
+        document_id=document_id,
+        artifact_name=f"raw_output_{attempt_index}",
+        ext="json",
+        timestamp=timestamp,
+    )
+    return upload_artifact(
+        client,
+        "stonks-llm-results",
+        object_path,
+        output_data,
+        content_type="application/json",
+        metadata=metadata,
+    )
+
+
+def upload_extraction_validation(
+    client: Minio,
+    ticker: str,
+    document_id: str,
+    validation_data: bytes,
+    timestamp: datetime | None = None,
+    metadata: Mapping[str, str] | None = None,
+) -> str:
+    """Store a validation report in the stonks-llm-results bucket.
+
+    Object key: extraction/{ticker}/{yyyy}/{mm}/{dd}/{document_id}/validation.json
+    (dated from ``timestamp``, or the current UTC time when omitted).
+    Returns the s3:// reference of the stored object.
+    """
+    object_path = build_artifact_path(
+        source_type="extraction",
+        ticker=ticker,
+        document_id=document_id,
+        artifact_name="validation",
+        ext="json",
+        timestamp=timestamp,
+    )
+    return upload_artifact(
+        client,
+        "stonks-llm-results",
+        object_path,
+        validation_data,
+        content_type="application/json",
+        metadata=metadata,
+    )
+
+
+def upload_extraction_intelligence(
+    client: Minio,
+    ticker: str,
+    document_id: str,
+    intelligence_data: bytes,
+    timestamp: datetime | None = None,
+    metadata: Mapping[str, str] | None = None,
+) -> str:
+    """Store the final intelligence object in the stonks-llm-results bucket.
+
+    Object key: extraction/{ticker}/{yyyy}/{mm}/{dd}/{document_id}/intelligence.json
+    (dated from ``timestamp``, or the current UTC time when omitted).
+    Returns the s3:// reference of the stored object.
+    """
+    object_path = build_artifact_path(
+        source_type="extraction",
+        ticker=ticker,
+        document_id=document_id,
+        artifact_name="intelligence",
+        ext="json",
+        timestamp=timestamp,
+    )
+    return upload_artifact(
+        client,
+        "stonks-llm-results",
+        object_path,
+        intelligence_data,
+        content_type="application/json",
+        metadata=metadata,
+    )
+
+
+def download_artifact(client: Minio, bucket: str, path: str) -> bytes:
+    """Fetch an object from MinIO and return its full content as bytes.
+
+    The response is closed and its connection released whether or not the
+    read succeeds. Errors from ``get_object`` or ``read`` propagate to the
+    caller.
+    """
+    obj = client.get_object(bucket, path)
+    try:
+        payload = obj.read()
+    finally:
+        # Always hand the HTTP connection back, even if read() raised.
+        obj.close()
+        obj.release_conn()
+    return payload