phase 14-15: docker build validation and helm deployment

This commit is contained in:
Celes Renata
2026-04-11 11:59:45 -07:00
parent 7394d241c9
commit ce10afa034
179 changed files with 32559 additions and 576 deletions
+342
View File
@@ -0,0 +1,342 @@
"""Operational alerting for Stonks Oracle pipeline health.
Evaluates alert rules against PostgreSQL operational state and emits
structured log events and Prometheus metrics when thresholds are breached.
Alert rules:
- source_failures: sustained source retrieval failures per source
- schema_failure_spike: extraction validation failure rate exceeds threshold
- analytical_lag: lake publication has not completed within threshold
- broker_issues: consecutive broker submission errors
Requirements: 12.3
Design: Section 12 (Observability and Operations)
"""
from __future__ import annotations
import logging
from dataclasses import dataclass, field
from datetime import datetime, timezone
from typing import Any
import asyncpg
from services.shared.config import AlertingConfig
from services.shared.metrics import (
ALERT_ACTIVE,
ALERT_CHECK_DURATION,
ALERTS_FIRED,
ALERTS_RESOLVED,
)
logger = logging.getLogger("alerting")
def _utc_now() -> datetime:
    """Return the current moment as a timezone-aware UTC datetime."""
    return datetime.now(timezone.utc)


@dataclass
class Alert:
    """One occurrence of an alert rule breaching its threshold.

    Attributes:
        rule: Name of the rule that produced the alert.
        severity: Either "warning" or "critical".
        summary: Human-readable one-line description.
        details: Structured context for the alert. Its optional ``"key"``
            entry is combined with ``rule`` to identify the alert instance.
        fired_at: Creation time of the instance (UTC, timezone-aware).
    """

    rule: str
    severity: str
    summary: str
    details: dict[str, Any] = field(default_factory=dict)
    fired_at: datetime = field(default_factory=_utc_now)
@dataclass
class AlertState:
    """In-memory registry of firing alerts, used to detect transitions.

    ``active`` maps ``"<rule>:<key>"`` to the most recent alert recorded for
    that rule/key pair.
    """

    active: dict[str, Alert] = field(default_factory=dict)

    @staticmethod
    def _compose_key(rule: str, key: str) -> str:
        """Build the registry key for a rule and its per-instance key."""
        return f"{rule}:{key}"

    def fire(self, alert: Alert) -> bool:
        """Store *alert* as active; return True only if it was not active before.

        The stored alert is replaced on every call, so the registry always
        holds the latest instance for a given rule/key pair.
        """
        state_key = self._compose_key(alert.rule, alert.details.get("key", ""))
        already_active = state_key in self.active
        self.active[state_key] = alert
        return not already_active

    def resolve(self, rule: str, key: str = "") -> bool:
        """Drop the alert for *rule*/*key*; return True if one was active."""
        try:
            del self.active[self._compose_key(rule, key)]
        except KeyError:
            return False
        return True

    def is_firing(self, rule: str, key: str = "") -> bool:
        """Return True while an alert for *rule*/*key* is recorded as active."""
        return self._compose_key(rule, key) in self.active
async def check_source_failures(
    pool: asyncpg.Pool,
    config: AlertingConfig,
) -> list[Alert]:
    """Find sources whose most recent ingestion runs have all failed.

    Within the last ``config.source_failure_window_hours`` hours, the query
    takes each source's newest ``config.source_failure_threshold`` rows from
    ``ingestion_runs`` and keeps the source only if there are at least that
    many rows and every one has status ``'failed'``. Results are inner-joined
    to ``sources`` and ``companies``, so a source without a matching company
    row produces no alert.

    Returns:
        One "warning" alert per failing source, keyed by the source id.
    """
    streak_rows = await pool.fetch(
        """WITH recent_runs AS (
SELECT source_id, status,
ROW_NUMBER() OVER (PARTITION BY source_id ORDER BY started_at DESC) AS rn
FROM ingestion_runs
WHERE started_at >= NOW() - INTERVAL '1 hour' * $1
),
failure_streaks AS (
SELECT source_id,
COUNT(*) FILTER (WHERE status = 'failed') AS consecutive_failures,
COUNT(*) AS total_runs
FROM recent_runs
WHERE rn <= $2
GROUP BY source_id
HAVING COUNT(*) FILTER (WHERE status = 'failed') = COUNT(*)
AND COUNT(*) >= $2
)
SELECT fs.source_id, fs.consecutive_failures,
s.source_type, s.source_name, c.ticker
FROM failure_streaks fs
JOIN sources s ON s.id = fs.source_id
JOIN companies c ON c.id = s.company_id""",
        config.source_failure_window_hours,
        config.source_failure_threshold,
    )

    def _to_alert(record: Any) -> Alert:
        # Pull the columns once so the summary and details stay in sync.
        source_id = str(record["source_id"])
        source_type = record["source_type"]
        source_name = record["source_name"]
        ticker = record["ticker"]
        failures = record["consecutive_failures"]
        return Alert(
            rule="source_failures",
            severity="warning",
            summary=(
                f"Source {source_name} ({source_type}) for "
                f"{ticker} has {failures} consecutive failures"
            ),
            details={
                "key": source_id,
                "source_id": source_id,
                "source_type": source_type,
                "source_name": source_name,
                "ticker": ticker,
                "consecutive_failures": failures,
            },
        )

    return [_to_alert(record) for record in streak_rows]
async def check_schema_failure_spike(
    pool: asyncpg.Pool,
    config: AlertingConfig,
) -> list[Alert]:
    """Alert when the recent extraction failure rate reaches the threshold.

    Counts all rows and the ``NOT success`` rows in
    ``model_performance_metrics`` recorded within the last
    ``config.schema_failure_window_hours`` hours. The rate ``failed / total``
    is compared against ``config.schema_failure_rate_threshold``.

    Returns:
        An empty list when there is no data or the rate is below the
        threshold; otherwise a single alert keyed ``"global"`` whose severity
        is "critical" at a rate of 0.5 or more and "warning" below that.
    """
    stats = await pool.fetchrow(
        """SELECT
COUNT(*) AS total,
COUNT(*) FILTER (WHERE NOT success) AS failed
FROM model_performance_metrics
WHERE recorded_at >= NOW() - INTERVAL '1 hour' * $1""",
        config.schema_failure_window_hours,
    )
    # No row, or no extractions in the window: nothing to compute a rate from.
    if not stats or stats["total"] == 0:
        return []

    total, failed = stats["total"], stats["failed"]
    failure_rate = failed / total
    if not failure_rate >= config.schema_failure_rate_threshold:
        return []

    if failure_rate >= 0.5:
        severity = "critical"
    else:
        severity = "warning"

    spike = Alert(
        rule="schema_failure_spike",
        severity=severity,
        summary=(
            f"Extraction schema failure rate is {failure_rate:.1%} "
            f"({failed}/{total}) in the last {config.schema_failure_window_hours}h"
        ),
        details={
            "key": "global",
            "total_extractions": total,
            "failed_extractions": failed,
            "failure_rate": round(failure_rate, 4),
            "threshold": config.schema_failure_rate_threshold,
            "window_hours": config.schema_failure_window_hours,
        },
    )
    return [spike]
async def check_analytical_lag(
    pool: asyncpg.Pool,
    config: AlertingConfig,
) -> list[Alert]:
    """Report lake tables whose last successful publication is stale.

    Reads ``audit_events`` rows with ``event_type = 'lake_publish'`` and a
    payload ``status`` of ``'success'``, groups them by the payload's
    ``table_name`` and keeps the groups whose newest event is older than
    ``config.lake_lag_threshold_minutes`` minutes.

    A table that has never recorded a successful publish has no rows to
    group and therefore never produces an alert here.

    Returns:
        One "warning" alert per stale table, keyed by the table name.
    """
    # The JSONB payload column of audit_events is ``data``: that is the column
    # the audit writer inserts into and the audit-trail queries read from.
    # NOTE(review): the previous query read a ``details`` column instead;
    # confirm against the table DDL.
    rows = await pool.fetch(
        """SELECT
            data->>'table_name' AS table_name,
            MAX(created_at) AS last_publish
        FROM audit_events
        WHERE event_type = 'lake_publish'
          AND data->>'status' = 'success'
          AND data->>'table_name' IS NOT NULL
        GROUP BY data->>'table_name'
        HAVING MAX(created_at) < NOW() - INTERVAL '1 minute' * $1""",
        config.lake_lag_threshold_minutes,
    )

    alerts: list[Alert] = []
    now = datetime.now(timezone.utc)
    for row in rows:
        table_name = row["table_name"]
        last_publish = row["last_publish"]
        # A naive timestamp is treated as UTC so it can be subtracted from the
        # timezone-aware ``now``.
        if last_publish.tzinfo is None:
            last_publish = last_publish.replace(tzinfo=timezone.utc)
        lag_minutes = (now - last_publish).total_seconds() / 60
        alerts.append(Alert(
            rule="analytical_lag",
            severity="warning",
            summary=(
                f"Lake table '{table_name}' last published {lag_minutes:.0f}m ago "
                f"(threshold: {config.lake_lag_threshold_minutes}m)"
            ),
            details={
                "key": table_name,
                "table_name": table_name,
                "last_publish": last_publish.isoformat(),
                "lag_minutes": round(lag_minutes, 1),
                "threshold_minutes": config.lake_lag_threshold_minutes,
            },
        ))
    return alerts
async def check_broker_issues(
    pool: asyncpg.Pool,
    config: AlertingConfig,
) -> list[Alert]:
    """Alert when broker-level errors in the lookback window reach the threshold.

    Counts ``order_events`` rows of type ``broker_error``, ``broker_timeout``
    or ``connection_failed`` created within the last
    ``config.broker_error_window_hours`` hours. The check fires when that
    count is at least ``config.broker_error_threshold``.

    The count is the true number of error events in the window; it is not
    capped at the threshold, so the summary and ``error_count`` reflect how
    bad the situation actually is. The query does not look at successful
    events, so the errors are not required to be strictly consecutive.

    Returns:
        An empty list below the threshold, otherwise a single "critical"
        alert keyed ``"global"``.
    """
    rows = await pool.fetch(
        """SELECT COUNT(*) AS error_count
        FROM order_events
        WHERE created_at >= NOW() - INTERVAL '1 hour' * $1
          AND event_type IN ('broker_error', 'broker_timeout', 'connection_failed')""",
        config.broker_error_window_hours,
    )
    if not rows:
        return []

    error_count = rows[0]["error_count"]
    if error_count < config.broker_error_threshold:
        return []

    return [Alert(
        rule="broker_issues",
        severity="critical",
        summary=(
            f"{error_count} broker errors in the last "
            f"{config.broker_error_window_hours}h"
        ),
        details={
            "key": "global",
            "error_count": error_count,
            "threshold": config.broker_error_threshold,
            "window_hours": config.broker_error_window_hours,
        },
    )]
async def evaluate_alerts(
    pool: asyncpg.Pool,
    config: AlertingConfig,
    state: AlertState,
) -> list[Alert]:
    """Run every alert rule, update *state*, and return the newly fired alerts.

    Each rule is evaluated independently; an exception in one rule is logged
    and does not stop the others. For every alert that was not already
    active, the fired counter and active gauge are updated and a warning is
    logged. Alerts that were active but are no longer reported are resolved.

    A rule whose check raised produced no result for this cycle, so its
    active alerts are left untouched rather than resolved. Otherwise a
    transient database error would look like a recovery and the alert would
    fire again on the next successful evaluation.

    Args:
        pool: Connection pool handed to each rule check.
        config: Thresholds handed to each rule check.
        state: Firing registry; mutated in place.

    Returns:
        Alerts that transitioned from inactive to firing in this cycle.
    """
    # (rule name, log message on failure, check coroutine function)
    checks = (
        ("source_failures", "Error checking source failures", check_source_failures),
        ("schema_failure_spike", "Error checking schema failure spike", check_schema_failure_spike),
        ("analytical_lag", "Error checking analytical lag", check_analytical_lag),
        ("broker_issues", "Error checking broker issues", check_broker_issues),
    )

    all_alerts: list[Alert] = []
    failed_rules: set[str] = set()

    with ALERT_CHECK_DURATION.time():
        for rule_name, error_message, check in checks:
            try:
                all_alerts.extend(await check(pool, config))
            except Exception:
                failed_rules.add(rule_name)
                logger.exception(error_message)

    # Record which rule+key combos are firing now and report new firings.
    current_keys: set[str] = set()
    newly_fired: list[Alert] = []
    for alert in all_alerts:
        current_keys.add(f"{alert.rule}:{alert.details.get('key', '')}")
        if not state.fire(alert):
            continue
        ALERTS_FIRED.labels(rule=alert.rule, severity=alert.severity).inc()
        ALERT_ACTIVE.labels(rule=alert.rule).set(1)
        newly_fired.append(alert)
        logger.warning(
            "ALERT FIRING: [%s] %s",
            alert.rule,
            alert.summary,
            extra={
                "alert_rule": alert.rule,
                "alert_severity": alert.severity,
                "alert_details": alert.details,
            },
        )

    # Anything active that was not reported this cycle has recovered, unless
    # its rule could not be evaluated at all.
    for key in set(state.active) - current_keys:
        rule, _, detail_key = key.partition(":")
        if rule in failed_rules:
            continue
        if not state.resolve(rule, detail_key):
            continue
        ALERTS_RESOLVED.labels(rule=rule).inc()
        # The gauge is per rule, so clear it only when no instance remains.
        still_firing = any(k.startswith(f"{rule}:") for k in state.active)
        if not still_firing:
            ALERT_ACTIVE.labels(rule=rule).set(0)
        logger.info(
            "ALERT RESOLVED: [%s] key=%s",
            rule,
            detail_key,
        )

    return newly_fired
+493
View File
@@ -0,0 +1,493 @@
"""Execution audit trail - records every step from recommendation to market outcome.
Writes structured audit events to the audit_events table so the full
decision chain is traceable: recommendation → risk evaluation → order
submission → broker response → fill/rejection/cancellation.
Each event captures the entity type, entity ID, event type, actor,
and a JSONB data payload with stage-specific details.
Requirements: 8.3, 11.3
Design: Section 4.9 (Broker Adapter), Section 6.1 (PostgreSQL audit_events)
"""
from __future__ import annotations
import json
import logging
import uuid
from datetime import datetime, timezone
from typing import Any
import asyncpg
logger = logging.getLogger("audit")
# ---------------------------------------------------------------------------
# Event type constants
#
# Each constant is the string stored in audit_events.event_type. Names follow
# a "<stage>.<outcome>" pattern.
# ---------------------------------------------------------------------------
# Recommendation stage (selected by the `suppressed` flag of
# audit_recommendation_generated)
AUDIT_RECOMMENDATION_GENERATED = "recommendation.generated"
AUDIT_RECOMMENDATION_SUPPRESSED = "recommendation.suppressed"
# Risk evaluation stage (selected by the `eligible` flag of audit_risk_evaluated)
AUDIT_RISK_EVALUATED = "risk.evaluated"
AUDIT_RISK_REJECTED = "risk.rejected"
# Order lifecycle
# NOTE(review): no helper in this module emits AUDIT_ORDER_ACCEPTED; it is
# presumably written by a caller via record_audit_event - confirm.
AUDIT_ORDER_SUBMITTED = "order.submitted"
AUDIT_ORDER_ACCEPTED = "order.accepted"
AUDIT_ORDER_FILLED = "order.filled"
AUDIT_ORDER_REJECTED = "order.rejected"
AUDIT_ORDER_CANCELLED = "order.cancelled"
AUDIT_ORDER_DUPLICATE = "order.duplicate_prevented"
# Position changes (selected from before/after quantities in audit_position_change)
AUDIT_POSITION_OPENED = "position.opened"
AUDIT_POSITION_CLOSED = "position.closed"
AUDIT_POSITION_UPDATED = "position.updated"
# Trading mode changes
AUDIT_TRADING_MODE_CHANGED = "trading.mode_changed"
# Operator approval workflow
AUDIT_APPROVAL_REQUESTED = "approval.requested"
AUDIT_APPROVAL_APPROVED = "approval.approved"
AUDIT_APPROVAL_REJECTED = "approval.rejected"
AUDIT_APPROVAL_EXPIRED = "approval.expired"
# ---------------------------------------------------------------------------
# Core audit writer
# ---------------------------------------------------------------------------
# Insert statement used by record_audit_event. Positional parameters:
#   $1 event id (cast to uuid)   $2 event_type   $3 entity_type
#   $4 entity id (cast to uuid)  $5 actor        $6 JSON text (cast to jsonb)
#   $7 created_at timestamp
_INSERT_AUDIT_EVENT = """
INSERT INTO audit_events (id, event_type, entity_type, entity_id, actor, data, created_at)
VALUES ($1::uuid, $2, $3, $4::uuid, $5, $6::jsonb, $7)
"""
async def record_audit_event(
    pool: asyncpg.Pool,
    event_type: str,
    entity_type: str,
    entity_id: str,
    data: dict[str, Any],
    actor: str = "system",
    timestamp: datetime | None = None,
) -> str:
    """Persist one audit event and return its identifier.

    The write is best-effort: any exception raised while serializing *data*
    or executing the insert is logged as a warning (with traceback) and
    swallowed.

    Args:
        pool: Pool used to execute the insert.
        event_type: Value for the ``event_type`` column.
        entity_type: Kind of entity the event refers to.
        entity_id: Identifier of the entity; cast to ``uuid`` by the statement.
        data: Payload serialized to JSON; values that are not JSON-native are
            stringified with ``str``.
        actor: Component or person responsible for the event.
        timestamp: Event time; the current UTC time is used when falsy.

    Returns:
        The new event's UUID as a string, or ``""`` if the write failed.
    """
    new_id = str(uuid.uuid4())
    occurred_at = timestamp if timestamp else datetime.now(timezone.utc)

    try:
        await pool.execute(
            _INSERT_AUDIT_EVENT,
            new_id,
            event_type,
            entity_type,
            entity_id,
            actor,
            json.dumps(data, default=str),
            occurred_at,
        )
    except Exception:
        # Auditing must never break the calling workflow.
        logger.warning(
            "Failed to write audit event %s for %s/%s",
            event_type, entity_type, entity_id,
            exc_info=True,
        )
        return ""
    else:
        return new_id
# ---------------------------------------------------------------------------
# Convenience helpers for each execution stage
# ---------------------------------------------------------------------------
async def audit_recommendation_generated(
    pool: asyncpg.Pool,
    recommendation_id: str,
    ticker: str,
    action: str,
    mode: str,
    confidence: float,
    evidence_count: int,
    suppressed: bool = False,
) -> str:
    """Audit the outcome of generating a recommendation.

    The event type is the "suppressed" constant when *suppressed* is true and
    the "generated" constant otherwise. The event is attached to the
    recommendation and attributed to ``recommendation_worker``.

    Returns:
        The value returned by ``record_audit_event``.
    """
    if suppressed:
        event_type = AUDIT_RECOMMENDATION_SUPPRESSED
    else:
        event_type = AUDIT_RECOMMENDATION_GENERATED

    payload = {
        "ticker": ticker,
        "action": action,
        "mode": mode,
        "confidence": confidence,
        "evidence_count": evidence_count,
        "suppressed": suppressed,
    }
    return await record_audit_event(
        pool,
        event_type=event_type,
        entity_type="recommendation",
        entity_id=recommendation_id,
        data=payload,
        actor="recommendation_worker",
    )
async def audit_risk_evaluated(
    pool: asyncpg.Pool,
    evaluation_id: str,
    recommendation_id: str | None,
    ticker: str,
    eligible: bool,
    allowed_mode: str,
    rejection_reasons: list[str],
    check_count: int,
) -> str:
    """Audit the verdict of a risk evaluation.

    An eligible evaluation is recorded with the "evaluated" event type and an
    ineligible one with the "rejected" event type. The event is attached to
    the evaluation and attributed to ``risk_engine``; the originating
    recommendation id is carried in the payload.

    Returns:
        The value returned by ``record_audit_event``.
    """
    if eligible:
        event_type = AUDIT_RISK_EVALUATED
    else:
        event_type = AUDIT_RISK_REJECTED

    payload = {
        "recommendation_id": recommendation_id,
        "ticker": ticker,
        "eligible": eligible,
        "allowed_mode": allowed_mode,
        "rejection_reasons": rejection_reasons,
        "check_count": check_count,
    }
    return await record_audit_event(
        pool,
        event_type=event_type,
        entity_type="risk_evaluation",
        entity_id=evaluation_id,
        data=payload,
        actor="risk_engine",
    )
async def audit_order_submitted(
    pool: asyncpg.Pool,
    order_id: str,
    ticker: str,
    side: str,
    quantity: float,
    order_type: str,
    idempotency_key: str,
    recommendation_id: str | None = None,
    evaluation_id: str | None = None,
) -> str:
    """Audit the submission of an order to the broker.

    The payload links the order to the recommendation and risk evaluation
    that led to it (either may be ``None``) and records the idempotency key
    used for the submission. Attributed to ``broker_service``.

    Returns:
        The value returned by ``record_audit_event``.
    """
    payload = {
        "ticker": ticker,
        "side": side,
        "quantity": quantity,
        "order_type": order_type,
        "idempotency_key": idempotency_key,
        "recommendation_id": recommendation_id,
        "evaluation_id": evaluation_id,
    }
    return await record_audit_event(
        pool,
        event_type=AUDIT_ORDER_SUBMITTED,
        entity_type="order",
        entity_id=order_id,
        data=payload,
        actor="broker_service",
    )
async def audit_order_filled(
    pool: asyncpg.Pool,
    order_id: str,
    ticker: str,
    side: str,
    fill_quantity: float,
    fill_price: float | None,
    broker_order_id: str,
) -> str:
    """Audit a fill reported by the broker for an order.

    Stores the filled quantity, the fill price (which may be ``None``) and
    the broker's own order id. Attributed to ``broker_service``.

    Returns:
        The value returned by ``record_audit_event``.
    """
    payload = {
        "ticker": ticker,
        "side": side,
        "fill_quantity": fill_quantity,
        "fill_price": fill_price,
        "broker_order_id": broker_order_id,
    }
    return await record_audit_event(
        pool,
        event_type=AUDIT_ORDER_FILLED,
        entity_type="order",
        entity_id=order_id,
        data=payload,
        actor="broker_service",
    )
async def audit_order_rejected(
    pool: asyncpg.Pool,
    order_id: str,
    ticker: str,
    reason: str,
    source: str = "broker",
) -> str:
    """Audit the rejection of an order.

    *source* names who rejected the order and is stored in the payload as
    ``rejection_source``. The event itself is always attributed to
    ``broker_service``.

    Returns:
        The value returned by ``record_audit_event``.
    """
    payload = {
        "ticker": ticker,
        "reason": reason,
        "rejection_source": source,
    }
    return await record_audit_event(
        pool,
        event_type=AUDIT_ORDER_REJECTED,
        entity_type="order",
        entity_id=order_id,
        data=payload,
        actor="broker_service",
    )
async def audit_order_cancelled(
    pool: asyncpg.Pool,
    order_id: str,
    ticker: str,
    broker_order_id: str,
) -> str:
    """Audit the cancellation of an order.

    The payload carries the ticker and the broker's order id. Attributed to
    ``broker_service``.

    Returns:
        The value returned by ``record_audit_event``.
    """
    payload = {
        "ticker": ticker,
        "broker_order_id": broker_order_id,
    }
    return await record_audit_event(
        pool,
        event_type=AUDIT_ORDER_CANCELLED,
        entity_type="order",
        entity_id=order_id,
        data=payload,
        actor="broker_service",
    )
async def audit_duplicate_prevented(
    pool: asyncpg.Pool,
    order_id: str,
    ticker: str,
    idempotency_key: str,
    detected_via: str,
) -> str:
    """Audit that a duplicate order submission was blocked.

    *detected_via* is stored verbatim in the payload alongside the
    idempotency key that matched. Attributed to ``broker_service``.

    Returns:
        The value returned by ``record_audit_event``.
    """
    payload = {
        "ticker": ticker,
        "idempotency_key": idempotency_key,
        "detected_via": detected_via,
    }
    return await record_audit_event(
        pool,
        event_type=AUDIT_ORDER_DUPLICATE,
        entity_type="order",
        entity_id=order_id,
        data=payload,
        actor="broker_service",
    )
async def audit_position_change(
    pool: asyncpg.Pool,
    order_id: str,
    ticker: str,
    side: str,
    quantity_before: float,
    quantity_after: float,
    avg_entry_before: float,
    avg_entry_after: float,
) -> str:
    """Record a position change resulting from a fill.

    The event type is derived from the quantities:

    - flat (0) to any non-zero quantity: position opened. This includes a
      negative quantity, so entering a short position is recorded as an open
      rather than an update.
    - any quantity to 0: position closed.
    - everything else: position updated.

    The event uses entity type ``"position"`` but is keyed by *order_id*, the
    order whose fill caused the change. Attributed to ``broker_service``.

    Returns:
        The value returned by ``record_audit_event``.
    """
    if quantity_before == 0 and quantity_after != 0:
        event_type = AUDIT_POSITION_OPENED
    elif quantity_after == 0:
        event_type = AUDIT_POSITION_CLOSED
    else:
        event_type = AUDIT_POSITION_UPDATED

    return await record_audit_event(
        pool,
        event_type=event_type,
        entity_type="position",
        entity_id=order_id,
        data={
            "ticker": ticker,
            "side": side,
            "quantity_before": quantity_before,
            "quantity_after": quantity_after,
            "avg_entry_before": avg_entry_before,
            "avg_entry_after": avg_entry_after,
        },
        actor="broker_service",
    )
async def audit_approval_requested(
    pool: asyncpg.Pool,
    approval_id: str,
    ticker: str,
    side: str,
    quantity: float,
    estimated_value: float,
    recommendation_id: str | None = None,
    expires_at: str | None = None,
) -> str:
    """Audit that operator approval was requested for an order.

    The payload describes the proposed order (ticker, side, quantity,
    estimated value) plus the optional recommendation id and expiry.
    Attributed to ``broker_service``.

    Returns:
        The value returned by ``record_audit_event``.
    """
    payload = {
        "ticker": ticker,
        "side": side,
        "quantity": quantity,
        "estimated_value": estimated_value,
        "recommendation_id": recommendation_id,
        "expires_at": expires_at,
    }
    return await record_audit_event(
        pool,
        event_type=AUDIT_APPROVAL_REQUESTED,
        entity_type="approval",
        entity_id=approval_id,
        data=payload,
        actor="broker_service",
    )
async def audit_approval_reviewed(
    pool: asyncpg.Pool,
    approval_id: str,
    ticker: str,
    approved: bool,
    reviewed_by: str = "operator",
    review_note: str = "",
) -> str:
    """Audit an operator's decision on an approval request.

    Uses the "approved" event type when *approved* is true and the
    "rejected" event type otherwise. The reviewer is recorded both in the
    payload and as the event's actor.

    Returns:
        The value returned by ``record_audit_event``.
    """
    if approved:
        event_type = AUDIT_APPROVAL_APPROVED
    else:
        event_type = AUDIT_APPROVAL_REJECTED

    payload = {
        "ticker": ticker,
        "approved": approved,
        "reviewed_by": reviewed_by,
        "review_note": review_note,
    }
    return await record_audit_event(
        pool,
        event_type=event_type,
        entity_type="approval",
        entity_id=approval_id,
        data=payload,
        actor=reviewed_by,
    )
async def audit_approval_expired(
    pool: asyncpg.Pool,
    approval_id: str,
    ticker: str,
) -> str:
    """Audit that an approval request lapsed without being reviewed.

    Only the ticker is stored in the payload; the event is attributed to
    ``system``.

    Returns:
        The value returned by ``record_audit_event``.
    """
    payload = {"ticker": ticker}
    return await record_audit_event(
        pool,
        event_type=AUDIT_APPROVAL_EXPIRED,
        entity_type="approval",
        entity_id=approval_id,
        data=payload,
        actor="system",
    )
async def audit_trading_mode_changed(
    pool: asyncpg.Pool,
    config_id: str,
    old_mode: str,
    new_mode: str,
    actor: str = "operator",
) -> str:
    """Audit a switch of the trading mode.

    The event is attached to the ``risk_config`` entity identified by
    *config_id* and stores the previous and new mode. *actor* is passed
    through as the event's actor.

    Returns:
        The value returned by ``record_audit_event``.
    """
    payload = {
        "old_mode": old_mode,
        "new_mode": new_mode,
    }
    return await record_audit_event(
        pool,
        event_type=AUDIT_TRADING_MODE_CHANGED,
        entity_type="risk_config",
        entity_id=config_id,
        data=payload,
        actor=actor,
    )
# ---------------------------------------------------------------------------
# Query helpers for audit trail retrieval (Requirement 11.3)
# ---------------------------------------------------------------------------
# Events for one order plus events that reference an id in their payload.
# $1 is cast to uuid and matched against entity_id; $2 is compared as text
# against the payload's 'recommendation_id' and 'order_id' fields.
# Results are ordered oldest first.
_FETCH_AUDIT_TRAIL_FOR_ORDER = """
SELECT id, event_type, entity_type, entity_id, actor, data, created_at
FROM audit_events
WHERE entity_id = $1::uuid
OR data->>'recommendation_id' = $2
OR data->>'order_id' = $2
ORDER BY created_at ASC
"""
# All events for a single entity: $1 is the entity_type, $2 the entity id
# (cast to uuid). Results are ordered oldest first.
_FETCH_AUDIT_TRAIL_BY_ENTITY = """
SELECT id, event_type, entity_type, entity_id, actor, data, created_at
FROM audit_events
WHERE entity_type = $1 AND entity_id = $2::uuid
ORDER BY created_at ASC
"""
# Events for entity $1 (cast to uuid) plus every event of any entity that has
# at least one event whose payload 'recommendation_id' equals $2.
# NOTE(review): nothing in the visible part of this module uses this query -
# confirm whether it is referenced elsewhere.
_FETCH_FULL_EXECUTION_TRAIL = """
SELECT id, event_type, entity_type, entity_id, actor, data, created_at
FROM audit_events
WHERE entity_id = $1::uuid
OR entity_id IN (
SELECT entity_id FROM audit_events
WHERE data->>'recommendation_id' = $2
)
ORDER BY created_at ASC
"""
async def get_order_audit_trail(
    pool: asyncpg.Pool,
    order_id: str,
    recommendation_id: str | None = None,
) -> list[dict[str, Any]]:
    """Return the chronological audit trail related to an order.

    Matches events whose entity id is *order_id*, plus events whose payload
    references the lookup id as ``recommendation_id`` or ``order_id``. The
    lookup id is *recommendation_id* when given, otherwise *order_id*.

    Returns:
        A list of plain dicts, oldest first. Ids are strings, ``data`` is a
        dict (decoded from JSON text when necessary) and ``created_at`` is an
        ISO-8601 string or ``None``.
    """
    lookup_id = recommendation_id or order_id
    records = await pool.fetch(_FETCH_AUDIT_TRAIL_FOR_ORDER, order_id, lookup_id)

    trail: list[dict[str, Any]] = []
    for record in records:
        payload = record["data"]
        if not isinstance(payload, dict):
            # The driver may hand JSONB back as text; decode it here.
            payload = json.loads(payload)
        created = record["created_at"]
        trail.append({
            "id": str(record["id"]),
            "event_type": record["event_type"],
            "entity_type": record["entity_type"],
            "entity_id": str(record["entity_id"]),
            "actor": record["actor"],
            "data": payload,
            "created_at": created.isoformat() if created else None,
        })
    return trail
async def get_entity_audit_trail(
    pool: asyncpg.Pool,
    entity_type: str,
    entity_id: str,
) -> list[dict[str, Any]]:
    """Return every audit event recorded for one entity, oldest first.

    Returns:
        A list of plain dicts. Ids are strings, ``data`` is a dict (decoded
        from JSON text when necessary) and ``created_at`` is an ISO-8601
        string or ``None``.
    """
    records = await pool.fetch(_FETCH_AUDIT_TRAIL_BY_ENTITY, entity_type, entity_id)

    events: list[dict[str, Any]] = []
    for record in records:
        payload = record["data"]
        if not isinstance(payload, dict):
            # The driver may hand JSONB back as text; decode it here.
            payload = json.loads(payload)
        created = record["created_at"]
        events.append({
            "id": str(record["id"]),
            "event_type": record["event_type"],
            "entity_type": record["entity_type"],
            "entity_id": str(record["entity_id"]),
            "actor": record["actor"],
            "data": payload,
            "created_at": created.isoformat() if created else None,
        })
    return events
+108
View File
@@ -43,6 +43,10 @@ class OllamaConfig:
base_url: str = "http://localhost:11434"
model: str = "llama3.1:8b"
timeout: int = 120
max_retries: int = 2
retry_base_delay: float = 1.0
retry_max_delay: float = 10.0
retry_backoff_multiplier: float = 2.0
@dataclass
@@ -51,16 +55,82 @@ class TrinoConfig:
port: int = 8080
catalog: str = "lakehouse"
schema: str = "stonks"
iceberg_catalog: str = "iceberg"
@dataclass
class MarketDataConfig:
    """Connection settings for the market data provider.

    Attributes:
        api_key: Provider credential. Excluded from ``repr()`` so it does not
            end up in logs or tracebacks when the config object is printed.
        base_url: Root URL of the provider's API.
        provider: Name of the provider.
    """

    api_key: str = field(default="", repr=False)
    base_url: str = "https://api.polygon.io"
    provider: str = "polygon"
@dataclass
class BrokerConfig:
    """Connection settings for the broker adapter.

    Attributes:
        mode: Trading mode, "paper" or "live".
        provider: Name of the broker provider.
        api_key: Broker credential, or ``None`` when not configured.
        api_secret: Broker secret, or ``None`` when not configured.
        base_url: Optional override for the broker API root URL.

    ``api_key`` and ``api_secret`` are excluded from ``repr()`` so that
    printing or logging the config object cannot leak credentials.
    """

    mode: str = "paper"  # paper | live
    provider: str = "alpaca"
    api_key: Optional[str] = field(default=None, repr=False)
    api_secret: Optional[str] = field(default=None, repr=False)
    base_url: Optional[str] = None
@dataclass
class RetentionConfig:
    """Default retention periods (days) per bucket class.

    These can be overridden per-bucket via the retention_policies DB table.
    The cleanup_interval_hours controls how often the retention worker runs.

    Every ``*_days`` field is a number of days. BUCKET_RETENTION_FIELDS maps
    bucket names to these field names.
    """

    # Raw ingested data
    raw_market_days: int = 90
    raw_news_days: int = 180
    raw_filings_days: int = 365
    # Normalized documents
    normalized_days: int = 180
    # LLM prompts and results
    llm_prompts_days: int = 365
    llm_results_days: int = 365
    # Lakehouse and audit data
    lakehouse_days: int = 730
    audit_days: int = 730
    # Hours between retention worker runs
    cleanup_interval_hours: int = 24
    # NOTE(review): presumably the number of objects handled per cleanup
    # batch - confirm against the retention worker.
    batch_size: int = 1000
# Map bucket names to RetentionConfig field names.
# Each value must match a RetentionConfig attribute exactly, so the default
# retention for a bucket can be looked up by attribute name.
BUCKET_RETENTION_FIELDS: dict[str, str] = {
    "stonks-raw-market": "raw_market_days",
    "stonks-raw-news": "raw_news_days",
    "stonks-raw-filings": "raw_filings_days",
    "stonks-normalized": "normalized_days",
    "stonks-llm-prompts": "llm_prompts_days",
    "stonks-llm-results": "llm_results_days",
    "stonks-lakehouse": "lakehouse_days",
    "stonks-audit": "audit_days",
}
@dataclass
class AlertingConfig:
    """Thresholds for operational alerting rules.

    Each group of fields configures one alert rule; load_config() reads every
    field from an ``ALERT_*`` environment variable with the default shown
    here.

    Requirements: 12.3
    """

    # Source failure alerting
    source_failure_threshold: int = 3  # consecutive failed runs before alert
    source_failure_window_hours: int = 6  # lookback window, in hours
    # Schema/extraction failure spike
    schema_failure_rate_threshold: float = 0.3  # fraction failed (0.3 = 30%) that triggers alert
    schema_failure_window_hours: int = 1  # lookback window, in hours
    # Analytical (lake publication) lag
    lake_lag_threshold_minutes: int = 60  # minutes since last successful publish
    # Broker issues
    broker_error_threshold: int = 3  # broker error events before alert
    broker_error_window_hours: int = 1  # lookback window, in hours
    # Evaluation interval
    check_interval_seconds: int = 120  # seconds between alert evaluations
@dataclass
class AppConfig:
postgres: PostgresConfig = field(default_factory=PostgresConfig)
@@ -68,8 +138,12 @@ class AppConfig:
minio: MinioConfig = field(default_factory=MinioConfig)
ollama: OllamaConfig = field(default_factory=OllamaConfig)
trino: TrinoConfig = field(default_factory=TrinoConfig)
market_data: MarketDataConfig = field(default_factory=MarketDataConfig)
broker: BrokerConfig = field(default_factory=BrokerConfig)
retention: RetentionConfig = field(default_factory=RetentionConfig)
alerting: AlertingConfig = field(default_factory=AlertingConfig)
log_level: str = "INFO"
json_logs: bool = True
def load_config() -> AppConfig:
@@ -98,18 +172,52 @@ def load_config() -> AppConfig:
base_url=os.getenv("OLLAMA_BASE_URL", "http://localhost:11434"),
model=os.getenv("OLLAMA_MODEL", "llama3.1:8b"),
timeout=int(os.getenv("OLLAMA_TIMEOUT", "120")),
max_retries=int(os.getenv("OLLAMA_MAX_RETRIES", "2")),
retry_base_delay=float(os.getenv("OLLAMA_RETRY_BASE_DELAY", "1.0")),
retry_max_delay=float(os.getenv("OLLAMA_RETRY_MAX_DELAY", "10.0")),
retry_backoff_multiplier=float(os.getenv("OLLAMA_RETRY_BACKOFF_MULTIPLIER", "2.0")),
),
trino=TrinoConfig(
host=os.getenv("TRINO_HOST", "localhost"),
port=int(os.getenv("TRINO_PORT", "8080")),
catalog=os.getenv("TRINO_CATALOG", "lakehouse"),
schema=os.getenv("TRINO_SCHEMA", "stonks"),
iceberg_catalog=os.getenv("TRINO_ICEBERG_CATALOG", "iceberg"),
),
market_data=MarketDataConfig(
api_key=os.getenv("MARKET_DATA_API_KEY", ""),
base_url=os.getenv("MARKET_DATA_BASE_URL", "https://api.polygon.io"),
provider=os.getenv("MARKET_DATA_PROVIDER", "polygon"),
),
broker=BrokerConfig(
mode=os.getenv("BROKER_MODE", "paper"),
provider=os.getenv("BROKER_PROVIDER", "alpaca"),
api_key=os.getenv("BROKER_API_KEY", None),
api_secret=os.getenv("BROKER_API_SECRET", None),
base_url=os.getenv("BROKER_BASE_URL", None),
),
retention=RetentionConfig(
raw_market_days=int(os.getenv("RETENTION_RAW_MARKET_DAYS", "90")),
raw_news_days=int(os.getenv("RETENTION_RAW_NEWS_DAYS", "180")),
raw_filings_days=int(os.getenv("RETENTION_RAW_FILINGS_DAYS", "365")),
normalized_days=int(os.getenv("RETENTION_NORMALIZED_DAYS", "180")),
llm_prompts_days=int(os.getenv("RETENTION_LLM_PROMPTS_DAYS", "365")),
llm_results_days=int(os.getenv("RETENTION_LLM_RESULTS_DAYS", "365")),
lakehouse_days=int(os.getenv("RETENTION_LAKEHOUSE_DAYS", "730")),
audit_days=int(os.getenv("RETENTION_AUDIT_DAYS", "730")),
cleanup_interval_hours=int(os.getenv("RETENTION_CLEANUP_INTERVAL_HOURS", "24")),
batch_size=int(os.getenv("RETENTION_BATCH_SIZE", "1000")),
),
alerting=AlertingConfig(
source_failure_threshold=int(os.getenv("ALERT_SOURCE_FAILURE_THRESHOLD", "3")),
source_failure_window_hours=int(os.getenv("ALERT_SOURCE_FAILURE_WINDOW_HOURS", "6")),
schema_failure_rate_threshold=float(os.getenv("ALERT_SCHEMA_FAILURE_RATE_THRESHOLD", "0.3")),
schema_failure_window_hours=int(os.getenv("ALERT_SCHEMA_FAILURE_WINDOW_HOURS", "1")),
lake_lag_threshold_minutes=int(os.getenv("ALERT_LAKE_LAG_THRESHOLD_MINUTES", "60")),
broker_error_threshold=int(os.getenv("ALERT_BROKER_ERROR_THRESHOLD", "3")),
broker_error_window_hours=int(os.getenv("ALERT_BROKER_ERROR_WINDOW_HOURS", "1")),
check_interval_seconds=int(os.getenv("ALERT_CHECK_INTERVAL_SECONDS", "120")),
),
log_level=os.getenv("LOG_LEVEL", "INFO"),
json_logs=os.getenv("JSON_LOGS", "true").lower() == "true",
)
+43
View File
@@ -0,0 +1,43 @@
"""Canonical URL normalization and content hashing utilities.
Provides consistent URL canonicalization and SHA-256 content hashing
across all ingestion adapters and pipeline stages.
Requirements: 3.2, 3.3
"""
import hashlib
from urllib.parse import parse_qsl, urlencode, urlparse
def normalize_url(url: str) -> str:
    """Return the canonical form of *url* for deduplication.

    Normalization steps:

    - Defaults the scheme to ``https`` when missing. Input with neither
      scheme nor ``//`` (e.g. ``"Example.com/a"``) is re-parsed as an https
      URL so its host is recognized and lowercased.
    - Lowercases scheme and host.
    - Drops userinfo and fragments.
    - Strips the port only when it is the default for the scheme (80 for
      http, 443 for https); any other port is kept.
    - Keeps IPv6 hosts bracketed so the result is still a valid URL.
    - Strips trailing slashes from the path (the root stays ``"/"``).
    - Sorts query parameters; parameters with blank values are preserved.

    Raises:
        ValueError: If the URL contains a port that is not a valid number.
    """
    default_ports = {"http": 80, "https": 443}

    parsed = urlparse(url)
    if not parsed.scheme and not parsed.netloc:
        # Without a scheme or "//", urlparse treats the host as part of the
        # path; re-parse with the default scheme so it lands in the netloc.
        parsed = urlparse(f"https://{url}")

    scheme = (parsed.scheme or "https").lower()
    host = (parsed.hostname or "").lower()
    if ":" in host:
        # ``hostname`` strips the brackets from IPv6 literals; restore them so
        # the address cannot be confused with a host:port pair.
        host = f"[{host}]"

    port = parsed.port
    netloc = host
    if port and port != default_ports.get(scheme):
        netloc = f"{host}:{port}"

    path = parsed.path.rstrip("/") or "/"

    # Sort query params for deterministic ordering. Blank values are kept so
    # that "?a=" and no query at all remain distinct URLs.
    query = urlencode(sorted(parse_qsl(parsed.query, keep_blank_values=True)))

    normalized = f"{scheme}://{netloc}{path}"
    return f"{normalized}?{query}" if query else normalized
def content_hash(data: bytes) -> str:
    """Return the SHA-256 digest of *data* as a lowercase hex string."""
    digest = hashlib.sha256()
    digest.update(data)
    return digest.hexdigest()
def content_hash_str(text: str, encoding: str = "utf-8") -> str:
    """Return the SHA-256 hex digest of *text* encoded with *encoding*."""
    encoded = text.encode(encoding)
    return hashlib.sha256(encoded).hexdigest()
+134
View File
@@ -0,0 +1,134 @@
"""Dead-letter queue (DLQ) support and replay tooling.
When a worker fails to process a job after exhausting retries, the job
is pushed to a per-queue dead-letter list in Redis. Each DLQ entry
wraps the original payload with failure metadata (error message,
timestamp, attempt count) so operators can inspect and replay later.
Replay moves items from the DLQ back to the source queue for
reprocessing.
Requirements: 12.1 (observability), design section 8 (data flows)
"""
from __future__ import annotations
import json
import logging
from datetime import datetime, timezone
from typing import Any
import redis.asyncio as aioredis
from services.shared.redis_keys import dlq_key, queue_key
logger = logging.getLogger(__name__)
# Default max attempts before a job is dead-lettered.
# NOTE(review): not referenced by the helpers visible in this module;
# presumably consumed by the workers that call send_to_dlq - confirm.
DEFAULT_MAX_ATTEMPTS = 3
def _now_iso() -> str:
    """Return the current UTC time as a timezone-aware ISO-8601 string."""
    current = datetime.now(tz=timezone.utc)
    return current.isoformat()
def wrap_dlq_entry(
    original_payload: dict[str, Any],
    queue_name: str,
    error: str,
    attempt: int = 1,
    worker: str = "",
) -> dict[str, Any]:
    """Build a DLQ entry: the original job payload plus failure metadata.

    The returned dict holds the untouched payload under ``original_payload``
    together with the source queue, the error text, the attempt count, the
    worker name and the UTC time at which the entry was created.
    """
    entry: dict[str, Any] = {"original_payload": original_payload}
    entry["queue"] = queue_name
    entry["error"] = error
    entry["attempt"] = attempt
    entry["worker"] = worker
    entry["dead_lettered_at"] = _now_iso()
    return entry
async def send_to_dlq(
    rds: aioredis.Redis,
    queue_name: str,
    original_payload: dict[str, Any],
    error: str,
    attempt: int = 1,
    worker: str = "",
) -> None:
    """Append a failed job to the tail of the dead-letter list for *queue_name*.

    The payload is wrapped with failure metadata, serialized to JSON (values
    that are not JSON-native are stringified) and a warning is logged.
    """
    wrapped = wrap_dlq_entry(
        original_payload,
        queue_name,
        error,
        attempt=attempt,
        worker=worker,
    )
    serialized = json.dumps(wrapped, default=str)
    await rds.rpush(dlq_key(queue_name), serialized)

    log_context = {"queue": queue_name, "attempt": attempt}
    logger.warning(
        "Dead-lettered job on %s after %d attempts: %s",
        queue_name, attempt, error,
        extra=log_context,
    )
async def dlq_length(rds: aioredis.Redis, queue_name: str) -> int:
    """Return how many entries the DLQ for *queue_name* currently holds."""
    key = dlq_key(queue_name)
    depth = await rds.llen(key)
    return depth
async def peek_dlq(
    rds: aioredis.Redis,
    queue_name: str,
    start: int = 0,
    count: int = 10,
) -> list[dict[str, Any]]:
    """Return up to *count* DLQ entries without removing them.

    Args:
        rds: Redis client.
        queue_name: Queue whose DLQ is inspected.
        start: Zero-based index of the first entry to return.
        count: Maximum number of entries to return. A value of zero or less
            yields an empty list.

    Returns:
        The decoded DLQ entries, oldest first.
    """
    if count <= 0:
        # LRANGE's end index is inclusive and negative indexes count from the
        # tail, so start=0/count=0 would become "0 .. -1", i.e. the whole list.
        return []

    raw_items = await rds.lrange(dlq_key(queue_name), start, start + count - 1)
    return [json.loads(item) for item in raw_items]
async def replay_one(rds: aioredis.Redis, queue_name: str) -> dict[str, Any] | None:
    """Move the oldest DLQ entry back onto its source queue.

    Only the entry's ``original_payload`` is re-enqueued; an entry without
    that field is re-enqueued as-is.

    Returns:
        The DLQ entry that was replayed (including its failure metadata), or
        ``None`` when the DLQ is empty.
    """
    popped = await rds.lpop(dlq_key(queue_name))
    if popped is None:
        return None

    dlq_entry = json.loads(popped)
    job = dlq_entry.get("original_payload", dlq_entry)
    serialized_job = json.dumps(job, default=str)
    await rds.rpush(queue_key(queue_name), serialized_job)

    logger.info("Replayed 1 job from DLQ back to %s", queue_name)
    return dlq_entry
async def replay_all(rds: aioredis.Redis, queue_name: str) -> int:
    """Replay the entries currently in the DLQ back to the source queue.

    The DLQ depth is read once up front and at most that many entries are
    replayed. Without this bound, a job that fails again immediately and is
    dead-lettered while the replay is still running would be picked up
    again, and the loop might never terminate.

    Only each entry's ``original_payload`` is re-enqueued; an entry without
    that field is re-enqueued as-is.

    Returns:
        The number of entries replayed.
    """
    source = dlq_key(queue_name)
    target = queue_key(queue_name)

    pending = await rds.llen(source)
    count = 0
    for _ in range(pending):
        raw = await rds.lpop(source)
        if raw is None:
            # The DLQ was drained by someone else in the meantime.
            break
        entry = json.loads(raw)
        original = entry.get("original_payload", entry)
        await rds.rpush(target, json.dumps(original, default=str))
        count += 1

    if count:
        logger.info("Replayed %d jobs from DLQ back to %s", count, queue_name)
    return count
async def purge_dlq(rds: aioredis.Redis, queue_name: str) -> int:
    """Drop the whole DLQ for *queue_name* and return how many entries it held.

    The count is read before the delete, as two separate Redis calls.
    """
    target = dlq_key(queue_name)
    removed = await rds.llen(target)
    if not removed:
        return removed
    await rds.delete(target)
    return removed
async def dlq_summary(rds: aioredis.Redis, queue_names: list[str]) -> dict[str, int]:
    """Return the DLQ depth of each queue in *queue_names*, keyed by queue name.

    Depths are fetched one queue at a time, in the order given.
    """
    return {name: await rds.llen(dlq_key(name)) for name in queue_names}
+198
View File
@@ -0,0 +1,198 @@
"""Cross-source deduplication for articles and filings.
Detects duplicate documents across different source types (news_api,
filings_api, web_scrape) using a layered approach:
1. Redis fast-path: check content_hash and canonical_url markers for
recently-seen documents (TTL-bounded, cheap).
2. PostgreSQL fallback: query the documents table by canonical_url or
content_hash for durable cross-source matching.
When a duplicate is detected the caller receives the existing document_id
so it can link additional company mentions without re-inserting the document.
Requirements: 3.2, 3.3
"""
from __future__ import annotations
import logging
from dataclasses import dataclass
from typing import Any
import asyncpg
import redis.asyncio as aioredis
from services.shared.content import content_hash_str, normalize_url
from services.shared.redis_keys import DEDUPE_PREFIX
logger = logging.getLogger("dedupe")
# Redis TTL for dedupe markers (24 hours). Passed as the ``ex`` expiry when
# markers are written, so a Redis miss after this window falls through to the
# PostgreSQL lookup in check_duplicate.
DEDUPE_TTL_SECONDS: int = 86400
def _url_dedupe_key(canonical_url: str) -> str:
    """Return the Redis marker key for a canonical URL.

    The URL is passed through ``content_hash_str`` and only the result is
    embedded in the key, under the ``url`` sub-namespace of DEDUPE_PREFIX.
    """
    url_digest = content_hash_str(canonical_url)
    return f"{DEDUPE_PREFIX}:url:{url_digest}"
def _hash_dedupe_key(content_hash: str) -> str:
    """Return the Redis marker key for a document content hash.

    The hash is appended verbatim to DEDUPE_PREFIX.
    """
    marker_key = f"{DEDUPE_PREFIX}:{content_hash}"
    return marker_key
@dataclass
class DedupeResult:
    """Result of a deduplication check.

    Returned by ``check_duplicate``. When ``is_duplicate`` is False the other
    two fields keep their ``None`` defaults.
    """

    # True when a matching document was found in Redis or PostgreSQL.
    is_duplicate: bool
    # Id of the already-stored document, as a string; None when not a duplicate.
    existing_document_id: str | None = None
    # Which lookup matched.
    match_type: str | None = None  # "content_hash" | "canonical_url" | None
def _cached_document_id(value: Any) -> str:
    """Convert a Redis marker value into a document id string.

    Redis clients that are not configured to decode responses return
    ``bytes``; calling ``str()`` on those would yield ``"b'...'"`` rather
    than the id itself, so bytes are decoded explicitly.
    """
    if isinstance(value, (bytes, bytearray)):
        return bytes(value).decode("utf-8")
    return str(value)


async def check_duplicate(
    pool: asyncpg.Pool,
    rds: aioredis.Redis,
    *,
    content_hash: str,
    url: str | None = None,
    canonical_url: str | None = None,
) -> DedupeResult:
    """Check whether a document is a duplicate across all source types.

    Checks in order of cost:
    1. Redis content_hash marker (fast path)
    2. Redis canonical_url marker (fast path)
    3. PostgreSQL documents.content_hash (durable)
    4. PostgreSQL documents.canonical_url (cross-source)

    A PostgreSQL hit re-writes the Redis markers so the next check for the
    same document is served from the fast path.

    Args:
        pool: asyncpg pool used for the durable lookups.
        rds: Async Redis client used for the fast-path markers.
        content_hash: Content hash of the document; an empty value skips
            both hash lookups.
        url: Raw document URL; normalised with ``normalize_url`` when no
            ``canonical_url`` is supplied.
        canonical_url: Already-normalised URL; takes precedence over ``url``.

    Returns:
        A DedupeResult; on a hit it carries the existing document id and
        which key ("content_hash" or "canonical_url") matched.
    """
    # Resolve canonical URL if only raw URL provided
    resolved_canonical = canonical_url or (normalize_url(url) if url else None)

    # --- Redis fast path: content hash ---
    if content_hash:
        cached_id = await rds.get(_hash_dedupe_key(content_hash))
        if cached_id:
            logger.debug("Dedupe hit (redis content_hash) for %s", content_hash[:16])
            return DedupeResult(
                is_duplicate=True,
                existing_document_id=_cached_document_id(cached_id),
                match_type="content_hash",
            )

    # --- Redis fast path: canonical URL ---
    if resolved_canonical:
        cached_id = await rds.get(_url_dedupe_key(resolved_canonical))
        if cached_id:
            logger.debug(
                "Dedupe hit (redis canonical_url) for %s", resolved_canonical[:60]
            )
            return DedupeResult(
                is_duplicate=True,
                existing_document_id=_cached_document_id(cached_id),
                match_type="canonical_url",
            )

    # --- PostgreSQL fallback: content hash ---
    if content_hash:
        row = await pool.fetchrow(
            "SELECT id FROM documents WHERE content_hash = $1 LIMIT 1",
            content_hash,
        )
        if row:
            doc_id = str(row["id"])
            # Warm the Redis cache for future checks
            await _set_dedupe_markers(rds, content_hash, resolved_canonical, doc_id)
            logger.debug("Dedupe hit (pg content_hash) for %s", content_hash[:16])
            return DedupeResult(
                is_duplicate=True,
                existing_document_id=doc_id,
                match_type="content_hash",
            )

    # --- PostgreSQL fallback: canonical URL ---
    if resolved_canonical:
        row = await pool.fetchrow(
            "SELECT id FROM documents WHERE canonical_url = $1 LIMIT 1",
            resolved_canonical,
        )
        if row:
            doc_id = str(row["id"])
            await _set_dedupe_markers(rds, content_hash, resolved_canonical, doc_id)
            logger.debug(
                "Dedupe hit (pg canonical_url) for %s", resolved_canonical[:60]
            )
            return DedupeResult(
                is_duplicate=True,
                existing_document_id=doc_id,
                match_type="canonical_url",
            )

    return DedupeResult(is_duplicate=False)
async def mark_as_seen(
    rds: aioredis.Redis,
    *,
    content_hash: str,
    canonical_url: str | None,
    document_id: str,
) -> None:
    """Record a freshly persisted document in the Redis dedupe markers.

    Public, keyword-only front for ``_set_dedupe_markers``; it writes the
    content-hash marker and, when a canonical URL is given, the URL marker.
    """
    await _set_dedupe_markers(
        rds,
        content_hash=content_hash,
        canonical_url=canonical_url,
        document_id=document_id,
    )
async def _set_dedupe_markers(
    rds: aioredis.Redis,
    content_hash: str | None,
    canonical_url: str | None,
    document_id: str,
) -> None:
    """Write the Redis dedupe markers that point at *document_id*.

    One marker is written per non-empty identifier (content hash first, then
    canonical URL). Each marker expires after DEDUPE_TTL_SECONDS.
    """
    marker_sources = (
        (content_hash, _hash_dedupe_key),
        (canonical_url, _url_dedupe_key),
    )
    for identifier, build_key in marker_sources:
        if identifier:
            await rds.set(build_key(identifier), document_id, ex=DEDUPE_TTL_SECONDS)
async def dedupe_items(
    pool: asyncpg.Pool,
    rds: aioredis.Redis,
    items: list[dict[str, Any]],
) -> tuple[list[dict[str, Any]], list[dict[str, Any]]]:
    """Split ingestion items into those not yet stored and known duplicates.

    Each item is looked up with ``check_duplicate`` using its
    ``content_hash``, its ``url`` (falling back to ``link``) and its
    ``canonical_url``.

    Duplicate items are annotated in place with ``_dedupe_match_type`` and
    ``_dedupe_existing_id`` before being placed in the duplicate list.

    Returns (new_items, duplicate_items), each in input order.
    """
    fresh: list[dict[str, Any]] = []
    already_seen: list[dict[str, Any]] = []

    for entry in items:
        verdict = await check_duplicate(
            pool,
            rds,
            content_hash=entry.get("content_hash", ""),
            url=entry.get("url") or entry.get("link"),
            canonical_url=entry.get("canonical_url"),
        )
        if not verdict.is_duplicate:
            fresh.append(entry)
            continue
        entry["_dedupe_match_type"] = verdict.match_type
        entry["_dedupe_existing_id"] = verdict.existing_document_id
        already_seen.append(entry)

    return fresh, already_seen
+224
View File
@@ -0,0 +1,224 @@
"""Structured logging and distributed tracing for all Stonks Oracle services.
Provides:
- JSON-formatted structured log output for machine-parseable log aggregation
- Trace context (trace_id, span_id, service) propagated through log records
- Context manager for creating trace spans within a service
- Helper to configure logging for any service worker or API
Requirements: 12.1
Design: Section 12 (Observability and Operations)
"""
from __future__ import annotations
import json
import logging
import time
import uuid
from contextvars import ContextVar
from datetime import datetime, timezone
from typing import Any
# ---------------------------------------------------------------------------
# Trace context stored in contextvars for async-safe propagation
# ---------------------------------------------------------------------------
# Current trace id; "" until a trace is started or extracted from a payload.
_trace_id: ContextVar[str] = ContextVar("trace_id", default="")
# Id of the innermost active span; "" when no span is active.
_span_id: ContextVar[str] = ContextVar("span_id", default="")
# Name of the running service; "unknown" until setup_logging or
# set_trace_context assigns one.
_service_name: ContextVar[str] = ContextVar("service_name", default="unknown")
def get_trace_id() -> str:
    """Return the trace id of the current context ("" if none is set)."""
    return _trace_id.get()
def get_span_id() -> str:
    """Return the span id of the current context ("" if none is set)."""
    return _span_id.get()
def get_service_name() -> str:
    """Return the service name of the current context ("unknown" if unset)."""
    return _service_name.get()
def set_trace_context(
    trace_id: str | None = None,
    span_id: str | None = None,
    service: str | None = None,
) -> None:
    """Update the trace context variables of the current context.

    Only arguments that are not None are applied; the others keep their
    current value. Passing an empty string therefore clears that field.
    """
    updates = (
        (_trace_id, trace_id),
        (_span_id, span_id),
        (_service_name, service),
    )
    for variable, new_value in updates:
        if new_value is not None:
            variable.set(new_value)
def new_trace_id() -> str:
    """Return a fresh trace id: the first 16 hex digits of a random UUID4."""
    random_hex = uuid.uuid4().hex
    return random_hex[:16]
def new_span_id() -> str:
    """Return a fresh span id: the first 8 hex digits of a random UUID4."""
    random_hex = uuid.uuid4().hex
    return random_hex[:8]
# ---------------------------------------------------------------------------
# Span context manager for tracing within a service
# ---------------------------------------------------------------------------
class Span:
    """Lightweight span for distributed tracing.

    Usage::

        with Span("process_document", ticker="AAPL") as span:
            # ... do work ...
            span.set_attribute("doc_count", 5)

    On entry the span installs its own trace/span ids in the context
    variables; on exit it logs a ``span.end`` event carrying its duration,
    status and attributes, then restores the previous context. The
    restoration runs even if emitting the log event fails.
    """

    def __init__(self, operation: str, **attributes: Any) -> None:
        self.operation = operation
        # Read before this span installs itself, so this is the enclosing
        # span's id ("" when there is none).
        self.parent_span_id = get_span_id()
        self.span_id = new_span_id()
        # Join the active trace if there is one, otherwise start a new trace.
        self.trace_id = get_trace_id() or new_trace_id()
        self.attributes: dict[str, Any] = dict(attributes)
        self.start_time: float = 0.0
        self.duration_ms: float = 0.0
        # ContextVar tokens captured in __enter__ and consumed in __exit__.
        self._token_trace: Any = None
        self._token_span: Any = None
        self._logger = logging.getLogger(get_service_name() or "tracing")

    def set_attribute(self, key: str, value: Any) -> None:
        """Attach (or overwrite) an attribute reported in the span.end event."""
        self.attributes[key] = value

    def __enter__(self) -> Span:
        self.start_time = time.monotonic()
        self._token_trace = _trace_id.set(self.trace_id)
        self._token_span = _span_id.set(self.span_id)
        return self

    def __exit__(self, exc_type: Any, exc_val: Any, exc_tb: Any) -> None:
        self.duration_ms = (time.monotonic() - self.start_time) * 1000
        status = "error" if exc_type else "ok"
        try:
            # Logged while this span's ids are still installed in the context.
            self._logger.info(
                "span.end",
                extra={
                    "span_operation": self.operation,
                    "span_status": status,
                    "span_duration_ms": round(self.duration_ms, 2),
                    "span_parent_id": self.parent_span_id,
                    "span_attributes": self.attributes,
                },
            )
        finally:
            # Restore parent span context. Tokens are single-use, so they are
            # cleared after the reset to keep a repeated exit harmless.
            if self._token_span is not None:
                _span_id.reset(self._token_span)
                self._token_span = None
            if self._token_trace is not None:
                _trace_id.reset(self._token_trace)
                self._token_trace = None
# ---------------------------------------------------------------------------
# JSON log formatter
# ---------------------------------------------------------------------------
class JSONFormatter(logging.Formatter):
    """Render every log record as one JSON object per line.

    The object always carries timestamp (UTC, ISO 8601), level, logger,
    message and the current service/trace/span ids. A fixed set of optional
    record attributes is copied when present and not None, and a formatted
    traceback is added under ``exception`` when the record has one.
    """

    def format(self, record: logging.LogRecord) -> str:
        emitted_at = datetime.fromtimestamp(record.created, tz=timezone.utc)
        payload: dict[str, Any] = {
            "timestamp": emitted_at.isoformat(),
            "level": record.levelname,
            "logger": record.name,
            "message": record.getMessage(),
            "service": get_service_name(),
            "trace_id": get_trace_id(),
            "span_id": get_span_id(),
        }

        # Extra fields set by Span or by callers passing extra={...}
        optional_fields = (
            "span_operation", "span_status", "span_duration_ms",
            "span_parent_id", "span_attributes",
            "ticker", "document_id", "source_type", "job_id",
            "duration_ms", "error", "count",
        )
        for name in optional_fields:
            value = getattr(record, name, None)
            if value is None:
                continue
            payload[name] = value

        exc_info = record.exc_info
        if exc_info and exc_info[1]:
            payload["exception"] = self.formatException(exc_info)

        # default=str keeps non-JSON-native values from failing the log call.
        return json.dumps(payload, default=str)
# ---------------------------------------------------------------------------
# Setup helper
# ---------------------------------------------------------------------------
class _TraceContextFilter(logging.Filter):
    """Stamp each record with the service name and the live trace context.

    The human-readable format references ``%(service)s``, ``%(trace_id)s``
    and ``%(span_id)s``; this filter supplies them per record so the text
    output shows the ids that are current when the record is handled.
    """

    def __init__(self, service_name: str) -> None:
        super().__init__()
        self._service_name = service_name

    def filter(self, record: logging.LogRecord) -> bool:
        record.service = self._service_name
        record.trace_id = get_trace_id()
        record.span_id = get_span_id()
        return True


def setup_logging(
    service_name: str,
    level: str = "INFO",
    json_output: bool = True,
) -> None:
    """Configure structured logging for a service.

    Call this once at service startup (before any log calls). It replaces
    every handler on the root logger with a single stream handler.

    Args:
        service_name: Identifies this service in log output (e.g. "ingestion_worker").
        level: Log level string (DEBUG, INFO, WARNING, ERROR). Unknown names
            fall back to INFO.
        json_output: If True, emit JSON lines. If False, use a human-readable
            format that includes the service name and current trace/span ids.
    """
    _service_name.set(service_name)

    root = logging.getLogger()
    root.setLevel(getattr(logging, level.upper(), logging.INFO))

    # Remove existing handlers to avoid duplicate output
    root.handlers.clear()

    handler = logging.StreamHandler()
    if json_output:
        handler.setFormatter(JSONFormatter())
    else:
        # The filter fills in service/trace_id/span_id on every record, so
        # the format needs no static defaults (which would always render the
        # trace fields as empty strings).
        handler.addFilter(_TraceContextFilter(service_name))
        handler.setFormatter(logging.Formatter(
            "%(asctime)s [%(levelname)s] %(name)s (%(service)s) "
            "trace=%(trace_id)s span=%(span_id)s %(message)s",
        ))
    root.addHandler(handler)
# ---------------------------------------------------------------------------
# Trace context propagation through job payloads
# ---------------------------------------------------------------------------
def inject_trace_context(payload: dict[str, Any]) -> dict[str, Any]:
    """Stamp the active trace id onto an outgoing job payload.

    Intended to be called right before a job is enqueued so that the worker
    picking it up can continue the same trace. The payload is modified in
    place (key ``_trace_id``) and also returned; when no trace is active it
    is returned untouched.
    """
    active_trace = get_trace_id()
    if not active_trace:
        return payload
    payload["_trace_id"] = active_trace
    return payload
def extract_trace_context(payload: dict[str, Any]) -> None:
    """Adopt the trace carried by an incoming job payload.

    Meant to run at the start of job processing. Uses the payload's
    ``_trace_id`` when it is present and non-empty, otherwise starts a new
    trace; in both cases a new span id is generated for this job.
    """
    carried = payload.get("_trace_id")
    trace_id = carried if carried else new_trace_id()
    set_trace_context(trace_id=trace_id, span_id=new_span_id())
+696
View File
@@ -0,0 +1,696 @@
"""Metadata persistence for market payloads, documents, and broker events.
Persists structured metadata records to PostgreSQL for all ingested artifacts.
Each source type has its own persistence path:
- market_api → market_snapshots table
- news_api / filings_api / web_scrape → documents + document_company_mentions
- broker → order_events or market_snapshots (for position/account snapshots)
Requirements: 3.3, 3.4, 8.3, 9.2
"""
from __future__ import annotations
import json
import logging
from datetime import datetime, timedelta, timezone
from typing import Any
import asyncpg
from services.shared.content import content_hash_str, normalize_url
logger = logging.getLogger("metadata")
async def persist_market_snapshot(
    pool: asyncpg.Pool,
    *,
    company_id: str | None,
    ticker: str,
    snapshot_type: str,
    data: dict[str, Any],
    source_provider: str,
    storage_ref: str,
    content_hash: str,
    captured_at: datetime | None = None,
) -> str:
    """Insert one row into ``market_snapshots``.

    ``data`` is serialised with ``json.dumps`` and bound as ``jsonb``. When
    ``captured_at`` is not supplied the current UTC time is stored.

    Returns the id of the inserted row, converted to ``str``.
    """
    insert_sql = """INSERT INTO market_snapshots
           (company_id, ticker, snapshot_type, data, source_provider,
            captured_at, storage_ref, content_hash)
           VALUES ($1, $2, $3, $4::jsonb, $5, $6, $7, $8)
           RETURNING id"""
    captured = captured_at or datetime.now(timezone.utc)
    encoded_data = json.dumps(data)

    snapshot_id = await pool.fetchval(
        insert_sql,
        company_id,
        ticker,
        snapshot_type,
        encoded_data,
        source_provider,
        captured,
        storage_ref,
        content_hash,
    )
    logger.debug("Persisted market snapshot %s for %s", snapshot_id, ticker)
    return str(snapshot_id)
async def persist_document(
    pool: asyncpg.Pool,
    *,
    document_type: str,
    source_type: str,
    publisher: str,
    url: str | None,
    canonical_url: str | None,
    title: str,
    published_at: datetime | None,
    content_hash: str,
    storage_ref: str,
    language: str = "en",
) -> str | None:
    """Insert a row into ``documents`` unless its content hash is already stored.

    A lookup by ``content_hash`` runs first; if any row matches, nothing is
    inserted. New rows are created with status ``'ingested'`` and
    ``storage_ref`` is stored in the ``raw_storage_ref`` column.

    Returns the new document id as ``str``, or None for a duplicate hash.
    """
    already_stored = await pool.fetchval(
        "SELECT 1 FROM documents WHERE content_hash = $1", content_hash
    )
    if already_stored:
        return None

    insert_sql = """INSERT INTO documents
           (document_type, source_type, publisher, url, canonical_url,
            title, published_at, content_hash, raw_storage_ref,
            language, status)
           VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, 'ingested')
           RETURNING id"""
    new_id = await pool.fetchval(
        insert_sql,
        document_type,
        source_type,
        publisher,
        url,
        canonical_url,
        title,
        published_at,
        content_hash,
        storage_ref,
        language,
    )
    # Titles are truncated in the log line only, never in the stored row.
    logger.debug("Persisted document %s (%s)", new_id, title[:60] if title else "")
    return str(new_id)
async def update_document_parse_results(
    pool: asyncpg.Pool,
    *,
    document_id: str,
    normalized_storage_ref: str | None,
    parser_output_ref: str | None,
    parse_quality_score: float,
    parse_confidence: str,
    status: str,
) -> None:
    """Store the outcome of the parsing stage on a ``documents`` row.

    Writes the normalized-text and parser-output references, the quality
    score, the confidence label and the new status, and sets ``updated_at``
    to ``NOW()``.

    Requirements: 4.1, 4.3, 9.1
    """
    update_sql = """UPDATE documents SET
           normalized_storage_ref = $2,
           parser_output_ref = $3,
           parse_quality_score = $4,
           parse_confidence = $5,
           status = $6,
           updated_at = NOW()
           WHERE id = $1"""
    bound_values = (
        document_id,
        normalized_storage_ref,
        parser_output_ref,
        parse_quality_score,
        parse_confidence,
        status,
    )
    await pool.execute(update_sql, *bound_values)
    logger.debug(
        "Updated document %s parse results: quality=%.2f confidence=%s status=%s",
        document_id, parse_quality_score, parse_confidence, status,
    )
async def persist_document_company_mention(
    pool: asyncpg.Pool,
    *,
    document_id: str,
    company_id: str,
    ticker: str,
    mention_type: str = "direct",
    confidence: float = 1.0,
) -> str:
    """Insert a ``document_company_mentions`` row tying a document to a company.

    ``document_id`` and ``company_id`` are cast to ``uuid`` in the statement.

    Returns the id of the new mention row as ``str``.
    """
    insert_sql = """INSERT INTO document_company_mentions
           (document_id, company_id, ticker, mention_type, confidence)
           VALUES ($1::uuid, $2::uuid, $3, $4, $5)
           RETURNING id"""
    new_mention = await pool.fetchval(
        insert_sql, document_id, company_id, ticker, mention_type, confidence
    )
    return str(new_mention)
async def persist_broker_event(
    pool: asyncpg.Pool,
    *,
    ticker: str,
    event_type: str,
    data: dict[str, Any],
    source_provider: str,
    storage_ref: str,
    content_hash: str,
    captured_at: datetime | None = None,
) -> str:
    """Store a broker snapshot as a ``market_snapshots`` row.

    The row's ``snapshot_type`` is ``event_type`` prefixed with ``broker_``
    (e.g. broker_positions, broker_account, broker_orders). No company id is
    written. When ``captured_at`` is omitted the current UTC time is used.

    Returns the id of the inserted row as ``str``.
    """
    insert_sql = """INSERT INTO market_snapshots
           (ticker, snapshot_type, data, source_provider,
            captured_at, storage_ref, content_hash)
           VALUES ($1, $2, $3::jsonb, $4, $5, $6, $7)
           RETURNING id"""
    captured = captured_at or datetime.now(timezone.utc)
    broker_snapshot_type = f"broker_{event_type}"
    encoded_data = json.dumps(data)

    snapshot_id = await pool.fetchval(
        insert_sql,
        ticker,
        broker_snapshot_type,
        encoded_data,
        source_provider,
        captured,
        storage_ref,
        content_hash,
    )
    logger.debug("Persisted broker event %s for %s", snapshot_id, ticker)
    return str(snapshot_id)
def _resolve_document_type(source_type: str) -> str:
    """Translate an ingestion source_type into the stored document_type.

    ``filings_api`` yields ``filing`` and ``web_scrape`` yields
    ``press_release``; ``news_api`` and every unrecognised source type yield
    ``article``.
    """
    if source_type == "filings_api":
        return "filing"
    if source_type == "web_scrape":
        return "press_release"
    return "article"
def _extract_publisher(item: dict[str, Any]) -> str:
    """Extract the publisher name from an adapter item dict.

    Looks at ``item["publisher"]`` first, then at ``item["source"]``, which
    may be either a dict with a ``name`` entry or a plain value.

    Always returns a string; a missing, empty or ``None`` name yields ``""``.
    """
    publisher = item.get("publisher")
    if publisher:
        return str(publisher)

    source = item.get("source")
    if isinstance(source, dict):
        # "name" may be present but None (or a non-string); never leak that
        # through a function declared to return str.
        return str(source.get("name") or "")
    if source:
        return str(source)
    return ""
def _parse_published_at(item: dict[str, Any]) -> datetime | None:
    """Parse the publication time from an adapter item.

    Reads ``publishedAt`` (falling back to ``published_at``). The value may
    be a ``datetime`` or an ISO-8601 string; a trailing ``Z`` is accepted.

    Returns a timezone-aware datetime, or None when the field is missing,
    empty or cannot be parsed. Values without an offset are interpreted as
    UTC so that callers never receive a mix of naive and aware datetimes.
    """
    raw = item.get("publishedAt") or item.get("published_at")
    if not raw:
        return None

    if isinstance(raw, datetime):
        parsed = raw
    else:
        try:
            # "Z" is rewritten because fromisoformat() only accepts it
            # natively on newer Python versions.
            parsed = datetime.fromisoformat(str(raw).replace("Z", "+00:00"))
        except (ValueError, TypeError):
            return None

    if parsed.tzinfo is None:
        parsed = parsed.replace(tzinfo=timezone.utc)
    return parsed
async def persist_ingestion_items(
    pool: asyncpg.Pool,
    *,
    source_type: str,
    ticker: str,
    company_id: str | None,
    items: list[dict[str, Any]],
    storage_ref: str,
    adapter_metadata: dict[str, Any],
    content_hash: str,
) -> tuple[int, list[str]]:
    """Dispatch ingestion items to the persistence path for their source type.

    ``market_api`` items become market snapshots, ``broker`` items become
    broker snapshots, and every other source type (news_api, filings_api,
    web_scrape) is stored as documents. The provider name (default
    ``"unknown"``) and, for broker items, the endpoint (default
    ``"positions"``) are read from ``adapter_metadata``.

    Returns (new_item_count, list_of_new_ids).
    """
    if source_type not in ("market_api", "broker"):
        # Document types: news_api, filings_api, web_scrape
        return await _persist_document_items(
            pool,
            source_type=source_type,
            ticker=ticker,
            company_id=company_id,
            items=items,
            storage_ref=storage_ref,
        )

    provider = adapter_metadata.get("provider", "unknown")
    if source_type == "market_api":
        return await _persist_market_items(
            pool,
            ticker=ticker,
            company_id=company_id,
            items=items,
            storage_ref=storage_ref,
            provider=provider,
            content_hash=content_hash,
        )

    return await _persist_broker_items(
        pool,
        ticker=ticker,
        items=items,
        storage_ref=storage_ref,
        provider=provider,
        endpoint=adapter_metadata.get("endpoint", "positions"),
        content_hash=content_hash,
    )
async def _persist_market_items(
    pool: asyncpg.Pool,
    *,
    ticker: str,
    company_id: str | None,
    items: list[dict[str, Any]],
    storage_ref: str,
    provider: str,
    content_hash: str,
) -> tuple[int, list[str]]:
    """Store each market data item as its own ``market_snapshots`` row.

    Every item is hashed individually (key-sorted JSON) and skipped when a
    snapshot with that hash already exists. The batch-level ``content_hash``
    argument is accepted but not used here.

    Returns (count, ids) of the rows that were inserted.
    """
    inserted: list[str] = []
    for entry in items:
        entry_hash = content_hash_str(json.dumps(entry, sort_keys=True))

        # Skip duplicates
        seen_before = await pool.fetchval(
            "SELECT 1 FROM market_snapshots WHERE content_hash = $1", entry_hash
        )
        if seen_before:
            continue

        inserted.append(
            await persist_market_snapshot(
                pool,
                company_id=company_id,
                ticker=ticker,
                snapshot_type=_infer_market_snapshot_type(entry),
                data=entry,
                source_provider=provider,
                storage_ref=storage_ref,
                content_hash=entry_hash,
            )
        )
    return len(inserted), inserted
def _infer_market_snapshot_type(item: dict[str, Any]) -> str:
    """Classify a market data item by the keys it contains.

    Checked in order: all of o/h/l/c present -> ``bar`` (Polygon aggregate
    bars); ``market_cap`` or ``sic_code`` -> ``ticker_details``; ``ask`` or
    ``bid`` -> ``quote``; anything else -> ``snapshot``.
    """
    if {"o", "h", "l", "c"} <= item.keys():
        return "bar"
    if any(field in item for field in ("market_cap", "sic_code")):
        return "ticker_details"
    if any(field in item for field in ("ask", "bid")):
        return "quote"
    return "snapshot"
async def _persist_broker_items(
    pool: asyncpg.Pool,
    *,
    ticker: str,
    items: list[dict[str, Any]],
    storage_ref: str,
    provider: str,
    endpoint: str,
    content_hash: str,
) -> tuple[int, list[str]]:
    """Store each broker item as a ``broker_``-prefixed market snapshot.

    Items are hashed individually (key-sorted JSON) and skipped when a
    snapshot with that hash already exists; ``endpoint`` becomes the event
    type. The batch-level ``content_hash`` argument is accepted but not used
    here.

    Returns (count, ids) of the rows that were inserted.
    """
    inserted: list[str] = []
    for entry in items:
        entry_hash = content_hash_str(json.dumps(entry, sort_keys=True))
        seen_before = await pool.fetchval(
            "SELECT 1 FROM market_snapshots WHERE content_hash = $1", entry_hash
        )
        if seen_before:
            continue

        inserted.append(
            await persist_broker_event(
                pool,
                ticker=ticker,
                event_type=endpoint,
                data=entry,
                source_provider=provider,
                storage_ref=storage_ref,
                content_hash=entry_hash,
            )
        )
    return len(inserted), inserted
async def _persist_document_items(
    pool: asyncpg.Pool,
    *,
    source_type: str,
    ticker: str,
    company_id: str | None,
    items: list[dict[str, Any]],
    storage_ref: str,
) -> tuple[int, list[str]]:
    """Persist document items (news, filings, web scrape) to documents table.

    For each item the title is taken from ``title`` or ``name`` and the URL
    from ``url`` or ``link``; a key that is present but empty or None falls
    through to the alternative, matching how ``dedupe_items`` reads URLs.
    The content hash is the item's own ``content_hash`` when set, otherwise
    a hash of its key-sorted JSON.

    Items whose content hash is already stored are skipped entirely (no
    company mention is added for them). Newly stored documents are linked to
    ``company_id`` when one is given.

    Returns (count, ids) of the documents that were inserted.
    """
    doc_type = _resolve_document_type(source_type)
    ids: list[str] = []

    for item in items:
        item_hash = item.get("content_hash") or content_hash_str(
            json.dumps(item, sort_keys=True)
        )
        # `or` rather than a .get() default: adapters may emit the primary
        # key with an empty/None value while the fallback key is populated.
        title = item.get("title") or item.get("name") or ""
        url = item.get("url") or item.get("link") or ""
        canonical_url = item.get("canonical_url") or (
            normalize_url(url) if url else None
        )
        published_at = _parse_published_at(item)
        publisher = _extract_publisher(item)

        doc_id = await persist_document(
            pool,
            document_type=doc_type,
            source_type=source_type,
            publisher=publisher,
            url=url or None,
            canonical_url=canonical_url,
            title=title,
            published_at=published_at,
            content_hash=item_hash,
            storage_ref=storage_ref,
        )
        if doc_id is None:
            # Duplicate content hash: persist_document inserted nothing.
            continue

        # Link document to company if we have a company_id
        if company_id:
            await persist_document_company_mention(
                pool,
                document_id=doc_id,
                company_id=company_id,
                ticker=ticker,
            )
        ids.append(doc_id)

    return len(ids), ids
# --- Retry and failure tracking (Requirement 3.4) ---

# Backoff constants — match scheduler defaults for consistency
RETRY_BACKOFF_BASE: int = 60
RETRY_BACKOFF_MAX: int = 3600
RETRY_MAX_COUNT: int = 10


def compute_next_retry_at(
    retry_count: int,
    now: datetime | None = None,
    base: int = RETRY_BACKOFF_BASE,
    cap: int = RETRY_BACKOFF_MAX,
) -> datetime:
    """Return when a source may next be retried, using exponential backoff.

    The delay is ``base * 2**retry_count`` seconds, with the exponent limited
    to 8 and the resulting delay limited to ``cap``.

    Args:
        retry_count: Current retry count (before incrementing).
        now: Reference timestamp (defaults to UTC now).
        base: Base delay in seconds.
        cap: Maximum delay in seconds.

    Returns:
        ``now`` shifted forward by the computed delay.
    """
    reference = now if now else datetime.now(timezone.utc)
    exponent = 8 if 8 < retry_count else retry_count
    backoff_seconds = base * (2 ** exponent)
    if cap < backoff_seconds:
        backoff_seconds = cap
    return reference + timedelta(seconds=backoff_seconds)
async def get_source_retry_count(
    pool: asyncpg.Pool,
    source_id: str,
) -> int:
    """Return the retry count carried by a source's latest run, if it failed.

    Only the single most recent ``ingestion_runs`` row (by ``started_at``)
    is inspected. The result is 0 when the source has no runs, when that
    latest run's status is anything other than ``'failed'``, or when its
    stored retry count is NULL.
    """
    latest_run = await pool.fetchrow(
        """SELECT status, retry_count
           FROM ingestion_runs
           WHERE source_id = $1::uuid
           ORDER BY started_at DESC
           LIMIT 1""",
        source_id,
    )
    if not latest_run or latest_run["status"] != "failed":
        return 0
    return latest_run["retry_count"] or 0
async def record_retrieval_failure(
    pool: asyncpg.Pool,
    run_id: str,
    source_id: str,
    error_message: str,
    retry_count: int | None = None,
    now: datetime | None = None,
) -> dict[str, Any]:
    """Mark an ingestion run as failed and store its retry policy state.

    The ``ingestion_runs`` row identified by ``run_id`` is updated with:
    - status: 'failed'
    - error_message: the failure reason
    - retry_count: the previous count plus one
    - next_retry_at: computed via exponential backoff from the previous count
    - completed_at: ``now`` (defaults to the current UTC time)

    The previous count is ``retry_count`` when provided; otherwise it is
    looked up with ``get_source_retry_count``. The run is reported as
    exhausted once the new count reaches RETRY_MAX_COUNT.

    Returns a dict with the recorded retry state for observability.

    Requirement 3.4
    """
    failed_at = now or datetime.now(timezone.utc)

    # NOTE(review): the lookup reads the source's latest run by started_at.
    # If the run being failed here is already that latest row and is not yet
    # marked 'failed', the lookup yields 0 — confirm against the caller.
    if retry_count is None:
        previous_count = await get_source_retry_count(pool, source_id)
    else:
        previous_count = retry_count
    attempt = previous_count + 1

    # Backoff is driven by the count *before* this failure.
    next_retry = compute_next_retry_at(previous_count, now=failed_at)
    exhausted = attempt >= RETRY_MAX_COUNT

    await pool.execute(
        """UPDATE ingestion_runs
           SET status = 'failed',
               error_message = $2,
               retry_count = $3,
               next_retry_at = $4,
               completed_at = $5
           WHERE id = $1""",
        run_id,
        error_message,
        attempt,
        next_retry,
        failed_at,
    )

    if exhausted:
        logger.warning(
            "Source %s exhausted retries (%d/%d): %s",
            source_id, attempt, RETRY_MAX_COUNT, error_message,
        )
    else:
        logger.info(
            "Source %s failed (retry %d/%d), next retry at %s: %s",
            source_id, attempt, RETRY_MAX_COUNT,
            next_retry.isoformat(), error_message,
        )

    return {
        "run_id": run_id,
        "source_id": source_id,
        "retry_count": attempt,
        "next_retry_at": next_retry.isoformat(),
        "exhausted": exhausted,
        "error_message": error_message,
    }
async def persist_document_intelligence(
    pool: asyncpg.Pool,
    *,
    document_id: str,
    summary: str,
    macro_themes: list[str],
    novelty_score: float,
    source_credibility: float,
    extraction_warnings: list[str],
    confidence: float,
    model_provider: str,
    model_name: str,
    prompt_version: str,
    schema_version: str,
    raw_output_ref: str | None = None,
    prompt_ref: str | None = None,
    validation_status: str = "valid",
    validation_errors: list[str] | None = None,
    retry_count: int = 0,
) -> str:
    """Insert a ``document_intelligence`` row for a document.

    The list-valued fields (macro themes, extraction warnings, validation
    errors) are JSON-encoded and bound as ``jsonb``; a missing
    ``validation_errors`` is stored as an empty list.

    Returns the id of the new intelligence row as ``str``.

    Requirements: 5.3, 5.4, 9.2
    """
    insert_sql = """INSERT INTO document_intelligence
           (document_id, summary, macro_themes, novelty_score,
            source_credibility, extraction_warnings, confidence,
            model_provider, model_name, prompt_version, schema_version,
            raw_output_ref, prompt_ref, validation_status,
            validation_errors, retry_count)
           VALUES ($1::uuid, $2, $3::jsonb, $4, $5, $6::jsonb, $7,
                   $8, $9, $10, $11, $12, $13, $14, $15::jsonb, $16)
           RETURNING id"""
    themes_json = json.dumps(macro_themes)
    warnings_json = json.dumps(extraction_warnings)
    errors_json = json.dumps(validation_errors or [])

    new_id = await pool.fetchval(
        insert_sql,
        document_id,
        summary,
        themes_json,
        novelty_score,
        source_credibility,
        warnings_json,
        confidence,
        model_provider,
        model_name,
        prompt_version,
        schema_version,
        raw_output_ref,
        prompt_ref,
        validation_status,
        errors_json,
        retry_count,
    )
    logger.debug("Persisted document intelligence %s for doc %s", new_id, document_id)
    return str(new_id)
async def persist_document_impact(
    pool: asyncpg.Pool,
    *,
    intelligence_id: str,
    company_id: str,
    ticker: str,
    relevance: float,
    sentiment: str,
    impact_score: float,
    impact_horizon: str,
    catalyst_type: str,
    key_facts: list[str],
    risks: list[str],
    evidence_spans: list[str],
) -> str:
    """Insert a per-company ``document_impact_records`` row.

    The row references its parent intelligence record and company (both cast
    to ``uuid``); key facts, risks and evidence spans are JSON-encoded and
    bound as ``jsonb``.

    Returns the id of the new impact record as ``str``.

    Requirements: 5.3, 5.5, 9.2
    """
    insert_sql = """INSERT INTO document_impact_records
           (intelligence_id, company_id, ticker, relevance, sentiment,
            impact_score, impact_horizon, catalyst_type,
            key_facts, risks, evidence_spans)
           VALUES ($1::uuid, $2::uuid, $3, $4, $5, $6, $7, $8,
                   $9::jsonb, $10::jsonb, $11::jsonb)
           RETURNING id"""
    facts_json = json.dumps(key_facts)
    risks_json = json.dumps(risks)
    evidence_json = json.dumps(evidence_spans)

    new_id = await pool.fetchval(
        insert_sql,
        intelligence_id,
        company_id,
        ticker,
        relevance,
        sentiment,
        impact_score,
        impact_horizon,
        catalyst_type,
        facts_json,
        risks_json,
        evidence_json,
    )
    logger.debug("Persisted impact record %s for %s", new_id, ticker)
    return str(new_id)
async def update_document_status(
    pool: asyncpg.Pool,
    *,
    document_id: str,
    status: str,
) -> None:
    """Set ``status`` on a ``documents`` row and bump ``updated_at``.

    Used to advance documents through the pipeline: ingested → parsed → extracted → failed.

    Requirements: 5.4
    """
    update_sql = """UPDATE documents SET status = $2, updated_at = NOW() WHERE id = $1::uuid"""
    await pool.execute(update_sql, document_id, status)
    logger.debug("Updated document %s status to %s", document_id, status)
async def reset_source_retry_state(
    pool: asyncpg.Pool,
    source_id: str,
) -> None:
    """Clear the retry state stored on a source's most recent run.

    The latest ``ingestion_runs`` row for the source (by ``started_at``)
    gets ``retry_count = 0`` and ``next_retry_at = NULL``. Intended to be
    called after a successful ingestion so accumulated backoff is dropped.
    """
    reset_sql = """UPDATE ingestion_runs
           SET retry_count = 0, next_retry_at = NULL
           WHERE id = (
               SELECT id FROM ingestion_runs
               WHERE source_id = $1::uuid
               ORDER BY started_at DESC
               LIMIT 1
           )"""
    await pool.execute(reset_sql, source_id)
+317
View File
@@ -0,0 +1,317 @@
"""Prometheus metrics for all Stonks Oracle pipeline stages.
Provides counters, histograms, and gauges covering:
- Ingestion: items fetched, new items, errors, adapter latency
- Parsing: documents parsed, quality scores, low-quality flags
- Extraction: attempts, successes, failures, latency, confidence, retries
- Aggregation: trend windows computed, signal counts, contradiction scores
- Lake publication: facts published per table, write latency
- Trading: orders submitted, rejected, filled, risk evaluations
Requirements: 12.1, 12.2
Design: Section 12 (Observability and Operations)
"""
from __future__ import annotations
from prometheus_client import Counter, Gauge, Histogram, Info
# ---------------------------------------------------------------------------
# Service info
# ---------------------------------------------------------------------------
# Static service metadata exposed as a Prometheus Info metric.
SERVICE_INFO = Info(
    "stonks_oracle",
    "Stonks Oracle service metadata",
)
# ---------------------------------------------------------------------------
# Ingestion metrics
# ---------------------------------------------------------------------------
# Job outcomes, split by source type and final status.
INGESTION_JOBS_TOTAL = Counter(
    "stonks_ingestion_jobs_total",
    "Total ingestion jobs processed",
    labelnames=["source_type", "status"],
)

# Item-level counters: everything fetched, what was new, what was deduplicated.
INGESTION_ITEMS_FETCHED = Counter(
    "stonks_ingestion_items_fetched_total",
    "Total items fetched from external sources",
    labelnames=["source_type"],
)
INGESTION_ITEMS_NEW = Counter(
    "stonks_ingestion_items_new_total",
    "New (non-duplicate) items ingested",
    labelnames=["source_type"],
)
INGESTION_ITEMS_DEDUPED = Counter(
    "stonks_ingestion_items_deduped_total",
    "Items skipped due to deduplication",
    labelnames=["source_type"],
)

INGESTION_ERRORS = Counter(
    "stonks_ingestion_errors_total",
    "Ingestion errors by source type",
    labelnames=["source_type"],
)

# Fetch latency; buckets span 100 ms to one minute.
INGESTION_ADAPTER_DURATION = Histogram(
    "stonks_ingestion_adapter_duration_seconds",
    "Adapter fetch latency in seconds",
    labelnames=["source_type"],
    buckets=(0.1, 0.5, 1, 2, 5, 10, 30, 60),
)
# ---------------------------------------------------------------------------
# Parsing metrics
# ---------------------------------------------------------------------------
# Parse job outcomes, split by status.
PARSE_JOBS_TOTAL = Counter(
    "stonks_parse_jobs_total",
    "Total parse jobs processed",
    labelnames=["status"],
)

# Quality scores are bucketed in steps of 0.1 up to 1.0.
PARSE_QUALITY_SCORE = Histogram(
    "stonks_parse_quality_score",
    "Distribution of parser quality scores",
    buckets=(0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0),
)

PARSE_LOW_QUALITY_TOTAL = Counter(
    "stonks_parse_low_quality_total",
    "Documents flagged as low quality by the parser",
)

# Parse latency; buckets span 50 ms to 10 s.
PARSE_DURATION = Histogram(
    "stonks_parse_duration_seconds",
    "Parse job duration in seconds",
    buckets=(0.05, 0.1, 0.25, 0.5, 1, 2, 5, 10),
)
# ---------------------------------------------------------------------------
# Extraction metrics
# ---------------------------------------------------------------------------
# Extraction job outcomes, split by status.
EXTRACTION_JOBS_TOTAL = Counter(
    "stonks_extraction_jobs_total",
    "Total extraction jobs processed",
    labelnames=["status"],
)

# Attempts include retries; retries are also counted on their own.
EXTRACTION_ATTEMPTS = Counter(
    "stonks_extraction_attempts_total",
    "Total Ollama extraction attempts (including retries)",
)
EXTRACTION_RETRIES = Counter(
    "stonks_extraction_retries_total",
    "Extraction retry count",
)

# Extraction latency; buckets span 1 s to 2 min.
EXTRACTION_DURATION = Histogram(
    "stonks_extraction_duration_seconds",
    "Extraction total duration in seconds",
    buckets=(1, 2, 5, 10, 20, 30, 60, 120),
)

# Confidence scores are bucketed in steps of 0.1 up to 1.0.
EXTRACTION_CONFIDENCE = Histogram(
    "stonks_extraction_confidence",
    "Distribution of extraction confidence scores",
    buckets=(0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0),
)

EXTRACTION_VALIDATION_ERRORS = Counter(
    "stonks_extraction_validation_errors_total",
    "Total validation errors across extractions",
)

# Estimated token usage, split by the "direction" label.
EXTRACTION_TOKEN_ESTIMATE = Counter(
    "stonks_extraction_tokens_total",
    "Estimated token usage",
    labelnames=["direction"],
)
# ---------------------------------------------------------------------------
# Aggregation metrics
# ---------------------------------------------------------------------------
# All labelled aggregation metrics are split by the "window" label.
AGGREGATION_WINDOWS_COMPUTED = Counter(
    "stonks_aggregation_windows_total",
    "Trend windows computed",
    labelnames=["window"],
)
AGGREGATION_SIGNALS_PROCESSED = Counter(
    "stonks_aggregation_signals_total",
    "Signals processed during aggregation",
    labelnames=["window"],
)

# Buckets are finer near zero (0.05 steps up to 0.2), coarser towards 1.0.
AGGREGATION_CONTRADICTION_SCORE = Histogram(
    "stonks_aggregation_contradiction_score",
    "Distribution of contradiction scores in trend windows",
    buckets=(0.0, 0.05, 0.1, 0.15, 0.2, 0.3, 0.4, 0.5, 0.75, 1.0),
)

# Aggregation latency; buckets span 50 ms to 10 s.
AGGREGATION_DURATION = Histogram(
    "stonks_aggregation_duration_seconds",
    "Aggregation job duration in seconds",
    labelnames=["window"],
    buckets=(0.05, 0.1, 0.25, 0.5, 1, 2, 5, 10),
)
# ---------------------------------------------------------------------------
# Recommendation metrics
# ---------------------------------------------------------------------------
# Generated recommendations, split by action and mode.
RECOMMENDATION_GENERATED = Counter(
    "stonks_recommendations_total",
    "Recommendations generated",
    labelnames=["action", "mode"],
)

RECOMMENDATION_SUPPRESSED = Counter(
    "stonks_recommendations_suppressed_total",
    "Recommendations suppressed due to low data quality",
)

# Confidence scores are bucketed in steps of 0.1 up to 1.0.
RECOMMENDATION_CONFIDENCE = Histogram(
    "stonks_recommendation_confidence",
    "Distribution of recommendation confidence scores",
    buckets=(0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0),
)
# ---------------------------------------------------------------------------
# Lake publication metrics
# ---------------------------------------------------------------------------
# Every lake publication metric is split by target table.
LAKE_FACTS_PUBLISHED = Counter(
    "stonks_lake_facts_published_total",
    "Analytical facts published to the lakehouse",
    labelnames=["table_name"],
)

# Write latency; buckets span 10 ms to 5 s.
LAKE_PUBLISH_DURATION = Histogram(
    "stonks_lake_publish_duration_seconds",
    "Lake publication write latency in seconds",
    labelnames=["table_name"],
    buckets=(0.01, 0.05, 0.1, 0.25, 0.5, 1, 2, 5),
)

LAKE_PUBLISH_ERRORS = Counter(
    "stonks_lake_publish_errors_total",
    "Lake publication errors",
    labelnames=["table_name"],
)
LAKE_PUBLISH_BYTES = Counter(
    "stonks_lake_publish_bytes_total",
    "Total bytes written to the lakehouse",
    labelnames=["table_name"],
)
# ---------------------------------------------------------------------------
# Trading / broker metrics
# ---------------------------------------------------------------------------
# Order lifecycle counters.
ORDERS_SUBMITTED = Counter(
    "stonks_orders_submitted_total",
    "Orders submitted to broker",
    labelnames=["side", "order_type", "mode"],
)
ORDERS_REJECTED = Counter(
    "stonks_orders_rejected_total",
    "Orders rejected before broker submission",
    labelnames=["reason_category"],
)
ORDERS_FILLED = Counter(
    "stonks_orders_filled_total",
    "Orders filled by broker",
    labelnames=["side"],
)
ORDERS_DUPLICATES_PREVENTED = Counter(
    "stonks_orders_duplicates_prevented_total",
    "Duplicate orders prevented by idempotency checks",
    labelnames=["detected_via"],
)

# Risk evaluation counters: overall result, and failures per individual check.
RISK_EVALUATIONS_TOTAL = Counter(
    "stonks_risk_evaluations_total",
    "Risk evaluations performed",
    labelnames=["result"],
)
RISK_CHECK_FAILURES = Counter(
    "stonks_risk_check_failures_total",
    "Individual risk check failures",
    labelnames=["check_name"],
)

POSITIONS_SYNCED = Counter(
    "stonks_positions_synced_total",
    "Position sync operations completed",
)
# ---------------------------------------------------------------------------
# Active gauges
# ---------------------------------------------------------------------------
# In-flight job count, split by pipeline stage.
ACTIVE_JOBS = Gauge(
    "stonks_active_jobs",
    "Currently processing jobs by stage",
    labelnames=["stage"],
)
# ---------------------------------------------------------------------------
# Alerting metrics
# ---------------------------------------------------------------------------
# Alert transitions: firings carry rule and severity, resolutions only the rule.
ALERTS_FIRED = Counter(
    "stonks_alerts_fired_total",
    "Total alerts fired by rule",
    labelnames=["rule", "severity"],
)
ALERTS_RESOLVED = Counter(
    "stonks_alerts_resolved_total",
    "Total alerts resolved by rule",
    labelnames=["rule"],
)

# Evaluation cycle latency; buckets span 10 ms to 5 s.
ALERT_CHECK_DURATION = Histogram(
    "stonks_alert_check_duration_seconds",
    "Duration of alert evaluation cycle",
    buckets=(0.01, 0.05, 0.1, 0.25, 0.5, 1, 2, 5),
)

# Per-rule state flag: 1 while firing, 0 once resolved.
ALERT_ACTIVE = Gauge(
    "stonks_alert_active",
    "Whether an alert rule is currently firing (1) or resolved (0)",
    labelnames=["rule"],
)
# ---------------------------------------------------------------------------
# Dead-letter queue metrics
# ---------------------------------------------------------------------------
# Every dead-letter metric is split by the "queue" label.
DLQ_ITEMS_TOTAL = Counter(
    "stonks_dlq_items_total",
    "Jobs sent to dead-letter queues",
    labelnames=["queue"],
)
DLQ_REPLAYED_TOTAL = Counter(
    "stonks_dlq_replayed_total",
    "Jobs replayed from dead-letter queues",
    labelnames=["queue"],
)
DLQ_DEPTH = Gauge(
    "stonks_dlq_depth",
    "Current dead-letter queue depth",
    labelnames=["queue"],
)
+10
View File
@@ -46,6 +46,15 @@ def retry_key(job_id: str) -> str:
return f"{RETRY_PREFIX}:{job_id}"
# Dead-letter queues
DLQ_PREFIX = f"{PREFIX}:dlq"
def dlq_key(queue_name: str) -> str:
    """Build the dead-letter queue key for the source queue ``queue_name``.

    The key is ``DLQ_PREFIX`` followed by a colon and the queue name.
    """
    key = f"{DLQ_PREFIX}:{queue_name}"
    return key
# --- Queue names ---
QUEUE_INGESTION = "ingestion"
QUEUE_PARSING = "parsing"
@@ -54,3 +63,4 @@ QUEUE_AGGREGATION = "aggregation"
QUEUE_RECOMMENDATION = "recommendation"
QUEUE_LAKE_PUBLISH = "lake_publish"
QUEUE_TRADE = "trade"
QUEUE_BROKER = "broker_orders"
+306
View File
@@ -0,0 +1,306 @@
"""Data retention and lifecycle controls for raw and derived artifacts.
Provides configurable per-bucket retention policies, expired object cleanup
from MinIO, and expired metadata cleanup from PostgreSQL.
Requirements: N3 (preserve source metadata, access policy, and retention policy)
Design ref: Section 5.2 (MinIO bucket layout), Section 10 (Reliability and Safety)
"""
from __future__ import annotations
import logging
from dataclasses import dataclass
from datetime import datetime, timedelta, timezone
import asyncpg
from minio import Minio
from services.shared.config import BUCKET_RETENTION_FIELDS, RetentionConfig
from services.shared.storage import ALL_BUCKETS
logger = logging.getLogger("retention")
@dataclass
class RetentionPolicy:
    """Effective retention settings for one storage bucket."""

    # Bucket the policy applies to.
    bucket_name: str
    # Objects older than this many days are treated as expired.
    retention_days: int
    # Populated from the policy row when loaded from the database.
    # NOTE(review): no cleanup function in this module reads this flag —
    # confirm whether archiving is handled elsewhere.
    archive_before_delete: bool = False
@dataclass
class CleanupResult:
    """Counters describing one cleanup pass over a single bucket."""

    # Bucket the pass ran against.
    bucket_name: str
    # All counters start at zero and are filled in by the cleanup code.
    objects_scanned: int = 0
    objects_deleted: int = 0
    bytes_freed: int = 0
    db_rows_deleted: int = 0
def default_retention_days(bucket: str, config: RetentionConfig) -> int:
    """Look up the config-level retention period for ``bucket``.

    ``BUCKET_RETENTION_FIELDS`` maps a bucket name to the attribute on
    ``config`` holding its retention in days. Buckets without a mapping, or
    whose mapped attribute is missing from ``config``, fall back to 365 days.
    """
    fallback_days = 365
    attr_name = BUCKET_RETENTION_FIELDS.get(bucket)
    if not attr_name:
        return fallback_days
    return getattr(config, attr_name, fallback_days)
def resolve_policies(config: RetentionConfig) -> list[RetentionPolicy]:
    """Create one config-derived policy per bucket listed in ``ALL_BUCKETS``.

    The policies come back in ``ALL_BUCKETS`` order; each carries only the
    bucket name and its default retention period.
    """
    policies: list[RetentionPolicy] = []
    for bucket in ALL_BUCKETS:
        days = default_retention_days(bucket, config)
        policies.append(RetentionPolicy(bucket_name=bucket, retention_days=days))
    return policies
async def load_db_policies(pool: asyncpg.Pool) -> dict[str, RetentionPolicy]:
    """Fetch retention policy overrides stored in PostgreSQL.

    Only ``retention_policies`` rows with ``active = TRUE`` and
    ``artifact_class = 'default'`` are read. These overrides are meant to
    take precedence over the config defaults.

    Returns:
        Policies keyed by bucket name. If several rows share a bucket name,
        the last one returned by the query wins.
    """
    records = await pool.fetch(
        """SELECT bucket_name, retention_days, archive_before_delete
           FROM retention_policies
           WHERE active = TRUE AND artifact_class = 'default'"""
    )
    overrides: dict[str, RetentionPolicy] = {}
    for record in records:
        name = record["bucket_name"]
        overrides[name] = RetentionPolicy(
            bucket_name=name,
            retention_days=record["retention_days"],
            archive_before_delete=record["archive_before_delete"],
        )
    return overrides
def merge_policies(
    config_policies: list[RetentionPolicy],
    db_policies: dict[str, RetentionPolicy],
) -> list[RetentionPolicy]:
    """Overlay database overrides onto the config-derived policies.

    The result has one entry per config policy, in the same order. Where a
    DB policy exists for the same bucket name it replaces the config one.
    DB policies for buckets that have no config policy are not included.
    """
    return [
        db_policies.get(default.bucket_name, default)
        for default in config_policies
    ]
def cutoff_date(retention_days: int, now: datetime | None = None) -> datetime:
    """Return the moment before which objects count as expired.

    Args:
        retention_days: Retention window length in days.
        now: Reference time; the current UTC time is used when omitted.

    Returns:
        The reference time shifted back by ``retention_days`` days.
    """
    reference = now or datetime.now(timezone.utc)
    window = timedelta(days=retention_days)
    return reference - window
def list_expired_objects(
    client: Minio,
    bucket: str,
    retention_days: int,
    batch_size: int = 1000,
    now: datetime | None = None,
) -> list[str]:
    """Collect the names of objects in ``bucket`` that are past retention.

    An object is expired when its ``last_modified`` timestamp is earlier
    than the retention cutoff. Objects with no ``last_modified`` or no name
    are skipped. Collection stops once ``batch_size`` names have been found.

    Listing errors are logged and not raised; whatever was collected before
    the error is still returned.

    Args:
        client: MinIO client used for the recursive listing.
        bucket: Bucket to scan.
        retention_days: Retention window length in days.
        batch_size: Number of names after which scanning stops.
        now: Reference time for the cutoff (defaults to the current UTC time).

    Returns:
        Expired object names, in listing order.
    """
    threshold = cutoff_date(retention_days, now)
    names: list[str] = []
    try:
        for entry in client.list_objects(bucket, recursive=True):
            modified = entry.last_modified
            if not modified or not (modified < threshold):
                continue
            if not entry.object_name:
                continue
            names.append(entry.object_name)
            if len(names) >= batch_size:
                break
    except Exception:
        logger.exception("Error listing objects in bucket %s", bucket)
    return names
def delete_expired_objects(
    client: Minio,
    bucket: str,
    object_names: list[str],
) -> int:
    """Remove the named objects from ``bucket`` one at a time.

    A failed removal is logged as a warning (with traceback) and does not
    stop the remaining removals.

    Returns:
        How many objects were removed without error.
    """
    removed = 0
    for object_name in object_names:
        try:
            client.remove_object(bucket, object_name)
        except Exception:
            logger.warning("Failed to delete %s/%s", bucket, object_name, exc_info=True)
        else:
            removed += 1
    return removed
def cleanup_bucket(
    client: Minio,
    policy: RetentionPolicy,
    batch_size: int = 1000,
    now: datetime | None = None,
) -> CleanupResult:
    """Apply ``policy`` to its bucket: find expired objects and delete them.

    At most one batch of expired objects is handled per call.

    Args:
        client: MinIO client used for listing and deletion.
        policy: Bucket name and retention period to enforce.
        batch_size: Upper bound on objects collected for this pass.
        now: Reference time for the cutoff (defaults to the current UTC time).

    Returns:
        A ``CleanupResult`` in which ``objects_scanned`` is the number of
        expired objects found and ``objects_deleted`` the number removed.
        ``bytes_freed`` and ``db_rows_deleted`` are left at zero here.
    """
    bucket = policy.bucket_name
    days = policy.retention_days
    outcome = CleanupResult(bucket_name=bucket)

    stale = list_expired_objects(client, bucket, days, batch_size=batch_size, now=now)
    outcome.objects_scanned = len(stale)

    if not stale:
        logger.debug("Bucket %s: no expired objects (retention=%dd)", bucket, days)
        return outcome

    outcome.objects_deleted = delete_expired_objects(client, bucket, stale)
    logger.info(
        "Bucket %s: scanned=%d deleted=%d (retention=%dd)",
        bucket,
        outcome.objects_scanned,
        outcome.objects_deleted,
        days,
    )
    return outcome
# --- PostgreSQL metadata cleanup ---
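# Tables holding operational metadata that should be cleaned up when the
# corresponding MinIO artifacts are expired. Each entry is (table name,
# DELETE statement); $1 is the cutoff, compared against that table's own
# timestamp column (started_at for ingestion_runs, captured_at for
# market_snapshots).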
DB_CLEANUP_QUERIES: list[tuple[str, str]] = [
    # (table name used in log messages, DELETE statement taking the cutoff as $1)
    ("ingestion_runs", "DELETE FROM ingestion_runs WHERE started_at < $1"),
    ("market_snapshots", "DELETE FROM market_snapshots WHERE captured_at < $1"),
]
async def cleanup_expired_db_records(
    pool: asyncpg.Pool,
    retention_days: int,
    now: datetime | None = None,
) -> int:
    """Delete operational metadata rows older than the retention cutoff.

    Runs every statement in ``DB_CLEANUP_QUERIES`` on a single pooled
    connection, passing the cutoff as the only parameter. A failure on one
    table is logged and does not prevent the remaining tables from being
    processed.

    Args:
        pool: Connection pool to acquire the connection from.
        retention_days: Retention window length in days.
        now: Reference time for the cutoff (defaults to the current UTC time).

    Returns:
        Total number of rows deleted across all tables that succeeded.
    """
    threshold = cutoff_date(retention_days, now)
    deleted_total = 0
    async with pool.acquire() as conn:
        for table, statement in DB_CLEANUP_QUERIES:
            try:
                status_tag = await conn.execute(statement, threshold)
                # The command status looks like "DELETE N"; N is the row count.
                removed = int(status_tag.split()[-1]) if status_tag else 0
                deleted_total += removed
                if removed > 0:
                    logger.info(
                        "Cleaned %d expired rows from %s (cutoff=%s)",
                        removed,
                        table,
                        threshold.isoformat(),
                    )
            except Exception:
                logger.exception("Error cleaning table %s", table)
    return deleted_total
async def record_retention_run(
    pool: asyncpg.Pool,
    bucket_name: str,
    result: CleanupResult,
    status: str = "completed",
    error_message: str | None = None,
) -> None:
    """Insert one row into ``retention_runs`` describing a cleanup pass.

    Args:
        pool: Connection pool the INSERT is executed on.
        bucket_name: Bucket the pass ran against.
        result: Counters to persist.
        status: Outcome label stored with the row.
        error_message: Optional failure description stored with the row.

    ``completed_at`` is set by the database to ``NOW()``.
    """
    insert_sql = """INSERT INTO retention_runs
           (bucket_name, objects_scanned, objects_deleted, bytes_freed,
            db_rows_deleted, completed_at, status, error_message)
           VALUES ($1, $2, $3, $4, $5, NOW(), $6, $7)"""
    # Order must match the $1..$7 placeholders above.
    row_values = (
        bucket_name,
        result.objects_scanned,
        result.objects_deleted,
        result.bytes_freed,
        result.db_rows_deleted,
        status,
        error_message,
    )
    await pool.execute(insert_sql, *row_values)
async def run_retention_cleanup(
    minio_client: Minio,
    pool: asyncpg.Pool,
    config: RetentionConfig,
    now: datetime | None = None,
) -> list[CleanupResult]:
    """Run one full retention cleanup cycle.

    Steps:
      1. Resolve policies from config defaults overlaid with DB overrides
         (config alone is used if the overrides cannot be loaded).
      2. Delete expired MinIO objects bucket by bucket.
      3. Record each bucket's outcome in ``retention_runs``.
      4. Delete expired PostgreSQL metadata.

    Failures are isolated per step: a bucket whose cleanup raises is reported
    with an all-zero result and recorded as ``failed``; a failure to *record*
    a run is logged but neither changes the bucket's result nor stops the
    remaining buckets or the metadata cleanup.

    Args:
        minio_client: MinIO client used for object cleanup.
        pool: PostgreSQL pool used for policy lookup, run records and
            metadata cleanup.
        config: Retention configuration (defaults and ``batch_size``).
        now: Reference time for all cutoffs (defaults to the current UTC time).

    Returns:
        Exactly one ``CleanupResult`` per resolved policy, in policy order.
    """
    config_policies = resolve_policies(config)
    try:
        db_policies = await load_db_policies(pool)
    except Exception:
        # Keep the traceback so the reason for the fallback is diagnosable.
        logger.warning(
            "Could not load DB retention policies, using config defaults",
            exc_info=True,
        )
        db_policies = {}
    policies = merge_policies(config_policies, db_policies)

    results: list[CleanupResult] = []
    for policy in policies:
        status = "completed"
        error_message: str | None = None
        # NOTE(review): cleanup_bucket is a synchronous call, so the MinIO
        # requests run on the event loop thread — confirm this is acceptable.
        try:
            result = cleanup_bucket(
                minio_client, policy,
                batch_size=config.batch_size, now=now,
            )
        except Exception:
            logger.exception("Retention cleanup failed for bucket %s", policy.bucket_name)
            result = CleanupResult(bucket_name=policy.bucket_name)
            status, error_message = "failed", "See logs"
        results.append(result)

        # Recording is bookkeeping only: it must not turn a finished cleanup
        # into a second (zeroed) result, nor abort the rest of the cycle.
        try:
            await record_retention_run(
                pool, policy.bucket_name, result,
                status=status, error_message=error_message,
            )
        except Exception:
            logger.exception(
                "Could not record retention run for bucket %s", policy.bucket_name
            )

    # Without policies there is no retention period to derive a cutoff from.
    if not policies:
        return results

    # NOTE(review): the minimum is taken over every bucket policy, not only
    # the raw buckets — confirm that is the intended window for DB metadata.
    min_retention = min(p.retention_days for p in policies)
    try:
        db_deleted = await cleanup_expired_db_records(pool, min_retention, now=now)
        if db_deleted > 0:
            logger.info("Total DB rows cleaned: %d", db_deleted)
    except Exception:
        logger.exception("DB retention cleanup failed")
    return results
+37
View File
@@ -108,6 +108,41 @@ class DocumentIntelligence(BaseModel):
# --- Trend Summary ---
class MarketContext(BaseModel):
    """Recent market data features for a symbol, used to enrich aggregation.

    Every numeric feature is optional and defaults to ``None``;
    ``bars_available`` defaults to 0.
    """

    ticker: str = ""
    # Percentage price change over the window.
    price_change_pct: Optional[float] = None
    # Average daily volume.
    avg_volume: Optional[float] = None
    # Volume relative to the prior period.
    volume_change_pct: Optional[float] = None
    # Standard deviation of price within the window.
    volatility: Optional[float] = None
    latest_close: Optional[float] = None
    latest_bar_at: Optional[datetime] = None
    bars_available: int = 0

    @property
    def has_data(self) -> bool:
        """Return True when at least one bar is available."""
        return 0 < self.bars_available
class DisagreementDetail(BaseModel):
    """One explicit disagreement between document signals.

    Contradictory signals are kept apart here instead of being collapsed
    into a single score, so downstream consumers can inspect *why* the
    signals conflict.

    Requirements: 6.4
    """

    # Aspect the documents disagree on, e.g. "sentiment", "catalyst",
    # "impact_horizon".
    dimension: str = ""
    # Ids of the documents on each side of the disagreement.
    positive_doc_ids: List[str] = Field(default_factory=list)
    negative_doc_ids: List[str] = Field(default_factory=list)
    # Weight attributed to each side.
    positive_weight: float = 0.0
    negative_weight: float = 0.0
    description: str = ""
class TrendSummary(BaseModel):
entity_type: str = "company"
entity_id: str = ""
@@ -120,6 +155,8 @@ class TrendSummary(BaseModel):
dominant_catalysts: List[str] = Field(default_factory=list)
material_risks: List[str] = Field(default_factory=list)
contradiction_score: float = Field(ge=0, le=1, default=0.0)
disagreement_details: List[DisagreementDetail] = Field(default_factory=list)
market_context: Optional[MarketContext] = None
generated_at: datetime = Field(default_factory=datetime.utcnow)
+352
View File
@@ -0,0 +1,352 @@
"""Raw artifact upload to MinIO.
Provides a reusable storage layer for uploading raw artifacts (API payloads,
HTML, normalized text, model outputs) to MinIO with consistent path conventions,
bucket management, and content-type handling.
Bucket layout follows the design spec:
- stonks-raw-market — raw market API payloads
- stonks-raw-news — raw news API payloads and article HTML
- stonks-raw-filings — raw filings and issuer event payloads
- stonks-normalized — cleaned text and parser outputs
- stonks-llm-prompts — prompts and schemas used
- stonks-llm-results — raw model outputs and validation reports
- stonks-lakehouse — partitioned analytical datasets and table metadata
- stonks-audit — execution traces and exported reports
Object path pattern:
/{stage}/{symbol}/{yyyy}/{mm}/{dd}/{document_id}/{artifact_type}.{ext}
Requirements: 3.1, 3.2, 3.3, 9.1
"""
import io
import logging
from datetime import datetime, timezone
from typing import Mapping
from minio import Minio
from minio.error import S3Error
logger = logging.getLogger("storage")
# All known buckets the platform uses
ALL_BUCKETS = [
    "stonks-raw-market",   # raw market API payloads
    "stonks-raw-news",     # raw news API payloads and article HTML
    "stonks-raw-filings",  # raw filings and issuer event payloads
    "stonks-normalized",   # cleaned text and parser outputs
    "stonks-llm-prompts",  # prompts and schemas used
    "stonks-llm-results",  # raw model outputs and validation reports
    "stonks-lakehouse",    # partitioned analytical datasets and table metadata
    "stonks-audit",        # execution traces and exported reports
]
# Map source_type to the correct raw bucket
SOURCE_BUCKET_MAP: dict[str, str] = {
    # Market data and broker payloads share the market bucket.
    "market_api": "stonks-raw-market",
    "broker": "stonks-raw-market",
    # News API payloads and scraped pages share the news bucket.
    "news_api": "stonks-raw-news",
    "web_scrape": "stonks-raw-news",
    "filings_api": "stonks-raw-filings",
}
# Map artifact type to content type and file extension
ARTIFACT_CONTENT_TYPES: dict[str, tuple[str, str]] = {
    # artifact type -> (MIME content type, file extension without the dot)
    "raw_json": ("application/json", "json"),
    "raw_html": ("text/html", "html"),
    "raw_text": ("text/plain", "txt"),
    "raw_payload": ("application/octet-stream", "bin"),
}
def bucket_for_source(source_type: str) -> str:
    """Resolve the raw bucket for ``source_type``.

    Source types missing from ``SOURCE_BUCKET_MAP`` fall back to
    ``"stonks-raw-market"``.
    """
    try:
        return SOURCE_BUCKET_MAP[source_type]
    except KeyError:
        return "stonks-raw-market"
def build_artifact_path(
    source_type: str,
    ticker: str,
    document_id: str,
    artifact_name: str = "raw",
    ext: str = "json",
    timestamp: datetime | None = None,
) -> str:
    """Compose the object key for an artifact.

    Layout: ``{source_type}/{ticker}/{yyyy}/{mm}/{dd}/{document_id}/{artifact_name}.{ext}``
    with month and day zero-padded to two digits and no leading slash.

    Args:
        source_type: First path segment.
        ticker: Company ticker symbol.
        document_id: Document or run identifier.
        artifact_name: File stem of the artifact.
        ext: File extension without the dot.
        timestamp: Date used for the ``yyyy/mm/dd`` segments; the current
            UTC time is used when omitted.
    """
    when = timestamp or datetime.now(timezone.utc)
    date_dir = f"{when.year}/{when.month:02d}/{when.day:02d}"
    file_name = f"{artifact_name}.{ext}"
    return f"{source_type}/{ticker}/{date_dir}/{document_id}/{file_name}"
def storage_ref(bucket: str, path: str) -> str:
    """Return the ``s3://{bucket}/{path}`` URI for a stored artifact."""
    uri = f"s3://{bucket}/{path}"
    return uri
def ensure_buckets(client: Minio, buckets: list[str] | None = None) -> list[str]:
    """Create any of the requested buckets that do not exist yet.

    Args:
        client: MinIO client instance.
        buckets: Buckets to ensure. ``None`` (the default) means every bucket
            in ``ALL_BUCKETS``; an explicit empty list means there is nothing
            to ensure.

    Returns:
        Names of the buckets that were created by this call, in request order.

    Raises:
        S3Error: If checking or creating a bucket fails. The error is logged
            and re-raised; buckets later in the list are not processed.
    """
    # Only None selects the defaults. Testing truthiness here would make an
    # explicit empty list silently create every platform bucket.
    target_buckets = ALL_BUCKETS if buckets is None else buckets
    created: list[str] = []
    for bucket in target_buckets:
        try:
            if not client.bucket_exists(bucket):
                client.make_bucket(bucket)
                created.append(bucket)
                logger.info("Created bucket: %s", bucket)
        except S3Error as e:
            logger.error("Failed to ensure bucket %s: %s", bucket, e)
            raise
    return created
def upload_artifact(
    client: Minio,
    bucket: str,
    path: str,
    data: bytes,
    content_type: str = "application/json",
    metadata: Mapping[str, str] | None = None,
) -> str:
    """Store ``data`` as an object in MinIO and return its ``s3://`` reference.

    Args:
        client: MinIO client instance.
        bucket: Target bucket name.
        path: Object path within the bucket.
        data: Raw bytes to upload.
        content_type: MIME type recorded for the object.
        metadata: Optional user metadata to attach to the object.

    Returns:
        ``s3://`` URI pointing to the uploaded object.
    """
    size = len(data)
    stream = io.BytesIO(data)
    client.put_object(
        bucket,
        path,
        stream,
        length=size,
        content_type=content_type,
        metadata=metadata,
    )
    uri = storage_ref(bucket, path)
    logger.debug("Uploaded %d bytes to %s", size, uri)
    return uri
def upload_raw_artifact(
    client: Minio,
    source_type: str,
    ticker: str,
    document_id: str,
    data: bytes,
    artifact_type: str = "raw_json",
    timestamp: datetime | None = None,
    metadata: Mapping[str, str] | None = None,
) -> str:
    """Store a raw payload using the standard bucket, path and content type.

    This is the primary entry point for ingestion workers to store raw
    payloads. The bucket is derived from ``source_type``, the content type
    and extension from ``artifact_type`` (unknown types are stored as
    ``application/octet-stream`` with a ``.bin`` extension), and the object
    is always named ``raw``.

    Args:
        client: MinIO client instance.
        source_type: One of market_api, news_api, filings_api, web_scrape, broker.
        ticker: Company ticker symbol.
        document_id: Unique document or run identifier.
        data: Raw bytes to upload.
        artifact_type: One of raw_json, raw_html, raw_text, raw_payload.
        timestamp: Override timestamp for path generation (defaults to now UTC).
        metadata: Optional user metadata dict.

    Returns:
        ``s3://`` URI pointing to the uploaded object.
    """
    content_type, extension = ARTIFACT_CONTENT_TYPES.get(
        artifact_type, ("application/octet-stream", "bin")
    )
    object_path = build_artifact_path(
        source_type=source_type,
        ticker=ticker,
        document_id=document_id,
        artifact_name="raw",
        ext=extension,
        timestamp=timestamp,
    )
    target_bucket = bucket_for_source(source_type)
    return upload_artifact(
        client,
        target_bucket,
        object_path,
        data,
        content_type=content_type,
        metadata=metadata,
    )
def upload_html_artifact(
    client: Minio,
    ticker: str,
    document_id: str,
    html_bytes: bytes,
    timestamp: datetime | None = None,
    metadata: Mapping[str, str] | None = None,
) -> str:
    """Store the raw HTML of a scraped web page.

    The object is written as ``raw.html`` under the ``web_scrape`` source
    path, in the bucket that ``bucket_for_source("web_scrape")`` resolves to,
    with content type ``text/html``.

    Returns:
        ``s3://`` URI pointing to the uploaded object.
    """
    source = "web_scrape"
    object_path = build_artifact_path(
        source_type=source,
        ticker=ticker,
        document_id=document_id,
        artifact_name="raw",
        ext="html",
        timestamp=timestamp,
    )
    return upload_artifact(
        client,
        bucket_for_source(source),
        object_path,
        html_bytes,
        content_type="text/html",
        metadata=metadata,
    )
def upload_normalized_text(
    client: Minio,
    ticker: str,
    document_id: str,
    text_bytes: bytes,
    timestamp: datetime | None = None,
    metadata: Mapping[str, str] | None = None,
) -> str:
    """Store normalized (parsed) text in the ``stonks-normalized`` bucket.

    Object key: ``parsed/{ticker}/{yyyy}/{mm}/{dd}/{document_id}/normalized.txt``
    (date taken from ``timestamp``, or the current UTC time when omitted).

    Returns:
        ``s3://`` URI pointing to the uploaded object.
    """
    # The shared path builder yields exactly the key layout documented above.
    object_path = build_artifact_path(
        source_type="parsed",
        ticker=ticker,
        document_id=document_id,
        artifact_name="normalized",
        ext="txt",
        timestamp=timestamp,
    )
    return upload_artifact(
        client,
        "stonks-normalized",
        object_path,
        text_bytes,
        content_type="text/plain",
        metadata=metadata,
    )
def upload_parser_output(
    client: Minio,
    ticker: str,
    document_id: str,
    output_bytes: bytes,
    timestamp: datetime | None = None,
    metadata: Mapping[str, str] | None = None,
) -> str:
    """Store structured parser output JSON in the ``stonks-normalized`` bucket.

    Object key: ``parsed/{ticker}/{yyyy}/{mm}/{dd}/{document_id}/parser_output.json``
    (date taken from ``timestamp``, or the current UTC time when omitted).

    Returns:
        ``s3://`` URI pointing to the uploaded object.
    """
    # The shared path builder yields exactly the key layout documented above.
    object_path = build_artifact_path(
        source_type="parsed",
        ticker=ticker,
        document_id=document_id,
        artifact_name="parser_output",
        ext="json",
        timestamp=timestamp,
    )
    return upload_artifact(
        client,
        "stonks-normalized",
        object_path,
        output_bytes,
        content_type="application/json",
        metadata=metadata,
    )
def upload_extraction_prompt(
    client: Minio,
    ticker: str,
    document_id: str,
    prompt_data: bytes,
    timestamp: datetime | None = None,
    metadata: Mapping[str, str] | None = None,
) -> str:
    """Store the extraction prompt and schema in ``stonks-llm-prompts``.

    Object key: ``extraction/{ticker}/{yyyy}/{mm}/{dd}/{document_id}/prompt.json``
    (date taken from ``timestamp``, or the current UTC time when omitted).

    Returns:
        ``s3://`` URI pointing to the uploaded object.
    """
    # The shared path builder yields exactly the key layout documented above.
    object_path = build_artifact_path(
        source_type="extraction",
        ticker=ticker,
        document_id=document_id,
        artifact_name="prompt",
        ext="json",
        timestamp=timestamp,
    )
    return upload_artifact(
        client,
        "stonks-llm-prompts",
        object_path,
        prompt_data,
        content_type="application/json",
        metadata=metadata,
    )
def upload_extraction_raw_output(
    client: Minio,
    ticker: str,
    document_id: str,
    output_data: bytes,
    attempt_index: int = 0,
    timestamp: datetime | None = None,
    metadata: Mapping[str, str] | None = None,
) -> str:
    """Store one raw model output in ``stonks-llm-results``.

    Object key:
    ``extraction/{ticker}/{yyyy}/{mm}/{dd}/{document_id}/raw_output_{attempt_index}.json``
    (date taken from ``timestamp``, or the current UTC time when omitted).
    The attempt index in the file name keeps outputs of different attempts
    under distinct keys.

    Returns:
        ``s3://`` URI pointing to the uploaded object.
    """
    # The shared path builder yields exactly the key layout documented above.
    object_path = build_artifact_path(
        source_type="extraction",
        ticker=ticker,
        document_id=document_id,
        artifact_name=f"raw_output_{attempt_index}",
        ext="json",
        timestamp=timestamp,
    )
    return upload_artifact(
        client,
        "stonks-llm-results",
        object_path,
        output_data,
        content_type="application/json",
        metadata=metadata,
    )
def upload_extraction_validation(
    client: Minio,
    ticker: str,
    document_id: str,
    validation_data: bytes,
    timestamp: datetime | None = None,
    metadata: Mapping[str, str] | None = None,
) -> str:
    """Store a validation report in ``stonks-llm-results``.

    Object key: ``extraction/{ticker}/{yyyy}/{mm}/{dd}/{document_id}/validation.json``
    (date taken from ``timestamp``, or the current UTC time when omitted).

    Returns:
        ``s3://`` URI pointing to the uploaded object.
    """
    # The shared path builder yields exactly the key layout documented above.
    object_path = build_artifact_path(
        source_type="extraction",
        ticker=ticker,
        document_id=document_id,
        artifact_name="validation",
        ext="json",
        timestamp=timestamp,
    )
    return upload_artifact(
        client,
        "stonks-llm-results",
        object_path,
        validation_data,
        content_type="application/json",
        metadata=metadata,
    )
def upload_extraction_intelligence(
    client: Minio,
    ticker: str,
    document_id: str,
    intelligence_data: bytes,
    timestamp: datetime | None = None,
    metadata: Mapping[str, str] | None = None,
) -> str:
    """Store the final intelligence object in ``stonks-llm-results``.

    Object key: ``extraction/{ticker}/{yyyy}/{mm}/{dd}/{document_id}/intelligence.json``
    (date taken from ``timestamp``, or the current UTC time when omitted).

    Returns:
        ``s3://`` URI pointing to the uploaded object.
    """
    # The shared path builder yields exactly the key layout documented above.
    object_path = build_artifact_path(
        source_type="extraction",
        ticker=ticker,
        document_id=document_id,
        artifact_name="intelligence",
        ext="json",
        timestamp=timestamp,
    )
    return upload_artifact(
        client,
        "stonks-llm-results",
        object_path,
        intelligence_data,
        content_type="application/json",
        metadata=metadata,
    )
def download_artifact(client: Minio, bucket: str, path: str) -> bytes:
    """Fetch an object from MinIO and return its full contents.

    The response is closed and its connection released whether or not the
    read succeeds.
    """
    response = client.get_object(bucket, path)
    try:
        payload = response.read()
    finally:
        # Always hand the connection back, even if the read failed.
        response.close()
        response.release_conn()
    return payload