phase 14-15: docker build validation and helm deployment
This commit is contained in:
@@ -0,0 +1,342 @@
|
||||
"""Operational alerting for Stonks Oracle pipeline health.
|
||||
|
||||
Evaluates alert rules against PostgreSQL operational state and emits
|
||||
structured log events and Prometheus metrics when thresholds are breached.
|
||||
|
||||
Alert rules:
|
||||
- source_failures: sustained source retrieval failures per source
|
||||
- schema_failure_spike: extraction validation failure rate exceeds threshold
|
||||
- analytical_lag: lake publication has not completed within threshold
|
||||
- broker_issues: consecutive broker submission errors
|
||||
|
||||
Requirements: 12.3
|
||||
Design: Section 12 (Observability and Operations)
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
from dataclasses import dataclass, field
|
||||
from datetime import datetime, timezone
|
||||
from typing import Any
|
||||
|
||||
import asyncpg
|
||||
|
||||
from services.shared.config import AlertingConfig
|
||||
from services.shared.metrics import (
|
||||
ALERT_ACTIVE,
|
||||
ALERT_CHECK_DURATION,
|
||||
ALERTS_FIRED,
|
||||
ALERTS_RESOLVED,
|
||||
)
|
||||
|
||||
logger = logging.getLogger("alerting")
|
||||
|
||||
|
||||
@dataclass
class Alert:
    """One firing of an alert rule.

    ``details`` holds rule-specific context. The alerting code in this module
    reads ``details["key"]`` to tell apart several firings of the same rule.
    """

    # Name of the rule that produced the alert (e.g. "source_failures").
    rule: str
    # Either "warning" or "critical".
    severity: str
    # Human-readable one-line description.
    summary: str
    # Structured, rule-specific context; a fresh dict per instance.
    details: dict[str, Any] = field(default_factory=dict)
    # Timezone-aware UTC timestamp taken when the instance is created.
    fired_at: datetime = field(default_factory=lambda: datetime.now(timezone.utc))
|
||||
|
||||
|
||||
@dataclass
class AlertState:
    """In-memory registry of the alerts that are currently firing.

    Entries are stored under ``"<rule>:<key>"``, where ``<key>`` is the
    alert's ``details["key"]`` (empty string when absent). Comparing the
    registry between evaluation passes reveals firing/resolved transitions.
    """

    active: dict[str, Alert] = field(default_factory=dict)

    def fire(self, alert: Alert) -> bool:
        """Store ``alert`` as active; return True only on its first firing."""
        slot = f"{alert.rule}:{alert.details.get('key', '')}"
        already_active = slot in self.active
        # Always overwrite so the registry holds the most recent instance.
        self.active[slot] = alert
        return not already_active

    def resolve(self, rule: str, key: str = "") -> bool:
        """Drop an active alert; return True if there was one to drop."""
        slot = f"{rule}:{key}"
        if slot not in self.active:
            return False
        del self.active[slot]
        return True

    def is_firing(self, rule: str, key: str = "") -> bool:
        """Return True when the given rule/key pair is currently active."""
        slot = f"{rule}:{key}"
        return slot in self.active
|
||||
|
||||
|
||||
async def check_source_failures(
    pool: asyncpg.Pool,
    config: AlertingConfig,
) -> list[Alert]:
    """Find sources whose most recent ingestion runs have all failed.

    For each source, the newest ``config.source_failure_threshold`` rows of
    ``ingestion_runs`` started within the last
    ``config.source_failure_window_hours`` hours are examined. A source is
    reported only when it has at least that many runs in the window and every
    one of them has status ``'failed'``.

    Returns:
        One ``"warning"`` alert per failing source, keyed by the source id.
        Because of the inner joins, only sources that have matching rows in
        both ``sources`` and ``companies`` are reported.
    """
    streak_query = """WITH recent_runs AS (
            SELECT source_id, status,
                ROW_NUMBER() OVER (PARTITION BY source_id ORDER BY started_at DESC) AS rn
            FROM ingestion_runs
            WHERE started_at >= NOW() - INTERVAL '1 hour' * $1
        ),
        failure_streaks AS (
            SELECT source_id,
                COUNT(*) FILTER (WHERE status = 'failed') AS consecutive_failures,
                COUNT(*) AS total_runs
            FROM recent_runs
            WHERE rn <= $2
            GROUP BY source_id
            HAVING COUNT(*) FILTER (WHERE status = 'failed') = COUNT(*)
                AND COUNT(*) >= $2
        )
        SELECT fs.source_id, fs.consecutive_failures,
            s.source_type, s.source_name, c.ticker
        FROM failure_streaks fs
        JOIN sources s ON s.id = fs.source_id
        JOIN companies c ON c.id = s.company_id"""

    failing = await pool.fetch(
        streak_query,
        config.source_failure_window_hours,
        config.source_failure_threshold,
    )

    def _build(record) -> Alert:
        # The source id doubles as the per-rule alert key.
        source_id = str(record["source_id"])
        return Alert(
            rule="source_failures",
            severity="warning",
            summary=(
                f"Source {record['source_name']} ({record['source_type']}) for "
                f"{record['ticker']} has {record['consecutive_failures']} consecutive failures"
            ),
            details={
                "key": source_id,
                "source_id": source_id,
                "source_type": record["source_type"],
                "source_name": record["source_name"],
                "ticker": record["ticker"],
                "consecutive_failures": record["consecutive_failures"],
            },
        )

    return [_build(record) for record in failing]
|
||||
|
||||
|
||||
async def check_schema_failure_spike(
    pool: asyncpg.Pool,
    config: AlertingConfig,
) -> list[Alert]:
    """Alert when the recent extraction failure rate reaches the threshold.

    Counts rows of ``model_performance_metrics`` recorded within the last
    ``config.schema_failure_window_hours`` hours and the subset where
    ``success`` is false.

    Returns:
        An empty list when there are no rows in the window or the failure
        rate is below ``config.schema_failure_rate_threshold``; otherwise a
        single alert keyed ``"global"``. Severity is ``"critical"`` at a
        failure rate of 50% or more and ``"warning"`` below that.
    """
    stats = await pool.fetchrow(
        """SELECT
            COUNT(*) AS total,
            COUNT(*) FILTER (WHERE NOT success) AS failed
        FROM model_performance_metrics
        WHERE recorded_at >= NOW() - INTERVAL '1 hour' * $1""",
        config.schema_failure_window_hours,
    )

    # No data in the window means there is no rate to compute.
    if not stats or stats["total"] == 0:
        return []

    total, failed = stats["total"], stats["failed"]
    failure_rate = failed / total

    if failure_rate < config.schema_failure_rate_threshold:
        return []

    if failure_rate >= 0.5:
        severity = "critical"
    else:
        severity = "warning"

    spike = Alert(
        rule="schema_failure_spike",
        severity=severity,
        summary=(
            f"Extraction schema failure rate is {failure_rate:.1%} "
            f"({failed}/{total}) in the last {config.schema_failure_window_hours}h"
        ),
        details={
            "key": "global",
            "total_extractions": total,
            "failed_extractions": failed,
            "failure_rate": round(failure_rate, 4),
            "threshold": config.schema_failure_rate_threshold,
            "window_hours": config.schema_failure_window_hours,
        },
    )
    return [spike]
|
||||
|
||||
|
||||
async def check_analytical_lag(
    pool: asyncpg.Pool,
    config: AlertingConfig,
) -> list[Alert]:
    """Report lake tables whose last successful publish is too old.

    Groups successful ``lake_publish`` rows of ``audit_events`` by table name
    and keeps the tables whose newest such row is older than
    ``config.lake_lag_threshold_minutes`` minutes. A table with no successful
    publish row at all produces no group and therefore no alert.

    Returns:
        One ``"warning"`` alert per stale table, keyed by the table name.
    """
    # NOTE(review): this query reads the JSONB column as "details", while the
    # audit writer's INSERT statement names the payload column "data" --
    # confirm which column audit_events actually has.
    stale = await pool.fetch(
        """SELECT
            details->>'table_name' AS table_name,
            MAX(created_at) AS last_publish
        FROM audit_events
        WHERE event_type = 'lake_publish'
            AND details->>'status' = 'success'
            AND details->>'table_name' IS NOT NULL
        GROUP BY details->>'table_name'
        HAVING MAX(created_at) < NOW() - INTERVAL '1 minute' * $1""",
        config.lake_lag_threshold_minutes,
    )

    reference = datetime.now(timezone.utc)
    lagging: list[Alert] = []
    for record in stale:
        name = record["table_name"]
        published_at = record["last_publish"]
        # Naive timestamps are treated as UTC so the subtraction is valid.
        if published_at.tzinfo is None:
            published_at = published_at.replace(tzinfo=timezone.utc)
        lag_minutes = (reference - published_at).total_seconds() / 60

        summary = (
            f"Lake table '{name}' last published {lag_minutes:.0f}m ago "
            f"(threshold: {config.lake_lag_threshold_minutes}m)"
        )
        details = {
            "key": name,
            "table_name": name,
            "last_publish": published_at.isoformat(),
            "lag_minutes": round(lag_minutes, 1),
            "threshold_minutes": config.lake_lag_threshold_minutes,
        }
        lagging.append(
            Alert(
                rule="analytical_lag",
                severity="warning",
                summary=summary,
                details=details,
            )
        )
    return lagging
|
||||
|
||||
|
||||
async def check_broker_issues(
    pool: asyncpg.Pool,
    config: AlertingConfig,
) -> list[Alert]:
    """Alert when enough broker-level errors occurred in the lookback window.

    Counts ``order_events`` rows of type ``broker_error``, ``broker_timeout``
    or ``connection_failed`` created within the last
    ``config.broker_error_window_hours`` hours. Only error events are
    examined, so successful events in between do not reset the count.

    The previous query limited the counted rows to the newest
    ``broker_error_threshold`` errors, so the reported ``error_count`` could
    never exceed the threshold. The count is now the true number of errors in
    the window; the firing condition (count >= threshold) is unchanged.

    Returns:
        A single ``"critical"`` alert keyed ``"global"`` when the count is at
        least ``config.broker_error_threshold``, otherwise an empty list.
    """
    rows = await pool.fetch(
        """SELECT COUNT(*) AS error_count
        FROM order_events
        WHERE created_at >= NOW() - INTERVAL '1 hour' * $1
            AND event_type IN ('broker_error', 'broker_timeout', 'connection_failed')""",
        config.broker_error_window_hours,
    )

    if not rows:
        return []

    error_count = rows[0]["error_count"]
    if error_count < config.broker_error_threshold:
        return []

    return [
        Alert(
            rule="broker_issues",
            severity="critical",
            summary=(
                f"{error_count} broker errors in the last "
                f"{config.broker_error_window_hours}h"
            ),
            details={
                "key": "global",
                "error_count": error_count,
                "threshold": config.broker_error_threshold,
                "window_hours": config.broker_error_window_hours,
            },
        )
    ]
|
||||
|
||||
|
||||
async def evaluate_alerts(
    pool: asyncpg.Pool,
    config: AlertingConfig,
    state: AlertState,
) -> list[Alert]:
    """Run every alert rule once and return the alerts that newly fired.

    Each rule check runs independently; an exception in one is logged and
    does not prevent the others from running. ``state`` is updated in place:
    alerts seen in this pass are recorded as active, and active alerts that
    no longer appear are resolved. Transitions are logged and reflected in
    the Prometheus metrics.

    A rule whose check raised is treated as "state unknown": its active
    alerts are left untouched. Without this, a failing check (for example a
    database outage) would contribute no alerts and every active alert of
    that rule would be reported as resolved.

    Args:
        pool: Connection pool passed through to each rule check.
        config: Thresholds passed through to each rule check.
        state: Registry of active alerts, mutated by this call.

    Returns:
        The alerts that were not active before this pass.
    """
    # (rule name, check coroutine function, log message on failure)
    checks = (
        ("source_failures", check_source_failures, "Error checking source failures"),
        ("schema_failure_spike", check_schema_failure_spike, "Error checking schema failure spike"),
        ("analytical_lag", check_analytical_lag, "Error checking analytical lag"),
        ("broker_issues", check_broker_issues, "Error checking broker issues"),
    )

    all_alerts: list[Alert] = []
    failed_rules: set[str] = set()

    with ALERT_CHECK_DURATION.time():
        for rule_name, check, failure_message in checks:
            try:
                all_alerts.extend(await check(pool, config))
            except Exception:
                # Best effort: keep evaluating the remaining rules.
                logger.exception(failure_message)
                failed_rules.add(rule_name)

    # Track which rule+key combos are firing in this pass.
    current_keys: set[str] = set()
    newly_fired: list[Alert] = []

    for alert in all_alerts:
        current_keys.add(f"{alert.rule}:{alert.details.get('key', '')}")

        if not state.fire(alert):
            continue

        # New alert firing.
        ALERTS_FIRED.labels(rule=alert.rule, severity=alert.severity).inc()
        ALERT_ACTIVE.labels(rule=alert.rule).set(1)
        newly_fired.append(alert)
        logger.warning(
            "ALERT FIRING: [%s] %s",
            alert.rule,
            alert.summary,
            extra={
                "alert_rule": alert.rule,
                "alert_severity": alert.severity,
                "alert_details": alert.details,
            },
        )

    # Anything active but not seen in this pass has resolved -- unless its
    # rule could not be evaluated, in which case nothing is known about it.
    for key in set(state.active) - current_keys:
        rule, _, detail_key = key.partition(":")
        if rule in failed_rules:
            continue
        if not state.resolve(rule, detail_key):
            continue

        ALERTS_RESOLVED.labels(rule=rule).inc()
        # Only clear the gauge when no alert of this rule remains active.
        if not any(k.startswith(f"{rule}:") for k in state.active):
            ALERT_ACTIVE.labels(rule=rule).set(0)
        logger.info(
            "ALERT RESOLVED: [%s] key=%s",
            rule,
            detail_key,
        )

    return newly_fired
|
||||
@@ -0,0 +1,493 @@
|
||||
"""Execution audit trail - records every step from recommendation to market outcome.
|
||||
|
||||
Writes structured audit events to the audit_events table so the full
|
||||
decision chain is traceable: recommendation → risk evaluation → order
|
||||
submission → broker response → fill/rejection/cancellation.
|
||||
|
||||
Each event captures the entity type, entity ID, event type, actor,
|
||||
and a JSONB data payload with stage-specific details.
|
||||
|
||||
Requirements: 8.3, 11.3
|
||||
Design: Section 4.9 (Broker Adapter), Section 6.1 (PostgreSQL audit_events)
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import logging
|
||||
import uuid
|
||||
from datetime import datetime, timezone
|
||||
from typing import Any
|
||||
|
||||
import asyncpg
|
||||
|
||||
logger = logging.getLogger("audit")
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Event type constants
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
# Event type identifiers written to audit_events.event_type.
# Names follow a "<stage>.<outcome>" pattern.

# Recommendation stage
AUDIT_RECOMMENDATION_GENERATED = "recommendation.generated"
AUDIT_RECOMMENDATION_SUPPRESSED = "recommendation.suppressed"

# Risk evaluation stage
AUDIT_RISK_EVALUATED = "risk.evaluated"
AUDIT_RISK_REJECTED = "risk.rejected"

# Order lifecycle
AUDIT_ORDER_SUBMITTED = "order.submitted"
AUDIT_ORDER_ACCEPTED = "order.accepted"
AUDIT_ORDER_FILLED = "order.filled"
AUDIT_ORDER_REJECTED = "order.rejected"
AUDIT_ORDER_CANCELLED = "order.cancelled"
AUDIT_ORDER_DUPLICATE = "order.duplicate_prevented"

# Position changes
AUDIT_POSITION_OPENED = "position.opened"
AUDIT_POSITION_CLOSED = "position.closed"
AUDIT_POSITION_UPDATED = "position.updated"

# Trading mode changes
AUDIT_TRADING_MODE_CHANGED = "trading.mode_changed"

# Operator approval workflow
AUDIT_APPROVAL_REQUESTED = "approval.requested"
AUDIT_APPROVAL_APPROVED = "approval.approved"
AUDIT_APPROVAL_REJECTED = "approval.rejected"
AUDIT_APPROVAL_EXPIRED = "approval.expired"
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Core audit writer
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
_INSERT_AUDIT_EVENT = """
|
||||
INSERT INTO audit_events (id, event_type, entity_type, entity_id, actor, data, created_at)
|
||||
VALUES ($1::uuid, $2, $3, $4::uuid, $5, $6::jsonb, $7)
|
||||
"""
|
||||
|
||||
|
||||
async def record_audit_event(
    pool: asyncpg.Pool,
    event_type: str,
    entity_type: str,
    entity_id: str,
    data: dict[str, Any],
    actor: str = "system",
    timestamp: datetime | None = None,
) -> str:
    """Insert one row into ``audit_events`` on a best-effort basis.

    Args:
        pool: Connection pool used for the insert.
        event_type: Event identifier stored in ``event_type``.
        entity_type: Kind of entity the event refers to.
        entity_id: Identifier of that entity; the insert casts it to uuid.
        data: Payload, serialized to JSON with ``default=str`` so values the
            encoder does not know are stored as their string form.
        actor: Who caused the event.
        timestamp: Event time; the current UTC time is used when omitted.

    Returns:
        The generated event UUID as a string, or ``""`` when serialization
        or the insert failed. Failures are logged as warnings and never
        raised to the caller.
    """
    new_id = str(uuid.uuid4())
    if timestamp:
        occurred_at = timestamp
    else:
        occurred_at = datetime.now(timezone.utc)

    try:
        await pool.execute(
            _INSERT_AUDIT_EVENT,
            new_id,
            event_type,
            entity_type,
            entity_id,
            actor,
            json.dumps(data, default=str),
            occurred_at,
        )
    except Exception:
        # Auditing must not break the calling workflow.
        logger.warning(
            "Failed to write audit event %s for %s/%s",
            event_type, entity_type, entity_id,
            exc_info=True,
        )
        return ""
    else:
        return new_id
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Convenience helpers for each execution stage
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
async def audit_recommendation_generated(
    pool: asyncpg.Pool,
    recommendation_id: str,
    ticker: str,
    action: str,
    mode: str,
    confidence: float,
    evidence_count: int,
    suppressed: bool = False,
) -> str:
    """Audit a generated recommendation.

    The event type is ``recommendation.suppressed`` when ``suppressed`` is
    true and ``recommendation.generated`` otherwise.

    Returns:
        The audit event id, or ``""`` if the write failed.
    """
    if suppressed:
        kind = AUDIT_RECOMMENDATION_SUPPRESSED
    else:
        kind = AUDIT_RECOMMENDATION_GENERATED

    payload = {
        "ticker": ticker,
        "action": action,
        "mode": mode,
        "confidence": confidence,
        "evidence_count": evidence_count,
        "suppressed": suppressed,
    }
    return await record_audit_event(
        pool,
        event_type=kind,
        entity_type="recommendation",
        entity_id=recommendation_id,
        data=payload,
        actor="recommendation_worker",
    )
|
||||
|
||||
|
||||
async def audit_risk_evaluated(
    pool: asyncpg.Pool,
    evaluation_id: str,
    recommendation_id: str | None,
    ticker: str,
    eligible: bool,
    allowed_mode: str,
    rejection_reasons: list[str],
    check_count: int,
) -> str:
    """Audit the outcome of a risk evaluation.

    The event type is ``risk.evaluated`` when ``eligible`` is true and
    ``risk.rejected`` otherwise.

    Returns:
        The audit event id, or ``""`` if the write failed.
    """
    if eligible:
        kind = AUDIT_RISK_EVALUATED
    else:
        kind = AUDIT_RISK_REJECTED

    payload = {
        "recommendation_id": recommendation_id,
        "ticker": ticker,
        "eligible": eligible,
        "allowed_mode": allowed_mode,
        "rejection_reasons": rejection_reasons,
        "check_count": check_count,
    }
    return await record_audit_event(
        pool,
        event_type=kind,
        entity_type="risk_evaluation",
        entity_id=evaluation_id,
        data=payload,
        actor="risk_engine",
    )
|
||||
|
||||
|
||||
async def audit_order_submitted(
    pool: asyncpg.Pool,
    order_id: str,
    ticker: str,
    side: str,
    quantity: float,
    order_type: str,
    idempotency_key: str,
    recommendation_id: str | None = None,
    evaluation_id: str | None = None,
) -> str:
    """Audit the submission of an order to the broker.

    The payload links the order back to its recommendation and risk
    evaluation when those ids are supplied.

    Returns:
        The audit event id, or ``""`` if the write failed.
    """
    payload = {
        "ticker": ticker,
        "side": side,
        "quantity": quantity,
        "order_type": order_type,
        "idempotency_key": idempotency_key,
        "recommendation_id": recommendation_id,
        "evaluation_id": evaluation_id,
    }
    return await record_audit_event(
        pool,
        event_type=AUDIT_ORDER_SUBMITTED,
        entity_type="order",
        entity_id=order_id,
        data=payload,
        actor="broker_service",
    )
|
||||
|
||||
|
||||
async def audit_order_filled(
    pool: asyncpg.Pool,
    order_id: str,
    ticker: str,
    side: str,
    fill_quantity: float,
    fill_price: float | None,
    broker_order_id: str,
) -> str:
    """Audit a broker fill for an order.

    Returns:
        The audit event id, or ``""`` if the write failed.
    """
    payload = {
        "ticker": ticker,
        "side": side,
        "fill_quantity": fill_quantity,
        "fill_price": fill_price,
        "broker_order_id": broker_order_id,
    }
    return await record_audit_event(
        pool,
        event_type=AUDIT_ORDER_FILLED,
        entity_type="order",
        entity_id=order_id,
        data=payload,
        actor="broker_service",
    )
|
||||
|
||||
|
||||
async def audit_order_rejected(
    pool: asyncpg.Pool,
    order_id: str,
    ticker: str,
    reason: str,
    source: str = "broker",
) -> str:
    """Audit the rejection of an order.

    ``source`` names who rejected the order and is stored in the payload as
    ``rejection_source``.

    Returns:
        The audit event id, or ``""`` if the write failed.
    """
    payload = {
        "ticker": ticker,
        "reason": reason,
        "rejection_source": source,
    }
    return await record_audit_event(
        pool,
        event_type=AUDIT_ORDER_REJECTED,
        entity_type="order",
        entity_id=order_id,
        data=payload,
        actor="broker_service",
    )
|
||||
|
||||
|
||||
async def audit_order_cancelled(
    pool: asyncpg.Pool,
    order_id: str,
    ticker: str,
    broker_order_id: str,
) -> str:
    """Audit the cancellation of an order.

    Returns:
        The audit event id, or ``""`` if the write failed.
    """
    payload = {
        "ticker": ticker,
        "broker_order_id": broker_order_id,
    }
    return await record_audit_event(
        pool,
        event_type=AUDIT_ORDER_CANCELLED,
        entity_type="order",
        entity_id=order_id,
        data=payload,
        actor="broker_service",
    )
|
||||
|
||||
|
||||
async def audit_duplicate_prevented(
    pool: asyncpg.Pool,
    order_id: str,
    ticker: str,
    idempotency_key: str,
    detected_via: str,
) -> str:
    """Audit that a duplicate order submission was blocked.

    ``detected_via`` is stored as given and describes how the duplicate was
    recognized.

    Returns:
        The audit event id, or ``""`` if the write failed.
    """
    payload = {
        "ticker": ticker,
        "idempotency_key": idempotency_key,
        "detected_via": detected_via,
    }
    return await record_audit_event(
        pool,
        event_type=AUDIT_ORDER_DUPLICATE,
        entity_type="order",
        entity_id=order_id,
        data=payload,
        actor="broker_service",
    )
|
||||
|
||||
|
||||
async def audit_position_change(
    pool: asyncpg.Pool,
    order_id: str,
    ticker: str,
    side: str,
    quantity_before: float,
    quantity_after: float,
    avg_entry_before: float,
    avg_entry_after: float,
) -> str:
    """Audit how a fill changed a position.

    Event type selection:
        * ``position.closed`` when the resulting quantity is zero;
        * ``position.opened`` when the quantity went from zero to positive;
        * ``position.updated`` in every other case.

    The event is recorded with entity type ``"position"`` but uses the
    order id as the entity id.

    Returns:
        The audit event id, or ``""`` if the write failed.
    """
    if quantity_after == 0:
        kind = AUDIT_POSITION_CLOSED
    elif quantity_before == 0 and quantity_after > 0:
        kind = AUDIT_POSITION_OPENED
    else:
        kind = AUDIT_POSITION_UPDATED

    payload = {
        "ticker": ticker,
        "side": side,
        "quantity_before": quantity_before,
        "quantity_after": quantity_after,
        "avg_entry_before": avg_entry_before,
        "avg_entry_after": avg_entry_after,
    }
    return await record_audit_event(
        pool,
        event_type=kind,
        entity_type="position",
        entity_id=order_id,
        data=payload,
        actor="broker_service",
    )
|
||||
|
||||
|
||||
async def audit_approval_requested(
    pool: asyncpg.Pool,
    approval_id: str,
    ticker: str,
    side: str,
    quantity: float,
    estimated_value: float,
    recommendation_id: str | None = None,
    expires_at: str | None = None,
) -> str:
    """Audit a request for operator approval of an order.

    Returns:
        The audit event id, or ``""`` if the write failed.
    """
    payload = {
        "ticker": ticker,
        "side": side,
        "quantity": quantity,
        "estimated_value": estimated_value,
        "recommendation_id": recommendation_id,
        "expires_at": expires_at,
    }
    return await record_audit_event(
        pool,
        event_type=AUDIT_APPROVAL_REQUESTED,
        entity_type="approval",
        entity_id=approval_id,
        data=payload,
        actor="broker_service",
    )
|
||||
|
||||
|
||||
async def audit_approval_reviewed(
    pool: asyncpg.Pool,
    approval_id: str,
    ticker: str,
    approved: bool,
    reviewed_by: str = "operator",
    review_note: str = "",
) -> str:
    """Audit an operator's decision on an approval request.

    The event type is ``approval.approved`` or ``approval.rejected``
    depending on ``approved``; ``reviewed_by`` is recorded both in the
    payload and as the event's actor.

    Returns:
        The audit event id, or ``""`` if the write failed.
    """
    if approved:
        kind = AUDIT_APPROVAL_APPROVED
    else:
        kind = AUDIT_APPROVAL_REJECTED

    payload = {
        "ticker": ticker,
        "approved": approved,
        "reviewed_by": reviewed_by,
        "review_note": review_note,
    }
    return await record_audit_event(
        pool,
        event_type=kind,
        entity_type="approval",
        entity_id=approval_id,
        data=payload,
        actor=reviewed_by,
    )
|
||||
|
||||
|
||||
async def audit_approval_expired(
    pool: asyncpg.Pool,
    approval_id: str,
    ticker: str,
) -> str:
    """Audit that an approval request lapsed without a review.

    Returns:
        The audit event id, or ``""`` if the write failed.
    """
    payload = {"ticker": ticker}
    return await record_audit_event(
        pool,
        event_type=AUDIT_APPROVAL_EXPIRED,
        entity_type="approval",
        entity_id=approval_id,
        data=payload,
        actor="system",
    )
|
||||
|
||||
|
||||
async def audit_trading_mode_changed(
    pool: asyncpg.Pool,
    config_id: str,
    old_mode: str,
    new_mode: str,
    actor: str = "operator",
) -> str:
    """Audit a switch of the trading mode.

    The event is attached to the ``risk_config`` entity identified by
    ``config_id``.

    Returns:
        The audit event id, or ``""`` if the write failed.
    """
    payload = {
        "old_mode": old_mode,
        "new_mode": new_mode,
    }
    return await record_audit_event(
        pool,
        event_type=AUDIT_TRADING_MODE_CHANGED,
        entity_type="risk_config",
        entity_id=config_id,
        data=payload,
        actor=actor,
    )
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Query helpers for audit trail retrieval (Requirement 11.3)
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
# Events for one order: rows whose entity_id equals $1, plus rows whose
# payload carries $2 as either "recommendation_id" or "order_id".
# Results are ordered oldest first.
_FETCH_AUDIT_TRAIL_FOR_ORDER = """
SELECT id, event_type, entity_type, entity_id, actor, data, created_at
FROM audit_events
WHERE entity_id = $1::uuid
    OR data->>'recommendation_id' = $2
    OR data->>'order_id' = $2
ORDER BY created_at ASC
"""

# Events for exactly one (entity_type, entity_id) pair, oldest first.
_FETCH_AUDIT_TRAIL_BY_ENTITY = """
SELECT id, event_type, entity_type, entity_id, actor, data, created_at
FROM audit_events
WHERE entity_type = $1 AND entity_id = $2::uuid
ORDER BY created_at ASC
"""

# Events for entity $1 plus every event of any entity that has at least one
# row whose payload references recommendation $2, oldest first.
# NOTE(review): not referenced by the query helpers visible in this section.
_FETCH_FULL_EXECUTION_TRAIL = """
SELECT id, event_type, entity_type, entity_id, actor, data, created_at
FROM audit_events
WHERE entity_id = $1::uuid
    OR entity_id IN (
        SELECT entity_id FROM audit_events
        WHERE data->>'recommendation_id' = $2
    )
ORDER BY created_at ASC
"""
|
||||
|
||||
|
||||
async def get_order_audit_trail(
    pool: asyncpg.Pool,
    order_id: str,
    recommendation_id: str | None = None,
) -> list[dict[str, Any]]:
    """Return the audit events related to an order, oldest first.

    The query matches rows whose entity id is ``order_id`` and rows whose
    payload references a second id as ``recommendation_id`` or ``order_id``.
    That second id is ``recommendation_id`` when given, else ``order_id``.

    NOTE(review): when ``recommendation_id`` is supplied, the payload
    ``order_id`` comparison is made against the recommendation id, and rows
    whose entity id *is* the recommendation are not matched -- confirm this
    is intended.

    Returns:
        One dict per event with string ids, the payload decoded to a dict,
        and ``created_at`` as an ISO-8601 string (``None`` when missing).
    """
    if recommendation_id:
        ref_id = recommendation_id
    else:
        ref_id = order_id

    records = await pool.fetch(_FETCH_AUDIT_TRAIL_FOR_ORDER, order_id, ref_id)

    trail: list[dict[str, Any]] = []
    for record in records:
        payload = record["data"]
        # The driver may hand JSONB back as text; decode it in that case.
        if not isinstance(payload, dict):
            payload = json.loads(payload)
        created = record["created_at"]
        trail.append(
            {
                "id": str(record["id"]),
                "event_type": record["event_type"],
                "entity_type": record["entity_type"],
                "entity_id": str(record["entity_id"]),
                "actor": record["actor"],
                "data": payload,
                "created_at": created.isoformat() if created else None,
            }
        )
    return trail
|
||||
|
||||
|
||||
async def get_entity_audit_trail(
    pool: asyncpg.Pool,
    entity_type: str,
    entity_id: str,
) -> list[dict[str, Any]]:
    """Return every audit event of one entity, oldest first.

    Returns:
        One dict per event with string ids, the payload decoded to a dict,
        and ``created_at`` as an ISO-8601 string (``None`` when missing).
    """
    records = await pool.fetch(_FETCH_AUDIT_TRAIL_BY_ENTITY, entity_type, entity_id)

    trail: list[dict[str, Any]] = []
    for record in records:
        payload = record["data"]
        # The driver may hand JSONB back as text; decode it in that case.
        if not isinstance(payload, dict):
            payload = json.loads(payload)
        created = record["created_at"]
        trail.append(
            {
                "id": str(record["id"]),
                "event_type": record["event_type"],
                "entity_type": record["entity_type"],
                "entity_id": str(record["entity_id"]),
                "actor": record["actor"],
                "data": payload,
                "created_at": created.isoformat() if created else None,
            }
        )
    return trail
|
||||
@@ -43,6 +43,10 @@ class OllamaConfig:
|
||||
base_url: str = "http://localhost:11434"
|
||||
model: str = "llama3.1:8b"
|
||||
timeout: int = 120
|
||||
max_retries: int = 2
|
||||
retry_base_delay: float = 1.0
|
||||
retry_max_delay: float = 10.0
|
||||
retry_backoff_multiplier: float = 2.0
|
||||
|
||||
|
||||
@dataclass
|
||||
@@ -51,16 +55,82 @@ class TrinoConfig:
|
||||
port: int = 8080
|
||||
catalog: str = "lakehouse"
|
||||
schema: str = "stonks"
|
||||
iceberg_catalog: str = "iceberg"
|
||||
|
||||
|
||||
@dataclass
class MarketDataConfig:
    """Connection settings for the market data provider."""

    # API credential; empty by default.
    api_key: str = ""
    # Root URL of the provider's HTTP API.
    base_url: str = "https://api.polygon.io"
    # Provider identifier.
    provider: str = "polygon"
|
||||
|
||||
|
||||
@dataclass
class BrokerConfig:
    """Settings for the broker integration."""

    # Trading mode: "paper" or "live".
    mode: str = "paper"  # paper | live
    # Broker identifier.
    provider: str = "alpaca"
    # Credentials and endpoint; unset by default.
    api_key: Optional[str] = None
    api_secret: Optional[str] = None
    base_url: Optional[str] = None
|
||||
|
||||
|
||||
@dataclass
class RetentionConfig:
    """Default retention settings for stored data.

    The ``*_days`` fields give the default retention period, in days, for
    each bucket class; individual buckets can override them through the
    ``retention_policies`` database table. ``cleanup_interval_hours`` sets
    how often the retention worker runs.
    """

    # Raw ingested data.
    raw_market_days: int = 90
    raw_news_days: int = 180
    raw_filings_days: int = 365
    # Normalized documents.
    normalized_days: int = 180
    # LLM prompts and results.
    llm_prompts_days: int = 365
    llm_results_days: int = 365
    # Lakehouse and audit data.
    lakehouse_days: int = 730
    audit_days: int = 730
    # Worker scheduling and batching.
    cleanup_interval_hours: int = 24
    batch_size: int = 1000
|
||||
|
||||
|
||||
# Map bucket names to RetentionConfig field names.
# Each value is the name of the RetentionConfig attribute that holds the
# default retention period (in days) for that bucket.
BUCKET_RETENTION_FIELDS: dict[str, str] = {
    "stonks-raw-market": "raw_market_days",
    "stonks-raw-news": "raw_news_days",
    "stonks-raw-filings": "raw_filings_days",
    "stonks-normalized": "normalized_days",
    "stonks-llm-prompts": "llm_prompts_days",
    "stonks-llm-results": "llm_results_days",
    "stonks-lakehouse": "lakehouse_days",
    "stonks-audit": "audit_days",
}
|
||||
|
||||
|
||||
@dataclass
class AlertingConfig:
    """Thresholds and windows used by the operational alerting rules.

    Requirements: 12.3
    """

    # Source failures: number of consecutive failed runs that triggers an
    # alert, and how many hours back to look for them.
    source_failure_threshold: int = 3
    source_failure_window_hours: int = 6

    # Schema/extraction failure spike: failure rate (0.3 = 30%) that triggers
    # an alert, measured over the given number of hours.
    schema_failure_rate_threshold: float = 0.3
    schema_failure_window_hours: int = 1

    # Analytical (lake publication) lag: maximum minutes since the last
    # successful publish.
    lake_lag_threshold_minutes: int = 60

    # Broker issues: number of broker errors that triggers an alert, and the
    # lookback window in hours.
    broker_error_threshold: int = 3
    broker_error_window_hours: int = 1

    # Seconds between alert evaluation passes.
    check_interval_seconds: int = 120
|
||||
|
||||
|
||||
@dataclass
|
||||
class AppConfig:
|
||||
postgres: PostgresConfig = field(default_factory=PostgresConfig)
|
||||
@@ -68,8 +138,12 @@ class AppConfig:
|
||||
minio: MinioConfig = field(default_factory=MinioConfig)
|
||||
ollama: OllamaConfig = field(default_factory=OllamaConfig)
|
||||
trino: TrinoConfig = field(default_factory=TrinoConfig)
|
||||
market_data: MarketDataConfig = field(default_factory=MarketDataConfig)
|
||||
broker: BrokerConfig = field(default_factory=BrokerConfig)
|
||||
retention: RetentionConfig = field(default_factory=RetentionConfig)
|
||||
alerting: AlertingConfig = field(default_factory=AlertingConfig)
|
||||
log_level: str = "INFO"
|
||||
json_logs: bool = True
|
||||
|
||||
|
||||
def load_config() -> AppConfig:
|
||||
@@ -98,18 +172,52 @@ def load_config() -> AppConfig:
|
||||
base_url=os.getenv("OLLAMA_BASE_URL", "http://localhost:11434"),
|
||||
model=os.getenv("OLLAMA_MODEL", "llama3.1:8b"),
|
||||
timeout=int(os.getenv("OLLAMA_TIMEOUT", "120")),
|
||||
max_retries=int(os.getenv("OLLAMA_MAX_RETRIES", "2")),
|
||||
retry_base_delay=float(os.getenv("OLLAMA_RETRY_BASE_DELAY", "1.0")),
|
||||
retry_max_delay=float(os.getenv("OLLAMA_RETRY_MAX_DELAY", "10.0")),
|
||||
retry_backoff_multiplier=float(os.getenv("OLLAMA_RETRY_BACKOFF_MULTIPLIER", "2.0")),
|
||||
),
|
||||
trino=TrinoConfig(
|
||||
host=os.getenv("TRINO_HOST", "localhost"),
|
||||
port=int(os.getenv("TRINO_PORT", "8080")),
|
||||
catalog=os.getenv("TRINO_CATALOG", "lakehouse"),
|
||||
schema=os.getenv("TRINO_SCHEMA", "stonks"),
|
||||
iceberg_catalog=os.getenv("TRINO_ICEBERG_CATALOG", "iceberg"),
|
||||
),
|
||||
market_data=MarketDataConfig(
|
||||
api_key=os.getenv("MARKET_DATA_API_KEY", ""),
|
||||
base_url=os.getenv("MARKET_DATA_BASE_URL", "https://api.polygon.io"),
|
||||
provider=os.getenv("MARKET_DATA_PROVIDER", "polygon"),
|
||||
),
|
||||
broker=BrokerConfig(
|
||||
mode=os.getenv("BROKER_MODE", "paper"),
|
||||
provider=os.getenv("BROKER_PROVIDER", "alpaca"),
|
||||
api_key=os.getenv("BROKER_API_KEY", None),
|
||||
api_secret=os.getenv("BROKER_API_SECRET", None),
|
||||
base_url=os.getenv("BROKER_BASE_URL", None),
|
||||
),
|
||||
retention=RetentionConfig(
|
||||
raw_market_days=int(os.getenv("RETENTION_RAW_MARKET_DAYS", "90")),
|
||||
raw_news_days=int(os.getenv("RETENTION_RAW_NEWS_DAYS", "180")),
|
||||
raw_filings_days=int(os.getenv("RETENTION_RAW_FILINGS_DAYS", "365")),
|
||||
normalized_days=int(os.getenv("RETENTION_NORMALIZED_DAYS", "180")),
|
||||
llm_prompts_days=int(os.getenv("RETENTION_LLM_PROMPTS_DAYS", "365")),
|
||||
llm_results_days=int(os.getenv("RETENTION_LLM_RESULTS_DAYS", "365")),
|
||||
lakehouse_days=int(os.getenv("RETENTION_LAKEHOUSE_DAYS", "730")),
|
||||
audit_days=int(os.getenv("RETENTION_AUDIT_DAYS", "730")),
|
||||
cleanup_interval_hours=int(os.getenv("RETENTION_CLEANUP_INTERVAL_HOURS", "24")),
|
||||
batch_size=int(os.getenv("RETENTION_BATCH_SIZE", "1000")),
|
||||
),
|
||||
alerting=AlertingConfig(
|
||||
source_failure_threshold=int(os.getenv("ALERT_SOURCE_FAILURE_THRESHOLD", "3")),
|
||||
source_failure_window_hours=int(os.getenv("ALERT_SOURCE_FAILURE_WINDOW_HOURS", "6")),
|
||||
schema_failure_rate_threshold=float(os.getenv("ALERT_SCHEMA_FAILURE_RATE_THRESHOLD", "0.3")),
|
||||
schema_failure_window_hours=int(os.getenv("ALERT_SCHEMA_FAILURE_WINDOW_HOURS", "1")),
|
||||
lake_lag_threshold_minutes=int(os.getenv("ALERT_LAKE_LAG_THRESHOLD_MINUTES", "60")),
|
||||
broker_error_threshold=int(os.getenv("ALERT_BROKER_ERROR_THRESHOLD", "3")),
|
||||
broker_error_window_hours=int(os.getenv("ALERT_BROKER_ERROR_WINDOW_HOURS", "1")),
|
||||
check_interval_seconds=int(os.getenv("ALERT_CHECK_INTERVAL_SECONDS", "120")),
|
||||
),
|
||||
log_level=os.getenv("LOG_LEVEL", "INFO"),
|
||||
json_logs=os.getenv("JSON_LOGS", "true").lower() == "true",
|
||||
)
|
||||
|
||||
@@ -0,0 +1,43 @@
|
||||
"""Canonical URL normalization and content hashing utilities.
|
||||
|
||||
Provides consistent URL canonicalization and SHA-256 content hashing
|
||||
across all ingestion adapters and pipeline stages.
|
||||
|
||||
Requirements: 3.2, 3.3
|
||||
"""
|
||||
import hashlib
|
||||
from urllib.parse import parse_qsl, urlencode, urlparse
|
||||
|
||||
|
||||
def normalize_url(url: str) -> str:
    """Return a canonical form of *url* suitable for equality comparison.

    Normalization steps:

    - Lowercases scheme and host
    - Defaults the scheme to ``https`` when missing
    - Drops the fragment and any userinfo
    - Strips trailing slashes from the path (the root path stays "/")
    - Drops the port only when it is the default for the URL's own scheme
      (80 for http, 443 for https); any other port is preserved
    - Re-wraps IPv6 literal hosts in brackets
    - Sorts query parameters (blank-valued parameters are dropped by
      ``parse_qsl``)

    Args:
        url: The URL to canonicalize.

    Returns:
        The normalized URL string.

    Raises:
        ValueError: Propagated from ``urlparse(...).port`` when the port
            component is not a valid integer.
    """
    parsed = urlparse(url)
    scheme = (parsed.scheme or "https").lower()

    host = (parsed.hostname or "").lower()
    # ``hostname`` strips the brackets from IPv6 literals; restore them so the
    # host/port boundary stays unambiguous in the rebuilt URL.
    if ":" in host:
        host = f"[{host}]"

    # Only the scheme's own default port is redundant: https on port 80 is a
    # different endpoint from https on 443 and must not collapse onto it.
    default_port = {"http": 80, "https": 443}.get(scheme)
    port = parsed.port
    netloc = f"{host}:{port}" if port and port != default_port else host

    path = parsed.path.rstrip("/") or "/"
    # Sort query params for deterministic ordering
    query = urlencode(sorted(parse_qsl(parsed.query)))

    normalized = f"{scheme}://{netloc}{path}"
    return f"{normalized}?{query}" if query else normalized
|
||||
|
||||
|
||||
def content_hash(data: bytes) -> str:
    """Return the SHA-256 digest of *data* as a hexadecimal string."""
    digest = hashlib.sha256()
    digest.update(data)
    return digest.hexdigest()
|
||||
|
||||
|
||||
def content_hash_str(text: str, encoding: str = "utf-8") -> str:
    """Return the SHA-256 hex digest of *text* after encoding it with *encoding*."""
    encoded = text.encode(encoding)
    hasher = hashlib.sha256(encoded)
    return hasher.hexdigest()
|
||||
@@ -0,0 +1,134 @@
|
||||
"""Dead-letter queue (DLQ) support and replay tooling.
|
||||
|
||||
When a worker fails to process a job after exhausting retries, the job
|
||||
is pushed to a per-queue dead-letter list in Redis. Each DLQ entry
|
||||
wraps the original payload with failure metadata (error message,
|
||||
timestamp, attempt count) so operators can inspect and replay later.
|
||||
|
||||
Replay moves items from the DLQ back to the source queue for
|
||||
reprocessing.
|
||||
|
||||
Requirements: 12.1 (observability), design section 8 (data flows)
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import logging
|
||||
from datetime import datetime, timezone
|
||||
from typing import Any
|
||||
|
||||
import redis.asyncio as aioredis
|
||||
|
||||
from services.shared.redis_keys import dlq_key, queue_key
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Default max attempts before a job is dead-lettered
|
||||
DEFAULT_MAX_ATTEMPTS = 3
|
||||
|
||||
|
||||
def _now_iso() -> str:
    """Return the current UTC time as an ISO-8601 string carrying its offset."""
    current = datetime.now(tz=timezone.utc)
    return current.isoformat()
|
||||
|
||||
|
||||
def wrap_dlq_entry(
    original_payload: dict[str, Any],
    queue_name: str,
    error: str,
    attempt: int = 1,
    worker: str = "",
) -> dict[str, Any]:
    """Build the dead-letter envelope for a failed job.

    The envelope carries the untouched job payload plus the queue name, the
    error text, the attempt number, the worker name and the time at which
    the envelope was created.
    """
    envelope: dict[str, Any] = {"original_payload": original_payload}
    envelope["queue"] = queue_name
    envelope["error"] = error
    envelope["attempt"] = attempt
    envelope["worker"] = worker
    # Stamped here, i.e. when the job is wrapped, not when it originally failed.
    envelope["dead_lettered_at"] = _now_iso()
    return envelope
|
||||
|
||||
|
||||
async def send_to_dlq(
    rds: aioredis.Redis,
    queue_name: str,
    original_payload: dict[str, Any],
    error: str,
    attempt: int = 1,
    worker: str = "",
) -> None:
    """Append a failed job to the dead-letter list of *queue_name*.

    The payload is wrapped with failure metadata, serialized to JSON
    (values that JSON cannot encode are stringified) and pushed to the tail
    of the DLQ list; a warning is logged afterwards.
    """
    envelope = wrap_dlq_entry(original_payload, queue_name, error, attempt, worker)
    target_key = dlq_key(queue_name)
    serialized = json.dumps(envelope, default=str)
    await rds.rpush(target_key, serialized)

    log_context = {"queue": queue_name, "attempt": attempt}
    logger.warning(
        "Dead-lettered job on %s after %d attempts: %s",
        queue_name,
        attempt,
        error,
        extra=log_context,
    )
|
||||
|
||||
|
||||
async def dlq_length(rds: aioredis.Redis, queue_name: str) -> int:
    """Return how many entries the DLQ list for *queue_name* currently holds."""
    key = dlq_key(queue_name)
    depth = await rds.llen(key)
    return depth
|
||||
|
||||
|
||||
async def peek_dlq(
    rds: aioredis.Redis,
    queue_name: str,
    start: int = 0,
    count: int = 10,
) -> list[dict[str, Any]]:
    """Return up to *count* DLQ entries starting at *start* without removing them.

    Args:
        rds: Redis client.
        queue_name: Queue whose dead-letter list is inspected.
        start: Index of the first entry; negative values count from the tail,
            as with Redis ``LRANGE``.
        count: Maximum number of entries to return. Values <= 0 return an
            empty list.

    Returns:
        The decoded DLQ entries, oldest first within the requested window.
    """
    # LRANGE's stop index is inclusive, so count=0 at start=0 would become
    # "LRANGE 0 -1" and return the whole list. Short-circuit instead.
    if count <= 0:
        return []

    stop = start + count - 1
    # A tail-relative window that runs past the last element must stop at the
    # tail (-1), not wrap around to a head-relative index.
    if start < 0 <= stop:
        stop = -1

    raw_items = await rds.lrange(dlq_key(queue_name), start, stop)
    return [json.loads(item) for item in raw_items]
|
||||
|
||||
|
||||
async def replay_one(rds: aioredis.Redis, queue_name: str) -> dict[str, Any] | None:
    """Pop the oldest DLQ entry and re-enqueue its original payload.

    If decoding the entry or pushing it to the source queue fails, the raw
    entry is put back at the head of the DLQ before the error is re-raised,
    so a failed replay does not lose the job.

    Returns:
        The replayed DLQ entry, or None if the DLQ is empty.
    """
    source_key = dlq_key(queue_name)
    raw = await rds.lpop(source_key)
    if raw is None:
        return None

    try:
        entry = json.loads(raw)
        # Entries lacking the envelope key are treated as bare payloads.
        original = entry.get("original_payload", entry)
        await rds.rpush(queue_key(queue_name), json.dumps(original, default=str))
    except Exception:
        # The entry has already been popped; restore it at its old position.
        await rds.lpush(source_key, raw)
        raise

    logger.info("Replayed 1 job from DLQ back to %s", queue_name)
    return entry
|
||||
|
||||
|
||||
async def replay_all(rds: aioredis.Redis, queue_name: str) -> int:
    """Replay the DLQ entries present at call time back to the source queue.

    The number of entries is sampled once up front and at most that many
    are moved, so the loop terminates even if jobs are dead-lettered again
    while the replay is running. If decoding or re-enqueueing an entry
    fails, that entry is restored to the head of the DLQ and the error is
    re-raised; entries replayed before the failure stay replayed.

    Returns:
        The number of items replayed.
    """
    source_key = dlq_key(queue_name)
    target_key = queue_key(queue_name)
    pending = await rds.llen(source_key)

    count = 0
    for _ in range(pending):
        raw = await rds.lpop(source_key)
        if raw is None:
            # Another consumer drained the list in the meantime.
            break
        try:
            entry = json.loads(raw)
            # Entries lacking the envelope key are treated as bare payloads.
            original = entry.get("original_payload", entry)
            await rds.rpush(target_key, json.dumps(original, default=str))
        except Exception:
            # Put the popped entry back so it is not lost, then report.
            await rds.lpush(source_key, raw)
            if count:
                logger.info("Replayed %d jobs from DLQ back to %s", count, queue_name)
            raise
        count += 1

    if count:
        logger.info("Replayed %d jobs from DLQ back to %s", count, queue_name)
    return count
|
||||
|
||||
|
||||
async def purge_dlq(rds: aioredis.Redis, queue_name: str) -> int:
    """Drop the whole DLQ list for *queue_name* and return its prior length.

    The length is read first and the key is deleted only when the list is
    non-empty.
    """
    key = dlq_key(queue_name)
    pending = await rds.llen(key)
    if not pending:
        return pending
    await rds.delete(key)
    return pending
|
||||
|
||||
|
||||
async def dlq_summary(rds: aioredis.Redis, queue_names: list[str]) -> dict[str, int]:
    """Map each name in *queue_names* to the current depth of its DLQ list.

    Depths are fetched one queue at a time, in the order given.
    """
    return {name: await rds.llen(dlq_key(name)) for name in queue_names}
|
||||
@@ -0,0 +1,198 @@
|
||||
"""Cross-source deduplication for articles and filings.
|
||||
|
||||
Detects duplicate documents across different source types (news_api,
|
||||
filings_api, web_scrape) using a layered approach:
|
||||
|
||||
1. Redis fast-path: check content_hash and canonical_url markers for
|
||||
recently-seen documents (TTL-bounded, cheap).
|
||||
2. PostgreSQL fallback: query the documents table by canonical_url or
|
||||
content_hash for durable cross-source matching.
|
||||
|
||||
When a duplicate is detected the caller receives the existing document_id
|
||||
so it can link additional company mentions without re-inserting the document.
|
||||
|
||||
Requirements: 3.2, 3.3
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
from dataclasses import dataclass
|
||||
from typing import Any
|
||||
|
||||
import asyncpg
|
||||
import redis.asyncio as aioredis
|
||||
|
||||
from services.shared.content import content_hash_str, normalize_url
|
||||
from services.shared.redis_keys import DEDUPE_PREFIX
|
||||
|
||||
logger = logging.getLogger("dedupe")
|
||||
|
||||
# Redis TTL for dedupe markers (24 hours)
|
||||
DEDUPE_TTL_SECONDS: int = 86400
|
||||
|
||||
|
||||
def _url_dedupe_key(canonical_url: str) -> str:
    """Return the Redis marker key for a canonical URL.

    The URL is hashed so the key has a fixed length regardless of URL size.
    """
    url_digest = content_hash_str(canonical_url)
    return f"{DEDUPE_PREFIX}:url:{url_digest}"
|
||||
|
||||
|
||||
def _hash_dedupe_key(content_hash: str) -> str:
    """Return the Redis marker key for a document content hash."""
    return "{}:{}".format(DEDUPE_PREFIX, content_hash)
|
||||
|
||||
|
||||
@dataclass
class DedupeResult:
    """Outcome of one duplicate lookup."""

    # True when a matching document was found.
    is_duplicate: bool
    # Id of the matching document; None when nothing matched.
    existing_document_id: str | None = None
    # Which lookup matched: "content_hash" | "canonical_url" | None
    match_type: str | None = None
|
||||
|
||||
|
||||
def _cached_document_id(cached: Any) -> str:
    """Convert a Redis reply holding a document id to ``str``.

    Redis clients that are not configured to decode responses return
    ``bytes``; calling ``str()`` on those would yield ``"b'...'"`` instead
    of the stored id.
    """
    if isinstance(cached, (bytes, bytearray)):
        return cached.decode("utf-8")
    return str(cached)


async def check_duplicate(
    pool: asyncpg.Pool,
    rds: aioredis.Redis,
    *,
    content_hash: str,
    url: str | None = None,
    canonical_url: str | None = None,
) -> DedupeResult:
    """Check whether a document is already known, by content hash or URL.

    Lookups run in this order and stop at the first match:

    1. Redis content_hash marker
    2. Redis canonical_url marker
    3. PostgreSQL ``documents.content_hash``
    4. PostgreSQL ``documents.canonical_url``

    A PostgreSQL match also refreshes the Redis markers so later checks can
    be answered from Redis.

    Args:
        pool: PostgreSQL connection pool.
        rds: Redis client.
        content_hash: Content hash of the document; falsy values skip the
            hash lookups.
        url: Raw URL, normalized here when *canonical_url* is not given.
        canonical_url: Already-normalized URL; takes precedence over *url*.

    Returns:
        A DedupeResult describing the match, or ``is_duplicate=False``.
    """
    # Resolve canonical URL if only raw URL provided
    resolved_canonical = canonical_url or (normalize_url(url) if url else None)

    # --- Redis fast path: content hash ---
    if content_hash:
        cached_id = await rds.get(_hash_dedupe_key(content_hash))
        if cached_id:
            logger.debug("Dedupe hit (redis content_hash) for %s", content_hash[:16])
            return DedupeResult(
                is_duplicate=True,
                existing_document_id=_cached_document_id(cached_id),
                match_type="content_hash",
            )

    # --- Redis fast path: canonical URL ---
    if resolved_canonical:
        cached_id = await rds.get(_url_dedupe_key(resolved_canonical))
        if cached_id:
            logger.debug("Dedupe hit (redis canonical_url) for %s", resolved_canonical[:60])
            return DedupeResult(
                is_duplicate=True,
                existing_document_id=_cached_document_id(cached_id),
                match_type="canonical_url",
            )

    # --- PostgreSQL fallback: content hash ---
    if content_hash:
        row = await pool.fetchrow(
            "SELECT id FROM documents WHERE content_hash = $1 LIMIT 1",
            content_hash,
        )
        if row:
            doc_id = str(row["id"])
            # Warm the Redis cache for future checks
            await _set_dedupe_markers(rds, content_hash, resolved_canonical, doc_id)
            logger.debug("Dedupe hit (pg content_hash) for %s", content_hash[:16])
            return DedupeResult(
                is_duplicate=True,
                existing_document_id=doc_id,
                match_type="content_hash",
            )

    # --- PostgreSQL fallback: canonical URL ---
    if resolved_canonical:
        row = await pool.fetchrow(
            "SELECT id FROM documents WHERE canonical_url = $1 LIMIT 1",
            resolved_canonical,
        )
        if row:
            doc_id = str(row["id"])
            await _set_dedupe_markers(rds, content_hash, resolved_canonical, doc_id)
            logger.debug("Dedupe hit (pg canonical_url) for %s", resolved_canonical[:60])
            return DedupeResult(
                is_duplicate=True,
                existing_document_id=doc_id,
                match_type="canonical_url",
            )

    return DedupeResult(is_duplicate=False)
|
||||
|
||||
|
||||
async def mark_as_seen(
    rds: aioredis.Redis,
    *,
    content_hash: str,
    canonical_url: str | None,
    document_id: str,
) -> None:
    """Record a freshly stored document in Redis so later checks hit the fast path.

    Public, keyword-only entry point over ``_set_dedupe_markers``.
    """
    await _set_dedupe_markers(
        rds,
        content_hash=content_hash,
        canonical_url=canonical_url,
        document_id=document_id,
    )
|
||||
|
||||
|
||||
async def _set_dedupe_markers(
    rds: aioredis.Redis,
    content_hash: str | None,
    canonical_url: str | None,
    document_id: str,
) -> None:
    """Write the Redis dedupe markers that point at *document_id*.

    One marker is written per identifier that is present (content hash,
    canonical URL); each expires after ``DEDUPE_TTL_SECONDS``.
    """
    marker_keys: list[str] = []
    if content_hash:
        marker_keys.append(_hash_dedupe_key(content_hash))
    if canonical_url:
        marker_keys.append(_url_dedupe_key(canonical_url))

    for marker_key in marker_keys:
        await rds.set(marker_key, document_id, ex=DEDUPE_TTL_SECONDS)
|
||||
|
||||
|
||||
async def dedupe_items(
    pool: asyncpg.Pool,
    rds: aioredis.Redis,
    items: list[dict[str, Any]],
) -> tuple[list[dict[str, Any]], list[dict[str, Any]]]:
    """Split ingestion items into those not yet stored and those already known.

    For each item the ``content_hash``, ``url`` (or ``link``) and
    ``canonical_url`` keys are read and passed to ``check_duplicate``.
    Duplicate items are annotated in place with ``_dedupe_match_type`` and
    ``_dedupe_existing_id``.

    Returns (new_items, duplicate_items), each in input order.
    """
    fresh: list[dict[str, Any]] = []
    duplicates: list[dict[str, Any]] = []

    for item in items:
        outcome = await check_duplicate(
            pool,
            rds,
            content_hash=item.get("content_hash", ""),
            url=item.get("url") or item.get("link"),
            canonical_url=item.get("canonical_url"),
        )
        if not outcome.is_duplicate:
            fresh.append(item)
            continue

        # Mutates the caller's dict so downstream code can see what matched.
        item["_dedupe_match_type"] = outcome.match_type
        item["_dedupe_existing_id"] = outcome.existing_document_id
        duplicates.append(item)

    return fresh, duplicates
|
||||
@@ -0,0 +1,224 @@
|
||||
"""Structured logging and distributed tracing for all Stonks Oracle services.
|
||||
|
||||
Provides:
|
||||
- JSON-formatted structured log output for machine-parseable log aggregation
|
||||
- Trace context (trace_id, span_id, service) propagated through log records
|
||||
- Context manager for creating trace spans within a service
|
||||
- Helper to configure logging for any service worker or API
|
||||
|
||||
Requirements: 12.1
|
||||
Design: Section 12 (Observability and Operations)
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import logging
|
||||
import time
|
||||
import uuid
|
||||
from contextvars import ContextVar
|
||||
from datetime import datetime, timezone
|
||||
from typing import Any
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Trace context stored in contextvars for async-safe propagation
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
_trace_id: ContextVar[str] = ContextVar("trace_id", default="")
|
||||
_span_id: ContextVar[str] = ContextVar("span_id", default="")
|
||||
_service_name: ContextVar[str] = ContextVar("service_name", default="unknown")
|
||||
|
||||
|
||||
def get_trace_id() -> str:
    """Return the trace id bound to the current context."""
    current = _trace_id.get()
    return current
|
||||
|
||||
|
||||
def get_span_id() -> str:
    """Return the span id bound to the current context."""
    current = _span_id.get()
    return current
|
||||
|
||||
|
||||
def get_service_name() -> str:
    """Return the service name bound to the current context."""
    current = _service_name.get()
    return current
|
||||
|
||||
|
||||
def set_trace_context(
    trace_id: str | None = None,
    span_id: str | None = None,
    service: str | None = None,
) -> None:
    """Update the trace context variables of the current context.

    Only arguments that are not None are applied; the others keep their
    current value.
    """
    updates = (
        (_trace_id, trace_id),
        (_span_id, span_id),
        (_service_name, service),
    )
    for variable, value in updates:
        if value is not None:
            variable.set(value)
|
||||
|
||||
|
||||
def new_trace_id() -> str:
    """Return a fresh 16-character hex trace id taken from a random UUID."""
    token = uuid.uuid4().hex
    return token[:16]
|
||||
|
||||
|
||||
def new_span_id() -> str:
    """Return a fresh 8-character hex span id taken from a random UUID."""
    token = uuid.uuid4().hex
    return token[:8]
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Span context manager for tracing within a service
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
class Span:
    """Minimal tracing span implemented as a synchronous context manager.

    Example::

        with Span("process_document", ticker="AAPL") as span:
            # ... do work ...
            span.set_attribute("doc_count", 5)

    Entering the span makes its ids the current trace context; leaving it
    logs a ``span.end`` event with the duration, status and attributes and
    then restores the previous context.
    """

    def __init__(self, operation: str, **attributes: Any) -> None:
        self.operation = operation
        # Whatever span is current at construction time becomes the parent.
        self.parent_span_id = get_span_id()
        self.span_id = new_span_id()
        # Join the ambient trace if there is one, otherwise start a new trace.
        self.trace_id = get_trace_id() or new_trace_id()
        self.attributes: dict[str, Any] = {**attributes}
        self.start_time: float = 0.0
        self.duration_ms: float = 0.0
        # ContextVar tokens captured in __enter__ and used for the reset.
        self._token_trace: Any = None
        self._token_span: Any = None
        self._logger = logging.getLogger(get_service_name() or "tracing")

    def set_attribute(self, key: str, value: Any) -> None:
        """Attach or overwrite one attribute reported when the span ends."""
        self.attributes[key] = value

    def __enter__(self) -> Span:
        self.start_time = time.monotonic()
        self._token_trace = _trace_id.set(self.trace_id)
        self._token_span = _span_id.set(self.span_id)
        return self

    def __exit__(self, exc_type: Any, exc_val: Any, exc_tb: Any) -> None:
        elapsed_seconds = time.monotonic() - self.start_time
        self.duration_ms = elapsed_seconds * 1000
        status = "ok" if not exc_type else "error"

        # Logged before the context is restored, so the record still sees
        # this span's ids as the current trace context.
        span_fields = {
            "span_operation": self.operation,
            "span_status": status,
            "span_duration_ms": round(self.duration_ms, 2),
            "span_parent_id": self.parent_span_id,
            "span_attributes": self.attributes,
        }
        self._logger.info("span.end", extra=span_fields)

        # Restore parent span context
        if self._token_span is not None:
            _span_id.reset(self._token_span)
        if self._token_trace is not None:
            _trace_id.reset(self._token_trace)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# JSON log formatter
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
class JSONFormatter(logging.Formatter):
    """Emit each log record as a single JSON line with trace context.

    Besides the fixed fields (timestamp, level, logger, message, service,
    trace_id, span_id), a whitelist of optional record attributes is copied
    through when present and not None. Those are attributes attached by
    ``Span`` or by callers via ``extra={...}``.
    """

    # Optional record attributes copied into the output. "queue" and
    # "attempt" are included because the dead-letter helper logs them via
    # ``extra``; without an entry here they would be silently dropped.
    _EXTRA_KEYS = (
        "span_operation", "span_status", "span_duration_ms",
        "span_parent_id", "span_attributes",
        "ticker", "document_id", "source_type", "job_id",
        "duration_ms", "error", "count",
        "queue", "attempt",
    )

    def format(self, record: logging.LogRecord) -> str:
        """Serialize *record* to one JSON object string."""
        log_entry: dict[str, Any] = {
            "timestamp": datetime.fromtimestamp(record.created, tz=timezone.utc).isoformat(),
            "level": record.levelname,
            "logger": record.name,
            "message": record.getMessage(),
            "service": get_service_name(),
            "trace_id": get_trace_id(),
            "span_id": get_span_id(),
        }

        # Merge extra fields from Span or manual extra={} usage
        for key in self._EXTRA_KEYS:
            val = getattr(record, key, None)
            if val is not None:
                log_entry[key] = val

        if record.exc_info and record.exc_info[1]:
            log_entry["exception"] = self.formatException(record.exc_info)

        # default=str keeps serialization from failing on values such as
        # datetimes inside span attributes.
        return json.dumps(log_entry, default=str)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Setup helper
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
class _TraceContextFilter(logging.Filter):
    """Attach service / trace_id / span_id to records for %-style formats."""

    def __init__(self, service_name: str) -> None:
        super().__init__()
        self._service = service_name

    def filter(self, record: logging.LogRecord) -> bool:
        # Values supplied explicitly through ``extra`` are left untouched.
        if not hasattr(record, "service"):
            record.service = self._service
        if not hasattr(record, "trace_id"):
            record.trace_id = get_trace_id()
        if not hasattr(record, "span_id"):
            record.span_id = get_span_id()
        return True


def setup_logging(
    service_name: str,
    level: str = "INFO",
    json_output: bool = True,
) -> None:
    """Configure structured logging for a service.

    Call this once at service startup (before any log calls). Existing
    handlers on the root logger are removed and replaced by one stream
    handler.

    Args:
        service_name: Identifies this service in log output (e.g. "ingestion_worker").
        level: Log level string (DEBUG, INFO, WARNING, ERROR). Unknown names
            fall back to INFO.
        json_output: If True, emit JSON lines. If False, use a human-readable format.
    """
    _service_name.set(service_name)

    root = logging.getLogger()
    resolved_level = getattr(logging, level.upper(), logging.INFO)
    # Guard against names that exist on the module but are not levels
    # (e.g. "BASIC_FORMAT"), which setLevel would reject.
    if not isinstance(resolved_level, int):
        resolved_level = logging.INFO
    root.setLevel(resolved_level)

    # Remove existing handlers to avoid duplicate output
    root.handlers.clear()

    handler = logging.StreamHandler()
    if json_output:
        handler.setFormatter(JSONFormatter())
    else:
        # The text format references service/trace_id/span_id, which plain
        # LogRecords do not carry. A filter fills them in from the current
        # trace context; this also avoids Formatter(defaults=...), which
        # needs Python 3.10+ and would only ever print empty trace ids.
        handler.addFilter(_TraceContextFilter(service_name))
        handler.setFormatter(logging.Formatter(
            "%(asctime)s [%(levelname)s] %(name)s (%(service)s) "
            "trace=%(trace_id)s span=%(span_id)s — %(message)s",
        ))
    root.addHandler(handler)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Trace context propagation through job payloads
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def inject_trace_context(payload: dict[str, Any]) -> dict[str, Any]:
    """Stamp the current trace id onto a job payload and return the payload.

    Intended to be called right before a job is enqueued so the consuming
    worker can continue the same trace. The dict is modified in place under
    the ``_trace_id`` key; when no trace id is set it is returned unchanged.
    """
    current_trace = get_trace_id()
    if not current_trace:
        return payload
    payload["_trace_id"] = current_trace
    return payload
|
||||
|
||||
|
||||
def extract_trace_context(payload: dict[str, Any]) -> None:
    """Adopt the trace carried by an incoming job payload.

    Intended to be called at the start of job processing. The payload's
    ``_trace_id`` is reused when present and non-empty, otherwise a new
    trace id is generated. A new span id is always assigned.
    """
    incoming = payload.get("_trace_id")
    trace_id = incoming if incoming else new_trace_id()
    set_trace_context(trace_id=trace_id, span_id=new_span_id())
|
||||
@@ -0,0 +1,696 @@
|
||||
"""Metadata persistence for market payloads, documents, and broker events.
|
||||
|
||||
Persists structured metadata records to PostgreSQL for all ingested artifacts.
|
||||
Each source type has its own persistence path:
|
||||
- market_api → market_snapshots table
|
||||
- news_api / filings_api / web_scrape → documents + document_company_mentions
|
||||
- broker → order_events or market_snapshots (for position/account snapshots)
|
||||
|
||||
Requirements: 3.3, 3.4, 8.3, 9.2
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import logging
|
||||
from datetime import datetime, timedelta, timezone
|
||||
from typing import Any
|
||||
|
||||
import asyncpg
|
||||
|
||||
from services.shared.content import content_hash_str, normalize_url
|
||||
|
||||
logger = logging.getLogger("metadata")
|
||||
|
||||
|
||||
async def persist_market_snapshot(
    pool: asyncpg.Pool,
    *,
    company_id: str | None,
    ticker: str,
    snapshot_type: str,
    data: dict[str, Any],
    source_provider: str,
    storage_ref: str,
    content_hash: str,
    captured_at: datetime | None = None,
) -> str:
    """Insert one row into ``market_snapshots`` and return its id as a string.

    *data* is serialized to JSON for the jsonb column. When *captured_at*
    is omitted the current UTC time is used.
    """
    captured = captured_at if captured_at else datetime.now(timezone.utc)
    insert_sql = """INSERT INTO market_snapshots
        (company_id, ticker, snapshot_type, data, source_provider,
        captured_at, storage_ref, content_hash)
        VALUES ($1, $2, $3, $4::jsonb, $5, $6, $7, $8)
        RETURNING id"""
    params = (
        company_id,
        ticker,
        snapshot_type,
        json.dumps(data),
        source_provider,
        captured,
        storage_ref,
        content_hash,
    )
    row_id = await pool.fetchval(insert_sql, *params)
    logger.debug("Persisted market snapshot %s for %s", row_id, ticker)
    return str(row_id)
|
||||
|
||||
|
||||
async def persist_document(
    pool: asyncpg.Pool,
    *,
    document_type: str,
    source_type: str,
    publisher: str,
    url: str | None,
    canonical_url: str | None,
    title: str,
    published_at: datetime | None,
    content_hash: str,
    storage_ref: str,
    language: str = "en",
) -> str | None:
    """Insert a ``documents`` row unless its content hash is already stored.

    New rows are created with status ``'ingested'``.

    Returns:
        The new row id as a string, or None when a row with the same
        content_hash already exists.
    """
    # NOTE(review): the existence check and the insert are two separate
    # statements, so two concurrent callers could both pass the check.
    # Confirm whether the table enforces uniqueness on content_hash.
    already_stored = await pool.fetchval(
        "SELECT 1 FROM documents WHERE content_hash = $1", content_hash
    )
    if already_stored:
        return None

    insert_sql = """INSERT INTO documents
        (document_type, source_type, publisher, url, canonical_url,
        title, published_at, content_hash, raw_storage_ref,
        language, status)
        VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, 'ingested')
        RETURNING id"""
    values = (
        document_type,
        source_type,
        publisher,
        url,
        canonical_url,
        title,
        published_at,
        content_hash,
        storage_ref,
        language,
    )
    doc_id = await pool.fetchval(insert_sql, *values)
    logger.debug("Persisted document %s (%s)", doc_id, title[:60] if title else "")
    return str(doc_id)
|
||||
|
||||
|
||||
async def update_document_parse_results(
    pool: asyncpg.Pool,
    *,
    document_id: str,
    normalized_storage_ref: str | None,
    parser_output_ref: str | None,
    parse_quality_score: float,
    parse_confidence: str,
    status: str,
) -> None:
    """Store the outcome of the parsing stage on an existing document row.

    Writes the normalized-text reference, the parser-output reference, the
    quality score, the confidence label and the new status, and bumps
    ``updated_at``.

    Requirements: 4.1, 4.3, 9.1
    """
    update_sql = """UPDATE documents SET
        normalized_storage_ref = $2,
        parser_output_ref = $3,
        parse_quality_score = $4,
        parse_confidence = $5,
        status = $6,
        updated_at = NOW()
        WHERE id = $1"""
    params = (
        document_id,
        normalized_storage_ref,
        parser_output_ref,
        parse_quality_score,
        parse_confidence,
        status,
    )
    await pool.execute(update_sql, *params)
    logger.debug(
        "Updated document %s parse results: quality=%.2f confidence=%s status=%s",
        document_id,
        parse_quality_score,
        parse_confidence,
        status,
    )
|
||||
|
||||
|
||||
async def persist_document_company_mention(
    pool: asyncpg.Pool,
    *,
    document_id: str,
    company_id: str,
    ticker: str,
    mention_type: str = "direct",
    confidence: float = 1.0,
) -> str:
    """Insert a ``document_company_mentions`` row tying a document to a company.

    Returns the id of the new mention row as a string.
    """
    insert_sql = """INSERT INTO document_company_mentions
        (document_id, company_id, ticker, mention_type, confidence)
        VALUES ($1::uuid, $2::uuid, $3, $4, $5)
        RETURNING id"""
    params = (document_id, company_id, ticker, mention_type, confidence)
    mention_id = await pool.fetchval(insert_sql, *params)
    return str(mention_id)
|
||||
|
||||
|
||||
async def persist_broker_event(
    pool: asyncpg.Pool,
    *,
    ticker: str,
    event_type: str,
    data: dict[str, Any],
    source_provider: str,
    storage_ref: str,
    content_hash: str,
    captured_at: datetime | None = None,
) -> str:
    """Store a broker snapshot as a ``market_snapshots`` row.

    The row's snapshot_type is *event_type* prefixed with ``broker_``
    (e.g. broker_positions, broker_account, broker_orders). No company_id
    is written. When *captured_at* is omitted the current UTC time is used.

    Returns the id of the new row as a string.
    """
    captured = captured_at if captured_at else datetime.now(timezone.utc)
    insert_sql = """INSERT INTO market_snapshots
        (ticker, snapshot_type, data, source_provider,
        captured_at, storage_ref, content_hash)
        VALUES ($1, $2, $3::jsonb, $4, $5, $6, $7)
        RETURNING id"""
    params = (
        ticker,
        f"broker_{event_type}",
        json.dumps(data),
        source_provider,
        captured,
        storage_ref,
        content_hash,
    )
    row_id = await pool.fetchval(insert_sql, *params)
    logger.debug("Persisted broker event %s for %s", row_id, ticker)
    return str(row_id)
|
||||
|
||||
|
||||
def _resolve_document_type(source_type: str) -> str:
    """Translate an ingestion source_type into the document_type to store.

    Source types without an explicit entry are stored as "article".
    """
    known_types = {
        "news_api": "article",
        "filings_api": "filing",
        "web_scrape": "press_release",
    }
    try:
        return known_types[source_type]
    except KeyError:
        return "article"
|
||||
|
||||
|
||||
def _extract_publisher(item: dict[str, Any]) -> str:
    """Extract the publisher name from an adapter item dict.

    Lookup order:

    1. a truthy ``publisher`` value;
    2. ``source["name"]`` when ``source`` is a dict;
    3. a truthy non-dict ``source`` value.

    Returns:
        The publisher as a string; "" when none of the above is available.
        A missing or null ``source["name"]`` also yields "", so the result
        is always a ``str``.
    """
    publisher = item.get("publisher")
    if publisher:
        return str(publisher)

    source = item.get("source")
    if isinstance(source, dict):
        # "name" may be absent, null or a non-string; never leak None.
        return str(source.get("name") or "")
    return str(source) if source else ""
|
||||
|
||||
|
||||
def _parse_published_at(item: dict[str, Any]) -> datetime | None:
    """Parse the publication time from an adapter item.

    Reads ``publishedAt`` (falling back to ``published_at``). Accepted
    values:

    - a ``datetime``, returned unchanged;
    - an ISO-8601 string (a trailing "Z" is treated as UTC);
    - an RFC 2822 date string such as "Tue, 02 Jan 2024 03:04:05 GMT",
      the form commonly used by RSS feeds.

    Returns:
        The parsed datetime, or None when the field is missing, empty or
        cannot be parsed. The result is timezone-aware only if the input
        carried zone information.
    """
    # Imported locally so this fallback needs no module-level import change.
    from email.utils import parsedate_to_datetime

    raw = item.get("publishedAt") or item.get("published_at")
    if not raw:
        return None
    if isinstance(raw, datetime):
        return raw

    text = str(raw).strip()
    try:
        return datetime.fromisoformat(text.replace("Z", "+00:00"))
    except (ValueError, TypeError):
        pass

    try:
        return parsedate_to_datetime(text)
    except (ValueError, TypeError, IndexError, OverflowError):
        # Unparseable input raises ValueError on current Python versions
        # and TypeError on older ones.
        return None
|
||||
|
||||
|
||||
async def persist_ingestion_items(
    pool: asyncpg.Pool,
    *,
    source_type: str,
    ticker: str,
    company_id: str | None,
    items: list[dict[str, Any]],
    storage_ref: str,
    adapter_metadata: dict[str, Any],
    content_hash: str,
) -> tuple[int, list[str]]:
    """Dispatch ingestion items to the persistence routine for *source_type*.

    ``market_api`` and ``broker`` items go to their snapshot routines; every
    other source type is handled as documents (news_api, filings_api,
    web_scrape).

    Returns (new_item_count, list_of_new_ids).
    """
    if source_type == "market_api":
        provider = adapter_metadata.get("provider", "unknown")
        return await _persist_market_items(
            pool,
            ticker=ticker,
            company_id=company_id,
            items=items,
            storage_ref=storage_ref,
            provider=provider,
            content_hash=content_hash,
        )

    if source_type == "broker":
        provider = adapter_metadata.get("provider", "unknown")
        endpoint = adapter_metadata.get("endpoint", "positions")
        return await _persist_broker_items(
            pool,
            ticker=ticker,
            items=items,
            storage_ref=storage_ref,
            provider=provider,
            endpoint=endpoint,
            content_hash=content_hash,
        )

    # Anything else is document-shaped; adapter_metadata and content_hash
    # are not forwarded on this path.
    return await _persist_document_items(
        pool,
        source_type=source_type,
        ticker=ticker,
        company_id=company_id,
        items=items,
        storage_ref=storage_ref,
    )
|
||||
|
||||
|
||||
async def _persist_market_items(
    pool: asyncpg.Pool,
    *,
    ticker: str,
    company_id: str | None,
    items: list[dict[str, Any]],
    storage_ref: str,
    provider: str,
    content_hash: str,
) -> tuple[int, list[str]]:
    """Store each market data item as its own ``market_snapshots`` row.

    Every item is hashed individually (key-sorted JSON) and skipped when a
    snapshot with that hash already exists. The batch-level *content_hash*
    argument is accepted but not used here.

    Returns (number_of_rows_inserted, their_ids).
    """
    persisted: list[str] = []
    for item in items:
        fingerprint = content_hash_str(json.dumps(item, sort_keys=True))
        already_stored = await pool.fetchval(
            "SELECT 1 FROM market_snapshots WHERE content_hash = $1", fingerprint
        )
        if not already_stored:
            new_id = await persist_market_snapshot(
                pool,
                company_id=company_id,
                ticker=ticker,
                snapshot_type=_infer_market_snapshot_type(item),
                data=item,
                source_provider=provider,
                storage_ref=storage_ref,
                content_hash=fingerprint,
            )
            persisted.append(new_id)
    return len(persisted), persisted
|
||||
|
||||
|
||||
def _infer_market_snapshot_type(item: dict[str, Any]) -> str:
    """Classify a market data item by which keys it contains.

    Checked in order: "bar" (has all of o/h/l/c), "ticker_details" (has
    market_cap or sic_code), "quote" (has ask or bid); otherwise "snapshot".
    """
    bar_fields = ("o", "h", "l", "c")
    detail_fields = ("market_cap", "sic_code")
    quote_fields = ("ask", "bid")

    # Aggregate bars carry the full open/high/low/close set.
    if not any(field not in item for field in bar_fields):
        return "bar"
    if any(field in item for field in detail_fields):
        return "ticker_details"
    if any(field in item for field in quote_fields):
        return "quote"
    return "snapshot"
|
||||
|
||||
|
||||
async def _persist_broker_items(
    pool: asyncpg.Pool,
    *,
    ticker: str,
    items: list[dict[str, Any]],
    storage_ref: str,
    provider: str,
    endpoint: str,
    content_hash: str,
) -> tuple[int, list[str]]:
    """Persist previously unseen broker fetch items via ``persist_broker_event``.

    Each item is fingerprinted with ``content_hash_str`` over its key-sorted
    JSON form and skipped when that fingerprint already exists in
    ``market_snapshots``. ``endpoint`` is forwarded as the event type. The
    batch-level ``content_hash`` argument is accepted but not read here.

    Returns:
        ``(count, ids)`` for the rows written by this call.
    """
    # NOTE(review): the dedup lookup targets market_snapshots, so presumably
    # persist_broker_event writes there too — confirm against that helper.
    written: list[str] = []
    for entry in items:
        fingerprint = content_hash_str(json.dumps(entry, sort_keys=True))

        already_stored = await pool.fetchval(
            "SELECT 1 FROM market_snapshots WHERE content_hash = $1", fingerprint
        )
        if already_stored:
            continue

        event_id = await persist_broker_event(
            pool,
            ticker=ticker,
            event_type=endpoint,
            data=entry,
            source_provider=provider,
            storage_ref=storage_ref,
            content_hash=fingerprint,
        )
        written.append(event_id)

    return len(written), written
|
||||
|
||||
|
||||
async def _persist_document_items(
    pool: asyncpg.Pool,
    *,
    source_type: str,
    ticker: str,
    company_id: str | None,
    items: list[dict[str, Any]],
    storage_ref: str,
) -> tuple[int, list[str]]:
    """Write document items (news, filings, web scrape) to the documents table.

    For each item the function derives a content hash, title, URL, canonical
    URL, publication time and publisher, then calls ``persist_document``.
    When that call returns ``None`` the item is skipped. When a
    ``company_id`` is supplied, a company mention is recorded for every
    document that was stored.

    Returns:
        ``(count, ids)`` for the documents stored by this call.
    """
    doc_type = _resolve_document_type(source_type)
    stored_ids: list[str] = []

    for entry in items:
        # Prefer a hash supplied with the item; otherwise hash its sorted JSON.
        digest = entry.get("content_hash")
        if not digest:
            digest = content_hash_str(json.dumps(entry, sort_keys=True))

        headline = entry.get("title", entry.get("name", ""))
        link = entry.get("url", entry.get("link", ""))

        # Fall back to normalizing the raw URL when no canonical URL is given.
        canonical = entry.get("canonical_url")
        if not canonical:
            canonical = normalize_url(link) if link else None

        published_at = _parse_published_at(entry)
        publisher = _extract_publisher(entry)

        doc_id = await persist_document(
            pool,
            document_type=doc_type,
            source_type=source_type,
            publisher=publisher,
            url=link or None,
            canonical_url=canonical,
            title=headline,
            published_at=published_at,
            content_hash=digest,
            storage_ref=storage_ref,
        )
        if doc_id is None:
            continue

        # Link the document to the company when one is known.
        if company_id:
            await persist_document_company_mention(
                pool,
                document_id=doc_id,
                company_id=company_id,
                ticker=ticker,
            )

        stored_ids.append(doc_id)

    return len(stored_ids), stored_ids
|
||||
|
||||
|
||||
# --- Retry and failure tracking (Requirement 3.4) ---
|
||||
|
||||
# Backoff constants — match scheduler defaults for consistency
|
||||
RETRY_BACKOFF_BASE: int = 60  # base delay in seconds
RETRY_BACKOFF_MAX: int = 3600  # maximum delay in seconds
RETRY_MAX_COUNT: int = 10  # retry count at which a source is considered exhausted


def compute_next_retry_at(
    retry_count: int,
    now: datetime | None = None,
    base: int = RETRY_BACKOFF_BASE,
    cap: int = RETRY_BACKOFF_MAX,
) -> datetime:
    """Compute the next eligible retry time using exponential backoff.

    The delay is ``base * 2 ** retry_count`` seconds, limited to ``cap``.
    The exponent is clamped to the range 0..8, so a negative ``retry_count``
    is treated as 0 and never produces a delay shorter than ``base``.

    Args:
        retry_count: Current retry count (before incrementing).
        now: Reference timestamp (defaults to UTC now).
        base: Base delay in seconds.
        cap: Maximum delay in seconds.

    Returns:
        Datetime of the next eligible retry.
    """
    ts = now if now is not None else datetime.now(timezone.utc)
    # Without the lower clamp, 2 ** negative is a fraction of the base delay.
    exponent = min(max(retry_count, 0), 8)
    delay = min(base * (2 ** exponent), cap)
    return ts + timedelta(seconds=delay)
|
||||
|
||||
|
||||
async def get_source_retry_count(
    pool: asyncpg.Pool,
    source_id: str,
) -> int:
    """Look up the retry count carried by a source's latest run.

    Only the single most recent ``ingestion_runs`` row for the source
    (ordered by ``started_at``) is inspected. Its ``retry_count`` is returned
    when that run has status ``'failed'``; in every other case (no runs, or a
    latest run that is not failed) the result is 0.
    """
    latest_run_sql = """SELECT status, retry_count
           FROM ingestion_runs
           WHERE source_id = $1::uuid
           ORDER BY started_at DESC
           LIMIT 1"""
    latest = await pool.fetchrow(latest_run_sql, source_id)

    if not latest or latest["status"] != "failed":
        return 0
    # A NULL retry_count on a failed run counts as zero.
    return latest["retry_count"] or 0
|
||||
|
||||
|
||||
async def record_retrieval_failure(
    pool: asyncpg.Pool,
    run_id: str,
    source_id: str,
    error_message: str,
    retry_count: int | None = None,
    now: datetime | None = None,
) -> dict[str, Any]:
    """Mark an ingestion run as failed and store its retry policy state.

    The ``ingestion_runs`` row identified by ``run_id`` is updated with
    status ``'failed'``, the error message, the new retry count, the next
    retry time (exponential backoff) and a completion timestamp.

    The new retry count is the previous count plus one. The previous count is
    ``retry_count`` when given, otherwise it is looked up with
    ``get_source_retry_count``.

    Returns:
        A dict describing the recorded state (``run_id``, ``source_id``,
        ``retry_count``, ``next_retry_at`` as ISO text, ``exhausted``,
        ``error_message``) for observability.

    Requirement 3.4
    """
    ts = now or datetime.now(timezone.utc)

    # NOTE(review): the lookup inspects the source's most recent run by
    # started_at. If the row for run_id already exists with a non-'failed'
    # status at this point, the lookup yields 0 — confirm against the caller.
    if retry_count is None:
        previous = await get_source_retry_count(pool, source_id)
    else:
        previous = retry_count
    attempt = previous + 1

    # Backoff is computed from the count *before* this failure.
    next_retry = compute_next_retry_at(previous, now=ts)
    exhausted = attempt >= RETRY_MAX_COUNT

    update_sql = """UPDATE ingestion_runs
           SET status = 'failed',
               error_message = $2,
               retry_count = $3,
               next_retry_at = $4,
               completed_at = $5
           WHERE id = $1"""
    await pool.execute(update_sql, run_id, error_message, attempt, next_retry, ts)

    if exhausted:
        logger.warning(
            "Source %s exhausted retries (%d/%d): %s",
            source_id, attempt, RETRY_MAX_COUNT, error_message,
        )
    else:
        logger.info(
            "Source %s failed (retry %d/%d), next retry at %s: %s",
            source_id, attempt, RETRY_MAX_COUNT,
            next_retry.isoformat(), error_message,
        )

    return {
        "run_id": run_id,
        "source_id": source_id,
        "retry_count": attempt,
        "next_retry_at": next_retry.isoformat(),
        "exhausted": exhausted,
        "error_message": error_message,
    }
|
||||
|
||||
|
||||
async def persist_document_intelligence(
    pool: asyncpg.Pool,
    *,
    document_id: str,
    summary: str,
    macro_themes: list[str],
    novelty_score: float,
    source_credibility: float,
    extraction_warnings: list[str],
    confidence: float,
    model_provider: str,
    model_name: str,
    prompt_version: str,
    schema_version: str,
    raw_output_ref: str | None = None,
    prompt_ref: str | None = None,
    validation_status: str = "valid",
    validation_errors: list[str] | None = None,
    retry_count: int = 0,
) -> str:
    """Insert one row into ``document_intelligence`` and return its id.

    List-valued fields (``macro_themes``, ``extraction_warnings``,
    ``validation_errors``) are serialized to JSON text and cast to ``jsonb``
    by the statement; a missing ``validation_errors`` is stored as an empty
    list.

    Returns:
        The id of the inserted row, as a string.

    Requirements: 5.3, 5.4, 9.2
    """
    insert_sql = """INSERT INTO document_intelligence
           (document_id, summary, macro_themes, novelty_score,
            source_credibility, extraction_warnings, confidence,
            model_provider, model_name, prompt_version, schema_version,
            raw_output_ref, prompt_ref, validation_status,
            validation_errors, retry_count)
           VALUES ($1::uuid, $2, $3::jsonb, $4, $5, $6::jsonb, $7,
                   $8, $9, $10, $11, $12, $13, $14, $15::jsonb, $16)
           RETURNING id"""
    # Positional order must match the $1..$16 placeholders above.
    params = (
        document_id,
        summary,
        json.dumps(macro_themes),
        novelty_score,
        source_credibility,
        json.dumps(extraction_warnings),
        confidence,
        model_provider,
        model_name,
        prompt_version,
        schema_version,
        raw_output_ref,
        prompt_ref,
        validation_status,
        json.dumps(validation_errors or []),
        retry_count,
    )
    intel_id = await pool.fetchval(insert_sql, *params)
    logger.debug("Persisted document intelligence %s for doc %s", intel_id, document_id)
    return str(intel_id)
|
||||
|
||||
|
||||
async def persist_document_impact(
    pool: asyncpg.Pool,
    *,
    intelligence_id: str,
    company_id: str,
    ticker: str,
    relevance: float,
    sentiment: str,
    impact_score: float,
    impact_horizon: str,
    catalyst_type: str,
    key_facts: list[str],
    risks: list[str],
    evidence_spans: list[str],
) -> str:
    """Insert one per-company row into ``document_impact_records``.

    The row is linked to a document intelligence row through
    ``intelligence_id``. ``key_facts``, ``risks`` and ``evidence_spans`` are
    serialized to JSON text and cast to ``jsonb`` by the statement.

    Returns:
        The id of the inserted row, as a string.

    Requirements: 5.3, 5.5, 9.2
    """
    insert_sql = """INSERT INTO document_impact_records
           (intelligence_id, company_id, ticker, relevance, sentiment,
            impact_score, impact_horizon, catalyst_type,
            key_facts, risks, evidence_spans)
           VALUES ($1::uuid, $2::uuid, $3, $4, $5, $6, $7, $8,
                   $9::jsonb, $10::jsonb, $11::jsonb)
           RETURNING id"""
    # Positional order must match the $1..$11 placeholders above.
    params = (
        intelligence_id,
        company_id,
        ticker,
        relevance,
        sentiment,
        impact_score,
        impact_horizon,
        catalyst_type,
        json.dumps(key_facts),
        json.dumps(risks),
        json.dumps(evidence_spans),
    )
    impact_id = await pool.fetchval(insert_sql, *params)
    logger.debug("Persisted impact record %s for %s", impact_id, ticker)
    return str(impact_id)
|
||||
|
||||
|
||||
async def update_document_status(
    pool: asyncpg.Pool,
    *,
    document_id: str,
    status: str,
) -> None:
    """Set ``status`` on one ``documents`` row and bump its ``updated_at``.

    Used to advance documents through the pipeline:
    ingested → parsed → extracted → failed.

    Requirements: 5.4
    """
    update_sql = (
        """UPDATE documents SET status = $2, updated_at = NOW() WHERE id = $1::uuid"""
    )
    await pool.execute(update_sql, document_id, status)
    logger.debug("Updated document %s status to %s", document_id, status)
|
||||
|
||||
|
||||
async def reset_source_retry_state(
    pool: asyncpg.Pool,
    source_id: str,
) -> None:
    """Clear accumulated backoff on a source's most recent run.

    Sets ``retry_count = 0`` and ``next_retry_at = NULL`` on the latest
    ``ingestion_runs`` row for the source (latest by ``started_at``).
    Intended to be called after a successful ingestion.
    """
    reset_sql = """UPDATE ingestion_runs
           SET retry_count = 0, next_retry_at = NULL
           WHERE id = (
               SELECT id FROM ingestion_runs
               WHERE source_id = $1::uuid
               ORDER BY started_at DESC
               LIMIT 1
           )"""
    await pool.execute(reset_sql, source_id)
|
||||
@@ -0,0 +1,317 @@
|
||||
"""Prometheus metrics for all Stonks Oracle pipeline stages.
|
||||
|
||||
Provides counters, histograms, and gauges covering:
|
||||
- Ingestion: items fetched, new items, errors, adapter latency
|
||||
- Parsing: documents parsed, quality scores, low-quality flags
|
||||
- Extraction: attempts, successes, failures, latency, confidence, retries
|
||||
- Aggregation: trend windows computed, signal counts, contradiction scores
|
||||
- Lake publication: facts published per table, write latency
|
||||
- Trading: orders submitted, rejected, filled, risk evaluations
|
||||
|
||||
Requirements: 12.1, 12.2
|
||||
Design: Section 12 (Observability and Operations)
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
from prometheus_client import Counter, Gauge, Histogram, Info
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Service info
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
SERVICE_INFO = Info("stonks_oracle", "Stonks Oracle service metadata")
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Ingestion metrics
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
INGESTION_JOBS_TOTAL = Counter(
|
||||
"stonks_ingestion_jobs_total",
|
||||
"Total ingestion jobs processed",
|
||||
["source_type", "status"],
|
||||
)
|
||||
|
||||
INGESTION_ITEMS_FETCHED = Counter(
|
||||
"stonks_ingestion_items_fetched_total",
|
||||
"Total items fetched from external sources",
|
||||
["source_type"],
|
||||
)
|
||||
|
||||
INGESTION_ITEMS_NEW = Counter(
|
||||
"stonks_ingestion_items_new_total",
|
||||
"New (non-duplicate) items ingested",
|
||||
["source_type"],
|
||||
)
|
||||
|
||||
INGESTION_ITEMS_DEDUPED = Counter(
|
||||
"stonks_ingestion_items_deduped_total",
|
||||
"Items skipped due to deduplication",
|
||||
["source_type"],
|
||||
)
|
||||
|
||||
INGESTION_ERRORS = Counter(
|
||||
"stonks_ingestion_errors_total",
|
||||
"Ingestion errors by source type",
|
||||
["source_type"],
|
||||
)
|
||||
|
||||
INGESTION_ADAPTER_DURATION = Histogram(
|
||||
"stonks_ingestion_adapter_duration_seconds",
|
||||
"Adapter fetch latency in seconds",
|
||||
["source_type"],
|
||||
buckets=(0.1, 0.5, 1, 2, 5, 10, 30, 60),
|
||||
)
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Parsing metrics
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
PARSE_JOBS_TOTAL = Counter(
|
||||
"stonks_parse_jobs_total",
|
||||
"Total parse jobs processed",
|
||||
["status"],
|
||||
)
|
||||
|
||||
PARSE_QUALITY_SCORE = Histogram(
|
||||
"stonks_parse_quality_score",
|
||||
"Distribution of parser quality scores",
|
||||
buckets=(0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0),
|
||||
)
|
||||
|
||||
PARSE_LOW_QUALITY_TOTAL = Counter(
|
||||
"stonks_parse_low_quality_total",
|
||||
"Documents flagged as low quality by the parser",
|
||||
)
|
||||
|
||||
PARSE_DURATION = Histogram(
|
||||
"stonks_parse_duration_seconds",
|
||||
"Parse job duration in seconds",
|
||||
buckets=(0.05, 0.1, 0.25, 0.5, 1, 2, 5, 10),
|
||||
)
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Extraction metrics
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
EXTRACTION_JOBS_TOTAL = Counter(
|
||||
"stonks_extraction_jobs_total",
|
||||
"Total extraction jobs processed",
|
||||
["status"],
|
||||
)
|
||||
|
||||
EXTRACTION_ATTEMPTS = Counter(
|
||||
"stonks_extraction_attempts_total",
|
||||
"Total Ollama extraction attempts (including retries)",
|
||||
)
|
||||
|
||||
EXTRACTION_RETRIES = Counter(
|
||||
"stonks_extraction_retries_total",
|
||||
"Extraction retry count",
|
||||
)
|
||||
|
||||
EXTRACTION_DURATION = Histogram(
|
||||
"stonks_extraction_duration_seconds",
|
||||
"Extraction total duration in seconds",
|
||||
buckets=(1, 2, 5, 10, 20, 30, 60, 120),
|
||||
)
|
||||
|
||||
EXTRACTION_CONFIDENCE = Histogram(
|
||||
"stonks_extraction_confidence",
|
||||
"Distribution of extraction confidence scores",
|
||||
buckets=(0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0),
|
||||
)
|
||||
|
||||
EXTRACTION_VALIDATION_ERRORS = Counter(
|
||||
"stonks_extraction_validation_errors_total",
|
||||
"Total validation errors across extractions",
|
||||
)
|
||||
|
||||
EXTRACTION_TOKEN_ESTIMATE = Counter(
|
||||
"stonks_extraction_tokens_total",
|
||||
"Estimated token usage",
|
||||
["direction"],
|
||||
)
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Aggregation metrics
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
AGGREGATION_WINDOWS_COMPUTED = Counter(
|
||||
"stonks_aggregation_windows_total",
|
||||
"Trend windows computed",
|
||||
["window"],
|
||||
)
|
||||
|
||||
AGGREGATION_SIGNALS_PROCESSED = Counter(
|
||||
"stonks_aggregation_signals_total",
|
||||
"Signals processed during aggregation",
|
||||
["window"],
|
||||
)
|
||||
|
||||
AGGREGATION_CONTRADICTION_SCORE = Histogram(
|
||||
"stonks_aggregation_contradiction_score",
|
||||
"Distribution of contradiction scores in trend windows",
|
||||
buckets=(0.0, 0.05, 0.1, 0.15, 0.2, 0.3, 0.4, 0.5, 0.75, 1.0),
|
||||
)
|
||||
|
||||
AGGREGATION_DURATION = Histogram(
|
||||
"stonks_aggregation_duration_seconds",
|
||||
"Aggregation job duration in seconds",
|
||||
["window"],
|
||||
buckets=(0.05, 0.1, 0.25, 0.5, 1, 2, 5, 10),
|
||||
)
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Recommendation metrics
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
RECOMMENDATION_GENERATED = Counter(
|
||||
"stonks_recommendations_total",
|
||||
"Recommendations generated",
|
||||
["action", "mode"],
|
||||
)
|
||||
|
||||
RECOMMENDATION_SUPPRESSED = Counter(
|
||||
"stonks_recommendations_suppressed_total",
|
||||
"Recommendations suppressed due to low data quality",
|
||||
)
|
||||
|
||||
RECOMMENDATION_CONFIDENCE = Histogram(
|
||||
"stonks_recommendation_confidence",
|
||||
"Distribution of recommendation confidence scores",
|
||||
buckets=(0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0),
|
||||
)
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Lake publication metrics
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
LAKE_FACTS_PUBLISHED = Counter(
|
||||
"stonks_lake_facts_published_total",
|
||||
"Analytical facts published to the lakehouse",
|
||||
["table_name"],
|
||||
)
|
||||
|
||||
LAKE_PUBLISH_DURATION = Histogram(
|
||||
"stonks_lake_publish_duration_seconds",
|
||||
"Lake publication write latency in seconds",
|
||||
["table_name"],
|
||||
buckets=(0.01, 0.05, 0.1, 0.25, 0.5, 1, 2, 5),
|
||||
)
|
||||
|
||||
LAKE_PUBLISH_ERRORS = Counter(
|
||||
"stonks_lake_publish_errors_total",
|
||||
"Lake publication errors",
|
||||
["table_name"],
|
||||
)
|
||||
|
||||
LAKE_PUBLISH_BYTES = Counter(
|
||||
"stonks_lake_publish_bytes_total",
|
||||
"Total bytes written to the lakehouse",
|
||||
["table_name"],
|
||||
)
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Trading / broker metrics
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
ORDERS_SUBMITTED = Counter(
|
||||
"stonks_orders_submitted_total",
|
||||
"Orders submitted to broker",
|
||||
["side", "order_type", "mode"],
|
||||
)
|
||||
|
||||
ORDERS_REJECTED = Counter(
|
||||
"stonks_orders_rejected_total",
|
||||
"Orders rejected before broker submission",
|
||||
["reason_category"],
|
||||
)
|
||||
|
||||
ORDERS_FILLED = Counter(
|
||||
"stonks_orders_filled_total",
|
||||
"Orders filled by broker",
|
||||
["side"],
|
||||
)
|
||||
|
||||
ORDERS_DUPLICATES_PREVENTED = Counter(
|
||||
"stonks_orders_duplicates_prevented_total",
|
||||
"Duplicate orders prevented by idempotency checks",
|
||||
["detected_via"],
|
||||
)
|
||||
|
||||
RISK_EVALUATIONS_TOTAL = Counter(
|
||||
"stonks_risk_evaluations_total",
|
||||
"Risk evaluations performed",
|
||||
["result"],
|
||||
)
|
||||
|
||||
RISK_CHECK_FAILURES = Counter(
|
||||
"stonks_risk_check_failures_total",
|
||||
"Individual risk check failures",
|
||||
["check_name"],
|
||||
)
|
||||
|
||||
POSITIONS_SYNCED = Counter(
|
||||
"stonks_positions_synced_total",
|
||||
"Position sync operations completed",
|
||||
)
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Active gauges
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
ACTIVE_JOBS = Gauge(
|
||||
"stonks_active_jobs",
|
||||
"Currently processing jobs by stage",
|
||||
["stage"],
|
||||
)
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Alerting metrics
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
ALERTS_FIRED = Counter(
|
||||
"stonks_alerts_fired_total",
|
||||
"Total alerts fired by rule",
|
||||
["rule", "severity"],
|
||||
)
|
||||
|
||||
ALERTS_RESOLVED = Counter(
|
||||
"stonks_alerts_resolved_total",
|
||||
"Total alerts resolved by rule",
|
||||
["rule"],
|
||||
)
|
||||
|
||||
ALERT_CHECK_DURATION = Histogram(
|
||||
"stonks_alert_check_duration_seconds",
|
||||
"Duration of alert evaluation cycle",
|
||||
buckets=(0.01, 0.05, 0.1, 0.25, 0.5, 1, 2, 5),
|
||||
)
|
||||
|
||||
ALERT_ACTIVE = Gauge(
|
||||
"stonks_alert_active",
|
||||
"Whether an alert rule is currently firing (1) or resolved (0)",
|
||||
["rule"],
|
||||
)
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Dead-letter queue metrics
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
DLQ_ITEMS_TOTAL = Counter(
|
||||
"stonks_dlq_items_total",
|
||||
"Jobs sent to dead-letter queues",
|
||||
["queue"],
|
||||
)
|
||||
|
||||
DLQ_REPLAYED_TOTAL = Counter(
|
||||
"stonks_dlq_replayed_total",
|
||||
"Jobs replayed from dead-letter queues",
|
||||
["queue"],
|
||||
)
|
||||
|
||||
DLQ_DEPTH = Gauge(
|
||||
"stonks_dlq_depth",
|
||||
"Current dead-letter queue depth",
|
||||
["queue"],
|
||||
)
|
||||
@@ -46,6 +46,15 @@ def retry_key(job_id: str) -> str:
|
||||
return f"{RETRY_PREFIX}:{job_id}"
|
||||
|
||||
|
||||
# Dead-letter queues
|
||||
DLQ_PREFIX = f"{PREFIX}:dlq"
|
||||
|
||||
|
||||
def dlq_key(queue_name: str) -> str:
    """Build the dead-letter queue key for ``queue_name``.

    The key is ``DLQ_PREFIX`` and the queue name joined by a colon.
    """
    return "{}:{}".format(DLQ_PREFIX, queue_name)
|
||||
|
||||
|
||||
# --- Queue names ---
|
||||
QUEUE_INGESTION = "ingestion"
|
||||
QUEUE_PARSING = "parsing"
|
||||
@@ -54,3 +63,4 @@ QUEUE_AGGREGATION = "aggregation"
|
||||
QUEUE_RECOMMENDATION = "recommendation"
|
||||
QUEUE_LAKE_PUBLISH = "lake_publish"
|
||||
QUEUE_TRADE = "trade"
|
||||
QUEUE_BROKER = "broker_orders"
|
||||
|
||||
@@ -0,0 +1,306 @@
|
||||
"""Data retention and lifecycle controls for raw and derived artifacts.
|
||||
|
||||
Provides configurable per-bucket retention policies, expired object cleanup
|
||||
from MinIO, and expired metadata cleanup from PostgreSQL.
|
||||
|
||||
Requirements: N3 (preserve source metadata, access policy, and retention policy)
|
||||
Design ref: Section 5.2 (MinIO bucket layout), Section 10 (Reliability and Safety)
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
from dataclasses import dataclass
|
||||
from datetime import datetime, timedelta, timezone
|
||||
|
||||
import asyncpg
|
||||
from minio import Minio
|
||||
|
||||
from services.shared.config import BUCKET_RETENTION_FIELDS, RetentionConfig
|
||||
from services.shared.storage import ALL_BUCKETS
|
||||
|
||||
logger = logging.getLogger("retention")
|
||||
|
||||
|
||||
@dataclass
class RetentionPolicy:
    """Resolved retention policy for a single bucket.

    Instances come either from config defaults or from rows of the
    ``retention_policies`` table.
    """

    # Bucket the policy applies to.
    bucket_name: str
    # Age in days after which objects in the bucket count as expired.
    retention_days: int
    # Loaded from the database policy row; defaults to False for config policies.
    archive_before_delete: bool = False
|
||||
|
||||
|
||||
@dataclass
class CleanupResult:
    """Counters describing one cleanup run for a single bucket.

    Every counter starts at zero, so a bare ``CleanupResult(bucket_name=...)``
    represents a run that did nothing.
    """

    # Bucket the counters refer to.
    bucket_name: str
    # Number of objects considered by the run.
    objects_scanned: int = 0
    # Number of objects actually removed.
    objects_deleted: int = 0
    # Storage reclaimed, in bytes.
    bytes_freed: int = 0
    # Number of database rows removed.
    db_rows_deleted: int = 0
|
||||
|
||||
|
||||
def default_retention_days(bucket: str, config: RetentionConfig) -> int:
    """Return the configured retention period, in days, for ``bucket``.

    ``BUCKET_RETENTION_FIELDS`` maps a bucket name to the attribute of
    ``config`` holding its retention. Buckets without a mapping, and mapped
    attributes missing from ``config``, fall back to 365 days.
    """
    attr_name = BUCKET_RETENTION_FIELDS.get(bucket)
    return getattr(config, attr_name, 365) if attr_name else 365
|
||||
|
||||
|
||||
def resolve_policies(config: RetentionConfig) -> list[RetentionPolicy]:
    """Create one config-default policy per bucket in ``ALL_BUCKETS``.

    The result preserves the order of ``ALL_BUCKETS``; each policy takes its
    retention from ``default_retention_days``.
    """
    policies: list[RetentionPolicy] = []
    for bucket in ALL_BUCKETS:
        days = default_retention_days(bucket, config)
        policies.append(RetentionPolicy(bucket_name=bucket, retention_days=days))
    return policies
|
||||
|
||||
|
||||
async def load_db_policies(pool: asyncpg.Pool) -> dict[str, RetentionPolicy]:
    """Fetch retention policy overrides stored in the database.

    Only rows of ``retention_policies`` that are active and belong to the
    ``'default'`` artifact class are read.

    Returns:
        A mapping from bucket name to its ``RetentionPolicy``. If several
        rows share a bucket name, the last one returned by the query wins.
    """
    rows = await pool.fetch(
        """SELECT bucket_name, retention_days, archive_before_delete
           FROM retention_policies
           WHERE active = TRUE AND artifact_class = 'default'"""
    )

    overrides: dict[str, RetentionPolicy] = {}
    for record in rows:
        bucket = record["bucket_name"]
        overrides[bucket] = RetentionPolicy(
            bucket_name=bucket,
            retention_days=record["retention_days"],
            archive_before_delete=record["archive_before_delete"],
        )
    return overrides
|
||||
|
||||
|
||||
def merge_policies(
    config_policies: list[RetentionPolicy],
    db_policies: dict[str, RetentionPolicy],
) -> list[RetentionPolicy]:
    """Overlay database overrides onto the config-default policies.

    The result has one entry per config policy, in the same order. Wherever
    ``db_policies`` has an entry for the bucket, that entry replaces the
    config policy. Database entries for buckets absent from
    ``config_policies`` are not included.
    """
    return [
        db_policies[default.bucket_name]
        if default.bucket_name in db_policies
        else default
        for default in config_policies
    ]
|
||||
|
||||
|
||||
def cutoff_date(retention_days: int, now: datetime | None = None) -> datetime:
    """Return the instant ``retention_days`` days before ``now``.

    Objects older than the returned datetime are expired. When ``now`` is
    omitted, the current UTC time is used as the reference.
    """
    if not now:
        now = datetime.now(timezone.utc)
    return now - timedelta(days=retention_days)
|
||||
|
||||
|
||||
def list_expired_objects(
    client: Minio,
    bucket: str,
    retention_days: int,
    batch_size: int = 1000,
    now: datetime | None = None,
) -> list[str]:
    """List object names in a bucket that are older than the retention cutoff.

    Uses each object's ``last_modified`` timestamp as reported by the
    listing. Objects without a timestamp or without a name are ignored.

    Args:
        client: MinIO client used for the recursive listing.
        bucket: Bucket to inspect.
        retention_days: Age in days after which an object is expired.
        batch_size: Upper bound on the number of names returned. A value of
            zero or less yields an empty list without listing the bucket.
        now: Reference time for the cutoff (defaults to the current UTC time).

    Returns:
        At most ``batch_size`` expired object names. If the listing fails
        part-way, the error is logged and the names collected so far are
        returned.
    """
    expired: list[str] = []
    # Guard first: otherwise one name would be appended before the size check.
    if batch_size <= 0:
        return expired

    cutoff = cutoff_date(retention_days, now)

    try:
        for obj in client.list_objects(bucket, recursive=True):
            if not obj.last_modified or not obj.object_name:
                continue
            if obj.last_modified < cutoff:
                expired.append(obj.object_name)
                if len(expired) >= batch_size:
                    break
    except Exception:
        # Best-effort listing: keep what was gathered before the failure.
        logger.exception("Error listing objects in bucket %s", bucket)

    return expired
|
||||
|
||||
|
||||
def delete_expired_objects(
    client: Minio,
    bucket: str,
    object_names: list[str],
) -> int:
    """Remove the named objects from a bucket, one request per object.

    A failure to remove one object is logged as a warning (with traceback)
    and does not stop the remaining removals.

    Returns:
        The number of objects removed without error.
    """
    removed = 0
    for object_name in object_names:
        try:
            client.remove_object(bucket, object_name)
        except Exception:
            logger.warning("Failed to delete %s/%s", bucket, object_name, exc_info=True)
        else:
            removed += 1
    return removed
|
||||
|
||||
|
||||
def cleanup_bucket(
    client: Minio,
    policy: RetentionPolicy,
    batch_size: int = 1000,
    now: datetime | None = None,
) -> CleanupResult:
    """Apply one bucket's retention policy to its stored objects.

    Lists up to ``batch_size`` expired objects and deletes them.

    Returns:
        A ``CleanupResult`` whose ``objects_scanned`` is the number of
        expired objects listed and whose ``objects_deleted`` is the number
        removed without error. The other counters are left at zero.
    """
    bucket = policy.bucket_name
    days = policy.retention_days
    outcome = CleanupResult(bucket_name=bucket)

    expired = list_expired_objects(client, bucket, days, batch_size=batch_size, now=now)
    outcome.objects_scanned = len(expired)

    if not expired:
        logger.debug("Bucket %s: no expired objects (retention=%dd)", bucket, days)
        return outcome

    outcome.objects_deleted = delete_expired_objects(client, bucket, expired)
    logger.info(
        "Bucket %s: scanned=%d deleted=%d (retention=%dd)",
        bucket, outcome.objects_scanned, outcome.objects_deleted, days,
    )
    return outcome
|
||||
|
||||
|
||||
# --- PostgreSQL metadata cleanup ---
|
||||
|
||||
# (table name, DELETE statement) pairs run by the metadata cleanup.
# Each statement takes the cutoff timestamp as $1 and removes rows whose
# timestamp column (started_at / captured_at) is older than that cutoff.
DB_CLEANUP_QUERIES: list[tuple[str, str]] = [
    ("ingestion_runs", "DELETE FROM ingestion_runs WHERE started_at < $1"),
    ("market_snapshots", "DELETE FROM market_snapshots WHERE captured_at < $1"),
]
|
||||
|
||||
|
||||
async def cleanup_expired_db_records(
    pool: asyncpg.Pool,
    retention_days: int,
    now: datetime | None = None,
) -> int:
    """Delete operational metadata older than the retention cutoff.

    Runs every statement in ``DB_CLEANUP_QUERIES`` on one pooled connection,
    passing the cutoff derived from ``retention_days``. A failure on one
    table is logged and the remaining tables are still processed.

    Returns:
        The total number of rows deleted across all tables.
    """
    threshold = cutoff_date(retention_days, now)
    deleted_total = 0

    async with pool.acquire() as conn:
        for table, statement in DB_CLEANUP_QUERIES:
            try:
                status_tag = await conn.execute(statement, threshold)
                # The command status looks like "DELETE N"; N is the row count.
                removed = int(status_tag.split()[-1]) if status_tag else 0
                deleted_total += removed
                if removed > 0:
                    logger.info(
                        "Cleaned %d expired rows from %s (cutoff=%s)",
                        removed, table, threshold.isoformat(),
                    )
            except Exception:
                logger.exception("Error cleaning table %s", table)

    return deleted_total
|
||||
|
||||
|
||||
async def record_retention_run(
    pool: asyncpg.Pool,
    bucket_name: str,
    result: CleanupResult,
    status: str = "completed",
    error_message: str | None = None,
) -> None:
    """Insert one audit row into ``retention_runs`` for a bucket cleanup.

    The row stores the counters from ``result``, the given ``status`` and
    ``error_message``, and the database's ``NOW()`` as completion time.
    """
    insert_sql = """INSERT INTO retention_runs
           (bucket_name, objects_scanned, objects_deleted, bytes_freed,
            db_rows_deleted, completed_at, status, error_message)
           VALUES ($1, $2, $3, $4, $5, NOW(), $6, $7)"""
    # Positional order must match the $1..$7 placeholders above.
    params = (
        bucket_name,
        result.objects_scanned,
        result.objects_deleted,
        result.bytes_freed,
        result.db_rows_deleted,
        status,
        error_message,
    )
    await pool.execute(insert_sql, *params)
|
||||
|
||||
|
||||
async def run_retention_cleanup(
    minio_client: Minio,
    pool: asyncpg.Pool,
    config: RetentionConfig,
    now: datetime | None = None,
) -> list[CleanupResult]:
    """Run the full retention cleanup cycle.

    1. Resolve policies from config defaults + DB overrides
    2. Clean up expired MinIO objects per bucket
    3. Record each bucket run for observability
    4. Clean up expired PostgreSQL metadata

    Every step is best-effort: a failure is logged and the cycle continues.
    Exactly one ``CleanupResult`` is produced per bucket policy, even when
    recording the run fails.

    Args:
        minio_client: Client used to list and delete bucket objects.
        pool: Connection pool for policy lookup, run recording and DB cleanup.
        config: Retention configuration (defaults and ``batch_size``).
        now: Reference time for cutoffs (defaults to the current UTC time).

    Returns:
        One ``CleanupResult`` per bucket processed, in policy order. A bucket
        whose cleanup raised is represented by an all-zero result.
    """
    config_policies = resolve_policies(config)
    try:
        db_policies = await load_db_policies(pool)
    except Exception:
        logger.warning(
            "Could not load DB retention policies, using config defaults",
            exc_info=True,
        )
        db_policies = {}

    policies = merge_policies(config_policies, db_policies)
    results: list[CleanupResult] = []

    for policy in policies:
        status = "completed"
        error_message: str | None = None
        try:
            result = cleanup_bucket(
                minio_client, policy,
                batch_size=config.batch_size, now=now,
            )
        except Exception:
            logger.exception("Retention cleanup failed for bucket %s", policy.bucket_name)
            result = CleanupResult(bucket_name=policy.bucket_name)
            status, error_message = "failed", "See logs"
        results.append(result)

        # Recording is kept separate from the cleanup itself so that a failed
        # audit insert neither adds a second result for the bucket nor aborts
        # the remaining buckets.
        try:
            await record_retention_run(
                pool, policy.bucket_name, result,
                status=status, error_message=error_message,
            )
        except Exception:
            logger.exception(
                "Could not record retention run for bucket %s", policy.bucket_name
            )

    # With no policies there is no retention period to derive a cutoff from.
    if not policies:
        return results

    # DB metadata is cleaned using the shortest retention across all policies.
    min_retention = min(p.retention_days for p in policies)
    try:
        db_deleted = await cleanup_expired_db_records(pool, min_retention, now=now)
        if db_deleted > 0:
            logger.info("Total DB rows cleaned: %d", db_deleted)
    except Exception:
        logger.exception("DB retention cleanup failed")

    return results
|
||||
@@ -108,6 +108,41 @@ class DocumentIntelligence(BaseModel):
|
||||
|
||||
# --- Trend Summary ---
|
||||
|
||||
class MarketContext(BaseModel):
    """Snapshot of recent market-data features for one symbol.

    Attached to aggregation output to enrich it with price/volume context.
    Every numeric feature is optional and defaults to ``None``.
    """

    ticker: str = ""
    # % change over the window
    price_change_pct: Optional[float] = None
    # average daily volume
    avg_volume: Optional[float] = None
    # volume vs prior period
    volume_change_pct: Optional[float] = None
    # intra-window price std dev
    volatility: Optional[float] = None
    latest_close: Optional[float] = None
    latest_bar_at: Optional[datetime] = None
    bars_available: int = 0

    @property
    def has_data(self) -> bool:
        """Whether at least one bar is available (``bars_available > 0``)."""
        return self.bars_available > 0
|
||||
|
||||
|
||||
class DisagreementDetail(BaseModel):
    """One explicit disagreement between document signals.

    Contradictory signals are not collapsed into a single score; instead
    this model records what the conflict is about, so downstream consumers
    can inspect *why* the signals disagree.

    Requirements: 6.4
    """

    # e.g. "sentiment", "catalyst", "impact_horizon"
    dimension: str = ""
    # Document ids and aggregate weight on each side of the disagreement.
    positive_doc_ids: List[str] = Field(default_factory=list)
    negative_doc_ids: List[str] = Field(default_factory=list)
    positive_weight: float = 0.0
    negative_weight: float = 0.0
    # Free-text explanation of the conflict.
    description: str = ""
|
||||
|
||||
|
||||
class TrendSummary(BaseModel):
|
||||
entity_type: str = "company"
|
||||
entity_id: str = ""
|
||||
@@ -120,6 +155,8 @@ class TrendSummary(BaseModel):
|
||||
dominant_catalysts: List[str] = Field(default_factory=list)
|
||||
material_risks: List[str] = Field(default_factory=list)
|
||||
contradiction_score: float = Field(ge=0, le=1, default=0.0)
|
||||
disagreement_details: List[DisagreementDetail] = Field(default_factory=list)
|
||||
market_context: Optional[MarketContext] = None
|
||||
generated_at: datetime = Field(default_factory=datetime.utcnow)
|
||||
|
||||
|
||||
|
||||
@@ -0,0 +1,352 @@
|
||||
"""Raw artifact upload to MinIO.
|
||||
|
||||
Provides a reusable storage layer for uploading raw artifacts (API payloads,
|
||||
HTML, normalized text, model outputs) to MinIO with consistent path conventions,
|
||||
bucket management, and content-type handling.
|
||||
|
||||
Bucket layout follows the design spec:
|
||||
- stonks-raw-market — raw market API payloads
|
||||
- stonks-raw-news — raw news API payloads and article HTML
|
||||
- stonks-raw-filings — raw filings and issuer event payloads
|
||||
- stonks-normalized — cleaned text and parser outputs
|
||||
- stonks-llm-prompts — prompts and schemas used
|
||||
- stonks-llm-results — raw model outputs and validation reports
|
||||
- stonks-lakehouse — partitioned analytical datasets and table metadata
|
||||
- stonks-audit — execution traces and exported reports
|
||||
|
||||
Object path pattern:
|
||||
/{stage}/{symbol}/{yyyy}/{mm}/{dd}/{document_id}/{artifact_type}.{ext}
|
||||
|
||||
Requirements: 3.1, 3.2, 3.3, 9.1
|
||||
"""
|
||||
import io
|
||||
import logging
|
||||
from datetime import datetime, timezone
|
||||
from typing import Mapping
|
||||
|
||||
from minio import Minio
|
||||
from minio.error import S3Error
|
||||
|
||||
logger = logging.getLogger("storage")

# Every bucket the platform uses; the default target list for ensure_buckets().
ALL_BUCKETS = [
    "stonks-raw-market",
    "stonks-raw-news",
    "stonks-raw-filings",
    "stonks-normalized",
    "stonks-llm-prompts",
    "stonks-llm-results",
    "stonks-lakehouse",
    "stonks-audit",
]

# source_type -> raw bucket that receives its payloads.
SOURCE_BUCKET_MAP: dict[str, str] = {
    "market_api": "stonks-raw-market",
    "news_api": "stonks-raw-news",
    "filings_api": "stonks-raw-filings",
    "web_scrape": "stonks-raw-news",
    "broker": "stonks-raw-market",
}

# artifact type -> (MIME content type, file extension without the dot).
ARTIFACT_CONTENT_TYPES: dict[str, tuple[str, str]] = {
    "raw_json": ("application/json", "json"),
    "raw_html": ("text/html", "html"),
    "raw_text": ("text/plain", "txt"),
    "raw_payload": ("application/octet-stream", "bin"),
}
|
||||
|
||||
|
||||
def bucket_for_source(source_type: str) -> str:
    """Look up the raw MinIO bucket for ``source_type``.

    Source types missing from ``SOURCE_BUCKET_MAP`` fall back to
    ``"stonks-raw-market"``.
    """
    try:
        return SOURCE_BUCKET_MAP[source_type]
    except KeyError:
        return "stonks-raw-market"
|
||||
|
||||
|
||||
def build_artifact_path(
|
||||
source_type: str,
|
||||
ticker: str,
|
||||
document_id: str,
|
||||
artifact_name: str = "raw",
|
||||
ext: str = "json",
|
||||
timestamp: datetime | None = None,
|
||||
) -> str:
|
||||
"""Build a MinIO object path following the design convention.
|
||||
|
||||
Pattern: {source_type}/{ticker}/{yyyy}/{mm}/{dd}/{document_id}/{artifact_name}.{ext}
|
||||
"""
|
||||
ts = timestamp or datetime.now(timezone.utc)
|
||||
return (
|
||||
f"{source_type}/{ticker}/"
|
||||
f"{ts.year}/{ts.month:02d}/{ts.day:02d}/"
|
||||
f"{document_id}/{artifact_name}.{ext}"
|
||||
)
|
||||
|
||||
|
||||
def storage_ref(bucket: str, path: str) -> str:
    """Return the ``s3://{bucket}/{path}`` URI for a stored artifact."""
    location = f"{bucket}/{path}"
    return f"s3://{location}"
|
||||
|
||||
|
||||
def ensure_buckets(client: Minio, buckets: list[str] | None = None) -> list[str]:
    """Create whichever of the requested buckets do not exist yet.

    Args:
        client: MinIO client instance.
        buckets: Bucket names to check. ``None`` selects ``ALL_BUCKETS``;
            note that an empty list does too.

    Returns:
        Names of the buckets created by this call, in iteration order.

    Raises:
        S3Error: Re-raised after logging when an existence check or a
            creation fails; later buckets are not attempted.
    """
    wanted = buckets or ALL_BUCKETS
    created: list[str] = []
    for name in wanted:
        try:
            if client.bucket_exists(name):
                continue
            client.make_bucket(name)
        except S3Error as err:
            logger.error("Failed to ensure bucket %s: %s", name, err)
            raise
        created.append(name)
        logger.info("Created bucket: %s", name)
    return created
|
||||
|
||||
|
||||
def upload_artifact(
    client: Minio,
    bucket: str,
    path: str,
    data: bytes,
    content_type: str = "application/json",
    metadata: Mapping[str, str] | None = None,
) -> str:
    """Store ``data`` as a MinIO object and return its s3:// reference.

    Args:
        client: MinIO client instance.
        bucket: Target bucket name.
        path: Object path within the bucket.
        data: Raw bytes to upload.
        content_type: MIME type for the object.
        metadata: Optional user metadata to attach to the object.

    Returns:
        s3:// URI pointing to the uploaded object.
    """
    size = len(data)
    # put_object reads from a stream, so the bytes are wrapped in BytesIO
    # and the length is passed explicitly.
    client.put_object(
        bucket,
        path,
        io.BytesIO(data),
        length=size,
        content_type=content_type,
        metadata=metadata,
    )
    uri = storage_ref(bucket, path)
    logger.debug("Uploaded %d bytes to %s", size, uri)
    return uri
|
||||
|
||||
|
||||
def upload_raw_artifact(
    client: Minio,
    source_type: str,
    ticker: str,
    document_id: str,
    data: bytes,
    artifact_type: str = "raw_json",
    timestamp: datetime | None = None,
    metadata: Mapping[str, str] | None = None,
) -> str:
    """Store a raw payload using the standard bucket, path and content type.

    This is the primary entry point for ingestion workers to store raw
    payloads.

    Args:
        client: MinIO client instance.
        source_type: One of market_api, news_api, filings_api, web_scrape,
            broker. Selects the bucket and the first path segment.
        ticker: Company ticker symbol.
        document_id: Unique document or run identifier.
        data: Raw bytes to upload.
        artifact_type: One of raw_json, raw_html, raw_text, raw_payload.
            Any other value is stored as ``application/octet-stream`` with
            a ``.bin`` extension.
        timestamp: Override timestamp for path generation (defaults to now
            UTC).
        metadata: Optional user metadata dict.

    Returns:
        s3:// URI pointing to the uploaded object.
    """
    fallback = ("application/octet-stream", "bin")
    mime_type, extension = ARTIFACT_CONTENT_TYPES.get(artifact_type, fallback)
    object_path = build_artifact_path(
        source_type=source_type,
        ticker=ticker,
        document_id=document_id,
        artifact_name="raw",
        ext=extension,
        timestamp=timestamp,
    )
    return upload_artifact(
        client,
        bucket_for_source(source_type),
        object_path,
        data,
        content_type=mime_type,
        metadata=metadata,
    )
|
||||
|
||||
|
||||
def upload_html_artifact(
    client: Minio,
    ticker: str,
    document_id: str,
    html_bytes: bytes,
    timestamp: datetime | None = None,
    metadata: Mapping[str, str] | None = None,
) -> str:
    """Store the raw HTML of a scraped web page.

    The object goes to the bucket mapped to the ``web_scrape`` source type,
    at ``web_scrape/{ticker}/{yyyy}/{mm}/{dd}/{document_id}/raw.html``, with
    content type ``text/html``.

    Returns:
        s3:// URI pointing to the uploaded object.
    """
    source = "web_scrape"
    target_bucket = bucket_for_source(source)
    object_path = build_artifact_path(
        source_type=source,
        ticker=ticker,
        document_id=document_id,
        artifact_name="raw",
        ext="html",
        timestamp=timestamp,
    )
    return upload_artifact(
        client,
        target_bucket,
        object_path,
        html_bytes,
        content_type="text/html",
        metadata=metadata,
    )
|
||||
|
||||
|
||||
def upload_normalized_text(
    client: Minio,
    ticker: str,
    document_id: str,
    text_bytes: bytes,
    timestamp: datetime | None = None,
    metadata: Mapping[str, str] | None = None,
) -> str:
    """Store normalized (parsed) text in the stonks-normalized bucket.

    Object key: parsed/{ticker}/{yyyy}/{mm}/{dd}/{document_id}/normalized.txt

    Returns:
        s3:// URI pointing to the uploaded object.
    """
    # The shared path builder yields the same key layout; "parsed" takes the
    # place of the source-type segment.
    object_path = build_artifact_path(
        source_type="parsed",
        ticker=ticker,
        document_id=document_id,
        artifact_name="normalized",
        ext="txt",
        timestamp=timestamp,
    )
    return upload_artifact(
        client,
        "stonks-normalized",
        object_path,
        text_bytes,
        content_type="text/plain",
        metadata=metadata,
    )
|
||||
|
||||
|
||||
def upload_parser_output(
    client: Minio,
    ticker: str,
    document_id: str,
    output_bytes: bytes,
    timestamp: datetime | None = None,
    metadata: Mapping[str, str] | None = None,
) -> str:
    """Store structured parser output JSON in the stonks-normalized bucket.

    Object key: parsed/{ticker}/{yyyy}/{mm}/{dd}/{document_id}/parser_output.json

    Returns:
        s3:// URI pointing to the uploaded object.
    """
    # The shared path builder yields the same key layout; "parsed" takes the
    # place of the source-type segment.
    object_path = build_artifact_path(
        source_type="parsed",
        ticker=ticker,
        document_id=document_id,
        artifact_name="parser_output",
        ext="json",
        timestamp=timestamp,
    )
    return upload_artifact(
        client,
        "stonks-normalized",
        object_path,
        output_bytes,
        content_type="application/json",
        metadata=metadata,
    )
|
||||
|
||||
|
||||
def upload_extraction_prompt(
    client: Minio,
    ticker: str,
    document_id: str,
    prompt_data: bytes,
    timestamp: datetime | None = None,
    metadata: Mapping[str, str] | None = None,
) -> str:
    """Store the extraction prompt and schema in stonks-llm-prompts.

    Object key: extraction/{ticker}/{yyyy}/{mm}/{dd}/{document_id}/prompt.json

    Returns:
        s3:// URI pointing to the uploaded object.
    """
    # The shared path builder yields the same key layout; "extraction" takes
    # the place of the source-type segment.
    object_path = build_artifact_path(
        source_type="extraction",
        ticker=ticker,
        document_id=document_id,
        artifact_name="prompt",
        ext="json",
        timestamp=timestamp,
    )
    return upload_artifact(
        client,
        "stonks-llm-prompts",
        object_path,
        prompt_data,
        content_type="application/json",
        metadata=metadata,
    )
|
||||
|
||||
|
||||
def upload_extraction_raw_output(
    client: Minio,
    ticker: str,
    document_id: str,
    output_data: bytes,
    attempt_index: int = 0,
    timestamp: datetime | None = None,
    metadata: Mapping[str, str] | None = None,
) -> str:
    """Store one raw model output in stonks-llm-results.

    Object key:
        extraction/{ticker}/{yyyy}/{mm}/{dd}/{document_id}/raw_output_{attempt}.json

    ``attempt_index`` is part of the file name, so each attempt gets its own
    object.

    Returns:
        s3:// URI pointing to the uploaded object.
    """
    # The shared path builder yields the same key layout; "extraction" takes
    # the place of the source-type segment.
    object_path = build_artifact_path(
        source_type="extraction",
        ticker=ticker,
        document_id=document_id,
        artifact_name=f"raw_output_{attempt_index}",
        ext="json",
        timestamp=timestamp,
    )
    return upload_artifact(
        client,
        "stonks-llm-results",
        object_path,
        output_data,
        content_type="application/json",
        metadata=metadata,
    )
|
||||
|
||||
|
||||
def upload_extraction_validation(
    client: Minio,
    ticker: str,
    document_id: str,
    validation_data: bytes,
    timestamp: datetime | None = None,
    metadata: Mapping[str, str] | None = None,
) -> str:
    """Store a validation report in stonks-llm-results.

    Object key: extraction/{ticker}/{yyyy}/{mm}/{dd}/{document_id}/validation.json

    Returns:
        s3:// URI pointing to the uploaded object.
    """
    # The shared path builder yields the same key layout; "extraction" takes
    # the place of the source-type segment.
    object_path = build_artifact_path(
        source_type="extraction",
        ticker=ticker,
        document_id=document_id,
        artifact_name="validation",
        ext="json",
        timestamp=timestamp,
    )
    return upload_artifact(
        client,
        "stonks-llm-results",
        object_path,
        validation_data,
        content_type="application/json",
        metadata=metadata,
    )
|
||||
|
||||
|
||||
def upload_extraction_intelligence(
    client: Minio,
    ticker: str,
    document_id: str,
    intelligence_data: bytes,
    timestamp: datetime | None = None,
    metadata: Mapping[str, str] | None = None,
) -> str:
    """Store the final intelligence object in stonks-llm-results.

    Object key: extraction/{ticker}/{yyyy}/{mm}/{dd}/{document_id}/intelligence.json

    Returns:
        s3:// URI pointing to the uploaded object.
    """
    # The shared path builder yields the same key layout; "extraction" takes
    # the place of the source-type segment.
    object_path = build_artifact_path(
        source_type="extraction",
        ticker=ticker,
        document_id=document_id,
        artifact_name="intelligence",
        ext="json",
        timestamp=timestamp,
    )
    return upload_artifact(
        client,
        "stonks-llm-results",
        object_path,
        intelligence_data,
        content_type="application/json",
        metadata=metadata,
    )
|
||||
|
||||
|
||||
def download_artifact(client: Minio, bucket: str, path: str) -> bytes:
    """Fetch an object from MinIO and return its full contents as bytes.

    The response is closed and its connection released whether or not the
    read succeeds.
    """
    stream = client.get_object(bucket, path)
    try:
        payload = stream.read()
    finally:
        stream.close()
        stream.release_conn()
    return payload
|
||||
Reference in New Issue
Block a user