phase 14-15: docker build validation and helm deployment

This commit is contained in:
Celes Renata
2026-04-11 11:59:45 -07:00
parent 7394d241c9
commit ce10afa034
179 changed files with 32559 additions and 576 deletions
+342
View File
@@ -0,0 +1,342 @@
"""Operational alerting for Stonks Oracle pipeline health.
Evaluates alert rules against PostgreSQL operational state and emits
structured log events and Prometheus metrics when thresholds are breached.
Alert rules:
- source_failures: sustained source retrieval failures per source
- schema_failure_spike: extraction validation failure rate exceeds threshold
- analytical_lag: lake publication has not completed within threshold
- broker_issues: consecutive broker submission errors
Requirements: 12.3
Design: Section 12 (Observability and Operations)
"""
from __future__ import annotations
import logging
from dataclasses import dataclass, field
from datetime import datetime, timezone
from typing import Any
import asyncpg
from services.shared.config import AlertingConfig
from services.shared.metrics import (
ALERT_ACTIVE,
ALERT_CHECK_DURATION,
ALERTS_FIRED,
ALERTS_RESOLVED,
)
logger = logging.getLogger("alerting")
@dataclass
class Alert:
    """One alert produced by an alert rule.

    ``details`` and ``fired_at`` are created fresh for every instance, so
    alerts never share a details dict and each carries its own timestamp.
    """

    # Name of the rule that produced this alert.
    rule: str
    # "warning" | "critical"
    severity: str
    # Human-readable, single-line description of the condition.
    summary: str
    # Structured context. The optional "key" entry is what AlertState uses to
    # tell apart several alerts raised by the same rule.
    details: dict[str, Any] = field(default_factory=dict)
    # Timezone-aware (UTC) instant at which this instance was created.
    fired_at: datetime = field(
        default_factory=lambda: datetime.now(tz=timezone.utc),
    )
@dataclass
class AlertState:
    """Registry of currently firing alerts, used to detect transitions.

    Entries of ``active`` are stored under ``"<rule>:<key>"``, where ``<key>``
    is the alert's ``details["key"]`` or the empty string when that entry is
    absent.
    """

    active: dict[str, Alert] = field(default_factory=dict)

    @staticmethod
    def _slot(rule: str, key: object = "") -> str:
        """Build the ``active`` dictionary key for a rule/key pair."""
        return f"{rule}:{key}"

    def fire(self, alert: Alert) -> bool:
        """Store *alert* as active, replacing any previous entry.

        Returns:
            True when no alert was active under the same rule/key before this
            call, False when an existing entry was overwritten.
        """
        slot = self._slot(alert.rule, alert.details.get("key", ""))
        already_active = slot in self.active
        self.active[slot] = alert
        return not already_active

    def resolve(self, rule: str, key: str = "") -> bool:
        """Drop the active alert for *rule*/*key*.

        Returns:
            True when an entry existed and was removed, False otherwise.
        """
        slot = self._slot(rule, key)
        if slot not in self.active:
            return False
        del self.active[slot]
        return True

    def is_firing(self, rule: str, key: str = "") -> bool:
        """Return whether an alert is currently active for *rule*/*key*."""
        return self._slot(rule, key) in self.active
async def check_source_failures(
    pool: asyncpg.Pool,
    config: AlertingConfig,
) -> list[Alert]:
    """Report sources whose most recent ingestion runs all failed.

    For every source, the query ranks the ``ingestion_runs`` rows started
    within the last ``config.source_failure_window_hours`` hours from newest
    to oldest and keeps the newest ``config.source_failure_threshold`` of
    them. A source is reported only when it has at least that many runs in
    the window and every kept run has status ``'failed'``.

    The result is inner-joined to ``sources`` and ``companies``, so a source
    without a matching row in either table is not reported.

    Returns:
        One ``source_failures`` warning alert per failing source, keyed by
        the stringified source id.
    """
    # SQL text is kept verbatim; only the Python around it is laid out anew.
    query = """WITH recent_runs AS (
SELECT source_id, status,
ROW_NUMBER() OVER (PARTITION BY source_id ORDER BY started_at DESC) AS rn
FROM ingestion_runs
WHERE started_at >= NOW() - INTERVAL '1 hour' * $1
),
failure_streaks AS (
SELECT source_id,
COUNT(*) FILTER (WHERE status = 'failed') AS consecutive_failures,
COUNT(*) AS total_runs
FROM recent_runs
WHERE rn <= $2
GROUP BY source_id
HAVING COUNT(*) FILTER (WHERE status = 'failed') = COUNT(*)
AND COUNT(*) >= $2
)
SELECT fs.source_id, fs.consecutive_failures,
s.source_type, s.source_name, c.ticker
FROM failure_streaks fs
JOIN sources s ON s.id = fs.source_id
JOIN companies c ON c.id = s.company_id"""
    failing_sources = await pool.fetch(
        query,
        config.source_failure_window_hours,
        config.source_failure_threshold,
    )

    def _as_alert(record: Any) -> Alert:
        """Turn one result row into a ``source_failures`` alert."""
        streak = record["consecutive_failures"]
        description = (
            f"Source {record['source_name']} ({record['source_type']}) for "
            f"{record['ticker']} has {streak} consecutive failures"
        )
        context = {
            "key": str(record["source_id"]),
            "source_id": str(record["source_id"]),
            "source_type": record["source_type"],
            "source_name": record["source_name"],
            "ticker": record["ticker"],
            "consecutive_failures": streak,
        }
        return Alert(
            rule="source_failures",
            severity="warning",
            summary=description,
            details=context,
        )

    return [_as_alert(record) for record in failing_sources]
async def check_schema_failure_spike(
    pool: asyncpg.Pool,
    config: AlertingConfig,
) -> list[Alert]:
    """Alert when too large a share of recent extractions failed.

    Counts the ``model_performance_metrics`` rows recorded during the last
    ``config.schema_failure_window_hours`` hours and how many of them have a
    false ``success`` column, then compares the resulting ratio with
    ``config.schema_failure_rate_threshold``.

    Returns:
        An empty list when the window holds no rows or the failure rate is
        below the threshold. Otherwise a single ``schema_failure_spike``
        alert keyed ``"global"``; its severity is ``"critical"`` once the
        rate reaches 50% and ``"warning"`` below that.
    """
    window_hours = config.schema_failure_window_hours
    counts = await pool.fetchrow(
        """SELECT
COUNT(*) AS total,
COUNT(*) FILTER (WHERE NOT success) AS failed
FROM model_performance_metrics
WHERE recorded_at >= NOW() - INTERVAL '1 hour' * $1""",
        window_hours,
    )

    # No data in the window means there is no rate to evaluate.
    if not counts or counts["total"] == 0:
        return []

    attempts = counts["total"]
    failures = counts["failed"]
    rate = failures / attempts
    limit = config.schema_failure_rate_threshold
    if not (rate >= limit):
        return []

    if rate >= 0.5:
        level = "critical"
    else:
        level = "warning"

    description = (
        f"Extraction schema failure rate is {rate:.1%} "
        f"({failures}/{attempts}) in the last {window_hours}h"
    )
    context = {
        "key": "global",
        "total_extractions": attempts,
        "failed_extractions": failures,
        "failure_rate": round(rate, 4),
        "threshold": limit,
        "window_hours": window_hours,
    }
    return [
        Alert(
            rule="schema_failure_spike",
            severity=level,
            summary=description,
            details=context,
        )
    ]
async def check_analytical_lag(
    pool: asyncpg.Pool,
    config: AlertingConfig,
) -> list[Alert]:
    """Report lake tables whose latest successful publish is too old.

    The query groups ``audit_events`` rows of type ``lake_publish`` with
    status ``success`` by table name and keeps the tables whose newest such
    event is older than ``config.lake_lag_threshold_minutes`` minutes. A
    table with no successful publish event at all produces no row and
    therefore no alert.

    The lag shown in each alert is measured against this process's clock,
    taken once after the query returns. Timestamps that come back without
    timezone information are treated as UTC.

    Returns:
        One ``analytical_lag`` warning alert per stale table, keyed by the
        table name.
    """
    stale_tables = await pool.fetch(
        """SELECT
details->>'table_name' AS table_name,
MAX(created_at) AS last_publish
FROM audit_events
WHERE event_type = 'lake_publish'
AND details->>'status' = 'success'
AND details->>'table_name' IS NOT NULL
GROUP BY details->>'table_name'
HAVING MAX(created_at) < NOW() - INTERVAL '1 minute' * $1""",
        config.lake_lag_threshold_minutes,
    )
    reference_time = datetime.now(timezone.utc)

    def _ensure_utc(moment: datetime) -> datetime:
        """Attach UTC to a naive timestamp; leave aware ones untouched."""
        if moment.tzinfo is None:
            return moment.replace(tzinfo=timezone.utc)
        return moment

    results: list[Alert] = []
    for record in stale_tables:
        name = record["table_name"]
        published_at = _ensure_utc(record["last_publish"])
        age_minutes = (reference_time - published_at).total_seconds() / 60
        description = (
            f"Lake table '{name}' last published {age_minutes:.0f}m ago "
            f"(threshold: {config.lake_lag_threshold_minutes}m)"
        )
        context = {
            "key": name,
            "table_name": name,
            "last_publish": published_at.isoformat(),
            "lag_minutes": round(age_minutes, 1),
            "threshold_minutes": config.lake_lag_threshold_minutes,
        }
        results.append(
            Alert(
                rule="analytical_lag",
                severity="warning",
                summary=description,
                details=context,
            )
        )
    return results
async def check_broker_issues(
    pool: asyncpg.Pool,
    config: AlertingConfig,
) -> list[Alert]:
    """Alert when too many broker-level errors occurred recently.

    Counts the ``order_events`` rows of type ``broker_error``,
    ``broker_timeout`` or ``connection_failed`` created during the last
    ``config.broker_error_window_hours`` hours and fires once that count
    reaches ``config.broker_error_threshold``.

    Only error events are counted; successful events in between are not
    examined, so the errors do not have to be strictly consecutive.

    The count is taken over the whole window rather than being capped at the
    threshold, so the alert summary and ``details["error_count"]`` state how
    many errors actually occurred. The firing condition is unaffected by
    this: the full count reaches the threshold exactly when a capped count
    would.

    Returns:
        An empty list below the threshold, otherwise a single critical
        ``broker_issues`` alert keyed ``"global"``.
    """
    rows = await pool.fetch(
        """SELECT COUNT(*) AS error_count
           FROM order_events
           WHERE created_at >= NOW() - INTERVAL '1 hour' * $1
             AND event_type IN ('broker_error', 'broker_timeout', 'connection_failed')""",
        config.broker_error_window_hours,
    )
    if not rows:
        return []

    error_count = rows[0]["error_count"]
    if error_count < config.broker_error_threshold:
        return []

    return [
        Alert(
            rule="broker_issues",
            severity="critical",
            summary=(
                f"{error_count} broker errors in the last "
                f"{config.broker_error_window_hours}h"
            ),
            details={
                "key": "global",
                "error_count": error_count,
                "threshold": config.broker_error_threshold,
                "window_hours": config.broker_error_window_hours,
            },
        )
    ]
async def evaluate_alerts(
    pool: asyncpg.Pool,
    config: AlertingConfig,
    state: AlertState,
) -> list[Alert]:
    """Run every alert rule and record firing/resolved transitions.

    Each rule check runs independently: an exception in one check is logged
    and does not stop the others. Alerts returned by the checks are recorded
    in *state*. An alert that was not active before is counted in
    ``ALERTS_FIRED``, marks its rule active in ``ALERT_ACTIVE`` and is logged
    at warning level. An active alert that no check reported this cycle is
    removed from *state*, counted in ``ALERTS_RESOLVED`` and logged at info
    level; the rule's ``ALERT_ACTIVE`` gauge drops to 0 once no alert of that
    rule remains.

    Alerts belonging to a rule whose check raised are left untouched. A check
    that could not run says nothing about whether the condition cleared, and
    treating it as "no alerts" would report a false resolution followed by a
    fresh firing on the next successful cycle.

    Args:
        pool: Connection pool handed to every rule check.
        config: Thresholds and windows handed to every rule check.
        state: Active-alert registry; mutated in place.

    Returns:
        The alerts that started firing during this evaluation. Alerts that
        were already active are not included.
    """
    # (rule name as used in Alert.rule, check function, message logged on error)
    checks = (
        ("source_failures", check_source_failures,
         "Error checking source failures"),
        ("schema_failure_spike", check_schema_failure_spike,
         "Error checking schema failure spike"),
        ("analytical_lag", check_analytical_lag,
         "Error checking analytical lag"),
        ("broker_issues", check_broker_issues,
         "Error checking broker issues"),
    )

    all_alerts: list[Alert] = []
    failed_rules: set[str] = set()

    with ALERT_CHECK_DURATION.time():
        # Collect alerts from all rules
        for rule_name, check, error_message in checks:
            try:
                all_alerts.extend(await check(pool, config))
            except Exception:
                logger.exception(error_message)
                failed_rules.add(rule_name)

    # Track which rule+key combos are currently firing
    current_keys: set[str] = set()
    newly_fired: list[Alert] = []
    for alert in all_alerts:
        current_keys.add(f"{alert.rule}:{alert.details.get('key', '')}")
        if not state.fire(alert):
            continue
        # New alert firing
        ALERTS_FIRED.labels(rule=alert.rule, severity=alert.severity).inc()
        ALERT_ACTIVE.labels(rule=alert.rule).set(1)
        newly_fired.append(alert)
        logger.warning(
            "ALERT FIRING: [%s] %s",
            alert.rule,
            alert.summary,
            extra={
                "alert_rule": alert.rule,
                "alert_severity": alert.severity,
                "alert_details": alert.details,
            },
        )

    # Check for resolved alerts
    for key in set(state.active) - current_keys:
        # Split on the first colon only, because the detail key may contain
        # colons of its own.
        rule, _, detail_key = key.partition(":")
        if rule in failed_rules:
            # The rule was not evaluated this cycle; keep its alerts active.
            continue
        if not state.resolve(rule, detail_key):
            continue
        ALERTS_RESOLVED.labels(rule=rule).inc()
        # Only set gauge to 0 if no more alerts for this rule
        still_firing = any(k.startswith(f"{rule}:") for k in state.active)
        if not still_firing:
            ALERT_ACTIVE.labels(rule=rule).set(0)
        logger.info(
            "ALERT RESOLVED: [%s] key=%s",
            rule,
            detail_key,
        )

    return newly_fired