phase 14-15: docker build validation and helm deployment
This commit is contained in:
@@ -0,0 +1,342 @@
|
||||
"""Operational alerting for Stonks Oracle pipeline health.
|
||||
|
||||
Evaluates alert rules against PostgreSQL operational state and emits
|
||||
structured log events and Prometheus metrics when thresholds are breached.
|
||||
|
||||
Alert rules:
|
||||
- source_failures: sustained source retrieval failures per source
|
||||
- schema_failure_spike: extraction validation failure rate exceeds threshold
|
||||
- analytical_lag: lake publication has not completed within threshold
|
||||
- broker_issues: consecutive broker submission errors
|
||||
|
||||
Requirements: 12.3
|
||||
Design: Section 12 (Observability and Operations)
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
from dataclasses import dataclass, field
|
||||
from datetime import datetime, timezone
|
||||
from typing import Any
|
||||
|
||||
import asyncpg
|
||||
|
||||
from services.shared.config import AlertingConfig
|
||||
from services.shared.metrics import (
|
||||
ALERT_ACTIVE,
|
||||
ALERT_CHECK_DURATION,
|
||||
ALERTS_FIRED,
|
||||
ALERTS_RESOLVED,
|
||||
)
|
||||
|
||||
# Module-level logger registered under the fixed name "alerting".
logger = logging.getLogger("alerting")
|
||||
|
||||
|
||||
@dataclass
class Alert:
    """One firing of an alert rule.

    ``details`` holds rule-specific context. Its optional ``"key"`` entry is
    combined with ``rule`` to identify the alert instance when alert state
    is tracked.
    """

    # Name of the rule that produced the alert.
    rule: str
    # Either "warning" or "critical".
    severity: str
    # Human-readable, single-sentence description.
    summary: str
    # Rule-specific context; a fresh dict per instance.
    details: dict[str, Any] = field(default_factory=dict)
    # Timezone-aware UTC timestamp taken when the instance is created.
    fired_at: datetime = field(
        default_factory=lambda: datetime.now(tz=timezone.utc)
    )
|
||||
|
||||
|
||||
@dataclass
class AlertState:
    """Remembers which alerts are active so firing/resolved transitions can be detected.

    Entries in ``active`` are keyed by ``"<rule>:<key>"``, where ``<key>`` is
    the ``"key"`` entry of the alert's ``details`` (empty string when absent).
    """

    active: dict[str, Alert] = field(default_factory=dict)

    def fire(self, alert: Alert) -> bool:
        """Store *alert* as active and report whether it was not active before."""
        state_key = f"{alert.rule}:{alert.details.get('key', '')}"
        already_active = state_key in self.active
        # Always overwrite so the stored alert is the most recent one.
        self.active[state_key] = alert
        return not already_active

    def resolve(self, rule: str, key: str = "") -> bool:
        """Remove the alert for *rule*/*key*; return True if one was active."""
        try:
            del self.active[f"{rule}:{key}"]
        except KeyError:
            return False
        return True

    def is_firing(self, rule: str, key: str = "") -> bool:
        """Return whether an alert for *rule*/*key* is currently active."""
        return f"{rule}:{key}" in self.active
|
||||
|
||||
|
||||
async def check_source_failures(
    pool: asyncpg.Pool,
    config: AlertingConfig,
) -> list[Alert]:
    """Find sources whose most recent ingestion runs have all failed.

    The query ranks each source's ``ingestion_runs`` rows that started within
    the last ``config.source_failure_window_hours`` hours from newest to
    oldest. A source is reported when its newest
    ``config.source_failure_threshold`` runs all exist inside the window and
    every one has status ``'failed'``.

    Because ``sources`` and ``companies`` are inner-joined, a source without
    a matching row in either table is not reported.

    Returns:
        One "warning" alert per failing source, keyed by the source id.
    """
    query = """WITH recent_runs AS (
               SELECT source_id, status,
                      ROW_NUMBER() OVER (PARTITION BY source_id ORDER BY started_at DESC) AS rn
               FROM ingestion_runs
               WHERE started_at >= NOW() - INTERVAL '1 hour' * $1
           ),
           failure_streaks AS (
               SELECT source_id,
                      COUNT(*) FILTER (WHERE status = 'failed') AS consecutive_failures,
                      COUNT(*) AS total_runs
               FROM recent_runs
               WHERE rn <= $2
               GROUP BY source_id
               HAVING COUNT(*) FILTER (WHERE status = 'failed') = COUNT(*)
                  AND COUNT(*) >= $2
           )
           SELECT fs.source_id, fs.consecutive_failures,
                  s.source_type, s.source_name, c.ticker
           FROM failure_streaks fs
           JOIN sources s ON s.id = fs.source_id
           JOIN companies c ON c.id = s.company_id"""

    failing_sources = await pool.fetch(
        query,
        config.source_failure_window_hours,
        config.source_failure_threshold,
    )

    return [
        Alert(
            rule="source_failures",
            severity="warning",
            summary=(
                f"Source {record['source_name']} ({record['source_type']}) for "
                f"{record['ticker']} has {record['consecutive_failures']} consecutive failures"
            ),
            details={
                # "key" identifies this alert instance within the rule.
                "key": str(record["source_id"]),
                "source_id": str(record["source_id"]),
                "source_type": record["source_type"],
                "source_name": record["source_name"],
                "ticker": record["ticker"],
                "consecutive_failures": record["consecutive_failures"],
            },
        )
        for record in failing_sources
    ]
|
||||
|
||||
|
||||
async def check_schema_failure_spike(
    pool: asyncpg.Pool,
    config: AlertingConfig,
) -> list[Alert]:
    """Alert when the recent failure rate in ``model_performance_metrics`` is too high.

    Counts all rows recorded within the last
    ``config.schema_failure_window_hours`` hours and those with
    ``success`` false, then compares ``failed / total`` against
    ``config.schema_failure_rate_threshold``.

    Returns:
        An empty list when there are no rows in the window or the rate is
        below the threshold. Otherwise a single alert keyed ``"global"``,
        with severity "critical" at a rate of 0.5 or more and "warning"
        below that.
    """
    window_hours = config.schema_failure_window_hours
    stats = await pool.fetchrow(
        """SELECT
               COUNT(*) AS total,
               COUNT(*) FILTER (WHERE NOT success) AS failed
           FROM model_performance_metrics
           WHERE recorded_at >= NOW() - INTERVAL '1 hour' * $1""",
        window_hours,
    )

    # Nothing recorded in the window: no rate can be computed.
    if not stats or stats["total"] == 0:
        return []

    total, failed = stats["total"], stats["failed"]
    failure_rate = failed / total
    threshold = config.schema_failure_rate_threshold

    if failure_rate < threshold:
        return []

    if failure_rate >= 0.5:
        severity = "critical"
    else:
        severity = "warning"

    spike = Alert(
        rule="schema_failure_spike",
        severity=severity,
        summary=(
            f"Extraction schema failure rate is {failure_rate:.1%} "
            f"({failed}/{total}) in the last {window_hours}h"
        ),
        details={
            "key": "global",
            "total_extractions": total,
            "failed_extractions": failed,
            "failure_rate": round(failure_rate, 4),
            "threshold": threshold,
            "window_hours": window_hours,
        },
    )
    return [spike]
|
||||
|
||||
|
||||
async def check_analytical_lag(
    pool: asyncpg.Pool,
    config: AlertingConfig,
) -> list[Alert]:
    """Alert on lake tables whose latest successful publish is too old.

    Reads ``audit_events`` rows of type ``'lake_publish'`` with status
    ``'success'``, groups them by ``details->>'table_name'`` and keeps the
    tables whose newest event is older than
    ``config.lake_lag_threshold_minutes`` minutes.

    A table with no successful publish event at all produces no row and is
    therefore not reported.

    Returns:
        One "warning" alert per stale table, keyed by the table name.
    """
    threshold_minutes = config.lake_lag_threshold_minutes
    stale_tables = await pool.fetch(
        """SELECT
               details->>'table_name' AS table_name,
               MAX(created_at) AS last_publish
           FROM audit_events
           WHERE event_type = 'lake_publish'
             AND details->>'status' = 'success'
             AND details->>'table_name' IS NOT NULL
           GROUP BY details->>'table_name'
           HAVING MAX(created_at) < NOW() - INTERVAL '1 minute' * $1""",
        threshold_minutes,
    )

    lagging: list[Alert] = []
    reference_time = datetime.now(timezone.utc)
    for record in stale_tables:
        name = record["table_name"]
        published_at = record["last_publish"]
        # A naive timestamp is treated as UTC so it can be subtracted from
        # the timezone-aware reference time.
        if published_at.tzinfo is None:
            published_at = published_at.replace(tzinfo=timezone.utc)
        lag_minutes = (reference_time - published_at).total_seconds() / 60

        context = {
            "key": name,
            "table_name": name,
            "last_publish": published_at.isoformat(),
            "lag_minutes": round(lag_minutes, 1),
            "threshold_minutes": threshold_minutes,
        }
        description = (
            f"Lake table '{name}' last published {lag_minutes:.0f}m ago "
            f"(threshold: {threshold_minutes}m)"
        )
        lagging.append(
            Alert(
                rule="analytical_lag",
                severity="warning",
                summary=description,
                details=context,
            )
        )
    return lagging
|
||||
|
||||
|
||||
async def check_broker_issues(
    pool: asyncpg.Pool,
    config: AlertingConfig,
) -> list[Alert]:
    """Alert when too many broker-level errors occurred recently.

    Counts ``order_events`` rows of type ``'broker_error'``,
    ``'broker_timeout'`` or ``'connection_failed'`` created within the last
    ``config.broker_error_window_hours`` hours and fires when the count
    reaches ``config.broker_error_threshold``.

    Only error events are counted; events of other types occurring between
    them are not inspected, so the errors need not be strictly consecutive.

    The count covers every matching event in the window (it is not capped at
    the threshold), so the number shown in the summary and in
    ``details["error_count"]`` is the real one.

    Returns:
        A single "critical" alert keyed ``"global"`` when the threshold is
        reached, otherwise an empty list.
    """
    row = await pool.fetchrow(
        """SELECT COUNT(*) AS error_count
           FROM order_events
           WHERE created_at >= NOW() - INTERVAL '1 hour' * $1
             AND event_type IN ('broker_error', 'broker_timeout', 'connection_failed')""",
        config.broker_error_window_hours,
    )

    # COUNT(*) always yields a row; the guard is purely defensive.
    if not row:
        return []

    error_count = row["error_count"]
    if error_count < config.broker_error_threshold:
        return []

    return [
        Alert(
            rule="broker_issues",
            severity="critical",
            summary=(
                f"{error_count} broker errors in the last "
                f"{config.broker_error_window_hours}h"
            ),
            details={
                "key": "global",
                "error_count": error_count,
                "threshold": config.broker_error_threshold,
                "window_hours": config.broker_error_window_hours,
            },
        )
    ]
|
||||
|
||||
|
||||
async def evaluate_alerts(
    pool: asyncpg.Pool,
    config: AlertingConfig,
    state: AlertState,
) -> list[Alert]:
    """Run every alert rule once and return the alerts that newly started firing.

    Each rule check runs independently: an exception in one check is logged
    and does not stop the others. ``state`` is updated in place so that
    firing and resolved transitions are detected across calls, and each
    transition is reported through the ``ALERTS_FIRED`` / ``ALERTS_RESOLVED``
    counters, the ``ALERT_ACTIVE`` gauge and a log record.

    When a rule's check raises, the active alerts of that rule are left
    untouched. A failed check gives no information about whether the
    condition has cleared, so it must not be treated as a resolution.

    Args:
        pool: Connection pool passed to every rule check.
        config: Thresholds and windows passed to every rule check.
        state: Alert state carried between evaluations; mutated in place.

    Returns:
        The alerts that were not active before this evaluation.
    """
    # (rule name, check coroutine function, message logged when the check raises)
    rule_checks = (
        (
            "source_failures",
            check_source_failures,
            "Error checking source failures",
        ),
        (
            "schema_failure_spike",
            check_schema_failure_spike,
            "Error checking schema failure spike",
        ),
        (
            "analytical_lag",
            check_analytical_lag,
            "Error checking analytical lag",
        ),
        (
            "broker_issues",
            check_broker_issues,
            "Error checking broker issues",
        ),
    )

    all_alerts: list[Alert] = []
    failed_rules: set[str] = set()

    with ALERT_CHECK_DURATION.time():
        for rule_name, check, error_message in rule_checks:
            try:
                all_alerts.extend(await check(pool, config))
            except Exception:
                logger.exception(error_message)
                failed_rules.add(rule_name)

    # Record every rule+key combination reported by this evaluation.
    current_keys: set[str] = set()
    newly_fired: list[Alert] = []

    for alert in all_alerts:
        current_keys.add(f"{alert.rule}:{alert.details.get('key', '')}")

        if not state.fire(alert):
            # Already active: the stored alert was refreshed, no transition.
            continue

        ALERTS_FIRED.labels(rule=alert.rule, severity=alert.severity).inc()
        ALERT_ACTIVE.labels(rule=alert.rule).set(1)
        newly_fired.append(alert)
        logger.warning(
            "ALERT FIRING: [%s] %s",
            alert.rule,
            alert.summary,
            extra={
                "alert_rule": alert.rule,
                "alert_severity": alert.severity,
                "alert_details": alert.details,
            },
        )

    # Anything active but not reported this time has resolved. The key set is
    # materialized first because resolve() mutates state.active.
    for key in sorted(set(state.active) - current_keys):
        rule, _, detail_key = key.partition(":")

        if rule in failed_rules:
            # The check raised, so its silence is not evidence of recovery.
            continue

        if state.resolve(rule, detail_key):
            ALERTS_RESOLVED.labels(rule=rule).inc()
            # Only clear the gauge once no alert of this rule remains active.
            still_firing = any(k.startswith(f"{rule}:") for k in state.active)
            if not still_firing:
                ALERT_ACTIVE.labels(rule=rule).set(0)
            logger.info(
                "ALERT RESOLVED: [%s] key=%s",
                rule,
                detail_key,
            )

    return newly_fired
|
||||
Reference in New Issue
Block a user