# stonks-oracle/services/shared/metrics.py
"""Prometheus metrics for all Stonks Oracle pipeline stages.
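Provides counters, histograms, gauges, and a service info metric covering:
- Ingestion: jobs, items fetched, new items, deduplicated items, errors, adapter latency
- Parsing: documents parsed, quality scores, low-quality flags, parse latency
- Extraction: attempts, successes, failures, latency, confidence, retries, token estimates
- Aggregation: trend windows computed, signal counts, contradiction scores, latency
- Recommendation: recommendations generated, suppressed, confidence scores
- Lake publication: facts published per table, write latency, errors, bytes written
- Trading: orders submitted, rejected, filled, risk evaluations
- Active jobs: jobs currently processing, by stage
- Alerting: alerts fired, resolved, currently active, check duration
- Dead-letter queues: items sent, items replayed, queue depth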
Requirements: 12.1, 12.2
Design: Section 12 (Observability and Operations)
"""
from __future__ import annotations
from prometheus_client import Counter, Gauge, Histogram, Info
# ---------------------------------------------------------------------------
# Service info
# ---------------------------------------------------------------------------
# Info metric holding service metadata for Stonks Oracle.
SERVICE_INFO = Info(
    name="stonks_oracle",
    documentation="Stonks Oracle service metadata",
)
# ---------------------------------------------------------------------------
# Ingestion metrics
# ---------------------------------------------------------------------------
# Ingestion jobs processed, labelled by source type and job status.
INGESTION_JOBS_TOTAL = Counter(
    name="stonks_ingestion_jobs_total",
    documentation="Total ingestion jobs processed",
    labelnames=["source_type", "status"],
)

# Items fetched from external sources, per source type.
INGESTION_ITEMS_FETCHED = Counter(
    name="stonks_ingestion_items_fetched_total",
    documentation="Total items fetched from external sources",
    labelnames=["source_type"],
)

# Items that were new (not duplicates), per source type.
INGESTION_ITEMS_NEW = Counter(
    name="stonks_ingestion_items_new_total",
    documentation="New (non-duplicate) items ingested",
    labelnames=["source_type"],
)

# Items skipped by deduplication, per source type.
INGESTION_ITEMS_DEDUPED = Counter(
    name="stonks_ingestion_items_deduped_total",
    documentation="Items skipped due to deduplication",
    labelnames=["source_type"],
)

# Ingestion errors, per source type.
INGESTION_ERRORS = Counter(
    name="stonks_ingestion_errors_total",
    documentation="Ingestion errors by source type",
    labelnames=["source_type"],
)

# Adapter fetch latency per source type; bucket bounds are in seconds.
INGESTION_ADAPTER_DURATION = Histogram(
    name="stonks_ingestion_adapter_duration_seconds",
    documentation="Adapter fetch latency in seconds",
    labelnames=["source_type"],
    buckets=(0.1, 0.5, 1, 2, 5, 10, 30, 60),
)
# ---------------------------------------------------------------------------
# Parsing metrics
# ---------------------------------------------------------------------------
# Parse jobs processed, labelled by job status.
PARSE_JOBS_TOTAL = Counter(
    name="stonks_parse_jobs_total",
    documentation="Total parse jobs processed",
    labelnames=["status"],
)

# Parser quality scores; bucket bounds step by 0.1 up to 1.0.
PARSE_QUALITY_SCORE = Histogram(
    name="stonks_parse_quality_score",
    documentation="Distribution of parser quality scores",
    buckets=(0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0),
)

# Documents the parser flagged as low quality (unlabelled).
PARSE_LOW_QUALITY_TOTAL = Counter(
    name="stonks_parse_low_quality_total",
    documentation="Documents flagged as low quality by the parser",
)

# Parse job duration; bucket bounds are in seconds.
PARSE_DURATION = Histogram(
    name="stonks_parse_duration_seconds",
    documentation="Parse job duration in seconds",
    buckets=(0.05, 0.1, 0.25, 0.5, 1, 2, 5, 10),
)
# ---------------------------------------------------------------------------
# Extraction metrics
# ---------------------------------------------------------------------------
# Extraction jobs processed, labelled by job status.
EXTRACTION_JOBS_TOTAL = Counter(
    name="stonks_extraction_jobs_total",
    documentation="Total extraction jobs processed",
    labelnames=["status"],
)

# Every Ollama extraction attempt, retries included (unlabelled).
EXTRACTION_ATTEMPTS = Counter(
    name="stonks_extraction_attempts_total",
    documentation="Total Ollama extraction attempts (including retries)",
)

# Extraction retries only (unlabelled).
EXTRACTION_RETRIES = Counter(
    name="stonks_extraction_retries_total",
    documentation="Extraction retry count",
)

# Total extraction duration; bucket bounds are in seconds.
EXTRACTION_DURATION = Histogram(
    name="stonks_extraction_duration_seconds",
    documentation="Extraction total duration in seconds",
    buckets=(1, 2, 5, 10, 20, 30, 60, 120),
)

# Extraction confidence scores; bucket bounds step by 0.1 up to 1.0.
EXTRACTION_CONFIDENCE = Histogram(
    name="stonks_extraction_confidence",
    documentation="Distribution of extraction confidence scores",
    buckets=(0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0),
)

# Validation errors summed across extractions (unlabelled).
EXTRACTION_VALIDATION_ERRORS = Counter(
    name="stonks_extraction_validation_errors_total",
    documentation="Total validation errors across extractions",
)

# Estimated token usage, split by the "direction" label.
EXTRACTION_TOKEN_ESTIMATE = Counter(
    name="stonks_extraction_tokens_total",
    documentation="Estimated token usage",
    labelnames=["direction"],
)
# ---------------------------------------------------------------------------
# Aggregation metrics
# ---------------------------------------------------------------------------
# Trend windows computed, labelled by window.
AGGREGATION_WINDOWS_COMPUTED = Counter(
    name="stonks_aggregation_windows_total",
    documentation="Trend windows computed",
    labelnames=["window"],
)

# Signals processed during aggregation, labelled by window.
AGGREGATION_SIGNALS_PROCESSED = Counter(
    name="stonks_aggregation_signals_total",
    documentation="Signals processed during aggregation",
    labelnames=["window"],
)

# Contradiction scores in trend windows; the bucket bounds are denser
# near zero (0.05 steps up to 0.2) and start at 0.0.
AGGREGATION_CONTRADICTION_SCORE = Histogram(
    name="stonks_aggregation_contradiction_score",
    documentation="Distribution of contradiction scores in trend windows",
    buckets=(0.0, 0.05, 0.1, 0.15, 0.2, 0.3, 0.4, 0.5, 0.75, 1.0),
)

# Aggregation job duration per window; bucket bounds are in seconds.
AGGREGATION_DURATION = Histogram(
    name="stonks_aggregation_duration_seconds",
    documentation="Aggregation job duration in seconds",
    labelnames=["window"],
    buckets=(0.05, 0.1, 0.25, 0.5, 1, 2, 5, 10),
)
# ---------------------------------------------------------------------------
# Recommendation metrics
# ---------------------------------------------------------------------------
# Recommendations generated, labelled by action and mode.
RECOMMENDATION_GENERATED = Counter(
    name="stonks_recommendations_total",
    documentation="Recommendations generated",
    labelnames=["action", "mode"],
)

# Recommendations suppressed because of low data quality (unlabelled).
RECOMMENDATION_SUPPRESSED = Counter(
    name="stonks_recommendations_suppressed_total",
    documentation="Recommendations suppressed due to low data quality",
)

# Recommendation confidence scores; bucket bounds step by 0.1 up to 1.0.
RECOMMENDATION_CONFIDENCE = Histogram(
    name="stonks_recommendation_confidence",
    documentation="Distribution of recommendation confidence scores",
    buckets=(0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0),
)
# ---------------------------------------------------------------------------
# Lake publication metrics
# ---------------------------------------------------------------------------
# Analytical facts published to the lakehouse, per table.
LAKE_FACTS_PUBLISHED = Counter(
    name="stonks_lake_facts_published_total",
    documentation="Analytical facts published to the lakehouse",
    labelnames=["table_name"],
)

# Lake write latency per table; bucket bounds are in seconds.
LAKE_PUBLISH_DURATION = Histogram(
    name="stonks_lake_publish_duration_seconds",
    documentation="Lake publication write latency in seconds",
    labelnames=["table_name"],
    buckets=(0.01, 0.05, 0.1, 0.25, 0.5, 1, 2, 5),
)

# Lake publication errors, per table.
LAKE_PUBLISH_ERRORS = Counter(
    name="stonks_lake_publish_errors_total",
    documentation="Lake publication errors",
    labelnames=["table_name"],
)

# Bytes written to the lakehouse, per table.
LAKE_PUBLISH_BYTES = Counter(
    name="stonks_lake_publish_bytes_total",
    documentation="Total bytes written to the lakehouse",
    labelnames=["table_name"],
)
# ---------------------------------------------------------------------------
# Trading / broker metrics
# ---------------------------------------------------------------------------
# Orders submitted to the broker, labelled by side, order type and mode.
ORDERS_SUBMITTED = Counter(
    name="stonks_orders_submitted_total",
    documentation="Orders submitted to broker",
    labelnames=["side", "order_type", "mode"],
)

# Orders rejected before reaching the broker, by rejection reason category.
ORDERS_REJECTED = Counter(
    name="stonks_orders_rejected_total",
    documentation="Orders rejected before broker submission",
    labelnames=["reason_category"],
)

# Orders filled by the broker, labelled by side.
ORDERS_FILLED = Counter(
    name="stonks_orders_filled_total",
    documentation="Orders filled by broker",
    labelnames=["side"],
)

# Duplicate orders stopped by idempotency checks, by detection path.
ORDERS_DUPLICATES_PREVENTED = Counter(
    name="stonks_orders_duplicates_prevented_total",
    documentation="Duplicate orders prevented by idempotency checks",
    labelnames=["detected_via"],
)

# Risk evaluations performed, labelled by evaluation result.
RISK_EVALUATIONS_TOTAL = Counter(
    name="stonks_risk_evaluations_total",
    documentation="Risk evaluations performed",
    labelnames=["result"],
)

# Failures of individual risk checks, labelled by check name.
RISK_CHECK_FAILURES = Counter(
    name="stonks_risk_check_failures_total",
    documentation="Individual risk check failures",
    labelnames=["check_name"],
)

# Orders automatically clamped to fit position limits (unlabelled).
ORDERS_CLAMPED = Counter(
    name="stonks_orders_clamped_total",
    documentation="Orders auto-clamped to fit within position limits",
)

# Completed position sync operations (unlabelled).
POSITIONS_SYNCED = Counter(
    name="stonks_positions_synced_total",
    documentation="Position sync operations completed",
)
# ---------------------------------------------------------------------------
# Active gauges
# ---------------------------------------------------------------------------
# Jobs currently being processed, labelled by pipeline stage.
ACTIVE_JOBS = Gauge(
    name="stonks_active_jobs",
    documentation="Currently processing jobs by stage",
    labelnames=["stage"],
)
# ---------------------------------------------------------------------------
# Alerting metrics
# ---------------------------------------------------------------------------
# Alerts fired, labelled by rule and severity.
ALERTS_FIRED = Counter(
    name="stonks_alerts_fired_total",
    documentation="Total alerts fired by rule",
    labelnames=["rule", "severity"],
)

# Alerts resolved, labelled by rule.
ALERTS_RESOLVED = Counter(
    name="stonks_alerts_resolved_total",
    documentation="Total alerts resolved by rule",
    labelnames=["rule"],
)

# Duration of one alert evaluation cycle; bucket bounds are in seconds.
ALERT_CHECK_DURATION = Histogram(
    name="stonks_alert_check_duration_seconds",
    documentation="Duration of alert evaluation cycle",
    buckets=(0.01, 0.05, 0.1, 0.25, 0.5, 1, 2, 5),
)

# Per-rule flag: 1 while the rule is firing, 0 once it is resolved.
ALERT_ACTIVE = Gauge(
    name="stonks_alert_active",
    documentation="Whether an alert rule is currently firing (1) or resolved (0)",
    labelnames=["rule"],
)
# ---------------------------------------------------------------------------
# Dead-letter queue metrics
# ---------------------------------------------------------------------------
# Jobs sent to a dead-letter queue, labelled by queue.
DLQ_ITEMS_TOTAL = Counter(
    name="stonks_dlq_items_total",
    documentation="Jobs sent to dead-letter queues",
    labelnames=["queue"],
)

# Jobs replayed out of a dead-letter queue, labelled by queue.
DLQ_REPLAYED_TOTAL = Counter(
    name="stonks_dlq_replayed_total",
    documentation="Jobs replayed from dead-letter queues",
    labelnames=["queue"],
)

# Current depth of each dead-letter queue.
DLQ_DEPTH = Gauge(
    name="stonks_dlq_depth",
    documentation="Current dead-letter queue depth",
    labelnames=["queue"],
)