phase 14-15: docker build validation and helm deployment

This commit is contained in:
Celes Renata
2026-04-11 11:59:45 -07:00
parent 7394d241c9
commit ce10afa034
179 changed files with 32559 additions and 576 deletions
+22
View File
@@ -0,0 +1,22 @@
# Operational Dashboard
Superset dashboard definitions for Stonks Oracle operational monitoring.
## Dashboards
- Ingestion Throughput — ingestion runs and items fetched per hour by source type, success/failure rates
- Model Extraction Quality — extraction success rates, latency percentiles, validation failures
- Source Coverage & Gaps — symbols missing source types, stale sources with no recent data
## Data Sources
These dashboards are supplemented by the following Query API operational endpoints for real-time data:
- `/api/ops/ingestion/throughput` — time-bucketed ingestion metrics
- `/api/ops/ingestion/summary` — aggregate ingestion stats
- `/api/ops/model/failures` — recent extraction failures
- `/api/ops/model/performance` — model performance summary
- `/api/ops/pipeline/health` — pipeline stage health
- `/api/ops/sources/coverage-gaps` — source coverage analysis
## Setup
Import the dashboard JSON files into Superset via the Superset UI or CLI.
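The dashboards use the Trino `lakehouse` catalog as their primary datasource,
with supplementary queries against the Query API for real-time operational data.
Note: the chart queries in the dashboard JSON files use PostgreSQL-style SQL (for example `::numeric` casts and the `<@` array-containment operator), so confirm that the database connection selected at import time accepts this syntax.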
@@ -0,0 +1,75 @@
{
"dashboard_title": "Ingestion Throughput",
"description": "Operational dashboard for monitoring ingestion pipeline throughput, success rates, and item counts across source types.",
"slug": "ingestion-throughput",
"position_json": {
"HEADER_ID": {"id": "HEADER_ID", "type": "HEADER", "meta": {"text": "Ingestion Throughput"}},
"ROW-1": {
"type": "ROW",
"children": ["CHART-throughput-timeseries", "CHART-source-type-breakdown"]
},
"ROW-2": {
"type": "ROW",
"children": ["CHART-success-failure-rate", "CHART-items-fetched"]
},
"ROW-3": {
"type": "ROW",
"children": ["CHART-stale-sources", "CHART-active-companies"]
}
},
"metadata": {
"refresh_frequency": 300,
"default_filters": "{}",
"color_scheme": "supersetColors"
},
"charts": [
{
"slice_name": "Ingestion Runs Over Time",
"viz_type": "echarts_timeseries_bar",
"description": "Ingestion run counts bucketed by hour, stacked by source type",
"datasource_type": "query",
"query": "SELECT date_trunc('hour', ir.started_at) AS bucket, ir.source_type, COUNT(*) AS run_count, COUNT(*) FILTER (WHERE ir.status = 'completed') AS completed, COUNT(*) FILTER (WHERE ir.status = 'failed') AS failed FROM ingestion_runs ir WHERE ir.started_at >= NOW() - INTERVAL '24 hours' GROUP BY 1, 2 ORDER BY 1",
"params": {
"x_axis": "bucket",
"metrics": ["run_count"],
"groupby": ["source_type"],
"time_grain_sqla": "PT1H"
}
},
{
"slice_name": "Source Type Breakdown",
"viz_type": "pie",
"description": "Distribution of ingestion runs by source type in the last 24h",
"datasource_type": "query",
"query": "SELECT ir.source_type, COUNT(*) AS runs FROM ingestion_runs ir WHERE ir.started_at >= NOW() - INTERVAL '24 hours' GROUP BY ir.source_type ORDER BY runs DESC"
},
{
"slice_name": "Success vs Failure Rate",
"viz_type": "echarts_timeseries_line",
"description": "Hourly completed and failed ingestion run counts, plus the success rate, over the last 24 hours",
"datasource_type": "query",
"query": "SELECT date_trunc('hour', ir.started_at) AS bucket, COUNT(*) FILTER (WHERE ir.status = 'completed') AS completed, COUNT(*) FILTER (WHERE ir.status = 'failed') AS failed, ROUND(COUNT(*) FILTER (WHERE ir.status = 'completed')::numeric / NULLIF(COUNT(*), 0), 3) AS success_rate FROM ingestion_runs ir WHERE ir.started_at >= NOW() - INTERVAL '24 hours' GROUP BY 1 ORDER BY 1"
},
{
"slice_name": "Items Fetched Over Time",
"viz_type": "echarts_timeseries_bar",
"description": "Total items fetched and new items per hour",
"datasource_type": "query",
"query": "SELECT date_trunc('hour', ir.started_at) AS bucket, COALESCE(SUM(ir.items_fetched), 0) AS items_fetched, COALESCE(SUM(ir.items_new), 0) AS items_new FROM ingestion_runs ir WHERE ir.started_at >= NOW() - INTERVAL '24 hours' GROUP BY 1 ORDER BY 1"
},
{
"slice_name": "Stale Sources",
"viz_type": "table",
"description": "Active sources of active companies with no successful run in the last 24 hours",
"datasource_type": "query",
"query": "SELECT c.ticker, s.source_type, s.source_name, MAX(ir.started_at) FILTER (WHERE ir.status = 'completed') AS last_success, COUNT(*) FILTER (WHERE ir.status = 'failed' AND ir.started_at >= NOW() - INTERVAL '24 hours') AS recent_failures FROM sources s JOIN companies c ON c.id = s.company_id LEFT JOIN ingestion_runs ir ON ir.source_id = s.id WHERE s.active = TRUE AND c.active = TRUE GROUP BY c.ticker, s.source_type, s.source_name HAVING MAX(ir.started_at) FILTER (WHERE ir.status = 'completed') < NOW() - INTERVAL '24 hours' OR MAX(ir.started_at) FILTER (WHERE ir.status = 'completed') IS NULL ORDER BY c.ticker"
},
{
"slice_name": "Active Companies Ingested",
"viz_type": "big_number_total",
"description": "Count of distinct companies with ingestion activity in the last 24h",
"datasource_type": "query",
"query": "SELECT COUNT(DISTINCT company_id) AS active_companies FROM ingestion_runs WHERE started_at >= NOW() - INTERVAL '24 hours'"
}
]
}
+94
View File
@@ -0,0 +1,94 @@
{
"dashboard_title": "Model Extraction Quality",
"description": "Operational dashboard for monitoring Ollama extraction success rates, latency, validation failures, and confidence distributions.",
"slug": "model-extraction-quality",
"position_json": {
"HEADER_ID": {"id": "HEADER_ID", "type": "HEADER", "meta": {"text": "Model Extraction Quality"}},
"ROW-1": {
"type": "ROW",
"children": ["CHART-success-rate-kpi", "CHART-avg-latency-kpi", "CHART-avg-confidence-kpi", "CHART-retry-rate-kpi"]
},
"ROW-2": {
"type": "ROW",
"children": ["CHART-extraction-timeseries", "CHART-validation-status-pie"]
},
"ROW-3": {
"type": "ROW",
"children": ["CHART-latency-percentiles", "CHART-confidence-distribution"]
},
"ROW-4": {
"type": "ROW",
"children": ["CHART-recent-failures-table"]
}
},
"metadata": {
"refresh_frequency": 300,
"default_filters": "{}",
"color_scheme": "supersetColors"
},
"charts": [
{
"slice_name": "Extraction Success Rate",
"viz_type": "big_number_total",
"description": "Overall extraction success rate in the last 24h",
"datasource_type": "query",
"query": "SELECT ROUND(COUNT(*) FILTER (WHERE success)::numeric / NULLIF(COUNT(*), 0), 4) AS success_rate FROM model_performance_metrics WHERE recorded_at >= NOW() - INTERVAL '24 hours'"
},
{
"slice_name": "Avg Extraction Latency",
"viz_type": "big_number_total",
"description": "Average extraction duration in milliseconds",
"datasource_type": "query",
"query": "SELECT ROUND(AVG(total_duration_ms)::numeric, 0) AS avg_latency_ms FROM model_performance_metrics WHERE recorded_at >= NOW() - INTERVAL '24 hours'"
},
{
"slice_name": "Avg Confidence Score",
"viz_type": "big_number_total",
"description": "Average confidence of successful extractions",
"datasource_type": "query",
"query": "SELECT ROUND(AVG(confidence)::numeric, 3) AS avg_confidence FROM model_performance_metrics WHERE recorded_at >= NOW() - INTERVAL '24 hours' AND success = TRUE"
},
{
"slice_name": "Avg Retry Count",
"viz_type": "big_number_total",
"description": "Average retries per extraction attempt",
"datasource_type": "query",
"query": "SELECT ROUND(AVG(retry_count)::numeric, 2) AS avg_retries FROM model_performance_metrics WHERE recorded_at >= NOW() - INTERVAL '24 hours'"
},
{
"slice_name": "Extractions Over Time",
"viz_type": "echarts_timeseries_bar",
"description": "Hourly extraction counts split by success/failure",
"datasource_type": "query",
"query": "SELECT date_trunc('hour', recorded_at) AS bucket, COUNT(*) FILTER (WHERE success) AS successful, COUNT(*) FILTER (WHERE NOT success) AS failed FROM model_performance_metrics WHERE recorded_at >= NOW() - INTERVAL '24 hours' GROUP BY 1 ORDER BY 1"
},
{
"slice_name": "Validation Status Distribution",
"viz_type": "pie",
"description": "Breakdown of extraction validation outcomes",
"datasource_type": "query",
"query": "SELECT validation_status, COUNT(*) AS count FROM model_performance_metrics WHERE recorded_at >= NOW() - INTERVAL '24 hours' GROUP BY validation_status"
},
{
"slice_name": "Latency Percentiles Over Time",
"viz_type": "echarts_timeseries_line",
"description": "P50, P95, P99 extraction latency per hour",
"datasource_type": "query",
"query": "SELECT date_trunc('hour', recorded_at) AS bucket, ROUND(PERCENTILE_CONT(0.5) WITHIN GROUP (ORDER BY total_duration_ms)::numeric, 0) AS p50_ms, ROUND(PERCENTILE_CONT(0.95) WITHIN GROUP (ORDER BY total_duration_ms)::numeric, 0) AS p95_ms, ROUND(PERCENTILE_CONT(0.99) WITHIN GROUP (ORDER BY total_duration_ms)::numeric, 0) AS p99_ms FROM model_performance_metrics WHERE recorded_at >= NOW() - INTERVAL '24 hours' GROUP BY 1 ORDER BY 1"
},
{
"slice_name": "Confidence Distribution",
"viz_type": "histogram",
"description": "Distribution of confidence scores for successful extractions in the last 24h, grouped into 0.1-wide buckets with a single '<0.5' bucket",
"datasource_type": "query",
"query": "SELECT CASE WHEN confidence >= 0.9 THEN '0.9-1.0' WHEN confidence >= 0.8 THEN '0.8-0.9' WHEN confidence >= 0.7 THEN '0.7-0.8' WHEN confidence >= 0.6 THEN '0.6-0.7' WHEN confidence >= 0.5 THEN '0.5-0.6' ELSE '<0.5' END AS confidence_bucket, COUNT(*) AS count FROM model_performance_metrics WHERE recorded_at >= NOW() - INTERVAL '24 hours' AND success = TRUE GROUP BY 1 ORDER BY 1"
},
{
"slice_name": "Recent Extraction Failures",
"viz_type": "table",
"description": "Up to 50 most recent failed extractions in the last 24h, with validation status, validation error count, and attempt count",
"datasource_type": "query",
"query": "SELECT mpm.ticker, mpm.model_name, mpm.validation_status, mpm.validation_error_count, mpm.attempt_count, mpm.total_duration_ms, mpm.recorded_at, d.title, d.document_type FROM model_performance_metrics mpm LEFT JOIN documents d ON d.id = mpm.document_id WHERE mpm.success = FALSE AND mpm.recorded_at >= NOW() - INTERVAL '24 hours' ORDER BY mpm.recorded_at DESC LIMIT 50"
}
]
}
@@ -0,0 +1,51 @@
{
"dashboard_title": "Source Coverage & Gaps",
"description": "Operational dashboard for identifying source coverage gaps, stale sources, and symbols missing expected data feeds.",
"slug": "source-coverage-gaps",
"position_json": {
"HEADER_ID": {"id": "HEADER_ID", "type": "HEADER", "meta": {"text": "Source Coverage & Gaps"}},
"ROW-1": {
"type": "ROW",
"children": ["CHART-coverage-matrix", "CHART-missing-types-table"]
},
"ROW-2": {
"type": "ROW",
"children": ["CHART-stale-sources-table", "CHART-failure-heatmap"]
}
},
"metadata": {
"refresh_frequency": 600,
"default_filters": "{}",
"color_scheme": "supersetColors"
},
"charts": [
{
"slice_name": "Source Coverage Matrix",
"viz_type": "table",
"description": "Per-symbol source type coverage showing active source counts",
"datasource_type": "query",
"query": "SELECT c.ticker, c.legal_name, c.sector, COUNT(s.id) FILTER (WHERE s.active) AS active_sources, COUNT(s.id) FILTER (WHERE s.source_type = 'market_api' AND s.active) AS market_sources, COUNT(s.id) FILTER (WHERE s.source_type = 'news_api' AND s.active) AS news_sources, COUNT(s.id) FILTER (WHERE s.source_type = 'filings_api' AND s.active) AS filings_sources, COUNT(s.id) FILTER (WHERE s.source_type = 'web_scrape' AND s.active) AS web_scrape_sources, COUNT(s.id) FILTER (WHERE s.source_type = 'broker' AND s.active) AS broker_sources FROM companies c LEFT JOIN sources s ON s.company_id = c.id WHERE c.active = TRUE GROUP BY c.ticker, c.legal_name, c.sector ORDER BY c.ticker"
},
{
"slice_name": "Symbols Missing Source Types",
"viz_type": "table",
"description": "Companies that lack one or more expected source types (market_api, news_api, filings_api)",
"datasource_type": "query",
"query": "SELECT c.ticker, c.legal_name, c.sector, ARRAY_AGG(DISTINCT s.source_type) FILTER (WHERE s.active) AS active_types FROM companies c LEFT JOIN sources s ON s.company_id = c.id AND s.active = TRUE WHERE c.active = TRUE GROUP BY c.ticker, c.legal_name, c.sector HAVING NOT ARRAY['market_api', 'news_api', 'filings_api'] <@ ARRAY_AGG(DISTINCT s.source_type) FILTER (WHERE s.active) OR ARRAY_AGG(DISTINCT s.source_type) FILTER (WHERE s.active) IS NULL ORDER BY c.ticker"
},
{
"slice_name": "Stale Sources (No Success in 24h)",
"viz_type": "table",
"description": "Active sources that have not completed a successful ingestion run in the last 24 hours",
"datasource_type": "query",
"query": "SELECT c.ticker, s.source_type, s.source_name, MAX(ir.started_at) FILTER (WHERE ir.status = 'completed') AS last_success, MAX(ir.started_at) AS last_attempt, COUNT(*) FILTER (WHERE ir.status = 'failed' AND ir.started_at >= NOW() - INTERVAL '24 hours') AS recent_failures FROM sources s JOIN companies c ON c.id = s.company_id LEFT JOIN ingestion_runs ir ON ir.source_id = s.id WHERE s.active = TRUE AND c.active = TRUE GROUP BY c.ticker, s.source_type, s.source_name HAVING MAX(ir.started_at) FILTER (WHERE ir.status = 'completed') < NOW() - INTERVAL '24 hours' OR MAX(ir.started_at) FILTER (WHERE ir.status = 'completed') IS NULL ORDER BY c.ticker, s.source_type"
},
{
"slice_name": "Source Failure Heatmap",
"viz_type": "heatmap",
"description": "Failure counts by source type and ticker in the last 24h",
"datasource_type": "query",
"query": "SELECT c.ticker, ir.source_type, COUNT(*) FILTER (WHERE ir.status = 'failed') AS failures FROM ingestion_runs ir JOIN companies c ON c.id = ir.company_id WHERE ir.started_at >= NOW() - INTERVAL '24 hours' GROUP BY c.ticker, ir.source_type HAVING COUNT(*) FILTER (WHERE ir.status = 'failed') > 0 ORDER BY failures DESC"
}
]
}