phase 14-15: docker build validation and helm deployment

2026-04-11 11:59:45 -07:00
parent 7394d241c9
commit ce10afa034
179 changed files with 32559 additions and 576 deletions
@@ -0,0 +1,22 @@
+# Operational Dashboard
+
+Superset dashboard definitions for Stonks Oracle operational monitoring.
+
+## Dashboards
+- Ingestion Throughput — ingestion runs per hour by source type, items fetched, success/failure rates
+- Model Extraction Quality — extraction success rates, latency percentiles, validation failures
+- Source Coverage & Gaps — active symbols missing expected source types, stale sources with no successful run in the last 24 hours
+
+## Data Sources
+Supplementary real-time operational data comes from the following Query API endpoints:
+- `/api/ops/ingestion/throughput` — time-bucketed ingestion metrics
+- `/api/ops/ingestion/summary` — aggregate ingestion stats
+- `/api/ops/model/failures` — recent extraction failures
+- `/api/ops/model/performance` — model performance summary
+- `/api/ops/pipeline/health` — pipeline stage health
+- `/api/ops/sources/coverage-gaps` — source coverage analysis
+
+## Setup
+Import the dashboard JSON files into Superset via the Superset UI or CLI.
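+The dashboards use the Trino `lakehouse` catalog as their primary datasource, with supplementary queries against the Query API for real-time operational data.
+Note: the embedded chart queries use PostgreSQL syntax (`::numeric` casts, `INTERVAL '24 hours'`, the `<@` array operator), so adapt them before running them on Trino.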
@@ -0,0 +1,75 @@
+{
+  "dashboard_title": "Ingestion Throughput",
+  "description": "Operational dashboard for monitoring ingestion pipeline throughput, success rates, and item counts across source types.",
+  "slug": "ingestion-throughput",
+  "position_json": {
+    "HEADER_ID": {"id": "HEADER_ID", "type": "HEADER", "meta": {"text": "Ingestion Throughput"}},
+    "ROW-1": {
+      "type": "ROW",
+      "children": ["CHART-throughput-timeseries", "CHART-source-type-breakdown"]
+    },
+    "ROW-2": {
+      "type": "ROW",
+      "children": ["CHART-success-failure-rate", "CHART-items-fetched"]
+    },
+    "ROW-3": {
+      "type": "ROW",
+      "children": ["CHART-stale-sources", "CHART-active-companies"]
+    }
+  },
+  "metadata": {
+    "refresh_frequency": 300,
+    "default_filters": "{}",
+    "color_scheme": "supersetColors"
+  },
+  "charts": [
+    {
+      "slice_name": "Ingestion Runs Over Time",
+      "viz_type": "echarts_timeseries_bar",
+      "description": "Ingestion run counts bucketed by hour, stacked by source type",
+      "datasource_type": "query",
+      "query": "SELECT date_trunc('hour', ir.started_at) AS bucket, ir.source_type, COUNT(*) AS run_count, COUNT(*) FILTER (WHERE ir.status = 'completed') AS completed, COUNT(*) FILTER (WHERE ir.status = 'failed') AS failed FROM ingestion_runs ir WHERE ir.started_at >= NOW() - INTERVAL '24 hours' GROUP BY 1, 2 ORDER BY 1",
+      "params": {
+        "x_axis": "bucket",
+        "metrics": ["run_count"],
+        "groupby": ["source_type"],
+        "time_grain_sqla": "PT1H"
+      }
+    },
+    {
+      "slice_name": "Source Type Breakdown",
+      "viz_type": "pie",
+      "description": "Distribution of ingestion runs by source type in the last 24h",
+      "datasource_type": "query",
+      "query": "SELECT ir.source_type, COUNT(*) AS runs FROM ingestion_runs ir WHERE ir.started_at >= NOW() - INTERVAL '24 hours' GROUP BY ir.source_type ORDER BY runs DESC"
+    },
+    {
+      "slice_name": "Success vs Failure Rate",
+      "viz_type": "echarts_timeseries_line",
+      "description": "Hourly completed and failed run counts, plus success rate, over the last 24h",
+      "datasource_type": "query",
+      "query": "SELECT date_trunc('hour', ir.started_at) AS bucket, COUNT(*) FILTER (WHERE ir.status = 'completed') AS completed, COUNT(*) FILTER (WHERE ir.status = 'failed') AS failed, ROUND(COUNT(*) FILTER (WHERE ir.status = 'completed')::numeric / NULLIF(COUNT(*), 0), 3) AS success_rate FROM ingestion_runs ir WHERE ir.started_at >= NOW() - INTERVAL '24 hours' GROUP BY 1 ORDER BY 1"
+    },
+    {
+      "slice_name": "Items Fetched Over Time",
+      "viz_type": "echarts_timeseries_bar",
+      "description": "Total items fetched and new items per hour",
+      "datasource_type": "query",
+      "query": "SELECT date_trunc('hour', ir.started_at) AS bucket, COALESCE(SUM(ir.items_fetched), 0) AS items_fetched, COALESCE(SUM(ir.items_new), 0) AS items_new FROM ingestion_runs ir WHERE ir.started_at >= NOW() - INTERVAL '24 hours' GROUP BY 1 ORDER BY 1"
+    },
+    {
+      "slice_name": "Stale Sources",
+      "viz_type": "table",
+      "description": "Active sources (of active companies) with no successful run in the last 24 hours",
+      "datasource_type": "query",
+      "query": "SELECT c.ticker, s.source_type, s.source_name, MAX(ir.started_at) FILTER (WHERE ir.status = 'completed') AS last_success, COUNT(*) FILTER (WHERE ir.status = 'failed' AND ir.started_at >= NOW() - INTERVAL '24 hours') AS recent_failures FROM sources s JOIN companies c ON c.id = s.company_id LEFT JOIN ingestion_runs ir ON ir.source_id = s.id WHERE s.active = TRUE AND c.active = TRUE GROUP BY c.ticker, s.source_type, s.source_name HAVING MAX(ir.started_at) FILTER (WHERE ir.status = 'completed') < NOW() - INTERVAL '24 hours' OR MAX(ir.started_at) FILTER (WHERE ir.status = 'completed') IS NULL ORDER BY c.ticker"
+    },
+    {
+      "slice_name": "Active Companies Ingested",
+      "viz_type": "big_number_total",
+      "description": "Count of distinct companies with ingestion activity in the last 24h",
+      "datasource_type": "query",
+      "query": "SELECT COUNT(DISTINCT company_id) AS active_companies FROM ingestion_runs WHERE started_at >= NOW() - INTERVAL '24 hours'"
+    }
+  ]
+}
@@ -0,0 +1,94 @@
+{
+  "dashboard_title": "Model Extraction Quality",
+  "description": "Operational dashboard for monitoring Ollama extraction success rates, latency, validation failures, and confidence distributions.",
+  "slug": "model-extraction-quality",
+  "position_json": {
+    "HEADER_ID": {"id": "HEADER_ID", "type": "HEADER", "meta": {"text": "Model Extraction Quality"}},
+    "ROW-1": {
+      "type": "ROW",
+      "children": ["CHART-success-rate-kpi", "CHART-avg-latency-kpi", "CHART-avg-confidence-kpi", "CHART-retry-rate-kpi"]
+    },
+    "ROW-2": {
+      "type": "ROW",
+      "children": ["CHART-extraction-timeseries", "CHART-validation-status-pie"]
+    },
+    "ROW-3": {
+      "type": "ROW",
+      "children": ["CHART-latency-percentiles", "CHART-confidence-distribution"]
+    },
+    "ROW-4": {
+      "type": "ROW",
+      "children": ["CHART-recent-failures-table"]
+    }
+  },
+  "metadata": {
+    "refresh_frequency": 300,
+    "default_filters": "{}",
+    "color_scheme": "supersetColors"
+  },
+  "charts": [
+    {
+      "slice_name": "Extraction Success Rate",
+      "viz_type": "big_number_total",
+      "description": "Overall extraction success rate in the last 24h",
+      "datasource_type": "query",
+      "query": "SELECT ROUND(COUNT(*) FILTER (WHERE success)::numeric / NULLIF(COUNT(*), 0), 4) AS success_rate FROM model_performance_metrics WHERE recorded_at >= NOW() - INTERVAL '24 hours'"
+    },
+    {
+      "slice_name": "Avg Extraction Latency",
+      "viz_type": "big_number_total",
+      "description": "Average extraction duration in milliseconds",
+      "datasource_type": "query",
+      "query": "SELECT ROUND(AVG(total_duration_ms)::numeric, 0) AS avg_latency_ms FROM model_performance_metrics WHERE recorded_at >= NOW() - INTERVAL '24 hours'"
+    },
+    {
+      "slice_name": "Avg Confidence Score",
+      "viz_type": "big_number_total",
+      "description": "Average confidence of successful extractions",
+      "datasource_type": "query",
+      "query": "SELECT ROUND(AVG(confidence)::numeric, 3) AS avg_confidence FROM model_performance_metrics WHERE recorded_at >= NOW() - INTERVAL '24 hours' AND success = TRUE"
+    },
+    {
+      "slice_name": "Avg Retry Count",
+      "viz_type": "big_number_total",
+      "description": "Average retries per extraction attempt",
+      "datasource_type": "query",
+      "query": "SELECT ROUND(AVG(retry_count)::numeric, 2) AS avg_retries FROM model_performance_metrics WHERE recorded_at >= NOW() - INTERVAL '24 hours'"
+    },
+    {
+      "slice_name": "Extractions Over Time",
+      "viz_type": "echarts_timeseries_bar",
+      "description": "Hourly extraction counts split by success/failure",
+      "datasource_type": "query",
+      "query": "SELECT date_trunc('hour', recorded_at) AS bucket, COUNT(*) FILTER (WHERE success) AS successful, COUNT(*) FILTER (WHERE NOT success) AS failed FROM model_performance_metrics WHERE recorded_at >= NOW() - INTERVAL '24 hours' GROUP BY 1 ORDER BY 1"
+    },
+    {
+      "slice_name": "Validation Status Distribution",
+      "viz_type": "pie",
+      "description": "Breakdown of extraction validation outcomes",
+      "datasource_type": "query",
+      "query": "SELECT validation_status, COUNT(*) AS count FROM model_performance_metrics WHERE recorded_at >= NOW() - INTERVAL '24 hours' GROUP BY validation_status"
+    },
+    {
+      "slice_name": "Latency Percentiles Over Time",
+      "viz_type": "echarts_timeseries_line",
+      "description": "P50, P95, P99 extraction latency per hour",
+      "datasource_type": "query",
+      "query": "SELECT date_trunc('hour', recorded_at) AS bucket, ROUND(PERCENTILE_CONT(0.5) WITHIN GROUP (ORDER BY total_duration_ms)::numeric, 0) AS p50_ms, ROUND(PERCENTILE_CONT(0.95) WITHIN GROUP (ORDER BY total_duration_ms)::numeric, 0) AS p95_ms, ROUND(PERCENTILE_CONT(0.99) WITHIN GROUP (ORDER BY total_duration_ms)::numeric, 0) AS p99_ms FROM model_performance_metrics WHERE recorded_at >= NOW() - INTERVAL '24 hours' GROUP BY 1 ORDER BY 1"
+    },
+    {
+      "slice_name": "Confidence Distribution",
+      "viz_type": "histogram",
+      "description": "Distribution of extraction confidence scores",
+      "datasource_type": "query",
+      "query": "SELECT CASE WHEN confidence >= 0.9 THEN '0.9-1.0' WHEN confidence >= 0.8 THEN '0.8-0.9' WHEN confidence >= 0.7 THEN '0.7-0.8' WHEN confidence >= 0.6 THEN '0.6-0.7' WHEN confidence >= 0.5 THEN '0.5-0.6' ELSE '<0.5' END AS confidence_bucket, COUNT(*) AS count FROM model_performance_metrics WHERE recorded_at >= NOW() - INTERVAL '24 hours' AND success = TRUE GROUP BY 1 ORDER BY 1"
+    },
+    {
+      "slice_name": "Recent Extraction Failures",
+      "viz_type": "table",
+      "description": "Up to 50 most recent failed extractions in the last 24h, with validation status and error counts",
+      "datasource_type": "query",
+      "query": "SELECT mpm.ticker, mpm.model_name, mpm.validation_status, mpm.validation_error_count, mpm.attempt_count, mpm.total_duration_ms, mpm.recorded_at, d.title, d.document_type FROM model_performance_metrics mpm LEFT JOIN documents d ON d.id = mpm.document_id WHERE mpm.success = FALSE AND mpm.recorded_at >= NOW() - INTERVAL '24 hours' ORDER BY mpm.recorded_at DESC LIMIT 50"
+    }
+  ]
+}
@@ -0,0 +1,51 @@
+{
+  "dashboard_title": "Source Coverage & Gaps",
+  "description": "Operational dashboard for identifying source coverage gaps, stale sources, and symbols missing expected data feeds.",
+  "slug": "source-coverage-gaps",
+  "position_json": {
+    "HEADER_ID": {"id": "HEADER_ID", "type": "HEADER", "meta": {"text": "Source Coverage & Gaps"}},
+    "ROW-1": {
+      "type": "ROW",
+      "children": ["CHART-coverage-matrix", "CHART-missing-types-table"]
+    },
+    "ROW-2": {
+      "type": "ROW",
+      "children": ["CHART-stale-sources-table", "CHART-failure-heatmap"]
+    }
+  },
+  "metadata": {
+    "refresh_frequency": 600,
+    "default_filters": "{}",
+    "color_scheme": "supersetColors"
+  },
+  "charts": [
+    {
+      "slice_name": "Source Coverage Matrix",
+      "viz_type": "table",
+      "description": "Per-symbol source type coverage showing active source counts",
+      "datasource_type": "query",
+      "query": "SELECT c.ticker, c.legal_name, c.sector, COUNT(s.id) FILTER (WHERE s.active) AS active_sources, COUNT(s.id) FILTER (WHERE s.source_type = 'market_api' AND s.active) AS market_sources, COUNT(s.id) FILTER (WHERE s.source_type = 'news_api' AND s.active) AS news_sources, COUNT(s.id) FILTER (WHERE s.source_type = 'filings_api' AND s.active) AS filings_sources, COUNT(s.id) FILTER (WHERE s.source_type = 'web_scrape' AND s.active) AS web_scrape_sources, COUNT(s.id) FILTER (WHERE s.source_type = 'broker' AND s.active) AS broker_sources FROM companies c LEFT JOIN sources s ON s.company_id = c.id WHERE c.active = TRUE GROUP BY c.ticker, c.legal_name, c.sector ORDER BY c.ticker"
+    },
+    {
+      "slice_name": "Symbols Missing Source Types",
+      "viz_type": "table",
+      "description": "Companies that lack one or more expected source types (market_api, news_api, filings_api)",
+      "datasource_type": "query",
+      "query": "SELECT c.ticker, c.legal_name, c.sector, ARRAY_AGG(DISTINCT s.source_type) FILTER (WHERE s.active) AS active_types FROM companies c LEFT JOIN sources s ON s.company_id = c.id AND s.active = TRUE WHERE c.active = TRUE GROUP BY c.ticker, c.legal_name, c.sector HAVING NOT ARRAY['market_api', 'news_api', 'filings_api'] <@ ARRAY_AGG(DISTINCT s.source_type) FILTER (WHERE s.active) OR ARRAY_AGG(DISTINCT s.source_type) FILTER (WHERE s.active) IS NULL ORDER BY c.ticker"
+    },
+    {
+      "slice_name": "Stale Sources (No Success in 24h)",
+      "viz_type": "table",
+      "description": "Active sources that have not completed a successful ingestion run in the last 24 hours",
+      "datasource_type": "query",
+      "query": "SELECT c.ticker, s.source_type, s.source_name, MAX(ir.started_at) FILTER (WHERE ir.status = 'completed') AS last_success, MAX(ir.started_at) AS last_attempt, COUNT(*) FILTER (WHERE ir.status = 'failed' AND ir.started_at >= NOW() - INTERVAL '24 hours') AS recent_failures FROM sources s JOIN companies c ON c.id = s.company_id LEFT JOIN ingestion_runs ir ON ir.source_id = s.id WHERE s.active = TRUE AND c.active = TRUE GROUP BY c.ticker, s.source_type, s.source_name HAVING MAX(ir.started_at) FILTER (WHERE ir.status = 'completed') < NOW() - INTERVAL '24 hours' OR MAX(ir.started_at) FILTER (WHERE ir.status = 'completed') IS NULL ORDER BY c.ticker, s.source_type"
+    },
+    {
+      "slice_name": "Source Failure Heatmap",
+      "viz_type": "heatmap",
+      "description": "Failure counts by source type and ticker in the last 24h",
+      "datasource_type": "query",
+      "query": "SELECT c.ticker, ir.source_type, COUNT(*) FILTER (WHERE ir.status = 'failed') AS failures FROM ingestion_runs ir JOIN companies c ON c.id = ir.company_id WHERE ir.started_at >= NOW() - INTERVAL '24 hours' GROUP BY c.ticker, ir.source_type HAVING COUNT(*) FILTER (WHERE ir.status = 'failed') > 0 ORDER BY failures DESC"
+    }
+  ]
+}