feat: add 12 integrity check saved queries for SQL Explorer (migration 037)
ci/woodpecker/push/test Pipeline was successful
ci/woodpecker/push/build-2 Pipeline was successful
ci/woodpecker/push/build-1 Pipeline was successful
ci/woodpecker/push/build-3 Pipeline was successful
ci/woodpecker/push/finalize Pipeline was successful
Build and Push / lint-and-test (push) Has been cancelled
Build and Push / build-services (map[cmd:python -m services.adapters.broker_adapter name:broker-adapter]) (push) Has been cancelled
Build and Push / build-services (map[cmd:python -m services.aggregation.worker name:aggregation]) (push) Has been cancelled
Build and Push / build-services (map[cmd:python -m services.extractor.worker name:extractor]) (push) Has been cancelled
Build and Push / build-services (map[cmd:python -m services.ingestion.worker name:ingestion]) (push) Has been cancelled
Build and Push / build-services (map[cmd:python -m services.lake_publisher.worker name:lake-publisher]) (push) Has been cancelled
Build and Push / build-services (map[cmd:python -m services.parser.worker name:parser]) (push) Has been cancelled
Build and Push / build-services (map[cmd:python -m services.recommendation.worker name:recommendation]) (push) Has been cancelled
Build and Push / build-services (map[cmd:python -m services.scheduler.app name:scheduler]) (push) Has been cancelled
Build and Push / build-services (map[cmd:uvicorn services.api.app:app --host 0.0.0.0 --port 8000 name:query-api]) (push) Has been cancelled
Build and Push / build-services (map[cmd:uvicorn services.risk.app:app --host 0.0.0.0 --port 8000 name:risk]) (push) Has been cancelled
Build and Push / build-services (map[cmd:uvicorn services.symbol_registry.app:app --host 0.0.0.0 --port 8000 name:symbol-registry]) (push) Has been cancelled
Build and Push / build-services (map[cmd:uvicorn services.trading.app:app --host 0.0.0.0 --port 8000 name:trading-engine]) (push) Has been cancelled
Build and Push / build-dashboard (push) Has been cancelled
Build and Push / build-superset (push) Has been cancelled
Build and Push / integration-test (push) Has been cancelled
Build and Push / beta-gate (push) Has been cancelled

Prefixed with ⚕ for easy identification. Checks: duplicate snapshots,
orphaned links, evidence count mismatches, contribution score sums,
canonical key consistency, out-of-range values, unmatched snapshots,
zero evidence rate, duplicate count mismatches, missing prices,
outcome integrity, and pipeline health summary.
This commit is contained in:
Celes Renata
2026-05-01 19:20:56 +00:00
parent cc21fd9e8f
commit 376fcb4bb4
2 changed files with 43 additions and 1 deletions
+1 -1
View File
@@ -81,7 +81,7 @@ When a full reset is needed:
## Database Migrations ## Database Migrations
- Located in `infra/migrations/001_*.sql` through `030_*.sql` - Located in `infra/migrations/001_*.sql` through `030_*.sql`
- Applied automatically by `runmefirst.sh` in sorted order - Applied automatically by `runmefirst.sh` in sorted order
- Next migration number: **037** - Next migration number: **038**
- Key migrations: - Key migrations:
- 016: Global news interpolation (global_events, macro_impact_records, exposure_profiles, trend_projections) - 016: Global news interpolation (global_events, macro_impact_records, exposure_profiles, trend_projections)
- 017: Competitive intelligence (competitor_relationships, competitive_signal_records) - 017: Competitive intelligence (competitor_relationships, competitive_signal_records)
@@ -0,0 +1,42 @@
-- Integrity check saved queries for the SQL Explorer
-- These validate data consistency across the model validation pipeline.
INSERT INTO saved_queries (name, description, sql_text) VALUES
('⚕ Duplicate Snapshots', 'Detect duplicate prediction snapshots (same ticker+timestamp)',
'SELECT ticker, generated_at, count(*) AS duplicates FROM prediction_snapshots GROUP BY ticker, generated_at HAVING count(*) > 1 ORDER BY duplicates DESC'),
('⚕ Orphaned Evidence Links', 'Evidence links referencing non-existent snapshots',
'SELECT sel.id, sel.prediction_id, sel.ticker, sel.source_type FROM signal_evidence_links sel WHERE NOT EXISTS (SELECT 1 FROM prediction_snapshots ps WHERE ps.id = sel.prediction_id) LIMIT 20'),
('⚕ Evidence Count Mismatches', 'Snapshots where stored evidence_count differs from actual link count',
'SELECT ps.id, ps.ticker, ps.evidence_count AS stored, count(sel.id) AS actual, ps.evidence_count - count(sel.id) AS diff FROM prediction_snapshots ps LEFT JOIN signal_evidence_links sel ON sel.prediction_id = ps.id GROUP BY ps.id, ps.ticker, ps.evidence_count HAVING ps.evidence_count != count(sel.id) ORDER BY abs(ps.evidence_count - count(sel.id)) DESC LIMIT 20'),
('⚕ Contribution Score Integrity', 'Snapshots where contribution scores do not sum to 1.0 (±0.01)',
'SELECT prediction_id, round(sum(contribution_score)::numeric, 6) AS score_sum, count(*) AS links FROM signal_evidence_links WHERE contribution_score IS NOT NULL GROUP BY prediction_id HAVING abs(sum(contribution_score) - 1.0) > 0.01 LIMIT 20'),
('⚕ Canonical Key Consistency', 'Documents producing different canonical keys across predictions (should be 0)',
'SELECT document_id, count(DISTINCT canonical_evidence_key) AS key_variants, array_agg(DISTINCT canonical_evidence_key) AS keys FROM signal_evidence_links WHERE document_id IS NOT NULL AND canonical_evidence_key IS NOT NULL GROUP BY document_id HAVING count(DISTINCT canonical_evidence_key) > 1 LIMIT 20'),
('⚕ Out-of-Range Values', 'Snapshots with confidence or strength outside [0, 1]',
'SELECT id, ticker, confidence, strength, generated_at FROM prediction_snapshots WHERE confidence < 0 OR confidence > 1 OR strength < 0 OR strength > 1 LIMIT 20'),
('⚕ Unmatched Snapshots', 'Prediction snapshots with no matching recommendation',
'SELECT ps.id, ps.ticker, ps.action, ps.confidence, ps.generated_at FROM prediction_snapshots ps WHERE NOT EXISTS (SELECT 1 FROM recommendations r WHERE r.ticker = ps.ticker AND r.generated_at = ps.generated_at) LIMIT 20'),
('⚕ Zero Evidence Rate', 'Percentage of snapshots with no evidence links by action type',
'SELECT ps.action, count(*) AS total, count(*) FILTER (WHERE NOT EXISTS (SELECT 1 FROM signal_evidence_links sel WHERE sel.prediction_id = ps.id)) AS zero_evidence, round(count(*) FILTER (WHERE NOT EXISTS (SELECT 1 FROM signal_evidence_links sel WHERE sel.prediction_id = ps.id))::numeric / NULLIF(count(*), 0) * 100, 1) AS zero_pct FROM prediction_snapshots ps GROUP BY ps.action ORDER BY zero_pct DESC'),
('⚕ Duplicate Evidence Mismatches', 'Snapshots where stored duplicate count differs from actual is_duplicate count',
'SELECT ps.id, ps.ticker, ps.duplicate_evidence_count AS stored_dupes, count(sel.id) FILTER (WHERE sel.is_duplicate) AS actual_dupes FROM prediction_snapshots ps JOIN signal_evidence_links sel ON sel.prediction_id = ps.id GROUP BY ps.id, ps.ticker, ps.duplicate_evidence_count HAVING ps.duplicate_evidence_count != count(sel.id) FILTER (WHERE sel.is_duplicate) LIMIT 20'),
('⚕ Missing Price Data', 'Snapshots missing ticker or SPY price at prediction time',
'SELECT ticker, count(*) AS total, count(*) FILTER (WHERE price_at_prediction IS NULL) AS null_price, count(*) FILTER (WHERE spy_price_at_prediction IS NULL) AS null_spy FROM prediction_snapshots GROUP BY ticker HAVING count(*) FILTER (WHERE price_at_prediction IS NULL) > 0 OR count(*) FILTER (WHERE spy_price_at_prediction IS NULL) > 0 ORDER BY null_price DESC'),
('⚕ Outcome Integrity', 'Prediction outcomes with impossible values (return outside [-1, 10] or NULL direction_correct)',
'SELECT po.id, ps.ticker, po.horizon, po.future_return, po.direction_correct, po.profitable FROM prediction_outcomes po JOIN prediction_snapshots ps ON ps.id = po.prediction_id WHERE po.future_return < -1 OR po.future_return > 10 OR po.direction_correct IS NULL LIMIT 20'),
('⚕ Pipeline Health Summary', 'Overall validation pipeline health dashboard',
'SELECT ''Snapshots'' AS metric, count(*)::text AS value FROM prediction_snapshots UNION ALL SELECT ''Evidence Links'', count(*)::text FROM signal_evidence_links UNION ALL SELECT ''Outcomes'', count(*)::text FROM prediction_outcomes UNION ALL SELECT ''Metric Snapshots'', count(*)::text FROM model_metric_snapshots UNION ALL SELECT ''Duplicate Evidence %'', round(avg(CASE WHEN is_duplicate THEN 100.0 ELSE 0.0 END)::numeric, 1)::text FROM signal_evidence_links UNION ALL SELECT ''Zero-Evidence Snapshots'', count(*)::text FROM prediction_snapshots ps WHERE NOT EXISTS (SELECT 1 FROM signal_evidence_links sel WHERE sel.prediction_id = ps.id) UNION ALL SELECT ''Avg Confidence'', round(avg(confidence)::numeric, 3)::text FROM prediction_snapshots UNION ALL SELECT ''Distinct Tickers'', count(DISTINCT ticker)::text FROM prediction_snapshots')
ON CONFLICT (name) DO UPDATE SET sql_text = EXCLUDED.sql_text, description = EXCLUDED.description;