feat: add 12 integrity check saved queries for SQL Explorer (migration 037)

Each saved query name is prefixed with ⚕ for easy identification. Checks: duplicate snapshots, orphaned evidence links, evidence count mismatches, contribution score sums, canonical key consistency, out-of-range values, unmatched snapshots, zero evidence rate, duplicate evidence count mismatches, missing prices, outcome integrity, and a pipeline health summary.
2026-05-01 19:20:56 +00:00
parent cc21fd9e8f
commit 376fcb4bb4
2 changed files with 43 additions and 1 deletions
@@ -81,7 +81,7 @@ When a full reset is needed:
 ## Database Migrations
 - Located in `infra/migrations/001_*.sql` through `037_*.sql`
 - Applied automatically by `runmefirst.sh` in sorted order
- Next migration number: **037**
+- Next migration number: **038**
 - Key migrations:
  - 016: Global news interpolation (global_events, macro_impact_records, exposure_profiles, trend_projections)
  - 017: Competitive intelligence (competitor_relationships, competitive_signal_records)
@@ -0,0 +1,42 @@
 -- Integrity check saved queries for the SQL Explorer
 -- These validate data consistency across the model validation pipeline.
 --
 -- Each row is (name, description, sql_text). Every name carries the ⚕ prefix so
 -- the integrity checks are easy to pick out among other saved queries.
 -- sql_text is stored as a string literal, so any single quote inside a query
 -- is doubled ('').
 --
 -- The statement is an upsert keyed on name: re-running the migration refreshes
 -- description and sql_text instead of inserting duplicates.
 -- NOTE(review): ON CONFLICT (name) needs a unique constraint or index on
 -- saved_queries.name; that definition is not in this file — confirm it exists.
 INSERT INTO saved_queries (name, description, sql_text) VALUES
 ('⚕ Duplicate Snapshots', 'Detect duplicate prediction snapshots (same ticker+timestamp)',
 'SELECT ticker, generated_at, count(*) AS duplicates FROM prediction_snapshots GROUP BY ticker, generated_at HAVING count(*) > 1 ORDER BY duplicates DESC'),
 ('⚕ Orphaned Evidence Links', 'Evidence links referencing non-existent snapshots',
 'SELECT sel.id, sel.prediction_id, sel.ticker, sel.source_type FROM signal_evidence_links sel WHERE NOT EXISTS (SELECT 1 FROM prediction_snapshots ps WHERE ps.id = sel.prediction_id) LIMIT 20'),
 -- LEFT JOIN keeps snapshots that have no links at all, so a stored count > 0
 -- with zero actual links is reported as a mismatch.
 ('⚕ Evidence Count Mismatches', 'Snapshots where stored evidence_count differs from actual link count',
 'SELECT ps.id, ps.ticker, ps.evidence_count AS stored, count(sel.id) AS actual, ps.evidence_count - count(sel.id) AS diff FROM prediction_snapshots ps LEFT JOIN signal_evidence_links sel ON sel.prediction_id = ps.id GROUP BY ps.id, ps.ticker, ps.evidence_count HAVING ps.evidence_count != count(sel.id) ORDER BY abs(ps.evidence_count - count(sel.id)) DESC LIMIT 20'),
 -- Links with a NULL contribution_score are excluded before summing.
 ('⚕ Contribution Score Integrity', 'Snapshots where contribution scores do not sum to 1.0 (±0.01)',
 'SELECT prediction_id, round(sum(contribution_score)::numeric, 6) AS score_sum, count(*) AS links FROM signal_evidence_links WHERE contribution_score IS NOT NULL GROUP BY prediction_id HAVING abs(sum(contribution_score) - 1.0) > 0.01 LIMIT 20'),
 ('⚕ Canonical Key Consistency', 'Documents producing different canonical keys across predictions (should be 0)',
 'SELECT document_id, count(DISTINCT canonical_evidence_key) AS key_variants, array_agg(DISTINCT canonical_evidence_key) AS keys FROM signal_evidence_links WHERE document_id IS NOT NULL AND canonical_evidence_key IS NOT NULL GROUP BY document_id HAVING count(DISTINCT canonical_evidence_key) > 1 LIMIT 20'),
 ('⚕ Out-of-Range Values', 'Snapshots with confidence or strength outside [0, 1]',
 'SELECT id, ticker, confidence, strength, generated_at FROM prediction_snapshots WHERE confidence < 0 OR confidence > 1 OR strength < 0 OR strength > 1 LIMIT 20'),
 -- A snapshot counts as matched when a recommendation has the same ticker and
 -- the same generated_at value.
 ('⚕ Unmatched Snapshots', 'Prediction snapshots with no matching recommendation',
 'SELECT ps.id, ps.ticker, ps.action, ps.confidence, ps.generated_at FROM prediction_snapshots ps WHERE NOT EXISTS (SELECT 1 FROM recommendations r WHERE r.ticker = ps.ticker AND r.generated_at = ps.generated_at) LIMIT 20'),
 -- NULLIF guards the percentage against division by zero.
 ('⚕ Zero Evidence Rate', 'Percentage of snapshots with no evidence links by action type',
 'SELECT ps.action, count(*) AS total, count(*) FILTER (WHERE NOT EXISTS (SELECT 1 FROM signal_evidence_links sel WHERE sel.prediction_id = ps.id)) AS zero_evidence, round(count(*) FILTER (WHERE NOT EXISTS (SELECT 1 FROM signal_evidence_links sel WHERE sel.prediction_id = ps.id))::numeric / NULLIF(count(*), 0) * 100, 1) AS zero_pct FROM prediction_snapshots ps GROUP BY ps.action ORDER BY zero_pct DESC'),
 -- LEFT JOIN (not an inner join) so that snapshots with a non-zero stored
 -- duplicate count but no evidence links at all are still reported; an inner
 -- join would drop them before HAVING runs. Ordered by the size of the
 -- discrepancy, like the evidence count check, so LIMIT 20 keeps the worst rows.
 ('⚕ Duplicate Evidence Mismatches', 'Snapshots where stored duplicate count differs from actual is_duplicate count',
 'SELECT ps.id, ps.ticker, ps.duplicate_evidence_count AS stored_dupes, count(sel.id) FILTER (WHERE sel.is_duplicate) AS actual_dupes FROM prediction_snapshots ps LEFT JOIN signal_evidence_links sel ON sel.prediction_id = ps.id GROUP BY ps.id, ps.ticker, ps.duplicate_evidence_count HAVING ps.duplicate_evidence_count != count(sel.id) FILTER (WHERE sel.is_duplicate) ORDER BY abs(ps.duplicate_evidence_count - count(sel.id) FILTER (WHERE sel.is_duplicate)) DESC LIMIT 20'),
 ('⚕ Missing Price Data', 'Snapshots missing ticker or SPY price at prediction time',
 'SELECT ticker, count(*) AS total, count(*) FILTER (WHERE price_at_prediction IS NULL) AS null_price, count(*) FILTER (WHERE spy_price_at_prediction IS NULL) AS null_spy FROM prediction_snapshots GROUP BY ticker HAVING count(*) FILTER (WHERE price_at_prediction IS NULL) > 0 OR count(*) FILTER (WHERE spy_price_at_prediction IS NULL) > 0 ORDER BY null_price DESC'),
 -- Inner join to prediction_snapshots: only outcomes whose snapshot row exists
 -- are listed here.
 ('⚕ Outcome Integrity', 'Prediction outcomes with impossible values (return outside [-1, 10] or NULL direction_correct)',
 'SELECT po.id, ps.ticker, po.horizon, po.future_return, po.direction_correct, po.profitable FROM prediction_outcomes po JOIN prediction_snapshots ps ON ps.id = po.prediction_id WHERE po.future_return < -1 OR po.future_return > 10 OR po.direction_correct IS NULL LIMIT 20'),
 -- One (metric, value) row per figure; every value is cast to text so the
 -- UNION ALL branches share a column type.
 ('⚕ Pipeline Health Summary', 'Overall validation pipeline health dashboard',
 'SELECT ''Snapshots'' AS metric, count(*)::text AS value FROM prediction_snapshots UNION ALL SELECT ''Evidence Links'', count(*)::text FROM signal_evidence_links UNION ALL SELECT ''Outcomes'', count(*)::text FROM prediction_outcomes UNION ALL SELECT ''Metric Snapshots'', count(*)::text FROM model_metric_snapshots UNION ALL SELECT ''Duplicate Evidence %'', round(avg(CASE WHEN is_duplicate THEN 100.0 ELSE 0.0 END)::numeric, 1)::text FROM signal_evidence_links UNION ALL SELECT ''Zero-Evidence Snapshots'', count(*)::text FROM prediction_snapshots ps WHERE NOT EXISTS (SELECT 1 FROM signal_evidence_links sel WHERE sel.prediction_id = ps.id) UNION ALL SELECT ''Avg Confidence'', round(avg(confidence)::numeric, 3)::text FROM prediction_snapshots UNION ALL SELECT ''Distinct Tickers'', count(DISTINCT ticker)::text FROM prediction_snapshots')
 ON CONFLICT (name) DO UPDATE SET sql_text = EXCLUDED.sql_text, description = EXCLUDED.description;