From 376fcb4bb4fa3d53a2ba1bfe92f87ae38a55e6b6 Mon Sep 17 00:00:00 2001
From: Celes Renata <celes@frameshift.net>
Date: Fri, 1 May 2026 19:20:56 +0000
Subject: [PATCH] feat: add 12 integrity check saved queries for SQL Explorer
 (migration 037)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Prefixed with ⚕ for easy identification. Checks: duplicate snapshots,
orphaned links, evidence count mismatches, contribution score sums,
canonical key consistency, out-of-range values, unmatched snapshots,
zero evidence rate, duplicate count mismatches, missing prices,
outcome integrity, and pipeline health summary.
---
 .kiro/steering/project-context.md             |  2 +-
 .../037_integrity_saved_queries.sql           | 42 +++++++++++++++++++
 2 files changed, 43 insertions(+), 1 deletion(-)
 create mode 100644 infra/migrations/037_integrity_saved_queries.sql

diff --git a/.kiro/steering/project-context.md b/.kiro/steering/project-context.md
index 2b06e2f..12dbe48 100644
--- a/.kiro/steering/project-context.md
+++ b/.kiro/steering/project-context.md
@@ -81,7 +81,7 @@ When a full reset is needed:
 ## Database Migrations
 - Located in `infra/migrations/001_*.sql` through `030_*.sql`
 - Applied automatically by `runmefirst.sh` in sorted order
-- Next migration number: **037**
+- Next migration number: **038**
 - Key migrations:
   - 016: Global news interpolation (global_events, macro_impact_records, exposure_profiles, trend_projections)
   - 017: Competitive intelligence (competitor_relationships, competitive_signal_records)
diff --git a/infra/migrations/037_integrity_saved_queries.sql b/infra/migrations/037_integrity_saved_queries.sql
new file mode 100644
index 0000000..6fac540
--- /dev/null
+++ b/infra/migrations/037_integrity_saved_queries.sql
@@ -0,0 +1,42 @@
+-- Integrity check saved queries for the SQL Explorer (names are prefixed with ⚕).
+-- Each row stores one read-only SELECT that surfaces inconsistencies in the model validation pipeline.
+-- Row-listing checks are capped with LIMIT 20 and ordered explicitly so repeated runs return the same rows.
+INSERT INTO saved_queries (name, description, sql_text) VALUES
+
+('⚕ Duplicate Snapshots', 'Detect duplicate prediction snapshots (same ticker+timestamp)',
+ 'SELECT ticker, generated_at, count(*) AS duplicates FROM prediction_snapshots GROUP BY ticker, generated_at HAVING count(*) > 1 ORDER BY duplicates DESC'),
+
+('⚕ Orphaned Evidence Links', 'Evidence links referencing non-existent snapshots',
+ 'SELECT sel.id, sel.prediction_id, sel.ticker, sel.source_type FROM signal_evidence_links sel WHERE NOT EXISTS (SELECT 1 FROM prediction_snapshots ps WHERE ps.id = sel.prediction_id) ORDER BY sel.id LIMIT 20'),
+
+('⚕ Evidence Count Mismatches', 'Snapshots where stored evidence_count differs from actual link count',
+ 'SELECT ps.id, ps.ticker, ps.evidence_count AS stored, count(sel.id) AS actual, ps.evidence_count - count(sel.id) AS diff FROM prediction_snapshots ps LEFT JOIN signal_evidence_links sel ON sel.prediction_id = ps.id GROUP BY ps.id, ps.ticker, ps.evidence_count HAVING ps.evidence_count != count(sel.id) ORDER BY abs(ps.evidence_count - count(sel.id)) DESC, ps.id LIMIT 20'),
+
+('⚕ Contribution Score Integrity', 'Snapshots where contribution scores do not sum to 1.0 (±0.01)',
+ 'SELECT prediction_id, round(sum(contribution_score)::numeric, 6) AS score_sum, count(*) AS links FROM signal_evidence_links WHERE contribution_score IS NOT NULL GROUP BY prediction_id HAVING abs(sum(contribution_score) - 1.0) > 0.01 ORDER BY abs(sum(contribution_score) - 1.0) DESC, prediction_id LIMIT 20'),
+
+('⚕ Canonical Key Consistency', 'Documents producing different canonical keys across predictions (should be 0)',
+ 'SELECT document_id, count(DISTINCT canonical_evidence_key) AS key_variants, array_agg(DISTINCT canonical_evidence_key) AS keys FROM signal_evidence_links WHERE document_id IS NOT NULL AND canonical_evidence_key IS NOT NULL GROUP BY document_id HAVING count(DISTINCT canonical_evidence_key) > 1 ORDER BY key_variants DESC, document_id LIMIT 20'),
+
+('⚕ Out-of-Range Values', 'Snapshots with confidence or strength outside [0, 1]',
+ 'SELECT id, ticker, confidence, strength, generated_at FROM prediction_snapshots WHERE confidence < 0 OR confidence > 1 OR strength < 0 OR strength > 1 ORDER BY generated_at DESC, id LIMIT 20'),
+
+('⚕ Unmatched Snapshots', 'Prediction snapshots with no matching recommendation',
+ 'SELECT ps.id, ps.ticker, ps.action, ps.confidence, ps.generated_at FROM prediction_snapshots ps WHERE NOT EXISTS (SELECT 1 FROM recommendations r WHERE r.ticker = ps.ticker AND r.generated_at = ps.generated_at) ORDER BY ps.generated_at DESC, ps.id LIMIT 20'),
+
+('⚕ Zero Evidence Rate', 'Percentage of snapshots with no evidence links by action type',
+ 'SELECT ps.action, count(*) AS total, count(*) FILTER (WHERE NOT EXISTS (SELECT 1 FROM signal_evidence_links sel WHERE sel.prediction_id = ps.id)) AS zero_evidence, round(count(*) FILTER (WHERE NOT EXISTS (SELECT 1 FROM signal_evidence_links sel WHERE sel.prediction_id = ps.id))::numeric / NULLIF(count(*), 0) * 100, 1) AS zero_pct FROM prediction_snapshots ps GROUP BY ps.action ORDER BY zero_pct DESC'),
+-- LEFT JOIN keeps snapshots that have no evidence links, so a non-zero stored duplicate count with zero links is reported.
+('⚕ Duplicate Evidence Mismatches', 'Snapshots where stored duplicate count differs from actual is_duplicate count',
+ 'SELECT ps.id, ps.ticker, ps.duplicate_evidence_count AS stored_dupes, count(sel.id) FILTER (WHERE sel.is_duplicate) AS actual_dupes FROM prediction_snapshots ps LEFT JOIN signal_evidence_links sel ON sel.prediction_id = ps.id GROUP BY ps.id, ps.ticker, ps.duplicate_evidence_count HAVING ps.duplicate_evidence_count != count(sel.id) FILTER (WHERE sel.is_duplicate) ORDER BY abs(ps.duplicate_evidence_count - count(sel.id) FILTER (WHERE sel.is_duplicate)) DESC, ps.id LIMIT 20'),
+
+('⚕ Missing Price Data', 'Snapshots missing ticker or SPY price at prediction time',
+ 'SELECT ticker, count(*) AS total, count(*) FILTER (WHERE price_at_prediction IS NULL) AS null_price, count(*) FILTER (WHERE spy_price_at_prediction IS NULL) AS null_spy FROM prediction_snapshots GROUP BY ticker HAVING count(*) FILTER (WHERE price_at_prediction IS NULL) > 0 OR count(*) FILTER (WHERE spy_price_at_prediction IS NULL) > 0 ORDER BY null_price DESC'),
+
+('⚕ Outcome Integrity', 'Prediction outcomes with impossible values (return outside [-1, 10] or NULL direction_correct)',
+ 'SELECT po.id, ps.ticker, po.horizon, po.future_return, po.direction_correct, po.profitable FROM prediction_outcomes po JOIN prediction_snapshots ps ON ps.id = po.prediction_id WHERE po.future_return < -1 OR po.future_return > 10 OR po.direction_correct IS NULL ORDER BY po.id LIMIT 20'),
+
+('⚕ Pipeline Health Summary', 'Overall validation pipeline health dashboard',
+ 'SELECT ''Snapshots'' AS metric, count(*)::text AS value FROM prediction_snapshots UNION ALL SELECT ''Evidence Links'', count(*)::text FROM signal_evidence_links UNION ALL SELECT ''Outcomes'', count(*)::text FROM prediction_outcomes UNION ALL SELECT ''Metric Snapshots'', count(*)::text FROM model_metric_snapshots UNION ALL SELECT ''Duplicate Evidence %'', round(avg(CASE WHEN is_duplicate THEN 100.0 ELSE 0.0 END)::numeric, 1)::text FROM signal_evidence_links UNION ALL SELECT ''Zero-Evidence Snapshots'', count(*)::text FROM prediction_snapshots ps WHERE NOT EXISTS (SELECT 1 FROM signal_evidence_links sel WHERE sel.prediction_id = ps.id) UNION ALL SELECT ''Avg Confidence'', round(avg(confidence)::numeric, 3)::text FROM prediction_snapshots UNION ALL SELECT ''Distinct Tickers'', count(DISTINCT ticker)::text FROM prediction_snapshots')
+-- Upsert by name so re-running the migration refreshes the text; assumes a unique constraint on saved_queries.name (TODO confirm).
+ON CONFLICT (name) DO UPDATE SET sql_text = EXCLUDED.sql_text, description = EXCLUDED.description;