From 376fcb4bb4fa3d53a2ba1bfe92f87ae38a55e6b6 Mon Sep 17 00:00:00 2001 From: Celes Renata Date: Fri, 1 May 2026 19:20:56 +0000 Subject: [PATCH] feat: add 12 integrity check saved queries for SQL Explorer (migration 037) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Prefixed with ⚕ for easy identification. Checks: duplicate snapshots, orphaned links, evidence count mismatches, contribution score sums, canonical key consistency, out-of-range values, unmatched snapshots, zero evidence rate, duplicate count mismatches, missing prices, outcome integrity, and pipeline health summary. --- .kiro/steering/project-context.md | 2 +- .../037_integrity_saved_queries.sql | 42 +++++++++++++++++++ 2 files changed, 43 insertions(+), 1 deletion(-) create mode 100644 infra/migrations/037_integrity_saved_queries.sql diff --git a/.kiro/steering/project-context.md b/.kiro/steering/project-context.md index 2b06e2f..12dbe48 100644 --- a/.kiro/steering/project-context.md +++ b/.kiro/steering/project-context.md @@ -81,7 +81,7 @@ When a full reset is needed: ## Database Migrations - Located in `infra/migrations/001_*.sql` through `030_*.sql` - Applied automatically by `runmefirst.sh` in sorted order -- Next migration number: **037** +- Next migration number: **038** - Key migrations: - 016: Global news interpolation (global_events, macro_impact_records, exposure_profiles, trend_projections) - 017: Competitive intelligence (competitor_relationships, competitive_signal_records) diff --git a/infra/migrations/037_integrity_saved_queries.sql b/infra/migrations/037_integrity_saved_queries.sql new file mode 100644 index 0000000..6fac540 --- /dev/null +++ b/infra/migrations/037_integrity_saved_queries.sql @@ -0,0 +1,42 @@ +-- Integrity check saved queries for the SQL Explorer +-- These validate data consistency across the model validation pipeline. 
+
+INSERT INTO saved_queries (name, description, sql_text) VALUES
+-- Each row is (name, description, sql_text). sql_text is a single-quoted literal, so quotes inside the stored query are doubled ('').
+('⚕ Duplicate Snapshots', 'Detect duplicate prediction snapshots (same ticker+timestamp)',
+    'SELECT ticker, generated_at, count(*) AS duplicates FROM prediction_snapshots GROUP BY ticker, generated_at HAVING count(*) > 1 ORDER BY duplicates DESC'),
+
+('⚕ Orphaned Evidence Links', 'Evidence links referencing non-existent snapshots',
+    'SELECT sel.id, sel.prediction_id, sel.ticker, sel.source_type FROM signal_evidence_links sel WHERE NOT EXISTS (SELECT 1 FROM prediction_snapshots ps WHERE ps.id = sel.prediction_id) LIMIT 20'),
+
+('⚕ Evidence Count Mismatches', 'Snapshots where stored evidence_count differs from actual link count',
+    'SELECT ps.id, ps.ticker, ps.evidence_count AS stored, count(sel.id) AS actual, ps.evidence_count - count(sel.id) AS diff FROM prediction_snapshots ps LEFT JOIN signal_evidence_links sel ON sel.prediction_id = ps.id GROUP BY ps.id, ps.ticker, ps.evidence_count HAVING ps.evidence_count != count(sel.id) ORDER BY abs(ps.evidence_count - count(sel.id)) DESC LIMIT 20'),
+
+('⚕ Contribution Score Integrity', 'Snapshots where contribution scores do not sum to 1.0 (±0.01)',
+    'SELECT prediction_id, round(sum(contribution_score)::numeric, 6) AS score_sum, count(*) AS links FROM signal_evidence_links WHERE contribution_score IS NOT NULL GROUP BY prediction_id HAVING abs(sum(contribution_score) - 1.0) > 0.01 LIMIT 20'),
+
+('⚕ Canonical Key Consistency', 'Documents producing different canonical keys across predictions (should be 0)',
+    'SELECT document_id, count(DISTINCT canonical_evidence_key) AS key_variants, array_agg(DISTINCT canonical_evidence_key) AS keys FROM signal_evidence_links WHERE document_id IS NOT NULL AND canonical_evidence_key IS NOT NULL GROUP BY document_id HAVING count(DISTINCT canonical_evidence_key) > 1 LIMIT 20'),
+
+('⚕ Out-of-Range Values', 'Snapshots with confidence or strength outside [0, 1]',
+    'SELECT id, ticker, confidence, strength, generated_at FROM prediction_snapshots WHERE confidence < 0 OR confidence > 1 OR strength < 0 OR strength > 1 LIMIT 20'),
+
+('⚕ Unmatched Snapshots', 'Prediction snapshots with no matching recommendation',
+    'SELECT ps.id, ps.ticker, ps.action, ps.confidence, ps.generated_at FROM prediction_snapshots ps WHERE NOT EXISTS (SELECT 1 FROM recommendations r WHERE r.ticker = ps.ticker AND r.generated_at = ps.generated_at) LIMIT 20'),
+
+('⚕ Zero Evidence Rate', 'Percentage of snapshots with no evidence links by action type',
+    'SELECT ps.action, count(*) AS total, count(*) FILTER (WHERE NOT EXISTS (SELECT 1 FROM signal_evidence_links sel WHERE sel.prediction_id = ps.id)) AS zero_evidence, round(count(*) FILTER (WHERE NOT EXISTS (SELECT 1 FROM signal_evidence_links sel WHERE sel.prediction_id = ps.id))::numeric / NULLIF(count(*), 0) * 100, 1) AS zero_pct FROM prediction_snapshots ps GROUP BY ps.action ORDER BY zero_pct DESC'),
+-- LEFT JOIN (not inner JOIN) below: a snapshot with a non-zero stored duplicate count but no evidence links must still be reported.
+('⚕ Duplicate Evidence Mismatches', 'Snapshots where stored duplicate count differs from actual is_duplicate count',
+    'SELECT ps.id, ps.ticker, ps.duplicate_evidence_count AS stored_dupes, count(sel.id) FILTER (WHERE sel.is_duplicate) AS actual_dupes FROM prediction_snapshots ps LEFT JOIN signal_evidence_links sel ON sel.prediction_id = ps.id GROUP BY ps.id, ps.ticker, ps.duplicate_evidence_count HAVING ps.duplicate_evidence_count != count(sel.id) FILTER (WHERE sel.is_duplicate) LIMIT 20'),
+
+('⚕ Missing Price Data', 'Snapshots missing ticker or SPY price at prediction time',
+    'SELECT ticker, count(*) AS total, count(*) FILTER (WHERE price_at_prediction IS NULL) AS null_price, count(*) FILTER (WHERE spy_price_at_prediction IS NULL) AS null_spy FROM prediction_snapshots GROUP BY ticker HAVING count(*) FILTER (WHERE price_at_prediction IS NULL) > 0 OR count(*) FILTER (WHERE spy_price_at_prediction IS NULL) > 0 ORDER BY null_price DESC'),
+
+('⚕ Outcome Integrity', 'Prediction outcomes with impossible values (return outside [-1, 10] or NULL direction_correct)',
+    'SELECT po.id, ps.ticker, po.horizon, po.future_return, po.direction_correct, po.profitable FROM prediction_outcomes po JOIN prediction_snapshots ps ON ps.id = po.prediction_id WHERE po.future_return < -1 OR po.future_return > 10 OR po.direction_correct IS NULL LIMIT 20'),
+
+('⚕ Pipeline Health Summary', 'Overall validation pipeline health dashboard',
+    'SELECT ''Snapshots'' AS metric, count(*)::text AS value FROM prediction_snapshots UNION ALL SELECT ''Evidence Links'', count(*)::text FROM signal_evidence_links UNION ALL SELECT ''Outcomes'', count(*)::text FROM prediction_outcomes UNION ALL SELECT ''Metric Snapshots'', count(*)::text FROM model_metric_snapshots UNION ALL SELECT ''Duplicate Evidence %'', round(avg(CASE WHEN is_duplicate THEN 100.0 ELSE 0.0 END)::numeric, 1)::text FROM signal_evidence_links UNION ALL SELECT ''Zero-Evidence Snapshots'', count(*)::text FROM prediction_snapshots ps WHERE NOT EXISTS (SELECT 1 FROM signal_evidence_links sel WHERE sel.prediction_id = ps.id) UNION ALL SELECT ''Avg Confidence'', round(avg(confidence)::numeric, 3)::text FROM prediction_snapshots UNION ALL SELECT ''Distinct Tickers'', count(DISTINCT ticker)::text FROM prediction_snapshots')
+-- Upsert keyed on name: a row whose name already exists gets its description and sql_text overwritten. ON CONFLICT (name) needs a unique index or constraint on saved_queries.name, which is not created in this migration.
+ON CONFLICT (name) DO UPDATE SET sql_text = EXCLUDED.sql_text, description = EXCLUDED.description;