phase 14-15: docker build validation and helm deployment

This commit is contained in:
Celes Renata
2026-04-11 11:59:45 -07:00
parent 7394d241c9
commit ce10afa034
179 changed files with 32559 additions and 576 deletions
+25 -9
View File
@@ -2,15 +2,31 @@
Analytical fact table definitions for MinIO-backed datasets queried via Trino.
All tables use Hive-compatible partition layouts on MinIO (`s3a://stonks-lakehouse/warehouse/`)
and are defined in the `lakehouse.stonks` schema. Parquet is the storage format.
## Fact Tables
- `lake.market_bars` — OHLCV bar data
- `lake.market_quotes` — quote snapshots
- `lake.company_events` — corporate actions and events
- `lake.documents` — ingested document metadata
- `lake.document_extractions` — AI extraction outputs
- `lake.trade_signals` — aggregated trend signals
- `lake.trade_orders` — order submission records
- `lake.trade_fills` — fill and execution records
- `lake.market_bars` — OHLCV bar data per symbol per interval
- `lake.market_quotes` — bid/ask quote snapshots
- `lake.company_events` — corporate actions, earnings, filings, and issuer events
- `lake.documents` — ingested document metadata (articles, filings, transcripts)
- `lake.document_extractions` — AI extraction outputs per document per company
- `lake.trade_signals` — aggregated trend signals and recommendation actions
- `lake.trade_orders` — order submission records (paper and live)
- `lake.trade_fills` — fill and execution records from broker
- `lake.positions_daily` — end-of-day position snapshots
- `lake.pnl_daily` — daily PnL records
- `lake.pnl_daily` — daily PnL records per symbol per account
- `lake.prediction_vs_outcome` — prediction accuracy tracking
- `lake.model_performance` — extraction model performance metrics
## Partitioning
- Most tables partition by `dt` (date)
- `document_extractions`, `prediction_vs_outcome`, and `model_performance` also partition by `model_version`
## Trino Catalogs
- `lakehouse` catalog (Hive connector) for external Hive-compatible tables
- `iceberg` catalog (Iceberg connector) for managed Iceberg tables
## Views
Example SQL views for dashboards and ad hoc analysis are in `lakehouse/views/`.
See `lakehouse/views/README.md` for details.
+24
View File
@@ -0,0 +1,24 @@
-- Analytical fact table: company_events
-- Issuer-level events: corporate actions, earnings, filings, and similar.
-- Registered as an external Parquet table, partitioned by the dt (date) column.
-- Layout: s3a://stonks-lakehouse/warehouse/company_events/dt={yyyy-mm-dd}/part-*.parquet
-- Requirements: 2.3, 9.4, 9.5, 10.1
-- Design ref: Section 7 (lake.company_events)
CREATE TABLE IF NOT EXISTS lakehouse.stonks.company_events (
    event_id      VARCHAR,
    ticker        VARCHAR,
    event_type    VARCHAR,
    event_subtype VARCHAR,
    title         VARCHAR,
    description   VARCHAR,
    source        VARCHAR,
    source_url    VARCHAR,
    event_at      TIMESTAMP(6) WITH TIME ZONE,
    ingested_at   TIMESTAMP(6) WITH TIME ZONE,
    -- Partition column (see partitioned_by below).
    dt            DATE
)
WITH (
    format            = 'PARQUET',
    partitioned_by    = ARRAY['dt'],
    external_location = 's3a://stonks-lakehouse/warehouse/company_events/'
);
+13 -1
View File
@@ -1,16 +1,28 @@
-- Analytical fact table: document_extractions
-- Partitioned by dt and model_version on MinIO
-- AI extraction outputs per document per company.
-- Partitioned by dt and model_version on MinIO.
-- Path: s3://stonks-lakehouse/warehouse/document_extractions/dt={yyyy-mm-dd}/model_version={ver}/part-*.parquet
-- Requirements: 5.3, 5.5, 9.4, 9.5, 10.1, 10.4
-- Design ref: Section 6.3, Section 7 (lake.document_extractions)
CREATE TABLE IF NOT EXISTS lakehouse.stonks.document_extractions (
document_id VARCHAR,
ticker VARCHAR,
company_name VARCHAR,
relevance DOUBLE,
sentiment VARCHAR,
impact_score DOUBLE,
impact_horizon VARCHAR,
catalyst_type VARCHAR,
confidence DOUBLE,
novelty_score DOUBLE,
source_credibility DOUBLE,
key_facts VARCHAR,
risks VARCHAR,
macro_themes VARCHAR,
model_name VARCHAR,
prompt_version VARCHAR,
schema_version VARCHAR,
extraction_at TIMESTAMP(6) WITH TIME ZONE,
dt DATE,
model_version VARCHAR
+9 -2
View File
@@ -1,6 +1,9 @@
-- Analytical fact table: documents
-- Partitioned by dt and source_type on MinIO
-- Path: s3://stonks-lakehouse/warehouse/documents/dt={yyyy-mm-dd}/source_type={type}/part-*.parquet
-- Ingested document metadata for articles, filings, transcripts, and press releases.
-- Partitioned by dt on MinIO.
-- Path: s3://stonks-lakehouse/warehouse/documents/dt={yyyy-mm-dd}/part-*.parquet
-- Requirements: 3.1, 3.3, 9.4, 9.5, 10.1, 10.4
-- Design ref: Section 6.2, Section 7 (lake.documents)
CREATE TABLE IF NOT EXISTS lakehouse.stonks.documents (
document_id VARCHAR,
@@ -9,7 +12,11 @@ CREATE TABLE IF NOT EXISTS lakehouse.stonks.documents (
ticker VARCHAR,
publisher VARCHAR,
title VARCHAR,
url VARCHAR,
canonical_url VARCHAR,
language VARCHAR,
published_at TIMESTAMP(6) WITH TIME ZONE,
retrieved_at TIMESTAMP(6) WITH TIME ZONE,
content_hash VARCHAR,
confidence DOUBLE,
dt DATE
+6 -1
View File
@@ -1,6 +1,9 @@
-- Analytical fact table: market_bars
-- Partitioned by dt (date) on MinIO
-- OHLCV bar data for tracked symbols.
-- Partitioned by dt (date) on MinIO.
-- Path: s3://stonks-lakehouse/warehouse/market_bars/dt={yyyy-mm-dd}/part-*.parquet
-- Requirements: 2.1, 9.4, 9.5, 10.1
-- Design ref: Section 7 (lake.market_bars)
CREATE TABLE IF NOT EXISTS lakehouse.stonks.market_bars (
ticker VARCHAR,
@@ -10,7 +13,9 @@ CREATE TABLE IF NOT EXISTS lakehouse.stonks.market_bars (
close_price DOUBLE,
volume BIGINT,
vwap DOUBLE,
trade_count BIGINT,
bar_timestamp TIMESTAMP(6) WITH TIME ZONE,
bar_interval VARCHAR,
source VARCHAR,
dt DATE
) WITH (
+23
View File
@@ -0,0 +1,23 @@
-- Analytical fact table: market_quotes
-- Bid/ask/last quote snapshots for tracked symbols.
-- Registered as an external Parquet table, partitioned by the dt (date) column.
-- Layout: s3a://stonks-lakehouse/warehouse/market_quotes/dt={yyyy-mm-dd}/part-*.parquet
-- Requirements: 2.1, 9.4, 9.5, 10.1
-- Design ref: Section 7 (lake.market_quotes)
CREATE TABLE IF NOT EXISTS lakehouse.stonks.market_quotes (
    ticker     VARCHAR,
    bid_price  DOUBLE,
    ask_price  DOUBLE,
    bid_size   BIGINT,
    ask_size   BIGINT,
    last_price DOUBLE,
    last_size  BIGINT,
    source     VARCHAR,
    quote_at   TIMESTAMP(6) WITH TIME ZONE,
    -- Partition column (see partitioned_by below).
    dt         DATE
)
WITH (
    format            = 'PARQUET',
    partitioned_by    = ARRAY['dt'],
    external_location = 's3a://stonks-lakehouse/warehouse/market_quotes/'
);
+33
View File
@@ -0,0 +1,33 @@
-- Analytical fact table: model_performance
-- Tracks extraction model performance for Trino/Superset dashboards.
-- Partitioned by dt and model_version on MinIO (model_name is a regular
-- data column, not a partition key).
-- Path: s3a://stonks-lakehouse/warehouse/model_performance/dt={yyyy-mm-dd}/model_version={ver}/part-*.parquet
-- Requirements: 12.1, 12.2
CREATE TABLE IF NOT EXISTS lakehouse.stonks.model_performance (
    document_id               VARCHAR,
    ticker                    VARCHAR,
    model_name                VARCHAR,
    prompt_version            VARCHAR,
    schema_version            VARCHAR,
    success                   BOOLEAN,
    attempt_count             INTEGER,
    total_duration_ms         INTEGER,
    first_attempt_duration_ms INTEGER,
    final_attempt_duration_ms INTEGER,
    confidence                DOUBLE,
    validation_status         VARCHAR,
    validation_error_count    INTEGER,
    validation_warning_count  INTEGER,
    retry_count               INTEGER,
    input_token_estimate      INTEGER,
    output_token_estimate     INTEGER,
    company_count             INTEGER,
    recorded_at               TIMESTAMP(6) WITH TIME ZONE,
    -- Partition columns: kept last, in the same order as partitioned_by.
    dt                        DATE,
    model_version             VARCHAR
) WITH (
    format = 'PARQUET',
    partitioned_by = ARRAY['dt', 'model_version'],
    external_location = 's3a://stonks-lakehouse/warehouse/model_performance/'
);
+8 -1
View File
@@ -1,12 +1,19 @@
-- Analytical fact table: pnl_daily
-- Partitioned by dt on MinIO
-- Daily profit and loss records per symbol per account.
-- Partitioned by dt on MinIO.
-- Path: s3://stonks-lakehouse/warehouse/pnl_daily/dt={yyyy-mm-dd}/part-*.parquet
-- Requirements: 9.4, 9.5, 10.1, 10.3
-- Design ref: Section 7 (lake.pnl_daily)
CREATE TABLE IF NOT EXISTS lakehouse.stonks.pnl_daily (
ticker VARCHAR,
realized_pnl DOUBLE,
unrealized_pnl DOUBLE,
total_pnl DOUBLE,
fees DOUBLE,
net_pnl DOUBLE,
broker_account VARCHAR,
execution_mode VARCHAR,
dt DATE
) WITH (
format = 'PARQUET',
+7 -1
View File
@@ -1,13 +1,19 @@
-- Analytical fact table: positions_daily
-- Partitioned by dt on MinIO
-- End-of-day position snapshots.
-- Partitioned by dt on MinIO.
-- Path: s3://stonks-lakehouse/warehouse/positions_daily/dt={yyyy-mm-dd}/part-*.parquet
-- Requirements: 9.4, 9.5, 10.1, 10.3
-- Design ref: Section 7 (lake.positions_daily)
CREATE TABLE IF NOT EXISTS lakehouse.stonks.positions_daily (
ticker VARCHAR,
quantity DOUBLE,
avg_entry_price DOUBLE,
close_price DOUBLE,
market_value DOUBLE,
unrealized_pnl DOUBLE,
broker_account VARCHAR,
execution_mode VARCHAR,
snapshot_at TIMESTAMP(6) WITH TIME ZONE,
dt DATE
) WITH (
+16 -11
View File
@@ -1,19 +1,24 @@
-- Analytical fact table: prediction_vs_outcome
-- Partitioned by dt on MinIO
-- Prediction accuracy tracking: predicted signals vs realized market moves.
-- Partitioned by dt and model_version on MinIO.
-- Path: s3://stonks-lakehouse/warehouse/prediction_vs_outcome/dt={yyyy-mm-dd}/model_version={ver}/part-*.parquet
-- Requirements: 9.4, 9.5, 10.1, 10.3
-- Design ref: Section 7 (lake.prediction_vs_outcome)
-- Partition columns (dt, model_version) are declared last and in the same
-- order as partitioned_by; the Trino Hive connector rejects the table
-- otherwise. The order also matches the dt=.../model_version=... path layout.
CREATE TABLE IF NOT EXISTS lakehouse.stonks.prediction_vs_outcome (
    recommendation_id    VARCHAR,
    ticker               VARCHAR,
    predicted_action     VARCHAR,
    predicted_confidence DOUBLE,
    actual_move_pct      DOUBLE,
    outcome              VARCHAR,
    horizon_days         INTEGER,
    predicted_at         TIMESTAMP(6) WITH TIME ZONE,
    evaluated_at         TIMESTAMP(6) WITH TIME ZONE,
    dt                   DATE,
    model_version        VARCHAR
) WITH (
    format = 'PARQUET',
    partitioned_by = ARRAY['dt', 'model_version'],
    external_location = 's3a://stonks-lakehouse/warehouse/prediction_vs_outcome/'
);
+6 -1
View File
@@ -1,5 +1,9 @@
-- Analytical fact table: trade_fills
-- Partitioned by dt on MinIO
-- Fill and execution records from broker.
-- Partitioned by dt on MinIO.
-- Path: s3://stonks-lakehouse/warehouse/trade_fills/dt={yyyy-mm-dd}/part-*.parquet
-- Requirements: 9.4, 9.5, 10.1, 10.3
-- Design ref: Section 7 (lake.trade_fills)
CREATE TABLE IF NOT EXISTS lakehouse.stonks.trade_fills (
fill_id VARCHAR,
@@ -8,6 +12,7 @@ CREATE TABLE IF NOT EXISTS lakehouse.stonks.trade_fills (
side VARCHAR,
fill_price DOUBLE,
fill_quantity DOUBLE,
commission DOUBLE,
broker_account VARCHAR,
filled_at TIMESTAMP(6) WITH TIME ZONE,
dt DATE
+7 -1
View File
@@ -1,14 +1,20 @@
-- Analytical fact table: trade_orders
-- Partitioned by dt on MinIO
-- Order submission records for paper and live trading.
-- Partitioned by dt on MinIO.
-- Path: s3://stonks-lakehouse/warehouse/trade_orders/dt={yyyy-mm-dd}/part-*.parquet
-- Requirements: 8.3, 9.4, 9.5, 10.1, 10.3
-- Design ref: Section 7 (lake.trade_orders)
CREATE TABLE IF NOT EXISTS lakehouse.stonks.trade_orders (
order_id VARCHAR,
recommendation_id VARCHAR,
ticker VARCHAR,
side VARCHAR,
order_type VARCHAR,
quantity DOUBLE,
limit_price DOUBLE,
status VARCHAR,
execution_mode VARCHAR,
broker_account VARCHAR,
submitted_at TIMESTAMP(6) WITH TIME ZONE,
dt DATE
+18 -10
View File
@@ -1,16 +1,24 @@
-- Analytical fact table: trade_signals
-- Partitioned by dt on MinIO
-- Aggregated trend signals and recommendation actions.
-- Partitioned by dt on MinIO.
-- Path: s3://stonks-lakehouse/warehouse/trade_signals/dt={yyyy-mm-dd}/part-*.parquet
-- Requirements: 6.1, 6.2, 6.4, 6.5, 7.1, 9.4, 9.5, 10.1
-- Design ref: Section 6.4, Section 6.5, Section 7 (lake.trade_signals)
CREATE TABLE IF NOT EXISTS lakehouse.stonks.trade_signals (
signal_id VARCHAR,
ticker VARCHAR,
trend_direction VARCHAR,
trend_strength DOUBLE,
confidence DOUBLE,
action VARCHAR,
time_horizon VARCHAR,
generated_at TIMESTAMP(6) WITH TIME ZONE,
dt DATE
signal_id VARCHAR,
ticker VARCHAR,
trend_direction VARCHAR,
trend_strength DOUBLE,
confidence DOUBLE,
contradiction_score DOUBLE,
dominant_catalysts VARCHAR,
material_risks VARCHAR,
action VARCHAR,
time_horizon VARCHAR,
recommendation_id VARCHAR,
generated_at TIMESTAMP(6) WITH TIME ZONE,
dt DATE
) WITH (
format = 'PARQUET',
partitioned_by = ARRAY['dt'],
+23
View File
@@ -0,0 +1,23 @@
# Lakehouse Views
Example SQL views for Trino over MinIO-backed analytical fact tables.
These views are designed to be created in the `lakehouse.stonks` schema and
can be used directly in Superset dashboards or ad hoc Trino queries.
## Views
- `prediction_accuracy` — Joins predicted signals with realized market moves to score prediction quality
- `paper_trade_scorecard` — Aggregates paper trading performance by symbol and broker account with win rates and PnL
- `paper_trade_detail` — Per-order paper trade detail with fill prices and realized outcomes
- `signal_hit_rate` — Daily signal accuracy summary per model version, aggregated across all symbols
## Usage
Connect to Trino and run each `.sql` file to create the view:
```bash
trino --catalog lakehouse --schema stonks < lakehouse/views/prediction_accuracy.sql
```
Or paste into the Superset SQL Lab to explore interactively.
+47
View File
@@ -0,0 +1,47 @@
-- View: paper_trade_detail
-- Drill-down companion to the scorecard: every paper-mode order, with its
-- fills (if any) and the prediction outcome recorded for the originating
-- recommendation (if any). Orders without fills or outcomes are retained
-- with NULLs in the corresponding columns.
-- Requirements: 10.1, 10.3, 10.4
-- Design ref: Section 9.2 (evidence-to-outcome drill-down)
CREATE OR REPLACE VIEW lakehouse.stonks.paper_trade_detail AS
WITH paper_orders AS (
    -- Restrict to paper execution up front; live orders never appear here.
    SELECT
        order_id,
        recommendation_id,
        ticker,
        side,
        order_type,
        quantity,
        limit_price,
        status,
        submitted_at,
        broker_account,
        dt
    FROM lakehouse.stonks.trade_orders
    WHERE execution_mode = 'paper'
)
SELECT
    po.order_id,
    po.recommendation_id,
    po.ticker,
    po.side,
    po.order_type,
    po.quantity,
    po.limit_price,
    po.status AS order_status,
    po.submitted_at,
    fl.fill_id,
    fl.fill_price,
    fl.fill_quantity,
    fl.commission,
    fl.filled_at,
    -- Slippage as a percentage of the limit price (for buys, positive = worse).
    -- A NULL or non-positive limit price yields NULL because the comparison
    -- is not true and the CASE has no ELSE branch.
    CASE
        WHEN po.limit_price > 0
            THEN (fl.fill_price - po.limit_price) / po.limit_price * 100
    END AS slippage_pct,
    -- Prediction outcome for the recommendation that produced the order.
    pv.predicted_action,
    pv.predicted_confidence,
    pv.actual_move_pct,
    pv.outcome AS prediction_outcome,
    po.broker_account,
    po.dt
FROM paper_orders AS po
LEFT JOIN lakehouse.stonks.trade_fills AS fl
    ON fl.order_id = po.order_id
    AND fl.dt = po.dt
LEFT JOIN lakehouse.stonks.prediction_vs_outcome AS pv
    ON pv.recommendation_id = po.recommendation_id
    AND pv.dt = po.dt;
+42
View File
@@ -0,0 +1,42 @@
-- View: paper_trade_scorecard
-- Aggregates paper trading performance per symbol and broker account with
-- win rates, PnL totals, and order counts. Filters to paper execution mode only.
--
-- PnL and orders are aggregated in separate CTEs and joined afterwards.
-- Joining pnl_daily directly to trade_orders repeats each PnL row once per
-- order placed that day, which inflates every SUM/AVG and the win rate.
-- Requirements: 10.1, 10.2, 10.3
-- Design ref: Section 9.2 (paper trading PnL scorecard)
CREATE OR REPLACE VIEW lakehouse.stonks.paper_trade_scorecard AS
WITH pnl_summary AS (
    -- One row per (ticker, broker_account) computed from PnL rows only.
    SELECT
        pnl.ticker,
        pnl.broker_account,
        COUNT(DISTINCT pnl.dt) AS trading_days,
        SUM(pnl.realized_pnl) AS total_realized_pnl,
        SUM(pnl.unrealized_pnl) AS total_unrealized_pnl,
        SUM(pnl.net_pnl) AS total_net_pnl,
        SUM(pnl.fees) AS total_fees,
        AVG(pnl.net_pnl) AS avg_daily_pnl,
        -- Win rate: fraction of PnL rows with positive net PnL
        CAST(
            COUNT(CASE WHEN pnl.net_pnl > 0 THEN 1 END) AS DOUBLE
        ) / NULLIF(COUNT(*), 0) AS win_rate,
        -- Worst and best single-day PnL
        MIN(pnl.net_pnl) AS worst_day_pnl,
        MAX(pnl.net_pnl) AS best_day_pnl,
        MIN(pnl.dt) AS first_trade_date,
        MAX(pnl.dt) AS last_trade_date
    FROM lakehouse.stonks.pnl_daily AS pnl
    WHERE pnl.execution_mode = 'paper'
    GROUP BY
        pnl.ticker,
        pnl.broker_account
),
order_summary AS (
    -- Paper orders, limited (as before) to days that have a paper PnL row
    -- for the same ticker and account.
    SELECT
        o.ticker,
        o.broker_account,
        COUNT(DISTINCT o.order_id) AS total_orders,
        COUNT(DISTINCT CASE WHEN o.status = 'filled' THEN o.order_id END)
            AS filled_orders
    FROM lakehouse.stonks.trade_orders AS o
    WHERE o.execution_mode = 'paper'
        AND EXISTS (
            SELECT 1
            FROM lakehouse.stonks.pnl_daily AS p
            WHERE p.execution_mode = 'paper'
                AND p.ticker = o.ticker
                AND p.broker_account = o.broker_account
                AND p.dt = o.dt
        )
    GROUP BY
        o.ticker,
        o.broker_account
)
SELECT
    ps.ticker,
    ps.broker_account,
    ps.trading_days,
    ps.total_realized_pnl,
    ps.total_unrealized_pnl,
    ps.total_net_pnl,
    ps.total_fees,
    ps.avg_daily_pnl,
    ps.win_rate,
    ps.worst_day_pnl,
    ps.best_day_pnl,
    -- Groups with no matching orders report 0, as the original COUNTs did.
    COALESCE(os.total_orders, 0) AS total_orders,
    COALESCE(os.filled_orders, 0) AS filled_orders,
    ps.first_trade_date,
    ps.last_trade_date
FROM pnl_summary AS ps
LEFT JOIN order_summary AS os
    ON os.ticker = ps.ticker
    AND os.broker_account = ps.broker_account;
+44
View File
@@ -0,0 +1,44 @@
-- View: prediction_accuracy
-- Joins prediction_vs_outcome with trade_signals (on recommendation_id and dt)
-- to provide a per-recommendation prediction accuracy scorecard.
-- Predictions without a matching signal are kept with NULL signal columns.
-- Requirements: 10.1, 10.2, 10.3, 10.4
-- Design ref: Section 9.2 (prediction confidence vs realized move)
CREATE OR REPLACE VIEW lakehouse.stonks.prediction_accuracy AS
SELECT
    pvo.recommendation_id,
    pvo.ticker,
    pvo.predicted_action,
    pvo.predicted_confidence,
    pvo.actual_move_pct,
    pvo.outcome,
    pvo.horizon_days,
    pvo.predicted_at,
    pvo.evaluated_at,
    pvo.model_version,
    ts.trend_direction,
    ts.trend_strength,
    ts.contradiction_score,
    ts.dominant_catalysts,
    -- Confidence bucket for dashboard grouping
    -- (a NULL confidence falls into 'low' via the ELSE branch).
    CASE
        WHEN pvo.predicted_confidence >= 0.8 THEN 'high'
        WHEN pvo.predicted_confidence >= 0.5 THEN 'medium'
        ELSE 'low'
    END AS confidence_bucket,
    -- Direction correctness: did the predicted action match the actual move?
    -- NULL means "not scorable": either the realized move or the action is
    -- missing, or the action is non-directional (hold/watch). Without the
    -- first guard, missing data would fall through to ELSE and be counted
    -- as an incorrect prediction.
    CASE
        WHEN pvo.actual_move_pct IS NULL OR pvo.predicted_action IS NULL THEN NULL
        WHEN pvo.predicted_action = 'buy' AND pvo.actual_move_pct > 0 THEN true
        WHEN pvo.predicted_action = 'sell' AND pvo.actual_move_pct < 0 THEN true
        WHEN pvo.predicted_action IN ('hold', 'watch') THEN NULL
        ELSE false
    END AS direction_correct,
    -- Absolute size of the realized move, regardless of direction
    ABS(pvo.actual_move_pct) AS abs_move_pct,
    pvo.dt
FROM
    lakehouse.stonks.prediction_vs_outcome AS pvo
LEFT JOIN
    lakehouse.stonks.trade_signals AS ts
    ON pvo.recommendation_id = ts.recommendation_id
    AND pvo.dt = ts.dt;
+31
View File
@@ -0,0 +1,31 @@
-- View: signal_hit_rate
-- One row per (dt, model_version) summarising how predictions were scored:
-- counts by outcome label, hit rate, confidence by outcome, and realized
-- move statistics, aggregated over all symbols.
-- Designed for the Superset prediction accuracy dashboard.
-- Requirements: 10.1, 10.2, 10.3
-- Design ref: Section 9.2 (prediction confidence vs realized move)
CREATE OR REPLACE VIEW lakehouse.stonks.signal_hit_rate AS
SELECT
    p.dt,
    p.model_version,
    COUNT(*) AS total_predictions,
    COUNT(*) FILTER (WHERE p.outcome = 'correct') AS correct_predictions,
    COUNT(*) FILTER (WHERE p.outcome = 'incorrect') AS incorrect_predictions,
    COUNT(*) FILTER (WHERE p.outcome = 'neutral') AS neutral_predictions,
    -- Hit rate: correct predictions over all predictions in the group
    CAST(COUNT(*) FILTER (WHERE p.outcome = 'correct') AS DOUBLE)
        / NULLIF(COUNT(*), 0) AS hit_rate,
    -- Mean stated confidence, split by how the prediction turned out
    AVG(p.predicted_confidence) FILTER (WHERE p.outcome = 'correct')
        AS avg_confidence_correct,
    AVG(p.predicted_confidence) FILTER (WHERE p.outcome = 'incorrect')
        AS avg_confidence_incorrect,
    -- Realized move: mean magnitude and mean signed value
    AVG(ABS(p.actual_move_pct)) AS avg_abs_move_pct,
    AVG(p.actual_move_pct) AS avg_move_pct
FROM lakehouse.stonks.prediction_vs_outcome AS p
GROUP BY
    p.dt,
    p.model_version;