phase 14-15: docker build validation and helm deployment

This commit is contained in:
Celes Renata
2026-04-11 11:59:45 -07:00
parent 7394d241c9
commit ce10afa034
179 changed files with 32559 additions and 576 deletions
+603
View File
@@ -0,0 +1,603 @@
"""Validate lake publication and Trino query correctness over partitioned MinIO datasets.
Ensures that:
- PyArrow schemas in worker.py match the lakehouse DDL column definitions
- Iceberg DDL generated from PyArrow schemas is consistent with lakehouse DDL
- Partition layouts are Hive-compatible and discoverable by Trino
- Published Parquet files embed partition columns in the data
- Cross-table join keys used by views are present and type-consistent
- All 12 analytical fact tables have aligned schema definitions across layers
Requirements: 9.4, 9.5, 10.1, 10.3, N4, N6
Design ref: Section 5.2, 5.3, 7, 8.4
"""
from __future__ import annotations
import io
import re
from datetime import date, datetime, timezone
from pathlib import Path
from unittest.mock import MagicMock
import pyarrow as pa
import pyarrow.parquet as pq
from services.lake_publisher.iceberg import (
ICEBERG_CATALOG,
ICEBERG_SCHEMA,
TABLE_SCHEMAS,
IcebergTableDef,
_arrow_type_to_trino,
get_all_table_defs,
get_table_def,
)
from services.lake_publisher.partitions import (
LAKEHOUSE_BUCKET,
TABLE_PARTITIONS,
WAREHOUSE_PREFIX,
partition_path,
partition_values,
)
from services.lake_publisher.worker import (
COMPANY_EVENTS_SCHEMA,
DOCUMENTS_SCHEMA,
DOCUMENT_EXTRACTIONS_SCHEMA,
MARKET_BARS_SCHEMA,
MARKET_QUOTES_SCHEMA,
MODEL_PERFORMANCE_SCHEMA,
PNL_DAILY_SCHEMA,
POSITIONS_DAILY_SCHEMA,
PREDICTION_VS_OUTCOME_SCHEMA,
TRADE_FILLS_SCHEMA,
TRADE_ORDERS_SCHEMA,
TRADE_SIGNALS_SCHEMA,
publish_market_bar,
publish_document_fact,
publish_document_extraction,
publish_trade_signal,
publish_trade_order,
publish_trade_fill,
publish_position_daily,
publish_pnl_daily,
publish_company_event,
publish_market_quote,
publish_prediction_fact,
publish_model_performance,
)
from services.shared.schemas import (
ActionType,
ModelMetadata,
PositionSizing,
Recommendation,
RecommendationMode,
)
# Fixed, timezone-aware reference instant shared by the tests (2026-04-11 14:30 UTC).
NOW = datetime(year=2026, month=4, day=11, hour=14, minute=30, second=0, tzinfo=timezone.utc)

# Directory holding one "<table>.sql" DDL file per table; relative to the
# working directory the tests are run from.
LAKEHOUSE_DDL_DIR = Path("lakehouse") / "schemas"

# The twelve analytical fact tables every layer is expected to define.
ALL_TABLES = [
    # market data and events
    "market_bars", "market_quotes", "company_events",
    # documents
    "documents", "document_extractions",
    # trading
    "trade_signals", "trade_orders", "trade_fills",
    # daily rollups
    "positions_daily", "pnl_daily",
    # model evaluation
    "prediction_vs_outcome", "model_performance",
]
# Map table names to their PyArrow schemas for direct reference.
# The keys are the same twelve names, in the same order, as ALL_TABLES; each
# value is the corresponding *_SCHEMA constant imported from
# services.lake_publisher.worker. Tests in this module index this mapping by
# table name instead of naming each schema constant individually.
PYARROW_SCHEMAS: dict[str, pa.Schema] = {
    "market_bars": MARKET_BARS_SCHEMA,
    "market_quotes": MARKET_QUOTES_SCHEMA,
    "company_events": COMPANY_EVENTS_SCHEMA,
    "documents": DOCUMENTS_SCHEMA,
    "document_extractions": DOCUMENT_EXTRACTIONS_SCHEMA,
    "trade_signals": TRADE_SIGNALS_SCHEMA,
    "trade_orders": TRADE_ORDERS_SCHEMA,
    "trade_fills": TRADE_FILLS_SCHEMA,
    "positions_daily": POSITIONS_DAILY_SCHEMA,
    "pnl_daily": PNL_DAILY_SCHEMA,
    "prediction_vs_outcome": PREDICTION_VS_OUTCOME_SCHEMA,
    "model_performance": MODEL_PERFORMANCE_SCHEMA,
}
# ---------------------------------------------------------------------------
# Helpers: parse lakehouse DDL SQL files
# ---------------------------------------------------------------------------
def _parse_ddl_columns(sql_path: Path) -> list[tuple[str, str]]:
    """Parse column definitions from a lakehouse DDL SQL file.

    Reads the parenthesised column list of the ``CREATE TABLE ... WITH``
    statement and returns one ``(column_name, trino_type)`` tuple per
    definition line, in declaration order. Names are lower-cased and types
    are upper-cased.

    Only columns declared in the column list are returned. Nothing is
    synthesised from the ``partitioned_by`` clause, so a partition column is
    present in the result only when the DDL declares it in the column list.

    Full-line ``--`` comments are skipped, and a trailing ``--`` comment on a
    definition line is removed so it cannot leak into the parsed type.

    Args:
        sql_path: Path to the DDL file; it is read as UTF-8.

    Returns:
        The parsed columns, or an empty list when the file contains no
        ``CREATE TABLE ... ( ... ) WITH`` statement.
    """
    text = sql_path.read_text(encoding="utf-8")
    # Greedy body match: it extends to the last ")" that is followed by WITH,
    # so parenthesised types such as DECIMAL(18, 4) stay inside the body.
    match = re.search(
        r"CREATE TABLE[^(]+\((.*)\)\s*WITH",
        text, re.DOTALL | re.IGNORECASE,
    )
    if not match:
        return []
    columns = []
    for raw_line in match.group(1).splitlines():
        # Cut the SQL line comment first, then the separating comma; doing it
        # in the other order would leave "TYPE, -- note" untouched.
        line = raw_line.split("--", 1)[0].strip().rstrip(",").rstrip()
        if not line:
            continue
        # Split only on the first whitespace run to keep multi-word types intact.
        parts = line.split(None, 1)
        if len(parts) >= 2:
            columns.append((parts[0].lower(), parts[1].upper().strip()))
    return columns
def _parse_ddl_partitions(sql_path: Path) -> list[str]:
    """Parse partition keys from a lakehouse DDL SQL file.

    Looks for a ``partitioned_by = ARRAY[...]`` clause (case-insensitive) and
    returns the listed keys in order, with surrounding whitespace and
    single/double quotes removed. Empty entries (for example from a
    whitespace-only array or a stray comma) are dropped.

    Args:
        sql_path: Path to the DDL file; it is read as UTF-8.

    Returns:
        The partition key names, or an empty list when the clause is absent
        or lists no keys.
    """
    text = sql_path.read_text(encoding="utf-8")
    match = re.search(r"partitioned_by\s*=\s*ARRAY\[([^\]]+)\]", text, re.IGNORECASE)
    if not match:
        return []
    cleaned = (item.strip().strip("'\"") for item in match.group(1).split(","))
    # Filter blanks so "ARRAY[ ]" yields [] rather than [""].
    return [key for key in cleaned if key]
# ---------------------------------------------------------------------------
# 1. All 12 tables are registered across all layers
# ---------------------------------------------------------------------------
def test_all_tables_in_partition_registry():
    """TABLE_PARTITIONS must contain an entry for every name in ALL_TABLES."""
    for table in ALL_TABLES:
        is_registered = table in TABLE_PARTITIONS
        assert is_registered, f"{table} missing from TABLE_PARTITIONS"
def test_all_tables_in_schema_registry():
    """TABLE_SCHEMAS must contain an entry for every name in ALL_TABLES."""
    for table in ALL_TABLES:
        has_schema = table in TABLE_SCHEMAS
        assert has_schema, f"{table} missing from TABLE_SCHEMAS"
def test_all_tables_have_ddl_files():
    """Each name in ALL_TABLES has a ``<table>.sql`` file under LAKEHOUSE_DDL_DIR."""
    expected_files = (LAKEHOUSE_DDL_DIR / f"{table}.sql" for table in ALL_TABLES)
    for ddl_path in expected_files:
        assert ddl_path.exists(), f"Missing DDL file: {ddl_path}"
def test_all_tables_have_iceberg_defs():
    """Every name in ALL_TABLES appears among the defs returned by get_all_table_defs()."""
    def_names = set()
    for table_def in get_all_table_defs():
        def_names.add(table_def.table_name)
    for table in ALL_TABLES:
        assert table in def_names, f"{table} missing from Iceberg table defs"
# ---------------------------------------------------------------------------
# 2. PyArrow schema ↔ Lakehouse DDL column alignment
# ---------------------------------------------------------------------------
def test_pyarrow_columns_match_ddl():
    """Column names, in order, agree between each PyArrow schema and its DDL file.

    Tables whose DDL file does not exist are skipped here; file presence is
    asserted separately by test_all_tables_have_ddl_files.
    """
    for table in ALL_TABLES:
        ddl_path = LAKEHOUSE_DDL_DIR / f"{table}.sql"
        if ddl_path.exists():
            ddl_col_names = [name for name, _ddl_type in _parse_ddl_columns(ddl_path)]
            arrow_col_names = PYARROW_SCHEMAS[table].names
            assert arrow_col_names == ddl_col_names, (
                f"Column mismatch for {table}:\n"
                f" PyArrow: {arrow_col_names}\n"
                f" DDL: {ddl_col_names}"
            )
def test_pyarrow_types_compatible_with_ddl():
    """Each PyArrow field type, mapped through _arrow_type_to_trino, equals the DDL type.

    A column absent from the DDL is compared against the empty string, so it
    fails with a type-mismatch message. Tables without a DDL file are skipped.
    """
    for table in ALL_TABLES:
        ddl_path = LAKEHOUSE_DDL_DIR / f"{table}.sql"
        if not ddl_path.exists():
            continue
        ddl_type_map = dict(_parse_ddl_columns(ddl_path))
        for field in PYARROW_SCHEMAS[table]:
            col_name = field.name
            trino_type = _arrow_type_to_trino(field.type)
            ddl_type = ddl_type_map.get(col_name, "")
            assert trino_type == ddl_type, (
                f"Type mismatch for {table}.{col_name}: "
                f"PyArrow→Trino={trino_type}, DDL={ddl_type}"
            )
# ---------------------------------------------------------------------------
# 3. Partition key alignment across layers
# ---------------------------------------------------------------------------
def test_partition_keys_match_ddl():
    """The ``all_keys`` of each TABLE_PARTITIONS spec equal the DDL ``partitioned_by`` keys.

    Tables without a DDL file are skipped.
    """
    for table in ALL_TABLES:
        ddl_path = LAKEHOUSE_DDL_DIR / f"{table}.sql"
        if ddl_path.exists():
            ddl_parts = _parse_ddl_partitions(ddl_path)
            arrow_parts = [*TABLE_PARTITIONS[table].all_keys]
            assert arrow_parts == ddl_parts, (
                f"Partition key mismatch for {table}: "
                f"TABLE_PARTITIONS={arrow_parts}, DDL={ddl_parts}"
            )
def test_iceberg_partition_keys_match():
    """The ``partitioning`` clause of each generated Iceberg DDL lists the spec's keys.

    When a table's spec has no keys, nothing is asserted for that table.
    """
    for td in get_all_table_defs():
        expected_keys = [*TABLE_PARTITIONS[td.table_name].all_keys]
        # The keys are recovered from the SQL text that the table def generates.
        match = re.search(r"partitioning = ARRAY\[([^\]]+)\]", td.create_table_sql())
        if not expected_keys:
            continue
        assert match is not None, f"No partitioning clause for {td.table_name}"
        parsed = [item.strip().strip("'") for item in match.group(1).split(",")]
        assert parsed == expected_keys, (
            f"Iceberg partition mismatch for {td.table_name}: "
            f"expected={expected_keys}, got={parsed}"
        )
# ---------------------------------------------------------------------------
# 4. Partition columns are embedded in PyArrow schemas
# ---------------------------------------------------------------------------
def test_partition_columns_in_pyarrow_schemas():
    """Every partition key of a table is also a field of that table's PyArrow schema.

    This guards that partition columns are part of the schema used for the
    Parquet data rather than existing only in the object path.
    """
    for table in ALL_TABLES:
        col_names = set(PYARROW_SCHEMAS[table].names)
        for key in TABLE_PARTITIONS[table].all_keys:
            assert key in col_names, (
                f"Partition column '{key}' missing from PyArrow schema for {table}"
            )
# ---------------------------------------------------------------------------
# 5. Hive-compatible partition path format
# ---------------------------------------------------------------------------
def test_partition_paths_are_hive_compatible():
    """Generated object paths use ``key=value`` segments under the warehouse prefix.

    For every table the path must start with ``<WAREHOUSE_PREFIX>/<table>/``,
    contain the ``dt=`` segment for NOW's date, contain one ``key=test_val``
    segment per extra partition key, and end with ``.parquet``.
    """
    for table in ALL_TABLES:
        spec = TABLE_PARTITIONS[table]
        # Every extra key gets the same placeholder value.
        extras = dict.fromkeys(spec.extra_keys, "test_val") if spec.extra_keys else {}
        path = partition_path(table, NOW, extras)

        has_prefix = path.startswith(f"{WAREHOUSE_PREFIX}/{table}/")
        assert has_prefix, (
            f"Path for {table} doesn't start with warehouse prefix: {path}"
        )
        assert "dt=2026-04-11" in path, f"Missing dt partition in path for {table}: {path}"
        assert path.endswith(".parquet"), f"Path for {table} doesn't end with .parquet: {path}"
        for key in spec.extra_keys:
            assert f"{key}=test_val" in path, (
                f"Missing extra partition {key} in path for {table}: {path}"
            )
def test_partition_path_dt_from_date_object():
    """partition_path accepts a plain ``date`` and renders it as the ``dt=`` segment."""
    path = partition_path("market_bars", date(2026, 4, 11))
    assert "dt=2026-04-11" in path
# ---------------------------------------------------------------------------
# 6. Published Parquet files contain partition columns in data
# ---------------------------------------------------------------------------
def _capture_parquet(mock_client: MagicMock) -> pa.Table:
    """Read back the Parquet table passed to the mock client's last ``put_object`` call.

    The payload is taken from the third positional argument of that call
    (presumably the data stream of the MinIO ``put_object`` signature —
    confirm against the publisher). It is rewound before reading.
    """
    positional_args = mock_client.put_object.call_args.args
    payload = positional_args[2]
    payload.seek(0)
    return pq.read_table(payload)
def test_published_market_bar_has_dt_column():
    """A published market bar's Parquet data has a ``dt`` column equal to the bar's date."""
    client = MagicMock()
    bar_fields = {
        "ticker": "AAPL",
        "open_price": 150.0,
        "high_price": 155.0,
        "low_price": 149.0,
        "close_price": 153.0,
        "volume": 1000000,
        "bar_timestamp": NOW,
        "source": "test",
    }
    publish_market_bar(client, **bar_fields)
    table = _capture_parquet(client)
    assert "dt" in table.column_names
    first_dt = table.column("dt")[0].as_py()
    assert first_dt == date(2026, 4, 11)
def test_published_document_extraction_has_partition_columns():
    """A published document extraction carries ``dt`` and ``model_version`` in its data.

    ``dt`` must be NOW's date and ``model_version`` must equal the
    ``schema_version`` passed to the publisher.
    """
    client = MagicMock()
    extraction_fields = {
        "document_id": "doc-1",
        "ticker": "AAPL",
        "sentiment": "positive",
        "impact_score": 0.7,
        "catalyst_type": "earnings",
        "confidence": 0.85,
        "extraction_at": NOW,
        "model_name": "test-model",
        "prompt_version": "v1",
        "schema_version": "2.0.0",
    }
    publish_document_extraction(client, **extraction_fields)
    table = _capture_parquet(client)
    for required in ("dt", "model_version"):
        assert required in table.column_names
    assert table.column("dt")[0].as_py() == date(2026, 4, 11)
    assert table.column("model_version")[0].as_py() == "2.0.0"
def test_published_prediction_vs_outcome_has_partition_columns():
    """A prediction fact published from a Recommendation has ``dt`` and ``model_version`` columns."""
    sizing = PositionSizing(portfolio_pct=0.02, max_loss_pct=0.005)
    metadata = ModelMetadata(provider="ollama", model_name="test-v1")
    rec = Recommendation(
        recommendation_id="rec-001",
        ticker="AAPL",
        action=ActionType.BUY,
        mode=RecommendationMode.PAPER_ELIGIBLE,
        confidence=0.72,
        time_horizon="swing_1d_10d",
        thesis="test",
        invalidation_conditions=["x"],
        position_sizing=sizing,
        evidence_refs=["doc1"],
        model_metadata=metadata,
        generated_at=NOW,
    )
    client = MagicMock()
    publish_prediction_fact(client, rec)
    published_columns = _capture_parquet(client).column_names
    assert "dt" in published_columns
    assert "model_version" in published_columns
def test_published_model_performance_has_partition_columns():
    """A published model-performance row has ``dt`` and ``model_version`` columns.

    ``model_version`` must equal the ``schema_version`` passed to the publisher.
    """
    client = MagicMock()
    performance_fields = {
        "document_id": "doc-1",
        "model_name": "gpt-oss:20b",
        "success": True,
        "total_duration_ms": 1500,
        "recorded_at": NOW,
        "schema_version": "2.0.0",
    }
    publish_model_performance(client, **performance_fields)
    table = _capture_parquet(client)
    assert "dt" in table.column_names
    assert "model_version" in table.column_names
    recorded_version = table.column("model_version")[0].as_py()
    assert recorded_version == "2.0.0"
# ---------------------------------------------------------------------------
# 7. Parquet schema matches PyArrow schema for every publisher
# ---------------------------------------------------------------------------
def _publish_and_verify_schema(table_name: str, publish_fn, expected_schema: pa.Schema):
    """Publish through a mock client and check the Parquet column names and order.

    ``publish_fn`` is called with a fresh MagicMock client; the Parquet data
    it uploads is read back and its column names are compared, in order,
    with the field names of ``expected_schema``. ``table_name`` is used only
    in the failure message.
    """
    client = MagicMock()
    publish_fn(client)
    table = _capture_parquet(client)
    expected_names = expected_schema.names
    names_match = list(table.column_names) == expected_names
    assert names_match, (
        f"Parquet column mismatch for {table_name}: "
        f"got={list(table.column_names)}, expected={expected_names}"
    )
def test_parquet_schema_market_bars():
    """publish_market_bar writes exactly the MARKET_BARS_SCHEMA columns, in order."""
    def publish(c):
        return publish_market_bar(
            c, "AAPL", 150.0, 155.0, 149.0, 153.0, 1000000, NOW, "test",
        )

    _publish_and_verify_schema("market_bars", publish, MARKET_BARS_SCHEMA)
def test_parquet_schema_market_quotes():
    """publish_market_quote writes exactly the MARKET_QUOTES_SCHEMA columns, in order."""
    def publish(c):
        return publish_market_quote(
            c, "AAPL", 150.0, 150.5, 150.25, NOW, "test",
        )

    _publish_and_verify_schema("market_quotes", publish, MARKET_QUOTES_SCHEMA)
def test_parquet_schema_company_events():
    """publish_company_event writes exactly the COMPANY_EVENTS_SCHEMA columns, in order."""
    def publish(c):
        return publish_company_event(
            c, "evt-1", "AAPL", "earnings", "Q1 Earnings", NOW, "test",
        )

    _publish_and_verify_schema("company_events", publish, COMPANY_EVENTS_SCHEMA)
def test_parquet_schema_documents():
    """publish_document_fact writes exactly the DOCUMENTS_SCHEMA columns, in order."""
    def publish(c):
        return publish_document_fact(
            c, "doc-1", "article", "news_api", "AAPL", "Reuters", "Test", NOW, "hash123",
        )

    _publish_and_verify_schema("documents", publish, DOCUMENTS_SCHEMA)
def test_parquet_schema_trade_orders():
    """publish_trade_order writes exactly the TRADE_ORDERS_SCHEMA columns, in order."""
    def publish(c):
        return publish_trade_order(
            c, "ord-1", "AAPL", "buy", "market", 10.0, None, "filled", "acct-1", NOW,
        )

    _publish_and_verify_schema("trade_orders", publish, TRADE_ORDERS_SCHEMA)
def test_parquet_schema_trade_fills():
    """publish_trade_fill writes exactly the TRADE_FILLS_SCHEMA columns, in order."""
    def publish(c):
        return publish_trade_fill(
            c, "fill-1", "ord-1", "AAPL", "buy", 150.25, 10.0, "acct-1", NOW,
        )

    _publish_and_verify_schema("trade_fills", publish, TRADE_FILLS_SCHEMA)
def test_parquet_schema_positions_daily():
    """publish_position_daily writes exactly the POSITIONS_DAILY_SCHEMA columns, in order."""
    def publish(c):
        return publish_position_daily(
            c, "AAPL", 100.0, 145.0, 150.0, 500.0, "acct-1", NOW,
        )

    _publish_and_verify_schema("positions_daily", publish, POSITIONS_DAILY_SCHEMA)
def test_parquet_schema_pnl_daily():
    """publish_pnl_daily writes exactly the PNL_DAILY_SCHEMA columns, in order."""
    def publish(c):
        return publish_pnl_daily(
            c, "AAPL", 200.0, 500.0, 700.0, "acct-1", NOW,
        )

    _publish_and_verify_schema("pnl_daily", publish, PNL_DAILY_SCHEMA)
# ---------------------------------------------------------------------------
# 8. Cross-table join keys for views
# ---------------------------------------------------------------------------
def test_prediction_accuracy_view_join_keys():
    """prediction_vs_outcome and trade_signals both expose ``recommendation_id`` and ``dt``.

    These are the columns the prediction_accuracy view joins on.
    """
    pvo_cols = set(PREDICTION_VS_OUTCOME_SCHEMA.names)
    ts_cols = set(TRADE_SIGNALS_SCHEMA.names)
    for join_key in ("recommendation_id", "dt"):
        assert join_key in pvo_cols
        assert join_key in ts_cols
def test_paper_trade_scorecard_view_join_keys():
    """pnl_daily and trade_orders both expose ``ticker``, ``broker_account`` and ``dt``.

    These are the columns the paper_trade_scorecard view joins on.
    """
    pnl_cols = set(PNL_DAILY_SCHEMA.names)
    ord_cols = set(TRADE_ORDERS_SCHEMA.names)
    for key in ("ticker", "broker_account", "dt"):
        assert key in pnl_cols, f"pnl_daily missing join key: {key}"
        assert key in ord_cols, f"trade_orders missing join key: {key}"
def test_paper_trade_detail_view_join_keys():
    """The three tables joined by paper_trade_detail expose the join columns.

    Orders and fills must share ``order_id`` and ``dt``; orders and
    prediction_vs_outcome must share ``recommendation_id``.
    """
    ord_cols = set(TRADE_ORDERS_SCHEMA.names)
    fill_cols = set(TRADE_FILLS_SCHEMA.names)
    pvo_cols = set(PREDICTION_VS_OUTCOME_SCHEMA.names)

    # orders <-> fills
    for join_key in ("order_id", "dt"):
        assert join_key in ord_cols
        assert join_key in fill_cols

    # orders <-> prediction_vs_outcome
    assert "recommendation_id" in ord_cols
    assert "recommendation_id" in pvo_cols
def test_signal_hit_rate_view_columns():
    """prediction_vs_outcome exposes every column the signal_hit_rate view relies on."""
    pvo_cols = set(PREDICTION_VS_OUTCOME_SCHEMA.names)
    required_columns = (
        "dt",
        "model_version",
        "outcome",
        "predicted_confidence",
        "actual_move_pct",
    )
    for column in required_columns:
        assert column in pvo_cols
# ---------------------------------------------------------------------------
# 9. Iceberg DDL consistency with lakehouse DDL
# ---------------------------------------------------------------------------
def test_iceberg_ddl_columns_match_lakehouse_ddl():
    """Column names in each generated Iceberg CREATE TABLE equal those in the DDL file.

    Names are compared in order and lower-cased. Table defs without a
    matching DDL file are skipped.
    """
    for td in get_all_table_defs():
        ddl_path = LAKEHOUSE_DDL_DIR / f"{td.table_name}.sql"
        if not ddl_path.exists():
            continue
        ddl_col_names = [name for name, _ddl_type in _parse_ddl_columns(ddl_path)]

        iceberg_sql = td.create_table_sql()
        # Greedy body match so parentheses nested inside column types stay in the body.
        match = re.search(r"CREATE TABLE[^(]+\((.*)\)\s*WITH", iceberg_sql, re.DOTALL)
        assert match is not None, f"Could not parse Iceberg DDL for {td.table_name}"

        iceberg_col_names = []
        for raw_line in match.group(1).strip().split("\n"):
            tokens = raw_line.strip().rstrip(",").split()
            if tokens:
                # The first token of a definition line is the column name.
                iceberg_col_names.append(tokens[0].lower())

        assert iceberg_col_names == ddl_col_names, (
            f"Iceberg DDL column mismatch for {td.table_name}:\n"
            f" Iceberg: {iceberg_col_names}\n"
            f" DDL: {ddl_col_names}"
        )
# ---------------------------------------------------------------------------
# 10. MinIO bucket and path conventions
# ---------------------------------------------------------------------------
def test_lakehouse_bucket_name():
    """LAKEHOUSE_BUCKET is pinned to the expected bucket name."""
    expected_bucket = "stonks-lakehouse"
    assert LAKEHOUSE_BUCKET == expected_bucket
def test_warehouse_prefix():
    """WAREHOUSE_PREFIX is pinned to the expected top-level prefix."""
    expected_prefix = "warehouse"
    assert WAREHOUSE_PREFIX == expected_prefix
def test_all_paths_use_warehouse_prefix():
    """The partition path of every table begins with ``warehouse/<table>/``."""
    for table in ALL_TABLES:
        # Every extra partition key gets the same placeholder value.
        extras = dict.fromkeys(TABLE_PARTITIONS[table].extra_keys, "v")
        path = partition_path(table, NOW, extras)
        follows_convention = path.startswith(f"warehouse/{table}/")
        assert follows_convention, (
            f"Path for {table} doesn't follow convention: {path}"
        )
# ---------------------------------------------------------------------------
# 11. Iceberg table locations point to correct MinIO paths
# ---------------------------------------------------------------------------
def test_iceberg_locations_match_ddl_external_locations():
    """Each Iceberg table location is ``s3a://<bucket>/<prefix>/<table>/``.

    The expected value is built from LAKEHOUSE_BUCKET and WAREHOUSE_PREFIX;
    the DDL files themselves are not read by this test.
    """
    location_root = f"s3a://{LAKEHOUSE_BUCKET}/{WAREHOUSE_PREFIX}"
    for td in get_all_table_defs():
        expected = f"{location_root}/{td.table_name}/"
        assert td.location == expected, (
            f"Iceberg location mismatch for {td.table_name}: "
            f"got={td.location}, expected={expected}"
        )
# ---------------------------------------------------------------------------
# 12. Partition values are injected correctly
# ---------------------------------------------------------------------------
def test_partition_values_dt_only():
    """With no extras, partition_values yields only ``dt`` set to the timestamp's date."""
    expected = {"dt": date(2026, 4, 11)}
    assert partition_values(NOW) == expected
def test_partition_values_with_model_version():
    """Extra keys passed to partition_values are returned alongside ``dt``."""
    extras = {"model_version": "2.0.0"}
    expected = {"dt": date(2026, 4, 11), "model_version": "2.0.0"}
    assert partition_values(NOW, extras) == expected
def test_partition_values_from_date():
    """partition_values accepts a plain ``date`` and returns it unchanged as ``dt``."""
    day = date(2026, 4, 11)
    assert partition_values(day) == {"dt": day}