phase 14-15: docker build validation and helm deployment
This commit is contained in:
@@ -0,0 +1,603 @@
|
||||
"""Validate lake publication and Trino query correctness over partitioned MinIO datasets.
|
||||
|
||||
Ensures that:
|
||||
- PyArrow schemas in worker.py match the lakehouse DDL column definitions
|
||||
- Iceberg DDL generated from PyArrow schemas is consistent with lakehouse DDL
|
||||
- Partition layouts are Hive-compatible and discoverable by Trino
|
||||
- Published Parquet files embed partition columns in the data
|
||||
- Cross-table join keys used by views are present and type-consistent
|
||||
- All 12 analytical fact tables have aligned schema definitions across layers
|
||||
|
||||
Requirements: 9.4, 9.5, 10.1, 10.3, N4, N6
|
||||
Design ref: Section 5.2, 5.3, 7, 8.4
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import io
|
||||
import re
|
||||
from datetime import date, datetime, timezone
|
||||
from pathlib import Path
|
||||
from unittest.mock import MagicMock
|
||||
|
||||
import pyarrow as pa
|
||||
import pyarrow.parquet as pq
|
||||
|
||||
from services.lake_publisher.iceberg import (
|
||||
ICEBERG_CATALOG,
|
||||
ICEBERG_SCHEMA,
|
||||
TABLE_SCHEMAS,
|
||||
IcebergTableDef,
|
||||
_arrow_type_to_trino,
|
||||
get_all_table_defs,
|
||||
get_table_def,
|
||||
)
|
||||
from services.lake_publisher.partitions import (
|
||||
LAKEHOUSE_BUCKET,
|
||||
TABLE_PARTITIONS,
|
||||
WAREHOUSE_PREFIX,
|
||||
partition_path,
|
||||
partition_values,
|
||||
)
|
||||
from services.lake_publisher.worker import (
|
||||
COMPANY_EVENTS_SCHEMA,
|
||||
DOCUMENTS_SCHEMA,
|
||||
DOCUMENT_EXTRACTIONS_SCHEMA,
|
||||
MARKET_BARS_SCHEMA,
|
||||
MARKET_QUOTES_SCHEMA,
|
||||
MODEL_PERFORMANCE_SCHEMA,
|
||||
PNL_DAILY_SCHEMA,
|
||||
POSITIONS_DAILY_SCHEMA,
|
||||
PREDICTION_VS_OUTCOME_SCHEMA,
|
||||
TRADE_FILLS_SCHEMA,
|
||||
TRADE_ORDERS_SCHEMA,
|
||||
TRADE_SIGNALS_SCHEMA,
|
||||
publish_market_bar,
|
||||
publish_document_fact,
|
||||
publish_document_extraction,
|
||||
publish_trade_signal,
|
||||
publish_trade_order,
|
||||
publish_trade_fill,
|
||||
publish_position_daily,
|
||||
publish_pnl_daily,
|
||||
publish_company_event,
|
||||
publish_market_quote,
|
||||
publish_prediction_fact,
|
||||
publish_model_performance,
|
||||
)
|
||||
from services.shared.schemas import (
|
||||
ActionType,
|
||||
ModelMetadata,
|
||||
PositionSizing,
|
||||
Recommendation,
|
||||
RecommendationMode,
|
||||
)
|
||||
|
||||
# Fixed, timezone-aware (UTC) timestamp reused by the tests below.
NOW = datetime(2026, 4, 11, 14, 30, 0, tzinfo=timezone.utc)

# Relative directory expected to hold one "<table>.sql" DDL file per table.
LAKEHOUSE_DDL_DIR = Path("lakehouse/schemas")

# The 12 analytical fact tables every layer is expected to define.
ALL_TABLES = [
    "market_bars", "market_quotes", "company_events",
    "documents", "document_extractions",
    "trade_signals", "trade_orders", "trade_fills",
    "positions_daily", "pnl_daily",
    "prediction_vs_outcome", "model_performance",
]

# Table name -> PyArrow schema imported from the lake publisher worker.
PYARROW_SCHEMAS: dict[str, pa.Schema] = {
    "market_bars": MARKET_BARS_SCHEMA,
    "market_quotes": MARKET_QUOTES_SCHEMA,
    "company_events": COMPANY_EVENTS_SCHEMA,
    "documents": DOCUMENTS_SCHEMA,
    "document_extractions": DOCUMENT_EXTRACTIONS_SCHEMA,
    "trade_signals": TRADE_SIGNALS_SCHEMA,
    "trade_orders": TRADE_ORDERS_SCHEMA,
    "trade_fills": TRADE_FILLS_SCHEMA,
    "positions_daily": POSITIONS_DAILY_SCHEMA,
    "pnl_daily": PNL_DAILY_SCHEMA,
    "prediction_vs_outcome": PREDICTION_VS_OUTCOME_SCHEMA,
    "model_performance": MODEL_PERFORMANCE_SCHEMA,
}
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Helpers: parse lakehouse DDL SQL files
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def _parse_ddl_columns(sql_path: Path) -> list[tuple[str, str]]:
    """Parse column definitions from a lakehouse DDL SQL file.

    Only the parenthesised column list between ``CREATE TABLE ... (`` and the
    closing ``)`` that precedes ``WITH`` is read, one column per line.
    Partition columns are returned only when they are declared in that column
    list; the ``partitioned_by`` property is not consulted here.

    Args:
        sql_path: Path to the ``.sql`` file, read as UTF-8.

    Returns:
        ``(column_name, trino_type)`` tuples in declaration order, with the
        name lower-cased and the type upper-cased. An empty list is returned
        when no ``CREATE TABLE ... ( ... ) WITH`` statement is found.
    """
    text = sql_path.read_text(encoding="utf-8")
    # Greedy match on purpose: types such as "TIMESTAMP(6) WITH TIME ZONE"
    # also contain ") WITH", so the block must extend to the LAST ") WITH".
    match = re.search(
        r"CREATE TABLE[^(]+\((.*)\)\s*WITH",
        text, re.DOTALL | re.IGNORECASE,
    )
    if not match:
        return []

    columns: list[tuple[str, str]] = []
    for raw_line in match.group(1).splitlines():
        # Remove any "-- ..." comment (whole-line or trailing) BEFORE
        # stripping the separator comma, so "col TYPE, -- note" parses as
        # ("col", "TYPE") instead of leaking the comment into the type.
        line = raw_line.split("--", 1)[0].strip().rstrip(",").rstrip()
        if not line:
            continue
        # Split only on the first whitespace run to keep multi-word types intact.
        parts = line.split(None, 1)
        if len(parts) == 2:
            columns.append((parts[0].lower(), parts[1].upper().strip()))
    return columns
|
||||
|
||||
|
||||
def _parse_ddl_partitions(sql_path: Path) -> list[str]:
    """Parse partition keys from a lakehouse DDL SQL file.

    Args:
        sql_path: Path to the ``.sql`` file, read as UTF-8.

    Returns:
        The keys listed in the ``partitioned_by = ARRAY[...]`` clause, in
        order, with surrounding quotes and whitespace removed. Returns an
        empty list when the clause is absent or lists no keys.
    """
    text = sql_path.read_text(encoding="utf-8")
    match = re.search(r"partitioned_by\s*=\s*ARRAY\[([^\]]+)\]", text, re.IGNORECASE)
    if not match:
        return []
    cleaned = (
        item.strip().strip("'\"").strip()
        for item in match.group(1).split(",")
    )
    # Drop blanks so a trailing comma or a whitespace-only array does not
    # produce a bogus "" partition key.
    return [key for key in cleaned if key]
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# 1. All 12 tables are registered across all layers
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def test_all_tables_in_partition_registry():
    """Each name in ALL_TABLES must be a key of TABLE_PARTITIONS."""
    for table in ALL_TABLES:
        is_registered = table in TABLE_PARTITIONS
        assert is_registered, f"{table} missing from TABLE_PARTITIONS"
|
||||
|
||||
|
||||
def test_all_tables_in_schema_registry():
    """Each name in ALL_TABLES must be a key of TABLE_SCHEMAS."""
    for table in ALL_TABLES:
        has_schema = table in TABLE_SCHEMAS
        assert has_schema, f"{table} missing from TABLE_SCHEMAS"
|
||||
|
||||
|
||||
def test_all_tables_have_ddl_files():
    """A ``<table>.sql`` file must exist under LAKEHOUSE_DDL_DIR for each table."""
    for table in ALL_TABLES:
        ddl_path = LAKEHOUSE_DDL_DIR.joinpath(f"{table}.sql")
        assert ddl_path.exists(), f"Missing DDL file: {ddl_path}"
|
||||
|
||||
|
||||
def test_all_tables_have_iceberg_defs():
    """Each name in ALL_TABLES appears among the defs from get_all_table_defs()."""
    def_names = {table_def.table_name for table_def in get_all_table_defs()}
    for table in ALL_TABLES:
        assert table in def_names, f"{table} missing from Iceberg table defs"
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# 2. PyArrow schema ↔ Lakehouse DDL column alignment
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def test_pyarrow_columns_match_ddl():
    """Column names, in order, agree between each PyArrow schema and its DDL file.

    Tables whose DDL file does not exist are skipped by this test.
    """
    for table in ALL_TABLES:
        ddl_path = LAKEHOUSE_DDL_DIR / f"{table}.sql"
        if not ddl_path.exists():
            continue

        ddl_col_names = [name for name, _type in _parse_ddl_columns(ddl_path)]
        arrow_col_names = list(PYARROW_SCHEMAS[table].names)

        assert arrow_col_names == ddl_col_names, (
            f"Column mismatch for {table}:\n"
            f"  PyArrow: {arrow_col_names}\n"
            f"  DDL:     {ddl_col_names}"
        )
|
||||
|
||||
|
||||
def test_pyarrow_types_compatible_with_ddl():
    """Each PyArrow field type, mapped via _arrow_type_to_trino, equals the DDL type.

    Tables whose DDL file does not exist are skipped. A column absent from
    the DDL is compared against the empty string and therefore fails.
    """
    for table in ALL_TABLES:
        ddl_path = LAKEHOUSE_DDL_DIR / f"{table}.sql"
        if not ddl_path.exists():
            continue

        ddl_type_map = dict(_parse_ddl_columns(ddl_path))
        arrow_schema = PYARROW_SCHEMAS[table]

        for col_name, arrow_type in zip(arrow_schema.names, arrow_schema.types):
            trino_type = _arrow_type_to_trino(arrow_type)
            ddl_type = ddl_type_map.get(col_name, "")
            assert trino_type == ddl_type, (
                f"Type mismatch for {table}.{col_name}: "
                f"PyArrow→Trino={trino_type}, DDL={ddl_type}"
            )
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# 3. Partition key alignment across layers
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def test_partition_keys_match_ddl():
    """TABLE_PARTITIONS keys equal the keys parsed from each DDL's partitioned_by.

    Tables whose DDL file does not exist are skipped by this test.
    """
    for table in ALL_TABLES:
        ddl_path = LAKEHOUSE_DDL_DIR / f"{table}.sql"
        if not ddl_path.exists():
            continue

        ddl_parts = _parse_ddl_partitions(ddl_path)
        arrow_parts = list(TABLE_PARTITIONS[table].all_keys)

        assert arrow_parts == ddl_parts, (
            f"Partition key mismatch for {table}: "
            f"TABLE_PARTITIONS={arrow_parts}, DDL={ddl_parts}"
        )
|
||||
|
||||
|
||||
def test_iceberg_partition_keys_match():
    """The ``partitioning`` array in generated Iceberg SQL equals TABLE_PARTITIONS.

    Table defs whose partition spec has no keys are not checked.
    """
    for td in get_all_table_defs():
        expected_keys = list(TABLE_PARTITIONS[td.table_name].all_keys)
        # Pull the partitioning clause out of the generated CREATE TABLE text.
        match = re.search(r"partitioning = ARRAY\[([^\]]+)\]", td.create_table_sql())
        if not expected_keys:
            continue

        assert match is not None, f"No partitioning clause for {td.table_name}"
        parsed = [item.strip().strip("'") for item in match.group(1).split(",")]
        assert parsed == expected_keys, (
            f"Iceberg partition mismatch for {td.table_name}: "
            f"expected={expected_keys}, got={parsed}"
        )
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# 4. Partition columns are embedded in PyArrow schemas
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def test_partition_columns_in_pyarrow_schemas():
    """Every partition key of a table is also a field of its PyArrow schema.

    The intent stated by the original author is that partition columns are
    written into the Parquet data rather than only inferred from paths.
    """
    for table in ALL_TABLES:
        schema = PYARROW_SCHEMAS[table]
        spec = TABLE_PARTITIONS[table]
        col_names = set(schema.names)
        for key in spec.all_keys:
            assert key in col_names, (
                f"Partition column '{key}' missing from PyArrow schema for {table}"
            )
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# 5. Hive-compatible partition path format
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def test_partition_paths_are_hive_compatible():
    """Generated paths have the warehouse prefix, key=value parts and a .parquet suffix."""
    for table in ALL_TABLES:
        spec = TABLE_PARTITIONS[table]
        # Give every extra partition key the same placeholder value.
        extras = {key: "test_val" for key in spec.extra_keys} if spec.extra_keys else {}
        path = partition_path(table, NOW, extras)

        # Prefix: <warehouse>/<table>/
        assert path.startswith(f"{WAREHOUSE_PREFIX}/{table}/"), (
            f"Path for {table} doesn't start with warehouse prefix: {path}"
        )
        # Date partition derived from NOW.
        assert "dt=2026-04-11" in path, f"Missing dt partition in path for {table}: {path}"
        # Object name suffix.
        assert path.endswith(".parquet"), f"Path for {table} doesn't end with .parquet: {path}"
        # Each extra key must show up as key=value.
        for key in spec.extra_keys:
            assert f"{key}=test_val" in path, (
                f"Missing extra partition {key} in path for {table}: {path}"
            )
|
||||
|
||||
|
||||
def test_partition_path_dt_from_date_object():
    """partition_path accepts a plain ``date`` and renders it as ``dt=YYYY-MM-DD``."""
    path = partition_path("market_bars", date(2026, 4, 11))
    assert "dt=2026-04-11" in path
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# 6. Published Parquet files contain partition columns in data
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def _capture_parquet(mock_client: MagicMock) -> pa.Table:
    """Read back the Parquet payload passed to a mocked client's ``put_object``.

    Args:
        mock_client: Mock whose ``put_object`` has been called by a publisher.
            The payload is taken from the third positional argument of the
            most recent call.

    Returns:
        The table parsed from that buffer.

    Raises:
        AssertionError: If ``put_object`` was never called on the mock.
    """
    put_call = mock_client.put_object.call_args
    # call_args is None for a mock that was never invoked; fail with a clear
    # message instead of "'NoneType' object is not subscriptable".
    assert put_call is not None, "put_object was never called on the mock client"
    buf = put_call[0][2]
    # Rewind so the read starts at the beginning of the buffer.
    buf.seek(0)
    return pq.read_table(buf)
|
||||
|
||||
|
||||
def test_published_market_bar_has_dt_column():
    """A published market bar carries a ``dt`` column holding NOW's date."""
    client = MagicMock()
    publish_market_bar(
        client,
        ticker="AAPL",
        open_price=150.0,
        high_price=155.0,
        low_price=149.0,
        close_price=153.0,
        volume=1000000,
        bar_timestamp=NOW,
        source="test",
    )
    published = _capture_parquet(client)
    assert "dt" in published.column_names
    first_dt = published.column("dt")[0].as_py()
    assert first_dt == date(2026, 4, 11)
|
||||
|
||||
|
||||
def test_published_document_extraction_has_partition_columns():
    """A published document extraction carries ``dt`` and ``model_version`` values."""
    client = MagicMock()
    publish_document_extraction(
        client,
        document_id="doc-1",
        ticker="AAPL",
        sentiment="positive",
        impact_score=0.7,
        catalyst_type="earnings",
        confidence=0.85,
        extraction_at=NOW,
        model_name="test-model",
        prompt_version="v1",
        schema_version="2.0.0",
    )
    published = _capture_parquet(client)
    names = published.column_names
    assert "dt" in names
    assert "model_version" in names
    assert published.column("dt")[0].as_py() == date(2026, 4, 11)
    assert published.column("model_version")[0].as_py() == "2.0.0"
|
||||
|
||||
|
||||
def test_published_prediction_vs_outcome_has_partition_columns():
    """A published prediction fact contains ``dt`` and ``model_version`` columns."""
    sizing = PositionSizing(portfolio_pct=0.02, max_loss_pct=0.005)
    metadata = ModelMetadata(provider="ollama", model_name="test-v1")
    rec = Recommendation(
        recommendation_id="rec-001",
        ticker="AAPL",
        action=ActionType.BUY,
        mode=RecommendationMode.PAPER_ELIGIBLE,
        confidence=0.72,
        time_horizon="swing_1d_10d",
        thesis="test",
        invalidation_conditions=["x"],
        position_sizing=sizing,
        evidence_refs=["doc1"],
        model_metadata=metadata,
        generated_at=NOW,
    )
    client = MagicMock()
    publish_prediction_fact(client, rec)
    names = _capture_parquet(client).column_names
    assert "dt" in names
    assert "model_version" in names
|
||||
|
||||
|
||||
def test_published_model_performance_has_partition_columns():
    """A published model-performance row has ``dt`` and ``model_version`` == "2.0.0"."""
    client = MagicMock()
    publish_model_performance(
        client,
        document_id="doc-1",
        model_name="gpt-oss:20b",
        success=True,
        total_duration_ms=1500,
        recorded_at=NOW,
        schema_version="2.0.0",
    )
    published = _capture_parquet(client)
    names = published.column_names
    assert "dt" in names
    assert "model_version" in names
    assert published.column("model_version")[0].as_py() == "2.0.0"
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
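# 7. Parquet column names match the PyArrow schema for the publishers tested below
#    (document_extractions, trade_signals, prediction_vs_outcome and
#    model_performance are not covered in this section)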
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def _publish_and_verify_schema(table_name: str, publish_fn, expected_schema: pa.Schema):
    """Run a publisher against a mock client and compare Parquet column names.

    Args:
        table_name: Name used only in the failure message.
        publish_fn: Callable invoked with the mock client as its sole argument.
        expected_schema: Schema whose field names (in order) the captured
            Parquet table must have. Only names are compared, not types.
    """
    client = MagicMock()
    publish_fn(client)
    table = _capture_parquet(client)
    expected_names = list(expected_schema.names)
    actual_names = list(table.column_names)
    assert actual_names == expected_names, (
        f"Parquet column mismatch for {table_name}: "
        f"got={list(table.column_names)}, expected={expected_names}"
    )
|
||||
|
||||
|
||||
def test_parquet_schema_market_bars():
    """Parquet written by publish_market_bar has MARKET_BARS_SCHEMA's column names."""
    def publish(c):
        return publish_market_bar(
            c, "AAPL", 150.0, 155.0, 149.0, 153.0, 1000000, NOW, "test",
        )

    _publish_and_verify_schema("market_bars", publish, MARKET_BARS_SCHEMA)
|
||||
|
||||
|
||||
def test_parquet_schema_market_quotes():
    """Parquet written by publish_market_quote has MARKET_QUOTES_SCHEMA's column names."""
    def publish(c):
        return publish_market_quote(
            c, "AAPL", 150.0, 150.5, 150.25, NOW, "test",
        )

    _publish_and_verify_schema("market_quotes", publish, MARKET_QUOTES_SCHEMA)
|
||||
|
||||
|
||||
def test_parquet_schema_company_events():
    """Parquet written by publish_company_event has COMPANY_EVENTS_SCHEMA's column names."""
    def publish(c):
        return publish_company_event(
            c, "evt-1", "AAPL", "earnings", "Q1 Earnings", NOW, "test",
        )

    _publish_and_verify_schema("company_events", publish, COMPANY_EVENTS_SCHEMA)
|
||||
|
||||
|
||||
def test_parquet_schema_documents():
    """Parquet written by publish_document_fact has DOCUMENTS_SCHEMA's column names."""
    def publish(c):
        return publish_document_fact(
            c, "doc-1", "article", "news_api", "AAPL", "Reuters", "Test", NOW, "hash123",
        )

    _publish_and_verify_schema("documents", publish, DOCUMENTS_SCHEMA)
|
||||
|
||||
|
||||
def test_parquet_schema_trade_orders():
    """Parquet written by publish_trade_order has TRADE_ORDERS_SCHEMA's column names."""
    def publish(c):
        return publish_trade_order(
            c, "ord-1", "AAPL", "buy", "market", 10.0, None, "filled", "acct-1", NOW,
        )

    _publish_and_verify_schema("trade_orders", publish, TRADE_ORDERS_SCHEMA)
|
||||
|
||||
|
||||
def test_parquet_schema_trade_fills():
    """Parquet written by publish_trade_fill has TRADE_FILLS_SCHEMA's column names."""
    def publish(c):
        return publish_trade_fill(
            c, "fill-1", "ord-1", "AAPL", "buy", 150.25, 10.0, "acct-1", NOW,
        )

    _publish_and_verify_schema("trade_fills", publish, TRADE_FILLS_SCHEMA)
|
||||
|
||||
|
||||
def test_parquet_schema_positions_daily():
    """Parquet written by publish_position_daily has POSITIONS_DAILY_SCHEMA's column names."""
    def publish(c):
        return publish_position_daily(
            c, "AAPL", 100.0, 145.0, 150.0, 500.0, "acct-1", NOW,
        )

    _publish_and_verify_schema("positions_daily", publish, POSITIONS_DAILY_SCHEMA)
|
||||
|
||||
|
||||
def test_parquet_schema_pnl_daily():
    """Parquet written by publish_pnl_daily has PNL_DAILY_SCHEMA's column names."""
    def publish(c):
        return publish_pnl_daily(
            c, "AAPL", 200.0, 500.0, 700.0, "acct-1", NOW,
        )

    _publish_and_verify_schema("pnl_daily", publish, PNL_DAILY_SCHEMA)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# 8. Cross-table join keys for views
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def test_prediction_accuracy_view_join_keys():
    """Both prediction_vs_outcome and trade_signals expose recommendation_id and dt.

    These are the columns the prediction_accuracy view is described as
    joining on.
    """
    pvo_cols = set(PREDICTION_VS_OUTCOME_SCHEMA.names)
    ts_cols = set(TRADE_SIGNALS_SCHEMA.names)
    for join_key in ("recommendation_id", "dt"):
        assert join_key in pvo_cols
        assert join_key in ts_cols
|
||||
|
||||
|
||||
def test_paper_trade_scorecard_view_join_keys():
    """pnl_daily and trade_orders both expose ticker, broker_account and dt.

    These are the columns the paper_trade_scorecard view is described as
    joining on.
    """
    pnl_cols = set(PNL_DAILY_SCHEMA.names)
    ord_cols = set(TRADE_ORDERS_SCHEMA.names)
    for key in ("ticker", "broker_account", "dt"):
        assert key in pnl_cols, f"pnl_daily missing join key: {key}"
        assert key in ord_cols, f"trade_orders missing join key: {key}"
|
||||
|
||||
|
||||
def test_paper_trade_detail_view_join_keys():
    """Join columns for paper_trade_detail exist on all three source tables.

    The view is described as joining trade_orders to trade_fills on
    (order_id, dt) and trade_orders to prediction_vs_outcome on
    (recommendation_id, dt). Every column of both key pairs is checked on
    both sides of its join.
    """
    ord_cols = set(TRADE_ORDERS_SCHEMA.names)
    fill_cols = set(TRADE_FILLS_SCHEMA.names)
    pvo_cols = set(PREDICTION_VS_OUTCOME_SCHEMA.names)

    # orders ↔ fills on order_id, dt
    for key in ("order_id", "dt"):
        assert key in ord_cols, f"trade_orders missing join key: {key}"
        assert key in fill_cols, f"trade_fills missing join key: {key}"

    # orders ↔ prediction_vs_outcome on recommendation_id, dt
    # (dt on the prediction_vs_outcome side was previously left unchecked).
    for key in ("recommendation_id", "dt"):
        assert key in ord_cols, f"trade_orders missing join key: {key}"
        assert key in pvo_cols, f"prediction_vs_outcome missing join key: {key}"
|
||||
|
||||
|
||||
def test_signal_hit_rate_view_columns():
    """prediction_vs_outcome exposes the columns listed for the signal_hit_rate view."""
    pvo_cols = set(PREDICTION_VS_OUTCOME_SCHEMA.names)
    required = (
        "dt",
        "model_version",
        "outcome",
        "predicted_confidence",
        "actual_move_pct",
    )
    for column in required:
        assert column in pvo_cols
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# 9. Iceberg DDL consistency with lakehouse DDL
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def test_iceberg_ddl_columns_match_lakehouse_ddl():
    """Column names in generated Iceberg SQL equal those in the lakehouse DDL file.

    Table defs whose DDL file does not exist are skipped by this test.
    """
    for td in get_all_table_defs():
        ddl_path = LAKEHOUSE_DDL_DIR / f"{td.table_name}.sql"
        if not ddl_path.exists():
            continue

        ddl_col_names = [name for name, _type in _parse_ddl_columns(ddl_path)]

        # Greedy capture so parentheses nested in the column list are kept.
        match = re.search(
            r"CREATE TABLE[^(]+\((.*)\)\s*WITH", td.create_table_sql(), re.DOTALL
        )
        assert match is not None, f"Could not parse Iceberg DDL for {td.table_name}"

        iceberg_col_names = []
        for raw_line in match.group(1).strip().split("\n"):
            # First whitespace-separated token of each non-empty line is the name.
            tokens = raw_line.strip().rstrip(",").split()
            if tokens:
                iceberg_col_names.append(tokens[0].lower())

        assert iceberg_col_names == ddl_col_names, (
            f"Iceberg DDL column mismatch for {td.table_name}:\n"
            f"  Iceberg: {iceberg_col_names}\n"
            f"  DDL:     {ddl_col_names}"
        )
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# 10. MinIO bucket and path conventions
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def test_lakehouse_bucket_name():
    """LAKEHOUSE_BUCKET is pinned to the expected bucket name."""
    expected_bucket = "stonks-lakehouse"
    assert LAKEHOUSE_BUCKET == expected_bucket
|
||||
|
||||
|
||||
def test_warehouse_prefix():
    """WAREHOUSE_PREFIX is pinned to the expected key prefix."""
    expected_prefix = "warehouse"
    assert WAREHOUSE_PREFIX == expected_prefix
|
||||
|
||||
|
||||
def test_all_paths_use_warehouse_prefix():
    """Each table's generated partition path begins with ``warehouse/<table>/``."""
    for table in ALL_TABLES:
        extra_keys = TABLE_PARTITIONS[table].extra_keys
        # Placeholder value for every extra partition key.
        extras = dict.fromkeys(extra_keys, "v")
        path = partition_path(table, NOW, extras)
        assert path.startswith(f"warehouse/{table}/"), (
            f"Path for {table} doesn't follow convention: {path}"
        )
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# 11. Iceberg table locations point to correct MinIO paths
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def test_iceberg_locations_match_ddl_external_locations():
    """Each Iceberg table location equals ``s3a://<bucket>/<prefix>/<table>/``.

    The expected value is built from LAKEHOUSE_BUCKET and WAREHOUSE_PREFIX;
    the DDL files themselves are not read by this test.
    """
    base = f"s3a://{LAKEHOUSE_BUCKET}/{WAREHOUSE_PREFIX}"
    for td in get_all_table_defs():
        expected = f"{base}/{td.table_name}/"
        assert td.location == expected, (
            f"Iceberg location mismatch for {td.table_name}: "
            f"got={td.location}, expected={expected}"
        )
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# 12. Partition values are injected correctly
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def test_partition_values_dt_only():
    """With only a timestamp, partition_values returns just the ``dt`` date."""
    expected = {"dt": date(2026, 4, 11)}
    assert partition_values(NOW) == expected
|
||||
|
||||
|
||||
def test_partition_values_with_model_version():
    """Extra keys passed to partition_values are returned alongside ``dt``."""
    expected = {"dt": date(2026, 4, 11), "model_version": "2.0.0"}
    actual = partition_values(NOW, {"model_version": "2.0.0"})
    assert actual == expected
|
||||
|
||||
|
||||
def test_partition_values_from_date():
    """partition_values accepts a plain ``date`` and returns it under ``dt``."""
    day = date(2026, 4, 11)
    assert partition_values(day) == {"dt": date(2026, 4, 11)}
|
||||
Reference in New Issue
Block a user