phase 14-15: docker build validation and helm deployment

2026-04-11 11:59:45 -07:00
parent 7394d241c9
commit ce10afa034
179 changed files with 32559 additions and 576 deletions
@@ -0,0 +1,420 @@
+"""Iceberg table creation and metadata management for analytical datasets.
+
+Manages Iceberg tables in Trino's Iceberg catalog, providing:
+- Table creation with proper schemas and partition specs
+- Schema synchronization between PyArrow definitions and Iceberg tables
+- Table metadata inspection (existence checks, schema retrieval, partition listing)
+
+The Iceberg catalog complements the existing Hive-compatible partition layout.
+Parquet files written by the lake publisher are stored in the same MinIO paths,
+but Iceberg metadata enables schema evolution, snapshot isolation, and better
+partition pruning via Trino's Iceberg connector.
+
+Requirements: 9.4, 9.5, 10.1, N4, N6
+Design ref: Section 5.3 (Lakehouse model), Section 4.12 (SQL Query Engine)
+"""
+from __future__ import annotations
+
+import logging
+from dataclasses import dataclass
+from typing import Any
+
+import pyarrow as pa
+from trino.dbapi import connect as trino_connect
+
+from services.lake_publisher.partitions import (
+    LAKEHOUSE_BUCKET,
+    TABLE_PARTITIONS,
+    WAREHOUSE_PREFIX,
+    PartitionSpec,
+)
+from services.lake_publisher.worker import (
+    COMPANY_EVENTS_SCHEMA,
+    DOCUMENTS_SCHEMA,
+    DOCUMENT_EXTRACTIONS_SCHEMA,
+    MARKET_BARS_SCHEMA,
+    MARKET_QUOTES_SCHEMA,
+    MODEL_PERFORMANCE_SCHEMA,
+    PNL_DAILY_SCHEMA,
+    POSITIONS_DAILY_SCHEMA,
+    PREDICTION_VS_OUTCOME_SCHEMA,
+    TRADE_FILLS_SCHEMA,
+    TRADE_ORDERS_SCHEMA,
+    TRADE_SIGNALS_SCHEMA,
+)
+
+logger = logging.getLogger(__name__)
+
+ICEBERG_CATALOG = "iceberg"
+ICEBERG_SCHEMA = "stonks"
+
+
+def _get_iceberg_catalog() -> str:
+    """Return the Iceberg catalog name from env or default."""
+    import os
+    return os.getenv("TRINO_ICEBERG_CATALOG", ICEBERG_CATALOG)
+
+# Map PyArrow types to Trino/Iceberg SQL types.
+_ARROW_TO_TRINO: dict[str, str] = {
+    "string": "VARCHAR",
+    "utf8": "VARCHAR",
+    "large_string": "VARCHAR",
+    "large_utf8": "VARCHAR",
+    "float64": "DOUBLE",
+    "double": "DOUBLE",
+    "float32": "REAL",
+    "float": "REAL",
+    "int8": "TINYINT",
+    "int16": "SMALLINT",
+    "int32": "INTEGER",
+    "int64": "BIGINT",
+    "bool": "BOOLEAN",
+    "date32": "DATE",
+    "date32[day]": "DATE",
+    "date64": "DATE",
+}
+
+
+def _arrow_type_to_trino(arrow_type: pa.DataType) -> str:
+    """Convert a PyArrow data type to a Trino SQL type string."""
+    type_str = str(arrow_type)
+
+    # Handle timestamp types (with or without timezone)
+    if type_str.startswith("timestamp"):
+        if "tz=" in type_str:
+            return "TIMESTAMP(6) WITH TIME ZONE"
+        return "TIMESTAMP(6)"
+
+    # Direct lookup
+    result = _ARROW_TO_TRINO.get(type_str)
+    if result:
+        return result
+
+    # Fallback for type IDs
+    if pa.types.is_string(arrow_type) or pa.types.is_large_string(arrow_type):
+        return "VARCHAR"
+    if pa.types.is_floating(arrow_type):
+        return "DOUBLE"
+    if pa.types.is_integer(arrow_type):
+        return "BIGINT"
+    if pa.types.is_boolean(arrow_type):
+        return "BOOLEAN"
+    if pa.types.is_date(arrow_type):
+        return "DATE"
+    if pa.types.is_timestamp(arrow_type):
+        return "TIMESTAMP(6) WITH TIME ZONE"
+
+    raise ValueError(f"Unsupported PyArrow type for Iceberg DDL: {arrow_type}")
+
+
+
+# Registry mapping table names to their PyArrow schemas.
+# get_all_table_defs() and get_table_def() look schemas up here by table name:
+# a table present in TABLE_PARTITIONS but absent from this registry is skipped
+# with a warning by the former and rejected with ValueError by the latter.
+TABLE_SCHEMAS: dict[str, pa.Schema] = {
+    "market_bars": MARKET_BARS_SCHEMA,
+    "market_quotes": MARKET_QUOTES_SCHEMA,
+    "company_events": COMPANY_EVENTS_SCHEMA,
+    "documents": DOCUMENTS_SCHEMA,
+    "document_extractions": DOCUMENT_EXTRACTIONS_SCHEMA,
+    "trade_signals": TRADE_SIGNALS_SCHEMA,
+    "trade_orders": TRADE_ORDERS_SCHEMA,
+    "trade_fills": TRADE_FILLS_SCHEMA,
+    "positions_daily": POSITIONS_DAILY_SCHEMA,
+    "pnl_daily": PNL_DAILY_SCHEMA,
+    "prediction_vs_outcome": PREDICTION_VS_OUTCOME_SCHEMA,
+    "model_performance": MODEL_PERFORMANCE_SCHEMA,
+}
+
+
+@dataclass(frozen=True)
+class IcebergTableDef:
+    """Definition for an Iceberg table derived from PyArrow schema + partition spec."""
+
+    table_name: str
+    schema: pa.Schema
+    partition_spec: PartitionSpec
+
+    @property
+    def qualified_name(self) -> str:
+        return f"{ICEBERG_CATALOG}.{ICEBERG_SCHEMA}.{self.table_name}"
+
+    @property
+    def location(self) -> str:
+        return f"s3a://{LAKEHOUSE_BUCKET}/{WAREHOUSE_PREFIX}/{self.table_name}/"
+
+    def column_defs_sql(self) -> list[str]:
+        """Generate SQL column definitions from the PyArrow schema.
+
+        Partition columns are included in the column list (Iceberg stores them
+        in the data files, unlike Hive external tables).
+        """
+        cols: list[str] = []
+        for i in range(len(self.schema)):
+            name = self.schema.field(i).name
+            arrow_type = self.schema.field(i).type
+            trino_type = _arrow_type_to_trino(arrow_type)
+            cols.append(f"    {name} {trino_type}")
+        return cols
+
+    def partition_keys_sql(self) -> str:
+        """Generate the partitioning clause for CREATE TABLE."""
+        keys = list(self.partition_spec.all_keys)
+        if not keys:
+            return ""
+        quoted = ", ".join(f"'{k}'" for k in keys)
+        return f"partitioning = ARRAY[{quoted}]"
+
+    def create_table_sql(self) -> str:
+        """Generate a CREATE TABLE IF NOT EXISTS statement for Trino's Iceberg catalog."""
+        col_lines = ",\n".join(self.column_defs_sql())
+        with_clauses = [
+            "format = 'PARQUET'",
+            f"location = '{self.location}'",
+        ]
+        part_sql = self.partition_keys_sql()
+        if part_sql:
+            with_clauses.append(part_sql)
+
+        with_block = ",\n    ".join(with_clauses)
+
+        return (
+            f"CREATE TABLE IF NOT EXISTS {self.qualified_name} (\n"
+            f"{col_lines}\n"
+            f") WITH (\n"
+            f"    {with_block}\n"
+            f")"
+        )
+
+
+def get_all_table_defs() -> list[IcebergTableDef]:
+    """Build IcebergTableDef for every registered analytical table."""
+    defs: list[IcebergTableDef] = []
+    for table_name, partition_spec in TABLE_PARTITIONS.items():
+        schema = TABLE_SCHEMAS.get(table_name)
+        if schema is None:
+            logger.warning("No PyArrow schema for table %s, skipping", table_name)
+            continue
+        defs.append(IcebergTableDef(
+            table_name=table_name,
+            schema=schema,
+            partition_spec=partition_spec,
+        ))
+    return defs
+
+
+def get_table_def(table_name: str) -> IcebergTableDef:
+    """Get the IcebergTableDef for a single table by name."""
+    if table_name not in TABLE_PARTITIONS:
+        raise ValueError(f"Unknown table: {table_name}")
+    schema = TABLE_SCHEMAS.get(table_name)
+    if schema is None:
+        raise ValueError(f"No PyArrow schema registered for table: {table_name}")
+    return IcebergTableDef(
+        table_name=table_name,
+        schema=schema,
+        partition_spec=TABLE_PARTITIONS[table_name],
+    )
+
+
+
+@dataclass
+class IcebergManager:
+    """Manages Iceberg tables via Trino's Iceberg catalog.
+
+    Provides table creation, existence checks, schema inspection,
+    and metadata operations against the Trino Iceberg connector.
+    """
+
+    host: str = "localhost"
+    port: int = 8080
+    user: str = "stonks"
+    catalog: str = ICEBERG_CATALOG
+    schema: str = ICEBERG_SCHEMA
+
+    def _get_connection(self) -> Any:
+        """Create a Trino DBAPI connection."""
+        return trino_connect(
+            host=self.host,
+            port=self.port,
+            user=self.user,
+            catalog=self.catalog,
+            schema=self.schema,
+        )
+
+    def _execute(self, sql: str) -> list[list[Any]]:
+        """Execute a SQL statement and return all rows."""
+        conn = self._get_connection()
+        try:
+            cursor = conn.cursor()
+            cursor.execute(sql)
+            return cursor.fetchall()
+        finally:
+            conn.close()
+
+    def _execute_no_fetch(self, sql: str) -> None:
+        """Execute a DDL statement that returns no rows."""
+        conn = self._get_connection()
+        try:
+            cursor = conn.cursor()
+            cursor.execute(sql)
+            # DDL statements in Trino still need fetchall to complete
+            try:
+                cursor.fetchall()
+            except Exception:
+                pass
+        finally:
+            conn.close()
+
+    def ensure_schema(self) -> None:
+        """Create the Iceberg schema if it doesn't exist."""
+        sql = f"CREATE SCHEMA IF NOT EXISTS {self.catalog}.{self.schema}"
+        logger.info("Ensuring Iceberg schema: %s.%s", self.catalog, self.schema)
+        self._execute_no_fetch(sql)
+
+    def table_exists(self, table_name: str) -> bool:
+        """Check if an Iceberg table exists."""
+        sql = (
+            f"SELECT table_name FROM {self.catalog}.information_schema.tables "
+            f"WHERE table_schema = '{self.schema}' AND table_name = '{table_name}'"
+        )
+        rows = self._execute(sql)
+        return len(rows) > 0
+
+    def create_table(self, table_name: str) -> bool:
+        """Create a single Iceberg table if it doesn't exist.
+
+        Returns True if the table was created, False if it already existed.
+        """
+        table_def = get_table_def(table_name)
+        ddl = table_def.create_table_sql()
+        logger.info("Creating Iceberg table: %s", table_def.qualified_name)
+        self._execute_no_fetch(ddl)
+        logger.info("Iceberg table ready: %s", table_def.qualified_name)
+        return True
+
+    def create_all_tables(self) -> dict[str, bool]:
+        """Create all registered Iceberg tables.
+
+        Returns a dict mapping table_name -> True (created) or False (error).
+        """
+        self.ensure_schema()
+        results: dict[str, bool] = {}
+        for table_def in get_all_table_defs():
+            try:
+                self.create_table(table_def.table_name)
+                results[table_def.table_name] = True
+            except Exception:
+                logger.exception("Failed to create Iceberg table: %s", table_def.table_name)
+                results[table_def.table_name] = False
+        return results
+
+    def get_table_schema(self, table_name: str) -> list[dict[str, str]]:
+        """Retrieve the column schema of an Iceberg table from Trino.
+
+        Returns a list of dicts with 'column_name', 'data_type', and 'is_nullable'.
+        """
+        sql = (
+            f"SELECT column_name, data_type, is_nullable "
+            f"FROM {self.catalog}.information_schema.columns "
+            f"WHERE table_schema = '{self.schema}' AND table_name = '{table_name}' "
+            f"ORDER BY ordinal_position"
+        )
+        rows = self._execute(sql)
+        return [
+            {"column_name": r[0], "data_type": r[1], "is_nullable": r[2]}
+            for r in rows
+        ]
+
+    def get_table_snapshots(self, table_name: str) -> list[dict[str, Any]]:
+        """List Iceberg snapshots for a table (useful for auditing and rollback).
+
+        Returns snapshot metadata from Trino's $snapshots metadata table.
+        """
+        qualified = f"{self.catalog}.{self.schema}.{table_name}"
+        sql = f'SELECT * FROM "{qualified}$snapshots"'
+        try:
+            rows = self._execute(sql)
+            return [{"snapshot_id": r[0], "parent_id": r[1], "operation": r[2],
+                      "manifest_list": r[3], "summary": r[4]} for r in rows]
+        except Exception:
+            logger.debug("Could not read snapshots for %s (table may be empty)", table_name)
+            return []
+
+    def get_table_partitions(self, table_name: str) -> list[dict[str, Any]]:
+        """List partition values for an Iceberg table.
+
+        Returns partition metadata from Trino's $partitions metadata table.
+        """
+        qualified = f"{self.catalog}.{self.schema}.{table_name}"
+        sql = f'SELECT * FROM "{qualified}$partitions"'
+        try:
+            rows = self._execute(sql)
+            return [{"row": r} for r in rows]
+        except Exception:
+            logger.debug("Could not read partitions for %s (table may be empty)", table_name)
+            return []
+
+    def list_tables(self) -> list[str]:
+        """List all tables in the Iceberg schema."""
+        sql = (
+            f"SELECT table_name FROM {self.catalog}.information_schema.tables "
+            f"WHERE table_schema = '{self.schema}' ORDER BY table_name"
+        )
+        rows = self._execute(sql)
+        return [r[0] for r in rows]
+
+    def drop_table(self, table_name: str) -> None:
+        """Drop an Iceberg table (for testing/reset purposes)."""
+        qualified = f"{self.catalog}.{self.schema}.{table_name}"
+        logger.warning("Dropping Iceberg table: %s", qualified)
+        self._execute_no_fetch(f"DROP TABLE IF EXISTS {qualified}")
+
+    def sync_table_schema(self, table_name: str) -> list[str]:
+        """Compare the expected PyArrow schema with the actual Iceberg table schema.
+
+        If columns are missing from the Iceberg table, adds them via ALTER TABLE.
+        Returns a list of columns that were added.
+
+        This supports forward-only schema evolution — columns are never dropped.
+        """
+        table_def = get_table_def(table_name)
+        existing = self.get_table_schema(table_name)
+        existing_names = {col["column_name"] for col in existing}
+
+        added: list[str] = []
+        qualified = table_def.qualified_name
+
+        for i in range(len(table_def.schema)):
+            col_name = table_def.schema.field(i).name
+            if col_name not in existing_names:
+                trino_type = _arrow_type_to_trino(table_def.schema.field(i).type)
+                alter_sql = f"ALTER TABLE {qualified} ADD COLUMN {col_name} {trino_type}"
+                logger.info("Adding column %s to %s", col_name, qualified)
+                self._execute_no_fetch(alter_sql)
+                added.append(col_name)
+
+        return added
+
+    def sync_all_schemas(self) -> dict[str, list[str]]:
+        """Sync schemas for all registered tables. Returns table_name -> added columns."""
+        results: dict[str, list[str]] = {}
+        for table_def in get_all_table_defs():
+            try:
+                if self.table_exists(table_def.table_name):
+                    added = self.sync_table_schema(table_def.table_name)
+                    results[table_def.table_name] = added
+                else:
+                    logger.info("Table %s doesn't exist yet, skipping sync", table_def.table_name)
+                    results[table_def.table_name] = []
+            except Exception:
+                logger.exception("Failed to sync schema for %s", table_def.table_name)
+                results[table_def.table_name] = []
+        return results
+
+
+def create_iceberg_manager_from_config(
+    host: str = "localhost",
+    port: int = 8080,
+    user: str = "stonks",
+) -> IcebergManager:
+    """Factory that creates an IcebergManager from explicit connection params."""
+    return IcebergManager(host=host, port=port, user=user)