phase 14-15: docker build validation and helm deployment

2026-04-11 11:59:45 -07:00
parent 7394d241c9
commit ce10afa034
179 changed files with 32559 additions and 576 deletions
@@ -0,0 +1,128 @@
+"""Hive-compatible partition layout conventions for the MinIO lakehouse.
+
+Centralizes partition path generation, partition column injection, and
+bucket provisioning so that all lake publisher writers produce layouts
+that Trino's Hive and Iceberg connectors can discover and prune.
+
+Design ref: Section 5.2, 5.3 (Lakehouse model)
+Requirements: 9.4, 9.5, N4, N6
+
+Layout convention:
+    s3://stonks-lakehouse/warehouse/{table_name}/dt={YYYY-MM-DD}[/{extra_key}={value}]/part-{uuid}.parquet
+
+Rules:
+    - Every fact table is partitioned by ``dt`` (DATE) derived from the row timestamp.
+    - Some tables have a second partition key (e.g. ``model_version``).
+    - Partition columns MUST appear in the Parquet file so Trino can read them
+      without relying solely on path parsing.
+    - File names default to a random suffix (the first 16 hex characters of a
+      UUID4) to avoid collisions on concurrent writes.
+"""
+from __future__ import annotations
+
+import uuid
+from dataclasses import dataclass, field
+from datetime import date, datetime, timezone
+
+
+# Bucket name that s3_uri() places right after the ``s3://`` scheme.
+LAKEHOUSE_BUCKET = "stonks-lakehouse"
+# First path segment of every object key produced by partition_path().
+WAREHOUSE_PREFIX = "warehouse"
+
+
+@dataclass(frozen=True)
+class PartitionSpec:
+    """Immutable description of how one fact table is partitioned.
+
+    Every table is partitioned by ``dt``; ``extra_keys`` lists any further
+    partition columns, in the order their directories should be nested.
+    """
+
+    # Logical table name.
+    table_name: str
+    # Partition keys that follow ``dt``; empty when ``dt`` is the only key.
+    extra_keys: tuple[str, ...] = field(default_factory=tuple)
+
+    @property
+    def all_keys(self) -> tuple[str, ...]:
+        """Return the full ordered key list: ``dt`` followed by ``extra_keys``."""
+        leading = ("dt",)
+        return leading + tuple(self.extra_keys)
+
+
+# Registry of every analytical fact table and its partition keys.
+# This is the single source of truth — DDL, publisher, and tests should agree.
+# Each dictionary key is read from the spec's own ``table_name`` so the name is
+# written exactly once per table.
+TABLE_PARTITIONS: dict[str, PartitionSpec] = {
+    spec.table_name: spec
+    for spec in (
+        PartitionSpec("market_bars"),
+        PartitionSpec("market_quotes"),
+        PartitionSpec("company_events"),
+        PartitionSpec("documents"),
+        PartitionSpec("document_extractions", extra_keys=("model_version",)),
+        PartitionSpec("trade_signals"),
+        PartitionSpec("trade_orders"),
+        PartitionSpec("trade_fills"),
+        PartitionSpec("positions_daily"),
+        PartitionSpec("pnl_daily"),
+        PartitionSpec("prediction_vs_outcome", extra_keys=("model_version",)),
+        PartitionSpec("model_performance", extra_keys=("model_version",)),
+    )
+}
+
+
+def partition_path(
+    table_name: str,
+    dt: datetime | date,
+    extra_partitions: dict[str, str] | None = None,
+    file_id: str | None = None,
+) -> str:
+    """Build a Hive-compatible object path for a Parquet file.
+
+    Args:
+        table_name: Logical fact table name (must be in TABLE_PARTITIONS).
+        dt: Row timestamp or date used to derive the ``dt=`` partition. For a
+            ``datetime`` the calendar date is taken as given; no timezone
+            conversion is applied.
+        extra_partitions: Additional partition key/value pairs (e.g.
+            model_version). Only keys declared in the table's spec are used,
+            in the spec's order; a declared key that is absent or ``None`` is
+            written as ``__NONE__``.
+        file_id: Optional override for the file suffix. When omitted or
+            empty, the first 16 hex characters of a fresh UUID4 are used.
+
+    Returns:
+        Object key relative to the bucket root, e.g.
+        ``warehouse/trade_signals/dt=2026-04-11/part-<suffix>.parquet``
+
+    Raises:
+        ValueError: If ``table_name`` is not registered in TABLE_PARTITIONS.
+    """
+    spec = TABLE_PARTITIONS.get(table_name)
+    if spec is None:
+        raise ValueError(f"Unknown table: {table_name}. Register it in TABLE_PARTITIONS.")
+
+    # datetime is a subclass of date, so narrow it first; then format both
+    # input kinds with date.isoformat() (zero-padded YYYY-MM-DD), which matches
+    # the date object partition_values() returns for the same input.
+    day = dt.date() if isinstance(dt, datetime) else dt
+    segments = [WAREHOUSE_PREFIX, table_name, f"dt={day.isoformat()}"]
+
+    # Append extra partition directories in the order declared by the spec.
+    extras = extra_partitions or {}
+    for key in spec.extra_keys:
+        value = extras.get(key)
+        if value is None:
+            # An explicit None must land in the same placeholder directory as a
+            # missing key rather than in a literal "key=None" directory.
+            value = "__NONE__"
+        segments.append(f"{key}={value}")
+
+    suffix = file_id or uuid.uuid4().hex[:16]
+    segments.append(f"part-{suffix}.parquet")
+
+    return "/".join(segments)
+
+
+def partition_values(
+    dt: datetime | date,
+    extra_partitions: dict[str, str] | None = None,
+) -> dict[str, object]:
+    """Build the partition column values to embed in each Parquet row.
+
+    Writing the partition columns into the file itself, and not only into the
+    directory path, keeps the data readable by engines that don't parse Hive
+    paths (e.g. plain PyArrow reads, DuckDB).
+
+    Args:
+        dt: Row timestamp or date; a ``datetime`` is reduced to its date part.
+        extra_partitions: Optional extra partition key/value pairs, copied into
+            the result after ``dt``.
+
+    Returns:
+        A new dict such as ``{"dt": date(2026, 4, 11), "model_version": "v2"}``.
+    """
+    # datetime subclasses date, so the datetime check has to come first.
+    day = dt.date() if isinstance(dt, datetime) else dt
+    return {"dt": day, **(extra_partitions or {})}
+
+
+def s3_uri(path: str) -> str:
+    """Return the ``s3://`` URI for an object key inside the lakehouse bucket.
+
+    ``path`` is appended verbatim after ``s3://<LAKEHOUSE_BUCKET>/``.
+    """
+    return "s3://{}/{}".format(LAKEHOUSE_BUCKET, path)