"""Hive-compatible partition layout conventions for the MinIO lakehouse. Centralizes partition path generation, partition column injection, and bucket provisioning so that all lake publisher writers produce layouts that Trino's Hive and Iceberg connectors can discover and prune. Design ref: Section 5.2, 5.3 (Lakehouse model) Requirements: 9.4, 9.5, N4, N6 Layout convention: s3://stonks-lakehouse/warehouse/{table_name}/dt={YYYY-MM-DD}[/{extra_key}={value}]/part-{uuid}.parquet Rules: - Every fact table is partitioned by ``dt`` (DATE) derived from the row timestamp. - Some tables have a second partition key (e.g. ``model_version``). - Partition columns MUST appear in the Parquet file so Trino can read them without relying solely on path parsing. - File names use a UUID suffix to avoid collisions on concurrent writes. """ from __future__ import annotations import uuid from dataclasses import dataclass, field from datetime import date, datetime, timezone LAKEHOUSE_BUCKET = "stonks-lakehouse" WAREHOUSE_PREFIX = "warehouse" @dataclass(frozen=True) class PartitionSpec: """Describes the partition layout for a single fact table.""" table_name: str extra_keys: tuple[str, ...] = field(default_factory=tuple) @property def all_keys(self) -> tuple[str, ...]: """Return all partition keys in order (dt first, then extras).""" return ("dt", *self.extra_keys) # Registry of every analytical fact table and its partition keys. # This is the single source of truth — DDL, publisher, and tests should agree. TABLE_PARTITIONS: dict[str, PartitionSpec] = { "market_bars": PartitionSpec("market_bars"), "market_quotes": PartitionSpec("market_quotes"), "company_events": PartitionSpec("company_events"), "documents": PartitionSpec("documents"), "document_extractions": PartitionSpec("document_extractions", extra_keys=("model_version",)), "trade_signals": PartitionSpec("trade_signals"), "trade_orders": PartitionSpec("trade_orders"), "trade_fills": PartitionSpec("trade_fills"), "positions_daily": PartitionSpec("positions_daily"), "pnl_daily": PartitionSpec("pnl_daily"), "prediction_vs_outcome": PartitionSpec("prediction_vs_outcome", extra_keys=("model_version",)), "model_performance": PartitionSpec("model_performance", extra_keys=("model_version",)), } def partition_path( table_name: str, dt: datetime | date, extra_partitions: dict[str, str] | None = None, file_id: str | None = None, ) -> str: """Build a Hive-compatible object path for a Parquet file. Args: table_name: Logical fact table name (must be in TABLE_PARTITIONS). dt: Row timestamp or date used to derive the ``dt=`` partition. extra_partitions: Additional partition key/value pairs (e.g. model_version). file_id: Optional override for the file suffix (defaults to a UUID4). Returns: Object key relative to the bucket root, e.g. ``warehouse/trade_signals/dt=2026-04-11/part-.parquet`` """ spec = TABLE_PARTITIONS.get(table_name) if spec is None: raise ValueError(f"Unknown table: {table_name}. Register it in TABLE_PARTITIONS.") if isinstance(dt, datetime): dt_str = dt.strftime("%Y-%m-%d") else: dt_str = dt.isoformat() segments = [WAREHOUSE_PREFIX, table_name, f"dt={dt_str}"] # Append extra partition directories in the order declared by the spec. extras = extra_partitions or {} for key in spec.extra_keys: value = extras.get(key, "__NONE__") segments.append(f"{key}={value}") suffix = file_id or uuid.uuid4().hex[:16] segments.append(f"part-{suffix}.parquet") return "/".join(segments) def partition_values( dt: datetime | date, extra_partitions: dict[str, str] | None = None, ) -> dict[str, object]: """Return partition column values to inject into Parquet row data. Trino's Hive connector can read partition values from the directory path, but embedding them in the Parquet file as well ensures compatibility with engines that don't parse Hive paths (e.g. plain PyArrow reads, DuckDB). Returns a dict like ``{"dt": date(2026, 4, 11), "model_version": "v2"}``. """ if isinstance(dt, datetime): dt_date = dt.date() else: dt_date = dt values: dict[str, object] = {"dt": dt_date} if extra_partitions: values.update(extra_partitions) return values def s3_uri(path: str) -> str: """Build an s3:// URI from a bucket-relative object path.""" return f"s3://{LAKEHOUSE_BUCKET}/{path}"