Files
stonks-oracle/services/lake_publisher/partitions.py
T

129 lines
4.6 KiB
Python

"""Hive-compatible partition layout conventions for the MinIO lakehouse.
Centralizes partition path generation, partition column injection, and
bucket provisioning so that all lake publisher writers produce layouts
that Trino's Hive and Iceberg connectors can discover and prune.
Design ref: Section 5.2, 5.3 (Lakehouse model)
Requirements: 9.4, 9.5, N4, N6
Layout convention:
s3://stonks-lakehouse/warehouse/{table_name}/dt={YYYY-MM-DD}[/{extra_key}={value}]/part-{uuid}.parquet
Rules:
- Every fact table is partitioned by ``dt`` (DATE) derived from the row timestamp.
- Some tables have a second partition key (e.g. ``model_version``).
- Partition columns MUST appear in the Parquet file so Trino can read them
without relying solely on path parsing.
- File names use a random suffix (by default the first 16 hex characters of a
  UUID4) to avoid collisions on concurrent writes.
"""
from __future__ import annotations
import uuid
from dataclasses import dataclass, field
from datetime import date, datetime, timezone
# Name of the bucket that s3_uri() places in front of every object path.
LAKEHOUSE_BUCKET = "stonks-lakehouse"
# First path segment of every object key produced by partition_path().
WAREHOUSE_PREFIX = "warehouse"
@dataclass(frozen=True)
class PartitionSpec:
    """Immutable description of how one fact table is partitioned.

    ``table_name`` is the logical table name. ``extra_keys`` lists any
    partition keys that follow the mandatory ``dt`` key, in directory order;
    it defaults to an empty tuple.
    """

    table_name: str
    extra_keys: tuple[str, ...] = field(default_factory=tuple)

    @property
    def all_keys(self) -> tuple[str, ...]:
        """Return every partition key in order: ``dt`` first, then the extras."""
        leading = ("dt",)
        return leading + tuple(self.extra_keys)
# Registry of every analytical fact table and its partition keys, keyed by
# table name. This is the single source of truth — DDL, publisher, and tests
# should agree. Each key is taken from the spec's own ``table_name`` so the
# two cannot drift apart.
TABLE_PARTITIONS: dict[str, PartitionSpec] = {
    spec.table_name: spec
    for spec in (
        PartitionSpec("market_bars"),
        PartitionSpec("market_quotes"),
        PartitionSpec("company_events"),
        PartitionSpec("documents"),
        PartitionSpec("document_extractions", extra_keys=("model_version",)),
        PartitionSpec("trade_signals"),
        PartitionSpec("trade_orders"),
        PartitionSpec("trade_fills"),
        PartitionSpec("positions_daily"),
        PartitionSpec("pnl_daily"),
        PartitionSpec("prediction_vs_outcome", extra_keys=("model_version",)),
        PartitionSpec("model_performance", extra_keys=("model_version",)),
    )
}
def partition_path(
    table_name: str,
    dt: datetime | date,
    extra_partitions: dict[str, str] | None = None,
    file_id: str | None = None,
) -> str:
    """Build a Hive-compatible object path for a Parquet file.

    Args:
        table_name: Logical fact table name (must be in TABLE_PARTITIONS).
        dt: Row timestamp or date used to derive the ``dt=`` partition. A
            ``datetime`` contributes its own calendar date; no timezone
            conversion is applied.
        extra_partitions: Additional partition key/value pairs (e.g.
            model_version). Only keys declared by the table's spec are used,
            in the spec's order; a declared key that is absent from this
            mapping is written as ``__NONE__``.
        file_id: Optional override for the file suffix. When omitted or
            empty, the first 16 hex characters of a random UUID4 are used.

    Returns:
        Object key relative to the bucket root, e.g.
        ``warehouse/trade_signals/dt=2026-04-11/part-<suffix>.parquet``

    Raises:
        ValueError: If ``table_name`` is not registered in TABLE_PARTITIONS.
    """
    spec = TABLE_PARTITIONS.get(table_name)
    if spec is None:
        raise ValueError(f"Unknown table: {table_name}. Register it in TABLE_PARTITIONS.")

    # datetime is a subclass of date, so narrow it explicitly. Formatting via
    # date.isoformat() always yields zero-padded YYYY-MM-DD, whereas
    # strftime("%Y") is platform-dependent for years below 1000. It is also
    # the same date derivation partition_values() uses for the ``dt`` column.
    # NOTE(review): an aware datetime is not normalized to UTC here — confirm
    # that callers pass timestamps in the intended zone.
    day = dt.date() if isinstance(dt, datetime) else dt
    segments = [WAREHOUSE_PREFIX, table_name, f"dt={day.isoformat()}"]

    # Append extra partition directories in the order declared by the spec.
    # NOTE(review): values are interpolated verbatim; a value containing "/"
    # or "=" would change the directory layout — confirm callers sanitize.
    extras = extra_partitions or {}
    segments.extend(f"{key}={extras.get(key, '__NONE__')}" for key in spec.extra_keys)

    # A truncated UUID4 keeps names short while avoiding write collisions.
    suffix = file_id or uuid.uuid4().hex[:16]
    segments.append(f"part-{suffix}.parquet")
    return "/".join(segments)
def partition_values(
    dt: datetime | date,
    extra_partitions: dict[str, str] | None = None,
) -> dict[str, object]:
    """Return the partition column values to embed in Parquet row data.

    Trino's Hive connector can recover partition values from the directory
    path, but writing them into the file as well keeps the data readable by
    engines that do not parse Hive paths (e.g. plain PyArrow reads, DuckDB).

    Args:
        dt: Row timestamp or date; a ``datetime`` is reduced to its date.
        extra_partitions: Optional extra key/value pairs, copied into the
            result after ``dt``. The input mapping is not modified.

    Returns:
        A new dict such as ``{"dt": date(2026, 4, 11), "model_version": "v2"}``.
    """
    # datetime subclasses date, so the datetime check must come first.
    day = dt.date() if isinstance(dt, datetime) else dt
    return {"dt": day, **(extra_partitions or {})}
def s3_uri(path: str) -> str:
    """Return the ``s3://`` URI for *path*, an object path relative to the bucket root.

    The path is appended to the lakehouse bucket name as given; no slashes
    are added or removed beyond the single separator after the bucket.
    """
    return "s3://{}/{}".format(LAKEHOUSE_BUCKET, path)