phase 14-15: docker build validation and helm deployment

2026-04-11 11:59:45 -07:00
parent 7394d241c9
commit ce10afa034
179 changed files with 32559 additions and 576 deletions
@@ -1 +1 @@
-# Lake Publisher - transforms operational data into analytical fact datasets
+"""Lake publisher — writes partitioned Parquet facts to MinIO for Trino/Superset."""
@@ -0,0 +1,39 @@
+"""Helpers for enqueuing lake publish jobs from upstream workers.
+
+Other services import these helpers to push jobs onto the QUEUE_LAKE_PUBLISH
+Redis queue. The lake publisher worker (jobs.py) consumes them.
+
+Usage:
+    await enqueue_lake_job(rds, "document", document_id)
+    await enqueue_lake_job(rds, "trade_order", order_id)
+    await enqueue_lake_job(rds, "bulk_documents", since=cutoff.isoformat())
+"""
+from __future__ import annotations
+
+import json
+
+import redis.asyncio as aioredis
+
+from services.shared.redis_keys import QUEUE_LAKE_PUBLISH, queue_key
+
+
+async def enqueue_lake_job(
+    rds: aioredis.Redis,
+    job_type: str,
+    entity_id: str = "",
+    since: str | None = None,
+) -> None:
+    """Push a lake publish job onto the Redis queue.
+
+    Args:
+        rds: Async Redis client.
+        job_type: One of the supported job types (document, document_extraction,
+                  market_snapshot, trade_order, trade_fill, positions_snapshot,
+                  pnl_snapshot, bulk_documents, bulk_extractions).
+        entity_id: UUID or identifier for the entity to publish.
+        since: ISO datetime string for bulk jobs (cutoff timestamp).
+    """
+    payload: dict[str, str] = {"job_type": job_type, "entity_id": entity_id}
+    if since:
+        payload["since"] = since
+    await rds.rpush(queue_key(QUEUE_LAKE_PUBLISH), json.dumps(payload))  # type: ignore[misc]
@@ -0,0 +1,420 @@
+"""Iceberg table creation and metadata management for analytical datasets.
+
+Manages Iceberg tables in Trino's Iceberg catalog, providing:
+- Table creation with proper schemas and partition specs
+- Schema synchronization between PyArrow definitions and Iceberg tables
+- Table metadata inspection (existence checks, schema retrieval, partition listing)
+
+The Iceberg catalog complements the existing Hive-compatible partition layout.
+Parquet files written by the lake publisher are stored in the same MinIO paths,
+but Iceberg metadata enables schema evolution, snapshot isolation, and better
+partition pruning via Trino's Iceberg connector.
+
+Requirements: 9.4, 9.5, 10.1, N4, N6
+Design ref: Section 5.3 (Lakehouse model), Section 4.12 (SQL Query Engine)
+"""
+from __future__ import annotations
+
+import logging
+from dataclasses import dataclass
+from typing import Any
+
+import pyarrow as pa
+from trino.dbapi import connect as trino_connect
+
+from services.lake_publisher.partitions import (
+    LAKEHOUSE_BUCKET,
+    TABLE_PARTITIONS,
+    WAREHOUSE_PREFIX,
+    PartitionSpec,
+)
+from services.lake_publisher.worker import (
+    COMPANY_EVENTS_SCHEMA,
+    DOCUMENTS_SCHEMA,
+    DOCUMENT_EXTRACTIONS_SCHEMA,
+    MARKET_BARS_SCHEMA,
+    MARKET_QUOTES_SCHEMA,
+    MODEL_PERFORMANCE_SCHEMA,
+    PNL_DAILY_SCHEMA,
+    POSITIONS_DAILY_SCHEMA,
+    PREDICTION_VS_OUTCOME_SCHEMA,
+    TRADE_FILLS_SCHEMA,
+    TRADE_ORDERS_SCHEMA,
+    TRADE_SIGNALS_SCHEMA,
+)
+
+logger = logging.getLogger(__name__)
+
+ICEBERG_CATALOG = "iceberg"
+ICEBERG_SCHEMA = "stonks"
+
+
+def _get_iceberg_catalog() -> str:
+    """Return the Iceberg catalog name from env or default."""
+    import os
+    return os.getenv("TRINO_ICEBERG_CATALOG", ICEBERG_CATALOG)
+
+# Map PyArrow types to Trino/Iceberg SQL types.
+_ARROW_TO_TRINO: dict[str, str] = {
+    "string": "VARCHAR",
+    "utf8": "VARCHAR",
+    "large_string": "VARCHAR",
+    "large_utf8": "VARCHAR",
+    "float64": "DOUBLE",
+    "double": "DOUBLE",
+    "float32": "REAL",
+    "float": "REAL",
+    "int8": "TINYINT",
+    "int16": "SMALLINT",
+    "int32": "INTEGER",
+    "int64": "BIGINT",
+    "bool": "BOOLEAN",
+    "date32": "DATE",
+    "date32[day]": "DATE",
+    "date64": "DATE",
+}
+
+
+def _arrow_type_to_trino(arrow_type: pa.DataType) -> str:
+    """Convert a PyArrow data type to a Trino SQL type string."""
+    type_str = str(arrow_type)
+
+    # Handle timestamp types (with or without timezone)
+    if type_str.startswith("timestamp"):
+        if "tz=" in type_str:
+            return "TIMESTAMP(6) WITH TIME ZONE"
+        return "TIMESTAMP(6)"
+
+    # Direct lookup
+    result = _ARROW_TO_TRINO.get(type_str)
+    if result:
+        return result
+
+    # Fallback for type IDs
+    if pa.types.is_string(arrow_type) or pa.types.is_large_string(arrow_type):
+        return "VARCHAR"
+    if pa.types.is_floating(arrow_type):
+        return "DOUBLE"
+    if pa.types.is_integer(arrow_type):
+        return "BIGINT"
+    if pa.types.is_boolean(arrow_type):
+        return "BOOLEAN"
+    if pa.types.is_date(arrow_type):
+        return "DATE"
+    if pa.types.is_timestamp(arrow_type):
+        return "TIMESTAMP(6) WITH TIME ZONE"
+
+    raise ValueError(f"Unsupported PyArrow type for Iceberg DDL: {arrow_type}")
+
+
+
+# Registry mapping table names to their PyArrow schemas.
+TABLE_SCHEMAS: dict[str, pa.Schema] = {
+    "market_bars": MARKET_BARS_SCHEMA,
+    "market_quotes": MARKET_QUOTES_SCHEMA,
+    "company_events": COMPANY_EVENTS_SCHEMA,
+    "documents": DOCUMENTS_SCHEMA,
+    "document_extractions": DOCUMENT_EXTRACTIONS_SCHEMA,
+    "trade_signals": TRADE_SIGNALS_SCHEMA,
+    "trade_orders": TRADE_ORDERS_SCHEMA,
+    "trade_fills": TRADE_FILLS_SCHEMA,
+    "positions_daily": POSITIONS_DAILY_SCHEMA,
+    "pnl_daily": PNL_DAILY_SCHEMA,
+    "prediction_vs_outcome": PREDICTION_VS_OUTCOME_SCHEMA,
+    "model_performance": MODEL_PERFORMANCE_SCHEMA,
+}
+
+
+@dataclass(frozen=True)
+class IcebergTableDef:
+    """Definition for an Iceberg table derived from PyArrow schema + partition spec."""
+
+    table_name: str
+    schema: pa.Schema
+    partition_spec: PartitionSpec
+
+    @property
+    def qualified_name(self) -> str:
+        return f"{ICEBERG_CATALOG}.{ICEBERG_SCHEMA}.{self.table_name}"
+
+    @property
+    def location(self) -> str:
+        return f"s3a://{LAKEHOUSE_BUCKET}/{WAREHOUSE_PREFIX}/{self.table_name}/"
+
+    def column_defs_sql(self) -> list[str]:
+        """Generate SQL column definitions from the PyArrow schema.
+
+        Partition columns are included in the column list (Iceberg stores them
+        in the data files, unlike Hive external tables).
+        """
+        cols: list[str] = []
+        for i in range(len(self.schema)):
+            name = self.schema.field(i).name
+            arrow_type = self.schema.field(i).type
+            trino_type = _arrow_type_to_trino(arrow_type)
+            cols.append(f"    {name} {trino_type}")
+        return cols
+
+    def partition_keys_sql(self) -> str:
+        """Generate the partitioning clause for CREATE TABLE."""
+        keys = list(self.partition_spec.all_keys)
+        if not keys:
+            return ""
+        quoted = ", ".join(f"'{k}'" for k in keys)
+        return f"partitioning = ARRAY[{quoted}]"
+
+    def create_table_sql(self) -> str:
+        """Generate a CREATE TABLE IF NOT EXISTS statement for Trino's Iceberg catalog."""
+        col_lines = ",\n".join(self.column_defs_sql())
+        with_clauses = [
+            "format = 'PARQUET'",
+            f"location = '{self.location}'",
+        ]
+        part_sql = self.partition_keys_sql()
+        if part_sql:
+            with_clauses.append(part_sql)
+
+        with_block = ",\n    ".join(with_clauses)
+
+        return (
+            f"CREATE TABLE IF NOT EXISTS {self.qualified_name} (\n"
+            f"{col_lines}\n"
+            f") WITH (\n"
+            f"    {with_block}\n"
+            f")"
+        )
+
+
+def get_all_table_defs() -> list[IcebergTableDef]:
+    """Build IcebergTableDef for every registered analytical table."""
+    defs: list[IcebergTableDef] = []
+    for table_name, partition_spec in TABLE_PARTITIONS.items():
+        schema = TABLE_SCHEMAS.get(table_name)
+        if schema is None:
+            logger.warning("No PyArrow schema for table %s, skipping", table_name)
+            continue
+        defs.append(IcebergTableDef(
+            table_name=table_name,
+            schema=schema,
+            partition_spec=partition_spec,
+        ))
+    return defs
+
+
+def get_table_def(table_name: str) -> IcebergTableDef:
+    """Get the IcebergTableDef for a single table by name."""
+    if table_name not in TABLE_PARTITIONS:
+        raise ValueError(f"Unknown table: {table_name}")
+    schema = TABLE_SCHEMAS.get(table_name)
+    if schema is None:
+        raise ValueError(f"No PyArrow schema registered for table: {table_name}")
+    return IcebergTableDef(
+        table_name=table_name,
+        schema=schema,
+        partition_spec=TABLE_PARTITIONS[table_name],
+    )
+
+
+
+@dataclass
+class IcebergManager:
+    """Manages Iceberg tables via Trino's Iceberg catalog.
+
+    Provides table creation, existence checks, schema inspection,
+    and metadata operations against the Trino Iceberg connector.
+    """
+
+    host: str = "localhost"
+    port: int = 8080
+    user: str = "stonks"
+    catalog: str = ICEBERG_CATALOG
+    schema: str = ICEBERG_SCHEMA
+
+    def _get_connection(self) -> Any:
+        """Create a Trino DBAPI connection."""
+        return trino_connect(
+            host=self.host,
+            port=self.port,
+            user=self.user,
+            catalog=self.catalog,
+            schema=self.schema,
+        )
+
+    def _execute(self, sql: str) -> list[list[Any]]:
+        """Execute a SQL statement and return all rows."""
+        conn = self._get_connection()
+        try:
+            cursor = conn.cursor()
+            cursor.execute(sql)
+            return cursor.fetchall()
+        finally:
+            conn.close()
+
+    def _execute_no_fetch(self, sql: str) -> None:
+        """Execute a DDL statement that returns no rows."""
+        conn = self._get_connection()
+        try:
+            cursor = conn.cursor()
+            cursor.execute(sql)
+            # DDL statements in Trino still need fetchall to complete
+            try:
+                cursor.fetchall()
+            except Exception:
+                pass
+        finally:
+            conn.close()
+
+    def ensure_schema(self) -> None:
+        """Create the Iceberg schema if it doesn't exist."""
+        sql = f"CREATE SCHEMA IF NOT EXISTS {self.catalog}.{self.schema}"
+        logger.info("Ensuring Iceberg schema: %s.%s", self.catalog, self.schema)
+        self._execute_no_fetch(sql)
+
+    def table_exists(self, table_name: str) -> bool:
+        """Check if an Iceberg table exists."""
+        sql = (
+            f"SELECT table_name FROM {self.catalog}.information_schema.tables "
+            f"WHERE table_schema = '{self.schema}' AND table_name = '{table_name}'"
+        )
+        rows = self._execute(sql)
+        return len(rows) > 0
+
+    def create_table(self, table_name: str) -> bool:
+        """Create a single Iceberg table if it doesn't exist.
+
+        Returns True if the table was created, False if it already existed.
+        """
+        table_def = get_table_def(table_name)
+        ddl = table_def.create_table_sql()
+        logger.info("Creating Iceberg table: %s", table_def.qualified_name)
+        self._execute_no_fetch(ddl)
+        logger.info("Iceberg table ready: %s", table_def.qualified_name)
+        return True
+
+    def create_all_tables(self) -> dict[str, bool]:
+        """Create all registered Iceberg tables.
+
+        Returns a dict mapping table_name -> True (created) or False (error).
+        """
+        self.ensure_schema()
+        results: dict[str, bool] = {}
+        for table_def in get_all_table_defs():
+            try:
+                self.create_table(table_def.table_name)
+                results[table_def.table_name] = True
+            except Exception:
+                logger.exception("Failed to create Iceberg table: %s", table_def.table_name)
+                results[table_def.table_name] = False
+        return results
+
+    def get_table_schema(self, table_name: str) -> list[dict[str, str]]:
+        """Retrieve the column schema of an Iceberg table from Trino.
+
+        Returns a list of dicts with 'column_name', 'data_type', and 'is_nullable'.
+        """
+        sql = (
+            f"SELECT column_name, data_type, is_nullable "
+            f"FROM {self.catalog}.information_schema.columns "
+            f"WHERE table_schema = '{self.schema}' AND table_name = '{table_name}' "
+            f"ORDER BY ordinal_position"
+        )
+        rows = self._execute(sql)
+        return [
+            {"column_name": r[0], "data_type": r[1], "is_nullable": r[2]}
+            for r in rows
+        ]
+
+    def get_table_snapshots(self, table_name: str) -> list[dict[str, Any]]:
+        """List Iceberg snapshots for a table (useful for auditing and rollback).
+
+        Returns snapshot metadata from Trino's $snapshots metadata table.
+        """
+        qualified = f"{self.catalog}.{self.schema}.{table_name}"
+        sql = f'SELECT * FROM "{qualified}$snapshots"'
+        try:
+            rows = self._execute(sql)
+            return [{"snapshot_id": r[0], "parent_id": r[1], "operation": r[2],
+                      "manifest_list": r[3], "summary": r[4]} for r in rows]
+        except Exception:
+            logger.debug("Could not read snapshots for %s (table may be empty)", table_name)
+            return []
+
+    def get_table_partitions(self, table_name: str) -> list[dict[str, Any]]:
+        """List partition values for an Iceberg table.
+
+        Returns partition metadata from Trino's $partitions metadata table.
+        """
+        qualified = f"{self.catalog}.{self.schema}.{table_name}"
+        sql = f'SELECT * FROM "{qualified}$partitions"'
+        try:
+            rows = self._execute(sql)
+            return [{"row": r} for r in rows]
+        except Exception:
+            logger.debug("Could not read partitions for %s (table may be empty)", table_name)
+            return []
+
+    def list_tables(self) -> list[str]:
+        """List all tables in the Iceberg schema."""
+        sql = (
+            f"SELECT table_name FROM {self.catalog}.information_schema.tables "
+            f"WHERE table_schema = '{self.schema}' ORDER BY table_name"
+        )
+        rows = self._execute(sql)
+        return [r[0] for r in rows]
+
+    def drop_table(self, table_name: str) -> None:
+        """Drop an Iceberg table (for testing/reset purposes)."""
+        qualified = f"{self.catalog}.{self.schema}.{table_name}"
+        logger.warning("Dropping Iceberg table: %s", qualified)
+        self._execute_no_fetch(f"DROP TABLE IF EXISTS {qualified}")
+
+    def sync_table_schema(self, table_name: str) -> list[str]:
+        """Compare the expected PyArrow schema with the actual Iceberg table schema.
+
+        If columns are missing from the Iceberg table, adds them via ALTER TABLE.
+        Returns a list of columns that were added.
+
+        This supports forward-only schema evolution — columns are never dropped.
+        """
+        table_def = get_table_def(table_name)
+        existing = self.get_table_schema(table_name)
+        existing_names = {col["column_name"] for col in existing}
+
+        added: list[str] = []
+        qualified = table_def.qualified_name
+
+        for i in range(len(table_def.schema)):
+            col_name = table_def.schema.field(i).name
+            if col_name not in existing_names:
+                trino_type = _arrow_type_to_trino(table_def.schema.field(i).type)
+                alter_sql = f"ALTER TABLE {qualified} ADD COLUMN {col_name} {trino_type}"
+                logger.info("Adding column %s to %s", col_name, qualified)
+                self._execute_no_fetch(alter_sql)
+                added.append(col_name)
+
+        return added
+
+    def sync_all_schemas(self) -> dict[str, list[str]]:
+        """Sync schemas for all registered tables. Returns table_name -> added columns."""
+        results: dict[str, list[str]] = {}
+        for table_def in get_all_table_defs():
+            try:
+                if self.table_exists(table_def.table_name):
+                    added = self.sync_table_schema(table_def.table_name)
+                    results[table_def.table_name] = added
+                else:
+                    logger.info("Table %s doesn't exist yet, skipping sync", table_def.table_name)
+                    results[table_def.table_name] = []
+            except Exception:
+                logger.exception("Failed to sync schema for %s", table_def.table_name)
+                results[table_def.table_name] = []
+        return results
+
+
+def create_iceberg_manager_from_config(
+    host: str = "localhost",
+    port: int = 8080,
+    user: str = "stonks",
+) -> IcebergManager:
+    """Factory that creates an IcebergManager from explicit connection params."""
+    return IcebergManager(host=host, port=port, user=user)
@@ -0,0 +1,673 @@
+"""Lake publisher async job runner — transforms operational data into analytical facts.
+
+Reads jobs from the QUEUE_LAKE_PUBLISH Redis queue, queries PostgreSQL for
+operational records, and publishes them as partitioned Parquet files to MinIO
+via the existing publish_* functions in worker.py.
+
+Job message format:
+    {"job_type": "<table_name>", "entity_id": "<uuid or ticker>", "dt": "2026-04-11T..."}
+
+Supported job types:
+    - document: publish a single document metadata fact
+    - document_extraction: publish extraction facts for a document
+    - market_snapshot: publish market bars/quotes from a snapshot
+    - trade_order: publish an order fact
+    - trade_fill: publish fill facts for an order
+    - positions_snapshot: publish daily position snapshots for a broker account
+    - pnl_snapshot: publish daily PnL for a broker account
+    - company_event: publish a company event fact
+    - bulk_documents: publish all unpublished documents since a cutoff
+    - bulk_extractions: publish all unpublished extractions since a cutoff
+
+Requirements: 9.4, 9.5, 10.1
+Design ref: Section 4.10 (Lake Publisher), Section 8.4 (Lake publication flow)
+"""
+from __future__ import annotations
+
+import asyncio
+import json
+import logging
+from datetime import datetime, timezone
+
+import asyncpg
+import redis.asyncio as aioredis
+from minio import Minio
+
+from services.lake_publisher.worker import (
+    publish_document_extraction,
+    publish_document_fact,
+    publish_market_bar,
+    publish_market_quote,
+    publish_trade_order,
+    publish_trade_fill,
+    publish_pnl_daily,
+    publish_documents_batch,
+    publish_document_extractions_batch,
+    publish_positions_daily_batch,
+)
+from services.lake_publisher.partitions import partition_values
+from services.shared.config import load_config
+from services.shared.db import get_minio, get_pg_pool, get_redis
+from services.shared.logging import setup_logging
+from services.shared.redis_keys import QUEUE_LAKE_PUBLISH, queue_key
+
+logger = logging.getLogger(__name__)
+
+
+# ---------------------------------------------------------------------------
+# SQL queries for fetching operational data
+# ---------------------------------------------------------------------------
+
+_FETCH_DOCUMENT = """
+SELECT
+    d.id, d.document_type, d.source_type, d.publisher, d.title,
+    d.url, d.canonical_url, d.language, d.published_at, d.retrieved_at,
+    d.content_hash, d.parse_quality_score,
+    COALESCE(
+        (SELECT dcm.ticker FROM document_company_mentions dcm
+         WHERE dcm.document_id = d.id LIMIT 1),
+        ''
+    ) AS ticker
+FROM documents d
+WHERE d.id = $1::uuid
+"""
+
+_FETCH_EXTRACTIONS = """
+SELECT
+    di.document_id, dir.ticker, dir.relevance, dir.sentiment,
+    dir.impact_score, dir.impact_horizon, dir.catalyst_type,
+    di.confidence, di.novelty_score, di.source_credibility,
+    dir.key_facts, dir.risks, di.macro_themes,
+    di.model_name, di.prompt_version, di.schema_version,
+    di.created_at AS extraction_at,
+    COALESCE(c.legal_name, '') AS company_name
+FROM document_intelligence di
+JOIN document_impact_records dir ON dir.intelligence_id = di.id
+LEFT JOIN companies c ON c.id = dir.company_id
+WHERE di.document_id = $1::uuid
+  AND di.validation_status = 'valid'
+"""
+
+_FETCH_MARKET_SNAPSHOT = """
+SELECT
+    ms.ticker, ms.snapshot_type, ms.data, ms.source_provider, ms.captured_at
+FROM market_snapshots ms
+WHERE ms.id = $1::uuid
+"""
+
+_FETCH_ORDER = """
+SELECT
+    o.id, o.recommendation_id, o.ticker, o.side, o.order_type,
+    o.quantity, o.limit_price, o.status, o.submitted_at,
+    o.fill_price, o.fill_quantity, o.filled_at,
+    COALESCE(ba.account_id, '') AS broker_account,
+    COALESCE(ba.mode, 'paper') AS execution_mode
+FROM orders o
+LEFT JOIN broker_accounts ba ON ba.id = o.broker_account_id
+WHERE o.id = $1::uuid
+"""
+
+_FETCH_ORDER_FILLS = """
+SELECT
+    oe.id AS fill_id, oe.order_id, oe.data, oe.broker_timestamp,
+    o.ticker, o.side,
+    COALESCE(ba.account_id, '') AS broker_account
+FROM order_events oe
+JOIN orders o ON o.id = oe.order_id
+LEFT JOIN broker_accounts ba ON ba.id = o.broker_account_id
+WHERE oe.order_id = $1::uuid AND oe.event_type = 'fill'
+"""
+
+_FETCH_POSITIONS = """
+SELECT
+    p.ticker, p.quantity, p.avg_entry_price, p.current_price,
+    p.unrealized_pnl, p.realized_pnl,
+    COALESCE(ba.account_id, '') AS broker_account,
+    COALESCE(ba.mode, 'paper') AS execution_mode
+FROM positions p
+LEFT JOIN broker_accounts ba ON ba.id = p.broker_account_id
+WHERE p.broker_account_id = $1::uuid AND p.quantity != 0
+"""
+
+_FETCH_BULK_DOCUMENTS = """
+SELECT
+    d.id, d.document_type, d.source_type, d.publisher, d.title,
+    d.url, d.canonical_url, d.language, d.published_at, d.retrieved_at,
+    d.content_hash, d.parse_quality_score,
+    COALESCE(
+        (SELECT dcm.ticker FROM document_company_mentions dcm
+         WHERE dcm.document_id = d.id LIMIT 1),
+        ''
+    ) AS ticker
+FROM documents d
+WHERE d.created_at >= $1
+  AND d.status IN ('parsed', 'extracted')
+ORDER BY d.created_at
+LIMIT 500
+"""
+
+_FETCH_BULK_EXTRACTIONS = """
+SELECT
+    di.document_id, dir.ticker, dir.relevance, dir.sentiment,
+    dir.impact_score, dir.impact_horizon, dir.catalyst_type,
+    di.confidence, di.novelty_score, di.source_credibility,
+    dir.key_facts, dir.risks, di.macro_themes,
+    di.model_name, di.prompt_version, di.schema_version,
+    di.created_at AS extraction_at,
+    COALESCE(c.legal_name, '') AS company_name
+FROM document_intelligence di
+JOIN document_impact_records dir ON dir.intelligence_id = di.id
+LEFT JOIN companies c ON c.id = dir.company_id
+WHERE di.created_at >= $1
+  AND di.validation_status = 'valid'
+ORDER BY di.created_at
+LIMIT 500
+"""
+
+
+# ---------------------------------------------------------------------------
+# Job handlers — each transforms operational rows into lake facts
+# ---------------------------------------------------------------------------
+
+
+def _jsonb_to_str(val: object) -> str:
+    """Convert a JSONB column value (list or str) to a comma-separated string."""
+    if val is None:
+        return ""
+    if isinstance(val, str):
+        try:
+            parsed = json.loads(val)
+            if isinstance(parsed, list):
+                return ", ".join(str(x) for x in parsed)
+            return val
+        except (json.JSONDecodeError, TypeError):
+            return val
+    if isinstance(val, list):
+        return ", ".join(str(x) for x in val)
+    return str(val)
+
+
+async def publish_document_job(
+    pool: asyncpg.Pool,
+    minio_client: Minio,
+    entity_id: str,
+) -> str:
+    """Publish a single document metadata fact from PostgreSQL to the lake."""
+    row = await pool.fetchrow(_FETCH_DOCUMENT, entity_id)
+    if row is None:
+        logger.warning("Document %s not found, skipping lake publish", entity_id)
+        return ""
+
+    published_at = row["published_at"] or row["retrieved_at"]
+    return publish_document_fact(
+        client=minio_client,
+        document_id=str(row["id"]),
+        document_type=row["document_type"],
+        source_type=row["source_type"],
+        ticker=row["ticker"] or "",
+        publisher=row["publisher"] or "",
+        title=row["title"] or "",
+        published_at=published_at,
+        content_hash=row["content_hash"],
+        url=row["url"] or "",
+        canonical_url=row["canonical_url"] or "",
+        language=row["language"] or "en",
+        confidence=float(row["parse_quality_score"] or 0.0),
+        retrieved_at=row["retrieved_at"],
+    )
+
+
+async def publish_extraction_job(
+    pool: asyncpg.Pool,
+    minio_client: Minio,
+    entity_id: str,
+) -> list[str]:
+    """Publish document extraction facts for a document from PostgreSQL to the lake."""
+    rows = await pool.fetch(_FETCH_EXTRACTIONS, entity_id)
+    if not rows:
+        logger.info("No valid extractions for document %s", entity_id)
+        return []
+
+    refs: list[str] = []
+    for row in rows:
+        ref = publish_document_extraction(
+            client=minio_client,
+            document_id=str(row["document_id"]),
+            ticker=row["ticker"],
+            sentiment=row["sentiment"] or "neutral",
+            impact_score=float(row["impact_score"] or 0.0),
+            catalyst_type=row["catalyst_type"] or "other",
+            confidence=float(row["confidence"] or 0.0),
+            extraction_at=row["extraction_at"],
+            model_name=row["model_name"] or "",
+            prompt_version=row["prompt_version"] or "",
+            company_name=row["company_name"] or "",
+            relevance=float(row["relevance"] or 0.0),
+            impact_horizon=row["impact_horizon"] or "",
+            novelty_score=float(row["novelty_score"] or 0.0),
+            source_credibility=float(row["source_credibility"] or 0.0),
+            key_facts=_jsonb_to_str(row["key_facts"]),
+            risks=_jsonb_to_str(row["risks"]),
+            macro_themes=_jsonb_to_str(row["macro_themes"]),
+            schema_version=row["schema_version"] or "",
+        )
+        refs.append(ref)
+    return refs
+
+
+async def publish_market_snapshot_job(
+    pool: asyncpg.Pool,
+    minio_client: Minio,
+    entity_id: str,
+) -> list[str]:
+    """Publish market bar/quote facts from a market_snapshots row."""
+    row = await pool.fetchrow(_FETCH_MARKET_SNAPSHOT, entity_id)
+    if row is None:
+        logger.warning("Market snapshot %s not found", entity_id)
+        return []
+
+    ticker = row["ticker"]
+    data = row["data"] if isinstance(row["data"], dict) else json.loads(row["data"])
+    source = row["source_provider"] or ""
+    captured_at = row["captured_at"]
+    snapshot_type = row["snapshot_type"]
+    refs: list[str] = []
+
+    if snapshot_type == "bar" or snapshot_type == "bars":
+        # Single bar or list of bars
+        bars = data.get("bars", [data]) if "bars" in data else [data]
+        for bar in bars:
+            ref = publish_market_bar(
+                client=minio_client,
+                ticker=ticker,
+                open_price=float(bar.get("open", bar.get("o", 0))),
+                high_price=float(bar.get("high", bar.get("h", 0))),
+                low_price=float(bar.get("low", bar.get("l", 0))),
+                close_price=float(bar.get("close", bar.get("c", 0))),
+                volume=int(bar.get("volume", bar.get("v", 0))),
+                bar_timestamp=captured_at,
+                source=source,
+                vwap=float(bar.get("vwap", bar.get("vw", 0))),
+                trade_count=int(bar.get("trade_count", bar.get("n", 0))),
+                bar_interval=bar.get("interval", "1d"),
+            )
+            refs.append(ref)
+    elif snapshot_type == "quote" or snapshot_type == "quotes":
+        ref = publish_market_quote(
+            client=minio_client,
+            ticker=ticker,
+            bid_price=float(data.get("bid_price", data.get("bp", 0))),
+            ask_price=float(data.get("ask_price", data.get("ap", 0))),
+            last_price=float(data.get("last_price", data.get("lp", 0))),
+            quote_at=captured_at,
+            source=source,
+            bid_size=int(data.get("bid_size", data.get("bs", 0))),
+            ask_size=int(data.get("ask_size", data.get("as", 0))),
+            last_size=int(data.get("last_size", data.get("ls", 0))),
+        )
+        refs.append(ref)
+
+    return refs
+
+
+async def publish_order_job(
+    pool: asyncpg.Pool,
+    minio_client: Minio,
+    entity_id: str,
+) -> str:
+    """Publish a trade order fact from PostgreSQL to the lake."""
+    row = await pool.fetchrow(_FETCH_ORDER, entity_id)
+    if row is None:
+        logger.warning("Order %s not found", entity_id)
+        return ""
+
+    submitted_at = row["submitted_at"] or datetime.now(timezone.utc)
+    return publish_trade_order(
+        client=minio_client,
+        order_id=str(row["id"]),
+        ticker=row["ticker"],
+        side=row["side"],
+        order_type=row["order_type"],
+        quantity=float(row["quantity"]),
+        limit_price=float(row["limit_price"]) if row["limit_price"] else None,
+        status=row["status"],
+        broker_account=row["broker_account"],
+        submitted_at=submitted_at,
+        recommendation_id=str(row["recommendation_id"]) if row["recommendation_id"] else "",
+        execution_mode=row["execution_mode"],
+    )
+
+
+async def publish_fills_job(
+    pool: asyncpg.Pool,
+    minio_client: Minio,
+    entity_id: str,
+) -> list[str]:
+    """Publish trade fill facts for an order from PostgreSQL to the lake."""
+    rows = await pool.fetch(_FETCH_ORDER_FILLS, entity_id)
+    if not rows:
+        logger.info("No fill events for order %s", entity_id)
+        return []
+
+    refs: list[str] = []
+    for row in rows:
+        data = row["data"] if isinstance(row["data"], dict) else json.loads(row["data"] or "{}")
+        filled_at = row["broker_timestamp"] or datetime.now(timezone.utc)
+        ref = publish_trade_fill(
+            client=minio_client,
+            fill_id=str(row["fill_id"]),
+            order_id=str(row["order_id"]),
+            ticker=row["ticker"],
+            side=row["side"],
+            fill_price=float(data.get("fill_price", data.get("price", 0))),
+            fill_quantity=float(data.get("fill_quantity", data.get("qty", 0))),
+            broker_account=row["broker_account"],
+            filled_at=filled_at,
+            commission=float(data.get("commission", 0)),
+        )
+        refs.append(ref)
+    return refs
+
+
+async def publish_positions_job(
+    pool: asyncpg.Pool,
+    minio_client: Minio,
+    entity_id: str,
+) -> str:
+    """Publish daily position snapshots for a broker account."""
+    rows = await pool.fetch(_FETCH_POSITIONS, entity_id)
+    if not rows:
+        logger.info("No open positions for account %s", entity_id)
+        return ""
+
+    snapshot_at = datetime.now(timezone.utc)
+    positions = [
+        {
+            "ticker": row["ticker"],
+            "quantity": float(row["quantity"]),
+            "avg_entry_price": float(row["avg_entry_price"] or 0),
+            "close_price": float(row["current_price"] or 0),
+            "unrealized_pnl": float(row["unrealized_pnl"] or 0),
+        }
+        for row in rows
+    ]
+    broker_account = rows[0]["broker_account"] if rows else ""
+    return publish_positions_daily_batch(
+        client=minio_client,
+        positions=positions,
+        broker_account=broker_account,
+        snapshot_at=snapshot_at,
+    )
+
+
+async def publish_pnl_job(
+    pool: asyncpg.Pool,
+    minio_client: Minio,
+    entity_id: str,
+) -> list[str]:
+    """Publish daily PnL facts for a broker account's positions."""
+    rows = await pool.fetch(_FETCH_POSITIONS, entity_id)
+    if not rows:
+        logger.info("No positions for PnL snapshot, account %s", entity_id)
+        return []
+
+    now = datetime.now(timezone.utc)
+    refs: list[str] = []
+    for row in rows:
+        realized = float(row["realized_pnl"] or 0)
+        unrealized = float(row["unrealized_pnl"] or 0)
+        total = realized + unrealized
+        ref = publish_pnl_daily(
+            client=minio_client,
+            ticker=row["ticker"],
+            realized_pnl=realized,
+            unrealized_pnl=unrealized,
+            total_pnl=total,
+            broker_account=row["broker_account"],
+            dt=now,
+            execution_mode=row["execution_mode"],
+        )
+        refs.append(ref)
+    return refs
+
+
+async def publish_bulk_documents_job(
+    pool: asyncpg.Pool,
+    minio_client: Minio,
+    since: datetime,
+) -> list[str]:
+    """Publish all documents created since a cutoff as a batch."""
+    rows = await pool.fetch(_FETCH_BULK_DOCUMENTS, since)
+    if not rows:
+        logger.info("No documents to bulk-publish since %s", since)
+        return []
+
+    doc_rows: list[dict[str, object]] = []
+    for row in rows:
+        published_at = row["published_at"] or row["retrieved_at"]
+        doc_rows.append({
+            "document_id": str(row["id"]),
+            "document_type": row["document_type"],
+            "source_type": row["source_type"],
+            "ticker": row["ticker"] or "",
+            "publisher": row["publisher"] or "",
+            "title": row["title"] or "",
+            "url": row["url"] or "",
+            "canonical_url": row["canonical_url"] or "",
+            "language": row["language"] or "en",
+            "published_at": published_at,
+            "retrieved_at": row["retrieved_at"],
+            "content_hash": row["content_hash"],
+            "confidence": float(row["parse_quality_score"] or 0.0),
+            **partition_values(published_at),
+        })
+
+    ref = publish_documents_batch(minio_client, doc_rows, since)
+    return [ref] if ref else []
+
+
+async def publish_bulk_extractions_job(
+    pool: asyncpg.Pool,
+    minio_client: Minio,
+    since: datetime,
+) -> list[str]:
+    """Publish all extractions created since a cutoff as a batch."""
+    rows = await pool.fetch(_FETCH_BULK_EXTRACTIONS, since)
+    if not rows:
+        logger.info("No extractions to bulk-publish since %s", since)
+        return []
+
+    extraction_rows: list[dict[str, object]] = []
+    for row in rows:
+        model_ver = row["schema_version"] or row["prompt_version"] or ""
+        extraction_rows.append({
+            "document_id": str(row["document_id"]),
+            "ticker": row["ticker"],
+            "company_name": row["company_name"] or "",
+            "relevance": float(row["relevance"] or 0.0),
+            "sentiment": row["sentiment"] or "neutral",
+            "impact_score": float(row["impact_score"] or 0.0),
+            "impact_horizon": row["impact_horizon"] or "",
+            "catalyst_type": row["catalyst_type"] or "other",
+            "confidence": float(row["confidence"] or 0.0),
+            "novelty_score": float(row["novelty_score"] or 0.0),
+            "source_credibility": float(row["source_credibility"] or 0.0),
+            "key_facts": _jsonb_to_str(row["key_facts"]),
+            "risks": _jsonb_to_str(row["risks"]),
+            "macro_themes": _jsonb_to_str(row["macro_themes"]),
+            "model_name": row["model_name"] or "",
+            "prompt_version": row["prompt_version"] or "",
+            "schema_version": row["schema_version"] or "",
+            "extraction_at": row["extraction_at"],
+            **partition_values(row["extraction_at"], {"model_version": model_ver}),
+        })
+
+    model_ver = extraction_rows[0].get("model_version", "") if extraction_rows else ""
+    ref = publish_document_extractions_batch(
+        minio_client, extraction_rows, since,
+        model_version=str(model_ver),
+    )
+    return [ref] if ref else []
+
+
+# ---------------------------------------------------------------------------
+# Job dispatcher
+# ---------------------------------------------------------------------------
+
+JOB_TYPES = {
+    "document",
+    "document_extraction",
+    "market_snapshot",
+    "trade_order",
+    "trade_fill",
+    "positions_snapshot",
+    "pnl_snapshot",
+    "company_event",
+    "bulk_documents",
+    "bulk_extractions",
+}
+
+
+async def dispatch_job(
+    pool: asyncpg.Pool,
+    minio_client: Minio,
+    job: dict[str, str],
+) -> dict[str, object]:
+    """Dispatch a lake publish job to the appropriate handler.
+
+    Args:
+        pool: PostgreSQL connection pool.
+        minio_client: MinIO client for writing Parquet files.
+        job: Job dict with at least 'job_type' and 'entity_id'.
+
+    Returns:
+        A result dict with 'job_type', 'entity_id', 'refs' (list of s3 URIs),
+        and 'error' (None on success).
+    """
+    job_type = job.get("job_type", "")
+    entity_id = job.get("entity_id", "")
+    since_str = job.get("since")
+
+    result: dict[str, object] = {
+        "job_type": job_type,
+        "entity_id": entity_id,
+        "refs": [],
+        "error": None,
+    }
+
+    try:
+        if job_type == "document":
+            ref = await publish_document_job(pool, minio_client, entity_id)
+            result["refs"] = [ref] if ref else []
+
+        elif job_type == "document_extraction":
+            refs = await publish_extraction_job(pool, minio_client, entity_id)
+            result["refs"] = refs
+
+        elif job_type == "market_snapshot":
+            refs = await publish_market_snapshot_job(pool, minio_client, entity_id)
+            result["refs"] = refs
+
+        elif job_type == "trade_order":
+            ref = await publish_order_job(pool, minio_client, entity_id)
+            result["refs"] = [ref] if ref else []
+
+        elif job_type == "trade_fill":
+            refs = await publish_fills_job(pool, minio_client, entity_id)
+            result["refs"] = refs
+
+        elif job_type == "positions_snapshot":
+            ref = await publish_positions_job(pool, minio_client, entity_id)
+            result["refs"] = [ref] if ref else []
+
+        elif job_type == "pnl_snapshot":
+            refs = await publish_pnl_job(pool, minio_client, entity_id)
+            result["refs"] = refs
+
+        elif job_type == "bulk_documents":
+            since = datetime.fromisoformat(since_str) if since_str else datetime.now(timezone.utc)
+            refs = await publish_bulk_documents_job(pool, minio_client, since)
+            result["refs"] = refs
+
+        elif job_type == "bulk_extractions":
+            since = datetime.fromisoformat(since_str) if since_str else datetime.now(timezone.utc)
+            refs = await publish_bulk_extractions_job(pool, minio_client, since)
+            result["refs"] = refs
+
+        else:
+            result["error"] = f"Unknown job_type: {job_type}"
+            logger.warning("Unknown lake publish job type: %s", job_type)
+
+    except Exception as exc:
+        result["error"] = str(exc)
+        logger.exception("Lake publish job failed: %s/%s", job_type, entity_id)
+
+    return result
+
+
+# ---------------------------------------------------------------------------
+# Async worker loop
+# ---------------------------------------------------------------------------
+
+
+async def run_worker(
+    pool: asyncpg.Pool,
+    rds: aioredis.Redis,
+    minio_client: Minio,
+    poll_interval: float = 2.0,
+) -> None:
+    """Main worker loop — reads jobs from Redis and dispatches them.
+
+    Runs indefinitely until cancelled. Each job is processed sequentially
+    to keep MinIO write ordering predictable.
+    """
+    queue = queue_key(QUEUE_LAKE_PUBLISH)
+    logger.info("Lake publisher worker started, listening on %s", queue)
+
+    while True:
+        raw = await rds.lpop(queue)  # type: ignore[misc]
+        if raw is None:
+            await asyncio.sleep(poll_interval)
+            continue
+
+        try:
+            job = json.loads(str(raw))
+        except (json.JSONDecodeError, TypeError):
+            logger.error("Invalid lake publish job payload: %s", raw)
+            continue
+
+        result = await dispatch_job(pool, minio_client, job)
+        refs = result.get("refs") or []
+        error = result.get("error")
+
+        if error:
+            logger.error(
+                "Lake publish job %s/%s failed: %s",
+                result["job_type"], result["entity_id"], error,
+            )
+        else:
+            ref_count = len(refs) if isinstance(refs, list) else 0
+            logger.info(
+                "Lake publish job %s/%s completed: %d facts written",
+                result["job_type"], result["entity_id"], ref_count,
+            )
+
+
+async def main() -> None:
+    """Entry point for the lake publisher worker process."""
+    config = load_config()
+    pool = await get_pg_pool(config)
+    rds = get_redis(config)
+    minio_client = get_minio(config)
+
+    try:
+        await run_worker(pool, rds, minio_client)
+    finally:
+        await pool.close()
+        await rds.close()
+
+
+if __name__ == "__main__":
+    cfg = load_config()
+    setup_logging("lake_publisher", level=cfg.log_level, json_output=cfg.json_logs)
+    asyncio.run(main())
@@ -0,0 +1,128 @@
+"""Hive-compatible partition layout conventions for the MinIO lakehouse.
+
+Centralizes partition path generation, partition column injection, and
+bucket provisioning so that all lake publisher writers produce layouts
+that Trino's Hive and Iceberg connectors can discover and prune.
+
+Design ref: Section 5.2, 5.3 (Lakehouse model)
+Requirements: 9.4, 9.5, N4, N6
+
+Layout convention:
+    s3://stonks-lakehouse/warehouse/{table_name}/dt={YYYY-MM-DD}[/{extra_key}={value}]/part-{uuid}.parquet
+
+Rules:
+    - Every fact table is partitioned by ``dt`` (DATE) derived from the row timestamp.
+    - Some tables have a second partition key (e.g. ``model_version``).
+    - Partition columns MUST appear in the Parquet file so Trino can read them
+      without relying solely on path parsing.
+    - File names use a UUID suffix to avoid collisions on concurrent writes.
+"""
+from __future__ import annotations
+
+import uuid
+from dataclasses import dataclass, field
+from datetime import date, datetime, timezone
+
+
+LAKEHOUSE_BUCKET = "stonks-lakehouse"
+WAREHOUSE_PREFIX = "warehouse"
+
+
+@dataclass(frozen=True)
+class PartitionSpec:
+    """Describes the partition layout for a single fact table."""
+
+    table_name: str
+    extra_keys: tuple[str, ...] = field(default_factory=tuple)
+
+    @property
+    def all_keys(self) -> tuple[str, ...]:
+        """Return all partition keys in order (dt first, then extras)."""
+        return ("dt", *self.extra_keys)
+
+
+# Registry of every analytical fact table and its partition keys.
+# This is the single source of truth — DDL, publisher, and tests should agree.
+TABLE_PARTITIONS: dict[str, PartitionSpec] = {
+    "market_bars":            PartitionSpec("market_bars"),
+    "market_quotes":          PartitionSpec("market_quotes"),
+    "company_events":         PartitionSpec("company_events"),
+    "documents":              PartitionSpec("documents"),
+    "document_extractions":   PartitionSpec("document_extractions", extra_keys=("model_version",)),
+    "trade_signals":          PartitionSpec("trade_signals"),
+    "trade_orders":           PartitionSpec("trade_orders"),
+    "trade_fills":            PartitionSpec("trade_fills"),
+    "positions_daily":        PartitionSpec("positions_daily"),
+    "pnl_daily":              PartitionSpec("pnl_daily"),
+    "prediction_vs_outcome":  PartitionSpec("prediction_vs_outcome", extra_keys=("model_version",)),
+    "model_performance":      PartitionSpec("model_performance", extra_keys=("model_version",)),
+}
+
+
+def partition_path(
+    table_name: str,
+    dt: datetime | date,
+    extra_partitions: dict[str, str] | None = None,
+    file_id: str | None = None,
+) -> str:
+    """Build a Hive-compatible object path for a Parquet file.
+
+    Args:
+        table_name: Logical fact table name (must be in TABLE_PARTITIONS).
+        dt: Row timestamp or date used to derive the ``dt=`` partition.
+        extra_partitions: Additional partition key/value pairs (e.g. model_version).
+        file_id: Optional override for the file suffix (defaults to a UUID4).
+
+    Returns:
+        Object key relative to the bucket root, e.g.
+        ``warehouse/trade_signals/dt=2026-04-11/part-<uuid>.parquet``
+    """
+    spec = TABLE_PARTITIONS.get(table_name)
+    if spec is None:
+        raise ValueError(f"Unknown table: {table_name}. Register it in TABLE_PARTITIONS.")
+
+    if isinstance(dt, datetime):
+        dt_str = dt.strftime("%Y-%m-%d")
+    else:
+        dt_str = dt.isoformat()
+
+    segments = [WAREHOUSE_PREFIX, table_name, f"dt={dt_str}"]
+
+    # Append extra partition directories in the order declared by the spec.
+    extras = extra_partitions or {}
+    for key in spec.extra_keys:
+        value = extras.get(key, "__NONE__")
+        segments.append(f"{key}={value}")
+
+    suffix = file_id or uuid.uuid4().hex[:16]
+    segments.append(f"part-{suffix}.parquet")
+
+    return "/".join(segments)
+
+
+def partition_values(
+    dt: datetime | date,
+    extra_partitions: dict[str, str] | None = None,
+) -> dict[str, object]:
+    """Return partition column values to inject into Parquet row data.
+
+    Trino's Hive connector can read partition values from the directory path,
+    but embedding them in the Parquet file as well ensures compatibility with
+    engines that don't parse Hive paths (e.g. plain PyArrow reads, DuckDB).
+
+    Returns a dict like ``{"dt": date(2026, 4, 11), "model_version": "v2"}``.
+    """
+    if isinstance(dt, datetime):
+        dt_date = dt.date()
+    else:
+        dt_date = dt
+
+    values: dict[str, object] = {"dt": dt_date}
+    if extra_partitions:
+        values.update(extra_partitions)
+    return values
+
+
+def s3_uri(path: str) -> str:
+    """Build an s3:// URI from a bucket-relative object path."""
+    return f"s3://{LAKEHOUSE_BUCKET}/{path}"