phase 14-15: docker build validation and helm deployment

This commit is contained in:
Celes Renata
2026-04-11 11:59:45 -07:00
parent 7394d241c9
commit ce10afa034
179 changed files with 32559 additions and 576 deletions
+1 -1
View File
@@ -1 +1 @@
# Lake Publisher - transforms operational data into analytical fact datasets
"""Lake publisher — writes partitioned Parquet facts to MinIO for Trino/Superset."""
+39
View File
@@ -0,0 +1,39 @@
"""Helpers for enqueuing lake publish jobs from upstream workers.
Other services import these helpers to push jobs onto the QUEUE_LAKE_PUBLISH
Redis queue. The lake publisher worker (jobs.py) consumes them.
Usage:
await enqueue_lake_job(rds, "document", document_id)
await enqueue_lake_job(rds, "trade_order", order_id)
await enqueue_lake_job(rds, "bulk_documents", since=cutoff.isoformat())
"""
from __future__ import annotations
import json
import redis.asyncio as aioredis
from services.shared.redis_keys import QUEUE_LAKE_PUBLISH, queue_key
async def enqueue_lake_job(
rds: aioredis.Redis,
job_type: str,
entity_id: str = "",
since: str | None = None,
) -> None:
"""Push a lake publish job onto the Redis queue.
Args:
rds: Async Redis client.
job_type: One of the supported job types (document, document_extraction,
market_snapshot, trade_order, trade_fill, positions_snapshot,
pnl_snapshot, bulk_documents, bulk_extractions).
entity_id: UUID or identifier for the entity to publish.
since: ISO datetime string for bulk jobs (cutoff timestamp).
"""
payload: dict[str, str] = {"job_type": job_type, "entity_id": entity_id}
if since:
payload["since"] = since
await rds.rpush(queue_key(QUEUE_LAKE_PUBLISH), json.dumps(payload)) # type: ignore[misc]
+420
View File
@@ -0,0 +1,420 @@
"""Iceberg table creation and metadata management for analytical datasets.
Manages Iceberg tables in Trino's Iceberg catalog, providing:
- Table creation with proper schemas and partition specs
- Schema synchronization between PyArrow definitions and Iceberg tables
- Table metadata inspection (existence checks, schema retrieval, partition listing)
The Iceberg catalog complements the existing Hive-compatible partition layout.
Parquet files written by the lake publisher are stored in the same MinIO paths,
but Iceberg metadata enables schema evolution, snapshot isolation, and better
partition pruning via Trino's Iceberg connector.
Requirements: 9.4, 9.5, 10.1, N4, N6
Design ref: Section 5.3 (Lakehouse model), Section 4.12 (SQL Query Engine)
"""
from __future__ import annotations
import logging
from dataclasses import dataclass
from typing import Any
import pyarrow as pa
from trino.dbapi import connect as trino_connect
from services.lake_publisher.partitions import (
LAKEHOUSE_BUCKET,
TABLE_PARTITIONS,
WAREHOUSE_PREFIX,
PartitionSpec,
)
from services.lake_publisher.worker import (
COMPANY_EVENTS_SCHEMA,
DOCUMENTS_SCHEMA,
DOCUMENT_EXTRACTIONS_SCHEMA,
MARKET_BARS_SCHEMA,
MARKET_QUOTES_SCHEMA,
MODEL_PERFORMANCE_SCHEMA,
PNL_DAILY_SCHEMA,
POSITIONS_DAILY_SCHEMA,
PREDICTION_VS_OUTCOME_SCHEMA,
TRADE_FILLS_SCHEMA,
TRADE_ORDERS_SCHEMA,
TRADE_SIGNALS_SCHEMA,
)
logger = logging.getLogger(__name__)
ICEBERG_CATALOG = "iceberg"
ICEBERG_SCHEMA = "stonks"
def _get_iceberg_catalog() -> str:
"""Return the Iceberg catalog name from env or default."""
import os
return os.getenv("TRINO_ICEBERG_CATALOG", ICEBERG_CATALOG)
# Map PyArrow types to Trino/Iceberg SQL types.
_ARROW_TO_TRINO: dict[str, str] = {
"string": "VARCHAR",
"utf8": "VARCHAR",
"large_string": "VARCHAR",
"large_utf8": "VARCHAR",
"float64": "DOUBLE",
"double": "DOUBLE",
"float32": "REAL",
"float": "REAL",
"int8": "TINYINT",
"int16": "SMALLINT",
"int32": "INTEGER",
"int64": "BIGINT",
"bool": "BOOLEAN",
"date32": "DATE",
"date32[day]": "DATE",
"date64": "DATE",
}
def _arrow_type_to_trino(arrow_type: pa.DataType) -> str:
"""Convert a PyArrow data type to a Trino SQL type string."""
type_str = str(arrow_type)
# Handle timestamp types (with or without timezone)
if type_str.startswith("timestamp"):
if "tz=" in type_str:
return "TIMESTAMP(6) WITH TIME ZONE"
return "TIMESTAMP(6)"
# Direct lookup
result = _ARROW_TO_TRINO.get(type_str)
if result:
return result
# Fallback for type IDs
if pa.types.is_string(arrow_type) or pa.types.is_large_string(arrow_type):
return "VARCHAR"
if pa.types.is_floating(arrow_type):
return "DOUBLE"
if pa.types.is_integer(arrow_type):
return "BIGINT"
if pa.types.is_boolean(arrow_type):
return "BOOLEAN"
if pa.types.is_date(arrow_type):
return "DATE"
if pa.types.is_timestamp(arrow_type):
return "TIMESTAMP(6) WITH TIME ZONE"
raise ValueError(f"Unsupported PyArrow type for Iceberg DDL: {arrow_type}")
# Registry mapping table names to their PyArrow schemas.
TABLE_SCHEMAS: dict[str, pa.Schema] = {
"market_bars": MARKET_BARS_SCHEMA,
"market_quotes": MARKET_QUOTES_SCHEMA,
"company_events": COMPANY_EVENTS_SCHEMA,
"documents": DOCUMENTS_SCHEMA,
"document_extractions": DOCUMENT_EXTRACTIONS_SCHEMA,
"trade_signals": TRADE_SIGNALS_SCHEMA,
"trade_orders": TRADE_ORDERS_SCHEMA,
"trade_fills": TRADE_FILLS_SCHEMA,
"positions_daily": POSITIONS_DAILY_SCHEMA,
"pnl_daily": PNL_DAILY_SCHEMA,
"prediction_vs_outcome": PREDICTION_VS_OUTCOME_SCHEMA,
"model_performance": MODEL_PERFORMANCE_SCHEMA,
}
@dataclass(frozen=True)
class IcebergTableDef:
"""Definition for an Iceberg table derived from PyArrow schema + partition spec."""
table_name: str
schema: pa.Schema
partition_spec: PartitionSpec
@property
def qualified_name(self) -> str:
return f"{ICEBERG_CATALOG}.{ICEBERG_SCHEMA}.{self.table_name}"
@property
def location(self) -> str:
return f"s3a://{LAKEHOUSE_BUCKET}/{WAREHOUSE_PREFIX}/{self.table_name}/"
def column_defs_sql(self) -> list[str]:
"""Generate SQL column definitions from the PyArrow schema.
Partition columns are included in the column list (Iceberg stores them
in the data files, unlike Hive external tables).
"""
cols: list[str] = []
for i in range(len(self.schema)):
name = self.schema.field(i).name
arrow_type = self.schema.field(i).type
trino_type = _arrow_type_to_trino(arrow_type)
cols.append(f" {name} {trino_type}")
return cols
def partition_keys_sql(self) -> str:
"""Generate the partitioning clause for CREATE TABLE."""
keys = list(self.partition_spec.all_keys)
if not keys:
return ""
quoted = ", ".join(f"'{k}'" for k in keys)
return f"partitioning = ARRAY[{quoted}]"
def create_table_sql(self) -> str:
"""Generate a CREATE TABLE IF NOT EXISTS statement for Trino's Iceberg catalog."""
col_lines = ",\n".join(self.column_defs_sql())
with_clauses = [
"format = 'PARQUET'",
f"location = '{self.location}'",
]
part_sql = self.partition_keys_sql()
if part_sql:
with_clauses.append(part_sql)
with_block = ",\n ".join(with_clauses)
return (
f"CREATE TABLE IF NOT EXISTS {self.qualified_name} (\n"
f"{col_lines}\n"
f") WITH (\n"
f" {with_block}\n"
f")"
)
def get_all_table_defs() -> list[IcebergTableDef]:
"""Build IcebergTableDef for every registered analytical table."""
defs: list[IcebergTableDef] = []
for table_name, partition_spec in TABLE_PARTITIONS.items():
schema = TABLE_SCHEMAS.get(table_name)
if schema is None:
logger.warning("No PyArrow schema for table %s, skipping", table_name)
continue
defs.append(IcebergTableDef(
table_name=table_name,
schema=schema,
partition_spec=partition_spec,
))
return defs
def get_table_def(table_name: str) -> IcebergTableDef:
"""Get the IcebergTableDef for a single table by name."""
if table_name not in TABLE_PARTITIONS:
raise ValueError(f"Unknown table: {table_name}")
schema = TABLE_SCHEMAS.get(table_name)
if schema is None:
raise ValueError(f"No PyArrow schema registered for table: {table_name}")
return IcebergTableDef(
table_name=table_name,
schema=schema,
partition_spec=TABLE_PARTITIONS[table_name],
)
@dataclass
class IcebergManager:
"""Manages Iceberg tables via Trino's Iceberg catalog.
Provides table creation, existence checks, schema inspection,
and metadata operations against the Trino Iceberg connector.
"""
host: str = "localhost"
port: int = 8080
user: str = "stonks"
catalog: str = ICEBERG_CATALOG
schema: str = ICEBERG_SCHEMA
def _get_connection(self) -> Any:
"""Create a Trino DBAPI connection."""
return trino_connect(
host=self.host,
port=self.port,
user=self.user,
catalog=self.catalog,
schema=self.schema,
)
def _execute(self, sql: str) -> list[list[Any]]:
"""Execute a SQL statement and return all rows."""
conn = self._get_connection()
try:
cursor = conn.cursor()
cursor.execute(sql)
return cursor.fetchall()
finally:
conn.close()
def _execute_no_fetch(self, sql: str) -> None:
"""Execute a DDL statement that returns no rows."""
conn = self._get_connection()
try:
cursor = conn.cursor()
cursor.execute(sql)
# DDL statements in Trino still need fetchall to complete
try:
cursor.fetchall()
except Exception:
pass
finally:
conn.close()
def ensure_schema(self) -> None:
"""Create the Iceberg schema if it doesn't exist."""
sql = f"CREATE SCHEMA IF NOT EXISTS {self.catalog}.{self.schema}"
logger.info("Ensuring Iceberg schema: %s.%s", self.catalog, self.schema)
self._execute_no_fetch(sql)
def table_exists(self, table_name: str) -> bool:
"""Check if an Iceberg table exists."""
sql = (
f"SELECT table_name FROM {self.catalog}.information_schema.tables "
f"WHERE table_schema = '{self.schema}' AND table_name = '{table_name}'"
)
rows = self._execute(sql)
return len(rows) > 0
def create_table(self, table_name: str) -> bool:
"""Create a single Iceberg table if it doesn't exist.
Returns True if the table was created, False if it already existed.
"""
table_def = get_table_def(table_name)
ddl = table_def.create_table_sql()
logger.info("Creating Iceberg table: %s", table_def.qualified_name)
self._execute_no_fetch(ddl)
logger.info("Iceberg table ready: %s", table_def.qualified_name)
return True
def create_all_tables(self) -> dict[str, bool]:
"""Create all registered Iceberg tables.
Returns a dict mapping table_name -> True (created) or False (error).
"""
self.ensure_schema()
results: dict[str, bool] = {}
for table_def in get_all_table_defs():
try:
self.create_table(table_def.table_name)
results[table_def.table_name] = True
except Exception:
logger.exception("Failed to create Iceberg table: %s", table_def.table_name)
results[table_def.table_name] = False
return results
def get_table_schema(self, table_name: str) -> list[dict[str, str]]:
"""Retrieve the column schema of an Iceberg table from Trino.
Returns a list of dicts with 'column_name', 'data_type', and 'is_nullable'.
"""
sql = (
f"SELECT column_name, data_type, is_nullable "
f"FROM {self.catalog}.information_schema.columns "
f"WHERE table_schema = '{self.schema}' AND table_name = '{table_name}' "
f"ORDER BY ordinal_position"
)
rows = self._execute(sql)
return [
{"column_name": r[0], "data_type": r[1], "is_nullable": r[2]}
for r in rows
]
def get_table_snapshots(self, table_name: str) -> list[dict[str, Any]]:
"""List Iceberg snapshots for a table (useful for auditing and rollback).
Returns snapshot metadata from Trino's $snapshots metadata table.
"""
qualified = f"{self.catalog}.{self.schema}.{table_name}"
sql = f'SELECT * FROM "{qualified}$snapshots"'
try:
rows = self._execute(sql)
return [{"snapshot_id": r[0], "parent_id": r[1], "operation": r[2],
"manifest_list": r[3], "summary": r[4]} for r in rows]
except Exception:
logger.debug("Could not read snapshots for %s (table may be empty)", table_name)
return []
def get_table_partitions(self, table_name: str) -> list[dict[str, Any]]:
"""List partition values for an Iceberg table.
Returns partition metadata from Trino's $partitions metadata table.
"""
qualified = f"{self.catalog}.{self.schema}.{table_name}"
sql = f'SELECT * FROM "{qualified}$partitions"'
try:
rows = self._execute(sql)
return [{"row": r} for r in rows]
except Exception:
logger.debug("Could not read partitions for %s (table may be empty)", table_name)
return []
def list_tables(self) -> list[str]:
"""List all tables in the Iceberg schema."""
sql = (
f"SELECT table_name FROM {self.catalog}.information_schema.tables "
f"WHERE table_schema = '{self.schema}' ORDER BY table_name"
)
rows = self._execute(sql)
return [r[0] for r in rows]
def drop_table(self, table_name: str) -> None:
"""Drop an Iceberg table (for testing/reset purposes)."""
qualified = f"{self.catalog}.{self.schema}.{table_name}"
logger.warning("Dropping Iceberg table: %s", qualified)
self._execute_no_fetch(f"DROP TABLE IF EXISTS {qualified}")
def sync_table_schema(self, table_name: str) -> list[str]:
"""Compare the expected PyArrow schema with the actual Iceberg table schema.
If columns are missing from the Iceberg table, adds them via ALTER TABLE.
Returns a list of columns that were added.
This supports forward-only schema evolution — columns are never dropped.
"""
table_def = get_table_def(table_name)
existing = self.get_table_schema(table_name)
existing_names = {col["column_name"] for col in existing}
added: list[str] = []
qualified = table_def.qualified_name
for i in range(len(table_def.schema)):
col_name = table_def.schema.field(i).name
if col_name not in existing_names:
trino_type = _arrow_type_to_trino(table_def.schema.field(i).type)
alter_sql = f"ALTER TABLE {qualified} ADD COLUMN {col_name} {trino_type}"
logger.info("Adding column %s to %s", col_name, qualified)
self._execute_no_fetch(alter_sql)
added.append(col_name)
return added
def sync_all_schemas(self) -> dict[str, list[str]]:
"""Sync schemas for all registered tables. Returns table_name -> added columns."""
results: dict[str, list[str]] = {}
for table_def in get_all_table_defs():
try:
if self.table_exists(table_def.table_name):
added = self.sync_table_schema(table_def.table_name)
results[table_def.table_name] = added
else:
logger.info("Table %s doesn't exist yet, skipping sync", table_def.table_name)
results[table_def.table_name] = []
except Exception:
logger.exception("Failed to sync schema for %s", table_def.table_name)
results[table_def.table_name] = []
return results
def create_iceberg_manager_from_config(
host: str = "localhost",
port: int = 8080,
user: str = "stonks",
) -> IcebergManager:
"""Factory that creates an IcebergManager from explicit connection params."""
return IcebergManager(host=host, port=port, user=user)
+673
View File
@@ -0,0 +1,673 @@
"""Lake publisher async job runner — transforms operational data into analytical facts.
Reads jobs from the QUEUE_LAKE_PUBLISH Redis queue, queries PostgreSQL for
operational records, and publishes them as partitioned Parquet files to MinIO
via the existing publish_* functions in worker.py.
Job message format:
{"job_type": "<table_name>", "entity_id": "<uuid or ticker>", "dt": "2026-04-11T..."}
Supported job types:
- document: publish a single document metadata fact
- document_extraction: publish extraction facts for a document
- market_snapshot: publish market bars/quotes from a snapshot
- trade_order: publish an order fact
- trade_fill: publish fill facts for an order
- positions_snapshot: publish daily position snapshots for a broker account
- pnl_snapshot: publish daily PnL for a broker account
- company_event: publish a company event fact
- bulk_documents: publish all unpublished documents since a cutoff
- bulk_extractions: publish all unpublished extractions since a cutoff
Requirements: 9.4, 9.5, 10.1
Design ref: Section 4.10 (Lake Publisher), Section 8.4 (Lake publication flow)
"""
from __future__ import annotations
import asyncio
import json
import logging
from datetime import datetime, timezone
import asyncpg
import redis.asyncio as aioredis
from minio import Minio
from services.lake_publisher.worker import (
publish_document_extraction,
publish_document_fact,
publish_market_bar,
publish_market_quote,
publish_trade_order,
publish_trade_fill,
publish_pnl_daily,
publish_documents_batch,
publish_document_extractions_batch,
publish_positions_daily_batch,
)
from services.lake_publisher.partitions import partition_values
from services.shared.config import load_config
from services.shared.db import get_minio, get_pg_pool, get_redis
from services.shared.logging import setup_logging
from services.shared.redis_keys import QUEUE_LAKE_PUBLISH, queue_key
logger = logging.getLogger(__name__)
# ---------------------------------------------------------------------------
# SQL queries for fetching operational data
# ---------------------------------------------------------------------------
_FETCH_DOCUMENT = """
SELECT
d.id, d.document_type, d.source_type, d.publisher, d.title,
d.url, d.canonical_url, d.language, d.published_at, d.retrieved_at,
d.content_hash, d.parse_quality_score,
COALESCE(
(SELECT dcm.ticker FROM document_company_mentions dcm
WHERE dcm.document_id = d.id LIMIT 1),
''
) AS ticker
FROM documents d
WHERE d.id = $1::uuid
"""
_FETCH_EXTRACTIONS = """
SELECT
di.document_id, dir.ticker, dir.relevance, dir.sentiment,
dir.impact_score, dir.impact_horizon, dir.catalyst_type,
di.confidence, di.novelty_score, di.source_credibility,
dir.key_facts, dir.risks, di.macro_themes,
di.model_name, di.prompt_version, di.schema_version,
di.created_at AS extraction_at,
COALESCE(c.legal_name, '') AS company_name
FROM document_intelligence di
JOIN document_impact_records dir ON dir.intelligence_id = di.id
LEFT JOIN companies c ON c.id = dir.company_id
WHERE di.document_id = $1::uuid
AND di.validation_status = 'valid'
"""
_FETCH_MARKET_SNAPSHOT = """
SELECT
ms.ticker, ms.snapshot_type, ms.data, ms.source_provider, ms.captured_at
FROM market_snapshots ms
WHERE ms.id = $1::uuid
"""
_FETCH_ORDER = """
SELECT
o.id, o.recommendation_id, o.ticker, o.side, o.order_type,
o.quantity, o.limit_price, o.status, o.submitted_at,
o.fill_price, o.fill_quantity, o.filled_at,
COALESCE(ba.account_id, '') AS broker_account,
COALESCE(ba.mode, 'paper') AS execution_mode
FROM orders o
LEFT JOIN broker_accounts ba ON ba.id = o.broker_account_id
WHERE o.id = $1::uuid
"""
_FETCH_ORDER_FILLS = """
SELECT
oe.id AS fill_id, oe.order_id, oe.data, oe.broker_timestamp,
o.ticker, o.side,
COALESCE(ba.account_id, '') AS broker_account
FROM order_events oe
JOIN orders o ON o.id = oe.order_id
LEFT JOIN broker_accounts ba ON ba.id = o.broker_account_id
WHERE oe.order_id = $1::uuid AND oe.event_type = 'fill'
"""
_FETCH_POSITIONS = """
SELECT
p.ticker, p.quantity, p.avg_entry_price, p.current_price,
p.unrealized_pnl, p.realized_pnl,
COALESCE(ba.account_id, '') AS broker_account,
COALESCE(ba.mode, 'paper') AS execution_mode
FROM positions p
LEFT JOIN broker_accounts ba ON ba.id = p.broker_account_id
WHERE p.broker_account_id = $1::uuid AND p.quantity != 0
"""
_FETCH_BULK_DOCUMENTS = """
SELECT
d.id, d.document_type, d.source_type, d.publisher, d.title,
d.url, d.canonical_url, d.language, d.published_at, d.retrieved_at,
d.content_hash, d.parse_quality_score,
COALESCE(
(SELECT dcm.ticker FROM document_company_mentions dcm
WHERE dcm.document_id = d.id LIMIT 1),
''
) AS ticker
FROM documents d
WHERE d.created_at >= $1
AND d.status IN ('parsed', 'extracted')
ORDER BY d.created_at
LIMIT 500
"""
_FETCH_BULK_EXTRACTIONS = """
SELECT
di.document_id, dir.ticker, dir.relevance, dir.sentiment,
dir.impact_score, dir.impact_horizon, dir.catalyst_type,
di.confidence, di.novelty_score, di.source_credibility,
dir.key_facts, dir.risks, di.macro_themes,
di.model_name, di.prompt_version, di.schema_version,
di.created_at AS extraction_at,
COALESCE(c.legal_name, '') AS company_name
FROM document_intelligence di
JOIN document_impact_records dir ON dir.intelligence_id = di.id
LEFT JOIN companies c ON c.id = dir.company_id
WHERE di.created_at >= $1
AND di.validation_status = 'valid'
ORDER BY di.created_at
LIMIT 500
"""
# ---------------------------------------------------------------------------
# Job handlers — each transforms operational rows into lake facts
# ---------------------------------------------------------------------------
def _jsonb_to_str(val: object) -> str:
"""Convert a JSONB column value (list or str) to a comma-separated string."""
if val is None:
return ""
if isinstance(val, str):
try:
parsed = json.loads(val)
if isinstance(parsed, list):
return ", ".join(str(x) for x in parsed)
return val
except (json.JSONDecodeError, TypeError):
return val
if isinstance(val, list):
return ", ".join(str(x) for x in val)
return str(val)
async def publish_document_job(
pool: asyncpg.Pool,
minio_client: Minio,
entity_id: str,
) -> str:
"""Publish a single document metadata fact from PostgreSQL to the lake."""
row = await pool.fetchrow(_FETCH_DOCUMENT, entity_id)
if row is None:
logger.warning("Document %s not found, skipping lake publish", entity_id)
return ""
published_at = row["published_at"] or row["retrieved_at"]
return publish_document_fact(
client=minio_client,
document_id=str(row["id"]),
document_type=row["document_type"],
source_type=row["source_type"],
ticker=row["ticker"] or "",
publisher=row["publisher"] or "",
title=row["title"] or "",
published_at=published_at,
content_hash=row["content_hash"],
url=row["url"] or "",
canonical_url=row["canonical_url"] or "",
language=row["language"] or "en",
confidence=float(row["parse_quality_score"] or 0.0),
retrieved_at=row["retrieved_at"],
)
async def publish_extraction_job(
pool: asyncpg.Pool,
minio_client: Minio,
entity_id: str,
) -> list[str]:
"""Publish document extraction facts for a document from PostgreSQL to the lake."""
rows = await pool.fetch(_FETCH_EXTRACTIONS, entity_id)
if not rows:
logger.info("No valid extractions for document %s", entity_id)
return []
refs: list[str] = []
for row in rows:
ref = publish_document_extraction(
client=minio_client,
document_id=str(row["document_id"]),
ticker=row["ticker"],
sentiment=row["sentiment"] or "neutral",
impact_score=float(row["impact_score"] or 0.0),
catalyst_type=row["catalyst_type"] or "other",
confidence=float(row["confidence"] or 0.0),
extraction_at=row["extraction_at"],
model_name=row["model_name"] or "",
prompt_version=row["prompt_version"] or "",
company_name=row["company_name"] or "",
relevance=float(row["relevance"] or 0.0),
impact_horizon=row["impact_horizon"] or "",
novelty_score=float(row["novelty_score"] or 0.0),
source_credibility=float(row["source_credibility"] or 0.0),
key_facts=_jsonb_to_str(row["key_facts"]),
risks=_jsonb_to_str(row["risks"]),
macro_themes=_jsonb_to_str(row["macro_themes"]),
schema_version=row["schema_version"] or "",
)
refs.append(ref)
return refs
async def publish_market_snapshot_job(
pool: asyncpg.Pool,
minio_client: Minio,
entity_id: str,
) -> list[str]:
"""Publish market bar/quote facts from a market_snapshots row."""
row = await pool.fetchrow(_FETCH_MARKET_SNAPSHOT, entity_id)
if row is None:
logger.warning("Market snapshot %s not found", entity_id)
return []
ticker = row["ticker"]
data = row["data"] if isinstance(row["data"], dict) else json.loads(row["data"])
source = row["source_provider"] or ""
captured_at = row["captured_at"]
snapshot_type = row["snapshot_type"]
refs: list[str] = []
if snapshot_type == "bar" or snapshot_type == "bars":
# Single bar or list of bars
bars = data.get("bars", [data]) if "bars" in data else [data]
for bar in bars:
ref = publish_market_bar(
client=minio_client,
ticker=ticker,
open_price=float(bar.get("open", bar.get("o", 0))),
high_price=float(bar.get("high", bar.get("h", 0))),
low_price=float(bar.get("low", bar.get("l", 0))),
close_price=float(bar.get("close", bar.get("c", 0))),
volume=int(bar.get("volume", bar.get("v", 0))),
bar_timestamp=captured_at,
source=source,
vwap=float(bar.get("vwap", bar.get("vw", 0))),
trade_count=int(bar.get("trade_count", bar.get("n", 0))),
bar_interval=bar.get("interval", "1d"),
)
refs.append(ref)
elif snapshot_type == "quote" or snapshot_type == "quotes":
ref = publish_market_quote(
client=minio_client,
ticker=ticker,
bid_price=float(data.get("bid_price", data.get("bp", 0))),
ask_price=float(data.get("ask_price", data.get("ap", 0))),
last_price=float(data.get("last_price", data.get("lp", 0))),
quote_at=captured_at,
source=source,
bid_size=int(data.get("bid_size", data.get("bs", 0))),
ask_size=int(data.get("ask_size", data.get("as", 0))),
last_size=int(data.get("last_size", data.get("ls", 0))),
)
refs.append(ref)
return refs
async def publish_order_job(
pool: asyncpg.Pool,
minio_client: Minio,
entity_id: str,
) -> str:
"""Publish a trade order fact from PostgreSQL to the lake."""
row = await pool.fetchrow(_FETCH_ORDER, entity_id)
if row is None:
logger.warning("Order %s not found", entity_id)
return ""
submitted_at = row["submitted_at"] or datetime.now(timezone.utc)
return publish_trade_order(
client=minio_client,
order_id=str(row["id"]),
ticker=row["ticker"],
side=row["side"],
order_type=row["order_type"],
quantity=float(row["quantity"]),
limit_price=float(row["limit_price"]) if row["limit_price"] else None,
status=row["status"],
broker_account=row["broker_account"],
submitted_at=submitted_at,
recommendation_id=str(row["recommendation_id"]) if row["recommendation_id"] else "",
execution_mode=row["execution_mode"],
)
async def publish_fills_job(
pool: asyncpg.Pool,
minio_client: Minio,
entity_id: str,
) -> list[str]:
"""Publish trade fill facts for an order from PostgreSQL to the lake."""
rows = await pool.fetch(_FETCH_ORDER_FILLS, entity_id)
if not rows:
logger.info("No fill events for order %s", entity_id)
return []
refs: list[str] = []
for row in rows:
data = row["data"] if isinstance(row["data"], dict) else json.loads(row["data"] or "{}")
filled_at = row["broker_timestamp"] or datetime.now(timezone.utc)
ref = publish_trade_fill(
client=minio_client,
fill_id=str(row["fill_id"]),
order_id=str(row["order_id"]),
ticker=row["ticker"],
side=row["side"],
fill_price=float(data.get("fill_price", data.get("price", 0))),
fill_quantity=float(data.get("fill_quantity", data.get("qty", 0))),
broker_account=row["broker_account"],
filled_at=filled_at,
commission=float(data.get("commission", 0)),
)
refs.append(ref)
return refs
async def publish_positions_job(
pool: asyncpg.Pool,
minio_client: Minio,
entity_id: str,
) -> str:
"""Publish daily position snapshots for a broker account."""
rows = await pool.fetch(_FETCH_POSITIONS, entity_id)
if not rows:
logger.info("No open positions for account %s", entity_id)
return ""
snapshot_at = datetime.now(timezone.utc)
positions = [
{
"ticker": row["ticker"],
"quantity": float(row["quantity"]),
"avg_entry_price": float(row["avg_entry_price"] or 0),
"close_price": float(row["current_price"] or 0),
"unrealized_pnl": float(row["unrealized_pnl"] or 0),
}
for row in rows
]
broker_account = rows[0]["broker_account"] if rows else ""
return publish_positions_daily_batch(
client=minio_client,
positions=positions,
broker_account=broker_account,
snapshot_at=snapshot_at,
)
async def publish_pnl_job(
pool: asyncpg.Pool,
minio_client: Minio,
entity_id: str,
) -> list[str]:
"""Publish daily PnL facts for a broker account's positions."""
rows = await pool.fetch(_FETCH_POSITIONS, entity_id)
if not rows:
logger.info("No positions for PnL snapshot, account %s", entity_id)
return []
now = datetime.now(timezone.utc)
refs: list[str] = []
for row in rows:
realized = float(row["realized_pnl"] or 0)
unrealized = float(row["unrealized_pnl"] or 0)
total = realized + unrealized
ref = publish_pnl_daily(
client=minio_client,
ticker=row["ticker"],
realized_pnl=realized,
unrealized_pnl=unrealized,
total_pnl=total,
broker_account=row["broker_account"],
dt=now,
execution_mode=row["execution_mode"],
)
refs.append(ref)
return refs
async def publish_bulk_documents_job(
pool: asyncpg.Pool,
minio_client: Minio,
since: datetime,
) -> list[str]:
"""Publish all documents created since a cutoff as a batch."""
rows = await pool.fetch(_FETCH_BULK_DOCUMENTS, since)
if not rows:
logger.info("No documents to bulk-publish since %s", since)
return []
doc_rows: list[dict[str, object]] = []
for row in rows:
published_at = row["published_at"] or row["retrieved_at"]
doc_rows.append({
"document_id": str(row["id"]),
"document_type": row["document_type"],
"source_type": row["source_type"],
"ticker": row["ticker"] or "",
"publisher": row["publisher"] or "",
"title": row["title"] or "",
"url": row["url"] or "",
"canonical_url": row["canonical_url"] or "",
"language": row["language"] or "en",
"published_at": published_at,
"retrieved_at": row["retrieved_at"],
"content_hash": row["content_hash"],
"confidence": float(row["parse_quality_score"] or 0.0),
**partition_values(published_at),
})
ref = publish_documents_batch(minio_client, doc_rows, since)
return [ref] if ref else []
async def publish_bulk_extractions_job(
pool: asyncpg.Pool,
minio_client: Minio,
since: datetime,
) -> list[str]:
"""Publish all extractions created since a cutoff as a batch."""
rows = await pool.fetch(_FETCH_BULK_EXTRACTIONS, since)
if not rows:
logger.info("No extractions to bulk-publish since %s", since)
return []
extraction_rows: list[dict[str, object]] = []
for row in rows:
model_ver = row["schema_version"] or row["prompt_version"] or ""
extraction_rows.append({
"document_id": str(row["document_id"]),
"ticker": row["ticker"],
"company_name": row["company_name"] or "",
"relevance": float(row["relevance"] or 0.0),
"sentiment": row["sentiment"] or "neutral",
"impact_score": float(row["impact_score"] or 0.0),
"impact_horizon": row["impact_horizon"] or "",
"catalyst_type": row["catalyst_type"] or "other",
"confidence": float(row["confidence"] or 0.0),
"novelty_score": float(row["novelty_score"] or 0.0),
"source_credibility": float(row["source_credibility"] or 0.0),
"key_facts": _jsonb_to_str(row["key_facts"]),
"risks": _jsonb_to_str(row["risks"]),
"macro_themes": _jsonb_to_str(row["macro_themes"]),
"model_name": row["model_name"] or "",
"prompt_version": row["prompt_version"] or "",
"schema_version": row["schema_version"] or "",
"extraction_at": row["extraction_at"],
**partition_values(row["extraction_at"], {"model_version": model_ver}),
})
model_ver = extraction_rows[0].get("model_version", "") if extraction_rows else ""
ref = publish_document_extractions_batch(
minio_client, extraction_rows, since,
model_version=str(model_ver),
)
return [ref] if ref else []
# ---------------------------------------------------------------------------
# Job dispatcher
# ---------------------------------------------------------------------------
JOB_TYPES = {
"document",
"document_extraction",
"market_snapshot",
"trade_order",
"trade_fill",
"positions_snapshot",
"pnl_snapshot",
"company_event",
"bulk_documents",
"bulk_extractions",
}
async def dispatch_job(
pool: asyncpg.Pool,
minio_client: Minio,
job: dict[str, str],
) -> dict[str, object]:
"""Dispatch a lake publish job to the appropriate handler.
Args:
pool: PostgreSQL connection pool.
minio_client: MinIO client for writing Parquet files.
job: Job dict with at least 'job_type' and 'entity_id'.
Returns:
A result dict with 'job_type', 'entity_id', 'refs' (list of s3 URIs),
and 'error' (None on success).
"""
job_type = job.get("job_type", "")
entity_id = job.get("entity_id", "")
since_str = job.get("since")
result: dict[str, object] = {
"job_type": job_type,
"entity_id": entity_id,
"refs": [],
"error": None,
}
try:
if job_type == "document":
ref = await publish_document_job(pool, minio_client, entity_id)
result["refs"] = [ref] if ref else []
elif job_type == "document_extraction":
refs = await publish_extraction_job(pool, minio_client, entity_id)
result["refs"] = refs
elif job_type == "market_snapshot":
refs = await publish_market_snapshot_job(pool, minio_client, entity_id)
result["refs"] = refs
elif job_type == "trade_order":
ref = await publish_order_job(pool, minio_client, entity_id)
result["refs"] = [ref] if ref else []
elif job_type == "trade_fill":
refs = await publish_fills_job(pool, minio_client, entity_id)
result["refs"] = refs
elif job_type == "positions_snapshot":
ref = await publish_positions_job(pool, minio_client, entity_id)
result["refs"] = [ref] if ref else []
elif job_type == "pnl_snapshot":
refs = await publish_pnl_job(pool, minio_client, entity_id)
result["refs"] = refs
elif job_type == "bulk_documents":
since = datetime.fromisoformat(since_str) if since_str else datetime.now(timezone.utc)
refs = await publish_bulk_documents_job(pool, minio_client, since)
result["refs"] = refs
elif job_type == "bulk_extractions":
since = datetime.fromisoformat(since_str) if since_str else datetime.now(timezone.utc)
refs = await publish_bulk_extractions_job(pool, minio_client, since)
result["refs"] = refs
else:
result["error"] = f"Unknown job_type: {job_type}"
logger.warning("Unknown lake publish job type: %s", job_type)
except Exception as exc:
result["error"] = str(exc)
logger.exception("Lake publish job failed: %s/%s", job_type, entity_id)
return result
# ---------------------------------------------------------------------------
# Async worker loop
# ---------------------------------------------------------------------------
async def run_worker(
pool: asyncpg.Pool,
rds: aioredis.Redis,
minio_client: Minio,
poll_interval: float = 2.0,
) -> None:
"""Main worker loop — reads jobs from Redis and dispatches them.
Runs indefinitely until cancelled. Each job is processed sequentially
to keep MinIO write ordering predictable.
"""
queue = queue_key(QUEUE_LAKE_PUBLISH)
logger.info("Lake publisher worker started, listening on %s", queue)
while True:
raw = await rds.lpop(queue) # type: ignore[misc]
if raw is None:
await asyncio.sleep(poll_interval)
continue
try:
job = json.loads(str(raw))
except (json.JSONDecodeError, TypeError):
logger.error("Invalid lake publish job payload: %s", raw)
continue
result = await dispatch_job(pool, minio_client, job)
refs = result.get("refs") or []
error = result.get("error")
if error:
logger.error(
"Lake publish job %s/%s failed: %s",
result["job_type"], result["entity_id"], error,
)
else:
ref_count = len(refs) if isinstance(refs, list) else 0
logger.info(
"Lake publish job %s/%s completed: %d facts written",
result["job_type"], result["entity_id"], ref_count,
)
async def main() -> None:
"""Entry point for the lake publisher worker process."""
config = load_config()
pool = await get_pg_pool(config)
rds = get_redis(config)
minio_client = get_minio(config)
try:
await run_worker(pool, rds, minio_client)
finally:
await pool.close()
await rds.close()
if __name__ == "__main__":
cfg = load_config()
setup_logging("lake_publisher", level=cfg.log_level, json_output=cfg.json_logs)
asyncio.run(main())
+128
View File
@@ -0,0 +1,128 @@
"""Hive-compatible partition layout conventions for the MinIO lakehouse.
Centralizes partition path generation, partition column injection, and
bucket provisioning so that all lake publisher writers produce layouts
that Trino's Hive and Iceberg connectors can discover and prune.
Design ref: Section 5.2, 5.3 (Lakehouse model)
Requirements: 9.4, 9.5, N4, N6
Layout convention:
s3://stonks-lakehouse/warehouse/{table_name}/dt={YYYY-MM-DD}[/{extra_key}={value}]/part-{uuid}.parquet
Rules:
- Every fact table is partitioned by ``dt`` (DATE) derived from the row timestamp.
- Some tables have a second partition key (e.g. ``model_version``).
- Partition columns MUST appear in the Parquet file so Trino can read them
without relying solely on path parsing.
- File names use a UUID suffix to avoid collisions on concurrent writes.
"""
from __future__ import annotations
import uuid
from dataclasses import dataclass, field
from datetime import date, datetime, timezone
LAKEHOUSE_BUCKET = "stonks-lakehouse"
WAREHOUSE_PREFIX = "warehouse"
@dataclass(frozen=True)
class PartitionSpec:
"""Describes the partition layout for a single fact table."""
table_name: str
extra_keys: tuple[str, ...] = field(default_factory=tuple)
@property
def all_keys(self) -> tuple[str, ...]:
"""Return all partition keys in order (dt first, then extras)."""
return ("dt", *self.extra_keys)
# Registry of every analytical fact table and its partition keys.
# This is the single source of truth — DDL, publisher, and tests should agree.
TABLE_PARTITIONS: dict[str, PartitionSpec] = {
"market_bars": PartitionSpec("market_bars"),
"market_quotes": PartitionSpec("market_quotes"),
"company_events": PartitionSpec("company_events"),
"documents": PartitionSpec("documents"),
"document_extractions": PartitionSpec("document_extractions", extra_keys=("model_version",)),
"trade_signals": PartitionSpec("trade_signals"),
"trade_orders": PartitionSpec("trade_orders"),
"trade_fills": PartitionSpec("trade_fills"),
"positions_daily": PartitionSpec("positions_daily"),
"pnl_daily": PartitionSpec("pnl_daily"),
"prediction_vs_outcome": PartitionSpec("prediction_vs_outcome", extra_keys=("model_version",)),
"model_performance": PartitionSpec("model_performance", extra_keys=("model_version",)),
}
def partition_path(
table_name: str,
dt: datetime | date,
extra_partitions: dict[str, str] | None = None,
file_id: str | None = None,
) -> str:
"""Build a Hive-compatible object path for a Parquet file.
Args:
table_name: Logical fact table name (must be in TABLE_PARTITIONS).
dt: Row timestamp or date used to derive the ``dt=`` partition.
extra_partitions: Additional partition key/value pairs (e.g. model_version).
file_id: Optional override for the file suffix (defaults to a UUID4).
Returns:
Object key relative to the bucket root, e.g.
``warehouse/trade_signals/dt=2026-04-11/part-<uuid>.parquet``
"""
spec = TABLE_PARTITIONS.get(table_name)
if spec is None:
raise ValueError(f"Unknown table: {table_name}. Register it in TABLE_PARTITIONS.")
if isinstance(dt, datetime):
dt_str = dt.strftime("%Y-%m-%d")
else:
dt_str = dt.isoformat()
segments = [WAREHOUSE_PREFIX, table_name, f"dt={dt_str}"]
# Append extra partition directories in the order declared by the spec.
extras = extra_partitions or {}
for key in spec.extra_keys:
value = extras.get(key, "__NONE__")
segments.append(f"{key}={value}")
suffix = file_id or uuid.uuid4().hex[:16]
segments.append(f"part-{suffix}.parquet")
return "/".join(segments)
def partition_values(
dt: datetime | date,
extra_partitions: dict[str, str] | None = None,
) -> dict[str, object]:
"""Return partition column values to inject into Parquet row data.
Trino's Hive connector can read partition values from the directory path,
but embedding them in the Parquet file as well ensures compatibility with
engines that don't parse Hive paths (e.g. plain PyArrow reads, DuckDB).
Returns a dict like ``{"dt": date(2026, 4, 11), "model_version": "v2"}``.
"""
if isinstance(dt, datetime):
dt_date = dt.date()
else:
dt_date = dt
values: dict[str, object] = {"dt": dt_date}
if extra_partitions:
values.update(extra_partitions)
return values
def s3_uri(path: str) -> str:
"""Build an s3:// URI from a bucket-relative object path."""
return f"s3://{LAKEHOUSE_BUCKET}/{path}"
File diff suppressed because it is too large Load Diff