phase 14-15: docker build validation and helm deployment
This commit is contained in:
@@ -1 +1 @@
|
||||
# Lake Publisher - transforms operational data into analytical fact datasets
|
||||
"""Lake publisher — writes partitioned Parquet facts to MinIO for Trino/Superset."""
|
||||
|
||||
@@ -0,0 +1,39 @@
|
||||
"""Helpers for enqueuing lake publish jobs from upstream workers.
|
||||
|
||||
Other services import these helpers to push jobs onto the QUEUE_LAKE_PUBLISH
|
||||
Redis queue. The lake publisher worker (jobs.py) consumes them.
|
||||
|
||||
Usage:
|
||||
await enqueue_lake_job(rds, "document", document_id)
|
||||
await enqueue_lake_job(rds, "trade_order", order_id)
|
||||
await enqueue_lake_job(rds, "bulk_documents", since=cutoff.isoformat())
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
|
||||
import redis.asyncio as aioredis
|
||||
|
||||
from services.shared.redis_keys import QUEUE_LAKE_PUBLISH, queue_key
|
||||
|
||||
|
||||
async def enqueue_lake_job(
    rds: aioredis.Redis,
    job_type: str,
    entity_id: str = "",
    since: str | None = None,
) -> None:
    """Serialize a lake publish job and append it to the publish queue.

    The job is encoded as a JSON object and pushed with RPUSH onto the list
    named by ``queue_key(QUEUE_LAKE_PUBLISH)``.

    Args:
        rds: Async Redis client used for the push.
        job_type: Job kind, e.g. document, document_extraction,
            market_snapshot, trade_order, trade_fill, positions_snapshot,
            pnl_snapshot, bulk_documents, bulk_extractions.
        entity_id: Identifier of the entity to publish; defaults to "".
        since: ISO datetime cutoff for bulk jobs. It is only added to the
            message when truthy.
    """
    message: dict[str, str] = {"job_type": job_type, "entity_id": entity_id}
    if since:
        message["since"] = since

    target_queue = queue_key(QUEUE_LAKE_PUBLISH)
    encoded = json.dumps(message)
    await rds.rpush(target_queue, encoded)  # type: ignore[misc]
|
||||
@@ -0,0 +1,420 @@
|
||||
"""Iceberg table creation and metadata management for analytical datasets.
|
||||
|
||||
Manages Iceberg tables in Trino's Iceberg catalog, providing:
|
||||
- Table creation with proper schemas and partition specs
|
||||
- Schema synchronization between PyArrow definitions and Iceberg tables
|
||||
- Table metadata inspection (existence checks, schema retrieval, partition listing)
|
||||
|
||||
The Iceberg catalog complements the existing Hive-compatible partition layout.
|
||||
Parquet files written by the lake publisher are stored in the same MinIO paths,
|
||||
but Iceberg metadata enables schema evolution, snapshot isolation, and better
|
||||
partition pruning via Trino's Iceberg connector.
|
||||
|
||||
Requirements: 9.4, 9.5, 10.1, N4, N6
|
||||
Design ref: Section 5.3 (Lakehouse model), Section 4.12 (SQL Query Engine)
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
from dataclasses import dataclass
|
||||
from typing import Any
|
||||
|
||||
import pyarrow as pa
|
||||
from trino.dbapi import connect as trino_connect
|
||||
|
||||
from services.lake_publisher.partitions import (
|
||||
LAKEHOUSE_BUCKET,
|
||||
TABLE_PARTITIONS,
|
||||
WAREHOUSE_PREFIX,
|
||||
PartitionSpec,
|
||||
)
|
||||
from services.lake_publisher.worker import (
|
||||
COMPANY_EVENTS_SCHEMA,
|
||||
DOCUMENTS_SCHEMA,
|
||||
DOCUMENT_EXTRACTIONS_SCHEMA,
|
||||
MARKET_BARS_SCHEMA,
|
||||
MARKET_QUOTES_SCHEMA,
|
||||
MODEL_PERFORMANCE_SCHEMA,
|
||||
PNL_DAILY_SCHEMA,
|
||||
POSITIONS_DAILY_SCHEMA,
|
||||
PREDICTION_VS_OUTCOME_SCHEMA,
|
||||
TRADE_FILLS_SCHEMA,
|
||||
TRADE_ORDERS_SCHEMA,
|
||||
TRADE_SIGNALS_SCHEMA,
|
||||
)
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Default Trino catalog name for the Iceberg connector.
ICEBERG_CATALOG = "iceberg"
# Default schema (namespace) used when qualifying table names.
ICEBERG_SCHEMA = "stonks"
|
||||
|
||||
|
||||
def _get_iceberg_catalog() -> str:
    """Resolve the Iceberg catalog name.

    Reads the ``TRINO_ICEBERG_CATALOG`` environment variable and falls back
    to ``ICEBERG_CATALOG`` when the variable is unset.
    """
    from os import environ

    return environ.get("TRINO_ICEBERG_CATALOG", ICEBERG_CATALOG)
|
||||
|
||||
# Map PyArrow types to Trino/Iceberg SQL types.
|
||||
_ARROW_TO_TRINO: dict[str, str] = {
|
||||
"string": "VARCHAR",
|
||||
"utf8": "VARCHAR",
|
||||
"large_string": "VARCHAR",
|
||||
"large_utf8": "VARCHAR",
|
||||
"float64": "DOUBLE",
|
||||
"double": "DOUBLE",
|
||||
"float32": "REAL",
|
||||
"float": "REAL",
|
||||
"int8": "TINYINT",
|
||||
"int16": "SMALLINT",
|
||||
"int32": "INTEGER",
|
||||
"int64": "BIGINT",
|
||||
"bool": "BOOLEAN",
|
||||
"date32": "DATE",
|
||||
"date32[day]": "DATE",
|
||||
"date64": "DATE",
|
||||
}
|
||||
|
||||
|
||||
def _arrow_type_to_trino(arrow_type: pa.DataType) -> str:
    """Translate a PyArrow data type into the Trino SQL type used in DDL.

    Resolution order: timestamps (matched on the type's string form), then
    the ``_ARROW_TO_TRINO`` lookup table, then PyArrow type-category
    predicates.

    Raises:
        ValueError: If no mapping applies to ``arrow_type``.
    """
    rendered = str(arrow_type)

    # Timestamps are matched by prefix and checked for a "tz=" marker, so
    # they are resolved before the exact-match table.
    if rendered.startswith("timestamp"):
        return "TIMESTAMP(6) WITH TIME ZONE" if "tz=" in rendered else "TIMESTAMP(6)"

    mapped = _ARROW_TO_TRINO.get(rendered)
    if mapped:
        return mapped

    # Category-based fallback for types whose string form is not in the table.
    category_rules = (
        (lambda t: pa.types.is_string(t) or pa.types.is_large_string(t), "VARCHAR"),
        (pa.types.is_floating, "DOUBLE"),
        (pa.types.is_integer, "BIGINT"),
        (pa.types.is_boolean, "BOOLEAN"),
        (pa.types.is_date, "DATE"),
        (pa.types.is_timestamp, "TIMESTAMP(6) WITH TIME ZONE"),
    )
    for matches, trino_type in category_rules:
        if matches(arrow_type):
            return trino_type

    raise ValueError(f"Unsupported PyArrow type for Iceberg DDL: {arrow_type}")
|
||||
|
||||
|
||||
|
||||
# Table name -> PyArrow schema for every analytical table that has one.
TABLE_SCHEMAS: dict[str, pa.Schema] = dict(
    market_bars=MARKET_BARS_SCHEMA,
    market_quotes=MARKET_QUOTES_SCHEMA,
    company_events=COMPANY_EVENTS_SCHEMA,
    documents=DOCUMENTS_SCHEMA,
    document_extractions=DOCUMENT_EXTRACTIONS_SCHEMA,
    trade_signals=TRADE_SIGNALS_SCHEMA,
    trade_orders=TRADE_ORDERS_SCHEMA,
    trade_fills=TRADE_FILLS_SCHEMA,
    positions_daily=POSITIONS_DAILY_SCHEMA,
    pnl_daily=PNL_DAILY_SCHEMA,
    prediction_vs_outcome=PREDICTION_VS_OUTCOME_SCHEMA,
    model_performance=MODEL_PERFORMANCE_SCHEMA,
)
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class IcebergTableDef:
    """Immutable description of one Iceberg table: name, PyArrow schema, partition spec."""

    table_name: str
    schema: pa.Schema
    partition_spec: PartitionSpec

    @property
    def qualified_name(self) -> str:
        """``catalog.schema.table`` name built from the module-level defaults."""
        return f"{ICEBERG_CATALOG}.{ICEBERG_SCHEMA}.{self.table_name}"

    @property
    def location(self) -> str:
        """``s3a://`` URI of the table directory under the warehouse prefix."""
        return f"s3a://{LAKEHOUSE_BUCKET}/{WAREHOUSE_PREFIX}/{self.table_name}/"

    def column_defs_sql(self) -> list[str]:
        """Render one ``name TYPE`` SQL fragment per field of the PyArrow schema.

        Every schema field is emitted, partition columns included.
        """
        fields = (self.schema.field(index) for index in range(len(self.schema)))
        return [f" {field.name} {_arrow_type_to_trino(field.type)}" for field in fields]

    def partition_keys_sql(self) -> str:
        """Render the ``partitioning = ARRAY[...]`` table property, or "" without keys."""
        partition_keys = list(self.partition_spec.all_keys)
        if partition_keys:
            rendered_keys = ", ".join(f"'{key}'" for key in partition_keys)
            return f"partitioning = ARRAY[{rendered_keys}]"
        return ""

    def create_table_sql(self) -> str:
        """Build the ``CREATE TABLE IF NOT EXISTS`` statement for this table."""
        column_block = ",\n".join(self.column_defs_sql())

        properties = ["format = 'PARQUET'", f"location = '{self.location}'"]
        partitioning = self.partition_keys_sql()
        if partitioning:
            properties.append(partitioning)
        property_block = ",\n ".join(properties)

        statement_lines = [
            f"CREATE TABLE IF NOT EXISTS {self.qualified_name} (",
            column_block,
            ") WITH (",
            f" {property_block}",
            ")",
        ]
        return "\n".join(statement_lines)
|
||||
|
||||
|
||||
def get_all_table_defs() -> list[IcebergTableDef]:
    """Return an IcebergTableDef for each partitioned table that has a PyArrow schema.

    Tables present in ``TABLE_PARTITIONS`` but missing from ``TABLE_SCHEMAS``
    are skipped with a warning.
    """
    table_defs: list[IcebergTableDef] = []
    for name, spec in TABLE_PARTITIONS.items():
        arrow_schema = TABLE_SCHEMAS.get(name)
        if arrow_schema is not None:
            table_defs.append(
                IcebergTableDef(table_name=name, schema=arrow_schema, partition_spec=spec)
            )
        else:
            logger.warning("No PyArrow schema for table %s, skipping", name)
    return table_defs
|
||||
|
||||
|
||||
def get_table_def(table_name: str) -> IcebergTableDef:
    """Look up the IcebergTableDef for ``table_name``.

    Raises:
        ValueError: If the table has no partition spec in ``TABLE_PARTITIONS``
            or no schema in ``TABLE_SCHEMAS``.
    """
    if table_name not in TABLE_PARTITIONS:
        raise ValueError(f"Unknown table: {table_name}")

    arrow_schema = TABLE_SCHEMAS.get(table_name)
    if arrow_schema is None:
        raise ValueError(f"No PyArrow schema registered for table: {table_name}")

    spec = TABLE_PARTITIONS[table_name]
    return IcebergTableDef(table_name=table_name, schema=arrow_schema, partition_spec=spec)
|
||||
|
||||
|
||||
|
||||
@dataclass
class IcebergManager:
    """Manage Iceberg tables through Trino's Iceberg connector.

    Provides table creation, existence checks, schema inspection and
    metadata queries. Every operation opens a fresh Trino DBAPI connection,
    runs one statement and closes the connection again.
    """

    # Trino coordinator connection settings.
    host: str = "localhost"
    port: int = 8080
    user: str = "stonks"
    # Catalog and schema used for the session and for metadata queries.
    catalog: str = ICEBERG_CATALOG
    schema: str = ICEBERG_SCHEMA

    def _get_connection(self) -> Any:
        """Create a Trino DBAPI connection bound to this manager's catalog/schema."""
        return trino_connect(
            host=self.host,
            port=self.port,
            user=self.user,
            catalog=self.catalog,
            schema=self.schema,
        )

    def _qualified(self, table_name: str, metadata_suffix: str = "") -> str:
        """Return ``catalog.schema.table`` for this manager.

        With ``metadata_suffix`` (e.g. ``"snapshots"``) the table part becomes
        the quoted metadata-table identifier ``"table$suffix"``. Only that
        last part is quoted, so catalog and schema are still resolved as
        separate name components.
        """
        if metadata_suffix:
            return f'{self.catalog}.{self.schema}."{table_name}${metadata_suffix}"'
        return f"{self.catalog}.{self.schema}.{table_name}"

    def _execute(self, sql: str) -> list[list[Any]]:
        """Execute a SQL statement and return all rows."""
        conn = self._get_connection()
        try:
            cursor = conn.cursor()
            cursor.execute(sql)
            return cursor.fetchall()
        finally:
            conn.close()

    def _execute_no_fetch(self, sql: str) -> None:
        """Execute a DDL statement whose result rows are not needed."""
        conn = self._get_connection()
        try:
            cursor = conn.cursor()
            cursor.execute(sql)
            # DDL statements in Trino still need fetchall to complete.
            try:
                cursor.fetchall()
            except Exception:
                # Still best-effort (no raise), but no longer silent: a failure
                # surfacing while draining the result is now visible in logs.
                logger.warning(
                    "Error while draining result of statement: %s", sql, exc_info=True
                )
        finally:
            conn.close()

    def ensure_schema(self) -> None:
        """Create the Iceberg schema if it doesn't exist."""
        sql = f"CREATE SCHEMA IF NOT EXISTS {self.catalog}.{self.schema}"
        logger.info("Ensuring Iceberg schema: %s.%s", self.catalog, self.schema)
        self._execute_no_fetch(sql)

    def table_exists(self, table_name: str) -> bool:
        """Return True when ``table_name`` is listed in information_schema.tables."""
        sql = (
            f"SELECT table_name FROM {self.catalog}.information_schema.tables "
            f"WHERE table_schema = '{self.schema}' AND table_name = '{table_name}'"
        )
        rows = self._execute(sql)
        return len(rows) > 0

    def create_table(self, table_name: str) -> bool:
        """Create a single Iceberg table if it doesn't exist.

        Returns True if the table was created, False if it already existed.

        Raises:
            ValueError: If ``table_name`` is not a registered table.
        """
        table_def = get_table_def(table_name)
        # Checked before the DDL so the return value can tell "created" from
        # "already there"; the DDL itself stays idempotent (IF NOT EXISTS).
        already_existed = self.table_exists(table_name)
        # NOTE(review): the DDL targets table_def.qualified_name, which is
        # built from the module-level catalog/schema defaults rather than
        # self.catalog/self.schema — confirm this is intended for managers
        # constructed with non-default values.
        ddl = table_def.create_table_sql()
        logger.info("Creating Iceberg table: %s", table_def.qualified_name)
        self._execute_no_fetch(ddl)
        logger.info("Iceberg table ready: %s", table_def.qualified_name)
        return not already_existed

    def create_all_tables(self) -> dict[str, bool]:
        """Create all registered Iceberg tables.

        Returns a dict mapping table_name -> True (no error) or False (error).
        """
        self.ensure_schema()
        results: dict[str, bool] = {}
        for table_def in get_all_table_defs():
            try:
                self.create_table(table_def.table_name)
                results[table_def.table_name] = True
            except Exception:
                logger.exception("Failed to create Iceberg table: %s", table_def.table_name)
                results[table_def.table_name] = False
        return results

    def get_table_schema(self, table_name: str) -> list[dict[str, str]]:
        """Retrieve the column schema of an Iceberg table from Trino.

        Returns a list of dicts with 'column_name', 'data_type', and
        'is_nullable', ordered by ordinal position.
        """
        sql = (
            f"SELECT column_name, data_type, is_nullable "
            f"FROM {self.catalog}.information_schema.columns "
            f"WHERE table_schema = '{self.schema}' AND table_name = '{table_name}' "
            f"ORDER BY ordinal_position"
        )
        rows = self._execute(sql)
        return [
            {"column_name": r[0], "data_type": r[1], "is_nullable": r[2]}
            for r in rows
        ]

    def get_table_snapshots(self, table_name: str) -> list[dict[str, Any]]:
        """List Iceberg snapshots for a table (useful for auditing and rollback).

        Reads the ``$snapshots`` metadata table. Returns [] when the query
        fails; the failure is logged at debug level.
        """
        # Columns are selected by name so each dict key maps to the matching
        # column regardless of the metadata table's column order.
        sql = (
            "SELECT snapshot_id, parent_id, operation, manifest_list, summary "
            f"FROM {self._qualified(table_name, 'snapshots')}"
        )
        try:
            rows = self._execute(sql)
        except Exception:
            logger.debug(
                "Could not read snapshots for %s (table may be empty)",
                table_name,
                exc_info=True,
            )
            return []
        return [
            {
                "snapshot_id": r[0],
                "parent_id": r[1],
                "operation": r[2],
                "manifest_list": r[3],
                "summary": r[4],
            }
            for r in rows
        ]

    def get_table_partitions(self, table_name: str) -> list[dict[str, Any]]:
        """List partition values for an Iceberg table.

        Reads the ``$partitions`` metadata table and wraps each raw result
        row as ``{"row": row}``. Returns [] when the query fails; the failure
        is logged at debug level.
        """
        sql = f"SELECT * FROM {self._qualified(table_name, 'partitions')}"
        try:
            rows = self._execute(sql)
        except Exception:
            logger.debug(
                "Could not read partitions for %s (table may be empty)",
                table_name,
                exc_info=True,
            )
            return []
        return [{"row": r} for r in rows]

    def list_tables(self) -> list[str]:
        """List all tables in the Iceberg schema, sorted by name."""
        sql = (
            f"SELECT table_name FROM {self.catalog}.information_schema.tables "
            f"WHERE table_schema = '{self.schema}' ORDER BY table_name"
        )
        rows = self._execute(sql)
        return [r[0] for r in rows]

    def drop_table(self, table_name: str) -> None:
        """Drop an Iceberg table (for testing/reset purposes)."""
        qualified = self._qualified(table_name)
        logger.warning("Dropping Iceberg table: %s", qualified)
        self._execute_no_fetch(f"DROP TABLE IF EXISTS {qualified}")

    def sync_table_schema(self, table_name: str) -> list[str]:
        """Add columns that exist in the PyArrow schema but not in the Iceberg table.

        Compares the registered PyArrow schema with the columns Trino reports
        and issues one ``ALTER TABLE ... ADD COLUMN`` per missing column.
        Columns are never dropped. Returns the names of the added columns.

        Raises:
            ValueError: If ``table_name`` is not a registered table.
        """
        table_def = get_table_def(table_name)
        existing_names = {col["column_name"] for col in self.get_table_schema(table_name)}

        # Alter the same catalog/schema the existing columns were read from.
        qualified = self._qualified(table_name)
        added: list[str] = []
        for field in table_def.schema:
            if field.name in existing_names:
                continue
            trino_type = _arrow_type_to_trino(field.type)
            alter_sql = f"ALTER TABLE {qualified} ADD COLUMN {field.name} {trino_type}"
            logger.info("Adding column %s to %s", field.name, qualified)
            self._execute_no_fetch(alter_sql)
            added.append(field.name)
        return added

    def sync_all_schemas(self) -> dict[str, list[str]]:
        """Sync schemas for all registered tables. Returns table_name -> added columns.

        Tables that do not exist yet, and tables whose sync raised, map to [].
        """
        results: dict[str, list[str]] = {}
        for table_def in get_all_table_defs():
            name = table_def.table_name
            try:
                if self.table_exists(name):
                    results[name] = self.sync_table_schema(name)
                else:
                    logger.info("Table %s doesn't exist yet, skipping sync", name)
                    results[name] = []
            except Exception:
                logger.exception("Failed to sync schema for %s", name)
                results[name] = []
        return results
|
||||
|
||||
|
||||
def create_iceberg_manager_from_config(
    host: str = "localhost",
    port: int = 8080,
    user: str = "stonks",
) -> IcebergManager:
    """Build an IcebergManager from explicit Trino connection parameters.

    Catalog and schema are left at the IcebergManager defaults.
    """
    connection_params = {"host": host, "port": port, "user": user}
    return IcebergManager(**connection_params)
|
||||
@@ -0,0 +1,673 @@
|
||||
"""Lake publisher async job runner — transforms operational data into analytical facts.
|
||||
|
||||
Reads jobs from the QUEUE_LAKE_PUBLISH Redis queue, queries PostgreSQL for
|
||||
operational records, and publishes them as partitioned Parquet files to MinIO
|
||||
via the existing publish_* functions in worker.py.
|
||||
|
||||
Job message format:
|
||||
{"job_type": "<table_name>", "entity_id": "<uuid or ticker>", "dt": "2026-04-11T..."}
|
||||
|
||||
Supported job types:
|
||||
- document: publish a single document metadata fact
|
||||
- document_extraction: publish extraction facts for a document
|
||||
- market_snapshot: publish market bars/quotes from a snapshot
|
||||
- trade_order: publish an order fact
|
||||
- trade_fill: publish fill facts for an order
|
||||
- positions_snapshot: publish daily position snapshots for a broker account
|
||||
- pnl_snapshot: publish daily PnL for a broker account
|
||||
- company_event: publish a company event fact
|
||||
- bulk_documents: publish all unpublished documents since a cutoff
|
||||
- bulk_extractions: publish all unpublished extractions since a cutoff
|
||||
|
||||
Requirements: 9.4, 9.5, 10.1
|
||||
Design ref: Section 4.10 (Lake Publisher), Section 8.4 (Lake publication flow)
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
import json
|
||||
import logging
|
||||
from datetime import datetime, timezone
|
||||
|
||||
import asyncpg
|
||||
import redis.asyncio as aioredis
|
||||
from minio import Minio
|
||||
|
||||
from services.lake_publisher.worker import (
|
||||
publish_document_extraction,
|
||||
publish_document_fact,
|
||||
publish_market_bar,
|
||||
publish_market_quote,
|
||||
publish_trade_order,
|
||||
publish_trade_fill,
|
||||
publish_pnl_daily,
|
||||
publish_documents_batch,
|
||||
publish_document_extractions_batch,
|
||||
publish_positions_daily_batch,
|
||||
)
|
||||
from services.lake_publisher.partitions import partition_values
|
||||
from services.shared.config import load_config
|
||||
from services.shared.db import get_minio, get_pg_pool, get_redis
|
||||
from services.shared.logging import setup_logging
|
||||
from services.shared.redis_keys import QUEUE_LAKE_PUBLISH, queue_key
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# SQL queries for fetching operational data
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
_FETCH_DOCUMENT = """
|
||||
SELECT
|
||||
d.id, d.document_type, d.source_type, d.publisher, d.title,
|
||||
d.url, d.canonical_url, d.language, d.published_at, d.retrieved_at,
|
||||
d.content_hash, d.parse_quality_score,
|
||||
COALESCE(
|
||||
(SELECT dcm.ticker FROM document_company_mentions dcm
|
||||
WHERE dcm.document_id = d.id LIMIT 1),
|
||||
''
|
||||
) AS ticker
|
||||
FROM documents d
|
||||
WHERE d.id = $1::uuid
|
||||
"""
|
||||
|
||||
_FETCH_EXTRACTIONS = """
|
||||
SELECT
|
||||
di.document_id, dir.ticker, dir.relevance, dir.sentiment,
|
||||
dir.impact_score, dir.impact_horizon, dir.catalyst_type,
|
||||
di.confidence, di.novelty_score, di.source_credibility,
|
||||
dir.key_facts, dir.risks, di.macro_themes,
|
||||
di.model_name, di.prompt_version, di.schema_version,
|
||||
di.created_at AS extraction_at,
|
||||
COALESCE(c.legal_name, '') AS company_name
|
||||
FROM document_intelligence di
|
||||
JOIN document_impact_records dir ON dir.intelligence_id = di.id
|
||||
LEFT JOIN companies c ON c.id = dir.company_id
|
||||
WHERE di.document_id = $1::uuid
|
||||
AND di.validation_status = 'valid'
|
||||
"""
|
||||
|
||||
_FETCH_MARKET_SNAPSHOT = """
|
||||
SELECT
|
||||
ms.ticker, ms.snapshot_type, ms.data, ms.source_provider, ms.captured_at
|
||||
FROM market_snapshots ms
|
||||
WHERE ms.id = $1::uuid
|
||||
"""
|
||||
|
||||
_FETCH_ORDER = """
|
||||
SELECT
|
||||
o.id, o.recommendation_id, o.ticker, o.side, o.order_type,
|
||||
o.quantity, o.limit_price, o.status, o.submitted_at,
|
||||
o.fill_price, o.fill_quantity, o.filled_at,
|
||||
COALESCE(ba.account_id, '') AS broker_account,
|
||||
COALESCE(ba.mode, 'paper') AS execution_mode
|
||||
FROM orders o
|
||||
LEFT JOIN broker_accounts ba ON ba.id = o.broker_account_id
|
||||
WHERE o.id = $1::uuid
|
||||
"""
|
||||
|
||||
_FETCH_ORDER_FILLS = """
|
||||
SELECT
|
||||
oe.id AS fill_id, oe.order_id, oe.data, oe.broker_timestamp,
|
||||
o.ticker, o.side,
|
||||
COALESCE(ba.account_id, '') AS broker_account
|
||||
FROM order_events oe
|
||||
JOIN orders o ON o.id = oe.order_id
|
||||
LEFT JOIN broker_accounts ba ON ba.id = o.broker_account_id
|
||||
WHERE oe.order_id = $1::uuid AND oe.event_type = 'fill'
|
||||
"""
|
||||
|
||||
_FETCH_POSITIONS = """
|
||||
SELECT
|
||||
p.ticker, p.quantity, p.avg_entry_price, p.current_price,
|
||||
p.unrealized_pnl, p.realized_pnl,
|
||||
COALESCE(ba.account_id, '') AS broker_account,
|
||||
COALESCE(ba.mode, 'paper') AS execution_mode
|
||||
FROM positions p
|
||||
LEFT JOIN broker_accounts ba ON ba.id = p.broker_account_id
|
||||
WHERE p.broker_account_id = $1::uuid AND p.quantity != 0
|
||||
"""
|
||||
|
||||
_FETCH_BULK_DOCUMENTS = """
|
||||
SELECT
|
||||
d.id, d.document_type, d.source_type, d.publisher, d.title,
|
||||
d.url, d.canonical_url, d.language, d.published_at, d.retrieved_at,
|
||||
d.content_hash, d.parse_quality_score,
|
||||
COALESCE(
|
||||
(SELECT dcm.ticker FROM document_company_mentions dcm
|
||||
WHERE dcm.document_id = d.id LIMIT 1),
|
||||
''
|
||||
) AS ticker
|
||||
FROM documents d
|
||||
WHERE d.created_at >= $1
|
||||
AND d.status IN ('parsed', 'extracted')
|
||||
ORDER BY d.created_at
|
||||
LIMIT 500
|
||||
"""
|
||||
|
||||
_FETCH_BULK_EXTRACTIONS = """
|
||||
SELECT
|
||||
di.document_id, dir.ticker, dir.relevance, dir.sentiment,
|
||||
dir.impact_score, dir.impact_horizon, dir.catalyst_type,
|
||||
di.confidence, di.novelty_score, di.source_credibility,
|
||||
dir.key_facts, dir.risks, di.macro_themes,
|
||||
di.model_name, di.prompt_version, di.schema_version,
|
||||
di.created_at AS extraction_at,
|
||||
COALESCE(c.legal_name, '') AS company_name
|
||||
FROM document_intelligence di
|
||||
JOIN document_impact_records dir ON dir.intelligence_id = di.id
|
||||
LEFT JOIN companies c ON c.id = dir.company_id
|
||||
WHERE di.created_at >= $1
|
||||
AND di.validation_status = 'valid'
|
||||
ORDER BY di.created_at
|
||||
LIMIT 500
|
||||
"""
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Job handlers — each transforms operational rows into lake facts
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def _jsonb_to_str(val: object) -> str:
|
||||
"""Convert a JSONB column value (list or str) to a comma-separated string."""
|
||||
if val is None:
|
||||
return ""
|
||||
if isinstance(val, str):
|
||||
try:
|
||||
parsed = json.loads(val)
|
||||
if isinstance(parsed, list):
|
||||
return ", ".join(str(x) for x in parsed)
|
||||
return val
|
||||
except (json.JSONDecodeError, TypeError):
|
||||
return val
|
||||
if isinstance(val, list):
|
||||
return ", ".join(str(x) for x in val)
|
||||
return str(val)
|
||||
|
||||
|
||||
async def publish_document_job(
    pool: asyncpg.Pool,
    minio_client: Minio,
    entity_id: str,
) -> str:
    """Load one document row and publish it as a document metadata fact.

    Args:
        pool: PostgreSQL pool used to run ``_FETCH_DOCUMENT``.
        minio_client: MinIO client handed to ``publish_document_fact``.
        entity_id: Document id bound to the query parameter.

    Returns:
        Whatever ``publish_document_fact`` returns, or "" when no row matches.
    """
    record = await pool.fetchrow(_FETCH_DOCUMENT, entity_id)
    if record is None:
        logger.warning("Document %s not found, skipping lake publish", entity_id)
        return ""

    # Nullable text columns are normalized to "" (language to "en").
    fact = {
        "document_id": str(record["id"]),
        "document_type": record["document_type"],
        "source_type": record["source_type"],
        "ticker": record["ticker"] or "",
        "publisher": record["publisher"] or "",
        "title": record["title"] or "",
        # Fall back to the retrieval time when no publication time is stored.
        "published_at": record["published_at"] or record["retrieved_at"],
        "content_hash": record["content_hash"],
        "url": record["url"] or "",
        "canonical_url": record["canonical_url"] or "",
        "language": record["language"] or "en",
        "confidence": float(record["parse_quality_score"] or 0.0),
        "retrieved_at": record["retrieved_at"],
    }
    return publish_document_fact(client=minio_client, **fact)
|
||||
|
||||
|
||||
async def publish_extraction_job(
    pool: asyncpg.Pool,
    minio_client: Minio,
    entity_id: str,
) -> list[str]:
    """Publish one extraction fact per row returned by ``_FETCH_EXTRACTIONS``.

    Args:
        pool: PostgreSQL pool used to run the query.
        minio_client: MinIO client handed to ``publish_document_extraction``.
        entity_id: Document id bound to the query parameter.

    Returns:
        The ``publish_document_extraction`` results in row order; [] when the
        query returns no rows.
    """
    records = await pool.fetch(_FETCH_EXTRACTIONS, entity_id)
    if not records:
        logger.info("No valid extractions for document %s", entity_id)
        return []

    # Missing numeric values become 0.0 and missing text values a fixed default.
    return [
        publish_document_extraction(
            client=minio_client,
            document_id=str(rec["document_id"]),
            ticker=rec["ticker"],
            sentiment=rec["sentiment"] or "neutral",
            impact_score=float(rec["impact_score"] or 0.0),
            catalyst_type=rec["catalyst_type"] or "other",
            confidence=float(rec["confidence"] or 0.0),
            extraction_at=rec["extraction_at"],
            model_name=rec["model_name"] or "",
            prompt_version=rec["prompt_version"] or "",
            company_name=rec["company_name"] or "",
            relevance=float(rec["relevance"] or 0.0),
            impact_horizon=rec["impact_horizon"] or "",
            novelty_score=float(rec["novelty_score"] or 0.0),
            source_credibility=float(rec["source_credibility"] or 0.0),
            key_facts=_jsonb_to_str(rec["key_facts"]),
            risks=_jsonb_to_str(rec["risks"]),
            macro_themes=_jsonb_to_str(rec["macro_themes"]),
            schema_version=rec["schema_version"] or "",
        )
        for rec in records
    ]
|
||||
|
||||
|
||||
async def publish_market_snapshot_job(
    pool: asyncpg.Pool,
    minio_client: Minio,
    entity_id: str,
) -> list[str]:
    """Publish market bar/quote facts from a market_snapshots row.

    Snapshot types "bar"/"bars" publish one bar fact per entry (the payload's
    "bars" list when present, otherwise the payload itself). Types
    "quote"/"quotes" publish one quote fact. Any other type publishes nothing
    and is logged as a warning.

    Args:
        pool: PostgreSQL pool used to run ``_FETCH_MARKET_SNAPSHOT``.
        minio_client: MinIO client handed to the publish functions.
        entity_id: Snapshot id bound to the query parameter.

    Returns:
        The publish results, or [] when the snapshot is missing or its type
        is unsupported.
    """
    row = await pool.fetchrow(_FETCH_MARKET_SNAPSHOT, entity_id)
    if row is None:
        logger.warning("Market snapshot %s not found", entity_id)
        return []

    def pick(payload: dict, long_key: str, short_key: str):
        # Payloads use either the long or the abbreviated field name; missing
        # fields default to 0.
        return payload.get(long_key, payload.get(short_key, 0))

    ticker = row["ticker"]
    # The JSON column may arrive already decoded or as text.
    data = row["data"] if isinstance(row["data"], dict) else json.loads(row["data"])
    source = row["source_provider"] or ""
    captured_at = row["captured_at"]
    snapshot_type = row["snapshot_type"]

    if snapshot_type in ("bar", "bars"):
        # Either a list under "bars" or a single bar as the payload itself.
        bars = data["bars"] if "bars" in data else [data]
        return [
            publish_market_bar(
                client=minio_client,
                ticker=ticker,
                open_price=float(pick(bar, "open", "o")),
                high_price=float(pick(bar, "high", "h")),
                low_price=float(pick(bar, "low", "l")),
                close_price=float(pick(bar, "close", "c")),
                volume=int(pick(bar, "volume", "v")),
                # NOTE(review): every bar in the list gets the snapshot's
                # capture time as its timestamp — confirm this is intended.
                bar_timestamp=captured_at,
                source=source,
                vwap=float(pick(bar, "vwap", "vw")),
                trade_count=int(pick(bar, "trade_count", "n")),
                bar_interval=bar.get("interval", "1d"),
            )
            for bar in bars
        ]

    if snapshot_type in ("quote", "quotes"):
        return [
            publish_market_quote(
                client=minio_client,
                ticker=ticker,
                bid_price=float(pick(data, "bid_price", "bp")),
                ask_price=float(pick(data, "ask_price", "ap")),
                last_price=float(pick(data, "last_price", "lp")),
                quote_at=captured_at,
                source=source,
                bid_size=int(pick(data, "bid_size", "bs")),
                ask_size=int(pick(data, "ask_size", "as")),
                last_size=int(pick(data, "last_size", "ls")),
            )
        ]

    # Make dropped snapshots visible instead of returning [] silently.
    logger.warning(
        "Unsupported snapshot_type %r for market snapshot %s, nothing published",
        snapshot_type,
        entity_id,
    )
    return []
|
||||
|
||||
|
||||
async def publish_order_job(
    pool: asyncpg.Pool,
    minio_client: Minio,
    entity_id: str,
) -> str:
    """Load one order row and publish it as a trade order fact.

    Args:
        pool: PostgreSQL pool used to run ``_FETCH_ORDER``.
        minio_client: MinIO client handed to ``publish_trade_order``.
        entity_id: Order id bound to the query parameter.

    Returns:
        Whatever ``publish_trade_order`` returns, or "" when no row matches.
    """
    order = await pool.fetchrow(_FETCH_ORDER, entity_id)
    if order is None:
        logger.warning("Order %s not found", entity_id)
        return ""

    # Orders without a submission time are stamped with the current UTC time.
    submitted_at = order["submitted_at"] or datetime.now(timezone.utc)

    # A falsy limit price (NULL or zero) is published as None.
    limit_price = None
    if order["limit_price"]:
        limit_price = float(order["limit_price"])

    recommendation_id = ""
    if order["recommendation_id"]:
        recommendation_id = str(order["recommendation_id"])

    return publish_trade_order(
        client=minio_client,
        order_id=str(order["id"]),
        ticker=order["ticker"],
        side=order["side"],
        order_type=order["order_type"],
        quantity=float(order["quantity"]),
        limit_price=limit_price,
        status=order["status"],
        broker_account=order["broker_account"],
        submitted_at=submitted_at,
        recommendation_id=recommendation_id,
        execution_mode=order["execution_mode"],
    )
|
||||
|
||||
|
||||
async def publish_fills_job(
    pool: asyncpg.Pool,
    minio_client: Minio,
    entity_id: str,
) -> list[str]:
    """Publish one trade fill fact per fill event of an order.

    Args:
        pool: PostgreSQL pool used to run ``_FETCH_ORDER_FILLS``.
        minio_client: MinIO client handed to ``publish_trade_fill``.
        entity_id: Order id bound to the query parameter.

    Returns:
        The ``publish_trade_fill`` results in row order; [] when the order
        has no fill events.
    """
    fill_events = await pool.fetch(_FETCH_ORDER_FILLS, entity_id)
    if not fill_events:
        logger.info("No fill events for order %s", entity_id)
        return []

    published: list[str] = []
    for event in fill_events:
        raw = event["data"]
        # The event payload may arrive decoded, as JSON text, or be empty.
        details = raw if isinstance(raw, dict) else json.loads(raw or "{}")
        # Events without a broker timestamp are stamped with the current UTC time.
        filled_at = event["broker_timestamp"] or datetime.now(timezone.utc)
        published.append(
            publish_trade_fill(
                client=minio_client,
                fill_id=str(event["fill_id"]),
                order_id=str(event["order_id"]),
                ticker=event["ticker"],
                side=event["side"],
                fill_price=float(details.get("fill_price", details.get("price", 0))),
                fill_quantity=float(details.get("fill_quantity", details.get("qty", 0))),
                broker_account=event["broker_account"],
                filled_at=filled_at,
                commission=float(details.get("commission", 0)),
            )
        )
    return published
|
||||
|
||||
|
||||
async def publish_positions_job(
    pool: asyncpg.Pool,
    minio_client: Minio,
    entity_id: str,
) -> str:
    """Publish the current non-zero positions of a broker account as one batch.

    Args:
        pool: PostgreSQL pool used to run ``_FETCH_POSITIONS``.
        minio_client: MinIO client handed to ``publish_positions_daily_batch``.
        entity_id: Broker account id bound to the query parameter.

    Returns:
        Whatever ``publish_positions_daily_batch`` returns, or "" when the
        account has no matching positions.
    """
    records = await pool.fetch(_FETCH_POSITIONS, entity_id)
    if not records:
        logger.info("No open positions for account %s", entity_id)
        return ""

    # The snapshot is stamped with the time of publication (UTC).
    snapshot_at = datetime.now(timezone.utc)

    positions = []
    for rec in records:
        positions.append(
            {
                "ticker": rec["ticker"],
                "quantity": float(rec["quantity"]),
                "avg_entry_price": float(rec["avg_entry_price"] or 0),
                "close_price": float(rec["current_price"] or 0),
                "unrealized_pnl": float(rec["unrealized_pnl"] or 0),
            }
        )

    # records is non-empty here; the account label is read from the first row.
    return publish_positions_daily_batch(
        client=minio_client,
        positions=positions,
        broker_account=records[0]["broker_account"],
        snapshot_at=snapshot_at,
    )
|
||||
|
||||
|
||||
async def publish_pnl_job(
    pool: asyncpg.Pool,
    minio_client: Minio,
    entity_id: str,
) -> list[str]:
    """Write one daily PnL fact per position row of a broker account.

    Args:
        pool: PostgreSQL pool used to run the ``_FETCH_POSITIONS`` query.
        minio_client: MinIO client handed to ``publish_pnl_daily``.
        entity_id: Parameter for ``_FETCH_POSITIONS``; logged as the account.

    Returns:
        The values returned by ``publish_pnl_daily``, one per row in query
        order; an empty list when the query yields no rows.
    """
    records = await pool.fetch(_FETCH_POSITIONS, entity_id)
    if not records:
        logger.info("No positions for PnL snapshot, account %s", entity_id)
        return []

    # Every fact in this run carries the same timestamp.
    as_of = datetime.now(timezone.utc)

    def _publish(record) -> str:
        # Falsy PnL values (e.g. NULL) count as zero; total is their sum.
        realized = float(record["realized_pnl"] or 0)
        unrealized = float(record["unrealized_pnl"] or 0)
        return publish_pnl_daily(
            client=minio_client,
            ticker=record["ticker"],
            realized_pnl=realized,
            unrealized_pnl=unrealized,
            total_pnl=realized + unrealized,
            broker_account=record["broker_account"],
            dt=as_of,
            execution_mode=record["execution_mode"],
        )

    return [_publish(record) for record in records]
|
||||
|
||||
|
||||
async def publish_bulk_documents_job(
    pool: asyncpg.Pool,
    minio_client: Minio,
    since: datetime,
) -> list[str]:
    """Publish the documents returned by the bulk query as a single batch.

    Args:
        pool: PostgreSQL pool used to run ``_FETCH_BULK_DOCUMENTS``.
        minio_client: MinIO client handed to ``publish_documents_batch``.
        since: Cutoff passed to the query and forwarded to the batch writer.

    Returns:
        A one-element list holding the batch writer's return value, or an
        empty list when there are no rows or the writer returns a falsy value.
    """
    records = await pool.fetch(_FETCH_BULK_DOCUMENTS, since)
    if not records:
        logger.info("No documents to bulk-publish since %s", since)
        return []

    def _to_fact(record) -> dict[str, object]:
        # Rows without a publication time use the retrieval time instead.
        effective_ts = record["published_at"] or record["retrieved_at"]
        fact: dict[str, object] = {
            "document_id": str(record["id"]),
            "document_type": record["document_type"],
            "source_type": record["source_type"],
            "ticker": record["ticker"] or "",
            "publisher": record["publisher"] or "",
            "title": record["title"] or "",
            "url": record["url"] or "",
            "canonical_url": record["canonical_url"] or "",
            "language": record["language"] or "en",
            "published_at": effective_ts,
            "retrieved_at": record["retrieved_at"],
            "content_hash": record["content_hash"],
            "confidence": float(record["parse_quality_score"] or 0.0),
        }
        # Partition columns are merged last so they are embedded in the row.
        fact.update(partition_values(effective_ts))
        return fact

    batch = [_to_fact(record) for record in records]
    uri = publish_documents_batch(minio_client, batch, since)
    if not uri:
        return []
    return [uri]
|
||||
|
||||
|
||||
async def publish_bulk_extractions_job(
    pool: asyncpg.Pool,
    minio_client: Minio,
    since: datetime,
) -> list[str]:
    """Publish the extractions returned by the bulk query, one batch per model version.

    Each row's model version is ``schema_version``, falling back to
    ``prompt_version`` and then ``""``. Rows are grouped by that value and
    every group is written with its own ``model_version`` argument, so a
    result set that mixes versions is no longer published entirely under the
    version of its first row.

    Args:
        pool: PostgreSQL pool used to run ``_FETCH_BULK_EXTRACTIONS``.
        minio_client: MinIO client handed to the batch writer.
        since: Cutoff passed to the query and forwarded to the batch writer.

    Returns:
        The truthy values returned by ``publish_document_extractions_batch``,
        in order of first appearance of each model version; an empty list
        when the query yields no rows.
    """
    rows = await pool.fetch(_FETCH_BULK_EXTRACTIONS, since)
    if not rows:
        logger.info("No extractions to bulk-publish since %s", since)
        return []

    # model version -> rows for that version (dict keeps first-seen order).
    batches: dict[object, list[dict[str, object]]] = {}
    for row in rows:
        model_ver = row["schema_version"] or row["prompt_version"] or ""
        batches.setdefault(model_ver, []).append({
            "document_id": str(row["document_id"]),
            "ticker": row["ticker"],
            "company_name": row["company_name"] or "",
            "relevance": float(row["relevance"] or 0.0),
            "sentiment": row["sentiment"] or "neutral",
            "impact_score": float(row["impact_score"] or 0.0),
            "impact_horizon": row["impact_horizon"] or "",
            "catalyst_type": row["catalyst_type"] or "other",
            "confidence": float(row["confidence"] or 0.0),
            "novelty_score": float(row["novelty_score"] or 0.0),
            "source_credibility": float(row["source_credibility"] or 0.0),
            "key_facts": _jsonb_to_str(row["key_facts"]),
            "risks": _jsonb_to_str(row["risks"]),
            "macro_themes": _jsonb_to_str(row["macro_themes"]),
            "model_name": row["model_name"] or "",
            "prompt_version": row["prompt_version"] or "",
            "schema_version": row["schema_version"] or "",
            "extraction_at": row["extraction_at"],
            **partition_values(row["extraction_at"], {"model_version": model_ver}),
        })

    refs: list[str] = []
    for model_ver, extraction_rows in batches.items():
        ref = publish_document_extractions_batch(
            minio_client, extraction_rows, since,
            model_version=str(model_ver),
        )
        # Keep only batches for which the writer returned a truthy value.
        if ref:
            refs.append(ref)
    return refs
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Job dispatcher
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
# Job type names for the lake publish queue.
# NOTE(review): "company_event" is listed here, but dispatch_job has no branch
# for it, so such a job is reported as "Unknown job_type" — confirm whether a
# handler is still to be wired in.
JOB_TYPES = {
    "document",
    "document_extraction",
    "market_snapshot",
    "trade_order",
    "trade_fill",
    "positions_snapshot",
    "pnl_snapshot",
    "company_event",
    "bulk_documents",
    "bulk_extractions",
}
|
||||
|
||||
|
||||
async def dispatch_job(
    pool: asyncpg.Pool,
    minio_client: Minio,
    job: dict[str, str],
) -> dict[str, object]:
    """Route one lake publish job to the handler for its ``job_type``.

    Any exception raised while handling the job is logged and reported in
    the result instead of propagating to the caller.

    Args:
        pool: PostgreSQL connection pool.
        minio_client: MinIO client for writing Parquet files.
        job: Job dict; reads 'job_type', 'entity_id' and, for bulk jobs,
            'since' (ISO datetime string).

    Returns:
        A dict with 'job_type', 'entity_id', 'refs' (what the handler
        produced, as a list) and 'error' (None on success, otherwise a
        message string).
    """
    job_type = job.get("job_type", "")
    entity_id = job.get("entity_id", "")
    since_str = job.get("since")

    outcome: dict[str, object] = {
        "job_type": job_type,
        "entity_id": entity_id,
        "refs": [],
        "error": None,
    }

    try:
        # Handlers that return a single URI string (falsy when nothing was written).
        single_ref_handlers = {
            "document": publish_document_job,
            "trade_order": publish_order_job,
            "positions_snapshot": publish_positions_job,
        }
        # Handlers that already return a list of URIs.
        multi_ref_handlers = {
            "document_extraction": publish_extraction_job,
            "market_snapshot": publish_market_snapshot_job,
            "trade_fill": publish_fills_job,
            "pnl_snapshot": publish_pnl_job,
        }
        # Handlers that take a cutoff datetime instead of an entity id.
        bulk_handlers = {
            "bulk_documents": publish_bulk_documents_job,
            "bulk_extractions": publish_bulk_extractions_job,
        }

        # Only string job types can name a handler; anything else is unknown.
        lookup = job_type if isinstance(job_type, str) else None

        if lookup in single_ref_handlers:
            uri = await single_ref_handlers[lookup](pool, minio_client, entity_id)
            outcome["refs"] = [uri] if uri else []
        elif lookup in multi_ref_handlers:
            outcome["refs"] = await multi_ref_handlers[lookup](pool, minio_client, entity_id)
        elif lookup in bulk_handlers:
            # Without a 'since' value the cutoff is the current UTC time.
            if since_str:
                cutoff = datetime.fromisoformat(since_str)
            else:
                cutoff = datetime.now(timezone.utc)
            outcome["refs"] = await bulk_handlers[lookup](pool, minio_client, cutoff)
        else:
            outcome["error"] = f"Unknown job_type: {job_type}"
            logger.warning("Unknown lake publish job type: %s", job_type)

    except Exception as exc:
        outcome["error"] = str(exc)
        logger.exception("Lake publish job failed: %s/%s", job_type, entity_id)

    return outcome
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Async worker loop
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
async def run_worker(
    pool: asyncpg.Pool,
    rds: aioredis.Redis,
    minio_client: Minio,
    poll_interval: float = 2.0,
) -> None:
    """Main worker loop — reads jobs from Redis and dispatches them.

    Runs indefinitely until cancelled. Each job is processed sequentially
    to keep MinIO write ordering predictable.

    Payloads that are not valid JSON, or that decode to something other than
    a JSON object, are logged and dropped so that one bad queue entry cannot
    stop the worker.

    Args:
        pool: PostgreSQL connection pool passed through to ``dispatch_job``.
        rds: Async Redis client the queue is popped from.
        minio_client: MinIO client passed through to ``dispatch_job``.
        poll_interval: Seconds to sleep when the queue is empty.
    """
    queue = queue_key(QUEUE_LAKE_PUBLISH)
    logger.info("Lake publisher worker started, listening on %s", queue)

    while True:
        raw = await rds.lpop(queue)  # type: ignore[misc]
        if raw is None:
            await asyncio.sleep(poll_interval)
            continue

        try:
            # Parse the payload directly: json.loads accepts str, bytes and
            # bytearray, whereas str() on bytes yields the "b'...'" repr,
            # which is never valid JSON.
            job = json.loads(raw)
        except (ValueError, TypeError):
            # ValueError covers JSONDecodeError and UnicodeDecodeError.
            logger.error("Invalid lake publish job payload: %s", raw)
            continue

        # dispatch_job calls job.get(...), so anything but an object is invalid.
        if not isinstance(job, dict):
            logger.error("Invalid lake publish job payload: %s", raw)
            continue

        result = await dispatch_job(pool, minio_client, job)
        refs = result.get("refs") or []
        error = result.get("error")

        if error:
            logger.error(
                "Lake publish job %s/%s failed: %s",
                result["job_type"], result["entity_id"], error,
            )
        else:
            ref_count = len(refs) if isinstance(refs, list) else 0
            logger.info(
                "Lake publish job %s/%s completed: %d facts written",
                result["job_type"], result["entity_id"], ref_count,
            )
|
||||
|
||||
|
||||
async def main() -> None:
    """Build the worker's clients from config and run the worker loop.

    The PostgreSQL pool and the Redis client are closed when the loop exits,
    whether it returns or raises.
    """
    settings = load_config()
    pg_pool = await get_pg_pool(settings)
    redis_client = get_redis(settings)
    object_store = get_minio(settings)

    try:
        await run_worker(pg_pool, redis_client, object_store)
    finally:
        # Close in the same order as before: database first, then Redis.
        await pg_pool.close()
        await redis_client.close()
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Script entry point: configure logging from the loaded config, then run
    # the async worker. main() calls load_config() again for its own clients.
    cfg = load_config()
    setup_logging("lake_publisher", level=cfg.log_level, json_output=cfg.json_logs)
    asyncio.run(main())
|
||||
@@ -0,0 +1,128 @@
|
||||
"""Hive-compatible partition layout conventions for the MinIO lakehouse.
|
||||
|
||||
Centralizes partition path generation, partition column injection, and
|
||||
bucket provisioning so that all lake publisher writers produce layouts
|
||||
that Trino's Hive and Iceberg connectors can discover and prune.
|
||||
|
||||
Design ref: Section 5.2, 5.3 (Lakehouse model)
|
||||
Requirements: 9.4, 9.5, N4, N6
|
||||
|
||||
Layout convention:
|
||||
s3://stonks-lakehouse/warehouse/{table_name}/dt={YYYY-MM-DD}[/{extra_key}={value}]/part-{uuid}.parquet
|
||||
|
||||
Rules:
|
||||
- Every fact table is partitioned by ``dt`` (DATE) derived from the row timestamp.
|
||||
- Some tables have a second partition key (e.g. ``model_version``).
|
||||
- Partition columns MUST appear in the Parquet file so Trino can read them
|
||||
without relying solely on path parsing.
|
||||
- File names use a UUID suffix to avoid collisions on concurrent writes.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import uuid
|
||||
from dataclasses import dataclass, field
|
||||
from datetime import date, datetime, timezone
|
||||
|
||||
|
||||
# Bucket name that s3_uri() places after the "s3://" scheme.
LAKEHOUSE_BUCKET = "stonks-lakehouse"
# First segment of every object key built by partition_path().
WAREHOUSE_PREFIX = "warehouse"
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class PartitionSpec:
    """Immutable description of how one fact table is partitioned.

    ``dt`` is always the leading partition key; ``extra_keys`` lists any
    further keys in the order they appear in the layout.
    """

    table_name: str
    extra_keys: tuple[str, ...] = field(default_factory=tuple)

    @property
    def all_keys(self) -> tuple[str, ...]:
        """Return every partition key in order: ``dt`` followed by the extras."""
        return ("dt",) + tuple(self.extra_keys)
|
||||
|
||||
|
||||
# Registry of every analytical fact table and its partition keys.
# This is the single source of truth — DDL, publisher, and tests should agree.
# partition_path() looks tables up here and raises ValueError for any name
# that is missing. Tables without extra_keys are partitioned by ``dt`` only.
TABLE_PARTITIONS: dict[str, PartitionSpec] = {
    "market_bars": PartitionSpec("market_bars"),
    "market_quotes": PartitionSpec("market_quotes"),
    "company_events": PartitionSpec("company_events"),
    "documents": PartitionSpec("documents"),
    "document_extractions": PartitionSpec("document_extractions", extra_keys=("model_version",)),
    "trade_signals": PartitionSpec("trade_signals"),
    "trade_orders": PartitionSpec("trade_orders"),
    "trade_fills": PartitionSpec("trade_fills"),
    "positions_daily": PartitionSpec("positions_daily"),
    "pnl_daily": PartitionSpec("pnl_daily"),
    "prediction_vs_outcome": PartitionSpec("prediction_vs_outcome", extra_keys=("model_version",)),
    "model_performance": PartitionSpec("model_performance", extra_keys=("model_version",)),
}
|
||||
|
||||
|
||||
def partition_path(
    table_name: str,
    dt: datetime | date,
    extra_partitions: dict[str, str] | None = None,
    file_id: str | None = None,
) -> str:
    """Build a Hive-compatible object path for a Parquet file.

    Args:
        table_name: Logical fact table name (must be in TABLE_PARTITIONS).
        dt: Row timestamp or date used to derive the ``dt=`` partition. A
            datetime contributes its own calendar date (no timezone
            conversion is applied), matching ``partition_values``.
        extra_partitions: Additional partition key/value pairs (e.g. model_version).
            Only keys declared in the table's spec are used. A declared key
            that is missing, ``None`` or empty is written as ``__NONE__`` so
            the path never contains a bare ``key=`` segment.
        file_id: Optional override for the file suffix (defaults to the first
            16 hex characters of a random UUID4).

    Returns:
        Object key relative to the bucket root, e.g.
        ``warehouse/trade_signals/dt=2026-04-11/part-<suffix>.parquet``

    Raises:
        ValueError: If ``table_name`` is not registered in TABLE_PARTITIONS.
    """
    spec = TABLE_PARTITIONS.get(table_name)
    if spec is None:
        raise ValueError(f"Unknown table: {table_name}. Register it in TABLE_PARTITIONS.")

    # datetime is a subclass of date, so it must be narrowed first.
    dt_date = dt.date() if isinstance(dt, datetime) else dt
    segments = [WAREHOUSE_PREFIX, table_name, f"dt={dt_date.isoformat()}"]

    # Append extra partition directories in the order declared by the spec.
    extras = extra_partitions or {}
    for key in spec.extra_keys:
        value = extras.get(key)
        if value is None or value == "":
            value = "__NONE__"
        segments.append(f"{key}={value}")

    suffix = file_id or uuid.uuid4().hex[:16]
    segments.append(f"part-{suffix}.parquet")

    return "/".join(segments)
|
||||
|
||||
|
||||
def partition_values(
    dt: datetime | date,
    extra_partitions: dict[str, str] | None = None,
) -> dict[str, object]:
    """Return the partition columns to embed in each Parquet row.

    Embedding the values in the file, in addition to the directory path,
    keeps the data readable by engines that do not parse Hive-style paths
    (e.g. plain PyArrow reads, DuckDB).

    Args:
        dt: Row timestamp or date; a datetime is reduced to its date part.
        extra_partitions: Optional extra key/value pairs merged after ``dt``.

    Returns:
        A dict like ``{"dt": date(2026, 4, 11), "model_version": "v2"}``.
    """
    # datetime is a subclass of date, so it must be narrowed first.
    day = dt.date() if isinstance(dt, datetime) else dt
    return {"dt": day, **(extra_partitions or {})}
|
||||
|
||||
|
||||
def s3_uri(path: str) -> str:
    """Return the ``s3://`` URI for an object key inside the lakehouse bucket."""
    return "s3://{}/{}".format(LAKEHOUSE_BUCKET, path)
|
||||
File diff suppressed because it is too large
Load Diff
Reference in New Issue
Block a user