phase 14-15: docker build validation and helm deployment
This commit is contained in:
@@ -0,0 +1,673 @@
|
||||
"""Lake publisher async job runner — transforms operational data into analytical facts.
|
||||
|
||||
Reads jobs from the QUEUE_LAKE_PUBLISH Redis queue, queries PostgreSQL for
|
||||
operational records, and publishes them as partitioned Parquet files to MinIO
|
||||
via the existing publish_* functions in worker.py.
|
||||
|
||||
Job message format:
|
||||
{"job_type": "<table_name>", "entity_id": "<uuid or ticker>", "dt": "2026-04-11T..."}
|
||||
|
||||
Supported job types:
|
||||
- document: publish a single document metadata fact
|
||||
- document_extraction: publish extraction facts for a document
|
||||
- market_snapshot: publish market bars/quotes from a snapshot
|
||||
- trade_order: publish an order fact
|
||||
- trade_fill: publish fill facts for an order
|
||||
- positions_snapshot: publish daily position snapshots for a broker account
|
||||
- pnl_snapshot: publish daily PnL for a broker account
|
||||
- company_event: publish a company event fact
|
||||
- bulk_documents: publish all unpublished documents since a cutoff
|
||||
- bulk_extractions: publish all unpublished extractions since a cutoff
|
||||
|
||||
Requirements: 9.4, 9.5, 10.1
|
||||
Design ref: Section 4.10 (Lake Publisher), Section 8.4 (Lake publication flow)
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
import json
|
||||
import logging
|
||||
from datetime import datetime, timezone
|
||||
|
||||
import asyncpg
|
||||
import redis.asyncio as aioredis
|
||||
from minio import Minio
|
||||
|
||||
from services.lake_publisher.worker import (
|
||||
publish_document_extraction,
|
||||
publish_document_fact,
|
||||
publish_market_bar,
|
||||
publish_market_quote,
|
||||
publish_trade_order,
|
||||
publish_trade_fill,
|
||||
publish_pnl_daily,
|
||||
publish_documents_batch,
|
||||
publish_document_extractions_batch,
|
||||
publish_positions_daily_batch,
|
||||
)
|
||||
from services.lake_publisher.partitions import partition_values
|
||||
from services.shared.config import load_config
|
||||
from services.shared.db import get_minio, get_pg_pool, get_redis
|
||||
from services.shared.logging import setup_logging
|
||||
from services.shared.redis_keys import QUEUE_LAKE_PUBLISH, queue_key
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# SQL queries for fetching operational data
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
_FETCH_DOCUMENT = """
|
||||
SELECT
|
||||
d.id, d.document_type, d.source_type, d.publisher, d.title,
|
||||
d.url, d.canonical_url, d.language, d.published_at, d.retrieved_at,
|
||||
d.content_hash, d.parse_quality_score,
|
||||
COALESCE(
|
||||
(SELECT dcm.ticker FROM document_company_mentions dcm
|
||||
WHERE dcm.document_id = d.id LIMIT 1),
|
||||
''
|
||||
) AS ticker
|
||||
FROM documents d
|
||||
WHERE d.id = $1::uuid
|
||||
"""
|
||||
|
||||
_FETCH_EXTRACTIONS = """
|
||||
SELECT
|
||||
di.document_id, dir.ticker, dir.relevance, dir.sentiment,
|
||||
dir.impact_score, dir.impact_horizon, dir.catalyst_type,
|
||||
di.confidence, di.novelty_score, di.source_credibility,
|
||||
dir.key_facts, dir.risks, di.macro_themes,
|
||||
di.model_name, di.prompt_version, di.schema_version,
|
||||
di.created_at AS extraction_at,
|
||||
COALESCE(c.legal_name, '') AS company_name
|
||||
FROM document_intelligence di
|
||||
JOIN document_impact_records dir ON dir.intelligence_id = di.id
|
||||
LEFT JOIN companies c ON c.id = dir.company_id
|
||||
WHERE di.document_id = $1::uuid
|
||||
AND di.validation_status = 'valid'
|
||||
"""
|
||||
|
||||
_FETCH_MARKET_SNAPSHOT = """
|
||||
SELECT
|
||||
ms.ticker, ms.snapshot_type, ms.data, ms.source_provider, ms.captured_at
|
||||
FROM market_snapshots ms
|
||||
WHERE ms.id = $1::uuid
|
||||
"""
|
||||
|
||||
_FETCH_ORDER = """
|
||||
SELECT
|
||||
o.id, o.recommendation_id, o.ticker, o.side, o.order_type,
|
||||
o.quantity, o.limit_price, o.status, o.submitted_at,
|
||||
o.fill_price, o.fill_quantity, o.filled_at,
|
||||
COALESCE(ba.account_id, '') AS broker_account,
|
||||
COALESCE(ba.mode, 'paper') AS execution_mode
|
||||
FROM orders o
|
||||
LEFT JOIN broker_accounts ba ON ba.id = o.broker_account_id
|
||||
WHERE o.id = $1::uuid
|
||||
"""
|
||||
|
||||
_FETCH_ORDER_FILLS = """
|
||||
SELECT
|
||||
oe.id AS fill_id, oe.order_id, oe.data, oe.broker_timestamp,
|
||||
o.ticker, o.side,
|
||||
COALESCE(ba.account_id, '') AS broker_account
|
||||
FROM order_events oe
|
||||
JOIN orders o ON o.id = oe.order_id
|
||||
LEFT JOIN broker_accounts ba ON ba.id = o.broker_account_id
|
||||
WHERE oe.order_id = $1::uuid AND oe.event_type = 'fill'
|
||||
"""
|
||||
|
||||
_FETCH_POSITIONS = """
|
||||
SELECT
|
||||
p.ticker, p.quantity, p.avg_entry_price, p.current_price,
|
||||
p.unrealized_pnl, p.realized_pnl,
|
||||
COALESCE(ba.account_id, '') AS broker_account,
|
||||
COALESCE(ba.mode, 'paper') AS execution_mode
|
||||
FROM positions p
|
||||
LEFT JOIN broker_accounts ba ON ba.id = p.broker_account_id
|
||||
WHERE p.broker_account_id = $1::uuid AND p.quantity != 0
|
||||
"""
|
||||
|
||||
_FETCH_BULK_DOCUMENTS = """
|
||||
SELECT
|
||||
d.id, d.document_type, d.source_type, d.publisher, d.title,
|
||||
d.url, d.canonical_url, d.language, d.published_at, d.retrieved_at,
|
||||
d.content_hash, d.parse_quality_score,
|
||||
COALESCE(
|
||||
(SELECT dcm.ticker FROM document_company_mentions dcm
|
||||
WHERE dcm.document_id = d.id LIMIT 1),
|
||||
''
|
||||
) AS ticker
|
||||
FROM documents d
|
||||
WHERE d.created_at >= $1
|
||||
AND d.status IN ('parsed', 'extracted')
|
||||
ORDER BY d.created_at
|
||||
LIMIT 500
|
||||
"""
|
||||
|
||||
_FETCH_BULK_EXTRACTIONS = """
|
||||
SELECT
|
||||
di.document_id, dir.ticker, dir.relevance, dir.sentiment,
|
||||
dir.impact_score, dir.impact_horizon, dir.catalyst_type,
|
||||
di.confidence, di.novelty_score, di.source_credibility,
|
||||
dir.key_facts, dir.risks, di.macro_themes,
|
||||
di.model_name, di.prompt_version, di.schema_version,
|
||||
di.created_at AS extraction_at,
|
||||
COALESCE(c.legal_name, '') AS company_name
|
||||
FROM document_intelligence di
|
||||
JOIN document_impact_records dir ON dir.intelligence_id = di.id
|
||||
LEFT JOIN companies c ON c.id = dir.company_id
|
||||
WHERE di.created_at >= $1
|
||||
AND di.validation_status = 'valid'
|
||||
ORDER BY di.created_at
|
||||
LIMIT 500
|
||||
"""
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Job handlers — each transforms operational rows into lake facts
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def _jsonb_to_str(val: object) -> str:
|
||||
"""Convert a JSONB column value (list or str) to a comma-separated string."""
|
||||
if val is None:
|
||||
return ""
|
||||
if isinstance(val, str):
|
||||
try:
|
||||
parsed = json.loads(val)
|
||||
if isinstance(parsed, list):
|
||||
return ", ".join(str(x) for x in parsed)
|
||||
return val
|
||||
except (json.JSONDecodeError, TypeError):
|
||||
return val
|
||||
if isinstance(val, list):
|
||||
return ", ".join(str(x) for x in val)
|
||||
return str(val)
|
||||
|
||||
|
||||
async def publish_document_job(
|
||||
pool: asyncpg.Pool,
|
||||
minio_client: Minio,
|
||||
entity_id: str,
|
||||
) -> str:
|
||||
"""Publish a single document metadata fact from PostgreSQL to the lake."""
|
||||
row = await pool.fetchrow(_FETCH_DOCUMENT, entity_id)
|
||||
if row is None:
|
||||
logger.warning("Document %s not found, skipping lake publish", entity_id)
|
||||
return ""
|
||||
|
||||
published_at = row["published_at"] or row["retrieved_at"]
|
||||
return publish_document_fact(
|
||||
client=minio_client,
|
||||
document_id=str(row["id"]),
|
||||
document_type=row["document_type"],
|
||||
source_type=row["source_type"],
|
||||
ticker=row["ticker"] or "",
|
||||
publisher=row["publisher"] or "",
|
||||
title=row["title"] or "",
|
||||
published_at=published_at,
|
||||
content_hash=row["content_hash"],
|
||||
url=row["url"] or "",
|
||||
canonical_url=row["canonical_url"] or "",
|
||||
language=row["language"] or "en",
|
||||
confidence=float(row["parse_quality_score"] or 0.0),
|
||||
retrieved_at=row["retrieved_at"],
|
||||
)
|
||||
|
||||
|
||||
async def publish_extraction_job(
|
||||
pool: asyncpg.Pool,
|
||||
minio_client: Minio,
|
||||
entity_id: str,
|
||||
) -> list[str]:
|
||||
"""Publish document extraction facts for a document from PostgreSQL to the lake."""
|
||||
rows = await pool.fetch(_FETCH_EXTRACTIONS, entity_id)
|
||||
if not rows:
|
||||
logger.info("No valid extractions for document %s", entity_id)
|
||||
return []
|
||||
|
||||
refs: list[str] = []
|
||||
for row in rows:
|
||||
ref = publish_document_extraction(
|
||||
client=minio_client,
|
||||
document_id=str(row["document_id"]),
|
||||
ticker=row["ticker"],
|
||||
sentiment=row["sentiment"] or "neutral",
|
||||
impact_score=float(row["impact_score"] or 0.0),
|
||||
catalyst_type=row["catalyst_type"] or "other",
|
||||
confidence=float(row["confidence"] or 0.0),
|
||||
extraction_at=row["extraction_at"],
|
||||
model_name=row["model_name"] or "",
|
||||
prompt_version=row["prompt_version"] or "",
|
||||
company_name=row["company_name"] or "",
|
||||
relevance=float(row["relevance"] or 0.0),
|
||||
impact_horizon=row["impact_horizon"] or "",
|
||||
novelty_score=float(row["novelty_score"] or 0.0),
|
||||
source_credibility=float(row["source_credibility"] or 0.0),
|
||||
key_facts=_jsonb_to_str(row["key_facts"]),
|
||||
risks=_jsonb_to_str(row["risks"]),
|
||||
macro_themes=_jsonb_to_str(row["macro_themes"]),
|
||||
schema_version=row["schema_version"] or "",
|
||||
)
|
||||
refs.append(ref)
|
||||
return refs
|
||||
|
||||
|
||||
async def publish_market_snapshot_job(
|
||||
pool: asyncpg.Pool,
|
||||
minio_client: Minio,
|
||||
entity_id: str,
|
||||
) -> list[str]:
|
||||
"""Publish market bar/quote facts from a market_snapshots row."""
|
||||
row = await pool.fetchrow(_FETCH_MARKET_SNAPSHOT, entity_id)
|
||||
if row is None:
|
||||
logger.warning("Market snapshot %s not found", entity_id)
|
||||
return []
|
||||
|
||||
ticker = row["ticker"]
|
||||
data = row["data"] if isinstance(row["data"], dict) else json.loads(row["data"])
|
||||
source = row["source_provider"] or ""
|
||||
captured_at = row["captured_at"]
|
||||
snapshot_type = row["snapshot_type"]
|
||||
refs: list[str] = []
|
||||
|
||||
if snapshot_type == "bar" or snapshot_type == "bars":
|
||||
# Single bar or list of bars
|
||||
bars = data.get("bars", [data]) if "bars" in data else [data]
|
||||
for bar in bars:
|
||||
ref = publish_market_bar(
|
||||
client=minio_client,
|
||||
ticker=ticker,
|
||||
open_price=float(bar.get("open", bar.get("o", 0))),
|
||||
high_price=float(bar.get("high", bar.get("h", 0))),
|
||||
low_price=float(bar.get("low", bar.get("l", 0))),
|
||||
close_price=float(bar.get("close", bar.get("c", 0))),
|
||||
volume=int(bar.get("volume", bar.get("v", 0))),
|
||||
bar_timestamp=captured_at,
|
||||
source=source,
|
||||
vwap=float(bar.get("vwap", bar.get("vw", 0))),
|
||||
trade_count=int(bar.get("trade_count", bar.get("n", 0))),
|
||||
bar_interval=bar.get("interval", "1d"),
|
||||
)
|
||||
refs.append(ref)
|
||||
elif snapshot_type == "quote" or snapshot_type == "quotes":
|
||||
ref = publish_market_quote(
|
||||
client=minio_client,
|
||||
ticker=ticker,
|
||||
bid_price=float(data.get("bid_price", data.get("bp", 0))),
|
||||
ask_price=float(data.get("ask_price", data.get("ap", 0))),
|
||||
last_price=float(data.get("last_price", data.get("lp", 0))),
|
||||
quote_at=captured_at,
|
||||
source=source,
|
||||
bid_size=int(data.get("bid_size", data.get("bs", 0))),
|
||||
ask_size=int(data.get("ask_size", data.get("as", 0))),
|
||||
last_size=int(data.get("last_size", data.get("ls", 0))),
|
||||
)
|
||||
refs.append(ref)
|
||||
|
||||
return refs
|
||||
|
||||
|
||||
async def publish_order_job(
|
||||
pool: asyncpg.Pool,
|
||||
minio_client: Minio,
|
||||
entity_id: str,
|
||||
) -> str:
|
||||
"""Publish a trade order fact from PostgreSQL to the lake."""
|
||||
row = await pool.fetchrow(_FETCH_ORDER, entity_id)
|
||||
if row is None:
|
||||
logger.warning("Order %s not found", entity_id)
|
||||
return ""
|
||||
|
||||
submitted_at = row["submitted_at"] or datetime.now(timezone.utc)
|
||||
return publish_trade_order(
|
||||
client=minio_client,
|
||||
order_id=str(row["id"]),
|
||||
ticker=row["ticker"],
|
||||
side=row["side"],
|
||||
order_type=row["order_type"],
|
||||
quantity=float(row["quantity"]),
|
||||
limit_price=float(row["limit_price"]) if row["limit_price"] else None,
|
||||
status=row["status"],
|
||||
broker_account=row["broker_account"],
|
||||
submitted_at=submitted_at,
|
||||
recommendation_id=str(row["recommendation_id"]) if row["recommendation_id"] else "",
|
||||
execution_mode=row["execution_mode"],
|
||||
)
|
||||
|
||||
|
||||
async def publish_fills_job(
|
||||
pool: asyncpg.Pool,
|
||||
minio_client: Minio,
|
||||
entity_id: str,
|
||||
) -> list[str]:
|
||||
"""Publish trade fill facts for an order from PostgreSQL to the lake."""
|
||||
rows = await pool.fetch(_FETCH_ORDER_FILLS, entity_id)
|
||||
if not rows:
|
||||
logger.info("No fill events for order %s", entity_id)
|
||||
return []
|
||||
|
||||
refs: list[str] = []
|
||||
for row in rows:
|
||||
data = row["data"] if isinstance(row["data"], dict) else json.loads(row["data"] or "{}")
|
||||
filled_at = row["broker_timestamp"] or datetime.now(timezone.utc)
|
||||
ref = publish_trade_fill(
|
||||
client=minio_client,
|
||||
fill_id=str(row["fill_id"]),
|
||||
order_id=str(row["order_id"]),
|
||||
ticker=row["ticker"],
|
||||
side=row["side"],
|
||||
fill_price=float(data.get("fill_price", data.get("price", 0))),
|
||||
fill_quantity=float(data.get("fill_quantity", data.get("qty", 0))),
|
||||
broker_account=row["broker_account"],
|
||||
filled_at=filled_at,
|
||||
commission=float(data.get("commission", 0)),
|
||||
)
|
||||
refs.append(ref)
|
||||
return refs
|
||||
|
||||
|
||||
async def publish_positions_job(
|
||||
pool: asyncpg.Pool,
|
||||
minio_client: Minio,
|
||||
entity_id: str,
|
||||
) -> str:
|
||||
"""Publish daily position snapshots for a broker account."""
|
||||
rows = await pool.fetch(_FETCH_POSITIONS, entity_id)
|
||||
if not rows:
|
||||
logger.info("No open positions for account %s", entity_id)
|
||||
return ""
|
||||
|
||||
snapshot_at = datetime.now(timezone.utc)
|
||||
positions = [
|
||||
{
|
||||
"ticker": row["ticker"],
|
||||
"quantity": float(row["quantity"]),
|
||||
"avg_entry_price": float(row["avg_entry_price"] or 0),
|
||||
"close_price": float(row["current_price"] or 0),
|
||||
"unrealized_pnl": float(row["unrealized_pnl"] or 0),
|
||||
}
|
||||
for row in rows
|
||||
]
|
||||
broker_account = rows[0]["broker_account"] if rows else ""
|
||||
return publish_positions_daily_batch(
|
||||
client=minio_client,
|
||||
positions=positions,
|
||||
broker_account=broker_account,
|
||||
snapshot_at=snapshot_at,
|
||||
)
|
||||
|
||||
|
||||
async def publish_pnl_job(
|
||||
pool: asyncpg.Pool,
|
||||
minio_client: Minio,
|
||||
entity_id: str,
|
||||
) -> list[str]:
|
||||
"""Publish daily PnL facts for a broker account's positions."""
|
||||
rows = await pool.fetch(_FETCH_POSITIONS, entity_id)
|
||||
if not rows:
|
||||
logger.info("No positions for PnL snapshot, account %s", entity_id)
|
||||
return []
|
||||
|
||||
now = datetime.now(timezone.utc)
|
||||
refs: list[str] = []
|
||||
for row in rows:
|
||||
realized = float(row["realized_pnl"] or 0)
|
||||
unrealized = float(row["unrealized_pnl"] or 0)
|
||||
total = realized + unrealized
|
||||
ref = publish_pnl_daily(
|
||||
client=minio_client,
|
||||
ticker=row["ticker"],
|
||||
realized_pnl=realized,
|
||||
unrealized_pnl=unrealized,
|
||||
total_pnl=total,
|
||||
broker_account=row["broker_account"],
|
||||
dt=now,
|
||||
execution_mode=row["execution_mode"],
|
||||
)
|
||||
refs.append(ref)
|
||||
return refs
|
||||
|
||||
|
||||
async def publish_bulk_documents_job(
|
||||
pool: asyncpg.Pool,
|
||||
minio_client: Minio,
|
||||
since: datetime,
|
||||
) -> list[str]:
|
||||
"""Publish all documents created since a cutoff as a batch."""
|
||||
rows = await pool.fetch(_FETCH_BULK_DOCUMENTS, since)
|
||||
if not rows:
|
||||
logger.info("No documents to bulk-publish since %s", since)
|
||||
return []
|
||||
|
||||
doc_rows: list[dict[str, object]] = []
|
||||
for row in rows:
|
||||
published_at = row["published_at"] or row["retrieved_at"]
|
||||
doc_rows.append({
|
||||
"document_id": str(row["id"]),
|
||||
"document_type": row["document_type"],
|
||||
"source_type": row["source_type"],
|
||||
"ticker": row["ticker"] or "",
|
||||
"publisher": row["publisher"] or "",
|
||||
"title": row["title"] or "",
|
||||
"url": row["url"] or "",
|
||||
"canonical_url": row["canonical_url"] or "",
|
||||
"language": row["language"] or "en",
|
||||
"published_at": published_at,
|
||||
"retrieved_at": row["retrieved_at"],
|
||||
"content_hash": row["content_hash"],
|
||||
"confidence": float(row["parse_quality_score"] or 0.0),
|
||||
**partition_values(published_at),
|
||||
})
|
||||
|
||||
ref = publish_documents_batch(minio_client, doc_rows, since)
|
||||
return [ref] if ref else []
|
||||
|
||||
|
||||
async def publish_bulk_extractions_job(
|
||||
pool: asyncpg.Pool,
|
||||
minio_client: Minio,
|
||||
since: datetime,
|
||||
) -> list[str]:
|
||||
"""Publish all extractions created since a cutoff as a batch."""
|
||||
rows = await pool.fetch(_FETCH_BULK_EXTRACTIONS, since)
|
||||
if not rows:
|
||||
logger.info("No extractions to bulk-publish since %s", since)
|
||||
return []
|
||||
|
||||
extraction_rows: list[dict[str, object]] = []
|
||||
for row in rows:
|
||||
model_ver = row["schema_version"] or row["prompt_version"] or ""
|
||||
extraction_rows.append({
|
||||
"document_id": str(row["document_id"]),
|
||||
"ticker": row["ticker"],
|
||||
"company_name": row["company_name"] or "",
|
||||
"relevance": float(row["relevance"] or 0.0),
|
||||
"sentiment": row["sentiment"] or "neutral",
|
||||
"impact_score": float(row["impact_score"] or 0.0),
|
||||
"impact_horizon": row["impact_horizon"] or "",
|
||||
"catalyst_type": row["catalyst_type"] or "other",
|
||||
"confidence": float(row["confidence"] or 0.0),
|
||||
"novelty_score": float(row["novelty_score"] or 0.0),
|
||||
"source_credibility": float(row["source_credibility"] or 0.0),
|
||||
"key_facts": _jsonb_to_str(row["key_facts"]),
|
||||
"risks": _jsonb_to_str(row["risks"]),
|
||||
"macro_themes": _jsonb_to_str(row["macro_themes"]),
|
||||
"model_name": row["model_name"] or "",
|
||||
"prompt_version": row["prompt_version"] or "",
|
||||
"schema_version": row["schema_version"] or "",
|
||||
"extraction_at": row["extraction_at"],
|
||||
**partition_values(row["extraction_at"], {"model_version": model_ver}),
|
||||
})
|
||||
|
||||
model_ver = extraction_rows[0].get("model_version", "") if extraction_rows else ""
|
||||
ref = publish_document_extractions_batch(
|
||||
minio_client, extraction_rows, since,
|
||||
model_version=str(model_ver),
|
||||
)
|
||||
return [ref] if ref else []
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Job dispatcher
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
JOB_TYPES = {
|
||||
"document",
|
||||
"document_extraction",
|
||||
"market_snapshot",
|
||||
"trade_order",
|
||||
"trade_fill",
|
||||
"positions_snapshot",
|
||||
"pnl_snapshot",
|
||||
"company_event",
|
||||
"bulk_documents",
|
||||
"bulk_extractions",
|
||||
}
|
||||
|
||||
|
||||
async def dispatch_job(
|
||||
pool: asyncpg.Pool,
|
||||
minio_client: Minio,
|
||||
job: dict[str, str],
|
||||
) -> dict[str, object]:
|
||||
"""Dispatch a lake publish job to the appropriate handler.
|
||||
|
||||
Args:
|
||||
pool: PostgreSQL connection pool.
|
||||
minio_client: MinIO client for writing Parquet files.
|
||||
job: Job dict with at least 'job_type' and 'entity_id'.
|
||||
|
||||
Returns:
|
||||
A result dict with 'job_type', 'entity_id', 'refs' (list of s3 URIs),
|
||||
and 'error' (None on success).
|
||||
"""
|
||||
job_type = job.get("job_type", "")
|
||||
entity_id = job.get("entity_id", "")
|
||||
since_str = job.get("since")
|
||||
|
||||
result: dict[str, object] = {
|
||||
"job_type": job_type,
|
||||
"entity_id": entity_id,
|
||||
"refs": [],
|
||||
"error": None,
|
||||
}
|
||||
|
||||
try:
|
||||
if job_type == "document":
|
||||
ref = await publish_document_job(pool, minio_client, entity_id)
|
||||
result["refs"] = [ref] if ref else []
|
||||
|
||||
elif job_type == "document_extraction":
|
||||
refs = await publish_extraction_job(pool, minio_client, entity_id)
|
||||
result["refs"] = refs
|
||||
|
||||
elif job_type == "market_snapshot":
|
||||
refs = await publish_market_snapshot_job(pool, minio_client, entity_id)
|
||||
result["refs"] = refs
|
||||
|
||||
elif job_type == "trade_order":
|
||||
ref = await publish_order_job(pool, minio_client, entity_id)
|
||||
result["refs"] = [ref] if ref else []
|
||||
|
||||
elif job_type == "trade_fill":
|
||||
refs = await publish_fills_job(pool, minio_client, entity_id)
|
||||
result["refs"] = refs
|
||||
|
||||
elif job_type == "positions_snapshot":
|
||||
ref = await publish_positions_job(pool, minio_client, entity_id)
|
||||
result["refs"] = [ref] if ref else []
|
||||
|
||||
elif job_type == "pnl_snapshot":
|
||||
refs = await publish_pnl_job(pool, minio_client, entity_id)
|
||||
result["refs"] = refs
|
||||
|
||||
elif job_type == "bulk_documents":
|
||||
since = datetime.fromisoformat(since_str) if since_str else datetime.now(timezone.utc)
|
||||
refs = await publish_bulk_documents_job(pool, minio_client, since)
|
||||
result["refs"] = refs
|
||||
|
||||
elif job_type == "bulk_extractions":
|
||||
since = datetime.fromisoformat(since_str) if since_str else datetime.now(timezone.utc)
|
||||
refs = await publish_bulk_extractions_job(pool, minio_client, since)
|
||||
result["refs"] = refs
|
||||
|
||||
else:
|
||||
result["error"] = f"Unknown job_type: {job_type}"
|
||||
logger.warning("Unknown lake publish job type: %s", job_type)
|
||||
|
||||
except Exception as exc:
|
||||
result["error"] = str(exc)
|
||||
logger.exception("Lake publish job failed: %s/%s", job_type, entity_id)
|
||||
|
||||
return result
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Async worker loop
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
async def run_worker(
|
||||
pool: asyncpg.Pool,
|
||||
rds: aioredis.Redis,
|
||||
minio_client: Minio,
|
||||
poll_interval: float = 2.0,
|
||||
) -> None:
|
||||
"""Main worker loop — reads jobs from Redis and dispatches them.
|
||||
|
||||
Runs indefinitely until cancelled. Each job is processed sequentially
|
||||
to keep MinIO write ordering predictable.
|
||||
"""
|
||||
queue = queue_key(QUEUE_LAKE_PUBLISH)
|
||||
logger.info("Lake publisher worker started, listening on %s", queue)
|
||||
|
||||
while True:
|
||||
raw = await rds.lpop(queue) # type: ignore[misc]
|
||||
if raw is None:
|
||||
await asyncio.sleep(poll_interval)
|
||||
continue
|
||||
|
||||
try:
|
||||
job = json.loads(str(raw))
|
||||
except (json.JSONDecodeError, TypeError):
|
||||
logger.error("Invalid lake publish job payload: %s", raw)
|
||||
continue
|
||||
|
||||
result = await dispatch_job(pool, minio_client, job)
|
||||
refs = result.get("refs") or []
|
||||
error = result.get("error")
|
||||
|
||||
if error:
|
||||
logger.error(
|
||||
"Lake publish job %s/%s failed: %s",
|
||||
result["job_type"], result["entity_id"], error,
|
||||
)
|
||||
else:
|
||||
ref_count = len(refs) if isinstance(refs, list) else 0
|
||||
logger.info(
|
||||
"Lake publish job %s/%s completed: %d facts written",
|
||||
result["job_type"], result["entity_id"], ref_count,
|
||||
)
|
||||
|
||||
|
||||
async def main() -> None:
|
||||
"""Entry point for the lake publisher worker process."""
|
||||
config = load_config()
|
||||
pool = await get_pg_pool(config)
|
||||
rds = get_redis(config)
|
||||
minio_client = get_minio(config)
|
||||
|
||||
try:
|
||||
await run_worker(pool, rds, minio_client)
|
||||
finally:
|
||||
await pool.close()
|
||||
await rds.close()
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
cfg = load_config()
|
||||
setup_logging("lake_publisher", level=cfg.log_level, json_output=cfg.json_logs)
|
||||
asyncio.run(main())
|
||||
Reference in New Issue
Block a user