674 lines
23 KiB
Python
674 lines
23 KiB
Python
"""Lake publisher async job runner — transforms operational data into analytical facts.
|
|
|
|
Reads jobs from the QUEUE_LAKE_PUBLISH Redis queue, queries PostgreSQL for
operational records, and publishes them as partitioned Parquet files to MinIO
via the existing publish_* functions in worker.py.

Job message format:
    {"job_type": "<table_name>", "entity_id": "<uuid or ticker>", "dt": "2026-04-11T..."}

    The dispatcher in this module reads only "job_type", "entity_id" and
    "since". Bulk jobs are selected by an ISO-8601 "since" cutoff rather
    than by "entity_id"; "dt" is not consumed by the dispatcher.

Supported job types:
    - document: publish a single document metadata fact
    - document_extraction: publish extraction facts for a document
    - market_snapshot: publish market bars/quotes from a snapshot
    - trade_order: publish an order fact
    - trade_fill: publish fill facts for an order
    - positions_snapshot: publish daily position snapshots for a broker account
    - pnl_snapshot: publish daily PnL for a broker account
    - company_event: publish a company event fact (declared in JOB_TYPES, but
      dispatch_job has no handler for it, so these jobs currently produce an
      error result)
    - bulk_documents: publish documents created since a cutoff
      (at most 500 rows per job)
    - bulk_extractions: publish valid extractions created since a cutoff
      (at most 500 rows per job)

Requirements: 9.4, 9.5, 10.1
Design ref: Section 4.10 (Lake Publisher), Section 8.4 (Lake publication flow)
|
|
"""
|
|
from __future__ import annotations
|
|
|
|
import asyncio
|
|
import json
|
|
import logging
|
|
from datetime import datetime, timezone
|
|
|
|
import asyncpg
|
|
import redis.asyncio as aioredis
|
|
from minio import Minio
|
|
|
|
from services.lake_publisher.worker import (
|
|
publish_document_extraction,
|
|
publish_document_fact,
|
|
publish_market_bar,
|
|
publish_market_quote,
|
|
publish_trade_order,
|
|
publish_trade_fill,
|
|
publish_pnl_daily,
|
|
publish_documents_batch,
|
|
publish_document_extractions_batch,
|
|
publish_positions_daily_batch,
|
|
)
|
|
from services.lake_publisher.partitions import partition_values
|
|
from services.shared.config import load_config
|
|
from services.shared.db import get_minio, get_pg_pool, get_redis
|
|
from services.shared.logging import setup_logging
|
|
from services.shared.redis_keys import QUEUE_LAKE_PUBLISH, queue_key
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# SQL queries for fetching operational data
|
|
# ---------------------------------------------------------------------------
|
|
|
|
_FETCH_DOCUMENT = """
|
|
SELECT
|
|
d.id, d.document_type, d.source_type, d.publisher, d.title,
|
|
d.url, d.canonical_url, d.language, d.published_at, d.retrieved_at,
|
|
d.content_hash, d.parse_quality_score,
|
|
COALESCE(
|
|
(SELECT dcm.ticker FROM document_company_mentions dcm
|
|
WHERE dcm.document_id = d.id LIMIT 1),
|
|
''
|
|
) AS ticker
|
|
FROM documents d
|
|
WHERE d.id = $1::uuid
|
|
"""
|
|
|
|
_FETCH_EXTRACTIONS = """
|
|
SELECT
|
|
di.document_id, dir.ticker, dir.relevance, dir.sentiment,
|
|
dir.impact_score, dir.impact_horizon, dir.catalyst_type,
|
|
di.confidence, di.novelty_score, di.source_credibility,
|
|
dir.key_facts, dir.risks, di.macro_themes,
|
|
di.model_name, di.prompt_version, di.schema_version,
|
|
di.created_at AS extraction_at,
|
|
COALESCE(c.legal_name, '') AS company_name
|
|
FROM document_intelligence di
|
|
JOIN document_impact_records dir ON dir.intelligence_id = di.id
|
|
LEFT JOIN companies c ON c.id = dir.company_id
|
|
WHERE di.document_id = $1::uuid
|
|
AND di.validation_status = 'valid'
|
|
"""
|
|
|
|
_FETCH_MARKET_SNAPSHOT = """
|
|
SELECT
|
|
ms.ticker, ms.snapshot_type, ms.data, ms.source_provider, ms.captured_at
|
|
FROM market_snapshots ms
|
|
WHERE ms.id = $1::uuid
|
|
"""
|
|
|
|
_FETCH_ORDER = """
|
|
SELECT
|
|
o.id, o.recommendation_id, o.ticker, o.side, o.order_type,
|
|
o.quantity, o.limit_price, o.status, o.submitted_at,
|
|
o.fill_price, o.fill_quantity, o.filled_at,
|
|
COALESCE(ba.account_id, '') AS broker_account,
|
|
COALESCE(ba.mode, 'paper') AS execution_mode
|
|
FROM orders o
|
|
LEFT JOIN broker_accounts ba ON ba.id = o.broker_account_id
|
|
WHERE o.id = $1::uuid
|
|
"""
|
|
|
|
_FETCH_ORDER_FILLS = """
|
|
SELECT
|
|
oe.id AS fill_id, oe.order_id, oe.data, oe.broker_timestamp,
|
|
o.ticker, o.side,
|
|
COALESCE(ba.account_id, '') AS broker_account
|
|
FROM order_events oe
|
|
JOIN orders o ON o.id = oe.order_id
|
|
LEFT JOIN broker_accounts ba ON ba.id = o.broker_account_id
|
|
WHERE oe.order_id = $1::uuid AND oe.event_type = 'fill'
|
|
"""
|
|
|
|
_FETCH_POSITIONS = """
|
|
SELECT
|
|
p.ticker, p.quantity, p.avg_entry_price, p.current_price,
|
|
p.unrealized_pnl, p.realized_pnl,
|
|
COALESCE(ba.account_id, '') AS broker_account,
|
|
COALESCE(ba.mode, 'paper') AS execution_mode
|
|
FROM positions p
|
|
LEFT JOIN broker_accounts ba ON ba.id = p.broker_account_id
|
|
WHERE p.broker_account_id = $1::uuid AND p.quantity != 0
|
|
"""
|
|
|
|
_FETCH_BULK_DOCUMENTS = """
|
|
SELECT
|
|
d.id, d.document_type, d.source_type, d.publisher, d.title,
|
|
d.url, d.canonical_url, d.language, d.published_at, d.retrieved_at,
|
|
d.content_hash, d.parse_quality_score,
|
|
COALESCE(
|
|
(SELECT dcm.ticker FROM document_company_mentions dcm
|
|
WHERE dcm.document_id = d.id LIMIT 1),
|
|
''
|
|
) AS ticker
|
|
FROM documents d
|
|
WHERE d.created_at >= $1
|
|
AND d.status IN ('parsed', 'extracted')
|
|
ORDER BY d.created_at
|
|
LIMIT 500
|
|
"""
|
|
|
|
_FETCH_BULK_EXTRACTIONS = """
|
|
SELECT
|
|
di.document_id, dir.ticker, dir.relevance, dir.sentiment,
|
|
dir.impact_score, dir.impact_horizon, dir.catalyst_type,
|
|
di.confidence, di.novelty_score, di.source_credibility,
|
|
dir.key_facts, dir.risks, di.macro_themes,
|
|
di.model_name, di.prompt_version, di.schema_version,
|
|
di.created_at AS extraction_at,
|
|
COALESCE(c.legal_name, '') AS company_name
|
|
FROM document_intelligence di
|
|
JOIN document_impact_records dir ON dir.intelligence_id = di.id
|
|
LEFT JOIN companies c ON c.id = dir.company_id
|
|
WHERE di.created_at >= $1
|
|
AND di.validation_status = 'valid'
|
|
ORDER BY di.created_at
|
|
LIMIT 500
|
|
"""
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Job handlers — each transforms operational rows into lake facts
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
def _jsonb_to_str(val: object) -> str:
|
|
"""Convert a JSONB column value (list or str) to a comma-separated string."""
|
|
if val is None:
|
|
return ""
|
|
if isinstance(val, str):
|
|
try:
|
|
parsed = json.loads(val)
|
|
if isinstance(parsed, list):
|
|
return ", ".join(str(x) for x in parsed)
|
|
return val
|
|
except (json.JSONDecodeError, TypeError):
|
|
return val
|
|
if isinstance(val, list):
|
|
return ", ".join(str(x) for x in val)
|
|
return str(val)
|
|
|
|
|
|
async def publish_document_job(
    pool: asyncpg.Pool,
    minio_client: Minio,
    entity_id: str,
) -> str:
    """Look up one document in PostgreSQL and publish its metadata fact.

    Args:
        pool: PostgreSQL connection pool used to run ``_FETCH_DOCUMENT``.
        minio_client: MinIO client handed to ``publish_document_fact``.
        entity_id: Document id; the query casts it to ``uuid``.

    Returns:
        The value returned by ``publish_document_fact``, or ``""`` when no
        document with that id exists.
    """
    doc = await pool.fetchrow(_FETCH_DOCUMENT, entity_id)
    if doc is None:
        logger.warning("Document %s not found, skipping lake publish", entity_id)
        return ""

    def text(column: str, fallback: str = "") -> str:
        # NULL and empty values both collapse to the fallback.
        return doc[column] or fallback

    retrieved_at = doc["retrieved_at"]
    return publish_document_fact(
        client=minio_client,
        document_id=str(doc["id"]),
        document_type=doc["document_type"],
        source_type=doc["source_type"],
        ticker=text("ticker"),
        publisher=text("publisher"),
        title=text("title"),
        # Documents without a publication time use their retrieval time.
        published_at=doc["published_at"] or retrieved_at,
        content_hash=doc["content_hash"],
        url=text("url"),
        canonical_url=text("canonical_url"),
        language=text("language", "en"),
        confidence=float(doc["parse_quality_score"] or 0.0),
        retrieved_at=retrieved_at,
    )
|
|
|
|
|
|
async def publish_extraction_job(
    pool: asyncpg.Pool,
    minio_client: Minio,
    entity_id: str,
) -> list[str]:
    """Publish one extraction fact per valid impact record of a document.

    Args:
        pool: PostgreSQL connection pool used to run ``_FETCH_EXTRACTIONS``.
        minio_client: MinIO client handed to ``publish_document_extraction``.
        entity_id: Document id; the query casts it to ``uuid``.

    Returns:
        The values returned by ``publish_document_extraction``, in query
        order; an empty list when the document has no valid extractions.
    """
    records = await pool.fetch(_FETCH_EXTRACTIONS, entity_id)
    if not records:
        logger.info("No valid extractions for document %s", entity_id)
        return []

    def publish_one(rec: asyncpg.Record) -> str:
        def score(column: str) -> float:
            # NULL scores are published as 0.0.
            return float(rec[column] or 0.0)

        return publish_document_extraction(
            client=minio_client,
            document_id=str(rec["document_id"]),
            ticker=rec["ticker"],
            sentiment=rec["sentiment"] or "neutral",
            impact_score=score("impact_score"),
            catalyst_type=rec["catalyst_type"] or "other",
            confidence=score("confidence"),
            extraction_at=rec["extraction_at"],
            model_name=rec["model_name"] or "",
            prompt_version=rec["prompt_version"] or "",
            company_name=rec["company_name"] or "",
            relevance=score("relevance"),
            impact_horizon=rec["impact_horizon"] or "",
            novelty_score=score("novelty_score"),
            source_credibility=score("source_credibility"),
            key_facts=_jsonb_to_str(rec["key_facts"]),
            risks=_jsonb_to_str(rec["risks"]),
            macro_themes=_jsonb_to_str(rec["macro_themes"]),
            schema_version=rec["schema_version"] or "",
        )

    return [publish_one(rec) for rec in records]
|
|
|
|
|
|
def _first_present(payload: dict, *keys: str, default: object = 0) -> object:
    """Return the first non-None value stored under ``keys``, else ``default``."""
    for key in keys:
        value = payload.get(key)
        if value is not None:
            return value
    return default


async def publish_market_snapshot_job(
    pool: asyncpg.Pool,
    minio_client: Minio,
    entity_id: str,
) -> list[str]:
    """Publish market bar/quote facts from a market_snapshots row.

    The snapshot payload may be a single bar/quote object, an object with a
    ``"bars"`` entry, or a JSON array of bars/quotes. Each field is looked up
    under its long name first and its short alias second (e.g. ``open`` /
    ``o``); missing or null numeric fields are published as 0.

    Args:
        pool: PostgreSQL connection pool used to run ``_FETCH_MARKET_SNAPSHOT``.
        minio_client: MinIO client handed to the ``publish_market_*`` functions.
        entity_id: Snapshot id; the query casts it to ``uuid``.

    Returns:
        The values returned by ``publish_market_bar`` / ``publish_market_quote``.
        Empty when the snapshot does not exist, has no payload, or has a
        snapshot type other than bar(s)/quote(s).
    """
    row = await pool.fetchrow(_FETCH_MARKET_SNAPSHOT, entity_id)
    if row is None:
        logger.warning("Market snapshot %s not found", entity_id)
        return []

    payload = row["data"]
    if payload is None:
        # Nothing to publish; previously this surfaced as an opaque TypeError
        # from json.loads(None).
        logger.warning(
            "Market snapshot %s has no data, skipping lake publish", entity_id
        )
        return []
    if not isinstance(payload, (dict, list)):
        # The column value arrives as JSON text unless it was already decoded.
        payload = json.loads(payload)

    ticker = row["ticker"]
    source = row["source_provider"] or ""
    captured_at = row["captured_at"]
    snapshot_type = row["snapshot_type"]
    refs: list[str] = []

    if snapshot_type in ("bar", "bars"):
        if isinstance(payload, list):
            bars = payload
        else:
            # Either {"bars": [...]} or the payload itself is the single bar.
            bars = payload.get("bars", payload)
            if isinstance(bars, dict):
                bars = [bars]
        for bar in bars:
            refs.append(
                publish_market_bar(
                    client=minio_client,
                    ticker=ticker,
                    open_price=float(_first_present(bar, "open", "o")),
                    high_price=float(_first_present(bar, "high", "h")),
                    low_price=float(_first_present(bar, "low", "l")),
                    close_price=float(_first_present(bar, "close", "c")),
                    volume=int(_first_present(bar, "volume", "v")),
                    # Every bar of the snapshot is stamped with the capture time.
                    bar_timestamp=captured_at,
                    source=source,
                    vwap=float(_first_present(bar, "vwap", "vw")),
                    trade_count=int(_first_present(bar, "trade_count", "n")),
                    bar_interval=_first_present(bar, "interval", default="1d"),
                )
            )
    elif snapshot_type in ("quote", "quotes"):
        quotes = payload if isinstance(payload, list) else [payload]
        for quote in quotes:
            refs.append(
                publish_market_quote(
                    client=minio_client,
                    ticker=ticker,
                    bid_price=float(_first_present(quote, "bid_price", "bp")),
                    ask_price=float(_first_present(quote, "ask_price", "ap")),
                    last_price=float(_first_present(quote, "last_price", "lp")),
                    quote_at=captured_at,
                    source=source,
                    bid_size=int(_first_present(quote, "bid_size", "bs")),
                    ask_size=int(_first_present(quote, "ask_size", "as")),
                    last_size=int(_first_present(quote, "last_size", "ls")),
                )
            )
    else:
        # Previously a silent no-op; make the skipped snapshot visible.
        logger.warning(
            "Unsupported market snapshot type %r for snapshot %s",
            snapshot_type, entity_id,
        )

    return refs
|
|
|
|
|
|
async def publish_order_job(
    pool: asyncpg.Pool,
    minio_client: Minio,
    entity_id: str,
) -> str:
    """Look up one order in PostgreSQL and publish it as a trade order fact.

    Args:
        pool: PostgreSQL connection pool used to run ``_FETCH_ORDER``.
        minio_client: MinIO client handed to ``publish_trade_order``.
        entity_id: Order id; the query casts it to ``uuid``.

    Returns:
        The value returned by ``publish_trade_order``, or ``""`` when the
        order does not exist.
    """
    order = await pool.fetchrow(_FETCH_ORDER, entity_id)
    if order is None:
        logger.warning("Order %s not found", entity_id)
        return ""

    limit_price = order["limit_price"]
    recommendation_id = order["recommendation_id"]

    return publish_trade_order(
        client=minio_client,
        order_id=str(order["id"]),
        ticker=order["ticker"],
        side=order["side"],
        order_type=order["order_type"],
        quantity=float(order["quantity"]),
        # NOTE(review): truthiness check — a stored limit price of 0 is
        # published as None, same as NULL. Confirm that is intended.
        limit_price=float(limit_price) if limit_price else None,
        status=order["status"],
        broker_account=order["broker_account"],
        # Orders that were never stamped fall back to the current UTC time.
        submitted_at=order["submitted_at"] or datetime.now(timezone.utc),
        recommendation_id=str(recommendation_id) if recommendation_id else "",
        execution_mode=order["execution_mode"],
    )
|
|
|
|
|
|
async def publish_fills_job(
    pool: asyncpg.Pool,
    minio_client: Minio,
    entity_id: str,
) -> list[str]:
    """Publish one trade fill fact per 'fill' event recorded for an order.

    Args:
        pool: PostgreSQL connection pool used to run ``_FETCH_ORDER_FILLS``.
        minio_client: MinIO client handed to ``publish_trade_fill``.
        entity_id: Order id; the query casts it to ``uuid``.

    Returns:
        The values returned by ``publish_trade_fill``, in query order; an
        empty list when the order has no fill events.
    """
    events = await pool.fetch(_FETCH_ORDER_FILLS, entity_id)
    if not events:
        logger.info("No fill events for order %s", entity_id)
        return []

    published: list[str] = []
    for event in events:
        details = event["data"]
        if not isinstance(details, dict):
            # JSON text (or NULL, treated as an empty object).
            details = json.loads(details or "{}")

        # Events without a broker timestamp are stamped with the current UTC time.
        filled_at = event["broker_timestamp"] or datetime.now(timezone.utc)

        published.append(
            publish_trade_fill(
                client=minio_client,
                fill_id=str(event["fill_id"]),
                order_id=str(event["order_id"]),
                ticker=event["ticker"],
                side=event["side"],
                fill_price=float(details.get("fill_price", details.get("price", 0))),
                fill_quantity=float(details.get("fill_quantity", details.get("qty", 0))),
                broker_account=event["broker_account"],
                filled_at=filled_at,
                commission=float(details.get("commission", 0)),
            )
        )
    return published
|
|
|
|
|
|
async def publish_positions_job(
    pool: asyncpg.Pool,
    minio_client: Minio,
    entity_id: str,
) -> str:
    """Publish the current non-zero positions of a broker account as one batch.

    Args:
        pool: PostgreSQL connection pool used to run ``_FETCH_POSITIONS``.
        minio_client: MinIO client handed to ``publish_positions_daily_batch``.
        entity_id: Broker account id; the query casts it to ``uuid``.

    Returns:
        The value returned by ``publish_positions_daily_batch``, or ``""``
        when the account has no positions with a non-zero quantity.
    """
    held = await pool.fetch(_FETCH_POSITIONS, entity_id)
    if not held:
        logger.info("No open positions for account %s", entity_id)
        return ""

    # The snapshot is stamped with the time the job runs, not a stored timestamp.
    snapshot_at = datetime.now(timezone.utc)

    positions = []
    for pos in held:
        positions.append(
            {
                "ticker": pos["ticker"],
                "quantity": float(pos["quantity"]),
                "avg_entry_price": float(pos["avg_entry_price"] or 0),
                # The current price is published under the "close_price" key.
                "close_price": float(pos["current_price"] or 0),
                "unrealized_pnl": float(pos["unrealized_pnl"] or 0),
            }
        )

    return publish_positions_daily_batch(
        client=minio_client,
        positions=positions,
        # All rows belong to the same account, so the first row's value is used.
        broker_account=held[0]["broker_account"],
        snapshot_at=snapshot_at,
    )
|
|
|
|
|
|
async def publish_pnl_job(
    pool: asyncpg.Pool,
    minio_client: Minio,
    entity_id: str,
) -> list[str]:
    """Publish one daily PnL fact per position of a broker account.

    Uses ``_FETCH_POSITIONS``, which only returns positions with a non-zero
    quantity. ``total_pnl`` is realized plus unrealized PnL, with NULL values
    counted as 0.

    Args:
        pool: PostgreSQL connection pool.
        minio_client: MinIO client handed to ``publish_pnl_daily``.
        entity_id: Broker account id; the query casts it to ``uuid``.

    Returns:
        The values returned by ``publish_pnl_daily``, in query order; an
        empty list when the account has no matching positions.
    """
    held = await pool.fetch(_FETCH_POSITIONS, entity_id)
    if not held:
        logger.info("No positions for PnL snapshot, account %s", entity_id)
        return []

    # One timestamp for the whole account so every fact shares the same dt.
    as_of = datetime.now(timezone.utc)

    def publish_one(pos: asyncpg.Record) -> str:
        realized_pnl = float(pos["realized_pnl"] or 0)
        unrealized_pnl = float(pos["unrealized_pnl"] or 0)
        return publish_pnl_daily(
            client=minio_client,
            ticker=pos["ticker"],
            realized_pnl=realized_pnl,
            unrealized_pnl=unrealized_pnl,
            total_pnl=realized_pnl + unrealized_pnl,
            broker_account=pos["broker_account"],
            dt=as_of,
            execution_mode=pos["execution_mode"],
        )

    return [publish_one(pos) for pos in held]
|
|
|
|
|
|
async def publish_bulk_documents_job(
    pool: asyncpg.Pool,
    minio_client: Minio,
    since: datetime,
) -> list[str]:
    """Publish documents created at or after ``since`` as a single batch.

    Only the rows returned by ``_FETCH_BULK_DOCUMENTS`` are published; that
    query is limited to 500 rows per call.

    Args:
        pool: PostgreSQL connection pool.
        minio_client: MinIO client handed to ``publish_documents_batch``.
        since: Creation-time cutoff passed to the query and to the batch
            publisher.

    Returns:
        A one-element list with the value returned by
        ``publish_documents_batch``, or an empty list when there is nothing
        to publish or the publisher returns a falsy value.
    """
    fetched = await pool.fetch(_FETCH_BULK_DOCUMENTS, since)
    if not fetched:
        logger.info("No documents to bulk-publish since %s", since)
        return []

    def to_lake_row(doc: asyncpg.Record) -> dict[str, object]:
        retrieved_at = doc["retrieved_at"]
        # Documents without a publication time use their retrieval time.
        published_at = doc["published_at"] or retrieved_at
        return {
            "document_id": str(doc["id"]),
            "document_type": doc["document_type"],
            "source_type": doc["source_type"],
            "ticker": doc["ticker"] or "",
            "publisher": doc["publisher"] or "",
            "title": doc["title"] or "",
            "url": doc["url"] or "",
            "canonical_url": doc["canonical_url"] or "",
            "language": doc["language"] or "en",
            "published_at": published_at,
            "retrieved_at": retrieved_at,
            "content_hash": doc["content_hash"],
            "confidence": float(doc["parse_quality_score"] or 0.0),
            # Partition columns go last so they win on any key clash.
            **partition_values(published_at),
        }

    doc_rows = [to_lake_row(doc) for doc in fetched]
    batch_ref = publish_documents_batch(minio_client, doc_rows, since)
    if not batch_ref:
        return []
    return [batch_ref]
|
|
|
|
|
|
async def publish_bulk_extractions_job(
    pool: asyncpg.Pool,
    minio_client: Minio,
    since: datetime,
) -> list[str]:
    """Publish valid extractions created at or after ``since`` as one batch.

    Only the rows returned by ``_FETCH_BULK_EXTRACTIONS`` are published; that
    query is limited to 500 rows per call. Each row's model version is its
    ``schema_version``, falling back to ``prompt_version`` and then ``""``.

    Args:
        pool: PostgreSQL connection pool.
        minio_client: MinIO client handed to
            ``publish_document_extractions_batch``.
        since: Creation-time cutoff passed to the query and to the batch
            publisher.

    Returns:
        A one-element list with the value returned by
        ``publish_document_extractions_batch``, or an empty list when there
        is nothing to publish or the publisher returns a falsy value.
    """
    fetched = await pool.fetch(_FETCH_BULK_EXTRACTIONS, since)
    if not fetched:
        logger.info("No extractions to bulk-publish since %s", since)
        return []

    def to_lake_row(rec: asyncpg.Record) -> dict[str, object]:
        schema_version = rec["schema_version"]
        prompt_version = rec["prompt_version"]
        extraction_at = rec["extraction_at"]
        version = schema_version or prompt_version or ""
        return {
            "document_id": str(rec["document_id"]),
            "ticker": rec["ticker"],
            "company_name": rec["company_name"] or "",
            "relevance": float(rec["relevance"] or 0.0),
            "sentiment": rec["sentiment"] or "neutral",
            "impact_score": float(rec["impact_score"] or 0.0),
            "impact_horizon": rec["impact_horizon"] or "",
            "catalyst_type": rec["catalyst_type"] or "other",
            "confidence": float(rec["confidence"] or 0.0),
            "novelty_score": float(rec["novelty_score"] or 0.0),
            "source_credibility": float(rec["source_credibility"] or 0.0),
            "key_facts": _jsonb_to_str(rec["key_facts"]),
            "risks": _jsonb_to_str(rec["risks"]),
            "macro_themes": _jsonb_to_str(rec["macro_themes"]),
            "model_name": rec["model_name"] or "",
            "prompt_version": prompt_version or "",
            "schema_version": schema_version or "",
            "extraction_at": extraction_at,
            # Partition columns go last so they win on any key clash.
            **partition_values(extraction_at, {"model_version": version}),
        }

    extraction_rows = [to_lake_row(rec) for rec in fetched]

    # The batch is labelled with the first row's model version.
    # NOTE(review): rows in one batch may carry different versions — confirm
    # that labelling by the first row is intended.
    batch_version = extraction_rows[0].get("model_version", "")

    batch_ref = publish_document_extractions_batch(
        minio_client,
        extraction_rows,
        since,
        model_version=str(batch_version),
    )
    if not batch_ref:
        return []
    return [batch_ref]
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Job dispatcher
|
|
# ---------------------------------------------------------------------------
|
|
|
|
# Every job_type value this module declares.
# NOTE: "company_event" is declared here, but dispatch_job has no publishing
# handler for it.
JOB_TYPES = {
    # Bulk backfills (selected by a "since" cutoff)
    "bulk_documents",
    "bulk_extractions",
    # Single-entity jobs (selected by "entity_id")
    "company_event",
    "document",
    "document_extraction",
    "market_snapshot",
    "pnl_snapshot",
    "positions_snapshot",
    "trade_fill",
    "trade_order",
}
|
|
|
|
|
|
def _parse_since(value: object) -> datetime:
    """Parse a bulk job's ``since`` cutoff; use the current UTC time when absent."""
    if not value:
        return datetime.now(timezone.utc)
    text = str(value).strip()
    # datetime.fromisoformat() only understands a trailing "Z" from Python 3.11
    # on; rewriting it as an explicit offset works on every version.
    if text.endswith("Z"):
        text = f"{text[:-1]}+00:00"
    return datetime.fromisoformat(text)


async def dispatch_job(
    pool: asyncpg.Pool,
    minio_client: Minio,
    job: dict[str, str],
) -> dict[str, object]:
    """Dispatch a lake publish job to the appropriate handler.

    Never raises for a bad job: malformed payloads, unknown job types and
    handler failures are all reported through the result's 'error' field.

    Args:
        pool: PostgreSQL connection pool.
        minio_client: MinIO client for writing Parquet files.
        job: Job dict with at least 'job_type' and 'entity_id'. Bulk jobs
            read an ISO-8601 'since' cutoff instead of 'entity_id'.

    Returns:
        A result dict with 'job_type', 'entity_id', 'refs' (list of s3 URIs),
        and 'error' (None on success).
    """
    if not isinstance(job, dict):
        # Valid JSON that is not an object (list, string, number) used to
        # raise AttributeError here and escape to the caller.
        logger.warning("Lake publish job payload is not an object: %r", job)
        return {
            "job_type": "",
            "entity_id": "",
            "refs": [],
            "error": f"Job payload must be a JSON object, got {type(job).__name__}",
        }

    job_type = job.get("job_type", "")
    entity_id = job.get("entity_id", "")
    since_str = job.get("since")

    result: dict[str, object] = {
        "job_type": job_type,
        "entity_id": entity_id,
        "refs": [],
        "error": None,
    }

    # Handlers returning a single ref ("" when nothing was published).
    single_ref_handlers = {
        "document": publish_document_job,
        "trade_order": publish_order_job,
        "positions_snapshot": publish_positions_job,
    }
    # Handlers returning a list of refs.
    multi_ref_handlers = {
        "document_extraction": publish_extraction_job,
        "market_snapshot": publish_market_snapshot_job,
        "trade_fill": publish_fills_job,
        "pnl_snapshot": publish_pnl_job,
    }
    # Handlers taking a 'since' cutoff instead of an entity id.
    bulk_handlers = {
        "bulk_documents": publish_bulk_documents_job,
        "bulk_extractions": publish_bulk_extractions_job,
    }

    try:
        if job_type in single_ref_handlers:
            ref = await single_ref_handlers[job_type](pool, minio_client, entity_id)
            result["refs"] = [ref] if ref else []

        elif job_type in multi_ref_handlers:
            result["refs"] = await multi_ref_handlers[job_type](
                pool, minio_client, entity_id
            )

        elif job_type in bulk_handlers:
            if not since_str:
                # The fallback cutoff is "now", which only matches rows
                # created from this moment on — make that visible.
                logger.warning(
                    "Bulk lake publish job %s has no 'since' cutoff; defaulting to now",
                    job_type,
                )
            since = _parse_since(since_str)
            result["refs"] = await bulk_handlers[job_type](pool, minio_client, since)

        elif job_type in JOB_TYPES:
            # Declared job type without a handler (currently company_event).
            result["error"] = f"Job type not implemented: {job_type}"
            logger.warning("Lake publish job type not implemented: %s", job_type)

        else:
            result["error"] = f"Unknown job_type: {job_type}"
            logger.warning("Unknown lake publish job type: %s", job_type)

    except Exception as exc:
        # Boundary handler: one failing job must not take down the worker loop.
        result["error"] = str(exc)
        logger.exception("Lake publish job failed: %s/%s", job_type, entity_id)

    return result
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Async worker loop
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
async def run_worker(
    pool: asyncpg.Pool,
    rds: aioredis.Redis,
    minio_client: Minio,
    poll_interval: float = 2.0,
) -> None:
    """Main worker loop — reads jobs from Redis and dispatches them.

    Runs indefinitely until cancelled. Each job is processed sequentially
    to keep MinIO write ordering predictable. Payloads that are not a JSON
    object are logged and dropped.

    Args:
        pool: PostgreSQL connection pool passed to the job handlers.
        rds: Redis client the job queue is popped from.
        minio_client: MinIO client passed to the job handlers.
        poll_interval: Seconds to sleep when the queue is empty.
    """
    queue = queue_key(QUEUE_LAKE_PUBLISH)
    logger.info("Lake publisher worker started, listening on %s", queue)

    while True:
        raw = await rds.lpop(queue)  # type: ignore[misc]
        if raw is None:
            await asyncio.sleep(poll_interval)
            continue

        try:
            # json.loads accepts str as well as bytes/bytearray, so this works
            # whether or not the Redis client decodes responses. (str(raw) on
            # bytes would yield "b'...'", which is never valid JSON.)
            job = json.loads(raw)
        except (ValueError, TypeError):
            # ValueError covers JSONDecodeError and UnicodeDecodeError.
            logger.error("Invalid lake publish job payload: %s", raw)
            continue

        if not isinstance(job, dict):
            # Valid JSON, but not a job object (e.g. a list or a bare string).
            logger.error("Invalid lake publish job payload: %s", raw)
            continue

        result = await dispatch_job(pool, minio_client, job)
        error = result.get("error")

        if error:
            logger.error(
                "Lake publish job %s/%s failed: %s",
                result["job_type"], result["entity_id"], error,
            )
        else:
            refs = result.get("refs") or []
            ref_count = len(refs) if isinstance(refs, list) else 0
            logger.info(
                "Lake publish job %s/%s completed: %d facts written",
                result["job_type"], result["entity_id"], ref_count,
            )
|
|
|
|
|
|
async def main() -> None:
    """Entry point for the lake publisher worker process.

    Builds the PostgreSQL pool, Redis client and MinIO client from the loaded
    configuration, runs the worker loop, and closes the Redis client and the
    pool on the way out — including when start-up fails part-way or the
    loop is cancelled.
    """
    config = load_config()
    pool = await get_pg_pool(config)
    try:
        rds = get_redis(config)
        try:
            minio_client = get_minio(config)
            await run_worker(pool, rds, minio_client)
        finally:
            # Newer redis-py releases deprecate the async close() in favour of
            # aclose(); use whichever this client provides.
            close_redis = getattr(rds, "aclose", None) or rds.close
            await close_redis()
    finally:
        # Runs even if closing Redis raised, so the pool is always released.
        await pool.close()
|
|
|
|
|
|
if __name__ == "__main__":
    # Configure logging before the event loop starts so that the worker's
    # start-up messages are emitted through the configured handlers.
    cfg = load_config()
    setup_logging(
        "lake_publisher",
        level=cfg.log_level,
        json_output=cfg.json_logs,
    )
    asyncio.run(main())
|