feat: competitive intelligence & historical pattern matching layer
This commit is contained in:
@@ -1226,3 +1226,373 @@ def publish_prediction_vs_outcome_batch(
|
||||
) -> str:
|
||||
"""Publish a batch of prediction vs outcome rows as a single Parquet file."""
|
||||
return _publish_batch(client, "prediction_vs_outcome", rows, PREDICTION_VS_OUTCOME_SCHEMA, dt)
|
||||
|
||||
|
||||
# --- global_events fact table ---
|
||||
|
||||
# Arrow schema for rows of the global_events fact table.
# The list-valued inputs of publish_global_event_fact (event types, regions,
# sectors, commodities) are stored as single ", "-joined string columns.
GLOBAL_EVENTS_SCHEMA = pa.schema(
    [
        pa.field("event_id", pa.string()),
        pa.field("event_types", pa.string()),
        pa.field("severity", pa.string()),
        pa.field("affected_regions", pa.string()),
        pa.field("affected_sectors", pa.string()),
        pa.field("affected_commodities", pa.string()),
        pa.field("summary", pa.string()),
        pa.field("estimated_duration", pa.string()),
        pa.field("confidence", pa.float64()),
        pa.field("source_document_id", pa.string()),
        pa.field("created_at", pa.timestamp("us", tz="UTC")),
        # Partition date; presumably supplied by partition_values() -- confirm.
        pa.field("dt", pa.date32()),
    ]
)
|
||||
|
||||
|
||||
def publish_global_event_fact(
    client: Minio,
    event_id: str,
    event_types: list[str],
    severity: str,
    affected_regions: list[str],
    affected_sectors: list[str],
    affected_commodities: list[str],
    summary: str,
    estimated_duration: str,
    confidence: float,
    source_document_id: str,
    created_at: datetime,
) -> str:
    """Write one global event as a single-row Parquet object and return its URI.

    The list arguments (``event_types``, ``affected_regions``,
    ``affected_sectors``, ``affected_commodities``) are flattened into
    ``", "``-joined strings so they fit the string columns of
    ``GLOBAL_EVENTS_SCHEMA``. Partition columns are derived from
    ``created_at`` via ``partition_values``.

    Documented object layout:
        s3://stonks-lakehouse/warehouse/global_events/dt={date}/part-{uuid}.parquet

    Returns:
        The s3:// URI of the written object.

    Requirements: 7.3, 12.6
    Design ref: Analytical Lake Datasets (lake.global_events)
    """
    record: dict[str, object] = {
        "event_id": event_id,
        "event_types": ", ".join(event_types),
        "severity": severity,
        "affected_regions": ", ".join(affected_regions),
        "affected_sectors": ", ".join(affected_sectors),
        "affected_commodities": ", ".join(affected_commodities),
        "summary": summary,
        "estimated_duration": estimated_duration,
        "confidence": confidence,
        "source_document_id": source_document_id,
        "created_at": created_at,
    }
    # Partition columns are merged last, so they win on any key clash.
    record.update(partition_values(created_at))

    arrow_table = pa.Table.from_pylist([record], schema=GLOBAL_EVENTS_SCHEMA)
    payload = _write_parquet_bytes(arrow_table)

    object_key = _partition_path("global_events", created_at)
    _put_lakehouse_object(client, "global_events", object_key, payload)

    uri = s3_uri(object_key)
    logger.info("Published global_event fact %s: %s", event_id, uri)
    return uri
|
||||
|
||||
|
||||
# --- macro_impacts fact table ---
|
||||
|
||||
# Arrow schema for rows of the macro_impacts fact table.
# contributing_factors holds the ", "-joined list built by
# publish_macro_impact_fact.
MACRO_IMPACTS_SCHEMA = pa.schema(
    [
        pa.field("event_id", pa.string()),
        pa.field("company_id", pa.string()),
        pa.field("ticker", pa.string()),
        pa.field("macro_impact_score", pa.float64()),
        pa.field("impact_direction", pa.string()),
        pa.field("contributing_factors", pa.string()),
        pa.field("confidence", pa.float64()),
        pa.field("computed_at", pa.timestamp("us", tz="UTC")),
        # Partition date; presumably supplied by partition_values() -- confirm.
        pa.field("dt", pa.date32()),
    ]
)
|
||||
|
||||
|
||||
def publish_macro_impact_fact(
    client: Minio,
    event_id: str,
    company_id: str,
    ticker: str,
    macro_impact_score: float,
    impact_direction: str,
    contributing_factors: list[str],
    confidence: float,
    computed_at: datetime,
) -> str:
    """Write one macro impact as a single-row Parquet object and return its URI.

    ``contributing_factors`` is flattened into a ``", "``-joined string to
    fit ``MACRO_IMPACTS_SCHEMA``. The object is partitioned by
    ``computed_at`` plus an extra ``ticker`` partition.

    Documented object layout:
        s3://stonks-lakehouse/warehouse/macro_impacts/dt={date}/ticker={ticker}/part-{uuid}.parquet

    Returns:
        The s3:// URI of the written object.

    Requirements: 7.3, 12.6
    Design ref: Analytical Lake Datasets (lake.macro_impacts)
    """
    ticker_partition = {"ticker": ticker}

    record: dict[str, object] = {
        "event_id": event_id,
        "company_id": company_id,
        "ticker": ticker,
        "macro_impact_score": macro_impact_score,
        "impact_direction": impact_direction,
        "contributing_factors": ", ".join(contributing_factors),
        "confidence": confidence,
        "computed_at": computed_at,
    }
    # Partition columns are merged last, so they win on any key clash.
    record.update(partition_values(computed_at, ticker_partition))

    arrow_table = pa.Table.from_pylist([record], schema=MACRO_IMPACTS_SCHEMA)
    payload = _write_parquet_bytes(arrow_table)

    object_key = _partition_path(
        "macro_impacts", computed_at, extra_partitions=ticker_partition
    )
    _put_lakehouse_object(client, "macro_impacts", object_key, payload)

    uri = s3_uri(object_key)
    logger.info("Published macro_impact fact for %s/%s: %s", ticker, event_id, uri)
    return uri
|
||||
|
||||
|
||||
# --- trend_projections fact table ---
|
||||
|
||||
# Arrow schema for rows of the trend_projections fact table.
# driving_factors holds the ", "-joined list built by
# publish_trend_projection_fact.
TREND_PROJECTIONS_SCHEMA = pa.schema(
    [
        pa.field("trend_window_id", pa.string()),
        pa.field("ticker", pa.string()),
        pa.field("projected_direction", pa.string()),
        pa.field("projected_strength", pa.float64()),
        pa.field("projected_confidence", pa.float64()),
        pa.field("projection_horizon", pa.string()),
        pa.field("driving_factors", pa.string()),
        pa.field("macro_contribution_pct", pa.float64()),
        pa.field("diverges_from_current", pa.bool_()),
        pa.field("computed_at", pa.timestamp("us", tz="UTC")),
        # Partition date; presumably supplied by partition_values() -- confirm.
        pa.field("dt", pa.date32()),
    ]
)
|
||||
|
||||
|
||||
def publish_trend_projection_fact(
    client: Minio,
    trend_window_id: str,
    ticker: str,
    projected_direction: str,
    projected_strength: float,
    projected_confidence: float,
    projection_horizon: str,
    driving_factors: list[str],
    macro_contribution_pct: float,
    diverges_from_current: bool,
    computed_at: datetime,
) -> str:
    """Write one trend projection as a single-row Parquet object and return its URI.

    ``driving_factors`` is flattened into a ``", "``-joined string to fit
    ``TREND_PROJECTIONS_SCHEMA``. The object is partitioned by
    ``computed_at`` plus an extra ``ticker`` partition.

    Documented object layout:
        s3://stonks-lakehouse/warehouse/trend_projections/dt={date}/ticker={ticker}/part-{uuid}.parquet

    Returns:
        The s3:// URI of the written object.

    Requirements: 7.3, 12.6
    Design ref: Analytical Lake Datasets (lake.trend_projections)
    """
    ticker_partition = {"ticker": ticker}

    record: dict[str, object] = {
        "trend_window_id": trend_window_id,
        "ticker": ticker,
        "projected_direction": projected_direction,
        "projected_strength": projected_strength,
        "projected_confidence": projected_confidence,
        "projection_horizon": projection_horizon,
        "driving_factors": ", ".join(driving_factors),
        "macro_contribution_pct": macro_contribution_pct,
        "diverges_from_current": diverges_from_current,
        "computed_at": computed_at,
    }
    # Partition columns are merged last, so they win on any key clash.
    record.update(partition_values(computed_at, ticker_partition))

    arrow_table = pa.Table.from_pylist([record], schema=TREND_PROJECTIONS_SCHEMA)
    payload = _write_parquet_bytes(arrow_table)

    object_key = _partition_path(
        "trend_projections", computed_at, extra_partitions=ticker_partition
    )
    _put_lakehouse_object(client, "trend_projections", object_key, payload)

    uri = s3_uri(object_key)
    logger.info("Published trend_projection fact for %s: %s", ticker, uri)
    return uri
|
||||
|
||||
|
||||
# --- Batch publishers for macro fact tables ---
|
||||
|
||||
def publish_global_events_batch(
    client: Minio,
    rows: list[dict[str, object]],
    dt: datetime,
) -> str:
    """Publish a batch of global event rows as a single Parquet file.

    Delegates to ``_publish_batch`` with the ``global_events`` table name and
    ``GLOBAL_EVENTS_SCHEMA``, and returns its result unchanged.
    """
    uri = _publish_batch(client, "global_events", rows, GLOBAL_EVENTS_SCHEMA, dt)
    return uri
|
||||
|
||||
|
||||
def publish_macro_impacts_batch(
    client: Minio,
    rows: list[dict[str, object]],
    dt: datetime,
    ticker: str = "",
) -> str:
    """Publish a batch of macro impact rows as a single Parquet file.

    A non-empty ``ticker`` is forwarded to ``_publish_batch`` as an extra
    ``ticker`` partition; an empty one forwards ``None`` instead.
    Returns the result of ``_publish_batch`` unchanged.
    """
    partition_extras = None
    if ticker:
        partition_extras = {"ticker": ticker}
    return _publish_batch(
        client, "macro_impacts", rows, MACRO_IMPACTS_SCHEMA, dt, partition_extras
    )
|
||||
|
||||
|
||||
def publish_trend_projections_batch(
    client: Minio,
    rows: list[dict[str, object]],
    dt: datetime,
    ticker: str = "",
) -> str:
    """Publish a batch of trend projection rows as a single Parquet file.

    A non-empty ``ticker`` is forwarded to ``_publish_batch`` as an extra
    ``ticker`` partition; an empty one forwards ``None`` instead.
    Returns the result of ``_publish_batch`` unchanged.
    """
    partition_extras = None
    if ticker:
        partition_extras = {"ticker": ticker}
    return _publish_batch(
        client,
        "trend_projections",
        rows,
        TREND_PROJECTIONS_SCHEMA,
        dt,
        partition_extras,
    )
|
||||
|
||||
|
||||
# --- competitor_relationships fact table ---
|
||||
|
||||
# Arrow schema for rows of the competitor_relationships fact table.
COMPETITOR_RELATIONSHIPS_SCHEMA = pa.schema(
    [
        pa.field("id", pa.string()),
        pa.field("company_a_id", pa.string()),
        pa.field("company_b_id", pa.string()),
        pa.field("relationship_type", pa.string()),
        pa.field("strength", pa.float64()),
        pa.field("bidirectional", pa.bool_()),
        pa.field("source", pa.string()),
        pa.field("active", pa.bool_()),
        pa.field("created_at", pa.timestamp("us", tz="UTC")),
        # Partition date; presumably supplied by partition_values() -- confirm.
        pa.field("dt", pa.date32()),
    ]
)
|
||||
|
||||
|
||||
def publish_competitor_relationship_fact(
    client: Minio,
    relationship_id: str,
    company_a_id: str,
    company_b_id: str,
    relationship_type: str,
    strength: float,
    bidirectional: bool,
    source: str,
    active: bool,
    created_at: datetime,
) -> str:
    """Write one competitor relationship as a single-row Parquet object.

    ``relationship_id`` is stored in the ``id`` column of
    ``COMPETITOR_RELATIONSHIPS_SCHEMA``. Partition columns are derived from
    ``created_at`` via ``partition_values``.

    Documented object layout:
        s3://stonks-lakehouse/warehouse/competitor_relationships/dt={date}/part-{uuid}.parquet

    Returns:
        The s3:// URI of the written object.

    Requirements: 7.3
    Design ref: Analytical Lake Datasets (lake.competitor_relationships)
    """
    record: dict[str, object] = {
        "id": relationship_id,
        "company_a_id": company_a_id,
        "company_b_id": company_b_id,
        "relationship_type": relationship_type,
        "strength": strength,
        "bidirectional": bidirectional,
        "source": source,
        "active": active,
        "created_at": created_at,
    }
    # Partition columns are merged last, so they win on any key clash.
    record.update(partition_values(created_at))

    arrow_table = pa.Table.from_pylist(
        [record], schema=COMPETITOR_RELATIONSHIPS_SCHEMA
    )
    payload = _write_parquet_bytes(arrow_table)

    object_key = _partition_path("competitor_relationships", created_at)
    _put_lakehouse_object(client, "competitor_relationships", object_key, payload)

    uri = s3_uri(object_key)
    logger.info("Published competitor_relationship fact %s: %s", relationship_id, uri)
    return uri
|
||||
|
||||
|
||||
def publish_competitor_relationships_batch(
    client: Minio,
    rows: list[dict[str, object]],
    dt: datetime,
) -> str:
    """Publish a batch of competitor relationship rows as a single Parquet file.

    Delegates to ``_publish_batch`` with the ``competitor_relationships``
    table name and ``COMPETITOR_RELATIONSHIPS_SCHEMA``, and returns its
    result unchanged.
    """
    uri = _publish_batch(
        client,
        "competitor_relationships",
        rows,
        COMPETITOR_RELATIONSHIPS_SCHEMA,
        dt,
    )
    return uri
|
||||
|
||||
|
||||
# --- competitive_signals fact table ---
|
||||
|
||||
# Arrow schema for rows of the competitive_signals fact table.
COMPETITIVE_SIGNALS_SCHEMA = pa.schema(
    [
        pa.field("id", pa.string()),
        pa.field("source_document_id", pa.string()),
        pa.field("source_ticker", pa.string()),
        pa.field("target_ticker", pa.string()),
        pa.field("catalyst_type", pa.string()),
        pa.field("pattern_confidence", pa.float64()),
        pa.field("signal_direction", pa.string()),
        pa.field("signal_strength", pa.float64()),
        pa.field("relationship_strength", pa.float64()),
        pa.field("computed_at", pa.timestamp("us", tz="UTC")),
        # Partition date; presumably supplied by partition_values() -- confirm.
        pa.field("dt", pa.date32()),
    ]
)
|
||||
|
||||
|
||||
def publish_competitive_signal_fact(
    client: Minio,
    signal_id: str,
    source_document_id: str,
    source_ticker: str,
    target_ticker: str,
    catalyst_type: str,
    pattern_confidence: float,
    signal_direction: str,
    signal_strength: float,
    relationship_strength: float,
    computed_at: datetime,
) -> str:
    """Write one competitive signal as a single-row Parquet object.

    ``signal_id`` is stored in the ``id`` column of
    ``COMPETITIVE_SIGNALS_SCHEMA``. The object is partitioned by
    ``computed_at`` plus an extra ``target_ticker`` partition.

    Documented object layout:
        s3://stonks-lakehouse/warehouse/competitive_signals/dt={date}/target_ticker={ticker}/part-{uuid}.parquet

    Returns:
        The s3:// URI of the written object.

    Requirements: 7.4
    Design ref: Analytical Lake Datasets (lake.competitive_signals)
    """
    target_partition = {"target_ticker": target_ticker}

    record: dict[str, object] = {
        "id": signal_id,
        "source_document_id": source_document_id,
        "source_ticker": source_ticker,
        "target_ticker": target_ticker,
        "catalyst_type": catalyst_type,
        "pattern_confidence": pattern_confidence,
        "signal_direction": signal_direction,
        "signal_strength": signal_strength,
        "relationship_strength": relationship_strength,
        "computed_at": computed_at,
    }
    # Partition columns are merged last, so they win on any key clash.
    record.update(partition_values(computed_at, target_partition))

    arrow_table = pa.Table.from_pylist([record], schema=COMPETITIVE_SIGNALS_SCHEMA)
    payload = _write_parquet_bytes(arrow_table)

    object_key = _partition_path(
        "competitive_signals", computed_at, extra_partitions=target_partition
    )
    _put_lakehouse_object(client, "competitive_signals", object_key, payload)

    uri = s3_uri(object_key)
    logger.info("Published competitive_signal fact for %s→%s: %s", source_ticker, target_ticker, uri)
    return uri
|
||||
|
||||
|
||||
def publish_competitive_signals_batch(
    client: Minio,
    rows: list[dict[str, object]],
    dt: datetime,
    target_ticker: str = "",
) -> str:
    """Publish a batch of competitive signal rows as a single Parquet file.

    A non-empty ``target_ticker`` is forwarded to ``_publish_batch`` as an
    extra ``target_ticker`` partition; an empty one forwards ``None``
    instead. Returns the result of ``_publish_batch`` unchanged.
    """
    partition_extras = None
    if target_ticker:
        partition_extras = {"target_ticker": target_ticker}
    return _publish_batch(
        client,
        "competitive_signals",
        rows,
        COMPETITIVE_SIGNALS_SCHEMA,
        dt,
        partition_extras,
    )
|
||||
|
||||
Reference in New Issue
Block a user