feat: competitive intelligence & historical pattern matching layer

2026-04-14 19:42:48 +00:00
parent b478022ba3
commit f7a11d14ea
203 changed files with 20155 additions and 97 deletions
@@ -1226,3 +1226,373 @@ def publish_prediction_vs_outcome_batch(
 ) -> str:
    """Publish a batch of prediction vs outcome rows as a single Parquet file."""
    return _publish_batch(client, "prediction_vs_outcome", rows, PREDICTION_VS_OUTCOME_SCHEMA, dt)
+
+
+# --- global_events fact table ---
+
+GLOBAL_EVENTS_SCHEMA = pa.schema([
+    pa.field("event_id", pa.string()),
+    pa.field("event_types", pa.string()),  # list joined with ", " by publish_global_event_fact
+    pa.field("severity", pa.string()),
+    pa.field("affected_regions", pa.string()),  # list joined with ", "
+    pa.field("affected_sectors", pa.string()),  # list joined with ", "
+    pa.field("affected_commodities", pa.string()),  # list joined with ", "
+    pa.field("summary", pa.string()),
+    pa.field("estimated_duration", pa.string()),
+    pa.field("confidence", pa.float64()),
+    pa.field("source_document_id", pa.string()),
+    pa.field("created_at", pa.timestamp("us", tz="UTC")),
+    pa.field("dt", pa.date32()),
+])
+
+
+def publish_global_event_fact(
+    client: Minio,
+    event_id: str,
+    event_types: list[str],
+    severity: str,
+    affected_regions: list[str],
+    affected_sectors: list[str],
+    affected_commodities: list[str],
+    summary: str,
+    estimated_duration: str,
+    confidence: float,
+    source_document_id: str,
+    created_at: datetime,
+) -> str:
+    """Write one global event as a single-row Parquet object in MinIO.
+
+    Object location:
+      s3://stonks-lakehouse/warehouse/global_events/dt={date}/part-{uuid}.parquet
+
+    List arguments are stored as ", "-joined strings. Returns the s3:// URI.
+
+    Requirements: 7.3, 12.6
+    Design ref: Analytical Lake Datasets (lake.global_events)
+    """
+    record: dict[str, object] = dict(
+        event_id=event_id,
+        event_types=", ".join(event_types),
+        severity=severity,
+        affected_regions=", ".join(affected_regions),
+        affected_sectors=", ".join(affected_sectors),
+        affected_commodities=", ".join(affected_commodities),
+        summary=summary,
+        estimated_duration=estimated_duration,
+        confidence=confidence,
+        source_document_id=source_document_id,
+        created_at=created_at,
+    )
+    # update() keeps the override-on-clash behaviour of a trailing ** unpacking.
+    record.update(partition_values(created_at))
+
+    blob = _write_parquet_bytes(pa.Table.from_pylist([record], schema=GLOBAL_EVENTS_SCHEMA))
+    object_path = _partition_path("global_events", created_at)
+    _put_lakehouse_object(client, "global_events", object_path, blob)
+
+    uri = s3_uri(object_path)
+    logger.info("Published global_event fact %s: %s", event_id, uri)
+    return uri
+
+
+# --- macro_impacts fact table ---
+
+MACRO_IMPACTS_SCHEMA = pa.schema([
+    pa.field("event_id", pa.string()),
+    pa.field("company_id", pa.string()),
+    pa.field("ticker", pa.string()),
+    pa.field("macro_impact_score", pa.float64()),
+    pa.field("impact_direction", pa.string()),
+    pa.field("contributing_factors", pa.string()),  # list joined with ", " by publish_macro_impact_fact
+    pa.field("confidence", pa.float64()),
+    pa.field("computed_at", pa.timestamp("us", tz="UTC")),
+    pa.field("dt", pa.date32()),
+])
+
+
+def publish_macro_impact_fact(
+    client: Minio,
+    event_id: str,
+    company_id: str,
+    ticker: str,
+    macro_impact_score: float,
+    impact_direction: str,
+    contributing_factors: list[str],
+    confidence: float,
+    computed_at: datetime,
+) -> str:
+    """Write one macro impact row as a single-row Parquet object in MinIO.
+
+    Object location:
+      s3://stonks-lakehouse/warehouse/macro_impacts/dt={date}/ticker={ticker}/part-{uuid}.parquet
+
+    ``contributing_factors`` is stored ", "-joined. Returns the s3:// URI.
+
+    Requirements: 7.3, 12.6
+    Design ref: Analytical Lake Datasets (lake.macro_impacts)
+    """
+    ticker_partition = {"ticker": ticker}
+    record: dict[str, object] = dict(
+        event_id=event_id,
+        company_id=company_id,
+        ticker=ticker,
+        macro_impact_score=macro_impact_score,
+        impact_direction=impact_direction,
+        contributing_factors=", ".join(contributing_factors),
+        confidence=confidence,
+        computed_at=computed_at,
+    )
+    # update() keeps the override-on-clash behaviour of a trailing ** unpacking.
+    record.update(partition_values(computed_at, ticker_partition))
+
+    blob = _write_parquet_bytes(pa.Table.from_pylist([record], schema=MACRO_IMPACTS_SCHEMA))
+    object_path = _partition_path("macro_impacts", computed_at, extra_partitions=ticker_partition)
+    _put_lakehouse_object(client, "macro_impacts", object_path, blob)
+
+    uri = s3_uri(object_path)
+    logger.info("Published macro_impact fact for %s/%s: %s", ticker, event_id, uri)
+    return uri
+
+
+# --- trend_projections fact table ---
+
+TREND_PROJECTIONS_SCHEMA = pa.schema([
+    pa.field("trend_window_id", pa.string()),
+    pa.field("ticker", pa.string()),
+    pa.field("projected_direction", pa.string()),
+    pa.field("projected_strength", pa.float64()),
+    pa.field("projected_confidence", pa.float64()),
+    pa.field("projection_horizon", pa.string()),
+    pa.field("driving_factors", pa.string()),  # list joined with ", " by publish_trend_projection_fact
+    pa.field("macro_contribution_pct", pa.float64()),
+    pa.field("diverges_from_current", pa.bool_()),
+    pa.field("computed_at", pa.timestamp("us", tz="UTC")),
+    pa.field("dt", pa.date32()),
+])
+
+
+def publish_trend_projection_fact(
+    client: Minio,
+    trend_window_id: str,
+    ticker: str,
+    projected_direction: str,
+    projected_strength: float,
+    projected_confidence: float,
+    projection_horizon: str,
+    driving_factors: list[str],
+    macro_contribution_pct: float,
+    diverges_from_current: bool,
+    computed_at: datetime,
+) -> str:
+    """Write one trend projection as a single-row Parquet object in MinIO.
+
+    Object location:
+      s3://stonks-lakehouse/warehouse/trend_projections/dt={date}/ticker={ticker}/part-{uuid}.parquet
+
+    ``driving_factors`` is stored ", "-joined. Returns the s3:// URI.
+
+    Requirements: 7.3, 12.6
+    Design ref: Analytical Lake Datasets (lake.trend_projections)
+    """
+    ticker_partition = {"ticker": ticker}
+    record: dict[str, object] = dict(
+        trend_window_id=trend_window_id,
+        ticker=ticker,
+        projected_direction=projected_direction,
+        projected_strength=projected_strength,
+        projected_confidence=projected_confidence,
+        projection_horizon=projection_horizon,
+        driving_factors=", ".join(driving_factors),
+        macro_contribution_pct=macro_contribution_pct,
+        diverges_from_current=diverges_from_current,
+        computed_at=computed_at,
+    )
+    # update() keeps the override-on-clash behaviour of a trailing ** unpacking.
+    record.update(partition_values(computed_at, ticker_partition))
+
+    blob = _write_parquet_bytes(pa.Table.from_pylist([record], schema=TREND_PROJECTIONS_SCHEMA))
+    object_path = _partition_path("trend_projections", computed_at, extra_partitions=ticker_partition)
+    _put_lakehouse_object(client, "trend_projections", object_path, blob)
+
+    uri = s3_uri(object_path)
+    logger.info("Published trend_projection fact for %s: %s", ticker, uri)
+    return uri
+
+
+# --- Batch publishers for macro fact tables ---
+
+def publish_global_events_batch(
+    client: Minio, rows: list[dict[str, object]], dt: datetime
+) -> str:
+    """Write ``rows`` to the global_events dataset as one Parquet file."""
+    dataset = "global_events"
+    schema = GLOBAL_EVENTS_SCHEMA
+    return _publish_batch(client, dataset, rows, schema, dt)
+
+
+def publish_macro_impacts_batch(
+    client: Minio, rows: list[dict[str, object]], dt: datetime, ticker: str = ""
+) -> str:
+    """Write ``rows`` to the macro_impacts dataset as one Parquet file.
+
+    A non-empty ``ticker`` is passed on as ``{"ticker": ticker}``; otherwise ``None`` is.
+    """
+    partition = None if not ticker else {"ticker": ticker}
+    return _publish_batch(client, "macro_impacts", rows, MACRO_IMPACTS_SCHEMA, dt, partition)
+
+
+def publish_trend_projections_batch(
+    client: Minio, rows: list[dict[str, object]], dt: datetime, ticker: str = ""
+) -> str:
+    """Write ``rows`` to the trend_projections dataset as one Parquet file.
+
+    A non-empty ``ticker`` is passed on as ``{"ticker": ticker}``; otherwise ``None`` is.
+    """
+    partition = None if not ticker else {"ticker": ticker}
+    return _publish_batch(client, "trend_projections", rows, TREND_PROJECTIONS_SCHEMA, dt, partition)
+
+
+# --- competitor_relationships fact table ---
+
+COMPETITOR_RELATIONSHIPS_SCHEMA = pa.schema([
+    pa.field("id", pa.string()),  # publish_competitor_relationship_fact fills this from relationship_id
+    pa.field("company_a_id", pa.string()),
+    pa.field("company_b_id", pa.string()),
+    pa.field("relationship_type", pa.string()),
+    pa.field("strength", pa.float64()),
+    pa.field("bidirectional", pa.bool_()),
+    pa.field("source", pa.string()),
+    pa.field("active", pa.bool_()),
+    pa.field("created_at", pa.timestamp("us", tz="UTC")),
+    pa.field("dt", pa.date32()),
+])
+
+
+def publish_competitor_relationship_fact(
+    client: Minio,
+    relationship_id: str,
+    company_a_id: str,
+    company_b_id: str,
+    relationship_type: str,
+    strength: float,
+    bidirectional: bool,
+    source: str,
+    active: bool,
+    created_at: datetime,
+) -> str:
+    """Write one competitor relationship as a single-row Parquet object in MinIO.
+
+    Object location:
+      s3://stonks-lakehouse/warehouse/competitor_relationships/dt={date}/part-{uuid}.parquet
+
+    ``relationship_id`` is stored in the ``id`` column. Returns the s3:// URI.
+
+    Requirements: 7.3
+    Design ref: Analytical Lake Datasets (lake.competitor_relationships)
+    """
+    record: dict[str, object] = dict(
+        id=relationship_id,
+        company_a_id=company_a_id,
+        company_b_id=company_b_id,
+        relationship_type=relationship_type,
+        strength=strength,
+        bidirectional=bidirectional,
+        source=source,
+        active=active,
+        created_at=created_at,
+    )
+    record.update(partition_values(created_at))  # overrides on key clash, as ** unpacking did
+    blob = _write_parquet_bytes(
+        pa.Table.from_pylist([record], schema=COMPETITOR_RELATIONSHIPS_SCHEMA)
+    )
+    object_path = _partition_path("competitor_relationships", created_at)
+    _put_lakehouse_object(client, "competitor_relationships", object_path, blob)
+
+    uri = s3_uri(object_path)
+    logger.info("Published competitor_relationship fact %s: %s", relationship_id, uri)
+    return uri
+
+
+def publish_competitor_relationships_batch(
+    client: Minio, rows: list[dict[str, object]], dt: datetime
+) -> str:
+    """Write ``rows`` to the competitor_relationships dataset as one Parquet file."""
+    dataset = "competitor_relationships"
+    schema = COMPETITOR_RELATIONSHIPS_SCHEMA
+    return _publish_batch(client, dataset, rows, schema, dt)
+
+
+# --- competitive_signals fact table ---
+
+COMPETITIVE_SIGNALS_SCHEMA = pa.schema([
+    pa.field("id", pa.string()),  # publish_competitive_signal_fact fills this from signal_id
+    pa.field("source_document_id", pa.string()),
+    pa.field("source_ticker", pa.string()),
+    pa.field("target_ticker", pa.string()),
+    pa.field("catalyst_type", pa.string()),
+    pa.field("pattern_confidence", pa.float64()),
+    pa.field("signal_direction", pa.string()),
+    pa.field("signal_strength", pa.float64()),
+    pa.field("relationship_strength", pa.float64()),
+    pa.field("computed_at", pa.timestamp("us", tz="UTC")),
+    pa.field("dt", pa.date32()),
+])
+
+
+def publish_competitive_signal_fact(
+    client: Minio,
+    signal_id: str,
+    source_document_id: str,
+    source_ticker: str,
+    target_ticker: str,
+    catalyst_type: str,
+    pattern_confidence: float,
+    signal_direction: str,
+    signal_strength: float,
+    relationship_strength: float,
+    computed_at: datetime,
+) -> str:
+    """Write one competitive signal as a single-row Parquet object in MinIO.
+
+    Object location:
+      s3://stonks-lakehouse/warehouse/competitive_signals/dt={date}/target_ticker={ticker}/part-{uuid}.parquet
+
+    ``signal_id`` is stored in the ``id`` column. Returns the s3:// URI.
+
+    Requirements: 7.4
+    Design ref: Analytical Lake Datasets (lake.competitive_signals)
+    """
+    target_partition = {"target_ticker": target_ticker}
+    record: dict[str, object] = dict(
+        id=signal_id,
+        source_document_id=source_document_id,
+        source_ticker=source_ticker,
+        target_ticker=target_ticker,
+        catalyst_type=catalyst_type,
+        pattern_confidence=pattern_confidence,
+        signal_direction=signal_direction,
+        signal_strength=signal_strength,
+        relationship_strength=relationship_strength,
+        computed_at=computed_at,
+    )
+    # update() keeps the override-on-clash behaviour of a trailing ** unpacking.
+    record.update(partition_values(computed_at, target_partition))
+
+    blob = _write_parquet_bytes(pa.Table.from_pylist([record], schema=COMPETITIVE_SIGNALS_SCHEMA))
+    object_path = _partition_path("competitive_signals", computed_at, extra_partitions=target_partition)
+    _put_lakehouse_object(client, "competitive_signals", object_path, blob)
+
+    uri = s3_uri(object_path)
+    logger.info("Published competitive_signal fact for %s→%s: %s", source_ticker, target_ticker, uri)
+    return uri
+
+
+def publish_competitive_signals_batch(
+    client: Minio, rows: list[dict[str, object]], dt: datetime, target_ticker: str = ""
+) -> str:
+    """Write ``rows`` to the competitive_signals dataset as one Parquet file.
+
+    A non-empty ``target_ticker`` is passed on as ``{"target_ticker": target_ticker}``; otherwise ``None`` is.
+    """
+    partition = None if not target_ticker else {"target_ticker": target_ticker}
+    return _publish_batch(client, "competitive_signals", rows, COMPETITIVE_SIGNALS_SCHEMA, dt, partition)