"""Attribution Engine — per-source, per-catalyst, and per-layer performance. Joins signal evidence links with prediction outcomes to compute attribution metrics that identify which sources, catalyst types, and signal layers contribute most to accurate predictions. Requirements: 7.1, 7.2, 7.3, 7.4, 7.5, 7.6, 7.7 """ from __future__ import annotations import logging import math from dataclasses import dataclass from datetime import datetime, timedelta import asyncpg logger = logging.getLogger(__name__) # --------------------------------------------------------------------------- # Dataclasses # --------------------------------------------------------------------------- @dataclass class SourceAttribution: """Performance metrics for a single source.""" source: str source_type: str prediction_count: int avg_weight: float avg_contribution_score: float win_rate: float avg_future_return: float avg_excess_return_vs_spy: float information_coefficient: float | None duplicate_rate: float @dataclass class CatalystAttribution: """Performance metrics for a single catalyst type.""" catalyst_type: str prediction_count: int win_rate: float avg_future_return: float avg_excess_return_vs_spy: float information_coefficient: float | None @dataclass class LayerAttribution: """Performance metrics for a signal layer.""" layer: str # company, macro, competitive avg_contribution_pct: float dominant_win_rate: float # win rate when this layer > 30% contribution dominant_ic: float | None # IC when this layer > 30% contribution # --------------------------------------------------------------------------- # Pure computation helpers # --------------------------------------------------------------------------- def _pearson_correlation(xs: list[float], ys: list[float]) -> float | None: """Compute Pearson correlation coefficient between two lists. Returns None if the lists have fewer than 2 elements or if either has zero variance. Guards against NaN/infinity. """ n = len(xs) if n < 2: return None mean_x = sum(xs) / n mean_y = sum(ys) / n cov = 0.0 var_x = 0.0 var_y = 0.0 for x, y in zip(xs, ys): dx = x - mean_x dy = y - mean_y cov += dx * dy var_x += dx * dx var_y += dy * dy if var_x == 0.0 or var_y == 0.0: return None r = cov / math.sqrt(var_x * var_y) if math.isnan(r) or math.isinf(r): return None return max(-1.0, min(1.0, r)) def _compute_ic( contribution_scores: list[float], future_returns: list[float], ) -> float | None: """Compute IC (Pearson correlation) between contribution scores and returns. Returns None when fewer than 30 data points. """ if len(contribution_scores) < 30 or len(future_returns) < 30: return None n = min(len(contribution_scores), len(future_returns)) return _pearson_correlation(contribution_scores[:n], future_returns[:n]) # --------------------------------------------------------------------------- # SQL queries — source attribution via v_source_performance # --------------------------------------------------------------------------- _SOURCE_ATTRIBUTION_SQL = """ SELECT source, source_type, weight, contribution_score, is_duplicate, direction_correct, future_return, excess_return_vs_spy FROM v_source_performance WHERE horizon = $1 AND generated_at >= $2 """ _SOURCE_ATTRIBUTION_ALL_SQL = """ SELECT source, source_type, weight, contribution_score, is_duplicate, direction_correct, future_return, excess_return_vs_spy FROM v_source_performance WHERE horizon = $1 """ # --------------------------------------------------------------------------- # SQL queries — catalyst attribution via v_source_performance # --------------------------------------------------------------------------- _CATALYST_ATTRIBUTION_SQL = """ SELECT catalyst_type, weight, contribution_score, direction_correct, future_return, excess_return_vs_spy FROM v_source_performance WHERE horizon = $1 AND generated_at >= $2 """ _CATALYST_ATTRIBUTION_ALL_SQL = """ SELECT catalyst_type, weight, contribution_score, direction_correct, future_return, excess_return_vs_spy FROM v_source_performance WHERE horizon = $1 """ # --------------------------------------------------------------------------- # SQL queries — layer attribution via prediction_snapshots + outcomes # --------------------------------------------------------------------------- _LAYER_ATTRIBUTION_SQL = """ SELECT ps.score_company, ps.score_macro, ps.score_competitive, po.direction_correct, po.future_return FROM prediction_snapshots ps JOIN prediction_outcomes po ON po.prediction_id = ps.id WHERE po.horizon = $1 AND ps.generated_at >= $2 """ _LAYER_ATTRIBUTION_ALL_SQL = """ SELECT ps.score_company, ps.score_macro, ps.score_competitive, po.direction_correct, po.future_return FROM prediction_snapshots ps JOIN prediction_outcomes po ON po.prediction_id = ps.id WHERE po.horizon = $1 """ # --------------------------------------------------------------------------- # Source attribution (Requirements 7.1, 7.2, 7.7) # --------------------------------------------------------------------------- async def compute_source_attribution( pool: asyncpg.Pool, lookback_days: int = 30, horizon: str = "7d", ) -> list[SourceAttribution]: """Compute per-source performance metrics. Queries v_source_performance, groups by source, and computes: prediction count, avg weight, avg contribution score, win rate, avg future return, avg excess return vs SPY, IC, and duplicate rate. Returns a list of SourceAttribution sorted by prediction count descending. """ now = datetime.now().astimezone() cutoff = now - timedelta(days=lookback_days) try: rows = await pool.fetch(_SOURCE_ATTRIBUTION_SQL, horizon, cutoff) except Exception: logger.exception( "Failed to query source attribution for horizon=%s lookback=%dd", horizon, lookback_days, ) return [] if not rows: return [] # Group rows by source source_groups: dict[str, list[dict]] = {} for row in rows: r = dict(row) key = r.get("source") or "unknown" source_groups.setdefault(key, []).append(r) results: list[SourceAttribution] = [] for source, group in source_groups.items(): count = len(group) # Source type — take the most common one source_type = group[0].get("source_type") or "unknown" # Avg weight weights = [r["weight"] for r in group if r.get("weight") is not None] avg_weight = sum(weights) / len(weights) if weights else 0.0 # Avg contribution score contrib_scores = [ r["contribution_score"] for r in group if r.get("contribution_score") is not None ] avg_contribution_score = ( sum(contrib_scores) / len(contrib_scores) if contrib_scores else 0.0 ) # Win rate direction_rows = [r for r in group if r.get("direction_correct") is not None] win_count = sum(1 for r in direction_rows if r["direction_correct"] is True) win_rate = win_count / len(direction_rows) if direction_rows else 0.0 # Avg future return returns = [ r["future_return"] for r in group if r.get("future_return") is not None ] avg_future_return = sum(returns) / len(returns) if returns else 0.0 # Avg excess return vs SPY excess_returns = [ r["excess_return_vs_spy"] for r in group if r.get("excess_return_vs_spy") is not None ] avg_excess_return_vs_spy = ( sum(excess_returns) / len(excess_returns) if excess_returns else 0.0 ) # IC: correlation between contribution scores and future returns ic_scores = [ r["contribution_score"] for r in group if r.get("contribution_score") is not None and r.get("future_return") is not None ] ic_returns = [ r["future_return"] for r in group if r.get("contribution_score") is not None and r.get("future_return") is not None ] ic = _compute_ic(ic_scores, ic_returns) # Duplicate rate: is_duplicate=true / total dup_count = sum(1 for r in group if r.get("is_duplicate") is True) duplicate_rate = dup_count / count results.append( SourceAttribution( source=source, source_type=source_type, prediction_count=count, avg_weight=avg_weight, avg_contribution_score=avg_contribution_score, win_rate=win_rate, avg_future_return=avg_future_return, avg_excess_return_vs_spy=avg_excess_return_vs_spy, information_coefficient=ic, duplicate_rate=duplicate_rate, ) ) # Sort by prediction count descending results.sort(key=lambda a: a.prediction_count, reverse=True) logger.info( "Computed source attribution for %d sources (horizon=%s, lookback=%dd)", len(results), horizon, lookback_days, ) return results # --------------------------------------------------------------------------- # Catalyst attribution (Requirements 7.3, 7.4) # --------------------------------------------------------------------------- async def compute_catalyst_attribution( pool: asyncpg.Pool, lookback_days: int = 30, horizon: str = "7d", ) -> list[CatalystAttribution]: """Compute per-catalyst-type performance metrics. Queries v_source_performance, groups by catalyst_type, and computes: prediction count, win rate, avg future return, avg excess return vs SPY, and IC. Returns a list of CatalystAttribution sorted by prediction count descending. """ now = datetime.now().astimezone() cutoff = now - timedelta(days=lookback_days) try: rows = await pool.fetch(_CATALYST_ATTRIBUTION_SQL, horizon, cutoff) except Exception: logger.exception( "Failed to query catalyst attribution for horizon=%s lookback=%dd", horizon, lookback_days, ) return [] if not rows: return [] # Group rows by catalyst_type catalyst_groups: dict[str, list[dict]] = {} for row in rows: r = dict(row) key = r.get("catalyst_type") or "unknown" catalyst_groups.setdefault(key, []).append(r) results: list[CatalystAttribution] = [] for catalyst_type, group in catalyst_groups.items(): count = len(group) # Win rate direction_rows = [r for r in group if r.get("direction_correct") is not None] win_count = sum(1 for r in direction_rows if r["direction_correct"] is True) win_rate = win_count / len(direction_rows) if direction_rows else 0.0 # Avg future return returns = [ r["future_return"] for r in group if r.get("future_return") is not None ] avg_future_return = sum(returns) / len(returns) if returns else 0.0 # Avg excess return vs SPY excess_returns = [ r["excess_return_vs_spy"] for r in group if r.get("excess_return_vs_spy") is not None ] avg_excess_return_vs_spy = ( sum(excess_returns) / len(excess_returns) if excess_returns else 0.0 ) # IC: correlation between contribution scores and future returns ic_scores = [ r["contribution_score"] for r in group if r.get("contribution_score") is not None and r.get("future_return") is not None ] ic_returns = [ r["future_return"] for r in group if r.get("contribution_score") is not None and r.get("future_return") is not None ] ic = _compute_ic(ic_scores, ic_returns) results.append( CatalystAttribution( catalyst_type=catalyst_type, prediction_count=count, win_rate=win_rate, avg_future_return=avg_future_return, avg_excess_return_vs_spy=avg_excess_return_vs_spy, information_coefficient=ic, ) ) # Sort by prediction count descending results.sort(key=lambda a: a.prediction_count, reverse=True) logger.info( "Computed catalyst attribution for %d catalyst types " "(horizon=%s, lookback=%dd)", len(results), horizon, lookback_days, ) return results # --------------------------------------------------------------------------- # Layer attribution (Requirements 7.5, 7.6) # --------------------------------------------------------------------------- async def compute_layer_attribution( pool: asyncpg.Pool, lookback_days: int = 30, horizon: str = "7d", ) -> list[LayerAttribution]: """Compute per-layer (company, macro, competitive) performance metrics. Queries prediction_snapshots joined with prediction_outcomes to get score_company, score_macro, score_competitive alongside outcomes. For each layer computes: - avg_contribution_pct: average of layer_score / total_score across all predictions (where total_score > 0) - dominant_win_rate: win rate for predictions where the layer contributes more than 30% of the total score - dominant_ic: IC (Pearson correlation between layer score and future return) for predictions where the layer contributes > 30% Returns a list of 3 LayerAttribution objects (company, macro, competitive). """ now = datetime.now().astimezone() cutoff = now - timedelta(days=lookback_days) try: rows = await pool.fetch(_LAYER_ATTRIBUTION_SQL, horizon, cutoff) except Exception: logger.exception( "Failed to query layer attribution for horizon=%s lookback=%dd", horizon, lookback_days, ) return [] if not rows: return [ LayerAttribution( layer="company", avg_contribution_pct=0.0, dominant_win_rate=0.0, dominant_ic=None, ), LayerAttribution( layer="macro", avg_contribution_pct=0.0, dominant_win_rate=0.0, dominant_ic=None, ), LayerAttribution( layer="competitive", avg_contribution_pct=0.0, dominant_win_rate=0.0, dominant_ic=None, ), ] row_dicts = [dict(r) for r in rows] layers = [ ("company", "score_company"), ("macro", "score_macro"), ("competitive", "score_competitive"), ] results: list[LayerAttribution] = [] for layer_name, score_field in layers: # --- Average contribution percentage --- contribution_pcts: list[float] = [] for r in row_dicts: total = ( (r.get("score_company") or 0.0) + (r.get("score_macro") or 0.0) + (r.get("score_competitive") or 0.0) ) if total > 0.0: layer_score = r.get(score_field) or 0.0 contribution_pcts.append(layer_score / total) avg_contribution_pct = ( sum(contribution_pcts) / len(contribution_pcts) if contribution_pcts else 0.0 ) # --- Dominant predictions: layer > 30% of total score --- dominant_rows: list[dict] = [] for r in row_dicts: total = ( (r.get("score_company") or 0.0) + (r.get("score_macro") or 0.0) + (r.get("score_competitive") or 0.0) ) if total > 0.0: layer_score = r.get(score_field) or 0.0 if layer_score / total > 0.30: dominant_rows.append(r) # Dominant win rate dominant_direction_rows = [ r for r in dominant_rows if r.get("direction_correct") is not None ] dominant_win_count = sum( 1 for r in dominant_direction_rows if r["direction_correct"] is True ) dominant_win_rate = ( dominant_win_count / len(dominant_direction_rows) if dominant_direction_rows else 0.0 ) # Dominant IC: correlation between layer score and future return dom_scores = [ r.get(score_field) or 0.0 for r in dominant_rows if r.get("future_return") is not None ] dom_returns = [ r["future_return"] for r in dominant_rows if r.get("future_return") is not None ] dominant_ic = _compute_ic(dom_scores, dom_returns) results.append( LayerAttribution( layer=layer_name, avg_contribution_pct=avg_contribution_pct, dominant_win_rate=dominant_win_rate, dominant_ic=dominant_ic, ) ) logger.info( "Computed layer attribution for 3 layers (horizon=%s, lookback=%dd)", horizon, lookback_days, ) return results