"""Exposure profile auto-inference from filing extractions. Infers baseline exposure profiles from company filing extractions when no manual profile exists. Scans recent filing extractions for geographic revenue breakdowns, supplier mentions, and commodity references. Requirements: 9.1, 9.2, 9.3 """ from __future__ import annotations import logging import re from collections import defaultdict from services.aggregation.interpolation import build_default_profile from services.shared.schemas import ( DocumentIntelligence, ExposureProfileSchema, MarketPositionTier, ) logger = logging.getLogger("exposure_inference") # --------------------------------------------------------------------------- # Known region patterns for geographic extraction # --------------------------------------------------------------------------- _REGION_KEYWORDS: dict[str, str] = { "united states": "US", "u.s.": "US", "us": "US", "america": "US", "north america": "US", "china": "CN", "chinese": "CN", "europe": "EU", "european": "EU", "eu": "EU", "japan": "JP", "japanese": "JP", "germany": "DE", "german": "DE", "united kingdom": "GB", "uk": "GB", "britain": "GB", "british": "GB", "south korea": "KR", "korea": "KR", "india": "IN", "indian": "IN", "brazil": "BR", "brazilian": "BR", "australia": "AU", "australian": "AU", "canada": "CA", "canadian": "CA", "taiwan": "TW", "saudi arabia": "SA", "russia": "RU", "russian": "RU", "mexico": "MX", "singapore": "SG", "asia": "CN", "asia pacific": "CN", "latin america": "BR", "middle east": "SA", } # --------------------------------------------------------------------------- # Known commodity patterns # --------------------------------------------------------------------------- _COMMODITY_KEYWORDS: dict[str, str] = { "crude oil": "crude_oil", "oil": "crude_oil", "petroleum": "crude_oil", "natural gas": "natural_gas", "gas": "natural_gas", "copper": "copper", "steel": "steel", "lithium": "lithium", "semiconductor": "semiconductors", "semiconductors": "semiconductors", "chip": "semiconductors", "chips": "semiconductors", "wheat": "wheat", "corn": "corn", "gold": "gold", "aluminum": "aluminum", "aluminium": "aluminum", "nickel": "nickel", "cobalt": "cobalt", "rare earth": "rare_earth", } # Minimum number of filing documents to consider inference meaningful _MIN_FILINGS_FOR_INFERENCE = 1 # Minimum total mentions to consider a region significant _MIN_REGION_MENTIONS = 1 # Minimum total mentions to consider a commodity significant _MIN_COMMODITY_MENTIONS = 1 # --------------------------------------------------------------------------- # Text scanning helpers # --------------------------------------------------------------------------- def _extract_regions_from_text(text: str) -> dict[str, int]: """Extract region mentions from text, returning region_code -> count.""" text_lower = text.lower() region_counts: dict[str, int] = defaultdict(int) for keyword, code in _REGION_KEYWORDS.items(): # Use word boundary matching for short keywords if len(keyword) <= 3: pattern = rf"\b{re.escape(keyword)}\b" matches = re.findall(pattern, text_lower) else: matches = re.findall(re.escape(keyword), text_lower) if matches: region_counts[code] += len(matches) return dict(region_counts) def _extract_commodities_from_text(text: str) -> dict[str, int]: """Extract commodity mentions from text, returning commodity_id -> count.""" text_lower = text.lower() commodity_counts: dict[str, int] = defaultdict(int) for keyword, commodity_id in _COMMODITY_KEYWORDS.items(): if len(keyword) <= 4: pattern = rf"\b{re.escape(keyword)}\b" matches = re.findall(pattern, text_lower) else: matches = re.findall(re.escape(keyword), text_lower) if matches: commodity_counts[commodity_id] += len(matches) return dict(commodity_counts) def _extract_supply_chain_regions(text: str) -> set[str]: """Extract supply chain region mentions from text.""" supply_keywords = [ "supplier", "supply chain", "sourcing", "manufacturing", "factory", "plant", "warehouse", "distribution", "import", "export", "procurement", ] text_lower = text.lower() regions: set[str] = set() for keyword in supply_keywords: if keyword in text_lower: # Find regions mentioned near supply chain keywords # Look within a window around each occurrence for match in re.finditer(re.escape(keyword), text_lower): start = max(0, match.start() - 200) end = min(len(text_lower), match.end() + 200) window = text_lower[start:end] window_regions = _extract_regions_from_text(window) regions.update(window_regions.keys()) return regions # --------------------------------------------------------------------------- # Revenue mix estimation # --------------------------------------------------------------------------- def _estimate_revenue_mix(region_counts: dict[str, int]) -> dict[str, float]: """Estimate geographic revenue mix from region mention counts. Uses mention frequency as a proxy for revenue distribution. Normalizes to sum to 1.0. """ if not region_counts: return {} total = sum(region_counts.values()) if total == 0: return {} mix = { region: round(count / total, 4) for region, count in region_counts.items() if count >= _MIN_REGION_MENTIONS } # Re-normalize after filtering mix_total = sum(mix.values()) if mix_total > 0 and abs(mix_total - 1.0) > 0.001: mix = {r: round(v / mix_total, 4) for r, v in mix.items()} return mix # --------------------------------------------------------------------------- # Confidence scoring # --------------------------------------------------------------------------- def _compute_inference_confidence( num_filings: int, num_regions: int, num_commodities: int, total_mentions: int, ) -> float: """Compute confidence score for the inferred profile. Higher confidence when more filings are available and more geographic/commodity data points are found. """ # Base confidence from number of filings (more filings = more reliable) filing_factor = min(num_filings / 5.0, 1.0) # saturates at 5 filings # Data richness factor data_points = num_regions + num_commodities richness_factor = min(data_points / 8.0, 1.0) # saturates at 8 data points # Mention volume factor volume_factor = min(total_mentions / 20.0, 1.0) # saturates at 20 mentions confidence = 0.4 * filing_factor + 0.35 * richness_factor + 0.25 * volume_factor return round(max(0.0, min(1.0, confidence)), 4) # --------------------------------------------------------------------------- # Main inference function # --------------------------------------------------------------------------- def infer_exposure_profile( document_intelligences: list[DocumentIntelligence], sector: str, industry: str, market_cap_bucket: str, ) -> ExposureProfileSchema: """Infer a baseline exposure profile from filing extractions. Scans recent filing extractions for geographic revenue breakdowns, supplier mentions, and commodity references. Produces an ExposureProfile with source='inferred' and a confidence score reflecting data quality. Falls back to sector-based default profile when insufficient filing data is available. Args: document_intelligences: List of DocumentIntelligence from recent filings. sector: Company's GICS sector name. industry: Company's industry name. market_cap_bucket: One of large_cap, mid_cap, small_cap, micro_cap. Returns: An ExposureProfileSchema with source='inferred'. Requirements: 9.1, 9.2, 9.3 """ # Filter to filing-type documents filings = [ di for di in document_intelligences if di.document_type.value in ("filing", "transcript") ] if len(filings) < _MIN_FILINGS_FOR_INFERENCE: logger.info( "Insufficient filing data (%d filings) for inference, " "falling back to sector-based default profile", len(filings), ) return build_default_profile(sector, industry, market_cap_bucket) # Aggregate region and commodity mentions across all filings all_region_counts: dict[str, int] = defaultdict(int) all_commodity_counts: dict[str, int] = defaultdict(int) all_supply_regions: set[str] = set() for filing in filings: # Scan summary text if filing.summary: regions = _extract_regions_from_text(filing.summary) for r, c in regions.items(): all_region_counts[r] += c commodities = _extract_commodities_from_text(filing.summary) for com, c in commodities.items(): all_commodity_counts[com] += c supply_regions = _extract_supply_chain_regions(filing.summary) all_supply_regions.update(supply_regions) # Scan company impacts for geographic and commodity mentions for company in filing.companies: # Key facts and evidence spans contain geographic details for text in company.key_facts + company.evidence_spans: regions = _extract_regions_from_text(text) for r, c in regions.items(): all_region_counts[r] += c commodities = _extract_commodities_from_text(text) for com, c in commodities.items(): all_commodity_counts[com] += c supply_regions = _extract_supply_chain_regions(text) all_supply_regions.update(supply_regions) # Scan macro themes for commodity/region hints for theme in filing.macro_themes: regions = _extract_regions_from_text(theme) for r, c in regions.items(): all_region_counts[r] += c commodities = _extract_commodities_from_text(theme) for com, c in commodities.items(): all_commodity_counts[com] += c # Check if we have enough data to infer total_mentions = sum(all_region_counts.values()) + sum(all_commodity_counts.values()) has_regions = len(all_region_counts) > 0 has_commodities = len(all_commodity_counts) > 0 if not has_regions and not has_commodities: logger.info( "No geographic or commodity data found in %d filings, " "falling back to sector-based default profile", len(filings), ) return build_default_profile(sector, industry, market_cap_bucket) # Build the inferred profile geographic_revenue_mix = _estimate_revenue_mix(dict(all_region_counts)) # Filter commodities by minimum mentions key_commodities = [ com for com, count in all_commodity_counts.items() if count >= _MIN_COMMODITY_MENTIONS ] # Supply chain regions: combine extracted supply regions with geo regions supply_chain_regions = list(all_supply_regions | set(geographic_revenue_mix.keys())) # Market position tier from market cap bucket from services.aggregation.interpolation import _CAP_TO_TIER tier_value = _CAP_TO_TIER.get(market_cap_bucket, MarketPositionTier.REGIONAL.value) # Regulatory jurisdictions: top regions by revenue sorted_regions = sorted( geographic_revenue_mix.items(), key=lambda x: x[1], reverse=True, ) regulatory_jurisdictions = [r for r, _ in sorted_regions[:3]] # Export dependency: fraction of revenue outside the top region if geographic_revenue_mix: top_region_pct = max(geographic_revenue_mix.values()) export_pct = round(1.0 - top_region_pct, 4) else: export_pct = 0.0 # Confidence score confidence = _compute_inference_confidence( num_filings=len(filings), num_regions=len(all_region_counts), num_commodities=len(all_commodity_counts), total_mentions=total_mentions, ) profile = ExposureProfileSchema( company_id="", geographic_revenue_mix=geographic_revenue_mix, supply_chain_regions=supply_chain_regions, key_input_commodities=key_commodities, regulatory_jurisdictions=regulatory_jurisdictions, market_position_tier=MarketPositionTier(tier_value), export_dependency_pct=max(0.0, min(1.0, export_pct)), source="inferred", confidence=confidence, version=1, ) logger.info( "Inferred exposure profile: regions=%d, commodities=%d, " "supply_chain=%d, confidence=%.3f", len(geographic_revenue_mix), len(key_commodities), len(supply_chain_regions), confidence, ) return profile