feat: competitive intelligence & historical pattern matching layer

2026-04-14 19:42:48 +00:00
parent b478022ba3
commit f7a11d14ea
203 changed files with 20155 additions and 97 deletions
@@ -0,0 +1,209 @@
+"""Unit tests for exposure profile auto-inference.
+
+Requirements: 9.1, 9.2, 9.3
+"""
+from __future__ import annotations
+
+from services.extractor.exposure_inference import (
+    infer_exposure_profile,
+    _extract_regions_from_text,
+    _extract_commodities_from_text,
+    _estimate_revenue_mix,
+    _compute_inference_confidence,
+)
+from services.shared.schemas import (
+    DocumentIntelligence,
+    DocumentType,
+    CompanyImpact,
+    Sentiment,
+    CatalystType,
+    MarketPositionTier,
+)
+
+
+# ---------------------------------------------------------------------------
+# Helper builders
+# ---------------------------------------------------------------------------
+
+
+def _make_filing(
+    summary: str = "",
+    key_facts: list[str] | None = None,
+    macro_themes: list[str] | None = None,
+    doc_type: str = "filing",
+) -> DocumentIntelligence:
+    companies = []
+    if key_facts:
+        companies.append(CompanyImpact(
+            ticker="TEST",
+            company_name="Test Corp",
+            relevance=0.8,
+            sentiment=Sentiment.NEUTRAL,
+            impact_score=0.5,
+            impact_horizon="medium_term",
+            catalyst_type=CatalystType.EARNINGS,
+            key_facts=key_facts,
+        ))
+    return DocumentIntelligence(
+        document_type=DocumentType(doc_type),
+        summary=summary,
+        companies=companies,
+        macro_themes=macro_themes or [],
+        confidence=0.7,
+    )
+
+
+# ---------------------------------------------------------------------------
+# Region extraction
+# ---------------------------------------------------------------------------
+
+
+class TestExtractRegions:
+    def test_extracts_country_names(self):
+        regions = _extract_regions_from_text("Revenue from China and Japan grew 15%")
+        assert "CN" in regions
+        assert "JP" in regions
+
+    def test_extracts_region_codes(self):
+        regions = _extract_regions_from_text("US operations expanded into EU markets")
+        assert "US" in regions
+        assert "EU" in regions
+
+    def test_empty_text(self):
+        assert _extract_regions_from_text("") == {}
+
+    def test_no_regions(self):
+        assert _extract_regions_from_text("quarterly earnings increased") == {}
+
+
+# ---------------------------------------------------------------------------
+# Commodity extraction
+# ---------------------------------------------------------------------------
+
+
+class TestExtractCommodities:
+    def test_extracts_commodities(self):
+        commodities = _extract_commodities_from_text(
+            "Rising crude oil and copper prices impacted margins"
+        )
+        assert "crude_oil" in commodities
+        assert "copper" in commodities
+
+    def test_semiconductor_variants(self):
+        commodities = _extract_commodities_from_text("semiconductor shortage continues")
+        assert "semiconductors" in commodities
+
+    def test_empty_text(self):
+        assert _extract_commodities_from_text("") == {}
+
+
+# ---------------------------------------------------------------------------
+# Revenue mix estimation
+# ---------------------------------------------------------------------------
+
+
+class TestEstimateRevenueMix:
+    def test_normalizes_to_one(self):
+        mix = _estimate_revenue_mix({"US": 3, "CN": 1, "JP": 1})
+        total = sum(mix.values())
+        assert abs(total - 1.0) < 0.01
+
+    def test_empty_counts(self):
+        assert _estimate_revenue_mix({}) == {}
+
+    def test_single_region(self):
+        mix = _estimate_revenue_mix({"US": 5})
+        assert mix == {"US": 1.0}
+
+
+# ---------------------------------------------------------------------------
+# Confidence scoring
+# ---------------------------------------------------------------------------
+
+
+class TestComputeInferenceConfidence:
+    def test_high_data_high_confidence(self):
+        conf = _compute_inference_confidence(5, 5, 3, 25)
+        assert conf > 0.5
+
+    def test_low_data_low_confidence(self):
+        conf = _compute_inference_confidence(1, 1, 0, 2)
+        assert conf < 0.5
+
+    def test_bounds(self):
+        conf = _compute_inference_confidence(0, 0, 0, 0)
+        assert 0.0 <= conf <= 1.0
+        conf = _compute_inference_confidence(100, 100, 100, 1000)
+        assert 0.0 <= conf <= 1.0
+
+
+# ---------------------------------------------------------------------------
+# Full inference
+# ---------------------------------------------------------------------------
+
+
+class TestInferExposureProfile:
+    def test_infers_from_filings_with_geo_data(self):
+        filings = [
+            _make_filing(
+                summary="Revenue from United States was 60%, China 25%, and Japan 15%.",
+                key_facts=["US revenue grew 10%", "China operations expanded"],
+            ),
+        ]
+        profile = infer_exposure_profile(filings, "Information Technology", "Software", "large_cap")
+        assert profile.source == "inferred"
+        assert 0.0 <= profile.confidence <= 1.0
+        assert len(profile.geographic_revenue_mix) > 0
+        assert "US" in profile.geographic_revenue_mix
+
+    def test_infers_commodities(self):
+        filings = [
+            _make_filing(
+                summary="Crude oil and natural gas prices affected our cost structure.",
+            ),
+        ]
+        profile = infer_exposure_profile(filings, "Energy", "Oil & Gas", "mid_cap")
+        assert profile.source == "inferred"
+        assert "crude_oil" in profile.key_input_commodities
+
+    def test_fallback_when_no_filings(self):
+        profile = infer_exposure_profile([], "Energy", "Oil & Gas", "large_cap")
+        assert profile.source == "inferred"
+        assert len(profile.geographic_revenue_mix) > 0
+
+    def test_fallback_when_no_geo_or_commodity_data(self):
+        filings = [
+            _make_filing(summary="Quarterly earnings were strong."),
+        ]
+        profile = infer_exposure_profile(filings, "Financials", "Banking", "mid_cap")
+        # Should fall back to default since no geo/commodity data found
+        assert profile.source == "inferred"
+        assert len(profile.geographic_revenue_mix) > 0
+
+    def test_non_filing_documents_ignored(self):
+        docs = [
+            _make_filing(
+                summary="Revenue from China was 50%",
+                doc_type="article",
+            ),
+        ]
+        # Article type should be filtered out, falling back to default
+        profile = infer_exposure_profile(docs, "Energy", "Oil & Gas", "small_cap")
+        assert profile.source == "inferred"
+
+    def test_market_cap_tier_mapping(self):
+        filings = [
+            _make_filing(summary="US and European operations"),
+        ]
+        profile = infer_exposure_profile(filings, "Industrials", "Machinery", "large_cap")
+        tier = profile.market_position_tier
+        if isinstance(tier, MarketPositionTier):
+            tier = tier.value
+        assert tier == "global_leader"
+
+    def test_confidence_in_bounds(self):
+        filings = [
+            _make_filing(summary="Revenue from US, China, Japan, Germany, and India"),
+        ]
+        profile = infer_exposure_profile(filings, "Information Technology", "Software", "mid_cap")
+        assert 0.0 <= profile.confidence <= 1.0