phase 14-15: docker build validation and helm deployment

2026-04-11 11:59:45 -07:00
parent 7394d241c9
commit ce10afa034
179 changed files with 32559 additions and 576 deletions
@@ -0,0 +1,316 @@
+"""JSON schema definitions for document intelligence extraction.
+
+Generates Ollama-compatible JSON schemas from Pydantic models so the
+extraction contract stays in sync with the shared data models.  Also
+provides schema validation and semantic validation helpers.
+
+Requirements: 5.1, 5.2, 5.3, 5.4, 5.5
+"""
+from __future__ import annotations
+
+import json
+import re
+from typing import Any
+
+from pydantic import BaseModel, Field
+
+from services.shared.schemas import (
+    CatalystType,
+    Sentiment,
+)
+
+# Version tag of the extraction schema defined in this module; returned
+# unchanged by get_schema_version().
+SCHEMA_VERSION = "2.0.0"
+
+
+# ---------------------------------------------------------------------------
+# Pydantic model that mirrors the Ollama extraction output contract.
+# This is the *response* shape we ask the model to produce — it intentionally
+# omits server-side fields like document_id, source_credibility, and model
+# metadata that are attached after extraction.
+# ---------------------------------------------------------------------------
+
+
+class CompanyExtractionItem(BaseModel):
+    """Per-company extraction output expected from the model.
+
+    All fields are required (no defaults) so the generated JSON schema
+    forces the model to produce every field explicitly.
+    """
+
+    # NOTE(review): the class docstring and the Field descriptions below are
+    # presumably copied by Pydantic into the JSON schema built from
+    # ExtractionResult -- confirm before rewording any of them, since that
+    # would change what is sent to the model.
+
+    # An empty ticker is a semantic error, a ticker that does not match
+    # _TICKER_RE is a warning, and a ticker repeated across companies is an
+    # error (all raised in _semantic_checks, not here).
+    ticker: str = Field(description="Stock ticker symbol mentioned in the document.")
+    company_name: str = Field(description="Full company name as referenced in the document.")
+    # Bounded to [0, 1] by ge/le; _semantic_checks warns when it is below 0.2.
+    relevance: float = Field(
+        ge=0,
+        le=1,
+        description="How relevant the document is to this company. 0=tangential, 1=primary subject.",
+    )
+    # Sentiment is imported from services.shared.schemas.
+    sentiment: Sentiment = Field(description="Overall sentiment toward this company in the document.")
+    # Bounded to [0, 1] by ge/le; cross-checked against key_facts and
+    # sentiment in _semantic_checks.
+    impact_score: float = Field(
+        ge=0,
+        le=1,
+        description="Estimated magnitude of impact. 0=negligible, 1=highly material.",
+    )
+    # Declared as a plain str, so the allowed values are not enforced at this
+    # level; _semantic_checks records an error when the value is not in
+    # VALID_IMPACT_HORIZONS.
+    impact_horizon: str = Field(
+        description="One of: intraday, 1d, 1d_7d, 1d_30d, 30d_90d, 90d_plus",
+    )
+    # CatalystType is imported from services.shared.schemas.
+    catalyst_type: CatalystType = Field(description="Primary catalyst category.")
+    # May be empty; an empty list combined with impact_score > 0.5 produces a
+    # warning in _semantic_checks.
+    key_facts: list[str] = Field(
+        description="Facts explicitly stated in the document. Do NOT infer or fabricate.",
+    )
+    risks: list[str] = Field(
+        description="Risks explicitly mentioned in the document.",
+    )
+    # _semantic_checks warns when this list is empty, when a span is shorter
+    # than _MIN_EVIDENCE_LEN or longer than _MAX_EVIDENCE_LEN characters, and
+    # (if the document text is supplied) when a span does not occur in it.
+    evidence_spans: list[str] = Field(
+        description="Short verbatim quotes from the document supporting the analysis.",
+    )
+
+
+class ExtractionResult(BaseModel):
+    """Top-level structured output the model must return.
+
+    All fields are required (no defaults) so the generated JSON schema
+    forces the model to produce every field explicitly.
+    """
+
+    # This is the model whose schema generate_json_schema() exports and that
+    # validate_extraction() validates raw model output against.
+    # NOTE(review): the class docstring and Field descriptions are presumably
+    # emitted into that schema by Pydantic -- confirm before rewording them.
+
+    # An empty string passes structural validation; _semantic_checks then
+    # adds an "empty_summary" warning.
+    summary: str = Field(
+        description="A concise 1-3 sentence summary of the document's main point.",
+    )
+    # No minimum length is declared, so an empty list is structurally valid.
+    companies: list[CompanyExtractionItem] = Field(
+        description="Per-company intelligence extracted from the document.",
+    )
+    macro_themes: list[str] = Field(
+        description="Broad economic or market themes mentioned (e.g. rates, inflation, ai_capex).",
+    )
+    # Bounded to [0, 1] by ge/le.
+    novelty_score: float = Field(
+        ge=0,
+        le=1,
+        description="How novel or surprising the information is. 0=routine, 1=highly novel.",
+    )
+    # Bounded to [0, 1] by ge/le; a value below 0.3 together with at least one
+    # company produces a warning in _semantic_checks.
+    confidence: float = Field(
+        ge=0,
+        le=1,
+        description="Model confidence in the accuracy of this extraction. Lower if text is ambiguous.",
+    )
+    # Free-form tags reported by the model itself; not inspected by the
+    # validation helpers in this module.
+    extraction_warnings: list[str] = Field(
+        description="Any issues encountered: ambiguous_ticker, incomplete_text, low_confidence, etc.",
+    )
+
+
+# ---------------------------------------------------------------------------
+# Schema generation
+# ---------------------------------------------------------------------------
+
+
+def generate_json_schema() -> dict[str, Any]:
+    """Generate the JSON schema from the Pydantic model.
+
+    Returns a plain JSON Schema dict suitable for Ollama's ``format``
+    parameter.  Pydantic ``$defs`` are inlined so the schema is
+    self-contained.
+    """
+    raw = ExtractionResult.model_json_schema()
+    # Inline $defs so the schema is flat and Ollama-friendly
+    return _inline_defs(raw)
+
+
+def get_schema_version() -> str:
+    """Return the current schema version string."""
+    return SCHEMA_VERSION
+
+
+# ---------------------------------------------------------------------------
+# Validation helpers
+# ---------------------------------------------------------------------------
+
+
+class ValidationReport(BaseModel):
+    """Result of validating a raw model response."""
+
+    # True only when validate_extraction() recorded no errors; defaults to
+    # False so a report is never valid by accident.
+    valid: bool = False
+    # Either a single JSON / schema failure message, or the semantic error
+    # tags produced by _semantic_checks.
+    errors: list[str] = Field(default_factory=list)
+    # Informational semantic warning tags; these never affect ``valid``.
+    warnings: list[str] = Field(default_factory=list)
+    # The parsed extraction whenever structural validation succeeded -- also
+    # set when semantic errors make the report invalid.  None when JSON
+    # decoding or Pydantic validation failed.
+    parsed: ExtractionResult | None = None
+
+
+def validate_extraction(
+    raw_json: str | dict[str, Any],
+    *,
+    document_text: str = "",
+) -> ValidationReport:
+    """Validate raw model output against the extraction schema.
+
+    Performs structural (JSON / Pydantic) validation followed by semantic
+    checks that catch hallucination indicators, cross-field inconsistencies,
+    and data-quality issues.
+
+    Args:
+        raw_json: Either a JSON string or an already-parsed dict.
+        document_text: Optional original document text used for evidence
+            span verification.
+
+    Returns:
+        A ``ValidationReport`` with parsed result on success.
+    """
+    errors: list[str] = []
+    warnings: list[str] = []
+
+    # --- Parse JSON string if needed ---
+    if isinstance(raw_json, str):
+        try:
+            data = json.loads(raw_json)
+        except json.JSONDecodeError as exc:
+            return ValidationReport(valid=False, errors=[f"Invalid JSON: {exc}"])
+    else:
+        data = raw_json
+
+    if not isinstance(data, dict):
+        return ValidationReport(valid=False, errors=["Expected a JSON object at top level."])
+
+    # --- Pydantic structural validation ---
+    try:
+        result = ExtractionResult.model_validate(data)
+    except Exception as exc:  # noqa: BLE001
+        return ValidationReport(valid=False, errors=[f"Schema validation failed: {exc}"])
+
+    # --- Semantic checks ---
+    sem_errors, sem_warnings = _semantic_checks(result, document_text)
+    errors.extend(sem_errors)
+    warnings.extend(sem_warnings)
+
+    # Semantic errors make the report invalid — the caller should retry.
+    valid = len(errors) == 0
+    return ValidationReport(
+        valid=valid,
+        errors=errors,
+        warnings=warnings,
+        parsed=result,
+    )
+
+
+# ---------------------------------------------------------------------------
+# Known valid impact horizons
+# ---------------------------------------------------------------------------
+
+# The only values _semantic_checks accepts for
+# CompanyExtractionItem.impact_horizon; the same six names are listed in that
+# field's description, so keep the two in sync.
+VALID_IMPACT_HORIZONS = frozenset({
+    "intraday",
+    "1d",
+    "1d_7d",
+    "1d_30d",
+    "30d_90d",
+    "90d_plus",
+})
+
+# Ticker: 1-5 uppercase ASCII letters (covers NYSE, NASDAQ, etc.).
+# Symbols containing digits or punctuation (e.g. a class-share suffix such as
+# "BRK.B") do not match; _semantic_checks reports a mismatch as a warning,
+# not an error.
+_TICKER_RE = re.compile(r"^[A-Z]{1,5}$")
+
+# Evidence span length bounds (characters).  Spans outside this range only
+# produce warnings in _semantic_checks.
+_MIN_EVIDENCE_LEN = 8
+_MAX_EVIDENCE_LEN = 500
+
+
+# ---------------------------------------------------------------------------
+# Semantic validation rules
+# ---------------------------------------------------------------------------
+
+
+def _semantic_checks(
+    result: ExtractionResult,
+    document_text: str = "",
+) -> tuple[list[str], list[str]]:
+    """Run semantic checks on a parsed extraction.
+
+    Returns a tuple of (errors, warnings).  Errors are issues severe enough
+    to warrant a retry; warnings are informational.
+    """
+    errors: list[str] = []
+    warnings: list[str] = []
+
+    # --- Top-level checks ---
+    if not result.summary:
+        warnings.append("empty_summary")
+
+    if result.confidence < 0.3 and len(result.companies) > 0:
+        warnings.append("low_confidence_with_companies")
+
+    # Duplicate tickers across company entries
+    tickers_seen: list[str] = []
+    for comp in result.companies:
+        if comp.ticker in tickers_seen:
+            errors.append(f"duplicate_ticker_{comp.ticker}")
+        tickers_seen.append(comp.ticker)
+
+    # --- Per-company checks ---
+    for comp in result.companies:
+        tag = comp.ticker or "unknown"
+
+        # Ticker format
+        if not comp.ticker:
+            errors.append("company_missing_ticker")
+        elif not _TICKER_RE.match(comp.ticker):
+            warnings.append(f"invalid_ticker_format_{tag}")
+
+        # Impact horizon must be a known value
+        if comp.impact_horizon not in VALID_IMPACT_HORIZONS:
+            errors.append(f"invalid_impact_horizon_{comp.impact_horizon}_for_{tag}")
+
+        # Evidence spans
+        if not comp.evidence_spans:
+            warnings.append(f"no_evidence_spans_for_{tag}")
+        else:
+            for idx, span in enumerate(comp.evidence_spans):
+                if len(span) < _MIN_EVIDENCE_LEN:
+                    warnings.append(f"evidence_span_too_short_for_{tag}_{idx}")
+                if len(span) > _MAX_EVIDENCE_LEN:
+                    warnings.append(f"evidence_span_too_long_for_{tag}_{idx}")
+
+        # Cross-field: high impact but no facts
+        if not comp.key_facts and comp.impact_score > 0.5:
+            warnings.append(f"high_impact_no_facts_for_{tag}")
+
+        # Cross-field: very low relevance
+        if comp.relevance < 0.2:
+            warnings.append(f"very_low_relevance_for_{tag}")
+
+        # Cross-field: strong sentiment but low impact
+        if comp.sentiment in (Sentiment.POSITIVE, Sentiment.NEGATIVE) and comp.impact_score < 0.1:
+            warnings.append(f"strong_sentiment_low_impact_for_{tag}")
+
+    # --- Evidence grounding check (when source text is available) ---
+    if document_text:
+        doc_lower = document_text.lower()
+        for comp in result.companies:
+            for idx, span in enumerate(comp.evidence_spans):
+                if span.lower() not in doc_lower:
+                    warnings.append(
+                        f"evidence_span_not_found_in_document_for_{comp.ticker or 'unknown'}_{idx}"
+                    )
+
+    return errors, warnings
+
+
+# ---------------------------------------------------------------------------
+# Internal helpers
+# ---------------------------------------------------------------------------
+
+
+def _inline_defs(schema: dict[str, Any]) -> dict[str, Any]:
+    """Recursively inline ``$defs`` / ``$ref`` so the schema is self-contained."""
+    defs = schema.pop("$defs", {})
+    return _resolve_refs(schema, defs)
+
+
+def _resolve_refs(node: Any, defs: dict[str, Any]) -> Any:
+    """Walk the schema tree and replace ``$ref`` pointers with their definitions."""
+    if isinstance(node, dict):
+        if "$ref" in node:
+            ref_path = node["$ref"]  # e.g. "#/$defs/CompanyExtractionItem"
+            ref_name = ref_path.rsplit("/", 1)[-1]
+            if ref_name in defs:
+                resolved = defs[ref_name].copy()
+                # The resolved def may itself contain refs
+                return _resolve_refs(resolved, defs)
+            return node  # unresolvable ref, leave as-is
+        return {k: _resolve_refs(v, defs) for k, v in node.items()}
+    if isinstance(node, list):
+        return [_resolve_refs(item, defs) for item in node]
+    return node