phase 14-15: docker build validation and helm deployment

2026-04-11 11:59:45 -07:00
parent 7394d241c9
commit ce10afa034
179 changed files with 32559 additions and 576 deletions
@@ -0,0 +1,149 @@
+"""Extraction prompt templates with anti-hallucination instructions.
+
+Builds structured prompts for Ollama document intelligence extraction.
+Each prompt includes the target JSON schema, anti-hallucination rules,
+and document-type-specific guidance.
+
+Requirements: 5.1, 5.2, 5.3, 5.4, 5.5
+"""
+from __future__ import annotations
+
+import json
+from typing import Any
+
+from services.extractor.schemas import generate_json_schema, SCHEMA_VERSION
+from services.shared.schemas import (
+    DocumentType,
+)
+
+PROMPT_VERSION = "document-intel-v1"  # reported as "prompt_version" by get_prompt_metadata()
+
+# --- JSON schema for structured output (generated from Pydantic models) ---
+# Built once at import time; embedded in each user prompt and returned by get_json_schema().
+EXTRACTION_JSON_SCHEMA: dict[str, Any] = generate_json_schema()
+
+# --- Anti-hallucination system prompt ---
+# Returned unchanged as the "system" entry of build_extraction_prompt() for every document.
+SYSTEM_PROMPT = """\
+You are a financial document analysis system. You extract structured intelligence \
+from financial documents into JSON.
+
+STRICT RULES — VIOLATIONS WILL INVALIDATE YOUR OUTPUT:
+
+1. ONLY extract information explicitly stated in the document text provided.
+2. NEVER fabricate facts, quotes, numbers, dates, or company names.
+3. NEVER infer information that is not directly supported by the text.
+4. If the document does not mention a company, do NOT include that company.
+5. If the document is ambiguous about sentiment or impact, use "neutral" or "mixed" \
+and set confidence lower.
+6. evidence_spans MUST be short verbatim quotes copied from the document. \
+Do NOT paraphrase or invent quotes.
+7. key_facts MUST be directly stated in the document. Do NOT add external knowledge.
+8. If you are uncertain about any field, lower the confidence score and add a warning \
+to extraction_warnings.
+9. If the document text is too short, garbled, or uninformative, return an empty \
+companies array, set confidence below 0.3, and add "insufficient_content" to warnings.
+10. Return ONLY valid JSON matching the provided schema. No commentary, no markdown fences."""
+
+# --- Document-type-specific guidance ---
+# Keyed by DocumentType member; _get_doctype_guidance() falls back to the ARTICLE entry.
+_DOCTYPE_GUIDANCE: dict[str, str] = {
+    DocumentType.ARTICLE: (
+        "This is a news article. Focus on reported facts, quoted sources, and stated "
+        "analyst opinions. Distinguish between the journalist's framing and actual "
+        "company developments. Do not treat speculative language as confirmed fact."
+    ),
+    DocumentType.FILING: (
+        "This is a regulatory filing (e.g. SEC 10-K, 10-Q, 8-K). Extract concrete "
+        "financial figures, risk factors, and material events as stated. Filings use "
+        "precise legal language — preserve that precision in your extraction."
+    ),
+    DocumentType.TRANSCRIPT: (
+        "This is an earnings call or event transcript. Distinguish between management "
+        "forward-looking statements and reported results. Flag forward-looking language "
+        "as lower confidence. Extract specific guidance numbers when stated."
+    ),
+    DocumentType.PRESS_RELEASE: (
+        "This is a company press release. Be aware that press releases are promotional. "
+        "Extract stated facts and figures but note that sentiment may be biased positive. "
+        "Look for concrete metrics rather than marketing language."
+    ),
+}
+
+
+def _get_doctype_guidance(document_type: str) -> str:
+    """Return document-type-specific extraction guidance."""
+    return _DOCTYPE_GUIDANCE.get(document_type, _DOCTYPE_GUIDANCE[DocumentType.ARTICLE])
+
+
+# --- Prompt builder ---
+
+def build_extraction_prompt(
+    document_text: str,
+    document_type: str = DocumentType.ARTICLE,
+    known_tickers: list[str] | None = None,
+    document_id: str = "",
+) -> dict[str, str]:
+    """Build system and user prompts for Ollama structured extraction.
+
+    Args:
+        document_text: Normalized text content of the document.
+        document_type: One of the DocumentType enum values.
+        known_tickers: Optional list of tickers the document may reference.
+            Helps the model focus but does NOT mean all tickers are relevant.
+        document_id: Optional document ID for traceability.
+
+    Returns:
+        Dict with 'system' and 'user' prompt strings.
+    """
+    doctype_guidance = _get_doctype_guidance(document_type)
+
+    ticker_hint = ""
+    if known_tickers:
+        tickers_str = ", ".join(known_tickers)
+        ticker_hint = (
+            f"\nThe following tickers may be referenced in this document: {tickers_str}\n"
+            "Only include a ticker in your output if the document actually discusses that company. "
+            "Do NOT include a ticker just because it appears in this hint."
+        )
+
+    schema_str = json.dumps(EXTRACTION_JSON_SCHEMA, indent=2)
+
+    doc_id_line = f"Document ID: {document_id}\n" if document_id else ""
+
+    user_prompt = f"""\
+Extract structured intelligence from the following document.
+
+{doc_id_line}Document type: {document_type}
+{doctype_guidance}
+{ticker_hint}
+Your output MUST be a single JSON object conforming to this schema:
+{schema_str}
+
+REMEMBER:
+- Only extract what is explicitly in the text below.
+- evidence_spans must be verbatim quotes from the text.
+- If the text is insufficient, return empty companies and low confidence.
+- Return ONLY the JSON object. No other text.
+
+--- DOCUMENT TEXT ---
+{document_text}
+--- END DOCUMENT TEXT ---"""
+
+    return {
+        "system": SYSTEM_PROMPT,
+        "user": user_prompt,
+    }
+
+
+def get_prompt_metadata() -> dict[str, str]:
+    """Return metadata about the current prompt version for audit trails."""
+    return {
+        "prompt_version": PROMPT_VERSION,
+        "schema_version": SCHEMA_VERSION,
+    }
+
+
+def get_json_schema() -> dict[str, Any]:
+    """Return the extraction JSON schema for Ollama structured output format parameter."""
+    return EXTRACTION_JSON_SCHEMA