"""Extraction prompt templates with anti-hallucination instructions. Builds structured prompts for Ollama document intelligence extraction. Each prompt includes the target JSON schema, anti-hallucination rules, and document-type-specific guidance. Requirements: 5.1, 5.2, 5.3, 5.4, 5.5 """ from __future__ import annotations from typing import Any from services.extractor.schemas import SCHEMA_VERSION, generate_json_schema from services.shared.schemas import ( DocumentType, ) PROMPT_VERSION = "document-intel-v2" # --- JSON schema for structured output (generated from Pydantic models) --- EXTRACTION_JSON_SCHEMA: dict[str, Any] = generate_json_schema() # --- Anti-hallucination system prompt --- SYSTEM_PROMPT = """\ You are a financial document analyst. Extract structured data as JSON. \ Return ONLY a single JSON object. No markdown fences, no explanation, no text before or after the JSON. \ Every field in the schema is required. Use "other" for catalyst_type if unsure. \ Keep evidence_spans short (under 20 words each). Keep key_facts to 3-5 items max.""" # --- Document-type-specific guidance --- _DOCTYPE_GUIDANCE: dict[str, str] = { DocumentType.ARTICLE: ( "This is a news article. Focus on reported facts, quoted sources, and stated " "analyst opinions. Distinguish between the journalist's framing and actual " "company developments. Do not treat speculative language as confirmed fact." ), DocumentType.FILING: ( "This is a regulatory filing (e.g. SEC 10-K, 10-Q, 8-K). Extract concrete " "financial figures, risk factors, and material events as stated. Filings use " "precise legal language — preserve that precision in your extraction." ), DocumentType.TRANSCRIPT: ( "This is an earnings call or event transcript. Distinguish between management " "forward-looking statements and reported results. Flag forward-looking language " "as lower confidence. Extract specific guidance numbers when stated." ), DocumentType.PRESS_RELEASE: ( "This is a company press release. Be aware that press releases are promotional. " "Extract stated facts and figures but note that sentiment may be biased positive. " "Look for concrete metrics rather than marketing language." ), } def _get_doctype_guidance(document_type: str) -> str: """Return document-type-specific extraction guidance.""" return _DOCTYPE_GUIDANCE.get(document_type, _DOCTYPE_GUIDANCE[DocumentType.ARTICLE]) # --- Prompt builder --- def build_extraction_prompt( document_text: str, document_type: str = DocumentType.ARTICLE, known_tickers: list[str] | None = None, document_id: str = "", ) -> dict[str, str]: """Build system and user prompts for Ollama structured extraction. Args: document_text: Normalized text content of the document. document_type: One of the DocumentType enum values. known_tickers: Optional list of tickers the document may reference. Helps the model focus but does NOT mean all tickers are relevant. document_id: Optional document ID for traceability. Returns: Dict with 'system' and 'user' prompt strings. """ doctype_guidance = _get_doctype_guidance(document_type) ticker_hint = "" if known_tickers: tickers_str = ", ".join(known_tickers) ticker_hint = ( f"\nTracked tickers: {tickers_str}\n" "RULES for companies array:\n" "- If ANY ticker from the list above appears verbatim in the text, " "you MUST include it in companies with at least one evidence_span quote.\n" "- If the article discusses a sector or theme that clearly affects a tracked company " "(e.g. oil prices → XOM, AI chips → NVDA, interest rates → JPM), include that company.\n" "- For each company: set sentiment (positive/negative/neutral/mixed), " "impact_score (0.0-1.0), and copy a verbatim quote into evidence_spans.\n" "- Do NOT invent tickers not in the list above." ) doc_id_line = f"Document ID: {document_id}\n" if document_id else "" # Truncate long documents to reduce prompt size and inference time max_doc_chars = 8000 if len(document_text) > max_doc_chars: document_text = document_text[:max_doc_chars] + "\n[... truncated for extraction ...]" user_prompt = f"""\ Extract structured intelligence from this document. Fill every field. {doc_id_line}Document type: {document_type} {doctype_guidance} {ticker_hint} Fill these fields: - summary: 1-3 sentence summary of the document's main point - companies: array of affected companies (see ticker rules above) - macro_themes: list of broad market themes mentioned - novelty_score: 0.0-1.0 how novel is this information - confidence: 0.0-1.0 your confidence in the extraction quality - extraction_warnings: list any issues For each company entry fill: ticker, company_name, relevance (0-1), sentiment, impact_score (0-1), impact_horizon, catalyst_type, key_facts (list), risks (list), evidence_spans (verbatim quotes from text). catalyst_type MUST be exactly one of: earnings, product, legal, macro, supply_chain, m_and_a, rating_change, other. Use "other" if none of the specific categories fit. --- DOCUMENT TEXT --- {document_text} --- END DOCUMENT TEXT ---""" return { "system": SYSTEM_PROMPT, "user": user_prompt, } def get_prompt_metadata() -> dict[str, str]: """Return metadata about the current prompt version for audit trails.""" return { "prompt_version": PROMPT_VERSION, "schema_version": SCHEMA_VERSION, } def get_json_schema() -> dict[str, Any]: """Return the extraction JSON schema for Ollama structured output format parameter.""" return EXTRACTION_JSON_SCHEMA