stonks-oracle/services/extractor/prompts.py

"""Extraction prompt templates with anti-hallucination instructions.

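Builds structured prompts for Ollama document intelligence extraction.
Each prompt includes anti-hallucination rules, document-type-specific
guidance, and the list of fields to fill. The target JSON schema is not
embedded in the prompt text; it is exposed separately via get_json_schema().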

Requirements: 5.1, 5.2, 5.3, 5.4, 5.5
"""
from __future__ import annotations

from typing import Any

from services.extractor.schemas import SCHEMA_VERSION, generate_json_schema
from services.shared.schemas import (
    DocumentType,
)

# Identifier for the current prompt wording; reported under "prompt_version"
# by get_prompt_metadata().
PROMPT_VERSION = "document-intel-v2"

# --- JSON schema for structured output (generated from Pydantic models) ---

# Built once at import time; get_json_schema() returns this same dict object.
EXTRACTION_JSON_SCHEMA: dict[str, Any] = generate_json_schema()

# --- Anti-hallucination system prompt ---

# Single-line system message (no embedded newlines): states the task and
# restricts the reply to bare JSON.
SYSTEM_PROMPT = (
    "You extract structured financial intelligence from documents into JSON. "
    "Read the document text carefully and fill every field. "
    "Return ONLY valid JSON. No commentary, no markdown, no explanation."
)

# --- Document-type-specific guidance ---

# Maps each DocumentType to a paragraph of extraction guidance that
# build_extraction_prompt() inserts into the user prompt. Types without an
# entry fall back to the ARTICLE guidance (see _get_doctype_guidance).
_DOCTYPE_GUIDANCE: dict[str, str] = {
    DocumentType.ARTICLE: (
        "This is a news article. Focus on reported facts, quoted sources, and stated "
        "analyst opinions. Distinguish between the journalist's framing and actual "
        "company developments. Do not treat speculative language as confirmed fact."
    ),
    DocumentType.FILING: (
        "This is a regulatory filing (e.g. SEC 10-K, 10-Q, 8-K). Extract concrete "
        "financial figures, risk factors, and material events as stated. Filings use "
        "precise legal language — preserve that precision in your extraction."
    ),
    DocumentType.TRANSCRIPT: (
        "This is an earnings call or event transcript. Distinguish between management "
        "forward-looking statements and reported results. Flag forward-looking language "
        "as lower confidence. Extract specific guidance numbers when stated."
    ),
    DocumentType.PRESS_RELEASE: (
        "This is a company press release. Be aware that press releases are promotional. "
        "Extract stated facts and figures but note that sentiment may be biased positive. "
        "Look for concrete metrics rather than marketing language."
    ),
}


def _get_doctype_guidance(document_type: str) -> str:
    """Look up the guidance paragraph for *document_type*.

    Any type without an entry in ``_DOCTYPE_GUIDANCE`` receives the
    ARTICLE guidance instead.
    """
    fallback = _DOCTYPE_GUIDANCE[DocumentType.ARTICLE]
    guidance = _DOCTYPE_GUIDANCE.get(document_type)
    if guidance is None:
        return fallback
    return guidance


# --- Prompt builder ---

def build_extraction_prompt(
    document_text: str,
    document_type: str = DocumentType.ARTICLE,
    known_tickers: list[str] | None = None,
    document_id: str = "",
) -> dict[str, str]:
    """Build system and user prompts for Ollama structured extraction.

    The user prompt is assembled from, in order: an optional document-ID
    line, the document type, type-specific guidance, optional tracked-ticker
    rules, the list of fields to fill, the allowed ``catalyst_type`` values,
    and the document text wrapped in start/end delimiter lines.

    Args:
        document_text: Normalized text content of the document.
        document_type: One of the DocumentType enum values. Types without
            dedicated guidance fall back to the article guidance.
        known_tickers: Optional list of tickers the document may reference.
            Helps the model focus but does NOT mean all tickers are relevant.
            When empty or None, no ticker rules are emitted and the
            ``companies`` field description does not refer to them.
        document_id: Optional document ID for traceability. Omitted from the
            prompt when empty.

    Returns:
        Dict with 'system' and 'user' prompt strings.
    """
    doctype_guidance = _get_doctype_guidance(document_type)

    # The default argument is a DocumentType member rather than a plain
    # string. Render its ``value`` when it has one so the prompt shows the
    # bare type string regardless of how the enum formats itself in f-strings.
    # NOTE(review): DocumentType is defined outside this file — confirm that
    # its member values are the intended type strings.
    document_type_label = getattr(document_type, "value", document_type)

    if known_tickers:
        tickers_str = ", ".join(known_tickers)
        ticker_hint = (
            f"\nTracked tickers: {tickers_str}\n"
            "RULES for companies array:\n"
            "- If ANY ticker from the list above appears verbatim in the text, "
            "you MUST include it in companies with at least one evidence_span quote.\n"
            "- If the article discusses a sector or theme that clearly affects a tracked company "
            "(e.g. oil prices → XOM, AI chips → NVDA, interest rates → JPM), include that company.\n"
            "- For each company: set sentiment (positive/negative/neutral/mixed), "
            "impact_score (0.0-1.0), and copy a verbatim quote into evidence_spans.\n"
            "- Do NOT invent tickers not in the list above."
        )
        companies_field = "- companies: array of affected companies (see ticker rules above)"
    else:
        ticker_hint = ""
        # No ticker rules are emitted in this branch, so the field description
        # must not point the model at rules that are not in the prompt.
        # NOTE(review): this changes the no-ticker prompt wording; consider
        # whether PROMPT_VERSION should be bumped for audit purposes.
        companies_field = "- companies: array of affected companies"

    doc_id_line = f"Document ID: {document_id}\n" if document_id else ""

    user_prompt = f"""\
Extract structured intelligence from this document. Fill every field.

{doc_id_line}Document type: {document_type_label}
{doctype_guidance}
{ticker_hint}
Fill these fields:
- summary: 1-3 sentence summary of the document's main point
{companies_field}
- macro_themes: list of broad market themes mentioned
- novelty_score: 0.0-1.0 how novel is this information
- confidence: 0.0-1.0 your confidence in the extraction quality
- extraction_warnings: list any issues

For each company entry fill: ticker, company_name, relevance (0-1), sentiment, impact_score (0-1), impact_horizon, catalyst_type, key_facts (list), risks (list), evidence_spans (verbatim quotes from text).

catalyst_type MUST be exactly one of: earnings, product, legal, macro, supply_chain, m_and_a, rating_change, other. Use "other" if none of the specific categories fit.

--- DOCUMENT TEXT ---
{document_text}
--- END DOCUMENT TEXT ---"""

    return {
        "system": SYSTEM_PROMPT,
        "user": user_prompt,
    }


def get_prompt_metadata() -> dict[str, str]:
    """Report the prompt and schema versions in use, for audit trails.

    Returns:
        Dict with 'prompt_version' and 'schema_version' keys.
    """
    metadata: dict[str, str] = {}
    metadata["prompt_version"] = PROMPT_VERSION
    metadata["schema_version"] = SCHEMA_VERSION
    return metadata


def get_json_schema() -> dict[str, Any]:
    """Expose the extraction JSON schema for Ollama's structured output format parameter.

    The module-level ``EXTRACTION_JSON_SCHEMA`` dict itself is returned, not
    a copy, so callers share one object.
    """
    schema = EXTRACTION_JSON_SCHEMA
    return schema