# stonks-oracle/services/extractor/prompts.py

"""Extraction prompt templates with anti-hallucination instructions.

Builds structured prompts for Ollama document intelligence extraction.
Each prompt includes the target JSON schema, anti-hallucination rules,
and document-type-specific guidance.

Requirements: 5.1, 5.2, 5.3, 5.4, 5.5
"""
from __future__ import annotations

import json
from typing import Any

from services.extractor.schemas import generate_json_schema, SCHEMA_VERSION
from services.shared.schemas import (
    DocumentType,
)

# Identifier for the current prompt wording; reported by get_prompt_metadata()
# so extraction results can be traced back to the prompt that produced them.
PROMPT_VERSION = "document-intel-v1"

# --- JSON schema for structured output (generated from Pydantic models) ---

# Built once at import time by generate_json_schema(). This same dict object is
# serialized into every user prompt and handed out by get_json_schema().
EXTRACTION_JSON_SCHEMA: dict[str, Any] = generate_json_schema()

# --- Anti-hallucination system prompt ---

# Returned unchanged as the "system" prompt by build_extraction_prompt().
# A trailing backslash inside the literal is a line continuation, so those
# source line breaks do not appear in the resulting prompt text.
# NOTE(review): rule 8 names the field "extraction_warnings" while rule 9 says
# only "warnings" — presumably the same field; confirm against the schema
# before rewording (and bump PROMPT_VERSION if the text changes).
SYSTEM_PROMPT = """\
You are a financial document analysis system. You extract structured intelligence \
from financial documents into JSON.

STRICT RULES — VIOLATIONS WILL INVALIDATE YOUR OUTPUT:

1. ONLY extract information explicitly stated in the document text provided.
2. NEVER fabricate facts, quotes, numbers, dates, or company names.
3. NEVER infer information that is not directly supported by the text.
4. If the document does not mention a company, do NOT include that company.
5. If the document is ambiguous about sentiment or impact, use "neutral" or "mixed" \
and set confidence lower.
6. evidence_spans MUST be short verbatim quotes copied from the document. \
Do NOT paraphrase or invent quotes.
7. key_facts MUST be directly stated in the document. Do NOT add external knowledge.
8. If you are uncertain about any field, lower the confidence score and add a warning \
to extraction_warnings.
9. If the document text is too short, garbled, or uninformative, return an empty \
companies array, set confidence below 0.3, and add "insufficient_content" to warnings.
10. Return ONLY valid JSON matching the provided schema. No commentary, no markdown fences."""

# --- Document-type-specific guidance ---

# One guidance paragraph per DocumentType member. The selected paragraph is
# inserted into the user prompt directly below the "Document type:" line.
# Types with no entry here fall back to the ARTICLE paragraph (see
# _get_doctype_guidance).
# NOTE(review): keys are DocumentType members but the annotation says str —
# presumably DocumentType is a str-based enum; confirm in services.shared.schemas.
_DOCTYPE_GUIDANCE: dict[str, str] = {
    DocumentType.ARTICLE: (
        "This is a news article. Focus on reported facts, quoted sources, and stated "
        "analyst opinions. Distinguish between the journalist's framing and actual "
        "company developments. Do not treat speculative language as confirmed fact."
    ),
    DocumentType.FILING: (
        "This is a regulatory filing (e.g. SEC 10-K, 10-Q, 8-K). Extract concrete "
        "financial figures, risk factors, and material events as stated. Filings use "
        "precise legal language — preserve that precision in your extraction."
    ),
    DocumentType.TRANSCRIPT: (
        "This is an earnings call or event transcript. Distinguish between management "
        "forward-looking statements and reported results. Flag forward-looking language "
        "as lower confidence. Extract specific guidance numbers when stated."
    ),
    DocumentType.PRESS_RELEASE: (
        "This is a company press release. Be aware that press releases are promotional. "
        "Extract stated facts and figures but note that sentiment may be biased positive. "
        "Look for concrete metrics rather than marketing language."
    ),
}


def _get_doctype_guidance(document_type: str) -> str:
    """Look up the guidance paragraph for ``document_type``.

    Document types without a dedicated entry receive the news-article
    guidance instead.
    """
    fallback = _DOCTYPE_GUIDANCE[DocumentType.ARTICLE]
    return _DOCTYPE_GUIDANCE.get(document_type, fallback)


# --- Prompt builder ---

def build_extraction_prompt(
    document_text: str,
    document_type: str = DocumentType.ARTICLE,
    known_tickers: list[str] | None = None,
    document_id: str = "",
) -> dict[str, str]:
    """Build system and user prompts for Ollama structured extraction.

    The user prompt contains, in order: an optional document ID line, the
    document type and its type-specific guidance, an optional ticker hint,
    the JSON schema the output must conform to, a short reminder of the
    anti-hallucination rules, and finally the document text between
    ``--- DOCUMENT TEXT ---`` / ``--- END DOCUMENT TEXT ---`` markers.

    Args:
        document_text: Normalized text content of the document.
        document_type: One of the DocumentType enum values (a member or its
            plain string value). Unknown types use the article guidance.
        known_tickers: Optional list of tickers the document may reference.
            Helps the model focus but does NOT mean all tickers are relevant.
            A single bare string is treated as a one-element list.
        document_id: Optional document ID for traceability.

    Returns:
        Dict with 'system' and 'user' prompt strings.
    """
    doctype_guidance = _get_doctype_guidance(document_type)

    # Render enum members by their value. On Python 3.11+ an f-string formats a
    # ``(str, Enum)`` member as "DocumentType.ARTICLE" rather than its value
    # (only StrEnum keeps the value), which would leak the Python class name
    # into the prompt. Plain strings have no ``.value`` and pass through as-is.
    doctype_label = getattr(document_type, "value", document_type)

    ticker_hint = ""
    if known_tickers:
        # Guard against a bare string, which ", ".join would split into
        # individual characters ("AAPL" -> "A, A, P, L").
        if isinstance(known_tickers, str):
            known_tickers = [known_tickers]
        tickers_str = ", ".join(known_tickers)
        ticker_hint = (
            f"\nThe following tickers may be referenced in this document: {tickers_str}\n"
            "Only include a ticker in your output if the document actually discusses that company. "
            "Do NOT include a ticker just because it appears in this hint."
        )

    schema_str = json.dumps(EXTRACTION_JSON_SCHEMA, indent=2)

    # Omit the line entirely (including its newline) when no ID is supplied.
    doc_id_line = f"Document ID: {document_id}\n" if document_id else ""

    # NOTE(review): document_text is interpolated verbatim; text that itself
    # contains the end marker could blur the document boundary for the model.
    user_prompt = f"""\
Extract structured intelligence from the following document.

{doc_id_line}Document type: {doctype_label}
{doctype_guidance}
{ticker_hint}
Your output MUST be a single JSON object conforming to this schema:
{schema_str}

REMEMBER:
- Only extract what is explicitly in the text below.
- evidence_spans must be verbatim quotes from the text.
- If the text is insufficient, return empty companies and low confidence.
- Return ONLY the JSON object. No other text.

--- DOCUMENT TEXT ---
{document_text}
--- END DOCUMENT TEXT ---"""

    return {
        "system": SYSTEM_PROMPT,
        "user": user_prompt,
    }


def get_prompt_metadata() -> dict[str, str]:
    """Report which prompt and schema versions are in effect, for audit trails."""
    return dict(
        prompt_version=PROMPT_VERSION,
        schema_version=SCHEMA_VERSION,
    )


def get_json_schema() -> dict[str, Any]:
    """Expose the extraction JSON schema for Ollama's structured output format parameter.

    The module-level schema dict itself is returned, not a copy.
    """
    schema = EXTRACTION_JSON_SCHEMA
    return schema