Files
stonks-oracle/services/extractor/prompts.py
T
Celes Renata cd782d1552 fix(extractor): streaming with guardrails + catalyst_type normalization
- Switch Ollama calls from non-streaming to streaming with early termination
- Add loop detection, max token limit, and stall timeout guards
- Add catalyst_type alias normalizer to handle model hallucinations
- Add explicit enum values in extraction prompt for catalyst_type
- Add streaming config knobs to OllamaConfig
2026-04-12 15:28:20 -07:00

140 lines
5.4 KiB
Python

"""Extraction prompt templates with anti-hallucination instructions.
Builds structured prompts for Ollama document intelligence extraction.
Each prompt includes the target JSON schema, anti-hallucination rules,
and document-type-specific guidance.
Requirements: 5.1, 5.2, 5.3, 5.4, 5.5
"""
from __future__ import annotations
from typing import Any
from services.extractor.schemas import SCHEMA_VERSION, generate_json_schema
from services.shared.schemas import (
DocumentType,
)
# Identifier of the current prompt revision. It is reported under the
# "prompt_version" key by get_prompt_metadata() for audit trails.
PROMPT_VERSION = "document-intel-v2"
# --- JSON schema for structured output (generated from Pydantic models) ---
# Computed once at import time via generate_json_schema(); get_json_schema()
# returns this same module-level object.
EXTRACTION_JSON_SCHEMA: dict[str, Any] = generate_json_schema()
# --- Anti-hallucination system prompt ---
# A single-line instruction (no embedded newlines) sent as the system message.
SYSTEM_PROMPT = (
    "You extract structured financial intelligence from documents into JSON. "
    "Read the document text carefully and fill every field. "
    "Return ONLY valid JSON. No commentary, no markdown, no explanation."
)
# --- Document-type-specific guidance ---
# Maps each DocumentType member to one paragraph of extraction guidance that
# is inserted into the user prompt. Each paragraph is split one sentence per
# source line; adjacent literals concatenate into a single string.
_DOCTYPE_GUIDANCE: dict[str, str] = {
    DocumentType.ARTICLE: (
        "This is a news article. "
        "Focus on reported facts, quoted sources, and stated analyst opinions. "
        "Distinguish between the journalist's framing and actual company developments. "
        "Do not treat speculative language as confirmed fact."
    ),
    DocumentType.FILING: (
        "This is a regulatory filing (e.g. SEC 10-K, 10-Q, 8-K). "
        "Extract concrete financial figures, risk factors, and material events as stated. "
        "Filings use precise legal language — preserve that precision in your extraction."
    ),
    DocumentType.TRANSCRIPT: (
        "This is an earnings call or event transcript. "
        "Distinguish between management forward-looking statements and reported results. "
        "Flag forward-looking language as lower confidence. "
        "Extract specific guidance numbers when stated."
    ),
    DocumentType.PRESS_RELEASE: (
        "This is a company press release. "
        "Be aware that press releases are promotional. "
        "Extract stated facts and figures but note that sentiment may be biased positive. "
        "Look for concrete metrics rather than marketing language."
    ),
}
def _get_doctype_guidance(document_type: str) -> str:
    """Look up the guidance paragraph for *document_type*.

    Args:
        document_type: Key into the guidance table.

    Returns:
        The matching guidance text, or the article guidance when the
        table has no entry for *document_type*.
    """
    # Unknown document types are treated like news articles.
    article_fallback = _DOCTYPE_GUIDANCE[DocumentType.ARTICLE]
    return _DOCTYPE_GUIDANCE.get(document_type, article_fallback)
# --- Prompt builder ---
def build_extraction_prompt(
    document_text: str,
    document_type: str = DocumentType.ARTICLE,
    known_tickers: list[str] | None = None,
    document_id: str = "",
) -> dict[str, str]:
    """Assemble the system/user prompt pair for one structured extraction call.

    The user prompt consists of, in order: a header (optional document ID,
    document type, type-specific guidance, optional tracked-ticker rules),
    the fixed field-by-field instructions, and the document text wrapped in
    begin/end marker lines.

    Args:
        document_text: Normalized text content of the document.
        document_type: One of the DocumentType enum values; selects the
            guidance paragraph and is printed in the prompt header.
        known_tickers: Optional list of tickers the document may reference.
            When non-empty, the tracked-ticker list and the rules for the
            companies array are added. It helps the model focus but does
            NOT mean all tickers are relevant.
        document_id: Optional document ID for traceability; when empty the
            "Document ID" line is left out.

    Returns:
        Dict with 'system' and 'user' prompt strings.
    """
    guidance = _get_doctype_guidance(document_type)

    if known_tickers:
        tracked = ", ".join(known_tickers)
        rule_lines = [
            "",  # produces the newline that precedes "Tracked tickers"
            f"Tracked tickers: {tracked}",
            "RULES for companies array:",
            (
                "- If ANY ticker from the list above appears verbatim in the text, "
                "you MUST include it in companies with at least one evidence_span quote."
            ),
            (
                "- If the article discusses a sector or theme that clearly affects "
                "a tracked company (e.g. oil prices → XOM, AI chips → NVDA, "
                "interest rates → JPM), include that company."
            ),
            (
                "- For each company: set sentiment (positive/negative/neutral/mixed), "
                "impact_score (0.0-1.0), and copy a verbatim quote into evidence_spans."
            ),
            "- Do NOT invent tickers not in the list above.",
        ]
        ticker_rules = "\n".join(rule_lines)
    else:
        # No tracked tickers: the header keeps an empty line in this slot.
        ticker_rules = ""

    if document_id:
        id_prefix = f"Document ID: {document_id}\n"
    else:
        id_prefix = ""

    # NOTE(review): document_type is rendered with default f-string formatting.
    # If DocumentType members are not plain strings / StrEnum members, the
    # printed text may be the member name rather than its value — confirm.
    header = (
        "Extract structured intelligence from this document. Fill every field.\n"
        f"{id_prefix}Document type: {document_type}\n"
        f"{guidance}\n"
        f"{ticker_rules}\n"
    )

    # Static instructions; kept flush-left so no indentation leaks into the prompt.
    field_instructions = """\
Fill these fields:
- summary: 1-3 sentence summary of the document's main point
- companies: array of affected companies (see ticker rules above)
- macro_themes: list of broad market themes mentioned
- novelty_score: 0.0-1.0 how novel is this information
- confidence: 0.0-1.0 your confidence in the extraction quality
- extraction_warnings: list any issues
For each company entry fill: ticker, company_name, relevance (0-1), sentiment, impact_score (0-1), impact_horizon, catalyst_type, key_facts (list), risks (list), evidence_spans (verbatim quotes from text).
catalyst_type MUST be exactly one of: earnings, product, legal, macro, supply_chain, m_and_a, rating_change, other. Use "other" if none of the specific categories fit.
"""

    document_section = (
        f"--- DOCUMENT TEXT ---\n{document_text}\n--- END DOCUMENT TEXT ---"
    )

    return {
        "system": SYSTEM_PROMPT,
        "user": header + field_instructions + document_section,
    }
def get_prompt_metadata() -> dict[str, str]:
    """Report which prompt and schema versions are in use, for audit trails.

    Returns:
        Dict with keys 'prompt_version' and 'schema_version'.
    """
    metadata: dict[str, str] = {}
    metadata["prompt_version"] = PROMPT_VERSION
    metadata["schema_version"] = SCHEMA_VERSION
    return metadata
def get_json_schema() -> dict[str, Any]:
    """Expose the extraction JSON schema for Ollama's structured output format parameter.

    Returns:
        The module-level schema dict itself (not a copy).
    """
    schema = EXTRACTION_JSON_SCHEMA
    return schema