"""Extraction prompt templates with anti-hallucination instructions.

Builds structured prompts for Ollama document intelligence extraction.
Each prompt includes the target JSON schema, anti-hallucination rules,
and document-type-specific guidance.

Requirements: 5.1, 5.2, 5.3, 5.4, 5.5
"""
|
|
from __future__ import annotations
|
|
|
|
from typing import Any
|
|
|
|
from services.extractor.schemas import SCHEMA_VERSION, generate_json_schema
|
|
from services.shared.schemas import (
|
|
DocumentType,
|
|
)
|
|
|
|
# Identifier of this prompt revision; reported by get_prompt_metadata() for audit trails.
PROMPT_VERSION = "document-intel-v2"
|
|
|
|
# --- JSON schema for structured output (generated from Pydantic models) ---
|
|
|
|
# Built once at import time by generate_json_schema(); get_json_schema() returns this same object.
EXTRACTION_JSON_SCHEMA: dict[str, Any] = generate_json_schema()
|
|
|
|
# --- Anti-hallucination system prompt ---
|
|
|
|
# System-role prompt returned under the "system" key by build_extraction_prompt().
# The trailing backslashes join the source lines, so the value is one single
# line of text with no embedded newlines.
SYSTEM_PROMPT = """\
You extract structured financial intelligence from documents into JSON. \
Read the document text carefully and fill every field. \
Return ONLY valid JSON. No commentary, no markdown, no explanation."""
|
|
|
|
# --- Document-type-specific guidance ---
|
|
|
|
# Guidance text per document type. build_extraction_prompt() places the
# selected entry in the user prompt directly after the "Document type:" line.
# Keys are DocumentType members; _get_doctype_guidance() falls back to the
# ARTICLE entry for any type that has no entry here.
_DOCTYPE_GUIDANCE: dict[str, str] = {
    DocumentType.ARTICLE: (
        "This is a news article. Focus on reported facts, quoted sources, and stated "
        "analyst opinions. Distinguish between the journalist's framing and actual "
        "company developments. Do not treat speculative language as confirmed fact."
    ),
    DocumentType.FILING: (
        "This is a regulatory filing (e.g. SEC 10-K, 10-Q, 8-K). Extract concrete "
        "financial figures, risk factors, and material events as stated. Filings use "
        "precise legal language — preserve that precision in your extraction."
    ),
    DocumentType.TRANSCRIPT: (
        "This is an earnings call or event transcript. Distinguish between management "
        "forward-looking statements and reported results. Flag forward-looking language "
        "as lower confidence. Extract specific guidance numbers when stated."
    ),
    DocumentType.PRESS_RELEASE: (
        "This is a company press release. Be aware that press releases are promotional. "
        "Extract stated facts and figures but note that sentiment may be biased positive. "
        "Look for concrete metrics rather than marketing language."
    ),
}
|
|
|
|
|
|
def _get_doctype_guidance(document_type: str) -> str:
    """Return document-type-specific extraction guidance.

    Args:
        document_type: Key to look up in ``_DOCTYPE_GUIDANCE``.

    Returns:
        The guidance text registered for ``document_type``. A type with no
        entry gets the ``DocumentType.ARTICLE`` guidance instead of raising.
    """
    return _DOCTYPE_GUIDANCE.get(document_type, _DOCTYPE_GUIDANCE[DocumentType.ARTICLE])
|
|
|
|
|
|
# --- Prompt builder ---
|
|
|
|
def build_extraction_prompt(
    document_text: str,
    document_type: str = DocumentType.ARTICLE,
    known_tickers: list[str] | None = None,
    document_id: str = "",
) -> dict[str, str]:
    """Build system and user prompts for Ollama structured extraction.

    Args:
        document_text: Normalized text content of the document.
        document_type: One of the DocumentType enum values.
        known_tickers: Optional list of tickers the document may reference.
            Helps the model focus but does NOT mean all tickers are relevant.
            When empty or None, no ticker rules are added to the prompt.
        document_id: Optional document ID for traceability. When empty, the
            "Document ID:" line is omitted.

    Returns:
        Dict with 'system' and 'user' prompt strings.
    """
    doctype_guidance = _get_doctype_guidance(document_type)

    # Show the type by its plain value. Since Python 3.11 an f-string renders a
    # (str, Enum) member as "ClassName.MEMBER" rather than its value; plain
    # strings have no ``.value`` and pass through unchanged.
    # NOTE(review): DocumentType is defined in services.shared.schemas and is
    # not visible here — confirm which enum base it uses.
    document_type_label = getattr(document_type, "value", document_type)

    ticker_hint = ""
    if known_tickers:
        tickers_str = ", ".join(known_tickers)
        ticker_hint = (
            f"\nTracked tickers: {tickers_str}\n"
            "RULES for companies array:\n"
            "- If ANY ticker from the list above appears verbatim in the text, "
            "you MUST include it in companies with at least one evidence_span quote.\n"
            "- If the article discusses a sector or theme that clearly affects a tracked company "
            "(e.g. oil prices → XOM, AI chips → NVDA, interest rates → JPM), include that company.\n"
            "- For each company: set sentiment (positive/negative/neutral/mixed), "
            "impact_score (0.0-1.0), and copy a verbatim quote into evidence_spans.\n"
            "- Do NOT invent tickers not in the list above."
        )

    doc_id_line = f"Document ID: {document_id}\n" if document_id else ""

    # The template lines start at column 0 on purpose: any indentation here
    # would become part of the prompt text.
    # NOTE(review): the "companies" bullet says "see ticker rules above" even
    # when no tickers were supplied and ticker_hint is empty.
    user_prompt = f"""\
Extract structured intelligence from this document. Fill every field.

{doc_id_line}Document type: {document_type_label}
{doctype_guidance}
{ticker_hint}
Fill these fields:
- summary: 1-3 sentence summary of the document's main point
- companies: array of affected companies (see ticker rules above)
- macro_themes: list of broad market themes mentioned
- novelty_score: 0.0-1.0 how novel is this information
- confidence: 0.0-1.0 your confidence in the extraction quality
- extraction_warnings: list any issues

For each company entry fill: ticker, company_name, relevance (0-1), sentiment, impact_score (0-1), impact_horizon, catalyst_type, key_facts (list), risks (list), evidence_spans (verbatim quotes from text).

--- DOCUMENT TEXT ---
{document_text}
--- END DOCUMENT TEXT ---"""

    return {
        "system": SYSTEM_PROMPT,
        "user": user_prompt,
    }
|
|
|
|
|
|
def get_prompt_metadata() -> dict[str, str]:
    """Return metadata about the current prompt version for audit trails.

    Returns:
        Dict with ``prompt_version`` (the module's ``PROMPT_VERSION``) and
        ``schema_version`` (``SCHEMA_VERSION`` imported from
        ``services.extractor.schemas``).
    """
    return {
        "prompt_version": PROMPT_VERSION,
        "schema_version": SCHEMA_VERSION,
    }
|
|
|
|
|
|
def get_json_schema() -> dict[str, Any]:
    """Return the extraction JSON schema for Ollama structured output format parameter.

    Returns:
        The module-level ``EXTRACTION_JSON_SCHEMA`` dict itself, not a copy,
        so in-place changes by a caller affect every later caller.
    """
    return EXTRACTION_JSON_SCHEMA
|