150 lines
5.6 KiB
Python
150 lines
5.6 KiB
Python
"""Extraction prompt templates with anti-hallucination instructions.
|
|
|
|
Builds structured prompts for Ollama document intelligence extraction.
|
|
Each prompt includes the target JSON schema, anti-hallucination rules,
|
|
and document-type-specific guidance.
|
|
|
|
Requirements: 5.1, 5.2, 5.3, 5.4, 5.5
|
|
"""
|
|
from __future__ import annotations
|
|
|
|
import json
|
|
from typing import Any
|
|
|
|
from services.extractor.schemas import generate_json_schema, SCHEMA_VERSION
|
|
from services.shared.schemas import (
|
|
DocumentType,
|
|
)
|
|
|
|
# Version tag for the prompt text defined in this module; reported to callers
# through get_prompt_metadata() under the "prompt_version" key.
PROMPT_VERSION = "document-intel-v1"
|
|
|
|
# --- JSON schema for structured output (generated from Pydantic models) ---
|
|
|
|
# Built once at import time by generate_json_schema(). It is serialized into the
# user prompt by build_extraction_prompt() and returned by get_json_schema().
EXTRACTION_JSON_SCHEMA: dict[str, Any] = generate_json_schema()
|
|
|
|
# --- Anti-hallucination system prompt ---
|
|
|
|
# System prompt sent with every extraction request. It is written as adjacent
# string literals so each rule sits on its own source line; a literal that does
# not end in "\n" continues the same rule on the next source line.
SYSTEM_PROMPT = (
    "You are a financial document analysis system. "
    "You extract structured intelligence from financial documents into JSON.\n"
    "\n"
    "STRICT RULES — VIOLATIONS WILL INVALIDATE YOUR OUTPUT:\n"
    "\n"
    "1. ONLY extract information explicitly stated in the document text provided.\n"
    "2. NEVER fabricate facts, quotes, numbers, dates, or company names.\n"
    "3. NEVER infer information that is not directly supported by the text.\n"
    "4. If the document does not mention a company, do NOT include that company.\n"
    '5. If the document is ambiguous about sentiment or impact, use "neutral" or "mixed" '
    "and set confidence lower.\n"
    "6. evidence_spans MUST be short verbatim quotes copied from the document. "
    "Do NOT paraphrase or invent quotes.\n"
    "7. key_facts MUST be directly stated in the document. Do NOT add external knowledge.\n"
    "8. If you are uncertain about any field, lower the confidence score and add a warning "
    "to extraction_warnings.\n"
    "9. If the document text is too short, garbled, or uninformative, return an empty "
    'companies array, set confidence below 0.3, and add "insufficient_content" to warnings.\n'
    "10. Return ONLY valid JSON matching the provided schema. No commentary, no markdown fences."
)
|
|
|
|
# --- Document-type-specific guidance ---
|
|
|
|
# Per-document-type guidance paragraph inserted into the user prompt.
# Keys are DocumentType members; each value is one paragraph, written here as
# one string literal per sentence.
_DOCTYPE_GUIDANCE: dict[str, str] = {
    DocumentType.ARTICLE: (
        "This is a news article. "
        "Focus on reported facts, quoted sources, and stated analyst opinions. "
        "Distinguish between the journalist's framing and actual company developments. "
        "Do not treat speculative language as confirmed fact."
    ),
    DocumentType.FILING: (
        "This is a regulatory filing (e.g. SEC 10-K, 10-Q, 8-K). "
        "Extract concrete financial figures, risk factors, and material events as stated. "
        "Filings use precise legal language — preserve that precision in your extraction."
    ),
    DocumentType.TRANSCRIPT: (
        "This is an earnings call or event transcript. "
        "Distinguish between management forward-looking statements and reported results. "
        "Flag forward-looking language as lower confidence. "
        "Extract specific guidance numbers when stated."
    ),
    DocumentType.PRESS_RELEASE: (
        "This is a company press release. "
        "Be aware that press releases are promotional. "
        "Extract stated facts and figures but note that sentiment may be biased positive. "
        "Look for concrete metrics rather than marketing language."
    ),
}
|
|
|
|
|
|
def _get_doctype_guidance(document_type: str) -> str:
    """Look up the guidance paragraph for a document type.

    Args:
        document_type: Key to look up in ``_DOCTYPE_GUIDANCE``.

    Returns:
        The guidance string for ``document_type``, or the article guidance
        when the type has no entry of its own.
    """
    article_fallback = _DOCTYPE_GUIDANCE[DocumentType.ARTICLE]
    return _DOCTYPE_GUIDANCE.get(document_type, article_fallback)
|
|
|
|
|
|
# --- Prompt builder ---
|
|
|
|
def build_extraction_prompt(
    document_text: str,
    document_type: str = DocumentType.ARTICLE,
    known_tickers: list[str] | None = None,
    document_id: str = "",
) -> dict[str, str]:
    """Build system and user prompts for Ollama structured extraction.

    The user prompt is assembled in a fixed order: task line, optional
    document ID, document type with its guidance paragraph, optional ticker
    hint, the JSON schema, a short reminder of the extraction rules, and
    finally the document text between explicit start/end markers.

    Args:
        document_text: Normalized text content of the document.
        document_type: One of the DocumentType enum values.
        known_tickers: Optional list of tickers the document may reference.
            Helps the model focus but does NOT mean all tickers are relevant.
        document_id: Optional document ID for traceability. The ID line is
            left out of the prompt when this is empty.

    Returns:
        Dict with 'system' and 'user' prompt strings.
    """
    doctype_guidance = _get_doctype_guidance(document_type)

    # Render the type by its underlying value instead of formatting the
    # object directly. How an enum member formats depends on the Python
    # version: since 3.11 a ``(str, Enum)`` member formats as
    # "DocumentType.ARTICLE" rather than its value, which would put the
    # Python class name into the prompt. DocumentType is defined elsewhere,
    # so this is written to be safe for any flavour: plain strings have no
    # ``.value`` attribute and pass through unchanged.
    doctype_label = getattr(document_type, "value", document_type)

    ticker_hint = ""
    if known_tickers:
        tickers_str = ", ".join(known_tickers)
        ticker_hint = (
            f"\nThe following tickers may be referenced in this document: {tickers_str}\n"
            "Only include a ticker in your output if the document actually discusses that company. "
            "Do NOT include a ticker just because it appears in this hint."
        )

    schema_str = json.dumps(EXTRACTION_JSON_SCHEMA, indent=2)

    doc_id_line = f"Document ID: {document_id}\n" if document_id else ""

    # The template lines start at column 0 on purpose: any leading
    # whitespace here would become part of the prompt text.
    user_prompt = f"""\
Extract structured intelligence from the following document.

{doc_id_line}Document type: {doctype_label}
{doctype_guidance}
{ticker_hint}
Your output MUST be a single JSON object conforming to this schema:
{schema_str}

REMEMBER:
- Only extract what is explicitly in the text below.
- evidence_spans must be verbatim quotes from the text.
- If the text is insufficient, return empty companies and low confidence.
- Return ONLY the JSON object. No other text.

--- DOCUMENT TEXT ---
{document_text}
--- END DOCUMENT TEXT ---"""

    return {
        "system": SYSTEM_PROMPT,
        "user": user_prompt,
    }
|
|
|
|
|
|
def get_prompt_metadata() -> dict[str, str]:
    """Report the prompt and schema versions in use, for audit trails.

    Returns:
        Dict with ``prompt_version`` set to PROMPT_VERSION and
        ``schema_version`` set to SCHEMA_VERSION.
    """
    return dict(
        prompt_version=PROMPT_VERSION,
        schema_version=SCHEMA_VERSION,
    )
|
|
|
|
|
|
def get_json_schema() -> dict[str, Any]:
    """Expose the extraction JSON schema for Ollama's structured output format parameter.

    Returns:
        The module-level EXTRACTION_JSON_SCHEMA dict itself, not a copy.
    """
    schema = EXTRACTION_JSON_SCHEMA
    return schema
|