phase 14-15: docker build validation and helm deployment
This commit is contained in:
@@ -0,0 +1,149 @@
|
||||
"""Extraction prompt templates with anti-hallucination instructions.
|
||||
|
||||
Builds structured prompts for Ollama document intelligence extraction.
|
||||
Each prompt includes the target JSON schema, anti-hallucination rules,
|
||||
and document-type-specific guidance.
|
||||
|
||||
Requirements: 5.1, 5.2, 5.3, 5.4, 5.5
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
from typing import Any
|
||||
|
||||
from services.extractor.schemas import generate_json_schema, SCHEMA_VERSION
|
||||
from services.shared.schemas import (
|
||||
DocumentType,
|
||||
)
|
||||
|
||||
# Identifier of this prompt revision; reported by get_prompt_metadata().
PROMPT_VERSION = "document-intel-v1"

# --- JSON schema for structured output (generated from Pydantic models) ---

# Built once at import time; the same dict object is embedded in every user
# prompt and handed out by get_json_schema().
EXTRACTION_JSON_SCHEMA: dict[str, Any] = generate_json_schema()
|
||||
|
||||
# --- Anti-hallucination system prompt ---
|
||||
|
||||
# System prompt sent with every extraction request. Adjacent literals are
# concatenated by the compiler; each rule ends with an explicit "\n" except the
# last, so the prompt carries no trailing newline.
SYSTEM_PROMPT = (
    "You are a financial document analysis system. You extract structured intelligence "
    "from financial documents into JSON.\n"
    "\n"
    "STRICT RULES — VIOLATIONS WILL INVALIDATE YOUR OUTPUT:\n"
    "\n"
    "1. ONLY extract information explicitly stated in the document text provided.\n"
    "2. NEVER fabricate facts, quotes, numbers, dates, or company names.\n"
    "3. NEVER infer information that is not directly supported by the text.\n"
    "4. If the document does not mention a company, do NOT include that company.\n"
    '5. If the document is ambiguous about sentiment or impact, use "neutral" or "mixed" '
    "and set confidence lower.\n"
    "6. evidence_spans MUST be short verbatim quotes copied from the document. "
    "Do NOT paraphrase or invent quotes.\n"
    "7. key_facts MUST be directly stated in the document. Do NOT add external knowledge.\n"
    "8. If you are uncertain about any field, lower the confidence score and add a warning "
    "to extraction_warnings.\n"
    "9. If the document text is too short, garbled, or uninformative, return an empty "
    'companies array, set confidence below 0.3, and add "insufficient_content" to warnings.\n'
    "10. Return ONLY valid JSON matching the provided schema. No commentary, no markdown fences."
)
|
||||
|
||||
# --- Document-type-specific guidance ---
|
||||
|
||||
# One guidance paragraph per supported document type, inserted into the user
# prompt right after the "Document type:" line. Each value is a single line of
# text; the literals are split at sentence boundaries for readability only.
# NOTE(review): keys are DocumentType members while the annotation says str —
# presumably DocumentType is a str-based enum; confirm against its definition.
_DOCTYPE_GUIDANCE: dict[str, str] = {
    DocumentType.ARTICLE: (
        "This is a news article. "
        "Focus on reported facts, quoted sources, and stated analyst opinions. "
        "Distinguish between the journalist's framing and actual company developments. "
        "Do not treat speculative language as confirmed fact."
    ),
    DocumentType.FILING: (
        "This is a regulatory filing (e.g. SEC 10-K, 10-Q, 8-K). "
        "Extract concrete financial figures, risk factors, and material events as stated. "
        "Filings use precise legal language — preserve that precision in your extraction."
    ),
    DocumentType.TRANSCRIPT: (
        "This is an earnings call or event transcript. "
        "Distinguish between management forward-looking statements and reported results. "
        "Flag forward-looking language as lower confidence. "
        "Extract specific guidance numbers when stated."
    ),
    DocumentType.PRESS_RELEASE: (
        "This is a company press release. "
        "Be aware that press releases are promotional. "
        "Extract stated facts and figures but note that sentiment may be biased positive. "
        "Look for concrete metrics rather than marketing language."
    ),
}
|
||||
|
||||
|
||||
def _get_doctype_guidance(document_type: str) -> str:
    """Look up the extraction guidance paragraph for ``document_type``.

    Types without an entry in ``_DOCTYPE_GUIDANCE`` fall back to the
    news-article guidance.
    """
    try:
        return _DOCTYPE_GUIDANCE[document_type]
    except KeyError:
        # Unknown type: use the article guidance as the default.
        return _DOCTYPE_GUIDANCE[DocumentType.ARTICLE]
|
||||
|
||||
|
||||
# --- Prompt builder ---
|
||||
|
||||
def build_extraction_prompt(
    document_text: str,
    document_type: str = DocumentType.ARTICLE,
    known_tickers: list[str] | None = None,
    document_id: str = "",
) -> dict[str, str]:
    """Assemble the system/user prompt pair for one Ollama extraction request.

    The user prompt is laid out in this order: task line, optional document
    ID, document type with its guidance paragraph, optional ticker hint, the
    target JSON schema, a short reminder of the key rules, and finally the
    document text between explicit delimiters.

    Args:
        document_text: Normalized text content of the document.
        document_type: One of the DocumentType enum values. Types without
            dedicated guidance use the article guidance.
        known_tickers: Optional list of tickers the document may reference.
            Passed to the model as a hint only; the prompt states that a
            ticker must not be included just because it appears in the hint.
        document_id: Optional document ID for traceability. Left out of the
            prompt when empty.

    Returns:
        Dict with 'system' and 'user' prompt strings.
    """
    guidance = _get_doctype_guidance(document_type)

    if known_tickers:
        listed = ", ".join(known_tickers)
        ticker_note = "".join(
            (
                f"\nThe following tickers may be referenced in this document: {listed}\n",
                "Only include a ticker in your output if the document actually discusses that company. ",
                "Do NOT include a ticker just because it appears in this hint.",
            )
        )
    else:
        # No hint: the prompt keeps an empty line in this position.
        ticker_note = ""

    schema_json = json.dumps(EXTRACTION_JSON_SCHEMA, indent=2)

    # Carries its own trailing newline so it can share a line slot with the
    # "Document type:" header when present and vanish when absent.
    id_header = f"Document ID: {document_id}\n" if document_id else ""

    # NOTE(review): document_type is rendered through format(); how an enum
    # member prints here depends on how DocumentType is defined — confirm it
    # yields the plain value rather than "DocumentType.X".
    prompt_lines = [
        "Extract structured intelligence from the following document.",
        "",
        f"{id_header}Document type: {document_type}",
        guidance,
        ticker_note,
        "Your output MUST be a single JSON object conforming to this schema:",
        schema_json,
        "",
        "REMEMBER:",
        "- Only extract what is explicitly in the text below.",
        "- evidence_spans must be verbatim quotes from the text.",
        "- If the text is insufficient, return empty companies and low confidence.",
        "- Return ONLY the JSON object. No other text.",
        "",
        "--- DOCUMENT TEXT ---",
        f"{document_text}",
        "--- END DOCUMENT TEXT ---",
    ]

    return {"system": SYSTEM_PROMPT, "user": "\n".join(prompt_lines)}
|
||||
|
||||
|
||||
def get_prompt_metadata() -> dict[str, str]:
    """Report the prompt and schema versions in use, for audit trails.

    Returns:
        Dict with 'prompt_version' and 'schema_version' entries.
    """
    metadata: dict[str, str] = {
        "prompt_version": PROMPT_VERSION,
        "schema_version": SCHEMA_VERSION,
    }
    return metadata
|
||||
|
||||
|
||||
def get_json_schema() -> dict[str, Any]:
    """Return the extraction JSON schema for Ollama's structured output format parameter.

    The module-level schema object itself is returned, not a copy, so callers
    should treat it as read-only.
    """
    schema: dict[str, Any] = EXTRACTION_JSON_SCHEMA
    return schema
|
||||
Reference in New Issue
Block a user