diff --git a/services/extractor/client.py b/services/extractor/client.py index 324d3c7..cba4c17 100644 --- a/services/extractor/client.py +++ b/services/extractor/client.py @@ -259,7 +259,7 @@ class OllamaClient: "stream": False, "think": False, "options": { - "num_predict": 16384, + "num_predict": 4096, }, } diff --git a/services/extractor/event_classifier.py b/services/extractor/event_classifier.py index 493bdd9..29e7068 100644 --- a/services/extractor/event_classifier.py +++ b/services/extractor/event_classifier.py @@ -183,22 +183,15 @@ def get_event_json_schema() -> dict[str, Any]: # --------------------------------------------------------------------------- _SYSTEM_PROMPT = """\ -You classify global news articles into structured macro event intelligence. \ -Read the article carefully and extract the event classification. \ -Return ONLY valid JSON matching the schema. No commentary, no markdown, no explanation.""" +You classify global news into structured macro event JSON. \ +Return ONLY a single JSON object. No markdown, no explanation. \ +Every field is required. Keep key_facts to 3-5 items. Keep summary under 3 sentences.""" _ANTI_HALLUCINATION_RULES = """\ -CRITICAL RULES — read carefully: -1. Only extract information EXPLICITLY stated in the article text. -2. Do NOT infer, speculate, or fabricate facts, regions, sectors, or commodities. -3. If the article mentions multiple distinct impact types, include ALL of them in event_types. -4. For affected_regions, only include regions explicitly mentioned or clearly implied by the event. -5. For affected_sectors, only include sectors with a clear causal link to the event. -6. For affected_commodities, only include commodities directly referenced or obviously impacted. -7. For key_facts, each fact must be directly supported by a specific passage in the text. -8. If the article is vague or speculative, set confidence LOW (below 0.4). -9. Do NOT treat journalist speculation or opinion as confirmed fact. -10. Distinguish between announced policy and proposed/rumored policy.""" +RULES: +- Only extract facts EXPLICITLY stated in the text. Do NOT fabricate. +- If vague or speculative, set confidence below 0.4. +- Distinguish announced policy from rumored policy.""" def build_event_classification_prompt(text: str) -> dict[str, str]: @@ -210,6 +203,11 @@ def build_event_classification_prompt(text: str) -> dict[str, str]: Returns: Dict with 'system' and 'user' prompt strings. """ + # Truncate long articles to reduce inference time + max_chars = 6000 + if len(text) > max_chars: + text = text[:max_chars] + "\n[... truncated ...]" + user_prompt = f"""\ Classify this global news article as a macro event. Fill every field. diff --git a/services/extractor/prompts.py b/services/extractor/prompts.py index 0303f2a..8d5e74b 100644 --- a/services/extractor/prompts.py +++ b/services/extractor/prompts.py @@ -24,9 +24,10 @@ EXTRACTION_JSON_SCHEMA: dict[str, Any] = generate_json_schema() # --- Anti-hallucination system prompt --- SYSTEM_PROMPT = """\ -You extract structured financial intelligence from documents into JSON. \ -Read the document text carefully and fill every field. \ -Return ONLY valid JSON. No commentary, no markdown, no explanation.""" +You are a financial document analyst. Extract structured data as JSON. \ +Return ONLY a single JSON object. No markdown fences, no explanation, no text before or after the JSON. \ +Every field in the schema is required. Use "other" for catalyst_type if unsure. \ +Keep evidence_spans short (under 20 words each). Keep key_facts to 3-5 items max.""" # --- Document-type-specific guidance --- @@ -98,6 +99,11 @@ def build_extraction_prompt( doc_id_line = f"Document ID: {document_id}\n" if document_id else "" + # Truncate long documents to reduce prompt size and inference time + max_doc_chars = 8000 + if len(document_text) > max_doc_chars: + document_text = document_text[:max_doc_chars] + "\n[... truncated for extraction ...]" + user_prompt = f"""\ Extract structured intelligence from this document. Fill every field.