diff --git a/infra/helm/stonks-oracle/values.yaml b/infra/helm/stonks-oracle/values.yaml
index 29ef19b..5b0a7ab 100644
--- a/infra/helm/stonks-oracle/values.yaml
+++ b/infra/helm/stonks-oracle/values.yaml
@@ -159,7 +159,7 @@ config:
   MINIO_ENDPOINT: "minio.minio-service.svc.cluster.local:80"
   MINIO_SECURE: "false"
   OLLAMA_BASE_URL: "http://ollama.ollama-service.svc.cluster.local:11434"
-  OLLAMA_MODEL: "qwen3.5:9b"
+  OLLAMA_MODEL: "gemma4:e4b"
   OLLAMA_TIMEOUT: "120"
   OLLAMA_MAX_RETRIES: "2"
   OLLAMA_RETRY_BASE_DELAY: "1.0"
diff --git a/scripts/reenqueue_filings.py b/scripts/reenqueue_filings.py
new file mode 100644
index 0000000..02b17fe
--- /dev/null
+++ b/scripts/reenqueue_filings.py
@@ -0,0 +1,77 @@
+"""Reset low-quality filing documents and re-enqueue a batch for parsing."""
+
+import asyncio
+import json
+import os
+
+import asyncpg
+import redis
+
+PARSING_QUEUE = "stonks:queue:parsing"
+DEFAULT_BATCH_SIZE = 20  # Start with 20 to test
+
+
+async def main(batch_size: int = DEFAULT_BATCH_SIZE) -> None:
+    """Reset low-quality filings to 'ingested' and enqueue a batch for parsing.
+
+    Args:
+        batch_size: Maximum number of (document, ticker) rows to enqueue.
+
+    Raises:
+        KeyError: If a required POSTGRES_* or REDIS_* variable is unset.
+    """
+    # Pass the password as a keyword instead of splicing it into a URL, so
+    # characters such as '@', ':' or '/' cannot corrupt the connection string.
+    queue = redis.Redis(
+        host=os.environ["REDIS_HOST"],
+        port=int(os.environ["REDIS_PORT"]),
+        db=0,
+        password=os.environ.get("REDIS_PASSWORD") or None,
+    )
+    try:
+        # The context manager closes the pool even when a query raises.
+        async with asyncpg.create_pool(
+            host=os.environ["POSTGRES_HOST"],
+            port=int(os.environ["POSTGRES_PORT"]),
+            database=os.environ["POSTGRES_DB"],
+            user=os.environ["POSTGRES_USER"],
+            password=os.environ["POSTGRES_PASSWORD"],
+        ) as pool:
+            # Reset filing docs to ingested
+            # NOTE(review): the reset is unbounded while the enqueue below is
+            # capped at batch_size -- confirm the remainder is picked up later.
+            await pool.execute(
+                "UPDATE documents SET status = 'ingested', parse_quality_score = NULL, parse_confidence = NULL "
+                "WHERE source_type = 'filings_api' AND status = 'low_quality' AND url IS NOT NULL"
+            )
+
+            # ORDER BY keeps the LIMITed batch stable instead of arbitrary.
+            # NOTE(review): the LEFT JOIN yields one row per company mention,
+            # so a document with several tickers is enqueued once per ticker
+            # -- confirm that is what the parser expects.
+            rows = await pool.fetch(
+                "SELECT d.id, dcm.ticker FROM documents d "
+                "LEFT JOIN document_company_mentions dcm ON d.id = dcm.document_id "
+                "WHERE d.source_type = 'filings_api' AND d.status = 'ingested' "
+                "ORDER BY d.id, dcm.ticker "
+                "LIMIT $1",
+                batch_size,
+            )
+
+        payloads = [
+            json.dumps({
+                "document_id": str(row["id"]),
+                "ticker": row["ticker"] or "",
+            })
+            for row in rows
+        ]
+        if payloads:  # RPUSH rejects an empty value list
+            queue.rpush(PARSING_QUEUE, *payloads)
+    finally:
+        queue.close()
+
+    print(f"Enqueued {len(rows)} filing docs for parsing (test batch)")
+
+
+if __name__ == "__main__":
+    asyncio.run(main())
diff --git a/services/extractor/prompts.py b/services/extractor/prompts.py
index 9bb0c40..c4ca28a 100644
--- 
a/services/extractor/prompts.py +++ b/services/extractor/prompts.py @@ -15,7 +15,7 @@ from services.shared.schemas import ( DocumentType, ) -PROMPT_VERSION = "document-intel-v1" +PROMPT_VERSION = "document-intel-v2" # --- JSON schema for structured output (generated from Pydantic models) --- @@ -24,25 +24,9 @@ EXTRACTION_JSON_SCHEMA: dict[str, Any] = generate_json_schema() # --- Anti-hallucination system prompt --- SYSTEM_PROMPT = """\ -You are a financial document analysis system. You extract structured intelligence \ -from financial documents into JSON. - -STRICT RULES — VIOLATIONS WILL INVALIDATE YOUR OUTPUT: - -1. ONLY extract information explicitly stated in the document text provided. -2. NEVER fabricate facts, quotes, numbers, dates, or company names. -3. NEVER infer information that is not directly supported by the text. -4. If the document does not mention a company, do NOT include that company. -5. If the document is ambiguous about sentiment or impact, use "neutral" or "mixed" \ -and set confidence lower. -6. evidence_spans MUST be short verbatim quotes copied from the document. \ -Do NOT paraphrase or invent quotes. -7. key_facts MUST be directly stated in the document. Do NOT add external knowledge. -8. If you are uncertain about any field, lower the confidence score and add a warning \ -to extraction_warnings. -9. If the document text is too short, garbled, or uninformative, return an empty \ -companies array, set confidence below 0.3, and add "insufficient_content" to warnings. -10. Return ONLY valid JSON matching the provided schema. No commentary, no markdown fences.""" +You extract structured financial intelligence from documents into JSON. \ +Read the document text carefully and fill every field. \ +Return ONLY valid JSON. 
No commentary, no markdown, no explanation.""" # --- Document-type-specific guidance --- @@ -102,28 +86,33 @@ def build_extraction_prompt( tickers_str = ", ".join(known_tickers) ticker_hint = ( f"\nTracked tickers: {tickers_str}\n" - "If the document discusses market conditions, sectors, or themes that directly affect " - "any of these companies, include those companies with appropriate sentiment and impact scores. " - "A macro article about oil prices IS relevant to energy companies like XOM. " - "A tech sector article IS relevant to AAPL, MSFT, NVDA, etc. " - "Use your judgment — but only include companies where the connection is clear from the text." + "RULES for companies array:\n" + "- If ANY ticker from the list above appears verbatim in the text, " + "you MUST include it in companies with at least one evidence_span quote.\n" + "- If the article discusses a sector or theme that clearly affects a tracked company " + "(e.g. oil prices → XOM, AI chips → NVDA, interest rates → JPM), include that company.\n" + "- For each company: set sentiment (positive/negative/neutral/mixed), " + "impact_score (0.0-1.0), and copy a verbatim quote into evidence_spans.\n" + "- Do NOT invent tickers not in the list above." ) doc_id_line = f"Document ID: {document_id}\n" if document_id else "" user_prompt = f"""\ -Extract structured intelligence from the following document. +Extract structured intelligence from this document. Fill every field. {doc_id_line}Document type: {document_type} {doctype_guidance} {ticker_hint} -Return a JSON object with: summary, companies (array with ticker, company_name, relevance, sentiment, impact_score, impact_horizon, catalyst_type, key_facts, risks, evidence_spans), macro_themes, novelty_score, confidence, extraction_warnings. 
+Fill these fields: +- summary: 1-3 sentence summary of the document's main point +- companies: array of affected companies (see ticker rules above) +- macro_themes: list of broad market themes mentioned +- novelty_score: 0.0-1.0 how novel is this information +- confidence: 0.0-1.0 your confidence in the extraction quality +- extraction_warnings: list any issues -REMEMBER: -- Only extract what is explicitly in the text below. -- evidence_spans must be verbatim quotes from the text. -- If the text is insufficient, return empty companies and low confidence. -- Return ONLY the JSON object. No other text. +For each company entry fill: ticker, company_name, relevance (0-1), sentiment, impact_score (0-1), impact_horizon, catalyst_type, key_facts (list), risks (list), evidence_spans (verbatim quotes from text). --- DOCUMENT TEXT --- {document_text}