phase 17: switch to gemma4:e4b, rewrite prompts for fill-the-fields style with forced ticker inclusion

2026-04-12 10:05:31 -07:00
parent 2e42310f07
commit 66ed38bf18
3 changed files with 57 additions and 33 deletions
@@ -159,7 +159,7 @@ config:
  MINIO_ENDPOINT: "minio.minio-service.svc.cluster.local:80"
  MINIO_SECURE: "false"
  OLLAMA_BASE_URL: "http://ollama.ollama-service.svc.cluster.local:11434"
-  OLLAMA_MODEL: "qwen3.5:9b"
+  OLLAMA_MODEL: "gemma4:e4b"
  OLLAMA_TIMEOUT: "120"
  OLLAMA_MAX_RETRIES: "2"
  OLLAMA_RETRY_BASE_DELAY: "1.0"
@@ -0,0 +1,35 @@
+import asyncio, asyncpg, json, os, redis
+
+async def main():
+    """Reset low-quality filing docs to 'ingested' and enqueue a 20-doc test batch for parsing."""
+    pool = await asyncpg.create_pool(
+        host=os.environ["POSTGRES_HOST"],
+        port=int(os.environ["POSTGRES_PORT"]),
+        database=os.environ["POSTGRES_DB"],
+        user=os.environ["POSTGRES_USER"],
+        password=os.environ["POSTGRES_PASSWORD"],
+    )
+    # Keyword args, not an interpolated URL: a password containing '@', ':' or '/' would corrupt the URL.
+    r = redis.Redis(host=os.environ["REDIS_HOST"], port=int(os.environ["REDIS_PORT"]),
+                    password=os.environ.get("REDIS_PASSWORD") or None, db=0)
+    try:
+        # Reset filing docs to ingested. NOTE(review): this resets every match, but only 20 are enqueued below.
+        await pool.execute(
+            "UPDATE documents SET status = 'ingested', parse_quality_score = NULL, parse_confidence = NULL "
+            "WHERE source_type = 'filings_api' AND status = 'low_quality' AND url IS NOT NULL"
+        )
+        # DISTINCT ON: the LEFT JOIN yields one row per mention, which would enqueue a document repeatedly.
+        rows = await pool.fetch(
+            "SELECT DISTINCT ON (d.id) d.id, dcm.ticker FROM documents d "
+            "LEFT JOIN document_company_mentions dcm ON d.id = dcm.document_id "
+            "WHERE d.source_type = 'filings_api' AND d.status = 'ingested' "
+            "ORDER BY d.id, dcm.ticker LIMIT 20"  # Start with 20 to test; ORDER BY makes the batch stable
+        )
+        for row in rows:
+            r.rpush("stonks:queue:parsing", json.dumps({"document_id": str(row["id"]), "ticker": row["ticker"] or ""}))
+        print(f"Enqueued {len(rows)} filing docs for parsing (test batch)")
+    finally:
+        r.close()  # release both connections even if a query or push fails
+        await pool.close()
+
+asyncio.run(main())
@@ -15,7 +15,7 @@ from services.shared.schemas import (
    DocumentType,
 )

-PROMPT_VERSION = "document-intel-v1"
+PROMPT_VERSION = "document-intel-v2"

 # --- JSON schema for structured output (generated from Pydantic models) ---

@@ -24,25 +24,9 @@ EXTRACTION_JSON_SCHEMA: dict[str, Any] = generate_json_schema()
 # --- Anti-hallucination system prompt ---

 SYSTEM_PROMPT = """\
-You are a financial document analysis system. You extract structured intelligence \
-from financial documents into JSON.
-
-STRICT RULES — VIOLATIONS WILL INVALIDATE YOUR OUTPUT:
-
-1. ONLY extract information explicitly stated in the document text provided.
-2. NEVER fabricate facts, quotes, numbers, dates, or company names.
-3. NEVER infer information that is not directly supported by the text.
-4. If the document does not mention a company, do NOT include that company.
-5. If the document is ambiguous about sentiment or impact, use "neutral" or "mixed" \
-and set confidence lower.
-6. evidence_spans MUST be short verbatim quotes copied from the document. \
-Do NOT paraphrase or invent quotes.
-7. key_facts MUST be directly stated in the document. Do NOT add external knowledge.
-8. If you are uncertain about any field, lower the confidence score and add a warning \
-to extraction_warnings.
-9. If the document text is too short, garbled, or uninformative, return an empty \
-companies array, set confidence below 0.3, and add "insufficient_content" to warnings.
-10. Return ONLY valid JSON matching the provided schema. No commentary, no markdown fences."""
+You extract structured financial intelligence from documents into JSON. \
+Read the document text carefully and fill every field. \
+Return ONLY valid JSON. No commentary, no markdown, no explanation."""

 # --- Document-type-specific guidance ---

@@ -102,28 +86,33 @@ def build_extraction_prompt(
        tickers_str = ", ".join(known_tickers)
        ticker_hint = (
            f"\nTracked tickers: {tickers_str}\n"
-            "If the document discusses market conditions, sectors, or themes that directly affect "
-            "any of these companies, include those companies with appropriate sentiment and impact scores. "
-            "A macro article about oil prices IS relevant to energy companies like XOM. "
-            "A tech sector article IS relevant to AAPL, MSFT, NVDA, etc. "
-            "Use your judgment — but only include companies where the connection is clear from the text."
+            "RULES for companies array:\n"
+            "- If ANY ticker from the list above appears verbatim in the text, "
+            "you MUST include it in companies with at least one evidence_span quote.\n"
+            "- If the article discusses a sector or theme that clearly affects a tracked company "
+            "(e.g. oil prices → XOM, AI chips → NVDA, interest rates → JPM), include that company.\n"
+            "- For each company: set sentiment (positive/negative/neutral/mixed), "
+            "impact_score (0.0-1.0), and copy a verbatim quote into evidence_spans.\n"
+            "- Do NOT invent tickers not in the list above."
        )

    doc_id_line = f"Document ID: {document_id}\n" if document_id else ""

    user_prompt = f"""\
-Extract structured intelligence from the following document.
+Extract structured intelligence from this document. Fill every field.

 {doc_id_line}Document type: {document_type}
 {doctype_guidance}
 {ticker_hint}
-Return a JSON object with: summary, companies (array with ticker, company_name, relevance, sentiment, impact_score, impact_horizon, catalyst_type, key_facts, risks, evidence_spans), macro_themes, novelty_score, confidence, extraction_warnings.
+Fill these fields:
+- summary: 1-3 sentence summary of the document's main point
+- companies: array of affected companies (see ticker rules above)
+- macro_themes: list of broad market themes mentioned
+- novelty_score: 0.0-1.0 how novel is this information
+- confidence: 0.0-1.0 your confidence in the extraction quality
+- extraction_warnings: list any issues

-REMEMBER:
- Only extract what is explicitly in the text below.
- evidence_spans must be verbatim quotes from the text.
- If the text is insufficient, return empty companies and low confidence.
- Return ONLY the JSON object. No other text.
+For each company entry fill: ticker, company_name, relevance (0-1), sentiment, impact_score (0-1), impact_horizon, catalyst_type, key_facts (list), risks (list), evidence_spans (verbatim quotes from text).

 --- DOCUMENT TEXT ---
 {document_text}