phase 17: switch to gemma4:e4b, rewrite prompts for fill-the-fields style with forced ticker inclusion

This commit is contained in:
Celes Renata
2026-04-12 10:05:31 -07:00
parent 2e42310f07
commit 66ed38bf18
3 changed files with 57 additions and 33 deletions
+1 -1
View File
@@ -159,7 +159,7 @@ config:
MINIO_ENDPOINT: "minio.minio-service.svc.cluster.local:80"
MINIO_SECURE: "false"
OLLAMA_BASE_URL: "http://ollama.ollama-service.svc.cluster.local:11434"
OLLAMA_MODEL: "qwen3.5:9b"
OLLAMA_MODEL: "gemma4:e4b"
OLLAMA_TIMEOUT: "120"
OLLAMA_MAX_RETRIES: "2"
OLLAMA_RETRY_BASE_DELAY: "1.0"
+35
View File
@@ -0,0 +1,35 @@
import asyncio, asyncpg, json, os, redis
async def main():
    """Reset low-quality filing documents and enqueue a test batch for parsing.

    Steps:
      1. Open an asyncpg pool from the POSTGRES_* environment variables.
      2. Open a Redis client from REDIS_HOST / REDIS_PORT / REDIS_PASSWORD.
      3. Set every ``filings_api`` document that is ``low_quality`` and has a
         URL back to ``ingested``, clearing its parse scores.
      4. Select up to 20 ``ingested`` filing rows (joined with their company
         mentions) and push one JSON message per row onto the
         ``stonks:queue:parsing`` Redis list.

    Raises:
        KeyError: if a required environment variable is missing.
    """
    # Function-scope import so the file's top-level import line is untouched.
    from urllib.parse import quote

    pool = await asyncpg.create_pool(
        host=os.environ["POSTGRES_HOST"],
        port=int(os.environ["POSTGRES_PORT"]),
        database=os.environ["POSTGRES_DB"],
        user=os.environ["POSTGRES_USER"],
        password=os.environ["POSTGRES_PASSWORD"],
    )
    try:
        # Percent-encode the password: characters such as '@', ':', '/', '#'
        # or '%' would otherwise corrupt the URL. redis-py URL-decodes the
        # credentials again when it parses the URL.
        redis_password = quote(os.environ.get("REDIS_PASSWORD", ""), safe="")
        r = redis.from_url(
            f"redis://:{redis_password}"
            f"@{os.environ['REDIS_HOST']}:{os.environ['REDIS_PORT']}/0"
        )
        try:
            # Reset filing docs to ingested
            await pool.execute(
                "UPDATE documents SET status = 'ingested', parse_quality_score = NULL, parse_confidence = NULL "
                "WHERE source_type = 'filings_api' AND status = 'low_quality' AND url IS NOT NULL"
            )
            # NOTE(review): the LEFT JOIN can presumably return several rows
            # for one document (one per mention), and LIMIT without ORDER BY
            # picks an arbitrary subset -- confirm this is acceptable for the
            # test batch. Documents reset above but not selected here stay
            # 'ingested' without being enqueued by this script.
            rows = await pool.fetch(
                "SELECT d.id, dcm.ticker FROM documents d "
                "LEFT JOIN document_company_mentions dcm ON d.id = dcm.document_id "
                "WHERE d.source_type = 'filings_api' AND d.status = 'ingested' "
                "LIMIT 20"  # Start with 20 to test
            )
            for row in rows:
                r.rpush("stonks:queue:parsing", json.dumps({
                    "document_id": str(row["id"]),
                    # A document without a mention row has a NULL ticker.
                    "ticker": row["ticker"] or "",
                }))
            print(f"Enqueued {len(rows)} filing docs for parsing (test batch)")
        finally:
            # Release the Redis connection even if the DB work or a push failed.
            r.close()
    finally:
        # Always close the pool; previously it leaked on any exception.
        await pool.close()


asyncio.run(main())
+21 -32
View File
@@ -15,7 +15,7 @@ from services.shared.schemas import (
DocumentType,
)
PROMPT_VERSION = "document-intel-v1"
PROMPT_VERSION = "document-intel-v2"
# --- JSON schema for structured output (generated from Pydantic models) ---
@@ -24,25 +24,9 @@ EXTRACTION_JSON_SCHEMA: dict[str, Any] = generate_json_schema()
# --- Anti-hallucination system prompt ---
SYSTEM_PROMPT = """\
You are a financial document analysis system. You extract structured intelligence \
from financial documents into JSON.
STRICT RULES — VIOLATIONS WILL INVALIDATE YOUR OUTPUT:
1. ONLY extract information explicitly stated in the document text provided.
2. NEVER fabricate facts, quotes, numbers, dates, or company names.
3. NEVER infer information that is not directly supported by the text.
4. If the document does not mention a company, do NOT include that company.
5. If the document is ambiguous about sentiment or impact, use "neutral" or "mixed" \
and set confidence lower.
6. evidence_spans MUST be short verbatim quotes copied from the document. \
Do NOT paraphrase or invent quotes.
7. key_facts MUST be directly stated in the document. Do NOT add external knowledge.
8. If you are uncertain about any field, lower the confidence score and add a warning \
to extraction_warnings.
9. If the document text is too short, garbled, or uninformative, return an empty \
companies array, set confidence below 0.3, and add "insufficient_content" to warnings.
10. Return ONLY valid JSON matching the provided schema. No commentary, no markdown fences."""
You extract structured financial intelligence from documents into JSON. \
Read the document text carefully and fill every field. \
Return ONLY valid JSON. No commentary, no markdown, no explanation."""
# --- Document-type-specific guidance ---
@@ -102,28 +86,33 @@ def build_extraction_prompt(
tickers_str = ", ".join(known_tickers)
ticker_hint = (
f"\nTracked tickers: {tickers_str}\n"
"If the document discusses market conditions, sectors, or themes that directly affect "
"any of these companies, include those companies with appropriate sentiment and impact scores. "
"A macro article about oil prices IS relevant to energy companies like XOM. "
"A tech sector article IS relevant to AAPL, MSFT, NVDA, etc. "
"Use your judgment — but only include companies where the connection is clear from the text."
"RULES for companies array:\n"
"- If ANY ticker from the list above appears verbatim in the text, "
"you MUST include it in companies with at least one evidence_span quote.\n"
"- If the article discusses a sector or theme that clearly affects a tracked company "
"(e.g. oil prices → XOM, AI chips → NVDA, interest rates → JPM), include that company.\n"
"- For each company: set sentiment (positive/negative/neutral/mixed), "
"impact_score (0.0-1.0), and copy a verbatim quote into evidence_spans.\n"
"- Do NOT invent tickers not in the list above."
)
doc_id_line = f"Document ID: {document_id}\n" if document_id else ""
user_prompt = f"""\
Extract structured intelligence from the following document.
Extract structured intelligence from this document. Fill every field.
{doc_id_line}Document type: {document_type}
{doctype_guidance}
{ticker_hint}
Return a JSON object with: summary, companies (array with ticker, company_name, relevance, sentiment, impact_score, impact_horizon, catalyst_type, key_facts, risks, evidence_spans), macro_themes, novelty_score, confidence, extraction_warnings.
Fill these fields:
- summary: 1-3 sentence summary of the document's main point
- companies: array of affected companies (see ticker rules above)
- macro_themes: list of broad market themes mentioned
- novelty_score: 0.0-1.0 how novel is this information
- confidence: 0.0-1.0 your confidence in the extraction quality
- extraction_warnings: list any issues
REMEMBER:
- Only extract what is explicitly in the text below.
- evidence_spans must be verbatim quotes from the text.
- If the text is insufficient, return empty companies and low confidence.
- Return ONLY the JSON object. No other text.
For each company entry fill: ticker, company_name, relevance (0-1), sentiment, impact_score (0-1), impact_horizon, catalyst_type, key_facts (list), risks (list), evidence_spans (verbatim quotes from text).
--- DOCUMENT TEXT ---
{document_text}