phase 17: switch to gemma4:e4b, rewrite prompts for fill-the-fields style with forced ticker inclusion
This commit is contained in:
@@ -159,7 +159,7 @@ config:
|
||||
MINIO_ENDPOINT: "minio.minio-service.svc.cluster.local:80"
|
||||
MINIO_SECURE: "false"
|
||||
OLLAMA_BASE_URL: "http://ollama.ollama-service.svc.cluster.local:11434"
|
||||
OLLAMA_MODEL: "qwen3.5:9b"
|
||||
OLLAMA_MODEL: "gemma4:e4b"
|
||||
OLLAMA_TIMEOUT: "120"
|
||||
OLLAMA_MAX_RETRIES: "2"
|
||||
OLLAMA_RETRY_BASE_DELAY: "1.0"
|
||||
|
||||
@@ -0,0 +1,35 @@
|
||||
import asyncio, asyncpg, json, os, redis
|
||||
|
||||
async def main():
    """Reset low-quality filing documents and enqueue a test batch for parsing.

    Steps:
      1. Flip every ``filings_api`` document that is currently ``low_quality``
         (and has a URL) back to ``ingested``, clearing its parse scores.
      2. Select up to 20 ``ingested`` filing rows, joined with their company
         mentions.
      3. Push one JSON job per selected row onto the ``stonks:queue:parsing``
         Redis list.

    Connection settings are read from the ``POSTGRES_*`` and ``REDIS_*``
    environment variables; a missing required variable raises ``KeyError``.
    """
    from urllib.parse import quote

    # Percent-encode the password so characters such as '@', '/', ':' or '%'
    # cannot corrupt the URL; redis-py unquotes it again when parsing.
    redis_password = quote(os.environ.get("REDIS_PASSWORD", ""), safe="")
    redis_url = (
        f"redis://:{redis_password}"
        f"@{os.environ['REDIS_HOST']}:{os.environ['REDIS_PORT']}/0"
    )

    # `async with` guarantees the pool is closed even if a query fails.
    async with asyncpg.create_pool(
        host=os.environ["POSTGRES_HOST"],
        port=int(os.environ["POSTGRES_PORT"]),
        database=os.environ["POSTGRES_DB"],
        user=os.environ["POSTGRES_USER"],
        password=os.environ["POSTGRES_PASSWORD"],
    ) as pool:
        r = redis.from_url(redis_url)
        try:
            # Reset filing docs to ingested
            # NOTE(review): this resets ALL matching rows, while only 20 rows
            # are enqueued below — confirm the remainder is picked up elsewhere.
            await pool.execute(
                "UPDATE documents SET status = 'ingested', parse_quality_score = NULL, parse_confidence = NULL "
                "WHERE source_type = 'filings_api' AND status = 'low_quality' AND url IS NOT NULL"
            )

            # NOTE(review): the LEFT JOIN can return the same document several
            # times (one row per mention), so a document may be enqueued more
            # than once and the LIMIT counts rows, not documents — confirm
            # this is intended.
            rows = await pool.fetch(
                "SELECT d.id, dcm.ticker FROM documents d "
                "LEFT JOIN document_company_mentions dcm ON d.id = dcm.document_id "
                "WHERE d.source_type = 'filings_api' AND d.status = 'ingested' "
                "LIMIT 20"  # Start with 20 to test
            )

            for row in rows:
                r.rpush("stonks:queue:parsing", json.dumps({
                    "document_id": str(row["id"]),
                    # Documents without a mention row get an empty ticker.
                    "ticker": row["ticker"] or "",
                }))

            print(f"Enqueued {len(rows)} filing docs for parsing (test batch)")
        finally:
            # Release the Redis connection(s) on both success and failure.
            r.close()
|
||||
|
||||
# Run only when executed as a script, so importing this module does not
# trigger the database reset and queue writes.
if __name__ == "__main__":
    asyncio.run(main())
|
||||
@@ -15,7 +15,7 @@ from services.shared.schemas import (
|
||||
DocumentType,
|
||||
)
|
||||
|
||||
PROMPT_VERSION = "document-intel-v1"
|
||||
PROMPT_VERSION = "document-intel-v2"
|
||||
|
||||
# --- JSON schema for structured output (generated from Pydantic models) ---
|
||||
|
||||
@@ -24,25 +24,9 @@ EXTRACTION_JSON_SCHEMA: dict[str, Any] = generate_json_schema()
|
||||
# --- Anti-hallucination system prompt ---
|
||||
|
||||
SYSTEM_PROMPT = """\
|
||||
You are a financial document analysis system. You extract structured intelligence \
|
||||
from financial documents into JSON.
|
||||
|
||||
STRICT RULES — VIOLATIONS WILL INVALIDATE YOUR OUTPUT:
|
||||
|
||||
1. ONLY extract information explicitly stated in the document text provided.
|
||||
2. NEVER fabricate facts, quotes, numbers, dates, or company names.
|
||||
3. NEVER infer information that is not directly supported by the text.
|
||||
4. If the document does not mention a company, do NOT include that company.
|
||||
5. If the document is ambiguous about sentiment or impact, use "neutral" or "mixed" \
|
||||
and set confidence lower.
|
||||
6. evidence_spans MUST be short verbatim quotes copied from the document. \
|
||||
Do NOT paraphrase or invent quotes.
|
||||
7. key_facts MUST be directly stated in the document. Do NOT add external knowledge.
|
||||
8. If you are uncertain about any field, lower the confidence score and add a warning \
|
||||
to extraction_warnings.
|
||||
9. If the document text is too short, garbled, or uninformative, return an empty \
|
||||
companies array, set confidence below 0.3, and add "insufficient_content" to warnings.
|
||||
10. Return ONLY valid JSON matching the provided schema. No commentary, no markdown fences."""
|
||||
You extract structured financial intelligence from documents into JSON. \
|
||||
Read the document text carefully and fill every field. \
|
||||
Return ONLY valid JSON. No commentary, no markdown, no explanation."""
|
||||
|
||||
# --- Document-type-specific guidance ---
|
||||
|
||||
@@ -102,28 +86,33 @@ def build_extraction_prompt(
|
||||
tickers_str = ", ".join(known_tickers)
|
||||
ticker_hint = (
|
||||
f"\nTracked tickers: {tickers_str}\n"
|
||||
"If the document discusses market conditions, sectors, or themes that directly affect "
|
||||
"any of these companies, include those companies with appropriate sentiment and impact scores. "
|
||||
"A macro article about oil prices IS relevant to energy companies like XOM. "
|
||||
"A tech sector article IS relevant to AAPL, MSFT, NVDA, etc. "
|
||||
"Use your judgment — but only include companies where the connection is clear from the text."
|
||||
"RULES for companies array:\n"
|
||||
"- If ANY ticker from the list above appears verbatim in the text, "
|
||||
"you MUST include it in companies with at least one evidence_span quote.\n"
|
||||
"- If the article discusses a sector or theme that clearly affects a tracked company "
|
||||
"(e.g. oil prices → XOM, AI chips → NVDA, interest rates → JPM), include that company.\n"
|
||||
"- For each company: set sentiment (positive/negative/neutral/mixed), "
|
||||
"impact_score (0.0-1.0), and copy a verbatim quote into evidence_spans.\n"
|
||||
"- Do NOT invent tickers not in the list above."
|
||||
)
|
||||
|
||||
doc_id_line = f"Document ID: {document_id}\n" if document_id else ""
|
||||
|
||||
user_prompt = f"""\
|
||||
Extract structured intelligence from the following document.
|
||||
Extract structured intelligence from this document. Fill every field.
|
||||
|
||||
{doc_id_line}Document type: {document_type}
|
||||
{doctype_guidance}
|
||||
{ticker_hint}
|
||||
Return a JSON object with: summary, companies (array with ticker, company_name, relevance, sentiment, impact_score, impact_horizon, catalyst_type, key_facts, risks, evidence_spans), macro_themes, novelty_score, confidence, extraction_warnings.
|
||||
Fill these fields:
|
||||
- summary: 1-3 sentence summary of the document's main point
|
||||
- companies: array of affected companies (see ticker rules above)
|
||||
- macro_themes: list of broad market themes mentioned
|
||||
- novelty_score: 0.0-1.0 how novel is this information
|
||||
- confidence: 0.0-1.0 your confidence in the extraction quality
|
||||
- extraction_warnings: list any issues
|
||||
|
||||
REMEMBER:
|
||||
- Only extract what is explicitly in the text below.
|
||||
- evidence_spans must be verbatim quotes from the text.
|
||||
- If the text is insufficient, return empty companies and low confidence.
|
||||
- Return ONLY the JSON object. No other text.
|
||||
For each company entry fill: ticker, company_name, relevance (0-1), sentiment, impact_score (0-1), impact_horizon, catalyst_type, key_facts (list), risks (list), evidence_spans (verbatim quotes from text).
|
||||
|
||||
--- DOCUMENT TEXT ---
|
||||
{document_text}
|
||||
|
||||
Reference in New Issue
Block a user