phase 17: pass all tracked tickers to extractor, soften prompt for macro-to-company relevance

This commit is contained in:
Celes Renata
2026-04-12 09:18:08 -07:00
parent 59f89d03d2
commit 57d0fc7d33
6 changed files with 142 additions and 4 deletions
+28
View File
@@ -0,0 +1,28 @@
from minio import Minio
import os, json
BUCKET = "stonks-llm-results"
PREVIEW_LIMIT = 5  # only the first few objects are inspected in detail


def ticker_from_key(object_name):
    """Return the second "/"-separated segment of an object key.

    The script prints this segment as the ticker. Keys that contain no "/"
    yield "?" instead of raising.
    """
    segments = object_name.split("/")
    return segments[1] if len(segments) > 1 else "?"


def describe_output(raw_out):
    """Return the detail lines to print for one raw model output.

    When *raw_out* is JSON with the expected object shape, the lines show the
    summary (first 80 characters), the confidence, the number of companies and
    up to three company entries. Anything else (invalid JSON, a non-object
    payload, fields of an unexpected type) yields a single line with the first
    120 characters of the raw text, so a malformed result never produces a
    half-printed report.
    """
    try:
        parsed = json.loads(raw_out)
        companies = parsed.get("companies", [])
        summary = parsed.get("summary", "")[:80]
        conf = parsed.get("confidence", "?")
        lines = [
            f" summary: {summary}",
            f" confidence: {conf}",
            f" companies: {len(companies)}",
        ]
        lines.extend(
            f" {c.get('ticker','?')} sentiment={c.get('sentiment','?')} impact={c.get('impact_score','?')}"
            for c in companies[:3]
        )
    except (ValueError, TypeError, AttributeError, KeyError):
        # json.JSONDecodeError is a ValueError; the other types cover payloads
        # that parse but are not shaped like an extraction object. Unlike a
        # bare ``except``, Ctrl-C and SystemExit still propagate.
        return [f" raw: {str(raw_out)[:120]}"]
    return lines


def read_json_object(client, object_name):
    """Download one object from BUCKET and decode its body as JSON."""
    response = client.get_object(BUCKET, object_name)
    try:
        return json.loads(response.read())
    finally:
        # The MinIO client expects the caller to close the response and hand
        # the connection back to the pool once the body has been consumed.
        response.close()
        response.release_conn()


def main():
    """Print the object count and a short report for the first few results."""
    mc = Minio(
        os.environ["MINIO_ENDPOINT"],
        access_key=os.environ["MINIO_ACCESS_KEY"],
        secret_key=os.environ["MINIO_SECRET_KEY"],
        secure=False,
    )
    raw_objs = list(mc.list_objects(BUCKET, recursive=True))
    print(f"LLM result objects: {len(raw_objs)}")
    for o in raw_objs[:PREVIEW_LIMIT]:
        data = read_json_object(mc, o.object_name)
        attempts = data.get("attempts", [])
        if not attempts:
            continue
        # Only the final attempt is reported; a null raw_output is treated
        # as empty text so len() below cannot fail.
        raw_out = attempts[-1].get("raw_output") or ""
        success = data.get("success", False)
        ticker = ticker_from_key(o.object_name)
        print(f" {ticker}: success={success} output_len={len(raw_out)}")
        for line in describe_output(raw_out):
            print(line)
        print()


if __name__ == "__main__":
    main()
+44
View File
@@ -0,0 +1,44 @@
from minio import Minio
import os, json
BUCKET = "stonks-llm-results"
our_tickers = {"AAPL","MSFT","NVDA","AMZN","GOOGL","JPM","JNJ","XOM","TSLA","META"}


def tally_output(raw_out, tracked=None):
    """Classify the companies named in one raw model output.

    Returns ``(hits, unknown)`` where *hits* maps each ticker found in
    *tracked* (``our_tickers`` when omitted) to its number of mentions and
    *unknown* counts every other company entry. Returns ``None`` when
    *raw_out* is not valid JSON or is not shaped like an extraction object,
    so the caller can count the record as unparseable instead of silently
    dropping it or keeping half-updated totals.
    """
    if tracked is None:
        tracked = our_tickers
    try:
        parsed = json.loads(raw_out)
        hits = {}
        unknown = 0
        # ``or []`` lets an explicit null behave like a missing list.
        for c in parsed.get("companies", []) or []:
            t = c.get("ticker", "")
            if t in tracked:
                hits[t] = hits.get(t, 0) + 1
            else:
                unknown += 1
    except (ValueError, TypeError, AttributeError):
        # json.JSONDecodeError is a ValueError; TypeError/AttributeError cover
        # payloads that parse but have the wrong structure.
        return None
    return hits, unknown


def read_json_object(client, object_name):
    """Download one object from BUCKET and decode its body as JSON."""
    response = client.get_object(BUCKET, object_name)
    try:
        return json.loads(response.read())
    finally:
        # The MinIO client expects the caller to close the response and hand
        # the connection back to the pool once the body has been consumed.
        response.close()
        response.release_conn()


def main():
    """Count how many successful extractions named companies, and which."""
    mc = Minio(
        os.environ["MINIO_ENDPOINT"],
        access_key=os.environ["MINIO_ACCESS_KEY"],
        secret_key=os.environ["MINIO_SECRET_KEY"],
        secure=False,
    )
    total = 0
    with_companies = 0
    empty_companies = 0
    wrong_tickers = 0
    unparseable = 0
    ticker_hits = {}
    for o in mc.list_objects(BUCKET, recursive=True):
        data = read_json_object(mc, o.object_name)
        if not data.get("success"):
            continue
        attempts = data.get("attempts", [])
        if not attempts:
            continue
        # Only the final attempt of each result is tallied.
        tally = tally_output(attempts[-1].get("raw_output", ""))
        if tally is None:
            unparseable += 1
            continue
        hits, unknown = tally
        total += 1
        if hits or unknown:
            with_companies += 1
        else:
            empty_companies += 1
        wrong_tickers += unknown
        for t, count in hits.items():
            ticker_hits[t] = ticker_hits.get(t, 0) + count
    print(f"Total successful extractions: {total}")
    print(f" With companies: {with_companies}")
    print(f" Empty companies: {empty_companies}")
    print(f" Wrong/unknown tickers: {wrong_tickers}")
    print(f" Our ticker hits: {ticker_hits}")
    print(f" Unparseable outputs: {unparseable}")


if __name__ == "__main__":
    main()
+28
View File
@@ -0,0 +1,28 @@
from minio import Minio
import os, json
BUCKET = "stonks-llm-results"
CONFIDENCE_THRESHOLD = 0.5


def ticker_from_key(object_name):
    """Return the second "/"-separated segment of an object key.

    The script prints this segment as the ticker. Keys that contain no "/"
    yield "?" instead of raising.
    """
    segments = object_name.split("/")
    return segments[1] if len(segments) > 1 else "?"


def confident_but_empty(raw_out, threshold=CONFIDENCE_THRESHOLD):
    """Return ``(confidence, summary)`` for a confident output with no companies.

    Returns ``None`` when *raw_out* is not a JSON object, when its
    ``confidence`` is missing, non-numeric or below *threshold*, or when it
    lists at least one company. A missing or null summary is reported as an
    empty string.
    """
    try:
        parsed = json.loads(raw_out)
    except (ValueError, TypeError):
        # json.JSONDecodeError is a ValueError; TypeError covers non-text input.
        return None
    if not isinstance(parsed, dict):
        return None
    conf = parsed.get("confidence", 0)
    # Checked explicitly so a null or textual confidence is skipped on purpose
    # rather than by a swallowed TypeError; ``>=`` also keeps NaN out.
    is_confident = isinstance(conf, (int, float)) and conf >= threshold
    if not is_confident or parsed.get("companies", []):
        return None
    summary = parsed.get("summary", "")
    return conf, "" if summary is None else str(summary)


def read_json_object(client, object_name):
    """Download one object from BUCKET and decode its body as JSON."""
    response = client.get_object(BUCKET, object_name)
    try:
        return json.loads(response.read())
    finally:
        # The MinIO client expects the caller to close the response and hand
        # the connection back to the pool once the body has been consumed.
        response.close()
        response.release_conn()


def main():
    """Print every successful extraction that is confident yet names no company."""
    mc = Minio(
        os.environ["MINIO_ENDPOINT"],
        access_key=os.environ["MINIO_ACCESS_KEY"],
        secret_key=os.environ["MINIO_SECRET_KEY"],
        secure=False,
    )
    for o in mc.list_objects(BUCKET, recursive=True):
        data = read_json_object(mc, o.object_name)
        if not data.get("success"):
            continue
        attempts = data.get("attempts", [])
        if not attempts:
            continue
        # Only the final attempt of each result is inspected.
        finding = confident_but_empty(attempts[-1].get("raw_output", ""))
        if finding is None:
            continue
        conf, summary = finding
        ticker = ticker_from_key(o.object_name)
        print(f"HIGH CONF ({conf}) NO COMPANIES - {ticker}:")
        print(f" summary: {summary[:120]}")
        print()


if __name__ == "__main__":
    main()
+33
View File
@@ -0,0 +1,33 @@
from minio import Minio
import os, json
BUCKET = "stonks-llm-results"
RECENT_LIMIT = 3  # how many of the newest objects to show


def key_fields(object_name):
    """Return the ``(ticker, doc_id)`` pair the report prints for an object key.

    The ticker is the second "/"-separated segment and the document id is the
    second-to-last one. Keys with fewer than two segments yield ``("?", "?")``
    instead of raising IndexError.
    """
    segments = object_name.split("/")
    if len(segments) < 2:
        return "?", "?"
    return segments[1], segments[-2]


def describe_extraction(raw_out):
    """Return the detail lines to print for one raw model output.

    When *raw_out* is JSON with the expected object shape, the lines show the
    summary (first 120 characters), confidence, company count, macro themes
    and one line per company. Anything else yields a single line with the
    first 200 characters of the raw text.
    """
    try:
        parsed = json.loads(raw_out)
        companies = parsed.get("companies", [])
        lines = [
            f" summary: {parsed.get('summary', '')[:120]}",
            f" confidence: {parsed.get('confidence')}",
            f" companies: {len(companies)}",
            f" macro_themes: {parsed.get('macro_themes', [])}",
        ]
        lines.extend(
            f" -> {c.get('ticker')} sent={c.get('sentiment')} impact={c.get('impact_score')} catalyst={c.get('catalyst_type')}"
            for c in companies
        )
    except (ValueError, TypeError, AttributeError):
        # json.JSONDecodeError is a ValueError; the other types cover payloads
        # that parse but are not shaped like an extraction object. Unlike a
        # bare ``except``, Ctrl-C and SystemExit still propagate.
        return [f" raw output: {str(raw_out)[:200]}"]
    return lines


def read_json_object(client, object_name):
    """Download one object from BUCKET and decode its body as JSON."""
    response = client.get_object(BUCKET, object_name)
    try:
        return json.loads(response.read())
    finally:
        # The MinIO client expects the caller to close the response and hand
        # the connection back to the pool once the body has been consumed.
        response.close()
        response.release_conn()


def main():
    """Print a report for the most recently modified result objects."""
    mc = Minio(
        os.environ["MINIO_ENDPOINT"],
        access_key=os.environ["MINIO_ACCESS_KEY"],
        secret_key=os.environ["MINIO_SECRET_KEY"],
        secure=False,
    )
    # Newest first, so the slice below picks the latest results.
    raw_objs = sorted(
        mc.list_objects(BUCKET, recursive=True),
        key=lambda o: o.last_modified,
        reverse=True,
    )
    for o in raw_objs[:RECENT_LIMIT]:
        data = read_json_object(mc, o.object_name)
        attempts = data.get("attempts", [])
        if not attempts:
            continue
        # Only the final attempt is reported; a null raw_output is treated
        # as empty text.
        raw_out = attempts[-1].get("raw_output") or ""
        ticker, doc_id = key_fields(o.object_name)
        print(f"=== {ticker} / {doc_id[:8]} ===")
        print(f" success: {data.get('success')}")
        print(f" duration: {data.get('total_duration_ms')}ms")
        for line in describe_extraction(raw_out):
            print(line)
        print()


if __name__ == "__main__":
    main()
+3 -1
View File
@@ -88,10 +88,12 @@ async def main() -> None:
company_id_map = await _build_company_id_map(pool) company_id_map = await _build_company_id_map(pool)
try: try:
# Pass all tracked tickers so the model can identify any mentioned companies
all_tickers = list(company_id_map.keys()) if company_id_map else ([ticker] if ticker else None)
extraction_response = await ollama.extract( extraction_response = await ollama.extract(
text, text,
document_id=document_id, document_id=document_id,
known_tickers=[ticker] if ticker else None, known_tickers=all_tickers,
) )
result = await persist_extraction( result = await persist_extraction(
pool=pool, pool=pool,
+6 -3
View File
@@ -102,9 +102,12 @@ def build_extraction_prompt(
if known_tickers: if known_tickers:
tickers_str = ", ".join(known_tickers) tickers_str = ", ".join(known_tickers)
ticker_hint = ( ticker_hint = (
f"\nThe following tickers may be referenced in this document: {tickers_str}\n" f"\nTracked tickers: {tickers_str}\n"
"Only include a ticker in your output if the document actually discusses that company. " "If the document discusses market conditions, sectors, or themes that directly affect "
"Do NOT include a ticker just because it appears in this hint." "any of these companies, include those companies with appropriate sentiment and impact scores. "
"A macro article about oil prices IS relevant to energy companies like XOM. "
"A tech sector article IS relevant to AAPL, MSFT, NVDA, etc. "
"Use your judgment — but only include companies where the connection is clear from the text."
) )
schema_str = json.dumps(EXTRACTION_JSON_SCHEMA, indent=2) schema_str = json.dumps(EXTRACTION_JSON_SCHEMA, indent=2)