From 57d0fc7d339b0190dfa5dfa9df5bfb818a5ca4d5 Mon Sep 17 00:00:00 2001
From: Celes Renata
Date: Sun, 12 Apr 2026 09:18:08 -0700
Subject: [PATCH] phase 17: pass all tracked tickers to extractor, soften prompt for macro-to-company relevance

---
NOTE(review): the line breaks and indentation of this patch were reconstructed
from a whitespace-collapsed copy. The four new scripts were re-indented from
their Python syntax; their line counts match the hunk headers (28/44/28/33),
but the depth of the trailing `print()` in check_llm.py and check_recent.py,
and any runs of spaces inside string literals, are assumptions -- TODO confirm.
The base indentation of the context lines in services/extractor/main.py is
likewise assumed; verify against the target file before applying.
Follow-up candidates: the new scripts use bare `except:` (which also traps
KeyboardInterrupt/SystemExit) and read `get_object()` responses without
closing them.

 scripts/check_llm.py          | 28 ++++++++++++++++++++++
 scripts/check_llm2.py         | 44 +++++++++++++++++++++++++++++++++++
 scripts/check_llm3.py         | 28 ++++++++++++++++++++++
 scripts/check_recent.py       | 33 ++++++++++++++++++++++++++
 services/extractor/main.py    |  4 +++-
 services/extractor/prompts.py |  9 ++++---
 6 files changed, 142 insertions(+), 4 deletions(-)
 create mode 100644 scripts/check_llm.py
 create mode 100644 scripts/check_llm2.py
 create mode 100644 scripts/check_llm3.py
 create mode 100644 scripts/check_recent.py

diff --git a/scripts/check_llm.py b/scripts/check_llm.py
new file mode 100644
index 0000000..afba6bc
--- /dev/null
+++ b/scripts/check_llm.py
@@ -0,0 +1,28 @@
+from minio import Minio
+import os, json
+
+mc = Minio(os.environ["MINIO_ENDPOINT"], access_key=os.environ["MINIO_ACCESS_KEY"], secret_key=os.environ["MINIO_SECRET_KEY"], secure=False)
+raw_objs = list(mc.list_objects("stonks-llm-results", recursive=True))
+print(f"LLM result objects: {len(raw_objs)}")
+for o in raw_objs[:5]:
+    data = json.loads(mc.get_object("stonks-llm-results", o.object_name).read())
+    success = data.get("success", False)
+    attempts = data.get("attempts", [])
+    ticker = o.object_name.split("/")[1] if "/" in o.object_name else "?"
+    if attempts:
+        last = attempts[-1]
+        raw_out = last.get("raw_output", "")
+        print(f" {ticker}: success={success} output_len={len(raw_out)}")
+        try:
+            parsed = json.loads(raw_out)
+            companies = parsed.get("companies", [])
+            summary = parsed.get("summary", "")[:80]
+            conf = parsed.get("confidence", "?")
+            print(f" summary: {summary}")
+            print(f" confidence: {conf}")
+            print(f" companies: {len(companies)}")
+            for c in companies[:3]:
+                print(f" {c.get('ticker','?')} sentiment={c.get('sentiment','?')} impact={c.get('impact_score','?')}")
+        except:
+            print(f" raw: {raw_out[:120]}")
+    print()
diff --git a/scripts/check_llm2.py b/scripts/check_llm2.py
new file mode 100644
index 0000000..9d777e9
--- /dev/null
+++ b/scripts/check_llm2.py
@@ -0,0 +1,44 @@
+from minio import Minio
+import os, json
+
+mc = Minio(os.environ["MINIO_ENDPOINT"], access_key=os.environ["MINIO_ACCESS_KEY"], secret_key=os.environ["MINIO_SECRET_KEY"], secure=False)
+raw_objs = list(mc.list_objects("stonks-llm-results", recursive=True))
+
+# Count companies found vs not found
+total = 0
+with_companies = 0
+empty_companies = 0
+wrong_tickers = 0
+our_tickers = {"AAPL","MSFT","NVDA","AMZN","GOOGL","JPM","JNJ","XOM","TSLA","META"}
+ticker_hits = {}
+
+for o in raw_objs:
+    data = json.loads(mc.get_object("stonks-llm-results", o.object_name).read())
+    if not data.get("success"):
+        continue
+    attempts = data.get("attempts", [])
+    if not attempts:
+        continue
+    raw_out = attempts[-1].get("raw_output", "")
+    try:
+        parsed = json.loads(raw_out)
+        companies = parsed.get("companies", [])
+        total += 1
+        if companies:
+            with_companies += 1
+            for c in companies:
+                t = c.get("ticker", "")
+                if t in our_tickers:
+                    ticker_hits[t] = ticker_hits.get(t, 0) + 1
+                else:
+                    wrong_tickers += 1
+        else:
+            empty_companies += 1
+    except:
+        pass
+
+print(f"Total successful extractions: {total}")
+print(f" With companies: {with_companies}")
+print(f" Empty companies: {empty_companies}")
+print(f" Wrong/unknown tickers: {wrong_tickers}")
+print(f" Our ticker hits: {ticker_hits}")
diff --git a/scripts/check_llm3.py b/scripts/check_llm3.py
new file mode 100644
index 0000000..8cc66c2
--- /dev/null
+++ b/scripts/check_llm3.py
@@ -0,0 +1,28 @@
+from minio import Minio
+import os, json
+
+mc = Minio(os.environ["MINIO_ENDPOINT"], access_key=os.environ["MINIO_ACCESS_KEY"], secret_key=os.environ["MINIO_SECRET_KEY"], secure=False)
+raw_objs = list(mc.list_objects("stonks-llm-results", recursive=True))
+
+# Find extractions with companies and check their summaries
+for o in raw_objs:
+    data = json.loads(mc.get_object("stonks-llm-results", o.object_name).read())
+    if not data.get("success"):
+        continue
+    attempts = data.get("attempts", [])
+    if not attempts:
+        continue
+    raw_out = attempts[-1].get("raw_output", "")
+    try:
+        parsed = json.loads(raw_out)
+        companies = parsed.get("companies", [])
+        summary = parsed.get("summary", "")
+        conf = parsed.get("confidence", 0)
+        # Show high-confidence ones that still have no companies
+        if conf >= 0.5 and not companies:
+            ticker = o.object_name.split("/")[1]
+            print(f"HIGH CONF ({conf}) NO COMPANIES - {ticker}:")
+            print(f" summary: {summary[:120]}")
+            print()
+    except:
+        pass
diff --git a/scripts/check_recent.py b/scripts/check_recent.py
new file mode 100644
index 0000000..f4d81f6
--- /dev/null
+++ b/scripts/check_recent.py
@@ -0,0 +1,33 @@
+from minio import Minio
+import os, json
+
+mc = Minio(os.environ["MINIO_ENDPOINT"], access_key=os.environ["MINIO_ACCESS_KEY"], secret_key=os.environ["MINIO_SECRET_KEY"], secure=False)
+
+# Get the most recent LLM results
+raw_objs = sorted(mc.list_objects("stonks-llm-results", recursive=True), key=lambda o: o.last_modified, reverse=True)
+
+for o in raw_objs[:3]:
+    data = json.loads(mc.get_object("stonks-llm-results", o.object_name).read())
+    attempts = data.get("attempts", [])
+    if not attempts:
+        continue
+    last = attempts[-1]
+    raw_out = last.get("raw_output", "")
+    ticker = o.object_name.split("/")[1]
+    doc_id = o.object_name.split("/")[-2]
+
+    print(f"=== {ticker} / {doc_id[:8]} ===")
+    print(f" success: {data.get('success')}")
+    print(f" duration: {data.get('total_duration_ms')}ms")
+
+    try:
+        parsed = json.loads(raw_out)
+        print(f" summary: {parsed.get('summary', '')[:120]}")
+        print(f" confidence: {parsed.get('confidence')}")
+        print(f" companies: {len(parsed.get('companies', []))}")
+        print(f" macro_themes: {parsed.get('macro_themes', [])}")
+        for c in parsed.get("companies", []):
+            print(f" -> {c.get('ticker')} sent={c.get('sentiment')} impact={c.get('impact_score')} catalyst={c.get('catalyst_type')}")
+    except:
+        print(f" raw output: {raw_out[:200]}")
+    print()
diff --git a/services/extractor/main.py b/services/extractor/main.py
index 1ebe0b0..748ae32 100644
--- a/services/extractor/main.py
+++ b/services/extractor/main.py
@@ -88,10 +88,12 @@ async def main() -> None:
     company_id_map = await _build_company_id_map(pool)
 
     try:
+        # Pass all tracked tickers so the model can identify any mentioned companies
+        all_tickers = list(company_id_map.keys()) if company_id_map else ([ticker] if ticker else None)
         extraction_response = await ollama.extract(
             text,
             document_id=document_id,
-            known_tickers=[ticker] if ticker else None,
+            known_tickers=all_tickers,
         )
         result = await persist_extraction(
             pool=pool,
diff --git a/services/extractor/prompts.py b/services/extractor/prompts.py
index 8b20a7a..086dac0 100644
--- a/services/extractor/prompts.py
+++ b/services/extractor/prompts.py
@@ -102,9 +102,12 @@ def build_extraction_prompt(
     if known_tickers:
         tickers_str = ", ".join(known_tickers)
         ticker_hint = (
-            f"\nThe following tickers may be referenced in this document: {tickers_str}\n"
-            "Only include a ticker in your output if the document actually discusses that company. "
-            "Do NOT include a ticker just because it appears in this hint."
+            f"\nTracked tickers: {tickers_str}\n"
+            "If the document discusses market conditions, sectors, or themes that directly affect "
+            "any of these companies, include those companies with appropriate sentiment and impact scores. "
+            "A macro article about oil prices IS relevant to energy companies like XOM. "
+            "A tech sector article IS relevant to AAPL, MSFT, NVDA, etc. "
+            "Use your judgment — but only include companies where the connection is clear from the text."
         )
 
     schema_str = json.dumps(EXTRACTION_JSON_SCHEMA, indent=2)