From 57d0fc7d339b0190dfa5dfa9df5bfb818a5ca4d5 Mon Sep 17 00:00:00 2001
From: Celes Renata <celes@frameshift.net>
Date: Sun, 12 Apr 2026 09:18:08 -0700
Subject: [PATCH] phase 17: pass all tracked tickers to extractor, soften
 prompt for macro-to-company relevance

---
 scripts/check_llm.py          | 28 ++++++++++++++++++++++
 scripts/check_llm2.py         | 44 +++++++++++++++++++++++++++++++++++
 scripts/check_llm3.py         | 28 ++++++++++++++++++++++
 scripts/check_recent.py       | 33 ++++++++++++++++++++++++++
 services/extractor/main.py    |  4 +++-
 services/extractor/prompts.py |  9 ++++---
 6 files changed, 142 insertions(+), 4 deletions(-)
 create mode 100644 scripts/check_llm.py
 create mode 100644 scripts/check_llm2.py
 create mode 100644 scripts/check_llm3.py
 create mode 100644 scripts/check_recent.py

diff --git a/scripts/check_llm.py b/scripts/check_llm.py
new file mode 100644
index 0000000..afba6bc
--- /dev/null
+++ b/scripts/check_llm.py
@@ -0,0 +1,28 @@
+"""Debug helper: print a short summary of the first five objects in stonks-llm-results."""
+from minio import Minio
+import os, json
+
+mc = Minio(os.environ["MINIO_ENDPOINT"], access_key=os.environ["MINIO_ACCESS_KEY"], secret_key=os.environ["MINIO_SECRET_KEY"], secure=False)
+raw_objs = list(mc.list_objects("stonks-llm-results", recursive=True))
+print(f"LLM result objects: {len(raw_objs)}")
+for o in raw_objs[:5]:
+    data = json.loads(mc.get_object("stonks-llm-results", o.object_name).read())
+    success = data.get("success", False)
+    attempts = data.get("attempts", [])
+    ticker = o.object_name.split("/")[1] if "/" in o.object_name else "?"
+    if attempts:
+        raw_out = attempts[-1].get("raw_output") or ""  # treat a null raw_output like a missing one
+        print(f"  {ticker}: success={success} output_len={len(raw_out)}")
+        try:
+            parsed = json.loads(raw_out)
+            companies = parsed.get("companies", [])
+            summary = parsed.get("summary", "")[:80]
+            conf = parsed.get("confidence", "?")
+            print(f"    summary: {summary}")
+            print(f"    confidence: {conf}")
+            print(f"    companies: {len(companies)}")
+            for c in companies[:3]:
+                print(f"      {c.get('ticker','?')} sentiment={c.get('sentiment','?')} impact={c.get('impact_score','?')}")
+        except (ValueError, TypeError, AttributeError):  # invalid JSON or unexpected shape: show the raw text
+            print(f"    raw: {raw_out[:120]}")
+    print()
diff --git a/scripts/check_llm2.py b/scripts/check_llm2.py
new file mode 100644
index 0000000..9d777e9
--- /dev/null
+++ b/scripts/check_llm2.py
@@ -0,0 +1,44 @@
+"""Debug helper: tally how many successful LLM extractions named companies, and which tickers."""
+from minio import Minio
+import os, json
+
+mc = Minio(os.environ["MINIO_ENDPOINT"], access_key=os.environ["MINIO_ACCESS_KEY"], secret_key=os.environ["MINIO_SECRET_KEY"], secure=False)
+raw_objs = list(mc.list_objects("stonks-llm-results", recursive=True))
+# Count companies found vs not found. NOTE(review): our_tickers is hard-coded; any ticker not in it is tallied under wrong_tickers.
+total = 0
+with_companies = 0
+empty_companies = 0
+wrong_tickers = 0
+our_tickers = {"AAPL","MSFT","NVDA","AMZN","GOOGL","JPM","JNJ","XOM","TSLA","META"}
+ticker_hits = {}
+
+for o in raw_objs:
+    data = json.loads(mc.get_object("stonks-llm-results", o.object_name).read())
+    if not data.get("success"):
+        continue
+    attempts = data.get("attempts", [])
+    if not attempts:
+        continue
+    raw_out = attempts[-1].get("raw_output", "")
+    try:
+        parsed = json.loads(raw_out)
+        companies = parsed.get("companies", [])
+        total += 1
+        if companies:
+            with_companies += 1
+            for c in companies:
+                t = c.get("ticker", "")
+                if t in our_tickers:
+                    ticker_hits[t] = ticker_hits.get(t, 0) + 1
+                else:
+                    wrong_tickers += 1
+        else:
+            empty_companies += 1
+    except (ValueError, TypeError, AttributeError) as exc:  # invalid JSON or unexpected shape
+        print(f"  skipped {o.object_name}: {exc!r}")  # report instead of silently dropping the record
+
+print(f"Total successful extractions: {total}")
+print(f"  With companies: {with_companies}")
+print(f"  Empty companies: {empty_companies}")
+print(f"  Wrong/unknown tickers: {wrong_tickers}")
+print(f"  Our ticker hits: {ticker_hits}")
diff --git a/scripts/check_llm3.py b/scripts/check_llm3.py
new file mode 100644
index 0000000..8cc66c2
--- /dev/null
+++ b/scripts/check_llm3.py
@@ -0,0 +1,28 @@
+"""Debug helper: list successful extractions with confidence >= 0.5 that named no companies."""
+from minio import Minio
+import os, json
+
+mc = Minio(os.environ["MINIO_ENDPOINT"], access_key=os.environ["MINIO_ACCESS_KEY"], secret_key=os.environ["MINIO_SECRET_KEY"], secure=False)
+raw_objs = list(mc.list_objects("stonks-llm-results", recursive=True))
+# Find extractions with companies and check their summaries
+for o in raw_objs:
+    data = json.loads(mc.get_object("stonks-llm-results", o.object_name).read())
+    if not data.get("success"):
+        continue
+    attempts = data.get("attempts", [])
+    if not attempts:
+        continue
+    raw_out = attempts[-1].get("raw_output", "")
+    try:
+        parsed = json.loads(raw_out)
+        companies = parsed.get("companies", [])
+        summary = parsed.get("summary") or ""  # a null summary would break the slice below
+        conf = parsed.get("confidence") or 0  # a null confidence would break the comparison below
+        # Show high-confidence ones that still have no companies
+        if conf >= 0.5 and not companies:
+            ticker = o.object_name.split("/")[1] if "/" in o.object_name else "?"
+            print(f"HIGH CONF ({conf}) NO COMPANIES - {ticker}:")
+            print(f"  summary: {summary[:120]}")
+            print()
+    except (ValueError, TypeError, AttributeError) as exc:  # invalid JSON or unexpected shape
+        print(f"skipped {o.object_name}: {exc!r}")  # report instead of silently dropping the record
diff --git a/scripts/check_recent.py b/scripts/check_recent.py
new file mode 100644
index 0000000..f4d81f6
--- /dev/null
+++ b/scripts/check_recent.py
@@ -0,0 +1,33 @@
+"""Debug helper: show details of the three most recently modified objects in stonks-llm-results."""
+from minio import Minio
+import os, json
+
+mc = Minio(os.environ["MINIO_ENDPOINT"], access_key=os.environ["MINIO_ACCESS_KEY"], secret_key=os.environ["MINIO_SECRET_KEY"], secure=False)
+
+# Get the most recent LLM results
+raw_objs = sorted(mc.list_objects("stonks-llm-results", recursive=True), key=lambda o: o.last_modified, reverse=True)
+
+for o in raw_objs[:3]:
+    data = json.loads(mc.get_object("stonks-llm-results", o.object_name).read())
+    attempts = data.get("attempts", [])
+    if not attempts:
+        continue
+    raw_out = attempts[-1].get("raw_output") or ""  # treat a null raw_output like a missing one
+    ticker = o.object_name.split("/")[1] if "/" in o.object_name else "?"  # guard names without "/"
+    doc_id = o.object_name.split("/")[-2] if "/" in o.object_name else "?"
+
+    print(f"=== {ticker} / {doc_id[:8]} ===")
+    print(f"  success: {data.get('success')}")
+    print(f"  duration: {data.get('total_duration_ms')}ms")
+
+    try:
+        parsed = json.loads(raw_out)
+        print(f"  summary: {parsed.get('summary', '')[:120]}")
+        print(f"  confidence: {parsed.get('confidence')}")
+        print(f"  companies: {len(parsed.get('companies', []))}")
+        print(f"  macro_themes: {parsed.get('macro_themes', [])}")
+        for c in parsed.get("companies", []):
+            print(f"    -> {c.get('ticker')} sent={c.get('sentiment')} impact={c.get('impact_score')} catalyst={c.get('catalyst_type')}")
+    except (ValueError, TypeError, AttributeError):  # invalid JSON or unexpected shape: show the raw text
+        print(f"  raw output: {raw_out[:200]}")
+    print()
diff --git a/services/extractor/main.py b/services/extractor/main.py
index 1ebe0b0..748ae32 100644
--- a/services/extractor/main.py
+++ b/services/extractor/main.py
@@ -88,10 +88,12 @@ async def main() -> None:
                 company_id_map = await _build_company_id_map(pool)
 
             try:
+                # Pass all tracked tickers so the model can identify any mentioned companies
+                all_tickers = list(company_id_map.keys()) if company_id_map else ([ticker] if ticker else None)
                 extraction_response = await ollama.extract(
                     text,
                     document_id=document_id,
-                    known_tickers=[ticker] if ticker else None,
+                    known_tickers=all_tickers,
                 )
                 result = await persist_extraction(
                     pool=pool,
diff --git a/services/extractor/prompts.py b/services/extractor/prompts.py
index 8b20a7a..086dac0 100644
--- a/services/extractor/prompts.py
+++ b/services/extractor/prompts.py
@@ -102,9 +102,12 @@ def build_extraction_prompt(
     if known_tickers:
         tickers_str = ", ".join(known_tickers)
         ticker_hint = (
-            f"\nThe following tickers may be referenced in this document: {tickers_str}\n"
-            "Only include a ticker in your output if the document actually discusses that company. "
-            "Do NOT include a ticker just because it appears in this hint."
+            f"\nTracked tickers: {tickers_str}\n"
+            "If the document discusses market conditions, sectors, or themes that directly affect "
+            "any of these companies, include those companies with appropriate sentiment and impact scores. "
+            "A macro article about oil prices IS relevant to energy companies like XOM. "
+            "A tech sector article IS relevant to AAPL, MSFT, NVDA, etc. "
+            "Use your judgment — but only include companies where the connection is clear from the text."
         )
 
     schema_str = json.dumps(EXTRACTION_JSON_SCHEMA, indent=2)