From 0ac4493bd44dfca34d951a3773bf5bbe90021e9e Mon Sep 17 00:00:00 2001 From: Celes Renata Date: Sun, 12 Apr 2026 02:54:23 -0700 Subject: [PATCH] phase 17: fix parser URL lookup from DB and extractor text field name mismatch --- services/extractor/main.py | 2 +- services/parser/worker.py | 8 ++++++++ 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/services/extractor/main.py b/services/extractor/main.py index 20634d5..eb2d1b3 100644 --- a/services/extractor/main.py +++ b/services/extractor/main.py @@ -48,7 +48,7 @@ async def main() -> None: job = json.loads(payload) document_id = job.get("document_id", "") ticker = job.get("ticker", "") - text = job.get("text", "") + text = job.get("text", "") or job.get("normalized_text", "") logger.info("Processing extraction job for doc %s / %s", document_id, ticker) diff --git a/services/parser/worker.py b/services/parser/worker.py index 5547947..78083c3 100644 --- a/services/parser/worker.py +++ b/services/parser/worker.py @@ -96,6 +96,14 @@ async def process_job( set_trace_context(trace_id=job.get("_trace_id") or new_trace_id()) + # If no URL in job, look it up from the documents table + if not url: + row = await pool.fetchrow( + "SELECT url FROM documents WHERE id = $1::uuid", doc_id, + ) + if row and row["url"]: + url = row["url"] + # Fetch HTML if we have a URL html = await fetch_html(url) if url else None