phase 17: fix parser URL lookup from DB and extractor text field name mismatch

2026-04-12 02:54:23 -07:00
parent 67cdb0b8c8
commit 0ac4493bd4
2 changed files with 9 additions and 1 deletion
@@ -48,7 +48,7 @@ async def main() -> None:
            job = json.loads(payload)
            document_id = job.get("document_id", "")
            ticker = job.get("ticker", "")
-            text = job.get("text", "")
+            text = job.get("text", "") or job.get("normalized_text", "")

            logger.info("Processing extraction job for doc %s / %s", document_id, ticker)

@@ -96,6 +96,14 @@ async def process_job(

    set_trace_context(trace_id=job.get("_trace_id") or new_trace_id())

+    # If no URL in job, look it up from the documents table
+    if not url:
+        row = await pool.fetchrow(
+            "SELECT url FROM documents WHERE id = $1::uuid", doc_id,
+        )
+        if row and row["url"]:
+            url = row["url"]
+
    # Fetch HTML if we have a URL
    html = await fetch_html(url) if url else None