fix: prevent duplicate queue entries with Redis SET markers

Recovery sweeps and the retry endpoint now atomically claim a per-document Redis marker key (SET NX, 1h TTL) before pushing to the queue. If the marker already exists, the doc is treated as already enqueued and is skipped; only the caller that successfully sets the marker pushes the doc. This prevents the scheduler from re-enqueuing the same parsed docs every 5 minutes.
2026-04-20 17:24:53 +00:00
parent 288c5333b5
commit 46c24aefab
2 changed files with 47 additions and 16 deletions
@@ -1891,6 +1891,7 @@ async def retry_failed_extractions_endpoint():
        return {"retried": 0, "message": "No extraction-failed documents to retry"}

    doc_ids = []
+    enqueued_set_prefix = f"{QUEUE_PREFIX}:enqueued"
    for row in rows:
        doc_type = row["document_type"]
        if doc_type == "macro_event":
@@ -1898,10 +1899,14 @@ async def retry_failed_extractions_endpoint():
        else:
            target = queue_key("extraction")

-        await rds.rpush(target, json.dumps({
-            "document_id": str(row["id"]),
-            "ticker": row["ticker"] or "",
-        }))
+        doc_id = str(row["id"])
+        marker = f"{enqueued_set_prefix}:{doc_id}"
+        added = await rds.set(marker, "1", nx=True, ex=3600)
+        if added:
+            await rds.rpush(target, json.dumps({
+                "document_id": doc_id,
+                "ticker": row["ticker"] or "",
+            }))
        doc_ids.append(row["id"])

    # Delete failed intelligence rows so extractor starts fresh