fix: increase stale threshold to 4h to prevent duplicate enqueuing

The 30-minute threshold was shorter than the queue drain time, causing
the recovery sweep to re-enqueue docs that were already queued but not
yet processed. The threshold is now 4 hours (240 minutes), and the enqueued-marker TTL is raised to 14400 seconds to match.
This commit is contained in:
Celes Renata
2026-04-20 18:05:30 +00:00
parent 20faa8e20d
commit 7071bba92d
+4 -2
View File
@@ -534,7 +534,9 @@ async def main() -> None:
# How long a document can sit in "parsed" before we consider it orphaned # How long a document can sit in "parsed" before we consider it orphaned
STALE_PARSED_THRESHOLD_MINUTES: int = 30 # Must be longer than the expected queue drain time to avoid re-enqueuing
# docs that are already queued but not yet processed.
STALE_PARSED_THRESHOLD_MINUTES: int = 240
# How long after an extraction failure before we retry # How long after an extraction failure before we retry
EXTRACTION_FAILED_RETRY_MINUTES: int = 60 EXTRACTION_FAILED_RETRY_MINUTES: int = 60
@@ -542,7 +544,7 @@ EXTRACTION_FAILED_RETRY_MINUTES: int = 60
# Redis set key for tracking enqueued doc IDs (prevents duplicate enqueuing) # Redis set key for tracking enqueued doc IDs (prevents duplicate enqueuing)
_ENQUEUED_SET = f"{QUEUE_PREFIX}:enqueued" _ENQUEUED_SET = f"{QUEUE_PREFIX}:enqueued"
# How long an enqueued marker lives before it can be re-enqueued (seconds) # How long an enqueued marker lives before it can be re-enqueued (seconds)
_ENQUEUED_TTL = 3600 _ENQUEUED_TTL = 14400 # 4 hours — matches STALE_PARSED_THRESHOLD_MINUTES
async def _enqueue_if_new( async def _enqueue_if_new(