feat: retry failed extractions button on pipeline page

- POST /api/ops/pipeline/retry-failed endpoint resets extraction_failed docs to parsed, deletes failed intelligence rows, and re-enqueues them (batch of 200) - Scheduler now auto-retries extraction_failed docs every ~10 minutes (100 per cycle, 60-min cooldown per doc) - Pipeline page shows 'Retry Failed (N)' button when extraction_failed count > 0, with pending/success/error states
2026-04-20 08:09:29 +00:00
parent 5289f0f195
commit de35279269
5 changed files with 152 additions and 2 deletions
@@ -1869,6 +1869,58 @@ async def pipeline_stream(request: Request):
    )


+@app.post("/api/ops/pipeline/retry-failed")
+async def retry_failed_extractions_endpoint():
+    """Re-enqueue documents stuck in extraction_failed for another attempt.
+
+    Resets up to 200 extraction_failed documents back to 'parsed',
+    deletes their failed intelligence rows, and pushes them onto the
+    extraction queue.  Returns the count of documents re-enqueued.
+    """
+    rows = await pool.fetch(
+        """SELECT d.id, d.document_type, dcm.ticker
+           FROM documents d
+           LEFT JOIN document_company_mentions dcm ON d.id = dcm.document_id
+           WHERE d.status = 'extraction_failed'
+           ORDER BY d.updated_at ASC
+           LIMIT 200""",
+    )
+
+    if not rows:
+        return {"retried": 0, "message": "No extraction-failed documents to retry"}
+
+    doc_ids = []
+    for row in rows:
+        doc_type = row["document_type"]
+        if doc_type == "macro_event":
+            target = "stonks:queue:macro_classification"
+        else:
+            target = "stonks:queue:extraction"
+
+        await rds.rpush(target, json.dumps({
+            "document_id": str(row["id"]),
+            "ticker": row["ticker"] or "",
+        }))
+        doc_ids.append(row["id"])
+
+    # Delete failed intelligence rows so extractor starts fresh
+    await pool.execute(
+        """DELETE FROM document_intelligence
+           WHERE document_id = ANY($1::uuid[])
+             AND validation_status = 'failed'""",
+        doc_ids,
+    )
+    # Reset status to 'parsed' and touch updated_at
+    await pool.execute(
+        """UPDATE documents
+           SET status = 'parsed', updated_at = NOW()
+           WHERE id = ANY($1::uuid[])""",
+        doc_ids,
+    )
+
+    return {"retried": len(doc_ids), "message": f"Re-enqueued {len(doc_ids)} documents for extraction"}
+
+
@app.get("/api/ops/sources/coverage-gaps")
 async def get_source_coverage_gaps():
    """Identify symbols with missing or insufficient source coverage.
@@ -499,6 +499,7 @@ async def main() -> None:

    logger.info("Scheduler started (tick=%ds)", SCHEDULER_TICK)
    recovery_counter = 0
+    retry_counter = 0
    cleanup_counter = 0
    try:
        while True:
@@ -511,6 +512,11 @@ async def main() -> None:
                        if recovery_counter >= 20:
                            recovery_counter = 0
                            await recover_stale_documents(pool, rds)
+                        # Retry extraction failures every ~40 cycles (~10 minutes)
+                        retry_counter += 1
+                        if retry_counter >= 40:
+                            retry_counter = 0
+                            await retry_failed_extractions(pool, rds)
                        # Run signal cleanup periodically (~25 minutes)
                        cleanup_counter += 1
                        if cleanup_counter >= CLEANUP_CYCLE_INTERVAL:
@@ -529,6 +535,9 @@ async def main() -> None:
 # How long a document can sit in "parsed" before we consider it orphaned
 STALE_PARSED_THRESHOLD_MINUTES: int = 30

+# How long after an extraction failure before we retry
+EXTRACTION_FAILED_RETRY_MINUTES: int = 60
+

 async def recover_stale_documents(pool: asyncpg.Pool, rds: aioredis.Redis) -> int:
    """Re-enqueue documents stuck in 'parsed' status for extraction.
@@ -584,6 +593,67 @@ async def recover_stale_documents(pool: asyncpg.Pool, rds: aioredis.Redis) -> in
    return enqueued


+async def retry_failed_extractions(pool: asyncpg.Pool, rds: aioredis.Redis) -> int:
+    """Re-enqueue documents stuck in 'extraction_failed' for another attempt.
+
+    Resets status to 'parsed', deletes the failed intelligence row so the
+    extractor treats them as fresh, and pushes them onto the extraction queue.
+
+    Only retries documents whose last attempt was at least
+    EXTRACTION_FAILED_RETRY_MINUTES ago to avoid tight retry loops.
+
+    Returns the number of documents re-enqueued.
+    """
+    rows = await pool.fetch(
+        """SELECT d.id, d.document_type, dcm.ticker
+           FROM documents d
+           LEFT JOIN document_company_mentions dcm ON d.id = dcm.document_id
+           WHERE d.status = 'extraction_failed'
+             AND d.updated_at < NOW() - INTERVAL '1 minute' * $1
+           ORDER BY d.updated_at ASC
+           LIMIT 100""",
+        EXTRACTION_FAILED_RETRY_MINUTES,
+    )
+
+    if not rows:
+        return 0
+
+    enqueued = 0
+    doc_ids = []
+    for row in rows:
+        doc_type = row["document_type"]
+        if doc_type == "macro_event":
+            target = queue_key(QUEUE_MACRO_CLASSIFICATION)
+        else:
+            target = queue_key(QUEUE_EXTRACTION)
+
+        await rds.rpush(target, json.dumps({
+            "document_id": str(row["id"]),
+            "ticker": row["ticker"] or "",
+        }))
+        doc_ids.append(row["id"])
+        enqueued += 1
+
+    if doc_ids:
+        # Delete failed intelligence rows so extractor starts fresh
+        await pool.execute(
+            """DELETE FROM document_intelligence
+               WHERE document_id = ANY($1::uuid[])
+                 AND validation_status = 'failed'""",
+            doc_ids,
+        )
+        # Reset status to 'parsed' and touch updated_at
+        await pool.execute(
+            """UPDATE documents
+               SET status = 'parsed', updated_at = NOW()
+               WHERE id = ANY($1::uuid[])""",
+            doc_ids,
+        )
+
+    logger.info("Retried %d extraction-failed documents", enqueued)
+    return enqueued
+
+
 # How often to run competitive signal cleanup (every ~100 cycles = ~25 minutes)
 CLEANUP_CYCLE_INTERVAL: int = 100
 # Keep competitive signals for this many days