feat: retry failed extractions button on pipeline page

- POST /api/ops/pipeline/retry-failed endpoint resets extraction_failed
  docs to parsed, deletes failed intelligence rows, and re-enqueues
  them (batch of 200)
- Scheduler now auto-retries extraction_failed docs every ~10 minutes
  (100 per cycle, 60-min cooldown per doc)
- Pipeline page shows 'Retry Failed (N)' button when extraction_failed
  count > 0, with pending/success/error states
This commit is contained in:
Celes Renata
2026-04-20 08:09:29 +00:00
parent 5289f0f195
commit de35279269
5 changed files with 152 additions and 2 deletions
+52
View File
@@ -1869,6 +1869,58 @@ async def pipeline_stream(request: Request):
)
@app.post("/api/ops/pipeline/retry-failed")
async def retry_failed_extractions_endpoint():
"""Re-enqueue documents stuck in extraction_failed for another attempt.
Resets up to 200 extraction_failed documents back to 'parsed',
deletes their failed intelligence rows, and pushes them onto the
extraction queue. Returns the count of documents re-enqueued.
"""
rows = await pool.fetch(
"""SELECT d.id, d.document_type, dcm.ticker
FROM documents d
LEFT JOIN document_company_mentions dcm ON d.id = dcm.document_id
WHERE d.status = 'extraction_failed'
ORDER BY d.updated_at ASC
LIMIT 200""",
)
if not rows:
return {"retried": 0, "message": "No extraction-failed documents to retry"}
doc_ids = []
for row in rows:
doc_type = row["document_type"]
if doc_type == "macro_event":
target = "stonks:queue:macro_classification"
else:
target = "stonks:queue:extraction"
await rds.rpush(target, json.dumps({
"document_id": str(row["id"]),
"ticker": row["ticker"] or "",
}))
doc_ids.append(row["id"])
# Delete failed intelligence rows so extractor starts fresh
await pool.execute(
"""DELETE FROM document_intelligence
WHERE document_id = ANY($1::uuid[])
AND validation_status = 'failed'""",
doc_ids,
)
# Reset status to 'parsed' and touch updated_at
await pool.execute(
"""UPDATE documents
SET status = 'parsed', updated_at = NOW()
WHERE id = ANY($1::uuid[])""",
doc_ids,
)
return {"retried": len(doc_ids), "message": f"Re-enqueued {len(doc_ids)} documents for extraction"}
@app.get("/api/ops/sources/coverage-gaps")
async def get_source_coverage_gaps():
"""Identify symbols with missing or insufficient source coverage.
+70
View File
@@ -499,6 +499,7 @@ async def main() -> None:
logger.info("Scheduler started (tick=%ds)", SCHEDULER_TICK)
recovery_counter = 0
retry_counter = 0
cleanup_counter = 0
try:
while True:
@@ -511,6 +512,11 @@ async def main() -> None:
if recovery_counter >= 20:
recovery_counter = 0
await recover_stale_documents(pool, rds)
# Retry extraction failures every ~40 cycles (~10 minutes)
retry_counter += 1
if retry_counter >= 40:
retry_counter = 0
await retry_failed_extractions(pool, rds)
# Run signal cleanup periodically (~25 minutes)
cleanup_counter += 1
if cleanup_counter >= CLEANUP_CYCLE_INTERVAL:
@@ -529,6 +535,9 @@ async def main() -> None:
# How long a document can sit in "parsed" before we consider it orphaned
STALE_PARSED_THRESHOLD_MINUTES: int = 30
# How long after an extraction failure before we retry
EXTRACTION_FAILED_RETRY_MINUTES: int = 60
async def recover_stale_documents(pool: asyncpg.Pool, rds: aioredis.Redis) -> int:
"""Re-enqueue documents stuck in 'parsed' status for extraction.
@@ -584,6 +593,67 @@ async def recover_stale_documents(pool: asyncpg.Pool, rds: aioredis.Redis) -> in
return enqueued
async def retry_failed_extractions(pool: asyncpg.Pool, rds: aioredis.Redis) -> int:
"""Re-enqueue documents stuck in 'extraction_failed' for another attempt.
Resets status to 'parsed', deletes the failed intelligence row so the
extractor treats them as fresh, and pushes them onto the extraction queue.
Only retries documents whose last attempt was at least
EXTRACTION_FAILED_RETRY_MINUTES ago to avoid tight retry loops.
Returns the number of documents re-enqueued.
"""
rows = await pool.fetch(
"""SELECT d.id, d.document_type, dcm.ticker
FROM documents d
LEFT JOIN document_company_mentions dcm ON d.id = dcm.document_id
WHERE d.status = 'extraction_failed'
AND d.updated_at < NOW() - INTERVAL '1 minute' * $1
ORDER BY d.updated_at ASC
LIMIT 100""",
EXTRACTION_FAILED_RETRY_MINUTES,
)
if not rows:
return 0
enqueued = 0
doc_ids = []
for row in rows:
doc_type = row["document_type"]
if doc_type == "macro_event":
target = queue_key(QUEUE_MACRO_CLASSIFICATION)
else:
target = queue_key(QUEUE_EXTRACTION)
await rds.rpush(target, json.dumps({
"document_id": str(row["id"]),
"ticker": row["ticker"] or "",
}))
doc_ids.append(row["id"])
enqueued += 1
if doc_ids:
# Delete failed intelligence rows so extractor starts fresh
await pool.execute(
"""DELETE FROM document_intelligence
WHERE document_id = ANY($1::uuid[])
AND validation_status = 'failed'""",
doc_ids,
)
# Reset status to 'parsed' and touch updated_at
await pool.execute(
"""UPDATE documents
SET status = 'parsed', updated_at = NOW()
WHERE id = ANY($1::uuid[])""",
doc_ids,
)
logger.info("Retried %d extraction-failed documents", enqueued)
return enqueued
# How often to run competitive signal cleanup (every ~100 cycles = ~25 minutes)
CLEANUP_CYCLE_INTERVAL: int = 100
# Keep competitive signals for this many days