feat: retry failed extractions button on pipeline page
- POST /api/ops/pipeline/retry-failed endpoint resets extraction_failed docs to parsed, deletes failed intelligence rows, and re-enqueues them (batch of 200) - Scheduler now auto-retries extraction_failed docs every ~10 minutes (100 per cycle, 60-min cooldown per doc) - Pipeline page shows 'Retry Failed (N)' button when extraction_failed count > 0, with pending/success/error states
This commit is contained in:
@@ -1869,6 +1869,58 @@ async def pipeline_stream(request: Request):
|
||||
)
|
||||
|
||||
|
||||
@app.post("/api/ops/pipeline/retry-failed")
|
||||
async def retry_failed_extractions_endpoint():
|
||||
"""Re-enqueue documents stuck in extraction_failed for another attempt.
|
||||
|
||||
Resets up to 200 extraction_failed documents back to 'parsed',
|
||||
deletes their failed intelligence rows, and pushes them onto the
|
||||
extraction queue. Returns the count of documents re-enqueued.
|
||||
"""
|
||||
rows = await pool.fetch(
|
||||
"""SELECT d.id, d.document_type, dcm.ticker
|
||||
FROM documents d
|
||||
LEFT JOIN document_company_mentions dcm ON d.id = dcm.document_id
|
||||
WHERE d.status = 'extraction_failed'
|
||||
ORDER BY d.updated_at ASC
|
||||
LIMIT 200""",
|
||||
)
|
||||
|
||||
if not rows:
|
||||
return {"retried": 0, "message": "No extraction-failed documents to retry"}
|
||||
|
||||
doc_ids = []
|
||||
for row in rows:
|
||||
doc_type = row["document_type"]
|
||||
if doc_type == "macro_event":
|
||||
target = "stonks:queue:macro_classification"
|
||||
else:
|
||||
target = "stonks:queue:extraction"
|
||||
|
||||
await rds.rpush(target, json.dumps({
|
||||
"document_id": str(row["id"]),
|
||||
"ticker": row["ticker"] or "",
|
||||
}))
|
||||
doc_ids.append(row["id"])
|
||||
|
||||
# Delete failed intelligence rows so extractor starts fresh
|
||||
await pool.execute(
|
||||
"""DELETE FROM document_intelligence
|
||||
WHERE document_id = ANY($1::uuid[])
|
||||
AND validation_status = 'failed'""",
|
||||
doc_ids,
|
||||
)
|
||||
# Reset status to 'parsed' and touch updated_at
|
||||
await pool.execute(
|
||||
"""UPDATE documents
|
||||
SET status = 'parsed', updated_at = NOW()
|
||||
WHERE id = ANY($1::uuid[])""",
|
||||
doc_ids,
|
||||
)
|
||||
|
||||
return {"retried": len(doc_ids), "message": f"Re-enqueued {len(doc_ids)} documents for extraction"}
|
||||
|
||||
|
||||
@app.get("/api/ops/sources/coverage-gaps")
|
||||
async def get_source_coverage_gaps():
|
||||
"""Identify symbols with missing or insufficient source coverage.
|
||||
|
||||
@@ -499,6 +499,7 @@ async def main() -> None:
|
||||
|
||||
logger.info("Scheduler started (tick=%ds)", SCHEDULER_TICK)
|
||||
recovery_counter = 0
|
||||
retry_counter = 0
|
||||
cleanup_counter = 0
|
||||
try:
|
||||
while True:
|
||||
@@ -511,6 +512,11 @@ async def main() -> None:
|
||||
if recovery_counter >= 20:
|
||||
recovery_counter = 0
|
||||
await recover_stale_documents(pool, rds)
|
||||
# Retry extraction failures every ~40 cycles (~10 minutes)
|
||||
retry_counter += 1
|
||||
if retry_counter >= 40:
|
||||
retry_counter = 0
|
||||
await retry_failed_extractions(pool, rds)
|
||||
# Run signal cleanup periodically (~25 minutes)
|
||||
cleanup_counter += 1
|
||||
if cleanup_counter >= CLEANUP_CYCLE_INTERVAL:
|
||||
@@ -529,6 +535,9 @@ async def main() -> None:
|
||||
# How long a document can sit in "parsed" before we consider it orphaned
|
||||
STALE_PARSED_THRESHOLD_MINUTES: int = 30
|
||||
|
||||
# How long after an extraction failure before we retry
|
||||
EXTRACTION_FAILED_RETRY_MINUTES: int = 60
|
||||
|
||||
|
||||
async def recover_stale_documents(pool: asyncpg.Pool, rds: aioredis.Redis) -> int:
|
||||
"""Re-enqueue documents stuck in 'parsed' status for extraction.
|
||||
@@ -584,6 +593,67 @@ async def recover_stale_documents(pool: asyncpg.Pool, rds: aioredis.Redis) -> in
|
||||
return enqueued
|
||||
|
||||
|
||||
async def retry_failed_extractions(pool: asyncpg.Pool, rds: aioredis.Redis) -> int:
|
||||
"""Re-enqueue documents stuck in 'extraction_failed' for another attempt.
|
||||
|
||||
Resets status to 'parsed', deletes the failed intelligence row so the
|
||||
extractor treats them as fresh, and pushes them onto the extraction queue.
|
||||
|
||||
Only retries documents whose last attempt was at least
|
||||
EXTRACTION_FAILED_RETRY_MINUTES ago to avoid tight retry loops.
|
||||
|
||||
Returns the number of documents re-enqueued.
|
||||
"""
|
||||
rows = await pool.fetch(
|
||||
"""SELECT d.id, d.document_type, dcm.ticker
|
||||
FROM documents d
|
||||
LEFT JOIN document_company_mentions dcm ON d.id = dcm.document_id
|
||||
WHERE d.status = 'extraction_failed'
|
||||
AND d.updated_at < NOW() - INTERVAL '1 minute' * $1
|
||||
ORDER BY d.updated_at ASC
|
||||
LIMIT 100""",
|
||||
EXTRACTION_FAILED_RETRY_MINUTES,
|
||||
)
|
||||
|
||||
if not rows:
|
||||
return 0
|
||||
|
||||
enqueued = 0
|
||||
doc_ids = []
|
||||
for row in rows:
|
||||
doc_type = row["document_type"]
|
||||
if doc_type == "macro_event":
|
||||
target = queue_key(QUEUE_MACRO_CLASSIFICATION)
|
||||
else:
|
||||
target = queue_key(QUEUE_EXTRACTION)
|
||||
|
||||
await rds.rpush(target, json.dumps({
|
||||
"document_id": str(row["id"]),
|
||||
"ticker": row["ticker"] or "",
|
||||
}))
|
||||
doc_ids.append(row["id"])
|
||||
enqueued += 1
|
||||
|
||||
if doc_ids:
|
||||
# Delete failed intelligence rows so extractor starts fresh
|
||||
await pool.execute(
|
||||
"""DELETE FROM document_intelligence
|
||||
WHERE document_id = ANY($1::uuid[])
|
||||
AND validation_status = 'failed'""",
|
||||
doc_ids,
|
||||
)
|
||||
# Reset status to 'parsed' and touch updated_at
|
||||
await pool.execute(
|
||||
"""UPDATE documents
|
||||
SET status = 'parsed', updated_at = NOW()
|
||||
WHERE id = ANY($1::uuid[])""",
|
||||
doc_ids,
|
||||
)
|
||||
|
||||
logger.info("Retried %d extraction-failed documents", enqueued)
|
||||
return enqueued
|
||||
|
||||
|
||||
# How often to run competitive signal cleanup (every ~100 cycles = ~25 minutes)
|
||||
CLEANUP_CYCLE_INTERVAL: int = 100
|
||||
# Keep competitive signals for this many days
|
||||
|
||||
Reference in New Issue
Block a user