fix: pipeline stop now halts all workers and flushes queues
ci/woodpecker/push/test Pipeline was successful
ci/woodpecker/push/build-1 Pipeline was successful
ci/woodpecker/push/build-3 Pipeline was successful
ci/woodpecker/push/build-2 Pipeline was successful
ci/woodpecker/push/finalize Pipeline was successful
Build and Push / lint-and-test (push) Has been cancelled
Build and Push / build-services (map[cmd:python -m services.adapters.broker_adapter name:broker-adapter]) (push) Has been cancelled
Build and Push / build-services (map[cmd:python -m services.aggregation.worker name:aggregation]) (push) Has been cancelled
Build and Push / build-services (map[cmd:python -m services.extractor.worker name:extractor]) (push) Has been cancelled
Build and Push / build-services (map[cmd:python -m services.ingestion.worker name:ingestion]) (push) Has been cancelled
Build and Push / build-services (map[cmd:python -m services.lake_publisher.worker name:lake-publisher]) (push) Has been cancelled
Build and Push / build-services (map[cmd:python -m services.parser.worker name:parser]) (push) Has been cancelled
Build and Push / build-services (map[cmd:python -m services.recommendation.worker name:recommendation]) (push) Has been cancelled
Build and Push / build-services (map[cmd:python -m services.scheduler.app name:scheduler]) (push) Has been cancelled
Build and Push / build-services (map[cmd:uvicorn services.api.app:app --host 0.0.0.0 --port 8000 name:query-api]) (push) Has been cancelled
Build and Push / build-services (map[cmd:uvicorn services.risk.app:app --host 0.0.0.0 --port 8000 name:risk]) (push) Has been cancelled
Build and Push / build-services (map[cmd:uvicorn services.symbol_registry.app:app --host 0.0.0.0 --port 8000 name:symbol-registry]) (push) Has been cancelled
Build and Push / build-services (map[cmd:uvicorn services.trading.app:app --host 0.0.0.0 --port 8000 name:trading-engine]) (push) Has been cancelled
Build and Push / build-dashboard (push) Has been cancelled
Build and Push / build-superset (push) Has been cancelled
Build and Push / integration-test (push) Has been cancelled
Build and Push / beta-gate (push) Has been cancelled

Workers (ingestion, parser, extractor, aggregation, recommendation,
broker, lake-publisher) now check the pipeline:enabled Redis flag on
each loop iteration and sleep when disabled.

The toggle endpoint flushes all pipeline queues on disable (unless the
request passes "flush": false) so that queued jobs are not picked up
again once the pipeline is re-enabled. Broker/trading queues are
excluded from the flush to avoid dropping in-flight orders.
This commit is contained in:
Celes Renata
2026-04-29 07:59:35 +00:00
parent cfcfd655e7
commit 8c3c1aab43
10 changed files with 80 additions and 9 deletions
+4 -1
View File
@@ -76,7 +76,7 @@ from services.shared.metrics import (
RISK_CHECK_FAILURES, RISK_CHECK_FAILURES,
RISK_EVALUATIONS_TOTAL, RISK_EVALUATIONS_TOTAL,
) )
from services.shared.redis_keys import QUEUE_BROKER, queue_key from services.shared.redis_keys import QUEUE_BROKER, is_pipeline_enabled, queue_key
logger = logging.getLogger("broker_service") logger = logging.getLogger("broker_service")
@@ -923,6 +923,9 @@ async def main() -> None:
try: try:
while True: while True:
if not await is_pipeline_enabled(rds):
await asyncio.sleep(2)
continue
result = await rds.lpop(queue) result = await rds.lpop(queue)
raw = str(result) if result else None raw = str(result) if result else None
if raw: if raw:
+5
View File
@@ -23,6 +23,7 @@ from services.shared.logging import inject_trace_context, setup_logging
from services.shared.redis_keys import ( from services.shared.redis_keys import (
QUEUE_AGGREGATION, QUEUE_AGGREGATION,
QUEUE_RECOMMENDATION, QUEUE_RECOMMENDATION,
is_pipeline_enabled,
queue_key, queue_key,
) )
@@ -134,6 +135,10 @@ async def main() -> None:
try: try:
while True: while True:
if not await is_pipeline_enabled(redis_client):
await asyncio.sleep(1)
continue
raw = await redis_client.lpop(queue) raw = await redis_client.lpop(queue)
if raw is None: if raw is None:
await asyncio.sleep(1) await asyncio.sleep(1)
+26 -3
View File
@@ -41,7 +41,7 @@ from services.shared.audit import get_entity_audit_trail, get_order_audit_trail,
from services.shared.config import load_config from services.shared.config import load_config
from services.shared.db import get_pg_pool, get_redis from services.shared.db import get_pg_pool, get_redis
from services.shared.logging import new_trace_id, set_trace_context, setup_logging from services.shared.logging import new_trace_id, set_trace_context, setup_logging
from services.shared.redis_keys import PREFIX, QUEUE_BROKER, QUEUE_PREFIX, queue_key from services.shared.redis_keys import PIPELINE_ENABLED_KEY, QUEUE_BROKER, QUEUE_PREFIX, queue_key
from services.shared.schemas import MAJOR_DECISION_CATALYSTS from services.shared.schemas import MAJOR_DECISION_CATALYSTS
logger = logging.getLogger("query_api") logger = logging.getLogger("query_api")
@@ -1948,7 +1948,7 @@ async def retry_failed_extractions_endpoint():
# Pipeline On/Off Toggle # Pipeline On/Off Toggle
# --------------------------------------------------------------------------- # ---------------------------------------------------------------------------
_PIPELINE_ENABLED_KEY = f"{PREFIX}:pipeline:enabled" _PIPELINE_ENABLED_KEY = PIPELINE_ENABLED_KEY
@app.get("/api/ops/pipeline/toggle") @app.get("/api/ops/pipeline/toggle")
@@ -1966,10 +1966,33 @@ async def set_pipeline_toggle(body: dict[str, Any]):
Accepts: { "enabled": true/false } Accepts: { "enabled": true/false }
Workers check this flag before processing jobs. Workers check this flag before processing jobs.
When disabling, optionally flush all pipeline queues so in-flight
work stops immediately.
""" """
enabled = body.get("enabled", True) enabled = body.get("enabled", True)
flush = body.get("flush", not enabled) # default: flush when disabling
await rds.set(_PIPELINE_ENABLED_KEY, "1" if enabled else "0") await rds.set(_PIPELINE_ENABLED_KEY, "1" if enabled else "0")
return {"pipeline_enabled": enabled, "message": f"Pipeline {'enabled' if enabled else 'disabled'}"}
flushed_counts: dict[str, int] = {}
if flush and not enabled:
from services.shared.redis_keys import QUEUE_PREFIX
# Flush all pipeline queues
queue_names = [
"ingestion", "parsing", "extraction", "macro_classification",
"aggregation", "recommendation", "lake_publish",
]
for qname in queue_names:
qkey = f"{QUEUE_PREFIX}:{qname}"
count = await rds.llen(qkey)
if count > 0:
await rds.delete(qkey)
flushed_counts[qname] = count
msg = f"Pipeline {'enabled' if enabled else 'disabled'}"
if flushed_counts:
total = sum(flushed_counts.values())
msg += f" — flushed {total} queued jobs"
return {"pipeline_enabled": enabled, "flushed": flushed_counts, "message": msg}
@app.get("/api/ops/sources/coverage-gaps") @app.get("/api/ops/sources/coverage-gaps")
+5
View File
@@ -27,6 +27,7 @@ from services.shared.redis_keys import (
QUEUE_AGGREGATION, QUEUE_AGGREGATION,
QUEUE_EXTRACTION, QUEUE_EXTRACTION,
QUEUE_MACRO_CLASSIFICATION, QUEUE_MACRO_CLASSIFICATION,
is_pipeline_enabled,
queue_key, queue_key,
) )
@@ -421,6 +422,10 @@ async def main() -> None:
try: try:
while True: while True:
if not await is_pipeline_enabled(redis_client):
await asyncio.sleep(1)
continue
# Alternate: every 3rd job from macro queue, rest from extraction # Alternate: every 3rd job from macro queue, rest from extraction
# This prevents macro events from starving regular extractions # This prevents macro events from starving regular extractions
raw = None raw = None
+4
View File
@@ -41,6 +41,7 @@ from services.shared.redis_keys import (
QUEUE_INGESTION, QUEUE_INGESTION,
QUEUE_PARSING, QUEUE_PARSING,
dedupe_key, dedupe_key,
is_pipeline_enabled,
queue_key, queue_key,
) )
from services.shared.storage import ( from services.shared.storage import (
@@ -265,6 +266,9 @@ async def main():
try: try:
while True: while True:
if not await is_pipeline_enabled(rds):
await asyncio.sleep(2)
continue
raw = await rds.lpop(queue) raw = await rds.lpop(queue)
if raw: if raw:
job = json.loads(raw) job = json.loads(raw)
+4 -1
View File
@@ -54,7 +54,7 @@ from services.lake_publisher.worker import (
from services.shared.config import load_config from services.shared.config import load_config
from services.shared.db import get_minio, get_pg_pool, get_redis from services.shared.db import get_minio, get_pg_pool, get_redis
from services.shared.logging import setup_logging from services.shared.logging import setup_logging
from services.shared.redis_keys import QUEUE_LAKE_PUBLISH, queue_key from services.shared.redis_keys import QUEUE_LAKE_PUBLISH, is_pipeline_enabled, queue_key
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
@@ -865,6 +865,9 @@ async def run_worker(
logger.info("Lake publisher worker started, listening on %s", queue) logger.info("Lake publisher worker started, listening on %s", queue)
while True: while True:
if not await is_pipeline_enabled(rds):
await asyncio.sleep(poll_interval)
continue
raw = await rds.lpop(queue) # type: ignore[misc] raw = await rds.lpop(queue) # type: ignore[misc]
if raw is None: if raw is None:
await asyncio.sleep(poll_interval) await asyncio.sleep(poll_interval)
+10 -1
View File
@@ -35,7 +35,13 @@ from services.shared.metrics import (
PARSE_LOW_QUALITY_TOTAL, PARSE_LOW_QUALITY_TOTAL,
PARSE_QUALITY_SCORE, PARSE_QUALITY_SCORE,
) )
from services.shared.redis_keys import QUEUE_EXTRACTION, QUEUE_MACRO_CLASSIFICATION, QUEUE_PARSING, queue_key from services.shared.redis_keys import (
QUEUE_EXTRACTION,
QUEUE_MACRO_CLASSIFICATION,
QUEUE_PARSING,
is_pipeline_enabled,
queue_key,
)
from services.shared.storage import upload_normalized_text, upload_parser_output from services.shared.storage import upload_normalized_text, upload_parser_output
logger = logging.getLogger("parser_worker") logger = logging.getLogger("parser_worker")
@@ -260,6 +266,9 @@ async def main() -> None:
try: try:
while True: while True:
if not await is_pipeline_enabled(rds):
await asyncio.sleep(2)
continue
raw = await rds.lpop(queue) raw = await rds.lpop(queue)
if raw: if raw:
job = json.loads(raw) job = json.loads(raw)
+5 -1
View File
@@ -12,7 +12,7 @@ from services.recommendation.worker import generate_recommendation
from services.shared.agent_config import AgentConfigResolver from services.shared.agent_config import AgentConfigResolver
from services.shared.config import OllamaConfig, load_config from services.shared.config import OllamaConfig, load_config
from services.shared.logging import setup_logging from services.shared.logging import setup_logging
from services.shared.redis_keys import QUEUE_RECOMMENDATION, queue_key from services.shared.redis_keys import QUEUE_RECOMMENDATION, is_pipeline_enabled, queue_key
logger = logging.getLogger("recommendation_main") logger = logging.getLogger("recommendation_main")
@@ -62,6 +62,10 @@ async def main() -> None:
try: try:
while True: while True:
if not await is_pipeline_enabled(redis_client):
await asyncio.sleep(1)
continue
raw = await redis_client.lpop(queue) raw = await redis_client.lpop(queue)
if raw is None: if raw is None:
await asyncio.sleep(1) await asyncio.sleep(1)
+2 -2
View File
@@ -20,7 +20,7 @@ from services.shared.config import load_config
from services.shared.db import get_pg_pool, get_redis from services.shared.db import get_pg_pool, get_redis
from services.shared.logging import setup_logging from services.shared.logging import setup_logging
from services.shared.redis_keys import ( from services.shared.redis_keys import (
PREFIX, PIPELINE_ENABLED_KEY,
QUEUE_EXTRACTION, QUEUE_EXTRACTION,
QUEUE_INGESTION, QUEUE_INGESTION,
QUEUE_MACRO_CLASSIFICATION, QUEUE_MACRO_CLASSIFICATION,
@@ -501,7 +501,7 @@ async def main() -> None:
rds = get_redis(config) rds = get_redis(config)
logger.info("Scheduler started (tick=%ds)", SCHEDULER_TICK) logger.info("Scheduler started (tick=%ds)", SCHEDULER_TICK)
pipeline_key = f"{PREFIX}:pipeline:enabled" pipeline_key = PIPELINE_ENABLED_KEY
# If PIPELINE_DEFAULT_OFF is set, initialize the toggle to OFF on first boot # If PIPELINE_DEFAULT_OFF is set, initialize the toggle to OFF on first boot
# (only if the key doesn't already exist — preserves manual overrides) # (only if the key doesn't already exist — preserves manual overrides)
+15
View File
@@ -89,3 +89,18 @@ def trading_cb_key(trigger_type: str) -> str:
def trading_notification_rate_key(channel: str) -> str: def trading_notification_rate_key(channel: str) -> str:
"""Return the notification rate-limit key for a given channel.""" """Return the notification rate-limit key for a given channel."""
return f"{TRADING_NOTIFICATION_RATE}:{channel}" return f"{TRADING_NOTIFICATION_RATE}:{channel}"
# --- Pipeline toggle ---
# Redis key that stores the pipeline on/off flag. is_pipeline_enabled()
# reads this key and treats the value "0" as "disabled"; a missing key or
# any other value counts as "enabled".
PIPELINE_ENABLED_KEY = f"{PREFIX}:pipeline:enabled"
async def is_pipeline_enabled(rds: "redis.asyncio.Redis") -> bool:  # type: ignore[name-defined]  # noqa: F821
    """Check whether the pipeline is enabled via the Redis toggle.

    Reads ``PIPELINE_ENABLED_KEY`` and returns True (enabled) when the key
    is absent or set to anything other than ``"0"``. Workers should call
    this at the top of each loop iteration and sleep when it returns False.

    Args:
        rds: Async Redis client; only its ``get`` coroutine is used.

    Returns:
        False only when the stored value is ``"0"`` (as ``str`` or as
        ``bytes``); True in every other case, including a missing key.
    """
    val = await rds.get(PIPELINE_ENABLED_KEY)
    # A client that does not decode responses hands back bytes, and in
    # Python 3 ``b"0" != "0"`` is always True -- without this normalisation
    # the "off" switch would be silently ignored for such clients.
    if isinstance(val, (bytes, bytearray)):
        val = val.decode("utf-8", errors="replace")
    return val != "0"