feat: pipeline on/off toggle with per-stage Helm control

- Added pipelineEnabled flag to Helm values (default: true)
- Worker services (scheduler, ingestion, parser, extractor, aggregation, recommendation, broker-adapter, lake-publisher) scale to 0 when disabled
- API services always run regardless of toggle
- Redis-based runtime toggle: POST /api/ops/pipeline/toggle
- Scheduler checks the flag before each cycle
- Frontend: green/red Pipeline ON/OFF button on the pipeline page
- Beta defaults to pipelineEnabled: false
- Base values.yaml: blanked external URLs (Ollama, Polygon, Alpaca) so stages only connect to what they explicitly configure
2026-04-21 00:21:53 +00:00
parent a19ed086fe
commit be526ae614
14 changed files with 923 additions and 104 deletions
@@ -227,3 +227,55 @@ jobs:
        with:
          name: inttest-results
          path: inttest-results.json
+
+  beta-gate:
+    needs: [integration-test]
+    if: github.event_name == 'push' && github.ref == 'refs/heads/main'
+    runs-on: self-hosted-gremlin
+    permissions:
+      contents: read
+      packages: read
+    steps:
+      - uses: actions/checkout@v5
+
+      - name: Install kubectl
+        run: |
+          if ! command -v kubectl &> /dev/null; then
+            curl -LO "https://dl.k8s.io/release/$(curl -L -s https://dl.k8s.io/release/stable.txt)/bin/linux/amd64/kubectl"
+            chmod +x kubectl
+            sudo mv kubectl /usr/local/bin/kubectl
+          fi
+          kubectl version --client
+
+      - name: Install Helm
+        run: |
+          if ! command -v helm &> /dev/null; then
+            curl -fsSL https://raw.githubusercontent.com/helm/helm/main/scripts/get-helm-3 | sudo bash
+          fi
+          helm version
+
+      - name: Configure kubectl
+        run: |
+          if [ -f /var/run/secrets/kubernetes.io/serviceaccount/token ]; then
+            kubectl config set-cluster in-cluster \
+              --server=https://kubernetes.default.svc \
+              --certificate-authority=/var/run/secrets/kubernetes.io/serviceaccount/ca.crt
+            kubectl config set-credentials runner \
+              --token="$(cat /var/run/secrets/kubernetes.io/serviceaccount/token)"
+            kubectl config set-context runner --cluster=in-cluster --user=runner
+            kubectl config use-context runner
+          fi
+          kubectl cluster-info || echo "WARNING: kubectl cannot reach cluster API"
+
+      - name: Run beta gate (deploy → test → promote)
+        run: |
+          bash infra/inttest/promote.sh \
+            --image-tag ${{ github.sha }} \
+            --results-file beta-gate-results.json
+
+      - name: Upload beta gate results
+        if: always()
+        uses: actions/upload-artifact@v4
+        with:
+          name: beta-gate-results
+          path: beta-gate-results.json
@@ -533,6 +533,14 @@ export function useRetryFailedExtractions() {
  });
 }

+/**
+ * Mutation hook for switching the processing pipeline on or off.
+ *
+ * Calling `mutate(enabled)` POSTs `{ enabled }` to `/api/ops/pipeline/toggle`.
+ * On success every query under the `pipeline-health` key is invalidated so
+ * that consumers refetch the current state.
+ */
+export function usePipelineToggle() {
+  const queryClient = useQueryClient();
+
+  const postToggle = (enabled: boolean) =>
+    apiPost<{ pipeline_enabled: boolean }>('query', '/api/ops/pipeline/toggle', { enabled });
+
+  const refreshHealth = () => queryClient.invalidateQueries({ queryKey: ['pipeline-health'] });
+
+  return useMutation({ mutationFn: postToggle, onSuccess: refreshHealth });
+}
+
 export function useIngestionSummary(hours = 24) {
  return useGet<Record<string, unknown>>(['ingestion-summary', hours], 'query', `/api/ops/ingestion/summary?hours=${hours}`);
 }
@@ -1,5 +1,5 @@
 import { useState, useEffect } from 'react';
-import { usePipelineHealth, useRetryFailedExtractions } from '../api/hooks';
+import { usePipelineHealth, useRetryFailedExtractions, usePipelineToggle } from '../api/hooks';
 import { LoadingSpinner, DateRangeSelector, Card } from '../components/ui';

 const QUEUE_LABELS: Record<string, string> = {
@@ -54,12 +54,14 @@ export function OpsPipelinePage() {
  const { data, isLoading } = usePipelineHealth(hours);
  const stream = usePipelineStream();
  const retryMutation = useRetryFailedExtractions();
+  const toggleMutation = usePipelineToggle();

  if (isLoading) return <LoadingSpinner />;

  const parsing = (data?.parsing ?? {}) as Record<string, unknown>;
  const extraction = (data?.extraction ?? {}) as Record<string, unknown>;
  const aggregation = (data?.aggregation ?? {}) as Record<string, unknown>;
+  const pipelineEnabled = (data?.pipeline_enabled ?? true) as boolean;

  // Prefer live stream data for queue depths and doc stages, fall back to initial fetch
  const queueDepths = stream?.queue_depths
@@ -82,6 +84,14 @@ export function OpsPipelinePage() {
      <div className="flex items-center justify-between">
        <h1 className="text-xl font-semibold text-gray-100">Pipeline Health</h1>
        <div className="flex items-center gap-3">
+          <button
+            type="button"
+            onClick={() => toggleMutation.mutate(!pipelineEnabled)}
+            disabled={toggleMutation.isPending}
+            className={`rounded-md px-3 py-1.5 text-xs font-medium text-white ${pipelineEnabled ? 'bg-green-600 hover:bg-green-500' : 'bg-red-600 hover:bg-red-500'} disabled:opacity-50`}
+          >
+            {toggleMutation.isPending ? '…' : pipelineEnabled ? 'Pipeline ON' : 'Pipeline OFF'}
+          </button>
          {failedCount > 0 && (
            <button
              type="button"
@@ -106,6 +106,8 @@ export const handlers = [
  http.delete('/api/admin/trading/lockouts/:id', () => HttpResponse.json({ status: 'deleted' })),
  http.get('/api/ops/pipeline/health', () => HttpResponse.json({ hours: 24, document_stages: [{ status: 'extracted', doc_count: 5 }], parsing: {}, extraction: {}, aggregation: {}, queue_depths: {} })),
  http.post('/api/ops/pipeline/retry-failed', () => HttpResponse.json({ retried: 10, message: 'Re-enqueued 10 documents for extraction' })),
+  http.get('/api/ops/pipeline/toggle', () => HttpResponse.json({ pipeline_enabled: true })),
+  http.post('/api/ops/pipeline/toggle', () => HttpResponse.json({ pipeline_enabled: true })),
  http.get('/api/ops/ingestion/summary', () => HttpResponse.json({ total_runs: 10, completed: 8, failed: 2, total_items_fetched: 50, total_items_new: 12, by_source_type: [] })),
  http.get('/api/ops/ingestion/throughput', () => HttpResponse.json([])),
  http.get('/api/ops/model/performance', () => HttpResponse.json({ total_extractions: 20, success_rate: 0.9, avg_duration_ms: 1500, retry_rate: 0.05, avg_confidence: 0.8 })),
@@ -11,7 +11,7 @@ metadata:
    {{- include "stonks.labels" $root | nindent 4 }}
    stonks-oracle/tier: {{ $svc.tier }}
 spec:
-  replicas: {{ $svc.replicas }}
+  replicas: {{ if and (hasKey $svc "pipeline") $svc.pipeline (not $root.Values.pipelineEnabled) }}0{{ else }}{{ $svc.replicas }}{{ end }}
  selector:
    matchLabels:
      app: {{ $svc.image }}
@@ -6,31 +6,11 @@
 image:
  tag: latest

-## Single replica for API services, disable pipeline workers
-## Beta is for API testing only — no ingestion/extraction/aggregation
+## Pipeline OFF by default — beta is for API testing only
+pipelineEnabled: false
+
+## Single replica for API services
 services:
-  scheduler:
-    replicas: 0
-  symbolRegistry:
-    replicas: 1
-  ingestion:
-    replicas: 0
-  parser:
-    replicas: 0
-  extractor:
-    replicas: 0
-  aggregation:
-    replicas: 0
-  recommendation:
-    replicas: 0
-  tradingEngine:
-    replicas: 1
-  riskEngine:
-    replicas: 1
-  brokerAdapter:
-    replicas: 0
-  lakePublisher:
-    replicas: 0
  queryApi:
    replicas: 1
  dashboard:
@@ -4,10 +4,16 @@ image:
  pullPolicy: Always
  tag: latest

+## Pipeline toggle — when false, all worker services (ingestion, parsing,
+## extraction, aggregation, recommendation, broker, lake-publisher, scheduler)
+## are scaled to 0. API services always run.
+pipelineEnabled: true
+
 ## Service deployments — replicas and resource overrides
 services:
  scheduler:
    replicas: 1
+    pipeline: true
    image: scheduler
    command: "python -m services.scheduler.app"
    tier: orchestration
@@ -32,6 +38,7 @@ services:

  ingestion:
    replicas: 2
+    pipeline: true
    image: ingestion
    command: "python -m services.ingestion.worker"
    tier: ingestion
@@ -42,6 +49,7 @@ services:

  parser:
    replicas: 2
+    pipeline: true
    image: parser
    command: "python -m services.parser.worker"
    tier: processing
@@ -52,6 +60,7 @@ services:

  extractor:
    replicas: 1
+    pipeline: true
    image: extractor
    command: "python -m services.extractor.main"
    tier: processing
@@ -62,6 +71,7 @@ services:

  aggregation:
    replicas: 4
+    pipeline: true
    image: aggregation
    command: "python -m services.aggregation.main"
    tier: processing
@@ -72,6 +82,7 @@ services:

  recommendation:
    replicas: 1
+    pipeline: true
    image: recommendation
    command: "python -m services.recommendation.main"
    tier: processing
@@ -107,6 +118,7 @@ services:

  brokerAdapter:
    replicas: 1
+    pipeline: true
    image: broker-adapter
    command: "python -m services.adapters.broker_service"
    tier: trading
@@ -117,6 +129,7 @@ services:

  lakePublisher:
    replicas: 1
+    pipeline: true
    image: lake-publisher
    command: "python -m services.lake_publisher.jobs"
    tier: analytics
@@ -0,0 +1,409 @@
+#!/bin/bash
+# Beta-to-Paper promotion gate
+#
+# Deploys the given image tag to the beta namespace, runs integration tests
+# against the live beta services, and promotes to paper-trading if all pass.
+#
+# This script is the single source of truth for the promotion decision.
+# CI calls it; humans can call it too.
+#
+# Usage: bash infra/inttest/promote.sh [OPTIONS]
+#
+# Options:
+#   --image-tag TAG       Docker image tag to deploy (required)
+#   --skip-promote        Run tests but don't promote even if green
+#   --skip-teardown       Leave beta namespace running after tests
+#   --results-file PATH   Path for JSON results output (default: beta-gate-results.json)
+#   --timeout SECONDS     Max wait for services to become ready (default: 180)
+#   -h, --help            Show usage
+#
+# Exit codes:
+#   0  All tests passed, promotion succeeded (or --skip-promote)
+#   1  Test failures — promotion blocked
+#   2  Infrastructure/deployment failure
+#   3  Promotion step failed (tests passed but helm upgrade failed)
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+REPO_ROOT="$(cd "$SCRIPT_DIR/../.." && pwd)"
+
+# ── Defaults ─────────────────────────────────────────────────────────────────
+IMAGE_TAG=""
+SKIP_PROMOTE=false
+SKIP_TEARDOWN=false
+RESULTS_FILE="beta-gate-results.json"
+READY_TIMEOUT=180
+BETA_NAMESPACE="stonks-oracle-beta"
+PAPER_NAMESPACE="stonks-oracle"
+
+# ── Helpers ──────────────────────────────────────────────────────────────────
+usage() {
+  cat <<EOF
+Usage: bash infra/inttest/promote.sh [OPTIONS]
+
+Options:
+  --image-tag TAG       Docker image tag to deploy (required)
+  --skip-promote        Run tests but don't promote even if green
+  --skip-teardown       Leave beta namespace running after tests
+  --results-file PATH   Path for JSON results output (default: beta-gate-results.json)
+  --timeout SECONDS     Max wait for services to become ready (default: 180)
+  -h, --help            Show usage
+
+Exit codes:
+  0  All tests passed, promotion succeeded (or --skip-promote)
+  1  Test failures — promotion blocked
+  2  Infrastructure/deployment failure
+  3  Promotion step failed (tests passed but helm upgrade failed)
+EOF
+  exit 0
+}
+
+log() {
+  echo "[$(date -u +"%H:%M:%S")] [beta-gate] $*"
+}
+
+die() {
+  log "FATAL: $*"
+  exit 2
+}
+
+# ── Parse CLI args ───────────────────────────────────────────────────────────
+while [[ $# -gt 0 ]]; do
+  case $1 in
+    --image-tag)
+      IMAGE_TAG="$2"
+      shift 2
+      ;;
+    --skip-promote)
+      SKIP_PROMOTE=true
+      shift
+      ;;
+    --skip-teardown)
+      SKIP_TEARDOWN=true
+      shift
+      ;;
+    --results-file)
+      RESULTS_FILE="$2"
+      shift 2
+      ;;
+    --timeout)
+      READY_TIMEOUT="$2"
+      shift 2
+      ;;
+    -h|--help)
+      usage
+      ;;
+    *)
+      echo "Unknown option: $1"
+      usage
+      ;;
+  esac
+done
+
+if [ -z "$IMAGE_TAG" ]; then
+  echo "ERROR: --image-tag is required"
+  usage
+fi
+
+STARTED_AT=$(date -u +"%Y-%m-%dT%H:%M:%SZ")
+PIPELINE_START=$(date +%s)
+
+log "Beta gate starting"
+log "  Image tag:     $IMAGE_TAG"
+log "  Beta NS:       $BETA_NAMESPACE"
+log "  Paper NS:      $PAPER_NAMESPACE"
+log "  Skip promote:  $SKIP_PROMOTE"
+
+# ══════════════════════════════════════════════════════════════════════════════
+# Stage 1: Deploy to beta namespace
+# ══════════════════════════════════════════════════════════════════════════════
+log "▶ Stage 1: Deploy to beta"
+
+# Ensure beta namespace exists
+kubectl create namespace "$BETA_NAMESPACE" 2>/dev/null || true
+
+# Create beta database if it doesn't exist
+log "Ensuring beta database exists ..."
+kubectl exec -n postgresql-service postgresql-1 -c postgres -- \
+  psql -U postgres -tc "SELECT 1 FROM pg_database WHERE datname = 'stonks_beta'" \
+  | grep -q 1 || \
+kubectl exec -n postgresql-service postgresql-1 -c postgres -- \
+  psql -U postgres -c "CREATE DATABASE stonks_beta OWNER stonks;" 2>/dev/null || true
+
+# Apply migrations to beta database
+log "Applying migrations to beta database ..."
+for migration in $(ls "$REPO_ROOT/infra/migrations/"*.sql | sort); do
+  kubectl exec -n postgresql-service postgresql-1 -c postgres -- \
+    psql -U stonks -d stonks_beta -f - < "$migration" 2>/dev/null || true
+done
+
+# Deploy via Helm with beta values
+log "Helm upgrade to beta namespace ..."
+if ! helm upgrade --install stonks-oracle-beta \
+    "$REPO_ROOT/infra/helm/stonks-oracle" \
+    -n "$BETA_NAMESPACE" \
+    -f "$REPO_ROOT/infra/helm/stonks-oracle/values-beta.yaml" \
+    --set "image.tag=$IMAGE_TAG" \
+    --wait \
+    --timeout "${READY_TIMEOUT}s"; then
+  log "Helm deploy to beta failed"
+  DEPLOY_STATUS="failed"
+else
+  DEPLOY_STATUS="ok"
+fi
+
+if [ "$DEPLOY_STATUS" != "ok" ]; then
+  log "Beta deployment failed — checking pod status"
+  kubectl get pods -n "$BETA_NAMESPACE" -o wide 2>&1 || true
+  kubectl get events -n "$BETA_NAMESPACE" --sort-by='.lastTimestamp' 2>&1 | tail -20 || true
+
+  cat > "$RESULTS_FILE" <<EOF
+{
+  "run_id": "beta-gate-${IMAGE_TAG}",
+  "image_tag": "${IMAGE_TAG}",
+  "started_at": "${STARTED_AT}",
+  "completed_at": "$(date -u +"%Y-%m-%dT%H:%M:%SZ")",
+  "exit_code": 2,
+  "stage": "deploy",
+  "deploy_status": "failed",
+  "test_status": "skipped",
+  "promote_status": "blocked",
+  "tests": {"total": 0, "passed": 0, "failed": 0, "errors": 0}
+}
+EOF
+  exit 2
+fi
+
+log "✓ Beta deployment ready"
+
+# ══════════════════════════════════════════════════════════════════════════════
+# Stage 2: Seed beta database
+# ══════════════════════════════════════════════════════════════════════════════
+log "▶ Stage 2: Seed beta data"
+
+# Run seed against beta database via a temporary pod
+SEED_IMAGE="registry.celestium.life/stonks-oracle/query-api:${IMAGE_TAG}"
+
+# Clean up any previous seed pod
+kubectl delete pod seed-beta -n "$BETA_NAMESPACE" --ignore-not-found > /dev/null 2>&1 || true
+
+if ! kubectl run seed-beta \
+    --image="$SEED_IMAGE" \
+    --restart=Never \
+    --rm \
+    --attach \
+    --pod-running-timeout=3m \
+    --namespace="$BETA_NAMESPACE" \
+    --image-pull-policy=Always \
+    --env="POSTGRES_HOST=postgresql-rw.postgresql-service.svc.cluster.local" \
+    --env="POSTGRES_PORT=5432" \
+    --env="POSTGRES_DB=stonks_beta" \
+    --env="POSTGRES_USER=stonks" \
+    --env="POSTGRES_PASSWORD=St0nks0racl3!" \
+    --env="MINIO_ENDPOINT=minio.minio-service.svc.cluster.local:80" \
+    --env="MINIO_SECURE=false" \
+    --env="MINIO_ACCESS_KEY=minioadmin" \
+    --env="MINIO_SECRET_KEY=minioadmin" \
+    --command -- python -m tests.integration.seed_sandbox 2>/dev/null; then
+  log "WARNING: Seed may have partially failed (could be idempotent re-run)"
+fi
+
+log "✓ Beta data seeded"
+
+# ══════════════════════════════════════════════════════════════════════════════
+# Stage 3: Run integration tests against beta
+# ══════════════════════════════════════════════════════════════════════════════
+log "▶ Stage 3: Run integration tests"
+
+# Determine service URLs within the beta namespace
+QUERY_API_URL="http://query-api.${BETA_NAMESPACE}.svc.cluster.local:8000"
+REGISTRY_API_URL="http://symbol-registry.${BETA_NAMESPACE}.svc.cluster.local:8000"
+RISK_API_URL="http://risk.${BETA_NAMESPACE}.svc.cluster.local:8000"
+TRADING_API_URL="http://trading-engine.${BETA_NAMESPACE}.svc.cluster.local:8000"
+
+# Clean up any previous runner
+kubectl delete pod beta-test-runner -n "$BETA_NAMESPACE" --ignore-not-found > /dev/null 2>&1 || true
+
+# Run tests as a pod in the beta namespace
+log "Starting test runner pod ..."
+kubectl run beta-test-runner \
+    --image="$SEED_IMAGE" \
+    --restart=Never \
+    --namespace="$BETA_NAMESPACE" \
+    --image-pull-policy=Always \
+    --env="QUERY_API_URL=$QUERY_API_URL" \
+    --env="REGISTRY_API_URL=$REGISTRY_API_URL" \
+    --env="RISK_API_URL=$RISK_API_URL" \
+    --env="TRADING_API_URL=$TRADING_API_URL" \
+    --env="POSTGRES_HOST=postgresql-rw.postgresql-service.svc.cluster.local" \
+    --env="POSTGRES_PORT=5432" \
+    --env="POSTGRES_DB=stonks_beta" \
+    --env="POSTGRES_USER=stonks" \
+    --env="POSTGRES_PASSWORD=St0nks0racl3!" \
+    --env="REDIS_HOST=redis-master.redis-service.svc.cluster.local" \
+    --env="REDIS_PORT=6379" \
+    --env="REDIS_DB=1" \
+    --env="REDIS_PASSWORD=" \
+    --env="BROKER_MODE=paper" \
+    --env="LOG_LEVEL=INFO" \
+    --command -- python -m pytest tests/integration/ -v --tb=short -q
+
+# Wait for the test runner to complete
+log "Waiting for test runner (timeout: 600s) ..."
+TEST_EXIT_CODE=0
+if ! kubectl wait --for=condition=Ready=false pod/beta-test-runner \
+    -n "$BETA_NAMESPACE" --timeout=600s 2>/dev/null; then
+  # Pod may have already completed — check its status
+  true
+fi
+
+# Wait for pod to reach terminal state
+for i in $(seq 1 120); do
+  POD_PHASE=$(kubectl get pod beta-test-runner -n "$BETA_NAMESPACE" \
+    -o jsonpath='{.status.phase}' 2>/dev/null || echo "Unknown")
+  if [ "$POD_PHASE" = "Succeeded" ] || [ "$POD_PHASE" = "Failed" ]; then
+    break
+  fi
+  sleep 5
+done
+
+# Collect results
+TEST_OUTPUT=$(kubectl logs beta-test-runner -n "$BETA_NAMESPACE" 2>/dev/null || true)
+POD_PHASE=$(kubectl get pod beta-test-runner -n "$BETA_NAMESPACE" \
+  -o jsonpath='{.status.phase}' 2>/dev/null || echo "Unknown")
+
+if [ "$POD_PHASE" = "Failed" ]; then
+  TEST_EXIT_CODE=1
+fi
+
+# Parse test counts
+TESTS_PASSED=$(echo "$TEST_OUTPUT" | grep -oP '\d+(?= passed)' | tail -1 || echo "0")
+TESTS_FAILED=$(echo "$TEST_OUTPUT" | grep -oP '\d+(?= failed)' | tail -1 || echo "0")
+TESTS_ERRORS=$(echo "$TEST_OUTPUT" | grep -oP '\d+(?= error)' | tail -1 || echo "0")
+TESTS_PASSED=${TESTS_PASSED:-0}
+TESTS_FAILED=${TESTS_FAILED:-0}
+TESTS_ERRORS=${TESTS_ERRORS:-0}
+TESTS_TOTAL=$(( TESTS_PASSED + TESTS_FAILED + TESTS_ERRORS ))
+
+log "Test results: ${TESTS_PASSED} passed, ${TESTS_FAILED} failed, ${TESTS_ERRORS} errors"
+
+# Print test output for CI visibility
+if [ -n "$TEST_OUTPUT" ]; then
+  echo "─── Test Output ───"
+  echo "$TEST_OUTPUT" | tail -60
+  echo "─── End Test Output ───"
+fi
+
+# Clean up test runner
+kubectl delete pod beta-test-runner -n "$BETA_NAMESPACE" --ignore-not-found > /dev/null 2>&1 || true
+
+# ══════════════════════════════════════════════════════════════════════════════
+# Stage 4: Promotion decision
+# ══════════════════════════════════════════════════════════════════════════════
+# Decide whether to promote. PROMOTE_STATUS and FINAL_EXIT are consumed by the
+# cleanup and results-writing steps that follow.
+PROMOTE_STATUS="blocked"
+FINAL_EXIT=0
+
+if [ "$TESTS_FAILED" -gt 0 ] || [ "$TESTS_ERRORS" -gt 0 ] || [ "$TEST_EXIT_CODE" -ne 0 ]; then
+  log "✗ GATE FAILED — ${TESTS_FAILED} failures, ${TESTS_ERRORS} errors"
+  log "  Promotion to paper-trading BLOCKED"
+  PROMOTE_STATUS="blocked"
+  FINAL_EXIT=1
+elif [ "$TESTS_PASSED" -eq 0 ]; then
+  # No "N passed" line was found in the runner output (runner never finished,
+  # logs unavailable, or nothing was collected). A gate must not go green
+  # without evidence of at least one passing test.
+  log "✗ GATE FAILED — no passing tests were observed in the runner output"
+  log "  Promotion to paper-trading BLOCKED"
+  PROMOTE_STATUS="blocked"
+  FINAL_EXIT=1
+elif [ "$SKIP_PROMOTE" = true ]; then
+  log "✓ Tests passed — promotion skipped (--skip-promote)"
+  PROMOTE_STATUS="skipped"
+  FINAL_EXIT=0
+else
+  log "▶ Stage 4: Promoting to paper-trading"
+  log "  Upgrading $PAPER_NAMESPACE with image tag $IMAGE_TAG ..."
+
+  if helm upgrade --install stonks-oracle \
+      "$REPO_ROOT/infra/helm/stonks-oracle" \
+      -n "$PAPER_NAMESPACE" \
+      --set "image.tag=$IMAGE_TAG" \
+      --wait \
+      --timeout 300s; then
+    log "✓ PROMOTED — paper-trading now running $IMAGE_TAG"
+    PROMOTE_STATUS="promoted"
+    FINAL_EXIT=0
+
+    # Rolling restart to pick up new images (best effort; failures are ignored)
+    log "Rolling restart of API services ..."
+    kubectl rollout restart deployment/query-api deployment/symbol-registry \
+      deployment/trading-engine deployment/risk-engine \
+      deployment/aggregation deployment/recommendation \
+      -n "$PAPER_NAMESPACE" 2>/dev/null || true
+  else
+    log "✗ Promotion failed — helm upgrade error"
+    PROMOTE_STATUS="failed"
+    FINAL_EXIT=3
+  fi
+fi
+
+# ══════════════════════════════════════════════════════════════════════════════
+# Stage 5: Cleanup beta (optional)
+# ══════════════════════════════════════════════════════════════════════════════
+if [ "$SKIP_TEARDOWN" = false ] && [ "$PROMOTE_STATUS" = "promoted" ]; then
+  log "Scaling down beta deployment (keeping namespace for next run) ..."
+  helm upgrade stonks-oracle-beta \
+    "$REPO_ROOT/infra/helm/stonks-oracle" \
+    -n "$BETA_NAMESPACE" \
+    -f "$REPO_ROOT/infra/helm/stonks-oracle/values-beta.yaml" \
+    --set "image.tag=$IMAGE_TAG" \
+    --set "services.queryApi.replicas=0" \
+    --set "services.symbolRegistry.replicas=0" \
+    --set "services.tradingEngine.replicas=0" \
+    --set "services.riskEngine.replicas=0" \
+    --set "services.scheduler.replicas=0" \
+    --set "services.ingestion.replicas=0" \
+    --set "services.parser.replicas=0" \
+    --set "services.extractor.replicas=0" \
+    --set "services.aggregation.replicas=0" \
+    --set "services.recommendation.replicas=0" \
+    --set "services.brokerAdapter.replicas=0" \
+    --set "services.lakePublisher.replicas=0" \
+    --set "services.dashboard.replicas=0" \
+    2>/dev/null || true
+  log "Beta scaled to zero"
+fi
+
+# ══════════════════════════════════════════════════════════════════════════════
+# Write results
+# ══════════════════════════════════════════════════════════════════════════════
+COMPLETED_AT=$(date -u +"%Y-%m-%dT%H:%M:%SZ")
+PIPELINE_END=$(date +%s)
+PIPELINE_DURATION=$(( PIPELINE_END - PIPELINE_START ))
+
+cat > "$RESULTS_FILE" <<EOF
+{
+  "run_id": "beta-gate-${IMAGE_TAG}",
+  "image_tag": "${IMAGE_TAG}",
+  "started_at": "${STARTED_AT}",
+  "completed_at": "${COMPLETED_AT}",
+  "duration_s": ${PIPELINE_DURATION},
+  "exit_code": ${FINAL_EXIT},
+  "deploy_status": "${DEPLOY_STATUS}",
+  "test_status": "$([ "$FINAL_EXIT" -le 1 ] && echo "completed" || echo "error")",
+  "promote_status": "${PROMOTE_STATUS}",
+  "tests": {
+    "total": ${TESTS_TOTAL},
+    "passed": ${TESTS_PASSED},
+    "failed": ${TESTS_FAILED},
+    "errors": ${TESTS_ERRORS}
+  }
+}
+EOF
+
+log "Results written to $RESULTS_FILE"
+echo ""
+log "═══════════════════════════════════════════════════"
+log "  Beta Gate Summary"
+log "═══════════════════════════════════════════════════"
+log "  Image:       $IMAGE_TAG"
+log "  Duration:    ${PIPELINE_DURATION}s"
+log "  Tests:       ${TESTS_PASSED}/${TESTS_TOTAL} passed"
+log "  Promotion:   ${PROMOTE_STATUS}"
+log "  Exit code:   $FINAL_EXIT"
+log "═══════════════════════════════════════════════════"
+echo ""
+
+exit "$FINAL_EXIT"
@@ -65,6 +65,11 @@ trivy:
 metrics:
  enabled: false

+# Enable Redis cache layer for faster manifest lookups (avoids upstream checks)
+cache:
+  enabled: true
+  expireHours: 24
+
 # Resource limits — conservative for a 4-node cluster
 core:
  resources:
@@ -41,7 +41,7 @@ from services.shared.audit import get_entity_audit_trail, get_order_audit_trail,
 from services.shared.config import load_config
 from services.shared.db import get_pg_pool, get_redis
 from services.shared.logging import new_trace_id, set_trace_context, setup_logging
-from services.shared.redis_keys import QUEUE_PREFIX, queue_key
+from services.shared.redis_keys import PREFIX, QUEUE_PREFIX, queue_key
 from services.shared.schemas import MAJOR_DECISION_CATALYSTS

 logger = logging.getLogger("query_api")
@@ -1787,8 +1787,13 @@ async def get_pipeline_health(
            except Exception:
                pass

+    # Pipeline enabled flag
+    pipeline_flag = await rds.get(_PIPELINE_ENABLED_KEY) if rds else None
+    pipeline_enabled = pipeline_flag != "0" if pipeline_flag is not None else True
+
    return {
        "hours": hours,
+        "pipeline_enabled": pipeline_enabled,
        "document_stages": [_row_to_dict(r) for r in doc_stages],
        "parsing": _row_to_dict(parse_quality) if parse_quality else {},
        "extraction": _row_to_dict(extraction_stats) if extraction_stats else {},
@@ -1927,6 +1932,34 @@ async def retry_failed_extractions_endpoint():
    return {"retried": len(doc_ids), "message": f"Re-enqueued {len(doc_ids)} documents for extraction"}


+# ---------------------------------------------------------------------------
+# Pipeline On/Off Toggle
+# ---------------------------------------------------------------------------
+
+_PIPELINE_ENABLED_KEY = f"{PREFIX}:pipeline:enabled"
+
+
+@app.get("/api/ops/pipeline/toggle")
+async def get_pipeline_toggle():
+    """Get the current pipeline enabled/disabled state."""
+    val = await rds.get(_PIPELINE_ENABLED_KEY)
+    # Default to enabled if key doesn't exist
+    enabled = val != "0"
+    return {"pipeline_enabled": enabled}
+
+
+@app.post("/api/ops/pipeline/toggle")
+async def set_pipeline_toggle(body: dict[str, Any]):
+    """Toggle the pipeline on or off.
+
+    Accepts: { "enabled": true/false }
+    Workers check this flag before processing jobs.
+    """
+    enabled = body.get("enabled", True)
+    await rds.set(_PIPELINE_ENABLED_KEY, "1" if enabled else "0")
+    return {"pipeline_enabled": enabled, "message": f"Pipeline {'enabled' if enabled else 'disabled'}"}
+
+
@app.get("/api/ops/sources/coverage-gaps")
 async def get_source_coverage_gaps():
    """Identify symbols with missing or insufficient source coverage.
@@ -19,6 +19,7 @@ from services.shared.config import load_config
 from services.shared.db import get_pg_pool, get_redis
 from services.shared.logging import setup_logging
 from services.shared.redis_keys import (
+    PREFIX,
    QUEUE_EXTRACTION,
    QUEUE_INGESTION,
    QUEUE_MACRO_CLASSIFICATION,
@@ -499,12 +500,19 @@ async def main() -> None:
    rds = get_redis(config)

    logger.info("Scheduler started (tick=%ds)", SCHEDULER_TICK)
+    pipeline_key = f"{PREFIX}:pipeline:enabled"
    recovery_counter = 0
    retry_counter = 0
    cleanup_counter = 0
    try:
        while True:
            try:
+                # Check pipeline toggle — skip cycle if disabled
+                flag = await rds.get(pipeline_key)
+                if flag == "0":
+                    await asyncio.sleep(SCHEDULER_TICK)
+                    continue
+
                if await acquire_lock(rds, "scheduler_cycle", ttl=30):
                    try:
                        await schedule_cycle(pool, rds)
@@ -1,77 +0,0 @@
-"""Database migration runner using asyncpg.
-
-Applies all SQL migration files from infra/migrations/ in sorted order.
-Each file is split on semicolons and executed statement-by-statement.
-Idempotent — migrations use IF NOT EXISTS / CREATE OR REPLACE patterns.
-
-Usage:
-    python -m services.shared.migrate
-"""
-import asyncio
-import glob
-import logging
-import os
-import sys
-
-import asyncpg
-
-logger = logging.getLogger("migrate")
-
-
-async def run_migrations() -> None:
-    host = os.getenv("POSTGRES_HOST", "localhost")
-    port = int(os.getenv("POSTGRES_PORT", "5432"))
-    user = os.getenv("POSTGRES_USER", "stonks")
-    password = os.getenv("POSTGRES_PASSWORD", "")
-    database = os.getenv("POSTGRES_DB", "stonks")
-
-    migrations_dir = os.path.join(
-        os.path.dirname(__file__), "..", "..", "infra", "migrations"
-    )
-    migrations_dir = os.path.normpath(migrations_dir)
-
-    if not os.path.isdir(migrations_dir):
-        logger.error("Migrations directory not found: %s", migrations_dir)
-        sys.exit(1)
-
-    files = sorted(glob.glob(os.path.join(migrations_dir, "*.sql")))
-    if not files:
-        logger.warning("No migration files found in %s", migrations_dir)
-        return
-
-    logger.info("Connecting to %s@%s:%d/%s", user, host, port, database)
-    conn = await asyncpg.connect(
-        host=host, port=port, user=user, password=password, database=database
-    )
-
-    try:
-        for path in files:
-            name = os.path.basename(path)
-            with open(path) as f:
-                sql = f.read()
-            # Split on semicolons and execute each statement individually.
-            # asyncpg.execute() doesn't support multi-statement strings.
-            statements = [s.strip() for s in sql.split(";") if s.strip()]
-            try:
-                for stmt in statements:
-                    await conn.execute(stmt)
-                logger.info("  ✓ %s (%d statements)", name, len(statements))
-            except Exception as exc:
-                logger.warning("  ⚠ %s: %s", name, exc)
-    finally:
-        await conn.close()
-
-    logger.info("Migrations complete (%d files)", len(files))
-
-
-def main() -> None:
-    logging.basicConfig(
-        level=logging.INFO,
-        format="%(asctime)s %(name)s %(message)s",
-        datefmt="%H:%M:%S",
-    )
-    asyncio.run(run_migrations())
-
-
-if __name__ == "__main__":
-    main()
@@ -0,0 +1,376 @@
+"""Integration tests for cross-service signal flow contracts.
+
+These tests validate the end-to-end data flow that the trading engine
+depends on. They catch schema drift and contract violations between
+services that unit tests miss.
+
+Flow under test:
+  1. Symbol Registry has companies with exposure profiles and competitors
+  2. Query API returns trends with correct schema for trading engine consumption
+  3. Risk engine evaluates orders using data from query API
+  4. Trading engine receives valid recommendation payloads
+
+These are the "beta gate" tests — if any fail, promotion to paper is blocked.
+"""
+
+import pytest
+
+pytestmark = pytest.mark.asyncio
+
+
+# ---------------------------------------------------------------------------
+# Contract: Symbol Registry → Query API company data consistency
+# ---------------------------------------------------------------------------
+
+
+class TestRegistryToQueryContract:
+    """Registry and query API must agree on company data.
+
+    Covers company ID/ticker consistency and the registry's exposure-profile
+    and competitor endpoints.
+    """
+
+    @staticmethod
+    def _partner_ids(relationships, own_id):
+        """Return the set of IDs on the opposite side of each relationship."""
+        return {
+            rel["company_b_id"] if rel.get("company_a_id") == own_id else rel["company_a_id"]
+            for rel in relationships
+        }
+
+    async def test_company_ids_consistent(self, registry_client, query_client, seed_ids):
+        """Each registry company is returned by the query API with the same ticker."""
+        registry_response = await registry_client.get("/companies")
+        assert registry_response.status_code == 200
+        registry_tickers = {}
+        for company in registry_response.json():
+            registry_tickers[company["id"]] = company["ticker"]
+
+        api_response = await query_client.get("/api/companies")
+        assert api_response.status_code == 200
+        api_tickers = {}
+        for company in api_response.json():
+            api_tickers[company["id"]] = company["ticker"]
+
+        # The query API may know more companies, but never fewer.
+        for cid in registry_tickers:
+            ticker = registry_tickers[cid]
+            assert cid in api_tickers, (
+                f"Company {ticker} ({cid}) in registry but missing from query API"
+            )
+            assert api_tickers[cid] == ticker
+
+    async def test_exposure_profiles_accessible(self, registry_client, seed_ids):
+        """The AAPL exposure profile exposes every field used for macro scoring."""
+        aapl = seed_ids["companies"]["AAPL"]
+        response = await registry_client.get(f"/companies/{aapl}/exposure")
+        assert response.status_code == 200
+        profile = response.json()
+
+        # Presence first, so a missing key fails before any type check runs.
+        required_fields = (
+            "geographic_revenue_mix",
+            "supply_chain_regions",
+            "key_input_commodities",
+            "market_position_tier",
+            "export_dependency_pct",
+        )
+        for field_name in required_fields:
+            assert field_name in profile
+
+        # Then value types and the fractional range of the export dependency.
+        assert isinstance(profile["geographic_revenue_mix"], dict)
+        assert isinstance(profile["supply_chain_regions"], list)
+        export_share = profile["export_dependency_pct"]
+        assert isinstance(export_share, (int, float))
+        assert 0 <= export_share <= 1
+
+    async def test_competitor_relationships_bidirectional(self, registry_client, seed_ids):
+        """AAPL lists MSFT as a competitor and MSFT lists AAPL."""
+        company_ids = seed_ids["companies"]
+        aapl_id = company_ids["AAPL"]
+        msft_id = company_ids["MSFT"]
+
+        from_aapl = await registry_client.get(f"/companies/{aapl_id}/competitors")
+        assert from_aapl.status_code == 200
+        aapl_relationships = from_aapl.json()
+
+        from_msft = await registry_client.get(f"/companies/{msft_id}/competitors")
+        assert from_msft.status_code == 200
+        msft_relationships = from_msft.json()
+
+        # A relationship row names both companies; pick whichever is not the owner.
+        aapl_partner_ids = self._partner_ids(aapl_relationships, aapl_id)
+        msft_partner_ids = self._partner_ids(msft_relationships, msft_id)
+
+        assert msft_id in aapl_partner_ids, "MSFT not in AAPL's competitors"
+        assert aapl_id in msft_partner_ids, "AAPL not in MSFT's competitors"
+
+
+# ---------------------------------------------------------------------------
+# Contract: Query API → Trading Engine trend data
+# ---------------------------------------------------------------------------
+
+
+class TestTrendToTradingContract:
+    """Trend payloads must carry the schema the trading engine consumes."""
+
+    async def test_trend_has_required_trading_fields(self, query_client, seed_ids):
+        """Every listed trend has the decision fields with valid, normalized values."""
+        response = await query_client.get("/api/trends")
+        assert response.status_code == 200
+        trend_list = response.json()
+        assert len(trend_list) >= 1
+
+        required_fields = ("id", "trend_direction", "confidence", "trend_strength")
+        valid_directions = ("bullish", "bearish", "mixed", "neutral")
+
+        for trend in trend_list:
+            # Presence of every field the trading engine reads.
+            for field_name in required_fields:
+                assert field_name in trend
+
+            # Direction is restricted to the known enum values.
+            direction = trend["trend_direction"]
+            assert direction in valid_directions, f"Invalid direction: {direction}"
+
+            # Confidence and strength are normalized to [0, 1].
+            confidence = trend["confidence"]
+            assert 0 <= confidence <= 1, f"Confidence out of range: {confidence}"
+            strength = trend["trend_strength"]
+            assert 0 <= strength <= 1, f"Strength out of range: {strength}"
+
+    async def test_trend_detail_has_evidence(self, query_client, seed_ids):
+        """The TREND_01 detail view carries list-typed evidence and catalyst fields."""
+        trend_id = seed_ids["trends"]["TREND_01"]
+        response = await query_client.get(f"/api/trends/{trend_id}")
+        assert response.status_code == 200
+        detail = response.json()
+
+        # These fields feed the trading engine's audit trail.
+        audit_fields = (
+            "top_supporting_evidence",
+            "top_opposing_evidence",
+            "dominant_catalysts",
+        )
+        for field_name in audit_fields:
+            assert field_name in detail
+        for field_name in audit_fields:
+            assert isinstance(detail[field_name], list)
+
+
+# ---------------------------------------------------------------------------
+# Contract: Recommendation → Risk Engine → Trading Engine
+# ---------------------------------------------------------------------------
+
+
+class TestRecommendationToRiskContract:
+    """Recommendations and risk evaluations must match what the trading engine reads."""
+
+    @staticmethod
+    def _aapl_buy_order(quantity, estimated_value, confidence):
+        """Build an AAPL buy order body for the risk engine's /evaluate endpoint."""
+        return {
+            "ticker": "AAPL",
+            "action": "buy",
+            "quantity": quantity,
+            "estimated_value": estimated_value,
+            "confidence": confidence,
+            "recommendation_id": None,
+            "sector": "Technology",
+        }
+
+    async def test_recommendation_has_risk_fields(self, query_client, seed_ids):
+        """Every recommendation carries the fields risk evaluation depends on."""
+        query = {"latest": "false"}
+        response = await query_client.get("/api/recommendations", params=query)
+        assert response.status_code == 200
+        recommendations = response.json()
+        assert len(recommendations) >= 1
+
+        valid_actions = ("buy", "sell", "hold", "watch")
+        # The mode decides whether a recommendation reaches the trading engine.
+        valid_modes = ("informational", "paper_eligible", "live_eligible")
+
+        for rec in recommendations:
+            for field_name in ("ticker", "action", "confidence", "mode"):
+                assert field_name in rec
+            assert rec["action"] in valid_actions
+            assert rec["mode"] in valid_modes
+            # Confidence is normalized to [0, 1].
+            assert 0 <= rec["confidence"] <= 1
+
+    async def test_risk_evaluation_schema(self, risk_client):
+        """A risk evaluation exposes every field, with the expected types."""
+        body = {"order": self._aapl_buy_order(5, 925.0, 0.75)}
+        response = await risk_client.post("/evaluate", json=body)
+        assert response.status_code == 200
+        evaluation = response.json()
+
+        # Fields the trading engine reads from an evaluation.
+        expected_fields = (
+            "evaluation_id",
+            "ticker",
+            "eligible",
+            "rejection_reasons",
+            "checks",
+            "evaluated_at",
+        )
+        for field_name in expected_fields:
+            assert field_name in evaluation
+
+        assert isinstance(evaluation["eligible"], bool)
+        assert isinstance(evaluation["rejection_reasons"], list)
+        assert isinstance(evaluation["checks"], list)
+
+        # Each individual check reports its name and a boolean outcome.
+        for entry in evaluation["checks"]:
+            assert "name" in entry
+            assert "passed" in entry
+            assert isinstance(entry["passed"], bool)
+
+    async def test_risk_rejects_oversized_order(self, risk_client):
+        """An order valued far above the configured position cap is rejected."""
+        body = {
+            "order": self._aapl_buy_order(1000, 185000.0, 0.9),
+            "config": {"absolute_position_cap": 10000.0},
+        }
+        response = await risk_client.post("/evaluate", json=body)
+        assert response.status_code == 200
+        evaluation = response.json()
+
+        # The evaluation must be ineligible and carry at least one reason.
+        assert evaluation["eligible"] is False
+        assert len(evaluation["rejection_reasons"]) > 0
+
+
+# ---------------------------------------------------------------------------
+# Contract: Trading Engine state consistency
+# ---------------------------------------------------------------------------
+
+
+class TestTradingEngineState:
+    """Verify trading engine exposes consistent state for the promotion gate."""
+
+    async def test_status_reflects_config(self, trading_client):
+        """Engine status reports positions, risk tier and pools with valid values."""
+        resp = await trading_client.get("/api/trading/status")
+        assert resp.status_code == 200
+        data = resp.json()
+
+        # Check presence up front so a missing key fails with a readable
+        # message instead of a bare KeyError further down.
+        for field in ("open_positions", "risk_tier", "active_pool", "reserve_pool"):
+            assert field in data, f"Missing field: {field}"
+
+        # If paused, open_positions should still be reported
+        assert isinstance(data["open_positions"], int)
+        assert data["open_positions"] >= 0
+
+        # Risk tier must be valid
+        assert data["risk_tier"] in ("conservative", "moderate", "aggressive")
+
+        # Pool values must be non-negative
+        assert data["active_pool"] >= 0
+        assert data["reserve_pool"] >= 0
+
+    async def test_decisions_have_audit_fields(self, trading_client, seed_ids):
+        """Every returned trading decision includes the audit trail fields."""
+        resp = await trading_client.get("/api/trading/decisions")
+        assert resp.status_code == 200
+        decisions = resp.json()
+
+        # An empty list is acceptable. When decisions exist, validate all of
+        # them rather than only the first, matching the other contract tests.
+        for d in decisions:
+            assert "id" in d
+            assert "decision" in d
+            assert "ticker" in d
+            assert "created_at" in d
+            # Decision type must be valid
+            assert d["decision"] in ("act", "skip")
+
+    async def test_metrics_numeric_consistency(self, trading_client):
+        """Portfolio metrics are all numeric and internally consistent."""
+        resp = await trading_client.get("/api/trading/metrics")
+        assert resp.status_code == 200
+        data = resp.json()
+
+        # All values must be numeric
+        numeric_fields = [
+            "total_portfolio_value", "active_pool", "reserve_pool",
+            "unrealized_pnl", "realized_pnl", "daily_pnl",
+            "win_rate", "sharpe_ratio", "max_drawdown", "portfolio_heat",
+        ]
+        for field in numeric_fields:
+            assert field in data, f"Missing field: {field}"
+            assert isinstance(data[field], (int, float)), (
+                f"{field} should be numeric, got {type(data[field])}"
+            )
+
+        # Win rate and portfolio heat are fractions in [0, 1]; zero is already
+        # inside that range, so no separate zero case is needed.
+        assert 0 <= data["win_rate"] <= 1
+        assert 0 <= data["portfolio_heat"] <= 1
+
+        # Total portfolio = active + reserve + unrealized (approximately).
+        # The check is skipped for a non-positive total, where a relative
+        # tolerance would be meaningless; otherwise allow 10% for rounding.
+        expected_total = data["active_pool"] + data["reserve_pool"] + data["unrealized_pnl"]
+        if data["total_portfolio_value"] > 0:
+            diff = abs(data["total_portfolio_value"] - expected_total)
+            assert diff < data["total_portfolio_value"] * 0.1, (
+                f"Portfolio value inconsistency: total={data['total_portfolio_value']}, "
+                f"active+reserve+unrealized={expected_total}"
+            )
+
+
+# ---------------------------------------------------------------------------
+# Contract: Approval workflow integration
+# ---------------------------------------------------------------------------
+
+
+class TestApprovalWorkflowContract:
+    """The approval endpoints must respond and return well-formed payloads."""
+
+    async def test_pending_approvals_schema(self, risk_client):
+        """The pending list is a JSON array whose entries carry the approval fields."""
+        response = await risk_client.get("/approvals/pending")
+        assert response.status_code == 200
+        pending = response.json()
+        assert isinstance(pending, list)
+
+        # An empty list passes; any entry present must have all fields.
+        expected_fields = ("id", "status", "ticker", "side", "quantity", "created_at")
+        for entry in pending:
+            for field_name in expected_fields:
+                assert field_name in entry
+
+    async def test_approval_not_found_returns_404(self, risk_client):
+        """Looking up an approval ID that does not exist yields 404, not 500."""
+        missing_id = "00000000-0000-4000-ffff-ffffffffffff"
+        response = await risk_client.get(f"/approvals/{missing_id}")
+        assert response.status_code == 404
+
+
+# ---------------------------------------------------------------------------
+# Contract: Cross-service health (all services must be up for paper trading)
+# ---------------------------------------------------------------------------
+
+
+class TestCrossServiceHealth:
+    """Promotion to paper trading requires every service to be healthy."""
+
+    async def test_all_services_healthy(
+        self, query_client, registry_client, risk_client, trading_client,
+    ):
+        """Each service answers /health with 200 and status "ok"."""
+        probes = [
+            ("query-api", query_client),
+            ("symbol-registry", registry_client),
+            ("risk-engine", risk_client),
+            ("trading-engine", trading_client),
+        ]
+        for name, client in probes:
+            response = await client.get("/health")
+            status = response.status_code
+            assert status == 200, f"{name} health check failed with status {status}"
+            body = response.json()
+            assert body.get("status") == "ok", f"{name} reported unhealthy: {body}"
+
+    async def test_trading_engine_ready(self, trading_client):
+        """The trading engine's /ready probe reports ready (DB + Redis connected)."""
+        response = await trading_client.get("/ready")
+        assert response.status_code == 200
+        body = response.json()
+        assert body["ready"] is True, f"Trading engine not ready: {body}"