feat: pipeline on/off toggle with per-stage Helm control

- Added pipelineEnabled flag to Helm values (default: true) - Worker services (scheduler, ingestion, parser, extractor, aggregation, recommendation, broker-adapter, lake-publisher) scale to 0 when disabled - API services always run regardless of toggle - Redis-based runtime toggle: POST /api/ops/pipeline/toggle - Scheduler checks the flag before each cycle - Frontend: green/red Pipeline ON/OFF button on the pipeline page - Beta defaults to pipelineEnabled: false - Base values.yaml: blanked external URLs (Ollama, Polygon, Alpaca) so stages only connect to what they explicitly configure
2026-04-21 00:21:53 +00:00
parent a19ed086fe
commit be526ae614
14 changed files with 923 additions and 104 deletions
@@ -227,3 +227,55 @@ jobs:
        with:
          name: inttest-results
          path: inttest-results.json
  # Beta gate: deploys the commit's images to beta, runs the integration
  # tests there and promotes on success (all delegated to promote.sh).
  # Runs only for pushes to main, after the integration-test job.
  beta-gate:
    needs: [integration-test]
    if: github.event_name == 'push' && github.ref == 'refs/heads/main'
    runs-on: self-hosted-gremlin
    permissions:
      contents: read
      packages: read
    steps:
      - uses: actions/checkout@v5
      # Downloads the current stable kubectl only when the runner lacks one.
      - name: Install kubectl
        run: |
          if ! command -v kubectl &> /dev/null; then
            curl -LO "https://dl.k8s.io/release/$(curl -L -s https://dl.k8s.io/release/stable.txt)/bin/linux/amd64/kubectl"
            chmod +x kubectl
            sudo mv kubectl /usr/local/bin/kubectl
          fi
          kubectl version --client
      # Installs Helm via the upstream get-helm-3 script only when missing.
      - name: Install Helm
        run: |
          if ! command -v helm &> /dev/null; then
            curl -fsSL https://raw.githubusercontent.com/helm/helm/main/scripts/get-helm-3 | sudo bash
          fi
          helm version
      # When a service-account token is mounted, builds a kubeconfig context
      # from it; an unreachable cluster only produces a warning here.
      - name: Configure kubectl
        run: |
          if [ -f /var/run/secrets/kubernetes.io/serviceaccount/token ]; then
            kubectl config set-cluster in-cluster \
              --server=https://kubernetes.default.svc \
              --certificate-authority=/var/run/secrets/kubernetes.io/serviceaccount/ca.crt
            kubectl config set-credentials runner \
              --token="$(cat /var/run/secrets/kubernetes.io/serviceaccount/token)"
            kubectl config set-context runner --cluster=in-cluster --user=runner
            kubectl config use-context runner
          fi
          kubectl cluster-info || echo "WARNING: kubectl cannot reach cluster API"
      # The commit SHA is used as the image tag; the script's exit code
      # decides whether this job passes.
      - name: Run beta gate (deploy → test → promote)
        run: |
          bash infra/inttest/promote.sh \
            --image-tag ${{ github.sha }} \
            --results-file beta-gate-results.json
      # Always attempt the upload so a failed gate still publishes its results.
      - name: Upload beta gate results
        if: always()
        uses: actions/upload-artifact@v4
        with:
          name: beta-gate-results
          path: beta-gate-results.json
@@ -533,6 +533,14 @@ export function useRetryFailedExtractions() {
  });
 }
 /**
  * Mutation hook for the pipeline on/off switch.
  *
  * Calling `mutate(enabled)` POSTs `{ enabled }` to `/api/ops/pipeline/toggle`.
  * After a successful call every query under the `pipeline-health` key is
  * invalidated so the page refetches the current flag.
  */
 export function usePipelineToggle() {
  const queryClient = useQueryClient();
  const postToggle = (enabled: boolean) =>
    apiPost<{ pipeline_enabled: boolean }>('query', '/api/ops/pipeline/toggle', { enabled });
  const refreshHealth = () => queryClient.invalidateQueries({ queryKey: ['pipeline-health'] });
  return useMutation({ mutationFn: postToggle, onSuccess: refreshHealth });
 }
 /**
  * Query hook for the ingestion summary.
  *
  * @param hours - Look-back window sent as the `hours` query parameter
  *   (defaults to 24); it is also part of the query key.
  */
 export function useIngestionSummary(hours = 24) {
  const summaryPath = `/api/ops/ingestion/summary?hours=${hours}`;
  return useGet<Record<string, unknown>>(['ingestion-summary', hours], 'query', summaryPath);
 }
@@ -1,5 +1,5 @@
 import { useState, useEffect } from 'react';
-import { usePipelineHealth, useRetryFailedExtractions } from '../api/hooks';
+import { usePipelineHealth, useRetryFailedExtractions, usePipelineToggle } from '../api/hooks';
 import { LoadingSpinner, DateRangeSelector, Card } from '../components/ui';
 const QUEUE_LABELS: Record<string, string> = {
@@ -54,12 +54,14 @@ export function OpsPipelinePage() {
  const { data, isLoading } = usePipelineHealth(hours);
  const stream = usePipelineStream();
  const retryMutation = useRetryFailedExtractions();
  const toggleMutation = usePipelineToggle();
  if (isLoading) return <LoadingSpinner />;
  const parsing = (data?.parsing ?? {}) as Record<string, unknown>;
  const extraction = (data?.extraction ?? {}) as Record<string, unknown>;
  const aggregation = (data?.aggregation ?? {}) as Record<string, unknown>;
  const pipelineEnabled = (data?.pipeline_enabled ?? true) as boolean;
  // Prefer live stream data for queue depths and doc stages, fall back to initial fetch
  const queueDepths = stream?.queue_depths
@@ -82,6 +84,14 @@ export function OpsPipelinePage() {
      <div className="flex items-center justify-between">
        <h1 className="text-xl font-semibold text-gray-100">Pipeline Health</h1>
        <div className="flex items-center gap-3">
          <button
            type="button"
            onClick={() => toggleMutation.mutate(!pipelineEnabled)}
            disabled={toggleMutation.isPending}
            className={`rounded-md px-3 py-1.5 text-xs font-medium text-white ${pipelineEnabled ? 'bg-green-600 hover:bg-green-500' : 'bg-red-600 hover:bg-red-500'} disabled:opacity-50`}
          >
            {toggleMutation.isPending ? '…' : pipelineEnabled ? 'Pipeline ON' : 'Pipeline OFF'}
          </button>
          {failedCount > 0 && (
            <button
              type="button"
@@ -106,6 +106,8 @@ export const handlers = [
  http.delete('/api/admin/trading/lockouts/:id', () => HttpResponse.json({ status: 'deleted' })),
  http.get('/api/ops/pipeline/health', () => HttpResponse.json({ hours: 24, document_stages: [{ status: 'extracted', doc_count: 5 }], parsing: {}, extraction: {}, aggregation: {}, queue_depths: {} })),
  http.post('/api/ops/pipeline/retry-failed', () => HttpResponse.json({ retried: 10, message: 'Re-enqueued 10 documents for extraction' })),
  http.get('/api/ops/pipeline/toggle', () => HttpResponse.json({ pipeline_enabled: true })),
  http.post('/api/ops/pipeline/toggle', () => HttpResponse.json({ pipeline_enabled: true })),
  http.get('/api/ops/ingestion/summary', () => HttpResponse.json({ total_runs: 10, completed: 8, failed: 2, total_items_fetched: 50, total_items_new: 12, by_source_type: [] })),
  http.get('/api/ops/ingestion/throughput', () => HttpResponse.json([])),
  http.get('/api/ops/model/performance', () => HttpResponse.json({ total_extractions: 20, success_rate: 0.9, avg_duration_ms: 1500, retry_rate: 0.05, avg_confidence: 0.8 })),
@@ -11,7 +11,7 @@ metadata:
    {{- include "stonks.labels" $root | nindent 4 }}
    stonks-oracle/tier: {{ $svc.tier }}
 spec:
-  replicas: {{ $svc.replicas }}
+  replicas: {{ if and (hasKey $svc "pipeline") $svc.pipeline (not $root.Values.pipelineEnabled) }}0{{ else }}{{ $svc.replicas }}{{ end }}
  selector:
    matchLabels:
      app: {{ $svc.image }}
@@ -6,31 +6,11 @@
 image:
  tag: latest
-## Single replica for API services, disable pipeline workers
+## Pipeline OFF by default — beta is for API testing only
-## Beta is for API testing only — no ingestion/extraction/aggregation
+pipelineEnabled: false
 ## Single replica for API services
 services:
  scheduler:
    replicas: 0
  symbolRegistry:
    replicas: 1
  ingestion:
    replicas: 0
  parser:
    replicas: 0
  extractor:
    replicas: 0
  aggregation:
    replicas: 0
  recommendation:
    replicas: 0
  tradingEngine:
    replicas: 1
  riskEngine:
    replicas: 1
  brokerAdapter:
    replicas: 0
  lakePublisher:
    replicas: 0
  queryApi:
    replicas: 1
  dashboard:
@@ -4,10 +4,16 @@ image:
  pullPolicy: Always
  tag: latest
 ## Pipeline toggle — when false, every service marked `pipeline: true` below
 ## (scheduler, ingestion, parser, extractor, aggregation, recommendation,
 ## brokerAdapter, lakePublisher) is rendered with 0 replicas. Services without
 ## that marker (the APIs) keep their configured replica count.
 pipelineEnabled: true
 ## Service deployments — replicas and resource overrides
 services:
  scheduler:
    replicas: 1
    pipeline: true
    image: scheduler
    command: "python -m services.scheduler.app"
    tier: orchestration
@@ -32,6 +38,7 @@ services:
  ingestion:
    replicas: 2
    pipeline: true
    image: ingestion
    command: "python -m services.ingestion.worker"
    tier: ingestion
@@ -42,6 +49,7 @@ services:
  parser:
    replicas: 2
    pipeline: true
    image: parser
    command: "python -m services.parser.worker"
    tier: processing
@@ -52,6 +60,7 @@ services:
  extractor:
    replicas: 1
    pipeline: true
    image: extractor
    command: "python -m services.extractor.main"
    tier: processing
@@ -62,6 +71,7 @@ services:
  aggregation:
    replicas: 4
    pipeline: true
    image: aggregation
    command: "python -m services.aggregation.main"
    tier: processing
@@ -72,6 +82,7 @@ services:
  recommendation:
    replicas: 1
    pipeline: true
    image: recommendation
    command: "python -m services.recommendation.main"
    tier: processing
@@ -107,6 +118,7 @@ services:
  brokerAdapter:
    replicas: 1
    pipeline: true
    image: broker-adapter
    command: "python -m services.adapters.broker_service"
    tier: trading
@@ -117,6 +129,7 @@ services:
  lakePublisher:
    replicas: 1
    pipeline: true
    image: lake-publisher
    command: "python -m services.lake_publisher.jobs"
    tier: analytics
@@ -0,0 +1,409 @@
 #!/bin/bash
 # Beta-to-Paper promotion gate
 #
 # Deploys the given image tag to the beta namespace, runs integration tests
 # against the live beta services, and promotes to paper-trading if all pass.
 #
 # This script is the single source of truth for the promotion decision.
 # CI calls it; humans can call it too.
 #
 # Usage: bash infra/inttest/promote.sh [OPTIONS]
 #
 # Options:
 #   --image-tag TAG       Docker image tag to deploy (required)
 #   --skip-promote        Run tests but don't promote even if green
 #   --skip-teardown       Leave beta namespace running after tests
 #   --results-file PATH   Path for JSON results output (default: beta-gate-results.json)
 #   --timeout SECONDS     Max wait for services to become ready (default: 180)
 #   -h, --help            Show usage
 #
 # Exit codes:
 #   0  All tests passed, promotion succeeded (or --skip-promote)
 #   1  Test failures — promotion blocked
 #   2  Infrastructure/deployment failure
 #   3  Promotion step failed (tests passed but helm upgrade failed)
 # Abort on any failing command, on use of an unset variable, and on a
 # failure anywhere inside a pipeline.
 set -euo pipefail
 # Absolute directory of this script, and the repository root two levels up.
 SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
 REPO_ROOT="$(cd "$SCRIPT_DIR/../.." && pwd)"
 # ── Defaults ─────────────────────────────────────────────────────────────────
 # Overridable from the command line (see the option parsing below).
 IMAGE_TAG=""
 SKIP_PROMOTE=false
 SKIP_TEARDOWN=false
 RESULTS_FILE="beta-gate-results.json"
 # Seconds; passed to the beta helm upgrade as its --timeout.
 READY_TIMEOUT=180
 # Fixed namespaces: where the candidate is tested, and where it is promoted to.
 BETA_NAMESPACE="stonks-oracle-beta"
 PAPER_NAMESPACE="stonks-oracle"
 # ── Helpers ──────────────────────────────────────────────────────────────────
 # Print the command-line help to stdout, then exit with status 0.
 usage() {
  # Quoted delimiter: the help text is emitted verbatim, with no expansion.
  cat <<'USAGE'
 Usage: bash infra/inttest/promote.sh [OPTIONS]
 Options:
  --image-tag TAG       Docker image tag to deploy (required)
  --skip-promote        Run tests but don't promote even if green
  --skip-teardown       Leave beta namespace running after tests
  --results-file PATH   Path for JSON results output (default: beta-gate-results.json)
  --timeout SECONDS     Max wait for services to become ready (default: 180)
  -h, --help            Show usage
 Exit codes:
  0  All tests passed, promotion succeeded (or --skip-promote)
  1  Test failures — promotion blocked
  2  Infrastructure/deployment failure
  3  Promotion step failed (tests passed but helm upgrade failed)
 USAGE
  exit 0
 }
 # Write one line to stdout: "[HH:MM:SS] [beta-gate] <all arguments>",
 # with the timestamp taken in UTC.
 log() {
  local stamp
  stamp="$(date -u +"%H:%M:%S")"
  printf '%s\n' "[${stamp}] [beta-gate] $*"
 }
 # Log the arguments as a FATAL message, then terminate the script with
 # exit code 2.
 die() {
  local reason="$*"
  log "FATAL: ${reason}"
  exit 2
 }
 # ── Parse CLI args ───────────────────────────────────────────────────────────
 # Report a command-line mistake: message and help text go to stderr and the
 # script exits 2. Exiting 0 here would let CI treat a mis-invoked gate as a
 # successful promotion.
 usage_error() {
  echo "$1" >&2
  # usage() terminates the shell it runs in with status 0, so it is run in a
  # subshell purely for its text; the non-zero exit happens here.
  (usage) >&2
  exit 2
 }
 # Abort unless the option named in $1 is followed by a value.
 # $2 is the number of arguments still unparsed, including the option itself.
 require_value() {
  if [ "$2" -lt 2 ]; then
    usage_error "ERROR: $1 requires a value"
  fi
 }
 while [[ $# -gt 0 ]]; do
  case $1 in
    --image-tag)
      require_value "$1" "$#"
      IMAGE_TAG="$2"
      shift 2
      ;;
    --skip-promote)
      SKIP_PROMOTE=true
      shift
      ;;
    --skip-teardown)
      SKIP_TEARDOWN=true
      shift
      ;;
    --results-file)
      require_value "$1" "$#"
      RESULTS_FILE="$2"
      shift 2
      ;;
    --timeout)
      require_value "$1" "$#"
      READY_TIMEOUT="$2"
      shift 2
      ;;
    -h|--help)
      # An explicit help request is not an error: prints and exits 0.
      usage
      ;;
    *)
      usage_error "Unknown option: $1"
      ;;
  esac
 done
 if [ -z "$IMAGE_TAG" ]; then
  usage_error "ERROR: --image-tag is required"
 fi
 # The timeout is later interpolated into helm's --timeout as "<n>s".
 if ! [[ "$READY_TIMEOUT" =~ ^[0-9]+$ ]]; then
  usage_error "ERROR: --timeout must be a whole number of seconds"
 fi
 # Timestamps for the results file: ISO-8601 UTC start plus epoch seconds for
 # the duration calculation at the end of the script.
 STARTED_AT=$(date -u +"%Y-%m-%dT%H:%M:%SZ")
 PIPELINE_START=$(date +%s)
 log "Beta gate starting"
 log "  Image tag:     $IMAGE_TAG"
 log "  Beta NS:       $BETA_NAMESPACE"
 log "  Paper NS:      $PAPER_NAMESPACE"
 log "  Skip promote:  $SKIP_PROMOTE"
 # ══════════════════════════════════════════════════════════════════════════════
 # Stage 1: Deploy to beta namespace
 # ══════════════════════════════════════════════════════════════════════════════
 log "▶ Stage 1: Deploy to beta"
 # Ensure beta namespace exists (creation fails harmlessly when it already does)
 kubectl create namespace "$BETA_NAMESPACE" 2>/dev/null || true
 # Create beta database if it doesn't exist
 log "Ensuring beta database exists ..."
 # Capture the lookup first so the existence test does not depend on the exit
 # status of a kubectl | grep pipeline under pipefail.
 DB_LOOKUP=$(kubectl exec -n postgresql-service postgresql-1 -c postgres -- \
  psql -U postgres -tc "SELECT 1 FROM pg_database WHERE datname = 'stonks_beta'" || true)
 if ! grep -q 1 <<<"$DB_LOOKUP"; then
  if ! kubectl exec -n postgresql-service postgresql-1 -c postgres -- \
      psql -U postgres -c "CREATE DATABASE stonks_beta OWNER stonks;"; then
    log "WARNING: could not create database stonks_beta (continuing)"
  fi
 fi
 # Apply migrations to beta database
 log "Applying migrations to beta database ..."
 # Glob expansion is already sorted, so files run in name order without
 # parsing ls output.
 for migration in "$REPO_ROOT/infra/migrations/"*.sql; do
  # An unmatched glob is left as the literal pattern; skip it.
  [ -e "$migration" ] || continue
  # -i is required: without it kubectl exec does not forward stdin, so
  # "psql -f -" would receive no SQL at all and succeed without doing anything.
  if ! kubectl exec -i -n postgresql-service postgresql-1 -c postgres -- \
      psql -U stonks -d stonks_beta -f - < "$migration"; then
    # Still best-effort (a failing file does not stop the gate), but no
    # longer silent.
    log "WARNING: migration $(basename "$migration") reported an error (continuing)"
  fi
 done
 # Deploy via Helm with beta values
 log "Helm upgrade to beta namespace ..."
 DEPLOY_STATUS="ok"
 if ! helm upgrade --install stonks-oracle-beta \
    "$REPO_ROOT/infra/helm/stonks-oracle" \
    -n "$BETA_NAMESPACE" \
    -f "$REPO_ROOT/infra/helm/stonks-oracle/values-beta.yaml" \
    --set "image.tag=$IMAGE_TAG" \
    --wait \
    --timeout "${READY_TIMEOUT}s"; then
  log "Helm deploy to beta failed"
  DEPLOY_STATUS="failed"
 fi
 if [ "$DEPLOY_STATUS" != "ok" ]; then
  # Dump diagnostics, record the failure, and stop with the
  # infrastructure-failure exit code.
  log "Beta deployment failed — checking pod status"
  kubectl get pods -n "$BETA_NAMESPACE" -o wide 2>&1 || true
  kubectl get events -n "$BETA_NAMESPACE" --sort-by='.lastTimestamp' 2>&1 | tail -20 || true
  cat > "$RESULTS_FILE" <<EOF
 {
  "run_id": "beta-gate-${IMAGE_TAG}",
  "image_tag": "${IMAGE_TAG}",
  "started_at": "${STARTED_AT}",
  "completed_at": "$(date -u +"%Y-%m-%dT%H:%M:%SZ")",
  "exit_code": 2,
  "stage": "deploy",
  "deploy_status": "failed",
  "test_status": "skipped",
  "promote_status": "blocked",
  "tests": {"total": 0, "passed": 0, "failed": 0, "errors": 0}
 }
 EOF
  exit 2
 fi
 log "✓ Beta deployment ready"
 # ══════════════════════════════════════════════════════════════════════════════
 # Stage 2: Seed beta database
 # ══════════════════════════════════════════════════════════════════════════════
 log "▶ Stage 2: Seed beta data"
 # Run seed against beta database via a temporary pod.
 # SEED_IMAGE is reused by the test runner in the next stage.
 SEED_IMAGE="registry.celestium.life/stonks-oracle/query-api:${IMAGE_TAG}"
 # NOTE(review): the credentials below are committed to the repository as
 # plain-text defaults. They can be overridden through BETA_POSTGRES_PASSWORD,
 # BETA_MINIO_ACCESS_KEY and BETA_MINIO_SECRET_KEY; consider sourcing them from
 # a Kubernetes Secret and rotating the committed values.
 # Clean up any previous seed pod
 kubectl delete pod seed-beta -n "$BETA_NAMESPACE" --ignore-not-found > /dev/null 2>&1 || true
 # stderr is left visible so a failing seed shows its reason in the CI log.
 if kubectl run seed-beta \
    --image="$SEED_IMAGE" \
    --restart=Never \
    --rm \
    --attach \
    --pod-running-timeout=3m \
    --namespace="$BETA_NAMESPACE" \
    --image-pull-policy=Always \
    --env="POSTGRES_HOST=postgresql-rw.postgresql-service.svc.cluster.local" \
    --env="POSTGRES_PORT=5432" \
    --env="POSTGRES_DB=stonks_beta" \
    --env="POSTGRES_USER=stonks" \
    --env="POSTGRES_PASSWORD=${BETA_POSTGRES_PASSWORD:-St0nks0racl3!}" \
    --env="MINIO_ENDPOINT=minio.minio-service.svc.cluster.local:80" \
    --env="MINIO_SECURE=false" \
    --env="MINIO_ACCESS_KEY=${BETA_MINIO_ACCESS_KEY:-minioadmin}" \
    --env="MINIO_SECRET_KEY=${BETA_MINIO_SECRET_KEY:-minioadmin}" \
    --command -- python -m tests.integration.seed_sandbox; then
  log "✓ Beta data seeded"
 else
  # Seeding stays best-effort, but success is only claimed when it happened.
  log "WARNING: Seed may have partially failed (could be idempotent re-run)"
 fi
 # ══════════════════════════════════════════════════════════════════════════════
 # Stage 3: Run integration tests against beta
 # ══════════════════════════════════════════════════════════════════════════════
 log "▶ Stage 3: Run integration tests"
 # Determine service URLs within the beta namespace
 QUERY_API_URL="http://query-api.${BETA_NAMESPACE}.svc.cluster.local:8000"
 REGISTRY_API_URL="http://symbol-registry.${BETA_NAMESPACE}.svc.cluster.local:8000"
 RISK_API_URL="http://risk.${BETA_NAMESPACE}.svc.cluster.local:8000"
 TRADING_API_URL="http://trading-engine.${BETA_NAMESPACE}.svc.cluster.local:8000"
 # Clean up any previous runner
 kubectl delete pod beta-test-runner -n "$BETA_NAMESPACE" --ignore-not-found > /dev/null 2>&1 || true
 # Run tests as a pod in the beta namespace.
 # NOTE(review): the database password default is committed in plain text;
 # it can be overridden through BETA_POSTGRES_PASSWORD.
 log "Starting test runner pod ..."
 RUNNER_STARTED=true
 # Checked explicitly: under "set -e" a bare failure here would abort the
 # script with exit 1 and without writing a results file.
 if ! kubectl run beta-test-runner \
    --image="$SEED_IMAGE" \
    --restart=Never \
    --namespace="$BETA_NAMESPACE" \
    --image-pull-policy=Always \
    --env="QUERY_API_URL=$QUERY_API_URL" \
    --env="REGISTRY_API_URL=$REGISTRY_API_URL" \
    --env="RISK_API_URL=$RISK_API_URL" \
    --env="TRADING_API_URL=$TRADING_API_URL" \
    --env="POSTGRES_HOST=postgresql-rw.postgresql-service.svc.cluster.local" \
    --env="POSTGRES_PORT=5432" \
    --env="POSTGRES_DB=stonks_beta" \
    --env="POSTGRES_USER=stonks" \
    --env="POSTGRES_PASSWORD=${BETA_POSTGRES_PASSWORD:-St0nks0racl3!}" \
    --env="REDIS_HOST=redis-master.redis-service.svc.cluster.local" \
    --env="REDIS_PORT=6379" \
    --env="REDIS_DB=1" \
    --env="REDIS_PASSWORD=" \
    --env="BROKER_MODE=paper" \
    --env="LOG_LEVEL=INFO" \
    --command -- python -m pytest tests/integration/ -v --tb=short -q; then
  log "ERROR: could not start the test runner pod"
  RUNNER_STARTED=false
 fi
 # Wait for the test runner to reach a terminal phase (120 polls x 5s = 600s)
 log "Waiting for test runner (timeout: 600s) ..."
 TEST_EXIT_CODE=0
 POD_PHASE="Unknown"
 if [ "$RUNNER_STARTED" = true ]; then
  for _ in $(seq 1 120); do
    POD_PHASE=$(kubectl get pod beta-test-runner -n "$BETA_NAMESPACE" \
      -o jsonpath='{.status.phase}' 2>/dev/null || echo "Unknown")
    if [ "$POD_PHASE" = "Succeeded" ] || [ "$POD_PHASE" = "Failed" ]; then
      break
    fi
    sleep 5
  done
 fi
 # Collect results
 TEST_OUTPUT=$(kubectl logs beta-test-runner -n "$BETA_NAMESPACE" 2>/dev/null || true)
 # Only a pod that finished in phase Succeeded counts as a pass. A runner that
 # never started, is still running after the timeout, or cannot be queried
 # must block promotion: in those cases no test counts are parsed below, so
 # the count checks alone would see "0 failed, 0 errors" and let it through.
 if [ "$POD_PHASE" != "Succeeded" ]; then
  log "Test runner did not succeed (pod phase: ${POD_PHASE:-Unknown})"
  TEST_EXIT_CODE=1
 fi
 # Parse test counts from the pytest summary; a missing figure counts as 0
 TESTS_PASSED=$(echo "$TEST_OUTPUT" | grep -oP '\d+(?= passed)' | tail -1 || echo "0")
 TESTS_FAILED=$(echo "$TEST_OUTPUT" | grep -oP '\d+(?= failed)' | tail -1 || echo "0")
 TESTS_ERRORS=$(echo "$TEST_OUTPUT" | grep -oP '\d+(?= error)' | tail -1 || echo "0")
 TESTS_PASSED=${TESTS_PASSED:-0}
 TESTS_FAILED=${TESTS_FAILED:-0}
 TESTS_ERRORS=${TESTS_ERRORS:-0}
 TESTS_TOTAL=$(( TESTS_PASSED + TESTS_FAILED + TESTS_ERRORS ))
 log "Test results: ${TESTS_PASSED} passed, ${TESTS_FAILED} failed, ${TESTS_ERRORS} errors"
 # Print test output for CI visibility
 if [ -n "$TEST_OUTPUT" ]; then
  echo "─── Test Output ───"
  echo "$TEST_OUTPUT" | tail -60
  echo "─── End Test Output ───"
 fi
 # Clean up test runner
 kubectl delete pod beta-test-runner -n "$BETA_NAMESPACE" --ignore-not-found > /dev/null 2>&1 || true
 # ══════════════════════════════════════════════════════════════════════════════
 # Stage 4: Promotion decision
 # ══════════════════════════════════════════════════════════════════════════════
 # Install or upgrade the paper release at $IMAGE_TAG and wait up to 300s for
 # it to become ready. Returns helm's exit status.
 promote_to_paper() {
  helm upgrade --install stonks-oracle \
    "$REPO_ROOT/infra/helm/stonks-oracle" \
    -n "$PAPER_NAMESPACE" \
    --set "image.tag=$IMAGE_TAG" \
    --wait \
    --timeout 300s
 }
 # Start from the pessimistic outcome; the branches below overwrite it.
 PROMOTE_STATUS="blocked"
 FINAL_EXIT=0
 if [ "$TESTS_FAILED" -gt 0 ] || [ "$TESTS_ERRORS" -gt 0 ] || [ "$TEST_EXIT_CODE" -ne 0 ]; then
  # Any failed or errored test, or a failed runner, blocks promotion.
  log "✗ GATE FAILED — ${TESTS_FAILED} failures, ${TESTS_ERRORS} errors"
  log "  Promotion to paper-trading BLOCKED"
  PROMOTE_STATUS="blocked"
  FINAL_EXIT=1
 elif [ "$SKIP_PROMOTE" = true ]; then
  log "✓ Tests passed — promotion skipped (--skip-promote)"
  PROMOTE_STATUS="skipped"
  FINAL_EXIT=0
 else
  log "▶ Stage 4: Promoting to paper-trading"
  log "  Upgrading $PAPER_NAMESPACE with image tag $IMAGE_TAG ..."
  if ! promote_to_paper; then
    log "✗ Promotion failed — helm upgrade error"
    PROMOTE_STATUS="failed"
    FINAL_EXIT=3
  else
    log "✓ PROMOTED — paper-trading now running $IMAGE_TAG"
    PROMOTE_STATUS="promoted"
    FINAL_EXIT=0
    # Rolling restart to pick up new images; a failure here is ignored.
    log "Rolling restart of API services ..."
    kubectl rollout restart deployment/query-api deployment/symbol-registry \
      deployment/trading-engine deployment/risk-engine \
      deployment/aggregation deployment/recommendation \
      -n "$PAPER_NAMESPACE" 2>/dev/null || true
  fi
 fi
 # ══════════════════════════════════════════════════════════════════════════════
 # Stage 5: Cleanup beta (optional)
 # ══════════════════════════════════════════════════════════════════════════════
 # Runs only after a successful promotion, and only without --skip-teardown.
 if [ "$SKIP_TEARDOWN" = false ] && [ "$PROMOTE_STATUS" = "promoted" ]; then
  log "Scaling down beta deployment (keeping namespace for next run) ..."
  # One "--set services.<name>.replicas=0" override per service.
  SCALE_DOWN_ARGS=()
  for svc in queryApi symbolRegistry tradingEngine riskEngine scheduler \
      ingestion parser extractor aggregation recommendation \
      brokerAdapter lakePublisher dashboard; do
    SCALE_DOWN_ARGS+=(--set "services.${svc}.replicas=0")
  done
  # Teardown stays best-effort (it never changes the exit code), but the
  # outcome is reported truthfully instead of always claiming success.
  if helm upgrade stonks-oracle-beta \
      "$REPO_ROOT/infra/helm/stonks-oracle" \
      -n "$BETA_NAMESPACE" \
      -f "$REPO_ROOT/infra/helm/stonks-oracle/values-beta.yaml" \
      --set "image.tag=$IMAGE_TAG" \
      "${SCALE_DOWN_ARGS[@]}"; then
    log "Beta scaled to zero"
  else
    log "WARNING: could not scale beta down — beta services may still be running"
  fi
 fi
 # ══════════════════════════════════════════════════════════════════════════════
 # Write results
 # ══════════════════════════════════════════════════════════════════════════════
 COMPLETED_AT=$(date -u +"%Y-%m-%dT%H:%M:%SZ")
 PIPELINE_END=$(date +%s)
 PIPELINE_DURATION=$(( PIPELINE_END - PIPELINE_START ))
 # test_status describes the test run itself, not the promotion: "completed"
 # when the runner pod reached a terminal phase, "error" when it never did.
 # Deriving it from the final exit code would label exit 3 (tests passed,
 # helm promotion failed) as a test error.
 case "${POD_PHASE:-Unknown}" in
  Succeeded|Failed) TEST_STATUS="completed" ;;
  *) TEST_STATUS="error" ;;
 esac
 cat > "$RESULTS_FILE" <<EOF
 {
  "run_id": "beta-gate-${IMAGE_TAG}",
  "image_tag": "${IMAGE_TAG}",
  "started_at": "${STARTED_AT}",
  "completed_at": "${COMPLETED_AT}",
  "duration_s": ${PIPELINE_DURATION},
  "exit_code": ${FINAL_EXIT},
  "deploy_status": "${DEPLOY_STATUS}",
  "test_status": "${TEST_STATUS}",
  "promote_status": "${PROMOTE_STATUS}",
  "tests": {
    "total": ${TESTS_TOTAL},
    "passed": ${TESTS_PASSED},
    "failed": ${TESTS_FAILED},
    "errors": ${TESTS_ERRORS}
  }
 }
 EOF
 log "Results written to $RESULTS_FILE"
 # Human-readable summary for the CI log; the exit code is the gate verdict.
 echo ""
 log "═══════════════════════════════════════════════════"
 log "  Beta Gate Summary"
 log "═══════════════════════════════════════════════════"
 log "  Image:       $IMAGE_TAG"
 log "  Duration:    ${PIPELINE_DURATION}s"
 log "  Tests:       ${TESTS_PASSED}/${TESTS_TOTAL} passed"
 log "  Promotion:   ${PROMOTE_STATUS}"
 log "  Exit code:   $FINAL_EXIT"
 log "═══════════════════════════════════════════════════"
 echo ""
 exit "$FINAL_EXIT"
@@ -65,6 +65,11 @@ trivy:
 metrics:
  enabled: false
 # Enable Redis cache layer for faster manifest lookups (avoids upstream checks)
 cache:
  enabled: true
  expireHours: 24
 # Resource limits — conservative for a 4-node cluster
 core:
  resources:
@@ -41,7 +41,7 @@ from services.shared.audit import get_entity_audit_trail, get_order_audit_trail,
 from services.shared.config import load_config
 from services.shared.db import get_pg_pool, get_redis
 from services.shared.logging import new_trace_id, set_trace_context, setup_logging
-from services.shared.redis_keys import QUEUE_PREFIX, queue_key
+from services.shared.redis_keys import PREFIX, QUEUE_PREFIX, queue_key
 from services.shared.schemas import MAJOR_DECISION_CATALYSTS
 logger = logging.getLogger("query_api")
@@ -1787,8 +1787,13 @@ async def get_pipeline_health(
            except Exception:
                pass
    # Pipeline enabled flag
    pipeline_flag = await rds.get(_PIPELINE_ENABLED_KEY) if rds else None
    pipeline_enabled = pipeline_flag != "0" if pipeline_flag is not None else True
    return {
        "hours": hours,
        "pipeline_enabled": pipeline_enabled,
        "document_stages": [_row_to_dict(r) for r in doc_stages],
        "parsing": _row_to_dict(parse_quality) if parse_quality else {},
        "extraction": _row_to_dict(extraction_stats) if extraction_stats else {},
@@ -1927,6 +1932,34 @@ async def retry_failed_extractions_endpoint():
    return {"retried": len(doc_ids), "message": f"Re-enqueued {len(doc_ids)} documents for extraction"}
 # ---------------------------------------------------------------------------
 # Pipeline On/Off Toggle
 # ---------------------------------------------------------------------------
 # Redis key holding the runtime pipeline flag. The toggle endpoint stores
 # "1" or "0"; readers in this module treat "0" as disabled and a missing key
 # or any other value as enabled.
 _PIPELINE_ENABLED_KEY = f"{PREFIX}:pipeline:enabled"
@app.get("/api/ops/pipeline/toggle")
 async def get_pipeline_toggle():
    """Get the current pipeline enabled/disabled state."""
    val = await rds.get(_PIPELINE_ENABLED_KEY)
    # Default to enabled if key doesn't exist
    enabled = val != "0"
    return {"pipeline_enabled": enabled}
@app.post("/api/ops/pipeline/toggle")
 async def set_pipeline_toggle(body: dict[str, Any]):
    """Toggle the pipeline on or off.
    Accepts: { "enabled": true/false }
    Workers check this flag before processing jobs.
    """
    enabled = body.get("enabled", True)
    await rds.set(_PIPELINE_ENABLED_KEY, "1" if enabled else "0")
    return {"pipeline_enabled": enabled, "message": f"Pipeline {'enabled' if enabled else 'disabled'}"}
@app.get("/api/ops/sources/coverage-gaps")
 async def get_source_coverage_gaps():
    """Identify symbols with missing or insufficient source coverage.
@@ -19,6 +19,7 @@ from services.shared.config import load_config
 from services.shared.db import get_pg_pool, get_redis
 from services.shared.logging import setup_logging
 from services.shared.redis_keys import (
    PREFIX,
    QUEUE_EXTRACTION,
    QUEUE_INGESTION,
    QUEUE_MACRO_CLASSIFICATION,
@@ -499,12 +500,19 @@ async def main() -> None:
    rds = get_redis(config)
    logger.info("Scheduler started (tick=%ds)", SCHEDULER_TICK)
    pipeline_key = f"{PREFIX}:pipeline:enabled"
    recovery_counter = 0
    retry_counter = 0
    cleanup_counter = 0
    try:
        while True:
            try:
                # Check pipeline toggle — skip cycle if disabled
                flag = await rds.get(pipeline_key)
                if flag == "0":
                    await asyncio.sleep(SCHEDULER_TICK)
                    continue
                if await acquire_lock(rds, "scheduler_cycle", ttl=30):
                    try:
                        await schedule_cycle(pool, rds)
@@ -1,77 +0,0 @@
 """Database migration runner using asyncpg.
 Applies all SQL migration files from infra/migrations/ in sorted order.
 Each file is split on semicolons and executed statement-by-statement.
 Idempotent — migrations use IF NOT EXISTS / CREATE OR REPLACE patterns.
 Usage:
    python -m services.shared.migrate
 """
 import asyncio
 import glob
 import logging
 import os
 import sys
 import asyncpg
 logger = logging.getLogger("migrate")
 async def run_migrations() -> None:
    """Run every ``*.sql`` file from ``infra/migrations`` against PostgreSQL.

    Connection settings come from the ``POSTGRES_HOST``, ``POSTGRES_PORT``,
    ``POSTGRES_USER``, ``POSTGRES_PASSWORD`` and ``POSTGRES_DB`` environment
    variables. Files run in sorted path order over a single connection. A
    failing file is logged as a warning and the run moves on to the next file.

    Exits the process with status 1 when the migrations directory is missing;
    returns without connecting when it contains no ``.sql`` files.
    """
    conn_params = {
        "host": os.getenv("POSTGRES_HOST", "localhost"),
        "port": int(os.getenv("POSTGRES_PORT", "5432")),
        "user": os.getenv("POSTGRES_USER", "stonks"),
        "password": os.getenv("POSTGRES_PASSWORD", ""),
        "database": os.getenv("POSTGRES_DB", "stonks"),
    }
    # <this file's directory>/../../infra/migrations, normalised.
    here = os.path.dirname(__file__)
    migrations_dir = os.path.normpath(
        os.path.join(here, "..", "..", "infra", "migrations")
    )
    if not os.path.isdir(migrations_dir):
        logger.error("Migrations directory not found: %s", migrations_dir)
        sys.exit(1)
    sql_paths = sorted(glob.glob(os.path.join(migrations_dir, "*.sql")))
    if not sql_paths:
        logger.warning("No migration files found in %s", migrations_dir)
        return
    logger.info(
        "Connecting to %s@%s:%d/%s",
        conn_params["user"],
        conn_params["host"],
        conn_params["port"],
        conn_params["database"],
    )
    conn = await asyncpg.connect(**conn_params)
    try:
        for sql_path in sql_paths:
            name = os.path.basename(sql_path)
            with open(sql_path) as handle:
                script = handle.read()
            # Each file is cut at every ";" and the non-empty pieces are
            # executed one by one.
            # NOTE(review): this split also cuts semicolons inside string
            # literals and dollar-quoted bodies. asyncpg's execute() is
            # documented to accept several commands when no arguments are
            # given; confirm and consider passing the whole script instead.
            statements = []
            for piece in script.split(";"):
                piece = piece.strip()
                if piece:
                    statements.append(piece)
            try:
                for statement in statements:
                    await conn.execute(statement)
                logger.info("  ✓ %s (%d statements)", name, len(statements))
            except Exception as exc:
                # The first failing statement abandons the rest of this file only.
                logger.warning("  ⚠ %s: %s", name, exc)
    finally:
        await conn.close()
    logger.info("Migrations complete (%d files)", len(sql_paths))
 def main() -> None:
    """Configure INFO-level logging, then run the migrations to completion."""
    log_settings = {
        "level": logging.INFO,
        "format": "%(asctime)s %(name)s %(message)s",
        "datefmt": "%H:%M:%S",
    }
    logging.basicConfig(**log_settings)
    asyncio.run(run_migrations())
 if __name__ == "__main__":
    main()
@@ -0,0 +1,376 @@
 """Integration tests for cross-service signal flow contracts.
 These tests validate the end-to-end data flow that the trading engine
 depends on. They catch schema drift and contract violations between
 services that unit tests miss.
 Flow under test:
  1. Symbol Registry has companies with exposure profiles and competitors
  2. Query API returns trends with correct schema for trading engine consumption
  3. Risk engine evaluates orders using data from query API
  4. Trading engine receives valid recommendation payloads
 These are the "beta gate" tests — if any fail, promotion to paper is blocked.
 """
 import pytest
 # Module-level pytest marker: applies the ``asyncio`` mark to every test below
 # (presumably provided by the pytest-asyncio plugin — confirm in the test deps).
 pytestmark = pytest.mark.asyncio
 # ---------------------------------------------------------------------------
 # Contract: Symbol Registry → Query API company data consistency
 # ---------------------------------------------------------------------------
 class TestRegistryToQueryContract:
    """Verify that company data in the registry matches what query API exposes."""
    async def test_company_ids_consistent(self, registry_client, query_client, seed_ids):
        """Company IDs from registry match those returned by query API."""
        reg_resp = await registry_client.get("/companies")
        assert reg_resp.status_code == 200
        reg_companies = {c["id"]: c["ticker"] for c in reg_resp.json()}
        query_resp = await query_client.get("/api/companies")
        assert query_resp.status_code == 200
        query_companies = {c["id"]: c["ticker"] for c in query_resp.json()}
        # Every company in registry should appear in query API
        for cid, ticker in reg_companies.items():
            assert cid in query_companies, (
                f"Company {ticker} ({cid}) in registry but missing from query API"
            )
            assert query_companies[cid] == ticker
    async def test_exposure_profiles_accessible(self, registry_client, seed_ids):
        """Exposure profiles required by macro signal layer are accessible."""
        company_id = seed_ids["companies"]["AAPL"]
        resp = await registry_client.get(f"/companies/{company_id}/exposure")
        assert resp.status_code == 200
        data = resp.json()
        # Trading engine needs these fields for macro impact scoring
        assert "geographic_revenue_mix" in data
        assert "supply_chain_regions" in data
        assert "key_input_commodities" in data
        assert "market_position_tier" in data
        assert "export_dependency_pct" in data
        # Values must be valid types
        assert isinstance(data["geographic_revenue_mix"], dict)
        assert isinstance(data["supply_chain_regions"], list)
        assert isinstance(data["export_dependency_pct"], (int, float))
        assert 0 <= data["export_dependency_pct"] <= 1
    async def test_competitor_relationships_bidirectional(self, registry_client, seed_ids):
        """Competitor relationships are queryable from both sides."""
        aapl_id = seed_ids["companies"]["AAPL"]
        msft_id = seed_ids["companies"]["MSFT"]
        # Query from AAPL side
        resp_a = await registry_client.get(f"/companies/{aapl_id}/competitors")
        assert resp_a.status_code == 200
        aapl_competitors = resp_a.json()
        # Query from MSFT side
        resp_b = await registry_client.get(f"/companies/{msft_id}/competitors")
        assert resp_b.status_code == 200
        msft_competitors = resp_b.json()
        # AAPL should see MSFT and vice versa
        aapl_partner_ids = set()
        for rel in aapl_competitors:
            if rel.get("company_a_id") == aapl_id:
                aapl_partner_ids.add(rel["company_b_id"])
            else:
                aapl_partner_ids.add(rel["company_a_id"])
        msft_partner_ids = set()
        for rel in msft_competitors:
            if rel.get("company_a_id") == msft_id:
                msft_partner_ids.add(rel["company_b_id"])
            else:
                msft_partner_ids.add(rel["company_a_id"])
        assert msft_id in aapl_partner_ids, "MSFT not in AAPL's competitors"
        assert aapl_id in msft_partner_ids, "AAPL not in MSFT's competitors"
 # ---------------------------------------------------------------------------
 # Contract: Query API → Trading Engine trend data
 # ---------------------------------------------------------------------------
 class TestTrendToTradingContract:
    """Verify trend data has the schema the trading engine expects."""
    async def test_trend_has_required_trading_fields(self, query_client, seed_ids):
        """Trends must include fields the trading engine uses for decisions."""
        resp = await query_client.get("/api/trends")
        assert resp.status_code == 200
        trends = resp.json()
        assert len(trends) >= 1
        for trend in trends:
            # Fields the trading engine reads
            assert "id" in trend
            assert "trend_direction" in trend
            assert "confidence" in trend
            assert "trend_strength" in trend
            # Direction must be a valid enum value
            assert trend["trend_direction"] in (
                "bullish", "bearish", "mixed", "neutral",
            ), f"Invalid direction: {trend['trend_direction']}"
            # Confidence and strength must be normalized [0, 1]
            assert 0 <= trend["confidence"] <= 1, (
                f"Confidence out of range: {trend['confidence']}"
            )
            assert 0 <= trend["trend_strength"] <= 1, (
                f"Strength out of range: {trend['trend_strength']}"
            )
    async def test_trend_detail_has_evidence(self, query_client, seed_ids):
        """Individual trend detail includes evidence the trading engine logs."""
        trend_id = seed_ids["trends"]["TREND_01"]
        resp = await query_client.get(f"/api/trends/{trend_id}")
        assert resp.status_code == 200
        data = resp.json()
        # Trading engine logs these for audit trail
        assert "top_supporting_evidence" in data
        assert "top_opposing_evidence" in data
        assert "dominant_catalysts" in data
        assert isinstance(data["top_supporting_evidence"], list)
        assert isinstance(data["top_opposing_evidence"], list)
        assert isinstance(data["dominant_catalysts"], list)
 # ---------------------------------------------------------------------------
 # Contract: Recommendation → Risk Engine → Trading Engine
 # ---------------------------------------------------------------------------
 class TestRecommendationToRiskContract:
    """Verify recommendations produce valid risk evaluation inputs."""
    async def test_recommendation_has_risk_fields(self, query_client, seed_ids):
        """Recommendations include fields needed for risk evaluation."""
        resp = await query_client.get(
            "/api/recommendations", params={"latest": "false"},
        )
        assert resp.status_code == 200
        recs = resp.json()
        assert len(recs) >= 1
        for rec in recs:
            assert "ticker" in rec
            assert "action" in rec
            assert "confidence" in rec
            assert "mode" in rec
            # Action must be valid
            assert rec["action"] in ("buy", "sell", "hold", "watch")
            # Mode determines if it reaches trading engine
            assert rec["mode"] in (
                "informational", "paper_eligible", "live_eligible",
            )
            # Confidence must be normalized
            assert 0 <= rec["confidence"] <= 1
    async def test_risk_evaluation_schema(self, risk_client):
        """Risk engine returns evaluation with all fields trading engine needs."""
        payload = {
            "order": {
                "ticker": "AAPL",
                "action": "buy",
                "quantity": 5,
                "estimated_value": 925.0,
                "confidence": 0.75,
                "recommendation_id": None,
                "sector": "Technology",
            },
        }
        resp = await risk_client.post("/evaluate", json=payload)
        assert resp.status_code == 200
        data = resp.json()
        # Trading engine reads these fields from risk evaluation
        assert "evaluation_id" in data
        assert "ticker" in data
        assert "eligible" in data
        assert "rejection_reasons" in data
        assert "checks" in data
        assert "evaluated_at" in data
        # Types
        assert isinstance(data["eligible"], bool)
        assert isinstance(data["rejection_reasons"], list)
        assert isinstance(data["checks"], list)
        # Each check should have name and passed
        for check in data["checks"]:
            assert "name" in check
            assert "passed" in check
            assert isinstance(check["passed"], bool)
    async def test_risk_rejects_oversized_order(self, risk_client):
        """Risk engine correctly rejects an order exceeding position cap."""
        payload = {
            "order": {
                "ticker": "AAPL",
                "action": "buy",
                "quantity": 1000,
                "estimated_value": 185000.0,
                "confidence": 0.9,
                "recommendation_id": None,
                "sector": "Technology",
            },
            "config": {
                "absolute_position_cap": 10000.0,
            },
        }
        resp = await risk_client.post("/evaluate", json=payload)
        assert resp.status_code == 200
        data = resp.json()
        # Should be rejected due to position cap
        assert data["eligible"] is False
        assert len(data["rejection_reasons"]) > 0
 # ---------------------------------------------------------------------------
 # Contract: Trading Engine state consistency
 # ---------------------------------------------------------------------------
 class TestTradingEngineState:
    """Verify trading engine exposes consistent state for the promotion gate."""
    async def test_status_reflects_config(self, trading_client):
        """Engine status fields are consistent with each other."""
        resp = await trading_client.get("/api/trading/status")
        assert resp.status_code == 200
        data = resp.json()
        # If paused, open_positions should still be reported
        assert "open_positions" in data
        assert isinstance(data["open_positions"], int)
        assert data["open_positions"] >= 0
        # Risk tier must be valid
        assert data["risk_tier"] in ("conservative", "moderate", "aggressive")
        # Pool values must be non-negative
        assert data["active_pool"] >= 0
        assert data["reserve_pool"] >= 0
    async def test_decisions_have_audit_fields(self, trading_client, seed_ids):
        """Trading decisions include full audit trail fields."""
        resp = await trading_client.get("/api/trading/decisions")
        assert resp.status_code == 200
        decisions = resp.json()
        if len(decisions) > 0:
            d = decisions[0]
            assert "id" in d
            assert "decision" in d
            assert "ticker" in d
            assert "created_at" in d
            # Decision type must be valid
            assert d["decision"] in ("act", "skip")
    async def test_metrics_numeric_consistency(self, trading_client):
        """Portfolio metrics are all numeric and internally consistent."""
        resp = await trading_client.get("/api/trading/metrics")
        assert resp.status_code == 200
        data = resp.json()
        # All values must be numeric
        numeric_fields = [
            "total_portfolio_value", "active_pool", "reserve_pool",
            "unrealized_pnl", "realized_pnl", "daily_pnl",
            "win_rate", "sharpe_ratio", "max_drawdown", "portfolio_heat",
        ]
        for field in numeric_fields:
            assert field in data, f"Missing field: {field}"
            assert isinstance(data[field], (int, float)), (
                f"{field} should be numeric, got {type(data[field])}"
            )
        # Win rate and portfolio heat should be bounded
        assert 0 <= data["win_rate"] <= 1 or data["win_rate"] == 0
        assert 0 <= data["portfolio_heat"] <= 1 or data["portfolio_heat"] == 0
        # Total portfolio = active + reserve + unrealized (approximately)
        # Allow some tolerance for rounding
        expected_total = data["active_pool"] + data["reserve_pool"] + data["unrealized_pnl"]
        if data["total_portfolio_value"] > 0:
            diff = abs(data["total_portfolio_value"] - expected_total)
            assert diff < data["total_portfolio_value"] * 0.1, (
                f"Portfolio value inconsistency: total={data['total_portfolio_value']}, "
                f"active+reserve+unrealized={expected_total}"
            )
 # ---------------------------------------------------------------------------
 # Contract: Approval workflow integration
 # ---------------------------------------------------------------------------
 class TestApprovalWorkflowContract:
    """Verify the approval workflow is accessible and returns valid schemas."""
    async def test_pending_approvals_schema(self, risk_client):
        """Pending approvals list returns valid schema (may be empty)."""
        resp = await risk_client.get("/approvals/pending")
        assert resp.status_code == 200
        data = resp.json()
        assert isinstance(data, list)
        # If there are pending approvals, validate schema
        for approval in data:
            assert "id" in approval
            assert "status" in approval
            assert "ticker" in approval
            assert "side" in approval
            assert "quantity" in approval
            assert "created_at" in approval
    async def test_approval_not_found_returns_404(self, risk_client):
        """Non-existent approval ID returns 404, not 500."""
        fake_id = "00000000-0000-4000-ffff-ffffffffffff"
        resp = await risk_client.get(f"/approvals/{fake_id}")
        assert resp.status_code == 404
 # ---------------------------------------------------------------------------
 # Contract: Cross-service health (all services must be up for paper trading)
 # ---------------------------------------------------------------------------
 class TestCrossServiceHealth:
    """All services must be healthy before promotion to paper trading."""
    async def test_all_services_healthy(
        self, query_client, registry_client, risk_client, trading_client,
    ):
        """Every service responds to health check."""
        services = {
            "query-api": query_client,
            "symbol-registry": registry_client,
            "risk-engine": risk_client,
            "trading-engine": trading_client,
        }
        for name, client in services.items():
            resp = await client.get("/health")
            assert resp.status_code == 200, (
                f"{name} health check failed with status {resp.status_code}"
            )
            data = resp.json()
            assert data.get("status") == "ok", (
                f"{name} reported unhealthy: {data}"
            )
    async def test_trading_engine_ready(self, trading_client):
        """Trading engine readiness probe passes (DB + Redis connected)."""
        resp = await trading_client.get("/ready")
        assert resp.status_code == 200
        data = resp.json()
        assert data["ready"] is True, (
            f"Trading engine not ready: {data}"
        )