feat: pipeline on/off toggle with per-stage Helm control

- Added pipelineEnabled flag to Helm values (default: true)
- Worker services (scheduler, ingestion, parser, extractor, aggregation,
  recommendation, broker-adapter, lake-publisher) scale to 0 when disabled
- API services always run regardless of toggle
- Redis-based runtime toggle: POST /api/ops/pipeline/toggle
- Scheduler checks the flag before each cycle
- Frontend: green/red Pipeline ON/OFF button on the pipeline page
- Beta defaults to pipelineEnabled: false
- Base values.yaml: blanked external URLs (Ollama, Polygon, Alpaca)
  so stages only connect to what they explicitly configure
This commit is contained in:
Celes Renata
2026-04-21 00:21:53 +00:00
parent a19ed086fe
commit be526ae614
14 changed files with 923 additions and 104 deletions
+52
View File
@@ -227,3 +227,55 @@ jobs:
with: with:
name: inttest-results name: inttest-results
path: inttest-results.json path: inttest-results.json
# Beta gate job: after integration tests pass on a push to main, deploy the
# commit's image to beta, run the gate script, and upload its JSON results.
beta-gate:
needs: [integration-test]
# Only gate real pushes to main; other events and branches skip this job.
if: github.event_name == 'push' && github.ref == 'refs/heads/main'
runs-on: self-hosted-gremlin
permissions:
contents: read
packages: read
steps:
- uses: actions/checkout@v5
# Installs the latest stable kubectl only when the runner does not already have one.
- name: Install kubectl
run: |
if ! command -v kubectl &> /dev/null; then
curl -LO "https://dl.k8s.io/release/$(curl -L -s https://dl.k8s.io/release/stable.txt)/bin/linux/amd64/kubectl"
chmod +x kubectl
sudo mv kubectl /usr/local/bin/kubectl
fi
kubectl version --client
# Installs Helm 3 via the upstream installer script only when helm is missing.
# NOTE(review): the installer is fetched from the `main` branch and piped to
# `sudo bash` — consider pinning a release tag or checksum.
- name: Install Helm
run: |
if ! command -v helm &> /dev/null; then
curl -fsSL https://raw.githubusercontent.com/helm/helm/main/scripts/get-helm-3 | sudo bash
fi
helm version
# When a service-account token is mounted, build a kubeconfig context from it.
# An unreachable cluster only produces a warning here; it does not fail the step.
- name: Configure kubectl
run: |
if [ -f /var/run/secrets/kubernetes.io/serviceaccount/token ]; then
kubectl config set-cluster in-cluster \
--server=https://kubernetes.default.svc \
--certificate-authority=/var/run/secrets/kubernetes.io/serviceaccount/ca.crt
kubectl config set-credentials runner \
--token="$(cat /var/run/secrets/kubernetes.io/serviceaccount/token)"
kubectl config set-context runner --cluster=in-cluster --user=runner
kubectl config use-context runner
fi
kubectl cluster-info || echo "WARNING: kubectl cannot reach cluster API"
# Runs the promotion script with the commit SHA as the image tag.
- name: Run beta gate (deploy → test → promote)
run: |
bash infra/inttest/promote.sh \
--image-tag ${{ github.sha }} \
--results-file beta-gate-results.json
# `always()` keeps the results artifact available even when the gate step fails.
- name: Upload beta gate results
if: always()
uses: actions/upload-artifact@v4
with:
name: beta-gate-results
path: beta-gate-results.json
Binary file not shown.
+8
View File
@@ -533,6 +533,14 @@ export function useRetryFailedExtractions() {
}); });
} }
/**
 * Mutation hook for the runtime pipeline on/off switch.
 *
 * Calling `mutate(enabled)` POSTs `{ enabled }` to `/api/ops/pipeline/toggle`.
 * After a successful response, queries under the `['pipeline-health']` key are
 * invalidated so they refetch.
 */
export function usePipelineToggle() {
  const queryClient = useQueryClient();

  const postToggle = (enabled: boolean) =>
    apiPost<{ pipeline_enabled: boolean }>('query', '/api/ops/pipeline/toggle', { enabled });

  const refreshHealth = () => queryClient.invalidateQueries({ queryKey: ['pipeline-health'] });

  return useMutation({
    mutationFn: postToggle,
    onSuccess: refreshHealth,
  });
}
export function useIngestionSummary(hours = 24) { export function useIngestionSummary(hours = 24) {
return useGet<Record<string, unknown>>(['ingestion-summary', hours], 'query', `/api/ops/ingestion/summary?hours=${hours}`); return useGet<Record<string, unknown>>(['ingestion-summary', hours], 'query', `/api/ops/ingestion/summary?hours=${hours}`);
} }
+11 -1
View File
@@ -1,5 +1,5 @@
import { useState, useEffect } from 'react'; import { useState, useEffect } from 'react';
import { usePipelineHealth, useRetryFailedExtractions } from '../api/hooks'; import { usePipelineHealth, useRetryFailedExtractions, usePipelineToggle } from '../api/hooks';
import { LoadingSpinner, DateRangeSelector, Card } from '../components/ui'; import { LoadingSpinner, DateRangeSelector, Card } from '../components/ui';
const QUEUE_LABELS: Record<string, string> = { const QUEUE_LABELS: Record<string, string> = {
@@ -54,12 +54,14 @@ export function OpsPipelinePage() {
const { data, isLoading } = usePipelineHealth(hours); const { data, isLoading } = usePipelineHealth(hours);
const stream = usePipelineStream(); const stream = usePipelineStream();
const retryMutation = useRetryFailedExtractions(); const retryMutation = useRetryFailedExtractions();
const toggleMutation = usePipelineToggle();
if (isLoading) return <LoadingSpinner />; if (isLoading) return <LoadingSpinner />;
const parsing = (data?.parsing ?? {}) as Record<string, unknown>; const parsing = (data?.parsing ?? {}) as Record<string, unknown>;
const extraction = (data?.extraction ?? {}) as Record<string, unknown>; const extraction = (data?.extraction ?? {}) as Record<string, unknown>;
const aggregation = (data?.aggregation ?? {}) as Record<string, unknown>; const aggregation = (data?.aggregation ?? {}) as Record<string, unknown>;
const pipelineEnabled = (data?.pipeline_enabled ?? true) as boolean;
// Prefer live stream data for queue depths and doc stages, fall back to initial fetch // Prefer live stream data for queue depths and doc stages, fall back to initial fetch
const queueDepths = stream?.queue_depths const queueDepths = stream?.queue_depths
@@ -82,6 +84,14 @@ export function OpsPipelinePage() {
<div className="flex items-center justify-between"> <div className="flex items-center justify-between">
<h1 className="text-xl font-semibold text-gray-100">Pipeline Health</h1> <h1 className="text-xl font-semibold text-gray-100">Pipeline Health</h1>
<div className="flex items-center gap-3"> <div className="flex items-center gap-3">
<button
type="button"
onClick={() => toggleMutation.mutate(!pipelineEnabled)}
disabled={toggleMutation.isPending}
className={`rounded-md px-3 py-1.5 text-xs font-medium text-white ${pipelineEnabled ? 'bg-green-600 hover:bg-green-500' : 'bg-red-600 hover:bg-red-500'} disabled:opacity-50`}
>
{toggleMutation.isPending ? '…' : pipelineEnabled ? 'Pipeline ON' : 'Pipeline OFF'}
</button>
{failedCount > 0 && ( {failedCount > 0 && (
<button <button
type="button" type="button"
+2
View File
@@ -106,6 +106,8 @@ export const handlers = [
http.delete('/api/admin/trading/lockouts/:id', () => HttpResponse.json({ status: 'deleted' })), http.delete('/api/admin/trading/lockouts/:id', () => HttpResponse.json({ status: 'deleted' })),
http.get('/api/ops/pipeline/health', () => HttpResponse.json({ hours: 24, document_stages: [{ status: 'extracted', doc_count: 5 }], parsing: {}, extraction: {}, aggregation: {}, queue_depths: {} })), http.get('/api/ops/pipeline/health', () => HttpResponse.json({ hours: 24, document_stages: [{ status: 'extracted', doc_count: 5 }], parsing: {}, extraction: {}, aggregation: {}, queue_depths: {} })),
http.post('/api/ops/pipeline/retry-failed', () => HttpResponse.json({ retried: 10, message: 'Re-enqueued 10 documents for extraction' })), http.post('/api/ops/pipeline/retry-failed', () => HttpResponse.json({ retried: 10, message: 'Re-enqueued 10 documents for extraction' })),
http.get('/api/ops/pipeline/toggle', () => HttpResponse.json({ pipeline_enabled: true })),
http.post('/api/ops/pipeline/toggle', () => HttpResponse.json({ pipeline_enabled: true })),
http.get('/api/ops/ingestion/summary', () => HttpResponse.json({ total_runs: 10, completed: 8, failed: 2, total_items_fetched: 50, total_items_new: 12, by_source_type: [] })), http.get('/api/ops/ingestion/summary', () => HttpResponse.json({ total_runs: 10, completed: 8, failed: 2, total_items_fetched: 50, total_items_new: 12, by_source_type: [] })),
http.get('/api/ops/ingestion/throughput', () => HttpResponse.json([])), http.get('/api/ops/ingestion/throughput', () => HttpResponse.json([])),
http.get('/api/ops/model/performance', () => HttpResponse.json({ total_extractions: 20, success_rate: 0.9, avg_duration_ms: 1500, retry_rate: 0.05, avg_confidence: 0.8 })), http.get('/api/ops/model/performance', () => HttpResponse.json({ total_extractions: 20, success_rate: 0.9, avg_duration_ms: 1500, retry_rate: 0.05, avg_confidence: 0.8 })),
@@ -11,7 +11,7 @@ metadata:
{{- include "stonks.labels" $root | nindent 4 }} {{- include "stonks.labels" $root | nindent 4 }}
stonks-oracle/tier: {{ $svc.tier }} stonks-oracle/tier: {{ $svc.tier }}
spec: spec:
replicas: {{ $svc.replicas }} replicas: {{ if and (hasKey $svc "pipeline") $svc.pipeline (not $root.Values.pipelineEnabled) }}0{{ else }}{{ $svc.replicas }}{{ end }}
selector: selector:
matchLabels: matchLabels:
app: {{ $svc.image }} app: {{ $svc.image }}
+4 -24
View File
@@ -6,31 +6,11 @@
image: image:
tag: latest tag: latest
## Single replica for API services, disable pipeline workers ## Pipeline OFF by default — beta is for API testing only
## Beta is for API testing only — no ingestion/extraction/aggregation pipelineEnabled: false
## Single replica for API services
services: services:
scheduler:
replicas: 0
symbolRegistry:
replicas: 1
ingestion:
replicas: 0
parser:
replicas: 0
extractor:
replicas: 0
aggregation:
replicas: 0
recommendation:
replicas: 0
tradingEngine:
replicas: 1
riskEngine:
replicas: 1
brokerAdapter:
replicas: 0
lakePublisher:
replicas: 0
queryApi: queryApi:
replicas: 1 replicas: 1
dashboard: dashboard:
+13
View File
@@ -4,10 +4,16 @@ image:
pullPolicy: Always pullPolicy: Always
tag: latest tag: latest
## Pipeline toggle — when false, all worker services (ingestion, parsing,
## extraction, aggregation, recommendation, broker, lake-publisher, scheduler)
## are scaled to 0. API services always run.
pipelineEnabled: true
## Service deployments — replicas and resource overrides ## Service deployments — replicas and resource overrides
services: services:
scheduler: scheduler:
replicas: 1 replicas: 1
pipeline: true
image: scheduler image: scheduler
command: "python -m services.scheduler.app" command: "python -m services.scheduler.app"
tier: orchestration tier: orchestration
@@ -32,6 +38,7 @@ services:
ingestion: ingestion:
replicas: 2 replicas: 2
pipeline: true
image: ingestion image: ingestion
command: "python -m services.ingestion.worker" command: "python -m services.ingestion.worker"
tier: ingestion tier: ingestion
@@ -42,6 +49,7 @@ services:
parser: parser:
replicas: 2 replicas: 2
pipeline: true
image: parser image: parser
command: "python -m services.parser.worker" command: "python -m services.parser.worker"
tier: processing tier: processing
@@ -52,6 +60,7 @@ services:
extractor: extractor:
replicas: 1 replicas: 1
pipeline: true
image: extractor image: extractor
command: "python -m services.extractor.main" command: "python -m services.extractor.main"
tier: processing tier: processing
@@ -62,6 +71,7 @@ services:
aggregation: aggregation:
replicas: 4 replicas: 4
pipeline: true
image: aggregation image: aggregation
command: "python -m services.aggregation.main" command: "python -m services.aggregation.main"
tier: processing tier: processing
@@ -72,6 +82,7 @@ services:
recommendation: recommendation:
replicas: 1 replicas: 1
pipeline: true
image: recommendation image: recommendation
command: "python -m services.recommendation.main" command: "python -m services.recommendation.main"
tier: processing tier: processing
@@ -107,6 +118,7 @@ services:
brokerAdapter: brokerAdapter:
replicas: 1 replicas: 1
pipeline: true
image: broker-adapter image: broker-adapter
command: "python -m services.adapters.broker_service" command: "python -m services.adapters.broker_service"
tier: trading tier: trading
@@ -117,6 +129,7 @@ services:
lakePublisher: lakePublisher:
replicas: 1 replicas: 1
pipeline: true
image: lake-publisher image: lake-publisher
command: "python -m services.lake_publisher.jobs" command: "python -m services.lake_publisher.jobs"
tier: analytics tier: analytics
+409
View File
@@ -0,0 +1,409 @@
#!/bin/bash
# Beta-to-Paper promotion gate
#
# Deploys the given image tag to the beta namespace, runs integration tests
# against the live beta services, and promotes to paper-trading if all pass.
#
# This script is the single source of truth for the promotion decision.
# CI calls it; humans can call it too.
#
# Usage: bash infra/inttest/promote.sh [OPTIONS]
#
# Options:
# --image-tag TAG Docker image tag to deploy (required)
# --skip-promote Run tests but don't promote even if green
# --skip-teardown Leave beta namespace running after tests
# --results-file PATH Path for JSON results output (default: beta-gate-results.json)
# --timeout SECONDS Max wait for services to become ready (default: 180)
# -h, --help Show usage
#
# Exit codes:
# 0 All tests passed, promotion succeeded (or --skip-promote)
# 1 Test failures — promotion blocked
# 2 Infrastructure/deployment failure
# 3 Promotion step failed (tests passed but helm upgrade failed)
set -euo pipefail
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
REPO_ROOT="$(cd "$SCRIPT_DIR/../.." && pwd)"
# ── Defaults ─────────────────────────────────────────────────────────────────
IMAGE_TAG=""
SKIP_PROMOTE=false
SKIP_TEARDOWN=false
RESULTS_FILE="beta-gate-results.json"
READY_TIMEOUT=180
BETA_NAMESPACE="stonks-oracle-beta"
PAPER_NAMESPACE="stonks-oracle"
# ── Helpers ──────────────────────────────────────────────────────────────────
# Print the command-line help text to stdout and terminate the script with
# status 0. The heredoc delimiter is quoted because the text is purely literal.
usage() {
  cat <<'EOF'
Usage: bash infra/inttest/promote.sh [OPTIONS]
Options:
--image-tag TAG Docker image tag to deploy (required)
--skip-promote Run tests but don't promote even if green
--skip-teardown Leave beta namespace running after tests
--results-file PATH Path for JSON results output (default: beta-gate-results.json)
--timeout SECONDS Max wait for services to become ready (default: 180)
-h, --help Show usage
Exit codes:
0 All tests passed, promotion succeeded (or --skip-promote)
1 Test failures — promotion blocked
2 Infrastructure/deployment failure
3 Promotion step failed (tests passed but helm upgrade failed)
EOF
  exit 0
}
# Write one timestamped line to stdout: "[HH:MM:SS] [beta-gate] <message>".
# The timestamp is UTC; all arguments are joined with spaces into the message.
log() {
  local stamp
  stamp="$(date -u +"%H:%M:%S")"
  printf '[%s] [beta-gate] %s\n' "$stamp" "$*"
}
# Report a fatal error and abort the script.
#
# The message is formatted by log() with a "FATAL: " prefix and sent to stderr,
# so that diagnostics do not mix into stdout. The exit status is always 2,
# which the script header documents as "Infrastructure/deployment failure".
die() {
  log "FATAL: $*" >&2
  exit 2
}
# ── Parse CLI args ───────────────────────────────────────────────────────────
# Consume options left to right. Invalid invocations (unknown option, option
# missing its value, no --image-tag) print the problem and the usage text on
# stderr and exit 2, so a misconfigured CI call can never look like a success.
# usage() ends with `exit 0`, so on error paths it runs in a subshell and the
# script then exits with its own status.
while [[ $# -gt 0 ]]; do
  case $1 in
    --image-tag|--results-file|--timeout)
      # These options take a value; check it exists before touching $2 so the
      # user gets a clear message instead of a `set -u` unbound-variable abort.
      if [[ $# -lt 2 ]]; then
        echo "ERROR: $1 requires a value" >&2
        (usage) >&2 || true
        exit 2
      fi
      case $1 in
        --image-tag)    IMAGE_TAG="$2" ;;
        --results-file) RESULTS_FILE="$2" ;;
        --timeout)      READY_TIMEOUT="$2" ;;
      esac
      shift 2
      ;;
    --skip-promote)
      SKIP_PROMOTE=true
      shift
      ;;
    --skip-teardown)
      SKIP_TEARDOWN=true
      shift
      ;;
    -h|--help)
      # Explicit help request: print usage and exit 0.
      usage
      ;;
    *)
      echo "Unknown option: $1" >&2
      (usage) >&2 || true
      exit 2
      ;;
  esac
done
if [ -z "$IMAGE_TAG" ]; then
  echo "ERROR: --image-tag is required" >&2
  (usage) >&2 || true
  exit 2
fi
STARTED_AT=$(date -u +"%Y-%m-%dT%H:%M:%SZ")
PIPELINE_START=$(date +%s)
log "Beta gate starting"
log " Image tag: $IMAGE_TAG"
log " Beta NS: $BETA_NAMESPACE"
log " Paper NS: $PAPER_NAMESPACE"
log " Skip promote: $SKIP_PROMOTE"
# ══════════════════════════════════════════════════════════════════════════════
# Stage 1: Deploy to beta namespace
# ══════════════════════════════════════════════════════════════════════════════
log "▶ Stage 1: Deploy to beta"
# Ensure beta namespace exists
kubectl create namespace "$BETA_NAMESPACE" 2>/dev/null || true
# Create beta database if it doesn't exist
log "Ensuring beta database exists ..."
kubectl exec -n postgresql-service postgresql-1 -c postgres -- \
psql -U postgres -tc "SELECT 1 FROM pg_database WHERE datname = 'stonks_beta'" \
| grep -q 1 || \
kubectl exec -n postgresql-service postgresql-1 -c postgres -- \
psql -U postgres -c "CREATE DATABASE stonks_beta OWNER stonks;" 2>/dev/null || true
# Apply migrations to beta database
#
# Each *.sql file under infra/migrations/ is streamed to psql inside the
# postgres pod, in glob (lexical) order. This stays best-effort: a failing
# file is reported and the loop continues, so the gate run is not aborted here.
log "Applying migrations to beta database ..."
for migration in "$REPO_ROOT/infra/migrations/"*.sql; do
  # With no matching files the glob stays literal; skip it.
  [ -e "$migration" ] || continue
  # -i is required: without it kubectl exec does not forward stdin, so
  # `psql -f -` would read nothing and the migration would silently not run.
  if ! kubectl exec -i -n postgresql-service postgresql-1 -c postgres -- \
      psql -U stonks -d stonks_beta -f - < "$migration" 2>/dev/null; then
    log "WARNING: migration $(basename "$migration") did not apply cleanly"
  fi
done
# Deploy via Helm with beta values
log "Helm upgrade to beta namespace ..."
if ! helm upgrade --install stonks-oracle-beta \
"$REPO_ROOT/infra/helm/stonks-oracle" \
-n "$BETA_NAMESPACE" \
-f "$REPO_ROOT/infra/helm/stonks-oracle/values-beta.yaml" \
--set "image.tag=$IMAGE_TAG" \
--wait \
--timeout "${READY_TIMEOUT}s"; then
log "Helm deploy to beta failed"
DEPLOY_STATUS="failed"
else
DEPLOY_STATUS="ok"
fi
if [ "$DEPLOY_STATUS" != "ok" ]; then
log "Beta deployment failed — checking pod status"
kubectl get pods -n "$BETA_NAMESPACE" -o wide 2>&1 || true
kubectl get events -n "$BETA_NAMESPACE" --sort-by='.lastTimestamp' 2>&1 | tail -20 || true
cat > "$RESULTS_FILE" <<EOF
{
"run_id": "beta-gate-${IMAGE_TAG}",
"image_tag": "${IMAGE_TAG}",
"started_at": "${STARTED_AT}",
"completed_at": "$(date -u +"%Y-%m-%dT%H:%M:%SZ")",
"exit_code": 2,
"stage": "deploy",
"deploy_status": "failed",
"test_status": "skipped",
"promote_status": "blocked",
"tests": {"total": 0, "passed": 0, "failed": 0, "errors": 0}
}
EOF
exit 2
fi
log "✓ Beta deployment ready"
# ══════════════════════════════════════════════════════════════════════════════
# Stage 2: Seed beta database
# ══════════════════════════════════════════════════════════════════════════════
log "▶ Stage 2: Seed beta data"
# Run seed against beta database via a temporary pod
SEED_IMAGE="registry.celestium.life/stonks-oracle/query-api:${IMAGE_TAG}"
# Clean up any previous seed pod
kubectl delete pod seed-beta -n "$BETA_NAMESPACE" --ignore-not-found > /dev/null 2>&1 || true
if ! kubectl run seed-beta \
--image="$SEED_IMAGE" \
--restart=Never \
--rm \
--attach \
--pod-running-timeout=3m \
--namespace="$BETA_NAMESPACE" \
--image-pull-policy=Always \
--env="POSTGRES_HOST=postgresql-rw.postgresql-service.svc.cluster.local" \
--env="POSTGRES_PORT=5432" \
--env="POSTGRES_DB=stonks_beta" \
--env="POSTGRES_USER=stonks" \
--env="POSTGRES_PASSWORD=St0nks0racl3!" \
--env="MINIO_ENDPOINT=minio.minio-service.svc.cluster.local:80" \
--env="MINIO_SECURE=false" \
--env="MINIO_ACCESS_KEY=minioadmin" \
--env="MINIO_SECRET_KEY=minioadmin" \
--command -- python -m tests.integration.seed_sandbox 2>/dev/null; then
log "WARNING: Seed may have partially failed (could be idempotent re-run)"
fi
log "✓ Beta data seeded"
# ══════════════════════════════════════════════════════════════════════════════
# Stage 3: Run integration tests against beta
# ══════════════════════════════════════════════════════════════════════════════
log "▶ Stage 3: Run integration tests"
# Determine service URLs within the beta namespace
QUERY_API_URL="http://query-api.${BETA_NAMESPACE}.svc.cluster.local:8000"
REGISTRY_API_URL="http://symbol-registry.${BETA_NAMESPACE}.svc.cluster.local:8000"
RISK_API_URL="http://risk.${BETA_NAMESPACE}.svc.cluster.local:8000"
TRADING_API_URL="http://trading-engine.${BETA_NAMESPACE}.svc.cluster.local:8000"
# Clean up any previous runner
kubectl delete pod beta-test-runner -n "$BETA_NAMESPACE" --ignore-not-found > /dev/null 2>&1 || true
# Run tests as a pod in the beta namespace
log "Starting test runner pod ..."
kubectl run beta-test-runner \
--image="$SEED_IMAGE" \
--restart=Never \
--namespace="$BETA_NAMESPACE" \
--image-pull-policy=Always \
--env="QUERY_API_URL=$QUERY_API_URL" \
--env="REGISTRY_API_URL=$REGISTRY_API_URL" \
--env="RISK_API_URL=$RISK_API_URL" \
--env="TRADING_API_URL=$TRADING_API_URL" \
--env="POSTGRES_HOST=postgresql-rw.postgresql-service.svc.cluster.local" \
--env="POSTGRES_PORT=5432" \
--env="POSTGRES_DB=stonks_beta" \
--env="POSTGRES_USER=stonks" \
--env="POSTGRES_PASSWORD=St0nks0racl3!" \
--env="REDIS_HOST=redis-master.redis-service.svc.cluster.local" \
--env="REDIS_PORT=6379" \
--env="REDIS_DB=1" \
--env="REDIS_PASSWORD=" \
--env="BROKER_MODE=paper" \
--env="LOG_LEVEL=INFO" \
--command -- python -m pytest tests/integration/ -v --tb=short -q
# Wait for the test runner to complete
log "Waiting for test runner (timeout: 600s) ..."
TEST_EXIT_CODE=0
# Best-effort blocking wait. Its result is deliberately ignored: the polling
# loop below is what decides whether the pod has actually finished.
kubectl wait --for=condition=Ready=false pod/beta-test-runner \
  -n "$BETA_NAMESPACE" --timeout=600s 2>/dev/null || true
# Wait for pod to reach terminal state (up to 120 polls, 5s apart)
for _ in $(seq 1 120); do
  POD_PHASE=$(kubectl get pod beta-test-runner -n "$BETA_NAMESPACE" \
    -o jsonpath='{.status.phase}' 2>/dev/null || echo "Unknown")
  if [ "$POD_PHASE" = "Succeeded" ] || [ "$POD_PHASE" = "Failed" ]; then
    break
  fi
  sleep 5
done
# Collect results
TEST_OUTPUT=$(kubectl logs beta-test-runner -n "$BETA_NAMESPACE" 2>/dev/null || true)
POD_PHASE=$(kubectl get pod beta-test-runner -n "$BETA_NAMESPACE" \
  -o jsonpath='{.status.phase}' 2>/dev/null || echo "Unknown")
# Only "Succeeded" shows that the test command ran and exited 0. A pod that is
# still Pending/Running after the timeout, or whose phase cannot be read, has
# produced no trustworthy result and must block promotion like a failure.
if [ "$POD_PHASE" != "Succeeded" ]; then
  log "Test runner ended in phase '$POD_PHASE' — treating as failure"
  TEST_EXIT_CODE=1
fi
# Parse test counts
TESTS_PASSED=$(echo "$TEST_OUTPUT" | grep -oP '\d+(?= passed)' | tail -1 || echo "0")
TESTS_FAILED=$(echo "$TEST_OUTPUT" | grep -oP '\d+(?= failed)' | tail -1 || echo "0")
TESTS_ERRORS=$(echo "$TEST_OUTPUT" | grep -oP '\d+(?= error)' | tail -1 || echo "0")
TESTS_PASSED=${TESTS_PASSED:-0}
TESTS_FAILED=${TESTS_FAILED:-0}
TESTS_ERRORS=${TESTS_ERRORS:-0}
TESTS_TOTAL=$(( TESTS_PASSED + TESTS_FAILED + TESTS_ERRORS ))
log "Test results: ${TESTS_PASSED} passed, ${TESTS_FAILED} failed, ${TESTS_ERRORS} errors"
# Print test output for CI visibility
if [ -n "$TEST_OUTPUT" ]; then
echo "─── Test Output ───"
echo "$TEST_OUTPUT" | tail -60
echo "─── End Test Output ───"
fi
# Clean up test runner
kubectl delete pod beta-test-runner -n "$BETA_NAMESPACE" --ignore-not-found > /dev/null 2>&1 || true
# ══════════════════════════════════════════════════════════════════════════════
# Stage 4: Promotion decision
# ══════════════════════════════════════════════════════════════════════════════
PROMOTE_STATUS="blocked"
FINAL_EXIT=0
if [ "$TESTS_FAILED" -gt 0 ] || [ "$TESTS_ERRORS" -gt 0 ] || [ "$TEST_EXIT_CODE" -ne 0 ]; then
log "✗ GATE FAILED — ${TESTS_FAILED} failures, ${TESTS_ERRORS} errors"
log " Promotion to paper-trading BLOCKED"
PROMOTE_STATUS="blocked"
FINAL_EXIT=1
elif [ "$SKIP_PROMOTE" = true ]; then
log "✓ Tests passed — promotion skipped (--skip-promote)"
PROMOTE_STATUS="skipped"
FINAL_EXIT=0
else
log "▶ Stage 4: Promoting to paper-trading"
log " Upgrading $PAPER_NAMESPACE with image tag $IMAGE_TAG ..."
if helm upgrade --install stonks-oracle \
"$REPO_ROOT/infra/helm/stonks-oracle" \
-n "$PAPER_NAMESPACE" \
--set "image.tag=$IMAGE_TAG" \
--wait \
--timeout 300s; then
log "✓ PROMOTED — paper-trading now running $IMAGE_TAG"
PROMOTE_STATUS="promoted"
FINAL_EXIT=0
# Rolling restart to pick up new images
log "Rolling restart of API services ..."
kubectl rollout restart deployment/query-api deployment/symbol-registry \
deployment/trading-engine deployment/risk-engine \
deployment/aggregation deployment/recommendation \
-n "$PAPER_NAMESPACE" 2>/dev/null || true
else
log "✗ Promotion failed — helm upgrade error"
PROMOTE_STATUS="failed"
FINAL_EXIT=3
fi
fi
# ══════════════════════════════════════════════════════════════════════════════
# Stage 5: Cleanup beta (optional)
# ══════════════════════════════════════════════════════════════════════════════
if [ "$SKIP_TEARDOWN" = false ] && [ "$PROMOTE_STATUS" = "promoted" ]; then
log "Scaling down beta deployment (keeping namespace for next run) ..."
helm upgrade stonks-oracle-beta \
"$REPO_ROOT/infra/helm/stonks-oracle" \
-n "$BETA_NAMESPACE" \
-f "$REPO_ROOT/infra/helm/stonks-oracle/values-beta.yaml" \
--set "image.tag=$IMAGE_TAG" \
--set "services.queryApi.replicas=0" \
--set "services.symbolRegistry.replicas=0" \
--set "services.tradingEngine.replicas=0" \
--set "services.riskEngine.replicas=0" \
--set "services.scheduler.replicas=0" \
--set "services.ingestion.replicas=0" \
--set "services.parser.replicas=0" \
--set "services.extractor.replicas=0" \
--set "services.aggregation.replicas=0" \
--set "services.recommendation.replicas=0" \
--set "services.brokerAdapter.replicas=0" \
--set "services.lakePublisher.replicas=0" \
--set "services.dashboard.replicas=0" \
2>/dev/null || true
log "Beta scaled to zero"
fi
# ══════════════════════════════════════════════════════════════════════════════
# Write results
# ══════════════════════════════════════════════════════════════════════════════
COMPLETED_AT=$(date -u +"%Y-%m-%dT%H:%M:%SZ")
PIPELINE_END=$(date +%s)
PIPELINE_DURATION=$(( PIPELINE_END - PIPELINE_START ))
cat > "$RESULTS_FILE" <<EOF
{
"run_id": "beta-gate-${IMAGE_TAG}",
"image_tag": "${IMAGE_TAG}",
"started_at": "${STARTED_AT}",
"completed_at": "${COMPLETED_AT}",
"duration_s": ${PIPELINE_DURATION},
"exit_code": ${FINAL_EXIT},
"deploy_status": "${DEPLOY_STATUS}",
"test_status": "$([ "$FINAL_EXIT" -le 1 ] && echo "completed" || echo "error")",
"promote_status": "${PROMOTE_STATUS}",
"tests": {
"total": ${TESTS_TOTAL},
"passed": ${TESTS_PASSED},
"failed": ${TESTS_FAILED},
"errors": ${TESTS_ERRORS}
}
}
EOF
log "Results written to $RESULTS_FILE"
echo ""
log "═══════════════════════════════════════════════════"
log " Beta Gate Summary"
log "═══════════════════════════════════════════════════"
log " Image: $IMAGE_TAG"
log " Duration: ${PIPELINE_DURATION}s"
log " Tests: ${TESTS_PASSED}/${TESTS_TOTAL} passed"
log " Promotion: ${PROMOTE_STATUS}"
log " Exit code: $FINAL_EXIT"
log "═══════════════════════════════════════════════════"
echo ""
exit "$FINAL_EXIT"
+5
View File
@@ -65,6 +65,11 @@ trivy:
metrics: metrics:
enabled: false enabled: false
# Enable Redis cache layer for faster manifest lookups (avoids upstream checks)
cache:
enabled: true
expireHours: 24
# Resource limits — conservative for a 4-node cluster # Resource limits — conservative for a 4-node cluster
core: core:
resources: resources:
+34 -1
View File
@@ -41,7 +41,7 @@ from services.shared.audit import get_entity_audit_trail, get_order_audit_trail,
from services.shared.config import load_config from services.shared.config import load_config
from services.shared.db import get_pg_pool, get_redis from services.shared.db import get_pg_pool, get_redis
from services.shared.logging import new_trace_id, set_trace_context, setup_logging from services.shared.logging import new_trace_id, set_trace_context, setup_logging
from services.shared.redis_keys import QUEUE_PREFIX, queue_key from services.shared.redis_keys import PREFIX, QUEUE_PREFIX, queue_key
from services.shared.schemas import MAJOR_DECISION_CATALYSTS from services.shared.schemas import MAJOR_DECISION_CATALYSTS
logger = logging.getLogger("query_api") logger = logging.getLogger("query_api")
@@ -1787,8 +1787,13 @@ async def get_pipeline_health(
except Exception: except Exception:
pass pass
# Pipeline enabled flag
pipeline_flag = await rds.get(_PIPELINE_ENABLED_KEY) if rds else None
pipeline_enabled = pipeline_flag != "0" if pipeline_flag is not None else True
return { return {
"hours": hours, "hours": hours,
"pipeline_enabled": pipeline_enabled,
"document_stages": [_row_to_dict(r) for r in doc_stages], "document_stages": [_row_to_dict(r) for r in doc_stages],
"parsing": _row_to_dict(parse_quality) if parse_quality else {}, "parsing": _row_to_dict(parse_quality) if parse_quality else {},
"extraction": _row_to_dict(extraction_stats) if extraction_stats else {}, "extraction": _row_to_dict(extraction_stats) if extraction_stats else {},
@@ -1927,6 +1932,34 @@ async def retry_failed_extractions_endpoint():
return {"retried": len(doc_ids), "message": f"Re-enqueued {len(doc_ids)} documents for extraction"} return {"retried": len(doc_ids), "message": f"Re-enqueued {len(doc_ids)} documents for extraction"}
# ---------------------------------------------------------------------------
# Pipeline On/Off Toggle
# ---------------------------------------------------------------------------
_PIPELINE_ENABLED_KEY = f"{PREFIX}:pipeline:enabled"
@app.get("/api/ops/pipeline/toggle")
async def get_pipeline_toggle():
    """Get the current pipeline enabled/disabled state.

    Reads the flag stored in Redis under ``_PIPELINE_ENABLED_KEY``.

    Returns:
        ``{"pipeline_enabled": bool}``. The value is ``False`` only when the
        stored flag is exactly ``"0"``. A missing key, or no Redis client,
        reports ``True``, the same fallback ``get_pipeline_health`` uses for
        its ``pipeline_enabled`` field.
    """
    # Guard the client the same way get_pipeline_health does, so both
    # endpoints agree when Redis is unavailable.
    val = await rds.get(_PIPELINE_ENABLED_KEY) if rds else None
    # Default to enabled if key doesn't exist
    return {"pipeline_enabled": val != "0"}
@app.post("/api/ops/pipeline/toggle")
async def set_pipeline_toggle(body: dict[str, Any]):
    """Toggle the pipeline on or off.

    Accepts: { "enabled": true/false }

    The request value is normalised to a real boolean before it is stored, so
    the Redis flag, the returned ``pipeline_enabled`` field and the message
    always agree. String spellings such as ``"false"``, ``"0"``, ``"off"`` and
    ``"no"`` count as disabled; a missing ``enabled`` key keeps the previous
    default of enabling the pipeline.

    The flag is written to ``_PIPELINE_ENABLED_KEY`` as ``"1"`` or ``"0"``.

    Returns:
        ``{"pipeline_enabled": bool, "message": str}``.
    """
    raw = body.get("enabled", True)
    if isinstance(raw, str):
        # Plain truthiness would treat the string "false" as enabled.
        enabled = raw.strip().lower() not in {"", "0", "false", "off", "no"}
    else:
        enabled = bool(raw)
    await rds.set(_PIPELINE_ENABLED_KEY, "1" if enabled else "0")
    return {"pipeline_enabled": enabled, "message": f"Pipeline {'enabled' if enabled else 'disabled'}"}
@app.get("/api/ops/sources/coverage-gaps") @app.get("/api/ops/sources/coverage-gaps")
async def get_source_coverage_gaps(): async def get_source_coverage_gaps():
"""Identify symbols with missing or insufficient source coverage. """Identify symbols with missing or insufficient source coverage.
+8
View File
@@ -19,6 +19,7 @@ from services.shared.config import load_config
from services.shared.db import get_pg_pool, get_redis from services.shared.db import get_pg_pool, get_redis
from services.shared.logging import setup_logging from services.shared.logging import setup_logging
from services.shared.redis_keys import ( from services.shared.redis_keys import (
PREFIX,
QUEUE_EXTRACTION, QUEUE_EXTRACTION,
QUEUE_INGESTION, QUEUE_INGESTION,
QUEUE_MACRO_CLASSIFICATION, QUEUE_MACRO_CLASSIFICATION,
@@ -499,12 +500,19 @@ async def main() -> None:
rds = get_redis(config) rds = get_redis(config)
logger.info("Scheduler started (tick=%ds)", SCHEDULER_TICK) logger.info("Scheduler started (tick=%ds)", SCHEDULER_TICK)
pipeline_key = f"{PREFIX}:pipeline:enabled"
recovery_counter = 0 recovery_counter = 0
retry_counter = 0 retry_counter = 0
cleanup_counter = 0 cleanup_counter = 0
try: try:
while True: while True:
try: try:
# Check pipeline toggle — skip cycle if disabled
flag = await rds.get(pipeline_key)
if flag == "0":
await asyncio.sleep(SCHEDULER_TICK)
continue
if await acquire_lock(rds, "scheduler_cycle", ttl=30): if await acquire_lock(rds, "scheduler_cycle", ttl=30):
try: try:
await schedule_cycle(pool, rds) await schedule_cycle(pool, rds)
-77
View File
@@ -1,77 +0,0 @@
"""Database migration runner using asyncpg.
Applies all SQL migration files from infra/migrations/ in sorted order.
Each file is split on semicolons and executed statement-by-statement.
Idempotent — migrations use IF NOT EXISTS / CREATE OR REPLACE patterns.
Usage:
python -m services.shared.migrate
"""
import asyncio
import glob
import logging
import os
import sys
import asyncpg
logger = logging.getLogger("migrate")
async def run_migrations() -> None:
host = os.getenv("POSTGRES_HOST", "localhost")
port = int(os.getenv("POSTGRES_PORT", "5432"))
user = os.getenv("POSTGRES_USER", "stonks")
password = os.getenv("POSTGRES_PASSWORD", "")
database = os.getenv("POSTGRES_DB", "stonks")
migrations_dir = os.path.join(
os.path.dirname(__file__), "..", "..", "infra", "migrations"
)
migrations_dir = os.path.normpath(migrations_dir)
if not os.path.isdir(migrations_dir):
logger.error("Migrations directory not found: %s", migrations_dir)
sys.exit(1)
files = sorted(glob.glob(os.path.join(migrations_dir, "*.sql")))
if not files:
logger.warning("No migration files found in %s", migrations_dir)
return
logger.info("Connecting to %s@%s:%d/%s", user, host, port, database)
conn = await asyncpg.connect(
host=host, port=port, user=user, password=password, database=database
)
try:
for path in files:
name = os.path.basename(path)
with open(path) as f:
sql = f.read()
# Split on semicolons and execute each statement individually.
# asyncpg.execute() doesn't support multi-statement strings.
statements = [s.strip() for s in sql.split(";") if s.strip()]
try:
for stmt in statements:
await conn.execute(stmt)
logger.info("%s (%d statements)", name, len(statements))
except Exception as exc:
logger.warning("%s: %s", name, exc)
finally:
await conn.close()
logger.info("Migrations complete (%d files)", len(files))
def main() -> None:
logging.basicConfig(
level=logging.INFO,
format="%(asctime)s %(name)s %(message)s",
datefmt="%H:%M:%S",
)
asyncio.run(run_migrations())
if __name__ == "__main__":
main()
+376
View File
@@ -0,0 +1,376 @@
"""Integration tests for cross-service signal flow contracts.
These tests validate the end-to-end data flow that the trading engine
depends on. They catch schema drift and contract violations between
services that unit tests miss.
Flow under test:
1. Symbol Registry has companies with exposure profiles and competitors
2. Query API returns trends with correct schema for trading engine consumption
3. Risk engine evaluates orders using data from query API
4. Trading engine receives valid recommendation payloads
These are the "beta gate" tests — if any fail, promotion to paper is blocked.
"""
import pytest
pytestmark = pytest.mark.asyncio
# ---------------------------------------------------------------------------
# Contract: Symbol Registry → Query API company data consistency
# ---------------------------------------------------------------------------
class TestRegistryToQueryContract:
"""Verify that company data in the registry matches what query API exposes."""
async def test_company_ids_consistent(self, registry_client, query_client, seed_ids):
"""Company IDs from registry match those returned by query API."""
reg_resp = await registry_client.get("/companies")
assert reg_resp.status_code == 200
reg_companies = {c["id"]: c["ticker"] for c in reg_resp.json()}
query_resp = await query_client.get("/api/companies")
assert query_resp.status_code == 200
query_companies = {c["id"]: c["ticker"] for c in query_resp.json()}
# Every company in registry should appear in query API
for cid, ticker in reg_companies.items():
assert cid in query_companies, (
f"Company {ticker} ({cid}) in registry but missing from query API"
)
assert query_companies[cid] == ticker
async def test_exposure_profiles_accessible(self, registry_client, seed_ids):
"""Exposure profiles required by macro signal layer are accessible."""
company_id = seed_ids["companies"]["AAPL"]
resp = await registry_client.get(f"/companies/{company_id}/exposure")
assert resp.status_code == 200
data = resp.json()
# Trading engine needs these fields for macro impact scoring
assert "geographic_revenue_mix" in data
assert "supply_chain_regions" in data
assert "key_input_commodities" in data
assert "market_position_tier" in data
assert "export_dependency_pct" in data
# Values must be valid types
assert isinstance(data["geographic_revenue_mix"], dict)
assert isinstance(data["supply_chain_regions"], list)
assert isinstance(data["export_dependency_pct"], (int, float))
assert 0 <= data["export_dependency_pct"] <= 1
async def test_competitor_relationships_bidirectional(self, registry_client, seed_ids):
"""Competitor relationships are queryable from both sides."""
aapl_id = seed_ids["companies"]["AAPL"]
msft_id = seed_ids["companies"]["MSFT"]
# Query from AAPL side
resp_a = await registry_client.get(f"/companies/{aapl_id}/competitors")
assert resp_a.status_code == 200
aapl_competitors = resp_a.json()
# Query from MSFT side
resp_b = await registry_client.get(f"/companies/{msft_id}/competitors")
assert resp_b.status_code == 200
msft_competitors = resp_b.json()
# AAPL should see MSFT and vice versa
aapl_partner_ids = set()
for rel in aapl_competitors:
if rel.get("company_a_id") == aapl_id:
aapl_partner_ids.add(rel["company_b_id"])
else:
aapl_partner_ids.add(rel["company_a_id"])
msft_partner_ids = set()
for rel in msft_competitors:
if rel.get("company_a_id") == msft_id:
msft_partner_ids.add(rel["company_b_id"])
else:
msft_partner_ids.add(rel["company_a_id"])
assert msft_id in aapl_partner_ids, "MSFT not in AAPL's competitors"
assert aapl_id in msft_partner_ids, "AAPL not in MSFT's competitors"
# ---------------------------------------------------------------------------
# Contract: Query API → Trading Engine trend data
# ---------------------------------------------------------------------------
class TestTrendToTradingContract:
"""Verify trend data has the schema the trading engine expects."""
async def test_trend_has_required_trading_fields(self, query_client, seed_ids):
"""Trends must include fields the trading engine uses for decisions."""
resp = await query_client.get("/api/trends")
assert resp.status_code == 200
trends = resp.json()
assert len(trends) >= 1
for trend in trends:
# Fields the trading engine reads
assert "id" in trend
assert "trend_direction" in trend
assert "confidence" in trend
assert "trend_strength" in trend
# Direction must be a valid enum value
assert trend["trend_direction"] in (
"bullish", "bearish", "mixed", "neutral",
), f"Invalid direction: {trend['trend_direction']}"
# Confidence and strength must be normalized [0, 1]
assert 0 <= trend["confidence"] <= 1, (
f"Confidence out of range: {trend['confidence']}"
)
assert 0 <= trend["trend_strength"] <= 1, (
f"Strength out of range: {trend['trend_strength']}"
)
async def test_trend_detail_has_evidence(self, query_client, seed_ids):
"""Individual trend detail includes evidence the trading engine logs."""
trend_id = seed_ids["trends"]["TREND_01"]
resp = await query_client.get(f"/api/trends/{trend_id}")
assert resp.status_code == 200
data = resp.json()
# Trading engine logs these for audit trail
assert "top_supporting_evidence" in data
assert "top_opposing_evidence" in data
assert "dominant_catalysts" in data
assert isinstance(data["top_supporting_evidence"], list)
assert isinstance(data["top_opposing_evidence"], list)
assert isinstance(data["dominant_catalysts"], list)
# ---------------------------------------------------------------------------
# Contract: Recommendation → Risk Engine → Trading Engine
# ---------------------------------------------------------------------------
class TestRecommendationToRiskContract:
"""Verify recommendations produce valid risk evaluation inputs."""
async def test_recommendation_has_risk_fields(self, query_client, seed_ids):
"""Recommendations include fields needed for risk evaluation."""
resp = await query_client.get(
"/api/recommendations", params={"latest": "false"},
)
assert resp.status_code == 200
recs = resp.json()
assert len(recs) >= 1
for rec in recs:
assert "ticker" in rec
assert "action" in rec
assert "confidence" in rec
assert "mode" in rec
# Action must be valid
assert rec["action"] in ("buy", "sell", "hold", "watch")
# Mode determines if it reaches trading engine
assert rec["mode"] in (
"informational", "paper_eligible", "live_eligible",
)
# Confidence must be normalized
assert 0 <= rec["confidence"] <= 1
async def test_risk_evaluation_schema(self, risk_client):
"""Risk engine returns evaluation with all fields trading engine needs."""
payload = {
"order": {
"ticker": "AAPL",
"action": "buy",
"quantity": 5,
"estimated_value": 925.0,
"confidence": 0.75,
"recommendation_id": None,
"sector": "Technology",
},
}
resp = await risk_client.post("/evaluate", json=payload)
assert resp.status_code == 200
data = resp.json()
# Trading engine reads these fields from risk evaluation
assert "evaluation_id" in data
assert "ticker" in data
assert "eligible" in data
assert "rejection_reasons" in data
assert "checks" in data
assert "evaluated_at" in data
# Types
assert isinstance(data["eligible"], bool)
assert isinstance(data["rejection_reasons"], list)
assert isinstance(data["checks"], list)
# Each check should have name and passed
for check in data["checks"]:
assert "name" in check
assert "passed" in check
assert isinstance(check["passed"], bool)
async def test_risk_rejects_oversized_order(self, risk_client):
"""Risk engine correctly rejects an order exceeding position cap."""
payload = {
"order": {
"ticker": "AAPL",
"action": "buy",
"quantity": 1000,
"estimated_value": 185000.0,
"confidence": 0.9,
"recommendation_id": None,
"sector": "Technology",
},
"config": {
"absolute_position_cap": 10000.0,
},
}
resp = await risk_client.post("/evaluate", json=payload)
assert resp.status_code == 200
data = resp.json()
# Should be rejected due to position cap
assert data["eligible"] is False
assert len(data["rejection_reasons"]) > 0
# ---------------------------------------------------------------------------
# Contract: Trading Engine state consistency
# ---------------------------------------------------------------------------
class TestTradingEngineState:
"""Verify trading engine exposes consistent state for the promotion gate."""
async def test_status_reflects_config(self, trading_client):
"""Engine status fields are consistent with each other."""
resp = await trading_client.get("/api/trading/status")
assert resp.status_code == 200
data = resp.json()
# If paused, open_positions should still be reported
assert "open_positions" in data
assert isinstance(data["open_positions"], int)
assert data["open_positions"] >= 0
# Risk tier must be valid
assert data["risk_tier"] in ("conservative", "moderate", "aggressive")
# Pool values must be non-negative
assert data["active_pool"] >= 0
assert data["reserve_pool"] >= 0
async def test_decisions_have_audit_fields(self, trading_client, seed_ids):
"""Trading decisions include full audit trail fields."""
resp = await trading_client.get("/api/trading/decisions")
assert resp.status_code == 200
decisions = resp.json()
if len(decisions) > 0:
d = decisions[0]
assert "id" in d
assert "decision" in d
assert "ticker" in d
assert "created_at" in d
# Decision type must be valid
assert d["decision"] in ("act", "skip")
async def test_metrics_numeric_consistency(self, trading_client):
"""Portfolio metrics are all numeric and internally consistent."""
resp = await trading_client.get("/api/trading/metrics")
assert resp.status_code == 200
data = resp.json()
# All values must be numeric
numeric_fields = [
"total_portfolio_value", "active_pool", "reserve_pool",
"unrealized_pnl", "realized_pnl", "daily_pnl",
"win_rate", "sharpe_ratio", "max_drawdown", "portfolio_heat",
]
for field in numeric_fields:
assert field in data, f"Missing field: {field}"
assert isinstance(data[field], (int, float)), (
f"{field} should be numeric, got {type(data[field])}"
)
# Win rate and portfolio heat should be bounded
assert 0 <= data["win_rate"] <= 1 or data["win_rate"] == 0
assert 0 <= data["portfolio_heat"] <= 1 or data["portfolio_heat"] == 0
# Total portfolio = active + reserve + unrealized (approximately)
# Allow some tolerance for rounding
expected_total = data["active_pool"] + data["reserve_pool"] + data["unrealized_pnl"]
if data["total_portfolio_value"] > 0:
diff = abs(data["total_portfolio_value"] - expected_total)
assert diff < data["total_portfolio_value"] * 0.1, (
f"Portfolio value inconsistency: total={data['total_portfolio_value']}, "
f"active+reserve+unrealized={expected_total}"
)
# ---------------------------------------------------------------------------
# Contract: Approval workflow integration
# ---------------------------------------------------------------------------
class TestApprovalWorkflowContract:
"""Verify the approval workflow is accessible and returns valid schemas."""
async def test_pending_approvals_schema(self, risk_client):
"""Pending approvals list returns valid schema (may be empty)."""
resp = await risk_client.get("/approvals/pending")
assert resp.status_code == 200
data = resp.json()
assert isinstance(data, list)
# If there are pending approvals, validate schema
for approval in data:
assert "id" in approval
assert "status" in approval
assert "ticker" in approval
assert "side" in approval
assert "quantity" in approval
assert "created_at" in approval
async def test_approval_not_found_returns_404(self, risk_client):
"""Non-existent approval ID returns 404, not 500."""
fake_id = "00000000-0000-4000-ffff-ffffffffffff"
resp = await risk_client.get(f"/approvals/{fake_id}")
assert resp.status_code == 404
# ---------------------------------------------------------------------------
# Contract: Cross-service health (all services must be up for paper trading)
# ---------------------------------------------------------------------------
class TestCrossServiceHealth:
"""All services must be healthy before promotion to paper trading."""
async def test_all_services_healthy(
self, query_client, registry_client, risk_client, trading_client,
):
"""Every service responds to health check."""
services = {
"query-api": query_client,
"symbol-registry": registry_client,
"risk-engine": risk_client,
"trading-engine": trading_client,
}
for name, client in services.items():
resp = await client.get("/health")
assert resp.status_code == 200, (
f"{name} health check failed with status {resp.status_code}"
)
data = resp.json()
assert data.get("status") == "ok", (
f"{name} reported unhealthy: {data}"
)
async def test_trading_engine_ready(self, trading_client):
"""Trading engine readiness probe passes (DB + Redis connected)."""
resp = await trading_client.get("/ready")
assert resp.status_code == 200
data = resp.json()
assert data["ready"] is True, (
f"Trading engine not ready: {data}"
)