fix: wait for scheduler migrations after factory reset scale-up
ci/woodpecker/push/test Pipeline was successful
ci/woodpecker/push/build-1 Pipeline was successful
ci/woodpecker/push/build-2 Pipeline was successful
ci/woodpecker/push/build-3 Pipeline was successful
ci/woodpecker/push/finalize Pipeline was successful
Build and Push / lint-and-test (push) Has been cancelled
Build and Push / build-services (map[cmd:python -m services.adapters.broker_adapter name:broker-adapter]) (push) Has been cancelled
Build and Push / build-services (map[cmd:python -m services.aggregation.worker name:aggregation]) (push) Has been cancelled
Build and Push / build-services (map[cmd:python -m services.extractor.worker name:extractor]) (push) Has been cancelled
Build and Push / build-services (map[cmd:python -m services.ingestion.worker name:ingestion]) (push) Has been cancelled
Build and Push / build-services (map[cmd:python -m services.lake_publisher.worker name:lake-publisher]) (push) Has been cancelled
Build and Push / build-services (map[cmd:python -m services.parser.worker name:parser]) (push) Has been cancelled
Build and Push / build-services (map[cmd:python -m services.recommendation.worker name:recommendation]) (push) Has been cancelled
Build and Push / build-services (map[cmd:python -m services.scheduler.app name:scheduler]) (push) Has been cancelled
Build and Push / build-services (map[cmd:uvicorn services.api.app:app --host 0.0.0.0 --port 8000 name:query-api]) (push) Has been cancelled
Build and Push / build-services (map[cmd:uvicorn services.risk.app:app --host 0.0.0.0 --port 8000 name:risk]) (push) Has been cancelled
Build and Push / build-services (map[cmd:uvicorn services.symbol_registry.app:app --host 0.0.0.0 --port 8000 name:symbol-registry]) (push) Has been cancelled
Build and Push / build-services (map[cmd:uvicorn services.trading.app:app --host 0.0.0.0 --port 8000 name:trading-engine]) (push) Has been cancelled
Build and Push / build-dashboard (push) Has been cancelled
Build and Push / build-superset (push) Has been cancelled
Build and Push / integration-test (push) Has been cancelled
Build and Push / beta-gate (push) Has been cancelled

After scale_up, restart the scheduler and wait for it to be ready
before other services start. The scheduler's init containers run
migrations, so this ensures tables exist before ingestion/extractor
pods try to query them.
This commit is contained in:
Celes Renata
2026-04-30 07:01:01 +00:00
parent 601b85764b
commit 34ffdad00c
+360
View File
@@ -0,0 +1,360 @@
#!/usr/bin/env bash
set -euo pipefail
# factory-reset.sh — Factory reset a Stonks Oracle stage
#
# Drops and recreates the database, flushes Redis keys, empties S3 buckets,
# re-runs migrations, and re-seeds the symbol registry.
#
# Usage:
# bash scripts/factory-reset.sh <stage> [--component <component>] [--yes] [--verbose]
#
# Stages:
# production — stonks-oracle namespace, DB=stonks, Redis DB=0, buckets=stonks-*
# paper — stonks-paper namespace, DB=stonks_paper, Redis DB=2, buckets=paper-stonks-*
# beta — stonks-beta namespace, DB=stonks_beta, Redis DB=1, buckets=beta-stonks-*
#
# Components (optional, default: all):
# all — Full factory reset (DB + S3 + Redis)
# db — Database only (drop/recreate + migrations + seed)
# s3 — S3 buckets only (empty all stage buckets)
# redis — Redis only (flush stage keys)
# computed — Computed data only (trends, recommendations, orders, positions)
#
# Examples:
# bash scripts/factory-reset.sh beta # Full reset of beta
# bash scripts/factory-reset.sh production --component db # DB-only reset of production
# bash scripts/factory-reset.sh paper --component computed # Clear computed data in paper
# bash scripts/factory-reset.sh beta --verbose # Full reset with per-object output
#
# Requirements:
# - kubectl access to the cluster
# - mc (MinIO client) configured with alias "stonks"
# ---------------------------------------------------------------------------
# Configuration
# ---------------------------------------------------------------------------
# The first positional argument selects the stage; everything after it is
# an optional flag.
STAGE="${1:-}"
COMPONENT="all"
AUTO_YES=false
VERBOSE=false

# A stage is mandatory: without one, print the usage summary and stop.
if [[ -z "$STAGE" ]]; then
  echo "Usage: bash scripts/factory-reset.sh <stage> [--component <component>] [--yes] [--verbose]"
  echo "Stages: production, paper, beta"
  echo "Components: all, db, s3, redis, computed"
  echo "Flags: --yes Skip confirmation prompt"
  echo " --verbose Show detailed per-object output"
  exit 1
fi
shift

# Parse the flags that follow the stage.
while [[ $# -gt 0 ]]; do
  case "$1" in
    --component)
      # With `set -u` active, expanding a missing "$2" aborts the script with
      # an opaque "unbound variable" message, so check for the value first.
      if [[ $# -lt 2 ]]; then
        echo "ERROR: --component requires a value. Use: all, db, s3, redis, computed"
        exit 1
      fi
      COMPONENT="$2"
      shift 2
      ;;
    --yes|-y)
      AUTO_YES=true
      shift
      ;;
    --verbose|-v)
      VERBOSE=true
      shift
      ;;
    *)
      echo "Unknown option: $1"
      exit 1
      ;;
  esac
done
# Translate the requested stage into its Kubernetes namespace, database name,
# Redis DB index, S3 bucket prefix, Redis key prefix and deploy-stage label.
# Arguments (in order): namespace, db name, redis db, bucket prefix,
# redis key prefix, deploy stage.
_set_stage_config() {
  NAMESPACE="$1"
  DB_NAME="$2"
  REDIS_DB=$3
  BUCKET_PREFIX="$4"
  REDIS_KEY_PREFIX="$5"
  DEPLOY_STAGE="$6"
}

if [[ "$STAGE" == "production" || "$STAGE" == "prod" ]]; then
  _set_stage_config "stonks-oracle" "stonks" 0 "stonks-" "stonks:" ""
elif [[ "$STAGE" == "paper" ]]; then
  _set_stage_config "stonks-paper" "stonks_paper" 2 "paper-stonks-" "stonks:paper:" "paper"
elif [[ "$STAGE" == "beta" ]]; then
  _set_stage_config "stonks-beta" "stonks_beta" 1 "beta-stonks-" "stonks:beta:" "beta"
else
  # Anything else is rejected before any cluster command is issued.
  echo "ERROR: Unknown stage '$STAGE'. Use: production, paper, beta"
  exit 1
fi
# PostgreSQL pod, namespace and superuser used for `kubectl exec ... psql`.
PG_POD="postgresql-1"
PG_NS="postgresql-service"
PG_USER="postgres"

# Redis endpoint used when flushing stage keys.
REDIS_HOST="redis-master.redis-service.svc.cluster.local"
REDIS_PORT="6379"
# NOTE(review): this credential is committed in plain text. A REDIS_PASSWORD
# value supplied through the environment now takes precedence, so the secret
# can be injected without editing the script; the literal remains only as a
# backward-compatible default and should be rotated and removed.
REDIS_PASSWORD="${REDIS_PASSWORD:-PSCh4ng3me!}"

# Alias under which the MinIO client (mc) is configured.
MC_ALIAS="stonks"

# S3 bucket suffixes; each is appended to the stage's BUCKET_PREFIX.
BUCKET_SUFFIXES=(
  "audit"
  "lakehouse"
  "llm-prompts"
  "llm-results"
  "normalized"
  "raw-filings"
  "raw-market"
  "raw-news"
)
# Show the operator exactly what is about to be reset.
_banner_rule="============================================"
printf '%s\n' \
  "$_banner_rule" \
  " Stonks Oracle Factory Reset" \
  "$_banner_rule" \
  " Stage: $STAGE" \
  " Namespace: $NAMESPACE" \
  " Database: $DB_NAME" \
  " Redis DB: $REDIS_DB" \
  " Buckets: ${BUCKET_PREFIX}*" \
  " Component: $COMPONENT"
# The verbose line is only shown when the flag was given.
[[ "$VERBOSE" != true ]] || echo " Verbose: ON"
printf '%s\n' "$_banner_rule" ""

# Safety gate: unless --yes was passed, the operator must retype the stage
# name exactly as given on the command line.
if [[ "$AUTO_YES" != true ]]; then
  read -rp "⚠️ This will DESTROY data. Type '$STAGE' to confirm: " confirm
  if [[ "$confirm" != "$STAGE" ]]; then
    echo "Aborted."
    exit 1
  fi
else
  echo "⚠️ --yes flag set, skipping confirmation"
fi
echo ""
# ---------------------------------------------------------------------------
# scale_down: set every deployment in $NAMESPACE to zero replicas.
#
# Globals:  NAMESPACE (read), VERBOSE (read)
# Outputs:  progress lines on stdout; kubectl's own stdout is shown only in
#           verbose mode, its stderr is always discarded.
# Returns:  0 — every kubectl step is best-effort.
# ---------------------------------------------------------------------------
scale_down() {
  echo "--- Scaling down $NAMESPACE deployments ---"

  local targets
  targets=$(kubectl get deployments -n "$NAMESPACE" -o name 2>/dev/null || true)

  # Nothing deployed (or the listing failed): there is nothing to wait for.
  if [[ -z "$targets" ]]; then
    echo " ✓ All deployments scaled to 0"
    return
  fi

  local total
  total=$(wc -l <<<"$targets")

  # One `kubectl scale` invocation per deployment name.
  if [[ "$VERBOSE" != true ]]; then
    xargs -I{} kubectl scale {} --replicas=0 -n "$NAMESPACE" <<<"$targets" &>/dev/null || true
  else
    xargs -I{} kubectl scale {} --replicas=0 -n "$NAMESPACE" <<<"$targets" 2>/dev/null || true
  fi

  echo " Waiting for $total deployments to terminate..."
  # Bounded wait; a timeout is tolerated.
  kubectl wait --for=delete pod --all -n "$NAMESPACE" --timeout=60s 2>/dev/null || true
  echo " ✓ All deployments scaled to 0"
}
# ---------------------------------------------------------------------------
# scale_up: bring the namespace's deployments back and wait for the scheduler.
#
# If an ArgoCD application whose listing mentions $NAMESPACE exists, replica
# restoration is left to ArgoCD; otherwise every deployment is scaled to one
# replica. The scheduler deployment is then restarted and its rollout is
# awaited for up to 120 seconds.
#
# Globals:  NAMESPACE (read), VERBOSE (read)
# Outputs:  progress lines on stdout.
# Returns:  0 — all kubectl steps are best-effort; a rollout that does not
#           complete is reported as a warning instead of a success.
# ---------------------------------------------------------------------------
scale_up() {
  echo "--- Scaling up $NAMESPACE deployments ---"

  # Capture the listing first instead of piping into `grep -q`: with
  # `pipefail`, grep exiting early can make the pipeline report failure
  # (SIGPIPE in kubectl) even though the namespace was found.
  local argo_apps
  argo_apps=$(kubectl get application -n argocd 2>/dev/null || true)

  if [[ -n "$argo_apps" ]] && grep -qF -- "$NAMESPACE" <<<"$argo_apps"; then
    echo " ArgoCD will restore replicas via self-heal"
  else
    echo " Manually restoring replicas..."
    local deployments
    deployments=$(kubectl get deployments -n "$NAMESPACE" -o name 2>/dev/null || true)
    # Skip xargs entirely when there is nothing to scale.
    if [[ -n "$deployments" ]]; then
      if [[ "$VERBOSE" == true ]]; then
        xargs -I{} kubectl scale {} --replicas=1 -n "$NAMESPACE" <<<"$deployments" 2>/dev/null || true
      else
        xargs -I{} kubectl scale {} --replicas=1 -n "$NAMESPACE" <<<"$deployments" &>/dev/null || true
      fi
    fi
  fi
  echo " ✓ Scale-up triggered"

  # Wait for the scheduler pod (which runs migrations via init containers)
  # to be fully ready before other services start querying the DB.
  echo " Waiting for scheduler pod (runs migrations)..."
  kubectl rollout restart deployment/scheduler -n "$NAMESPACE" 2>/dev/null || true
  if kubectl rollout status deployment/scheduler -n "$NAMESPACE" --timeout=120s 2>/dev/null; then
    echo " ✓ Scheduler ready (migrations applied)"
  else
    # Do not claim success when the rollout failed or timed out.
    echo " ⚠ Scheduler rollout did not complete within 120s — migrations may not be applied yet"
  fi
}
# ---------------------------------------------------------------------------
# reset_db: drop and recreate $DB_NAME, apply migrations, seed the registry.
#
# Steps: collect infra/migrations/*.sql (relative to the current directory),
# terminate open connections, DROP/CREATE the database, pipe each migration
# file into psql, call scale_up, then run the symbol-registry seed module
# inside a scheduler pod.
#
# Globals:  DB_NAME, PG_NS, PG_POD, PG_USER, NAMESPACE, VERBOSE (all read)
# Outputs:  progress and warning lines on stdout.
# Returns:  1 when no migration files are found (checked before anything is
#           dropped); otherwise 0. A failing DROP/CREATE still propagates its
#           status, as before.
# ---------------------------------------------------------------------------
reset_db() {
  echo "--- Resetting database: $DB_NAME ---"

  # Resolve the migration list BEFORE touching the database, so running from
  # the wrong directory cannot leave an empty, schema-less database behind.
  # A glob replaces parsing `ls` output; it expands in sorted order.
  local -a migrations=()
  local migration
  for migration in infra/migrations/*.sql; do
    [[ -e "$migration" ]] || continue
    migrations+=("$migration")
  done
  local count=${#migrations[@]}
  if (( count == 0 )); then
    echo "ERROR: no migration files found under infra/migrations (relative to $PWD); database left untouched"
    return 1
  fi

  # Terminate active connections
  if [[ "$VERBOSE" == true ]]; then
    kubectl exec -n "$PG_NS" "$PG_POD" -c postgres -- psql -U "$PG_USER" -c \
      "SELECT pg_terminate_backend(pid) FROM pg_stat_activity WHERE datname = '$DB_NAME' AND pid <> pg_backend_pid();" \
      2>/dev/null || true
  else
    kubectl exec -n "$PG_NS" "$PG_POD" -c postgres -- psql -U "$PG_USER" -tAc \
      "SELECT pg_terminate_backend(pid) FROM pg_stat_activity WHERE datname = '$DB_NAME' AND pid <> pg_backend_pid();" \
      &>/dev/null || true
  fi

  # Drop and recreate
  kubectl exec -n "$PG_NS" "$PG_POD" -c postgres -- psql -U "$PG_USER" -c \
    "DROP DATABASE IF EXISTS $DB_NAME;"
  kubectl exec -n "$PG_NS" "$PG_POD" -c postgres -- psql -U "$PG_USER" -c \
    "CREATE DATABASE $DB_NAME OWNER stonks;"
  echo " ✓ Database recreated"

  # Run migrations. Each file is still best-effort, but failures are counted
  # and reported instead of being hidden behind an unconditional success line.
  echo " Running $count migrations..."
  local failed=0
  for migration in "${migrations[@]}"; do
    if [[ "$VERBOSE" == true ]]; then
      echo " Applying ${migration##*/}..."
    fi
    if ! kubectl exec -n "$PG_NS" "$PG_POD" -c postgres -i -- \
      psql -U stonks -d "$DB_NAME" < "$migration" 2>/dev/null; then
      failed=$((failed + 1))
      echo " ⚠ Migration failed: ${migration##*/}"
    fi
  done
  if (( failed == 0 )); then
    echo " ✓ Migrations applied ($count files)"
  else
    echo " ⚠ $failed of $count migrations failed"
  fi

  # Seed symbol registry
  echo " Seeding symbol registry..."
  # Wait for at least one pod to be ready
  scale_up
  sleep 10

  local scheduler_pod
  # `|| true` keeps a failed lookup from aborting under `set -e`/`pipefail`,
  # so the "no scheduler pod" branch below is actually reachable.
  scheduler_pod=$(kubectl get pods -n "$NAMESPACE" -l app=scheduler -o name 2>/dev/null | head -1) || true

  if [[ -z "$scheduler_pod" ]]; then
    echo " ⚠ No scheduler pod available — seed will run on next deployment"
  elif kubectl exec -n "$NAMESPACE" "$scheduler_pod" -c scheduler -- \
    python -m services.symbol_registry.seed 2>/dev/null; then
    echo " ✓ Seeded"
  else
    echo " ⚠ Seed failed (will retry on next restart)"
  fi
}
# ---------------------------------------------------------------------------
# reset_computed: delete derived rows (trends, recommendations, orders,
# positions, ...) from $DB_NAME while leaving the rest of the schema alone.
#
# Globals:  DB_NAME, PG_NS, PG_POD, PG_USER (read)
# Outputs:  progress lines on stdout.
# Returns:  0 on success, 1 if the psql invocation fails. psql's stderr is
#           discarded, so an explicit failure line is printed; otherwise the
#           script would stop (under `set -e`) without any explanation.
# ---------------------------------------------------------------------------
reset_computed() {
  echo "--- Clearing computed data in $DB_NAME ---"
  if ! kubectl exec -n "$PG_NS" "$PG_POD" -c postgres -- psql -U "$PG_USER" -d "$DB_NAME" -c "
-- Order matters due to FK constraints
DELETE FROM recommendation_evidence;
DELETE FROM risk_evaluations;
DELETE FROM order_events;
DELETE FROM orders;
DELETE FROM trading_decisions;
DELETE FROM positions;
DELETE FROM portfolio_snapshots;
DELETE FROM reserve_pool_ledger;
DELETE FROM risk_tier_history;
DELETE FROM circuit_breaker_events;
DELETE FROM notifications;
DELETE FROM recommendations;
DELETE FROM trend_evidence;
DELETE FROM trend_projections;
DELETE FROM trend_history;
DELETE FROM trend_windows;
DELETE FROM backtest_trades;
DELETE FROM backtest_runs;
DELETE FROM position_stop_levels;
" 2>/dev/null; then
    echo " ✗ Failed to clear computed data in $DB_NAME"
    return 1
  fi
  echo " ✓ Computed data cleared"
}
# ---------------------------------------------------------------------------
# reset_s3: remove every object from each stage bucket
# (${BUCKET_PREFIX}<suffix> for each entry of BUCKET_SUFFIXES).
#
# Globals:  BUCKET_PREFIX, BUCKET_SUFFIXES, MC_ALIAS, VERBOSE (read)
# Outputs:  one status line per bucket on stdout; in verbose mode the
#           per-object output of `mc rm` is shown instead of a count.
# Returns:  0 — removal is best-effort and `mc rm` errors are tolerated.
# ---------------------------------------------------------------------------
reset_s3() {
  echo "--- Emptying S3 buckets: ${BUCKET_PREFIX}* ---"

  local suffix bucket listing removed
  for suffix in "${BUCKET_SUFFIXES[@]}"; do
    bucket="${BUCKET_PREFIX}${suffix}"

    # Buckets that cannot be listed are skipped rather than treated as errors.
    if ! mc ls "${MC_ALIAS}/${bucket}" &>/dev/null; then
      echo "${bucket} not found (skipping)"
      continue
    fi

    echo -n " Emptying ${bucket}..."
    if [[ "$VERBOSE" == true ]]; then
      echo ""
      mc rm --recursive --force "${MC_ALIAS}/${bucket}/" 2>/dev/null || true
    else
      # Capture the output first and count afterwards. Piping straight into
      # `wc -l || echo 0` yields TWO values under `pipefail` whenever `mc rm`
      # exits non-zero (wc's count followed by the fallback "0").
      listing=$(mc rm --recursive --force "${MC_ALIAS}/${bucket}/" 2>/dev/null || true)
      removed=0
      if [[ -n "$listing" ]]; then
        removed=$(printf '%s\n' "$listing" | wc -l)
        # Some wc implementations pad the number with blanks.
        removed=${removed//[[:space:]]/}
      fi
      echo " ${removed} objects removed"
    fi
    echo "${bucket} emptied"
  done

  echo " ✓ All S3 buckets emptied"
}
# ---------------------------------------------------------------------------
# reset_redis: delete the stage's Redis keys in DB $REDIS_DB.
#
# First attempt: run a short Python snippet inside the scheduler deployment
# that deletes every key matching 'stonks:*'. If that fails for any reason,
# fall back to FLUSHDB through redis-cli in the redis-master-0 pod, which
# clears the ENTIRE Redis DB.
#
# Globals:  NAMESPACE, REDIS_DB, REDIS_HOST, REDIS_PORT, REDIS_PASSWORD (read)
# Outputs:  progress lines on stdout.
# Returns:  0 — best-effort; if both attempts fail a warning is printed
#           instead of a success line.
# ---------------------------------------------------------------------------
reset_redis() {
  echo "--- Flushing Redis DB $REDIS_DB ---"

  # Connection settings are handed to Python as argv rather than spliced into
  # the source text or a URL, so a password containing quotes, '@' or '/'
  # cannot break the snippet. The snippet is single-quoted: the shell expands
  # nothing inside it, and its indentation is significant to Python.
  if kubectl exec -n "$NAMESPACE" deployment/scheduler -c scheduler -- python -c '
import sys
import redis

password, host, port, db = sys.argv[1:5]
r = redis.Redis(host=host, port=int(port), db=int(db), password=password or None)
keys = r.keys("stonks:*")
if keys:
    r.delete(*keys)
    print(f" Deleted {len(keys)} keys")
else:
    print(" No keys to delete")
' "$REDIS_PASSWORD" "$REDIS_HOST" "$REDIS_PORT" "$REDIS_DB" 2>/dev/null; then
    echo " ✓ Redis flushed"
    return 0
  fi

  # Fallback: flush the entire Redis DB if no scheduler pod
  echo " Falling back to FLUSHDB..."
  if kubectl exec -n redis-service redis-master-0 -- \
    redis-cli -a "$REDIS_PASSWORD" -n "$REDIS_DB" FLUSHDB 2>/dev/null; then
    echo " ✓ Redis flushed"
  else
    echo " ⚠ Redis flush failed (scheduler exec and redis-cli fallback both failed)"
  fi
}
# ---------------------------------------------------------------------------
# Run the reset steps for the selected component, then print a summary.
# `all` and `db` share the scale-down / reset_db / scale-up bracket; `all`
# additionally empties the buckets and flushes Redis in between. The other
# components map directly onto their reset_<component> function.
# ---------------------------------------------------------------------------
case "$COMPONENT" in
  all|db)
    scale_down
    reset_db
    if [[ "$COMPONENT" == "all" ]]; then
      reset_s3
      reset_redis
    fi
    scale_up
    ;;
  s3|redis|computed)
    "reset_${COMPONENT}"
    ;;
  *)
    echo "ERROR: Unknown component '$COMPONENT'. Use: all, db, s3, redis, computed"
    exit 1
    ;;
esac

# Closing summary and operator hints.
printf '%s\n' \
  "" \
  "============================================" \
  " Factory reset complete: $STAGE / $COMPONENT" \
  "============================================" \
  "" \
  "Next steps:" \
  " - ArgoCD will auto-restore pod replicas" \
  " - Migrations and seed run automatically on scheduler init" \
  " - Ingestion will begin on the next scheduler cycle (~15s)" \
  " - First aggregation will run within ~15 minutes"