diff --git a/infra/inttest/run_pipeline.sh b/infra/inttest/run_pipeline.sh
index f6c1e53..ffa3c8c 100755
--- a/infra/inttest/run_pipeline.sh
+++ b/infra/inttest/run_pipeline.sh
@@ -88,6 +88,45 @@ stage_fail() {
   log "✗ Stage: $name FAILED after ${STAGE_DURATION[$name]}s"
 }
 
+# Print best-effort diagnostics for a pod that failed or never became ready.
+# Globals:   NAMESPACE (read)
+# Arguments: $1 - pod name; used as-is unless $2 resolves to a pod
+#            $2 - optional label selector; the first matching pod, if any,
+#                 replaces $1
+# Outputs:   describe, logs and status of the pod (when it exists), then
+#            the most recent events in the namespace, via log and kubectl
+# Every kubectl call is guarded (`|| true` or an `if`), so its failure is ignored.
+debug_pod_failure() {
+  local pod_name="$1"
+  local label="${2:-}"
+  local found_pod
+  log "─── DEBUG: pod failure diagnostics ───"
+  if [[ -n "$label" ]]; then
+    # Resolve the label selector to a concrete pod name. Errors are discarded
+    # and an empty result keeps $1 as the fallback name.
+    found_pod=$(kubectl get pods -n "$NAMESPACE" -l "$label" \
+      -o jsonpath='{.items[0].metadata.name}' 2>/dev/null || true)
+    if [[ -n "$found_pod" ]]; then
+      pod_name="$found_pod"
+    fi
+  fi
+  if kubectl get pod "$pod_name" -n "$NAMESPACE" >/dev/null 2>&1; then
+    log "Pod describe ($pod_name):"
+    kubectl describe pod "$pod_name" -n "$NAMESPACE" 2>&1 | tail -40 || true
+    log "Pod logs ($pod_name):"
+    kubectl logs "$pod_name" -n "$NAMESPACE" --tail=60 2>&1 || true
+    log "Pod status:"
+    kubectl get pod "$pod_name" -n "$NAMESPACE" -o wide 2>&1 || true
+  else
+    # The pod may never have been created, or may already be gone: pods
+    # started with `kubectl run --rm` are deleted by kubectl after they exit.
+    log "Pod $pod_name not found in namespace $NAMESPACE; skipping describe/logs/status"
+  fi
+  log "Recent events in namespace:"
+  kubectl get events -n "$NAMESPACE" --sort-by='.lastTimestamp' 2>&1 | tail -20 || true
+  log "─── END DEBUG ───"
+}
+
 # ── Parse CLI args ───────────────────────────────────────────────────────────
 while [[ $# -gt 0 ]]; do
   case $1 in
@@ -268,6 +307,7 @@ envsubst < "$REPO_ROOT/infra/inttest/minio.yaml" | kubectl apply -n "$NAMESPACE"
 log "Waiting for postgres readiness ..."
 if ! kubectl wait --for=condition=ready pod -l app=postgres -n "$NAMESPACE" --timeout=120s; then
   log "FATAL: PostgreSQL did not become ready"
+  debug_pod_failure "postgres" "app=postgres"
   stage_fail "infra_deploy"
   PIPELINE_EXIT_CODE=2
   exit 2
@@ -276,6 +316,7 @@ fi
 log "Waiting for redis readiness ..."
 if ! kubectl wait --for=condition=ready pod -l app=redis -n "$NAMESPACE" --timeout=60s; then
   log "FATAL: Redis did not become ready"
+  debug_pod_failure "redis" "app=redis"
   stage_fail "infra_deploy"
   PIPELINE_EXIT_CODE=2
   exit 2
@@ -284,13 +325,19 @@ fi
 log "Waiting for minio readiness ..."
 if ! kubectl wait --for=condition=ready pod -l app=minio -n "$NAMESPACE" --timeout=60s; then
   log "FATAL: MinIO did not become ready"
+  debug_pod_failure "minio" "app=minio"
   stage_fail "infra_deploy"
   PIPELINE_EXIT_CODE=2
   exit 2
 fi
 
 log "Waiting for minio-bucket-init job ..."
-kubectl wait --for=condition=complete job/minio-bucket-init -n "$NAMESPACE" --timeout=60s || true
+if ! kubectl wait --for=condition=complete job/minio-bucket-init -n "$NAMESPACE" --timeout=120s; then
+  log "WARNING: minio-bucket-init job did not complete within 120s"
+  log "Bucket-init pod logs:"
+  kubectl logs -l app=minio-bucket-init -n "$NAMESPACE" --tail=30 2>&1 || true
+  kubectl describe job/minio-bucket-init -n "$NAMESPACE" 2>&1 | tail -20 || true
+fi
 
 stage_end "infra_deploy" "ok"
 
@@ -307,6 +354,7 @@ if ! kubectl run seed-sandbox \
   --restart=Never \
   --rm \
   --attach \
+  --pod-running-timeout=5m \
   --namespace="$NAMESPACE" \
   --image-pull-policy=Always \
   --overrides='{
@@ -326,6 +374,7 @@ if ! kubectl run seed-sandbox \
   --env="MINIO_SECRET_KEY=minioadmin" \
   --command -- python -m tests.integration.seed_sandbox; then
   log "FATAL: Database seed failed"
+  debug_pod_failure "seed-sandbox" "run=seed-sandbox"
   stage_fail "seed_data"
   PIPELINE_EXIT_CODE=2
   exit 2
@@ -337,6 +386,7 @@ if ! kubectl run seed-minio \
   --restart=Never \
   --rm \
   --attach \
+  --pod-running-timeout=5m \
   --namespace="$NAMESPACE" \
   --image-pull-policy=Always \
   --overrides='{
@@ -351,6 +401,7 @@ if ! kubectl run seed-minio \
   --env="MINIO_SECRET_KEY=minioadmin" \
   --command -- python -m tests.integration.seed_minio; then
   log "FATAL: MinIO seed failed"
+  debug_pod_failure "seed-minio" "run=seed-minio"
   stage_fail "seed_data"
   PIPELINE_EXIT_CODE=2
   exit 2
@@ -371,6 +422,11 @@ envsubst < "$REPO_ROOT/infra/inttest/services.yaml" \
 log "Waiting for all API services to become ready ..."
 if ! kubectl wait --for=condition=ready pod -l tier=api -n "$NAMESPACE" --timeout=120s; then
   log "FATAL: API services did not become ready"
+  log "Pod statuses:"
+  kubectl get pods -n "$NAMESPACE" -l tier=api -o wide 2>&1 || true
+  for pod in $(kubectl get pods -n "$NAMESPACE" -l tier=api --no-headers -o custom-columns=':metadata.name' 2>/dev/null); do
+    debug_pod_failure "$pod"
+  done
   stage_fail "service_deploy"
   PIPELINE_EXIT_CODE=2
   exit 2
@@ -398,6 +454,7 @@ else
   if kubectl wait --for=condition=failed job/inttest-runner -n "$NAMESPACE" --timeout=5s 2>/dev/null; then
     log "Test runner job reported failure"
   fi
+  debug_pod_failure "inttest-runner" "app=inttest-runner"
   stage_fail "integration_tests"
   PIPELINE_EXIT_CODE=1
 fi