phase 14-15: docker build validation and helm deployment

2026-04-11 11:59:45 -07:00
parent 7394d241c9
commit ce10afa034
179 changed files with 32559 additions and 576 deletions
@@ -0,0 +1,241 @@
+"""Resilient adapter wrapper with rate-limit coordination, retries, and backoff.
+
+Wraps any BaseAdapter with:
+- Per-source-type rate limiting via Redis (distributed across workers)
+- Exponential backoff with jitter on retryable failures
+- Configurable retry counts and retryable HTTP status codes
+- Graceful degradation when Redis is unavailable
+
+Requirements: 2.5, 3.4
+"""
+import asyncio
+import logging
+import random
+import time
+from dataclasses import dataclass
+from typing import Any
+
+import redis.asyncio as aioredis
+
+from services.shared.redis_keys import rate_limit_key
+
+from .base import AdapterResult, BaseAdapter
+
+# Module-level logger. The name is the fixed string "resilient_adapter"
+# (not __name__), so logging configuration must target that exact name.
+logger = logging.getLogger("resilient_adapter")
+
+# HTTP status codes that are safe to retry: 429 plus the 500/502/503/504
+# server-side codes. Used as the default for
+# RetryConfig.retryable_status_codes; a frozenset so the shared default
+# cannot be mutated.
+RETRYABLE_STATUS_CODES: frozenset[int] = frozenset({429, 500, 502, 503, 504})
+
+
+@dataclass
+class RetryConfig:
+    """Configuration for retry and rate-limit behavior.
+
+    One instance describes how ResilientAdapter retries a failing fetch and
+    how many requests it lets through per rate-limit window.
+    """
+
+    # Retries allowed after the first attempt; fetch() makes at most
+    # max_retries + 1 calls to the wrapped adapter.
+    max_retries: int = 3
+    # Backoff delay (seconds) before the first retry; doubled on every
+    # further attempt by compute_delay().
+    base_delay: float = 1.0
+    # Cap (seconds) on the exponential delay before jitter is added; also
+    # caps a Retry-After hint honoured for HTTP 429 responses.
+    max_delay: float = 60.0
+    # Upper bound of the random extra delay, as a fraction of the capped
+    # delay (0.5 means up to +50%).
+    jitter_factor: float = 0.5
+    # HTTP statuses that make a failed result eligible for a retry.
+    retryable_status_codes: frozenset[int] = RETRYABLE_STATUS_CODES
+    # Rate limit: max requests per window per source type
+    rate_limit_max: int = 30
+    # Length of one rate-limit window in seconds; also used to bucket the
+    # Redis counter key.
+    rate_limit_window_seconds: int = 60
+
+
+# Sensible defaults per source type.
+# Keys are the strings returned by an adapter's source_type(). When no
+# explicit RetryConfig is passed, ResilientAdapter looks the adapter up here
+# and falls back to a plain RetryConfig() for source types that are not listed.
+# The RetryConfig instances are shared by every wrapper of the same source
+# type, so treat them as read-only.
+DEFAULT_RETRY_CONFIGS: dict[str, RetryConfig] = {
+    "market_api": RetryConfig(max_retries=3, rate_limit_max=30),
+    "news_api": RetryConfig(max_retries=3, rate_limit_max=20),
+    "filings_api": RetryConfig(max_retries=2, rate_limit_max=10, base_delay=2.0),
+    "web_scrape": RetryConfig(max_retries=2, rate_limit_max=10, base_delay=2.0),
+    "broker": RetryConfig(max_retries=2, rate_limit_max=60, base_delay=0.5),
+}
+
+
+def compute_delay(attempt: int, config: RetryConfig) -> float:
+    """Return how long to back off, in seconds, after a failed attempt.
+
+    The delay starts at ``config.base_delay`` for attempt 0 and doubles with
+    each further attempt, clamped to ``config.max_delay``. A random extra of
+    up to ``config.jitter_factor`` times the clamped delay is then added, so
+    the returned value can exceed ``max_delay``.
+    """
+    backoff = min(config.base_delay * (2 ** attempt), config.max_delay)
+    # Jitter is additive and applied after the cap.
+    return backoff + backoff * config.jitter_factor * random.random()
+
+
+
+@dataclass
+class RetryStats:
+    """Tracks retry statistics for observability.
+
+    A fresh instance is created for each ResilientAdapter.fetch() call and
+    updated as attempts are made.
+    """
+
+    # Number of attempts started so far (1-based).
+    attempts: int = 0
+    # Seconds spent sleeping, covering both rate-limit waits and backoff.
+    total_delay: float = 0.0
+    # How many times an attempt had to wait for the rate limiter first.
+    rate_limited_waits: int = 0
+    # Error text of the most recent failed result, if any.
+    last_error: str | None = None
+    # Whether the most recent failure was classified as retryable.
+    retryable: bool = False
+
+
+class ResilientAdapter:
+    """Wraps a BaseAdapter with rate-limit coordination, retries, and backoff.
+
+    Usage:
+        adapter = PolygonMarketAdapter(api_key="...")
+        resilient = ResilientAdapter(adapter, redis=rds)
+        result = await resilient.fetch(ticker, config)
+
+    If redis is None, rate limiting is skipped (local dev / testing).
+    """
+
+    def __init__(
+        self,
+        adapter: BaseAdapter,
+        redis: aioredis.Redis | None = None,
+        retry_config: RetryConfig | None = None,
+    ) -> None:
+        """Bind the wrapper to an adapter and resolve its retry settings.
+
+        Args:
+            adapter: The adapter whose ``fetch`` calls are wrapped.
+            redis: Async Redis client holding the shared rate-limit counters;
+                ``None`` disables rate limiting.
+            retry_config: Explicit settings. When omitted, the entry of
+                ``DEFAULT_RETRY_CONFIGS`` for the adapter's source type is
+                used, falling back to a default ``RetryConfig()``.
+        """
+        self._adapter = adapter
+        self._redis = redis
+        source_type = adapter.source_type()
+        self._config = retry_config or DEFAULT_RETRY_CONFIGS.get(
+            source_type, RetryConfig()
+        )
+
+    @property
+    def adapter(self) -> BaseAdapter:
+        """Access the underlying adapter."""
+        return self._adapter
+
+    @property
+    def config(self) -> RetryConfig:
+        """Return the retry and rate-limit settings in effect."""
+        return self._config
+
+    def source_type(self) -> str:
+        """Return the wrapped adapter's source type."""
+        return self._adapter.source_type()
+
+    async def _check_rate_limit(self) -> float:
+        """Check distributed rate limit via Redis.
+
+        Increments a counter keyed by source type and current time window,
+        then compares it with ``rate_limit_max``.
+
+        Returns:
+            0.0 if the request may proceed (also when Redis is not
+            configured or the check fails), otherwise the number of seconds
+            until the counted window ends, never less than 0.5.
+        """
+        if self._redis is None:
+            return 0.0
+
+        source_type = self._adapter.source_type()
+        window_sec = self._config.rate_limit_window_seconds
+        # Use a time-bucketed key so counters auto-expire
+        bucket = int(time.time()) // window_sec
+        key = rate_limit_key(source_type, str(bucket))
+
+        try:
+            count = await self._redis.incr(key)
+            if count == 1:
+                # First hit in this window: give the key a TTL of two windows
+                await self._redis.expire(key, window_sec * 2)
+        except Exception:
+            # Redis unavailable — degrade gracefully, allow the request
+            logger.warning(
+                "Redis rate-limit check failed, allowing request",
+                exc_info=True,
+            )
+            return 0.0
+
+        if count <= self._config.rate_limit_max:
+            return 0.0
+
+        # Over limit — wait for the end of the bucket that was counted.
+        # Deriving the deadline from `bucket` (instead of re-reading the clock
+        # modulo the window) avoids a near-full-window wait when the window
+        # rolled over while the Redis call was in flight.
+        window_end = (bucket + 1) * window_sec
+        return max(window_end - time.time(), 0.5)
+
+    def _is_retryable(self, result: AdapterResult) -> bool:
+        """Determine if a failed result is worth retrying.
+
+        A result is retryable when it is not ok and either its HTTP status is
+        in ``retryable_status_codes`` or its error text mentions a timeout or
+        a connection problem (case-insensitive substring match).
+        """
+        if result.ok:
+            return False
+        # Retry on known retryable HTTP status codes
+        if result.http_status and result.http_status in self._config.retryable_status_codes:
+            return True
+        if not result.error:
+            return False
+        # Retry on timeouts and connection errors
+        error_text = result.error.lower()
+        return any(
+            kw in error_text
+            for kw in ("timeout", "connection", "connect", "reset", "refused")
+        )
+
+    def _extract_retry_after(self, result: AdapterResult) -> float | None:
+        """Extract Retry-After hint from result metadata if present.
+
+        Returns:
+            The ``retry_after`` metadata value as a number of seconds, or
+            ``None`` when it is missing, not convertible to a float, NaN,
+            negative, or infinite.
+        """
+        retry_after = result.metadata.get("retry_after")
+        if retry_after is None:
+            return None
+        try:
+            seconds = float(retry_after)
+        except (ValueError, TypeError, OverflowError):
+            return None
+        # NaN fails both comparisons; negatives and infinity are rejected so
+        # they never reach asyncio.sleep() or the delay statistics.
+        if 0.0 <= seconds < float("inf"):
+            return seconds
+        return None
+
+    @staticmethod
+    def _stats_payload(stats: RetryStats) -> dict[str, Any]:
+        """Build the stats dict shared by success and failure results."""
+        return {
+            "attempts": stats.attempts,
+            "total_delay": round(stats.total_delay, 2),
+            "rate_limited_waits": stats.rate_limited_waits,
+        }
+
+    async def fetch(self, ticker: str, config: dict[str, Any]) -> AdapterResult:
+        """Fetch with rate-limit coordination, retries, and exponential backoff.
+
+        Returns the AdapterResult from the underlying adapter. On retryable
+        failures, retries up to max_retries times with exponential backoff
+        and jitter. Rate-limit waits are applied before each attempt. At
+        least one attempt is always made, even if max_retries is negative.
+
+        The returned result's metadata includes retry stats under the
+        "retry_stats" key. For a failed result the stats additionally carry
+        "exhausted" and "last_error"; "exhausted" is set for non-retryable
+        failures as well.
+        """
+        stats = RetryStats()
+        # Clamp so a misconfigured negative value still yields one attempt.
+        max_retries = max(self._config.max_retries, 0)
+        attempt = 0
+
+        while True:
+            stats.attempts = attempt + 1
+
+            # Rate limit check
+            wait = await self._check_rate_limit()
+            if wait > 0:
+                stats.rate_limited_waits += 1
+                logger.info(
+                    "Rate limited for %s/%s, waiting %.1fs",
+                    self.source_type(), ticker, wait,
+                )
+                stats.total_delay += wait
+                await asyncio.sleep(wait)
+                # NOTE(review): the request is sent after this single wait
+                # without re-checking the counter, so callers that waited on
+                # the same window are not counted against the next one.
+
+            # Execute the fetch
+            result = await self._adapter.fetch(ticker, config)
+
+            # Success — attach stats and return
+            if result.ok:
+                result.metadata["retry_stats"] = self._stats_payload(stats)
+                return result
+
+            stats.last_error = result.error
+            stats.retryable = self._is_retryable(result)
+
+            # Give up on non-retryable errors or once retries are used up;
+            # no sleep happens after the last attempt.
+            if not stats.retryable or attempt >= max_retries:
+                result.metadata["retry_stats"] = {
+                    **self._stats_payload(stats),
+                    "exhausted": True,
+                    "last_error": stats.last_error,
+                }
+                return result
+
+            # Respect Retry-After header for 429s
+            retry_after = self._extract_retry_after(result)
+            if result.http_status == 429 and retry_after is not None:
+                delay = min(retry_after, self._config.max_delay)
+            else:
+                delay = compute_delay(attempt, self._config)
+
+            logger.info(
+                "Retrying %s/%s (attempt %d/%d) after %.1fs: %s",
+                self.source_type(), ticker, attempt + 1,
+                max_retries + 1, delay, result.error,
+            )
+            stats.total_delay += delay
+            await asyncio.sleep(delay)
+            attempt += 1