phase 14-15: docker build validation and helm deployment

2026-04-11 11:59:45 -07:00
parent 7394d241c9
commit ce10afa034
179 changed files with 32559 additions and 576 deletions
@@ -1,14 +1,23 @@
-"""Scheduler - triggers ingestion cycles for tracked symbols and sources."""
+"""Scheduler - triggers ingestion cycles for tracked symbols and sources.
+
+Polls the symbol registry for active companies and their configured sources,
+respects per-source polling cadences and backoff windows, coordinates rate
+limits across source types, and enqueues ingestion jobs for downstream workers.
+
+Requirements: 2.1, 2.2, 2.3, 2.4, 2.5
+"""
 import asyncio
 import json
 import logging
-from datetime import datetime, timedelta
+from datetime import datetime
+from typing import Any, Optional

 import asyncpg
 import redis.asyncio as aioredis

 from services.shared.config import load_config
 from services.shared.db import get_pg_pool, get_redis
+from services.shared.logging import setup_logging
 from services.shared.redis_keys import (
    QUEUE_INGESTION,
    lock_key,
@@ -16,11 +25,11 @@ from services.shared.redis_keys import (
    rate_limit_key,
 )

-logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger("scheduler")

-# Polling cadences by source class (seconds)
-CADENCES = {
+# Default polling cadences by source class (seconds).
+# Individual sources can override via config.polling_interval_seconds.
+DEFAULT_CADENCES: dict[str, int] = {
    "market_api": 60,
    "news_api": 300,
    "filings_api": 3600,
@@ -28,81 +37,267 @@ CADENCES = {
    "broker": 30,
 }

+# Default rate limits per source type (requests per minute).
+# check_rate_limit() falls back to 30 for source types not listed here.
+DEFAULT_RATE_LIMITS: dict[str, int] = {
+    "market_api": 30,
+    "news_api": 20,
+    "filings_api": 10,
+    "web_scrape": 10,
+    "broker": 60,
+}
+
+# Retry backoff for failed sources.
+# compute_backoff() doubles DEFAULT_BACKOFF_BASE (seconds) per retry and caps
+# the result at MAX_BACKOFF (seconds); is_source_due() stops retrying a source
+# once its retry_count reaches MAX_RETRY_COUNT.
+DEFAULT_BACKOFF_BASE: int = 60
+MAX_BACKOFF: int = 3600
+MAX_RETRY_COUNT: int = 10
+
+# Seconds main() sleeps between scheduling attempts.
+SCHEDULER_TICK: int = 15
+
+
+def get_cadence_for_source(source_type: str, config: Optional[dict[str, Any]]) -> int:
+    """Return the polling interval, in seconds, for a source.
+
+    A usable ``config["polling_interval_seconds"]`` takes precedence and is
+    clamped to a minimum of 10 seconds. Otherwise the default cadence for
+    ``source_type`` is used, or 600 seconds for unknown source types.
+
+    Args:
+        source_type: Source class used to look up the default cadence.
+        config: Per-source configuration, or ``None``.
+
+    Returns:
+        Polling interval in seconds.
+    """
+    if config and "polling_interval_seconds" in config:
+        try:
+            return max(10, int(config["polling_interval_seconds"]))
+        except (ValueError, TypeError, OverflowError):
+            # Unusable override (None, non-numeric text, NaN, or infinity,
+            # the last of which raises OverflowError): fall through to the
+            # default so one bad config value does not raise out of the
+            # scheduling pass.
+            pass
+    return DEFAULT_CADENCES.get(source_type, 600)
+
+
+def compute_backoff(retry_count: int) -> int:
+    """Return the retry delay in seconds for the given retry count.
+
+    The delay is ``DEFAULT_BACKOFF_BASE * 2 ** retry_count``, with the exponent
+    limited to 8 and the result capped at ``MAX_BACKOFF``. Negative counts are
+    treated as 0 so the result is always a whole number of seconds.
+    """
+    # A negative exponent would turn ``2 ** n`` into a float and yield a
+    # fractional delay below the base, contradicting the ``int`` annotation.
+    exponent = max(0, min(retry_count, 8))
+    return min(DEFAULT_BACKOFF_BASE * (2 ** exponent), MAX_BACKOFF)
+
+
+def is_source_due(
+    source_type: str,
+    source_config: Optional[dict[str, Any]],
+    last_completed_at: Optional[datetime],
+    last_status: Optional[str],
+    retry_count: int,
+    next_retry_at: Optional[datetime],
+    now: datetime,
+) -> bool:
+    """Determine whether a source is due for its next polling cycle.
+
+    Decision order:
+    - No previous run (no timestamp and no status): due.
+    - Last run ``"failed"``: not due once ``retry_count`` reaches
+      ``MAX_RETRY_COUNT`` or while ``next_retry_at`` is still in the future;
+      otherwise due.
+    - Last run ``"running"``: not due, to avoid double-scheduling.
+    - Any other status: due when there is no timestamp, or when at least one
+      polling cadence has elapsed since ``last_completed_at``.
+
+    Timezone-aware datetimes are converted to UTC before comparison; naive
+    datetimes are assumed to already be in UTC.
+
+    Args:
+        source_type: Source class, used to resolve the polling cadence.
+        source_config: Per-source configuration, or ``None``.
+        last_completed_at: Timestamp of the most recent run, or ``None``.
+        last_status: Status of the most recent run, or ``None``.
+        retry_count: Number of retries recorded for the most recent run.
+        next_retry_at: Earliest time a failed run may be retried, or ``None``.
+        now: Current time.
+
+    Returns:
+        True if an ingestion job should be scheduled for the source.
+    """
+    # Local import: the module-level import brings in only ``datetime``.
+    from datetime import timezone
+
+    def _naive_utc(value: datetime) -> datetime:
+        """Return *value* as a naive datetime expressed in UTC."""
+        if value.utcoffset() is None:
+            return value.replace(tzinfo=None)
+        # Convert rather than just dropping tzinfo, which would silently keep
+        # the wall-clock time of a non-UTC offset.
+        return value.astimezone(timezone.utc).replace(tzinfo=None)
+
+    now = _naive_utc(now)
+
+    # Never run before — always due
+    if last_completed_at is None and last_status is None:
+        return True
+
+    # If last run failed, respect backoff
+    if last_status == "failed":
+        if retry_count >= MAX_RETRY_COUNT:
+            # Retries exhausted: the source stays skipped until its run
+            # state is changed.
+            return False
+        if next_retry_at and now < _naive_utc(next_retry_at):
+            return False
+        # Backoff elapsed or no next_retry_at set — allow retry
+        return True
+
+    # If currently running, don't double-schedule
+    if last_status == "running":
+        return False
+
+    # Normal cadence check
+    if last_completed_at is None:
+        return True
+
+    cadence = get_cadence_for_source(source_type, source_config)
+    elapsed = (now - _naive_utc(last_completed_at)).total_seconds()
+    return elapsed >= cadence
+
+
+def build_job_payload(
+    source: Any,
+    aliases: list[str],
+    now: datetime,
+) -> dict[str, Any]:
+    """Build the ingestion job payload for a source.
+
+    Args:
+        source: Mapping-like row providing ``source_id``, ``company_id``,
+            ``ticker``, ``legal_name``, ``source_type``, ``source_name``,
+            ``config`` and ``credibility_score``.
+        aliases: Company aliases to include in the payload.
+        now: Scheduling timestamp, emitted via ``isoformat()``.
+
+    Returns:
+        The job payload. ``config`` becomes ``{}`` when the stored value is
+        empty or missing, and ``credibility_score`` falls back to 0.5 only
+        when the stored value is ``None``.
+    """
+    raw_score = source["credibility_score"]
+    # Compare against None rather than truthiness: a stored score of 0 is a
+    # legitimate value and must not be replaced by the 0.5 default.
+    credibility = float(raw_score) if raw_score is not None else 0.5
+    raw_config = source["config"]
+    return {
+        "source_id": str(source["source_id"]),
+        "company_id": str(source["company_id"]),
+        "ticker": source["ticker"],
+        "legal_name": source["legal_name"],
+        "aliases": aliases,
+        "source_type": source["source_type"],
+        "source_name": source["source_name"],
+        "config": dict(raw_config) if raw_config else {},
+        "credibility_score": credibility,
+        "scheduled_at": now.isoformat(),
+    }
+

 async def acquire_lock(rds: aioredis.Redis, name: str, ttl: int = 60) -> bool:
+    """Try to acquire a distributed lock without blocking.
+
+    Sets ``lock_key(name)`` only if it does not already exist (``nx=True``),
+    with an expiry of ``ttl`` seconds so the key does not outlive a holder
+    that never releases it.
+
+    Returns a truthy value if the lock was acquired.
+    """
+    # NOTE(review): redis-py's ``set(..., nx=True)`` appears to return None
+    # rather than False when the key already exists — confirm before relying
+    # on a strict bool.
     return await rds.set(lock_key(name), "1", nx=True, ex=ttl)


-async def release_lock(rds: aioredis.Redis, name: str):
+async def release_lock(rds: aioredis.Redis, name: str) -> None:
+    """Release a distributed lock by deleting ``lock_key(name)``.
+
+    The key is deleted unconditionally: there is no check that the caller
+    still owns the lock, so a release issued after the TTL has expired removes
+    whatever lock currently exists under that name.
+    """
     await rds.delete(lock_key(name))


-async def check_rate_limit(rds: aioredis.Redis, source_type: str, max_per_minute: int = 30) -> bool:
-    key = rate_limit_key(source_type, datetime.utcnow().strftime("%Y%m%d%H%M"))
+async def check_rate_limit(
+    rds: aioredis.Redis,
+    source_type: str,
+    now: datetime,
+    max_per_minute: Optional[int] = None,
+) -> bool:
+    """Check whether the source type is within its rate limit window.
+
+    Requests are counted per source type in one-minute windows keyed by
+    ``now`` formatted as ``%Y%m%d%H%M``. Every call increments the counter,
+    including calls that end up being denied.
+
+    Args:
+        rds: Redis client holding the counters.
+        source_type: Source class being rate-limited.
+        now: Timestamp that selects the current window.
+        max_per_minute: Explicit limit; when ``None`` the default for
+            ``source_type`` is used (30 for unknown types).
+
+    Returns:
+        True if the request is allowed, False if rate-limited.
+    """
+    if max_per_minute is None:
+        limit = DEFAULT_RATE_LIMITS.get(source_type, 30)
+    else:
+        # Honour an explicit 0 ("allow nothing"); an ``or`` fallback would
+        # have treated it as unset and applied the default instead.
+        limit = max_per_minute
+    window = now.strftime("%Y%m%d%H%M")
+    key = rate_limit_key(source_type, window)
     count = await rds.incr(key)
     if count == 1:
+        # First hit in this window: give the counter a 120-second expiry.
         await rds.expire(key, 120)
-    return count <= max_per_minute
+    return count <= limit


-async def schedule_cycle(pool: asyncpg.Pool, rds: aioredis.Redis):
-    """One scheduling pass: find due sources and enqueue ingestion jobs."""
-    sources = await pool.fetch(
-        """SELECT s.id as source_id, s.company_id, s.source_type, s.source_name, s.config,
-                  c.ticker, c.legal_name
-           FROM sources s JOIN companies c ON s.company_id = c.id
+async def fetch_active_sources(pool: asyncpg.Pool) -> list[asyncpg.Record]:
+    """Fetch all active sources joined with their active companies.
+
+    Each row carries ``source_id``, ``company_id``, ``source_type``,
+    ``source_name``, ``config``, ``credibility_score``, ``ticker`` and
+    ``legal_name``. Rows are returned only when both the source and its
+    company are active, ordered by source type and then ticker.
+    """
+    return await pool.fetch(
+        """SELECT s.id AS source_id,
+                  s.company_id,
+                  s.source_type,
+                  s.source_name,
+                  s.config,
+                  s.credibility_score,
+                  c.ticker,
+                  c.legal_name
+           FROM sources s
+           JOIN companies c ON s.company_id = c.id
            WHERE s.active = TRUE AND c.active = TRUE
            ORDER BY s.source_type, c.ticker"""
     )

+
+async def fetch_aliases_for_company(pool: asyncpg.Pool, company_id: str) -> list[str]:
+    """Return every alias recorded for the given company.
+
+    Queries ``company_aliases`` by ``company_id`` and returns the ``alias``
+    column of each matching row, in the order the database returns them.
+    """
+    alias_query = "SELECT alias FROM company_aliases WHERE company_id = $1"
+    records = await pool.fetch(alias_query, company_id)
+    return [record["alias"] for record in records]
+
+
+async def fetch_last_run(
+    pool: asyncpg.Pool, source_id: str
+) -> Optional[asyncpg.Record]:
+    """Return the latest ingestion run for a source, or ``None`` if there is none.
+
+    "Latest" means the row with the greatest ``started_at``. The record
+    exposes ``status``, ``started_at``, ``completed_at``, ``retry_count`` and
+    ``next_retry_at``.
+    """
+    latest_run_sql = """SELECT status, started_at, completed_at, retry_count, next_retry_at
+           FROM ingestion_runs
+           WHERE source_id = $1
+           ORDER BY started_at DESC
+           LIMIT 1"""
+    return await pool.fetchrow(latest_run_sql, source_id)
+
+
+async def schedule_cycle(pool: asyncpg.Pool, rds: aioredis.Redis) -> int:
+    """One scheduling pass: find due sources and enqueue ingestion jobs.
+
+    For every active source this looks up its most recent ingestion run,
+    skips the source if it is not due or if its source type is rate-limited,
+    and otherwise pushes a JSON job onto the ingestion queue. A summary line
+    with the enqueued and skipped counts is logged at the end of the pass.
+
+    Returns the number of jobs enqueued.
+    """
+    # A single timestamp is shared by the due checks, the rate-limit window
+    # and the job payloads for the whole pass.
+    now = datetime.utcnow()
+    sources = await fetch_active_sources(pool)
+
     enqueued = 0
+    skipped_rate_limit = 0
+    skipped_not_due = 0
+
     for src in sources:
+        source_id = src["source_id"]
         source_type = src["source_type"]
-        cadence = CADENCES.get(source_type, 600)
+        source_config = dict(src["config"]) if src["config"] else None

-        # Check last run
-        last_run = await pool.fetchval(
-            "SELECT MAX(started_at) FROM ingestion_runs WHERE source_id = $1 AND status IN ('completed', 'running')",
-            src["source_id"],
-        )
-        if last_run and (datetime.utcnow() - last_run.replace(tzinfo=None)) < timedelta(seconds=cadence):
+        # Check last run status and timing
+        last_run = await fetch_last_run(pool, source_id)
+
+        # Defaults describe a source that has never run.
+        last_completed_at = None
+        last_status = None
+        retry_count = 0
+        next_retry_at = None
+
+        if last_run:
+            last_status = last_run["status"]
+            # Runs without a completion timestamp fall back to their start time.
+            last_completed_at = last_run["completed_at"] or last_run["started_at"]
+            retry_count = last_run["retry_count"] or 0
+            next_retry_at = last_run["next_retry_at"]
+
+        if not is_source_due(
+            source_type=source_type,
+            source_config=source_config,
+            last_completed_at=last_completed_at,
+            last_status=last_status,
+            retry_count=retry_count,
+            next_retry_at=next_retry_at,
+            now=now,
+        ):
+            skipped_not_due += 1
             continue

-        if not await check_rate_limit(rds, source_type):
-            logger.warning(f"Rate limit hit for {source_type}")
+        # Rate limit check (performed only for sources that are due, so
+        # skipped sources do not consume the per-minute budget)
+        if not await check_rate_limit(rds, source_type, now):
+            logger.warning(
+                "Rate limit hit for %s, skipping %s/%s",
+                source_type, src["ticker"], src["source_name"],
+            )
+            skipped_rate_limit += 1
             continue

-        job = {
-            "source_id": str(src["source_id"]),
-            "company_id": str(src["company_id"]),
-            "ticker": src["ticker"],
-            "source_type": source_type,
-            "source_name": src["source_name"],
-            "config": dict(src["config"]) if src["config"] else {},
-            "scheduled_at": datetime.utcnow().isoformat(),
-        }
-        await rds.rpush(queue_key(QUEUE_INGESTION), json.dumps(job))
+        # Fetch company aliases for downstream entity matching
+        # (queried once per enqueued source, not once per company)
+        aliases = await fetch_aliases_for_company(pool, src["company_id"])
+
+        job = build_job_payload(src, aliases, now)
+        await rds.rpush(queue_key(QUEUE_INGESTION), json.dumps(job))  # type: ignore[misc]
         enqueued += 1

-    if enqueued:
-        logger.info(f"Enqueued {enqueued} ingestion jobs")
+        logger.debug(
+            "Enqueued %s job for %s (%s)",
+            source_type, src["ticker"], src["source_name"],
+        )
+
+    logger.info(
+        "Cycle complete: enqueued=%d skipped_not_due=%d skipped_rate_limit=%d total_sources=%d",
+        enqueued, skipped_not_due, skipped_rate_limit, len(sources),
+    )
+    return enqueued


-async def main():
+async def main() -> None:
+    """Run the scheduler loop until the task is cancelled or the process exits.
+
+    Loads configuration, sets up logging, opens the Postgres pool and Redis
+    client, then repeatedly tries to take the ``scheduler_cycle`` lock and run
+    one scheduling pass, sleeping ``SCHEDULER_TICK`` seconds between attempts.
+    Errors raised by a pass are logged and do not stop the loop. The pool and
+    the Redis client are closed when the loop exits.
+    """
     config = load_config()
+    setup_logging("scheduler", level=config.log_level, json_output=config.json_logs)
+
     pool = await get_pg_pool(config)
     rds = get_redis(config)

-    logger.info("Scheduler started")
+    logger.info("Scheduler started (tick=%ds)", SCHEDULER_TICK)
     try:
         while True:
             try:
+                # NOTE(review): the lock expires after 30 s; a pass that runs
+                # longer could overlap with one started by another instance —
+                # confirm the pass duration stays below the TTL.
                 if await acquire_lock(rds, "scheduler_cycle", ttl=30):
-                    await schedule_cycle(pool, rds)
-                    await release_lock(rds, "scheduler_cycle")
-            except Exception as e:
-                logger.error(f"Scheduler cycle error: {e}")
-            await asyncio.sleep(15)
+                    try:
+                        await schedule_cycle(pool, rds)
+                    finally:
+                        # Release even when the pass fails, so the next tick
+                        # does not have to wait for the TTL to expire.
+                        await release_lock(rds, "scheduler_cycle")
+            except Exception:
+                logger.exception("Scheduler cycle error")
+            await asyncio.sleep(SCHEDULER_TICK)
     finally:
         await pool.close()
         await rds.close()