"""Scheduler - triggers ingestion cycles for tracked symbols and sources."""

import asyncio
import json
import logging
from datetime import datetime, timedelta, timezone

import asyncpg
import redis.asyncio as aioredis

from services.shared.config import load_config
from services.shared.db import get_pg_pool, get_redis
from services.shared.redis_keys import (
    QUEUE_INGESTION,
    lock_key,
    queue_key,
    rate_limit_key,
)

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("scheduler")

# Polling cadences by source class (seconds)
CADENCES = {
    "market_api": 60,
    "news_api": 300,
    "filings_api": 3600,
    "web_scrape": 1800,
    "broker": 30,
}

# Cadence (seconds) applied to source types that are not listed in CADENCES.
DEFAULT_CADENCE = 600


def _utc_now() -> datetime:
    """Return the current time as a timezone-aware UTC datetime."""
    return datetime.now(timezone.utc)


def _as_utc(moment: datetime) -> datetime:
    """Return *moment* as an aware UTC datetime.

    Naive values are taken to already be in UTC (the scheduler has always
    compared them against UTC "now"); aware values are converted, so a
    non-UTC offset is honoured instead of being discarded.
    """
    if moment.tzinfo is None:
        return moment.replace(tzinfo=timezone.utc)
    return moment.astimezone(timezone.utc)


def _job_config(raw) -> dict:
    """Return the source's ``config`` column as a plain dict.

    Accepts a mapping, JSON text (str/bytes), or an empty/None value.
    NOTE(review): asyncpg hands json/jsonb columns back as ``str`` unless a
    type codec is registered on the connection; whether ``get_pg_pool``
    registers one is not visible here, so both shapes are handled.
    """
    if isinstance(raw, (str, bytes, bytearray)):
        raw = json.loads(raw) if raw else None
    return dict(raw) if raw else {}


async def acquire_lock(rds: aioredis.Redis, name: str, ttl: int = 60) -> bool:
    """Try to take the named lock; return True only if this call created it.

    The lock is a key written with ``NX`` (only if absent) and an expiry of
    *ttl* seconds, so it disappears on its own if the holder never releases.
    """
    # SET ... NX yields None (not False) when the key already exists.
    return bool(await rds.set(lock_key(name), "1", nx=True, ex=ttl))


async def release_lock(rds: aioredis.Redis, name: str) -> None:
    """Delete the named lock key.

    NOTE(review): the delete is unconditional - if the lock expired and was
    re-acquired by another process, this removes that process's lock.
    """
    await rds.delete(lock_key(name))


async def check_rate_limit(rds: aioredis.Redis, source_type: str, max_per_minute: int = 30) -> bool:
    """Count one request for *source_type* in the current UTC minute.

    Returns True while the per-minute counter (including this call) is at or
    below *max_per_minute*. The counter is incremented even when the result
    is False.
    """
    minute_bucket = _utc_now().strftime("%Y%m%d%H%M")
    key = rate_limit_key(source_type, minute_bucket)
    count = await rds.incr(key)
    if count == 1:
        # First hit in this minute bucket: give the counter a TTL so old
        # buckets are cleaned up.
        await rds.expire(key, 120)
    return count <= max_per_minute


async def schedule_cycle(pool: asyncpg.Pool, rds: aioredis.Redis) -> None:
    """One scheduling pass: find due sources and enqueue ingestion jobs.

    For every active source of an active company, the source is skipped when
    its latest 'completed' or 'running' ingestion run started less than one
    cadence ago, or when the rate limit for its source type is exhausted.
    Otherwise a JSON job is pushed onto the ingestion queue.
    """
    sources = await pool.fetch(
        """SELECT s.id as source_id, s.company_id, s.source_type, s.source_name,
                  s.config, c.ticker, c.legal_name
           FROM sources s
           JOIN companies c ON s.company_id = c.id
           WHERE s.active = TRUE AND c.active = TRUE
           ORDER BY s.source_type, c.ticker"""
    )

    enqueued = 0
    for src in sources:
        source_type = src["source_type"]
        cadence = CADENCES.get(source_type, DEFAULT_CADENCE)

        # Check last run
        last_run = await pool.fetchval(
            "SELECT MAX(started_at) FROM ingestion_runs WHERE source_id = $1 AND status IN ('completed', 'running')",
            src["source_id"],
        )
        if last_run is not None:
            elapsed = _utc_now() - _as_utc(last_run)
            if elapsed < timedelta(seconds=cadence):
                continue

        if not await check_rate_limit(rds, source_type):
            logger.warning("Rate limit hit for %s", source_type)
            continue

        job = {
            "source_id": str(src["source_id"]),
            "company_id": str(src["company_id"]),
            "ticker": src["ticker"],
            "source_type": source_type,
            "source_name": src["source_name"],
            "config": _job_config(src["config"]),
            # Kept as a naive UTC ISO string (no "+00:00" suffix) so the
            # payload format seen by queue consumers does not change.
            "scheduled_at": _utc_now().replace(tzinfo=None).isoformat(),
        }
        await rds.rpush(queue_key(QUEUE_INGESTION), json.dumps(job))
        enqueued += 1

    if enqueued:
        logger.info("Enqueued %d ingestion jobs", enqueued)


async def main() -> None:
    """Run the scheduler loop until the task is cancelled or the process exits.

    Every 15 seconds, tries to take the "scheduler_cycle" lock and, if it
    succeeds, runs one scheduling pass. Errors inside a cycle are logged and
    the loop continues.
    """
    config = load_config()
    pool = await get_pg_pool(config)
    rds = get_redis(config)

    logger.info("Scheduler started")

    try:
        while True:
            try:
                if await acquire_lock(rds, "scheduler_cycle", ttl=30):
                    try:
                        await schedule_cycle(pool, rds)
                    finally:
                        # Release even when the cycle fails; otherwise the
                        # lock lingers until its TTL and the next pass is
                        # skipped.
                        await release_lock(rds, "scheduler_cycle")
            except Exception as e:
                # Top-level boundary: keep the loop alive, but keep the
                # traceback in the log.
                logger.exception("Scheduler cycle error: %s", e)

            await asyncio.sleep(15)
    finally:
        # Close Redis even if closing the PG pool raises.
        try:
            await pool.close()
        finally:
            await rds.close()


if __name__ == "__main__":
    asyncio.run(main())