phase 0+1: project scaffold, k8s manifests, CI pipeline, steering, hooks, tests

- Repository structure for all services, infra, lakehouse, dashboards
- K8s manifests targeting stonks-oracle namespace with GHCR images
- Ingress via Traefik with ca-issuer TLS for internal services
- ConfigMap wired to existing cluster services (pg, redis, minio, ollama)
- GitHub Actions workflow for lint, test, multi-service container builds
- Dockerfile with build-arg CMD per service
- Makefile for local build/push/deploy
- Steering rules for TDD workflow, K8s conventions, project context
- Agent hooks for lint-on-save, test-on-save, k8s-validate, phase-commit
- Ruff linter config, all lint issues fixed
- 14 passing tests for schemas, config, redis keys
- PostgreSQL migrations, Trino catalogs, Superset config, MinIO lifecycle
This commit is contained in:
Celes Renata
2026-04-11 03:25:08 -07:00
parent 8cfc4f423b
commit ebea70573b
90 changed files with 3590 additions and 19 deletions
+1
View File
@@ -0,0 +1 @@
# Ingestion Adapters
+29
View File
@@ -0,0 +1,29 @@
"""Base adapter interface for all external API integrations."""
from abc import ABC, abstractmethod
from dataclasses import dataclass
from datetime import datetime
from typing import Any, Dict, List, Optional
@dataclass
class AdapterResult:
    """Outcome of one adapter fetch for a single ticker.

    Carries the parsed items together with the raw response bytes and a
    hash of them; ``error`` is None unless the adapter reports a failure.
    """

    source_type: str  # identifier of the producing adapter, e.g. "news_api"
    ticker: str  # symbol the fetch was made for
    items: List[Dict[str, Any]]  # records extracted from the response
    raw_payload: bytes  # response body as received
    content_hash: str  # digest of raw_payload as computed by the adapter
    fetched_at: datetime  # time the result object was built
    error: Optional[str] = None  # failure description; None on success
class BaseAdapter(ABC):
    """Interface for all ingestion adapters."""

    @abstractmethod
    async def fetch(self, ticker: str, config: Dict[str, Any]) -> AdapterResult:
        """Fetch data for a given ticker using source config.

        Args:
            ticker: Symbol to fetch data for.
            config: Per-source settings; the keys understood are defined by
                each concrete adapter.

        Returns:
            An AdapterResult describing what was fetched.
        """
        ...

    @abstractmethod
    def source_type(self) -> str:
        """Return the source-type identifier string for this adapter."""
        ...
+108
View File
@@ -0,0 +1,108 @@
"""Broker API adapter - paper/live trading, orders, positions, balances."""
import hashlib
import logging
import uuid
from datetime import datetime
from typing import Any, Dict, Optional
import httpx
from .base import AdapterResult, BaseAdapter
logger = logging.getLogger("broker_adapter")
class BrokerAdapter(BaseAdapter):
    """Broker API adapter supporting paper and live modes.

    Reads the position for a ticker (``fetch``), submits orders
    (``submit_order``) and reads the account (``get_account``) over HTTP,
    authenticating with a bearer token built from ``api_key``.
    """

    def __init__(self, api_key: str = "", api_secret: str = "", base_url: str = "", mode: str = "paper"):
        self.api_key = api_key
        self.api_secret = api_secret  # stored; no request in this class sends it
        self.base_url = base_url
        self.mode = mode  # paper | live

    def source_type(self) -> str:
        """Return the source-type identifier for this adapter."""
        return "broker"

    def _headers(self) -> Dict[str, str]:
        """Build the JSON request headers carrying the bearer token."""
        return {
            "Authorization": f"Bearer {self.api_key}",
            "Content-Type": "application/json",
        }

    def _result(
        self,
        ticker: str,
        items: list,
        raw: Optional[bytes],
        error: Optional[str] = None,
    ) -> AdapterResult:
        """Build an AdapterResult; ``raw=None`` means no response was received."""
        return AdapterResult(
            source_type=self.source_type(),
            ticker=ticker,
            items=items,
            raw_payload=raw if raw is not None else b"",
            content_hash=hashlib.sha256(raw).hexdigest() if raw is not None else "",
            fetched_at=datetime.utcnow(),
            error=error,
        )

    async def fetch(self, ticker: str, config: Dict[str, Any]) -> AdapterResult:
        """Fetch positions and recent orders for a ticker.

        Args:
            ticker: Symbol whose position is requested.
            config: Accepted for interface compatibility; not read here.

        Returns:
            An AdapterResult. A 200 response yields its JSON body as the
            single item; a 404 yields no items and no error; any other
            4xx/5xx status, or a transport/parsing exception, yields a
            result with ``error`` set.
        """
        async with httpx.AsyncClient(timeout=30) as client:
            try:
                resp = await client.get(
                    f"{self.base_url}/v2/positions/{ticker}",
                    headers=self._headers(),
                )
                raw = resp.content
                status = resp.status_code
                # Auth failures and server errors must not look like an
                # empty-but-successful fetch to the caller.
                if status >= 400 and status != 404:
                    logger.error("Broker fetch for %s returned HTTP %s", ticker, status)
                    return self._result(ticker, [], raw, error=f"HTTP {status}")
                data = resp.json() if status == 200 else {}
                return self._result(ticker, [data] if data else [], raw)
            except Exception as e:
                logger.error(f"Broker fetch failed for {ticker}: {e}")
                return self._result(ticker, [], None, error=str(e))

    async def submit_order(
        self,
        ticker: str,
        side: str,
        qty: float,
        order_type: str = "market",
        limit_price: Optional[float] = None,
        idempotency_key: Optional[str] = None,
    ) -> Dict[str, Any]:
        """Submit an order to the broker. Returns broker response.

        Args:
            ticker: Symbol to trade.
            side: Order side, passed through to the broker unchanged.
            qty: Quantity; sent as a string.
            order_type: Order type, passed through unchanged.
            limit_price: Price sent only when ``order_type`` is "limit".
            idempotency_key: Value for the Idempotency-Key header; a random
                UUID is generated when omitted.

        Returns:
            The broker's JSON response, or a dict with an "error" key (and
            "status" for HTTP rejections) when the submission fails.
        """
        if self.mode == "live":
            logger.warning("LIVE order submission")

        idem_key = idempotency_key or str(uuid.uuid4())
        payload = {
            "symbol": ticker,
            "qty": str(qty),
            "side": side,
            "type": order_type,
            "time_in_force": "day",
        }
        # Compare against None so an explicit price is never dropped by truthiness.
        if limit_price is not None and order_type == "limit":
            payload["limit_price"] = str(limit_price)

        async with httpx.AsyncClient(timeout=30) as client:
            try:
                resp = await client.post(
                    f"{self.base_url}/v2/orders",
                    headers={**self._headers(), "Idempotency-Key": idem_key},
                    json=payload,
                )
                resp.raise_for_status()
                return resp.json()
            except httpx.HTTPStatusError as e:
                logger.error(f"Order rejected: {e.response.text}")
                return {"error": e.response.text, "status": e.response.status_code}
            except Exception as e:
                logger.error(f"Order submission failed: {e}")
                return {"error": str(e)}

    async def get_account(self) -> Dict[str, Any]:
        """Return the parsed JSON body of the account endpoint.

        No status check or exception handling is applied here: transport
        and JSON-decoding errors propagate to the caller.
        """
        async with httpx.AsyncClient(timeout=30) as client:
            resp = await client.get(f"{self.base_url}/v2/account", headers=self._headers())
            return resp.json()
+58
View File
@@ -0,0 +1,58 @@
"""Filings / Regulatory API adapter - fetches SEC-style submissions."""
import hashlib
import logging
from datetime import datetime
from typing import Any, Dict
import httpx
from .base import AdapterResult, BaseAdapter
logger = logging.getLogger("filings_adapter")
class FilingsAdapter(BaseAdapter):
    """Concrete adapter for SEC EDGAR or similar filings API."""

    def __init__(self, base_url: str = "https://efts.sec.gov", user_agent: str = "StonksOracle/1.0"):
        self.base_url = base_url
        self.user_agent = user_agent

    def source_type(self) -> str:
        """Return the source-type identifier for this adapter."""
        return "filings_api"

    async def fetch(self, ticker: str, config: Dict[str, Any]) -> AdapterResult:
        """Search filings that mention ``ticker``.

        Args:
            ticker: Symbol used as the quoted search phrase.
            config: Optional settings. ``endpoint`` replaces the whole
                default path and query string; ``start_date`` (YYYY-MM-DD,
                default "2026-01-01") sets ``startdt`` in the default
                endpoint.

        Returns:
            An AdapterResult whose items are the ``hits.hits`` list of the
            JSON response; on any exception ``error`` is set and the items
            are empty.
        """
        from urllib.parse import quote

        start_date = config.get("start_date", "2026-01-01")
        # Quote the ticker so unusual symbols cannot break the query string.
        quoted_ticker = quote(str(ticker), safe="")
        default_endpoint = (
            f"/LATEST/search-index?q=%22{quoted_ticker}%22"
            f"&dateRange=custom&startdt={start_date}&forms=8-K,10-Q,10-K"
        )
        endpoint = config.get("endpoint", default_endpoint)
        url = f"{self.base_url}{endpoint}"
        headers = {"User-Agent": self.user_agent}

        async with httpx.AsyncClient(timeout=30) as client:
            try:
                resp = await client.get(url, headers=headers)
                resp.raise_for_status()
                raw = resp.content
                data = resp.json()
                content_hash = hashlib.sha256(raw).hexdigest()

                hits = data.get("hits", {}).get("hits", [])
                return AdapterResult(
                    source_type="filings_api",
                    ticker=ticker,
                    items=hits,
                    raw_payload=raw,
                    content_hash=content_hash,
                    fetched_at=datetime.utcnow(),
                )
            except Exception as e:
                logger.error(f"Filings fetch failed for {ticker}: {e}")
                return AdapterResult(
                    source_type="filings_api",
                    ticker=ticker,
                    items=[],
                    raw_payload=b"",
                    content_hash="",
                    fetched_at=datetime.utcnow(),
                    error=str(e),
                )
+59
View File
@@ -0,0 +1,59 @@
"""Market data API adapter - fetches quotes, bars, and reference data."""
import hashlib
import logging
from datetime import datetime
from typing import Any, Dict
import httpx
from .base import AdapterResult, BaseAdapter
logger = logging.getLogger("market_adapter")
class MarketDataAdapter(BaseAdapter):
    """Concrete adapter for a market data provider (e.g., Alpha Vantage, Polygon, Yahoo)."""

    def __init__(self, api_key: str = "", base_url: str = ""):
        self.api_key = api_key
        self.base_url = base_url

    def source_type(self) -> str:
        """Return the source-type identifier for this adapter."""
        return "market_api"

    async def fetch(self, ticker: str, config: Dict[str, Any]) -> AdapterResult:
        """Fetch market data for ``ticker``.

        Args:
            ticker: Symbol substituted into the endpoint's ``{ticker}``
                placeholder.
            config: Optional settings. ``endpoint`` is a path template
                (default "/v2/aggs/ticker/{ticker}/prev"); ``params`` are
                extra query parameters. The caller's dict is not modified.

        Returns:
            An AdapterResult. For a JSON object the items are its
            ``results`` value (or the object itself when that key is
            absent); for any other JSON value the items are that value,
            wrapped in a list if needed. On any request or parsing
            exception ``error`` is set and the items are empty.
        """
        endpoint = config.get("endpoint", "/v2/aggs/ticker/{ticker}/prev")
        url = f"{self.base_url}{endpoint.format(ticker=ticker)}"
        # Copy before adding the key: writing into config["params"] would leak
        # the API key into the caller's (possibly shared) job configuration.
        params = dict(config.get("params") or {})
        if self.api_key:
            params["apiKey"] = self.api_key

        async with httpx.AsyncClient(timeout=30) as client:
            try:
                resp = await client.get(url, params=params)
                resp.raise_for_status()
                raw = resp.content
                data = resp.json()
                content_hash = hashlib.sha256(raw).hexdigest()

                items = data.get("results", [data]) if isinstance(data, dict) else data
                return AdapterResult(
                    source_type="market_api",
                    ticker=ticker,
                    items=items if isinstance(items, list) else [items],
                    raw_payload=raw,
                    content_hash=content_hash,
                    fetched_at=datetime.utcnow(),
                )
            except Exception as e:
                logger.error(f"Market fetch failed for {ticker}: {e}")
                return AdapterResult(
                    source_type="market_api",
                    ticker=ticker,
                    items=[],
                    raw_payload=b"",
                    content_hash="",
                    fetched_at=datetime.utcnow(),
                    error=str(e),
                )
+61
View File
@@ -0,0 +1,61 @@
"""News API adapter - fetches company-linked headlines and article metadata."""
import hashlib
import logging
from datetime import datetime
from typing import Any, Dict
import httpx
from .base import AdapterResult, BaseAdapter
logger = logging.getLogger("news_adapter")
class NewsApiAdapter(BaseAdapter):
    """Concrete adapter for a news API provider."""

    def __init__(self, api_key: str = "", base_url: str = ""):
        self.api_key = api_key
        self.base_url = base_url

    def source_type(self) -> str:
        """Return the source-type identifier for this adapter."""
        return "news_api"

    async def fetch(self, ticker: str, config: Dict[str, Any]) -> AdapterResult:
        """Fetch recent articles for ``ticker``.

        Args:
            ticker: Symbol used as the default ``q`` search term.
            config: Optional settings. ``endpoint`` is the request path
                (default "/v2/everything"); ``params`` are query parameters
                that take precedence over the defaults ``q``, ``sortBy``
                and ``pageSize``. The caller's dict is not modified.

        Returns:
            An AdapterResult whose items are the ``articles`` list of the
            JSON response; on any exception ``error`` is set and the items
            are empty.
        """
        endpoint = config.get("endpoint", "/v2/everything")
        url = f"{self.base_url}{endpoint}"
        # Work on a copy: the defaults and the API key must not be written
        # back into the caller's (possibly shared) job configuration.
        params = dict(config.get("params") or {})
        params.setdefault("q", ticker)
        params.setdefault("sortBy", "publishedAt")
        params.setdefault("pageSize", 20)
        if self.api_key:
            params["apiKey"] = self.api_key

        async with httpx.AsyncClient(timeout=30) as client:
            try:
                resp = await client.get(url, params=params)
                resp.raise_for_status()
                raw = resp.content
                data = resp.json()
                content_hash = hashlib.sha256(raw).hexdigest()

                articles = data.get("articles", [])
                return AdapterResult(
                    source_type="news_api",
                    ticker=ticker,
                    items=articles,
                    raw_payload=raw,
                    content_hash=content_hash,
                    fetched_at=datetime.utcnow(),
                )
            except Exception as e:
                logger.error(f"News fetch failed for {ticker}: {e}")
                return AdapterResult(
                    source_type="news_api",
                    ticker=ticker,
                    items=[],
                    raw_payload=b"",
                    content_hash="",
                    fetched_at=datetime.utcnow(),
                    error=str(e),
                )
+1
View File
@@ -0,0 +1 @@
# Aggregation Engine - trend summaries and signal aggregation
+1
View File
@@ -0,0 +1 @@
"""Aggregation worker - rolling trend summaries, contradiction detection, evidence ranking."""
+1
View File
@@ -0,0 +1 @@
# Query API - exposes companies, documents, trends, recommendations, and audit trails
+1
View File
@@ -0,0 +1 @@
"""Query API - FastAPI application for analytics, evidence drill-down, and admin controls."""
+1
View File
@@ -0,0 +1 @@
# Ollama Extraction Service
+1
View File
@@ -0,0 +1 @@
"""Extraction worker - sends documents to Ollama for structured intelligence extraction."""
+1
View File
@@ -0,0 +1 @@
# Ingestion Pipeline
+182
View File
@@ -0,0 +1,182 @@
"""Ingestion worker - processes jobs from the ingestion queue."""
import asyncio
import hashlib
import io
import json
import logging
from datetime import datetime
import asyncpg
import redis.asyncio as aioredis
from minio import Minio
from services.adapters.base import AdapterResult
from services.adapters.filings_adapter import FilingsAdapter
from services.adapters.market_adapter import MarketDataAdapter
from services.adapters.news_adapter import NewsApiAdapter
from services.shared.config import load_config
from services.shared.db import get_minio, get_pg_pool, get_redis
from services.shared.redis_keys import (
QUEUE_INGESTION,
QUEUE_PARSING,
dedupe_key,
queue_key,
)
# Configure root logging once at import time; this module runs as a worker script.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("ingestion_worker")

# Maps a job's source_type to the MinIO bucket that receives its raw payload.
# Note that "broker" payloads share the market bucket.
BUCKET_MAP = {
    "market_api": "stonks-raw-market",
    "news_api": "stonks-raw-news",
    "filings_api": "stonks-raw-filings",
    "broker": "stonks-raw-market",
}
def build_storage_path(source_type: str, ticker: str, doc_id: str) -> str:
    """Return the object key for a raw payload, partitioned by today's UTC date.

    The key has the form
    ``<source_type>/<ticker>/<YYYY>/<MM>/<DD>/<doc_id>/raw.json``.
    """
    date_partition = datetime.utcnow().strftime("%Y/%m/%d")
    return f"{source_type}/{ticker}/{date_partition}/{doc_id}/raw.json"
async def store_raw_artifact(minio_client: Minio, bucket: str, path: str, data: bytes):
    """Upload ``data`` to ``bucket``/``path`` as an application/json object.

    The MinIO client call is synchronous, so it is run in the default
    executor to keep the event loop free while the upload is in flight.

    Args:
        minio_client: Client used for the upload.
        bucket: Destination bucket name.
        path: Object key inside the bucket.
        data: Raw bytes to store.
    """
    loop = asyncio.get_running_loop()
    await loop.run_in_executor(
        None,
        lambda: minio_client.put_object(
            bucket, path, io.BytesIO(data), len(data), content_type="application/json"
        ),
    )
def _parse_published(value):
    """Parse an ISO-8601 timestamp from a feed item; return None if unusable."""
    if not value or not isinstance(value, str):
        return None
    try:
        return datetime.fromisoformat(value.replace("Z", "+00:00"))
    except ValueError:
        return None


async def process_job(
    job: dict,
    pool: asyncpg.Pool,
    rds: aioredis.Redis,
    minio_client: Minio,
    adapters: dict,
):
    """Run one ingestion job: fetch, archive, dedupe, and register new documents.

    Steps, in order: record an ``ingestion_runs`` row, call the adapter for
    the job's source type, store the raw payload in MinIO, skip the run if
    the payload hash was already seen, then insert each unseen item into
    ``documents`` and enqueue it for parsing. The run row is finally marked
    'completed' or 'failed'.

    Args:
        job: Decoded queue message; must contain ``source_type``,
            ``ticker``, ``source_id`` and ``company_id``; ``config`` is
            optional.
        pool: PostgreSQL pool used for all statements.
        rds: Redis client used for dedupe markers and the parsing queue.
        minio_client: Client used to archive the raw payload.
        adapters: Mapping of source_type to adapter instance.
    """
    source_type = job["source_type"]
    ticker = job["ticker"]
    source_id = job["source_id"]
    config = job.get("config", {})

    adapter = adapters.get(source_type)
    if not adapter:
        logger.warning(f"No adapter for source_type={source_type}")
        return

    # Record ingestion run
    run_id = await pool.fetchval(
        """INSERT INTO ingestion_runs (source_id, company_id, source_type, status)
        VALUES ($1, $2, $3, 'running') RETURNING id""",
        source_id, job["company_id"], source_type,
    )

    try:
        result: AdapterResult = await adapter.fetch(ticker, config)

        if result.error:
            await pool.execute(
                "UPDATE ingestion_runs SET status='failed', error_message=$2, completed_at=NOW() WHERE id=$1",
                run_id, result.error,
            )
            return

        # Store raw payload (done before the dedupe check, so every fetch is archived)
        bucket = BUCKET_MAP.get(source_type, "stonks-raw-market")
        storage_path = build_storage_path(source_type, ticker, str(run_id))
        await store_raw_artifact(minio_client, bucket, storage_path, result.raw_payload)

        # Dedupe check: SET NX claims the marker atomically, so two workers
        # holding the same payload cannot both pass a separate GET-then-SET.
        if result.content_hash:
            first_seen = await rds.set(dedupe_key(result.content_hash), "1", nx=True, ex=86400)
            if not first_seen:
                logger.info(f"Duplicate content for {ticker}, skipping")
                await pool.execute(
                    "UPDATE ingestion_runs SET status='completed', items_fetched=$2, items_new=0, completed_at=NOW() WHERE id=$1",
                    run_id, len(result.items),
                )
                return

        document_type = "filing" if source_type == "filings_api" else "article"
        new_items = 0
        for item in result.items:
            # Adapters may surface scalars or nulls; only dict records can be
            # turned into documents, and one bad record must not fail the run.
            if not isinstance(item, dict):
                logger.warning(f"Skipping non-object item for {ticker}/{source_type}")
                continue

            item_json = json.dumps(item)
            item_hash = hashlib.sha256(item_json.encode()).hexdigest()

            # Check if document already exists
            exists = await pool.fetchval("SELECT 1 FROM documents WHERE content_hash = $1", item_hash)
            if exists:
                continue

            title = item.get("title", item.get("name", ""))
            url = item.get("url", item.get("link", ""))
            published = item.get("publishedAt", item.get("published_at"))
            source_field = item.get("source", "")
            publisher = source_field.get("name", "") if isinstance(source_field, dict) else str(source_field)

            doc_id = await pool.fetchval(
                """INSERT INTO documents (document_type, source_type, publisher, url, title, published_at, content_hash, raw_storage_ref, status)
                VALUES ($1, $2, $3, $4, $5, $6, $7, $8, 'ingested')
                RETURNING id""",
                document_type,
                source_type,
                publisher,
                url, title,
                # An unparseable timestamp is stored as NULL rather than
                # aborting the run after earlier items were already inserted.
                _parse_published(published),
                item_hash,
                f"s3://{bucket}/{storage_path}",
            )

            # Enqueue for parsing
            await rds.rpush(queue_key(QUEUE_PARSING), json.dumps({
                "document_id": str(doc_id),
                "ticker": ticker,
                "source_type": source_type,
                "url": url,
            }))
            new_items += 1

        await pool.execute(
            "UPDATE ingestion_runs SET status='completed', items_fetched=$2, items_new=$3, completed_at=NOW() WHERE id=$1",
            run_id, len(result.items), new_items,
        )
        logger.info(f"Ingested {ticker}/{source_type}: {len(result.items)} fetched, {new_items} new")

    except Exception as e:
        logger.error(f"Ingestion error for {ticker}: {e}")
        await pool.execute(
            "UPDATE ingestion_runs SET status='failed', error_message=$2, completed_at=NOW() WHERE id=$1",
            run_id, str(e),
        )
async def main():
    """Run the ingestion worker loop until the process is stopped.

    Builds the shared clients and the adapter registry, then pops jobs
    from the ingestion queue, sleeping 2 seconds whenever it is empty. A
    job that cannot be decoded or processed is logged and dropped so one
    bad message cannot stop the worker. The PostgreSQL pool and Redis
    client are closed on exit.
    """
    config = load_config()
    pool = await get_pg_pool(config)
    rds = get_redis(config)
    minio_client = get_minio(config)

    adapters = {
        "market_api": MarketDataAdapter(
            # NOTE(review): the broker API key is reused for the market data
            # provider here - confirm this is intended.
            api_key=config.broker.api_key or "",
            base_url="https://api.polygon.io",
        ),
        "news_api": NewsApiAdapter(
            api_key="",
            base_url="https://newsapi.org",
        ),
        "filings_api": FilingsAdapter(),
    }

    logger.info("Ingestion worker started")
    queue = queue_key(QUEUE_INGESTION)

    try:
        while True:
            raw = await rds.lpop(queue)
            if not raw:
                await asyncio.sleep(2)
                continue
            try:
                job = json.loads(raw)
                await process_job(job, pool, rds, minio_client, adapters)
            except Exception as e:
                # Malformed JSON, missing job keys, or a failed run insert
                # are raised outside process_job's own error handling.
                logger.error(f"Ingestion job failed: {e}")
    finally:
        await pool.close()
        await rds.close()


if __name__ == "__main__":
    asyncio.run(main())
+1
View File
@@ -0,0 +1 @@
# Lake Publisher - transforms operational data into analytical fact datasets
+1
View File
@@ -0,0 +1 @@
"""Lake publisher worker - writes partitioned Parquet facts to MinIO for Trino/Superset."""
+1
View File
@@ -0,0 +1 @@
# Scraper / Parser Service
+209
View File
@@ -0,0 +1,209 @@
"""Parser worker - HTML-to-text, boilerplate reduction, quality scoring."""
import asyncio
import io
import json
import logging
import re
from datetime import datetime
from typing import List, Optional, Tuple
import asyncpg
import httpx
import redis.asyncio as aioredis
from minio import Minio
from services.shared.config import load_config
from services.shared.db import get_minio, get_pg_pool, get_redis
from services.shared.redis_keys import QUEUE_EXTRACTION, QUEUE_PARSING, queue_key
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("parser_worker")

# Simple boilerplate patterns to strip.
# Each phrase pattern removes from the phrase up to the end of its line
# (or to the end of the text when no newline follows).
BOILERPLATE_PATTERNS = [
    re.compile(r"(?i)subscribe to our newsletter.*?(?:\n|$)"),
    re.compile(r"(?i)click here to read more.*?(?:\n|$)"),
    re.compile(r"(?i)advertisement\s*\n"),
    re.compile(r"(?i)copyright ©.*?(?:\n|$)"),
    re.compile(r"(?i)all rights reserved.*?(?:\n|$)"),
    re.compile(r"(?i)terms of (use|service).*?(?:\n|$)"),
    re.compile(r"(?i)privacy policy.*?(?:\n|$)"),
    # Bracketed ad markers only, e.g. "[Ad]" or "[Advertisement]". Matching any
    # bracket that merely contains the letters "ad" would also delete text such
    # as "[NASDAQ: AAPL]" or "[Read more]". Trailing whitespace is left in place
    # so the words on either side of the marker stay separated.
    re.compile(r"\s*\[\s*(?:ads?|advertisements?)\s*\]", re.IGNORECASE),
]
def strip_html_tags(html: str) -> str:
    """Convert HTML to plain text, one line per block-level element.

    Script and style elements are dropped with their content, remaining
    tags are removed, character references are decoded once, and
    whitespace is normalised. Line breaks are kept at block boundaries so
    that line-oriented clean-up (patterns anchored on "\\n" or end of
    text) only removes the line it matched instead of everything up to
    the end of the document.

    Args:
        html: Markup to convert.

    Returns:
        The extracted text; lines are separated by single "\\n" characters
        and carry no leading or trailing whitespace.
    """
    # Imported by name because the ``html`` parameter shadows the module.
    from html import unescape

    text = re.sub(r"<script[^>]*>.*?</script>", "", html, flags=re.DOTALL | re.IGNORECASE)
    text = re.sub(r"<style[^>]*>.*?</style>", "", text, flags=re.DOTALL | re.IGNORECASE)
    # Whitespace in the markup itself (including newlines) is not significant.
    text = re.sub(r"\s+", " ", text)
    # Block boundaries become line breaks; every other tag becomes a space.
    text = re.sub(
        r"(?i)<br\s*/?>|</(?:p|div|li|tr|h[1-6]|section|article|blockquote)\s*>",
        "\n",
        text,
    )
    text = re.sub(r"<[^>]+>", " ", text)
    # Single-pass decoding: handles named and numeric references and does not
    # turn "&amp;lt;" into "<".
    text = unescape(text)
    # Collapse runs of non-newline whitespace (this also covers U+00A0).
    text = re.sub(r"[^\S\n]+", " ", text)
    # Trim spaces around line breaks and drop empty lines.
    text = re.sub(r" ?\n[ \n]*", "\n", text)
    return text.strip()
def reduce_boilerplate(text: str) -> str:
    """Remove every BOILERPLATE_PATTERNS match from ``text`` and trim the result.

    Patterns are applied one after another, in list order, each to the
    output of the previous one.
    """
    cleaned = text
    for boilerplate_re in BOILERPLATE_PATTERNS:
        cleaned = boilerplate_re.sub("", cleaned)
    cleaned = cleaned.strip()
    return cleaned
def score_quality(text: str) -> Tuple[float, str]:
    """Rate parsed text by its whitespace-separated word count.

    Returns:
        A ``(score, confidence_label)`` pair: fewer than 20 words gives
        ``(0.1, "low")``, fewer than 50 ``(0.3, "low")``, fewer than 150
        ``(0.6, "medium")``, anything longer ``(0.85, "high")``.
    """
    # (exclusive upper bound on word count, score, label), checked in order.
    bands = (
        (20, 0.1, "low"),
        (50, 0.3, "low"),
        (150, 0.6, "medium"),
    )
    n_words = len(text.split())
    for upper_bound, score, label in bands:
        if n_words < upper_bound:
            return score, label
    return 0.85, "high"
def detect_company_mentions(text: str, aliases: List[dict]) -> List[dict]:
    """Detect company mentions using ticker, alias, and name matching.

    An alias only counts when it appears as a whole token (not embedded in
    a longer word), so a ticker such as "AI" no longer matches inside
    "said". Aliases whose ``alias_type`` is "ticker" are matched
    case-sensitively, because short symbols ("A", "IT", "ON") are ordinary
    lower-case words; all other aliases are matched case-insensitively.
    Empty or missing aliases are ignored.

    Args:
        text: Document text to scan.
        aliases: Dicts with ``alias`` and ``company_id`` and optionally
            ``ticker`` and ``alias_type``.

    Returns:
        One dict per matching alias with keys ``company_id``, ``ticker``,
        ``mention_type`` and ``confidence`` (always 0.7), in input order.
    """
    mentions = []
    text_upper = text.upper()
    for alias_info in aliases:
        alias = alias_info.get("alias")
        if alias is None:
            continue
        alias = str(alias).strip()
        if not alias:
            # An empty string is a substring of everything; never match it.
            continue

        alias_type = alias_info.get("alias_type", "alias")
        if alias_type == "ticker":
            haystack, needle = text, alias
        else:
            haystack, needle = text_upper, alias.upper()

        # Lookarounds instead of \b so aliases that start or end with
        # punctuation (e.g. "AT&T", "Inc.") are still delimited correctly.
        pattern = r"(?<!\w)" + re.escape(needle) + r"(?!\w)"
        if re.search(pattern, haystack):
            mentions.append({
                "company_id": alias_info["company_id"],
                "ticker": alias_info.get("ticker", ""),
                "mention_type": alias_type,
                "confidence": 0.7,
            })
    return mentions
async def fetch_html(url: str) -> Optional[str]:
    """Download the page at ``url`` and return its text.

    Redirects are followed and the request times out after 30 seconds.

    Returns:
        The response text, or None when ``url`` is empty or when the
        request fails for any reason (the failure is logged as a warning).
    """
    if not url:
        return None

    request_headers = {"User-Agent": "StonksOracle/1.0"}
    async with httpx.AsyncClient(timeout=30, follow_redirects=True) as http:
        try:
            response = await http.get(url, headers=request_headers)
            response.raise_for_status()
            return response.text
        except Exception as e:
            logger.warning(f"Failed to fetch {url}: {e}")
            return None
async def process_job(
    job: dict,
    pool: asyncpg.Pool,
    rds: aioredis.Redis,
    minio_client: Minio,
):
    """Parse one document: scrape, normalise, score, tag companies, hand off.

    Fetches the document's URL, archives the raw HTML, derives normalised
    text and a quality score, records company mentions, updates the
    ``documents`` row, and enqueues the document for extraction unless the
    parse confidence is "low".

    Args:
        job: Decoded queue message with ``document_id`` and ``ticker``;
            ``url`` is optional.
        pool: PostgreSQL pool used for all statements.
        rds: Redis client used for the extraction queue.
        minio_client: Client used to store the raw HTML and normalised text.
    """
    doc_id = job["document_id"]
    ticker = job["ticker"]
    url = job.get("url", "")

    # Fetch HTML if we have a URL
    html = await fetch_html(url) if url else None

    # One timestamp for both artifacts so they land in the same date partition.
    now = datetime.utcnow()
    date_prefix = f"{now.year}/{now.month:02d}/{now.day:02d}"

    if html:
        # Store raw HTML
        html_bytes = html.encode("utf-8")
        html_path = f"scrape/{ticker}/{date_prefix}/{doc_id}/raw.html"
        minio_client.put_object(
            "stonks-raw-news", html_path, io.BytesIO(html_bytes), len(html_bytes),
            content_type="text/html",
        )

        # Parse
        text = strip_html_tags(html)
        text = reduce_boilerplate(text)
    else:
        text = ""

    quality_score, confidence = score_quality(text)

    if text:
        # Store normalized text
        text_bytes = text.encode("utf-8")
        norm_path = f"parsed/{ticker}/{date_prefix}/{doc_id}/normalized.txt"
        minio_client.put_object(
            "stonks-normalized", norm_path, io.BytesIO(text_bytes), len(text_bytes),
            content_type="text/plain",
        )

        # Detect company mentions. The alias list is only loaded when there
        # is text to scan; an empty document cannot mention anything.
        aliases = await pool.fetch(
            """SELECT ca.company_id::text, ca.alias, ca.alias_type, c.ticker
            FROM company_aliases ca JOIN companies c ON ca.company_id = c.id
            UNION ALL
            SELECT c.id::text as company_id, c.ticker as alias, 'ticker' as alias_type, c.ticker
            FROM companies c
            UNION ALL
            SELECT c.id::text as company_id, c.legal_name as alias, 'legal_name' as alias_type, c.ticker
            FROM companies c"""
        )
        mentions = detect_company_mentions(text, [dict(a) for a in aliases])
    else:
        norm_path = None
        mentions = []

    # Update document
    status = "parsed" if confidence != "low" else "low_quality"
    await pool.execute(
        """UPDATE documents SET
        normalized_storage_ref=$2, parse_quality_score=$3, parse_confidence=$4, status=$5, updated_at=NOW()
        WHERE id=$1""",
        doc_id, f"s3://stonks-normalized/{norm_path}" if norm_path else None,
        quality_score, confidence, status,
    )

    # Insert company mentions
    for m in mentions:
        await pool.execute(
            """INSERT INTO document_company_mentions (document_id, company_id, ticker, mention_type, confidence)
            VALUES ($1, $2, $3, $4, $5) ON CONFLICT DO NOTHING""",
            doc_id, m["company_id"], m["ticker"], m["mention_type"], m["confidence"],
        )

    # Only enqueue for extraction if quality is acceptable
    if confidence != "low":
        await rds.rpush(queue_key(QUEUE_EXTRACTION), json.dumps({
            "document_id": doc_id,
            "ticker": ticker,
            "normalized_text": text[:8000],  # Truncate for prompt
        }))
        logger.info(f"Parsed doc {doc_id} for {ticker}: quality={quality_score:.2f}, confidence={confidence}")
    else:
        logger.warning(f"Low quality parse for doc {doc_id}, skipping extraction")
async def main():
    """Run the parser worker loop until the process is stopped.

    Pops jobs from the parsing queue, sleeping 2 seconds whenever it is
    empty. A message that cannot be decoded or processed is logged and
    dropped so it cannot stop the worker. The PostgreSQL pool and Redis
    client are closed on exit.
    """
    config = load_config()
    pool = await get_pg_pool(config)
    rds = get_redis(config)
    minio_client = get_minio(config)

    logger.info("Parser worker started")
    queue = queue_key(QUEUE_PARSING)

    try:
        while True:
            raw = await rds.lpop(queue)
            if not raw:
                await asyncio.sleep(2)
                continue
            try:
                # Decoding happens inside the guard: a malformed message
                # must be skipped like any other failed job.
                job = json.loads(raw)
                await process_job(job, pool, rds, minio_client)
            except Exception as e:
                logger.error(f"Parse error: {e}")
    finally:
        await pool.close()
        await rds.close()


if __name__ == "__main__":
    asyncio.run(main())
+1
View File
@@ -0,0 +1 @@
# Recommendation Engine - trade recommendations from aggregated signals
+1
View File
@@ -0,0 +1 @@
"""Recommendation worker - generates explainable trade recommendations from trend data."""
+1
View File
@@ -0,0 +1 @@
# Risk Engine - portfolio risk controls and trade eligibility
+1
View File
@@ -0,0 +1 @@
"""Risk engine - enforces guardrails, position limits, and trade eligibility checks."""
+1
View File
@@ -0,0 +1 @@
# Scheduler / Orchestrator Service
+112
View File
@@ -0,0 +1,112 @@
"""Scheduler - triggers ingestion cycles for tracked symbols and sources."""
import asyncio
import json
import logging
from datetime import datetime, timedelta
import asyncpg
import redis.asyncio as aioredis
from services.shared.config import load_config
from services.shared.db import get_pg_pool, get_redis
from services.shared.redis_keys import (
QUEUE_INGESTION,
lock_key,
queue_key,
rate_limit_key,
)
# Configure root logging once at import time; this module runs as a worker script.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("scheduler")

# Polling cadences by source class (seconds): the minimum interval between
# two ingestion runs of the same source.
CADENCES = {
    "market_api": 60,
    "news_api": 300,
    "filings_api": 3600,
    "web_scrape": 1800,
    "broker": 30,
}
async def acquire_lock(rds: aioredis.Redis, name: str, ttl: int = 60) -> bool:
    """Try to take the named lock without waiting.

    Args:
        rds: Redis client.
        name: Lock name; the Redis key is derived with ``lock_key``.
        ttl: Seconds after which the lock expires by itself.

    Returns:
        True if the lock was acquired, False if it is already held.
    """
    # SET ... NX yields a truthy reply when the key was created and None when
    # it already existed; coerce so the declared bool return type holds.
    return bool(await rds.set(lock_key(name), "1", nx=True, ex=ttl))
async def release_lock(rds: aioredis.Redis, name: str):
    """Delete the Redis key of the named lock.

    NOTE(review): the key is deleted unconditionally, without checking who
    holds the lock - confirm callers only release locks they acquired.
    """
    key = lock_key(name)
    await rds.delete(key)
async def check_rate_limit(rds: aioredis.Redis, source_type: str, max_per_minute: int = 30) -> bool:
    """Count one request against the current-minute budget of ``source_type``.

    The counter key is derived from the source type and the current UTC
    minute; it is given a 120-second expiry when first created.

    Returns:
        True while the counter, including this call, is within
        ``max_per_minute``; False once the budget is exceeded.
    """
    minute_window = datetime.utcnow().strftime("%Y%m%d%H%M")
    counter_key = rate_limit_key(source_type, minute_window)

    hits = await rds.incr(counter_key)
    is_first_hit = hits == 1
    if is_first_hit:
        await rds.expire(counter_key, 120)

    within_budget = hits <= max_per_minute
    return within_budget
def _decode_source_config(raw_config):
    """Return a source's ``config`` column value as a plain dict."""
    if not raw_config:
        return {}
    # Without a custom type codec, asyncpg hands JSON/JSONB columns back as
    # text; calling dict() on that string would raise ValueError.
    if isinstance(raw_config, (str, bytes, bytearray)):
        raw_config = json.loads(raw_config)
    return dict(raw_config) if isinstance(raw_config, dict) else {}


async def schedule_cycle(pool: asyncpg.Pool, rds: aioredis.Redis):
    """One scheduling pass: find due sources and enqueue ingestion jobs.

    A source is due when it has no 'completed' or 'running' ingestion run
    younger than the cadence of its source type (600 seconds for types not
    listed in CADENCES). Due sources that exceed the per-minute rate limit
    of their type are skipped for this pass.

    NOTE(review): a job that is queued but not yet picked up has no
    ingestion_runs row, so it looks due again on the next pass - confirm
    whether duplicate queue entries are acceptable.

    Args:
        pool: PostgreSQL pool used to read sources and past runs.
        rds: Redis client used for rate limiting and the ingestion queue.
    """
    sources = await pool.fetch(
        """SELECT s.id as source_id, s.company_id, s.source_type, s.source_name, s.config,
        c.ticker, c.legal_name
        FROM sources s JOIN companies c ON s.company_id = c.id
        WHERE s.active = TRUE AND c.active = TRUE
        ORDER BY s.source_type, c.ticker"""
    )

    enqueued = 0
    for src in sources:
        source_type = src["source_type"]
        cadence = CADENCES.get(source_type, 600)

        # Check last run
        last_run = await pool.fetchval(
            "SELECT MAX(started_at) FROM ingestion_runs WHERE source_id = $1 AND status IN ('completed', 'running')",
            src["source_id"],
        )
        # Compare as naive values; the stored timestamp is presumably UTC.
        if last_run and (datetime.utcnow() - last_run.replace(tzinfo=None)) < timedelta(seconds=cadence):
            continue

        if not await check_rate_limit(rds, source_type):
            logger.warning(f"Rate limit hit for {source_type}")
            continue

        job = {
            "source_id": str(src["source_id"]),
            "company_id": str(src["company_id"]),
            "ticker": src["ticker"],
            "source_type": source_type,
            "source_name": src["source_name"],
            "config": _decode_source_config(src["config"]),
            "scheduled_at": datetime.utcnow().isoformat(),
        }
        await rds.rpush(queue_key(QUEUE_INGESTION), json.dumps(job))
        enqueued += 1

    if enqueued:
        logger.info(f"Enqueued {enqueued} ingestion jobs")
async def main():
    """Run scheduling passes every 15 seconds until the process is stopped.

    Each pass runs only if the "scheduler_cycle" lock (30-second expiry)
    can be taken. The lock is released even when the pass fails, so the
    next attempt does not have to wait for the expiry. Errors in a pass
    are logged and the loop continues. The PostgreSQL pool and Redis
    client are closed on exit.
    """
    config = load_config()
    pool = await get_pg_pool(config)
    rds = get_redis(config)

    logger.info("Scheduler started")

    try:
        while True:
            try:
                if await acquire_lock(rds, "scheduler_cycle", ttl=30):
                    try:
                        await schedule_cycle(pool, rds)
                    finally:
                        await release_lock(rds, "scheduler_cycle")
            except Exception as e:
                logger.error(f"Scheduler cycle error: {e}")
            await asyncio.sleep(15)
    finally:
        await pool.close()
        await rds.close()


if __name__ == "__main__":
    asyncio.run(main())
+1
View File
@@ -0,0 +1 @@
# Stonks Oracle - Shared modules
+115
View File
@@ -0,0 +1,115 @@
"""Shared configuration loader for all services."""
import os
from dataclasses import dataclass, field
from typing import Optional
@dataclass
class PostgresConfig:
    """Connection settings for PostgreSQL."""

    host: str = "localhost"
    port: int = 5432
    database: str = "stonks"
    user: str = "stonks"
    password: str = "stonks_dev"

    @property
    def dsn(self) -> str:
        """Return the ``postgresql://`` connection URI for these settings.

        The user and password are percent-encoded so credentials that
        contain URI delimiters such as "@", ":" or "/" still produce a
        valid URI.
        """
        from urllib.parse import quote

        user = quote(self.user, safe="")
        password = quote(self.password, safe="")
        return f"postgresql://{user}:{password}@{self.host}:{self.port}/{self.database}"
@dataclass
class RedisConfig:
    """Connection settings for Redis."""

    host: str = "localhost"
    port: int = 6379
    db: int = 0
    password: Optional[str] = None

    @property
    def url(self) -> str:
        """Return the ``redis://`` URL for these settings.

        The auth part is omitted when no password is set. The password is
        percent-encoded so characters such as "@" or "/" cannot corrupt
        the URL.
        """
        from urllib.parse import quote

        auth = f":{quote(self.password, safe='')}@" if self.password else ""
        return f"redis://{auth}{self.host}:{self.port}/{self.db}"
@dataclass
class MinioConfig:
    """Connection settings for the MinIO object store."""

    endpoint: str = "localhost:9000"  # host:port, without a URL scheme
    access_key: str = "minioadmin"
    secret_key: str = "minioadmin"
    secure: bool = False  # passed to the client as its ``secure`` flag
@dataclass
class OllamaConfig:
    """Settings for the Ollama model server."""

    base_url: str = "http://localhost:11434"
    model: str = "llama3.1:8b"
    timeout: int = 120  # presumably seconds - TODO confirm against the client code
@dataclass
class TrinoConfig:
    """Connection settings for Trino."""

    host: str = "localhost"
    port: int = 8080
    catalog: str = "lakehouse"
    schema: str = "stonks"
@dataclass
class BrokerConfig:
    """Broker API settings; credentials and URL default to None (unset)."""

    mode: str = "paper"  # paper | live
    api_key: Optional[str] = None
    api_secret: Optional[str] = None
    base_url: Optional[str] = None
@dataclass
class AppConfig:
    """Top-level configuration: one section per backing service plus the log level.

    Each section defaults to a freshly constructed instance of its config
    class, so ``AppConfig()`` yields the built-in defaults.
    """

    postgres: PostgresConfig = field(default_factory=PostgresConfig)
    redis: RedisConfig = field(default_factory=RedisConfig)
    minio: MinioConfig = field(default_factory=MinioConfig)
    ollama: OllamaConfig = field(default_factory=OllamaConfig)
    trino: TrinoConfig = field(default_factory=TrinoConfig)
    broker: BrokerConfig = field(default_factory=BrokerConfig)
    log_level: str = "INFO"
def load_config() -> AppConfig:
    """Build an AppConfig from environment variables, falling back to defaults.

    Integer settings are converted with ``int()``, so a non-numeric value
    raises ValueError. ``MINIO_SECURE`` is true only for the string "true"
    (case-insensitive).
    """
    env = os.environ.get

    postgres = PostgresConfig(
        host=env("POSTGRES_HOST", "localhost"),
        port=int(env("POSTGRES_PORT", "5432")),
        database=env("POSTGRES_DB", "stonks"),
        user=env("POSTGRES_USER", "stonks"),
        password=env("POSTGRES_PASSWORD", "stonks_dev"),
    )
    redis_section = RedisConfig(
        host=env("REDIS_HOST", "localhost"),
        port=int(env("REDIS_PORT", "6379")),
        db=int(env("REDIS_DB", "0")),
        password=env("REDIS_PASSWORD"),
    )
    minio_section = MinioConfig(
        endpoint=env("MINIO_ENDPOINT", "localhost:9000"),
        access_key=env("MINIO_ACCESS_KEY", "minioadmin"),
        secret_key=env("MINIO_SECRET_KEY", "minioadmin"),
        secure=env("MINIO_SECURE", "false").lower() == "true",
    )
    ollama = OllamaConfig(
        base_url=env("OLLAMA_BASE_URL", "http://localhost:11434"),
        model=env("OLLAMA_MODEL", "llama3.1:8b"),
        timeout=int(env("OLLAMA_TIMEOUT", "120")),
    )
    trino = TrinoConfig(
        host=env("TRINO_HOST", "localhost"),
        port=int(env("TRINO_PORT", "8080")),
        catalog=env("TRINO_CATALOG", "lakehouse"),
        schema=env("TRINO_SCHEMA", "stonks"),
    )
    broker = BrokerConfig(
        mode=env("BROKER_MODE", "paper"),
        api_key=env("BROKER_API_KEY"),
        api_secret=env("BROKER_API_SECRET"),
        base_url=env("BROKER_BASE_URL"),
    )

    return AppConfig(
        postgres=postgres,
        redis=redis_section,
        minio=minio_section,
        ollama=ollama,
        trino=trino,
        broker=broker,
        log_level=env("LOG_LEVEL", "INFO"),
    )
+33
View File
@@ -0,0 +1,33 @@
"""Database connection helpers."""
import asyncpg
import redis.asyncio as aioredis
from minio import Minio
from .config import AppConfig
async def get_pg_pool(config: AppConfig) -> asyncpg.Pool:
    """Open an asyncpg pool (2 to 10 connections) for the configured database."""
    pool_options = {
        "dsn": config.postgres.dsn,
        "min_size": 2,
        "max_size": 10,
    }
    pool = await asyncpg.create_pool(**pool_options)
    return pool
def get_redis(config: AppConfig) -> aioredis.Redis:
    """Build an async Redis client from the configured URL.

    Responses are decoded, so commands return ``str`` instead of ``bytes``.
    """
    redis_url = config.redis.url
    client = aioredis.from_url(redis_url, decode_responses=True)
    return client
def get_minio(config: AppConfig) -> Minio:
    """Build a MinIO client from the ``minio`` section of the configuration."""
    settings = config.minio
    client = Minio(
        settings.endpoint,
        access_key=settings.access_key,
        secret_key=settings.secret_key,
        secure=settings.secure,
    )
    return client
+56
View File
@@ -0,0 +1,56 @@
"""Redis key conventions and queue abstractions."""
# --- Key prefixes ---
# Every key built by this module starts with PREFIX; each category below
# adds its own segment, giving keys of the form "stonks:<category>:...".
PREFIX = "stonks"

# Distributed locks
LOCK_PREFIX = f"{PREFIX}:lock"

# Rate limit counters
RATE_LIMIT_PREFIX = f"{PREFIX}:ratelimit"

# Job queues
QUEUE_PREFIX = f"{PREFIX}:queue"

# Dedupe markers
DEDUPE_PREFIX = f"{PREFIX}:dedupe"

# Cache
CACHE_PREFIX = f"{PREFIX}:cache"

# Retry backoff state
RETRY_PREFIX = f"{PREFIX}:retry"
def lock_key(resource: str) -> str:
    """Return the Redis key of the distributed lock guarding ``resource``."""
    return "{}:{}".format(LOCK_PREFIX, resource)
def rate_limit_key(source: str, window: str) -> str:
    """Return the Redis key of the rate-limit counter for ``source`` in ``window``."""
    return "{}:{}:{}".format(RATE_LIMIT_PREFIX, source, window)
def queue_key(queue_name: str) -> str:
    """Return the Redis key of the job queue called ``queue_name``."""
    return "{}:{}".format(QUEUE_PREFIX, queue_name)
def dedupe_key(content_hash: str) -> str:
    """Return the Redis key of the dedupe marker for ``content_hash``."""
    return "{}:{}".format(DEDUPE_PREFIX, content_hash)
def cache_key(namespace: str, key: str) -> str:
    """Return the Redis key of the cache entry ``key`` inside ``namespace``."""
    return "{}:{}:{}".format(CACHE_PREFIX, namespace, key)
def retry_key(job_id: str) -> str:
    """Return the Redis key holding retry state for ``job_id``."""
    return "{}:{}".format(RETRY_PREFIX, job_id)
# --- Queue names ---
# Bare names; pass them through queue_key() to obtain the actual Redis key.
QUEUE_INGESTION = "ingestion"
QUEUE_PARSING = "parsing"
QUEUE_EXTRACTION = "extraction"
QUEUE_AGGREGATION = "aggregation"
QUEUE_RECOMMENDATION = "recommendation"
QUEUE_LAKE_PUBLISH = "lake_publish"
QUEUE_TRADE = "trade"
+169
View File
@@ -0,0 +1,169 @@
"""Typed JSON schemas for document intelligence, trend summaries, and recommendations."""
from __future__ import annotations
import uuid
from datetime import datetime
from enum import Enum
from typing import List, Optional
from pydantic import BaseModel, Field
# --- Enums ---
class DocumentType(str, Enum):
    """Kind of source document; members are also plain strings."""

    ARTICLE = "article"
    FILING = "filing"
    TRANSCRIPT = "transcript"
    PRESS_RELEASE = "press_release"
class SourceType(str, Enum):
    """Class of data source a document or payload came from."""

    MARKET_API = "market_api"
    NEWS_API = "news_api"
    FILINGS_API = "filings_api"
    WEB_SCRAPE = "web_scrape"
    BROKER = "broker"
class Sentiment(str, Enum):
    """Sentiment label attached to a company impact."""

    POSITIVE = "positive"
    NEGATIVE = "negative"
    NEUTRAL = "neutral"
    MIXED = "mixed"
class CatalystType(str, Enum):
    """Category of the event driving a company impact."""

    EARNINGS = "earnings"
    PRODUCT = "product"
    LEGAL = "legal"
    MACRO = "macro"
    SUPPLY_CHAIN = "supply_chain"
    M_AND_A = "m_and_a"
    RATING_CHANGE = "rating_change"
    OTHER = "other"
class TrendDirection(str, Enum):
    """Direction label of a trend summary."""

    BULLISH = "bullish"
    BEARISH = "bearish"
    MIXED = "mixed"
    NEUTRAL = "neutral"
class ActionType(str, Enum):
    """Action proposed by a recommendation."""

    BUY = "buy"
    SELL = "sell"
    HOLD = "hold"
    WATCH = "watch"
class RecommendationMode(str, Enum):
    """Eligibility level of a recommendation: informational, paper, or live."""

    INFORMATIONAL = "informational"
    PAPER_ELIGIBLE = "paper_eligible"
    LIVE_ELIGIBLE = "live_eligible"
class TrendWindow(str, Enum):
    """Look-back window of a trend summary."""

    INTRADAY = "intraday"
    ONE_DAY = "1d"
    SEVEN_DAY = "7d"
    THIRTY_DAY = "30d"
    NINETY_DAY = "90d"
# --- Document Intelligence ---
class CompanyImpact(BaseModel):
    """Assessment of how one document affects one company.

    Used as the element type of ``DocumentIntelligence.companies``.
    """

    ticker: str
    company_name: str
    relevance: float = Field(ge=0, le=1)  # validated to lie in [0, 1]
    sentiment: Sentiment
    impact_score: float = Field(ge=0, le=1)  # validated to lie in [0, 1]
    impact_horizon: str  # free-form string; no format is enforced here
    catalyst_type: CatalystType
    key_facts: List[str] = Field(default_factory=list)
    risks: List[str] = Field(default_factory=list)
    evidence_spans: List[str] = Field(default_factory=list)
class ModelMetadata(BaseModel):
    """Provenance of the model run that produced a record."""

    provider: str = "ollama"
    # NOTE(review): Pydantic v2 treats the "model_" prefix as a protected
    # namespace and may warn about this field name - confirm the Pydantic
    # version in use.
    model_name: str = ""
    prompt_version: str = ""
    schema_version: str = "2.0.0"
class DocumentIntelligence(BaseModel):
    """Structured intelligence extracted from a single document.

    Every field has a default; ``document_id`` defaults to a new random
    UUID string and the score fields default to 0.5.
    """

    document_id: str = Field(default_factory=lambda: str(uuid.uuid4()))
    document_type: DocumentType = DocumentType.ARTICLE
    summary: str = ""
    companies: List[CompanyImpact] = Field(default_factory=list)
    macro_themes: List[str] = Field(default_factory=list)
    novelty_score: float = Field(ge=0, le=1, default=0.5)  # validated to lie in [0, 1]
    source_credibility: float = Field(ge=0, le=1, default=0.5)  # validated to lie in [0, 1]
    extraction_warnings: List[str] = Field(default_factory=list)
    confidence: float = Field(ge=0, le=1, default=0.5)  # validated to lie in [0, 1]
    model: ModelMetadata = Field(default_factory=ModelMetadata)
# --- Trend Summary ---
class TrendSummary(BaseModel):
    """Trend assessment for one entity over a ``TrendWindow``.

    Every field has a default. The numeric scores are validated to
    ``[0, 1]``.
    """

    # Which entity and window the summary covers.
    entity_type: str = "company"
    entity_id: str = ""
    window: TrendWindow = TrendWindow.SEVEN_DAY

    # Direction and scores.
    trend_direction: TrendDirection = TrendDirection.NEUTRAL
    trend_strength: float = Field(default=0.5, ge=0, le=1)
    confidence: float = Field(default=0.5, ge=0, le=1)

    # Evidence and context lists, each a fresh empty list by default.
    top_supporting_evidence: List[str] = Field(default_factory=list)
    top_opposing_evidence: List[str] = Field(default_factory=list)
    dominant_catalysts: List[str] = Field(default_factory=list)
    material_risks: List[str] = Field(default_factory=list)
    contradiction_score: float = Field(default=0.0, ge=0, le=1)

    # NOTE(review): datetime.utcnow() returns a naive timestamp and is
    # deprecated since Python 3.12 -- consider a timezone-aware default.
    generated_at: datetime = Field(default_factory=datetime.utcnow)
# --- Recommendation ---
class PositionSizing(BaseModel):
    """Sizing limits attached to a ``Recommendation``.

    Both values are validated to ``[0, 1]``.
    NOTE(review): presumably fractions of portfolio value (0.02 = 2%) --
    confirm with the consumers.
    """

    portfolio_pct: float = Field(default=0.02, ge=0, le=1)
    max_loss_pct: float = Field(default=0.005, ge=0, le=1)
class Recommendation(BaseModel):
    """Recommendation for a ticker, with its thesis, sizing and provenance.

    Every field has a default. With no arguments the result is an
    informational ``WATCH`` recommendation with confidence ``0.5``.
    """

    # A random UUID4 string is generated when no id is supplied.
    recommendation_id: str = Field(default_factory=lambda: str(uuid.uuid4()))
    ticker: str = ""
    action: ActionType = ActionType.WATCH
    mode: RecommendationMode = RecommendationMode.INFORMATIONAL
    confidence: float = Field(default=0.5, ge=0, le=1)
    time_horizon: str = ""
    thesis: str = ""
    invalidation_conditions: List[str] = Field(default_factory=list)
    position_sizing: PositionSizing = Field(default_factory=PositionSizing)
    evidence_refs: List[str] = Field(default_factory=list)
    model_metadata: ModelMetadata = Field(default_factory=ModelMetadata)
    # NOTE(review): datetime.utcnow() returns a naive timestamp and is
    # deprecated since Python 3.12 -- consider a timezone-aware default.
    generated_at: datetime = Field(default_factory=datetime.utcnow)
# --- Document Metadata ---
class StorageRefs(BaseModel):
    """Optional references to the stored forms of a document.

    Each reference defaults to ``None``.
    NOTE(review): presumably object-store keys or paths -- confirm with
    the writer of these values.
    """

    raw_html: Optional[str] = None
    raw_payload: Optional[str] = None
    normalized_text: Optional[str] = None
class DocumentMetadata(BaseModel):
    """Descriptive metadata for a retrieved document.

    Every field has a default. With no arguments the result is an English
    ``ARTICLE`` from a ``NEWS_API`` source with a freshly generated id.
    """

    # Identity and classification.
    document_id: str = Field(default_factory=lambda: str(uuid.uuid4()))
    document_type: DocumentType = DocumentType.ARTICLE
    symbol_candidates: List[str] = Field(default_factory=list)
    source_type: SourceType = SourceType.NEWS_API

    # Publication details.
    publisher: str = ""
    url: Optional[str] = None
    canonical_url: Optional[str] = None
    title: str = ""
    published_at: Optional[datetime] = None
    # NOTE(review): datetime.utcnow() returns a naive timestamp and is
    # deprecated since Python 3.12 -- consider a timezone-aware default.
    retrieved_at: datetime = Field(default_factory=datetime.utcnow)
    language: str = "en"

    # Content fingerprint and storage locations.
    content_hash: str = ""
    storage_refs: StorageRefs = Field(default_factory=StorageRefs)
+1
View File
@@ -0,0 +1 @@
# Symbol Registry Service
+209
View File
@@ -0,0 +1,209 @@
"""Symbol Registry API - FastAPI application."""
from contextlib import asynccontextmanager
from typing import List, Optional
import asyncpg
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
from services.shared.config import load_config
from services.shared.db import get_pg_pool
# Service configuration, loaded once when this module is imported.
config = load_config()
# Shared asyncpg pool; stays None until lifespan() assigns it at startup.
pool: Optional[asyncpg.Pool] = None
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Open the shared asyncpg pool on startup and close it on shutdown.

    Args:
        app: The FastAPI application being started. Unused here, but it is
            the argument the lifespan callable is invoked with.

    Yields:
        None, once the module-level ``pool`` has been assigned.
    """
    global pool
    pool = await get_pg_pool(config)
    try:
        yield
    finally:
        # Close the pool even when an exception is thrown into the generator
        # at the yield point, so connections are not leaked.
        await pool.close()
# ASGI application; lifespan() opens the database pool on startup and closes it on shutdown.
app = FastAPI(title="Stonks Oracle - Symbol Registry", lifespan=lifespan)
# --- Request/Response Models ---
class CompanyCreate(BaseModel):
    """Request body for creating or fully replacing a company.

    Only ``ticker`` and ``legal_name`` are required. The remaining fields
    default to ``None``.
    """

    # Required.
    ticker: str
    legal_name: str

    # Optional classification.
    exchange: Optional[str] = None
    sector: Optional[str] = None
    industry: Optional[str] = None
    market_cap_bucket: Optional[str] = None
class CompanyResponse(BaseModel):
    """Response shape for company rows returned by the company endpoints.

    The fields match the columns selected or returned by those queries.
    """

    # NOTE(review): if companies.id is a UUID column, asyncpg returns
    # uuid.UUID objects -- confirm response validation accepts them for a
    # ``str`` field.
    id: str
    ticker: str
    legal_name: str

    # Nullable classification columns, declared without a default.
    exchange: Optional[str]
    sector: Optional[str]
    industry: Optional[str]
    market_cap_bucket: Optional[str]

    active: bool
class AliasCreate(BaseModel):
    """Request body for attaching an alias to a company."""

    alias: str
    # Free-form string; "brand" is used when the caller omits it.
    alias_type: str = "brand"
class WatchlistCreate(BaseModel):
    """Request body for creating a watchlist."""

    name: str
    # Optional free-text description.
    description: Optional[str] = None
class SourceCreate(BaseModel):
    """Request body for registering a data source against a company.

    ``source_type`` is a plain string here. ``add_source`` checks it
    against ``VALID_SOURCE_TYPES`` before inserting.
    """

    source_type: str
    source_name: str
    # pydantic copies mutable field defaults per instance, so this literal
    # is not shared between requests.
    config: dict = {}
    # NOTE(review): no range validation here -- confirm whether this should
    # be limited to [0, 1].
    credibility_score: float = 0.5
    retention_days: int = 365
    access_policy: str = "internal"
# Accepted values for SourceCreate.source_type, checked in add_source().
# NOTE(review): presumably mirrors the shared SourceType enum -- keep the two in sync.
VALID_SOURCE_TYPES = {
    "market_api",
    "news_api",
    "filings_api",
    "web_scrape",
    "broker",
}
# --- Company Endpoints ---
@app.post("/companies", response_model=CompanyResponse, status_code=201)
async def create_company(body: CompanyCreate):
    """Insert a new company and return the stored row.

    The ticker is upper-cased before insertion.

    Raises:
        HTTPException: 409 when the insert violates a unique constraint.
    """
    insert_sql = """INSERT INTO companies (ticker, legal_name, exchange, sector, industry, market_cap_bucket)
VALUES ($1, $2, $3, $4, $5, $6)
RETURNING id, ticker, legal_name, exchange, sector, industry, market_cap_bucket, active"""
    params = (
        body.ticker.upper(),
        body.legal_name,
        body.exchange,
        body.sector,
        body.industry,
        body.market_cap_bucket,
    )
    try:
        created = await pool.fetchrow(insert_sql, *params)
    except asyncpg.UniqueViolationError:
        raise HTTPException(409, f"Company {body.ticker} on {body.exchange} already exists")
    return dict(created)
@app.get("/companies", response_model=List[CompanyResponse])
async def list_companies(active: bool = True):
    """Return companies whose ``active`` flag matches, ordered by ticker.

    Args:
        active: Value of the ``active`` column to filter on.
    """
    query = "SELECT id, ticker, legal_name, exchange, sector, industry, market_cap_bucket, active FROM companies WHERE active = $1 ORDER BY ticker"
    records = await pool.fetch(query, active)
    return list(map(dict, records))
@app.get("/companies/{company_id}", response_model=CompanyResponse)
async def get_company(company_id: str):
    """Return a single company by id.

    Raises:
        HTTPException: 404 when no row has the given id.
    """
    query = "SELECT id, ticker, legal_name, exchange, sector, industry, market_cap_bucket, active FROM companies WHERE id = $1"
    record = await pool.fetchrow(query, company_id)
    if record is None:
        raise HTTPException(404, "Company not found")
    return dict(record)
@app.put("/companies/{company_id}", response_model=CompanyResponse)
async def update_company(company_id: str, body: CompanyCreate):
    """Replace every editable column of a company and return the new row.

    The ticker is upper-cased before it is stored, and ``updated_at`` is
    set to ``NOW()`` by the statement.

    Raises:
        HTTPException: 409 when the new values violate a unique constraint,
            as create_company reports for inserts. 404 when no row has the
            given id.
    """
    try:
        row = await pool.fetchrow(
            """UPDATE companies SET ticker=$2, legal_name=$3, exchange=$4, sector=$5, industry=$6, market_cap_bucket=$7, updated_at=NOW()
WHERE id=$1
RETURNING id, ticker, legal_name, exchange, sector, industry, market_cap_bucket, active""",
            company_id, body.ticker.upper(), body.legal_name, body.exchange,
            body.sector, body.industry, body.market_cap_bucket,
        )
    except asyncpg.UniqueViolationError:
        # Changing a company to an existing ticker would otherwise surface
        # as an unhandled 500.
        raise HTTPException(
            409, f"Company {body.ticker} on {body.exchange} already exists"
        ) from None
    if not row:
        raise HTTPException(404, "Company not found")
    return dict(row)
# --- Alias Endpoints ---
@app.post("/companies/{company_id}/aliases", status_code=201)
async def add_alias(company_id: str, body: AliasCreate):
    """Attach an alias to a company and return the stored alias row.

    Raises:
        HTTPException: 404 when the insert violates a foreign key.
    """
    try:
        row = await pool.fetchrow(
            "INSERT INTO company_aliases (company_id, alias, alias_type) VALUES ($1, $2, $3) RETURNING id, alias, alias_type",
            company_id, body.alias, body.alias_type,
        )
    except asyncpg.ForeignKeyViolationError:
        # Same mapping add_watchlist_member uses for a missing parent row.
        # NOTE(review): assumes company_aliases.company_id has a foreign key
        # to companies; without one this handler never fires.
        raise HTTPException(404, "Company not found") from None
    return dict(row)
@app.get("/companies/{company_id}/aliases")
async def list_aliases(company_id: str):
    """Return every alias row stored for the given company id.

    An id with no aliases produces an empty list.
    """
    query = "SELECT id, alias, alias_type FROM company_aliases WHERE company_id = $1"
    records = await pool.fetch(query, company_id)
    return list(map(dict, records))
# --- Watchlist Endpoints ---
@app.post("/watchlists", status_code=201)
async def create_watchlist(body: WatchlistCreate):
    """Create a watchlist and return the stored row.

    Raises:
        HTTPException: 409 when the insert violates a unique constraint.
    """
    insert_sql = "INSERT INTO watchlists (name, description) VALUES ($1, $2) RETURNING id, name, description, active"
    try:
        created = await pool.fetchrow(insert_sql, body.name, body.description)
    except asyncpg.UniqueViolationError:
        raise HTTPException(409, f"Watchlist '{body.name}' already exists")
    return dict(created)
@app.get("/watchlists")
async def list_watchlists():
    """Return all watchlists ordered by name."""
    query = "SELECT id, name, description, active FROM watchlists ORDER BY name"
    records = await pool.fetch(query)
    return list(map(dict, records))
@app.post("/watchlists/{watchlist_id}/members/{company_id}", status_code=201)
async def add_watchlist_member(watchlist_id: str, company_id: str):
    """Add a company to a watchlist.

    Returns:
        ``{"status": "added"}`` on success.

    Raises:
        HTTPException: 409 on a unique-constraint violation, 404 on a
            foreign-key violation.
    """
    insert_sql = "INSERT INTO watchlist_members (watchlist_id, company_id) VALUES ($1, $2)"
    try:
        await pool.execute(insert_sql, watchlist_id, company_id)
    except asyncpg.UniqueViolationError:
        raise HTTPException(409, "Already a member")
    except asyncpg.ForeignKeyViolationError:
        raise HTTPException(404, "Watchlist or company not found")
    return {"status": "added"}
@app.get("/watchlists/{watchlist_id}/members")
async def list_watchlist_members(watchlist_id: str):
    """Return the companies that belong to a watchlist, ordered by ticker.

    A watchlist id with no members produces an empty list.
    """
    query = """SELECT c.id, c.ticker, c.legal_name, c.exchange, c.sector, c.industry, c.market_cap_bucket, c.active
FROM companies c JOIN watchlist_members wm ON c.id = wm.company_id
WHERE wm.watchlist_id = $1 ORDER BY c.ticker"""
    records = await pool.fetch(query, watchlist_id)
    return list(map(dict, records))
# --- Source Endpoints ---
@app.post("/companies/{company_id}/sources", status_code=201)
async def add_source(company_id: str, body: SourceCreate):
    """Register a data source for a company and return the stored row.

    Raises:
        HTTPException: 400 when ``body.source_type`` is not one of
            ``VALID_SOURCE_TYPES``. 404 when the insert violates a foreign
            key.
    """
    if body.source_type not in VALID_SOURCE_TYPES:
        # Sorted so the message is stable; raw set order varies between
        # processes.
        raise HTTPException(
            400,
            f"Invalid source_type. Must be one of: {sorted(VALID_SOURCE_TYPES)}",
        )
    try:
        # NOTE(review): body.config is passed as a dict. asyncpg expects
        # json/jsonb parameters as str unless a type codec is registered on
        # the pool -- confirm get_pg_pool() sets one for sources.config.
        row = await pool.fetchrow(
            """INSERT INTO sources (company_id, source_type, source_name, config, credibility_score, retention_days, access_policy)
VALUES ($1, $2, $3, $4, $5, $6, $7)
RETURNING id, source_type, source_name, credibility_score, active""",
            company_id, body.source_type, body.source_name,
            body.config, body.credibility_score, body.retention_days, body.access_policy,
        )
    except asyncpg.ForeignKeyViolationError:
        # Same mapping add_watchlist_member uses for a missing parent row.
        # NOTE(review): assumes sources.company_id has a foreign key to
        # companies; without one this handler never fires.
        raise HTTPException(404, "Company not found") from None
    return dict(row)
@app.get("/companies/{company_id}/sources")
async def list_sources(company_id: str):
    """Return every source row registered for the given company id.

    An id with no sources produces an empty list.
    """
    query = "SELECT id, source_type, source_name, config, credibility_score, retention_days, access_policy, active FROM sources WHERE company_id = $1"
    records = await pool.fetch(query, company_id)
    return list(map(dict, records))