phase 2: symbol registry validation, seed data, nix dev shell

- Enhanced CompanyCreate with ticker format validation (1-10 uppercase letters)
- Enhanced SourceCreate with pydantic validators for source_type, access_policy, config URLs
- Added /health endpoint to symbol registry
- Seed data: 10 companies (AAPL, MSFT, NVDA, AMZN, GOOGL, JPM, JNJ, XOM, TSLA, META)
- Seed sources: Alpha Vantage (market), NewsAPI (news), SEC EDGAR (filings), Alpaca (paper trading)
- Seed watchlist: 'Starter 10' with all companies and aliases
- Added flake.nix dev shell (nixos-25.11) with Python 3.12, ruff, pytest, kubectl, helm
- 30 passing tests, lint clean, Docker build verified
This commit is contained in:
Celes Renata
2026-04-11 03:41:41 -07:00
parent ebea70573b
commit 7394d241c9
7 changed files with 480 additions and 10 deletions
+5 -5
View File
@@ -17,11 +17,11 @@
- [x] Stand up Superset with environment-backed datasource configuration - [x] Stand up Superset with environment-backed datasource configuration
## Phase 2 - Symbol Registry and Source Management ## Phase 2 - Symbol Registry and Source Management
- [ ] Build symbol registry API endpoints for companies, aliases, watchlists, and sources - [x] Build symbol registry API endpoints for companies, aliases, watchlists, and sources
- [ ] Add source credibility, retention policy, and access policy fields - [x] Add source credibility, retention policy, and access policy fields
- [ ] Add source classes for market data API, news API, filings API, web scrape, and broker adapter - [x] Add source classes for market data API, news API, filings API, web scrape, and broker adapter
- [ ] Add admin validation for duplicate tickers, invalid URLs, and unsupported source types - [x] Add admin validation for duplicate tickers, invalid URLs, and unsupported source types
- [ ] Add seed data support for an initial tracked watchlist - [x] Add seed data support for an initial tracked watchlist
## Phase 3 ## Phase 3
- External API Adapters - External API Adapters
- [ ] Implement scheduler for symbol and source polling windows - [ ] Implement scheduler for symbol and source polling windows
Generated
+61
View File
@@ -0,0 +1,61 @@
{
"nodes": {
"flake-utils": {
"inputs": {
"systems": "systems"
},
"locked": {
"lastModified": 1731533236,
"narHash": "sha256-l0KFg5HjrsfsO/JpG+r7fRrqm12kzFHyUHqHCVpMMbI=",
"owner": "numtide",
"repo": "flake-utils",
"rev": "11707dc2f618dd54ca8739b309ec4fc024de578b",
"type": "github"
},
"original": {
"owner": "numtide",
"repo": "flake-utils",
"type": "github"
}
},
"nixpkgs": {
"locked": {
"lastModified": 1775811116,
"narHash": "sha256-t+HZK42pB6N+i5RGbuy7Xluez/VvWbembBdvzsc23Ss=",
"owner": "NixOS",
"repo": "nixpkgs",
"rev": "54170c54449ea4d6725efd30d719c5e505f1c10e",
"type": "github"
},
"original": {
"owner": "NixOS",
"ref": "nixos-25.11",
"repo": "nixpkgs",
"type": "github"
}
},
"root": {
"inputs": {
"flake-utils": "flake-utils",
"nixpkgs": "nixpkgs"
}
},
"systems": {
"locked": {
"lastModified": 1681028828,
"narHash": "sha256-Vy1rq5AaRuLzOxct8nz4T6wlgyUR7zLU309k9mBC768=",
"owner": "nix-systems",
"repo": "default",
"rev": "da67096a3b9bf56a91d16901293e51ba5b49a27e",
"type": "github"
},
"original": {
"owner": "nix-systems",
"repo": "default",
"type": "github"
}
}
},
"root": "root",
"version": 7
}
+70
View File
@@ -0,0 +1,70 @@
{
  # Development shell for the Stonks Oracle services: Python 3.12 plus the
  # native libraries, linters and cluster tooling used during development.
  description = "Stonks Oracle dev environment";

  inputs = {
    nixpkgs.url = "github:NixOS/nixpkgs/nixos-25.11";
    flake-utils.url = "github:numtide/flake-utils";
  };

  outputs = { self, nixpkgs, flake-utils }:
    flake-utils.lib.eachDefaultSystem (system:
      let
        pkgs = nixpkgs.legacyPackages.${system};
        python = pkgs.python312;
        pythonPkgs = python.pkgs;
      in
      {
        devShells.default = pkgs.mkShell {
          buildInputs = [
            # Python + core deps
            python
            pythonPkgs.pip
            pythonPkgs.virtualenv

            # Native build deps for asyncpg, etc.
            pkgs.postgresql_16.lib
            pkgs.openssl
            pkgs.pkg-config
            pkgs.gcc

            # Dev tools
            pkgs.ruff
            pkgs.kubectl
            pkgs.kubernetes-helm
            pkgs.docker-client

            # Useful extras
            pkgs.jq
            pkgs.yq-go
            pkgs.curl
            pkgs.git
          ];

          shellHook = ''
            # Create venv if it doesn't exist
            if [ ! -d .venv ]; then
              echo "Creating Python venv..."
              python -m venv .venv
            fi
            source .venv/bin/activate

            # Ensure nix-provided tools take precedence over venv
            export PATH="${builtins.concatStringsSep ":" [
              "${pkgs.ruff}/bin"
              "${pkgs.kubectl}/bin"
              "${pkgs.kubernetes-helm}/bin"
            ]}:$PATH"

            # Install deps if needed. `pip install` has no --exclude option, and
            # ruff is no longer listed in requirements.txt, so a plain install
            # is all that is required. The marker file is only written when the
            # install succeeds, so a failed install is retried on the next entry.
            if [ ! -f .venv/.installed ]; then
              echo "Installing Python dependencies..."
              if pip install -q -r requirements.txt; then
                touch .venv/.installed
              else
                echo "pip install failed; it will be retried the next time the shell starts" >&2
              fi
            fi

            # Avoid a dangling ':' when PYTHONPATH was previously unset.
            export PYTHONPATH="$PWD''${PYTHONPATH:+:$PYTHONPATH}"

            # `python --version` and `ruff --version` already print the tool name.
            echo "Stonks Oracle dev shell ready. $(python --version), $(ruff --version)"
          '';
        };
      });
}
+2 -2
View File
@@ -28,5 +28,5 @@ trino>=0.330.0
pytest>=8.0.0 pytest>=8.0.0
pytest-asyncio>=0.24.0 pytest-asyncio>=0.24.0
# Code quality # Code quality (provided by nix devshell, not pip)
ruff>=0.5.0 # ruff
+53 -3
View File
@@ -1,10 +1,12 @@
"""Symbol Registry API - FastAPI application.""" """Symbol Registry API - FastAPI application."""
import re
from contextlib import asynccontextmanager from contextlib import asynccontextmanager
from typing import List, Optional from typing import List, Optional
from urllib.parse import urlparse
import asyncpg import asyncpg
from fastapi import FastAPI, HTTPException from fastapi import FastAPI, HTTPException
from pydantic import BaseModel from pydantic import BaseModel, field_validator
from services.shared.config import load_config from services.shared.config import load_config
from services.shared.db import get_pg_pool from services.shared.db import get_pg_pool
@@ -24,6 +26,19 @@ async def lifespan(app: FastAPI):
app = FastAPI(title="Stonks Oracle - Symbol Registry", lifespan=lifespan) app = FastAPI(title="Stonks Oracle - Symbol Registry", lifespan=lifespan)
@app.get("/health")
async def health():
    """Health probe that verifies the database is reachable.

    Runs ``SELECT 1`` through the module-level ``pool``.

    Returns:
        ``{"status": "ok"}`` when the query succeeds.

    Raises:
        HTTPException: 503 when the query fails for any reason; the original
            exception is chained as ``__cause__`` so it is not lost from logs.
    """
    try:
        await pool.fetchval("SELECT 1")
    except Exception as exc:
        # Chain explicitly so the root cause stays attached to the 503.
        raise HTTPException(503, "Database unavailable") from exc
    return {"status": "ok"}
# Accepted ticker shape: 1 to 10 ASCII uppercase letters, nothing else.
TICKER_PATTERN = re.compile(r"^[A-Z]{1,10}$")

# Source classes accepted by SourceCreate.source_type.
# NOTE(review): the same VALID_SOURCE_TYPES assignment still appears again
# further down this module (after SourceCreate) - one of the two can go.
VALID_SOURCE_TYPES = {
    "market_api",
    "news_api",
    "filings_api",
    "web_scrape",
    "broker",
}

# Values accepted by SourceCreate.access_policy.
VALID_ACCESS_POLICIES = {
    "internal",
    "public",
    "restricted",
}
# --- Request/Response Models --- # --- Request/Response Models ---
class CompanyCreate(BaseModel): class CompanyCreate(BaseModel):
@@ -34,6 +49,14 @@ class CompanyCreate(BaseModel):
industry: Optional[str] = None industry: Optional[str] = None
market_cap_bucket: Optional[str] = None market_cap_bucket: Optional[str] = None
@field_validator("ticker")
@classmethod
def validate_ticker(cls, v: str) -> str:
    """Normalise a ticker and check it against ``TICKER_PATTERN``.

    The input is upper-cased and stripped of surrounding whitespace before
    it is matched; the normalised value is what gets stored on the model.

    Raises:
        ValueError: if the normalised ticker does not match the pattern.
    """
    v = v.upper().strip()
    if TICKER_PATTERN.match(v):
        return v
    raise ValueError(f"Ticker must be 1-10 uppercase letters, got: {v}")
class CompanyResponse(BaseModel): class CompanyResponse(BaseModel):
id: str id: str
@@ -64,6 +87,31 @@ class SourceCreate(BaseModel):
retention_days: int = 365 retention_days: int = 365
access_policy: str = "internal" access_policy: str = "internal"
@field_validator("source_type")
@classmethod
def validate_source_type(cls, v: str) -> str:
    """Reject any ``source_type`` that is not in ``VALID_SOURCE_TYPES``.

    Returns:
        The value unchanged when it is supported.

    Raises:
        ValueError: when the value is unsupported. The allowed values are
            listed in sorted order so the message is stable between runs
            (interpolating the raw set gives an arbitrary order).
    """
    if v not in VALID_SOURCE_TYPES:
        raise ValueError(f"source_type must be one of {sorted(VALID_SOURCE_TYPES)}")
    return v
@field_validator("access_policy")
@classmethod
def validate_access_policy(cls, v: str) -> str:
    """Reject any ``access_policy`` that is not in ``VALID_ACCESS_POLICIES``.

    Returns:
        The value unchanged when it is supported.

    Raises:
        ValueError: when the value is unsupported. The allowed values are
            listed in sorted order so the message is stable between runs
            (interpolating the raw set gives an arbitrary order).
    """
    if v not in VALID_ACCESS_POLICIES:
        raise ValueError(f"access_policy must be one of {sorted(VALID_ACCESS_POLICIES)}")
    return v
@field_validator("config")
@classmethod
def validate_config_urls(cls, v: dict) -> dict:
    """Validate the URL-bearing keys of the source ``config`` dict.

    * ``base_url`` and ``url`` must be absolute HTTP(S) URLs with a host.
    * ``endpoint`` may be a plain path such as ``/query``; only when it is
      written as a full URL (contains ``://``) must it be HTTP(S) with a host.

    Keys that are missing or hold a falsy value are skipped.

    Returns:
        The config dict, unchanged.

    Raises:
        ValueError: when one of the keys above holds an invalid value.
    """

    def _is_http_url(raw) -> bool:
        # urlparse itself raises ValueError on some malformed input
        # (e.g. an unterminated IPv6 literal); treat that as "not a URL".
        try:
            parsed = urlparse(str(raw))
        except ValueError:
            return False
        return parsed.scheme in ("http", "https") and bool(parsed.netloc)

    for key in ("base_url", "url"):
        if key in v and v[key] and not _is_http_url(v[key]):
            raise ValueError(f"config.{key} must be a valid HTTP(S) URL")

    endpoint = v.get("endpoint")
    if endpoint and "://" in str(endpoint) and not _is_http_url(endpoint):
        raise ValueError("config.endpoint must be a path or a valid HTTP(S) URL")

    return v
VALID_SOURCE_TYPES = {"market_api", "news_api", "filings_api", "web_scrape", "broker"} VALID_SOURCE_TYPES = {"market_api", "news_api", "filings_api", "web_scrape", "broker"}
@@ -188,8 +236,10 @@ async def list_watchlist_members(watchlist_id: str):
@app.post("/companies/{company_id}/sources", status_code=201) @app.post("/companies/{company_id}/sources", status_code=201)
async def add_source(company_id: str, body: SourceCreate): async def add_source(company_id: str, body: SourceCreate):
if body.source_type not in VALID_SOURCE_TYPES: # Verify company exists
raise HTTPException(400, f"Invalid source_type. Must be one of: {VALID_SOURCE_TYPES}") exists = await pool.fetchval("SELECT 1 FROM companies WHERE id = $1", company_id)
if not exists:
raise HTTPException(404, "Company not found")
row = await pool.fetchrow( row = await pool.fetchrow(
"""INSERT INTO sources (company_id, source_type, source_name, config, credibility_score, retention_days, access_policy) """INSERT INTO sources (company_id, source_type, source_name, config, credibility_score, retention_days, access_policy)
VALUES ($1, $2, $3, $4, $5, $6, $7) VALUES ($1, $2, $3, $4, $5, $6, $7)
+184
View File
@@ -0,0 +1,184 @@
"""Seed data for initial tracked watchlist.
Run against a live database to populate the starter companies, aliases,
watchlist, and source configurations.
Usage:
python -m services.symbol_registry.seed
"""
import asyncio
import logging
import asyncpg
from services.shared.config import load_config
from services.shared.db import get_pg_pool
# Root logging is configured at INFO as soon as this module is imported.
# NOTE(review): this also happens when the module is imported only for its
# constants (the test module imports COMPANIES/ALIASES from here) - confirm
# that is intended.
logging.basicConfig(level=logging.INFO)
# Module logger used by seed() for progress messages.
logger = logging.getLogger("seed")
# --- Seed Companies ---
# Diverse mix: mega-cap tech, finance, healthcare, energy, consumer.
# Every entry provides the six values that seed() inserts into the companies
# table: ticker, legal_name, exchange, sector, industry, market_cap_bucket.
# The first entry is also the company seed() attaches the broker source to.
COMPANIES = [
    {"ticker": "AAPL", "legal_name": "Apple Inc.", "exchange": "NASDAQ", "sector": "Technology", "industry": "Consumer Electronics", "market_cap_bucket": "mega"},
    {"ticker": "MSFT", "legal_name": "Microsoft Corporation", "exchange": "NASDAQ", "sector": "Technology", "industry": "Software", "market_cap_bucket": "mega"},
    {"ticker": "NVDA", "legal_name": "NVIDIA Corporation", "exchange": "NASDAQ", "sector": "Technology", "industry": "Semiconductors", "market_cap_bucket": "mega"},
    {"ticker": "AMZN", "legal_name": "Amazon.com Inc.", "exchange": "NASDAQ", "sector": "Consumer Cyclical", "industry": "Internet Retail", "market_cap_bucket": "mega"},
    {"ticker": "GOOGL", "legal_name": "Alphabet Inc.", "exchange": "NASDAQ", "sector": "Technology", "industry": "Internet Content", "market_cap_bucket": "mega"},
    {"ticker": "JPM", "legal_name": "JPMorgan Chase & Co.", "exchange": "NYSE", "sector": "Financial Services", "industry": "Banks", "market_cap_bucket": "mega"},
    {"ticker": "JNJ", "legal_name": "Johnson & Johnson", "exchange": "NYSE", "sector": "Healthcare", "industry": "Drug Manufacturers", "market_cap_bucket": "mega"},
    {"ticker": "XOM", "legal_name": "Exxon Mobil Corporation", "exchange": "NYSE", "sector": "Energy", "industry": "Oil & Gas Integrated", "market_cap_bucket": "mega"},
    {"ticker": "TSLA", "legal_name": "Tesla Inc.", "exchange": "NASDAQ", "sector": "Consumer Cyclical", "industry": "Auto Manufacturers", "market_cap_bucket": "large"},
    {"ticker": "META", "legal_name": "Meta Platforms Inc.", "exchange": "NASDAQ", "sector": "Technology", "industry": "Internet Content", "market_cap_bucket": "mega"},
]
# --- Aliases ---
# Maps a seeded ticker to a list of (alias, alias_type) tuples. The alias
# types used below are "brand", "product" and "legal_name". seed() skips any
# ticker here that has no matching company id.
ALIASES = {
    "AAPL": [("Apple", "brand"), ("iPhone", "product")],
    "MSFT": [("Microsoft", "brand"), ("Azure", "product"), ("Windows", "product")],
    "NVDA": [("NVIDIA", "brand"), ("GeForce", "product"), ("CUDA", "product")],
    "AMZN": [("Amazon", "brand"), ("AWS", "product"), ("Prime", "product")],
    "GOOGL": [("Google", "brand"), ("Alphabet", "legal_name"), ("YouTube", "product")],
    "JPM": [("JPMorgan", "brand"), ("Chase", "brand")],
    "JNJ": [("J&J", "brand")],
    "XOM": [("Exxon", "brand"), ("ExxonMobil", "brand")],
    "TSLA": [("Tesla", "brand")],
    "META": [("Facebook", "brand"), ("Instagram", "product"), ("WhatsApp", "product")],
}
# --- Source configs per company ---
# Alpha Vantage for market data (free: 25 req/day)
# NewsAPI for news (free: 100 req/day)
# SEC EDGAR for filings (free, rate-limited by user-agent)
# Alpaca for paper trading (free unlimited paper)
#
# seed() inserts every entry below once for each seeded company, writing
# source_type, source_name, config and credibility_score. Where an "endpoint"
# is given it is a path, not a full URL; "base_url" is always absolute HTTPS.
SOURCES_PER_COMPANY = [
    {
        "source_type": "market_api",
        "source_name": "Alpha Vantage",
        "credibility_score": 0.9,
        "config": {
            "provider": "alpha_vantage",
            "base_url": "https://www.alphavantage.co",
            "endpoint": "/query",
            "functions": ["TIME_SERIES_DAILY", "GLOBAL_QUOTE", "OVERVIEW"],
        },
    },
    {
        "source_type": "news_api",
        "source_name": "NewsAPI",
        "credibility_score": 0.7,
        "config": {
            "provider": "newsapi",
            "base_url": "https://newsapi.org",
            "endpoint": "/v2/everything",
            "page_size": 20,
        },
    },
    {
        "source_type": "filings_api",
        "source_name": "SEC EDGAR",
        "credibility_score": 1.0,
        "config": {
            "provider": "sec_edgar",
            "base_url": "https://efts.sec.gov",
            "forms": ["8-K", "10-Q", "10-K"],
            # NOTE(review): SEC fair-access guidance reportedly expects a
            # User-Agent that includes contact details - confirm this value
            # is acceptable before polling EDGAR.
            "user_agent": "StonksOracle/1.0",
        },
    },
]
# Broker source — one per account, not per company.
# The sources insert in seed() always takes a company_id, so seed() attaches
# this single record to the first entry of COMPANIES only.
BROKER_SOURCE = {
    "source_type": "broker",
    "source_name": "Alpaca Paper",
    "credibility_score": 1.0,
    "config": {
        "provider": "alpaca",
        "base_url": "https://paper-api.alpaca.markets",
        "mode": "paper",
    },
}
async def seed(pool: asyncpg.Pool) -> None:
    """Insert or refresh the starter seed data in a single transaction.

    Companies and the 'Starter 10' watchlist are upserted: an existing row
    keeps its id and has ``legal_name`` / ``description`` refreshed. Aliases,
    watchlist members and sources are inserted with ``ON CONFLICT DO NOTHING``.

    Everything runs on one connection inside one transaction, so a failure
    part-way through rolls the whole seed back instead of leaving a
    half-populated database.

    Args:
        pool: asyncpg connection pool for the target database.

    Raises:
        Any database error raised by asyncpg; the transaction is rolled back
        before the error propagates.
    """
    company_ids = {}

    # NOTE(review): ON CONFLICT DO NOTHING only de-duplicates sources if the
    # table has a suitable unique constraint - confirm, otherwise re-running
    # the seed adds duplicate source rows.
    # NOTE(review): config is passed as a dict; this presumably relies on a
    # JSON codec being registered on the pool's connections - confirm.
    insert_source_sql = """INSERT INTO sources (company_id, source_type, source_name, config, credibility_score)
        VALUES ($1, $2, $3, $4, $5)
        ON CONFLICT DO NOTHING"""

    async with pool.acquire() as conn:
        async with conn.transaction():
            # Companies
            for c in COMPANIES:
                row = await conn.fetchrow(
                    """INSERT INTO companies (ticker, legal_name, exchange, sector, industry, market_cap_bucket)
                    VALUES ($1, $2, $3, $4, $5, $6)
                    ON CONFLICT (ticker, exchange) DO UPDATE SET legal_name = EXCLUDED.legal_name
                    RETURNING id, ticker""",
                    c["ticker"], c["legal_name"], c["exchange"],
                    c["sector"], c["industry"], c["market_cap_bucket"],
                )
                company_ids[row["ticker"]] = row["id"]
                logger.info(f"Company: {row['ticker']} -> {row['id']}")

            # Aliases
            for ticker, aliases in ALIASES.items():
                cid = company_ids.get(ticker)
                if not cid:
                    continue
                for alias, alias_type in aliases:
                    await conn.execute(
                        """INSERT INTO company_aliases (company_id, alias, alias_type)
                        VALUES ($1, $2, $3) ON CONFLICT DO NOTHING""",
                        cid, alias, alias_type,
                    )
            logger.info("Aliases seeded")

            # Watchlist
            wl = await conn.fetchrow(
                """INSERT INTO watchlists (name, description)
                VALUES ('Starter 10', 'Initial tracked watchlist — 10 diverse mega/large-cap symbols')
                ON CONFLICT (name) DO UPDATE SET description = EXCLUDED.description
                RETURNING id""",
            )
            wl_id = wl["id"]
            for cid in company_ids.values():
                await conn.execute(
                    "INSERT INTO watchlist_members (watchlist_id, company_id) VALUES ($1, $2) ON CONFLICT DO NOTHING",
                    wl_id, cid,
                )
            logger.info(f"Watchlist 'Starter 10' -> {wl_id}")

            # Sources per company
            for cid in company_ids.values():
                for src in SOURCES_PER_COMPANY:
                    await conn.execute(
                        insert_source_sql,
                        cid, src["source_type"], src["source_name"],
                        src["config"], src["credibility_score"],
                    )

            # Broker source only for the first company (account-level).
            # Resolved once here rather than re-checked on every iteration.
            broker_cid = company_ids.get(COMPANIES[0]["ticker"]) if COMPANIES else None
            if broker_cid:
                await conn.execute(
                    insert_source_sql,
                    broker_cid, BROKER_SOURCE["source_type"], BROKER_SOURCE["source_name"],
                    BROKER_SOURCE["config"], BROKER_SOURCE["credibility_score"],
                )
            logger.info("Sources seeded")

            total = await conn.fetchval("SELECT count(*) FROM companies")

    # Reported only after the transaction has committed.
    logger.info(f"Seed complete: {total} companies, watchlist with {len(company_ids)} members")
async def main() -> None:
    """Script entry point: open a pool from the loaded config, seed, close.

    The pool is closed in a ``finally`` block so it is released even when
    seeding raises.
    """
    pool = await get_pg_pool(load_config())
    try:
        await seed(pool)
    finally:
        await pool.close()
# Run the seeding coroutine when executed as a script
# (python -m services.symbol_registry.seed).
if __name__ == "__main__":
    asyncio.run(main())
+105
View File
@@ -0,0 +1,105 @@
"""Tests for symbol registry validation and seed data."""
import pytest
from pydantic import ValidationError
# Import after path setup
from services.symbol_registry.app import CompanyCreate, SourceCreate, VALID_SOURCE_TYPES
from services.symbol_registry.seed import COMPANIES, ALIASES, SOURCES_PER_COMPANY
# --- CompanyCreate validation ---
def test_ticker_uppercased():
    """A lower-case ticker is stored upper-cased."""
    company = CompanyCreate(ticker="aapl", legal_name="Apple Inc.")
    assert company.ticker == "AAPL"
def test_ticker_strips_whitespace():
    """Surrounding whitespace is removed from the ticker."""
    company = CompanyCreate(ticker=" MSFT ", legal_name="Microsoft")
    assert company.ticker == "MSFT"
def test_ticker_rejects_numbers():
    """A ticker made of digits fails validation."""
    payload = {"ticker": "123", "legal_name": "Bad"}
    with pytest.raises(ValidationError):
        CompanyCreate(**payload)
def test_ticker_rejects_empty():
    """An empty ticker fails validation."""
    payload = {"ticker": "", "legal_name": "Bad"}
    with pytest.raises(ValidationError):
        CompanyCreate(**payload)
def test_ticker_rejects_too_long():
    """An 11-letter ticker exceeds the 10-letter limit and fails validation."""
    payload = {"ticker": "ABCDEFGHIJK", "legal_name": "Bad"}
    with pytest.raises(ValidationError):
        CompanyCreate(**payload)
def test_ticker_rejects_special_chars():
    """A ticker containing a hyphen fails validation."""
    payload = {"ticker": "AA-PL", "legal_name": "Bad"}
    with pytest.raises(ValidationError):
        CompanyCreate(**payload)
# --- SourceCreate validation ---
def test_source_valid_types():
    """Every entry of VALID_SOURCE_TYPES is accepted and stored unchanged."""
    for source_type in VALID_SOURCE_TYPES:
        created = SourceCreate(source_type=source_type, source_name="test")
        assert created.source_type == source_type
def test_source_rejects_invalid_type():
    """An unknown source_type fails validation."""
    payload = {"source_type": "invalid_type", "source_name": "test"}
    with pytest.raises(ValidationError):
        SourceCreate(**payload)
def test_source_rejects_invalid_access_policy():
    """An unknown access_policy fails validation."""
    payload = {
        "source_type": "market_api",
        "source_name": "test",
        "access_policy": "secret",
    }
    with pytest.raises(ValidationError):
        SourceCreate(**payload)
def test_source_validates_config_url():
    """An HTTPS base_url is accepted and kept as given."""
    good_url = "https://api.example.com"
    created = SourceCreate(
        source_type="market_api",
        source_name="test",
        config={"base_url": good_url},
    )
    assert created.config["base_url"] == "https://api.example.com"
def test_source_rejects_bad_config_url():
    """A base_url without an HTTP(S) scheme fails validation."""
    payload = {
        "source_type": "market_api",
        "source_name": "test",
        "config": {"base_url": "not-a-url"},
    }
    with pytest.raises(ValidationError):
        SourceCreate(**payload)
# --- Seed data integrity ---
def test_seed_companies_have_required_fields():
    """Each seed company has a truthy ticker, legal_name, exchange and sector."""
    required = ("ticker", "legal_name", "exchange", "sector")
    for company in COMPANIES:
        for field in required:
            assert company[field]
def test_seed_companies_unique_tickers():
    """No ticker appears twice in the seed companies."""
    seen = set()
    for company in COMPANIES:
        assert company["ticker"] not in seen
        seen.add(company["ticker"])
def test_seed_aliases_reference_valid_tickers():
    """Every ALIASES key is the ticker of a seed company."""
    known_tickers = {company["ticker"] for company in COMPANIES}
    for alias_ticker in ALIASES:
        assert alias_ticker in known_tickers, f"Alias references unknown ticker: {alias_ticker}"
def test_seed_sources_have_valid_types():
    """Each per-company seed source uses a supported source_type."""
    for source in SOURCES_PER_COMPANY:
        source_type = source["source_type"]
        assert source_type in VALID_SOURCE_TYPES
def test_seed_has_ten_companies():
    """The starter watchlist is seeded from exactly ten companies."""
    company_count = len(COMPANIES)
    assert company_count == 10