feat: expand seed to 50 companies with competitor relationships and macro source

This commit is contained in:
Celes Renata
2026-04-14 21:00:19 +00:00
parent 86808465de
commit 8d0526470d
+205 -23
View File
@@ -1,7 +1,7 @@
"""Seed data for initial tracked watchlist.
Run against a live database to populate the starter companies, aliases,
watchlist, source configurations, macro news source, and competitor relationships.
Usage:
python -m services.symbol_registry.seed
@@ -18,19 +18,67 @@ from services.shared.logging import setup_logging
logger = logging.getLogger("seed")
# --- Seed Companies ---
# Diverse mix: mega-cap tech, finance, healthcare, energy, consumer
# --- Seed Companies (50 diverse large/mega-cap) ---
# Starter universe of tracked companies, grouped by sector.
#
# Each entry carries the six fields seed() passes to the companies insert:
# ticker, legal_name, exchange, sector, industry, market_cap_bucket.
#
# Every ticker must appear exactly once. seed() is described as upserting on
# (ticker, exchange), so a repeated row would only redo the same write and
# make the list disagree with the "Starter 50" watchlist name.
#
# Order matters for the first entry only: seed() attaches the account-level
# broker source and the global macro news source to COMPANIES[0].
COMPANIES: list[dict[str, str]] = [
    # Technology
    {"ticker": "AAPL", "legal_name": "Apple Inc.", "exchange": "NASDAQ", "sector": "Technology", "industry": "Consumer Electronics", "market_cap_bucket": "mega"},
    {"ticker": "MSFT", "legal_name": "Microsoft Corporation", "exchange": "NASDAQ", "sector": "Technology", "industry": "Software", "market_cap_bucket": "mega"},
    {"ticker": "NVDA", "legal_name": "NVIDIA Corporation", "exchange": "NASDAQ", "sector": "Technology", "industry": "Semiconductors", "market_cap_bucket": "mega"},
    {"ticker": "GOOGL", "legal_name": "Alphabet Inc.", "exchange": "NASDAQ", "sector": "Technology", "industry": "Internet Content", "market_cap_bucket": "mega"},
    {"ticker": "META", "legal_name": "Meta Platforms Inc.", "exchange": "NASDAQ", "sector": "Technology", "industry": "Internet Content", "market_cap_bucket": "mega"},
    {"ticker": "AVGO", "legal_name": "Broadcom Inc.", "exchange": "NASDAQ", "sector": "Technology", "industry": "Semiconductors", "market_cap_bucket": "mega"},
    {"ticker": "ORCL", "legal_name": "Oracle Corporation", "exchange": "NYSE", "sector": "Technology", "industry": "Software", "market_cap_bucket": "mega"},
    {"ticker": "CRM", "legal_name": "Salesforce Inc.", "exchange": "NYSE", "sector": "Technology", "industry": "Software", "market_cap_bucket": "large"},
    {"ticker": "AMD", "legal_name": "Advanced Micro Devices Inc.", "exchange": "NASDAQ", "sector": "Technology", "industry": "Semiconductors", "market_cap_bucket": "large"},
    {"ticker": "INTC", "legal_name": "Intel Corporation", "exchange": "NASDAQ", "sector": "Technology", "industry": "Semiconductors", "market_cap_bucket": "large"},
    {"ticker": "CSCO", "legal_name": "Cisco Systems Inc.", "exchange": "NASDAQ", "sector": "Technology", "industry": "Networking", "market_cap_bucket": "large"},
    {"ticker": "ADBE", "legal_name": "Adobe Inc.", "exchange": "NASDAQ", "sector": "Technology", "industry": "Software", "market_cap_bucket": "large"},
    # Consumer Cyclical
    {"ticker": "AMZN", "legal_name": "Amazon.com Inc.", "exchange": "NASDAQ", "sector": "Consumer Cyclical", "industry": "Internet Retail", "market_cap_bucket": "mega"},
    {"ticker": "TSLA", "legal_name": "Tesla Inc.", "exchange": "NASDAQ", "sector": "Consumer Cyclical", "industry": "Auto Manufacturers", "market_cap_bucket": "large"},
    {"ticker": "HD", "legal_name": "The Home Depot Inc.", "exchange": "NYSE", "sector": "Consumer Cyclical", "industry": "Home Improvement", "market_cap_bucket": "mega"},
    {"ticker": "NKE", "legal_name": "Nike Inc.", "exchange": "NYSE", "sector": "Consumer Cyclical", "industry": "Footwear & Accessories", "market_cap_bucket": "large"},
    {"ticker": "MCD", "legal_name": "McDonald's Corporation", "exchange": "NYSE", "sector": "Consumer Cyclical", "industry": "Restaurants", "market_cap_bucket": "large"},
    {"ticker": "SBUX", "legal_name": "Starbucks Corporation", "exchange": "NASDAQ", "sector": "Consumer Cyclical", "industry": "Restaurants", "market_cap_bucket": "large"},
    # Financial Services
    {"ticker": "JPM", "legal_name": "JPMorgan Chase & Co.", "exchange": "NYSE", "sector": "Financial Services", "industry": "Banks", "market_cap_bucket": "mega"},
    {"ticker": "V", "legal_name": "Visa Inc.", "exchange": "NYSE", "sector": "Financial Services", "industry": "Credit Services", "market_cap_bucket": "mega"},
    {"ticker": "MA", "legal_name": "Mastercard Inc.", "exchange": "NYSE", "sector": "Financial Services", "industry": "Credit Services", "market_cap_bucket": "mega"},
    {"ticker": "BAC", "legal_name": "Bank of America Corporation", "exchange": "NYSE", "sector": "Financial Services", "industry": "Banks", "market_cap_bucket": "mega"},
    {"ticker": "GS", "legal_name": "Goldman Sachs Group Inc.", "exchange": "NYSE", "sector": "Financial Services", "industry": "Capital Markets", "market_cap_bucket": "large"},
    {"ticker": "MS", "legal_name": "Morgan Stanley", "exchange": "NYSE", "sector": "Financial Services", "industry": "Capital Markets", "market_cap_bucket": "large"},
    {"ticker": "BRK.B", "legal_name": "Berkshire Hathaway Inc.", "exchange": "NYSE", "sector": "Financial Services", "industry": "Insurance", "market_cap_bucket": "mega"},
    # Healthcare
    {"ticker": "JNJ", "legal_name": "Johnson & Johnson", "exchange": "NYSE", "sector": "Healthcare", "industry": "Drug Manufacturers", "market_cap_bucket": "mega"},
    {"ticker": "UNH", "legal_name": "UnitedHealth Group Inc.", "exchange": "NYSE", "sector": "Healthcare", "industry": "Health Insurance", "market_cap_bucket": "mega"},
    {"ticker": "LLY", "legal_name": "Eli Lilly and Company", "exchange": "NYSE", "sector": "Healthcare", "industry": "Drug Manufacturers", "market_cap_bucket": "mega"},
    {"ticker": "PFE", "legal_name": "Pfizer Inc.", "exchange": "NYSE", "sector": "Healthcare", "industry": "Drug Manufacturers", "market_cap_bucket": "large"},
    {"ticker": "ABBV", "legal_name": "AbbVie Inc.", "exchange": "NYSE", "sector": "Healthcare", "industry": "Drug Manufacturers", "market_cap_bucket": "mega"},
    {"ticker": "MRK", "legal_name": "Merck & Co. Inc.", "exchange": "NYSE", "sector": "Healthcare", "industry": "Drug Manufacturers", "market_cap_bucket": "mega"},
    # Energy
    {"ticker": "XOM", "legal_name": "Exxon Mobil Corporation", "exchange": "NYSE", "sector": "Energy", "industry": "Oil & Gas Integrated", "market_cap_bucket": "mega"},
    {"ticker": "CVX", "legal_name": "Chevron Corporation", "exchange": "NYSE", "sector": "Energy", "industry": "Oil & Gas Integrated", "market_cap_bucket": "mega"},
    {"ticker": "COP", "legal_name": "ConocoPhillips", "exchange": "NYSE", "sector": "Energy", "industry": "Oil & Gas E&P", "market_cap_bucket": "large"},
    {"ticker": "SLB", "legal_name": "Schlumberger Limited", "exchange": "NYSE", "sector": "Energy", "industry": "Oil & Gas Services", "market_cap_bucket": "large"},
    # Communication Services
    {"ticker": "NFLX", "legal_name": "Netflix Inc.", "exchange": "NASDAQ", "sector": "Communication Services", "industry": "Entertainment", "market_cap_bucket": "mega"},
    {"ticker": "DIS", "legal_name": "The Walt Disney Company", "exchange": "NYSE", "sector": "Communication Services", "industry": "Entertainment", "market_cap_bucket": "large"},
    {"ticker": "CMCSA", "legal_name": "Comcast Corporation", "exchange": "NASDAQ", "sector": "Communication Services", "industry": "Telecom", "market_cap_bucket": "large"},
    {"ticker": "T", "legal_name": "AT&T Inc.", "exchange": "NYSE", "sector": "Communication Services", "industry": "Telecom", "market_cap_bucket": "large"},
    # Industrials
    {"ticker": "CAT", "legal_name": "Caterpillar Inc.", "exchange": "NYSE", "sector": "Industrials", "industry": "Farm & Heavy Equipment", "market_cap_bucket": "large"},
    {"ticker": "BA", "legal_name": "The Boeing Company", "exchange": "NYSE", "sector": "Industrials", "industry": "Aerospace & Defense", "market_cap_bucket": "large"},
    {"ticker": "UPS", "legal_name": "United Parcel Service Inc.", "exchange": "NYSE", "sector": "Industrials", "industry": "Logistics", "market_cap_bucket": "large"},
    {"ticker": "HON", "legal_name": "Honeywell International Inc.", "exchange": "NASDAQ", "sector": "Industrials", "industry": "Conglomerates", "market_cap_bucket": "large"},
    # Consumer Defensive
    {"ticker": "PG", "legal_name": "Procter & Gamble Company", "exchange": "NYSE", "sector": "Consumer Defensive", "industry": "Household Products", "market_cap_bucket": "mega"},
    {"ticker": "KO", "legal_name": "The Coca-Cola Company", "exchange": "NYSE", "sector": "Consumer Defensive", "industry": "Beverages", "market_cap_bucket": "mega"},
    {"ticker": "PEP", "legal_name": "PepsiCo Inc.", "exchange": "NASDAQ", "sector": "Consumer Defensive", "industry": "Beverages", "market_cap_bucket": "mega"},
    {"ticker": "WMT", "legal_name": "Walmart Inc.", "exchange": "NYSE", "sector": "Consumer Defensive", "industry": "Discount Stores", "market_cap_bucket": "mega"},
    {"ticker": "COST", "legal_name": "Costco Wholesale Corporation", "exchange": "NASDAQ", "sector": "Consumer Defensive", "industry": "Discount Stores", "market_cap_bucket": "mega"},
    # Real Estate / Utilities
    {"ticker": "AMT", "legal_name": "American Tower Corporation", "exchange": "NYSE", "sector": "Real Estate", "industry": "REIT - Specialty", "market_cap_bucket": "large"},
    {"ticker": "NEE", "legal_name": "NextEra Energy Inc.", "exchange": "NYSE", "sector": "Utilities", "industry": "Utilities - Renewable", "market_cap_bucket": "large"},
]
# --- Aliases ---
@@ -40,18 +88,102 @@ ALIASES = {
"NVDA": [("NVIDIA", "brand"), ("GeForce", "product"), ("CUDA", "product")],
"AMZN": [("Amazon", "brand"), ("AWS", "product"), ("Prime", "product")],
"GOOGL": [("Google", "brand"), ("Alphabet", "legal_name"), ("YouTube", "product")],
"META": [("Facebook", "brand"), ("Instagram", "product"), ("WhatsApp", "product")],
"JPM": [("JPMorgan", "brand"), ("Chase", "brand")],
"JNJ": [("J&J", "brand")],
"XOM": [("Exxon", "brand"), ("ExxonMobil", "brand")],
"TSLA": [("Tesla", "brand")],
"META": [("Facebook", "brand"), ("Instagram", "product"), ("WhatsApp", "product")],
"NFLX": [("Netflix", "brand")],
"DIS": [("Disney", "brand"), ("Disney+", "product")],
"V": [("Visa", "brand")],
"MA": [("Mastercard", "brand")],
"KO": [("Coca-Cola", "brand"), ("Coke", "brand")],
"PEP": [("Pepsi", "brand"), ("PepsiCo", "brand")],
"BA": [("Boeing", "brand")],
"WMT": [("Walmart", "brand")],
"COST": [("Costco", "brand")],
"CRM": [("Salesforce", "brand")],
"ORCL": [("Oracle", "brand")],
"ADBE": [("Adobe", "brand"), ("Photoshop", "product")],
"AMD": [("AMD", "brand"), ("Ryzen", "product")],
"INTC": [("Intel", "brand")],
"BAC": [("Bank of America", "brand"), ("BofA", "brand")],
"GS": [("Goldman Sachs", "brand"), ("Goldman", "brand")],
"UNH": [("UnitedHealth", "brand"), ("Optum", "product")],
"LLY": [("Eli Lilly", "brand"), ("Lilly", "brand")],
"PFE": [("Pfizer", "brand")],
"CVX": [("Chevron", "brand")],
"PG": [("P&G", "brand"), ("Procter & Gamble", "brand")],
"HD": [("Home Depot", "brand")],
"NKE": [("Nike", "brand")],
"MCD": [("McDonald's", "brand")],
"SBUX": [("Starbucks", "brand")],
}
# --- Source configs per company ---
# Polygon.io for market data and news (matches PolygonMarketAdapter and PolygonNewsAdapter)
# SEC EDGAR for filings (matches SECEdgarAdapter)
# Alpaca for paper trading (matches AlpacaBrokerAdapter)
# --- Competitor Relationships ---
# (ticker_a, ticker_b, relationship_type, strength)
# Hand-curated pairwise relationships between seeded companies.
#
# Tuple layout: (ticker_a, ticker_b, relationship_type, strength).
# relationship_type values used here: "direct_rival", "overlapping_products",
# "same_sector", "supply_chain_adjacent". Strengths in this list span 0.40-0.90.
#
# seed() resolves both tickers to company ids, skips pairs it cannot resolve,
# and stores each pair once as bidirectional, so list every pair only once
# regardless of ticker order.
COMPETITOR_RELATIONSHIPS: list[tuple[str, str, str, float]] = [
    # --- Big-tech platforms and enterprise software ---
    ("AAPL", "MSFT", "direct_rival", 0.75),
    ("AAPL", "GOOGL", "overlapping_products", 0.60),
    ("AAPL", "META", "overlapping_products", 0.40),
    ("GOOGL", "META", "direct_rival", 0.85),
    ("MSFT", "GOOGL", "overlapping_products", 0.70),
    ("MSFT", "AMZN", "overlapping_products", 0.65),
    ("MSFT", "ORCL", "direct_rival", 0.60),
    ("MSFT", "CRM", "overlapping_products", 0.55),
    ("MSFT", "ADBE", "overlapping_products", 0.40),
    ("CRM", "ORCL", "direct_rival", 0.70),
    ("CRM", "ADBE", "overlapping_products", 0.45),

    # --- Chip makers ---
    ("NVDA", "AMD", "direct_rival", 0.80),
    ("NVDA", "INTC", "direct_rival", 0.65),
    ("NVDA", "AVGO", "same_sector", 0.50),
    ("AMD", "INTC", "direct_rival", 0.85),
    ("INTC", "AVGO", "same_sector", 0.40),

    # --- Chip suppliers -> their large customers ---
    ("NVDA", "AAPL", "supply_chain_adjacent", 0.50),
    ("NVDA", "MSFT", "supply_chain_adjacent", 0.60),
    ("NVDA", "TSLA", "supply_chain_adjacent", 0.45),
    ("NVDA", "META", "supply_chain_adjacent", 0.50),
    ("AVGO", "AAPL", "supply_chain_adjacent", 0.55),

    # --- Cloud ---
    ("AMZN", "GOOGL", "overlapping_products", 0.55),

    # --- Banks, capital markets, card networks ---
    ("JPM", "BAC", "direct_rival", 0.80),
    ("JPM", "GS", "overlapping_products", 0.60),
    ("GS", "MS", "direct_rival", 0.85),
    ("V", "MA", "direct_rival", 0.90),

    # --- Pharma ---
    ("JNJ", "PFE", "direct_rival", 0.65),
    ("JNJ", "ABBV", "direct_rival", 0.60),
    ("JNJ", "MRK", "direct_rival", 0.60),
    ("LLY", "PFE", "direct_rival", 0.70),
    ("LLY", "ABBV", "direct_rival", 0.65),
    ("LLY", "MRK", "direct_rival", 0.65),
    ("PFE", "MRK", "direct_rival", 0.75),
    ("PFE", "ABBV", "direct_rival", 0.70),
    ("ABBV", "MRK", "direct_rival", 0.70),

    # --- Oil & gas ---
    ("XOM", "CVX", "direct_rival", 0.85),
    ("XOM", "COP", "same_sector", 0.55),
    ("CVX", "COP", "same_sector", 0.55),

    # --- Streaming / entertainment ---
    ("NFLX", "DIS", "direct_rival", 0.75),
    ("NFLX", "AMZN", "overlapping_products", 0.50),

    # --- Telecom ---
    ("CMCSA", "T", "direct_rival", 0.70),

    # --- Beverages ---
    ("KO", "PEP", "direct_rival", 0.90),

    # --- Restaurants ---
    ("MCD", "SBUX", "same_sector", 0.45),

    # --- Retail ---
    ("WMT", "COST", "direct_rival", 0.70),
    ("WMT", "AMZN", "overlapping_products", 0.60),
    ("COST", "AMZN", "overlapping_products", 0.40),
]
# --- Source configs per company ---
SOURCES_PER_COMPANY = [
{
"source_type": "market_api",
@@ -85,7 +217,6 @@ SOURCES_PER_COMPANY = [
},
]
# Broker source — one per account, not per company
BROKER_SOURCE = {
"source_type": "broker",
"source_name": "Alpaca Paper",
@@ -96,10 +227,24 @@ BROKER_SOURCE = {
},
}
# Macro news source (global, not company-specific)
# Single market-wide news feed. seed() inserts it once, attached to the first
# seeded company, because the sources table requires a company_id.
MACRO_NEWS_SOURCE = {
    "source_type": "macro_news",
    "source_name": "Polygon Global News",
    "credibility_score": 0.7,
    # Stored as JSON in sources.config (seed() serialises it with json.dumps).
    # NOTE(review): the keys below are presumably read by the Polygon news
    # adapter — confirm against that adapter before renaming any of them.
    "config": {
        "provider": "polygon",
        "url": "https://api.polygon.io/v2/reference/news",
        "params": {"order": "desc"},
        "limit": 50,
        "results_key": "results",
    },
}
async def seed(pool: asyncpg.Pool) -> None:
"""Insert seed data. Uses upsert for companies, skips existing aliases/sources."""
company_ids = {}
company_ids: dict[str, str] = {}
# Companies — upsert on (ticker, exchange)
for c in COMPANIES:
@@ -116,8 +261,8 @@ async def seed(pool: asyncpg.Pool) -> None:
c["ticker"], c["legal_name"], c["exchange"],
c["sector"], c["industry"], c["market_cap_bucket"],
)
company_ids[row["ticker"]] = row["id"]
logger.info(f"Company: {row['ticker']} -> {row['id']}")
company_ids[row["ticker"]] = str(row["id"])
logger.info("Company: %s -> %s", row["ticker"], row["id"])
# Aliases
for ticker, aliases in ALIASES.items():
@@ -135,7 +280,7 @@ async def seed(pool: asyncpg.Pool) -> None:
# Watchlist
wl = await pool.fetchrow(
"""INSERT INTO watchlists (name, description)
VALUES ('Starter 10', 'Initial tracked watchlist — 10 diverse mega/large-cap symbols')
VALUES ('Starter 50', 'Initial tracked watchlist — 50 diverse mega/large-cap symbols')
ON CONFLICT (name) DO UPDATE SET description = EXCLUDED.description
RETURNING id""",
)
@@ -145,9 +290,9 @@ async def seed(pool: asyncpg.Pool) -> None:
"INSERT INTO watchlist_members (watchlist_id, company_id) VALUES ($1, $2) ON CONFLICT DO NOTHING",
wl_id, cid,
)
logger.info(f"Watchlist 'Starter 10' -> {wl_id}")
logger.info("Watchlist 'Starter 50' -> %s", wl_id)
# Sources per company — check for existing before inserting
# Sources per company
for ticker, cid in company_ids.items():
existing = await pool.fetch(
"SELECT source_type, source_name FROM sources WHERE company_id = $1",
@@ -158,7 +303,6 @@ async def seed(pool: asyncpg.Pool) -> None:
for src in SOURCES_PER_COMPANY:
key = (src["source_type"], src["source_name"])
if key in existing_set:
logger.debug(f"Source {key} already exists for {ticker}, skipping")
continue
await pool.execute(
"""INSERT INTO sources (company_id, source_type, source_name, config, credibility_score)
@@ -167,7 +311,7 @@ async def seed(pool: asyncpg.Pool) -> None:
json.dumps(src["config"]), src["credibility_score"],
)
# Broker source only for the first company (account-level)
# Broker source only for the first company
if ticker == COMPANIES[0]["ticker"]:
bkey = (BROKER_SOURCE["source_type"], BROKER_SOURCE["source_name"])
if bkey not in existing_set:
@@ -177,11 +321,49 @@ async def seed(pool: asyncpg.Pool) -> None:
cid, BROKER_SOURCE["source_type"], BROKER_SOURCE["source_name"],
json.dumps(BROKER_SOURCE["config"]), BROKER_SOURCE["credibility_score"],
)
# Macro news source on the first company (global, but needs a company_id FK)
if ticker == COMPANIES[0]["ticker"]:
mkey = (MACRO_NEWS_SOURCE["source_type"], MACRO_NEWS_SOURCE["source_name"])
if mkey not in existing_set:
await pool.execute(
"""INSERT INTO sources (company_id, source_type, source_name, config, credibility_score)
VALUES ($1, $2, $3, $4::jsonb, $5)""",
cid, MACRO_NEWS_SOURCE["source_type"], MACRO_NEWS_SOURCE["source_name"],
json.dumps(MACRO_NEWS_SOURCE["config"]), MACRO_NEWS_SOURCE["credibility_score"],
)
logger.info("Sources seeded")
# Competitor relationships
rel_count = 0
for ticker_a, ticker_b, rel_type, strength in COMPETITOR_RELATIONSHIPS:
cid_a = company_ids.get(ticker_a)
cid_b = company_ids.get(ticker_b)
if not cid_a or not cid_b:
logger.warning("Skipping relationship %s-%s: company not found", ticker_a, ticker_b)
continue
# Use LEAST/GREATEST ordering for the unique index
a_id = min(cid_a, cid_b)
b_id = max(cid_a, cid_b)
await pool.execute(
"""INSERT INTO competitor_relationships
(company_a_id, company_b_id, relationship_type, strength, bidirectional, source)
VALUES ($1, $2, $3, $4, TRUE, 'manual')
ON CONFLICT (LEAST(company_a_id, company_b_id), GREATEST(company_a_id, company_b_id))
WHERE active = TRUE
DO UPDATE SET strength = EXCLUDED.strength, relationship_type = EXCLUDED.relationship_type, updated_at = NOW()""",
a_id, b_id, rel_type, strength,
)
rel_count += 1
logger.info("Competitor relationships seeded: %d", rel_count)
total = await pool.fetchval("SELECT count(*) FROM companies")
sources_total = await pool.fetchval("SELECT count(*) FROM sources")
logger.info(f"Seed complete: {total} companies, {sources_total} sources, watchlist with {len(company_ids)} members")
rels_total = await pool.fetchval("SELECT count(*) FROM competitor_relationships WHERE active = TRUE")
logger.info(
"Seed complete: %d companies, %d sources, %d competitor relationships, watchlist with %d members",
total, sources_total, rels_total, len(company_ids),
)
async def main() -> None: