Files
stonks-oracle/scripts/backfill_filing_urls.py
T

97 lines
3.3 KiB
Python

"""Backfill SEC EDGAR filing URLs from raw MinIO payloads."""
import asyncio
import json
import os
from datetime import datetime
import asyncpg
from minio import Minio
# Bucket holding the raw search payloads that are scanned for filing hits.
_RAW_FILINGS_BUCKET = "stonks-raw-filings"

# Hits with one of these file types are ignored when building the title map.
_SKIPPED_FILE_TYPES = ("XML", "GRAPHIC", "ZIP", "EXCEL")


def _read_json_object(mc, bucket, object_name):
    """Download one object and decode it as JSON.

    The HTTP response is always closed and its connection released back to
    the client's pool, even when reading or decoding fails.
    """
    response = mc.get_object(bucket, object_name)
    try:
        return json.loads(response.read())
    finally:
        response.close()
        response.release_conn()


def _index_payload(payload, title_url_map):
    """Add ``key -> {"url", "date"}`` entries for each usable hit in *payload*.

    *title_url_map* is mutated in place so that entries collected before an
    unexpected error in a later hit are kept. An existing key is overwritten
    by a later hit that produces the same key.
    """
    seen_adsh = set()
    for hit in payload.get("hits", {}).get("hits", []):
        src = hit.get("_source", {})
        adsh = src.get("adsh", "")
        ciks = src.get("ciks", [])
        if not adsh or not ciks or adsh in seen_adsh:
            continue
        # A skipped file type does not mark the accession as seen, so a later
        # hit for the same adsh with another file type can still be indexed.
        if src.get("file_type", "") in _SKIPPED_FILE_TYPES:
            continue
        seen_adsh.add(adsh)

        cik = ciks[0].lstrip("0")
        adsh_nodash = adsh.replace("-", "")
        url = (
            "https://www.sec.gov/Archives/edgar/data/"
            f"{cik}/{adsh_nodash}/{adsh}-index.htm"
        )

        form = src.get("form", "")
        names = src.get("display_names", [])
        entity_name = names[0].split("(CIK")[0].strip() if names else ""
        file_desc = src.get("file_description", "")
        title = f"{form}: {entity_name}"
        if file_desc:
            title += f" - {file_desc}"

        entry = {"url": url, "date": src.get("file_date", "")}
        # Also allow matching by file_description alone.
        if file_desc:
            title_url_map[file_desc] = entry
        title_url_map[title] = entry


def _parse_filing_date(raw_date):
    """Return *raw_date* as a midnight-UTC datetime, or None if unusable.

    The value is expected to be a date-only ISO string (a ``T00:00:00+00:00``
    suffix is appended before parsing). Empty or unparseable values yield
    None so the caller leaves ``published_at`` unchanged.
    """
    if not raw_date:
        return None
    try:
        return datetime.fromisoformat(f"{raw_date}T00:00:00+00:00")
    except ValueError:
        print(f"  Ignoring unparseable file_date {raw_date!r}")
        return None


async def main():
    """Backfill ``documents.url`` for filing rows from raw MinIO payloads.

    Every object in the raw-filings bucket is read and each usable hit is
    turned into an EDGAR index URL, keyed by a reconstructed title and by
    the file description. Rows in ``documents`` with source_type
    'filings_api' and a NULL/empty url are then matched by exact title;
    matches get their url set and, when the filing date parses,
    their published_at replaced.

    Connection settings are read from the POSTGRES_* and MINIO_* environment
    variables; a missing variable raises KeyError.
    """
    pool = await asyncpg.create_pool(
        host=os.environ["POSTGRES_HOST"],
        port=int(os.environ["POSTGRES_PORT"]),
        database=os.environ["POSTGRES_DB"],
        user=os.environ["POSTGRES_USER"],
        password=os.environ["POSTGRES_PASSWORD"],
    )
    # Close the pool on every exit path, not only after a successful run.
    try:
        mc = Minio(
            os.environ["MINIO_ENDPOINT"],
            access_key=os.environ["MINIO_ACCESS_KEY"],
            secret_key=os.environ["MINIO_SECRET_KEY"],
            secure=False,
        )

        # Build URL map from raw filings data. The listing is materialized
        # so the total can be reported before processing starts.
        objs = list(mc.list_objects(_RAW_FILINGS_BUCKET, recursive=True))
        print(f"Processing {len(objs)} raw filing payloads")

        # Map title -> url for matching
        title_url_map = {}
        for obj in objs:
            # Best effort: one bad payload must not abort the whole backfill.
            try:
                payload = _read_json_object(
                    mc, _RAW_FILINGS_BUCKET, obj.object_name
                )
                _index_payload(payload, title_url_map)
            except Exception as e:
                print(f"  Error processing {obj.object_name}: {e}")
        print(f"Built {len(title_url_map)} title->URL mappings")

        # Get filing docs without URLs
        rows = await pool.fetch(
            "SELECT id, title FROM documents "
            "WHERE source_type = 'filings_api' AND (url IS NULL OR url = '')"
        )
        print(f"{len(rows)} filing docs missing URLs")

        updated = 0
        for row in rows:
            match = title_url_map.get(row["title"] or "")
            if not match or not match["url"]:
                continue
            # COALESCE keeps the existing published_at when no date parsed.
            await pool.execute(
                "UPDATE documents SET url = $1, published_at = COALESCE($2, published_at) WHERE id = $3",
                match["url"], _parse_filing_date(match["date"]), row["id"],
            )
            updated += 1
        print(f"Updated {updated} filing docs with URLs")
    finally:
        await pool.close()
# Run the backfill only when executed as a script, so importing this module
# does not open database or object-store connections as a side effect.
if __name__ == "__main__":
    asyncio.run(main())