"""Backfill SEC EDGAR filing URLs from raw MinIO payloads.""" import asyncio import json import os from datetime import datetime import asyncpg from minio import Minio async def main(): pool = await asyncpg.create_pool( host=os.environ["POSTGRES_HOST"], port=int(os.environ["POSTGRES_PORT"]), database=os.environ["POSTGRES_DB"], user=os.environ["POSTGRES_USER"], password=os.environ["POSTGRES_PASSWORD"], ) mc = Minio( os.environ["MINIO_ENDPOINT"], access_key=os.environ["MINIO_ACCESS_KEY"], secret_key=os.environ["MINIO_SECRET_KEY"], secure=False, ) # Build URL map from raw filings data objs = list(mc.list_objects("stonks-raw-filings", recursive=True)) print(f"Processing {len(objs)} raw filing payloads") # Map title -> url for matching title_url_map = {} for obj in objs: try: data = json.loads(mc.get_object("stonks-raw-filings", obj.object_name).read()) hits = data.get("hits", {}).get("hits", []) seen_adsh = set() for hit in hits: src = hit.get("_source", {}) adsh = src.get("adsh", "") ciks = src.get("ciks", []) if not adsh or not ciks or adsh in seen_adsh: continue file_type = src.get("file_type", "") if file_type in ("XML", "GRAPHIC", "ZIP", "EXCEL"): continue seen_adsh.add(adsh) cik = ciks[0].lstrip("0") adsh_nodash = adsh.replace("-", "") url = f"https://www.sec.gov/Archives/edgar/data/{cik}/{adsh_nodash}/{adsh}-index.htm" form = src.get("form", "") names = src.get("display_names", []) entity_name = names[0].split("(CIK")[0].strip() if names else "" file_desc = src.get("file_description", "") title = f"{form}: {entity_name}" + (f" - {file_desc}" if file_desc else "") file_date = src.get("file_date", "") # Also try matching by file_description alone if file_desc: title_url_map[file_desc] = {"url": url, "date": file_date} title_url_map[title] = {"url": url, "date": file_date} except Exception as e: print(f" Error processing {obj.object_name}: {e}") print(f"Built {len(title_url_map)} title->URL mappings") # Get filing docs without URLs rows = await pool.fetch( "SELECT id, title FROM documents " "WHERE source_type = 'filings_api' AND (url IS NULL OR url = '')" ) print(f"{len(rows)} filing docs missing URLs") updated = 0 for row in rows: title = row["title"] or "" match = title_url_map.get(title) if match and match["url"]: pub = None if match["date"]: try: pub = datetime.fromisoformat(f"{match['date']}T00:00:00+00:00") except Exception: pass await pool.execute( "UPDATE documents SET url = $1, published_at = COALESCE($2, published_at) WHERE id = $3", match["url"], pub, row["id"], ) updated += 1 print(f"Updated {updated} filing docs with URLs") await pool.close() asyncio.run(main())