phase 17: fix SEC EDGAR 403 — use descriptive User-Agent with contact email per fair access policy

2026-04-12 09:50:29 -07:00
parent 311d76dc0b
commit 2e42310f07
3 changed files with 110 additions and 1 deletions
@@ -0,0 +1,96 @@
 """Backfill SEC EDGAR filing URLs from raw MinIO payloads."""
 import asyncio
 import json
 import os
 from datetime import datetime
 import asyncpg
 from minio import Minio
 def _index_hits(hits, title_url_map):
    """Add title -> {"url", "date"} entries for each usable search hit.

    Args:
        hits: Iterable of hit dicts; each is read through its "_source" dict
            (keys used: adsh, ciks, file_type, form, display_names,
            file_description, file_date).
        title_url_map: Dict mutated in place. Later hits overwrite earlier
            entries that share the same key.
    """
    # Only the first usable hit per accession number (adsh) is indexed.
    seen_adsh = set()
    for hit in hits:
        src = hit.get("_source", {})
        adsh = src.get("adsh", "")
        ciks = src.get("ciks", [])
        if not adsh or not ciks or adsh in seen_adsh:
            continue
        # These file types are skipped (presumably non-document attachments).
        if src.get("file_type", "") in ("XML", "GRAPHIC", "ZIP", "EXCEL"):
            continue
        seen_adsh.add(adsh)
        # The archive path uses the CIK without leading zeros and the
        # accession number without dashes as the directory name.
        cik = ciks[0].lstrip("0")
        adsh_nodash = adsh.replace("-", "")
        url = f"https://www.sec.gov/Archives/edgar/data/{cik}/{adsh_nodash}/{adsh}-index.htm"
        form = src.get("form", "")
        names = src.get("display_names", [])
        entity_name = names[0].split("(CIK")[0].strip() if names else ""
        file_desc = src.get("file_description", "")
        title = f"{form}: {entity_name}" + (f" - {file_desc}" if file_desc else "")
        entry = {"url": url, "date": src.get("file_date", "")}
        # Also index by file_description alone so documents titled with just
        # the description can still be matched.
        if file_desc:
            title_url_map[file_desc] = dict(entry)
        title_url_map[title] = entry


 async def main():
    """Backfill url and published_at on filing documents from raw payloads.

    Reads every object in the "stonks-raw-filings" MinIO bucket, builds a
    title -> filing-index-URL map from the search hits they contain, then
    updates rows in `documents` (source_type = 'filings_api') whose url is
    NULL or empty and whose title matches a map key exactly.

    Connection settings come from the POSTGRES_* and MINIO_* environment
    variables; a missing variable raises KeyError.
    """
    bucket = "stonks-raw-filings"
    pool = await asyncpg.create_pool(
        host=os.environ["POSTGRES_HOST"],
        port=int(os.environ["POSTGRES_PORT"]),
        database=os.environ["POSTGRES_DB"],
        user=os.environ["POSTGRES_USER"],
        password=os.environ["POSTGRES_PASSWORD"],
    )
    # Ensure the pool is closed even if a later step raises.
    try:
        mc = Minio(
            os.environ["MINIO_ENDPOINT"],
            access_key=os.environ["MINIO_ACCESS_KEY"],
            secret_key=os.environ["MINIO_SECRET_KEY"],
            secure=False,
        )

        # Build URL map from raw filings data
        objs = list(mc.list_objects(bucket, recursive=True))
        print(f"Processing {len(objs)} raw filing payloads")

        # Map title -> url for matching
        title_url_map = {}
        for obj in objs:
            # Best effort: a bad payload is reported and skipped.
            try:
                response = mc.get_object(bucket, obj.object_name)
                try:
                    data = json.loads(response.read())
                finally:
                    # Return the HTTP connection to MinIO's pool; without
                    # this every payload leaks one connection.
                    response.close()
                    response.release_conn()
                _index_hits(data.get("hits", {}).get("hits", []), title_url_map)
            except Exception as e:
                print(f"  Error processing {obj.object_name}: {e}")
        print(f"Built {len(title_url_map)} title->URL mappings")

        # Get filing docs without URLs
        rows = await pool.fetch(
            "SELECT id, title FROM documents "
            "WHERE source_type = 'filings_api' AND (url IS NULL OR url = '')"
        )
        print(f"{len(rows)} filing docs missing URLs")

        updated = 0
        for row in rows:
            match = title_url_map.get(row["title"] or "")
            if not match or not match["url"]:
                continue
            pub = None
            if match["date"]:
                try:
                    # file_date is treated as a calendar date at midnight UTC.
                    pub = datetime.fromisoformat(f"{match['date']}T00:00:00+00:00")
                except ValueError:
                    # Still backfill the URL; COALESCE below keeps the
                    # existing published_at when pub is None.
                    print(f"  Unparseable file_date {match['date']!r} for doc {row['id']}")
            await pool.execute(
                "UPDATE documents SET url = $1, published_at = COALESCE($2, published_at) WHERE id = $3",
                match["url"], pub, row["id"],
            )
            updated += 1
        print(f"Updated {updated} filing docs with URLs")
    finally:
        await pool.close()
 # Script entry point: run the backfill coroutine to completion. There is no
 # __main__ guard, so this executes whenever the module is run or imported.
 asyncio.run(main())
@@ -0,0 +1,8 @@
 import redis, os
 # One-off maintenance script: delete every key matching "stonks:dedupe:*"
 # in Redis database 0.
 #
 # Connect with explicit keyword arguments rather than an interpolated URL:
 # a password containing "@", "/", ":", "#" or "%" would otherwise be
 # mis-parsed as part of the URL structure.
 r = redis.Redis(
    host=os.environ["REDIS_HOST"],
    port=int(os.environ["REDIS_PORT"]),
    db=0,
    # An unset or empty REDIS_PASSWORD means "connect without AUTH".
    password=os.environ.get("REDIS_PASSWORD") or None,
 )
 keys = list(r.scan_iter("stonks:dedupe:*"))
 if keys:
    # Delete in bounded batches so a very large key set is not sent as a
    # single enormous DEL command.
    for start in range(0, len(keys), 1000):
        r.delete(*keys[start:start + 1000])
    print(f"Cleared {len(keys)} dedupe keys")
 else:
    print("No dedupe keys")
@@ -45,9 +45,14 @@ async def fetch_html(url: str) -> Optional[str]:
    """Fetch article HTML for scraping."""
    if not url:
        return None
    # SEC EDGAR requires a descriptive User-Agent with contact email per fair access policy
    if "sec.gov" in url:
        ua = "StonksOracle/1.0 (stonks-oracle-bot; contact@celestium.life)"
    else:
        ua = "StonksOracle/1.0"
    async with httpx.AsyncClient(timeout=30, follow_redirects=True) as client:
        try:
-            resp = await client.get(url, headers={"User-Agent": "StonksOracle/1.0"})
+            resp = await client.get(url, headers={"User-Agent": ua, "Accept-Encoding": "gzip, deflate"})
            resp.raise_for_status()
            return resp.text
        except Exception as e: