phase 17: fix SEC EDGAR 403 — use descriptive User-Agent with contact email per fair access policy
This commit is contained in:
@@ -0,0 +1,96 @@
|
|||||||
|
"""Backfill SEC EDGAR filing URLs from raw MinIO payloads."""
|
||||||
|
import asyncio
|
||||||
|
import json
|
||||||
|
import os
|
||||||
|
from datetime import datetime
|
||||||
|
|
||||||
|
import asyncpg
|
||||||
|
from minio import Minio
|
||||||
|
|
||||||
|
|
||||||
|
_RAW_FILINGS_BUCKET = "stonks-raw-filings"
# Attachment types that never correspond to a filing document row.
_SKIPPED_FILE_TYPES = ("XML", "GRAPHIC", "ZIP", "EXCEL")


def _read_json_object(mc, bucket, object_name):
    """Download one object from MinIO and decode its body as JSON.

    The HTTP response is always closed and its connection released back to
    the client's pool, even when reading or decoding fails.
    """
    response = mc.get_object(bucket, object_name)
    try:
        return json.loads(response.read())
    finally:
        response.close()
        response.release_conn()


def _iter_title_entries(payload):
    """Yield ``(lookup_key, {"url", "date"})`` pairs for one search payload.

    Each accession number (``adsh``) is used at most once per payload, and
    hits whose ``file_type`` is in ``_SKIPPED_FILE_TYPES`` are ignored.
    For every accepted hit the composed title is yielded, preceded by the
    bare ``file_description`` when one is present.
    """
    seen_adsh = set()
    for hit in payload.get("hits", {}).get("hits", []):
        src = hit.get("_source", {})
        adsh = src.get("adsh", "")
        ciks = src.get("ciks", [])
        if not adsh or not ciks or adsh in seen_adsh:
            continue
        if src.get("file_type", "") in _SKIPPED_FILE_TYPES:
            continue
        seen_adsh.add(adsh)

        # The archive path uses the CIK without leading zeros and the
        # accession number without dashes.
        cik = ciks[0].lstrip("0")
        adsh_nodash = adsh.replace("-", "")
        url = f"https://www.sec.gov/Archives/edgar/data/{cik}/{adsh_nodash}/{adsh}-index.htm"

        form = src.get("form", "")
        names = src.get("display_names", [])
        entity_name = names[0].split("(CIK")[0].strip() if names else ""
        file_desc = src.get("file_description", "")
        title = f"{form}: {entity_name}" + (f" - {file_desc}" if file_desc else "")
        file_date = src.get("file_date", "")

        # Also allow matching by file_description alone.
        if file_desc:
            yield file_desc, {"url": url, "date": file_date}
        yield title, {"url": url, "date": file_date}


def _parse_filing_date(file_date):
    """Return midnight UTC of ``file_date`` (``YYYY-MM-DD``), or ``None``.

    An empty or unparseable value yields ``None`` so the caller keeps the
    existing ``published_at``; unparseable values are reported, not hidden.
    """
    if not file_date:
        return None
    try:
        return datetime.fromisoformat(f"{file_date}T00:00:00+00:00")
    except ValueError:
        print(f" Ignoring unparseable filing date {file_date!r}")
        return None


async def main():
    """Backfill ``documents.url`` for filing rows from raw MinIO payloads.

    Reads every object in the raw-filings bucket, builds a title -> URL map,
    then updates each ``filings_api`` document that has no URL and whose
    title matches a key in that map. ``published_at`` is overwritten only
    when the payload carries a parseable filing date.

    Connection settings come from the ``POSTGRES_*`` and ``MINIO_*``
    environment variables; a missing variable raises ``KeyError``.
    """
    pool = await asyncpg.create_pool(
        host=os.environ["POSTGRES_HOST"],
        port=int(os.environ["POSTGRES_PORT"]),
        database=os.environ["POSTGRES_DB"],
        user=os.environ["POSTGRES_USER"],
        password=os.environ["POSTGRES_PASSWORD"],
    )
    try:
        mc = Minio(
            os.environ["MINIO_ENDPOINT"],
            access_key=os.environ["MINIO_ACCESS_KEY"],
            secret_key=os.environ["MINIO_SECRET_KEY"],
            secure=False,
        )

        # Build URL map from raw filings data
        objs = list(mc.list_objects(_RAW_FILINGS_BUCKET, recursive=True))
        print(f"Processing {len(objs)} raw filing payloads")

        # Map title -> url for matching
        title_url_map = {}
        for obj in objs:
            try:
                payload = _read_json_object(mc, _RAW_FILINGS_BUCKET, obj.object_name)
                for key, entry in _iter_title_entries(payload):
                    title_url_map[key] = entry
            except Exception as e:
                # Best effort: one unreadable payload must not abort the run.
                print(f" Error processing {obj.object_name}: {e}")

        print(f"Built {len(title_url_map)} title->URL mappings")

        # Get filing docs without URLs
        rows = await pool.fetch(
            "SELECT id, title FROM documents "
            "WHERE source_type = 'filings_api' AND (url IS NULL OR url = '')"
        )
        print(f"{len(rows)} filing docs missing URLs")

        updated = 0
        for row in rows:
            match = title_url_map.get(row["title"] or "")
            if not match or not match["url"]:
                continue
            pub = _parse_filing_date(match["date"])
            await pool.execute(
                "UPDATE documents SET url = $1, published_at = COALESCE($2, published_at) WHERE id = $3",
                match["url"], pub, row["id"],
            )
            updated += 1

        print(f"Updated {updated} filing docs with URLs")
    finally:
        # Release database connections even if MinIO or a query fails.
        await pool.close()
|
||||||
|
|
||||||
|
|
||||||
|
# Run the backfill only when executed as a script, so importing this module
# (e.g. from tooling or tests) does not open database or MinIO connections.
if __name__ == "__main__":
    asyncio.run(main())
|
||||||
@@ -0,0 +1,8 @@
|
|||||||
|
import redis, os
|
||||||
|
# Maintenance script: remove every "stonks:dedupe:*" key from Redis db 0.
#
# Connect with discrete parameters instead of an interpolated redis:// URL so
# a password containing URL-reserved characters (@, :, /, #, ...) cannot
# corrupt the connection string. An unset or empty REDIS_PASSWORD means no
# authentication.
r = redis.Redis(
    host=os.environ["REDIS_HOST"],
    port=int(os.environ["REDIS_PORT"]),
    db=0,
    password=os.environ.get("REDIS_PASSWORD") or None,
)

# Upper bound on keys per DEL command, so a large keyspace does not produce a
# single oversized request.
_DELETE_BATCH_SIZE = 1000

keys = list(r.scan_iter("stonks:dedupe:*"))
if keys:
    for start in range(0, len(keys), _DELETE_BATCH_SIZE):
        r.delete(*keys[start:start + _DELETE_BATCH_SIZE])
    print(f"Cleared {len(keys)} dedupe keys")
else:
    print("No dedupe keys")
|
||||||
@@ -45,9 +45,14 @@ async def fetch_html(url: str) -> Optional[str]:
|
|||||||
"""Fetch article HTML for scraping."""
|
"""Fetch article HTML for scraping."""
|
||||||
if not url:
|
if not url:
|
||||||
return None
|
return None
|
||||||
|
# SEC EDGAR requires a descriptive User-Agent with contact email per fair access policy
|
||||||
|
if "sec.gov" in url:
|
||||||
|
ua = "StonksOracle/1.0 (stonks-oracle-bot; contact@celestium.life)"
|
||||||
|
else:
|
||||||
|
ua = "StonksOracle/1.0"
|
||||||
async with httpx.AsyncClient(timeout=30, follow_redirects=True) as client:
|
async with httpx.AsyncClient(timeout=30, follow_redirects=True) as client:
|
||||||
try:
|
try:
|
||||||
resp = await client.get(url, headers={"User-Agent": "StonksOracle/1.0"})
|
resp = await client.get(url, headers={"User-Agent": ua, "Accept-Encoding": "gzip, deflate"})
|
||||||
resp.raise_for_status()
|
resp.raise_for_status()
|
||||||
return resp.text
|
return resp.text
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
|
|||||||
Reference in New Issue
Block a user