diff --git a/scripts/backfill_filing_urls.py b/scripts/backfill_filing_urls.py
new file mode 100644
index 0000000..a530079
--- /dev/null
+++ b/scripts/backfill_filing_urls.py
@@ -0,0 +1,124 @@
+"""Backfill SEC EDGAR filing URLs from raw MinIO payloads."""
+import asyncio
+import json
+import os
+from datetime import datetime
+
+import asyncpg
+from minio import Minio
+
+RAW_FILINGS_BUCKET = "stonks-raw-filings"
+# File types skipped when choosing the hit that represents a filing.
+SKIPPED_FILE_TYPES = ("XML", "GRAPHIC", "ZIP", "EXCEL")
+
+
+def read_payload(mc, object_name):
+    """Download one raw payload from MinIO and return it JSON-decoded."""
+    resp = mc.get_object(RAW_FILINGS_BUCKET, object_name)
+    try:
+        return json.loads(resp.read())
+    finally:
+        # get_object() returns an open HTTP response; close it and release the
+        # connection explicitly so it goes back to the client's pool.
+        resp.close()
+        resp.release_conn()
+
+
+def collect_title_urls(data, title_url_map):
+    """Add title -> {"url", "date"} entries for the usable hits in *data*."""
+    seen_adsh = set()
+    for hit in data.get("hits", {}).get("hits", []):
+        src = hit.get("_source", {})
+        adsh = src.get("adsh", "")
+        ciks = src.get("ciks", [])
+        if not adsh or not ciks or adsh in seen_adsh:
+            continue
+        if src.get("file_type", "") in SKIPPED_FILE_TYPES:
+            continue
+        seen_adsh.add(adsh)
+
+        cik = ciks[0].lstrip("0")
+        adsh_nodash = adsh.replace("-", "")
+        url = f"https://www.sec.gov/Archives/edgar/data/{cik}/{adsh_nodash}/{adsh}-index.htm"
+
+        form = src.get("form", "")
+        names = src.get("display_names", [])
+        entity_name = names[0].split("(CIK")[0].strip() if names else ""
+        file_desc = src.get("file_description", "")
+        title = f"{form}: {entity_name}" + (f" - {file_desc}" if file_desc else "")
+        entry = {"url": url, "date": src.get("file_date", "")}
+
+        # Also try matching by file_description alone
+        if file_desc:
+            title_url_map[file_desc] = entry
+        title_url_map[title] = entry
+
+
+def parse_file_date(value):
+    """Return the file date as midnight UTC, or None if it is empty or malformed."""
+    if not value:
+        return None
+    try:
+        return datetime.fromisoformat(f"{value}T00:00:00+00:00")
+    except ValueError:
+        return None
+
+
+async def main():
+    """Set url, and published_at when known, on filings_api documents lacking a URL."""
+    pool = await asyncpg.create_pool(
+        host=os.environ["POSTGRES_HOST"],
+        port=int(os.environ["POSTGRES_PORT"]),
+        database=os.environ["POSTGRES_DB"],
+        user=os.environ["POSTGRES_USER"],
+        password=os.environ["POSTGRES_PASSWORD"],
+    )
+    try:
+        mc = Minio(
+            os.environ["MINIO_ENDPOINT"],
+            access_key=os.environ["MINIO_ACCESS_KEY"],
+            secret_key=os.environ["MINIO_SECRET_KEY"],
+            secure=False,
+        )
+
+        # Build URL map from raw filings data
+        objs = list(mc.list_objects(RAW_FILINGS_BUCKET, recursive=True))
+        print(f"Processing {len(objs)} raw filing payloads")
+
+        # Map title -> url for matching
+        title_url_map = {}
+        for obj in objs:
+            try:
+                collect_title_urls(read_payload(mc, obj.object_name), title_url_map)
+            except Exception as e:
+                # Best effort: one unreadable payload must not abort the backfill.
+                print(f" Error processing {obj.object_name}: {e}")
+
+        print(f"Built {len(title_url_map)} title->URL mappings")
+
+        # Get filing docs without URLs
+        rows = await pool.fetch(
+            "SELECT id, title FROM documents "
+            "WHERE source_type = 'filings_api' AND (url IS NULL OR url = '')"
+        )
+        print(f"{len(rows)} filing docs missing URLs")
+
+        updated = 0
+        for row in rows:
+            match = title_url_map.get(row["title"] or "")
+            if not match or not match["url"]:
+                continue
+            await pool.execute(
+                "UPDATE documents SET url = $1, published_at = COALESCE($2, published_at) WHERE id = $3",
+                match["url"], parse_file_date(match["date"]), row["id"],
+            )
+            updated += 1
+
+        print(f"Updated {updated} filing docs with URLs")
+    finally:
+        # Close the pool even when the scan or an UPDATE fails.
+        await pool.close()
+
+
+if __name__ == "__main__":
+    asyncio.run(main())
diff --git a/scripts/clear_dedupe.py b/scripts/clear_dedupe.py
new file mode 100644
index 0000000..9787639
--- /dev/null
+++ b/scripts/clear_dedupe.py
@@ -0,0 +1,35 @@
+"""Delete every stonks:dedupe:* key from Redis."""
+import os
+from itertools import islice
+
+import redis
+
+BATCH_SIZE = 1000
+
+
+def main():
+    """Remove the dedupe keys in batches and print how many were cleared."""
+    # Pass the credentials as arguments instead of formatting a redis:// URL, so
+    # a password containing '@', ':' or '/' cannot corrupt the connection string.
+    r = redis.Redis(
+        host=os.environ["REDIS_HOST"],
+        port=int(os.environ["REDIS_PORT"]),
+        password=os.environ.get("REDIS_PASSWORD") or None,
+        db=0,
+    )
+    keys = r.scan_iter("stonks:dedupe:*", count=BATCH_SIZE)
+    cleared = 0
+    while True:
+        # Bounded batches keep each DEL short; one DEL with every key can stall Redis.
+        batch = list(islice(keys, BATCH_SIZE))
+        if not batch:
+            break
+        cleared += r.delete(*batch)
+    if cleared:
+        print(f"Cleared {cleared} dedupe keys")
+    else:
+        print("No dedupe keys")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/services/parser/worker.py b/services/parser/worker.py
index 791ae8b..196f1d7 100644
--- a/services/parser/worker.py
+++ b/services/parser/worker.py
@@ -45,9 +45,14 @@ async def fetch_html(url: str) -> Optional[str]:
     """Fetch article HTML for scraping."""
     if not url:
         return None
+    # SEC EDGAR requires a descriptive User-Agent with contact email per fair access policy
+    if "sec.gov" in url:
+        ua = "StonksOracle/1.0 (stonks-oracle-bot; contact@celestium.life)"
+    else:
+        ua = "StonksOracle/1.0"
     async with httpx.AsyncClient(timeout=30, follow_redirects=True) as client:
         try:
-            resp = await client.get(url, headers={"User-Agent": "StonksOracle/1.0"})
+            resp = await client.get(url, headers={"User-Agent": ua, "Accept-Encoding": "gzip, deflate"})
             resp.raise_for_status()
             return resp.text
         except Exception as e: