phase 17: enrich SEC EDGAR filings with URLs, titles, dedupe by accession number, skip XML fragments
This commit is contained in:
@@ -0,0 +1,27 @@
|
||||
# Debug script: print a small sample of filing hits stored in the raw-filings
# MinIO bucket, together with the SEC EDGAR index URL derived for each hit.
import itertools
import json
import os

from minio import Minio

BUCKET = "stonks-raw-filings"

# Endpoint and credentials come from the environment; a missing variable
# raises KeyError immediately instead of failing later with an opaque error.
mc = Minio(
    os.environ["MINIO_ENDPOINT"],
    access_key=os.environ["MINIO_ACCESS_KEY"],
    secret_key=os.environ["MINIO_SECRET_KEY"],
    secure=False,
)

# Only the first object is inspected, so take it lazily rather than
# materializing the whole recursive bucket listing in memory.
for obj in itertools.islice(mc.list_objects(BUCKET, recursive=True), 1):
    response = mc.get_object(BUCKET, obj.object_name)
    try:
        data = json.loads(response.read())
    finally:
        # Return the HTTP connection to the pool even if parsing fails.
        response.close()
        response.release_conn()

    # The payload nests the result list under hits.hits; missing keys yield
    # an empty sample rather than an exception.
    hits = data.get("hits", {}).get("hits", [])

    for h in hits[:5]:
        src = h.get("_source", {})
        adsh = src.get("adsh", "")
        ciks = src.get("ciks", [])
        form = src.get("form", "")
        names = src.get("display_names", [])
        file_desc = src.get("file_description", "")
        file_date = src.get("file_date", "")
        file_type = src.get("file_type", "")

        # Both the accession number and at least one CIK are needed to
        # build the index URL; hits lacking either are skipped.
        if adsh and ciks:
            # The archive path uses the CIK without leading zeros and the
            # accession number without dashes; the index file name keeps
            # the dashed accession number.
            cik = ciks[0].lstrip("0")
            adsh_nodash = adsh.replace("-", "")
            url = f"https://www.sec.gov/Archives/edgar/data/{cik}/{adsh_nodash}/{adsh}-index.htm"

            print(f"form={form} type={file_type} date={file_date}")
            print(f" names={names}")
            print(f" desc={file_desc}")
            print(f" index_url={url}")
            print()
|
||||
@@ -0,0 +1,11 @@
|
||||
# Debug script: report how many messages are pending on each pipeline queue
# and list any dead-letter queues that are not empty.
import os

import redis

QUEUES = [
    "ingestion",
    "parsing",
    "extraction",
    "aggregation",
    "recommendation",
    "lake_publish",
    "broker_orders",
]
DLQ_QUEUES = ["ingestion", "parsing", "extraction", "aggregation", "recommendation"]

# Pass connection settings as keyword arguments instead of interpolating them
# into a redis:// URL: a password containing '@', '/', '#' or '%' would
# otherwise be mis-parsed. An unset or empty REDIS_PASSWORD means no auth.
r = redis.Redis(
    host=os.environ["REDIS_HOST"],
    port=int(os.environ["REDIS_PORT"]),
    db=0,
    password=os.environ.get("REDIS_PASSWORD") or None,
)

for q in QUEUES:
    depth = r.llen(f"stonks:queue:{q}")
    print(f" {q:20} {depth:>4} pending")

# Check dead letter queues; only non-empty ones are reported.
for q in DLQ_QUEUES:
    depth = r.llen(f"stonks:dlq:{q}")
    if depth > 0:
        print(f" DLQ {q:16} {depth:>4} dead letters")
|
||||
Reference in New Issue
Block a user