phase 17: enrich SEC EDGAR filings with URLs, titles, dedupe by accession number, skip XML fragments

This commit is contained in:
Celes Renata
2026-04-12 09:42:12 -07:00
parent 28b3361833
commit 311d76dc0b
3 changed files with 96 additions and 7 deletions
+58 -7
View File
@@ -152,19 +152,70 @@ class SECEdgarAdapter(FilingsDataAdapter):
return url, params, headers
def _extract_items(self, data: dict[str, Any]) -> list[dict[str, Any]]:
"""Extract the filing hits from an EDGAR EFTS response.
"""Extract filing hits from EDGAR EFTS, enrich with fetchable URLs.
EFTS returns results under hits.hits as a list of objects,
each containing _source with fields like file_date, form_type,
entity_name, file_num, and period_of_report.
EFTS returns results under hits.hits. Each hit has _source with
adsh, ciks, form, file_type, file_description, and file_date.
We construct the SEC EDGAR document URL from these fields and
filter to primary filing documents (not XML fragments or exhibits).
"""
hits_wrapper = data.get("hits", {})
if not isinstance(hits_wrapper, dict):
return []
hits = hits_wrapper.get("hits", [])
if isinstance(hits, list):
return hits
return []
if not isinstance(hits, list):
return []
# Dedupe by adsh (accession number) — keep one item per filing
seen_adsh: set[str] = set()
items: list[dict[str, Any]] = []
for hit in hits:
src = hit.get("_source", {})
if not isinstance(src, dict):
continue
adsh = src.get("adsh", "")
if not adsh or adsh in seen_adsh:
continue
ciks = src.get("ciks", [])
if not ciks:
continue
# Skip XML data fragments and non-primary documents
file_type = src.get("file_type", "")
if file_type in ("XML", "GRAPHIC", "ZIP", "EXCEL"):
continue
seen_adsh.add(adsh)
# Build the filing index URL
cik = ciks[0].lstrip("0")
adsh_nodash = adsh.replace("-", "")
filing_index_url = (
f"https://www.sec.gov/Archives/edgar/data/{cik}/{adsh_nodash}/{adsh}-index.htm"
)
# Build a title from the metadata
form = src.get("form", "")
names = src.get("display_names", [])
entity_name = names[0].split("(CIK")[0].strip() if names else ""
file_date = src.get("file_date", "")
file_desc = src.get("file_description", "")
title = f"{form}: {entity_name}" + (f" - {file_desc}" if file_desc else "")
# Enrich the item with URL and structured fields
enriched = dict(src)
enriched["url"] = filing_index_url
enriched["article_url"] = filing_index_url # compat with news URL field
enriched["title"] = title
enriched["name"] = title
enriched["published_utc"] = f"{file_date}T00:00:00Z" if file_date else None
enriched["publisher"] = "SEC EDGAR"
items.append(enriched)
return items
def _total_hits(self, data: dict[str, Any]) -> int:
"""Extract total hit count from EFTS response."""