"""Filings / Regulatory API adapter interface and concrete SEC EDGAR provider. The FilingsDataAdapter is the abstract interface for all filings data providers. SECEdgarAdapter is the first concrete implementation, targeting the SEC EDGAR full-text search system (EFTS) for company filings discovery. Requirements: 2.3, 2.5, 3.1, 3.2, 3.3 """ import hashlib import logging import time from abc import ABC from datetime import datetime, timezone from typing import Any import httpx from .base import AdapterResult, BaseAdapter logger = logging.getLogger("filings_adapter") class FilingsDataAdapter(BaseAdapter, ABC): """Abstract interface for filings / regulatory data providers. Subclasses implement fetch() for their specific filings API. source_type() is concrete here since all filings adapters share the same type. """ def source_type(self) -> str: return "filings_api" class SECEdgarAdapter(FilingsDataAdapter): """Concrete adapter for the SEC EDGAR full-text search system (EFTS). Supports: - Full-text search (/LATEST/search-index) for 8-K, 10-Q, 10-K, and other forms - Filtering by date range, form type, and entity The SEC EDGAR EFTS API is public and does not require an API key, but requires a descriptive User-Agent header per SEC fair-access policy. Config options: cik: Company CIK number (optional, narrows search) forms: Comma-separated form types to search (default "8-K,10-Q,10-K") start_date: Only filings on or after this date, YYYY-MM-DD (optional) end_date: Only filings on or before this date, YYYY-MM-DD (optional) query: Custom search query override (optional, replaces ticker-based query) """ SEARCH_ENDPOINT: str = "/LATEST/search-index" def __init__( self, base_url: str = "https://efts.sec.gov", user_agent: str = "StonksOracle/1.0 ([email])", ) -> None: self.base_url: str = base_url.rstrip("/") self.user_agent: str = user_agent async def fetch(self, ticker: str, config: dict[str, Any]) -> AdapterResult: """Fetch filings from SEC EDGAR EFTS for a given ticker. Args: ticker: The company ticker symbol. config: Source-specific configuration from the sources table. Returns: AdapterResult with raw payload, parsed filing items, and metadata. """ url, params, headers = self._build_request(ticker, config) async with httpx.AsyncClient(timeout=30) as client: t0 = time.monotonic() try: resp = await client.get(url, params=params, headers=headers) elapsed_ms = (time.monotonic() - t0) * 1000 resp.raise_for_status() raw = resp.content data = resp.json() content_hash = hashlib.sha256(raw).hexdigest() items = self._extract_items(data) return AdapterResult( source_type="filings_api", ticker=ticker, items=items, raw_payload=raw, content_hash=content_hash, fetched_at=datetime.now(timezone.utc), http_status=resp.status_code, response_time_ms=round(elapsed_ms, 1), metadata={ "provider": "sec_edgar", "results_count": len(items), "total_hits": self._total_hits(data), "query": params.get("q", ""), "forms": params.get("forms", ""), }, ) except httpx.HTTPStatusError as e: elapsed_ms = (time.monotonic() - t0) * 1000 logger.error("SEC EDGAR HTTP error for %s: %s", ticker, e) return self._error_result( ticker, str(e), elapsed_ms, http_status=e.response.status_code if e.response else None, raw=e.response.content if e.response else b"", ) except httpx.TimeoutException as e: elapsed_ms = (time.monotonic() - t0) * 1000 logger.error("SEC EDGAR timeout for %s: %s", ticker, e) return self._error_result(ticker, f"timeout: {e}", elapsed_ms) except Exception as e: elapsed_ms = (time.monotonic() - t0) * 1000 logger.error("SEC EDGAR fetch failed for %s: %s", ticker, e) return self._error_result(ticker, str(e), elapsed_ms) def _build_request( self, ticker: str, config: dict[str, Any] ) -> tuple[str, dict[str, str], dict[str, str]]: """Build the URL, query params, and headers for an EDGAR EFTS request.""" params: dict[str, str] = {} headers: dict[str, str] = {"User-Agent": self.user_agent} # Query: use custom override or default to ticker-based search query = config.get("query") if query: params["q"] = str(query) else: params["q"] = f'"{ticker}"' # Form types filter forms = config.get("forms", "8-K,10-Q,10-K") params["forms"] = str(forms) # Date range if config.get("start_date"): params["dateRange"] = "custom" params["startdt"] = str(config["start_date"]) if config.get("end_date"): params["dateRange"] = "custom" params["enddt"] = str(config["end_date"]) # CIK filter (entity-level narrowing) cik = config.get("cik") if cik: params["q"] = f'{params["q"]} AND cik:{cik}' url = f"{self.base_url}{self.SEARCH_ENDPOINT}" return url, params, headers def _extract_items(self, data: dict[str, Any]) -> list[dict[str, Any]]: """Extract filing hits from EDGAR EFTS, enrich with fetchable URLs. EFTS returns results under hits.hits. Each hit has _source with adsh, ciks, form, file_type, file_description, and file_date. We construct the SEC EDGAR document URL from these fields and filter to primary filing documents (not XML fragments or exhibits). """ hits_wrapper = data.get("hits", {}) if not isinstance(hits_wrapper, dict): return [] hits = hits_wrapper.get("hits", []) if not isinstance(hits, list): return [] # Dedupe by adsh (accession number) — keep one item per filing seen_adsh: set[str] = set() items: list[dict[str, Any]] = [] for hit in hits: src = hit.get("_source", {}) if not isinstance(src, dict): continue adsh = src.get("adsh", "") if not adsh or adsh in seen_adsh: continue ciks = src.get("ciks", []) if not ciks: continue # Skip XML data fragments and non-primary documents file_type = src.get("file_type", "") if file_type in ("XML", "GRAPHIC", "ZIP", "EXCEL"): continue seen_adsh.add(adsh) # Build the filing index URL cik = ciks[0].lstrip("0") adsh_nodash = adsh.replace("-", "") filing_index_url = ( f"https://www.sec.gov/Archives/edgar/data/{cik}/{adsh_nodash}/{adsh}-index.htm" ) # Build a title from the metadata form = src.get("form", "") names = src.get("display_names", []) entity_name = names[0].split("(CIK")[0].strip() if names else "" file_date = src.get("file_date", "") file_desc = src.get("file_description", "") title = f"{form}: {entity_name}" + (f" - {file_desc}" if file_desc else "") # Enrich the item with URL and structured fields enriched = dict(src) enriched["url"] = filing_index_url enriched["article_url"] = filing_index_url # compat with news URL field enriched["title"] = title enriched["name"] = title enriched["published_utc"] = f"{file_date}T00:00:00Z" if file_date else None enriched["publisher"] = "SEC EDGAR" items.append(enriched) return items def _total_hits(self, data: dict[str, Any]) -> int: """Extract total hit count from EFTS response.""" hits_wrapper = data.get("hits", {}) if not isinstance(hits_wrapper, dict): return 0 total = hits_wrapper.get("total", {}) if isinstance(total, dict): return int(total.get("value", 0)) if isinstance(total, int): return total return 0 def _error_result( self, ticker: str, error: str, elapsed_ms: float, http_status: int | None = None, raw: bytes = b"", ) -> AdapterResult: """Build an error AdapterResult for filings fetches.""" return AdapterResult( source_type="filings_api", ticker=ticker, items=[], raw_payload=raw, content_hash="", fetched_at=datetime.now(timezone.utc), error=error, http_status=http_status, response_time_ms=round(elapsed_ms, 1), metadata={"provider": "sec_edgar"}, )