phase 14-15: docker build validation and helm deployment

This commit is contained in:
Celes Renata
2026-04-11 11:59:45 -07:00
parent 7394d241c9
commit ce10afa034
179 changed files with 32559 additions and 576 deletions
+44
View File
@@ -1 +1,45 @@
# Ingestion Adapters
from .base import AdapterResult, BaseAdapter
from .resilient import ResilientAdapter, RetryConfig, RetryStats, compute_delay
from .broker_adapter import (
AccountInfo,
AlpacaBrokerAdapter,
BrokerDataAdapter,
OrderEventType,
OrderRequest,
OrderResponse,
OrderSide,
OrderStatus,
OrderType,
PositionInfo,
TradingMode,
)
from .filings_adapter import FilingsDataAdapter, SECEdgarAdapter
from .market_adapter import MarketDataAdapter, PolygonMarketAdapter
from .news_adapter import NewsDataAdapter, PolygonNewsAdapter
__all__ = [
"AccountInfo",
"AdapterResult",
"AlpacaBrokerAdapter",
"BaseAdapter",
"BrokerDataAdapter",
"FilingsDataAdapter",
"MarketDataAdapter",
"NewsDataAdapter",
"OrderEventType",
"OrderRequest",
"OrderResponse",
"OrderSide",
"OrderStatus",
"OrderType",
"PolygonMarketAdapter",
"PolygonNewsAdapter",
"PositionInfo",
"ResilientAdapter",
"RetryConfig",
"RetryStats",
"SECEdgarAdapter",
"TradingMode",
"compute_delay",
]
+63 -8
View File
@@ -1,29 +1,84 @@
"""Base adapter interface for all external API integrations."""
"""Base adapter interface for all external API integrations.
All ingestion adapters follow the same contract:
1. Fetch external payloads for a given ticker/source config.
2. Return a structured result with raw bytes, parsed items, and metadata.
3. The ingestion worker handles MinIO upload, PostgreSQL metadata, and downstream job emission.
Requirements: 2.1, 2.2, 2.3, 2.4, 2.5, 3.1, 3.2, 3.3, 3.4
"""
from abc import ABC, abstractmethod
from dataclasses import dataclass
from dataclasses import dataclass, field
from datetime import datetime
from typing import Any, Dict, List, Optional
from typing import Any
@dataclass
class AdapterResult:
"""Result of a single adapter fetch operation."""
source_type: str
ticker: str
items: List[Dict[str, Any]]
items: list[dict[str, Any]]
raw_payload: bytes
content_hash: str
fetched_at: datetime
error: Optional[str] = None
error: str | None = None
# HTTP metadata for observability
http_status: int | None = None
response_time_ms: float | None = None
# Additional metadata the adapter wants to pass downstream
metadata: dict[str, Any] = field(default_factory=dict)
@property
def ok(self) -> bool:
"""True if the fetch succeeded without error."""
return self.error is None and len(self.items) > 0
@property
def item_count(self) -> int:
return len(self.items)
class BaseAdapter(ABC):
"""Interface for all ingestion adapters."""
"""Interface for all ingestion adapters.
Subclasses implement fetch() for their specific API and source_type()
to identify the adapter class. The ingestion worker orchestrates
persistence and downstream job emission.
"""
@abstractmethod
async def fetch(self, ticker: str, config: Dict[str, Any]) -> AdapterResult:
"""Fetch data for a given ticker using source config."""
async def fetch(self, ticker: str, config: dict[str, Any]) -> AdapterResult:
"""Fetch data for a given ticker using source config.
Args:
ticker: The company ticker symbol.
config: Source-specific configuration from the sources table.
Returns:
AdapterResult with raw payload, parsed items, and metadata.
"""
...
@abstractmethod
def source_type(self) -> str:
"""Return the source type identifier for this adapter (e.g. 'market_api')."""
...
def bucket_name(self) -> str:
    """Derive the default MinIO bucket used for this adapter's raw artifacts.

    The name is ``stonks-raw-`` followed by the adapter's source type with
    every ``_api`` occurrence removed and the remaining underscores turned
    into hyphens. Subclasses whose bucket does not follow that pattern
    should override this method.
    """
    suffix = self.source_type().replace("_api", "")
    suffix = suffix.replace("_", "-")
    return f"stonks-raw-{suffix}"
def artifact_path(self, ticker: str, document_id: str, now: datetime) -> str:
    """Compose the MinIO object key under which a raw artifact is stored.

    Layout: {source_type}/{ticker}/{yyyy}/{mm}/{dd}/{document_id}/raw.json
    The key is relative; no leading slash is emitted.

    Args:
        ticker: Ticker symbol used as the second path segment.
        document_id: Identifier used as the directory holding ``raw.json``.
        now: Timestamp whose year/month/day form the date segments.
    """
    date_segment = now.strftime("%Y/%m/%d")
    return f"{self.source_type()}/{ticker}/{date_segment}/{document_id}/raw.json"
+558 -61
View File
@@ -1,9 +1,19 @@
"""Broker API adapter - paper/live trading, orders, positions, balances."""
"""Broker API adapter interface for paper trading and order events.
The BrokerDataAdapter is the abstract interface for all broker integrations.
AlpacaBrokerAdapter is the first concrete implementation, targeting the
Alpaca Markets REST API for paper and live trading.
Requirements: 2.4, 2.5, 8.1, 8.3, 8.5
"""
import hashlib
import logging
import time
import uuid
from datetime import datetime
from typing import Any, Dict, Optional
from abc import ABC, abstractmethod
from datetime import datetime, timezone
from enum import Enum
from typing import Any
import httpx
@@ -12,97 +22,584 @@ from .base import AdapterResult, BaseAdapter
logger = logging.getLogger("broker_adapter")
class BrokerAdapter(BaseAdapter):
"""Broker API adapter supporting paper and live modes."""
# --- Broker-specific enums ---
def __init__(self, api_key: str = "", api_secret: str = "", base_url: str = "", mode: str = "paper"):
self.api_key = api_key
self.api_secret = api_secret
self.base_url = base_url
self.mode = mode # paper | live
class OrderSide(str, Enum):
    """Direction of an order; each member is also its lowercase string value."""

    BUY = "buy"
    SELL = "sell"
class OrderType(str, Enum):
    """Order execution style; each member is also its lowercase string value."""

    MARKET = "market"
    LIMIT = "limit"
    STOP = "stop"
    STOP_LIMIT = "stop_limit"
class OrderStatus(str, Enum):
    """Lifecycle state of an order; each member is also its string value."""

    PENDING = "pending"
    SUBMITTED = "submitted"
    ACCEPTED = "accepted"
    PARTIALLY_FILLED = "partially_filled"
    FILLED = "filled"
    CANCELLED = "cancelled"
    REJECTED = "rejected"
    EXPIRED = "expired"
class TradingMode(str, Enum):
    """Whether an adapter trades against a paper or a live account."""

    PAPER = "paper"
    LIVE = "live"
class OrderEventType(str, Enum):
    """Kind of event recorded against an order; members are string values."""

    SUBMITTED = "submitted"
    ACCEPTED = "accepted"
    REJECTED = "rejected"
    FILL = "fill"
    PARTIAL_FILL = "partial_fill"
    CANCELLED = "cancelled"
    EXPIRED = "expired"
# --- Data structures ---
class OrderRequest:
    """An order that is about to be handed to a broker.

    Holds the ticker, side, quantity, order type, optional limit/stop
    prices, time-in-force and the idempotency key for the submission.
    """

    def __init__(
        self,
        ticker: str,
        side: OrderSide,
        quantity: float,
        order_type: OrderType = OrderType.MARKET,
        limit_price: float | None = None,
        stop_price: float | None = None,
        time_in_force: str = "day",
        idempotency_key: str | None = None,
    ) -> None:
        self.ticker, self.side, self.quantity = ticker, side, quantity
        self.order_type = order_type
        self.limit_price, self.stop_price = limit_price, stop_price
        self.time_in_force = time_in_force
        # A falsy key (None or "") is replaced by a fresh random UUID4 string.
        if idempotency_key:
            self.idempotency_key = idempotency_key
        else:
            self.idempotency_key = str(uuid.uuid4())

    def to_dict(self) -> dict[str, Any]:
        """Return a plain-dict form of the request for audit/persistence.

        Enum fields are emitted as their string values; ``limit_price`` and
        ``stop_price`` appear only when they are not None.
        """
        payload: dict[str, Any] = {
            "ticker": self.ticker,
            "side": self.side.value,
            "quantity": self.quantity,
            "order_type": self.order_type.value,
            "time_in_force": self.time_in_force,
            "idempotency_key": self.idempotency_key,
        }
        optional_prices = (
            ("limit_price", self.limit_price),
            ("stop_price", self.stop_price),
        )
        for key, price in optional_prices:
            if price is not None:
                payload[key] = price
        return payload
class OrderResponse:
    """What a broker reported back for an order operation.

    ``submitted_at`` defaults to the current UTC time and ``raw_response``
    to an empty dict when they are not supplied (or are falsy).
    """

    def __init__(
        self,
        broker_order_id: str,
        status: OrderStatus,
        ticker: str,
        side: OrderSide,
        quantity: float,
        filled_quantity: float = 0.0,
        filled_avg_price: float | None = None,
        submitted_at: datetime | None = None,
        raw_response: dict[str, Any] | None = None,
        error: str | None = None,
    ) -> None:
        self.broker_order_id, self.status = broker_order_id, status
        self.ticker, self.side, self.quantity = ticker, side, quantity
        self.filled_quantity = filled_quantity
        self.filled_avg_price = filled_avg_price
        self.submitted_at = submitted_at if submitted_at else datetime.now(timezone.utc)
        self.raw_response = raw_response if raw_response else {}
        self.error = error

    @property
    def ok(self) -> bool:
        """True when no error is set and the status is not rejected, cancelled or expired."""
        if self.error is not None:
            return False
        failed_states = (
            OrderStatus.REJECTED,
            OrderStatus.CANCELLED,
            OrderStatus.EXPIRED,
        )
        return self.status not in failed_states

    def to_dict(self) -> dict[str, Any]:
        """Return a plain-dict form of the response.

        Enum fields become their string values and ``submitted_at`` an
        ISO-8601 string; ``raw_response`` is not included.
        """
        submitted_iso = self.submitted_at.isoformat()
        return dict(
            broker_order_id=self.broker_order_id,
            status=self.status.value,
            ticker=self.ticker,
            side=self.side.value,
            quantity=self.quantity,
            filled_quantity=self.filled_quantity,
            filled_avg_price=self.filled_avg_price,
            submitted_at=submitted_iso,
            error=self.error,
        )
class PositionInfo:
    """A single open position as reported by the broker."""

    def __init__(
        self,
        ticker: str,
        quantity: float,
        avg_entry_price: float,
        current_price: float,
        unrealized_pnl: float,
        market_value: float,
        side: str = "long",
    ) -> None:
        self.ticker, self.quantity = ticker, quantity
        self.avg_entry_price, self.current_price = avg_entry_price, current_price
        self.unrealized_pnl, self.market_value = unrealized_pnl, market_value
        self.side = side

    def to_dict(self) -> dict[str, Any]:
        """Return every attribute of the position as a plain dict."""
        field_names = (
            "ticker",
            "quantity",
            "avg_entry_price",
            "current_price",
            "unrealized_pnl",
            "market_value",
            "side",
        )
        return {name: getattr(self, name) for name in field_names}
class AccountInfo:
    """Summary of a broker account: balances, currency and trading mode."""

    def __init__(
        self,
        account_id: str,
        buying_power: float,
        cash: float,
        portfolio_value: float,
        currency: str = "USD",
        mode: TradingMode = TradingMode.PAPER,
    ) -> None:
        self.account_id = account_id
        self.buying_power, self.cash = buying_power, cash
        self.portfolio_value = portfolio_value
        self.currency, self.mode = currency, mode

    def to_dict(self) -> dict[str, Any]:
        """Return the account summary as a plain dict; ``mode`` becomes its string value."""
        summary: dict[str, Any] = {
            name: getattr(self, name)
            for name in ("account_id", "buying_power", "cash", "portfolio_value", "currency")
        }
        summary["mode"] = self.mode.value
        return summary
# --- Abstract interface ---
class BrokerDataAdapter(BaseAdapter, ABC):
"""Abstract interface for broker API integrations.
Extends BaseAdapter with broker-specific operations:
- submit_order: place an order with idempotency key
- cancel_order: cancel an existing order
- get_order_status: check order state
- get_positions: list current positions
- get_account: retrieve account summary
All concrete adapters must enforce:
- Idempotent order submission via idempotency_key (Req 8.5)
- Paper/live mode separation (Req 8.1)
- Fail-closed on broker unavailability (Req 8.5)
"""
def __init__(self, mode: TradingMode = TradingMode.PAPER) -> None:
self._mode = mode
@property
def mode(self) -> TradingMode:
return self._mode
def source_type(self) -> str:
return "broker"
def _headers(self) -> Dict[str, str]:
@abstractmethod
async def submit_order(self, order: OrderRequest) -> OrderResponse:
"""Submit an order to the broker.
Must use order.idempotency_key to prevent duplicate submissions.
Must fail closed if the broker is unavailable or returns ambiguous state.
"""
...
@abstractmethod
async def cancel_order(self, broker_order_id: str) -> OrderResponse:
"""Cancel an existing order by broker order ID."""
...
@abstractmethod
async def get_order_status(self, broker_order_id: str) -> OrderResponse:
"""Get the current status of an order."""
...
@abstractmethod
async def get_positions(self) -> list[PositionInfo]:
"""Get all current positions."""
...
@abstractmethod
async def get_account(self) -> AccountInfo:
"""Get account summary (balance, buying power, etc.)."""
...
# --- Concrete Alpaca implementation ---
class AlpacaBrokerAdapter(BrokerDataAdapter):
"""Concrete broker adapter for the Alpaca Markets REST API.
Supports:
- Paper trading via paper-api.alpaca.markets
- Live trading via api.alpaca.markets
- Order submission, cancellation, and status
- Position and account queries
Config options for fetch():
endpoint: One of "positions", "orders", "account" (default "positions")
"""
PAPER_BASE_URL: str = "https://paper-api.alpaca.markets"
LIVE_BASE_URL: str = "https://api.alpaca.markets"
def __init__(
    self,
    api_key: str,
    api_secret: str,
    mode: TradingMode = TradingMode.PAPER,
    base_url: str | None = None,
) -> None:
    """Store credentials and resolve the API base URL.

    An explicit non-empty ``base_url`` wins (trailing slashes stripped);
    otherwise the live or paper default is chosen from ``mode``.
    """
    super().__init__(mode=mode)
    self.api_key = api_key
    self.api_secret = api_secret
    if base_url:
        resolved = base_url.rstrip("/")
    else:
        resolved = self.LIVE_BASE_URL if mode == TradingMode.LIVE else self.PAPER_BASE_URL
    self.base_url = resolved
def _headers(self) -> dict[str, str]:
return {
"Authorization": f"Bearer {self.api_key}",
"APCA-API-KEY-ID": self.api_key,
"APCA-API-SECRET-KEY": self.api_secret,
"Content-Type": "application/json",
}
async def fetch(self, ticker: str, config: Dict[str, Any]) -> AdapterResult:
"""Fetch positions and recent orders for a ticker."""
async def fetch(self, ticker: str, config: dict[str, Any]) -> AdapterResult:
"""Fetch positions or recent orders for a ticker from Alpaca.
This satisfies the BaseAdapter contract for the ingestion pipeline.
The broker adapter uses fetch() to pull position/order snapshots
that get persisted as raw artifacts.
"""
endpoint = config.get("endpoint", "positions")
url = self._build_fetch_url(ticker, endpoint)
async with httpx.AsyncClient(timeout=30) as client:
t0 = time.monotonic()
try:
resp = await client.get(
f"{self.base_url}/v2/positions/{ticker}",
headers=self._headers(),
)
resp = await client.get(url, headers=self._headers())
elapsed_ms = (time.monotonic() - t0) * 1000
resp.raise_for_status()
raw = resp.content
data = resp.json() if resp.status_code == 200 else {}
data = resp.json()
content_hash = hashlib.sha256(raw).hexdigest()
items = [data] if isinstance(data, dict) else data if isinstance(data, list) else []
return AdapterResult(
source_type="broker",
ticker=ticker,
items=[data] if data else [],
items=items,
raw_payload=raw,
content_hash=content_hash,
fetched_at=datetime.utcnow(),
fetched_at=datetime.now(timezone.utc),
http_status=resp.status_code,
response_time_ms=round(elapsed_ms, 1),
metadata={
"provider": "alpaca",
"mode": self._mode.value,
"endpoint": endpoint,
},
)
except httpx.HTTPStatusError as e:
elapsed_ms = (time.monotonic() - t0) * 1000
logger.error("Alpaca HTTP error for %s: %s", ticker, e)
return self._error_result(
ticker, str(e), elapsed_ms,
http_status=e.response.status_code if e.response else None,
raw=e.response.content if e.response else b"",
)
except Exception as e:
logger.error(f"Broker fetch failed for {ticker}: {e}")
return AdapterResult(
source_type="broker",
ticker=ticker,
items=[],
raw_payload=b"",
content_hash="",
fetched_at=datetime.utcnow(),
error=str(e),
)
elapsed_ms = (time.monotonic() - t0) * 1000
logger.error("Alpaca fetch failed for %s: %s", ticker, e)
return self._error_result(ticker, str(e), elapsed_ms)
async def submit_order(
self,
ticker: str,
side: str,
qty: float,
order_type: str = "market",
limit_price: Optional[float] = None,
idempotency_key: Optional[str] = None,
) -> Dict[str, Any]:
"""Submit an order to the broker. Returns broker response."""
if self.mode == "live":
logger.warning("LIVE order submission")
def _build_fetch_url(self, ticker: str, endpoint: str) -> str:
"""Build the URL for a fetch operation."""
if endpoint == "orders":
return f"{self.base_url}/v2/orders?symbols={ticker}&status=all&limit=50"
if endpoint == "account":
return f"{self.base_url}/v2/account"
# Default: positions for ticker
return f"{self.base_url}/v2/positions/{ticker}"
idem_key = idempotency_key or str(uuid.uuid4())
payload = {
"symbol": ticker,
"qty": str(qty),
"side": side,
"type": order_type,
"time_in_force": "day",
async def submit_order(self, order: OrderRequest) -> OrderResponse:
"""Submit an order to Alpaca with idempotency key.
Fails closed: any network error or ambiguous response returns
a rejected OrderResponse rather than risking duplicate orders.
"""
if self._mode == TradingMode.LIVE:
logger.warning("LIVE order submission: %s %s %s", order.side.value, order.quantity, order.ticker)
payload: dict[str, Any] = {
"symbol": order.ticker,
"qty": str(order.quantity),
"side": order.side.value,
"type": order.order_type.value,
"time_in_force": order.time_in_force,
}
if limit_price and order_type == "limit":
payload["limit_price"] = str(limit_price)
if order.limit_price is not None and order.order_type in (OrderType.LIMIT, OrderType.STOP_LIMIT):
payload["limit_price"] = str(order.limit_price)
if order.stop_price is not None and order.order_type in (OrderType.STOP, OrderType.STOP_LIMIT):
payload["stop_price"] = str(order.stop_price)
headers = {**self._headers(), "Idempotency-Key": order.idempotency_key}
async with httpx.AsyncClient(timeout=30) as client:
try:
resp = await client.post(
f"{self.base_url}/v2/orders",
headers={**self._headers(), "Idempotency-Key": idem_key},
headers=headers,
json=payload,
)
resp.raise_for_status()
return resp.json()
data = resp.json()
return self._parse_order_response(data)
except httpx.HTTPStatusError as e:
logger.error(f"Order rejected: {e.response.text}")
return {"error": e.response.text, "status": e.response.status_code}
error_body = e.response.text if e.response else "unknown"
logger.error("Order rejected by Alpaca: %s", error_body)
return OrderResponse(
broker_order_id="",
status=OrderStatus.REJECTED,
ticker=order.ticker,
side=order.side,
quantity=order.quantity,
error=f"HTTP {e.response.status_code}: {error_body}" if e.response else str(e),
raw_response={"error": error_body},
)
except Exception as e:
logger.error(f"Order submission failed: {e}")
return {"error": str(e)}
# Fail closed: treat any unexpected error as rejection
logger.error("Order submission failed (fail-closed): %s", e)
return OrderResponse(
broker_order_id="",
status=OrderStatus.REJECTED,
ticker=order.ticker,
side=order.side,
quantity=order.quantity,
error=f"fail-closed: {e}",
)
async def get_account(self) -> Dict[str, Any]:
async def cancel_order(self, broker_order_id: str) -> OrderResponse:
"""Cancel an order on Alpaca."""
async with httpx.AsyncClient(timeout=30) as client:
resp = await client.get(f"{self.base_url}/v2/account", headers=self._headers())
return resp.json()
try:
resp = await client.delete(
f"{self.base_url}/v2/orders/{broker_order_id}",
headers=self._headers(),
)
if resp.status_code == 204:
return OrderResponse(
broker_order_id=broker_order_id,
status=OrderStatus.CANCELLED,
ticker="",
side=OrderSide.BUY,
quantity=0,
)
resp.raise_for_status()
data = resp.json()
return self._parse_order_response(data)
except Exception as e:
logger.error("Cancel failed for %s: %s", broker_order_id, e)
return OrderResponse(
broker_order_id=broker_order_id,
status=OrderStatus.REJECTED,
ticker="",
side=OrderSide.BUY,
quantity=0,
error=str(e),
)
async def get_order_status(self, broker_order_id: str) -> OrderResponse:
    """Look up an order on Alpaca and return its parsed state.

    Any failure (HTTP error status, transport error, unparsable body) is
    logged and reported as an OrderResponse with status REJECTED, the
    error text set, and placeholder ticker/side/quantity values.
    """
    async with httpx.AsyncClient(timeout=30) as http:
        try:
            reply = await http.get(
                f"{self.base_url}/v2/orders/{broker_order_id}",
                headers=self._headers(),
            )
            reply.raise_for_status()
            return self._parse_order_response(reply.json())
        except Exception as exc:
            logger.error("Get order status failed for %s: %s", broker_order_id, exc)
            # NOTE(review): REJECTED here means "lookup failed", not that the
            # broker rejected the order; callers should inspect `error`.
            failure = OrderResponse(
                broker_order_id=broker_order_id,
                status=OrderStatus.REJECTED,
                ticker="",
                side=OrderSide.BUY,
                quantity=0,
                error=str(exc),
            )
            return failure
async def get_positions(self) -> list[PositionInfo]:
    """Fetch all open positions from Alpaca.

    Returns an empty list when the request or parsing fails (the error is
    logged) or when the response body is not a JSON list. Non-dict entries
    in the list are skipped.
    """
    async with httpx.AsyncClient(timeout=30) as http:
        try:
            reply = await http.get(
                f"{self.base_url}/v2/positions",
                headers=self._headers(),
            )
            reply.raise_for_status()
            payload = reply.json()
            if not isinstance(payload, list):
                return []
            parsed: list[PositionInfo] = []
            for entry in payload:
                if isinstance(entry, dict):
                    parsed.append(self._parse_position(entry))
            return parsed
        except Exception as exc:
            logger.error("Get positions failed: %s", exc)
            return []
async def get_account(self) -> AccountInfo:
    """Get account summary from Alpaca.

    Returns:
        AccountInfo built from the ``/v2/account`` response. Numeric fields
        that are missing, null or empty are read as 0 (matching how
        positions and orders are parsed) instead of failing the whole call.
        If the request itself fails, the error is logged and a zeroed
        AccountInfo with an empty ``account_id`` is returned.
    """
    async with httpx.AsyncClient(timeout=30) as client:
        try:
            resp = await client.get(
                f"{self.base_url}/v2/account",
                headers=self._headers(),
            )
            resp.raise_for_status()
            data = resp.json()
            return AccountInfo(
                account_id=str(data.get("id", "")),
                # `or 0` guards against null/empty values, which would
                # otherwise make float() raise and zero out every field.
                buying_power=float(data.get("buying_power", 0) or 0),
                cash=float(data.get("cash", 0) or 0),
                portfolio_value=float(data.get("portfolio_value", 0) or 0),
                currency=str(data.get("currency", "USD")),
                mode=self._mode,
            )
        except Exception as e:
            logger.error("Get account failed: %s", e)
            return AccountInfo(
                account_id="",
                buying_power=0,
                cash=0,
                portfolio_value=0,
                mode=self._mode,
            )
def _parse_order_response(self, data: dict[str, Any]) -> OrderResponse:
    """Translate an Alpaca order payload into an OrderResponse.

    Alpaca status strings are mapped onto OrderStatus; unknown or missing
    statuses become PENDING. Any side other than "sell" is treated as BUY.
    Missing or falsy numeric fields are read as 0, and a falsy
    ``filled_avg_price`` becomes None.
    """
    alpaca_to_internal: dict[str, OrderStatus] = {
        "new": OrderStatus.SUBMITTED,
        "accepted": OrderStatus.ACCEPTED,
        "partially_filled": OrderStatus.PARTIALLY_FILLED,
        "filled": OrderStatus.FILLED,
        "done_for_day": OrderStatus.FILLED,
        "canceled": OrderStatus.CANCELLED,
        "expired": OrderStatus.EXPIRED,
        "replaced": OrderStatus.SUBMITTED,
        "pending_new": OrderStatus.PENDING,
        "pending_cancel": OrderStatus.PENDING,
        "pending_replace": OrderStatus.PENDING,
        "rejected": OrderStatus.REJECTED,
    }
    status = alpaca_to_internal.get(
        str(data.get("status", "pending")),
        OrderStatus.PENDING,
    )

    if str(data.get("side", "buy")) == "sell":
        side = OrderSide.SELL
    else:
        side = OrderSide.BUY

    filled_quantity = float(data.get("filled_qty", 0) or 0)
    avg_raw = data.get("filled_avg_price")
    filled_avg_price: float | None = None
    if avg_raw:
        filled_avg_price = float(avg_raw)

    return OrderResponse(
        broker_order_id=str(data.get("id", "")),
        status=status,
        ticker=str(data.get("symbol", "")),
        side=side,
        quantity=float(data.get("qty", 0) or 0),
        filled_quantity=filled_quantity,
        filled_avg_price=filled_avg_price,
        raw_response=data,
    )
def _parse_position(self, data: dict[str, Any]) -> PositionInfo:
    """Translate an Alpaca position payload into a PositionInfo.

    Missing or falsy numeric fields are read as 0; ``side`` defaults to
    "long" when absent.
    """

    def number(key: str) -> float:
        # Missing, null and empty values all collapse to 0 before conversion.
        return float(data.get(key, 0) or 0)

    return PositionInfo(
        ticker=str(data.get("symbol", "")),
        quantity=number("qty"),
        avg_entry_price=number("avg_entry_price"),
        current_price=number("current_price"),
        unrealized_pnl=number("unrealized_pl"),
        market_value=number("market_value"),
        side=str(data.get("side", "long")),
    )
def _error_result(
    self,
    ticker: str,
    error: str,
    elapsed_ms: float,
    http_status: int | None = None,
    raw: bytes = b"",
) -> AdapterResult:
    """Wrap a failed broker fetch in an AdapterResult.

    The result carries no items and an empty content hash; the elapsed
    time is rounded to one decimal place and the metadata records the
    provider and trading mode.
    """
    failure_metadata = {"provider": "alpaca", "mode": self._mode.value}
    rounded_ms = round(elapsed_ms, 1)
    return AdapterResult(
        source_type="broker",
        ticker=ticker,
        items=[],
        raw_payload=raw,
        content_hash="",
        fetched_at=datetime.now(timezone.utc),
        error=error,
        http_status=http_status,
        response_time_ms=rounded_ms,
        metadata=failure_metadata,
    )
+832
View File
@@ -0,0 +1,832 @@
"""Broker adapter service - standalone worker for sandbox order execution.
Runs the Alpaca broker adapter in sandbox (paper) mode, processing order
requests from the broker queue, evaluating them through the risk engine,
submitting to Alpaca's paper trading API, and persisting the full audit trail.
Also periodically syncs positions and account state from Alpaca.
Implements idempotent order submission keys and duplicate prevention:
- Deterministic idempotency key generation from job attributes
- Redis-based fast-path duplicate detection before broker submission
- PostgreSQL UNIQUE constraint on idempotency_key as durable fallback
Requirements: 2.4, 8.1, 8.3, 8.5
Design: Section 4.9 - Broker Adapter
"""
from __future__ import annotations
import asyncio
import hashlib
import json
import logging
import uuid
from datetime import datetime, timezone
from typing import Any
import asyncpg
import redis.asyncio as aioredis
from services.adapters.broker_adapter import (
AlpacaBrokerAdapter,
OrderRequest,
OrderResponse,
OrderSide,
OrderStatus,
OrderType,
TradingMode,
)
from services.risk.engine import (
AccountRiskState,
PortfolioRiskConfig,
ProposedOrder,
evaluate_order,
)
from services.risk.approval import (
ApprovalRequest,
ApprovalStatus,
compute_expiry,
create_approval_request,
requires_approval,
)
from services.shared.audit import (
audit_approval_requested,
audit_duplicate_prevented,
audit_order_filled,
audit_order_rejected,
audit_order_submitted,
audit_risk_evaluated,
)
from services.lake_publisher.worker import (
publish_trade_order,
publish_trade_fill,
publish_positions_daily_batch,
LAKEHOUSE_BUCKET,
)
from services.shared.config import load_config
from services.shared.db import get_pg_pool, get_redis
from services.shared.logging import Span, new_trace_id, set_trace_context, setup_logging
from services.shared.metrics import (
ORDERS_DUPLICATES_PREVENTED,
ORDERS_FILLED,
ORDERS_REJECTED,
ORDERS_SUBMITTED,
POSITIONS_SYNCED,
RISK_CHECK_FAILURES,
RISK_EVALUATIONS_TOTAL,
)
from services.shared.redis_keys import QUEUE_BROKER, queue_key
logger = logging.getLogger("broker_service")
# Interval for position syncing, in seconds.
# NOTE(review): the code consuming this constant is not visible here — confirm usage.
POSITION_SYNC_INTERVAL = 60 # seconds
# Redis TTL for idempotency markers (24 hours)
ORDER_IDEMPOTENCY_TTL = 86400
# Prefix of the Redis keys holding idempotency markers; the full key is
# "<prefix>:<idempotency_key>".
ORDER_IDEMPOTENCY_PREFIX = "stonks:order_idempotency"
# ---------------------------------------------------------------------------
# DB persistence helpers
# ---------------------------------------------------------------------------
# Insert a broker account row; when the id already exists, refresh its config
# and mode and mark it active again.
_UPSERT_BROKER_ACCOUNT = """
INSERT INTO broker_accounts (id, provider, account_id, mode, config, active)
VALUES ($1::uuid, $2, $3, $4, $5::jsonb, TRUE)
ON CONFLICT (id) DO UPDATE SET
config = EXCLUDED.config,
mode = EXCLUDED.mode,
active = TRUE
"""
# Insert an order row. A conflict on idempotency_key keeps the existing row
# (including its id) and only refreshes status, broker order id and fill data.
_INSERT_ORDER = """
INSERT INTO orders (
id, recommendation_id, broker_account_id, ticker, side, order_type,
quantity, limit_price, stop_price, status, idempotency_key,
broker_order_id, decision_trace, submitted_at, filled_at,
fill_price, fill_quantity
) VALUES (
$1::uuid, $2, $3::uuid, $4, $5, $6,
$7, $8, $9, $10, $11,
$12, $13::jsonb, $14, $15,
$16, $17
)
ON CONFLICT (idempotency_key) DO UPDATE SET
status = EXCLUDED.status,
broker_order_id = EXCLUDED.broker_order_id,
filled_at = EXCLUDED.filled_at,
fill_price = EXCLUDED.fill_price,
fill_quantity = EXCLUDED.fill_quantity,
updated_at = NOW()
"""
# Append one event (type, JSON data, broker timestamp) for an order.
_INSERT_ORDER_EVENT = """
INSERT INTO order_events (order_id, event_type, data, broker_timestamp)
VALUES ($1::uuid, $2, $3::jsonb, $4)
"""
# Record the outcome of a risk evaluation for a recommendation.
_INSERT_RISK_EVALUATION = """
INSERT INTO risk_evaluations (id, recommendation_id, eligible, allowed_mode, rejection_reasons, risk_checks, evaluated_at)
VALUES ($1::uuid, $2::uuid, $3, $4, $5::jsonb, $6::jsonb, $7)
"""
# Insert or refresh the position row for a (broker_account_id, ticker) pair.
_UPSERT_POSITION = """
INSERT INTO positions (broker_account_id, ticker, quantity, avg_entry_price, current_price, unrealized_pnl, updated_at)
VALUES ($1::uuid, $2, $3, $4, $5, $6, $7)
ON CONFLICT (broker_account_id, ticker)
DO UPDATE SET
quantity = EXCLUDED.quantity,
avg_entry_price = EXCLUDED.avg_entry_price,
current_price = EXCLUDED.current_price,
unrealized_pnl = EXCLUDED.unrealized_pnl,
updated_at = EXCLUDED.updated_at
"""
# Fetch the most recently updated active risk configuration.
_LOAD_RISK_CONFIG = """
SELECT config FROM risk_configs WHERE active = TRUE ORDER BY updated_at DESC LIMIT 1
"""
# Fetch today's (CURRENT_DATE) risk snapshot for one account.
_LOAD_DAILY_SNAPSHOT = """
SELECT portfolio_value, daily_pnl, daily_trade_count, positions_by_sector
FROM daily_risk_snapshots
WHERE account_id = $1 AND snapshot_date = CURRENT_DATE
LIMIT 1
"""
# Look up an existing order by its idempotency key.
_CHECK_ORDER_BY_IDEMPOTENCY_KEY = """
SELECT id, status, broker_order_id FROM orders
WHERE idempotency_key = $1
LIMIT 1
"""
# ---------------------------------------------------------------------------
# Idempotency helpers (Requirement 8.5)
# ---------------------------------------------------------------------------
def generate_idempotency_key(job: dict[str, Any]) -> str:
    """Generate a deterministic idempotency key from job attributes.

    If the job already carries a truthy ``idempotency_key``, it is returned
    as a string. Otherwise a stable key is derived from recommendation_id,
    ticker, side, quantity, order_type, limit_price and stop_price, so that
    replayed queue messages produce the same key and are detected as
    duplicates.

    A field that is present but ``None`` is treated exactly like an absent
    field. Without this, a job serialized with ``"limit_price": null`` would
    hash differently from the same job without that key.

    Args:
        job: Broker queue job payload.

    Returns:
        The explicit key, or the first 40 hex characters of the SHA-256
        digest of the ``|``-joined field values.
    """
    explicit = job.get("idempotency_key")
    if explicit:
        return str(explicit)

    def field_text(name: str, default: Any) -> str:
        # Explicit None and a missing key must hash identically.
        value = job.get(name)
        return str(default if value is None else value)

    # Build a deterministic key from job content
    parts = [
        field_text("recommendation_id", ""),
        field_text("ticker", ""),
        field_text("side", "buy"),
        field_text("quantity", 0),
        field_text("order_type", "market"),
        field_text("limit_price", ""),
        field_text("stop_price", ""),
    ]
    raw = "|".join(parts)
    return hashlib.sha256(raw.encode()).hexdigest()[:40]
def _redis_idempotency_key(idempotency_key: str) -> str:
    """Return the Redis key under which this order's idempotency marker lives."""
    return "{}:{}".format(ORDER_IDEMPOTENCY_PREFIX, idempotency_key)
async def check_idempotency_redis(
    rds: aioredis.Redis,
    idempotency_key: str,
) -> str | None:
    """Fast-path: check Redis for a previously processed idempotency key.

    Args:
        rds: Async Redis client.
        idempotency_key: The order's idempotency key (without prefix).

    Returns:
        The stored order_id if a marker exists, None otherwise. A ``bytes``
        reply (client created without ``decode_responses``) is decoded as
        UTF-8; calling ``str()`` on it would yield ``"b'...'"`` rather than
        the order id.
    """
    redis_key = _redis_idempotency_key(idempotency_key)
    cached = await rds.get(redis_key)
    if not cached:
        return None
    if isinstance(cached, (bytes, bytearray)):
        return cached.decode("utf-8")
    return str(cached)
async def check_idempotency_db(
    pool: asyncpg.Pool,
    idempotency_key: str,
) -> dict[str, Any] | None:
    """Durable fallback: look the idempotency key up in the orders table.

    Returns a dict with string ``id``, ``status`` and ``broker_order_id``
    (empty string when the stored broker order id is falsy) for an existing
    order, or None when no order carries this key.
    """
    existing = await pool.fetchrow(_CHECK_ORDER_BY_IDEMPOTENCY_KEY, idempotency_key)
    if not existing:
        return None
    broker_ref = existing["broker_order_id"] or ""
    return {
        "id": str(existing["id"]),
        "status": str(existing["status"]),
        "broker_order_id": str(broker_ref),
    }
async def mark_idempotency_redis(
    rds: aioredis.Redis,
    idempotency_key: str,
    order_id: str,
) -> None:
    """Store ``order_id`` under the key's Redis marker, expiring after ORDER_IDEMPOTENCY_TTL seconds."""
    await rds.set(
        _redis_idempotency_key(idempotency_key),
        order_id,
        ex=ORDER_IDEMPOTENCY_TTL,
    )
# ---------------------------------------------------------------------------
# Core service logic
# ---------------------------------------------------------------------------
def build_order_request(job: dict[str, Any]) -> OrderRequest:
    """Turn a broker queue job payload into an OrderRequest.

    ``ticker`` is required (KeyError when missing). Any side other than
    "sell" becomes BUY, an unrecognized order type falls back to MARKET,
    quantity defaults to 0 and time-in-force to "day". The idempotency key
    comes from generate_idempotency_key(job).
    """
    if job.get("side", "buy") == "sell":
        side = OrderSide.SELL
    else:
        side = OrderSide.BUY

    requested_type = job.get("order_type", "market")
    known_types = dict(
        market=OrderType.MARKET,
        limit=OrderType.LIMIT,
        stop=OrderType.STOP,
        stop_limit=OrderType.STOP_LIMIT,
    )

    return OrderRequest(
        ticker=job["ticker"],
        side=side,
        quantity=float(job.get("quantity", 0)),
        order_type=known_types.get(requested_type, OrderType.MARKET),
        limit_price=job.get("limit_price"),
        stop_price=job.get("stop_price"),
        time_in_force=job.get("time_in_force", "day"),
        idempotency_key=generate_idempotency_key(job),
    )
def build_proposed_order(job: dict[str, Any]) -> ProposedOrder:
    """Turn a broker queue job into the ProposedOrder used for risk evaluation.

    ``ticker`` is required (KeyError when missing); the job's ``side``
    (default "buy") is passed as the order's ``action``, and the numeric
    fields default to 0 before conversion to float.
    """

    def as_float(key: str) -> float:
        return float(job.get(key, 0))

    return ProposedOrder(
        recommendation_id=job.get("recommendation_id"),
        ticker=job["ticker"],
        sector=job.get("sector", ""),
        action=job.get("side", "buy"),
        quantity=as_float("quantity"),
        estimated_value=as_float("estimated_value"),
        confidence=as_float("confidence"),
    )
async def load_risk_config(pool: asyncpg.Pool) -> PortfolioRiskConfig:
    """Load the active risk configuration, or defaults when none is stored.

    The ``config`` column may arrive either as a dict or as a JSON string;
    a string is parsed before being handed to
    PortfolioRiskConfig.from_db_json.
    """
    row = await pool.fetchrow(_LOAD_RISK_CONFIG)
    if not (row and row["config"]):
        return PortfolioRiskConfig()
    stored = row["config"]
    if not isinstance(stored, dict):
        stored = json.loads(stored)
    return PortfolioRiskConfig.from_db_json(stored)
async def load_account_risk_state(
    pool: asyncpg.Pool,
    adapter: AlpacaBrokerAdapter,
    account_uuid: str,
) -> AccountRiskState:
    """Assemble the AccountRiskState for one account.

    Balances and positions come from the broker adapter; daily P&L, trade
    count and sector exposure are overlaid from today's DB snapshot when
    one exists. A broker failure is logged as a warning and leaves the
    corresponding fields at their AccountRiskState defaults.
    """
    state = AccountRiskState(account_id=account_uuid)

    # Live balances from the broker.
    try:
        account = await adapter.get_account()
        state.portfolio_value = account.portfolio_value
        state.cash = account.cash
        state.buying_power = account.buying_power
    except Exception as exc:
        logger.warning("Failed to fetch account from Alpaca: %s", exc)

    # Per-symbol exposure and open position count from the broker.
    try:
        open_positions = await adapter.get_positions()
        for position in open_positions:
            state.positions_by_symbol[position.ticker] = position.market_value
        state.open_position_count = len(open_positions)
    except Exception as exc:
        logger.warning("Failed to fetch positions from Alpaca: %s", exc)

    # Daily figures from the DB snapshot, if today's row exists.
    snapshot = await pool.fetchrow(_LOAD_DAILY_SNAPSHOT, account_uuid)
    if not snapshot:
        return state

    state.daily_pnl = float(snapshot["daily_pnl"] or 0)
    state.daily_trade_count = int(snapshot["daily_trade_count"] or 0)
    by_sector = snapshot["positions_by_sector"]
    if by_sector:
        if not isinstance(by_sector, dict):
            by_sector = json.loads(by_sector)
        state.positions_by_sector = by_sector
    return state
async def persist_order(
    pool: asyncpg.Pool,
    order_id: str,
    order: OrderRequest,
    resp: OrderResponse,
    account_uuid: str,
    risk_eval: dict[str, Any],
    recommendation_id: str | None = None,
) -> None:
    """Write the order row and its events to PostgreSQL in one transaction.

    The order is upserted via _INSERT_ORDER together with a decision trace
    (risk evaluation, order request, broker response). A "submitted" event
    is always recorded, followed by a "fill" event when the response status
    is FILLED or a "rejected" event when it is REJECTED.
    """
    now = datetime.now(timezone.utc)
    is_filled = resp.status == OrderStatus.FILLED
    filled_at = now if is_filled else None

    decision_trace = {
        "risk_evaluation": risk_eval,
        "order_request": order.to_dict(),
        "broker_response": resp.to_dict(),
    }

    # Events in insertion order: always "submitted", then the terminal outcome.
    events: list[tuple[str, dict[str, Any]]] = [
        ("submitted", {"ticker": order.ticker, "side": order.side.value}),
    ]
    if is_filled:
        events.append((
            "fill",
            {
                "fill_price": resp.filled_avg_price,
                "fill_qty": resp.filled_quantity,
            },
        ))
    elif resp.status == OrderStatus.REJECTED:
        events.append(("rejected", {"error": resp.error}))

    async with pool.acquire() as conn:
        async with conn.transaction():
            await conn.execute(
                _INSERT_ORDER,
                order_id,
                recommendation_id,
                account_uuid,
                order.ticker,
                order.side.value,
                order.order_type.value,
                order.quantity,
                order.limit_price,
                order.stop_price,
                resp.status.value,
                order.idempotency_key,
                resp.broker_order_id,
                json.dumps(decision_trace),
                resp.submitted_at or now,
                filled_at,
                resp.filled_avg_price,
                resp.filled_quantity,
            )

            # Record order events
            for event_type, event_data in events:
                await conn.execute(
                    _INSERT_ORDER_EVENT,
                    order_id,
                    event_type,
                    json.dumps(event_data),
                    now,
                )
async def sync_positions(
    adapter: AlpacaBrokerAdapter,
    pool: asyncpg.Pool,
    account_uuid: str,
    minio_client: Any | None = None,
) -> None:
    """Mirror the broker's positions into PostgreSQL and, optionally, the lake.

    Never raises: a failure anywhere in the sync is logged as an error, and
    a failure limited to the lake publish is logged as a warning.

    Args:
        adapter: Broker adapter the positions are read from.
        pool: PostgreSQL pool used for the upserts.
        account_uuid: Account the positions are stored under.
        minio_client: When given (and there is at least one position), a
            positions snapshot is also published to the analytical lake.
    """
    synced_at = datetime.now(timezone.utc)
    try:
        positions = await adapter.get_positions()
        async with pool.acquire() as conn:
            for position in positions:
                await conn.execute(
                    _UPSERT_POSITION,
                    account_uuid,
                    position.ticker,
                    position.quantity,
                    position.avg_entry_price,
                    position.current_price,
                    position.unrealized_pnl,
                    synced_at,
                )
        logger.info("Synced %d positions from Alpaca", len(positions))
        POSITIONS_SYNCED.inc()

        # Lake publishing is optional and skipped for an empty book.
        if minio_client is None or not positions:
            return
        try:
            snapshot = []
            for position in positions:
                snapshot.append({
                    "ticker": position.ticker,
                    "quantity": position.quantity,
                    "avg_entry_price": position.avg_entry_price,
                    "close_price": position.current_price,
                    "unrealized_pnl": position.unrealized_pnl,
                })
            publish_positions_daily_batch(
                minio_client, snapshot, account_uuid, synced_at,
            )
        except Exception as e:
            logger.warning("Failed to publish positions to lake: %s", e)
    except Exception as e:
        logger.error("Position sync failed: %s", e)
async def register_broker_account(
    pool: asyncpg.Pool,
    account_uuid: str,
    adapter: AlpacaBrokerAdapter,
) -> None:
    """Upsert the broker account row from the broker's live account data.

    Never raises: any failure (broker call or database write) is logged as
    an error and swallowed.

    Args:
        pool: PostgreSQL pool used for the upsert.
        account_uuid: Internal identifier for the account row; also used as
            the broker account id when the broker does not report one.
        adapter: Broker adapter queried for the account and trading mode.
    """
    try:
        account = await adapter.get_account()
        # Balances captured at registration time, stored as JSON config.
        account_snapshot = {
            "provider": "alpaca",
            "buying_power": account.buying_power,
            "cash": account.cash,
            "portfolio_value": account.portfolio_value,
        }
        serialized = json.dumps(account_snapshot)
        mode_name = adapter.mode.value
        await pool.execute(
            _UPSERT_BROKER_ACCOUNT,
            account_uuid,
            "alpaca",
            account.account_id or account_uuid,
            mode_name,
            serialized,
        )
        logger.info(
            "Registered Alpaca account: id=%s mode=%s portfolio=%.2f",
            account.account_id, mode_name, account.portfolio_value,
        )
    except Exception as e:
        logger.error("Failed to register broker account: %s", e)
async def process_order_job(
    job: dict[str, Any],
    adapter: AlpacaBrokerAdapter,
    pool: asyncpg.Pool,
    account_uuid: str,
    rds: aioredis.Redis | None = None,
    minio_client: Any | None = None,
) -> None:
    """Process a single order job from the broker queue.

    1. Generate deterministic idempotency key
    2. Check Redis + DB for duplicate (Req 8.5)
    3. Build proposed order and run risk evaluation
    4. If risk passes, submit to Alpaca
    5. Persist order, events, and risk evaluation
    6. Set Redis idempotency marker

    The function returns early (without contacting the broker) when the job
    is a duplicate, when the risk engine rejects it, or when it is parked
    for operator approval.

    Args:
        job: Decoded job payload. Keys read here: ``ticker``, ``side``,
            ``quantity``, ``order_type``, ``limit_price``,
            ``estimated_value`` and ``recommendation_id``.
        adapter: Broker adapter used for risk-state lookups and submission.
        pool: PostgreSQL pool for persistence and audit records.
        account_uuid: Account the order is placed and recorded under.
        rds: Optional Redis client for the idempotency fast path; when
            ``None`` only the database check is performed.
        minio_client: Optional client for publishing order/fill facts to
            the analytical lake; when ``None`` publishing is skipped.
    """
    ticker = job.get("ticker", "???")
    # A fresh internal id is allocated up front; it is only persisted if the
    # job is not a duplicate and is not parked for approval.
    order_id = str(uuid.uuid4())
    idempotency_key = generate_idempotency_key(job)
    # --- Duplicate prevention (Requirement 8.5) ---
    # Fast path: Redis check
    if rds is not None:
        existing_order_id = await check_idempotency_redis(rds, idempotency_key)
        if existing_order_id:
            logger.info(
                "Duplicate order detected (redis) for %s key=%s existing=%s",
                ticker, idempotency_key[:16], existing_order_id,
            )
            ORDERS_DUPLICATES_PREVENTED.labels(detected_via="redis").inc()
            await audit_duplicate_prevented(
                pool, existing_order_id, ticker, idempotency_key, detected_via="redis",
            )
            return
    # Durable fallback: DB check
    existing = await check_idempotency_db(pool, idempotency_key)
    if existing:
        logger.info(
            "Duplicate order detected (db) for %s key=%s existing=%s status=%s",
            ticker, idempotency_key[:16], existing["id"], existing["status"],
        )
        ORDERS_DUPLICATES_PREVENTED.labels(detected_via="db").inc()
        await audit_duplicate_prevented(
            pool, existing["id"], ticker, idempotency_key, detected_via="db",
        )
        # Warm Redis cache for future fast-path hits
        if rds is not None:
            await mark_idempotency_redis(rds, idempotency_key, existing["id"])
        return
    # Risk evaluation
    risk_config = await load_risk_config(pool)
    risk_state = await load_account_risk_state(pool, adapter, account_uuid)
    proposed = build_proposed_order(job)
    evaluation = evaluate_order(proposed, risk_config, risk_state)
    # JSON-friendly summary of the evaluation; reused for the risk row and
    # for the order's decision trace.
    risk_eval_dict = {
        "evaluation_id": evaluation.evaluation_id,
        "eligible": evaluation.eligible,
        "allowed_mode": evaluation.allowed_mode.value,
        "rejection_reasons": evaluation.rejection_reasons,
        "checks": [c.model_dump(mode="json") for c in evaluation.checks],
    }
    # Persist risk evaluation
    rec_id = job.get("recommendation_id")
    try:
        await pool.execute(
            _INSERT_RISK_EVALUATION,
            evaluation.evaluation_id,
            rec_id,
            evaluation.eligible,
            evaluation.allowed_mode.value,
            json.dumps(evaluation.rejection_reasons),
            json.dumps(risk_eval_dict["checks"]),
            evaluation.evaluated_at,
        )
    except Exception as e:
        # Best-effort: a failed risk-row insert does not stop order processing.
        logger.warning("Failed to persist risk evaluation: %s", e)
    # Audit: risk evaluation result
    await audit_risk_evaluated(
        pool,
        evaluation_id=evaluation.evaluation_id,
        recommendation_id=rec_id,
        ticker=ticker,
        eligible=evaluation.eligible,
        allowed_mode=evaluation.allowed_mode.value,
        rejection_reasons=evaluation.rejection_reasons,
        check_count=len(evaluation.checks),
    )
    if not evaluation.eligible:
        RISK_EVALUATIONS_TOTAL.labels(result="rejected").inc()
        # One failure counter increment per individual failed check.
        for check in evaluation.checks:
            if check.result.value == "fail":
                RISK_CHECK_FAILURES.labels(check_name=check.check_name).inc()
        ORDERS_REJECTED.labels(reason_category="risk_engine").inc()
        logger.info(
            "Order rejected by risk engine for %s: %s",
            ticker, evaluation.rejection_reasons,
        )
        # Persist the rejected order for audit
        order_req = build_order_request(job)
        # Synthetic broker response: the broker was never contacted, so the
        # broker order id is empty and the status is REJECTED.
        rejected_resp = OrderResponse(
            broker_order_id="",
            status=OrderStatus.REJECTED,
            ticker=ticker,
            side=OrderSide.SELL if job.get("side") == "sell" else OrderSide.BUY,
            quantity=float(job.get("quantity", 0)),
            error=f"Risk rejected: {'; '.join(evaluation.rejection_reasons)}",
        )
        await persist_order(
            pool, order_id, order_req, rejected_resp,
            account_uuid, risk_eval_dict, rec_id,
        )
        # Publish rejected order fact to analytical lake
        if minio_client is not None:
            try:
                publish_trade_order(
                    minio_client, order_id, ticker,
                    side=job.get("side", "buy"),
                    order_type=job.get("order_type", "market"),
                    quantity=float(job.get("quantity", 0)),
                    limit_price=job.get("limit_price"),
                    status="rejected",
                    broker_account=account_uuid,
                    submitted_at=datetime.now(timezone.utc),
                )
            except Exception as e:
                logger.warning("Failed to publish rejected order to lake: %s", e)
        # Audit: order rejected by risk engine
        await audit_order_rejected(
            pool, order_id, ticker,
            reason=f"Risk rejected: {'; '.join(evaluation.rejection_reasons)}",
            source="risk_engine",
        )
        # Mark idempotency even for rejected orders to prevent reprocessing
        if rds is not None:
            await mark_idempotency_redis(rds, idempotency_key, order_id)
        return
    # --- Operator approval gate (Requirement 8.2) ---
    if requires_approval(risk_config, evaluation.allowed_mode):
        expiry = compute_expiry(risk_config)
        approval_req = ApprovalRequest(
            order_job=job,
            recommendation_id=rec_id,
            ticker=ticker,
            side=job.get("side", "buy"),
            quantity=float(job.get("quantity", 0)),
            estimated_value=float(job.get("estimated_value", 0)),
            risk_evaluation_id=evaluation.evaluation_id,
            expires_at=expiry,
        )
        try:
            await create_approval_request(pool, approval_req)
            logger.info(
                "Order for %s held for operator approval (id=%s, expires=%s)",
                ticker, approval_req.approval_id, expiry.isoformat(),
            )
            await audit_approval_requested(
                pool,
                approval_id=approval_req.approval_id,
                ticker=ticker,
                side=approval_req.side,
                quantity=approval_req.quantity,
                estimated_value=approval_req.estimated_value,
                recommendation_id=rec_id,
                expires_at=expiry.isoformat(),
            )
        except Exception as e:
            # NOTE(review): on failure the job is neither parked nor
            # submitted; only this log line records it. Confirm intended.
            logger.error("Failed to create approval request for %s: %s", ticker, e)
        # Do NOT mark idempotency — the job will be re-submitted after approval
        return
    # Submit to Alpaca
    order_req = build_order_request(job)
    RISK_EVALUATIONS_TOTAL.labels(result="passed").inc()
    # Audit: order submitted to broker
    # (recorded before the broker call is made)
    await audit_order_submitted(
        pool,
        order_id=order_id,
        ticker=ticker,
        side=order_req.side.value,
        quantity=order_req.quantity,
        order_type=order_req.order_type.value,
        idempotency_key=order_req.idempotency_key,
        recommendation_id=rec_id,
        evaluation_id=evaluation.evaluation_id,
    )
    resp = await adapter.submit_order(order_req)
    await persist_order(
        pool, order_id, order_req, resp,
        account_uuid, risk_eval_dict, rec_id,
    )
    # Publish order fact to analytical lake
    if minio_client is not None:
        try:
            publish_trade_order(
                minio_client, order_id, ticker,
                side=order_req.side.value,
                order_type=order_req.order_type.value,
                quantity=order_req.quantity,
                limit_price=order_req.limit_price,
                status=resp.status.value,
                broker_account=account_uuid,
                submitted_at=resp.submitted_at or datetime.now(timezone.utc),
            )
        except Exception as e:
            logger.warning("Failed to publish order to lake: %s", e)
        # Publish fill fact if the order was filled
        if resp.status == OrderStatus.FILLED and resp.filled_avg_price is not None:
            try:
                fill_id = str(uuid.uuid4())
                publish_trade_fill(
                    minio_client, fill_id, order_id, ticker,
                    side=order_req.side.value,
                    fill_price=resp.filled_avg_price,
                    fill_quantity=resp.filled_quantity,
                    broker_account=account_uuid,
                    filled_at=datetime.now(timezone.utc),
                )
            except Exception as e:
                logger.warning("Failed to publish fill to lake: %s", e)
    # Mark idempotency after successful persistence
    if rds is not None:
        await mark_idempotency_redis(rds, idempotency_key, order_id)
    # Metrics, logging, and audit for the broker outcome.
    if resp.ok:
        mode = "paper" if adapter.mode == TradingMode.PAPER else "live"
        ORDERS_SUBMITTED.labels(
            side=order_req.side.value,
            order_type=order_req.order_type.value,
            mode=mode,
        ).inc()
        logger.info(
            "Order submitted to Alpaca: %s %s %.0f %s @ %s | broker_id=%s",
            resp.status.value, order_req.side.value, order_req.quantity,
            ticker, resp.filled_avg_price, resp.broker_order_id,
        )
        # Audit: order filled
        if resp.status == OrderStatus.FILLED:
            ORDERS_FILLED.labels(side=order_req.side.value).inc()
            await audit_order_filled(
                pool, order_id, ticker,
                side=order_req.side.value,
                fill_quantity=resp.filled_quantity,
                fill_price=resp.filled_avg_price,
                broker_order_id=resp.broker_order_id,
            )
    else:
        ORDERS_REJECTED.labels(reason_category="broker").inc()
        logger.warning(
            "Order failed for %s: %s (status=%s)",
            ticker, resp.error, resp.status.value,
        )
        # Audit: order rejected by broker
        await audit_order_rejected(
            pool, order_id, ticker,
            reason=resp.error or f"Broker status: {resp.status.value}",
            source="broker",
        )
async def position_sync_loop(
    adapter: AlpacaBrokerAdapter,
    pool: asyncpg.Pool,
    account_uuid: str,
    minio_client: Any | None = None,
) -> None:
    """Run ``sync_positions`` forever, pausing between rounds.

    The pause is ``POSITION_SYNC_INTERVAL`` seconds. The loop has no exit
    condition of its own and runs until the task is cancelled.
    """
    while True:
        await sync_positions(
            adapter=adapter,
            pool=pool,
            account_uuid=account_uuid,
            minio_client=minio_client,
        )
        await asyncio.sleep(POSITION_SYNC_INTERVAL)
async def main() -> None:
    """Run the broker service until cancelled.

    Startup: load config, set up logging, open the PostgreSQL pool and
    Redis client, create the MinIO client (ensuring the lakehouse bucket
    exists), build the Alpaca adapter, register the broker account, and
    start the background position-sync task.

    Main loop: pop jobs from the broker queue and hand each to
    ``process_order_job``. An error in one job is logged and does not stop
    the loop. When the queue is empty the loop sleeps for two seconds.

    Shutdown: the sync task is cancelled and the pool and Redis client are
    closed.
    """
    config = load_config()
    setup_logging("broker_service", level=config.log_level, json_output=config.json_logs)
    pool = await get_pg_pool(config)
    rds = get_redis(config)
    # Initialize MinIO client for lake publishing
    from minio import Minio
    minio_client = Minio(
        config.minio.endpoint,
        access_key=config.minio.access_key,
        secret_key=config.minio.secret_key,
        secure=config.minio.secure,
    )
    # Ensure lakehouse bucket exists
    if not minio_client.bucket_exists(LAKEHOUSE_BUCKET):
        minio_client.make_bucket(LAKEHOUSE_BUCKET)
    # Determine mode — default to paper for safety (Req 8.1)
    mode = TradingMode.LIVE if config.broker.mode == "live" else TradingMode.PAPER
    if mode == TradingMode.LIVE:
        logger.warning("LIVE trading mode enabled — orders will be submitted to real broker")
    adapter = AlpacaBrokerAdapter(
        api_key=config.broker.api_key or "",
        api_secret=config.broker.api_secret or "",
        mode=mode,
        base_url=config.broker.base_url,
    )
    # Generate a stable account UUID from the API key
    account_uuid = str(uuid.uuid5(uuid.NAMESPACE_DNS, f"alpaca-{config.broker.api_key or 'default'}"))
    # Register broker account on startup
    await register_broker_account(pool, account_uuid, adapter)
    # Start position sync in background
    sync_task = asyncio.create_task(
        position_sync_loop(adapter, pool, account_uuid, minio_client)
    )
    queue = queue_key(QUEUE_BROKER)
    logger.info("Broker service started (mode=%s)", mode.value)
    try:
        while True:
            result = await rds.lpop(queue)
            if not result:
                # Queue empty: back off before polling again.
                await asyncio.sleep(2)
                continue
            try:
                # Decode the reply as-is. json.loads accepts both str and
                # bytes, whereas str() on a bytes reply would produce the
                # repr "b'...'" and fail to parse.
                job = json.loads(result)
                await process_order_job(job, adapter, pool, account_uuid, rds, minio_client)
            except Exception:
                logger.exception("Error processing broker job")
    finally:
        sync_task.cancel()
        await pool.close()
        await rds.close()
# Script entry point: run the service's async main() when executed directly.
if __name__ == "__main__":
    asyncio.run(main())
+170 -27
View File
@@ -1,8 +1,17 @@
"""Filings / Regulatory API adapter - fetches SEC-style submissions."""
"""Filings / Regulatory API adapter interface and concrete SEC EDGAR provider.
The FilingsDataAdapter is the abstract interface for all filings data providers.
SECEdgarAdapter is the first concrete implementation, targeting the SEC EDGAR
full-text search system (EFTS) for company filings discovery.
Requirements: 2.3, 2.5, 3.1, 3.2, 3.3
"""
import hashlib
import logging
from datetime import datetime
from typing import Any, Dict
import time
from abc import ABC
from datetime import datetime, timezone
from typing import Any
import httpx
@@ -11,48 +20,182 @@ from .base import AdapterResult, BaseAdapter
logger = logging.getLogger("filings_adapter")
class FilingsAdapter(BaseAdapter):
"""Concrete adapter for SEC EDGAR or similar filings API."""
class FilingsDataAdapter(BaseAdapter, ABC):
"""Abstract interface for filings / regulatory data providers.
def __init__(self, base_url: str = "https://efts.sec.gov", user_agent: str = "StonksOracle/1.0"):
self.base_url = base_url
self.user_agent = user_agent
Subclasses implement fetch() for their specific filings API.
source_type() is concrete here since all filings adapters share the same type.
"""
def source_type(self) -> str:
    """Return the source-type tag shared by all filings adapters."""
    return "filings_api"
async def fetch(self, ticker: str, config: Dict[str, Any]) -> AdapterResult:
_cik = config.get("cik", "")
endpoint = config.get("endpoint", f"/LATEST/search-index?q=%22{ticker}%22&dateRange=custom&startdt=2026-01-01&forms=8-K,10-Q,10-K")
url = f"{self.base_url}{endpoint}"
headers = {"User-Agent": self.user_agent}
class SECEdgarAdapter(FilingsDataAdapter):
"""Concrete adapter for the SEC EDGAR full-text search system (EFTS).
Supports:
- Full-text search (/LATEST/search-index) for 8-K, 10-Q, 10-K, and other forms
- Filtering by date range, form type, and entity
The SEC EDGAR EFTS API is public and does not require an API key,
but requires a descriptive User-Agent header per SEC fair-access policy.
Config options:
cik: Company CIK number (optional, narrows search)
forms: Comma-separated form types to search (default "8-K,10-Q,10-K")
start_date: Only filings on or after this date, YYYY-MM-DD (optional)
end_date: Only filings on or before this date, YYYY-MM-DD (optional)
query: Custom search query override (optional, replaces ticker-based query)
"""
SEARCH_ENDPOINT: str = "/LATEST/search-index"
def __init__(
self,
base_url: str = "https://efts.sec.gov",
user_agent: str = "StonksOracle/1.0 ([email])",
) -> None:
self.base_url: str = base_url.rstrip("/")
self.user_agent: str = user_agent
async def fetch(self, ticker: str, config: dict[str, Any]) -> AdapterResult:
    """Fetch filings from SEC EDGAR EFTS for a given ticker.

    Network and HTTP failures are not raised; they are reported through an
    error ``AdapterResult`` built by ``_error_result``.

    Args:
        ticker: The company ticker symbol.
        config: Source-specific configuration from the sources table.

    Returns:
        AdapterResult with raw payload, parsed filing items, and metadata.
    """
    url, params, headers = self._build_request(ticker, config)
    async with httpx.AsyncClient(timeout=30) as client:
        # Monotonic clock so the latency is immune to wall-clock changes.
        t0 = time.monotonic()
        try:
            resp = await client.get(url, params=params, headers=headers)
            elapsed_ms = (time.monotonic() - t0) * 1000
            resp.raise_for_status()
            raw = resp.content
            data = resp.json()
            # Hash of the exact bytes received.
            content_hash = hashlib.sha256(raw).hexdigest()
            items = self._extract_items(data)
            return AdapterResult(
                source_type="filings_api",
                ticker=ticker,
                items=items,
                raw_payload=raw,
                content_hash=content_hash,
                fetched_at=datetime.now(timezone.utc),
                http_status=resp.status_code,
                response_time_ms=round(elapsed_ms, 1),
                metadata={
                    "provider": "sec_edgar",
                    "results_count": len(items),
                    "total_hits": self._total_hits(data),
                    "query": params.get("q", ""),
                    "forms": params.get("forms", ""),
                },
            )
        except httpx.HTTPStatusError as e:
            elapsed_ms = (time.monotonic() - t0) * 1000
            logger.error("SEC EDGAR HTTP error for %s: %s", ticker, e)
            # Keep the status code and body of the failed response.
            return self._error_result(
                ticker, str(e), elapsed_ms,
                http_status=e.response.status_code if e.response else None,
                raw=e.response.content if e.response else b"",
            )
        except httpx.TimeoutException as e:
            elapsed_ms = (time.monotonic() - t0) * 1000
            logger.error("SEC EDGAR timeout for %s: %s", ticker, e)
            return self._error_result(ticker, f"timeout: {e}", elapsed_ms)
        except Exception as e:
            # Boundary catch-all: anything else (connection errors, invalid
            # JSON, unexpected payload shape) becomes an error result.
            elapsed_ms = (time.monotonic() - t0) * 1000
            logger.error("SEC EDGAR fetch failed for %s: %s", ticker, e)
            return self._error_result(ticker, str(e), elapsed_ms)
def _build_request(
self, ticker: str, config: dict[str, Any]
) -> tuple[str, dict[str, str], dict[str, str]]:
"""Build the URL, query params, and headers for an EDGAR EFTS request."""
params: dict[str, str] = {}
headers: dict[str, str] = {"User-Agent": self.user_agent}
# Query: use custom override or default to ticker-based search
query = config.get("query")
if query:
params["q"] = str(query)
else:
params["q"] = f'"{ticker}"'
# Form types filter
forms = config.get("forms", "8-K,10-Q,10-K")
params["forms"] = str(forms)
# Date range
if config.get("start_date"):
params["dateRange"] = "custom"
params["startdt"] = str(config["start_date"])
if config.get("end_date"):
params["dateRange"] = "custom"
params["enddt"] = str(config["end_date"])
# CIK filter (entity-level narrowing)
cik = config.get("cik")
if cik:
params["q"] = f'{params["q"]} AND cik:{cik}'
url = f"{self.base_url}{self.SEARCH_ENDPOINT}"
return url, params, headers
def _extract_items(self, data: dict[str, Any]) -> list[dict[str, Any]]:
"""Extract the filing hits from an EDGAR EFTS response.
EFTS returns results under hits.hits as a list of objects,
each containing _source with fields like file_date, form_type,
entity_name, file_num, and period_of_report.
"""
hits_wrapper = data.get("hits", {})
if not isinstance(hits_wrapper, dict):
return []
hits = hits_wrapper.get("hits", [])
if isinstance(hits, list):
return hits
return []
def _total_hits(self, data: dict[str, Any]) -> int:
"""Extract total hit count from EFTS response."""
hits_wrapper = data.get("hits", {})
if not isinstance(hits_wrapper, dict):
return 0
total = hits_wrapper.get("total", {})
if isinstance(total, dict):
return int(total.get("value", 0))
if isinstance(total, int):
return total
return 0
def _error_result(
    self,
    ticker: str,
    error: str,
    elapsed_ms: float,
    http_status: int | None = None,
    raw: bytes = b"",
) -> AdapterResult:
    """Build the AdapterResult returned when a filings fetch fails.

    The result carries no items and an empty content hash. ``raw`` and
    ``http_status`` are passed through, and the latency is rounded to one
    decimal place.
    """
    failed_at = datetime.now(timezone.utc)
    latency_ms = round(elapsed_ms, 1)
    return AdapterResult(
        source_type="filings_api",
        ticker=ticker,
        items=[],
        raw_payload=raw,
        content_hash="",
        fetched_at=failed_at,
        error=error,
        http_status=http_status,
        response_time_ms=latency_ms,
        metadata={"provider": "sec_edgar"},
    )
+145 -27
View File
@@ -1,8 +1,16 @@
"""Market data API adapter - fetches quotes, bars, and reference data."""
"""Market data API adapter interface and concrete Polygon.io provider.
The MarketDataAdapter is the abstract interface for all market data providers.
PolygonMarketAdapter is the first concrete implementation, targeting the
Polygon.io REST API for previous-day bars, quotes, and ticker details.
Requirements: 2.1, 2.5, 3.1, 3.2, 3.3
"""
import hashlib
import logging
from datetime import datetime
from typing import Any, Dict
import time
from datetime import datetime, timezone
from typing import Any
import httpx
@@ -12,48 +20,158 @@ logger = logging.getLogger("market_adapter")
class MarketDataAdapter(BaseAdapter):
"""Concrete adapter for a market data provider (e.g., Alpha Vantage, Polygon, Yahoo)."""
"""Abstract interface for market data providers.
def __init__(self, api_key: str = "", base_url: str = ""):
self.api_key = api_key
self.base_url = base_url
Subclasses implement fetch() for their specific market data API.
"""
def source_type(self) -> str:
    """Return the source-type tag shared by all market data adapters."""
    return "market_api"
async def fetch(self, ticker: str, config: Dict[str, Any]) -> AdapterResult:
endpoint = config.get("endpoint", "/v2/aggs/ticker/{ticker}/prev")
url = f"{self.base_url}{endpoint.format(ticker=ticker)}"
params = config.get("params", {})
if self.api_key:
params["apiKey"] = self.api_key
class PolygonMarketAdapter(MarketDataAdapter):
"""Concrete adapter for the Polygon.io REST API.
Supports:
- Previous-day aggregate bars (/v2/aggs/ticker/{ticker}/prev)
- Range aggregate bars (/v2/aggs/ticker/{ticker}/range/{multiplier}/{timespan}/{from_date}/{to_date})
- Ticker details (/v3/reference/tickers/{ticker})
The endpoint is selected via the source config's "endpoint" field,
defaulting to previous-day bars.
"""
PREV_BARS = "/v2/aggs/ticker/{ticker}/prev"
RANGE_BARS = "/v2/aggs/ticker/{ticker}/range/{multiplier}/{timespan}/{from_date}/{to_date}"
TICKER_DETAILS = "/v3/reference/tickers/{ticker}"
def __init__(self, api_key: str, base_url: str = "https://api.polygon.io") -> None:
self.api_key: str = api_key
self.base_url: str = base_url.rstrip("/")
async def fetch(self, ticker: str, config: dict[str, Any]) -> AdapterResult:
    """Fetch market data from Polygon.io for a given ticker.

    Network and HTTP failures are not raised; they are reported through an
    error ``AdapterResult`` built by ``_error_result``.

    Config options:
        endpoint: One of "prev_bars" (default), "range_bars", "ticker_details"
        multiplier: Bar multiplier for range queries (default 1)
        timespan: Bar timespan for range queries (default "day")
        from_date: Start date for range queries (YYYY-MM-DD)
        to_date: End date for range queries (YYYY-MM-DD)
        adjusted: Whether bars are adjusted for splits (default true)

    Returns:
        AdapterResult with raw payload, parsed items, and metadata.
    """
    endpoint_key = config.get("endpoint", "prev_bars")
    url, params = self._build_request(ticker, endpoint_key, config)
    async with httpx.AsyncClient(timeout=30) as client:
        # Monotonic clock so the latency is immune to wall-clock changes.
        t0 = time.monotonic()
        try:
            resp = await client.get(url, params=params)
            elapsed_ms = (time.monotonic() - t0) * 1000
            resp.raise_for_status()
            raw = resp.content
            data = resp.json()
            # Hash of the exact bytes received.
            content_hash = hashlib.sha256(raw).hexdigest()
            items = self._extract_items(data, endpoint_key)
            return AdapterResult(
                source_type="market_api",
                ticker=ticker,
                items=items,
                raw_payload=raw,
                content_hash=content_hash,
                fetched_at=datetime.now(timezone.utc),
                http_status=resp.status_code,
                response_time_ms=round(elapsed_ms, 1),
                metadata={
                    "provider": "polygon",
                    "endpoint": endpoint_key,
                    "results_count": data.get("resultsCount", len(items)),
                    "request_id": data.get("request_id", ""),
                },
            )
        except httpx.HTTPStatusError as e:
            elapsed_ms = (time.monotonic() - t0) * 1000
            logger.error("Polygon HTTP error for %s: %s", ticker, e)
            # Keep the status code and body of the failed response.
            return self._error_result(
                ticker, str(e), elapsed_ms,
                http_status=e.response.status_code if e.response else None,
                raw=e.response.content if e.response else b"",
            )
        except httpx.TimeoutException as e:
            elapsed_ms = (time.monotonic() - t0) * 1000
            logger.error("Polygon timeout for %s: %s", ticker, e)
            return self._error_result(ticker, f"timeout: {e}", elapsed_ms)
        except Exception as e:
            # Boundary catch-all: anything else (connection errors, invalid
            # JSON, unexpected payload shape) becomes an error result.
            elapsed_ms = (time.monotonic() - t0) * 1000
            logger.error("Polygon fetch failed for %s: %s", ticker, e)
            return self._error_result(ticker, str(e), elapsed_ms)
def _build_request(
self, ticker: str, endpoint_key: str, config: dict[str, Any]
) -> tuple[str, dict[str, str]]:
"""Build the URL and query params for a Polygon request."""
params: dict[str, str] = {"apiKey": self.api_key}
if endpoint_key == "range_bars":
multiplier = str(config.get("multiplier", 1))
timespan = config.get("timespan", "day")
from_date = config.get("from_date", "")
to_date = config.get("to_date", "")
path = self.RANGE_BARS.format(
ticker=ticker,
multiplier=multiplier,
timespan=timespan,
from_date=from_date,
to_date=to_date,
)
if config.get("adjusted") is not None:
params["adjusted"] = str(config["adjusted"]).lower()
if config.get("sort"):
params["sort"] = config["sort"]
if config.get("limit"):
params["limit"] = str(config["limit"])
elif endpoint_key == "ticker_details":
path = self.TICKER_DETAILS.format(ticker=ticker)
else:
# Default: previous-day bars
path = self.PREV_BARS.format(ticker=ticker)
if config.get("adjusted") is not None:
params["adjusted"] = str(config["adjusted"]).lower()
return f"{self.base_url}{path}", params
def _extract_items(self, data: dict[str, Any], endpoint_key: str) -> list[dict[str, Any]]:
"""Extract the relevant items list from a Polygon response."""
if endpoint_key == "ticker_details":
results = data.get("results", {})
return [results] if isinstance(results, dict) and results else []
# Aggregate endpoints return results as a list
results = data.get("results", [])
if isinstance(results, list):
return results
return [results] if results else []
def _error_result(
    self,
    ticker: str,
    error: str,
    elapsed_ms: float,
    http_status: int | None = None,
    raw: bytes = b"",
) -> AdapterResult:
    """Build the AdapterResult returned when a market data fetch fails.

    The result carries no items and an empty content hash. ``raw`` and
    ``http_status`` are passed through, and the latency is rounded to one
    decimal place.
    """
    failed_at = datetime.now(timezone.utc)
    latency_ms = round(elapsed_ms, 1)
    return AdapterResult(
        source_type="market_api",
        ticker=ticker,
        items=[],
        raw_payload=raw,
        content_hash="",
        fetched_at=failed_at,
        error=error,
        http_status=http_status,
        response_time_ms=latency_ms,
        metadata={"provider": "polygon"},
    )
+135 -30
View File
@@ -1,8 +1,17 @@
"""News API adapter - fetches company-linked headlines and article metadata."""
"""News API adapter interface and concrete Polygon.io news provider.
The NewsDataAdapter is the abstract interface for all news data providers.
PolygonNewsAdapter is the first concrete implementation, targeting the
Polygon.io REST API for company-linked news articles and headlines.
Requirements: 2.2, 2.5, 3.1, 3.2, 3.3
"""
import hashlib
import logging
from datetime import datetime
from typing import Any, Dict
import time
from abc import ABC
from datetime import datetime, timezone
from typing import Any
import httpx
@@ -11,51 +20,147 @@ from .base import AdapterResult, BaseAdapter
logger = logging.getLogger("news_adapter")
class NewsApiAdapter(BaseAdapter):
"""Concrete adapter for a news API provider."""
class NewsDataAdapter(BaseAdapter, ABC):
"""Abstract interface for news data providers.
def __init__(self, api_key: str = "", base_url: str = ""):
self.api_key = api_key
self.base_url = base_url
Subclasses implement fetch() for their specific news API.
source_type() is concrete here since all news adapters share the same type.
"""
def source_type(self) -> str:
    """Return the source-type tag shared by all news adapters."""
    return "news_api"
async def fetch(self, ticker: str, config: Dict[str, Any]) -> AdapterResult:
endpoint = config.get("endpoint", "/v2/everything")
url = f"{self.base_url}{endpoint}"
params = config.get("params", {})
params.setdefault("q", ticker)
params.setdefault("sortBy", "publishedAt")
params.setdefault("pageSize", 20)
if self.api_key:
params["apiKey"] = self.api_key
class PolygonNewsAdapter(NewsDataAdapter):
"""Concrete adapter for the Polygon.io ticker news endpoint.
Supports:
- Ticker news (/v2/reference/news?ticker={ticker})
Config options:
limit: Max articles to return per request (default 20, max 1000)
published_utc_gte: Only articles published on or after this date (YYYY-MM-DD)
published_utc_lte: Only articles published on or before this date (YYYY-MM-DD)
order: Sort order for results, "asc" or "desc" (default "desc")
"""
NEWS_ENDPOINT = "/v2/reference/news"
def __init__(self, api_key: str, base_url: str = "https://api.polygon.io") -> None:
self.api_key: str = api_key
self.base_url: str = base_url.rstrip("/")
async def fetch(self, ticker: str, config: dict[str, Any]) -> AdapterResult:
    """Fetch news articles from Polygon.io for a given ticker.

    Network and HTTP failures are not raised; they are reported through an
    error ``AdapterResult`` built by ``_error_result``.

    Args:
        ticker: The company ticker symbol.
        config: Source-specific configuration from the sources table.

    Returns:
        AdapterResult with raw payload, parsed article items, and metadata.
    """
    url, params = self._build_request(ticker, config)
    async with httpx.AsyncClient(timeout=30) as client:
        # Monotonic clock so the latency is immune to wall-clock changes.
        t0 = time.monotonic()
        try:
            resp = await client.get(url, params=params)
            elapsed_ms = (time.monotonic() - t0) * 1000
            resp.raise_for_status()
            raw = resp.content
            data = resp.json()
            # Hash of the exact bytes received.
            content_hash = hashlib.sha256(raw).hexdigest()
            items = self._extract_items(data)
            return AdapterResult(
                source_type="news_api",
                ticker=ticker,
                items=items,
                raw_payload=raw,
                content_hash=content_hash,
                fetched_at=datetime.now(timezone.utc),
                http_status=resp.status_code,
                response_time_ms=round(elapsed_ms, 1),
                metadata={
                    "provider": "polygon",
                    "results_count": data.get("count", len(items)),
                    "next_url": data.get("next_url", ""),
                    "request_id": data.get("request_id", ""),
                },
            )
        except httpx.HTTPStatusError as e:
            elapsed_ms = (time.monotonic() - t0) * 1000
            logger.error("Polygon news HTTP error for %s: %s", ticker, e)
            # Keep the status code and body of the failed response.
            return self._error_result(
                ticker, str(e), elapsed_ms,
                http_status=e.response.status_code if e.response else None,
                raw=e.response.content if e.response else b"",
            )
        except httpx.TimeoutException as e:
            elapsed_ms = (time.monotonic() - t0) * 1000
            logger.error("Polygon news timeout for %s: %s", ticker, e)
            return self._error_result(ticker, f"timeout: {e}", elapsed_ms)
        except Exception as e:
            # Boundary catch-all: anything else (connection errors, invalid
            # JSON, unexpected payload shape) becomes an error result.
            elapsed_ms = (time.monotonic() - t0) * 1000
            logger.error("Polygon news fetch failed for %s: %s", ticker, e)
            return self._error_result(ticker, str(e), elapsed_ms)
def _build_request(
self, ticker: str, config: dict[str, Any]
) -> tuple[str, dict[str, str]]:
"""Build the URL and query params for a Polygon news request."""
params: dict[str, str] = {
"apiKey": self.api_key,
"ticker": ticker,
}
limit = config.get("limit", 20)
params["limit"] = str(min(int(limit), 1000))
if config.get("order"):
params["order"] = config["order"]
if config.get("published_utc_gte"):
params["published_utc.gte"] = config["published_utc_gte"]
if config.get("published_utc_lte"):
params["published_utc.lte"] = config["published_utc_lte"]
url = f"{self.base_url}{self.NEWS_ENDPOINT}"
return url, params
def _extract_items(self, data: dict[str, Any]) -> list[dict[str, Any]]:
"""Extract the article list from a Polygon news response.
Polygon returns articles under the "results" key as a list of objects,
each containing fields like id, publisher, title, article_url, tickers,
published_utc, description, and keywords.
"""
results = data.get("results", [])
if isinstance(results, list):
return results
return []
def _error_result(
    self,
    ticker: str,
    error: str,
    elapsed_ms: float,
    http_status: int | None = None,
    raw: bytes = b"",
) -> AdapterResult:
    """Build the AdapterResult returned when a news fetch fails.

    The result carries no items and an empty content hash. ``raw`` and
    ``http_status`` are passed through, and the latency is rounded to one
    decimal place.
    """
    failed_at = datetime.now(timezone.utc)
    latency_ms = round(elapsed_ms, 1)
    return AdapterResult(
        source_type="news_api",
        ticker=ticker,
        items=[],
        raw_payload=raw,
        content_hash="",
        fetched_at=failed_at,
        error=error,
        http_status=http_status,
        response_time_ms=latency_ms,
        metadata={"provider": "polygon"},
    )
+603
View File
@@ -0,0 +1,603 @@
"""Paper trading adapter - local order simulation and state sync.
Implements a fully local paper trading engine that simulates order
execution without requiring a real broker API. Tracks positions,
account balance, fills, and order events in-memory with PostgreSQL
persistence for state sync and audit trail.
Requirements: 8.1, 8.3, 8.5, 2.4
Design: Section 4.9 - Broker Adapter (paper mode)
"""
from __future__ import annotations
import json
import logging
import uuid
from datetime import datetime, timezone
from typing import Any
import asyncpg
from services.adapters.broker_adapter import (
AccountInfo,
BrokerDataAdapter,
OrderEventType,
OrderRequest,
OrderResponse,
OrderSide,
OrderStatus,
OrderType,
PositionInfo,
TradingMode,
)
from services.adapters.base import AdapterResult
logger = logging.getLogger("paper_trading")
# ---------------------------------------------------------------------------
# In-memory paper trading state
# ---------------------------------------------------------------------------
class PaperPosition:
    """Long-only paper position for one ticker: size, cost basis, realized PnL."""

    def __init__(
        self,
        ticker: str,
        quantity: float = 0.0,
        avg_entry_price: float = 0.0,
        realized_pnl: float = 0.0,
    ) -> None:
        self.ticker = ticker
        self.quantity = quantity
        self.avg_entry_price = avg_entry_price
        self.realized_pnl = realized_pnl

    def apply_fill(self, side: OrderSide, fill_qty: float, fill_price: float) -> float:
        """Fold a fill into the position and return the PnL it realized.

        Buys re-average the entry price and realize nothing. Sells close at
        most the quantity currently held and realize PnL against the average
        entry price; a sell against an empty position is a no-op.
        """
        if side == OrderSide.BUY:
            cost_basis = self.avg_entry_price * self.quantity + fill_price * fill_qty
            self.quantity += fill_qty
            if self.quantity > 0:
                self.avg_entry_price = cost_basis / self.quantity
            return 0.0

        # Sell side: nothing to close when the position is flat.
        if self.quantity <= 0:
            return 0.0

        closed_qty = min(fill_qty, self.quantity)
        pnl = closed_qty * (fill_price - self.avg_entry_price)
        self.quantity -= closed_qty
        self.realized_pnl += pnl
        if self.quantity <= 0:
            # Fully closed: reset the cost basis along with the size.
            self.quantity = 0.0
            self.avg_entry_price = 0.0
        return pnl

    @property
    def is_open(self) -> bool:
        """True while the position holds a positive quantity."""
        return self.quantity > 0

    def to_position_info(self, current_price: float | None = None) -> PositionInfo:
        """Render the position as a PositionInfo.

        When no ``current_price`` is given the average entry price is used
        as the mark, which makes the unrealized PnL zero.
        """
        mark = self.avg_entry_price if current_price is None else current_price
        holding = self.quantity > 0
        unrealized = (mark - self.avg_entry_price) * self.quantity if holding else 0.0
        market_value = mark * self.quantity
        return PositionInfo(
            ticker=self.ticker,
            quantity=self.quantity,
            avg_entry_price=self.avg_entry_price,
            current_price=mark,
            unrealized_pnl=round(unrealized, 4),
            market_value=round(market_value, 4),
            side="long" if holding else "flat",
        )
class PaperAccount:
    """Mutable in-memory ledger for one paper account.

    Holds cash, per-ticker positions, submitted order responses, the
    pending order-event log, and the idempotency-key index.
    """

    def __init__(
        self,
        account_id: str = "paper-default",
        initial_cash: float = 100_000.0,
    ) -> None:
        self.account_id = account_id
        self.initial_cash = initial_cash
        self.cash = initial_cash
        self.positions: dict[str, PaperPosition] = {}
        self.orders: dict[str, OrderResponse] = {}
        self.order_events: list[dict[str, Any]] = []
        # Maps idempotency key -> order id of the order it produced.
        self._seen_idempotency_keys: dict[str, str] = {}

    @property
    def portfolio_value(self) -> float:
        """Cash plus open positions valued at their average entry price."""
        open_positions = [pos for pos in self.positions.values() if pos.is_open]
        return self.cash + sum(
            pos.quantity * pos.avg_entry_price for pos in open_positions
        )

    @property
    def buying_power(self) -> float:
        """Buying power equals cash; no margin is modelled."""
        return self.cash

    def get_position(self, ticker: str) -> PaperPosition:
        """Return the position for ``ticker``, registering an empty one if absent."""
        try:
            return self.positions[ticker]
        except KeyError:
            created = PaperPosition(ticker=ticker)
            self.positions[ticker] = created
            return created

    def to_account_info(self) -> AccountInfo:
        """Summarise the account as an AccountInfo with values rounded to cents."""
        return AccountInfo(
            account_id=self.account_id,
            buying_power=round(self.buying_power, 2),
            cash=round(self.cash, 2),
            portfolio_value=round(self.portfolio_value, 2),
            currency="USD",
            mode=TradingMode.PAPER,
        )
# ---------------------------------------------------------------------------
# Paper trading adapter
# ---------------------------------------------------------------------------
class PaperTradingAdapter(BrokerDataAdapter):
    """Local paper trading adapter that simulates order execution.

    Orders that pass validation are filled immediately: limit, stop and
    stop-limit orders at their stated price, market orders at an estimated
    price with slippage applied. No real broker API is called.

    Features:
    - Idempotent order submission via idempotency_key (Req 8.5)
    - Full order event trail for audit (Req 8.3)
    - Position tracking with average entry price
    - Cash balance management
    - State sync to/from PostgreSQL

    The adapter is always constructed in PAPER mode; nothing in this class
    switches it to LIVE.
    """

    def __init__(
        self,
        account_id: str = "paper-default",
        initial_cash: float = 100_000.0,
        simulated_slippage_pct: float = 0.001,
    ) -> None:
        super().__init__(mode=TradingMode.PAPER)
        self.account = PaperAccount(account_id=account_id, initial_cash=initial_cash)
        self.slippage_pct = simulated_slippage_pct

    def source_type(self) -> str:
        return "broker"

    async def fetch(self, ticker: str, config: dict[str, Any]) -> AdapterResult:
        """Fetch paper positions/account as a raw artifact snapshot.

        ``config["endpoint"]`` selects the snapshot: ``"account"``,
        ``"orders"`` (``ticker == "*"`` returns every order), or anything
        else for the position in ``ticker``.
        """
        endpoint = config.get("endpoint", "positions")
        now = datetime.now(timezone.utc)

        if endpoint == "account":
            items = [self.account.to_account_info().to_dict()]
        elif endpoint == "orders":
            items = [
                resp.to_dict()
                for resp in self.account.orders.values()
                if resp.ticker == ticker or ticker == "*"
            ]
        else:
            # Read-only lookup: a snapshot must not register an empty
            # position for a ticker that was never traded.
            pos = self.account.positions.get(ticker)
            if pos is not None and pos.is_open:
                items = [pos.to_position_info().to_dict()]
            else:
                items = []

        raw = json.dumps(items).encode()
        return AdapterResult(
            source_type="broker",
            ticker=ticker,
            items=items,
            raw_payload=raw,
            content_hash="",
            fetched_at=now,
            metadata={"provider": "paper", "mode": "paper", "endpoint": endpoint},
        )

    async def submit_order(self, order: OrderRequest) -> OrderResponse:
        """Simulate order submission and immediate fill.

        Idempotency: if the same idempotency_key was already used,
        return the original response (Req 8.5).

        Orders are rejected (and recorded) when the quantity is not
        positive, when a buy exceeds available cash, or when a sell exceeds
        the held quantity.
        """
        # Idempotency check
        existing_id = self.account._seen_idempotency_keys.get(order.idempotency_key)
        if existing_id and existing_id in self.account.orders:
            logger.info("Duplicate order key %s — returning cached response", order.idempotency_key)
            return self.account.orders[existing_id]

        now = datetime.now(timezone.utc)
        order_id = str(uuid.uuid4())

        # A zero or negative quantity would otherwise invert the cash and
        # position arithmetic below (e.g. a negative BUY credits cash).
        if order.quantity <= 0:
            return self._reject(order, order_id, now, f"Invalid quantity: {order.quantity}")

        # Determine fill price based on order type
        fill_price = self._compute_fill_price(order)

        # Check if we have enough cash for buys
        if order.side == OrderSide.BUY:
            required_cash = fill_price * order.quantity
            if required_cash > self.account.cash:
                return self._reject(
                    order, order_id, now,
                    f"Insufficient cash: need {required_cash:.2f}, have {self.account.cash:.2f}",
                )

        # Check if we have enough shares for sells
        if order.side == OrderSide.SELL:
            held = self.account.positions.get(order.ticker)
            held_qty = held.quantity if held is not None else 0.0
            if held_qty < order.quantity:
                return self._reject(
                    order, order_id, now,
                    f"Insufficient shares: need {order.quantity}, have {held_qty}",
                )

        # Simulate immediate fill
        position = self.account.get_position(order.ticker)
        realized_pnl = position.apply_fill(order.side, order.quantity, fill_price)

        # Update cash
        if order.side == OrderSide.BUY:
            self.account.cash -= fill_price * order.quantity
        else:
            self.account.cash += fill_price * order.quantity

        resp = OrderResponse(
            broker_order_id=order_id,
            status=OrderStatus.FILLED,
            ticker=order.ticker,
            side=order.side,
            quantity=order.quantity,
            filled_quantity=order.quantity,
            filled_avg_price=fill_price,
            submitted_at=now,
            raw_response={
                "realized_pnl": round(realized_pnl, 4),
                "cash_after": round(self.account.cash, 2),
                "position_qty_after": position.quantity,
                "simulated": True,
            },
        )

        # Record events
        self._record_event(order_id, OrderEventType.SUBMITTED, {"ticker": order.ticker}, now)
        self._record_event(order_id, OrderEventType.ACCEPTED, {"ticker": order.ticker}, now)
        self._record_event(order_id, OrderEventType.FILL, {
            "fill_price": fill_price,
            "fill_qty": order.quantity,
            "realized_pnl": round(realized_pnl, 4),
        }, now)

        self.account.orders[order_id] = resp
        self.account._seen_idempotency_keys[order.idempotency_key] = order_id

        logger.info(
            "Paper fill: %s %s %.0f %s @ %.2f | cash=%.2f pnl=%.4f",
            order_id[:8], order.side.value, order.quantity,
            order.ticker, fill_price, self.account.cash, realized_pnl,
        )
        return resp

    async def cancel_order(self, broker_order_id: str) -> OrderResponse:
        """Cancel a paper order. Only pending orders can be cancelled.

        Unknown orders and orders already in a terminal state (filled,
        rejected, cancelled) yield a REJECTED response and leave the stored
        order untouched.
        """
        existing = self.account.orders.get(broker_order_id)
        if existing is None:
            return OrderResponse(
                broker_order_id=broker_order_id,
                status=OrderStatus.REJECTED,
                ticker="",
                side=OrderSide.BUY,
                quantity=0,
                error=f"Order {broker_order_id} not found",
            )

        # Paper orders fill immediately, so they can't be cancelled
        if existing.status == OrderStatus.FILLED:
            return OrderResponse(
                broker_order_id=broker_order_id,
                status=OrderStatus.REJECTED,
                ticker=existing.ticker,
                side=existing.side,
                quantity=existing.quantity,
                error="Cannot cancel a filled order",
            )

        # Rejected/cancelled orders are terminal too; rewriting them as
        # CANCELLED would falsify the audit trail.
        if existing.status in (OrderStatus.REJECTED, OrderStatus.CANCELLED):
            return OrderResponse(
                broker_order_id=broker_order_id,
                status=OrderStatus.REJECTED,
                ticker=existing.ticker,
                side=existing.side,
                quantity=existing.quantity,
                error=f"Cannot cancel order in terminal state: {existing.status.value}",
            )

        now = datetime.now(timezone.utc)
        cancelled = OrderResponse(
            broker_order_id=broker_order_id,
            status=OrderStatus.CANCELLED,
            ticker=existing.ticker,
            side=existing.side,
            quantity=existing.quantity,
            submitted_at=existing.submitted_at,
        )
        self.account.orders[broker_order_id] = cancelled
        self._record_event(broker_order_id, OrderEventType.CANCELLED, {}, now)
        return cancelled

    async def get_order_status(self, broker_order_id: str) -> OrderResponse:
        """Get the status of a paper order.

        Unknown ids produce a REJECTED response with a "not found" error.
        """
        existing = self.account.orders.get(broker_order_id)
        if existing is None:
            return OrderResponse(
                broker_order_id=broker_order_id,
                status=OrderStatus.REJECTED,
                ticker="",
                side=OrderSide.BUY,
                quantity=0,
                error=f"Order {broker_order_id} not found",
            )
        return existing

    async def get_positions(self) -> list[PositionInfo]:
        """Get all open paper positions."""
        return [
            p.to_position_info()
            for p in self.account.positions.values()
            if p.is_open
        ]

    async def get_account(self) -> AccountInfo:
        """Get paper account summary."""
        return self.account.to_account_info()

    # -----------------------------------------------------------------------
    # Internal helpers
    # -----------------------------------------------------------------------

    def _reject(
        self,
        order: OrderRequest,
        order_id: str,
        now: datetime,
        reason: str,
    ) -> OrderResponse:
        """Build, record and store a REJECTED response for ``order``.

        The idempotency key is remembered as well, so a retry of the same
        request returns this rejection instead of being re-evaluated.
        """
        resp = OrderResponse(
            broker_order_id=order_id,
            status=OrderStatus.REJECTED,
            ticker=order.ticker,
            side=order.side,
            quantity=order.quantity,
            submitted_at=now,
            error=reason,
        )
        self._record_event(order_id, OrderEventType.REJECTED, resp.to_dict(), now)
        self.account.orders[order_id] = resp
        self.account._seen_idempotency_keys[order.idempotency_key] = order_id
        return resp

    def _compute_fill_price(self, order: OrderRequest) -> float:
        """Determine the simulated fill price for an order.

        Limit and stop-limit orders fill at the limit price.
        Stop orders fill at the stop price.
        Anything else is treated as a market order: the limit_price is used
        as the price estimate (100.0 when it is not set) and slippage is
        applied against the trader - up for buys, down for sells.
        """
        if order.order_type == OrderType.LIMIT and order.limit_price is not None:
            return order.limit_price
        if order.order_type == OrderType.STOP and order.stop_price is not None:
            return order.stop_price
        if order.order_type == OrderType.STOP_LIMIT and order.limit_price is not None:
            return order.limit_price

        # Market order: use limit_price as estimate, or a default
        base_price = order.limit_price if order.limit_price is not None else 100.0
        if order.side == OrderSide.BUY:
            return round(base_price * (1 + self.slippage_pct), 4)
        return round(base_price * (1 - self.slippage_pct), 4)

    def _record_event(
        self,
        order_id: str,
        event_type: OrderEventType,
        data: dict[str, Any],
        timestamp: datetime,
    ) -> None:
        """Record an order event for audit trail."""
        self.account.order_events.append({
            "order_id": order_id,
            "event_type": event_type.value,
            "data": data,
            "timestamp": timestamp.isoformat(),
        })
# ---------------------------------------------------------------------------
# State sync: persist and restore paper trading state to/from PostgreSQL
# ---------------------------------------------------------------------------
# SQL for persisting paper orders to the orders table
_INSERT_PAPER_ORDER = """
INSERT INTO orders (
id, recommendation_id, broker_account_id, ticker, side, order_type,
quantity, limit_price, stop_price, status, idempotency_key,
broker_order_id, decision_trace, submitted_at, filled_at,
fill_price, fill_quantity
) VALUES (
$1::uuid, $2, $3, $4, $5, $6,
$7, $8, $9, $10, $11,
$12, $13::jsonb, $14, $15,
$16, $17
)
ON CONFLICT (idempotency_key) DO NOTHING
"""
_INSERT_PAPER_ORDER_EVENT = """
INSERT INTO order_events (order_id, event_type, data, broker_timestamp)
VALUES ($1::uuid, $2, $3::jsonb, $4)
"""
_UPSERT_PAPER_POSITION = """
INSERT INTO positions (broker_account_id, ticker, quantity, avg_entry_price, realized_pnl, updated_at)
VALUES ($1, $2, $3, $4, $5, $6)
ON CONFLICT (broker_account_id, ticker)
DO UPDATE SET
quantity = EXCLUDED.quantity,
avg_entry_price = EXCLUDED.avg_entry_price,
realized_pnl = EXCLUDED.realized_pnl,
updated_at = EXCLUDED.updated_at
"""
_UPSERT_PAPER_ACCOUNT = """
INSERT INTO broker_accounts (id, provider, account_id, mode, config, active)
VALUES ($1::uuid, 'paper', $2, 'paper', $3::jsonb, TRUE)
ON CONFLICT (id) DO UPDATE SET
config = EXCLUDED.config,
active = TRUE
"""
_LOAD_PAPER_POSITIONS = """
SELECT ticker, quantity, avg_entry_price, COALESCE(realized_pnl, 0) AS realized_pnl
FROM positions
WHERE broker_account_id = $1 AND quantity > 0
"""
_LOAD_PAPER_ACCOUNT_CONFIG = """
SELECT config FROM broker_accounts
WHERE account_id = $1 AND mode = 'paper' AND active = TRUE
LIMIT 1
"""
_LOAD_PAPER_ORDERS = """
SELECT
id, ticker, side, order_type, quantity, status,
idempotency_key, broker_order_id, fill_price, fill_quantity,
submitted_at
FROM orders
WHERE broker_account_id = (
SELECT id FROM broker_accounts WHERE account_id = $1 AND mode = 'paper' LIMIT 1
)
ORDER BY submitted_at DESC
LIMIT 500
"""
async def sync_state_to_db(
    adapter: PaperTradingAdapter,
    pool: asyncpg.Pool,
    broker_account_uuid: str | None = None,
) -> None:
    """Persist the current paper trading state to PostgreSQL.

    Writes, inside a single transaction:
    - broker_accounts row for the paper account
    - positions rows for every tracked position
    - orders rows for all orders (idempotent via ON CONFLICT)
    - order_events for audit trail

    This enables state recovery after restarts and provides the
    full execution audit trail (Requirement 8.3).

    Args:
        adapter: Adapter whose in-memory account is persisted.
        pool: asyncpg pool used to acquire the connection.
        broker_account_uuid: ``broker_accounts.id`` to write under. When
            omitted, a deterministic UUIDv5 derived from the account id is
            used.
    """
    acct = adapter.account
    now = datetime.now(timezone.utc)
    acct_uuid = broker_account_uuid or str(uuid.uuid5(uuid.NAMESPACE_DNS, acct.account_id))

    # Snapshot mutable state before the first await. submit_order() can run
    # while this coroutine is suspended on the database; iterating the live
    # dicts would then raise "changed size during iteration", and clearing
    # the live event list afterwards would drop events that were never
    # written.
    positions = list(acct.positions.items())
    orders = list(acct.orders.items())
    events = list(acct.order_events)

    async with pool.acquire() as conn:
        async with conn.transaction():
            # 1. Upsert broker account
            config_json = json.dumps({
                "initial_cash": acct.initial_cash,
                "current_cash": round(acct.cash, 2),
                "portfolio_value": round(acct.portfolio_value, 2),
                "slippage_pct": adapter.slippage_pct,
            })
            await conn.execute(_UPSERT_PAPER_ACCOUNT, acct_uuid, acct.account_id, config_json)

            # 2. Upsert positions
            for ticker, pos in positions:
                await conn.execute(
                    _UPSERT_PAPER_POSITION,
                    acct_uuid, ticker,
                    pos.quantity, pos.avg_entry_price, pos.realized_pnl,
                    now,
                )

            # 3. Insert orders (idempotent)
            for order_id, resp in orders:
                # Paper fills happen at submission time, so the fill
                # timestamp is the order's own timestamp, not the time
                # this sync happens to run.
                filled_at = (
                    (resp.submitted_at or now)
                    if resp.status == OrderStatus.FILLED
                    else None
                )
                await conn.execute(
                    _INSERT_PAPER_ORDER,
                    order_id,
                    None,  # recommendation_id
                    acct_uuid,
                    resp.ticker,
                    resp.side.value,
                    "market",  # paper orders are always market-simulated
                    resp.quantity,
                    resp.filled_avg_price,  # limit_price
                    None,  # stop_price
                    resp.status.value,
                    order_id,  # use order_id as idempotency_key fallback
                    order_id,
                    json.dumps(resp.raw_response),
                    resp.submitted_at,
                    filled_at,
                    resp.filled_avg_price,
                    resp.filled_quantity,
                )

            # 4. Insert order events
            for event in events:
                await conn.execute(
                    _INSERT_PAPER_ORDER_EVENT,
                    event["order_id"],
                    event["event_type"],
                    json.dumps(event["data"]),
                    datetime.fromisoformat(event["timestamp"]),
                )

    logger.info(
        "Synced paper state to DB: account=%s positions=%d orders=%d events=%d",
        acct.account_id, len(positions), len(orders), len(events),
    )

    # Drop only the events that were written. Events are appended at the
    # tail, so anything recorded during the sync stays queued for the next one.
    del acct.order_events[:len(events)]
async def load_state_from_db(
    adapter: PaperTradingAdapter,
    pool: asyncpg.Pool,
) -> bool:
    """Rehydrate the adapter's paper account from PostgreSQL.

    Reads the saved account config (for the cash balance) and the stored
    positions, writing both into ``adapter.account``. Returns False when no
    saved account row exists, True otherwise.
    """
    acct = adapter.account

    async with pool.acquire() as conn:
        # Account config first: without it there is nothing to restore.
        account_row = await conn.fetchrow(_LOAD_PAPER_ACCOUNT_CONFIG, acct.account_id)
        if account_row is None:
            logger.info("No saved paper account state for %s", acct.account_id)
            return False

        stored_config = account_row["config"]
        config = json.loads(stored_config) if isinstance(stored_config, str) else stored_config
        acct.cash = float(config.get("current_cash", acct.initial_cash))

        # Positions are merged over whatever the account already tracks.
        position_rows = await conn.fetch(_LOAD_PAPER_POSITIONS, acct.account_id)
        acct.positions.update({
            record["ticker"]: PaperPosition(
                ticker=record["ticker"],
                quantity=float(record["quantity"]),
                avg_entry_price=float(record["avg_entry_price"] or 0),
                realized_pnl=float(record["realized_pnl"]),
            )
            for record in position_rows
        })

    logger.info(
        "Loaded paper state from DB: account=%s cash=%.2f positions=%d",
        acct.account_id, acct.cash, len(acct.positions),
    )
    return True
+241
View File
@@ -0,0 +1,241 @@
"""Resilient adapter wrapper with rate-limit coordination, retries, and backoff.
Wraps any BaseAdapter with:
- Per-source-type rate limiting via Redis (distributed across workers)
- Exponential backoff with jitter on retryable failures
- Configurable retry counts and retryable HTTP status codes
- Graceful degradation when Redis is unavailable
Requirements: 2.5, 3.4
"""
import asyncio
import logging
import random
import time
from dataclasses import dataclass
from typing import Any
import redis.asyncio as aioredis
from services.shared.redis_keys import rate_limit_key
from .base import AdapterResult, BaseAdapter
logger = logging.getLogger("resilient_adapter")
# HTTP status codes that are safe to retry
# (429 Too Many Requests plus the 500/502/503/504 server-side failures).
RETRYABLE_STATUS_CODES: frozenset[int] = frozenset({429, 500, 502, 503, 504})


@dataclass
class RetryConfig:
    """Configuration for retry and rate-limit behavior."""

    # Retries after the first attempt; fetch makes up to max_retries + 1 calls.
    max_retries: int = 3
    # Backoff before retry N is base_delay * 2**N seconds (N starts at 0).
    base_delay: float = 1.0
    # Upper bound applied to the exponential backoff and to Retry-After hints.
    max_delay: float = 60.0
    # Random extra delay of up to (capped delay * jitter_factor) is added.
    jitter_factor: float = 0.5
    # Failed results with one of these HTTP statuses are retried.
    retryable_status_codes: frozenset[int] = RETRYABLE_STATUS_CODES
    # Rate limit: max requests per window per source type
    rate_limit_max: int = 30
    rate_limit_window_seconds: int = 60
# Sensible defaults per source type.
# ResilientAdapter looks these up by the wrapped adapter's source_type() when
# no explicit RetryConfig is passed; unknown source types get RetryConfig().
DEFAULT_RETRY_CONFIGS: dict[str, RetryConfig] = {
    "market_api": RetryConfig(max_retries=3, rate_limit_max=30),
    "news_api": RetryConfig(max_retries=3, rate_limit_max=20),
    "filings_api": RetryConfig(max_retries=2, rate_limit_max=10, base_delay=2.0),
    "web_scrape": RetryConfig(max_retries=2, rate_limit_max=10, base_delay=2.0),
    "broker": RetryConfig(max_retries=2, rate_limit_max=60, base_delay=0.5),
}
def compute_delay(attempt: int, config: RetryConfig) -> float:
    """Return the backoff (in seconds) to sleep before retrying ``attempt``.

    The delay doubles with each attempt starting from ``config.base_delay``,
    is capped at ``config.max_delay``, and then gains a random jitter of up
    to ``jitter_factor`` times the capped value.
    """
    backoff = min(config.base_delay * (2 ** attempt), config.max_delay)
    return backoff + backoff * config.jitter_factor * random.random()
@dataclass
class RetryStats:
    """Tracks retry statistics for observability."""

    # Number of fetch attempts started so far (1-based).
    attempts: int = 0
    # Seconds spent sleeping: rate-limit waits plus retry backoff.
    total_delay: float = 0.0
    # How many attempts had to wait for the rate-limit window.
    rate_limited_waits: int = 0
    # Error string of the most recent failed attempt, if any.
    last_error: str | None = None
    # Whether the most recent failure was classified as retryable.
    retryable: bool = False
class ResilientAdapter:
    """Wraps a BaseAdapter with rate-limit coordination, retries, and backoff.

    Usage:
        adapter = PolygonMarketAdapter(api_key="...")
        resilient = ResilientAdapter(adapter, redis=rds)
        result = await resilient.fetch(ticker, config)

    If redis is None, rate limiting is skipped (local dev / testing).
    """

    def __init__(
        self,
        adapter: BaseAdapter,
        redis: aioredis.Redis | None = None,
        retry_config: RetryConfig | None = None,
    ) -> None:
        self._adapter = adapter
        self._redis = redis
        source_type = adapter.source_type()
        # An explicit config wins; otherwise use the per-source-type default.
        self._config = retry_config or DEFAULT_RETRY_CONFIGS.get(
            source_type, RetryConfig()
        )

    @property
    def adapter(self) -> BaseAdapter:
        """Access the underlying adapter."""
        return self._adapter

    @property
    def config(self) -> RetryConfig:
        """The retry/rate-limit configuration in effect."""
        return self._config

    def source_type(self) -> str:
        return self._adapter.source_type()

    async def _check_rate_limit(self) -> float:
        """Check distributed rate limit via Redis.

        Returns 0.0 if allowed, or the number of seconds to wait.
        Any Redis failure is logged and treated as "allowed".
        """
        if self._redis is None:
            return 0.0

        source_type = self._adapter.source_type()
        window_sec = self._config.rate_limit_window_seconds
        # Use a time-bucketed key so counters auto-expire
        bucket = int(time.time()) // window_sec
        key = rate_limit_key(source_type, str(bucket))

        try:
            count = await self._redis.incr(key)
            if count == 1:
                # First hit in this bucket: give the counter a TTL of two
                # windows so it outlives the bucket it belongs to.
                await self._redis.expire(key, window_sec * 2)
            if count > self._config.rate_limit_max:
                # Over limit — compute how long until the window rolls over
                elapsed_in_window = time.time() % window_sec
                wait = window_sec - elapsed_in_window
                return max(wait, 0.5)
        except Exception:
            # Redis unavailable — degrade gracefully, allow the request
            logger.warning("Redis rate-limit check failed, allowing request")
        return 0.0

    def _is_retryable(self, result: AdapterResult) -> bool:
        """Determine if a failed result is worth retrying."""
        if result.ok:
            return False
        # Retry on known retryable HTTP status codes
        if result.http_status and result.http_status in self._config.retryable_status_codes:
            return True
        # Retry on timeouts
        if result.error and "timeout" in result.error.lower():
            return True
        # Retry on connection errors
        if result.error and any(
            kw in result.error.lower()
            for kw in ("connection", "connect", "reset", "refused")
        ):
            return True
        return False

    def _extract_retry_after(self, result: AdapterResult) -> float | None:
        """Extract Retry-After hint from result metadata if present.

        Returns None when the hint is absent or cannot be read as a number.
        """
        retry_after = result.metadata.get("retry_after")
        if retry_after is not None:
            try:
                return float(retry_after)
            except (ValueError, TypeError):
                pass
        return None

    async def fetch(self, ticker: str, config: dict[str, Any]) -> AdapterResult:
        """Fetch with rate-limit coordination, retries, and exponential backoff.

        Returns the AdapterResult from the underlying adapter. On retryable
        failures, retries up to max_retries times with exponential backoff
        and jitter. Rate-limit waits are applied before each attempt. At
        least one attempt is always made, even if max_retries is negative.

        The returned result's metadata includes retry stats under the
        "retry_stats" key; failed results additionally carry
        ``"exhausted": True`` and the last error.
        """
        stats = RetryStats()
        # Clamp so a misconfigured negative value still yields one attempt
        # instead of falling through with no result at all.
        max_retries = max(self._config.max_retries, 0)
        attempt = 0

        while True:
            stats.attempts = attempt + 1

            # Rate limit check
            wait = await self._check_rate_limit()
            if wait > 0:
                stats.rate_limited_waits += 1
                logger.info(
                    "Rate limited for %s/%s, waiting %.1fs",
                    self.source_type(), ticker, wait,
                )
                stats.total_delay += wait
                await asyncio.sleep(wait)

            # Execute the fetch
            result = await self._adapter.fetch(ticker, config)

            # Success — attach stats and return
            if result.ok:
                result.metadata["retry_stats"] = {
                    "attempts": stats.attempts,
                    "total_delay": round(stats.total_delay, 2),
                    "rate_limited_waits": stats.rate_limited_waits,
                }
                return result

            stats.last_error = result.error
            stats.retryable = self._is_retryable(result)

            # Stop on permanent failures and once the retry budget is spent
            # (no sleep after the final attempt).
            if not stats.retryable or attempt >= max_retries:
                break

            # Respect Retry-After header for 429s
            retry_after = self._extract_retry_after(result)
            if result.http_status == 429 and retry_after is not None:
                # Bound the hint on both sides; a negative value means
                # "retry now", not an error.
                delay = max(0.0, min(retry_after, self._config.max_delay))
            else:
                delay = compute_delay(attempt, self._config)

            logger.info(
                "Retrying %s/%s (attempt %d/%d) after %.1fs: %s",
                self.source_type(), ticker, attempt + 1,
                max_retries + 1, delay, result.error,
            )
            stats.total_delay += delay
            await asyncio.sleep(delay)
            attempt += 1

        # Gave up — return the last result with stats
        result.metadata["retry_stats"] = {
            "attempts": stats.attempts,
            "total_delay": round(stats.total_delay, 2),
            "rate_limited_waits": stats.rate_limited_waits,
            "exhausted": True,
            "last_error": stats.last_error,
        }
        return result
+321
View File
@@ -0,0 +1,321 @@
"""Web scrape adapter for curated URLs and article pages.
Fetches full article HTML from curated URLs (investor relations pages,
press releases, earnings transcripts, etc.) using httpx + BeautifulSoup,
with content hashing, boilerplate awareness, and quality scoring.
Inspired by Noctipede crawler patterns: BeautifulSoup + requests with retry
adapters, content hashing, boilerplate stripping, quality scoring.
Requirements: 1.2, 2.5, 3.1, 3.2, 3.3, 3.4
"""
import json
import logging
import time
from datetime import datetime, timezone
from urllib.parse import urlparse
from typing import Any
import httpx
from bs4 import BeautifulSoup
from services.shared.content import content_hash, normalize_url
from .base import AdapterResult, BaseAdapter
logger = logging.getLogger("web_scrape_adapter")
# Default request settings
# Timeout passed to the HTTP client when the source config has no "timeout".
DEFAULT_TIMEOUT = 30
# User-Agent header sent when the source config has no "user_agent".
DEFAULT_USER_AGENT = "StonksOracle/1.0 (+https://stonks-oracle.celestium.life)"
# Responses whose body exceeds this many bytes are recorded as errors.
MAX_CONTENT_LENGTH = 10 * 1024 * 1024  # 10MB cap
def _truthy_attr(tag: Any, attr: str) -> Any:
    """Return ``tag[attr]`` when the tag exists and the value is non-empty, else None."""
    if not tag:
        return None
    value = tag.get(attr)
    return value if value else None


def _first_json_ld_date_published(soup: BeautifulSoup) -> str | None:
    """Return the first ``datePublished`` found in the page's JSON-LD scripts.

    Scripts are scanned in document order. Each may hold a single object or
    a list of objects; unparseable scripts are skipped.
    """
    for script in soup.find_all("script", type="application/ld+json"):
        raw = script.string
        if not raw or "datePublished" not in raw:
            continue
        try:
            ld = json.loads(raw)
        except (json.JSONDecodeError, TypeError):
            continue
        candidates = ld if isinstance(ld, list) else [ld]
        for node in candidates:
            if isinstance(node, dict) and "datePublished" in node:
                # First match wins, whether it came from an object or a
                # list, so a later script can never overwrite it.
                return str(node["datePublished"])
    return None


def extract_metadata_from_html(html: str, url: str) -> dict[str, str | None]:
    """Extract page-level metadata from an HTML document.

    Args:
        html: Raw HTML of the page.
        url: URL the page was fetched from; used as the fallback for the
            publisher (hostname) and the canonical URL.

    Returns:
        A dict with the keys ``title``, ``author``, ``publisher``,
        ``published_at``, ``canonical_url``, ``language`` and
        ``description``. Missing text fields are ``""``; a missing
        ``published_at`` is ``None``; ``language`` defaults to ``"en"``.
    """
    soup = BeautifulSoup(html, "html.parser")
    meta: dict[str, str | None] = {}

    # Title: prefer og:title, then <title>
    og_title = _truthy_attr(soup.find("meta", property="og:title"), "content")
    if og_title is not None:
        meta["title"] = og_title.strip() if isinstance(og_title, str) else ""
    elif soup.title and soup.title.string:
        meta["title"] = soup.title.string.strip()
    else:
        meta["title"] = ""

    # Author
    author = _truthy_attr(soup.find("meta", attrs={"name": "author"}), "content")
    if author is not None:
        meta["author"] = author.strip() if isinstance(author, str) else ""
    else:
        meta["author"] = ""

    # Publisher: og:site_name, falling back to the URL's hostname
    site_name = _truthy_attr(soup.find("meta", property="og:site_name"), "content")
    if site_name is not None:
        meta["publisher"] = site_name.strip() if isinstance(site_name, str) else ""
    else:
        meta["publisher"] = urlparse(url).hostname or ""

    # Published date: article:published_time, then JSON-LD datePublished
    pub_time = _truthy_attr(
        soup.find("meta", property="article:published_time"), "content"
    )
    if pub_time is not None:
        meta["published_at"] = pub_time.strip() if isinstance(pub_time, str) else None
    else:
        meta["published_at"] = _first_json_ld_date_published(soup)

    # Canonical URL: <link rel="canonical">, then og:url, then the fetch URL
    canonical = _truthy_attr(soup.find("link", rel="canonical"), "href")
    if canonical is not None:
        meta["canonical_url"] = str(canonical)
    else:
        og_url = _truthy_attr(soup.find("meta", property="og:url"), "content")
        meta["canonical_url"] = str(og_url) if og_url is not None else normalize_url(url)

    # Language: first five characters of <html lang="...">
    lang = _truthy_attr(soup.find("html"), "lang")
    meta["language"] = str(lang)[:5] if lang is not None else "en"

    # Description for summary
    desc_tag = soup.find("meta", property="og:description") or soup.find(
        "meta", attrs={"name": "description"}
    )
    description = _truthy_attr(desc_tag, "content")
    if description is not None:
        meta["description"] = description.strip() if isinstance(description, str) else ""
    else:
        meta["description"] = ""

    return meta
def extract_body_text(html: str) -> str:
    """Return the page's main text with one non-empty line per row.

    Script, style, navigation, header/footer and similar elements are
    removed first. The text is then taken from the first <article>, else
    the first <div> whose class mentions a known article-body marker, else
    <body>, else the whole document.
    """
    soup = BeautifulSoup(html, "html.parser")

    # Remove non-content elements
    noise_tags = ["script", "style", "nav", "footer", "header", "aside", "iframe", "noscript"]
    for junk in soup.find_all(noise_tags):
        junk.decompose()

    def looks_like_article(div: Any) -> bool:
        classes = div.get("class", [])
        if isinstance(classes, list):
            joined = " ".join(classes)
        elif classes:
            joined = str(classes)
        else:
            joined = ""
        markers = ["article-body", "post-content", "entry-content", "story-body"]
        return any(marker in joined for marker in markers)

    # Try to find article body
    container = soup.find("article")
    if not container:
        container = next(
            (div for div in soup.find_all("div") if looks_like_article(div)), None
        )

    if container:
        text = container.get_text(separator="\n", strip=True)
    else:
        # Fallback: use body
        body = soup.find("body")
        source = body if body else soup
        text = source.get_text(separator="\n", strip=True)

    # Collapse whitespace
    stripped = (row.strip() for row in text.splitlines())
    return "\n".join(row for row in stripped if row)
class WebScrapeAdapter(BaseAdapter):
    """Adapter for fetching curated web pages and article URLs.

    Config options (from source config):
        urls: List of URLs to scrape for this company (a single string is
            treated as a one-element list)
        url: Single URL to scrape (alternative to urls)
        timeout: Request timeout in seconds (default 30)
        user_agent: Custom user agent string
        follow_links: Whether to follow article links from index pages
            (default False). NOTE: fetch() does not read this option.
        max_pages: Max pages to fetch per cycle (default 5, capped at 20)
    """

    def __init__(self) -> None:
        pass

    def source_type(self) -> str:
        return "web_scrape"

    def bucket_name(self) -> str:
        """Web scrape artifacts go to the news raw bucket."""
        return "stonks-raw-news"

    async def fetch(self, ticker: str, config: dict[str, Any]) -> AdapterResult:
        """Fetch HTML from curated URLs for a given ticker.

        Supports both single URL and multi-URL configs. Each URL is fetched
        and parsed into one item (metadata, body text, content hash). The
        result succeeds if at least one page was fetched; per-URL failures
        are reported in ``metadata["errors"]``. ``response_time_ms`` is the
        sum of the per-URL request times, each counted once.
        """
        urls = config.get("urls", [])
        if isinstance(urls, str):
            # A bare string would otherwise be iterated character by character.
            urls = [urls]
        if not urls and config.get("url"):
            urls = [config["url"]]

        if not urls:
            return self._error_result(ticker, "No URLs configured for web_scrape source", 0)

        timeout = config.get("timeout", DEFAULT_TIMEOUT)
        user_agent = config.get("user_agent", DEFAULT_USER_AGENT)
        # Clamp to 0..20; a negative value would slice from the end of the list.
        max_pages = max(0, min(config.get("max_pages", 5), 20))

        items: list[dict[str, Any]] = []
        total_elapsed = 0.0
        errors: list[str] = []

        async with httpx.AsyncClient(
            timeout=timeout,
            follow_redirects=True,
            headers={"User-Agent": user_agent},
        ) as client:
            for url in urls[:max_pages]:
                item, error, elapsed_ms = await self._scrape_page(client, ticker, url)
                total_elapsed += elapsed_ms
                if item is not None:
                    items.append(item)
                if error is not None:
                    errors.append(error)

        if not items:
            error_msg = "; ".join(errors) if errors else "No pages fetched"
            return self._error_result(ticker, error_msg, total_elapsed)

        # Combine per-page summaries into a single artifact
        combined_raw = json.dumps({
            "ticker": ticker,
            "fetched_at": datetime.now(timezone.utc).isoformat(),
            "pages": [
                {
                    "url": item["url"],
                    "content_hash": item["content_hash"],
                    "html_length": item["html_length"],
                    "body_length": item["body_length"],
                }
                for item in items
            ],
            "errors": errors,
        }).encode("utf-8")

        # Hash of the concatenated per-page hashes
        combined_hash = content_hash(
            b"".join(item["content_hash"].encode() for item in items)
        )

        return AdapterResult(
            source_type="web_scrape",
            ticker=ticker,
            items=items,
            raw_payload=combined_raw,
            content_hash=combined_hash,
            fetched_at=datetime.now(timezone.utc),
            http_status=200,
            response_time_ms=round(total_elapsed, 1),
            metadata={
                "provider": "web_scrape",
                "pages_fetched": len(items),
                "pages_failed": len(errors),
                "errors": errors,
            },
        )

    async def _scrape_page(
        self,
        client: httpx.AsyncClient,
        ticker: str,
        url: str,
    ) -> tuple[dict[str, Any] | None, str | None, float]:
        """Fetch and parse one URL.

        Returns ``(item, error, elapsed_ms)`` where exactly one of ``item``
        and ``error`` is set. ``elapsed_ms`` is the request time: measured
        when the response arrives, or when the failure surfaced if no
        response was received.
        """
        t0 = time.monotonic()
        elapsed_ms: float | None = None
        try:
            resp = await client.get(url)
            elapsed_ms = (time.monotonic() - t0) * 1000
            resp.raise_for_status()

            # Content length guard
            if len(resp.content) > MAX_CONTENT_LENGTH:
                error = f"Content too large for {url}: {len(resp.content)} bytes"
                return None, error, elapsed_ms

            html = resp.text
            raw_bytes = resp.content
            meta = extract_metadata_from_html(html, url)
            body_text = extract_body_text(html)

            item: dict[str, Any] = {
                "url": url,
                "canonical_url": meta.get("canonical_url", normalize_url(url)),
                "title": meta.get("title", ""),
                "author": meta.get("author", ""),
                "publisher": meta.get("publisher", ""),
                "published_at": meta.get("published_at"),
                "language": meta.get("language", "en"),
                "description": meta.get("description", ""),
                "content_hash": content_hash(raw_bytes),
                "body_text": body_text,
                "body_length": len(body_text),
                "html_length": len(html),
                "http_status": resp.status_code,
                "response_time_ms": round(elapsed_ms, 1),
            }
            return item, None, elapsed_ms
        except httpx.HTTPStatusError as e:
            status = e.response.status_code if e.response is not None else None
            error = f"HTTP {status} for {url}: {e}"
            logger.warning("Scrape HTTP error for %s/%s: %s", ticker, url, e)
        except httpx.TimeoutException as e:
            error = f"Timeout for {url}: {e}"
            logger.warning("Scrape timeout for %s/%s: %s", ticker, url, e)
        except Exception as e:
            error = f"Error for {url}: {e}"
            logger.warning("Scrape error for %s/%s: %s", ticker, url, e)

        # Measure here only if the request itself failed; when a response
        # was received its timing is already known and must not be re-taken.
        if elapsed_ms is None:
            elapsed_ms = (time.monotonic() - t0) * 1000
        return None, error, elapsed_ms

    def _error_result(
        self,
        ticker: str,
        error: str,
        elapsed_ms: float,
    ) -> AdapterResult:
        """Build an error AdapterResult for scrape fetches."""
        return AdapterResult(
            source_type="web_scrape",
            ticker=ticker,
            items=[],
            raw_payload=b"",
            content_hash="",
            fetched_at=datetime.now(timezone.utc),
            error=error,
            http_status=None,
            response_time_ms=round(elapsed_ms, 1),
            metadata={"provider": "web_scrape"},
        )