phase 14-15: docker build validation and helm deployment
This commit is contained in:
@@ -24,99 +24,153 @@
|
|||||||
- [x] Add seed data support for an initial tracked watchlist
|
- [x] Add seed data support for an initial tracked watchlist
|
||||||
## Phase 3
|
## Phase 3
|
||||||
- External API Adapters
|
- External API Adapters
|
||||||
- [ ] Implement scheduler for symbol and source polling windows
|
- [x] Implement scheduler for symbol and source polling windows
|
||||||
- [ ] Implement market data API adapter interface
|
- [x] Implement market data API adapter interface
|
||||||
- [ ] Implement first concrete market data provider adapter
|
- [x] Implement first concrete market data provider adapter
|
||||||
- [ ] Implement news API adapter interface
|
- [x] Implement news API adapter interface
|
||||||
- [ ] Implement first concrete news API provider adapter
|
- [x] Implement first concrete news API provider adapter
|
||||||
- [ ] Implement filings or regulatory adapter interface
|
- [x] Implement filings or regulatory adapter interface
|
||||||
- [ ] Implement first concrete filings provider adapter
|
- [x] Implement first concrete filings provider adapter
|
||||||
- [ ] Implement broker API adapter interface for paper trading and order events
|
- [x] Implement broker API adapter interface for paper trading and order events
|
||||||
- [ ] Implement rate-limit coordination, retries, and backoff across adapters
|
- [x] Implement rate-limit coordination, retries, and backoff across adapters
|
||||||
|
|
||||||
## Phase 4 - Ingestion Pipeline
|
## Phase 4 - Ingestion Pipeline
|
||||||
- [ ] Implement web scraper worker for curated URLs and article pages
|
- [x] Implement web scraper worker for curated URLs and article pages
|
||||||
- [ ] Implement canonical URL normalization and content hashing
|
- [x] Implement canonical URL normalization and content hashing
|
||||||
- [ ] Implement raw artifact upload to MinIO
|
- [x] Implement raw artifact upload to MinIO
|
||||||
- [ ] Implement metadata persistence in PostgreSQL for market payloads, documents, and broker events
|
- [x] Implement metadata persistence in PostgreSQL for market payloads, documents, and broker events
|
||||||
- [ ] Implement retry and failure tracking for source retrieval
|
- [x] Implement retry and failure tracking for source retrieval
|
||||||
- [ ] Implement dedupe logic across article and filing sources
|
- [x] Implement dedupe logic across article and filing sources
|
||||||
|
|
||||||
## Phase 5 - Parsing and Normalization
|
## Phase 5 - Parsing and Normalization
|
||||||
- [ ] Implement HTML-to-text parsing pipeline
|
- [x] Implement HTML-to-text parsing pipeline
|
||||||
- [ ] Implement boilerplate reduction and body extraction heuristics
|
- [x] Implement boilerplate reduction and body extraction heuristics
|
||||||
- [ ] Implement parser quality scoring and confidence flags
|
- [x] Implement parser quality scoring and confidence flags
|
||||||
- [ ] Implement company mention detection using ticker, alias, and name matching
|
- [x] Implement company mention detection using ticker, alias, and name matching
|
||||||
- [ ] Persist normalized text and parser outputs to MinIO and PostgreSQL
|
- [x] Persist normalized text and parser outputs to MinIO and PostgreSQL
|
||||||
|
|
||||||
## Phase 6 - Ollama Structured Extraction
|
## Phase 6 - Ollama Structured Extraction
|
||||||
- [ ] Build extraction prompt templates with anti-hallucination instructions
|
- [x] Build extraction prompt templates with anti-hallucination instructions
|
||||||
- [ ] Build JSON schema definitions for document intelligence extraction
|
- [x] Build JSON schema definitions for document intelligence extraction
|
||||||
- [ ] Implement Ollama client wrapper using structured output format
|
- [x] Implement Ollama client wrapper using structured output format
|
||||||
- [ ] Implement schema validation and semantic validation layers
|
- [x] Implement schema validation and semantic validation layers
|
||||||
- [ ] Persist prompts, model metadata, raw outputs, validation reports, and final intelligence objects
|
- [x] Persist prompts, model metadata, raw outputs, validation reports, and final intelligence objects
|
||||||
- [ ] Add retry behavior for invalid or incomplete model responses
|
- [x] Add retry behavior for invalid or incomplete model responses
|
||||||
- [ ] Add model performance metrics and dashboards
|
- [x] Add model performance metrics and dashboards
|
||||||
|
|
||||||
## Phase 7 - Aggregation and Trend Engine
|
## Phase 7 - Aggregation and Trend Engine
|
||||||
- [ ] Implement recency decay and source credibility weighting
|
- [x] Implement recency decay and source credibility weighting
|
||||||
- [ ] Integrate market context features into aggregation windows
|
- [x] Integrate market context features into aggregation windows
|
||||||
- [ ] Implement company-level rolling window aggregation
|
- [x] Implement company-level rolling window aggregation
|
||||||
- [ ] Implement contradiction detection and disagreement representation
|
- [x] Implement contradiction detection and disagreement representation
|
||||||
- [ ] Implement sector and market rollups
|
- [x] Implement sector and market rollups
|
||||||
- [ ] Implement evidence ranking for supporting and opposing documents
|
- [x] Implement evidence ranking for supporting and opposing documents
|
||||||
- [ ] Persist trend windows and evidence mappings
|
- [x] Persist trend windows and evidence mappings
|
||||||
|
|
||||||
## Phase 8 - Recommendation Engine
|
## Phase 8 - Recommendation Engine
|
||||||
- [ ] Design deterministic recommendation eligibility logic
|
- [x] Design deterministic recommendation eligibility logic
|
||||||
- [ ] Implement recommendation generation from aggregated scores and evidence
|
- [x] Implement recommendation generation from aggregated scores and evidence
|
||||||
- [ ] Add optional LLM wording layer for thesis generation only
|
- [x] Add optional LLM wording layer for thesis generation only
|
||||||
- [ ] Persist recommendation objects and evidence citations
|
- [x] Persist recommendation objects and evidence citations
|
||||||
- [ ] Add suppression logic for low-quality data or low confidence
|
- [x] Add suppression logic for low-quality data or low confidence
|
||||||
- [ ] Publish prediction facts to analytical tables
|
- [x] Publish prediction facts to analytical tables
|
||||||
|
|
||||||
## Phase 9 - Risk Engine and Trade Adapter
|
## Phase 9 - Risk Engine and Trade Adapter
|
||||||
- [ ] Implement portfolio and account risk configuration model
|
- [x] Implement portfolio and account risk configuration model
|
||||||
- [ ] Implement hard blocks for max position size, sector exposure, daily loss limits, and news-shock lockouts
|
- [x] Implement hard blocks for max position size, sector exposure, daily loss limits, and news-shock lockouts
|
||||||
- [ ] Implement paper trading adapter behavior and state sync
|
- [x] Implement paper trading adapter behavior and state sync
|
||||||
- [ ] Integrate first broker API in sandbox mode
|
- [x] Integrate first broker API in sandbox mode
|
||||||
- [ ] Implement idempotent order submission keys and duplicate prevention
|
- [x] Implement idempotent order submission keys and duplicate prevention
|
||||||
- [ ] Implement full execution audit trail
|
- [x] Implement full execution audit trail
|
||||||
- [ ] Add operator approval workflow for live trading mode
|
- [x] Add operator approval workflow for live trading mode
|
||||||
- [ ] Publish order, fill, and position facts to analytical tables
|
- [x] Publish order, fill, and position facts to analytical tables
|
||||||
|
|
||||||
## Phase 10 - Lakehouse and SQL Analytics
|
## Phase 10 - Lakehouse and SQL Analytics
|
||||||
- [ ] Define analytical fact tables for bars, documents, extractions, signals, orders, fills, positions, and PnL
|
- [x] Define analytical fact tables for bars, documents, extractions, signals, orders, fills, positions, and PnL
|
||||||
- [ ] Implement Parquet writers for analytical datasets
|
- [x] Implement Parquet writers for analytical datasets
|
||||||
- [ ] Implement Hive-compatible partition layout conventions on MinIO
|
- [x] Implement Hive-compatible partition layout conventions on MinIO
|
||||||
- [ ] Implement Iceberg table creation and metadata management for analytical datasets
|
- [x] Implement Iceberg table creation and metadata management for analytical datasets
|
||||||
- [ ] Implement lake publisher jobs from operational data into analytical fact tables
|
- [x] Implement lake publisher jobs from operational data into analytical fact tables
|
||||||
- [ ] Configure Trino catalogs for Hive and or Iceberg access to MinIO
|
- [x] Configure Trino catalogs for Hive and/or Iceberg access to MinIO
|
||||||
- [ ] Add example SQL views for prediction-vs-outcome and paper-trade scorecards
|
- [x] Add example SQL views for prediction-vs-outcome and paper-trade scorecards
|
||||||
|
|
||||||
## Phase 11 - Query API and Dashboard
|
## Phase 11 - Query API and Dashboard
|
||||||
- [ ] Build APIs for companies, document timelines, trend summaries, recommendations, and order history
|
- [x] Build APIs for companies, document timelines, trend summaries, recommendations, and order history
|
||||||
- [ ] Build evidence drill-down view linking recommendations to source documents and raw artifacts
|
- [x] Build evidence drill-down view linking recommendations to source documents and raw artifacts
|
||||||
- [ ] Build admin controls for source health, symbol configs, and trading mode
|
- [x] Build admin controls for source health, symbol configs, and trading mode
|
||||||
- [ ] Build operational dashboard for ingestion throughput, model failures, and source coverage gaps
|
- [x] Build operational dashboard for ingestion throughput, model failures, and source coverage gaps
|
||||||
- [ ] Build Superset starter dashboards for symbol overview, sentiment heatmap, PnL, and prediction accuracy
|
- [x] Build Superset starter dashboards for symbol overview, sentiment heatmap, PnL, and prediction accuracy
|
||||||
|
|
||||||
## Phase 12 - Observability and Hardening
|
## Phase 12 - Observability and Hardening
|
||||||
- [ ] Add structured logs and distributed tracing across services
|
- [x] Add structured logs and distributed tracing across services
|
||||||
- [ ] Add Prometheus metrics for ingestion, parsing, extraction, aggregation, lake publication, and trading
|
- [x] Add Prometheus metrics for ingestion, parsing, extraction, aggregation, lake publication, and trading
|
||||||
- [ ] Add alerting for source failures, schema failure spikes, analytical lag, and broker issues
|
- [x] Add alerting for source failures, schema failure spikes, analytical lag, and broker issues
|
||||||
- [ ] Add dead-letter queues and replay tooling
|
- [x] Add dead-letter queues and replay tooling
|
||||||
- [ ] Add data retention and lifecycle controls for raw and derived artifacts
|
- [x] Add data retention and lifecycle controls for raw and derived artifacts
|
||||||
- [ ] Add security review for secrets, network policies, trading isolation, and dashboard access control
|
- [x] Add security review for secrets, network policies, trading isolation, and dashboard access control
|
||||||
|
|
||||||
## Phase 13 - Verification and Rollout
|
## Phase 13 - Verification and Rollout
|
||||||
- [ ] Create replay dataset from archived documents for deterministic extraction testing
|
- [x] Create replay dataset from archived documents for deterministic extraction testing
|
||||||
- [ ] Create integration tests for the full ingest-to-recommendation flow
|
- [x] Create integration tests for the full ingest-to-recommendation flow
|
||||||
- [ ] Create paper trading simulation scenarios
|
- [x] Create paper trading simulation scenarios
|
||||||
- [ ] Validate fail-closed behavior for broker outages and ambiguous order states
|
- [x] Validate fail-closed behavior for broker outages and ambiguous order states
|
||||||
- [ ] Validate lake publication and Trino query correctness over partitioned MinIO datasets
|
- [x] Validate lake publication and Trino query correctness over partitioned MinIO datasets
|
||||||
- [ ] Run shadow mode before enabling any live execution
|
- [x] ~~Run shadow mode~~ moved to Phase 15.6 (post-deployment)
|
||||||
- [ ] Prepare operator runbook and incident response procedures
|
- [x] ~~Prepare operator runbook~~ moved to Phase 15.7 (post-deployment)
|
||||||
|
|
||||||
|
## Phase 14 - Local Docker Build Validation
|
||||||
|
- [x] 14. Build and validate all Docker containers locally
|
||||||
|
- [x] 14.1 Build all 11 service containers locally using the Makefile
|
||||||
|
- Run `make build` to build scheduler, symbol-registry, ingestion, parser, extractor, aggregation, recommendation, risk, broker-adapter, lake-publisher, and query-api images
|
||||||
|
- Fix any build failures (missing dependencies, import errors, syntax issues)
|
||||||
|
- _Requirements: N1, 12.1_
|
||||||
|
- [x] 14.2 Validate schema and logic consistency across all services
|
||||||
|
- Run the full test suite with `pytest tests/ -x --tb=short -q` to catch import errors, schema mismatches, and logic inconsistencies
|
||||||
|
- Verify all shared schemas in `services/shared/schemas.py` are consistent with what each service expects
|
||||||
|
- Verify config loader fields match the configmap and secrets definitions
|
||||||
|
- Fix any mismatches found between services, schemas, migrations, and K8s manifests
|
||||||
|
- _Requirements: 5.2, 5.3, 9.2, N2_
|
||||||
|
- [x] 14.3 Verify each container starts without immediate crash
|
||||||
|
- Run each built image with `docker run --rm` and a quick health check or `--help` flag to confirm the entrypoint resolves
|
||||||
|
- Fix any runtime import errors or missing module paths
|
||||||
|
- _Requirements: N1_
|
||||||
|
|
||||||
|
## Phase 15 - CI Validation, Helm Deployment, and Cluster Rollout
|
||||||
|
- [-] 15. Commit, push, validate CI, create Helm chart, and deploy to cluster
|
||||||
|
- [-] 15.1 Commit and push code to GitHub
|
||||||
|
- Configure git with SSH key for the private repo
|
||||||
|
- Commit all current changes with message `phase 14-15: docker build validation and helm deployment`
|
||||||
|
- Push to main branch
|
||||||
|
- _Requirements: N1_
|
||||||
|
- [ ] 15.2 Validate GitHub Actions workflow builds containers
|
||||||
|
- Monitor the GitHub Actions run to confirm lint-and-test and build-services jobs succeed
|
||||||
|
- Fix any CI failures and re-push if needed
|
||||||
|
- _Requirements: N1_
|
||||||
|
- [ ] 15.3 Create Helm chart for stonks-oracle deployment
|
||||||
|
- Create `infra/helm/stonks-oracle/Chart.yaml` with chart metadata
|
||||||
|
- Create `infra/helm/stonks-oracle/values.yaml` with configurable image tags, replica counts, resource limits, and environment references
|
||||||
|
- Create Helm templates for all deployments, services, configmap, secrets, ingress, and network policies from existing K8s manifests
|
||||||
|
- Add imagePullSecrets configuration for GHCR private registry access
|
||||||
|
- Add a template for a Kubernetes Secret of type `kubernetes.io/dockerconfigjson` for GHCR authentication
|
||||||
|
- _Requirements: N1, 8.2_
|
||||||
|
- [ ] 15.4 Configure GHCR image pull authentication on the cluster
|
||||||
|
- Create a `docker-registry` secret in the `stonks-oracle` namespace with GHCR credentials (using a GitHub personal access token with the `read:packages` scope; SSH deploy keys cannot authenticate to GHCR)
|
||||||
|
- Reference the imagePullSecret in all deployment specs via the Helm values
|
||||||
|
- _Requirements: 8.2, N1_
|
||||||
|
- [ ] 15.5 Deploy stonks-oracle to the cluster via Helm
|
||||||
|
- Run `helm install` or `helm upgrade --install` targeting the `stonks-oracle` namespace
|
||||||
|
- Verify all pods reach Running/Ready state
|
||||||
|
- Verify services and ingress endpoints are reachable
|
||||||
|
- Debug and fix any deployment issues (CrashLoopBackOff, image pull errors, config mismatches)
|
||||||
|
- _Requirements: N1, 12.1_
|
||||||
|
- [ ] 15.6 Run shadow mode before enabling any live execution
|
||||||
|
- Confirm all services are running and processing in paper-only mode
|
||||||
|
- Validate end-to-end data flow from ingestion through recommendation without live trades
|
||||||
|
- _Requirements: N5, 8.1_
|
||||||
|
- [ ] 15.7 Prepare operator runbook and incident response procedures
|
||||||
|
- Document service restart procedures, log access, and common failure modes
|
||||||
|
- Document how to toggle trading modes and approve live execution
|
||||||
|
- _Requirements: 8.2, 12.1_
|
||||||
|
|
||||||
## Recommended First Vertical Slice
|
## Recommended First Vertical Slice
|
||||||
- [ ] Track 5 to 10 symbols
|
- [ ] Track 5 to 10 symbols
|
||||||
|
|||||||
@@ -24,8 +24,25 @@ test:
|
|||||||
|
|
||||||
build:
|
build:
|
||||||
@for svc in $(SERVICES); do \
|
@for svc in $(SERVICES); do \
|
||||||
echo "Building $$svc..."; \
|
case $$svc in \
|
||||||
docker build -t $(GHCR)/$$svc:$(SHA) -t $(GHCR)/$$svc:latest -f docker/Dockerfile .; \
|
scheduler) cmd="python -m services.scheduler.app" ;; \
|
||||||
|
symbol-registry) cmd="uvicorn services.symbol_registry.app:app --host 0.0.0.0 --port 8000" ;; \
|
||||||
|
ingestion) cmd="python -m services.ingestion.worker" ;; \
|
||||||
|
parser) cmd="python -m services.parser.worker" ;; \
|
||||||
|
extractor) cmd="python -m services.extractor.main" ;; \
|
||||||
|
aggregation) cmd="python -m services.aggregation.main" ;; \
|
||||||
|
recommendation) cmd="python -m services.recommendation.main" ;; \
|
||||||
|
risk) cmd="uvicorn services.risk.app:app --host 0.0.0.0 --port 8000" ;; \
|
||||||
|
broker-adapter) cmd="python -m services.adapters.broker_service" ;; \
|
||||||
|
lake-publisher) cmd="python -m services.lake_publisher.jobs" ;; \
|
||||||
|
query-api) cmd="uvicorn services.api.app:app --host 0.0.0.0 --port 8000" ;; \
|
||||||
|
esac; \
|
||||||
|
echo "Building $$svc ($$cmd)..."; \
|
||||||
|
docker build \
|
||||||
|
--build-arg "SERVICE_CMD=$$cmd" \
|
||||||
|
-t $(GHCR)/$$svc:$(SHA) \
|
||||||
|
-t $(GHCR)/$$svc:latest \
|
||||||
|
-f docker/Dockerfile . || exit 1; \
|
||||||
done
|
done
|
||||||
|
|
||||||
push:
|
push:
|
||||||
|
|||||||
+15
-6
@@ -3,9 +3,18 @@
|
|||||||
Apache Superset dashboard configurations and starter datasets for Stonks Oracle.
|
Apache Superset dashboard configurations and starter datasets for Stonks Oracle.
|
||||||
|
|
||||||
## Starter Dashboards
|
## Starter Dashboards
|
||||||
- Symbol Overview — company profile, source health, recent documents
|
See `starter/` for dashboard definitions covering:
|
||||||
- Sentiment Heatmap — market-wide sentiment by sector and symbol
|
- Symbol Overview — company profiles, source health, recent documents, and market snapshots
|
||||||
- Prediction Accuracy — predicted signals vs realized price moves
|
- Sentiment Heatmap — market-wide sentiment by sector and symbol, catalyst analysis
|
||||||
- Paper Trading PnL — paper trade performance and position tracking
|
- Prediction Accuracy — predicted signals vs realized price moves, confidence calibration
|
||||||
- Model Quality — extraction success rates, latency, and confidence distributions
|
- Paper Trading PnL — cumulative PnL, position snapshots, order history, and scorecards
|
||||||
- Source Coverage — ingestion throughput, source failures, and coverage gaps
|
|
||||||
|
## Operational Dashboards
|
||||||
|
See `operational/` for dashboard definitions covering:
|
||||||
|
- Ingestion Throughput — documents/hour by source type, success/failure rates, stale sources
|
||||||
|
- Model Extraction Quality — success rates, latency percentiles, validation failures, confidence distributions
|
||||||
|
- Source Coverage & Gaps — per-symbol source type matrix, missing sources, failure heatmap
|
||||||
|
|
||||||
|
Starter dashboards are powered by the Trino `lakehouse` catalog over MinIO-backed analytical tables.
|
||||||
|
Operational dashboards query the Query API `/api/ops/*` endpoints.
|
||||||
|
All dashboards can be imported into Superset via the UI or CLI.
|
||||||
|
|||||||
@@ -0,0 +1,22 @@
|
|||||||
|
# Operational Dashboard
|
||||||
|
|
||||||
|
Superset dashboard definitions for Stonks Oracle operational monitoring.
|
||||||
|
|
||||||
|
## Dashboards
|
||||||
|
- Ingestion Throughput — documents ingested per hour by source type, success/failure rates
|
||||||
|
- Model Extraction Quality — extraction success rates, latency percentiles, validation failures
|
||||||
|
- Source Coverage Gaps — symbols missing source types, stale sources with no recent data
|
||||||
|
|
||||||
|
## Data Sources
|
||||||
|
These dashboards query the Query API operational endpoints:
|
||||||
|
- `/api/ops/ingestion/throughput` — time-bucketed ingestion metrics
|
||||||
|
- `/api/ops/ingestion/summary` — aggregate ingestion stats
|
||||||
|
- `/api/ops/model/failures` — recent extraction failures
|
||||||
|
- `/api/ops/model/performance` — model performance summary
|
||||||
|
- `/api/ops/pipeline/health` — pipeline stage health
|
||||||
|
- `/api/ops/sources/coverage-gaps` — source coverage analysis
|
||||||
|
|
||||||
|
## Setup
|
||||||
|
Import the dashboard JSON files into Superset via the Superset UI or CLI.
|
||||||
|
The dashboards use the Query API operational endpoints listed above as their primary datasource,
|
||||||
|
with supplementary queries against the Trino `lakehouse` catalog for analytical data.
|
||||||
@@ -0,0 +1,75 @@
|
|||||||
|
{
|
||||||
|
"dashboard_title": "Ingestion Throughput",
|
||||||
|
"description": "Operational dashboard for monitoring ingestion pipeline throughput, success rates, and item counts across source types.",
|
||||||
|
"slug": "ingestion-throughput",
|
||||||
|
"position_json": {
|
||||||
|
"HEADER_ID": {"id": "HEADER_ID", "type": "HEADER", "meta": {"text": "Ingestion Throughput"}},
|
||||||
|
"ROW-1": {
|
||||||
|
"type": "ROW",
|
||||||
|
"children": ["CHART-throughput-timeseries", "CHART-source-type-breakdown"]
|
||||||
|
},
|
||||||
|
"ROW-2": {
|
||||||
|
"type": "ROW",
|
||||||
|
"children": ["CHART-success-failure-rate", "CHART-items-fetched"]
|
||||||
|
},
|
||||||
|
"ROW-3": {
|
||||||
|
"type": "ROW",
|
||||||
|
"children": ["CHART-stale-sources", "CHART-active-companies"]
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"metadata": {
|
||||||
|
"refresh_frequency": 300,
|
||||||
|
"default_filters": "{}",
|
||||||
|
"color_scheme": "supersetColors"
|
||||||
|
},
|
||||||
|
"charts": [
|
||||||
|
{
|
||||||
|
"slice_name": "Ingestion Runs Over Time",
|
||||||
|
"viz_type": "echarts_timeseries_bar",
|
||||||
|
"description": "Ingestion run counts bucketed by hour, stacked by source type",
|
||||||
|
"datasource_type": "query",
|
||||||
|
"query": "SELECT date_trunc('hour', ir.started_at) AS bucket, ir.source_type, COUNT(*) AS run_count, COUNT(*) FILTER (WHERE ir.status = 'completed') AS completed, COUNT(*) FILTER (WHERE ir.status = 'failed') AS failed FROM ingestion_runs ir WHERE ir.started_at >= NOW() - INTERVAL '24 hours' GROUP BY 1, 2 ORDER BY 1",
|
||||||
|
"params": {
|
||||||
|
"x_axis": "bucket",
|
||||||
|
"metrics": ["run_count"],
|
||||||
|
"groupby": ["source_type"],
|
||||||
|
"time_grain_sqla": "PT1H"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"slice_name": "Source Type Breakdown",
|
||||||
|
"viz_type": "pie",
|
||||||
|
"description": "Distribution of ingestion runs by source type in the last 24h",
|
||||||
|
"datasource_type": "query",
|
||||||
|
"query": "SELECT ir.source_type, COUNT(*) AS runs FROM ingestion_runs ir WHERE ir.started_at >= NOW() - INTERVAL '24 hours' GROUP BY ir.source_type ORDER BY runs DESC"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"slice_name": "Success vs Failure Rate",
|
||||||
|
"viz_type": "echarts_timeseries_line",
|
||||||
|
"description": "Hourly success and failure counts over time",
|
||||||
|
"datasource_type": "query",
|
||||||
|
"query": "SELECT date_trunc('hour', ir.started_at) AS bucket, COUNT(*) FILTER (WHERE ir.status = 'completed') AS completed, COUNT(*) FILTER (WHERE ir.status = 'failed') AS failed, ROUND(COUNT(*) FILTER (WHERE ir.status = 'completed')::numeric / NULLIF(COUNT(*), 0), 3) AS success_rate FROM ingestion_runs ir WHERE ir.started_at >= NOW() - INTERVAL '24 hours' GROUP BY 1 ORDER BY 1"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"slice_name": "Items Fetched Over Time",
|
||||||
|
"viz_type": "echarts_timeseries_bar",
|
||||||
|
"description": "Total items fetched and new items per hour",
|
||||||
|
"datasource_type": "query",
|
||||||
|
"query": "SELECT date_trunc('hour', ir.started_at) AS bucket, COALESCE(SUM(ir.items_fetched), 0) AS items_fetched, COALESCE(SUM(ir.items_new), 0) AS items_new FROM ingestion_runs ir WHERE ir.started_at >= NOW() - INTERVAL '24 hours' GROUP BY 1 ORDER BY 1"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"slice_name": "Stale Sources",
|
||||||
|
"viz_type": "table",
|
||||||
|
"description": "Sources with no successful run in the last 24 hours",
|
||||||
|
"datasource_type": "query",
|
||||||
|
"query": "SELECT c.ticker, s.source_type, s.source_name, MAX(ir.started_at) FILTER (WHERE ir.status = 'completed') AS last_success, COUNT(*) FILTER (WHERE ir.status = 'failed' AND ir.started_at >= NOW() - INTERVAL '24 hours') AS recent_failures FROM sources s JOIN companies c ON c.id = s.company_id LEFT JOIN ingestion_runs ir ON ir.source_id = s.id WHERE s.active = TRUE AND c.active = TRUE GROUP BY c.ticker, s.source_type, s.source_name HAVING MAX(ir.started_at) FILTER (WHERE ir.status = 'completed') < NOW() - INTERVAL '24 hours' OR MAX(ir.started_at) FILTER (WHERE ir.status = 'completed') IS NULL ORDER BY c.ticker"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"slice_name": "Active Companies Ingested",
|
||||||
|
"viz_type": "big_number_total",
|
||||||
|
"description": "Count of distinct companies with ingestion activity in the last 24h",
|
||||||
|
"datasource_type": "query",
|
||||||
|
"query": "SELECT COUNT(DISTINCT company_id) AS active_companies FROM ingestion_runs WHERE started_at >= NOW() - INTERVAL '24 hours'"
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
@@ -0,0 +1,94 @@
|
|||||||
|
{
|
||||||
|
"dashboard_title": "Model Extraction Quality",
|
||||||
|
"description": "Operational dashboard for monitoring Ollama extraction success rates, latency, validation failures, and confidence distributions.",
|
||||||
|
"slug": "model-extraction-quality",
|
||||||
|
"position_json": {
|
||||||
|
"HEADER_ID": {"id": "HEADER_ID", "type": "HEADER", "meta": {"text": "Model Extraction Quality"}},
|
||||||
|
"ROW-1": {
|
||||||
|
"type": "ROW",
|
||||||
|
"children": ["CHART-success-rate-kpi", "CHART-avg-latency-kpi", "CHART-avg-confidence-kpi", "CHART-retry-rate-kpi"]
|
||||||
|
},
|
||||||
|
"ROW-2": {
|
||||||
|
"type": "ROW",
|
||||||
|
"children": ["CHART-extraction-timeseries", "CHART-validation-status-pie"]
|
||||||
|
},
|
||||||
|
"ROW-3": {
|
||||||
|
"type": "ROW",
|
||||||
|
"children": ["CHART-latency-percentiles", "CHART-confidence-distribution"]
|
||||||
|
},
|
||||||
|
"ROW-4": {
|
||||||
|
"type": "ROW",
|
||||||
|
"children": ["CHART-recent-failures-table"]
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"metadata": {
|
||||||
|
"refresh_frequency": 300,
|
||||||
|
"default_filters": "{}",
|
||||||
|
"color_scheme": "supersetColors"
|
||||||
|
},
|
||||||
|
"charts": [
|
||||||
|
{
|
||||||
|
"slice_name": "Extraction Success Rate",
|
||||||
|
"viz_type": "big_number_total",
|
||||||
|
"description": "Overall extraction success rate in the last 24h",
|
||||||
|
"datasource_type": "query",
|
||||||
|
"query": "SELECT ROUND(COUNT(*) FILTER (WHERE success)::numeric / NULLIF(COUNT(*), 0), 4) AS success_rate FROM model_performance_metrics WHERE recorded_at >= NOW() - INTERVAL '24 hours'"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"slice_name": "Avg Extraction Latency",
|
||||||
|
"viz_type": "big_number_total",
|
||||||
|
"description": "Average extraction duration in milliseconds",
|
||||||
|
"datasource_type": "query",
|
||||||
|
"query": "SELECT ROUND(AVG(total_duration_ms)::numeric, 0) AS avg_latency_ms FROM model_performance_metrics WHERE recorded_at >= NOW() - INTERVAL '24 hours'"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"slice_name": "Avg Confidence Score",
|
||||||
|
"viz_type": "big_number_total",
|
||||||
|
"description": "Average confidence of successful extractions",
|
||||||
|
"datasource_type": "query",
|
||||||
|
"query": "SELECT ROUND(AVG(confidence)::numeric, 3) AS avg_confidence FROM model_performance_metrics WHERE recorded_at >= NOW() - INTERVAL '24 hours' AND success = TRUE"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"slice_name": "Avg Retry Count",
|
||||||
|
"viz_type": "big_number_total",
|
||||||
|
"description": "Average retries per extraction attempt",
|
||||||
|
"datasource_type": "query",
|
||||||
|
"query": "SELECT ROUND(AVG(retry_count)::numeric, 2) AS avg_retries FROM model_performance_metrics WHERE recorded_at >= NOW() - INTERVAL '24 hours'"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"slice_name": "Extractions Over Time",
|
||||||
|
"viz_type": "echarts_timeseries_bar",
|
||||||
|
"description": "Hourly extraction counts split by success/failure",
|
||||||
|
"datasource_type": "query",
|
||||||
|
"query": "SELECT date_trunc('hour', recorded_at) AS bucket, COUNT(*) FILTER (WHERE success) AS successful, COUNT(*) FILTER (WHERE NOT success) AS failed FROM model_performance_metrics WHERE recorded_at >= NOW() - INTERVAL '24 hours' GROUP BY 1 ORDER BY 1"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"slice_name": "Validation Status Distribution",
|
||||||
|
"viz_type": "pie",
|
||||||
|
"description": "Breakdown of extraction validation outcomes",
|
||||||
|
"datasource_type": "query",
|
||||||
|
"query": "SELECT validation_status, COUNT(*) AS count FROM model_performance_metrics WHERE recorded_at >= NOW() - INTERVAL '24 hours' GROUP BY validation_status"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"slice_name": "Latency Percentiles Over Time",
|
||||||
|
"viz_type": "echarts_timeseries_line",
|
||||||
|
"description": "P50, P95, P99 extraction latency per hour",
|
||||||
|
"datasource_type": "query",
|
||||||
|
"query": "SELECT date_trunc('hour', recorded_at) AS bucket, ROUND(PERCENTILE_CONT(0.5) WITHIN GROUP (ORDER BY total_duration_ms)::numeric, 0) AS p50_ms, ROUND(PERCENTILE_CONT(0.95) WITHIN GROUP (ORDER BY total_duration_ms)::numeric, 0) AS p95_ms, ROUND(PERCENTILE_CONT(0.99) WITHIN GROUP (ORDER BY total_duration_ms)::numeric, 0) AS p99_ms FROM model_performance_metrics WHERE recorded_at >= NOW() - INTERVAL '24 hours' GROUP BY 1 ORDER BY 1"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"slice_name": "Confidence Distribution",
|
||||||
|
"viz_type": "histogram",
|
||||||
|
"description": "Distribution of extraction confidence scores",
|
||||||
|
"datasource_type": "query",
|
||||||
|
"query": "SELECT CASE WHEN confidence >= 0.9 THEN '0.9-1.0' WHEN confidence >= 0.8 THEN '0.8-0.9' WHEN confidence >= 0.7 THEN '0.7-0.8' WHEN confidence >= 0.6 THEN '0.6-0.7' WHEN confidence >= 0.5 THEN '0.5-0.6' ELSE '<0.5' END AS confidence_bucket, COUNT(*) AS count FROM model_performance_metrics WHERE recorded_at >= NOW() - INTERVAL '24 hours' AND success = TRUE GROUP BY 1 ORDER BY 1"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"slice_name": "Recent Extraction Failures",
|
||||||
|
"viz_type": "table",
|
||||||
|
"description": "Most recent failed extractions with error details",
|
||||||
|
"datasource_type": "query",
|
||||||
|
"query": "SELECT mpm.ticker, mpm.model_name, mpm.validation_status, mpm.validation_error_count, mpm.attempt_count, mpm.total_duration_ms, mpm.recorded_at, d.title, d.document_type FROM model_performance_metrics mpm LEFT JOIN documents d ON d.id = mpm.document_id WHERE mpm.success = FALSE AND mpm.recorded_at >= NOW() - INTERVAL '24 hours' ORDER BY mpm.recorded_at DESC LIMIT 50"
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
@@ -0,0 +1,51 @@
|
|||||||
|
{
|
||||||
|
"dashboard_title": "Source Coverage & Gaps",
|
||||||
|
"description": "Operational dashboard for identifying source coverage gaps, stale sources, and symbols missing expected data feeds.",
|
||||||
|
"slug": "source-coverage-gaps",
|
||||||
|
"position_json": {
|
||||||
|
"HEADER_ID": {"id": "HEADER_ID", "type": "HEADER", "meta": {"text": "Source Coverage & Gaps"}},
|
||||||
|
"ROW-1": {
|
||||||
|
"type": "ROW",
|
||||||
|
"children": ["CHART-coverage-matrix", "CHART-missing-types-table"]
|
||||||
|
},
|
||||||
|
"ROW-2": {
|
||||||
|
"type": "ROW",
|
||||||
|
"children": ["CHART-stale-sources-table", "CHART-failure-heatmap"]
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"metadata": {
|
||||||
|
"refresh_frequency": 600,
|
||||||
|
"default_filters": "{}",
|
||||||
|
"color_scheme": "supersetColors"
|
||||||
|
},
|
||||||
|
"charts": [
|
||||||
|
{
|
||||||
|
"slice_name": "Source Coverage Matrix",
|
||||||
|
"viz_type": "table",
|
||||||
|
"description": "Per-symbol source type coverage showing active source counts",
|
||||||
|
"datasource_type": "query",
|
||||||
|
"query": "SELECT c.ticker, c.legal_name, c.sector, COUNT(s.id) FILTER (WHERE s.active) AS active_sources, COUNT(s.id) FILTER (WHERE s.source_type = 'market_api' AND s.active) AS market_sources, COUNT(s.id) FILTER (WHERE s.source_type = 'news_api' AND s.active) AS news_sources, COUNT(s.id) FILTER (WHERE s.source_type = 'filings_api' AND s.active) AS filings_sources, COUNT(s.id) FILTER (WHERE s.source_type = 'web_scrape' AND s.active) AS web_scrape_sources, COUNT(s.id) FILTER (WHERE s.source_type = 'broker' AND s.active) AS broker_sources FROM companies c LEFT JOIN sources s ON s.company_id = c.id WHERE c.active = TRUE GROUP BY c.ticker, c.legal_name, c.sector ORDER BY c.ticker"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"slice_name": "Symbols Missing Source Types",
|
||||||
|
"viz_type": "table",
|
||||||
|
"description": "Companies that lack one or more expected source types (market_api, news_api, filings_api)",
|
||||||
|
"datasource_type": "query",
|
||||||
|
"query": "SELECT c.ticker, c.legal_name, c.sector, ARRAY_AGG(DISTINCT s.source_type) FILTER (WHERE s.active) AS active_types FROM companies c LEFT JOIN sources s ON s.company_id = c.id AND s.active = TRUE WHERE c.active = TRUE GROUP BY c.ticker, c.legal_name, c.sector HAVING NOT ARRAY['market_api', 'news_api', 'filings_api'] <@ ARRAY_AGG(DISTINCT s.source_type) FILTER (WHERE s.active) OR ARRAY_AGG(DISTINCT s.source_type) FILTER (WHERE s.active) IS NULL ORDER BY c.ticker"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"slice_name": "Stale Sources (No Success in 24h)",
|
||||||
|
"viz_type": "table",
|
||||||
|
"description": "Active sources that have not completed a successful ingestion run in the last 24 hours",
|
||||||
|
"datasource_type": "query",
|
||||||
|
"query": "SELECT c.ticker, s.source_type, s.source_name, MAX(ir.started_at) FILTER (WHERE ir.status = 'completed') AS last_success, MAX(ir.started_at) AS last_attempt, COUNT(*) FILTER (WHERE ir.status = 'failed' AND ir.started_at >= NOW() - INTERVAL '24 hours') AS recent_failures FROM sources s JOIN companies c ON c.id = s.company_id LEFT JOIN ingestion_runs ir ON ir.source_id = s.id WHERE s.active = TRUE AND c.active = TRUE GROUP BY c.ticker, s.source_type, s.source_name HAVING MAX(ir.started_at) FILTER (WHERE ir.status = 'completed') < NOW() - INTERVAL '24 hours' OR MAX(ir.started_at) FILTER (WHERE ir.status = 'completed') IS NULL ORDER BY c.ticker, s.source_type"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"slice_name": "Source Failure Heatmap",
|
||||||
|
"viz_type": "heatmap",
|
||||||
|
"description": "Failure counts by source type and ticker in the last 24h",
|
||||||
|
"datasource_type": "query",
|
||||||
|
"query": "SELECT c.ticker, ir.source_type, COUNT(*) FILTER (WHERE ir.status = 'failed') AS failures FROM ingestion_runs ir JOIN companies c ON c.id = ir.company_id WHERE ir.started_at >= NOW() - INTERVAL '24 hours' GROUP BY c.ticker, ir.source_type HAVING COUNT(*) FILTER (WHERE ir.status = 'failed') > 0 ORDER BY failures DESC"
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
@@ -0,0 +1,29 @@
|
|||||||
|
# Starter Dashboards
|
||||||
|
|
||||||
|
Superset dashboard definitions for Stonks Oracle research, analysis, and trading review.
|
||||||
|
|
||||||
|
## Dashboards
|
||||||
|
- Symbol Overview — company profiles, source health, recent documents, and market snapshots
|
||||||
|
- Sentiment Heatmap — market-wide sentiment by sector and symbol, catalyst analysis, contradiction tracking
|
||||||
|
- Prediction Accuracy — predicted signals vs realized price moves, confidence calibration, per-symbol accuracy
|
||||||
|
- Paper Trading PnL — cumulative PnL, daily performance, position snapshots, order history, and scorecards
|
||||||
|
|
||||||
|
## Data Sources
|
||||||
|
These dashboards query the Trino `lakehouse` catalog over MinIO-backed analytical fact tables:
|
||||||
|
- `lakehouse.stonks.documents` — ingested document metadata
|
||||||
|
- `lakehouse.stonks.document_extractions` — AI extraction outputs
|
||||||
|
- `lakehouse.stonks.trade_signals` — aggregated trend signals
|
||||||
|
- `lakehouse.stonks.market_bars` — OHLCV bar data
|
||||||
|
- `lakehouse.stonks.prediction_vs_outcome` — prediction accuracy tracking
|
||||||
|
- `lakehouse.stonks.pnl_daily` — daily PnL records
|
||||||
|
- `lakehouse.stonks.positions_daily` — end-of-day position snapshots
|
||||||
|
- `lakehouse.stonks.trade_orders` — order submission records
|
||||||
|
- `lakehouse.stonks.trade_fills` — fill and execution records
|
||||||
|
|
||||||
|
## Setup
|
||||||
|
1. Import the dashboard JSON files into Superset via the Superset UI or CLI
|
||||||
|
2. Ensure the Trino datasource is configured: `trino://trino@trino:8080/lakehouse/stonks`
|
||||||
|
3. Create the lakehouse views from `lakehouse/views/` for additional drill-down capability
|
||||||
|
|
||||||
|
## Trino Connection
|
||||||
|
The dashboards use the default Superset Trino connection configured in `infra/superset/superset_config.py`.
|
||||||
@@ -0,0 +1,124 @@
|
|||||||
|
{
|
||||||
|
"dashboard_title": "Paper Trading PnL",
|
||||||
|
"description": "Paper trading performance tracking with PnL curves, position snapshots, order history, and trade detail drill-down.",
|
||||||
|
"slug": "paper-trading-pnl",
|
||||||
|
"position_json": {
|
||||||
|
"HEADER_ID": {"id": "HEADER_ID", "type": "HEADER", "meta": {"text": "Paper Trading PnL"}},
|
||||||
|
"ROW-1": {
|
||||||
|
"type": "ROW",
|
||||||
|
"children": ["CHART-total-net-pnl-kpi", "CHART-win-rate-kpi", "CHART-total-orders-kpi", "CHART-active-positions-kpi"]
|
||||||
|
},
|
||||||
|
"ROW-2": {
|
||||||
|
"type": "ROW",
|
||||||
|
"children": ["CHART-cumulative-pnl-timeseries", "CHART-daily-pnl-bar"]
|
||||||
|
},
|
||||||
|
"ROW-3": {
|
||||||
|
"type": "ROW",
|
||||||
|
"children": ["CHART-pnl-by-symbol", "CHART-order-status-pie"]
|
||||||
|
},
|
||||||
|
"ROW-4": {
|
||||||
|
"type": "ROW",
|
||||||
|
"children": ["CHART-positions-table"]
|
||||||
|
},
|
||||||
|
"ROW-5": {
|
||||||
|
"type": "ROW",
|
||||||
|
"children": ["CHART-scorecard-table"]
|
||||||
|
},
|
||||||
|
"ROW-6": {
|
||||||
|
"type": "ROW",
|
||||||
|
"children": ["CHART-recent-orders-table"]
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"metadata": {
|
||||||
|
"refresh_frequency": 300,
|
||||||
|
"default_filters": "{}",
|
||||||
|
"color_scheme": "supersetColors"
|
||||||
|
},
|
||||||
|
"charts": [
|
||||||
|
{
|
||||||
|
"slice_name": "Total Net PnL",
|
||||||
|
"viz_type": "big_number_total",
|
||||||
|
"description": "Cumulative net PnL across all paper trading activity",
|
||||||
|
"datasource_type": "trino",
|
||||||
|
"query": "SELECT ROUND(SUM(net_pnl), 2) AS total_net_pnl FROM lakehouse.stonks.pnl_daily WHERE execution_mode = 'paper'"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"slice_name": "Win Rate",
|
||||||
|
"viz_type": "big_number_total",
|
||||||
|
"description": "Fraction of daily PnL records (per symbol, per day) with positive net PnL",
|
||||||
|
"datasource_type": "trino",
|
||||||
|
"query": "SELECT ROUND(CAST(COUNT(CASE WHEN net_pnl > 0 THEN 1 END) AS DOUBLE) / NULLIF(COUNT(*), 0), 4) AS win_rate FROM lakehouse.stonks.pnl_daily WHERE execution_mode = 'paper'"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"slice_name": "Total Orders",
|
||||||
|
"viz_type": "big_number_total",
|
||||||
|
"description": "Total paper trade orders submitted",
|
||||||
|
"datasource_type": "trino",
|
||||||
|
"query": "SELECT COUNT(DISTINCT order_id) AS total_orders FROM lakehouse.stonks.trade_orders WHERE execution_mode = 'paper'"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"slice_name": "Active Positions",
|
||||||
|
"viz_type": "big_number_total",
|
||||||
|
"description": "Number of symbols with open positions as of the latest snapshot",
|
||||||
|
"datasource_type": "trino",
|
||||||
|
"query": "SELECT COUNT(DISTINCT ticker) AS active_positions FROM lakehouse.stonks.positions_daily WHERE execution_mode = 'paper' AND quantity <> 0 AND dt = (SELECT MAX(dt) FROM lakehouse.stonks.positions_daily WHERE execution_mode = 'paper')"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"slice_name": "Cumulative PnL Over Time",
|
||||||
|
"viz_type": "echarts_timeseries_line",
|
||||||
|
"description": "Running cumulative net PnL across all paper trades",
|
||||||
|
"datasource_type": "trino",
|
||||||
|
"query": "SELECT dt AS bucket, SUM(net_pnl) AS daily_net_pnl, SUM(SUM(net_pnl)) OVER (ORDER BY dt) AS cumulative_pnl FROM lakehouse.stonks.pnl_daily WHERE execution_mode = 'paper' GROUP BY dt ORDER BY dt"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"slice_name": "Daily PnL",
|
||||||
|
"viz_type": "echarts_timeseries_bar",
|
||||||
|
"description": "Daily net PnL for paper trading, colored by positive/negative",
|
||||||
|
"datasource_type": "trino",
|
||||||
|
"query": "SELECT dt AS bucket, ROUND(SUM(net_pnl), 2) AS daily_pnl, ROUND(SUM(realized_pnl), 2) AS realized, ROUND(SUM(unrealized_pnl), 2) AS unrealized FROM lakehouse.stonks.pnl_daily WHERE execution_mode = 'paper' GROUP BY dt ORDER BY dt",
|
||||||
|
"params": {
|
||||||
|
"x_axis": "bucket",
|
||||||
|
"metrics": ["daily_pnl"]
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"slice_name": "PnL by Symbol",
|
||||||
|
"viz_type": "echarts_timeseries_bar",
|
||||||
|
"description": "Total net PnL per symbol for paper trading",
|
||||||
|
"datasource_type": "trino",
|
||||||
|
"query": "SELECT ticker, ROUND(SUM(net_pnl), 2) AS total_pnl, ROUND(SUM(realized_pnl), 2) AS realized_pnl, ROUND(SUM(fees), 2) AS total_fees FROM lakehouse.stonks.pnl_daily WHERE execution_mode = 'paper' GROUP BY ticker ORDER BY total_pnl DESC",
|
||||||
|
"params": {
|
||||||
|
"x_axis": "ticker",
|
||||||
|
"metrics": ["total_pnl"]
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"slice_name": "Order Status Distribution",
|
||||||
|
"viz_type": "pie",
|
||||||
|
"description": "Breakdown of paper trade order statuses",
|
||||||
|
"datasource_type": "trino",
|
||||||
|
"query": "SELECT status, COUNT(*) AS count FROM lakehouse.stonks.trade_orders WHERE execution_mode = 'paper' GROUP BY status ORDER BY count DESC"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"slice_name": "Current Positions",
|
||||||
|
"viz_type": "table",
|
||||||
|
"description": "Latest position snapshot for all paper trading symbols",
|
||||||
|
"datasource_type": "trino",
|
||||||
|
"query": "SELECT p.ticker, p.quantity, ROUND(p.avg_entry_price, 2) AS avg_entry, ROUND(p.close_price, 2) AS close_price, ROUND(p.market_value, 2) AS market_value, ROUND(p.unrealized_pnl, 2) AS unrealized_pnl, p.snapshot_at FROM lakehouse.stonks.positions_daily p WHERE p.execution_mode = 'paper' AND p.dt = (SELECT MAX(dt) FROM lakehouse.stonks.positions_daily WHERE execution_mode = 'paper') ORDER BY ABS(p.unrealized_pnl) DESC"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"slice_name": "Paper Trade Scorecard",
|
||||||
|
"viz_type": "table",
|
||||||
|
"description": "Per-symbol paper trading scorecard with win rates, PnL, and order counts",
|
||||||
|
"datasource_type": "trino",
|
||||||
|
"query": "SELECT pnl.ticker, COUNT(DISTINCT pnl.dt) AS trading_days, ROUND(SUM(pnl.net_pnl), 2) AS total_net_pnl, ROUND(AVG(pnl.net_pnl), 2) AS avg_daily_pnl, ROUND(CAST(COUNT(CASE WHEN pnl.net_pnl > 0 THEN 1 END) AS DOUBLE) / NULLIF(COUNT(*), 0), 4) AS win_rate, ROUND(MIN(pnl.net_pnl), 2) AS worst_day, ROUND(MAX(pnl.net_pnl), 2) AS best_day, ROUND(SUM(pnl.fees), 2) AS total_fees, MIN(pnl.dt) AS first_trade, MAX(pnl.dt) AS last_trade FROM lakehouse.stonks.pnl_daily pnl WHERE pnl.execution_mode = 'paper' GROUP BY pnl.ticker ORDER BY total_net_pnl DESC"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"slice_name": "Recent Orders",
|
||||||
|
"viz_type": "table",
|
||||||
|
"description": "Most recent paper trade orders with fill details",
|
||||||
|
"datasource_type": "trino",
|
||||||
|
"query": "SELECT o.ticker, o.side, o.order_type, o.quantity, ROUND(o.limit_price, 2) AS limit_price, o.status, f.fill_price, f.fill_quantity, f.commission, o.submitted_at, f.filled_at FROM lakehouse.stonks.trade_orders o LEFT JOIN lakehouse.stonks.trade_fills f ON o.order_id = f.order_id AND o.dt = f.dt WHERE o.execution_mode = 'paper' ORDER BY o.submitted_at DESC LIMIT 50"
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
@@ -0,0 +1,125 @@
|
|||||||
|
{
|
||||||
|
"dashboard_title": "Prediction Accuracy",
|
||||||
|
"description": "Predicted signals vs realized price moves, confidence calibration, and model accuracy tracking.",
|
||||||
|
"slug": "prediction-accuracy",
|
||||||
|
"position_json": {
|
||||||
|
"HEADER_ID": {"id": "HEADER_ID", "type": "HEADER", "meta": {"text": "Prediction Accuracy"}},
|
||||||
|
"ROW-1": {
|
||||||
|
"type": "ROW",
|
||||||
|
"children": ["CHART-overall-hit-rate-kpi", "CHART-total-predictions-kpi", "CHART-avg-confidence-kpi", "CHART-avg-move-kpi"]
|
||||||
|
},
|
||||||
|
"ROW-2": {
|
||||||
|
"type": "ROW",
|
||||||
|
"children": ["CHART-hit-rate-timeseries", "CHART-outcome-distribution-pie"]
|
||||||
|
},
|
||||||
|
"ROW-3": {
|
||||||
|
"type": "ROW",
|
||||||
|
"children": ["CHART-confidence-calibration", "CHART-confidence-vs-move-scatter"]
|
||||||
|
},
|
||||||
|
"ROW-4": {
|
||||||
|
"type": "ROW",
|
||||||
|
"children": ["CHART-accuracy-by-symbol", "CHART-accuracy-by-action"]
|
||||||
|
},
|
||||||
|
"ROW-5": {
|
||||||
|
"type": "ROW",
|
||||||
|
"children": ["CHART-recent-predictions-table"]
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"metadata": {
|
||||||
|
"refresh_frequency": 600,
|
||||||
|
"default_filters": "{}",
|
||||||
|
"color_scheme": "supersetColors"
|
||||||
|
},
|
||||||
|
"charts": [
|
||||||
|
{
|
||||||
|
"slice_name": "Overall Hit Rate",
|
||||||
|
"viz_type": "big_number_total",
|
||||||
|
"description": "Fraction of predictions with correct directional outcome over the last 30 days",
|
||||||
|
"datasource_type": "trino",
|
||||||
|
"query": "SELECT ROUND(CAST(COUNT(CASE WHEN outcome = 'correct' THEN 1 END) AS DOUBLE) / NULLIF(COUNT(*), 0), 4) AS hit_rate FROM lakehouse.stonks.prediction_vs_outcome WHERE dt >= CURRENT_DATE - INTERVAL '30' DAY"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"slice_name": "Total Predictions (30d)",
|
||||||
|
"viz_type": "big_number_total",
|
||||||
|
"description": "Total evaluated predictions in the last 30 days",
|
||||||
|
"datasource_type": "trino",
|
||||||
|
"query": "SELECT COUNT(*) AS total_predictions FROM lakehouse.stonks.prediction_vs_outcome WHERE dt >= CURRENT_DATE - INTERVAL '30' DAY"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"slice_name": "Avg Predicted Confidence",
|
||||||
|
"viz_type": "big_number_total",
|
||||||
|
"description": "Average confidence of predictions in the last 30 days",
|
||||||
|
"datasource_type": "trino",
|
||||||
|
"query": "SELECT ROUND(AVG(predicted_confidence), 3) AS avg_confidence FROM lakehouse.stonks.prediction_vs_outcome WHERE dt >= CURRENT_DATE - INTERVAL '30' DAY"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"slice_name": "Avg Realized Move",
|
||||||
|
"viz_type": "big_number_total",
|
||||||
|
"description": "Average absolute realized price move percentage in the last 30 days",
|
||||||
|
"datasource_type": "trino",
|
||||||
|
"query": "SELECT ROUND(AVG(ABS(actual_move_pct)), 3) AS avg_abs_move FROM lakehouse.stonks.prediction_vs_outcome WHERE dt >= CURRENT_DATE - INTERVAL '30' DAY"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"slice_name": "Daily Hit Rate",
|
||||||
|
"viz_type": "echarts_timeseries_line",
|
||||||
|
"description": "Daily prediction hit rate over the last 30 days",
|
||||||
|
"datasource_type": "trino",
|
||||||
|
"query": "SELECT dt AS bucket, COUNT(*) AS total, COUNT(CASE WHEN outcome = 'correct' THEN 1 END) AS correct, ROUND(CAST(COUNT(CASE WHEN outcome = 'correct' THEN 1 END) AS DOUBLE) / NULLIF(COUNT(*), 0), 4) AS hit_rate FROM lakehouse.stonks.prediction_vs_outcome WHERE dt >= CURRENT_DATE - INTERVAL '30' DAY GROUP BY dt ORDER BY dt"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"slice_name": "Outcome Distribution",
|
||||||
|
"viz_type": "pie",
|
||||||
|
"description": "Breakdown of prediction outcomes (correct, incorrect, neutral) over the last 30 days",
|
||||||
|
"datasource_type": "trino",
|
||||||
|
"query": "SELECT outcome, COUNT(*) AS count FROM lakehouse.stonks.prediction_vs_outcome WHERE dt >= CURRENT_DATE - INTERVAL '30' DAY GROUP BY outcome ORDER BY count DESC"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"slice_name": "Confidence Calibration",
|
||||||
|
"viz_type": "echarts_timeseries_bar",
|
||||||
|
"description": "Hit rate by confidence bucket to assess calibration quality",
|
||||||
|
"datasource_type": "trino",
|
||||||
|
"query": "SELECT CASE WHEN predicted_confidence >= 0.8 THEN '0.8-1.0 (high)' WHEN predicted_confidence >= 0.6 THEN '0.6-0.8 (medium)' WHEN predicted_confidence >= 0.4 THEN '0.4-0.6 (low)' ELSE '0.0-0.4 (very low)' END AS confidence_bucket, COUNT(*) AS total, COUNT(CASE WHEN outcome = 'correct' THEN 1 END) AS correct, ROUND(CAST(COUNT(CASE WHEN outcome = 'correct' THEN 1 END) AS DOUBLE) / NULLIF(COUNT(*), 0), 4) AS hit_rate FROM lakehouse.stonks.prediction_vs_outcome WHERE dt >= CURRENT_DATE - INTERVAL '30' DAY GROUP BY 1 ORDER BY 1",
|
||||||
|
"params": {
|
||||||
|
"x_axis": "confidence_bucket",
|
||||||
|
"metrics": ["hit_rate"]
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"slice_name": "Confidence vs Realized Move",
|
||||||
|
"viz_type": "echarts_timeseries_scatter",
|
||||||
|
"description": "Scatter plot of predicted confidence vs actual realized move percentage",
|
||||||
|
"datasource_type": "trino",
|
||||||
|
"query": "SELECT ticker, predicted_confidence, actual_move_pct, predicted_action, outcome, dt FROM lakehouse.stonks.prediction_vs_outcome WHERE dt >= CURRENT_DATE - INTERVAL '30' DAY ORDER BY dt DESC",
|
||||||
|
"params": {
|
||||||
|
"x_axis": "predicted_confidence",
|
||||||
|
"y_axis": "actual_move_pct",
|
||||||
|
"groupby": ["outcome"]
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"slice_name": "Accuracy by Symbol",
|
||||||
|
"viz_type": "table",
|
||||||
|
"description": "Per-symbol prediction accuracy summary",
|
||||||
|
"datasource_type": "trino",
|
||||||
|
"query": "SELECT ticker, COUNT(*) AS predictions, COUNT(CASE WHEN outcome = 'correct' THEN 1 END) AS correct, COUNT(CASE WHEN outcome = 'incorrect' THEN 1 END) AS incorrect, ROUND(CAST(COUNT(CASE WHEN outcome = 'correct' THEN 1 END) AS DOUBLE) / NULLIF(COUNT(*), 0), 4) AS hit_rate, ROUND(AVG(predicted_confidence), 3) AS avg_confidence, ROUND(AVG(actual_move_pct), 3) AS avg_move_pct, ROUND(AVG(ABS(actual_move_pct)), 3) AS avg_abs_move_pct FROM lakehouse.stonks.prediction_vs_outcome WHERE dt >= CURRENT_DATE - INTERVAL '30' DAY GROUP BY ticker ORDER BY hit_rate DESC"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"slice_name": "Accuracy by Action Type",
|
||||||
|
"viz_type": "echarts_timeseries_bar",
|
||||||
|
"description": "Hit rate broken down by predicted action (buy, sell, hold, watch)",
|
||||||
|
"datasource_type": "trino",
|
||||||
|
"query": "SELECT predicted_action, COUNT(*) AS total, COUNT(CASE WHEN outcome = 'correct' THEN 1 END) AS correct, ROUND(CAST(COUNT(CASE WHEN outcome = 'correct' THEN 1 END) AS DOUBLE) / NULLIF(COUNT(*), 0), 4) AS hit_rate, ROUND(AVG(predicted_confidence), 3) AS avg_confidence FROM lakehouse.stonks.prediction_vs_outcome WHERE dt >= CURRENT_DATE - INTERVAL '30' DAY GROUP BY predicted_action ORDER BY predicted_action",
|
||||||
|
"params": {
|
||||||
|
"x_axis": "predicted_action",
|
||||||
|
"metrics": ["hit_rate"]
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"slice_name": "Recent Predictions",
|
||||||
|
"viz_type": "table",
|
||||||
|
"description": "Most recent evaluated predictions with outcomes",
|
||||||
|
"datasource_type": "trino",
|
||||||
|
"query": "SELECT ticker, predicted_action, ROUND(predicted_confidence, 3) AS confidence, ROUND(actual_move_pct, 3) AS actual_move_pct, outcome, horizon_days, model_version, predicted_at, evaluated_at FROM lakehouse.stonks.prediction_vs_outcome WHERE dt >= CURRENT_DATE - INTERVAL '14' DAY ORDER BY evaluated_at DESC LIMIT 50"
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
@@ -0,0 +1,120 @@
|
|||||||
|
{
|
||||||
|
"dashboard_title": "Sentiment Heatmap",
|
||||||
|
"description": "Market-wide sentiment visualization by sector and symbol, with trend direction and catalyst analysis.",
|
||||||
|
"slug": "sentiment-heatmap",
|
||||||
|
"position_json": {
|
||||||
|
"HEADER_ID": {"id": "HEADER_ID", "type": "HEADER", "meta": {"text": "Sentiment Heatmap"}},
|
||||||
|
"ROW-1": {
|
||||||
|
"type": "ROW",
|
||||||
|
"children": ["CHART-bullish-count-kpi", "CHART-bearish-count-kpi", "CHART-mixed-count-kpi", "CHART-avg-contradiction-kpi"]
|
||||||
|
},
|
||||||
|
"ROW-2": {
|
||||||
|
"type": "ROW",
|
||||||
|
"children": ["CHART-sentiment-heatmap"]
|
||||||
|
},
|
||||||
|
"ROW-3": {
|
||||||
|
"type": "ROW",
|
||||||
|
"children": ["CHART-sentiment-timeseries", "CHART-catalyst-breakdown"]
|
||||||
|
},
|
||||||
|
"ROW-4": {
|
||||||
|
"type": "ROW",
|
||||||
|
"children": ["CHART-contradiction-scatter", "CHART-sentiment-distribution"]
|
||||||
|
},
|
||||||
|
"ROW-5": {
|
||||||
|
"type": "ROW",
|
||||||
|
"children": ["CHART-symbol-sentiment-detail"]
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"metadata": {
|
||||||
|
"refresh_frequency": 300,
|
||||||
|
"default_filters": "{}",
|
||||||
|
"color_scheme": "supersetColors"
|
||||||
|
},
|
||||||
|
"charts": [
|
||||||
|
{
|
||||||
|
"slice_name": "Bullish Signals (7d)",
|
||||||
|
"viz_type": "big_number_total",
|
||||||
|
"description": "Count of bullish trend signals in the last 7 days",
|
||||||
|
"datasource_type": "trino",
|
||||||
|
"query": "SELECT COUNT(*) AS bullish_count FROM lakehouse.stonks.trade_signals WHERE trend_direction = 'bullish' AND dt >= CURRENT_DATE - INTERVAL '7' DAY"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"slice_name": "Bearish Signals (7d)",
|
||||||
|
"viz_type": "big_number_total",
|
||||||
|
"description": "Count of bearish trend signals in the last 7 days",
|
||||||
|
"datasource_type": "trino",
|
||||||
|
"query": "SELECT COUNT(*) AS bearish_count FROM lakehouse.stonks.trade_signals WHERE trend_direction = 'bearish' AND dt >= CURRENT_DATE - INTERVAL '7' DAY"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"slice_name": "Mixed Signals (7d)",
|
||||||
|
"viz_type": "big_number_total",
|
||||||
|
"description": "Count of mixed or neutral trend signals in the last 7 days",
|
||||||
|
"datasource_type": "trino",
|
||||||
|
"query": "SELECT COUNT(*) AS mixed_count FROM lakehouse.stonks.trade_signals WHERE trend_direction IN ('mixed', 'neutral') AND dt >= CURRENT_DATE - INTERVAL '7' DAY"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"slice_name": "Avg Contradiction Score (7d)",
|
||||||
|
"viz_type": "big_number_total",
|
||||||
|
"description": "Average contradiction score across all signals in the last 7 days",
|
||||||
|
"datasource_type": "trino",
|
||||||
|
"query": "SELECT ROUND(AVG(contradiction_score), 3) AS avg_contradiction FROM lakehouse.stonks.trade_signals WHERE dt >= CURRENT_DATE - INTERVAL '7' DAY"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"slice_name": "Sentiment Heatmap by Symbol",
|
||||||
|
"viz_type": "heatmap",
|
||||||
|
"description": "Daily average sentiment score (positive = 1, negative = -1, otherwise 0) by symbol over the last 14 days",
|
||||||
|
"datasource_type": "trino",
|
||||||
|
"query": "SELECT de.ticker, de.dt, ROUND(AVG(de.impact_score), 3) AS avg_impact, AVG(CASE WHEN de.sentiment = 'positive' THEN 1.0 WHEN de.sentiment = 'negative' THEN -1.0 ELSE 0.0 END) AS sentiment_score FROM lakehouse.stonks.document_extractions de WHERE de.dt >= CURRENT_DATE - INTERVAL '14' DAY GROUP BY de.ticker, de.dt ORDER BY de.ticker, de.dt",
|
||||||
|
"params": {
|
||||||
|
"x_axis": "dt",
|
||||||
|
"y_axis": "ticker",
|
||||||
|
"metric": "sentiment_score"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"slice_name": "Sentiment Trend Over Time",
|
||||||
|
"viz_type": "echarts_timeseries_line",
|
||||||
|
"description": "Daily average sentiment score across all symbols over the last 30 days",
|
||||||
|
"datasource_type": "trino",
|
||||||
|
"query": "SELECT de.dt AS bucket, ROUND(AVG(CASE WHEN de.sentiment = 'positive' THEN 1.0 WHEN de.sentiment = 'negative' THEN -1.0 ELSE 0.0 END), 3) AS avg_sentiment, COUNT(*) AS extraction_count FROM lakehouse.stonks.document_extractions de WHERE de.dt >= CURRENT_DATE - INTERVAL '30' DAY GROUP BY de.dt ORDER BY de.dt"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"slice_name": "Catalyst Type Breakdown",
|
||||||
|
"viz_type": "pie",
|
||||||
|
"description": "Distribution of catalyst types across extractions in the last 14 days",
|
||||||
|
"datasource_type": "trino",
|
||||||
|
"query": "SELECT catalyst_type, COUNT(*) AS count FROM lakehouse.stonks.document_extractions WHERE dt >= CURRENT_DATE - INTERVAL '14' DAY AND catalyst_type IS NOT NULL GROUP BY catalyst_type ORDER BY count DESC"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"slice_name": "Contradiction vs Confidence",
|
||||||
|
"viz_type": "echarts_timeseries_scatter",
|
||||||
|
"description": "Scatter of contradiction score vs confidence for recent signals",
|
||||||
|
"datasource_type": "trino",
|
||||||
|
"query": "SELECT ticker, confidence, contradiction_score, trend_strength, trend_direction, dt FROM lakehouse.stonks.trade_signals WHERE dt >= CURRENT_DATE - INTERVAL '14' DAY ORDER BY dt DESC",
|
||||||
|
"params": {
|
||||||
|
"x_axis": "confidence",
|
||||||
|
"y_axis": "contradiction_score",
|
||||||
|
"groupby": ["trend_direction"]
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"slice_name": "Sentiment Distribution by Symbol",
|
||||||
|
"viz_type": "echarts_timeseries_bar",
|
||||||
|
"description": "Count of positive, negative, and neutral extractions per symbol in the last 14 days",
|
||||||
|
"datasource_type": "trino",
|
||||||
|
"query": "SELECT ticker, sentiment, COUNT(*) AS count FROM lakehouse.stonks.document_extractions WHERE dt >= CURRENT_DATE - INTERVAL '14' DAY GROUP BY ticker, sentiment ORDER BY ticker, sentiment",
|
||||||
|
"params": {
|
||||||
|
"x_axis": "ticker",
|
||||||
|
"metrics": ["count"],
|
||||||
|
"groupby": ["sentiment"]
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"slice_name": "Symbol Sentiment Detail",
|
||||||
|
"viz_type": "table",
|
||||||
|
"description": "Per-symbol sentiment summary for the last 14 days with extraction counts, average impact, confidence, and novelty, sentiment counts, and the latest trend signal",
|
||||||
|
"datasource_type": "trino",
|
||||||
|
"query": "SELECT de.ticker, COUNT(*) AS extractions, ROUND(AVG(de.impact_score), 3) AS avg_impact, ROUND(AVG(de.confidence), 3) AS avg_confidence, ROUND(AVG(de.novelty_score), 3) AS avg_novelty, COUNT(CASE WHEN de.sentiment = 'positive' THEN 1 END) AS positive_count, COUNT(CASE WHEN de.sentiment = 'negative' THEN 1 END) AS negative_count, COUNT(CASE WHEN de.sentiment = 'neutral' THEN 1 END) AS neutral_count, ts.trend_direction AS latest_trend, ts.trend_strength AS latest_trend_strength FROM lakehouse.stonks.document_extractions de LEFT JOIN lakehouse.stonks.trade_signals ts ON de.ticker = ts.ticker AND ts.dt = (SELECT MAX(dt) FROM lakehouse.stonks.trade_signals WHERE ticker = de.ticker) WHERE de.dt >= CURRENT_DATE - INTERVAL '14' DAY GROUP BY de.ticker, ts.trend_direction, ts.trend_strength ORDER BY de.ticker"
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
@@ -0,0 +1,104 @@
|
|||||||
|
{
|
||||||
|
"dashboard_title": "Symbol Overview",
|
||||||
|
"description": "Company profiles, source health, recent documents, and market snapshot for tracked symbols.",
|
||||||
|
"slug": "symbol-overview",
|
||||||
|
"position_json": {
|
||||||
|
"HEADER_ID": {"id": "HEADER_ID", "type": "HEADER", "meta": {"text": "Symbol Overview"}},
|
||||||
|
"ROW-1": {
|
||||||
|
"type": "ROW",
|
||||||
|
"children": ["CHART-tracked-symbols-kpi", "CHART-total-documents-kpi", "CHART-total-extractions-kpi", "CHART-active-signals-kpi"]
|
||||||
|
},
|
||||||
|
"ROW-2": {
|
||||||
|
"type": "ROW",
|
||||||
|
"children": ["CHART-company-summary-table"]
|
||||||
|
},
|
||||||
|
"ROW-3": {
|
||||||
|
"type": "ROW",
|
||||||
|
"children": ["CHART-recent-documents-timeseries", "CHART-document-type-breakdown"]
|
||||||
|
},
|
||||||
|
"ROW-4": {
|
||||||
|
"type": "ROW",
|
||||||
|
"children": ["CHART-latest-prices-table"]
|
||||||
|
},
|
||||||
|
"ROW-5": {
|
||||||
|
"type": "ROW",
|
||||||
|
"children": ["CHART-recent-documents-table"]
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"metadata": {
|
||||||
|
"refresh_frequency": 300,
|
||||||
|
"default_filters": "{}",
|
||||||
|
"color_scheme": "supersetColors"
|
||||||
|
},
|
||||||
|
"charts": [
|
||||||
|
{
|
||||||
|
"slice_name": "Tracked Symbols",
|
||||||
|
"viz_type": "big_number_total",
|
||||||
|
"description": "Count of distinct symbols with documents in the last 30 days",
|
||||||
|
"datasource_type": "trino",
|
||||||
|
"query": "SELECT COUNT(DISTINCT ticker) AS tracked_symbols FROM lakehouse.stonks.documents WHERE dt >= CURRENT_DATE - INTERVAL '30' DAY"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"slice_name": "Total Documents (30d)",
|
||||||
|
"viz_type": "big_number_total",
|
||||||
|
"description": "Total documents ingested in the last 30 days",
|
||||||
|
"datasource_type": "trino",
|
||||||
|
"query": "SELECT COUNT(*) AS total_documents FROM lakehouse.stonks.documents WHERE dt >= CURRENT_DATE - INTERVAL '30' DAY"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"slice_name": "Total Extractions (30d)",
|
||||||
|
"viz_type": "big_number_total",
|
||||||
|
"description": "Total AI extractions completed in the last 30 days",
|
||||||
|
"datasource_type": "trino",
|
||||||
|
"query": "SELECT COUNT(*) AS total_extractions FROM lakehouse.stonks.document_extractions WHERE dt >= CURRENT_DATE - INTERVAL '30' DAY"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"slice_name": "Active Signals (7d)",
|
||||||
|
"viz_type": "big_number_total",
|
||||||
|
"description": "Trade signals generated in the last 7 days",
|
||||||
|
"datasource_type": "trino",
|
||||||
|
"query": "SELECT COUNT(*) AS active_signals FROM lakehouse.stonks.trade_signals WHERE dt >= CURRENT_DATE - INTERVAL '7' DAY"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"slice_name": "Company Summary",
|
||||||
|
"viz_type": "table",
|
||||||
|
"description": "Per-symbol summary with document counts, extraction counts, latest signal, and latest price",
|
||||||
|
"datasource_type": "trino",
|
||||||
|
"query": "SELECT d.ticker, COUNT(DISTINCT d.document_id) AS documents_30d, COUNT(DISTINCT de.document_id) AS extractions_30d, MAX(d.published_at) AS latest_document_at, MAX(ts.generated_at) AS latest_signal_at, MAX(ts.trend_direction) AS latest_trend, MAX(mb.close_price) AS latest_close FROM lakehouse.stonks.documents d LEFT JOIN lakehouse.stonks.document_extractions de ON d.ticker = de.ticker AND de.dt >= CURRENT_DATE - INTERVAL '30' DAY LEFT JOIN lakehouse.stonks.trade_signals ts ON d.ticker = ts.ticker AND ts.dt = (SELECT MAX(dt) FROM lakehouse.stonks.trade_signals WHERE ticker = d.ticker) LEFT JOIN lakehouse.stonks.market_bars mb ON d.ticker = mb.ticker AND mb.dt = (SELECT MAX(dt) FROM lakehouse.stonks.market_bars WHERE ticker = d.ticker) WHERE d.dt >= CURRENT_DATE - INTERVAL '30' DAY GROUP BY d.ticker ORDER BY d.ticker"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"slice_name": "Documents Ingested Over Time",
|
||||||
|
"viz_type": "echarts_timeseries_bar",
|
||||||
|
"description": "Daily document ingestion counts by source type over the last 30 days",
|
||||||
|
"datasource_type": "trino",
|
||||||
|
"query": "SELECT dt AS bucket, source_type, COUNT(*) AS doc_count FROM lakehouse.stonks.documents WHERE dt >= CURRENT_DATE - INTERVAL '30' DAY GROUP BY dt, source_type ORDER BY dt",
|
||||||
|
"params": {
|
||||||
|
"x_axis": "bucket",
|
||||||
|
"metrics": ["doc_count"],
|
||||||
|
"groupby": ["source_type"],
|
||||||
|
"time_grain_sqla": "P1D"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"slice_name": "Document Type Breakdown",
|
||||||
|
"viz_type": "pie",
|
||||||
|
"description": "Distribution of documents by type in the last 30 days",
|
||||||
|
"datasource_type": "trino",
|
||||||
|
"query": "SELECT document_type, COUNT(*) AS count FROM lakehouse.stonks.documents WHERE dt >= CURRENT_DATE - INTERVAL '30' DAY GROUP BY document_type ORDER BY count DESC"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"slice_name": "Latest Prices by Symbol",
|
||||||
|
"viz_type": "table",
|
||||||
|
"description": "Most recent closing prices and volume for each tracked symbol",
|
||||||
|
"datasource_type": "trino",
|
||||||
|
"query": "SELECT mb.ticker, mb.close_price, mb.open_price, mb.high_price, mb.low_price, mb.volume, mb.vwap, mb.bar_timestamp FROM lakehouse.stonks.market_bars mb INNER JOIN (SELECT ticker, MAX(bar_timestamp) AS max_ts FROM lakehouse.stonks.market_bars GROUP BY ticker) latest ON mb.ticker = latest.ticker AND mb.bar_timestamp = latest.max_ts ORDER BY mb.ticker"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"slice_name": "Recent Documents",
|
||||||
|
"viz_type": "table",
|
||||||
|
"description": "Most recently ingested documents across all symbols",
|
||||||
|
"datasource_type": "trino",
|
||||||
|
"query": "SELECT ticker, document_type, source_type, title, publisher, published_at, retrieved_at, confidence FROM lakehouse.stonks.documents WHERE dt >= CURRENT_DATE - INTERVAL '7' DAY ORDER BY retrieved_at DESC LIMIT 50"
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
+7
-1
@@ -72,6 +72,9 @@ services:
|
|||||||
image: trinodb/trino:latest
|
image: trinodb/trino:latest
|
||||||
ports:
|
ports:
|
||||||
- "8080:8080"
|
- "8080:8080"
|
||||||
|
environment:
|
||||||
|
MINIO_ACCESS_KEY: minioadmin
|
||||||
|
MINIO_SECRET_KEY: minioadmin
|
||||||
volumes:
|
volumes:
|
||||||
- ./infra/trino/catalog:/etc/trino/catalog
|
- ./infra/trino/catalog:/etc/trino/catalog
|
||||||
depends_on:
|
depends_on:
|
||||||
@@ -83,11 +86,14 @@ services:
|
|||||||
environment:
|
environment:
|
||||||
SERVICE_NAME: metastore
|
SERVICE_NAME: metastore
|
||||||
DB_DRIVER: derby
|
DB_DRIVER: derby
|
||||||
SERVICE_OPTS: "-Djavax.jdo.option.ConnectionURL=jdbc:derby:/opt/hive/data/metastore_db;create=true"
|
|
||||||
ports:
|
ports:
|
||||||
- "9083:9083"
|
- "9083:9083"
|
||||||
volumes:
|
volumes:
|
||||||
- hive_data:/opt/hive/data
|
- hive_data:/opt/hive/data
|
||||||
|
- ./infra/hive/core-site.xml:/opt/hive/conf/core-site.xml:ro
|
||||||
|
- ./infra/hive/metastore-site.xml:/opt/hive/conf/metastore-site.xml:ro
|
||||||
|
depends_on:
|
||||||
|
- minio
|
||||||
|
|
||||||
superset:
|
superset:
|
||||||
image: apache/superset:latest
|
image: apache/superset:latest
|
||||||
|
|||||||
@@ -0,0 +1,27 @@
|
|||||||
|
<?xml version="1.0"?>
|
||||||
|
<configuration>
|
||||||
|
<property>
|
||||||
|
<name>fs.s3a.endpoint</name>
|
||||||
|
<value>http://minio:9000</value>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>fs.s3a.access.key</name>
|
||||||
|
<value>minioadmin</value>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>fs.s3a.secret.key</name>
|
||||||
|
<value>minioadmin</value>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>fs.s3a.path.style.access</name>
|
||||||
|
<value>true</value>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>fs.s3a.impl</name>
|
||||||
|
<value>org.apache.hadoop.fs.s3a.S3AFileSystem</value>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>fs.s3a.connection.ssl.enabled</name>
|
||||||
|
<value>false</value>
|
||||||
|
</property>
|
||||||
|
</configuration>
|
||||||
@@ -0,0 +1,27 @@
|
|||||||
|
<?xml version="1.0"?>
|
||||||
|
<configuration>
|
||||||
|
<property>
|
||||||
|
<name>metastore.thrift.uris</name>
|
||||||
|
<value>thrift://0.0.0.0:9083</value>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>metastore.task.threads.always</name>
|
||||||
|
<value>org.apache.hadoop.hive.metastore.events.EventCleanerTask</value>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>metastore.expression.proxy</name>
|
||||||
|
<value>org.apache.hadoop.hive.metastore.DefaultPartitionExpressionProxy</value>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>javax.jdo.option.ConnectionDriverName</name>
|
||||||
|
<value>org.apache.derby.jdbc.EmbeddedDriver</value>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>javax.jdo.option.ConnectionURL</name>
|
||||||
|
<value>jdbc:derby:/opt/hive/data/metastore_db;create=true</value>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>metastore.warehouse.dir</name>
|
||||||
|
<value>s3a://stonks-lakehouse/warehouse</value>
|
||||||
|
</property>
|
||||||
|
</configuration>
|
||||||
@@ -6,6 +6,7 @@ metadata:
|
|||||||
labels:
|
labels:
|
||||||
app: aggregation-worker
|
app: aggregation-worker
|
||||||
app.kubernetes.io/part-of: stonks-oracle
|
app.kubernetes.io/part-of: stonks-oracle
|
||||||
|
stonks-oracle/tier: processing
|
||||||
spec:
|
spec:
|
||||||
replicas: 1
|
replicas: 1
|
||||||
selector:
|
selector:
|
||||||
@@ -15,16 +16,30 @@ spec:
|
|||||||
metadata:
|
metadata:
|
||||||
labels:
|
labels:
|
||||||
app: aggregation-worker
|
app: aggregation-worker
|
||||||
|
stonks-oracle/tier: processing
|
||||||
spec:
|
spec:
|
||||||
|
automountServiceAccountToken: false
|
||||||
|
securityContext:
|
||||||
|
runAsNonRoot: true
|
||||||
|
runAsUser: 1000
|
||||||
|
runAsGroup: 1000
|
||||||
|
fsGroup: 1000
|
||||||
|
seccompProfile:
|
||||||
|
type: RuntimeDefault
|
||||||
containers:
|
containers:
|
||||||
- name: aggregation-worker
|
- name: aggregation-worker
|
||||||
image: ghcr.io/celesrenata/stonks-oracle/aggregation:latest
|
image: ghcr.io/celesrenata/stonks-oracle/aggregation:latest
|
||||||
imagePullPolicy: Always
|
imagePullPolicy: Always
|
||||||
|
securityContext:
|
||||||
|
allowPrivilegeEscalation: false
|
||||||
|
readOnlyRootFilesystem: true
|
||||||
|
capabilities:
|
||||||
|
drop: ["ALL"]
|
||||||
envFrom:
|
envFrom:
|
||||||
- configMapRef:
|
- configMapRef:
|
||||||
name: stonks-config
|
name: stonks-config
|
||||||
- secretRef:
|
- secretRef:
|
||||||
name: stonks-secrets
|
name: stonks-core-secrets
|
||||||
resources:
|
resources:
|
||||||
requests:
|
requests:
|
||||||
cpu: 100m
|
cpu: 100m
|
||||||
@@ -32,3 +47,10 @@ spec:
|
|||||||
limits:
|
limits:
|
||||||
cpu: 500m
|
cpu: 500m
|
||||||
memory: 256Mi
|
memory: 256Mi
|
||||||
|
volumeMounts:
|
||||||
|
- name: tmp
|
||||||
|
mountPath: /tmp
|
||||||
|
volumes:
|
||||||
|
- name: tmp
|
||||||
|
emptyDir:
|
||||||
|
sizeLimit: 10Mi
|
||||||
|
|||||||
@@ -6,6 +6,7 @@ metadata:
|
|||||||
labels:
|
labels:
|
||||||
app: broker-adapter
|
app: broker-adapter
|
||||||
app.kubernetes.io/part-of: stonks-oracle
|
app.kubernetes.io/part-of: stonks-oracle
|
||||||
|
stonks-oracle/tier: trading
|
||||||
spec:
|
spec:
|
||||||
replicas: 1
|
replicas: 1
|
||||||
selector:
|
selector:
|
||||||
@@ -15,16 +16,32 @@ spec:
|
|||||||
metadata:
|
metadata:
|
||||||
labels:
|
labels:
|
||||||
app: broker-adapter
|
app: broker-adapter
|
||||||
|
stonks-oracle/tier: trading
|
||||||
spec:
|
spec:
|
||||||
|
automountServiceAccountToken: false
|
||||||
|
securityContext:
|
||||||
|
runAsNonRoot: true
|
||||||
|
runAsUser: 1000
|
||||||
|
runAsGroup: 1000
|
||||||
|
fsGroup: 1000
|
||||||
|
seccompProfile:
|
||||||
|
type: RuntimeDefault
|
||||||
containers:
|
containers:
|
||||||
- name: broker-adapter
|
- name: broker-adapter
|
||||||
image: ghcr.io/celesrenata/stonks-oracle/broker-adapter:latest
|
image: ghcr.io/celesrenata/stonks-oracle/broker-adapter:latest
|
||||||
imagePullPolicy: Always
|
imagePullPolicy: Always
|
||||||
|
securityContext:
|
||||||
|
allowPrivilegeEscalation: false
|
||||||
|
readOnlyRootFilesystem: true
|
||||||
|
capabilities:
|
||||||
|
drop: ["ALL"]
|
||||||
envFrom:
|
envFrom:
|
||||||
- configMapRef:
|
- configMapRef:
|
||||||
name: stonks-config
|
name: stonks-config
|
||||||
- secretRef:
|
- secretRef:
|
||||||
name: stonks-secrets
|
name: stonks-core-secrets
|
||||||
|
- secretRef:
|
||||||
|
name: stonks-broker-secrets
|
||||||
resources:
|
resources:
|
||||||
requests:
|
requests:
|
||||||
cpu: 50m
|
cpu: 50m
|
||||||
@@ -32,3 +49,10 @@ spec:
|
|||||||
limits:
|
limits:
|
||||||
cpu: 200m
|
cpu: 200m
|
||||||
memory: 128Mi
|
memory: 128Mi
|
||||||
|
volumeMounts:
|
||||||
|
- name: tmp
|
||||||
|
mountPath: /tmp
|
||||||
|
volumes:
|
||||||
|
- name: tmp
|
||||||
|
emptyDir:
|
||||||
|
sizeLimit: 10Mi
|
||||||
|
|||||||
@@ -25,15 +25,48 @@ data:
|
|||||||
OLLAMA_BASE_URL: "http://ollama.ollama-service.svc.cluster.local:11434"
|
OLLAMA_BASE_URL: "http://ollama.ollama-service.svc.cluster.local:11434"
|
||||||
OLLAMA_MODEL: "llama3.1:8b"
|
OLLAMA_MODEL: "llama3.1:8b"
|
||||||
OLLAMA_TIMEOUT: "120"
|
OLLAMA_TIMEOUT: "120"
|
||||||
|
OLLAMA_MAX_RETRIES: "2"
|
||||||
|
OLLAMA_RETRY_BASE_DELAY: "1.0"
|
||||||
|
OLLAMA_RETRY_MAX_DELAY: "10.0"
|
||||||
|
OLLAMA_RETRY_BACKOFF_MULTIPLIER: "2.0"
|
||||||
|
|
||||||
# Trino — deployed in stonks-oracle namespace
|
# Trino — deployed in stonks-oracle namespace
|
||||||
TRINO_HOST: "trino.stonks-oracle.svc.cluster.local"
|
TRINO_HOST: "trino.stonks-oracle.svc.cluster.local"
|
||||||
TRINO_PORT: "8080"
|
TRINO_PORT: "8080"
|
||||||
TRINO_CATALOG: "lakehouse"
|
TRINO_CATALOG: "lakehouse"
|
||||||
TRINO_SCHEMA: "stonks"
|
TRINO_SCHEMA: "stonks"
|
||||||
|
TRINO_ICEBERG_CATALOG: "iceberg"
|
||||||
|
|
||||||
# Broker
|
# Broker
|
||||||
BROKER_MODE: "paper"
|
BROKER_MODE: "paper"
|
||||||
|
BROKER_PROVIDER: "alpaca"
|
||||||
|
|
||||||
|
# Market Data
|
||||||
|
MARKET_DATA_BASE_URL: "https://api.polygon.io"
|
||||||
|
MARKET_DATA_PROVIDER: "polygon"
|
||||||
|
|
||||||
|
# Retention (days per bucket class)
|
||||||
|
RETENTION_RAW_MARKET_DAYS: "90"
|
||||||
|
RETENTION_RAW_NEWS_DAYS: "180"
|
||||||
|
RETENTION_RAW_FILINGS_DAYS: "365"
|
||||||
|
RETENTION_NORMALIZED_DAYS: "180"
|
||||||
|
RETENTION_LLM_PROMPTS_DAYS: "365"
|
||||||
|
RETENTION_LLM_RESULTS_DAYS: "365"
|
||||||
|
RETENTION_LAKEHOUSE_DAYS: "730"
|
||||||
|
RETENTION_AUDIT_DAYS: "730"
|
||||||
|
RETENTION_CLEANUP_INTERVAL_HOURS: "24"
|
||||||
|
RETENTION_BATCH_SIZE: "1000"
|
||||||
|
|
||||||
# General
|
# General
|
||||||
LOG_LEVEL: "INFO"
|
LOG_LEVEL: "INFO"
|
||||||
|
JSON_LOGS: "true"
|
||||||
|
|
||||||
|
# Alerting thresholds
|
||||||
|
ALERT_SOURCE_FAILURE_THRESHOLD: "3"
|
||||||
|
ALERT_SOURCE_FAILURE_WINDOW_HOURS: "6"
|
||||||
|
ALERT_SCHEMA_FAILURE_RATE_THRESHOLD: "0.3"
|
||||||
|
ALERT_SCHEMA_FAILURE_WINDOW_HOURS: "1"
|
||||||
|
ALERT_LAKE_LAG_THRESHOLD_MINUTES: "60"
|
||||||
|
ALERT_BROKER_ERROR_THRESHOLD: "3"
|
||||||
|
ALERT_BROKER_ERROR_WINDOW_HOURS: "1"
|
||||||
|
ALERT_CHECK_INTERVAL_SECONDS: "120"
|
||||||
|
|||||||
@@ -6,6 +6,7 @@ metadata:
|
|||||||
labels:
|
labels:
|
||||||
app: extractor-worker
|
app: extractor-worker
|
||||||
app.kubernetes.io/part-of: stonks-oracle
|
app.kubernetes.io/part-of: stonks-oracle
|
||||||
|
stonks-oracle/tier: processing
|
||||||
spec:
|
spec:
|
||||||
replicas: 1
|
replicas: 1
|
||||||
selector:
|
selector:
|
||||||
@@ -15,16 +16,30 @@ spec:
|
|||||||
metadata:
|
metadata:
|
||||||
labels:
|
labels:
|
||||||
app: extractor-worker
|
app: extractor-worker
|
||||||
|
stonks-oracle/tier: processing
|
||||||
spec:
|
spec:
|
||||||
|
automountServiceAccountToken: false
|
||||||
|
securityContext:
|
||||||
|
runAsNonRoot: true
|
||||||
|
runAsUser: 1000
|
||||||
|
runAsGroup: 1000
|
||||||
|
fsGroup: 1000
|
||||||
|
seccompProfile:
|
||||||
|
type: RuntimeDefault
|
||||||
containers:
|
containers:
|
||||||
- name: extractor-worker
|
- name: extractor-worker
|
||||||
image: ghcr.io/celesrenata/stonks-oracle/extractor:latest
|
image: ghcr.io/celesrenata/stonks-oracle/extractor:latest
|
||||||
imagePullPolicy: Always
|
imagePullPolicy: Always
|
||||||
|
securityContext:
|
||||||
|
allowPrivilegeEscalation: false
|
||||||
|
readOnlyRootFilesystem: true
|
||||||
|
capabilities:
|
||||||
|
drop: ["ALL"]
|
||||||
envFrom:
|
envFrom:
|
||||||
- configMapRef:
|
- configMapRef:
|
||||||
name: stonks-config
|
name: stonks-config
|
||||||
- secretRef:
|
- secretRef:
|
||||||
name: stonks-secrets
|
name: stonks-core-secrets
|
||||||
resources:
|
resources:
|
||||||
requests:
|
requests:
|
||||||
cpu: 200m
|
cpu: 200m
|
||||||
@@ -32,3 +47,10 @@ spec:
|
|||||||
limits:
|
limits:
|
||||||
cpu: "1"
|
cpu: "1"
|
||||||
memory: 512Mi
|
memory: 512Mi
|
||||||
|
volumeMounts:
|
||||||
|
- name: tmp
|
||||||
|
mountPath: /tmp
|
||||||
|
volumes:
|
||||||
|
- name: tmp
|
||||||
|
emptyDir:
|
||||||
|
sizeLimit: 10Mi
|
||||||
|
|||||||
@@ -6,6 +6,7 @@ metadata:
|
|||||||
labels:
|
labels:
|
||||||
app: hive-metastore
|
app: hive-metastore
|
||||||
app.kubernetes.io/part-of: stonks-oracle
|
app.kubernetes.io/part-of: stonks-oracle
|
||||||
|
stonks-oracle/tier: analytics
|
||||||
spec:
|
spec:
|
||||||
replicas: 1
|
replicas: 1
|
||||||
selector:
|
selector:
|
||||||
@@ -15,22 +16,121 @@ spec:
|
|||||||
metadata:
|
metadata:
|
||||||
labels:
|
labels:
|
||||||
app: hive-metastore
|
app: hive-metastore
|
||||||
|
stonks-oracle/tier: analytics
|
||||||
spec:
|
spec:
|
||||||
|
automountServiceAccountToken: false
|
||||||
|
securityContext:
|
||||||
|
runAsNonRoot: true
|
||||||
|
runAsUser: 1000
|
||||||
|
runAsGroup: 1000
|
||||||
|
fsGroup: 1000
|
||||||
|
seccompProfile:
|
||||||
|
type: RuntimeDefault
|
||||||
|
initContainers:
|
||||||
|
- name: hive-config-init
|
||||||
|
image: busybox:1.36
|
||||||
|
securityContext:
|
||||||
|
allowPrivilegeEscalation: false
|
||||||
|
capabilities:
|
||||||
|
drop: ["ALL"]
|
||||||
|
command: ["sh", "-c"]
|
||||||
|
args:
|
||||||
|
- |
|
||||||
|
cat > /hive-config/core-site.xml <<EOF
|
||||||
|
<?xml version="1.0"?>
|
||||||
|
<configuration>
|
||||||
|
<property>
|
||||||
|
<name>fs.s3a.endpoint</name>
|
||||||
|
<value>http://minio.minio-service.svc.cluster.local:80</value>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>fs.s3a.access.key</name>
|
||||||
|
<value>${MINIO_ACCESS_KEY}</value>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>fs.s3a.secret.key</name>
|
||||||
|
<value>${MINIO_SECRET_KEY}</value>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>fs.s3a.path.style.access</name>
|
||||||
|
<value>true</value>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>fs.s3a.impl</name>
|
||||||
|
<value>org.apache.hadoop.fs.s3a.S3AFileSystem</value>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>fs.s3a.connection.ssl.enabled</name>
|
||||||
|
<value>false</value>
|
||||||
|
</property>
|
||||||
|
</configuration>
|
||||||
|
EOF
|
||||||
|
cat > /hive-config/metastore-site.xml <<EOF
|
||||||
|
<?xml version="1.0"?>
|
||||||
|
<configuration>
|
||||||
|
<property>
|
||||||
|
<name>metastore.thrift.uris</name>
|
||||||
|
<value>thrift://0.0.0.0:9083</value>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>metastore.task.threads.always</name>
|
||||||
|
<value>org.apache.hadoop.hive.metastore.events.EventCleanerTask</value>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>metastore.expression.proxy</name>
|
||||||
|
<value>org.apache.hadoop.hive.metastore.DefaultPartitionExpressionProxy</value>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>javax.jdo.option.ConnectionDriverName</name>
|
||||||
|
<value>org.apache.derby.jdbc.EmbeddedDriver</value>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>javax.jdo.option.ConnectionURL</name>
|
||||||
|
<value>jdbc:derby:/opt/hive/data/metastore_db;create=true</value>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>metastore.warehouse.dir</name>
|
||||||
|
<value>s3a://stonks-lakehouse/warehouse</value>
|
||||||
|
</property>
|
||||||
|
</configuration>
|
||||||
|
EOF
|
||||||
|
env:
|
||||||
|
- name: MINIO_ACCESS_KEY
|
||||||
|
valueFrom:
|
||||||
|
secretKeyRef:
|
||||||
|
name: stonks-core-secrets
|
||||||
|
key: MINIO_ACCESS_KEY
|
||||||
|
- name: MINIO_SECRET_KEY
|
||||||
|
valueFrom:
|
||||||
|
secretKeyRef:
|
||||||
|
name: stonks-core-secrets
|
||||||
|
key: MINIO_SECRET_KEY
|
||||||
|
volumeMounts:
|
||||||
|
- name: hive-config
|
||||||
|
mountPath: /hive-config
|
||||||
containers:
|
containers:
|
||||||
- name: hive-metastore
|
- name: hive-metastore
|
||||||
image: apache/hive:4.0.0
|
image: apache/hive:4.0.0
|
||||||
ports:
|
ports:
|
||||||
- containerPort: 9083
|
- containerPort: 9083
|
||||||
|
securityContext:
|
||||||
|
allowPrivilegeEscalation: false
|
||||||
|
capabilities:
|
||||||
|
drop: ["ALL"]
|
||||||
env:
|
env:
|
||||||
- name: SERVICE_NAME
|
- name: SERVICE_NAME
|
||||||
value: metastore
|
value: metastore
|
||||||
- name: DB_DRIVER
|
- name: DB_DRIVER
|
||||||
value: derby
|
value: derby
|
||||||
- name: SERVICE_OPTS
|
|
||||||
value: "-Djavax.jdo.option.ConnectionURL=jdbc:derby:/opt/hive/data/metastore_db;create=true"
|
|
||||||
volumeMounts:
|
volumeMounts:
|
||||||
- name: hive-data
|
- name: hive-data
|
||||||
mountPath: /opt/hive/data
|
mountPath: /opt/hive/data
|
||||||
|
- name: hive-config
|
||||||
|
mountPath: /opt/hive/conf/core-site.xml
|
||||||
|
subPath: core-site.xml
|
||||||
|
- name: hive-config
|
||||||
|
mountPath: /opt/hive/conf/metastore-site.xml
|
||||||
|
subPath: metastore-site.xml
|
||||||
resources:
|
resources:
|
||||||
requests:
|
requests:
|
||||||
cpu: 200m
|
cpu: 200m
|
||||||
@@ -42,6 +142,8 @@ spec:
|
|||||||
- name: hive-data
|
- name: hive-data
|
||||||
persistentVolumeClaim:
|
persistentVolumeClaim:
|
||||||
claimName: hive-metastore-data
|
claimName: hive-metastore-data
|
||||||
|
- name: hive-config
|
||||||
|
emptyDir: {}
|
||||||
---
|
---
|
||||||
apiVersion: v1
|
apiVersion: v1
|
||||||
kind: Service
|
kind: Service
|
||||||
|
|||||||
@@ -6,6 +6,7 @@ metadata:
|
|||||||
labels:
|
labels:
|
||||||
app: ingestion-worker
|
app: ingestion-worker
|
||||||
app.kubernetes.io/part-of: stonks-oracle
|
app.kubernetes.io/part-of: stonks-oracle
|
||||||
|
stonks-oracle/tier: ingestion
|
||||||
spec:
|
spec:
|
||||||
replicas: 2
|
replicas: 2
|
||||||
selector:
|
selector:
|
||||||
@@ -15,16 +16,32 @@ spec:
|
|||||||
metadata:
|
metadata:
|
||||||
labels:
|
labels:
|
||||||
app: ingestion-worker
|
app: ingestion-worker
|
||||||
|
stonks-oracle/tier: ingestion
|
||||||
spec:
|
spec:
|
||||||
|
automountServiceAccountToken: false
|
||||||
|
securityContext:
|
||||||
|
runAsNonRoot: true
|
||||||
|
runAsUser: 1000
|
||||||
|
runAsGroup: 1000
|
||||||
|
fsGroup: 1000
|
||||||
|
seccompProfile:
|
||||||
|
type: RuntimeDefault
|
||||||
containers:
|
containers:
|
||||||
- name: ingestion-worker
|
- name: ingestion-worker
|
||||||
image: ghcr.io/celesrenata/stonks-oracle/ingestion:latest
|
image: ghcr.io/celesrenata/stonks-oracle/ingestion:latest
|
||||||
imagePullPolicy: Always
|
imagePullPolicy: Always
|
||||||
|
securityContext:
|
||||||
|
allowPrivilegeEscalation: false
|
||||||
|
readOnlyRootFilesystem: true
|
||||||
|
capabilities:
|
||||||
|
drop: ["ALL"]
|
||||||
envFrom:
|
envFrom:
|
||||||
- configMapRef:
|
- configMapRef:
|
||||||
name: stonks-config
|
name: stonks-config
|
||||||
- secretRef:
|
- secretRef:
|
||||||
name: stonks-secrets
|
name: stonks-core-secrets
|
||||||
|
- secretRef:
|
||||||
|
name: stonks-market-secrets
|
||||||
resources:
|
resources:
|
||||||
requests:
|
requests:
|
||||||
cpu: 100m
|
cpu: 100m
|
||||||
@@ -32,3 +49,10 @@ spec:
|
|||||||
limits:
|
limits:
|
||||||
cpu: 500m
|
cpu: 500m
|
||||||
memory: 256Mi
|
memory: 256Mi
|
||||||
|
volumeMounts:
|
||||||
|
- name: tmp
|
||||||
|
mountPath: /tmp
|
||||||
|
volumes:
|
||||||
|
- name: tmp
|
||||||
|
emptyDir:
|
||||||
|
sizeLimit: 10Mi
|
||||||
|
|||||||
@@ -6,6 +6,7 @@ metadata:
|
|||||||
labels:
|
labels:
|
||||||
app: lake-publisher
|
app: lake-publisher
|
||||||
app.kubernetes.io/part-of: stonks-oracle
|
app.kubernetes.io/part-of: stonks-oracle
|
||||||
|
stonks-oracle/tier: analytics
|
||||||
spec:
|
spec:
|
||||||
replicas: 1
|
replicas: 1
|
||||||
selector:
|
selector:
|
||||||
@@ -15,16 +16,30 @@ spec:
|
|||||||
metadata:
|
metadata:
|
||||||
labels:
|
labels:
|
||||||
app: lake-publisher
|
app: lake-publisher
|
||||||
|
stonks-oracle/tier: analytics
|
||||||
spec:
|
spec:
|
||||||
|
automountServiceAccountToken: false
|
||||||
|
securityContext:
|
||||||
|
runAsNonRoot: true
|
||||||
|
runAsUser: 1000
|
||||||
|
runAsGroup: 1000
|
||||||
|
fsGroup: 1000
|
||||||
|
seccompProfile:
|
||||||
|
type: RuntimeDefault
|
||||||
containers:
|
containers:
|
||||||
- name: lake-publisher
|
- name: lake-publisher
|
||||||
image: ghcr.io/celesrenata/stonks-oracle/lake-publisher:latest
|
image: ghcr.io/celesrenata/stonks-oracle/lake-publisher:latest
|
||||||
imagePullPolicy: Always
|
imagePullPolicy: Always
|
||||||
|
securityContext:
|
||||||
|
allowPrivilegeEscalation: false
|
||||||
|
readOnlyRootFilesystem: true
|
||||||
|
capabilities:
|
||||||
|
drop: ["ALL"]
|
||||||
envFrom:
|
envFrom:
|
||||||
- configMapRef:
|
- configMapRef:
|
||||||
name: stonks-config
|
name: stonks-config
|
||||||
- secretRef:
|
- secretRef:
|
||||||
name: stonks-secrets
|
name: stonks-core-secrets
|
||||||
resources:
|
resources:
|
||||||
requests:
|
requests:
|
||||||
cpu: 100m
|
cpu: 100m
|
||||||
@@ -32,3 +47,10 @@ spec:
|
|||||||
limits:
|
limits:
|
||||||
cpu: 500m
|
cpu: 500m
|
||||||
memory: 256Mi
|
memory: 256Mi
|
||||||
|
volumeMounts:
|
||||||
|
- name: tmp
|
||||||
|
mountPath: /tmp
|
||||||
|
volumes:
|
||||||
|
- name: tmp
|
||||||
|
emptyDir:
|
||||||
|
sizeLimit: 10Mi
|
||||||
|
|||||||
@@ -4,3 +4,4 @@ metadata:
|
|||||||
name: stonks-oracle
|
name: stonks-oracle
|
||||||
labels:
|
labels:
|
||||||
app.kubernetes.io/part-of: stonks-oracle
|
app.kubernetes.io/part-of: stonks-oracle
|
||||||
|
kubernetes.io/metadata.name: stonks-oracle
|
||||||
|
|||||||
@@ -0,0 +1,173 @@
|
|||||||
|
##
|
||||||
|
## Stonks Oracle — Network Policies
|
||||||
|
##
|
||||||
|
## Default-deny ingress for the namespace, then allow only the
|
||||||
|
## traffic patterns each component actually needs.
|
||||||
|
##
|
||||||
|
## Requirements: 8.2 (trading isolation), 12.1 (observability)
|
||||||
|
##
|
||||||
|
|
||||||
|
# ── Default deny all ingress in the namespace ──────────────────────────
|
||||||
|
apiVersion: networking.k8s.io/v1
|
||||||
|
kind: NetworkPolicy
|
||||||
|
metadata:
|
||||||
|
name: default-deny-ingress
|
||||||
|
namespace: stonks-oracle
|
||||||
|
spec:
|
||||||
|
podSelector: {}
|
||||||
|
policyTypes:
|
||||||
|
- Ingress
|
||||||
|
---
|
||||||
|
# ── Query API: accept from kube-system (Traefik ingress) only ─────────
|
||||||
|
apiVersion: networking.k8s.io/v1
|
||||||
|
kind: NetworkPolicy
|
||||||
|
metadata:
|
||||||
|
name: allow-query-api-ingress
|
||||||
|
namespace: stonks-oracle
|
||||||
|
spec:
|
||||||
|
podSelector:
|
||||||
|
matchLabels:
|
||||||
|
app: query-api
|
||||||
|
policyTypes:
|
||||||
|
- Ingress
|
||||||
|
ingress:
|
||||||
|
- from:
|
||||||
|
- namespaceSelector:
|
||||||
|
matchLabels:
|
||||||
|
kubernetes.io/metadata.name: kube-system
|
||||||
|
ports:
|
||||||
|
- protocol: TCP
|
||||||
|
port: 8000
|
||||||
|
---
|
||||||
|
# ── Symbol Registry API: accept from kube-system (Traefik ingress) only ──
|
||||||
|
apiVersion: networking.k8s.io/v1
|
||||||
|
kind: NetworkPolicy
|
||||||
|
metadata:
|
||||||
|
name: allow-symbol-registry-ingress
|
||||||
|
namespace: stonks-oracle
|
||||||
|
spec:
|
||||||
|
podSelector:
|
||||||
|
matchLabels:
|
||||||
|
app: symbol-registry-api
|
||||||
|
policyTypes:
|
||||||
|
- Ingress
|
||||||
|
ingress:
|
||||||
|
- from:
|
||||||
|
- namespaceSelector:
|
||||||
|
matchLabels:
|
||||||
|
kubernetes.io/metadata.name: kube-system
|
||||||
|
ports:
|
||||||
|
- protocol: TCP
|
||||||
|
port: 8000
|
||||||
|
---
|
||||||
|
# ── Risk Engine: accept from broker-adapter and query-api ──────────────
|
||||||
|
apiVersion: networking.k8s.io/v1
|
||||||
|
kind: NetworkPolicy
|
||||||
|
metadata:
|
||||||
|
name: allow-risk-engine-ingress
|
||||||
|
namespace: stonks-oracle
|
||||||
|
spec:
|
||||||
|
podSelector:
|
||||||
|
matchLabels:
|
||||||
|
app: risk-engine
|
||||||
|
policyTypes:
|
||||||
|
- Ingress
|
||||||
|
ingress:
|
||||||
|
- from:
|
||||||
|
- podSelector:
|
||||||
|
matchLabels:
|
||||||
|
app: broker-adapter
|
||||||
|
- podSelector:
|
||||||
|
matchLabels:
|
||||||
|
app: query-api
|
||||||
|
ports:
|
||||||
|
- protocol: TCP
|
||||||
|
port: 8000
|
||||||
|
---
|
||||||
|
# ── Superset: accept from kube-system (Traefik ingress) only ──────────
|
||||||
|
apiVersion: networking.k8s.io/v1
|
||||||
|
kind: NetworkPolicy
|
||||||
|
metadata:
|
||||||
|
name: allow-superset-ingress
|
||||||
|
namespace: stonks-oracle
|
||||||
|
spec:
|
||||||
|
podSelector:
|
||||||
|
matchLabels:
|
||||||
|
app: superset
|
||||||
|
policyTypes:
|
||||||
|
- Ingress
|
||||||
|
ingress:
|
||||||
|
- from:
|
||||||
|
- namespaceSelector:
|
||||||
|
matchLabels:
|
||||||
|
kubernetes.io/metadata.name: kube-system
|
||||||
|
ports:
|
||||||
|
- protocol: TCP
|
||||||
|
port: 8088
|
||||||
|
---
|
||||||
|
# ── Trino: accept from Superset, query-api, and kube-system ───────────
|
||||||
|
apiVersion: networking.k8s.io/v1
|
||||||
|
kind: NetworkPolicy
|
||||||
|
metadata:
|
||||||
|
name: allow-trino-ingress
|
||||||
|
namespace: stonks-oracle
|
||||||
|
spec:
|
||||||
|
podSelector:
|
||||||
|
matchLabels:
|
||||||
|
app: trino
|
||||||
|
policyTypes:
|
||||||
|
- Ingress
|
||||||
|
ingress:
|
||||||
|
- from:
|
||||||
|
- podSelector:
|
||||||
|
matchLabels:
|
||||||
|
app: superset
|
||||||
|
- podSelector:
|
||||||
|
matchLabels:
|
||||||
|
app: query-api
|
||||||
|
- namespaceSelector:
|
||||||
|
matchLabels:
|
||||||
|
kubernetes.io/metadata.name: kube-system
|
||||||
|
ports:
|
||||||
|
- protocol: TCP
|
||||||
|
port: 8080
|
||||||
|
---
|
||||||
|
# ── Hive Metastore: accept from Trino and lake-publisher ──────────────
|
||||||
|
apiVersion: networking.k8s.io/v1
|
||||||
|
kind: NetworkPolicy
|
||||||
|
metadata:
|
||||||
|
name: allow-hive-metastore-ingress
|
||||||
|
namespace: stonks-oracle
|
||||||
|
spec:
|
||||||
|
podSelector:
|
||||||
|
matchLabels:
|
||||||
|
app: hive-metastore
|
||||||
|
policyTypes:
|
||||||
|
- Ingress
|
||||||
|
ingress:
|
||||||
|
- from:
|
||||||
|
- podSelector:
|
||||||
|
matchLabels:
|
||||||
|
app: trino
|
||||||
|
- podSelector:
|
||||||
|
matchLabels:
|
||||||
|
app: lake-publisher
|
||||||
|
ports:
|
||||||
|
- protocol: TCP
|
||||||
|
port: 9083
|
||||||
|
---
|
||||||
|
# ── Broker adapter: isolated — no inbound from other pods ──────────────
|
||||||
|
# The broker-adapter only makes outbound calls to the broker API
|
||||||
|
# and reads from Redis queues. No pod needs to call into it.
|
||||||
|
apiVersion: networking.k8s.io/v1
|
||||||
|
kind: NetworkPolicy
|
||||||
|
metadata:
|
||||||
|
name: deny-broker-adapter-ingress
|
||||||
|
namespace: stonks-oracle
|
||||||
|
spec:
|
||||||
|
podSelector:
|
||||||
|
matchLabels:
|
||||||
|
app: broker-adapter
|
||||||
|
policyTypes:
|
||||||
|
- Ingress
|
||||||
|
ingress: []
|
||||||
@@ -6,6 +6,7 @@ metadata:
|
|||||||
labels:
|
labels:
|
||||||
app: parser-worker
|
app: parser-worker
|
||||||
app.kubernetes.io/part-of: stonks-oracle
|
app.kubernetes.io/part-of: stonks-oracle
|
||||||
|
stonks-oracle/tier: processing
|
||||||
spec:
|
spec:
|
||||||
replicas: 2
|
replicas: 2
|
||||||
selector:
|
selector:
|
||||||
@@ -15,16 +16,30 @@ spec:
|
|||||||
metadata:
|
metadata:
|
||||||
labels:
|
labels:
|
||||||
app: parser-worker
|
app: parser-worker
|
||||||
|
stonks-oracle/tier: processing
|
||||||
spec:
|
spec:
|
||||||
|
automountServiceAccountToken: false
|
||||||
|
securityContext:
|
||||||
|
runAsNonRoot: true
|
||||||
|
runAsUser: 1000
|
||||||
|
runAsGroup: 1000
|
||||||
|
fsGroup: 1000
|
||||||
|
seccompProfile:
|
||||||
|
type: RuntimeDefault
|
||||||
containers:
|
containers:
|
||||||
- name: parser-worker
|
- name: parser-worker
|
||||||
image: ghcr.io/celesrenata/stonks-oracle/parser:latest
|
image: ghcr.io/celesrenata/stonks-oracle/parser:latest
|
||||||
imagePullPolicy: Always
|
imagePullPolicy: Always
|
||||||
|
securityContext:
|
||||||
|
allowPrivilegeEscalation: false
|
||||||
|
readOnlyRootFilesystem: true
|
||||||
|
capabilities:
|
||||||
|
drop: ["ALL"]
|
||||||
envFrom:
|
envFrom:
|
||||||
- configMapRef:
|
- configMapRef:
|
||||||
name: stonks-config
|
name: stonks-config
|
||||||
- secretRef:
|
- secretRef:
|
||||||
name: stonks-secrets
|
name: stonks-core-secrets
|
||||||
resources:
|
resources:
|
||||||
requests:
|
requests:
|
||||||
cpu: 100m
|
cpu: 100m
|
||||||
@@ -32,3 +47,10 @@ spec:
|
|||||||
limits:
|
limits:
|
||||||
cpu: 500m
|
cpu: 500m
|
||||||
memory: 256Mi
|
memory: 256Mi
|
||||||
|
volumeMounts:
|
||||||
|
- name: tmp
|
||||||
|
mountPath: /tmp
|
||||||
|
volumes:
|
||||||
|
- name: tmp
|
||||||
|
emptyDir:
|
||||||
|
sizeLimit: 10Mi
|
||||||
|
|||||||
@@ -6,6 +6,7 @@ metadata:
|
|||||||
labels:
|
labels:
|
||||||
app: query-api
|
app: query-api
|
||||||
app.kubernetes.io/part-of: stonks-oracle
|
app.kubernetes.io/part-of: stonks-oracle
|
||||||
|
stonks-oracle/tier: api
|
||||||
spec:
|
spec:
|
||||||
replicas: 1
|
replicas: 1
|
||||||
selector:
|
selector:
|
||||||
@@ -15,18 +16,32 @@ spec:
|
|||||||
metadata:
|
metadata:
|
||||||
labels:
|
labels:
|
||||||
app: query-api
|
app: query-api
|
||||||
|
stonks-oracle/tier: api
|
||||||
spec:
|
spec:
|
||||||
|
automountServiceAccountToken: false
|
||||||
|
securityContext:
|
||||||
|
runAsNonRoot: true
|
||||||
|
runAsUser: 1000
|
||||||
|
runAsGroup: 1000
|
||||||
|
fsGroup: 1000
|
||||||
|
seccompProfile:
|
||||||
|
type: RuntimeDefault
|
||||||
containers:
|
containers:
|
||||||
- name: query-api
|
- name: query-api
|
||||||
image: ghcr.io/celesrenata/stonks-oracle/query-api:latest
|
image: ghcr.io/celesrenata/stonks-oracle/query-api:latest
|
||||||
imagePullPolicy: Always
|
imagePullPolicy: Always
|
||||||
ports:
|
ports:
|
||||||
- containerPort: 8000
|
- containerPort: 8000
|
||||||
|
securityContext:
|
||||||
|
allowPrivilegeEscalation: false
|
||||||
|
readOnlyRootFilesystem: true
|
||||||
|
capabilities:
|
||||||
|
drop: ["ALL"]
|
||||||
envFrom:
|
envFrom:
|
||||||
- configMapRef:
|
- configMapRef:
|
||||||
name: stonks-config
|
name: stonks-config
|
||||||
- secretRef:
|
- secretRef:
|
||||||
name: stonks-secrets
|
name: stonks-core-secrets
|
||||||
resources:
|
resources:
|
||||||
requests:
|
requests:
|
||||||
cpu: 100m
|
cpu: 100m
|
||||||
@@ -40,6 +55,13 @@ spec:
|
|||||||
port: 8000
|
port: 8000
|
||||||
initialDelaySeconds: 5
|
initialDelaySeconds: 5
|
||||||
periodSeconds: 10
|
periodSeconds: 10
|
||||||
|
volumeMounts:
|
||||||
|
- name: tmp
|
||||||
|
mountPath: /tmp
|
||||||
|
volumes:
|
||||||
|
- name: tmp
|
||||||
|
emptyDir:
|
||||||
|
sizeLimit: 10Mi
|
||||||
---
|
---
|
||||||
apiVersion: v1
|
apiVersion: v1
|
||||||
kind: Service
|
kind: Service
|
||||||
|
|||||||
@@ -6,6 +6,7 @@ metadata:
|
|||||||
labels:
|
labels:
|
||||||
app: recommendation-worker
|
app: recommendation-worker
|
||||||
app.kubernetes.io/part-of: stonks-oracle
|
app.kubernetes.io/part-of: stonks-oracle
|
||||||
|
stonks-oracle/tier: processing
|
||||||
spec:
|
spec:
|
||||||
replicas: 1
|
replicas: 1
|
||||||
selector:
|
selector:
|
||||||
@@ -15,16 +16,30 @@ spec:
|
|||||||
metadata:
|
metadata:
|
||||||
labels:
|
labels:
|
||||||
app: recommendation-worker
|
app: recommendation-worker
|
||||||
|
stonks-oracle/tier: processing
|
||||||
spec:
|
spec:
|
||||||
|
automountServiceAccountToken: false
|
||||||
|
securityContext:
|
||||||
|
runAsNonRoot: true
|
||||||
|
runAsUser: 1000
|
||||||
|
runAsGroup: 1000
|
||||||
|
fsGroup: 1000
|
||||||
|
seccompProfile:
|
||||||
|
type: RuntimeDefault
|
||||||
containers:
|
containers:
|
||||||
- name: recommendation-worker
|
- name: recommendation-worker
|
||||||
image: ghcr.io/celesrenata/stonks-oracle/recommendation:latest
|
image: ghcr.io/celesrenata/stonks-oracle/recommendation:latest
|
||||||
imagePullPolicy: Always
|
imagePullPolicy: Always
|
||||||
|
securityContext:
|
||||||
|
allowPrivilegeEscalation: false
|
||||||
|
readOnlyRootFilesystem: true
|
||||||
|
capabilities:
|
||||||
|
drop: ["ALL"]
|
||||||
envFrom:
|
envFrom:
|
||||||
- configMapRef:
|
- configMapRef:
|
||||||
name: stonks-config
|
name: stonks-config
|
||||||
- secretRef:
|
- secretRef:
|
||||||
name: stonks-secrets
|
name: stonks-core-secrets
|
||||||
resources:
|
resources:
|
||||||
requests:
|
requests:
|
||||||
cpu: 100m
|
cpu: 100m
|
||||||
@@ -32,3 +47,10 @@ spec:
|
|||||||
limits:
|
limits:
|
||||||
cpu: 500m
|
cpu: 500m
|
||||||
memory: 256Mi
|
memory: 256Mi
|
||||||
|
volumeMounts:
|
||||||
|
- name: tmp
|
||||||
|
mountPath: /tmp
|
||||||
|
volumes:
|
||||||
|
- name: tmp
|
||||||
|
emptyDir:
|
||||||
|
sizeLimit: 10Mi
|
||||||
|
|||||||
@@ -6,6 +6,7 @@ metadata:
|
|||||||
labels:
|
labels:
|
||||||
app: risk-engine
|
app: risk-engine
|
||||||
app.kubernetes.io/part-of: stonks-oracle
|
app.kubernetes.io/part-of: stonks-oracle
|
||||||
|
stonks-oracle/tier: trading
|
||||||
spec:
|
spec:
|
||||||
replicas: 1
|
replicas: 1
|
||||||
selector:
|
selector:
|
||||||
@@ -15,18 +16,34 @@ spec:
|
|||||||
metadata:
|
metadata:
|
||||||
labels:
|
labels:
|
||||||
app: risk-engine
|
app: risk-engine
|
||||||
|
stonks-oracle/tier: trading
|
||||||
spec:
|
spec:
|
||||||
|
automountServiceAccountToken: false
|
||||||
|
securityContext:
|
||||||
|
runAsNonRoot: true
|
||||||
|
runAsUser: 1000
|
||||||
|
runAsGroup: 1000
|
||||||
|
fsGroup: 1000
|
||||||
|
seccompProfile:
|
||||||
|
type: RuntimeDefault
|
||||||
containers:
|
containers:
|
||||||
- name: risk-engine
|
- name: risk-engine
|
||||||
image: ghcr.io/celesrenata/stonks-oracle/risk:latest
|
image: ghcr.io/celesrenata/stonks-oracle/risk:latest
|
||||||
imagePullPolicy: Always
|
imagePullPolicy: Always
|
||||||
ports:
|
ports:
|
||||||
- containerPort: 8000
|
- containerPort: 8000
|
||||||
|
securityContext:
|
||||||
|
allowPrivilegeEscalation: false
|
||||||
|
readOnlyRootFilesystem: true
|
||||||
|
capabilities:
|
||||||
|
drop: ["ALL"]
|
||||||
envFrom:
|
envFrom:
|
||||||
- configMapRef:
|
- configMapRef:
|
||||||
name: stonks-config
|
name: stonks-config
|
||||||
- secretRef:
|
- secretRef:
|
||||||
name: stonks-secrets
|
name: stonks-core-secrets
|
||||||
|
- secretRef:
|
||||||
|
name: stonks-broker-secrets
|
||||||
resources:
|
resources:
|
||||||
requests:
|
requests:
|
||||||
cpu: 100m
|
cpu: 100m
|
||||||
@@ -34,6 +51,13 @@ spec:
|
|||||||
limits:
|
limits:
|
||||||
cpu: 500m
|
cpu: 500m
|
||||||
memory: 256Mi
|
memory: 256Mi
|
||||||
|
volumeMounts:
|
||||||
|
- name: tmp
|
||||||
|
mountPath: /tmp
|
||||||
|
volumes:
|
||||||
|
- name: tmp
|
||||||
|
emptyDir:
|
||||||
|
sizeLimit: 10Mi
|
||||||
---
|
---
|
||||||
apiVersion: v1
|
apiVersion: v1
|
||||||
kind: Service
|
kind: Service
|
||||||
|
|||||||
@@ -6,6 +6,7 @@ metadata:
|
|||||||
labels:
|
labels:
|
||||||
app: scheduler
|
app: scheduler
|
||||||
app.kubernetes.io/part-of: stonks-oracle
|
app.kubernetes.io/part-of: stonks-oracle
|
||||||
|
stonks-oracle/tier: orchestration
|
||||||
spec:
|
spec:
|
||||||
replicas: 1
|
replicas: 1
|
||||||
selector:
|
selector:
|
||||||
@@ -15,16 +16,30 @@ spec:
|
|||||||
metadata:
|
metadata:
|
||||||
labels:
|
labels:
|
||||||
app: scheduler
|
app: scheduler
|
||||||
|
stonks-oracle/tier: orchestration
|
||||||
spec:
|
spec:
|
||||||
|
automountServiceAccountToken: false
|
||||||
|
securityContext:
|
||||||
|
runAsNonRoot: true
|
||||||
|
runAsUser: 1000
|
||||||
|
runAsGroup: 1000
|
||||||
|
fsGroup: 1000
|
||||||
|
seccompProfile:
|
||||||
|
type: RuntimeDefault
|
||||||
containers:
|
containers:
|
||||||
- name: scheduler
|
- name: scheduler
|
||||||
image: ghcr.io/celesrenata/stonks-oracle/scheduler:latest
|
image: ghcr.io/celesrenata/stonks-oracle/scheduler:latest
|
||||||
imagePullPolicy: Always
|
imagePullPolicy: Always
|
||||||
|
securityContext:
|
||||||
|
allowPrivilegeEscalation: false
|
||||||
|
readOnlyRootFilesystem: true
|
||||||
|
capabilities:
|
||||||
|
drop: ["ALL"]
|
||||||
envFrom:
|
envFrom:
|
||||||
- configMapRef:
|
- configMapRef:
|
||||||
name: stonks-config
|
name: stonks-config
|
||||||
- secretRef:
|
- secretRef:
|
||||||
name: stonks-secrets
|
name: stonks-core-secrets
|
||||||
resources:
|
resources:
|
||||||
requests:
|
requests:
|
||||||
cpu: 50m
|
cpu: 50m
|
||||||
@@ -32,3 +47,10 @@ spec:
|
|||||||
limits:
|
limits:
|
||||||
cpu: 200m
|
cpu: 200m
|
||||||
memory: 128Mi
|
memory: 128Mi
|
||||||
|
volumeMounts:
|
||||||
|
- name: tmp
|
||||||
|
mountPath: /tmp
|
||||||
|
volumes:
|
||||||
|
- name: tmp
|
||||||
|
emptyDir:
|
||||||
|
sizeLimit: 10Mi
|
||||||
|
|||||||
+54
-8
@@ -1,17 +1,63 @@
|
|||||||
|
##
|
||||||
|
## Stonks Oracle — Scoped Secrets
|
||||||
|
##
|
||||||
|
## Secrets are split by concern so that only the services that need
|
||||||
|
## broker or market-data credentials actually receive them.
|
||||||
|
## Replace placeholder values before deploying.
|
||||||
|
##
|
||||||
|
## Requirements: 8.2 (broker credential isolation)
|
||||||
|
##
|
||||||
|
|
||||||
|
# ── Core infrastructure secrets (DB, object store, cache) ──────────────
|
||||||
apiVersion: v1
|
apiVersion: v1
|
||||||
kind: Secret
|
kind: Secret
|
||||||
metadata:
|
metadata:
|
||||||
name: stonks-secrets
|
name: stonks-core-secrets
|
||||||
namespace: stonks-oracle
|
namespace: stonks-oracle
|
||||||
labels:
|
labels:
|
||||||
app.kubernetes.io/part-of: stonks-oracle
|
app.kubernetes.io/part-of: stonks-oracle
|
||||||
type: Opaque
|
type: Opaque
|
||||||
stringData:
|
stringData:
|
||||||
POSTGRES_PASSWORD: "changeme"
|
POSTGRES_PASSWORD: "REPLACE_ME"
|
||||||
MINIO_ACCESS_KEY: "changeme"
|
MINIO_ACCESS_KEY: "REPLACE_ME"
|
||||||
MINIO_SECRET_KEY: "changeme"
|
MINIO_SECRET_KEY: "REPLACE_ME"
|
||||||
REDIS_PASSWORD: ""
|
REDIS_PASSWORD: ""
|
||||||
BROKER_API_KEY: ""
|
---
|
||||||
BROKER_API_SECRET: ""
|
# ── Broker secrets — only for broker-adapter and risk-engine ───────────
|
||||||
BROKER_BASE_URL: ""
|
apiVersion: v1
|
||||||
SUPERSET_SECRET_KEY: "stonks-superset-secret-change-me"
|
kind: Secret
|
||||||
|
metadata:
|
||||||
|
name: stonks-broker-secrets
|
||||||
|
namespace: stonks-oracle
|
||||||
|
labels:
|
||||||
|
app.kubernetes.io/part-of: stonks-oracle
|
||||||
|
type: Opaque
|
||||||
|
stringData:
|
||||||
|
BROKER_API_KEY: "REPLACE_ME"
|
||||||
|
BROKER_API_SECRET: "REPLACE_ME"
|
||||||
|
BROKER_BASE_URL: "https://paper-api.alpaca.markets"
|
||||||
|
---
|
||||||
|
# ── Market data secrets — only for ingestion and adapters ──────────────
|
||||||
|
apiVersion: v1
|
||||||
|
kind: Secret
|
||||||
|
metadata:
|
||||||
|
name: stonks-market-secrets
|
||||||
|
namespace: stonks-oracle
|
||||||
|
labels:
|
||||||
|
app.kubernetes.io/part-of: stonks-oracle
|
||||||
|
type: Opaque
|
||||||
|
stringData:
|
||||||
|
MARKET_DATA_API_KEY: "REPLACE_ME"
|
||||||
|
---
|
||||||
|
# ── Dashboard secrets — only for Superset ──────────────────────────────
|
||||||
|
apiVersion: v1
|
||||||
|
kind: Secret
|
||||||
|
metadata:
|
||||||
|
name: stonks-dashboard-secrets
|
||||||
|
namespace: stonks-oracle
|
||||||
|
labels:
|
||||||
|
app.kubernetes.io/part-of: stonks-oracle
|
||||||
|
type: Opaque
|
||||||
|
stringData:
|
||||||
|
SUPERSET_SECRET_KEY: "REPLACE_ME"
|
||||||
|
SUPERSET_ADMIN_PASSWORD: "REPLACE_ME"
|
||||||
|
|||||||
+47
-3
@@ -6,6 +6,7 @@ metadata:
|
|||||||
labels:
|
labels:
|
||||||
app: superset
|
app: superset
|
||||||
app.kubernetes.io/part-of: stonks-oracle
|
app.kubernetes.io/part-of: stonks-oracle
|
||||||
|
stonks-oracle/tier: dashboard
|
||||||
spec:
|
spec:
|
||||||
replicas: 1
|
replicas: 1
|
||||||
selector:
|
selector:
|
||||||
@@ -15,22 +16,38 @@ spec:
|
|||||||
metadata:
|
metadata:
|
||||||
labels:
|
labels:
|
||||||
app: superset
|
app: superset
|
||||||
|
stonks-oracle/tier: dashboard
|
||||||
spec:
|
spec:
|
||||||
|
automountServiceAccountToken: false
|
||||||
|
securityContext:
|
||||||
|
runAsNonRoot: true
|
||||||
|
runAsUser: 1000
|
||||||
|
runAsGroup: 1000
|
||||||
|
fsGroup: 1000
|
||||||
|
seccompProfile:
|
||||||
|
type: RuntimeDefault
|
||||||
containers:
|
containers:
|
||||||
- name: superset
|
- name: superset
|
||||||
image: apache/superset:latest
|
image: apache/superset:latest
|
||||||
ports:
|
ports:
|
||||||
- containerPort: 8088
|
- containerPort: 8088
|
||||||
|
securityContext:
|
||||||
|
allowPrivilegeEscalation: false
|
||||||
|
capabilities:
|
||||||
|
drop: ["ALL"]
|
||||||
env:
|
env:
|
||||||
- name: SUPERSET_SECRET_KEY
|
- name: SUPERSET_SECRET_KEY
|
||||||
valueFrom:
|
valueFrom:
|
||||||
secretKeyRef:
|
secretKeyRef:
|
||||||
name: stonks-secrets
|
name: stonks-dashboard-secrets
|
||||||
key: SUPERSET_SECRET_KEY
|
key: SUPERSET_SECRET_KEY
|
||||||
- name: ADMIN_USERNAME
|
- name: ADMIN_USERNAME
|
||||||
value: admin
|
value: admin
|
||||||
- name: ADMIN_PASSWORD
|
- name: ADMIN_PASSWORD
|
||||||
value: admin
|
valueFrom:
|
||||||
|
secretKeyRef:
|
||||||
|
name: stonks-dashboard-secrets
|
||||||
|
key: SUPERSET_ADMIN_PASSWORD
|
||||||
- name: ADMIN_EMAIL
|
- name: ADMIN_EMAIL
|
||||||
value: admin@stonks.local
|
value: admin@stonks.local
|
||||||
volumeMounts:
|
volumeMounts:
|
||||||
@@ -94,12 +111,39 @@ data:
|
|||||||
import os
|
import os
|
||||||
SECRET_KEY = os.getenv("SUPERSET_SECRET_KEY", "stonks-dev-secret-key-change-me")
|
SECRET_KEY = os.getenv("SUPERSET_SECRET_KEY", "stonks-dev-secret-key-change-me")
|
||||||
SQLALCHEMY_DATABASE_URI = "trino://trino@trino.stonks-oracle.svc.cluster.local:8080/lakehouse/stonks"
|
SQLALCHEMY_DATABASE_URI = "trino://trino@trino.stonks-oracle.svc.cluster.local:8080/lakehouse/stonks"
|
||||||
|
# Additional database connections available in Superset UI:
|
||||||
|
# Hive catalog: trino://trino@trino.stonks-oracle.svc.cluster.local:8080/lakehouse/stonks
|
||||||
|
# Iceberg catalog: trino://trino@trino.stonks-oracle.svc.cluster.local:8080/iceberg/stonks
|
||||||
FEATURE_FLAGS = {"ENABLE_TEMPLATE_PROCESSING": True}
|
FEATURE_FLAGS = {"ENABLE_TEMPLATE_PROCESSING": True}
|
||||||
CACHE_CONFIG = {
|
CACHE_CONFIG = {
|
||||||
"CACHE_TYPE": "RedisCache",
|
"CACHE_TYPE": "RedisCache",
|
||||||
"CACHE_DEFAULT_TIMEOUT": 300,
|
"CACHE_DEFAULT_TIMEOUT": 300,
|
||||||
"CACHE_KEY_PREFIX": "superset_",
|
"CACHE_KEY_PREFIX": "superset_",
|
||||||
"CACHE_REDIS_HOST": os.getenv("REDIS_HOST", "redis.redis-service.svc.cluster.local"),
|
"CACHE_REDIS_HOST": os.getenv("REDIS_HOST", "redis-master.redis-service.svc.cluster.local"),
|
||||||
"CACHE_REDIS_PORT": int(os.getenv("REDIS_PORT", "6379")),
|
"CACHE_REDIS_PORT": int(os.getenv("REDIS_PORT", "6379")),
|
||||||
"CACHE_REDIS_DB": 1,
|
"CACHE_REDIS_DB": 1,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
# --- Security hardening ---
|
||||||
|
# Disable public user role (require login)
|
||||||
|
PUBLIC_ROLE_LIKE = None
|
||||||
|
# Session cookie security
|
||||||
|
SESSION_COOKIE_HTTPONLY = True
|
||||||
|
SESSION_COOKIE_SECURE = True
|
||||||
|
SESSION_COOKIE_SAMESITE = "Lax"
|
||||||
|
# Talisman CSP headers
|
||||||
|
TALISMAN_ENABLED = True
|
||||||
|
TALISMAN_CONFIG = {
|
||||||
|
"content_security_policy": {
|
||||||
|
"default-src": ["'self'"],
|
||||||
|
"img-src": ["'self'", "data:"],
|
||||||
|
"style-src": ["'self'", "'unsafe-inline'"],
|
||||||
|
"script-src": ["'self'", "'unsafe-inline'", "'unsafe-eval'"],
|
||||||
|
},
|
||||||
|
"force_https": False, # TLS terminated at ingress
|
||||||
|
}
|
||||||
|
# Block database connection types Superset treats as unsafe (e.g. local SQLite files); other connections are still allowed
|
||||||
|
PREVENT_UNSAFE_DB_CONNECTIONS = True
|
||||||
|
# Row limit for queries
|
||||||
|
ROW_LIMIT = 50000
|
||||||
|
SQL_MAX_ROW = 100000
|
||||||
|
|||||||
@@ -6,6 +6,7 @@ metadata:
|
|||||||
labels:
|
labels:
|
||||||
app: symbol-registry-api
|
app: symbol-registry-api
|
||||||
app.kubernetes.io/part-of: stonks-oracle
|
app.kubernetes.io/part-of: stonks-oracle
|
||||||
|
stonks-oracle/tier: api
|
||||||
spec:
|
spec:
|
||||||
replicas: 1
|
replicas: 1
|
||||||
selector:
|
selector:
|
||||||
@@ -15,18 +16,32 @@ spec:
|
|||||||
metadata:
|
metadata:
|
||||||
labels:
|
labels:
|
||||||
app: symbol-registry-api
|
app: symbol-registry-api
|
||||||
|
stonks-oracle/tier: api
|
||||||
spec:
|
spec:
|
||||||
|
automountServiceAccountToken: false
|
||||||
|
securityContext:
|
||||||
|
runAsNonRoot: true
|
||||||
|
runAsUser: 1000
|
||||||
|
runAsGroup: 1000
|
||||||
|
fsGroup: 1000
|
||||||
|
seccompProfile:
|
||||||
|
type: RuntimeDefault
|
||||||
containers:
|
containers:
|
||||||
- name: symbol-registry-api
|
- name: symbol-registry-api
|
||||||
image: ghcr.io/celesrenata/stonks-oracle/symbol-registry:latest
|
image: ghcr.io/celesrenata/stonks-oracle/symbol-registry:latest
|
||||||
imagePullPolicy: Always
|
imagePullPolicy: Always
|
||||||
ports:
|
ports:
|
||||||
- containerPort: 8000
|
- containerPort: 8000
|
||||||
|
securityContext:
|
||||||
|
allowPrivilegeEscalation: false
|
||||||
|
readOnlyRootFilesystem: true
|
||||||
|
capabilities:
|
||||||
|
drop: ["ALL"]
|
||||||
envFrom:
|
envFrom:
|
||||||
- configMapRef:
|
- configMapRef:
|
||||||
name: stonks-config
|
name: stonks-config
|
||||||
- secretRef:
|
- secretRef:
|
||||||
name: stonks-secrets
|
name: stonks-core-secrets
|
||||||
resources:
|
resources:
|
||||||
requests:
|
requests:
|
||||||
cpu: 100m
|
cpu: 100m
|
||||||
@@ -46,6 +61,13 @@ spec:
|
|||||||
port: 8000
|
port: 8000
|
||||||
initialDelaySeconds: 10
|
initialDelaySeconds: 10
|
||||||
periodSeconds: 30
|
periodSeconds: 30
|
||||||
|
volumeMounts:
|
||||||
|
- name: tmp
|
||||||
|
mountPath: /tmp
|
||||||
|
volumes:
|
||||||
|
- name: tmp
|
||||||
|
emptyDir:
|
||||||
|
sizeLimit: 10Mi
|
||||||
---
|
---
|
||||||
apiVersion: v1
|
apiVersion: v1
|
||||||
kind: Service
|
kind: Service
|
||||||
|
|||||||
+63
-26
@@ -6,6 +6,7 @@ metadata:
|
|||||||
labels:
|
labels:
|
||||||
app: trino
|
app: trino
|
||||||
app.kubernetes.io/part-of: stonks-oracle
|
app.kubernetes.io/part-of: stonks-oracle
|
||||||
|
stonks-oracle/tier: analytics
|
||||||
spec:
|
spec:
|
||||||
replicas: 1
|
replicas: 1
|
||||||
selector:
|
selector:
|
||||||
@@ -15,12 +16,73 @@ spec:
|
|||||||
metadata:
|
metadata:
|
||||||
labels:
|
labels:
|
||||||
app: trino
|
app: trino
|
||||||
|
stonks-oracle/tier: analytics
|
||||||
spec:
|
spec:
|
||||||
|
automountServiceAccountToken: false
|
||||||
|
securityContext:
|
||||||
|
runAsNonRoot: true
|
||||||
|
runAsUser: 1000
|
||||||
|
runAsGroup: 1000
|
||||||
|
fsGroup: 1000
|
||||||
|
seccompProfile:
|
||||||
|
type: RuntimeDefault
|
||||||
|
initContainers:
|
||||||
|
- name: catalog-init
|
||||||
|
image: busybox:1.36
|
||||||
|
securityContext:
|
||||||
|
allowPrivilegeEscalation: false
|
||||||
|
capabilities:
|
||||||
|
drop: ["ALL"]
|
||||||
|
command: ["sh", "-c"]
|
||||||
|
args:
|
||||||
|
- |
|
||||||
|
cat > /catalog/iceberg.properties <<EOF
|
||||||
|
connector.name=iceberg
|
||||||
|
iceberg.catalog.type=hive_metastore
|
||||||
|
hive.metastore.uri=thrift://hive-metastore.stonks-oracle.svc.cluster.local:9083
|
||||||
|
hive.s3.endpoint=http://minio.minio-service.svc.cluster.local:80
|
||||||
|
hive.s3.path-style-access=true
|
||||||
|
hive.s3.aws-access-key=${MINIO_ACCESS_KEY}
|
||||||
|
hive.s3.aws-secret-key=${MINIO_SECRET_KEY}
|
||||||
|
fs.native-s3.enabled=true
|
||||||
|
s3.endpoint=http://minio.minio-service.svc.cluster.local:80
|
||||||
|
s3.path-style-access=true
|
||||||
|
s3.aws-access-key=${MINIO_ACCESS_KEY}
|
||||||
|
s3.aws-secret-key=${MINIO_SECRET_KEY}
|
||||||
|
EOF
|
||||||
|
cat > /catalog/lakehouse.properties <<EOF
|
||||||
|
connector.name=hive
|
||||||
|
hive.metastore.uri=thrift://hive-metastore.stonks-oracle.svc.cluster.local:9083
|
||||||
|
hive.s3.endpoint=http://minio.minio-service.svc.cluster.local:80
|
||||||
|
hive.s3.path-style-access=true
|
||||||
|
hive.s3.aws-access-key=${MINIO_ACCESS_KEY}
|
||||||
|
hive.s3.aws-secret-key=${MINIO_SECRET_KEY}
|
||||||
|
hive.non-managed-table-writes-enabled=true
|
||||||
|
hive.s3select-pushdown.enabled=true
|
||||||
|
EOF
|
||||||
|
env:
|
||||||
|
- name: MINIO_ACCESS_KEY
|
||||||
|
valueFrom:
|
||||||
|
secretKeyRef:
|
||||||
|
name: stonks-core-secrets
|
||||||
|
key: MINIO_ACCESS_KEY
|
||||||
|
- name: MINIO_SECRET_KEY
|
||||||
|
valueFrom:
|
||||||
|
secretKeyRef:
|
||||||
|
name: stonks-core-secrets
|
||||||
|
key: MINIO_SECRET_KEY
|
||||||
|
volumeMounts:
|
||||||
|
- name: catalog-config
|
||||||
|
mountPath: /catalog
|
||||||
containers:
|
containers:
|
||||||
- name: trino
|
- name: trino
|
||||||
image: trinodb/trino:latest
|
image: trinodb/trino:latest
|
||||||
ports:
|
ports:
|
||||||
- containerPort: 8080
|
- containerPort: 8080
|
||||||
|
securityContext:
|
||||||
|
allowPrivilegeEscalation: false
|
||||||
|
capabilities:
|
||||||
|
drop: ["ALL"]
|
||||||
volumeMounts:
|
volumeMounts:
|
||||||
- name: catalog-config
|
- name: catalog-config
|
||||||
mountPath: /etc/trino/catalog
|
mountPath: /etc/trino/catalog
|
||||||
@@ -39,8 +101,7 @@ spec:
|
|||||||
periodSeconds: 10
|
periodSeconds: 10
|
||||||
volumes:
|
volumes:
|
||||||
- name: catalog-config
|
- name: catalog-config
|
||||||
configMap:
|
emptyDir: {}
|
||||||
name: trino-catalog
|
|
||||||
---
|
---
|
||||||
apiVersion: v1
|
apiVersion: v1
|
||||||
kind: Service
|
kind: Service
|
||||||
@@ -53,27 +114,3 @@ spec:
|
|||||||
ports:
|
ports:
|
||||||
- port: 8080
|
- port: 8080
|
||||||
targetPort: 8080
|
targetPort: 8080
|
||||||
---
|
|
||||||
apiVersion: v1
|
|
||||||
kind: ConfigMap
|
|
||||||
metadata:
|
|
||||||
name: trino-catalog
|
|
||||||
namespace: stonks-oracle
|
|
||||||
data:
|
|
||||||
iceberg.properties: |
|
|
||||||
connector.name=iceberg
|
|
||||||
iceberg.catalog.type=hive_metastore
|
|
||||||
hive.metastore.uri=thrift://hive-metastore.stonks-oracle.svc.cluster.local:9083
|
|
||||||
hive.s3.endpoint=http://minio.minio-service.svc.cluster.local:80
|
|
||||||
hive.s3.path-style-access=true
|
|
||||||
hive.s3.aws-access-key=changeme
|
|
||||||
hive.s3.aws-secret-key=changeme
|
|
||||||
lakehouse.properties: |
|
|
||||||
connector.name=hive
|
|
||||||
hive.metastore.uri=thrift://hive-metastore.stonks-oracle.svc.cluster.local:9083
|
|
||||||
hive.s3.endpoint=http://minio.minio-service.svc.cluster.local:80
|
|
||||||
hive.s3.path-style-access=true
|
|
||||||
hive.s3.aws-access-key=changeme
|
|
||||||
hive.s3.aws-secret-key=changeme
|
|
||||||
hive.non-managed-table-writes-enabled=true
|
|
||||||
hive.s3select-pushdown.enabled=true
|
|
||||||
|
|||||||
@@ -0,0 +1,13 @@
|
|||||||
|
-- Stonks Oracle - Dedupe support indexes
|
||||||
|
|
||||||
|
-- Partial index on canonical_url for cross-source deduplication lookups.
|
||||||
|
-- The dedupe module queries documents by canonical_url to detect
|
||||||
|
-- the same article ingested from different source types. Rows with a NULL URL are not indexed; IF NOT EXISTS keeps the migration re-runnable.
|
||||||
|
CREATE INDEX IF NOT EXISTS idx_documents_canonical_url ON documents(canonical_url)
|
||||||
|
WHERE canonical_url IS NOT NULL;
|
||||||
|
|
||||||
|
-- Unique index on document_company_mentions to prevent duplicate
|
||||||
|
-- company links when cross-source dedupe links an existing document
|
||||||
|
-- to an additional company. IF NOT EXISTS keeps the migration re-runnable.
|
||||||
|
CREATE UNIQUE INDEX IF NOT EXISTS idx_doc_mentions_unique
|
||||||
|
ON document_company_mentions(document_id, company_id);
|
||||||
@@ -0,0 +1,5 @@
|
|||||||
|
-- Stonks Oracle - Add parser_output_ref to documents table
|
||||||
|
-- Stores the MinIO reference to the structured parser output JSON
|
||||||
|
-- (metadata, quality signals, warnings, outbound links, tags, etc.)
|
||||||
|
|
||||||
|
ALTER TABLE documents ADD COLUMN IF NOT EXISTS parser_output_ref VARCHAR(1000);
|
||||||
@@ -0,0 +1,40 @@
|
|||||||
|
-- Stonks Oracle - Model Performance Metrics
|
||||||
|
-- Tracks extraction success/failure rates, latency, retries, confidence,
|
||||||
|
-- token usage estimates, and validation error distributions.
|
||||||
|
-- Requirements: 5.2, 5.4, 12.1, 12.2
|
||||||
|
|
||||||
|
CREATE TABLE model_performance_metrics (
|
||||||
|
id UUID PRIMARY KEY DEFAULT uuid_generate_v4(),
|
||||||
|
document_id UUID REFERENCES documents(id) ON DELETE SET NULL,
|
||||||
|
ticker VARCHAR(20),
|
||||||
|
model_name VARCHAR(200) NOT NULL,
|
||||||
|
prompt_version VARCHAR(100),
|
||||||
|
schema_version VARCHAR(50),
|
||||||
|
success BOOLEAN NOT NULL,
|
||||||
|
attempt_count INTEGER NOT NULL DEFAULT 1,
|
||||||
|
total_duration_ms INTEGER NOT NULL DEFAULT 0,
|
||||||
|
first_attempt_duration_ms INTEGER DEFAULT 0,
|
||||||
|
final_attempt_duration_ms INTEGER DEFAULT 0,
|
||||||
|
confidence FLOAT DEFAULT 0.0,
|
||||||
|
validation_status VARCHAR(50) NOT NULL DEFAULT 'unknown',
|
||||||
|
validation_error_count INTEGER DEFAULT 0,
|
||||||
|
validation_warning_count INTEGER DEFAULT 0,
|
||||||
|
validation_errors JSONB DEFAULT '[]',
|
||||||
|
retry_count INTEGER DEFAULT 0,
|
||||||
|
input_token_estimate INTEGER DEFAULT 0,
|
||||||
|
output_token_estimate INTEGER DEFAULT 0,
|
||||||
|
company_count INTEGER DEFAULT 0,
|
||||||
|
recorded_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
|
||||||
|
);
|
||||||
|
|
||||||
|
-- Query by time range (dashboard primary access pattern)
|
||||||
|
CREATE INDEX idx_model_perf_recorded ON model_performance_metrics(recorded_at DESC);
|
||||||
|
|
||||||
|
-- Filter by model for per-model dashboards
|
||||||
|
CREATE INDEX idx_model_perf_model ON model_performance_metrics(model_name, recorded_at DESC);
|
||||||
|
|
||||||
|
-- Filter by success for failure analysis
|
||||||
|
CREATE INDEX idx_model_perf_success ON model_performance_metrics(success, recorded_at DESC);
|
||||||
|
|
||||||
|
-- Filter by validation status for schema failure dashboards
|
||||||
|
CREATE INDEX idx_model_perf_validation ON model_performance_metrics(validation_status);
|
||||||
@@ -0,0 +1,8 @@
|
|||||||
|
-- Stonks Oracle - Add disagreement details to trend windows
|
||||||
|
-- Stores structured contradiction/disagreement representations
|
||||||
|
-- so downstream consumers can inspect *why* signals conflict
|
||||||
|
-- rather than relying on a single scalar contradiction_score.
|
||||||
|
-- Requirements: 6.4
|
||||||
|
|
||||||
|
ALTER TABLE trend_windows
|
||||||
|
ADD COLUMN IF NOT EXISTS disagreement_details JSONB DEFAULT '[]';
|
||||||
@@ -0,0 +1,23 @@
|
|||||||
|
-- Stonks Oracle - Trend evidence mappings
|
||||||
|
-- Links trend_windows to the documents that contributed as evidence,
|
||||||
|
-- storing the evidence type (supporting/opposing), rank score, and
|
||||||
|
-- weight breakdown for explainability and drill-down queries.
|
||||||
|
-- Requirements: 6.5, 10.4
|
||||||
|
|
||||||
|
CREATE TABLE trend_evidence (
|
||||||
|
id UUID PRIMARY KEY DEFAULT uuid_generate_v4(),
|
||||||
|
trend_window_id UUID NOT NULL REFERENCES trend_windows(id) ON DELETE CASCADE,
|
||||||
|
document_id UUID NOT NULL,
|
||||||
|
evidence_type VARCHAR(20) NOT NULL DEFAULT 'supporting', -- supporting | opposing
|
||||||
|
rank_score FLOAT DEFAULT 0.0,
|
||||||
|
weight_component FLOAT DEFAULT 0.0,
|
||||||
|
impact_component FLOAT DEFAULT 0.0,
|
||||||
|
recency_component FLOAT DEFAULT 0.0,
|
||||||
|
confidence_component FLOAT DEFAULT 0.0,
|
||||||
|
sentiment_value FLOAT DEFAULT 0.0,
|
||||||
|
created_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
|
||||||
|
);
|
||||||
|
|
||||||
|
CREATE INDEX idx_trend_evidence_trend ON trend_evidence(trend_window_id);
|
||||||
|
CREATE INDEX idx_trend_evidence_doc ON trend_evidence(document_id);
|
||||||
|
CREATE INDEX idx_trend_evidence_type ON trend_evidence(trend_window_id, evidence_type);
|
||||||
@@ -0,0 +1,15 @@
|
|||||||
|
-- Stonks Oracle - Recommendation persistence enhancements
|
||||||
|
-- Adds full model metadata columns to recommendations table
|
||||||
|
-- and a risk_classification column for the computed risk label.
|
||||||
|
-- Requirements: 7.1, 7.2, 8.3
|
||||||
|
|
||||||
|
-- Store full model provenance on the recommendation itself
|
||||||
|
ALTER TABLE recommendations
|
||||||
|
ADD COLUMN IF NOT EXISTS model_provider VARCHAR(100) DEFAULT 'deterministic',
|
||||||
|
ADD COLUMN IF NOT EXISTS prompt_version VARCHAR(100) DEFAULT '',
|
||||||
|
ADD COLUMN IF NOT EXISTS schema_version VARCHAR(50) DEFAULT '1.0.0',
|
||||||
|
ADD COLUMN IF NOT EXISTS risk_classification VARCHAR(20) DEFAULT 'moderate';
|
||||||
|
|
||||||
|
-- Index for querying recommendations by risk classification
|
||||||
|
CREATE INDEX IF NOT EXISTS idx_recommendations_risk
|
||||||
|
ON recommendations(risk_classification);
|
||||||
@@ -0,0 +1,55 @@
|
|||||||
|
-- Stonks Oracle - Portfolio and account risk configuration
|
||||||
|
-- Persists risk configuration profiles and tracks risk state snapshots.
|
||||||
|
-- Requirements: 8.1, 8.2, 8.4
|
||||||
|
|
||||||
|
-- ============================================================
|
||||||
|
-- Risk Configuration Profiles
|
||||||
|
-- ============================================================
|
||||||
|
|
||||||
|
CREATE TABLE risk_configs (
|
||||||
|
id UUID PRIMARY KEY DEFAULT uuid_generate_v4(),
|
||||||
|
name VARCHAR(200) NOT NULL DEFAULT 'default',
|
||||||
|
trading_mode VARCHAR(20) NOT NULL DEFAULT 'paper',
|
||||||
|
config JSONB NOT NULL DEFAULT '{}',
|
||||||
|
active BOOLEAN NOT NULL DEFAULT TRUE,
|
||||||
|
created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
|
||||||
|
updated_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
|
||||||
|
);
|
||||||
|
|
||||||
|
CREATE UNIQUE INDEX idx_risk_configs_active_name
|
||||||
|
ON risk_configs(name) WHERE active = TRUE;
|
||||||
|
|
||||||
|
-- ============================================================
|
||||||
|
-- Symbol-level lockouts (news-shock, cooldown)
|
||||||
|
-- ============================================================
|
||||||
|
|
||||||
|
CREATE TABLE symbol_lockouts (
|
||||||
|
id UUID PRIMARY KEY DEFAULT uuid_generate_v4(),
|
||||||
|
ticker VARCHAR(20) NOT NULL,
|
||||||
|
lockout_type VARCHAR(50) NOT NULL,
|
||||||
|
reason TEXT DEFAULT '',
|
||||||
|
expires_at TIMESTAMPTZ NOT NULL,
|
||||||
|
created_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
|
||||||
|
);
|
||||||
|
|
||||||
|
CREATE INDEX idx_symbol_lockouts_ticker ON symbol_lockouts(ticker, expires_at);
|
||||||
|
CREATE INDEX idx_symbol_lockouts_expiry ON symbol_lockouts(expires_at);
|
||||||
|
|
||||||
|
-- ============================================================
|
||||||
|
-- Daily risk snapshots (for daily loss tracking)
|
||||||
|
-- ============================================================
|
||||||
|
|
||||||
|
CREATE TABLE daily_risk_snapshots (
|
||||||
|
id UUID PRIMARY KEY DEFAULT uuid_generate_v4(),
|
||||||
|
account_id VARCHAR(200) NOT NULL,
|
||||||
|
snapshot_date DATE NOT NULL DEFAULT CURRENT_DATE,
|
||||||
|
portfolio_value NUMERIC DEFAULT 0,
|
||||||
|
daily_pnl NUMERIC DEFAULT 0,
|
||||||
|
daily_trade_count INTEGER DEFAULT 0,
|
||||||
|
positions_by_sector JSONB DEFAULT '{}',
|
||||||
|
created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
|
||||||
|
updated_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
|
||||||
|
UNIQUE(account_id, snapshot_date)
|
||||||
|
);
|
||||||
|
|
||||||
|
CREATE INDEX idx_daily_risk_account ON daily_risk_snapshots(account_id, snapshot_date DESC); -- NOTE(review): the table's UNIQUE(account_id, snapshot_date) already yields an index on these columns; confirm this extra index is needed
|
||||||
@@ -0,0 +1,7 @@
|
|||||||
|
-- Stonks Oracle - Add unique constraint for paper trading position upserts
|
||||||
|
-- Requirements: 8.1, 8.3
|
||||||
|
|
||||||
|
-- The paper trading adapter needs to upsert positions by (broker_account_id, ticker).
|
||||||
|
-- Add a unique constraint to support ON CONFLICT.
|
||||||
|
CREATE UNIQUE INDEX IF NOT EXISTS idx_positions_account_ticker
|
||||||
|
ON positions(broker_account_id, ticker);
|
||||||
@@ -0,0 +1,17 @@
|
|||||||
|
-- Stonks Oracle - Execution audit trail indexes
|
||||||
|
-- Supports efficient querying of the full decision chain from
|
||||||
|
-- recommendation through risk evaluation to broker execution.
|
||||||
|
-- Requirements: 8.3, 11.3
|
||||||
|
|
||||||
|
-- GIN index on audit_events.data for JSONB key lookups
|
||||||
|
-- (e.g. data->>'recommendation_id', data->>'order_id')
|
||||||
|
CREATE INDEX IF NOT EXISTS idx_audit_events_data_gin
|
||||||
|
ON audit_events USING gin (data);
|
||||||
|
|
||||||
|
-- Index for chronological audit trail queries by entity
|
||||||
|
CREATE INDEX IF NOT EXISTS idx_audit_events_entity_created
|
||||||
|
ON audit_events (entity_id, created_at ASC);
|
||||||
|
|
||||||
|
-- Index for filtering by event_type + entity_type
|
||||||
|
CREATE INDEX IF NOT EXISTS idx_audit_events_type_entity
|
||||||
|
ON audit_events (event_type, entity_type);
|
||||||
@@ -0,0 +1,29 @@
|
|||||||
|
-- Stonks Oracle - Operator approval workflow for live trading mode
|
||||||
|
-- Tracks pending, approved, rejected, and expired approval requests
|
||||||
|
-- for orders that require operator sign-off before broker submission.
|
||||||
|
-- Requirements: 8.2
|
||||||
|
|
||||||
|
CREATE TABLE operator_approvals (
|
||||||
|
id UUID PRIMARY KEY DEFAULT uuid_generate_v4(),
|
||||||
|
order_job JSONB NOT NULL DEFAULT '{}',
|
||||||
|
recommendation_id UUID REFERENCES recommendations(id),
|
||||||
|
ticker VARCHAR(20) NOT NULL,
|
||||||
|
side VARCHAR(10) NOT NULL DEFAULT 'buy',
|
||||||
|
quantity NUMERIC NOT NULL DEFAULT 0,
|
||||||
|
estimated_value NUMERIC NOT NULL DEFAULT 0,
|
||||||
|
status VARCHAR(20) NOT NULL DEFAULT 'pending',
|
||||||
|
risk_evaluation_id UUID,
|
||||||
|
requested_by VARCHAR(200) NOT NULL DEFAULT 'system',
|
||||||
|
reviewed_by VARCHAR(200),
|
||||||
|
review_note TEXT,
|
||||||
|
expires_at TIMESTAMPTZ NOT NULL,
|
||||||
|
requested_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
|
||||||
|
reviewed_at TIMESTAMPTZ,
|
||||||
|
created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
|
||||||
|
updated_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
|
||||||
|
);
|
||||||
|
|
||||||
|
CREATE INDEX idx_operator_approvals_status ON operator_approvals(status);
|
||||||
|
CREATE INDEX idx_operator_approvals_ticker ON operator_approvals(ticker);
|
||||||
|
CREATE INDEX idx_operator_approvals_expires ON operator_approvals(expires_at)
|
||||||
|
WHERE status = 'pending';
|
||||||
@@ -0,0 +1,43 @@
|
|||||||
|
-- Stonks Oracle - Data retention and lifecycle policies
|
||||||
|
-- Tracks per-bucket and per-artifact-class retention rules.
|
||||||
|
-- Requirements: N3
|
||||||
|
|
||||||
|
CREATE TABLE retention_policies (
|
||||||
|
id UUID PRIMARY KEY DEFAULT uuid_generate_v4(),
|
||||||
|
bucket_name VARCHAR(200) NOT NULL,
|
||||||
|
artifact_class VARCHAR(100) NOT NULL DEFAULT 'default',
|
||||||
|
retention_days INTEGER NOT NULL DEFAULT 365,
|
||||||
|
archive_before_delete BOOLEAN NOT NULL DEFAULT FALSE,
|
||||||
|
active BOOLEAN NOT NULL DEFAULT TRUE,
|
||||||
|
created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
|
||||||
|
updated_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
|
||||||
|
UNIQUE(bucket_name, artifact_class)
|
||||||
|
);
|
||||||
|
|
||||||
|
-- Seed default retention policies per bucket
|
||||||
|
INSERT INTO retention_policies (bucket_name, artifact_class, retention_days, archive_before_delete) VALUES
|
||||||
|
('stonks-raw-market', 'default', 90, FALSE),
|
||||||
|
('stonks-raw-news', 'default', 180, FALSE),
|
||||||
|
('stonks-raw-filings', 'default', 365, FALSE),
|
||||||
|
('stonks-normalized', 'default', 180, FALSE),
|
||||||
|
('stonks-llm-prompts', 'default', 365, FALSE),
|
||||||
|
('stonks-llm-results', 'default', 365, FALSE),
|
||||||
|
('stonks-lakehouse', 'default', 730, FALSE),
|
||||||
|
('stonks-audit', 'default', 730, FALSE);
|
||||||
|
|
||||||
|
-- Track retention cleanup runs for observability
|
||||||
|
CREATE TABLE retention_runs (
|
||||||
|
id UUID PRIMARY KEY DEFAULT uuid_generate_v4(),
|
||||||
|
bucket_name VARCHAR(200) NOT NULL,
|
||||||
|
objects_scanned INTEGER NOT NULL DEFAULT 0,
|
||||||
|
objects_deleted INTEGER NOT NULL DEFAULT 0,
|
||||||
|
bytes_freed BIGINT NOT NULL DEFAULT 0,
|
||||||
|
db_rows_deleted INTEGER NOT NULL DEFAULT 0,
|
||||||
|
started_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
|
||||||
|
completed_at TIMESTAMPTZ,
|
||||||
|
status VARCHAR(20) NOT NULL DEFAULT 'running',
|
||||||
|
error_message TEXT
|
||||||
|
);
|
||||||
|
|
||||||
|
CREATE INDEX idx_retention_runs_bucket ON retention_runs(bucket_name, started_at DESC);
|
||||||
|
CREATE INDEX idx_retention_runs_status ON retention_runs(status);
|
||||||
+81
-11
@@ -1,14 +1,84 @@
|
|||||||
{
|
{
|
||||||
"Rules": [
|
"buckets": {
|
||||||
{
|
"stonks-raw-market": {
|
||||||
"ID": "raw-retention-365d",
|
"Rules": [
|
||||||
"Status": "Enabled",
|
{
|
||||||
"Filter": {
|
"ID": "raw-market-retention-90d",
|
||||||
"Prefix": ""
|
"Status": "Enabled",
|
||||||
},
|
"Filter": { "Prefix": "" },
|
||||||
"Expiration": {
|
"Expiration": { "Days": 90 }
|
||||||
"Days": 365
|
}
|
||||||
}
|
]
|
||||||
|
},
|
||||||
|
"stonks-raw-news": {
|
||||||
|
"Rules": [
|
||||||
|
{
|
||||||
|
"ID": "raw-news-retention-180d",
|
||||||
|
"Status": "Enabled",
|
||||||
|
"Filter": { "Prefix": "" },
|
||||||
|
"Expiration": { "Days": 180 }
|
||||||
|
}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"stonks-raw-filings": {
|
||||||
|
"Rules": [
|
||||||
|
{
|
||||||
|
"ID": "raw-filings-retention-365d",
|
||||||
|
"Status": "Enabled",
|
||||||
|
"Filter": { "Prefix": "" },
|
||||||
|
"Expiration": { "Days": 365 }
|
||||||
|
}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"stonks-normalized": {
|
||||||
|
"Rules": [
|
||||||
|
{
|
||||||
|
"ID": "normalized-retention-180d",
|
||||||
|
"Status": "Enabled",
|
||||||
|
"Filter": { "Prefix": "" },
|
||||||
|
"Expiration": { "Days": 180 }
|
||||||
|
}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"stonks-llm-prompts": {
|
||||||
|
"Rules": [
|
||||||
|
{
|
||||||
|
"ID": "llm-prompts-retention-365d",
|
||||||
|
"Status": "Enabled",
|
||||||
|
"Filter": { "Prefix": "" },
|
||||||
|
"Expiration": { "Days": 365 }
|
||||||
|
}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"stonks-llm-results": {
|
||||||
|
"Rules": [
|
||||||
|
{
|
||||||
|
"ID": "llm-results-retention-365d",
|
||||||
|
"Status": "Enabled",
|
||||||
|
"Filter": { "Prefix": "" },
|
||||||
|
"Expiration": { "Days": 365 }
|
||||||
|
}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"stonks-lakehouse": {
|
||||||
|
"Rules": [
|
||||||
|
{
|
||||||
|
"ID": "lakehouse-retention-730d",
|
||||||
|
"Status": "Enabled",
|
||||||
|
"Filter": { "Prefix": "" },
|
||||||
|
"Expiration": { "Days": 730 }
|
||||||
|
}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"stonks-audit": {
|
||||||
|
"Rules": [
|
||||||
|
{
|
||||||
|
"ID": "audit-retention-730d",
|
||||||
|
"Status": "Enabled",
|
||||||
|
"Filter": { "Prefix": "" },
|
||||||
|
"Expiration": { "Days": 730 }
|
||||||
|
}
|
||||||
|
]
|
||||||
}
|
}
|
||||||
]
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1,10 +1,18 @@
|
|||||||
"""Apache Superset configuration for Stonks Oracle."""
|
"""Apache Superset configuration for Stonks Oracle.
|
||||||
|
|
||||||
|
Security hardening applied:
|
||||||
|
- Session cookies: HttpOnly, Secure, SameSite=Lax
|
||||||
|
- Talisman CSP headers enabled
|
||||||
|
- Public role disabled (login required)
|
||||||
|
- Unsafe DB connections blocked
|
||||||
|
- Row limits enforced
|
||||||
|
"""
|
||||||
import os
|
import os
|
||||||
|
|
||||||
# Superset secret key
|
# Superset secret key — set SUPERSET_SECRET_KEY in every real deployment; the fallback below is a dev-only placeholder
|
||||||
SECRET_KEY = os.getenv("SUPERSET_SECRET_KEY", "stonks-dev-secret-key-change-me")
|
SECRET_KEY = os.getenv("SUPERSET_SECRET_KEY", "stonks-dev-secret-key-change-me")
|
||||||
|
|
||||||
# Trino datasource
|
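# Default Trino connection (Hive catalog, kept for backward compatibility). NOTE(review): Superset reads SQLALCHEMY_DATABASE_URI as its own metadata database URI — confirm that pointing it at Trino is intended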
|
||||||
SQLALCHEMY_DATABASE_URI = "trino://trino@trino:8080/lakehouse/stonks"
|
SQLALCHEMY_DATABASE_URI = "trino://trino@trino:8080/lakehouse/stonks"
|
||||||
|
|
||||||
# Feature flags
|
# Feature flags
|
||||||
@@ -12,6 +20,10 @@ FEATURE_FLAGS = {
|
|||||||
"ENABLE_TEMPLATE_PROCESSING": True,
|
"ENABLE_TEMPLATE_PROCESSING": True,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
# Additional database connections available in Superset UI:
|
||||||
|
# Hive catalog: trino://trino@trino:8080/lakehouse/stonks
|
||||||
|
# Iceberg catalog: trino://trino@trino:8080/iceberg/stonks
|
||||||
|
|
||||||
# Cache config (Redis-backed)
|
# Cache config (Redis-backed)
|
||||||
CACHE_CONFIG = {
|
CACHE_CONFIG = {
|
||||||
"CACHE_TYPE": "RedisCache",
|
"CACHE_TYPE": "RedisCache",
|
||||||
@@ -21,3 +33,31 @@ CACHE_CONFIG = {
|
|||||||
"CACHE_REDIS_PORT": int(os.getenv("REDIS_PORT", "6379")),
|
"CACHE_REDIS_PORT": int(os.getenv("REDIS_PORT", "6379")),
|
||||||
"CACHE_REDIS_DB": 1,
|
"CACHE_REDIS_DB": 1,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
# --- Security hardening ---
|
||||||
|
# Disable public user role (require login)
|
||||||
|
PUBLIC_ROLE_LIKE = None
|
||||||
|
|
||||||
|
# Session cookie security
|
||||||
|
SESSION_COOKIE_HTTPONLY = True
|
||||||
|
SESSION_COOKIE_SECURE = True
|
||||||
|
SESSION_COOKIE_SAMESITE = "Lax"
|
||||||
|
|
||||||
|
# Talisman CSP headers
|
||||||
|
TALISMAN_ENABLED = True
|
||||||
|
TALISMAN_CONFIG = {
|
||||||
|
"content_security_policy": {
|
||||||
|
"default-src": ["'self'"],
|
||||||
|
"img-src": ["'self'", "data:"],
|
||||||
|
"style-src": ["'self'", "'unsafe-inline'"],
|
||||||
|
"script-src": ["'self'", "'unsafe-inline'", "'unsafe-eval'"],
|
||||||
|
},
|
||||||
|
"force_https": False, # TLS terminated at ingress
|
||||||
|
}
|
||||||
|
|
||||||
|
# Prevent Superset from allowing arbitrary SQL database connections
|
||||||
|
PREVENT_UNSAFE_DB_CONNECTIONS = True
|
||||||
|
|
||||||
|
# Row limit for queries
|
||||||
|
ROW_LIMIT = 50000
|
||||||
|
SQL_MAX_ROW = 100000
|
||||||
|
|||||||
@@ -5,3 +5,8 @@ hive.s3.endpoint=http://minio:9000
|
|||||||
hive.s3.path-style-access=true
|
hive.s3.path-style-access=true
|
||||||
hive.s3.aws-access-key=minioadmin
|
hive.s3.aws-access-key=minioadmin
|
||||||
hive.s3.aws-secret-key=minioadmin
|
hive.s3.aws-secret-key=minioadmin
|
||||||
|
fs.native-s3.enabled=true
|
||||||
|
s3.endpoint=http://minio:9000
|
||||||
|
s3.path-style-access=true
|
||||||
|
s3.aws-access-key=minioadmin
|
||||||
|
s3.aws-secret-key=minioadmin
|
||||||
|
|||||||
@@ -2,15 +2,31 @@
|
|||||||
|
|
||||||
Analytical fact table definitions for MinIO-backed datasets queried via Trino.
|
Analytical fact table definitions for MinIO-backed datasets queried via Trino.
|
||||||
|
|
||||||
|
All tables use Hive-compatible partition layouts on MinIO (`s3a://stonks-lakehouse/warehouse/`)
|
||||||
|
and are defined in the `lakehouse.stonks` schema. Parquet is the storage format.
|
||||||
|
|
||||||
## Fact Tables
|
## Fact Tables
|
||||||
- `lake.market_bars` — OHLCV bar data
|
- `lake.market_bars` — OHLCV bar data per symbol per interval
|
||||||
- `lake.market_quotes` — quote snapshots
|
- `lake.market_quotes` — bid/ask quote snapshots
|
||||||
- `lake.company_events` — corporate actions and events
|
- `lake.company_events` — corporate actions, earnings, filings, and issuer events
|
||||||
- `lake.documents` — ingested document metadata
|
- `lake.documents` — ingested document metadata (articles, filings, transcripts)
|
||||||
- `lake.document_extractions` — AI extraction outputs
|
- `lake.document_extractions` — AI extraction outputs per document per company
|
||||||
- `lake.trade_signals` — aggregated trend signals
|
- `lake.trade_signals` — aggregated trend signals and recommendation actions
|
||||||
- `lake.trade_orders` — order submission records
|
- `lake.trade_orders` — order submission records (paper and live)
|
||||||
- `lake.trade_fills` — fill and execution records
|
- `lake.trade_fills` — fill and execution records from broker
|
||||||
- `lake.positions_daily` — end-of-day position snapshots
|
- `lake.positions_daily` — end-of-day position snapshots
|
||||||
- `lake.pnl_daily` — daily PnL records
|
- `lake.pnl_daily` — daily PnL records per symbol per account
|
||||||
- `lake.prediction_vs_outcome` — prediction accuracy tracking
|
- `lake.prediction_vs_outcome` — prediction accuracy tracking
|
||||||
|
- `lake.model_performance` — extraction model performance metrics
|
||||||
|
|
||||||
|
## Partitioning
|
||||||
|
- Most tables partition by `dt` (date)
|
||||||
|
- `document_extractions`, `prediction_vs_outcome`, and `model_performance` also partition by `model_version`
|
||||||
|
|
||||||
|
## Trino Catalogs
|
||||||
|
- `lakehouse` catalog (Hive connector) for external Hive-compatible tables
|
||||||
|
- `iceberg` catalog (Iceberg connector) for managed Iceberg tables
|
||||||
|
|
||||||
|
## Views
|
||||||
|
Example SQL views for dashboards and ad hoc analysis are in `lakehouse/views/`.
|
||||||
|
See `lakehouse/views/README.md` for details.
|
||||||
|
|||||||
@@ -0,0 +1,24 @@
|
|||||||
|
-- Analytical fact table: company_events
|
||||||
|
-- Corporate actions, earnings, filings, and other issuer events.
|
||||||
|
-- Partitioned by dt (date) on MinIO.
|
||||||
|
-- Path: s3://stonks-lakehouse/warehouse/company_events/dt={yyyy-mm-dd}/part-*.parquet
|
||||||
|
-- Requirements: 2.3, 9.4, 9.5, 10.1
|
||||||
|
-- Design ref: Section 7 (lake.company_events)
|
||||||
|
|
||||||
|
CREATE TABLE IF NOT EXISTS lakehouse.stonks.company_events (
|
||||||
|
event_id VARCHAR,
|
||||||
|
ticker VARCHAR,
|
||||||
|
event_type VARCHAR,
|
||||||
|
event_subtype VARCHAR,
|
||||||
|
title VARCHAR,
|
||||||
|
description VARCHAR,
|
||||||
|
source VARCHAR,
|
||||||
|
source_url VARCHAR,
|
||||||
|
event_at TIMESTAMP(6) WITH TIME ZONE,
|
||||||
|
ingested_at TIMESTAMP(6) WITH TIME ZONE,
|
||||||
|
dt DATE
|
||||||
|
) WITH (
|
||||||
|
format = 'PARQUET',
|
||||||
|
partitioned_by = ARRAY['dt'],
|
||||||
|
external_location = 's3a://stonks-lakehouse/warehouse/company_events/'
|
||||||
|
);
|
||||||
@@ -1,16 +1,28 @@
|
|||||||
-- Analytical fact table: document_extractions
|
-- Analytical fact table: document_extractions
|
||||||
-- Partitioned by dt and model_version on MinIO
|
-- AI extraction outputs per document per company.
|
||||||
|
-- Partitioned by dt and model_version on MinIO.
|
||||||
|
-- Path: s3://stonks-lakehouse/warehouse/document_extractions/dt={yyyy-mm-dd}/model_version={ver}/part-*.parquet
|
||||||
|
-- Requirements: 5.3, 5.5, 9.4, 9.5, 10.1, 10.4
|
||||||
|
-- Design ref: Section 6.3, Section 7 (lake.document_extractions)
|
||||||
|
|
||||||
CREATE TABLE IF NOT EXISTS lakehouse.stonks.document_extractions (
|
CREATE TABLE IF NOT EXISTS lakehouse.stonks.document_extractions (
|
||||||
document_id VARCHAR,
|
document_id VARCHAR,
|
||||||
ticker VARCHAR,
|
ticker VARCHAR,
|
||||||
|
company_name VARCHAR,
|
||||||
|
relevance DOUBLE,
|
||||||
sentiment VARCHAR,
|
sentiment VARCHAR,
|
||||||
impact_score DOUBLE,
|
impact_score DOUBLE,
|
||||||
|
impact_horizon VARCHAR,
|
||||||
catalyst_type VARCHAR,
|
catalyst_type VARCHAR,
|
||||||
confidence DOUBLE,
|
confidence DOUBLE,
|
||||||
novelty_score DOUBLE,
|
novelty_score DOUBLE,
|
||||||
|
source_credibility DOUBLE,
|
||||||
|
key_facts VARCHAR,
|
||||||
|
risks VARCHAR,
|
||||||
|
macro_themes VARCHAR,
|
||||||
model_name VARCHAR,
|
model_name VARCHAR,
|
||||||
prompt_version VARCHAR,
|
prompt_version VARCHAR,
|
||||||
|
schema_version VARCHAR,
|
||||||
extraction_at TIMESTAMP(6) WITH TIME ZONE,
|
extraction_at TIMESTAMP(6) WITH TIME ZONE,
|
||||||
dt DATE,
|
dt DATE,
|
||||||
model_version VARCHAR
|
model_version VARCHAR
|
||||||
|
|||||||
@@ -1,6 +1,9 @@
|
|||||||
-- Analytical fact table: documents
|
-- Analytical fact table: documents
|
||||||
-- Partitioned by dt and source_type on MinIO
|
-- Ingested document metadata for articles, filings, transcripts, and press releases.
|
||||||
-- Path: s3://stonks-lakehouse/warehouse/documents/dt={yyyy-mm-dd}/source_type={type}/part-*.parquet
|
-- Partitioned by dt on MinIO.
|
||||||
|
-- Path: s3://stonks-lakehouse/warehouse/documents/dt={yyyy-mm-dd}/part-*.parquet
|
||||||
|
-- Requirements: 3.1, 3.3, 9.4, 9.5, 10.1, 10.4
|
||||||
|
-- Design ref: Section 6.2, Section 7 (lake.documents)
|
||||||
|
|
||||||
CREATE TABLE IF NOT EXISTS lakehouse.stonks.documents (
|
CREATE TABLE IF NOT EXISTS lakehouse.stonks.documents (
|
||||||
document_id VARCHAR,
|
document_id VARCHAR,
|
||||||
@@ -9,7 +12,11 @@ CREATE TABLE IF NOT EXISTS lakehouse.stonks.documents (
|
|||||||
ticker VARCHAR,
|
ticker VARCHAR,
|
||||||
publisher VARCHAR,
|
publisher VARCHAR,
|
||||||
title VARCHAR,
|
title VARCHAR,
|
||||||
|
url VARCHAR,
|
||||||
|
canonical_url VARCHAR,
|
||||||
|
language VARCHAR,
|
||||||
published_at TIMESTAMP(6) WITH TIME ZONE,
|
published_at TIMESTAMP(6) WITH TIME ZONE,
|
||||||
|
retrieved_at TIMESTAMP(6) WITH TIME ZONE,
|
||||||
content_hash VARCHAR,
|
content_hash VARCHAR,
|
||||||
confidence DOUBLE,
|
confidence DOUBLE,
|
||||||
dt DATE
|
dt DATE
|
||||||
|
|||||||
@@ -1,6 +1,9 @@
|
|||||||
-- Analytical fact table: market_bars
|
-- Analytical fact table: market_bars
|
||||||
-- Partitioned by dt (date) on MinIO
|
-- OHLCV bar data for tracked symbols.
|
||||||
|
-- Partitioned by dt (date) on MinIO.
|
||||||
-- Path: s3://stonks-lakehouse/warehouse/market_bars/dt={yyyy-mm-dd}/part-*.parquet
|
-- Path: s3://stonks-lakehouse/warehouse/market_bars/dt={yyyy-mm-dd}/part-*.parquet
|
||||||
|
-- Requirements: 2.1, 9.4, 9.5, 10.1
|
||||||
|
-- Design ref: Section 7 (lake.market_bars)
|
||||||
|
|
||||||
CREATE TABLE IF NOT EXISTS lakehouse.stonks.market_bars (
|
CREATE TABLE IF NOT EXISTS lakehouse.stonks.market_bars (
|
||||||
ticker VARCHAR,
|
ticker VARCHAR,
|
||||||
@@ -10,7 +13,9 @@ CREATE TABLE IF NOT EXISTS lakehouse.stonks.market_bars (
|
|||||||
close_price DOUBLE,
|
close_price DOUBLE,
|
||||||
volume BIGINT,
|
volume BIGINT,
|
||||||
vwap DOUBLE,
|
vwap DOUBLE,
|
||||||
|
trade_count BIGINT,
|
||||||
bar_timestamp TIMESTAMP(6) WITH TIME ZONE,
|
bar_timestamp TIMESTAMP(6) WITH TIME ZONE,
|
||||||
|
bar_interval VARCHAR,
|
||||||
source VARCHAR,
|
source VARCHAR,
|
||||||
dt DATE
|
dt DATE
|
||||||
) WITH (
|
) WITH (
|
||||||
|
|||||||
@@ -0,0 +1,23 @@
|
|||||||
|
-- Analytical fact table: market_quotes
|
||||||
|
-- Quote snapshots for tracked symbols.
|
||||||
|
-- Partitioned by dt (date) on MinIO.
|
||||||
|
-- Path: s3://stonks-lakehouse/warehouse/market_quotes/dt={yyyy-mm-dd}/part-*.parquet
|
||||||
|
-- Requirements: 2.1, 9.4, 9.5, 10.1
|
||||||
|
-- Design ref: Section 7 (lake.market_quotes)
|
||||||
|
|
||||||
|
CREATE TABLE IF NOT EXISTS lakehouse.stonks.market_quotes (
|
||||||
|
ticker VARCHAR,
|
||||||
|
bid_price DOUBLE,
|
||||||
|
ask_price DOUBLE,
|
||||||
|
bid_size BIGINT,
|
||||||
|
ask_size BIGINT,
|
||||||
|
last_price DOUBLE,
|
||||||
|
last_size BIGINT,
|
||||||
|
source VARCHAR,
|
||||||
|
quote_at TIMESTAMP(6) WITH TIME ZONE,
|
||||||
|
dt DATE
|
||||||
|
) WITH (
|
||||||
|
format = 'PARQUET',
|
||||||
|
partitioned_by = ARRAY['dt'],
|
||||||
|
external_location = 's3a://stonks-lakehouse/warehouse/market_quotes/'
|
||||||
|
);
|
||||||
@@ -0,0 +1,33 @@
|
|||||||
|
-- Analytical fact table: model_performance
|
||||||
|
-- Tracks extraction model performance for Trino/Superset dashboards.
|
||||||
|
-- Partitioned by dt and model_version on MinIO.
|
||||||
|
-- Path: s3://stonks-lakehouse/warehouse/model_performance/dt={yyyy-mm-dd}/model_version={ver}/part-*.parquet
|
||||||
|
-- Requirements: 12.1, 12.2
|
||||||
|
|
||||||
|
CREATE TABLE IF NOT EXISTS lakehouse.stonks.model_performance (
|
||||||
|
document_id VARCHAR,
|
||||||
|
ticker VARCHAR,
|
||||||
|
model_name VARCHAR,
|
||||||
|
prompt_version VARCHAR,
|
||||||
|
schema_version VARCHAR,
|
||||||
|
success BOOLEAN,
|
||||||
|
attempt_count INTEGER,
|
||||||
|
total_duration_ms INTEGER,
|
||||||
|
first_attempt_duration_ms INTEGER,
|
||||||
|
final_attempt_duration_ms INTEGER,
|
||||||
|
confidence DOUBLE,
|
||||||
|
validation_status VARCHAR,
|
||||||
|
validation_error_count INTEGER,
|
||||||
|
validation_warning_count INTEGER,
|
||||||
|
retry_count INTEGER,
|
||||||
|
input_token_estimate INTEGER,
|
||||||
|
output_token_estimate INTEGER,
|
||||||
|
company_count INTEGER,
|
||||||
|
recorded_at TIMESTAMP(6) WITH TIME ZONE,
|
||||||
|
dt DATE,
|
||||||
|
model_version VARCHAR
|
||||||
|
) WITH (
|
||||||
|
format = 'PARQUET',
|
||||||
|
partitioned_by = ARRAY['dt', 'model_version'],
|
||||||
|
external_location = 's3a://stonks-lakehouse/warehouse/model_performance/'
|
||||||
|
);
|
||||||
@@ -1,12 +1,19 @@
|
|||||||
-- Analytical fact table: pnl_daily
|
-- Analytical fact table: pnl_daily
|
||||||
-- Partitioned by dt on MinIO
|
-- Daily profit and loss records per symbol per account.
|
||||||
|
-- Partitioned by dt on MinIO.
|
||||||
|
-- Path: s3://stonks-lakehouse/warehouse/pnl_daily/dt={yyyy-mm-dd}/part-*.parquet
|
||||||
|
-- Requirements: 9.4, 9.5, 10.1, 10.3
|
||||||
|
-- Design ref: Section 7 (lake.pnl_daily)
|
||||||
|
|
||||||
CREATE TABLE IF NOT EXISTS lakehouse.stonks.pnl_daily (
|
CREATE TABLE IF NOT EXISTS lakehouse.stonks.pnl_daily (
|
||||||
ticker VARCHAR,
|
ticker VARCHAR,
|
||||||
realized_pnl DOUBLE,
|
realized_pnl DOUBLE,
|
||||||
unrealized_pnl DOUBLE,
|
unrealized_pnl DOUBLE,
|
||||||
total_pnl DOUBLE,
|
total_pnl DOUBLE,
|
||||||
|
fees DOUBLE,
|
||||||
|
net_pnl DOUBLE,
|
||||||
broker_account VARCHAR,
|
broker_account VARCHAR,
|
||||||
|
execution_mode VARCHAR,
|
||||||
dt DATE
|
dt DATE
|
||||||
) WITH (
|
) WITH (
|
||||||
format = 'PARQUET',
|
format = 'PARQUET',
|
||||||
|
|||||||
@@ -1,13 +1,19 @@
|
|||||||
-- Analytical fact table: positions_daily
|
-- Analytical fact table: positions_daily
|
||||||
-- Partitioned by dt on MinIO
|
-- End-of-day position snapshots.
|
||||||
|
-- Partitioned by dt on MinIO.
|
||||||
|
-- Path: s3://stonks-lakehouse/warehouse/positions_daily/dt={yyyy-mm-dd}/part-*.parquet
|
||||||
|
-- Requirements: 9.4, 9.5, 10.1, 10.3
|
||||||
|
-- Design ref: Section 7 (lake.positions_daily)
|
||||||
|
|
||||||
CREATE TABLE IF NOT EXISTS lakehouse.stonks.positions_daily (
|
CREATE TABLE IF NOT EXISTS lakehouse.stonks.positions_daily (
|
||||||
ticker VARCHAR,
|
ticker VARCHAR,
|
||||||
quantity DOUBLE,
|
quantity DOUBLE,
|
||||||
avg_entry_price DOUBLE,
|
avg_entry_price DOUBLE,
|
||||||
close_price DOUBLE,
|
close_price DOUBLE,
|
||||||
|
market_value DOUBLE,
|
||||||
unrealized_pnl DOUBLE,
|
unrealized_pnl DOUBLE,
|
||||||
broker_account VARCHAR,
|
broker_account VARCHAR,
|
||||||
|
execution_mode VARCHAR,
|
||||||
snapshot_at TIMESTAMP(6) WITH TIME ZONE,
|
snapshot_at TIMESTAMP(6) WITH TIME ZONE,
|
||||||
dt DATE
|
dt DATE
|
||||||
) WITH (
|
) WITH (
|
||||||
|
|||||||
@@ -1,19 +1,24 @@
|
|||||||
-- Analytical fact table: prediction_vs_outcome
|
-- Analytical fact table: prediction_vs_outcome
|
||||||
-- Partitioned by dt on MinIO
|
-- Prediction accuracy tracking: predicted signals vs realized market moves.
|
||||||
|
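-- Partitioned by dt and model_version on MinIO.
-- NOTE(review): Trino's Hive connector requires partition columns to be the last columns, in partitioned_by order (dt, then model_version) — check the column order in this definition.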
|
||||||
|
-- Path: s3://stonks-lakehouse/warehouse/prediction_vs_outcome/dt={yyyy-mm-dd}/model_version={ver}/part-*.parquet
|
||||||
|
-- Requirements: 9.4, 9.5, 10.1, 10.3
|
||||||
|
-- Design ref: Section 7 (lake.prediction_vs_outcome)
|
||||||
|
|
||||||
CREATE TABLE IF NOT EXISTS lakehouse.stonks.prediction_vs_outcome (
|
CREATE TABLE IF NOT EXISTS lakehouse.stonks.prediction_vs_outcome (
|
||||||
recommendation_id VARCHAR,
|
recommendation_id VARCHAR,
|
||||||
ticker VARCHAR,
|
ticker VARCHAR,
|
||||||
predicted_action VARCHAR,
|
predicted_action VARCHAR,
|
||||||
predicted_confidence DOUBLE,
|
predicted_confidence DOUBLE,
|
||||||
actual_move_pct DOUBLE,
|
actual_move_pct DOUBLE,
|
||||||
outcome VARCHAR,
|
outcome VARCHAR,
|
||||||
horizon_days INTEGER,
|
horizon_days INTEGER,
|
||||||
predicted_at TIMESTAMP(6) WITH TIME ZONE,
|
predicted_at TIMESTAMP(6) WITH TIME ZONE,
|
||||||
evaluated_at TIMESTAMP(6) WITH TIME ZONE,
|
evaluated_at TIMESTAMP(6) WITH TIME ZONE,
|
||||||
dt DATE
|
model_version VARCHAR,
|
||||||
|
dt DATE
|
||||||
) WITH (
|
) WITH (
|
||||||
format = 'PARQUET',
|
format = 'PARQUET',
|
||||||
partitioned_by = ARRAY['dt'],
|
partitioned_by = ARRAY['dt', 'model_version'],
|
||||||
external_location = 's3a://stonks-lakehouse/warehouse/prediction_vs_outcome/'
|
external_location = 's3a://stonks-lakehouse/warehouse/prediction_vs_outcome/'
|
||||||
);
|
);
|
||||||
|
|||||||
@@ -1,5 +1,9 @@
|
|||||||
-- Analytical fact table: trade_fills
|
-- Analytical fact table: trade_fills
|
||||||
-- Partitioned by dt on MinIO
|
-- Fill and execution records from broker.
|
||||||
|
-- Partitioned by dt on MinIO.
|
||||||
|
-- Path: s3://stonks-lakehouse/warehouse/trade_fills/dt={yyyy-mm-dd}/part-*.parquet
|
||||||
|
-- Requirements: 9.4, 9.5, 10.1, 10.3
|
||||||
|
-- Design ref: Section 7 (lake.trade_fills)
|
||||||
|
|
||||||
CREATE TABLE IF NOT EXISTS lakehouse.stonks.trade_fills (
|
CREATE TABLE IF NOT EXISTS lakehouse.stonks.trade_fills (
|
||||||
fill_id VARCHAR,
|
fill_id VARCHAR,
|
||||||
@@ -8,6 +12,7 @@ CREATE TABLE IF NOT EXISTS lakehouse.stonks.trade_fills (
|
|||||||
side VARCHAR,
|
side VARCHAR,
|
||||||
fill_price DOUBLE,
|
fill_price DOUBLE,
|
||||||
fill_quantity DOUBLE,
|
fill_quantity DOUBLE,
|
||||||
|
commission DOUBLE,
|
||||||
broker_account VARCHAR,
|
broker_account VARCHAR,
|
||||||
filled_at TIMESTAMP(6) WITH TIME ZONE,
|
filled_at TIMESTAMP(6) WITH TIME ZONE,
|
||||||
dt DATE
|
dt DATE
|
||||||
|
|||||||
@@ -1,14 +1,20 @@
|
|||||||
-- Analytical fact table: trade_orders
|
-- Analytical fact table: trade_orders
|
||||||
-- Partitioned by dt on MinIO
|
-- Order submission records for paper and live trading.
|
||||||
|
-- Partitioned by dt on MinIO.
|
||||||
|
-- Path: s3://stonks-lakehouse/warehouse/trade_orders/dt={yyyy-mm-dd}/part-*.parquet
|
||||||
|
-- Requirements: 8.3, 9.4, 9.5, 10.1, 10.3
|
||||||
|
-- Design ref: Section 7 (lake.trade_orders)
|
||||||
|
|
||||||
CREATE TABLE IF NOT EXISTS lakehouse.stonks.trade_orders (
|
CREATE TABLE IF NOT EXISTS lakehouse.stonks.trade_orders (
|
||||||
order_id VARCHAR,
|
order_id VARCHAR,
|
||||||
|
recommendation_id VARCHAR,
|
||||||
ticker VARCHAR,
|
ticker VARCHAR,
|
||||||
side VARCHAR,
|
side VARCHAR,
|
||||||
order_type VARCHAR,
|
order_type VARCHAR,
|
||||||
quantity DOUBLE,
|
quantity DOUBLE,
|
||||||
limit_price DOUBLE,
|
limit_price DOUBLE,
|
||||||
status VARCHAR,
|
status VARCHAR,
|
||||||
|
execution_mode VARCHAR,
|
||||||
broker_account VARCHAR,
|
broker_account VARCHAR,
|
||||||
submitted_at TIMESTAMP(6) WITH TIME ZONE,
|
submitted_at TIMESTAMP(6) WITH TIME ZONE,
|
||||||
dt DATE
|
dt DATE
|
||||||
|
|||||||
@@ -1,16 +1,24 @@
|
|||||||
-- Analytical fact table: trade_signals
|
-- Analytical fact table: trade_signals
|
||||||
-- Partitioned by dt on MinIO
|
-- Aggregated trend signals and recommendation actions.
|
||||||
|
-- Partitioned by dt on MinIO.
|
||||||
|
-- Path: s3://stonks-lakehouse/warehouse/trade_signals/dt={yyyy-mm-dd}/part-*.parquet
|
||||||
|
-- Requirements: 6.1, 6.2, 6.4, 6.5, 7.1, 9.4, 9.5, 10.1
|
||||||
|
-- Design ref: Section 6.4, Section 6.5, Section 7 (lake.trade_signals)
|
||||||
|
|
||||||
CREATE TABLE IF NOT EXISTS lakehouse.stonks.trade_signals (
|
CREATE TABLE IF NOT EXISTS lakehouse.stonks.trade_signals (
|
||||||
signal_id VARCHAR,
|
signal_id VARCHAR,
|
||||||
ticker VARCHAR,
|
ticker VARCHAR,
|
||||||
trend_direction VARCHAR,
|
trend_direction VARCHAR,
|
||||||
trend_strength DOUBLE,
|
trend_strength DOUBLE,
|
||||||
confidence DOUBLE,
|
confidence DOUBLE,
|
||||||
action VARCHAR,
|
contradiction_score DOUBLE,
|
||||||
time_horizon VARCHAR,
|
dominant_catalysts VARCHAR,
|
||||||
generated_at TIMESTAMP(6) WITH TIME ZONE,
|
material_risks VARCHAR,
|
||||||
dt DATE
|
action VARCHAR,
|
||||||
|
time_horizon VARCHAR,
|
||||||
|
recommendation_id VARCHAR,
|
||||||
|
generated_at TIMESTAMP(6) WITH TIME ZONE,
|
||||||
|
dt DATE
|
||||||
) WITH (
|
) WITH (
|
||||||
format = 'PARQUET',
|
format = 'PARQUET',
|
||||||
partitioned_by = ARRAY['dt'],
|
partitioned_by = ARRAY['dt'],
|
||||||
|
|||||||
@@ -0,0 +1,23 @@
|
|||||||
|
# Lakehouse Views
|
||||||
|
|
||||||
|
Example SQL views for Trino over MinIO-backed analytical fact tables.
|
||||||
|
|
||||||
|
These views are designed to be created in the `lakehouse.stonks` schema and
|
||||||
|
can be used directly in Superset dashboards or ad hoc Trino queries.
|
||||||
|
|
||||||
|
## Views
|
||||||
|
|
||||||
|
- `prediction_accuracy` — Joins predicted signals with realized market moves to score prediction quality
|
||||||
|
- `paper_trade_scorecard` — Aggregates paper trading performance by symbol with win rates and PnL
|
||||||
|
- `paper_trade_detail` — Per-order paper trade detail with fill prices and realized outcomes
|
||||||
|
- `signal_hit_rate` — Daily signal accuracy summary across all symbols
|
||||||
|
|
||||||
|
## Usage
|
||||||
|
|
||||||
|
Connect to Trino and run each `.sql` file to create the view:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
trino --catalog lakehouse --schema stonks < lakehouse/views/prediction_accuracy.sql
|
||||||
|
```
|
||||||
|
|
||||||
|
Or paste into the Superset SQL Lab to explore interactively.
|
||||||
@@ -0,0 +1,47 @@
|
|||||||
|
-- View: paper_trade_detail
|
||||||
|
-- Per-order paper trade detail joining orders, fills, and the originating
|
||||||
|
-- recommendation's prediction outcome. Useful for drill-down from the scorecard.
|
||||||
|
-- Requirements: 10.1, 10.3, 10.4
|
||||||
|
-- Design ref: Section 9.2 (evidence-to-outcome drill-down)
|
||||||
|
|
||||||
|
CREATE OR REPLACE VIEW lakehouse.stonks.paper_trade_detail AS
|
||||||
|
SELECT
|
||||||
|
o.order_id,
|
||||||
|
o.recommendation_id,
|
||||||
|
o.ticker,
|
||||||
|
o.side,
|
||||||
|
o.order_type,
|
||||||
|
o.quantity,
|
||||||
|
o.limit_price,
|
||||||
|
o.status AS order_status,
|
||||||
|
o.submitted_at,
|
||||||
|
f.fill_id,
|
||||||
|
f.fill_price,
|
||||||
|
f.fill_quantity,
|
||||||
|
f.commission,
|
||||||
|
f.filled_at,
|
||||||
|
-- Slippage: (fill price - limit price) / limit price, as a percentage; positive is worse for buys and better for sells
|
||||||
|
CASE
|
||||||
|
WHEN o.limit_price IS NOT NULL AND o.limit_price > 0 THEN
|
||||||
|
(f.fill_price - o.limit_price) / o.limit_price * 100
|
||||||
|
ELSE NULL
|
||||||
|
END AS slippage_pct,
|
||||||
|
-- Link back to prediction outcome
|
||||||
|
pvo.predicted_action,
|
||||||
|
pvo.predicted_confidence,
|
||||||
|
pvo.actual_move_pct,
|
||||||
|
pvo.outcome AS prediction_outcome,
|
||||||
|
o.broker_account,
|
||||||
|
o.dt
|
||||||
|
FROM
|
||||||
|
lakehouse.stonks.trade_orders o
|
||||||
|
LEFT JOIN
|
||||||
|
lakehouse.stonks.trade_fills f
|
||||||
|
ON o.order_id = f.order_id
|
||||||
|
AND o.dt = f.dt
|
||||||
|
LEFT JOIN
|
||||||
|
lakehouse.stonks.prediction_vs_outcome pvo
|
||||||
|
ON o.recommendation_id = pvo.recommendation_id
|
||||||
|
AND o.dt = pvo.dt
|
||||||
|
WHERE
|
||||||
|
o.execution_mode = 'paper';
|
||||||
@@ -0,0 +1,42 @@
|
|||||||
|
-- View: paper_trade_scorecard
|
||||||
|
-- Aggregates paper trading performance per symbol with win rates, PnL, and
|
||||||
|
-- average fill quality. Filters to paper execution mode only.
|
||||||
|
-- Requirements: 10.1, 10.2, 10.3
|
||||||
|
-- Design ref: Section 9.2 (paper trading PnL scorecard)
|
||||||
|
|
||||||
|
CREATE OR REPLACE VIEW lakehouse.stonks.paper_trade_scorecard AS
|
||||||
|
SELECT
|
||||||
|
pnl.ticker,
|
||||||
|
pnl.broker_account,
|
||||||
|
COUNT(DISTINCT pnl.dt) AS trading_days,
|
||||||
|
SUM(pnl.realized_pnl) AS total_realized_pnl,
|
||||||
|
SUM(pnl.unrealized_pnl) AS total_unrealized_pnl,
|
||||||
|
SUM(pnl.net_pnl) AS total_net_pnl,
|
||||||
|
SUM(pnl.fees) AS total_fees,
|
||||||
|
AVG(pnl.net_pnl) AS avg_daily_pnl,
|
||||||
|
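-- Win rate: fraction of joined pnl/order rows with positive net PnL (equals the fraction of days only when each ticker/account/day has at most one paper order)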
|
||||||
|
CAST(
|
||||||
|
COUNT(CASE WHEN pnl.net_pnl > 0 THEN 1 END) AS DOUBLE
|
||||||
|
) / NULLIF(COUNT(*), 0) AS win_rate,
|
||||||
|
-- Worst and best single-day PnL
|
||||||
|
MIN(pnl.net_pnl) AS worst_day_pnl,
|
||||||
|
MAX(pnl.net_pnl) AS best_day_pnl,
|
||||||
|
-- Order counts from trade_orders
|
||||||
|
COUNT(DISTINCT o.order_id) AS total_orders,
|
||||||
|
COUNT(DISTINCT CASE WHEN o.status = 'filled' THEN o.order_id END)
|
||||||
|
AS filled_orders,
|
||||||
|
MIN(pnl.dt) AS first_trade_date,
|
||||||
|
MAX(pnl.dt) AS last_trade_date
|
||||||
|
FROM
|
||||||
|
lakehouse.stonks.pnl_daily pnl
|
||||||
|
LEFT JOIN
|
||||||
|
lakehouse.stonks.trade_orders o
|
||||||
|
ON pnl.ticker = o.ticker
|
||||||
|
AND pnl.broker_account = o.broker_account
|
||||||
|
AND pnl.dt = o.dt
|
||||||
|
AND o.execution_mode = 'paper'
|
||||||
|
WHERE
|
||||||
|
pnl.execution_mode = 'paper'
|
||||||
|
GROUP BY
|
||||||
|
pnl.ticker,
|
||||||
|
pnl.broker_account;
|
||||||
@@ -0,0 +1,44 @@
|
|||||||
|
-- View: prediction_accuracy
|
||||||
|
-- Joins prediction_vs_outcome with trade_signals to provide
|
||||||
|
-- a comprehensive prediction accuracy scorecard.
|
||||||
|
-- Requirements: 10.1, 10.2, 10.3, 10.4
|
||||||
|
-- Design ref: Section 9.2 (prediction confidence vs realized move)
|
||||||
|
|
||||||
|
CREATE OR REPLACE VIEW lakehouse.stonks.prediction_accuracy AS
|
||||||
|
SELECT
|
||||||
|
pvo.recommendation_id,
|
||||||
|
pvo.ticker,
|
||||||
|
pvo.predicted_action,
|
||||||
|
pvo.predicted_confidence,
|
||||||
|
pvo.actual_move_pct,
|
||||||
|
pvo.outcome,
|
||||||
|
pvo.horizon_days,
|
||||||
|
pvo.predicted_at,
|
||||||
|
pvo.evaluated_at,
|
||||||
|
pvo.model_version,
|
||||||
|
ts.trend_direction,
|
||||||
|
ts.trend_strength,
|
||||||
|
ts.contradiction_score,
|
||||||
|
ts.dominant_catalysts,
|
||||||
|
-- Confidence bucket for dashboard grouping
|
||||||
|
CASE
|
||||||
|
WHEN pvo.predicted_confidence >= 0.8 THEN 'high'
|
||||||
|
WHEN pvo.predicted_confidence >= 0.5 THEN 'medium'
|
||||||
|
ELSE 'low'
|
||||||
|
END AS confidence_bucket,
|
||||||
|
-- Direction correctness: did the predicted action match the actual move?
|
||||||
|
CASE
|
||||||
|
WHEN pvo.predicted_action = 'buy' AND pvo.actual_move_pct > 0 THEN true
|
||||||
|
WHEN pvo.predicted_action = 'sell' AND pvo.actual_move_pct < 0 THEN true
|
||||||
|
WHEN pvo.predicted_action IN ('hold', 'watch') THEN NULL
|
||||||
|
ELSE false
|
||||||
|
END AS direction_correct,
|
||||||
|
-- Magnitude of the realized move (absolute value of actual_move_pct)
|
||||||
|
ABS(pvo.actual_move_pct) AS abs_move_pct,
|
||||||
|
pvo.dt
|
||||||
|
FROM
|
||||||
|
lakehouse.stonks.prediction_vs_outcome pvo
|
||||||
|
LEFT JOIN
|
||||||
|
lakehouse.stonks.trade_signals ts
|
||||||
|
ON pvo.recommendation_id = ts.recommendation_id
|
||||||
|
AND pvo.dt = ts.dt;
|
||||||
@@ -0,0 +1,31 @@
|
|||||||
|
-- View: signal_hit_rate
|
||||||
|
-- Daily summary of signal accuracy across all symbols, broken out by model version.
|
||||||
|
-- Designed for the Superset prediction accuracy dashboard.
|
||||||
|
-- Requirements: 10.1, 10.2, 10.3
|
||||||
|
-- Design ref: Section 9.2 (prediction confidence vs realized move)
|
||||||
|
|
||||||
|
CREATE OR REPLACE VIEW lakehouse.stonks.signal_hit_rate AS
|
||||||
|
SELECT
|
||||||
|
pvo.dt,
|
||||||
|
pvo.model_version,
|
||||||
|
COUNT(*) AS total_predictions,
|
||||||
|
COUNT(CASE WHEN pvo.outcome = 'correct' THEN 1 END) AS correct_predictions,
|
||||||
|
COUNT(CASE WHEN pvo.outcome = 'incorrect' THEN 1 END) AS incorrect_predictions,
|
||||||
|
COUNT(CASE WHEN pvo.outcome = 'neutral' THEN 1 END) AS neutral_predictions,
|
||||||
|
-- Hit rate
|
||||||
|
CAST(
|
||||||
|
COUNT(CASE WHEN pvo.outcome = 'correct' THEN 1 END) AS DOUBLE
|
||||||
|
) / NULLIF(COUNT(*), 0) AS hit_rate,
|
||||||
|
-- Average confidence of correct vs incorrect
|
||||||
|
AVG(CASE WHEN pvo.outcome = 'correct' THEN pvo.predicted_confidence END)
|
||||||
|
AS avg_confidence_correct,
|
||||||
|
AVG(CASE WHEN pvo.outcome = 'incorrect' THEN pvo.predicted_confidence END)
|
||||||
|
AS avg_confidence_incorrect,
|
||||||
|
-- Average realized move magnitude
|
||||||
|
AVG(ABS(pvo.actual_move_pct)) AS avg_abs_move_pct,
|
||||||
|
AVG(pvo.actual_move_pct) AS avg_move_pct
|
||||||
|
FROM
|
||||||
|
lakehouse.stonks.prediction_vs_outcome pvo
|
||||||
|
GROUP BY
|
||||||
|
pvo.dt,
|
||||||
|
pvo.model_version;
|
||||||
@@ -24,6 +24,12 @@ pandas>=2.2.0
|
|||||||
# Trino
|
# Trino
|
||||||
trino>=0.330.0
|
trino>=0.330.0
|
||||||
|
|
||||||
|
# Observability
|
||||||
|
prometheus_client>=0.21.0
|
||||||
|
|
||||||
|
# YAML parsing (used by K8s security tests)
|
||||||
|
pyyaml>=6.0.0
|
||||||
|
|
||||||
# Testing
|
# Testing
|
||||||
pytest>=8.0.0
|
pytest>=8.0.0
|
||||||
pytest-asyncio>=0.24.0
|
pytest-asyncio>=0.24.0
|
||||||
|
|||||||
@@ -1 +1,45 @@
|
|||||||
# Ingestion Adapters
|
# Ingestion Adapters
|
||||||
|
from .base import AdapterResult, BaseAdapter
|
||||||
|
from .resilient import ResilientAdapter, RetryConfig, RetryStats, compute_delay
|
||||||
|
from .broker_adapter import (
|
||||||
|
AccountInfo,
|
||||||
|
AlpacaBrokerAdapter,
|
||||||
|
BrokerDataAdapter,
|
||||||
|
OrderEventType,
|
||||||
|
OrderRequest,
|
||||||
|
OrderResponse,
|
||||||
|
OrderSide,
|
||||||
|
OrderStatus,
|
||||||
|
OrderType,
|
||||||
|
PositionInfo,
|
||||||
|
TradingMode,
|
||||||
|
)
|
||||||
|
from .filings_adapter import FilingsDataAdapter, SECEdgarAdapter
|
||||||
|
from .market_adapter import MarketDataAdapter, PolygonMarketAdapter
|
||||||
|
from .news_adapter import NewsDataAdapter, PolygonNewsAdapter
|
||||||
|
|
||||||
|
__all__ = [
|
||||||
|
"AccountInfo",
|
||||||
|
"AdapterResult",
|
||||||
|
"AlpacaBrokerAdapter",
|
||||||
|
"BaseAdapter",
|
||||||
|
"BrokerDataAdapter",
|
||||||
|
"FilingsDataAdapter",
|
||||||
|
"MarketDataAdapter",
|
||||||
|
"NewsDataAdapter",
|
||||||
|
"OrderEventType",
|
||||||
|
"OrderRequest",
|
||||||
|
"OrderResponse",
|
||||||
|
"OrderSide",
|
||||||
|
"OrderStatus",
|
||||||
|
"OrderType",
|
||||||
|
"PolygonMarketAdapter",
|
||||||
|
"PolygonNewsAdapter",
|
||||||
|
"PositionInfo",
|
||||||
|
"ResilientAdapter",
|
||||||
|
"RetryConfig",
|
||||||
|
"RetryStats",
|
||||||
|
"SECEdgarAdapter",
|
||||||
|
"TradingMode",
|
||||||
|
"compute_delay",
|
||||||
|
]
|
||||||
|
|||||||
@@ -1,29 +1,84 @@
|
|||||||
"""Base adapter interface for all external API integrations."""
|
"""Base adapter interface for all external API integrations.
|
||||||
|
|
||||||
|
All ingestion adapters follow the same contract:
|
||||||
|
1. Fetch external payloads for a given ticker/source config.
|
||||||
|
2. Return a structured result with raw bytes, parsed items, and metadata.
|
||||||
|
3. The ingestion worker handles MinIO upload, PostgreSQL metadata, and downstream job emission.
|
||||||
|
|
||||||
|
Requirements: 2.1, 2.2, 2.3, 2.4, 2.5, 3.1, 3.2, 3.3, 3.4
|
||||||
|
"""
|
||||||
from abc import ABC, abstractmethod
|
from abc import ABC, abstractmethod
|
||||||
from dataclasses import dataclass
|
from dataclasses import dataclass, field
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
from typing import Any, Dict, List, Optional
|
from typing import Any
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
class AdapterResult:
|
class AdapterResult:
|
||||||
|
"""Result of a single adapter fetch operation."""
|
||||||
|
|
||||||
source_type: str
|
source_type: str
|
||||||
ticker: str
|
ticker: str
|
||||||
items: List[Dict[str, Any]]
|
items: list[dict[str, Any]]
|
||||||
raw_payload: bytes
|
raw_payload: bytes
|
||||||
content_hash: str
|
content_hash: str
|
||||||
fetched_at: datetime
|
fetched_at: datetime
|
||||||
error: Optional[str] = None
|
error: str | None = None
|
||||||
|
# HTTP metadata for observability
|
||||||
|
http_status: int | None = None
|
||||||
|
response_time_ms: float | None = None
|
||||||
|
# Additional metadata the adapter wants to pass downstream
|
||||||
|
metadata: dict[str, Any] = field(default_factory=dict)
|
||||||
|
|
||||||
|
@property
|
||||||
|
def ok(self) -> bool:
|
||||||
|
"""True if the fetch succeeded without error."""
|
||||||
|
return self.error is None and len(self.items) > 0
|
||||||
|
|
||||||
|
@property
|
||||||
|
def item_count(self) -> int:
|
||||||
|
return len(self.items)
|
||||||
|
|
||||||
|
|
||||||
class BaseAdapter(ABC):
    """Common contract for every ingestion adapter.

    A subclass supplies ``fetch()`` for its particular API and
    ``source_type()`` to name the adapter family. Persistence and downstream
    job emission are left to the ingestion worker.
    """

    @abstractmethod
    async def fetch(self, ticker: str, config: dict[str, Any]) -> AdapterResult:
        """Fetch data for one ticker using the given source configuration.

        Args:
            ticker: The company ticker symbol.
            config: Source-specific configuration from the sources table.

        Returns:
            AdapterResult with raw payload, parsed items, and metadata.
        """
        ...

    @abstractmethod
    def source_type(self) -> str:
        """Return this adapter's source type identifier (e.g. 'market_api')."""
        ...

    def bucket_name(self) -> str:
        """Return the MinIO bucket used for this adapter's raw artifacts.

        Derived from ``source_type()``: every ``_api`` occurrence is removed,
        remaining underscores become hyphens, and the result is prefixed with
        ``stonks-raw-``. Subclasses override this when their bucket does not
        follow that pattern.
        """
        suffix = self.source_type().replace("_api", "").replace("_", "-")
        return "stonks-raw-" + suffix

    def artifact_path(self, ticker: str, document_id: str, now: datetime) -> str:
        """Build the MinIO object key for a raw artifact.

        Pattern (no leading slash):
        {source_type}/{ticker}/{yyyy}/{mm}/{dd}/{document_id}/raw.json
        """
        date_part = now.strftime("%Y/%m/%d")
        segments = (self.source_type(), ticker, date_part, document_id, "raw.json")
        return "/".join(segments)
|
||||||
|
|||||||
@@ -1,9 +1,19 @@
|
|||||||
"""Broker API adapter - paper/live trading, orders, positions, balances."""
|
"""Broker API adapter interface for paper trading and order events.
|
||||||
|
|
||||||
|
The BrokerDataAdapter is the abstract interface for all broker integrations.
|
||||||
|
AlpacaBrokerAdapter is the first concrete implementation, targeting the
|
||||||
|
Alpaca Markets REST API for paper and live trading.
|
||||||
|
|
||||||
|
Requirements: 2.4, 2.5, 8.1, 8.3, 8.5
|
||||||
|
"""
|
||||||
import hashlib
|
import hashlib
|
||||||
import logging
|
import logging
|
||||||
|
import time
|
||||||
import uuid
|
import uuid
|
||||||
from datetime import datetime
|
from abc import ABC, abstractmethod
|
||||||
from typing import Any, Dict, Optional
|
from datetime import datetime, timezone
|
||||||
|
from enum import Enum
|
||||||
|
from typing import Any
|
||||||
|
|
||||||
import httpx
|
import httpx
|
||||||
|
|
||||||
@@ -12,97 +22,584 @@ from .base import AdapterResult, BaseAdapter
|
|||||||
logger = logging.getLogger("broker_adapter")
|
logger = logging.getLogger("broker_adapter")
|
||||||
|
|
||||||
|
|
||||||
class BrokerAdapter(BaseAdapter):
|
# --- Broker-specific enums ---
|
||||||
"""Broker API adapter supporting paper and live modes."""
|
|
||||||
|
|
||||||
def __init__(self, api_key: str = "", api_secret: str = "", base_url: str = "", mode: str = "paper"):
|
|
||||||
self.api_key = api_key
|
class OrderSide(str, Enum):
    """Direction of an order.

    The ``str`` mixin makes members compare equal to their plain-string
    values, and ``.value`` yields the lowercase string.
    """

    BUY = "buy"
    SELL = "sell"
|
||||||
self.mode = mode # paper | live
|
|
||||||
|
|
||||||
|
class OrderType(str, Enum):
    """Execution type of an order (market, limit, stop, or stop-limit).

    Members are ``str`` subclasses, so each compares equal to its
    lowercase string value.
    """

    MARKET = "market"
    LIMIT = "limit"
    STOP = "stop"
    STOP_LIMIT = "stop_limit"
|
||||||
|
|
||||||
|
|
||||||
|
class OrderStatus(str, Enum):
    """Broker-independent state of an order.

    Broker-specific status strings are translated onto these members by the
    concrete adapters. Members are ``str`` subclasses and compare equal to
    their lowercase string values.
    """

    PENDING = "pending"
    SUBMITTED = "submitted"
    ACCEPTED = "accepted"
    PARTIALLY_FILLED = "partially_filled"
    FILLED = "filled"
    CANCELLED = "cancelled"  # note the double-L spelling used throughout this module
    REJECTED = "rejected"
    EXPIRED = "expired"
|
||||||
|
|
||||||
|
|
||||||
|
class TradingMode(str, Enum):
    """Whether an adapter trades against a paper (simulated) or live account.

    Members are ``str`` subclasses and compare equal to "paper" / "live".
    """

    PAPER = "paper"
    LIVE = "live"
|
||||||
|
|
||||||
|
|
||||||
|
class OrderEventType(str, Enum):
    """Kinds of order lifecycle events.

    NOTE(review): not referenced by the adapter code in this module;
    presumably consumed by the order-event/audit pipeline — confirm against
    callers.
    """

    SUBMITTED = "submitted"
    ACCEPTED = "accepted"
    REJECTED = "rejected"
    FILL = "fill"
    PARTIAL_FILL = "partial_fill"
    CANCELLED = "cancelled"
    EXPIRED = "expired"
|
||||||
|
|
||||||
|
|
||||||
|
# --- Data structures ---
|
||||||
|
|
||||||
|
|
||||||
|
class OrderRequest:
    """An order that is about to be sent to a broker.

    When no ``idempotency_key`` is supplied (or it is empty) a random UUID4
    string is generated, so every request always carries a key.
    """

    def __init__(
        self,
        ticker: str,
        side: OrderSide,
        quantity: float,
        order_type: OrderType = OrderType.MARKET,
        limit_price: float | None = None,
        stop_price: float | None = None,
        time_in_force: str = "day",
        idempotency_key: str | None = None,
    ) -> None:
        # What to trade and how much
        self.ticker = ticker
        self.side = side
        self.quantity = quantity
        # How the order should execute
        self.order_type = order_type
        self.limit_price = limit_price
        self.stop_price = stop_price
        self.time_in_force = time_in_force
        # Falsy keys (None or "") are replaced by a fresh UUID4
        self.idempotency_key = idempotency_key if idempotency_key else str(uuid.uuid4())

    def to_dict(self) -> dict[str, Any]:
        """Serialize for audit/persistence.

        Enum fields are emitted as their string values. ``limit_price`` and
        ``stop_price`` appear only when they are not None.
        """
        optional_prices = {
            "limit_price": self.limit_price,
            "stop_price": self.stop_price,
        }
        return {
            "ticker": self.ticker,
            "side": self.side.value,
            "quantity": self.quantity,
            "order_type": self.order_type.value,
            "time_in_force": self.time_in_force,
            "idempotency_key": self.idempotency_key,
            **{key: price for key, price in optional_prices.items() if price is not None},
        }
|
||||||
|
|
||||||
|
|
||||||
|
class OrderResponse:
    """A broker's answer to an order submission, cancellation, or status query.

    ``submitted_at`` defaults to the current UTC time and ``raw_response`` to
    an empty dict when they are not supplied.
    """

    def __init__(
        self,
        broker_order_id: str,
        status: OrderStatus,
        ticker: str,
        side: OrderSide,
        quantity: float,
        filled_quantity: float = 0.0,
        filled_avg_price: float | None = None,
        submitted_at: datetime | None = None,
        raw_response: dict[str, Any] | None = None,
        error: str | None = None,
    ) -> None:
        # Identity and state
        self.broker_order_id = broker_order_id
        self.status = status
        self.error = error
        # What was ordered
        self.ticker = ticker
        self.side = side
        self.quantity = quantity
        # Fill progress
        self.filled_quantity = filled_quantity
        self.filled_avg_price = filled_avg_price
        # Defaults for omitted (falsy) values
        self.submitted_at = submitted_at if submitted_at else datetime.now(timezone.utc)
        self.raw_response = raw_response if raw_response else {}

    @property
    def ok(self) -> bool:
        """True when no error is set and the status is not rejected, cancelled, or expired."""
        if self.error is not None:
            return False
        failed_states = (
            OrderStatus.REJECTED,
            OrderStatus.CANCELLED,
            OrderStatus.EXPIRED,
        )
        return self.status not in failed_states

    def to_dict(self) -> dict[str, Any]:
        """Serialize to a plain dict (enum values as strings, timestamp as ISO 8601).

        ``raw_response`` is not included.
        """
        payload: dict[str, Any] = {}
        payload["broker_order_id"] = self.broker_order_id
        payload["status"] = self.status.value
        payload["ticker"] = self.ticker
        payload["side"] = self.side.value
        payload["quantity"] = self.quantity
        payload["filled_quantity"] = self.filled_quantity
        payload["filled_avg_price"] = self.filled_avg_price
        payload["submitted_at"] = self.submitted_at.isoformat()
        payload["error"] = self.error
        return payload
|
||||||
|
|
||||||
|
|
||||||
|
class PositionInfo:
    """Snapshot of one open position as reported by the broker."""

    def __init__(
        self,
        ticker: str,
        quantity: float,
        avg_entry_price: float,
        current_price: float,
        unrealized_pnl: float,
        market_value: float,
        side: str = "long",
    ) -> None:
        # What is held
        self.ticker = ticker
        self.side = side
        self.quantity = quantity
        # Pricing and valuation
        self.avg_entry_price = avg_entry_price
        self.current_price = current_price
        self.market_value = market_value
        self.unrealized_pnl = unrealized_pnl

    def to_dict(self) -> dict[str, Any]:
        """Return the position as a plain dict keyed by attribute name."""
        exported = (
            "ticker",
            "quantity",
            "avg_entry_price",
            "current_price",
            "unrealized_pnl",
            "market_value",
            "side",
        )
        return {name: getattr(self, name) for name in exported}
|
||||||
|
|
||||||
|
|
||||||
|
class AccountInfo:
    """Summary of a broker account: balances, currency, and trading mode."""

    def __init__(
        self,
        account_id: str,
        buying_power: float,
        cash: float,
        portfolio_value: float,
        currency: str = "USD",
        mode: TradingMode = TradingMode.PAPER,
    ) -> None:
        # Identity
        self.account_id = account_id
        self.mode = mode
        self.currency = currency
        # Balances
        self.cash = cash
        self.buying_power = buying_power
        self.portfolio_value = portfolio_value

    def to_dict(self) -> dict[str, Any]:
        """Return the account summary as a plain dict; ``mode`` is emitted as its string value."""
        return dict(
            account_id=self.account_id,
            buying_power=self.buying_power,
            cash=self.cash,
            portfolio_value=self.portfolio_value,
            currency=self.currency,
            mode=self.mode.value,
        )
|
||||||
|
|
||||||
|
|
||||||
|
# --- Abstract interface ---
|
||||||
|
|
||||||
|
|
||||||
|
class BrokerDataAdapter(BaseAdapter, ABC):
    """Abstract interface for broker API integrations.

    Extends BaseAdapter with broker-specific operations:
    - submit_order: place an order with idempotency key
    - cancel_order: cancel an existing order
    - get_order_status: check order state
    - get_positions: list current positions
    - get_account: retrieve account summary

    All concrete adapters must enforce:
    - Idempotent order submission via idempotency_key (Req 8.5)
    - Paper/live mode separation (Req 8.1)
    - Fail-closed on broker unavailability (Req 8.5)
    """

    def __init__(self, mode: TradingMode = TradingMode.PAPER) -> None:
        """Store the trading mode.

        Args:
            mode: A TradingMode member, or its string value ("paper"/"live").

        Raises:
            ValueError: If ``mode`` is not a valid TradingMode value.
        """
        # Normalise to the enum. A raw string such as "live" compares equal to
        # TradingMode.LIVE (str mixin) but has no ``.value``, so storing it
        # unvalidated yields an adapter that picks the live endpoint and then
        # fails on later ``self._mode.value`` accesses. Unknown values are
        # rejected up front to keep paper/live separation explicit.
        self._mode = TradingMode(mode)

    @property
    def mode(self) -> TradingMode:
        """The trading mode this adapter was constructed with (read-only)."""
        return self._mode

    def source_type(self) -> str:
        """Return the source type identifier shared by all broker adapters."""
        return "broker"

    @abstractmethod
    async def submit_order(self, order: OrderRequest) -> OrderResponse:
        """Submit an order to the broker.

        Must use order.idempotency_key to prevent duplicate submissions.
        Must fail closed if the broker is unavailable or returns ambiguous state.
        """
        ...

    @abstractmethod
    async def cancel_order(self, broker_order_id: str) -> OrderResponse:
        """Cancel an existing order by broker order ID."""
        ...

    @abstractmethod
    async def get_order_status(self, broker_order_id: str) -> OrderResponse:
        """Get the current status of an order."""
        ...

    @abstractmethod
    async def get_positions(self) -> list[PositionInfo]:
        """Get all current positions."""
        ...

    @abstractmethod
    async def get_account(self) -> AccountInfo:
        """Get account summary (balance, buying power, etc.)."""
        ...
|
||||||
|
|
||||||
|
|
||||||
|
# --- Concrete Alpaca implementation ---
|
||||||
|
|
||||||
|
|
||||||
|
class AlpacaBrokerAdapter(BrokerDataAdapter):
    """Concrete broker adapter for the Alpaca Markets REST API.

    Supports:
    - Paper trading via paper-api.alpaca.markets
    - Live trading via api.alpaca.markets
    - Order submission, cancellation, and status
    - Position and account queries

    Config options for fetch():
        endpoint: One of "positions", "orders", "account" (default "positions")

    Error handling: no public method raises on HTTP or network failure.
    Order operations return an OrderResponse with ``error`` set and status
    REJECTED; ``get_positions`` returns an empty list; ``get_account`` returns
    a zeroed AccountInfo; ``fetch`` returns an AdapterResult with ``error`` set.
    """

    PAPER_BASE_URL: str = "https://paper-api.alpaca.markets"
    LIVE_BASE_URL: str = "https://api.alpaca.markets"

    def __init__(
        self,
        api_key: str,
        api_secret: str,
        mode: TradingMode = TradingMode.PAPER,
        base_url: str | None = None,
    ) -> None:
        """Create an adapter bound to one Alpaca account.

        Args:
            api_key: Alpaca API key ID.
            api_secret: Alpaca API secret key.
            mode: Paper or live trading; selects the default base URL.
            base_url: Optional explicit base URL. When given it wins over the
                mode-derived default; trailing slashes are stripped.
        """
        super().__init__(mode=mode)
        self.api_key = api_key
        self.api_secret = api_secret
        if base_url:
            self.base_url = base_url.rstrip("/")
        elif mode == TradingMode.LIVE:
            self.base_url = self.LIVE_BASE_URL
        else:
            self.base_url = self.PAPER_BASE_URL

    def _headers(self) -> dict[str, str]:
        """Return the Alpaca authentication and content-type headers."""
        return {
            "APCA-API-KEY-ID": self.api_key,
            "APCA-API-SECRET-KEY": self.api_secret,
            "Content-Type": "application/json",
        }

    async def fetch(self, ticker: str, config: dict[str, Any]) -> AdapterResult:
        """Fetch a position, order, or account snapshot from Alpaca.

        This satisfies the BaseAdapter contract for the ingestion pipeline:
        the snapshot is returned as raw bytes plus parsed items so it can be
        persisted as a raw artifact.

        Args:
            ticker: Symbol to query (ignored by the "account" endpoint).
            config: Source configuration; ``config["endpoint"]`` selects
                "positions" (default), "orders", or "account".

        Returns:
            AdapterResult. A JSON object becomes a single item, a JSON array
            becomes the item list, anything else yields no items. On any
            failure the result has ``error`` set and no items.
        """
        endpoint = config.get("endpoint", "positions")
        url = self._build_fetch_url(ticker, endpoint)

        async with httpx.AsyncClient(timeout=30) as client:
            t0 = time.monotonic()
            try:
                resp = await client.get(url, headers=self._headers())
                elapsed_ms = (time.monotonic() - t0) * 1000
                resp.raise_for_status()

                raw = resp.content
                data = resp.json()
                content_hash = hashlib.sha256(raw).hexdigest()
                if isinstance(data, dict):
                    items = [data]
                elif isinstance(data, list):
                    items = data
                else:
                    items = []

                return AdapterResult(
                    source_type="broker",
                    ticker=ticker,
                    items=items,
                    raw_payload=raw,
                    content_hash=content_hash,
                    fetched_at=datetime.now(timezone.utc),
                    http_status=resp.status_code,
                    response_time_ms=round(elapsed_ms, 1),
                    metadata={
                        "provider": "alpaca",
                        "mode": self._mode.value,
                        "endpoint": endpoint,
                    },
                )
            except httpx.HTTPStatusError as e:
                elapsed_ms = (time.monotonic() - t0) * 1000
                logger.error("Alpaca HTTP error for %s: %s", ticker, e)
                # Compare against None explicitly instead of relying on the
                # truthiness of a response object.
                response = e.response
                return self._error_result(
                    ticker, str(e), elapsed_ms,
                    http_status=response.status_code if response is not None else None,
                    raw=response.content if response is not None else b"",
                )
            except Exception as e:
                elapsed_ms = (time.monotonic() - t0) * 1000
                logger.error("Alpaca fetch failed for %s: %s", ticker, e)
                return self._error_result(ticker, str(e), elapsed_ms)

    def _build_fetch_url(self, ticker: str, endpoint: str) -> str:
        """Build the URL for a fetch operation.

        The ticker is percent-encoded so that symbols containing reserved
        characters (for example a "/" in a pair symbol, or "&"/"?") cannot
        alter the request path or inject query parameters. Plain tickers
        such as "AAPL" or "BRK.B" are unchanged by the encoding.
        """
        from urllib.parse import quote

        safe_ticker = quote(ticker, safe="")
        if endpoint == "orders":
            return f"{self.base_url}/v2/orders?symbols={safe_ticker}&status=all&limit=50"
        if endpoint == "account":
            return f"{self.base_url}/v2/account"
        # Default: positions for ticker
        return f"{self.base_url}/v2/positions/{safe_ticker}"

    async def submit_order(self, order: OrderRequest) -> OrderResponse:
        """Submit an order to Alpaca with idempotency key.

        The idempotency key is sent as ``client_order_id`` in the order body,
        which is the field Alpaca requires to be unique, so a retried
        submission is refused by the broker instead of creating a second
        order. It is also kept in the ``Idempotency-Key`` header for any
        intermediary that honours it.

        Fails closed: any network error or ambiguous response returns
        a rejected OrderResponse rather than risking duplicate orders.
        """
        if self._mode == TradingMode.LIVE:
            logger.warning("LIVE order submission: %s %s %s", order.side.value, order.quantity, order.ticker)

        payload: dict[str, Any] = {
            "symbol": order.ticker,
            "qty": str(order.quantity),
            "side": order.side.value,
            "type": order.order_type.value,
            "time_in_force": order.time_in_force,
            # Broker-side duplicate protection (see docstring).
            "client_order_id": order.idempotency_key,
        }
        # Price fields are only sent for the order types that use them.
        if order.limit_price is not None and order.order_type in (OrderType.LIMIT, OrderType.STOP_LIMIT):
            payload["limit_price"] = str(order.limit_price)
        if order.stop_price is not None and order.order_type in (OrderType.STOP, OrderType.STOP_LIMIT):
            payload["stop_price"] = str(order.stop_price)

        headers = {**self._headers(), "Idempotency-Key": order.idempotency_key}

        async with httpx.AsyncClient(timeout=30) as client:
            try:
                resp = await client.post(
                    f"{self.base_url}/v2/orders",
                    headers=headers,
                    json=payload,
                )
                resp.raise_for_status()
                data = resp.json()
                return self._parse_order_response(data)
            except httpx.HTTPStatusError as e:
                response = e.response
                error_body = response.text if response is not None else "unknown"
                logger.error("Order rejected by Alpaca: %s", error_body)
                return OrderResponse(
                    broker_order_id="",
                    status=OrderStatus.REJECTED,
                    ticker=order.ticker,
                    side=order.side,
                    quantity=order.quantity,
                    error=(
                        f"HTTP {response.status_code}: {error_body}"
                        if response is not None
                        else str(e)
                    ),
                    raw_response={"error": error_body},
                )
            except Exception as e:
                # Fail closed: treat any unexpected error as rejection
                logger.error("Order submission failed (fail-closed): %s", e)
                return OrderResponse(
                    broker_order_id="",
                    status=OrderStatus.REJECTED,
                    ticker=order.ticker,
                    side=order.side,
                    quantity=order.quantity,
                    error=f"fail-closed: {e}",
                )

    async def cancel_order(self, broker_order_id: str) -> OrderResponse:
        """Cancel an order on Alpaca.

        A 204 response yields a CANCELLED OrderResponse carrying only the
        order ID (ticker, side and quantity are placeholders). Any other
        successful response is parsed as an order object. On failure the
        response has ``error`` set and status REJECTED; that status describes
        the cancel request, not necessarily the order itself.
        """
        async with httpx.AsyncClient(timeout=30) as client:
            try:
                resp = await client.delete(
                    f"{self.base_url}/v2/orders/{broker_order_id}",
                    headers=self._headers(),
                )
                if resp.status_code == 204:
                    return OrderResponse(
                        broker_order_id=broker_order_id,
                        status=OrderStatus.CANCELLED,
                        ticker="",
                        side=OrderSide.BUY,
                        quantity=0,
                    )
                resp.raise_for_status()
                data = resp.json()
                return self._parse_order_response(data)
            except Exception as e:
                logger.error("Cancel failed for %s: %s", broker_order_id, e)
                return OrderResponse(
                    broker_order_id=broker_order_id,
                    status=OrderStatus.REJECTED,
                    ticker="",
                    side=OrderSide.BUY,
                    quantity=0,
                    error=str(e),
                )

    async def get_order_status(self, broker_order_id: str) -> OrderResponse:
        """Get order status from Alpaca.

        On failure the returned OrderResponse has ``error`` set and a
        placeholder REJECTED status; callers should check ``error`` (or
        ``ok``) before trusting ``status``.
        """
        async with httpx.AsyncClient(timeout=30) as client:
            try:
                resp = await client.get(
                    f"{self.base_url}/v2/orders/{broker_order_id}",
                    headers=self._headers(),
                )
                resp.raise_for_status()
                data = resp.json()
                return self._parse_order_response(data)
            except Exception as e:
                logger.error("Get order status failed for %s: %s", broker_order_id, e)
                return OrderResponse(
                    broker_order_id=broker_order_id,
                    status=OrderStatus.REJECTED,
                    ticker="",
                    side=OrderSide.BUY,
                    quantity=0,
                    error=str(e),
                )

    async def get_positions(self) -> list[PositionInfo]:
        """Get all current positions from Alpaca.

        Returns an empty list when the request fails or the payload is not a
        JSON array, so an empty result does not by itself prove the account
        holds no positions.
        """
        async with httpx.AsyncClient(timeout=30) as client:
            try:
                resp = await client.get(
                    f"{self.base_url}/v2/positions",
                    headers=self._headers(),
                )
                resp.raise_for_status()
                data = resp.json()
                if not isinstance(data, list):
                    return []
                return [self._parse_position(p) for p in data if isinstance(p, dict)]
            except Exception as e:
                logger.error("Get positions failed: %s", e)
                return []

    async def get_account(self) -> AccountInfo:
        """Get account summary from Alpaca.

        On any failure a zeroed AccountInfo with an empty ``account_id`` is
        returned instead of raising.
        """
        async with httpx.AsyncClient(timeout=30) as client:
            try:
                resp = await client.get(
                    f"{self.base_url}/v2/account",
                    headers=self._headers(),
                )
                resp.raise_for_status()
                data = resp.json()
                return AccountInfo(
                    account_id=str(data.get("id", "")),
                    buying_power=float(data.get("buying_power", 0)),
                    cash=float(data.get("cash", 0)),
                    portfolio_value=float(data.get("portfolio_value", 0)),
                    currency=str(data.get("currency", "USD")),
                    mode=self._mode,
                )
            except Exception as e:
                logger.error("Get account failed: %s", e)
                return AccountInfo(
                    account_id="",
                    buying_power=0,
                    cash=0,
                    portfolio_value=0,
                    mode=self._mode,
                )

    def _parse_order_response(self, data: dict[str, Any]) -> OrderResponse:
        """Parse an Alpaca order response into an OrderResponse.

        Alpaca status strings are translated to OrderStatus; unknown or
        missing statuses become PENDING. ``done_for_day`` is resolved from
        the reported fill quantities (see the inline comment).
        """
        status_map: dict[str, OrderStatus] = {
            "new": OrderStatus.SUBMITTED,
            "accepted": OrderStatus.ACCEPTED,
            "partially_filled": OrderStatus.PARTIALLY_FILLED,
            "filled": OrderStatus.FILLED,
            "canceled": OrderStatus.CANCELLED,
            "expired": OrderStatus.EXPIRED,
            "replaced": OrderStatus.SUBMITTED,
            "pending_new": OrderStatus.PENDING,
            "pending_cancel": OrderStatus.PENDING,
            "pending_replace": OrderStatus.PENDING,
            "rejected": OrderStatus.REJECTED,
        }

        quantity = float(data.get("qty", 0) or 0)
        filled_qty = float(data.get("filled_qty", 0) or 0)
        filled_avg = data.get("filled_avg_price")
        filled_avg_price = float(filled_avg) if filled_avg else None

        raw_status = str(data.get("status", "pending"))
        if raw_status == "done_for_day":
            # "done_for_day" only says the order stopped executing for the
            # session; it may be unfilled or partly filled. Reporting it as
            # FILLED unconditionally would record fills that never happened,
            # so derive the state from the quantities the broker reports.
            if quantity > 0 and filled_qty >= quantity:
                status = OrderStatus.FILLED
            elif filled_qty > 0:
                status = OrderStatus.PARTIALLY_FILLED
            else:
                status = OrderStatus.ACCEPTED
        else:
            status = status_map.get(raw_status, OrderStatus.PENDING)

        # Anything other than an explicit "sell" is treated as a buy.
        side_str = str(data.get("side", "buy"))
        side = OrderSide.SELL if side_str == "sell" else OrderSide.BUY

        return OrderResponse(
            broker_order_id=str(data.get("id", "")),
            status=status,
            ticker=str(data.get("symbol", "")),
            side=side,
            quantity=quantity,
            filled_quantity=filled_qty,
            filled_avg_price=filled_avg_price,
            raw_response=data,
        )

    def _parse_position(self, data: dict[str, Any]) -> PositionInfo:
        """Parse an Alpaca position response into a PositionInfo.

        Missing or null numeric fields are read as 0.
        """
        return PositionInfo(
            ticker=str(data.get("symbol", "")),
            quantity=float(data.get("qty", 0) or 0),
            avg_entry_price=float(data.get("avg_entry_price", 0) or 0),
            current_price=float(data.get("current_price", 0) or 0),
            unrealized_pnl=float(data.get("unrealized_pl", 0) or 0),
            market_value=float(data.get("market_value", 0) or 0),
            side=str(data.get("side", "long")),
        )

    def _error_result(
        self,
        ticker: str,
        error: str,
        elapsed_ms: float,
        http_status: int | None = None,
        raw: bytes = b"",
    ) -> AdapterResult:
        """Build an error AdapterResult for broker fetches.

        The result has no items and an empty content hash; ``raw`` carries
        the error response body when one is available.
        """
        return AdapterResult(
            source_type="broker",
            ticker=ticker,
            items=[],
            raw_payload=raw,
            content_hash="",
            fetched_at=datetime.now(timezone.utc),
            error=error,
            http_status=http_status,
            response_time_ms=round(elapsed_ms, 1),
            metadata={"provider": "alpaca", "mode": self._mode.value},
        )
|
||||||
|
|||||||
@@ -0,0 +1,832 @@
|
|||||||
|
"""Broker adapter service - standalone worker for sandbox order execution.
|
||||||
|
|
||||||
|
Runs the Alpaca broker adapter in sandbox (paper) mode, processing order
|
||||||
|
requests from the broker queue, evaluating them through the risk engine,
|
||||||
|
submitting to Alpaca's paper trading API, and persisting the full audit trail.
|
||||||
|
|
||||||
|
Also periodically syncs positions and account state from Alpaca.
|
||||||
|
|
||||||
|
Implements idempotent order submission keys and duplicate prevention:
|
||||||
|
- Deterministic idempotency key generation from job attributes
|
||||||
|
- Redis-based fast-path duplicate detection before broker submission
|
||||||
|
- PostgreSQL UNIQUE constraint on idempotency_key as durable fallback
|
||||||
|
|
||||||
|
Requirements: 2.4, 8.1, 8.3, 8.5
|
||||||
|
Design: Section 4.9 - Broker Adapter
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import asyncio
|
||||||
|
import hashlib
|
||||||
|
import json
|
||||||
|
import logging
|
||||||
|
import uuid
|
||||||
|
from datetime import datetime, timezone
|
||||||
|
from typing import Any
|
||||||
|
|
||||||
|
import asyncpg
|
||||||
|
import redis.asyncio as aioredis
|
||||||
|
|
||||||
|
from services.adapters.broker_adapter import (
|
||||||
|
AlpacaBrokerAdapter,
|
||||||
|
OrderRequest,
|
||||||
|
OrderResponse,
|
||||||
|
OrderSide,
|
||||||
|
OrderStatus,
|
||||||
|
OrderType,
|
||||||
|
TradingMode,
|
||||||
|
)
|
||||||
|
from services.risk.engine import (
|
||||||
|
AccountRiskState,
|
||||||
|
PortfolioRiskConfig,
|
||||||
|
ProposedOrder,
|
||||||
|
evaluate_order,
|
||||||
|
)
|
||||||
|
from services.risk.approval import (
|
||||||
|
ApprovalRequest,
|
||||||
|
ApprovalStatus,
|
||||||
|
compute_expiry,
|
||||||
|
create_approval_request,
|
||||||
|
requires_approval,
|
||||||
|
)
|
||||||
|
from services.shared.audit import (
|
||||||
|
audit_approval_requested,
|
||||||
|
audit_duplicate_prevented,
|
||||||
|
audit_order_filled,
|
||||||
|
audit_order_rejected,
|
||||||
|
audit_order_submitted,
|
||||||
|
audit_risk_evaluated,
|
||||||
|
)
|
||||||
|
from services.lake_publisher.worker import (
|
||||||
|
publish_trade_order,
|
||||||
|
publish_trade_fill,
|
||||||
|
publish_positions_daily_batch,
|
||||||
|
LAKEHOUSE_BUCKET,
|
||||||
|
)
|
||||||
|
from services.shared.config import load_config
|
||||||
|
from services.shared.db import get_pg_pool, get_redis
|
||||||
|
from services.shared.logging import Span, new_trace_id, set_trace_context, setup_logging
|
||||||
|
from services.shared.metrics import (
|
||||||
|
ORDERS_DUPLICATES_PREVENTED,
|
||||||
|
ORDERS_FILLED,
|
||||||
|
ORDERS_REJECTED,
|
||||||
|
ORDERS_SUBMITTED,
|
||||||
|
POSITIONS_SYNCED,
|
||||||
|
RISK_CHECK_FAILURES,
|
||||||
|
RISK_EVALUATIONS_TOTAL,
|
||||||
|
)
|
||||||
|
from services.shared.redis_keys import QUEUE_BROKER, queue_key
|
||||||
|
|
||||||
|
# Module-level logger for the broker service.
logger = logging.getLogger("broker_service")

# Pause between two position syncs in position_sync_loop().
POSITION_SYNC_INTERVAL = 60  # seconds

# Redis TTL for idempotency markers (24 hours)
ORDER_IDEMPOTENCY_TTL = 86400
# Prefix of every Redis idempotency marker; the full key is
# "<prefix>:<idempotency_key>" (see _redis_idempotency_key).
ORDER_IDEMPOTENCY_PREFIX = "stonks:order_idempotency"
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# DB persistence helpers
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
# Insert the broker account row; when the id already exists, refresh its
# config and mode and mark it active again.
# Params: id, provider, account_id, mode, config (JSON text).
_UPSERT_BROKER_ACCOUNT = """
INSERT INTO broker_accounts (id, provider, account_id, mode, config, active)
VALUES ($1::uuid, $2, $3, $4, $5::jsonb, TRUE)
ON CONFLICT (id) DO UPDATE SET
config = EXCLUDED.config,
mode = EXCLUDED.mode,
active = TRUE
"""

# Insert an order row. A conflict on idempotency_key does not create a second
# row; it updates status, broker_order_id and the fill columns of the
# existing row and bumps updated_at.
_INSERT_ORDER = """
INSERT INTO orders (
id, recommendation_id, broker_account_id, ticker, side, order_type,
quantity, limit_price, stop_price, status, idempotency_key,
broker_order_id, decision_trace, submitted_at, filled_at,
fill_price, fill_quantity
) VALUES (
$1::uuid, $2, $3::uuid, $4, $5, $6,
$7, $8, $9, $10, $11,
$12, $13::jsonb, $14, $15,
$16, $17
)
ON CONFLICT (idempotency_key) DO UPDATE SET
status = EXCLUDED.status,
broker_order_id = EXCLUDED.broker_order_id,
filled_at = EXCLUDED.filled_at,
fill_price = EXCLUDED.fill_price,
fill_quantity = EXCLUDED.fill_quantity,
updated_at = NOW()
"""

# Append one event for an order.
# Params: order_id, event_type, data (JSON text), broker_timestamp.
_INSERT_ORDER_EVENT = """
INSERT INTO order_events (order_id, event_type, data, broker_timestamp)
VALUES ($1::uuid, $2, $3::jsonb, $4)
"""

# Store the outcome of one risk evaluation.
# rejection_reasons and risk_checks are passed as JSON text.
_INSERT_RISK_EVALUATION = """
INSERT INTO risk_evaluations (id, recommendation_id, eligible, allowed_mode, rejection_reasons, risk_checks, evaluated_at)
VALUES ($1::uuid, $2::uuid, $3, $4, $5::jsonb, $6::jsonb, $7)
"""

# Insert or refresh the position row for (broker_account_id, ticker).
_UPSERT_POSITION = """
INSERT INTO positions (broker_account_id, ticker, quantity, avg_entry_price, current_price, unrealized_pnl, updated_at)
VALUES ($1::uuid, $2, $3, $4, $5, $6, $7)
ON CONFLICT (broker_account_id, ticker)
DO UPDATE SET
quantity = EXCLUDED.quantity,
avg_entry_price = EXCLUDED.avg_entry_price,
current_price = EXCLUDED.current_price,
unrealized_pnl = EXCLUDED.unrealized_pnl,
updated_at = EXCLUDED.updated_at
"""

# Fetch the most recently updated active risk configuration.
_LOAD_RISK_CONFIG = """
SELECT config FROM risk_configs WHERE active = TRUE ORDER BY updated_at DESC LIMIT 1
"""

# Fetch today's risk snapshot (by the database's CURRENT_DATE) for one account.
_LOAD_DAILY_SNAPSHOT = """
SELECT portfolio_value, daily_pnl, daily_trade_count, positions_by_sector
FROM daily_risk_snapshots
WHERE account_id = $1 AND snapshot_date = CURRENT_DATE
LIMIT 1
"""

# Look up an existing order by idempotency key (durable duplicate check).
_CHECK_ORDER_BY_IDEMPOTENCY_KEY = """
SELECT id, status, broker_order_id FROM orders
WHERE idempotency_key = $1
LIMIT 1
"""
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Idempotency helpers (Requirement 8.5)
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
def generate_idempotency_key(job: dict[str, Any]) -> str:
    """Return the idempotency key for a broker queue job.

    A truthy ``idempotency_key`` carried by the job wins and is returned as a
    string. Otherwise the key is the first 40 hex characters of the SHA-256
    of the job's recommendation_id, ticker, side, quantity, order_type,
    limit_price and stop_price joined with ``|``, so a replayed message maps
    to the same key.
    """
    supplied = job.get("idempotency_key")
    if supplied:
        return str(supplied)

    # (field, value used when the job omits the field) in hashing order.
    field_defaults = (
        ("recommendation_id", ""),
        ("ticker", ""),
        ("side", "buy"),
        ("quantity", 0),
        ("order_type", "market"),
        ("limit_price", ""),
        ("stop_price", ""),
    )
    fingerprint = "|".join(
        str(job.get(name, fallback)) for name, fallback in field_defaults
    )
    digest = hashlib.sha256(fingerprint.encode()).hexdigest()
    return digest[:40]
|
||||||
|
|
||||||
|
|
||||||
|
def _redis_idempotency_key(idempotency_key: str) -> str:
    """Return the Redis key under which an order idempotency marker is stored."""
    return "{}:{}".format(ORDER_IDEMPOTENCY_PREFIX, idempotency_key)
|
||||||
|
|
||||||
|
|
||||||
|
async def check_idempotency_redis(
    rds: aioredis.Redis,
    idempotency_key: str,
) -> str | None:
    """Fast-path: check Redis for a previously processed idempotency key.

    Args:
        rds: Async Redis client.
        idempotency_key: Key as produced by generate_idempotency_key().

    Returns:
        The order_id stored under the marker, or None when no marker exists.
    """
    cached = await rds.get(_redis_idempotency_key(idempotency_key))
    if not cached:
        return None
    # A client created without decode_responses=True returns bytes, and
    # str(b"...") would produce the repr "b'...'" instead of the order id.
    if isinstance(cached, (bytes, bytearray)):
        return cached.decode("utf-8")
    return str(cached)
|
||||||
|
|
||||||
|
|
||||||
|
async def check_idempotency_db(
    pool: asyncpg.Pool,
    idempotency_key: str,
) -> dict[str, Any] | None:
    """Durable fallback: look for an order with this key in PostgreSQL.

    Returns None when no order matches; otherwise a dict with the string
    fields ``id``, ``status`` and ``broker_order_id`` (empty string when the
    stored broker order id is null or empty).
    """
    record = await pool.fetchrow(_CHECK_ORDER_BY_IDEMPOTENCY_KEY, idempotency_key)
    if not record:
        return None

    found: dict[str, Any] = {}
    found["id"] = str(record["id"])
    found["status"] = str(record["status"])
    found["broker_order_id"] = str(record["broker_order_id"] or "")
    return found
|
||||||
|
|
||||||
|
|
||||||
|
async def mark_idempotency_redis(
    rds: aioredis.Redis,
    idempotency_key: str,
    order_id: str,
) -> None:
    """Store ``order_id`` under the key's Redis marker with the idempotency TTL."""
    marker = _redis_idempotency_key(idempotency_key)
    await rds.set(marker, order_id, ex=ORDER_IDEMPOTENCY_TTL)
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Core service logic
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
def build_order_request(job: dict[str, Any]) -> OrderRequest:
    """Build an OrderRequest from a broker queue job payload.

    ``side`` and ``order_type`` are matched case-insensitively and ignoring
    surrounding whitespace, so "SELL" is a sell and "Limit" is a limit order
    instead of silently falling back to buy / market. Values that are still
    unrecognized keep the historical fallback (buy / market) but are logged.

    Args:
        job: Decoded queue message. ``ticker`` is required; ``side``,
            ``quantity``, ``order_type``, ``limit_price``, ``stop_price``,
            ``time_in_force`` and ``idempotency_key`` are optional.

    Returns:
        The OrderRequest, carrying the key from generate_idempotency_key().

    Raises:
        KeyError: If the job has no ``ticker``.
    """
    side_name = str(job.get("side") or "buy").strip().lower()
    if side_name not in ("buy", "sell"):
        logger.warning(
            "Unrecognized order side %r for %s; treating as buy",
            job.get("side"), job.get("ticker"),
        )
    side = OrderSide.SELL if side_name == "sell" else OrderSide.BUY

    order_types = {
        "market": OrderType.MARKET,
        "limit": OrderType.LIMIT,
        "stop": OrderType.STOP,
        "stop_limit": OrderType.STOP_LIMIT,
    }
    order_type_name = str(job.get("order_type") or "market").strip().lower()
    order_type = order_types.get(order_type_name)
    if order_type is None:
        logger.warning(
            "Unrecognized order_type %r for %s; treating as market",
            job.get("order_type"), job.get("ticker"),
        )
        order_type = OrderType.MARKET

    return OrderRequest(
        ticker=job["ticker"],
        side=side,
        quantity=float(job.get("quantity", 0)),
        order_type=order_type,
        limit_price=job.get("limit_price"),
        stop_price=job.get("stop_price"),
        time_in_force=job.get("time_in_force", "day"),
        idempotency_key=generate_idempotency_key(job),
    )
|
||||||
|
|
||||||
|
|
||||||
|
def build_proposed_order(job: dict[str, Any]) -> ProposedOrder:
    """Translate a broker queue job into the ProposedOrder the risk engine evaluates."""

    def _number(field: str) -> float:
        # Missing numeric fields count as zero.
        return float(job.get(field, 0))

    return ProposedOrder(
        recommendation_id=job.get("recommendation_id"),
        ticker=job["ticker"],
        sector=job.get("sector", ""),
        action=job.get("side", "buy"),
        quantity=_number("quantity"),
        estimated_value=_number("estimated_value"),
        confidence=_number("confidence"),
    )
|
||||||
|
|
||||||
|
|
||||||
|
async def load_risk_config(pool: asyncpg.Pool) -> PortfolioRiskConfig:
    """Return the active risk configuration, or defaults when none is stored.

    The ``config`` column may arrive as a dict or as JSON text; text is
    parsed before being handed to PortfolioRiskConfig.from_db_json().
    """
    record = await pool.fetchrow(_LOAD_RISK_CONFIG)
    if not record:
        return PortfolioRiskConfig()

    stored = record["config"]
    if not stored:
        return PortfolioRiskConfig()

    if isinstance(stored, dict):
        payload = stored
    else:
        payload = json.loads(stored)
    return PortfolioRiskConfig.from_db_json(payload)
|
||||||
|
|
||||||
|
|
||||||
|
async def load_account_risk_state(
    pool: asyncpg.Pool,
    adapter: AlpacaBrokerAdapter,
    account_uuid: str,
) -> AccountRiskState:
    """Assemble the AccountRiskState used for risk evaluation.

    Combines three sources, in this order: live account figures from the
    broker, live positions from the broker, and today's snapshot row from
    PostgreSQL. A broker call that fails is logged as a warning and leaves
    the corresponding fields at their AccountRiskState defaults.
    """
    risk_state = AccountRiskState(account_id=account_uuid)

    # Live account figures.
    try:
        account = await adapter.get_account()
        risk_state.portfolio_value = account.portfolio_value
        risk_state.cash = account.cash
        risk_state.buying_power = account.buying_power
    except Exception as exc:
        logger.warning("Failed to fetch account from Alpaca: %s", exc)

    # Live positions, keyed by ticker with their market value.
    try:
        open_positions = await adapter.get_positions()
        for position in open_positions:
            risk_state.positions_by_symbol[position.ticker] = position.market_value
        risk_state.open_position_count = len(open_positions)
    except Exception as exc:
        logger.warning("Failed to fetch positions from Alpaca: %s", exc)

    # Daily counters and sector exposure come from the stored snapshot.
    snapshot = await pool.fetchrow(_LOAD_DAILY_SNAPSHOT, account_uuid)
    if not snapshot:
        return risk_state

    risk_state.daily_pnl = float(snapshot["daily_pnl"] or 0)
    risk_state.daily_trade_count = int(snapshot["daily_trade_count"] or 0)
    by_sector = snapshot["positions_by_sector"]
    if by_sector:
        if isinstance(by_sector, dict):
            risk_state.positions_by_sector = by_sector
        else:
            risk_state.positions_by_sector = json.loads(by_sector)
    return risk_state
|
||||||
|
|
||||||
|
|
||||||
|
async def persist_order(
    pool: asyncpg.Pool,
    order_id: str,
    order: OrderRequest,
    resp: OrderResponse,
    account_uuid: str,
    risk_eval: dict[str, Any],
    recommendation_id: str | None = None,
) -> None:
    """Write the order row and its events to PostgreSQL in one transaction.

    The order row stores a decision trace (risk evaluation, order request and
    broker response). A "submitted" event is always recorded; a "fill" or
    "rejected" event follows when the broker response has that status.
    """
    recorded_at = datetime.now(timezone.utc)
    is_filled = resp.status == OrderStatus.FILLED
    fill_timestamp = recorded_at if is_filled else None

    trace = {
        "risk_evaluation": risk_eval,
        "order_request": order.to_dict(),
        "broker_response": resp.to_dict(),
    }

    async with pool.acquire() as conn:
        async with conn.transaction():
            order_row = (
                order_id,
                recommendation_id,
                account_uuid,
                order.ticker,
                order.side.value,
                order.order_type.value,
                order.quantity,
                order.limit_price,
                order.stop_price,
                resp.status.value,
                order.idempotency_key,
                resp.broker_order_id,
                json.dumps(trace),
                resp.submitted_at or recorded_at,
                fill_timestamp,
                resp.filled_avg_price,
                resp.filled_quantity,
            )
            await conn.execute(_INSERT_ORDER, *order_row)

            # Events are inserted in the order they are listed here.
            events = [
                ("submitted", {"ticker": order.ticker, "side": order.side.value}),
            ]
            if is_filled:
                events.append(
                    (
                        "fill",
                        {
                            "fill_price": resp.filled_avg_price,
                            "fill_qty": resp.filled_quantity,
                        },
                    )
                )
            elif resp.status == OrderStatus.REJECTED:
                events.append(("rejected", {"error": resp.error}))

            for event_type, payload in events:
                await conn.execute(
                    _INSERT_ORDER_EVENT,
                    order_id,
                    event_type,
                    json.dumps(payload),
                    recorded_at,
                )
|
||||||
|
|
||||||
|
|
||||||
|
async def sync_positions(
    adapter: AlpacaBrokerAdapter,
    pool: asyncpg.Pool,
    account_uuid: str,
    minio_client: Any | None = None,
) -> None:
    """Copy the broker's current positions into PostgreSQL and the lake.

    Every position is upserted with one shared timestamp. When a MinIO client
    is given and there is at least one position, the same snapshot is also
    published to the analytical lake; a publish failure is only a warning.
    Any other failure is logged as an error and swallowed so the caller's
    loop keeps running.
    """
    synced_at = datetime.now(timezone.utc)
    try:
        broker_positions = await adapter.get_positions()
        async with pool.acquire() as conn:
            for position in broker_positions:
                await conn.execute(
                    _UPSERT_POSITION,
                    account_uuid,
                    position.ticker,
                    position.quantity,
                    position.avg_entry_price,
                    position.current_price,
                    position.unrealized_pnl,
                    synced_at,
                )
        logger.info("Synced %d positions from Alpaca", len(broker_positions))
        POSITIONS_SYNCED.inc()

        # Nothing to publish without a lake client or without positions.
        if minio_client is None or not broker_positions:
            return

        try:
            snapshot_rows = []
            for position in broker_positions:
                snapshot_rows.append(
                    {
                        "ticker": position.ticker,
                        "quantity": position.quantity,
                        "avg_entry_price": position.avg_entry_price,
                        "close_price": position.current_price,
                        "unrealized_pnl": position.unrealized_pnl,
                    }
                )
            publish_positions_daily_batch(
                minio_client, snapshot_rows, account_uuid, synced_at,
            )
        except Exception as exc:
            logger.warning("Failed to publish positions to lake: %s", exc)
    except Exception as exc:
        logger.error("Position sync failed: %s", exc)
|
||||||
|
|
||||||
|
|
||||||
|
async def register_broker_account(
    pool: asyncpg.Pool,
    account_uuid: str,
    adapter: AlpacaBrokerAdapter,
) -> None:
    """Upsert this service's broker account row from live broker data.

    Failures (broker call or database write) are logged as errors and
    swallowed.
    """
    try:
        account = await adapter.get_account()
        account_snapshot = {
            "provider": "alpaca",
            "buying_power": account.buying_power,
            "cash": account.cash,
            "portfolio_value": account.portfolio_value,
        }
        mode_name = adapter.mode.value
        upsert_args = (
            account_uuid,
            "alpaca",
            # Fall back to our own UUID when the broker reports no account id.
            account.account_id or account_uuid,
            mode_name,
            json.dumps(account_snapshot),
        )
        await pool.execute(_UPSERT_BROKER_ACCOUNT, *upsert_args)
        logger.info(
            "Registered Alpaca account: id=%s mode=%s portfolio=%.2f",
            account.account_id, mode_name, account.portfolio_value,
        )
    except Exception as exc:
        logger.error("Failed to register broker account: %s", exc)
|
||||||
|
|
||||||
|
|
||||||
|
async def process_order_job(
    job: dict[str, Any],
    adapter: AlpacaBrokerAdapter,
    pool: asyncpg.Pool,
    account_uuid: str,
    rds: aioredis.Redis | None = None,
    minio_client: Any | None = None,
) -> None:
    """Process a single order job from the broker queue.

    1. Generate deterministic idempotency key
    2. Check Redis + DB for duplicate (Req 8.5)
    3. Build proposed order and run risk evaluation
    4. If risk passes, submit to Alpaca
    5. Persist order, events, and risk evaluation
    6. Set Redis idempotency marker

    Args:
        job: Decoded queue message describing the order.
        adapter: Broker adapter used to read account state and submit the order.
        pool: PostgreSQL pool for persistence and audit records.
        account_uuid: Identifier of the broker account row.
        rds: Optional Redis client; when None the Redis duplicate check and
            marker are skipped and only the database check applies.
        minio_client: Optional lake client; when None nothing is published
            to the analytical lake.

    The function returns early, without contacting the broker, when the job
    is a duplicate, when the risk engine rejects it, or when it is held for
    operator approval.
    """
    ticker = job.get("ticker", "???")
    # A fresh order id is generated for every attempt; duplicates are
    # recognized through the idempotency key, not through this id.
    order_id = str(uuid.uuid4())
    idempotency_key = generate_idempotency_key(job)

    # --- Duplicate prevention (Requirement 8.5) ---
    # Fast path: Redis check
    if rds is not None:
        existing_order_id = await check_idempotency_redis(rds, idempotency_key)
        if existing_order_id:
            logger.info(
                "Duplicate order detected (redis) for %s key=%s existing=%s",
                ticker, idempotency_key[:16], existing_order_id,
            )
            ORDERS_DUPLICATES_PREVENTED.labels(detected_via="redis").inc()
            await audit_duplicate_prevented(
                pool, existing_order_id, ticker, idempotency_key, detected_via="redis",
            )
            return

    # Durable fallback: DB check
    existing = await check_idempotency_db(pool, idempotency_key)
    if existing:
        logger.info(
            "Duplicate order detected (db) for %s key=%s existing=%s status=%s",
            ticker, idempotency_key[:16], existing["id"], existing["status"],
        )
        ORDERS_DUPLICATES_PREVENTED.labels(detected_via="db").inc()
        await audit_duplicate_prevented(
            pool, existing["id"], ticker, idempotency_key, detected_via="db",
        )
        # Warm Redis cache for future fast-path hits
        if rds is not None:
            await mark_idempotency_redis(rds, idempotency_key, existing["id"])
        return

    # Risk evaluation
    risk_config = await load_risk_config(pool)
    risk_state = await load_account_risk_state(pool, adapter, account_uuid)
    proposed = build_proposed_order(job)
    evaluation = evaluate_order(proposed, risk_config, risk_state)

    # JSON-serializable summary of the evaluation; stored with the risk
    # evaluation row and inside the order's decision trace.
    risk_eval_dict = {
        "evaluation_id": evaluation.evaluation_id,
        "eligible": evaluation.eligible,
        "allowed_mode": evaluation.allowed_mode.value,
        "rejection_reasons": evaluation.rejection_reasons,
        "checks": [c.model_dump(mode="json") for c in evaluation.checks],
    }

    # Persist risk evaluation
    rec_id = job.get("recommendation_id")
    try:
        await pool.execute(
            _INSERT_RISK_EVALUATION,
            evaluation.evaluation_id,
            rec_id,
            evaluation.eligible,
            evaluation.allowed_mode.value,
            json.dumps(evaluation.rejection_reasons),
            json.dumps(risk_eval_dict["checks"]),
            evaluation.evaluated_at,
        )
    except Exception as e:
        # Best effort: a failed write here does not stop order processing.
        logger.warning("Failed to persist risk evaluation: %s", e)

    # Audit: risk evaluation result
    await audit_risk_evaluated(
        pool,
        evaluation_id=evaluation.evaluation_id,
        recommendation_id=rec_id,
        ticker=ticker,
        eligible=evaluation.eligible,
        allowed_mode=evaluation.allowed_mode.value,
        rejection_reasons=evaluation.rejection_reasons,
        check_count=len(evaluation.checks),
    )

    if not evaluation.eligible:
        RISK_EVALUATIONS_TOTAL.labels(result="rejected").inc()
        # One failure counter increment per failed check.
        for check in evaluation.checks:
            if check.result.value == "fail":
                RISK_CHECK_FAILURES.labels(check_name=check.check_name).inc()
        ORDERS_REJECTED.labels(reason_category="risk_engine").inc()
        logger.info(
            "Order rejected by risk engine for %s: %s",
            ticker, evaluation.rejection_reasons,
        )
        # Persist the rejected order for audit
        order_req = build_order_request(job)
        # Synthetic broker response: the order never reached the broker, so
        # there is no broker order id.
        rejected_resp = OrderResponse(
            broker_order_id="",
            status=OrderStatus.REJECTED,
            ticker=ticker,
            side=OrderSide.SELL if job.get("side") == "sell" else OrderSide.BUY,
            quantity=float(job.get("quantity", 0)),
            error=f"Risk rejected: {'; '.join(evaluation.rejection_reasons)}",
        )
        await persist_order(
            pool, order_id, order_req, rejected_resp,
            account_uuid, risk_eval_dict, rec_id,
        )
        # Publish rejected order fact to analytical lake
        if minio_client is not None:
            try:
                publish_trade_order(
                    minio_client, order_id, ticker,
                    side=job.get("side", "buy"),
                    order_type=job.get("order_type", "market"),
                    quantity=float(job.get("quantity", 0)),
                    limit_price=job.get("limit_price"),
                    status="rejected",
                    broker_account=account_uuid,
                    submitted_at=datetime.now(timezone.utc),
                )
            except Exception as e:
                logger.warning("Failed to publish rejected order to lake: %s", e)
        # Audit: order rejected by risk engine
        await audit_order_rejected(
            pool, order_id, ticker,
            reason=f"Risk rejected: {'; '.join(evaluation.rejection_reasons)}",
            source="risk_engine",
        )
        # Mark idempotency even for rejected orders to prevent reprocessing
        if rds is not None:
            await mark_idempotency_redis(rds, idempotency_key, order_id)
        return

    # --- Operator approval gate (Requirement 8.2) ---
    if requires_approval(risk_config, evaluation.allowed_mode):
        expiry = compute_expiry(risk_config)
        approval_req = ApprovalRequest(
            order_job=job,
            recommendation_id=rec_id,
            ticker=ticker,
            side=job.get("side", "buy"),
            quantity=float(job.get("quantity", 0)),
            estimated_value=float(job.get("estimated_value", 0)),
            risk_evaluation_id=evaluation.evaluation_id,
            expires_at=expiry,
        )
        try:
            await create_approval_request(pool, approval_req)
            logger.info(
                "Order for %s held for operator approval (id=%s, expires=%s)",
                ticker, approval_req.approval_id, expiry.isoformat(),
            )
            await audit_approval_requested(
                pool,
                approval_id=approval_req.approval_id,
                ticker=ticker,
                side=approval_req.side,
                quantity=approval_req.quantity,
                estimated_value=approval_req.estimated_value,
                recommendation_id=rec_id,
                expires_at=expiry.isoformat(),
            )
        except Exception as e:
            logger.error("Failed to create approval request for %s: %s", ticker, e)
        # Do NOT mark idempotency — the job will be re-submitted after approval
        return

    # Submit to Alpaca
    order_req = build_order_request(job)
    RISK_EVALUATIONS_TOTAL.labels(result="passed").inc()

    # Audit: order submitted to broker
    # (recorded before the broker call is made)
    await audit_order_submitted(
        pool,
        order_id=order_id,
        ticker=ticker,
        side=order_req.side.value,
        quantity=order_req.quantity,
        order_type=order_req.order_type.value,
        idempotency_key=order_req.idempotency_key,
        recommendation_id=rec_id,
        evaluation_id=evaluation.evaluation_id,
    )

    resp = await adapter.submit_order(order_req)

    await persist_order(
        pool, order_id, order_req, resp,
        account_uuid, risk_eval_dict, rec_id,
    )

    # Publish order fact to analytical lake
    if minio_client is not None:
        try:
            publish_trade_order(
                minio_client, order_id, ticker,
                side=order_req.side.value,
                order_type=order_req.order_type.value,
                quantity=order_req.quantity,
                limit_price=order_req.limit_price,
                status=resp.status.value,
                broker_account=account_uuid,
                submitted_at=resp.submitted_at or datetime.now(timezone.utc),
            )
        except Exception as e:
            logger.warning("Failed to publish order to lake: %s", e)

        # Publish fill fact if the order was filled
        if resp.status == OrderStatus.FILLED and resp.filled_avg_price is not None:
            try:
                fill_id = str(uuid.uuid4())
                publish_trade_fill(
                    minio_client, fill_id, order_id, ticker,
                    side=order_req.side.value,
                    fill_price=resp.filled_avg_price,
                    fill_quantity=resp.filled_quantity,
                    broker_account=account_uuid,
                    filled_at=datetime.now(timezone.utc),
                )
            except Exception as e:
                logger.warning("Failed to publish fill to lake: %s", e)

    # Mark idempotency after successful persistence
    if rds is not None:
        await mark_idempotency_redis(rds, idempotency_key, order_id)

    if resp.ok:
        mode = "paper" if adapter.mode == TradingMode.PAPER else "live"
        ORDERS_SUBMITTED.labels(
            side=order_req.side.value,
            order_type=order_req.order_type.value,
            mode=mode,
        ).inc()
        logger.info(
            "Order submitted to Alpaca: %s %s %.0f %s @ %s | broker_id=%s",
            resp.status.value, order_req.side.value, order_req.quantity,
            ticker, resp.filled_avg_price, resp.broker_order_id,
        )
        # Audit: order filled
        if resp.status == OrderStatus.FILLED:
            ORDERS_FILLED.labels(side=order_req.side.value).inc()
            await audit_order_filled(
                pool, order_id, ticker,
                side=order_req.side.value,
                fill_quantity=resp.filled_quantity,
                fill_price=resp.filled_avg_price,
                broker_order_id=resp.broker_order_id,
            )
    else:
        ORDERS_REJECTED.labels(reason_category="broker").inc()
        logger.warning(
            "Order failed for %s: %s (status=%s)",
            ticker, resp.error, resp.status.value,
        )
        # Audit: order rejected by broker
        await audit_order_rejected(
            pool, order_id, ticker,
            reason=resp.error or f"Broker status: {resp.status.value}",
            source="broker",
        )
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
async def position_sync_loop(
    adapter: AlpacaBrokerAdapter,
    pool: asyncpg.Pool,
    account_uuid: str,
    minio_client: Any | None = None,
) -> None:
    """Run sync_positions() forever, pausing POSITION_SYNC_INTERVAL seconds between runs."""

    async def _sync_once() -> None:
        await sync_positions(adapter, pool, account_uuid, minio_client)

    while True:
        await _sync_once()
        await asyncio.sleep(POSITION_SYNC_INTERVAL)
|
||||||
|
|
||||||
|
|
||||||
|
async def main() -> None:
    """Run the broker service until it is cancelled or fails fatally.

    Startup: load configuration, set up logging, open the PostgreSQL pool and
    the Redis client, create the MinIO client (creating the lakehouse bucket
    when it is missing), build the Alpaca adapter, register the broker
    account and start the background position sync task.

    Main loop: pop one job at a time from the broker queue and hand it to
    process_order_job(); sleep two seconds when the queue is empty. A failure
    while handling one job is logged and does not stop the loop.

    Shutdown: cancel the sync task and wait for it to finish, then close the
    pool and the Redis client.
    """
    config = load_config()
    setup_logging("broker_service", level=config.log_level, json_output=config.json_logs)

    pool = await get_pg_pool(config)
    rds = get_redis(config)

    # Initialize MinIO client for lake publishing
    from minio import Minio

    minio_client = Minio(
        config.minio.endpoint,
        access_key=config.minio.access_key,
        secret_key=config.minio.secret_key,
        secure=config.minio.secure,
    )
    # Ensure lakehouse bucket exists
    if not minio_client.bucket_exists(LAKEHOUSE_BUCKET):
        minio_client.make_bucket(LAKEHOUSE_BUCKET)

    # Determine mode — default to paper for safety (Req 8.1)
    mode = TradingMode.LIVE if config.broker.mode == "live" else TradingMode.PAPER
    if mode == TradingMode.LIVE:
        logger.warning("LIVE trading mode enabled — orders will be submitted to real broker")

    adapter = AlpacaBrokerAdapter(
        api_key=config.broker.api_key or "",
        api_secret=config.broker.api_secret or "",
        mode=mode,
        base_url=config.broker.base_url,
    )

    # Generate a stable account UUID from the API key
    account_uuid = str(uuid.uuid5(uuid.NAMESPACE_DNS, f"alpaca-{config.broker.api_key or 'default'}"))

    # Register broker account on startup
    await register_broker_account(pool, account_uuid, adapter)

    # Start position sync in background
    sync_task = asyncio.create_task(
        position_sync_loop(adapter, pool, account_uuid, minio_client)
    )

    queue = queue_key(QUEUE_BROKER)
    logger.info("Broker service started (mode=%s)", mode.value)

    try:
        while True:
            payload = await rds.lpop(queue)
            if not payload:
                await asyncio.sleep(2)
                continue
            try:
                # json.loads accepts str as well as bytes, so the payload is
                # parsed as-is. Wrapping a bytes reply in str() would produce
                # the repr "b'...'" and every job would fail to parse.
                job = json.loads(payload)
                await process_order_job(job, adapter, pool, account_uuid, rds, minio_client)
            except Exception:
                logger.exception("Error processing broker job")
    finally:
        sync_task.cancel()
        # Wait for the cancellation to take effect so the sync task is no
        # longer using the pool when it is closed; return_exceptions keeps
        # the task's CancelledError (or any earlier failure) from masking
        # the original exit reason.
        await asyncio.gather(sync_task, return_exceptions=True)
        try:
            await pool.close()
        finally:
            # Close Redis even when closing the pool fails.
            await rds.close()
|
||||||
|
|
||||||
|
|
||||||
|
# Script entry point: run the service when the module is executed directly.
if __name__ == "__main__":
    asyncio.run(main())
|
||||||
@@ -1,8 +1,17 @@
|
|||||||
"""Filings / Regulatory API adapter - fetches SEC-style submissions."""
|
"""Filings / Regulatory API adapter interface and concrete SEC EDGAR provider.
|
||||||
|
|
||||||
|
The FilingsDataAdapter is the abstract interface for all filings data providers.
|
||||||
|
SECEdgarAdapter is the first concrete implementation, targeting the SEC EDGAR
|
||||||
|
full-text search system (EFTS) for company filings discovery.
|
||||||
|
|
||||||
|
Requirements: 2.3, 2.5, 3.1, 3.2, 3.3
|
||||||
|
"""
|
||||||
import hashlib
|
import hashlib
|
||||||
import logging
|
import logging
|
||||||
from datetime import datetime
|
import time
|
||||||
from typing import Any, Dict
|
from abc import ABC
|
||||||
|
from datetime import datetime, timezone
|
||||||
|
from typing import Any
|
||||||
|
|
||||||
import httpx
|
import httpx
|
||||||
|
|
||||||
@@ -11,48 +20,182 @@ from .base import AdapterResult, BaseAdapter
|
|||||||
logger = logging.getLogger("filings_adapter")
|
logger = logging.getLogger("filings_adapter")
|
||||||
|
|
||||||
|
|
||||||
class FilingsAdapter(BaseAdapter):
|
class FilingsDataAdapter(BaseAdapter, ABC):
|
||||||
"""Concrete adapter for SEC EDGAR or similar filings API."""
|
"""Abstract interface for filings / regulatory data providers.
|
||||||
|
|
||||||
def __init__(self, base_url: str = "https://efts.sec.gov", user_agent: str = "StonksOracle/1.0"):
|
Subclasses implement fetch() for their specific filings API.
|
||||||
self.base_url = base_url
|
source_type() is concrete here since all filings adapters share the same type.
|
||||||
self.user_agent = user_agent
|
"""
|
||||||
|
|
||||||
def source_type(self) -> str:
|
def source_type(self) -> str:
|
||||||
return "filings_api"
|
return "filings_api"
|
||||||
|
|
||||||
async def fetch(self, ticker: str, config: Dict[str, Any]) -> AdapterResult:
|
|
||||||
_cik = config.get("cik", "")
|
|
||||||
endpoint = config.get("endpoint", f"/LATEST/search-index?q=%22{ticker}%22&dateRange=custom&startdt=2026-01-01&forms=8-K,10-Q,10-K")
|
|
||||||
url = f"{self.base_url}{endpoint}"
|
|
||||||
|
|
||||||
headers = {"User-Agent": self.user_agent}
|
class SECEdgarAdapter(FilingsDataAdapter):
|
||||||
|
"""Concrete adapter for the SEC EDGAR full-text search system (EFTS).
|
||||||
|
|
||||||
|
Supports:
|
||||||
|
- Full-text search (/LATEST/search-index) for 8-K, 10-Q, 10-K, and other forms
|
||||||
|
- Filtering by date range, form type, and entity
|
||||||
|
|
||||||
|
The SEC EDGAR EFTS API is public and does not require an API key,
|
||||||
|
but requires a descriptive User-Agent header per SEC fair-access policy.
|
||||||
|
|
||||||
|
Config options:
|
||||||
|
cik: Company CIK number (optional, narrows search)
|
||||||
|
forms: Comma-separated form types to search (default "8-K,10-Q,10-K")
|
||||||
|
start_date: Only filings on or after this date, YYYY-MM-DD (optional)
|
||||||
|
end_date: Only filings on or before this date, YYYY-MM-DD (optional)
|
||||||
|
query: Custom search query override (optional, replaces ticker-based query)
|
||||||
|
"""
|
||||||
|
|
||||||
|
SEARCH_ENDPOINT: str = "/LATEST/search-index"
|
||||||
|
|
||||||
|
def __init__(
|
||||||
|
self,
|
||||||
|
base_url: str = "https://efts.sec.gov",
|
||||||
|
user_agent: str = "StonksOracle/1.0 ([email])",
|
||||||
|
) -> None:
|
||||||
|
self.base_url: str = base_url.rstrip("/")
|
||||||
|
self.user_agent: str = user_agent
|
||||||
|
|
||||||
|
async def fetch(self, ticker: str, config: dict[str, Any]) -> AdapterResult:
|
||||||
|
"""Fetch filings from SEC EDGAR EFTS for a given ticker.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
ticker: The company ticker symbol.
|
||||||
|
config: Source-specific configuration from the sources table.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
AdapterResult with raw payload, parsed filing items, and metadata.
|
||||||
|
"""
|
||||||
|
url, params, headers = self._build_request(ticker, config)
|
||||||
|
|
||||||
async with httpx.AsyncClient(timeout=30) as client:
|
async with httpx.AsyncClient(timeout=30) as client:
|
||||||
|
t0 = time.monotonic()
|
||||||
try:
|
try:
|
||||||
resp = await client.get(url, headers=headers)
|
resp = await client.get(url, params=params, headers=headers)
|
||||||
|
elapsed_ms = (time.monotonic() - t0) * 1000
|
||||||
resp.raise_for_status()
|
resp.raise_for_status()
|
||||||
|
|
||||||
raw = resp.content
|
raw = resp.content
|
||||||
data = resp.json()
|
data = resp.json()
|
||||||
content_hash = hashlib.sha256(raw).hexdigest()
|
content_hash = hashlib.sha256(raw).hexdigest()
|
||||||
|
items = self._extract_items(data)
|
||||||
|
|
||||||
hits = data.get("hits", {}).get("hits", [])
|
|
||||||
return AdapterResult(
|
return AdapterResult(
|
||||||
source_type="filings_api",
|
source_type="filings_api",
|
||||||
ticker=ticker,
|
ticker=ticker,
|
||||||
items=hits,
|
items=items,
|
||||||
raw_payload=raw,
|
raw_payload=raw,
|
||||||
content_hash=content_hash,
|
content_hash=content_hash,
|
||||||
fetched_at=datetime.utcnow(),
|
fetched_at=datetime.now(timezone.utc),
|
||||||
|
http_status=resp.status_code,
|
||||||
|
response_time_ms=round(elapsed_ms, 1),
|
||||||
|
metadata={
|
||||||
|
"provider": "sec_edgar",
|
||||||
|
"results_count": len(items),
|
||||||
|
"total_hits": self._total_hits(data),
|
||||||
|
"query": params.get("q", ""),
|
||||||
|
"forms": params.get("forms", ""),
|
||||||
|
},
|
||||||
)
|
)
|
||||||
|
except httpx.HTTPStatusError as e:
|
||||||
|
elapsed_ms = (time.monotonic() - t0) * 1000
|
||||||
|
logger.error("SEC EDGAR HTTP error for %s: %s", ticker, e)
|
||||||
|
return self._error_result(
|
||||||
|
ticker, str(e), elapsed_ms,
|
||||||
|
http_status=e.response.status_code if e.response else None,
|
||||||
|
raw=e.response.content if e.response else b"",
|
||||||
|
)
|
||||||
|
except httpx.TimeoutException as e:
|
||||||
|
elapsed_ms = (time.monotonic() - t0) * 1000
|
||||||
|
logger.error("SEC EDGAR timeout for %s: %s", ticker, e)
|
||||||
|
return self._error_result(ticker, f"timeout: {e}", elapsed_ms)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(f"Filings fetch failed for {ticker}: {e}")
|
elapsed_ms = (time.monotonic() - t0) * 1000
|
||||||
return AdapterResult(
|
logger.error("SEC EDGAR fetch failed for %s: %s", ticker, e)
|
||||||
source_type="filings_api",
|
return self._error_result(ticker, str(e), elapsed_ms)
|
||||||
ticker=ticker,
|
|
||||||
items=[],
|
def _build_request(
|
||||||
raw_payload=b"",
|
self, ticker: str, config: dict[str, Any]
|
||||||
content_hash="",
|
) -> tuple[str, dict[str, str], dict[str, str]]:
|
||||||
fetched_at=datetime.utcnow(),
|
"""Build the URL, query params, and headers for an EDGAR EFTS request."""
|
||||||
error=str(e),
|
params: dict[str, str] = {}
|
||||||
)
|
headers: dict[str, str] = {"User-Agent": self.user_agent}
|
||||||
|
|
||||||
|
# Query: use custom override or default to ticker-based search
|
||||||
|
query = config.get("query")
|
||||||
|
if query:
|
||||||
|
params["q"] = str(query)
|
||||||
|
else:
|
||||||
|
params["q"] = f'"{ticker}"'
|
||||||
|
|
||||||
|
# Form types filter
|
||||||
|
forms = config.get("forms", "8-K,10-Q,10-K")
|
||||||
|
params["forms"] = str(forms)
|
||||||
|
|
||||||
|
# Date range
|
||||||
|
if config.get("start_date"):
|
||||||
|
params["dateRange"] = "custom"
|
||||||
|
params["startdt"] = str(config["start_date"])
|
||||||
|
if config.get("end_date"):
|
||||||
|
params["dateRange"] = "custom"
|
||||||
|
params["enddt"] = str(config["end_date"])
|
||||||
|
|
||||||
|
# CIK filter (entity-level narrowing)
|
||||||
|
cik = config.get("cik")
|
||||||
|
if cik:
|
||||||
|
params["q"] = f'{params["q"]} AND cik:{cik}'
|
||||||
|
|
||||||
|
url = f"{self.base_url}{self.SEARCH_ENDPOINT}"
|
||||||
|
return url, params, headers
|
||||||
|
|
||||||
|
def _extract_items(self, data: dict[str, Any]) -> list[dict[str, Any]]:
|
||||||
|
"""Extract the filing hits from an EDGAR EFTS response.
|
||||||
|
|
||||||
|
EFTS returns results under hits.hits as a list of objects,
|
||||||
|
each containing _source with fields like file_date, form_type,
|
||||||
|
entity_name, file_num, and period_of_report.
|
||||||
|
"""
|
||||||
|
hits_wrapper = data.get("hits", {})
|
||||||
|
if not isinstance(hits_wrapper, dict):
|
||||||
|
return []
|
||||||
|
hits = hits_wrapper.get("hits", [])
|
||||||
|
if isinstance(hits, list):
|
||||||
|
return hits
|
||||||
|
return []
|
||||||
|
|
||||||
|
def _total_hits(self, data: dict[str, Any]) -> int:
|
||||||
|
"""Extract total hit count from EFTS response."""
|
||||||
|
hits_wrapper = data.get("hits", {})
|
||||||
|
if not isinstance(hits_wrapper, dict):
|
||||||
|
return 0
|
||||||
|
total = hits_wrapper.get("total", {})
|
||||||
|
if isinstance(total, dict):
|
||||||
|
return int(total.get("value", 0))
|
||||||
|
if isinstance(total, int):
|
||||||
|
return total
|
||||||
|
return 0
|
||||||
|
|
||||||
|
def _error_result(
    self,
    ticker: str,
    error: str,
    elapsed_ms: float,
    http_status: int | None = None,
    raw: bytes = b"",
) -> AdapterResult:
    """Package a failed filings fetch as an ``AdapterResult``.

    Args:
        ticker: Symbol the failed request was made for.
        error: Failure description stored on the result.
        elapsed_ms: Time spent on the request in milliseconds; recorded
            rounded to one decimal place.
        http_status: Status code of the response, when one was received.
        raw: Response body bytes, when one was received.

    Returns:
        An ``AdapterResult`` with no items, an empty content hash, the
        current UTC time as ``fetched_at`` and ``"sec_edgar"`` as the
        metadata provider.
    """
    failed_at = datetime.now(timezone.utc)
    duration_ms = round(elapsed_ms, 1)
    failure_fields = {
        "source_type": "filings_api",
        "ticker": ticker,
        "items": [],
        "raw_payload": raw,
        "content_hash": "",
        "fetched_at": failed_at,
        "error": error,
        "http_status": http_status,
        "response_time_ms": duration_ms,
        "metadata": {"provider": "sec_edgar"},
    }
    return AdapterResult(**failure_fields)
|
||||||
|
|||||||
@@ -1,8 +1,16 @@
|
|||||||
"""Market data API adapter - fetches quotes, bars, and reference data."""
|
"""Market data API adapter interface and concrete Polygon.io provider.
|
||||||
|
|
||||||
|
The MarketDataAdapter is the abstract interface for all market data providers.
|
||||||
|
PolygonMarketAdapter is the first concrete implementation, targeting the
|
||||||
|
Polygon.io REST API for previous-day bars, quotes, and ticker details.
|
||||||
|
|
||||||
|
Requirements: 2.1, 2.5, 3.1, 3.2, 3.3
|
||||||
|
"""
|
||||||
import hashlib
|
import hashlib
|
||||||
import logging
|
import logging
|
||||||
from datetime import datetime
|
import time
|
||||||
from typing import Any, Dict
|
from datetime import datetime, timezone
|
||||||
|
from typing import Any
|
||||||
|
|
||||||
import httpx
|
import httpx
|
||||||
|
|
||||||
@@ -12,48 +20,158 @@ logger = logging.getLogger("market_adapter")
|
|||||||
|
|
||||||
|
|
||||||
class MarketDataAdapter(BaseAdapter):
|
class MarketDataAdapter(BaseAdapter):
|
||||||
"""Concrete adapter for a market data provider (e.g., Alpha Vantage, Polygon, Yahoo)."""
|
"""Abstract interface for market data providers.
|
||||||
|
|
||||||
def __init__(self, api_key: str = "", base_url: str = ""):
|
Subclasses implement fetch() for their specific market data API.
|
||||||
self.api_key = api_key
|
"""
|
||||||
self.base_url = base_url
|
|
||||||
|
|
||||||
def source_type(self) -> str:
|
def source_type(self) -> str:
|
||||||
return "market_api"
|
return "market_api"
|
||||||
|
|
||||||
async def fetch(self, ticker: str, config: Dict[str, Any]) -> AdapterResult:
|
|
||||||
endpoint = config.get("endpoint", "/v2/aggs/ticker/{ticker}/prev")
|
class PolygonMarketAdapter(MarketDataAdapter):
|
||||||
url = f"{self.base_url}{endpoint.format(ticker=ticker)}"
|
"""Concrete adapter for the Polygon.io REST API.
|
||||||
params = config.get("params", {})
|
|
||||||
if self.api_key:
|
Supports:
|
||||||
params["apiKey"] = self.api_key
|
- Previous-day aggregate bars (/v2/aggs/ticker/{ticker}/prev)
|
||||||
|
- Grouped daily bars (/v2/aggs/grouped/locale/us/market/stocks/{date})
|
||||||
|
- Ticker details (/v3/reference/tickers/{ticker})
|
||||||
|
|
||||||
|
The endpoint is selected via the source config's "endpoint" field,
|
||||||
|
defaulting to previous-day bars.
|
||||||
|
"""
|
||||||
|
|
||||||
|
PREV_BARS = "/v2/aggs/ticker/{ticker}/prev"
|
||||||
|
RANGE_BARS = "/v2/aggs/ticker/{ticker}/range/{multiplier}/{timespan}/{from_date}/{to_date}"
|
||||||
|
TICKER_DETAILS = "/v3/reference/tickers/{ticker}"
|
||||||
|
|
||||||
|
def __init__(self, api_key: str, base_url: str = "https://api.polygon.io") -> None:
|
||||||
|
self.api_key: str = api_key
|
||||||
|
self.base_url: str = base_url.rstrip("/")
|
||||||
|
|
||||||
|
async def fetch(self, ticker: str, config: dict[str, Any]) -> AdapterResult:
|
||||||
|
"""Fetch market data from Polygon.io for a given ticker.
|
||||||
|
|
||||||
|
Config options:
|
||||||
|
endpoint: One of "prev_bars" (default), "range_bars", "ticker_details"
|
||||||
|
multiplier: Bar multiplier for range queries (default 1)
|
||||||
|
timespan: Bar timespan for range queries (default "day")
|
||||||
|
from_date: Start date for range queries (YYYY-MM-DD)
|
||||||
|
to_date: End date for range queries (YYYY-MM-DD)
|
||||||
|
adjusted: Whether bars are adjusted for splits (default true)
|
||||||
|
"""
|
||||||
|
endpoint_key = config.get("endpoint", "prev_bars")
|
||||||
|
url, params = self._build_request(ticker, endpoint_key, config)
|
||||||
|
|
||||||
async with httpx.AsyncClient(timeout=30) as client:
|
async with httpx.AsyncClient(timeout=30) as client:
|
||||||
|
t0 = time.monotonic()
|
||||||
try:
|
try:
|
||||||
resp = await client.get(url, params=params)
|
resp = await client.get(url, params=params)
|
||||||
|
elapsed_ms = (time.monotonic() - t0) * 1000
|
||||||
resp.raise_for_status()
|
resp.raise_for_status()
|
||||||
|
|
||||||
raw = resp.content
|
raw = resp.content
|
||||||
data = resp.json()
|
data = resp.json()
|
||||||
content_hash = hashlib.sha256(raw).hexdigest()
|
content_hash = hashlib.sha256(raw).hexdigest()
|
||||||
|
items = self._extract_items(data, endpoint_key)
|
||||||
items = data.get("results", [data]) if isinstance(data, dict) else data
|
|
||||||
|
|
||||||
return AdapterResult(
|
return AdapterResult(
|
||||||
source_type="market_api",
|
source_type="market_api",
|
||||||
ticker=ticker,
|
ticker=ticker,
|
||||||
items=items if isinstance(items, list) else [items],
|
items=items,
|
||||||
raw_payload=raw,
|
raw_payload=raw,
|
||||||
content_hash=content_hash,
|
content_hash=content_hash,
|
||||||
fetched_at=datetime.utcnow(),
|
fetched_at=datetime.now(timezone.utc),
|
||||||
|
http_status=resp.status_code,
|
||||||
|
response_time_ms=round(elapsed_ms, 1),
|
||||||
|
metadata={
|
||||||
|
"provider": "polygon",
|
||||||
|
"endpoint": endpoint_key,
|
||||||
|
"results_count": data.get("resultsCount", len(items)),
|
||||||
|
"request_id": data.get("request_id", ""),
|
||||||
|
},
|
||||||
)
|
)
|
||||||
|
except httpx.HTTPStatusError as e:
|
||||||
|
elapsed_ms = (time.monotonic() - t0) * 1000
|
||||||
|
logger.error("Polygon HTTP error for %s: %s", ticker, e)
|
||||||
|
return self._error_result(
|
||||||
|
ticker, str(e), elapsed_ms,
|
||||||
|
http_status=e.response.status_code if e.response else None,
|
||||||
|
raw=e.response.content if e.response else b"",
|
||||||
|
)
|
||||||
|
except httpx.TimeoutException as e:
|
||||||
|
elapsed_ms = (time.monotonic() - t0) * 1000
|
||||||
|
logger.error("Polygon timeout for %s: %s", ticker, e)
|
||||||
|
return self._error_result(ticker, f"timeout: {e}", elapsed_ms)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(f"Market fetch failed for {ticker}: {e}")
|
elapsed_ms = (time.monotonic() - t0) * 1000
|
||||||
return AdapterResult(
|
logger.error("Polygon fetch failed for %s: %s", ticker, e)
|
||||||
source_type="market_api",
|
return self._error_result(ticker, str(e), elapsed_ms)
|
||||||
ticker=ticker,
|
|
||||||
items=[],
|
def _build_request(
|
||||||
raw_payload=b"",
|
self, ticker: str, endpoint_key: str, config: dict[str, Any]
|
||||||
content_hash="",
|
) -> tuple[str, dict[str, str]]:
|
||||||
fetched_at=datetime.utcnow(),
|
"""Build the URL and query params for a Polygon request."""
|
||||||
error=str(e),
|
params: dict[str, str] = {"apiKey": self.api_key}
|
||||||
)
|
|
||||||
|
if endpoint_key == "range_bars":
|
||||||
|
multiplier = str(config.get("multiplier", 1))
|
||||||
|
timespan = config.get("timespan", "day")
|
||||||
|
from_date = config.get("from_date", "")
|
||||||
|
to_date = config.get("to_date", "")
|
||||||
|
path = self.RANGE_BARS.format(
|
||||||
|
ticker=ticker,
|
||||||
|
multiplier=multiplier,
|
||||||
|
timespan=timespan,
|
||||||
|
from_date=from_date,
|
||||||
|
to_date=to_date,
|
||||||
|
)
|
||||||
|
if config.get("adjusted") is not None:
|
||||||
|
params["adjusted"] = str(config["adjusted"]).lower()
|
||||||
|
if config.get("sort"):
|
||||||
|
params["sort"] = config["sort"]
|
||||||
|
if config.get("limit"):
|
||||||
|
params["limit"] = str(config["limit"])
|
||||||
|
elif endpoint_key == "ticker_details":
|
||||||
|
path = self.TICKER_DETAILS.format(ticker=ticker)
|
||||||
|
else:
|
||||||
|
# Default: previous-day bars
|
||||||
|
path = self.PREV_BARS.format(ticker=ticker)
|
||||||
|
if config.get("adjusted") is not None:
|
||||||
|
params["adjusted"] = str(config["adjusted"]).lower()
|
||||||
|
|
||||||
|
return f"{self.base_url}{path}", params
|
||||||
|
|
||||||
|
def _extract_items(self, data: dict[str, Any], endpoint_key: str) -> list[dict[str, Any]]:
|
||||||
|
"""Extract the relevant items list from a Polygon response."""
|
||||||
|
if endpoint_key == "ticker_details":
|
||||||
|
results = data.get("results", {})
|
||||||
|
return [results] if isinstance(results, dict) and results else []
|
||||||
|
|
||||||
|
# Aggregate endpoints return results as a list
|
||||||
|
results = data.get("results", [])
|
||||||
|
if isinstance(results, list):
|
||||||
|
return results
|
||||||
|
return [results] if results else []
|
||||||
|
|
||||||
|
def _error_result(
    self,
    ticker: str,
    error: str,
    elapsed_ms: float,
    http_status: int | None = None,
    raw: bytes = b"",
) -> AdapterResult:
    """Package a failed market-data fetch as an ``AdapterResult``.

    Args:
        ticker: Symbol the failed request was made for.
        error: Failure description stored on the result.
        elapsed_ms: Time spent on the request in milliseconds; recorded
            rounded to one decimal place.
        http_status: Status code of the response, when one was received.
        raw: Response body bytes, when one was received.

    Returns:
        An ``AdapterResult`` with no items, an empty content hash, the
        current UTC time as ``fetched_at`` and ``"polygon"`` as the
        metadata provider.
    """
    failed_at = datetime.now(timezone.utc)
    duration_ms = round(elapsed_ms, 1)
    failure_fields = {
        "source_type": "market_api",
        "ticker": ticker,
        "items": [],
        "raw_payload": raw,
        "content_hash": "",
        "fetched_at": failed_at,
        "error": error,
        "http_status": http_status,
        "response_time_ms": duration_ms,
        "metadata": {"provider": "polygon"},
    }
    return AdapterResult(**failure_fields)
|
||||||
|
|||||||
@@ -1,8 +1,17 @@
|
|||||||
"""News API adapter - fetches company-linked headlines and article metadata."""
|
"""News API adapter interface and concrete Polygon.io news provider.
|
||||||
|
|
||||||
|
The NewsDataAdapter is the abstract interface for all news data providers.
|
||||||
|
PolygonNewsAdapter is the first concrete implementation, targeting the
|
||||||
|
Polygon.io REST API for company-linked news articles and headlines.
|
||||||
|
|
||||||
|
Requirements: 2.2, 2.5, 3.1, 3.2, 3.3
|
||||||
|
"""
|
||||||
import hashlib
|
import hashlib
|
||||||
import logging
|
import logging
|
||||||
from datetime import datetime
|
import time
|
||||||
from typing import Any, Dict
|
from abc import ABC
|
||||||
|
from datetime import datetime, timezone
|
||||||
|
from typing import Any
|
||||||
|
|
||||||
import httpx
|
import httpx
|
||||||
|
|
||||||
@@ -11,51 +20,147 @@ from .base import AdapterResult, BaseAdapter
|
|||||||
logger = logging.getLogger("news_adapter")
|
logger = logging.getLogger("news_adapter")
|
||||||
|
|
||||||
|
|
||||||
class NewsApiAdapter(BaseAdapter):
|
class NewsDataAdapter(BaseAdapter, ABC):
|
||||||
"""Concrete adapter for a news API provider."""
|
"""Abstract interface for news data providers.
|
||||||
|
|
||||||
def __init__(self, api_key: str = "", base_url: str = ""):
|
Subclasses implement fetch() for their specific news API.
|
||||||
self.api_key = api_key
|
source_type() is concrete here since all news adapters share the same type.
|
||||||
self.base_url = base_url
|
"""
|
||||||
|
|
||||||
def source_type(self) -> str:
|
def source_type(self) -> str:
|
||||||
return "news_api"
|
return "news_api"
|
||||||
|
|
||||||
async def fetch(self, ticker: str, config: Dict[str, Any]) -> AdapterResult:
|
|
||||||
endpoint = config.get("endpoint", "/v2/everything")
|
class PolygonNewsAdapter(NewsDataAdapter):
|
||||||
url = f"{self.base_url}{endpoint}"
|
"""Concrete adapter for the Polygon.io ticker news endpoint.
|
||||||
params = config.get("params", {})
|
|
||||||
params.setdefault("q", ticker)
|
Supports:
|
||||||
params.setdefault("sortBy", "publishedAt")
|
- Ticker news (/v2/reference/news?ticker={ticker})
|
||||||
params.setdefault("pageSize", 20)
|
|
||||||
if self.api_key:
|
Config options:
|
||||||
params["apiKey"] = self.api_key
|
limit: Max articles to return per request (default 20, max 1000)
|
||||||
|
published_utc_gte: Only articles published on or after this date (YYYY-MM-DD)
|
||||||
|
published_utc_lte: Only articles published on or before this date (YYYY-MM-DD)
|
||||||
|
order: Sort order for results, "asc" or "desc" (default "desc")
|
||||||
|
"""
|
||||||
|
|
||||||
|
NEWS_ENDPOINT = "/v2/reference/news"
|
||||||
|
|
||||||
|
def __init__(self, api_key: str, base_url: str = "https://api.polygon.io") -> None:
|
||||||
|
self.api_key: str = api_key
|
||||||
|
self.base_url: str = base_url.rstrip("/")
|
||||||
|
|
||||||
|
async def fetch(self, ticker: str, config: dict[str, Any]) -> AdapterResult:
|
||||||
|
"""Fetch news articles from Polygon.io for a given ticker.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
ticker: The company ticker symbol.
|
||||||
|
config: Source-specific configuration from the sources table.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
AdapterResult with raw payload, parsed article items, and metadata.
|
||||||
|
"""
|
||||||
|
url, params = self._build_request(ticker, config)
|
||||||
|
|
||||||
async with httpx.AsyncClient(timeout=30) as client:
|
async with httpx.AsyncClient(timeout=30) as client:
|
||||||
|
t0 = time.monotonic()
|
||||||
try:
|
try:
|
||||||
resp = await client.get(url, params=params)
|
resp = await client.get(url, params=params)
|
||||||
|
elapsed_ms = (time.monotonic() - t0) * 1000
|
||||||
resp.raise_for_status()
|
resp.raise_for_status()
|
||||||
|
|
||||||
raw = resp.content
|
raw = resp.content
|
||||||
data = resp.json()
|
data = resp.json()
|
||||||
content_hash = hashlib.sha256(raw).hexdigest()
|
content_hash = hashlib.sha256(raw).hexdigest()
|
||||||
|
items = self._extract_items(data)
|
||||||
|
|
||||||
articles = data.get("articles", [])
|
|
||||||
return AdapterResult(
|
return AdapterResult(
|
||||||
source_type="news_api",
|
source_type="news_api",
|
||||||
ticker=ticker,
|
ticker=ticker,
|
||||||
items=articles,
|
items=items,
|
||||||
raw_payload=raw,
|
raw_payload=raw,
|
||||||
content_hash=content_hash,
|
content_hash=content_hash,
|
||||||
fetched_at=datetime.utcnow(),
|
fetched_at=datetime.now(timezone.utc),
|
||||||
|
http_status=resp.status_code,
|
||||||
|
response_time_ms=round(elapsed_ms, 1),
|
||||||
|
metadata={
|
||||||
|
"provider": "polygon",
|
||||||
|
"results_count": data.get("count", len(items)),
|
||||||
|
"next_url": data.get("next_url", ""),
|
||||||
|
"request_id": data.get("request_id", ""),
|
||||||
|
},
|
||||||
)
|
)
|
||||||
|
except httpx.HTTPStatusError as e:
|
||||||
|
elapsed_ms = (time.monotonic() - t0) * 1000
|
||||||
|
logger.error("Polygon news HTTP error for %s: %s", ticker, e)
|
||||||
|
return self._error_result(
|
||||||
|
ticker, str(e), elapsed_ms,
|
||||||
|
http_status=e.response.status_code if e.response else None,
|
||||||
|
raw=e.response.content if e.response else b"",
|
||||||
|
)
|
||||||
|
except httpx.TimeoutException as e:
|
||||||
|
elapsed_ms = (time.monotonic() - t0) * 1000
|
||||||
|
logger.error("Polygon news timeout for %s: %s", ticker, e)
|
||||||
|
return self._error_result(ticker, f"timeout: {e}", elapsed_ms)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(f"News fetch failed for {ticker}: {e}")
|
elapsed_ms = (time.monotonic() - t0) * 1000
|
||||||
return AdapterResult(
|
logger.error("Polygon news fetch failed for %s: %s", ticker, e)
|
||||||
source_type="news_api",
|
return self._error_result(ticker, str(e), elapsed_ms)
|
||||||
ticker=ticker,
|
|
||||||
items=[],
|
def _build_request(
|
||||||
raw_payload=b"",
|
self, ticker: str, config: dict[str, Any]
|
||||||
content_hash="",
|
) -> tuple[str, dict[str, str]]:
|
||||||
fetched_at=datetime.utcnow(),
|
"""Build the URL and query params for a Polygon news request."""
|
||||||
error=str(e),
|
params: dict[str, str] = {
|
||||||
)
|
"apiKey": self.api_key,
|
||||||
|
"ticker": ticker,
|
||||||
|
}
|
||||||
|
|
||||||
|
limit = config.get("limit", 20)
|
||||||
|
params["limit"] = str(min(int(limit), 1000))
|
||||||
|
|
||||||
|
if config.get("order"):
|
||||||
|
params["order"] = config["order"]
|
||||||
|
|
||||||
|
if config.get("published_utc_gte"):
|
||||||
|
params["published_utc.gte"] = config["published_utc_gte"]
|
||||||
|
|
||||||
|
if config.get("published_utc_lte"):
|
||||||
|
params["published_utc.lte"] = config["published_utc_lte"]
|
||||||
|
|
||||||
|
url = f"{self.base_url}{self.NEWS_ENDPOINT}"
|
||||||
|
return url, params
|
||||||
|
|
||||||
|
def _extract_items(self, data: dict[str, Any]) -> list[dict[str, Any]]:
|
||||||
|
"""Extract the article list from a Polygon news response.
|
||||||
|
|
||||||
|
Polygon returns articles under the "results" key as a list of objects,
|
||||||
|
each containing fields like id, publisher, title, article_url, tickers,
|
||||||
|
published_utc, description, and keywords.
|
||||||
|
"""
|
||||||
|
results = data.get("results", [])
|
||||||
|
if isinstance(results, list):
|
||||||
|
return results
|
||||||
|
return []
|
||||||
|
|
||||||
|
def _error_result(
    self,
    ticker: str,
    error: str,
    elapsed_ms: float,
    http_status: int | None = None,
    raw: bytes = b"",
) -> AdapterResult:
    """Package a failed news fetch as an ``AdapterResult``.

    Args:
        ticker: Symbol the failed request was made for.
        error: Failure description stored on the result.
        elapsed_ms: Time spent on the request in milliseconds; recorded
            rounded to one decimal place.
        http_status: Status code of the response, when one was received.
        raw: Response body bytes, when one was received.

    Returns:
        An ``AdapterResult`` with no items, an empty content hash, the
        current UTC time as ``fetched_at`` and ``"polygon"`` as the
        metadata provider.
    """
    failed_at = datetime.now(timezone.utc)
    duration_ms = round(elapsed_ms, 1)
    failure_fields = {
        "source_type": "news_api",
        "ticker": ticker,
        "items": [],
        "raw_payload": raw,
        "content_hash": "",
        "fetched_at": failed_at,
        "error": error,
        "http_status": http_status,
        "response_time_ms": duration_ms,
        "metadata": {"provider": "polygon"},
    }
    return AdapterResult(**failure_fields)
|
||||||
|
|||||||
@@ -0,0 +1,603 @@
|
|||||||
|
"""Paper trading adapter - local order simulation and state sync.
|
||||||
|
|
||||||
|
Implements a fully local paper trading engine that simulates order
|
||||||
|
execution without requiring a real broker API. Tracks positions,
|
||||||
|
account balance, fills, and order events in-memory with PostgreSQL
|
||||||
|
persistence for state sync and audit trail.
|
||||||
|
|
||||||
|
Requirements: 8.1, 8.3, 8.5, 2.4
|
||||||
|
Design: Section 4.9 - Broker Adapter (paper mode)
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import json
|
||||||
|
import logging
|
||||||
|
import uuid
|
||||||
|
from datetime import datetime, timezone
|
||||||
|
from typing import Any
|
||||||
|
|
||||||
|
import asyncpg
|
||||||
|
|
||||||
|
from services.adapters.broker_adapter import (
|
||||||
|
AccountInfo,
|
||||||
|
BrokerDataAdapter,
|
||||||
|
OrderEventType,
|
||||||
|
OrderRequest,
|
||||||
|
OrderResponse,
|
||||||
|
OrderSide,
|
||||||
|
OrderStatus,
|
||||||
|
OrderType,
|
||||||
|
PositionInfo,
|
||||||
|
TradingMode,
|
||||||
|
)
|
||||||
|
from services.adapters.base import AdapterResult
|
||||||
|
|
||||||
|
logger = logging.getLogger("paper_trading")
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# In-memory paper trading state
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
class PaperPosition:
    """Long-only holding of a single ticker inside the paper account.

    Instances are plain mutable records: ``ticker``, ``quantity`` (shares
    held), ``avg_entry_price`` (average cost per held share) and
    ``realized_pnl`` (running total of profit/loss booked by sells).
    """

    def __init__(
        self,
        ticker: str,
        quantity: float = 0.0,
        avg_entry_price: float = 0.0,
        realized_pnl: float = 0.0,
    ) -> None:
        """Create a position, by default flat with no booked profit or loss."""
        self.ticker = ticker
        self.quantity = quantity
        self.avg_entry_price = avg_entry_price
        self.realized_pnl = realized_pnl

    def apply_fill(self, side: OrderSide, fill_qty: float, fill_price: float) -> float:
        """Update the position for one fill and return the PnL it realized.

        Args:
            side: ``OrderSide.BUY`` adds to the position; any other side is
                handled as a sell.
            fill_qty: Number of shares filled.
            fill_price: Price per share of the fill.

        Returns:
            Profit or loss booked by this fill. Always 0.0 for buys and for
            sells made while nothing is held.
        """
        if side == OrderSide.BUY:
            # Blend the existing cost basis with the new shares.
            cost_basis = self.avg_entry_price * self.quantity + fill_price * fill_qty
            self.quantity += fill_qty
            if self.quantity > 0:
                self.avg_entry_price = cost_basis / self.quantity
            return 0.0

        if not self.quantity > 0:
            # Nothing held: a sell leaves the position untouched.
            return 0.0

        # A sell can close at most what is currently held.
        closed_qty = min(fill_qty, self.quantity)
        booked = closed_qty * (fill_price - self.avg_entry_price)
        self.quantity -= closed_qty
        self.realized_pnl += booked
        if self.quantity <= 0:
            # Fully closed: reset to a clean flat state.
            self.quantity = 0.0
            self.avg_entry_price = 0.0
        return booked

    @property
    def is_open(self) -> bool:
        """Whether any shares are currently held."""
        return self.quantity > 0

    def to_position_info(self, current_price: float | None = None) -> PositionInfo:
        """Build the ``PositionInfo`` view of this position.

        Args:
            current_price: Price used to mark the position. When omitted,
                the average entry price is used, which makes the unrealized
                PnL zero.

        Returns:
            A ``PositionInfo`` whose unrealized PnL and market value are
            rounded to four decimal places and whose side is ``"long"``
            while shares are held and ``"flat"`` otherwise.
        """
        mark = self.avg_entry_price if current_price is None else current_price
        if self.quantity > 0:
            open_pnl = (mark - self.avg_entry_price) * self.quantity
            direction = "long"
        else:
            open_pnl = 0.0
            direction = "flat"
        return PositionInfo(
            ticker=self.ticker,
            quantity=self.quantity,
            avg_entry_price=self.avg_entry_price,
            current_price=mark,
            unrealized_pnl=round(open_pnl, 4),
            market_value=round(mark * self.quantity, 4),
            side=direction,
        )
|
||||||
|
|
||||||
|
|
||||||
|
class PaperAccount:
    """Mutable in-memory state of one paper trading account.

    Holds the cash balance, per-ticker positions, submitted orders, the
    order event log and the idempotency keys already seen.
    """

    def __init__(
        self,
        account_id: str = "paper-default",
        initial_cash: float = 100_000.0,
    ) -> None:
        """Create an account holding only cash.

        Args:
            account_id: Identifier reported for the account.
            initial_cash: Starting cash balance.
        """
        self.account_id = account_id
        self.initial_cash = initial_cash
        self.cash = initial_cash
        self.positions: dict[str, PaperPosition] = {}
        self.orders: dict[str, OrderResponse] = {}
        self.order_events: list[dict[str, Any]] = []
        # Maps an idempotency key to the id of the order it produced.
        self._seen_idempotency_keys: dict[str, str] = {}

    @property
    def portfolio_value(self) -> float:
        """Cash plus open positions valued at their average entry price."""
        held_value = 0
        for position in self.positions.values():
            if position.is_open:
                held_value += position.quantity * position.avg_entry_price
        return self.cash + held_value

    @property
    def buying_power(self) -> float:
        """Funds available for new orders; equal to the cash balance."""
        return self.cash

    def get_position(self, ticker: str) -> PaperPosition:
        """Return the position for *ticker*, registering an empty one if absent."""
        try:
            return self.positions[ticker]
        except KeyError:
            created = self.positions[ticker] = PaperPosition(ticker=ticker)
            return created

    def to_account_info(self) -> AccountInfo:
        """Build the ``AccountInfo`` view, with amounts rounded to two decimals."""
        rounded_power = round(self.buying_power, 2)
        rounded_cash = round(self.cash, 2)
        rounded_total = round(self.portfolio_value, 2)
        return AccountInfo(
            account_id=self.account_id,
            buying_power=rounded_power,
            cash=rounded_cash,
            portfolio_value=rounded_total,
            currency="USD",
            mode=TradingMode.PAPER,
        )
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Paper trading adapter
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
class PaperTradingAdapter(BrokerDataAdapter):
|
||||||
|
"""Local paper trading adapter that simulates order execution.
|
||||||
|
|
||||||
|
All orders are filled immediately at the estimated price (market orders)
|
||||||
|
or at the limit/stop price when applicable. No real broker API is called.
|
||||||
|
|
||||||
|
Features:
|
||||||
|
- Idempotent order submission via idempotency_key (Req 8.5)
|
||||||
|
- Full order event trail for audit (Req 8.3)
|
||||||
|
- Position tracking with average entry price
|
||||||
|
- Cash balance management
|
||||||
|
- State sync to/from PostgreSQL
|
||||||
|
|
||||||
|
The adapter operates in PAPER mode only and rejects any attempt
|
||||||
|
to switch to LIVE mode.
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(
|
||||||
|
self,
|
||||||
|
account_id: str = "paper-default",
|
||||||
|
initial_cash: float = 100_000.0,
|
||||||
|
simulated_slippage_pct: float = 0.001,
|
||||||
|
) -> None:
|
||||||
|
super().__init__(mode=TradingMode.PAPER)
|
||||||
|
self.account = PaperAccount(account_id=account_id, initial_cash=initial_cash)
|
||||||
|
self.slippage_pct = simulated_slippage_pct
|
||||||
|
|
||||||
|
    def source_type(self) -> str:
        """Return the source-type identifier of this adapter, the string "broker"."""
        return "broker"
|
||||||
|
|
||||||
|
async def fetch(self, ticker: str, config: dict[str, Any]) -> AdapterResult:
|
||||||
|
"""Fetch paper positions/account as a raw artifact snapshot."""
|
||||||
|
endpoint = config.get("endpoint", "positions")
|
||||||
|
now = datetime.now(timezone.utc)
|
||||||
|
|
||||||
|
if endpoint == "account":
|
||||||
|
data = self.account.to_account_info().to_dict()
|
||||||
|
items = [data]
|
||||||
|
elif endpoint == "orders":
|
||||||
|
items = [
|
||||||
|
resp.to_dict()
|
||||||
|
for resp in self.account.orders.values()
|
||||||
|
if resp.ticker == ticker or ticker == "*"
|
||||||
|
]
|
||||||
|
else:
|
||||||
|
pos = self.account.get_position(ticker)
|
||||||
|
data = pos.to_position_info().to_dict()
|
||||||
|
items = [data] if pos.is_open else []
|
||||||
|
|
||||||
|
raw = json.dumps(items).encode()
|
||||||
|
return AdapterResult(
|
||||||
|
source_type="broker",
|
||||||
|
ticker=ticker,
|
||||||
|
items=items,
|
||||||
|
raw_payload=raw,
|
||||||
|
content_hash="",
|
||||||
|
fetched_at=now,
|
||||||
|
metadata={"provider": "paper", "mode": "paper", "endpoint": endpoint},
|
||||||
|
)
|
||||||
|
|
||||||
|
async def submit_order(self, order: OrderRequest) -> OrderResponse:
|
||||||
|
"""Simulate order submission and immediate fill.
|
||||||
|
|
||||||
|
Idempotency: if the same idempotency_key was already used,
|
||||||
|
return the original response (Req 8.5).
|
||||||
|
"""
|
||||||
|
# Idempotency check
|
||||||
|
existing_id = self.account._seen_idempotency_keys.get(order.idempotency_key)
|
||||||
|
if existing_id and existing_id in self.account.orders:
|
||||||
|
logger.info("Duplicate order key %s — returning cached response", order.idempotency_key)
|
||||||
|
return self.account.orders[existing_id]
|
||||||
|
|
||||||
|
now = datetime.now(timezone.utc)
|
||||||
|
order_id = str(uuid.uuid4())
|
||||||
|
|
||||||
|
# Determine fill price based on order type
|
||||||
|
fill_price = self._compute_fill_price(order)
|
||||||
|
|
||||||
|
# Check if we have enough cash for buys
|
||||||
|
if order.side == OrderSide.BUY:
|
||||||
|
required_cash = fill_price * order.quantity
|
||||||
|
if required_cash > self.account.cash:
|
||||||
|
resp = OrderResponse(
|
||||||
|
broker_order_id=order_id,
|
||||||
|
status=OrderStatus.REJECTED,
|
||||||
|
ticker=order.ticker,
|
||||||
|
side=order.side,
|
||||||
|
quantity=order.quantity,
|
||||||
|
submitted_at=now,
|
||||||
|
error=f"Insufficient cash: need {required_cash:.2f}, have {self.account.cash:.2f}",
|
||||||
|
)
|
||||||
|
self._record_event(order_id, OrderEventType.REJECTED, resp.to_dict(), now)
|
||||||
|
self.account.orders[order_id] = resp
|
||||||
|
self.account._seen_idempotency_keys[order.idempotency_key] = order_id
|
||||||
|
return resp
|
||||||
|
|
||||||
|
# Check if we have enough shares for sells
|
||||||
|
if order.side == OrderSide.SELL:
|
||||||
|
pos = self.account.get_position(order.ticker)
|
||||||
|
if pos.quantity < order.quantity:
|
||||||
|
resp = OrderResponse(
|
||||||
|
broker_order_id=order_id,
|
||||||
|
status=OrderStatus.REJECTED,
|
||||||
|
ticker=order.ticker,
|
||||||
|
side=order.side,
|
||||||
|
quantity=order.quantity,
|
||||||
|
submitted_at=now,
|
||||||
|
error=f"Insufficient shares: need {order.quantity}, have {pos.quantity}",
|
||||||
|
)
|
||||||
|
self._record_event(order_id, OrderEventType.REJECTED, resp.to_dict(), now)
|
||||||
|
self.account.orders[order_id] = resp
|
||||||
|
self.account._seen_idempotency_keys[order.idempotency_key] = order_id
|
||||||
|
return resp
|
||||||
|
|
||||||
|
# Simulate immediate fill
|
||||||
|
position = self.account.get_position(order.ticker)
|
||||||
|
realized_pnl = position.apply_fill(order.side, order.quantity, fill_price)
|
||||||
|
|
||||||
|
# Update cash
|
||||||
|
if order.side == OrderSide.BUY:
|
||||||
|
self.account.cash -= fill_price * order.quantity
|
||||||
|
else:
|
||||||
|
self.account.cash += fill_price * order.quantity
|
||||||
|
|
||||||
|
resp = OrderResponse(
|
||||||
|
broker_order_id=order_id,
|
||||||
|
status=OrderStatus.FILLED,
|
||||||
|
ticker=order.ticker,
|
||||||
|
side=order.side,
|
||||||
|
quantity=order.quantity,
|
||||||
|
filled_quantity=order.quantity,
|
||||||
|
filled_avg_price=fill_price,
|
||||||
|
submitted_at=now,
|
||||||
|
raw_response={
|
||||||
|
"realized_pnl": round(realized_pnl, 4),
|
||||||
|
"cash_after": round(self.account.cash, 2),
|
||||||
|
"position_qty_after": position.quantity,
|
||||||
|
"simulated": True,
|
||||||
|
},
|
||||||
|
)
|
||||||
|
|
||||||
|
# Record events
|
||||||
|
self._record_event(order_id, OrderEventType.SUBMITTED, {"ticker": order.ticker}, now)
|
||||||
|
self._record_event(order_id, OrderEventType.ACCEPTED, {"ticker": order.ticker}, now)
|
||||||
|
self._record_event(order_id, OrderEventType.FILL, {
|
||||||
|
"fill_price": fill_price,
|
||||||
|
"fill_qty": order.quantity,
|
||||||
|
"realized_pnl": round(realized_pnl, 4),
|
||||||
|
}, now)
|
||||||
|
|
||||||
|
self.account.orders[order_id] = resp
|
||||||
|
self.account._seen_idempotency_keys[order.idempotency_key] = order_id
|
||||||
|
|
||||||
|
logger.info(
|
||||||
|
"Paper fill: %s %s %.0f %s @ %.2f | cash=%.2f pnl=%.4f",
|
||||||
|
order_id[:8], order.side.value, order.quantity,
|
||||||
|
order.ticker, fill_price, self.account.cash, realized_pnl,
|
||||||
|
)
|
||||||
|
|
||||||
|
return resp
|
||||||
|
|
||||||
|
async def cancel_order(self, broker_order_id: str) -> OrderResponse:
|
||||||
|
"""Cancel a paper order. Only pending orders can be cancelled."""
|
||||||
|
existing = self.account.orders.get(broker_order_id)
|
||||||
|
if existing is None:
|
||||||
|
return OrderResponse(
|
||||||
|
broker_order_id=broker_order_id,
|
||||||
|
status=OrderStatus.REJECTED,
|
||||||
|
ticker="",
|
||||||
|
side=OrderSide.BUY,
|
||||||
|
quantity=0,
|
||||||
|
error=f"Order {broker_order_id} not found",
|
||||||
|
)
|
||||||
|
|
||||||
|
# Paper orders fill immediately, so they can't be cancelled
|
||||||
|
if existing.status == OrderStatus.FILLED:
|
||||||
|
return OrderResponse(
|
||||||
|
broker_order_id=broker_order_id,
|
||||||
|
status=OrderStatus.REJECTED,
|
||||||
|
ticker=existing.ticker,
|
||||||
|
side=existing.side,
|
||||||
|
quantity=existing.quantity,
|
||||||
|
error="Cannot cancel a filled order",
|
||||||
|
)
|
||||||
|
|
||||||
|
now = datetime.now(timezone.utc)
|
||||||
|
cancelled = OrderResponse(
|
||||||
|
broker_order_id=broker_order_id,
|
||||||
|
status=OrderStatus.CANCELLED,
|
||||||
|
ticker=existing.ticker,
|
||||||
|
side=existing.side,
|
||||||
|
quantity=existing.quantity,
|
||||||
|
submitted_at=existing.submitted_at,
|
||||||
|
)
|
||||||
|
self.account.orders[broker_order_id] = cancelled
|
||||||
|
self._record_event(broker_order_id, OrderEventType.CANCELLED, {}, now)
|
||||||
|
return cancelled
|
||||||
|
|
||||||
|
async def get_order_status(self, broker_order_id: str) -> OrderResponse:
|
||||||
|
"""Get the status of a paper order."""
|
||||||
|
existing = self.account.orders.get(broker_order_id)
|
||||||
|
if existing is None:
|
||||||
|
return OrderResponse(
|
||||||
|
broker_order_id=broker_order_id,
|
||||||
|
status=OrderStatus.REJECTED,
|
||||||
|
ticker="",
|
||||||
|
side=OrderSide.BUY,
|
||||||
|
quantity=0,
|
||||||
|
error=f"Order {broker_order_id} not found",
|
||||||
|
)
|
||||||
|
return existing
|
||||||
|
|
||||||
|
async def get_positions(self) -> list[PositionInfo]:
|
||||||
|
"""Get all open paper positions."""
|
||||||
|
return [
|
||||||
|
p.to_position_info()
|
||||||
|
for p in self.account.positions.values()
|
||||||
|
if p.is_open
|
||||||
|
]
|
||||||
|
|
||||||
|
    async def get_account(self) -> AccountInfo:
        """Get paper account summary.

        Delegates to ``self.account.to_account_info()``.
        """
        return self.account.to_account_info()
|
||||||
|
|
||||||
|
# -----------------------------------------------------------------------
|
||||||
|
# Internal helpers
|
||||||
|
# -----------------------------------------------------------------------
|
||||||
|
|
||||||
|
def _compute_fill_price(self, order: OrderRequest) -> float:
|
||||||
|
"""Determine the simulated fill price for an order.
|
||||||
|
|
||||||
|
Market orders use the limit_price as a proxy (or 0 if not set).
|
||||||
|
Limit orders fill at the limit price.
|
||||||
|
Stop orders fill at the stop price.
|
||||||
|
A small slippage is applied to market orders.
|
||||||
|
"""
|
||||||
|
if order.order_type == OrderType.LIMIT and order.limit_price is not None:
|
||||||
|
return order.limit_price
|
||||||
|
if order.order_type == OrderType.STOP and order.stop_price is not None:
|
||||||
|
return order.stop_price
|
||||||
|
if order.order_type == OrderType.STOP_LIMIT and order.limit_price is not None:
|
||||||
|
return order.limit_price
|
||||||
|
|
||||||
|
# Market order: use limit_price as estimate, or a default
|
||||||
|
base_price = order.limit_price if order.limit_price is not None else 100.0
|
||||||
|
if order.side == OrderSide.BUY:
|
||||||
|
return round(base_price * (1 + self.slippage_pct), 4)
|
||||||
|
return round(base_price * (1 - self.slippage_pct), 4)
|
||||||
|
|
||||||
|
def _record_event(
|
||||||
|
self,
|
||||||
|
order_id: str,
|
||||||
|
event_type: OrderEventType,
|
||||||
|
data: dict[str, Any],
|
||||||
|
timestamp: datetime,
|
||||||
|
) -> None:
|
||||||
|
"""Record an order event for audit trail."""
|
||||||
|
self.account.order_events.append({
|
||||||
|
"order_id": order_id,
|
||||||
|
"event_type": event_type.value,
|
||||||
|
"data": data,
|
||||||
|
"timestamp": timestamp.isoformat(),
|
||||||
|
})
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# State sync: persist and restore paper trading state to/from PostgreSQL
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
# SQL for persisting paper orders to the orders table.
# Takes 17 positional parameters in column order; a row whose
# idempotency_key already exists is left untouched (DO NOTHING), so a
# later status change of the same order is not written by this statement.
_INSERT_PAPER_ORDER = """
INSERT INTO orders (
    id, recommendation_id, broker_account_id, ticker, side, order_type,
    quantity, limit_price, stop_price, status, idempotency_key,
    broker_order_id, decision_trace, submitted_at, filled_at,
    fill_price, fill_quantity
) VALUES (
    $1::uuid, $2, $3, $4, $5, $6,
    $7, $8, $9, $10, $11,
    $12, $13::jsonb, $14, $15,
    $16, $17
)
ON CONFLICT (idempotency_key) DO NOTHING
"""

# Appends one audit event for an order. There is no conflict clause, so
# executing it twice for the same event inserts two rows.
_INSERT_PAPER_ORDER_EVENT = """
INSERT INTO order_events (order_id, event_type, data, broker_timestamp)
VALUES ($1::uuid, $2, $3::jsonb, $4)
"""

# Inserts a position, or overwrites quantity / average entry price /
# realized PnL / updated_at when (broker_account_id, ticker) already exists.
_UPSERT_PAPER_POSITION = """
INSERT INTO positions (broker_account_id, ticker, quantity, avg_entry_price, realized_pnl, updated_at)
VALUES ($1, $2, $3, $4, $5, $6)
ON CONFLICT (broker_account_id, ticker)
DO UPDATE SET
    quantity = EXCLUDED.quantity,
    avg_entry_price = EXCLUDED.avg_entry_price,
    realized_pnl = EXCLUDED.realized_pnl,
    updated_at = EXCLUDED.updated_at
"""

# Creates the paper broker account row (provider and mode are both the
# literal 'paper'), or refreshes its config JSON and re-activates it when
# the id already exists.
_UPSERT_PAPER_ACCOUNT = """
INSERT INTO broker_accounts (id, provider, account_id, mode, config, active)
VALUES ($1::uuid, 'paper', $2, 'paper', $3::jsonb, TRUE)
ON CONFLICT (id) DO UPDATE SET
    config = EXCLUDED.config,
    active = TRUE
"""
|
||||||
|
|
||||||
|
# Open positions for a paper account, looked up by its textual account_id ($1).
# Position rows are keyed by the broker_accounts UUID (that is what
# sync_state_to_db writes into broker_account_id), while load_state_from_db
# binds the human-readable account_id — so the UUID is resolved through
# broker_accounts first, the same way _LOAD_PAPER_ORDERS does it. The
# active-row filter matches the one used by _LOAD_PAPER_ACCOUNT_CONFIG.
_LOAD_PAPER_POSITIONS = """
SELECT ticker, quantity, avg_entry_price, COALESCE(realized_pnl, 0) AS realized_pnl
FROM positions
WHERE broker_account_id = (
    SELECT id FROM broker_accounts
    WHERE account_id = $1 AND mode = 'paper' AND active = TRUE
    LIMIT 1
)
AND quantity > 0
"""
|
||||||
|
|
||||||
|
# Config JSON of the active paper account with the given textual
# account_id ($1); at most one row is returned.
_LOAD_PAPER_ACCOUNT_CONFIG = """
SELECT config FROM broker_accounts
WHERE account_id = $1 AND mode = 'paper' AND active = TRUE
LIMIT 1
"""

# The 500 most recently submitted orders of a paper account, newest first.
# $1 is the textual account_id; it is resolved to the broker_accounts id
# by the subquery.
# NOTE(review): load_state_from_db does not execute this query — confirm
# whether order history is meant to be restored as well.
_LOAD_PAPER_ORDERS = """
SELECT
    id, ticker, side, order_type, quantity, status,
    idempotency_key, broker_order_id, fill_price, fill_quantity,
    submitted_at
FROM orders
WHERE broker_account_id = (
    SELECT id FROM broker_accounts WHERE account_id = $1 AND mode = 'paper' LIMIT 1
)
ORDER BY submitted_at DESC
LIMIT 500
"""
|
||||||
|
|
||||||
|
|
||||||
|
async def sync_state_to_db(
    adapter: PaperTradingAdapter,
    pool: asyncpg.Pool,
    broker_account_uuid: str | None = None,
) -> None:
    """Persist the current paper trading state to PostgreSQL.

    Writes, inside a single transaction:
    - broker_accounts row for the paper account
    - positions rows for every tracked position (closed ones included)
    - orders rows for all orders (idempotent via ON CONFLICT)
    - order_events for audit trail

    This enables state recovery after restarts and provides the
    full execution audit trail (Requirement 8.3).

    The in-memory state is snapshotted before the first ``await``, so
    orders submitted while the sync is running neither break the iteration
    nor lose their audit events: only events that were actually written
    are removed from the in-memory buffer afterwards.

    Args:
        adapter: Adapter whose in-memory account is persisted.
        pool: asyncpg pool used for the write transaction.
        broker_account_uuid: UUID of the broker_accounts row. When omitted,
            a deterministic uuid5 derived from the account id is used.
    """
    acct = adapter.account
    now = datetime.now(timezone.utc)
    acct_uuid = broker_account_uuid or str(uuid.uuid5(uuid.NAMESPACE_DNS, acct.account_id))

    # Take a consistent snapshot up front. Iterating the live dicts across
    # awaits would raise "dictionary changed size during iteration" as soon
    # as another task submits an order mid-sync.
    config_json = json.dumps({
        "initial_cash": acct.initial_cash,
        "current_cash": round(acct.cash, 2),
        "portfolio_value": round(acct.portfolio_value, 2),
        "slippage_pct": adapter.slippage_pct,
    })
    positions = [
        (ticker, pos.quantity, pos.avg_entry_price, pos.realized_pnl)
        for ticker, pos in acct.positions.items()
    ]
    orders = list(acct.orders.items())
    events = list(acct.order_events)

    async with pool.acquire() as conn:
        async with conn.transaction():
            # 1. Upsert broker account
            await conn.execute(_UPSERT_PAPER_ACCOUNT, acct_uuid, acct.account_id, config_json)

            # 2. Upsert positions
            for ticker, quantity, avg_entry_price, realized_pnl in positions:
                await conn.execute(
                    _UPSERT_PAPER_POSITION,
                    acct_uuid, ticker,
                    quantity, avg_entry_price, realized_pnl,
                    now,
                )

            # 3. Insert orders (idempotent)
            for order_id, resp in orders:
                filled_at = now if resp.status == OrderStatus.FILLED else None
                await conn.execute(
                    _INSERT_PAPER_ORDER,
                    order_id,
                    None,  # recommendation_id
                    acct_uuid,
                    resp.ticker,
                    resp.side.value,
                    "market",  # paper orders are always market-simulated
                    resp.quantity,
                    resp.filled_avg_price,  # limit_price
                    None,  # stop_price
                    resp.status.value,
                    order_id,  # use order_id as idempotency_key fallback
                    order_id,
                    json.dumps(resp.raw_response),
                    resp.submitted_at,
                    filled_at,
                    resp.filled_avg_price,
                    resp.filled_quantity,
                )

            # 4. Insert order events
            for event in events:
                await conn.execute(
                    _INSERT_PAPER_ORDER_EVENT,
                    event["order_id"],
                    event["event_type"],
                    json.dumps(event["data"]),
                    datetime.fromisoformat(event["timestamp"]),
                )

    logger.info(
        "Synced paper state to DB: account=%s positions=%d orders=%d events=%d",
        acct.account_id, len(positions), len(orders), len(events),
    )

    # Drop only the events that were committed above. Events appended while
    # the transaction was running stay buffered for the next sync. Matching
    # by identity keeps this correct even if two syncs overlap.
    synced_ids = {id(event) for event in events}
    acct.order_events[:] = [
        event for event in acct.order_events if id(event) not in synced_ids
    ]
|
||||||
|
|
||||||
|
|
||||||
|
async def load_state_from_db(
    adapter: PaperTradingAdapter,
    pool: asyncpg.Pool,
) -> bool:
    """Restore paper trading state from PostgreSQL.

    Loads positions and account config from the DB so the adapter
    can resume after a restart.

    The cash balance comes from ``current_cash`` in the stored config. When
    the config is NULL, not a JSON object, or has no usable
    ``current_cash``, the account's ``initial_cash`` is used instead.
    Positions returned by the query replace same-ticker entries in memory;
    other in-memory positions are left as they are.

    Returns:
        True if a saved account row was found, False otherwise (in which
        case the adapter is left untouched).
    """
    acct = adapter.account

    async with pool.acquire() as conn:
        # Load account config
        row = await conn.fetchrow(_LOAD_PAPER_ACCOUNT_CONFIG, acct.account_id)
        if row is None:
            logger.info("No saved paper account state for %s", acct.account_id)
            return False

        config = json.loads(row["config"]) if isinstance(row["config"], str) else row["config"]
        # A row written by other means may carry NULL or a non-object JSON
        # value; fall back to the initial cash rather than crash on .get().
        if not isinstance(config, dict):
            config = {}
        saved_cash = config.get("current_cash")
        acct.cash = float(acct.initial_cash if saved_cash is None else saved_cash)

        # Load positions
        pos_rows = await conn.fetch(_LOAD_PAPER_POSITIONS, acct.account_id)
        for pr in pos_rows:
            ticker = pr["ticker"]
            acct.positions[ticker] = PaperPosition(
                ticker=ticker,
                quantity=float(pr["quantity"]),
                avg_entry_price=float(pr["avg_entry_price"] or 0),
                realized_pnl=float(pr["realized_pnl"]),
            )

    logger.info(
        "Loaded paper state from DB: account=%s cash=%.2f positions=%d",
        acct.account_id, acct.cash, len(acct.positions),
    )
    return True
|
||||||
@@ -0,0 +1,241 @@
|
|||||||
|
"""Resilient adapter wrapper with rate-limit coordination, retries, and backoff.
|
||||||
|
|
||||||
|
Wraps any BaseAdapter with:
|
||||||
|
- Per-source-type rate limiting via Redis (distributed across workers)
|
||||||
|
- Exponential backoff with jitter on retryable failures
|
||||||
|
- Configurable retry counts and retryable HTTP status codes
|
||||||
|
- Graceful degradation when Redis is unavailable
|
||||||
|
|
||||||
|
Requirements: 2.5, 3.4
|
||||||
|
"""
|
||||||
|
import asyncio
|
||||||
|
import logging
|
||||||
|
import random
|
||||||
|
import time
|
||||||
|
from dataclasses import dataclass
|
||||||
|
from typing import Any
|
||||||
|
|
||||||
|
import redis.asyncio as aioredis
|
||||||
|
|
||||||
|
from services.shared.redis_keys import rate_limit_key
|
||||||
|
|
||||||
|
from .base import AdapterResult, BaseAdapter
|
||||||
|
|
||||||
|
logger = logging.getLogger("resilient_adapter")
|
||||||
|
|
||||||
|
# HTTP status codes that are safe to retry: 429 plus the 500/502/503/504
# server-side failures. Used as the default for RetryConfig.retryable_status_codes.
RETRYABLE_STATUS_CODES: frozenset[int] = frozenset({429, 500, 502, 503, 504})
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class RetryConfig:
    """Configuration for retry and rate-limit behavior."""

    # Retries allowed after the first attempt (total attempts = max_retries + 1).
    max_retries: int = 3
    # Backoff for attempt 0, in seconds; doubled for every later attempt.
    base_delay: float = 1.0
    # Cap, in seconds, on the exponential backoff before jitter is added;
    # also caps waits taken from a Retry-After hint.
    max_delay: float = 60.0
    # Random jitter added on top of the capped backoff is at most this
    # fraction of it.
    jitter_factor: float = 0.5
    # HTTP statuses for which a failed result is retried.
    retryable_status_codes: frozenset[int] = RETRYABLE_STATUS_CODES
    # Rate limit: max requests per window per source type
    rate_limit_max: int = 30
    # Length of one rate-limit window, in seconds.
    rate_limit_window_seconds: int = 60
|
||||||
|
|
||||||
|
|
||||||
|
# Sensible defaults per source type, keyed by the value an adapter returns
# from source_type(). ResilientAdapter uses these when no explicit
# RetryConfig is passed; source types missing here get a plain RetryConfig().
DEFAULT_RETRY_CONFIGS: dict[str, RetryConfig] = {
    "market_api": RetryConfig(max_retries=3, rate_limit_max=30),
    "news_api": RetryConfig(max_retries=3, rate_limit_max=20),
    "filings_api": RetryConfig(max_retries=2, rate_limit_max=10, base_delay=2.0),
    "web_scrape": RetryConfig(max_retries=2, rate_limit_max=10, base_delay=2.0),
    "broker": RetryConfig(max_retries=2, rate_limit_max=60, base_delay=0.5),
}
|
||||||
|
|
||||||
|
|
||||||
|
def compute_delay(attempt: int, config: RetryConfig) -> float:
    """Return the backoff delay in seconds for a zero-based attempt number.

    The exponential term ``base_delay * 2**attempt`` is capped at
    ``max_delay``. A random jitter of up to ``jitter_factor`` times the
    capped value is then added on top, so the result can exceed
    ``max_delay`` by that fraction.
    """
    backoff = config.base_delay * (2 ** attempt)
    if config.max_delay < backoff:
        backoff = config.max_delay
    return backoff + backoff * config.jitter_factor * random.random()
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class RetryStats:
    """Tracks retry statistics for observability."""

    # Number of fetch attempts started so far (1-based).
    attempts: int = 0
    # Seconds spent waiting: rate-limit waits plus retry backoff delays.
    total_delay: float = 0.0
    # How many attempts had to wait on the rate limiter first.
    rate_limited_waits: int = 0
    # Error text of the most recent failed result, if any.
    last_error: str | None = None
    # Whether the most recent failure was classified as retryable.
    retryable: bool = False
|
||||||
|
|
||||||
|
|
||||||
|
class ResilientAdapter:
|
||||||
|
"""Wraps a BaseAdapter with rate-limit coordination, retries, and backoff.
|
||||||
|
|
||||||
|
Usage:
|
||||||
|
adapter = PolygonMarketAdapter(api_key="...")
|
||||||
|
resilient = ResilientAdapter(adapter, redis=rds)
|
||||||
|
result = await resilient.fetch(ticker, config)
|
||||||
|
|
||||||
|
If redis is None, rate limiting is skipped (local dev / testing).
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(
|
||||||
|
self,
|
||||||
|
adapter: BaseAdapter,
|
||||||
|
redis: aioredis.Redis | None = None,
|
||||||
|
retry_config: RetryConfig | None = None,
|
||||||
|
) -> None:
|
||||||
|
self._adapter = adapter
|
||||||
|
self._redis = redis
|
||||||
|
source_type = adapter.source_type()
|
||||||
|
self._config = retry_config or DEFAULT_RETRY_CONFIGS.get(
|
||||||
|
source_type, RetryConfig()
|
||||||
|
)
|
||||||
|
|
||||||
|
    @property
    def adapter(self) -> BaseAdapter:
        """Access the underlying adapter, i.e. the instance passed to the constructor."""
        return self._adapter
|
||||||
|
|
||||||
|
    @property
    def config(self) -> RetryConfig:
        """Return the retry/rate-limit configuration selected at construction."""
        return self._config
|
||||||
|
|
||||||
|
    def source_type(self) -> str:
        """Return the source type reported by the wrapped adapter."""
        return self._adapter.source_type()
|
||||||
|
|
||||||
|
async def _check_rate_limit(self) -> float:
|
||||||
|
"""Check distributed rate limit via Redis.
|
||||||
|
|
||||||
|
Returns 0.0 if allowed, or the number of seconds to wait.
|
||||||
|
"""
|
||||||
|
if self._redis is None:
|
||||||
|
return 0.0
|
||||||
|
|
||||||
|
source_type = self._adapter.source_type()
|
||||||
|
window_sec = self._config.rate_limit_window_seconds
|
||||||
|
# Use a time-bucketed key so counters auto-expire
|
||||||
|
bucket = int(time.time()) // window_sec
|
||||||
|
key = rate_limit_key(source_type, str(bucket))
|
||||||
|
|
||||||
|
try:
|
||||||
|
count = await self._redis.incr(key)
|
||||||
|
if count == 1:
|
||||||
|
await self._redis.expire(key, window_sec * 2)
|
||||||
|
if count > self._config.rate_limit_max:
|
||||||
|
# Over limit — compute how long until the window rolls over
|
||||||
|
elapsed_in_window = time.time() % window_sec
|
||||||
|
wait = window_sec - elapsed_in_window
|
||||||
|
return max(wait, 0.5)
|
||||||
|
except Exception:
|
||||||
|
# Redis unavailable — degrade gracefully, allow the request
|
||||||
|
logger.warning("Redis rate-limit check failed, allowing request")
|
||||||
|
return 0.0
|
||||||
|
|
||||||
|
def _is_retryable(self, result: AdapterResult) -> bool:
|
||||||
|
"""Determine if a failed result is worth retrying."""
|
||||||
|
if result.ok:
|
||||||
|
return False
|
||||||
|
# Retry on known retryable HTTP status codes
|
||||||
|
if result.http_status and result.http_status in self._config.retryable_status_codes:
|
||||||
|
return True
|
||||||
|
# Retry on timeouts
|
||||||
|
if result.error and "timeout" in result.error.lower():
|
||||||
|
return True
|
||||||
|
# Retry on connection errors
|
||||||
|
if result.error and any(
|
||||||
|
kw in result.error.lower()
|
||||||
|
for kw in ("connection", "connect", "reset", "refused")
|
||||||
|
):
|
||||||
|
return True
|
||||||
|
return False
|
||||||
|
|
||||||
|
def _extract_retry_after(self, result: AdapterResult) -> float | None:
|
||||||
|
"""Extract Retry-After hint from result metadata if present."""
|
||||||
|
retry_after = result.metadata.get("retry_after")
|
||||||
|
if retry_after is not None:
|
||||||
|
try:
|
||||||
|
return float(retry_after)
|
||||||
|
except (ValueError, TypeError):
|
||||||
|
pass
|
||||||
|
return None
|
||||||
|
|
||||||
|
async def fetch(self, ticker: str, config: dict[str, Any]) -> AdapterResult:
|
||||||
|
"""Fetch with rate-limit coordination, retries, and exponential backoff.
|
||||||
|
|
||||||
|
Returns the AdapterResult from the underlying adapter. On retryable
|
||||||
|
failures, retries up to max_retries times with exponential backoff
|
||||||
|
and jitter. Rate-limit waits are applied before each attempt.
|
||||||
|
|
||||||
|
The returned result's metadata includes retry stats under the
|
||||||
|
"retry_stats" key.
|
||||||
|
"""
|
||||||
|
stats = RetryStats()
|
||||||
|
last_result: AdapterResult | None = None
|
||||||
|
|
||||||
|
for attempt in range(self._config.max_retries + 1):
|
||||||
|
stats.attempts = attempt + 1
|
||||||
|
|
||||||
|
# Rate limit check
|
||||||
|
wait = await self._check_rate_limit()
|
||||||
|
if wait > 0:
|
||||||
|
stats.rate_limited_waits += 1
|
||||||
|
logger.info(
|
||||||
|
"Rate limited for %s/%s, waiting %.1fs",
|
||||||
|
self.source_type(), ticker, wait,
|
||||||
|
)
|
||||||
|
stats.total_delay += wait
|
||||||
|
await asyncio.sleep(wait)
|
||||||
|
|
||||||
|
# Execute the fetch
|
||||||
|
result = await self._adapter.fetch(ticker, config)
|
||||||
|
last_result = result
|
||||||
|
|
||||||
|
# Success — attach stats and return
|
||||||
|
if result.ok:
|
||||||
|
result.metadata["retry_stats"] = {
|
||||||
|
"attempts": stats.attempts,
|
||||||
|
"total_delay": round(stats.total_delay, 2),
|
||||||
|
"rate_limited_waits": stats.rate_limited_waits,
|
||||||
|
}
|
||||||
|
return result
|
||||||
|
|
||||||
|
# Check if retryable
|
||||||
|
if not self._is_retryable(result):
|
||||||
|
stats.last_error = result.error
|
||||||
|
stats.retryable = False
|
||||||
|
break
|
||||||
|
|
||||||
|
stats.retryable = True
|
||||||
|
stats.last_error = result.error
|
||||||
|
|
||||||
|
# Don't sleep after the last attempt
|
||||||
|
if attempt < self._config.max_retries:
|
||||||
|
# Respect Retry-After header for 429s
|
||||||
|
retry_after = self._extract_retry_after(result)
|
||||||
|
if result.http_status == 429 and retry_after is not None:
|
||||||
|
delay = min(retry_after, self._config.max_delay)
|
||||||
|
else:
|
||||||
|
delay = compute_delay(attempt, self._config)
|
||||||
|
|
||||||
|
logger.info(
|
||||||
|
"Retrying %s/%s (attempt %d/%d) after %.1fs: %s",
|
||||||
|
self.source_type(), ticker, attempt + 1,
|
||||||
|
self._config.max_retries + 1, delay, result.error,
|
||||||
|
)
|
||||||
|
stats.total_delay += delay
|
||||||
|
await asyncio.sleep(delay)
|
||||||
|
|
||||||
|
# All retries exhausted — return last result with stats
|
||||||
|
assert last_result is not None
|
||||||
|
last_result.metadata["retry_stats"] = {
|
||||||
|
"attempts": stats.attempts,
|
||||||
|
"total_delay": round(stats.total_delay, 2),
|
||||||
|
"rate_limited_waits": stats.rate_limited_waits,
|
||||||
|
"exhausted": True,
|
||||||
|
"last_error": stats.last_error,
|
||||||
|
}
|
||||||
|
return last_result
|
||||||
@@ -0,0 +1,321 @@
|
|||||||
|
"""Web scrape adapter for curated URLs and article pages.
|
||||||
|
|
||||||
|
Fetches full article HTML from curated URLs (investor relations pages,
press releases, earnings transcripts, etc.) using BeautifulSoup + httpx
with retry adapters, content hashing, boilerplate awareness, and quality scoring.
|
||||||
|
|
||||||
|
Inspired by Noctipede crawler patterns: BeautifulSoup + requests with retry
|
||||||
|
adapters, content hashing, boilerplate stripping, quality scoring.
|
||||||
|
|
||||||
|
Requirements: 1.2, 2.5, 3.1, 3.2, 3.3, 3.4
|
||||||
|
"""
|
||||||
|
import json
|
||||||
|
import logging
|
||||||
|
import time
|
||||||
|
from datetime import datetime, timezone
|
||||||
|
from urllib.parse import urlparse
|
||||||
|
from typing import Any
|
||||||
|
|
||||||
|
import httpx
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
|
|
||||||
|
from services.shared.content import content_hash, normalize_url
|
||||||
|
|
||||||
|
from .base import AdapterResult, BaseAdapter
|
||||||
|
|
||||||
|
logger = logging.getLogger("web_scrape_adapter")
|
||||||
|
|
||||||
|
# Default request settings
DEFAULT_TIMEOUT = 30  # request timeout, in seconds
# User-Agent header value identifying this scraper.
DEFAULT_USER_AGENT = "StonksOracle/1.0 (+https://stonks-oracle.celestium.life)"
MAX_CONTENT_LENGTH = 10 * 1024 * 1024  # 10MB cap
|
||||||
|
|
||||||
|
|
||||||
|
def _meta_content(tag: Any) -> str | None:
    """Return the stripped ``content`` attribute of a <meta> tag.

    Returns None when ``tag`` is missing or its ``content`` is absent or
    empty, and "" when ``content`` is present but not a plain string.
    """
    if not tag:
        return None
    content = tag.get("content")
    if not content:
        return None
    return content.strip() if isinstance(content, str) else ""


def _json_ld_date_published(soup: BeautifulSoup) -> str | None:
    """Return the first ``datePublished`` found in the page's JSON-LD scripts.

    Looks at top-level JSON-LD objects and at the items of top-level
    arrays, in document order. Scripts that fail to parse are skipped.
    """
    for script in soup.find_all("script", type="application/ld+json"):
        raw = script.string
        if not raw or "datePublished" not in raw:
            continue
        try:
            ld = json.loads(raw)
        except (json.JSONDecodeError, TypeError):
            continue
        candidates = ld if isinstance(ld, list) else [ld]
        for item in candidates:
            if isinstance(item, dict) and "datePublished" in item:
                # Returning here stops the scan, so a later script can no
                # longer overwrite the first match.
                return str(item["datePublished"])
    return None


def extract_metadata_from_html(html: str, url: str) -> dict[str, str | None]:
    """Extract descriptive metadata from an HTML page.

    Args:
        html: Raw HTML of the page.
        url: URL the page was fetched from; used as the fallback for the
            publisher (hostname) and the canonical URL.

    Returns:
        A dict with the keys ``title``, ``author``, ``publisher``,
        ``published_at``, ``canonical_url``, ``language`` and
        ``description``. Missing text fields are "", a missing
        ``published_at`` is None, and a missing language defaults to "en".
    """
    soup = BeautifulSoup(html, "html.parser")
    meta: dict[str, str | None] = {}

    # Title: prefer og:title, then <title>
    title = _meta_content(soup.find("meta", property="og:title"))
    if title is None and soup.title and soup.title.string:
        title = soup.title.string.strip()
    meta["title"] = title or ""

    # Author
    meta["author"] = _meta_content(soup.find("meta", attrs={"name": "author"})) or ""

    # Publisher: og:site_name, else the hostname of the fetched URL
    publisher = _meta_content(soup.find("meta", property="og:site_name"))
    if publisher is None:
        publisher = urlparse(url).hostname or ""
    meta["publisher"] = publisher

    # Published date: article:published_time, else JSON-LD datePublished
    pub_time = soup.find("meta", property="article:published_time")
    pub_content = pub_time.get("content") if pub_time else None
    if pub_content:
        meta["published_at"] = pub_content.strip() if isinstance(pub_content, str) else None
    else:
        meta["published_at"] = _json_ld_date_published(soup)

    # Canonical URL: <link rel="canonical">, then og:url, then the
    # normalized fetch URL
    canonical = soup.find("link", rel="canonical")
    href = canonical.get("href") if canonical else None
    if not href:
        og_url = soup.find("meta", property="og:url")
        href = og_url.get("content") if og_url else None
    meta["canonical_url"] = str(href) if href else normalize_url(url)

    # Language: first five characters of <html lang="...">
    html_tag = soup.find("html")
    lang = html_tag.get("lang") if html_tag else None
    meta["language"] = str(lang)[:5] if lang else "en"

    # Description for summary
    desc = soup.find("meta", property="og:description") or soup.find(
        "meta", attrs={"name": "description"}
    )
    meta["description"] = _meta_content(desc) or ""

    return meta
|
||||||
|
|
||||||
|
|
||||||
|
def extract_body_text(html: str) -> str:
    """Return the readable article text of an HTML page.

    Boilerplate elements (scripts, styles, navigation, headers, footers,
    asides, iframes, noscript) are removed first.  The text is then taken
    from, in order of preference: the first ``<article>`` element, the first
    ``<div>`` whose class string contains a known article-body marker, the
    ``<body>`` element, or the whole document.

    Args:
        html: Raw HTML markup.

    Returns:
        The extracted text with one non-empty, stripped line per row.
    """
    boilerplate_tags = [
        "script", "style", "nav", "footer", "header", "aside", "iframe", "noscript",
    ]
    body_class_markers = ["article-body", "post-content", "entry-content", "story-body"]

    document = BeautifulSoup(html, "html.parser")

    # Drop everything that is never part of the article itself.
    for element in document.find_all(boilerplate_tags):
        element.decompose()

    container = document.find("article")
    if not container:
        # No <article>: look for a div carrying a well-known content class.
        for candidate in document.find_all("div"):
            classes = candidate.get("class", [])
            if isinstance(classes, list):
                joined = " ".join(classes)
            elif classes:
                joined = str(classes)
            else:
                joined = ""
            if any(marker in joined for marker in body_class_markers):
                container = candidate
                break

    if not container:
        # Last resort: the whole body, or the entire parsed document.
        container = document.find("body") or document

    raw_text = container.get_text(separator="\n", strip=True)

    # Collapse whitespace: keep only non-blank lines, each stripped.
    stripped = (line.strip() for line in raw_text.splitlines())
    return "\n".join(line for line in stripped if line)
|
||||||
|
|
||||||
|
|
||||||
|
class WebScrapeAdapter(BaseAdapter):
    """Adapter for fetching curated web pages and article URLs.

    Config options (from source config):
        urls: List of URLs to scrape for this company
        url: Single URL to scrape (alternative to urls)
        timeout: Request timeout in seconds (defaults to DEFAULT_TIMEOUT)
        user_agent: Custom user agent string (defaults to DEFAULT_USER_AGENT)
        follow_links: Whether to follow article links from index pages
            (NOTE(review): this option is not read anywhere in this class)
        max_pages: Max pages to fetch per cycle (default 5, capped at 20)
    """

    def __init__(self) -> None:
        pass

    def source_type(self) -> str:
        """Return the source type identifier handled by this adapter."""
        return "web_scrape"

    def bucket_name(self) -> str:
        """Web scrape artifacts go to the news raw bucket."""
        return "stonks-raw-news"

    async def fetch(self, ticker: str, config: dict[str, Any]) -> AdapterResult:
        """Fetch HTML from curated URLs for a given ticker.

        Supports both single URL and multi-URL configs.  Each URL is fetched
        sequentially with one shared HTTP client; metadata and body text are
        extracted per page.  A failure on one URL is recorded and does not
        stop the remaining URLs.

        Args:
            ticker: Ticker the URLs belong to (used for logging and results).
            config: Source configuration; see the class docstring for keys.

        Returns:
            An AdapterResult with one item per successfully fetched page.
            When no page could be fetched, an error result carrying the
            joined error messages is returned instead.
        """
        urls = config.get("urls", [])
        if not urls and config.get("url"):
            urls = [config["url"]]

        if not urls:
            return self._error_result(ticker, "No URLs configured for web_scrape source", 0)

        timeout = config.get("timeout", DEFAULT_TIMEOUT)
        user_agent = config.get("user_agent", DEFAULT_USER_AGENT)
        max_pages = min(config.get("max_pages", 5), 20)

        items: list[dict[str, Any]] = []
        total_elapsed = 0.0
        errors: list[str] = []

        async with httpx.AsyncClient(
            timeout=timeout,
            follow_redirects=True,
            headers={"User-Agent": user_agent},
        ) as client:
            for url in urls[:max_pages]:
                t0 = time.monotonic()
                # Set once the response arrives; stays None if the request
                # itself failed, in which case time-to-failure is used.
                elapsed_ms = None
                try:
                    resp = await client.get(url)
                    elapsed_ms = (time.monotonic() - t0) * 1000
                    resp.raise_for_status()

                    # Content length guard
                    raw_bytes = resp.content
                    if len(raw_bytes) > MAX_CONTENT_LENGTH:
                        errors.append(f"Content too large for {url}: {len(raw_bytes)} bytes")
                        continue

                    html = resp.text
                    meta = extract_metadata_from_html(html, url)
                    body_text = extract_body_text(html)

                    items.append({
                        "url": url,
                        "canonical_url": meta.get("canonical_url", normalize_url(url)),
                        "title": meta.get("title", ""),
                        "author": meta.get("author", ""),
                        "publisher": meta.get("publisher", ""),
                        "published_at": meta.get("published_at"),
                        "language": meta.get("language", "en"),
                        "description": meta.get("description", ""),
                        "content_hash": content_hash(raw_bytes),
                        "body_text": body_text,
                        "body_length": len(body_text),
                        "html_length": len(html),
                        "http_status": resp.status_code,
                        "response_time_ms": round(elapsed_ms, 1),
                    })

                except httpx.HTTPStatusError as e:
                    status = e.response.status_code if e.response is not None else None
                    errors.append(f"HTTP {status} for {url}: {e}")
                    logger.warning("Scrape HTTP error for %s/%s: %s", ticker, url, e)

                except httpx.TimeoutException as e:
                    errors.append(f"Timeout for {url}: {e}")
                    logger.warning("Scrape timeout for %s/%s: %s", ticker, url, e)

                except Exception as e:
                    # Best-effort scraping: one bad page must not abort the cycle.
                    errors.append(f"Error for {url}: {e}")
                    logger.warning("Scrape error for %s/%s: %s", ticker, url, e)

                finally:
                    # Account for each URL exactly once, whether it succeeded,
                    # was skipped, or failed before/after the response arrived.
                    if elapsed_ms is None:
                        elapsed_ms = (time.monotonic() - t0) * 1000
                    total_elapsed += elapsed_ms

        if not items:
            error_msg = "; ".join(errors) if errors else "No pages fetched"
            return self._error_result(ticker, error_msg, total_elapsed)

        # One timestamp for both the manifest and the result so they agree.
        fetched_at = datetime.now(timezone.utc)

        # The stored artifact is a JSON manifest of the fetched pages (URL,
        # hash, sizes) plus the errors; the page HTML itself is not embedded.
        combined_raw = json.dumps({
            "ticker": ticker,
            "fetched_at": fetched_at.isoformat(),
            "pages": [
                {
                    "url": item["url"],
                    "content_hash": item["content_hash"],
                    "html_length": item["html_length"],
                    "body_length": item["body_length"],
                }
                for item in items
            ],
            "errors": errors,
        }).encode("utf-8")

        # Hash of the per-page hashes, in fetch order.
        combined_hash = content_hash(
            b"".join(item["content_hash"].encode() for item in items)
        )

        return AdapterResult(
            source_type="web_scrape",
            ticker=ticker,
            items=items,
            raw_payload=combined_raw,
            content_hash=combined_hash,
            fetched_at=fetched_at,
            http_status=200,
            response_time_ms=round(total_elapsed, 1),
            metadata={
                "provider": "web_scrape",
                "pages_fetched": len(items),
                "pages_failed": len(errors),
                "errors": errors,
            },
        )

    def _error_result(
        self,
        ticker: str,
        error: str,
        elapsed_ms: float,
    ) -> AdapterResult:
        """Build an error AdapterResult for scrape fetches.

        Args:
            ticker: Ticker the failed fetch was for.
            error: Human-readable error description.
            elapsed_ms: Time spent before giving up, in milliseconds.

        Returns:
            An AdapterResult with no items, an empty payload and hash, and
            ``error`` set.
        """
        return AdapterResult(
            source_type="web_scrape",
            ticker=ticker,
            items=[],
            raw_payload=b"",
            content_hash="",
            fetched_at=datetime.now(timezone.utc),
            error=error,
            http_status=None,
            response_time_ms=round(elapsed_ms, 1),
            metadata={"provider": "web_scrape"},
        )
|
||||||
@@ -0,0 +1,169 @@
|
|||||||
|
"""Contradiction detection and disagreement representation.
|
||||||
|
|
||||||
|
Analyses weighted signals to detect and represent disagreement explicitly,
|
||||||
|
rather than collapsing contradictory evidence into a single unsupported
|
||||||
|
conclusion.
|
||||||
|
|
||||||
|
Requirements: 6.4, 6.5
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from dataclasses import dataclass
|
||||||
|
|
||||||
|
from services.aggregation.scoring import WeightedSignal
|
||||||
|
from services.shared.schemas import DisagreementDetail
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class CatalystEntry:
    """Pairs a document with the catalyst type attributed to it.

    Deliberately minimal: contradiction detection only needs these two
    fields, and carrying them here avoids importing ImpactRow, which would
    create a circular dependency with worker.py.
    """

    # Identifier of the source document.
    document_id: str
    # Catalyst category assigned to that document.
    catalyst_type: str
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class ContradictionResult:
    """Outcome of a contradiction analysis run."""

    # Overall contradiction score in 0-1; same semantics as the existing
    # compute_contradiction_score.
    score: float
    # One entry per dimension in which disagreement was found.
    details: list[DisagreementDetail]
|
||||||
|
|
||||||
|
|
||||||
|
def detect_contradictions(
    signals: list[WeightedSignal],
    catalyst_entries: list[CatalystEntry] | None = None,
) -> ContradictionResult:
    """Analyse weighted signals for disagreement along several dimensions.

    Dimensions checked:
      1. Sentiment - positive and negative signals both present.
      2. Catalyst - a single catalyst type carrying opposing sentiment
         (only evaluated when ``catalyst_entries`` is non-empty).

    Args:
        signals: Weighted document signals to analyse.
        catalyst_entries: Optional per-document catalyst assignments.

    Returns:
        A ContradictionResult holding the overall score and the list of
        per-dimension disagreement details (sentiment first, then catalysts).
    """
    sentiment_split = _detect_sentiment_disagreement(signals)
    findings: list[DisagreementDetail] = (
        [] if sentiment_split is None else [sentiment_split]
    )

    if catalyst_entries:
        findings.extend(_detect_catalyst_disagreement(signals, catalyst_entries))

    return ContradictionResult(
        score=_compute_overall_score(signals),
        details=findings,
    )
|
||||||
|
|
||||||
|
|
||||||
|
def _compute_overall_score(signals: list[WeightedSignal]) -> float:
|
||||||
|
"""Minority/majority weight ratio — backward-compatible formula."""
|
||||||
|
if not signals:
|
||||||
|
return 0.0
|
||||||
|
|
||||||
|
pos_weight = 0.0
|
||||||
|
neg_weight = 0.0
|
||||||
|
for sig in signals:
|
||||||
|
w = sig.weight.combined * sig.impact_score
|
||||||
|
if sig.sentiment_value > 0:
|
||||||
|
pos_weight += w
|
||||||
|
elif sig.sentiment_value < 0:
|
||||||
|
neg_weight += w
|
||||||
|
|
||||||
|
total = pos_weight + neg_weight
|
||||||
|
if total == 0.0:
|
||||||
|
return 0.0
|
||||||
|
|
||||||
|
minority = min(pos_weight, neg_weight)
|
||||||
|
return round(minority / total, 4)
|
||||||
|
|
||||||
|
|
||||||
|
def _detect_sentiment_disagreement(
|
||||||
|
signals: list[WeightedSignal],
|
||||||
|
) -> DisagreementDetail | None:
|
||||||
|
"""Detect when both positive and negative sentiment signals exist."""
|
||||||
|
pos_ids: list[str] = []
|
||||||
|
neg_ids: list[str] = []
|
||||||
|
pos_weight = 0.0
|
||||||
|
neg_weight = 0.0
|
||||||
|
|
||||||
|
for sig in signals:
|
||||||
|
w = sig.weight.combined * sig.impact_score
|
||||||
|
if w <= 0:
|
||||||
|
continue
|
||||||
|
if sig.sentiment_value > 0:
|
||||||
|
pos_ids.append(sig.document_id)
|
||||||
|
pos_weight += w
|
||||||
|
elif sig.sentiment_value < 0:
|
||||||
|
neg_ids.append(sig.document_id)
|
||||||
|
neg_weight += w
|
||||||
|
|
||||||
|
if not pos_ids or not neg_ids:
|
||||||
|
return None
|
||||||
|
|
||||||
|
total = pos_weight + neg_weight
|
||||||
|
minority_pct = min(pos_weight, neg_weight) / total if total > 0 else 0.0
|
||||||
|
|
||||||
|
return DisagreementDetail(
|
||||||
|
dimension="sentiment",
|
||||||
|
positive_doc_ids=pos_ids,
|
||||||
|
negative_doc_ids=neg_ids,
|
||||||
|
positive_weight=round(pos_weight, 4),
|
||||||
|
negative_weight=round(neg_weight, 4),
|
||||||
|
description=(
|
||||||
|
f"Sentiment split: {len(pos_ids)} positive vs {len(neg_ids)} negative signals "
|
||||||
|
f"(minority weight ratio {minority_pct:.0%})"
|
||||||
|
),
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def _detect_catalyst_disagreement(
|
||||||
|
signals: list[WeightedSignal],
|
||||||
|
catalyst_entries: list[CatalystEntry],
|
||||||
|
) -> list[DisagreementDetail]:
|
||||||
|
"""Detect when the same catalyst type has both positive and negative signals."""
|
||||||
|
# Build lookup: document_id → (sentiment_value, combined_weight)
|
||||||
|
sig_lookup: dict[str, tuple[float, float]] = {}
|
||||||
|
for sig in signals:
|
||||||
|
w = sig.weight.combined * sig.impact_score
|
||||||
|
if w > 0:
|
||||||
|
sig_lookup[sig.document_id] = (sig.sentiment_value, w)
|
||||||
|
|
||||||
|
# Group by catalyst type
|
||||||
|
from collections import defaultdict
|
||||||
|
catalyst_groups: dict[str, list[tuple[str, float, float]]] = defaultdict(list)
|
||||||
|
for entry in catalyst_entries:
|
||||||
|
if entry.document_id in sig_lookup:
|
||||||
|
sent_val, weight = sig_lookup[entry.document_id]
|
||||||
|
if sent_val != 0.0:
|
||||||
|
catalyst_groups[entry.catalyst_type].append(
|
||||||
|
(entry.document_id, sent_val, weight)
|
||||||
|
)
|
||||||
|
|
||||||
|
details: list[DisagreementDetail] = []
|
||||||
|
for catalyst, entries in catalyst_groups.items():
|
||||||
|
pos_ids = [doc_id for doc_id, sv, _ in entries if sv > 0]
|
||||||
|
neg_ids = [doc_id for doc_id, sv, _ in entries if sv < 0]
|
||||||
|
if not pos_ids or not neg_ids:
|
||||||
|
continue
|
||||||
|
|
||||||
|
pos_w = sum(w for _, sv, w in entries if sv > 0)
|
||||||
|
neg_w = sum(w for _, sv, w in entries if sv < 0)
|
||||||
|
|
||||||
|
details.append(DisagreementDetail(
|
||||||
|
dimension=f"catalyst:{catalyst}",
|
||||||
|
positive_doc_ids=pos_ids,
|
||||||
|
negative_doc_ids=neg_ids,
|
||||||
|
positive_weight=round(pos_w, 4),
|
||||||
|
negative_weight=round(neg_w, 4),
|
||||||
|
description=(
|
||||||
|
f"Catalyst '{catalyst}' has {len(pos_ids)} positive and "
|
||||||
|
f"{len(neg_ids)} negative signals"
|
||||||
|
),
|
||||||
|
))
|
||||||
|
|
||||||
|
return details
|
||||||
@@ -0,0 +1,141 @@
|
|||||||
|
"""Evidence ranking for supporting and opposing documents.
|
||||||
|
|
||||||
|
Ranks document signals by a composite score that considers multiple
|
||||||
|
factors beyond raw weight, producing explainable evidence lists for
|
||||||
|
trend summaries.
|
||||||
|
|
||||||
|
Requirements: 6.5
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from dataclasses import dataclass
|
||||||
|
|
||||||
|
from services.aggregation.scoring import WeightedSignal
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass(frozen=True)
class EvidenceRankConfig:
    """Tunable factors of the composite evidence ranking score.

    Each ``*_factor`` scales one component of the score; ``max_refs`` caps
    how many documents are returned per side.
    """

    # Share given to the combined signal weight
    # (recency * credibility * novelty * market).
    weight_factor: float = 0.40
    # Share given to the document's impact score.
    impact_factor: float = 0.30
    # Share given to recency on its own, favouring fresh evidence.
    recency_factor: float = 0.20
    # Share given to extraction confidence.
    confidence_factor: float = 0.10
    # Upper bound on evidence refs per side (supporting / opposing).
    max_refs: int = 10
|
||||||
|
|
||||||
|
|
||||||
|
DEFAULT_RANK_CONFIG = EvidenceRankConfig()
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class RankedEvidence:
    """Ranking outcome for one document: total score plus its breakdown."""

    # Document the score belongs to.
    document_id: str
    # Composite score used for ordering.
    rank_score: float
    # Individual contributions that make up rank_score.
    weight_component: float
    impact_component: float
    recency_component: float
    confidence_component: float
    # Sentiment of the underlying signal: +1 / -1 / 0.
    sentiment_value: float
|
||||||
|
|
||||||
|
|
||||||
|
def compute_evidence_rank(
    signal: WeightedSignal,
    config: EvidenceRankConfig = DEFAULT_RANK_CONFIG,
) -> RankedEvidence:
    """Score a single signal for evidence ranking.

    The composite score is the sum of four scaled components:
      - ``weight.combined``    * ``config.weight_factor``
      - ``impact_score``       * ``config.impact_factor``
      - ``weight.recency``     * ``config.recency_factor``
      - ``weight.credibility`` * ``config.confidence_factor``
        (credibility stands in for extraction confidence)

    NOTE(review): the score is bounded by the sum of the factors only if
    every input component lies in [0, 1] - confirm against WeightedSignal.

    Returns:
        A RankedEvidence with the score and each component rounded to six
        decimals, carrying the signal's document id and sentiment.
    """
    weights = signal.weight

    from_weight = weights.combined * config.weight_factor
    from_impact = signal.impact_score * config.impact_factor
    from_recency = weights.recency * config.recency_factor
    from_confidence = weights.credibility * config.confidence_factor

    total = from_weight + from_impact + from_recency + from_confidence

    return RankedEvidence(
        document_id=signal.document_id,
        rank_score=round(total, 6),
        weight_component=round(from_weight, 6),
        impact_component=round(from_impact, 6),
        recency_component=round(from_recency, 6),
        confidence_component=round(from_confidence, 6),
        sentiment_value=signal.sentiment_value,
    )
|
||||||
|
|
||||||
|
|
||||||
|
def rank_evidence(
    signals: list[WeightedSignal],
    config: EvidenceRankConfig = DEFAULT_RANK_CONFIG,
) -> tuple[list[str], list[str]]:
    """Rank signals into top supporting and opposing document ID lists.

    Supporting = positive sentiment, Opposing = negative sentiment.
    Neutral/mixed signals are excluded.

    The filtering, scoring, ordering and capping are delegated to
    ``rank_evidence_detailed`` so the two entry points cannot drift apart;
    this function only projects the ranked entries to their document ids.

    Args:
        signals: Weighted document signals to rank.
        config: Ranking factors and the per-side cap.

    Returns:
        (supporting_ids, opposing_ids), each ordered by descending rank
        score and capped at ``config.max_refs``.
    """
    supporting, opposing = rank_evidence_detailed(signals, config)
    return (
        [ranked.document_id for ranked in supporting],
        [ranked.document_id for ranked in opposing],
    )
|
||||||
|
|
||||||
|
|
||||||
|
def rank_evidence_detailed(
    signals: list[WeightedSignal],
    config: EvidenceRankConfig = DEFAULT_RANK_CONFIG,
) -> tuple[list[RankedEvidence], list[RankedEvidence]]:
    """Rank signals and return the full RankedEvidence entries per side.

    Same selection as ``rank_evidence`` but keeps the score breakdown, for
    callers that need to explain why a document was ranked where it was.

    Returns:
        (supporting, opposing), each sorted by descending ``rank_score``
        and truncated to ``config.max_refs``.  Signals with zero sentiment
        are left out.
    """
    # True -> supporting (positive sentiment), False -> opposing.
    sides: dict[bool, list[RankedEvidence]] = {True: [], False: []}

    for signal in signals:
        if signal.sentiment_value == 0.0:
            continue
        sides[signal.sentiment_value > 0].append(compute_evidence_rank(signal, config))

    limit = config.max_refs
    best_supporting = sorted(sides[True], key=lambda entry: entry.rank_score, reverse=True)
    best_opposing = sorted(sides[False], key=lambda entry: entry.rank_score, reverse=True)

    return (best_supporting[:limit], best_opposing[:limit])
|
||||||
@@ -0,0 +1,57 @@
|
|||||||
|
"""Aggregation worker entrypoint - polls Redis for aggregation jobs."""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import asyncio
|
||||||
|
import json
|
||||||
|
import logging
|
||||||
|
|
||||||
|
import asyncpg
|
||||||
|
|
||||||
|
from services.aggregation.worker import aggregate_company
|
||||||
|
from services.shared.config import load_config
|
||||||
|
from services.shared.logging import setup_logging
|
||||||
|
from services.shared.redis_keys import QUEUE_AGGREGATION, queue_key
|
||||||
|
|
||||||
|
logger = logging.getLogger("aggregation_main")
|
||||||
|
|
||||||
|
|
||||||
|
async def main() -> None:
    """Run the aggregation worker: poll the Redis queue and aggregate tickers.

    Loads configuration, opens a PostgreSQL pool and a Redis client, then
    loops forever popping jobs from the aggregation queue.  Each job is
    expected to be a JSON object with a ``ticker`` key.  Malformed jobs and
    jobs without a ticker are logged and dropped so that a single bad queue
    entry cannot take the worker down; failures inside the aggregation
    itself are logged and the loop continues.

    The pool and the Redis client are closed when the loop exits for any
    reason (cancellation, fatal error).
    """
    config = load_config()
    setup_logging("aggregation", level=config.log_level, json_output=config.json_logs)

    pool = await asyncpg.create_pool(dsn=config.postgres.dsn, min_size=2, max_size=8)

    import redis.asyncio as aioredis

    redis_client = aioredis.from_url(config.redis.url)
    queue = queue_key(QUEUE_AGGREGATION)
    logger.info("Aggregation worker started, polling %s", queue)

    try:
        while True:
            raw = await redis_client.lpop(queue)
            if raw is None:
                # Queue empty: back off briefly before polling again.
                await asyncio.sleep(1)
                continue

            try:
                job = json.loads(raw)
            except (ValueError, TypeError):
                # JSONDecodeError / UnicodeDecodeError are ValueError subclasses.
                logger.exception("Discarding malformed aggregation job: %r", raw)
                continue

            ticker = job.get("ticker", "") if isinstance(job, dict) else ""
            if not ticker:
                logger.warning("Discarding aggregation job without a ticker: %r", raw)
                continue

            logger.info("Processing aggregation job for %s", ticker)

            try:
                summaries = await aggregate_company(pool, ticker)
                logger.info(
                    "Aggregation complete for %s: %d windows",
                    ticker, len(summaries),
                )
            except Exception:
                # Top-level worker boundary: log and keep serving other jobs.
                logger.exception("Aggregation failed for %s", ticker)
    finally:
        # Close Redis even if closing the pool raises.
        try:
            await pool.close()
        finally:
            await redis_client.close()
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
asyncio.run(main())
|
||||||
@@ -0,0 +1,150 @@
|
|||||||
|
"""Market context feature computation for aggregation windows.
|
||||||
|
|
||||||
|
Fetches recent market snapshots from PostgreSQL and computes context
|
||||||
|
features (price change, volume trend, volatility) that enrich trend
|
||||||
|
summaries and modulate signal weighting.
|
||||||
|
|
||||||
|
Requirements: 6.1, 6.2
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import math
|
||||||
|
from datetime import datetime, timedelta, timezone
|
||||||
|
from typing import Any
|
||||||
|
|
||||||
|
import asyncpg
|
||||||
|
|
||||||
|
from services.shared.schemas import MarketContext, TrendWindow
|
||||||
|
|
||||||
|
# Map TrendWindow values to lookback durations in days.
|
||||||
|
WINDOW_LOOKBACK_DAYS: dict[str, int] = {
|
||||||
|
TrendWindow.INTRADAY.value: 1,
|
||||||
|
TrendWindow.ONE_DAY.value: 2,
|
||||||
|
TrendWindow.SEVEN_DAY.value: 8,
|
||||||
|
TrendWindow.THIRTY_DAY.value: 35,
|
||||||
|
TrendWindow.NINETY_DAY.value: 95,
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
async def fetch_market_context(
    pool: asyncpg.Pool,
    ticker: str,
    window: str,
    reference_time: datetime | None = None,
) -> MarketContext:
    """Build a MarketContext for *ticker* over the given trend *window*.

    Reads the ``market_snapshots`` rows captured between
    ``reference_time - lookback`` and ``reference_time`` (oldest first) and
    derives from them:
      - price_change_pct: (last_close - first_close) / first_close, in percent
      - avg_volume: mean volume across bars
      - volume_change_pct: second-half vs first-half average volume, in percent
      - volatility: std-dev of close prices
      - latest_close / latest_bar_at

    Args:
        pool: asyncpg pool used for the query.
        ticker: Ticker to look up.
        window: Trend window key; unknown values use an 8-day lookback.
        reference_time: End of the lookback range; defaults to now (UTC).

    Returns:
        A MarketContext built with only ``ticker`` when no rows or no
        usable bars exist, otherwise the computed context.
    """
    anchor = reference_time if reference_time is not None else datetime.now(timezone.utc)
    window_start = anchor - timedelta(days=WINDOW_LOOKBACK_DAYS.get(window, 8))

    snapshot_rows = await pool.fetch(
        """
        SELECT data, captured_at
        FROM market_snapshots
        WHERE ticker = $1
          AND captured_at >= $2
          AND captured_at <= $3
        ORDER BY captured_at ASC
        """,
        ticker,
        window_start,
        anchor,
    )

    bars = _extract_bars(snapshot_rows) if snapshot_rows else []
    if not bars:
        return MarketContext(ticker=ticker)

    return _compute_context(ticker, bars)
|
||||||
|
|
||||||
|
|
||||||
|
def _extract_bars(rows: list[Any]) -> list[dict[str, Any]]:
|
||||||
|
"""Extract OHLCV bar dicts from market_snapshot rows.
|
||||||
|
|
||||||
|
The ``data`` column is JSONB. Polygon prev-day bars store fields like
|
||||||
|
``o``, ``h``, ``l``, ``c``, ``v``, ``t``. We normalise to a common
|
||||||
|
dict with ``close``, ``volume``, ``captured_at``.
|
||||||
|
"""
|
||||||
|
bars: list[dict[str, Any]] = []
|
||||||
|
for row in rows:
|
||||||
|
data = row["data"]
|
||||||
|
if isinstance(data, str):
|
||||||
|
import json
|
||||||
|
data = json.loads(data)
|
||||||
|
|
||||||
|
# Polygon-style single bar or list of bars
|
||||||
|
items = data if isinstance(data, list) else [data]
|
||||||
|
for item in items:
|
||||||
|
close = item.get("c") or item.get("close")
|
||||||
|
volume = item.get("v") or item.get("volume")
|
||||||
|
if close is not None:
|
||||||
|
bars.append({
|
||||||
|
"close": float(close),
|
||||||
|
"volume": float(volume) if volume is not None else 0.0,
|
||||||
|
"captured_at": row["captured_at"],
|
||||||
|
})
|
||||||
|
return bars
|
||||||
|
|
||||||
|
|
||||||
|
def _compute_context(ticker: str, bars: list[dict[str, Any]]) -> MarketContext:
    """Derive market context features from bars ordered oldest to newest.

    Computes the percentage price change between the first and last close,
    the mean volume, the percentage change of the second-half average
    volume versus the first half, and the population standard deviation of
    the closes.  ``bars`` must be non-empty.
    """
    closes = [bar["close"] for bar in bars]
    volumes = [bar["volume"] for bar in bars]

    opening_close, latest_close = closes[0], closes[-1]

    # Guard against a zero baseline.
    if opening_close != 0:
        price_change_pct = (latest_close - opening_close) / opening_close * 100.0
    else:
        price_change_pct = 0.0

    avg_volume = sum(volumes) / len(volumes) if volumes else 0.0

    # Volume trend: second half of the window relative to the first half.
    volume_change_pct = 0.0
    half = len(volumes) // 2
    if half > 0:
        later = volumes[half:]
        early_avg = sum(volumes[:half]) / half
        later_avg = sum(later) / len(later)
        if early_avg > 0:
            volume_change_pct = (later_avg - early_avg) / early_avg * 100.0

    # Volatility: standard deviation of closes (divides by N, not N - 1).
    volatility = 0.0
    if len(closes) > 1:
        mean_close = sum(closes) / len(closes)
        variance = sum((c - mean_close) ** 2 for c in closes) / len(closes)
        volatility = math.sqrt(variance)

    return MarketContext(
        ticker=ticker,
        price_change_pct=round(price_change_pct, 4),
        avg_volume=round(avg_volume, 2),
        volume_change_pct=round(volume_change_pct, 4),
        volatility=round(volatility, 6),
        latest_close=latest_close,
        latest_bar_at=bars[-1]["captured_at"],
        bars_available=len(bars),
    )
|
||||||
@@ -0,0 +1,439 @@
|
|||||||
|
"""Sector and market-level rollup aggregation.
|
||||||
|
|
||||||
|
Aggregates company-level trend summaries into sector and market-level
|
||||||
|
summaries, enabling top-down views of sentiment and risk across the
|
||||||
|
portfolio.
|
||||||
|
|
||||||
|
Requirements: 6.3, 6.4, 6.5
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import json
|
||||||
|
import logging
|
||||||
|
from dataclasses import dataclass
|
||||||
|
from datetime import datetime, timedelta, timezone
|
||||||
|
|
||||||
|
import asyncpg
|
||||||
|
|
||||||
|
from services.shared.schemas import (
|
||||||
|
DisagreementDetail,
|
||||||
|
TrendDirection,
|
||||||
|
TrendSummary,
|
||||||
|
TrendWindow,
|
||||||
|
)
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class CompanyTrendRow:
    """Company-level trend summary as loaded from the DB, input to rollups."""

    # Ticker of the company.
    entity_id: str
    sector: str
    window: str
    trend_direction: str
    trend_strength: float
    confidence: float
    contradiction_score: float
    # List-valued columns, each parsed into a list of strings.
    dominant_catalysts: list[str]
    material_risks: list[str]
    top_supporting_evidence: list[str]
    top_opposing_evidence: list[str]
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Fetch latest company trends for a given window
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
_LATEST_COMPANY_TRENDS_QUERY = """
|
||||||
|
SELECT DISTINCT ON (tw.entity_id)
|
||||||
|
tw.entity_id,
|
||||||
|
c.sector,
|
||||||
|
tw.window,
|
||||||
|
tw.trend_direction,
|
||||||
|
tw.trend_strength,
|
||||||
|
tw.confidence,
|
||||||
|
tw.contradiction_score,
|
||||||
|
tw.dominant_catalysts,
|
||||||
|
tw.material_risks,
|
||||||
|
tw.top_supporting_evidence,
|
||||||
|
tw.top_opposing_evidence
|
||||||
|
FROM trend_windows tw
|
||||||
|
JOIN companies c ON c.ticker = tw.entity_id AND c.active = TRUE
|
||||||
|
WHERE tw.entity_type = 'company'
|
||||||
|
AND tw.window = $1
|
||||||
|
AND tw.generated_at >= $2
|
||||||
|
ORDER BY tw.entity_id, tw.generated_at DESC
|
||||||
|
"""
|
||||||
|
|
||||||
|
|
||||||
|
def _parse_jsonb_list(val: object) -> list[str]:
|
||||||
|
"""Safely parse a JSONB column that should be a list of strings."""
|
||||||
|
if isinstance(val, list):
|
||||||
|
return [str(v) for v in val]
|
||||||
|
if isinstance(val, str):
|
||||||
|
parsed = json.loads(val)
|
||||||
|
if isinstance(parsed, list):
|
||||||
|
return [str(v) for v in parsed]
|
||||||
|
return []
|
||||||
|
|
||||||
|
|
||||||
|
def _parse_company_trend_row(row: object) -> CompanyTrendRow:
    """Build a CompanyTrendRow from a mapping-like database row.

    The row only needs to support ``row[key]``.  NULL text columns become
    ``""`` (``"Unknown"`` for an empty or NULL sector), NULL numeric columns
    become ``0.0``, and list columns go through ``_parse_jsonb_list``.

    Raises:
        TypeError: If *row* does not support item access.
    """
    # Rows are untyped here, so bind the item accessor once.
    lookup = getattr(row, "__getitem__", None)
    if lookup is None:
        raise TypeError(f"Expected a mapping-like row, got {type(row)}")

    def text(column: str, fallback: str = "") -> str:
        value = lookup(column)
        return fallback if value is None else str(value)

    def number(column: str) -> float:
        value = lookup(column)
        return 0.0 if value is None else float(value)

    def strings(column: str) -> list[str]:
        return _parse_jsonb_list(lookup(column))

    return CompanyTrendRow(
        entity_id=text("entity_id"),
        sector=text("sector", "Unknown") or "Unknown",
        window=text("window"),
        trend_direction=text("trend_direction"),
        trend_strength=number("trend_strength"),
        confidence=number("confidence"),
        contradiction_score=number("contradiction_score"),
        dominant_catalysts=strings("dominant_catalysts"),
        material_risks=strings("material_risks"),
        top_supporting_evidence=strings("top_supporting_evidence"),
        top_opposing_evidence=strings("top_opposing_evidence"),
    )
|
||||||
|
|
||||||
|
|
||||||
|
async def fetch_latest_company_trends(
    pool: asyncpg.Pool,
    window: str,
    since: datetime,
) -> list[CompanyTrendRow]:
    """Load company-level trend rows for one window.

    Runs ``_LATEST_COMPANY_TRENDS_QUERY`` with ``window`` and ``since`` as
    its two positional parameters and parses every returned record.

    Args:
        pool: Connection pool used to run the query.
        window: Trend window identifier bound to ``$1``.
        since: Lower bound on ``generated_at`` bound to ``$2``.

    Returns:
        One ``CompanyTrendRow`` per returned record, in query order.
    """
    records = await pool.fetch(_LATEST_COMPANY_TRENDS_QUERY, window, since)
    parsed: list[CompanyTrendRow] = []
    for record in records:
        parsed.append(_parse_company_trend_row(record))
    return parsed
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Pure rollup logic
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
# Direction mapping for numeric aggregation
|
||||||
|
# Signed score per direction label: bullish and bearish pull the average in
# opposite directions, while mixed and neutral contribute nothing.
_DIRECTION_VALUES = {
    direction.value: score
    for direction, score in (
        (TrendDirection.BULLISH, 1.0),
        (TrendDirection.BEARISH, -1.0),
        (TrendDirection.MIXED, 0.0),
        (TrendDirection.NEUTRAL, 0.0),
    )
}

# Cut-offs applied to the confidence-weighted average direction.
BULLISH_THRESHOLD = 0.15
BEARISH_THRESHOLD = -0.15
|
||||||
|
|
||||||
|
|
||||||
|
def rollup_trends(
    trends: list[CompanyTrendRow],
    entity_type: str,
    entity_id: str,
    window: str,
    reference_time: datetime,
) -> TrendSummary:
    """Collapse company-level trends into one summary for a wider entity.

    Every company is weighted by its own confidence. Direction, strength and
    contradiction are confidence-weighted means; the rollup confidence is the
    plain mean of the company confidences.

    Args:
        trends: Company rows to aggregate.
        entity_type: Entity type recorded on the summary.
        entity_id: Entity identifier recorded on the summary.
        window: Trend window value; converted with ``TrendWindow(window)``.
        reference_time: Timestamp stored as ``generated_at``.

    Returns:
        A neutral, zero-strength, zero-confidence summary when ``trends`` is
        empty or all confidences sum to zero. Otherwise a summary carrying
        the five heaviest catalysts, the five heaviest risks (stripped and
        lower-cased, weighted by the highest confidence that mentioned them),
        and up to ten de-duplicated supporting and opposing evidence entries.
    """

    def neutral_summary() -> TrendSummary:
        # Shared result for "nothing to aggregate" and "no usable weight".
        return TrendSummary(
            entity_type=entity_type,
            entity_id=entity_id,
            window=TrendWindow(window),
            trend_direction=TrendDirection.NEUTRAL,
            trend_strength=0.0,
            confidence=0.0,
            generated_at=reference_time,
        )

    if not trends:
        return neutral_summary()

    weight_sum = 0.0
    direction_sum = 0.0
    strength_sum = 0.0
    contradiction_sum = 0.0
    catalyst_scores: dict[str, float] = {}
    risk_scores: dict[str, float] = {}
    supporting: list[str] = []
    opposing: list[str] = []

    for company in trends:
        weight = company.confidence
        weight_sum += weight
        # Unknown direction labels count as 0.0.
        signed = _DIRECTION_VALUES.get(company.trend_direction, 0.0)
        direction_sum += weight * signed
        strength_sum += weight * company.trend_strength
        contradiction_sum += weight * company.contradiction_score

        for catalyst in company.dominant_catalysts:
            catalyst_scores[catalyst] = catalyst_scores.get(catalyst, 0.0) + weight

        for raw_risk in company.material_risks:
            key = raw_risk.strip().lower()
            # Keep the strongest confidence seen for each normalised risk.
            risk_scores[key] = max(risk_scores.get(key, weight), weight)

        supporting.extend(company.top_supporting_evidence)
        opposing.extend(company.top_opposing_evidence)

    if weight_sum == 0.0:
        return neutral_summary()

    mean_direction = direction_sum / weight_sum
    mean_strength = strength_sum / weight_sum
    mean_contradiction = contradiction_sum / weight_sum
    mean_confidence = weight_sum / len(trends)

    # Stable descending sorts: ties keep first-seen order.
    top_catalysts = sorted(catalyst_scores, key=catalyst_scores.get, reverse=True)[:5]
    top_risks = sorted(risk_scores, key=risk_scores.get, reverse=True)[:5]

    return TrendSummary(
        entity_type=entity_type,
        entity_id=entity_id,
        window=TrendWindow(window),
        trend_direction=_derive_rollup_direction(mean_direction, mean_contradiction),
        trend_strength=round(min(abs(mean_strength), 1.0), 4),
        confidence=round(max(0.0, min(mean_confidence, 1.0)), 4),
        # dict.fromkeys de-duplicates while preserving first-seen order.
        top_supporting_evidence=list(dict.fromkeys(supporting))[:10],
        top_opposing_evidence=list(dict.fromkeys(opposing))[:10],
        dominant_catalysts=top_catalysts,
        material_risks=top_risks,
        contradiction_score=round(max(0.0, min(mean_contradiction, 1.0)), 4),
        disagreement_details=_build_rollup_disagreement(trends, entity_id),
        generated_at=reference_time,
    )
|
||||||
|
|
||||||
|
|
||||||
|
def _derive_rollup_direction(
    avg_direction: float,
    avg_contradiction: float,
) -> TrendDirection:
    """Translate the averaged direction score into a TrendDirection.

    Args:
        avg_direction: Confidence-weighted mean direction score.
        avg_contradiction: Confidence-weighted mean contradiction score.

    Returns:
        ``MIXED`` when contradiction exceeds 0.10 and the direction magnitude
        is below 0.3; otherwise ``BULLISH`` / ``BEARISH`` when the score
        reaches the matching threshold, else ``NEUTRAL``.
    """
    contested = avg_contradiction > 0.10
    weak_lean = abs(avg_direction) < 0.3
    if contested and weak_lean:
        return TrendDirection.MIXED

    if avg_direction >= BULLISH_THRESHOLD:
        return TrendDirection.BULLISH
    if avg_direction <= BEARISH_THRESHOLD:
        return TrendDirection.BEARISH
    return TrendDirection.NEUTRAL
|
||||||
|
|
||||||
|
|
||||||
|
def _build_rollup_disagreement(
    trends: list[CompanyTrendRow],
    entity_id: str,
) -> list[DisagreementDetail]:
    """Describe the bullish/bearish split among the contributing companies.

    Args:
        trends: Company rows that fed the rollup.
        entity_id: Rollup entity identifier, used as the description prefix.

    Returns:
        An empty list unless at least one company is bullish and at least
        one is bearish. Otherwise a single ``company_direction`` entry whose
        id lists hold company entity ids and whose weights are the summed
        confidences of each camp.
    """
    bulls = [t for t in trends if t.trend_direction == TrendDirection.BULLISH.value]
    bears = [t for t in trends if t.trend_direction == TrendDirection.BEARISH.value]
    if not (bulls and bears):
        return []

    bull_weight = 0.0
    for company in bulls:
        bull_weight += company.confidence
    bear_weight = 0.0
    for company in bears:
        bear_weight += company.confidence

    detail = DisagreementDetail(
        dimension="company_direction",
        positive_doc_ids=[company.entity_id for company in bulls],
        negative_doc_ids=[company.entity_id for company in bears],
        positive_weight=round(bull_weight, 4),
        negative_weight=round(bear_weight, 4),
        description=f"{entity_id}: {len(bulls)} bullish vs {len(bears)} bearish companies",
    )
    return [detail]
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Persist rollup (reuses the same trend_windows table)
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
_UPSERT_TREND = """
|
||||||
|
INSERT INTO trend_windows (
|
||||||
|
entity_type, entity_id, window, trend_direction, trend_strength,
|
||||||
|
confidence, top_supporting_evidence, top_opposing_evidence,
|
||||||
|
dominant_catalysts, material_risks, contradiction_score,
|
||||||
|
disagreement_details, market_context, generated_at
|
||||||
|
) VALUES (
|
||||||
|
$1, $2, $3, $4, $5,
|
||||||
|
$6, $7::jsonb, $8::jsonb,
|
||||||
|
$9::jsonb, $10::jsonb, $11,
|
||||||
|
$12::jsonb, $13::jsonb, $14
|
||||||
|
)
|
||||||
|
RETURNING id
|
||||||
|
"""
|
||||||
|
|
||||||
|
|
||||||
|
async def persist_rollup(
    pool: asyncpg.Pool,
    summary: TrendSummary,
) -> str:
    """Write one rollup summary to ``trend_windows`` and return its id.

    Executes ``_UPSERT_TREND``, which despite its name is a plain
    ``INSERT ... RETURNING id``. List fields are serialised to JSON text for
    the ``::jsonb`` casts, and ``market_context`` is always stored as an
    empty JSON object.

    Args:
        pool: Connection pool used to run the insert.
        summary: The rollup to persist.

    Returns:
        The ``id`` of the new row, as a string.
    """
    disagreement_payload = [detail.model_dump() for detail in summary.disagreement_details]
    # Positional order must match $1..$14 in _UPSERT_TREND.
    params = (
        summary.entity_type,
        summary.entity_id,
        summary.window.value,
        summary.trend_direction.value,
        summary.trend_strength,
        summary.confidence,
        json.dumps(summary.top_supporting_evidence),
        json.dumps(summary.top_opposing_evidence),
        json.dumps(summary.dominant_catalysts),
        json.dumps(summary.material_risks),
        summary.contradiction_score,
        json.dumps(disagreement_payload),
        json.dumps({}),
        summary.generated_at,
    )
    inserted = await pool.fetchrow(_UPSERT_TREND, *params)
    return str(inserted["id"])  # type: ignore[index]
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# High-level rollup entry points
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
async def aggregate_sector(
    pool: asyncpg.Pool,
    sector: str,
    window: str,
    reference_time: datetime | None = None,
    since: datetime | None = None,
) -> TrendSummary:
    """Compute a sector rollup for one window, persisting it when non-empty.

    Loads the latest company trends for the window, keeps those whose sector
    equals ``sector`` exactly, and rolls them up.

    Args:
        pool: Connection pool for the read and the insert.
        sector: Sector name to aggregate.
        window: Trend window value.
        reference_time: Anchor timestamp; defaults to the current UTC time.
        since: Earliest ``generated_at`` to consider; defaults to
            ``reference_time`` minus ``_window_lookback(window)``.

    Returns:
        The rollup summary. Nothing is written or logged when no company in
        the sector has a trend, and the summary is then the neutral default.
    """
    if reference_time is None:
        reference_time = datetime.now(timezone.utc)
    if since is None:
        since = reference_time - _window_lookback(window)

    company_trends = await fetch_latest_company_trends(pool, window, since)
    in_sector = [trend for trend in company_trends if trend.sector == sector]
    summary = rollup_trends(in_sector, "sector", sector, window, reference_time)

    if not in_sector:
        return summary

    rollup_id = await persist_rollup(pool, summary)
    logger.info(
        "Persisted sector rollup %s for %s/%s: direction=%s strength=%.3f companies=%d",
        rollup_id,
        sector,
        window,
        summary.trend_direction.value,
        summary.trend_strength,
        len(in_sector),
    )
    return summary
|
||||||
|
|
||||||
|
|
||||||
|
async def aggregate_market(
    pool: asyncpg.Pool,
    window: str,
    reference_time: datetime | None = None,
    since: datetime | None = None,
) -> TrendSummary:
    """Compute a market-wide rollup for one window, persisting it when non-empty.

    All company trends for the window are aggregated regardless of sector,
    under entity type ``"market"`` and entity id ``"all"``.

    Args:
        pool: Connection pool for the read and the insert.
        window: Trend window value.
        reference_time: Anchor timestamp; defaults to the current UTC time.
        since: Earliest ``generated_at`` to consider; defaults to
            ``reference_time`` minus ``_window_lookback(window)``.

    Returns:
        The rollup summary. Nothing is written or logged when there are no
        company trends.
    """
    if reference_time is None:
        reference_time = datetime.now(timezone.utc)
    if since is None:
        since = reference_time - _window_lookback(window)

    company_trends = await fetch_latest_company_trends(pool, window, since)
    summary = rollup_trends(company_trends, "market", "all", window, reference_time)

    if not company_trends:
        return summary

    rollup_id = await persist_rollup(pool, summary)
    logger.info(
        "Persisted market rollup %s for %s: direction=%s strength=%.3f companies=%d",
        rollup_id,
        window,
        summary.trend_direction.value,
        summary.trend_strength,
        len(company_trends),
    )
    return summary
|
||||||
|
|
||||||
|
|
||||||
|
async def aggregate_all_sectors(
    pool: asyncpg.Pool,
    window: str,
    reference_time: datetime | None = None,
    since: datetime | None = None,
) -> list[TrendSummary]:
    """Compute and persist a sector rollup for every sector with company trends.

    Company trends are fetched once, grouped by their ``sector`` value, and
    each group is rolled up and inserted.

    Args:
        pool: Connection pool for the read and the inserts.
        window: Trend window value.
        reference_time: Anchor timestamp; defaults to the current UTC time.
        since: Earliest ``generated_at`` to consider; defaults to
            ``reference_time`` minus ``_window_lookback(window)``.

    Returns:
        One summary per sector, in first-seen sector order. Empty when no
        company trends exist for the window.
    """
    if reference_time is None:
        reference_time = datetime.now(timezone.utc)
    if since is None:
        since = reference_time - _window_lookback(window)

    all_trends = await fetch_latest_company_trends(pool, window, since)

    # Group by sector. A group only exists once a trend was appended to it,
    # so no group is ever empty and every group is persisted.
    by_sector: dict[str, list[CompanyTrendRow]] = {}
    for trend in all_trends:
        by_sector.setdefault(trend.sector, []).append(trend)

    summaries: list[TrendSummary] = []
    for sector, sector_trends in by_sector.items():
        summary = rollup_trends(sector_trends, "sector", sector, window, reference_time)
        rollup_id = await persist_rollup(pool, summary)
        # Same log line as aggregate_sector so both paths are traceable.
        logger.info(
            "Persisted sector rollup %s for %s/%s: direction=%s strength=%.3f companies=%d",
            rollup_id,
            sector,
            window,
            summary.trend_direction.value,
            summary.trend_strength,
            len(sector_trends),
        )
        summaries.append(summary)

    return summaries
|
||||||
|
|
||||||
|
|
||||||
|
def _window_lookback(window: str) -> timedelta:
    """Return how far back to look for company trends of a given window.

    Args:
        window: Trend window value.

    Returns:
        The lookback configured for the window, or eight days for a value
        that is not a known ``TrendWindow``.
    """
    lookbacks = {
        TrendWindow.INTRADAY.value: timedelta(hours=24),
        TrendWindow.ONE_DAY.value: timedelta(days=2),
        TrendWindow.SEVEN_DAY.value: timedelta(days=8),
        TrendWindow.THIRTY_DAY.value: timedelta(days=35),
        TrendWindow.NINETY_DAY.value: timedelta(days=95),
    }
    try:
        return lookbacks[window]
    except KeyError:
        return timedelta(days=8)
|
||||||
@@ -0,0 +1,285 @@
|
|||||||
|
"""Recency decay, source credibility weighting, and market context
|
||||||
|
integration for aggregation.
|
||||||
|
|
||||||
|
Provides scoring functions used by the aggregation engine to weight
|
||||||
|
document intelligence signals when computing trend summaries.
|
||||||
|
|
||||||
|
Requirements: 6.1, 6.2, 6.5
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import math
|
||||||
|
from dataclasses import dataclass, field
|
||||||
|
from datetime import datetime, timezone
|
||||||
|
|
||||||
|
from services.shared.schemas import MarketContext
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass(frozen=True)
class ScoringConfig:
    """Immutable bundle of tunable knobs for document signal scoring."""

    # Exponential half-life, in hours, keyed by trend window. A document one
    # half-life old carries a recency weight of 0.5.
    half_life_hours: dict[str, float] = field(
        default_factory=lambda: {
            "intraday": 2.0,
            "1d": 12.0,
            "7d": 72.0,
            "30d": 240.0,
            "90d": 720.0,
        }
    )

    # Lower bound on the recency weight, so very old documents still
    # contribute a trace-level signal instead of vanishing.
    min_recency_weight: float = 0.01

    # Credibility scores are clamped into [floor, ceiling] before weighting.
    credibility_floor: float = 0.1
    credibility_ceiling: float = 1.0

    # Power applied to the clamped credibility. Values above 1 punish
    # low-credibility sources harder; values below 1 flatten the curve.
    credibility_exponent: float = 1.0

    # Largest novelty bonus: novelty 1.0 earns all of it, novelty 0.0 none.
    novelty_bonus_max: float = 0.25

    # Documents whose extraction confidence is below this get zero weight.
    confidence_floor: float = 0.2

    # --- Market context modulation ---
    # Volatility (in price units) above this threshold boosts weights,
    # because fresh data matters more in fast-moving markets.
    volatility_recency_boost_threshold: float = 1.0
    volatility_recency_boost_max: float = 0.30  # cap on the extra multiplier

    # A volume change above this percentage adds a flat boost, because
    # high-volume moves carry more conviction.
    volume_surge_threshold_pct: float = 50.0
    volume_surge_boost: float = 0.15
|
||||||
|
|
||||||
|
|
||||||
|
# Singleton default config
|
||||||
|
DEFAULT_CONFIG = ScoringConfig()
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Recency decay
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
def recency_weight(
    published_at: datetime,
    reference_time: datetime,
    window: str,
    config: ScoringConfig = DEFAULT_CONFIG,
) -> float:
    """Compute an exponential recency decay weight for a document.

    Uses the formula: w = 2^(-age_hours / half_life), floored at
    ``config.min_recency_weight``.

    Args:
        published_at: When the document was published. Naive values are
            treated as UTC.
        reference_time: The "now" anchor for the aggregation window. Naive
            values are treated as UTC.
        window: Trend window value (e.g. "7d"). Windows missing from
            ``config.half_life_hours`` use a 72-hour half-life.
        config: Scoring parameters.

    Returns:
        1.0 for documents published at or after ``reference_time``;
        otherwise the decayed weight, never below
        ``config.min_recency_weight``. A configured half-life that is zero
        or negative yields ``config.min_recency_weight``.
    """
    # Subtracting a naive from an aware datetime raises, so normalise first.
    if published_at.tzinfo is None:
        published_at = published_at.replace(tzinfo=timezone.utc)
    if reference_time.tzinfo is None:
        reference_time = reference_time.replace(tzinfo=timezone.utc)

    age_seconds = (reference_time - published_at).total_seconds()
    if age_seconds <= 0:
        return 1.0

    half_life = config.half_life_hours.get(window, 72.0)
    if half_life <= 0:
        # A zero half-life would raise ZeroDivisionError and a negative one
        # would make weights grow with age. Treat both as instant decay.
        return config.min_recency_weight

    age_hours = age_seconds / 3600.0
    weight = math.pow(2.0, -age_hours / half_life)
    return max(weight, config.min_recency_weight)
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Source credibility weighting
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
def credibility_weight(
    source_credibility: float,
    config: ScoringConfig = DEFAULT_CONFIG,
) -> float:
    """Turn a source credibility score into a weighting factor.

    The score is clamped to ``[credibility_floor, credibility_ceiling]`` and
    then raised to ``credibility_exponent``.

    Args:
        source_credibility: Credibility score of the source or document
            intelligence record, nominally in 0-1.
        config: Scoring parameters.

    Returns:
        The clamped score raised to the exponent. For a positive exponent
        this lies in ``[floor^exp, ceiling^exp]``.
    """
    lower = config.credibility_floor
    upper = config.credibility_ceiling
    bounded = max(lower, min(source_credibility, upper))
    return math.pow(bounded, config.credibility_exponent)
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Market context adjustment
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
def market_context_multiplier(
    market_ctx: MarketContext | None,
    config: ScoringConfig = DEFAULT_CONFIG,
) -> float:
    """Derive a weight multiplier from market context features.

    Args:
        market_ctx: Market features for the symbol, or ``None``.
        config: Scoring parameters.

    Returns:
        1.0 when there is no context or it reports no data. Otherwise 1.0
        plus a volatility boost (log-scaled excess over the threshold,
        capped at ``volatility_recency_boost_max``) plus a flat
        ``volume_surge_boost`` when the volume change exceeds
        ``volume_surge_threshold_pct``.
    """
    if market_ctx is None or not market_ctx.has_data:
        return 1.0

    extra = 0.0

    # Fresh signals matter more in volatile markets.
    volatility = market_ctx.volatility
    vol_threshold = config.volatility_recency_boost_threshold
    if volatility is not None and volatility > vol_threshold:
        # log1p keeps extreme volatility from inflating the weight.
        scaled = math.log1p(volatility - vol_threshold) * 0.15
        extra += min(scaled, config.volatility_recency_boost_max)

    # A flat bump when volume surges past the configured percentage.
    volume_change = market_ctx.volume_change_pct
    if volume_change is not None and volume_change > config.volume_surge_threshold_pct:
        extra += config.volume_surge_boost

    return 1.0 + extra
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Combined document signal weight
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class SignalWeight:
    """Component-by-component breakdown of one document's aggregation weight."""

    recency: float  # exponential age decay factor
    credibility: float  # clamped source credibility raised to the exponent
    novelty_bonus: float  # additive bonus derived from the novelty score
    confidence_gate: float  # 0.0 or 1.0
    market_ctx_multiplier: float  # >= 1.0
    combined: float  # product of the components above
|
||||||
|
|
||||||
|
|
||||||
|
def compute_signal_weight(
    published_at: datetime,
    reference_time: datetime,
    window: str,
    source_credibility: float,
    novelty_score: float = 0.5,
    extraction_confidence: float = 0.5,
    market_ctx: MarketContext | None = None,
    config: ScoringConfig = DEFAULT_CONFIG,
) -> SignalWeight:
    """Score one document signal and return the full weight breakdown.

    The combined weight is::

        confidence_gate * recency * credibility
            * (1 + novelty_bonus) * market_ctx_multiplier

    with ``novelty_bonus = novelty_score * config.novelty_bonus_max``. The
    gate is 0.0 when ``extraction_confidence`` is below
    ``config.confidence_floor``, which zeroes the combined weight.

    Args:
        published_at: Document publication time.
        reference_time: Aggregation anchor time.
        window: Trend window identifier.
        source_credibility: Source credibility score (0-1).
        novelty_score: Document novelty score (0-1).
        extraction_confidence: Extraction confidence from the model (0-1).
        market_ctx: Optional market context features for the symbol.
        config: Scoring parameters.

    Returns:
        A ``SignalWeight`` holding every component and the combined score.
    """
    passes_gate = extraction_confidence >= config.confidence_floor
    gate_factor = 1.0 if passes_gate else 0.0

    recency_factor = recency_weight(published_at, reference_time, window, config)
    credibility_factor = credibility_weight(source_credibility, config)
    novelty_bonus = novelty_score * config.novelty_bonus_max
    market_factor = market_context_multiplier(market_ctx, config)

    return SignalWeight(
        recency=recency_factor,
        credibility=credibility_factor,
        novelty_bonus=novelty_bonus,
        confidence_gate=gate_factor,
        market_ctx_multiplier=market_factor,
        combined=(
            gate_factor * recency_factor * credibility_factor
            * (1.0 + novelty_bonus) * market_factor
        ),
    )
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Batch helpers
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class WeightedSignal:
    """One document's sentiment signal together with its computed weight."""

    document_id: str  # identifier of the source document
    weight: SignalWeight  # breakdown produced by the scoring step
    sentiment_value: float  # numeric sentiment: +1 positive, -1 negative, 0 neutral/mixed
    impact_score: float  # impact magnitude, multiplied into the weight
|
||||||
|
|
||||||
|
|
||||||
|
def sentiment_to_numeric(sentiment: str) -> float:
    """Convert a sentiment label into a signed number.

    Matching is case-insensitive.

    Args:
        sentiment: Sentiment label.

    Returns:
        1.0 for "positive", -1.0 for "negative", and 0.0 for "neutral",
        "mixed", or any unrecognised label.
    """
    label = sentiment.lower()
    if label == "positive":
        return 1.0
    if label == "negative":
        return -1.0
    # "neutral", "mixed" and unknown labels all carry no direction.
    return 0.0
|
||||||
|
|
||||||
|
|
||||||
|
def weighted_sentiment_average(signals: list[WeightedSignal]) -> float:
    """Average sentiment across signals, weighted by strength.

    Each signal's weight is its combined scoring weight multiplied by its
    impact score.

    Args:
        signals: Weighted signals to average.

    Returns:
        The weighted mean sentiment, or 0.0 when the weights sum to zero
        (including an empty list). With non-negative weights and sentiment
        values in [-1, 1], the result is also in [-1, 1].
    """
    numerator = 0.0
    denominator = 0.0
    for signal in signals:
        strength = signal.weight.combined * signal.impact_score
        numerator += strength * signal.sentiment_value
        denominator += strength

    if denominator == 0.0:
        return 0.0
    return numerator / denominator
|
||||||
@@ -1 +1,650 @@
|
|||||||
"""Aggregation worker - rolling trend summaries, contradiction detection, evidence ranking."""
|
"""Aggregation worker - company-level rolling window trend summaries.
|
||||||
|
|
||||||
|
Queries document intelligence and market context for a given ticker,
|
||||||
|
computes weighted signal scores, and produces TrendSummary objects
|
||||||
|
persisted to the trend_windows table.
|
||||||
|
|
||||||
|
Requirements: 6.1, 6.2, 6.5
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import json
|
||||||
|
import logging
|
||||||
|
import time
|
||||||
|
from dataclasses import dataclass
|
||||||
|
from datetime import datetime, timedelta, timezone
|
||||||
|
from typing import Any
|
||||||
|
|
||||||
|
import asyncpg
|
||||||
|
|
||||||
|
from services.aggregation.contradiction import CatalystEntry, detect_contradictions
|
||||||
|
from services.aggregation.evidence import (
|
||||||
|
EvidenceRankConfig,
|
||||||
|
RankedEvidence,
|
||||||
|
rank_evidence as _rank_evidence_composite,
|
||||||
|
rank_evidence_detailed,
|
||||||
|
)
|
||||||
|
from services.aggregation.market_context import fetch_market_context
|
||||||
|
from services.aggregation.scoring import (
|
||||||
|
ScoringConfig,
|
||||||
|
WeightedSignal,
|
||||||
|
compute_signal_weight,
|
||||||
|
sentiment_to_numeric,
|
||||||
|
weighted_sentiment_average,
|
||||||
|
)
|
||||||
|
from services.shared.schemas import TrendDirection, TrendSummary, TrendWindow
|
||||||
|
from services.shared.metrics import (
|
||||||
|
AGGREGATION_CONTRADICTION_SCORE,
|
||||||
|
AGGREGATION_DURATION,
|
||||||
|
AGGREGATION_SIGNALS_PROCESSED,
|
||||||
|
AGGREGATION_WINDOWS_COMPUTED,
|
||||||
|
)
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
# Map TrendWindow values to lookback durations.
|
||||||
|
WINDOW_DURATIONS: dict[str, timedelta] = {
    trend_window.value: span
    for trend_window, span in (
        (TrendWindow.INTRADAY, timedelta(hours=12)),
        (TrendWindow.ONE_DAY, timedelta(days=1)),
        (TrendWindow.SEVEN_DAY, timedelta(days=7)),
        (TrendWindow.THIRTY_DAY, timedelta(days=30)),
        (TrendWindow.NINETY_DAY, timedelta(days=90)),
    )
}

# Cap on the number of evidence document IDs kept in each of the
# supporting and opposing lists.
MAX_EVIDENCE_REFS = 10
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class AggregationConfig:
    """Selects the windows to compute and the scoring parameters to use."""

    # Windows to compute. None (or an empty list) means every TrendWindow.
    windows: list[str] | None = None
    # Scoring parameters. None means a default ScoringConfig.
    scoring: ScoringConfig | None = None
    # Maximum number of evidence references to keep.
    max_evidence: int = MAX_EVIDENCE_REFS

    def effective_windows(self) -> list[str]:
        """Return the configured windows, or all TrendWindow values if unset."""
        if not self.windows:
            return [member.value for member in TrendWindow]
        return self.windows

    def effective_scoring(self) -> ScoringConfig:
        """Return the configured scoring parameters, or fresh defaults."""
        if self.scoring:
            return self.scoring
        return ScoringConfig()
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Fetch impact records for a ticker within a time window
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
_IMPACT_QUERY = """
|
||||||
|
SELECT
|
||||||
|
di.document_id,
|
||||||
|
di.confidence,
|
||||||
|
di.novelty_score,
|
||||||
|
di.source_credibility,
|
||||||
|
dir.sentiment,
|
||||||
|
dir.impact_score,
|
||||||
|
dir.catalyst_type,
|
||||||
|
dir.key_facts,
|
||||||
|
dir.risks,
|
||||||
|
d.published_at
|
||||||
|
FROM document_impact_records dir
|
||||||
|
JOIN document_intelligence di ON di.id = dir.intelligence_id
|
||||||
|
JOIN documents d ON d.id = di.document_id
|
||||||
|
WHERE dir.ticker = $1
|
||||||
|
AND d.published_at >= $2
|
||||||
|
AND d.published_at <= $3
|
||||||
|
AND di.validation_status = 'valid'
|
||||||
|
AND d.status != 'rejected'
|
||||||
|
ORDER BY d.published_at DESC
|
||||||
|
"""
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class ImpactRow:
    """One decoded result row of the impact query."""

    document_id: str  # source document identifier
    confidence: float  # extraction confidence of the intelligence record
    novelty_score: float  # novelty of the document
    source_credibility: float  # credibility of the document's source
    sentiment: str  # sentiment label for the ticker
    impact_score: float  # impact magnitude for the ticker
    catalyst_type: str  # catalyst category
    key_facts: list[str]  # decoded key_facts column
    risks: list[str]  # decoded risks column
    published_at: datetime  # document publication timestamp
|
||||||
|
|
||||||
|
|
||||||
|
def _parse_impact_row(row: Any) -> ImpactRow:
    """Convert a database record from the impact query into an ImpactRow.

    Numeric defaults apply only to NULL columns. A stored ``0.0`` is a real
    value and is kept: a zero-confidence extraction must stay below the
    confidence floor rather than being promoted to the 0.5 default.

    Args:
        row: Record supporting item access by column name.

    Returns:
        The parsed row. NULL ``confidence``, ``novelty_score`` and
        ``source_credibility`` default to 0.5, NULL ``impact_score`` to 0.0,
        missing or empty ``sentiment`` to ``"neutral"`` and ``catalyst_type``
        to ``"other"``. ``key_facts`` and ``risks`` are decoded from JSON
        text when needed and replaced by ``[]`` when not a list.

    Raises:
        json.JSONDecodeError: If a text ``key_facts`` or ``risks`` value is
            not valid JSON.
    """

    def number(column: str, default: float) -> float:
        raw = row[column]
        # "is None", not truthiness: 0.0 is a legitimate stored score.
        return default if raw is None else float(raw)

    def json_list(column: str) -> list[Any]:
        raw = row[column]
        if isinstance(raw, str):
            raw = json.loads(raw)
        return raw if isinstance(raw, list) else []

    key_facts = json_list("key_facts")
    risks = json_list("risks")

    return ImpactRow(
        document_id=str(row["document_id"]),
        confidence=number("confidence", 0.5),
        novelty_score=number("novelty_score", 0.5),
        source_credibility=number("source_credibility", 0.5),
        sentiment=row["sentiment"] or "neutral",
        impact_score=number("impact_score", 0.0),
        catalyst_type=row["catalyst_type"] or "other",
        key_facts=key_facts,
        risks=risks,
        published_at=row["published_at"],
    )
|
||||||
|
|
||||||
|
|
||||||
|
async def fetch_impact_records(
    pool: asyncpg.Pool,
    ticker: str,
    window_start: datetime,
    window_end: datetime,
) -> list[ImpactRow]:
    """Load the impact records of one ticker published within a time range.

    Runs ``_IMPACT_QUERY``, which keeps only intelligence rows marked
    ``valid`` on documents that are not ``rejected`` and bounds
    ``published_at`` inclusively on both ends.

    Args:
        pool: Connection pool used to run the query.
        ticker: Ticker symbol bound to ``$1``.
        window_start: Inclusive lower bound on ``published_at``.
        window_end: Inclusive upper bound on ``published_at``.

    Returns:
        Parsed rows, newest publication first.
    """
    records = await pool.fetch(_IMPACT_QUERY, ticker, window_start, window_end)
    parsed: list[ImpactRow] = []
    for record in records:
        parsed.append(_parse_impact_row(record))
    return parsed
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Build weighted signals from impact records
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
def build_weighted_signals(
    impacts: list[ImpactRow],
    reference_time: datetime,
    window: str,
    market_ctx: Any | None = None,
    config: ScoringConfig | None = None,
) -> list[WeightedSignal]:
    """Score every impact record and wrap it as a WeightedSignal.

    Args:
        impacts: Parsed impact rows for one ticker.
        reference_time: Aggregation anchor passed to the recency decay.
        window: Trend window identifier.
        market_ctx: Optional market context applied to every record.
        config: Scoring parameters; a default ``ScoringConfig`` when omitted.

    Returns:
        One ``WeightedSignal`` per impact row, in input order.
    """
    scoring = config if config else ScoringConfig()
    return [
        WeightedSignal(
            document_id=impact.document_id,
            weight=compute_signal_weight(
                published_at=impact.published_at,
                reference_time=reference_time,
                window=window,
                source_credibility=impact.source_credibility,
                novelty_score=impact.novelty_score,
                extraction_confidence=impact.confidence,
                market_ctx=market_ctx,
                config=scoring,
            ),
            sentiment_value=sentiment_to_numeric(impact.sentiment),
            impact_score=impact.impact_score,
        )
        for impact in impacts
    ]
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Derive trend direction from weighted sentiment
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
# Thresholds for mapping numeric sentiment to direction.
|
||||||
|
# Average sentiment at or above this value reads as bullish.
BULLISH_THRESHOLD = 0.15
# Average sentiment at or below this value reads as bearish.
BEARISH_THRESHOLD = -0.15
# A contradiction score above this value makes a weak lean read as mixed.
MIXED_THRESHOLD = 0.10
|
||||||
|
|
||||||
|
|
||||||
|
def derive_trend_direction(
|
||||||
|
avg_sentiment: float,
|
||||||
|
contradiction_score: float = 0.0,
|
||||||
|
) -> TrendDirection:
|
||||||
|
"""Map a weighted average sentiment to a TrendDirection.
|
||||||
|
|
||||||
|
If contradiction is high, the direction is MIXED regardless of
|
||||||
|
the average sentiment value.
|
||||||
|
"""
|
||||||
|
if contradiction_score > MIXED_THRESHOLD and abs(avg_sentiment) < 0.3:
|
||||||
|
return TrendDirection.MIXED
|
||||||
|
if avg_sentiment >= BULLISH_THRESHOLD:
|
||||||
|
return TrendDirection.BULLISH
|
||||||
|
if avg_sentiment <= BEARISH_THRESHOLD:
|
||||||
|
return TrendDirection.BEARISH
|
||||||
|
return TrendDirection.NEUTRAL
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Compute contradiction score
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
def compute_contradiction_score(signals: list[WeightedSignal]) -> float:
    """Measure how much disagreement exists among weighted signals.

    Each signal contributes ``weight.combined * impact_score`` to the
    positive side (``sentiment_value > 0``) or the negative side
    (``sentiment_value < 0``); signals with a sentiment of exactly zero are
    ignored.  The score is the minority side's share of the total:

        ``min(pos, neg) / (pos + neg)``

    so it ranges over ``[0, 0.5]``: ``0`` means every weighted signal is on
    one side, ``0.5`` means the two sides carry equal weight.  (An earlier
    version of this docstring described a minority/majority ratio in
    ``[0, 1]``; the computation has always been the share of the total, and
    it is kept unchanged so existing thresholds keep their meaning.)

    Args:
        signals: Weighted signals to compare.

    Returns:
        The disagreement score rounded to 4 decimal places, or ``0.0`` when
        there are no signals or no positive/negative weight at all.
    """
    if not signals:
        return 0.0

    positive = 0.0
    negative = 0.0
    for sig in signals:
        contribution = sig.weight.combined * sig.impact_score
        if sig.sentiment_value > 0:
            positive += contribution
        elif sig.sentiment_value < 0:
            negative += contribution

    total = positive + negative
    if total == 0.0:
        # Only neutral (or zero-weight) signals: nothing to disagree about.
        return 0.0

    return round(min(positive, negative) / total, 4)
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Rank evidence (supporting vs opposing)
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
def rank_evidence(
    signals: list[WeightedSignal],
    max_refs: int = MAX_EVIDENCE_REFS,
) -> tuple[list[str], list[str]]:
    """Return the top supporting and opposing document IDs for *signals*.

    This is a thin wrapper: it builds an ``EvidenceRankConfig`` carrying
    ``max_refs`` and hands the work to ``_rank_evidence_composite``.  The
    ranking criteria themselves live in that helper, not here.

    Args:
        signals: Weighted signals to rank.
        max_refs: Value passed as ``EvidenceRankConfig.max_refs``.

    Returns:
        A ``(supporting_ids, opposing_ids)`` pair, exactly as produced by
        ``_rank_evidence_composite``.
    """
    return _rank_evidence_composite(signals, EvidenceRankConfig(max_refs=max_refs))
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Extract dominant catalysts and material risks
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
def extract_catalysts_and_risks(
    impacts: list[ImpactRow],
    signals: list[WeightedSignal],
) -> tuple[list[str], list[str]]:
    """Pick the dominant catalyst types and the most heavily weighted risks.

    A document's weight is ``weight.combined * impact_score`` of its signal
    (the last signal wins if a document ID repeats).  Impact records whose
    document has no signal, or a weight of zero or less, are ignored.

    Args:
        impacts: Impact records supplying ``catalyst_type`` and ``risks``.
        signals: Weighted signals supplying the per-document weight.

    Returns:
        ``(catalysts, risks)`` where ``catalysts`` holds at most five
        catalyst types ordered by cumulative weight, and ``risks`` holds at
        most five whitespace-stripped risk strings ordered by the weight of
        the document that raised them, de-duplicated case-insensitively.
    """
    doc_weight: dict[str, float] = {}
    for sig in signals:
        doc_weight[sig.document_id] = sig.weight.combined * sig.impact_score

    totals: dict[str, float] = {}
    weighted_risks: list[tuple[float, str]] = []
    for record in impacts:
        weight = doc_weight.get(record.document_id, 0.0)
        if weight <= 0.0:
            continue
        totals[record.catalyst_type] = totals.get(record.catalyst_type, 0.0) + weight
        weighted_risks.extend((weight, text) for text in record.risks)

    # Heaviest catalyst types first; ties keep first-seen order.
    catalysts = sorted(totals, key=totals.get, reverse=True)[:5]

    # Walk risks from heaviest to lightest, keeping the first spelling of
    # each distinct (case-insensitive, stripped) risk.
    risks: list[str] = []
    seen: set[str] = set()
    for _, text in sorted(weighted_risks, key=lambda entry: entry[0], reverse=True):
        cleaned = text.strip()
        key = cleaned.lower()
        if key in seen:
            continue
        seen.add(key)
        risks.append(cleaned)
        if len(risks) >= 5:
            break

    return catalysts, risks
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Compute trend confidence
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
def compute_trend_confidence(
    signals: list[WeightedSignal],
    contradiction_score: float,
) -> float:
    """Derive an overall confidence in ``[0, 1]`` for a trend summary.

    Only signals with ``weight.combined > 0`` contribute.  The result is

        ``0.4 * count_factor + 0.6 * mean_credibility - 0.4 * contradiction_score``

    clamped to ``[0, 1]`` and rounded to 4 decimals, where ``count_factor``
    grows linearly with the number of contributing signals and saturates at
    20 of them.

    NOTE(review): the averaged term is read from ``weight.credibility``,
    while the original comments described it as extraction confidence —
    confirm which component of the weight breakdown is intended.

    Args:
        signals: Weighted signals behind the trend.
        contradiction_score: Disagreement score; reduces confidence.

    Returns:
        The confidence value, or ``0.0`` when no signal contributes.
    """
    contributing = [sig for sig in signals if sig.weight.combined > 0] if signals else []
    if not contributing:
        return 0.0

    n_contributing = len(contributing)
    count_factor = min(n_contributing / 20.0, 1.0)
    mean_credibility = sum(sig.weight.credibility for sig in contributing) / n_contributing
    penalty = contradiction_score * 0.4

    raw = (0.4 * count_factor + 0.6 * mean_credibility) - penalty
    return round(max(0.0, min(1.0, raw)), 4)
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Assemble a TrendSummary from components
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class AssembledTrend:
    """Bundle of a ``TrendSummary`` and the ranked evidence behind it."""

    # The assembled trend summary.
    summary: TrendSummary
    # Ranked evidence entries on the supporting side.
    supporting_evidence: list[RankedEvidence]
    # Ranked evidence entries on the opposing side.
    opposing_evidence: list[RankedEvidence]
|
||||||
|
|
||||||
|
|
||||||
|
def assemble_trend_summary(
    ticker: str,
    window: str,
    signals: list[WeightedSignal],
    impacts: list[ImpactRow],
    market_ctx: Any | None = None,
    max_evidence: int = MAX_EVIDENCE_REFS,
    reference_time: datetime | None = None,
) -> TrendSummary:
    """Build a ``TrendSummary`` from weighted signals and impact records.

    Convenience wrapper around ``assemble_trend_with_evidence`` for callers
    that do not need the detailed evidence rankings: all arguments are
    forwarded unchanged and only the ``summary`` part is returned.
    """
    return assemble_trend_with_evidence(
        ticker=ticker,
        window=window,
        signals=signals,
        impacts=impacts,
        market_ctx=market_ctx,
        max_evidence=max_evidence,
        reference_time=reference_time,
    ).summary
|
||||||
|
|
||||||
|
|
||||||
|
def assemble_trend_with_evidence(
    ticker: str,
    window: str,
    signals: list[WeightedSignal],
    impacts: list[ImpactRow],
    market_ctx: Any | None = None,
    max_evidence: int = MAX_EVIDENCE_REFS,
    reference_time: datetime | None = None,
) -> AssembledTrend:
    """Build a ``TrendSummary`` together with its detailed evidence rankings.

    Args:
        ticker: Stored as the summary's ``entity_id`` (entity type is always
            ``"company"``).
        window: Window identifier; converted with ``TrendWindow(window)``.
        signals: Weighted signals for the window.
        impacts: Impact records for the window (catalysts and risks).
        market_ctx: Optional market context stored on the summary as-is.
        max_evidence: Passed as ``EvidenceRankConfig.max_refs``.
        reference_time: Stored as ``generated_at``; defaults to the current
            UTC time.

    Returns:
        An ``AssembledTrend`` holding the summary plus the supporting and
        opposing ``RankedEvidence`` lists the summary's ID lists come from.
    """
    generated_at = reference_time
    if generated_at is None:
        generated_at = datetime.now(timezone.utc)

    mean_sentiment = weighted_sentiment_average(signals)

    # Run full contradiction detection (Requirement 6.4)
    contradictions = detect_contradictions(
        signals,
        [
            CatalystEntry(document_id=record.document_id, catalyst_type=record.catalyst_type)
            for record in impacts
        ],
    )
    contradiction_score = contradictions.score

    trend_direction = derive_trend_direction(mean_sentiment, contradiction_score)
    trend_confidence = compute_trend_confidence(signals, contradiction_score)

    # Detailed rankings are returned for persistence; the summary itself
    # only carries the document IDs.
    supporting_ranked, opposing_ranked = rank_evidence_detailed(
        signals, EvidenceRankConfig(max_refs=max_evidence),
    )
    supporting_ids = [entry.document_id for entry in supporting_ranked]
    opposing_ids = [entry.document_id for entry in opposing_ranked]

    dominant_catalysts, material_risks = extract_catalysts_and_risks(impacts, signals)

    # Trend strength: magnitude of the weighted sentiment, capped at 1.
    trend_strength = round(min(abs(mean_sentiment), 1.0), 4)

    return AssembledTrend(
        summary=TrendSummary(
            entity_type="company",
            entity_id=ticker,
            window=TrendWindow(window),
            trend_direction=trend_direction,
            trend_strength=trend_strength,
            confidence=trend_confidence,
            top_supporting_evidence=supporting_ids,
            top_opposing_evidence=opposing_ids,
            dominant_catalysts=dominant_catalysts,
            material_risks=material_risks,
            contradiction_score=contradiction_score,
            disagreement_details=contradictions.details,
            market_context=market_ctx,
            generated_at=generated_at,
        ),
        supporting_evidence=supporting_ranked,
        opposing_evidence=opposing_ranked,
    )
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Persist trend summary to PostgreSQL
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
# NOTE: despite the name this is a plain INSERT (no ON CONFLICT clause);
# every call adds a new trend_windows row and returns its id.
_UPSERT_TREND = """
INSERT INTO trend_windows (
entity_type, entity_id, window, trend_direction, trend_strength,
confidence, top_supporting_evidence, top_opposing_evidence,
dominant_catalysts, material_risks, contradiction_score,
disagreement_details, market_context, generated_at
) VALUES (
$1, $2, $3, $4, $5,
$6, $7::jsonb, $8::jsonb,
$9::jsonb, $10::jsonb, $11,
$12::jsonb, $13::jsonb, $14
)
RETURNING id
"""


async def persist_trend_summary(
    pool: asyncpg.Pool,
    summary: TrendSummary,
) -> str:
    """Insert *summary* into ``trend_windows`` and return the new row id.

    List-valued fields, the disagreement details and the market context are
    serialised with ``json.dumps`` and cast to ``jsonb`` by the statement;
    a missing market context is stored as an empty JSON object.

    Args:
        pool: Object providing ``fetchrow`` (an asyncpg pool).
        summary: The trend summary to store.

    Returns:
        The ``id`` returned by the INSERT, converted with ``str()``.
    """
    context = summary.market_context
    details = [detail.model_dump() for detail in summary.disagreement_details]

    # Positional parameters, in the order of the $1..$14 placeholders.
    params = (
        summary.entity_type,
        summary.entity_id,
        summary.window.value,
        summary.trend_direction.value,
        summary.trend_strength,
        summary.confidence,
        json.dumps(summary.top_supporting_evidence),
        json.dumps(summary.top_opposing_evidence),
        json.dumps(summary.dominant_catalysts),
        json.dumps(summary.material_risks),
        summary.contradiction_score,
        json.dumps(details),
        json.dumps(context.model_dump() if context else {}),
        summary.generated_at,
    )
    inserted = await pool.fetchrow(_UPSERT_TREND, *params)
    return str(inserted["id"])
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Persist evidence mappings to trend_evidence table
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
_INSERT_EVIDENCE = """
INSERT INTO trend_evidence (
trend_window_id, document_id, evidence_type,
rank_score, weight_component, impact_component,
recency_component, confidence_component, sentiment_value
) VALUES (
$1, $2::uuid, $3,
$4, $5, $6,
$7, $8, $9
)
"""


async def persist_trend_evidence(
    pool: asyncpg.Pool,
    trend_window_id: str,
    supporting: list[RankedEvidence],
    opposing: list[RankedEvidence],
) -> int:
    """Insert one ``trend_evidence`` row per ranked evidence entry.

    Supporting entries are written with ``evidence_type`` ``"supporting"``
    and come first, followed by the ``"opposing"`` ones.  Nothing is sent
    to the database when both lists are empty.

    Args:
        pool: Object providing ``executemany`` (an asyncpg pool).
        trend_window_id: ID of the owning trend window row.
        supporting: Ranked evidence on the supporting side.
        opposing: Ranked evidence on the opposing side.

    Returns:
        The number of rows submitted for insertion.
    """
    labelled = [("supporting", ev) for ev in supporting]
    labelled += [("opposing", ev) for ev in opposing]

    rows: list[tuple[str, str, str, float, float, float, float, float, float]] = [
        (
            trend_window_id,
            ev.document_id,
            evidence_type,
            ev.rank_score,
            ev.weight_component,
            ev.impact_component,
            ev.recency_component,
            ev.confidence_component,
            ev.sentiment_value,
        )
        for evidence_type, ev in labelled
    ]
    if rows:
        await pool.executemany(_INSERT_EVIDENCE, rows)
    return len(rows)
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Main aggregation entry point for a single ticker + window
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
async def aggregate_company_window(
    pool: asyncpg.Pool,
    ticker: str,
    window: str,
    reference_time: datetime | None = None,
    config: AggregationConfig | None = None,
) -> TrendSummary:
    """Compute and persist a trend summary for one ticker and one window.

    Steps:
      1. Resolve the window's time range (unknown windows fall back to
         seven days) ending at ``reference_time``.
      2. Fetch document impact records for that range.
      3. Fetch market context for the ticker.
      4. Build weighted signals using the scoring configuration.
      5. Assemble the ``TrendSummary`` and its evidence rankings; the market
         context is only attached when it reports ``has_data``.
      6. Persist the summary, then its evidence mappings.
      7. Log the outcome and update the Prometheus metrics.

    Args:
        pool: asyncpg pool used for all reads and writes.
        ticker: Company ticker to aggregate.
        window: Window identifier (key of ``WINDOW_DURATIONS``).
        reference_time: End of the window; defaults to the current UTC time.
        config: Aggregation settings; defaults to ``AggregationConfig()``.

    Returns:
        The assembled (and persisted) ``TrendSummary``.
    """
    settings = config or AggregationConfig()
    scoring = settings.effective_scoring()

    if reference_time is None:
        reference_time = datetime.now(timezone.utc)

    started = time.monotonic()
    window_start = reference_time - WINDOW_DURATIONS.get(window, timedelta(days=7))

    # Inputs: impact records and market context for the window.
    impacts = await fetch_impact_records(pool, ticker, window_start, reference_time)
    market_ctx = await fetch_market_context(pool, ticker, window, reference_time)

    signals = build_weighted_signals(
        impacts, reference_time, window, market_ctx, scoring,
    )

    assembled = assemble_trend_with_evidence(
        ticker=ticker,
        window=window,
        signals=signals,
        impacts=impacts,
        market_ctx=market_ctx if market_ctx.has_data else None,
        max_evidence=settings.max_evidence,
        reference_time=reference_time,
    )
    summary = assembled.summary

    # Persist the trend row first; evidence rows reference its id.
    trend_id = await persist_trend_summary(pool, summary)
    evidence_count = await persist_trend_evidence(
        pool,
        trend_id,
        assembled.supporting_evidence,
        assembled.opposing_evidence,
    )

    logger.info(
        "Persisted trend %s for %s/%s: direction=%s strength=%.3f confidence=%.3f signals=%d evidence=%d",
        trend_id,
        ticker,
        window,
        summary.trend_direction.value,
        summary.trend_strength,
        summary.confidence,
        len(signals),
        evidence_count,
    )

    # Prometheus metrics
    AGGREGATION_WINDOWS_COMPUTED.labels(window=window).inc()
    AGGREGATION_SIGNALS_PROCESSED.labels(window=window).inc(len(signals))
    AGGREGATION_CONTRADICTION_SCORE.observe(summary.contradiction_score)
    AGGREGATION_DURATION.labels(window=window).observe(time.monotonic() - started)

    return summary
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Aggregate all windows for a single ticker
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
async def aggregate_company(
    pool: asyncpg.Pool,
    ticker: str,
    reference_time: datetime | None = None,
    config: AggregationConfig | None = None,
) -> list[TrendSummary]:
    """Compute trend summaries for every configured window of a ticker.

    Windows come from ``config.effective_windows()`` and are processed one
    after another, all against the same ``reference_time`` (the current UTC
    time when not supplied).

    Returns:
        One ``TrendSummary`` per window, in window order.
    """
    settings = config or AggregationConfig()
    as_of = datetime.now(timezone.utc) if reference_time is None else reference_time

    return [
        await aggregate_company_window(pool, ticker, window, as_of, settings)
        for window in settings.effective_windows()
    ]
|
||||||
|
|||||||
+1507
-1
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,268 @@
|
|||||||
|
"""Ollama client wrapper using structured output format.
|
||||||
|
|
||||||
|
Sends documents to a local Ollama instance via the /api/chat endpoint
|
||||||
|
with the ``format`` parameter set to the extraction JSON schema, ensuring
|
||||||
|
the model returns schema-compliant JSON.
|
||||||
|
|
||||||
|
Includes retry logic for invalid or incomplete model responses with
|
||||||
|
exponential backoff, error classification, and full audit preservation.
|
||||||
|
|
||||||
|
Requirements: 5.1, 5.2, 5.4
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import asyncio
|
||||||
|
import json
|
||||||
|
import logging
|
||||||
|
import time
|
||||||
|
from dataclasses import dataclass, field
|
||||||
|
|
||||||
|
import httpx
|
||||||
|
|
||||||
|
from services.extractor.prompts import (
|
||||||
|
build_extraction_prompt,
|
||||||
|
get_json_schema,
|
||||||
|
get_prompt_metadata,
|
||||||
|
)
|
||||||
|
from services.extractor.schemas import ExtractionResult, ValidationReport, validate_extraction
|
||||||
|
from services.shared.config import OllamaConfig
|
||||||
|
|
||||||
|
logger = logging.getLogger("ollama_client")
|
||||||
|
|
||||||
|
# Errors that should NOT be retried — the request itself is bad.
|
||||||
|
_NON_RETRYABLE_ERRORS = frozenset({
|
||||||
|
"http_400",
|
||||||
|
"http_401",
|
||||||
|
"http_403",
|
||||||
|
"http_404",
|
||||||
|
"http_422",
|
||||||
|
})
|
||||||
|
|
||||||
|
|
||||||
|
def _is_retryable(error: str | None) -> bool:
|
||||||
|
"""Determine whether an extraction error warrants a retry."""
|
||||||
|
if error is None:
|
||||||
|
return False
|
||||||
|
return error not in _NON_RETRYABLE_ERRORS
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class ExtractionAttempt:
    """Audit record describing one call to the model."""

    # Model message content as received ("" when none was obtained).
    raw_output: str = ""
    # Schema validation report; stays None when validation never ran.
    validation: ValidationReport | None = None
    # Error code/message for a failed attempt; None when nothing failed.
    error: str | None = None
    # Time spent on the attempt, in milliseconds.
    duration_ms: int = 0
    # Name of the model that was called.
    model: str = ""
    # Whether a failure of this attempt may be retried.
    retryable: bool = True
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class ExtractionResponse:
    """Outcome of an extraction call together with every attempt made."""

    # True once an attempt produced a valid result.
    success: bool = False
    # Parsed extraction result of the successful attempt, if any.
    result: ExtractionResult | None = None
    # All attempts, in the order they were made.
    attempts: list[ExtractionAttempt] = field(default_factory=list)
    # Metadata describing the prompt that was used.
    prompt_metadata: dict[str, str] = field(default_factory=dict)
    # Name of the model that was called.
    model: str = ""
    # Wall-clock time for the whole call, in milliseconds.
    total_duration_ms: int = 0
|
||||||
|
|
||||||
|
|
||||||
|
def _compute_backoff(
|
||||||
|
attempt_num: int,
|
||||||
|
base_delay: float,
|
||||||
|
max_delay: float,
|
||||||
|
multiplier: float,
|
||||||
|
) -> float:
|
||||||
|
"""Compute exponential backoff delay for a given attempt number."""
|
||||||
|
delay = base_delay * (multiplier ** attempt_num)
|
||||||
|
return min(delay, max_delay)
|
||||||
|
|
||||||
|
|
||||||
|
class OllamaClient:
    """Async client for Ollama structured extraction.

    Usage::

        config = OllamaConfig(base_url="http://localhost:11434", model="llama3.1:8b")
        client = OllamaClient(config)
        response = await client.extract(
            document_text="Apple reported record earnings...",
            document_type="article",
            document_id="abc-123",
        )
        if response.success:
            print(response.result)

    The client may also be used with ``async with``; on exit ``close()`` is
    called, which closes the HTTP client only if this instance created it.
    """

    _config: OllamaConfig
    _max_retries: int
    _base_delay: float
    _max_delay: float
    _backoff_multiplier: float
    _owns_client: bool
    _http: httpx.AsyncClient

    def __init__(
        self,
        config: OllamaConfig,
        max_retries: int | None = None,
        http_client: httpx.AsyncClient | None = None,
    ) -> None:
        """Create a client.

        Args:
            config: Connection, model, timeout and retry settings.
            max_retries: Overrides ``config.max_retries`` when given.
            http_client: Existing HTTP client to use.  When omitted a new
                ``httpx.AsyncClient`` is created with ``config.timeout`` and
                is owned (and later closed) by this instance.
        """
        self._config = config
        self._max_retries = max_retries if max_retries is not None else config.max_retries
        self._base_delay = config.retry_base_delay
        self._max_delay = config.retry_max_delay
        self._backoff_multiplier = config.retry_backoff_multiplier
        self._owns_client = http_client is None
        if http_client is None:
            http_client = httpx.AsyncClient(timeout=config.timeout)
        self._http = http_client

    async def __aenter__(self) -> OllamaClient:
        """Enter ``async with``; returns the client itself."""
        return self

    async def __aexit__(self, *exc_info: object) -> None:
        """Leave ``async with``; releases the HTTP client via ``close()``."""
        await self.close()

    async def close(self) -> None:
        """Close the underlying HTTP client if we own it."""
        if self._owns_client:
            await self._http.aclose()

    async def extract(
        self,
        document_text: str,
        document_type: str = "article",
        document_id: str = "",
        known_tickers: list[str] | None = None,
    ) -> ExtractionResponse:
        """Send a document to Ollama for structured intelligence extraction.

        Retries up to ``max_retries`` times when the model returns invalid
        or incomplete JSON. Uses exponential backoff between retries.
        Non-retryable errors (e.g. HTTP 400) stop retries immediately.
        Each attempt and its validation result are preserved for audit.

        Args:
            document_text: Normalized text content of the document.
            document_type: One of article, filing, transcript, press_release.
            document_id: Optional document ID for traceability.
            known_tickers: Optional ticker hints for the model.

        Returns:
            An ``ExtractionResponse`` with the parsed result on success.
        """
        prompts = build_extraction_prompt(
            document_text=document_text,
            document_type=document_type,
            document_id=document_id,
            known_tickers=known_tickers,
        )
        json_schema = get_json_schema()
        prompt_meta = get_prompt_metadata()

        response = ExtractionResponse(
            prompt_metadata=prompt_meta,
            model=self._config.model,
        )

        total_start = time.monotonic()

        # One initial attempt plus up to ``max_retries`` retries.
        for attempt_num in range(self._max_retries + 1):
            attempt = await self._call_ollama(prompts, json_schema, document_text)
            response.attempts.append(attempt)

            if attempt.error is None and attempt.validation and attempt.validation.valid:
                response.success = True
                response.result = attempt.validation.parsed
                break

            # Check if the error is non-retryable — stop immediately
            if not _is_retryable(attempt.error):
                attempt.retryable = False
                logger.warning(
                    "Non-retryable error for doc %s: %s — stopping retries",
                    document_id or "unknown",
                    attempt.error,
                )
                break

            # Only sleep when another attempt will actually follow.
            if attempt_num < self._max_retries:
                delay = _compute_backoff(
                    attempt_num,
                    self._base_delay,
                    self._max_delay,
                    self._backoff_multiplier,
                )
                logger.warning(
                    "Extraction attempt %d/%d failed for doc %s: %s — retrying in %.1fs",
                    attempt_num + 1,
                    self._max_retries + 1,
                    document_id or "unknown",
                    attempt.error or "validation failed",
                    delay,
                )
                await asyncio.sleep(delay)

        response.total_duration_ms = int((time.monotonic() - total_start) * 1000)
        return response

    async def _call_ollama(
        self,
        prompts: dict[str, str],
        json_schema: dict[str, object],
        document_text: str = "",
    ) -> ExtractionAttempt:
        """Make a single call to the Ollama /api/chat endpoint.

        Never raises for transport, HTTP-status or malformed-response
        problems; those are reported through ``attempt.error``:

        * ``"timeout"`` – the request timed out.
        * ``"http_<status>"`` – the server answered with an error status.
        * ``"connection_error: ..."`` – any other ``httpx.HTTPError``.
        * ``"invalid_response_json"`` – the body is not JSON, is not a JSON
          object, or carries message content that is not a string.
        * ``"empty_model_response"`` – no message content was returned.
        * otherwise the joined schema-validation errors.

        Args:
            prompts: Mapping with the ``"system"`` and ``"user"`` prompts.
            json_schema: Schema sent as the request's ``format`` parameter.
            document_text: Source text forwarded to ``validate_extraction``.

        Returns:
            The populated ``ExtractionAttempt``.
        """
        attempt = ExtractionAttempt(model=self._config.model)
        start = time.monotonic()

        payload = {
            "model": self._config.model,
            "messages": [
                {"role": "system", "content": prompts["system"]},
                {"role": "user", "content": prompts["user"]},
            ],
            "format": json_schema,
            "stream": False,
        }

        try:
            resp = await self._http.post(
                f"{self._config.base_url}/api/chat",
                json=payload,
            )
            resp.raise_for_status()
        except httpx.TimeoutException:
            attempt.error = "timeout"
            return attempt
        except httpx.HTTPStatusError as exc:
            attempt.error = f"http_{exc.response.status_code}"
            attempt.retryable = _is_retryable(attempt.error)
            return attempt
        except httpx.HTTPError as exc:
            attempt.error = f"connection_error: {exc}"
            return attempt
        finally:
            # Record the request duration on every path, success or failure.
            attempt.duration_ms = int((time.monotonic() - start) * 1000)

        # Parse the Ollama response envelope.  ValueError covers
        # json.JSONDecodeError as well as undecodable bytes.
        try:
            body = resp.json()
        except ValueError:
            attempt.error = "invalid_response_json"
            attempt.raw_output = resp.text
            return attempt

        # A JSON body that is not an object (e.g. a list or a bare string)
        # cannot carry a message; report it instead of failing on ``.get``.
        if not isinstance(body, dict):
            attempt.error = "invalid_response_json"
            attempt.raw_output = resp.text
            return attempt

        msg = body.get("message")
        content = msg.get("content", "") if isinstance(msg, dict) else ""

        if not content:
            attempt.raw_output = content if isinstance(content, str) else ""
            attempt.error = "empty_model_response"
            return attempt

        if not isinstance(content, str):
            # The validator expects the model's JSON as text.
            attempt.error = "invalid_response_json"
            attempt.raw_output = resp.text
            return attempt

        attempt.raw_output = content

        # Validate against extraction schema
        attempt.validation = validate_extraction(content, document_text=document_text)
        if not attempt.validation.valid:
            attempt.error = "; ".join(attempt.validation.errors)

        return attempt
|
||||||
@@ -0,0 +1,72 @@
|
|||||||
|
"""Extractor worker entrypoint - polls Redis for extraction jobs."""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import asyncio
|
||||||
|
import logging
|
||||||
|
|
||||||
|
import asyncpg
|
||||||
|
from minio import Minio
|
||||||
|
|
||||||
|
from services.extractor.client import OllamaClient
|
||||||
|
from services.extractor.worker import persist_extraction
|
||||||
|
from services.shared.config import load_config
|
||||||
|
from services.shared.logging import setup_logging
|
||||||
|
from services.shared.redis_keys import QUEUE_EXTRACTION, queue_key
|
||||||
|
|
||||||
|
logger = logging.getLogger("extractor_main")
|
||||||
|
|
||||||
|
|
||||||
|
async def main() -> None:
    """Run the extractor worker loop.

    Loads configuration, connects to PostgreSQL, MinIO, Ollama and Redis,
    then polls the extraction queue forever: each job is a JSON object with
    ``document_id``, ``ticker`` and ``text``; the text is sent to Ollama and
    the outcome is handed to ``persist_extraction``.

    A malformed job payload or a failure while processing one job is logged
    and skipped so that a single bad job cannot stop the worker.  When the
    loop exits, the database pool, the Redis client and the Ollama HTTP
    client are closed.
    """
    import json

    import redis.asyncio as aioredis

    config = load_config()
    setup_logging("extractor", level=config.log_level, json_output=config.json_logs)

    pool = await asyncpg.create_pool(dsn=config.postgres.dsn, min_size=2, max_size=8)
    minio_client = Minio(
        config.minio.endpoint,
        access_key=config.minio.access_key,
        secret_key=config.minio.secret_key,
        secure=config.minio.secure,
    )
    ollama = OllamaClient(config.ollama)
    redis_client = aioredis.from_url(config.redis.url)

    queue = queue_key(QUEUE_EXTRACTION)
    logger.info("Extractor worker started, polling %s", queue)

    try:
        while True:
            raw = await redis_client.lpop(queue)
            if raw is None:
                # Queue is empty: wait a moment before polling again.
                await asyncio.sleep(1)
                continue

            # Decode the job defensively; the payload comes from outside
            # this process.
            try:
                job = json.loads(raw)
                if not isinstance(job, dict):
                    raise ValueError("job payload is not a JSON object")
            except ValueError:
                logger.exception("Discarding malformed extraction job: %r", raw)
                continue

            document_id = job.get("document_id", "")
            ticker = job.get("ticker", "")
            text = job.get("text", "")

            logger.info("Processing extraction job for doc %s / %s", document_id, ticker)

            try:
                # Forward the document ID so the client's logs and prompt
                # refer to this document rather than "unknown".
                extraction_response = await ollama.extract(text, document_id=document_id)
                await persist_extraction(
                    pool=pool,
                    minio_client=minio_client,
                    document_id=document_id,
                    ticker=ticker,
                    extraction_response=extraction_response,
                    document_text_length=len(text),
                )
            except Exception:
                # Top-level boundary of the worker: log and move on to the
                # next job.
                logger.exception("Extraction failed for doc %s", document_id)
    finally:
        await pool.close()
        await redis_client.close()
        await ollama.close()


if __name__ == "__main__":
    asyncio.run(main())
|
||||||
@@ -0,0 +1,250 @@
|
|||||||
|
"""Model performance metrics collection and persistence.
|
||||||
|
|
||||||
|
Tracks extraction success/failure rates, latency percentiles, retry counts,
|
||||||
|
validation error distributions, confidence scores, and token usage estimates.
|
||||||
|
Metrics are persisted to PostgreSQL for operational dashboards and published
|
||||||
|
to the analytical lake for Trino/Superset queries.
|
||||||
|
|
||||||
|
Requirements: 5.2, 5.4, 12.1, 12.2
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import json
|
||||||
|
import logging
|
||||||
|
from dataclasses import dataclass, field
|
||||||
|
from datetime import datetime, timezone
|
||||||
|
|
||||||
|
import asyncpg
|
||||||
|
|
||||||
|
from services.extractor.client import ExtractionResponse
|
||||||
|
|
||||||
|
logger = logging.getLogger("extractor_metrics")
|
||||||
|
|
||||||
|
# Rough token estimate: ~4 chars per token for English text
|
||||||
|
_CHARS_PER_TOKEN = 4
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class ExtractionMetrics:
    """Flat record of the measurements captured for one extraction run.

    Every field has a default, so an instance can be created empty and
    filled in incrementally. Field order is part of the positional
    constructor signature and must not be rearranged.
    """

    # --- Identity of the run ---
    document_id: str = ""
    ticker: str = ""
    model_name: str = ""
    prompt_version: str = ""
    schema_version: str = ""

    # --- Outcome and timing (durations in milliseconds) ---
    success: bool = False
    attempt_count: int = 0
    total_duration_ms: int = 0
    first_attempt_duration_ms: int = 0
    final_attempt_duration_ms: int = 0

    # --- Quality and validation ---
    confidence: float = 0.0
    validation_status: str = "unknown"
    validation_error_count: int = 0
    validation_warning_count: int = 0
    # A fresh list per instance; never shared between instances.
    validation_errors: list[str] = field(default_factory=list)
    retry_count: int = 0

    # --- Size estimates ---
    input_token_estimate: int = 0
    output_token_estimate: int = 0
    company_count: int = 0

    # Timezone-aware UTC timestamp taken when the instance is created.
    recorded_at: datetime = field(default_factory=lambda: datetime.now(timezone.utc))
|
||||||
|
|
||||||
|
|
||||||
|
def collect_metrics(
    extraction_response: ExtractionResponse,
    *,
    document_id: str = "",
    ticker: str = "",
    document_text_length: int = 0,
) -> ExtractionMetrics:
    """Derive an ExtractionMetrics record from an ExtractionResponse.

    Reads ``attempts``, ``success``, ``result``, ``model``,
    ``prompt_metadata`` and ``total_duration_ms`` from the response.
    Validation details and the output token estimate are taken from the
    final attempt only.

    Args:
        extraction_response: The full response from OllamaClient.extract().
        document_id: UUID of the source document.
        ticker: Primary ticker symbol.
        document_text_length: Length of the input document text in characters.

    Returns:
        An ExtractionMetrics dataclass with all computed fields.
    """
    attempts = extraction_response.attempts
    last_attempt = attempts[-1] if attempts else None

    # Per-attempt latency; both are zero when no attempt was recorded.
    first_ms = attempts[0].duration_ms if attempts else 0
    last_ms = last_attempt.duration_ms if attempts else 0

    # Validation findings come from the final attempt, when it has a report.
    errors: list[str] = []
    warnings: list[str] = []
    if last_attempt and last_attempt.validation:
        report = last_attempt.validation
        errors = report.errors
        warnings = report.warnings

    # "valid" on success, "failed" when attempts exist but none succeeded,
    # "unknown" when nothing was attempted at all.
    if extraction_response.success:
        status = "valid"
    else:
        status = "failed" if attempts else "unknown"

    # Confidence and company count stay at zero without a parsed result.
    result = extraction_response.result
    confidence = result.confidence if result else 0.0
    companies = len(result.companies) if result else 0

    # Character-count based token estimates (see _CHARS_PER_TOKEN).
    tokens_in = max(document_text_length, 0) // _CHARS_PER_TOKEN
    tokens_out = 0
    if last_attempt and last_attempt.raw_output:
        tokens_out = len(last_attempt.raw_output) // _CHARS_PER_TOKEN

    prompt_meta = extraction_response.prompt_metadata
    attempt_total = len(attempts)

    return ExtractionMetrics(
        document_id=document_id,
        ticker=ticker,
        model_name=extraction_response.model,
        prompt_version=prompt_meta.get("prompt_version", ""),
        schema_version=prompt_meta.get("schema_version", ""),
        success=extraction_response.success,
        attempt_count=attempt_total,
        total_duration_ms=extraction_response.total_duration_ms,
        first_attempt_duration_ms=first_ms,
        final_attempt_duration_ms=last_ms,
        confidence=confidence,
        validation_status=status,
        validation_error_count=len(errors),
        validation_warning_count=len(warnings),
        validation_errors=errors,
        # Every attempt after the first counts as a retry.
        retry_count=max(0, attempt_total - 1),
        input_token_estimate=tokens_in,
        output_token_estimate=tokens_out,
        company_count=companies,
    )
|
||||||
|
|
||||||
|
|
||||||
|
async def persist_metrics(
    pool: asyncpg.Pool,
    metrics: ExtractionMetrics,
) -> str:
    """Insert one metrics record into the model_performance_metrics table.

    Args:
        pool: PostgreSQL connection pool.
        metrics: Collected metrics from an extraction run.

    Returns:
        The ``id`` of the inserted row, converted to ``str``.
    """
    insert_sql = """INSERT INTO model_performance_metrics
               (document_id, ticker, model_name, prompt_version, schema_version,
                success, attempt_count, total_duration_ms,
                first_attempt_duration_ms, final_attempt_duration_ms,
                confidence, validation_status, validation_error_count,
                validation_warning_count, validation_errors, retry_count,
                input_token_estimate, output_token_estimate, company_count,
                recorded_at)
           VALUES ($1::uuid, $2, $3, $4, $5, $6, $7, $8, $9, $10,
                   $11, $12, $13, $14, $15::jsonb, $16, $17, $18, $19, $20)
           RETURNING id"""

    # Positional values, in the same order as the column list above.
    # NOTE(review): $1 is cast to uuid, but ExtractionMetrics.document_id
    # defaults to "" which is not a valid UUID -- presumably callers always
    # supply a real document id; confirm.
    values = (
        metrics.document_id,
        metrics.ticker,
        metrics.model_name,
        metrics.prompt_version,
        metrics.schema_version,
        metrics.success,
        metrics.attempt_count,
        metrics.total_duration_ms,
        metrics.first_attempt_duration_ms,
        metrics.final_attempt_duration_ms,
        metrics.confidence,
        metrics.validation_status,
        metrics.validation_error_count,
        metrics.validation_warning_count,
        # The error list is sent as a JSON string for the ::jsonb cast.
        json.dumps(metrics.validation_errors),
        metrics.retry_count,
        metrics.input_token_estimate,
        metrics.output_token_estimate,
        metrics.company_count,
        metrics.recorded_at,
    )
    inserted_id = await pool.fetchval(insert_sql, *values)

    logger.info(
        "Persisted extraction metrics %s for doc %s: success=%s duration=%dms retries=%d",
        inserted_id,
        metrics.document_id,
        metrics.success,
        metrics.total_duration_ms,
        metrics.retry_count,
    )
    return str(inserted_id)
|
||||||
|
|
||||||
|
|
||||||
|
async def get_model_performance_summary(
    pool: asyncpg.Pool,
    *,
    model_name: str | None = None,
    hours: int = 24,
) -> dict[str, object]:
    """Aggregate model performance metrics over a lookback window.

    Runs one aggregate query against model_performance_metrics and
    converts the row into plain Python numbers. NULL aggregates are
    reported as 0.

    Args:
        pool: PostgreSQL connection pool.
        model_name: Optional filter by model name; a falsy value disables it.
        hours: Lookback window in hours (default 24).

    Returns:
        Dict with aggregated performance metrics. When the window holds
        no rows, only ``total_extractions`` and ``success_rate`` are
        present.
    """
    # $1 is always the window size; $2 exists only when filtering by model.
    query_args: list[object] = [hours]
    model_filter = ""
    if model_name:
        model_filter = "AND model_name = $2"
        query_args.append(model_name)

    # Only the fixed fragment above is interpolated; values are bound.
    stats = await pool.fetchrow(
        f"""SELECT
               COUNT(*) AS total_extractions,
               COUNT(*) FILTER (WHERE success) AS successful,
               COUNT(*) FILTER (WHERE NOT success) AS failed,
               ROUND(AVG(total_duration_ms)::numeric, 1) AS avg_duration_ms,
               ROUND(PERCENTILE_CONT(0.5) WITHIN GROUP (ORDER BY total_duration_ms)::numeric, 1) AS p50_duration_ms,
               ROUND(PERCENTILE_CONT(0.95) WITHIN GROUP (ORDER BY total_duration_ms)::numeric, 1) AS p95_duration_ms,
               ROUND(PERCENTILE_CONT(0.99) WITHIN GROUP (ORDER BY total_duration_ms)::numeric, 1) AS p99_duration_ms,
               ROUND(AVG(retry_count)::numeric, 2) AS avg_retries,
               ROUND(AVG(confidence)::numeric, 3) AS avg_confidence,
               SUM(input_token_estimate) AS total_input_tokens,
               SUM(output_token_estimate) AS total_output_tokens,
               ROUND(AVG(company_count)::numeric, 2) AS avg_companies_per_doc,
               ROUND(AVG(validation_error_count)::numeric, 2) AS avg_validation_errors,
               ROUND(AVG(validation_warning_count)::numeric, 2) AS avg_validation_warnings
           FROM model_performance_metrics
           WHERE recorded_at >= NOW() - INTERVAL '1 hour' * $1
           {model_filter}""",
        *query_args,
    )

    if not stats or stats["total_extractions"] == 0:
        return {"total_extractions": 0, "success_rate": 0.0}

    total = stats["total_extractions"]
    successful = stats["successful"]

    summary: dict[str, object] = {
        "total_extractions": total,
        "successful": successful,
        "failed": stats["failed"],
        "success_rate": round(successful / total, 4) if total > 0 else 0.0,
    }
    # Averages and percentiles become floats; token sums become ints.
    for column in (
        "avg_duration_ms",
        "p50_duration_ms",
        "p95_duration_ms",
        "p99_duration_ms",
        "avg_retries",
        "avg_confidence",
    ):
        summary[column] = float(stats[column] or 0)
    for column in ("total_input_tokens", "total_output_tokens"):
        summary[column] = int(stats[column] or 0)
    for column in (
        "avg_companies_per_doc",
        "avg_validation_errors",
        "avg_validation_warnings",
    ):
        summary[column] = float(stats[column] or 0)
    summary["hours"] = hours
    return summary
|
||||||
@@ -0,0 +1,149 @@
|
|||||||
|
"""Extraction prompt templates with anti-hallucination instructions.
|
||||||
|
|
||||||
|
Builds structured prompts for Ollama document intelligence extraction.
|
||||||
|
Each prompt includes the target JSON schema, anti-hallucination rules,
|
||||||
|
and document-type-specific guidance.
|
||||||
|
|
||||||
|
Requirements: 5.1, 5.2, 5.3, 5.4, 5.5
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import json
|
||||||
|
from typing import Any
|
||||||
|
|
||||||
|
from services.extractor.schemas import generate_json_schema, SCHEMA_VERSION
|
||||||
|
from services.shared.schemas import (
|
||||||
|
DocumentType,
|
||||||
|
)
|
||||||
|
|
||||||
|
PROMPT_VERSION = "document-intel-v1"
|
||||||
|
|
||||||
|
# --- JSON schema for structured output (generated from Pydantic models) ---
|
||||||
|
|
||||||
|
EXTRACTION_JSON_SCHEMA: dict[str, Any] = generate_json_schema()
|
||||||
|
|
||||||
|
# --- Anti-hallucination system prompt ---
|
||||||
|
|
||||||
|
SYSTEM_PROMPT = """\
|
||||||
|
You are a financial document analysis system. You extract structured intelligence \
|
||||||
|
from financial documents into JSON.
|
||||||
|
|
||||||
|
STRICT RULES — VIOLATIONS WILL INVALIDATE YOUR OUTPUT:
|
||||||
|
|
||||||
|
1. ONLY extract information explicitly stated in the document text provided.
|
||||||
|
2. NEVER fabricate facts, quotes, numbers, dates, or company names.
|
||||||
|
3. NEVER infer information that is not directly supported by the text.
|
||||||
|
4. If the document does not mention a company, do NOT include that company.
|
||||||
|
5. If the document is ambiguous about sentiment or impact, use "neutral" or "mixed" \
|
||||||
|
and set confidence lower.
|
||||||
|
6. evidence_spans MUST be short verbatim quotes copied from the document. \
|
||||||
|
Do NOT paraphrase or invent quotes.
|
||||||
|
7. key_facts MUST be directly stated in the document. Do NOT add external knowledge.
|
||||||
|
8. If you are uncertain about any field, lower the confidence score and add a warning \
|
||||||
|
to extraction_warnings.
|
||||||
|
9. If the document text is too short, garbled, or uninformative, return an empty \
|
||||||
|
companies array, set confidence below 0.3, and add "insufficient_content" to warnings.
|
||||||
|
10. Return ONLY valid JSON matching the provided schema. No commentary, no markdown fences."""
|
||||||
|
|
||||||
|
# --- Document-type-specific guidance ---
|
||||||
|
|
||||||
|
_DOCTYPE_GUIDANCE: dict[str, str] = {
|
||||||
|
DocumentType.ARTICLE: (
|
||||||
|
"This is a news article. Focus on reported facts, quoted sources, and stated "
|
||||||
|
"analyst opinions. Distinguish between the journalist's framing and actual "
|
||||||
|
"company developments. Do not treat speculative language as confirmed fact."
|
||||||
|
),
|
||||||
|
DocumentType.FILING: (
|
||||||
|
"This is a regulatory filing (e.g. SEC 10-K, 10-Q, 8-K). Extract concrete "
|
||||||
|
"financial figures, risk factors, and material events as stated. Filings use "
|
||||||
|
"precise legal language — preserve that precision in your extraction."
|
||||||
|
),
|
||||||
|
DocumentType.TRANSCRIPT: (
|
||||||
|
"This is an earnings call or event transcript. Distinguish between management "
|
||||||
|
"forward-looking statements and reported results. Flag forward-looking language "
|
||||||
|
"as lower confidence. Extract specific guidance numbers when stated."
|
||||||
|
),
|
||||||
|
DocumentType.PRESS_RELEASE: (
|
||||||
|
"This is a company press release. Be aware that press releases are promotional. "
|
||||||
|
"Extract stated facts and figures but note that sentiment may be biased positive. "
|
||||||
|
"Look for concrete metrics rather than marketing language."
|
||||||
|
),
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
def _get_doctype_guidance(document_type: str) -> str:
    """Look up the extraction guidance text for *document_type*.

    Types without an entry in ``_DOCTYPE_GUIDANCE`` fall back to the
    article guidance.
    """
    article_guidance = _DOCTYPE_GUIDANCE[DocumentType.ARTICLE]
    return _DOCTYPE_GUIDANCE.get(document_type, article_guidance)
|
||||||
|
|
||||||
|
|
||||||
|
# --- Prompt builder ---
|
||||||
|
|
||||||
|
def build_extraction_prompt(
    document_text: str,
    document_type: str = DocumentType.ARTICLE,
    known_tickers: list[str] | None = None,
    document_id: str = "",
) -> dict[str, str]:
    """Build system and user prompts for Ollama structured extraction.

    The user prompt contains, in order: the document type and its
    type-specific guidance, an optional ticker hint, the JSON schema the
    model must follow, a short reminder of the rules, and the document
    text between explicit delimiters.

    Args:
        document_text: Normalized text content of the document.
        document_type: One of the DocumentType enum values (member or
            plain string).
        known_tickers: Optional list of tickers the document may reference.
            Helps the model focus but does NOT mean all tickers are relevant.
        document_id: Optional document ID for traceability.

    Returns:
        Dict with 'system' and 'user' prompt strings.
    """
    doctype_guidance = _get_doctype_guidance(document_type)

    # Render enum members by their value. The default argument is an enum
    # member, and on newer Python versions formatting a ``(str, Enum)``
    # mix-in member in an f-string yields "DocumentType.ARTICLE" rather
    # than its value. Plain strings have no ``value`` attribute and pass
    # through unchanged.
    # NOTE(review): DocumentType is defined outside this file -- confirm
    # its members carry the plain string labels expected in the prompt.
    document_type_label = getattr(document_type, "value", document_type)

    ticker_hint = ""
    if known_tickers:
        tickers_str = ", ".join(known_tickers)
        ticker_hint = (
            f"\nThe following tickers may be referenced in this document: {tickers_str}\n"
            "Only include a ticker in your output if the document actually discusses that company. "
            "Do NOT include a ticker just because it appears in this hint."
        )

    schema_str = json.dumps(EXTRACTION_JSON_SCHEMA, indent=2)

    # The ID line is omitted entirely when no document_id is supplied.
    doc_id_line = f"Document ID: {document_id}\n" if document_id else ""

    user_prompt = f"""\
Extract structured intelligence from the following document.

{doc_id_line}Document type: {document_type_label}
{doctype_guidance}
{ticker_hint}
Your output MUST be a single JSON object conforming to this schema:
{schema_str}

REMEMBER:
- Only extract what is explicitly in the text below.
- evidence_spans must be verbatim quotes from the text.
- If the text is insufficient, return empty companies and low confidence.
- Return ONLY the JSON object. No other text.

--- DOCUMENT TEXT ---
{document_text}
--- END DOCUMENT TEXT ---"""

    return {
        "system": SYSTEM_PROMPT,
        "user": user_prompt,
    }
|
||||||
|
|
||||||
|
|
||||||
|
def get_prompt_metadata() -> dict[str, str]:
    """Return the prompt and schema version identifiers for audit trails.

    A new dict is built on every call from the module-level
    ``PROMPT_VERSION`` and the imported ``SCHEMA_VERSION``.
    """
    return dict(
        prompt_version=PROMPT_VERSION,
        schema_version=SCHEMA_VERSION,
    )
|
||||||
|
|
||||||
|
|
||||||
|
def get_json_schema() -> dict[str, Any]:
    """Return the extraction JSON schema for Ollama's structured output format parameter.

    The module-level ``EXTRACTION_JSON_SCHEMA`` object itself is returned,
    not a copy, so callers should treat it as read-only.
    """
    schema = EXTRACTION_JSON_SCHEMA
    return schema
|
||||||
@@ -0,0 +1,250 @@
|
|||||||
|
"""Replay dataset loader and runner for deterministic extraction testing.
|
||||||
|
|
||||||
|
Loads archived document fixtures from JSON files, validates their expected
|
||||||
|
extraction outputs against the current schema, and provides a runner that
|
||||||
|
can compare live Ollama extraction results against expected baselines.
|
||||||
|
|
||||||
|
This enables:
|
||||||
|
- Schema regression testing: verify expected outputs still pass validation
|
||||||
|
- Prompt regression testing: detect drift when prompts or schemas change
|
||||||
|
- End-to-end replay: run fixtures through a live Ollama and compare
|
||||||
|
|
||||||
|
Requirements: 5.1, 5.2, 5.3, 5.4, 5.5
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import json
|
||||||
|
import logging
|
||||||
|
from dataclasses import dataclass, field
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Any
|
||||||
|
|
||||||
|
from services.extractor.schemas import (
|
||||||
|
ExtractionResult,
|
||||||
|
ValidationReport,
|
||||||
|
get_schema_version,
|
||||||
|
validate_extraction,
|
||||||
|
)
|
||||||
|
|
||||||
|
logger = logging.getLogger("extractor_replay")
|
||||||
|
|
||||||
|
FIXTURES_DIR = Path(__file__).resolve().parent.parent.parent / "tests" / "replay_fixtures"
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class ReplayFixture:
    """One archived document together with its expected extraction output."""

    # --- Document under test ---
    document_id: str
    document_type: str
    document_text: str
    known_tickers: list[str]

    # --- Baseline: the raw expected output, kept as an unvalidated dict ---
    expected_extraction: dict[str, Any]

    # --- Bookkeeping ---
    metadata: dict[str, str]
    source_path: str = ""

    @property
    def expected_result(self) -> ExtractionResult:
        """Validate ``expected_extraction`` and return it as an ExtractionResult.

        Validation runs on every access; nothing is cached.
        """
        baseline = self.expected_extraction
        return ExtractionResult.model_validate(baseline)
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class ReplayValidationResult:
    """Outcome of checking one fixture's expected output against the current schema."""

    # Identifier of the fixture this result belongs to.
    fixture_id: str
    # False until a validation run marks the fixture as passing.
    schema_valid: bool = False
    # Validator report, when validation ran to completion.
    validation_report: ValidationReport | None = None
    # Schema version the fixture was checked against.
    schema_version: str = ""
    # Stringified exception when validation itself raised.
    error: str | None = None
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class ReplayComparisonResult:
    """Side-by-side comparison of a live extraction and the expected baseline.

    Each ``expected_*`` / ``actual_*`` pair has a ``*_match`` flag. All
    containers default to fresh empty instances and all flags to False.
    """

    fixture_id: str

    # --- Which tickers were detected ---
    expected_companies: list[str] = field(default_factory=list)
    actual_companies: list[str] = field(default_factory=list)
    companies_match: bool = False

    # --- Sentiment per ticker ---
    expected_sentiment_map: dict[str, str] = field(default_factory=dict)
    actual_sentiment_map: dict[str, str] = field(default_factory=dict)
    sentiment_match: bool = False

    # --- Catalyst type per ticker ---
    expected_catalyst_map: dict[str, str] = field(default_factory=dict)
    actual_catalyst_map: dict[str, str] = field(default_factory=dict)
    catalyst_match: bool = False

    # --- Validation of the live result ---
    actual_schema_valid: bool = False
    warnings: list[str] = field(default_factory=list)
|
||||||
|
|
||||||
|
|
||||||
|
def load_fixture(path: Path) -> ReplayFixture:
    """Load a single replay fixture from a JSON file.

    The file is always decoded as UTF-8 so that fixtures load identically
    regardless of the platform's default encoding.

    Args:
        path: Path to the fixture JSON file.

    Returns:
        A ReplayFixture with all fields populated. ``known_tickers`` and
        ``metadata`` default to empty containers when absent.

    Raises:
        ValueError: If the top-level JSON value is not an object, or the
            fixture is missing required fields.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    with open(path, encoding="utf-8") as handle:
        data = json.load(handle)

    # Valid JSON may still be a list or scalar. Reject it with the
    # documented ValueError rather than failing on attribute access below.
    if not isinstance(data, dict):
        raise ValueError(
            f"Fixture {path.name} must contain a JSON object at top level, "
            f"got {type(data).__name__}"
        )

    required = {"document_id", "document_type", "document_text", "expected_extraction"}
    missing = required - set(data.keys())
    if missing:
        raise ValueError(f"Fixture {path.name} missing required fields: {missing}")

    return ReplayFixture(
        document_id=data["document_id"],
        document_type=data["document_type"],
        document_text=data["document_text"],
        known_tickers=data.get("known_tickers", []),
        expected_extraction=data["expected_extraction"],
        metadata=data.get("metadata", {}),
        source_path=str(path),
    )
|
||||||
|
|
||||||
|
|
||||||
|
def load_all_fixtures(fixtures_dir: Path | None = None) -> list[ReplayFixture]:
    """Load all replay fixtures from the fixtures directory.

    Every ``*.json`` entry directly inside the directory is attempted.
    Entries that cannot be read or parsed are logged and skipped, so one
    bad fixture does not prevent the rest from loading.

    Args:
        fixtures_dir: Override path to fixtures directory.
            Defaults to tests/replay_fixtures/.

    Returns:
        List of loaded ReplayFixture objects, ordered by file path (the
        order in which the sorted glob yields them). Empty when the
        directory does not exist.
    """
    directory = fixtures_dir or FIXTURES_DIR
    if not directory.is_dir():
        logger.warning("Fixtures directory not found: %s", directory)
        return []

    fixtures: list[ReplayFixture] = []
    for path in sorted(directory.glob("*.json")):
        try:
            fixtures.append(load_fixture(path))
        # ValueError covers missing fields as well as its subclasses
        # json.JSONDecodeError and UnicodeDecodeError. OSError covers
        # entries that match the glob but cannot be opened (a directory
        # named *.json, permission errors).
        except (ValueError, OSError) as exc:
            logger.warning("Skipping invalid fixture %s: %s", path.name, exc)

    logger.info("Loaded %d replay fixtures from %s", len(fixtures), directory)
    return fixtures
|
||||||
|
|
||||||
|
|
||||||
|
def validate_fixture(fixture: ReplayFixture) -> ReplayValidationResult:
    """Check a fixture's expected extraction against the current schema.

    The expected output is passed through ``validate_extraction`` together
    with the fixture's document text. Any exception raised along the way
    is captured in the result instead of propagating, so a broken fixture
    is reported as a failure rather than aborting the run.

    Args:
        fixture: The replay fixture to validate.

    Returns:
        A ReplayValidationResult indicating pass/fail.
    """
    outcome = ReplayValidationResult(
        fixture_id=fixture.document_id,
        schema_version=get_schema_version(),
    )

    try:
        report = validate_extraction(
            fixture.expected_extraction,
            document_text=fixture.document_text,
        )
        outcome.validation_report = report
        outcome.schema_valid = report.valid
    except Exception as exc:  # noqa: BLE001
        # Record the failure as text; the fixture counts as invalid.
        outcome.error = str(exc)
        outcome.schema_valid = False

    return outcome
|
||||||
|
|
||||||
|
|
||||||
|
def validate_all_fixtures(
    fixtures_dir: Path | None = None,
) -> list[ReplayValidationResult]:
    """Load every fixture and validate each one against the current schema.

    Args:
        fixtures_dir: Override path to fixtures directory.

    Returns:
        List of validation results, one per loaded fixture, in load order.
    """
    loaded = load_all_fixtures(fixtures_dir)
    return list(map(validate_fixture, loaded))
|
||||||
|
|
||||||
|
|
||||||
|
def compare_extraction(
    fixture: ReplayFixture,
    actual_result: ExtractionResult,
) -> ReplayComparisonResult:
    """Compare a live extraction result with the fixture's expected output.

    The comparison is structural: it checks that the same tickers were
    detected and that each ticker received the same sentiment and
    catalyst type. Free-text fields are not compared. The live result is
    also re-validated against the schema using the fixture's document
    text.

    Args:
        fixture: The replay fixture with expected output.
        actual_result: The ExtractionResult from a live extraction.

    Returns:
        A ReplayComparisonResult with match details. A
        ``company_mismatch`` warning is appended when the ticker sets
        differ.
    """
    expected = fixture.expected_result

    # Tickers detected on each side, sorted for stable reporting.
    expected_tickers = sorted(item.ticker for item in expected.companies)
    actual_tickers = sorted(item.ticker for item in actual_result.companies)

    # Per-ticker sentiment.
    expected_sentiments = {item.ticker: item.sentiment for item in expected.companies}
    actual_sentiments = {item.ticker: item.sentiment for item in actual_result.companies}

    # Per-ticker catalyst type.
    expected_catalysts = {item.ticker: item.catalyst_type for item in expected.companies}
    actual_catalysts = {item.ticker: item.catalyst_type for item in actual_result.companies}

    comparison = ReplayComparisonResult(fixture_id=fixture.document_id)
    comparison.expected_companies = expected_tickers
    comparison.actual_companies = actual_tickers
    comparison.companies_match = set(expected_tickers) == set(actual_tickers)
    comparison.expected_sentiment_map = expected_sentiments
    comparison.actual_sentiment_map = actual_sentiments
    comparison.sentiment_match = expected_sentiments == actual_sentiments
    comparison.expected_catalyst_map = expected_catalysts
    comparison.actual_catalyst_map = actual_catalysts
    comparison.catalyst_match = expected_catalysts == actual_catalysts

    # Re-validate the live output against the schema.
    live_report = validate_extraction(
        actual_result.model_dump(mode="json"),
        document_text=fixture.document_text,
    )
    comparison.actual_schema_valid = live_report.valid
    if live_report.warnings:
        comparison.warnings = live_report.warnings

    if not comparison.companies_match:
        comparison.warnings.append(
            f"company_mismatch: expected={comparison.expected_companies} actual={comparison.actual_companies}"
        )

    return comparison
|
||||||
@@ -0,0 +1,316 @@
|
|||||||
|
"""JSON schema definitions for document intelligence extraction.
|
||||||
|
|
||||||
|
Generates Ollama-compatible JSON schemas from Pydantic models so the
|
||||||
|
extraction contract stays in sync with the shared data models. Also
|
||||||
|
provides schema validation and semantic validation helpers.
|
||||||
|
|
||||||
|
Requirements: 5.1, 5.2, 5.3, 5.4, 5.5
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import json
|
||||||
|
import re
|
||||||
|
from typing import Any
|
||||||
|
|
||||||
|
from pydantic import BaseModel, Field
|
||||||
|
|
||||||
|
from services.shared.schemas import (
|
||||||
|
CatalystType,
|
||||||
|
Sentiment,
|
||||||
|
)
|
||||||
|
|
||||||
|
SCHEMA_VERSION = "2.0.0"
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Pydantic model that mirrors the Ollama extraction output contract.
|
||||||
|
# This is the *response* shape we ask the model to produce — it intentionally
|
||||||
|
# omits server-side fields like document_id, source_credibility, and model
|
||||||
|
# metadata that are attached after extraction.
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
class CompanyExtractionItem(BaseModel):
    """Intelligence the model must report for each company in a document.

    No field has a default, so every field is required and the generated
    JSON schema forces the model to produce each one explicitly.
    """

    # --- Which company ---
    ticker: str = Field(description="Stock ticker symbol mentioned in the document.")
    company_name: str = Field(description="Full company name as referenced in the document.")

    # --- Scored assessment (scores are bounded to [0, 1]) ---
    relevance: float = Field(
        ge=0, le=1,
        description="How relevant the document is to this company. 0=tangential, 1=primary subject.",
    )
    sentiment: Sentiment = Field(description="Overall sentiment toward this company in the document.")
    impact_score: float = Field(
        ge=0, le=1,
        description="Estimated magnitude of impact. 0=negligible, 1=highly material.",
    )
    # Free-form string: the allowed values are only listed in the description.
    impact_horizon: str = Field(description="One of: intraday, 1d, 1d_7d, 1d_30d, 30d_90d, 90d_plus")
    catalyst_type: CatalystType = Field(description="Primary catalyst category.")

    # --- Supporting material taken from the document ---
    key_facts: list[str] = Field(description="Facts explicitly stated in the document. Do NOT infer or fabricate.")
    risks: list[str] = Field(description="Risks explicitly mentioned in the document.")
    evidence_spans: list[str] = Field(description="Short verbatim quotes from the document supporting the analysis.")
|
||||||
|
|
||||||
|
|
||||||
|
class ExtractionResult(BaseModel):
    """Top-level structured output that the model has to return.

    No field has a default, so every field is required and the generated
    JSON schema forces the model to produce each one explicitly.
    """

    # --- Content ---
    summary: str = Field(description="A concise 1-3 sentence summary of the document's main point.")
    companies: list[CompanyExtractionItem] = Field(description="Per-company intelligence extracted from the document.")
    macro_themes: list[str] = Field(
        description="Broad economic or market themes mentioned (e.g. rates, inflation, ai_capex).",
    )

    # --- Document-level scores, bounded to [0, 1] ---
    novelty_score: float = Field(
        ge=0, le=1,
        description="How novel or surprising the information is. 0=routine, 1=highly novel.",
    )
    confidence: float = Field(
        ge=0, le=1,
        description="Model confidence in the accuracy of this extraction. Lower if text is ambiguous.",
    )

    # --- Self-reported problems ---
    extraction_warnings: list[str] = Field(
        description="Any issues encountered: ambiguous_ticker, incomplete_text, low_confidence, etc.",
    )
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Schema generation
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
def generate_json_schema() -> dict[str, Any]:
    """Build the extraction JSON schema from the ``ExtractionResult`` model.

    The schema emitted by Pydantic is passed through ``_inline_defs`` so
    that the returned dict is self-contained and can be handed to
    Ollama's ``format`` parameter.
    """
    pydantic_schema = ExtractionResult.model_json_schema()
    return _inline_defs(pydantic_schema)
|
||||||
|
|
||||||
|
|
||||||
|
def get_schema_version() -> str:
    """Expose the module-level ``SCHEMA_VERSION`` string to callers."""
    version: str = SCHEMA_VERSION
    return version
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Validation helpers
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
class ValidationReport(BaseModel):
    """Outcome of checking one raw model response against the extraction schema."""

    # True only when validation produced no errors; defaults to False so a
    # report built without arguments is never mistaken for a pass.
    valid: bool = False
    # Blocking problems (invalid JSON, schema failures, semantic errors).
    errors: list[str] = Field(default_factory=list)
    # Informational findings that do not invalidate the extraction.
    warnings: list[str] = Field(default_factory=list)
    # The parsed extraction, set once structural validation has succeeded.
    parsed: ExtractionResult | None = None
|
||||||
|
|
||||||
|
|
||||||
|
def validate_extraction(
    raw_json: str | dict[str, Any],
    *,
    document_text: str = "",
) -> ValidationReport:
    """Check raw model output against the extraction schema.

    Validation runs in three stages, stopping at the first stage that fails:

    1. JSON decoding (only when ``raw_json`` is a string).
    2. Structural validation through ``ExtractionResult.model_validate``.
    3. Semantic checks via ``_semantic_checks``.

    Args:
        raw_json: Either a JSON string or an already-parsed dict.
        document_text: Optional original document text, forwarded to the
            semantic checks for evidence-span verification.

    Returns:
        A ``ValidationReport``. Decoding and structural failures yield a
        report with a single error and no ``parsed`` value. Once the
        structure is valid, ``parsed`` is always populated and ``valid`` is
        true only when the semantic checks reported no errors.
    """
    payload: Any = raw_json

    # Stage 1: decode JSON text when the caller handed us a string.
    if isinstance(raw_json, str):
        try:
            payload = json.loads(raw_json)
        except json.JSONDecodeError as exc:
            return ValidationReport(valid=False, errors=[f"Invalid JSON: {exc}"])

    if not isinstance(payload, dict):
        return ValidationReport(valid=False, errors=["Expected a JSON object at top level."])

    # Stage 2: structural validation against the Pydantic model.
    try:
        parsed = ExtractionResult.model_validate(payload)
    except Exception as exc:  # noqa: BLE001
        return ValidationReport(valid=False, errors=[f"Schema validation failed: {exc}"])

    # Stage 3: semantic checks. Any semantic error marks the report invalid
    # so the caller can retry; warnings never do.
    semantic_errors, semantic_warnings = _semantic_checks(parsed, document_text)
    return ValidationReport(
        valid=not semantic_errors,
        errors=list(semantic_errors),
        warnings=list(semantic_warnings),
        parsed=parsed,
    )
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Known valid impact horizons
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
VALID_IMPACT_HORIZONS = frozenset({
    "intraday",
    "1d",
    "1d_7d",
    "1d_30d",
    "30d_90d",
    "90d_plus",
})

# Ticker: 1-5 uppercase letters (covers NYSE, NASDAQ, etc.)
# Anchored with \Z rather than $: "$" also matches just before a trailing
# newline, which would let a value such as "AAPL\n" pass the format check.
_TICKER_RE = re.compile(r"^[A-Z]{1,5}\Z")

# Evidence span length bounds (characters)
_MIN_EVIDENCE_LEN = 8
_MAX_EVIDENCE_LEN = 500
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Semantic validation rules
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
def _semantic_checks(
    result: ExtractionResult,
    document_text: str = "",
) -> tuple[list[str], list[str]]:
    """Run semantic checks on a structurally valid extraction.

    Args:
        result: The parsed extraction to inspect.
        document_text: Optional source text. When non-empty, every evidence
            span is checked (case-insensitively) for presence in it.

    Returns:
        A tuple ``(errors, warnings)``. Errors are issues severe enough to
        warrant a retry (duplicate tickers, missing tickers, unknown impact
        horizons); warnings are informational.
    """
    errors: list[str] = []
    warnings: list[str] = []

    # --- Top-level checks ---
    if not result.summary:
        warnings.append("empty_summary")

    if result.confidence < 0.3 and result.companies:
        warnings.append("low_confidence_with_companies")

    # Duplicate tickers across company entries. A set keeps the scan linear;
    # every repeat occurrence after the first is still reported.
    seen_tickers: set[str] = set()
    for comp in result.companies:
        if comp.ticker in seen_tickers:
            errors.append(f"duplicate_ticker_{comp.ticker}")
        seen_tickers.add(comp.ticker)

    # --- Per-company checks ---
    for comp in result.companies:
        tag = comp.ticker or "unknown"

        # Ticker format. fullmatch requires the whole value to match, so a
        # ticker with trailing whitespace/newline is flagged as well.
        if not comp.ticker:
            errors.append("company_missing_ticker")
        elif not _TICKER_RE.fullmatch(comp.ticker):
            warnings.append(f"invalid_ticker_format_{tag}")

        # Impact horizon must be a known value
        if comp.impact_horizon not in VALID_IMPACT_HORIZONS:
            errors.append(f"invalid_impact_horizon_{comp.impact_horizon}_for_{tag}")

        # Evidence spans: presence and length bounds
        if not comp.evidence_spans:
            warnings.append(f"no_evidence_spans_for_{tag}")
        else:
            for idx, span in enumerate(comp.evidence_spans):
                span_len = len(span)
                if span_len < _MIN_EVIDENCE_LEN:
                    warnings.append(f"evidence_span_too_short_for_{tag}_{idx}")
                if span_len > _MAX_EVIDENCE_LEN:
                    warnings.append(f"evidence_span_too_long_for_{tag}_{idx}")

        # Cross-field: high impact but no facts
        if not comp.key_facts and comp.impact_score > 0.5:
            warnings.append(f"high_impact_no_facts_for_{tag}")

        # Cross-field: very low relevance
        if comp.relevance < 0.2:
            warnings.append(f"very_low_relevance_for_{tag}")

        # Cross-field: strong sentiment but low impact
        if comp.sentiment in (Sentiment.POSITIVE, Sentiment.NEGATIVE) and comp.impact_score < 0.1:
            warnings.append(f"strong_sentiment_low_impact_for_{tag}")

    # --- Evidence grounding check (when source text is available) ---
    # Kept as a separate pass so grounding warnings follow all per-company
    # warnings, preserving the established ordering of the report.
    if document_text:
        doc_lower = document_text.lower()
        for comp in result.companies:
            for idx, span in enumerate(comp.evidence_spans):
                if span.lower() not in doc_lower:
                    warnings.append(
                        f"evidence_span_not_found_in_document_for_{comp.ticker or 'unknown'}_{idx}"
                    )

    return errors, warnings
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Internal helpers
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
def _inline_defs(schema: dict[str, Any]) -> dict[str, Any]:
    """Return a copy of *schema* with ``$defs`` / ``$ref`` inlined.

    The input dict is left untouched: ``$defs`` is read, not popped, so a
    caller that keeps a reference to the original schema still sees it whole.

    Args:
        schema: A JSON Schema dict, optionally carrying a top-level ``$defs``.

    Returns:
        A new dict without the top-level ``$defs`` key in which every
        resolvable ``$ref`` has been replaced by its definition.
    """
    defs = schema.get("$defs", {})
    body = {key: value for key, value in schema.items() if key != "$defs"}
    return _resolve_refs(body, defs)


def _resolve_refs(
    node: Any,
    defs: dict[str, Any],
    _active: frozenset[str] = frozenset(),
) -> Any:
    """Walk the schema tree and replace ``$ref`` pointers with their definitions.

    Args:
        node: Any JSON-like value (dict, list, or scalar).
        defs: Mapping of definition name to definition schema.
        _active: Names of the definitions currently being expanded on this
            path. Internal; used to stop on self-referential definitions.

    Returns:
        A new structure with resolvable refs inlined. A ref is left as-is
        when its name is unknown or when expanding it would recurse into a
        definition that is already being expanded (a cycle).
    """
    if isinstance(node, list):
        return [_resolve_refs(item, defs, _active) for item in node]
    if not isinstance(node, dict):
        return node

    ref_path = node.get("$ref")  # e.g. "#/$defs/CompanyExtractionItem"
    if isinstance(ref_path, str):
        ref_name = ref_path.rsplit("/", 1)[-1]
        if ref_name not in defs or ref_name in _active:
            return node  # unresolvable or cyclic ref, leave as-is

        # The resolved def may itself contain refs.
        target = _resolve_refs(defs[ref_name], defs, _active | {ref_name})
        if not isinstance(target, dict):
            return target

        # Keywords written next to the $ref (e.g. a field-level description)
        # apply to the referencing location, so they override the definition.
        siblings = {
            key: _resolve_refs(value, defs, _active)
            for key, value in node.items()
            if key != "$ref"
        }
        return {**target, **siblings}

    return {key: _resolve_refs(value, defs, _active) for key, value in node.items()}
|
||||||
@@ -1 +1,291 @@
|
|||||||
"""Extraction worker - sends documents to Ollama for structured intelligence extraction."""
|
"""Extraction worker - sends documents to Ollama for structured intelligence extraction.
|
||||||
|
|
||||||
|
Orchestrates the full extraction pipeline for a single document:
|
||||||
|
1. Calls OllamaClient to get structured extraction
|
||||||
|
2. Uploads prompts, raw outputs, and validation reports to MinIO
|
||||||
|
3. Persists the final intelligence object and per-company impact records to PostgreSQL
|
||||||
|
4. Updates document status
|
||||||
|
|
||||||
|
Requirements: 5.1, 5.2, 5.3, 5.4, 5.5, 9.1, 9.2
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import json
|
||||||
|
import logging
|
||||||
|
from dataclasses import dataclass
|
||||||
|
from datetime import datetime, timezone
|
||||||
|
|
||||||
|
import asyncpg
|
||||||
|
from minio import Minio
|
||||||
|
|
||||||
|
from services.extractor.client import ExtractionResponse
|
||||||
|
from services.extractor.metrics import collect_metrics, persist_metrics
|
||||||
|
from services.shared.metadata import (
|
||||||
|
persist_document_impact,
|
||||||
|
persist_document_intelligence,
|
||||||
|
update_document_status,
|
||||||
|
)
|
||||||
|
from services.shared.storage import (
|
||||||
|
upload_extraction_intelligence,
|
||||||
|
upload_extraction_prompt,
|
||||||
|
upload_extraction_raw_output,
|
||||||
|
upload_extraction_validation,
|
||||||
|
)
|
||||||
|
from services.shared.logging import Span
|
||||||
|
from services.shared.metrics import (
|
||||||
|
EXTRACTION_ATTEMPTS,
|
||||||
|
EXTRACTION_CONFIDENCE,
|
||||||
|
EXTRACTION_DURATION,
|
||||||
|
EXTRACTION_JOBS_TOTAL,
|
||||||
|
EXTRACTION_RETRIES,
|
||||||
|
EXTRACTION_TOKEN_ESTIMATE,
|
||||||
|
EXTRACTION_VALIDATION_ERRORS,
|
||||||
|
)
|
||||||
|
|
||||||
|
logger = logging.getLogger("extractor_worker")
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class ExtractionPersistResult:
    """References to everything written while persisting one extraction.

    Every field starts unset; ``persist_extraction`` fills them in as each
    artifact is stored.
    """

    # Identifier returned by persist_document_intelligence.
    intelligence_id: str | None = None
    # Object-store references returned by the corresponding upload helpers.
    prompt_ref: str | None = None
    raw_output_ref: str | None = None
    validation_ref: str | None = None
    intelligence_ref: str | None = None
    # Identifiers of the per-company impact records; None until the
    # successful-extraction path initialises the list.
    impact_ids: list[str] | None = None
    # Identifier returned by persist_metrics, when metrics were stored.
    metrics_id: str | None = None
    # Set to True only after a successful extraction was fully persisted.
    success: bool = False
|
||||||
|
|
||||||
|
|
||||||
|
async def persist_extraction(
    *,
    pool: asyncpg.Pool,
    minio_client: Minio,
    document_id: str,
    ticker: str,
    extraction_response: ExtractionResponse,
    company_id_map: dict[str, str] | None = None,
    source_credibility: float = 0.5,
    timestamp: datetime | None = None,
    document_text_length: int = 0,
) -> ExtractionPersistResult:
    """Persist all extraction artifacts to MinIO and PostgreSQL.

    Uploads the prompt metadata, the raw output of every attempt, a
    validation report and (on success) the final intelligence object to
    MinIO. Persists the intelligence record and per-company impact records
    to PostgreSQL, updates the document status, then records model
    performance metrics and Prometheus counters.

    Args:
        pool: PostgreSQL connection pool.
        minio_client: MinIO client.
        document_id: UUID of the source document.
        ticker: Primary ticker for path construction.
        extraction_response: Full response from OllamaClient.extract().
        company_id_map: Optional mapping of ticker -> company UUID for impact
            records. Companies without an entry are logged and skipped.
        source_credibility: Credibility score to attach to the intelligence record.
        timestamp: Override timestamp for MinIO paths (defaults to UTC now).
        document_text_length: Length of the input document text for token estimation.

    Returns:
        ExtractionPersistResult with references to all persisted artifacts.
        ``success`` is True only when the extraction succeeded and was stored.
    """
    ts = timestamp or datetime.now(timezone.utc)
    result = ExtractionPersistResult()
    company_id_map = company_id_map or {}

    attempts = extraction_response.attempts
    final_attempt = attempts[-1] if attempts else None
    # Retries are the attempts beyond the first one. The same value is stored
    # for successful and failed extractions and fed to the Prometheus counter,
    # so the database and the metrics agree.
    retry_count = max(0, len(attempts) - 1)

    # 1. Upload prompt metadata to MinIO
    prompt_payload = json.dumps({
        "prompt_metadata": extraction_response.prompt_metadata,
        "model": extraction_response.model,
    }, indent=2).encode()
    result.prompt_ref = upload_extraction_prompt(
        minio_client, ticker, document_id, prompt_payload, timestamp=ts,
    )

    # 2. Upload raw outputs for each attempt
    attempts_data: list[dict[str, object]] = []
    for idx, attempt in enumerate(attempts):
        attempt_record: dict[str, object] = {
            "attempt_index": idx,
            "raw_output": attempt.raw_output,
            "error": attempt.error,
            "duration_ms": attempt.duration_ms,
            "model": attempt.model,
            "retryable": attempt.retryable,
        }
        if attempt.validation:
            attempt_record["validation"] = {
                "valid": attempt.validation.valid,
                "errors": attempt.validation.errors,
                "warnings": attempt.validation.warnings,
            }
        attempts_data.append(attempt_record)

    raw_output_payload = json.dumps({
        "document_id": document_id,
        "attempts": attempts_data,
        "total_duration_ms": extraction_response.total_duration_ms,
        "success": extraction_response.success,
    }, indent=2).encode()
    result.raw_output_ref = upload_extraction_raw_output(
        minio_client, ticker, document_id, raw_output_payload, timestamp=ts,
    )

    # 3. Upload validation report
    # No attempts -> null summary; an attempt without a validation report ->
    # an explicit "not valid, nothing recorded" summary.
    final_validation: dict[str, object] | None
    if not final_attempt:
        final_validation = None
    elif final_attempt.validation:
        final_validation = {
            "valid": final_attempt.validation.valid,
            "errors": final_attempt.validation.errors,
            "warnings": final_attempt.validation.warnings,
        }
    else:
        final_validation = {"valid": False, "errors": [], "warnings": []}

    validation_payload = json.dumps({
        "document_id": document_id,
        "success": extraction_response.success,
        "attempt_count": len(attempts),
        "final_validation": final_validation,
    }, indent=2).encode()
    result.validation_ref = upload_extraction_validation(
        minio_client, ticker, document_id, validation_payload, timestamp=ts,
    )

    # 4. Determine validation status and persist intelligence
    if extraction_response.success and extraction_response.result:
        extraction = extraction_response.result

        # Upload final intelligence object to MinIO
        intelligence_payload = json.dumps(
            extraction.model_dump(mode="json"), indent=2,
        ).encode()
        result.intelligence_ref = upload_extraction_intelligence(
            minio_client, ticker, document_id, intelligence_payload, timestamp=ts,
        )

        # Persist to PostgreSQL
        intel_id = await persist_document_intelligence(
            pool,
            document_id=document_id,
            summary=extraction.summary,
            macro_themes=extraction.macro_themes,
            novelty_score=extraction.novelty_score,
            source_credibility=source_credibility,
            extraction_warnings=extraction.extraction_warnings,
            confidence=extraction.confidence,
            model_provider="ollama",
            model_name=extraction_response.model,
            prompt_version=extraction_response.prompt_metadata.get("prompt_version", ""),
            schema_version=extraction_response.prompt_metadata.get("schema_version", ""),
            raw_output_ref=result.raw_output_ref,
            prompt_ref=result.prompt_ref,
            validation_status="valid",
            validation_errors=[],
            retry_count=retry_count,
        )
        result.intelligence_id = intel_id

        # Persist per-company impact records
        result.impact_ids = []
        for company in extraction.companies:
            cid = company_id_map.get(company.ticker)
            if not cid:
                logger.warning(
                    "No company_id for ticker %s in doc %s, skipping impact record",
                    company.ticker, document_id,
                )
                continue
            impact_id = await persist_document_impact(
                pool,
                intelligence_id=intel_id,
                company_id=cid,
                ticker=company.ticker,
                relevance=company.relevance,
                sentiment=company.sentiment,
                impact_score=company.impact_score,
                impact_horizon=company.impact_horizon,
                catalyst_type=company.catalyst_type,
                key_facts=company.key_facts,
                risks=company.risks,
                evidence_spans=company.evidence_spans,
            )
            result.impact_ids.append(impact_id)

        await update_document_status(pool, document_id=document_id, status="extracted")
        result.success = True
        logger.info(
            "Extraction persisted for doc %s: intel=%s, impacts=%d",
            document_id, intel_id, len(result.impact_ids),
        )
    else:
        # Failed extraction — still persist the attempt data
        all_errors: list[str] = [attempt.error for attempt in attempts if attempt.error]

        intel_id = await persist_document_intelligence(
            pool,
            document_id=document_id,
            summary="",
            macro_themes=[],
            novelty_score=0.0,
            source_credibility=source_credibility,
            extraction_warnings=["extraction_failed"],
            confidence=0.0,
            model_provider="ollama",
            model_name=extraction_response.model,
            prompt_version=extraction_response.prompt_metadata.get("prompt_version", ""),
            schema_version=extraction_response.prompt_metadata.get("schema_version", ""),
            raw_output_ref=result.raw_output_ref,
            prompt_ref=result.prompt_ref,
            validation_status="failed",
            validation_errors=all_errors,
            retry_count=retry_count,
        )
        result.intelligence_id = intel_id

        await update_document_status(pool, document_id=document_id, status="extraction_failed")
        logger.warning(
            "Extraction failed for doc %s after %d attempts: %s",
            document_id, len(attempts), "; ".join(all_errors),
        )

    # Collect and persist model performance metrics. Best-effort: a metrics
    # failure is logged and must not undo the persistence done above.
    try:
        metrics = collect_metrics(
            extraction_response,
            document_id=document_id,
            ticker=ticker,
            document_text_length=document_text_length,
        )
        metrics.recorded_at = ts
        metrics_id = await persist_metrics(pool, metrics)
        result.metrics_id = metrics_id
    except Exception:
        logger.exception("Failed to persist extraction metrics for doc %s", document_id)

    # Prometheus metrics
    EXTRACTION_ATTEMPTS.inc(len(attempts))
    EXTRACTION_DURATION.observe(extraction_response.total_duration_ms / 1000.0)
    if retry_count > 0:
        EXTRACTION_RETRIES.inc(retry_count)
    if extraction_response.success:
        EXTRACTION_JOBS_TOTAL.labels(status="success").inc()
        if extraction_response.result:
            EXTRACTION_CONFIDENCE.observe(extraction_response.result.confidence)
    else:
        EXTRACTION_JOBS_TOTAL.labels(status="failed").inc()
    # Count validation errors from final attempt
    if final_attempt and final_attempt.validation and final_attempt.validation.errors:
        EXTRACTION_VALIDATION_ERRORS.inc(len(final_attempt.validation.errors))
    # Token estimates (rough heuristic: four characters per token)
    if document_text_length > 0:
        EXTRACTION_TOKEN_ESTIMATE.labels(direction="input").inc(document_text_length // 4)
    if final_attempt and final_attempt.raw_output:
        EXTRACTION_TOKEN_ESTIMATE.labels(direction="output").inc(len(final_attempt.raw_output) // 4)

    return result
|
||||||
|
|||||||
+151
-80
@@ -1,47 +1,50 @@
|
|||||||
"""Ingestion worker - processes jobs from the ingestion queue."""
|
"""Ingestion worker - processes jobs from the ingestion queue."""
|
||||||
import asyncio
|
import asyncio
|
||||||
import hashlib
|
|
||||||
import io
|
|
||||||
import json
|
import json
|
||||||
import logging
|
import logging
|
||||||
from datetime import datetime
|
|
||||||
|
|
||||||
import asyncpg
|
import asyncpg
|
||||||
import redis.asyncio as aioredis
|
import redis.asyncio as aioredis
|
||||||
from minio import Minio
|
from minio import Minio
|
||||||
|
|
||||||
from services.adapters.base import AdapterResult
|
from services.adapters.base import AdapterResult
|
||||||
from services.adapters.filings_adapter import FilingsAdapter
|
from services.adapters.broker_adapter import AlpacaBrokerAdapter, TradingMode
|
||||||
from services.adapters.market_adapter import MarketDataAdapter
|
from services.adapters.filings_adapter import SECEdgarAdapter
|
||||||
from services.adapters.news_adapter import NewsApiAdapter
|
from services.adapters.market_adapter import PolygonMarketAdapter
|
||||||
|
from services.adapters.news_adapter import PolygonNewsAdapter
|
||||||
|
from services.adapters.web_scrape_adapter import WebScrapeAdapter
|
||||||
from services.shared.config import load_config
|
from services.shared.config import load_config
|
||||||
from services.shared.db import get_minio, get_pg_pool, get_redis
|
from services.shared.db import get_minio, get_pg_pool, get_redis
|
||||||
|
from services.shared.dedupe import dedupe_items, mark_as_seen
|
||||||
|
from services.shared.metadata import (
|
||||||
|
persist_ingestion_items,
|
||||||
|
record_retrieval_failure,
|
||||||
|
reset_source_retry_state,
|
||||||
|
)
|
||||||
from services.shared.redis_keys import (
|
from services.shared.redis_keys import (
|
||||||
QUEUE_INGESTION,
|
QUEUE_INGESTION,
|
||||||
QUEUE_PARSING,
|
QUEUE_PARSING,
|
||||||
dedupe_key,
|
dedupe_key,
|
||||||
queue_key,
|
queue_key,
|
||||||
)
|
)
|
||||||
|
from services.shared.logging import Span, extract_trace_context, inject_trace_context, new_trace_id, set_trace_context, setup_logging
|
||||||
|
from services.shared.metrics import (
|
||||||
|
ACTIVE_JOBS,
|
||||||
|
INGESTION_ADAPTER_DURATION,
|
||||||
|
INGESTION_ERRORS,
|
||||||
|
INGESTION_ITEMS_DEDUPED,
|
||||||
|
INGESTION_ITEMS_FETCHED,
|
||||||
|
INGESTION_ITEMS_NEW,
|
||||||
|
INGESTION_JOBS_TOTAL,
|
||||||
|
)
|
||||||
|
from services.shared.storage import (
|
||||||
|
bucket_for_source,
|
||||||
|
ensure_buckets,
|
||||||
|
upload_raw_artifact,
|
||||||
|
)
|
||||||
|
|
||||||
logging.basicConfig(level=logging.INFO)
|
|
||||||
logger = logging.getLogger("ingestion_worker")
|
logger = logging.getLogger("ingestion_worker")
|
||||||
|
|
||||||
# Destination bucket for raw payloads, keyed by ingestion source type.
# Broker payloads share the market-data bucket.
BUCKET_MAP = {
    "market_api": "stonks-raw-market",
    "news_api": "stonks-raw-news",
    "filings_api": "stonks-raw-filings",
    "broker": "stonks-raw-market",
}
|
|
||||||
|
|
||||||
|
|
||||||
def build_storage_path(source_type: str, ticker: str, doc_id: str) -> str:
    """Build the object key for a raw payload, partitioned by the current UTC date.

    Args:
        source_type: Ingestion source type; first path segment.
        ticker: Ticker symbol; second path segment.
        doc_id: Identifier used as the directory holding ``raw.json``.

    Returns:
        ``{source_type}/{ticker}/{YYYY}/{MM}/{DD}/{doc_id}/raw.json``.
    """
    # Local import: only ``datetime`` itself is imported at module level.
    from datetime import timezone

    # datetime.utcnow() is deprecated and returns a naive value; take an
    # aware UTC timestamp instead. The date components are unchanged.
    now = datetime.now(timezone.utc)
    return f"{source_type}/{ticker}/{now.year}/{now.month:02d}/{now.day:02d}/{doc_id}/raw.json"
|
|
||||||
|
|
||||||
|
|
||||||
async def store_raw_artifact(minio_client: Minio, bucket: str, path: str, data: bytes):
    """Write ``data`` to ``path`` in ``bucket`` with content type ``application/json``.

    The bytes are wrapped in an in-memory stream and handed to
    ``minio_client.put_object`` together with their length.
    """
    payload_stream = io.BytesIO(data)
    payload_size = len(data)
    minio_client.put_object(bucket, path, payload_stream, payload_size, content_type="application/json")
|
|
||||||
|
|
||||||
|
|
||||||
async def process_job(
|
async def process_job(
|
||||||
job: dict,
|
job: dict,
|
||||||
@@ -55,9 +58,11 @@ async def process_job(
|
|||||||
source_id = job["source_id"]
|
source_id = job["source_id"]
|
||||||
config = job.get("config", {})
|
config = job.get("config", {})
|
||||||
|
|
||||||
|
set_trace_context(trace_id=job.get("_trace_id") or new_trace_id())
|
||||||
|
|
||||||
adapter = adapters.get(source_type)
|
adapter = adapters.get(source_type)
|
||||||
if not adapter:
|
if not adapter:
|
||||||
logger.warning(f"No adapter for source_type={source_type}")
|
logger.warning("No adapter for source_type=%s", source_type)
|
||||||
return
|
return
|
||||||
|
|
||||||
# Record ingestion run
|
# Record ingestion run
|
||||||
@@ -68,25 +73,37 @@ async def process_job(
|
|||||||
)
|
)
|
||||||
|
|
||||||
try:
|
try:
|
||||||
result: AdapterResult = await adapter.fetch(ticker, config)
|
with Span("adapter_fetch", ticker=ticker, source_type=source_type):
|
||||||
|
with INGESTION_ADAPTER_DURATION.labels(source_type=source_type).time():
|
||||||
|
result: AdapterResult = await adapter.fetch(ticker, config)
|
||||||
|
|
||||||
if result.error:
|
if result.error:
|
||||||
await pool.execute(
|
INGESTION_JOBS_TOTAL.labels(source_type=source_type, status="error").inc()
|
||||||
"UPDATE ingestion_runs SET status='failed', error_message=$2, completed_at=NOW() WHERE id=$1",
|
await record_retrieval_failure(
|
||||||
run_id, result.error,
|
pool,
|
||||||
|
run_id=str(run_id),
|
||||||
|
source_id=source_id,
|
||||||
|
error_message=result.error,
|
||||||
)
|
)
|
||||||
return
|
return
|
||||||
|
|
||||||
# Store raw payload
|
# Store raw payload in MinIO
|
||||||
bucket = BUCKET_MAP.get(source_type, "stonks-raw-market")
|
bucket = bucket_for_source(source_type)
|
||||||
storage_path = build_storage_path(source_type, ticker, str(run_id))
|
artifact_type = "raw_html" if source_type == "web_scrape" else "raw_json"
|
||||||
await store_raw_artifact(minio_client, bucket, storage_path, result.raw_payload)
|
storage_uri = upload_raw_artifact(
|
||||||
|
minio_client,
|
||||||
|
source_type=source_type,
|
||||||
|
ticker=ticker,
|
||||||
|
document_id=str(run_id),
|
||||||
|
data=result.raw_payload,
|
||||||
|
artifact_type=artifact_type,
|
||||||
|
)
|
||||||
|
|
||||||
# Dedupe check
|
# Dedupe check on the overall payload hash
|
||||||
if result.content_hash:
|
if result.content_hash:
|
||||||
already_seen = await rds.get(dedupe_key(result.content_hash))
|
already_seen = await rds.get(dedupe_key(result.content_hash))
|
||||||
if already_seen:
|
if already_seen:
|
||||||
logger.info(f"Duplicate content for {ticker}, skipping")
|
logger.info("Duplicate content for %s, skipping", ticker)
|
||||||
await pool.execute(
|
await pool.execute(
|
||||||
"UPDATE ingestion_runs SET status='completed', items_fetched=$2, items_new=0, completed_at=NOW() WHERE id=$1",
|
"UPDATE ingestion_runs SET status='completed', items_fetched=$2, items_new=0, completed_at=NOW() WHERE id=$1",
|
||||||
run_id, len(result.items),
|
run_id, len(result.items),
|
||||||
@@ -94,72 +111,126 @@ async def process_job(
|
|||||||
return
|
return
|
||||||
await rds.set(dedupe_key(result.content_hash), "1", ex=86400)
|
await rds.set(dedupe_key(result.content_hash), "1", ex=86400)
|
||||||
|
|
||||||
new_items = 0
|
# Cross-source dedupe on individual document items (news, filings, web_scrape)
|
||||||
for item in result.items:
|
items_to_persist = result.items
|
||||||
item_json = json.dumps(item)
|
deduped_count = 0
|
||||||
item_hash = hashlib.sha256(item_json.encode()).hexdigest()
|
if source_type not in ("market_api", "broker"):
|
||||||
|
items_to_persist, dup_items = await dedupe_items(pool, rds, result.items)
|
||||||
|
deduped_count = len(dup_items)
|
||||||
|
if deduped_count:
|
||||||
|
INGESTION_ITEMS_DEDUPED.labels(source_type=source_type).inc(deduped_count)
|
||||||
|
logger.info(
|
||||||
|
"Deduped %d/%d items for %s/%s",
|
||||||
|
deduped_count, len(result.items), ticker, source_type,
|
||||||
|
)
|
||||||
|
|
||||||
# Check if document already exists
|
# Persist metadata via the unified metadata module
|
||||||
exists = await pool.fetchval("SELECT 1 FROM documents WHERE content_hash = $1", item_hash)
|
new_items, new_ids = await persist_ingestion_items(
|
||||||
if exists:
|
pool,
|
||||||
continue
|
source_type=source_type,
|
||||||
|
ticker=ticker,
|
||||||
|
company_id=job.get("company_id"),
|
||||||
|
items=items_to_persist,
|
||||||
|
storage_ref=storage_uri,
|
||||||
|
adapter_metadata=result.metadata,
|
||||||
|
content_hash=result.content_hash,
|
||||||
|
)
|
||||||
|
|
||||||
title = item.get("title", item.get("name", ""))
|
# Enqueue new document items for parsing (not market/broker)
|
||||||
url = item.get("url", item.get("link", ""))
|
if source_type not in ("market_api", "broker"):
|
||||||
published = item.get("publishedAt", item.get("published_at"))
|
for doc_id in new_ids:
|
||||||
|
await rds.rpush(queue_key(QUEUE_PARSING), json.dumps(inject_trace_context({
|
||||||
|
"document_id": doc_id,
|
||||||
|
"ticker": ticker,
|
||||||
|
"source_type": source_type,
|
||||||
|
})))
|
||||||
|
|
||||||
doc_id = await pool.fetchval(
|
# Mark newly persisted documents in Redis for fast future dedupe
|
||||||
"""INSERT INTO documents (document_type, source_type, publisher, url, title, published_at, content_hash, raw_storage_ref, status)
|
for item, doc_id in zip(items_to_persist, new_ids):
|
||||||
VALUES ($1, $2, $3, $4, $5, $6, $7, $8, 'ingested')
|
await mark_as_seen(
|
||||||
RETURNING id""",
|
rds,
|
||||||
"article" if source_type == "news_api" else "filing" if source_type == "filings_api" else "article",
|
content_hash=item.get("content_hash", ""),
|
||||||
source_type,
|
canonical_url=item.get("canonical_url"),
|
||||||
item.get("source", {}).get("name", "") if isinstance(item.get("source"), dict) else str(item.get("source", "")),
|
document_id=doc_id,
|
||||||
url, title,
|
)
|
||||||
datetime.fromisoformat(published.replace("Z", "+00:00")) if published else None,
|
|
||||||
item_hash,
|
|
||||||
f"s3://{bucket}/{storage_path}",
|
|
||||||
)
|
|
||||||
|
|
||||||
# Enqueue for parsing
|
# Link duplicate documents to this company if not already linked
|
||||||
await rds.rpush(queue_key(QUEUE_PARSING), json.dumps({
|
company_id = job.get("company_id")
|
||||||
"document_id": str(doc_id),
|
if company_id and deduped_count:
|
||||||
"ticker": ticker,
|
from services.shared.metadata import persist_document_company_mention
|
||||||
"source_type": source_type,
|
for dup in dup_items:
|
||||||
"url": url,
|
existing_id = dup.get("_dedupe_existing_id")
|
||||||
}))
|
if existing_id:
|
||||||
new_items += 1
|
try:
|
||||||
|
await persist_document_company_mention(
|
||||||
|
pool,
|
||||||
|
document_id=existing_id,
|
||||||
|
company_id=company_id,
|
||||||
|
ticker=ticker,
|
||||||
|
mention_type="cross_source",
|
||||||
|
)
|
||||||
|
except Exception:
|
||||||
|
# Duplicate mention link — safe to ignore
|
||||||
|
pass
|
||||||
|
|
||||||
await pool.execute(
|
await pool.execute(
|
||||||
"UPDATE ingestion_runs SET status='completed', items_fetched=$2, items_new=$3, completed_at=NOW() WHERE id=$1",
|
"UPDATE ingestion_runs SET status='completed', items_fetched=$2, items_new=$3, completed_at=NOW() WHERE id=$1",
|
||||||
run_id, len(result.items), new_items,
|
run_id, len(result.items), new_items,
|
||||||
)
|
)
|
||||||
logger.info(f"Ingested {ticker}/{source_type}: {len(result.items)} fetched, {new_items} new")
|
# Clear any accumulated retry backoff after success
|
||||||
|
await reset_source_retry_state(pool, source_id)
|
||||||
|
INGESTION_ITEMS_FETCHED.labels(source_type=source_type).inc(len(result.items))
|
||||||
|
INGESTION_ITEMS_NEW.labels(source_type=source_type).inc(new_items)
|
||||||
|
INGESTION_JOBS_TOTAL.labels(source_type=source_type, status="success").inc()
|
||||||
|
logger.info(
|
||||||
|
"Ingested %s/%s: %d fetched, %d new",
|
||||||
|
ticker, source_type, len(result.items), new_items,
|
||||||
|
extra={"ticker": ticker, "source_type": source_type, "count": new_items},
|
||||||
|
)
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(f"Ingestion error for {ticker}: {e}")
|
INGESTION_ERRORS.labels(source_type=source_type).inc()
|
||||||
await pool.execute(
|
INGESTION_JOBS_TOTAL.labels(source_type=source_type, status="error").inc()
|
||||||
"UPDATE ingestion_runs SET status='failed', error_message=$2, completed_at=NOW() WHERE id=$1",
|
logger.error(
|
||||||
run_id, str(e),
|
"Ingestion error for %s: %s", ticker, e,
|
||||||
|
extra={"ticker": ticker, "source_type": source_type, "error": str(e)},
|
||||||
|
)
|
||||||
|
await record_retrieval_failure(
|
||||||
|
pool,
|
||||||
|
run_id=str(run_id),
|
||||||
|
source_id=source_id,
|
||||||
|
error_message=str(e),
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
async def main():
|
async def main():
|
||||||
config = load_config()
|
cfg = load_config()
|
||||||
pool = await get_pg_pool(config)
|
setup_logging("ingestion_worker", level=cfg.log_level, json_output=cfg.json_logs)
|
||||||
rds = get_redis(config)
|
|
||||||
minio_client = get_minio(config)
|
pool = await get_pg_pool(cfg)
|
||||||
|
rds = get_redis(cfg)
|
||||||
|
minio_client = get_minio(cfg)
|
||||||
|
|
||||||
|
# Ensure all required buckets exist
|
||||||
|
ensure_buckets(minio_client)
|
||||||
|
|
||||||
adapters = {
|
adapters = {
|
||||||
"market_api": MarketDataAdapter(
|
"market_api": PolygonMarketAdapter(
|
||||||
api_key=config.broker.api_key or "",
|
api_key=cfg.market_data.api_key,
|
||||||
|
base_url=cfg.market_data.base_url,
|
||||||
|
),
|
||||||
|
"news_api": PolygonNewsAdapter(
|
||||||
|
api_key=cfg.market_data.api_key,
|
||||||
base_url="https://api.polygon.io",
|
base_url="https://api.polygon.io",
|
||||||
),
|
),
|
||||||
"news_api": NewsApiAdapter(
|
"filings_api": SECEdgarAdapter(),
|
||||||
api_key="",
|
"web_scrape": WebScrapeAdapter(),
|
||||||
base_url="https://newsapi.org",
|
"broker": AlpacaBrokerAdapter(
|
||||||
|
api_key=cfg.broker.api_key or "",
|
||||||
|
api_secret=cfg.broker.api_secret or "",
|
||||||
|
mode=TradingMode.LIVE if cfg.broker.mode == "live" else TradingMode.PAPER,
|
||||||
|
base_url=cfg.broker.base_url,
|
||||||
),
|
),
|
||||||
"filings_api": FilingsAdapter(),
|
|
||||||
}
|
}
|
||||||
|
|
||||||
logger.info("Ingestion worker started")
|
logger.info("Ingestion worker started")
|
||||||
|
|||||||
@@ -1 +1 @@
|
|||||||
# Lake Publisher - transforms operational data into analytical fact datasets
|
"""Lake publisher — writes partitioned Parquet facts to MinIO for Trino/Superset."""
|
||||||
|
|||||||
@@ -0,0 +1,39 @@
|
|||||||
|
"""Helpers for enqueuing lake publish jobs from upstream workers.
|
||||||
|
|
||||||
|
Other services import these helpers to push jobs onto the QUEUE_LAKE_PUBLISH
|
||||||
|
Redis queue. The lake publisher worker (jobs.py) consumes them.
|
||||||
|
|
||||||
|
Usage:
|
||||||
|
await enqueue_lake_job(rds, "document", document_id)
|
||||||
|
await enqueue_lake_job(rds, "trade_order", order_id)
|
||||||
|
await enqueue_lake_job(rds, "bulk_documents", since=cutoff.isoformat())
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import json
|
||||||
|
|
||||||
|
import redis.asyncio as aioredis
|
||||||
|
|
||||||
|
from services.shared.redis_keys import QUEUE_LAKE_PUBLISH, queue_key
|
||||||
|
|
||||||
|
|
||||||
|
async def enqueue_lake_job(
    rds: aioredis.Redis,
    job_type: str,
    entity_id: str = "",
    since: str | None = None,
) -> None:
    """Serialize a lake publish job as JSON and append it to the publish queue.

    Args:
        rds: Async Redis client used to push the message.
        job_type: Job type label (document, document_extraction,
            market_snapshot, trade_order, trade_fill, positions_snapshot,
            pnl_snapshot, bulk_documents, bulk_extractions).
        entity_id: UUID or identifier of the entity to publish.
        since: ISO datetime string used as the cutoff for bulk jobs. The key
            is only added to the message when the value is truthy.
    """
    target_queue = queue_key(QUEUE_LAKE_PUBLISH)
    # "since" is optional on the wire: omit the key entirely when not provided.
    message: dict[str, str] = {
        "job_type": job_type,
        "entity_id": entity_id,
        **({"since": since} if since else {}),
    }
    await rds.rpush(target_queue, json.dumps(message))  # type: ignore[misc]
|
||||||
@@ -0,0 +1,420 @@
|
|||||||
|
"""Iceberg table creation and metadata management for analytical datasets.
|
||||||
|
|
||||||
|
Manages Iceberg tables in Trino's Iceberg catalog, providing:
|
||||||
|
- Table creation with proper schemas and partition specs
|
||||||
|
- Schema synchronization between PyArrow definitions and Iceberg tables
|
||||||
|
- Table metadata inspection (existence checks, schema retrieval, partition listing)
|
||||||
|
|
||||||
|
The Iceberg catalog complements the existing Hive-compatible partition layout.
|
||||||
|
Parquet files written by the lake publisher are stored in the same MinIO paths,
|
||||||
|
but Iceberg metadata enables schema evolution, snapshot isolation, and better
|
||||||
|
partition pruning via Trino's Iceberg connector.
|
||||||
|
|
||||||
|
Requirements: 9.4, 9.5, 10.1, N4, N6
|
||||||
|
Design ref: Section 5.3 (Lakehouse model), Section 4.12 (SQL Query Engine)
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import logging
|
||||||
|
from dataclasses import dataclass
|
||||||
|
from typing import Any
|
||||||
|
|
||||||
|
import pyarrow as pa
|
||||||
|
from trino.dbapi import connect as trino_connect
|
||||||
|
|
||||||
|
from services.lake_publisher.partitions import (
|
||||||
|
LAKEHOUSE_BUCKET,
|
||||||
|
TABLE_PARTITIONS,
|
||||||
|
WAREHOUSE_PREFIX,
|
||||||
|
PartitionSpec,
|
||||||
|
)
|
||||||
|
from services.lake_publisher.worker import (
|
||||||
|
COMPANY_EVENTS_SCHEMA,
|
||||||
|
DOCUMENTS_SCHEMA,
|
||||||
|
DOCUMENT_EXTRACTIONS_SCHEMA,
|
||||||
|
MARKET_BARS_SCHEMA,
|
||||||
|
MARKET_QUOTES_SCHEMA,
|
||||||
|
MODEL_PERFORMANCE_SCHEMA,
|
||||||
|
PNL_DAILY_SCHEMA,
|
||||||
|
POSITIONS_DAILY_SCHEMA,
|
||||||
|
PREDICTION_VS_OUTCOME_SCHEMA,
|
||||||
|
TRADE_FILLS_SCHEMA,
|
||||||
|
TRADE_ORDERS_SCHEMA,
|
||||||
|
TRADE_SIGNALS_SCHEMA,
|
||||||
|
)
|
||||||
|
|
||||||
|
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Default Trino catalog name; used as the IcebergManager.catalog default and
# when IcebergTableDef builds fully qualified table names.
ICEBERG_CATALOG = "iceberg"
# Default schema inside that catalog; used as the IcebergManager.schema default
# and when IcebergTableDef builds fully qualified table names.
ICEBERG_SCHEMA = "stonks"
|
||||||
|
|
||||||
|
|
||||||
|
def _get_iceberg_catalog() -> str:
    """Resolve the Iceberg catalog name.

    Returns the ``TRINO_ICEBERG_CATALOG`` environment variable when it is set,
    otherwise the module default ``ICEBERG_CATALOG``.
    """
    from os import environ

    return environ.get("TRINO_ICEBERG_CATALOG", ICEBERG_CATALOG)
|
||||||
|
|
||||||
|
# Map PyArrow types to Trino/Iceberg SQL types.
# Keys are the string form of a PyArrow type (``str(arrow_type)``), which is
# how ``_arrow_type_to_trino`` looks them up. Several spellings of the same
# type are listed so the lookup works whichever one ``str()`` produces.
# Timestamp types are intentionally absent: they are handled separately
# because their string form carries a unit and an optional timezone.
_ARROW_TO_TRINO: dict[str, str] = {
    # Text
    "string": "VARCHAR",
    "utf8": "VARCHAR",
    "large_string": "VARCHAR",
    "large_utf8": "VARCHAR",
    # Floating point
    "float64": "DOUBLE",
    "double": "DOUBLE",
    "float32": "REAL",
    "float": "REAL",
    # Signed integers
    "int8": "TINYINT",
    "int16": "SMALLINT",
    "int32": "INTEGER",
    "int64": "BIGINT",
    # Boolean
    "bool": "BOOLEAN",
    # Dates (both date32 and date64 map to DATE)
    "date32": "DATE",
    "date32[day]": "DATE",
    "date64": "DATE",
}
|
||||||
|
|
||||||
|
|
||||||
|
def _arrow_type_to_trino(arrow_type: pa.DataType) -> str:
|
||||||
|
"""Convert a PyArrow data type to a Trino SQL type string."""
|
||||||
|
type_str = str(arrow_type)
|
||||||
|
|
||||||
|
# Handle timestamp types (with or without timezone)
|
||||||
|
if type_str.startswith("timestamp"):
|
||||||
|
if "tz=" in type_str:
|
||||||
|
return "TIMESTAMP(6) WITH TIME ZONE"
|
||||||
|
return "TIMESTAMP(6)"
|
||||||
|
|
||||||
|
# Direct lookup
|
||||||
|
result = _ARROW_TO_TRINO.get(type_str)
|
||||||
|
if result:
|
||||||
|
return result
|
||||||
|
|
||||||
|
# Fallback for type IDs
|
||||||
|
if pa.types.is_string(arrow_type) or pa.types.is_large_string(arrow_type):
|
||||||
|
return "VARCHAR"
|
||||||
|
if pa.types.is_floating(arrow_type):
|
||||||
|
return "DOUBLE"
|
||||||
|
if pa.types.is_integer(arrow_type):
|
||||||
|
return "BIGINT"
|
||||||
|
if pa.types.is_boolean(arrow_type):
|
||||||
|
return "BOOLEAN"
|
||||||
|
if pa.types.is_date(arrow_type):
|
||||||
|
return "DATE"
|
||||||
|
if pa.types.is_timestamp(arrow_type):
|
||||||
|
return "TIMESTAMP(6) WITH TIME ZONE"
|
||||||
|
|
||||||
|
raise ValueError(f"Unsupported PyArrow type for Iceberg DDL: {arrow_type}")
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
# Registry mapping table names to their PyArrow schemas.
# Keys are looked up with the table names found in TABLE_PARTITIONS:
# get_all_table_defs() skips (with a warning) any partitioned table missing
# here, and get_table_def() raises ValueError for it.
TABLE_SCHEMAS: dict[str, pa.Schema] = {
    "market_bars": MARKET_BARS_SCHEMA,
    "market_quotes": MARKET_QUOTES_SCHEMA,
    "company_events": COMPANY_EVENTS_SCHEMA,
    "documents": DOCUMENTS_SCHEMA,
    "document_extractions": DOCUMENT_EXTRACTIONS_SCHEMA,
    "trade_signals": TRADE_SIGNALS_SCHEMA,
    "trade_orders": TRADE_ORDERS_SCHEMA,
    "trade_fills": TRADE_FILLS_SCHEMA,
    "positions_daily": POSITIONS_DAILY_SCHEMA,
    "pnl_daily": PNL_DAILY_SCHEMA,
    "prediction_vs_outcome": PREDICTION_VS_OUTCOME_SCHEMA,
    "model_performance": MODEL_PERFORMANCE_SCHEMA,
}
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass(frozen=True)
class IcebergTableDef:
    """Immutable description of one Iceberg table.

    Combines a table name, its PyArrow schema and its partition spec, and
    renders the Trino DDL fragments needed to create the table.
    """

    table_name: str
    schema: pa.Schema
    partition_spec: PartitionSpec

    @property
    def qualified_name(self) -> str:
        """``catalog.schema.table`` name built from the module-level defaults."""
        return ".".join((ICEBERG_CATALOG, ICEBERG_SCHEMA, self.table_name))

    @property
    def location(self) -> str:
        """``s3a://`` URI of the table directory under the warehouse prefix."""
        return "s3a://{}/{}/{}/".format(
            LAKEHOUSE_BUCKET, WAREHOUSE_PREFIX, self.table_name
        )

    def column_defs_sql(self) -> list[str]:
        """Render one ``<name> <trino type>`` line per schema field.

        Every field of the PyArrow schema is emitted, in schema order; no
        column is filtered out here.
        """
        fields = (self.schema.field(pos) for pos in range(len(self.schema)))
        return [
            f" {field.name} {_arrow_type_to_trino(field.type)}"
            for field in fields
        ]

    def partition_keys_sql(self) -> str:
        """Render the ``partitioning = ARRAY[...]`` table property.

        Returns an empty string when the partition spec has no keys.
        """
        quoted_keys = [f"'{key}'" for key in self.partition_spec.all_keys]
        if quoted_keys:
            return "partitioning = ARRAY[" + ", ".join(quoted_keys) + "]"
        return ""

    def create_table_sql(self) -> str:
        """Render the full ``CREATE TABLE IF NOT EXISTS`` statement."""
        columns_sql = ",\n".join(self.column_defs_sql())

        properties = ["format = 'PARQUET'", f"location = '{self.location}'"]
        partitioning = self.partition_keys_sql()
        if partitioning:
            properties.append(partitioning)
        properties_sql = ",\n ".join(properties)

        header = f"CREATE TABLE IF NOT EXISTS {self.qualified_name} (\n"
        return header + f"{columns_sql}\n" + ") WITH (\n" + f" {properties_sql}\n" + ")"
|
||||||
|
|
||||||
|
|
||||||
|
def get_all_table_defs() -> list[IcebergTableDef]:
    """Build an IcebergTableDef for each partitioned table that has a schema.

    Iterates ``TABLE_PARTITIONS``; a table without an entry in
    ``TABLE_SCHEMAS`` is logged at warning level and left out of the result.
    """
    table_defs: list[IcebergTableDef] = []
    for name, spec in TABLE_PARTITIONS.items():
        arrow_schema = TABLE_SCHEMAS.get(name)
        if arrow_schema is not None:
            table_defs.append(
                IcebergTableDef(
                    table_name=name,
                    schema=arrow_schema,
                    partition_spec=spec,
                )
            )
        else:
            logger.warning("No PyArrow schema for table %s, skipping", name)
    return table_defs
|
||||||
|
|
||||||
|
|
||||||
|
def get_table_def(table_name: str) -> IcebergTableDef:
    """Look up the IcebergTableDef for one table.

    Raises:
        ValueError: If ``table_name`` has no partition spec in
            ``TABLE_PARTITIONS``, or no schema in ``TABLE_SCHEMAS``.
    """
    known_table = table_name in TABLE_PARTITIONS
    if not known_table:
        raise ValueError(f"Unknown table: {table_name}")

    arrow_schema = TABLE_SCHEMAS.get(table_name)
    if arrow_schema is None:
        raise ValueError(f"No PyArrow schema registered for table: {table_name}")

    spec = TABLE_PARTITIONS[table_name]
    return IcebergTableDef(
        table_name=table_name,
        schema=arrow_schema,
        partition_spec=spec,
    )
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class IcebergManager:
    """Manage Iceberg tables through Trino's Iceberg catalog.

    Provides table creation, existence checks, schema inspection, forward-only
    schema synchronization and metadata-table reads. Every statement opens its
    own Trino DBAPI connection and closes it when the statement finishes.

    NOTE(review): ``table_name`` arguments are interpolated directly into SQL
    text; callers are expected to pass names from the table registry, not
    untrusted input.
    """

    host: str = "localhost"
    port: int = 8080
    user: str = "stonks"
    catalog: str = ICEBERG_CATALOG
    schema: str = ICEBERG_SCHEMA

    def _get_connection(self) -> Any:
        """Create a Trino DBAPI connection bound to this catalog and schema."""
        return trino_connect(
            host=self.host,
            port=self.port,
            user=self.user,
            catalog=self.catalog,
            schema=self.schema,
        )

    def _execute(self, sql: str) -> list[list[Any]]:
        """Execute a SQL statement and return all rows."""
        conn = self._get_connection()
        try:
            cursor = conn.cursor()
            cursor.execute(sql)
            return cursor.fetchall()
        finally:
            conn.close()

    def _execute_no_fetch(self, sql: str) -> None:
        """Execute a DDL statement whose result rows are not needed."""
        conn = self._get_connection()
        try:
            cursor = conn.cursor()
            cursor.execute(sql)
            # The result is drained so the statement runs to completion; a
            # failure while draining is best-effort, but is logged rather than
            # silently discarded so a broken DDL can be diagnosed.
            try:
                cursor.fetchall()
            except Exception:
                logger.debug(
                    "fetchall() after DDL raised; ignoring. SQL: %s",
                    sql,
                    exc_info=True,
                )
        finally:
            conn.close()

    def _metadata_table(self, table_name: str, suffix: str) -> str:
        """Return the qualified name of an Iceberg metadata table.

        Only the last identifier is double-quoted (it contains ``$``). Quoting
        the whole dotted name would make Trino treat it as a single table
        identifier instead of catalog, schema and table.
        """
        return f'{self.catalog}.{self.schema}."{table_name}${suffix}"'

    def ensure_schema(self) -> None:
        """Create the Iceberg schema if it doesn't exist."""
        sql = f"CREATE SCHEMA IF NOT EXISTS {self.catalog}.{self.schema}"
        logger.info("Ensuring Iceberg schema: %s.%s", self.catalog, self.schema)
        self._execute_no_fetch(sql)

    def table_exists(self, table_name: str) -> bool:
        """Check if an Iceberg table exists in this manager's schema."""
        sql = (
            f"SELECT table_name FROM {self.catalog}.information_schema.tables "
            f"WHERE table_schema = '{self.schema}' AND table_name = '{table_name}'"
        )
        rows = self._execute(sql)
        return len(rows) > 0

    def create_table(self, table_name: str) -> bool:
        """Create a single Iceberg table if it doesn't exist.

        The DDL is ``CREATE TABLE IF NOT EXISTS``, so the call is a no-op for
        an existing table. No existence check is made, so the method cannot
        tell the two cases apart.

        Returns:
            Always True once the statement has been issued.

        Raises:
            ValueError: If ``table_name`` is not a registered table.
        """
        table_def = get_table_def(table_name)
        # NOTE(review): the DDL uses table_def.qualified_name, which is built
        # from the module-level catalog/schema defaults rather than
        # self.catalog / self.schema — confirm this is intended for managers
        # configured with non-default values.
        ddl = table_def.create_table_sql()
        logger.info("Creating Iceberg table: %s", table_def.qualified_name)
        self._execute_no_fetch(ddl)
        logger.info("Iceberg table ready: %s", table_def.qualified_name)
        return True

    def create_all_tables(self) -> dict[str, bool]:
        """Create all registered Iceberg tables.

        Returns a dict mapping table_name -> True (statement issued) or
        False (an exception was raised and logged).
        """
        self.ensure_schema()
        results: dict[str, bool] = {}
        for table_def in get_all_table_defs():
            try:
                self.create_table(table_def.table_name)
                results[table_def.table_name] = True
            except Exception:
                logger.exception("Failed to create Iceberg table: %s", table_def.table_name)
                results[table_def.table_name] = False
        return results

    def get_table_schema(self, table_name: str) -> list[dict[str, str]]:
        """Retrieve the column schema of an Iceberg table from Trino.

        Returns a list of dicts with 'column_name', 'data_type', and
        'is_nullable', ordered by ordinal position.
        """
        sql = (
            f"SELECT column_name, data_type, is_nullable "
            f"FROM {self.catalog}.information_schema.columns "
            f"WHERE table_schema = '{self.schema}' AND table_name = '{table_name}' "
            f"ORDER BY ordinal_position"
        )
        rows = self._execute(sql)
        return [
            {"column_name": r[0], "data_type": r[1], "is_nullable": r[2]}
            for r in rows
        ]

    def get_table_snapshots(self, table_name: str) -> list[dict[str, Any]]:
        """List Iceberg snapshots for a table (useful for auditing and rollback).

        Reads Trino's ``$snapshots`` metadata table. Columns are selected by
        name so the returned keys do not depend on the metadata table's column
        order. Returns an empty list if the query fails.
        """
        sql = (
            "SELECT snapshot_id, parent_id, operation, manifest_list, summary "
            f"FROM {self._metadata_table(table_name, 'snapshots')}"
        )
        try:
            rows = self._execute(sql)
        except Exception:
            logger.debug(
                "Could not read snapshots for %s (table may be empty)",
                table_name,
                exc_info=True,
            )
            return []
        return [
            {
                "snapshot_id": r[0],
                "parent_id": r[1],
                "operation": r[2],
                "manifest_list": r[3],
                "summary": r[4],
            }
            for r in rows
        ]

    def get_table_partitions(self, table_name: str) -> list[dict[str, Any]]:
        """List partition values for an Iceberg table.

        Reads Trino's ``$partitions`` metadata table and wraps each raw row as
        ``{"row": <row>}``. Returns an empty list if the query fails.
        """
        sql = f"SELECT * FROM {self._metadata_table(table_name, 'partitions')}"
        try:
            rows = self._execute(sql)
        except Exception:
            logger.debug(
                "Could not read partitions for %s (table may be empty)",
                table_name,
                exc_info=True,
            )
            return []
        return [{"row": r} for r in rows]

    def list_tables(self) -> list[str]:
        """List all tables in the Iceberg schema, sorted by name."""
        sql = (
            f"SELECT table_name FROM {self.catalog}.information_schema.tables "
            f"WHERE table_schema = '{self.schema}' ORDER BY table_name"
        )
        rows = self._execute(sql)
        return [r[0] for r in rows]

    def drop_table(self, table_name: str) -> None:
        """Drop an Iceberg table (for testing/reset purposes)."""
        qualified = f"{self.catalog}.{self.schema}.{table_name}"
        logger.warning("Dropping Iceberg table: %s", qualified)
        self._execute_no_fetch(f"DROP TABLE IF EXISTS {qualified}")

    def sync_table_schema(self, table_name: str) -> list[str]:
        """Add columns present in the PyArrow schema but missing from the table.

        Compares the expected PyArrow schema with the columns Trino reports
        and issues one ``ALTER TABLE ... ADD COLUMN`` per missing column.
        This is forward-only schema evolution — columns are never dropped.

        Returns:
            The names of the columns that were added, in schema order.

        Raises:
            ValueError: If ``table_name`` is not a registered table or one of
                its column types has no Trino mapping.
        """
        table_def = get_table_def(table_name)
        existing_names = {col["column_name"] for col in self.get_table_schema(table_name)}
        qualified = table_def.qualified_name

        added: list[str] = []
        for col_name, arrow_type in zip(table_def.schema.names, table_def.schema.types):
            if col_name in existing_names:
                continue
            trino_type = _arrow_type_to_trino(arrow_type)
            alter_sql = f"ALTER TABLE {qualified} ADD COLUMN {col_name} {trino_type}"
            logger.info("Adding column %s to %s", col_name, qualified)
            self._execute_no_fetch(alter_sql)
            added.append(col_name)

        return added

    def sync_all_schemas(self) -> dict[str, list[str]]:
        """Sync schemas for all registered tables. Returns table_name -> added columns.

        Tables that do not exist yet, and tables whose sync raised (logged),
        map to an empty list.
        """
        results: dict[str, list[str]] = {}
        for table_def in get_all_table_defs():
            try:
                if self.table_exists(table_def.table_name):
                    added = self.sync_table_schema(table_def.table_name)
                    results[table_def.table_name] = added
                else:
                    logger.info("Table %s doesn't exist yet, skipping sync", table_def.table_name)
                    results[table_def.table_name] = []
            except Exception:
                logger.exception("Failed to sync schema for %s", table_def.table_name)
                results[table_def.table_name] = []
        return results
|
||||||
|
|
||||||
|
|
||||||
|
def create_iceberg_manager_from_config(
    host: str = "localhost",
    port: int = 8080,
    user: str = "stonks",
) -> IcebergManager:
    """Build an IcebergManager from explicit Trino connection parameters.

    Catalog and schema are left at the IcebergManager defaults.
    """
    connection_params = {"host": host, "port": port, "user": user}
    return IcebergManager(**connection_params)
|
||||||
@@ -0,0 +1,673 @@
|
|||||||
|
"""Lake publisher async job runner — transforms operational data into analytical facts.
|
||||||
|
|
||||||
|
Reads jobs from the QUEUE_LAKE_PUBLISH Redis queue, queries PostgreSQL for
|
||||||
|
operational records, and publishes them as partitioned Parquet files to MinIO
|
||||||
|
via the existing publish_* functions in worker.py.
|
||||||
|
|
||||||
|
Job message format:
|
||||||
|
{"job_type": "<table_name>", "entity_id": "<uuid or ticker>", "dt": "2026-04-11T..."}
|
||||||
|
|
||||||
|
Supported job types:
|
||||||
|
- document: publish a single document metadata fact
|
||||||
|
- document_extraction: publish extraction facts for a document
|
||||||
|
- market_snapshot: publish market bars/quotes from a snapshot
|
||||||
|
- trade_order: publish an order fact
|
||||||
|
- trade_fill: publish fill facts for an order
|
||||||
|
- positions_snapshot: publish daily position snapshots for a broker account
|
||||||
|
- pnl_snapshot: publish daily PnL for a broker account
|
||||||
|
- company_event: publish a company event fact
|
||||||
|
- bulk_documents: publish all unpublished documents since a cutoff
|
||||||
|
- bulk_extractions: publish all unpublished extractions since a cutoff
|
||||||
|
|
||||||
|
Requirements: 9.4, 9.5, 10.1
|
||||||
|
Design ref: Section 4.10 (Lake Publisher), Section 8.4 (Lake publication flow)
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import asyncio
|
||||||
|
import json
|
||||||
|
import logging
|
||||||
|
from datetime import datetime, timezone
|
||||||
|
|
||||||
|
import asyncpg
|
||||||
|
import redis.asyncio as aioredis
|
||||||
|
from minio import Minio
|
||||||
|
|
||||||
|
from services.lake_publisher.worker import (
|
||||||
|
publish_document_extraction,
|
||||||
|
publish_document_fact,
|
||||||
|
publish_market_bar,
|
||||||
|
publish_market_quote,
|
||||||
|
publish_trade_order,
|
||||||
|
publish_trade_fill,
|
||||||
|
publish_pnl_daily,
|
||||||
|
publish_documents_batch,
|
||||||
|
publish_document_extractions_batch,
|
||||||
|
publish_positions_daily_batch,
|
||||||
|
)
|
||||||
|
from services.lake_publisher.partitions import partition_values
|
||||||
|
from services.shared.config import load_config
|
||||||
|
from services.shared.db import get_minio, get_pg_pool, get_redis
|
||||||
|
from services.shared.logging import setup_logging
|
||||||
|
from services.shared.redis_keys import QUEUE_LAKE_PUBLISH, queue_key
|
||||||
|
|
||||||
|
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)


# ---------------------------------------------------------------------------
# SQL queries for fetching operational data
# ---------------------------------------------------------------------------

# One document by id ($1). The ticker comes from a single row of
# document_company_mentions; the subquery has LIMIT 1 without ORDER BY, so which
# mention is picked is not defined when a document has several. Falls back to ''.
_FETCH_DOCUMENT = """
SELECT
d.id, d.document_type, d.source_type, d.publisher, d.title,
d.url, d.canonical_url, d.language, d.published_at, d.retrieved_at,
d.content_hash, d.parse_quality_score,
COALESCE(
(SELECT dcm.ticker FROM document_company_mentions dcm
WHERE dcm.document_id = d.id LIMIT 1),
''
) AS ticker
FROM documents d
WHERE d.id = $1::uuid
"""

# All impact records for one document ($1), restricted to intelligence rows
# with validation_status = 'valid'. One result row per impact record; the
# company name is '' when no company row joins.
_FETCH_EXTRACTIONS = """
SELECT
di.document_id, dir.ticker, dir.relevance, dir.sentiment,
dir.impact_score, dir.impact_horizon, dir.catalyst_type,
di.confidence, di.novelty_score, di.source_credibility,
dir.key_facts, dir.risks, di.macro_themes,
di.model_name, di.prompt_version, di.schema_version,
di.created_at AS extraction_at,
COALESCE(c.legal_name, '') AS company_name
FROM document_intelligence di
JOIN document_impact_records dir ON dir.intelligence_id = di.id
LEFT JOIN companies c ON c.id = dir.company_id
WHERE di.document_id = $1::uuid
AND di.validation_status = 'valid'
"""

# One market_snapshots row by id ($1).
_FETCH_MARKET_SNAPSHOT = """
SELECT
ms.ticker, ms.snapshot_type, ms.data, ms.source_provider, ms.captured_at
FROM market_snapshots ms
WHERE ms.id = $1::uuid
"""

# One order by id ($1) with its broker account; account id defaults to '' and
# execution mode to 'paper' when no broker account joins.
_FETCH_ORDER = """
SELECT
o.id, o.recommendation_id, o.ticker, o.side, o.order_type,
o.quantity, o.limit_price, o.status, o.submitted_at,
o.fill_price, o.fill_quantity, o.filled_at,
COALESCE(ba.account_id, '') AS broker_account,
COALESCE(ba.mode, 'paper') AS execution_mode
FROM orders o
LEFT JOIN broker_accounts ba ON ba.id = o.broker_account_id
WHERE o.id = $1::uuid
"""

# All order_events of type 'fill' for one order ($1), with the order's ticker
# and side and the broker account id ('' when none joins).
_FETCH_ORDER_FILLS = """
SELECT
oe.id AS fill_id, oe.order_id, oe.data, oe.broker_timestamp,
o.ticker, o.side,
COALESCE(ba.account_id, '') AS broker_account
FROM order_events oe
JOIN orders o ON o.id = oe.order_id
LEFT JOIN broker_accounts ba ON ba.id = o.broker_account_id
WHERE oe.order_id = $1::uuid AND oe.event_type = 'fill'
"""

# Non-zero positions for one broker account ($1).
_FETCH_POSITIONS = """
SELECT
p.ticker, p.quantity, p.avg_entry_price, p.current_price,
p.unrealized_pnl, p.realized_pnl,
COALESCE(ba.account_id, '') AS broker_account,
COALESCE(ba.mode, 'paper') AS execution_mode
FROM positions p
LEFT JOIN broker_accounts ba ON ba.id = p.broker_account_id
WHERE p.broker_account_id = $1::uuid AND p.quantity != 0
"""

# Documents created at or after the cutoff ($1) whose status is 'parsed' or
# 'extracted', oldest first. Capped at 500 rows per call; same arbitrary
# single-ticker subquery as _FETCH_DOCUMENT.
_FETCH_BULK_DOCUMENTS = """
SELECT
d.id, d.document_type, d.source_type, d.publisher, d.title,
d.url, d.canonical_url, d.language, d.published_at, d.retrieved_at,
d.content_hash, d.parse_quality_score,
COALESCE(
(SELECT dcm.ticker FROM document_company_mentions dcm
WHERE dcm.document_id = d.id LIMIT 1),
''
) AS ticker
FROM documents d
WHERE d.created_at >= $1
AND d.status IN ('parsed', 'extracted')
ORDER BY d.created_at
LIMIT 500
"""

# Valid extraction impact records whose intelligence row was created at or
# after the cutoff ($1), oldest first. Capped at 500 rows per call.
_FETCH_BULK_EXTRACTIONS = """
SELECT
di.document_id, dir.ticker, dir.relevance, dir.sentiment,
dir.impact_score, dir.impact_horizon, dir.catalyst_type,
di.confidence, di.novelty_score, di.source_credibility,
dir.key_facts, dir.risks, di.macro_themes,
di.model_name, di.prompt_version, di.schema_version,
di.created_at AS extraction_at,
COALESCE(c.legal_name, '') AS company_name
FROM document_intelligence di
JOIN document_impact_records dir ON dir.intelligence_id = di.id
LEFT JOIN companies c ON c.id = dir.company_id
WHERE di.created_at >= $1
AND di.validation_status = 'valid'
ORDER BY di.created_at
LIMIT 500
"""
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Job handlers — each transforms operational rows into lake facts
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
def _jsonb_to_str(val: object) -> str:
|
||||||
|
"""Convert a JSONB column value (list or str) to a comma-separated string."""
|
||||||
|
if val is None:
|
||||||
|
return ""
|
||||||
|
if isinstance(val, str):
|
||||||
|
try:
|
||||||
|
parsed = json.loads(val)
|
||||||
|
if isinstance(parsed, list):
|
||||||
|
return ", ".join(str(x) for x in parsed)
|
||||||
|
return val
|
||||||
|
except (json.JSONDecodeError, TypeError):
|
||||||
|
return val
|
||||||
|
if isinstance(val, list):
|
||||||
|
return ", ".join(str(x) for x in val)
|
||||||
|
return str(val)
|
||||||
|
|
||||||
|
|
||||||
|
async def publish_document_job(
    pool: asyncpg.Pool,
    minio_client: Minio,
    entity_id: str,
) -> str:
    """Load one document row and publish it as a document fact.

    Args:
        pool: PostgreSQL pool used to run ``_FETCH_DOCUMENT``.
        minio_client: MinIO client handed to ``publish_document_fact``.
        entity_id: Id of the document to publish.

    Returns:
        Whatever ``publish_document_fact`` returns, or ``""`` when no document
        row matches ``entity_id`` (a warning is logged in that case).
    """
    doc = await pool.fetchrow(_FETCH_DOCUMENT, entity_id)
    if doc is None:
        logger.warning("Document %s not found, skipping lake publish", entity_id)
        return ""

    retrieved_at = doc["retrieved_at"]
    fact_kwargs = {
        "client": minio_client,
        "document_id": str(doc["id"]),
        "document_type": doc["document_type"],
        "source_type": doc["source_type"],
        "ticker": doc["ticker"] or "",
        "publisher": doc["publisher"] or "",
        "title": doc["title"] or "",
        # Fall back to the retrieval time when the publication time is missing.
        "published_at": doc["published_at"] or retrieved_at,
        "content_hash": doc["content_hash"],
        "url": doc["url"] or "",
        "canonical_url": doc["canonical_url"] or "",
        "language": doc["language"] or "en",
        "confidence": float(doc["parse_quality_score"] or 0.0),
        "retrieved_at": retrieved_at,
    }
    return publish_document_fact(**fact_kwargs)
|
||||||
|
|
||||||
|
|
||||||
|
async def publish_extraction_job(
    pool: asyncpg.Pool,
    minio_client: Minio,
    entity_id: str,
) -> list[str]:
    """Publish one extraction fact per row returned for a document.

    Args:
        pool: Connection pool used to run the ``_FETCH_EXTRACTIONS`` query.
        minio_client: Client handed through to ``publish_document_extraction``.
        entity_id: Identifier passed as the query's only parameter.

    Returns:
        The values returned by ``publish_document_extraction``, in row order;
        an empty list when the query returns no rows.
    """
    records = await pool.fetch(_FETCH_EXTRACTIONS, entity_id)
    if not records:
        logger.info("No valid extractions for document %s", entity_id)
        return []

    # NULL columns fall back to neutral defaults so the fact is always complete.
    return [
        publish_document_extraction(
            client=minio_client,
            document_id=str(rec["document_id"]),
            ticker=rec["ticker"],
            sentiment=rec["sentiment"] or "neutral",
            impact_score=float(rec["impact_score"] or 0.0),
            catalyst_type=rec["catalyst_type"] or "other",
            confidence=float(rec["confidence"] or 0.0),
            extraction_at=rec["extraction_at"],
            model_name=rec["model_name"] or "",
            prompt_version=rec["prompt_version"] or "",
            company_name=rec["company_name"] or "",
            relevance=float(rec["relevance"] or 0.0),
            impact_horizon=rec["impact_horizon"] or "",
            novelty_score=float(rec["novelty_score"] or 0.0),
            source_credibility=float(rec["source_credibility"] or 0.0),
            key_facts=_jsonb_to_str(rec["key_facts"]),
            risks=_jsonb_to_str(rec["risks"]),
            macro_themes=_jsonb_to_str(rec["macro_themes"]),
            schema_version=rec["schema_version"] or "",
        )
        for rec in records
    ]
|
||||||
|
|
||||||
|
|
||||||
|
def _snapshot_field(payload: dict, long_key: str, short_key: str) -> object:
    """Return ``payload[long_key]``, else ``payload[short_key]``, else ``0``.

    A key whose value is JSON ``null`` is treated like a missing key, so the
    numeric conversions in the caller never receive ``None``.
    """
    for key in (long_key, short_key):
        value = payload.get(key)
        if value is not None:
            return value
    return 0


async def publish_market_snapshot_job(
    pool: asyncpg.Pool,
    minio_client: Minio,
    entity_id: str,
) -> list[str]:
    """Publish market bar or quote facts from one ``market_snapshots`` row.

    The row's ``data`` payload is used as a dict (decoded with ``json.loads``
    when it is not already one). ``bar``/``bars`` snapshots publish one fact
    per entry of ``data["bars"]``, or a single fact from ``data`` itself when
    that key is absent. ``quote``/``quotes`` snapshots publish one quote fact.
    Each numeric field is read under its long name first, then its short
    alias, and defaults to 0 when absent or null.

    Args:
        pool: Connection pool used to run the ``_FETCH_MARKET_SNAPSHOT`` query.
        minio_client: Client handed through to the publish helpers.
        entity_id: Identifier passed as the query's only parameter.

    Returns:
        The values returned by the publish helpers; empty when the row is
        missing or its ``snapshot_type`` is not handled here.
    """
    row = await pool.fetchrow(_FETCH_MARKET_SNAPSHOT, entity_id)
    if row is None:
        logger.warning("Market snapshot %s not found", entity_id)
        return []

    ticker = row["ticker"]
    data = row["data"] if isinstance(row["data"], dict) else json.loads(row["data"])
    source = row["source_provider"] or ""
    captured_at = row["captured_at"]
    snapshot_type = row["snapshot_type"]
    refs: list[str] = []

    if snapshot_type in ("bar", "bars"):
        # Either a payload wrapping a list of bars, or the payload is one bar.
        bars = data["bars"] if "bars" in data else [data]
        for bar in bars:
            ref = publish_market_bar(
                client=minio_client,
                ticker=ticker,
                open_price=float(_snapshot_field(bar, "open", "o")),
                high_price=float(_snapshot_field(bar, "high", "h")),
                low_price=float(_snapshot_field(bar, "low", "l")),
                close_price=float(_snapshot_field(bar, "close", "c")),
                volume=int(_snapshot_field(bar, "volume", "v")),
                # Every bar of the snapshot is stamped with the capture time.
                bar_timestamp=captured_at,
                source=source,
                vwap=float(_snapshot_field(bar, "vwap", "vw")),
                trade_count=int(_snapshot_field(bar, "trade_count", "n")),
                bar_interval=bar.get("interval", "1d"),
            )
            refs.append(ref)
    elif snapshot_type in ("quote", "quotes"):
        ref = publish_market_quote(
            client=minio_client,
            ticker=ticker,
            bid_price=float(_snapshot_field(data, "bid_price", "bp")),
            ask_price=float(_snapshot_field(data, "ask_price", "ap")),
            last_price=float(_snapshot_field(data, "last_price", "lp")),
            quote_at=captured_at,
            source=source,
            bid_size=int(_snapshot_field(data, "bid_size", "bs")),
            ask_size=int(_snapshot_field(data, "ask_size", "as")),
            last_size=int(_snapshot_field(data, "last_size", "ls")),
        )
        refs.append(ref)
    else:
        # Make the drop visible instead of silently reporting zero facts.
        logger.warning(
            "Market snapshot %s has unsupported snapshot_type %r; nothing published",
            entity_id,
            snapshot_type,
        )

    return refs
|
||||||
|
|
||||||
|
|
||||||
|
async def publish_order_job(
    pool: asyncpg.Pool,
    minio_client: Minio,
    entity_id: str,
) -> str:
    """Publish the trade order fact for one order row to the lake.

    Args:
        pool: Connection pool used to run the ``_FETCH_ORDER`` query.
        minio_client: Client handed through to ``publish_trade_order``.
        entity_id: Identifier passed as the query's only parameter.

    Returns:
        Whatever ``publish_trade_order`` returns, or ``""`` when the query
        finds no row.
    """
    row = await pool.fetchrow(_FETCH_ORDER, entity_id)
    if row is None:
        logger.warning("Order %s not found", entity_id)
        return ""

    # Orders with no submission time are stamped with the publish time (UTC).
    submitted_at = row["submitted_at"] or datetime.now(timezone.utc)

    # Compare against None rather than truthiness so a stored price of 0 is
    # published as 0.0 instead of being mistaken for "no limit price".
    limit_price = row["limit_price"]
    recommendation_id = row["recommendation_id"]

    return publish_trade_order(
        client=minio_client,
        order_id=str(row["id"]),
        ticker=row["ticker"],
        side=row["side"],
        order_type=row["order_type"],
        quantity=float(row["quantity"]),
        limit_price=float(limit_price) if limit_price is not None else None,
        status=row["status"],
        broker_account=row["broker_account"],
        submitted_at=submitted_at,
        recommendation_id=str(recommendation_id) if recommendation_id else "",
        execution_mode=row["execution_mode"],
    )
|
||||||
|
|
||||||
|
|
||||||
|
async def publish_fills_job(
    pool: asyncpg.Pool,
    minio_client: Minio,
    entity_id: str,
) -> list[str]:
    """Publish one trade fill fact per fill row returned for an order.

    Args:
        pool: Connection pool used to run the ``_FETCH_ORDER_FILLS`` query.
        minio_client: Client handed through to ``publish_trade_fill``.
        entity_id: Identifier passed as the query's only parameter.

    Returns:
        The values returned by ``publish_trade_fill``, in row order; an empty
        list when the query returns no rows.
    """
    fill_rows = await pool.fetch(_FETCH_ORDER_FILLS, entity_id)
    if not fill_rows:
        logger.info("No fill events for order %s", entity_id)
        return []

    published: list[str] = []
    for fill in fill_rows:
        # The payload may arrive decoded (dict) or as JSON text / NULL.
        payload = fill["data"]
        if not isinstance(payload, dict):
            payload = json.loads(payload or "{}")

        # Fills without a broker timestamp are stamped with the publish time.
        when = fill["broker_timestamp"] or datetime.now(timezone.utc)

        published.append(
            publish_trade_fill(
                client=minio_client,
                fill_id=str(fill["fill_id"]),
                order_id=str(fill["order_id"]),
                ticker=fill["ticker"],
                side=fill["side"],
                fill_price=float(payload.get("fill_price", payload.get("price", 0))),
                fill_quantity=float(payload.get("fill_quantity", payload.get("qty", 0))),
                broker_account=fill["broker_account"],
                filled_at=when,
                commission=float(payload.get("commission", 0)),
            )
        )
    return published
|
||||||
|
|
||||||
|
|
||||||
|
async def publish_positions_job(
    pool: asyncpg.Pool,
    minio_client: Minio,
    entity_id: str,
) -> str:
    """Publish a positions snapshot batch from the rows of one account query.

    Args:
        pool: Connection pool used to run the ``_FETCH_POSITIONS`` query.
        minio_client: Client handed through to ``publish_positions_daily_batch``.
        entity_id: Identifier passed as the query's only parameter.

    Returns:
        Whatever ``publish_positions_daily_batch`` returns, or ``""`` when the
        query returns no rows.
    """
    held = await pool.fetch(_FETCH_POSITIONS, entity_id)
    if not held:
        logger.info("No open positions for account %s", entity_id)
        return ""

    # One timestamp for the whole batch, taken at publish time (UTC).
    taken_at = datetime.now(timezone.utc)

    snapshot: list[dict[str, object]] = []
    for pos in held:
        snapshot.append(
            {
                "ticker": pos["ticker"],
                "quantity": float(pos["quantity"]),
                "avg_entry_price": float(pos["avg_entry_price"] or 0),
                # The row's current price is published as the close price.
                "close_price": float(pos["current_price"] or 0),
                "unrealized_pnl": float(pos["unrealized_pnl"] or 0),
            }
        )

    # The early return above guarantees at least one row here.
    return publish_positions_daily_batch(
        client=minio_client,
        positions=snapshot,
        broker_account=held[0]["broker_account"],
        snapshot_at=taken_at,
    )
|
||||||
|
|
||||||
|
|
||||||
|
async def publish_pnl_job(
    pool: asyncpg.Pool,
    minio_client: Minio,
    entity_id: str,
) -> list[str]:
    """Publish one daily PnL fact per position row of an account query.

    Args:
        pool: Connection pool used to run the ``_FETCH_POSITIONS`` query.
        minio_client: Client handed through to ``publish_pnl_daily``.
        entity_id: Identifier passed as the query's only parameter.

    Returns:
        The values returned by ``publish_pnl_daily``, in row order; an empty
        list when the query returns no rows.
    """
    position_rows = await pool.fetch(_FETCH_POSITIONS, entity_id)
    if not position_rows:
        logger.info("No positions for PnL snapshot, account %s", entity_id)
        return []

    # All facts of one run share the same publish-time stamp (UTC).
    stamp = datetime.now(timezone.utc)

    def _publish(pos) -> str:
        realized_pnl = float(pos["realized_pnl"] or 0)
        unrealized_pnl = float(pos["unrealized_pnl"] or 0)
        return publish_pnl_daily(
            client=minio_client,
            ticker=pos["ticker"],
            realized_pnl=realized_pnl,
            unrealized_pnl=unrealized_pnl,
            total_pnl=realized_pnl + unrealized_pnl,
            broker_account=pos["broker_account"],
            dt=stamp,
            execution_mode=pos["execution_mode"],
        )

    return [_publish(pos) for pos in position_rows]
|
||||||
|
|
||||||
|
|
||||||
|
async def publish_bulk_documents_job(
    pool: asyncpg.Pool,
    minio_client: Minio,
    since: datetime,
) -> list[str]:
    """Publish every document returned by the bulk query as one batch.

    Args:
        pool: Connection pool used to run the ``_FETCH_BULK_DOCUMENTS`` query.
        minio_client: Client handed through to ``publish_documents_batch``.
        since: Cutoff passed to the query and to ``publish_documents_batch``.

    Returns:
        A one-element list holding the value returned by
        ``publish_documents_batch``, or an empty list when there are no rows
        or that value is falsy.
    """
    records = await pool.fetch(_FETCH_BULK_DOCUMENTS, since)
    if not records:
        logger.info("No documents to bulk-publish since %s", since)
        return []

    def _as_fact(rec) -> dict[str, object]:
        # Documents without a publication time are dated by retrieval time;
        # the same value drives the partition columns merged in below.
        effective_published_at = rec["published_at"] or rec["retrieved_at"]
        return {
            "document_id": str(rec["id"]),
            "document_type": rec["document_type"],
            "source_type": rec["source_type"],
            "ticker": rec["ticker"] or "",
            "publisher": rec["publisher"] or "",
            "title": rec["title"] or "",
            "url": rec["url"] or "",
            "canonical_url": rec["canonical_url"] or "",
            "language": rec["language"] or "en",
            "published_at": effective_published_at,
            "retrieved_at": rec["retrieved_at"],
            "content_hash": rec["content_hash"],
            "confidence": float(rec["parse_quality_score"] or 0.0),
            **partition_values(effective_published_at),
        }

    facts = [_as_fact(rec) for rec in records]
    batch_ref = publish_documents_batch(minio_client, facts, since)
    if not batch_ref:
        return []
    return [batch_ref]
|
||||||
|
|
||||||
|
|
||||||
|
async def publish_bulk_extractions_job(
    pool: asyncpg.Pool,
    minio_client: Minio,
    since: datetime,
) -> list[str]:
    """Publish the extractions returned by the bulk query, one batch per version.

    Each row's ``model_version`` is its ``schema_version``, falling back to
    ``prompt_version`` and then to ``""``. Rows are grouped by that value and
    each group is published with its own ``model_version`` argument, so a
    batch never mixes rows of different versions under one version label.
    When every row shares a version this is a single call, as before.

    Args:
        pool: Connection pool used to run the ``_FETCH_BULK_EXTRACTIONS`` query.
        minio_client: Client handed through to
            ``publish_document_extractions_batch``.
        since: Cutoff passed to the query and to every batch publish call.

    Returns:
        The truthy values returned by ``publish_document_extractions_batch``,
        in order of each version's first appearance; empty when there are no
        rows.
    """
    rows = await pool.fetch(_FETCH_BULK_EXTRACTIONS, since)
    if not rows:
        logger.info("No extractions to bulk-publish since %s", since)
        return []

    # dicts keep insertion order, so groups come out in first-seen order.
    rows_by_version: dict[str, list[dict[str, object]]] = {}
    for row in rows:
        model_ver = str(row["schema_version"] or row["prompt_version"] or "")
        rows_by_version.setdefault(model_ver, []).append({
            "document_id": str(row["document_id"]),
            "ticker": row["ticker"],
            "company_name": row["company_name"] or "",
            "relevance": float(row["relevance"] or 0.0),
            "sentiment": row["sentiment"] or "neutral",
            "impact_score": float(row["impact_score"] or 0.0),
            "impact_horizon": row["impact_horizon"] or "",
            "catalyst_type": row["catalyst_type"] or "other",
            "confidence": float(row["confidence"] or 0.0),
            "novelty_score": float(row["novelty_score"] or 0.0),
            "source_credibility": float(row["source_credibility"] or 0.0),
            "key_facts": _jsonb_to_str(row["key_facts"]),
            "risks": _jsonb_to_str(row["risks"]),
            "macro_themes": _jsonb_to_str(row["macro_themes"]),
            "model_name": row["model_name"] or "",
            "prompt_version": row["prompt_version"] or "",
            "schema_version": row["schema_version"] or "",
            "extraction_at": row["extraction_at"],
            **partition_values(row["extraction_at"], {"model_version": model_ver}),
        })

    refs: list[str] = []
    for model_ver, extraction_rows in rows_by_version.items():
        ref = publish_document_extractions_batch(
            minio_client, extraction_rows, since,
            model_version=model_ver,
        )
        if ref:
            refs.append(ref)
    return refs
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Job dispatcher
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
# Job type names recognised for lake publish jobs.
# NOTE(review): "company_event" is listed here, but dispatch_job has no branch
# for it and reports it as an unknown job type — confirm whether a handler is
# still to be added.
JOB_TYPES = {
    "document",
    "document_extraction",
    "market_snapshot",
    "trade_order",
    "trade_fill",
    "positions_snapshot",
    "pnl_snapshot",
    "company_event",
    "bulk_documents",
    "bulk_extractions",
}
|
||||||
|
|
||||||
|
|
||||||
|
def _parse_since(since_str: "str | None") -> datetime:
    """Turn a job's optional ``since`` value into an aware datetime.

    A missing or empty value means "now" in UTC. A trailing ``Z`` is rewritten
    to ``+00:00`` because ``datetime.fromisoformat`` only accepts it on newer
    Python versions. A timestamp without an offset is taken to be UTC, in line
    with the aware UTC default used when no value is given.

    Raises:
        ValueError: If the text is not an ISO 8601 timestamp.
    """
    if not since_str:
        return datetime.now(timezone.utc)

    text = since_str.strip()
    if text.endswith(("Z", "z")):
        text = text[:-1] + "+00:00"

    since = datetime.fromisoformat(text)
    if since.tzinfo is None:
        since = since.replace(tzinfo=timezone.utc)
    return since


async def dispatch_job(
    pool: asyncpg.Pool,
    minio_client: Minio,
    job: dict[str, str],
) -> dict[str, object]:
    """Dispatch a lake publish job to the appropriate handler.

    Entity jobs are called with the job's ``entity_id``; bulk jobs are called
    with the cutoff parsed from the job's optional ``since`` value. Any
    exception raised while handling the job is logged and reported through the
    result's ``error`` entry rather than propagated.

    Args:
        pool: PostgreSQL connection pool.
        minio_client: MinIO client for writing Parquet files.
        job: Job dict with at least 'job_type' and 'entity_id'.

    Returns:
        A result dict with 'job_type', 'entity_id', 'refs' (the values the
        handler returned, as a list), and 'error' (None on success).
    """
    # Reject non-object payloads up front: calling .get() on them would raise
    # before the try block below and escape to the caller.
    if not isinstance(job, dict):
        message = f"Job payload must be a JSON object, got {type(job).__name__}"
        logger.warning("Rejected lake publish job: %s", message)
        return {"job_type": "", "entity_id": "", "refs": [], "error": message}

    job_type = job.get("job_type", "")
    entity_id = job.get("entity_id", "")

    result: dict[str, object] = {
        "job_type": job_type,
        "entity_id": entity_id,
        "refs": [],
        "error": None,
    }

    # Built per call so the handlers are only looked up when a job arrives.
    entity_handlers = {
        "document": publish_document_job,
        "document_extraction": publish_extraction_job,
        "market_snapshot": publish_market_snapshot_job,
        "trade_order": publish_order_job,
        "trade_fill": publish_fills_job,
        "positions_snapshot": publish_positions_job,
        "pnl_snapshot": publish_pnl_job,
    }
    bulk_handlers = {
        "bulk_documents": publish_bulk_documents_job,
        "bulk_extractions": publish_bulk_extractions_job,
    }

    try:
        if job_type in entity_handlers:
            published = await entity_handlers[job_type](pool, minio_client, entity_id)
        elif job_type in bulk_handlers:
            since = _parse_since(job.get("since"))
            published = await bulk_handlers[job_type](pool, minio_client, since)
        else:
            result["error"] = f"Unknown job_type: {job_type}"
            logger.warning("Unknown lake publish job type: %s", job_type)
            return result

        # Single-fact handlers return one string ("" when nothing was written);
        # the others already return a list.
        if isinstance(published, str):
            result["refs"] = [published] if published else []
        else:
            result["refs"] = published

    except Exception as exc:
        result["error"] = str(exc)
        logger.exception("Lake publish job failed: %s/%s", job_type, entity_id)

    return result
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Async worker loop
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
async def run_worker(
    pool: asyncpg.Pool,
    rds: aioredis.Redis,
    minio_client: Minio,
    poll_interval: float = 2.0,
) -> None:
    """Main worker loop — reads jobs from Redis and dispatches them.

    Runs indefinitely until cancelled. Each job is processed sequentially
    to keep MinIO write ordering predictable. Payloads that are not a JSON
    object are logged and dropped.

    Args:
        pool: Connection pool handed to ``dispatch_job``.
        rds: Redis client the queue is popped from.
        minio_client: Client handed to ``dispatch_job``.
        poll_interval: Seconds to sleep when the queue is empty.
    """
    queue = queue_key(QUEUE_LAKE_PUBLISH)
    logger.info("Lake publisher worker started, listening on %s", queue)

    while True:
        raw = await rds.lpop(queue)  # type: ignore[misc]
        if raw is None:
            await asyncio.sleep(poll_interval)
            continue

        try:
            # json.loads decodes bytes itself; str(raw) on a bytes payload
            # would produce "b'...'" and make every job look malformed.
            if isinstance(raw, (str, bytes, bytearray)):
                job = json.loads(raw)
            else:
                job = json.loads(str(raw))
        except (ValueError, TypeError):
            # ValueError covers JSONDecodeError and undecodable bytes.
            logger.error("Invalid lake publish job payload: %s", raw)
            continue

        if not isinstance(job, dict):
            # Valid JSON that is not an object (list, string, number, ...)
            # cannot carry job_type/entity_id; skip it and keep the loop alive.
            logger.error("Invalid lake publish job payload: %s", raw)
            continue

        result = await dispatch_job(pool, minio_client, job)
        refs = result.get("refs") or []
        error = result.get("error")

        if error:
            logger.error(
                "Lake publish job %s/%s failed: %s",
                result["job_type"], result["entity_id"], error,
            )
        else:
            ref_count = len(refs) if isinstance(refs, list) else 0
            logger.info(
                "Lake publish job %s/%s completed: %d facts written",
                result["job_type"], result["entity_id"], ref_count,
            )
|
||||||
|
|
||||||
|
|
||||||
|
async def main() -> None:
    """Build the worker's connections, run the loop, and close them afterwards.

    The PostgreSQL pool and the Redis client are closed in a ``finally``
    clause, so they are released however ``run_worker`` stops.
    """
    settings = load_config()
    pool = await get_pg_pool(settings)
    rds = get_redis(settings)
    storage = get_minio(settings)

    try:
        await run_worker(pool, rds, storage)
    finally:
        await pool.close()
        # NOTE(review): newer redis-py releases prefer aclose() for async
        # clients — confirm against the pinned client version.
        await rds.close()
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Script entry point: configure logging from the loaded config first, then
    # start the event loop. main() loads the config again for its connections.
    cfg = load_config()
    setup_logging("lake_publisher", level=cfg.log_level, json_output=cfg.json_logs)
    asyncio.run(main())
|
||||||
@@ -0,0 +1,128 @@
|
|||||||
|
"""Hive-compatible partition layout conventions for the MinIO lakehouse.
|
||||||
|
|
||||||
|
Centralizes partition path generation, partition column injection, and
|
||||||
|
bucket provisioning so that all lake publisher writers produce layouts
|
||||||
|
that Trino's Hive and Iceberg connectors can discover and prune.
|
||||||
|
|
||||||
|
Design ref: Section 5.2, 5.3 (Lakehouse model)
|
||||||
|
Requirements: 9.4, 9.5, N4, N6
|
||||||
|
|
||||||
|
Layout convention:
|
||||||
|
s3://stonks-lakehouse/warehouse/{table_name}/dt={YYYY-MM-DD}[/{extra_key}={value}]/part-{uuid}.parquet
|
||||||
|
|
||||||
|
Rules:
|
||||||
|
- Every fact table is partitioned by ``dt`` (DATE) derived from the row timestamp.
|
||||||
|
- Some tables have a second partition key (e.g. ``model_version``).
|
||||||
|
- Partition columns MUST appear in the Parquet file so Trino can read them
|
||||||
|
without relying solely on path parsing.
|
||||||
|
- File names use a UUID suffix to avoid collisions on concurrent writes.
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import uuid
|
||||||
|
from dataclasses import dataclass, field
|
||||||
|
from datetime import date, datetime, timezone
|
||||||
|
|
||||||
|
|
||||||
|
# Bucket name that s3_uri() places after the "s3://" scheme.
LAKEHOUSE_BUCKET = "stonks-lakehouse"
# First path segment of every object key built by partition_path().
WAREHOUSE_PREFIX = "warehouse"
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass(frozen=True)
class PartitionSpec:
    """Immutable description of how one fact table is partitioned.

    ``table_name`` is the table's logical name; ``extra_keys`` lists any
    partition keys that follow the always-present ``dt`` key.
    """

    table_name: str
    extra_keys: tuple[str, ...] = ()

    @property
    def all_keys(self) -> tuple[str, ...]:
        """Every partition key in path order: ``dt`` first, then the extras."""
        return ("dt",) + tuple(self.extra_keys)
|
||||||
|
|
||||||
|
|
||||||
|
# Registry of every analytical fact table and its partition keys, keyed by
# table name. This is the single source of truth — DDL, publisher, and tests
# should agree. Keys are derived from each spec so the two cannot drift apart.
TABLE_PARTITIONS: dict[str, PartitionSpec] = {
    spec.table_name: spec
    for spec in (
        PartitionSpec("market_bars"),
        PartitionSpec("market_quotes"),
        PartitionSpec("company_events"),
        PartitionSpec("documents"),
        PartitionSpec("document_extractions", extra_keys=("model_version",)),
        PartitionSpec("trade_signals"),
        PartitionSpec("trade_orders"),
        PartitionSpec("trade_fills"),
        PartitionSpec("positions_daily"),
        PartitionSpec("pnl_daily"),
        PartitionSpec("prediction_vs_outcome", extra_keys=("model_version",)),
        PartitionSpec("model_performance", extra_keys=("model_version",)),
    )
}
|
||||||
|
|
||||||
|
|
||||||
|
def partition_path(
    table_name: str,
    dt: datetime | date,
    extra_partitions: dict[str, str] | None = None,
    file_id: str | None = None,
) -> str:
    """Build a Hive-compatible object path for a Parquet file.

    Args:
        table_name: Logical fact table name (must be in TABLE_PARTITIONS).
        dt: Row timestamp or date used to derive the ``dt=`` partition.
        extra_partitions: Additional partition key/value pairs (e.g. model_version).
            Only keys declared by the table's spec are used. A missing, ``None``
            or empty value is written as ``__NONE__``.
        file_id: Optional override for the file suffix (defaults to the first
            16 hex characters of a UUID4).

    Returns:
        Object key relative to the bucket root, e.g.
        ``warehouse/trade_signals/dt=2026-04-11/part-<suffix>.parquet``

    Raises:
        ValueError: If ``table_name`` is not registered in TABLE_PARTITIONS.
    """
    # Local import keeps this function self-contained.
    from urllib.parse import quote

    spec = TABLE_PARTITIONS.get(table_name)
    if spec is None:
        raise ValueError(f"Unknown table: {table_name}. Register it in TABLE_PARTITIONS.")

    if isinstance(dt, datetime):
        dt_str = dt.strftime("%Y-%m-%d")
    else:
        dt_str = dt.isoformat()

    segments = [WAREHOUSE_PREFIX, table_name, f"dt={dt_str}"]

    # Append extra partition directories in the order declared by the spec.
    extras = extra_partitions or {}
    for key in spec.extra_keys:
        value = extras.get(key)
        if value is None or value == "":
            # An empty value would leave a bare "key=" directory name.
            encoded = "__NONE__"
        else:
            # Percent-encode so characters such as "/" or "=" stay inside this
            # one path segment; letters, digits and "_.-~" are left untouched.
            encoded = quote(str(value), safe="")
        segments.append(f"{key}={encoded}")

    suffix = file_id or uuid.uuid4().hex[:16]
    segments.append(f"part-{suffix}.parquet")

    return "/".join(segments)
|
||||||
|
|
||||||
|
|
||||||
|
def partition_values(
    dt: datetime | date,
    extra_partitions: dict[str, str] | None = None,
) -> dict[str, object]:
    """Return the partition column values to embed in Parquet row data.

    The result always starts with ``dt`` (the date part of a datetime, or the
    date itself) followed by the given extra key/value pairs, for example
    ``{"dt": date(2026, 4, 11), "model_version": "v2"}``. Embedding these in
    the file lets readers that do not parse Hive-style paths still see them.
    """
    columns: dict[str, object] = {
        "dt": dt.date() if isinstance(dt, datetime) else dt,
    }
    columns.update(extra_partitions or {})
    return columns
|
||||||
|
|
||||||
|
|
||||||
|
def s3_uri(path: str) -> str:
    """Return the ``s3://`` URI of a bucket-relative object path.

    The path is appended to ``LAKEHOUSE_BUCKET`` after a single ``/``.
    """
    return "s3://{}/{}".format(LAKEHOUSE_BUCKET, path)
|
||||||
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,858 @@
|
|||||||
|
"""HTML-to-text parsing pipeline using BeautifulSoup.
|
||||||
|
|
||||||
|
Provides structured HTML parsing with boilerplate removal, metadata extraction,
|
||||||
|
outbound link extraction, and quality scoring. Inspired by Noctipede crawler
|
||||||
|
patterns: BeautifulSoup + content hashing, boilerplate stripping, quality scoring.
|
||||||
|
|
||||||
|
Requirements: 4.1, 4.2, 4.3
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import json
|
||||||
|
import logging
|
||||||
|
import math
|
||||||
|
import re
|
||||||
|
from dataclasses import dataclass, field
|
||||||
|
from urllib.parse import urlparse
|
||||||
|
|
||||||
|
from bs4 import BeautifulSoup, Tag
|
||||||
|
|
||||||
|
# Module logger, registered under the fixed name "html_parser".
logger = logging.getLogger("html_parser")

# Tags that never contain useful article content.
# _strip_boilerplate_tags removes every element with one of these names.
STRIP_TAGS = [
    "script", "style", "nav", "footer", "header", "aside",
    "iframe", "noscript", "svg", "form", "button",
]
|
||||||
|
|
||||||
|
# CSS class / id substrings that signal boilerplate containers.
# _is_boilerplate_container tests each entry with a plain substring check
# against the lowercased "class id" text, so an entry also matches inside
# longer names (for example "comment" matches "commentary").
BOILERPLATE_SIGNALS = [
    "sidebar", "widget", "advert", "promo", "newsletter",
    "social-share", "share-bar", "related-posts", "comment",
    "cookie", "popup", "modal", "banner", "breadcrumb",
    "pagination", "nav-", "menu", "toolbar", "signup",
    "subscribe", "follow-us", "social-media", "share-button",
    "ad-slot", "ad-container", "sponsored",
]
|
||||||
|
|
||||||
|
# Regex patterns for residual boilerplate in extracted text.
# _reduce_boilerplate_text applies them in this order, replacing each match
# with the empty string. Patterns written with ^ or $ use the MULTILINE flag
# so they apply to individual lines, not only to the start or end of the
# whole text, and they use [ \t] instead of \s so they never swallow
# neighbouring lines.
BOILERPLATE_TEXT_PATTERNS = [
    re.compile(r"(?i)subscribe to our newsletter.*?(?:\n|$)"),
    re.compile(r"(?i)click here to read more.*?(?:\n|$)"),
    re.compile(r"(?i)advertisement\s*\n?"),
    re.compile(r"(?i)copyright ©.*?(?:\n|$)"),
    re.compile(r"(?i)all rights reserved.*?(?:\n|$)"),
    re.compile(r"(?i)terms of (use|service).*?(?:\n|$)"),
    re.compile(r"(?i)privacy policy.*?(?:\n|$)"),
    # Bracketed ad markers such as "[Ad]" or "[Advertisement]". The ad word
    # must stand alone (so "[Broadcom]" or "[read more]" are kept) and the
    # match cannot run past the closing bracket into later brackets. Only
    # leading blanks are consumed, so the neighbouring words stay separated.
    re.compile(
        r"[ \t]*\[[^\]\n]*\b(?:ads?|adverts?|advertisements?)\b[^\]\n]*\]",
        re.IGNORECASE,
    ),
    re.compile(r"(?i)sign up for .*?(?:\n|$)"),
    re.compile(r"(?i)follow us on .*?(?:\n|$)"),
    re.compile(r"(?i)share this (article|story|post).*?(?:\n|$)"),
    re.compile(r"(?im)read more:?[ \t]*$"),
    re.compile(r"(?i)recommended for you.*?(?:\n|$)"),
    re.compile(r"(?i)you may also like.*?(?:\n|$)"),
    re.compile(r"(?i)trending now.*?(?:\n|$)"),
    re.compile(r"(?i)most (popular|read).*?(?:\n|$)"),
    re.compile(r"(?im)^tags:[ \t]*$"),
    re.compile(r"(?im)^[ \t]*photo[ \t]*:.*?(?:\n|$)"),
    re.compile(r"(?im)^[ \t]*image[ \t]*(credit|source|courtesy)[ \t]*:.*?(?:\n|$)"),
]
|
||||||
|
|
||||||
|
# Selectors for article body candidates, in priority order: element names,
# an attribute selector, then class (".") and id ("#") selectors, with the
# generic "main" element last.
ARTICLE_SELECTORS = [
    "article",
    "[role='main']",
    ".article-body",
    ".post-content",
    ".entry-content",
    ".story-body",
    ".article-content",
    "#article-body",
    "#story-body",
    ".article-text",
    ".post-body",
    ".content-body",
    "main",
]
|
||||||
|
|
||||||
|
# Minimum text density (text chars / total chars including markup) for a block
# to be considered content-rich rather than boilerplate
_MIN_TEXT_DENSITY = 0.25

# Minimum word count for a block to be a viable body candidate;
# _block_score returns 0.0 for blocks with fewer words than this.
_MIN_BLOCK_WORDS = 20
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class QualitySignals:
    """Per-dimension quality signals that feed the overall parse score.

    Every signal is a float meant to lie in [0, 1] and defaults to 0.0;
    higher means the parsed content does better on that dimension.

    Requirements: 4.3
    """

    word_count_signal: float = 0.0
    diversity_signal: float = 0.0
    sentence_signal: float = 0.0
    paragraph_signal: float = 0.0
    body_found_signal: float = 0.0
    metadata_signal: float = 0.0

    def as_dict(self) -> dict[str, float]:
        """Return the signals keyed by name without the ``_signal`` suffix."""
        names = (
            "word_count",
            "diversity",
            "sentence",
            "paragraph",
            "body_found",
            "metadata",
        )
        return {name: getattr(self, f"{name}_signal") for name in names}
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class CompanyMention:
    """A detected company mention in parsed text.

    Only ``match_count`` has a default (1); the other fields must be supplied
    when constructing a mention.

    Requirements: 1.3, 4.1
    """

    # Identifier of the mentioned company.
    company_id: str
    # Ticker symbol of the mentioned company.
    ticker: str
    # How the mention was matched.
    mention_type: str  # ticker, legal_name, alias, brand
    # Confidence of the match — presumably in [0, 1]; TODO confirm with the producer.
    confidence: float
    # Number of matches counted for this mention.
    match_count: int = 1
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class ParsedDocument:
    """Result of HTML-to-text parsing pipeline.

    Every field has a default, so an empty instance can be created and filled
    in step by step. List-valued fields and ``quality_signals`` use default
    factories, so instances never share those objects.
    """

    # Extracted body text and descriptive metadata.
    body_text: str = ""
    title: str = ""
    author: str = ""
    publisher: str = ""
    # Kept as text (or None) rather than a datetime.
    published_at: str | None = None
    canonical_url: str | None = None
    language: str = "en"
    description: str = ""
    document_type: str = "article"
    # Collections gathered from the page.
    outbound_links: list[str] = field(default_factory=list)
    tags: list[str] = field(default_factory=list)
    mentioned_companies: list[CompanyMention] = field(default_factory=list)
    # Quality assessment: overall score, a textual confidence label that
    # starts at "low", the word count, and the individual signals.
    quality_score: float = 0.0
    confidence: str = "low"
    word_count: int = 0
    quality_signals: QualitySignals = field(default_factory=QualitySignals)
    low_quality_flag: bool = False
    quality_warnings: list[str] = field(default_factory=list)
|
||||||
|
|
||||||
|
|
||||||
|
def _attr_str(tag: Tag, attr: str) -> str:
|
||||||
|
"""Safely get a tag attribute as a joined string."""
|
||||||
|
val = tag.get(attr, "")
|
||||||
|
if isinstance(val, list):
|
||||||
|
return " ".join(val)
|
||||||
|
return str(val) if val else ""
|
||||||
|
|
||||||
|
|
||||||
|
def _is_boilerplate_container(tag: Tag) -> bool:
    """Tell whether a tag's class or id marks it as a boilerplate container.

    The lowercased class and id values are joined with a space and searched
    for each entry of ``BOILERPLATE_SIGNALS`` as a plain substring.
    """
    haystack = " ".join(
        (_attr_str(tag, "class").lower(), _attr_str(tag, "id").lower())
    )
    for signal in BOILERPLATE_SIGNALS:
        if signal in haystack:
            return True
    return False
|
||||||
|
|
||||||
|
|
||||||
|
def _decompose_outermost(tags: list[Tag]) -> None:
    """Decompose the given tags, skipping any that sit inside another of them.

    Decomposing a container also destroys its descendants, and Beautiful Soup
    documents the behaviour of a decomposed element as undefined. Picking the
    outermost matches first means no destroyed element is ever touched again.
    """
    doomed = {id(tag) for tag in tags}
    # Fully resolved before the first decompose, while the tree is intact.
    outermost = [
        tag for tag in tags
        if not any(id(parent) in doomed for parent in tag.parents)
    ]
    for tag in outermost:
        tag.decompose()


def _strip_boilerplate_tags(soup: BeautifulSoup) -> None:
    """Remove known non-content tags and boilerplate containers in-place.

    First removes every element named in ``STRIP_TAGS``, then every remaining
    element whose class or id matches ``_is_boilerplate_container``. The
    ``html`` and ``body`` elements are never removed by the second step: a
    page-level class such as "no-sidebar" on ``body`` would otherwise delete
    the whole document.
    """
    # find_all accepts a list of names and returns matches for any of them.
    _decompose_outermost(soup.find_all(STRIP_TAGS))

    protected = ("html", "body")
    # Classify every element before mutating the tree.
    containers = [
        tag for tag in soup.find_all(True)
        if tag.name not in protected and _is_boilerplate_container(tag)
    ]
    _decompose_outermost(containers)
|
||||||
|
|
||||||
|
|
||||||
|
def _reduce_boilerplate_text(text: str) -> str:
    """Delete residual boilerplate phrases from already-extracted text.

    Each pattern in ``BOILERPLATE_TEXT_PATTERNS`` is applied in list order to
    the output of the previous one; the final text is stripped of leading and
    trailing whitespace.
    """
    cleaned = text
    for boilerplate_re in BOILERPLATE_TEXT_PATTERNS:
        cleaned = boilerplate_re.sub("", cleaned)
    return cleaned.strip()
|
||||||
|
|
||||||
|
|
||||||
|
def _text_density(tag: Tag) -> float:
    """Return the share of a tag's serialized markup that is visible text.

    Computed as the length of the tag's stripped text divided by the length
    of its full markup; 0.0 when the markup is empty. Denser blocks carry
    more text relative to HTML structure, which points to content rather
    than boilerplate.

    Requirements: 4.2
    """
    total_chars = len(str(tag))
    if not total_chars:
        return 0.0
    return len(tag.get_text(strip=True)) / total_chars
|
||||||
|
|
||||||
|
|
||||||
|
def _link_density(tag: Tag) -> float:
    """Return the share of the tag's text that sits inside ``<a>`` elements.

    Values near 1 indicate navigation-like blocks, values near 0 indicate
    prose. A tag with no text at all is reported as 1.0.

    Requirements: 4.2
    """
    total_chars = len(tag.get_text(strip=True))
    if total_chars == 0:
        return 1.0

    anchor_chars = 0
    for anchor in tag.find_all("a"):
        anchor_chars += len(anchor.get_text(strip=True))
    return anchor_chars / total_chars
|
||||||
|
|
||||||
|
|
||||||
|
def _block_score(tag: Tag) -> float:
    """Score a block element as a body candidate using text density heuristics.

    Combines text density, link density, paragraph count, and word count
    into a composite score. Higher is more likely to be the article body.
    Blocks with fewer than ``_MIN_BLOCK_WORDS`` words score 0.0.

    Requirements: 4.2
    """
    # Join text nodes with a space: with the default empty separator,
    # "<p>one two</p><p>three</p>" becomes "one twothree" and every element
    # boundary swallows a word, undercounting markup-rich blocks.
    text = tag.get_text(separator=" ", strip=True)
    word_count = len(text.split())
    if word_count < _MIN_BLOCK_WORDS:
        return 0.0

    td = _text_density(tag)
    ld = _link_density(tag)
    p_count = len(tag.find_all("p"))

    # Base score from text density, penalized by link density
    score = td * (1.0 - ld)

    # Bonus for paragraph-rich blocks (structured article content), capped at 10
    if p_count >= 2:
        score += 0.1 * min(p_count, 10)

    # Bonus for word count (log-scaled to avoid runaway scores)
    score += 0.05 * math.log(max(word_count, 1))

    return score
|
||||||
|
|
||||||
|
|
||||||
|
def _find_article_body(soup: BeautifulSoup) -> Tag | None:
    """Find the most likely article body element.

    First tries each selector in ``ARTICLE_SELECTORS`` and accepts the first
    match holding at least ``_MIN_BLOCK_WORDS`` words. If none qualifies,
    scores every ``div``/``section``/``td`` with ``_block_score`` and returns
    the highest-scoring one (the earliest in document order on ties).
    Returns None when no candidate scores above zero.

    Requirements: 4.2
    """
    # Priority 1: semantic selectors
    for selector in ARTICLE_SELECTORS:
        match = soup.select_one(selector)
        if match is None:
            continue
        # Use a space separator so words in adjacent elements are not fused
        # together ("<p>a b</p><p>c</p>" must count as three words, not two).
        words = match.get_text(separator=" ", strip=True).split()
        if len(words) >= _MIN_BLOCK_WORDS:
            return match

    # Priority 2: text-density scoring on block-level containers.
    # Track the running best instead of sorting the whole candidate list;
    # the strict ">" keeps the first tag among equal scores.
    best_tag: Tag | None = None
    best_score = 0.0
    for tag in soup.find_all(["div", "section", "td"]):
        score = _block_score(tag)
        if score > best_score:
            best_tag, best_score = tag, score

    return best_tag
|
||||||
|
|
||||||
|
|
||||||
|
def _collapse_whitespace(text: str) -> str:
|
||||||
|
"""Collapse runs of blank lines into single separators."""
|
||||||
|
lines = [line.strip() for line in text.splitlines()]
|
||||||
|
result: list[str] = []
|
||||||
|
prev_blank = False
|
||||||
|
for line in lines:
|
||||||
|
if not line:
|
||||||
|
if not prev_blank:
|
||||||
|
result.append("")
|
||||||
|
prev_blank = True
|
||||||
|
else:
|
||||||
|
result.append(line)
|
||||||
|
prev_blank = False
|
||||||
|
return "\n".join(result).strip()
|
||||||
|
|
||||||
|
|
||||||
|
def _remove_short_orphan_lines(text: str, min_words: int = 3) -> str:
|
||||||
|
"""Remove very short orphan lines that are likely UI fragments or captions.
|
||||||
|
|
||||||
|
Lines shorter than min_words that don't end with sentence punctuation
|
||||||
|
are stripped. This catches leftover button labels, image captions,
|
||||||
|
and navigation fragments.
|
||||||
|
|
||||||
|
Requirements: 4.2
|
||||||
|
"""
|
||||||
|
lines = text.splitlines()
|
||||||
|
kept: list[str] = []
|
||||||
|
for line in lines:
|
||||||
|
stripped = line.strip()
|
||||||
|
words = stripped.split()
|
||||||
|
if len(words) < min_words and not stripped.endswith((".", "!", "?", ":")):
|
||||||
|
continue
|
||||||
|
kept.append(line)
|
||||||
|
return "\n".join(kept)
|
||||||
|
|
||||||
|
|
||||||
|
def _detect_repeated_blocks(text: str, min_len: int = 40) -> str:
|
||||||
|
"""Remove repeated text blocks that appear more than once.
|
||||||
|
|
||||||
|
Template text (disclaimers, repeated footers) often appears verbatim
|
||||||
|
in multiple places. This strips exact duplicate blocks.
|
||||||
|
|
||||||
|
Requirements: 4.2
|
||||||
|
"""
|
||||||
|
lines = text.splitlines()
|
||||||
|
seen: dict[str, int] = {}
|
||||||
|
for line in lines:
|
||||||
|
stripped = line.strip()
|
||||||
|
if len(stripped) >= min_len:
|
||||||
|
seen[stripped] = seen.get(stripped, 0) + 1
|
||||||
|
|
||||||
|
duplicates = {k for k, v in seen.items() if v > 1}
|
||||||
|
if not duplicates:
|
||||||
|
return text
|
||||||
|
|
||||||
|
kept: list[str] = []
|
||||||
|
emitted: set[str] = set()
|
||||||
|
for line in lines:
|
||||||
|
stripped = line.strip()
|
||||||
|
if stripped in duplicates:
|
||||||
|
if stripped not in emitted:
|
||||||
|
kept.append(line)
|
||||||
|
emitted.add(stripped)
|
||||||
|
# Skip subsequent duplicates
|
||||||
|
else:
|
||||||
|
kept.append(line)
|
||||||
|
return "\n".join(kept)
|
||||||
|
|
||||||
|
|
||||||
|
def extract_body_text(html: str) -> str:
    """Return the cleaned main body text of an HTML document.

    Steps:
    1. Parse and strip non-content tags and boilerplate containers
    2. Pick the article body (semantic selector or density scoring),
       falling back to ``<body>`` or the whole document
    3. Extract newline-separated text
    4. Run the text cleaners in order: regex boilerplate removal,
       short-orphan-line removal, repeated-block removal, whitespace collapse

    Requirements: 4.1, 4.2
    """
    soup = BeautifulSoup(html, "html.parser")
    _strip_boilerplate_tags(soup)

    container = _find_article_body(soup)
    if not container:
        container = soup.find("body") or soup
    text = container.get_text(separator="\n", strip=True)

    # Multi-stage text cleaning; order matters.
    for stage in (
        _reduce_boilerplate_text,
        _remove_short_orphan_lines,
        _detect_repeated_blocks,
        _collapse_whitespace,
    ):
        text = stage(text)
    return text
|
||||||
|
|
||||||
|
|
||||||
|
def extract_metadata(html: str, url: str = "") -> dict[str, str | None]:
    """Collect document metadata from the HTML head.

    Returned keys, in order: ``title``, ``author``, ``publisher``,
    ``published_at``, ``canonical_url``, ``language``, ``description``,
    ``tags`` (a comma-separated string). ``published_at`` and
    ``canonical_url`` may be None; all other values are strings.

    Requirements: 4.1
    """
    soup = BeautifulSoup(html, "html.parser")

    def has_attr(node, attr: str = "content") -> bool:
        # True when the element exists and carries a non-empty attribute.
        return node is not None and bool(node.get(attr))

    def cleaned(node, non_str: str | None = "") -> str | None:
        # Stripped ``content`` value, or ``non_str`` when it is not a string.
        value = node["content"]
        return value.strip() if isinstance(value, str) else non_str

    # Title: og:title > <title>
    og_title = soup.find("meta", property="og:title")
    if has_attr(og_title):
        title = cleaned(og_title)
    elif soup.title and soup.title.string:
        title = soup.title.string.strip()
    else:
        title = ""

    # Author
    author_tag = soup.find("meta", attrs={"name": "author"})
    author = cleaned(author_tag) if has_attr(author_tag) else ""

    # Publisher: og:site_name > hostname of the supplied URL
    site_name = soup.find("meta", property="og:site_name")
    if has_attr(site_name):
        publisher = cleaned(site_name)
    else:
        publisher = (urlparse(url).hostname or "") if url else ""

    # Published date: article:published_time > JSON-LD datePublished
    pub_time = soup.find("meta", property="article:published_time")
    if has_attr(pub_time):
        published_at = cleaned(pub_time, None)
    else:
        published_at = _extract_jsonld_date(soup)

    # Canonical URL: <link rel=canonical> > og:url > supplied URL
    canonical = soup.find("link", rel="canonical")
    if has_attr(canonical, "href"):
        canonical_url = str(canonical["href"])
    else:
        og_url = soup.find("meta", property="og:url")
        canonical_url = str(og_url["content"]) if has_attr(og_url) else (url or None)

    # Language: first five characters of <html lang>, default "en"
    html_tag = soup.find("html")
    language = str(html_tag["lang"])[:5] if has_attr(html_tag, "lang") else "en"

    # Description: og:description > meta description
    desc = soup.find("meta", property="og:description") or soup.find(
        "meta", attrs={"name": "description"}
    )
    description = cleaned(desc) if has_attr(desc) else ""

    # Tags / keywords, kept as the raw comma-separated string
    keywords = soup.find("meta", attrs={"name": "keywords"})
    tags = cleaned(keywords) if has_attr(keywords) else ""

    return {
        "title": title,
        "author": author,
        "publisher": publisher,
        "published_at": published_at,
        "canonical_url": canonical_url,
        "language": language,
        "description": description,
        "tags": tags,
    }
|
||||||
|
|
||||||
|
|
||||||
|
def _extract_jsonld_date(soup: BeautifulSoup) -> str | None:
    """Try to extract ``datePublished`` from JSON-LD script tags.

    Each ``<script type="application/ld+json">`` is parsed in document order.
    Within one script, top-level objects (a single object or a list of them)
    are checked first, then objects nested under a top-level ``@graph``
    array, which is where many publishers place the article node.

    Scripts that fail to parse are skipped, and null/empty ``datePublished``
    values are ignored rather than returned as the string "None".

    Returns:
        The first non-empty ``datePublished`` value as a string, or None.
    """
    for script in soup.find_all("script", type="application/ld+json"):
        payload = script.string
        # Cheap substring check avoids parsing scripts that cannot match.
        if not payload or "datePublished" not in payload:
            continue
        try:
            ld = json.loads(payload)
        except (json.JSONDecodeError, TypeError):
            # Best effort: malformed JSON-LD is common on scraped pages.
            continue

        top_level = ld if isinstance(ld, list) else [ld]
        graph_nodes = [
            node
            for item in top_level
            if isinstance(item, dict) and isinstance(item.get("@graph"), list)
            for node in item["@graph"]
        ]
        # Top-level nodes keep priority over @graph members.
        for node in (*top_level, *graph_nodes):
            if isinstance(node, dict) and node.get("datePublished"):
                return str(node["datePublished"])
    return None
|
||||||
|
|
||||||
|
|
||||||
|
def extract_outbound_links(html: str, base_url: str = "") -> list[str]:
    """Extract outbound links from HTML, filtering out self-references.

    Only absolute ``http``/``https`` links whose host differs from the host
    of ``base_url`` are returned, de-duplicated in first-seen order.

    Malformed hrefs that ``urlparse`` rejects with ValueError (for example an
    unbalanced IPv6 bracket) are skipped, so one bad link in scraped markup
    cannot abort the whole parse. An unparseable ``base_url`` is treated as
    having no host.

    Requirements: 4.1
    """
    soup = BeautifulSoup(html, "html.parser")
    try:
        base_host = (urlparse(base_url).hostname or "") if base_url else ""
    except ValueError:
        base_host = ""

    # dict preserves insertion order, giving ordered de-duplication for free.
    unique: dict[str, None] = {}
    for a_tag in soup.find_all("a", href=True):
        href = str(a_tag["href"]).strip()
        if not href or href.startswith(("#", "javascript:")):
            continue
        try:
            parsed = urlparse(href)
            host = parsed.hostname
        except ValueError:
            continue
        # Only include absolute URLs that point to different hosts
        if parsed.scheme in ("http", "https") and host and host != base_host:
            unique.setdefault(href, None)

    return list(unique)
|
||||||
|
|
||||||
|
|
||||||
|
def _count_sentences(text: str) -> int:
|
||||||
|
"""Count approximate sentence count by terminal punctuation."""
|
||||||
|
return len(re.findall(r"[.!?]+(?:\s|$)", text))
|
||||||
|
|
||||||
|
|
||||||
|
def _count_paragraphs(text: str) -> int:
|
||||||
|
"""Count non-empty paragraph blocks separated by blank lines."""
|
||||||
|
blocks = re.split(r"\n\s*\n", text.strip())
|
||||||
|
return sum(1 for b in blocks if len(b.strip().split()) >= 5)
|
||||||
|
|
||||||
|
|
||||||
|
def score_parse_quality(
    text: str,
    *,
    body_found: bool = True,
    has_title: bool = False,
    has_author: bool = False,
    has_publisher: bool = False,
    has_published_at: bool = False,
) -> tuple[float, str, QualitySignals, list[str]]:
    """Rate how trustworthy a parse result is.

    Six signals in [0, 1] are computed and combined with fixed weights:

    - word count (0.30): length of the extracted text
    - diversity (0.15): unique/total words, case-insensitive
    - sentences (0.15): number of sentence terminators
    - paragraphs (0.10): number of blank-line-separated blocks
    - body found (0.20): whether an article body element was located
    - metadata (0.10): fraction of title/author/publisher/date present

    The composite is capped at 0.95 and rounded to two decimals. Below 0.35
    is "low", below 0.65 is "medium", otherwise "high".

    Returns (score, confidence_label, signals, warnings).

    Requirements: 4.3
    """
    warnings: list[str] = []
    tokens = text.split()
    n_words = len(tokens)
    # Content warnings other than the length ones are suppressed for tiny texts.
    is_substantial = n_words >= 20

    # --- word count signal ---
    if n_words >= 300:
        wc_sig = 1.0
    elif n_words >= 150:
        wc_sig = 0.8
    elif n_words >= 50:
        wc_sig = 0.6
    elif is_substantial:
        wc_sig = 0.3
        warnings.append("short_text")
    else:
        wc_sig = 0.1
        warnings.append("very_short_text")

    # --- diversity signal ---
    diversity = len({token.lower() for token in tokens}) / n_words if n_words else 0.0
    if diversity >= 0.4:
        div_sig = 1.0
    elif diversity >= 0.2:
        div_sig = 0.5
    else:
        div_sig = 0.2
        if is_substantial:
            warnings.append("low_vocabulary_diversity")

    # --- sentence signal ---
    sentence_count = _count_sentences(text)
    if sentence_count >= 3:
        sent_sig = 1.0
    elif sentence_count >= 1:
        sent_sig = 0.5
    else:
        sent_sig = 0.1
        if is_substantial:
            warnings.append("no_sentence_structure")

    # --- paragraph signal ---
    para_count = _count_paragraphs(text)
    para_sig = (0.2, 0.5)[para_count] if para_count < 2 else 1.0

    # --- body found signal ---
    if body_found:
        body_sig = 1.0
    else:
        body_sig = 0.3
        warnings.append("no_article_body_found")

    # --- metadata signal ---
    meta_sig = sum((has_title, has_author, has_publisher, has_published_at)) / 4.0

    signals = QualitySignals(
        word_count_signal=wc_sig,
        diversity_signal=div_sig,
        sentence_signal=sent_sig,
        paragraph_signal=para_sig,
        body_found_signal=body_sig,
        metadata_signal=meta_sig,
    )

    # Weighted composite score (terms kept in a fixed order)
    weighted = (
        0.30 * wc_sig
        + 0.15 * div_sig
        + 0.15 * sent_sig
        + 0.10 * para_sig
        + 0.20 * body_sig
        + 0.10 * meta_sig
    )
    score = round(min(weighted, 0.95), 2)

    # Confidence label
    if score >= 0.65:
        confidence = "high"
    elif score >= 0.35:
        confidence = "medium"
    else:
        confidence = "low"

    return score, confidence, signals, warnings
|
||||||
|
|
||||||
|
|
||||||
|
def score_quality(text: str) -> tuple[float, str]:
    """Return ``(score, confidence_label)`` for extracted text.

    Backward-compatible wrapper: delegates to ``score_parse_quality`` with
    its default keyword arguments and discards the signals and warnings.

    Requirements: 4.3
    """
    result = score_parse_quality(text)
    return result[0], result[1]
|
||||||
|
|
||||||
|
|
||||||
|
def infer_document_type(html: str, url: str = "") -> str:
    """Classify a document from keywords in its URL.

    Rules are tried in order (filing, transcript, press_release) against the
    lower-cased URL; the first keyword hit wins and "article" is the default.
    ``html`` is accepted but not inspected; it is reserved for future
    content-based inference.

    Requirements: 4.1
    """
    rules = (
        ("filing", ("sec.gov", "edgar", "filing", "10-k", "10-q", "8-k")),
        ("transcript", ("transcript", "earnings-call", "earnings_call")),
        ("press_release", ("press-release", "press_release", "newsroom")),
    )
    haystack = url.lower()
    for doc_type, keywords in rules:
        for keyword in keywords:
            if keyword in haystack:
                return doc_type
    return "article"
|
||||||
|
|
||||||
|
|
||||||
|
def parse_html(html: str, url: str = "", aliases: list[dict[str, str]] | None = None) -> ParsedDocument:
    """Run the complete HTML parsing pipeline and return a ``ParsedDocument``.

    Covers body extraction and cleaning, metadata extraction, outbound link
    extraction, document type inference, quality scoring, and, when
    ``aliases`` is given and text was extracted, company mention detection
    over the title plus body.

    Requirements: 1.3, 4.1, 4.2, 4.3
    """
    soup = BeautifulSoup(html, "html.parser")
    _strip_boilerplate_tags(soup)

    article = _find_article_body(soup)
    body_found = article is not None
    source_node = article if article else (soup.find("body") or soup)
    text = source_node.get_text(separator="\n", strip=True)

    # Multi-stage text cleaning; order matters.
    for clean in (
        _reduce_boilerplate_text,
        _remove_short_orphan_lines,
        _detect_repeated_blocks,
        _collapse_whitespace,
    ):
        text = clean(text)

    metadata = extract_metadata(html, url)
    outbound_links = extract_outbound_links(html, url)
    doc_type = infer_document_type(html, url)
    word_count = len(text.split())

    tags_raw = metadata.get("tags", "") or ""
    tags = [piece for piece in (part.strip() for part in tags_raw.split(",")) if piece]

    # Rich quality scoring with all available signals
    quality, confidence, signals, warnings = score_parse_quality(
        text,
        body_found=body_found,
        has_title=bool(metadata.get("title")),
        has_author=bool(metadata.get("author")),
        has_publisher=bool(metadata.get("publisher")),
        has_published_at=bool(metadata.get("published_at")),
    )

    # Company mention detection over title + body
    mentioned: list[CompanyMention] = []
    if aliases and text:
        search_text = f"{metadata.get('title', '')} {text}"
        mentioned = [
            CompanyMention(
                company_id=str(hit["company_id"]),
                ticker=str(hit["ticker"]),
                mention_type=str(hit["mention_type"]),
                confidence=float(hit["confidence"]),
                match_count=int(hit["match_count"]),
            )
            for hit in detect_company_mentions(search_text, aliases)
        ]

    return ParsedDocument(
        body_text=text,
        title=metadata.get("title", "") or "",
        author=metadata.get("author", "") or "",
        publisher=metadata.get("publisher", "") or "",
        published_at=metadata.get("published_at"),
        canonical_url=metadata.get("canonical_url"),
        language=metadata.get("language", "en") or "en",
        description=metadata.get("description", "") or "",
        document_type=doc_type,
        outbound_links=outbound_links,
        tags=tags,
        mentioned_companies=mentioned,
        quality_score=quality,
        confidence=confidence,
        word_count=word_count,
        quality_signals=signals,
        low_quality_flag=confidence == "low",
        quality_warnings=warnings,
    )
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class AliasEntry:
    """A company alias used for mention detection.

    Typed form of one raw alias dict; built by ``_build_alias_entries`` and
    consumed by ``detect_company_mentions``.
    """
    # Identifier of the company this alias belongs to; mentions are grouped by it.
    company_id: str
    # The text searched for in the document (ticker symbol, name, brand, ...).
    alias: str
    # Category of the alias; used as the key into _CONFIDENCE_BY_TYPE.
    alias_type: str = "alias"
    # Ticker reported alongside a match; empty when the raw dict had none.
    ticker: str = ""
|
||||||
|
|
||||||
|
|
||||||
|
# Base confidence assigned to a mention, keyed by alias_type.
# Tickers are most precise, brands least. detect_company_mentions falls back
# to 0.7 for any alias_type not listed here.
_CONFIDENCE_BY_TYPE: dict[str, float] = {
    "ticker": 0.9,
    "legal_name": 0.85,
    "alias": 0.7,
    "brand": 0.6,
}
|
||||||
|
|
||||||
|
|
||||||
|
def _build_alias_entries(aliases: list[dict[str, str]]) -> list[AliasEntry]:
    """Turn raw alias dicts into ``AliasEntry`` objects.

    Dicts without a non-empty ``alias`` value are skipped. Missing
    ``company_id``/``ticker`` default to "" and ``alias_type`` to "alias".
    """
    return [
        AliasEntry(
            company_id=raw.get("company_id", ""),
            alias=raw["alias"],
            alias_type=raw.get("alias_type", "alias"),
            ticker=raw.get("ticker", ""),
        )
        for raw in aliases
        if raw.get("alias", "")
    ]
|
||||||
|
|
||||||
|
|
||||||
|
def _count_matches(text: str, pattern: re.Pattern[str]) -> int:
|
||||||
|
"""Count non-overlapping matches of pattern in text."""
|
||||||
|
return len(pattern.findall(text))
|
||||||
|
|
||||||
|
|
||||||
|
def detect_company_mentions(
    text: str,
    aliases: list[dict[str, str]],
) -> list[dict[str, str | float | int]]:
    """Find which companies are mentioned in ``text``.

    How an alias is matched depends on its length:
    - 1-2 chars: case-sensitive, whole word (so "A" does not match "a")
    - 3-4 chars: case-insensitive, whole word (typical tickers)
    - 5+ chars: case-insensitive substring (company names, brands)

    Confidence comes from the alias type (ticker > legal_name > alias >
    brand; unknown types get 0.7). One result is produced per company: the
    ticker, mention type and confidence of its highest-confidence matching
    alias (the earliest on ties), with match counts summed over all its
    aliases.

    Requirements: 1.3, 4.1
    """
    if not text:
        return []

    upper_text = text.upper()

    # company_id -> (confidence, ticker, mention_type, total_matches)
    tally: dict[str, tuple[float, str, str, int]] = {}

    for entry in _build_alias_entries(aliases):
        name = entry.alias
        if len(name) > 4:
            # Longer names: case-insensitive substring
            hits = upper_text.count(name.upper())
        elif len(name) > 2:
            # Standard ticker length: case-insensitive word boundary
            word_re = re.compile(r"\b" + re.escape(name.upper()) + r"\b")
            hits = _count_matches(upper_text, word_re)
        else:
            # Very short: case-sensitive word boundary
            word_re = re.compile(r"\b" + re.escape(name) + r"\b")
            hits = _count_matches(text, word_re)

        if not hits:
            continue

        confidence = _CONFIDENCE_BY_TYPE.get(entry.alias_type, 0.7)
        company = entry.company_id
        prior = tally.get(company)
        if prior is None:
            tally[company] = (confidence, entry.ticker, entry.alias_type, hits)
        elif confidence > prior[0]:
            # Stronger alias replaces the descriptive fields; counts accumulate.
            tally[company] = (confidence, entry.ticker, entry.alias_type, prior[3] + hits)
        else:
            tally[company] = (prior[0], prior[1], prior[2], prior[3] + hits)

    return [
        {
            "company_id": company,
            "ticker": ticker,
            "mention_type": mention_type,
            "confidence": confidence,
            "match_count": total,
        }
        for company, (confidence, ticker, mention_type, total) in tally.items()
    ]
|
||||||
+108
-107
@@ -1,84 +1,41 @@
|
|||||||
"""Parser worker - HTML-to-text, boilerplate reduction, quality scoring."""
|
"""Parser worker - HTML-to-text, boilerplate reduction, quality scoring.
|
||||||
|
|
||||||
|
Uses BeautifulSoup-based parsing pipeline for structured HTML extraction,
|
||||||
|
metadata extraction, outbound link extraction, and quality scoring.
|
||||||
|
Persists normalized text and structured parser output to MinIO,
|
||||||
|
and updates document metadata in PostgreSQL.
|
||||||
|
|
||||||
|
Requirements: 4.1, 4.2, 4.3, 9.1, 9.2
|
||||||
|
"""
|
||||||
import asyncio
|
import asyncio
|
||||||
import io
|
|
||||||
import json
|
import json
|
||||||
import logging
|
import logging
|
||||||
import re
|
import time
|
||||||
from datetime import datetime
|
from datetime import datetime, timezone
|
||||||
from typing import List, Optional, Tuple
|
from typing import Any, Optional
|
||||||
|
|
||||||
import asyncpg
|
import asyncpg
|
||||||
import httpx
|
import httpx
|
||||||
import redis.asyncio as aioredis
|
import redis.asyncio as aioredis
|
||||||
from minio import Minio
|
from minio import Minio
|
||||||
|
|
||||||
|
from services.parser.html_parser import ParsedDocument, detect_company_mentions, parse_html
|
||||||
from services.shared.config import load_config
|
from services.shared.config import load_config
|
||||||
from services.shared.db import get_minio, get_pg_pool, get_redis
|
from services.shared.db import get_minio, get_pg_pool, get_redis
|
||||||
|
from services.shared.logging import Span, extract_trace_context, inject_trace_context, new_trace_id, set_trace_context, setup_logging
|
||||||
|
from services.shared.metrics import (
|
||||||
|
ACTIVE_JOBS,
|
||||||
|
PARSE_DURATION,
|
||||||
|
PARSE_JOBS_TOTAL,
|
||||||
|
PARSE_LOW_QUALITY_TOTAL,
|
||||||
|
PARSE_QUALITY_SCORE,
|
||||||
|
)
|
||||||
|
from services.shared.metadata import update_document_parse_results
|
||||||
from services.shared.redis_keys import QUEUE_EXTRACTION, QUEUE_PARSING, queue_key
|
from services.shared.redis_keys import QUEUE_EXTRACTION, QUEUE_PARSING, queue_key
|
||||||
|
from services.shared.storage import upload_normalized_text, upload_parser_output
|
||||||
|
|
||||||
logging.basicConfig(level=logging.INFO)
|
|
||||||
logger = logging.getLogger("parser_worker")
|
logger = logging.getLogger("parser_worker")
|
||||||
|
|
||||||
# Simple boilerplate patterns to strip.
# Most entries delete from the trigger phrase to the end of its line.
BOILERPLATE_PATTERNS = [
    re.compile(r"(?i)subscribe to our newsletter.*?(?:\n|$)"),
    re.compile(r"(?i)click here to read more.*?(?:\n|$)"),
    re.compile(r"(?i)advertisement\s*\n"),
    re.compile(r"(?i)copyright ©.*?(?:\n|$)"),
    re.compile(r"(?i)all rights reserved.*?(?:\n|$)"),
    re.compile(r"(?i)terms of (use|service).*?(?:\n|$)"),
    re.compile(r"(?i)privacy policy.*?(?:\n|$)"),
    # Bracketed ad markers such as "[Ad]" or "[Sponsored ad]". "ad"/"ads" must
    # be a whole word and the match may not cross another bracket or a line
    # break; the previous r"\[.*?ad.*?\]" also removed "[Read more]",
    # "[Canada]", and any text lying between two unrelated brackets.
    re.compile(r"\s*\[[^\[\]\n]*\bads?\b[^\[\]\n]*\]\s*", re.IGNORECASE),
]
|
|
||||||
|
|
||||||
|
|
||||||
def strip_html_tags(html: str) -> str:
    """Basic HTML tag removal.

    Drops ``<script>``/``<style>`` elements with their contents, replaces all
    remaining tags with spaces, decodes HTML character references, and
    collapses whitespace runs to single spaces.

    Character references are decoded with ``html.unescape`` so that named,
    decimal and hexadecimal forms are all handled in one pass. Decoding once
    also prevents an escaped ampersand from being decoded a second time, and
    numeric references (for example typographic apostrophes) are kept as
    characters instead of being deleted.
    """
    # Imported locally by name: the ``html`` parameter shadows the module.
    from html import unescape

    text = re.sub(r"<script[^>]*>.*?</script>", "", html, flags=re.DOTALL | re.IGNORECASE)
    text = re.sub(r"<style[^>]*>.*?</style>", "", text, flags=re.DOTALL | re.IGNORECASE)
    text = re.sub(r"<[^>]+>", " ", text)
    # Decode after tag removal so escaped angle brackets survive as text.
    text = unescape(text)
    # \s also matches the no-break space produced by decoding.
    return re.sub(r"\s+", " ", text).strip()
|
|
||||||
|
|
||||||
|
|
||||||
def reduce_boilerplate(text: str) -> str:
    """Delete every ``BOILERPLATE_PATTERNS`` match from ``text`` and strip the result."""
    cleaned = text
    for boilerplate_re in BOILERPLATE_PATTERNS:
        cleaned = boilerplate_re.sub("", cleaned)
    return cleaned.strip()
|
|
||||||
|
|
||||||
|
|
||||||
def score_quality(text: str) -> Tuple[float, str]:
    """Score parse quality from word count alone.

    Returns ``(score, confidence_label)``: under 20 words is (0.1, "low"),
    under 50 is (0.3, "low"), under 150 is (0.6, "medium"), and anything
    longer is (0.85, "high").
    """
    n_words = len(text.split())
    # (exclusive upper bound on word count, result) in ascending order
    tiers = (
        (20, (0.1, "low")),
        (50, (0.3, "low")),
        (150, (0.6, "medium")),
    )
    for limit, verdict in tiers:
        if n_words < limit:
            return verdict
    return 0.85, "high"
|
|
||||||
|
|
||||||
|
|
||||||
def detect_company_mentions(text: str, aliases: List[dict]) -> List[dict]:
    """Detect company mentions by case-insensitive substring search.

    Every alias dict whose ``alias`` occurs in ``text`` yields one mention
    with a fixed confidence of 0.7. Results follow the order of ``aliases``
    and are not de-duplicated per company.
    """
    haystack = text.upper()
    return [
        {
            "company_id": info["company_id"],
            "ticker": info.get("ticker", ""),
            "mention_type": info.get("alias_type", "alias"),
            "confidence": 0.7,
        }
        for info in aliases
        if info["alias"].upper() in haystack
    ]
|
|
||||||
|
|
||||||
|
|
||||||
async def fetch_html(url: str) -> Optional[str]:
|
async def fetch_html(url: str) -> Optional[str]:
|
||||||
"""Fetch article HTML for scraping."""
|
"""Fetch article HTML for scraping."""
|
||||||
@@ -94,48 +51,65 @@ async def fetch_html(url: str) -> Optional[str]:
|
|||||||
return None
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
def build_parser_output_json(parsed: ParsedDocument, mentions: list[dict[str, Any]]) -> dict[str, Any]:
    """Assemble the JSON-ready record of a parse result.

    Copies the metadata, quality and link fields of ``parsed`` verbatim,
    adds ``quality_signals`` as a dict via ``as_dict()``, and stores
    ``mentions`` under ``mentioned_companies``. The record is intended for
    audit and downstream use.
    """
    # Attributes copied one-to-one, in output key order.
    plain_fields = (
        "title",
        "author",
        "publisher",
        "published_at",
        "canonical_url",
        "language",
        "description",
        "document_type",
        "word_count",
        "outbound_links",
        "tags",
        "quality_score",
        "confidence",
        "low_quality_flag",
        "quality_warnings",
    )
    payload: dict[str, Any] = {name: getattr(parsed, name) for name in plain_fields}
    payload["quality_signals"] = parsed.quality_signals.as_dict()
    payload["mentioned_companies"] = mentions
    return payload
|
||||||
|
|
||||||
|
|
||||||
async def process_job(
|
async def process_job(
|
||||||
job: dict,
|
job: dict[str, Any],
|
||||||
pool: asyncpg.Pool,
|
pool: asyncpg.Pool,
|
||||||
rds: aioredis.Redis,
|
rds: aioredis.Redis,
|
||||||
minio_client: Minio,
|
minio_client: Minio,
|
||||||
):
|
) -> None:
|
||||||
doc_id = job["document_id"]
|
doc_id = job["document_id"]
|
||||||
ticker = job["ticker"]
|
ticker = job["ticker"]
|
||||||
url = job.get("url", "")
|
url = job.get("url", "")
|
||||||
|
now = datetime.now(timezone.utc)
|
||||||
|
_parse_start = time.monotonic()
|
||||||
|
|
||||||
|
set_trace_context(trace_id=job.get("_trace_id") or new_trace_id())
|
||||||
|
|
||||||
# Fetch HTML if we have a URL
|
# Fetch HTML if we have a URL
|
||||||
html = await fetch_html(url) if url else None
|
html = await fetch_html(url) if url else None
|
||||||
|
|
||||||
if html:
|
if html:
|
||||||
# Store raw HTML
|
# Parse using BeautifulSoup pipeline
|
||||||
html_bytes = html.encode("utf-8")
|
parsed = parse_html(html, url)
|
||||||
now = datetime.utcnow()
|
|
||||||
html_path = f"scrape/{ticker}/{now.year}/{now.month:02d}/{now.day:02d}/{doc_id}/raw.html"
|
|
||||||
minio_client.put_object(
|
|
||||||
"stonks-raw-news", html_path, io.BytesIO(html_bytes), len(html_bytes),
|
|
||||||
content_type="text/html",
|
|
||||||
)
|
|
||||||
|
|
||||||
# Parse
|
|
||||||
text = strip_html_tags(html)
|
|
||||||
text = reduce_boilerplate(text)
|
|
||||||
else:
|
else:
|
||||||
text = ""
|
parsed = ParsedDocument()
|
||||||
|
|
||||||
quality_score, confidence = score_quality(text)
|
text = parsed.body_text
|
||||||
|
|
||||||
# Store normalized text
|
# Upload normalized text to MinIO
|
||||||
|
norm_ref: str | None = None
|
||||||
if text:
|
if text:
|
||||||
text_bytes = text.encode("utf-8")
|
norm_ref = upload_normalized_text(
|
||||||
now = datetime.utcnow()
|
minio_client, ticker, doc_id,
|
||||||
norm_path = f"parsed/{ticker}/{now.year}/{now.month:02d}/{now.day:02d}/{doc_id}/normalized.txt"
|
text.encode("utf-8"), timestamp=now,
|
||||||
minio_client.put_object(
|
|
||||||
"stonks-normalized", norm_path, io.BytesIO(text_bytes), len(text_bytes),
|
|
||||||
content_type="text/plain",
|
|
||||||
)
|
)
|
||||||
else:
|
|
||||||
norm_path = None
|
|
||||||
|
|
||||||
# Detect company mentions
|
# Detect company mentions
|
||||||
aliases = await pool.fetch(
|
aliases = await pool.fetch(
|
||||||
@@ -150,14 +124,24 @@ async def process_job(
|
|||||||
)
|
)
|
||||||
mentions = detect_company_mentions(text, [dict(a) for a in aliases]) if text else []
|
mentions = detect_company_mentions(text, [dict(a) for a in aliases]) if text else []
|
||||||
|
|
||||||
# Update document
|
# Build and upload structured parser output JSON
|
||||||
status = "parsed" if confidence != "low" else "low_quality"
|
output_json = build_parser_output_json(parsed, mentions)
|
||||||
await pool.execute(
|
output_bytes = json.dumps(output_json, default=str, indent=2).encode("utf-8")
|
||||||
"""UPDATE documents SET
|
parser_output_ref = upload_parser_output(
|
||||||
normalized_storage_ref=$2, parse_quality_score=$3, parse_confidence=$4, status=$5, updated_at=NOW()
|
minio_client, ticker, doc_id,
|
||||||
WHERE id=$1""",
|
output_bytes, timestamp=now,
|
||||||
doc_id, f"s3://stonks-normalized/{norm_path}" if norm_path else None,
|
)
|
||||||
quality_score, confidence, status,
|
|
||||||
|
# Update document in PostgreSQL
|
||||||
|
status = "parsed" if parsed.confidence != "low" else "low_quality"
|
||||||
|
await update_document_parse_results(
|
||||||
|
pool,
|
||||||
|
document_id=doc_id,
|
||||||
|
normalized_storage_ref=norm_ref,
|
||||||
|
parser_output_ref=parser_output_ref,
|
||||||
|
parse_quality_score=parsed.quality_score,
|
||||||
|
parse_confidence=parsed.confidence,
|
||||||
|
status=status,
|
||||||
)
|
)
|
||||||
|
|
||||||
# Insert company mentions
|
# Insert company mentions
|
||||||
@@ -169,19 +153,36 @@ async def process_job(
|
|||||||
)
|
)
|
||||||
|
|
||||||
# Only enqueue for extraction if quality is acceptable
|
# Only enqueue for extraction if quality is acceptable
|
||||||
if confidence != "low":
|
if parsed.confidence != "low":
|
||||||
await rds.rpush(queue_key(QUEUE_EXTRACTION), json.dumps({
|
await rds.rpush(queue_key(QUEUE_EXTRACTION), json.dumps(inject_trace_context({
|
||||||
"document_id": doc_id,
|
"document_id": doc_id,
|
||||||
"ticker": ticker,
|
"ticker": ticker,
|
||||||
"normalized_text": text[:8000], # Truncate for prompt
|
"normalized_text": text[:8000],
|
||||||
}))
|
})))
|
||||||
logger.info(f"Parsed doc {doc_id} for {ticker}: quality={quality_score:.2f}, confidence={confidence}")
|
PARSE_JOBS_TOTAL.labels(status="parsed").inc()
|
||||||
|
PARSE_QUALITY_SCORE.observe(parsed.quality_score)
|
||||||
|
PARSE_DURATION.observe(time.monotonic() - _parse_start)
|
||||||
|
logger.info(
|
||||||
|
"Parsed doc %s for %s: quality=%.2f, confidence=%s",
|
||||||
|
doc_id, ticker, parsed.quality_score, parsed.confidence,
|
||||||
|
extra={"ticker": ticker, "document_id": doc_id},
|
||||||
|
)
|
||||||
else:
|
else:
|
||||||
logger.warning(f"Low quality parse for doc {doc_id}, skipping extraction")
|
PARSE_JOBS_TOTAL.labels(status="low_quality").inc()
|
||||||
|
PARSE_LOW_QUALITY_TOTAL.inc()
|
||||||
|
PARSE_QUALITY_SCORE.observe(parsed.quality_score)
|
||||||
|
PARSE_DURATION.observe(time.monotonic() - _parse_start)
|
||||||
|
logger.warning(
|
||||||
|
"Low quality parse for doc %s, skipping extraction",
|
||||||
|
doc_id,
|
||||||
|
extra={"ticker": ticker, "document_id": doc_id},
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
async def main():
|
async def main() -> None:
|
||||||
config = load_config()
|
config = load_config()
|
||||||
|
setup_logging("parser_worker", level=config.log_level, json_output=config.json_logs)
|
||||||
|
|
||||||
pool = await get_pg_pool(config)
|
pool = await get_pg_pool(config)
|
||||||
rds = get_redis(config)
|
rds = get_redis(config)
|
||||||
minio_client = get_minio(config)
|
minio_client = get_minio(config)
|
||||||
@@ -197,7 +198,7 @@ async def main():
|
|||||||
try:
|
try:
|
||||||
await process_job(job, pool, rds, minio_client)
|
await process_job(job, pool, rds, minio_client)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(f"Parse error: {e}")
|
logger.error("Parse error: %s", e, exc_info=True)
|
||||||
else:
|
else:
|
||||||
await asyncio.sleep(2)
|
await asyncio.sleep(2)
|
||||||
finally:
|
finally:
|
||||||
|
|||||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user