phase 14-15: docker build validation and helm deployment

This commit is contained in:
Celes Renata
2026-04-11 11:59:45 -07:00
parent 7394d241c9
commit ce10afa034
179 changed files with 32559 additions and 576 deletions
+127 -73
View File
@@ -24,99 +24,153 @@
- [x] Add seed data support for an initial tracked watchlist - [x] Add seed data support for an initial tracked watchlist
## Phase 3 - External API Adapters ## Phase 3 - External API Adapters
- [ ] Implement scheduler for symbol and source polling windows - [x] Implement scheduler for symbol and source polling windows
- [ ] Implement market data API adapter interface - [x] Implement market data API adapter interface
- [ ] Implement first concrete market data provider adapter - [x] Implement first concrete market data provider adapter
- [ ] Implement news API adapter interface - [x] Implement news API adapter interface
- [ ] Implement first concrete news API provider adapter - [x] Implement first concrete news API provider adapter
- [ ] Implement filings or regulatory adapter interface - [x] Implement filings or regulatory adapter interface
- [ ] Implement first concrete filings provider adapter - [x] Implement first concrete filings provider adapter
- [ ] Implement broker API adapter interface for paper trading and order events - [x] Implement broker API adapter interface for paper trading and order events
- [ ] Implement rate-limit coordination, retries, and backoff across adapters - [x] Implement rate-limit coordination, retries, and backoff across adapters
## Phase 4 - Ingestion Pipeline ## Phase 4 - Ingestion Pipeline
- [ ] Implement web scraper worker for curated URLs and article pages - [x] Implement web scraper worker for curated URLs and article pages
- [ ] Implement canonical URL normalization and content hashing - [x] Implement canonical URL normalization and content hashing
- [ ] Implement raw artifact upload to MinIO - [x] Implement raw artifact upload to MinIO
- [ ] Implement metadata persistence in PostgreSQL for market payloads, documents, and broker events - [x] Implement metadata persistence in PostgreSQL for market payloads, documents, and broker events
- [ ] Implement retry and failure tracking for source retrieval - [x] Implement retry and failure tracking for source retrieval
- [ ] Implement dedupe logic across article and filing sources - [x] Implement dedupe logic across article and filing sources
## Phase 5 - Parsing and Normalization ## Phase 5 - Parsing and Normalization
- [ ] Implement HTML-to-text parsing pipeline - [x] Implement HTML-to-text parsing pipeline
- [ ] Implement boilerplate reduction and body extraction heuristics - [x] Implement boilerplate reduction and body extraction heuristics
- [ ] Implement parser quality scoring and confidence flags - [x] Implement parser quality scoring and confidence flags
- [ ] Implement company mention detection using ticker, alias, and name matching - [x] Implement company mention detection using ticker, alias, and name matching
- [ ] Persist normalized text and parser outputs to MinIO and PostgreSQL - [x] Persist normalized text and parser outputs to MinIO and PostgreSQL
## Phase 6 - Ollama Structured Extraction ## Phase 6 - Ollama Structured Extraction
- [ ] Build extraction prompt templates with anti-hallucination instructions - [x] Build extraction prompt templates with anti-hallucination instructions
- [ ] Build JSON schema definitions for document intelligence extraction - [x] Build JSON schema definitions for document intelligence extraction
- [ ] Implement Ollama client wrapper using structured output format - [x] Implement Ollama client wrapper using structured output format
- [ ] Implement schema validation and semantic validation layers - [x] Implement schema validation and semantic validation layers
- [ ] Persist prompts, model metadata, raw outputs, validation reports, and final intelligence objects - [x] Persist prompts, model metadata, raw outputs, validation reports, and final intelligence objects
- [ ] Add retry behavior for invalid or incomplete model responses - [x] Add retry behavior for invalid or incomplete model responses
- [ ] Add model performance metrics and dashboards - [x] Add model performance metrics and dashboards
## Phase 7 - Aggregation and Trend Engine ## Phase 7 - Aggregation and Trend Engine
- [ ] Implement recency decay and source credibility weighting - [x] Implement recency decay and source credibility weighting
- [ ] Integrate market context features into aggregation windows - [x] Integrate market context features into aggregation windows
- [ ] Implement company-level rolling window aggregation - [x] Implement company-level rolling window aggregation
- [ ] Implement contradiction detection and disagreement representation - [x] Implement contradiction detection and disagreement representation
- [ ] Implement sector and market rollups - [x] Implement sector and market rollups
- [ ] Implement evidence ranking for supporting and opposing documents - [x] Implement evidence ranking for supporting and opposing documents
- [ ] Persist trend windows and evidence mappings - [x] Persist trend windows and evidence mappings
## Phase 8 - Recommendation Engine ## Phase 8 - Recommendation Engine
- [ ] Design deterministic recommendation eligibility logic - [x] Design deterministic recommendation eligibility logic
- [ ] Implement recommendation generation from aggregated scores and evidence - [x] Implement recommendation generation from aggregated scores and evidence
- [ ] Add optional LLM wording layer for thesis generation only - [x] Add optional LLM wording layer for thesis generation only
- [ ] Persist recommendation objects and evidence citations - [x] Persist recommendation objects and evidence citations
- [ ] Add suppression logic for low-quality data or low confidence - [x] Add suppression logic for low-quality data or low confidence
- [ ] Publish prediction facts to analytical tables - [x] Publish prediction facts to analytical tables
## Phase 9 - Risk Engine and Trade Adapter ## Phase 9 - Risk Engine and Trade Adapter
- [ ] Implement portfolio and account risk configuration model - [x] Implement portfolio and account risk configuration model
- [ ] Implement hard blocks for max position size, sector exposure, daily loss limits, and news-shock lockouts - [x] Implement hard blocks for max position size, sector exposure, daily loss limits, and news-shock lockouts
- [ ] Implement paper trading adapter behavior and state sync - [x] Implement paper trading adapter behavior and state sync
- [ ] Integrate first broker API in sandbox mode - [x] Integrate first broker API in sandbox mode
- [ ] Implement idempotent order submission keys and duplicate prevention - [x] Implement idempotent order submission keys and duplicate prevention
- [ ] Implement full execution audit trail - [x] Implement full execution audit trail
- [ ] Add operator approval workflow for live trading mode - [x] Add operator approval workflow for live trading mode
- [ ] Publish order, fill, and position facts to analytical tables - [x] Publish order, fill, and position facts to analytical tables
## Phase 10 - Lakehouse and SQL Analytics ## Phase 10 - Lakehouse and SQL Analytics
- [ ] Define analytical fact tables for bars, documents, extractions, signals, orders, fills, positions, and PnL - [x] Define analytical fact tables for bars, documents, extractions, signals, orders, fills, positions, and PnL
- [ ] Implement Parquet writers for analytical datasets - [x] Implement Parquet writers for analytical datasets
- [ ] Implement Hive-compatible partition layout conventions on MinIO - [x] Implement Hive-compatible partition layout conventions on MinIO
- [ ] Implement Iceberg table creation and metadata management for analytical datasets - [x] Implement Iceberg table creation and metadata management for analytical datasets
- [ ] Implement lake publisher jobs from operational data into analytical fact tables - [x] Implement lake publisher jobs from operational data into analytical fact tables
- [ ] Configure Trino catalogs for Hive and/or Iceberg access to MinIO - [x] Configure Trino catalogs for Hive and/or Iceberg access to MinIO
- [ ] Add example SQL views for prediction-vs-outcome and paper-trade scorecards - [x] Add example SQL views for prediction-vs-outcome and paper-trade scorecards
## Phase 11 - Query API and Dashboard ## Phase 11 - Query API and Dashboard
- [ ] Build APIs for companies, document timelines, trend summaries, recommendations, and order history - [x] Build APIs for companies, document timelines, trend summaries, recommendations, and order history
- [ ] Build evidence drill-down view linking recommendations to source documents and raw artifacts - [x] Build evidence drill-down view linking recommendations to source documents and raw artifacts
- [ ] Build admin controls for source health, symbol configs, and trading mode - [x] Build admin controls for source health, symbol configs, and trading mode
- [ ] Build operational dashboard for ingestion throughput, model failures, and source coverage gaps - [x] Build operational dashboard for ingestion throughput, model failures, and source coverage gaps
- [ ] Build Superset starter dashboards for symbol overview, sentiment heatmap, PnL, and prediction accuracy - [x] Build Superset starter dashboards for symbol overview, sentiment heatmap, PnL, and prediction accuracy
## Phase 12 - Observability and Hardening ## Phase 12 - Observability and Hardening
- [ ] Add structured logs and distributed tracing across services - [x] Add structured logs and distributed tracing across services
- [ ] Add Prometheus metrics for ingestion, parsing, extraction, aggregation, lake publication, and trading - [x] Add Prometheus metrics for ingestion, parsing, extraction, aggregation, lake publication, and trading
- [ ] Add alerting for source failures, schema failure spikes, analytical lag, and broker issues - [x] Add alerting for source failures, schema failure spikes, analytical lag, and broker issues
- [ ] Add dead-letter queues and replay tooling - [x] Add dead-letter queues and replay tooling
- [ ] Add data retention and lifecycle controls for raw and derived artifacts - [x] Add data retention and lifecycle controls for raw and derived artifacts
- [ ] Add security review for secrets, network policies, trading isolation, and dashboard access control - [x] Add security review for secrets, network policies, trading isolation, and dashboard access control
## Phase 13 - Verification and Rollout ## Phase 13 - Verification and Rollout
- [ ] Create replay dataset from archived documents for deterministic extraction testing - [x] Create replay dataset from archived documents for deterministic extraction testing
- [ ] Create integration tests for the full ingest-to-recommendation flow - [x] Create integration tests for the full ingest-to-recommendation flow
- [ ] Create paper trading simulation scenarios - [x] Create paper trading simulation scenarios
- [ ] Validate fail-closed behavior for broker outages and ambiguous order states - [x] Validate fail-closed behavior for broker outages and ambiguous order states
- [ ] Validate lake publication and Trino query correctness over partitioned MinIO datasets - [x] Validate lake publication and Trino query correctness over partitioned MinIO datasets
- [ ] Run shadow mode before enabling any live execution - [x] ~~Run shadow mode~~ moved to Phase 15.6 (post-deployment)
- [ ] Prepare operator runbook and incident response procedures - [x] ~~Prepare operator runbook~~ moved to Phase 15.7 (post-deployment)
## Phase 14 - Local Docker Build Validation
- [x] 14. Build and validate all Docker containers locally
- [x] 14.1 Build all 11 service containers locally using the Makefile
- Run `make build` to build scheduler, symbol-registry, ingestion, parser, extractor, aggregation, recommendation, risk, broker-adapter, lake-publisher, and query-api images
- Fix any build failures (missing dependencies, import errors, syntax issues)
- _Requirements: N1, 12.1_
- [x] 14.2 Validate schema and logic consistency across all services
- Run the full test suite with `pytest tests/ -x --tb=short -q` to catch import errors, schema mismatches, and logic inconsistencies
- Verify all shared schemas in `services/shared/schemas.py` are consistent with what each service expects
- Verify config loader fields match the configmap and secrets definitions
- Fix any mismatches found between services, schemas, migrations, and K8s manifests
- _Requirements: 5.2, 5.3, 9.2, N2_
- [x] 14.3 Verify each container starts without immediate crash
- Run each built image with `docker run --rm` and a quick health check or `--help` flag to confirm the entrypoint resolves
- Fix any runtime import errors or missing module paths
- _Requirements: N1_
## Phase 15 - CI Validation, Helm Deployment, and Cluster Rollout
- [-] 15. Commit, push, validate CI, create Helm chart, and deploy to cluster
- [-] 15.1 Commit and push code to GitHub
- Configure git with SSH key for the private repo
- Commit all current changes with message `phase 14-15: docker build validation and helm deployment`
- Push to main branch
- _Requirements: N1_
- [ ] 15.2 Validate GitHub Actions workflow builds containers
- Monitor the GitHub Actions run to confirm lint-and-test and build-services jobs succeed
- Fix any CI failures and re-push if needed
- _Requirements: N1_
- [ ] 15.3 Create Helm chart for stonks-oracle deployment
- Create `infra/helm/stonks-oracle/Chart.yaml` with chart metadata
- Create `infra/helm/stonks-oracle/values.yaml` with configurable image tags, replica counts, resource limits, and environment references
- Create Helm templates for all deployments, services, configmap, secrets, ingress, and network policies from existing K8s manifests
- Add imagePullSecrets configuration for GHCR private registry access
- Add a template for a Kubernetes Secret of type `kubernetes.io/dockerconfigjson` for GHCR authentication
- _Requirements: N1, 8.2_
- [ ] 15.4 Configure GHCR image pull authentication on the cluster
- Create a `docker-registry` secret in the `stonks-oracle` namespace with GHCR credentials (a GitHub personal access token with the `read:packages` scope; SSH deploy keys cannot authenticate to GHCR)
- Reference the imagePullSecret in all deployment specs via the Helm values
- _Requirements: 8.2, N1_
- [ ] 15.5 Deploy stonks-oracle to the cluster via Helm
- Run `helm install` or `helm upgrade --install` targeting the `stonks-oracle` namespace
- Verify all pods reach Running/Ready state
- Verify services and ingress endpoints are reachable
- Debug and fix any deployment issues (CrashLoopBackOff, image pull errors, config mismatches)
- _Requirements: N1, 12.1_
- [ ] 15.6 Run shadow mode before enabling any live execution
- Confirm all services are running and processing in paper-only mode
- Validate end-to-end data flow from ingestion through recommendation without live trades
- _Requirements: N5, 8.1_
- [ ] 15.7 Prepare operator runbook and incident response procedures
- Document service restart procedures, log access, and common failure modes
- Document how to toggle trading modes and approve live execution
- _Requirements: 8.2, 12.1_
## Recommended First Vertical Slice ## Recommended First Vertical Slice
- [ ] Track 5 to 10 symbols - [ ] Track 5 to 10 symbols
+19 -2
View File
@@ -24,8 +24,25 @@ test:
build: build:
@for svc in $(SERVICES); do \ @for svc in $(SERVICES); do \
echo "Building $$svc..."; \ case $$svc in \
docker build -t $(GHCR)/$$svc:$(SHA) -t $(GHCR)/$$svc:latest -f docker/Dockerfile .; \ scheduler) cmd="python -m services.scheduler.app" ;; \
symbol-registry) cmd="uvicorn services.symbol_registry.app:app --host 0.0.0.0 --port 8000" ;; \
ingestion) cmd="python -m services.ingestion.worker" ;; \
parser) cmd="python -m services.parser.worker" ;; \
extractor) cmd="python -m services.extractor.main" ;; \
aggregation) cmd="python -m services.aggregation.main" ;; \
recommendation) cmd="python -m services.recommendation.main" ;; \
risk) cmd="uvicorn services.risk.app:app --host 0.0.0.0 --port 8000" ;; \
broker-adapter) cmd="python -m services.adapters.broker_service" ;; \
lake-publisher) cmd="python -m services.lake_publisher.jobs" ;; \
query-api) cmd="uvicorn services.api.app:app --host 0.0.0.0 --port 8000" ;; \
esac; \
echo "Building $$svc ($$cmd)..."; \
docker build \
--build-arg "SERVICE_CMD=$$cmd" \
-t $(GHCR)/$$svc:$(SHA) \
-t $(GHCR)/$$svc:latest \
-f docker/Dockerfile . || exit 1; \
done done
push: push:
+15 -6
View File
@@ -3,9 +3,18 @@
Apache Superset dashboard configurations and starter datasets for Stonks Oracle. Apache Superset dashboard configurations and starter datasets for Stonks Oracle.
## Starter Dashboards ## Starter Dashboards
- Symbol Overview — company profile, source health, recent documents See `starter/` for dashboard definitions covering:
- Sentiment Heatmap — market-wide sentiment by sector and symbol - Symbol Overview — company profiles, source health, recent documents, and market snapshots
- Prediction Accuracy — predicted signals vs realized price moves - Sentiment Heatmap — market-wide sentiment by sector and symbol, catalyst analysis
- Paper Trading PnL — paper trade performance and position tracking - Prediction Accuracy — predicted signals vs realized price moves, confidence calibration
- Model Quality — extraction success rates, latency, and confidence distributions - Paper Trading PnL — cumulative PnL, position snapshots, order history, and scorecards
- Source Coverage — ingestion throughput, source failures, and coverage gaps
## Operational Dashboards
See `operational/` for dashboard definitions covering:
- Ingestion Throughput — documents/hour by source type, success/failure rates, stale sources
- Model Extraction Quality — success rates, latency percentiles, validation failures, confidence distributions
- Source Coverage & Gaps — per-symbol source type matrix, missing sources, failure heatmap
Starter dashboards are powered by the Trino `lakehouse` catalog over MinIO-backed analytical tables.
Operational dashboards query the Query API `/api/ops/*` endpoints.
All dashboards can be imported into Superset via the UI or CLI.
+22
View File
@@ -0,0 +1,22 @@
# Operational Dashboards
Superset dashboard definitions for Stonks Oracle operational monitoring.
## Dashboards
- Ingestion Throughput — documents ingested per hour by source type, success/failure rates
- Model Extraction Quality — extraction success rates, latency percentiles, validation failures
- Source Coverage & Gaps — symbols missing source types, stale sources with no recent data
## Data Sources
These dashboards query the Query API operational endpoints:
- `/api/ops/ingestion/throughput` — time-bucketed ingestion metrics
- `/api/ops/ingestion/summary` — aggregate ingestion stats
- `/api/ops/model/failures` — recent extraction failures
- `/api/ops/model/performance` — model performance summary
- `/api/ops/pipeline/health` — pipeline stage health
- `/api/ops/sources/coverage-gaps` — source coverage analysis
## Setup
Import the dashboard JSON files into Superset via the Superset UI or CLI.
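The chart queries embedded in these files read operational tables such as `ingestion_runs`, `sources`, and `model_performance_metrics` using PostgreSQL syntax (for example `::numeric` casts), so they need a Superset connection to the operational database rather than the Trino `lakehouse` catalog.
Use the Query API endpoints listed under Data Sources for supplementary real-time operational data.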
@@ -0,0 +1,75 @@
{
"dashboard_title": "Ingestion Throughput",
"description": "Operational dashboard for monitoring ingestion pipeline throughput, success rates, and item counts across source types.",
"slug": "ingestion-throughput",
"position_json": {
"HEADER_ID": {"id": "HEADER_ID", "type": "HEADER", "meta": {"text": "Ingestion Throughput"}},
"ROW-1": {
"type": "ROW",
"children": ["CHART-throughput-timeseries", "CHART-source-type-breakdown"]
},
"ROW-2": {
"type": "ROW",
"children": ["CHART-success-failure-rate", "CHART-items-fetched"]
},
"ROW-3": {
"type": "ROW",
"children": ["CHART-stale-sources", "CHART-active-companies"]
}
},
"metadata": {
"refresh_frequency": 300,
"default_filters": "{}",
"color_scheme": "supersetColors"
},
"charts": [
{
"slice_name": "Ingestion Runs Over Time",
"viz_type": "echarts_timeseries_bar",
"description": "Ingestion run counts bucketed by hour, stacked by source type",
"datasource_type": "query",
"query": "SELECT date_trunc('hour', ir.started_at) AS bucket, ir.source_type, COUNT(*) AS run_count, COUNT(*) FILTER (WHERE ir.status = 'completed') AS completed, COUNT(*) FILTER (WHERE ir.status = 'failed') AS failed FROM ingestion_runs ir WHERE ir.started_at >= NOW() - INTERVAL '24 hours' GROUP BY 1, 2 ORDER BY 1",
"params": {
"x_axis": "bucket",
"metrics": ["run_count"],
"groupby": ["source_type"],
"time_grain_sqla": "PT1H"
}
},
{
"slice_name": "Source Type Breakdown",
"viz_type": "pie",
"description": "Distribution of ingestion runs by source type in the last 24h",
"datasource_type": "query",
"query": "SELECT ir.source_type, COUNT(*) AS runs FROM ingestion_runs ir WHERE ir.started_at >= NOW() - INTERVAL '24 hours' GROUP BY ir.source_type ORDER BY runs DESC"
},
{
"slice_name": "Success vs Failure Rate",
"viz_type": "echarts_timeseries_line",
"description": "Hourly success and failure counts over time",
"datasource_type": "query",
"query": "SELECT date_trunc('hour', ir.started_at) AS bucket, COUNT(*) FILTER (WHERE ir.status = 'completed') AS completed, COUNT(*) FILTER (WHERE ir.status = 'failed') AS failed, ROUND(COUNT(*) FILTER (WHERE ir.status = 'completed')::numeric / NULLIF(COUNT(*), 0), 3) AS success_rate FROM ingestion_runs ir WHERE ir.started_at >= NOW() - INTERVAL '24 hours' GROUP BY 1 ORDER BY 1"
},
{
"slice_name": "Items Fetched Over Time",
"viz_type": "echarts_timeseries_bar",
"description": "Total items fetched and new items per hour",
"datasource_type": "query",
"query": "SELECT date_trunc('hour', ir.started_at) AS bucket, COALESCE(SUM(ir.items_fetched), 0) AS items_fetched, COALESCE(SUM(ir.items_new), 0) AS items_new FROM ingestion_runs ir WHERE ir.started_at >= NOW() - INTERVAL '24 hours' GROUP BY 1 ORDER BY 1"
},
{
"slice_name": "Stale Sources",
"viz_type": "table",
"description": "Sources with no successful run in the last 24 hours",
"datasource_type": "query",
"query": "SELECT c.ticker, s.source_type, s.source_name, MAX(ir.started_at) FILTER (WHERE ir.status = 'completed') AS last_success, COUNT(*) FILTER (WHERE ir.status = 'failed' AND ir.started_at >= NOW() - INTERVAL '24 hours') AS recent_failures FROM sources s JOIN companies c ON c.id = s.company_id LEFT JOIN ingestion_runs ir ON ir.source_id = s.id WHERE s.active = TRUE AND c.active = TRUE GROUP BY c.ticker, s.source_type, s.source_name HAVING MAX(ir.started_at) FILTER (WHERE ir.status = 'completed') < NOW() - INTERVAL '24 hours' OR MAX(ir.started_at) FILTER (WHERE ir.status = 'completed') IS NULL ORDER BY c.ticker"
},
{
"slice_name": "Active Companies Ingested",
"viz_type": "big_number_total",
"description": "Count of distinct companies with ingestion activity in the last 24h",
"datasource_type": "query",
"query": "SELECT COUNT(DISTINCT company_id) AS active_companies FROM ingestion_runs WHERE started_at >= NOW() - INTERVAL '24 hours'"
}
]
}
+94
View File
@@ -0,0 +1,94 @@
{
"dashboard_title": "Model Extraction Quality",
"description": "Operational dashboard for monitoring Ollama extraction success rates, latency, validation failures, and confidence distributions.",
"slug": "model-extraction-quality",
"position_json": {
"HEADER_ID": {"id": "HEADER_ID", "type": "HEADER", "meta": {"text": "Model Extraction Quality"}},
"ROW-1": {
"type": "ROW",
"children": ["CHART-success-rate-kpi", "CHART-avg-latency-kpi", "CHART-avg-confidence-kpi", "CHART-retry-rate-kpi"]
},
"ROW-2": {
"type": "ROW",
"children": ["CHART-extraction-timeseries", "CHART-validation-status-pie"]
},
"ROW-3": {
"type": "ROW",
"children": ["CHART-latency-percentiles", "CHART-confidence-distribution"]
},
"ROW-4": {
"type": "ROW",
"children": ["CHART-recent-failures-table"]
}
},
"metadata": {
"refresh_frequency": 300,
"default_filters": "{}",
"color_scheme": "supersetColors"
},
"charts": [
{
"slice_name": "Extraction Success Rate",
"viz_type": "big_number_total",
"description": "Overall extraction success rate in the last 24h",
"datasource_type": "query",
"query": "SELECT ROUND(COUNT(*) FILTER (WHERE success)::numeric / NULLIF(COUNT(*), 0), 4) AS success_rate FROM model_performance_metrics WHERE recorded_at >= NOW() - INTERVAL '24 hours'"
},
{
"slice_name": "Avg Extraction Latency",
"viz_type": "big_number_total",
"description": "Average extraction duration in milliseconds",
"datasource_type": "query",
"query": "SELECT ROUND(AVG(total_duration_ms)::numeric, 0) AS avg_latency_ms FROM model_performance_metrics WHERE recorded_at >= NOW() - INTERVAL '24 hours'"
},
{
"slice_name": "Avg Confidence Score",
"viz_type": "big_number_total",
"description": "Average confidence of successful extractions",
"datasource_type": "query",
"query": "SELECT ROUND(AVG(confidence)::numeric, 3) AS avg_confidence FROM model_performance_metrics WHERE recorded_at >= NOW() - INTERVAL '24 hours' AND success = TRUE"
},
{
"slice_name": "Avg Retry Count",
"viz_type": "big_number_total",
"description": "Average retries per extraction attempt",
"datasource_type": "query",
"query": "SELECT ROUND(AVG(retry_count)::numeric, 2) AS avg_retries FROM model_performance_metrics WHERE recorded_at >= NOW() - INTERVAL '24 hours'"
},
{
"slice_name": "Extractions Over Time",
"viz_type": "echarts_timeseries_bar",
"description": "Hourly extraction counts split by success/failure",
"datasource_type": "query",
"query": "SELECT date_trunc('hour', recorded_at) AS bucket, COUNT(*) FILTER (WHERE success) AS successful, COUNT(*) FILTER (WHERE NOT success) AS failed FROM model_performance_metrics WHERE recorded_at >= NOW() - INTERVAL '24 hours' GROUP BY 1 ORDER BY 1"
},
{
"slice_name": "Validation Status Distribution",
"viz_type": "pie",
"description": "Breakdown of extraction validation outcomes",
"datasource_type": "query",
"query": "SELECT validation_status, COUNT(*) AS count FROM model_performance_metrics WHERE recorded_at >= NOW() - INTERVAL '24 hours' GROUP BY validation_status"
},
{
"slice_name": "Latency Percentiles Over Time",
"viz_type": "echarts_timeseries_line",
"description": "P50, P95, P99 extraction latency per hour",
"datasource_type": "query",
"query": "SELECT date_trunc('hour', recorded_at) AS bucket, ROUND(PERCENTILE_CONT(0.5) WITHIN GROUP (ORDER BY total_duration_ms)::numeric, 0) AS p50_ms, ROUND(PERCENTILE_CONT(0.95) WITHIN GROUP (ORDER BY total_duration_ms)::numeric, 0) AS p95_ms, ROUND(PERCENTILE_CONT(0.99) WITHIN GROUP (ORDER BY total_duration_ms)::numeric, 0) AS p99_ms FROM model_performance_metrics WHERE recorded_at >= NOW() - INTERVAL '24 hours' GROUP BY 1 ORDER BY 1"
},
{
"slice_name": "Confidence Distribution",
"viz_type": "histogram",
"description": "Distribution of extraction confidence scores",
"datasource_type": "query",
"query": "SELECT CASE WHEN confidence >= 0.9 THEN '0.9-1.0' WHEN confidence >= 0.8 THEN '0.8-0.9' WHEN confidence >= 0.7 THEN '0.7-0.8' WHEN confidence >= 0.6 THEN '0.6-0.7' WHEN confidence >= 0.5 THEN '0.5-0.6' ELSE '<0.5' END AS confidence_bucket, COUNT(*) AS count FROM model_performance_metrics WHERE recorded_at >= NOW() - INTERVAL '24 hours' AND success = TRUE GROUP BY 1 ORDER BY 1"
},
{
"slice_name": "Recent Extraction Failures",
"viz_type": "table",
"description": "Up to 50 most recent failed extractions in the last 24h, with validation status and error counts",
"datasource_type": "query",
"query": "SELECT mpm.ticker, mpm.model_name, mpm.validation_status, mpm.validation_error_count, mpm.attempt_count, mpm.total_duration_ms, mpm.recorded_at, d.title, d.document_type FROM model_performance_metrics mpm LEFT JOIN documents d ON d.id = mpm.document_id WHERE mpm.success = FALSE AND mpm.recorded_at >= NOW() - INTERVAL '24 hours' ORDER BY mpm.recorded_at DESC LIMIT 50"
}
]
}
@@ -0,0 +1,51 @@
{
"dashboard_title": "Source Coverage & Gaps",
"description": "Operational dashboard for identifying source coverage gaps, stale sources, and symbols missing expected data feeds.",
"slug": "source-coverage-gaps",
"position_json": {
"HEADER_ID": {"id": "HEADER_ID", "type": "HEADER", "meta": {"text": "Source Coverage & Gaps"}},
"ROW-1": {
"type": "ROW",
"children": ["CHART-coverage-matrix", "CHART-missing-types-table"]
},
"ROW-2": {
"type": "ROW",
"children": ["CHART-stale-sources-table", "CHART-failure-heatmap"]
}
},
"metadata": {
"refresh_frequency": 600,
"default_filters": "{}",
"color_scheme": "supersetColors"
},
"charts": [
{
"slice_name": "Source Coverage Matrix",
"viz_type": "table",
"description": "Per-symbol source type coverage showing active source counts",
"datasource_type": "query",
"query": "SELECT c.ticker, c.legal_name, c.sector, COUNT(s.id) FILTER (WHERE s.active) AS active_sources, COUNT(s.id) FILTER (WHERE s.source_type = 'market_api' AND s.active) AS market_sources, COUNT(s.id) FILTER (WHERE s.source_type = 'news_api' AND s.active) AS news_sources, COUNT(s.id) FILTER (WHERE s.source_type = 'filings_api' AND s.active) AS filings_sources, COUNT(s.id) FILTER (WHERE s.source_type = 'web_scrape' AND s.active) AS web_scrape_sources, COUNT(s.id) FILTER (WHERE s.source_type = 'broker' AND s.active) AS broker_sources FROM companies c LEFT JOIN sources s ON s.company_id = c.id WHERE c.active = TRUE GROUP BY c.ticker, c.legal_name, c.sector ORDER BY c.ticker"
},
{
"slice_name": "Symbols Missing Source Types",
"viz_type": "table",
"description": "Companies that lack one or more expected source types (market_api, news_api, filings_api)",
"datasource_type": "query",
"query": "SELECT c.ticker, c.legal_name, c.sector, ARRAY_AGG(DISTINCT s.source_type) FILTER (WHERE s.active) AS active_types FROM companies c LEFT JOIN sources s ON s.company_id = c.id AND s.active = TRUE WHERE c.active = TRUE GROUP BY c.ticker, c.legal_name, c.sector HAVING NOT ARRAY['market_api', 'news_api', 'filings_api'] <@ ARRAY_AGG(DISTINCT s.source_type) FILTER (WHERE s.active) OR ARRAY_AGG(DISTINCT s.source_type) FILTER (WHERE s.active) IS NULL ORDER BY c.ticker"
},
{
"slice_name": "Stale Sources (No Success in 24h)",
"viz_type": "table",
"description": "Active sources that have not completed a successful ingestion run in the last 24 hours",
"datasource_type": "query",
"query": "SELECT c.ticker, s.source_type, s.source_name, MAX(ir.started_at) FILTER (WHERE ir.status = 'completed') AS last_success, MAX(ir.started_at) AS last_attempt, COUNT(*) FILTER (WHERE ir.status = 'failed' AND ir.started_at >= NOW() - INTERVAL '24 hours') AS recent_failures FROM sources s JOIN companies c ON c.id = s.company_id LEFT JOIN ingestion_runs ir ON ir.source_id = s.id WHERE s.active = TRUE AND c.active = TRUE GROUP BY c.ticker, s.source_type, s.source_name HAVING MAX(ir.started_at) FILTER (WHERE ir.status = 'completed') < NOW() - INTERVAL '24 hours' OR MAX(ir.started_at) FILTER (WHERE ir.status = 'completed') IS NULL ORDER BY c.ticker, s.source_type"
},
{
"slice_name": "Source Failure Heatmap",
"viz_type": "heatmap",
"description": "Failure counts by source type and ticker in the last 24h",
"datasource_type": "query",
"query": "SELECT c.ticker, ir.source_type, COUNT(*) FILTER (WHERE ir.status = 'failed') AS failures FROM ingestion_runs ir JOIN companies c ON c.id = ir.company_id WHERE ir.started_at >= NOW() - INTERVAL '24 hours' GROUP BY c.ticker, ir.source_type HAVING COUNT(*) FILTER (WHERE ir.status = 'failed') > 0 ORDER BY failures DESC"
}
]
}
+29
View File
@@ -0,0 +1,29 @@
# Starter Dashboards
Superset dashboard definitions for Stonks Oracle research, analysis, and trading review.
## Dashboards
- Symbol Overview — company profiles, source health, recent documents, and market snapshots
- Sentiment Heatmap — market-wide sentiment by sector and symbol, catalyst analysis, contradiction tracking
- Prediction Accuracy — predicted signals vs realized price moves, confidence calibration, per-symbol accuracy
- Paper Trading PnL — cumulative PnL, daily performance, position snapshots, order history, and scorecards
## Data Sources
These dashboards query the Trino `lakehouse` catalog over MinIO-backed analytical fact tables:
- `lakehouse.stonks.documents` — ingested document metadata
- `lakehouse.stonks.document_extractions` — AI extraction outputs
- `lakehouse.stonks.trade_signals` — aggregated trend signals
- `lakehouse.stonks.market_bars` — OHLCV bar data
- `lakehouse.stonks.prediction_vs_outcome` — prediction accuracy tracking
- `lakehouse.stonks.pnl_daily` — daily PnL records
- `lakehouse.stonks.positions_daily` — end-of-day position snapshots
- `lakehouse.stonks.trade_orders` — order submission records
- `lakehouse.stonks.trade_fills` — fill and execution records
## Setup
1. Import the dashboard JSON files into Superset via the Superset UI or CLI
2. Ensure the Trino datasource is configured in Superset with the SQLAlchemy URI `trino://trino@trino:8080/lakehouse/stonks`
3. Create the lakehouse views from `lakehouse/views/` for additional drill-down capability
## Trino Connection
The dashboards use the default Superset Trino connection configured in `infra/superset/superset_config.py`.
+124
View File
@@ -0,0 +1,124 @@
{
"dashboard_title": "Paper Trading PnL",
"description": "Paper trading performance tracking with PnL curves, position snapshots, order history, and trade detail drill-down.",
"slug": "paper-trading-pnl",
"position_json": {
"HEADER_ID": {"id": "HEADER_ID", "type": "HEADER", "meta": {"text": "Paper Trading PnL"}},
"ROW-1": {
"type": "ROW",
"children": ["CHART-total-net-pnl-kpi", "CHART-win-rate-kpi", "CHART-total-orders-kpi", "CHART-active-positions-kpi"]
},
"ROW-2": {
"type": "ROW",
"children": ["CHART-cumulative-pnl-timeseries", "CHART-daily-pnl-bar"]
},
"ROW-3": {
"type": "ROW",
"children": ["CHART-pnl-by-symbol", "CHART-order-status-pie"]
},
"ROW-4": {
"type": "ROW",
"children": ["CHART-positions-table"]
},
"ROW-5": {
"type": "ROW",
"children": ["CHART-scorecard-table"]
},
"ROW-6": {
"type": "ROW",
"children": ["CHART-recent-orders-table"]
}
},
"metadata": {
"refresh_frequency": 300,
"default_filters": "{}",
"color_scheme": "supersetColors"
},
"charts": [
{
"slice_name": "Total Net PnL",
"viz_type": "big_number_total",
"description": "Cumulative net PnL across all paper trading activity",
"datasource_type": "trino",
"query": "SELECT ROUND(SUM(net_pnl), 2) AS total_net_pnl FROM lakehouse.stonks.pnl_daily WHERE execution_mode = 'paper'"
},
{
"slice_name": "Win Rate",
"viz_type": "big_number_total",
"description": "Fraction of daily PnL records with positive net PnL (counted per pnl_daily row, not per calendar day)",
"datasource_type": "trino",
"query": "SELECT ROUND(CAST(COUNT(CASE WHEN net_pnl > 0 THEN 1 END) AS DOUBLE) / NULLIF(COUNT(*), 0), 4) AS win_rate FROM lakehouse.stonks.pnl_daily WHERE execution_mode = 'paper'"
},
{
"slice_name": "Total Orders",
"viz_type": "big_number_total",
"description": "Total paper trade orders submitted",
"datasource_type": "trino",
"query": "SELECT COUNT(DISTINCT order_id) AS total_orders FROM lakehouse.stonks.trade_orders WHERE execution_mode = 'paper'"
},
{
"slice_name": "Active Positions",
"viz_type": "big_number_total",
"description": "Number of symbols with open positions as of the latest snapshot",
"datasource_type": "trino",
"query": "SELECT COUNT(DISTINCT ticker) AS active_positions FROM lakehouse.stonks.positions_daily WHERE execution_mode = 'paper' AND quantity <> 0 AND dt = (SELECT MAX(dt) FROM lakehouse.stonks.positions_daily WHERE execution_mode = 'paper')"
},
{
"slice_name": "Cumulative PnL Over Time",
"viz_type": "echarts_timeseries_line",
"description": "Running cumulative net PnL across all paper trades",
"datasource_type": "trino",
"query": "SELECT dt AS bucket, SUM(net_pnl) AS daily_net_pnl, SUM(SUM(net_pnl)) OVER (ORDER BY dt) AS cumulative_pnl FROM lakehouse.stonks.pnl_daily WHERE execution_mode = 'paper' GROUP BY dt ORDER BY dt"
},
{
"slice_name": "Daily PnL",
"viz_type": "echarts_timeseries_bar",
"description": "Daily net PnL for paper trading, colored by positive/negative",
"datasource_type": "trino",
"query": "SELECT dt AS bucket, ROUND(SUM(net_pnl), 2) AS daily_pnl, ROUND(SUM(realized_pnl), 2) AS realized, ROUND(SUM(unrealized_pnl), 2) AS unrealized FROM lakehouse.stonks.pnl_daily WHERE execution_mode = 'paper' GROUP BY dt ORDER BY dt",
"params": {
"x_axis": "bucket",
"metrics": ["daily_pnl"]
}
},
{
"slice_name": "PnL by Symbol",
"viz_type": "echarts_timeseries_bar",
"description": "Total net PnL per symbol for paper trading",
"datasource_type": "trino",
"query": "SELECT ticker, ROUND(SUM(net_pnl), 2) AS total_pnl, ROUND(SUM(realized_pnl), 2) AS realized_pnl, ROUND(SUM(fees), 2) AS total_fees FROM lakehouse.stonks.pnl_daily WHERE execution_mode = 'paper' GROUP BY ticker ORDER BY total_pnl DESC",
"params": {
"x_axis": "ticker",
"metrics": ["total_pnl"]
}
},
{
"slice_name": "Order Status Distribution",
"viz_type": "pie",
"description": "Breakdown of paper trade order statuses",
"datasource_type": "trino",
"query": "SELECT status, COUNT(*) AS count FROM lakehouse.stonks.trade_orders WHERE execution_mode = 'paper' GROUP BY status ORDER BY count DESC"
},
{
"slice_name": "Current Positions",
"viz_type": "table",
"description": "Latest position snapshot for all paper trading symbols",
"datasource_type": "trino",
"query": "SELECT p.ticker, p.quantity, ROUND(p.avg_entry_price, 2) AS avg_entry, ROUND(p.close_price, 2) AS close_price, ROUND(p.market_value, 2) AS market_value, ROUND(p.unrealized_pnl, 2) AS unrealized_pnl, p.snapshot_at FROM lakehouse.stonks.positions_daily p WHERE p.execution_mode = 'paper' AND p.dt = (SELECT MAX(dt) FROM lakehouse.stonks.positions_daily WHERE execution_mode = 'paper') ORDER BY ABS(p.unrealized_pnl) DESC"
},
{
"slice_name": "Paper Trade Scorecard",
"viz_type": "table",
"description": "Per-symbol paper trading scorecard with win rates, PnL, and order counts",
"datasource_type": "trino",
"query": "SELECT pnl.ticker, COUNT(DISTINCT pnl.dt) AS trading_days, ROUND(SUM(pnl.net_pnl), 2) AS total_net_pnl, ROUND(AVG(pnl.net_pnl), 2) AS avg_daily_pnl, ROUND(CAST(COUNT(CASE WHEN pnl.net_pnl > 0 THEN 1 END) AS DOUBLE) / NULLIF(COUNT(*), 0), 4) AS win_rate, ROUND(MIN(pnl.net_pnl), 2) AS worst_day, ROUND(MAX(pnl.net_pnl), 2) AS best_day, ROUND(SUM(pnl.fees), 2) AS total_fees, MIN(pnl.dt) AS first_trade, MAX(pnl.dt) AS last_trade FROM lakehouse.stonks.pnl_daily pnl WHERE pnl.execution_mode = 'paper' GROUP BY pnl.ticker ORDER BY total_net_pnl DESC"
},
{
"slice_name": "Recent Orders",
"viz_type": "table",
"description": "Most recent paper trade orders with fill details",
"datasource_type": "trino",
"query": "SELECT o.ticker, o.side, o.order_type, o.quantity, ROUND(o.limit_price, 2) AS limit_price, o.status, f.fill_price, f.fill_quantity, f.commission, o.submitted_at, f.filled_at FROM lakehouse.stonks.trade_orders o LEFT JOIN lakehouse.stonks.trade_fills f ON o.order_id = f.order_id AND o.dt = f.dt WHERE o.execution_mode = 'paper' ORDER BY o.submitted_at DESC LIMIT 50"
}
]
}
+125
View File
@@ -0,0 +1,125 @@
{
"dashboard_title": "Prediction Accuracy",
"description": "Predicted signals vs realized price moves, confidence calibration, and model accuracy tracking.",
"slug": "prediction-accuracy",
"position_json": {
"HEADER_ID": {"id": "HEADER_ID", "type": "HEADER", "meta": {"text": "Prediction Accuracy"}},
"ROW-1": {
"type": "ROW",
"children": ["CHART-overall-hit-rate-kpi", "CHART-total-predictions-kpi", "CHART-avg-confidence-kpi", "CHART-avg-move-kpi"]
},
"ROW-2": {
"type": "ROW",
"children": ["CHART-hit-rate-timeseries", "CHART-outcome-distribution-pie"]
},
"ROW-3": {
"type": "ROW",
"children": ["CHART-confidence-calibration", "CHART-confidence-vs-move-scatter"]
},
"ROW-4": {
"type": "ROW",
"children": ["CHART-accuracy-by-symbol", "CHART-accuracy-by-action"]
},
"ROW-5": {
"type": "ROW",
"children": ["CHART-recent-predictions-table"]
}
},
"metadata": {
"refresh_frequency": 600,
"default_filters": "{}",
"color_scheme": "supersetColors"
},
"charts": [
{
"slice_name": "Overall Hit Rate",
"viz_type": "big_number_total",
"description": "Fraction of predictions with correct directional outcome over the last 30 days",
"datasource_type": "trino",
"query": "SELECT ROUND(CAST(COUNT(CASE WHEN outcome = 'correct' THEN 1 END) AS DOUBLE) / NULLIF(COUNT(*), 0), 4) AS hit_rate FROM lakehouse.stonks.prediction_vs_outcome WHERE dt >= CURRENT_DATE - INTERVAL '30' DAY"
},
{
"slice_name": "Total Predictions (30d)",
"viz_type": "big_number_total",
"description": "Total evaluated predictions in the last 30 days",
"datasource_type": "trino",
"query": "SELECT COUNT(*) AS total_predictions FROM lakehouse.stonks.prediction_vs_outcome WHERE dt >= CURRENT_DATE - INTERVAL '30' DAY"
},
{
"slice_name": "Avg Predicted Confidence",
"viz_type": "big_number_total",
"description": "Average confidence of predictions in the last 30 days",
"datasource_type": "trino",
"query": "SELECT ROUND(AVG(predicted_confidence), 3) AS avg_confidence FROM lakehouse.stonks.prediction_vs_outcome WHERE dt >= CURRENT_DATE - INTERVAL '30' DAY"
},
{
"slice_name": "Avg Realized Move",
"viz_type": "big_number_total",
"description": "Average absolute realized price move percentage over the last 30 days",
"datasource_type": "trino",
"query": "SELECT ROUND(AVG(ABS(actual_move_pct)), 3) AS avg_abs_move FROM lakehouse.stonks.prediction_vs_outcome WHERE dt >= CURRENT_DATE - INTERVAL '30' DAY"
},
{
"slice_name": "Daily Hit Rate",
"viz_type": "echarts_timeseries_line",
"description": "Daily prediction hit rate over the last 30 days",
"datasource_type": "trino",
"query": "SELECT dt AS bucket, COUNT(*) AS total, COUNT(CASE WHEN outcome = 'correct' THEN 1 END) AS correct, ROUND(CAST(COUNT(CASE WHEN outcome = 'correct' THEN 1 END) AS DOUBLE) / NULLIF(COUNT(*), 0), 4) AS hit_rate FROM lakehouse.stonks.prediction_vs_outcome WHERE dt >= CURRENT_DATE - INTERVAL '30' DAY GROUP BY dt ORDER BY dt"
},
{
"slice_name": "Outcome Distribution",
"viz_type": "pie",
"description": "Breakdown of prediction outcomes (correct, incorrect, neutral) over the last 30 days",
"datasource_type": "trino",
"query": "SELECT outcome, COUNT(*) AS count FROM lakehouse.stonks.prediction_vs_outcome WHERE dt >= CURRENT_DATE - INTERVAL '30' DAY GROUP BY outcome ORDER BY count DESC"
},
{
"slice_name": "Confidence Calibration",
"viz_type": "echarts_timeseries_bar",
"description": "Hit rate by confidence bucket to assess calibration quality",
"datasource_type": "trino",
"query": "SELECT CASE WHEN predicted_confidence >= 0.8 THEN '0.8-1.0 (high)' WHEN predicted_confidence >= 0.6 THEN '0.6-0.8 (medium)' WHEN predicted_confidence >= 0.4 THEN '0.4-0.6 (low)' ELSE '0.0-0.4 (very low)' END AS confidence_bucket, COUNT(*) AS total, COUNT(CASE WHEN outcome = 'correct' THEN 1 END) AS correct, ROUND(CAST(COUNT(CASE WHEN outcome = 'correct' THEN 1 END) AS DOUBLE) / NULLIF(COUNT(*), 0), 4) AS hit_rate FROM lakehouse.stonks.prediction_vs_outcome WHERE dt >= CURRENT_DATE - INTERVAL '30' DAY GROUP BY 1 ORDER BY 1",
"params": {
"x_axis": "confidence_bucket",
"metrics": ["hit_rate"]
}
},
{
"slice_name": "Confidence vs Realized Move",
"viz_type": "echarts_timeseries_scatter",
"description": "Scatter plot of predicted confidence vs actual realized move percentage",
"datasource_type": "trino",
"query": "SELECT ticker, predicted_confidence, actual_move_pct, predicted_action, outcome, dt FROM lakehouse.stonks.prediction_vs_outcome WHERE dt >= CURRENT_DATE - INTERVAL '30' DAY ORDER BY dt DESC",
"params": {
"x_axis": "predicted_confidence",
"y_axis": "actual_move_pct",
"groupby": ["outcome"]
}
},
{
"slice_name": "Accuracy by Symbol",
"viz_type": "table",
"description": "Per-symbol prediction accuracy summary over the last 30 days",
"datasource_type": "trino",
"query": "SELECT ticker, COUNT(*) AS predictions, COUNT(CASE WHEN outcome = 'correct' THEN 1 END) AS correct, COUNT(CASE WHEN outcome = 'incorrect' THEN 1 END) AS incorrect, ROUND(CAST(COUNT(CASE WHEN outcome = 'correct' THEN 1 END) AS DOUBLE) / NULLIF(COUNT(*), 0), 4) AS hit_rate, ROUND(AVG(predicted_confidence), 3) AS avg_confidence, ROUND(AVG(actual_move_pct), 3) AS avg_move_pct, ROUND(AVG(ABS(actual_move_pct)), 3) AS avg_abs_move_pct FROM lakehouse.stonks.prediction_vs_outcome WHERE dt >= CURRENT_DATE - INTERVAL '30' DAY GROUP BY ticker ORDER BY hit_rate DESC"
},
{
"slice_name": "Accuracy by Action Type",
"viz_type": "echarts_timeseries_bar",
"description": "Hit rate broken down by predicted action (buy, sell, hold, watch)",
"datasource_type": "trino",
"query": "SELECT predicted_action, COUNT(*) AS total, COUNT(CASE WHEN outcome = 'correct' THEN 1 END) AS correct, ROUND(CAST(COUNT(CASE WHEN outcome = 'correct' THEN 1 END) AS DOUBLE) / NULLIF(COUNT(*), 0), 4) AS hit_rate, ROUND(AVG(predicted_confidence), 3) AS avg_confidence FROM lakehouse.stonks.prediction_vs_outcome WHERE dt >= CURRENT_DATE - INTERVAL '30' DAY GROUP BY predicted_action ORDER BY predicted_action",
"params": {
"x_axis": "predicted_action",
"metrics": ["hit_rate"]
}
},
{
"slice_name": "Recent Predictions",
"viz_type": "table",
"description": "Most recent evaluated predictions with outcomes",
"datasource_type": "trino",
"query": "SELECT ticker, predicted_action, ROUND(predicted_confidence, 3) AS confidence, ROUND(actual_move_pct, 3) AS actual_move_pct, outcome, horizon_days, model_version, predicted_at, evaluated_at FROM lakehouse.stonks.prediction_vs_outcome WHERE dt >= CURRENT_DATE - INTERVAL '14' DAY ORDER BY evaluated_at DESC LIMIT 50"
}
]
}
+120
View File
@@ -0,0 +1,120 @@
{
"dashboard_title": "Sentiment Heatmap",
"description": "Market-wide sentiment visualization by sector and symbol, with trend direction and catalyst analysis.",
"slug": "sentiment-heatmap",
"position_json": {
"HEADER_ID": {"id": "HEADER_ID", "type": "HEADER", "meta": {"text": "Sentiment Heatmap"}},
"ROW-1": {
"type": "ROW",
"children": ["CHART-bullish-count-kpi", "CHART-bearish-count-kpi", "CHART-mixed-count-kpi", "CHART-avg-contradiction-kpi"]
},
"ROW-2": {
"type": "ROW",
"children": ["CHART-sentiment-heatmap"]
},
"ROW-3": {
"type": "ROW",
"children": ["CHART-sentiment-timeseries", "CHART-catalyst-breakdown"]
},
"ROW-4": {
"type": "ROW",
"children": ["CHART-contradiction-scatter", "CHART-sentiment-distribution"]
},
"ROW-5": {
"type": "ROW",
"children": ["CHART-symbol-sentiment-detail"]
}
},
"metadata": {
"refresh_frequency": 300,
"default_filters": "{}",
"color_scheme": "supersetColors"
},
"charts": [
{
"slice_name": "Bullish Signals (7d)",
"viz_type": "big_number_total",
"description": "Count of bullish trend signals in the last 7 days",
"datasource_type": "trino",
"query": "SELECT COUNT(*) AS bullish_count FROM lakehouse.stonks.trade_signals WHERE trend_direction = 'bullish' AND dt >= CURRENT_DATE - INTERVAL '7' DAY"
},
{
"slice_name": "Bearish Signals (7d)",
"viz_type": "big_number_total",
"description": "Count of bearish trend signals in the last 7 days",
"datasource_type": "trino",
"query": "SELECT COUNT(*) AS bearish_count FROM lakehouse.stonks.trade_signals WHERE trend_direction = 'bearish' AND dt >= CURRENT_DATE - INTERVAL '7' DAY"
},
{
"slice_name": "Mixed Signals (7d)",
"viz_type": "big_number_total",
"description": "Count of mixed or neutral trend signals in the last 7 days",
"datasource_type": "trino",
"query": "SELECT COUNT(*) AS mixed_count FROM lakehouse.stonks.trade_signals WHERE trend_direction IN ('mixed', 'neutral') AND dt >= CURRENT_DATE - INTERVAL '7' DAY"
},
{
"slice_name": "Avg Contradiction Score (7d)",
"viz_type": "big_number_total",
"description": "Average contradiction score across all signals in the last 7 days",
"datasource_type": "trino",
"query": "SELECT ROUND(AVG(contradiction_score), 3) AS avg_contradiction FROM lakehouse.stonks.trade_signals WHERE dt >= CURRENT_DATE - INTERVAL '7' DAY"
},
{
"slice_name": "Sentiment Heatmap by Symbol",
"viz_type": "heatmap",
"description": "Daily average sentiment score by symbol over the last 14 days (positive = 1, negative = -1, otherwise 0); the query also returns the average impact score",
"datasource_type": "trino",
"query": "SELECT de.ticker, de.dt, ROUND(AVG(de.impact_score), 3) AS avg_impact, AVG(CASE WHEN de.sentiment = 'positive' THEN 1.0 WHEN de.sentiment = 'negative' THEN -1.0 ELSE 0.0 END) AS sentiment_score FROM lakehouse.stonks.document_extractions de WHERE de.dt >= CURRENT_DATE - INTERVAL '14' DAY GROUP BY de.ticker, de.dt ORDER BY de.ticker, de.dt",
"params": {
"x_axis": "dt",
"y_axis": "ticker",
"metric": "sentiment_score"
}
},
{
"slice_name": "Sentiment Trend Over Time",
"viz_type": "echarts_timeseries_line",
"description": "Daily average sentiment score across all symbols over the last 30 days",
"datasource_type": "trino",
"query": "SELECT de.dt AS bucket, ROUND(AVG(CASE WHEN de.sentiment = 'positive' THEN 1.0 WHEN de.sentiment = 'negative' THEN -1.0 ELSE 0.0 END), 3) AS avg_sentiment, COUNT(*) AS extraction_count FROM lakehouse.stonks.document_extractions de WHERE de.dt >= CURRENT_DATE - INTERVAL '30' DAY GROUP BY de.dt ORDER BY de.dt"
},
{
"slice_name": "Catalyst Type Breakdown",
"viz_type": "pie",
"description": "Distribution of catalyst types across extractions in the last 14 days",
"datasource_type": "trino",
"query": "SELECT catalyst_type, COUNT(*) AS count FROM lakehouse.stonks.document_extractions WHERE dt >= CURRENT_DATE - INTERVAL '14' DAY AND catalyst_type IS NOT NULL GROUP BY catalyst_type ORDER BY count DESC"
},
{
"slice_name": "Contradiction vs Confidence",
"viz_type": "echarts_timeseries_scatter",
"description": "Scatter of contradiction score vs confidence for recent signals",
"datasource_type": "trino",
"query": "SELECT ticker, confidence, contradiction_score, trend_strength, trend_direction, dt FROM lakehouse.stonks.trade_signals WHERE dt >= CURRENT_DATE - INTERVAL '14' DAY ORDER BY dt DESC",
"params": {
"x_axis": "confidence",
"y_axis": "contradiction_score",
"groupby": ["trend_direction"]
}
},
{
"slice_name": "Sentiment Distribution by Symbol",
"viz_type": "echarts_timeseries_bar",
"description": "Count of positive, negative, and neutral extractions per symbol in the last 14 days",
"datasource_type": "trino",
"query": "SELECT ticker, sentiment, COUNT(*) AS count FROM lakehouse.stonks.document_extractions WHERE dt >= CURRENT_DATE - INTERVAL '14' DAY GROUP BY ticker, sentiment ORDER BY ticker, sentiment",
"params": {
"x_axis": "ticker",
"metrics": ["count"],
"groupby": ["sentiment"]
}
},
{
"slice_name": "Symbol Sentiment Detail",
"viz_type": "table",
"description": "Per-symbol sentiment summary over the last 14 days with extraction counts, average impact, confidence, and novelty, per-sentiment counts, and the latest trend direction and strength",
"datasource_type": "trino",
"query": "SELECT de.ticker, COUNT(*) AS extractions, ROUND(AVG(de.impact_score), 3) AS avg_impact, ROUND(AVG(de.confidence), 3) AS avg_confidence, ROUND(AVG(de.novelty_score), 3) AS avg_novelty, COUNT(CASE WHEN de.sentiment = 'positive' THEN 1 END) AS positive_count, COUNT(CASE WHEN de.sentiment = 'negative' THEN 1 END) AS negative_count, COUNT(CASE WHEN de.sentiment = 'neutral' THEN 1 END) AS neutral_count, ts.trend_direction AS latest_trend, ts.trend_strength AS latest_trend_strength FROM lakehouse.stonks.document_extractions de LEFT JOIN lakehouse.stonks.trade_signals ts ON de.ticker = ts.ticker AND ts.dt = (SELECT MAX(dt) FROM lakehouse.stonks.trade_signals WHERE ticker = de.ticker) WHERE de.dt >= CURRENT_DATE - INTERVAL '14' DAY GROUP BY de.ticker, ts.trend_direction, ts.trend_strength ORDER BY de.ticker"
}
]
}
+104
View File
@@ -0,0 +1,104 @@
{
"dashboard_title": "Symbol Overview",
"description": "Company profiles, source health, recent documents, and market snapshot for tracked symbols.",
"slug": "symbol-overview",
"position_json": {
"HEADER_ID": {"id": "HEADER_ID", "type": "HEADER", "meta": {"text": "Symbol Overview"}},
"ROW-1": {
"type": "ROW",
"children": ["CHART-tracked-symbols-kpi", "CHART-total-documents-kpi", "CHART-total-extractions-kpi", "CHART-active-signals-kpi"]
},
"ROW-2": {
"type": "ROW",
"children": ["CHART-company-summary-table"]
},
"ROW-3": {
"type": "ROW",
"children": ["CHART-recent-documents-timeseries", "CHART-document-type-breakdown"]
},
"ROW-4": {
"type": "ROW",
"children": ["CHART-latest-prices-table"]
},
"ROW-5": {
"type": "ROW",
"children": ["CHART-recent-documents-table"]
}
},
"metadata": {
"refresh_frequency": 300,
"default_filters": "{}",
"color_scheme": "supersetColors"
},
"charts": [
{
"slice_name": "Tracked Symbols",
"viz_type": "big_number_total",
"description": "Count of distinct symbols with documents in the last 30 days",
"datasource_type": "trino",
"query": "SELECT COUNT(DISTINCT ticker) AS tracked_symbols FROM lakehouse.stonks.documents WHERE dt >= CURRENT_DATE - INTERVAL '30' DAY"
},
{
"slice_name": "Total Documents (30d)",
"viz_type": "big_number_total",
"description": "Total documents ingested in the last 30 days",
"datasource_type": "trino",
"query": "SELECT COUNT(*) AS total_documents FROM lakehouse.stonks.documents WHERE dt >= CURRENT_DATE - INTERVAL '30' DAY"
},
{
"slice_name": "Total Extractions (30d)",
"viz_type": "big_number_total",
"description": "Total AI extractions completed in the last 30 days",
"datasource_type": "trino",
"query": "SELECT COUNT(*) AS total_extractions FROM lakehouse.stonks.document_extractions WHERE dt >= CURRENT_DATE - INTERVAL '30' DAY"
},
{
"slice_name": "Active Signals (7d)",
"viz_type": "big_number_total",
"description": "Trade signals generated in the last 7 days",
"datasource_type": "trino",
"query": "SELECT COUNT(*) AS active_signals FROM lakehouse.stonks.trade_signals WHERE dt >= CURRENT_DATE - INTERVAL '7' DAY"
},
{
"slice_name": "Company Summary",
"viz_type": "table",
"description": "Per-symbol summary with document counts, extraction counts, latest signal, and latest price",
"datasource_type": "trino",
"query": "SELECT d.ticker, COUNT(DISTINCT d.document_id) AS documents_30d, COUNT(DISTINCT de.document_id) AS extractions_30d, MAX(d.published_at) AS latest_document_at, MAX(ts.generated_at) AS latest_signal_at, MAX(ts.trend_direction) AS latest_trend, MAX(mb.close_price) AS latest_close FROM lakehouse.stonks.documents d LEFT JOIN lakehouse.stonks.document_extractions de ON d.ticker = de.ticker AND de.dt >= CURRENT_DATE - INTERVAL '30' DAY LEFT JOIN lakehouse.stonks.trade_signals ts ON d.ticker = ts.ticker AND ts.dt = (SELECT MAX(dt) FROM lakehouse.stonks.trade_signals WHERE ticker = d.ticker) LEFT JOIN lakehouse.stonks.market_bars mb ON d.ticker = mb.ticker AND mb.dt = (SELECT MAX(dt) FROM lakehouse.stonks.market_bars WHERE ticker = d.ticker) WHERE d.dt >= CURRENT_DATE - INTERVAL '30' DAY GROUP BY d.ticker ORDER BY d.ticker"
},
{
"slice_name": "Documents Ingested Over Time",
"viz_type": "echarts_timeseries_bar",
"description": "Daily document ingestion counts by source type over the last 30 days",
"datasource_type": "trino",
"query": "SELECT dt AS bucket, source_type, COUNT(*) AS doc_count FROM lakehouse.stonks.documents WHERE dt >= CURRENT_DATE - INTERVAL '30' DAY GROUP BY dt, source_type ORDER BY dt",
"params": {
"x_axis": "bucket",
"metrics": ["doc_count"],
"groupby": ["source_type"],
"time_grain_sqla": "P1D"
}
},
{
"slice_name": "Document Type Breakdown",
"viz_type": "pie",
"description": "Distribution of documents by type in the last 30 days",
"datasource_type": "trino",
"query": "SELECT document_type, COUNT(*) AS count FROM lakehouse.stonks.documents WHERE dt >= CURRENT_DATE - INTERVAL '30' DAY GROUP BY document_type ORDER BY count DESC"
},
{
"slice_name": "Latest Prices by Symbol",
"viz_type": "table",
"description": "Most recent closing prices and volume for each tracked symbol",
"datasource_type": "trino",
"query": "SELECT mb.ticker, mb.close_price, mb.open_price, mb.high_price, mb.low_price, mb.volume, mb.vwap, mb.bar_timestamp FROM lakehouse.stonks.market_bars mb INNER JOIN (SELECT ticker, MAX(bar_timestamp) AS max_ts FROM lakehouse.stonks.market_bars GROUP BY ticker) latest ON mb.ticker = latest.ticker AND mb.bar_timestamp = latest.max_ts ORDER BY mb.ticker"
},
{
"slice_name": "Recent Documents",
"viz_type": "table",
"description": "Most recently ingested documents across all symbols",
"datasource_type": "trino",
"query": "SELECT ticker, document_type, source_type, title, publisher, published_at, retrieved_at, confidence FROM lakehouse.stonks.documents WHERE dt >= CURRENT_DATE - INTERVAL '7' DAY ORDER BY retrieved_at DESC LIMIT 50"
}
]
}
+7 -1
View File
@@ -72,6 +72,9 @@ services:
image: trinodb/trino:latest image: trinodb/trino:latest
ports: ports:
- "8080:8080" - "8080:8080"
environment:
MINIO_ACCESS_KEY: minioadmin
MINIO_SECRET_KEY: minioadmin
volumes: volumes:
- ./infra/trino/catalog:/etc/trino/catalog - ./infra/trino/catalog:/etc/trino/catalog
depends_on: depends_on:
@@ -83,11 +86,14 @@ services:
environment: environment:
SERVICE_NAME: metastore SERVICE_NAME: metastore
DB_DRIVER: derby DB_DRIVER: derby
SERVICE_OPTS: "-Djavax.jdo.option.ConnectionURL=jdbc:derby:/opt/hive/data/metastore_db;create=true"
ports: ports:
- "9083:9083" - "9083:9083"
volumes: volumes:
- hive_data:/opt/hive/data - hive_data:/opt/hive/data
- ./infra/hive/core-site.xml:/opt/hive/conf/core-site.xml:ro
- ./infra/hive/metastore-site.xml:/opt/hive/conf/metastore-site.xml:ro
depends_on:
- minio
superset: superset:
image: apache/superset:latest image: apache/superset:latest
+27
View File
@@ -0,0 +1,27 @@
<?xml version="1.0"?>
<configuration>
<property>
<name>fs.s3a.endpoint</name>
<value>http://minio:9000</value>
</property>
<property>
<name>fs.s3a.access.key</name>
<value>minioadmin</value>
</property>
<property>
<name>fs.s3a.secret.key</name>
<value>minioadmin</value>
</property>
<property>
<name>fs.s3a.path.style.access</name>
<value>true</value>
</property>
<property>
<name>fs.s3a.impl</name>
<value>org.apache.hadoop.fs.s3a.S3AFileSystem</value>
</property>
<property>
<name>fs.s3a.connection.ssl.enabled</name>
<value>false</value>
</property>
</configuration>
+27
View File
@@ -0,0 +1,27 @@
<?xml version="1.0"?>
<configuration>
<property>
<name>metastore.thrift.uris</name>
<value>thrift://0.0.0.0:9083</value>
</property>
<property>
<name>metastore.task.threads.always</name>
<value>org.apache.hadoop.hive.metastore.events.EventCleanerTask</value>
</property>
<property>
<name>metastore.expression.proxy</name>
<value>org.apache.hadoop.hive.metastore.DefaultPartitionExpressionProxy</value>
</property>
<property>
<name>javax.jdo.option.ConnectionDriverName</name>
<value>org.apache.derby.jdbc.EmbeddedDriver</value>
</property>
<property>
<name>javax.jdo.option.ConnectionURL</name>
<value>jdbc:derby:/opt/hive/data/metastore_db;create=true</value>
</property>
<property>
<name>metastore.warehouse.dir</name>
<value>s3a://stonks-lakehouse/warehouse</value>
</property>
</configuration>
+23 -1
View File
@@ -6,6 +6,7 @@ metadata:
labels: labels:
app: aggregation-worker app: aggregation-worker
app.kubernetes.io/part-of: stonks-oracle app.kubernetes.io/part-of: stonks-oracle
stonks-oracle/tier: processing
spec: spec:
replicas: 1 replicas: 1
selector: selector:
@@ -15,16 +16,30 @@ spec:
metadata: metadata:
labels: labels:
app: aggregation-worker app: aggregation-worker
stonks-oracle/tier: processing
spec: spec:
automountServiceAccountToken: false
securityContext:
runAsNonRoot: true
runAsUser: 1000
runAsGroup: 1000
fsGroup: 1000
seccompProfile:
type: RuntimeDefault
containers: containers:
- name: aggregation-worker - name: aggregation-worker
image: ghcr.io/celesrenata/stonks-oracle/aggregation:latest image: ghcr.io/celesrenata/stonks-oracle/aggregation:latest
imagePullPolicy: Always imagePullPolicy: Always
securityContext:
allowPrivilegeEscalation: false
readOnlyRootFilesystem: true
capabilities:
drop: ["ALL"]
envFrom: envFrom:
- configMapRef: - configMapRef:
name: stonks-config name: stonks-config
- secretRef: - secretRef:
name: stonks-secrets name: stonks-core-secrets
resources: resources:
requests: requests:
cpu: 100m cpu: 100m
@@ -32,3 +47,10 @@ spec:
limits: limits:
cpu: 500m cpu: 500m
memory: 256Mi memory: 256Mi
volumeMounts:
- name: tmp
mountPath: /tmp
volumes:
- name: tmp
emptyDir:
sizeLimit: 10Mi
+25 -1
View File
@@ -6,6 +6,7 @@ metadata:
labels: labels:
app: broker-adapter app: broker-adapter
app.kubernetes.io/part-of: stonks-oracle app.kubernetes.io/part-of: stonks-oracle
stonks-oracle/tier: trading
spec: spec:
replicas: 1 replicas: 1
selector: selector:
@@ -15,16 +16,32 @@ spec:
metadata: metadata:
labels: labels:
app: broker-adapter app: broker-adapter
stonks-oracle/tier: trading
spec: spec:
automountServiceAccountToken: false
securityContext:
runAsNonRoot: true
runAsUser: 1000
runAsGroup: 1000
fsGroup: 1000
seccompProfile:
type: RuntimeDefault
containers: containers:
- name: broker-adapter - name: broker-adapter
image: ghcr.io/celesrenata/stonks-oracle/broker-adapter:latest image: ghcr.io/celesrenata/stonks-oracle/broker-adapter:latest
imagePullPolicy: Always imagePullPolicy: Always
securityContext:
allowPrivilegeEscalation: false
readOnlyRootFilesystem: true
capabilities:
drop: ["ALL"]
envFrom: envFrom:
- configMapRef: - configMapRef:
name: stonks-config name: stonks-config
- secretRef: - secretRef:
name: stonks-secrets name: stonks-core-secrets
- secretRef:
name: stonks-broker-secrets
resources: resources:
requests: requests:
cpu: 50m cpu: 50m
@@ -32,3 +49,10 @@ spec:
limits: limits:
cpu: 200m cpu: 200m
memory: 128Mi memory: 128Mi
volumeMounts:
- name: tmp
mountPath: /tmp
volumes:
- name: tmp
emptyDir:
sizeLimit: 10Mi
+33
View File
@@ -25,15 +25,48 @@ data:
OLLAMA_BASE_URL: "http://ollama.ollama-service.svc.cluster.local:11434" OLLAMA_BASE_URL: "http://ollama.ollama-service.svc.cluster.local:11434"
OLLAMA_MODEL: "llama3.1:8b" OLLAMA_MODEL: "llama3.1:8b"
OLLAMA_TIMEOUT: "120" OLLAMA_TIMEOUT: "120"
OLLAMA_MAX_RETRIES: "2"
OLLAMA_RETRY_BASE_DELAY: "1.0"
OLLAMA_RETRY_MAX_DELAY: "10.0"
OLLAMA_RETRY_BACKOFF_MULTIPLIER: "2.0"
# Trino — deployed in stonks-oracle namespace # Trino — deployed in stonks-oracle namespace
TRINO_HOST: "trino.stonks-oracle.svc.cluster.local" TRINO_HOST: "trino.stonks-oracle.svc.cluster.local"
TRINO_PORT: "8080" TRINO_PORT: "8080"
TRINO_CATALOG: "lakehouse" TRINO_CATALOG: "lakehouse"
TRINO_SCHEMA: "stonks" TRINO_SCHEMA: "stonks"
TRINO_ICEBERG_CATALOG: "iceberg"
# Broker # Broker
BROKER_MODE: "paper" BROKER_MODE: "paper"
BROKER_PROVIDER: "alpaca"
# Market Data
MARKET_DATA_BASE_URL: "https://api.polygon.io"
MARKET_DATA_PROVIDER: "polygon"
# Retention (days per bucket class)
RETENTION_RAW_MARKET_DAYS: "90"
RETENTION_RAW_NEWS_DAYS: "180"
RETENTION_RAW_FILINGS_DAYS: "365"
RETENTION_NORMALIZED_DAYS: "180"
RETENTION_LLM_PROMPTS_DAYS: "365"
RETENTION_LLM_RESULTS_DAYS: "365"
RETENTION_LAKEHOUSE_DAYS: "730"
RETENTION_AUDIT_DAYS: "730"
RETENTION_CLEANUP_INTERVAL_HOURS: "24"
RETENTION_BATCH_SIZE: "1000"
# General # General
LOG_LEVEL: "INFO" LOG_LEVEL: "INFO"
JSON_LOGS: "true"
# Alerting thresholds
ALERT_SOURCE_FAILURE_THRESHOLD: "3"
ALERT_SOURCE_FAILURE_WINDOW_HOURS: "6"
ALERT_SCHEMA_FAILURE_RATE_THRESHOLD: "0.3"
ALERT_SCHEMA_FAILURE_WINDOW_HOURS: "1"
ALERT_LAKE_LAG_THRESHOLD_MINUTES: "60"
ALERT_BROKER_ERROR_THRESHOLD: "3"
ALERT_BROKER_ERROR_WINDOW_HOURS: "1"
ALERT_CHECK_INTERVAL_SECONDS: "120"
+23 -1
View File
@@ -6,6 +6,7 @@ metadata:
labels: labels:
app: extractor-worker app: extractor-worker
app.kubernetes.io/part-of: stonks-oracle app.kubernetes.io/part-of: stonks-oracle
stonks-oracle/tier: processing
spec: spec:
replicas: 1 replicas: 1
selector: selector:
@@ -15,16 +16,30 @@ spec:
metadata: metadata:
labels: labels:
app: extractor-worker app: extractor-worker
stonks-oracle/tier: processing
spec: spec:
automountServiceAccountToken: false
securityContext:
runAsNonRoot: true
runAsUser: 1000
runAsGroup: 1000
fsGroup: 1000
seccompProfile:
type: RuntimeDefault
containers: containers:
- name: extractor-worker - name: extractor-worker
image: ghcr.io/celesrenata/stonks-oracle/extractor:latest image: ghcr.io/celesrenata/stonks-oracle/extractor:latest
imagePullPolicy: Always imagePullPolicy: Always
securityContext:
allowPrivilegeEscalation: false
readOnlyRootFilesystem: true
capabilities:
drop: ["ALL"]
envFrom: envFrom:
- configMapRef: - configMapRef:
name: stonks-config name: stonks-config
- secretRef: - secretRef:
name: stonks-secrets name: stonks-core-secrets
resources: resources:
requests: requests:
cpu: 200m cpu: 200m
@@ -32,3 +47,10 @@ spec:
limits: limits:
cpu: "1" cpu: "1"
memory: 512Mi memory: 512Mi
volumeMounts:
- name: tmp
mountPath: /tmp
volumes:
- name: tmp
emptyDir:
sizeLimit: 10Mi
+104 -2
View File
@@ -6,6 +6,7 @@ metadata:
labels: labels:
app: hive-metastore app: hive-metastore
app.kubernetes.io/part-of: stonks-oracle app.kubernetes.io/part-of: stonks-oracle
stonks-oracle/tier: analytics
spec: spec:
replicas: 1 replicas: 1
selector: selector:
@@ -15,22 +16,121 @@ spec:
metadata: metadata:
labels: labels:
app: hive-metastore app: hive-metastore
stonks-oracle/tier: analytics
spec: spec:
automountServiceAccountToken: false
securityContext:
runAsNonRoot: true
runAsUser: 1000
runAsGroup: 1000
fsGroup: 1000
seccompProfile:
type: RuntimeDefault
initContainers:
- name: hive-config-init
image: busybox:1.36
securityContext:
allowPrivilegeEscalation: false
capabilities:
drop: ["ALL"]
command: ["sh", "-c"]
args:
- |
cat > /hive-config/core-site.xml <<EOF
<?xml version="1.0"?>
<configuration>
<property>
<name>fs.s3a.endpoint</name>
<value>http://minio.minio-service.svc.cluster.local:80</value>
</property>
<property>
<name>fs.s3a.access.key</name>
<value>${MINIO_ACCESS_KEY}</value>
</property>
<property>
<name>fs.s3a.secret.key</name>
<value>${MINIO_SECRET_KEY}</value>
</property>
<property>
<name>fs.s3a.path.style.access</name>
<value>true</value>
</property>
<property>
<name>fs.s3a.impl</name>
<value>org.apache.hadoop.fs.s3a.S3AFileSystem</value>
</property>
<property>
<name>fs.s3a.connection.ssl.enabled</name>
<value>false</value>
</property>
</configuration>
EOF
cat > /hive-config/metastore-site.xml <<EOF
<?xml version="1.0"?>
<configuration>
<property>
<name>metastore.thrift.uris</name>
<value>thrift://0.0.0.0:9083</value>
</property>
<property>
<name>metastore.task.threads.always</name>
<value>org.apache.hadoop.hive.metastore.events.EventCleanerTask</value>
</property>
<property>
<name>metastore.expression.proxy</name>
<value>org.apache.hadoop.hive.metastore.DefaultPartitionExpressionProxy</value>
</property>
<property>
<name>javax.jdo.option.ConnectionDriverName</name>
<value>org.apache.derby.jdbc.EmbeddedDriver</value>
</property>
<property>
<name>javax.jdo.option.ConnectionURL</name>
<value>jdbc:derby:/opt/hive/data/metastore_db;create=true</value>
</property>
<property>
<name>metastore.warehouse.dir</name>
<value>s3a://stonks-lakehouse/warehouse</value>
</property>
</configuration>
EOF
env:
- name: MINIO_ACCESS_KEY
valueFrom:
secretKeyRef:
name: stonks-core-secrets
key: MINIO_ACCESS_KEY
- name: MINIO_SECRET_KEY
valueFrom:
secretKeyRef:
name: stonks-core-secrets
key: MINIO_SECRET_KEY
volumeMounts:
- name: hive-config
mountPath: /hive-config
containers: containers:
- name: hive-metastore - name: hive-metastore
image: apache/hive:4.0.0 image: apache/hive:4.0.0
ports: ports:
- containerPort: 9083 - containerPort: 9083
securityContext:
allowPrivilegeEscalation: false
capabilities:
drop: ["ALL"]
env: env:
- name: SERVICE_NAME - name: SERVICE_NAME
value: metastore value: metastore
- name: DB_DRIVER - name: DB_DRIVER
value: derby value: derby
- name: SERVICE_OPTS
value: "-Djavax.jdo.option.ConnectionURL=jdbc:derby:/opt/hive/data/metastore_db;create=true"
volumeMounts: volumeMounts:
- name: hive-data - name: hive-data
mountPath: /opt/hive/data mountPath: /opt/hive/data
- name: hive-config
mountPath: /opt/hive/conf/core-site.xml
subPath: core-site.xml
- name: hive-config
mountPath: /opt/hive/conf/metastore-site.xml
subPath: metastore-site.xml
resources: resources:
requests: requests:
cpu: 200m cpu: 200m
@@ -42,6 +142,8 @@ spec:
- name: hive-data - name: hive-data
persistentVolumeClaim: persistentVolumeClaim:
claimName: hive-metastore-data claimName: hive-metastore-data
- name: hive-config
emptyDir: {}
--- ---
apiVersion: v1 apiVersion: v1
kind: Service kind: Service
+25 -1
View File
@@ -6,6 +6,7 @@ metadata:
labels: labels:
app: ingestion-worker app: ingestion-worker
app.kubernetes.io/part-of: stonks-oracle app.kubernetes.io/part-of: stonks-oracle
stonks-oracle/tier: ingestion
spec: spec:
replicas: 2 replicas: 2
selector: selector:
@@ -15,16 +16,32 @@ spec:
metadata: metadata:
labels: labels:
app: ingestion-worker app: ingestion-worker
stonks-oracle/tier: ingestion
spec: spec:
automountServiceAccountToken: false
securityContext:
runAsNonRoot: true
runAsUser: 1000
runAsGroup: 1000
fsGroup: 1000
seccompProfile:
type: RuntimeDefault
containers: containers:
- name: ingestion-worker - name: ingestion-worker
image: ghcr.io/celesrenata/stonks-oracle/ingestion:latest image: ghcr.io/celesrenata/stonks-oracle/ingestion:latest
imagePullPolicy: Always imagePullPolicy: Always
securityContext:
allowPrivilegeEscalation: false
readOnlyRootFilesystem: true
capabilities:
drop: ["ALL"]
envFrom: envFrom:
- configMapRef: - configMapRef:
name: stonks-config name: stonks-config
- secretRef: - secretRef:
name: stonks-secrets name: stonks-core-secrets
- secretRef:
name: stonks-market-secrets
resources: resources:
requests: requests:
cpu: 100m cpu: 100m
@@ -32,3 +49,10 @@ spec:
limits: limits:
cpu: 500m cpu: 500m
memory: 256Mi memory: 256Mi
volumeMounts:
- name: tmp
mountPath: /tmp
volumes:
- name: tmp
emptyDir:
sizeLimit: 10Mi
+23 -1
View File
@@ -6,6 +6,7 @@ metadata:
labels: labels:
app: lake-publisher app: lake-publisher
app.kubernetes.io/part-of: stonks-oracle app.kubernetes.io/part-of: stonks-oracle
stonks-oracle/tier: analytics
spec: spec:
replicas: 1 replicas: 1
selector: selector:
@@ -15,16 +16,30 @@ spec:
metadata: metadata:
labels: labels:
app: lake-publisher app: lake-publisher
stonks-oracle/tier: analytics
spec: spec:
automountServiceAccountToken: false
securityContext:
runAsNonRoot: true
runAsUser: 1000
runAsGroup: 1000
fsGroup: 1000
seccompProfile:
type: RuntimeDefault
containers: containers:
- name: lake-publisher - name: lake-publisher
image: ghcr.io/celesrenata/stonks-oracle/lake-publisher:latest image: ghcr.io/celesrenata/stonks-oracle/lake-publisher:latest
imagePullPolicy: Always imagePullPolicy: Always
securityContext:
allowPrivilegeEscalation: false
readOnlyRootFilesystem: true
capabilities:
drop: ["ALL"]
envFrom: envFrom:
- configMapRef: - configMapRef:
name: stonks-config name: stonks-config
- secretRef: - secretRef:
name: stonks-secrets name: stonks-core-secrets
resources: resources:
requests: requests:
cpu: 100m cpu: 100m
@@ -32,3 +47,10 @@ spec:
limits: limits:
cpu: 500m cpu: 500m
memory: 256Mi memory: 256Mi
volumeMounts:
- name: tmp
mountPath: /tmp
volumes:
- name: tmp
emptyDir:
sizeLimit: 10Mi
+1
View File
@@ -4,3 +4,4 @@ metadata:
name: stonks-oracle name: stonks-oracle
labels: labels:
app.kubernetes.io/part-of: stonks-oracle app.kubernetes.io/part-of: stonks-oracle
kubernetes.io/metadata.name: stonks-oracle
+173
View File
@@ -0,0 +1,173 @@
##
## Stonks Oracle — Network Policies
##
## Default-deny ingress for the namespace, then allow only the
## traffic patterns each component actually needs.
##
## Requirements: 8.2 (trading isolation), 12.1 (observability)
##
# ── Default deny all ingress in the namespace ──────────────────────────
apiVersion: networking.k8s.io/v1
kind: NetworkPolicy
metadata:
name: default-deny-ingress
namespace: stonks-oracle
spec:
podSelector: {}
policyTypes:
- Ingress
---
# ── Query API: accept only from pods in the kube-system namespace (expected home of the Traefik ingress) ──
apiVersion: networking.k8s.io/v1
kind: NetworkPolicy
metadata:
name: allow-query-api-ingress
namespace: stonks-oracle
spec:
podSelector:
matchLabels:
app: query-api
policyTypes:
- Ingress
ingress:
- from:
- namespaceSelector:
matchLabels:
kubernetes.io/metadata.name: kube-system
ports:
- protocol: TCP
port: 8000
---
# ── Symbol Registry API: accept only from pods in the kube-system namespace (expected home of the Traefik ingress) ──
apiVersion: networking.k8s.io/v1
kind: NetworkPolicy
metadata:
name: allow-symbol-registry-ingress
namespace: stonks-oracle
spec:
podSelector:
matchLabels:
app: symbol-registry-api
policyTypes:
- Ingress
ingress:
- from:
- namespaceSelector:
matchLabels:
kubernetes.io/metadata.name: kube-system
ports:
- protocol: TCP
port: 8000
---
# ── Risk Engine: accept from broker-adapter and query-api only ─────────
apiVersion: networking.k8s.io/v1
kind: NetworkPolicy
metadata:
name: allow-risk-engine-ingress
namespace: stonks-oracle
spec:
podSelector:
matchLabels:
app: risk-engine
policyTypes:
- Ingress
ingress:
- from:
- podSelector:
matchLabels:
app: broker-adapter
- podSelector:
matchLabels:
app: query-api
ports:
- protocol: TCP
port: 8000
---
# ── Superset: accept only from pods in the kube-system namespace (expected home of the Traefik ingress) ──
apiVersion: networking.k8s.io/v1
kind: NetworkPolicy
metadata:
name: allow-superset-ingress
namespace: stonks-oracle
spec:
podSelector:
matchLabels:
app: superset
policyTypes:
- Ingress
ingress:
- from:
- namespaceSelector:
matchLabels:
kubernetes.io/metadata.name: kube-system
ports:
- protocol: TCP
port: 8088
---
# ── Trino: accept from Superset, query-api, and the kube-system namespace ──
apiVersion: networking.k8s.io/v1
kind: NetworkPolicy
metadata:
name: allow-trino-ingress
namespace: stonks-oracle
spec:
podSelector:
matchLabels:
app: trino
policyTypes:
- Ingress
ingress:
- from:
- podSelector:
matchLabels:
app: superset
- podSelector:
matchLabels:
app: query-api
- namespaceSelector:
matchLabels:
kubernetes.io/metadata.name: kube-system
ports:
- protocol: TCP
port: 8080
---
# ── Hive Metastore: accept from Trino and lake-publisher ──────────────
apiVersion: networking.k8s.io/v1
kind: NetworkPolicy
metadata:
name: allow-hive-metastore-ingress
namespace: stonks-oracle
spec:
podSelector:
matchLabels:
app: hive-metastore
policyTypes:
- Ingress
ingress:
- from:
- podSelector:
matchLabels:
app: trino
- podSelector:
matchLabels:
app: lake-publisher
ports:
- protocol: TCP
port: 9083
---
# ── Broker adapter: isolated — no inbound from other pods ──────────────
# The broker-adapter only makes outbound calls to the broker API
# and reads from Redis queues. No pod needs to call into it.
apiVersion: networking.k8s.io/v1
kind: NetworkPolicy
metadata:
name: deny-broker-adapter-ingress
namespace: stonks-oracle
spec:
podSelector:
matchLabels:
app: broker-adapter
policyTypes:
- Ingress
ingress: []
+23 -1
View File
@@ -6,6 +6,7 @@ metadata:
labels: labels:
app: parser-worker app: parser-worker
app.kubernetes.io/part-of: stonks-oracle app.kubernetes.io/part-of: stonks-oracle
stonks-oracle/tier: processing
spec: spec:
replicas: 2 replicas: 2
selector: selector:
@@ -15,16 +16,30 @@ spec:
metadata: metadata:
labels: labels:
app: parser-worker app: parser-worker
stonks-oracle/tier: processing
spec: spec:
automountServiceAccountToken: false
securityContext:
runAsNonRoot: true
runAsUser: 1000
runAsGroup: 1000
fsGroup: 1000
seccompProfile:
type: RuntimeDefault
containers: containers:
- name: parser-worker - name: parser-worker
image: ghcr.io/celesrenata/stonks-oracle/parser:latest image: ghcr.io/celesrenata/stonks-oracle/parser:latest
imagePullPolicy: Always imagePullPolicy: Always
securityContext:
allowPrivilegeEscalation: false
readOnlyRootFilesystem: true
capabilities:
drop: ["ALL"]
envFrom: envFrom:
- configMapRef: - configMapRef:
name: stonks-config name: stonks-config
- secretRef: - secretRef:
name: stonks-secrets name: stonks-core-secrets
resources: resources:
requests: requests:
cpu: 100m cpu: 100m
@@ -32,3 +47,10 @@ spec:
limits: limits:
cpu: 500m cpu: 500m
memory: 256Mi memory: 256Mi
volumeMounts:
- name: tmp
mountPath: /tmp
volumes:
- name: tmp
emptyDir:
sizeLimit: 10Mi
+23 -1
View File
@@ -6,6 +6,7 @@ metadata:
labels: labels:
app: query-api app: query-api
app.kubernetes.io/part-of: stonks-oracle app.kubernetes.io/part-of: stonks-oracle
stonks-oracle/tier: api
spec: spec:
replicas: 1 replicas: 1
selector: selector:
@@ -15,18 +16,32 @@ spec:
metadata: metadata:
labels: labels:
app: query-api app: query-api
stonks-oracle/tier: api
spec: spec:
automountServiceAccountToken: false
securityContext:
runAsNonRoot: true
runAsUser: 1000
runAsGroup: 1000
fsGroup: 1000
seccompProfile:
type: RuntimeDefault
containers: containers:
- name: query-api - name: query-api
image: ghcr.io/celesrenata/stonks-oracle/query-api:latest image: ghcr.io/celesrenata/stonks-oracle/query-api:latest
imagePullPolicy: Always imagePullPolicy: Always
ports: ports:
- containerPort: 8000 - containerPort: 8000
securityContext:
allowPrivilegeEscalation: false
readOnlyRootFilesystem: true
capabilities:
drop: ["ALL"]
envFrom: envFrom:
- configMapRef: - configMapRef:
name: stonks-config name: stonks-config
- secretRef: - secretRef:
name: stonks-secrets name: stonks-core-secrets
resources: resources:
requests: requests:
cpu: 100m cpu: 100m
@@ -40,6 +55,13 @@ spec:
port: 8000 port: 8000
initialDelaySeconds: 5 initialDelaySeconds: 5
periodSeconds: 10 periodSeconds: 10
volumeMounts:
- name: tmp
mountPath: /tmp
volumes:
- name: tmp
emptyDir:
sizeLimit: 10Mi
--- ---
apiVersion: v1 apiVersion: v1
kind: Service kind: Service
+23 -1
View File
@@ -6,6 +6,7 @@ metadata:
labels: labels:
app: recommendation-worker app: recommendation-worker
app.kubernetes.io/part-of: stonks-oracle app.kubernetes.io/part-of: stonks-oracle
stonks-oracle/tier: processing
spec: spec:
replicas: 1 replicas: 1
selector: selector:
@@ -15,16 +16,30 @@ spec:
metadata: metadata:
labels: labels:
app: recommendation-worker app: recommendation-worker
stonks-oracle/tier: processing
spec: spec:
automountServiceAccountToken: false
securityContext:
runAsNonRoot: true
runAsUser: 1000
runAsGroup: 1000
fsGroup: 1000
seccompProfile:
type: RuntimeDefault
containers: containers:
- name: recommendation-worker - name: recommendation-worker
image: ghcr.io/celesrenata/stonks-oracle/recommendation:latest image: ghcr.io/celesrenata/stonks-oracle/recommendation:latest
imagePullPolicy: Always imagePullPolicy: Always
securityContext:
allowPrivilegeEscalation: false
readOnlyRootFilesystem: true
capabilities:
drop: ["ALL"]
envFrom: envFrom:
- configMapRef: - configMapRef:
name: stonks-config name: stonks-config
- secretRef: - secretRef:
name: stonks-secrets name: stonks-core-secrets
resources: resources:
requests: requests:
cpu: 100m cpu: 100m
@@ -32,3 +47,10 @@ spec:
limits: limits:
cpu: 500m cpu: 500m
memory: 256Mi memory: 256Mi
volumeMounts:
- name: tmp
mountPath: /tmp
volumes:
- name: tmp
emptyDir:
sizeLimit: 10Mi
+25 -1
View File
@@ -6,6 +6,7 @@ metadata:
labels: labels:
app: risk-engine app: risk-engine
app.kubernetes.io/part-of: stonks-oracle app.kubernetes.io/part-of: stonks-oracle
stonks-oracle/tier: trading
spec: spec:
replicas: 1 replicas: 1
selector: selector:
@@ -15,18 +16,34 @@ spec:
metadata: metadata:
labels: labels:
app: risk-engine app: risk-engine
stonks-oracle/tier: trading
spec: spec:
automountServiceAccountToken: false
securityContext:
runAsNonRoot: true
runAsUser: 1000
runAsGroup: 1000
fsGroup: 1000
seccompProfile:
type: RuntimeDefault
containers: containers:
- name: risk-engine - name: risk-engine
image: ghcr.io/celesrenata/stonks-oracle/risk:latest image: ghcr.io/celesrenata/stonks-oracle/risk:latest
imagePullPolicy: Always imagePullPolicy: Always
ports: ports:
- containerPort: 8000 - containerPort: 8000
securityContext:
allowPrivilegeEscalation: false
readOnlyRootFilesystem: true
capabilities:
drop: ["ALL"]
envFrom: envFrom:
- configMapRef: - configMapRef:
name: stonks-config name: stonks-config
- secretRef: - secretRef:
name: stonks-secrets name: stonks-core-secrets
- secretRef:
name: stonks-broker-secrets
resources: resources:
requests: requests:
cpu: 100m cpu: 100m
@@ -34,6 +51,13 @@ spec:
limits: limits:
cpu: 500m cpu: 500m
memory: 256Mi memory: 256Mi
volumeMounts:
- name: tmp
mountPath: /tmp
volumes:
- name: tmp
emptyDir:
sizeLimit: 10Mi
--- ---
apiVersion: v1 apiVersion: v1
kind: Service kind: Service
+23 -1
View File
@@ -6,6 +6,7 @@ metadata:
labels: labels:
app: scheduler app: scheduler
app.kubernetes.io/part-of: stonks-oracle app.kubernetes.io/part-of: stonks-oracle
stonks-oracle/tier: orchestration
spec: spec:
replicas: 1 replicas: 1
selector: selector:
@@ -15,16 +16,30 @@ spec:
metadata: metadata:
labels: labels:
app: scheduler app: scheduler
stonks-oracle/tier: orchestration
spec: spec:
automountServiceAccountToken: false
securityContext:
runAsNonRoot: true
runAsUser: 1000
runAsGroup: 1000
fsGroup: 1000
seccompProfile:
type: RuntimeDefault
containers: containers:
- name: scheduler - name: scheduler
image: ghcr.io/celesrenata/stonks-oracle/scheduler:latest image: ghcr.io/celesrenata/stonks-oracle/scheduler:latest
imagePullPolicy: Always imagePullPolicy: Always
securityContext:
allowPrivilegeEscalation: false
readOnlyRootFilesystem: true
capabilities:
drop: ["ALL"]
envFrom: envFrom:
- configMapRef: - configMapRef:
name: stonks-config name: stonks-config
- secretRef: - secretRef:
name: stonks-secrets name: stonks-core-secrets
resources: resources:
requests: requests:
cpu: 50m cpu: 50m
@@ -32,3 +47,10 @@ spec:
limits: limits:
cpu: 200m cpu: 200m
memory: 128Mi memory: 128Mi
volumeMounts:
- name: tmp
mountPath: /tmp
volumes:
- name: tmp
emptyDir:
sizeLimit: 10Mi
+54 -8
View File
@@ -1,17 +1,63 @@
##
## Stonks Oracle — Scoped Secrets
##
## Secrets are split by concern so that only the services that need
## broker or market-data credentials actually receive them.
## Replace placeholder values before deploying.
##
## Requirements: 8.2 (broker credential isolation)
##
# ── Core infrastructure secrets (DB, object store, cache) ──────────────
apiVersion: v1 apiVersion: v1
kind: Secret kind: Secret
metadata: metadata:
name: stonks-secrets name: stonks-core-secrets
namespace: stonks-oracle namespace: stonks-oracle
labels: labels:
app.kubernetes.io/part-of: stonks-oracle app.kubernetes.io/part-of: stonks-oracle
type: Opaque type: Opaque
stringData: stringData:
POSTGRES_PASSWORD: "changeme" POSTGRES_PASSWORD: "REPLACE_ME"
MINIO_ACCESS_KEY: "changeme" MINIO_ACCESS_KEY: "REPLACE_ME"
MINIO_SECRET_KEY: "changeme" MINIO_SECRET_KEY: "REPLACE_ME"
REDIS_PASSWORD: "" REDIS_PASSWORD: ""
BROKER_API_KEY: "" ---
BROKER_API_SECRET: "" # ── Broker secrets — only for broker-adapter and risk-engine ───────────
BROKER_BASE_URL: "" apiVersion: v1
SUPERSET_SECRET_KEY: "stonks-superset-secret-change-me" kind: Secret
metadata:
name: stonks-broker-secrets
namespace: stonks-oracle
labels:
app.kubernetes.io/part-of: stonks-oracle
type: Opaque
stringData:
BROKER_API_KEY: "REPLACE_ME"
BROKER_API_SECRET: "REPLACE_ME"
BROKER_BASE_URL: "https://paper-api.alpaca.markets"
---
# ── Market data secrets — only for ingestion and adapters ──────────────
apiVersion: v1
kind: Secret
metadata:
name: stonks-market-secrets
namespace: stonks-oracle
labels:
app.kubernetes.io/part-of: stonks-oracle
type: Opaque
stringData:
MARKET_DATA_API_KEY: "REPLACE_ME"
---
# ── Dashboard secrets — only for Superset ──────────────────────────────
apiVersion: v1
kind: Secret
metadata:
name: stonks-dashboard-secrets
namespace: stonks-oracle
labels:
app.kubernetes.io/part-of: stonks-oracle
type: Opaque
stringData:
SUPERSET_SECRET_KEY: "REPLACE_ME"
SUPERSET_ADMIN_PASSWORD: "REPLACE_ME"
+47 -3
View File
@@ -6,6 +6,7 @@ metadata:
labels: labels:
app: superset app: superset
app.kubernetes.io/part-of: stonks-oracle app.kubernetes.io/part-of: stonks-oracle
stonks-oracle/tier: dashboard
spec: spec:
replicas: 1 replicas: 1
selector: selector:
@@ -15,22 +16,38 @@ spec:
metadata: metadata:
labels: labels:
app: superset app: superset
stonks-oracle/tier: dashboard
spec: spec:
automountServiceAccountToken: false
securityContext:
runAsNonRoot: true
runAsUser: 1000
runAsGroup: 1000
fsGroup: 1000
seccompProfile:
type: RuntimeDefault
containers: containers:
- name: superset - name: superset
image: apache/superset:latest image: apache/superset:latest
ports: ports:
- containerPort: 8088 - containerPort: 8088
securityContext:
allowPrivilegeEscalation: false
capabilities:
drop: ["ALL"]
env: env:
- name: SUPERSET_SECRET_KEY - name: SUPERSET_SECRET_KEY
valueFrom: valueFrom:
secretKeyRef: secretKeyRef:
name: stonks-secrets name: stonks-dashboard-secrets
key: SUPERSET_SECRET_KEY key: SUPERSET_SECRET_KEY
- name: ADMIN_USERNAME - name: ADMIN_USERNAME
value: admin value: admin
- name: ADMIN_PASSWORD - name: ADMIN_PASSWORD
value: admin valueFrom:
secretKeyRef:
name: stonks-dashboard-secrets
key: SUPERSET_ADMIN_PASSWORD
- name: ADMIN_EMAIL - name: ADMIN_EMAIL
value: admin@stonks.local value: admin@stonks.local
volumeMounts: volumeMounts:
@@ -94,12 +111,39 @@ data:
import os import os
SECRET_KEY = os.getenv("SUPERSET_SECRET_KEY", "stonks-dev-secret-key-change-me") SECRET_KEY = os.getenv("SUPERSET_SECRET_KEY", "stonks-dev-secret-key-change-me")
SQLALCHEMY_DATABASE_URI = "trino://trino@trino.stonks-oracle.svc.cluster.local:8080/lakehouse/stonks" SQLALCHEMY_DATABASE_URI = "trino://trino@trino.stonks-oracle.svc.cluster.local:8080/lakehouse/stonks"
# Additional database connections available in Superset UI:
# Hive catalog: trino://trino@trino.stonks-oracle.svc.cluster.local:8080/lakehouse/stonks
# Iceberg catalog: trino://trino@trino.stonks-oracle.svc.cluster.local:8080/iceberg/stonks
FEATURE_FLAGS = {"ENABLE_TEMPLATE_PROCESSING": True} FEATURE_FLAGS = {"ENABLE_TEMPLATE_PROCESSING": True}
CACHE_CONFIG = { CACHE_CONFIG = {
"CACHE_TYPE": "RedisCache", "CACHE_TYPE": "RedisCache",
"CACHE_DEFAULT_TIMEOUT": 300, "CACHE_DEFAULT_TIMEOUT": 300,
"CACHE_KEY_PREFIX": "superset_", "CACHE_KEY_PREFIX": "superset_",
"CACHE_REDIS_HOST": os.getenv("REDIS_HOST", "redis.redis-service.svc.cluster.local"), "CACHE_REDIS_HOST": os.getenv("REDIS_HOST", "redis-master.redis-service.svc.cluster.local"),
"CACHE_REDIS_PORT": int(os.getenv("REDIS_PORT", "6379")), "CACHE_REDIS_PORT": int(os.getenv("REDIS_PORT", "6379")),
"CACHE_REDIS_DB": 1, "CACHE_REDIS_DB": 1,
} }
# --- Security hardening ---
# Disable public user role (require login)
PUBLIC_ROLE_LIKE = None
# Session cookie security
SESSION_COOKIE_HTTPONLY = True
SESSION_COOKIE_SECURE = True
SESSION_COOKIE_SAMESITE = "Lax"
# Talisman CSP headers
TALISMAN_ENABLED = True
TALISMAN_CONFIG = {
"content_security_policy": {
"default-src": ["'self'"],
"img-src": ["'self'", "data:"],
"style-src": ["'self'", "'unsafe-inline'"],
"script-src": ["'self'", "'unsafe-inline'", "'unsafe-eval'"],
},
"force_https": False, # TLS terminated at ingress
}
# Reject database connections that Superset classifies as unsafe (e.g. local SQLite files)
PREVENT_UNSAFE_DB_CONNECTIONS = True
# Row limit for queries
ROW_LIMIT = 50000
SQL_MAX_ROW = 100000
+23 -1
View File
@@ -6,6 +6,7 @@ metadata:
labels: labels:
app: symbol-registry-api app: symbol-registry-api
app.kubernetes.io/part-of: stonks-oracle app.kubernetes.io/part-of: stonks-oracle
stonks-oracle/tier: api
spec: spec:
replicas: 1 replicas: 1
selector: selector:
@@ -15,18 +16,32 @@ spec:
metadata: metadata:
labels: labels:
app: symbol-registry-api app: symbol-registry-api
stonks-oracle/tier: api
spec: spec:
automountServiceAccountToken: false
securityContext:
runAsNonRoot: true
runAsUser: 1000
runAsGroup: 1000
fsGroup: 1000
seccompProfile:
type: RuntimeDefault
containers: containers:
- name: symbol-registry-api - name: symbol-registry-api
image: ghcr.io/celesrenata/stonks-oracle/symbol-registry:latest image: ghcr.io/celesrenata/stonks-oracle/symbol-registry:latest
imagePullPolicy: Always imagePullPolicy: Always
ports: ports:
- containerPort: 8000 - containerPort: 8000
securityContext:
allowPrivilegeEscalation: false
readOnlyRootFilesystem: true
capabilities:
drop: ["ALL"]
envFrom: envFrom:
- configMapRef: - configMapRef:
name: stonks-config name: stonks-config
- secretRef: - secretRef:
name: stonks-secrets name: stonks-core-secrets
resources: resources:
requests: requests:
cpu: 100m cpu: 100m
@@ -46,6 +61,13 @@ spec:
port: 8000 port: 8000
initialDelaySeconds: 10 initialDelaySeconds: 10
periodSeconds: 30 periodSeconds: 30
volumeMounts:
- name: tmp
mountPath: /tmp
volumes:
- name: tmp
emptyDir:
sizeLimit: 10Mi
--- ---
apiVersion: v1 apiVersion: v1
kind: Service kind: Service
+63 -26
View File
@@ -6,6 +6,7 @@ metadata:
labels: labels:
app: trino app: trino
app.kubernetes.io/part-of: stonks-oracle app.kubernetes.io/part-of: stonks-oracle
stonks-oracle/tier: analytics
spec: spec:
replicas: 1 replicas: 1
selector: selector:
@@ -15,12 +16,73 @@ spec:
metadata: metadata:
labels: labels:
app: trino app: trino
stonks-oracle/tier: analytics
spec: spec:
automountServiceAccountToken: false
securityContext:
runAsNonRoot: true
runAsUser: 1000
runAsGroup: 1000
fsGroup: 1000
seccompProfile:
type: RuntimeDefault
initContainers:
- name: catalog-init
image: busybox:1.36
securityContext:
allowPrivilegeEscalation: false
capabilities:
drop: ["ALL"]
command: ["sh", "-c"]
args:
- |
cat > /catalog/iceberg.properties <<EOF
connector.name=iceberg
iceberg.catalog.type=hive_metastore
hive.metastore.uri=thrift://hive-metastore.stonks-oracle.svc.cluster.local:9083
hive.s3.endpoint=http://minio.minio-service.svc.cluster.local:80
hive.s3.path-style-access=true
hive.s3.aws-access-key=${MINIO_ACCESS_KEY}
hive.s3.aws-secret-key=${MINIO_SECRET_KEY}
fs.native-s3.enabled=true
s3.endpoint=http://minio.minio-service.svc.cluster.local:80
s3.path-style-access=true
s3.aws-access-key=${MINIO_ACCESS_KEY}
s3.aws-secret-key=${MINIO_SECRET_KEY}
EOF
cat > /catalog/lakehouse.properties <<EOF
connector.name=hive
hive.metastore.uri=thrift://hive-metastore.stonks-oracle.svc.cluster.local:9083
hive.s3.endpoint=http://minio.minio-service.svc.cluster.local:80
hive.s3.path-style-access=true
hive.s3.aws-access-key=${MINIO_ACCESS_KEY}
hive.s3.aws-secret-key=${MINIO_SECRET_KEY}
hive.non-managed-table-writes-enabled=true
hive.s3select-pushdown.enabled=true
EOF
env:
- name: MINIO_ACCESS_KEY
valueFrom:
secretKeyRef:
name: stonks-core-secrets
key: MINIO_ACCESS_KEY
- name: MINIO_SECRET_KEY
valueFrom:
secretKeyRef:
name: stonks-core-secrets
key: MINIO_SECRET_KEY
volumeMounts:
- name: catalog-config
mountPath: /catalog
containers: containers:
- name: trino - name: trino
image: trinodb/trino:latest image: trinodb/trino:latest
ports: ports:
- containerPort: 8080 - containerPort: 8080
securityContext:
allowPrivilegeEscalation: false
capabilities:
drop: ["ALL"]
volumeMounts: volumeMounts:
- name: catalog-config - name: catalog-config
mountPath: /etc/trino/catalog mountPath: /etc/trino/catalog
@@ -39,8 +101,7 @@ spec:
periodSeconds: 10 periodSeconds: 10
volumes: volumes:
- name: catalog-config - name: catalog-config
configMap: emptyDir: {}
name: trino-catalog
--- ---
apiVersion: v1 apiVersion: v1
kind: Service kind: Service
@@ -53,27 +114,3 @@ spec:
ports: ports:
- port: 8080 - port: 8080
targetPort: 8080 targetPort: 8080
---
apiVersion: v1
kind: ConfigMap
metadata:
name: trino-catalog
namespace: stonks-oracle
data:
iceberg.properties: |
connector.name=iceberg
iceberg.catalog.type=hive_metastore
hive.metastore.uri=thrift://hive-metastore.stonks-oracle.svc.cluster.local:9083
hive.s3.endpoint=http://minio.minio-service.svc.cluster.local:80
hive.s3.path-style-access=true
hive.s3.aws-access-key=changeme
hive.s3.aws-secret-key=changeme
lakehouse.properties: |
connector.name=hive
hive.metastore.uri=thrift://hive-metastore.stonks-oracle.svc.cluster.local:9083
hive.s3.endpoint=http://minio.minio-service.svc.cluster.local:80
hive.s3.path-style-access=true
hive.s3.aws-access-key=changeme
hive.s3.aws-secret-key=changeme
hive.non-managed-table-writes-enabled=true
hive.s3select-pushdown.enabled=true
+13
View File
@@ -0,0 +1,13 @@
-- Stonks Oracle - Dedupe support indexes
--
-- Both statements use IF NOT EXISTS so the migration can be re-applied
-- without failing on an index that is already present.

-- Index on canonical_url for cross-source deduplication lookups.
-- The dedupe module queries documents by canonical_url to detect
-- the same article ingested from different source types.
-- Partial index: rows whose canonical_url is NULL are not indexed.
CREATE INDEX IF NOT EXISTS idx_documents_canonical_url
    ON documents(canonical_url)
    WHERE canonical_url IS NOT NULL;

-- Unique constraint on document_company_mentions to prevent duplicate
-- company links when cross-source dedupe links an existing document
-- to an additional company.
-- NOTE: building this index fails if duplicate (document_id, company_id)
-- pairs already exist in the table; remove them before applying.
CREATE UNIQUE INDEX IF NOT EXISTS idx_doc_mentions_unique
    ON document_company_mentions(document_id, company_id);
@@ -0,0 +1,5 @@
-- Stonks Oracle - documents.parser_output_ref
-- New column holding the MinIO reference to the structured parser
-- output JSON (metadata, quality signals, warnings, outbound links,
-- tags, etc.). IF NOT EXISTS keeps the statement safe to re-run.
ALTER TABLE documents
    ADD COLUMN IF NOT EXISTS parser_output_ref VARCHAR(1000);
@@ -0,0 +1,40 @@
-- Stonks Oracle - Model Performance Metrics
-- Tracks extraction success/failure rates, latency, retries, confidence,
-- token usage estimates, and validation error distributions.
-- Requirements: 5.2, 5.4, 12.1, 12.2
--
-- Every CREATE below uses IF NOT EXISTS so the migration can be re-applied
-- without failing on objects that already exist.

CREATE TABLE IF NOT EXISTS model_performance_metrics (
    id UUID PRIMARY KEY DEFAULT uuid_generate_v4(),
    -- Metric rows outlive their document: the reference is nulled on delete.
    document_id UUID REFERENCES documents(id) ON DELETE SET NULL,
    ticker VARCHAR(20),
    model_name VARCHAR(200) NOT NULL,
    prompt_version VARCHAR(100),
    schema_version VARCHAR(50),
    success BOOLEAN NOT NULL,
    attempt_count INTEGER NOT NULL DEFAULT 1,
    total_duration_ms INTEGER NOT NULL DEFAULT 0,
    first_attempt_duration_ms INTEGER DEFAULT 0,
    final_attempt_duration_ms INTEGER DEFAULT 0,
    confidence FLOAT DEFAULT 0.0,
    validation_status VARCHAR(50) NOT NULL DEFAULT 'unknown',
    validation_error_count INTEGER DEFAULT 0,
    validation_warning_count INTEGER DEFAULT 0,
    validation_errors JSONB DEFAULT '[]',
    retry_count INTEGER DEFAULT 0,
    input_token_estimate INTEGER DEFAULT 0,
    output_token_estimate INTEGER DEFAULT 0,
    company_count INTEGER DEFAULT 0,
    recorded_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
);

-- Query by time range (dashboard primary access pattern)
CREATE INDEX IF NOT EXISTS idx_model_perf_recorded
    ON model_performance_metrics(recorded_at DESC);

-- Filter by model for per-model dashboards
CREATE INDEX IF NOT EXISTS idx_model_perf_model
    ON model_performance_metrics(model_name, recorded_at DESC);

-- Filter by success for failure analysis
CREATE INDEX IF NOT EXISTS idx_model_perf_success
    ON model_performance_metrics(success, recorded_at DESC);

-- Filter by validation status for schema failure dashboards
CREATE INDEX IF NOT EXISTS idx_model_perf_validation
    ON model_performance_metrics(validation_status);
@@ -0,0 +1,8 @@
-- Stonks Oracle - trend_windows.disagreement_details
-- JSONB column (default: empty array) carrying structured
-- contradiction/disagreement representations, so downstream consumers
-- can inspect *why* signals conflict instead of relying on a single
-- scalar contradiction_score.
-- Requirements: 6.4
ALTER TABLE trend_windows ADD COLUMN IF NOT EXISTS disagreement_details JSONB DEFAULT '[]';
+23
View File
@@ -0,0 +1,23 @@
-- Stonks Oracle - Trend evidence mappings
-- Links trend_windows to the documents that contributed as evidence,
-- storing the evidence type (supporting/opposing), rank score, and
-- weight breakdown for explainability and drill-down queries.
-- Requirements: 6.5, 10.4
--
-- Every CREATE below uses IF NOT EXISTS so the migration can be re-applied
-- without failing on objects that already exist.

CREATE TABLE IF NOT EXISTS trend_evidence (
    id UUID PRIMARY KEY DEFAULT uuid_generate_v4(),
    -- Evidence rows are removed together with their trend window.
    trend_window_id UUID NOT NULL REFERENCES trend_windows(id) ON DELETE CASCADE,
    -- NOTE(review): no REFERENCES clause here, unlike trend_window_id —
    -- confirm that leaving document_id unconstrained is intentional.
    document_id UUID NOT NULL,
    evidence_type VARCHAR(20) NOT NULL DEFAULT 'supporting', -- supporting | opposing
    rank_score FLOAT DEFAULT 0.0,
    weight_component FLOAT DEFAULT 0.0,
    impact_component FLOAT DEFAULT 0.0,
    recency_component FLOAT DEFAULT 0.0,
    confidence_component FLOAT DEFAULT 0.0,
    sentiment_value FLOAT DEFAULT 0.0,
    created_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
);

-- Lookup of all evidence for one trend window.
CREATE INDEX IF NOT EXISTS idx_trend_evidence_trend
    ON trend_evidence(trend_window_id);

-- Reverse lookup: which trend windows a document contributed to.
CREATE INDEX IF NOT EXISTS idx_trend_evidence_doc
    ON trend_evidence(document_id);

-- Evidence for one trend window narrowed by supporting/opposing type.
CREATE INDEX IF NOT EXISTS idx_trend_evidence_type
    ON trend_evidence(trend_window_id, evidence_type);
@@ -0,0 +1,15 @@
-- Stonks Oracle - Recommendation persistence enhancements
-- Adds full model metadata columns to recommendations table
-- and a risk_classification column for the computed risk label.
-- Requirements: 7.1, 7.2, 8.3
-- Store full model provenance on the recommendation itself
ALTER TABLE recommendations
ADD COLUMN IF NOT EXISTS model_provider VARCHAR(100) DEFAULT 'deterministic',
ADD COLUMN IF NOT EXISTS prompt_version VARCHAR(100) DEFAULT '',
ADD COLUMN IF NOT EXISTS schema_version VARCHAR(50) DEFAULT '1.0.0',
ADD COLUMN IF NOT EXISTS risk_classification VARCHAR(20) DEFAULT 'moderate';
-- Index for querying recommendations by risk classification
CREATE INDEX IF NOT EXISTS idx_recommendations_risk
ON recommendations(risk_classification);
@@ -0,0 +1,55 @@
-- Stonks Oracle - Portfolio and account risk configuration
-- Persists risk configuration profiles and tracks risk state snapshots.
-- Requirements: 8.1, 8.2, 8.4
-- ============================================================
-- Risk Configuration Profiles
-- ============================================================
-- Named risk configuration profiles. The risk settings themselves are stored
-- as a JSONB document in "config"; this table does not constrain its shape.
-- NOTE(review): assumes the uuid-ossp extension (uuid_generate_v4) was enabled
-- by an earlier migration — confirm.
-- NOTE(review): updated_at only receives a default at insert time; no trigger
-- is defined here, so writers presumably set it on UPDATE — confirm.
CREATE TABLE risk_configs (
id UUID PRIMARY KEY DEFAULT uuid_generate_v4(),
name VARCHAR(200) NOT NULL DEFAULT 'default',
trading_mode VARCHAR(20) NOT NULL DEFAULT 'paper',
config JSONB NOT NULL DEFAULT '{}',
active BOOLEAN NOT NULL DEFAULT TRUE,
created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
updated_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
);
-- Partial unique index: at most one ACTIVE profile per name. Inactive rows are
-- outside the index, so any number of them may share a name.
CREATE UNIQUE INDEX idx_risk_configs_active_name
ON risk_configs(name) WHERE active = TRUE;
-- ============================================================
-- Symbol-level lockouts (news-shock, cooldown)
-- ============================================================
-- Time-boxed lockouts per ticker. Every row carries its own expiry
-- (expires_at); nothing in this migration removes expired rows.
-- NOTE(review): lockout_type is free text with no CHECK constraint; the
-- section header suggests news-shock and cooldown lockouts — verify the
-- exact values against the writers.
CREATE TABLE symbol_lockouts (
id UUID PRIMARY KEY DEFAULT uuid_generate_v4(),
ticker VARCHAR(20) NOT NULL,
lockout_type VARCHAR(50) NOT NULL,
reason TEXT DEFAULT '',
expires_at TIMESTAMPTZ NOT NULL,
created_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
);
-- Suits per-ticker checks that also filter on expiry.
CREATE INDEX idx_symbol_lockouts_ticker ON symbol_lockouts(ticker, expires_at);
-- Suits scans by expiry alone, across all tickers.
CREATE INDEX idx_symbol_lockouts_expiry ON symbol_lockouts(expires_at);
-- ============================================================
-- Daily risk snapshots (for daily loss tracking)
-- ============================================================
-- One snapshot row per account per calendar day, enforced by
-- UNIQUE(account_id, snapshot_date); this is the natural ON CONFLICT target
-- for daily upserts.
-- snapshot_date defaults to CURRENT_DATE, which is evaluated in the session
-- time zone at insert time.
CREATE TABLE daily_risk_snapshots (
id UUID PRIMARY KEY DEFAULT uuid_generate_v4(),
account_id VARCHAR(200) NOT NULL,
snapshot_date DATE NOT NULL DEFAULT CURRENT_DATE,
portfolio_value NUMERIC DEFAULT 0,
daily_pnl NUMERIC DEFAULT 0,
daily_trade_count INTEGER DEFAULT 0,
positions_by_sector JSONB DEFAULT '{}',
created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
updated_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
UNIQUE(account_id, snapshot_date)
);
-- Newest-first lookup per account. The UNIQUE constraint above is already
-- backed by an index on the same two columns (ascending date).
CREATE INDEX idx_daily_risk_account ON daily_risk_snapshots(account_id, snapshot_date DESC);
@@ -0,0 +1,7 @@
-- Stonks Oracle - Add unique constraint for paper trading position upserts
-- Requirements: 8.1, 8.3
-- The paper trading adapter needs to upsert positions by (broker_account_id, ticker).
-- Add a unique constraint to support ON CONFLICT.
CREATE UNIQUE INDEX IF NOT EXISTS idx_positions_account_ticker
ON positions(broker_account_id, ticker);
@@ -0,0 +1,17 @@
-- Stonks Oracle - Execution audit trail indexes
-- Supports efficient querying of the full decision chain from
-- recommendation through risk evaluation to broker execution.
-- Requirements: 8.3, 11.3
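-- GIN index on audit_events.data for JSONB containment and key-existence queries
-- (e.g. data @> '{"recommendation_id": "..."}' or data ? 'order_id').
-- The default jsonb GIN operator class does not accelerate
-- data->>'key' = '...' comparisons; write lookups with @> to use this index.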
CREATE INDEX IF NOT EXISTS idx_audit_events_data_gin
ON audit_events USING gin (data);
-- Index for chronological audit trail queries by entity
CREATE INDEX IF NOT EXISTS idx_audit_events_entity_created
ON audit_events (entity_id, created_at ASC);
-- Index for filtering by event_type + entity_type
CREATE INDEX IF NOT EXISTS idx_audit_events_type_entity
ON audit_events (event_type, entity_type);
@@ -0,0 +1,29 @@
-- Stonks Oracle - Operator approval workflow for live trading mode
-- Tracks pending, approved, rejected, and expired approval requests
-- for orders that require operator sign-off before broker submission.
-- Requirements: 8.2
-- One row per order that is awaiting, or has received, an operator decision.
-- order_job holds the order payload as JSONB; ticker, side, quantity and
-- estimated_value are stored as separate columns next to it.
-- status is free text with no CHECK constraint; the migration header lists
-- pending, approved, rejected and expired as the workflow states.
-- NOTE(review): nothing in this migration flips status to 'expired' once
-- expires_at has passed — presumably a worker does this; confirm.
CREATE TABLE operator_approvals (
id UUID PRIMARY KEY DEFAULT uuid_generate_v4(),
order_job JSONB NOT NULL DEFAULT '{}',
-- Nullable FK with the default delete behaviour (NO ACTION): a recommendation
-- that still has approval rows cannot be deleted.
recommendation_id UUID REFERENCES recommendations(id),
ticker VARCHAR(20) NOT NULL,
side VARCHAR(10) NOT NULL DEFAULT 'buy',
quantity NUMERIC NOT NULL DEFAULT 0,
estimated_value NUMERIC NOT NULL DEFAULT 0,
status VARCHAR(20) NOT NULL DEFAULT 'pending',
-- No REFERENCES clause: the link to a risk evaluation is not enforced here.
risk_evaluation_id UUID,
requested_by VARCHAR(200) NOT NULL DEFAULT 'system',
reviewed_by VARCHAR(200),
review_note TEXT,
expires_at TIMESTAMPTZ NOT NULL,
requested_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
reviewed_at TIMESTAMPTZ,
created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
updated_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
);
CREATE INDEX idx_operator_approvals_status ON operator_approvals(status);
CREATE INDEX idx_operator_approvals_ticker ON operator_approvals(ticker);
-- Partial index: only pending rows are indexed by expiry.
CREATE INDEX idx_operator_approvals_expires ON operator_approvals(expires_at)
WHERE status = 'pending';
@@ -0,0 +1,43 @@
-- Stonks Oracle - Data retention and lifecycle policies
-- Tracks per-bucket and per-artifact-class retention rules.
-- Requirements: N3
-- Per-bucket, per-artifact-class retention rules. The UNIQUE constraint allows
-- exactly one rule per (bucket_name, artifact_class) pair.
-- IF NOT EXISTS keeps this migration safe to re-run, matching the other
-- migrations that already use IF NOT EXISTS.
-- NOTE(review): assumes the uuid-ossp extension (uuid_generate_v4) was enabled
-- by an earlier migration — confirm.
CREATE TABLE IF NOT EXISTS retention_policies (
    id UUID PRIMARY KEY DEFAULT uuid_generate_v4(),
    bucket_name VARCHAR(200) NOT NULL,
    artifact_class VARCHAR(100) NOT NULL DEFAULT 'default',
    retention_days INTEGER NOT NULL DEFAULT 365,
    archive_before_delete BOOLEAN NOT NULL DEFAULT FALSE,
    active BOOLEAN NOT NULL DEFAULT TRUE,
    created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
    updated_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
    UNIQUE(bucket_name, artifact_class)
);

-- Seed default retention policies per bucket.
-- ON CONFLICT DO NOTHING makes the seed re-runnable: an existing row for the
-- same (bucket_name, artifact_class) is left untouched, so values an operator
-- has already tuned are never overwritten and a second run does not fail on
-- the UNIQUE constraint.
INSERT INTO retention_policies (bucket_name, artifact_class, retention_days, archive_before_delete) VALUES
    ('stonks-raw-market', 'default', 90, FALSE),
    ('stonks-raw-news', 'default', 180, FALSE),
    ('stonks-raw-filings', 'default', 365, FALSE),
    ('stonks-normalized', 'default', 180, FALSE),
    ('stonks-llm-prompts', 'default', 365, FALSE),
    ('stonks-llm-results', 'default', 365, FALSE),
    ('stonks-lakehouse', 'default', 730, FALSE),
    ('stonks-audit', 'default', 730, FALSE)
ON CONFLICT (bucket_name, artifact_class) DO NOTHING;

-- Track retention cleanup runs for observability.
-- A row starts with status 'running' and completed_at NULL.
CREATE TABLE IF NOT EXISTS retention_runs (
    id UUID PRIMARY KEY DEFAULT uuid_generate_v4(),
    bucket_name VARCHAR(200) NOT NULL,
    objects_scanned INTEGER NOT NULL DEFAULT 0,
    objects_deleted INTEGER NOT NULL DEFAULT 0,
    bytes_freed BIGINT NOT NULL DEFAULT 0,
    db_rows_deleted INTEGER NOT NULL DEFAULT 0,
    started_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
    completed_at TIMESTAMPTZ,
    status VARCHAR(20) NOT NULL DEFAULT 'running',
    error_message TEXT
);

-- Newest-first run history per bucket.
CREATE INDEX IF NOT EXISTS idx_retention_runs_bucket ON retention_runs(bucket_name, started_at DESC);
-- Filter runs by state.
CREATE INDEX IF NOT EXISTS idx_retention_runs_status ON retention_runs(status);
+81 -11
View File
@@ -1,14 +1,84 @@
{ {
"Rules": [ "buckets": {
{ "stonks-raw-market": {
"ID": "raw-retention-365d", "Rules": [
"Status": "Enabled", {
"Filter": { "ID": "raw-market-retention-90d",
"Prefix": "" "Status": "Enabled",
}, "Filter": { "Prefix": "" },
"Expiration": { "Expiration": { "Days": 90 }
"Days": 365 }
} ]
},
"stonks-raw-news": {
"Rules": [
{
"ID": "raw-news-retention-180d",
"Status": "Enabled",
"Filter": { "Prefix": "" },
"Expiration": { "Days": 180 }
}
]
},
"stonks-raw-filings": {
"Rules": [
{
"ID": "raw-filings-retention-365d",
"Status": "Enabled",
"Filter": { "Prefix": "" },
"Expiration": { "Days": 365 }
}
]
},
"stonks-normalized": {
"Rules": [
{
"ID": "normalized-retention-180d",
"Status": "Enabled",
"Filter": { "Prefix": "" },
"Expiration": { "Days": 180 }
}
]
},
"stonks-llm-prompts": {
"Rules": [
{
"ID": "llm-prompts-retention-365d",
"Status": "Enabled",
"Filter": { "Prefix": "" },
"Expiration": { "Days": 365 }
}
]
},
"stonks-llm-results": {
"Rules": [
{
"ID": "llm-results-retention-365d",
"Status": "Enabled",
"Filter": { "Prefix": "" },
"Expiration": { "Days": 365 }
}
]
},
"stonks-lakehouse": {
"Rules": [
{
"ID": "lakehouse-retention-730d",
"Status": "Enabled",
"Filter": { "Prefix": "" },
"Expiration": { "Days": 730 }
}
]
},
"stonks-audit": {
"Rules": [
{
"ID": "audit-retention-730d",
"Status": "Enabled",
"Filter": { "Prefix": "" },
"Expiration": { "Days": 730 }
}
]
} }
] }
} }
+43 -3
View File
@@ -1,10 +1,18 @@
"""Apache Superset configuration for Stonks Oracle.""" """Apache Superset configuration for Stonks Oracle.
Security hardening applied:
- Session cookies: HttpOnly, Secure, SameSite=Lax
- Talisman CSP headers enabled
- Public role disabled (login required)
- Unsafe DB connections blocked
- Row limits enforced
"""
import os import os
# Superset secret key # Superset secret key — must be set via SUPERSET_SECRET_KEY env var
SECRET_KEY = os.getenv("SUPERSET_SECRET_KEY", "stonks-dev-secret-key-change-me") SECRET_KEY = os.getenv("SUPERSET_SECRET_KEY", "stonks-dev-secret-key-change-me")
# Trino datasource # Default Trino datasource (Hive catalog for backward compatibility)
SQLALCHEMY_DATABASE_URI = "trino://trino@trino:8080/lakehouse/stonks" SQLALCHEMY_DATABASE_URI = "trino://trino@trino:8080/lakehouse/stonks"
# Feature flags # Feature flags
@@ -12,6 +20,10 @@ FEATURE_FLAGS = {
"ENABLE_TEMPLATE_PROCESSING": True, "ENABLE_TEMPLATE_PROCESSING": True,
} }
# Additional database connections available in Superset UI:
# Hive catalog: trino://trino@trino:8080/lakehouse/stonks
# Iceberg catalog: trino://trino@trino:8080/iceberg/stonks
# Cache config (Redis-backed) # Cache config (Redis-backed)
CACHE_CONFIG = { CACHE_CONFIG = {
"CACHE_TYPE": "RedisCache", "CACHE_TYPE": "RedisCache",
@@ -21,3 +33,31 @@ CACHE_CONFIG = {
"CACHE_REDIS_PORT": int(os.getenv("REDIS_PORT", "6379")), "CACHE_REDIS_PORT": int(os.getenv("REDIS_PORT", "6379")),
"CACHE_REDIS_DB": 1, "CACHE_REDIS_DB": 1,
} }
# --- Security hardening ---
# Disable public user role (require login): with no role to copy from,
# anonymous users are granted no permissions.
PUBLIC_ROLE_LIKE = None
# Session cookie security
# HttpOnly hides the cookie from page JavaScript; Secure makes browsers send it
# over HTTPS only, so a plain-HTTP deployment will not keep a login session;
# SameSite=Lax withholds it on cross-site sub-requests.
SESSION_COOKIE_HTTPONLY = True
SESSION_COOKIE_SECURE = True
SESSION_COOKIE_SAMESITE = "Lax"
# Talisman CSP headers
# NOTE(review): script-src allows 'unsafe-inline' and 'unsafe-eval', which
# removes most of CSP's protection against injected scripts — presumably
# required by the Superset frontend; confirm and tighten if possible.
# NOTE(review): setting TALISMAN_CONFIG replaces Superset's built-in default
# policy rather than extending it — verify that no directive the UI needs
# (e.g. worker-src, connect-src) has been lost.
TALISMAN_ENABLED = True
TALISMAN_CONFIG = {
"content_security_policy": {
"default-src": ["'self'"],
"img-src": ["'self'", "data:"],
"style-src": ["'self'", "'unsafe-inline'"],
"script-src": ["'self'", "'unsafe-inline'", "'unsafe-eval'"],
},
"force_https": False, # TLS terminated at ingress
}
# Prevent Superset from allowing arbitrary SQL database connections
# NOTE(review): in upstream Superset this flag blocks connection types that are
# considered unsafe (such as SQLite files); it is not an allow-list of
# engines or hosts — confirm against the Superset version in use.
PREVENT_UNSAFE_DB_CONNECTIONS = True
# Row limit for queries
# NOTE(review): presumably ROW_LIMIT is the default applied to chart queries
# and SQL_MAX_ROW the hard ceiling for SQL queries — confirm against the
# Superset configuration reference.
ROW_LIMIT = 50000
SQL_MAX_ROW = 100000
+5
View File
@@ -5,3 +5,8 @@ hive.s3.endpoint=http://minio:9000
hive.s3.path-style-access=true hive.s3.path-style-access=true
hive.s3.aws-access-key=minioadmin hive.s3.aws-access-key=minioadmin
hive.s3.aws-secret-key=minioadmin hive.s3.aws-secret-key=minioadmin
fs.native-s3.enabled=true
s3.endpoint=http://minio:9000
s3.path-style-access=true
s3.aws-access-key=minioadmin
s3.aws-secret-key=minioadmin
+25 -9
View File
@@ -2,15 +2,31 @@
Analytical fact table definitions for MinIO-backed datasets queried via Trino. Analytical fact table definitions for MinIO-backed datasets queried via Trino.
All tables use Hive-compatible partition layouts on MinIO (`s3a://stonks-lakehouse/warehouse/`)
and are defined in the `lakehouse.stonks` schema. Parquet is the storage format.
## Fact Tables ## Fact Tables
- `lake.market_bars` — OHLCV bar data - `lake.market_bars` — OHLCV bar data per symbol per interval
- `lake.market_quotes` — quote snapshots - `lake.market_quotes` — bid/ask quote snapshots
- `lake.company_events` — corporate actions and events - `lake.company_events` — corporate actions, earnings, filings, and issuer events
- `lake.documents` — ingested document metadata - `lake.documents` — ingested document metadata (articles, filings, transcripts)
- `lake.document_extractions` — AI extraction outputs - `lake.document_extractions` — AI extraction outputs per document per company
- `lake.trade_signals` — aggregated trend signals - `lake.trade_signals` — aggregated trend signals and recommendation actions
- `lake.trade_orders` — order submission records - `lake.trade_orders` — order submission records (paper and live)
- `lake.trade_fills` — fill and execution records - `lake.trade_fills` — fill and execution records from broker
- `lake.positions_daily` — end-of-day position snapshots - `lake.positions_daily` — end-of-day position snapshots
- `lake.pnl_daily` — daily PnL records - `lake.pnl_daily` — daily PnL records per symbol per account
- `lake.prediction_vs_outcome` — prediction accuracy tracking - `lake.prediction_vs_outcome` — prediction accuracy tracking
- `lake.model_performance` — extraction model performance metrics
## Partitioning
- Most tables partition by `dt` (date)
- `document_extractions`, `prediction_vs_outcome`, and `model_performance` also partition by `model_version`
## Trino Catalogs
- `lakehouse` catalog (Hive connector) for external Hive-compatible tables
- `iceberg` catalog (Iceberg connector) for managed Iceberg tables
## Views
Example SQL views for dashboards and ad hoc analysis are in `lakehouse/views/`.
See `lakehouse/views/README.md` for details.
+24
View File
@@ -0,0 +1,24 @@
-- Analytical fact table: company_events
-- Corporate actions, earnings, filings, and other issuer events.
-- Partitioned by dt (date) on MinIO.
-- Path: s3://stonks-lakehouse/warehouse/company_events/dt={yyyy-mm-dd}/part-*.parquet
-- Requirements: 2.3, 9.4, 9.5, 10.1
-- Design ref: Section 7 (lake.company_events)
-- External Hive table: Trino reads the Parquet files that already exist under
-- external_location; this statement only registers the schema.
-- IF NOT EXISTS means an already-registered table is left unchanged, so later
-- column changes in this file do not alter an existing table.
-- NOTE(review): new dt= directories presumably have to be registered as
-- partitions in the metastore (e.g. via system.sync_partition_metadata)
-- before they are queryable — confirm the writer or a scheduled job does it.
CREATE TABLE IF NOT EXISTS lakehouse.stonks.company_events (
event_id VARCHAR,
ticker VARCHAR,
event_type VARCHAR,
event_subtype VARCHAR,
title VARCHAR,
description VARCHAR,
source VARCHAR,
source_url VARCHAR,
event_at TIMESTAMP(6) WITH TIME ZONE,
ingested_at TIMESTAMP(6) WITH TIME ZONE,
-- Partition key: the Hive connector requires partition columns to be declared
-- last.
dt DATE
) WITH (
format = 'PARQUET',
partitioned_by = ARRAY['dt'],
external_location = 's3a://stonks-lakehouse/warehouse/company_events/'
);
+13 -1
View File
@@ -1,16 +1,28 @@
-- Analytical fact table: document_extractions -- Analytical fact table: document_extractions
-- Partitioned by dt and model_version on MinIO -- AI extraction outputs per document per company.
-- Partitioned by dt and model_version on MinIO.
-- Path: s3://stonks-lakehouse/warehouse/document_extractions/dt={yyyy-mm-dd}/model_version={ver}/part-*.parquet
-- Requirements: 5.3, 5.5, 9.4, 9.5, 10.1, 10.4
-- Design ref: Section 6.3, Section 7 (lake.document_extractions)
CREATE TABLE IF NOT EXISTS lakehouse.stonks.document_extractions ( CREATE TABLE IF NOT EXISTS lakehouse.stonks.document_extractions (
document_id VARCHAR, document_id VARCHAR,
ticker VARCHAR, ticker VARCHAR,
company_name VARCHAR,
relevance DOUBLE,
sentiment VARCHAR, sentiment VARCHAR,
impact_score DOUBLE, impact_score DOUBLE,
impact_horizon VARCHAR,
catalyst_type VARCHAR, catalyst_type VARCHAR,
confidence DOUBLE, confidence DOUBLE,
novelty_score DOUBLE, novelty_score DOUBLE,
source_credibility DOUBLE,
key_facts VARCHAR,
risks VARCHAR,
macro_themes VARCHAR,
model_name VARCHAR, model_name VARCHAR,
prompt_version VARCHAR, prompt_version VARCHAR,
schema_version VARCHAR,
extraction_at TIMESTAMP(6) WITH TIME ZONE, extraction_at TIMESTAMP(6) WITH TIME ZONE,
dt DATE, dt DATE,
model_version VARCHAR model_version VARCHAR
+9 -2
View File
@@ -1,6 +1,9 @@
-- Analytical fact table: documents -- Analytical fact table: documents
-- Partitioned by dt and source_type on MinIO -- Ingested document metadata for articles, filings, transcripts, and press releases.
-- Path: s3://stonks-lakehouse/warehouse/documents/dt={yyyy-mm-dd}/source_type={type}/part-*.parquet -- Partitioned by dt on MinIO.
-- Path: s3://stonks-lakehouse/warehouse/documents/dt={yyyy-mm-dd}/part-*.parquet
-- Requirements: 3.1, 3.3, 9.4, 9.5, 10.1, 10.4
-- Design ref: Section 6.2, Section 7 (lake.documents)
CREATE TABLE IF NOT EXISTS lakehouse.stonks.documents ( CREATE TABLE IF NOT EXISTS lakehouse.stonks.documents (
document_id VARCHAR, document_id VARCHAR,
@@ -9,7 +12,11 @@ CREATE TABLE IF NOT EXISTS lakehouse.stonks.documents (
ticker VARCHAR, ticker VARCHAR,
publisher VARCHAR, publisher VARCHAR,
title VARCHAR, title VARCHAR,
url VARCHAR,
canonical_url VARCHAR,
language VARCHAR,
published_at TIMESTAMP(6) WITH TIME ZONE, published_at TIMESTAMP(6) WITH TIME ZONE,
retrieved_at TIMESTAMP(6) WITH TIME ZONE,
content_hash VARCHAR, content_hash VARCHAR,
confidence DOUBLE, confidence DOUBLE,
dt DATE dt DATE
+6 -1
View File
@@ -1,6 +1,9 @@
-- Analytical fact table: market_bars -- Analytical fact table: market_bars
-- Partitioned by dt (date) on MinIO -- OHLCV bar data for tracked symbols.
-- Partitioned by dt (date) on MinIO.
-- Path: s3://stonks-lakehouse/warehouse/market_bars/dt={yyyy-mm-dd}/part-*.parquet -- Path: s3://stonks-lakehouse/warehouse/market_bars/dt={yyyy-mm-dd}/part-*.parquet
-- Requirements: 2.1, 9.4, 9.5, 10.1
-- Design ref: Section 7 (lake.market_bars)
CREATE TABLE IF NOT EXISTS lakehouse.stonks.market_bars ( CREATE TABLE IF NOT EXISTS lakehouse.stonks.market_bars (
ticker VARCHAR, ticker VARCHAR,
@@ -10,7 +13,9 @@ CREATE TABLE IF NOT EXISTS lakehouse.stonks.market_bars (
close_price DOUBLE, close_price DOUBLE,
volume BIGINT, volume BIGINT,
vwap DOUBLE, vwap DOUBLE,
trade_count BIGINT,
bar_timestamp TIMESTAMP(6) WITH TIME ZONE, bar_timestamp TIMESTAMP(6) WITH TIME ZONE,
bar_interval VARCHAR,
source VARCHAR, source VARCHAR,
dt DATE dt DATE
) WITH ( ) WITH (
+23
View File
@@ -0,0 +1,23 @@
-- Analytical fact table: market_quotes
-- Quote snapshots for tracked symbols.
-- Partitioned by dt (date) on MinIO.
-- Path: s3://stonks-lakehouse/warehouse/market_quotes/dt={yyyy-mm-dd}/part-*.parquet
-- Requirements: 2.1, 9.4, 9.5, 10.1
-- Design ref: Section 7 (lake.market_quotes)
-- External Hive table: Trino reads the Parquet files that already exist under
-- external_location; this statement only registers the schema.
-- Each row holds the bid, ask and last-trade price and size for one ticker at
-- quote_at.
-- NOTE(review): new dt= directories presumably have to be registered as
-- partitions in the metastore (e.g. via system.sync_partition_metadata)
-- before they are queryable — confirm the writer or a scheduled job does it.
CREATE TABLE IF NOT EXISTS lakehouse.stonks.market_quotes (
ticker VARCHAR,
bid_price DOUBLE,
ask_price DOUBLE,
bid_size BIGINT,
ask_size BIGINT,
last_price DOUBLE,
last_size BIGINT,
source VARCHAR,
quote_at TIMESTAMP(6) WITH TIME ZONE,
-- Partition key: the Hive connector requires partition columns to be declared
-- last.
dt DATE
) WITH (
format = 'PARQUET',
partitioned_by = ARRAY['dt'],
external_location = 's3a://stonks-lakehouse/warehouse/market_quotes/'
);
+33
View File
@@ -0,0 +1,33 @@
-- Analytical fact table: model_performance
-- Tracks extraction model performance for Trino/Superset dashboards.
-- Partitioned by dt and model_version on MinIO (model_name is an ordinary
-- data column, not a partition key).
-- Path: s3://stonks-lakehouse/warehouse/model_performance/dt={yyyy-mm-dd}/model_version={ver}/part-*.parquet
-- Requirements: 12.1, 12.2
CREATE TABLE IF NOT EXISTS lakehouse.stonks.model_performance (
document_id VARCHAR,
ticker VARCHAR,
model_name VARCHAR,
prompt_version VARCHAR,
schema_version VARCHAR,
success BOOLEAN,
attempt_count INTEGER,
total_duration_ms INTEGER,
first_attempt_duration_ms INTEGER,
final_attempt_duration_ms INTEGER,
confidence DOUBLE,
validation_status VARCHAR,
validation_error_count INTEGER,
validation_warning_count INTEGER,
retry_count INTEGER,
input_token_estimate INTEGER,
output_token_estimate INTEGER,
company_count INTEGER,
recorded_at TIMESTAMP(6) WITH TIME ZONE,
-- Partition keys: the Hive connector requires them to be the last columns, in
-- the same order as partitioned_by below.
dt DATE,
model_version VARCHAR
) WITH (
format = 'PARQUET',
partitioned_by = ARRAY['dt', 'model_version'],
external_location = 's3a://stonks-lakehouse/warehouse/model_performance/'
);
+8 -1
View File
@@ -1,12 +1,19 @@
-- Analytical fact table: pnl_daily -- Analytical fact table: pnl_daily
-- Partitioned by dt on MinIO -- Daily profit and loss records per symbol per account.
-- Partitioned by dt on MinIO.
-- Path: s3://stonks-lakehouse/warehouse/pnl_daily/dt={yyyy-mm-dd}/part-*.parquet
-- Requirements: 9.4, 9.5, 10.1, 10.3
-- Design ref: Section 7 (lake.pnl_daily)
CREATE TABLE IF NOT EXISTS lakehouse.stonks.pnl_daily ( CREATE TABLE IF NOT EXISTS lakehouse.stonks.pnl_daily (
ticker VARCHAR, ticker VARCHAR,
realized_pnl DOUBLE, realized_pnl DOUBLE,
unrealized_pnl DOUBLE, unrealized_pnl DOUBLE,
total_pnl DOUBLE, total_pnl DOUBLE,
fees DOUBLE,
net_pnl DOUBLE,
broker_account VARCHAR, broker_account VARCHAR,
execution_mode VARCHAR,
dt DATE dt DATE
) WITH ( ) WITH (
format = 'PARQUET', format = 'PARQUET',
+7 -1
View File
@@ -1,13 +1,19 @@
-- Analytical fact table: positions_daily -- Analytical fact table: positions_daily
-- Partitioned by dt on MinIO -- End-of-day position snapshots.
-- Partitioned by dt on MinIO.
-- Path: s3://stonks-lakehouse/warehouse/positions_daily/dt={yyyy-mm-dd}/part-*.parquet
-- Requirements: 9.4, 9.5, 10.1, 10.3
-- Design ref: Section 7 (lake.positions_daily)
CREATE TABLE IF NOT EXISTS lakehouse.stonks.positions_daily ( CREATE TABLE IF NOT EXISTS lakehouse.stonks.positions_daily (
ticker VARCHAR, ticker VARCHAR,
quantity DOUBLE, quantity DOUBLE,
avg_entry_price DOUBLE, avg_entry_price DOUBLE,
close_price DOUBLE, close_price DOUBLE,
market_value DOUBLE,
unrealized_pnl DOUBLE, unrealized_pnl DOUBLE,
broker_account VARCHAR, broker_account VARCHAR,
execution_mode VARCHAR,
snapshot_at TIMESTAMP(6) WITH TIME ZONE, snapshot_at TIMESTAMP(6) WITH TIME ZONE,
dt DATE dt DATE
) WITH ( ) WITH (
+16 -11
View File
@@ -1,19 +1,24 @@
-- Analytical fact table: prediction_vs_outcome -- Analytical fact table: prediction_vs_outcome
-- Partitioned by dt on MinIO -- Prediction accuracy tracking: predicted signals vs realized market moves.
-- Partitioned by dt and model_version on MinIO.
-- Path: s3://stonks-lakehouse/warehouse/prediction_vs_outcome/dt={yyyy-mm-dd}/model_version={ver}/part-*.parquet
-- Requirements: 9.4, 9.5, 10.1, 10.3
-- Design ref: Section 7 (lake.prediction_vs_outcome)
CREATE TABLE IF NOT EXISTS lakehouse.stonks.prediction_vs_outcome ( CREATE TABLE IF NOT EXISTS lakehouse.stonks.prediction_vs_outcome (
recommendation_id VARCHAR, recommendation_id VARCHAR,
ticker VARCHAR, ticker VARCHAR,
predicted_action VARCHAR, predicted_action VARCHAR,
predicted_confidence DOUBLE, predicted_confidence DOUBLE,
actual_move_pct DOUBLE, actual_move_pct DOUBLE,
outcome VARCHAR, outcome VARCHAR,
horizon_days INTEGER, horizon_days INTEGER,
predicted_at TIMESTAMP(6) WITH TIME ZONE, predicted_at TIMESTAMP(6) WITH TIME ZONE,
evaluated_at TIMESTAMP(6) WITH TIME ZONE, evaluated_at TIMESTAMP(6) WITH TIME ZONE,
dt DATE model_version VARCHAR,
dt DATE
) WITH ( ) WITH (
format = 'PARQUET', format = 'PARQUET',
partitioned_by = ARRAY['dt'], partitioned_by = ARRAY['dt', 'model_version'],
external_location = 's3a://stonks-lakehouse/warehouse/prediction_vs_outcome/' external_location = 's3a://stonks-lakehouse/warehouse/prediction_vs_outcome/'
); );
+6 -1
View File
@@ -1,5 +1,9 @@
-- Analytical fact table: trade_fills -- Analytical fact table: trade_fills
-- Partitioned by dt on MinIO -- Fill and execution records from broker.
-- Partitioned by dt on MinIO.
-- Path: s3://stonks-lakehouse/warehouse/trade_fills/dt={yyyy-mm-dd}/part-*.parquet
-- Requirements: 9.4, 9.5, 10.1, 10.3
-- Design ref: Section 7 (lake.trade_fills)
CREATE TABLE IF NOT EXISTS lakehouse.stonks.trade_fills ( CREATE TABLE IF NOT EXISTS lakehouse.stonks.trade_fills (
fill_id VARCHAR, fill_id VARCHAR,
@@ -8,6 +12,7 @@ CREATE TABLE IF NOT EXISTS lakehouse.stonks.trade_fills (
side VARCHAR, side VARCHAR,
fill_price DOUBLE, fill_price DOUBLE,
fill_quantity DOUBLE, fill_quantity DOUBLE,
commission DOUBLE,
broker_account VARCHAR, broker_account VARCHAR,
filled_at TIMESTAMP(6) WITH TIME ZONE, filled_at TIMESTAMP(6) WITH TIME ZONE,
dt DATE dt DATE
+7 -1
View File
@@ -1,14 +1,20 @@
-- Analytical fact table: trade_orders -- Analytical fact table: trade_orders
-- Partitioned by dt on MinIO -- Order submission records for paper and live trading.
-- Partitioned by dt on MinIO.
-- Path: s3://stonks-lakehouse/warehouse/trade_orders/dt={yyyy-mm-dd}/part-*.parquet
-- Requirements: 8.3, 9.4, 9.5, 10.1, 10.3
-- Design ref: Section 7 (lake.trade_orders)
CREATE TABLE IF NOT EXISTS lakehouse.stonks.trade_orders ( CREATE TABLE IF NOT EXISTS lakehouse.stonks.trade_orders (
order_id VARCHAR, order_id VARCHAR,
recommendation_id VARCHAR,
ticker VARCHAR, ticker VARCHAR,
side VARCHAR, side VARCHAR,
order_type VARCHAR, order_type VARCHAR,
quantity DOUBLE, quantity DOUBLE,
limit_price DOUBLE, limit_price DOUBLE,
status VARCHAR, status VARCHAR,
execution_mode VARCHAR,
broker_account VARCHAR, broker_account VARCHAR,
submitted_at TIMESTAMP(6) WITH TIME ZONE, submitted_at TIMESTAMP(6) WITH TIME ZONE,
dt DATE dt DATE
+18 -10
View File
@@ -1,16 +1,24 @@
-- Analytical fact table: trade_signals -- Analytical fact table: trade_signals
-- Partitioned by dt on MinIO -- Aggregated trend signals and recommendation actions.
-- Partitioned by dt on MinIO.
-- Path: s3://stonks-lakehouse/warehouse/trade_signals/dt={yyyy-mm-dd}/part-*.parquet
-- Requirements: 6.1, 6.2, 6.4, 6.5, 7.1, 9.4, 9.5, 10.1
-- Design ref: Section 6.4, Section 6.5, Section 7 (lake.trade_signals)
CREATE TABLE IF NOT EXISTS lakehouse.stonks.trade_signals ( CREATE TABLE IF NOT EXISTS lakehouse.stonks.trade_signals (
signal_id VARCHAR, signal_id VARCHAR,
ticker VARCHAR, ticker VARCHAR,
trend_direction VARCHAR, trend_direction VARCHAR,
trend_strength DOUBLE, trend_strength DOUBLE,
confidence DOUBLE, confidence DOUBLE,
action VARCHAR, contradiction_score DOUBLE,
time_horizon VARCHAR, dominant_catalysts VARCHAR,
generated_at TIMESTAMP(6) WITH TIME ZONE, material_risks VARCHAR,
dt DATE action VARCHAR,
time_horizon VARCHAR,
recommendation_id VARCHAR,
generated_at TIMESTAMP(6) WITH TIME ZONE,
dt DATE
) WITH ( ) WITH (
format = 'PARQUET', format = 'PARQUET',
partitioned_by = ARRAY['dt'], partitioned_by = ARRAY['dt'],
+23
View File
@@ -0,0 +1,23 @@
# Lakehouse Views
Example SQL views for Trino over MinIO-backed analytical fact tables.
These views are designed to be created in the `lakehouse.stonks` schema and
can be used directly in Superset dashboards or ad hoc Trino queries.
## Views
- `prediction_accuracy` — Joins predicted signals with realized market moves to score prediction quality
- `paper_trade_scorecard` — Aggregates paper trading performance by symbol with win rates and PnL
- `paper_trade_detail` — Per-order paper trade detail with fill prices and realized outcomes
- `signal_hit_rate` — Daily signal accuracy summary across all symbols
## Usage
Connect to Trino and run each `.sql` file to create the view:
```bash
trino --catalog lakehouse --schema stonks < lakehouse/views/prediction_accuracy.sql
```
Or paste into the Superset SQL Lab to explore interactively.
+47
View File
@@ -0,0 +1,47 @@
-- View: paper_trade_detail
-- Per-order paper trade detail joining orders, fills, and the originating
-- recommendation's prediction outcome. Useful for drill-down from the scorecard.
-- Requirements: 10.1, 10.3, 10.4
-- Design ref: Section 9.2 (evidence-to-outcome drill-down)
-- Row grain: one row per (paper order, fill, prediction outcome) combination.
-- * An order with several fills appears once per fill.
-- * An order with no matching fill or outcome still appears, with NULLs in
-- the fill / prediction columns (both joins are LEFT JOINs).
-- * Both joins also require the same dt as the order, so a fill or outcome
-- stored in a different dt partition is NOT matched.
-- NOTE(review): confirm fills and outcomes are always written with the
-- order's dt; otherwise late fills and outcomes are silently dropped.
CREATE OR REPLACE VIEW lakehouse.stonks.paper_trade_detail AS
SELECT
o.order_id,
o.recommendation_id,
o.ticker,
o.side,
o.order_type,
o.quantity,
o.limit_price,
o.status AS order_status,
o.submitted_at,
f.fill_id,
f.fill_price,
f.fill_quantity,
f.commission,
f.filled_at,
-- Slippage: fill price relative to limit price, in percent of the limit price.
-- The sign is NOT adjusted for order side: positive means the fill was above
-- the limit, which is worse for buys but better for sells.
-- NULL when the order has no positive limit price or no fill row.
CASE
WHEN o.limit_price IS NOT NULL AND o.limit_price > 0 THEN
(f.fill_price - o.limit_price) / o.limit_price * 100
ELSE NULL
END AS slippage_pct,
-- Link back to prediction outcome
pvo.predicted_action,
pvo.predicted_confidence,
pvo.actual_move_pct,
pvo.outcome AS prediction_outcome,
o.broker_account,
o.dt
FROM
lakehouse.stonks.trade_orders o
LEFT JOIN
lakehouse.stonks.trade_fills f
ON o.order_id = f.order_id
AND o.dt = f.dt
LEFT JOIN
lakehouse.stonks.prediction_vs_outcome pvo
ON o.recommendation_id = pvo.recommendation_id
AND o.dt = pvo.dt
-- Only paper-mode orders are exposed; live orders never appear in this view.
WHERE
o.execution_mode = 'paper';
+42
View File
@@ -0,0 +1,42 @@
-- View: paper_trade_scorecard
-- Aggregates paper trading performance per symbol with win rates, PnL, and
-- average fill quality. Filters to paper execution mode only.
-- Requirements: 10.1, 10.2, 10.3
-- Design ref: Section 9.2 (paper trading PnL scorecard)
-- Row grain: one row per (ticker, broker_account) that has paper-mode PnL.
--
-- PnL and orders are aggregated separately and only then joined. Joining
-- pnl_daily directly to trade_orders repeats each daily PnL row once per
-- order placed that day, which inflates every SUM/AVG and skews win_rate
-- whenever a ticker has more than one order on a day.
CREATE OR REPLACE VIEW lakehouse.stonks.paper_trade_scorecard AS
WITH paper_pnl AS (
    -- Paper-mode PnL rows only; every downstream figure is based on these.
    SELECT
        ticker,
        broker_account,
        dt,
        realized_pnl,
        unrealized_pnl,
        net_pnl,
        fees
    FROM
        lakehouse.stonks.pnl_daily
    WHERE
        execution_mode = 'paper'
),
pnl_summary AS (
    SELECT
        ticker,
        broker_account,
        COUNT(DISTINCT dt) AS trading_days,
        SUM(realized_pnl) AS total_realized_pnl,
        SUM(unrealized_pnl) AS total_unrealized_pnl,
        SUM(net_pnl) AS total_net_pnl,
        SUM(fees) AS total_fees,
        AVG(net_pnl) AS avg_daily_pnl,
        -- Win rate: fraction of PnL rows (days) with positive net PnL
        CAST(
            COUNT(CASE WHEN net_pnl > 0 THEN 1 END) AS DOUBLE
        ) / NULLIF(COUNT(*), 0) AS win_rate,
        -- Worst and best single-day PnL
        MIN(net_pnl) AS worst_day_pnl,
        MAX(net_pnl) AS best_day_pnl,
        MIN(dt) AS first_trade_date,
        MAX(dt) AS last_trade_date
    FROM
        paper_pnl
    GROUP BY
        ticker,
        broker_account
),
order_summary AS (
    -- Distinct paper orders, restricted (as before) to days that have a
    -- paper PnL row for the same ticker and account.
    SELECT
        o.ticker,
        o.broker_account,
        COUNT(DISTINCT o.order_id) AS total_orders,
        COUNT(DISTINCT CASE WHEN o.status = 'filled' THEN o.order_id END)
            AS filled_orders
    FROM
        lakehouse.stonks.trade_orders o
    WHERE
        o.execution_mode = 'paper'
        AND EXISTS (
            SELECT 1
            FROM paper_pnl p
            WHERE p.ticker = o.ticker
              AND p.broker_account = o.broker_account
              AND p.dt = o.dt
        )
    GROUP BY
        o.ticker,
        o.broker_account
)
SELECT
    s.ticker,
    s.broker_account,
    s.trading_days,
    s.total_realized_pnl,
    s.total_unrealized_pnl,
    s.total_net_pnl,
    s.total_fees,
    s.avg_daily_pnl,
    s.win_rate,
    s.worst_day_pnl,
    s.best_day_pnl,
    -- Order counts from trade_orders; 0 when the symbol has PnL but no orders
    COALESCE(os.total_orders, 0) AS total_orders,
    COALESCE(os.filled_orders, 0) AS filled_orders,
    s.first_trade_date,
    s.last_trade_date
FROM
    pnl_summary s
LEFT JOIN
    order_summary os
    ON s.ticker = os.ticker
    AND s.broker_account = os.broker_account;
+44
View File
@@ -0,0 +1,44 @@
-- View: prediction_accuracy
-- Joins prediction_vs_outcome with trade_signals to provide a per-prediction
-- accuracy scorecard. Signal columns are NULL when no trade_signals row
-- shares the prediction's recommendation_id and dt.
-- Requirements: 10.1, 10.2, 10.3, 10.4
-- Design ref: Section 9.2 (prediction confidence vs realized move)
CREATE OR REPLACE VIEW lakehouse.stonks.prediction_accuracy AS
SELECT
    pvo.recommendation_id,
    pvo.ticker,
    pvo.predicted_action,
    pvo.predicted_confidence,
    pvo.actual_move_pct,
    pvo.outcome,
    pvo.horizon_days,
    pvo.predicted_at,
    pvo.evaluated_at,
    pvo.model_version,
    ts.trend_direction,
    ts.trend_strength,
    ts.contradiction_score,
    ts.dominant_catalysts,
    -- Confidence bucket for dashboard grouping
    -- (a NULL confidence falls through to 'low')
    CASE
        WHEN pvo.predicted_confidence >= 0.8 THEN 'high'
        WHEN pvo.predicted_confidence >= 0.5 THEN 'medium'
        ELSE 'low'
    END AS confidence_bucket,
    -- Direction correctness: did the predicted action match the actual move?
    -- NULL (not false) when the move has not been measured yet, and for
    -- non-directional actions, so neither counts as a miss.
    CASE
        WHEN pvo.actual_move_pct IS NULL THEN NULL
        WHEN pvo.predicted_action = 'buy' AND pvo.actual_move_pct > 0 THEN true
        WHEN pvo.predicted_action = 'sell' AND pvo.actual_move_pct < 0 THEN true
        WHEN pvo.predicted_action IN ('hold', 'watch') THEN NULL
        ELSE false
    END AS direction_correct,
    -- Magnitude of the realized move (direction-agnostic)
    ABS(pvo.actual_move_pct) AS abs_move_pct,
    pvo.dt
FROM
    lakehouse.stonks.prediction_vs_outcome pvo
LEFT JOIN
    lakehouse.stonks.trade_signals ts
    ON pvo.recommendation_id = ts.recommendation_id
    AND pvo.dt = ts.dt;
+31
View File
@@ -0,0 +1,31 @@
-- View: signal_hit_rate
-- Daily summary of signal accuracy across all symbols and model versions.
-- Designed for the Superset prediction accuracy dashboard.
-- Requirements: 10.1, 10.2, 10.3
-- Design ref: Section 9.2 (prediction confidence vs realized move)
-- Row grain: one row per (dt, model_version).
-- Counts and averages are split by outcome with aggregate FILTER clauses;
-- hit_rate divides correct predictions by ALL predictions in the group,
-- including neutral ones.
CREATE OR REPLACE VIEW lakehouse.stonks.signal_hit_rate AS
SELECT
    p.dt,
    p.model_version,
    COUNT(*) AS total_predictions,
    COUNT(*) FILTER (WHERE p.outcome = 'correct') AS correct_predictions,
    COUNT(*) FILTER (WHERE p.outcome = 'incorrect') AS incorrect_predictions,
    COUNT(*) FILTER (WHERE p.outcome = 'neutral') AS neutral_predictions,
    -- Hit rate
    CAST(COUNT(*) FILTER (WHERE p.outcome = 'correct') AS DOUBLE)
        / NULLIF(COUNT(*), 0) AS hit_rate,
    -- Average confidence of correct vs incorrect
    AVG(p.predicted_confidence) FILTER (WHERE p.outcome = 'correct')
        AS avg_confidence_correct,
    AVG(p.predicted_confidence) FILTER (WHERE p.outcome = 'incorrect')
        AS avg_confidence_incorrect,
    -- Average realized move magnitude
    AVG(ABS(p.actual_move_pct)) AS avg_abs_move_pct,
    AVG(p.actual_move_pct) AS avg_move_pct
FROM
    lakehouse.stonks.prediction_vs_outcome p
GROUP BY
    p.dt,
    p.model_version;
+6
View File
@@ -24,6 +24,12 @@ pandas>=2.2.0
# Trino # Trino
trino>=0.330.0 trino>=0.330.0
# Observability
prometheus_client>=0.21.0
# YAML parsing (used by K8s security tests)
pyyaml>=6.0.0
# Testing # Testing
pytest>=8.0.0 pytest>=8.0.0
pytest-asyncio>=0.24.0 pytest-asyncio>=0.24.0
+44
View File
@@ -1 +1,45 @@
# Ingestion Adapters # Ingestion Adapters
from .base import AdapterResult, BaseAdapter
from .resilient import ResilientAdapter, RetryConfig, RetryStats, compute_delay
from .broker_adapter import (
AccountInfo,
AlpacaBrokerAdapter,
BrokerDataAdapter,
OrderEventType,
OrderRequest,
OrderResponse,
OrderSide,
OrderStatus,
OrderType,
PositionInfo,
TradingMode,
)
from .filings_adapter import FilingsDataAdapter, SECEdgarAdapter
from .market_adapter import MarketDataAdapter, PolygonMarketAdapter
from .news_adapter import NewsDataAdapter, PolygonNewsAdapter
__all__ = [
"AccountInfo",
"AdapterResult",
"AlpacaBrokerAdapter",
"BaseAdapter",
"BrokerDataAdapter",
"FilingsDataAdapter",
"MarketDataAdapter",
"NewsDataAdapter",
"OrderEventType",
"OrderRequest",
"OrderResponse",
"OrderSide",
"OrderStatus",
"OrderType",
"PolygonMarketAdapter",
"PolygonNewsAdapter",
"PositionInfo",
"ResilientAdapter",
"RetryConfig",
"RetryStats",
"SECEdgarAdapter",
"TradingMode",
"compute_delay",
]
+63 -8
View File
@@ -1,29 +1,84 @@
"""Base adapter interface for all external API integrations.""" """Base adapter interface for all external API integrations.
All ingestion adapters follow the same contract:
1. Fetch external payloads for a given ticker/source config.
2. Return a structured result with raw bytes, parsed items, and metadata.
3. The ingestion worker handles MinIO upload, PostgreSQL metadata, and downstream job emission.
Requirements: 2.1, 2.2, 2.3, 2.4, 2.5, 3.1, 3.2, 3.3, 3.4
"""
from abc import ABC, abstractmethod from abc import ABC, abstractmethod
from dataclasses import dataclass from dataclasses import dataclass, field
from datetime import datetime from datetime import datetime
from typing import Any, Dict, List, Optional from typing import Any
@dataclass @dataclass
class AdapterResult: class AdapterResult:
"""Result of a single adapter fetch operation."""
source_type: str source_type: str
ticker: str ticker: str
items: List[Dict[str, Any]] items: list[dict[str, Any]]
raw_payload: bytes raw_payload: bytes
content_hash: str content_hash: str
fetched_at: datetime fetched_at: datetime
error: Optional[str] = None error: str | None = None
# HTTP metadata for observability
http_status: int | None = None
response_time_ms: float | None = None
# Additional metadata the adapter wants to pass downstream
metadata: dict[str, Any] = field(default_factory=dict)
@property
def ok(self) -> bool:
    """Return True only when no error was recorded AND at least one item was parsed.

    A fetch that completed without error but yielded zero items is
    reported as not ok.
    """
    if self.error is not None:
        return False
    return len(self.items) > 0
@property
def item_count(self) -> int:
    """Number of parsed items carried by this result (length of ``items``)."""
    return len(self.items)
class BaseAdapter(ABC): class BaseAdapter(ABC):
"""Interface for all ingestion adapters.""" """Interface for all ingestion adapters.
Subclasses implement fetch() for their specific API and source_type()
to identify the adapter class. The ingestion worker orchestrates
persistence and downstream job emission.
"""
@abstractmethod @abstractmethod
async def fetch(self, ticker: str, config: Dict[str, Any]) -> AdapterResult: async def fetch(self, ticker: str, config: dict[str, Any]) -> AdapterResult:
"""Fetch data for a given ticker using source config.""" """Fetch data for a given ticker using source config.
Args:
ticker: The company ticker symbol.
config: Source-specific configuration from the sources table.
Returns:
AdapterResult with raw payload, parsed items, and metadata.
"""
... ...
@abstractmethod @abstractmethod
def source_type(self) -> str: def source_type(self) -> str:
"""Return the source type identifier for this adapter (e.g. 'market_api')."""
... ...
def bucket_name(self) -> str:
    """Derive the MinIO bucket used for this adapter's raw artifacts.

    The name is ``stonks-raw-`` followed by the source type with every
    ``_api`` occurrence removed and remaining underscores turned into
    hyphens (e.g. ``market_api`` -> ``stonks-raw-market``).
    Subclasses may override this when their bucket does not follow
    that pattern.
    """
    slug = self.source_type().replace('_api', '')
    slug = slug.replace('_', '-')
    return "stonks-raw-" + slug
def artifact_path(self, ticker: str, document_id: str, now: datetime) -> str:
    """Build the MinIO object key for one raw artifact.

    Pattern (no leading slash):
    ``{source_type}/{ticker}/{yyyy}/{mm}/{dd}/{document_id}/raw.json``

    Args:
        ticker: Ticker symbol the artifact belongs to.
        document_id: Identifier of the stored document.
        now: Timestamp whose year/month/day form the date segment.
    """
    return "{}/{}/{}/{}/raw.json".format(
        self.source_type(),
        ticker,
        now.strftime('%Y/%m/%d'),
        document_id,
    )
+558 -61
View File
@@ -1,9 +1,19 @@
"""Broker API adapter - paper/live trading, orders, positions, balances.""" """Broker API adapter interface for paper trading and order events.
The BrokerDataAdapter is the abstract interface for all broker integrations.
AlpacaBrokerAdapter is the first concrete implementation, targeting the
Alpaca Markets REST API for paper and live trading.
Requirements: 2.4, 2.5, 8.1, 8.3, 8.5
"""
import hashlib import hashlib
import logging import logging
import time
import uuid import uuid
from datetime import datetime from abc import ABC, abstractmethod
from typing import Any, Dict, Optional from datetime import datetime, timezone
from enum import Enum
from typing import Any
import httpx import httpx
@@ -12,97 +22,584 @@ from .base import AdapterResult, BaseAdapter
logger = logging.getLogger("broker_adapter") logger = logging.getLogger("broker_adapter")
class BrokerAdapter(BaseAdapter): # --- Broker-specific enums ---
"""Broker API adapter supporting paper and live modes."""
def __init__(self, api_key: str = "", api_secret: str = "", base_url: str = "", mode: str = "paper"):
self.api_key = api_key class OrderSide(str, Enum):
self.api_secret = api_secret BUY = "buy"
self.base_url = base_url SELL = "sell"
self.mode = mode # paper | live
class OrderType(str, Enum):
    """Order types an OrderRequest can carry.

    Members are ``str`` subclasses; ``.value`` is the lowercase string
    written into serialized orders.
    """
    MARKET = "market"
    LIMIT = "limit"
    STOP = "stop"
    STOP_LIMIT = "stop_limit"
class OrderStatus(str, Enum):
    """Normalized order lifecycle states reported on OrderResponse.status.

    REJECTED, CANCELLED and EXPIRED are the states OrderResponse.ok treats
    as failures.
    """
    PENDING = "pending"
    SUBMITTED = "submitted"
    ACCEPTED = "accepted"
    PARTIALLY_FILLED = "partially_filled"
    FILLED = "filled"
    CANCELLED = "cancelled"
    REJECTED = "rejected"
    EXPIRED = "expired"
class TradingMode(str, Enum):
    """Whether a broker adapter targets the paper or the live environment."""
    PAPER = "paper"
    LIVE = "live"
class OrderEventType(str, Enum):
    """Kinds of events in an order's history.

    NOTE(review): presumably these values are what gets stored as
    ``order_events.event_type`` — confirm against the persistence code.
    """
    SUBMITTED = "submitted"
    ACCEPTED = "accepted"
    REJECTED = "rejected"
    FILL = "fill"
    PARTIAL_FILL = "partial_fill"
    CANCELLED = "cancelled"
    EXPIRED = "expired"
# --- Data structures ---
class OrderRequest:
    """An order that is about to be handed to a broker adapter.

    When no (or an empty) ``idempotency_key`` is supplied, a random
    UUID4 string is generated so every request always carries a key.
    """

    def __init__(
        self,
        ticker: str,
        side: OrderSide,
        quantity: float,
        order_type: OrderType = OrderType.MARKET,
        limit_price: float | None = None,
        stop_price: float | None = None,
        time_in_force: str = "day",
        idempotency_key: str | None = None,
    ) -> None:
        self.ticker = ticker
        self.side = side
        self.quantity = quantity
        self.order_type = order_type
        self.limit_price = limit_price
        self.stop_price = stop_price
        self.time_in_force = time_in_force
        # Any falsy key (None or "") is replaced by a fresh random one.
        self.idempotency_key = idempotency_key if idempotency_key else str(uuid.uuid4())

    def to_dict(self) -> dict[str, Any]:
        """Return a plain-dict form for audit/persistence.

        ``limit_price`` and ``stop_price`` are included only when set
        (not None); enum fields are emitted as their string values.
        """
        payload: dict[str, Any] = {
            "ticker": self.ticker,
            "side": self.side.value,
            "quantity": self.quantity,
            "order_type": self.order_type.value,
            "time_in_force": self.time_in_force,
            "idempotency_key": self.idempotency_key,
        }
        optional_prices = (
            ("limit_price", self.limit_price),
            ("stop_price", self.stop_price),
        )
        for key, price in optional_prices:
            if price is not None:
                payload[key] = price
        return payload
class OrderResponse:
    """What a broker reported back for an order operation.

    ``submitted_at`` defaults to the current UTC time and
    ``raw_response`` to an empty dict when not provided.
    """

    def __init__(
        self,
        broker_order_id: str,
        status: OrderStatus,
        ticker: str,
        side: OrderSide,
        quantity: float,
        filled_quantity: float = 0.0,
        filled_avg_price: float | None = None,
        submitted_at: datetime | None = None,
        raw_response: dict[str, Any] | None = None,
        error: str | None = None,
    ) -> None:
        self.broker_order_id = broker_order_id
        self.status = status
        self.ticker = ticker
        self.side = side
        self.quantity = quantity
        self.filled_quantity = filled_quantity
        self.filled_avg_price = filled_avg_price
        self.submitted_at = submitted_at if submitted_at else datetime.now(timezone.utc)
        self.raw_response = raw_response if raw_response else {}
        self.error = error

    @property
    def ok(self) -> bool:
        """True when no error is set and the status is not rejected/cancelled/expired."""
        if self.error is not None:
            return False
        # Tuple membership (==-based) is kept on purpose: the statuses are
        # str-mixin enums, so a plain string status still compares equal.
        failed_states = (
            OrderStatus.REJECTED,
            OrderStatus.CANCELLED,
            OrderStatus.EXPIRED,
        )
        return self.status not in failed_states

    def to_dict(self) -> dict[str, Any]:
        """Return a JSON-friendly dict (enums as values, timestamp as ISO-8601).

        ``raw_response`` is intentionally not part of the output.
        """
        serialized: dict[str, Any] = {}
        serialized["broker_order_id"] = self.broker_order_id
        serialized["status"] = self.status.value
        serialized["ticker"] = self.ticker
        serialized["side"] = self.side.value
        serialized["quantity"] = self.quantity
        serialized["filled_quantity"] = self.filled_quantity
        serialized["filled_avg_price"] = self.filled_avg_price
        serialized["submitted_at"] = self.submitted_at.isoformat()
        serialized["error"] = self.error
        return serialized
class PositionInfo:
    """Snapshot of one open position as reported by the broker."""

    def __init__(
        self,
        ticker: str,
        quantity: float,
        avg_entry_price: float,
        current_price: float,
        unrealized_pnl: float,
        market_value: float,
        side: str = "long",
    ) -> None:
        self.ticker = ticker
        self.quantity = quantity
        self.avg_entry_price = avg_entry_price
        self.current_price = current_price
        self.unrealized_pnl = unrealized_pnl
        self.market_value = market_value
        self.side = side

    def to_dict(self) -> dict[str, Any]:
        """Return the position as a plain dict, keys in constructor order."""
        field_names = (
            "ticker",
            "quantity",
            "avg_entry_price",
            "current_price",
            "unrealized_pnl",
            "market_value",
            "side",
        )
        return {name: getattr(self, name) for name in field_names}
class AccountInfo:
    """Summary of a broker account: balances plus the trading mode it belongs to."""

    def __init__(
        self,
        account_id: str,
        buying_power: float,
        cash: float,
        portfolio_value: float,
        currency: str = "USD",
        mode: TradingMode = TradingMode.PAPER,
    ) -> None:
        self.account_id = account_id
        self.buying_power = buying_power
        self.cash = cash
        self.portfolio_value = portfolio_value
        self.currency = currency
        self.mode = mode

    def to_dict(self) -> dict[str, Any]:
        """Return the summary as a plain dict; ``mode`` is emitted as its string value."""
        summary: dict[str, Any] = {
            name: getattr(self, name)
            for name in ("account_id", "buying_power", "cash", "portfolio_value", "currency")
        }
        summary["mode"] = self.mode.value
        return summary
# --- Abstract interface ---
class BrokerDataAdapter(BaseAdapter, ABC):
"""Abstract interface for broker API integrations.
Extends BaseAdapter with broker-specific operations:
- submit_order: place an order with idempotency key
- cancel_order: cancel an existing order
- get_order_status: check order state
- get_positions: list current positions
- get_account: retrieve account summary
All concrete adapters must enforce:
- Idempotent order submission via idempotency_key (Req 8.5)
- Paper/live mode separation (Req 8.1)
- Fail-closed on broker unavailability (Req 8.5)
"""
def __init__(self, mode: TradingMode = TradingMode.PAPER) -> None:
self._mode = mode
@property
def mode(self) -> TradingMode:
return self._mode
def source_type(self) -> str: def source_type(self) -> str:
return "broker" return "broker"
def _headers(self) -> Dict[str, str]: @abstractmethod
async def submit_order(self, order: OrderRequest) -> OrderResponse:
"""Submit an order to the broker.
Must use order.idempotency_key to prevent duplicate submissions.
Must fail closed if the broker is unavailable or returns ambiguous state.
"""
...
@abstractmethod
async def cancel_order(self, broker_order_id: str) -> OrderResponse:
"""Cancel an existing order by broker order ID."""
...
@abstractmethod
async def get_order_status(self, broker_order_id: str) -> OrderResponse:
"""Get the current status of an order."""
...
@abstractmethod
async def get_positions(self) -> list[PositionInfo]:
"""Get all current positions."""
...
@abstractmethod
async def get_account(self) -> AccountInfo:
"""Get account summary (balance, buying power, etc.)."""
...
# --- Concrete Alpaca implementation ---
class AlpacaBrokerAdapter(BrokerDataAdapter):
"""Concrete broker adapter for the Alpaca Markets REST API.
Supports:
- Paper trading via paper-api.alpaca.markets
- Live trading via api.alpaca.markets
- Order submission, cancellation, and status
- Position and account queries
Config options for fetch():
endpoint: One of "positions", "orders", "account" (default "positions")
"""
PAPER_BASE_URL: str = "https://paper-api.alpaca.markets"
LIVE_BASE_URL: str = "https://api.alpaca.markets"
def __init__(
    self,
    api_key: str,
    api_secret: str,
    mode: TradingMode = TradingMode.PAPER,
    base_url: str | None = None,
) -> None:
    """Store credentials and resolve the API base URL.

    Args:
        api_key: Alpaca key id.
        api_secret: Alpaca secret key.
        mode: Paper or live; selects the default base URL.
        base_url: Optional explicit base URL (trailing slashes stripped).
            When given it takes precedence over ``mode``.
    """
    super().__init__(mode=mode)
    self.api_key = api_key
    self.api_secret = api_secret
    # NOTE(review): an explicit base_url wins even if it disagrees with
    # ``mode`` (e.g. paper mode pointed at the live host) — nothing here
    # cross-checks the two.
    if base_url:
        resolved_url = base_url.rstrip("/")
    else:
        resolved_url = (
            self.LIVE_BASE_URL if mode == TradingMode.LIVE else self.PAPER_BASE_URL
        )
    self.base_url = resolved_url
def _headers(self) -> dict[str, str]:
return { return {
"Authorization": f"Bearer {self.api_key}", "APCA-API-KEY-ID": self.api_key,
"APCA-API-SECRET-KEY": self.api_secret,
"Content-Type": "application/json", "Content-Type": "application/json",
} }
async def fetch(self, ticker: str, config: Dict[str, Any]) -> AdapterResult: async def fetch(self, ticker: str, config: dict[str, Any]) -> AdapterResult:
"""Fetch positions and recent orders for a ticker.""" """Fetch positions or recent orders for a ticker from Alpaca.
This satisfies the BaseAdapter contract for the ingestion pipeline.
The broker adapter uses fetch() to pull position/order snapshots
that get persisted as raw artifacts.
"""
endpoint = config.get("endpoint", "positions")
url = self._build_fetch_url(ticker, endpoint)
async with httpx.AsyncClient(timeout=30) as client: async with httpx.AsyncClient(timeout=30) as client:
t0 = time.monotonic()
try: try:
resp = await client.get( resp = await client.get(url, headers=self._headers())
f"{self.base_url}/v2/positions/{ticker}", elapsed_ms = (time.monotonic() - t0) * 1000
headers=self._headers(), resp.raise_for_status()
)
raw = resp.content raw = resp.content
data = resp.json() if resp.status_code == 200 else {} data = resp.json()
content_hash = hashlib.sha256(raw).hexdigest() content_hash = hashlib.sha256(raw).hexdigest()
items = [data] if isinstance(data, dict) else data if isinstance(data, list) else []
return AdapterResult( return AdapterResult(
source_type="broker", source_type="broker",
ticker=ticker, ticker=ticker,
items=[data] if data else [], items=items,
raw_payload=raw, raw_payload=raw,
content_hash=content_hash, content_hash=content_hash,
fetched_at=datetime.utcnow(), fetched_at=datetime.now(timezone.utc),
http_status=resp.status_code,
response_time_ms=round(elapsed_ms, 1),
metadata={
"provider": "alpaca",
"mode": self._mode.value,
"endpoint": endpoint,
},
)
except httpx.HTTPStatusError as e:
elapsed_ms = (time.monotonic() - t0) * 1000
logger.error("Alpaca HTTP error for %s: %s", ticker, e)
return self._error_result(
ticker, str(e), elapsed_ms,
http_status=e.response.status_code if e.response else None,
raw=e.response.content if e.response else b"",
) )
except Exception as e: except Exception as e:
logger.error(f"Broker fetch failed for {ticker}: {e}") elapsed_ms = (time.monotonic() - t0) * 1000
return AdapterResult( logger.error("Alpaca fetch failed for %s: %s", ticker, e)
source_type="broker", return self._error_result(ticker, str(e), elapsed_ms)
ticker=ticker,
items=[],
raw_payload=b"",
content_hash="",
fetched_at=datetime.utcnow(),
error=str(e),
)
async def submit_order( def _build_fetch_url(self, ticker: str, endpoint: str) -> str:
self, """Build the URL for a fetch operation."""
ticker: str, if endpoint == "orders":
side: str, return f"{self.base_url}/v2/orders?symbols={ticker}&status=all&limit=50"
qty: float, if endpoint == "account":
order_type: str = "market", return f"{self.base_url}/v2/account"
limit_price: Optional[float] = None, # Default: positions for ticker
idempotency_key: Optional[str] = None, return f"{self.base_url}/v2/positions/{ticker}"
) -> Dict[str, Any]:
"""Submit an order to the broker. Returns broker response."""
if self.mode == "live":
logger.warning("LIVE order submission")
idem_key = idempotency_key or str(uuid.uuid4()) async def submit_order(self, order: OrderRequest) -> OrderResponse:
payload = { """Submit an order to Alpaca with idempotency key.
"symbol": ticker,
"qty": str(qty), Fails closed: any network error or ambiguous response returns
"side": side, a rejected OrderResponse rather than risking duplicate orders.
"type": order_type, """
"time_in_force": "day", if self._mode == TradingMode.LIVE:
logger.warning("LIVE order submission: %s %s %s", order.side.value, order.quantity, order.ticker)
payload: dict[str, Any] = {
"symbol": order.ticker,
"qty": str(order.quantity),
"side": order.side.value,
"type": order.order_type.value,
"time_in_force": order.time_in_force,
} }
if limit_price and order_type == "limit": if order.limit_price is not None and order.order_type in (OrderType.LIMIT, OrderType.STOP_LIMIT):
payload["limit_price"] = str(limit_price) payload["limit_price"] = str(order.limit_price)
if order.stop_price is not None and order.order_type in (OrderType.STOP, OrderType.STOP_LIMIT):
payload["stop_price"] = str(order.stop_price)
headers = {**self._headers(), "Idempotency-Key": order.idempotency_key}
async with httpx.AsyncClient(timeout=30) as client: async with httpx.AsyncClient(timeout=30) as client:
try: try:
resp = await client.post( resp = await client.post(
f"{self.base_url}/v2/orders", f"{self.base_url}/v2/orders",
headers={**self._headers(), "Idempotency-Key": idem_key}, headers=headers,
json=payload, json=payload,
) )
resp.raise_for_status() resp.raise_for_status()
return resp.json() data = resp.json()
return self._parse_order_response(data)
except httpx.HTTPStatusError as e: except httpx.HTTPStatusError as e:
logger.error(f"Order rejected: {e.response.text}") error_body = e.response.text if e.response else "unknown"
return {"error": e.response.text, "status": e.response.status_code} logger.error("Order rejected by Alpaca: %s", error_body)
return OrderResponse(
broker_order_id="",
status=OrderStatus.REJECTED,
ticker=order.ticker,
side=order.side,
quantity=order.quantity,
error=f"HTTP {e.response.status_code}: {error_body}" if e.response else str(e),
raw_response={"error": error_body},
)
except Exception as e: except Exception as e:
logger.error(f"Order submission failed: {e}") # Fail closed: treat any unexpected error as rejection
return {"error": str(e)} logger.error("Order submission failed (fail-closed): %s", e)
return OrderResponse(
broker_order_id="",
status=OrderStatus.REJECTED,
ticker=order.ticker,
side=order.side,
quantity=order.quantity,
error=f"fail-closed: {e}",
)
async def get_account(self) -> Dict[str, Any]: async def cancel_order(self, broker_order_id: str) -> OrderResponse:
"""Cancel an order on Alpaca."""
async with httpx.AsyncClient(timeout=30) as client: async with httpx.AsyncClient(timeout=30) as client:
resp = await client.get(f"{self.base_url}/v2/account", headers=self._headers()) try:
return resp.json() resp = await client.delete(
f"{self.base_url}/v2/orders/{broker_order_id}",
headers=self._headers(),
)
if resp.status_code == 204:
return OrderResponse(
broker_order_id=broker_order_id,
status=OrderStatus.CANCELLED,
ticker="",
side=OrderSide.BUY,
quantity=0,
)
resp.raise_for_status()
data = resp.json()
return self._parse_order_response(data)
except Exception as e:
logger.error("Cancel failed for %s: %s", broker_order_id, e)
return OrderResponse(
broker_order_id=broker_order_id,
status=OrderStatus.REJECTED,
ticker="",
side=OrderSide.BUY,
quantity=0,
error=str(e),
)
async def get_order_status(self, broker_order_id: str) -> OrderResponse:
    """Look up one order on Alpaca by its broker order id.

    Never raises: any failure (HTTP error, network error, bad JSON) is
    logged and returned as an OrderResponse with status REJECTED,
    placeholder ticker/side/quantity and ``error`` set.
    NOTE(review): callers must check ``error`` to tell "lookup failed"
    apart from "order was really rejected".
    """
    async with httpx.AsyncClient(timeout=30) as client:
        try:
            reply = await client.get(
                f"{self.base_url}/v2/orders/{broker_order_id}",
                headers=self._headers(),
            )
            reply.raise_for_status()
            return self._parse_order_response(reply.json())
        except Exception as exc:
            logger.error("Get order status failed for %s: %s", broker_order_id, exc)
            failure = OrderResponse(
                broker_order_id=broker_order_id,
                status=OrderStatus.REJECTED,
                ticker="",
                side=OrderSide.BUY,
                quantity=0,
                error=str(exc),
            )
            return failure
async def get_positions(self) -> list[PositionInfo]:
    """Fetch every open position from Alpaca.

    Returns an empty list when the response is not a JSON array or when
    anything fails (the error is logged, never raised). Non-dict array
    entries are skipped.
    NOTE(review): an empty list therefore means either "no positions"
    or "lookup failed".
    """
    async with httpx.AsyncClient(timeout=30) as client:
        try:
            reply = await client.get(
                f"{self.base_url}/v2/positions",
                headers=self._headers(),
            )
            reply.raise_for_status()
            payload = reply.json()
            if isinstance(payload, list):
                return [
                    self._parse_position(entry)
                    for entry in payload
                    if isinstance(entry, dict)
                ]
            return []
        except Exception as exc:
            logger.error("Get positions failed: %s", exc)
            return []
async def get_account(self) -> AccountInfo:
    """Fetch the account summary from Alpaca.

    Never raises: on any failure the error is logged and an AccountInfo
    with an empty id and zero buying power / cash / portfolio value is
    returned (mode still reflects this adapter's mode).
    """
    async with httpx.AsyncClient(timeout=30) as client:
        try:
            reply = await client.get(
                f"{self.base_url}/v2/account",
                headers=self._headers(),
            )
            reply.raise_for_status()
            body = reply.json()
            summary = AccountInfo(
                account_id=str(body.get("id", "")),
                buying_power=float(body.get("buying_power", 0)),
                cash=float(body.get("cash", 0)),
                portfolio_value=float(body.get("portfolio_value", 0)),
                currency=str(body.get("currency", "USD")),
                mode=self._mode,
            )
            return summary
        except Exception as exc:
            logger.error("Get account failed: %s", exc)
            # Zeroed placeholder: no funds are reported when the lookup fails.
            return AccountInfo(
                account_id="",
                buying_power=0,
                cash=0,
                portfolio_value=0,
                mode=self._mode,
            )
def _parse_order_response(self, data: dict[str, Any]) -> OrderResponse:
    """Parse an Alpaca order response into an OrderResponse.

    Args:
        data: Decoded JSON object describing one Alpaca order.

    Returns:
        OrderResponse with a normalized status and side, numeric
        quantities, and the untouched payload in ``raw_response``.
        Unknown or missing statuses map to PENDING; any side other than
        ``"sell"`` maps to BUY.
    """
    status_map: dict[str, OrderStatus] = {
        "new": OrderStatus.SUBMITTED,
        "accepted": OrderStatus.ACCEPTED,
        "partially_filled": OrderStatus.PARTIALLY_FILLED,
        "filled": OrderStatus.FILLED,
        "canceled": OrderStatus.CANCELLED,
        "expired": OrderStatus.EXPIRED,
        "replaced": OrderStatus.SUBMITTED,
        "pending_new": OrderStatus.PENDING,
        "pending_cancel": OrderStatus.PENDING,
        "pending_replace": OrderStatus.PENDING,
        "rejected": OrderStatus.REJECTED,
    }
    raw_status = str(data.get("status", "pending"))

    side_str = str(data.get("side", "buy"))
    side = OrderSide.SELL if side_str == "sell" else OrderSide.BUY

    # "or 0" covers explicit JSON nulls as well as missing keys.
    quantity = float(data.get("qty", 0) or 0)
    filled_qty = float(data.get("filled_qty", 0) or 0)
    filled_avg = data.get("filled_avg_price")
    filled_avg_price = float(filled_avg) if filled_avg else None

    if raw_status == "done_for_day":
        # "done_for_day" only means the order stopped executing for today;
        # it may be unfilled or partly filled. Reporting it as FILLED would
        # record a fill that never happened, so derive the state from the
        # reported quantities instead.
        if quantity > 0 and filled_qty >= quantity:
            status = OrderStatus.FILLED
        elif filled_qty > 0:
            status = OrderStatus.PARTIALLY_FILLED
        else:
            status = OrderStatus.ACCEPTED
    else:
        status = status_map.get(raw_status, OrderStatus.PENDING)

    return OrderResponse(
        broker_order_id=str(data.get("id", "")),
        status=status,
        ticker=str(data.get("symbol", "")),
        side=side,
        quantity=quantity,
        filled_quantity=filled_qty,
        filled_avg_price=filled_avg_price,
        raw_response=data,
    )
def _parse_position(self, data: dict[str, Any]) -> PositionInfo:
    """Convert one Alpaca position object into a PositionInfo.

    Missing or null numeric fields become 0.0; a missing symbol becomes
    "" and a missing side becomes "long". Alpaca's ``unrealized_pl``
    feeds ``unrealized_pnl``.
    """
    def _number(key: str) -> float:
        # "or 0" turns an explicit null (None) into 0 before conversion.
        return float(data.get(key, 0) or 0)

    symbol = str(data.get("symbol", ""))
    qty = _number("qty")
    entry_price = _number("avg_entry_price")
    last_price = _number("current_price")
    open_pnl = _number("unrealized_pl")
    value = _number("market_value")
    direction = str(data.get("side", "long"))
    return PositionInfo(
        ticker=symbol,
        quantity=qty,
        avg_entry_price=entry_price,
        current_price=last_price,
        unrealized_pnl=open_pnl,
        market_value=value,
        side=direction,
    )
def _error_result(
    self,
    ticker: str,
    error: str,
    elapsed_ms: float,
    http_status: int | None = None,
    raw: bytes = b"",
) -> AdapterResult:
    """Assemble the AdapterResult returned when a broker fetch fails.

    The result has no items and an empty content hash; the elapsed time
    is rounded to one decimal and the metadata records the provider and
    this adapter's trading mode.
    """
    failure_fields: dict[str, Any] = {
        "source_type": "broker",
        "ticker": ticker,
        "items": [],
        "raw_payload": raw,
        "content_hash": "",
        "fetched_at": datetime.now(timezone.utc),
        "error": error,
        "http_status": http_status,
        "response_time_ms": round(elapsed_ms, 1),
        "metadata": {"provider": "alpaca", "mode": self._mode.value},
    }
    return AdapterResult(**failure_fields)
+832
View File
@@ -0,0 +1,832 @@
"""Broker adapter service - standalone worker for sandbox order execution.
Runs the Alpaca broker adapter in sandbox (paper) mode, processing order
requests from the broker queue, evaluating them through the risk engine,
submitting to Alpaca's paper trading API, and persisting the full audit trail.
Also periodically syncs positions and account state from Alpaca.
Implements idempotent order submission keys and duplicate prevention:
- Deterministic idempotency key generation from job attributes
- Redis-based fast-path duplicate detection before broker submission
- PostgreSQL UNIQUE constraint on idempotency_key as durable fallback
Requirements: 2.4, 8.1, 8.3, 8.5
Design: Section 4.9 - Broker Adapter
"""
from __future__ import annotations
import asyncio
import hashlib
import json
import logging
import uuid
from datetime import datetime, timezone
from typing import Any
import asyncpg
import redis.asyncio as aioredis
from services.adapters.broker_adapter import (
AlpacaBrokerAdapter,
OrderRequest,
OrderResponse,
OrderSide,
OrderStatus,
OrderType,
TradingMode,
)
from services.risk.engine import (
AccountRiskState,
PortfolioRiskConfig,
ProposedOrder,
evaluate_order,
)
from services.risk.approval import (
ApprovalRequest,
ApprovalStatus,
compute_expiry,
create_approval_request,
requires_approval,
)
from services.shared.audit import (
audit_approval_requested,
audit_duplicate_prevented,
audit_order_filled,
audit_order_rejected,
audit_order_submitted,
audit_risk_evaluated,
)
from services.lake_publisher.worker import (
publish_trade_order,
publish_trade_fill,
publish_positions_daily_batch,
LAKEHOUSE_BUCKET,
)
from services.shared.config import load_config
from services.shared.db import get_pg_pool, get_redis
from services.shared.logging import Span, new_trace_id, set_trace_context, setup_logging
from services.shared.metrics import (
ORDERS_DUPLICATES_PREVENTED,
ORDERS_FILLED,
ORDERS_REJECTED,
ORDERS_SUBMITTED,
POSITIONS_SYNCED,
RISK_CHECK_FAILURES,
RISK_EVALUATIONS_TOTAL,
)
from services.shared.redis_keys import QUEUE_BROKER, queue_key
logger = logging.getLogger("broker_service")
POSITION_SYNC_INTERVAL = 60 # seconds
# Redis TTL for idempotency markers (24 hours)
ORDER_IDEMPOTENCY_TTL = 86400
# Namespace prefix for the Redis idempotency marker keys.
ORDER_IDEMPOTENCY_PREFIX = "stonks:order_idempotency"
# ---------------------------------------------------------------------------
# DB persistence helpers
# ---------------------------------------------------------------------------
# Insert a broker account row, or on id conflict refresh its config/mode
# and mark it active again.
_UPSERT_BROKER_ACCOUNT = """
INSERT INTO broker_accounts (id, provider, account_id, mode, config, active)
VALUES ($1::uuid, $2, $3, $4, $5::jsonb, TRUE)
ON CONFLICT (id) DO UPDATE SET
config = EXCLUDED.config,
mode = EXCLUDED.mode,
active = TRUE
"""
# Insert an order (17 positional parameters). On an idempotency_key
# conflict the existing row keeps its id and only status, broker_order_id
# and the fill columns are refreshed.
_INSERT_ORDER = """
INSERT INTO orders (
id, recommendation_id, broker_account_id, ticker, side, order_type,
quantity, limit_price, stop_price, status, idempotency_key,
broker_order_id, decision_trace, submitted_at, filled_at,
fill_price, fill_quantity
) VALUES (
$1::uuid, $2, $3::uuid, $4, $5, $6,
$7, $8, $9, $10, $11,
$12, $13::jsonb, $14, $15,
$16, $17
)
ON CONFLICT (idempotency_key) DO UPDATE SET
status = EXCLUDED.status,
broker_order_id = EXCLUDED.broker_order_id,
filled_at = EXCLUDED.filled_at,
fill_price = EXCLUDED.fill_price,
fill_quantity = EXCLUDED.fill_quantity,
updated_at = NOW()
"""
# Append one event row for an order: (order_id, event_type, data, broker_timestamp).
_INSERT_ORDER_EVENT = """
INSERT INTO order_events (order_id, event_type, data, broker_timestamp)
VALUES ($1::uuid, $2, $3::jsonb, $4)
"""
# Insert one risk evaluation row (plain insert, no conflict handling).
_INSERT_RISK_EVALUATION = """
INSERT INTO risk_evaluations (id, recommendation_id, eligible, allowed_mode, rejection_reasons, risk_checks, evaluated_at)
VALUES ($1::uuid, $2::uuid, $3, $4, $5::jsonb, $6::jsonb, $7)
"""
# Insert or overwrite the position row keyed by (broker_account_id, ticker).
_UPSERT_POSITION = """
INSERT INTO positions (broker_account_id, ticker, quantity, avg_entry_price, current_price, unrealized_pnl, updated_at)
VALUES ($1::uuid, $2, $3, $4, $5, $6, $7)
ON CONFLICT (broker_account_id, ticker)
DO UPDATE SET
quantity = EXCLUDED.quantity,
avg_entry_price = EXCLUDED.avg_entry_price,
current_price = EXCLUDED.current_price,
unrealized_pnl = EXCLUDED.unrealized_pnl,
updated_at = EXCLUDED.updated_at
"""
# Most recently updated active risk config (at most one row).
_LOAD_RISK_CONFIG = """
SELECT config FROM risk_configs WHERE active = TRUE ORDER BY updated_at DESC LIMIT 1
"""
# Today's risk snapshot for one account ($1 = account id).
# NOTE(review): $1 has no ::uuid cast here, unlike the other statements —
# confirm the column type of daily_risk_snapshots.account_id.
_LOAD_DAILY_SNAPSHOT = """
SELECT portfolio_value, daily_pnl, daily_trade_count, positions_by_sector
FROM daily_risk_snapshots
WHERE account_id = $1 AND snapshot_date = CURRENT_DATE
LIMIT 1
"""
# Look up an existing order by idempotency key (durable duplicate check).
_CHECK_ORDER_BY_IDEMPOTENCY_KEY = """
SELECT id, status, broker_order_id FROM orders
WHERE idempotency_key = $1
LIMIT 1
"""
# ---------------------------------------------------------------------------
# Idempotency helpers (Requirement 8.5)
# ---------------------------------------------------------------------------
def generate_idempotency_key(job: dict[str, Any]) -> str:
    """Return the idempotency key for a broker queue job.

    A truthy ``idempotency_key`` on the job is returned as a string.
    Otherwise the key is the first 40 hex characters of the SHA-256 of
    the job's recommendation_id, ticker, side, quantity, order_type,
    limit_price and stop_price joined with ``|``, so a replayed message
    with identical content yields the same key.

    Values are stringified as-is: ``10`` and ``10.0`` give different keys.
    """
    supplied = job.get("idempotency_key")
    if supplied:
        return str(supplied)

    # (field, default used when the field is absent), in fingerprint order.
    fingerprint_fields = (
        ("recommendation_id", ""),
        ("ticker", ""),
        ("side", "buy"),
        ("quantity", 0),
        ("order_type", "market"),
        ("limit_price", ""),
        ("stop_price", ""),
    )
    fingerprint = "|".join(
        str(job.get(field, default)) for field, default in fingerprint_fields
    )
    digest = hashlib.sha256(fingerprint.encode()).hexdigest()
    return digest[:40]
def _redis_idempotency_key(idempotency_key: str) -> str:
    """Return the namespaced Redis key under which an order's idempotency marker lives."""
    return "{}:{}".format(ORDER_IDEMPOTENCY_PREFIX, idempotency_key)
async def check_idempotency_redis(
    rds: aioredis.Redis,
    idempotency_key: str,
) -> str | None:
    """Fast-path: check Redis for a previously processed idempotency key.

    Args:
        rds: Async Redis client.
        idempotency_key: Order idempotency key to look up.

    Returns:
        The order_id stored for this key, or None when no marker exists
        (or the stored value is empty).
    """
    redis_key = _redis_idempotency_key(idempotency_key)
    cached = await rds.get(redis_key)
    if not cached:
        return None
    # A client created without decode_responses=True returns bytes;
    # str(b"abc") would yield "b'abc'" rather than the stored order id.
    if isinstance(cached, (bytes, bytearray)):
        return cached.decode("utf-8")
    return str(cached)
async def check_idempotency_db(
    pool: asyncpg.Pool,
    idempotency_key: str,
) -> dict[str, Any] | None:
    """Durable duplicate check against the orders table.

    Returns None when no order carries this idempotency key; otherwise a
    dict with string ``id``, ``status`` and ``broker_order_id`` (the
    latter is "" when the stored value is NULL or empty).
    """
    existing = await pool.fetchrow(_CHECK_ORDER_BY_IDEMPOTENCY_KEY, idempotency_key)
    if not existing:
        return None
    found: dict[str, Any] = {}
    found["id"] = str(existing["id"])
    found["status"] = str(existing["status"])
    found["broker_order_id"] = str(existing["broker_order_id"] or "")
    return found
async def mark_idempotency_redis(
    rds: aioredis.Redis,
    idempotency_key: str,
    order_id: str,
) -> None:
    """Record in Redis that ``idempotency_key`` was handled as ``order_id``.

    The marker expires after ORDER_IDEMPOTENCY_TTL seconds.
    """
    await rds.set(
        _redis_idempotency_key(idempotency_key),
        order_id,
        ex=ORDER_IDEMPOTENCY_TTL,
    )
# ---------------------------------------------------------------------------
# Core service logic
# ---------------------------------------------------------------------------
def build_order_request(job: dict[str, Any]) -> OrderRequest:
    """Translate a broker queue job payload into an OrderRequest.

    ``ticker`` is required (KeyError if absent). Any side other than
    "sell" becomes BUY; quantity defaults to 0 and time_in_force to
    "day". The idempotency key comes from generate_idempotency_key.

    NOTE(review): an unrecognized ``order_type`` string silently falls
    back to MARKET — confirm that is intended for misspelled limit/stop
    orders.
    """
    if job.get("side", "buy") == "sell":
        side = OrderSide.SELL
    else:
        side = OrderSide.BUY

    known_types = {
        "market": OrderType.MARKET,
        "limit": OrderType.LIMIT,
        "stop": OrderType.STOP,
        "stop_limit": OrderType.STOP_LIMIT,
    }
    requested_type = job.get("order_type", "market")

    ticker = job["ticker"]
    qty = float(job.get("quantity", 0))
    resolved_type = known_types.get(requested_type, OrderType.MARKET)
    return OrderRequest(
        ticker=ticker,
        side=side,
        quantity=qty,
        order_type=resolved_type,
        limit_price=job.get("limit_price"),
        stop_price=job.get("stop_price"),
        time_in_force=job.get("time_in_force", "day"),
        idempotency_key=generate_idempotency_key(job),
    )
def build_proposed_order(job: dict[str, Any]) -> ProposedOrder:
    """Derive the ProposedOrder handed to risk evaluation from a queue job.

    ``ticker`` is required (KeyError if absent); the job's ``side``
    (default "buy") becomes the proposed ``action``; sector defaults to
    "" and the numeric fields to 0.
    """
    rec_id = job.get("recommendation_id")
    symbol = job["ticker"]
    sector = job.get("sector", "")
    action = job.get("side", "buy")
    qty = float(job.get("quantity", 0))
    est_value = float(job.get("estimated_value", 0))
    conf_score = float(job.get("confidence", 0))
    return ProposedOrder(
        recommendation_id=rec_id,
        ticker=symbol,
        sector=sector,
        action=action,
        quantity=qty,
        estimated_value=est_value,
        confidence=conf_score,
    )
async def load_risk_config(pool: asyncpg.Pool) -> PortfolioRiskConfig:
    """Fetch the active risk configuration, falling back to defaults.

    A default-constructed PortfolioRiskConfig is returned when there is
    no active row or its ``config`` is empty. A non-dict ``config`` value
    is decoded as JSON before being passed to
    ``PortfolioRiskConfig.from_db_json``.
    """
    record = await pool.fetchrow(_LOAD_RISK_CONFIG)
    if not (record and record["config"]):
        return PortfolioRiskConfig()
    stored = record["config"]
    payload = stored if isinstance(stored, dict) else json.loads(stored)
    return PortfolioRiskConfig.from_db_json(payload)
async def load_account_risk_state(
    pool: asyncpg.Pool,
    adapter: AlpacaBrokerAdapter,
    account_uuid: str,
) -> AccountRiskState:
    """Build an AccountRiskState from the broker and daily snapshot.

    Args:
        pool: PostgreSQL pool used to read today's risk snapshot.
        adapter: Broker adapter queried for account and positions.
        account_uuid: Account id used for the state and the snapshot lookup.

    Returns:
        AccountRiskState populated from the broker's account and position
        data, then overlaid with daily_pnl, daily_trade_count and
        positions_by_sector from today's snapshot row when one exists.
        Broker failures are logged and leave the corresponding fields at
        whatever AccountRiskState initializes them to.
    """
    state = AccountRiskState(account_id=account_uuid)
    # Get live account info from Alpaca
    # NOTE(review): AlpacaBrokerAdapter.get_account catches its own errors and
    # returns a zeroed AccountInfo, so a broker outage likely shows up here as
    # zero balances rather than through this except branch — confirm.
    try:
        acct = await adapter.get_account()
        state.portfolio_value = acct.portfolio_value
        state.cash = acct.cash
        state.buying_power = acct.buying_power
    except Exception as e:
        logger.warning("Failed to fetch account from Alpaca: %s", e)
    # Get positions from Alpaca
    try:
        positions = await adapter.get_positions()
        # Exposure per symbol is the broker-reported market value.
        for pos in positions:
            state.positions_by_symbol[pos.ticker] = pos.market_value
        state.open_position_count = len(positions)
    except Exception as e:
        logger.warning("Failed to fetch positions from Alpaca: %s", e)
    # Overlay daily snapshot from DB
    # (the snapshot's portfolio_value column is selected but not used here)
    row = await pool.fetchrow(_LOAD_DAILY_SNAPSHOT, account_uuid)
    if row:
        state.daily_pnl = float(row["daily_pnl"] or 0)
        state.daily_trade_count = int(row["daily_trade_count"] or 0)
        sector_data = row["positions_by_sector"]
        if sector_data:
            # Accept either an already-decoded dict or a JSON string.
            state.positions_by_sector = (
                sector_data if isinstance(sector_data, dict) else json.loads(sector_data)
            )
    return state
async def persist_order(
    pool: asyncpg.Pool,
    order_id: str,
    order: OrderRequest,
    resp: OrderResponse,
    account_uuid: str,
    risk_eval: dict[str, Any],
    recommendation_id: str | None = None,
) -> None:
    """Write the order row and its lifecycle events inside one transaction.

    A ``"submitted"`` event is always recorded, followed by a ``"fill"`` event
    when the broker response status is FILLED or a ``"rejected"`` event when it
    is REJECTED.

    Args:
        pool: Connection pool to acquire the transactional connection from.
        order_id: Identifier of the order row and its events.
        order: The request that was (or would have been) sent to the broker.
        resp: The broker response, or a synthetic response built by the caller.
        account_uuid: Broker account the order belongs to.
        risk_eval: Risk evaluation summary embedded in the decision trace.
        recommendation_id: Originating recommendation, if any.
    """
    recorded_at = datetime.now(timezone.utc)
    is_filled = resp.status == OrderStatus.FILLED
    trace = {
        "risk_evaluation": risk_eval,
        "order_request": order.to_dict(),
        "broker_response": resp.to_dict(),
    }

    async with pool.acquire() as conn, conn.transaction():

        async def record_event(kind: str, payload: dict[str, Any]) -> None:
            # All events share the same timestamp as the order row fallback.
            await conn.execute(
                _INSERT_ORDER_EVENT,
                order_id,
                kind,
                json.dumps(payload),
                recorded_at,
            )

        await conn.execute(
            _INSERT_ORDER,
            order_id,
            recommendation_id,
            account_uuid,
            order.ticker,
            order.side.value,
            order.order_type.value,
            order.quantity,
            order.limit_price,
            order.stop_price,
            resp.status.value,
            order.idempotency_key,
            resp.broker_order_id,
            json.dumps(trace),
            resp.submitted_at or recorded_at,
            recorded_at if is_filled else None,
            resp.filled_avg_price,
            resp.filled_quantity,
        )

        await record_event(
            "submitted",
            {"ticker": order.ticker, "side": order.side.value},
        )
        if is_filled:
            await record_event(
                "fill",
                {
                    "fill_price": resp.filled_avg_price,
                    "fill_qty": resp.filled_quantity,
                },
            )
        elif resp.status == OrderStatus.REJECTED:
            await record_event("rejected", {"error": resp.error})
async def sync_positions(
    adapter: AlpacaBrokerAdapter,
    pool: asyncpg.Pool,
    account_uuid: str,
    minio_client: Any | None = None,
) -> None:
    """Sync current positions from Alpaca to PostgreSQL and publish to lake.

    All upserts for one snapshot run inside a single transaction, so a failure
    part-way through leaves the previously stored positions untouched instead
    of a mix of old and new rows. This matches how ``persist_order`` groups
    its writes.

    The function never raises: any failure is logged as
    ``"Position sync failed"`` so that a periodic caller keeps running. Lake
    publishing is best-effort and is only attempted when a MinIO client is
    supplied and at least one position was returned.

    Args:
        adapter: Broker adapter providing ``get_positions``.
        pool: Connection pool used for the ``_UPSERT_POSITION`` statements.
        account_uuid: Account the positions are stored and published under.
        minio_client: Optional object-store client for lake publishing.
    """
    now = datetime.now(timezone.utc)
    try:
        positions = await adapter.get_positions()
        async with pool.acquire() as conn:
            # Atomic snapshot: either every position row is updated or none is.
            async with conn.transaction():
                for pos in positions:
                    await conn.execute(
                        _UPSERT_POSITION,
                        account_uuid,
                        pos.ticker,
                        pos.quantity,
                        pos.avg_entry_price,
                        pos.current_price,
                        pos.unrealized_pnl,
                        now,
                    )
        logger.info("Synced %d positions from Alpaca", len(positions))
        POSITIONS_SYNCED.inc()
        # Publish positions snapshot to analytical lake
        if minio_client is not None and positions:
            try:
                pos_dicts = [
                    {
                        "ticker": p.ticker,
                        "quantity": p.quantity,
                        "avg_entry_price": p.avg_entry_price,
                        "close_price": p.current_price,
                        "unrealized_pnl": p.unrealized_pnl,
                    }
                    for p in positions
                ]
                publish_positions_daily_batch(
                    minio_client, pos_dicts, account_uuid, now,
                )
            except Exception as e:
                # A lake outage must not fail the database sync.
                logger.warning("Failed to publish positions to lake: %s", e)
    except Exception as e:
        logger.error("Position sync failed: %s", e)
async def register_broker_account(
    pool: asyncpg.Pool,
    account_uuid: str,
    adapter: AlpacaBrokerAdapter,
) -> None:
    """Upsert the broker account row using live account details from the adapter.

    Failures (broker lookup or database write) are logged as errors and
    swallowed, so the function never raises.

    Args:
        pool: Connection pool used to run ``_UPSERT_BROKER_ACCOUNT``.
        account_uuid: Primary identifier for the account row; also used as the
            external id when the broker reports none.
        adapter: Broker adapter providing ``get_account`` and ``mode``.
    """
    try:
        account = await adapter.get_account()
        mode_label = adapter.mode.value
        snapshot = {
            "provider": "alpaca",
            "buying_power": account.buying_power,
            "cash": account.cash,
            "portfolio_value": account.portfolio_value,
        }
        await pool.execute(
            _UPSERT_BROKER_ACCOUNT,
            account_uuid,
            "alpaca",
            account.account_id or account_uuid,
            mode_label,
            json.dumps(snapshot),
        )
        logger.info(
            "Registered Alpaca account: id=%s mode=%s portfolio=%.2f",
            account.account_id, mode_label, account.portfolio_value,
        )
    except Exception as e:
        logger.error("Failed to register broker account: %s", e)
async def process_order_job(
    job: dict[str, Any],
    adapter: AlpacaBrokerAdapter,
    pool: asyncpg.Pool,
    account_uuid: str,
    rds: aioredis.Redis | None = None,
    minio_client: Any | None = None,
) -> None:
    """Process a single order job from the broker queue.

    Steps, in order:

    1. Generate deterministic idempotency key
    2. Check Redis + DB for duplicate (Req 8.5); return early on a hit
    3. Build proposed order and run risk evaluation
    4. If the risk engine rejects, persist and audit the rejection and return
    5. If operator approval is required, create an approval request and return
    6. Otherwise submit to Alpaca
    7. Persist order, events, and risk evaluation
    8. Set Redis idempotency marker and record metrics / audit entries

    Args:
        job: Decoded queue payload describing the order.
        adapter: Broker adapter used for account state and order submission.
        pool: PostgreSQL pool used for idempotency checks, persistence, audit.
        account_uuid: Broker account the order is placed under.
        rds: Optional Redis client; when None the Redis idempotency fast path
            and marker writes are skipped.
        minio_client: Optional object-store client; when None nothing is
            published to the analytical lake.
    """
    # "???" is only a display fallback for logs/audit when the job has no ticker.
    ticker = job.get("ticker", "???")
    order_id = str(uuid.uuid4())
    idempotency_key = generate_idempotency_key(job)
    # --- Duplicate prevention (Requirement 8.5) ---
    # Fast path: Redis check
    if rds is not None:
        existing_order_id = await check_idempotency_redis(rds, idempotency_key)
        if existing_order_id:
            logger.info(
                "Duplicate order detected (redis) for %s key=%s existing=%s",
                ticker, idempotency_key[:16], existing_order_id,
            )
            ORDERS_DUPLICATES_PREVENTED.labels(detected_via="redis").inc()
            await audit_duplicate_prevented(
                pool, existing_order_id, ticker, idempotency_key, detected_via="redis",
            )
            return
    # Durable fallback: DB check
    existing = await check_idempotency_db(pool, idempotency_key)
    if existing:
        logger.info(
            "Duplicate order detected (db) for %s key=%s existing=%s status=%s",
            ticker, idempotency_key[:16], existing["id"], existing["status"],
        )
        ORDERS_DUPLICATES_PREVENTED.labels(detected_via="db").inc()
        await audit_duplicate_prevented(
            pool, existing["id"], ticker, idempotency_key, detected_via="db",
        )
        # Warm Redis cache for future fast-path hits
        if rds is not None:
            await mark_idempotency_redis(rds, idempotency_key, existing["id"])
        return
    # Risk evaluation
    risk_config = await load_risk_config(pool)
    risk_state = await load_account_risk_state(pool, adapter, account_uuid)
    proposed = build_proposed_order(job)
    evaluation = evaluate_order(proposed, risk_config, risk_state)
    # JSON-friendly summary; reused for the risk row and the order's decision trace.
    risk_eval_dict = {
        "evaluation_id": evaluation.evaluation_id,
        "eligible": evaluation.eligible,
        "allowed_mode": evaluation.allowed_mode.value,
        "rejection_reasons": evaluation.rejection_reasons,
        "checks": [c.model_dump(mode="json") for c in evaluation.checks],
    }
    # Persist risk evaluation
    rec_id = job.get("recommendation_id")
    try:
        await pool.execute(
            _INSERT_RISK_EVALUATION,
            evaluation.evaluation_id,
            rec_id,
            evaluation.eligible,
            evaluation.allowed_mode.value,
            json.dumps(evaluation.rejection_reasons),
            json.dumps(risk_eval_dict["checks"]),
            evaluation.evaluated_at,
        )
    except Exception as e:
        # Best-effort: a failed insert here does not stop order processing.
        logger.warning("Failed to persist risk evaluation: %s", e)
    # Audit: risk evaluation result
    await audit_risk_evaluated(
        pool,
        evaluation_id=evaluation.evaluation_id,
        recommendation_id=rec_id,
        ticker=ticker,
        eligible=evaluation.eligible,
        allowed_mode=evaluation.allowed_mode.value,
        rejection_reasons=evaluation.rejection_reasons,
        check_count=len(evaluation.checks),
    )
    if not evaluation.eligible:
        RISK_EVALUATIONS_TOTAL.labels(result="rejected").inc()
        # One failure counter increment per failed check, labelled by check name.
        for check in evaluation.checks:
            if check.result.value == "fail":
                RISK_CHECK_FAILURES.labels(check_name=check.check_name).inc()
        ORDERS_REJECTED.labels(reason_category="risk_engine").inc()
        logger.info(
            "Order rejected by risk engine for %s: %s",
            ticker, evaluation.rejection_reasons,
        )
        # Persist the rejected order for audit
        order_req = build_order_request(job)
        # Synthetic response: the broker is never contacted on this path,
        # so there is no broker order id.
        rejected_resp = OrderResponse(
            broker_order_id="",
            status=OrderStatus.REJECTED,
            ticker=ticker,
            side=OrderSide.SELL if job.get("side") == "sell" else OrderSide.BUY,
            quantity=float(job.get("quantity", 0)),
            error=f"Risk rejected: {'; '.join(evaluation.rejection_reasons)}",
        )
        await persist_order(
            pool, order_id, order_req, rejected_resp,
            account_uuid, risk_eval_dict, rec_id,
        )
        # Publish rejected order fact to analytical lake
        if minio_client is not None:
            try:
                publish_trade_order(
                    minio_client, order_id, ticker,
                    side=job.get("side", "buy"),
                    order_type=job.get("order_type", "market"),
                    quantity=float(job.get("quantity", 0)),
                    limit_price=job.get("limit_price"),
                    status="rejected",
                    broker_account=account_uuid,
                    submitted_at=datetime.now(timezone.utc),
                )
            except Exception as e:
                logger.warning("Failed to publish rejected order to lake: %s", e)
        # Audit: order rejected by risk engine
        await audit_order_rejected(
            pool, order_id, ticker,
            reason=f"Risk rejected: {'; '.join(evaluation.rejection_reasons)}",
            source="risk_engine",
        )
        # Mark idempotency even for rejected orders to prevent reprocessing
        if rds is not None:
            await mark_idempotency_redis(rds, idempotency_key, order_id)
        return
    # --- Operator approval gate (Requirement 8.2) ---
    if requires_approval(risk_config, evaluation.allowed_mode):
        expiry = compute_expiry(risk_config)
        approval_req = ApprovalRequest(
            order_job=job,
            recommendation_id=rec_id,
            ticker=ticker,
            side=job.get("side", "buy"),
            quantity=float(job.get("quantity", 0)),
            estimated_value=float(job.get("estimated_value", 0)),
            risk_evaluation_id=evaluation.evaluation_id,
            expires_at=expiry,
        )
        try:
            await create_approval_request(pool, approval_req)
            logger.info(
                "Order for %s held for operator approval (id=%s, expires=%s)",
                ticker, approval_req.approval_id, expiry.isoformat(),
            )
            await audit_approval_requested(
                pool,
                approval_id=approval_req.approval_id,
                ticker=ticker,
                side=approval_req.side,
                quantity=approval_req.quantity,
                estimated_value=approval_req.estimated_value,
                recommendation_id=rec_id,
                expires_at=expiry.isoformat(),
            )
        except Exception as e:
            # NOTE(review): on failure the job is only logged and then dropped
            # by the return below; nothing here re-queues it. Confirm intended.
            logger.error("Failed to create approval request for %s: %s", ticker, e)
        # Do NOT mark idempotency — the job will be re-submitted after approval
        return
    # Submit to Alpaca
    order_req = build_order_request(job)
    # The "passed" metric is only counted for orders that reach submission;
    # orders held for approval return above without incrementing it.
    RISK_EVALUATIONS_TOTAL.labels(result="passed").inc()
    # Audit: order submitted to broker
    await audit_order_submitted(
        pool,
        order_id=order_id,
        ticker=ticker,
        side=order_req.side.value,
        quantity=order_req.quantity,
        order_type=order_req.order_type.value,
        idempotency_key=order_req.idempotency_key,
        recommendation_id=rec_id,
        evaluation_id=evaluation.evaluation_id,
    )
    resp = await adapter.submit_order(order_req)
    await persist_order(
        pool, order_id, order_req, resp,
        account_uuid, risk_eval_dict, rec_id,
    )
    # Publish order fact to analytical lake
    if minio_client is not None:
        try:
            publish_trade_order(
                minio_client, order_id, ticker,
                side=order_req.side.value,
                order_type=order_req.order_type.value,
                quantity=order_req.quantity,
                limit_price=order_req.limit_price,
                status=resp.status.value,
                broker_account=account_uuid,
                submitted_at=resp.submitted_at or datetime.now(timezone.utc),
            )
        except Exception as e:
            logger.warning("Failed to publish order to lake: %s", e)
        # Publish fill fact if the order was filled
        if resp.status == OrderStatus.FILLED and resp.filled_avg_price is not None:
            try:
                fill_id = str(uuid.uuid4())
                publish_trade_fill(
                    minio_client, fill_id, order_id, ticker,
                    side=order_req.side.value,
                    fill_price=resp.filled_avg_price,
                    fill_quantity=resp.filled_quantity,
                    broker_account=account_uuid,
                    filled_at=datetime.now(timezone.utc),
                )
            except Exception as e:
                logger.warning("Failed to publish fill to lake: %s", e)
    # Mark idempotency after successful persistence
    if rds is not None:
        await mark_idempotency_redis(rds, idempotency_key, order_id)
    if resp.ok:
        mode = "paper" if adapter.mode == TradingMode.PAPER else "live"
        ORDERS_SUBMITTED.labels(
            side=order_req.side.value,
            order_type=order_req.order_type.value,
            mode=mode,
        ).inc()
        logger.info(
            "Order submitted to Alpaca: %s %s %.0f %s @ %s | broker_id=%s",
            resp.status.value, order_req.side.value, order_req.quantity,
            ticker, resp.filled_avg_price, resp.broker_order_id,
        )
        # Audit: order filled
        if resp.status == OrderStatus.FILLED:
            ORDERS_FILLED.labels(side=order_req.side.value).inc()
            await audit_order_filled(
                pool, order_id, ticker,
                side=order_req.side.value,
                fill_quantity=resp.filled_quantity,
                fill_price=resp.filled_avg_price,
                broker_order_id=resp.broker_order_id,
            )
    else:
        ORDERS_REJECTED.labels(reason_category="broker").inc()
        logger.warning(
            "Order failed for %s: %s (status=%s)",
            ticker, resp.error, resp.status.value,
        )
        # Audit: order rejected by broker
        await audit_order_rejected(
            pool, order_id, ticker,
            reason=resp.error or f"Broker status: {resp.status.value}",
            source="broker",
        )
async def position_sync_loop(
    adapter: AlpacaBrokerAdapter,
    pool: asyncpg.Pool,
    account_uuid: str,
    minio_client: Any | None = None,
) -> None:
    """Run ``sync_positions`` forever, pausing between rounds.

    The loop has no exit condition of its own; it ends only when the
    surrounding task is cancelled or an exception escapes ``sync_positions``.

    Args:
        adapter: Broker adapter forwarded to ``sync_positions``.
        pool: PostgreSQL pool forwarded to ``sync_positions``.
        account_uuid: Account identifier forwarded to ``sync_positions``.
        minio_client: Optional object-store client forwarded to
            ``sync_positions``.
    """
    while True:
        await sync_positions(
            adapter=adapter,
            pool=pool,
            account_uuid=account_uuid,
            minio_client=minio_client,
        )
        # Wait POSITION_SYNC_INTERVAL (passed straight to asyncio.sleep).
        await asyncio.sleep(POSITION_SYNC_INTERVAL)
async def main() -> None:
    """Run the broker service: start position sync and drain the order queue.

    Startup builds the PostgreSQL pool, Redis client, MinIO client (creating
    the lakehouse bucket when absent) and the Alpaca adapter, registers the
    broker account, and launches the background position-sync task. The main
    loop then pops jobs from the broker queue one at a time and hands each to
    ``process_order_job``; a failing job is logged and the loop continues.

    On exit the background task is cancelled and awaited before the pool and
    Redis client are closed.
    """
    config = load_config()
    setup_logging("broker_service", level=config.log_level, json_output=config.json_logs)
    pool = await get_pg_pool(config)
    rds = get_redis(config)
    # Initialize MinIO client for lake publishing
    from minio import Minio
    minio_client = Minio(
        config.minio.endpoint,
        access_key=config.minio.access_key,
        secret_key=config.minio.secret_key,
        secure=config.minio.secure,
    )
    # Ensure lakehouse bucket exists
    if not minio_client.bucket_exists(LAKEHOUSE_BUCKET):
        minio_client.make_bucket(LAKEHOUSE_BUCKET)
    # Determine mode — default to paper for safety (Req 8.1)
    mode = TradingMode.LIVE if config.broker.mode == "live" else TradingMode.PAPER
    if mode == TradingMode.LIVE:
        logger.warning("LIVE trading mode enabled — orders will be submitted to real broker")
    adapter = AlpacaBrokerAdapter(
        api_key=config.broker.api_key or "",
        api_secret=config.broker.api_secret or "",
        mode=mode,
        base_url=config.broker.base_url,
    )
    # Generate a stable account UUID from the API key
    account_uuid = str(uuid.uuid5(uuid.NAMESPACE_DNS, f"alpaca-{config.broker.api_key or 'default'}"))
    # Register broker account on startup
    await register_broker_account(pool, account_uuid, adapter)
    # Start position sync in background
    sync_task = asyncio.create_task(
        position_sync_loop(adapter, pool, account_uuid, minio_client)
    )
    queue = queue_key(QUEUE_BROKER)
    logger.info("Broker service started (mode=%s)", mode.value)
    try:
        while True:
            result = await rds.lpop(queue)
            if not result:
                # Queue empty: back off briefly before polling again.
                await asyncio.sleep(2)
                continue
            # A Redis client that does not decode responses returns bytes;
            # str(bytes) would yield "b'...'" and break JSON parsing.
            if isinstance(result, (bytes, bytearray)):
                raw = result.decode("utf-8")
            else:
                raw = str(result)
            try:
                job = json.loads(raw)
                await process_order_job(job, adapter, pool, account_uuid, rds, minio_client)
            except Exception:
                # One bad job must not take the service down.
                logger.exception("Error processing broker job")
    finally:
        sync_task.cancel()
        # Let the cancelled task unwind (and release any pooled connection)
        # before the pool is closed; its CancelledError is collected, not raised.
        await asyncio.gather(sync_task, return_exceptions=True)
        await pool.close()
        await rds.close()
# Script entry point: run the async service loop only when executed directly.
if __name__ == "__main__":
    asyncio.run(main())
+170 -27
View File
@@ -1,8 +1,17 @@
"""Filings / Regulatory API adapter - fetches SEC-style submissions.""" """Filings / Regulatory API adapter interface and concrete SEC EDGAR provider.
The FilingsDataAdapter is the abstract interface for all filings data providers.
SECEdgarAdapter is the first concrete implementation, targeting the SEC EDGAR
full-text search system (EFTS) for company filings discovery.
Requirements: 2.3, 2.5, 3.1, 3.2, 3.3
"""
import hashlib import hashlib
import logging import logging
from datetime import datetime import time
from typing import Any, Dict from abc import ABC
from datetime import datetime, timezone
from typing import Any
import httpx import httpx
@@ -11,48 +20,182 @@ from .base import AdapterResult, BaseAdapter
logger = logging.getLogger("filings_adapter") logger = logging.getLogger("filings_adapter")
class FilingsAdapter(BaseAdapter): class FilingsDataAdapter(BaseAdapter, ABC):
"""Concrete adapter for SEC EDGAR or similar filings API.""" """Abstract interface for filings / regulatory data providers.
def __init__(self, base_url: str = "https://efts.sec.gov", user_agent: str = "StonksOracle/1.0"): Subclasses implement fetch() for their specific filings API.
self.base_url = base_url source_type() is concrete here since all filings adapters share the same type.
self.user_agent = user_agent """
def source_type(self) -> str: def source_type(self) -> str:
return "filings_api" return "filings_api"
async def fetch(self, ticker: str, config: Dict[str, Any]) -> AdapterResult:
_cik = config.get("cik", "")
endpoint = config.get("endpoint", f"/LATEST/search-index?q=%22{ticker}%22&dateRange=custom&startdt=2026-01-01&forms=8-K,10-Q,10-K")
url = f"{self.base_url}{endpoint}"
headers = {"User-Agent": self.user_agent} class SECEdgarAdapter(FilingsDataAdapter):
"""Concrete adapter for the SEC EDGAR full-text search system (EFTS).
Supports:
- Full-text search (/LATEST/search-index) for 8-K, 10-Q, 10-K, and other forms
- Filtering by date range, form type, and entity
The SEC EDGAR EFTS API is public and does not require an API key,
but requires a descriptive User-Agent header per SEC fair-access policy.
Config options:
cik: Company CIK number (optional, narrows search)
forms: Comma-separated form types to search (default "8-K,10-Q,10-K")
start_date: Only filings on or after this date, YYYY-MM-DD (optional)
end_date: Only filings on or before this date, YYYY-MM-DD (optional)
query: Custom search query override (optional, replaces ticker-based query)
"""
SEARCH_ENDPOINT: str = "/LATEST/search-index"
def __init__(
self,
base_url: str = "https://efts.sec.gov",
user_agent: str = "StonksOracle/1.0 ([email])",
) -> None:
self.base_url: str = base_url.rstrip("/")
self.user_agent: str = user_agent
async def fetch(self, ticker: str, config: dict[str, Any]) -> AdapterResult:
"""Fetch filings from SEC EDGAR EFTS for a given ticker.
Args:
ticker: The company ticker symbol.
config: Source-specific configuration from the sources table.
Returns:
AdapterResult with raw payload, parsed filing items, and metadata.
"""
url, params, headers = self._build_request(ticker, config)
async with httpx.AsyncClient(timeout=30) as client: async with httpx.AsyncClient(timeout=30) as client:
t0 = time.monotonic()
try: try:
resp = await client.get(url, headers=headers) resp = await client.get(url, params=params, headers=headers)
elapsed_ms = (time.monotonic() - t0) * 1000
resp.raise_for_status() resp.raise_for_status()
raw = resp.content raw = resp.content
data = resp.json() data = resp.json()
content_hash = hashlib.sha256(raw).hexdigest() content_hash = hashlib.sha256(raw).hexdigest()
items = self._extract_items(data)
hits = data.get("hits", {}).get("hits", [])
return AdapterResult( return AdapterResult(
source_type="filings_api", source_type="filings_api",
ticker=ticker, ticker=ticker,
items=hits, items=items,
raw_payload=raw, raw_payload=raw,
content_hash=content_hash, content_hash=content_hash,
fetched_at=datetime.utcnow(), fetched_at=datetime.now(timezone.utc),
http_status=resp.status_code,
response_time_ms=round(elapsed_ms, 1),
metadata={
"provider": "sec_edgar",
"results_count": len(items),
"total_hits": self._total_hits(data),
"query": params.get("q", ""),
"forms": params.get("forms", ""),
},
) )
except httpx.HTTPStatusError as e:
elapsed_ms = (time.monotonic() - t0) * 1000
logger.error("SEC EDGAR HTTP error for %s: %s", ticker, e)
return self._error_result(
ticker, str(e), elapsed_ms,
http_status=e.response.status_code if e.response else None,
raw=e.response.content if e.response else b"",
)
except httpx.TimeoutException as e:
elapsed_ms = (time.monotonic() - t0) * 1000
logger.error("SEC EDGAR timeout for %s: %s", ticker, e)
return self._error_result(ticker, f"timeout: {e}", elapsed_ms)
except Exception as e: except Exception as e:
logger.error(f"Filings fetch failed for {ticker}: {e}") elapsed_ms = (time.monotonic() - t0) * 1000
return AdapterResult( logger.error("SEC EDGAR fetch failed for %s: %s", ticker, e)
source_type="filings_api", return self._error_result(ticker, str(e), elapsed_ms)
ticker=ticker,
items=[], def _build_request(
raw_payload=b"", self, ticker: str, config: dict[str, Any]
content_hash="", ) -> tuple[str, dict[str, str], dict[str, str]]:
fetched_at=datetime.utcnow(), """Build the URL, query params, and headers for an EDGAR EFTS request."""
error=str(e), params: dict[str, str] = {}
) headers: dict[str, str] = {"User-Agent": self.user_agent}
# Query: use custom override or default to ticker-based search
query = config.get("query")
if query:
params["q"] = str(query)
else:
params["q"] = f'"{ticker}"'
# Form types filter
forms = config.get("forms", "8-K,10-Q,10-K")
params["forms"] = str(forms)
# Date range
if config.get("start_date"):
params["dateRange"] = "custom"
params["startdt"] = str(config["start_date"])
if config.get("end_date"):
params["dateRange"] = "custom"
params["enddt"] = str(config["end_date"])
# CIK filter (entity-level narrowing)
cik = config.get("cik")
if cik:
params["q"] = f'{params["q"]} AND cik:{cik}'
url = f"{self.base_url}{self.SEARCH_ENDPOINT}"
return url, params, headers
def _extract_items(self, data: dict[str, Any]) -> list[dict[str, Any]]:
    """Return the list stored at ``data["hits"]["hits"]``, or ``[]``.

    EFTS returns results under hits.hits as a list of objects,
    each containing _source with fields like file_date, form_type,
    entity_name, file_num, and period_of_report.

    An empty list is returned when the outer ``hits`` value is not a dict
    or the inner ``hits`` value is not a list.
    """
    envelope = data.get("hits", {})
    inner = envelope.get("hits", []) if isinstance(envelope, dict) else None
    return inner if isinstance(inner, list) else []
def _total_hits(self, data: dict[str, Any]) -> int:
    """Extract total hit count from EFTS response.

    Reads ``data["hits"]["total"]``, which may be either a dict carrying the
    count under ``"value"`` or a bare integer. Any other shape yields ``0``.

    The count is metadata only, so a null or non-numeric ``"value"`` also
    yields ``0`` rather than raising and failing the surrounding fetch.
    """
    hits_wrapper = data.get("hits", {})
    if not isinstance(hits_wrapper, dict):
        return 0
    total = hits_wrapper.get("total", {})
    if isinstance(total, dict):
        try:
            return int(total.get("value", 0))
        except (TypeError, ValueError):
            # e.g. {"value": null} or a non-numeric string.
            return 0
    if isinstance(total, int):
        return total
    return 0
def _error_result(
    self,
    ticker: str,
    error: str,
    elapsed_ms: float,
    http_status: int | None = None,
    raw: bytes = b"",
) -> AdapterResult:
    """Build the AdapterResult returned when a filings fetch fails.

    Args:
        ticker: Symbol the failed request was for.
        error: Error description stored on the result.
        elapsed_ms: Request duration; stored rounded to one decimal place.
        http_status: HTTP status code, when a response was received.
        raw: Response body, when one was received.

    Returns:
        An AdapterResult with no items, an empty content hash, the current
        UTC time as ``fetched_at`` and ``{"provider": "sec_edgar"}`` metadata.
    """
    failure_fields: dict[str, Any] = {
        "source_type": "filings_api",
        "ticker": ticker,
        "items": [],
        "raw_payload": raw,
        "content_hash": "",
        "fetched_at": datetime.now(timezone.utc),
        "error": error,
        "http_status": http_status,
        "response_time_ms": round(elapsed_ms, 1),
        "metadata": {"provider": "sec_edgar"},
    }
    return AdapterResult(**failure_fields)
+145 -27
View File
@@ -1,8 +1,16 @@
"""Market data API adapter - fetches quotes, bars, and reference data.""" """Market data API adapter interface and concrete Polygon.io provider.
The MarketDataAdapter is the abstract interface for all market data providers.
PolygonMarketAdapter is the first concrete implementation, targeting the
Polygon.io REST API for previous-day bars, quotes, and ticker details.
Requirements: 2.1, 2.5, 3.1, 3.2, 3.3
"""
import hashlib import hashlib
import logging import logging
from datetime import datetime import time
from typing import Any, Dict from datetime import datetime, timezone
from typing import Any
import httpx import httpx
@@ -12,48 +20,158 @@ logger = logging.getLogger("market_adapter")
class MarketDataAdapter(BaseAdapter): class MarketDataAdapter(BaseAdapter):
"""Concrete adapter for a market data provider (e.g., Alpha Vantage, Polygon, Yahoo).""" """Abstract interface for market data providers.
def __init__(self, api_key: str = "", base_url: str = ""): Subclasses implement fetch() for their specific market data API.
self.api_key = api_key """
self.base_url = base_url
def source_type(self) -> str: def source_type(self) -> str:
return "market_api" return "market_api"
async def fetch(self, ticker: str, config: Dict[str, Any]) -> AdapterResult:
endpoint = config.get("endpoint", "/v2/aggs/ticker/{ticker}/prev") class PolygonMarketAdapter(MarketDataAdapter):
url = f"{self.base_url}{endpoint.format(ticker=ticker)}" """Concrete adapter for the Polygon.io REST API.
params = config.get("params", {})
if self.api_key: Supports:
params["apiKey"] = self.api_key - Previous-day aggregate bars (/v2/aggs/ticker/{ticker}/prev)
- Grouped daily bars (/v2/aggs/grouped/locale/us/market/stocks/{date})
- Ticker details (/v3/reference/tickers/{ticker})
The endpoint is selected via the source config's "endpoint" field,
defaulting to previous-day bars.
"""
PREV_BARS = "/v2/aggs/ticker/{ticker}/prev"
RANGE_BARS = "/v2/aggs/ticker/{ticker}/range/{multiplier}/{timespan}/{from_date}/{to_date}"
TICKER_DETAILS = "/v3/reference/tickers/{ticker}"
def __init__(self, api_key: str, base_url: str = "https://api.polygon.io") -> None:
self.api_key: str = api_key
self.base_url: str = base_url.rstrip("/")
async def fetch(self, ticker: str, config: dict[str, Any]) -> AdapterResult:
"""Fetch market data from Polygon.io for a given ticker.
Config options:
endpoint: One of "prev_bars" (default), "range_bars", "ticker_details"
multiplier: Bar multiplier for range queries (default 1)
timespan: Bar timespan for range queries (default "day")
from_date: Start date for range queries (YYYY-MM-DD)
to_date: End date for range queries (YYYY-MM-DD)
adjusted: Whether bars are adjusted for splits (default true)
"""
endpoint_key = config.get("endpoint", "prev_bars")
url, params = self._build_request(ticker, endpoint_key, config)
async with httpx.AsyncClient(timeout=30) as client: async with httpx.AsyncClient(timeout=30) as client:
t0 = time.monotonic()
try: try:
resp = await client.get(url, params=params) resp = await client.get(url, params=params)
elapsed_ms = (time.monotonic() - t0) * 1000
resp.raise_for_status() resp.raise_for_status()
raw = resp.content raw = resp.content
data = resp.json() data = resp.json()
content_hash = hashlib.sha256(raw).hexdigest() content_hash = hashlib.sha256(raw).hexdigest()
items = self._extract_items(data, endpoint_key)
items = data.get("results", [data]) if isinstance(data, dict) else data
return AdapterResult( return AdapterResult(
source_type="market_api", source_type="market_api",
ticker=ticker, ticker=ticker,
items=items if isinstance(items, list) else [items], items=items,
raw_payload=raw, raw_payload=raw,
content_hash=content_hash, content_hash=content_hash,
fetched_at=datetime.utcnow(), fetched_at=datetime.now(timezone.utc),
http_status=resp.status_code,
response_time_ms=round(elapsed_ms, 1),
metadata={
"provider": "polygon",
"endpoint": endpoint_key,
"results_count": data.get("resultsCount", len(items)),
"request_id": data.get("request_id", ""),
},
) )
except httpx.HTTPStatusError as e:
elapsed_ms = (time.monotonic() - t0) * 1000
logger.error("Polygon HTTP error for %s: %s", ticker, e)
return self._error_result(
ticker, str(e), elapsed_ms,
http_status=e.response.status_code if e.response else None,
raw=e.response.content if e.response else b"",
)
except httpx.TimeoutException as e:
elapsed_ms = (time.monotonic() - t0) * 1000
logger.error("Polygon timeout for %s: %s", ticker, e)
return self._error_result(ticker, f"timeout: {e}", elapsed_ms)
except Exception as e: except Exception as e:
logger.error(f"Market fetch failed for {ticker}: {e}") elapsed_ms = (time.monotonic() - t0) * 1000
return AdapterResult( logger.error("Polygon fetch failed for %s: %s", ticker, e)
source_type="market_api", return self._error_result(ticker, str(e), elapsed_ms)
ticker=ticker,
items=[], def _build_request(
raw_payload=b"", self, ticker: str, endpoint_key: str, config: dict[str, Any]
content_hash="", ) -> tuple[str, dict[str, str]]:
fetched_at=datetime.utcnow(), """Build the URL and query params for a Polygon request."""
error=str(e), params: dict[str, str] = {"apiKey": self.api_key}
)
if endpoint_key == "range_bars":
multiplier = str(config.get("multiplier", 1))
timespan = config.get("timespan", "day")
from_date = config.get("from_date", "")
to_date = config.get("to_date", "")
path = self.RANGE_BARS.format(
ticker=ticker,
multiplier=multiplier,
timespan=timespan,
from_date=from_date,
to_date=to_date,
)
if config.get("adjusted") is not None:
params["adjusted"] = str(config["adjusted"]).lower()
if config.get("sort"):
params["sort"] = config["sort"]
if config.get("limit"):
params["limit"] = str(config["limit"])
elif endpoint_key == "ticker_details":
path = self.TICKER_DETAILS.format(ticker=ticker)
else:
# Default: previous-day bars
path = self.PREV_BARS.format(ticker=ticker)
if config.get("adjusted") is not None:
params["adjusted"] = str(config["adjusted"]).lower()
return f"{self.base_url}{path}", params
def _extract_items(self, data: dict[str, Any], endpoint_key: str) -> list[dict[str, Any]]:
    """Normalise the ``results`` field of a Polygon response into a list.

    For ``endpoint_key == "ticker_details"`` the ``results`` value is wrapped
    in a one-element list when it is a non-empty dict, otherwise ``[]`` is
    returned. For every other endpoint a list is returned unchanged, any
    other truthy value is wrapped in a list, and a falsy value gives ``[]``.
    """
    if endpoint_key == "ticker_details":
        detail = data.get("results", {})
        if isinstance(detail, dict) and detail:
            return [detail]
        return []

    # Aggregate endpoints return results as a list
    payload = data.get("results", [])
    if isinstance(payload, list):
        return payload
    if payload:
        return [payload]
    return []
def _error_result(
    self,
    ticker: str,
    error: str,
    elapsed_ms: float,
    http_status: int | None = None,
    raw: bytes = b"",
) -> AdapterResult:
    """Build the AdapterResult returned when a market data fetch fails.

    Args:
        ticker: Symbol the failed request was for.
        error: Error description stored on the result.
        elapsed_ms: Request duration; stored rounded to one decimal place.
        http_status: HTTP status code, when a response was received.
        raw: Response body, when one was received.

    Returns:
        An AdapterResult with no items, an empty content hash, the current
        UTC time as ``fetched_at`` and ``{"provider": "polygon"}`` metadata.
    """
    failure_fields: dict[str, Any] = {
        "source_type": "market_api",
        "ticker": ticker,
        "items": [],
        "raw_payload": raw,
        "content_hash": "",
        "fetched_at": datetime.now(timezone.utc),
        "error": error,
        "http_status": http_status,
        "response_time_ms": round(elapsed_ms, 1),
        "metadata": {"provider": "polygon"},
    }
    return AdapterResult(**failure_fields)
+135 -30
View File
@@ -1,8 +1,17 @@
"""News API adapter - fetches company-linked headlines and article metadata.""" """News API adapter interface and concrete Polygon.io news provider.
The NewsDataAdapter is the abstract interface for all news data providers.
PolygonNewsAdapter is the first concrete implementation, targeting the
Polygon.io REST API for company-linked news articles and headlines.
Requirements: 2.2, 2.5, 3.1, 3.2, 3.3
"""
import hashlib import hashlib
import logging import logging
from datetime import datetime import time
from typing import Any, Dict from abc import ABC
from datetime import datetime, timezone
from typing import Any
import httpx import httpx
@@ -11,51 +20,147 @@ from .base import AdapterResult, BaseAdapter
logger = logging.getLogger("news_adapter") logger = logging.getLogger("news_adapter")
class NewsApiAdapter(BaseAdapter): class NewsDataAdapter(BaseAdapter, ABC):
"""Concrete adapter for a news API provider.""" """Abstract interface for news data providers.
def __init__(self, api_key: str = "", base_url: str = ""): Subclasses implement fetch() for their specific news API.
self.api_key = api_key source_type() is concrete here since all news adapters share the same type.
self.base_url = base_url """
def source_type(self) -> str: def source_type(self) -> str:
return "news_api" return "news_api"
async def fetch(self, ticker: str, config: Dict[str, Any]) -> AdapterResult:
endpoint = config.get("endpoint", "/v2/everything") class PolygonNewsAdapter(NewsDataAdapter):
url = f"{self.base_url}{endpoint}" """Concrete adapter for the Polygon.io ticker news endpoint.
params = config.get("params", {})
params.setdefault("q", ticker) Supports:
params.setdefault("sortBy", "publishedAt") - Ticker news (/v2/reference/news?ticker={ticker})
params.setdefault("pageSize", 20)
if self.api_key: Config options:
params["apiKey"] = self.api_key limit: Max articles to return per request (default 20, max 1000)
published_utc_gte: Only articles published on or after this date (YYYY-MM-DD)
published_utc_lte: Only articles published on or before this date (YYYY-MM-DD)
order: Sort order for results, "asc" or "desc" (default "desc")
"""
NEWS_ENDPOINT = "/v2/reference/news"
def __init__(self, api_key: str, base_url: str = "https://api.polygon.io") -> None:
self.api_key: str = api_key
self.base_url: str = base_url.rstrip("/")
async def fetch(self, ticker: str, config: dict[str, Any]) -> AdapterResult:
"""Fetch news articles from Polygon.io for a given ticker.
Args:
ticker: The company ticker symbol.
config: Source-specific configuration from the sources table.
Returns:
AdapterResult with raw payload, parsed article items, and metadata.
"""
url, params = self._build_request(ticker, config)
async with httpx.AsyncClient(timeout=30) as client: async with httpx.AsyncClient(timeout=30) as client:
t0 = time.monotonic()
try: try:
resp = await client.get(url, params=params) resp = await client.get(url, params=params)
elapsed_ms = (time.monotonic() - t0) * 1000
resp.raise_for_status() resp.raise_for_status()
raw = resp.content raw = resp.content
data = resp.json() data = resp.json()
content_hash = hashlib.sha256(raw).hexdigest() content_hash = hashlib.sha256(raw).hexdigest()
items = self._extract_items(data)
articles = data.get("articles", [])
return AdapterResult( return AdapterResult(
source_type="news_api", source_type="news_api",
ticker=ticker, ticker=ticker,
items=articles, items=items,
raw_payload=raw, raw_payload=raw,
content_hash=content_hash, content_hash=content_hash,
fetched_at=datetime.utcnow(), fetched_at=datetime.now(timezone.utc),
http_status=resp.status_code,
response_time_ms=round(elapsed_ms, 1),
metadata={
"provider": "polygon",
"results_count": data.get("count", len(items)),
"next_url": data.get("next_url", ""),
"request_id": data.get("request_id", ""),
},
) )
except httpx.HTTPStatusError as e:
elapsed_ms = (time.monotonic() - t0) * 1000
logger.error("Polygon news HTTP error for %s: %s", ticker, e)
return self._error_result(
ticker, str(e), elapsed_ms,
http_status=e.response.status_code if e.response else None,
raw=e.response.content if e.response else b"",
)
except httpx.TimeoutException as e:
elapsed_ms = (time.monotonic() - t0) * 1000
logger.error("Polygon news timeout for %s: %s", ticker, e)
return self._error_result(ticker, f"timeout: {e}", elapsed_ms)
except Exception as e: except Exception as e:
logger.error(f"News fetch failed for {ticker}: {e}") elapsed_ms = (time.monotonic() - t0) * 1000
return AdapterResult( logger.error("Polygon news fetch failed for %s: %s", ticker, e)
source_type="news_api", return self._error_result(ticker, str(e), elapsed_ms)
ticker=ticker,
items=[], def _build_request(
raw_payload=b"", self, ticker: str, config: dict[str, Any]
content_hash="", ) -> tuple[str, dict[str, str]]:
fetched_at=datetime.utcnow(), """Build the URL and query params for a Polygon news request."""
error=str(e), params: dict[str, str] = {
) "apiKey": self.api_key,
"ticker": ticker,
}
limit = config.get("limit", 20)
params["limit"] = str(min(int(limit), 1000))
if config.get("order"):
params["order"] = config["order"]
if config.get("published_utc_gte"):
params["published_utc.gte"] = config["published_utc_gte"]
if config.get("published_utc_lte"):
params["published_utc.lte"] = config["published_utc_lte"]
url = f"{self.base_url}{self.NEWS_ENDPOINT}"
return url, params
def _extract_items(self, data: dict[str, Any]) -> list[dict[str, Any]]:
"""Extract the article list from a Polygon news response.
Polygon returns articles under the "results" key as a list of objects,
each containing fields like id, publisher, title, article_url, tickers,
published_utc, description, and keywords.
"""
results = data.get("results", [])
if isinstance(results, list):
return results
return []
def _error_result(
    self,
    ticker: str,
    error: str,
    elapsed_ms: float,
    http_status: int | None = None,
    raw: bytes = b"",
) -> AdapterResult:
    """Wrap a failed news fetch in an ``AdapterResult``.

    The result carries no items and an empty content hash; ``error`` holds
    the failure message, ``raw`` whatever response body was received (empty
    by default), and ``elapsed_ms`` is rounded to one decimal place.
    """
    failure: dict[str, Any] = {
        "source_type": "news_api",
        "ticker": ticker,
        "items": [],
        "raw_payload": raw,
        "content_hash": "",
        "fetched_at": datetime.now(timezone.utc),
        "error": error,
        "http_status": http_status,
        "response_time_ms": round(elapsed_ms, 1),
        "metadata": {"provider": "polygon"},
    }
    return AdapterResult(**failure)
+603
View File
@@ -0,0 +1,603 @@
"""Paper trading adapter - local order simulation and state sync.
Implements a fully local paper trading engine that simulates order
execution without requiring a real broker API. Tracks positions,
account balance, fills, and order events in-memory with PostgreSQL
persistence for state sync and audit trail.
Requirements: 8.1, 8.3, 8.5, 2.4
Design: Section 4.9 - Broker Adapter (paper mode)
"""
from __future__ import annotations
import json
import logging
import uuid
from datetime import datetime, timezone
from typing import Any
import asyncpg
from services.adapters.broker_adapter import (
AccountInfo,
BrokerDataAdapter,
OrderEventType,
OrderRequest,
OrderResponse,
OrderSide,
OrderStatus,
OrderType,
PositionInfo,
TradingMode,
)
from services.adapters.base import AdapterResult
logger = logging.getLogger("paper_trading")
# ---------------------------------------------------------------------------
# In-memory paper trading state
# ---------------------------------------------------------------------------
class PaperPosition:
    """Long-only holding in one ticker for the paper account."""

    def __init__(
        self,
        ticker: str,
        quantity: float = 0.0,
        avg_entry_price: float = 0.0,
        realized_pnl: float = 0.0,
    ) -> None:
        self.ticker = ticker
        self.quantity = quantity
        self.avg_entry_price = avg_entry_price
        self.realized_pnl = realized_pnl

    def apply_fill(self, side: OrderSide, fill_qty: float, fill_price: float) -> float:
        """Fold one fill into the position and return the PnL it realized.

        Buys re-average the entry price and realize nothing. Any other side
        is treated as a sell: at most the currently held quantity is closed,
        and the PnL on the closed shares is added to ``realized_pnl``.
        """
        if side == OrderSide.BUY:
            # Weighted-average cost: old cost basis plus the cost of this fill.
            cost_basis = self.avg_entry_price * self.quantity + fill_price * fill_qty
            self.quantity += fill_qty
            if self.quantity > 0:
                self.avg_entry_price = cost_basis / self.quantity
            return 0.0

        if not self.quantity > 0:
            # Nothing held, so there is nothing to close.
            return 0.0

        closed_qty = min(fill_qty, self.quantity)
        pnl = closed_qty * (fill_price - self.avg_entry_price)
        self.quantity -= closed_qty
        self.realized_pnl += pnl
        if self.quantity <= 0:
            # Fully closed: reset so the next buy starts a fresh average.
            self.quantity = 0.0
            self.avg_entry_price = 0.0
        return pnl

    @property
    def is_open(self) -> bool:
        """True while a positive quantity is held."""
        return self.quantity > 0

    def to_position_info(self, current_price: float | None = None) -> PositionInfo:
        """Build the broker-interface view of this position.

        When ``current_price`` is omitted the position is marked at its own
        average entry price, which makes the unrealized PnL zero.
        """
        mark = self.avg_entry_price if current_price is None else current_price
        holding = self.quantity > 0
        unrealized = (mark - self.avg_entry_price) * self.quantity if holding else 0.0
        return PositionInfo(
            ticker=self.ticker,
            quantity=self.quantity,
            avg_entry_price=self.avg_entry_price,
            current_price=mark,
            unrealized_pnl=round(unrealized, 4),
            market_value=round(mark * self.quantity, 4),
            side="long" if holding else "flat",
        )
class PaperAccount:
    """Mutable in-memory state of one paper trading account."""

    def __init__(
        self,
        account_id: str = "paper-default",
        initial_cash: float = 100_000.0,
    ) -> None:
        self.account_id = account_id
        self.initial_cash = initial_cash
        self.cash = initial_cash
        self.positions: dict[str, PaperPosition] = {}
        self.orders: dict[str, OrderResponse] = {}
        self.order_events: list[dict[str, Any]] = []
        # Maps idempotency key -> order id of the order it first produced.
        self._seen_idempotency_keys: dict[str, str] = {}

    @property
    def portfolio_value(self) -> float:
        """Cash plus open positions valued at their average entry price."""
        open_positions = (pos for pos in self.positions.values() if pos.is_open)
        held_at_cost = sum(pos.quantity * pos.avg_entry_price for pos in open_positions)
        return self.cash + held_at_cost

    @property
    def buying_power(self) -> float:
        """Buying power equals the cash balance."""
        return self.cash

    def get_position(self, ticker: str) -> PaperPosition:
        """Return the position for ``ticker``, registering an empty one if absent."""
        try:
            return self.positions[ticker]
        except KeyError:
            fresh = PaperPosition(ticker=ticker)
            self.positions[ticker] = fresh
            return fresh

    def to_account_info(self) -> AccountInfo:
        """Summarize the account for the broker interface (money rounded to cents)."""
        return AccountInfo(
            account_id=self.account_id,
            buying_power=round(self.buying_power, 2),
            cash=round(self.cash, 2),
            portfolio_value=round(self.portfolio_value, 2),
            currency="USD",
            mode=TradingMode.PAPER,
        )
# ---------------------------------------------------------------------------
# Paper trading adapter
# ---------------------------------------------------------------------------
class PaperTradingAdapter(BrokerDataAdapter):
    """Local paper trading adapter that simulates order execution.

    Every accepted order is filled immediately: limit, stop and stop-limit
    orders at their stated price, market orders at an estimated price with
    slippage applied. No real broker API is called.

    Features:
    - Idempotent order submission via idempotency_key (Req 8.5)
    - Full order event trail for audit (Req 8.3)
    - Position tracking with average entry price
    - Cash balance management

    The adapter is always constructed in PAPER mode.
    """

    def __init__(
        self,
        account_id: str = "paper-default",
        initial_cash: float = 100_000.0,
        simulated_slippage_pct: float = 0.001,
    ) -> None:
        super().__init__(mode=TradingMode.PAPER)
        self.account = PaperAccount(account_id=account_id, initial_cash=initial_cash)
        self.slippage_pct = simulated_slippage_pct

    def source_type(self) -> str:
        return "broker"

    async def fetch(self, ticker: str, config: dict[str, Any]) -> AdapterResult:
        """Fetch paper positions/account as a raw artifact snapshot.

        ``config["endpoint"]`` selects the snapshot: ``"account"``,
        ``"orders"`` (``ticker == "*"`` returns every order) or anything
        else for the position in ``ticker`` (the default). This is a pure
        read: it never creates account state.
        """
        endpoint = config.get("endpoint", "positions")
        now = datetime.now(timezone.utc)

        if endpoint == "account":
            items = [self.account.to_account_info().to_dict()]
        elif endpoint == "orders":
            items = [
                resp.to_dict()
                for resp in self.account.orders.values()
                if resp.ticker == ticker or ticker == "*"
            ]
        else:
            # Look the position up directly: get_position() would register an
            # empty position for every ticker that is merely queried.
            pos = self.account.positions.get(ticker)
            if pos is not None and pos.is_open:
                items = [pos.to_position_info().to_dict()]
            else:
                items = []

        raw = json.dumps(items).encode()
        return AdapterResult(
            source_type="broker",
            ticker=ticker,
            items=items,
            raw_payload=raw,
            content_hash="",
            fetched_at=now,
            metadata={"provider": "paper", "mode": "paper", "endpoint": endpoint},
        )

    async def submit_order(self, order: OrderRequest) -> OrderResponse:
        """Simulate order submission and immediate fill.

        Idempotency: if the same idempotency_key was already used,
        return the original response (Req 8.5).

        The order is rejected (status REJECTED, reason in ``error``) when
        the quantity or the simulated fill price is not strictly positive,
        when a buy costs more than the available cash, or when a sell asks
        for more shares than are held. Rejections are recorded in the event
        trail and are themselves idempotent.
        """
        # Idempotency check
        existing_id = self.account._seen_idempotency_keys.get(order.idempotency_key)
        if existing_id and existing_id in self.account.orders:
            logger.info("Duplicate order key %s — returning cached response", order.idempotency_key)
            return self.account.orders[existing_id]

        now = datetime.now(timezone.utc)
        order_id = str(uuid.uuid4())

        # A non-positive quantity would invert the cash/position arithmetic
        # below (e.g. a negative BUY credits cash). `not x > 0` also rejects NaN.
        if not order.quantity > 0:
            return self._reject(
                order, order_id, now,
                f"Invalid quantity: {order.quantity} (must be positive)",
            )

        # Determine fill price based on order type
        fill_price = self._compute_fill_price(order)
        if not fill_price > 0:
            return self._reject(
                order, order_id, now,
                f"Invalid fill price: {fill_price} (must be positive)",
            )

        # Check if we have enough cash for buys
        if order.side == OrderSide.BUY:
            required_cash = fill_price * order.quantity
            if required_cash > self.account.cash:
                return self._reject(
                    order, order_id, now,
                    f"Insufficient cash: need {required_cash:.2f}, have {self.account.cash:.2f}",
                )

        # Check if we have enough shares for sells
        if order.side == OrderSide.SELL:
            # Read-only lookup so a rejected sell leaves no empty position behind.
            held_pos = self.account.positions.get(order.ticker)
            held_qty = held_pos.quantity if held_pos is not None else 0.0
            if held_qty < order.quantity:
                return self._reject(
                    order, order_id, now,
                    f"Insufficient shares: need {order.quantity}, have {held_qty}",
                )

        # Simulate immediate fill
        position = self.account.get_position(order.ticker)
        realized_pnl = position.apply_fill(order.side, order.quantity, fill_price)

        # Update cash
        notional = fill_price * order.quantity
        if order.side == OrderSide.BUY:
            self.account.cash -= notional
        else:
            self.account.cash += notional

        resp = OrderResponse(
            broker_order_id=order_id,
            status=OrderStatus.FILLED,
            ticker=order.ticker,
            side=order.side,
            quantity=order.quantity,
            filled_quantity=order.quantity,
            filled_avg_price=fill_price,
            submitted_at=now,
            raw_response={
                "realized_pnl": round(realized_pnl, 4),
                "cash_after": round(self.account.cash, 2),
                "position_qty_after": position.quantity,
                "simulated": True,
            },
        )

        # Record events
        self._record_event(order_id, OrderEventType.SUBMITTED, {"ticker": order.ticker}, now)
        self._record_event(order_id, OrderEventType.ACCEPTED, {"ticker": order.ticker}, now)
        self._record_event(order_id, OrderEventType.FILL, {
            "fill_price": fill_price,
            "fill_qty": order.quantity,
            "realized_pnl": round(realized_pnl, 4),
        }, now)

        self.account.orders[order_id] = resp
        self.account._seen_idempotency_keys[order.idempotency_key] = order_id

        logger.info(
            "Paper fill: %s %s %.0f %s @ %.2f | cash=%.2f pnl=%.4f",
            order_id[:8], order.side.value, order.quantity,
            order.ticker, fill_price, self.account.cash, realized_pnl,
        )
        return resp

    async def cancel_order(self, broker_order_id: str) -> OrderResponse:
        """Cancel a paper order.

        Unknown order ids and already-filled orders yield a REJECTED
        response; any other stored order is replaced by a CANCELLED one and
        a CANCELLED event is recorded.
        """
        existing = self.account.orders.get(broker_order_id)
        if existing is None:
            return self._order_not_found(broker_order_id)

        # Paper orders fill immediately, so they can't be cancelled
        if existing.status == OrderStatus.FILLED:
            return OrderResponse(
                broker_order_id=broker_order_id,
                status=OrderStatus.REJECTED,
                ticker=existing.ticker,
                side=existing.side,
                quantity=existing.quantity,
                error="Cannot cancel a filled order",
            )

        now = datetime.now(timezone.utc)
        cancelled = OrderResponse(
            broker_order_id=broker_order_id,
            status=OrderStatus.CANCELLED,
            ticker=existing.ticker,
            side=existing.side,
            quantity=existing.quantity,
            submitted_at=existing.submitted_at,
        )
        self.account.orders[broker_order_id] = cancelled
        self._record_event(broker_order_id, OrderEventType.CANCELLED, {}, now)
        return cancelled

    async def get_order_status(self, broker_order_id: str) -> OrderResponse:
        """Return the stored response for an order, or REJECTED if unknown."""
        existing = self.account.orders.get(broker_order_id)
        if existing is None:
            return self._order_not_found(broker_order_id)
        return existing

    async def get_positions(self) -> list[PositionInfo]:
        """Get all open paper positions."""
        return [
            p.to_position_info()
            for p in self.account.positions.values()
            if p.is_open
        ]

    async def get_account(self) -> AccountInfo:
        """Get paper account summary."""
        return self.account.to_account_info()

    # -----------------------------------------------------------------------
    # Internal helpers
    # -----------------------------------------------------------------------

    def _reject(
        self,
        order: OrderRequest,
        order_id: str,
        now: datetime,
        reason: str,
    ) -> OrderResponse:
        """Store and return a REJECTED response for ``order``.

        The rejection is added to the event trail and remembered under the
        order's idempotency key, so a retry gets the same rejection back.
        """
        resp = OrderResponse(
            broker_order_id=order_id,
            status=OrderStatus.REJECTED,
            ticker=order.ticker,
            side=order.side,
            quantity=order.quantity,
            submitted_at=now,
            error=reason,
        )
        self._record_event(order_id, OrderEventType.REJECTED, resp.to_dict(), now)
        self.account.orders[order_id] = resp
        self.account._seen_idempotency_keys[order.idempotency_key] = order_id
        return resp

    @staticmethod
    def _order_not_found(broker_order_id: str) -> OrderResponse:
        """Build the REJECTED placeholder returned for an unknown order id."""
        return OrderResponse(
            broker_order_id=broker_order_id,
            status=OrderStatus.REJECTED,
            ticker="",
            side=OrderSide.BUY,
            quantity=0,
            error=f"Order {broker_order_id} not found",
        )

    def _compute_fill_price(self, order: OrderRequest) -> float:
        """Determine the simulated fill price for an order.

        Limit and stop-limit orders fill at the limit price; stop orders
        fill at the stop price. Everything else (market orders, or a typed
        order whose price is missing) uses ``limit_price`` as the price
        estimate, falling back to 100.0 when it is not set, and applies
        slippage against the trader: buys pay more, sells receive less.
        """
        if order.order_type == OrderType.LIMIT and order.limit_price is not None:
            return order.limit_price
        if order.order_type == OrderType.STOP and order.stop_price is not None:
            return order.stop_price
        if order.order_type == OrderType.STOP_LIMIT and order.limit_price is not None:
            return order.limit_price

        # Market order: use limit_price as estimate, or a default
        base_price = order.limit_price if order.limit_price is not None else 100.0
        if order.side == OrderSide.BUY:
            return round(base_price * (1 + self.slippage_pct), 4)
        return round(base_price * (1 - self.slippage_pct), 4)

    def _record_event(
        self,
        order_id: str,
        event_type: OrderEventType,
        data: dict[str, Any],
        timestamp: datetime,
    ) -> None:
        """Record an order event for audit trail."""
        self.account.order_events.append({
            "order_id": order_id,
            "event_type": event_type.value,
            "data": data,
            "timestamp": timestamp.isoformat(),
        })
# ---------------------------------------------------------------------------
# State sync: persist and restore paper trading state to/from PostgreSQL
# ---------------------------------------------------------------------------
# SQL for persisting paper orders to the orders table
# Takes 17 positional parameters in column order. Re-running it for an
# order whose idempotency_key is already stored is a no-op (DO NOTHING), so
# a row that already exists is never updated by this statement.
_INSERT_PAPER_ORDER = """
INSERT INTO orders (
id, recommendation_id, broker_account_id, ticker, side, order_type,
quantity, limit_price, stop_price, status, idempotency_key,
broker_order_id, decision_trace, submitted_at, filled_at,
fill_price, fill_quantity
) VALUES (
$1::uuid, $2, $3, $4, $5, $6,
$7, $8, $9, $10, $11,
$12, $13::jsonb, $14, $15,
$16, $17
)
ON CONFLICT (idempotency_key) DO NOTHING
"""
# Appends one audit event: (order_id, event_type, data as JSON text,
# broker_timestamp). There is no conflict clause, so executing it twice for
# the same event stores the event twice.
_INSERT_PAPER_ORDER_EVENT = """
INSERT INTO order_events (order_id, event_type, data, broker_timestamp)
VALUES ($1::uuid, $2, $3::jsonb, $4)
"""
# Inserts or overwrites the position row identified by
# (broker_account_id, ticker).
_UPSERT_PAPER_POSITION = """
INSERT INTO positions (broker_account_id, ticker, quantity, avg_entry_price, realized_pnl, updated_at)
VALUES ($1, $2, $3, $4, $5, $6)
ON CONFLICT (broker_account_id, ticker)
DO UPDATE SET
quantity = EXCLUDED.quantity,
avg_entry_price = EXCLUDED.avg_entry_price,
realized_pnl = EXCLUDED.realized_pnl,
updated_at = EXCLUDED.updated_at
"""
# Inserts or refreshes the broker_accounts row keyed by id ($1). provider
# and mode are fixed to 'paper'; on conflict only config is replaced and the
# row is re-activated.
_UPSERT_PAPER_ACCOUNT = """
INSERT INTO broker_accounts (id, provider, account_id, mode, config, active)
VALUES ($1::uuid, 'paper', $2, 'paper', $3::jsonb, TRUE)
ON CONFLICT (id) DO UPDATE SET
config = EXCLUDED.config,
active = TRUE
"""
# Selects positions with a positive quantity. $1 is compared against
# positions.broker_account_id, i.e. the value _UPSERT_PAPER_POSITION stores
# in that column.
_LOAD_PAPER_POSITIONS = """
SELECT ticker, quantity, avg_entry_price, COALESCE(realized_pnl, 0) AS realized_pnl
FROM positions
WHERE broker_account_id = $1 AND quantity > 0
"""
# Looks up the saved config of the active paper account whose account_id
# (the human-readable name, not the row id) equals $1.
_LOAD_PAPER_ACCOUNT_CONFIG = """
SELECT config FROM broker_accounts
WHERE account_id = $1 AND mode = 'paper' AND active = TRUE
LIMIT 1
"""
# Returns up to 500 orders, newest first, for the paper account whose
# account_id equals $1; the subquery translates account_id into the
# broker_accounts.id stored on each order.
# NOTE(review): no caller of this query is visible in this module.
_LOAD_PAPER_ORDERS = """
SELECT
id, ticker, side, order_type, quantity, status,
idempotency_key, broker_order_id, fill_price, fill_quantity,
submitted_at
FROM orders
WHERE broker_account_id = (
SELECT id FROM broker_accounts WHERE account_id = $1 AND mode = 'paper' LIMIT 1
)
ORDER BY submitted_at DESC
LIMIT 500
"""
async def sync_state_to_db(
    adapter: PaperTradingAdapter,
    pool: asyncpg.Pool,
    broker_account_uuid: str | None = None,
) -> None:
    """Persist the current paper trading state to PostgreSQL.

    Writes, inside a single transaction:
    - broker_accounts row for the paper account
    - positions rows for every tracked position
    - orders rows for all orders (idempotent via ON CONFLICT)
    - order_events for audit trail

    When ``broker_account_uuid`` is omitted, a stable UUID is derived from
    the account id with ``uuid5``. After a successful commit the events
    that were written are removed from the in-memory trail so they are not
    inserted again by the next sync.

    This enables state recovery after restarts and provides the
    full execution audit trail (Requirement 8.3).
    """
    acct = adapter.account
    now = datetime.now(timezone.utc)
    acct_uuid = broker_account_uuid or str(uuid.uuid5(uuid.NAMESPACE_DNS, acct.account_id))

    # Snapshot everything before the first await. Other tasks may submit
    # orders while this coroutine is suspended on the database; iterating
    # the live containers could then fail mid-loop, and clearing the live
    # event list afterwards would drop events that were never written.
    config_json = json.dumps({
        "initial_cash": acct.initial_cash,
        "current_cash": round(acct.cash, 2),
        "portfolio_value": round(acct.portfolio_value, 2),
        "slippage_pct": adapter.slippage_pct,
    })
    positions = list(acct.positions.items())
    orders = list(acct.orders.items())
    events = list(acct.order_events)

    async with pool.acquire() as conn:
        async with conn.transaction():
            # 1. Upsert broker account
            await conn.execute(_UPSERT_PAPER_ACCOUNT, acct_uuid, acct.account_id, config_json)

            # 2. Upsert positions
            for ticker, pos in positions:
                await conn.execute(
                    _UPSERT_PAPER_POSITION,
                    acct_uuid, ticker,
                    pos.quantity, pos.avg_entry_price, pos.realized_pnl,
                    now,
                )

            # 3. Insert orders (idempotent)
            for order_id, resp in orders:
                # Paper orders fill at submission time, so that is the fill
                # timestamp; the sync time would be arbitrarily late.
                filled_at = None
                if resp.status == OrderStatus.FILLED:
                    filled_at = resp.submitted_at or now
                await conn.execute(
                    _INSERT_PAPER_ORDER,
                    order_id,
                    None,  # recommendation_id
                    acct_uuid,
                    resp.ticker,
                    resp.side.value,
                    "market",  # paper orders are always market-simulated
                    resp.quantity,
                    resp.filled_avg_price,  # limit_price
                    None,  # stop_price
                    resp.status.value,
                    order_id,  # use order_id as idempotency_key fallback
                    order_id,
                    json.dumps(resp.raw_response),
                    resp.submitted_at,
                    filled_at,
                    resp.filled_avg_price,
                    resp.filled_quantity,
                )

            # 4. Insert order events
            for event in events:
                await conn.execute(
                    _INSERT_PAPER_ORDER_EVENT,
                    event["order_id"],
                    event["event_type"],
                    json.dumps(event["data"]),
                    datetime.fromisoformat(event["timestamp"]),
                )

    logger.info(
        "Synced paper state to DB: account=%s positions=%d orders=%d events=%d",
        acct.account_id, len(positions), len(orders), len(events),
    )

    # Drop only the events written above. Events are appended at the end of
    # the list, so anything recorded during the sync survives for the next one.
    del acct.order_events[:len(events)]
async def load_state_from_db(
    adapter: PaperTradingAdapter,
    pool: asyncpg.Pool,
) -> bool:
    """Restore paper trading state from PostgreSQL.

    Loads the cash balance and the open positions saved for the adapter's
    account so it can resume after a restart. Orders and events are not
    reloaded. Returns True if a saved account was found, False otherwise
    (in which case the adapter is left untouched).
    """
    acct = adapter.account

    # positions.broker_account_id holds broker_accounts.id (the UUID written
    # by sync_state_to_db), not the human-readable account_id. Fetch the row
    # id together with the config so positions are looked up under the same
    # key they were saved with.
    load_account_sql = """
        SELECT id, config FROM broker_accounts
        WHERE account_id = $1 AND mode = 'paper' AND active = TRUE
        LIMIT 1
    """

    async with pool.acquire() as conn:
        # Load account config
        row = await conn.fetchrow(load_account_sql, acct.account_id)
        if row is None:
            logger.info("No saved paper account state for %s", acct.account_id)
            return False

        raw_config = row["config"]
        config = json.loads(raw_config) if isinstance(raw_config, str) else raw_config
        # A NULL or non-object config carries no balance; keep the initial cash.
        if not isinstance(config, dict):
            config = {}
        acct.cash = float(config.get("current_cash", acct.initial_cash))

        # Load positions
        pos_rows = await conn.fetch(_LOAD_PAPER_POSITIONS, str(row["id"]))
        for pr in pos_rows:
            ticker = pr["ticker"]
            acct.positions[ticker] = PaperPosition(
                ticker=ticker,
                quantity=float(pr["quantity"]),
                avg_entry_price=float(pr["avg_entry_price"] or 0),
                realized_pnl=float(pr["realized_pnl"]),
            )

    logger.info(
        "Loaded paper state from DB: account=%s cash=%.2f positions=%d",
        acct.account_id, acct.cash, len(acct.positions),
    )
    return True
+241
View File
@@ -0,0 +1,241 @@
"""Resilient adapter wrapper with rate-limit coordination, retries, and backoff.
Wraps any BaseAdapter with:
- Per-source-type rate limiting via Redis (distributed across workers)
- Exponential backoff with jitter on retryable failures
- Configurable retry counts and retryable HTTP status codes
- Graceful degradation when Redis is unavailable
Requirements: 2.5, 3.4
"""
import asyncio
import logging
import random
import time
from dataclasses import dataclass
from typing import Any
import redis.asyncio as aioredis
from services.shared.redis_keys import rate_limit_key
from .base import AdapterResult, BaseAdapter
logger = logging.getLogger("resilient_adapter")
# HTTP status codes that are safe to retry
RETRYABLE_STATUS_CODES: frozenset[int] = frozenset((429, 500, 502, 503, 504))


@dataclass
class RetryConfig:
    """Tunable retry, backoff and rate-limit settings for one adapter."""

    # Number of retries after the first attempt.
    max_retries: int = 3
    # Backoff before the first retry, in seconds; doubles on each later retry.
    base_delay: float = 1.0
    # Cap, in seconds, applied to the exponential backoff before jitter.
    max_delay: float = 60.0
    # Upper bound of the random jitter, as a fraction of the capped backoff.
    jitter_factor: float = 0.5
    # Failed results carrying one of these HTTP statuses are retried.
    retryable_status_codes: frozenset[int] = RETRYABLE_STATUS_CODES
    # Rate limit: max requests per window per source type
    rate_limit_max: int = 30
    rate_limit_window_seconds: int = 60
# Sensible defaults per source type
# Keyed by the string an adapter's source_type() returns. ResilientAdapter
# looks its adapter up here when no explicit RetryConfig is passed, and falls
# back to a plain RetryConfig() for source types that are not listed.
DEFAULT_RETRY_CONFIGS: dict[str, RetryConfig] = {
    "market_api": RetryConfig(max_retries=3, rate_limit_max=30),
    "news_api": RetryConfig(max_retries=3, rate_limit_max=20),
    "filings_api": RetryConfig(max_retries=2, rate_limit_max=10, base_delay=2.0),
    "web_scrape": RetryConfig(max_retries=2, rate_limit_max=10, base_delay=2.0),
    "broker": RetryConfig(max_retries=2, rate_limit_max=60, base_delay=0.5),
}
def compute_delay(attempt: int, config: RetryConfig) -> float:
    """Return the sleep, in seconds, to apply after failed attempt ``attempt``.

    The base delay doubles with every attempt (attempt 0 waits
    ``base_delay``) and is capped at ``max_delay``. A random jitter of up
    to ``jitter_factor`` times the capped value is then added on top, so
    the result may exceed ``max_delay``.
    """
    backoff = min(config.base_delay * (2 ** attempt), config.max_delay)
    return backoff + backoff * config.jitter_factor * random.random()
@dataclass
class RetryStats:
    """Counters describing one resilient fetch, kept for observability."""

    # Attempts started so far (1 after the first call to the adapter).
    attempts: int = 0
    # Seconds spent sleeping, for rate limiting and backoff combined.
    total_delay: float = 0.0
    # How many attempts had to wait for the rate-limit window first.
    rate_limited_waits: int = 0
    # Error message of the most recent failed attempt, if any.
    last_error: str | None = None
    # Whether the most recent failure was classified as retryable.
    retryable: bool = False
class ResilientAdapter:
    """Wraps a BaseAdapter with rate-limit coordination, retries, and backoff.

    Usage:
        adapter = PolygonMarketAdapter(api_key="...")
        resilient = ResilientAdapter(adapter, redis=rds)
        result = await resilient.fetch(ticker, config)

    If redis is None, rate limiting is skipped (local dev / testing).
    """

    def __init__(
        self,
        adapter: BaseAdapter,
        redis: aioredis.Redis | None = None,
        retry_config: RetryConfig | None = None,
    ) -> None:
        self._adapter = adapter
        self._redis = redis
        source_type = adapter.source_type()
        # An explicit config wins; otherwise use the per-source default and,
        # for unknown source types, the RetryConfig defaults.
        self._config = retry_config or DEFAULT_RETRY_CONFIGS.get(
            source_type, RetryConfig()
        )

    @property
    def adapter(self) -> BaseAdapter:
        """Access the underlying adapter."""
        return self._adapter

    @property
    def config(self) -> RetryConfig:
        """The retry/rate-limit settings in effect for this wrapper."""
        return self._config

    def source_type(self) -> str:
        return self._adapter.source_type()

    async def _check_rate_limit(self) -> float:
        """Check distributed rate limit via Redis.

        Returns 0.0 if allowed, or the number of seconds to wait (at least
        0.5). If Redis is not configured or the check raises, the request
        is allowed.
        """
        if self._redis is None:
            return 0.0

        source_type = self._adapter.source_type()
        window_sec = self._config.rate_limit_window_seconds
        # Read the clock once so the bucket that is incremented and the
        # remaining-time computation always refer to the same window, even
        # if the Redis round trip crosses a window boundary.
        now = time.time()
        # Use a time-bucketed key so counters auto-expire
        bucket = int(now) // window_sec
        key = rate_limit_key(source_type, str(bucket))

        try:
            count = await self._redis.incr(key)
            if count == 1:
                # First hit in this window: give the counter a TTL of two
                # windows so stale buckets disappear on their own.
                await self._redis.expire(key, window_sec * 2)
            if count > self._config.rate_limit_max:
                # Over limit — wait until the window rolls over
                wait = window_sec - (now % window_sec)
                return max(wait, 0.5)
        except Exception:
            # Redis unavailable — degrade gracefully, allow the request
            logger.warning("Redis rate-limit check failed, allowing request")
        return 0.0

    def _is_retryable(self, result: AdapterResult) -> bool:
        """Determine if a failed result is worth retrying.

        A failure is retryable when its HTTP status is in the configured
        retryable set, or when its error text mentions a timeout or a
        connection problem.
        """
        if result.ok:
            return False
        # Retry on known retryable HTTP status codes
        if result.http_status and result.http_status in self._config.retryable_status_codes:
            return True
        if not result.error:
            return False
        message = result.error.lower()
        # Retry on timeouts
        if "timeout" in message:
            return True
        # Retry on connection errors
        return any(
            kw in message
            for kw in ("connection", "connect", "reset", "refused")
        )

    def _extract_retry_after(self, result: AdapterResult) -> float | None:
        """Extract Retry-After hint from result metadata if present."""
        retry_after = result.metadata.get("retry_after")
        if retry_after is None:
            return None
        try:
            return float(retry_after)
        except (ValueError, TypeError):
            return None

    @staticmethod
    def _stats_payload(stats: RetryStats, *, failed: bool = False) -> dict[str, Any]:
        """Build the ``retry_stats`` metadata entry from the counters.

        For failures, ``exhausted`` is True only when the last error was
        retryable, i.e. every allowed attempt was used up. A non-retryable
        failure stops early and reports ``exhausted: False``.
        """
        payload: dict[str, Any] = {
            "attempts": stats.attempts,
            "total_delay": round(stats.total_delay, 2),
            "rate_limited_waits": stats.rate_limited_waits,
        }
        if failed:
            payload["exhausted"] = stats.retryable
            payload["last_error"] = stats.last_error
        return payload

    async def fetch(self, ticker: str, config: dict[str, Any]) -> AdapterResult:
        """Fetch with rate-limit coordination, retries, and exponential backoff.

        Returns the AdapterResult from the underlying adapter. On retryable
        failures, retries up to max_retries times with exponential backoff
        and jitter. Rate-limit waits are applied before each attempt. At
        least one attempt is always made.

        The returned result's metadata includes retry stats under the
        "retry_stats" key.
        """
        stats = RetryStats()
        # A negative max_retries must not skip the fetch altogether.
        max_retries = max(self._config.max_retries, 0)

        for attempt in range(max_retries + 1):
            stats.attempts = attempt + 1

            # Rate limit check
            wait = await self._check_rate_limit()
            if wait > 0:
                stats.rate_limited_waits += 1
                logger.info(
                    "Rate limited for %s/%s, waiting %.1fs",
                    self.source_type(), ticker, wait,
                )
                stats.total_delay += wait
                await asyncio.sleep(wait)

            # Execute the fetch
            result = await self._adapter.fetch(ticker, config)

            # Success — attach stats and return
            if result.ok:
                result.metadata["retry_stats"] = self._stats_payload(stats)
                return result

            stats.last_error = result.error
            stats.retryable = self._is_retryable(result)
            if not stats.retryable:
                break

            # Don't sleep after the last attempt
            if attempt < max_retries:
                # Respect Retry-After header for 429s
                retry_after = self._extract_retry_after(result)
                if result.http_status == 429 and retry_after is not None:
                    # Clamp to [0, max_delay]: a negative hint means "now".
                    delay = min(max(retry_after, 0.0), self._config.max_delay)
                else:
                    delay = compute_delay(attempt, self._config)

                logger.info(
                    "Retrying %s/%s (attempt %d/%d) after %.1fs: %s",
                    self.source_type(), ticker, attempt + 1,
                    max_retries + 1, delay, result.error,
                )
                stats.total_delay += delay
                await asyncio.sleep(delay)

        # Either a non-retryable failure or all retries exhausted. The loop
        # body runs at least once, so `result` is always bound here.
        result.metadata["retry_stats"] = self._stats_payload(stats, failed=True)
        return result
+321
View File
@@ -0,0 +1,321 @@
"""Web scrape adapter for curated URLs and article pages.
Fetches full article HTML from curated URLs (investor relations pages,
press releases, earnings transcripts, etc.) using BeautifulSoup + httpx
with retry adapters, content hashing, boilerplate awareness, and quality scoring.
Inspired by Noctipede crawler patterns: BeautifulSoup + requests with retry
adapters, content hashing, boilerplate stripping, quality scoring.
Requirements: 1.2, 2.5, 3.1, 3.2, 3.3, 3.4
"""
import json
import logging
import time
from datetime import datetime, timezone
from urllib.parse import urlparse
from typing import Any
import httpx
from bs4 import BeautifulSoup
from services.shared.content import content_hash, normalize_url
from .base import AdapterResult, BaseAdapter
logger = logging.getLogger("web_scrape_adapter")
# Default request settings
# Request timeout in seconds, used when the source config has no "timeout" entry.
DEFAULT_TIMEOUT = 30
# User-Agent header value, used when the source config has no "user_agent" entry.
DEFAULT_USER_AGENT = "StonksOracle/1.0 (+https://stonks-oracle.celestium.life)"
# Responses whose body is larger than this many bytes are skipped and reported as an error.
MAX_CONTENT_LENGTH = 10 * 1024 * 1024 # 10MB cap
def _meta_text(tag: Any, default: str | None = "") -> str | None:
    """Return the stripped ``content`` attribute of ``tag``, or ``default`` if it is not a string."""
    value = tag["content"]
    return value.strip() if isinstance(value, str) else default


def _jsonld_date_published(soup: BeautifulSoup) -> str | None:
    """Return the first ``datePublished`` found in the page's JSON-LD scripts, if any."""
    for script in soup.find_all("script", type="application/ld+json"):
        payload = script.string
        # Cheap substring test first, so unrelated scripts are never parsed.
        if not payload or "datePublished" not in payload:
            continue
        try:
            ld = json.loads(payload)
        except (json.JSONDecodeError, TypeError):
            continue
        # A script holds either a single object or a list of objects.
        nodes = ld if isinstance(ld, list) else [ld]
        for node in nodes:
            if isinstance(node, dict) and "datePublished" in node:
                # Stop at the first hit; later scripts must not override it.
                return str(node["datePublished"])
    return None


def extract_metadata_from_html(html: str, url: str) -> dict[str, str | None]:
    """Extract page-level metadata from an HTML document.

    Returns a dict with these keys, all always present:
        title: og:title, else the <title> text, else "".
        author: the "author" meta tag, else "".
        publisher: og:site_name, else the hostname of ``url``, else "".
        published_at: article:published_time, else the first JSON-LD
            datePublished, else None.
        canonical_url: <link rel="canonical">, else og:url, else the
            normalized form of ``url``.
        language: the first five characters of <html lang>, else "en".
        description: og:description, else the "description" meta tag,
            else "".
    """
    soup = BeautifulSoup(html, "html.parser")
    meta: dict[str, str | None] = {}

    # Title: prefer og:title, then <title>
    og_title = soup.find("meta", property="og:title")
    if og_title and og_title.get("content"):
        meta["title"] = _meta_text(og_title)
    elif soup.title and soup.title.string:
        meta["title"] = soup.title.string.strip()
    else:
        meta["title"] = ""

    # Author
    author_tag = soup.find("meta", attrs={"name": "author"})
    if author_tag and author_tag.get("content"):
        meta["author"] = _meta_text(author_tag)
    else:
        meta["author"] = ""

    # Publisher: og:site_name
    site_name = soup.find("meta", property="og:site_name")
    if site_name and site_name.get("content"):
        meta["publisher"] = _meta_text(site_name)
    else:
        meta["publisher"] = urlparse(url).hostname or ""

    # Published date: article:published_time or datePublished
    pub_time = soup.find("meta", property="article:published_time")
    if pub_time and pub_time.get("content"):
        meta["published_at"] = _meta_text(pub_time, default=None)
    else:
        # Try JSON-LD datePublished
        meta["published_at"] = _jsonld_date_published(soup)

    # Canonical URL
    canonical = soup.find("link", rel="canonical")
    og_url = None if canonical and canonical.get("href") else soup.find("meta", property="og:url")
    if canonical and canonical.get("href"):
        meta["canonical_url"] = str(canonical["href"])
    elif og_url and og_url.get("content"):
        meta["canonical_url"] = str(og_url["content"])
    else:
        meta["canonical_url"] = normalize_url(url)

    # Language
    html_tag = soup.find("html")
    if html_tag and html_tag.get("lang"):
        meta["language"] = str(html_tag["lang"])[:5]
    else:
        meta["language"] = "en"

    # Description for summary
    desc = soup.find("meta", property="og:description") or soup.find(
        "meta", attrs={"name": "description"}
    )
    if desc and desc.get("content"):
        meta["description"] = _meta_text(desc)
    else:
        meta["description"] = ""

    return meta
def extract_body_text(html: str) -> str:
    """Return the main readable text of an HTML page, one non-empty line per row.

    Script, style, navigation and similar non-content elements are removed
    first. The text is then taken from the first <article>, else from the
    first <div> whose class names contain a known article-body marker,
    else from <body> (or the whole document when there is no <body>).
    """
    soup = BeautifulSoup(html, "html.parser")

    # Remove non-content elements
    boilerplate_tags = ["script", "style", "nav", "footer", "header", "aside", "iframe", "noscript"]
    for junk in soup.find_all(boilerplate_tags):
        junk.decompose()

    # Try to find article body
    container = soup.find("article")
    if not container:
        markers = ["article-body", "post-content", "entry-content", "story-body"]
        for div in soup.find_all("div"):
            classes = div.get("class", [])
            if isinstance(classes, list):
                joined = " ".join(classes)
            elif classes:
                joined = str(classes)
            else:
                joined = ""
            if any(marker in joined for marker in markers):
                container = div
                break

    if container:
        text = container.get_text(separator="\n", strip=True)
    else:
        # Fallback: use body
        body = soup.find("body")
        root = body if body else soup
        text = root.get_text(separator="\n", strip=True)

    # Collapse whitespace
    stripped = (raw_line.strip() for raw_line in text.splitlines())
    return "\n".join(piece for piece in stripped if piece)
class WebScrapeAdapter(BaseAdapter):
"""Adapter for fetching curated web pages and article URLs.
Config options (from source config):
urls: List of URLs to scrape for this company
url: Single URL to scrape (alternative to urls)
timeout: Request timeout in seconds (default 30)
user_agent: Custom user agent string
follow_links: Whether to follow article links from index pages (default False)
max_pages: Max pages to fetch per cycle (default 5)
"""
def __init__(self) -> None:
pass
def source_type(self) -> str:
    """Return the source-type identifier of this adapter."""
    kind = "web_scrape"
    return kind
def bucket_name(self) -> str:
    """Return the bucket for scraped pages: they share the raw news bucket."""
    news_bucket = "stonks-raw-news"
    return news_bucket
async def fetch(self, ticker: str, config: dict[str, Any]) -> AdapterResult:
"""Fetch HTML from curated URLs for a given ticker.
Supports both single URL and multi-URL configs. Each URL is fetched,
HTML is preserved as raw payload, and metadata is extracted.
"""
urls = config.get("urls", [])
if not urls and config.get("url"):
urls = [config["url"]]
if not urls:
return self._error_result(ticker, "No URLs configured for web_scrape source", 0)
timeout = config.get("timeout", DEFAULT_TIMEOUT)
user_agent = config.get("user_agent", DEFAULT_USER_AGENT)
max_pages = min(config.get("max_pages", 5), 20)
items: list[dict[str, Any]] = []
all_raw: list[bytes] = []
total_elapsed = 0.0
errors: list[str] = []
async with httpx.AsyncClient(
timeout=timeout,
follow_redirects=True,
headers={"User-Agent": user_agent},
) as client:
for url in urls[:max_pages]:
t0 = time.monotonic()
try:
resp = await client.get(url)
elapsed_ms = (time.monotonic() - t0) * 1000
total_elapsed += elapsed_ms
resp.raise_for_status()
# Content length guard
if len(resp.content) > MAX_CONTENT_LENGTH:
errors.append(f"Content too large for {url}: {len(resp.content)} bytes")
continue
html = resp.text
raw_bytes = resp.content
all_raw.append(raw_bytes)
item_content_hash = content_hash(raw_bytes)
meta = extract_metadata_from_html(html, url)
body_text = extract_body_text(html)
item: dict[str, Any] = {
"url": url,
"canonical_url": meta.get("canonical_url", normalize_url(url)),
"title": meta.get("title", ""),
"author": meta.get("author", ""),
"publisher": meta.get("publisher", ""),
"published_at": meta.get("published_at"),
"language": meta.get("language", "en"),
"description": meta.get("description", ""),
"content_hash": item_content_hash,
"body_text": body_text,
"body_length": len(body_text),
"html_length": len(html),
"http_status": resp.status_code,
"response_time_ms": round(elapsed_ms, 1),
}
items.append(item)
except httpx.HTTPStatusError as e:
elapsed_ms = (time.monotonic() - t0) * 1000
total_elapsed += elapsed_ms
status = e.response.status_code if e.response else None
errors.append(f"HTTP {status} for {url}: {e}")
logger.warning("Scrape HTTP error for %s/%s: %s", ticker, url, e)
except httpx.TimeoutException as e:
elapsed_ms = (time.monotonic() - t0) * 1000
total_elapsed += elapsed_ms
errors.append(f"Timeout for {url}: {e}")
logger.warning("Scrape timeout for %s/%s: %s", ticker, url, e)
except Exception as e:
elapsed_ms = (time.monotonic() - t0) * 1000
total_elapsed += elapsed_ms
errors.append(f"Error for {url}: {e}")
logger.warning("Scrape error for %s/%s: %s", ticker, url, e)
if not items:
error_msg = "; ".join(errors) if errors else "No pages fetched"
return self._error_result(ticker, error_msg, total_elapsed)
# Combine all raw payloads into a single artifact
combined_raw = json.dumps({
"ticker": ticker,
"fetched_at": datetime.now(timezone.utc).isoformat(),
"pages": [
{
"url": item["url"],
"content_hash": item["content_hash"],
"html_length": item["html_length"],
"body_length": item["body_length"],
}
for item in items
],
"errors": errors,
}).encode("utf-8")
combined_hash = content_hash(
b"".join(item["content_hash"].encode() for item in items)
)
return AdapterResult(
source_type="web_scrape",
ticker=ticker,
items=items,
raw_payload=combined_raw,
content_hash=combined_hash,
fetched_at=datetime.now(timezone.utc),
http_status=200,
response_time_ms=round(total_elapsed, 1),
metadata={
"provider": "web_scrape",
"pages_fetched": len(items),
"pages_failed": len(errors),
"errors": errors,
},
)
def _error_result(
self,
ticker: str,
error: str,
elapsed_ms: float,
) -> AdapterResult:
"""Build an error AdapterResult for scrape fetches."""
return AdapterResult(
source_type="web_scrape",
ticker=ticker,
items=[],
raw_payload=b"",
content_hash="",
fetched_at=datetime.now(timezone.utc),
error=error,
http_status=None,
response_time_ms=round(elapsed_ms, 1),
metadata={"provider": "web_scrape"},
)
+169
View File
@@ -0,0 +1,169 @@
"""Contradiction detection and disagreement representation.
Analyses weighted signals to detect and represent disagreement explicitly,
rather than collapsing contradictory evidence into a single unsupported
conclusion.
Requirements: 6.4, 6.5
"""
from __future__ import annotations
from dataclasses import dataclass
from services.aggregation.scoring import WeightedSignal
from services.shared.schemas import DisagreementDetail
@dataclass
class CatalystEntry:
    """Pairs a document with the catalyst type attributed to it.

    Kept deliberately small so contradiction detection does not have to
    import ImpactRow, which would create a circular dependency with
    worker.py.
    """

    # Identifier of the source document.
    document_id: str
    # Catalyst category assigned to that document.
    catalyst_type: str
@dataclass
class ContradictionResult:
    """Outcome of a contradiction analysis: overall score plus details."""

    # 0-1, same semantics as existing compute_contradiction_score
    score: float
    # One entry per dimension in which disagreement was found.
    details: list[DisagreementDetail]
def detect_contradictions(
    signals: list[WeightedSignal],
    catalyst_entries: list[CatalystEntry] | None = None,
) -> ContradictionResult:
    """Look for disagreement among weighted signals along several dimensions.

    Dimensions analysed:
      1. Sentiment: positive versus negative signals overall.
      2. Catalyst: opposing sentiment within one catalyst type (only when
         ``catalyst_entries`` is supplied and non-empty).

    Returns:
        A ContradictionResult holding the overall contradiction score and
        one detail record per dimension in which disagreement was found.
    """
    found: list[DisagreementDetail] = []

    sentiment_split = _detect_sentiment_disagreement(signals)
    if sentiment_split is not None:
        found.append(sentiment_split)

    if catalyst_entries:
        found.extend(_detect_catalyst_disagreement(signals, catalyst_entries))

    return ContradictionResult(
        score=_compute_overall_score(signals),
        details=found,
    )
def _compute_overall_score(signals: list[WeightedSignal]) -> float:
    """Return the minority share of sentiment weight, rounded to 4 places.

    Each signal contributes ``weight.combined * impact_score`` to the
    positive or negative side according to the sign of its sentiment;
    neutral signals contribute nothing. The result is
    ``min(pos, neg) / (pos + neg)``, or 0.0 when there is no weight at all.
    """
    positive_total = 0.0
    negative_total = 0.0
    for signal in signals:
        contribution = signal.weight.combined * signal.impact_score
        sentiment = signal.sentiment_value
        if sentiment > 0:
            positive_total += contribution
        elif sentiment < 0:
            negative_total += contribution

    grand_total = positive_total + negative_total
    if grand_total == 0.0:
        return 0.0
    return round(min(positive_total, negative_total) / grand_total, 4)
def _detect_sentiment_disagreement(
    signals: list[WeightedSignal],
) -> DisagreementDetail | None:
    """Report a sentiment split when both polarities are present.

    Signals whose contribution (``weight.combined * impact_score``) is not
    strictly positive are ignored, as are neutral signals.

    Returns:
        A "sentiment" DisagreementDetail listing the documents and summed
        weight on each side, or None when fewer than two sides are present.
    """
    bullish_docs: list[str] = []
    bearish_docs: list[str] = []
    bullish_total = 0.0
    bearish_total = 0.0

    for signal in signals:
        contribution = signal.weight.combined * signal.impact_score
        if contribution <= 0:
            continue
        sentiment = signal.sentiment_value
        if sentiment > 0:
            bullish_docs.append(signal.document_id)
            bullish_total += contribution
        elif sentiment < 0:
            bearish_docs.append(signal.document_id)
            bearish_total += contribution

    if not (bullish_docs and bearish_docs):
        return None

    combined = bullish_total + bearish_total
    if combined > 0:
        minority_pct = min(bullish_total, bearish_total) / combined
    else:
        minority_pct = 0.0

    summary = (
        f"Sentiment split: {len(bullish_docs)} positive vs {len(bearish_docs)} negative signals "
        f"(minority weight ratio {minority_pct:.0%})"
    )
    return DisagreementDetail(
        dimension="sentiment",
        positive_doc_ids=bullish_docs,
        negative_doc_ids=bearish_docs,
        positive_weight=round(bullish_total, 4),
        negative_weight=round(bearish_total, 4),
        description=summary,
    )
def _detect_catalyst_disagreement(
    signals: list[WeightedSignal],
    catalyst_entries: list[CatalystEntry],
) -> list[DisagreementDetail]:
    """Find catalyst types whose documents carry opposing sentiment.

    Only signals with a strictly positive contribution
    (``weight.combined * impact_score``) and non-zero sentiment take part.

    Returns:
        One ``catalyst:<type>`` DisagreementDetail per catalyst type that
        has at least one positive and one negative document, in the order
        the catalyst types first appear in ``catalyst_entries``.
    """
    # document_id -> (sentiment_value, contribution); a later signal for the
    # same document replaces an earlier one.
    by_document: dict[str, tuple[float, float]] = {}
    for signal in signals:
        contribution = signal.weight.combined * signal.impact_score
        if contribution > 0:
            by_document[signal.document_id] = (signal.sentiment_value, contribution)

    # catalyst type -> [(document_id, sentiment_value, contribution), ...]
    by_catalyst: dict[str, list[tuple[str, float, float]]] = {}
    for entry in catalyst_entries:
        known = by_document.get(entry.document_id)
        if known is None:
            continue
        sentiment, contribution = known
        if sentiment != 0.0:
            by_catalyst.setdefault(entry.catalyst_type, []).append(
                (entry.document_id, sentiment, contribution)
            )

    disagreements: list[DisagreementDetail] = []
    for catalyst, members in by_catalyst.items():
        positives = [(doc, weight) for doc, sentiment, weight in members if sentiment > 0]
        negatives = [(doc, weight) for doc, sentiment, weight in members if sentiment < 0]
        if not positives or not negatives:
            continue
        disagreements.append(DisagreementDetail(
            dimension=f"catalyst:{catalyst}",
            positive_doc_ids=[doc for doc, _ in positives],
            negative_doc_ids=[doc for doc, _ in negatives],
            positive_weight=round(sum(weight for _, weight in positives), 4),
            negative_weight=round(sum(weight for _, weight in negatives), 4),
            description=(
                f"Catalyst '{catalyst}' has {len(positives)} positive and "
                f"{len(negatives)} negative signals"
            ),
        ))
    return disagreements
+141
View File
@@ -0,0 +1,141 @@
"""Evidence ranking for supporting and opposing documents.
Ranks document signals by a composite score that considers multiple
factors beyond raw weight, producing explainable evidence lists for
trend summaries.
Requirements: 6.5
"""
from __future__ import annotations
from dataclasses import dataclass
from services.aggregation.scoring import WeightedSignal
@dataclass(frozen=True)
class EvidenceRankConfig:
    """Factor weights and list cap used when ranking evidence."""

    # Share given to the combined signal weight
    # (recency * credibility * novelty * market).
    weight_factor: float = 0.40

    # Share given to the document's impact score.
    impact_factor: float = 0.30

    # Share given to recency on its own, so fresh evidence ranks higher.
    recency_factor: float = 0.20

    # Share given to extraction confidence.
    confidence_factor: float = 0.10

    # Cap on evidence refs returned per side (supporting / opposing).
    max_refs: int = 10


# Shared default instance, used as the default ``config`` argument.
DEFAULT_RANK_CONFIG = EvidenceRankConfig()
@dataclass
class RankedEvidence:
    """One document's composite rank score together with its components."""

    document_id: str
    # Sum of the four components below.
    rank_score: float
    weight_component: float
    impact_component: float
    recency_component: float
    confidence_component: float
    # +1 / -1 / 0
    sentiment_value: float
def compute_evidence_rank(
    signal: WeightedSignal,
    config: EvidenceRankConfig = DEFAULT_RANK_CONFIG,
) -> RankedEvidence:
    """Score a single signal for evidence ranking.

    The rank score is the sum of four products:
      - ``weight.combined``    * ``config.weight_factor``
      - ``impact_score``       * ``config.impact_factor``
      - ``weight.recency``     * ``config.recency_factor``
      - ``weight.credibility`` * ``config.confidence_factor``
        (credibility stands in for extraction confidence here)

    NOTE(review): the score is bounded by the sum of the factor weights
    only if every input lies in [0, 1]; that is not enforced here.

    Returns:
        A RankedEvidence with the score and each component rounded to six
        decimal places, plus the signal's sentiment value.
    """
    weights = signal.weight
    from_weight = weights.combined * config.weight_factor
    from_impact = signal.impact_score * config.impact_factor
    from_recency = weights.recency * config.recency_factor
    from_confidence = weights.credibility * config.confidence_factor

    composite = from_weight + from_impact + from_recency + from_confidence

    return RankedEvidence(
        document_id=signal.document_id,
        rank_score=round(composite, 6),
        weight_component=round(from_weight, 6),
        impact_component=round(from_impact, 6),
        recency_component=round(from_recency, 6),
        confidence_component=round(from_confidence, 6),
        sentiment_value=signal.sentiment_value,
    )
def rank_evidence(
    signals: list[WeightedSignal],
    config: EvidenceRankConfig = DEFAULT_RANK_CONFIG,
) -> tuple[list[str], list[str]]:
    """Rank signals into top supporting and opposing document ID lists.

    Supporting = positive sentiment, Opposing = negative sentiment.
    Signals with a sentiment value of exactly zero are excluded.

    The ranking itself is delegated to ``rank_evidence_detailed`` so both
    entry points share one implementation; this function only projects the
    ranked records down to their document IDs.

    Returns:
        ``(supporting_ids, opposing_ids)``, each ordered by descending rank
        score and capped at ``config.max_refs``.
    """
    supporting, opposing = rank_evidence_detailed(signals, config)
    return (
        [ranked.document_id for ranked in supporting],
        [ranked.document_id for ranked in opposing],
    )
def rank_evidence_detailed(
    signals: list[WeightedSignal],
    config: EvidenceRankConfig = DEFAULT_RANK_CONFIG,
) -> tuple[list[RankedEvidence], list[RankedEvidence]]:
    """Rank signals and return full RankedEvidence records per side.

    Same selection as rank_evidence, but keeps the score breakdown for
    callers that need explainability. Signals with a sentiment value of
    exactly zero are skipped.

    Returns:
        ``(supporting, opposing)``, each sorted by descending rank score
        and capped at ``config.max_refs``.
    """
    # (is_supporting, ranked record) for every non-neutral signal.
    scored = [
        (sig.sentiment_value > 0, compute_evidence_rank(sig, config))
        for sig in signals
        if sig.sentiment_value != 0.0
    ]

    def _top(side: bool) -> list[RankedEvidence]:
        # sorted() is stable, so ties keep their input order.
        ordered = sorted(
            (ranked for is_supporting, ranked in scored if is_supporting is side),
            key=lambda ranked: ranked.rank_score,
            reverse=True,
        )
        return ordered[: config.max_refs]

    return (_top(True), _top(False))
+57
View File
@@ -0,0 +1,57 @@
"""Aggregation worker entrypoint - polls Redis for aggregation jobs."""
from __future__ import annotations
import asyncio
import json
import logging
import asyncpg
from services.aggregation.worker import aggregate_company
from services.shared.config import load_config
from services.shared.logging import setup_logging
from services.shared.redis_keys import QUEUE_AGGREGATION, queue_key
logger = logging.getLogger("aggregation_main")
async def main() -> None:
    """Run the aggregation worker loop until cancelled or a fatal error.

    Pops JSON jobs from the aggregation queue in Redis and runs
    ``aggregate_company`` for each job's ``ticker``. An empty queue is
    polled again after one second. Malformed jobs and jobs without a
    ticker are logged and discarded; a failing aggregation is logged and
    the loop continues. Both connections are closed on the way out.
    """
    import redis.asyncio as aioredis

    config = load_config()
    setup_logging("aggregation", level=config.log_level, json_output=config.json_logs)

    pool = await asyncpg.create_pool(dsn=config.postgres.dsn, min_size=2, max_size=8)
    redis_client = aioredis.from_url(config.redis.url)
    queue = queue_key(QUEUE_AGGREGATION)

    logger.info("Aggregation worker started, polling %s", queue)

    try:
        while True:
            raw = await redis_client.lpop(queue)
            if raw is None:
                await asyncio.sleep(1)
                continue

            # A single bad message must not take the worker down.
            try:
                job = json.loads(raw)
            except (TypeError, ValueError):
                logger.exception("Discarding malformed aggregation job: %r", raw)
                continue

            ticker = job.get("ticker", "") if isinstance(job, dict) else ""
            if not ticker:
                logger.warning("Discarding aggregation job without a ticker: %r", raw)
                continue

            logger.info("Processing aggregation job for %s", ticker)
            try:
                summaries = await aggregate_company(pool, ticker)
                logger.info(
                    "Aggregation complete for %s: %d windows",
                    ticker, len(summaries),
                )
            except Exception:
                # Top-level boundary: log with traceback and keep polling.
                logger.exception("Aggregation failed for %s", ticker)
    finally:
        # Close Redis even if closing the pool raises.
        try:
            await pool.close()
        finally:
            await redis_client.close()


if __name__ == "__main__":
    asyncio.run(main())
+150
View File
@@ -0,0 +1,150 @@
"""Market context feature computation for aggregation windows.
Fetches recent market snapshots from PostgreSQL and computes context
features (price change, volume trend, volatility) that enrich trend
summaries and modulate signal weighting.
Requirements: 6.1, 6.2
"""
from __future__ import annotations
import math
from datetime import datetime, timedelta, timezone
from typing import Any
import asyncpg
from services.shared.schemas import MarketContext, TrendWindow
# Map TrendWindow values to lookback durations in days.
# fetch_market_context subtracts this many days from its reference time to
# get the start of the snapshot query range, and falls back to 8 days for a
# window that is not listed here.
# NOTE(review): the multi-day values look padded beyond the nominal window
# length (8 for seven-day, 35 for thirty-day); presumably to cover
# non-trading days. Confirm.
WINDOW_LOOKBACK_DAYS: dict[str, int] = {
TrendWindow.INTRADAY.value: 1,
TrendWindow.ONE_DAY.value: 2,
TrendWindow.SEVEN_DAY.value: 8,
TrendWindow.THIRTY_DAY.value: 35,
TrendWindow.NINETY_DAY.value: 95,
}
async def fetch_market_context(
    pool: asyncpg.Pool,
    ticker: str,
    window: str,
    reference_time: datetime | None = None,
) -> MarketContext:
    """Build a MarketContext for *ticker* over the given trend *window*.

    Reads rows from ``market_snapshots`` captured between
    ``reference_time - lookback`` and ``reference_time`` (inclusive, oldest
    first), where the lookback comes from ``WINDOW_LOOKBACK_DAYS`` and
    defaults to 8 days for an unknown window. The extracted bars yield:

      - price_change_pct: (last_close - first_close) / first_close, as a percentage
      - avg_volume: mean volume across bars
      - volume_change_pct: second-half avg volume vs first-half avg volume, as a percentage
      - volatility: std-dev of close prices
      - latest_close / latest_bar_at

    Args:
        pool: Connection pool used for the query.
        ticker: Ticker to look up.
        window: Trend window value selecting the lookback.
        reference_time: End of the query range; defaults to the current UTC time.

    Returns:
        The computed MarketContext, or ``MarketContext(ticker=ticker)`` when
        no rows exist or no usable bar could be extracted from them.
    """
    end = datetime.now(timezone.utc) if reference_time is None else reference_time
    begin = end - timedelta(days=WINDOW_LOOKBACK_DAYS.get(window, 8))

    snapshot_rows = await pool.fetch(
        """
        SELECT data, captured_at
        FROM market_snapshots
        WHERE ticker = $1
        AND captured_at >= $2
        AND captured_at <= $3
        ORDER BY captured_at ASC
        """,
        ticker,
        begin,
        end,
    )

    bars = _extract_bars(snapshot_rows) if snapshot_rows else []
    if bars:
        return _compute_context(ticker, bars)
    return MarketContext(ticker=ticker)
def _extract_bars(rows: list[Any]) -> list[dict[str, Any]]:
    """Extract normalised bar dicts from market_snapshot rows.

    Each row's ``data`` value may be a JSON string, a single bar object, or
    a list of bar objects. A bar's close is read from ``c`` (falling back to
    ``close``) and its volume from ``v`` (falling back to ``volume``); a
    key counts as missing only when it is absent or null, so a value of 0
    is kept rather than treated as missing.

    Skipped without raising: null data, items that are not JSON objects,
    bars with no close, and bars whose close or volume is not numeric.

    Args:
        rows: Rows supporting ``row["data"]`` and ``row["captured_at"]``.

    Returns:
        A list of ``{"close": float, "volume": float, "captured_at": ...}``
        dicts in row order; volume is 0.0 when absent.

    Raises:
        json.JSONDecodeError: If a string ``data`` value is not valid JSON.
    """
    import json

    def _first_present(item: dict[str, Any], *keys: str) -> Any:
        # First value that is present and not null, else None.
        for key in keys:
            value = item.get(key)
            if value is not None:
                return value
        return None

    bars: list[dict[str, Any]] = []
    for row in rows:
        data = row["data"]
        if isinstance(data, str):
            data = json.loads(data)

        # Single bar or list of bars
        items = data if isinstance(data, list) else [data]
        for item in items:
            if not isinstance(item, dict):
                continue
            close = _first_present(item, "c", "close")
            if close is None:
                continue
            volume = _first_present(item, "v", "volume")
            try:
                bar = {
                    "close": float(close),
                    "volume": float(volume) if volume is not None else 0.0,
                    "captured_at": row["captured_at"],
                }
            except (TypeError, ValueError):
                # One non-numeric bar should not discard the whole window.
                continue
            bars.append(bar)
    return bars
def _compute_context(ticker: str, bars: list[dict[str, Any]]) -> MarketContext:
    """Turn a non-empty, time-ordered list of bars into a MarketContext.

    Computes the percentage price change from the first to the last close,
    the mean volume, the percentage change of the second-half mean volume
    relative to the first-half mean, and the population standard deviation
    of the closes. Ratios with a non-positive (volume) or zero (price)
    denominator are reported as 0.0.
    """
    closes = [bar["close"] for bar in bars]
    volumes = [bar["volume"] for bar in bars]

    opening, latest = closes[0], closes[-1]
    if opening != 0:
        price_change_pct = (latest - opening) / opening * 100.0
    else:
        price_change_pct = 0.0

    avg_volume = sum(volumes) / len(volumes) if volumes else 0.0

    # Volume trend: second half of the bars compared with the first half.
    volume_change_pct = 0.0
    split = len(volumes) // 2
    if split > 0:
        later = volumes[split:]
        earlier_avg = sum(volumes[:split]) / split
        later_avg = sum(later) / len(later)
        if earlier_avg > 0:
            volume_change_pct = (later_avg - earlier_avg) / earlier_avg * 100.0

    # Volatility: population std dev of closes (needs at least two bars).
    volatility = 0.0
    if len(closes) > 1:
        mean_close = sum(closes) / len(closes)
        variance = sum((c - mean_close) ** 2 for c in closes) / len(closes)
        volatility = math.sqrt(variance)

    return MarketContext(
        ticker=ticker,
        price_change_pct=round(price_change_pct, 4),
        avg_volume=round(avg_volume, 2),
        volume_change_pct=round(volume_change_pct, 4),
        volatility=round(volatility, 6),
        latest_close=latest,
        latest_bar_at=bars[-1]["captured_at"],
        bars_available=len(bars),
    )
+439
View File
@@ -0,0 +1,439 @@
"""Sector and market-level rollup aggregation.
Aggregates company-level trend summaries into sector and market-level
summaries, enabling top-down views of sentiment and risk across the
portfolio.
Requirements: 6.3, 6.4, 6.5
"""
from __future__ import annotations
import json
import logging
from dataclasses import dataclass
from datetime import datetime, timedelta, timezone
import asyncpg
from services.shared.schemas import (
DisagreementDetail,
TrendDirection,
TrendSummary,
TrendWindow,
)
logger = logging.getLogger(__name__)
@dataclass
class CompanyTrendRow:
    """Company-level trend summary as read from the DB for rollups."""

    # ticker
    entity_id: str
    sector: str
    window: str
    trend_direction: str
    trend_strength: float
    confidence: float
    contradiction_score: float
    dominant_catalysts: list[str]
    material_risks: list[str]
    top_supporting_evidence: list[str]
    top_opposing_evidence: list[str]
# ---------------------------------------------------------------------------
# Fetch latest company trends for a given window
# ---------------------------------------------------------------------------
# Newest company-level trend row per ticker for one window.
# DISTINCT ON (tw.entity_id) together with the ORDER BY on
# (entity_id, generated_at DESC) keeps only the most recent row per ticker.
# Parameters: $1 = window value, $2 = earliest generated_at to consider.
# Only tickers matching a row in companies with active = TRUE are returned.
_LATEST_COMPANY_TRENDS_QUERY = """
SELECT DISTINCT ON (tw.entity_id)
tw.entity_id,
c.sector,
tw.window,
tw.trend_direction,
tw.trend_strength,
tw.confidence,
tw.contradiction_score,
tw.dominant_catalysts,
tw.material_risks,
tw.top_supporting_evidence,
tw.top_opposing_evidence
FROM trend_windows tw
JOIN companies c ON c.ticker = tw.entity_id AND c.active = TRUE
WHERE tw.entity_type = 'company'
AND tw.window = $1
AND tw.generated_at >= $2
ORDER BY tw.entity_id, tw.generated_at DESC
"""
def _parse_jsonb_list(val: object) -> list[str]:
    """Safely parse a JSONB column that should be a list of strings.

    Accepts either an already-decoded list or a JSON string encoding one.
    Every element is converted with ``str``.

    Returns:
        The list of strings, or an empty list when ``val`` is None, is not a
        list (or a JSON string for one), or is a string that is not valid JSON.
    """
    if isinstance(val, str):
        try:
            val = json.loads(val)
        except ValueError:
            # json.JSONDecodeError is a ValueError; bad data yields "no items"
            # instead of failing the whole row.
            return []
    if isinstance(val, list):
        return [str(v) for v in val]
    return []
def _parse_company_trend_row(row: object) -> CompanyTrendRow:
    """Build a CompanyTrendRow from a mapping-like database row.

    Missing (None) text columns become an empty string, except ``sector``
    which becomes "Unknown" when None or empty; missing numeric columns
    become 0.0; list columns go through ``_parse_jsonb_list``.

    Raises:
        TypeError: If ``row`` does not support item access.
    """
    # Rows are untyped here, so look up item access dynamically.
    lookup = getattr(row, "__getitem__", None)
    if lookup is None:
        raise TypeError(f"Expected a mapping-like row, got {type(row)}")

    def _text(column: str, fallback: str = "") -> str:
        raw = lookup(column)
        if raw is None:
            return fallback
        return str(raw)

    def _number(column: str) -> float:
        raw = lookup(column)
        if raw is None:
            return 0.0
        return float(raw)

    def _strings(column: str) -> list[str]:
        return _parse_jsonb_list(lookup(column))

    return CompanyTrendRow(
        entity_id=_text("entity_id"),
        sector=_text("sector", "Unknown") or "Unknown",
        window=_text("window"),
        trend_direction=_text("trend_direction"),
        trend_strength=_number("trend_strength"),
        confidence=_number("confidence"),
        contradiction_score=_number("contradiction_score"),
        dominant_catalysts=_strings("dominant_catalysts"),
        material_risks=_strings("material_risks"),
        top_supporting_evidence=_strings("top_supporting_evidence"),
        top_opposing_evidence=_strings("top_opposing_evidence"),
    )
async def fetch_latest_company_trends(
    pool: asyncpg.Pool,
    window: str,
    since: datetime,
) -> list[CompanyTrendRow]:
    """Load the newest company-level trend per ticker for one window.

    Args:
        pool: Connection pool used for the query.
        window: Trend window value to select.
        since: Only trends generated at or after this time are considered.

    Returns:
        One parsed CompanyTrendRow per row returned by the query.
    """
    records = await pool.fetch(_LATEST_COMPANY_TRENDS_QUERY, window, since)
    parsed: list[CompanyTrendRow] = []
    for record in records:
        parsed.append(_parse_company_trend_row(record))
    return parsed
# ---------------------------------------------------------------------------
# Pure rollup logic
# ---------------------------------------------------------------------------
# Direction mapping for numeric aggregation: bullish counts as +1, bearish
# as -1, and mixed/neutral as 0, so a confidence-weighted average of these
# values lands in [-1, 1] for non-negative confidences.
_DIRECTION_VALUES = {
TrendDirection.BULLISH.value: 1.0,
TrendDirection.BEARISH.value: -1.0,
TrendDirection.MIXED.value: 0.0,
TrendDirection.NEUTRAL.value: 0.0,
}
# Cut-offs on the averaged direction value: at or above BULLISH_THRESHOLD a
# rollup is bullish, at or below BEARISH_THRESHOLD it is bearish.
BULLISH_THRESHOLD = 0.15
BEARISH_THRESHOLD = -0.15
def rollup_trends(
    trends: list[CompanyTrendRow],
    entity_type: str,
    entity_id: str,
    window: str,
    reference_time: datetime,
) -> TrendSummary:
    """Combine company-level trends into one rollup summary.

    Every company contributes in proportion to its confidence: direction,
    strength and contradiction are confidence-weighted averages, and the
    rollup confidence is the mean confidence. Catalysts are ranked by the
    summed confidence of the companies naming them; risks are lower-cased,
    de-duplicated and ranked by the highest confidence naming them (top
    five each). Evidence lists are concatenated, de-duplicated in order and
    capped at ten.

    Returns:
        The rollup TrendSummary, or a neutral zero-strength, zero-confidence
        summary when ``trends`` is empty or the confidences sum to zero.
    """

    def _neutral_summary() -> TrendSummary:
        return TrendSummary(
            entity_type=entity_type,
            entity_id=entity_id,
            window=TrendWindow(window),
            trend_direction=TrendDirection.NEUTRAL,
            trend_strength=0.0,
            confidence=0.0,
            generated_at=reference_time,
        )

    if not trends:
        return _neutral_summary()

    confidence_sum = 0.0
    direction_acc = 0.0
    strength_acc = 0.0
    contradiction_acc = 0.0
    catalyst_scores: dict[str, float] = {}
    risk_scores: dict[str, float] = {}
    supporting_refs: list[str] = []
    opposing_refs: list[str] = []

    for trend in trends:
        conf = trend.confidence
        confidence_sum += conf
        direction_acc += conf * _DIRECTION_VALUES.get(trend.trend_direction, 0.0)
        strength_acc += conf * trend.trend_strength
        contradiction_acc += conf * trend.contradiction_score

        for catalyst in trend.dominant_catalysts:
            catalyst_scores[catalyst] = catalyst_scores.get(catalyst, 0.0) + conf

        for risk in trend.material_risks:
            key = risk.strip().lower()
            # Keep the highest confidence seen for each normalised risk.
            risk_scores[key] = max(risk_scores.get(key, conf), conf)

        supporting_refs.extend(trend.top_supporting_evidence)
        opposing_refs.extend(trend.top_opposing_evidence)

    if confidence_sum == 0.0:
        return _neutral_summary()

    avg_direction = direction_acc / confidence_sum
    avg_strength = strength_acc / confidence_sum
    avg_contradiction = contradiction_acc / confidence_sum
    avg_confidence = confidence_sum / len(trends)

    def _top_five(scores: dict[str, float]) -> list[str]:
        ranked = sorted(scores.items(), key=lambda pair: pair[1], reverse=True)
        return [name for name, _ in ranked[:5]]

    return TrendSummary(
        entity_type=entity_type,
        entity_id=entity_id,
        window=TrendWindow(window),
        trend_direction=_derive_rollup_direction(avg_direction, avg_contradiction),
        trend_strength=round(min(abs(avg_strength), 1.0), 4),
        confidence=round(max(0.0, min(avg_confidence, 1.0)), 4),
        top_supporting_evidence=list(dict.fromkeys(supporting_refs))[:10],
        top_opposing_evidence=list(dict.fromkeys(opposing_refs))[:10],
        dominant_catalysts=_top_five(catalyst_scores),
        material_risks=_top_five(risk_scores),
        contradiction_score=round(max(0.0, min(avg_contradiction, 1.0)), 4),
        disagreement_details=_build_rollup_disagreement(trends, entity_id),
        generated_at=reference_time,
    )
def _derive_rollup_direction(
    avg_direction: float,
    avg_contradiction: float,
) -> TrendDirection:
    """Classify an averaged direction value as a TrendDirection.

    A weak direction (magnitude below 0.3) combined with a contradiction
    above 0.10 is MIXED. Otherwise the bullish/bearish thresholds decide,
    and anything in between is NEUTRAL.
    """
    weak_direction = abs(avg_direction) < 0.3
    if weak_direction and avg_contradiction > 0.10:
        result = TrendDirection.MIXED
    elif avg_direction >= BULLISH_THRESHOLD:
        result = TrendDirection.BULLISH
    elif avg_direction <= BEARISH_THRESHOLD:
        result = TrendDirection.BEARISH
    else:
        result = TrendDirection.NEUTRAL
    return result
def _build_rollup_disagreement(
    trends: list[CompanyTrendRow],
    entity_id: str,
) -> list[DisagreementDetail]:
    """Describe which companies are bullish and which are bearish.

    Returns:
        A single-element list with a "company_direction" detail (tickers
        and summed confidence per side) when both sides are present,
        otherwise an empty list.
    """
    # direction value -> (tickers, summed confidence)
    sides: dict[str, tuple[list[str], float]] = {}
    for trend in trends:
        if trend.trend_direction == TrendDirection.BULLISH.value:
            side = "bullish"
        elif trend.trend_direction == TrendDirection.BEARISH.value:
            side = "bearish"
        else:
            continue
        tickers, weight = sides.get(side, ([], 0.0))
        tickers.append(trend.entity_id)
        sides[side] = (tickers, weight + trend.confidence)

    if "bullish" not in sides or "bearish" not in sides:
        return []

    bulls, bull_weight = sides["bullish"]
    bears, bear_weight = sides["bearish"]
    detail = DisagreementDetail(
        dimension="company_direction",
        positive_doc_ids=bulls,
        negative_doc_ids=bears,
        positive_weight=round(bull_weight, 4),
        negative_weight=round(bear_weight, 4),
        description=(
            f"{entity_id}: {len(bulls)} bullish vs "
            f"{len(bears)} bearish companies"
        ),
    )
    return [detail]
# ---------------------------------------------------------------------------
# Persist rollup (reuses the same trend_windows table)
# ---------------------------------------------------------------------------
# Inserts one trend_windows row and returns its id.
# NOTE: despite the name this is a plain INSERT (there is no ON CONFLICT
# clause), so every call adds a new row.
# "window" is double-quoted because WINDOW is a reserved keyword in
# PostgreSQL and cannot appear as a bare identifier in a column list.
_UPSERT_TREND = """
INSERT INTO trend_windows (
    entity_type, entity_id, "window", trend_direction, trend_strength,
    confidence, top_supporting_evidence, top_opposing_evidence,
    dominant_catalysts, material_risks, contradiction_score,
    disagreement_details, market_context, generated_at
) VALUES (
    $1, $2, $3, $4, $5,
    $6, $7::jsonb, $8::jsonb,
    $9::jsonb, $10::jsonb, $11,
    $12::jsonb, $13::jsonb, $14
)
RETURNING id
"""
async def persist_rollup(
    pool: asyncpg.Pool,
    summary: TrendSummary,
) -> str:
    """Store a rollup trend summary and return the new row's id as a string.

    List fields are serialised to JSON text; the market context column is
    always written as an empty JSON object for rollups.
    """
    disagreement_json = json.dumps(
        [detail.model_dump() for detail in summary.disagreement_details]
    )
    # Positional parameters in the order expected by _UPSERT_TREND ($1..$14).
    params = (
        summary.entity_type,
        summary.entity_id,
        summary.window.value,
        summary.trend_direction.value,
        summary.trend_strength,
        summary.confidence,
        json.dumps(summary.top_supporting_evidence),
        json.dumps(summary.top_opposing_evidence),
        json.dumps(summary.dominant_catalysts),
        json.dumps(summary.material_risks),
        summary.contradiction_score,
        disagreement_json,
        json.dumps({}),
        summary.generated_at,
    )
    inserted = await pool.fetchrow(_UPSERT_TREND, *params)
    return str(inserted["id"])  # type: ignore[index]
# ---------------------------------------------------------------------------
# High-level rollup entry points
# ---------------------------------------------------------------------------
async def aggregate_sector(
    pool: asyncpg.Pool,
    sector: str,
    window: str,
    reference_time: datetime | None = None,
    since: datetime | None = None,
) -> TrendSummary:
    """Compute a sector-level rollup for one window and persist it.

    Loads the latest company trends for the window, keeps those whose
    sector equals ``sector``, and rolls them up. The summary is persisted
    only when at least one company matched.

    Args:
        pool: Connection pool used for reads and the insert.
        sector: Sector name to roll up.
        window: Trend window value.
        reference_time: Timestamp for the rollup; defaults to current UTC time.
        since: Earliest company trend to consider; defaults to
            ``reference_time`` minus the window's lookback.

    Returns:
        The sector TrendSummary (neutral when no company matched).
    """
    now = datetime.now(timezone.utc) if reference_time is None else reference_time
    cutoff = now - _window_lookback(window) if since is None else since

    company_trends = await fetch_latest_company_trends(pool, window, cutoff)
    in_sector = [trend for trend in company_trends if trend.sector == sector]

    summary = rollup_trends(in_sector, "sector", sector, window, now)
    if not in_sector:
        return summary

    rollup_id = await persist_rollup(pool, summary)
    logger.info(
        "Persisted sector rollup %s for %s/%s: direction=%s strength=%.3f companies=%d",
        rollup_id, sector, window, summary.trend_direction.value,
        summary.trend_strength, len(in_sector),
    )
    return summary
async def aggregate_market(
    pool: asyncpg.Pool,
    window: str,
    reference_time: datetime | None = None,
    since: datetime | None = None,
) -> TrendSummary:
    """Compute a market-wide rollup for one window and persist it.

    All latest company trends for the window are rolled up together,
    regardless of sector, under entity type "market" and entity id "all".
    The summary is persisted only when at least one company trend exists.

    Args:
        pool: Connection pool used for reads and the insert.
        window: Trend window value.
        reference_time: Timestamp for the rollup; defaults to current UTC time.
        since: Earliest company trend to consider; defaults to
            ``reference_time`` minus the window's lookback.

    Returns:
        The market TrendSummary (neutral when there are no company trends).
    """
    now = datetime.now(timezone.utc) if reference_time is None else reference_time
    cutoff = now - _window_lookback(window) if since is None else since

    company_trends = await fetch_latest_company_trends(pool, window, cutoff)
    summary = rollup_trends(company_trends, "market", "all", window, now)
    if not company_trends:
        return summary

    rollup_id = await persist_rollup(pool, summary)
    logger.info(
        "Persisted market rollup %s for %s: direction=%s strength=%.3f companies=%d",
        rollup_id, window, summary.trend_direction.value,
        summary.trend_strength, len(company_trends),
    )
    return summary
async def aggregate_all_sectors(
    pool: asyncpg.Pool,
    window: str,
    reference_time: datetime | None = None,
    since: datetime | None = None,
) -> list[TrendSummary]:
    """Compute and persist sector rollups for every sector that has company trends.

    The latest company trends are fetched once, grouped by sector, and each
    group is rolled up, persisted and logged in the same way as
    ``aggregate_sector`` does for a single sector.

    Args:
        pool: Connection pool used for reads and the inserts.
        window: Trend window value.
        reference_time: Timestamp for the rollups; defaults to current UTC time.
        since: Earliest company trend to consider; defaults to
            ``reference_time`` minus the window's lookback.

    Returns:
        One TrendSummary per sector, in the order sectors are first seen.
    """
    if reference_time is None:
        reference_time = datetime.now(timezone.utc)
    if since is None:
        since = reference_time - _window_lookback(window)

    all_trends = await fetch_latest_company_trends(pool, window, since)

    # Group by sector; every group holds at least one trend by construction.
    sectors: dict[str, list[CompanyTrendRow]] = {}
    for t in all_trends:
        sectors.setdefault(t.sector, []).append(t)

    summaries: list[TrendSummary] = []
    for sector, trends in sectors.items():
        summary = rollup_trends(trends, "sector", sector, window, reference_time)
        rollup_id = await persist_rollup(pool, summary)
        logger.info(
            "Persisted sector rollup %s for %s/%s: direction=%s strength=%.3f companies=%d",
            rollup_id, sector, window, summary.trend_direction.value,
            summary.trend_strength, len(trends),
        )
        summaries.append(summary)
    return summaries
def _window_lookback(window: str) -> timedelta:
    """Return how far back to look for recent company trends of a window.

    Unknown window values fall back to eight days.
    """
    lookback_days = {
        TrendWindow.INTRADAY.value: 1,  # i.e. 24 hours
        TrendWindow.ONE_DAY.value: 2,
        TrendWindow.SEVEN_DAY.value: 8,
        TrendWindow.THIRTY_DAY.value: 35,
        TrendWindow.NINETY_DAY.value: 95,
    }
    return timedelta(days=lookback_days.get(window, 8))
+285
View File
@@ -0,0 +1,285 @@
"""Recency decay, source credibility weighting, and market context
integration for aggregation.
Provides scoring functions used by the aggregation engine to weight
document intelligence signals when computing trend summaries.
Requirements: 6.1, 6.2, 6.5
"""
from __future__ import annotations
import math
from dataclasses import dataclass, field
from datetime import datetime, timezone
from services.shared.schemas import MarketContext
@dataclass(frozen=True)
class ScoringConfig:
    """Parameters that tune how document signals are weighted."""

    # --- Recency decay ---
    # Exponential half-life, in hours, keyed by window. A document that is
    # one half-life old has a recency weight of 0.5.
    half_life_hours: dict[str, float] = field(
        default_factory=lambda: {
            "intraday": 2.0,
            "1d": 12.0,
            "7d": 72.0,
            "30d": 240.0,
            "90d": 720.0,
        }
    )

    # Lower bound on the recency weight, so very old documents still carry
    # a trace-level signal instead of dropping to zero.
    min_recency_weight: float = 0.01

    # --- Source credibility ---
    # Credibility scores are clamped into [floor, ceiling] before use.
    credibility_floor: float = 0.1
    credibility_ceiling: float = 1.0

    # Exponent applied to the clamped credibility. Above 1 penalises
    # low-credibility sources more aggressively; below 1 flattens the curve.
    credibility_exponent: float = 1.0

    # --- Novelty ---
    # Largest bonus multiplier added on top of the base weight; a
    # novelty_score of 1.0 earns all of it, 0.0 earns none.
    novelty_bonus_max: float = 0.25

    # --- Extraction confidence ---
    # Documents below this extraction confidence receive zero weight
    # (too unreliable to aggregate).
    confidence_floor: float = 0.2

    # --- Market context modulation ---
    # Above this volatility (in price units) recency signals are amplified,
    # because fresh data matters more in fast-moving markets.
    volatility_recency_boost_threshold: float = 1.0
    # Max extra multiplier from the volatility boost.
    volatility_recency_boost_max: float = 0.30

    # Above this volume % change signals get a small boost, because
    # high-volume moves carry more conviction.
    volume_surge_threshold_pct: float = 50.0
    volume_surge_boost: float = 0.15


# Shared default instance, used as the default ``config`` argument.
DEFAULT_CONFIG = ScoringConfig()
# ---------------------------------------------------------------------------
# Recency decay
# ---------------------------------------------------------------------------
def recency_weight(
    published_at: datetime,
    reference_time: datetime,
    window: str,
    config: ScoringConfig = DEFAULT_CONFIG,
) -> float:
    """Return the exponential recency-decay weight of a document.

    The weight is ``2 ** (-age_hours / half_life)``, where the half-life is
    looked up in ``config.half_life_hours`` by ``window`` (72 hours when the
    window is not listed).

    Args:
        published_at: Publication time of the document.
        reference_time: The "now" anchor of the aggregation window.
        window: Window name used to select the half-life (e.g. "7d").
        config: Scoring parameters.

    Returns:
        1.0 when the document is not older than the reference time,
        otherwise the decayed weight, never below
        ``config.min_recency_weight``.
    """
    # Naive timestamps are interpreted as UTC so the subtraction is defined.
    published = (
        published_at
        if published_at.tzinfo is not None
        else published_at.replace(tzinfo=timezone.utc)
    )
    anchor = (
        reference_time
        if reference_time.tzinfo is not None
        else reference_time.replace(tzinfo=timezone.utc)
    )

    elapsed_seconds = (anchor - published).total_seconds()
    if elapsed_seconds <= 0:
        # Published at or after the anchor: treat as perfectly fresh.
        return 1.0

    half_life = config.half_life_hours.get(window, 72.0)
    decayed = math.pow(2.0, -(elapsed_seconds / 3600.0) / half_life)
    return max(decayed, config.min_recency_weight)
# ---------------------------------------------------------------------------
# Source credibility weighting
# ---------------------------------------------------------------------------
def credibility_weight(
    source_credibility: float,
    config: ScoringConfig = DEFAULT_CONFIG,
) -> float:
    """Turn a source credibility score into a weight.

    The score is clamped to
    ``[config.credibility_floor, config.credibility_ceiling]`` and then
    raised to ``config.credibility_exponent``.

    Args:
        source_credibility: Credibility score of the source or document
            intelligence record.
        config: Scoring parameters.

    Returns:
        The clamped score raised to the configured exponent.
    """
    capped = min(source_credibility, config.credibility_ceiling)
    bounded = max(config.credibility_floor, capped)
    return math.pow(bounded, config.credibility_exponent)
# ---------------------------------------------------------------------------
# Market context adjustment
# ---------------------------------------------------------------------------
def market_context_multiplier(
    market_ctx: MarketContext | None,
    config: ScoringConfig = DEFAULT_CONFIG,
) -> float:
    """Return a multiplicative boost derived from market context features.

    The result is ``1.0`` plus up to two additive boosts:

    * a volatility boost, ``log1p(volatility - threshold) * 0.15`` capped at
      ``config.volatility_recency_boost_max``, applied only when volatility
      is above ``config.volatility_recency_boost_threshold``;
    * a fixed ``config.volume_surge_boost`` when the volume change exceeds
      ``config.volume_surge_threshold_pct``.

    Returns exactly 1.0 when ``market_ctx`` is ``None`` or reports no data.
    """
    if market_ctx is None or not market_ctx.has_data:
        return 1.0

    extra = 0.0

    volatility = market_ctx.volatility
    threshold = config.volatility_recency_boost_threshold
    if volatility is not None and volatility > threshold:
        # Logarithmic scaling keeps extreme volatility from inflating weights.
        log_scaled = math.log1p(volatility - threshold) * 0.15
        extra += min(log_scaled, config.volatility_recency_boost_max)

    volume_change = market_ctx.volume_change_pct
    if volume_change is not None and volume_change > config.volume_surge_threshold_pct:
        extra += config.volume_surge_boost

    return 1.0 + extra
# ---------------------------------------------------------------------------
# Combined document signal weight
# ---------------------------------------------------------------------------
@dataclass
class SignalWeight:
    """Component-by-component breakdown of one document's aggregation weight.

    Attributes:
        recency: Recency decay weight.
        credibility: Source credibility weight.
        novelty_bonus: Additive novelty bonus (used as ``1 + novelty_bonus``).
        confidence_gate: 1.0 when the document passed the confidence floor,
            0.0 when it was gated out.
        market_ctx_multiplier: Market context multiplier (1.0 or greater).
        combined: Product of all components; the final weight.
    """

    recency: float
    credibility: float
    novelty_bonus: float
    confidence_gate: float
    market_ctx_multiplier: float
    combined: float
def compute_signal_weight(
    published_at: datetime,
    reference_time: datetime,
    window: str,
    source_credibility: float,
    novelty_score: float = 0.5,
    extraction_confidence: float = 0.5,
    market_ctx: MarketContext | None = None,
    config: ScoringConfig = DEFAULT_CONFIG,
) -> SignalWeight:
    """Compute the aggregation weight of a single document signal.

    The combined weight is::

        confidence_gate * recency * credibility
            * (1 + novelty_bonus) * market_ctx_multiplier

    with ``novelty_bonus = novelty_score * config.novelty_bonus_max``.  A
    document whose ``extraction_confidence`` is below
    ``config.confidence_floor`` has a gate of 0.0 and therefore a combined
    weight of 0.0; its other components are still reported.

    Args:
        published_at: Document publication time.
        reference_time: Aggregation anchor time.
        window: Trend window identifier.
        source_credibility: Source credibility score.
        novelty_score: Document novelty score.
        extraction_confidence: Extraction confidence reported by the model.
        market_ctx: Optional market context features for the symbol.
        config: Scoring parameters.

    Returns:
        A ``SignalWeight`` holding each component and the combined score.
    """
    if extraction_confidence >= config.confidence_floor:
        gate = 1.0
    else:
        gate = 0.0

    recency = recency_weight(published_at, reference_time, window, config)
    credibility = credibility_weight(source_credibility, config)
    novelty = novelty_score * config.novelty_bonus_max
    market_multiplier = market_context_multiplier(market_ctx, config)

    return SignalWeight(
        recency=recency,
        credibility=credibility,
        novelty_bonus=novelty,
        confidence_gate=gate,
        market_ctx_multiplier=market_multiplier,
        combined=gate * recency * credibility * (1.0 + novelty) * market_multiplier,
    )
# ---------------------------------------------------------------------------
# Batch helpers
# ---------------------------------------------------------------------------
@dataclass
class WeightedSignal:
    """One document's sentiment signal together with its computed weight.

    Attributes:
        document_id: Identifier of the source document.
        weight: Weight breakdown for the document.
        sentiment_value: Signed numeric sentiment: +1 positive, -1 negative,
            0 neutral or mixed.
        impact_score: Impact score of the document; the aggregation helpers
            multiply it into the combined weight.
    """

    document_id: str
    weight: SignalWeight
    sentiment_value: float
    impact_score: float
def sentiment_to_numeric(sentiment: str) -> float:
    """Convert a sentiment label into a signed number.

    Matching is case-insensitive.  "positive" maps to 1.0 and "negative" to
    -1.0; "neutral", "mixed" and any unrecognised label map to 0.0.
    """
    label = sentiment.lower()
    if label == "positive":
        return 1.0
    if label == "negative":
        return -1.0
    # Neutral, mixed and unknown labels carry no directional signal.
    return 0.0
def weighted_sentiment_average(signals: list[WeightedSignal]) -> float:
    """Return the weight-adjusted mean sentiment of ``signals``.

    Each signal contributes with an effective weight of
    ``weight.combined * impact_score``.  When every effective weight is
    non-negative and sentiments lie in [-1, 1], the result also lies in
    [-1, 1].

    Returns:
        The weighted mean, or 0.0 when the total effective weight is zero
        (including when ``signals`` is empty).
    """
    numerator = 0.0
    denominator = 0.0
    for signal in signals:
        effective = signal.weight.combined * signal.impact_score
        numerator += effective * signal.sentiment_value
        denominator += effective

    if denominator == 0.0:
        return 0.0
    return numerator / denominator
+650 -1
View File
@@ -1 +1,650 @@
"""Aggregation worker - rolling trend summaries, contradiction detection, evidence ranking.""" """Aggregation worker - company-level rolling window trend summaries.
Queries document intelligence and market context for a given ticker,
computes weighted signal scores, and produces TrendSummary objects
persisted to the trend_windows table.
Requirements: 6.1, 6.2, 6.5
"""
from __future__ import annotations
import json
import logging
import time
from dataclasses import dataclass
from datetime import datetime, timedelta, timezone
from typing import Any
import asyncpg
from services.aggregation.contradiction import CatalystEntry, detect_contradictions
from services.aggregation.evidence import (
EvidenceRankConfig,
RankedEvidence,
rank_evidence as _rank_evidence_composite,
rank_evidence_detailed,
)
from services.aggregation.market_context import fetch_market_context
from services.aggregation.scoring import (
ScoringConfig,
WeightedSignal,
compute_signal_weight,
sentiment_to_numeric,
weighted_sentiment_average,
)
from services.shared.schemas import TrendDirection, TrendSummary, TrendWindow
from services.shared.metrics import (
AGGREGATION_CONTRADICTION_SCORE,
AGGREGATION_DURATION,
AGGREGATION_SIGNALS_PROCESSED,
AGGREGATION_WINDOWS_COMPUTED,
)
logger = logging.getLogger(__name__)
# Map TrendWindow values to lookback durations.
# aggregate_company_window subtracts the duration from the reference time to
# obtain the window start, and falls back to 7 days for keys not listed here.
WINDOW_DURATIONS: dict[str, timedelta] = {
    TrendWindow.INTRADAY.value: timedelta(hours=12),
    TrendWindow.ONE_DAY.value: timedelta(days=1),
    TrendWindow.SEVEN_DAY.value: timedelta(days=7),
    TrendWindow.THIRTY_DAY.value: timedelta(days=30),
    TrendWindow.NINETY_DAY.value: timedelta(days=90),
}
# How many evidence document IDs to keep in supporting/opposing lists.
# Serves as the default for AggregationConfig.max_evidence and for the
# max_refs / max_evidence parameters of the ranking and assembly helpers.
MAX_EVIDENCE_REFS = 10
@dataclass
class AggregationConfig:
    """Selects which windows are aggregated and how signals are scored.

    Attributes:
        windows: Window names to compute.  ``None`` or an empty list means
            every ``TrendWindow`` value.
        scoring: Scoring parameters; a fresh ``ScoringConfig()`` is used when
            this is not set.
        max_evidence: Maximum number of evidence references kept per side.
    """

    windows: list[str] | None = None
    scoring: ScoringConfig | None = None
    max_evidence: int = MAX_EVIDENCE_REFS

    def effective_windows(self) -> list[str]:
        """Return the configured windows, or all ``TrendWindow`` values."""
        if not self.windows:
            return [member.value for member in TrendWindow]
        return self.windows

    def effective_scoring(self) -> ScoringConfig:
        """Return the configured scoring parameters, or the defaults."""
        if self.scoring:
            return self.scoring
        return ScoringConfig()
# ---------------------------------------------------------------------------
# Fetch impact records for a ticker within a time window
# ---------------------------------------------------------------------------
# Selects one row per document impact record for a ticker, joined with its
# document intelligence record and source document.
# Parameters: $1 = ticker, $2 / $3 = inclusive lower / upper bound on
# documents.published_at.  Only intelligence records whose validation_status
# is 'valid' and documents whose status is not 'rejected' are returned,
# newest first.
_IMPACT_QUERY = """
SELECT
di.document_id,
di.confidence,
di.novelty_score,
di.source_credibility,
dir.sentiment,
dir.impact_score,
dir.catalyst_type,
dir.key_facts,
dir.risks,
d.published_at
FROM document_impact_records dir
JOIN document_intelligence di ON di.id = dir.intelligence_id
JOIN documents d ON d.id = di.document_id
WHERE dir.ticker = $1
AND d.published_at >= $2
AND d.published_at <= $3
AND di.validation_status = 'valid'
AND d.status != 'rejected'
ORDER BY d.published_at DESC
"""
@dataclass
class ImpactRow:
    """Typed view of one row returned by the impact query.

    Attributes:
        document_id: Document identifier, as a string.
        confidence: Value of the ``confidence`` column.
        novelty_score: Value of the ``novelty_score`` column.
        source_credibility: Value of the ``source_credibility`` column.
        sentiment: Sentiment label of the impact record.
        impact_score: Impact score of the impact record.
        catalyst_type: Catalyst type label of the impact record.
        key_facts: Key facts listed on the impact record.
        risks: Risks listed on the impact record.
        published_at: Publication time of the source document.
    """

    # Document-level attributes.
    document_id: str
    confidence: float
    novelty_score: float
    source_credibility: float

    # Ticker-specific impact attributes.
    sentiment: str
    impact_score: float
    catalyst_type: str
    key_facts: list[str]
    risks: list[str]

    published_at: datetime
def _parse_impact_row(row: Any) -> ImpactRow:
    """Convert an asyncpg Record (or any mapping-like row) to an ImpactRow.

    Numeric columns fall back to their default only when the stored value is
    ``None``.  A stored ``0`` is kept as ``0.0``; coalescing with ``or``
    would turn a zero confidence into 0.5 and let the document pass the
    confidence gate.

    ``key_facts`` and ``risks`` may arrive as JSON text or as already-decoded
    lists; anything that is not a list after decoding becomes an empty list.

    Args:
        row: Row exposing the columns selected by the impact query via
            ``row[column]``.

    Returns:
        The parsed ``ImpactRow``.

    Raises:
        json.JSONDecodeError: If ``key_facts`` or ``risks`` is a string that
            is not valid JSON.
    """

    def _number(column: str, default: float) -> float:
        # Substitute the default for SQL NULL only, never for a real zero.
        value = row[column]
        return default if value is None else float(value)

    def _string_list(column: str) -> list[str]:
        # The column may hold JSON text or an already-decoded list.
        value = row[column]
        if isinstance(value, str):
            value = json.loads(value)
        return value if isinstance(value, list) else []

    return ImpactRow(
        document_id=str(row["document_id"]),
        confidence=_number("confidence", 0.5),
        novelty_score=_number("novelty_score", 0.5),
        source_credibility=_number("source_credibility", 0.5),
        # Empty or NULL labels fall back to a neutral default.
        sentiment=row["sentiment"] or "neutral",
        impact_score=_number("impact_score", 0.0),
        catalyst_type=row["catalyst_type"] or "other",
        key_facts=_string_list("key_facts"),
        risks=_string_list("risks"),
        published_at=row["published_at"],
    )
async def fetch_impact_records(
    pool: asyncpg.Pool,
    ticker: str,
    window_start: datetime,
    window_end: datetime,
) -> list[ImpactRow]:
    """Load and parse the impact rows for ``ticker`` in a time range.

    Runs the impact query with ``ticker``, ``window_start`` and
    ``window_end`` as its three parameters and parses every returned row.

    Returns:
        Parsed rows in the order returned by the query.
    """
    records = await pool.fetch(_IMPACT_QUERY, ticker, window_start, window_end)
    return list(map(_parse_impact_row, records))
# ---------------------------------------------------------------------------
# Build weighted signals from impact records
# ---------------------------------------------------------------------------
def build_weighted_signals(
    impacts: list[ImpactRow],
    reference_time: datetime,
    window: str,
    market_ctx: Any | None = None,
    config: ScoringConfig | None = None,
) -> list[WeightedSignal]:
    """Score every impact record and wrap it as a ``WeightedSignal``.

    Args:
        impacts: Parsed impact rows.
        reference_time: Anchor time passed to the scoring module.
        window: Trend window identifier passed to the scoring module.
        market_ctx: Optional market context applied to every signal.
        config: Scoring parameters; defaults to ``ScoringConfig()``.

    Returns:
        One ``WeightedSignal`` per impact row, in input order.
    """
    scoring = config if config else ScoringConfig()
    return [
        WeightedSignal(
            document_id=impact.document_id,
            weight=compute_signal_weight(
                published_at=impact.published_at,
                reference_time=reference_time,
                window=window,
                source_credibility=impact.source_credibility,
                novelty_score=impact.novelty_score,
                extraction_confidence=impact.confidence,
                market_ctx=market_ctx,
                config=scoring,
            ),
            sentiment_value=sentiment_to_numeric(impact.sentiment),
            impact_score=impact.impact_score,
        )
        for impact in impacts
    ]
# ---------------------------------------------------------------------------
# Derive trend direction from weighted sentiment
# ---------------------------------------------------------------------------
# Thresholds for mapping numeric sentiment to direction.
# An average sentiment at or above BULLISH_THRESHOLD maps to BULLISH and one
# at or below BEARISH_THRESHOLD maps to BEARISH; values in between are NEUTRAL.
BULLISH_THRESHOLD = 0.15
BEARISH_THRESHOLD = -0.15
# derive_trend_direction returns MIXED only when the contradiction score is
# above this value AND the absolute average sentiment is below 0.3.
MIXED_THRESHOLD = 0.10 # contradiction score above this → mixed
def derive_trend_direction(
    avg_sentiment: float,
    contradiction_score: float = 0.0,
) -> TrendDirection:
    """Map a weighted average sentiment to a ``TrendDirection``.

    MIXED is returned only when the contradiction score exceeds
    ``MIXED_THRESHOLD`` and the average sentiment is weak (absolute value
    below 0.3).  A strong average sentiment keeps its bullish or bearish
    direction even when contradiction is high.

    Args:
        avg_sentiment: Weighted average sentiment.
        contradiction_score: Contradiction score for the same signals.

    Returns:
        MIXED, BULLISH, BEARISH or NEUTRAL.
    """
    if contradiction_score > MIXED_THRESHOLD:
        # High disagreement only overrides a weak average sentiment.
        if abs(avg_sentiment) < 0.3:
            return TrendDirection.MIXED

    if avg_sentiment >= BULLISH_THRESHOLD:
        return TrendDirection.BULLISH
    return (
        TrendDirection.BEARISH
        if avg_sentiment <= BEARISH_THRESHOLD
        else TrendDirection.NEUTRAL
    )
# ---------------------------------------------------------------------------
# Compute contradiction score
# ---------------------------------------------------------------------------
def compute_contradiction_score(signals: list[WeightedSignal]) -> float:
    """Measure how much the weighted signals disagree on direction.

    Each signal has an effective weight of ``weight.combined * impact_score``.
    Weights are summed separately for positive and negative sentiment
    (signals with zero sentiment are ignored).  The score is the smaller
    side's total divided by the sum of both totals, rounded to four decimals.

    Returns:
        0.0 for full agreement (or no directional weight at all), rising to
        0.5 when both sides carry equal weight, provided the effective
        weights are non-negative.
    """
    if not signals:
        return 0.0

    bullish_total = 0.0
    bearish_total = 0.0
    for signal in signals:
        effective = signal.weight.combined * signal.impact_score
        polarity = signal.sentiment_value
        if polarity > 0:
            bullish_total += effective
        elif polarity < 0:
            bearish_total += effective

    directional_total = bullish_total + bearish_total
    if directional_total == 0.0:
        return 0.0
    return round(min(bullish_total, bearish_total) / directional_total, 4)
# ---------------------------------------------------------------------------
# Rank evidence (supporting vs opposing)
# ---------------------------------------------------------------------------
def rank_evidence(
    signals: list[WeightedSignal],
    max_refs: int = MAX_EVIDENCE_REFS,
) -> tuple[list[str], list[str]]:
    """Return the top supporting and opposing document IDs.

    Thin wrapper around the evidence module's ranking function: it builds an
    ``EvidenceRankConfig`` carrying ``max_refs`` and delegates the ranking.

    Args:
        signals: Weighted signals to rank.
        max_refs: Maximum number of references to request.

    Returns:
        A ``(supporting_ids, opposing_ids)`` pair as produced by the
        evidence module.
    """
    return _rank_evidence_composite(signals, EvidenceRankConfig(max_refs=max_refs))
# ---------------------------------------------------------------------------
# Extract dominant catalysts and material risks
# ---------------------------------------------------------------------------
def extract_catalysts_and_risks(
    impacts: list[ImpactRow],
    signals: list[WeightedSignal],
) -> tuple[list[str], list[str]]:
    """Pick the dominant catalyst types and the most material risks.

    A document's effective weight is ``weight.combined * impact_score`` of
    its signal.  Impacts with no signal or a non-positive weight are ignored.

    Returns:
        ``(catalysts, risks)``: up to five catalyst types ordered by
        cumulative weight, and up to five risk texts ordered by the weight of
        the document that raised them.  Risks are stripped of surrounding
        whitespace and de-duplicated case-insensitively.
    """
    effective_weight: dict[str, float] = {}
    for signal in signals:
        effective_weight[signal.document_id] = signal.weight.combined * signal.impact_score

    totals_by_catalyst: dict[str, float] = {}
    weighted_risks: list[tuple[float, str]] = []
    for impact in impacts:
        doc_weight = effective_weight.get(impact.document_id, 0.0)
        if doc_weight <= 0.0:
            continue
        totals_by_catalyst[impact.catalyst_type] = (
            totals_by_catalyst.get(impact.catalyst_type, 0.0) + doc_weight
        )
        weighted_risks.extend((doc_weight, text) for text in impact.risks)

    # Heaviest catalysts first; ties keep first-seen order (stable sort).
    top_catalysts = sorted(
        totals_by_catalyst, key=totals_by_catalyst.get, reverse=True
    )[:5]

    top_risks: list[str] = []
    seen: set[str] = set()
    for _, text in sorted(weighted_risks, key=lambda entry: entry[0], reverse=True):
        cleaned = text.strip()
        fingerprint = cleaned.lower()
        if fingerprint in seen:
            continue
        seen.add(fingerprint)
        top_risks.append(cleaned)
        if len(top_risks) >= 5:
            break

    return top_catalysts, top_risks
# ---------------------------------------------------------------------------
# Compute trend confidence
# ---------------------------------------------------------------------------
def compute_trend_confidence(
    signals: list[WeightedSignal],
    contradiction_score: float,
) -> float:
    """Derive an overall confidence for a trend summary.

    Only signals with a positive combined weight count.  The confidence is::

        0.4 * min(active_count / 20, 1) + 0.6 * mean(credibility weight)
            - 0.4 * contradiction_score

    clamped to [0, 1] and rounded to four decimals.

    NOTE: the second term averages ``weight.credibility`` (the source
    credibility weight); ``SignalWeight`` does not carry the raw extraction
    confidence.

    Returns:
        0.0 when there are no contributing signals, otherwise the clamped
        confidence.
    """
    if not signals:
        return 0.0

    contributing = [signal for signal in signals if signal.weight.combined > 0]
    if not contributing:
        return 0.0

    contributor_count = len(contributing)
    # The signal-count term grows linearly and saturates at 20 signals.
    volume_term = min(contributor_count / 20.0, 1.0)
    mean_credibility = (
        sum(signal.weight.credibility for signal in contributing) / contributor_count
    )

    raw = (0.4 * volume_term + 0.6 * mean_credibility) - contradiction_score * 0.4
    return round(max(0.0, min(1.0, raw)), 4)
# ---------------------------------------------------------------------------
# Assemble a TrendSummary from components
# ---------------------------------------------------------------------------
@dataclass
class AssembledTrend:
    """A trend summary bundled with the detailed evidence behind it.

    Attributes:
        summary: The assembled trend summary.
        supporting_evidence: Ranked supporting evidence, kept so it can be
            persisted alongside the summary.
        opposing_evidence: Ranked opposing evidence, kept for the same reason.
    """

    summary: TrendSummary
    supporting_evidence: list[RankedEvidence]
    opposing_evidence: list[RankedEvidence]
def assemble_trend_summary(
    ticker: str,
    window: str,
    signals: list[WeightedSignal],
    impacts: list[ImpactRow],
    market_ctx: Any | None = None,
    max_evidence: int = MAX_EVIDENCE_REFS,
    reference_time: datetime | None = None,
) -> TrendSummary:
    """Build a ``TrendSummary`` from weighted signals and impact records.

    Convenience wrapper over ``assemble_trend_with_evidence`` that forwards
    every argument unchanged and discards the detailed evidence rankings.
    """
    return assemble_trend_with_evidence(
        ticker=ticker,
        window=window,
        signals=signals,
        impacts=impacts,
        market_ctx=market_ctx,
        max_evidence=max_evidence,
        reference_time=reference_time,
    ).summary
def assemble_trend_with_evidence(
    ticker: str,
    window: str,
    signals: list[WeightedSignal],
    impacts: list[ImpactRow],
    market_ctx: Any | None = None,
    max_evidence: int = MAX_EVIDENCE_REFS,
    reference_time: datetime | None = None,
) -> AssembledTrend:
    """Assemble a ``TrendSummary`` plus the ranked evidence backing it.

    Args:
        ticker: Company ticker; stored as the summary's ``entity_id``.
        window: Trend window value; converted with ``TrendWindow(window)``.
        signals: Weighted signals for the window.
        impacts: Impact rows the signals were built from.
        market_ctx: Optional market context stored on the summary.
        max_evidence: Maximum evidence references per side.
        reference_time: Timestamp recorded as ``generated_at``; defaults to
            the current UTC time.

    Returns:
        The summary together with the detailed supporting and opposing
        evidence rankings.
    """
    generated_at = (
        reference_time if reference_time is not None else datetime.now(timezone.utc)
    )

    mean_sentiment = weighted_sentiment_average(signals)

    # Run full contradiction detection (Requirement 6.4).
    contradiction_result = detect_contradictions(
        signals,
        [
            CatalystEntry(document_id=impact.document_id, catalyst_type=impact.catalyst_type)
            for impact in impacts
        ],
    )
    contradiction = contradiction_result.score

    direction = derive_trend_direction(mean_sentiment, contradiction)
    confidence = compute_trend_confidence(signals, contradiction)

    # Detailed rankings are returned so the caller can persist them.
    supporting_ranked, opposing_ranked = rank_evidence_detailed(
        signals, EvidenceRankConfig(max_refs=max_evidence)
    )
    catalysts, risks = extract_catalysts_and_risks(impacts, signals)

    return AssembledTrend(
        summary=TrendSummary(
            entity_type="company",
            entity_id=ticker,
            window=TrendWindow(window),
            trend_direction=direction,
            # Strength is the magnitude of the weighted sentiment, capped at 1.
            trend_strength=round(min(abs(mean_sentiment), 1.0), 4),
            confidence=confidence,
            top_supporting_evidence=[item.document_id for item in supporting_ranked],
            top_opposing_evidence=[item.document_id for item in opposing_ranked],
            dominant_catalysts=catalysts,
            material_risks=risks,
            contradiction_score=contradiction,
            disagreement_details=contradiction_result.details,
            market_context=market_ctx,
            generated_at=generated_at,
        ),
        supporting_evidence=supporting_ranked,
        opposing_evidence=opposing_ranked,
    )
# ---------------------------------------------------------------------------
# Persist trend summary to PostgreSQL
# ---------------------------------------------------------------------------
# Parameterised INSERT used by persist_trend_summary; returns the new row id.
# NOTE(review): despite the "_UPSERT" name the statement has no ON CONFLICT
# clause, so every call adds a new trend_windows row.
# Placeholders $7-$10, $12 and $13 are cast to jsonb and receive JSON text.
_UPSERT_TREND = """
INSERT INTO trend_windows (
entity_type, entity_id, window, trend_direction, trend_strength,
confidence, top_supporting_evidence, top_opposing_evidence,
dominant_catalysts, material_risks, contradiction_score,
disagreement_details, market_context, generated_at
) VALUES (
$1, $2, $3, $4, $5,
$6, $7::jsonb, $8::jsonb,
$9::jsonb, $10::jsonb, $11,
$12::jsonb, $13::jsonb, $14
)
RETURNING id
"""
async def persist_trend_summary(
    pool: asyncpg.Pool,
    summary: TrendSummary,
) -> str:
    """Insert ``summary`` into ``trend_windows`` and return the new row id.

    List-valued fields, the disagreement details and the market context are
    serialised to JSON text for the ``jsonb`` columns.  A missing market
    context is stored as an empty JSON object.

    Returns:
        The ``id`` returned by the insert, converted to ``str``.
    """
    params = (
        summary.entity_type,
        summary.entity_id,
        summary.window.value,
        summary.trend_direction.value,
        summary.trend_strength,
        summary.confidence,
        json.dumps(summary.top_supporting_evidence),
        json.dumps(summary.top_opposing_evidence),
        json.dumps(summary.dominant_catalysts),
        json.dumps(summary.material_risks),
        summary.contradiction_score,
        json.dumps([detail.model_dump() for detail in summary.disagreement_details]),
        json.dumps(summary.market_context.model_dump() if summary.market_context else {}),
        summary.generated_at,
    )
    inserted = await pool.fetchrow(_UPSERT_TREND, *params)
    return str(inserted["id"])
# ---------------------------------------------------------------------------
# Persist evidence mappings to trend_evidence table
# ---------------------------------------------------------------------------
# Parameterised INSERT executed once per ranked evidence item by
# persist_trend_evidence (via executemany).  $2 is cast to uuid; $3 is the
# evidence_type label ("supporting" or "opposing").
_INSERT_EVIDENCE = """
INSERT INTO trend_evidence (
trend_window_id, document_id, evidence_type,
rank_score, weight_component, impact_component,
recency_component, confidence_component, sentiment_value
) VALUES (
$1, $2::uuid, $3,
$4, $5, $6,
$7, $8, $9
)
"""
async def persist_trend_evidence(
    pool: asyncpg.Pool,
    trend_window_id: str,
    supporting: list[RankedEvidence],
    opposing: list[RankedEvidence],
) -> int:
    """Insert the evidence rows that back a trend window.

    Supporting items are written first, then opposing items, each tagged
    with its evidence type.  Nothing is sent to the database when both
    lists are empty.

    Returns:
        The number of rows submitted for insertion.
    """
    labelled_groups = (("supporting", supporting), ("opposing", opposing))
    params: list[tuple[str, str, str, float, float, float, float, float, float]] = [
        (
            trend_window_id,
            item.document_id,
            label,
            item.rank_score,
            item.weight_component,
            item.impact_component,
            item.recency_component,
            item.confidence_component,
            item.sentiment_value,
        )
        for label, group in labelled_groups
        for item in group
    ]

    if not params:
        return 0
    await pool.executemany(_INSERT_EVIDENCE, params)
    return len(params)
# ---------------------------------------------------------------------------
# Main aggregation entry point for a single ticker + window
# ---------------------------------------------------------------------------
async def aggregate_company_window(
    pool: asyncpg.Pool,
    ticker: str,
    window: str,
    reference_time: datetime | None = None,
    config: AggregationConfig | None = None,
) -> TrendSummary:
    """Compute and persist the trend summary for one ticker and one window.

    Steps:
        1. Resolve the window's time range (unknown windows use 7 days).
        2. Fetch the document impact records for that range.
        3. Fetch the market context for the ticker.
        4. Build weighted signals with the scoring module.
        5. Assemble the summary and its ranked evidence.
        6. Persist the summary, then its evidence rows.
        7. Log the outcome and record Prometheus metrics.

    NOTE(review): ``has_data`` is read on the fetched market context without
    a ``None`` check, so ``fetch_market_context`` is assumed to always return
    an object; confirm against that helper.

    Args:
        pool: Database connection pool.
        ticker: Company ticker.
        window: Trend window value.
        reference_time: End of the window; defaults to the current UTC time.
        config: Aggregation settings; defaults to ``AggregationConfig()``.

    Returns:
        The assembled (and persisted) ``TrendSummary``.
    """
    cfg = config if config else AggregationConfig()
    scoring_cfg = cfg.effective_scoring()
    anchor = reference_time if reference_time is not None else datetime.now(timezone.utc)
    started = time.monotonic()

    lookback = WINDOW_DURATIONS.get(window, timedelta(days=7))

    # Inputs: impact records first, then market context.
    impacts = await fetch_impact_records(pool, ticker, anchor - lookback, anchor)
    market_ctx = await fetch_market_context(pool, ticker, window, anchor)

    signals = build_weighted_signals(impacts, anchor, window, market_ctx, scoring_cfg)

    # An empty market context is not stored on the summary.
    assembled = assemble_trend_with_evidence(
        ticker=ticker,
        window=window,
        signals=signals,
        impacts=impacts,
        market_ctx=market_ctx if market_ctx.has_data else None,
        max_evidence=cfg.max_evidence,
        reference_time=anchor,
    )
    summary = assembled.summary

    # The trend row must exist before its evidence rows can reference it.
    trend_id = await persist_trend_summary(pool, summary)
    evidence_count = await persist_trend_evidence(
        pool,
        trend_id,
        assembled.supporting_evidence,
        assembled.opposing_evidence,
    )

    logger.info(
        "Persisted trend %s for %s/%s: direction=%s strength=%.3f confidence=%.3f signals=%d evidence=%d",
        trend_id,
        ticker,
        window,
        summary.trend_direction.value,
        summary.trend_strength,
        summary.confidence,
        len(signals),
        evidence_count,
    )

    # Prometheus metrics
    AGGREGATION_WINDOWS_COMPUTED.labels(window=window).inc()
    AGGREGATION_SIGNALS_PROCESSED.labels(window=window).inc(len(signals))
    AGGREGATION_CONTRADICTION_SCORE.observe(summary.contradiction_score)
    AGGREGATION_DURATION.labels(window=window).observe(time.monotonic() - started)

    return summary
# ---------------------------------------------------------------------------
# Aggregate all windows for a single ticker
# ---------------------------------------------------------------------------
async def aggregate_company(
    pool: asyncpg.Pool,
    ticker: str,
    reference_time: datetime | None = None,
    config: AggregationConfig | None = None,
) -> list[TrendSummary]:
    """Compute trend summaries for every configured window of a ticker.

    Windows are processed one after another, all against the same reference
    time (the current UTC time when none is given).

    Returns:
        One ``TrendSummary`` per window, in configuration order.
    """
    cfg = config if config else AggregationConfig()
    anchor = reference_time if reference_time is not None else datetime.now(timezone.utc)
    return [
        await aggregate_company_window(pool, ticker, window, anchor, cfg)
        for window in cfg.effective_windows()
    ]
+1507 -1
View File
File diff suppressed because it is too large Load Diff
+268
View File
@@ -0,0 +1,268 @@
"""Ollama client wrapper using structured output format.
Sends documents to a local Ollama instance via the /api/chat endpoint
with the ``format`` parameter set to the extraction JSON schema, ensuring
the model returns schema-compliant JSON.
Includes retry logic for invalid or incomplete model responses with
exponential backoff, error classification, and full audit preservation.
Requirements: 5.1, 5.2, 5.4
"""
from __future__ import annotations
import asyncio
import json
import logging
import time
from dataclasses import dataclass, field
import httpx
from services.extractor.prompts import (
build_extraction_prompt,
get_json_schema,
get_prompt_metadata,
)
from services.extractor.schemas import ExtractionResult, ValidationReport, validate_extraction
from services.shared.config import OllamaConfig
logger = logging.getLogger("ollama_client")
# Errors that should NOT be retried — the request itself is bad.
# Entries use the "http_<status>" form that _call_ollama stores in
# ExtractionAttempt.error when the server answers with an error status.
_NON_RETRYABLE_ERRORS = frozenset({
    "http_400",
    "http_401",
    "http_403",
    "http_404",
    "http_422",
})
def _is_retryable(error: str | None) -> bool:
    """Tell whether an extraction error is worth another attempt.

    ``None`` (no error) is never retryable.  Any other error is retryable
    unless it is listed in ``_NON_RETRYABLE_ERRORS``.
    """
    return error is not None and error not in _NON_RETRYABLE_ERRORS
@dataclass
class ExtractionAttempt:
    """Audit record of one call to the model.

    Attributes:
        raw_output: Text returned by the model (or the raw response body when
            the envelope could not be decoded).
        validation: Schema validation report, when validation ran.
        error: Error description, or ``None`` when the attempt had no error.
        duration_ms: Duration of the HTTP call in milliseconds.
        model: Name of the model that was called.
        retryable: Whether the failure may be retried.
    """

    raw_output: str = ""
    validation: ValidationReport | None = None
    error: str | None = None
    duration_ms: int = 0
    model: str = ""
    retryable: bool = True
@dataclass
class ExtractionResponse:
    """Outcome of an extraction call, including every attempt made.

    Attributes:
        success: True when an attempt produced a valid extraction.
        result: The parsed extraction from the successful attempt.
        attempts: All attempts, in the order they were made.
        prompt_metadata: Metadata describing the prompt that was used.
        model: Name of the model that was called.
        total_duration_ms: Wall-clock duration of the whole call, including
            waits between retries, in milliseconds.
    """

    success: bool = False
    result: ExtractionResult | None = None
    attempts: list[ExtractionAttempt] = field(default_factory=list)
    prompt_metadata: dict[str, str] = field(default_factory=dict)
    model: str = ""
    total_duration_ms: int = 0
def _compute_backoff(
attempt_num: int,
base_delay: float,
max_delay: float,
multiplier: float,
) -> float:
"""Compute exponential backoff delay for a given attempt number."""
delay = base_delay * (multiplier ** attempt_num)
return min(delay, max_delay)
class OllamaClient:
    """Async client for Ollama structured extraction.

    Posts documents to the ``/api/chat`` endpoint with the extraction JSON
    schema as the ``format`` parameter.  Failed attempts are retried with
    exponential backoff, and every attempt is kept on the returned
    ``ExtractionResponse`` for audit.

    Usage::

        config = OllamaConfig(base_url="http://localhost:11434", model="llama3.1:8b")
        client = OllamaClient(config)
        response = await client.extract(
            document_text="Apple reported record earnings...",
            document_type="article",
            document_id="abc-123",
        )
        if response.success:
            print(response.result)
    """

    _config: OllamaConfig
    _max_retries: int
    _base_delay: float
    _max_delay: float
    _backoff_multiplier: float
    _owns_client: bool
    _http: httpx.AsyncClient

    def __init__(
        self,
        config: OllamaConfig,
        max_retries: int | None = None,
        http_client: httpx.AsyncClient | None = None,
    ) -> None:
        """Create a client.

        Args:
            config: Ollama connection, model and retry settings.
            max_retries: Overrides ``config.max_retries`` when given.
            http_client: Existing HTTP client to use.  When omitted, a new
                one is created with ``config.timeout`` and is closed by
                ``close()``.
        """
        self._config = config
        self._max_retries = max_retries if max_retries is not None else config.max_retries
        self._base_delay = config.retry_base_delay
        self._max_delay = config.retry_max_delay
        self._backoff_multiplier = config.retry_backoff_multiplier
        # Only a client created here is closed by close(); an injected one
        # remains the caller's responsibility.
        self._owns_client = http_client is None
        self._http = http_client or httpx.AsyncClient(timeout=config.timeout)

    async def close(self) -> None:
        """Close the underlying HTTP client if we own it."""
        if self._owns_client:
            await self._http.aclose()

    async def extract(
        self,
        document_text: str,
        document_type: str = "article",
        document_id: str = "",
        known_tickers: list[str] | None = None,
    ) -> ExtractionResponse:
        """Send a document to Ollama for structured intelligence extraction.

        Makes up to ``max_retries + 1`` attempts.  A failed attempt is
        followed by an exponential-backoff wait, unless its error is
        non-retryable (e.g. HTTP 400), which stops immediately.  Every
        attempt and its validation result is preserved for audit.

        Args:
            document_text: Normalized text content of the document.
            document_type: One of article, filing, transcript, press_release.
            document_id: Optional document ID for traceability.
            known_tickers: Optional ticker hints for the model.

        Returns:
            An ``ExtractionResponse`` with the parsed result on success.
        """
        prompts = build_extraction_prompt(
            document_text=document_text,
            document_type=document_type,
            document_id=document_id,
            known_tickers=known_tickers,
        )
        json_schema = get_json_schema()
        prompt_meta = get_prompt_metadata()

        response = ExtractionResponse(
            prompt_metadata=prompt_meta,
            model=self._config.model,
        )
        total_start = time.monotonic()

        for attempt_num in range(self._max_retries + 1):
            attempt = await self._call_ollama(prompts, json_schema, document_text)
            response.attempts.append(attempt)

            if attempt.error is None and attempt.validation and attempt.validation.valid:
                response.success = True
                response.result = attempt.validation.parsed
                break

            # Check if the error is non-retryable — stop immediately
            if not _is_retryable(attempt.error):
                attempt.retryable = False
                logger.warning(
                    "Non-retryable error for doc %s: %s — stopping retries",
                    document_id or "unknown",
                    attempt.error,
                )
                break

            # No wait after the final attempt.
            if attempt_num < self._max_retries:
                delay = _compute_backoff(
                    attempt_num,
                    self._base_delay,
                    self._max_delay,
                    self._backoff_multiplier,
                )
                logger.warning(
                    "Extraction attempt %d/%d failed for doc %s: %s — retrying in %.1fs",
                    attempt_num + 1,
                    self._max_retries + 1,
                    document_id or "unknown",
                    attempt.error or "validation failed",
                    delay,
                )
                await asyncio.sleep(delay)

        response.total_duration_ms = int((time.monotonic() - total_start) * 1000)
        return response

    async def _call_ollama(
        self,
        prompts: dict[str, str],
        json_schema: dict[str, object],
        document_text: str = "",
    ) -> ExtractionAttempt:
        """Make a single call to the Ollama /api/chat endpoint.

        Transport, HTTP-status, envelope and validation failures are recorded
        on the returned attempt's ``error`` field rather than raised.

        Args:
            prompts: Mapping with the ``"system"`` and ``"user"`` prompts.
            json_schema: Schema sent as the ``format`` parameter.
            document_text: Source text passed on to validation.

        Returns:
            The attempt record; ``error`` is ``None`` only when the model
            output validated successfully.
        """
        attempt = ExtractionAttempt(model=self._config.model)
        start = time.monotonic()

        payload = {
            "model": self._config.model,
            "messages": [
                {"role": "system", "content": prompts["system"]},
                {"role": "user", "content": prompts["user"]},
            ],
            "format": json_schema,
            "stream": False,
        }

        try:
            resp = await self._http.post(
                f"{self._config.base_url}/api/chat",
                json=payload,
            )
            _ = resp.raise_for_status()
        except httpx.TimeoutException:
            attempt.error = "timeout"
            attempt.duration_ms = int((time.monotonic() - start) * 1000)
            return attempt
        except httpx.HTTPStatusError as exc:
            attempt.error = f"http_{exc.response.status_code}"
            attempt.retryable = _is_retryable(attempt.error)
            attempt.duration_ms = int((time.monotonic() - start) * 1000)
            return attempt
        except httpx.HTTPError as exc:
            attempt.error = f"connection_error: {exc}"
            attempt.duration_ms = int((time.monotonic() - start) * 1000)
            return attempt

        attempt.duration_ms = int((time.monotonic() - start) * 1000)

        # Parse the Ollama response envelope.  ValueError covers both
        # json.JSONDecodeError and UnicodeDecodeError.
        try:
            body = resp.json()
        except ValueError:
            attempt.error = "invalid_response_json"
            attempt.raw_output = resp.text
            return attempt

        # Valid JSON that is not an object (e.g. null or a list) is also a
        # malformed envelope; record it instead of failing on ``body.get``.
        if not isinstance(body, dict):
            attempt.error = "invalid_response_json"
            attempt.raw_output = resp.text
            return attempt

        msg = body.get("message")
        content: str = msg.get("content", "") if isinstance(msg, dict) else ""
        attempt.raw_output = content

        if not content:
            attempt.error = "empty_model_response"
            return attempt

        # Validate against extraction schema
        attempt.validation = validate_extraction(content, document_text=document_text)
        if not attempt.validation.valid:
            attempt.error = "; ".join(attempt.validation.errors)

        return attempt
+72
View File
@@ -0,0 +1,72 @@
"""Extractor worker entrypoint - polls Redis for extraction jobs."""
from __future__ import annotations
import asyncio
import logging
import asyncpg
from minio import Minio
from services.extractor.client import OllamaClient
from services.extractor.worker import persist_extraction
from services.shared.config import load_config
from services.shared.logging import setup_logging
from services.shared.redis_keys import QUEUE_EXTRACTION, queue_key
logger = logging.getLogger("extractor_main")
async def main() -> None:
    """Run the extractor worker loop.

    Loads configuration, creates the PostgreSQL pool, the MinIO, Ollama and
    Redis clients, then pops jobs from the extraction queue forever.  Each
    job is expected to be a JSON object with ``document_id``, ``ticker`` and
    ``text`` keys.  When the queue is empty the loop sleeps for one second.

    A job that cannot be decoded, is not a JSON object, or fails during
    extraction/persistence is logged and dropped; it never stops the loop.
    The pool and the Redis connection are closed when the loop exits.
    """
    config = load_config()
    setup_logging("extractor", level=config.log_level, json_output=config.json_logs)

    pool = await asyncpg.create_pool(dsn=config.postgres.dsn, min_size=2, max_size=8)
    minio_client = Minio(
        config.minio.endpoint,
        access_key=config.minio.access_key,
        secret_key=config.minio.secret_key,
        secure=config.minio.secure,
    )
    ollama = OllamaClient(config.ollama)

    import json

    import redis.asyncio as aioredis

    redis_client = aioredis.from_url(config.redis.url)
    queue = queue_key(QUEUE_EXTRACTION)
    logger.info("Extractor worker started, polling %s", queue)

    try:
        while True:
            raw = await redis_client.lpop(queue)
            if raw is None:
                await asyncio.sleep(1)
                continue

            # Decode defensively: the entry has already been popped, so an
            # exception here would otherwise terminate the whole worker.
            try:
                job = json.loads(raw)
            except ValueError:  # JSONDecodeError and UnicodeDecodeError
                logger.exception("Discarding malformed extraction job: %r", raw[:200])
                continue
            if not isinstance(job, dict):
                logger.error("Discarding extraction job that is not a JSON object: %r", raw[:200])
                continue

            document_id = job.get("document_id", "")
            ticker = job.get("ticker", "")
            text = job.get("text", "")
            logger.info("Processing extraction job for doc %s / %s", document_id, ticker)
            try:
                extraction_response = await ollama.extract(text)
                await persist_extraction(
                    pool=pool,
                    minio_client=minio_client,
                    document_id=document_id,
                    ticker=ticker,
                    extraction_response=extraction_response,
                    document_text_length=len(text),
                )
            except Exception:
                # Top-level boundary: log with traceback and move to the next job.
                logger.exception("Extraction failed for doc %s", document_id)
    finally:
        # Close Redis even if closing the pool raises.
        try:
            await pool.close()
        finally:
            await redis_client.close()


if __name__ == "__main__":
    asyncio.run(main())
+250
View File
@@ -0,0 +1,250 @@
"""Model performance metrics collection and persistence.
Tracks extraction success/failure rates, latency percentiles, retry counts,
validation error distributions, confidence scores, and token usage estimates.
Metrics are persisted to PostgreSQL for operational dashboards and published
to the analytical lake for Trino/Superset queries.
Requirements: 5.2, 5.4, 12.1, 12.2
"""
from __future__ import annotations
import json
import logging
from dataclasses import dataclass, field
from datetime import datetime, timezone
import asyncpg
from services.extractor.client import ExtractionResponse
logger = logging.getLogger("extractor_metrics")
# Rough token estimate: ~4 chars per token for English text
_CHARS_PER_TOKEN = 4
@dataclass
class ExtractionMetrics:
    """Flat record of the measurements captured for one extraction run.

    Every field has a default, so an instance can be created empty;
    ``collect_metrics`` builds fully populated instances.
    """

    # --- What was extracted, and with which model/prompt/schema ---
    document_id: str = ""
    ticker: str = ""
    model_name: str = ""
    prompt_version: str = ""
    schema_version: str = ""

    # --- Outcome and timing (milliseconds) ---
    success: bool = False
    attempt_count: int = 0
    total_duration_ms: int = 0
    first_attempt_duration_ms: int = 0
    final_attempt_duration_ms: int = 0

    # --- Quality signals ---
    confidence: float = 0.0
    validation_status: str = "unknown"
    validation_error_count: int = 0
    validation_warning_count: int = 0
    validation_errors: list[str] = field(default_factory=list)
    retry_count: int = 0

    # --- Size estimates ---
    input_token_estimate: int = 0
    output_token_estimate: int = 0
    company_count: int = 0

    # Timezone-aware (UTC) creation time of the record.
    recorded_at: datetime = field(
        default_factory=lambda: datetime.now(tz=timezone.utc),
    )
def collect_metrics(
    extraction_response: ExtractionResponse,
    *,
    document_id: str = "",
    ticker: str = "",
    document_text_length: int = 0,
) -> ExtractionMetrics:
    """Derive an ``ExtractionMetrics`` record from one extraction response.

    Validation details and the output-token estimate are taken from the last
    attempt only.  Token counts are rough estimates obtained by dividing
    character counts by ``_CHARS_PER_TOKEN``.

    Args:
        extraction_response: The response whose attempts, result and prompt
            metadata are summarised.
        document_id: Identifier of the source document.
        ticker: Primary ticker symbol.
        document_text_length: Length of the input text in characters.

    Returns:
        A populated ``ExtractionMetrics`` instance.
    """
    attempts = extraction_response.attempts
    last_attempt = attempts[-1] if attempts else None

    # Durations of the first and last attempt (0 when nothing was attempted).
    if attempts:
        first_ms = attempts[0].duration_ms
        last_ms = attempts[-1].duration_ms
    else:
        first_ms = 0
        last_ms = 0

    # Validation findings of the last attempt, when it produced a report.
    errors: list[str] = []
    warnings: list[str] = []
    if last_attempt and last_attempt.validation:
        errors = last_attempt.validation.errors
        warnings = last_attempt.validation.warnings

    # "unknown" means no attempt was made at all.
    if extraction_response.success:
        status = "valid"
    else:
        status = "failed" if attempts else "unknown"

    # Confidence and company count exist only when a result was produced.
    extracted = extraction_response.result
    confidence = extracted.confidence if extracted else 0.0
    company_count = len(extracted.companies) if extracted else 0

    # Character-based token estimates.
    input_tokens = 0
    if document_text_length > 0:
        input_tokens = document_text_length // _CHARS_PER_TOKEN
    output_tokens = 0
    if last_attempt and last_attempt.raw_output:
        output_tokens = len(last_attempt.raw_output) // _CHARS_PER_TOKEN

    prompt_meta = extraction_response.prompt_metadata
    return ExtractionMetrics(
        document_id=document_id,
        ticker=ticker,
        model_name=extraction_response.model,
        prompt_version=prompt_meta.get("prompt_version", ""),
        schema_version=prompt_meta.get("schema_version", ""),
        success=extraction_response.success,
        attempt_count=len(attempts),
        total_duration_ms=extraction_response.total_duration_ms,
        first_attempt_duration_ms=first_ms,
        final_attempt_duration_ms=last_ms,
        confidence=confidence,
        validation_status=status,
        validation_error_count=len(errors),
        validation_warning_count=len(warnings),
        validation_errors=errors,
        retry_count=max(0, len(attempts) - 1),
        input_token_estimate=input_tokens,
        output_token_estimate=output_tokens,
        company_count=company_count,
    )
async def persist_metrics(
    pool: asyncpg.Pool,
    metrics: ExtractionMetrics,
) -> str:
    """Insert one metrics record into ``model_performance_metrics``.

    Args:
        pool: PostgreSQL connection pool used for the insert.
        metrics: Collected metrics from an extraction run.

    Returns:
        The ``id`` returned by the insert, converted to ``str``.
    """
    # ``$1`` is cast to ``uuid``.  ``ExtractionMetrics.document_id`` defaults
    # to "", which cannot be encoded as a UUID, so an absent id is sent as
    # NULL instead of failing the whole insert on argument encoding.
    document_id = metrics.document_id or None

    row_id = await pool.fetchval(
        """INSERT INTO model_performance_metrics
           (document_id, ticker, model_name, prompt_version, schema_version,
            success, attempt_count, total_duration_ms,
            first_attempt_duration_ms, final_attempt_duration_ms,
            confidence, validation_status, validation_error_count,
            validation_warning_count, validation_errors, retry_count,
            input_token_estimate, output_token_estimate, company_count,
            recorded_at)
           VALUES ($1::uuid, $2, $3, $4, $5, $6, $7, $8, $9, $10,
                   $11, $12, $13, $14, $15::jsonb, $16, $17, $18, $19, $20)
           RETURNING id""",
        document_id,
        metrics.ticker,
        metrics.model_name,
        metrics.prompt_version,
        metrics.schema_version,
        metrics.success,
        metrics.attempt_count,
        metrics.total_duration_ms,
        metrics.first_attempt_duration_ms,
        metrics.final_attempt_duration_ms,
        metrics.confidence,
        metrics.validation_status,
        metrics.validation_error_count,
        metrics.validation_warning_count,
        # The jsonb parameter is passed as JSON text.
        json.dumps(metrics.validation_errors),
        metrics.retry_count,
        metrics.input_token_estimate,
        metrics.output_token_estimate,
        metrics.company_count,
        metrics.recorded_at,
    )
    logger.info(
        "Persisted extraction metrics %s for doc %s: success=%s duration=%dms retries=%d",
        row_id, metrics.document_id, metrics.success,
        metrics.total_duration_ms, metrics.retry_count,
    )
    return str(row_id)
async def get_model_performance_summary(
    pool: asyncpg.Pool,
    *,
    model_name: str | None = None,
    hours: int = 24,
) -> dict[str, object]:
    """Aggregate ``model_performance_metrics`` rows over a lookback window.

    Args:
        pool: PostgreSQL connection pool.
        model_name: When given, restrict the aggregation to this model.
        hours: Lookback window in hours (default 24).

    Returns:
        A dict with counts, success rate, latency average and percentiles,
        retry/confidence/validation averages, token totals and the
        requested ``hours``.  When no rows match, only
        ``{"total_extractions": 0, "success_rate": 0.0}`` is returned.
    """
    # ``$1`` is always the window; ``$2`` exists only with a model filter.
    query_args: list[object] = [hours]
    model_filter = ""
    if model_name:
        model_filter = "AND model_name = $2"
        query_args.append(model_name)

    row = await pool.fetchrow(
        f"""SELECT
            COUNT(*) AS total_extractions,
            COUNT(*) FILTER (WHERE success) AS successful,
            COUNT(*) FILTER (WHERE NOT success) AS failed,
            ROUND(AVG(total_duration_ms)::numeric, 1) AS avg_duration_ms,
            ROUND(PERCENTILE_CONT(0.5) WITHIN GROUP (ORDER BY total_duration_ms)::numeric, 1) AS p50_duration_ms,
            ROUND(PERCENTILE_CONT(0.95) WITHIN GROUP (ORDER BY total_duration_ms)::numeric, 1) AS p95_duration_ms,
            ROUND(PERCENTILE_CONT(0.99) WITHIN GROUP (ORDER BY total_duration_ms)::numeric, 1) AS p99_duration_ms,
            ROUND(AVG(retry_count)::numeric, 2) AS avg_retries,
            ROUND(AVG(confidence)::numeric, 3) AS avg_confidence,
            SUM(input_token_estimate) AS total_input_tokens,
            SUM(output_token_estimate) AS total_output_tokens,
            ROUND(AVG(company_count)::numeric, 2) AS avg_companies_per_doc,
            ROUND(AVG(validation_error_count)::numeric, 2) AS avg_validation_errors,
            ROUND(AVG(validation_warning_count)::numeric, 2) AS avg_validation_warnings
            FROM model_performance_metrics
            WHERE recorded_at >= NOW() - INTERVAL '1 hour' * $1
            {model_filter}""",
        *query_args,
    )

    if not row or row["total_extractions"] == 0:
        return {"total_extractions": 0, "success_rate": 0.0}

    total = row["total_extractions"]
    successful = row["successful"]
    summary: dict[str, object] = {
        "total_extractions": total,
        "successful": successful,
        "failed": row["failed"],
        "success_rate": round(successful / total, 4) if total > 0 else 0.0,
    }
    # Aggregates may be NULL; ``or 0`` maps those to zero before coercion.
    for key in (
        "avg_duration_ms",
        "p50_duration_ms",
        "p95_duration_ms",
        "p99_duration_ms",
        "avg_retries",
        "avg_confidence",
    ):
        summary[key] = float(row[key] or 0)
    for key in ("total_input_tokens", "total_output_tokens"):
        summary[key] = int(row[key] or 0)
    for key in (
        "avg_companies_per_doc",
        "avg_validation_errors",
        "avg_validation_warnings",
    ):
        summary[key] = float(row[key] or 0)
    summary["hours"] = hours
    return summary
+149
View File
@@ -0,0 +1,149 @@
"""Extraction prompt templates with anti-hallucination instructions.
Builds structured prompts for Ollama document intelligence extraction.
Each prompt includes the target JSON schema, anti-hallucination rules,
and document-type-specific guidance.
Requirements: 5.1, 5.2, 5.3, 5.4, 5.5
"""
from __future__ import annotations
import json
from typing import Any
from services.extractor.schemas import generate_json_schema, SCHEMA_VERSION
from services.shared.schemas import (
DocumentType,
)
PROMPT_VERSION = "document-intel-v1"
# --- JSON schema for structured output (generated from Pydantic models) ---
EXTRACTION_JSON_SCHEMA: dict[str, Any] = generate_json_schema()
# --- Anti-hallucination system prompt ---
# Returned unchanged as the "system" entry by build_extraction_prompt().
# This text is sent to the model, so editing it changes extraction behaviour.
# A trailing backslash inside the literal joins the next source line onto the
# same prompt line.
SYSTEM_PROMPT = """\
You are a financial document analysis system. You extract structured intelligence \
from financial documents into JSON.
STRICT RULES — VIOLATIONS WILL INVALIDATE YOUR OUTPUT:
1. ONLY extract information explicitly stated in the document text provided.
2. NEVER fabricate facts, quotes, numbers, dates, or company names.
3. NEVER infer information that is not directly supported by the text.
4. If the document does not mention a company, do NOT include that company.
5. If the document is ambiguous about sentiment or impact, use "neutral" or "mixed" \
and set confidence lower.
6. evidence_spans MUST be short verbatim quotes copied from the document. \
Do NOT paraphrase or invent quotes.
7. key_facts MUST be directly stated in the document. Do NOT add external knowledge.
8. If you are uncertain about any field, lower the confidence score and add a warning \
to extraction_warnings.
9. If the document text is too short, garbled, or uninformative, return an empty \
companies array, set confidence below 0.3, and add "insufficient_content" to warnings.
10. Return ONLY valid JSON matching the provided schema. No commentary, no markdown fences."""
# --- Document-type-specific guidance ---
# One guidance paragraph per DocumentType member; _get_doctype_guidance()
# looks entries up by document type and falls back to the ARTICLE entry.
# NOTE(review): keys are DocumentType members while lookups pass a str,
# so this presumably relies on DocumentType being a str-valued enum. Confirm.
_DOCTYPE_GUIDANCE: dict[str, str] = {
    DocumentType.ARTICLE: (
        "This is a news article. Focus on reported facts, quoted sources, and stated "
        "analyst opinions. Distinguish between the journalist's framing and actual "
        "company developments. Do not treat speculative language as confirmed fact."
    ),
    DocumentType.FILING: (
        "This is a regulatory filing (e.g. SEC 10-K, 10-Q, 8-K). Extract concrete "
        "financial figures, risk factors, and material events as stated. Filings use "
        "precise legal language — preserve that precision in your extraction."
    ),
    DocumentType.TRANSCRIPT: (
        "This is an earnings call or event transcript. Distinguish between management "
        "forward-looking statements and reported results. Flag forward-looking language "
        "as lower confidence. Extract specific guidance numbers when stated."
    ),
    DocumentType.PRESS_RELEASE: (
        "This is a company press release. Be aware that press releases are promotional. "
        "Extract stated facts and figures but note that sentiment may be biased positive. "
        "Look for concrete metrics rather than marketing language."
    ),
}
def _get_doctype_guidance(document_type: str) -> str:
    """Look up the guidance paragraph for ``document_type``.

    Unrecognised document types receive the article guidance.
    """
    fallback = _DOCTYPE_GUIDANCE[DocumentType.ARTICLE]
    return _DOCTYPE_GUIDANCE.get(document_type, fallback)
# --- Prompt builder ---
def build_extraction_prompt(
    document_text: str,
    document_type: str = DocumentType.ARTICLE,
    known_tickers: list[str] | None = None,
    document_id: str = "",
) -> dict[str, str]:
    """Assemble the system and user prompts for one extraction request.

    The user prompt embeds the document-type guidance, an optional ticker
    hint, the extraction JSON schema (pretty-printed) and the document text
    between explicit start/end markers.

    Args:
        document_text: Normalized text content of the document.
        document_type: Document type used to select the guidance paragraph.
        known_tickers: Optional tickers the document may reference.  They
            are offered to the model as a hint only.
        document_id: Optional document ID included for traceability.

    Returns:
        Dict with ``"system"`` and ``"user"`` prompt strings.
    """
    # Optional fragments collapse to "" when their input is absent.
    if known_tickers:
        tickers_str = ", ".join(known_tickers)
        ticker_hint = (
            f"\nThe following tickers may be referenced in this document: {tickers_str}\n"
            "Only include a ticker in your output if the document actually discusses that company. "
            "Do NOT include a ticker just because it appears in this hint."
        )
    else:
        ticker_hint = ""

    if document_id:
        doc_id_line = f"Document ID: {document_id}\n"
    else:
        doc_id_line = ""

    doctype_guidance = _get_doctype_guidance(document_type)
    schema_str = json.dumps(EXTRACTION_JSON_SCHEMA, indent=2)

    # The template lines are deliberately flush-left: they are prompt text.
    user_prompt = f"""\
Extract structured intelligence from the following document.
{doc_id_line}Document type: {document_type}
{doctype_guidance}
{ticker_hint}
Your output MUST be a single JSON object conforming to this schema:
{schema_str}
REMEMBER:
- Only extract what is explicitly in the text below.
- evidence_spans must be verbatim quotes from the text.
- If the text is insufficient, return empty companies and low confidence.
- Return ONLY the JSON object. No other text.
--- DOCUMENT TEXT ---
{document_text}
--- END DOCUMENT TEXT ---"""

    return dict(system=SYSTEM_PROMPT, user=user_prompt)
def get_prompt_metadata() -> dict[str, str]:
    """Report the prompt and schema versions in use, for audit trails."""
    return dict(
        prompt_version=PROMPT_VERSION,
        schema_version=SCHEMA_VERSION,
    )
def get_json_schema() -> dict[str, Any]:
    """Return the extraction JSON schema used as Ollama's ``format`` parameter.

    The module-level schema object itself is returned, not a copy.
    """
    schema = EXTRACTION_JSON_SCHEMA
    return schema
+250
View File
@@ -0,0 +1,250 @@
"""Replay dataset loader and runner for deterministic extraction testing.
Loads archived document fixtures from JSON files, validates their expected
extraction outputs against the current schema, and provides a runner that
can compare live Ollama extraction results against expected baselines.
This enables:
- Schema regression testing: verify expected outputs still pass validation
- Prompt regression testing: detect drift when prompts or schemas change
- End-to-end replay: run fixtures through a live Ollama and compare
Requirements: 5.1, 5.2, 5.3, 5.4, 5.5
"""
from __future__ import annotations
import json
import logging
from dataclasses import dataclass, field
from pathlib import Path
from typing import Any
from services.extractor.schemas import (
ExtractionResult,
ValidationReport,
get_schema_version,
validate_extraction,
)
logger = logging.getLogger("extractor_replay")
FIXTURES_DIR = Path(__file__).resolve().parent.parent.parent / "tests" / "replay_fixtures"
@dataclass
class ReplayFixture:
    """One archived document together with its expected extraction."""

    # Source document
    document_id: str
    document_type: str
    document_text: str
    known_tickers: list[str]

    # Baseline, stored as a raw dict until it is parsed on demand
    expected_extraction: dict[str, Any]
    metadata: dict[str, str]

    # File the fixture was loaded from ("" when not set)
    source_path: str = ""

    @property
    def expected_result(self) -> ExtractionResult:
        """Validate ``expected_extraction`` and return it as an ``ExtractionResult``."""
        parsed = ExtractionResult.model_validate(self.expected_extraction)
        return parsed
@dataclass
class ReplayValidationResult:
    """Outcome of checking one fixture's expected output against the schema."""

    fixture_id: str
    # True only when validation ran and reported the extraction as valid.
    schema_valid: bool = False
    validation_report: ValidationReport | None = None
    schema_version: str = ""
    # Text of an exception raised while validating, if any.
    error: str | None = None
@dataclass
class ReplayComparisonResult:
    """Side-by-side comparison of a live extraction with a fixture baseline."""

    fixture_id: str

    # Tickers detected
    expected_companies: list[str] = field(default_factory=list)
    actual_companies: list[str] = field(default_factory=list)
    companies_match: bool = False

    # Sentiment per ticker
    expected_sentiment_map: dict[str, str] = field(default_factory=dict)
    actual_sentiment_map: dict[str, str] = field(default_factory=dict)
    sentiment_match: bool = False

    # Catalyst type per ticker
    expected_catalyst_map: dict[str, str] = field(default_factory=dict)
    actual_catalyst_map: dict[str, str] = field(default_factory=dict)
    catalyst_match: bool = False

    # Validation outcome of the live extraction
    actual_schema_valid: bool = False
    warnings: list[str] = field(default_factory=list)
def load_fixture(path: Path) -> ReplayFixture:
    """Load a single replay fixture from a JSON file.

    Args:
        path: Path to the fixture JSON file.

    Returns:
        A ReplayFixture with all fields populated.  ``known_tickers`` and
        ``metadata`` default to empty when absent from the file.

    Raises:
        ValueError: If the file does not contain a JSON object or is
            missing required fields.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    # JSON is UTF-8 by specification; do not depend on the locale default.
    with open(path, encoding="utf-8") as f:
        data = json.load(f)

    # A top-level list/string/number would otherwise fail with an
    # AttributeError on ``.keys()``, which callers that skip invalid
    # fixtures (ValueError / JSONDecodeError) do not catch.
    if not isinstance(data, dict):
        raise ValueError(
            f"Fixture {path.name} must contain a JSON object, got {type(data).__name__}"
        )

    required = {"document_id", "document_type", "document_text", "expected_extraction"}
    missing = required - set(data.keys())
    if missing:
        raise ValueError(f"Fixture {path.name} missing required fields: {missing}")

    return ReplayFixture(
        document_id=data["document_id"],
        document_type=data["document_type"],
        document_text=data["document_text"],
        known_tickers=data.get("known_tickers", []),
        expected_extraction=data["expected_extraction"],
        metadata=data.get("metadata", {}),
        source_path=str(path),
    )
def load_all_fixtures(fixtures_dir: Path | None = None) -> list[ReplayFixture]:
    """Load every ``*.json`` replay fixture found in a directory.

    Files that fail to load with ``ValueError`` or ``json.JSONDecodeError``
    are logged and skipped.

    Args:
        fixtures_dir: Directory to scan.  Falls back to ``FIXTURES_DIR``
            when omitted.

    Returns:
        The fixtures that loaded successfully, in sorted file-path order.
        An empty list when the directory does not exist.
    """
    directory = fixtures_dir or FIXTURES_DIR
    if not directory.is_dir():
        logger.warning("Fixtures directory not found: %s", directory)
        return []

    loaded: list[ReplayFixture] = []
    for fixture_path in sorted(directory.glob("*.json")):
        try:
            loaded.append(load_fixture(fixture_path))
        except (ValueError, json.JSONDecodeError) as exc:
            logger.warning("Skipping invalid fixture %s: %s", fixture_path.name, exc)

    logger.info("Loaded %d replay fixtures from %s", len(loaded), directory)
    return loaded
def validate_fixture(fixture: ReplayFixture) -> ReplayValidationResult:
    """Check that a fixture's expected extraction still validates.

    The expected output is run through ``validate_extraction`` together with
    the fixture's document text.  A failure means either the fixture is
    stale or the schema/validation rules have changed.

    Args:
        fixture: The replay fixture to validate.

    Returns:
        A ReplayValidationResult carrying the report, or the exception text
        in ``error`` when validation itself raised.
    """
    outcome = ReplayValidationResult(
        fixture_id=fixture.document_id,
        schema_version=get_schema_version(),
    )
    try:
        report = validate_extraction(
            fixture.expected_extraction,
            document_text=fixture.document_text,
        )
        outcome.validation_report = report
        outcome.schema_valid = report.valid
    except Exception as exc:  # noqa: BLE001
        # Any failure is recorded on the result rather than propagated.
        outcome.schema_valid = False
        outcome.error = str(exc)
    return outcome
def validate_all_fixtures(
    fixtures_dir: Path | None = None,
) -> list[ReplayValidationResult]:
    """Load every fixture in a directory and validate each one.

    Args:
        fixtures_dir: Override path to the fixtures directory.

    Returns:
        One validation result per loaded fixture, in load order.
    """
    return list(map(validate_fixture, load_all_fixtures(fixtures_dir)))
def compare_extraction(
    fixture: ReplayFixture,
    actual_result: ExtractionResult,
) -> ReplayComparisonResult:
    """Compare a live extraction with the fixture's expected output.

    The comparison is structural: the set of tickers, the sentiment per
    ticker and the catalyst type per ticker.  Free-text fields are not
    compared.  The live result is also re-validated against the fixture's
    document text.

    Args:
        fixture: The replay fixture with expected output.
        actual_result: The ExtractionResult from a live extraction.

    Returns:
        A ReplayComparisonResult with match flags, both sides of each
        comparison, and any validation warnings.
    """
    expected = fixture.expected_result
    expected_items = expected.companies
    actual_items = actual_result.companies

    comparison = ReplayComparisonResult(
        fixture_id=fixture.document_id,
        expected_companies=sorted(c.ticker for c in expected_items),
        actual_companies=sorted(c.ticker for c in actual_items),
        expected_sentiment_map={c.ticker: c.sentiment for c in expected_items},
        actual_sentiment_map={c.ticker: c.sentiment for c in actual_items},
        expected_catalyst_map={c.ticker: c.catalyst_type for c in expected_items},
        actual_catalyst_map={c.ticker: c.catalyst_type for c in actual_items},
    )

    # Tickers are compared as sets, so duplicates and order are ignored.
    comparison.companies_match = (
        set(comparison.expected_companies) == set(comparison.actual_companies)
    )
    comparison.sentiment_match = (
        comparison.expected_sentiment_map == comparison.actual_sentiment_map
    )
    comparison.catalyst_match = (
        comparison.expected_catalyst_map == comparison.actual_catalyst_map
    )

    # Re-validate the live result against the fixture's source text.
    actual_report = validate_extraction(
        actual_result.model_dump(mode="json"),
        document_text=fixture.document_text,
    )
    comparison.actual_schema_valid = actual_report.valid
    if actual_report.warnings:
        comparison.warnings = actual_report.warnings

    if not comparison.companies_match:
        comparison.warnings.append(
            f"company_mismatch: expected={comparison.expected_companies} actual={comparison.actual_companies}"
        )
    return comparison
+316
View File
@@ -0,0 +1,316 @@
"""JSON schema definitions for document intelligence extraction.
Generates Ollama-compatible JSON schemas from Pydantic models so the
extraction contract stays in sync with the shared data models. Also
provides schema validation and semantic validation helpers.
Requirements: 5.1, 5.2, 5.3, 5.4, 5.5
"""
from __future__ import annotations
import json
import re
from typing import Any
from pydantic import BaseModel, Field
from services.shared.schemas import (
CatalystType,
Sentiment,
)
SCHEMA_VERSION = "2.0.0"
# ---------------------------------------------------------------------------
# Pydantic model that mirrors the Ollama extraction output contract.
# This is the *response* shape we ask the model to produce — it intentionally
# omits server-side fields like document_id, source_credibility, and model
# metadata that are attached after extraction.
# ---------------------------------------------------------------------------
# NOTE: pydantic copies the class docstring and every Field description into
# the JSON schema produced by model_json_schema(). generate_json_schema()
# builds the Ollama schema from ExtractionResult, which nests this model, so
# this text reaches the model. Edit it as prompt content, not as a comment.
class CompanyExtractionItem(BaseModel):
    """Per-company extraction output expected from the model.
    All fields are required (no defaults) so the generated JSON schema
    forces the model to produce every field explicitly.
    """
    ticker: str = Field(description="Stock ticker symbol mentioned in the document.")
    company_name: str = Field(description="Full company name as referenced in the document.")
    # Scores are constrained to the closed interval [0, 1] by ge/le.
    relevance: float = Field(
        ge=0,
        le=1,
        description="How relevant the document is to this company. 0=tangential, 1=primary subject.",
    )
    sentiment: Sentiment = Field(description="Overall sentiment toward this company in the document.")
    impact_score: float = Field(
        ge=0,
        le=1,
        description="Estimated magnitude of impact. 0=negligible, 1=highly material.",
    )
    # Typed as a plain str here; membership in VALID_IMPACT_HORIZONS is
    # checked afterwards by _semantic_checks(), which reports an error.
    impact_horizon: str = Field(
        description="One of: intraday, 1d, 1d_7d, 1d_30d, 30d_90d, 90d_plus",
    )
    catalyst_type: CatalystType = Field(description="Primary catalyst category.")
    key_facts: list[str] = Field(
        description="Facts explicitly stated in the document. Do NOT infer or fabricate.",
    )
    risks: list[str] = Field(
        description="Risks explicitly mentioned in the document.",
    )
    evidence_spans: list[str] = Field(
        description="Short verbatim quotes from the document supporting the analysis.",
    )
# NOTE: generate_json_schema() derives the Ollama schema from this model via
# model_json_schema(), and pydantic includes the class docstring and Field
# descriptions in that schema. Treat this text as prompt content.
class ExtractionResult(BaseModel):
    """Top-level structured output the model must return.
    All fields are required (no defaults) so the generated JSON schema
    forces the model to produce every field explicitly.
    """
    summary: str = Field(
        description="A concise 1-3 sentence summary of the document's main point.",
    )
    # May be empty; the system prompt asks for an empty list when the
    # document content is insufficient.
    companies: list[CompanyExtractionItem] = Field(
        description="Per-company intelligence extracted from the document.",
    )
    macro_themes: list[str] = Field(
        description="Broad economic or market themes mentioned (e.g. rates, inflation, ai_capex).",
    )
    # Scores are constrained to the closed interval [0, 1] by ge/le.
    novelty_score: float = Field(
        ge=0,
        le=1,
        description="How novel or surprising the information is. 0=routine, 1=highly novel.",
    )
    confidence: float = Field(
        ge=0,
        le=1,
        description="Model confidence in the accuracy of this extraction. Lower if text is ambiguous.",
    )
    extraction_warnings: list[str] = Field(
        description="Any issues encountered: ambiguous_ticker, incomplete_text, low_confidence, etc.",
    )
# ---------------------------------------------------------------------------
# Schema generation
# ---------------------------------------------------------------------------
def generate_json_schema() -> dict[str, Any]:
    """Build the self-contained JSON schema for ``ExtractionResult``.

    The schema emitted by pydantic is passed through ``_inline_defs`` so
    that ``$defs``/``$ref`` indirection is replaced by the referenced
    definitions, giving a plain dict suitable for Ollama's ``format``
    parameter.
    """
    return _inline_defs(ExtractionResult.model_json_schema())
def get_schema_version() -> str:
    """Expose ``SCHEMA_VERSION``, the version string of the extraction schema."""
    version: str = SCHEMA_VERSION
    return version
# ---------------------------------------------------------------------------
# Validation helpers
# ---------------------------------------------------------------------------
class ValidationReport(BaseModel):
    """Result of validating a raw model response."""
    # False for JSON or structural failures, and when semantic checks report errors.
    valid: bool = False
    # Messages that make the report invalid.
    errors: list[str] = Field(default_factory=list)
    # Informational findings; they do not affect ``valid``.
    warnings: list[str] = Field(default_factory=list)
    # Set by validate_extraction() once structural validation succeeds, so
    # it can be present even when semantic errors make ``valid`` False.
    parsed: ExtractionResult | None = None
def validate_extraction(
    raw_json: str | dict[str, Any],
    *,
    document_text: str = "",
) -> ValidationReport:
    """Validate raw model output against the extraction schema.

    Validation runs in stages and stops at the first stage that fails:
    JSON decoding (for string input), a top-level-object check, pydantic
    structural validation, and finally the semantic checks.

    Args:
        raw_json: Either a JSON string or an already-parsed dict.
        document_text: Optional source text passed to the semantic checks.

    Returns:
        A ``ValidationReport``.  ``parsed`` is populated whenever structural
        validation succeeded; ``valid`` is additionally False when the
        semantic checks reported errors.
    """
    # Stage 1: decode JSON text when a string was supplied.
    data = raw_json
    if isinstance(raw_json, str):
        try:
            data = json.loads(raw_json)
        except json.JSONDecodeError as exc:
            return ValidationReport(valid=False, errors=[f"Invalid JSON: {exc}"])

    # Stage 2: the payload must be a JSON object.
    if not isinstance(data, dict):
        return ValidationReport(valid=False, errors=["Expected a JSON object at top level."])

    # Stage 3: structural validation through the pydantic model.
    try:
        result = ExtractionResult.model_validate(data)
    except Exception as exc:  # noqa: BLE001
        return ValidationReport(valid=False, errors=[f"Schema validation failed: {exc}"])

    # Stage 4: semantic checks; any semantic error invalidates the report
    # so that the caller can retry.
    sem_errors, sem_warnings = _semantic_checks(result, document_text)
    return ValidationReport(
        valid=not sem_errors,
        errors=list(sem_errors),
        warnings=list(sem_warnings),
        parsed=result,
    )
# ---------------------------------------------------------------------------
# Known valid impact horizons
# ---------------------------------------------------------------------------
# _semantic_checks() reports an error for any impact_horizon outside this set.
# The values mirror the list in CompanyExtractionItem.impact_horizon's
# description, so the two must be kept in sync.
VALID_IMPACT_HORIZONS = frozenset({
    "intraday",
    "1d",
    "1d_7d",
    "1d_30d",
    "30d_90d",
    "90d_plus",
})
# Ticker: 1-5 uppercase ASCII letters only. Symbols containing digits, dots
# or hyphens do not match; a mismatch only produces a warning in
# _semantic_checks(), not an error.
_TICKER_RE = re.compile(r"^[A-Z]{1,5}$")
# Evidence span length bounds (characters); spans outside them only produce
# warnings.
_MIN_EVIDENCE_LEN = 8
_MAX_EVIDENCE_LEN = 500
# ---------------------------------------------------------------------------
# Semantic validation rules
# ---------------------------------------------------------------------------
def _semantic_checks(
result: ExtractionResult,
document_text: str = "",
) -> tuple[list[str], list[str]]:
"""Run semantic checks on a parsed extraction.
Returns a tuple of (errors, warnings). Errors are issues severe enough
to warrant a retry; warnings are informational.
"""
errors: list[str] = []
warnings: list[str] = []
# --- Top-level checks ---
if not result.summary:
warnings.append("empty_summary")
if result.confidence < 0.3 and len(result.companies) > 0:
warnings.append("low_confidence_with_companies")
# Duplicate tickers across company entries
tickers_seen: list[str] = []
for comp in result.companies:
if comp.ticker in tickers_seen:
errors.append(f"duplicate_ticker_{comp.ticker}")
tickers_seen.append(comp.ticker)
# --- Per-company checks ---
for comp in result.companies:
tag = comp.ticker or "unknown"
# Ticker format
if not comp.ticker:
errors.append("company_missing_ticker")
elif not _TICKER_RE.match(comp.ticker):
warnings.append(f"invalid_ticker_format_{tag}")
# Impact horizon must be a known value
if comp.impact_horizon not in VALID_IMPACT_HORIZONS:
errors.append(f"invalid_impact_horizon_{comp.impact_horizon}_for_{tag}")
# Evidence spans
if not comp.evidence_spans:
warnings.append(f"no_evidence_spans_for_{tag}")
else:
for idx, span in enumerate(comp.evidence_spans):
if len(span) < _MIN_EVIDENCE_LEN:
warnings.append(f"evidence_span_too_short_for_{tag}_{idx}")
if len(span) > _MAX_EVIDENCE_LEN:
warnings.append(f"evidence_span_too_long_for_{tag}_{idx}")
# Cross-field: high impact but no facts
if not comp.key_facts and comp.impact_score > 0.5:
warnings.append(f"high_impact_no_facts_for_{tag}")
# Cross-field: very low relevance
if comp.relevance < 0.2:
warnings.append(f"very_low_relevance_for_{tag}")
# Cross-field: strong sentiment but low impact
if comp.sentiment in (Sentiment.POSITIVE, Sentiment.NEGATIVE) and comp.impact_score < 0.1:
warnings.append(f"strong_sentiment_low_impact_for_{tag}")
# --- Evidence grounding check (when source text is available) ---
if document_text:
doc_lower = document_text.lower()
for comp in result.companies:
for idx, span in enumerate(comp.evidence_spans):
if span.lower() not in doc_lower:
warnings.append(
f"evidence_span_not_found_in_document_for_{comp.ticker or 'unknown'}_{idx}"
)
return errors, warnings
# ---------------------------------------------------------------------------
# Internal helpers
# ---------------------------------------------------------------------------
def _inline_defs(schema: dict[str, Any]) -> dict[str, Any]:
"""Recursively inline ``$defs`` / ``$ref`` so the schema is self-contained."""
defs = schema.pop("$defs", {})
return _resolve_refs(schema, defs)
def _resolve_refs(node: Any, defs: dict[str, Any]) -> Any:
"""Walk the schema tree and replace ``$ref`` pointers with their definitions."""
if isinstance(node, dict):
if "$ref" in node:
ref_path = node["$ref"] # e.g. "#/$defs/CompanyExtractionItem"
ref_name = ref_path.rsplit("/", 1)[-1]
if ref_name in defs:
resolved = defs[ref_name].copy()
# The resolved def may itself contain refs
return _resolve_refs(resolved, defs)
return node # unresolvable ref, leave as-is
return {k: _resolve_refs(v, defs) for k, v in node.items()}
if isinstance(node, list):
return [_resolve_refs(item, defs) for item in node]
return node
+291 -1
View File
@@ -1 +1,291 @@
"""Extraction worker - sends documents to Ollama for structured intelligence extraction.""" """Extraction worker - sends documents to Ollama for structured intelligence extraction.
Orchestrates the full extraction pipeline for a single document:
1. Calls OllamaClient to get structured extraction
2. Uploads prompts, raw outputs, and validation reports to MinIO
3. Persists the final intelligence object and per-company impact records to PostgreSQL
4. Updates document status
Requirements: 5.1, 5.2, 5.3, 5.4, 5.5, 9.1, 9.2
"""
from __future__ import annotations
import json
import logging
from dataclasses import dataclass
from datetime import datetime, timezone
import asyncpg
from minio import Minio
from services.extractor.client import ExtractionResponse
from services.extractor.metrics import collect_metrics, persist_metrics
from services.shared.metadata import (
persist_document_impact,
persist_document_intelligence,
update_document_status,
)
from services.shared.storage import (
upload_extraction_intelligence,
upload_extraction_prompt,
upload_extraction_raw_output,
upload_extraction_validation,
)
from services.shared.logging import Span
from services.shared.metrics import (
EXTRACTION_ATTEMPTS,
EXTRACTION_CONFIDENCE,
EXTRACTION_DURATION,
EXTRACTION_JOBS_TOTAL,
EXTRACTION_RETRIES,
EXTRACTION_TOKEN_ESTIMATE,
EXTRACTION_VALIDATION_ERRORS,
)
logger = logging.getLogger("extractor_worker")
@dataclass
class ExtractionPersistResult:
    """Result of persisting an extraction to storage and database.

    Every field starts unset; ``persist_extraction`` fills them in as the
    corresponding step completes.
    """

    # ID returned by persist_document_intelligence (set on success and on failure).
    intelligence_id: str | None = None
    # Return value of upload_extraction_prompt.
    prompt_ref: str | None = None
    # Return value of upload_extraction_raw_output.
    raw_output_ref: str | None = None
    # Return value of upload_extraction_validation.
    validation_ref: str | None = None
    # Return value of upload_extraction_intelligence; only set for a successful extraction.
    intelligence_ref: str | None = None
    # IDs returned by persist_document_impact; stays None when the extraction failed.
    impact_ids: list[str] | None = None
    # ID returned by persist_metrics; stays None if metrics persistence raised.
    metrics_id: str | None = None
    # True only after the successful-extraction path has finished persisting.
    success: bool = False
async def persist_extraction(
    *,
    pool: asyncpg.Pool,
    minio_client: Minio,
    document_id: str,
    ticker: str,
    extraction_response: ExtractionResponse,
    company_id_map: dict[str, str] | None = None,
    source_credibility: float = 0.5,
    timestamp: datetime | None = None,
    document_text_length: int = 0,
) -> ExtractionPersistResult:
    """Persist all extraction artifacts to MinIO and PostgreSQL.

    Uploads prompts, raw model outputs, validation reports, and the final
    intelligence object to MinIO. Persists the intelligence record and
    per-company impact records to PostgreSQL. Updates document status.
    Also collects and persists model performance metrics.

    Args:
        pool: PostgreSQL connection pool.
        minio_client: MinIO client.
        document_id: UUID of the source document.
        ticker: Primary ticker for path construction.
        extraction_response: Full response from OllamaClient.extract().
        company_id_map: Optional mapping of ticker -> company UUID for impact records.
        source_credibility: Credibility score to attach to the intelligence record.
        timestamp: Override timestamp for MinIO paths (defaults to UTC now).
        document_text_length: Length of the input document text for token estimation.

    Returns:
        ExtractionPersistResult with references to all persisted artifacts.
        ``success`` is True only when the extraction succeeded and was persisted.
    """
    ts = timestamp or datetime.now(timezone.utc)
    result = ExtractionPersistResult()
    company_id_map = company_id_map or {}

    attempts = extraction_response.attempts
    final_attempt = attempts[-1] if attempts else None
    final_validation = final_attempt.validation if final_attempt else None
    # Retries are the attempts made after the first one. Computed once so the
    # success record, the failure record and the Prometheus counter all agree
    # (the failure record previously stored the raw attempt count).
    retry_count = max(0, len(attempts) - 1)

    # 1. Upload prompt metadata to MinIO
    prompt_payload = json.dumps({
        "prompt_metadata": extraction_response.prompt_metadata,
        "model": extraction_response.model,
    }, indent=2).encode()
    result.prompt_ref = upload_extraction_prompt(
        minio_client, ticker, document_id, prompt_payload, timestamp=ts,
    )

    # 2. Upload raw outputs for each attempt
    attempts_data: list[dict[str, object]] = []
    for idx, attempt in enumerate(attempts):
        attempt_record: dict[str, object] = {
            "attempt_index": idx,
            "raw_output": attempt.raw_output,
            "error": attempt.error,
            "duration_ms": attempt.duration_ms,
            "model": attempt.model,
            "retryable": attempt.retryable,
        }
        if attempt.validation:
            attempt_record["validation"] = {
                "valid": attempt.validation.valid,
                "errors": attempt.validation.errors,
                "warnings": attempt.validation.warnings,
            }
        attempts_data.append(attempt_record)

    raw_output_payload = json.dumps({
        "document_id": document_id,
        "attempts": attempts_data,
        "total_duration_ms": extraction_response.total_duration_ms,
        "success": extraction_response.success,
    }, indent=2).encode()
    result.raw_output_ref = upload_extraction_raw_output(
        minio_client, ticker, document_id, raw_output_payload, timestamp=ts,
    )

    # 3. Upload validation report (describes the final attempt only)
    validation_payload = json.dumps({
        "document_id": document_id,
        "success": extraction_response.success,
        "attempt_count": len(attempts),
        "final_validation": {
            "valid": final_validation.valid if final_validation else False,
            "errors": final_validation.errors if final_validation else [],
            "warnings": final_validation.warnings if final_validation else [],
        } if final_attempt else None,
    }, indent=2).encode()
    result.validation_ref = upload_extraction_validation(
        minio_client, ticker, document_id, validation_payload, timestamp=ts,
    )

    # 4. Determine validation status and persist intelligence
    if extraction_response.success and extraction_response.result:
        extraction = extraction_response.result
        validation_status = "valid"
        validation_errors: list[str] = []

        # Upload final intelligence object to MinIO
        intelligence_payload = json.dumps(
            extraction.model_dump(mode="json"), indent=2,
        ).encode()
        result.intelligence_ref = upload_extraction_intelligence(
            minio_client, ticker, document_id, intelligence_payload, timestamp=ts,
        )

        # Persist to PostgreSQL
        intel_id = await persist_document_intelligence(
            pool,
            document_id=document_id,
            summary=extraction.summary,
            macro_themes=extraction.macro_themes,
            novelty_score=extraction.novelty_score,
            source_credibility=source_credibility,
            extraction_warnings=extraction.extraction_warnings,
            confidence=extraction.confidence,
            model_provider="ollama",
            model_name=extraction_response.model,
            prompt_version=extraction_response.prompt_metadata.get("prompt_version", ""),
            schema_version=extraction_response.prompt_metadata.get("schema_version", ""),
            raw_output_ref=result.raw_output_ref,
            prompt_ref=result.prompt_ref,
            validation_status=validation_status,
            validation_errors=validation_errors,
            retry_count=retry_count,
        )
        result.intelligence_id = intel_id

        # Persist per-company impact records
        result.impact_ids = []
        for company in extraction.companies:
            cid = company_id_map.get(company.ticker)
            if not cid:
                # Without a company UUID there is nothing to link the impact to.
                logger.warning(
                    "No company_id for ticker %s in doc %s, skipping impact record",
                    company.ticker, document_id,
                )
                continue
            impact_id = await persist_document_impact(
                pool,
                intelligence_id=intel_id,
                company_id=cid,
                ticker=company.ticker,
                relevance=company.relevance,
                sentiment=company.sentiment,
                impact_score=company.impact_score,
                impact_horizon=company.impact_horizon,
                catalyst_type=company.catalyst_type,
                key_facts=company.key_facts,
                risks=company.risks,
                evidence_spans=company.evidence_spans,
            )
            result.impact_ids.append(impact_id)

        await update_document_status(pool, document_id=document_id, status="extracted")
        result.success = True
        logger.info(
            "Extraction persisted for doc %s: intel=%s, impacts=%d",
            document_id, intel_id, len(result.impact_ids),
        )
    else:
        # Failed extraction — still persist the attempt data
        all_errors: list[str] = [attempt.error for attempt in attempts if attempt.error]
        intel_id = await persist_document_intelligence(
            pool,
            document_id=document_id,
            summary="",
            macro_themes=[],
            novelty_score=0.0,
            source_credibility=source_credibility,
            extraction_warnings=["extraction_failed"],
            confidence=0.0,
            model_provider="ollama",
            model_name=extraction_response.model,
            prompt_version=extraction_response.prompt_metadata.get("prompt_version", ""),
            schema_version=extraction_response.prompt_metadata.get("schema_version", ""),
            raw_output_ref=result.raw_output_ref,
            prompt_ref=result.prompt_ref,
            validation_status="failed",
            validation_errors=all_errors,
            retry_count=retry_count,
        )
        result.intelligence_id = intel_id
        await update_document_status(pool, document_id=document_id, status="extraction_failed")
        logger.warning(
            "Extraction failed for doc %s after %d attempts: %s",
            document_id, len(attempts), "; ".join(all_errors),
        )

    # Collect and persist model performance metrics. Best-effort: a metrics
    # failure is logged and must not undo the persistence done above.
    try:
        metrics = collect_metrics(
            extraction_response,
            document_id=document_id,
            ticker=ticker,
            document_text_length=document_text_length,
        )
        metrics.recorded_at = ts
        metrics_id = await persist_metrics(pool, metrics)
        result.metrics_id = metrics_id
    except Exception:
        logger.exception("Failed to persist extraction metrics for doc %s", document_id)

    # Prometheus metrics
    EXTRACTION_ATTEMPTS.inc(len(attempts))
    EXTRACTION_DURATION.observe(extraction_response.total_duration_ms / 1000.0)
    if retry_count > 0:
        EXTRACTION_RETRIES.inc(retry_count)
    if extraction_response.success:
        EXTRACTION_JOBS_TOTAL.labels(status="success").inc()
        if extraction_response.result:
            EXTRACTION_CONFIDENCE.observe(extraction_response.result.confidence)
    else:
        EXTRACTION_JOBS_TOTAL.labels(status="failed").inc()

    # Count validation errors from final attempt
    if final_validation and final_validation.errors:
        EXTRACTION_VALIDATION_ERRORS.inc(len(final_validation.errors))

    # Token estimates: character count divided by 4
    if document_text_length > 0:
        EXTRACTION_TOKEN_ESTIMATE.labels(direction="input").inc(document_text_length // 4)
    if final_attempt and final_attempt.raw_output:
        EXTRACTION_TOKEN_ESTIMATE.labels(direction="output").inc(len(final_attempt.raw_output) // 4)

    return result
+151 -80
View File
@@ -1,47 +1,50 @@
"""Ingestion worker - processes jobs from the ingestion queue.""" """Ingestion worker - processes jobs from the ingestion queue."""
import asyncio import asyncio
import hashlib
import io
import json import json
import logging import logging
from datetime import datetime
import asyncpg import asyncpg
import redis.asyncio as aioredis import redis.asyncio as aioredis
from minio import Minio from minio import Minio
from services.adapters.base import AdapterResult from services.adapters.base import AdapterResult
from services.adapters.filings_adapter import FilingsAdapter from services.adapters.broker_adapter import AlpacaBrokerAdapter, TradingMode
from services.adapters.market_adapter import MarketDataAdapter from services.adapters.filings_adapter import SECEdgarAdapter
from services.adapters.news_adapter import NewsApiAdapter from services.adapters.market_adapter import PolygonMarketAdapter
from services.adapters.news_adapter import PolygonNewsAdapter
from services.adapters.web_scrape_adapter import WebScrapeAdapter
from services.shared.config import load_config from services.shared.config import load_config
from services.shared.db import get_minio, get_pg_pool, get_redis from services.shared.db import get_minio, get_pg_pool, get_redis
from services.shared.dedupe import dedupe_items, mark_as_seen
from services.shared.metadata import (
persist_ingestion_items,
record_retrieval_failure,
reset_source_retry_state,
)
from services.shared.redis_keys import ( from services.shared.redis_keys import (
QUEUE_INGESTION, QUEUE_INGESTION,
QUEUE_PARSING, QUEUE_PARSING,
dedupe_key, dedupe_key,
queue_key, queue_key,
) )
from services.shared.logging import Span, extract_trace_context, inject_trace_context, new_trace_id, set_trace_context, setup_logging
from services.shared.metrics import (
ACTIVE_JOBS,
INGESTION_ADAPTER_DURATION,
INGESTION_ERRORS,
INGESTION_ITEMS_DEDUPED,
INGESTION_ITEMS_FETCHED,
INGESTION_ITEMS_NEW,
INGESTION_JOBS_TOTAL,
)
from services.shared.storage import (
bucket_for_source,
ensure_buckets,
upload_raw_artifact,
)
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("ingestion_worker") logger = logging.getLogger("ingestion_worker")
BUCKET_MAP = {
"market_api": "stonks-raw-market",
"news_api": "stonks-raw-news",
"filings_api": "stonks-raw-filings",
"broker": "stonks-raw-market",
}
def build_storage_path(source_type: str, ticker: str, doc_id: str) -> str:
    """Build the object key for a raw payload, partitioned by the current UTC date.

    Returns:
        ``<source_type>/<ticker>/<YYYY>/<MM>/<DD>/<doc_id>/raw.json``
    """
    # Local import: only ``datetime`` is imported in the visible import block.
    from datetime import timezone

    # datetime.utcnow() is deprecated and returns a naive value; ask for an
    # explicit UTC-aware "now" instead. The date parts are the same.
    now = datetime.now(timezone.utc)
    return f"{source_type}/{ticker}/{now.year}/{now.month:02d}/{now.day:02d}/{doc_id}/raw.json"
async def store_raw_artifact(minio_client: Minio, bucket: str, path: str, data: bytes):
    """Upload *data* to ``bucket`` under key ``path`` with a JSON content type."""
    payload_stream = io.BytesIO(data)
    payload_size = len(data)
    minio_client.put_object(
        bucket,
        path,
        payload_stream,
        payload_size,
        content_type="application/json",
    )
async def process_job( async def process_job(
job: dict, job: dict,
@@ -55,9 +58,11 @@ async def process_job(
source_id = job["source_id"] source_id = job["source_id"]
config = job.get("config", {}) config = job.get("config", {})
set_trace_context(trace_id=job.get("_trace_id") or new_trace_id())
adapter = adapters.get(source_type) adapter = adapters.get(source_type)
if not adapter: if not adapter:
logger.warning(f"No adapter for source_type={source_type}") logger.warning("No adapter for source_type=%s", source_type)
return return
# Record ingestion run # Record ingestion run
@@ -68,25 +73,37 @@ async def process_job(
) )
try: try:
result: AdapterResult = await adapter.fetch(ticker, config) with Span("adapter_fetch", ticker=ticker, source_type=source_type):
with INGESTION_ADAPTER_DURATION.labels(source_type=source_type).time():
result: AdapterResult = await adapter.fetch(ticker, config)
if result.error: if result.error:
await pool.execute( INGESTION_JOBS_TOTAL.labels(source_type=source_type, status="error").inc()
"UPDATE ingestion_runs SET status='failed', error_message=$2, completed_at=NOW() WHERE id=$1", await record_retrieval_failure(
run_id, result.error, pool,
run_id=str(run_id),
source_id=source_id,
error_message=result.error,
) )
return return
# Store raw payload # Store raw payload in MinIO
bucket = BUCKET_MAP.get(source_type, "stonks-raw-market") bucket = bucket_for_source(source_type)
storage_path = build_storage_path(source_type, ticker, str(run_id)) artifact_type = "raw_html" if source_type == "web_scrape" else "raw_json"
await store_raw_artifact(minio_client, bucket, storage_path, result.raw_payload) storage_uri = upload_raw_artifact(
minio_client,
source_type=source_type,
ticker=ticker,
document_id=str(run_id),
data=result.raw_payload,
artifact_type=artifact_type,
)
# Dedupe check # Dedupe check on the overall payload hash
if result.content_hash: if result.content_hash:
already_seen = await rds.get(dedupe_key(result.content_hash)) already_seen = await rds.get(dedupe_key(result.content_hash))
if already_seen: if already_seen:
logger.info(f"Duplicate content for {ticker}, skipping") logger.info("Duplicate content for %s, skipping", ticker)
await pool.execute( await pool.execute(
"UPDATE ingestion_runs SET status='completed', items_fetched=$2, items_new=0, completed_at=NOW() WHERE id=$1", "UPDATE ingestion_runs SET status='completed', items_fetched=$2, items_new=0, completed_at=NOW() WHERE id=$1",
run_id, len(result.items), run_id, len(result.items),
@@ -94,72 +111,126 @@ async def process_job(
return return
await rds.set(dedupe_key(result.content_hash), "1", ex=86400) await rds.set(dedupe_key(result.content_hash), "1", ex=86400)
new_items = 0 # Cross-source dedupe on individual document items (news, filings, web_scrape)
for item in result.items: items_to_persist = result.items
item_json = json.dumps(item) deduped_count = 0
item_hash = hashlib.sha256(item_json.encode()).hexdigest() if source_type not in ("market_api", "broker"):
items_to_persist, dup_items = await dedupe_items(pool, rds, result.items)
deduped_count = len(dup_items)
if deduped_count:
INGESTION_ITEMS_DEDUPED.labels(source_type=source_type).inc(deduped_count)
logger.info(
"Deduped %d/%d items for %s/%s",
deduped_count, len(result.items), ticker, source_type,
)
# Check if document already exists # Persist metadata via the unified metadata module
exists = await pool.fetchval("SELECT 1 FROM documents WHERE content_hash = $1", item_hash) new_items, new_ids = await persist_ingestion_items(
if exists: pool,
continue source_type=source_type,
ticker=ticker,
company_id=job.get("company_id"),
items=items_to_persist,
storage_ref=storage_uri,
adapter_metadata=result.metadata,
content_hash=result.content_hash,
)
title = item.get("title", item.get("name", "")) # Enqueue new document items for parsing (not market/broker)
url = item.get("url", item.get("link", "")) if source_type not in ("market_api", "broker"):
published = item.get("publishedAt", item.get("published_at")) for doc_id in new_ids:
await rds.rpush(queue_key(QUEUE_PARSING), json.dumps(inject_trace_context({
"document_id": doc_id,
"ticker": ticker,
"source_type": source_type,
})))
doc_id = await pool.fetchval( # Mark newly persisted documents in Redis for fast future dedupe
"""INSERT INTO documents (document_type, source_type, publisher, url, title, published_at, content_hash, raw_storage_ref, status) for item, doc_id in zip(items_to_persist, new_ids):
VALUES ($1, $2, $3, $4, $5, $6, $7, $8, 'ingested') await mark_as_seen(
RETURNING id""", rds,
"article" if source_type == "news_api" else "filing" if source_type == "filings_api" else "article", content_hash=item.get("content_hash", ""),
source_type, canonical_url=item.get("canonical_url"),
item.get("source", {}).get("name", "") if isinstance(item.get("source"), dict) else str(item.get("source", "")), document_id=doc_id,
url, title, )
datetime.fromisoformat(published.replace("Z", "+00:00")) if published else None,
item_hash,
f"s3://{bucket}/{storage_path}",
)
# Enqueue for parsing # Link duplicate documents to this company if not already linked
await rds.rpush(queue_key(QUEUE_PARSING), json.dumps({ company_id = job.get("company_id")
"document_id": str(doc_id), if company_id and deduped_count:
"ticker": ticker, from services.shared.metadata import persist_document_company_mention
"source_type": source_type, for dup in dup_items:
"url": url, existing_id = dup.get("_dedupe_existing_id")
})) if existing_id:
new_items += 1 try:
await persist_document_company_mention(
pool,
document_id=existing_id,
company_id=company_id,
ticker=ticker,
mention_type="cross_source",
)
except Exception:
# Duplicate mention link — safe to ignore
pass
await pool.execute( await pool.execute(
"UPDATE ingestion_runs SET status='completed', items_fetched=$2, items_new=$3, completed_at=NOW() WHERE id=$1", "UPDATE ingestion_runs SET status='completed', items_fetched=$2, items_new=$3, completed_at=NOW() WHERE id=$1",
run_id, len(result.items), new_items, run_id, len(result.items), new_items,
) )
logger.info(f"Ingested {ticker}/{source_type}: {len(result.items)} fetched, {new_items} new") # Clear any accumulated retry backoff after success
await reset_source_retry_state(pool, source_id)
INGESTION_ITEMS_FETCHED.labels(source_type=source_type).inc(len(result.items))
INGESTION_ITEMS_NEW.labels(source_type=source_type).inc(new_items)
INGESTION_JOBS_TOTAL.labels(source_type=source_type, status="success").inc()
logger.info(
"Ingested %s/%s: %d fetched, %d new",
ticker, source_type, len(result.items), new_items,
extra={"ticker": ticker, "source_type": source_type, "count": new_items},
)
except Exception as e: except Exception as e:
logger.error(f"Ingestion error for {ticker}: {e}") INGESTION_ERRORS.labels(source_type=source_type).inc()
await pool.execute( INGESTION_JOBS_TOTAL.labels(source_type=source_type, status="error").inc()
"UPDATE ingestion_runs SET status='failed', error_message=$2, completed_at=NOW() WHERE id=$1", logger.error(
run_id, str(e), "Ingestion error for %s: %s", ticker, e,
extra={"ticker": ticker, "source_type": source_type, "error": str(e)},
)
await record_retrieval_failure(
pool,
run_id=str(run_id),
source_id=source_id,
error_message=str(e),
) )
async def main(): async def main():
config = load_config() cfg = load_config()
pool = await get_pg_pool(config) setup_logging("ingestion_worker", level=cfg.log_level, json_output=cfg.json_logs)
rds = get_redis(config)
minio_client = get_minio(config) pool = await get_pg_pool(cfg)
rds = get_redis(cfg)
minio_client = get_minio(cfg)
# Ensure all required buckets exist
ensure_buckets(minio_client)
adapters = { adapters = {
"market_api": MarketDataAdapter( "market_api": PolygonMarketAdapter(
api_key=config.broker.api_key or "", api_key=cfg.market_data.api_key,
base_url=cfg.market_data.base_url,
),
"news_api": PolygonNewsAdapter(
api_key=cfg.market_data.api_key,
base_url="https://api.polygon.io", base_url="https://api.polygon.io",
), ),
"news_api": NewsApiAdapter( "filings_api": SECEdgarAdapter(),
api_key="", "web_scrape": WebScrapeAdapter(),
base_url="https://newsapi.org", "broker": AlpacaBrokerAdapter(
api_key=cfg.broker.api_key or "",
api_secret=cfg.broker.api_secret or "",
mode=TradingMode.LIVE if cfg.broker.mode == "live" else TradingMode.PAPER,
base_url=cfg.broker.base_url,
), ),
"filings_api": FilingsAdapter(),
} }
logger.info("Ingestion worker started") logger.info("Ingestion worker started")
+1 -1
View File
@@ -1 +1 @@
# Lake Publisher - transforms operational data into analytical fact datasets """Lake publisher — writes partitioned Parquet facts to MinIO for Trino/Superset."""
+39
View File
@@ -0,0 +1,39 @@
"""Helpers for enqueuing lake publish jobs from upstream workers.
Other services import these helpers to push jobs onto the QUEUE_LAKE_PUBLISH
Redis queue. The lake publisher worker (jobs.py) consumes them.
Usage:
await enqueue_lake_job(rds, "document", document_id)
await enqueue_lake_job(rds, "trade_order", order_id)
await enqueue_lake_job(rds, "bulk_documents", since=cutoff.isoformat())
"""
from __future__ import annotations
import json
import redis.asyncio as aioredis
from services.shared.redis_keys import QUEUE_LAKE_PUBLISH, queue_key
async def enqueue_lake_job(
    rds: aioredis.Redis,
    job_type: str,
    entity_id: str = "",
    since: str | None = None,
) -> None:
    """Append a lake publish job to the Redis lake-publish queue.

    Args:
        rds: Async Redis client.
        job_type: One of the supported job types (document, document_extraction,
            market_snapshot, trade_order, trade_fill, positions_snapshot,
            pnl_snapshot, bulk_documents, bulk_extractions).
        entity_id: UUID or identifier for the entity to publish.
        since: ISO datetime string for bulk jobs (cutoff timestamp). Omitted
            from the job payload when empty or None.
    """
    optional_fields = {"since": since} if since else {}
    job: dict[str, str] = {
        "job_type": job_type,
        "entity_id": entity_id,
        **optional_fields,
    }
    target_queue = queue_key(QUEUE_LAKE_PUBLISH)
    serialized = json.dumps(job)
    await rds.rpush(target_queue, serialized)  # type: ignore[misc]
+420
View File
@@ -0,0 +1,420 @@
"""Iceberg table creation and metadata management for analytical datasets.
Manages Iceberg tables in Trino's Iceberg catalog, providing:
- Table creation with proper schemas and partition specs
- Schema synchronization between PyArrow definitions and Iceberg tables
- Table metadata inspection (existence checks, schema retrieval, partition listing)
The Iceberg catalog complements the existing Hive-compatible partition layout.
Parquet files written by the lake publisher are stored in the same MinIO paths,
but Iceberg metadata enables schema evolution, snapshot isolation, and better
partition pruning via Trino's Iceberg connector.
Requirements: 9.4, 9.5, 10.1, N4, N6
Design ref: Section 5.3 (Lakehouse model), Section 4.12 (SQL Query Engine)
"""
from __future__ import annotations
import logging
from dataclasses import dataclass
from typing import Any
import pyarrow as pa
from trino.dbapi import connect as trino_connect
from services.lake_publisher.partitions import (
LAKEHOUSE_BUCKET,
TABLE_PARTITIONS,
WAREHOUSE_PREFIX,
PartitionSpec,
)
from services.lake_publisher.worker import (
COMPANY_EVENTS_SCHEMA,
DOCUMENTS_SCHEMA,
DOCUMENT_EXTRACTIONS_SCHEMA,
MARKET_BARS_SCHEMA,
MARKET_QUOTES_SCHEMA,
MODEL_PERFORMANCE_SCHEMA,
PNL_DAILY_SCHEMA,
POSITIONS_DAILY_SCHEMA,
PREDICTION_VS_OUTCOME_SCHEMA,
TRADE_FILLS_SCHEMA,
TRADE_ORDERS_SCHEMA,
TRADE_SIGNALS_SCHEMA,
)
logger = logging.getLogger(__name__)
ICEBERG_CATALOG = "iceberg"
ICEBERG_SCHEMA = "stonks"
def _get_iceberg_catalog() -> str:
    """Return the ``TRINO_ICEBERG_CATALOG`` env value, or ``ICEBERG_CATALOG`` if unset."""
    import os

    return os.environ.get("TRINO_ICEBERG_CATALOG", ICEBERG_CATALOG)
# Map PyArrow types to Trino/Iceberg SQL types.
# Keys are matched against ``str(arrow_type)`` in ``_arrow_type_to_trino``, so
# several spellings of the same type are listed side by side. Timestamps are
# not listed here; ``_arrow_type_to_trino`` handles them before this lookup.
_ARROW_TO_TRINO: dict[str, str] = {
    "string": "VARCHAR",
    "utf8": "VARCHAR",
    "large_string": "VARCHAR",
    "large_utf8": "VARCHAR",
    "float64": "DOUBLE",
    "double": "DOUBLE",
    "float32": "REAL",
    "float": "REAL",
    "int8": "TINYINT",
    "int16": "SMALLINT",
    "int32": "INTEGER",
    "int64": "BIGINT",
    "bool": "BOOLEAN",
    "date32": "DATE",
    "date32[day]": "DATE",
    "date64": "DATE",
}
def _arrow_type_to_trino(arrow_type: pa.DataType) -> str:
    """Convert a PyArrow data type to a Trino SQL type string.

    Args:
        arrow_type: The PyArrow type of a schema field.

    Returns:
        The Trino type name to use in DDL.

    Raises:
        ValueError: If the type has no Trino mapping here.
    """
    # Timestamps first. Inspect the type itself rather than its string form:
    # a timezone-aware timestamp carries a non-None ``tz``.
    if pa.types.is_timestamp(arrow_type):
        if arrow_type.tz is not None:
            return "TIMESTAMP(6) WITH TIME ZONE"
        return "TIMESTAMP(6)"

    # Direct lookup on the type's string form
    mapped = _ARROW_TO_TRINO.get(str(arrow_type))
    if mapped:
        return mapped

    # Fallback by type family for spellings missing from the table. These are
    # deliberately the widest types (any integer -> BIGINT, any float -> DOUBLE).
    if pa.types.is_string(arrow_type) or pa.types.is_large_string(arrow_type):
        return "VARCHAR"
    if pa.types.is_floating(arrow_type):
        return "DOUBLE"
    if pa.types.is_integer(arrow_type):
        return "BIGINT"
    if pa.types.is_boolean(arrow_type):
        return "BOOLEAN"
    if pa.types.is_date(arrow_type):
        return "DATE"
    raise ValueError(f"Unsupported PyArrow type for Iceberg DDL: {arrow_type}")
# Registry mapping table names to their PyArrow schemas.
# ``get_all_table_defs`` and ``get_table_def`` look tables up here using the
# keys of ``TABLE_PARTITIONS``; a table with no entry here is skipped by the
# former and rejected with ValueError by the latter.
TABLE_SCHEMAS: dict[str, pa.Schema] = {
    "market_bars": MARKET_BARS_SCHEMA,
    "market_quotes": MARKET_QUOTES_SCHEMA,
    "company_events": COMPANY_EVENTS_SCHEMA,
    "documents": DOCUMENTS_SCHEMA,
    "document_extractions": DOCUMENT_EXTRACTIONS_SCHEMA,
    "trade_signals": TRADE_SIGNALS_SCHEMA,
    "trade_orders": TRADE_ORDERS_SCHEMA,
    "trade_fills": TRADE_FILLS_SCHEMA,
    "positions_daily": POSITIONS_DAILY_SCHEMA,
    "pnl_daily": PNL_DAILY_SCHEMA,
    "prediction_vs_outcome": PREDICTION_VS_OUTCOME_SCHEMA,
    "model_performance": MODEL_PERFORMANCE_SCHEMA,
}
@dataclass(frozen=True)
class IcebergTableDef:
    """Immutable description of one Iceberg table: name, PyArrow schema, partitioning."""

    table_name: str
    schema: pa.Schema
    partition_spec: PartitionSpec

    @property
    def qualified_name(self) -> str:
        """Table name prefixed with the module-level catalog and schema constants."""
        return ".".join((ICEBERG_CATALOG, ICEBERG_SCHEMA, self.table_name))

    @property
    def location(self) -> str:
        """``s3a://`` URI of the table directory under the warehouse prefix."""
        return f"s3a://{LAKEHOUSE_BUCKET}/{WAREHOUSE_PREFIX}/{self.table_name}/"

    def column_defs_sql(self) -> list[str]:
        """Render one ``<name> <trino type>`` line per field of the PyArrow schema.

        Partition columns are included in the column list (Iceberg stores them
        in the data files, unlike Hive external tables).
        """
        return [
            f" {field.name} {_arrow_type_to_trino(field.type)}"
            for field in self.schema
        ]

    def partition_keys_sql(self) -> str:
        """Render the ``partitioning = ARRAY[...]`` table property, or "" without keys."""
        literals = [f"'{key}'" for key in self.partition_spec.all_keys]
        if literals:
            return f"partitioning = ARRAY[{', '.join(literals)}]"
        return ""

    def create_table_sql(self) -> str:
        """Render the CREATE TABLE IF NOT EXISTS statement for Trino's Iceberg catalog."""
        column_block = ",\n".join(self.column_defs_sql())
        properties = ["format = 'PARQUET'", f"location = '{self.location}'"]
        partitioning = self.partition_keys_sql()
        if partitioning:
            properties.append(partitioning)
        property_block = ",\n ".join(properties)
        statement_lines = [
            f"CREATE TABLE IF NOT EXISTS {self.qualified_name} (",
            column_block,
            ") WITH (",
            f" {property_block}",
            ")",
        ]
        return "\n".join(statement_lines)
def get_all_table_defs() -> list[IcebergTableDef]:
    """Return an IcebergTableDef for each partitioned table that has a PyArrow schema.

    Tables listed in ``TABLE_PARTITIONS`` without a ``TABLE_SCHEMAS`` entry are
    logged with a warning and left out.
    """
    table_defs: list[IcebergTableDef] = []
    for name, spec in TABLE_PARTITIONS.items():
        arrow_schema = TABLE_SCHEMAS.get(name)
        if arrow_schema is not None:
            table_defs.append(
                IcebergTableDef(table_name=name, schema=arrow_schema, partition_spec=spec)
            )
        else:
            logger.warning("No PyArrow schema for table %s, skipping", name)
    return table_defs
def get_table_def(table_name: str) -> IcebergTableDef:
    """Build the IcebergTableDef for one table.

    Raises:
        ValueError: If the table has no partition spec or no PyArrow schema.
    """
    if table_name not in TABLE_PARTITIONS:
        raise ValueError(f"Unknown table: {table_name}")

    arrow_schema = TABLE_SCHEMAS.get(table_name)
    if arrow_schema is None:
        raise ValueError(f"No PyArrow schema registered for table: {table_name}")

    spec = TABLE_PARTITIONS[table_name]
    return IcebergTableDef(table_name=table_name, schema=arrow_schema, partition_spec=spec)
@dataclass
class IcebergManager:
"""Manages Iceberg tables via Trino's Iceberg catalog.
Provides table creation, existence checks, schema inspection,
and metadata operations against the Trino Iceberg connector.
"""
host: str = "localhost"
port: int = 8080
user: str = "stonks"
catalog: str = ICEBERG_CATALOG
schema: str = ICEBERG_SCHEMA
def _get_connection(self) -> Any:
    """Open a new Trino DBAPI connection from this manager's settings."""
    connection_settings = {
        "host": self.host,
        "port": self.port,
        "user": self.user,
        "catalog": self.catalog,
        "schema": self.schema,
    }
    return trino_connect(**connection_settings)
def _execute(self, sql: str) -> list[list[Any]]:
    """Run *sql* on a fresh connection and return every row.

    The connection is closed whether or not the statement succeeds.
    """
    connection = self._get_connection()
    try:
        cur = connection.cursor()
        cur.execute(sql)
        fetched = cur.fetchall()
    finally:
        connection.close()
    return fetched
def _execute_no_fetch(self, sql: str) -> None:
    """Execute a DDL statement whose rows are not needed.

    The connection is closed whether or not the statement succeeds. Errors
    raised by ``cursor.execute`` propagate; errors raised while draining the
    result are logged and otherwise ignored.
    """
    conn = self._get_connection()
    try:
        cursor = conn.cursor()
        cursor.execute(sql)
        # DDL statements in Trino still need fetchall to complete
        try:
            cursor.fetchall()
        except Exception:
            # Stay best-effort, but leave a trace: a failure here may mean the
            # statement did not run to completion.
            logger.warning(
                "Draining the result of a no-fetch statement failed: %s",
                sql,
                exc_info=True,
            )
    finally:
        conn.close()
def ensure_schema(self) -> None:
    """Issue CREATE SCHEMA IF NOT EXISTS for this manager's catalog and schema."""
    target = f"{self.catalog}.{self.schema}"
    logger.info("Ensuring Iceberg schema: %s.%s", self.catalog, self.schema)
    self._execute_no_fetch(f"CREATE SCHEMA IF NOT EXISTS {target}")
def table_exists(self, table_name: str) -> bool:
    """Return True when information_schema lists *table_name* in this schema."""
    lookup = (
        f"SELECT table_name FROM {self.catalog}.information_schema.tables "
        f"WHERE table_schema = '{self.schema}' AND table_name = '{table_name}'"
    )
    matches = self._execute(lookup)
    return bool(matches)
def create_table(self, table_name: str) -> bool:
    """Create a single Iceberg table if it doesn't exist.

    Args:
        table_name: Name of a registered table.

    Returns:
        True if the table was created, False if it already existed.

    Raises:
        ValueError: If ``get_table_def`` does not know the table.
    """
    # Resolve the definition first so an unknown table fails before any query.
    table_def = get_table_def(table_name)
    # CREATE TABLE IF NOT EXISTS does not report whether it did anything, so
    # check beforehand to give callers the documented return value.
    already_present = self.table_exists(table_name)
    ddl = table_def.create_table_sql()
    logger.info("Creating Iceberg table: %s", table_def.qualified_name)
    self._execute_no_fetch(ddl)
    logger.info("Iceberg table ready: %s", table_def.qualified_name)
    return not already_present
def create_all_tables(self) -> dict[str, bool]:
    """Ensure the schema exists, then create every registered Iceberg table.

    Returns a dict mapping table_name -> True (no error) or False (error).
    A failure on one table is logged and does not stop the remaining tables.
    """
    self.ensure_schema()
    outcome: dict[str, bool] = {}
    for table_def in get_all_table_defs():
        name = table_def.table_name
        try:
            self.create_table(name)
        except Exception:
            logger.exception("Failed to create Iceberg table: %s", name)
            outcome[name] = False
        else:
            outcome[name] = True
    return outcome
def get_table_schema(self, table_name: str) -> list[dict[str, str]]:
    """Read a table's columns from information_schema, in ordinal order.

    Returns a list of dicts with 'column_name', 'data_type', and 'is_nullable'.
    """
    query = (
        f"SELECT column_name, data_type, is_nullable "
        f"FROM {self.catalog}.information_schema.columns "
        f"WHERE table_schema = '{self.schema}' AND table_name = '{table_name}' "
        f"ORDER BY ordinal_position"
    )
    columns: list[dict[str, str]] = []
    for record in self._execute(query):
        columns.append({
            "column_name": record[0],
            "data_type": record[1],
            "is_nullable": record[2],
        })
    return columns
def get_table_snapshots(self, table_name: str) -> list[dict[str, Any]]:
    """List Iceberg snapshots for a table (useful for auditing and rollback).

    Reads Trino's ``$snapshots`` metadata table.

    Returns:
        One dict per snapshot with 'snapshot_id', 'parent_id', 'operation',
        'manifest_list' and 'summary'. An empty list if the query fails.
    """
    # Only the table part carries the ``$snapshots`` suffix and needs quoting.
    # Quoting the whole dotted name would make Trino treat it as one identifier.
    metadata_table = f'{self.catalog}.{self.schema}."{table_name}$snapshots"'
    # Name the columns explicitly so the mapping below does not depend on the
    # metadata table's column order (its first column is committed_at).
    sql = (
        "SELECT snapshot_id, parent_id, operation, manifest_list, summary "
        f"FROM {metadata_table}"
    )
    try:
        rows = self._execute(sql)
        return [{"snapshot_id": r[0], "parent_id": r[1], "operation": r[2],
                 "manifest_list": r[3], "summary": r[4]} for r in rows]
    except Exception:
        logger.debug("Could not read snapshots for %s (table may be empty)", table_name)
        return []
def get_table_partitions(self, table_name: str) -> list[dict[str, Any]]:
    """List partition metadata for an Iceberg table.

    Reads Trino's ``$partitions`` metadata table. This is best-effort: any
    failure is logged at debug level with its traceback, and an empty list
    is returned.

    Args:
        table_name: Unqualified table name inside ``self.schema``.

    Returns:
        One ``{"row": <result row>}`` dict per row returned by Trino. The
        rows are passed through without interpretation.
    """
    # Only the table part may be quoted. Quoting the whole dotted name would
    # make Trino treat it as one identifier rather than catalog.schema.table.
    metadata_table = f'{self.catalog}.{self.schema}."{table_name}$partitions"'
    sql = f"SELECT * FROM {metadata_table}"
    try:
        rows = self._execute(sql)
        return [{"row": r} for r in rows]
    except Exception:
        logger.debug(
            "Could not read partitions for %s (table may be empty)",
            table_name,
            exc_info=True,
        )
        return []
def list_tables(self) -> list[str]:
    """Return the names of all tables in ``self.schema``, sorted by name."""
    query = (
        f"SELECT table_name FROM {self.catalog}.information_schema.tables "
        f"WHERE table_schema = '{self.schema}' ORDER BY table_name"
    )
    names: list[str] = []
    for record in self._execute(query):
        names.append(record[0])
    return names
def drop_table(self, table_name: str) -> None:
    """Drop one Iceberg table if it exists (intended for tests and resets).

    The drop is logged at warning level before the statement is issued.
    """
    target = f"{self.catalog}.{self.schema}.{table_name}"
    statement = f"DROP TABLE IF EXISTS {target}"
    logger.warning("Dropping Iceberg table: %s", target)
    self._execute_no_fetch(statement)
def sync_table_schema(self, table_name: str) -> list[str]:
    """Add columns that the registered schema has but the live table lacks.

    Evolution is forward-only: columns are only ever added, never dropped
    or altered. Each missing column is added with its own ``ALTER TABLE``
    statement, in the order the registered schema declares its fields.

    Args:
        table_name: Registry name resolved through ``get_table_def``.

    Returns:
        Names of the columns that were added, in the order they were added.
    """
    definition = get_table_def(table_name)
    present = {column["column_name"] for column in self.get_table_schema(table_name)}
    newly_added: list[str] = []
    target = definition.qualified_name
    for index in range(len(definition.schema)):
        column = definition.schema.field(index).name
        if column in present:
            continue
        column_type = _arrow_type_to_trino(definition.schema.field(index).type)
        statement = f"ALTER TABLE {target} ADD COLUMN {column} {column_type}"
        logger.info("Adding column %s to %s", column, target)
        self._execute_no_fetch(statement)
        newly_added.append(column)
    return newly_added
def sync_all_schemas(self) -> dict[str, list[str]]:
    """Run ``sync_table_schema`` for every registered table that exists.

    Tables that do not exist yet are skipped. A failure on one table is
    logged with its traceback and does not stop the others.

    Returns:
        Mapping of table name to the columns added for it. The list is
        empty when the table was skipped, needed nothing, or failed.
    """
    summary: dict[str, list[str]] = {}
    for definition in get_all_table_defs():
        added_columns: list[str] = []
        try:
            if self.table_exists(definition.table_name):
                added_columns = self.sync_table_schema(definition.table_name)
            else:
                logger.info("Table %s doesn't exist yet, skipping sync", definition.table_name)
        except Exception:
            logger.exception("Failed to sync schema for %s", definition.table_name)
            added_columns = []
        summary[definition.table_name] = added_columns
    return summary
def create_iceberg_manager_from_config(
    host: str = "localhost",
    port: int = 8080,
    user: str = "stonks",
) -> IcebergManager:
    """Build an ``IcebergManager`` from explicit connection parameters.

    Args:
        host: Host name passed to ``IcebergManager``.
        port: Port passed to ``IcebergManager``.
        user: User name passed to ``IcebergManager``.

    Returns:
        A new ``IcebergManager`` constructed with exactly these three values.
    """
    connection = {"host": host, "port": port, "user": user}
    return IcebergManager(**connection)
+673
View File
@@ -0,0 +1,673 @@
"""Lake publisher async job runner — transforms operational data into analytical facts.
Reads jobs from the QUEUE_LAKE_PUBLISH Redis queue, queries PostgreSQL for
operational records, and publishes them as partitioned Parquet files to MinIO
via the existing publish_* functions in worker.py.
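Job message format:
{"job_type": "<job type>", "entity_id": "<uuid>", "dt": "2026-04-11T..."}
Every single-entity handler in this module casts "entity_id" to a UUID in its
query. The bulk job types ignore "entity_id" and read their cutoff from an
optional ISO-8601 "since" field; when "since" is absent the cutoff is the
current time.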
Supported job types:
- document: publish a single document metadata fact
- document_extraction: publish extraction facts for a document
- market_snapshot: publish market bars/quotes from a snapshot
- trade_order: publish an order fact
- trade_fill: publish fill facts for an order
- positions_snapshot: publish daily position snapshots for a broker account
- pnl_snapshot: publish daily PnL for a broker account
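- company_event: publish a company event fact (declared in JOB_TYPES, but
dispatch_job has no handler for it yet, so it is reported as an unknown job type)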
- bulk_documents: publish all unpublished documents since a cutoff
- bulk_extractions: publish all unpublished extractions since a cutoff
Requirements: 9.4, 9.5, 10.1
Design ref: Section 4.10 (Lake Publisher), Section 8.4 (Lake publication flow)
"""
from __future__ import annotations
import asyncio
import json
import logging
from datetime import datetime, timezone
import asyncpg
import redis.asyncio as aioredis
from minio import Minio
from services.lake_publisher.worker import (
publish_document_extraction,
publish_document_fact,
publish_market_bar,
publish_market_quote,
publish_trade_order,
publish_trade_fill,
publish_pnl_daily,
publish_documents_batch,
publish_document_extractions_batch,
publish_positions_daily_batch,
)
from services.lake_publisher.partitions import partition_values
from services.shared.config import load_config
from services.shared.db import get_minio, get_pg_pool, get_redis
from services.shared.logging import setup_logging
from services.shared.redis_keys import QUEUE_LAKE_PUBLISH, queue_key
logger = logging.getLogger(__name__)
# ---------------------------------------------------------------------------
# SQL queries for fetching operational data
# ---------------------------------------------------------------------------
# One document by id ($1, cast to uuid) plus a single mentioned ticker,
# or '' when the document has no company mentions.
# NOTE(review): the LIMIT 1 subquery has no ORDER BY, so which ticker is
# returned for a document that mentions several companies is unspecified.
_FETCH_DOCUMENT = """
SELECT
d.id, d.document_type, d.source_type, d.publisher, d.title,
d.url, d.canonical_url, d.language, d.published_at, d.retrieved_at,
d.content_hash, d.parse_quality_score,
COALESCE(
(SELECT dcm.ticker FROM document_company_mentions dcm
WHERE dcm.document_id = d.id LIMIT 1),
''
) AS ticker
FROM documents d
WHERE d.id = $1::uuid
"""
# One row per impact record of a document ($1, cast to uuid), joined to its
# intelligence record. Only intelligence rows with validation_status = 'valid'
# are returned; company_name is '' when no company row joins.
_FETCH_EXTRACTIONS = """
SELECT
di.document_id, dir.ticker, dir.relevance, dir.sentiment,
dir.impact_score, dir.impact_horizon, dir.catalyst_type,
di.confidence, di.novelty_score, di.source_credibility,
dir.key_facts, dir.risks, di.macro_themes,
di.model_name, di.prompt_version, di.schema_version,
di.created_at AS extraction_at,
COALESCE(c.legal_name, '') AS company_name
FROM document_intelligence di
JOIN document_impact_records dir ON dir.intelligence_id = di.id
LEFT JOIN companies c ON c.id = dir.company_id
WHERE di.document_id = $1::uuid
AND di.validation_status = 'valid'
"""
# One market snapshot row by id ($1, cast to uuid).
_FETCH_MARKET_SNAPSHOT = """
SELECT
ms.ticker, ms.snapshot_type, ms.data, ms.source_provider, ms.captured_at
FROM market_snapshots ms
WHERE ms.id = $1::uuid
"""
# One order by id ($1, cast to uuid). broker_account defaults to '' and
# execution_mode to 'paper' when no broker account row joins.
_FETCH_ORDER = """
SELECT
o.id, o.recommendation_id, o.ticker, o.side, o.order_type,
o.quantity, o.limit_price, o.status, o.submitted_at,
o.fill_price, o.fill_quantity, o.filled_at,
COALESCE(ba.account_id, '') AS broker_account,
COALESCE(ba.mode, 'paper') AS execution_mode
FROM orders o
LEFT JOIN broker_accounts ba ON ba.id = o.broker_account_id
WHERE o.id = $1::uuid
"""
# All order_events with event_type = 'fill' for one order ($1, cast to uuid),
# with the parent order's ticker and side.
_FETCH_ORDER_FILLS = """
SELECT
oe.id AS fill_id, oe.order_id, oe.data, oe.broker_timestamp,
o.ticker, o.side,
COALESCE(ba.account_id, '') AS broker_account
FROM order_events oe
JOIN orders o ON o.id = oe.order_id
LEFT JOIN broker_accounts ba ON ba.id = o.broker_account_id
WHERE oe.order_id = $1::uuid AND oe.event_type = 'fill'
"""
# All positions with a non-zero quantity for one broker account
# ($1, cast to uuid). Used by both the positions and the PnL jobs.
_FETCH_POSITIONS = """
SELECT
p.ticker, p.quantity, p.avg_entry_price, p.current_price,
p.unrealized_pnl, p.realized_pnl,
COALESCE(ba.account_id, '') AS broker_account,
COALESCE(ba.mode, 'paper') AS execution_mode
FROM positions p
LEFT JOIN broker_accounts ba ON ba.id = p.broker_account_id
WHERE p.broker_account_id = $1::uuid AND p.quantity != 0
"""
# Documents created at or after the cutoff ($1) whose status is 'parsed' or
# 'extracted', oldest first.
# NOTE(review): capped at 500 rows with no continuation token, so a single
# call only ever sees the oldest 500 matches. The unordered LIMIT 1 ticker
# subquery has the same caveat as in _FETCH_DOCUMENT.
_FETCH_BULK_DOCUMENTS = """
SELECT
d.id, d.document_type, d.source_type, d.publisher, d.title,
d.url, d.canonical_url, d.language, d.published_at, d.retrieved_at,
d.content_hash, d.parse_quality_score,
COALESCE(
(SELECT dcm.ticker FROM document_company_mentions dcm
WHERE dcm.document_id = d.id LIMIT 1),
''
) AS ticker
FROM documents d
WHERE d.created_at >= $1
AND d.status IN ('parsed', 'extracted')
ORDER BY d.created_at
LIMIT 500
"""
# Valid extractions whose intelligence record was created at or after the
# cutoff ($1), oldest first.
# NOTE(review): capped at 500 rows with no continuation token.
_FETCH_BULK_EXTRACTIONS = """
SELECT
di.document_id, dir.ticker, dir.relevance, dir.sentiment,
dir.impact_score, dir.impact_horizon, dir.catalyst_type,
di.confidence, di.novelty_score, di.source_credibility,
dir.key_facts, dir.risks, di.macro_themes,
di.model_name, di.prompt_version, di.schema_version,
di.created_at AS extraction_at,
COALESCE(c.legal_name, '') AS company_name
FROM document_intelligence di
JOIN document_impact_records dir ON dir.intelligence_id = di.id
LEFT JOIN companies c ON c.id = dir.company_id
WHERE di.created_at >= $1
AND di.validation_status = 'valid'
ORDER BY di.created_at
LIMIT 500
"""
# ---------------------------------------------------------------------------
# Job handlers — each transforms operational rows into lake facts
# ---------------------------------------------------------------------------
def _jsonb_to_str(val: object) -> str:
    """Flatten a JSONB column value into a display string.

    Lists, and strings that decode to a JSON list, become their items
    joined with ``", "``. Any other string is returned unchanged, ``None``
    becomes ``""``, and everything else is passed through ``str()``.
    """
    if val is None:
        return ""
    if isinstance(val, list):
        return ", ".join(map(str, val))
    if not isinstance(val, str):
        return str(val)

    try:
        decoded = json.loads(val)
    except (json.JSONDecodeError, TypeError):
        # Not JSON at all: treat it as already-flattened text.
        return val
    if isinstance(decoded, list):
        return ", ".join(map(str, decoded))
    # Valid JSON that is not a list keeps its original textual form.
    return val
async def publish_document_job(
    pool: asyncpg.Pool,
    minio_client: Minio,
    entity_id: str,
) -> str:
    """Publish the metadata fact for one document.

    Args:
        pool: PostgreSQL pool used to run ``_FETCH_DOCUMENT``.
        minio_client: Client handed to ``publish_document_fact``.
        entity_id: Document id; the query casts it to ``uuid``.

    Returns:
        Whatever ``publish_document_fact`` returns, or ``""`` when no
        document row matches ``entity_id``.
    """
    doc = await pool.fetchrow(_FETCH_DOCUMENT, entity_id)
    if doc is None:
        logger.warning("Document %s not found, skipping lake publish", entity_id)
        return ""

    retrieved_at = doc["retrieved_at"]
    fact_fields = {
        "document_id": str(doc["id"]),
        "document_type": doc["document_type"],
        "source_type": doc["source_type"],
        "ticker": doc["ticker"] or "",
        "publisher": doc["publisher"] or "",
        "title": doc["title"] or "",
        # Fall back to the retrieval time when no publish date is stored.
        "published_at": doc["published_at"] or retrieved_at,
        "content_hash": doc["content_hash"],
        "url": doc["url"] or "",
        "canonical_url": doc["canonical_url"] or "",
        "language": doc["language"] or "en",
        # The parse quality score is what gets published as "confidence".
        "confidence": float(doc["parse_quality_score"] or 0.0),
        "retrieved_at": retrieved_at,
    }
    return publish_document_fact(client=minio_client, **fact_fields)
async def publish_extraction_job(
    pool: asyncpg.Pool,
    minio_client: Minio,
    entity_id: str,
) -> list[str]:
    """Publish one extraction fact per valid impact record of a document.

    Args:
        pool: PostgreSQL pool used to run ``_FETCH_EXTRACTIONS``.
        minio_client: Client handed to ``publish_document_extraction``.
        entity_id: Document id; the query casts it to ``uuid``.

    Returns:
        The values returned by ``publish_document_extraction``, one per
        row, in query order. Empty when the document has no valid rows.
    """
    records = await pool.fetch(_FETCH_EXTRACTIONS, entity_id)
    if not records:
        logger.info("No valid extractions for document %s", entity_id)
        return []

    def _number(value: object) -> float:
        # NULL numeric columns are published as 0.0.
        return float(value or 0.0)

    published: list[str] = []
    for record in records:
        published.append(
            publish_document_extraction(
                client=minio_client,
                document_id=str(record["document_id"]),
                ticker=record["ticker"],
                sentiment=record["sentiment"] or "neutral",
                impact_score=_number(record["impact_score"]),
                catalyst_type=record["catalyst_type"] or "other",
                confidence=_number(record["confidence"]),
                extraction_at=record["extraction_at"],
                model_name=record["model_name"] or "",
                prompt_version=record["prompt_version"] or "",
                company_name=record["company_name"] or "",
                relevance=_number(record["relevance"]),
                impact_horizon=record["impact_horizon"] or "",
                novelty_score=_number(record["novelty_score"]),
                source_credibility=_number(record["source_credibility"]),
                key_facts=_jsonb_to_str(record["key_facts"]),
                risks=_jsonb_to_str(record["risks"]),
                macro_themes=_jsonb_to_str(record["macro_themes"]),
                schema_version=record["schema_version"] or "",
            )
        )
    return published
async def publish_market_snapshot_job(
    pool: asyncpg.Pool,
    minio_client: Minio,
    entity_id: str,
) -> list[str]:
    """Publish bar or quote facts from one ``market_snapshots`` row.

    ``bar``/``bars`` snapshots publish one bar per entry of the payload's
    ``"bars"`` list, or the payload itself as a single bar when that key
    is absent. ``quote``/``quotes`` snapshots publish one quote. Any other
    snapshot type publishes nothing. Each field is looked up under its
    long name first, then its short alias, and defaults to 0.

    Args:
        pool: PostgreSQL pool used to run ``_FETCH_MARKET_SNAPSHOT``.
        minio_client: Client handed to the market publish functions.
        entity_id: Snapshot id; the query casts it to ``uuid``.

    Returns:
        The values returned by the publish calls; empty when the snapshot
        is missing or its type is not handled.
    """
    snapshot = await pool.fetchrow(_FETCH_MARKET_SNAPSHOT, entity_id)
    if snapshot is None:
        logger.warning("Market snapshot %s not found", entity_id)
        return []

    symbol = snapshot["ticker"]
    payload = snapshot["data"]
    if not isinstance(payload, dict):
        payload = json.loads(payload)
    provider = snapshot["source_provider"] or ""
    captured_at = snapshot["captured_at"]
    kind = snapshot["snapshot_type"]

    def pick(mapping: dict, long_key: str, short_key: str) -> object:
        return mapping.get(long_key, mapping.get(short_key, 0))

    if kind in ("bar", "bars"):
        bars = payload["bars"] if "bars" in payload else [payload]
        return [
            publish_market_bar(
                client=minio_client,
                ticker=symbol,
                open_price=float(pick(bar, "open", "o")),
                high_price=float(pick(bar, "high", "h")),
                low_price=float(pick(bar, "low", "l")),
                close_price=float(pick(bar, "close", "c")),
                volume=int(pick(bar, "volume", "v")),
                # Every bar in the snapshot is stamped with the capture time.
                bar_timestamp=captured_at,
                source=provider,
                vwap=float(pick(bar, "vwap", "vw")),
                trade_count=int(pick(bar, "trade_count", "n")),
                bar_interval=bar.get("interval", "1d"),
            )
            for bar in bars
        ]

    if kind in ("quote", "quotes"):
        return [
            publish_market_quote(
                client=minio_client,
                ticker=symbol,
                bid_price=float(pick(payload, "bid_price", "bp")),
                ask_price=float(pick(payload, "ask_price", "ap")),
                last_price=float(pick(payload, "last_price", "lp")),
                quote_at=captured_at,
                source=provider,
                bid_size=int(pick(payload, "bid_size", "bs")),
                ask_size=int(pick(payload, "ask_size", "as")),
                last_size=int(pick(payload, "last_size", "ls")),
            )
        ]

    return []
async def publish_order_job(
    pool: asyncpg.Pool,
    minio_client: Minio,
    entity_id: str,
) -> str:
    """Publish the trade order fact for one order.

    Args:
        pool: PostgreSQL pool used to run ``_FETCH_ORDER``.
        minio_client: Client handed to ``publish_trade_order``.
        entity_id: Order id; the query casts it to ``uuid``.

    Returns:
        Whatever ``publish_trade_order`` returns, or ``""`` when the order
        does not exist.
    """
    order = await pool.fetchrow(_FETCH_ORDER, entity_id)
    if order is None:
        logger.warning("Order %s not found", entity_id)
        return ""

    # Orders without a stored submission time are stamped with "now" (UTC).
    submitted_at = order["submitted_at"] or datetime.now(timezone.utc)
    limit_price = order["limit_price"]
    recommendation_id = order["recommendation_id"]
    return publish_trade_order(
        client=minio_client,
        order_id=str(order["id"]),
        ticker=order["ticker"],
        side=order["side"],
        order_type=order["order_type"],
        quantity=float(order["quantity"]),
        # A falsy limit price (NULL or zero) is published as None.
        limit_price=float(limit_price) if limit_price else None,
        status=order["status"],
        broker_account=order["broker_account"],
        submitted_at=submitted_at,
        recommendation_id=str(recommendation_id) if recommendation_id else "",
        execution_mode=order["execution_mode"],
    )
async def publish_fills_job(
    pool: asyncpg.Pool,
    minio_client: Minio,
    entity_id: str,
) -> list[str]:
    """Publish one trade fill fact per ``fill`` event of an order.

    Price, quantity and commission come from the event's JSON payload;
    price and quantity fall back to the ``price`` / ``qty`` keys, and all
    three default to 0.

    Args:
        pool: PostgreSQL pool used to run ``_FETCH_ORDER_FILLS``.
        minio_client: Client handed to ``publish_trade_fill``.
        entity_id: Order id; the query casts it to ``uuid``.

    Returns:
        The values returned by ``publish_trade_fill``, one per fill event.
        Empty when the order has no fill events.
    """
    fill_events = await pool.fetch(_FETCH_ORDER_FILLS, entity_id)
    if not fill_events:
        logger.info("No fill events for order %s", entity_id)
        return []

    published: list[str] = []
    for event in fill_events:
        payload = event["data"]
        if not isinstance(payload, dict):
            payload = json.loads(payload or "{}")
        # Events without a broker timestamp are stamped with "now" (UTC).
        filled_at = event["broker_timestamp"] or datetime.now(timezone.utc)
        published.append(
            publish_trade_fill(
                client=minio_client,
                fill_id=str(event["fill_id"]),
                order_id=str(event["order_id"]),
                ticker=event["ticker"],
                side=event["side"],
                fill_price=float(payload.get("fill_price", payload.get("price", 0))),
                fill_quantity=float(payload.get("fill_quantity", payload.get("qty", 0))),
                broker_account=event["broker_account"],
                filled_at=filled_at,
                commission=float(payload.get("commission", 0)),
            )
        )
    return published
async def publish_positions_job(
    pool: asyncpg.Pool,
    minio_client: Minio,
    entity_id: str,
) -> str:
    """Publish the current non-zero positions of one broker account as a batch.

    The snapshot is stamped with the current UTC time, and the broker
    account label is taken from the first returned row.

    Args:
        pool: PostgreSQL pool used to run ``_FETCH_POSITIONS``.
        minio_client: Client handed to ``publish_positions_daily_batch``.
        entity_id: Broker account id; the query casts it to ``uuid``.

    Returns:
        Whatever ``publish_positions_daily_batch`` returns, or ``""`` when
        the account has no open positions.
    """
    open_positions = await pool.fetch(_FETCH_POSITIONS, entity_id)
    if not open_positions:
        logger.info("No open positions for account %s", entity_id)
        return ""

    taken_at = datetime.now(timezone.utc)
    batch = []
    for position in open_positions:
        batch.append(
            {
                "ticker": position["ticker"],
                "quantity": float(position["quantity"]),
                "avg_entry_price": float(position["avg_entry_price"] or 0),
                "close_price": float(position["current_price"] or 0),
                "unrealized_pnl": float(position["unrealized_pnl"] or 0),
            }
        )
    # The empty case returned above, so the first row always exists here.
    account_label = open_positions[0]["broker_account"]
    return publish_positions_daily_batch(
        client=minio_client,
        positions=batch,
        broker_account=account_label,
        snapshot_at=taken_at,
    )
async def publish_pnl_job(
    pool: asyncpg.Pool,
    minio_client: Minio,
    entity_id: str,
) -> list[str]:
    """Publish one daily PnL fact per non-zero position of a broker account.

    Total PnL is realized plus unrealized, with NULL values counted as 0.
    All facts from one call share the same UTC timestamp.

    Args:
        pool: PostgreSQL pool used to run ``_FETCH_POSITIONS``.
        minio_client: Client handed to ``publish_pnl_daily``.
        entity_id: Broker account id; the query casts it to ``uuid``.

    Returns:
        The values returned by ``publish_pnl_daily``, one per position.
        Empty when the account has no positions.
    """
    account_positions = await pool.fetch(_FETCH_POSITIONS, entity_id)
    if not account_positions:
        logger.info("No positions for PnL snapshot, account %s", entity_id)
        return []

    as_of = datetime.now(timezone.utc)
    published: list[str] = []
    for position in account_positions:
        realized_pnl = float(position["realized_pnl"] or 0)
        unrealized_pnl = float(position["unrealized_pnl"] or 0)
        published.append(
            publish_pnl_daily(
                client=minio_client,
                ticker=position["ticker"],
                realized_pnl=realized_pnl,
                unrealized_pnl=unrealized_pnl,
                total_pnl=realized_pnl + unrealized_pnl,
                broker_account=position["broker_account"],
                dt=as_of,
                execution_mode=position["execution_mode"],
            )
        )
    return published
async def publish_bulk_documents_job(
    pool: asyncpg.Pool,
    minio_client: Minio,
    since: datetime,
) -> list[str]:
    """Publish the documents created at or after ``since`` as one batch.

    Each row carries its own partition columns, derived from its publish
    time or, failing that, its retrieval time.

    Args:
        pool: PostgreSQL pool used to run ``_FETCH_BULK_DOCUMENTS``.
        minio_client: Client handed to ``publish_documents_batch``.
        since: Creation-time cutoff, also passed on to the batch publisher.

    Returns:
        A one-element list holding the batch publisher's return value, or
        an empty list when nothing matched or that value was falsy.
    """
    documents = await pool.fetch(_FETCH_BULK_DOCUMENTS, since)
    if not documents:
        logger.info("No documents to bulk-publish since %s", since)
        return []

    batch: list[dict[str, object]] = []
    for doc in documents:
        effective_published_at = doc["published_at"] or doc["retrieved_at"]
        fact: dict[str, object] = {
            "document_id": str(doc["id"]),
            "document_type": doc["document_type"],
            "source_type": doc["source_type"],
            "ticker": doc["ticker"] or "",
            "publisher": doc["publisher"] or "",
            "title": doc["title"] or "",
            "url": doc["url"] or "",
            "canonical_url": doc["canonical_url"] or "",
            "language": doc["language"] or "en",
            "published_at": effective_published_at,
            "retrieved_at": doc["retrieved_at"],
            "content_hash": doc["content_hash"],
            "confidence": float(doc["parse_quality_score"] or 0.0),
        }
        fact.update(partition_values(effective_published_at))
        batch.append(fact)

    batch_ref = publish_documents_batch(minio_client, batch, since)
    if not batch_ref:
        return []
    return [batch_ref]
async def publish_bulk_extractions_job(
    pool: asyncpg.Pool,
    minio_client: Minio,
    since: datetime,
) -> list[str]:
    """Publish the extractions created at or after ``since``, one batch per model version.

    Each row's ``model_version`` partition value is its schema version,
    falling back to its prompt version, then ``""``. Rows are grouped by
    that value and each group is published separately, so the version
    passed to the batch publisher always matches the version embedded in
    the rows. A single mixed batch would be filed under one version only.

    Args:
        pool: PostgreSQL pool used to run ``_FETCH_BULK_EXTRACTIONS``.
        minio_client: Client handed to ``publish_document_extractions_batch``.
        since: Creation-time cutoff, also passed on to the batch publisher.

    Returns:
        The truthy values returned by the batch publisher, one per model
        version, in order of each version's first appearance. Empty when
        nothing matched.
    """
    rows = await pool.fetch(_FETCH_BULK_EXTRACTIONS, since)
    if not rows:
        logger.info("No extractions to bulk-publish since %s", since)
        return []

    # A plain dict keeps insertion order, so batches follow query order.
    batches: dict[str, list[dict[str, object]]] = {}
    for row in rows:
        model_ver = row["schema_version"] or row["prompt_version"] or ""
        batches.setdefault(model_ver, []).append({
            "document_id": str(row["document_id"]),
            "ticker": row["ticker"],
            "company_name": row["company_name"] or "",
            "relevance": float(row["relevance"] or 0.0),
            "sentiment": row["sentiment"] or "neutral",
            "impact_score": float(row["impact_score"] or 0.0),
            "impact_horizon": row["impact_horizon"] or "",
            "catalyst_type": row["catalyst_type"] or "other",
            "confidence": float(row["confidence"] or 0.0),
            "novelty_score": float(row["novelty_score"] or 0.0),
            "source_credibility": float(row["source_credibility"] or 0.0),
            "key_facts": _jsonb_to_str(row["key_facts"]),
            "risks": _jsonb_to_str(row["risks"]),
            "macro_themes": _jsonb_to_str(row["macro_themes"]),
            "model_name": row["model_name"] or "",
            "prompt_version": row["prompt_version"] or "",
            "schema_version": row["schema_version"] or "",
            "extraction_at": row["extraction_at"],
            **partition_values(row["extraction_at"], {"model_version": model_ver}),
        })

    refs: list[str] = []
    for model_ver, extraction_rows in batches.items():
        ref = publish_document_extractions_batch(
            minio_client, extraction_rows, since,
            model_version=str(model_ver),
        )
        if ref:
            refs.append(ref)
    return refs
# ---------------------------------------------------------------------------
# Job dispatcher
# ---------------------------------------------------------------------------
# Job type names this module declares.
# NOTE(review): dispatch_job does not consult this set, and it has no branch
# for "company_event", so that type is currently reported as unknown.
JOB_TYPES = {
    "document",
    "document_extraction",
    "market_snapshot",
    "trade_order",
    "trade_fill",
    "positions_snapshot",
    "pnl_snapshot",
    "company_event",
    "bulk_documents",
    "bulk_extractions",
}
async def dispatch_job(
    pool: asyncpg.Pool,
    minio_client: Minio,
    job: dict[str, str],
) -> dict[str, object]:
    """Dispatch a lake publish job to the appropriate handler.

    This function never raises for a bad job. Handler failures, unknown job
    types and malformed payloads are all reported through the ``error``
    field of the result.

    Args:
        pool: PostgreSQL connection pool.
        minio_client: MinIO client for writing Parquet files.
        job: Job dict with at least ``job_type``. Single-entity jobs read
            ``entity_id``. The bulk jobs read an optional ISO-8601
            ``since`` cutoff and default to the current UTC time.

    Returns:
        A result dict with ``job_type``, ``entity_id``, ``refs`` (a list of
        handler return values) and ``error`` (``None`` on success).
    """
    # Queue payloads are arbitrary JSON. A list or scalar has no .get(), and
    # the AttributeError would otherwise escape and stop the worker loop.
    if not isinstance(job, dict):
        logger.error("Lake publish job payload is not a JSON object: %r", job)
        return {
            "job_type": "",
            "entity_id": "",
            "refs": [],
            "error": (
                "Invalid job payload: expected a JSON object, "
                f"got {type(job).__name__}"
            ),
        }

    job_type = job.get("job_type", "")
    entity_id = job.get("entity_id", "")
    since_str = job.get("since")
    result: dict[str, object] = {
        "job_type": job_type,
        "entity_id": entity_id,
        "refs": [],
        "error": None,
    }
    try:
        if job_type == "document":
            ref = await publish_document_job(pool, minio_client, entity_id)
            result["refs"] = [ref] if ref else []
        elif job_type == "document_extraction":
            result["refs"] = await publish_extraction_job(pool, minio_client, entity_id)
        elif job_type == "market_snapshot":
            result["refs"] = await publish_market_snapshot_job(pool, minio_client, entity_id)
        elif job_type == "trade_order":
            ref = await publish_order_job(pool, minio_client, entity_id)
            result["refs"] = [ref] if ref else []
        elif job_type == "trade_fill":
            result["refs"] = await publish_fills_job(pool, minio_client, entity_id)
        elif job_type == "positions_snapshot":
            ref = await publish_positions_job(pool, minio_client, entity_id)
            result["refs"] = [ref] if ref else []
        elif job_type == "pnl_snapshot":
            result["refs"] = await publish_pnl_job(pool, minio_client, entity_id)
        elif job_type in ("bulk_documents", "bulk_extractions"):
            # Without an explicit cutoff the window starts now, so normally
            # nothing has been created yet and the job publishes nothing.
            since = datetime.fromisoformat(since_str) if since_str else datetime.now(timezone.utc)
            if job_type == "bulk_documents":
                result["refs"] = await publish_bulk_documents_job(pool, minio_client, since)
            else:
                result["refs"] = await publish_bulk_extractions_job(pool, minio_client, since)
        else:
            result["error"] = f"Unknown job_type: {job_type}"
            logger.warning("Unknown lake publish job type: %s", job_type)
    except Exception as exc:
        # Deliberate catch-all: one failing job must not stop the worker.
        result["error"] = str(exc)
        logger.exception("Lake publish job failed: %s/%s", job_type, entity_id)
    return result
# ---------------------------------------------------------------------------
# Async worker loop
# ---------------------------------------------------------------------------
async def run_worker(
    pool: asyncpg.Pool,
    rds: aioredis.Redis,
    minio_client: Minio,
    poll_interval: float = 2.0,
) -> None:
    """Main worker loop: read jobs from Redis and dispatch them.

    Runs indefinitely until cancelled. Each job is processed sequentially
    to keep MinIO write ordering predictable. Payloads that are not valid
    JSON, or that are not a JSON object, are logged and dropped.

    Args:
        pool: PostgreSQL connection pool passed to ``dispatch_job``.
        rds: Redis client the job queue is popped from.
        minio_client: MinIO client passed to ``dispatch_job``.
        poll_interval: Seconds to sleep when the queue is empty.
    """
    queue = queue_key(QUEUE_LAKE_PUBLISH)
    logger.info("Lake publisher worker started, listening on %s", queue)
    while True:
        raw = await rds.lpop(queue)  # type: ignore[misc]
        if raw is None:
            await asyncio.sleep(poll_interval)
            continue

        try:
            # json.loads accepts str, bytes and bytearray directly. Wrapping
            # a bytes payload in str() would produce its "b'...'" repr,
            # which is never valid JSON.
            # NOTE(review): whether the client returns bytes depends on how
            # get_redis configures it; confirm.
            job = json.loads(raw)
        except (ValueError, TypeError):
            # ValueError covers JSONDecodeError and UnicodeDecodeError.
            logger.error("Invalid lake publish job payload: %s", raw)
            continue
        if not isinstance(job, dict):
            # Valid JSON, but not an object (for example a list or number).
            logger.error("Invalid lake publish job payload: %s", raw)
            continue

        result = await dispatch_job(pool, minio_client, job)
        refs = result.get("refs") or []
        error = result.get("error")
        if error:
            logger.error(
                "Lake publish job %s/%s failed: %s",
                result["job_type"], result["entity_id"], error,
            )
        else:
            ref_count = len(refs) if isinstance(refs, list) else 0
            logger.info(
                "Lake publish job %s/%s completed: %d facts written",
                result["job_type"], result["entity_id"], ref_count,
            )
async def main() -> None:
    """Entry point for the lake publisher worker process.

    Loads configuration, opens the PostgreSQL pool and the Redis and MinIO
    clients, then runs the worker loop. The pool is closed on every exit
    path once it has been created, and the Redis client is closed whenever
    it was created, even if closing the pool raises.
    """
    config = load_config()
    pool = await get_pg_pool(config)
    rds = None
    try:
        # Client construction sits inside the try so that a failure here
        # still releases the pool that is already open.
        rds = get_redis(config)
        minio_client = get_minio(config)
        await run_worker(pool, rds, minio_client)
    finally:
        try:
            await pool.close()
        finally:
            if rds is not None:
                await rds.close()
# Script entry point: configure logging from the loaded config, then run the
# async worker until it exits. The config is loaded here for logging setup
# and loaded again inside main().
if __name__ == "__main__":
    cfg = load_config()
    setup_logging("lake_publisher", level=cfg.log_level, json_output=cfg.json_logs)
    asyncio.run(main())
+128
View File
@@ -0,0 +1,128 @@
"""Hive-compatible partition layout conventions for the MinIO lakehouse.
Centralizes partition path generation, partition column injection, and
bucket provisioning so that all lake publisher writers produce layouts
that Trino's Hive and Iceberg connectors can discover and prune.
Design ref: Section 5.2, 5.3 (Lakehouse model)
Requirements: 9.4, 9.5, N4, N6
Layout convention:
s3://stonks-lakehouse/warehouse/{table_name}/dt={YYYY-MM-DD}[/{extra_key}={value}]/part-{uuid}.parquet
Rules:
- Every fact table is partitioned by ``dt`` (DATE) derived from the row timestamp.
- Some tables have a second partition key (e.g. ``model_version``).
- Partition columns MUST appear in the Parquet file so Trino can read them
without relying solely on path parsing.
- File names use a UUID suffix to avoid collisions on concurrent writes.
"""
from __future__ import annotations
import uuid
from dataclasses import dataclass, field
from datetime import date, datetime, timezone
LAKEHOUSE_BUCKET = "stonks-lakehouse"
WAREHOUSE_PREFIX = "warehouse"
@dataclass(frozen=True)
class PartitionSpec:
    """Immutable description of how one fact table is partitioned.

    ``table_name`` is the logical table name. ``extra_keys`` lists any
    partition keys that follow the mandatory leading ``dt`` key.
    """

    table_name: str
    extra_keys: tuple[str, ...] = field(default_factory=tuple)

    @property
    def all_keys(self) -> tuple[str, ...]:
        """Every partition key in directory order: ``dt``, then the extras."""
        ordered = ["dt"]
        ordered.extend(self.extra_keys)
        return tuple(ordered)
# Registry of every analytical fact table and its partition keys.
# This is the single source of truth — DDL, publisher, and tests should agree.
# partition_path() rejects any table name that is not a key of this dict.
# Every table is partitioned by dt; the three tables that declare extra_keys
# are additionally partitioned by model_version.
TABLE_PARTITIONS: dict[str, PartitionSpec] = {
    "market_bars": PartitionSpec("market_bars"),
    "market_quotes": PartitionSpec("market_quotes"),
    "company_events": PartitionSpec("company_events"),
    "documents": PartitionSpec("documents"),
    "document_extractions": PartitionSpec("document_extractions", extra_keys=("model_version",)),
    "trade_signals": PartitionSpec("trade_signals"),
    "trade_orders": PartitionSpec("trade_orders"),
    "trade_fills": PartitionSpec("trade_fills"),
    "positions_daily": PartitionSpec("positions_daily"),
    "pnl_daily": PartitionSpec("pnl_daily"),
    "prediction_vs_outcome": PartitionSpec("prediction_vs_outcome", extra_keys=("model_version",)),
    "model_performance": PartitionSpec("model_performance", extra_keys=("model_version",)),
}
def partition_path(
    table_name: str,
    dt: datetime | date,
    extra_partitions: dict[str, str] | None = None,
    file_id: str | None = None,
) -> str:
    """Build the Hive-style object key for one Parquet file.

    Args:
        table_name: Logical fact table name; must be a key of
            ``TABLE_PARTITIONS``.
        dt: Timestamp or date that supplies the ``dt=YYYY-MM-DD`` directory.
        extra_partitions: Values for the table's extra partition keys. A
            key the spec declares but this dict lacks is written as
            ``__NONE__``; keys the spec does not declare are ignored.
        file_id: File suffix to use instead of a random one. When omitted
            or empty, the first 16 hex characters of a UUID4 are used.

    Returns:
        Bucket-relative key such as
        ``warehouse/trade_signals/dt=2026-04-11/part-<id>.parquet``.

    Raises:
        ValueError: If ``table_name`` is not registered.
    """
    spec = TABLE_PARTITIONS.get(table_name)
    if spec is None:
        raise ValueError(f"Unknown table: {table_name}. Register it in TABLE_PARTITIONS.")

    # datetime is a subclass of date, so it has to be tested for first.
    day = dt.strftime("%Y-%m-%d") if isinstance(dt, datetime) else dt.isoformat()

    supplied = extra_partitions or {}
    # Extra directories follow the order declared by the spec.
    extra_dirs = [f"{key}={supplied.get(key, '__NONE__')}" for key in spec.extra_keys]

    file_name = f"part-{file_id or uuid.uuid4().hex[:16]}.parquet"
    return "/".join([WAREHOUSE_PREFIX, table_name, f"dt={day}", *extra_dirs, file_name])
def partition_values(
    dt: datetime | date,
    extra_partitions: dict[str, str] | None = None,
) -> dict[str, object]:
    """Return the partition column values to embed in each Parquet row.

    Embedding the values in the file, as well as in the directory path,
    lets readers that do not parse Hive-style paths still see them.

    Args:
        dt: Timestamp or date; a ``datetime`` is reduced to its ``date``.
        extra_partitions: Additional partition columns, copied in after
            ``dt``.

    Returns:
        A new dict such as
        ``{"dt": date(2026, 4, 11), "model_version": "v2"}``.
    """
    day = dt.date() if isinstance(dt, datetime) else dt
    columns: dict[str, object] = {"dt": day}
    columns.update(extra_partitions or {})
    return columns
def s3_uri(path: str) -> str:
    """Prefix a bucket-relative object key with ``s3://<lakehouse bucket>/``."""
    bucket = LAKEHOUSE_BUCKET
    return f"s3://{bucket}/{path}"
File diff suppressed because it is too large Load Diff
+858
View File
@@ -0,0 +1,858 @@
"""HTML-to-text parsing pipeline using BeautifulSoup.
Provides structured HTML parsing with boilerplate removal, metadata extraction,
outbound link extraction, and quality scoring. Inspired by Noctipede crawler
patterns: BeautifulSoup + content hashing, boilerplate stripping, quality scoring.
Requirements: 4.1, 4.2, 4.3
"""
from __future__ import annotations
import json
import logging
import math
import re
from dataclasses import dataclass, field
from urllib.parse import urlparse
from bs4 import BeautifulSoup, Tag
logger = logging.getLogger("html_parser")
# Tag names that are removed outright, together with everything inside them,
# before any text is extracted.
STRIP_TAGS = [
    "script", "style", "nav", "footer", "header", "aside",
    "iframe", "noscript", "svg", "form", "button",
]
# CSS class / id substrings that signal boilerplate containers.
# Matching is a case-insensitive substring test against the element's class
# and id attributes, so short entries also hit longer names: "comment"
# matches "commentary", and "menu" matches "submenu".
BOILERPLATE_SIGNALS = [
    "sidebar", "widget", "advert", "promo", "newsletter",
    "social-share", "share-bar", "related-posts", "comment",
    "cookie", "popup", "modal", "banner", "breadcrumb",
    "pagination", "nav-", "menu", "toolbar", "signup",
    "subscribe", "follow-us", "social-media", "share-button",
    "ad-slot", "ad-container", "sponsored",
]
# Regex patterns for residual boilerplate in extracted text. Each pattern is
# applied in order and every match is deleted.
#
# Patterns anchored with ^ or $ are meant to match whole lines anywhere in the
# document, so they carry the (?m) flag; without it they could only match at
# the very start or end of the text. Their optional whitespace is limited to
# spaces and tabs so that a match never swallows neighbouring blank lines.
BOILERPLATE_TEXT_PATTERNS = [
    re.compile(r"(?i)subscribe to our newsletter.*?(?:\n|$)"),
    re.compile(r"(?i)click here to read more.*?(?:\n|$)"),
    re.compile(r"(?i)advertisement\s*\n?"),
    re.compile(r"(?i)copyright ©.*?(?:\n|$)"),
    re.compile(r"(?i)all rights reserved.*?(?:\n|$)"),
    re.compile(r"(?i)terms of (use|service).*?(?:\n|$)"),
    re.compile(r"(?i)privacy policy.*?(?:\n|$)"),
    # Bracketed ad markers such as "[Ad]" or "[Sponsored ad]". The marker has
    # to contain "ad", "ads", "advert" or "advertisement" as a whole word, so
    # ordinary bracketed text like "[Trading update]" is left alone. Only the
    # whitespace before the marker is consumed, which keeps the words on
    # either side separated.
    re.compile(
        r"\s*\[[^\]\n]*\bad(?:s|vert(?:isement)?s?)?\b[^\]\n]*\]",
        re.IGNORECASE,
    ),
    re.compile(r"(?i)sign up for .*?(?:\n|$)"),
    re.compile(r"(?i)follow us on .*?(?:\n|$)"),
    re.compile(r"(?i)share this (article|story|post).*?(?:\n|$)"),
    re.compile(r"(?im)read more:?[ \t]*$"),
    re.compile(r"(?i)recommended for you.*?(?:\n|$)"),
    re.compile(r"(?i)you may also like.*?(?:\n|$)"),
    re.compile(r"(?i)trending now.*?(?:\n|$)"),
    re.compile(r"(?i)most (popular|read).*?(?:\n|$)"),
    re.compile(r"(?im)^tags:[ \t]*$"),
    re.compile(r"(?im)^[ \t]*photo\s*:.*?(?:\n|$)"),
    re.compile(r"(?im)^[ \t]*image\s*(credit|source|courtesy)\s*:.*?(?:\n|$)"),
]
# CSS selectors for article body candidates, in priority order. The list runs
# from semantic elements, through common CMS class and id names, to the
# generic <main> element.
ARTICLE_SELECTORS = [
    "article",
    "[role='main']",
    ".article-body",
    ".post-content",
    ".entry-content",
    ".story-body",
    ".article-content",
    "#article-body",
    "#story-body",
    ".article-text",
    ".post-body",
    ".content-body",
    "main",
]
# Minimum text density (text chars / total chars including markup) for a block
# to be considered content-rich rather than boilerplate
_MIN_TEXT_DENSITY = 0.25
# Minimum word count for a block to be a viable body candidate; _block_score
# returns 0.0 for any block below this.
_MIN_BLOCK_WORDS = 20
@dataclass
class QualitySignals:
    """Per-dimension inputs to the overall parse quality score.

    Every signal is a float intended to lie in [0, 1], where a higher value
    means the parsed content does better on that dimension. All signals
    default to 0.0.

    Requirements: 4.3
    """

    word_count_signal: float = 0.0
    diversity_signal: float = 0.0
    sentence_signal: float = 0.0
    paragraph_signal: float = 0.0
    body_found_signal: float = 0.0
    metadata_signal: float = 0.0

    def as_dict(self) -> dict[str, float]:
        """Return the signals as a dict keyed by name without the ``_signal`` suffix."""
        return dict(
            word_count=self.word_count_signal,
            diversity=self.diversity_signal,
            sentence=self.sentence_signal,
            paragraph=self.paragraph_signal,
            body_found=self.body_found_signal,
            metadata=self.metadata_signal,
        )
@dataclass
class CompanyMention:
    """A detected company mention in parsed text.

    Requirements: 1.3, 4.1
    """

    # Identifier of the matched company, held as a string.
    company_id: str
    # Ticker symbol of the matched company.
    ticker: str
    # How the mention matched: one of ticker, legal_name, alias, brand.
    mention_type: str  # ticker, legal_name, alias, brand
    # Match confidence; the scale is not enforced here.
    # NOTE(review): presumably [0, 1] — confirm with the code that builds these.
    confidence: float
    # Number of occurrences; defaults to a single match.
    match_count: int = 1
@dataclass
class ParsedDocument:
    """Result of HTML-to-text parsing pipeline.

    Every field has a default, so ``ParsedDocument()`` is an empty,
    low-confidence result. List fields and ``quality_signals`` use default
    factories, so instances never share mutable defaults.
    """

    # Extracted text and descriptive metadata.
    body_text: str = ""
    title: str = ""
    author: str = ""
    publisher: str = ""
    # Kept as a string (or None when unknown); no format is enforced here.
    published_at: str | None = None
    canonical_url: str | None = None
    language: str = "en"
    description: str = ""
    document_type: str = "article"
    # Collections gathered during parsing.
    outbound_links: list[str] = field(default_factory=list)
    tags: list[str] = field(default_factory=list)
    mentioned_companies: list[CompanyMention] = field(default_factory=list)
    # Quality assessment of the parse.
    quality_score: float = 0.0
    confidence: str = "low"
    word_count: int = 0
    quality_signals: QualitySignals = field(default_factory=QualitySignals)
    low_quality_flag: bool = False
    quality_warnings: list[str] = field(default_factory=list)
def _attr_str(tag: Tag, attr: str) -> str:
    """Return a tag attribute as one string.

    List-valued attributes are joined with spaces. A missing or otherwise
    falsy value becomes ``""``, and anything else is passed through
    ``str()``.
    """
    raw = tag.get(attr, "")
    if isinstance(raw, list):
        return " ".join(raw)
    if not raw:
        return ""
    return str(raw)
def _is_boilerplate_container(tag: Tag) -> bool:
    """Report whether a tag's class or id contains any boilerplate signal.

    The class and id attributes are lower-cased and joined with a space,
    then each entry of ``BOILERPLATE_SIGNALS`` is tested as a substring.
    """
    class_names = _attr_str(tag, "class").lower()
    element_id = _attr_str(tag, "id").lower()
    haystack = f"{class_names} {element_id}"
    for signal in BOILERPLATE_SIGNALS:
        if signal in haystack:
            return True
    return False
def _strip_boilerplate_tags(soup: BeautifulSoup) -> None:
    """Remove known non-content tags and boilerplate containers in-place.

    Two passes over *soup*:
    1. destroy every tag whose name is listed in STRIP_TAGS;
    2. destroy every remaining tag whose class/id matches a boilerplate signal.

    ``find_all`` returns a snapshot list, so once a container has been
    decomposed its descendants are still present in that list. A decomposed
    tag has had its state cleared and its behaviour is undefined (attribute
    lookups on it can raise), so both passes skip tags that are already
    ``decomposed`` instead of inspecting or destroying them a second time.
    """
    for tag_name in STRIP_TAGS:
        for tag in soup.find_all(tag_name):
            if tag.decomposed:
                # Nested inside a same-named tag that was already removed.
                continue
            tag.decompose()
    for tag in soup.find_all(True):
        if tag.decomposed:
            # Descendant of a boilerplate container removed earlier in this loop.
            continue
        if _is_boilerplate_container(tag):
            tag.decompose()
def _reduce_boilerplate_text(text: str) -> str:
    """Delete every BOILERPLATE_TEXT_PATTERNS match from *text*.

    Patterns are applied one after another, each on the output of the
    previous one; the final result is stripped of surrounding whitespace.
    """
    cleaned = text
    for boilerplate_re in BOILERPLATE_TEXT_PATTERNS:
        cleaned = boilerplate_re.sub("", cleaned)
    return cleaned.strip()
def _text_density(tag: Tag) -> float:
    """Return the share of a tag's serialized markup that is visible text.

    The value is ``len(stripped text) / len(str(tag))``; a tag that
    serializes to nothing scores 0.0. Content blocks tend to score higher
    than markup-heavy boilerplate.

    Requirements: 4.2
    """
    markup = str(tag)
    if not markup:
        return 0.0
    return len(tag.get_text(strip=True)) / len(markup)
def _link_density(tag: Tag) -> float:
    """Return the fraction of a tag's text that sits inside anchor tags.

    Navigation-like blocks (menus, sidebars) score high, prose paragraphs
    score low. A tag with no text at all is treated as pure links (1.0).

    Requirements: 4.2
    """
    visible = tag.get_text(strip=True)
    if not visible:
        return 1.0
    anchored = 0
    for anchor in tag.find_all("a"):
        anchored += len(anchor.get_text(strip=True))
    return anchored / len(visible)
def _block_score(tag: Tag) -> float:
    """Rate how likely *tag* is to be the article body (higher is better).

    Blocks with fewer than _MIN_BLOCK_WORDS words score 0.0. Otherwise the
    score is the sum of three terms:
    - text density scaled down by link density;
    - 0.1 per paragraph tag (capped at 10), only when there are at least two;
    - 0.05 * ln(word count), so long blocks win without running away.

    Requirements: 4.2
    """
    n_words = len(tag.get_text(strip=True).split())
    if n_words < _MIN_BLOCK_WORDS:
        return 0.0

    density = _text_density(tag)
    link_share = _link_density(tag)
    paragraphs = len(tag.find_all("p"))

    paragraph_bonus = 0.1 * min(paragraphs, 10) if paragraphs >= 2 else 0.0
    length_bonus = 0.05 * math.log(max(n_words, 1))
    return density * (1.0 - link_share) + paragraph_bonus + length_bonus
def _find_article_body(soup: BeautifulSoup) -> Tag | None:
    """Locate the element most likely to hold the article body.

    The ARTICLE_SELECTORS are tried in order and the first hit containing
    at least _MIN_BLOCK_WORDS words wins. When none qualifies, every div,
    section and td is scored with _block_score and the best positively
    scored one is returned (the earliest in document order on a tie).
    Returns None when nothing qualifies.

    Requirements: 4.2
    """
    # Pass 1: semantic selectors.
    for css in ARTICLE_SELECTORS:
        hit = soup.select_one(css)
        if hit and len(hit.get_text(strip=True).split()) >= _MIN_BLOCK_WORDS:
            return hit

    # Pass 2: density scoring over generic block containers.
    scored = [
        (_block_score(block), block)
        for block in soup.find_all(["div", "section", "td"])
    ]
    viable = [pair for pair in scored if pair[0] > 0]
    if not viable:
        return None
    return max(viable, key=lambda pair: pair[0])[1]
def _collapse_whitespace(text: str) -> str:
"""Collapse runs of blank lines into single separators."""
lines = [line.strip() for line in text.splitlines()]
result: list[str] = []
prev_blank = False
for line in lines:
if not line:
if not prev_blank:
result.append("")
prev_blank = True
else:
result.append(line)
prev_blank = False
return "\n".join(result).strip()
def _remove_short_orphan_lines(text: str, min_words: int = 3) -> str:
"""Remove very short orphan lines that are likely UI fragments or captions.
Lines shorter than min_words that don't end with sentence punctuation
are stripped. This catches leftover button labels, image captions,
and navigation fragments.
Requirements: 4.2
"""
lines = text.splitlines()
kept: list[str] = []
for line in lines:
stripped = line.strip()
words = stripped.split()
if len(words) < min_words and not stripped.endswith((".", "!", "?", ":")):
continue
kept.append(line)
return "\n".join(kept)
def _detect_repeated_blocks(text: str, min_len: int = 40) -> str:
"""Remove repeated text blocks that appear more than once.
Template text (disclaimers, repeated footers) often appears verbatim
in multiple places. This strips exact duplicate blocks.
Requirements: 4.2
"""
lines = text.splitlines()
seen: dict[str, int] = {}
for line in lines:
stripped = line.strip()
if len(stripped) >= min_len:
seen[stripped] = seen.get(stripped, 0) + 1
duplicates = {k for k, v in seen.items() if v > 1}
if not duplicates:
return text
kept: list[str] = []
emitted: set[str] = set()
for line in lines:
stripped = line.strip()
if stripped in duplicates:
if stripped not in emitted:
kept.append(line)
emitted.add(stripped)
# Skip subsequent duplicates
else:
kept.append(line)
return "\n".join(kept)
def extract_body_text(html: str) -> str:
    """Return the cleaned main text of an HTML page.

    Steps:
    1. Parse the HTML and strip non-content tags and boilerplate containers.
    2. Pick the article body (semantic selectors, then text-density scoring);
       fall back to the body element, or the whole document, if none is found.
    3. Extract its text, one string per line.
    4. Run the text cleaners in order: residual boilerplate patterns, short
       orphan lines, repeated template lines, whitespace collapsing.

    Requirements: 4.1, 4.2
    """
    soup = BeautifulSoup(html, "html.parser")
    _strip_boilerplate_tags(soup)

    container = _find_article_body(soup)
    if not container:
        container = soup.find("body") or soup
    cleaned = container.get_text(separator="\n", strip=True)

    # Multi-stage text cleaning; the order of the stages matters.
    for stage in (
        _reduce_boilerplate_text,
        _remove_short_orphan_lines,
        _detect_repeated_blocks,
        _collapse_whitespace,
    ):
        cleaned = stage(cleaned)
    return cleaned
def extract_metadata(html: str, url: str = "") -> dict[str, str | None]:
    """Collect document metadata from the HTML head.

    Returned keys, with the lookup order for each:
    - title: og:title, then the title element, else ""
    - author: meta name=author, else ""
    - publisher: og:site_name, else the hostname of *url*, else ""
    - published_at: article:published_time, else JSON-LD datePublished, else None
    - canonical_url: link rel=canonical, then og:url, else *url* (None if empty)
    - language: lang attribute of the html element cut to 5 chars, else "en"
    - description: og:description, then meta name=description, else ""
    - tags: meta name=keywords as the raw comma-separated string, else ""

    Requirements: 4.1
    """
    soup = BeautifulSoup(html, "html.parser")

    def has_content(node) -> bool:
        """True when *node* exists and carries a non-empty content attribute."""
        return bool(node and node.get("content"))

    def content_of(node, fallback: str | None) -> str | None:
        """Stripped content attribute, or *fallback* if it is not a plain string."""
        value = node["content"]
        return value.strip() if isinstance(value, str) else fallback

    # Title: og:title > title element
    og_title = soup.find("meta", property="og:title")
    if has_content(og_title):
        title = content_of(og_title, "")
    elif soup.title and soup.title.string:
        title = soup.title.string.strip()
    else:
        title = ""

    # Author
    author_tag = soup.find("meta", attrs={"name": "author"})
    author = content_of(author_tag, "") if has_content(author_tag) else ""

    # Publisher: og:site_name > hostname
    site_name = soup.find("meta", property="og:site_name")
    if has_content(site_name):
        publisher = content_of(site_name, "")
    elif url:
        publisher = urlparse(url).hostname or ""
    else:
        publisher = ""

    # Published date: article:published_time > JSON-LD datePublished
    pub_time = soup.find("meta", property="article:published_time")
    if has_content(pub_time):
        published_at = content_of(pub_time, None)
    else:
        published_at = _extract_jsonld_date(soup)

    # Canonical URL: link rel=canonical > og:url > request URL
    canonical = soup.find("link", rel="canonical")
    if canonical and canonical.get("href"):
        canonical_url = str(canonical["href"])
    else:
        og_url = soup.find("meta", property="og:url")
        if has_content(og_url):
            canonical_url = str(og_url["content"])
        else:
            canonical_url = url or None

    # Language
    html_tag = soup.find("html")
    lang = html_tag.get("lang") if html_tag else None
    language = str(lang)[:5] if lang else "en"

    # Description: og:description > meta description
    desc = soup.find("meta", property="og:description") or soup.find(
        "meta", attrs={"name": "description"}
    )
    description = content_of(desc, "") if has_content(desc) else ""

    # Tags / keywords (left as a comma-separated string)
    keywords = soup.find("meta", attrs={"name": "keywords"})
    tags = content_of(keywords, "") if has_content(keywords) else ""

    return {
        "title": title,
        "author": author,
        "publisher": publisher,
        "published_at": published_at,
        "canonical_url": canonical_url,
        "language": language,
        "description": description,
        "tags": tags,
    }
def _jsonld_node_date(node: object) -> str | None:
    """Return datePublished from one JSON-LD object, also looking inside @graph."""
    if not isinstance(node, dict):
        return None
    if "datePublished" in node:
        return str(node["datePublished"])
    graph = node.get("@graph")
    if isinstance(graph, dict):
        graph = [graph]
    if isinstance(graph, list):
        for member in graph:
            if isinstance(member, dict) and "datePublished" in member:
                return str(member["datePublished"])
    return None


def _extract_jsonld_date(soup: BeautifulSoup) -> str | None:
    """Try to extract datePublished from JSON-LD script tags.

    Each application/ld+json script that mentions "datePublished" is decoded.
    The date is looked up on the top-level object, on each object of a
    top-level array, and on the members of an "@graph" container (a common
    way to publish several linked objects in one script). Scripts that are
    not valid JSON are skipped.

    Returns:
        The first datePublished value found, converted with str(), or None.
    """
    for script in soup.find_all("script", type="application/ld+json"):
        payload = script.string
        # Cheap substring pre-check avoids decoding irrelevant scripts.
        if not payload or "datePublished" not in payload:
            continue
        try:
            decoded = json.loads(payload)
        except (json.JSONDecodeError, TypeError):
            # Malformed JSON-LD: best effort, move on to the next script.
            continue
        nodes = decoded if isinstance(decoded, list) else [decoded]
        for node in nodes:
            found = _jsonld_node_date(node)
            if found is not None:
                return found
    return None
def extract_outbound_links(html: str, base_url: str = "") -> list[str]:
    """List the distinct external links of a page, in first-seen order.

    Only absolute http/https hrefs whose hostname differs from the hostname
    of *base_url* are kept; fragment-only, javascript: and empty hrefs are
    ignored.

    Requirements: 4.1
    """
    soup = BeautifulSoup(html, "html.parser")
    page_host = (urlparse(base_url).hostname or "") if base_url else ""

    # A dict keeps insertion order, which gives order-preserving dedupe.
    external: dict[str, None] = {}
    for anchor in soup.find_all("a", href=True):
        target = str(anchor["href"]).strip()
        if not target or target.startswith(("#", "javascript:")):
            continue
        parts = urlparse(target)
        if parts.scheme not in ("http", "https") or not parts.hostname:
            continue
        if parts.hostname != page_host:
            external.setdefault(target)
    return list(external)
def _count_sentences(text: str) -> int:
"""Count approximate sentence count by terminal punctuation."""
return len(re.findall(r"[.!?]+(?:\s|$)", text))
def _count_paragraphs(text: str) -> int:
"""Count non-empty paragraph blocks separated by blank lines."""
blocks = re.split(r"\n\s*\n", text.strip())
return sum(1 for b in blocks if len(b.strip().split()) >= 5)
def score_parse_quality(
    text: str,
    *,
    body_found: bool = True,
    has_title: bool = False,
    has_author: bool = False,
    has_publisher: bool = False,
    has_published_at: bool = False,
) -> tuple[float, str, QualitySignals, list[str]]:
    """Rate a parse from content and metadata signals.

    Returns (score, confidence_label, signals, warnings).

    Six signals feed a weighted sum (weights in parentheses):
    - word_count_signal (0.30): tiered by extracted word count
    - diversity_signal (0.15): unique / total words, case-insensitive
    - sentence_signal (0.15): number of detected sentences
    - paragraph_signal (0.10): number of substantial paragraphs
    - body_found_signal (0.20): whether an article body element was located
    - metadata_signal (0.10): share of title/author/publisher/date present

    The sum is capped at 0.95 and rounded to two decimals. Scores below 0.35
    are "low", below 0.65 "medium", otherwise "high".

    Requirements: 4.3
    """
    notes: list[str] = []
    tokens = text.split()
    total = len(tokens)
    # Structure warnings are only raised when there is enough text to judge.
    enough_text = total >= 20

    # --- word count signal ---
    if total >= 300:
        length_sig = 1.0
    elif total >= 150:
        length_sig = 0.8
    elif total >= 50:
        length_sig = 0.6
    elif enough_text:
        length_sig = 0.3
        notes.append("short_text")
    else:
        length_sig = 0.1
        notes.append("very_short_text")

    # --- diversity signal ---
    ratio = len({token.lower() for token in tokens}) / total if total else 0.0
    if ratio >= 0.4:
        variety_sig = 1.0
    elif ratio >= 0.2:
        variety_sig = 0.5
    else:
        variety_sig = 0.2
        if enough_text:
            notes.append("low_vocabulary_diversity")

    # --- sentence signal ---
    n_sentences = _count_sentences(text)
    if n_sentences >= 3:
        sentence_sig = 1.0
    elif n_sentences > 0:
        sentence_sig = 0.5
    else:
        sentence_sig = 0.1
        if enough_text:
            notes.append("no_sentence_structure")

    # --- paragraph signal ---
    paragraph_sig = {0: 0.2, 1: 0.5}.get(_count_paragraphs(text), 1.0)

    # --- body found signal ---
    if body_found:
        body_sig = 1.0
    else:
        body_sig = 0.3
        notes.append("no_article_body_found")

    # --- metadata signal ---
    metadata_sig = sum([has_title, has_author, has_publisher, has_published_at]) / 4.0

    signals = QualitySignals(
        word_count_signal=length_sig,
        diversity_signal=variety_sig,
        sentence_signal=sentence_sig,
        paragraph_signal=paragraph_sig,
        body_found_signal=body_sig,
        metadata_signal=metadata_sig,
    )

    # Weighted composite score, capped and rounded.
    weighted = (
        0.30 * length_sig
        + 0.15 * variety_sig
        + 0.15 * sentence_sig
        + 0.10 * paragraph_sig
        + 0.20 * body_sig
        + 0.10 * metadata_sig
    )
    score = round(min(weighted, 0.95), 2)

    # Confidence label
    if score >= 0.65:
        label = "high"
    elif score >= 0.35:
        label = "medium"
    else:
        label = "low"
    return score, label, signals, notes
def score_quality(text: str) -> tuple[float, str]:
    """Return (score, confidence_label) for *text*; the label is low/medium/high.

    Backward-compatible shim: delegates to score_parse_quality with its
    default keyword arguments and discards the signals and warnings.

    Requirements: 4.3
    """
    result = score_parse_quality(text)
    return result[0], result[1]
def infer_document_type(html: str, url: str = "") -> str:
    """Classify a document from substrings of its lower-cased URL.

    Rules are checked in order (filing, transcript, press_release) and the
    first one with a matching marker wins; anything else is an "article".
    *html* is accepted but not inspected yet.

    Requirements: 4.1
    """
    # html reserved for future content-based inference
    _ = html

    rules = (
        ("filing", ("sec.gov", "edgar", "filing", "10-k", "10-q", "8-k")),
        ("transcript", ("transcript", "earnings-call", "earnings_call")),
        ("press_release", ("press-release", "press_release", "newsroom")),
    )
    lowered = url.lower()
    for doc_type, markers in rules:
        for marker in markers:
            if marker in lowered:
                return doc_type
    return "article"
def parse_html(html: str, url: str = "", aliases: list[dict[str, str]] | None = None) -> ParsedDocument:
    """Run the complete HTML-to-text pipeline and bundle the results.

    Performs body extraction and cleaning, metadata and outbound-link
    extraction, document type inference, quality scoring and, when
    *aliases* are supplied and text was extracted, company mention
    detection over the title plus body.

    Requirements: 1.3, 4.1, 4.2, 4.3
    """
    soup = BeautifulSoup(html, "html.parser")
    _strip_boilerplate_tags(soup)

    article = _find_article_body(soup)
    if article:
        extracted = article.get_text(separator="\n", strip=True)
    else:
        # No article container: fall back to the body element, then the document.
        fallback = soup.find("body") or soup
        extracted = fallback.get_text(separator="\n", strip=True)

    # Multi-stage text cleaning
    text = _collapse_whitespace(
        _detect_repeated_blocks(
            _remove_short_orphan_lines(_reduce_boilerplate_text(extracted))
        )
    )

    metadata = extract_metadata(html, url)
    outbound_links = extract_outbound_links(html, url)
    doc_type = infer_document_type(html, url)

    tags_raw = metadata.get("tags", "") or ""
    tags = [piece.strip() for piece in tags_raw.split(",") if piece.strip()] if tags_raw else []

    # Rich quality scoring with all available signals
    quality, confidence, signals, warnings = score_parse_quality(
        text,
        body_found=article is not None,
        has_title=bool(metadata.get("title")),
        has_author=bool(metadata.get("author")),
        has_publisher=bool(metadata.get("publisher")),
        has_published_at=bool(metadata.get("published_at")),
    )

    # Company mention detection over title + body
    mentioned: list[CompanyMention] = []
    if aliases and text:
        search_text = f"{metadata.get('title', '')} {text}"
        mentioned = [
            CompanyMention(
                company_id=str(hit["company_id"]),
                ticker=str(hit["ticker"]),
                mention_type=str(hit["mention_type"]),
                confidence=float(hit["confidence"]),
                match_count=int(hit["match_count"]),
            )
            for hit in detect_company_mentions(search_text, aliases)
        ]

    return ParsedDocument(
        body_text=text,
        title=metadata.get("title", "") or "",
        author=metadata.get("author", "") or "",
        publisher=metadata.get("publisher", "") or "",
        published_at=metadata.get("published_at"),
        canonical_url=metadata.get("canonical_url"),
        language=metadata.get("language", "en") or "en",
        description=metadata.get("description", "") or "",
        document_type=doc_type,
        outbound_links=outbound_links,
        tags=tags,
        mentioned_companies=mentioned,
        quality_score=quality,
        confidence=confidence,
        word_count=len(text.split()),
        quality_signals=signals,
        low_quality_flag=confidence == "low",
        quality_warnings=warnings,
    )
@dataclass
class AliasEntry:
    """A company alias used for mention detection.

    Typed counterpart of the raw alias dicts; instances are created by
    _build_alias_entries.
    """

    company_id: str  # company this alias belongs to
    alias: str  # text searched for in the document
    alias_type: str = "alias"  # looked up in _CONFIDENCE_BY_TYPE to pick the confidence
    ticker: str = ""  # ticker reported on the resulting mention
# Confidence assigned to a mention, keyed by alias type: tickers are the most
# precise, brands the least. detect_company_mentions falls back to 0.7 for
# alias types that are not listed here.
_CONFIDENCE_BY_TYPE: dict[str, float] = {
    "ticker": 0.9,
    "legal_name": 0.85,
    "alias": 0.7,
    "brand": 0.6,
}
def _build_alias_entries(aliases: list[dict[str, str]]) -> list[AliasEntry]:
    """Turn raw alias dicts into AliasEntry objects.

    Dicts with a missing or empty "alias" are skipped; the other keys fall
    back to "" (company_id, ticker) or "alias" (alias_type).
    """
    return [
        AliasEntry(
            company_id=raw.get("company_id", ""),
            alias=raw["alias"],
            alias_type=raw.get("alias_type", "alias"),
            ticker=raw.get("ticker", ""),
        )
        for raw in aliases
        if raw.get("alias", "")
    ]
def _count_matches(text: str, pattern: re.Pattern[str]) -> int:
"""Count non-overlapping matches of pattern in text."""
return len(pattern.findall(text))
def detect_company_mentions(
    text: str,
    aliases: list[dict[str, str]],
) -> list[dict[str, str | float | int]]:
    """Find which companies *text* mentions, one result dict per company.

    How an alias is matched depends on its length:
    - 1-2 chars: case-sensitive, whole word (so "A" does not match "a")
    - 3-4 chars: case-insensitive, whole word (typical tickers)
    - 5+ chars: case-insensitive substring (company names, brands)

    Confidence comes from the alias type (ticker > legal_name > alias >
    brand). When several aliases of one company match, the result keeps the
    ticker and type of the highest-confidence alias (the earlier one on a
    tie) and the match counts of all of them are added up.

    Requirements: 1.3, 4.1
    """
    if not text:
        return []

    upper_text = text.upper()
    # company_id -> (confidence, ticker, mention_type, total match count)
    winners: dict[str, tuple[float, str, str, int]] = {}

    for entry in _build_alias_entries(aliases):
        needle = entry.alias
        score = _CONFIDENCE_BY_TYPE.get(entry.alias_type, 0.7)

        if len(needle) > 4:
            # Longer names: case-insensitive substring
            hits = upper_text.count(needle.upper())
        elif len(needle) > 2:
            # Standard ticker length: case-insensitive word boundary
            word_re = re.compile(r"\b" + re.escape(needle.upper()) + r"\b")
            hits = _count_matches(upper_text, word_re)
        else:
            # Very short: case-sensitive word boundary
            word_re = re.compile(r"\b" + re.escape(needle) + r"\b")
            hits = _count_matches(text, word_re)

        if not hits:
            continue

        company = entry.company_id
        previous = winners.get(company)
        if previous is None:
            winners[company] = (score, entry.ticker, entry.alias_type, hits)
            continue

        # Accumulate the count; only a strictly better alias replaces the identity.
        running = previous[3] + hits
        if score > previous[0]:
            winners[company] = (score, entry.ticker, entry.alias_type, running)
        else:
            winners[company] = (previous[0], previous[1], previous[2], running)

    return [
        {
            "company_id": company,
            "ticker": ticker,
            "mention_type": mention_type,
            "confidence": confidence,
            "match_count": count,
        }
        for company, (confidence, ticker, mention_type, count) in winners.items()
    ]
+108 -107
View File
@@ -1,84 +1,41 @@
"""Parser worker - HTML-to-text, boilerplate reduction, quality scoring.""" """Parser worker - HTML-to-text, boilerplate reduction, quality scoring.
Uses BeautifulSoup-based parsing pipeline for structured HTML extraction,
metadata extraction, outbound link extraction, and quality scoring.
Persists normalized text and structured parser output to MinIO,
and updates document metadata in PostgreSQL.
Requirements: 4.1, 4.2, 4.3, 9.1, 9.2
"""
import asyncio import asyncio
import io
import json import json
import logging import logging
import re import time
from datetime import datetime from datetime import datetime, timezone
from typing import List, Optional, Tuple from typing import Any, Optional
import asyncpg import asyncpg
import httpx import httpx
import redis.asyncio as aioredis import redis.asyncio as aioredis
from minio import Minio from minio import Minio
from services.parser.html_parser import ParsedDocument, detect_company_mentions, parse_html
from services.shared.config import load_config from services.shared.config import load_config
from services.shared.db import get_minio, get_pg_pool, get_redis from services.shared.db import get_minio, get_pg_pool, get_redis
from services.shared.logging import Span, extract_trace_context, inject_trace_context, new_trace_id, set_trace_context, setup_logging
from services.shared.metrics import (
ACTIVE_JOBS,
PARSE_DURATION,
PARSE_JOBS_TOTAL,
PARSE_LOW_QUALITY_TOTAL,
PARSE_QUALITY_SCORE,
)
from services.shared.metadata import update_document_parse_results
from services.shared.redis_keys import QUEUE_EXTRACTION, QUEUE_PARSING, queue_key from services.shared.redis_keys import QUEUE_EXTRACTION, QUEUE_PARSING, queue_key
from services.shared.storage import upload_normalized_text, upload_parser_output
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("parser_worker") logger = logging.getLogger("parser_worker")
# Simple boilerplate patterns to strip
BOILERPLATE_PATTERNS = [
re.compile(r"(?i)subscribe to our newsletter.*?(?:\n|$)"),
re.compile(r"(?i)click here to read more.*?(?:\n|$)"),
re.compile(r"(?i)advertisement\s*\n"),
re.compile(r"(?i)copyright ©.*?(?:\n|$)"),
re.compile(r"(?i)all rights reserved.*?(?:\n|$)"),
re.compile(r"(?i)terms of (use|service).*?(?:\n|$)"),
re.compile(r"(?i)privacy policy.*?(?:\n|$)"),
re.compile(r"\s*\[.*?ad.*?\]\s*", re.IGNORECASE),
]
def strip_html_tags(html: str) -> str:
"""Basic HTML tag removal."""
text = re.sub(r"<script[^>]*>.*?</script>", "", html, flags=re.DOTALL | re.IGNORECASE)
text = re.sub(r"<style[^>]*>.*?</style>", "", text, flags=re.DOTALL | re.IGNORECASE)
text = re.sub(r"<[^>]+>", " ", text)
text = re.sub(r"&nbsp;", " ", text)
text = re.sub(r"&amp;", "&", text)
text = re.sub(r"&lt;", "<", text)
text = re.sub(r"&gt;", ">", text)
text = re.sub(r"&#\d+;", "", text)
text = re.sub(r"\s+", " ", text).strip()
return text
def reduce_boilerplate(text: str) -> str:
for pattern in BOILERPLATE_PATTERNS:
text = pattern.sub("", text)
return text.strip()
def score_quality(text: str) -> Tuple[float, str]:
"""Score parse quality. Returns (score, confidence_label)."""
word_count = len(text.split())
if word_count < 20:
return 0.1, "low"
if word_count < 50:
return 0.3, "low"
if word_count < 150:
return 0.6, "medium"
return 0.85, "high"
def detect_company_mentions(text: str, aliases: List[dict]) -> List[dict]:
"""Detect company mentions using ticker, alias, and name matching."""
mentions = []
text_upper = text.upper()
for alias_info in aliases:
alias = alias_info["alias"]
if alias.upper() in text_upper:
mentions.append({
"company_id": alias_info["company_id"],
"ticker": alias_info.get("ticker", ""),
"mention_type": alias_info.get("alias_type", "alias"),
"confidence": 0.7,
})
return mentions
async def fetch_html(url: str) -> Optional[str]: async def fetch_html(url: str) -> Optional[str]:
"""Fetch article HTML for scraping.""" """Fetch article HTML for scraping."""
@@ -94,48 +51,65 @@ async def fetch_html(url: str) -> Optional[str]:
return None return None
def build_parser_output_json(parsed: ParsedDocument, mentions: list[dict[str, Any]]) -> dict[str, Any]:
    """Flatten a ParsedDocument and its detected mentions into a JSON-ready dict.

    The dict records the parser output for audit and downstream use:
    metadata, quality score and signals, warnings, outbound links, tags and
    the supplied *mentions* (stored under "mentioned_companies" as given).
    """
    # Attributes copied verbatim, in output key order.
    plain_fields = (
        "title",
        "author",
        "publisher",
        "published_at",
        "canonical_url",
        "language",
        "description",
        "document_type",
        "word_count",
        "outbound_links",
        "tags",
        "quality_score",
        "confidence",
        "low_quality_flag",
        "quality_warnings",
    )
    payload: dict[str, Any] = {name: getattr(parsed, name) for name in plain_fields}
    payload["quality_signals"] = parsed.quality_signals.as_dict()
    payload["mentioned_companies"] = mentions
    return payload
async def process_job( async def process_job(
job: dict, job: dict[str, Any],
pool: asyncpg.Pool, pool: asyncpg.Pool,
rds: aioredis.Redis, rds: aioredis.Redis,
minio_client: Minio, minio_client: Minio,
): ) -> None:
doc_id = job["document_id"] doc_id = job["document_id"]
ticker = job["ticker"] ticker = job["ticker"]
url = job.get("url", "") url = job.get("url", "")
now = datetime.now(timezone.utc)
_parse_start = time.monotonic()
set_trace_context(trace_id=job.get("_trace_id") or new_trace_id())
# Fetch HTML if we have a URL # Fetch HTML if we have a URL
html = await fetch_html(url) if url else None html = await fetch_html(url) if url else None
if html: if html:
# Store raw HTML # Parse using BeautifulSoup pipeline
html_bytes = html.encode("utf-8") parsed = parse_html(html, url)
now = datetime.utcnow()
html_path = f"scrape/{ticker}/{now.year}/{now.month:02d}/{now.day:02d}/{doc_id}/raw.html"
minio_client.put_object(
"stonks-raw-news", html_path, io.BytesIO(html_bytes), len(html_bytes),
content_type="text/html",
)
# Parse
text = strip_html_tags(html)
text = reduce_boilerplate(text)
else: else:
text = "" parsed = ParsedDocument()
quality_score, confidence = score_quality(text) text = parsed.body_text
# Store normalized text # Upload normalized text to MinIO
norm_ref: str | None = None
if text: if text:
text_bytes = text.encode("utf-8") norm_ref = upload_normalized_text(
now = datetime.utcnow() minio_client, ticker, doc_id,
norm_path = f"parsed/{ticker}/{now.year}/{now.month:02d}/{now.day:02d}/{doc_id}/normalized.txt" text.encode("utf-8"), timestamp=now,
minio_client.put_object(
"stonks-normalized", norm_path, io.BytesIO(text_bytes), len(text_bytes),
content_type="text/plain",
) )
else:
norm_path = None
# Detect company mentions # Detect company mentions
aliases = await pool.fetch( aliases = await pool.fetch(
@@ -150,14 +124,24 @@ async def process_job(
) )
mentions = detect_company_mentions(text, [dict(a) for a in aliases]) if text else [] mentions = detect_company_mentions(text, [dict(a) for a in aliases]) if text else []
# Update document # Build and upload structured parser output JSON
status = "parsed" if confidence != "low" else "low_quality" output_json = build_parser_output_json(parsed, mentions)
await pool.execute( output_bytes = json.dumps(output_json, default=str, indent=2).encode("utf-8")
"""UPDATE documents SET parser_output_ref = upload_parser_output(
normalized_storage_ref=$2, parse_quality_score=$3, parse_confidence=$4, status=$5, updated_at=NOW() minio_client, ticker, doc_id,
WHERE id=$1""", output_bytes, timestamp=now,
doc_id, f"s3://stonks-normalized/{norm_path}" if norm_path else None, )
quality_score, confidence, status,
# Update document in PostgreSQL
status = "parsed" if parsed.confidence != "low" else "low_quality"
await update_document_parse_results(
pool,
document_id=doc_id,
normalized_storage_ref=norm_ref,
parser_output_ref=parser_output_ref,
parse_quality_score=parsed.quality_score,
parse_confidence=parsed.confidence,
status=status,
) )
# Insert company mentions # Insert company mentions
@@ -169,19 +153,36 @@ async def process_job(
) )
# Only enqueue for extraction if quality is acceptable # Only enqueue for extraction if quality is acceptable
if confidence != "low": if parsed.confidence != "low":
await rds.rpush(queue_key(QUEUE_EXTRACTION), json.dumps({ await rds.rpush(queue_key(QUEUE_EXTRACTION), json.dumps(inject_trace_context({
"document_id": doc_id, "document_id": doc_id,
"ticker": ticker, "ticker": ticker,
"normalized_text": text[:8000], # Truncate for prompt "normalized_text": text[:8000],
})) })))
logger.info(f"Parsed doc {doc_id} for {ticker}: quality={quality_score:.2f}, confidence={confidence}") PARSE_JOBS_TOTAL.labels(status="parsed").inc()
PARSE_QUALITY_SCORE.observe(parsed.quality_score)
PARSE_DURATION.observe(time.monotonic() - _parse_start)
logger.info(
"Parsed doc %s for %s: quality=%.2f, confidence=%s",
doc_id, ticker, parsed.quality_score, parsed.confidence,
extra={"ticker": ticker, "document_id": doc_id},
)
else: else:
logger.warning(f"Low quality parse for doc {doc_id}, skipping extraction") PARSE_JOBS_TOTAL.labels(status="low_quality").inc()
PARSE_LOW_QUALITY_TOTAL.inc()
PARSE_QUALITY_SCORE.observe(parsed.quality_score)
PARSE_DURATION.observe(time.monotonic() - _parse_start)
logger.warning(
"Low quality parse for doc %s, skipping extraction",
doc_id,
extra={"ticker": ticker, "document_id": doc_id},
)
async def main(): async def main() -> None:
config = load_config() config = load_config()
setup_logging("parser_worker", level=config.log_level, json_output=config.json_logs)
pool = await get_pg_pool(config) pool = await get_pg_pool(config)
rds = get_redis(config) rds = get_redis(config)
minio_client = get_minio(config) minio_client = get_minio(config)
@@ -197,7 +198,7 @@ async def main():
try: try:
await process_job(job, pool, rds, minio_client) await process_job(job, pool, rds, minio_client)
except Exception as e: except Exception as e:
logger.error(f"Parse error: {e}") logger.error("Parse error: %s", e, exc_info=True)
else: else:
await asyncio.sleep(2) await asyncio.sleep(2)
finally: finally:

Some files were not shown because too many files have changed in this diff Show More