feat: comprehensive docs, unit tests, docker-compose app services
- Add scheduler and ingestion unit tests (test_scheduler_unit.py, test_ingestion_unit.py)
- Add all 13 app services + dashboard to docker-compose.yml
- Add full documentation suite: API reference, Helm reference, Docker deployment guide, 3 architecture diagrams (K8s, Docker Compose, data pipeline), AI agent guide, backup/restore guide, observability/metrics reference, per-service docs
- Add intelligence pipeline deep-dive docs with Mermaid diagrams
- Update README with documentation index and links
- Add specs for comprehensive-quality-docs, intelligence-pipeline-deep-dive, sanitized-pipeline-docs
This commit is contained in:
@@ -0,0 +1,81 @@
# Ingestion-to-Extraction Flow

```mermaid
flowchart TD
    subgraph Scheduler["Scheduler\nservices/scheduler/app.py"]
        S1["schedule_cycle()"]
        S2["Cadence check\nmarket_api: 300s\nnews_api: 300s\nfilings_api: 3600s\nmacro_news: 600s"]
        S3["Rate limit check\ncheck_rate_limit()"]
        S1 --> S2 --> S3
    end

    S3 -->|"rpush"| Q_ING["stonks:queue:ingestion"]

    Q_ING -->|"lpop"| ING

    subgraph ING["Ingestion Worker\nservices/ingestion/worker.py"]
        direction TB
        AD["Adapter Dispatch\nprocess_job()"]
        AD --> PA["PolygonMarketAdapter\nservices/adapters/market_adapter.py"]
        AD --> PB["PolygonNewsAdapter\nservices/adapters/news_adapter.py"]
        AD --> PC["SECEdgarAdapter\nservices/adapters/filings_adapter.py"]
        AD --> PD["MacroNewsAdapter\nservices/adapters/macro_news_adapter.py"]
        AD --> PE["WebScrapeAdapter\nservices/adapters/web_scrape_adapter.py"]
    end

    ING -->|"Content hash check\nstonks:dedupe:*\nTTL 24h"| REDIS_DEDUPE[("Redis\nDedupe Markers")]

    ING -->|"upload_raw_artifact()"| MINIO_RAW

    subgraph MINIO_RAW["MinIO Raw Storage"]
        B1["stonks-raw-market"]
        B2["stonks-raw-news"]
        B3["stonks-raw-filings"]
    end

    ING -->|"persist_ingestion_items()"| PG_ING

    subgraph PG_ING["PostgreSQL"]
        T1["documents"]
        T2["ingestion_runs"]
        T3["document_company_mentions"]
    end

    ING -->|"rpush new doc IDs"| Q_PARSE["stonks:queue:parsing"]

    Q_PARSE -->|"lpop"| PARSER

    subgraph PARSER["Parser Worker\nservices/parser/worker.py"]
        P1["fetch_html() → parse_html()"]
        P2["Quality scoring\nconfidence: high / medium / low"]
        P3["Company mention detection\ndetect_company_mentions()"]
        P4["Routing decision"]
        P1 --> P2 --> P3 --> P4
    end

    PARSER -->|"upload_normalized_text()\nupload_parser_output()"| MINIO_NORM["MinIO\nstonks-normalized"]
    PARSER -->|"update_document_parse_results()"| PG_ING

    P4 -->|"doc_type = macro_event"| Q_MACRO["stonks:queue:macro_classification"]
    P4 -->|"doc_type ≠ macro_event"| Q_EXT["stonks:queue:extraction"]

    Q_EXT -->|"lpop"| EXT
    Q_MACRO -->|"lpop"| EXT

    subgraph EXT["Extractor Worker\nservices/extractor/main.py"]
        E1["Document Intelligence\nExtractor agent\nslug: document-extractor"]
        E2["Global Event Classifier\nslug: event-classifier\nservices/extractor/event_classifier.py"]
        E3["persist_extraction()\nservices/extractor/worker.py"]
    end

    EXT -->|"persist to"| PG_EXT

    subgraph PG_EXT["PostgreSQL"]
        T4["document_intelligence"]
        T5["document_impact_records"]
        T6["global_events"]
        T7["macro_impact_records"]
    end

    EXT -->|"rpush"| Q_AGG["stonks:queue:aggregation"]
```
Reference in New Issue
Block a user