feat: comprehensive docs, unit tests, docker-compose app services

- Add scheduler and ingestion unit tests (test_scheduler_unit.py, test_ingestion_unit.py)
- Add all 13 app services + dashboard to docker-compose.yml
- Add full documentation suite: API reference, Helm reference, Docker deployment guide,
  3 architecture diagrams (K8s, Docker Compose, data pipeline), AI agent guide,
  backup/restore guide, observability/metrics reference, per-service docs
- Add intelligence pipeline deep-dive docs with Mermaid diagrams
- Update README with documentation index and links
- Add specs for comprehensive-quality-docs, intelligence-pipeline-deep-dive,
  sanitized-pipeline-docs
This commit is contained in:
Celes Renata
2026-04-22 02:56:41 +00:00
parent f251c53f92
commit 88ad1e8d99
57 changed files with 13318 additions and 51 deletions
@@ -0,0 +1,81 @@
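# Ingestion-to-Extraction Flow

The diagram below traces a job from the scheduler through the ingestion, parser, and extractor workers, showing the queues that connect the stages and the stores each stage writes to.
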
```mermaid
flowchart TD
%% Ingestion-to-extraction pipeline: scheduler, ingestion worker, parser worker, extractor worker.
%% Edge labels carry the queue operation or the function name shown for each hop.
%% Stage 1 - Scheduler: cycle entry point, then cadence check, then rate-limit check.
subgraph Scheduler["Scheduler\nservices/scheduler/app.py"]
S1["schedule_cycle()"]
%% NOTE(review): cadences are listed for four sources, while the ingestion worker below
%% dispatches to five adapters; no cadence is shown for the web-scrape adapter - confirm.
S2["Cadence check\nmarket_api: 300s\nnews_api: 300s\nfilings_api: 3600s\nmacro_news: 600s"]
S3["Rate limit check\ncheck_rate_limit()"]
S1 --> S2 --> S3
end
%% Hand-off to ingestion: the rate-limit step pushes onto the ingestion queue, the worker pops from it.
S3 -->|"rpush"| Q_ING["app:queue:ingestion"]
Q_ING -->|"lpop"| ING
%% Stage 2 - Ingestion worker: process_job() fans out to one of the adapters listed here.
subgraph ING["Ingestion Worker\nservices/ingestion/worker.py"]
direction TB
AD["Adapter Dispatch\nprocess_job()"]
AD --> PA["ExternalDataAdapter\nservices/adapters/market_adapter.py"]
AD --> PB["ExternalNewsAdapter\nservices/adapters/news_adapter.py"]
AD --> PC["RegulatoryFilingsAdapter\nservices/adapters/filings_adapter.py"]
AD --> PD["MacroNewsAdapter\nservices/adapters/macro_news_adapter.py"]
AD --> PE["WebScrapeAdapter\nservices/adapters/web_scrape_adapter.py"]
end
%% Ingestion outputs: dedupe markers, raw artifacts, database rows, and new document IDs for parsing.
ING -->|"Content hash check\napp:dedupe:*\nTTL 24h"| REDIS_DEDUPE[("Redis\nDedupe Markers")]
ING -->|"upload_raw_artifact()"| MINIO_RAW
%% Raw-storage buckets grouped under one MinIO node.
subgraph MINIO_RAW["MinIO Raw Storage"]
B1["app-raw-data"]
B2["app-raw-content"]
B3["app-raw-filings"]
end
ING -->|"persist_ingestion_items()"| PG_ING
%% Tables grouped as the ingestion-side PostgreSQL node; the parser also writes to this group.
subgraph PG_ING["PostgreSQL"]
T1["documents"]
T2["ingestion_runs"]
T3["document_company_mentions"]
end
ING -->|"rpush new doc IDs"| Q_PARSE["app:queue:parsing"]
Q_PARSE -->|"lpop"| PARSER
%% Stage 3 - Parser worker: fetch/parse, quality scoring, company-mention detection, then routing.
subgraph PARSER["Parser Worker\nservices/parser/worker.py"]
P1["fetch_html() → parse_html()"]
P2["Quality scoring\nconfidence: high / medium / low"]
P3["Company mention detection\ndetect_company_mentions()"]
P4["Routing decision"]
P1 --> P2 --> P3 --> P4
end
%% Parser outputs: normalized text and parser output to MinIO, parse results back to PostgreSQL.
PARSER -->|"upload_normalized_text()\nupload_parser_output()"| MINIO_NORM["MinIO\napp-normalized"]
PARSER -->|"update_document_parse_results()"| PG_ING
%% Routing: macro_event documents go to the macro-classification queue, all others to extraction.
P4 -->|"doc_type = macro_event"| Q_MACRO["app:queue:macro_classification"]
P4 -->|"doc_type ≠ macro_event"| Q_EXT["app:queue:extraction"]
%% Both queues are drawn as consumed by the same extractor worker node.
Q_EXT -->|"lpop"| EXT
Q_MACRO -->|"lpop"| EXT
%% Stage 4 - Extractor worker.
%% NOTE(review): E1-E3 have no edges between them, so the diagram does not show which queue
%% feeds which agent; presumably macro_classification feeds the event classifier - confirm.
%% NOTE(review): E3 cites services/extractor/worker.py while the subgraph title cites main.py - confirm.
subgraph EXT["Extractor Worker\nservices/extractor/main.py"]
E1["Document Intelligence\nExtractor agent\nslug: document-extractor"]
E2["Global Event Classifier\nslug: event-classifier\nservices/extractor/event_classifier.py"]
E3["persist_extraction()\nservices/extractor/worker.py"]
end
EXT -->|"persist to"| PG_EXT
%% Tables grouped as the extraction-side PostgreSQL node.
subgraph PG_EXT["PostgreSQL"]
T4["document_intelligence"]
T5["document_impact_records"]
T6["global_events"]
T7["macro_impact_records"]
end
%% Final hand-off: the extractor pushes onto the aggregation queue; its consumer is not shown in this diagram.
EXT -->|"rpush"| Q_AGG["app:queue:aggregation"]
```