feat: comprehensive docs, unit tests, docker-compose app services

- Add scheduler and ingestion unit tests (test_scheduler_unit.py, test_ingestion_unit.py)
- Add all 13 app services + dashboard to docker-compose.yml
- Add full documentation suite: API reference, Helm reference, Docker deployment guide, 3 architecture diagrams (K8s, Docker Compose, data pipeline), AI agent guide, backup/restore guide, observability/metrics reference, per-service docs
- Add intelligence pipeline deep-dive docs with Mermaid diagrams
- Update README with documentation index and links
- Add specs for comprehensive-quality-docs, intelligence-pipeline-deep-dive, sanitized-pipeline-docs
2026-04-22 02:56:41 +00:00
parent f251c53f92
commit 88ad1e8d99
57 changed files with 13318 additions and 51 deletions
@@ -0,0 +1,81 @@
+# Ingestion-to-Extraction Flow
+
+```mermaid
+flowchart TD
+    subgraph Scheduler["Scheduler\nservices/scheduler/app.py"]
+        S1["schedule_cycle()"]
+        S2["Cadence check\nmarket_api: 300s\nnews_api: 300s\nfilings_api: 3600s\nmacro_news: 600s"]
+        S3["Rate limit check\ncheck_rate_limit()"]
+        S1 --> S2 --> S3
+    end
+
+    S3 -->|"rpush"| Q_ING["app:queue:ingestion"]
+
+    Q_ING -->|"lpop"| ING
+
+    subgraph ING["Ingestion Worker\nservices/ingestion/worker.py"]
+        direction TB
+        AD["Adapter Dispatch\nprocess_job()"]
+        AD --> PA["ExternalDataAdapter\nservices/adapters/market_adapter.py"]
+        AD --> PB["ExternalNewsAdapter\nservices/adapters/news_adapter.py"]
+        AD --> PC["RegulatoryFilingsAdapter\nservices/adapters/filings_adapter.py"]
+        AD --> PD["MacroNewsAdapter\nservices/adapters/macro_news_adapter.py"]
+        AD --> PE["WebScrapeAdapter\nservices/adapters/web_scrape_adapter.py"]
+    end
+
+    ING -->|"Content hash check\napp:dedupe:*\nTTL 24h"| REDIS_DEDUPE[("Redis\nDedupe Markers")]
+
+    ING -->|"upload_raw_artifact()"| MINIO_RAW
+
+    subgraph MINIO_RAW["MinIO Raw Storage"]
+        B1["app-raw-data"]
+        B2["app-raw-content"]
+        B3["app-raw-filings"]
+    end
+
+    ING -->|"persist_ingestion_items()"| PG_ING
+
+    subgraph PG_ING["PostgreSQL"]
+        T1["documents"]
+        T2["ingestion_runs"]
+        T3["document_company_mentions"]
+    end
+
+    ING -->|"rpush new doc IDs"| Q_PARSE["app:queue:parsing"]
+
+    Q_PARSE -->|"lpop"| PARSER
+
+    subgraph PARSER["Parser Worker\nservices/parser/worker.py"]
+        P1["fetch_html() → parse_html()"]
+        P2["Quality scoring\nconfidence: high / medium / low"]
+        P3["Company mention detection\ndetect_company_mentions()"]
+        P4["Routing decision"]
+        P1 --> P2 --> P3 --> P4
+    end
+
+    PARSER -->|"upload_normalized_text()\nupload_parser_output()"| MINIO_NORM["MinIO\napp-normalized"]
+    PARSER -->|"update_document_parse_results()"| PG_ING
+
+    P4 -->|"doc_type = macro_event"| Q_MACRO["app:queue:macro_classification"]
+    P4 -->|"doc_type ≠ macro_event"| Q_EXT["app:queue:extraction"]
+
+    Q_EXT -->|"lpop"| EXT
+    Q_MACRO -->|"lpop"| EXT
+
+    subgraph EXT["Extractor Worker\nservices/extractor/main.py"]
+        E1["Document Intelligence\nExtractor agent\nslug: document-extractor"]
+        E2["Global Event Classifier\nslug: event-classifier\nservices/extractor/event_classifier.py"]
+        E3["persist_extraction()\nservices/extractor/worker.py"]
+    end
+
+    EXT -->|"persist to"| PG_EXT
+
+    subgraph PG_EXT["PostgreSQL"]
+        T4["document_intelligence"]
+        T5["document_impact_records"]
+        T6["global_events"]
+        T7["macro_impact_records"]
+    end
+
+    EXT -->|"rpush"| Q_AGG["app:queue:aggregation"]
+```