Files
stonks-oracle/docs/intelligence-pipeline-deep-dive/diagrams/ingestion-to-extraction-flow.md
T
Celes Renata 88ad1e8d99 feat: comprehensive docs, unit tests, docker-compose app services
- Add scheduler and ingestion unit tests (test_scheduler_unit.py, test_ingestion_unit.py)
- Add all 13 app services + dashboard to docker-compose.yml
- Add full documentation suite: API reference, Helm reference, Docker deployment guide,
  3 architecture diagrams (K8s, Docker Compose, data pipeline), AI agent guide,
  backup/restore guide, observability/metrics reference, per-service docs
- Add intelligence pipeline deep-dive docs with Mermaid diagrams
- Update README with documentation index and links
- Add specs for comprehensive-quality-docs, intelligence-pipeline-deep-dive,
  sanitized-pipeline-docs
2026-04-22 02:56:41 +00:00

2.7 KiB

# Ingestion-to-Extraction Flow

flowchart TD
    %% Path of a job from the scheduler, through ingestion and parsing, to the extractor.
    %% Nodes named stonks:queue:* are queues; rpush marks the producer side, lpop the consumer side.
    %% NOTE(review): rpush/lpop suggest Redis lists, but the diagram does not state the queue backend - confirm.

    %% Stage 1 - Scheduler: schedule_cycle() runs a per-source cadence check, then a rate-limit check.
    subgraph Scheduler["Scheduler\nservices/scheduler/app.py"]
        S1["schedule_cycle()"]
        S2["Cadence check\nmarket_api: 300s\nnews_api: 300s\nfilings_api: 3600s\nmacro_news: 600s"]
        S3["Rate limit check\ncheck_rate_limit()"]
        S1 --> S2 --> S3
    end

    %% Jobs that pass the rate-limit check are pushed onto the ingestion queue.
    S3 -->|"rpush"| Q_ING["stonks:queue:ingestion"]

    Q_ING -->|"lpop"| ING

    %% Stage 2 - Ingestion worker: process_job() fans out to the five source adapters listed below.
    subgraph ING["Ingestion Worker\nservices/ingestion/worker.py"]
        direction TB
        AD["Adapter Dispatch\nprocess_job()"]
        AD --> PA["PolygonMarketAdapter\nservices/adapters/market_adapter.py"]
        AD --> PB["PolygonNewsAdapter\nservices/adapters/news_adapter.py"]
        AD --> PC["SECEdgarAdapter\nservices/adapters/filings_adapter.py"]
        AD --> PD["MacroNewsAdapter\nservices/adapters/macro_news_adapter.py"]
        AD --> PE["WebScrapeAdapter\nservices/adapters/web_scrape_adapter.py"]
    end

    %% Content-hash dedupe markers live under stonks:dedupe:* in Redis with a 24h TTL.
    ING -->|"Content hash check\nstonks:dedupe:*\nTTL 24h"| REDIS_DEDUPE[("Redis\nDedupe Markers")]

    %% Raw artifacts are uploaded to the MinIO raw buckets (market, news, filings).
    ING -->|"upload_raw_artifact()"| MINIO_RAW

    subgraph MINIO_RAW["MinIO Raw Storage"]
        B1["stonks-raw-market"]
        B2["stonks-raw-news"]
        B3["stonks-raw-filings"]
    end

    %% Ingestion results are persisted to the PostgreSQL tables in PG_ING.
    ING -->|"persist_ingestion_items()"| PG_ING

    subgraph PG_ING["PostgreSQL"]
        T1["documents"]
        T2["ingestion_runs"]
        T3["document_company_mentions"]
    end

    %% IDs of newly ingested documents are pushed onto the parsing queue.
    ING -->|"rpush new doc IDs"| Q_PARSE["stonks:queue:parsing"]

    Q_PARSE -->|"lpop"| PARSER

    %% Stage 3 - Parser worker: fetch and parse HTML, score quality, detect company mentions, then route.
    subgraph PARSER["Parser Worker\nservices/parser/worker.py"]
        P1["fetch_html() → parse_html()"]
        P2["Quality scoring\nconfidence: high / medium / low"]
        P3["Company mention detection\ndetect_company_mentions()"]
        P4["Routing decision"]
        P1 --> P2 --> P3 --> P4
    end

    %% Parser output goes to the stonks-normalized MinIO bucket; parse results update the PG_ING tables.
    PARSER -->|"upload_normalized_text()\nupload_parser_output()"| MINIO_NORM["MinIO\nstonks-normalized"]
    PARSER -->|"update_document_parse_results()"| PG_ING

    %% Routing: macro_event documents go to the macro classification queue, all others to the extraction queue.
    P4 -->|"doc_type = macro_event"| Q_MACRO["stonks:queue:macro_classification"]
    P4 -->|"doc_type ≠ macro_event"| Q_EXT["stonks:queue:extraction"]

    %% Both queues are consumed by the extractor worker.
    Q_EXT -->|"lpop"| EXT
    Q_MACRO -->|"lpop"| EXT

    %% Stage 4 - Extractor worker: document extractor, global event classifier, and persistence step.
    %% NOTE(review): no edges are drawn between E1, E2 and E3, so which queue feeds which component is not shown - confirm.
    subgraph EXT["Extractor Worker\nservices/extractor/main.py"]
        E1["Document Intelligence\nExtractor agent\nslug: document-extractor"]
        E2["Global Event Classifier\nslug: event-classifier\nservices/extractor/event_classifier.py"]
        E3["persist_extraction()\nservices/extractor/worker.py"]
    end

    %% Extraction output tables; PG_EXT is a separate subgraph from PG_ING although both are labelled PostgreSQL.
    EXT -->|"persist to"| PG_EXT

    subgraph PG_EXT["PostgreSQL"]
        T4["document_intelligence"]
        T5["document_impact_records"]
        T6["global_events"]
        T7["macro_impact_records"]
    end

    %% Hand-off to the aggregation queue; its consumer is outside this diagram.
    EXT -->|"rpush"| Q_AGG["stonks:queue:aggregation"]