# stonks-oracle/services/extractor/worker.py

"""Extraction worker - sends documents to Ollama for structured intelligence extraction.

Orchestrates the full extraction pipeline for a single document:
1. Calls OllamaClient to get structured extraction
2. Uploads prompts, raw outputs, and validation reports to MinIO
3. Persists the final intelligence object and per-company impact records to PostgreSQL
4. Updates document status

Requirements: 5.1, 5.2, 5.3, 5.4, 5.5, 9.1, 9.2
"""
from __future__ import annotations

import json
import logging
from dataclasses import dataclass
from datetime import datetime, timezone

import asyncpg
from minio import Minio

from services.extractor.client import ExtractionResponse
from services.extractor.metrics import collect_metrics, persist_metrics
from services.shared.metadata import (
    persist_document_impact,
    persist_document_intelligence,
    update_document_status,
)
from services.shared.metrics import (
    EXTRACTION_ATTEMPTS,
    EXTRACTION_CONFIDENCE,
    EXTRACTION_DURATION,
    EXTRACTION_JOBS_TOTAL,
    EXTRACTION_RETRIES,
    EXTRACTION_TOKEN_ESTIMATE,
    EXTRACTION_VALIDATION_ERRORS,
)
from services.shared.storage import (
    upload_extraction_intelligence,
    upload_extraction_prompt,
    upload_extraction_raw_output,
    upload_extraction_validation,
)

logger = logging.getLogger("extractor_worker")


@dataclass
class ExtractionPersistResult:
    """Outcome of one ``persist_extraction`` call.

    Every field starts out empty (``None`` / ``False``) and is filled in as the
    corresponding artifact is stored, so a partially populated instance shows
    how far persistence progressed.
    """

    # Identifier returned by persist_document_intelligence.
    intelligence_id: str | None = None

    # References returned by the MinIO upload helpers.
    prompt_ref: str | None = None
    raw_output_ref: str | None = None
    validation_ref: str | None = None
    intelligence_ref: str | None = None

    # Identifiers of the per-company impact records; stays None unless the
    # successful-extraction path initialises the list.
    impact_ids: list[str] | None = None

    # Identifier returned by persist_metrics; stays None if that step fails.
    metrics_id: str | None = None

    # Set to True only after a successful extraction was persisted.
    success: bool = False


def _serialize_attempts(extraction_response: ExtractionResponse) -> list[dict[str, object]]:
    """Turn every extraction attempt into a JSON-serializable record."""
    records: list[dict[str, object]] = []
    for idx, attempt in enumerate(extraction_response.attempts):
        record: dict[str, object] = {
            "attempt_index": idx,
            "raw_output": attempt.raw_output,
            "error": attempt.error,
            "duration_ms": attempt.duration_ms,
            "model": attempt.model,
            "retryable": attempt.retryable,
        }
        # The "validation" key is only present when the attempt was validated.
        if attempt.validation:
            record["validation"] = {
                "valid": attempt.validation.valid,
                "errors": attempt.validation.errors,
                "warnings": attempt.validation.warnings,
            }
        records.append(record)
    return records


def _final_validation_summary(
    extraction_response: ExtractionResponse,
) -> dict[str, object] | None:
    """Summarise the validation of the last attempt, or None if there were no attempts."""
    attempts = extraction_response.attempts
    final_attempt = attempts[-1] if attempts else None
    if not final_attempt:
        return None
    validation = final_attempt.validation
    if not validation:
        # The last attempt never reached validation: report it as invalid.
        return {"valid": False, "errors": [], "warnings": []}
    return {
        "valid": validation.valid,
        "errors": validation.errors,
        "warnings": validation.warnings,
    }


def _record_prometheus_metrics(
    extraction_response: ExtractionResponse,
    *,
    succeeded: bool,
    retry_count: int,
    document_text_length: int,
) -> None:
    """Update the Prometheus collectors for one finished extraction job."""
    attempts = extraction_response.attempts
    EXTRACTION_ATTEMPTS.inc(len(attempts))
    EXTRACTION_DURATION.observe(extraction_response.total_duration_ms / 1000.0)
    if retry_count > 0:
        EXTRACTION_RETRIES.inc(retry_count)

    # Label the job with the outcome that was actually persisted, so the
    # counter cannot disagree with the database record / document status
    # (e.g. ``success`` flagged by the client but no result object returned).
    if succeeded:
        EXTRACTION_JOBS_TOTAL.labels(status="success").inc()
        EXTRACTION_CONFIDENCE.observe(extraction_response.result.confidence)
    else:
        EXTRACTION_JOBS_TOTAL.labels(status="failed").inc()

    # Count validation errors from the final attempt only.
    final = attempts[-1] if attempts else None
    if final and final.validation and final.validation.errors:
        EXTRACTION_VALIDATION_ERRORS.inc(len(final.validation.errors))

    # Token estimates: character count divided by 4
    # (presumably a ~4 chars/token heuristic - confirm against the tokenizer).
    if document_text_length > 0:
        EXTRACTION_TOKEN_ESTIMATE.labels(direction="input").inc(document_text_length // 4)
    if final and final.raw_output:
        EXTRACTION_TOKEN_ESTIMATE.labels(direction="output").inc(len(final.raw_output) // 4)


async def persist_extraction(
    *,
    pool: asyncpg.Pool,
    minio_client: Minio,
    document_id: str,
    ticker: str,
    extraction_response: ExtractionResponse,
    company_id_map: dict[str, str] | None = None,
    source_credibility: float = 0.5,
    timestamp: datetime | None = None,
    document_text_length: int = 0,
) -> ExtractionPersistResult:
    """Persist all extraction artifacts to MinIO and PostgreSQL.

    Uploads the prompt metadata, the raw output of every attempt and a
    validation report to MinIO. For a successful extraction (``success`` set
    and a result present) it additionally uploads the final intelligence
    object, stores the intelligence record plus one impact record per company
    with a known company id, and marks the document ``extracted``. Otherwise a
    placeholder intelligence record with ``validation_status="failed"`` is
    stored and the document is marked ``extraction_failed``. Finally, model
    performance metrics are persisted (best effort) and the Prometheus
    collectors are updated.

    The stored ``retry_count`` is ``max(0, attempts - 1)`` on both paths, the
    same value that is reported to Prometheus.

    Args:
        pool: PostgreSQL connection pool.
        minio_client: MinIO client.
        document_id: UUID of the source document.
        ticker: Primary ticker for path construction.
        extraction_response: Full response from OllamaClient.extract().
        company_id_map: Optional mapping of ticker -> company UUID for impact
            records. Companies whose ticker is missing are skipped with a warning.
        source_credibility: Credibility score to attach to the intelligence record.
        timestamp: Override timestamp for MinIO paths (defaults to UTC now).
            Also used as ``recorded_at`` of the persisted metrics.
        document_text_length: Length of the input document text for token estimation.

    Returns:
        ExtractionPersistResult with references to all persisted artifacts.
        ``success`` is True only when a successful extraction was persisted.

    Raises:
        Exceptions raised by the MinIO upload helpers or the PostgreSQL
        persistence helpers are not caught here and propagate to the caller.
        Only a failure while collecting/persisting the performance metrics is
        logged and swallowed.
    """
    ts = timestamp or datetime.now(timezone.utc)
    result = ExtractionPersistResult()
    company_id_map = company_id_map or {}

    attempts = extraction_response.attempts
    attempt_count = len(attempts)
    # Retries are the attempts after the first one; clamp so that an empty
    # attempt list can never produce a negative count.
    retry_count = max(0, attempt_count - 1)
    prompt_metadata = extraction_response.prompt_metadata

    # 1. Upload prompt metadata to MinIO
    prompt_payload = json.dumps({
        "prompt_metadata": prompt_metadata,
        "model": extraction_response.model,
    }, indent=2).encode()
    result.prompt_ref = upload_extraction_prompt(
        minio_client, ticker, document_id, prompt_payload, timestamp=ts,
    )

    # 2. Upload raw outputs for each attempt
    raw_output_payload = json.dumps({
        "document_id": document_id,
        "attempts": _serialize_attempts(extraction_response),
        "total_duration_ms": extraction_response.total_duration_ms,
        "success": extraction_response.success,
    }, indent=2).encode()
    result.raw_output_ref = upload_extraction_raw_output(
        minio_client, ticker, document_id, raw_output_payload, timestamp=ts,
    )

    # 3. Upload validation report
    validation_payload = json.dumps({
        "document_id": document_id,
        "success": extraction_response.success,
        "attempt_count": attempt_count,
        "final_validation": _final_validation_summary(extraction_response),
    }, indent=2).encode()
    result.validation_ref = upload_extraction_validation(
        minio_client, ticker, document_id, validation_payload, timestamp=ts,
    )

    # Fields of the intelligence record that are identical on both paths.
    shared_record_fields = {
        "document_id": document_id,
        "source_credibility": source_credibility,
        "model_provider": "ollama",
        "model_name": extraction_response.model,
        "prompt_version": prompt_metadata.get("prompt_version", ""),
        "schema_version": prompt_metadata.get("schema_version", ""),
        "raw_output_ref": result.raw_output_ref,
        "prompt_ref": result.prompt_ref,
        "retry_count": retry_count,
    }

    # 4. Determine validation status and persist intelligence
    if extraction_response.success and extraction_response.result:
        extraction = extraction_response.result

        # Upload final intelligence object to MinIO
        intelligence_payload = json.dumps(
            extraction.model_dump(mode="json"), indent=2,
        ).encode()
        result.intelligence_ref = upload_extraction_intelligence(
            minio_client, ticker, document_id, intelligence_payload, timestamp=ts,
        )

        # Persist to PostgreSQL
        intel_id = await persist_document_intelligence(
            pool,
            summary=extraction.summary,
            macro_themes=extraction.macro_themes,
            novelty_score=extraction.novelty_score,
            extraction_warnings=extraction.extraction_warnings,
            confidence=extraction.confidence,
            validation_status="valid",
            validation_errors=[],
            **shared_record_fields,
        )
        result.intelligence_id = intel_id

        # Persist per-company impact records
        result.impact_ids = []
        for company in extraction.companies:
            cid = company_id_map.get(company.ticker)
            if not cid:
                logger.warning(
                    "No company_id for ticker %s in doc %s, skipping impact record",
                    company.ticker, document_id,
                )
                continue
            impact_id = await persist_document_impact(
                pool,
                intelligence_id=intel_id,
                company_id=cid,
                ticker=company.ticker,
                relevance=company.relevance,
                sentiment=company.sentiment,
                impact_score=company.impact_score,
                impact_horizon=company.impact_horizon,
                catalyst_type=company.catalyst_type,
                key_facts=company.key_facts,
                risks=company.risks,
                evidence_spans=company.evidence_spans,
            )
            result.impact_ids.append(impact_id)

        await update_document_status(pool, document_id=document_id, status="extracted")
        result.success = True
        logger.info(
            "Extraction persisted for doc %s: intel=%s, impacts=%d",
            document_id, intel_id, len(result.impact_ids),
        )
    else:
        # Failed extraction — still persist the attempt data
        all_errors: list[str] = [attempt.error for attempt in attempts if attempt.error]

        intel_id = await persist_document_intelligence(
            pool,
            summary="",
            macro_themes=[],
            novelty_score=0.0,
            extraction_warnings=["extraction_failed"],
            confidence=0.0,
            validation_status="failed",
            validation_errors=all_errors,
            **shared_record_fields,
        )
        result.intelligence_id = intel_id

        await update_document_status(pool, document_id=document_id, status="extraction_failed")
        logger.warning(
            "Extraction failed for doc %s after %d attempts: %s",
            document_id, attempt_count, "; ".join(all_errors),
        )

    # Collect and persist model performance metrics. This is deliberately best
    # effort: a metrics failure must not undo an otherwise persisted extraction.
    try:
        metrics = collect_metrics(
            extraction_response,
            document_id=document_id,
            ticker=ticker,
            document_text_length=document_text_length,
        )
        metrics.recorded_at = ts
        result.metrics_id = await persist_metrics(pool, metrics)
    except Exception:
        logger.exception("Failed to persist extraction metrics for doc %s", document_id)

    # Prometheus metrics
    _record_prometheus_metrics(
        extraction_response,
        succeeded=result.success,
        retry_count=retry_count,
        document_text_length=document_text_length,
    )

    return result