phase 14-15: docker build validation and helm deployment

This commit is contained in:
Celes Renata
2026-04-11 11:59:45 -07:00
parent 7394d241c9
commit ce10afa034
179 changed files with 32559 additions and 576 deletions
+108
View File
@@ -43,6 +43,10 @@ class OllamaConfig:
base_url: str = "http://localhost:11434"
model: str = "llama3.1:8b"
timeout: int = 120
max_retries: int = 2
retry_base_delay: float = 1.0
retry_max_delay: float = 10.0
retry_backoff_multiplier: float = 2.0
@dataclass
@@ -51,16 +55,82 @@ class TrinoConfig:
port: int = 8080
catalog: str = "lakehouse"
schema: str = "stonks"
iceberg_catalog: str = "iceberg"
@dataclass
class MarketDataConfig:
    """Settings for the market-data provider.

    ``api_key`` is excluded from the auto-generated ``repr`` so that
    printing or logging a config object does not expose the credential.
    The constructor signature, defaults and equality semantics are the
    same as for a plain dataclass field.
    """

    # API credential; defaults to the empty string and is hidden from repr().
    api_key: str = field(default="", repr=False)
    # Root URL used for provider requests.
    base_url: str = "https://api.polygon.io"
    # Provider identifier.
    provider: str = "polygon"
@dataclass
class BrokerConfig:
    """Settings for the broker integration.

    ``api_key`` and ``api_secret`` are excluded from the auto-generated
    ``repr`` so that printing or logging a config object does not expose
    broker credentials. Constructor signature, defaults and equality are
    unchanged.
    """

    mode: str = "paper"  # paper | live
    # Broker identifier.
    provider: str = "alpaca"
    # Credentials; None when not configured. Hidden from repr().
    api_key: Optional[str] = field(default=None, repr=False)
    api_secret: Optional[str] = field(default=None, repr=False)
    # Optional endpoint override; None when not configured.
    base_url: Optional[str] = None
@dataclass
class RetentionConfig:
    """Default retention periods, in days, for each bucket class.

    Individual buckets may override these defaults through the
    ``retention_policies`` DB table. ``cleanup_interval_hours`` sets how
    often the retention worker runs.
    """

    # Raw ingested data.
    raw_market_days: int = 90
    raw_news_days: int = 180
    raw_filings_days: int = 365

    # Normalized data.
    normalized_days: int = 180

    # LLM prompts and results.
    llm_prompts_days: int = 365
    llm_results_days: int = 365

    # Lakehouse and audit data.
    lakehouse_days: int = 730
    audit_days: int = 730

    # Retention worker cadence, in hours.
    cleanup_interval_hours: int = 24
    # NOTE(review): presumably the number of items handled per cleanup
    # batch; confirm against the retention worker.
    batch_size: int = 1000
# Bucket name -> name of the RetentionConfig field that holds that
# bucket's default retention period.
BUCKET_RETENTION_FIELDS: dict[str, str] = {
    "stonks-raw-market": "raw_market_days",
    "stonks-raw-news": "raw_news_days",
    "stonks-raw-filings": "raw_filings_days",
    "stonks-normalized": "normalized_days",
    "stonks-llm-prompts": "llm_prompts_days",
    "stonks-llm-results": "llm_results_days",
    "stonks-lakehouse": "lakehouse_days",
    "stonks-audit": "audit_days",
}
@dataclass
class AlertingConfig:
    """Threshold settings for the operational alerting rules.

    Requirements: 12.3
    """

    # --- Source failures ---
    # Consecutive failures before an alert.
    source_failure_threshold: int = 3
    # Lookback window, in hours.
    source_failure_window_hours: int = 6

    # --- Schema/extraction failure spike ---
    # A failure rate of 0.3 (30%) triggers an alert.
    schema_failure_rate_threshold: float = 0.3
    schema_failure_window_hours: int = 1

    # --- Analytical (lake publication) lag ---
    # Minutes since the last successful publish.
    lake_lag_threshold_minutes: int = 60

    # --- Broker issues ---
    # Consecutive broker errors.
    broker_error_threshold: int = 3
    broker_error_window_hours: int = 1

    # --- Evaluation interval ---
    check_interval_seconds: int = 120
@dataclass
class AppConfig:
postgres: PostgresConfig = field(default_factory=PostgresConfig)
@@ -68,8 +138,12 @@ class AppConfig:
minio: MinioConfig = field(default_factory=MinioConfig)
ollama: OllamaConfig = field(default_factory=OllamaConfig)
trino: TrinoConfig = field(default_factory=TrinoConfig)
market_data: MarketDataConfig = field(default_factory=MarketDataConfig)
broker: BrokerConfig = field(default_factory=BrokerConfig)
retention: RetentionConfig = field(default_factory=RetentionConfig)
alerting: AlertingConfig = field(default_factory=AlertingConfig)
log_level: str = "INFO"
json_logs: bool = True
def load_config() -> AppConfig:
@@ -98,18 +172,52 @@ def load_config() -> AppConfig:
base_url=os.getenv("OLLAMA_BASE_URL", "http://localhost:11434"),
model=os.getenv("OLLAMA_MODEL", "llama3.1:8b"),
timeout=int(os.getenv("OLLAMA_TIMEOUT", "120")),
max_retries=int(os.getenv("OLLAMA_MAX_RETRIES", "2")),
retry_base_delay=float(os.getenv("OLLAMA_RETRY_BASE_DELAY", "1.0")),
retry_max_delay=float(os.getenv("OLLAMA_RETRY_MAX_DELAY", "10.0")),
retry_backoff_multiplier=float(os.getenv("OLLAMA_RETRY_BACKOFF_MULTIPLIER", "2.0")),
),
trino=TrinoConfig(
host=os.getenv("TRINO_HOST", "localhost"),
port=int(os.getenv("TRINO_PORT", "8080")),
catalog=os.getenv("TRINO_CATALOG", "lakehouse"),
schema=os.getenv("TRINO_SCHEMA", "stonks"),
iceberg_catalog=os.getenv("TRINO_ICEBERG_CATALOG", "iceberg"),
),
market_data=MarketDataConfig(
api_key=os.getenv("MARKET_DATA_API_KEY", ""),
base_url=os.getenv("MARKET_DATA_BASE_URL", "https://api.polygon.io"),
provider=os.getenv("MARKET_DATA_PROVIDER", "polygon"),
),
broker=BrokerConfig(
mode=os.getenv("BROKER_MODE", "paper"),
provider=os.getenv("BROKER_PROVIDER", "alpaca"),
api_key=os.getenv("BROKER_API_KEY", None),
api_secret=os.getenv("BROKER_API_SECRET", None),
base_url=os.getenv("BROKER_BASE_URL", None),
),
retention=RetentionConfig(
raw_market_days=int(os.getenv("RETENTION_RAW_MARKET_DAYS", "90")),
raw_news_days=int(os.getenv("RETENTION_RAW_NEWS_DAYS", "180")),
raw_filings_days=int(os.getenv("RETENTION_RAW_FILINGS_DAYS", "365")),
normalized_days=int(os.getenv("RETENTION_NORMALIZED_DAYS", "180")),
llm_prompts_days=int(os.getenv("RETENTION_LLM_PROMPTS_DAYS", "365")),
llm_results_days=int(os.getenv("RETENTION_LLM_RESULTS_DAYS", "365")),
lakehouse_days=int(os.getenv("RETENTION_LAKEHOUSE_DAYS", "730")),
audit_days=int(os.getenv("RETENTION_AUDIT_DAYS", "730")),
cleanup_interval_hours=int(os.getenv("RETENTION_CLEANUP_INTERVAL_HOURS", "24")),
batch_size=int(os.getenv("RETENTION_BATCH_SIZE", "1000")),
),
alerting=AlertingConfig(
source_failure_threshold=int(os.getenv("ALERT_SOURCE_FAILURE_THRESHOLD", "3")),
source_failure_window_hours=int(os.getenv("ALERT_SOURCE_FAILURE_WINDOW_HOURS", "6")),
schema_failure_rate_threshold=float(os.getenv("ALERT_SCHEMA_FAILURE_RATE_THRESHOLD", "0.3")),
schema_failure_window_hours=int(os.getenv("ALERT_SCHEMA_FAILURE_WINDOW_HOURS", "1")),
lake_lag_threshold_minutes=int(os.getenv("ALERT_LAKE_LAG_THRESHOLD_MINUTES", "60")),
broker_error_threshold=int(os.getenv("ALERT_BROKER_ERROR_THRESHOLD", "3")),
broker_error_window_hours=int(os.getenv("ALERT_BROKER_ERROR_WINDOW_HOURS", "1")),
check_interval_seconds=int(os.getenv("ALERT_CHECK_INTERVAL_SECONDS", "120")),
),
log_level=os.getenv("LOG_LEVEL", "INFO"),
json_logs=os.getenv("JSON_LOGS", "true").lower() == "true",
)