phase 17: fix Polygon article_url and published_utc field mapping in metadata persistence
This commit is contained in:
@@ -182,7 +182,7 @@
|
||||
- Verify companies and sources appear in the dashboard and via `curl https://stonks-registry.celestium.life/companies`
|
||||
- _Requirements: 1.1, 1.2, 1.3, 2.1_
|
||||
|
||||
- [-] 17.2 Wire the scheduler to enqueue ingestion jobs for active sources
|
||||
- [x] 17.2 Wire the scheduler to enqueue ingestion jobs for active sources
|
||||
- Verify the scheduler service reads active companies and sources from PostgreSQL
|
||||
- Verify it enqueues Redis jobs for each source on its polling interval
|
||||
- Check scheduler logs: `kubectl logs -n stonks-oracle deployment/scheduler --tail=50`
|
||||
@@ -190,7 +190,7 @@
|
||||
- Fix any issues with the scheduler → source → Redis queue flow
|
||||
- _Requirements: 3.1, 3.2_
|
||||
|
||||
- [ ] 17.3 Validate ingestion workers pull data from Polygon and persist to MinIO/PostgreSQL
|
||||
- [x] 17.3 Validate ingestion workers pull data from Polygon and persist to MinIO/PostgreSQL
|
||||
- Check ingestion worker logs for successful API calls to Polygon
|
||||
- Verify raw market data artifacts land in MinIO `stonks-raw-market` bucket
|
||||
- Verify document metadata rows appear in PostgreSQL `documents` table
|
||||
@@ -198,7 +198,7 @@
|
||||
- Debug and fix any adapter errors (auth, rate limits, response parsing)
|
||||
- _Requirements: 4.1, 4.2, 4.3_
|
||||
|
||||
- [ ] 17.4 Validate parser normalizes documents and extractor produces intelligence
|
||||
- [-] 17.4 Validate parser normalizes documents and extractor produces intelligence
|
||||
- Check parser worker logs for document processing
|
||||
- Verify normalized text appears in MinIO `stonks-normalized` bucket
|
||||
- Verify `parse_quality_score` and `parse_confidence` are set on documents
|
||||
|
||||
@@ -232,7 +232,7 @@ def _extract_publisher(item: dict[str, Any]) -> str:
|
||||
|
||||
def _parse_published_at(item: dict[str, Any]) -> datetime | None:
|
||||
"""Parse published_at from various adapter item formats."""
|
||||
raw = item.get("publishedAt") or item.get("published_at")
|
||||
raw = item.get("publishedAt") or item.get("published_at") or item.get("published_utc")
|
||||
if not raw:
|
||||
return None
|
||||
if isinstance(raw, datetime):
|
||||
@@ -392,7 +392,7 @@ async def _persist_document_items(
|
||||
json.dumps(item, sort_keys=True)
|
||||
)
|
||||
title = item.get("title", item.get("name", ""))
|
||||
url = item.get("url", item.get("link", ""))
|
||||
url = item.get("url", item.get("link", item.get("article_url", "")))
|
||||
canonical_url = item.get("canonical_url") or (
|
||||
normalize_url(url) if url else None
|
||||
)
|
||||
|
||||
Reference in New Issue
Block a user