fix: track last_published_at per source to avoid re-fetching same articles — applies to both news_api and macro_news
This commit is contained in:
@@ -195,6 +195,23 @@ async def process_job(
|
||||
extra={"ticker": ticker, "source_type": source_type, "count": new_items},
|
||||
)
|
||||
|
||||
# Track the latest published_utc so next fetch only gets newer articles
|
||||
if source_type in ("macro_news", "news_api") and result.items:
|
||||
latest_pub = None
|
||||
for item in result.items:
|
||||
pub = item.get("published_utc")
|
||||
if pub and (latest_pub is None or pub > latest_pub):
|
||||
latest_pub = pub
|
||||
if latest_pub:
|
||||
try:
|
||||
await pool.execute(
|
||||
"UPDATE sources SET config = config || $1::jsonb WHERE id = $2",
|
||||
json.dumps({"last_published_at": latest_pub}),
|
||||
source_id,
|
||||
)
|
||||
except Exception:
|
||||
pass # Non-critical
|
||||
|
||||
except Exception as e:
|
||||
INGESTION_ERRORS.labels(source_type=source_type).inc()
|
||||
INGESTION_JOBS_TOTAL.labels(source_type=source_type, status="error").inc()
|
||||
|
||||
Reference in New Issue
Block a user