phase 17: enrich short parsed articles with Polygon description/keywords from raw payload

This commit is contained in:
Celes Renata
2026-04-12 08:52:46 -07:00
parent cd32c3e3fe
commit 59f89d03d2
+39
View File
@@ -115,6 +115,45 @@ async def process_job(
# Start from the parser's extracted body text; it may be replaced below by an
# enriched version when the body is too short.
text = parsed.body_text
# Short-body enrichment: when the parsed body is missing or under 500
# characters, try to prepend metadata (title, author, description, keywords)
# taken from the Polygon raw payload this document was ingested from.
# This is best-effort: any failure leaves `text` unchanged.
if len(text or "") < 500:
# Look up the document's title and the reference to its stored raw payload.
doc_row = await pool.fetchrow(
"SELECT title, raw_storage_ref FROM documents WHERE id = $1::uuid", doc_id,
)
if doc_row and doc_row["raw_storage_ref"]:
try:
ref = doc_row["raw_storage_ref"]
# Split the reference into two parts on the first "/" after dropping
# "s3://"; the parts are passed to get_object below (presumably
# bucket and object key — confirm against the writer of raw_storage_ref).
# NOTE(review): str.replace removes every "s3://" occurrence, not only
# a leading prefix.
parts = ref.replace("s3://", "").split("/", 1)
if len(parts) == 2:
# NOTE(review): get_object/read are called without await inside async
# code; if minio_client is a synchronous client this blocks the event
# loop — confirm.
raw_obj = minio_client.get_object(parts[0], parts[1])
raw_data = json.loads(raw_obj.read())
# NOTE(review): close/release_conn are skipped if read() or json.loads()
# raises, because they are not in a finally block.
raw_obj.close()
raw_obj.release_conn()
# The raw payload is expected to be a JSON object with a "results" list
# of article dicts; find the one whose title equals this document's
# title exactly (first match wins).
items = raw_data.get("results", [])
if isinstance(items, list):
doc_title = doc_row["title"] or ""
for item in items:
# NOTE(review): when doc_title is "", an item with no "title" key also
# compares equal here, so an unrelated article could be matched.
if item.get("title", "") == doc_title:
desc = item.get("description", "")
keywords = item.get("keywords", [])
author = item.get("author", "")
# Assemble the enriched text in a fixed order, skipping empty pieces:
# title, byline, description, keywords, then the original parsed body.
enriched_parts = []
if doc_title:
enriched_parts.append(doc_title)
if author:
enriched_parts.append(f"By {author}")
if desc:
enriched_parts.append(desc)
if keywords:
# NOTE(review): join raises TypeError if any keyword is not a str; the
# except below then logs it and `text` stays unenriched.
enriched_parts.append(f"Keywords: {', '.join(keywords)}")
if text:
enriched_parts.append(text)
# Pieces are separated by blank lines.
text = "\n\n".join(enriched_parts)
break
except Exception as e:
# Enrichment is optional: log at debug level and fall through with the
# original (short) text.
logger.debug("Could not enrich short text for doc %s: %s", doc_id, e)
# Upload normalized text to MinIO
norm_ref: str | None = None
if text: