Phase 17: enrich short parsed articles (<500 chars) with the Polygon title, author, description, and keywords from the raw payload
This commit is contained in:
@@ -115,6 +115,45 @@ async def process_job(
|
||||
|
||||
text = parsed.body_text
|
||||
|
||||
# If the parsed body is short (<500 chars), prepend the title, author, description, and keywords of the matching Polygon article from the raw payload
|
||||
if len(text or "") < 500:
|
||||
doc_row = await pool.fetchrow(
|
||||
"SELECT title, raw_storage_ref FROM documents WHERE id = $1::uuid", doc_id,
|
||||
)
|
||||
if doc_row and doc_row["raw_storage_ref"]:
|
||||
try:
|
||||
ref = doc_row["raw_storage_ref"]
|
||||
parts = ref.replace("s3://", "").split("/", 1)
|
||||
if len(parts) == 2:
|
||||
raw_obj = minio_client.get_object(parts[0], parts[1])
|
||||
raw_data = json.loads(raw_obj.read())
|
||||
raw_obj.close()
|
||||
raw_obj.release_conn()
|
||||
# Find matching article by title
|
||||
items = raw_data.get("results", [])
|
||||
if isinstance(items, list):
|
||||
doc_title = doc_row["title"] or ""
|
||||
for item in items:
|
||||
if item.get("title", "") == doc_title:
|
||||
desc = item.get("description", "")
|
||||
keywords = item.get("keywords", [])
|
||||
author = item.get("author", "")
|
||||
enriched_parts = []
|
||||
if doc_title:
|
||||
enriched_parts.append(doc_title)
|
||||
if author:
|
||||
enriched_parts.append(f"By {author}")
|
||||
if desc:
|
||||
enriched_parts.append(desc)
|
||||
if keywords:
|
||||
enriched_parts.append(f"Keywords: {', '.join(keywords)}")
|
||||
if text:
|
||||
enriched_parts.append(text)
|
||||
text = "\n\n".join(enriched_parts)
|
||||
break
|
||||
except Exception as e:
|
||||
logger.debug("Could not enrich short text for doc %s: %s", doc_id, e)
|
||||
|
||||
# Upload normalized text to MinIO
|
||||
norm_ref: str | None = None
|
||||
if text:
|
||||
|
||||
Reference in New Issue
Block a user