Phase 17: enrich short parsed articles (<500 chars) with the Polygon title, author, description, and keywords from the raw payload
This commit is contained in:
@@ -115,6 +115,45 @@ async def process_job(
|
|||||||
|
|
||||||
text = parsed.body_text
|
text = parsed.body_text
|
||||||
|
|
||||||
|
# If the parsed body is short (<500 chars), prepend the title, author, description, and keywords of the matching Polygon article from the raw payload
|
||||||
|
if len(text or "") < 500:
|
||||||
|
doc_row = await pool.fetchrow(
|
||||||
|
"SELECT title, raw_storage_ref FROM documents WHERE id = $1::uuid", doc_id,
|
||||||
|
)
|
||||||
|
if doc_row and doc_row["raw_storage_ref"]:
|
||||||
|
try:
|
||||||
|
ref = doc_row["raw_storage_ref"]
|
||||||
|
parts = ref.replace("s3://", "").split("/", 1)
|
||||||
|
if len(parts) == 2:
|
||||||
|
raw_obj = minio_client.get_object(parts[0], parts[1])
|
||||||
|
raw_data = json.loads(raw_obj.read())
|
||||||
|
raw_obj.close()
|
||||||
|
raw_obj.release_conn()
|
||||||
|
# Find the raw-payload article whose title exactly equals the document title
|
||||||
|
items = raw_data.get("results", [])
|
||||||
|
if isinstance(items, list):
|
||||||
|
doc_title = doc_row["title"] or ""
|
||||||
|
for item in items:
|
||||||
|
if item.get("title", "") == doc_title:
|
||||||
|
desc = item.get("description", "")
|
||||||
|
keywords = item.get("keywords", [])
|
||||||
|
author = item.get("author", "")
|
||||||
|
enriched_parts = []
|
||||||
|
if doc_title:
|
||||||
|
enriched_parts.append(doc_title)
|
||||||
|
if author:
|
||||||
|
enriched_parts.append(f"By {author}")
|
||||||
|
if desc:
|
||||||
|
enriched_parts.append(desc)
|
||||||
|
if keywords:
|
||||||
|
enriched_parts.append(f"Keywords: {', '.join(keywords)}")
|
||||||
|
if text:
|
||||||
|
enriched_parts.append(text)
|
||||||
|
text = "\n\n".join(enriched_parts)
|
||||||
|
break
|
||||||
|
except Exception as e:
|
||||||
|
logger.debug("Could not enrich short text for doc %s: %s", doc_id, e)
|
||||||
|
|
||||||
# Upload normalized text to MinIO
|
# Upload normalized text to MinIO
|
||||||
norm_ref: str | None = None
|
norm_ref: str | None = None
|
||||||
if text:
|
if text:
|
||||||
|
|||||||
Reference in New Issue
Block a user