diff --git a/services/parser/worker.py b/services/parser/worker.py index b0374e3..791ae8b 100644 --- a/services/parser/worker.py +++ b/services/parser/worker.py @@ -115,6 +115,45 @@ async def process_job( text = parsed.body_text + # If parsed body is short (<500 chars), enrich with Polygon description from raw payload + if len(text or "") < 500: + doc_row = await pool.fetchrow( + "SELECT title, raw_storage_ref FROM documents WHERE id = $1::uuid", doc_id, + ) + if doc_row and doc_row["raw_storage_ref"]: + try: + ref = doc_row["raw_storage_ref"] + parts = ref.replace("s3://", "").split("/", 1) + if len(parts) == 2: + raw_obj = minio_client.get_object(parts[0], parts[1]) + raw_data = json.loads(raw_obj.read()) + raw_obj.close() + raw_obj.release_conn() + # Find matching article by title + items = raw_data.get("results", []) + if isinstance(items, list): + doc_title = doc_row["title"] or "" + for item in items: + if item.get("title", "") == doc_title: + desc = item.get("description", "") + keywords = item.get("keywords", []) + author = item.get("author", "") + enriched_parts = [] + if doc_title: + enriched_parts.append(doc_title) + if author: + enriched_parts.append(f"By {author}") + if desc: + enriched_parts.append(desc) + if keywords: + enriched_parts.append(f"Keywords: {', '.join(keywords)}") + if text: + enriched_parts.append(text) + text = "\n\n".join(enriched_parts) + break + except Exception as e: + logger.debug("Could not enrich short text for doc %s: %s", doc_id, e) + # Upload normalized text to MinIO norm_ref: str | None = None if text: