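Phase 17: fix parser URL lookup (fall back to the documents table when the job carries no URL) and extractor text field name mismatch (also accept "normalized_text" when "text" is empty)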
This commit is contained in:
@@ -48,7 +48,7 @@ async def main() -> None:
|
|||||||
job = json.loads(payload)
|
job = json.loads(payload)
|
||||||
document_id = job.get("document_id", "")
|
document_id = job.get("document_id", "")
|
||||||
ticker = job.get("ticker", "")
|
ticker = job.get("ticker", "")
|
||||||
text = job.get("text", "")
|
text = job.get("text", "") or job.get("normalized_text", "")
|
||||||
|
|
||||||
logger.info("Processing extraction job for doc %s / %s", document_id, ticker)
|
logger.info("Processing extraction job for doc %s / %s", document_id, ticker)
|
||||||
|
|
||||||
|
|||||||
@@ -96,6 +96,14 @@ async def process_job(
|
|||||||
|
|
||||||
set_trace_context(trace_id=job.get("_trace_id") or new_trace_id())
|
set_trace_context(trace_id=job.get("_trace_id") or new_trace_id())
|
||||||
|
|
||||||
|
# If no URL in job, look it up from the documents table
|
||||||
|
if not url:
|
||||||
|
row = await pool.fetchrow(
|
||||||
|
"SELECT url FROM documents WHERE id = $1::uuid", doc_id,
|
||||||
|
)
|
||||||
|
if row and row["url"]:
|
||||||
|
url = row["url"]
|
||||||
|
|
||||||
# Fetch HTML if we have a URL
|
# Fetch HTML if we have a URL
|
||||||
html = await fetch_html(url) if url else None
|
html = await fetch_html(url) if url else None
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user