50 lines
1.5 KiB
Python
50 lines
1.5 KiB
Python
import os
|
|
|
|
from minio import Minio
|
|
|
|
# Shared MinIO client for this script. Endpoint and credentials are read from
# the environment (a missing variable raises KeyError at import time);
# secure=False is passed through unchanged from the original configuration.
mc = Minio(
    os.environ["MINIO_ENDPOINT"],
    access_key=os.environ["MINIO_ACCESS_KEY"],
    secret_key=os.environ["MINIO_SECRET_KEY"],
    secure=False,
)
|
|
|
|
# Check the most recent extraction - what text did the model get?
|
|
# Look at the normalized text for a known doc
|
|
import asyncio
|
|
|
|
import asyncpg
|
|
|
|
|
|
async def main():
    """Print metadata and a text preview for the newest high-quality news doc.

    Queries the ``documents`` table for the most recently updated row with
    ``source_type = 'news_api'`` and ``parse_quality_score > 0.8``, prints its
    id, title, quality score and storage reference, then downloads the
    referenced object through the module-level MinIO client ``mc`` and prints
    the decoded text length and its first 500 characters.

    Connection settings come from the ``POSTGRES_*`` environment variables;
    a missing variable raises ``KeyError``. Nothing is printed when no row
    matches. The download step is skipped when the storage reference is
    empty or has no ``bucket/key`` separator.
    """
    pool = await asyncpg.create_pool(
        host=os.environ["POSTGRES_HOST"],
        port=int(os.environ["POSTGRES_PORT"]),
        database=os.environ["POSTGRES_DB"],
        user=os.environ["POSTGRES_USER"],
        password=os.environ["POSTGRES_PASSWORD"],
    )
    # try/finally so the pool is closed even if the query, download or
    # decode below raises.
    try:
        # Get a recently extracted doc
        row = await pool.fetchrow(
            "SELECT id, title, normalized_storage_ref, parse_quality_score "
            "FROM documents WHERE source_type = 'news_api' AND parse_quality_score > 0.8 "
            "ORDER BY updated_at DESC LIMIT 1"
        )
        if not row:
            return

        print(f"Doc: {row['id']}")
        print(f"Title: {row['title']}")
        print(f"Quality: {row['parse_quality_score']}")
        print(f"Ref: {row['normalized_storage_ref']}")

        ref = row["normalized_storage_ref"]
        if not ref:
            # NULL/empty reference: nothing to download (already shown above).
            return

        # Strip only a leading scheme so an "s3://" substring inside the
        # object key is left intact.
        scheme = "s3://"
        location = ref[len(scheme):] if ref.startswith(scheme) else ref
        bucket, sep, key = location.partition("/")
        if not sep:
            # No "bucket/key" separator; same silent skip as before.
            return

        obj = mc.get_object(bucket, key)
        try:
            text = obj.read().decode("utf-8")
        finally:
            # Always hand the HTTP connection back, even if read/decode fails.
            obj.close()
            obj.release_conn()

        print(f"Text length: {len(text)} chars")
        print("First 500 chars:")
        print(text[:500])
    finally:
        await pool.close()
|
|
|
|
# Script entry point: run the async diagnostic in main() to completion.
asyncio.run(main())
|