"""Diagnostic script: show the normalized text of the latest extracted document.

Looks up the most recently updated ``news_api`` document whose
``parse_quality_score`` is above 0.8, prints its metadata, then downloads the
object named by ``normalized_storage_ref`` from MinIO and prints the text
length and the first 500 characters.

Configuration is read from the environment: ``MINIO_ENDPOINT``,
``MINIO_ACCESS_KEY``, ``MINIO_SECRET_KEY``, ``POSTGRES_HOST``,
``POSTGRES_PORT``, ``POSTGRES_DB``, ``POSTGRES_USER`` and
``POSTGRES_PASSWORD``. A missing variable raises ``KeyError``.
"""
import asyncio
import os

import asyncpg
from minio import Minio

# Module-level client, created at import time exactly as before.
mc = Minio(
    os.environ["MINIO_ENDPOINT"],
    access_key=os.environ["MINIO_ACCESS_KEY"],
    secret_key=os.environ["MINIO_SECRET_KEY"],
    secure=False,
)

_S3_SCHEME = "s3://"


def _split_storage_ref(ref):
    """Split a storage reference into ``(bucket, key)``.

    Accepts ``s3://bucket/key`` or a bare ``bucket/key``. Only a *leading*
    ``s3://`` is stripped, so a key that happens to contain that substring is
    left intact.

    Returns:
        A ``(bucket, key)`` tuple, or ``None`` when ``ref`` is empty/``None``
        or does not contain both a bucket and a non-empty key.
    """
    if not ref:
        return None
    path = ref[len(_S3_SCHEME):] if ref.startswith(_S3_SCHEME) else ref
    bucket, _, key = path.partition("/")
    if not bucket or not key:
        return None
    return bucket, key


def _read_object_text(bucket, key):
    """Download ``bucket/key`` from MinIO and return it decoded as UTF-8.

    The response is closed and its connection released even when reading or
    decoding fails.
    """
    obj = mc.get_object(bucket, key)
    try:
        return obj.read().decode("utf-8")
    finally:
        obj.close()
        obj.release_conn()


async def main():
    """Print metadata and a text preview for the latest qualifying document."""
    pool = await asyncpg.create_pool(
        host=os.environ["POSTGRES_HOST"],
        port=int(os.environ["POSTGRES_PORT"]),
        database=os.environ["POSTGRES_DB"],
        user=os.environ["POSTGRES_USER"],
        password=os.environ["POSTGRES_PASSWORD"],
    )
    try:
        # Get a recently extracted doc.
        row = await pool.fetchrow(
            "SELECT id, title, normalized_storage_ref, parse_quality_score "
            "FROM documents WHERE source_type = 'news_api' AND parse_quality_score > 0.8 "
            "ORDER BY updated_at DESC LIMIT 1"
        )
        if row is None:
            print("No matching document found")
            return

        print(f"Doc: {row['id']}")
        print(f"Title: {row['title']}")
        print(f"Quality: {row['parse_quality_score']}")
        print(f"Ref: {row['normalized_storage_ref']}")

        location = _split_storage_ref(row["normalized_storage_ref"])
        if location is None:
            # The column may be NULL or not in bucket/key form; report it
            # instead of crashing or silently printing nothing.
            print("Storage ref is missing or not in bucket/key form; nothing to fetch")
            return

        # NOTE: the MinIO client is synchronous; blocking the event loop here
        # is harmless because this script runs a single task.
        text = _read_object_text(*location)
        print(f"Text length: {len(text)} chars")
        print("First 500 chars:")
        print(text[:500])
    finally:
        # Close the pool on every path, including errors raised above.
        await pool.close()


if __name__ == "__main__":
    asyncio.run(main())