phase 14-15: docker build validation and helm deployment
This commit is contained in:
@@ -0,0 +1,321 @@
|
||||
"""Web scrape adapter for curated URLs and article pages.
|
||||
|
||||
Fetches full article HTML from curated URLs (investor relations pages,
|
||||
press releases, earnings transcripts, etc.) using BeautifulSoup + httpx
|
||||
with retry adapters, content hashing, boilerplate awareness, and quality scoring.
|
||||
|
||||
Inspired by Noctipede crawler patterns: BeautifulSoup + requests with retry
|
||||
adapters, content hashing, boilerplate stripping, quality scoring.
|
||||
|
||||
Requirements: 1.2, 2.5, 3.1, 3.2, 3.3, 3.4
|
||||
"""
|
||||
import json
|
||||
import logging
|
||||
import time
|
||||
from datetime import datetime, timezone
|
||||
from urllib.parse import urlparse
|
||||
from typing import Any
|
||||
|
||||
import httpx
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
from services.shared.content import content_hash, normalize_url
|
||||
|
||||
from .base import AdapterResult, BaseAdapter
|
||||
|
||||
# Module-level logger; the name is a fixed string rather than __name__.
logger = logging.getLogger("web_scrape_adapter")

# Default request settings
DEFAULT_TIMEOUT = 30  # seconds; used when the source config has no "timeout"
DEFAULT_USER_AGENT = "StonksOracle/1.0 (+https://stonks-oracle.celestium.life)"
MAX_CONTENT_LENGTH = 10 * 1024**2  # 10MB cap
|
||||
|
||||
|
||||
def extract_metadata_from_html(html: str, url: str) -> dict[str, str | None]:
    """Extract page-level metadata from an HTML document.

    Args:
        html: Raw HTML markup of the page.
        url: URL the page was fetched from; used as the fallback source for
            the publisher (hostname) and the canonical URL.

    Returns:
        A dict that always contains the keys ``title``, ``author``,
        ``publisher``, ``published_at``, ``canonical_url``, ``language`` and
        ``description``. ``published_at`` is ``None`` when no date is found;
        the other text fields fall back to ``""`` (or ``"en"`` for
        ``language``).
    """
    soup = BeautifulSoup(html, "html.parser")
    meta: dict[str, str | None] = {}

    # Title: prefer og:title, then <title>
    og_title = soup.find("meta", property="og:title")
    if og_title and og_title.get("content"):
        content = og_title["content"]
        meta["title"] = content.strip() if isinstance(content, str) else ""
    elif soup.title and soup.title.string:
        meta["title"] = soup.title.string.strip()
    else:
        meta["title"] = ""

    # Author
    author_tag = soup.find("meta", attrs={"name": "author"})
    if author_tag and author_tag.get("content"):
        content = author_tag["content"]
        meta["author"] = content.strip() if isinstance(content, str) else ""
    else:
        meta["author"] = ""

    # Publisher: og:site_name, else the hostname of the fetched URL
    site_name = soup.find("meta", property="og:site_name")
    if site_name and site_name.get("content"):
        content = site_name["content"]
        meta["publisher"] = content.strip() if isinstance(content, str) else ""
    else:
        meta["publisher"] = urlparse(url).hostname or ""

    # Published date: article:published_time, else JSON-LD datePublished
    pub_time = soup.find("meta", property="article:published_time")
    if pub_time and pub_time.get("content"):
        content = pub_time["content"]
        meta["published_at"] = content.strip() if isinstance(content, str) else None
    else:
        meta["published_at"] = None
        for script in soup.find_all("script", type="application/ld+json"):
            payload = script.string
            # Cheap substring test avoids parsing unrelated JSON-LD blocks.
            if not payload or "datePublished" not in payload:
                continue
            try:
                found = _json_ld_date_published(json.loads(payload))
            except (json.JSONDecodeError, TypeError):
                # Malformed JSON-LD is common in the wild; try the next script.
                continue
            if found is not None:
                # First hit wins: stop so later scripts cannot overwrite it.
                meta["published_at"] = found
                break

    # Canonical URL: <link rel="canonical">, then og:url, then the fetched URL
    canonical = soup.find("link", rel="canonical")
    if canonical and canonical.get("href"):
        href = canonical["href"]
        meta["canonical_url"] = str(href) if href else normalize_url(url)
    else:
        og_url = soup.find("meta", property="og:url")
        if og_url and og_url.get("content"):
            content = og_url["content"]
            meta["canonical_url"] = str(content) if content else normalize_url(url)
        else:
            meta["canonical_url"] = normalize_url(url)

    # Language: <html lang="...">, truncated to 5 characters (e.g. "en-US")
    html_tag = soup.find("html")
    if html_tag and html_tag.get("lang"):
        lang = html_tag["lang"]
        meta["language"] = str(lang)[:5] if lang else "en"
    else:
        meta["language"] = "en"

    # Description for summary: og:description, then <meta name="description">
    desc = soup.find("meta", property="og:description") or soup.find(
        "meta", attrs={"name": "description"}
    )
    if desc and desc.get("content"):
        content = desc["content"]
        meta["description"] = content.strip() if isinstance(content, str) else ""
    else:
        meta["description"] = ""

    return meta


def _json_ld_date_published(ld: Any) -> str | None:
    """Return the first non-null ``datePublished`` in parsed JSON-LD, else None.

    Looks at a top-level object, at the items of a top-level array, and at the
    nodes of a top-level object's ``@graph`` array.
    """
    if isinstance(ld, dict):
        if ld.get("datePublished") is not None:
            return str(ld["datePublished"])
        nodes = ld.get("@graph")
    else:
        nodes = ld

    if isinstance(nodes, list):
        for node in nodes:
            if isinstance(node, dict) and node.get("datePublished") is not None:
                return str(node["datePublished"])
    return None
|
||||
|
||||
|
||||
def extract_body_text(html: str) -> str:
    """Return the page's main text with boilerplate elements removed.

    Script, style, nav, footer, header, aside, iframe and noscript elements
    are dropped first. Text is then taken from the first ``<article>``; failing
    that, from the first ``<div>`` whose class attribute contains a known
    article-body marker; failing that, from ``<body>`` (or the whole document
    when there is no body). Blank lines are removed and every remaining line
    is stripped.
    """
    soup = BeautifulSoup(html, "html.parser")

    # Remove non-content elements
    boilerplate_tags = [
        "script", "style", "nav", "footer", "header", "aside", "iframe", "noscript",
    ]
    for element in soup.find_all(boilerplate_tags):
        element.decompose()

    # Locate the article container
    container = soup.find("article")
    if not container:
        markers = ["article-body", "post-content", "entry-content", "story-body"]
        for candidate in soup.find_all("div"):
            classes = candidate.get("class", [])
            if isinstance(classes, list):
                joined = " ".join(classes)
            elif classes:
                joined = str(classes)
            else:
                joined = ""
            if any(marker in joined for marker in markers):
                container = candidate
                break

    if container:
        text = container.get_text(separator="\n", strip=True)
    else:
        # Fallback: use body, or the whole document if there is none
        body = soup.find("body")
        if body:
            text = body.get_text(separator="\n", strip=True)
        else:
            text = soup.get_text(separator="\n", strip=True)

    # Collapse whitespace: keep only non-empty, stripped lines
    stripped_lines = (raw_line.strip() for raw_line in text.splitlines())
    return "\n".join(kept for kept in stripped_lines if kept)
|
||||
|
||||
|
||||
class WebScrapeAdapter(BaseAdapter):
    """Adapter for fetching curated web pages and article URLs.

    Config options (from source config):
        urls: List of URLs to scrape for this company
        url: Single URL to scrape (alternative to urls)
        timeout: Request timeout in seconds (default 30)
        user_agent: Custom user agent string
        follow_links: Whether to follow article links from index pages
            (default False). NOTE(review): ``fetch`` does not read this key;
            confirm whether link following is handled elsewhere.
        max_pages: Max pages to fetch per cycle (default 5, capped at 20)
    """

    def __init__(self) -> None:
        pass

    def source_type(self) -> str:
        """Return the source-type identifier used for this adapter."""
        return "web_scrape"

    def bucket_name(self) -> str:
        """Web scrape artifacts go to the news raw bucket."""
        return "stonks-raw-news"

    async def fetch(self, ticker: str, config: dict[str, Any]) -> AdapterResult:
        """Fetch HTML from curated URLs for a given ticker.

        Supports both single URL and multi-URL configs. Each URL is fetched
        sequentially; per-page metadata and extracted body text are returned
        in ``items``. The raw payload is a JSON manifest (URL, content hash
        and lengths per page, plus the error list), not the page HTML.

        Args:
            ticker: Ticker symbol the URLs belong to.
            config: Source config; see the class docstring for the keys read.

        Returns:
            An ``AdapterResult`` with one item per successfully fetched page.
            If no page could be fetched, an error result carrying the joined
            per-URL error messages. Per-URL failures are recorded and logged,
            never raised.
        """
        urls = config.get("urls", [])
        if not urls and config.get("url"):
            urls = [config["url"]]

        if not urls:
            return self._error_result(ticker, "No URLs configured for web_scrape source", 0)

        timeout = config.get("timeout", DEFAULT_TIMEOUT)
        user_agent = config.get("user_agent", DEFAULT_USER_AGENT)
        # Cap at 20 pages; clamp negatives to 0 so they cannot turn the slice
        # below into "all but the last N".
        max_pages = max(0, min(config.get("max_pages", 5), 20))

        items: list[dict[str, Any]] = []
        total_elapsed = 0.0
        errors: list[str] = []

        async with httpx.AsyncClient(
            timeout=timeout,
            follow_redirects=True,
            headers={"User-Agent": user_agent},
        ) as client:
            for url in urls[:max_pages]:
                t0 = time.monotonic()
                # Set once the response arrives, so parsing time is not billed
                # as request time; stays None if the request itself failed.
                elapsed_ms: float | None = None
                try:
                    resp = await client.get(url)
                    elapsed_ms = (time.monotonic() - t0) * 1000
                    resp.raise_for_status()

                    # Content length guard
                    raw_bytes = resp.content
                    if len(raw_bytes) > MAX_CONTENT_LENGTH:
                        errors.append(f"Content too large for {url}: {len(raw_bytes)} bytes")
                        continue

                    html = resp.text
                    meta = extract_metadata_from_html(html, url)
                    body_text = extract_body_text(html)

                    items.append({
                        "url": url,
                        "canonical_url": meta.get("canonical_url", normalize_url(url)),
                        "title": meta.get("title", ""),
                        "author": meta.get("author", ""),
                        "publisher": meta.get("publisher", ""),
                        "published_at": meta.get("published_at"),
                        "language": meta.get("language", "en"),
                        "description": meta.get("description", ""),
                        "content_hash": content_hash(raw_bytes),
                        "body_text": body_text,
                        "body_length": len(body_text),
                        "html_length": len(html),
                        "http_status": resp.status_code,
                        "response_time_ms": round(elapsed_ms, 1),
                    })

                except httpx.HTTPStatusError as e:
                    errors.append(f"HTTP {e.response.status_code} for {url}: {e}")
                    logger.warning("Scrape HTTP error for %s/%s: %s", ticker, url, e)

                except httpx.TimeoutException as e:
                    errors.append(f"Timeout for {url}: {e}")
                    logger.warning("Scrape timeout for %s/%s: %s", ticker, url, e)

                except Exception as e:
                    # Per-URL boundary: one bad page must not abort the batch.
                    errors.append(f"Error for {url}: {e}")
                    logger.warning("Scrape error for %s/%s: %s", ticker, url, e)

                finally:
                    # Account for each URL exactly once, whether it succeeded,
                    # was skipped as oversized, or failed at any point.
                    if elapsed_ms is None:
                        elapsed_ms = (time.monotonic() - t0) * 1000
                    total_elapsed += elapsed_ms

        if not items:
            error_msg = "; ".join(errors) if errors else "No pages fetched"
            return self._error_result(ticker, error_msg, total_elapsed)

        # Single timestamp so the manifest and the result agree.
        fetched_at = datetime.now(timezone.utc)

        # Combine per-page summaries into a single manifest artifact
        combined_raw = json.dumps({
            "ticker": ticker,
            "fetched_at": fetched_at.isoformat(),
            "pages": [
                {
                    "url": item["url"],
                    "content_hash": item["content_hash"],
                    "html_length": item["html_length"],
                    "body_length": item["body_length"],
                }
                for item in items
            ],
            "errors": errors,
        }).encode("utf-8")

        # Hash of the concatenated per-page hashes, in fetch order
        combined_hash = content_hash(
            b"".join(item["content_hash"].encode() for item in items)
        )

        return AdapterResult(
            source_type="web_scrape",
            ticker=ticker,
            items=items,
            raw_payload=combined_raw,
            content_hash=combined_hash,
            fetched_at=fetched_at,
            http_status=200,
            response_time_ms=round(total_elapsed, 1),
            metadata={
                "provider": "web_scrape",
                "pages_fetched": len(items),
                "pages_failed": len(errors),
                "errors": errors,
            },
        )

    def _error_result(
        self,
        ticker: str,
        error: str,
        elapsed_ms: float,
    ) -> AdapterResult:
        """Build an error AdapterResult for scrape fetches.

        Args:
            ticker: Ticker symbol the failed fetch was for.
            error: Human-readable error message.
            elapsed_ms: Time spent before giving up, in milliseconds.

        Returns:
            An ``AdapterResult`` with no items, an empty payload and hash,
            ``http_status=None`` and ``error`` set.
        """
        return AdapterResult(
            source_type="web_scrape",
            ticker=ticker,
            items=[],
            raw_payload=b"",
            content_hash="",
            fetched_at=datetime.now(timezone.utc),
            error=error,
            http_status=None,
            response_time_ms=round(elapsed_ms, 1),
            metadata={"provider": "web_scrape"},
        )
|
||||
Reference in New Issue
Block a user