stonks-oracle/services/adapters/web_scrape_adapter.py

"""Web scrape adapter for curated URLs and article pages.

Fetches full article HTML from curated URLs (investor relations pages,
press releases, earnings transcripts, etc.) using httpx, then parses it with
BeautifulSoup to extract page metadata and boilerplate-stripped body text,
and hashes each page's raw content.

Inspired by Noctipede crawler patterns: BeautifulSoup + requests with retry
adapters, content hashing, boilerplate stripping, quality scoring.

Requirements: 1.2, 2.5, 3.1, 3.2, 3.3, 3.4
"""
import json
import logging
import time
from datetime import datetime, timezone
from urllib.parse import urlparse
from typing import Any

import httpx
from bs4 import BeautifulSoup

from services.shared.content import content_hash, normalize_url

from .base import AdapterResult, BaseAdapter

logger = logging.getLogger("web_scrape_adapter")

# Default request settings
DEFAULT_TIMEOUT = 30
DEFAULT_USER_AGENT = "StonksOracle/1.0 (+https://stonks-oracle.celestium.life)"
MAX_CONTENT_LENGTH = 10 * 1024 * 1024  # 10MB cap


def _json_ld_date_published(data: Any) -> str | None:
    """Return the first non-empty ``datePublished`` in parsed JSON-LD, else None.

    Inspects, in order: a top-level object, the objects listed under that
    object's ``@graph`` array, or each object of a top-level array.
    """
    if isinstance(data, dict):
        candidates = [data]
        graph = data.get("@graph")
        if isinstance(graph, list):
            candidates.extend(graph)
    elif isinstance(data, list):
        candidates = data
    else:
        return None

    for node in candidates:
        # Require a truthy value so ``"datePublished": null`` is not
        # stringified to the literal "None".
        if isinstance(node, dict) and node.get("datePublished"):
            return str(node["datePublished"])
    return None


def extract_metadata_from_html(html: str, url: str) -> dict[str, str | None]:
    """Extract page-level metadata from an HTML document.

    Args:
        html: The HTML markup to parse.
        url: The URL the markup was fetched from; used as the fallback for
            the publisher (hostname) and the canonical URL.

    Returns:
        A dict with the keys ``title``, ``author``, ``publisher``,
        ``published_at``, ``canonical_url``, ``language`` and ``description``.
        ``published_at`` is None when no date is found; the other text fields
        fall back to an empty string (``language`` falls back to ``"en"``).
    """
    soup = BeautifulSoup(html, "html.parser")
    meta: dict[str, str | None] = {}

    def meta_content(tag: Any) -> str | None:
        """Return the tag's stripped ``content`` attribute, or None if absent/empty."""
        if tag and tag.get("content"):
            value = tag["content"]
            return value.strip() if isinstance(value, str) else ""
        return None

    # Title: prefer og:title, then <title>
    title = meta_content(soup.find("meta", property="og:title"))
    if title is None:
        has_title = soup.title and soup.title.string
        title = soup.title.string.strip() if has_title else ""
    meta["title"] = title

    # Author
    author = meta_content(soup.find("meta", attrs={"name": "author"}))
    meta["author"] = author if author is not None else ""

    # Publisher: og:site_name, falling back to the URL's hostname
    publisher = meta_content(soup.find("meta", property="og:site_name"))
    if publisher is None:
        publisher = urlparse(url).hostname or ""
    meta["publisher"] = publisher

    # Published date: article:published_time, then JSON-LD datePublished
    pub_time = soup.find("meta", property="article:published_time")
    if pub_time and pub_time.get("content"):
        raw_date = pub_time["content"]
        meta["published_at"] = raw_date.strip() if isinstance(raw_date, str) else None
    else:
        published_at: str | None = None
        for script in soup.find_all("script", type="application/ld+json"):
            payload = script.string
            # Cheap substring check avoids parsing unrelated JSON-LD blocks.
            if not payload or "datePublished" not in payload:
                continue
            try:
                ld = json.loads(payload)
            except (json.JSONDecodeError, TypeError):
                # Malformed JSON-LD is common in the wild; skip this block
                # and keep looking rather than failing the whole page.
                continue
            published_at = _json_ld_date_published(ld)
            if published_at is not None:
                # First match wins; later blocks must not overwrite it.
                break
        meta["published_at"] = published_at

    # Canonical URL: <link rel="canonical">, then og:url, then the fetch URL
    canonical = soup.find("link", rel="canonical")
    if canonical and canonical.get("href"):
        meta["canonical_url"] = str(canonical["href"])
    else:
        og_url = soup.find("meta", property="og:url")
        if og_url and og_url.get("content"):
            meta["canonical_url"] = str(og_url["content"])
        else:
            meta["canonical_url"] = normalize_url(url)

    # Language: first five characters of <html lang="...">, default "en"
    html_tag = soup.find("html")
    if html_tag and html_tag.get("lang"):
        meta["language"] = str(html_tag["lang"])[:5]
    else:
        meta["language"] = "en"

    # Description for summary: og:description tag if present, else the
    # standard meta description tag
    desc_tag = soup.find("meta", property="og:description") or soup.find(
        "meta", attrs={"name": "description"}
    )
    description = meta_content(desc_tag)
    meta["description"] = description if description is not None else ""

    return meta


def extract_body_text(html: str) -> str:
    """Return the main readable text of an HTML page, one non-blank line per row.

    Script, style, navigation and similar non-content elements are removed
    first. Text is then taken from the first ``<article>``; failing that, from
    the first ``<div>`` whose class string contains a known article-body
    keyword; failing that, from ``<body>`` (or the whole document when there
    is no ``<body>``).
    """
    noise_tags = ["script", "style", "nav", "footer", "header", "aside", "iframe", "noscript"]
    body_markers = ["article-body", "post-content", "entry-content", "story-body"]

    soup = BeautifulSoup(html, "html.parser")

    # Drop elements that never carry article text
    for node in soup.find_all(noise_tags):
        node.decompose()

    def is_body_div(div) -> bool:
        """True when the div's class string contains an article-body keyword."""
        classes = div.get("class", [])
        if isinstance(classes, list):
            joined = " ".join(classes)
        elif classes:
            joined = str(classes)
        else:
            joined = ""
        return any(marker in joined for marker in body_markers)

    # Locate the article container: <article> first, then a marked <div>
    container = soup.find("article")
    if not container:
        container = next(filter(is_body_div, soup.find_all("div")), None)

    # Fallback chain: container -> <body> -> whole document
    if not container:
        container = soup.find("body")
    source = container if container else soup
    text = source.get_text(separator="\n", strip=True)

    # Collapse whitespace: trim every line and discard the empty ones
    trimmed = (row.strip() for row in text.splitlines())
    return "\n".join(row for row in trimmed if row)


class WebScrapeAdapter(BaseAdapter):
    """Adapter for fetching curated web pages and article URLs.

    Config options (from source config):
        urls: List of URLs to scrape for this company (a bare string is
            treated as a one-element list)
        url: Single URL to scrape (used only when ``urls`` is empty)
        timeout: Request timeout in seconds (default 30)
        user_agent: Custom user agent string
        follow_links: Whether to follow article links from index pages
            (default False). NOTE(review): ``fetch`` in this class does not
            read this option.
        max_pages: Max pages to fetch per cycle (default 5, clamped to 0..20)
    """

    # Bounds applied to the ``max_pages`` config option.
    _DEFAULT_MAX_PAGES = 5
    _MAX_PAGES_CAP = 20

    def __init__(self) -> None:
        pass

    def source_type(self) -> str:
        """Return the source type identifier for this adapter."""
        return "web_scrape"

    def bucket_name(self) -> str:
        """Web scrape artifacts go to the news raw bucket."""
        return "stonks-raw-news"

    async def fetch(self, ticker: str, config: dict[str, Any]) -> AdapterResult:
        """Fetch HTML from curated URLs for a given ticker.

        Supports both single URL and multi-URL configs. Each URL is fetched
        sequentially; metadata and body text are extracted per page, and a
        JSON summary of the fetched pages becomes the raw payload.

        Args:
            ticker: Ticker symbol the URLs belong to (used in results and logs).
            config: Source config; see the class docstring for the options.

        Returns:
            An AdapterResult with one item per successfully scraped page.
            Per-URL failures are collected in ``metadata["errors"]``; if no
            page succeeds (or no URL is configured) an error result is
            returned instead.
        """
        urls = config.get("urls") or []
        if isinstance(urls, str):
            # A bare string would otherwise be iterated character by character.
            urls = [urls]
        if not urls and config.get("url"):
            urls = [config["url"]]

        if not urls:
            return self._error_result(ticker, "No URLs configured for web_scrape source", 0)

        timeout = config.get("timeout", DEFAULT_TIMEOUT)
        user_agent = config.get("user_agent", DEFAULT_USER_AGENT)

        max_pages = config.get("max_pages")
        if max_pages is None:
            max_pages = self._DEFAULT_MAX_PAGES
        # Clamp at zero as well: a negative value would turn the slice below
        # into ``urls[:-n]`` and silently drop trailing URLs.
        max_pages = max(0, min(max_pages, self._MAX_PAGES_CAP))

        items: list[dict[str, Any]] = []
        total_elapsed = 0.0
        errors: list[str] = []

        async with httpx.AsyncClient(
            timeout=timeout,
            follow_redirects=True,
            headers={"User-Agent": user_agent},
        ) as client:
            for url in list(urls)[:max_pages]:
                t0 = time.monotonic()
                # Set once the response arrives; stays None if the request
                # itself fails, in which case time-to-failure is used.
                elapsed_ms: float | None = None
                try:
                    resp = await client.get(url)
                    elapsed_ms = (time.monotonic() - t0) * 1000
                    resp.raise_for_status()

                    # Content length guard. The body has already been read
                    # here; the guard keeps oversized pages out of parsing
                    # and out of the result.
                    raw_bytes = resp.content
                    if len(raw_bytes) > MAX_CONTENT_LENGTH:
                        errors.append(f"Content too large for {url}: {len(raw_bytes)} bytes")
                        continue

                    html = resp.text
                    meta = extract_metadata_from_html(html, url)
                    body_text = extract_body_text(html)

                    items.append({
                        "url": url,
                        "canonical_url": meta.get("canonical_url", normalize_url(url)),
                        "title": meta.get("title", ""),
                        "author": meta.get("author", ""),
                        "publisher": meta.get("publisher", ""),
                        "published_at": meta.get("published_at"),
                        "language": meta.get("language", "en"),
                        "description": meta.get("description", ""),
                        "content_hash": content_hash(raw_bytes),
                        "body_text": body_text,
                        "body_length": len(body_text),
                        "html_length": len(html),
                        "http_status": resp.status_code,
                        "response_time_ms": round(elapsed_ms, 1),
                    })

                except httpx.HTTPStatusError as e:
                    status = e.response.status_code if e.response is not None else None
                    errors.append(f"HTTP {status} for {url}: {e}")
                    logger.warning("Scrape HTTP error for %s/%s: %s", ticker, url, e)

                except httpx.TimeoutException as e:
                    errors.append(f"Timeout for {url}: {e}")
                    logger.warning("Scrape timeout for %s/%s: %s", ticker, url, e)

                except Exception as e:
                    # Best-effort boundary: one bad page (network or parse
                    # failure) must not abort the remaining URLs.
                    errors.append(f"Error for {url}: {e}")
                    logger.warning("Scrape error for %s/%s: %s", ticker, url, e)

                finally:
                    # Account for each URL exactly once, whichever path was
                    # taken (success, oversized ``continue``, or exception).
                    if elapsed_ms is None:
                        elapsed_ms = (time.monotonic() - t0) * 1000
                    total_elapsed += elapsed_ms

        if not items:
            error_msg = "; ".join(errors) if errors else "No pages fetched"
            return self._error_result(ticker, error_msg, total_elapsed)

        # Single timestamp so the payload and the result agree.
        fetched_at = datetime.now(timezone.utc)

        # Summarise all fetched pages into a single JSON artifact
        combined_raw = json.dumps({
            "ticker": ticker,
            "fetched_at": fetched_at.isoformat(),
            "pages": [
                {
                    "url": item["url"],
                    "content_hash": item["content_hash"],
                    "html_length": item["html_length"],
                    "body_length": item["body_length"],
                }
                for item in items
            ],
            "errors": errors,
        }).encode("utf-8")

        # Hash of the per-page hashes, in fetch order
        combined_hash = content_hash(
            b"".join(item["content_hash"].encode() for item in items)
        )

        return AdapterResult(
            source_type="web_scrape",
            ticker=ticker,
            items=items,
            raw_payload=combined_raw,
            content_hash=combined_hash,
            fetched_at=fetched_at,
            http_status=200,
            response_time_ms=round(total_elapsed, 1),
            metadata={
                "provider": "web_scrape",
                "pages_fetched": len(items),
                "pages_failed": len(errors),
                "errors": errors,
            },
        )

    def _error_result(
        self,
        ticker: str,
        error: str,
        elapsed_ms: float,
    ) -> AdapterResult:
        """Build an error AdapterResult for scrape fetches.

        Args:
            ticker: Ticker symbol the failed fetch was for.
            error: Human-readable error description.
            elapsed_ms: Total time spent before giving up, in milliseconds.

        Returns:
            An AdapterResult with no items, an empty payload and hash, and
            ``error`` set.
        """
        return AdapterResult(
            source_type="web_scrape",
            ticker=ticker,
            items=[],
            raw_payload=b"",
            content_hash="",
            fetched_at=datetime.now(timezone.utc),
            error=error,
            http_status=None,
            response_time_ms=round(elapsed_ms, 1),
            metadata={"provider": "web_scrape"},
        )