# pi_mcps/mcp/webscraper/src/server.py

"""Webscraper MCP server — fetch web pages, extract content, links, tables, sitemaps."""

import httpx
from bs4 import BeautifulSoup
from html2text import html2text
from urllib.parse import urljoin, quote_plus
from typing import List, Dict, Tuple, Annotated
import re
import ssl
import os
import certifi
from pathlib import Path
from fastmcp import FastMCP
from pydantic import Field

# MCP server instance; the @mcp.tool() decorators in this module register
# their functions on it.
mcp = FastMCP("webscraper")

# Directory scanned for extra root certificates (*.pem) that are added on top
# of the certifi bundle when the shared SSL context is built at module load.
# Resolved as <dir containing this file>/../certs, i.e. a certs/ directory that
# is a sibling of this file's parent directory.
_EXTRA_CERTS_DIR = Path(__file__).resolve().parent.parent / "certs"

def _build_ssl_context() -> ssl.SSLContext:
    """Create the SSL context used for outgoing requests.

    Starts from the certifi CA bundle and, when the extra-certs directory
    exists, additionally trusts every ``*.pem`` file found directly in it.

    Returns:
        The configured ``ssl.SSLContext``.
    """
    context = ssl.create_default_context(cafile=certifi.where())

    # Nothing more to load when no extra certificates are shipped.
    if not _EXTRA_CERTS_DIR.is_dir():
        return context

    for cert_path in _EXTRA_CERTS_DIR.glob("*.pem"):
        context.load_verify_locations(cafile=str(cert_path))
    return context

# Shared SSL context, built once at import time and reused by _fetch_page for
# every request.
_SSL_CTX = _build_ssl_context()

# Headers sent with every request: a desktop Chrome-on-Linux User-Agent string.
# NOTE(review): presumably chosen so sites serve their regular browser markup
# rather than blocking a default client UA — confirm.
_HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36"
    )
}

def _fetch_page(url: str) -> Tuple[httpx.Response, BeautifulSoup]:
    """Fetch *url* with a GET request and parse the body with the lxml parser.

    Shared helper behind every tool in this module. The request uses the
    module-level headers and SSL context and a 10 second timeout.

    Redirects are followed: httpx does not follow them by default, and
    ``raise_for_status()`` treats a bare 3xx answer as an error, so common
    http -> https or apex -> www hops would otherwise fail instead of
    returning the page.

    Args:
        url: Absolute URL to request.

    Returns:
        ``(response, soup)`` — the final response (after redirects) and a
        BeautifulSoup tree built from ``response.text``.

    Raises:
        httpx.RequestError: On transport-level failures (including too many
            redirects).
        httpx.HTTPStatusError: When the final response is not successful.
    """
    response = httpx.get(
        url,
        timeout=10.0,
        verify=_SSL_CTX,
        headers=_HEADERS,
        follow_redirects=True,
    )
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'lxml')
    return response, soup

def clean_soup(soup):
    """Strip non-content elements from *soup* in place and return it.

    Every ``script``, ``style``, ``nav``, ``footer`` and ``header`` element
    below *soup* is removed from the tree. The same object that was passed in
    is returned, so callers holding another reference see the removal too.
    """
    noise_tags = ["script", "style", "nav", "footer", "header"]
    for unwanted in soup.find_all(noise_tags):
        unwanted.decompose()
    return soup

def filter_junk_links(href: str) -> bool:
    """Return True when *href* should be kept, False when it is a junk link.

    Despite the name this is a keep-predicate: links using the ``mailto:``,
    ``javascript:``, ``tel:`` or ``data:`` schemes are rejected, everything
    else is accepted. Matching is case-insensitive and ignores surrounding
    whitespace, so values such as ``"  JavaScript:void(0)"`` are rejected too.

    Args:
        href: Link target to classify.

    Returns:
        False for junk schemes, True otherwise (including the empty string).
    """
    junk_schemes = ("mailto:", "javascript:", "tel:", "data:")
    # Strip first: a leading space or newline would otherwise hide the scheme.
    return not href.strip().lower().startswith(junk_schemes)

@mcp.tool()
def webscraper_fetch(url: Annotated[str, Field(description="The URL to fetch")], max_chars: Annotated[int, Field(description="Maximum characters in the markdown body (default: 5000)")] = 5000) -> str:
    """Fetch a URL and return title + markdown body + metadata.

    The title is read before script/style/nav/footer/header elements are
    stripped; the remaining body is converted to markdown and truncated to
    ``max_chars`` characters (with a trailing "..." when cut). A missing,
    empty or non-plain-text <title> yields "No Title".

    Returns:
        Markdown string with title, body, and metadata, or a markdown error
        message when the request fails.
    """
    try:
        response, soup = _fetch_page(url)
        # .string is None for an empty <title> or one with nested markup.
        raw_title = soup.title.string if soup.title else None
        title = (raw_title or "").strip() or "No Title"
        soup = clean_soup(soup)
        body = html2text(str(soup.body if soup.body else soup), bodywidth=0)
        # Clamp so a negative limit cannot turn into a slice from the end.
        limit = max(max_chars, 0)
        body = body[:limit] + "..." if len(body) > limit else body

        metadata = f"URL: {url}\nStatus: {response.status_code}\nContent-Type: {response.headers.get('content-type', 'unknown')}"

        return f"# {title}\n\n{body}\n\n## Metadata\n{metadata}"
    except (httpx.RequestError, httpx.HTTPStatusError) as e:
        return f"# Error fetching {url}\n\n{str(e)}"

@mcp.tool()
def webscraper_fetch_links(url: Annotated[str, Field(description="The URL to fetch")], deduplicate: Annotated[bool, Field(description="Remove duplicate links (default: True)")] = True) -> List[str]:
    """Fetch a URL and extract all href links.

    Each href is resolved against ``url`` and kept unless it uses a junk
    scheme (mailto, javascript, tel, data). Links are returned in document
    order; with ``deduplicate`` only the first occurrence of each is kept.

    Returns:
        List of absolute URLs, or a single-item list holding an "Error: ..."
        message when the request fails.
    """
    try:
        _, soup = _fetch_page(url)
        links = []
        for a in soup.find_all('a', href=True):
            full_url = urljoin(url, a['href'])
            if filter_junk_links(full_url):
                links.append(full_url)

        if deduplicate:
            # dict keys keep insertion order, so the result stays in page
            # order instead of the arbitrary order a set would produce.
            links = list(dict.fromkeys(links))

        return links
    except (httpx.RequestError, httpx.HTTPStatusError) as e:
        return [f"Error: {str(e)}"]

@mcp.tool()
def webscraper_fetch_tables(
    url: Annotated[str, Field(description="The URL to fetch")],
) -> List[str]:
    """Fetch a URL and convert every HTML table on it to markdown.

    Returns:
        One markdown string per <table> element, in document order;
        ["No tables found."] when the page has none; or a single-item list
        with an "Error: ..." message when the request fails.
    """
    try:
        _, soup = _fetch_page(url)
    except (httpx.RequestError, httpx.HTTPStatusError) as e:
        return [f"Error: {str(e)}"]

    rendered = [
        html2text(str(table_el), bodywidth=0)
        for table_el in soup.find_all('table')
    ]
    if not rendered:
        return ["No tables found."]
    return rendered

@mcp.tool()
def webscraper_fetch_all(url: Annotated[str, Field(description="The URL to fetch")], max_chars: Annotated[int, Field(description="Maximum characters (default: 5000)")] = 5000) -> Dict:
    """Fetch everything: markdown + links + tables + meta.

    Links, tables and meta are read from the page as fetched. Only afterwards
    is the tree cleaned (script/style/nav/footer/header removed) to render the
    markdown body: clean_soup() mutates the tree in place, so cleaning first
    would silently drop every link that lives in a nav, header or footer.

    Returns:
        Dict with 'markdown', 'links', 'tables', 'meta', or {'error': message}
        when the request fails.
    """
    try:
        response, soup = _fetch_page(url)
    except (httpx.RequestError, httpx.HTTPStatusError) as e:
        return {"error": str(e)}

    # Title: .string is None for an empty <title> or one with nested markup.
    raw_title = soup.title.string if soup.title else None
    title = (raw_title or "").strip() or "No Title"

    # Links (document order, first occurrence wins)
    links = []
    for a in soup.find_all('a', href=True):
        full_url = urljoin(url, a['href'])
        if filter_junk_links(full_url):
            links.append(full_url)
    links = list(dict.fromkeys(links))

    # Tables
    tables = [html2text(str(table), bodywidth=0) for table in soup.find_all('table')]
    if not tables:
        tables = ["No tables found."]

    # Meta: .get() so a <meta> tag without a content attribute falls back
    # instead of raising KeyError.
    meta = {'title': title}
    desc_tag = soup.find('meta', attrs={'name': 'description'})
    meta['description'] = desc_tag.get('content', "No description") if desc_tag else "No description"
    og_title = soup.find('meta', attrs={'property': 'og:title'})
    meta['og:title'] = og_title.get('content', title) if og_title else title
    og_desc = soup.find('meta', attrs={'property': 'og:description'})
    meta['og:description'] = og_desc.get('content', meta['description']) if og_desc else meta['description']

    # Markdown: done last because cleaning modifies the tree in place.
    soup_clean = clean_soup(soup)
    body = html2text(str(soup_clean.body if soup_clean.body else soup_clean), bodywidth=0)
    # Clamp so a negative limit cannot turn into a slice from the end.
    limit = max(max_chars, 0)
    body = body[:limit] + "..." if len(body) > limit else body
    markdown = f"# {title}\n\n{body}\n\n## Metadata\nURL: {url}\nStatus: {response.status_code}\nContent-Type: {response.headers.get('content-type', 'unknown')}"

    return {
        "markdown": markdown,
        "links": links,
        "tables": tables,
        "meta": meta
    }

@mcp.tool()
def webscraper_fetch_section(
    url: Annotated[str, Field(description="The URL to fetch")],
    selector: Annotated[str, Field(description="CSS selector (e.g., '.content')")],
) -> str:
    """Fetch a URL and return the first element matching a CSS selector as markdown.

    Script, style, nav, footer and header elements inside the matched element
    are removed before conversion.

    Returns:
        Markdown of the selected section, or a plain-text message when the
        request fails, the selector is rejected, or nothing matches.
    """
    try:
        _, soup = _fetch_page(url)
    except (httpx.RequestError, httpx.HTTPStatusError) as e:
        return f"Error: {str(e)}"

    try:
        matched = soup.select_one(selector)
    except Exception as err:
        # Only errors whose message mentions "selector" are reported as an
        # invalid selector; any other exception propagates to the caller.
        if "selector" not in str(err).lower():
            raise
        return f"Invalid CSS selector '{selector}' on {url}"

    if not matched:
        return f"No element found for selector '{selector}' on {url}"

    return html2text(str(clean_soup(matched)), bodywidth=0)

@mcp.tool()
def webscraper_fetch_meta(url: Annotated[str, Field(description="The URL to fetch")]) -> Dict[str, str]:
    """Fetch a URL and return page metadata: title, description, OG tags.

    Fallbacks: a missing, empty or non-plain-text <title> becomes "No Title";
    a missing description becomes "No description"; 'og:title' falls back to
    the title and 'og:description' to the description. A <meta> tag that is
    present but has no ``content`` attribute is treated as missing.

    Returns:
        Dict with 'title', 'description', 'og:title', 'og:description', or
        {'error': message} when the request fails.
    """
    try:
        _, soup = _fetch_page(url)
    except (httpx.RequestError, httpx.HTTPStatusError) as e:
        return {"error": str(e)}

    meta = {}
    # .string is None for an empty <title> or one with nested markup.
    raw_title = soup.title.string if soup.title else None
    meta['title'] = (raw_title or "").strip() or "No Title"

    # .get() instead of ['content']: a tag without that attribute must fall
    # back rather than raise an uncaught KeyError.
    desc_tag = soup.find('meta', attrs={'name': 'description'})
    meta['description'] = desc_tag.get('content', "No description") if desc_tag else "No description"

    og_title = soup.find('meta', attrs={'property': 'og:title'})
    meta['og:title'] = og_title.get('content', meta['title']) if og_title else meta['title']

    og_desc = soup.find('meta', attrs={'property': 'og:description'})
    meta['og:description'] = og_desc.get('content', meta['description']) if og_desc else meta['description']

    return meta

@mcp.tool()
def webscraper_fetch_sitemap(url: Annotated[str, Field(description="Sitemap URL (or auto-discover)")], max_urls: Annotated[int, Field(description="Maximum URLs to return (default: 100)")] = 100) -> List[str]:
    """Fetch sitemap.xml and return list of URLs.

    The document at ``url`` is fetched exactly as given and the text of every
    <loc> element is collected, in document order. Empty entries and entries
    equal to ``url`` itself (simple self-reference loop protection) are
    skipped before the ``max_urls`` limit is applied, so up to ``max_urls``
    usable entries are returned. A non-positive ``max_urls`` yields no URLs.

    Returns:
        List of sitemap URLs; a single-item list with a "No URLs in sitemap"
        message when none remain; or a single-item list with an "Error: ..."
        message when the request fails.
    """
    try:
        _, soup = _fetch_page(url)
    except (httpx.RequestError, httpx.HTTPStatusError) as e:
        return [f"Error: {str(e)}"]

    limit = max(max_urls, 0)
    urls = []
    for loc in soup.find_all('loc'):
        if len(urls) >= limit:
            break
        candidate = loc.text.strip()
        # Simple loop protection: never hand back the sitemap's own URL.
        if candidate and candidate != url:
            urls.append(candidate)

    return urls if urls else [f"No URLs in sitemap {url}"]

@mcp.tool()
def webscraper_search_hint(
    query: Annotated[str, Field(description="Search query (e.g. \"MacBook Pro M4 price Germany\")")],
    max_results: Annotated[int, Field(description="Maximum number of results to return (default: 5)")] = 5,
) -> Dict:
    """Search Brave Search and return top results as a scraping hint.

    Intended to be used sparingly — once per research task — to get oriented
    before scraping individual pages: it returns the top result URLs with
    snippets so the caller can decide which pages deserve a deeper scrape.

    Returns:
        Dict with 'query', 'search_url', 'results' (list of {title, url, snippet}),
        'result_count', 'hint'. On a request failure 'results' is empty and
        'hint' carries the error message.
    """
    search_url = f"https://search.brave.com/search?q={quote_plus(query)}&source=web"

    try:
        _, soup = _fetch_page(search_url)
    except (httpx.RequestError, httpx.HTTPStatusError) as e:
        return {
            "query": query,
            "search_url": search_url,
            "results": [],
            "result_count": 0,
            "hint": f"Error: {str(e)}",
        }

    # Brave Search markup (class names as of 2026-04, previously
    # .snippet-title / .snippet-description):
    #   card    → .snippet containing a .result-wrapper (web results only;
    #             video clusters, FAQ blocks and LLM answers lack the wrapper)
    #   title   → .search-snippet-title
    #   url     → a.l1 (primary result anchor; avoids the favicon <a> tags)
    #   snippet → .content.t-primary
    hits = []
    visited = set()
    for card in soup.select('.snippet'):
        if len(hits) >= max_results:
            break
        if not card.select_one('.result-wrapper'):
            continue

        heading = card.select_one('.search-snippet-title')
        anchor = card.select_one('a.l1')
        blurb = card.select_one('.content.t-primary')

        title = heading.get_text(strip=True) if heading else ""
        link = (anchor.get('href') or "") if anchor else ""
        snippet = blurb.get_text(strip=True) if blurb else ""

        # Keep only http(s) targets that carry some text and were not seen yet.
        if not (link and link.startswith('http')):
            continue
        if not (title or snippet):
            continue
        if link in visited:
            continue

        visited.add(link)
        hits.append({"title": title, "url": link, "snippet": snippet})

    # One compact line per hit: title, url and the first 120 snippet characters.
    summaries = [
        f"{hit['title']} ({hit['url']})"
        + (f": {hit['snippet'][:120]}" if hit['snippet'] else "")
        for hit in hits
    ]
    hint = " | ".join(summaries) if summaries else "No results found"

    return {
        "query": query,
        "search_url": search_url,
        "results": hits,
        "result_count": len(hits),
        "hint": hint,
    }


# When executed as a script, start the MCP server using the stdio transport.
if __name__ == "__main__":
    mcp.run(transport="stdio")