Add webscraper MCP server (7 tools, httpx+bs4, 15+ tests)

2026-04-03 13:40:50 +02:00
parent 6623fe0337
commit 38a2b89bd3
9 changed files with 2266 additions and 0 deletions
@@ -0,0 +1,214 @@
+"""Webscraper MCP server — fetch web pages, extract content, links, tables, sitemaps."""
+
+import httpx
+from bs4 import BeautifulSoup
+from html2text import html2text
+from urllib.parse import urljoin, urlparse
+from typing import List, Dict, Optional
+import re
+from fastmcp import FastMCP
+
+mcp = FastMCP("webscraper")  # Server instance named "webscraper"; the @mcp.tool() decorators in this module register their functions on it.
+
+def clean_soup(soup):
+    """Strip non-content elements from a parsed document, in place.
+
+    Every <script>, <style>, <nav>, <footer> and <header> element found
+    beneath ``soup`` is destroyed with ``decompose()``.
+
+    Args:
+        soup: A parsed BeautifulSoup document or tag.
+
+    Returns:
+        The same object that was passed in, after mutation.
+    """
+    unwanted_tags = ["script", "style", "nav", "footer", "header"]
+    for node in soup.find_all(unwanted_tags):
+        node.decompose()
+    return soup
+
+def filter_junk_links(href: str) -> bool:
+    """Tell whether a link should be kept (True) or discarded as junk (False).
+
+    A link is junk when it uses one of the non-navigational schemes
+    ``mailto:``, ``javascript:``, ``tel:`` or ``data:``. The check ignores
+    letter case and surrounding whitespace.
+
+    Args:
+        href: Raw or resolved link target.
+
+    Returns:
+        False for junk links, True for everything else (including "").
+    """
+    junk_schemes = ("mailto:", "javascript:", "tel:", "data:")
+    # Strip first: hrefs in real markup often carry stray leading whitespace,
+    # which would otherwise let a junk scheme slip past a start-of-string test.
+    normalized = href.strip().lower()
+    return not normalized.startswith(junk_schemes)
+
+@mcp.tool()
+def webscraper_fetch(url: str, max_chars: int = 5000) -> str:
+    """Fetch a URL and return title + markdown body + metadata.
+
+    Args:
+        url: The URL to fetch
+        max_chars: Maximum characters in the markdown body (default: 5000);
+            negative values are treated as 0
+
+    Returns:
+        Markdown string with title, body, and metadata. On any HTTP failure
+        (connection problem, timeout, or non-success status) a markdown
+        error message is returned instead of raising.
+    """
+    try:
+        # Redirects are followed because raise_for_status() would otherwise
+        # reject ordinary 301/302 responses.
+        response = httpx.get(url, timeout=10.0, follow_redirects=True)
+        response.raise_for_status()
+    except httpx.HTTPError as e:
+        # HTTPError covers both transport errors (RequestError) and the
+        # HTTPStatusError raised by raise_for_status().
+        return f"# Error fetching {url}\n\n{str(e)}"
+
+    soup = BeautifulSoup(response.text, 'lxml')
+
+    # <title> may be missing or empty; never emit "None" as the heading.
+    title_tag = soup.title
+    title = title_tag.get_text(strip=True) if title_tag else ""
+    if not title:
+        title = "No Title"
+
+    soup = clean_soup(soup)
+    body = html2text(str(soup.body if soup.body else soup), bodywidth=0)
+
+    limit = max(max_chars, 0)
+    if len(body) > limit:
+        body = body[:limit] + "..."
+
+    content_type = response.headers.get('content-type', 'unknown')
+    metadata = f"URL: {url}\nStatus: {response.status_code}\nContent-Type: {content_type}"
+
+    return f"# {title}\n\n{body}\n\n## Metadata\n{metadata}"
+
+@mcp.tool()
+def webscraper_fetch_links(url: str, deduplicate: bool = True) -> List[str]:
+    """Fetch a URL and extract all http(s) links from its <a href> tags.
+
+    Relative hrefs are resolved against the final (post-redirect) page URL.
+    Fragment-only links, junk schemes (mailto, javascript, tel, data) and
+    any other non-http(s) targets are skipped.
+
+    Args:
+        url: The URL to fetch
+        deduplicate: Remove duplicate links, keeping first-seen order
+            (default: True)
+
+    Returns:
+        List of absolute URLs in document order, or a single-element list
+        containing an "Error: ..." message when the request fails.
+    """
+    try:
+        response = httpx.get(url, timeout=10.0, follow_redirects=True)
+        response.raise_for_status()
+    except httpx.HTTPError as e:
+        # HTTPError also covers the HTTPStatusError from raise_for_status().
+        return [f"Error: {str(e)}"]
+
+    soup = BeautifulSoup(response.text, 'lxml')
+    base_url = str(response.url)
+
+    links = []
+    for a in soup.find_all('a', href=True):
+        href = a['href'].strip()
+        if not href or href.startswith('#') or not filter_junk_links(href):
+            continue
+        try:
+            full_url = urljoin(base_url, href)
+            scheme = urlparse(full_url).scheme
+        except ValueError:
+            # Malformed href (e.g. a broken IPv6 literal); skip it.
+            continue
+        if scheme in ('http', 'https'):
+            links.append(full_url)
+
+    if deduplicate:
+        # dict.fromkeys de-duplicates while preserving document order.
+        links = list(dict.fromkeys(links))
+
+    return links
+
+@mcp.tool()
+def webscraper_fetch_tables(url: str) -> List[str]:
+    """Fetch a URL and extract all HTML tables as markdown.
+
+    Args:
+        url: The URL to fetch
+
+    Returns:
+        List with one html2text rendering per <table> element,
+        ["No tables found."] when the page has none, or a single-element
+        list containing an "Error: ..." message when the request fails.
+    """
+    try:
+        response = httpx.get(url, timeout=10.0, follow_redirects=True)
+        response.raise_for_status()
+    except httpx.HTTPError as e:
+        # HTTPError also covers the HTTPStatusError from raise_for_status().
+        return [f"Error: {str(e)}"]
+
+    soup = BeautifulSoup(response.text, 'lxml')
+    tables = [
+        html2text(str(table), bodywidth=0)
+        for table in soup.find_all('table')
+    ]
+    return tables if tables else ["No tables found."]
+
+@mcp.tool()
+def webscraper_fetch_all(url: str, max_chars: int = 5000) -> Dict:
+    """Run every single-purpose scraper against one URL and bundle the results.
+
+    Each of the four helpers is invoked separately with the same ``url``.
+
+    NOTE(review): these are direct calls to functions that are themselves
+    wrapped by ``@mcp.tool()``; confirm the installed fastmcp version leaves
+    decorated functions callable.
+
+    Args:
+        url: The URL to fetch
+        max_chars: Passed through to webscraper_fetch (default: 5000)
+
+    Returns:
+        Dict with the keys 'markdown', 'links', 'tables' and 'meta'
+    """
+    # Dict values are evaluated top to bottom, so the helpers run in the
+    # order listed here.
+    return {
+        "markdown": webscraper_fetch(url, max_chars),
+        "links": webscraper_fetch_links(url),
+        "tables": webscraper_fetch_tables(url),
+        "meta": webscraper_fetch_meta(url),
+    }
+
+@mcp.tool()
+def webscraper_fetch_section(url: str, selector: str) -> str:
+    """Fetch a URL and extract specific section by CSS selector.
+
+    Only the first element matching ``selector`` is used. Script, style,
+    nav, footer and header elements inside it are removed before conversion.
+
+    Args:
+        url: The URL to fetch
+        selector: CSS selector (e.g., '.content')
+
+    Returns:
+        Markdown of the selected section, a "No element found ..." message
+        when nothing matches, or an "Error: ..." message when the request
+        fails.
+    """
+    try:
+        response = httpx.get(url, timeout=10.0, follow_redirects=True)
+        response.raise_for_status()
+    except httpx.HTTPError as e:
+        # HTTPError also covers the HTTPStatusError from raise_for_status().
+        return f"Error: {str(e)}"
+
+    soup = BeautifulSoup(response.text, 'lxml')
+    section = soup.select_one(selector)
+    if not section:
+        return f"No element found for selector '{selector}' on {url}"
+
+    cleaned = clean_soup(section)
+    return html2text(str(cleaned), bodywidth=0)
+
+def _meta_content(soup, attrs: Dict[str, str], default: str) -> str:
+    """Return the non-empty ``content`` of the first matching <meta>, else ``default``."""
+    tag = soup.find('meta', attrs=attrs)
+    # .get() avoids a KeyError for <meta> tags that lack a content attribute.
+    value = tag.get('content') if tag else None
+    return value if value else default
+
+@mcp.tool()
+def webscraper_fetch_meta(url: str) -> Dict[str, str]:
+    """Fetch a URL and return page metadata: title, description, OG tags.
+
+    Args:
+        url: The URL to fetch
+
+    Returns:
+        Dict with the keys 'title', 'description', 'og:title' and
+        'og:description'. Missing Open Graph values fall back to the plain
+        title/description. When the request fails the dict contains a
+        single 'error' key instead.
+    """
+    try:
+        response = httpx.get(url, timeout=10.0, follow_redirects=True)
+        response.raise_for_status()
+    except httpx.HTTPError as e:
+        # HTTPError also covers the HTTPStatusError from raise_for_status().
+        return {"error": str(e)}
+
+    soup = BeautifulSoup(response.text, 'lxml')
+
+    # <title> may be missing or empty; never store None as the title.
+    title_tag = soup.title
+    title = title_tag.get_text(strip=True) if title_tag else ""
+
+    meta = {}
+    meta['title'] = title or "No Title"
+    meta['description'] = _meta_content(soup, {'name': 'description'}, "No description")
+    meta['og:title'] = _meta_content(soup, {'property': 'og:title'}, meta['title'])
+    meta['og:description'] = _meta_content(
+        soup, {'property': 'og:description'}, meta['description']
+    )
+    return meta
+
+@mcp.tool()
+def webscraper_fetch_sitemap(url: str, max_urls: int = 100) -> List[str]:
+    """Fetch a sitemap XML document and return the URLs it lists.
+
+    The text of every <loc> element is collected. Entries equal to ``url``
+    itself are dropped so a caller that follows the results does not loop
+    back to the same sitemap.
+
+    Args:
+        url: Full URL of the sitemap document (no auto-discovery is done)
+        max_urls: Maximum URLs to return (default: 100); values below 0
+            are treated as 0
+
+    Returns:
+        List of sitemap URLs, a single-element "No URLs in sitemap ..."
+        list when nothing is found, or a single-element "Error: ..." list
+        when the request fails.
+    """
+    try:
+        response = httpx.get(url, timeout=10.0, follow_redirects=True)
+        response.raise_for_status()
+    except httpx.HTTPError as e:
+        # HTTPError also covers the HTTPStatusError from raise_for_status().
+        return [f"Error: {str(e)}"]
+
+    soup = BeautifulSoup(response.text, 'xml')
+    candidates = (loc.text.strip() for loc in soup.find_all('loc'))
+
+    # Filter before truncating so self-references and blank entries do not
+    # use up slots in the max_urls budget.
+    urls = [entry for entry in candidates if entry and entry != url]
+    urls = urls[:max(max_urls, 0)]
+
+    return urls if urls else [f"No URLs in sitemap {url}"]
+
+if __name__ == "__main__":  # Only start the server when this file is executed as a script.
+    mcp.run(transport="stdio")  # Run the server with the "stdio" transport.