Add webscraper MCP server (7 tools, httpx+bs4, 15+ tests)
This commit is contained in:
@@ -0,0 +1,214 @@
|
||||
"""Webscraper MCP server — fetch web pages, extract content, links, tables, sitemaps."""
|
||||
|
||||
import httpx
|
||||
from bs4 import BeautifulSoup
|
||||
from html2text import html2text
|
||||
from urllib.parse import urljoin, urlparse
|
||||
from typing import List, Dict, Optional
|
||||
import re
|
||||
from fastmcp import FastMCP
|
||||
|
||||
# Module-level FastMCP server instance named "webscraper"; the @mcp.tool()
# decorators in this file attach to it and the __main__ guard runs it.
mcp = FastMCP("webscraper")
|
||||
|
||||
def clean_soup(soup):
    """Strip non-content elements from a parsed tree, in place.

    Every ``script``, ``style``, ``nav``, ``footer`` and ``header`` element
    returned by calling ``soup`` with that list of names is decomposed.

    Args:
        soup: The parsed document (or sub-tree) to clean; it is mutated.

    Returns:
        The same ``soup`` object, so the call can be chained.
    """
    unwanted = soup(["script", "style", "nav", "footer", "header"])
    for node in unwanted:
        node.decompose()
    return soup
|
||||
|
||||
def filter_junk_links(href: str) -> bool:
    """Tell whether a link should be kept (i.e. is not a junk link).

    A link is junk when it uses one of the non-navigable schemes
    ``mailto:``, ``javascript:``, ``tel:`` or ``data:``. The check is
    case-insensitive and ignores surrounding whitespace, so values such as
    ``" JavaScript:void(0)"`` are rejected as well.

    Args:
        href: Raw or resolved link target.

    Returns:
        True if the link should be kept, False if it is junk.
    """
    junk_schemes = ("mailto:", "javascript:", "tel:", "data:")
    # str.startswith accepts a tuple, so one call covers every scheme.
    return not href.strip().lower().startswith(junk_schemes)
|
||||
|
||||
@mcp.tool()
def webscraper_fetch(url: str, max_chars: int = 5000) -> str:
    """Fetch a URL and return title + markdown body + metadata.

    Redirects are followed. Script, style, nav, footer and header elements
    are removed before the body is converted to markdown.

    Args:
        url: The URL to fetch
        max_chars: Maximum characters in the markdown body (default: 5000);
            negative values are treated as 0

    Returns:
        Markdown string with title, body, and metadata. If the request
        fails, the URL is invalid, or the server answers with an error
        status, a markdown error report is returned instead of raising.
    """
    try:
        response = httpx.get(url, timeout=10.0, follow_redirects=True)
        response.raise_for_status()
    except (httpx.HTTPError, httpx.InvalidURL) as e:
        # HTTPError covers both transport failures (RequestError) and the
        # HTTPStatusError raised by raise_for_status(); InvalidURL sits
        # outside that hierarchy.
        return f"# Error fetching {url}\n\n{str(e)}"

    soup = BeautifulSoup(response.text, 'lxml')

    # .string is None for an empty <title> or one containing nested markup.
    raw_title = soup.title.string if soup.title else None
    title = (raw_title.strip() if raw_title else "") or "No Title"

    soup = clean_soup(soup)
    body = html2text(str(soup.body if soup.body else soup), bodywidth=0)

    limit = max(max_chars, 0)
    if len(body) > limit:
        body = body[:limit] + "..."

    content_type = response.headers.get('content-type', 'unknown')
    metadata = (
        f"URL: {url}\n"
        f"Status: {response.status_code}\n"
        f"Content-Type: {content_type}"
    )

    return f"# {title}\n\n{body}\n\n## Metadata\n{metadata}"
|
||||
|
||||
@mcp.tool()
def webscraper_fetch_links(url: str, deduplicate: bool = True) -> List[str]:
    """Fetch a URL and extract all href links.

    Every ``<a href>`` is resolved to an absolute URL against the final
    (post-redirect) page URL, so document-relative links such as
    ``page.html`` or ``../up`` are included. Fragment-only links, junk
    schemes (mailto, javascript, tel, data) and anything that does not
    resolve to http/https are skipped.

    Args:
        url: The URL to fetch
        deduplicate: Remove duplicate links, keeping first-seen order
            (default: True)

    Returns:
        List of absolute http(s) URLs in document order, or a single-item
        list holding an ``"Error: ..."`` message when the fetch fails.
    """
    try:
        response = httpx.get(url, timeout=10.0, follow_redirects=True)
        response.raise_for_status()
    except (httpx.HTTPError, httpx.InvalidURL) as e:
        # HTTPError also covers the HTTPStatusError from raise_for_status().
        return [f"Error: {str(e)}"]

    base_url = str(response.url)
    soup = BeautifulSoup(response.text, 'lxml')

    links = []
    for a in soup.find_all('a', href=True):
        href = a['href'].strip()
        if not href or href.startswith('#') or not filter_junk_links(href):
            continue
        try:
            full_url = urljoin(base_url, href)
            scheme = urlparse(full_url).scheme
        except ValueError:
            # Malformed hrefs (e.g. a broken IPv6 literal) are skipped.
            continue
        if scheme in ('http', 'https'):
            links.append(full_url)

    if deduplicate:
        # dict.fromkeys de-duplicates while preserving document order.
        links = list(dict.fromkeys(links))

    return links
|
||||
|
||||
@mcp.tool()
def webscraper_fetch_tables(url: str) -> List[str]:
    """Fetch a URL and extract all HTML tables as markdown.

    Redirects are followed. Each ``<table>`` element found in the page is
    converted on its own with html2text.

    Args:
        url: The URL to fetch

    Returns:
        List of markdown tables, ``["No tables found."]`` when the page has
        none, or a single-item list holding an ``"Error: ..."`` message
        when the fetch fails or the server answers with an error status.
    """
    try:
        response = httpx.get(url, timeout=10.0, follow_redirects=True)
        response.raise_for_status()
    except (httpx.HTTPError, httpx.InvalidURL) as e:
        # HTTPError also covers the HTTPStatusError from raise_for_status().
        return [f"Error: {str(e)}"]

    soup = BeautifulSoup(response.text, 'lxml')
    tables = [
        html2text(str(table), bodywidth=0)
        for table in soup.find_all('table')
    ]
    return tables if tables else ["No tables found."]
|
||||
|
||||
@mcp.tool()
def webscraper_fetch_all(url: str, max_chars: int = 5000) -> Dict:
    """Collect every view of a page in one call: markdown, links, tables, meta.

    Delegates to webscraper_fetch, webscraper_fetch_links,
    webscraper_fetch_tables and webscraper_fetch_meta, in that order.

    Args:
        url: The URL to fetch
        max_chars: Maximum characters for the markdown body (default: 5000)

    Returns:
        Dict with 'markdown', 'links', 'tables', 'meta'
    """
    # NOTE(review): each delegate issues its own HTTP request, so the page is
    # downloaded four times. This also assumes the @mcp.tool()-decorated
    # names are still plain callables -- TODO confirm for the installed
    # fastmcp version.
    return {
        "markdown": webscraper_fetch(url, max_chars),
        "links": webscraper_fetch_links(url),
        "tables": webscraper_fetch_tables(url),
        "meta": webscraper_fetch_meta(url),
    }
|
||||
|
||||
@mcp.tool()
def webscraper_fetch_section(url: str, selector: str) -> str:
    """Fetch a URL and extract specific section by CSS selector.

    Redirects are followed. Only the first element matching the selector is
    used; script, style, nav, footer and header elements inside it are
    removed before conversion to markdown.

    Args:
        url: The URL to fetch
        selector: CSS selector (e.g., '.content')

    Returns:
        Markdown of the selected section, a "No element found ..." message
        when nothing matches, or an ``"Error: ..."`` message when the fetch
        fails or the server answers with an error status.
    """
    try:
        response = httpx.get(url, timeout=10.0, follow_redirects=True)
        response.raise_for_status()
    except (httpx.HTTPError, httpx.InvalidURL) as e:
        # HTTPError also covers the HTTPStatusError from raise_for_status().
        return f"Error: {str(e)}"

    soup = BeautifulSoup(response.text, 'lxml')
    section = soup.select_one(selector)
    if section is None:
        return f"No element found for selector '{selector}' on {url}"

    return html2text(str(clean_soup(section)), bodywidth=0)
|
||||
|
||||
@mcp.tool()
def webscraper_fetch_meta(url: str) -> Dict[str, str]:
    """Fetch a URL and return page metadata: title, description, OG tags.

    Redirects are followed. Missing values fall back as follows: title ->
    "No Title", description -> "No description", og:title -> title,
    og:description -> description. A meta tag that exists but has no (or an
    empty) ``content`` attribute is treated as missing.

    Args:
        url: The URL to fetch

    Returns:
        Dict with keys 'title', 'description', 'og:title' and
        'og:description', or ``{"error": ...}`` when the fetch fails or the
        server answers with an error status.
    """
    try:
        response = httpx.get(url, timeout=10.0, follow_redirects=True)
        response.raise_for_status()
    except (httpx.HTTPError, httpx.InvalidURL) as e:
        # HTTPError also covers the HTTPStatusError from raise_for_status().
        return {"error": str(e)}

    soup = BeautifulSoup(response.text, 'lxml')

    def _meta_content(attrs: Dict[str, str]) -> Optional[str]:
        """Return the non-empty content of the first matching <meta>, else None."""
        tag = soup.find('meta', attrs=attrs)
        value = tag.get('content') if tag else None
        return value or None

    # .string is None for an empty <title> or one containing nested markup.
    raw_title = soup.title.string if soup.title else None

    meta = {}
    meta['title'] = (raw_title.strip() if raw_title else "") or "No Title"
    meta['description'] = (
        _meta_content({'name': 'description'}) or "No description"
    )
    meta['og:title'] = _meta_content({'property': 'og:title'}) or meta['title']
    meta['og:description'] = (
        _meta_content({'property': 'og:description'}) or meta['description']
    )
    return meta
|
||||
|
||||
@mcp.tool()
def webscraper_fetch_sitemap(url: str, max_urls: int = 100) -> List[str]:
    """Fetch a sitemap XML document and return the URLs it lists.

    Redirects are followed. The text of every ``<loc>`` element is
    collected. No sitemap discovery or recursion into nested sitemaps is
    performed; ``url`` must point at the sitemap itself.

    Args:
        url: Sitemap URL
        max_urls: Maximum URLs to return (default: 100); negative values
            are treated as 0

    Returns:
        Up to ``max_urls`` URLs in document order, excluding empty entries
        and entries equal to ``url`` itself. Returns a single-item list
        with a "No URLs in sitemap ..." message when nothing remains, or an
        ``"Error: ..."`` message when the fetch fails or the server answers
        with an error status.
    """
    try:
        response = httpx.get(url, timeout=10.0, follow_redirects=True)
        response.raise_for_status()
    except (httpx.HTTPError, httpx.InvalidURL) as e:
        # HTTPError also covers the HTTPStatusError from raise_for_status().
        return [f"Error: {str(e)}"]

    soup = BeautifulSoup(response.text, 'xml')
    locations = (loc.text.strip() for loc in soup.find_all('loc'))

    # Drop self-references before applying the cap, so the caller still
    # receives up to max_urls usable entries.
    limit = max(max_urls, 0)
    urls = [entry for entry in locations if entry and entry != url][:limit]

    return urls if urls else [f"No URLs in sitemap {url}"]
|
||||
|
||||
if __name__ == "__main__":
    # Script entry point: run the server using the stdio transport.
    mcp.run(transport="stdio")
|
||||
Reference in New Issue
Block a user