242 lines
8.1 KiB
Python
242 lines
8.1 KiB
Python
"""Webscraper MCP server — fetch web pages, extract content, links, tables, sitemaps."""
|
|
|
|
import httpx
|
|
from bs4 import BeautifulSoup
|
|
from html2text import html2text
|
|
from urllib.parse import urljoin
|
|
from typing import List, Dict, Tuple
|
|
import re
|
|
from fastmcp import FastMCP
|
|
|
|
# Shared FastMCP server instance; each @mcp.tool() decorator in this file registers a tool on it.
mcp = FastMCP("webscraper")
|
|
|
|
def _fetch_page(url: str) -> Tuple[httpx.Response, BeautifulSoup]:
    """Fetch *url* and parse the response body into a BeautifulSoup tree.

    Redirects are followed explicitly: httpx does not follow them by
    default, so without this a URL that redirects (http -> https, an added
    trailing slash, ...) would never resolve to the real page.

    Args:
        url: The URL to fetch.

    Returns:
        A ``(response, soup)`` tuple holding the final httpx response and the
        tree parsed from ``response.text`` with the ``lxml`` parser.

    Raises:
        httpx.RequestError: On transport-level failures such as timeouts.
        httpx.HTTPStatusError: When ``raise_for_status()`` rejects the
            final response.
    """
    response = httpx.get(url, timeout=10.0, follow_redirects=True)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'lxml')
    return response, soup
|
|
|
|
def clean_soup(soup):
    """Strip non-content elements from *soup* in place and return it.

    Every ``script``, ``style``, ``nav``, ``footer`` and ``header`` element
    returned by calling *soup* is removed with ``decompose()``. The object
    passed in is the object returned, so any other reference to it sees the
    removals as well.
    """
    junk_tags = ["script", "style", "nav", "footer", "header"]
    doomed = soup(junk_tags)
    for node in doomed:
        node.decompose()
    return soup
|
|
|
|
def filter_junk_links(href: str) -> bool:
    """Return True when *href* should be kept, False when it is junk.

    A link counts as junk when, compared case-insensitively, it starts with
    one of the ``mailto:``, ``javascript:``, ``tel:`` or ``data:`` schemes.
    """
    junk_patterns = [r'^mailto:', r'^javascript:', r'^tel:', r'^data:']
    lowered = href.lower()
    for pattern in junk_patterns:
        if re.match(pattern, lowered):
            return False
    return True
|
|
|
|
@mcp.tool()
def webscraper_fetch(url: str, max_chars: int = 5000) -> str:
    """Fetch a URL and return title + markdown body + metadata.

    Args:
        url: The URL to fetch
        max_chars: Maximum characters in the markdown body (default: 5000).
            Negative values are treated as 0.

    Returns:
        Markdown string with title, body, and metadata. When the request
        fails or the response status is rejected, a markdown error report
        is returned instead of raising.
    """
    # Only the fetch can raise the httpx errors handled here, so keep the
    # try body limited to it.
    try:
        response, soup = _fetch_page(url)
    except (httpx.RequestError, httpx.HTTPStatusError) as e:
        return f"# Error fetching {url}\n\n{str(e)}"

    # `.string` is None for an empty <title> or one with nested markup, which
    # would render as "# None"; get_text() always yields a string.
    title = soup.title.get_text(strip=True) if soup.title else ""
    title = title or "No Title"

    soup = clean_soup(soup)
    body = html2text(str(soup.body if soup.body else soup), bodywidth=0)

    # A negative limit would slice from the end of the body; clamp it.
    max_chars = max(max_chars, 0)
    if len(body) > max_chars:
        body = body[:max_chars] + "..."

    metadata = f"URL: {url}\nStatus: {response.status_code}\nContent-Type: {response.headers.get('content-type', 'unknown')}"

    return f"# {title}\n\n{body}\n\n## Metadata\n{metadata}"
|
|
|
|
@mcp.tool()
def webscraper_fetch_links(url: str, deduplicate: bool = True) -> List[str]:
    """Fetch a URL and extract all href links.

    Each ``<a href>`` is resolved against *url* and kept unless
    ``filter_junk_links`` rejects it. Links are returned in document order.

    Args:
        url: The URL to fetch
        deduplicate: Remove duplicate links, keeping the first occurrence
            of each (default: True)

    Returns:
        List of absolute href URLs (unique when *deduplicate* is true). On a
        request or HTTP-status failure, a single-element list holding an
        ``"Error: ..."`` message.
    """
    try:
        _, soup = _fetch_page(url)
    except (httpx.RequestError, httpx.HTTPStatusError) as e:
        return [f"Error: {str(e)}"]

    links = []
    for a in soup.find_all('a', href=True):
        try:
            full_url = urljoin(url, a['href'])
        except ValueError:
            # urljoin rejects some malformed hrefs (e.g. a broken IPv6
            # literal); skip the link rather than fail the whole call.
            continue
        if filter_junk_links(full_url):
            links.append(full_url)

    if deduplicate:
        # dict.fromkeys keeps first-seen order, unlike set(), so the result
        # is deterministic and follows the page.
        links = list(dict.fromkeys(links))

    return links
|
|
|
|
@mcp.tool()
def webscraper_fetch_tables(url: str) -> List[str]:
    """Fetch a URL and render every ``<table>`` element as markdown.

    Args:
        url: The URL to fetch

    Returns:
        One html2text rendering per table found, or a single-element list
        with a "No tables found." message. On a request or HTTP-status
        failure, a single-element list holding an ``"Error: ..."`` message.
    """
    try:
        _, page = _fetch_page(url)
        rendered = [
            html2text(str(node), bodywidth=0)
            for node in page.find_all('table')
        ]
        if not rendered:
            return ["No tables found."]
        return rendered
    except (httpx.RequestError, httpx.HTTPStatusError) as e:
        return [f"Error: {str(e)}"]
|
|
|
|
@mcp.tool()
def webscraper_fetch_all(url: str, max_chars: int = 5000) -> Dict:
    """Fetch everything: markdown + links + tables + meta.

    Links, tables and meta are read from the page as fetched; only the
    markdown body is produced from the cleaned tree.

    Args:
        url: The URL to fetch
        max_chars: Maximum characters in the markdown body (default: 5000).
            Negative values are treated as 0.

    Returns:
        Dict with 'markdown', 'links', 'tables', 'meta'. On a request or
        HTTP-status failure, a dict with a single 'error' key.
    """
    try:
        response, soup = _fetch_page(url)
    except (httpx.RequestError, httpx.HTTPStatusError) as e:
        return {"error": str(e)}

    # `.string` is None for an empty <title> or one with nested markup;
    # get_text() always yields a string.
    title = soup.title.get_text(strip=True) if soup.title else ""
    title = title or "No Title"

    # NOTE: clean_soup() removes nav/header/footer/script/style from the
    # tree in place, so everything that must see the full page (links,
    # tables, meta) is extracted BEFORE the markdown step below.

    # Links (document order, first occurrence kept)
    links = []
    for a in soup.find_all('a', href=True):
        try:
            full_url = urljoin(url, a['href'])
        except ValueError:
            # Malformed href that urljoin cannot parse; skip it.
            continue
        if filter_junk_links(full_url):
            links.append(full_url)
    links = list(dict.fromkeys(links))

    # Tables
    tables = [
        html2text(str(table), bodywidth=0)
        for table in soup.find_all('table')
    ]
    if not tables:
        tables = ["No tables found."]

    # Meta — .get() so a <meta> tag without a content attribute falls back
    # to the default instead of raising KeyError.
    meta = {'title': title}
    desc_tag = soup.find('meta', attrs={'name': 'description'})
    meta['description'] = (desc_tag.get('content') if desc_tag else None) or "No description"
    og_title = soup.find('meta', attrs={'property': 'og:title'})
    meta['og:title'] = (og_title.get('content') if og_title else None) or title
    og_desc = soup.find('meta', attrs={'property': 'og:description'})
    meta['og:description'] = (og_desc.get('content') if og_desc else None) or meta['description']

    # Markdown — last, because cleaning mutates the tree.
    soup_clean = clean_soup(soup)
    body = html2text(str(soup_clean.body if soup_clean.body else soup_clean), bodywidth=0)
    max_chars = max(max_chars, 0)  # a negative limit would slice from the end
    if len(body) > max_chars:
        body = body[:max_chars] + "..."
    markdown = f"# {title}\n\n{body}\n\n## Metadata\nURL: {url}\nStatus: {response.status_code}\nContent-Type: {response.headers.get('content-type', 'unknown')}"

    return {
        "markdown": markdown,
        "links": links,
        "tables": tables,
        "meta": meta,
    }
|
|
|
|
@mcp.tool()
def webscraper_fetch_section(url: str, selector: str) -> str:
    """Fetch a URL and extract specific section by CSS selector.

    Args:
        url: The URL to fetch
        selector: CSS selector (e.g., '.content')

    Returns:
        Markdown of the first element matching *selector*, after script,
        style, nav, footer and header descendants have been removed. A plain
        message string is returned instead when the selector is rejected,
        when nothing matches, or when the request fails.
    """
    try:
        _, soup = _fetch_page(url)
    except (httpx.RequestError, httpx.HTTPStatusError) as e:
        return f"Error: {str(e)}"

    # The try body is the selector evaluation only, so any failure here is
    # attributable to the selector. Matching on the error text instead
    # (looking for the word "selector") is fragile: an error whose message
    # lacks that word would be re-raised out of the tool.
    try:
        section = soup.select_one(selector)
    except Exception:
        return f"Invalid CSS selector '{selector}' on {url}"

    if section is None:
        return f"No element found for selector '{selector}' on {url}"

    return html2text(str(clean_soup(section)), bodywidth=0)
|
|
|
|
@mcp.tool()
def webscraper_fetch_meta(url: str) -> Dict[str, str]:
    """Fetch a URL and return page metadata: title, description, OG tags.

    Args:
        url: The URL to fetch

    Returns:
        Dict with 'title', 'description', 'og:title' and 'og:description'.
        A missing title becomes "No Title" and a missing description
        "No description"; the OG values fall back to the plain title and
        description. On a request or HTTP-status failure, a dict with a
        single 'error' key.
    """
    try:
        _, soup = _fetch_page(url)
    except (httpx.RequestError, httpx.HTTPStatusError) as e:
        return {"error": str(e)}

    meta = {}

    # `.string` is None for an empty <title> or one with nested markup, which
    # would put None into a Dict[str, str]; get_text() always yields a string.
    title = soup.title.get_text(strip=True) if soup.title else ""
    meta['title'] = title or "No Title"

    # .get() so a <meta> tag without a content attribute falls back to the
    # default instead of raising KeyError.
    desc_tag = soup.find('meta', attrs={'name': 'description'})
    meta['description'] = (desc_tag.get('content') if desc_tag else None) or "No description"

    og_title = soup.find('meta', attrs={'property': 'og:title'})
    meta['og:title'] = (og_title.get('content') if og_title else None) or meta['title']

    og_desc = soup.find('meta', attrs={'property': 'og:description'})
    meta['og:description'] = (og_desc.get('content') if og_desc else None) or meta['description']

    return meta
|
|
|
|
@mcp.tool()
def webscraper_fetch_sitemap(url: str, max_urls: int = 100) -> List[str]:
    """Fetch sitemap.xml and return list of URLs.

    The text of each ``<loc>`` element is collected in document order.
    Entries equal to *url* itself are skipped (simple self-reference
    protection), as are empty entries.

    Args:
        url: URL of the sitemap document itself; no auto-discovery is
            performed.
        max_urls: Maximum URLs to return (default: 100). Negative values
            are treated as 0.

    Returns:
        Up to *max_urls* sitemap URLs, or a single-element list with a
        "No URLs in sitemap ..." message when none are found. On a request
        or HTTP-status failure, a single-element list holding an
        ``"Error: ..."`` message.
    """
    try:
        _, soup = _fetch_page(url)
    except (httpx.RequestError, httpx.HTTPStatusError) as e:
        return [f"Error: {str(e)}"]

    # A negative limit would otherwise drop entries from the end.
    limit = max(max_urls, 0)

    urls = []
    for loc in soup.find_all('loc'):
        if len(urls) >= limit:
            break
        entry = loc.text.strip()
        # Filter before counting so self-references do not use up the limit.
        if not entry or entry == url:
            continue
        urls.append(entry)

    return urls if urls else [f"No URLs in sitemap {url}"]
|
|
|
|
if __name__ == "__main__":
    # Executed as a script: serve the registered tools over the stdio transport.
    mcp.run(transport="stdio")
|