"""Webscraper MCP server — fetch web pages, extract content, links, tables, sitemaps."""

import re
from typing import Dict, List, Optional, Tuple
from urllib.parse import urljoin

import httpx
from bs4 import BeautifulSoup
from fastmcp import FastMCP
from html2text import html2text

mcp = FastMCP("webscraper")

# Tags whose content is boilerplate rather than page content.
_JUNK_TAGS = ["script", "style", "nav", "footer", "header"]

# URL schemes that never point at a fetchable page.
_JUNK_SCHEME_RE = re.compile(r'^(?:mailto|javascript|tel|data):', re.IGNORECASE)


def _fetch_page(url: str) -> Tuple[httpx.Response, BeautifulSoup]:
    """Fetch *url* and parse the response body.

    Redirects are followed so that common http->https / www redirects do not
    surface as ``HTTPStatusError`` from ``raise_for_status``.

    Args:
        url: The URL to fetch.

    Returns:
        Tuple of (response, soup parsed with the 'lxml' parser).

    Raises:
        httpx.RequestError: On connection/timeout problems.
        httpx.HTTPStatusError: On a non-success final status code.
    """
    response = httpx.get(url, timeout=10.0, follow_redirects=True)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'lxml')
    return response, soup


def clean_soup(soup):
    """Remove script, style, nav, footer and header elements from *soup*.

    NOTE: this mutates *soup* in place (the elements are decomposed) and
    returns the same object for convenience. Extract anything that may live
    inside those elements (links, tables) *before* calling this.
    """
    for element in soup(_JUNK_TAGS):
        element.decompose()
    return soup


def filter_junk_links(href: str) -> bool:
    """Return True when *href* is a real link worth keeping.

    Returns False for junk schemes: mailto, javascript, tel, data
    (matched case-insensitively at the start of the string).
    """
    return _JUNK_SCHEME_RE.match(href) is None


def _page_title(soup) -> str:
    """Return the page title text, or "No Title" when absent or empty."""
    title_tag = soup.title
    if title_tag is None:
        return "No Title"
    # .string is None for an empty <title> or one with nested markup, so use
    # get_text() and fall back explicitly.
    return title_tag.get_text(strip=True) or "No Title"


def _meta_content(soup, attrs: Dict[str, str]) -> Optional[str]:
    """Return the ``content`` of the first <meta> matching *attrs*, else None."""
    tag = soup.find('meta', attrs=attrs)
    if tag is None:
        return None
    # A <meta> tag without a content attribute must not raise KeyError.
    return tag.get('content') or None


def _extract_meta(soup) -> Dict[str, str]:
    """Collect title, description and Open Graph title/description."""
    title = _page_title(soup)
    description = _meta_content(soup, {'name': 'description'}) or "No description"
    return {
        'title': title,
        'description': description,
        'og:title': _meta_content(soup, {'property': 'og:title'}) or title,
        'og:description': _meta_content(soup, {'property': 'og:description'}) or description,
    }


def _extract_links(soup, base_url: str, deduplicate: bool = True) -> List[str]:
    """Return absolute, non-junk hrefs from *soup* in document order."""
    links = []
    for anchor in soup.find_all('a', href=True):
        full_url = urljoin(base_url, anchor['href'].strip())
        if filter_junk_links(full_url):
            links.append(full_url)
    if deduplicate:
        # dict.fromkeys keeps first-seen order, unlike set().
        links = list(dict.fromkeys(links))
    return links


def _extract_tables(soup) -> List[str]:
    """Return every <table> as markdown, or a one-item placeholder list."""
    tables = [html2text(str(table), bodywidth=0) for table in soup.find_all('table')]
    return tables if tables else ["No tables found."]


def _render_markdown(url: str, response: httpx.Response, soup, max_chars: int) -> str:
    """Render title + cleaned body + metadata as markdown.

    Mutates *soup* via clean_soup, so call this after any other extraction.
    """
    title = _page_title(soup)
    cleaned = clean_soup(soup)
    root = cleaned.body if cleaned.body is not None else cleaned
    body = html2text(str(root), bodywidth=0)
    if len(body) > max_chars:
        body = body[:max_chars] + "..."
    metadata = (
        f"URL: {url}\n"
        f"Status: {response.status_code}\n"
        f"Content-Type: {response.headers.get('content-type', 'unknown')}"
    )
    return f"# {title}\n\n{body}\n\n## Metadata\n{metadata}"


@mcp.tool()
def webscraper_fetch(url: str, max_chars: int = 5000) -> str:
    """Fetch a URL and return title + markdown body + metadata.

    Args:
        url: The URL to fetch
        max_chars: Maximum characters in the markdown body (default: 5000)

    Returns:
        Markdown string with title, body, and metadata; on a fetch error,
        a markdown error message instead.
    """
    try:
        response, soup = _fetch_page(url)
    except (httpx.RequestError, httpx.HTTPStatusError) as e:
        return f"# Error fetching {url}\n\n{str(e)}"
    return _render_markdown(url, response, soup, max_chars)


@mcp.tool()
def webscraper_fetch_links(url: str, deduplicate: bool = True) -> List[str]:
    """Fetch a URL and extract all href links.

    Args:
        url: The URL to fetch
        deduplicate: Remove duplicate links, keeping first-seen order
            (default: True)

    Returns:
        List of absolute href URLs in document order; on a fetch error,
        a one-item list containing the error message.
    """
    try:
        response, soup = _fetch_page(url)
    except (httpx.RequestError, httpx.HTTPStatusError) as e:
        return [f"Error: {str(e)}"]
    # Resolve relative links against the final URL in case of redirects.
    return _extract_links(soup, str(response.url), deduplicate)


@mcp.tool()
def webscraper_fetch_tables(url: str) -> List[str]:
    """Fetch a URL and extract all HTML tables as markdown.

    Args:
        url: The URL to fetch

    Returns:
        List of markdown tables, ["No tables found."] when there are none,
        or a one-item list containing the error message on a fetch error.
    """
    try:
        _, soup = _fetch_page(url)
    except (httpx.RequestError, httpx.HTTPStatusError) as e:
        return [f"Error: {str(e)}"]
    return _extract_tables(soup)


@mcp.tool()
def webscraper_fetch_all(url: str, max_chars: int = 5000) -> Dict:
    """Fetch everything: markdown + links + tables + meta.

    Args:
        url: The URL to fetch
        max_chars: Maximum characters in the markdown body (default: 5000)

    Returns:
        Dict with 'markdown', 'links', 'tables', 'meta'; on a fetch error,
        a dict with a single 'error' key.
    """
    try:
        response, soup = _fetch_page(url)
    except (httpx.RequestError, httpx.HTTPStatusError) as e:
        return {"error": str(e)}

    # Links, tables and meta must be read from the untouched document:
    # rendering the markdown strips nav/header/footer in place, which would
    # otherwise silently drop every link and table inside them.
    links = _extract_links(soup, str(response.url))
    tables = _extract_tables(soup)
    meta = _extract_meta(soup)
    markdown = _render_markdown(url, response, soup, max_chars)

    return {
        "markdown": markdown,
        "links": links,
        "tables": tables,
        "meta": meta,
    }


@mcp.tool()
def webscraper_fetch_section(url: str, selector: str) -> str:
    """Fetch a URL and extract specific section by CSS selector.

    Args:
        url: The URL to fetch
        selector: CSS selector (e.g., '.content')

    Returns:
        Markdown of the first element matching the selector, or a message
        describing why nothing could be returned.
    """
    try:
        _, soup = _fetch_page(url)
    except (httpx.RequestError, httpx.HTTPStatusError) as e:
        return f"Error: {str(e)}"

    try:
        section = soup.select_one(selector)
    except Exception as e:
        # The selector engine reports syntax problems with a message that
        # mentions the selector; anything else is unexpected and re-raised.
        if "selector" in str(e).lower():
            return f"Invalid CSS selector '{selector}' on {url}"
        raise

    if section is None:
        return f"No element found for selector '{selector}' on {url}"

    return html2text(str(clean_soup(section)), bodywidth=0)


@mcp.tool()
def webscraper_fetch_meta(url: str) -> Dict[str, str]:
    """Fetch a URL and return page metadata: title, description, OG tags.

    Args:
        url: The URL to fetch

    Returns:
        Dict with 'title', 'description', 'og:title', 'og:description';
        on a fetch error, a dict with a single 'error' key.
    """
    try:
        _, soup = _fetch_page(url)
    except (httpx.RequestError, httpx.HTTPStatusError) as e:
        return {"error": str(e)}
    return _extract_meta(soup)


@mcp.tool()
def webscraper_fetch_sitemap(url: str, max_urls: int = 100) -> List[str]:
    """Fetch a sitemap and return the URLs listed in its <loc> elements.

    Args:
        url: Sitemap URL (must be given explicitly; no auto-discovery)
        max_urls: Maximum URLs to return (default: 100)

    Returns:
        List of sitemap URLs, a one-item placeholder list when none are
        found, or a one-item list containing the error message on a fetch
        error.
    """
    try:
        _, soup = _fetch_page(url)
    except (httpx.RequestError, httpx.HTTPStatusError) as e:
        return [f"Error: {str(e)}"]

    locations = (loc.text.strip() for loc in soup.find_all('loc'))
    # Simple loop protection: drop self-references *before* truncating so
    # they do not eat into the max_urls budget.
    urls = [loc for loc in locations if loc != url][:max(max_urls, 0)]
    return urls if urls else [f"No URLs in sitemap {url}"]


if __name__ == "__main__":
    mcp.run(transport="stdio")