"""Webscraper MCP server — fetch web pages, extract content, links, tables, sitemaps.""" import httpx from bs4 import BeautifulSoup from html2text import html2text from urllib.parse import urljoin, quote_plus from typing import List, Dict, Tuple, Annotated import re import ssl import os import certifi from pathlib import Path from fastmcp import FastMCP from pydantic import Field mcp = FastMCP("webscraper") # Build a single SSL context at module load — certifi bundle + any extra certs # shipped in the certs/ directory next to this file. _EXTRA_CERTS_DIR = Path(__file__).resolve().parent.parent / "certs" def _build_ssl_context() -> ssl.SSLContext: """Build an SSL context from certifi + extra bundled root certs.""" ctx = ssl.create_default_context(cafile=certifi.where()) if _EXTRA_CERTS_DIR.is_dir(): for pem in _EXTRA_CERTS_DIR.glob("*.pem"): ctx.load_verify_locations(cafile=str(pem)) return ctx _SSL_CTX = _build_ssl_context() _HEADERS = { "User-Agent": ( "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 " "(KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36" ) } def _fetch_page(url: str) -> Tuple[httpx.Response, BeautifulSoup]: """Shared fetch helper — returns response and parsed soup.""" response = httpx.get(url, timeout=10.0, verify=_SSL_CTX, headers=_HEADERS) response.raise_for_status() soup = BeautifulSoup(response.text, 'lxml') return response, soup def clean_soup(soup): """Remove script, style, and other junk from soup before extraction.""" for element in soup(["script", "style", "nav", "footer", "header"]): element.decompose() return soup def filter_junk_links(href: str) -> bool: """Filter out junk links: mailto, javascript, tel, data.""" junk_patterns = [r'^mailto:', r'^javascript:', r'^tel:', r'^data:'] return not any(re.match(pattern, href.lower()) for pattern in junk_patterns) @mcp.tool() def webscraper_fetch(url: Annotated[str, Field(description="The URL to fetch")], max_chars: Annotated[int, Field(description="Maximum characters in the markdown body (default: 5000)")] = 5000) -> str: """Fetch a URL and return title + markdown body + metadata. Returns: Markdown string with title, body, and metadata """ try: response, soup = _fetch_page(url) title = soup.title.string if soup.title else "No Title" soup = clean_soup(soup) body = html2text(str(soup.body if soup.body else soup), bodywidth=0) body = body[:max_chars] + "..." if len(body) > max_chars else body metadata = f"URL: {url}\nStatus: {response.status_code}\nContent-Type: {response.headers.get('content-type', 'unknown')}" return f"# {title}\n\n{body}\n\n## Metadata\n{metadata}" except (httpx.RequestError, httpx.HTTPStatusError) as e: return f"# Error fetching {url}\n\n{str(e)}" @mcp.tool() def webscraper_fetch_links(url: Annotated[str, Field(description="The URL to fetch")], deduplicate: Annotated[bool, Field(description="Remove duplicate links (default: True)")] = True) -> List[str]: """Fetch a URL and extract all href links. Returns: List of unique href URLs """ try: _, soup = _fetch_page(url) links = [] for a in soup.find_all('a', href=True): href = a['href'] full_url = urljoin(url, href) if filter_junk_links(full_url): links.append(full_url) if deduplicate: links = list(set(links)) return links except (httpx.RequestError, httpx.HTTPStatusError) as e: return [f"Error: {str(e)}"] @mcp.tool() def webscraper_fetch_tables(url: Annotated[str, Field(description="The URL to fetch")]) -> List[str]: """Fetch a URL and extract all HTML tables as markdown. Returns: List of markdown tables """ try: _, soup = _fetch_page(url) tables = [] for table in soup.find_all('table'): markdown_table = html2text(str(table), bodywidth=0) tables.append(markdown_table) return tables if tables else ["No tables found."] except (httpx.RequestError, httpx.HTTPStatusError) as e: return [f"Error: {str(e)}"] @mcp.tool() def webscraper_fetch_all(url: Annotated[str, Field(description="The URL to fetch")], max_chars: Annotated[int, Field(description="Maximum characters (default: 5000)")] = 5000) -> Dict: """Fetch everything: markdown + links + tables + meta. Returns: Dict with 'markdown', 'links', 'tables', 'meta' """ try: response, soup = _fetch_page(url) # Markdown title = soup.title.string if soup.title else "No Title" soup_clean = clean_soup(soup) body = html2text(str(soup_clean.body if soup_clean.body else soup_clean), bodywidth=0) body = body[:max_chars] + "..." if len(body) > max_chars else body markdown = f"# {title}\n\n{body}\n\n## Metadata\nURL: {url}\nStatus: {response.status_code}\nContent-Type: {response.headers.get('content-type', 'unknown')}" # Links links = [] for a in soup.find_all('a', href=True): href = a['href'] full_url = urljoin(url, href) if filter_junk_links(full_url): links.append(full_url) links = list(set(links)) # Tables tables = [] for table in soup.find_all('table'): markdown_table = html2text(str(table), bodywidth=0) tables.append(markdown_table) tables = tables if tables else ["No tables found."] # Meta meta = {} meta['title'] = title desc_tag = soup.find('meta', attrs={'name': 'description'}) meta['description'] = desc_tag['content'] if desc_tag else "No description" og_title = soup.find('meta', attrs={'property': 'og:title'}) meta['og:title'] = og_title['content'] if og_title else title og_desc = soup.find('meta', attrs={'property': 'og:description'}) meta['og:description'] = og_desc['content'] if og_desc else meta['description'] return { "markdown": markdown, "links": links, "tables": tables, "meta": meta } except (httpx.RequestError, httpx.HTTPStatusError) as e: return {"error": str(e)} @mcp.tool() def webscraper_fetch_section(url: Annotated[str, Field(description="The URL to fetch")], selector: Annotated[str, Field(description="CSS selector (e.g., '.content')")]) -> str: """Fetch a URL and extract specific section by CSS selector. Returns: Markdown of the selected section """ try: _, soup = _fetch_page(url) try: section = soup.select_one(selector) except Exception as e: if "selector" in str(e).lower(): return f"Invalid CSS selector '{selector}' on {url}" raise if not section: return f"No element found for selector '{selector}' on {url}" soup_clean = clean_soup(section) markdown = html2text(str(soup_clean), bodywidth=0) return markdown except (httpx.RequestError, httpx.HTTPStatusError) as e: return f"Error: {str(e)}" @mcp.tool() def webscraper_fetch_meta(url: Annotated[str, Field(description="The URL to fetch")]) -> Dict[str, str]: """Fetch a URL and return page metadata: title, description, OG tags. Returns: Dict of metadata """ try: _, soup = _fetch_page(url) meta = {} meta['title'] = soup.title.string if soup.title else "No Title" desc_tag = soup.find('meta', attrs={'name': 'description'}) meta['description'] = desc_tag['content'] if desc_tag else "No description" og_title = soup.find('meta', attrs={'property': 'og:title'}) meta['og:title'] = og_title['content'] if og_title else meta['title'] og_desc = soup.find('meta', attrs={'property': 'og:description'}) meta['og:description'] = og_desc['content'] if og_desc else meta['description'] return meta except (httpx.RequestError, httpx.HTTPStatusError) as e: return {"error": str(e)} @mcp.tool() def webscraper_fetch_sitemap(url: Annotated[str, Field(description="Sitemap URL (or auto-discover)")], max_urls: Annotated[int, Field(description="Maximum URLs to return (default: 100)")] = 100) -> List[str]: """Fetch sitemap.xml and return list of URLs. Returns: List of sitemap URLs """ try: response, soup = _fetch_page(url) urls = [] for loc in soup.find_all('loc')[:max_urls]: urls.append(loc.text.strip()) # Simple loop protection: check for self-reference if url in urls: urls.remove(url) return urls if urls else [f"No URLs in sitemap {url}"] except (httpx.RequestError, httpx.HTTPStatusError) as e: return [f"Error: {str(e)}"] @mcp.tool() def webscraper_search_hint(query: Annotated[str, Field(description="Search query (e.g. \"MacBook Pro M4 price Germany\")")], max_results: Annotated[int, Field(description="Maximum number of results to return (default: 5)")] = 5) -> Dict: """Search Brave Search and return top results as a scraping hint. Use this sparingly — once per research task — to get oriented before scraping individual pages. Returns top result URLs + snippets so you can decide which pages are worth scraping deeply. Returns: Dict with 'query', 'search_url', 'results' (list of {title, url, snippet}), 'result_count', 'hint' """ search_url = f"https://search.brave.com/search?q={quote_plus(query)}&source=web" try: _, soup = _fetch_page(search_url) results = [] seen_urls: set = set() # Brave Search result cards: each div.snippet with a .result-wrapper is a web result. # Skip video clusters, FAQ blocks, and LLM snippets (they have no .result-wrapper). # Class names as of 2026-04 (updated from .snippet-title / .snippet-description): # title → .search-snippet-title # url → a.l1 (the primary result anchor, avoids favicon tags) # snippet → .content.t-primary for card in soup.select('.snippet'): if len(results) >= max_results: break # Skip non-web-result snippets (videos, FAQ, LLM answer blocks) if not card.select_one('.result-wrapper'): continue title_el = card.select_one('.search-snippet-title') url_el = card.select_one('a.l1') desc_el = card.select_one('.content.t-primary') title = title_el.get_text(strip=True) if title_el else "" url = url_el['href'] if url_el and url_el.get('href') else "" snippet = desc_el.get_text(strip=True) if desc_el else "" # Filter: must have a valid http(s) URL if not url or not url.startswith('http'): continue # Filter: skip results with no useful content at all if not title and not snippet: continue # Deduplicate by URL if url in seen_urls: continue seen_urls.add(url) results.append({"title": title, "url": url, "snippet": snippet}) # Richer hint: title + url + first 120 chars of snippet for AI context if results: hint_parts = [] for r in results: part = f"{r['title']} ({r['url']})" if r['snippet']: part += f": {r['snippet'][:120]}" hint_parts.append(part) hint = " | ".join(hint_parts) else: hint = "No results found" return { "query": query, "search_url": search_url, "results": results, "result_count": len(results), "hint": hint, } except (httpx.RequestError, httpx.HTTPStatusError) as e: return { "query": query, "search_url": search_url, "results": [], "result_count": 0, "hint": f"Error: {str(e)}", } if __name__ == "__main__": mcp.run(transport="stdio")