2ab847f51d
- Add webscraper_search_hint() tool using Brave Search as backend (no CAPTCHA/GDPR consent wall, works with plain httpx) - Add User-Agent header to _fetch_page() — fixes 403 on Wikipedia, Feynman Lectures, and other sites that block headless requests - Add 5 new tests for search hint (23 total, 90% coverage) Brave Search URL: https://search.brave.com/search?q={query}&source=web Use sparingly — once per research task as orientation, not in loops
313 lines
11 KiB
Python
313 lines
11 KiB
Python
"""Webscraper MCP server — fetch web pages, extract content, links, tables, sitemaps."""
|
|
|
|
import httpx
|
|
from bs4 import BeautifulSoup
|
|
from html2text import html2text
|
|
from urllib.parse import urljoin
|
|
from typing import List, Dict, Tuple
|
|
import re
|
|
import ssl
|
|
import os
|
|
import certifi
|
|
from pathlib import Path
|
|
from fastmcp import FastMCP
|
|
|
|
# Server instance; the @mcp.tool() decorators in this module register on it.
mcp = FastMCP("webscraper")

# Directory holding extra root certificates (*.pem): the certs/ folder located
# one level above the directory that contains this file. Its contents are
# added on top of the certifi bundle when the module's SSL context is built.
_EXTRA_CERTS_DIR = Path(__file__).resolve().parents[1] / "certs"
|
|
|
|
def _build_ssl_context() -> ssl.SSLContext:
    """Create the SSL context used to verify outgoing HTTPS requests.

    Starts from the certifi CA bundle and, when the extra-certs directory
    exists, additionally trusts every ``*.pem`` file found directly in it.

    Returns:
        The configured ``ssl.SSLContext``.
    """
    context = ssl.create_default_context(cafile=certifi.where())
    if not _EXTRA_CERTS_DIR.is_dir():
        return context
    for bundle in _EXTRA_CERTS_DIR.glob("*.pem"):
        context.load_verify_locations(cafile=str(bundle))
    return context


# Built once at import time; _fetch_page hands it to httpx as ``verify``.
_SSL_CTX = _build_ssl_context()
|
|
|
|
_HEADERS = {
|
|
"User-Agent": (
|
|
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
|
|
"(KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36"
|
|
)
|
|
}
|
|
|
|
def _fetch_page(url: str) -> Tuple[httpx.Response, BeautifulSoup]:
    """Download *url* and parse the response body with the lxml parser.

    Shared fetch helper used by the tool functions in this module.

    Args:
        url: Address to request.

    Returns:
        ``(response, soup)``: the httpx response (after any redirects) and the
        BeautifulSoup document parsed from ``response.text``.

    Raises:
        httpx.RequestError: The request could not be completed.
        httpx.HTTPStatusError: ``raise_for_status()`` rejected the response.
    """
    response = httpx.get(
        url,
        timeout=10.0,
        verify=_SSL_CTX,
        headers=_HEADERS,
        # httpx does not follow redirects unless asked to. Without this, a
        # plain http->https or apex->www redirect is returned as the 3xx
        # response itself instead of the page the caller wanted.
        follow_redirects=True,
    )
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "lxml")
    return response, soup
|
|
|
|
def clean_soup(soup):
    """Strip non-content elements from *soup* in place.

    Every ``script``, ``style``, ``nav``, ``footer`` and ``header`` element is
    decomposed, i.e. removed from the tree.

    Args:
        soup: Parsed document or element to clean; it is modified, not copied.

    Returns:
        The same object that was passed in.
    """
    junk_tags = ["script", "style", "nav", "footer", "header"]
    for node in soup(junk_tags):
        node.decompose()
    return soup
|
|
|
|
def filter_junk_links(href: str) -> bool:
    """Tell whether *href* is a link worth keeping.

    Args:
        href: Link target to test; the prefix check ignores letter case.

    Returns:
        ``False`` when *href* starts with ``mailto:``, ``javascript:``,
        ``tel:`` or ``data:``; ``True`` for anything else.
    """
    junk_prefixes = ("mailto:", "javascript:", "tel:", "data:")
    lowered = href.lower()
    return not lowered.startswith(junk_prefixes)
|
|
|
|
@mcp.tool()
def webscraper_fetch(url: str, max_chars: int = 5000) -> str:
    """Fetch a URL and return title + markdown body + metadata.

    Args:
        url: The URL to fetch
        max_chars: Maximum characters kept from the markdown body
            (default: 5000); a longer body is cut and "..." is appended

    Returns:
        Markdown string: a ``# <title>`` heading, the body, then a
        ``## Metadata`` section with URL, status code and content type.
        If the request fails, a markdown error message is returned instead
        of raising.
    """
    # Only the fetch can raise the httpx errors handled here.
    try:
        response, soup = _fetch_page(url)
    except (httpx.RequestError, httpx.HTTPStatusError) as e:
        return f"# Error fetching {url}\n\n{str(e)}"

    # get_text() always yields a string, so an empty or oddly nested <title>
    # falls back to the placeholder instead of rendering as "# None".
    title_text = soup.title.get_text(strip=True) if soup.title else ""
    title = title_text or "No Title"

    soup = clean_soup(soup)
    body = html2text(str(soup.body if soup.body else soup), bodywidth=0)
    if len(body) > max_chars:
        body = body[:max_chars] + "..."

    metadata = f"URL: {url}\nStatus: {response.status_code}\nContent-Type: {response.headers.get('content-type', 'unknown')}"

    return f"# {title}\n\n{body}\n\n## Metadata\n{metadata}"
|
|
|
|
@mcp.tool()
def webscraper_fetch_links(url: str, deduplicate: bool = True) -> List[str]:
    """Fetch a URL and extract all href links.

    Relative hrefs are resolved against *url*; mailto:, javascript:, tel: and
    data: targets are skipped.

    Args:
        url: The URL to fetch
        deduplicate: Remove duplicate links (default: True)

    Returns:
        List of absolute URLs in document order (each one only once when
        *deduplicate* is true). If the request fails, a single-item list
        holding an "Error: ..." message.
    """
    # Only the fetch can raise the httpx errors handled here.
    try:
        _, soup = _fetch_page(url)
    except (httpx.RequestError, httpx.HTTPStatusError) as e:
        return [f"Error: {str(e)}"]

    resolved = (urljoin(url, a["href"]) for a in soup.find_all("a", href=True))
    links = [link for link in resolved if filter_junk_links(link)]

    if deduplicate:
        # dict.fromkeys keeps first-seen order; set() would scramble the links
        # and make the output differ from run to run.
        links = list(dict.fromkeys(links))

    return links
|
|
|
|
@mcp.tool()
def webscraper_fetch_tables(url: str) -> List[str]:
    """Fetch a URL and convert every HTML table on it to markdown.

    Args:
        url: The URL to fetch

    Returns:
        One markdown string per ``<table>`` element; ``["No tables found."]``
        when the page has none; a single "Error: ..." item when the request
        fails.
    """
    try:
        _, document = _fetch_page(url)
    except (httpx.RequestError, httpx.HTTPStatusError) as err:
        return [f"Error: {str(err)}"]

    rendered = [
        html2text(str(table_tag), bodywidth=0)
        for table_tag in document.find_all('table')
    ]
    if not rendered:
        return ["No tables found."]
    return rendered
|
|
|
|
@mcp.tool()
def webscraper_fetch_all(url: str, max_chars: int = 5000) -> Dict:
    """Fetch everything: markdown + links + tables + meta.

    Args:
        url: The URL to fetch
        max_chars: Maximum characters kept from the markdown body
            (default: 5000); a longer body is cut and "..." is appended

    Returns:
        Dict with 'markdown', 'links', 'tables', 'meta'. If the request
        fails, a dict with the single key 'error'.
    """
    # Only the fetch can raise the httpx errors handled here.
    try:
        response, soup = _fetch_page(url)
    except (httpx.RequestError, httpx.HTTPStatusError) as e:
        return {"error": str(e)}

    def meta_content(attrs: Dict[str, str], fallback: str) -> str:
        """Return the content of the first matching <meta>, else *fallback*."""
        tag = soup.find('meta', attrs=attrs)
        # .get(): a <meta> without a content attribute must not raise KeyError.
        return tag.get('content', fallback) if tag else fallback

    # get_text() always yields a string, so an empty or oddly nested <title>
    # falls back to the placeholder instead of becoming None.
    title_text = soup.title.get_text(strip=True) if soup.title else ""
    title = title_text or "No Title"

    # Meta, links and tables are collected BEFORE clean_soup(): it decomposes
    # nav/header/footer elements in place, which would otherwise drop the
    # links and tables inside them from the result.

    # Meta
    description = meta_content({'name': 'description'}, "No description")
    meta = {
        'title': title,
        'description': description,
        'og:title': meta_content({'property': 'og:title'}, title),
        'og:description': meta_content({'property': 'og:description'}, description),
    }

    # Links (deduplicated in first-seen order)
    resolved = (urljoin(url, a['href']) for a in soup.find_all('a', href=True))
    links = list(dict.fromkeys(link for link in resolved if filter_junk_links(link)))

    # Tables
    tables = [html2text(str(table), bodywidth=0) for table in soup.find_all('table')]
    if not tables:
        tables = ["No tables found."]

    # Markdown
    soup_clean = clean_soup(soup)
    body = html2text(str(soup_clean.body if soup_clean.body else soup_clean), bodywidth=0)
    if len(body) > max_chars:
        body = body[:max_chars] + "..."
    markdown = f"# {title}\n\n{body}\n\n## Metadata\nURL: {url}\nStatus: {response.status_code}\nContent-Type: {response.headers.get('content-type', 'unknown')}"

    return {
        "markdown": markdown,
        "links": links,
        "tables": tables,
        "meta": meta,
    }
|
|
|
|
@mcp.tool()
def webscraper_fetch_section(url: str, selector: str) -> str:
    """Fetch a URL and return one section of it, chosen by CSS selector.

    Args:
        url: The URL to fetch
        selector: CSS selector (e.g., '.content'); the first match is used

    Returns:
        Markdown of the selected element after junk tags are removed, or a
        plain message when the selector is invalid, matches nothing, or the
        request fails.
    """
    try:
        _, page = _fetch_page(url)
        try:
            target = page.select_one(selector)
        except Exception as exc:
            # Only errors whose message mentions "selector" are reported as a
            # bad selector; anything else is passed on unchanged.
            if "selector" not in str(exc).lower():
                raise
            return f"Invalid CSS selector '{selector}' on {url}"

        if not target:
            return f"No element found for selector '{selector}' on {url}"

        return html2text(str(clean_soup(target)), bodywidth=0)
    except (httpx.RequestError, httpx.HTTPStatusError) as exc:
        return f"Error: {str(exc)}"
|
|
|
|
@mcp.tool()
def webscraper_fetch_meta(url: str) -> Dict[str, str]:
    """Fetch a URL and return page metadata: title, description, OG tags.

    Args:
        url: The URL to fetch

    Returns:
        Dict with 'title', 'description', 'og:title' and 'og:description'.
        Missing OG values fall back to the plain title / description. If the
        request fails, a dict with the single key 'error'.
    """
    # Only the fetch can raise the httpx errors handled here.
    try:
        _, soup = _fetch_page(url)
    except (httpx.RequestError, httpx.HTTPStatusError) as e:
        return {"error": str(e)}

    def meta_content(attrs: Dict[str, str], fallback: str) -> str:
        """Return the content of the first matching <meta>, else *fallback*."""
        tag = soup.find('meta', attrs=attrs)
        # .get(): a <meta> without a content attribute must not raise KeyError.
        return tag.get('content', fallback) if tag else fallback

    # get_text() always yields a string, so an empty or oddly nested <title>
    # falls back to the placeholder instead of becoming None.
    title_text = soup.title.get_text(strip=True) if soup.title else ""
    title = title_text or "No Title"
    description = meta_content({'name': 'description'}, "No description")

    return {
        'title': title,
        'description': description,
        'og:title': meta_content({'property': 'og:title'}, title),
        'og:description': meta_content({'property': 'og:description'}, description),
    }
|
|
|
|
@mcp.tool()
def webscraper_fetch_sitemap(url: str, max_urls: int = 100) -> List[str]:
    """Fetch a sitemap document and list the URLs in its ``<loc>`` elements.

    Args:
        url: Address of the sitemap document to fetch
        max_urls: How many ``<loc>`` entries to read at most (default: 100)

    Returns:
        The collected URLs with one occurrence of *url* itself removed;
        a single "No URLs in sitemap ..." item when nothing is left;
        a single "Error: ..." item when the request fails.
    """
    try:
        _, sitemap = _fetch_page(url)
    except (httpx.RequestError, httpx.HTTPStatusError) as err:
        return [f"Error: {str(err)}"]

    found = [entry.text.strip() for entry in sitemap.find_all('loc')[:max_urls]]

    # Simple loop protection: a sitemap that lists itself is not handed back.
    if url in found:
        found.remove(url)

    if found:
        return found
    return [f"No URLs in sitemap {url}"]
|
|
|
|
@mcp.tool()
def webscraper_search_hint(query: str, max_results: int = 5) -> Dict:
    """Search Brave Search and return top results as a scraping hint.

    Use this sparingly — once per research task — to get oriented before
    scraping individual pages. Returns top result URLs + snippets so you
    can decide which pages are worth scraping deeply.

    Args:
        query: Search query (e.g. "MacBook Pro M4 price Germany")
        max_results: Maximum number of result cards to inspect (default: 5)

    Returns:
        Dict with 'query', 'results' (list of {title, url, snippet}), 'hint'.
        If the request fails, 'results' is empty and 'hint' carries the
        error message.
    """
    # Function-scope import: the module header only imports urljoin.
    from urllib.parse import quote_plus

    # quote_plus percent-encodes reserved characters (&, #, +, ?, ...) and
    # turns spaces into '+'. Merely swapping spaces for '+' let such
    # characters cut off or alter the query string.
    search_url = f"https://search.brave.com/search?q={quote_plus(query)}&source=web"

    # Only the fetch can raise the httpx errors handled here.
    try:
        _, soup = _fetch_page(search_url)
    except (httpx.RequestError, httpx.HTTPStatusError) as e:
        return {"query": query, "results": [], "hint": f"Error: {str(e)}"}

    results = []
    # Result cards are matched by the .snippet class; title and description
    # come from .snippet-title / .snippet-description, the URL from the first
    # <a> inside the card.
    for card in soup.select('.snippet')[:max_results]:
        title_el = card.select_one('.snippet-title')
        link_el = card.select_one('a')
        desc_el = card.select_one('.snippet-description')

        href = (link_el.get('href') if link_el is not None else None) or ""
        # Keep absolute http(s) links only.
        if not href.startswith('http'):
            continue

        results.append({
            "title": title_el.get_text(strip=True) if title_el else "",
            "url": href,
            "snippet": desc_el.get_text(strip=True) if desc_el else "",
        })

    if results:
        hint = "; ".join(f"{r['title']}: {r['url']}" for r in results)
    else:
        hint = "No results found"

    return {
        "query": query,
        "results": results,
        "hint": hint,
    }
|
|
|
|
|
|
if __name__ == "__main__":
    # Run as a script: serve the registered tools over the stdio transport.
    mcp.run(transport="stdio")
|