327 lines
12 KiB
Python
327 lines
12 KiB
Python
"""Webscraper MCP server — fetch web pages, extract content, links, tables, sitemaps."""
|
|
|
|
import httpx
|
|
from bs4 import BeautifulSoup
|
|
from html2text import html2text
|
|
from urllib.parse import urljoin, quote_plus
|
|
from typing import List, Dict, Tuple, Annotated
|
|
import re
|
|
import ssl
|
|
import os
|
|
import certifi
|
|
from pathlib import Path
|
|
from fastmcp import FastMCP
|
|
from pydantic import Field
|
|
|
|
# FastMCP server instance named "webscraper"; the tool functions below are
# registered on it through the @mcp.tool() decorator.
mcp = FastMCP("webscraper")
|
|
|
|
# A single SSL context is built at module load from the certifi bundle plus
# any extra certificates shipped in a certs/ directory. Note that the path
# resolves to the certs/ folder one level ABOVE the directory containing this
# file (i.e. a sibling of this file's directory), not next to the file itself.
_EXTRA_CERTS_DIR = Path(__file__).resolve().parents[1] / "certs"
|
|
|
|
def _build_ssl_context() -> ssl.SSLContext:
    """Create the SSL context used to verify outgoing HTTPS requests.

    The context starts from the certifi CA bundle. When ``_EXTRA_CERTS_DIR``
    exists, every ``*.pem`` file directly inside it is loaded as an
    additional trusted certificate.

    Returns:
        The configured ``ssl.SSLContext``.
    """
    context = ssl.create_default_context(cafile=certifi.where())

    # No extra-certs directory: the certifi bundle alone is used.
    if not _EXTRA_CERTS_DIR.is_dir():
        return context

    for cert_path in _EXTRA_CERTS_DIR.glob("*.pem"):
        context.load_verify_locations(cafile=str(cert_path))
    return context
|
|
|
|
# Built once at import time; passed as ``verify=`` to httpx in _fetch_page().
_SSL_CTX = _build_ssl_context()
|
|
|
|
# Request headers sent by _fetch_page(): a desktop Chrome-on-Linux
# User-Agent string.
_HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36"
    ),
}
|
|
|
|
def _fetch_page(url: str) -> Tuple[httpx.Response, BeautifulSoup]:
    """Shared fetch helper — returns the HTTP response and the parsed soup.

    Args:
        url: Absolute URL to request with HTTP GET.

    Returns:
        Tuple of (httpx response, BeautifulSoup tree parsed with lxml).

    Raises:
        httpx.RequestError: The request could not be completed
            (connection problem, timeout, ...).
        httpx.HTTPStatusError: The final response had a non-success status.
    """
    response = httpx.get(
        url,
        timeout=10.0,
        verify=_SSL_CTX,
        headers=_HEADERS,
        # httpx does not follow redirects unless asked to. Without this a
        # plain http -> https or apex -> www redirect ends up as an
        # HTTPStatusError from raise_for_status() instead of page content.
        follow_redirects=True,
    )
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'lxml')
    return response, soup
|
|
|
|
def clean_soup(soup):
    """Strip non-content elements from *soup* in place and return it.

    Every ``script``, ``style``, ``nav``, ``footer`` and ``header`` element
    found by calling ``soup(...)`` is removed with ``decompose()``. The same
    object that was passed in is returned, so callers that still need the
    removed elements must read them before calling this function.
    """
    junk_tags = ["script", "style", "nav", "footer", "header"]
    for tag in soup(junk_tags):
        tag.decompose()
    return soup
|
|
|
|
def filter_junk_links(href: str) -> bool:
    """Return True when *href* should be kept, False when it is junk.

    A link is junk when it starts (case-insensitively) with one of the
    schemes ``mailto:``, ``javascript:``, ``tel:`` or ``data:``.
    """
    junk_patterns = [r'^mailto:', r'^javascript:', r'^tel:', r'^data:']
    lowered = href.lower()
    for pattern in junk_patterns:
        if re.match(pattern, lowered):
            return False
    return True
|
|
|
|
@mcp.tool()
def webscraper_fetch(
    url: Annotated[str, Field(description="The URL to fetch")],
    max_chars: Annotated[int, Field(description="Maximum characters in the markdown body (default: 5000)")] = 5000,
) -> str:
    """Fetch a URL and return title + markdown body + metadata.

    Returns:
        Markdown string with title, body, and metadata. The body is cut to
        max_chars characters (followed by "...") when it is longer. On a
        request failure an "# Error fetching <url>" markdown string is
        returned instead.
    """
    try:
        response, soup = _fetch_page(url)
    except (httpx.RequestError, httpx.HTTPStatusError) as e:
        return f"# Error fetching {url}\n\n{str(e)}"

    # Tag.string is None for an empty <title> or one containing nested
    # markup, which used to render as "# None". get_text() always yields a
    # string; fall back to the placeholder when it is empty.
    title = (soup.title.get_text(strip=True) if soup.title else "") or "No Title"

    soup = clean_soup(soup)
    body = html2text(str(soup.body if soup.body else soup), bodywidth=0)

    # A negative limit would slice from the end of the text; treat it as 0.
    limit = max(max_chars, 0)
    if len(body) > limit:
        body = body[:limit] + "..."

    metadata = (
        f"URL: {url}\n"
        f"Status: {response.status_code}\n"
        f"Content-Type: {response.headers.get('content-type', 'unknown')}"
    )
    return f"# {title}\n\n{body}\n\n## Metadata\n{metadata}"
|
|
|
|
@mcp.tool()
def webscraper_fetch_links(
    url: Annotated[str, Field(description="The URL to fetch")],
    deduplicate: Annotated[bool, Field(description="Remove duplicate links (default: True)")] = True,
) -> List[str]:
    """Fetch a URL and extract all href links.

    Relative hrefs are resolved against the requested URL; mailto:,
    javascript:, tel: and data: links are skipped.

    Returns:
        List of absolute link URLs in document order (duplicates removed
        when deduplicate is True), or a single "Error: ..." entry when the
        request fails.
    """
    try:
        _, soup = _fetch_page(url)
    except (httpx.RequestError, httpx.HTTPStatusError) as e:
        return [f"Error: {str(e)}"]

    resolved = (urljoin(url, anchor['href']) for anchor in soup.find_all('a', href=True))
    links = [link for link in resolved if filter_junk_links(link)]

    if deduplicate:
        # dict.fromkeys keeps first-seen order; list(set(...)) returned the
        # links in an arbitrary order that changed from run to run.
        links = list(dict.fromkeys(links))

    return links
|
|
|
|
@mcp.tool()
def webscraper_fetch_tables(url: Annotated[str, Field(description="The URL to fetch")]) -> List[str]:
    """Fetch a URL and extract all HTML tables as markdown.

    Returns:
        List with one markdown string per <table> element; a single
        "No tables found." entry when the page has none, or a single
        "Error: ..." entry when the request fails.
    """
    try:
        _, soup = _fetch_page(url)
        rendered = [html2text(str(tbl), bodywidth=0) for tbl in soup.find_all('table')]
        if not rendered:
            return ["No tables found."]
        return rendered
    except (httpx.RequestError, httpx.HTTPStatusError) as e:
        return [f"Error: {str(e)}"]
|
|
|
|
@mcp.tool()
def webscraper_fetch_all(
    url: Annotated[str, Field(description="The URL to fetch")],
    max_chars: Annotated[int, Field(description="Maximum characters (default: 5000)")] = 5000,
) -> Dict:
    """Fetch everything: markdown + links + tables + meta.

    Returns:
        Dict with 'markdown', 'links', 'tables', 'meta', or {'error': ...}
        when the request fails.
    """
    try:
        response, soup = _fetch_page(url)
    except (httpx.RequestError, httpx.HTTPStatusError) as e:
        return {"error": str(e)}

    def meta_content(attrs: Dict[str, str], fallback: str) -> str:
        """Return the content of the first matching <meta>, else *fallback*."""
        tag = soup.find('meta', attrs=attrs)
        # .get() rather than ['content']: a <meta> tag without a content
        # attribute must not raise KeyError out of the tool.
        content = tag.get('content') if tag else None
        return content or fallback

    # Tag.string is None for an empty or nested <title>; get_text() is not.
    title = (soup.title.get_text(strip=True) if soup.title else "") or "No Title"

    # Links, tables and meta are collected BEFORE clean_soup(): that helper
    # removes nav/header/footer elements from the tree in place, so reading
    # them afterwards silently dropped every link or table inside those
    # elements.

    # Links — resolved to absolute URLs, junk schemes skipped, duplicates
    # removed while keeping document order.
    resolved = (urljoin(url, anchor['href']) for anchor in soup.find_all('a', href=True))
    links = list(dict.fromkeys(link for link in resolved if filter_junk_links(link)))

    # Tables
    tables = [html2text(str(table), bodywidth=0) for table in soup.find_all('table')]
    if not tables:
        tables = ["No tables found."]

    # Meta — OG entries fall back to the plain title/description.
    description = meta_content({'name': 'description'}, "No description")
    meta = {
        'title': title,
        'description': description,
        'og:title': meta_content({'property': 'og:title'}, title),
        'og:description': meta_content({'property': 'og:description'}, description),
    }

    # Markdown — this step mutates the soup, so it runs last.
    soup_clean = clean_soup(soup)
    body = html2text(str(soup_clean.body if soup_clean.body else soup_clean), bodywidth=0)
    # A negative limit would slice from the end of the text; treat it as 0.
    limit = max(max_chars, 0)
    if len(body) > limit:
        body = body[:limit] + "..."
    markdown = (
        f"# {title}\n\n{body}\n\n## Metadata\n"
        f"URL: {url}\n"
        f"Status: {response.status_code}\n"
        f"Content-Type: {response.headers.get('content-type', 'unknown')}"
    )

    return {
        "markdown": markdown,
        "links": links,
        "tables": tables,
        "meta": meta,
    }
|
|
|
|
@mcp.tool()
def webscraper_fetch_section(
    url: Annotated[str, Field(description="The URL to fetch")],
    selector: Annotated[str, Field(description="CSS selector (e.g., '.content')")],
) -> str:
    """Fetch a URL and extract specific section by CSS selector.

    Returns:
        Markdown of the first element matching the selector, or a plain
        message when the selector is invalid, matches nothing, or the
        request fails.
    """
    try:
        _, soup = _fetch_page(url)

        try:
            match = soup.select_one(selector)
        except Exception as e:
            # Selector problems are recognised by the word "selector" in the
            # exception text; anything else propagates unchanged.
            # NOTE(review): this relies on the wording of the selector
            # engine's messages — confirm it covers all syntax errors.
            if "selector" not in str(e).lower():
                raise
            return f"Invalid CSS selector '{selector}' on {url}"

        if not match:
            return f"No element found for selector '{selector}' on {url}"

        # clean_soup() strips junk elements in place and returns the same node.
        return html2text(str(clean_soup(match)), bodywidth=0)
    except (httpx.RequestError, httpx.HTTPStatusError) as e:
        return f"Error: {str(e)}"
|
|
|
|
@mcp.tool()
def webscraper_fetch_meta(url: Annotated[str, Field(description="The URL to fetch")]) -> Dict[str, str]:
    """Fetch a URL and return page metadata: title, description, OG tags.

    Returns:
        Dict with 'title', 'description', 'og:title' and 'og:description'
        (the OG entries fall back to the plain title/description when
        missing or empty), or {'error': ...} when the request fails.
    """
    try:
        _, soup = _fetch_page(url)
    except (httpx.RequestError, httpx.HTTPStatusError) as e:
        return {"error": str(e)}

    def meta_content(attrs: Dict[str, str], fallback: str) -> str:
        """Return the content of the first matching <meta>, else *fallback*."""
        tag = soup.find('meta', attrs=attrs)
        # .get() rather than ['content']: a <meta> tag without a content
        # attribute used to raise an uncaught KeyError.
        content = tag.get('content') if tag else None
        return content or fallback

    # Tag.string is None for an empty or nested <title>, which would put a
    # non-string value into the Dict[str, str] result; get_text() never does.
    title = (soup.title.get_text(strip=True) if soup.title else "") or "No Title"
    description = meta_content({'name': 'description'}, "No description")

    return {
        'title': title,
        'description': description,
        'og:title': meta_content({'property': 'og:title'}, title),
        'og:description': meta_content({'property': 'og:description'}, description),
    }
|
|
|
|
@mcp.tool()
def webscraper_fetch_sitemap(
    url: Annotated[str, Field(description="Sitemap URL (or auto-discover)")],
    max_urls: Annotated[int, Field(description="Maximum URLs to return (default: 100)")] = 100,
) -> List[str]:
    """Fetch sitemap.xml and return list of URLs.

    Only the given URL is fetched: every <loc> entry in that document is
    collected; nested sitemaps are not followed and no sitemap discovery is
    performed by this function.

    Returns:
        List of at most max_urls sitemap URLs, a single
        "No URLs in sitemap ..." entry when none were found, or a single
        "Error: ..." entry when the request fails.
    """
    try:
        _, soup = _fetch_page(url)
    except (httpx.RequestError, httpx.HTTPStatusError) as e:
        return [f"Error: {str(e)}"]

    # A negative limit would mis-slice; treat it as "return nothing".
    limit = max(max_urls, 0)

    urls: List[str] = []
    for loc in soup.find_all('loc'):
        if len(urls) >= limit:
            break
        entry = loc.text.strip()
        # Skip blank entries and every self-reference (simple loop
        # protection). Filtering happens before the cap is applied so a
        # self-reference no longer costs one of the max_urls slots.
        if entry and entry != url:
            urls.append(entry)

    return urls if urls else [f"No URLs in sitemap {url}"]
|
|
|
|
@mcp.tool()
def webscraper_search_hint(
    query: Annotated[str, Field(description="Search query (e.g. \"MacBook Pro M4 price Germany\")")],
    max_results: Annotated[int, Field(description="Maximum number of results to return (default: 5)")] = 5,
) -> Dict:
    """Search Brave Search and return top results as a scraping hint.

    Use this sparingly — once per research task — to get oriented before
    scraping individual pages. Returns top result URLs + snippets so you
    can decide which pages are worth scraping deeply.

    Returns:
        Dict with 'query', 'search_url', 'results' (list of {title, url, snippet}),
        'result_count', 'hint'
    """
    search_url = f"https://search.brave.com/search?q={quote_plus(query)}&source=web"

    try:
        _, soup = _fetch_page(search_url)
    except (httpx.RequestError, httpx.HTTPStatusError) as e:
        # Same shape as a successful answer, with the error carried in 'hint'.
        return {
            "query": query,
            "search_url": search_url,
            "results": [],
            "result_count": 0,
            "hint": f"Error: {str(e)}",
        }

    results = []
    seen_urls: set = set()

    # Brave Search result cards: a div.snippet that contains a .result-wrapper
    # is a web result; video clusters, FAQ blocks and LLM snippets have none.
    # Class names as of 2026-04 (updated from .snippet-title / .snippet-description):
    #   title   -> .search-snippet-title
    #   url     -> a.l1 (the primary result anchor, avoids favicon <a> tags)
    #   snippet -> .content.t-primary
    for card in soup.select('.snippet'):
        if len(results) >= max_results:
            break

        # Not a web result (video, FAQ, LLM answer block).
        if not card.select_one('.result-wrapper'):
            continue

        anchor = card.select_one('a.l1')
        href = anchor.get('href') if anchor else None
        url = href if href else ""

        # Must have a valid http(s) URL.
        if not (url and url.startswith('http')):
            continue

        title_el = card.select_one('.search-snippet-title')
        desc_el = card.select_one('.content.t-primary')
        title = title_el.get_text(strip=True) if title_el else ""
        snippet = desc_el.get_text(strip=True) if desc_el else ""

        # Nothing useful to show for this card.
        if not (title or snippet):
            continue

        # Each URL is reported once.
        if url in seen_urls:
            continue
        seen_urls.add(url)

        results.append({"title": title, "url": url, "snippet": snippet})

    # Hint: "title (url): first 120 chars of snippet", entries joined by " | ".
    if not results:
        hint = "No results found"
    else:
        hint = " | ".join(
            f"{r['title']} ({r['url']})" + (f": {r['snippet'][:120]}" if r['snippet'] else "")
            for r in results
        )

    return {
        "query": query,
        "search_url": search_url,
        "results": results,
        "result_count": len(results),
        "hint": hint,
    }
|
|
|
|
|
|
# Script entry point: serve the registered tools over the stdio transport.
if __name__ == "__main__":
    mcp.run(transport="stdio")
|