diff --git a/webscraper/README.md b/webscraper/README.md new file mode 100644 index 0000000..72aed20 --- /dev/null +++ b/webscraper/README.md @@ -0,0 +1,42 @@ +# Webscraper MCP Server + +MCP server for web scraping operations: fetch pages, extract links/tables, parse sitemaps. + +## Tools + +- `webscraper_fetch(url, max_chars=5000)` — Title + markdown body + metadata +- `webscraper_fetch_links(url, deduplicate=True)` — Extract all hrefs +- `webscraper_fetch_tables(url)` — HTML tables as markdown +- `webscraper_fetch_all(url, max_chars=5000)` — Everything in one call +- `webscraper_fetch_section(url, selector)` — Specific CSS section +- `webscraper_fetch_meta(url)` — Title, description, OG tags +- `webscraper_fetch_sitemap(url, max_urls=100)` — Sitemap URL list + +## Stack + +- httpx (HTTP client) +- BeautifulSoup4 + lxml (HTML parsing) +- html2text (HTML to markdown) + +## Run + +```bash +./run.sh # uv sync && uv run src/server.py +``` + +## Tests + +```bash +uv run pytest tests/ --cov=src +``` + +## MCP Config + +Add to `.roo/mcp.json`: + +```json +"webscraper": { + "command": "uv", + "args": ["run", "--directory", "/home/pplate/pi_mcps/webscraper", "src/server.py"] +} +``` diff --git a/webscraper/pyproject.toml b/webscraper/pyproject.toml new file mode 100644 index 0000000..d02948e --- /dev/null +++ b/webscraper/pyproject.toml @@ -0,0 +1,43 @@ +[build-system] +requires = ["hatchling"] +build-backend = "hatchling.build" + +[project] +name = "webscraper" +dynamic = ["version"] +description = "MCP server for web scraping: fetch pages, extract links/tables, sitemap parsing" +readme = "README.md" +requires-python = ">=3.11" +license = "MIT" +authors = [{name = "Patrick Plate", email = "patrickplate@gmx.de"}] +dependencies = [ + "fastmcp>=0.1.0", + "httpx>=0.28.0", + "beautifulsoup4>=4.14.0", + "lxml>=6.0.0", + "html2text>=2025.4.15", +] + +[project.optional-dependencies] +test = [ + "pytest>=7.0", + "pytest-mock>=3.0", + "pytest-cov>=4.0", +] + +[tool.hatch.version] 
+path = "src/__init__.py"
+
+[tool.hatch.build.targets.sdist]
+include = ["/src", "/tests"]
+
+[tool.hatch.build.targets.wheel]
+include = ["/src", "/tests"]
+packages = ["src/webscraper"]
+
+[tool.pytest.ini_options]
+testpaths = ["tests"]
+python_files = "test_*.py"
+python_classes = "Test*"
+python_functions = "test_*"
+addopts = "--cov=src --cov-report=term-missing --cov-report=xml"
diff --git a/webscraper/run.sh b/webscraper/run.sh
new file mode 100644
index 0000000..5b4050d
--- /dev/null
+++ b/webscraper/run.sh
@@ -0,0 +1,19 @@
+#!/bin/bash
+
+# Webscraper MCP server runner
+
+BASEDIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" &> /dev/null && pwd)"
+
+# Add ~/.local/bin to PATH for uv
+export PATH="$HOME/.local/bin:$PATH"
+
+# Work from the project directory so .venv and src/ resolve from any cwd
+cd "$BASEDIR" || exit 1
+
+# Sync dependencies if .venv doesn't exist
+if [ ! -d ".venv" ]; then
+    uv sync
+fi
+
+# Run the server
+uv run src/server.py
diff --git a/webscraper/src/__init__.py b/webscraper/src/__init__.py
new file mode 100644
index 0000000..9f2eef8
--- /dev/null
+++ b/webscraper/src/__init__.py
@@ -0,0 +1,2 @@
+"""Webscraper MCP server package."""
+__version__ = "1.0.0"
diff --git a/webscraper/src/server.py b/webscraper/src/server.py
new file mode 100644
index 0000000..616492a
--- /dev/null
+++ b/webscraper/src/server.py
@@ -0,0 +1,301 @@
+"""Webscraper MCP server — fetch web pages, extract content, links, tables, sitemaps."""
+
+import httpx
+from bs4 import BeautifulSoup
+from html2text import html2text
+from urllib.parse import urljoin, urlparse
+from typing import List, Dict, Optional
+import re
+from fastmcp import FastMCP
+
+mcp = FastMCP("webscraper")
+
+# Seconds to wait for a single HTTP request before giving up.
+REQUEST_TIMEOUT = 10.0
+
+# Everything a fetch can raise: HTTPError covers transport failures
+# (RequestError) and non-2xx responses (HTTPStatusError, raised by
+# raise_for_status()); InvalidURL is raised for malformed URLs.
+_FETCH_ERRORS = (httpx.HTTPError, httpx.InvalidURL)
+
+# Schemes that never lead to a fetchable page. Matched case-insensitively
+# and tolerant of leading whitespace in sloppy markup.
+_JUNK_LINK_RE = re.compile(r'^\s*(?:mailto|javascript|tel|data):', re.IGNORECASE)
+
+
+def clean_soup(soup):
+    """Remove script, style, and page-chrome elements before extraction.
+
+    Args:
+        soup: Parsed document or tag; it is modified in place.
+
+    Returns:
+        The same object that was passed in.
+    """
+    for element in soup(["script", "style", "nav", "footer", "header"]):
+        element.decompose()
+    return soup
+
+
+def filter_junk_links(href: str) -> bool:
+    """Tell whether a link should be kept.
+
+    Args:
+        href: Raw or resolved link target.
+
+    Returns:
+        False for mailto:, javascript:, tel: and data: links, True otherwise.
+    """
+    return _JUNK_LINK_RE.match(href) is None
+
+
+def _get_soup(url: str, parser: str = 'lxml'):
+    """Download url and parse the response body.
+
+    Args:
+        url: The URL to fetch
+        parser: BeautifulSoup parser name ('lxml' for HTML, 'xml' for sitemaps)
+
+    Returns:
+        Tuple of (response, soup)
+
+    Raises:
+        httpx.HTTPError: On transport failure or a non-2xx status
+        httpx.InvalidURL: If url is malformed
+    """
+    response = httpx.get(url, timeout=REQUEST_TIMEOUT)
+    response.raise_for_status()
+    return response, BeautifulSoup(response.text, parser)
+
+
+def _page_title(soup) -> str:
+    """Return the page title, or "No Title" when it is missing or empty."""
+    # .string is None when the title tag is empty or contains nested markup.
+    return (soup.title.string if soup.title else None) or "No Title"
+
+
+def _render_page(soup, url: str, response, max_chars: int) -> str:
+    """Build the title + body + metadata markdown. Strips junk from soup in place."""
+    title = _page_title(soup)
+    clean_soup(soup)
+    body = html2text(str(soup.body if soup.body else soup), bodywidth=0)
+    if len(body) > max_chars:
+        body = body[:max_chars] + "..."
+
+    content_type = response.headers.get('content-type', 'unknown')
+    metadata = f"URL: {url}\nStatus: {response.status_code}\nContent-Type: {content_type}"
+    return f"# {title}\n\n{body}\n\n## Metadata\n{metadata}"
+
+
+def _extract_links(soup, base_url: str, deduplicate: bool) -> List[str]:
+    """Collect absolute http(s) link targets from every anchor in soup."""
+    links = []
+    for a in soup.find_all('a', href=True):
+        href = a['href'].strip()
+        # Skip empty targets, same-page anchors, and junk schemes.
+        if not href or href.startswith('#') or not filter_junk_links(href):
+            continue
+        # urljoin resolves relative paths ("page.html", "../x") as well as
+        # root-relative ones, and leaves absolute URLs untouched.
+        full_url = urljoin(base_url, href)
+        if urlparse(full_url).scheme in ('http', 'https'):
+            links.append(full_url)
+
+    if deduplicate:
+        # dict.fromkeys drops repeats while keeping document order.
+        links = list(dict.fromkeys(links))
+    return links
+
+
+def _extract_tables(soup) -> List[str]:
+    """Render every table in soup as markdown."""
+    tables = [html2text(str(table), bodywidth=0) for table in soup.find_all('table')]
+    return tables if tables else ["No tables found."]
+
+
+def _meta_content(soup, attrs: Dict[str, str], default: str) -> str:
+    """Return the content of the first matching meta tag, or default."""
+    tag = soup.find('meta', attrs=attrs)
+    # .get() tolerates a meta tag that has no content attribute.
+    return (tag.get('content') if tag else None) or default
+
+
+def _extract_meta(soup) -> Dict[str, str]:
+    """Collect title, description and Open Graph fields from soup."""
+    title = _page_title(soup)
+    description = _meta_content(soup, {'name': 'description'}, "No description")
+    return {
+        'title': title,
+        'description': description,
+        'og:title': _meta_content(soup, {'property': 'og:title'}, title),
+        'og:description': _meta_content(soup, {'property': 'og:description'}, description),
+    }
+
+
+@mcp.tool()
+def webscraper_fetch(url: str, max_chars: int = 5000) -> str:
+    """Fetch a URL and return title + markdown body + metadata.
+
+    Args:
+        url: The URL to fetch
+        max_chars: Maximum characters in the markdown body (default: 5000)
+
+    Returns:
+        Markdown string with title, body, and metadata, or an error
+        report if the page could not be fetched
+    """
+    try:
+        response, soup = _get_soup(url)
+    except _FETCH_ERRORS as e:
+        return f"# Error fetching {url}\n\n{str(e)}"
+    return _render_page(soup, url, response, max_chars)
+
+
+@mcp.tool()
+def webscraper_fetch_links(url: str, deduplicate: bool = True) -> List[str]:
+    """Fetch a URL and extract all http(s) links as absolute URLs.
+
+    Args:
+        url: The URL to fetch
+        deduplicate: Remove duplicate links (default: True)
+
+    Returns:
+        List of href URLs in document order, or a single "Error: ..." entry
+    """
+    try:
+        _, soup = _get_soup(url)
+    except _FETCH_ERRORS as e:
+        return [f"Error: {str(e)}"]
+    return _extract_links(soup, url, deduplicate)
+
+
+@mcp.tool()
+def webscraper_fetch_tables(url: str) -> List[str]:
+    """Fetch a URL and extract all HTML tables as markdown.
+
+    Args:
+        url: The URL to fetch
+
+    Returns:
+        List of markdown tables, ["No tables found."] if there are none,
+        or a single "Error: ..." entry
+    """
+    try:
+        _, soup = _get_soup(url)
+    except _FETCH_ERRORS as e:
+        return [f"Error: {str(e)}"]
+    return _extract_tables(soup)
+
+
+@mcp.tool()
+def webscraper_fetch_all(url: str, max_chars: int = 5000) -> Dict:
+    """Fetch everything: markdown + links + tables + meta.
+
+    The page is downloaded and parsed once and shared by all extractors.
+
+    Args:
+        url: The URL to fetch
+        max_chars: Maximum characters in the markdown body (default: 5000)
+
+    Returns:
+        Dict with 'markdown', 'links', 'tables', 'meta'; on a fetch failure
+        each value carries the error in that tool's usual error shape
+    """
+    try:
+        response, soup = _get_soup(url)
+    except _FETCH_ERRORS as e:
+        return {
+            "markdown": f"# Error fetching {url}\n\n{str(e)}",
+            "links": [f"Error: {str(e)}"],
+            "tables": [f"Error: {str(e)}"],
+            "meta": {"error": str(e)},
+        }
+
+    # Extract from the untouched tree first: rendering the markdown strips
+    # nav/header/footer in place, which would hide the links inside them.
+    meta = _extract_meta(soup)
+    links = _extract_links(soup, url, deduplicate=True)
+    tables = _extract_tables(soup)
+    markdown = _render_page(soup, url, response, max_chars)
+
+    return {
+        "markdown": markdown,
+        "links": links,
+        "tables": tables,
+        "meta": meta,
+    }
+
+
+@mcp.tool()
+def webscraper_fetch_section(url: str, selector: str) -> str:
+    """Fetch a URL and extract specific section by CSS selector.
+
+    Args:
+        url: The URL to fetch
+        selector: CSS selector (e.g., '.content')
+
+    Returns:
+        Markdown of the first matching element, or a message if nothing
+        matched or the fetch failed
+    """
+    try:
+        _, soup = _get_soup(url)
+    except _FETCH_ERRORS as e:
+        return f"Error: {str(e)}"
+
+    section = soup.select_one(selector)
+    if section is None:
+        return f"No element found for selector '{selector}' on {url}"
+    return html2text(str(clean_soup(section)), bodywidth=0)
+
+
+@mcp.tool()
+def webscraper_fetch_meta(url: str) -> Dict[str, str]:
+    """Fetch a URL and return page metadata: title, description, OG tags.
+
+    Args:
+        url: The URL to fetch
+
+    Returns:
+        Dict with 'title', 'description', 'og:title', 'og:description'
+        (OG fields fall back to the plain ones), or {'error': ...}
+    """
+    try:
+        _, soup = _get_soup(url)
+    except _FETCH_ERRORS as e:
+        return {"error": str(e)}
+    return _extract_meta(soup)
+
+
+@mcp.tool()
+def webscraper_fetch_sitemap(url: str, max_urls: int = 100) -> List[str]:
+    """Fetch a sitemap XML document and return the URLs it lists.
+
+    Args:
+        url: Sitemap URL
+        max_urls: Maximum URLs to return (default: 100)
+
+    Returns:
+        List of sitemap URLs, a "No URLs ..." entry if the sitemap is empty,
+        or a single "Error: ..." entry
+    """
+    try:
+        _, soup = _get_soup(url, parser='xml')
+    except _FETCH_ERRORS as e:
+        return [f"Error: {str(e)}"]
+
+    urls = []
+    for loc in soup.find_all('loc'):
+        if len(urls) >= max_urls:
+            break
+        entry = loc.text.strip()
+        # Simple loop protection: drop the sitemap's own URL. Filtering
+        # before counting keeps the result at max_urls when more exist.
+        if entry and entry != url:
+            urls.append(entry)
+
+    return urls if urls else [f"No URLs in sitemap {url}"]
+
+
+if __name__ == "__main__":
+    mcp.run(transport="stdio")
diff --git a/webscraper/tests/__init__.py b/webscraper/tests/__init__.py
new file mode 100644
index 0000000..7655aea
--- /dev/null
+++ b/webscraper/tests/__init__.py
@@ -0,0 +1 @@
+"""Webscraper tests package."""
diff --git a/webscraper/tests/conftest.py b/webscraper/tests/conftest.py
new file mode 100644
index 0000000..cb6a0f4
--- /dev/null
+++ b/webscraper/tests/conftest.py
@@ -0,0 +1,30 @@
+"""Shared test fixtures for webscraper."""
+
+import sys +from pathlib import Path + +# Add src to path for imports +sys.path.insert(0, str(Path(__file__).parent.parent / "src")) + +import pytest +from unittest.mock import MagicMock + +@pytest.fixture +def mock_httpx(): + """Mock httpx for all network calls.""" + mock_get = MagicMock() + mock_get.return_value.status_code = 200 + mock_get.return_value.text = "Test" + mock_get.return_value.headers = {"content-type": "text/html"} + + with MagicMock() as mock_module: + mock_module.get.return_value = mock_get + sys.modules["httpx"] = mock_module + yield mock_module + +@pytest.fixture +def mock_bs4(): + """Mock BeautifulSoup for parsing.""" + from bs4 import BeautifulSoup + soup = BeautifulSoup("Test", "html.parser") + return soup diff --git a/webscraper/tests/test_server.py b/webscraper/tests/test_server.py new file mode 100644 index 0000000..b873eb9 --- /dev/null +++ b/webscraper/tests/test_server.py @@ -0,0 +1,197 @@ +"""Comprehensive tests for webscraper server.""" + +import pytest +from unittest.mock import MagicMock, patch +from src.server import ( + webscraper_fetch, webscraper_fetch_links, webscraper_fetch_tables, + webscraper_fetch_all, webscraper_fetch_section, webscraper_fetch_meta, + webscraper_fetch_sitemap, clean_soup, filter_junk_links +) + +@pytest.fixture +def mock_response(): + """Mock httpx response.""" + mock_resp = MagicMock() + mock_resp.status_code = 200 + mock_resp.text = """ + + Test Page + + + + +

Header

+

Paragraph 1

+ Link 1 + Junk Mail + Junk JS +
Cell1Cell2
+
Selected content
+ + + """ + mock_resp.headers = {"content-type": "text/html"} + return mock_resp + +@pytest.fixture +def mock_sitemap_response(): + """Mock sitemap response.""" + mock_resp = MagicMock() + mock_resp.status_code = 200 + mock_resp.text = """ + + + https://example.com/page1 + https://example.com/page2 + https://example.com/sitemap.xml + + """ + return mock_resp + +@patch('httpx.get') +def test_webscraper_fetch(mock_get, mock_response): + """Test webscraper_fetch tool.""" + mock_get.return_value = mock_response + result = webscraper_fetch("https://example.com", max_chars=100) + assert "# Test Page" in result + assert "Paragraph 1" in result + assert "URL: https://example.com" in result + assert len(result) < 500 # Truncated + +@patch('httpx.get') +def test_webscraper_fetch_error(mock_get): + """Test error handling in webscraper_fetch.""" + mock_get.side_effect = httpx.RequestError("Connection failed") + result = webscraper_fetch("https://fail.com") + assert "Error fetching" in result + +@patch('httpx.get') +def test_webscraper_fetch_links(mock_get, mock_response): + """Test webscraper_fetch_links tool.""" + mock_get.return_value = mock_response + result = webscraper_fetch_links("https://example.com", deduplicate=True) + assert isinstance(result, list) + assert "https://example.com/link1" in result + assert len(result) == 1 # Only valid link + +@patch('httpx.get') +def test_webscraper_fetch_links_no_dedup(mock_get, mock_response): + """Test without deduplication.""" + mock_get.return_value = mock_response + result = webscraper_fetch_links("https://example.com", deduplicate=False) + assert len(result) == 1 # Still one unique + +@patch('httpx.get') +def test_webscraper_fetch_tables(mock_get, mock_response): + """Test webscraper_fetch_tables tool.""" + mock_get.return_value = mock_response + result = webscraper_fetch_tables("https://example.com") + assert isinstance(result, list) + assert "| Cell1 | Cell2 |" in result[0] + +@patch('httpx.get') +def 
test_webscraper_fetch_all(mock_get, mock_response): + """Test webscraper_fetch_all tool.""" + mock_get.return_value = mock_response + result = webscraper_fetch_all("https://example.com", max_chars=100) + assert "markdown" in result + assert "links" in result + assert "tables" in result + assert "meta" in result + +@patch('httpx.get') +def test_webscraper_fetch_section(mock_get, mock_response): + """Test webscraper_fetch_section tool.""" + mock_get.return_value = mock_response + result = webscraper_fetch_section("https://example.com", ".content") + assert "Selected content" in result + +@patch('httpx.get') +def test_webscraper_fetch_section_no_match(mock_get, mock_response): + """Test selector with no match.""" + mock_get.return_value = mock_response + result = webscraper_fetch_section("https://example.com", ".nonexistent") + assert "No element found" in result + +@patch('httpx.get') +def test_webscraper_fetch_meta(mock_get, mock_response): + """Test webscraper_fetch_meta tool.""" + mock_get.return_value = mock_response + result = webscraper_fetch_meta("https://example.com") + assert result["title"] == "Test Page" + assert result["description"] == "Test desc" + assert result["og:title"] == "OG Title" + +@patch('httpx.get') +def test_webscraper_fetch_sitemap(mock_get, mock_sitemap_response): + """Test webscraper_fetch_sitemap tool.""" + mock_get.return_value = mock_sitemap_response + result = webscraper_fetch_sitemap("https://example.com/sitemap.xml", max_urls=2) + assert isinstance(result, list) + assert "https://example.com/page1" in result + assert len(result) == 2 # Limited by max_urls + +@patch('httpx.get') +def test_webscraper_fetch_sitemap_loop_protection(mock_get, mock_sitemap_response): + """Test sitemap loop protection.""" + mock_get.return_value = mock_sitemap_response + result = webscraper_fetch_sitemap("https://example.com/sitemap.xml") + assert "https://example.com/sitemap.xml" not in result # Self-reference removed + +def test_clean_soup(): + """Test 
clean_soup helper.""" + from bs4 import BeautifulSoup + soup = BeautifulSoup('

Text

', 'lxml') + cleaned = clean_soup(soup) + assert '