diff --git a/webscraper/coverage.xml b/webscraper/coverage.xml
new file mode 100644
index 0000000..a2e24d4
--- /dev/null
+++ b/webscraper/coverage.xml
@@ -0,0 +1,139 @@
+
+
+
+
+
+ /home/pplate/pi_mcps/webscraper/src
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/webscraper/src/server.py b/webscraper/src/server.py
index 616492a..8432af5 100644
--- a/webscraper/src/server.py
+++ b/webscraper/src/server.py
@@ -1,15 +1,22 @@
"""Webscraper MCP server — fetch web pages, extract content, links, tables, sitemaps."""
import httpx
-from bs4 import BeautifulSoup
+from bs4 import BeautifulSoup, SelectorSyntaxError
from html2text import html2text
-from urllib.parse import urljoin, urlparse
-from typing import List, Dict, Optional
+from urllib.parse import urljoin
+from typing import List, Dict, Tuple
import re
from fastmcp import FastMCP
mcp = FastMCP("webscraper")
+def _fetch_page(url: str) -> Tuple[httpx.Response, BeautifulSoup]:
+ """Shared fetch helper — returns response and parsed soup."""
+ response = httpx.get(url, timeout=10.0)
+ response.raise_for_status()
+ soup = BeautifulSoup(response.text, 'lxml')
+ return response, soup
+
def clean_soup(soup):
"""Remove script, style, and other junk from soup before extraction."""
for element in soup(["script", "style", "nav", "footer", "header"]):
@@ -33,10 +40,7 @@ def webscraper_fetch(url: str, max_chars: int = 5000) -> str:
Markdown string with title, body, and metadata
"""
try:
- response = httpx.get(url, timeout=10.0)
- response.raise_for_status()
-
- soup = BeautifulSoup(response.text, 'lxml')
+ response, soup = _fetch_page(url)
title = soup.title.string if soup.title else "No Title"
soup = clean_soup(soup)
body = html2text(str(soup.body if soup.body else soup), bodywidth=0)
@@ -45,7 +49,7 @@ def webscraper_fetch(url: str, max_chars: int = 5000) -> str:
metadata = f"URL: {url}\nStatus: {response.status_code}\nContent-Type: {response.headers.get('content-type', 'unknown')}"
return f"# {title}\n\n{body}\n\n## Metadata\n{metadata}"
- except httpx.RequestError as e:
+ except (httpx.RequestError, httpx.HTTPStatusError) as e:
return f"# Error fetching {url}\n\n{str(e)}"
@mcp.tool()
@@ -60,23 +64,19 @@ def webscraper_fetch_links(url: str, deduplicate: bool = True) -> List[str]:
List of unique href URLs
"""
try:
- response = httpx.get(url, timeout=10.0)
- response.raise_for_status()
-
- soup = BeautifulSoup(response.text, 'lxml')
+ _, soup = _fetch_page(url)
links = []
for a in soup.find_all('a', href=True):
href = a['href']
- if href.startswith('http') or href.startswith('/'):
- full_url = urljoin(url, href)
- if filter_junk_links(full_url):
- links.append(full_url)
+ full_url = urljoin(url, href)
+ if filter_junk_links(full_url):
+ links.append(full_url)
if deduplicate:
links = list(set(links))
return links
- except httpx.RequestError as e:
+ except (httpx.RequestError, httpx.HTTPStatusError) as e:
return [f"Error: {str(e)}"]
@mcp.tool()
@@ -90,16 +90,13 @@ def webscraper_fetch_tables(url: str) -> List[str]:
List of markdown tables
"""
try:
- response = httpx.get(url, timeout=10.0)
- response.raise_for_status()
-
- soup = BeautifulSoup(response.text, 'lxml')
+ _, soup = _fetch_page(url)
tables = []
for table in soup.find_all('table'):
markdown_table = html2text(str(table), bodywidth=0)
tables.append(markdown_table)
return tables if tables else ["No tables found."]
- except httpx.RequestError as e:
+ except (httpx.RequestError, httpx.HTTPStatusError) as e:
return [f"Error: {str(e)}"]
@mcp.tool()
@@ -113,17 +110,50 @@ def webscraper_fetch_all(url: str, max_chars: int = 5000) -> Dict:
Returns:
Dict with 'markdown', 'links', 'tables', 'meta'
"""
- markdown = webscraper_fetch(url, max_chars)
- links = webscraper_fetch_links(url)
- tables = webscraper_fetch_tables(url)
- meta = webscraper_fetch_meta(url)
-
- return {
- "markdown": markdown,
- "links": links,
- "tables": tables,
- "meta": meta
- }
+ try:
+ response, soup = _fetch_page(url)
+
+ # Markdown
+ title = soup.title.string if soup.title else "No Title"
+ soup_clean = clean_soup(soup)
+ body = html2text(str(soup_clean.body if soup_clean.body else soup_clean), bodywidth=0)
+ body = body[:max_chars] + "..." if len(body) > max_chars else body
+ markdown = f"# {title}\n\n{body}\n\n## Metadata\nURL: {url}\nStatus: {response.status_code}\nContent-Type: {response.headers.get('content-type', 'unknown')}"
+
+ # Links
+ links = []
+ for a in soup.find_all('a', href=True):
+ href = a['href']
+ full_url = urljoin(url, href)
+ if filter_junk_links(full_url):
+ links.append(full_url)
+ links = list(set(links))
+
+ # Tables
+ tables = []
+ for table in soup.find_all('table'):
+ markdown_table = html2text(str(table), bodywidth=0)
+ tables.append(markdown_table)
+ tables = tables if tables else ["No tables found."]
+
+ # Meta
+ meta = {}
+ meta['title'] = title
+ desc_tag = soup.find('meta', attrs={'name': 'description'})
+ meta['description'] = desc_tag['content'] if desc_tag else "No description"
+ og_title = soup.find('meta', attrs={'property': 'og:title'})
+ meta['og:title'] = og_title['content'] if og_title else title
+ og_desc = soup.find('meta', attrs={'property': 'og:description'})
+ meta['og:description'] = og_desc['content'] if og_desc else meta['description']
+
+ return {
+ "markdown": markdown,
+ "links": links,
+ "tables": tables,
+ "meta": meta
+ }
+ except (httpx.RequestError, httpx.HTTPStatusError) as e:
+ return {"error": str(e)}
@mcp.tool()
def webscraper_fetch_section(url: str, selector: str) -> str:
@@ -137,18 +167,19 @@ def webscraper_fetch_section(url: str, selector: str) -> str:
Markdown of the selected section
"""
try:
- response = httpx.get(url, timeout=10.0)
- response.raise_for_status()
+ _, soup = _fetch_page(url)
+ try:
+ section = soup.select_one(selector)
+ except SelectorSyntaxError:
+ return f"Invalid CSS selector '{selector}' on {url}"
- soup = BeautifulSoup(response.text, 'lxml')
- section = soup.select_one(selector)
if not section:
return f"No element found for selector '{selector}' on {url}"
- soup = clean_soup(section)
- markdown = html2text(str(soup), bodywidth=0)
+ soup_clean = clean_soup(section)
+ markdown = html2text(str(soup_clean), bodywidth=0)
return markdown
- except httpx.RequestError as e:
+ except (httpx.RequestError, httpx.HTTPStatusError) as e:
return f"Error: {str(e)}"
@mcp.tool()
@@ -162,10 +193,7 @@ def webscraper_fetch_meta(url: str) -> Dict[str, str]:
Dict of metadata
"""
try:
- response = httpx.get(url, timeout=10.0)
- response.raise_for_status()
-
- soup = BeautifulSoup(response.text, 'lxml')
+ _, soup = _fetch_page(url)
meta = {}
meta['title'] = soup.title.string if soup.title else "No Title"
@@ -179,7 +207,7 @@ def webscraper_fetch_meta(url: str) -> Dict[str, str]:
meta['og:description'] = og_desc['content'] if og_desc else meta['description']
return meta
- except httpx.RequestError as e:
+ except (httpx.RequestError, httpx.HTTPStatusError) as e:
return {"error": str(e)}
@mcp.tool()
@@ -194,10 +222,7 @@ def webscraper_fetch_sitemap(url: str, max_urls: int = 100) -> List[str]:
List of sitemap URLs
"""
try:
- response = httpx.get(url, timeout=10.0)
- response.raise_for_status()
-
- soup = BeautifulSoup(response.text, 'xml')
+ response, soup = _fetch_page(url)
urls = []
for loc in soup.find_all('loc')[:max_urls]:
urls.append(loc.text.strip())
@@ -207,7 +232,7 @@ def webscraper_fetch_sitemap(url: str, max_urls: int = 100) -> List[str]:
urls.remove(url)
return urls if urls else [f"No URLs in sitemap {url}"]
- except httpx.RequestError as e:
+ except (httpx.RequestError, httpx.HTTPStatusError) as e:
return [f"Error: {str(e)}"]
if __name__ == "__main__":
diff --git a/webscraper/tests/conftest.py b/webscraper/tests/conftest.py
index cb6a0f4..41d97ae 100644
--- a/webscraper/tests/conftest.py
+++ b/webscraper/tests/conftest.py
@@ -5,26 +5,3 @@ from pathlib import Path
# Add src to path for imports
sys.path.insert(0, str(Path(__file__).parent.parent / "src"))
-
-import pytest
-from unittest.mock import MagicMock
-
-@pytest.fixture
-def mock_httpx():
- """Mock httpx for all network calls."""
- mock_get = MagicMock()
- mock_get.return_value.status_code = 200
- mock_get.return_value.text = "
Test"
- mock_get.return_value.headers = {"content-type": "text/html"}
-
- with MagicMock() as mock_module:
- mock_module.get.return_value = mock_get
- sys.modules["httpx"] = mock_module
- yield mock_module
-
-@pytest.fixture
-def mock_bs4():
- """Mock BeautifulSoup for parsing."""
- from bs4 import BeautifulSoup
- soup = BeautifulSoup("Test", "html.parser")
- return soup
diff --git a/webscraper/tests/test_server.py b/webscraper/tests/test_server.py
index b873eb9..404d5ec 100644
--- a/webscraper/tests/test_server.py
+++ b/webscraper/tests/test_server.py
@@ -1,6 +1,7 @@
"""Comprehensive tests for webscraper server."""
import pytest
+import httpx
from unittest.mock import MagicMock, patch
from src.server import (
webscraper_fetch, webscraper_fetch_links, webscraper_fetch_tables,
@@ -25,6 +26,8 @@ def mock_response():
Link 1
Junk Mail
Junk JS
+ Relative Link
+ Parent Relative
Selected content