feat(mcp): update bigmind/mcp-image-gen/webscraper servers; add image-gen batch scripts

This commit is contained in:
Patrick Plate
2026-06-11 09:02:09 +02:00
parent 0cb94122bf
commit bf721c1379
9 changed files with 2659 additions and 297 deletions
+30 -50
View File
@@ -4,13 +4,14 @@ import httpx
from bs4 import BeautifulSoup
from html2text import html2text
from urllib.parse import urljoin, quote_plus
from typing import List, Dict, Tuple
from typing import List, Dict, Tuple, Annotated
import re
import ssl
import os
import certifi
from pathlib import Path
from fastmcp import FastMCP
from pydantic import Field
mcp = FastMCP("webscraper")
@@ -54,13 +55,9 @@ def filter_junk_links(href: str) -> bool:
return not any(re.match(pattern, href.lower()) for pattern in junk_patterns)
@mcp.tool()
def webscraper_fetch(url: str, max_chars: int = 5000) -> str:
def webscraper_fetch(url: Annotated[str, Field(description="The URL to fetch")], max_chars: Annotated[int, Field(description="Maximum characters in the markdown body (default: 5000)")] = 5000) -> str:
"""Fetch a URL and return title + markdown body + metadata.
Args:
url: The URL to fetch
max_chars: Maximum characters in the markdown body (default: 5000)
Returns:
Markdown string with title, body, and metadata
"""
@@ -78,13 +75,9 @@ def webscraper_fetch(url: str, max_chars: int = 5000) -> str:
return f"# Error fetching {url}\n\n{str(e)}"
@mcp.tool()
def webscraper_fetch_links(url: str, deduplicate: bool = True) -> List[str]:
def webscraper_fetch_links(url: Annotated[str, Field(description="The URL to fetch")], deduplicate: Annotated[bool, Field(description="Remove duplicate links (default: True)")] = True) -> List[str]:
"""Fetch a URL and extract all href links.
Args:
url: The URL to fetch
deduplicate: Remove duplicate links (default: True)
Returns:
List of unique href URLs
"""
@@ -105,12 +98,9 @@ def webscraper_fetch_links(url: str, deduplicate: bool = True) -> List[str]:
return [f"Error: {str(e)}"]
@mcp.tool()
def webscraper_fetch_tables(url: str) -> List[str]:
def webscraper_fetch_tables(url: Annotated[str, Field(description="The URL to fetch")]) -> List[str]:
"""Fetch a URL and extract all HTML tables as markdown.
Args:
url: The URL to fetch
Returns:
List of markdown tables
"""
@@ -125,13 +115,9 @@ def webscraper_fetch_tables(url: str) -> List[str]:
return [f"Error: {str(e)}"]
@mcp.tool()
def webscraper_fetch_all(url: str, max_chars: int = 5000) -> Dict:
def webscraper_fetch_all(url: Annotated[str, Field(description="The URL to fetch")], max_chars: Annotated[int, Field(description="Maximum characters (default: 5000)")] = 5000) -> Dict:
"""Fetch everything: markdown + links + tables + meta.
Args:
url: The URL to fetch
max_chars: Maximum characters (default: 5000)
Returns:
Dict with 'markdown', 'links', 'tables', 'meta'
"""
@@ -181,13 +167,9 @@ def webscraper_fetch_all(url: str, max_chars: int = 5000) -> Dict:
return {"error": str(e)}
@mcp.tool()
def webscraper_fetch_section(url: str, selector: str) -> str:
def webscraper_fetch_section(url: Annotated[str, Field(description="The URL to fetch")], selector: Annotated[str, Field(description="CSS selector (e.g., '.content')")]) -> str:
"""Fetch a URL and extract specific section by CSS selector.
Args:
url: The URL to fetch
selector: CSS selector (e.g., '.content')
Returns:
Markdown of the selected section
"""
@@ -210,12 +192,9 @@ def webscraper_fetch_section(url: str, selector: str) -> str:
return f"Error: {str(e)}"
@mcp.tool()
def webscraper_fetch_meta(url: str) -> Dict[str, str]:
def webscraper_fetch_meta(url: Annotated[str, Field(description="The URL to fetch")]) -> Dict[str, str]:
"""Fetch a URL and return page metadata: title, description, OG tags.
Args:
url: The URL to fetch
Returns:
Dict of metadata
"""
@@ -238,13 +217,9 @@ def webscraper_fetch_meta(url: str) -> Dict[str, str]:
return {"error": str(e)}
@mcp.tool()
def webscraper_fetch_sitemap(url: str, max_urls: int = 100) -> List[str]:
def webscraper_fetch_sitemap(url: Annotated[str, Field(description="Sitemap URL (or auto-discover)")], max_urls: Annotated[int, Field(description="Maximum URLs to return (default: 100)")] = 100) -> List[str]:
"""Fetch sitemap.xml and return list of URLs.
Args:
url: Sitemap URL (or auto-discover)
max_urls: Maximum URLs to return (default: 100)
Returns:
List of sitemap URLs
"""
@@ -263,17 +238,13 @@ def webscraper_fetch_sitemap(url: str, max_urls: int = 100) -> List[str]:
return [f"Error: {str(e)}"]
@mcp.tool()
def webscraper_search_hint(query: str, max_results: int = 5) -> Dict:
def webscraper_search_hint(query: Annotated[str, Field(description="Search query (e.g. \"MacBook Pro M4 price Germany\")")], max_results: Annotated[int, Field(description="Maximum number of results to return (default: 5)")] = 5) -> Dict:
"""Search Brave Search and return top results as a scraping hint.
Use this sparingly — once per research task — to get oriented before
scraping individual pages. Returns top result URLs + snippets so you
can decide which pages are worth scraping deeply.
Args:
query: Search query (e.g. "MacBook Pro M4 price Germany")
max_results: Maximum number of results to return (default: 5)
Returns:
Dict with 'query', 'search_url', 'results' (list of {title, url, snippet}),
'result_count', 'hint'
@@ -285,14 +256,23 @@ def webscraper_search_hint(query: str, max_results: int = 5) -> Dict:
results = []
seen_urls: set = set()
# Brave Search result cards: each div.snippet contains title, URL, description
# Brave Search result cards: each div.snippet with a .result-wrapper is a web result.
# Skip video clusters, FAQ blocks, and LLM snippets (they have no .result-wrapper).
# Class names as of 2026-04 (updated from .snippet-title / .snippet-description):
# title → .search-snippet-title
# url → a.l1 (the primary result anchor, avoids favicon <a> tags)
# snippet → .content.t-primary
for card in soup.select('.snippet'):
if len(results) >= max_results:
break
title_el = card.select_one('.snippet-title')
url_el = card.select_one('a')
desc_el = card.select_one('.snippet-description')
# Skip non-web-result snippets (videos, FAQ, LLM answer blocks)
if not card.select_one('.result-wrapper'):
continue
title_el = card.select_one('.search-snippet-title')
url_el = card.select_one('a.l1')
desc_el = card.select_one('.content.t-primary')
title = title_el.get_text(strip=True) if title_el else ""
url = url_el['href'] if url_el and url_el.get('href') else ""
+35 -43
View File
@@ -206,27 +206,39 @@ def test_sitemap_max_urls(mock_get, mock_sitemap_response):
# --- webscraper_search_hint tests ---
# Helper to build a Brave-style result card with the new 2026-04 class names.
# Real result cards have a .result-wrapper; non-result blocks (videos, FAQ) do not.
def _brave_card(href: str, title: str, snippet: str) -> str:
"""Return a mock Brave Search .snippet card with .result-wrapper (web result)."""
return f"""
<div class="snippet svelte-jmfu5f">
<div class="result-wrapper svelte-1rq4ngz">
<div class="result-content svelte-1rq4ngz">
<a class="l1 svelte-14r20fy" href="{href}">
<div class="search-snippet-title line-clamp-1 svelte-14r20fy">{title}</div>
</a>
<div class="generic-snippet svelte-1cwdgg3">
<div class="content desktop-default-regular t-primary line-clamp-dynamic svelte-1cwdgg3">{snippet}</div>
</div>
</div>
</div>
</div>"""
@pytest.fixture
def mock_brave_response():
"""Mock Brave Search HTML response with result cards."""
"""Mock Brave Search HTML response with result cards (2026-04 class names)."""
mock_resp = MagicMock()
mock_resp.status_code = 200
mock_resp.text = """
<html><body>
<div class="snippet">
<a href="https://example.com/article1" class="snippet-title">Feynman on Electric Fields</a>
<div class="snippet-title">Feynman on Electric Fields</div>
<div class="snippet-description">Richard Feynman explains that all matter has an electric field.</div>
</div>
<div class="snippet">
<a href="https://example.com/article2" class="snippet-title">Electric Fields Everywhere</a>
<div class="snippet-title">Electric Fields Everywhere</div>
<div class="snippet-description">Everything in the universe is surrounded by electric fields.</div>
</div>
<div class="snippet">
<a href="javascript:void(0)" class="snippet-title">JS Junk</a>
<div class="snippet-title">JS Junk</div>
<div class="snippet-description">Should be filtered out.</div>
<html><body id="results">
""" + _brave_card("https://example.com/article1", "Feynman on Electric Fields",
"Richard Feynman explains that all matter has an electric field.") + """
""" + _brave_card("https://example.com/article2", "Electric Fields Everywhere",
"Everything in the universe is surrounded by electric fields.") + """
<!-- Non-result block (no .result-wrapper) — should be skipped -->
<div class="snippet svelte-jmfu5f standalone" id="faq">
<header class="desktop-heading-h4">FAQ</header>
</div>
</body></html>
"""
@@ -240,22 +252,10 @@ def mock_brave_response_dups():
mock_resp = MagicMock()
mock_resp.status_code = 200
mock_resp.text = """
<html><body>
<div class="snippet">
<a href="https://example.com/dup">Dup Result A</a>
<div class="snippet-title">Dup Result A</div>
<div class="snippet-description">First occurrence.</div>
</div>
<div class="snippet">
<a href="https://example.com/dup">Dup Result B</a>
<div class="snippet-title">Dup Result B</div>
<div class="snippet-description">Second occurrence — same URL.</div>
</div>
<div class="snippet">
<a href="https://example.com/unique">Unique Result</a>
<div class="snippet-title">Unique Result</div>
<div class="snippet-description">Only once.</div>
</div>
<html><body id="results">
""" + _brave_card("https://example.com/dup", "Dup Result A", "First occurrence.") + """
""" + _brave_card("https://example.com/dup", "Dup Result B", "Second occurrence — same URL.") + """
""" + _brave_card("https://example.com/unique", "Unique Result", "Only once.") + """
</body></html>
"""
mock_resp.headers = {"content-type": "text/html"}
@@ -268,17 +268,9 @@ def mock_brave_response_empty_content():
mock_resp = MagicMock()
mock_resp.status_code = 200
mock_resp.text = """
<html><body>
<div class="snippet">
<a href="https://example.com/ghost"></a>
<div class="snippet-title"></div>
<div class="snippet-description"></div>
</div>
<div class="snippet">
<a href="https://example.com/real">Real Result</a>
<div class="snippet-title">Real Result</div>
<div class="snippet-description">Has content.</div>
</div>
<html><body id="results">
""" + _brave_card("https://example.com/ghost", "", "") + """
""" + _brave_card("https://example.com/real", "Real Result", "Has content.") + """
</body></html>
"""
mock_resp.headers = {"content-type": "text/html"}