feat(mcp): update bigmind/mcp-image-gen/webscraper servers; add image-gen batch scripts
This commit is contained in:
@@ -4,13 +4,14 @@ import httpx
|
||||
from bs4 import BeautifulSoup
|
||||
from html2text import html2text
|
||||
from urllib.parse import urljoin, quote_plus
|
||||
from typing import List, Dict, Tuple
|
||||
from typing import List, Dict, Tuple, Annotated
|
||||
import re
|
||||
import ssl
|
||||
import os
|
||||
import certifi
|
||||
from pathlib import Path
|
||||
from fastmcp import FastMCP
|
||||
from pydantic import Field
|
||||
|
||||
mcp = FastMCP("webscraper")
|
||||
|
||||
@@ -54,13 +55,9 @@ def filter_junk_links(href: str) -> bool:
|
||||
return not any(re.match(pattern, href.lower()) for pattern in junk_patterns)
|
||||
|
||||
@mcp.tool()
|
||||
def webscraper_fetch(url: str, max_chars: int = 5000) -> str:
|
||||
def webscraper_fetch(url: Annotated[str, Field(description="The URL to fetch")], max_chars: Annotated[int, Field(description="Maximum characters in the markdown body (default: 5000)")] = 5000) -> str:
|
||||
"""Fetch a URL and return title + markdown body + metadata.
|
||||
|
||||
Args:
|
||||
url: The URL to fetch
|
||||
max_chars: Maximum characters in the markdown body (default: 5000)
|
||||
|
||||
|
||||
Returns:
|
||||
Markdown string with title, body, and metadata
|
||||
"""
|
||||
@@ -78,13 +75,9 @@ def webscraper_fetch(url: str, max_chars: int = 5000) -> str:
|
||||
return f"# Error fetching {url}\n\n{str(e)}"
|
||||
|
||||
@mcp.tool()
|
||||
def webscraper_fetch_links(url: str, deduplicate: bool = True) -> List[str]:
|
||||
def webscraper_fetch_links(url: Annotated[str, Field(description="The URL to fetch")], deduplicate: Annotated[bool, Field(description="Remove duplicate links (default: True)")] = True) -> List[str]:
|
||||
"""Fetch a URL and extract all href links.
|
||||
|
||||
Args:
|
||||
url: The URL to fetch
|
||||
deduplicate: Remove duplicate links (default: True)
|
||||
|
||||
|
||||
Returns:
|
||||
List of unique href URLs
|
||||
"""
|
||||
@@ -105,12 +98,9 @@ def webscraper_fetch_links(url: str, deduplicate: bool = True) -> List[str]:
|
||||
return [f"Error: {str(e)}"]
|
||||
|
||||
@mcp.tool()
|
||||
def webscraper_fetch_tables(url: str) -> List[str]:
|
||||
def webscraper_fetch_tables(url: Annotated[str, Field(description="The URL to fetch")]) -> List[str]:
|
||||
"""Fetch a URL and extract all HTML tables as markdown.
|
||||
|
||||
Args:
|
||||
url: The URL to fetch
|
||||
|
||||
|
||||
Returns:
|
||||
List of markdown tables
|
||||
"""
|
||||
@@ -125,13 +115,9 @@ def webscraper_fetch_tables(url: str) -> List[str]:
|
||||
return [f"Error: {str(e)}"]
|
||||
|
||||
@mcp.tool()
|
||||
def webscraper_fetch_all(url: str, max_chars: int = 5000) -> Dict:
|
||||
def webscraper_fetch_all(url: Annotated[str, Field(description="The URL to fetch")], max_chars: Annotated[int, Field(description="Maximum characters (default: 5000)")] = 5000) -> Dict:
|
||||
"""Fetch everything: markdown + links + tables + meta.
|
||||
|
||||
Args:
|
||||
url: The URL to fetch
|
||||
max_chars: Maximum characters (default: 5000)
|
||||
|
||||
|
||||
Returns:
|
||||
Dict with 'markdown', 'links', 'tables', 'meta'
|
||||
"""
|
||||
@@ -181,13 +167,9 @@ def webscraper_fetch_all(url: str, max_chars: int = 5000) -> Dict:
|
||||
return {"error": str(e)}
|
||||
|
||||
@mcp.tool()
|
||||
def webscraper_fetch_section(url: str, selector: str) -> str:
|
||||
def webscraper_fetch_section(url: Annotated[str, Field(description="The URL to fetch")], selector: Annotated[str, Field(description="CSS selector (e.g., '.content')")]) -> str:
|
||||
"""Fetch a URL and extract specific section by CSS selector.
|
||||
|
||||
Args:
|
||||
url: The URL to fetch
|
||||
selector: CSS selector (e.g., '.content')
|
||||
|
||||
|
||||
Returns:
|
||||
Markdown of the selected section
|
||||
"""
|
||||
@@ -210,12 +192,9 @@ def webscraper_fetch_section(url: str, selector: str) -> str:
|
||||
return f"Error: {str(e)}"
|
||||
|
||||
@mcp.tool()
|
||||
def webscraper_fetch_meta(url: str) -> Dict[str, str]:
|
||||
def webscraper_fetch_meta(url: Annotated[str, Field(description="The URL to fetch")]) -> Dict[str, str]:
|
||||
"""Fetch a URL and return page metadata: title, description, OG tags.
|
||||
|
||||
Args:
|
||||
url: The URL to fetch
|
||||
|
||||
|
||||
Returns:
|
||||
Dict of metadata
|
||||
"""
|
||||
@@ -238,13 +217,9 @@ def webscraper_fetch_meta(url: str) -> Dict[str, str]:
|
||||
return {"error": str(e)}
|
||||
|
||||
@mcp.tool()
|
||||
def webscraper_fetch_sitemap(url: str, max_urls: int = 100) -> List[str]:
|
||||
def webscraper_fetch_sitemap(url: Annotated[str, Field(description="Sitemap URL (or auto-discover)")], max_urls: Annotated[int, Field(description="Maximum URLs to return (default: 100)")] = 100) -> List[str]:
|
||||
"""Fetch sitemap.xml and return list of URLs.
|
||||
|
||||
Args:
|
||||
url: Sitemap URL (or auto-discover)
|
||||
max_urls: Maximum URLs to return (default: 100)
|
||||
|
||||
|
||||
Returns:
|
||||
List of sitemap URLs
|
||||
"""
|
||||
@@ -263,17 +238,13 @@ def webscraper_fetch_sitemap(url: str, max_urls: int = 100) -> List[str]:
|
||||
return [f"Error: {str(e)}"]
|
||||
|
||||
@mcp.tool()
|
||||
def webscraper_search_hint(query: str, max_results: int = 5) -> Dict:
|
||||
def webscraper_search_hint(query: Annotated[str, Field(description="Search query (e.g. \"MacBook Pro M4 price Germany\")")], max_results: Annotated[int, Field(description="Maximum number of results to return (default: 5)")] = 5) -> Dict:
|
||||
"""Search Brave Search and return top results as a scraping hint.
|
||||
|
||||
Use this sparingly — once per research task — to get oriented before
|
||||
scraping individual pages. Returns top result URLs + snippets so you
|
||||
can decide which pages are worth scraping deeply.
|
||||
|
||||
Args:
|
||||
query: Search query (e.g. "MacBook Pro M4 price Germany")
|
||||
max_results: Maximum number of results to return (default: 5)
|
||||
|
||||
Returns:
|
||||
Dict with 'query', 'search_url', 'results' (list of {title, url, snippet}),
|
||||
'result_count', 'hint'
|
||||
@@ -285,14 +256,23 @@ def webscraper_search_hint(query: str, max_results: int = 5) -> Dict:
|
||||
results = []
|
||||
seen_urls: set = set()
|
||||
|
||||
# Brave Search result cards: each div.snippet contains title, URL, description
|
||||
# Brave Search result cards: each div.snippet with a .result-wrapper is a web result.
|
||||
# Skip video clusters, FAQ blocks, and LLM snippets (they have no .result-wrapper).
|
||||
# Class names as of 2026-04 (updated from .snippet-title / .snippet-description):
|
||||
# title → .search-snippet-title
|
||||
# url → a.l1 (the primary result anchor, avoids favicon <a> tags)
|
||||
# snippet → .content.t-primary
|
||||
for card in soup.select('.snippet'):
|
||||
if len(results) >= max_results:
|
||||
break
|
||||
|
||||
title_el = card.select_one('.snippet-title')
|
||||
url_el = card.select_one('a')
|
||||
desc_el = card.select_one('.snippet-description')
|
||||
# Skip non-web-result snippets (videos, FAQ, LLM answer blocks)
|
||||
if not card.select_one('.result-wrapper'):
|
||||
continue
|
||||
|
||||
title_el = card.select_one('.search-snippet-title')
|
||||
url_el = card.select_one('a.l1')
|
||||
desc_el = card.select_one('.content.t-primary')
|
||||
|
||||
title = title_el.get_text(strip=True) if title_el else ""
|
||||
url = url_el['href'] if url_el and url_el.get('href') else ""
|
||||
|
||||
@@ -206,27 +206,39 @@ def test_sitemap_max_urls(mock_get, mock_sitemap_response):
|
||||
|
||||
# --- webscraper_search_hint tests ---
|
||||
|
||||
# Helper to build a Brave-style result card with the new 2026-04 class names.
|
||||
# Real result cards have a .result-wrapper; non-result blocks (videos, FAQ) do not.
|
||||
def _brave_card(href: str, title: str, snippet: str) -> str:
|
||||
"""Return a mock Brave Search .snippet card with .result-wrapper (web result)."""
|
||||
return f"""
|
||||
<div class="snippet svelte-jmfu5f">
|
||||
<div class="result-wrapper svelte-1rq4ngz">
|
||||
<div class="result-content svelte-1rq4ngz">
|
||||
<a class="l1 svelte-14r20fy" href="{href}">
|
||||
<div class="search-snippet-title line-clamp-1 svelte-14r20fy">{title}</div>
|
||||
</a>
|
||||
<div class="generic-snippet svelte-1cwdgg3">
|
||||
<div class="content desktop-default-regular t-primary line-clamp-dynamic svelte-1cwdgg3">{snippet}</div>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
</div>"""
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def mock_brave_response():
|
||||
"""Mock Brave Search HTML response with result cards."""
|
||||
"""Mock Brave Search HTML response with result cards (2026-04 class names)."""
|
||||
mock_resp = MagicMock()
|
||||
mock_resp.status_code = 200
|
||||
mock_resp.text = """
|
||||
<html><body>
|
||||
<div class="snippet">
|
||||
<a href="https://example.com/article1" class="snippet-title">Feynman on Electric Fields</a>
|
||||
<div class="snippet-title">Feynman on Electric Fields</div>
|
||||
<div class="snippet-description">Richard Feynman explains that all matter has an electric field.</div>
|
||||
</div>
|
||||
<div class="snippet">
|
||||
<a href="https://example.com/article2" class="snippet-title">Electric Fields Everywhere</a>
|
||||
<div class="snippet-title">Electric Fields Everywhere</div>
|
||||
<div class="snippet-description">Everything in the universe is surrounded by electric fields.</div>
|
||||
</div>
|
||||
<div class="snippet">
|
||||
<a href="javascript:void(0)" class="snippet-title">JS Junk</a>
|
||||
<div class="snippet-title">JS Junk</div>
|
||||
<div class="snippet-description">Should be filtered out.</div>
|
||||
<html><body id="results">
|
||||
""" + _brave_card("https://example.com/article1", "Feynman on Electric Fields",
|
||||
"Richard Feynman explains that all matter has an electric field.") + """
|
||||
""" + _brave_card("https://example.com/article2", "Electric Fields Everywhere",
|
||||
"Everything in the universe is surrounded by electric fields.") + """
|
||||
<!-- Non-result block (no .result-wrapper) — should be skipped -->
|
||||
<div class="snippet svelte-jmfu5f standalone" id="faq">
|
||||
<header class="desktop-heading-h4">FAQ</header>
|
||||
</div>
|
||||
</body></html>
|
||||
"""
|
||||
@@ -240,22 +252,10 @@ def mock_brave_response_dups():
|
||||
mock_resp = MagicMock()
|
||||
mock_resp.status_code = 200
|
||||
mock_resp.text = """
|
||||
<html><body>
|
||||
<div class="snippet">
|
||||
<a href="https://example.com/dup">Dup Result A</a>
|
||||
<div class="snippet-title">Dup Result A</div>
|
||||
<div class="snippet-description">First occurrence.</div>
|
||||
</div>
|
||||
<div class="snippet">
|
||||
<a href="https://example.com/dup">Dup Result B</a>
|
||||
<div class="snippet-title">Dup Result B</div>
|
||||
<div class="snippet-description">Second occurrence — same URL.</div>
|
||||
</div>
|
||||
<div class="snippet">
|
||||
<a href="https://example.com/unique">Unique Result</a>
|
||||
<div class="snippet-title">Unique Result</div>
|
||||
<div class="snippet-description">Only once.</div>
|
||||
</div>
|
||||
<html><body id="results">
|
||||
""" + _brave_card("https://example.com/dup", "Dup Result A", "First occurrence.") + """
|
||||
""" + _brave_card("https://example.com/dup", "Dup Result B", "Second occurrence — same URL.") + """
|
||||
""" + _brave_card("https://example.com/unique", "Unique Result", "Only once.") + """
|
||||
</body></html>
|
||||
"""
|
||||
mock_resp.headers = {"content-type": "text/html"}
|
||||
@@ -268,17 +268,9 @@ def mock_brave_response_empty_content():
|
||||
mock_resp = MagicMock()
|
||||
mock_resp.status_code = 200
|
||||
mock_resp.text = """
|
||||
<html><body>
|
||||
<div class="snippet">
|
||||
<a href="https://example.com/ghost"></a>
|
||||
<div class="snippet-title"></div>
|
||||
<div class="snippet-description"></div>
|
||||
</div>
|
||||
<div class="snippet">
|
||||
<a href="https://example.com/real">Real Result</a>
|
||||
<div class="snippet-title">Real Result</div>
|
||||
<div class="snippet-description">Has content.</div>
|
||||
</div>
|
||||
<html><body id="results">
|
||||
""" + _brave_card("https://example.com/ghost", "", "") + """
|
||||
""" + _brave_card("https://example.com/real", "Real Result", "Has content.") + """
|
||||
</body></html>
|
||||
"""
|
||||
mock_resp.headers = {"content-type": "text/html"}
|
||||
|
||||
Reference in New Issue
Block a user