feat(mcp): update bigmind/mcp-image-gen/webscraper servers; add image-gen batch scripts

2026-06-11 09:02:09 +02:00
parent 0cb94122bf
commit bf721c1379
9 changed files with 2659 additions and 297 deletions
@@ -4,13 +4,14 @@ import httpx
 from bs4 import BeautifulSoup
 from html2text import html2text
 from urllib.parse import urljoin, quote_plus
-from typing import List, Dict, Tuple
+from typing import List, Dict, Tuple, Annotated
 import re
 import ssl
 import os
 import certifi
 from pathlib import Path
 from fastmcp import FastMCP
+from pydantic import Field

 mcp = FastMCP("webscraper")

@@ -54,13 +55,9 @@ def filter_junk_links(href: str) -> bool:
    return not any(re.match(pattern, href.lower()) for pattern in junk_patterns)

@mcp.tool()
-def webscraper_fetch(url: str, max_chars: int = 5000) -> str:
+def webscraper_fetch(url: Annotated[str, Field(description="The URL to fetch")], max_chars: Annotated[int, Field(description="Maximum characters in the markdown body (default: 5000)")] = 5000) -> str:
    """Fetch a URL and return title + markdown body + metadata.
-    
-    Args:
-        url: The URL to fetch
-        max_chars: Maximum characters in the markdown body (default: 5000)
-    
+
    Returns:
        Markdown string with title, body, and metadata
    """
@@ -78,13 +75,9 @@ def webscraper_fetch(url: str, max_chars: int = 5000) -> str:
        return f"# Error fetching {url}\n\n{str(e)}"

@mcp.tool()
-def webscraper_fetch_links(url: str, deduplicate: bool = True) -> List[str]:
+def webscraper_fetch_links(url: Annotated[str, Field(description="The URL to fetch")], deduplicate: Annotated[bool, Field(description="Remove duplicate links (default: True)")] = True) -> List[str]:
    """Fetch a URL and extract all href links.
-    
-    Args:
-        url: The URL to fetch
-        deduplicate: Remove duplicate links (default: True)
-    
+
    Returns:
        List of unique href URLs
    """
@@ -105,12 +98,9 @@ def webscraper_fetch_links(url: str, deduplicate: bool = True) -> List[str]:
        return [f"Error: {str(e)}"]

@mcp.tool()
-def webscraper_fetch_tables(url: str) -> List[str]:
+def webscraper_fetch_tables(url: Annotated[str, Field(description="The URL to fetch")]) -> List[str]:
    """Fetch a URL and extract all HTML tables as markdown.
-    
-    Args:
-        url: The URL to fetch
-    
+
    Returns:
        List of markdown tables
    """
@@ -125,13 +115,9 @@ def webscraper_fetch_tables(url: str) -> List[str]:
        return [f"Error: {str(e)}"]

@mcp.tool()
-def webscraper_fetch_all(url: str, max_chars: int = 5000) -> Dict:
+def webscraper_fetch_all(url: Annotated[str, Field(description="The URL to fetch")], max_chars: Annotated[int, Field(description="Maximum characters (default: 5000)")] = 5000) -> Dict:
    """Fetch everything: markdown + links + tables + meta.
-    
-    Args:
-        url: The URL to fetch
-        max_chars: Maximum characters (default: 5000)
-    
+
    Returns:
        Dict with 'markdown', 'links', 'tables', 'meta'
    """
@@ -181,13 +167,9 @@ def webscraper_fetch_all(url: str, max_chars: int = 5000) -> Dict:
        return {"error": str(e)}

@mcp.tool()
-def webscraper_fetch_section(url: str, selector: str) -> str:
+def webscraper_fetch_section(url: Annotated[str, Field(description="The URL to fetch")], selector: Annotated[str, Field(description="CSS selector (e.g., '.content')")]) -> str:
    """Fetch a URL and extract specific section by CSS selector.
-    
-    Args:
-        url: The URL to fetch
-        selector: CSS selector (e.g., '.content')
-    
+
    Returns:
        Markdown of the selected section
    """
@@ -210,12 +192,9 @@ def webscraper_fetch_section(url: str, selector: str) -> str:
        return f"Error: {str(e)}"

@mcp.tool()
-def webscraper_fetch_meta(url: str) -> Dict[str, str]:
+def webscraper_fetch_meta(url: Annotated[str, Field(description="The URL to fetch")]) -> Dict[str, str]:
    """Fetch a URL and return page metadata: title, description, OG tags.
-    
-    Args:
-        url: The URL to fetch
-    
+
    Returns:
        Dict of metadata
    """
@@ -238,13 +217,9 @@ def webscraper_fetch_meta(url: str) -> Dict[str, str]:
        return {"error": str(e)}

@mcp.tool()
-def webscraper_fetch_sitemap(url: str, max_urls: int = 100) -> List[str]:
+def webscraper_fetch_sitemap(url: Annotated[str, Field(description="Sitemap URL (or auto-discover)")], max_urls: Annotated[int, Field(description="Maximum URLs to return (default: 100)")] = 100) -> List[str]:
    """Fetch sitemap.xml and return list of URLs.
-    
-    Args:
-        url: Sitemap URL (or auto-discover)
-        max_urls: Maximum URLs to return (default: 100)
-    
+
    Returns:
        List of sitemap URLs
    """
@@ -263,17 +238,13 @@ def webscraper_fetch_sitemap(url: str, max_urls: int = 100) -> List[str]:
        return [f"Error: {str(e)}"]

@mcp.tool()
-def webscraper_search_hint(query: str, max_results: int = 5) -> Dict:
+def webscraper_search_hint(query: Annotated[str, Field(description="Search query (e.g. \"MacBook Pro M4 price Germany\")")], max_results: Annotated[int, Field(description="Maximum number of results to return (default: 5)")] = 5) -> Dict:
    """Search Brave Search and return top results as a scraping hint.

    Use this sparingly — once per research task — to get oriented before
    scraping individual pages. Returns top result URLs + snippets so you
    can decide which pages are worth scraping deeply.

-    Args:
-        query: Search query (e.g. "MacBook Pro M4 price Germany")
-        max_results: Maximum number of results to return (default: 5)
-
    Returns:
        Dict with 'query', 'search_url', 'results' (list of {title, url, snippet}),
        'result_count', 'hint'
@@ -285,14 +256,23 @@ def webscraper_search_hint(query: str, max_results: int = 5) -> Dict:
        results = []
        seen_urls: set = set()

-        # Brave Search result cards: each div.snippet contains title, URL, description
+        # Brave Search result cards: each div.snippet with a .result-wrapper is a web result.
+        # Skip video clusters, FAQ blocks, and LLM snippets (they have no .result-wrapper).
+        # Class names as of 2026-04 (updated from .snippet-title / .snippet-description):
+        #   title   → .search-snippet-title
+        #   url     → a.l1  (the primary result anchor, avoids favicon <a> tags)
+        #   snippet → .content.t-primary
        for card in soup.select('.snippet'):
            if len(results) >= max_results:
                break

-            title_el = card.select_one('.snippet-title')
-            url_el = card.select_one('a')
-            desc_el = card.select_one('.snippet-description')
+            # Skip non-web-result snippets (videos, FAQ, LLM answer blocks)
+            if not card.select_one('.result-wrapper'):
+                continue
+
+            title_el = card.select_one('.search-snippet-title')
+            url_el = card.select_one('a.l1')
+            desc_el = card.select_one('.content.t-primary')

            title = title_el.get_text(strip=True) if title_el else ""
            url = url_el['href'] if url_el and url_el.get('href') else ""
@@ -206,27 +206,39 @@ def test_sitemap_max_urls(mock_get, mock_sitemap_response):

 # --- webscraper_search_hint tests ---

+# Helper to build a Brave-style result card with the new 2026-04 class names.
+# Real result cards have a .result-wrapper; non-result blocks (videos, FAQ) do not.
+def _brave_card(href: str, title: str, snippet: str) -> str:
+    """Return a mock Brave Search .snippet card with .result-wrapper (web result).
+
+    Args:
+        href: Value placed in the href attribute of the a.l1 anchor.
+        title: Text placed inside the .search-snippet-title element.
+        snippet: Text placed inside the .content ... .t-primary element.
+
+    Returns:
+        HTML fragment for a single result card. The three arguments are
+        interpolated verbatim (no HTML escaping), so callers should pass
+        plain text / well-formed URLs.
+    """
+    return f"""
+    <div class="snippet svelte-jmfu5f">
+        <div class="result-wrapper svelte-1rq4ngz">
+            <div class="result-content svelte-1rq4ngz">
+                <a class="l1 svelte-14r20fy" href="{href}">
+                    <div class="search-snippet-title line-clamp-1 svelte-14r20fy">{title}</div>
+                </a>
+                <div class="generic-snippet svelte-1cwdgg3">
+                    <div class="content desktop-default-regular t-primary line-clamp-dynamic svelte-1cwdgg3">{snippet}</div>
+                </div>
+            </div>
+        </div>
+    </div>"""
+
+
@pytest.fixture
 def mock_brave_response():
-    """Mock Brave Search HTML response with result cards."""
+    """Mock Brave Search HTML response with result cards (2026-04 class names)."""
    mock_resp = MagicMock()
    mock_resp.status_code = 200
    mock_resp.text = """
-    <html><body>
-        <div class="snippet">
-            <a href="https://example.com/article1" class="snippet-title">Feynman on Electric Fields</a>
-            <div class="snippet-title">Feynman on Electric Fields</div>
-            <div class="snippet-description">Richard Feynman explains that all matter has an electric field.</div>
-        </div>
-        <div class="snippet">
-            <a href="https://example.com/article2" class="snippet-title">Electric Fields Everywhere</a>
-            <div class="snippet-title">Electric Fields Everywhere</div>
-            <div class="snippet-description">Everything in the universe is surrounded by electric fields.</div>
-        </div>
-        <div class="snippet">
-            <a href="javascript:void(0)" class="snippet-title">JS Junk</a>
-            <div class="snippet-title">JS Junk</div>
-            <div class="snippet-description">Should be filtered out.</div>
+    <html><body id="results">
+        """ + _brave_card("https://example.com/article1", "Feynman on Electric Fields",
+                          "Richard Feynman explains that all matter has an electric field.") + """
+        """ + _brave_card("https://example.com/article2", "Electric Fields Everywhere",
+                          "Everything in the universe is surrounded by electric fields.") + """
+        <!-- Non-result block (no .result-wrapper) — should be skipped -->
+        <div class="snippet svelte-jmfu5f standalone" id="faq">
+            <header class="desktop-heading-h4">FAQ</header>
        </div>
    </body></html>
    """
@@ -240,22 +252,10 @@ def mock_brave_response_dups():
    mock_resp = MagicMock()
    mock_resp.status_code = 200
    mock_resp.text = """
-    <html><body>
-        <div class="snippet">
-            <a href="https://example.com/dup">Dup Result A</a>
-            <div class="snippet-title">Dup Result A</div>
-            <div class="snippet-description">First occurrence.</div>
-        </div>
-        <div class="snippet">
-            <a href="https://example.com/dup">Dup Result B</a>
-            <div class="snippet-title">Dup Result B</div>
-            <div class="snippet-description">Second occurrence — same URL.</div>
-        </div>
-        <div class="snippet">
-            <a href="https://example.com/unique">Unique Result</a>
-            <div class="snippet-title">Unique Result</div>
-            <div class="snippet-description">Only once.</div>
-        </div>
+    <html><body id="results">
+        """ + _brave_card("https://example.com/dup", "Dup Result A", "First occurrence.") + """
+        """ + _brave_card("https://example.com/dup", "Dup Result B", "Second occurrence — same URL.") + """
+        """ + _brave_card("https://example.com/unique", "Unique Result", "Only once.") + """
    </body></html>
    """
    mock_resp.headers = {"content-type": "text/html"}
@@ -268,17 +268,9 @@ def mock_brave_response_empty_content():
    mock_resp = MagicMock()
    mock_resp.status_code = 200
    mock_resp.text = """
-    <html><body>
-        <div class="snippet">
-            <a href="https://example.com/ghost"></a>
-            <div class="snippet-title"></div>
-            <div class="snippet-description"></div>
-        </div>
-        <div class="snippet">
-            <a href="https://example.com/real">Real Result</a>
-            <div class="snippet-title">Real Result</div>
-            <div class="snippet-description">Has content.</div>
-        </div>
+    <html><body id="results">
+        """ + _brave_card("https://example.com/ghost", "", "") + """
+        """ + _brave_card("https://example.com/real", "Real Result", "Has content.") + """
    </body></html>
    """
    mock_resp.headers = {"content-type": "text/html"}