feat(mcp): update bigmind/mcp-image-gen/webscraper servers; add image-gen batch scripts

2026-06-11 09:02:09 +02:00
parent 0cb94122bf
commit bf721c1379
9 changed files with 2659 additions and 297 deletions
@@ -4,13 +4,14 @@ import httpx
 from bs4 import BeautifulSoup
 from html2text import html2text
 from urllib.parse import urljoin, quote_plus
-from typing import List, Dict, Tuple
+from typing import List, Dict, Tuple, Annotated
 import re
 import ssl
 import os
 import certifi
 from pathlib import Path
 from fastmcp import FastMCP
+from pydantic import Field

 mcp = FastMCP("webscraper")

@@ -54,13 +55,9 @@ def filter_junk_links(href: str) -> bool:
    return not any(re.match(pattern, href.lower()) for pattern in junk_patterns)

@mcp.tool()
-def webscraper_fetch(url: str, max_chars: int = 5000) -> str:
+def webscraper_fetch(url: Annotated[str, Field(description="The URL to fetch")], max_chars: Annotated[int, Field(description="Maximum characters in the markdown body (default: 5000)")] = 5000) -> str:
    """Fetch a URL and return title + markdown body + metadata.
-    
-    Args:
-        url: The URL to fetch
-        max_chars: Maximum characters in the markdown body (default: 5000)
-    
+
    Returns:
        Markdown string with title, body, and metadata
    """
@@ -78,13 +75,9 @@ def webscraper_fetch(url: str, max_chars: int = 5000) -> str:
        return f"# Error fetching {url}\n\n{str(e)}"

@mcp.tool()
-def webscraper_fetch_links(url: str, deduplicate: bool = True) -> List[str]:
+def webscraper_fetch_links(url: Annotated[str, Field(description="The URL to fetch")], deduplicate: Annotated[bool, Field(description="Remove duplicate links (default: True)")] = True) -> List[str]:
    """Fetch a URL and extract all href links.
-    
-    Args:
-        url: The URL to fetch
-        deduplicate: Remove duplicate links (default: True)
-    
+
    Returns:
        List of href URLs (duplicates removed when deduplicate is True)
    """
@@ -105,12 +98,9 @@ def webscraper_fetch_links(url: str, deduplicate: bool = True) -> List[str]:
        return [f"Error: {str(e)}"]

@mcp.tool()
-def webscraper_fetch_tables(url: str) -> List[str]:
+def webscraper_fetch_tables(url: Annotated[str, Field(description="The URL to fetch")]) -> List[str]:
    """Fetch a URL and extract all HTML tables as markdown.
-    
-    Args:
-        url: The URL to fetch
-    
+
    Returns:
        List of markdown tables
    """
@@ -125,13 +115,9 @@ def webscraper_fetch_tables(url: str) -> List[str]:
        return [f"Error: {str(e)}"]

@mcp.tool()
-def webscraper_fetch_all(url: str, max_chars: int = 5000) -> Dict:
+def webscraper_fetch_all(url: Annotated[str, Field(description="The URL to fetch")], max_chars: Annotated[int, Field(description="Maximum characters (default: 5000)")] = 5000) -> Dict:
    """Fetch everything: markdown + links + tables + meta.
-    
-    Args:
-        url: The URL to fetch
-        max_chars: Maximum characters (default: 5000)
-    
+
    Returns:
        Dict with 'markdown', 'links', 'tables', 'meta'
    """
@@ -181,13 +167,9 @@ def webscraper_fetch_all(url: str, max_chars: int = 5000) -> Dict:
        return {"error": str(e)}

@mcp.tool()
-def webscraper_fetch_section(url: str, selector: str) -> str:
+def webscraper_fetch_section(url: Annotated[str, Field(description="The URL to fetch")], selector: Annotated[str, Field(description="CSS selector (e.g., '.content')")]) -> str:
    """Fetch a URL and extract specific section by CSS selector.
-    
-    Args:
-        url: The URL to fetch
-        selector: CSS selector (e.g., '.content')
-    
+
    Returns:
        Markdown of the selected section
    """
@@ -210,12 +192,9 @@ def webscraper_fetch_section(url: str, selector: str) -> str:
        return f"Error: {str(e)}"

@mcp.tool()
-def webscraper_fetch_meta(url: str) -> Dict[str, str]:
+def webscraper_fetch_meta(url: Annotated[str, Field(description="The URL to fetch")]) -> Dict[str, str]:
    """Fetch a URL and return page metadata: title, description, OG tags.
-    
-    Args:
-        url: The URL to fetch
-    
+
    Returns:
        Dict of metadata
    """
@@ -238,13 +217,9 @@ def webscraper_fetch_meta(url: str) -> Dict[str, str]:
        return {"error": str(e)}

@mcp.tool()
-def webscraper_fetch_sitemap(url: str, max_urls: int = 100) -> List[str]:
+def webscraper_fetch_sitemap(url: Annotated[str, Field(description="Sitemap URL (or auto-discover)")], max_urls: Annotated[int, Field(description="Maximum URLs to return (default: 100)")] = 100) -> List[str]:
    """Fetch sitemap.xml and return list of URLs.
-    
-    Args:
-        url: Sitemap URL (or auto-discover)
-        max_urls: Maximum URLs to return (default: 100)
-    
+
    Returns:
        List of sitemap URLs
    """
@@ -263,17 +238,13 @@ def webscraper_fetch_sitemap(url: str, max_urls: int = 100) -> List[str]:
        return [f"Error: {str(e)}"]

@mcp.tool()
-def webscraper_search_hint(query: str, max_results: int = 5) -> Dict:
+def webscraper_search_hint(query: Annotated[str, Field(description="Search query (e.g. \"MacBook Pro M4 price Germany\")")], max_results: Annotated[int, Field(description="Maximum number of results to return (default: 5)")] = 5) -> Dict:
    """Search Brave Search and return top results as a scraping hint.

    Use this sparingly — once per research task — to get oriented before
    scraping individual pages. Returns top result URLs + snippets so you
    can decide which pages are worth scraping deeply.

-    Args:
-        query: Search query (e.g. "MacBook Pro M4 price Germany")
-        max_results: Maximum number of results to return (default: 5)
-
    Returns:
        Dict with 'query', 'search_url', 'results' (list of {title, url, snippet}),
        'result_count', 'hint'
@@ -285,14 +256,23 @@ def webscraper_search_hint(query: str, max_results: int = 5) -> Dict:
        results = []
        seen_urls: set = set()

-        # Brave Search result cards: each div.snippet contains title, URL, description
+        # Brave Search result cards: each div.snippet with a .result-wrapper is a web result.
+        # Skip video clusters, FAQ blocks, and LLM snippets (they have no .result-wrapper).
+        # Class names as of 2026-04 (updated from .snippet-title / .snippet-description):
+        #   title   → .search-snippet-title
+        #   url     → a.l1  (the primary result anchor, avoids favicon <a> tags)
+        #   snippet → .content.t-primary
        for card in soup.select('.snippet'):
            if len(results) >= max_results:
                break

-            title_el = card.select_one('.snippet-title')
-            url_el = card.select_one('a')
-            desc_el = card.select_one('.snippet-description')
+            # Skip non-web-result snippets (videos, FAQ, LLM answer blocks)
+            if not card.select_one('.result-wrapper'):
+                continue
+
+            title_el = card.select_one('.search-snippet-title')
+            url_el = card.select_one('a.l1')
+            desc_el = card.select_one('.content.t-primary')

            title = title_el.get_text(strip=True) if title_el else ""
            url = url_el['href'] if url_el and url_el.get('href') else ""