feat(mcp): update bigmind/mcp-image-gen/webscraper servers; add image-gen batch scripts
This commit is contained in:
@@ -4,13 +4,14 @@ import httpx
|
||||
from bs4 import BeautifulSoup
|
||||
from html2text import html2text
|
||||
from urllib.parse import urljoin, quote_plus
|
||||
from typing import List, Dict, Tuple
|
||||
from typing import List, Dict, Tuple, Annotated
|
||||
import re
|
||||
import ssl
|
||||
import os
|
||||
import certifi
|
||||
from pathlib import Path
|
||||
from fastmcp import FastMCP
|
||||
from pydantic import Field
|
||||
|
||||
mcp = FastMCP("webscraper")
|
||||
|
||||
@@ -54,13 +55,9 @@ def filter_junk_links(href: str) -> bool:
|
||||
return not any(re.match(pattern, href.lower()) for pattern in junk_patterns)
|
||||
|
||||
@mcp.tool()
|
||||
def webscraper_fetch(url: str, max_chars: int = 5000) -> str:
|
||||
def webscraper_fetch(url: Annotated[str, Field(description="The URL to fetch")], max_chars: Annotated[int, Field(description="Maximum characters in the markdown body (default: 5000)")] = 5000) -> str:
|
||||
"""Fetch a URL and return title + markdown body + metadata.
|
||||
|
||||
Args:
|
||||
url: The URL to fetch
|
||||
max_chars: Maximum characters in the markdown body (default: 5000)
|
||||
|
||||
|
||||
Returns:
|
||||
Markdown string with title, body, and metadata
|
||||
"""
|
||||
@@ -78,13 +75,9 @@ def webscraper_fetch(url: str, max_chars: int = 5000) -> str:
|
||||
return f"# Error fetching {url}\n\n{str(e)}"
|
||||
|
||||
@mcp.tool()
|
||||
def webscraper_fetch_links(url: str, deduplicate: bool = True) -> List[str]:
|
||||
def webscraper_fetch_links(url: Annotated[str, Field(description="The URL to fetch")], deduplicate: Annotated[bool, Field(description="Remove duplicate links (default: True)")] = True) -> List[str]:
|
||||
"""Fetch a URL and extract all href links.
|
||||
|
||||
Args:
|
||||
url: The URL to fetch
|
||||
deduplicate: Remove duplicate links (default: True)
|
||||
|
||||
|
||||
Returns:
|
||||
List of unique href URLs
|
||||
"""
|
||||
@@ -105,12 +98,9 @@ def webscraper_fetch_links(url: str, deduplicate: bool = True) -> List[str]:
|
||||
return [f"Error: {str(e)}"]
|
||||
|
||||
@mcp.tool()
|
||||
def webscraper_fetch_tables(url: str) -> List[str]:
|
||||
def webscraper_fetch_tables(url: Annotated[str, Field(description="The URL to fetch")]) -> List[str]:
|
||||
"""Fetch a URL and extract all HTML tables as markdown.
|
||||
|
||||
Args:
|
||||
url: The URL to fetch
|
||||
|
||||
|
||||
Returns:
|
||||
List of markdown tables
|
||||
"""
|
||||
@@ -125,13 +115,9 @@ def webscraper_fetch_tables(url: str) -> List[str]:
|
||||
return [f"Error: {str(e)}"]
|
||||
|
||||
@mcp.tool()
|
||||
def webscraper_fetch_all(url: str, max_chars: int = 5000) -> Dict:
|
||||
def webscraper_fetch_all(url: Annotated[str, Field(description="The URL to fetch")], max_chars: Annotated[int, Field(description="Maximum characters (default: 5000)")] = 5000) -> Dict:
|
||||
"""Fetch everything: markdown + links + tables + meta.
|
||||
|
||||
Args:
|
||||
url: The URL to fetch
|
||||
max_chars: Maximum characters (default: 5000)
|
||||
|
||||
|
||||
Returns:
|
||||
Dict with 'markdown', 'links', 'tables', 'meta'
|
||||
"""
|
||||
@@ -181,13 +167,9 @@ def webscraper_fetch_all(url: str, max_chars: int = 5000) -> Dict:
|
||||
return {"error": str(e)}
|
||||
|
||||
@mcp.tool()
|
||||
def webscraper_fetch_section(url: str, selector: str) -> str:
|
||||
def webscraper_fetch_section(url: Annotated[str, Field(description="The URL to fetch")], selector: Annotated[str, Field(description="CSS selector (e.g., '.content')")]) -> str:
|
||||
"""Fetch a URL and extract specific section by CSS selector.
|
||||
|
||||
Args:
|
||||
url: The URL to fetch
|
||||
selector: CSS selector (e.g., '.content')
|
||||
|
||||
|
||||
Returns:
|
||||
Markdown of the selected section
|
||||
"""
|
||||
@@ -210,12 +192,9 @@ def webscraper_fetch_section(url: str, selector: str) -> str:
|
||||
return f"Error: {str(e)}"
|
||||
|
||||
@mcp.tool()
|
||||
def webscraper_fetch_meta(url: str) -> Dict[str, str]:
|
||||
def webscraper_fetch_meta(url: Annotated[str, Field(description="The URL to fetch")]) -> Dict[str, str]:
|
||||
"""Fetch a URL and return page metadata: title, description, OG tags.
|
||||
|
||||
Args:
|
||||
url: The URL to fetch
|
||||
|
||||
|
||||
Returns:
|
||||
Dict of metadata
|
||||
"""
|
||||
@@ -238,13 +217,9 @@ def webscraper_fetch_meta(url: str) -> Dict[str, str]:
|
||||
return {"error": str(e)}
|
||||
|
||||
@mcp.tool()
|
||||
def webscraper_fetch_sitemap(url: str, max_urls: int = 100) -> List[str]:
|
||||
def webscraper_fetch_sitemap(url: Annotated[str, Field(description="Sitemap URL (or auto-discover)")], max_urls: Annotated[int, Field(description="Maximum URLs to return (default: 100)")] = 100) -> List[str]:
|
||||
"""Fetch sitemap.xml and return list of URLs.
|
||||
|
||||
Args:
|
||||
url: Sitemap URL (or auto-discover)
|
||||
max_urls: Maximum URLs to return (default: 100)
|
||||
|
||||
|
||||
Returns:
|
||||
List of sitemap URLs
|
||||
"""
|
||||
@@ -263,17 +238,13 @@ def webscraper_fetch_sitemap(url: str, max_urls: int = 100) -> List[str]:
|
||||
return [f"Error: {str(e)}"]
|
||||
|
||||
@mcp.tool()
|
||||
def webscraper_search_hint(query: str, max_results: int = 5) -> Dict:
|
||||
def webscraper_search_hint(query: Annotated[str, Field(description="Search query (e.g. \"MacBook Pro M4 price Germany\")")], max_results: Annotated[int, Field(description="Maximum number of results to return (default: 5)")] = 5) -> Dict:
|
||||
"""Search Brave Search and return top results as a scraping hint.
|
||||
|
||||
Use this sparingly — once per research task — to get oriented before
|
||||
scraping individual pages. Returns top result URLs + snippets so you
|
||||
can decide which pages are worth scraping deeply.
|
||||
|
||||
Args:
|
||||
query: Search query (e.g. "MacBook Pro M4 price Germany")
|
||||
max_results: Maximum number of results to return (default: 5)
|
||||
|
||||
Returns:
|
||||
Dict with 'query', 'search_url', 'results' (list of {title, url, snippet}),
|
||||
'result_count', 'hint'
|
||||
@@ -285,14 +256,23 @@ def webscraper_search_hint(query: str, max_results: int = 5) -> Dict:
|
||||
results = []
|
||||
seen_urls: set = set()
|
||||
|
||||
# Brave Search result cards: each div.snippet contains title, URL, description
|
||||
# Brave Search result cards: each div.snippet with a .result-wrapper is a web result.
|
||||
# Skip video clusters, FAQ blocks, and LLM snippets (they have no .result-wrapper).
|
||||
# Class names as of 2026-04 (updated from .snippet-title / .snippet-description):
|
||||
# title → .search-snippet-title
|
||||
# url → a.l1 (the primary result anchor, avoids favicon <a> tags)
|
||||
# snippet → .content.t-primary
|
||||
for card in soup.select('.snippet'):
|
||||
if len(results) >= max_results:
|
||||
break
|
||||
|
||||
title_el = card.select_one('.snippet-title')
|
||||
url_el = card.select_one('a')
|
||||
desc_el = card.select_one('.snippet-description')
|
||||
# Skip non-web-result snippets (videos, FAQ, LLM answer blocks)
|
||||
if not card.select_one('.result-wrapper'):
|
||||
continue
|
||||
|
||||
title_el = card.select_one('.search-snippet-title')
|
||||
url_el = card.select_one('a.l1')
|
||||
desc_el = card.select_one('.content.t-primary')
|
||||
|
||||
title = title_el.get_text(strip=True) if title_el else ""
|
||||
url = url_el['href'] if url_el and url_el.get('href') else ""
|
||||
|
||||
Reference in New Issue
Block a user