# Browser-like User-Agent sent with every request made by _fetch_page.
_HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36"
    )
}


def _fetch_page(url: str) -> Tuple[httpx.Response, BeautifulSoup]:
    """Shared fetch helper — returns response and parsed soup.

    Args:
        url: Absolute URL to GET.

    Returns:
        Tuple of the httpx response and its body parsed with the lxml parser.

    Raises:
        httpx.RequestError: If the request itself fails.
        httpx.HTTPStatusError: If the response status is 4xx/5xx.
    """
    response = httpx.get(url, timeout=10.0, verify=_SSL_CTX, headers=_HEADERS)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'lxml')
    return response, soup


@mcp.tool()
def webscraper_search_hint(query: str, max_results: int = 5) -> Dict:
    """Search Brave Search and return top results as a scraping hint.

    Use this sparingly — once per research task — to get oriented before
    scraping individual pages. Returns top result URLs + snippets so you
    can decide which pages are worth scraping deeply.

    Args:
        query: Search query (e.g. "MacBook Pro M4 price Germany"). The value
            is URL-encoded before being sent, so characters such as '&',
            '#', '+' or non-ASCII text are safe.
        max_results: Maximum number of results to return (default: 5).
            Zero or a negative value yields no results.

    Returns:
        Dict with 'query', 'results' (list of {title, url, snippet}), 'hint'.
        When the request fails, 'results' is empty and 'hint' holds
        "Error: <message>".
    """
    # Function-scope import so this block does not depend on the file's
    # top-level import list.
    from urllib.parse import urlencode

    # urlencode escapes reserved characters and encodes spaces as '+',
    # which matches the previous URL shape for plain queries.
    search_url = "https://search.brave.com/search?" + urlencode(
        {"q": query, "source": "web"}
    )

    # Only the network call can raise the httpx errors handled here.
    try:
        _, soup = _fetch_page(search_url)
    except (httpx.RequestError, httpx.HTTPStatusError) as e:
        return {"query": query, "results": [], "hint": f"Error: {str(e)}"}

    results = []
    # Brave Search result cards: each with class snippet contains title + description
    for card in soup.select('.snippet'):
        # Apply the limit to accepted results, so cards that are filtered
        # out below do not use up result slots.
        if len(results) >= max_results:
            break

        title_el = card.select_one('.snippet-title')
        url_el = card.select_one('a')
        desc_el = card.select_one('.snippet-description')

        title = title_el.get_text(strip=True) if title_el else ""
        url = url_el.get('href') or "" if url_el else ""
        snippet = desc_el.get_text(strip=True) if desc_el else ""

        # Skip relative links and non-http schemes such as javascript:.
        if url.startswith('http'):
            results.append({"title": title, "url": url, "snippet": snippet})

    if results:
        hint = "; ".join(f"{r['title']}: {r['url']}" for r in results)
    else:
        hint = "No results found"

    return {
        "query": query,
        "results": results,
        "hint": hint,
    }
# --- webscraper_search_hint tests ---

@pytest.fixture
def mock_brave_response():
    """Mock Brave Search HTML response with result cards.

    The page holds three `.snippet` cards: two with http(s) links and one
    with a `javascript:` link that the tool is expected to drop.
    """
    mock_resp = MagicMock()
    mock_resp.status_code = 200
    # NOTE(review): the markup below was reconstructed from the selectors
    # used by webscraper_search_hint ('.snippet', 'a', '.snippet-title',
    # '.snippet-description'); the example.com hrefs are placeholders —
    # confirm against the original fixture.
    mock_resp.text = """
    <html><body>
      <div class="snippet">
        <a href="https://example.com/feynman-electric-fields">Feynman on Electric Fields</a>
        <div class="snippet-title">Feynman on Electric Fields</div>
        <div class="snippet-description">Richard Feynman explains that all matter has an electric field.</div>
      </div>
      <div class="snippet">
        <a href="https://example.com/electric-fields-everywhere">Electric Fields Everywhere</a>
        <div class="snippet-title">Electric Fields Everywhere</div>
        <div class="snippet-description">Everything in the universe is surrounded by electric fields.</div>
      </div>
      <div class="snippet">
        <a href="javascript:void(0)">JS Junk</a>
        <div class="snippet-title">JS Junk</div>
        <div class="snippet-description">Should be filtered out.</div>
      </div>
    </body></html>
    """
    mock_resp.headers = {"content-type": "text/html"}
    return mock_resp


@patch('httpx.get')
def test_webscraper_search_hint_returns_structure(mock_get, mock_brave_response):
    """Test that search hint returns correct dict structure."""
    mock_get.return_value = mock_brave_response
    result = webscraper_search_hint("Feynman electric field")
    assert isinstance(result, dict)
    assert "query" in result
    assert "results" in result
    assert "hint" in result
    assert result["query"] == "Feynman electric field"


@patch('httpx.get')
def test_webscraper_search_hint_filters_non_http(mock_get, mock_brave_response):
    """Test that javascript: URLs are excluded from results."""
    mock_get.return_value = mock_brave_response
    result = webscraper_search_hint("Feynman electric field")
    urls = [r["url"] for r in result["results"]]
    # Guard against a vacuous pass: all() over an empty list is True.
    assert urls
    assert all(u.startswith("http") for u in urls)
    assert "javascript:void(0)" not in urls


@patch('httpx.get')
def test_webscraper_search_hint_max_results(mock_get, mock_brave_response):
    """Test max_results limits output."""
    mock_get.return_value = mock_brave_response
    result = webscraper_search_hint("Feynman electric field", max_results=1)
    # The first card is a valid result, so exactly one must come back.
    assert len(result["results"]) == 1


@patch('httpx.get')
def test_webscraper_search_hint_error(mock_get):
    """Test error handling in search hint."""
    mock_get.side_effect = httpx.RequestError("Connection failed")
    result = webscraper_search_hint("something")
    assert result["results"] == []
    assert "Error" in result["hint"]


@patch('httpx.get')
def test_webscraper_search_hint_hint_string(mock_get, mock_brave_response):
    """Test that hint string is non-empty when results exist."""
    mock_get.return_value = mock_brave_response
    result = webscraper_search_hint("Feynman electric field")
    # hint should summarise results
    assert len(result["hint"]) > 0
    assert "No results found" not in result["hint"]
    # Every returned URL is expected to appear in the summary string.
    for entry in result["results"]:
        assert entry["url"] in result["hint"]


# Total: 23 tests covering all tools and edge cases