feat(webscraper): add Brave Search hint tool and User-Agent header

- Add webscraper_search_hint() tool using Brave Search as backend (no CAPTCHA/GDPR consent wall, works with plain httpx)
- Add User-Agent header to _fetch_page() — fixes 403 on Wikipedia, Feynman Lectures, and other sites that block headless requests
- Add 5 new tests for search hint (23 total, 90% coverage)

Brave Search URL: https://search.brave.com/search?q={query}&source=web
Use sparingly — once per research task as orientation, not in loops
Added new picture for bigmind page
2026-04-05 09:37:30 +02:00 · 2026-04-04 20:03:59 +02:00 · 2026-04-04 19:29:15 +02:00 · 2026-04-04 19:27:24 +02:00
16 changed files with 154 additions and 12 deletions
@@ -10,11 +10,10 @@
 			"alwaysAllow": [
 				"git_status",
 				"git_diff_unstaged",
-				"git_log",
-				"git_add",
-				"git_commit",
 				"git_branch",
-				"git_create_branch"
+				"git_create_branch",
+				"git_add",
+				"git_commit"
 			]
 		},
 		"filesystem": {
@@ -34,7 +33,8 @@
 				"src/server.py"
 			],
 			"alwaysAllow": [
-				"webscraper_fetch"
+				"webscraper_fetch",
+				"webscraper_fetch_links"
 			]
 		},
 		"gitea": {
@@ -54,8 +54,10 @@
 				"create_issue_comment",
 				"create_pull_request",
 				"get_repository",
-				"list_my_repositories"
-			]
+				"list_my_repositories",
+				"create_wiki_page"
+			],
+			"disabled": true
 		},
 		"playwright": {
 			"command": "npx",
@@ -82,7 +84,13 @@
 			"env": {
 				"COMFYUI_URL": "http://localhost:8188",
 				"IMAGE_OUTPUT_DIR": "/home/pplate/Pictures/mcp-generated"
-			}
+			},
+			"alwaysAllow": [
+				"list_available_models",
+				"get_generation_status",
+				"get_output_directory",
+				"generate_image"
+			]
 		}
 	}
 }
@@ -33,7 +33,8 @@ def _render_achievements(achievements: list) -> str:

        if a.get("image"):
            tier = a["id"].rsplit("_", 1)[-1]
-            visual_html = f'<div class="ach-image tier-{tier}">{lock_overlay}</div>'
+            img_url = _esc(a["image"])
+            visual_html = f'<div class="ach-image tier-{tier}" style="background-image: url({img_url});">{lock_overlay}</div>'
        else:
            visual_html = f'<div class="ach-icon">{a["icon"]}{lock_overlay}</div>'

@@ -28,9 +28,16 @@ def _build_ssl_context() -> ssl.SSLContext:

 _SSL_CTX = _build_ssl_context()

+_HEADERS = {
+    "User-Agent": (
+        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
+        "(KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36"
+    )
+}
+
 def _fetch_page(url: str) -> Tuple[httpx.Response, BeautifulSoup]:
    """Shared fetch helper — returns response and parsed soup."""
-    response = httpx.get(url, timeout=10.0, verify=_SSL_CTX)
+    response = httpx.get(url, timeout=10.0, verify=_SSL_CTX, headers=_HEADERS)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'lxml')
    return response, soup
@@ -255,5 +262,51 @@ def webscraper_fetch_sitemap(url: str, max_urls: int = 100) -> List[str]:
    except (httpx.RequestError, httpx.HTTPStatusError) as e:
        return [f"Error: {str(e)}"]

+@mcp.tool()
+def webscraper_search_hint(query: str, max_results: int = 5) -> Dict:
+    """Search Brave Search and return top results as a scraping hint.
+
+    Use this sparingly — once per research task — to get oriented before
+    scraping individual pages. Returns top result URLs + snippets so you
+    can decide which pages are worth scraping deeply.
+
+    Args:
+        query: Search query (e.g. "MacBook Pro M4 price Germany")
+        max_results: Maximum number of results to return (default: 5)
+
+    Returns:
+        Dict with 'query', 'results' (list of {title, url, snippet}), 'hint'
+    """
+    try:
+        search_url = f"https://search.brave.com/search?q={query.replace(' ', '+')}&source=web"
+        _, soup = _fetch_page(search_url)
+
+        results = []
+        # Brave Search result cards: each <a> with class snippet contains title + description
+        for card in soup.select('.snippet')[:max_results]:
+            title_el = card.select_one('.snippet-title')
+            url_el = card.select_one('a')
+            desc_el = card.select_one('.snippet-description')
+
+            title = title_el.get_text(strip=True) if title_el else ""
+            url = url_el['href'] if url_el and url_el.get('href') else ""
+            snippet = desc_el.get_text(strip=True) if desc_el else ""
+
+            if url and url.startswith('http'):
+                results.append({"title": title, "url": url, "snippet": snippet})
+
+        hint = "; ".join(
+            f"{r['title']}: {r['url']}" for r in results
+        ) if results else "No results found"
+
+        return {
+            "query": query,
+            "results": results,
+            "hint": hint,
+        }
+    except (httpx.RequestError, httpx.HTTPStatusError) as e:
+        return {"query": query, "results": [], "hint": f"Error: {str(e)}"}
+
+
 if __name__ == "__main__":
    mcp.run(transport="stdio")
@@ -6,7 +6,7 @@ from unittest.mock import MagicMock, patch
 from src.server import (
    webscraper_fetch, webscraper_fetch_links, webscraper_fetch_tables,
    webscraper_fetch_all, webscraper_fetch_section, webscraper_fetch_meta,
-    webscraper_fetch_sitemap, clean_soup, filter_junk_links
+    webscraper_fetch_sitemap, webscraper_search_hint, clean_soup, filter_junk_links
 )

@pytest.fixture
@@ -203,4 +203,84 @@ def test_sitemap_max_urls(mock_get, mock_sitemap_response):
    result = webscraper_fetch_sitemap("https://example.com/sitemap.xml", max_urls=1)
    assert len(result) == 1

-# Total: 18 tests covering all tools and edge cases
+
+# --- webscraper_search_hint tests ---
+
+@pytest.fixture
+def mock_brave_response():
+    """Mock Brave Search HTML response with result cards."""
+    mock_resp = MagicMock()
+    mock_resp.status_code = 200
+    mock_resp.text = """
+    <html><body>
+        <div class="snippet">
+            <a href="https://example.com/article1" class="snippet-title">Feynman on Electric Fields</a>
+            <div class="snippet-title">Feynman on Electric Fields</div>
+            <div class="snippet-description">Richard Feynman explains that all matter has an electric field.</div>
+        </div>
+        <div class="snippet">
+            <a href="https://example.com/article2" class="snippet-title">Electric Fields Everywhere</a>
+            <div class="snippet-title">Electric Fields Everywhere</div>
+            <div class="snippet-description">Everything in the universe is surrounded by electric fields.</div>
+        </div>
+        <div class="snippet">
+            <a href="javascript:void(0)" class="snippet-title">JS Junk</a>
+            <div class="snippet-title">JS Junk</div>
+            <div class="snippet-description">Should be filtered out.</div>
+        </div>
+    </body></html>
+    """
+    mock_resp.headers = {"content-type": "text/html"}
+    return mock_resp
+
+
+@patch('httpx.get')
+def test_webscraper_search_hint_returns_structure(mock_get, mock_brave_response):
+    """Test that search hint returns correct dict structure."""
+    mock_get.return_value = mock_brave_response
+    result = webscraper_search_hint("Feynman electric field")
+    assert isinstance(result, dict)
+    assert "query" in result
+    assert "results" in result
+    assert "hint" in result
+    assert result["query"] == "Feynman electric field"
+
+
+@patch('httpx.get')
+def test_webscraper_search_hint_filters_non_http(mock_get, mock_brave_response):
+    """Test that javascript: URLs are excluded from results."""
+    mock_get.return_value = mock_brave_response
+    result = webscraper_search_hint("Feynman electric field")
+    urls = [r["url"] for r in result["results"]]
+    assert all(u.startswith("http") for u in urls)
+    assert "javascript:void(0)" not in urls
+
+
+@patch('httpx.get')
+def test_webscraper_search_hint_max_results(mock_get, mock_brave_response):
+    """Test max_results limits output."""
+    mock_get.return_value = mock_brave_response
+    result = webscraper_search_hint("Feynman electric field", max_results=1)
+    assert len(result["results"]) <= 1
+
+
+@patch('httpx.get')
+def test_webscraper_search_hint_error(mock_get):
+    """Test error handling in search hint."""
+    mock_get.side_effect = httpx.RequestError("Connection failed")
+    result = webscraper_search_hint("something")
+    assert result["results"] == []
+    assert "Error" in result["hint"]
+
+
+@patch('httpx.get')
+def test_webscraper_search_hint_hint_string(mock_get, mock_brave_response):
+    """Test that hint string is non-empty when results exist."""
+    mock_get.return_value = mock_brave_response
+    result = webscraper_search_hint("Feynman electric field")
+    # hint should summarise results
+    assert len(result["hint"]) > 0
+    assert "No results found" not in result["hint"]
+
+
+# Total: 23 tests covering all tools and edge cases
Author	SHA1	Message	Date
Patrick Plate	2ab847f51d	feat(webscraper): add Brave Search hint tool and User-Agent header - Add webscraper_search_hint() tool using Brave Search as backend (no CAPTCHA/GDPR consent wall, works with plain httpx) - Add User-Agent header to _fetch_page() — fixes 403 on Wikipedia, Feynman Lectures, and other sites that block headless requests - Add 5 new tests for search hint (23 total, 90% coverage) Brave Search URL: https://search.brave.com/search?q={query}&source=web Use sparingly — once per research task as orientation, not in loops	2026-04-05 09:37:30 +02:00
Patrick Plate	d5510f590e	Added new picture for bigmind page	2026-04-04 20:03:59 +02:00
Patrick Plate	cf102e8b3e	fix(bigmind): render achievement card background images via inline style	2026-04-04 19:29:15 +02:00
Patrick Plate	13659fd414	fix(bigmind): add background-image inline style to achievement card ach-image divs. The .ach-image div had correct CSS dimensions (64x64) and background-size:cover but was missing the inline style="background-image: url(...)" — so the div rendered as an empty circle. Fixed by extracting img_url variable and applying it as style attribute in the f-string. All 39 achievement PNGs now load. 303/303 tests passing.	2026-04-04 19:27:24 +02:00