feat(webscraper): add Brave Search hint tool and User-Agent header

- Add webscraper_search_hint() tool using Brave Search as backend
  (no CAPTCHA/GDPR consent wall, works with plain httpx)
- Add User-Agent header to _fetch_page() — fixes 403 on Wikipedia,
  Feynman Lectures, and other sites that block headless requests
- Add 5 new tests for search hint (23 total, 90% coverage)

Brave Search URL: https://search.brave.com/search?q={query}&source=web
Use sparingly — once per research task as orientation, not in loops
This commit is contained in:
Patrick Plate
2026-04-05 09:37:30 +02:00
parent d5510f590e
commit 2ab847f51d
2 changed files with 136 additions and 3 deletions
+82 -2
View File
@@ -6,7 +6,7 @@ from unittest.mock import MagicMock, patch
from src.server import (
webscraper_fetch, webscraper_fetch_links, webscraper_fetch_tables,
webscraper_fetch_all, webscraper_fetch_section, webscraper_fetch_meta,
webscraper_fetch_sitemap, clean_soup, filter_junk_links
webscraper_fetch_sitemap, webscraper_search_hint, clean_soup, filter_junk_links
)
@pytest.fixture
@@ -203,4 +203,84 @@ def test_sitemap_max_urls(mock_get, mock_sitemap_response):
result = webscraper_fetch_sitemap("https://example.com/sitemap.xml", max_urls=1)
assert len(result) == 1
# Total: 18 tests covering all tools and edge cases
# --- webscraper_search_hint tests ---
@pytest.fixture
def mock_brave_response():
"""Mock Brave Search HTML response with result cards."""
mock_resp = MagicMock()
mock_resp.status_code = 200
mock_resp.text = """
<html><body>
<div class="snippet">
<a href="https://example.com/article1" class="snippet-title">Feynman on Electric Fields</a>
<div class="snippet-title">Feynman on Electric Fields</div>
<div class="snippet-description">Richard Feynman explains that all matter has an electric field.</div>
</div>
<div class="snippet">
<a href="https://example.com/article2" class="snippet-title">Electric Fields Everywhere</a>
<div class="snippet-title">Electric Fields Everywhere</div>
<div class="snippet-description">Everything in the universe is surrounded by electric fields.</div>
</div>
<div class="snippet">
<a href="javascript:void(0)" class="snippet-title">JS Junk</a>
<div class="snippet-title">JS Junk</div>
<div class="snippet-description">Should be filtered out.</div>
</div>
</body></html>
"""
mock_resp.headers = {"content-type": "text/html"}
return mock_resp
@patch('httpx.get')
def test_webscraper_search_hint_returns_structure(mock_get, mock_brave_response):
"""Test that search hint returns correct dict structure."""
mock_get.return_value = mock_brave_response
result = webscraper_search_hint("Feynman electric field")
assert isinstance(result, dict)
assert "query" in result
assert "results" in result
assert "hint" in result
assert result["query"] == "Feynman electric field"
@patch('httpx.get')
def test_webscraper_search_hint_filters_non_http(mock_get, mock_brave_response):
"""Test that javascript: URLs are excluded from results."""
mock_get.return_value = mock_brave_response
result = webscraper_search_hint("Feynman electric field")
urls = [r["url"] for r in result["results"]]
assert all(u.startswith("http") for u in urls)
assert "javascript:void(0)" not in urls
@patch('httpx.get')
def test_webscraper_search_hint_max_results(mock_get, mock_brave_response):
"""Test max_results limits output."""
mock_get.return_value = mock_brave_response
result = webscraper_search_hint("Feynman electric field", max_results=1)
assert len(result["results"]) <= 1
@patch('httpx.get')
def test_webscraper_search_hint_error(mock_get):
"""Test error handling in search hint."""
mock_get.side_effect = httpx.RequestError("Connection failed")
result = webscraper_search_hint("something")
assert result["results"] == []
assert "Error" in result["hint"]
@patch('httpx.get')
def test_webscraper_search_hint_hint_string(mock_get, mock_brave_response):
"""Test that hint string is non-empty when results exist."""
mock_get.return_value = mock_brave_response
result = webscraper_search_hint("Feynman electric field")
# hint should summarise results
assert len(result["hint"]) > 0
assert "No results found" not in result["hint"]
# Total: 23 tests covering all tools and edge cases