fix(mcp-webscraper): improve search_hint quality — quote_plus, richer hint, dedup, result_count

- Use urllib.parse.quote_plus instead of str.replace(' ', '+') for correct
  URL encoding of special chars (&, %, +, #, =)
- Add search_url field to return dict so caller can verify/debug the query
- Add result_count field for quick summary without len(results)
- Deduplicate results by URL via seen_urls set
- Skip (filter out) cards whose title AND snippet are both empty
- Richer hint string: 'Title (url): snippet[:120]' pipe-separated
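- Max-results guard now applies after filtering: the loop breaks once
  max_results valid results are collected (previously the cards were sliced
  to max_results before filtering, so fewer results could be returned)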
- 5 new tests (23→28): URL encoding, result_count, dedup, empty filter, hint format
This commit is contained in:
Patrick Plate
2026-04-05 09:57:43 +02:00
parent c2dd262727
commit 62c3b67e66
2 changed files with 165 additions and 18 deletions
+45 -11
View File
@@ -3,7 +3,7 @@
import httpx
from bs4 import BeautifulSoup
from html2text import html2text
from urllib.parse import urljoin
from urllib.parse import urljoin, quote_plus
from typing import List, Dict, Tuple
import re
import ssl
@@ -275,15 +275,21 @@ def webscraper_search_hint(query: str, max_results: int = 5) -> Dict:
max_results: Maximum number of results to return (default: 5)
Returns:
Dict with 'query', 'results' (list of {title, url, snippet}), 'hint'
Dict with 'query', 'search_url', 'results' (list of {title, url, snippet}),
'result_count', 'hint'
"""
search_url = f"https://search.brave.com/search?q={quote_plus(query)}&source=web"
try:
search_url = f"https://search.brave.com/search?q={query.replace(' ', '+')}&source=web"
_, soup = _fetch_page(search_url)
results = []
# Brave Search result cards: each <a> with class snippet contains title + description
for card in soup.select('.snippet')[:max_results]:
seen_urls: set = set()
# Brave Search result cards: each div.snippet contains title, URL, description
for card in soup.select('.snippet'):
if len(results) >= max_results:
break
title_el = card.select_one('.snippet-title')
url_el = card.select_one('a')
desc_el = card.select_one('.snippet-description')
@@ -292,20 +298,48 @@ def webscraper_search_hint(query: str, max_results: int = 5) -> Dict:
url = url_el['href'] if url_el and url_el.get('href') else ""
snippet = desc_el.get_text(strip=True) if desc_el else ""
if url and url.startswith('http'):
results.append({"title": title, "url": url, "snippet": snippet})
# Filter: must have a valid http(s) URL
if not url or not url.startswith('http'):
continue
hint = "; ".join(
f"{r['title']}: {r['url']}" for r in results
) if results else "No results found"
# Filter: skip results with no useful content at all
if not title and not snippet:
continue
# Deduplicate by URL
if url in seen_urls:
continue
seen_urls.add(url)
results.append({"title": title, "url": url, "snippet": snippet})
# Richer hint: title + url + first 120 chars of snippet for AI context
if results:
hint_parts = []
for r in results:
part = f"{r['title']} ({r['url']})"
if r['snippet']:
part += f": {r['snippet'][:120]}"
hint_parts.append(part)
hint = " | ".join(hint_parts)
else:
hint = "No results found"
return {
"query": query,
"search_url": search_url,
"results": results,
"result_count": len(results),
"hint": hint,
}
except (httpx.RequestError, httpx.HTTPStatusError) as e:
return {"query": query, "results": [], "hint": f"Error: {str(e)}"}
return {
"query": query,
"search_url": search_url,
"results": [],
"result_count": 0,
"hint": f"Error: {str(e)}",
}
if __name__ == "__main__":