merge: fix/webscraper/search-hint-quality → main

This commit is contained in:
Patrick Plate
2026-04-05 09:57:47 +02:00
2 changed files with 165 additions and 18 deletions
+44 -10
View File
@@ -3,7 +3,7 @@
import httpx import httpx
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
from html2text import html2text from html2text import html2text
from urllib.parse import urljoin from urllib.parse import urljoin, quote_plus
from typing import List, Dict, Tuple from typing import List, Dict, Tuple
import re import re
import ssl import ssl
@@ -275,15 +275,21 @@ def webscraper_search_hint(query: str, max_results: int = 5) -> Dict:
max_results: Maximum number of results to return (default: 5) max_results: Maximum number of results to return (default: 5)
Returns: Returns:
Dict with 'query', 'results' (list of {title, url, snippet}), 'hint' Dict with 'query', 'search_url', 'results' (list of {title, url, snippet}),
'result_count', 'hint'
""" """
search_url = f"https://search.brave.com/search?q={quote_plus(query)}&source=web"
try: try:
search_url = f"https://search.brave.com/search?q={query.replace(' ', '+')}&source=web"
_, soup = _fetch_page(search_url) _, soup = _fetch_page(search_url)
results = [] results = []
# Brave Search result cards: each <a> with class snippet contains title + description seen_urls: set = set()
for card in soup.select('.snippet')[:max_results]:
# Brave Search result cards: each div.snippet contains title, URL, description
for card in soup.select('.snippet'):
if len(results) >= max_results:
break
title_el = card.select_one('.snippet-title') title_el = card.select_one('.snippet-title')
url_el = card.select_one('a') url_el = card.select_one('a')
desc_el = card.select_one('.snippet-description') desc_el = card.select_one('.snippet-description')
@@ -292,20 +298,48 @@ def webscraper_search_hint(query: str, max_results: int = 5) -> Dict:
url = url_el['href'] if url_el and url_el.get('href') else "" url = url_el['href'] if url_el and url_el.get('href') else ""
snippet = desc_el.get_text(strip=True) if desc_el else "" snippet = desc_el.get_text(strip=True) if desc_el else ""
if url and url.startswith('http'): # Filter: must have a valid http(s) URL
if not url or not url.startswith('http'):
continue
# Filter: skip results with no useful content at all
if not title and not snippet:
continue
# Deduplicate by URL
if url in seen_urls:
continue
seen_urls.add(url)
results.append({"title": title, "url": url, "snippet": snippet}) results.append({"title": title, "url": url, "snippet": snippet})
hint = "; ".join( # Richer hint: title + url + first 120 chars of snippet for AI context
f"{r['title']}: {r['url']}" for r in results if results:
) if results else "No results found" hint_parts = []
for r in results:
part = f"{r['title']} ({r['url']})"
if r['snippet']:
part += f": {r['snippet'][:120]}"
hint_parts.append(part)
hint = " | ".join(hint_parts)
else:
hint = "No results found"
return { return {
"query": query, "query": query,
"search_url": search_url,
"results": results, "results": results,
"result_count": len(results),
"hint": hint, "hint": hint,
} }
except (httpx.RequestError, httpx.HTTPStatusError) as e: except (httpx.RequestError, httpx.HTTPStatusError) as e:
return {"query": query, "results": [], "hint": f"Error: {str(e)}"} return {
"query": query,
"search_url": search_url,
"results": [],
"result_count": 0,
"hint": f"Error: {str(e)}",
}
if __name__ == "__main__": if __name__ == "__main__":
+120 -7
View File
@@ -234,18 +234,92 @@ def mock_brave_response():
return mock_resp return mock_resp
@pytest.fixture
def mock_brave_response_dups():
    """Build a fake Brave Search HTTP response in which one URL is repeated.

    The page contains three ``div.snippet`` cards: two link to
    ``https://example.com/dup`` and one links to
    ``https://example.com/unique``.

    Returns:
        A ``MagicMock`` with ``status_code``, ``text`` and ``headers`` set.
    """
    # The markup is kept flush-left so the string content stays exactly as written.
    page_html = """
<html><body>
<div class="snippet">
<a href="https://example.com/dup">Dup Result A</a>
<div class="snippet-title">Dup Result A</div>
<div class="snippet-description">First occurrence.</div>
</div>
<div class="snippet">
<a href="https://example.com/dup">Dup Result B</a>
<div class="snippet-title">Dup Result B</div>
<div class="snippet-description">Second occurrence — same URL.</div>
</div>
<div class="snippet">
<a href="https://example.com/unique">Unique Result</a>
<div class="snippet-title">Unique Result</div>
<div class="snippet-description">Only once.</div>
</div>
</body></html>
"""
    response = MagicMock()
    response.headers = {"content-type": "text/html"}
    response.text = page_html
    response.status_code = 200
    return response
@pytest.fixture
def mock_brave_response_empty_content():
    """Build a fake Brave Search HTTP response with one content-free card.

    The first ``div.snippet`` card (``https://example.com/ghost``) has an
    empty link text, title and description; the second card
    (``https://example.com/real``) has all three filled in.

    Returns:
        A ``MagicMock`` with ``status_code``, ``text`` and ``headers`` set.
    """
    # The markup is kept flush-left so the string content stays exactly as written.
    page_html = """
<html><body>
<div class="snippet">
<a href="https://example.com/ghost"></a>
<div class="snippet-title"></div>
<div class="snippet-description"></div>
</div>
<div class="snippet">
<a href="https://example.com/real">Real Result</a>
<div class="snippet-title">Real Result</div>
<div class="snippet-description">Has content.</div>
</div>
</body></html>
"""
    response = MagicMock()
    response.headers = {"content-type": "text/html"}
    response.text = page_html
    response.status_code = 200
    return response
@patch('httpx.get') @patch('httpx.get')
def test_webscraper_search_hint_returns_structure(mock_get, mock_brave_response): def test_webscraper_search_hint_returns_structure(mock_get, mock_brave_response):
"""Test that search hint returns correct dict structure.""" """Test that search hint returns all required dict fields."""
mock_get.return_value = mock_brave_response mock_get.return_value = mock_brave_response
result = webscraper_search_hint("Feynman electric field") result = webscraper_search_hint("Feynman electric field")
assert isinstance(result, dict) assert isinstance(result, dict)
assert "query" in result assert "query" in result
assert "search_url" in result
assert "results" in result assert "results" in result
assert "result_count" in result
assert "hint" in result assert "hint" in result
assert result["query"] == "Feynman electric field" assert result["query"] == "Feynman electric field"
@patch('httpx.get')
def test_webscraper_search_hint_search_url_encoded(mock_get, mock_brave_response):
    """Check that reserved characters in the query are percent-encoded in search_url."""
    mock_get.return_value = mock_brave_response

    # '+', '&' and '%' are left untouched by a plain space-to-'+' replacement,
    # so their escaped forms only appear when real URL encoding is applied.
    encoded_url = webscraper_search_hint("C++ tutorial & guide 50%")["search_url"]

    # '+' -> %2B (hex digits compared case-insensitively)
    assert "c%2b%2b" in encoded_url.lower()
    # '&' -> %26, '%' -> %25
    for escape in ("%26", "%25"):
        assert escape in encoded_url
@patch('httpx.get')
def test_webscraper_search_hint_result_count(mock_get, mock_brave_response):
    """Check that the reported result_count equals the length of the results list."""
    mock_get.return_value = mock_brave_response

    payload = webscraper_search_hint("Feynman electric field")
    returned = payload["results"]

    assert payload["result_count"] == len(returned)
@patch('httpx.get') @patch('httpx.get')
def test_webscraper_search_hint_filters_non_http(mock_get, mock_brave_response): def test_webscraper_search_hint_filters_non_http(mock_get, mock_brave_response):
"""Test that javascript: URLs are excluded from results.""" """Test that javascript: URLs are excluded from results."""
@@ -262,25 +336,64 @@ def test_webscraper_search_hint_max_results(mock_get, mock_brave_response):
mock_get.return_value = mock_brave_response mock_get.return_value = mock_brave_response
result = webscraper_search_hint("Feynman electric field", max_results=1) result = webscraper_search_hint("Feynman electric field", max_results=1)
assert len(result["results"]) <= 1 assert len(result["results"]) <= 1
assert result["result_count"] <= 1
@patch('httpx.get')
def test_webscraper_search_hint_deduplicates_urls(mock_get, mock_brave_response_dups):
    """Test that duplicate URLs are deduplicated — only first occurrence kept.

    The fixture page lists ``https://example.com/dup`` twice followed by
    ``https://example.com/unique``; the result must contain each URL once,
    in page order, and the surviving duplicate must be the first card.
    """
    mock_get.return_value = mock_brave_response_dups
    result = webscraper_search_hint("test query")
    urls = [r["url"] for r in result["results"]]

    assert len(urls) == len(set(urls)), "Duplicate URLs found in results"
    # Cards are processed in document order, so the order of URLs is fixed.
    assert urls == ["https://example.com/dup", "https://example.com/unique"]
    # "First occurrence kept": the snippet must come from the first dup card,
    # not from the second card that shares the same URL.
    assert result["results"][0]["snippet"] == "First occurrence."
    # The count field has to reflect the deduplicated list.
    assert result["result_count"] == 2
@patch('httpx.get')
def test_webscraper_search_hint_filters_empty_content(mock_get, mock_brave_response_empty_content):
    """Test that cards with no title AND no snippet are excluded.

    The fixture page has a "ghost" card whose link text, title and
    description are all empty, followed by one card with real content.
    """
    mock_get.return_value = mock_brave_response_empty_content
    result = webscraper_search_hint("test query")
    urls = [r["url"] for r in result["results"]]

    # The ghost card must actually be dropped; asserting only that the real
    # result is present would still pass if the empty-content filter were removed.
    # NOTE(review): assumes the title is derived from the card's (empty) text
    # content, like the snippet is — confirm against the title extraction line.
    assert "https://example.com/ghost" not in urls
    assert urls == ["https://example.com/real"]
    assert result["result_count"] == 1
@patch('httpx.get') @patch('httpx.get')
def test_webscraper_search_hint_error(mock_get): def test_webscraper_search_hint_error(mock_get):
"""Test error handling in search hint.""" """Test error handling in search hint — returns all required fields."""
mock_get.side_effect = httpx.RequestError("Connection failed") mock_get.side_effect = httpx.RequestError("Connection failed")
result = webscraper_search_hint("something") result = webscraper_search_hint("something")
assert result["results"] == [] assert result["results"] == []
assert result["result_count"] == 0
assert "Error" in result["hint"] assert "Error" in result["hint"]
assert "search_url" in result
assert "query" in result
@patch('httpx.get') @patch('httpx.get')
def test_webscraper_search_hint_hint_string(mock_get, mock_brave_response): def test_webscraper_search_hint_hint_includes_snippet(mock_get, mock_brave_response):
"""Test that hint string is non-empty when results exist.""" """Test that the hint string includes snippet content, not just title+url."""
mock_get.return_value = mock_brave_response mock_get.return_value = mock_brave_response
result = webscraper_search_hint("Feynman electric field") result = webscraper_search_hint("Feynman electric field")
# hint should summarise results # hint should contain snippet text
assert len(result["hint"]) > 0 assert "electric field" in result["hint"].lower()
assert "No results found" not in result["hint"] assert "No results found" not in result["hint"]
assert len(result["hint"]) > 0
# Total: 23 tests covering all tools and edge cases @patch('httpx.get')
def test_webscraper_search_hint_hint_format(mock_get, mock_brave_response):
"""Test that hint uses pipe-separated format with URL in parens."""
mock_get.return_value = mock_brave_response
result = webscraper_search_hint("Feynman electric field")
# Format: "Title (url): snippet | Title2 (url2): snippet2"
assert "(" in result["hint"]
assert ")" in result["hint"]
# Total: 31 tests covering all tools and edge cases