"""Comprehensive tests for webscraper server."""
import pytest
import httpx
from unittest.mock import MagicMock, patch
from src.server import (
webscraper_fetch, webscraper_fetch_links, webscraper_fetch_tables,
webscraper_fetch_all, webscraper_fetch_section, webscraper_fetch_meta,
webscraper_fetch_sitemap, webscraper_search_hint, clean_soup, filter_junk_links
)
@pytest.fixture
def mock_response():
    """Provide a stand-in for a successful ``httpx`` page response.

    Returns:
        MagicMock: a mock whose ``status_code`` is 200, whose ``text`` is the
        page body below, and whose ``headers`` declare ``text/html``.
    """
    # NOTE(review): the body below contains no HTML tags, yet tests in this
    # file assert on links, table cells and meta values -- confirm the markup
    # was not lost from this literal.
    page_body = """
Test Page
Header
Paragraph 1
Link 1
Junk Mail
Junk JS
Relative Link
Parent Relative
Selected content
"""
    # Keyword arguments to MagicMock are set as attributes on the new mock.
    return MagicMock(
        status_code=200,
        text=page_body,
        headers={"content-type": "text/html"},
    )
@pytest.fixture
def mock_sitemap_response():
    """Provide a stand-in for a successful ``httpx`` sitemap response.

    Returns:
        MagicMock: a mock whose ``status_code`` is 200 and whose ``text``
        lists three URLs, the last one being the sitemap's own address.
    """
    # NOTE(review): the body below has no XML markup around the URLs --
    # confirm the tags were not lost from this literal.
    sitemap_body = """
https://example.com/page1
https://example.com/page2
https://example.com/sitemap.xml
"""
    # Keyword arguments to MagicMock are set as attributes on the new mock.
    return MagicMock(status_code=200, text=sitemap_body)
@patch('httpx.get')
def test_webscraper_fetch(mock_get, mock_response):
    """Check the text produced by ``webscraper_fetch`` for the mocked page.

    The patched ``httpx.get`` returns ``mock_response``; the output must
    contain the title heading, a body paragraph and the source URL.
    """
    mock_get.return_value = mock_response

    output = webscraper_fetch("https://example.com", max_chars=100)

    for fragment in ("# Test Page", "Paragraph 1", "URL: https://example.com"):
        assert fragment in output
    # With max_chars=100 the output is expected to be truncated.
    assert len(output) < 500
@patch('httpx.get')
def test_webscraper_fetch_error(mock_get):
    """Check that a failed request is reported in the returned value."""
    # Every call to the patched httpx.get raises this request error.
    connection_failure = httpx.RequestError("Connection failed")
    mock_get.side_effect = connection_failure

    outcome = webscraper_fetch("https://fail.com")

    assert "Error fetching" in outcome
@patch('httpx.get')
def test_webscraper_fetch_links(mock_get, mock_response):
    """Check the link list returned by ``webscraper_fetch_links``.

    With ``deduplicate=True`` the result must be a list holding exactly the
    three absolute URLs below.
    """
    mock_get.return_value = mock_response
    expected_links = (
        "https://example.com/link1",
        "https://example.com/relative.html",
        "https://example.com/dir/page.html",
    )

    links = webscraper_fetch_links("https://example.com", deduplicate=True)

    assert isinstance(links, list)
    for link in expected_links:
        assert link in links
    # Nothing beyond the expected links may be present; presumably the
    # "Junk" entries of the fixture are filtered out -- verify in src.server.
    assert len(links) == len(expected_links)
@patch('httpx.get')
def test_webscraper_fetch_links_no_dedup(mock_get, mock_response):
    """Check the link count when ``deduplicate=False`` is requested."""
    mock_get.return_value = mock_response

    links = webscraper_fetch_links("https://example.com", deduplicate=False)

    # The count is still three, so the mocked page apparently carries no
    # repeated links; this does not exercise actual duplicates.
    assert len(links) == 3
@patch('httpx.get')
def test_webscraper_fetch_tables(mock_get, mock_response):
    """Check that ``webscraper_fetch_tables`` returns a list of tables.

    The first entry must contain both expected cell values.
    """
    mock_get.return_value = mock_response

    tables = webscraper_fetch_tables("https://example.com")

    assert isinstance(tables, list)
    first_table = tables[0]
    for cell in ("Cell1", "Cell2"):
        assert cell in first_table
@patch('httpx.get')
def test_webscraper_fetch_all(mock_get, mock_response):
    """Check that ``webscraper_fetch_all`` reports every expected section."""
    mock_get.return_value = mock_response

    combined = webscraper_fetch_all("https://example.com", max_chars=100)

    # Each of these names must appear in the combined result.
    for section in ("markdown", "links", "tables", "meta"):
        assert section in combined
@patch('httpx.get')
def test_webscraper_fetch_section(mock_get, mock_response):
    """Check that a matching CSS selector yields the selected content."""
    mock_get.return_value = mock_response
    selector = ".content"

    section_text = webscraper_fetch_section("https://example.com", selector)

    assert "Selected content" in section_text
@patch('httpx.get')
def test_webscraper_fetch_section_no_match(mock_get, mock_response):
    """Check the message returned when the selector matches nothing."""
    mock_get.return_value = mock_response
    selector = ".nonexistent"

    message = webscraper_fetch_section("https://example.com", selector)

    assert "No element found" in message
@patch('httpx.get')
def test_webscraper_fetch_meta(mock_get, mock_response):
    """Check the metadata mapping returned by ``webscraper_fetch_meta``."""
    mock_get.return_value = mock_response
    expected_meta = {
        "title": "Test Page",
        "description": "Test desc",
        "og:title": "OG Title",
    }

    meta = webscraper_fetch_meta("https://example.com")

    # Each key is looked up directly, so a missing key fails with KeyError.
    for key, value in expected_meta.items():
        assert meta[key] == value
@patch('httpx.get')
def test_webscraper_fetch_sitemap(mock_get, mock_sitemap_response):
    """Check that ``webscraper_fetch_sitemap`` honours ``max_urls``."""
    mock_get.return_value = mock_sitemap_response
    url_limit = 2

    urls = webscraper_fetch_sitemap(
        "https://example.com/sitemap.xml", max_urls=url_limit
    )

    assert isinstance(urls, list)
    assert "https://example.com/page1" in urls
    # The result length must equal the requested cap.
    assert len(urls) == url_limit
@patch('httpx.get')
def test_webscraper_fetch_sitemap_loop_protection(mock_get, mock_sitemap_response):
    """Check that the sitemap's own URL is absent from the result."""
    mock_get.return_value = mock_sitemap_response
    sitemap_url = "https://example.com/sitemap.xml"

    urls = webscraper_fetch_sitemap(sitemap_url)

    # The mocked body lists the sitemap itself; it must not be returned.
    assert sitemap_url not in urls
def test_clean_soup():
"""Test clean_soup helper."""
from bs4 import BeautifulSoup
soup = BeautifulSoup('Text
', 'lxml')
cleaned = clean_soup(soup)
assert '