"""Comprehensive tests for webscraper server.""" import pytest import httpx from unittest.mock import MagicMock, patch from src.server import ( webscraper_fetch, webscraper_fetch_links, webscraper_fetch_tables, webscraper_fetch_all, webscraper_fetch_section, webscraper_fetch_meta, webscraper_fetch_sitemap, webscraper_search_hint, clean_soup, filter_junk_links ) @pytest.fixture def mock_response(): """Mock httpx response.""" mock_resp = MagicMock() mock_resp.status_code = 200 mock_resp.text = """ Test Page

Header

Paragraph 1

Link 1 Junk Mail Junk JS Relative Link Parent Relative
Cell1Cell2
Selected content
""" mock_resp.headers = {"content-type": "text/html"} return mock_resp @pytest.fixture def mock_sitemap_response(): """Mock sitemap response.""" mock_resp = MagicMock() mock_resp.status_code = 200 mock_resp.text = """ https://example.com/page1 https://example.com/page2 https://example.com/sitemap.xml """ return mock_resp @patch('httpx.get') def test_webscraper_fetch(mock_get, mock_response): """Test webscraper_fetch tool.""" mock_get.return_value = mock_response result = webscraper_fetch("https://example.com", max_chars=100) assert "# Test Page" in result assert "Paragraph 1" in result assert "URL: https://example.com" in result assert len(result) < 500 # Truncated @patch('httpx.get') def test_webscraper_fetch_error(mock_get): """Test error handling in webscraper_fetch.""" mock_get.side_effect = httpx.RequestError("Connection failed") result = webscraper_fetch("https://fail.com") assert "Error fetching" in result @patch('httpx.get') def test_webscraper_fetch_links(mock_get, mock_response): """Test webscraper_fetch_links tool.""" mock_get.return_value = mock_response result = webscraper_fetch_links("https://example.com", deduplicate=True) assert isinstance(result, list) assert "https://example.com/link1" in result assert "https://example.com/relative.html" in result assert "https://example.com/dir/page.html" in result assert len(result) == 3 # Valid links only @patch('httpx.get') def test_webscraper_fetch_links_no_dedup(mock_get, mock_response): """Test without deduplication.""" mock_get.return_value = mock_response result = webscraper_fetch_links("https://example.com", deduplicate=False) assert len(result) == 3 # Still three unique @patch('httpx.get') def test_webscraper_fetch_tables(mock_get, mock_response): """Test webscraper_fetch_tables tool.""" mock_get.return_value = mock_response result = webscraper_fetch_tables("https://example.com") assert isinstance(result, list) assert "Cell1" in result[0] assert "Cell2" in result[0] @patch('httpx.get') def test_webscraper_fetch_all(mock_get, mock_response): """Test webscraper_fetch_all tool.""" mock_get.return_value = mock_response result = webscraper_fetch_all("https://example.com", max_chars=100) assert "markdown" in result assert "links" in result assert "tables" in result assert "meta" in result @patch('httpx.get') def test_webscraper_fetch_section(mock_get, mock_response): """Test webscraper_fetch_section tool.""" mock_get.return_value = mock_response result = webscraper_fetch_section("https://example.com", ".content") assert "Selected content" in result @patch('httpx.get') def test_webscraper_fetch_section_no_match(mock_get, mock_response): """Test selector with no match.""" mock_get.return_value = mock_response result = webscraper_fetch_section("https://example.com", ".nonexistent") assert "No element found" in result @patch('httpx.get') def test_webscraper_fetch_meta(mock_get, mock_response): """Test webscraper_fetch_meta tool.""" mock_get.return_value = mock_response result = webscraper_fetch_meta("https://example.com") assert result["title"] == "Test Page" assert result["description"] == "Test desc" assert result["og:title"] == "OG Title" @patch('httpx.get') def test_webscraper_fetch_sitemap(mock_get, mock_sitemap_response): """Test webscraper_fetch_sitemap tool.""" mock_get.return_value = mock_sitemap_response result = webscraper_fetch_sitemap("https://example.com/sitemap.xml", max_urls=2) assert isinstance(result, list) assert "https://example.com/page1" in result assert len(result) == 2 # Limited by max_urls @patch('httpx.get') def test_webscraper_fetch_sitemap_loop_protection(mock_get, mock_sitemap_response): """Test sitemap loop protection.""" mock_get.return_value = mock_sitemap_response result = webscraper_fetch_sitemap("https://example.com/sitemap.xml") assert "https://example.com/sitemap.xml" not in result # Self-reference removed def test_clean_soup(): """Test clean_soup helper.""" from bs4 import BeautifulSoup soup = BeautifulSoup('

Text

', 'lxml') cleaned = clean_soup(soup) assert '