chore: reorganize into polyglot monorepo (workshop)
- Move bigmind/ -> mcp/bigmind/
- Move webscraper/ -> mcp/webscraper/
- Move mss-failsafe/ -> java/mss-failsafe/
- Move Wellmann-Shop/ -> java/wellmann-shop/ (normalize to kebab-case)
- Add .roo/ IDE config files to tracking
- Add plans/REPO_STRATEGY.md (monorepo strategy document)
- Expand .gitignore: Java/Maven, Node/TS, coverage, uv.lock
- Rewrite README.md as navigation index
- Update .roo/mcp.json webscraper path to mcp/webscraper/
This commit is contained in:
@@ -0,0 +1,205 @@
|
||||
"""Comprehensive tests for webscraper server."""
|
||||
|
||||
import pytest
|
||||
import httpx
|
||||
from unittest.mock import MagicMock, patch
|
||||
from src.server import (
|
||||
webscraper_fetch, webscraper_fetch_links, webscraper_fetch_tables,
|
||||
webscraper_fetch_all, webscraper_fetch_section, webscraper_fetch_meta,
|
||||
webscraper_fetch_sitemap, clean_soup, filter_junk_links
|
||||
)
|
||||
|
||||
@pytest.fixture
def mock_response():
    """Provide a stand-in HTTP response carrying a small HTML page.

    The page contains a title, description and Open Graph meta tags, a
    heading, a paragraph, five anchors (one absolute, one ``mailto:``, one
    ``javascript:`` and two relative), a single-row table and a
    ``div.content`` element. The response reports status 200 and an HTML
    content type.
    """
    page_html = """
<html>
<head><title>Test Page</title><meta name="description" content="Test desc">
<meta property="og:title" content="OG Title">
<meta property="og:description" content="OG Desc">
</head>
<body>
<h1>Header</h1>
<p>Paragraph 1</p>
<a href="https://example.com/link1">Link 1</a>
<a href="mailto:foo@bar.com">Junk Mail</a>
<a href="javascript:alert()">Junk JS</a>
<a href="relative.html">Relative Link</a>
<a href="../dir/page.html">Parent Relative</a>
<table><tr><td>Cell1</td><td>Cell2</td></tr></table>
<div class="content">Selected content</div>
</body>
</html>
"""
    response = MagicMock()
    response.status_code = 200
    response.headers = {"content-type": "text/html"}
    response.text = page_html
    return response
|
||||
|
||||
@pytest.fixture
def mock_sitemap_response():
    """Provide a stand-in HTTP response carrying a three-entry XML sitemap.

    The sitemap lists ``page1``, ``page2`` and a self-referencing
    ``sitemap.xml`` entry, all under ``https://example.com``. The response
    reports status 200.
    """
    mock_resp = MagicMock()
    mock_resp.status_code = 200
    # The XML declaration must be the very first thing in the document;
    # a leading newline before it makes strict XML parsers reject the text.
    mock_resp.text = (
        '<?xml version="1.0" encoding="UTF-8"?>\n'
        '<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">\n'
        '<url><loc>https://example.com/page1</loc></url>\n'
        '<url><loc>https://example.com/page2</loc></url>\n'
        '<url><loc>https://example.com/sitemap.xml</loc></url>\n'
        '</urlset>\n'
    )
    return mock_resp
|
||||
|
||||
@patch('httpx.get')
def test_webscraper_fetch(mock_get, mock_response):
    """Check the output of webscraper_fetch for the sample page.

    The output must contain the title as a heading, the paragraph text and
    the source URL line, and must stay under 500 characters when
    ``max_chars=100`` is requested.
    """
    mock_get.return_value = mock_response
    output = webscraper_fetch("https://example.com", max_chars=100)
    for expected in ("# Test Page", "Paragraph 1", "URL: https://example.com"):
        assert expected in output
    assert len(output) < 500  # Truncated
|
||||
|
||||
@patch('httpx.get')
def test_webscraper_fetch_error(mock_get):
    """Check that a transport failure is reported in the returned text."""
    # Make the patched httpx.get raise instead of returning a response.
    mock_get.side_effect = httpx.RequestError("Connection failed")
    output = webscraper_fetch("https://fail.com")
    assert "Error fetching" in output
|
||||
|
||||
@patch('httpx.get')
def test_webscraper_fetch_links(mock_get, mock_response):
    """Check link extraction with deduplication enabled.

    Expects a list holding exactly the absolute link plus the two relative
    links resolved against ``https://example.com``; the ``mailto:`` and
    ``javascript:`` anchors must not be counted.
    """
    mock_get.return_value = mock_response
    links = webscraper_fetch_links("https://example.com", deduplicate=True)
    assert isinstance(links, list)
    expected_links = (
        "https://example.com/link1",
        "https://example.com/relative.html",
        "https://example.com/dir/page.html",
    )
    for url in expected_links:
        assert url in links
    assert len(links) == 3  # Valid links only
|
||||
|
||||
@patch('httpx.get')
def test_webscraper_fetch_links_no_dedup(mock_get, mock_response):
    """Check link extraction with deduplication disabled.

    The sample page has no repeated links, so three results are still
    expected.
    """
    mock_get.return_value = mock_response
    links = webscraper_fetch_links("https://example.com", deduplicate=False)
    assert len(links) == 3  # Still three unique
|
||||
|
||||
@patch('httpx.get')
def test_webscraper_fetch_tables(mock_get, mock_response):
    """Check that the first extracted table contains both sample cells."""
    mock_get.return_value = mock_response
    tables = webscraper_fetch_tables("https://example.com")
    assert isinstance(tables, list)
    first_table = tables[0]
    for cell in ("Cell1", "Cell2"):
        assert cell in first_table
|
||||
|
||||
@patch('httpx.get')
def test_webscraper_fetch_all(mock_get, mock_response):
    """Check that webscraper_fetch_all exposes every expected section key."""
    mock_get.return_value = mock_response
    bundle = webscraper_fetch_all("https://example.com", max_chars=100)
    for key in ("markdown", "links", "tables", "meta"):
        assert key in bundle
|
||||
|
||||
@patch('httpx.get')
def test_webscraper_fetch_section(mock_get, mock_response):
    """Check that the ``.content`` selector yields the matching div's text."""
    mock_get.return_value = mock_response
    section = webscraper_fetch_section("https://example.com", ".content")
    assert "Selected content" in section
|
||||
|
||||
@patch('httpx.get')
def test_webscraper_fetch_section_no_match(mock_get, mock_response):
    """Check the message returned when the selector matches nothing."""
    mock_get.return_value = mock_response
    section = webscraper_fetch_section("https://example.com", ".nonexistent")
    assert "No element found" in section
|
||||
|
||||
@patch('httpx.get')
def test_webscraper_fetch_meta(mock_get, mock_response):
    """Check the title, description and og:title extracted from the page."""
    mock_get.return_value = mock_response
    meta = webscraper_fetch_meta("https://example.com")
    expected = {
        "title": "Test Page",
        "description": "Test desc",
        "og:title": "OG Title",
    }
    for key, value in expected.items():
        assert meta[key] == value
|
||||
|
||||
@patch('httpx.get')
def test_webscraper_fetch_sitemap(mock_get, mock_sitemap_response):
    """Check sitemap parsing with ``max_urls=2``.

    Expects a list of exactly two URLs that includes the first page entry.
    """
    mock_get.return_value = mock_sitemap_response
    urls = webscraper_fetch_sitemap("https://example.com/sitemap.xml", max_urls=2)
    assert isinstance(urls, list)
    assert "https://example.com/page1" in urls
    assert len(urls) == 2  # Limited by max_urls
|
||||
|
||||
@patch('httpx.get')
def test_webscraper_fetch_sitemap_loop_protection(mock_get, mock_sitemap_response):
    """Check that the sitemap's own URL is absent from the returned URLs."""
    sitemap_url = "https://example.com/sitemap.xml"
    mock_get.return_value = mock_sitemap_response
    urls = webscraper_fetch_sitemap(sitemap_url)
    assert sitemap_url not in urls  # Self-reference removed
|
||||
|
||||
def test_clean_soup():
    """Check that clean_soup drops the script tag and keeps the paragraph."""
    from bs4 import BeautifulSoup

    markup = '<html><script>alert()</script><p>Text</p></html>'
    rendered = str(clean_soup(BeautifulSoup(markup, 'lxml')))
    assert '<script>' not in rendered
    assert '<p>Text</p>' in rendered
|
||||
|
||||
def test_filter_junk_links():
    """Check that filter_junk_links accepts http(s) links and rejects junk.

    An ``https://`` URL must be kept; ``mailto:`` and ``javascript:`` links
    must be rejected.
    """
    # Assert on truthiness rather than comparing to True/False with ``==``.
    assert filter_junk_links("https://example.com")
    assert not filter_junk_links("mailto:foo@bar.com")
    assert not filter_junk_links("javascript:alert()")
|
||||
|
||||
@patch('httpx.get')
def test_word_count_before_truncation(mock_get, mock_response):
    """Check that a very small ``max_chars`` produces a truncation marker.

    Regression guard for an earlier word-count-after-truncation bug.
    """
    mock_get.return_value = mock_response
    output = webscraper_fetch("https://example.com", max_chars=10)
    # Only the presence of the ellipsis is checked here; the character-count
    # comparison itself lives in the implementation.
    assert "..." in output  # Truncated
|
||||
|
||||
# Additional edge cases
|
||||
@patch('httpx.get')
def test_empty_page(mock_get):
    """Check that an empty 200 response is reported with a "No Title" marker."""
    # MagicMock keyword arguments become attributes on the mock.
    mock_get.return_value = MagicMock(status_code=200, text="")
    output = webscraper_fetch("https://empty.com")
    assert "No Title" in output
|
||||
|
||||
@patch('httpx.get')
def test_404(mock_get):
    """Check that an HTTP 404 failure is reported in the returned text.

    The patched ``httpx.get`` raises ``httpx.HTTPStatusError`` carrying a
    mocked 404 response; the output must mention both the fetch error and
    the status code.
    """
    mock_resp = MagicMock()
    mock_resp.status_code = 404
    mock_resp.text = "Not Found"
    # HTTPStatusError takes ``request`` and ``response`` as required
    # keyword-only arguments; omitting ``request`` raises TypeError here,
    # before the code under test is ever called.
    mock_get.side_effect = httpx.HTTPStatusError(
        "Client Error", request=MagicMock(), response=mock_resp
    )
    result = webscraper_fetch("https://notfound.com")
    assert "Error fetching" in result
    assert "404" in result
|
||||
|
||||
@patch('httpx.get')
def test_invalid_selector(mock_get, mock_response):
    """Placeholder for invalid CSS selector handling.

    No assertion is implemented yet, so the test is reported as skipped
    instead of silently passing.
    """
    mock_get.return_value = mock_response
    # NOTE(review): the original comment assumed select_one returns None for
    # an invalid selector. Beautiful Soup's selector backend may instead raise
    # on malformed selector syntax - confirm how the server handles that
    # before writing the real assertion.
    pytest.skip("invalid CSS selector handling is not exercised yet")
|
||||
|
||||
@patch('httpx.get')
def test_sitemap_max_urls(mock_get, mock_sitemap_response):
    """Check that ``max_urls=1`` limits the sitemap result to one URL."""
    mock_get.return_value = mock_sitemap_response
    urls = webscraper_fetch_sitemap("https://example.com/sitemap.xml", max_urls=1)
    assert len(urls) == 1
|
||||
|
||||
# Total: 18 tests covering all tools and edge cases
|
||||
Reference in New Issue
Block a user