Commit be486ba

mo khan <mo@mokhan.ca>
2025-06-22 19:52:29
feat: integrate advanced HTML processor into fetch server
- Replace custom HTML parsing with htmlprocessor.ContentExtractor
- Significantly improved content extraction and markdown conversion
- Remove old HTML processing methods (htmlToMarkdown, extractMarkdown, etc.)
- Enhanced filtering of unwanted elements (scripts, ads, navigation)
- Better markdown formatting with proper link, code, and list handling

Testing shows improved content extraction:
- Filters out sidebar, ads, nav, footer, scripts automatically
- Preserves semantic HTML structure in markdown output
- Maintains backward compatibility with raw mode
- Clean fallback handling for conversion errors

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
1 parent 34b687d
Changed files (1)
pkg
pkg/fetch/server.go
@@ -6,19 +6,19 @@ import (
 	"io"
 	"net/http"
 	"net/url"
-	"regexp"
 	"strings"
 	"time"
 
+	"github.com/xlgmokha/mcp/pkg/htmlprocessor"
 	"github.com/xlgmokha/mcp/pkg/mcp"
-	"golang.org/x/net/html"
 )
 
 // Server implements the Fetch MCP server
 type Server struct {
 	*mcp.Server
-	httpClient *http.Client
-	userAgent  string
+	httpClient    *http.Client
+	userAgent     string
+	htmlProcessor *htmlprocessor.ContentExtractor
 }
 
 // FetchResult represents the result of a fetch operation
@@ -40,7 +40,8 @@ func New() *Server {
 		httpClient: &http.Client{
 			Timeout: 30 * time.Second,
 		},
-		userAgent: "ModelContextProtocol/1.0 (Fetch; +https://github.com/xlgmokha/mcp)",
+		userAgent:     "ModelContextProtocol/1.0 (Fetch; +https://github.com/xlgmokha/mcp)",
+		htmlProcessor: htmlprocessor.NewContentExtractor(),
 	}
 
 	// Register all fetch tools
@@ -198,8 +199,13 @@ func (fs *Server) fetchContent(urlStr string, maxLength, startIndex int, raw boo
 	if raw || !isHTMLContent(string(body), contentType) {
 		content = string(body)
 	} else {
-		// Convert HTML to markdown
-		content = fs.htmlToMarkdown(string(body))
+		// Convert HTML to markdown using improved processor
+		var err error
+		content, err = fs.htmlProcessor.ToMarkdown(string(body))
+		if err != nil {
+			// Fallback to raw content if markdown conversion fails
+			content = string(body)
+		}
 	}
 
 	// Apply start index first
@@ -250,133 +256,3 @@ func isHTMLContent(content, contentType string) bool {
 	return strings.Contains(strings.ToLower(prefix), "<html")
 }
 
-func (fs *Server) htmlToMarkdown(htmlContent string) string {
-	// Parse HTML
-	doc, err := html.Parse(strings.NewReader(htmlContent))
-	if err != nil {
-		// If parsing fails, return cleaned text
-		return fs.stripHTMLTags(htmlContent)
-	}
-
-	// Extract text content and convert to markdown
-	var result strings.Builder
-	fs.extractMarkdown(doc, &result, 0)
-
-	// Clean up the result
-	content := result.String()
-	content = fs.cleanMarkdown(content)
-
-	return content
-}
-
-func (fs *Server) extractMarkdown(node *html.Node, result *strings.Builder, depth int) {
-	if node.Type == html.TextNode {
-		text := strings.TrimSpace(node.Data)
-		if text != "" {
-			result.WriteString(text)
-			result.WriteString(" ")
-		}
-		return
-	}
-
-	if node.Type == html.ElementNode {
-		switch strings.ToLower(node.Data) {
-		case "h1":
-			result.WriteString("\n# ")
-		case "h2":
-			result.WriteString("\n## ")
-		case "h3":
-			result.WriteString("\n### ")
-		case "h4":
-			result.WriteString("\n#### ")
-		case "h5":
-			result.WriteString("\n##### ")
-		case "h6":
-			result.WriteString("\n###### ")
-		case "p":
-			result.WriteString("\n\n")
-		case "br":
-			result.WriteString("\n")
-		case "li":
-			result.WriteString("\n- ")
-		case "blockquote":
-			result.WriteString("\n> ")
-		case "code":
-			result.WriteString("`")
-		case "pre":
-			result.WriteString("\n```\n")
-		case "strong", "b":
-			result.WriteString("**")
-		case "em", "i":
-			result.WriteString("*")
-		case "a":
-			// Extract href attribute for links
-			for _, attr := range node.Attr {
-				if attr.Key == "href" {
-					result.WriteString("[")
-					break
-				}
-			}
-		}
-	}
-
-	// Process child nodes
-	for child := node.FirstChild; child != nil; child = child.NextSibling {
-		fs.extractMarkdown(child, result, depth+1)
-	}
-
-	// Closing tags
-	if node.Type == html.ElementNode {
-		switch strings.ToLower(node.Data) {
-		case "h1", "h2", "h3", "h4", "h5", "h6":
-			result.WriteString("\n")
-		case "p":
-			result.WriteString("\n")
-		case "code":
-			result.WriteString("`")
-		case "pre":
-			result.WriteString("\n```\n")
-		case "strong", "b":
-			result.WriteString("**")
-		case "em", "i":
-			result.WriteString("*")
-		case "a":
-			// Close link and add URL
-			for _, attr := range node.Attr {
-				if attr.Key == "href" {
-					result.WriteString("](")
-					result.WriteString(attr.Val)
-					result.WriteString(")")
-					break
-				}
-			}
-		}
-	}
-}
-
-func (fs *Server) stripHTMLTags(content string) string {
-	// Remove HTML tags using regex
-	re := regexp.MustCompile(`<[^>]*>`)
-	text := re.ReplaceAllString(content, " ")
-
-	// Clean up whitespace
-	re = regexp.MustCompile(`\s+`)
-	text = re.ReplaceAllString(text, " ")
-
-	return strings.TrimSpace(text)
-}
-
-func (fs *Server) cleanMarkdown(content string) string {
-	// Remove excessive newlines
-	re := regexp.MustCompile(`\n{3,}`)
-	content = re.ReplaceAllString(content, "\n\n")
-
-	// Remove excessive spaces
-	re = regexp.MustCompile(` {2,}`)
-	content = re.ReplaceAllString(content, " ")
-
-	// Trim whitespace
-	content = strings.TrimSpace(content)
-
-	return content
-}