Commit be486ba
Changed files (1)
pkg
fetch
pkg/fetch/server.go
@@ -6,19 +6,19 @@ import (
"io"
"net/http"
"net/url"
- "regexp"
"strings"
"time"
+ "github.com/xlgmokha/mcp/pkg/htmlprocessor"
"github.com/xlgmokha/mcp/pkg/mcp"
- "golang.org/x/net/html"
)
// Server implements the Fetch MCP server
type Server struct {
*mcp.Server
- httpClient *http.Client
- userAgent string
+ httpClient *http.Client
+ userAgent string
+ htmlProcessor *htmlprocessor.ContentExtractor
}
// FetchResult represents the result of a fetch operation
@@ -40,7 +40,8 @@ func New() *Server {
httpClient: &http.Client{
Timeout: 30 * time.Second,
},
- userAgent: "ModelContextProtocol/1.0 (Fetch; +https://github.com/xlgmokha/mcp)",
+ userAgent: "ModelContextProtocol/1.0 (Fetch; +https://github.com/xlgmokha/mcp)",
+ htmlProcessor: htmlprocessor.NewContentExtractor(),
}
// Register all fetch tools
@@ -198,8 +199,13 @@ func (fs *Server) fetchContent(urlStr string, maxLength, startIndex int, raw boo
if raw || !isHTMLContent(string(body), contentType) {
content = string(body)
} else {
- // Convert HTML to markdown
- content = fs.htmlToMarkdown(string(body))
+ // Convert HTML to markdown using improved processor
+ var err error
+ content, err = fs.htmlProcessor.ToMarkdown(string(body))
+ if err != nil {
+ // Fallback to raw content if markdown conversion fails
+ content = string(body)
+ }
}
// Apply start index first
@@ -250,133 +256,3 @@ func isHTMLContent(content, contentType string) bool {
return strings.Contains(strings.ToLower(prefix), "<html")
}
-func (fs *Server) htmlToMarkdown(htmlContent string) string {
- // Parse HTML
- doc, err := html.Parse(strings.NewReader(htmlContent))
- if err != nil {
- // If parsing fails, return cleaned text
- return fs.stripHTMLTags(htmlContent)
- }
-
- // Extract text content and convert to markdown
- var result strings.Builder
- fs.extractMarkdown(doc, &result, 0)
-
- // Clean up the result
- content := result.String()
- content = fs.cleanMarkdown(content)
-
- return content
-}
-
-func (fs *Server) extractMarkdown(node *html.Node, result *strings.Builder, depth int) {
- if node.Type == html.TextNode {
- text := strings.TrimSpace(node.Data)
- if text != "" {
- result.WriteString(text)
- result.WriteString(" ")
- }
- return
- }
-
- if node.Type == html.ElementNode {
- switch strings.ToLower(node.Data) {
- case "h1":
- result.WriteString("\n# ")
- case "h2":
- result.WriteString("\n## ")
- case "h3":
- result.WriteString("\n### ")
- case "h4":
- result.WriteString("\n#### ")
- case "h5":
- result.WriteString("\n##### ")
- case "h6":
- result.WriteString("\n###### ")
- case "p":
- result.WriteString("\n\n")
- case "br":
- result.WriteString("\n")
- case "li":
- result.WriteString("\n- ")
- case "blockquote":
- result.WriteString("\n> ")
- case "code":
- result.WriteString("`")
- case "pre":
- result.WriteString("\n```\n")
- case "strong", "b":
- result.WriteString("**")
- case "em", "i":
- result.WriteString("*")
- case "a":
- // Extract href attribute for links
- for _, attr := range node.Attr {
- if attr.Key == "href" {
- result.WriteString("[")
- break
- }
- }
- }
- }
-
- // Process child nodes
- for child := node.FirstChild; child != nil; child = child.NextSibling {
- fs.extractMarkdown(child, result, depth+1)
- }
-
- // Closing tags
- if node.Type == html.ElementNode {
- switch strings.ToLower(node.Data) {
- case "h1", "h2", "h3", "h4", "h5", "h6":
- result.WriteString("\n")
- case "p":
- result.WriteString("\n")
- case "code":
- result.WriteString("`")
- case "pre":
- result.WriteString("\n```\n")
- case "strong", "b":
- result.WriteString("**")
- case "em", "i":
- result.WriteString("*")
- case "a":
- // Close link and add URL
- for _, attr := range node.Attr {
- if attr.Key == "href" {
- result.WriteString("](")
- result.WriteString(attr.Val)
- result.WriteString(")")
- break
- }
- }
- }
- }
-}
-
-func (fs *Server) stripHTMLTags(content string) string {
- // Remove HTML tags using regex
- re := regexp.MustCompile(`<[^>]*>`)
- text := re.ReplaceAllString(content, " ")
-
- // Clean up whitespace
- re = regexp.MustCompile(`\s+`)
- text = re.ReplaceAllString(text, " ")
-
- return strings.TrimSpace(text)
-}
-
-func (fs *Server) cleanMarkdown(content string) string {
- // Remove excessive newlines
- re := regexp.MustCompile(`\n{3,}`)
- content = re.ReplaceAllString(content, "\n\n")
-
- // Remove excessive spaces
- re = regexp.MustCompile(` {2,}`)
- content = re.ReplaceAllString(content, " ")
-
- // Trim whitespace
- content = strings.TrimSpace(content)
-
- return content
-}