Commit `34b687d`

mo khan <mo@mokhan.ca>

2025-06-22 19:49:58

feat: implement HTML content extraction and markdown conversion

- Create pkg/htmlprocessor with ContentExtractor for better HTML processing
- Add ExtractReadableContent() method using goquery for content extraction
- Add ToMarkdown() method with html-to-markdown for better conversion
- Include comprehensive tests for both text extraction and markdown conversion
- Remove unwanted elements (scripts, styles, nav, ads) from extracted content
- Focus on main content areas (main, article, .content) with semantic fallbacks

Tests cover:
- Content extraction from complex HTML with sidebars/ads
- Script and style tag removal
- Basic markdown formatting (headers, bold, italic, lists, links, code)

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>

main

1 parent 3f49e4c

Changed files (2)

pkg

htmlprocessor

processor.go

processor_test.go

pkg/htmlprocessor/processor.go

@@ -0,0 +1,98 @@
+package htmlprocessor
+
+import (
+	"strings"
+
+	"github.com/JohannesKaufmann/html-to-markdown"
+	"github.com/PuerkitoBio/goquery"
+)
+
+// ContentExtractor handles HTML content extraction and conversion.
+// It exposes ExtractReadableContent, which returns plain text, and
+// ToMarkdown, which returns markdown.
+type ContentExtractor struct {
+	// converter is the html-to-markdown converter used by ToMarkdown; it is
+	// created and configured in NewContentExtractor.
+	converter *md.Converter
+}
+
+// NewContentExtractor creates a new ContentExtractor with default settings
+func NewContentExtractor() *ContentExtractor {
+	converter := md.NewConverter("", true, nil)
+	
+	// Add custom rules to remove unwanted elements
+	converter.AddRules(
+		md.Rule{
+			Filter: []string{"script", "style", "nav", "header", "footer", "aside"},
+			Replacement: func(content string, selec *goquery.Selection, opt *md.Options) *string {
+				// Remove these elements entirely
+				empty := ""
+				return &empty
+			},
+		},
+	)
+
+	return &ContentExtractor{
+		converter: converter,
+	}
+}
+
+// ExtractReadableContent extracts the main readable content from HTML as
+// plain text, one line per heading, paragraph, or list item.
+//
+// Scripts, styles, nav, header, footer and aside elements, plus anything with
+// the sidebar, ads or advertisement class, are removed first. The content root
+// is then the first <main>, else the first <article>, else the first element
+// matching .content, .main-content, #content or #main, else <body>.
+//
+// An element nested inside another matched element (for example a <p> inside
+// an <li>, or the items of a nested list) is not emitted on its own: the outer
+// element's text already contains it, so emitting both would duplicate it.
+//
+// Any error from parsing the document is returned unchanged.
+func (e *ContentExtractor) ExtractReadableContent(html string) (string, error) {
+	// textSelector lists the elements whose text makes up the result.
+	const textSelector = "h1, h2, h3, h4, h5, h6, p, li"
+
+	doc, err := goquery.NewDocumentFromReader(strings.NewReader(html))
+	if err != nil {
+		return "", err
+	}
+
+	// Remove unwanted elements
+	doc.Find("script, style, nav, header, footer, aside, .sidebar, .ads, .advertisement").Remove()
+
+	// Pick the content root: semantic HTML5 elements first, then common
+	// class/id conventions, and the whole body when nothing matches.
+	contentSelection := doc.Find("body")
+	for _, selector := range []string{"main", "article", ".content, .main-content, #content, #main"} {
+		if found := doc.Find(selector); found.Length() > 0 {
+			contentSelection = found.First()
+			break
+		}
+	}
+
+	// Extract text content
+	var textParts []string
+	contentSelection.Find(textSelector).Each(func(_ int, s *goquery.Selection) {
+		// Skip elements enclosed by another matched element inside the content
+		// root; their text is already part of that ancestor's text.
+		if s.ParentsFilteredUntilSelection(textSelector, contentSelection).Length() > 0 {
+			return
+		}
+		if text := strings.TrimSpace(s.Text()); text != "" {
+			textParts = append(textParts, text)
+		}
+	})
+
+	return strings.Join(textParts, "\n"), nil
+}
+
+// ToMarkdown converts HTML to markdown format and tidies its whitespace.
+//
+// After conversion, each line has its trailing whitespace removed, runs of
+// blank lines are collapsed to a single blank line, and leading and trailing
+// blank lines are dropped. Leading spaces and tabs on non-blank lines are
+// preserved because indentation is significant in markdown (nested list
+// items, indented code). The cleanup is applied to every line uniformly,
+// including lines inside code fences.
+//
+// Any error from the converter is returned unchanged.
+func (e *ContentExtractor) ToMarkdown(html string) (string, error) {
+	markdown, err := e.converter.ConvertString(html)
+	if err != nil {
+		return "", err
+	}
+
+	// Clean up extra whitespace
+	var cleanLines []string
+	for _, line := range strings.Split(markdown, "\n") {
+		trimmed := strings.TrimSpace(line)
+		if trimmed == "" {
+			// Keep at most one blank line in a row, and none before the
+			// first non-blank line.
+			if n := len(cleanLines); n > 0 && cleanLines[n-1] != "" {
+				cleanLines = append(cleanLines, "")
+			}
+			continue
+		}
+
+		// Re-attach the line's leading spaces/tabs so indentation survives;
+		// only the remaining surrounding whitespace is trimmed.
+		indent := line[:len(line)-len(strings.TrimLeft(line, " \t"))]
+		cleanLines = append(cleanLines, indent+trimmed)
+	}
+
+	// Remove trailing empty lines
+	for len(cleanLines) > 0 && cleanLines[len(cleanLines)-1] == "" {
+		cleanLines = cleanLines[:len(cleanLines)-1]
+	}
+
+	return strings.Join(cleanLines, "\n"), nil
+}
\ No newline at end of file

pkg/htmlprocessor/processor_test.go

@@ -0,0 +1,139 @@
+package htmlprocessor
+
+import (
+	"strings"
+	"testing"
+)
+
+func TestContentExtractor_ExtractReadableContent(t *testing.T) {
+	extractor := NewContentExtractor()
+
+	tests := []struct {
+		name     string
+		html     string
+		expected string
+	}{
+		{
+			name: "simple article with header and paragraph",
+			html: `
+				<html>
+					<head><title>Test Article</title></head>
+					<body>
+						<header>
+							<nav>Navigation</nav>
+						</header>
+						<main>
+							<h1>Main Title</h1>
+							<p>This is the main content that should be extracted.</p>
+							<p>Another paragraph with important information.</p>
+						</main>
+						<footer>Footer content</footer>
+					</body>
+				</html>
+			`,
+			expected: "Main Title\nThis is the main content that should be extracted.\nAnother paragraph with important information.",
+		},
+		{
+			name: "article with sidebar and ads",
+			html: `
+				<html>
+					<body>
+						<aside class="sidebar">Sidebar content</aside>
+						<div class="ads">Advertisement</div>
+						<article>
+							<h2>Article Title</h2>
+							<p>Article content here.</p>
+						</article>
+					</body>
+				</html>
+			`,
+			expected: "Article Title\nArticle content here.",
+		},
+		{
+			name: "content with script and style tags",
+			html: `
+				<html>
+					<head>
+						<style>body { color: red; }</style>
+					</head>
+					<body>
+						<h1>Clean Title</h1>
+						<script>console.log('should be removed');</script>
+						<p>Clean paragraph.</p>
+						<style>.hidden { display: none; }</style>
+					</body>
+				</html>
+			`,
+			expected: "Clean Title\nClean paragraph.",
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			result, err := extractor.ExtractReadableContent(tt.html)
+			if err != nil {
+				t.Fatalf("ExtractReadableContent() error = %v", err)
+			}
+
+			// Normalize whitespace for comparison
+			result = strings.TrimSpace(strings.ReplaceAll(result, "\n\n", "\n"))
+			expected := strings.TrimSpace(strings.ReplaceAll(tt.expected, "\n\n", "\n"))
+
+			if result != expected {
+				t.Errorf("ExtractReadableContent() = %q, want %q", result, expected)
+			}
+		})
+	}
+}
+
+func TestContentExtractor_ToMarkdown(t *testing.T) {
+	extractor := NewContentExtractor()
+
+	tests := []struct {
+		name     string
+		html     string
+		expected string
+	}{
+		{
+			name: "basic formatting",
+			html: `<h1>Title</h1><p>Paragraph with <strong>bold</strong> and <em>italic</em> text.</p>`,
+			expected: "# Title\n\nParagraph with **bold** and _italic_ text.",
+		},
+		{
+			name: "lists",
+			html: `
+				<ul>
+					<li>First item</li>
+					<li>Second item</li>
+				</ul>
+				<ol>
+					<li>Numbered first</li>
+					<li>Numbered second</li>
+				</ol>
+			`,
+			expected: "- First item\n- Second item\n\n1. Numbered first\n2. Numbered second",
+		},
+		{
+			name: "links and code",
+			html: `<p>Visit <a href="https://example.com">Example</a> for <code>code samples</code>.</p>`,
+			expected: "Visit [Example](https://example.com) for `code samples`.",
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			result, err := extractor.ToMarkdown(tt.html)
+			if err != nil {
+				t.Fatalf("ToMarkdown() error = %v", err)
+			}
+
+			// Normalize whitespace for comparison
+			result = strings.TrimSpace(result)
+			expected := strings.TrimSpace(tt.expected)
+
+			if result != expected {
+				t.Errorf("ToMarkdown() = %q, want %q", result, expected)
+			}
+		})
+	}
+}
\ No newline at end of file

Commit 34b687d

Commit `34b687d`