Commit 34b687d
Changed files (2)
pkg
htmlprocessor
pkg/htmlprocessor/processor.go
@@ -0,0 +1,98 @@
+package htmlprocessor
+
+import (
+ "strings"
+
+ "github.com/JohannesKaufmann/html-to-markdown"
+ "github.com/PuerkitoBio/goquery"
+)
+
+// ContentExtractor turns raw HTML into either plain readable text or Markdown.
+// Build one with NewContentExtractor, which installs the configured converter.
+type ContentExtractor struct {
+	// converter is the html-to-markdown converter that ToMarkdown delegates to.
+	converter *md.Converter
+}
+
+// NewContentExtractor returns a ContentExtractor whose Markdown converter
+// discards page chrome instead of rendering it.
+//
+// The converter is created with md.NewConverter("", true, nil) and extended
+// with a single rule that replaces script, style, nav, header, footer and
+// aside elements with the empty string, so they are dropped from the
+// Markdown output.
+func NewContentExtractor() *ContentExtractor {
+	// discard is the replacement callback for the filtered tags. It ignores
+	// its arguments and always yields a pointer to a fresh empty string.
+	discard := func(_ string, _ *goquery.Selection, _ *md.Options) *string {
+		return new(string)
+	}
+
+	conv := md.NewConverter("", true, nil)
+	conv.AddRules(md.Rule{
+		Filter:      []string{"script", "style", "nav", "header", "footer", "aside"},
+		Replacement: discard,
+	})
+
+	return &ContentExtractor{converter: conv}
+}
+
+// ExtractReadableContent extracts the main readable text from an HTML document.
+//
+// Processing steps:
+//  1. Parse html with goquery.
+//  2. Remove script, style, nav, header, footer and aside elements, plus
+//     anything with the classes sidebar, ads or advertisement.
+//  3. Pick the content root: the first <main>, else the first <article>, else
+//     the first element matching .content, .main-content, #content or #main,
+//     else <body>.
+//  4. Collect the trimmed text of every heading (h1-h6), paragraph and list
+//     item under the root, in document order, skipping empty results.
+//
+// A matched element that contains other matched elements (for example an <li>
+// wrapping a <p>, or a nested list) contributes only the text that does not
+// belong to those nested elements; the nested elements are emitted as their
+// own entries. This keeps each piece of text from appearing more than once.
+//
+// The collected pieces are joined with "\n". An error is returned only when
+// the HTML cannot be parsed.
+func (e *ContentExtractor) ExtractReadableContent(html string) (string, error) {
+	doc, err := goquery.NewDocumentFromReader(strings.NewReader(html))
+	if err != nil {
+		return "", err
+	}
+
+	// Strip non-content elements before looking for the content root.
+	doc.Find("script, style, nav, header, footer, aside, .sidebar, .ads, .advertisement").Remove()
+
+	// Content roots in order of preference; <body> is the fallback when none
+	// of them is present.
+	root := doc.Find("body")
+	for _, candidate := range []string{
+		"main",
+		"article",
+		".content, .main-content, #content, #main",
+	} {
+		if found := doc.Find(candidate); found.Length() > 0 {
+			root = found.First()
+			break
+		}
+	}
+
+	// Elements whose text is emitted as a separate entry.
+	const textBlocks = "h1, h2, h3, h4, h5, h6, p, li"
+
+	var parts []string
+	root.Find(textBlocks).Each(func(_ int, el *goquery.Selection) {
+		// Text() includes descendant text, so work on a detached copy with
+		// nested text blocks removed; those are visited on their own by Each.
+		own := el.Clone()
+		own.Find(textBlocks).Remove()
+
+		if text := strings.TrimSpace(own.Text()); text != "" {
+			parts = append(parts, text)
+		}
+	})
+
+	return strings.Join(parts, "\n"), nil
+}
+
+// ToMarkdown converts html to Markdown using the extractor's converter and
+// tidies the whitespace of the result.
+//
+// The cleanup works line by line:
+//   - trailing whitespace is stripped from every line;
+//   - leading whitespace of non-blank lines is kept, because indentation is
+//     significant in Markdown (nested list items, code block contents);
+//   - whitespace-only lines count as blank, runs of blank lines collapse to a
+//     single blank line, and leading and trailing blank lines are dropped.
+//
+// Any error reported by the converter is returned unchanged together with an
+// empty string.
+func (e *ContentExtractor) ToMarkdown(html string) (string, error) {
+	markdown, err := e.converter.ConvertString(html)
+	if err != nil {
+		return "", err
+	}
+
+	var cleaned []string
+	for _, line := range strings.Split(markdown, "\n") {
+		body := strings.TrimSpace(line)
+		if body == "" {
+			// Keep a blank line only as a separator after content, never at
+			// the start and never two in a row.
+			if n := len(cleaned); n > 0 && cleaned[n-1] != "" {
+				cleaned = append(cleaned, "")
+			}
+			continue
+		}
+
+		// body starts with a non-space character, so its first occurrence in
+		// line sits right after the leading whitespace; everything before it
+		// is the indentation to preserve.
+		indent := line[:strings.Index(line, body)]
+		cleaned = append(cleaned, indent+body)
+	}
+
+	// Drop the blank separator left behind when the output ended in blank lines.
+	for len(cleaned) > 0 && cleaned[len(cleaned)-1] == "" {
+		cleaned = cleaned[:len(cleaned)-1]
+	}
+
+	return strings.Join(cleaned, "\n"), nil
+}
\ No newline at end of file
pkg/htmlprocessor/processor_test.go
@@ -0,0 +1,139 @@
+package htmlprocessor
+
+import (
+ "strings"
+ "testing"
+)
+
+// TestContentExtractor_ExtractReadableContent runs ExtractReadableContent over
+// small HTML documents and checks that only the main textual content survives:
+// navigation, header, footer, sidebar, ads, script and style content must not
+// appear in the result.
+func TestContentExtractor_ExtractReadableContent(t *testing.T) {
+	// squash makes the comparison tolerant of blank lines and surrounding
+	// whitespace by collapsing "\n\n" to "\n" and trimming both ends.
+	squash := func(s string) string {
+		return strings.TrimSpace(strings.ReplaceAll(s, "\n\n", "\n"))
+	}
+
+	cases := []struct {
+		name     string
+		html     string
+		expected string
+	}{
+		{
+			name: "simple article with header and paragraph",
+			html: `
+ <html>
+ <head><title>Test Article</title></head>
+ <body>
+ <header>
+ <nav>Navigation</nav>
+ </header>
+ <main>
+ <h1>Main Title</h1>
+ <p>This is the main content that should be extracted.</p>
+ <p>Another paragraph with important information.</p>
+ </main>
+ <footer>Footer content</footer>
+ </body>
+ </html>
+ `,
+			expected: "Main Title\nThis is the main content that should be extracted.\nAnother paragraph with important information.",
+		},
+		{
+			name: "article with sidebar and ads",
+			html: `
+ <html>
+ <body>
+ <aside class="sidebar">Sidebar content</aside>
+ <div class="ads">Advertisement</div>
+ <article>
+ <h2>Article Title</h2>
+ <p>Article content here.</p>
+ </article>
+ </body>
+ </html>
+ `,
+			expected: "Article Title\nArticle content here.",
+		},
+		{
+			name: "content with script and style tags",
+			html: `
+ <html>
+ <head>
+ <style>body { color: red; }</style>
+ </head>
+ <body>
+ <h1>Clean Title</h1>
+ <script>console.log('should be removed');</script>
+ <p>Clean paragraph.</p>
+ <style>.hidden { display: none; }</style>
+ </body>
+ </html>
+ `,
+			expected: "Clean Title\nClean paragraph.",
+		},
+	}
+
+	ce := NewContentExtractor()
+	for _, tc := range cases {
+		t.Run(tc.name, func(t *testing.T) {
+			raw, err := ce.ExtractReadableContent(tc.html)
+			if err != nil {
+				t.Fatalf("ExtractReadableContent() error = %v", err)
+			}
+
+			got, want := squash(raw), squash(tc.expected)
+			if got != want {
+				t.Errorf("ExtractReadableContent() = %q, want %q", got, want)
+			}
+		})
+	}
+}
+
+// TestContentExtractor_ToMarkdown checks the Markdown produced by ToMarkdown
+// for headings, emphasis, bullet and numbered lists, links and inline code.
+// Both sides of each comparison are trimmed of surrounding whitespace first.
+func TestContentExtractor_ToMarkdown(t *testing.T) {
+	cases := []struct {
+		name     string
+		html     string
+		expected string
+	}{
+		{
+			name:     "basic formatting",
+			html:     `<h1>Title</h1><p>Paragraph with <strong>bold</strong> and <em>italic</em> text.</p>`,
+			expected: "# Title\n\nParagraph with **bold** and _italic_ text.",
+		},
+		{
+			name: "lists",
+			html: `
+ <ul>
+ <li>First item</li>
+ <li>Second item</li>
+ </ul>
+ <ol>
+ <li>Numbered first</li>
+ <li>Numbered second</li>
+ </ol>
+ `,
+			expected: "- First item\n- Second item\n\n1. Numbered first\n2. Numbered second",
+		},
+		{
+			name:     "links and code",
+			html:     `<p>Visit <a href="https://example.com">Example</a> for <code>code samples</code>.</p>`,
+			expected: "Visit [Example](https://example.com) for `code samples`.",
+		},
+	}
+
+	ce := NewContentExtractor()
+	for _, tc := range cases {
+		t.Run(tc.name, func(t *testing.T) {
+			out, err := ce.ToMarkdown(tc.html)
+			if err != nil {
+				t.Fatalf("ToMarkdown() error = %v", err)
+			}
+
+			// Surrounding whitespace is not part of the contract under test.
+			got := strings.TrimSpace(out)
+			want := strings.TrimSpace(tc.expected)
+			if got == want {
+				return
+			}
+			t.Errorf("ToMarkdown() = %q, want %q", got, want)
+		})
+	}
+}
\ No newline at end of file