Go: parse HTML file (#16018)

### What problem does this PR solve? ``` RAGFlow(api/default)> parse file 'test.html'; Parsing HTML file: test.html <html> ...... ``` Signed-off-by: Jin Hai <haijin.chn@gmail.com>
2026-06-29 15:31:05 +08:00 · 2026-06-15 15:49:17 +08:00
parent 53d4d9b3bd
commit e3cb86d540
3 changed files with 123 additions and 0 deletions
--- a/internal/ingestion/parser/html_parser.go
+++ b/internal/ingestion/parser/html_parser.go
@@ -0,0 +1,118 @@
+//
+// Copyright 2026 The InfiniFlow Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+
+package parser
+
+import (
+	"bytes"
+	"fmt"
+	"strings"
+
+	"golang.org/x/net/html"
+)
+
+// Identifiers for the HTML parsing backends accepted by NewHTMLParser.
+const (
+	// Official selects the golang.org/x/net/html based implementation
+	// (see OfficialHTMLParse).
+	Official string = "official"
+)
+
+// HTMLParser parses HTML documents with the backend named by libType.
+// Construct it with NewHTMLParser so that libType holds a supported value.
+type HTMLParser struct {
+	// libType is the backend identifier; Parse dispatches on it and rejects
+	// anything other than Official.
+	libType string
+}
+
+// NewHTMLParser builds an HTMLParser for the requested backend.
+//
+// Official is the only recognized libType. Any other value yields a nil
+// parser together with an "unsupported HTML library type" error.
+func NewHTMLParser(libType string) (*HTMLParser, error) {
+	if libType != Official {
+		return nil, fmt.Errorf("unsupported HTML library type: %s", libType)
+	}
+
+	parser := &HTMLParser{libType: Official}
+	return parser, nil
+}
+
+// Parse announces filename on standard output and then hands data to the
+// backend chosen when the parser was created.
+//
+// It returns whatever the backend returns, or an "unsupported HTML library
+// type" error when libType is not a known backend.
+func (p *HTMLParser) Parse(filename string, data []byte) error {
+	fmt.Printf("Parsing HTML file: %s\n", filename)
+
+	if p.libType == Official {
+		return p.OfficialHTMLParse(data)
+	}
+	return fmt.Errorf("unsupported HTML library type: %s", p.libType)
+}
+
+// OfficialHTMLParse parses data with golang.org/x/net/html and prints the
+// resulting node tree to standard output through WalkIterative.
+//
+// It returns a wrapped error when html.Parse reports a failure; in that case
+// nothing is walked or printed. On success it returns nil.
+func (p *HTMLParser) OfficialHTMLParse(data []byte) error {
+	// bytes.NewReader reads the slice in place, so the document is not copied
+	// into a string first.
+	doc, err := html.Parse(bytes.NewReader(data))
+	if err != nil {
+		return fmt.Errorf("parsing HTML: %w", err)
+	}
+	p.WalkIterative(doc)
+	return nil
+}
+
+// WalkIterative prints the tree rooted at root to standard output in
+// document (pre-order) sequence, indenting each node by two spaces per level
+// of depth. A nil root is a no-op.
+//
+// Output per node type:
+//   - element: "<name key="val" ...>" (opening tag only)
+//   - text:    `text: "..."` with surrounding whitespace trimmed; text that is
+//     empty after trimming is skipped
+//   - comment: "comment: ..."
+//   - doctype: "DOCTYPE: ..."
+//
+// Other node types print nothing themselves, but their children are still
+// visited. The traversal uses an explicit stack instead of recursion.
+func (p *HTMLParser) WalkIterative(root *html.Node) {
+	if root == nil {
+		return
+	}
+
+	// Each stack entry pairs a node with its depth below root.
+	type item struct {
+		node  *html.Node
+		depth int
+	}
+	stack := []item{{root, 0}}
+
+	for len(stack) > 0 {
+		// Pop the top of the stack.
+		current := stack[len(stack)-1]
+		stack = stack[:len(stack)-1]
+
+		indent := strings.Repeat("  ", current.depth)
+
+		switch current.node.Type {
+		case html.ElementNode:
+			// Opening tag followed by its attributes, values quoted.
+			fmt.Printf("%s<%s", indent, current.node.Data)
+			for _, attr := range current.node.Attr {
+				fmt.Printf(" %s=%q", attr.Key, attr.Val)
+			}
+			fmt.Println(">")
+
+		case html.TextNode:
+			// Whitespace-only text nodes are skipped.
+			text := strings.TrimSpace(current.node.Data)
+			if text != "" {
+				fmt.Printf("%stext: %q\n", indent, text)
+			}
+
+		case html.CommentNode:
+			fmt.Printf("%scomment: %s\n", indent, current.node.Data)
+
+		case html.DoctypeNode:
+			fmt.Printf("%sDOCTYPE: %s\n", indent, current.node.Data)
+		}
+
+		// Push children last-to-first so the first child is popped next and
+		// siblings are visited in document order. Walking PrevSibling avoids
+		// building a reversed temporary slice, which cost quadratic time in
+		// the number of siblings.
+		for child := current.node.LastChild; child != nil; child = child.PrevSibling {
+			stack = append(stack, item{child, current.depth + 1})
+		}
+	}
+}
+
+// String returns the fixed name "HTMLParser", regardless of the configured
+// backend.
+func (p *HTMLParser) String() string {
+	return "HTMLParser"
+}
--- a/internal/ingestion/parser/type.go
+++ b/internal/ingestion/parser/type.go
@@ -41,6 +41,8 @@ func GetParser(fileType utility.FileType, config map[string]string) (FileParser,
 		return NewDOCParser(libType)
 	case utility.FileTypePDF:
 		return NewPDFParser(), nil
+	case utility.FileTypeHTML:
+		return NewHTMLParser(Official)
 	case utility.FileTypeMarkdown:
 		return NewMarkdownParser(GoMarkdown)
 	default:
--- a/internal/utility/file.go
+++ b/internal/utility/file.go
@@ -32,6 +32,7 @@ const (
 	FileTypePPTX     FileType = "pptx"
 	FileTypeXLS      FileType = "xls"
 	FileTypeXLSX     FileType = "xlsx"
+	FileTypeHTML     FileType = "html"
 	FileTypeMarkdown FileType = "md"
 	FileTypeVISUAL   FileType = "visual"
 	FileTypeAURAL    FileType = "aural"
@@ -83,6 +84,8 @@ func GetFileType(filename string) FileType {
 		return FileTypePPT
 	case "pptx":
 		return FileTypePPTX
+	case "html", "htm":
+		return FileTypeHTML
 	case "md":
 		return FileTypeMarkdown
 	default: