Go: parse HTML file (#16018)

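### What problem does this PR solve?

Adds an HTML parser to the Go parser package so that `parse file` accepts `.html` and `.htm` files. For now the parser prints the parsed node tree, for example: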

```
RAGFlow(api/default)> parse file 'test.html';
Parsing HTML file: test.html
  <html>
......
```

Signed-off-by: Jin Hai <haijin.chn@gmail.com>
This commit is contained in:
Jin Hai
2026-06-15 15:49:17 +08:00
committed by GitHub
parent 53d4d9b3bd
commit e3cb86d540
3 changed files with 123 additions and 0 deletions

View File

@@ -0,0 +1,118 @@
//
// Copyright 2026 The InfiniFlow Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
package parser
import (
"fmt"
"strings"
"golang.org/x/net/html"
)
// Official is the library type that selects the golang.org/x/net/html package
// as the HTML parsing backend.
const Official string = "official"
// HTMLParser parses HTML documents using the backend named by libType.
type HTMLParser struct {
// libType names the parsing backend; NewHTMLParser only accepts Official.
libType string
}
// NewHTMLParser builds an HTMLParser backed by the library named by libType.
//
// Official is the only accepted value; any other name yields a nil parser and
// an "unsupported HTML library type" error.
func NewHTMLParser(libType string) (*HTMLParser, error) {
	if libType != Official {
		return nil, fmt.Errorf("unsupported HTML library type: %s", libType)
	}
	parser := &HTMLParser{libType: Official}
	return parser, nil
}
// Parse announces filename on standard output and then parses data with the
// backend this parser was configured with.
//
// filename is used only in the printed message. An error is returned when the
// configured library type is not recognised, or when the backend reports one.
func (p *HTMLParser) Parse(filename string, data []byte) error {
	fmt.Printf("Parsing HTML file: %s\n", filename)
	if p.libType == Official {
		return p.OfficialHTMLParse(data)
	}
	return fmt.Errorf("unsupported HTML library type: %s", p.libType)
}
// OfficialHTMLParse parses data as an HTML document with golang.org/x/net/html
// and prints the resulting node tree to standard output via WalkIterative.
//
// It returns a wrapped error if html.Parse fails; in that case nothing is
// printed.
func (p *HTMLParser) OfficialHTMLParse(data []byte) error {
	doc, err := html.Parse(strings.NewReader(string(data)))
	if err != nil {
		// Surface the failure instead of walking a tree that was never built.
		return fmt.Errorf("parsing HTML: %w", err)
	}
	p.WalkIterative(doc)
	return nil
}
// WalkIterative prints the tree rooted at root to standard output in document
// order (depth-first, parents before children), using an explicit stack
// instead of recursion.
//
// Each printed line is indented according to the node's depth below root.
// Element nodes are printed as an opening tag with their attributes, text
// nodes as their whitespace-trimmed content (empty text is skipped), and
// comment and doctype nodes with a label. Nodes of any other type print
// nothing, but their children are still visited. A nil root is a no-op.
func (p *HTMLParser) WalkIterative(root *html.Node) {
	if root == nil {
		return
	}

	// item pairs a pending node with its depth below root.
	type item struct {
		node  *html.Node
		depth int
	}

	stack := []item{{node: root, depth: 0}}
	for len(stack) > 0 {
		// Pop the most recently pushed node.
		current := stack[len(stack)-1]
		stack = stack[:len(stack)-1]

		indent := strings.Repeat(" ", current.depth)

		switch current.node.Type {
		case html.ElementNode:
			// Opening tag followed by its attributes, values quoted.
			fmt.Printf("%s<%s", indent, current.node.Data)
			for _, attr := range current.node.Attr {
				fmt.Printf(" %s=%q", attr.Key, attr.Val)
			}
			fmt.Println(">")
		case html.TextNode:
			// Skip nodes that contain only whitespace.
			text := strings.TrimSpace(current.node.Data)
			if text != "" {
				fmt.Printf("%stext: %q\n", indent, text)
			}
		case html.CommentNode:
			fmt.Printf("%scomment: %s\n", indent, current.node.Data)
		case html.DoctypeNode:
			fmt.Printf("%sDOCTYPE: %s\n", indent, current.node.Data)
		}

		// Push children last-to-first so the first child ends up on top of the
		// stack and is visited next. Walking the sibling list backwards avoids
		// building a reversed temporary slice for every node.
		for child := current.node.LastChild; child != nil; child = child.PrevSibling {
			stack = append(stack, item{node: child, depth: current.depth + 1})
		}
	}
}
// String returns the fixed name "HTMLParser"; it does not depend on the
// parser's configuration.
func (p *HTMLParser) String() string {
	const name = "HTMLParser"
	return name
}

View File

@@ -41,6 +41,8 @@ func GetParser(fileType utility.FileType, config map[string]string) (FileParser,
return NewDOCParser(libType)
case utility.FileTypePDF:
return NewPDFParser(), nil
case utility.FileTypeHTML:
return NewHTMLParser(Official)
case utility.FileTypeMarkdown:
return NewMarkdownParser(GoMarkdown)
default:

View File

@@ -32,6 +32,7 @@ const (
FileTypePPTX FileType = "pptx"
FileTypeXLS FileType = "xls"
FileTypeXLSX FileType = "xlsx"
FileTypeHTML FileType = "html"
FileTypeMarkdown FileType = "md"
FileTypeVISUAL FileType = "visual"
FileTypeAURAL FileType = "aural"
@@ -83,6 +84,8 @@ func GetFileType(filename string) FileType {
return FileTypePPT
case "pptx":
return FileTypePPTX
case "html", "htm":
return FileTypeHTML
case "md":
return FileTypeMarkdown
default: