diff --git a/internal/ingestion/parser/html_parser.go b/internal/ingestion/parser/html_parser.go
new file mode 100644
index 0000000000..4b6af46bc7
--- /dev/null
+++ b/internal/ingestion/parser/html_parser.go
@@ -0,0 +1,133 @@
+//
+// Copyright 2026 The InfiniFlow Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+
+package parser
+
+import (
+	"bytes"
+	"fmt"
+	"strings"
+
+	"golang.org/x/net/html"
+)
+
+const (
+	// Official selects the golang.org/x/net/html backend.
+	Official string = "official"
+)
+
+// HTMLParser parses HTML documents using the backend named by libType.
+type HTMLParser struct {
+	libType string
+}
+
+// NewHTMLParser returns an HTMLParser backed by the library named libType.
+// Official is the only supported value; any other value yields an error.
+func NewHTMLParser(libType string) (*HTMLParser, error) {
+	switch libType {
+	case Official:
+		return &HTMLParser{
+			libType: Official,
+		}, nil
+	default:
+		return nil, fmt.Errorf("unsupported HTML library type: %s", libType)
+	}
+}
+
+// Parse parses data as an HTML document with the configured backend and
+// prints the resulting node tree to standard output. filename is used only
+// in the progress message.
+func (p *HTMLParser) Parse(filename string, data []byte) error {
+	fmt.Printf("Parsing HTML file: %s\n", filename)
+	switch p.libType {
+	case Official:
+		return p.OfficialHTMLParse(data)
+	default:
+		return fmt.Errorf("unsupported HTML library type: %s", p.libType)
+	}
+}
+
+// OfficialHTMLParse parses data with golang.org/x/net/html and prints the
+// node tree via WalkIterative. It returns any error reported by html.Parse.
+func (p *HTMLParser) OfficialHTMLParse(data []byte) error {
+	// bytes.NewReader reads data in place, avoiding a string copy of the input.
+	doc, err := html.Parse(bytes.NewReader(data))
+	if err != nil {
+		return fmt.Errorf("parsing HTML: %w", err)
+	}
+	p.WalkIterative(doc)
+	return nil
+}
+
+// WalkIterative prints the tree rooted at root to standard output in
+// document order using an explicit stack rather than recursion. Elements,
+// non-blank text, comments and doctypes are printed, indented by depth.
+func (p *HTMLParser) WalkIterative(root *html.Node) {
+	if root == nil {
+		return
+	}
+
+	// Stack: stores node and its depth
+	type item struct {
+		node  *html.Node
+		depth int
+	}
+	stack := []item{{root, 0}}
+
+	for len(stack) > 0 {
+		// Pop the top of the stack
+		current := stack[len(stack)-1]
+		stack = stack[:len(stack)-1]
+
+		indent := strings.Repeat(" ", current.depth)
+
+		// Handle different node types
+		switch current.node.Type {
+		case html.ElementNode:
+			// Print opening tag
+			fmt.Printf("%s<%s", indent, current.node.Data)
+			// Print attributes
+			for _, attr := range current.node.Attr {
+				fmt.Printf(" %s=%q", attr.Key, attr.Val)
+			}
+			fmt.Println(">")
+
+		case html.TextNode:
+			// Print text content (trim extra whitespace)
+			text := strings.TrimSpace(current.node.Data)
+			if text != "" {
+				fmt.Printf("%stext: %q\n", indent, text)
+			}
+
+		case html.CommentNode:
+			fmt.Printf("%scomment: %s\n", indent, current.node.Data)
+
+		case html.DoctypeNode:
+			fmt.Printf("%sDOCTYPE: %s\n", indent, current.node.Data)
+		}
+
+		// Push children last-to-first so the first child is popped next,
+		// which keeps the output in document order.
+		for child := current.node.LastChild; child != nil; child = child.PrevSibling {
+			stack = append(stack, item{child, current.depth + 1})
+		}
+	}
+}
+
+// String returns the parser's display name.
+func (p *HTMLParser) String() string {
+	return "HTMLParser"
+}
diff --git a/internal/ingestion/parser/type.go b/internal/ingestion/parser/type.go
index 45f61de111..44a3b10288 100644
--- a/internal/ingestion/parser/type.go
+++ b/internal/ingestion/parser/type.go
@@ -41,6 +41,8 @@ func GetParser(fileType utility.FileType, config map[string]string) (FileParser,
 		return NewDOCParser(libType)
 	case utility.FileTypePDF:
 		return NewPDFParser(), nil
+	case utility.FileTypeHTML:
+		return NewHTMLParser(Official)
 	case utility.FileTypeMarkdown:
 		return NewMarkdownParser(GoMarkdown)
 	default:
diff --git a/internal/utility/file.go b/internal/utility/file.go
index ed21f030bf..2c12e59480 100644
--- a/internal/utility/file.go
+++ b/internal/utility/file.go
@@ -32,6 +32,7 @@ const (
 	FileTypePPTX     FileType = "pptx"
 	FileTypeXLS      FileType = "xls"
 	FileTypeXLSX     FileType = "xlsx"
+	FileTypeHTML     FileType = "html"
 	FileTypeMarkdown FileType = "md"
 	FileTypeVISUAL   FileType = "visual"
 	FileTypeAURAL    FileType = "aural"
@@ -83,6 +84,8 @@ func GetFileType(filename string) FileType {
 		return FileTypePPT
 	case "pptx":
 		return FileTypePPTX
+	case "html", "htm":
+		return FileTypeHTML
 	case "md":
 		return FileTypeMarkdown
 	default: