diff --git a/go.mod b/go.mod index 0846ed663d..d07674b5f5 100644 --- a/go.mod +++ b/go.mod @@ -20,6 +20,7 @@ require ( github.com/glebarez/sqlite v1.11.0 github.com/go-sql-driver/mysql v1.7.0 github.com/goccy/go-json v0.10.2 + github.com/gomarkdown/markdown v0.0.0-20260614204949-e08cff860f76 github.com/google/uuid v1.6.0 github.com/infiniflow/infinity-go-sdk v0.0.0-00010101000000-000000000000 github.com/iromli/go-itsdangerous v0.0.0-20220223194502-9c8bef8dac6a @@ -34,9 +35,8 @@ require ( github.com/signintech/gopdf v0.36.1 github.com/siongui/gojianfan v0.0.0-20210926212422-2f175ac615de github.com/spf13/viper v1.18.2 - github.com/yfedoseev/office_oxide/go v0.1.2 - github.com/yfedoseev/pdf_oxide/go v0.3.63 github.com/xuri/excelize/v2 v2.10.1 + github.com/yfedoseev/office_oxide/go v0.1.2 go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.69.0 go.opentelemetry.io/otel v1.44.0 go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.44.0 @@ -82,7 +82,6 @@ require ( github.com/cloudwego/base64x v0.1.6 // indirect github.com/dgryski/go-rendezvous v0.0.0-20200823014737-9f7001d12a5f // indirect github.com/dustin/go-humanize v1.0.1 // indirect - github.com/ebitengine/purego v0.10.1 // indirect github.com/eino-contrib/jsonschema v1.0.3 // indirect github.com/elastic/elastic-transport-go/v8 v8.8.0 // indirect github.com/felixge/httpsnoop v1.0.4 // indirect diff --git a/go.sum b/go.sum index d41227473d..d72083c1a6 100644 --- a/go.sum +++ b/go.sum @@ -97,8 +97,6 @@ github.com/dgryski/go-rendezvous v0.0.0-20200823014737-9f7001d12a5f/go.mod h1:cu github.com/dnaeon/go-vcr v1.2.0/go.mod h1:R4UdLID7HZT3taECzJs4YgbbH6PIGXB6W/sc5OLb6RQ= github.com/dustin/go-humanize v1.0.1 h1:GzkhY7T5VNhEkwH0PVJgjz+fX1rhBrR7pRT3mDkpeCY= github.com/dustin/go-humanize v1.0.1/go.mod h1:Mu1zIs6XwVuF/gI1OepvI0qD18qycQx+mFykh5fBlto= -github.com/ebitengine/purego v0.10.1 h1:dewVBCBT2GaMu1SrNTYxQhgQBethzfhiwvZiLGP/qyY= -github.com/ebitengine/purego v0.10.1/go.mod 
h1:iIjxzd6CiRiOG0UyXP+V1+jWqUXVjPKLAI0mRfJZTmQ= github.com/eino-contrib/jsonschema v1.0.3 h1:2Kfsm1xlMV0ssY2nuxshS4AwbLFuqmPmzIjLVJ1Fsp0= github.com/eino-contrib/jsonschema v1.0.3/go.mod h1:cpnX4SyKjWjGC7iN2EbhxaTdLqGjCi0e9DxpLYxddD4= github.com/elastic/elastic-transport-go/v8 v8.8.0 h1:7k1Ua+qluFr6p1jfJjGDl97ssJS/P7cHNInzfxgBQAo= @@ -169,6 +167,8 @@ github.com/golang/protobuf v1.4.1/go.mod h1:U8fpvMrcmy5pZrNK1lt4xCsGvpyWQ/VVv6QD github.com/golang/protobuf v1.4.3/go.mod h1:oDoupMAO8OvCJWAcko0GGGIgR6R6ocIYbsSw735rRwI= github.com/golang/protobuf v1.5.4 h1:i7eJL8qZTpSEXOPTxNKhASYpMn+8e5Q6AdndVa1dWek= github.com/golang/protobuf v1.5.4/go.mod h1:lnTiLA8Wa4RWRcIUkrtSVa5nRhsEGBg48fD6rSs7xps= +github.com/gomarkdown/markdown v0.0.0-20260614204949-e08cff860f76 h1:Ltt9ldIaSYEsjA7sPY2c8r9dOmnKM1vlzhh3dxlhBHM= +github.com/gomarkdown/markdown v0.0.0-20260614204949-e08cff860f76/go.mod h1:JDGcbDT52eL4fju3sZ4TeHGsQwhG9nbDV21aMyhwPoA= github.com/google/go-cmp v0.2.0/go.mod h1:oXzfMopK8JAjlY9xF4vHSVASa0yLyX7SntLO5aqRK0M= github.com/google/go-cmp v0.3.0/go.mod h1:8QqcDgzrUqlUb/G2PQTWiueGozuR1884gddMywk6iLU= github.com/google/go-cmp v0.3.1/go.mod h1:8QqcDgzrUqlUb/G2PQTWiueGozuR1884gddMywk6iLU= @@ -365,10 +365,6 @@ github.com/twitchyliquid64/golang-asm v0.15.1 h1:SU5vSMR7hnwNxj24w34ZyCi/FmDZTkS github.com/twitchyliquid64/golang-asm v0.15.1/go.mod h1:a1lVb/DtPvCB8fslRZhAngC2+aY1QWCk3Cedj/Gdt08= github.com/ugorji/go/codec v1.2.12 h1:9LC83zGrHhuUA9l16C9AHXAqEV/2wBQ4nkvumAE65EE= github.com/ugorji/go/codec v1.2.12/go.mod h1:UNopzCgEMSXjBc6AOMqYvWC1ktqTAfzJZUZgYf6w6lg= -github.com/yfedoseev/office_oxide/go v0.1.2 h1:LnyVGXgJJF4tanuRUYVHZNn8e+IwGvOqtIFmQGDjPE4= -github.com/yfedoseev/office_oxide/go v0.1.2/go.mod h1:YLtMlKUkRCp/Q96wsy7D6yoBKDeJnP66UH+c9Bb+E+M= -github.com/yfedoseev/pdf_oxide/go v0.3.63 h1:6qlNQdaiGBGlo70je1fApQcCjeKg6AVUSUo+URCLl/s= -github.com/yfedoseev/pdf_oxide/go v0.3.63/go.mod h1:QbJ/nLbez0al2EnqEdEPIlGflFprWmiuUM4mo9rNNOI= github.com/wk8/go-ordered-map/v2 v2.1.8 
h1:5h/BUHu93oj4gIdvHHHGsScSTMijfx5PeYkE/fJgbpc= github.com/wk8/go-ordered-map/v2 v2.1.8/go.mod h1:5nJHM5DyteebpVlHnWMV0rPz6Zp7+xBAnxjb1X5vnTw= github.com/x-cray/logrus-prefixed-formatter v0.5.2 h1:00txxvfBM9muc0jiLIEAkAcIMJzfthRT6usrui8uGmg= @@ -381,6 +377,8 @@ github.com/xuri/nfp v0.0.2-0.20250530014748-2ddeb826f9a9 h1:+C0TIdyyYmzadGaL/HBL github.com/xuri/nfp v0.0.2-0.20250530014748-2ddeb826f9a9/go.mod h1:WwHg+CVyzlv/TX9xqBFXEZAuxOPxn2k1GNHwG41IIUQ= github.com/yargevad/filepathx v1.0.0 h1:SYcT+N3tYGi+NvazubCNlvgIPbzAk7i7y2dwg3I5FYc= github.com/yargevad/filepathx v1.0.0/go.mod h1:BprfX/gpYNJHJfc35GjRRpVcwWXS89gGulUIU5tK3tA= +github.com/yfedoseev/office_oxide/go v0.1.2 h1:LnyVGXgJJF4tanuRUYVHZNn8e+IwGvOqtIFmQGDjPE4= +github.com/yfedoseev/office_oxide/go v0.1.2/go.mod h1:YLtMlKUkRCp/Q96wsy7D6yoBKDeJnP66UH+c9Bb+E+M= github.com/yuin/gopher-lua v1.1.1 h1:kYKnWBjvbNP4XLT3+bPEwAXJx262OhaHDWDVOPjL46M= github.com/yuin/gopher-lua v1.1.1/go.mod h1:GBR0iDaNXjAgGg9zfCvksxSRnQx76gclCIb7kdAd1Pw= github.com/zeebo/xxh3 v1.0.2 h1:xZmwmqxHZA8AI603jOQ0tMqmBr9lPeFwGg6d+xy9DC0= diff --git a/internal/ingestion/parser/markdown_parser.go b/internal/ingestion/parser/markdown_parser.go new file mode 100644 index 0000000000..3b980973be --- /dev/null +++ b/internal/ingestion/parser/markdown_parser.go @@ -0,0 +1,72 @@ +// +// Copyright 2026 The InfiniFlow Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+//
+
+package parser
+
+import (
+	"fmt"
+	"os"
+
+	"github.com/gomarkdown/markdown/ast"
+	"github.com/gomarkdown/markdown/parser"
+)
+
+const (
+	// GoMarkdown is the libType value that selects the gomarkdown backend.
+	GoMarkdown = "go_markdown"
+)
+
+// MarkdownParser parses Markdown input with the backend named by libType.
+type MarkdownParser struct {
+	libType string
+}
+
+// NewMarkdownParser returns a parser for libType; only GoMarkdown is supported.
+func NewMarkdownParser(libType string) (*MarkdownParser, error) {
+	switch libType {
+	case GoMarkdown:
+		return &MarkdownParser{libType: GoMarkdown}, nil
+	default:
+		return nil, fmt.Errorf("unsupported Markdown library type: %s", libType)
+	}
+}
+
+// Parse prints filename to stdout and parses data with the configured backend.
+func (p *MarkdownParser) Parse(filename string, data []byte) error {
+	fmt.Printf("Parsing Markdown file: %s\n", filename)
+	switch p.libType {
+	case GoMarkdown:
+		return p.GoMarkdownParse(data)
+	default:
+		return fmt.Errorf("unsupported Markdown library type: %s", p.libType)
+	}
+}
+
+// GoMarkdownParse parses data with gomarkdown, prints the AST to stdout, and returns nil.
+func (p *MarkdownParser) GoMarkdownParse(data []byte) error {
+	extensions := parser.CommonExtensions | parser.AutoHeadingIDs | parser.NoEmptyLineBeforeBlock
+	doc := parser.NewWithExtensions(extensions).Parse(data)
+
+	fmt.Print("--- AST tree:\n")
+	ast.Print(os.Stdout, doc)
+	fmt.Print("\n")
+	return nil
+}
+
+// String returns the fixed name "MarkdownParser".
+func (p *MarkdownParser) String() string {
+	return "MarkdownParser"
+}
diff --git a/internal/ingestion/parser/type.go b/internal/ingestion/parser/type.go
index da40976f79..45f61de111 100644
--- a/internal/ingestion/parser/type.go
+++ b/internal/ingestion/parser/type.go
@@ -41,6 +41,8 @@ func GetParser(fileType utility.FileType, config map[string]string) (FileParser,
 		return NewDOCParser(libType)
 	case utility.FileTypePDF:
 		return NewPDFParser(), nil
+	case utility.FileTypeMarkdown:
+		return NewMarkdownParser(GoMarkdown)
 	default:
 		return nil, fmt.Errorf("unsupported file type: %s", fileType)
 	}
diff --git a/internal/utility/file.go b/internal/utility/file.go
index 1b372a3af4..ed21f030bf 100644
--- a/internal/utility/file.go
+++ b/internal/utility/file.go
@@ -25,17 +25,18 @@
import (
 type FileType string
 
 const (
-	FileTypePDF    FileType = "pdf"
-	FileTypeDOC    FileType = "doc"
-	FileTypeDOCX   FileType = "docx"
-	FileTypePPT    FileType = "ppt"
-	FileTypePPTX   FileType = "pptx"
-	FileTypeXLS    FileType = "xls"
-	FileTypeXLSX   FileType = "xlsx"
-	FileTypeVISUAL FileType = "visual"
-	FileTypeAURAL  FileType = "aural"
-	FileTypeFOLDER FileType = "folder"
-	FileTypeOTHER  FileType = "other"
+	FileTypePDF      FileType = "pdf"
+	FileTypeDOC      FileType = "doc"
+	FileTypeDOCX     FileType = "docx"
+	FileTypePPT      FileType = "ppt"
+	FileTypePPTX     FileType = "pptx"
+	FileTypeXLS      FileType = "xls"
+	FileTypeXLSX     FileType = "xlsx"
+	FileTypeMarkdown FileType = "md" // returned by GetFileType for its "md" case
+	FileTypeVISUAL   FileType = "visual"
+	FileTypeAURAL    FileType = "aural"
+	FileTypeFOLDER   FileType = "folder"
+	FileTypeOTHER    FileType = "other"
 )
 
 var (
@@ -82,6 +83,8 @@ func GetFileType(filename string) FileType {
 		return FileTypePPT
 	case "pptx":
 		return FileTypePPTX
+	case "md":
+		return FileTypeMarkdown
 	default:
 		return FileTypeOTHER
 	}