Go: add Markdown parser (#16016)

### What problem does this PR solve?

```
RAGFlow(api/default)> parse file 'README.md';
Parsing Markdown file: README.md
--- AST tree:
HTMLBlock '<div align="center">\n<a href="https:…'
```

### Type of change

- [x] New Feature (non-breaking change which adds functionality)

Signed-off-by: Jin Hai <haijin.chn@gmail.com>
This commit is contained in:
Jin Hai
2026-06-15 15:07:29 +08:00
committed by GitHub
parent fcebcebe1e
commit 2846216674
5 changed files with 94 additions and 20 deletions

5
go.mod
View File

@@ -20,6 +20,7 @@ require (
github.com/glebarez/sqlite v1.11.0
github.com/go-sql-driver/mysql v1.7.0
github.com/goccy/go-json v0.10.2
github.com/gomarkdown/markdown v0.0.0-20260614204949-e08cff860f76
github.com/google/uuid v1.6.0
github.com/infiniflow/infinity-go-sdk v0.0.0-00010101000000-000000000000
github.com/iromli/go-itsdangerous v0.0.0-20220223194502-9c8bef8dac6a
@@ -34,9 +35,8 @@ require (
github.com/signintech/gopdf v0.36.1
github.com/siongui/gojianfan v0.0.0-20210926212422-2f175ac615de
github.com/spf13/viper v1.18.2
github.com/yfedoseev/office_oxide/go v0.1.2
github.com/yfedoseev/pdf_oxide/go v0.3.63
github.com/xuri/excelize/v2 v2.10.1
github.com/yfedoseev/office_oxide/go v0.1.2
go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.69.0
go.opentelemetry.io/otel v1.44.0
go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.44.0
@@ -82,7 +82,6 @@ require (
github.com/cloudwego/base64x v0.1.6 // indirect
github.com/dgryski/go-rendezvous v0.0.0-20200823014737-9f7001d12a5f // indirect
github.com/dustin/go-humanize v1.0.1 // indirect
github.com/ebitengine/purego v0.10.1 // indirect
github.com/eino-contrib/jsonschema v1.0.3 // indirect
github.com/elastic/elastic-transport-go/v8 v8.8.0 // indirect
github.com/felixge/httpsnoop v1.0.4 // indirect

10
go.sum
View File

@@ -97,8 +97,6 @@ github.com/dgryski/go-rendezvous v0.0.0-20200823014737-9f7001d12a5f/go.mod h1:cu
github.com/dnaeon/go-vcr v1.2.0/go.mod h1:R4UdLID7HZT3taECzJs4YgbbH6PIGXB6W/sc5OLb6RQ=
github.com/dustin/go-humanize v1.0.1 h1:GzkhY7T5VNhEkwH0PVJgjz+fX1rhBrR7pRT3mDkpeCY=
github.com/dustin/go-humanize v1.0.1/go.mod h1:Mu1zIs6XwVuF/gI1OepvI0qD18qycQx+mFykh5fBlto=
github.com/ebitengine/purego v0.10.1 h1:dewVBCBT2GaMu1SrNTYxQhgQBethzfhiwvZiLGP/qyY=
github.com/ebitengine/purego v0.10.1/go.mod h1:iIjxzd6CiRiOG0UyXP+V1+jWqUXVjPKLAI0mRfJZTmQ=
github.com/eino-contrib/jsonschema v1.0.3 h1:2Kfsm1xlMV0ssY2nuxshS4AwbLFuqmPmzIjLVJ1Fsp0=
github.com/eino-contrib/jsonschema v1.0.3/go.mod h1:cpnX4SyKjWjGC7iN2EbhxaTdLqGjCi0e9DxpLYxddD4=
github.com/elastic/elastic-transport-go/v8 v8.8.0 h1:7k1Ua+qluFr6p1jfJjGDl97ssJS/P7cHNInzfxgBQAo=
@@ -169,6 +167,8 @@ github.com/golang/protobuf v1.4.1/go.mod h1:U8fpvMrcmy5pZrNK1lt4xCsGvpyWQ/VVv6QD
github.com/golang/protobuf v1.4.3/go.mod h1:oDoupMAO8OvCJWAcko0GGGIgR6R6ocIYbsSw735rRwI=
github.com/golang/protobuf v1.5.4 h1:i7eJL8qZTpSEXOPTxNKhASYpMn+8e5Q6AdndVa1dWek=
github.com/golang/protobuf v1.5.4/go.mod h1:lnTiLA8Wa4RWRcIUkrtSVa5nRhsEGBg48fD6rSs7xps=
github.com/gomarkdown/markdown v0.0.0-20260614204949-e08cff860f76 h1:Ltt9ldIaSYEsjA7sPY2c8r9dOmnKM1vlzhh3dxlhBHM=
github.com/gomarkdown/markdown v0.0.0-20260614204949-e08cff860f76/go.mod h1:JDGcbDT52eL4fju3sZ4TeHGsQwhG9nbDV21aMyhwPoA=
github.com/google/go-cmp v0.2.0/go.mod h1:oXzfMopK8JAjlY9xF4vHSVASa0yLyX7SntLO5aqRK0M=
github.com/google/go-cmp v0.3.0/go.mod h1:8QqcDgzrUqlUb/G2PQTWiueGozuR1884gddMywk6iLU=
github.com/google/go-cmp v0.3.1/go.mod h1:8QqcDgzrUqlUb/G2PQTWiueGozuR1884gddMywk6iLU=
@@ -365,10 +365,6 @@ github.com/twitchyliquid64/golang-asm v0.15.1 h1:SU5vSMR7hnwNxj24w34ZyCi/FmDZTkS
github.com/twitchyliquid64/golang-asm v0.15.1/go.mod h1:a1lVb/DtPvCB8fslRZhAngC2+aY1QWCk3Cedj/Gdt08=
github.com/ugorji/go/codec v1.2.12 h1:9LC83zGrHhuUA9l16C9AHXAqEV/2wBQ4nkvumAE65EE=
github.com/ugorji/go/codec v1.2.12/go.mod h1:UNopzCgEMSXjBc6AOMqYvWC1ktqTAfzJZUZgYf6w6lg=
github.com/yfedoseev/office_oxide/go v0.1.2 h1:LnyVGXgJJF4tanuRUYVHZNn8e+IwGvOqtIFmQGDjPE4=
github.com/yfedoseev/office_oxide/go v0.1.2/go.mod h1:YLtMlKUkRCp/Q96wsy7D6yoBKDeJnP66UH+c9Bb+E+M=
github.com/yfedoseev/pdf_oxide/go v0.3.63 h1:6qlNQdaiGBGlo70je1fApQcCjeKg6AVUSUo+URCLl/s=
github.com/yfedoseev/pdf_oxide/go v0.3.63/go.mod h1:QbJ/nLbez0al2EnqEdEPIlGflFprWmiuUM4mo9rNNOI=
github.com/wk8/go-ordered-map/v2 v2.1.8 h1:5h/BUHu93oj4gIdvHHHGsScSTMijfx5PeYkE/fJgbpc=
github.com/wk8/go-ordered-map/v2 v2.1.8/go.mod h1:5nJHM5DyteebpVlHnWMV0rPz6Zp7+xBAnxjb1X5vnTw=
github.com/x-cray/logrus-prefixed-formatter v0.5.2 h1:00txxvfBM9muc0jiLIEAkAcIMJzfthRT6usrui8uGmg=
@@ -381,6 +377,8 @@ github.com/xuri/nfp v0.0.2-0.20250530014748-2ddeb826f9a9 h1:+C0TIdyyYmzadGaL/HBL
github.com/xuri/nfp v0.0.2-0.20250530014748-2ddeb826f9a9/go.mod h1:WwHg+CVyzlv/TX9xqBFXEZAuxOPxn2k1GNHwG41IIUQ=
github.com/yargevad/filepathx v1.0.0 h1:SYcT+N3tYGi+NvazubCNlvgIPbzAk7i7y2dwg3I5FYc=
github.com/yargevad/filepathx v1.0.0/go.mod h1:BprfX/gpYNJHJfc35GjRRpVcwWXS89gGulUIU5tK3tA=
github.com/yfedoseev/office_oxide/go v0.1.2 h1:LnyVGXgJJF4tanuRUYVHZNn8e+IwGvOqtIFmQGDjPE4=
github.com/yfedoseev/office_oxide/go v0.1.2/go.mod h1:YLtMlKUkRCp/Q96wsy7D6yoBKDeJnP66UH+c9Bb+E+M=
github.com/yuin/gopher-lua v1.1.1 h1:kYKnWBjvbNP4XLT3+bPEwAXJx262OhaHDWDVOPjL46M=
github.com/yuin/gopher-lua v1.1.1/go.mod h1:GBR0iDaNXjAgGg9zfCvksxSRnQx76gclCIb7kdAd1Pw=
github.com/zeebo/xxh3 v1.0.2 h1:xZmwmqxHZA8AI603jOQ0tMqmBr9lPeFwGg6d+xy9DC0=

View File

@@ -0,0 +1,72 @@
//
// Copyright 2026 The InfiniFlow Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
package parser
import (
"fmt"
"os"
"github.com/gomarkdown/markdown/ast"
"github.com/gomarkdown/markdown/parser"
)
// GoMarkdown is the library-type identifier that selects the
// github.com/gomarkdown/markdown backend for MarkdownParser.
const GoMarkdown = "go_markdown"
// MarkdownParser parses Markdown documents using the backend library named
// by libType. Instances are created with NewMarkdownParser.
type MarkdownParser struct {
// libType names the Markdown library used by Parse; NewMarkdownParser only
// accepts GoMarkdown.
libType string
}
// NewMarkdownParser returns a MarkdownParser backed by the library named by
// libType. GoMarkdown is the only accepted value; any other value yields a
// nil parser and an "unsupported Markdown library type" error.
func NewMarkdownParser(libType string) (*MarkdownParser, error) {
	// Reject unknown backends up front so the success path stays flat.
	if libType != GoMarkdown {
		return nil, fmt.Errorf("unsupported Markdown library type: %s", libType)
	}

	mdParser := &MarkdownParser{libType: GoMarkdown}
	return mdParser, nil
}
// Parse prints a "Parsing Markdown file" line for filename to standard output
// and then hands data to the backend selected by p.libType.
//
// filename is used only for that message. An error is returned when
// p.libType is not a supported library type; otherwise the backend's result
// is returned unchanged.
func (p *MarkdownParser) Parse(filename string, data []byte) error {
	fmt.Printf("Parsing Markdown file: %s\n", filename)

	// GoMarkdown is the only backend wired in; everything else is rejected.
	if p.libType != GoMarkdown {
		return fmt.Errorf("unsupported Markdown library type: %s", p.libType)
	}
	return p.GoMarkdownParse(data)
}
// GoMarkdownParse parses data with the gomarkdown library and prints the
// resulting AST to standard output between a "--- AST tree:" header line and
// a trailing blank line.
//
// The parser is configured with parser.CommonExtensions plus
// parser.AutoHeadingIDs and parser.NoEmptyLineBeforeBlock. The AST is only
// printed, not returned or stored, and the method always returns nil.
func (p *MarkdownParser) GoMarkdownParse(data []byte) error {
	// Create the Markdown parser with the desired extensions. A new parser
	// is built on every call.
	extensions := parser.CommonExtensions | parser.AutoHeadingIDs | parser.NoEmptyLineBeforeBlock
	markdownParser := parser.NewWithExtensions(extensions)

	doc := markdownParser.Parse(data)

	fmt.Print("--- AST tree:\n")
	ast.Print(os.Stdout, doc)
	fmt.Print("\n")
	return nil
}
// String implements fmt.Stringer and returns the fixed name "MarkdownParser",
// regardless of the configured library type.
func (p *MarkdownParser) String() string {
	const parserName = "MarkdownParser"
	return parserName
}

View File

@@ -41,6 +41,8 @@ func GetParser(fileType utility.FileType, config map[string]string) (FileParser,
return NewDOCParser(libType)
case utility.FileTypePDF:
return NewPDFParser(), nil
case utility.FileTypeMarkdown:
return NewMarkdownParser(GoMarkdown)
default:
return nil, fmt.Errorf("unsupported file type: %s", fileType)
}

View File

@@ -25,17 +25,18 @@ import (
type FileType string
const (
FileTypePDF FileType = "pdf"
FileTypeDOC FileType = "doc"
FileTypeDOCX FileType = "docx"
FileTypePPT FileType = "ppt"
FileTypePPTX FileType = "pptx"
FileTypeXLS FileType = "xls"
FileTypeXLSX FileType = "xlsx"
FileTypeVISUAL FileType = "visual"
FileTypeAURAL FileType = "aural"
FileTypeFOLDER FileType = "folder"
FileTypeOTHER FileType = "other"
FileTypePDF FileType = "pdf"
FileTypeDOC FileType = "doc"
FileTypeDOCX FileType = "docx"
FileTypePPT FileType = "ppt"
FileTypePPTX FileType = "pptx"
FileTypeXLS FileType = "xls"
FileTypeXLSX FileType = "xlsx"
FileTypeMarkdown FileType = "md"
FileTypeVISUAL FileType = "visual"
FileTypeAURAL FileType = "aural"
FileTypeFOLDER FileType = "folder"
FileTypeOTHER FileType = "other"
)
var (
@@ -82,6 +83,8 @@ func GetFileType(filename string) FileType {
return FileTypePPT
case "pptx":
return FileTypePPTX
case "md":
return FileTypeMarkdown
default:
return FileTypeOTHER
}