Go: file parser config (#15989)

### What problem does this PR solve?

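Add a parser config: `parser.GetParser` now takes a config map whose `lib_type` entry selects the parsing library (currently only `office_oxide`) used by the DOC, DOCX, PPT, PPTX, XLS and XLSX parsers.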

### Type of change

- [x] New Feature (non-breaking change which adds functionality)

Signed-off-by: Jin Hai <haijin.chn@gmail.com>
This commit is contained in:
Jin Hai
2026-06-13 19:40:43 +08:00
committed by GitHub
parent f671e7cb34
commit e89afbae21
9 changed files with 137 additions and 21 deletions

View File

@@ -3252,8 +3252,10 @@ func (c *CLI) UserParseLocalFile(cmd *Command) (ResponseIf, error) {
}
fileType := utility.GetFileType(filename)
fileParser, err := parser.GetParser(fileType)
config := map[string]string{
"lib_type": "office_oxide",
}
fileParser, err := parser.GetParser(fileType, config)
if err != nil {
return nil, err
}

View File

@@ -23,14 +23,31 @@ import (
)
// DOCParser parses DOC files using the library recorded in libType.
type DOCParser struct {
// libType names the parsing library that Parse dispatches on; OfficeOxide
// is the only value the constructor and Parse accept.
libType string
}
func NewDOCParser() *DOCParser {
return &DOCParser{}
// NewDOCParser builds a DOCParser backed by the library named by libType.
//
// OfficeOxide is the only accepted value; any other libType yields a nil
// parser together with an "unsupported DOC library type" error.
func NewDOCParser(libType string) (*DOCParser, error) {
	// Reject unknown libraries up front so the happy path stays unindented.
	if libType != OfficeOxide {
		return nil, fmt.Errorf("unsupported DOC library type: %s", libType)
	}
	return &DOCParser{libType: OfficeOxide}, nil
}
// Parse prints a progress line naming filename to standard output and then
// hands data to the parsing routine selected by p.libType.
//
// It returns the result of OfficeOxideParse when the library type is
// OfficeOxide, and an "unsupported DOC library type" error otherwise.
func (p *DOCParser) Parse(filename string, data []byte) error {
	fmt.Printf("Parsing DOC file: %s\n", filename)
	if p.libType == OfficeOxide {
		return p.OfficeOxideParse(data)
	}
	return fmt.Errorf("unsupported DOC library type: %s", p.libType)
}
func (p *DOCParser) OfficeOxideParse(data []byte) error {
doc, err := officeOxide.OpenFromBytes(data, "doc")
if err != nil {
return err

View File

@@ -22,16 +22,38 @@ import (
officeOxide "github.com/yfedoseev/office_oxide/go"
)
const (
// OfficeOxide is the library-type identifier "office_oxide"; the parser
// constructors compare their libType argument against it.
OfficeOxide string = "office_oxide"
)
// DOCXParser parses DOCX files using the library recorded in libType.
type DOCXParser struct {
// libType names the parsing library that Parse dispatches on; OfficeOxide
// is the only value the constructor and Parse accept.
libType string
}
func NewDOCXParser() *DOCXParser {
return &DOCXParser{}
// NewDOCXParser builds a DOCXParser backed by the library named by libType.
//
// OfficeOxide is the only accepted value; any other libType yields a nil
// parser together with an "unsupported DOCX library type" error.
func NewDOCXParser(libType string) (*DOCXParser, error) {
	// Reject unknown libraries up front so the happy path stays unindented.
	if libType != OfficeOxide {
		return nil, fmt.Errorf("unsupported DOCX library type: %s", libType)
	}
	return &DOCXParser{libType: OfficeOxide}, nil
}
// Parse prints a progress line naming filename to standard output and then
// hands data to the parsing routine selected by p.libType.
//
// It returns the result of OfficeOxideParse when the library type is
// OfficeOxide, and an "unsupported DOCX library type" error otherwise.
func (p *DOCXParser) Parse(filename string, data []byte) error {
	fmt.Printf("Parsing DOCX file: %s\n", filename)
	if p.libType == OfficeOxide {
		return p.OfficeOxideParse(data)
	}
	return fmt.Errorf("unsupported DOCX library type: %s", p.libType)
}
func (p *DOCXParser) OfficeOxideParse(data []byte) error {
doc, err := officeOxide.OpenFromBytes(data, "docx")
if err != nil {
return err

View File

@@ -19,6 +19,9 @@ package parser
import "fmt"
// PDFParser holds PDF parsing settings: the parser backend, the model and the
// underlying library.
// NOTE(review): none of these fields is read in the visible code — confirm
// how they are consumed.
type PDFParser struct {
ParserType string // parser backend name, e.g. DeepDoc, PaddleOCR, MinerU
Model string // model identifier, e.g. DeepDoc@buildin@ragflow
LibType string // parsing library, e.g. pdf_oxide; used by DeepDoc
}
func NewPDFParser() *PDFParser {

View File

@@ -23,14 +23,31 @@ import (
)
// PPTParser parses PPT files using the library recorded in libType.
type PPTParser struct {
// libType names the parsing library that Parse dispatches on; OfficeOxide
// is the only value the constructor and Parse accept.
libType string
}
func NewPPTParser() *PPTParser {
return &PPTParser{}
// NewPPTParser builds a PPTParser backed by the library named by libType.
//
// OfficeOxide is the only accepted value; any other libType yields a nil
// parser together with an "unsupported PPT library type" error.
func NewPPTParser(libType string) (*PPTParser, error) {
	// Reject unknown libraries up front so the happy path stays unindented.
	if libType != OfficeOxide {
		return nil, fmt.Errorf("unsupported PPT library type: %s", libType)
	}
	return &PPTParser{libType: OfficeOxide}, nil
}
// Parse prints a progress line naming filename to standard output and then
// hands data to the parsing routine selected by p.libType.
//
// It returns the result of OfficeOxideParse when the library type is
// OfficeOxide, and an "unsupported PPT library type" error otherwise.
func (p *PPTParser) Parse(filename string, data []byte) error {
	fmt.Printf("Parsing PPT file: %s\n", filename)
	if p.libType == OfficeOxide {
		return p.OfficeOxideParse(data)
	}
	return fmt.Errorf("unsupported PPT library type: %s", p.libType)
}
func (p *PPTParser) OfficeOxideParse(data []byte) error {
doc, err := officeOxide.OpenFromBytes(data, "ppt")
if err != nil {
return err

View File

@@ -23,14 +23,31 @@ import (
)
// PPTXParser parses PPTX files using the library recorded in libType.
type PPTXParser struct {
// libType names the parsing library that Parse dispatches on; OfficeOxide
// is the only value the constructor and Parse accept.
libType string
}
func NewPPTXParser() *PPTXParser {
return &PPTXParser{}
// NewPPTXParser builds a PPTXParser backed by the library named by libType.
//
// OfficeOxide is the only accepted value; any other libType yields a nil
// parser together with an "unsupported PPTX library type" error.
func NewPPTXParser(libType string) (*PPTXParser, error) {
	// Reject unknown libraries up front so the happy path stays unindented.
	if libType != OfficeOxide {
		return nil, fmt.Errorf("unsupported PPTX library type: %s", libType)
	}
	return &PPTXParser{libType: OfficeOxide}, nil
}
// Parse prints a progress line naming filename to standard output and then
// hands data to the parsing routine selected by p.libType.
//
// It returns the result of OfficeOxideParse when the library type is
// OfficeOxide, and an "unsupported PPTX library type" error otherwise.
func (p *PPTXParser) Parse(filename string, data []byte) error {
	fmt.Printf("Parsing PPTX file: %s\n", filename)
	if p.libType == OfficeOxide {
		return p.OfficeOxideParse(data)
	}
	return fmt.Errorf("unsupported PPTX library type: %s", p.libType)
}
func (p *PPTXParser) OfficeOxideParse(data []byte) error {
doc, err := officeOxide.OpenFromBytes(data, "pptx")
if err != nil {
return err

View File

@@ -21,20 +21,24 @@ import (
"ragflow/internal/utility"
)
func GetParser(fileType utility.FileType) (FileParser, error) {
func GetParser(fileType utility.FileType, config map[string]string) (FileParser, error) {
libType, ok := config["lib_type"]
if !ok {
return nil, fmt.Errorf("missing lib_type config")
}
switch fileType {
case utility.FileTypePPTX:
return NewPPTXParser(), nil
return NewPPTXParser(libType)
case utility.FileTypePPT:
return NewPPTParser(), nil
return NewPPTParser(libType)
case utility.FileTypeXLSX:
return NewXLSXParser(), nil
return NewXLSXParser(libType)
case utility.FileTypeXLS:
return NewXLSParser(), nil
return NewXLSParser(libType)
case utility.FileTypeDOCX:
return NewDOCXParser(), nil
return NewDOCXParser(libType)
case utility.FileTypeDOC:
return NewDOCParser(), nil
return NewDOCParser(libType)
case utility.FileTypePDF:
return NewPDFParser(), nil
default:

View File

@@ -23,14 +23,31 @@ import (
)
// XLSParser parses XLS files using the library recorded in libType.
type XLSParser struct {
// libType names the parsing library that Parse dispatches on; OfficeOxide
// is the only value the constructor and Parse accept.
libType string
}
func NewXLSParser() *XLSParser {
return &XLSParser{}
// NewXLSParser builds an XLSParser backed by the library named by libType.
//
// OfficeOxide is the only accepted value; any other libType yields a nil
// parser together with an "unsupported XLS library type" error.
func NewXLSParser(libType string) (*XLSParser, error) {
	// Reject unknown libraries up front so the happy path stays unindented.
	if libType != OfficeOxide {
		return nil, fmt.Errorf("unsupported XLS library type: %s", libType)
	}
	return &XLSParser{libType: OfficeOxide}, nil
}
// Parse prints a progress line naming filename to standard output and then
// hands data to the parsing routine selected by p.libType.
//
// It returns the result of OfficeOxideParse when the library type is
// OfficeOxide, and an "unsupported XLS library type" error otherwise.
func (p *XLSParser) Parse(filename string, data []byte) error {
	fmt.Printf("Parsing XLS file: %s\n", filename)
	if p.libType == OfficeOxide {
		return p.OfficeOxideParse(data)
	}
	return fmt.Errorf("unsupported XLS library type: %s", p.libType)
}
func (p *XLSParser) OfficeOxideParse(data []byte) error {
doc, err := officeOxide.OpenFromBytes(data, "xls")
if err != nil {
return err

View File

@@ -23,14 +23,31 @@ import (
)
// XLSXParser parses XLSX files using the library recorded in libType.
type XLSXParser struct {
// libType names the parsing library that Parse dispatches on; OfficeOxide
// is the only value the constructor and Parse accept.
libType string
}
func NewXLSXParser() *XLSXParser {
return &XLSXParser{}
// NewXLSXParser builds an XLSXParser backed by the library named by libType.
//
// OfficeOxide is the only accepted value; any other libType yields a nil
// parser together with an "unsupported XLSX library type" error.
func NewXLSXParser(libType string) (*XLSXParser, error) {
	// Reject unknown libraries up front so the happy path stays unindented.
	if libType != OfficeOxide {
		return nil, fmt.Errorf("unsupported XLSX library type: %s", libType)
	}
	return &XLSXParser{libType: OfficeOxide}, nil
}
// Parse prints a progress line naming filename to standard output and then
// hands data to the parsing routine selected by p.libType.
//
// It returns the result of OfficeOxideParse when the library type is
// OfficeOxide, and an "unsupported XLSX library type" error otherwise.
func (p *XLSXParser) Parse(filename string, data []byte) error {
	fmt.Printf("Parsing XLSX file: %s\n", filename)
	if p.libType == OfficeOxide {
		return p.OfficeOxideParse(data)
	}
	return fmt.Errorf("unsupported XLSX library type: %s", p.libType)
}
func (p *XLSXParser) OfficeOxideParse(data []byte) error {
doc, err := officeOxide.OpenFromBytes(data, "xlsx")
if err != nil {
return err