From e89afbae21182b70f69ee7dedebdb461f8c123de Mon Sep 17 00:00:00 2001 From: Jin Hai Date: Sat, 13 Jun 2026 19:40:43 +0800 Subject: [PATCH] Go: file parser config (#15989) ### What problem does this PR solve? Add parser config ### Type of change - [x] New Feature (non-breaking change which adds functionality) Signed-off-by: Jin Hai --- internal/cli/user_command.go | 6 ++++-- internal/ingestion/parser/doc_parser.go | 21 +++++++++++++++++-- internal/ingestion/parser/docx_parser.go | 26 ++++++++++++++++++++++-- internal/ingestion/parser/pdf_parser.go | 3 +++ internal/ingestion/parser/ppt_parser.go | 21 +++++++++++++++++-- internal/ingestion/parser/pptx_parser.go | 21 +++++++++++++++++-- internal/ingestion/parser/type.go | 18 +++++++++------- internal/ingestion/parser/xls_parser.go | 21 +++++++++++++++++-- internal/ingestion/parser/xlsx_parser.go | 21 +++++++++++++++++-- 9 files changed, 137 insertions(+), 21 deletions(-) diff --git a/internal/cli/user_command.go b/internal/cli/user_command.go index c96357e238..9f58f7eca2 100644 --- a/internal/cli/user_command.go +++ b/internal/cli/user_command.go @@ -3252,8 +3252,10 @@ func (c *CLI) UserParseLocalFile(cmd *Command) (ResponseIf, error) { } fileType := utility.GetFileType(filename) - - fileParser, err := parser.GetParser(fileType) + config := map[string]string{ + "lib_type": "office_oxide", + } + fileParser, err := parser.GetParser(fileType, config) if err != nil { return nil, err } diff --git a/internal/ingestion/parser/doc_parser.go b/internal/ingestion/parser/doc_parser.go index 5a8d561b9a..3b8539af35 100644 --- a/internal/ingestion/parser/doc_parser.go +++ b/internal/ingestion/parser/doc_parser.go @@ -23,14 +23,31 @@ import ( ) type DOCParser struct { + libType string } -func NewDOCParser() *DOCParser { - return &DOCParser{} +func NewDOCParser(libType string) (*DOCParser, error) { + switch libType { + case OfficeOxide: + return &DOCParser{ + libType: OfficeOxide, + }, nil + default: + return nil, fmt.Errorf("unsupported DOC library type: %s", libType) + } } func (p *DOCParser) Parse(filename string, data []byte) error { fmt.Printf("Parsing DOC file: %s\n", filename) + switch p.libType { + case OfficeOxide: + return p.OfficeOxideParse(data) + default: + return fmt.Errorf("unsupported DOC library type: %s", p.libType) + } +} + +func (p *DOCParser) OfficeOxideParse(data []byte) error { doc, err := officeOxide.OpenFromBytes(data, "doc") if err != nil { return err diff --git a/internal/ingestion/parser/docx_parser.go b/internal/ingestion/parser/docx_parser.go index 22a0b0ae16..da5a9ff52d 100644 --- a/internal/ingestion/parser/docx_parser.go +++ b/internal/ingestion/parser/docx_parser.go @@ -22,16 +22,38 @@ import ( officeOxide "github.com/yfedoseev/office_oxide/go" ) +const ( + OfficeOxide string = "office_oxide" +) + type DOCXParser struct { + libType string } -func NewDOCXParser() *DOCXParser { - return &DOCXParser{} +func NewDOCXParser(libType string) (*DOCXParser, error) { + switch libType { + case OfficeOxide: + return &DOCXParser{ + libType: OfficeOxide, + }, nil + default: + return nil, fmt.Errorf("unsupported DOCX library type: %s", libType) + } } func (p *DOCXParser) Parse(filename string, data []byte) error { fmt.Printf("Parsing DOCX file: %s\n", filename) + switch p.libType { + case OfficeOxide: + return p.OfficeOxideParse(data) + default: + return fmt.Errorf("unsupported DOCX library type: %s", p.libType) + } +} + +func (p *DOCXParser) OfficeOxideParse(data []byte) error { + doc, err := officeOxide.OpenFromBytes(data, "docx") if err != nil { return err diff --git a/internal/ingestion/parser/pdf_parser.go b/internal/ingestion/parser/pdf_parser.go index 3061d6a2e5..32bb44f482 100644 --- a/internal/ingestion/parser/pdf_parser.go +++ b/internal/ingestion/parser/pdf_parser.go @@ -19,6 +19,9 @@ package parser import "fmt" type PDFParser struct { + ParserType string // DeepDoc, PaddleOCR, MinerU + Model string // DeepDoc@buildin@ragflow + LibType string // pdf_oxide, used by DeepDoc } func NewPDFParser() *PDFParser { diff --git a/internal/ingestion/parser/ppt_parser.go b/internal/ingestion/parser/ppt_parser.go index 6bed34862a..5e3165b394 100644 --- a/internal/ingestion/parser/ppt_parser.go +++ b/internal/ingestion/parser/ppt_parser.go @@ -23,14 +23,31 @@ import ( ) type PPTParser struct { + libType string } -func NewPPTParser() *PPTParser { - return &PPTParser{} +func NewPPTParser(libType string) (*PPTParser, error) { + switch libType { + case OfficeOxide: + return &PPTParser{ + libType: OfficeOxide, + }, nil + default: + return nil, fmt.Errorf("unsupported PPT library type: %s", libType) + } } func (p *PPTParser) Parse(filename string, data []byte) error { fmt.Printf("Parsing PPT file: %s\n", filename) + switch p.libType { + case OfficeOxide: + return p.OfficeOxideParse(data) + default: + return fmt.Errorf("unsupported PPT library type: %s", p.libType) + } +} + +func (p *PPTParser) OfficeOxideParse(data []byte) error { doc, err := officeOxide.OpenFromBytes(data, "ppt") if err != nil { return err diff --git a/internal/ingestion/parser/pptx_parser.go b/internal/ingestion/parser/pptx_parser.go index 2ae3aa6ede..2b325139c6 100644 --- a/internal/ingestion/parser/pptx_parser.go +++ b/internal/ingestion/parser/pptx_parser.go @@ -23,14 +23,31 @@ import ( ) type PPTXParser struct { + libType string } -func NewPPTXParser() *PPTXParser { - return &PPTXParser{} +func NewPPTXParser(libType string) (*PPTXParser, error) { + switch libType { + case OfficeOxide: + return &PPTXParser{ + libType: OfficeOxide, + }, nil + default: + return nil, fmt.Errorf("unsupported PPTX library type: %s", libType) + } } func (p *PPTXParser) Parse(filename string, data []byte) error { fmt.Printf("Parsing PPTX file: %s\n", filename) + switch p.libType { + case OfficeOxide: + return p.OfficeOxideParse(data) + default: + return fmt.Errorf("unsupported PPTX library type: %s", p.libType) + } +} + +func (p *PPTXParser) OfficeOxideParse(data []byte) error { doc, err := officeOxide.OpenFromBytes(data, "pptx") if err != nil { return err diff --git a/internal/ingestion/parser/type.go b/internal/ingestion/parser/type.go index 9770e44bf8..da40976f79 100644 --- a/internal/ingestion/parser/type.go +++ b/internal/ingestion/parser/type.go @@ -21,20 +21,24 @@ import ( "ragflow/internal/utility" ) -func GetParser(fileType utility.FileType) (FileParser, error) { +func GetParser(fileType utility.FileType, config map[string]string) (FileParser, error) { + libType, ok := config["lib_type"] + if !ok { + return nil, fmt.Errorf("missing lib_type config") + } switch fileType { case utility.FileTypePPTX: - return NewPPTXParser(), nil + return NewPPTXParser(libType) case utility.FileTypePPT: - return NewPPTParser(), nil + return NewPPTParser(libType) case utility.FileTypeXLSX: - return NewXLSXParser(), nil + return NewXLSXParser(libType) case utility.FileTypeXLS: - return NewXLSParser(), nil + return NewXLSParser(libType) case utility.FileTypeDOCX: - return NewDOCXParser(), nil + return NewDOCXParser(libType) case utility.FileTypeDOC: - return NewDOCParser(), nil + return NewDOCParser(libType) case utility.FileTypePDF: return NewPDFParser(), nil default: diff --git a/internal/ingestion/parser/xls_parser.go b/internal/ingestion/parser/xls_parser.go index 7cf46b9447..21df47a61f 100644 --- a/internal/ingestion/parser/xls_parser.go +++ b/internal/ingestion/parser/xls_parser.go @@ -23,14 +23,31 @@ import ( ) type XLSParser struct { + libType string } -func NewXLSParser() *XLSParser { - return &XLSParser{} +func NewXLSParser(libType string) (*XLSParser, error) { + switch libType { + case OfficeOxide: + return &XLSParser{ + libType: OfficeOxide, + }, nil + default: + return nil, fmt.Errorf("unsupported XLS library type: %s", libType) + } } func (p *XLSParser) Parse(filename string, data []byte) error { fmt.Printf("Parsing XLS file: %s\n", filename) + switch p.libType { + case OfficeOxide: + return p.OfficeOxideParse(data) + default: + return fmt.Errorf("unsupported XLS library type: %s", p.libType) + } +} + +func (p *XLSParser) OfficeOxideParse(data []byte) error { doc, err := officeOxide.OpenFromBytes(data, "xls") if err != nil { return err diff --git a/internal/ingestion/parser/xlsx_parser.go b/internal/ingestion/parser/xlsx_parser.go index 7d822d42c3..aac3ad2374 100644 --- a/internal/ingestion/parser/xlsx_parser.go +++ b/internal/ingestion/parser/xlsx_parser.go @@ -23,14 +23,31 @@ import ( ) type XLSXParser struct { + libType string } -func NewXLSXParser() *XLSXParser { - return &XLSXParser{} +func NewXLSXParser(libType string) (*XLSXParser, error) { + switch libType { + case OfficeOxide: + return &XLSXParser{ + libType: OfficeOxide, + }, nil + default: + return nil, fmt.Errorf("unsupported XLSX library type: %s", libType) + } } func (p *XLSXParser) Parse(filename string, data []byte) error { fmt.Printf("Parsing XLSX file: %s\n", filename) + switch p.libType { + case OfficeOxide: + return p.OfficeOxideParse(data) + default: + return fmt.Errorf("unsupported XLSX library type: %s", p.libType) + } +} + +func (p *XLSXParser) OfficeOxideParse(data []byte) error { doc, err := officeOxide.OpenFromBytes(data, "xlsx") if err != nil { return err