mirror of
https://github.com/infiniflow/ragflow.git
synced 2026-06-29 23:41:12 +08:00
Go: file parser config (#15989)
### What problem does this PR solve?

Add a parser config so the parsing library (`lib_type`) can be passed to the file parsers.

### Type of change

- [x] New Feature (non-breaking change which adds functionality)

Signed-off-by: Jin Hai <haijin.chn@gmail.com>
This commit is contained in:
@@ -3252,8 +3252,10 @@ func (c *CLI) UserParseLocalFile(cmd *Command) (ResponseIf, error) {
|
||||
}
|
||||
|
||||
fileType := utility.GetFileType(filename)
|
||||
|
||||
fileParser, err := parser.GetParser(fileType)
|
||||
config := map[string]string{
|
||||
"lib_type": "office_oxide",
|
||||
}
|
||||
fileParser, err := parser.GetParser(fileType, config)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
@@ -23,14 +23,31 @@ import (
|
||||
)
|
||||
|
||||
// DOCParser parses DOC files using the library named by libType.
type DOCParser struct {
	// libType identifies the parsing library this parser dispatches to.
	libType string
}
|
||||
|
||||
func NewDOCParser() *DOCParser {
|
||||
return &DOCParser{}
|
||||
func NewDOCParser(libType string) (*DOCParser, error) {
|
||||
switch libType {
|
||||
case OfficeOxide:
|
||||
return &DOCParser{
|
||||
libType: OfficeOxide,
|
||||
}, nil
|
||||
default:
|
||||
return nil, fmt.Errorf("unsupported DOC library type: %s", libType)
|
||||
}
|
||||
}
|
||||
|
||||
func (p *DOCParser) Parse(filename string, data []byte) error {
|
||||
fmt.Printf("Parsing DOC file: %s\n", filename)
|
||||
switch p.libType {
|
||||
case OfficeOxide:
|
||||
return p.OfficeOxideParse(data)
|
||||
default:
|
||||
return fmt.Errorf("unsupported DOC library type: %s", p.libType)
|
||||
}
|
||||
}
|
||||
|
||||
func (p *DOCParser) OfficeOxideParse(data []byte) error {
|
||||
doc, err := officeOxide.OpenFromBytes(data, "doc")
|
||||
if err != nil {
|
||||
return err
|
||||
|
||||
@@ -22,16 +22,38 @@ import (
|
||||
officeOxide "github.com/yfedoseev/office_oxide/go"
|
||||
)
|
||||
|
||||
// OfficeOxide is the lib_type value that selects the office_oxide library.
const OfficeOxide string = "office_oxide"
|
||||
|
||||
// DOCXParser parses DOCX files using the library named by libType.
type DOCXParser struct {
	// libType identifies the parsing library this parser dispatches to.
	libType string
}
|
||||
|
||||
func NewDOCXParser() *DOCXParser {
|
||||
return &DOCXParser{}
|
||||
func NewDOCXParser(libType string) (*DOCXParser, error) {
|
||||
switch libType {
|
||||
case OfficeOxide:
|
||||
return &DOCXParser{
|
||||
libType: OfficeOxide,
|
||||
}, nil
|
||||
default:
|
||||
return nil, fmt.Errorf("unsupported DOCX library type: %s", libType)
|
||||
}
|
||||
}
|
||||
|
||||
func (p *DOCXParser) Parse(filename string, data []byte) error {
|
||||
|
||||
fmt.Printf("Parsing DOCX file: %s\n", filename)
|
||||
switch p.libType {
|
||||
case OfficeOxide:
|
||||
return p.OfficeOxideParse(data)
|
||||
default:
|
||||
return fmt.Errorf("unsupported DOCX library type: %s", p.libType)
|
||||
}
|
||||
}
|
||||
|
||||
func (p *DOCXParser) OfficeOxideParse(data []byte) error {
|
||||
|
||||
doc, err := officeOxide.OpenFromBytes(data, "docx")
|
||||
if err != nil {
|
||||
return err
|
||||
|
||||
@@ -19,6 +19,9 @@ package parser
|
||||
import "fmt"
|
||||
|
||||
// PDFParser holds the configuration for parsing PDF files.
type PDFParser struct {
	// ParserType names the parser, e.g. DeepDoc, PaddleOCR, MinerU.
	ParserType string
	// Model names the model, e.g. DeepDoc@buildin@ragflow.
	Model string
	// LibType names the library used by DeepDoc, e.g. pdf_oxide.
	LibType string
}
|
||||
|
||||
func NewPDFParser() *PDFParser {
|
||||
|
||||
@@ -23,14 +23,31 @@ import (
|
||||
)
|
||||
|
||||
// PPTParser parses PPT files using the library named by libType.
type PPTParser struct {
	// libType identifies the parsing library this parser dispatches to.
	libType string
}
|
||||
|
||||
func NewPPTParser() *PPTParser {
|
||||
return &PPTParser{}
|
||||
func NewPPTParser(libType string) (*PPTParser, error) {
|
||||
switch libType {
|
||||
case OfficeOxide:
|
||||
return &PPTParser{
|
||||
libType: OfficeOxide,
|
||||
}, nil
|
||||
default:
|
||||
return nil, fmt.Errorf("unsupported PPT library type: %s", libType)
|
||||
}
|
||||
}
|
||||
|
||||
func (p *PPTParser) Parse(filename string, data []byte) error {
|
||||
fmt.Printf("Parsing PPT file: %s\n", filename)
|
||||
switch p.libType {
|
||||
case OfficeOxide:
|
||||
return p.OfficeOxideParse(data)
|
||||
default:
|
||||
return fmt.Errorf("unsupported PPT library type: %s", p.libType)
|
||||
}
|
||||
}
|
||||
|
||||
func (p *PPTParser) OfficeOxideParse(data []byte) error {
|
||||
doc, err := officeOxide.OpenFromBytes(data, "ppt")
|
||||
if err != nil {
|
||||
return err
|
||||
|
||||
@@ -23,14 +23,31 @@ import (
|
||||
)
|
||||
|
||||
// PPTXParser parses PPTX files using the library named by libType.
type PPTXParser struct {
	// libType identifies the parsing library this parser dispatches to.
	libType string
}
|
||||
|
||||
func NewPPTXParser() *PPTXParser {
|
||||
return &PPTXParser{}
|
||||
func NewPPTXParser(libType string) (*PPTXParser, error) {
|
||||
switch libType {
|
||||
case OfficeOxide:
|
||||
return &PPTXParser{
|
||||
libType: OfficeOxide,
|
||||
}, nil
|
||||
default:
|
||||
return nil, fmt.Errorf("unsupported PPTX library type: %s", libType)
|
||||
}
|
||||
}
|
||||
|
||||
func (p *PPTXParser) Parse(filename string, data []byte) error {
|
||||
fmt.Printf("Parsing PPTX file: %s\n", filename)
|
||||
switch p.libType {
|
||||
case OfficeOxide:
|
||||
return p.OfficeOxideParse(data)
|
||||
default:
|
||||
return fmt.Errorf("unsupported PPTX library type: %s", p.libType)
|
||||
}
|
||||
}
|
||||
|
||||
func (p *PPTXParser) OfficeOxideParse(data []byte) error {
|
||||
doc, err := officeOxide.OpenFromBytes(data, "pptx")
|
||||
if err != nil {
|
||||
return err
|
||||
|
||||
@@ -21,20 +21,24 @@ import (
|
||||
"ragflow/internal/utility"
|
||||
)
|
||||
|
||||
func GetParser(fileType utility.FileType) (FileParser, error) {
|
||||
func GetParser(fileType utility.FileType, config map[string]string) (FileParser, error) {
|
||||
libType, ok := config["lib_type"]
|
||||
if !ok {
|
||||
return nil, fmt.Errorf("missing lib_type config")
|
||||
}
|
||||
switch fileType {
|
||||
case utility.FileTypePPTX:
|
||||
return NewPPTXParser(), nil
|
||||
return NewPPTXParser(libType)
|
||||
case utility.FileTypePPT:
|
||||
return NewPPTParser(), nil
|
||||
return NewPPTParser(libType)
|
||||
case utility.FileTypeXLSX:
|
||||
return NewXLSXParser(), nil
|
||||
return NewXLSXParser(libType)
|
||||
case utility.FileTypeXLS:
|
||||
return NewXLSParser(), nil
|
||||
return NewXLSParser(libType)
|
||||
case utility.FileTypeDOCX:
|
||||
return NewDOCXParser(), nil
|
||||
return NewDOCXParser(libType)
|
||||
case utility.FileTypeDOC:
|
||||
return NewDOCParser(), nil
|
||||
return NewDOCParser(libType)
|
||||
case utility.FileTypePDF:
|
||||
return NewPDFParser(), nil
|
||||
default:
|
||||
|
||||
@@ -23,14 +23,31 @@ import (
|
||||
)
|
||||
|
||||
// XLSParser parses XLS files using the library named by libType.
type XLSParser struct {
	// libType identifies the parsing library this parser dispatches to.
	libType string
}
|
||||
|
||||
func NewXLSParser() *XLSParser {
|
||||
return &XLSParser{}
|
||||
func NewXLSParser(libType string) (*XLSParser, error) {
|
||||
switch libType {
|
||||
case OfficeOxide:
|
||||
return &XLSParser{
|
||||
libType: OfficeOxide,
|
||||
}, nil
|
||||
default:
|
||||
return nil, fmt.Errorf("unsupported XLS library type: %s", libType)
|
||||
}
|
||||
}
|
||||
|
||||
func (p *XLSParser) Parse(filename string, data []byte) error {
|
||||
fmt.Printf("Parsing XLS file: %s\n", filename)
|
||||
switch p.libType {
|
||||
case OfficeOxide:
|
||||
return p.OfficeOxideParse(data)
|
||||
default:
|
||||
return fmt.Errorf("unsupported XLS library type: %s", p.libType)
|
||||
}
|
||||
}
|
||||
|
||||
func (p *XLSParser) OfficeOxideParse(data []byte) error {
|
||||
doc, err := officeOxide.OpenFromBytes(data, "xls")
|
||||
if err != nil {
|
||||
return err
|
||||
|
||||
@@ -23,14 +23,31 @@ import (
|
||||
)
|
||||
|
||||
// XLSXParser parses XLSX files using the library named by libType.
type XLSXParser struct {
	// libType identifies the parsing library this parser dispatches to.
	libType string
}
|
||||
|
||||
func NewXLSXParser() *XLSXParser {
|
||||
return &XLSXParser{}
|
||||
func NewXLSXParser(libType string) (*XLSXParser, error) {
|
||||
switch libType {
|
||||
case OfficeOxide:
|
||||
return &XLSXParser{
|
||||
libType: OfficeOxide,
|
||||
}, nil
|
||||
default:
|
||||
return nil, fmt.Errorf("unsupported XLSX library type: %s", libType)
|
||||
}
|
||||
}
|
||||
|
||||
func (p *XLSXParser) Parse(filename string, data []byte) error {
|
||||
fmt.Printf("Parsing XLSX file: %s\n", filename)
|
||||
switch p.libType {
|
||||
case OfficeOxide:
|
||||
return p.OfficeOxideParse(data)
|
||||
default:
|
||||
return fmt.Errorf("unsupported XLSX library type: %s", p.libType)
|
||||
}
|
||||
}
|
||||
|
||||
func (p *XLSXParser) OfficeOxideParse(data []byte) error {
|
||||
doc, err := officeOxide.OpenFromBytes(data, "xlsx")
|
||||
if err != nil {
|
||||
return err
|
||||
|
||||
Reference in New Issue
Block a user