mirror of
https://github.com/infiniflow/ragflow.git
synced 2026-07-03 01:01:56 +08:00
61 lines
1.5 KiB
Go
61 lines
1.5 KiB
Go
package docx
|
|
|
|
import (
|
|
"strings"
|
|
|
|
"ragflow/internal/deepdoc/parser/pdf/table"
|
|
doctype "ragflow/internal/deepdoc/parser/type"
|
|
)
|
|
|
|
// blocksToSections converts raw DOCX blocks to the shared Section representation
|
|
// consumed by the framework layer. Headings get LayoutType "title", tables get
|
|
// DocTypeKwd "table" with a populated TableItem, and everything else is "text".
|
|
func blocksToSections(blocks []RawBlock) []doctype.Section {
|
|
sections := make([]doctype.Section, 0, len(blocks))
|
|
for _, b := range blocks {
|
|
sec := blockToSection(b)
|
|
sections = append(sections, sec)
|
|
}
|
|
return sections
|
|
}
|
|
|
|
func blockToSection(b RawBlock) doctype.Section {
|
|
switch b.Type {
|
|
case "table":
|
|
return doctype.Section{
|
|
Text: table.SimpleRowsToHTML(b.Rows),
|
|
DocTypeKwd: "table",
|
|
TableItem: &doctype.TableItem{
|
|
Rows: b.Rows,
|
|
},
|
|
}
|
|
case "image":
|
|
return doctype.Section{
|
|
DocTypeKwd: "image",
|
|
Image: b.Image,
|
|
}
|
|
default:
|
|
layoutType := "text"
|
|
if strings.HasPrefix(strings.ToLower(b.Style), "heading") {
|
|
layoutType = "title"
|
|
}
|
|
return doctype.Section{
|
|
Text: b.Text,
|
|
DocTypeKwd: "text",
|
|
LayoutType: layoutType,
|
|
}
|
|
}
|
|
}
|
|
|
|
// Parse converts a DOCX file (given as bytes) into a doctype.ParseResult.
|
|
// It uses office_oxide for raw block extraction, then maps blocks to Sections.
|
|
func Parse(data []byte, cfg doctype.ParserConfig) (*doctype.ParseResult, error) {
|
|
blocks, err := ExtractRawBlocks(data)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
return &doctype.ParseResult{
|
|
Sections: blocksToSections(blocks),
|
|
}, nil
|
|
}
|