mirror of
https://github.com/infiniflow/ragflow.git
synced 2026-07-03 01:01:56 +08:00
109 lines
2.3 KiB
Go
109 lines
2.3 KiB
Go
//go:build cgo
|
|
|
|
package docx
|
|
|
|
import (
|
|
"encoding/base64"
|
|
"encoding/json"
|
|
"fmt"
|
|
"strconv"
|
|
"strings"
|
|
|
|
officeOxide "github.com/yfedoseev/office_oxide/go"
|
|
)
|
|
|
|
// ExtractRawBlocks opens a DOCX via office_oxide and extracts blocks in
|
|
// document order, matching the format produced by python-docx's
|
|
// _element.body iteration.
|
|
func ExtractRawBlocks(data []byte) ([]RawBlock, error) {
|
|
doc, err := officeOxide.OpenFromBytes(data, "docx")
|
|
if err != nil {
|
|
return nil, fmt.Errorf("office_oxide open: %w", err)
|
|
}
|
|
defer doc.Close()
|
|
|
|
irJSON, err := doc.ToIRJSON()
|
|
if err != nil {
|
|
return nil, fmt.Errorf("ToIRJSON: %w", err)
|
|
}
|
|
|
|
var ir irDocument
|
|
if err := json.Unmarshal([]byte(irJSON), &ir); err != nil {
|
|
return nil, fmt.Errorf("parse IR JSON: %w", err)
|
|
}
|
|
|
|
var blocks []RawBlock
|
|
for _, sec := range ir.Sections {
|
|
for _, el := range sec.Elements {
|
|
block := irElementToBlock(el)
|
|
blocks = append(blocks, block)
|
|
}
|
|
}
|
|
return blocks, nil
|
|
}
|
|
|
|
func irElementToBlock(el irElement) RawBlock {
|
|
switch el.Type {
|
|
case "table":
|
|
rows := make([][]string, len(el.Rows))
|
|
for ri, row := range el.Rows {
|
|
cells := make([]string, len(row.Cells))
|
|
for ci, cell := range row.Cells {
|
|
cells[ci] = joinElements(cell.Content)
|
|
}
|
|
rows[ri] = cells
|
|
}
|
|
return RawBlock{Type: "table", Rows: rows}
|
|
|
|
case "heading":
|
|
text := joinRuns(el.Content)
|
|
level := strconv.Itoa(el.Level)
|
|
return RawBlock{
|
|
Type: "paragraph",
|
|
Text: text,
|
|
Style: "Heading " + level,
|
|
}
|
|
|
|
case "image":
|
|
return RawBlock{
|
|
Type: "image",
|
|
Image: base64.StdEncoding.EncodeToString(el.Data),
|
|
}
|
|
|
|
default: // "paragraph" and anything else
|
|
style := el.Style
|
|
if style == "" {
|
|
style = "Normal"
|
|
}
|
|
return RawBlock{
|
|
Type: "paragraph",
|
|
Text: joinRuns(el.Content),
|
|
Style: style,
|
|
}
|
|
}
|
|
}
|
|
|
|
func joinRuns(runs []irRun) string {
|
|
var b strings.Builder
|
|
for _, r := range runs {
|
|
if r.Type == "text" {
|
|
b.WriteString(r.Text)
|
|
}
|
|
}
|
|
return b.String()
|
|
}
|
|
|
|
// joinElements extracts plain text from nested irElements (used for table cells).
|
|
// When multiple elements are present, a newline is inserted between each one
|
|
// to match python-docx _Cell.text behavior.
|
|
func joinElements(els []irElement) string {
|
|
var b strings.Builder
|
|
for i, el := range els {
|
|
if i > 0 {
|
|
b.WriteByte('\n')
|
|
}
|
|
b.WriteString(joinRuns(el.Content))
|
|
}
|
|
return b.String()
|
|
}
|