mirror of
https://github.com/infiniflow/ragflow.git
synced 2026-07-03 01:01:56 +08:00
60
internal/deepdoc/parser/docx/parser.go
Normal file
60
internal/deepdoc/parser/docx/parser.go
Normal file
@@ -0,0 +1,60 @@
|
||||
package docx
|
||||
|
||||
import (
|
||||
"strings"
|
||||
|
||||
"ragflow/internal/deepdoc/parser/pdf/table"
|
||||
doctype "ragflow/internal/deepdoc/parser/type"
|
||||
)
|
||||
|
||||
// blocksToSections converts raw DOCX blocks to the shared Section representation
|
||||
// consumed by the framework layer. Headings get LayoutType "title", tables get
|
||||
// DocTypeKwd "table" with a populated TableItem, and everything else is "text".
|
||||
func blocksToSections(blocks []RawBlock) []doctype.Section {
|
||||
sections := make([]doctype.Section, 0, len(blocks))
|
||||
for _, b := range blocks {
|
||||
sec := blockToSection(b)
|
||||
sections = append(sections, sec)
|
||||
}
|
||||
return sections
|
||||
}
|
||||
|
||||
func blockToSection(b RawBlock) doctype.Section {
|
||||
switch b.Type {
|
||||
case "table":
|
||||
return doctype.Section{
|
||||
Text: table.SimpleRowsToHTML(b.Rows),
|
||||
DocTypeKwd: "table",
|
||||
TableItem: &doctype.TableItem{
|
||||
Rows: b.Rows,
|
||||
},
|
||||
}
|
||||
case "image":
|
||||
return doctype.Section{
|
||||
DocTypeKwd: "image",
|
||||
Image: b.Image,
|
||||
}
|
||||
default:
|
||||
layoutType := "text"
|
||||
if strings.HasPrefix(strings.ToLower(b.Style), "heading") {
|
||||
layoutType = "title"
|
||||
}
|
||||
return doctype.Section{
|
||||
Text: b.Text,
|
||||
DocTypeKwd: "text",
|
||||
LayoutType: layoutType,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Parse converts a DOCX file (given as bytes) into a doctype.ParseResult.
|
||||
// It uses office_oxide for raw block extraction, then maps blocks to Sections.
|
||||
func Parse(data []byte, cfg doctype.ParserConfig) (*doctype.ParseResult, error) {
|
||||
blocks, err := ExtractRawBlocks(data)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
return &doctype.ParseResult{
|
||||
Sections: blocksToSections(blocks),
|
||||
}, nil
|
||||
}
|
||||
161
internal/deepdoc/parser/docx/parser_integration_test.go
Normal file
161
internal/deepdoc/parser/docx/parser_integration_test.go
Normal file
@@ -0,0 +1,161 @@
|
||||
//go:build cgo && manual
|
||||
|
||||
package docx
|
||||
|
||||
import (
|
||||
"os"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
"testing"
|
||||
|
||||
doctype "ragflow/internal/deepdoc/parser/type"
|
||||
)
|
||||
|
||||
// readFixture loads the named DOCX fixture from the testdata/docxs directory
// (relative to the working directory) and returns its raw bytes.
func readFixture(name string) ([]byte, error) {
	fixturePath := filepath.Join("testdata", "docxs", name)
	return os.ReadFile(fixturePath)
}
|
||||
|
||||
func TestParse_Integration_MultiSection(t *testing.T) {
|
||||
data, err := readFixture("multi_section.docx")
|
||||
if err != nil {
|
||||
t.Skipf("fixture not available: %v", err)
|
||||
}
|
||||
result, err := Parse(data, doctype.DefaultParserConfig())
|
||||
if err != nil {
|
||||
t.Fatalf("Parse: %v", err)
|
||||
}
|
||||
if len(result.Sections) != 7 {
|
||||
t.Errorf("multi_section.docx: want 7 sections, got %d", len(result.Sections))
|
||||
}
|
||||
// Verify headings
|
||||
expected := []string{"Chapter 1", "Section 1.1", "Chapter 2"}
|
||||
titleIdx := 0
|
||||
for _, s := range result.Sections {
|
||||
if s.LayoutType == "title" {
|
||||
if titleIdx < len(expected) && s.Text != expected[titleIdx] {
|
||||
t.Errorf("heading[%d]: got %q, want %q", titleIdx, s.Text, expected[titleIdx])
|
||||
}
|
||||
titleIdx++
|
||||
}
|
||||
}
|
||||
if titleIdx != 3 {
|
||||
t.Errorf("expected 3 headings, found %d", titleIdx)
|
||||
}
|
||||
}
|
||||
|
||||
func TestParse_Integration_WithTable(t *testing.T) {
|
||||
data, err := readFixture("with_table.docx")
|
||||
if err != nil {
|
||||
t.Skipf("fixture not available: %v", err)
|
||||
}
|
||||
result, err := Parse(data, doctype.DefaultParserConfig())
|
||||
if err != nil {
|
||||
t.Fatalf("Parse: %v", err)
|
||||
}
|
||||
if len(result.Sections) != 4 {
|
||||
t.Fatalf("want 4 sections, got %d", len(result.Sections))
|
||||
}
|
||||
if result.Sections[2].DocTypeKwd != "table" {
|
||||
t.Error("expected table section at index 2")
|
||||
}
|
||||
if len(result.Sections[2].TableItem.Rows) != 3 {
|
||||
t.Errorf("expected 3 rows, got %d", len(result.Sections[2].TableItem.Rows))
|
||||
}
|
||||
if result.Sections[2].TableItem.Rows[0][0] != "Product" {
|
||||
t.Errorf("cell[0,0]: got %q", result.Sections[2].TableItem.Rows[0][0])
|
||||
}
|
||||
// Verify HTML table is rendered.
|
||||
if !strings.Contains(result.Sections[2].Text, "<table>") {
|
||||
t.Error("table Section.Text should contain HTML <table>")
|
||||
}
|
||||
if !strings.Contains(result.Sections[2].Text, "<th >Product</th>") {
|
||||
t.Errorf("table HTML missing header: %s", result.Sections[2].Text)
|
||||
}
|
||||
}
|
||||
|
||||
func TestParse_Integration_WithImage(t *testing.T) {
|
||||
data, err := readFixture("with_image.docx")
|
||||
if err != nil {
|
||||
t.Skipf("fixture not available: %v", err)
|
||||
}
|
||||
result, err := Parse(data, doctype.DefaultParserConfig())
|
||||
if err != nil {
|
||||
t.Fatalf("Parse: %v", err)
|
||||
}
|
||||
hasImage := false
|
||||
for _, s := range result.Sections {
|
||||
if s.DocTypeKwd == "image" && s.Image != "" {
|
||||
hasImage = true
|
||||
}
|
||||
}
|
||||
if !hasImage {
|
||||
t.Error("expected at least one image section")
|
||||
}
|
||||
}
|
||||
|
||||
func TestParse_Integration_NestedHeadings(t *testing.T) {
|
||||
data, err := readFixture("nested_headings.docx")
|
||||
if err != nil {
|
||||
t.Skipf("fixture not available: %v", err)
|
||||
}
|
||||
result, err := Parse(data, doctype.DefaultParserConfig())
|
||||
if err != nil {
|
||||
t.Fatalf("Parse: %v", err)
|
||||
}
|
||||
if len(result.Sections) != 5 {
|
||||
t.Fatalf("want 5 sections, got %d", len(result.Sections))
|
||||
}
|
||||
titles := 0
|
||||
for _, s := range result.Sections {
|
||||
if s.LayoutType == "title" {
|
||||
titles++
|
||||
}
|
||||
}
|
||||
if titles != 5 {
|
||||
t.Errorf("expected 5 titles, got %d", titles)
|
||||
}
|
||||
}
|
||||
|
||||
func TestParse_Integration_WithCaption(t *testing.T) {
|
||||
data, err := readFixture("with_caption.docx")
|
||||
if err != nil {
|
||||
t.Skipf("fixture not available: %v", err)
|
||||
}
|
||||
result, err := Parse(data, doctype.DefaultParserConfig())
|
||||
if err != nil {
|
||||
t.Fatalf("Parse: %v", err)
|
||||
}
|
||||
if len(result.Sections) != 4 {
|
||||
t.Fatalf("want 4 sections, got %d", len(result.Sections))
|
||||
}
|
||||
|
||||
// Block order: [Figure caption] [body text] [2x2 table] [Table caption]
|
||||
// Figure caption (index 0) is text, not title.
|
||||
if result.Sections[0].LayoutType != "text" {
|
||||
t.Errorf("figure caption: got LayoutType %q", result.Sections[0].LayoutType)
|
||||
}
|
||||
if !strings.Contains(result.Sections[0].Text, "Figure 1") {
|
||||
t.Errorf("figure caption text: %q", result.Sections[0].Text)
|
||||
}
|
||||
|
||||
// Table section (index 2) must have HTML rendering.
|
||||
s := result.Sections[2]
|
||||
if s.DocTypeKwd != "table" {
|
||||
t.Errorf("table section: DocTypeKwd=%q", s.DocTypeKwd)
|
||||
}
|
||||
if !strings.Contains(s.Text, "<table>") {
|
||||
t.Fatal("table section missing <table> HTML")
|
||||
}
|
||||
if !strings.Contains(s.Text, "<th >A</th>") || !strings.Contains(s.Text, "<th >B</th>") {
|
||||
t.Errorf("table header cells: %s", s.Text)
|
||||
}
|
||||
if !strings.Contains(s.Text, "<td >1</td>") || !strings.Contains(s.Text, "<td >2</td>") {
|
||||
t.Errorf("table data cells: %s", s.Text)
|
||||
}
|
||||
|
||||
// Table caption (index 3) follows the table.
|
||||
if !strings.Contains(result.Sections[3].Text, "Table 1") {
|
||||
t.Errorf("table caption text: %q", result.Sections[3].Text)
|
||||
}
|
||||
}
|
||||
202
internal/deepdoc/parser/docx/parser_test.go
Normal file
202
internal/deepdoc/parser/docx/parser_test.go
Normal file
@@ -0,0 +1,202 @@
|
||||
package docx
|
||||
|
||||
import (
|
||||
"testing"
|
||||
)
|
||||
|
||||
func TestBlocksToSections_Paragraph(t *testing.T) {
|
||||
blocks := []RawBlock{
|
||||
{Type: "paragraph", Text: "hello world", Style: "Normal"},
|
||||
}
|
||||
sections := blocksToSections(blocks)
|
||||
|
||||
if len(sections) != 1 {
|
||||
t.Fatalf("want 1 section, got %d", len(sections))
|
||||
}
|
||||
s := sections[0]
|
||||
if s.Text != "hello world" {
|
||||
t.Errorf("Text: got %q, want %q", s.Text, "hello world")
|
||||
}
|
||||
if s.DocTypeKwd != "text" {
|
||||
t.Errorf("DocTypeKwd: got %q, want %q", s.DocTypeKwd, "text")
|
||||
}
|
||||
}
|
||||
|
||||
func TestBlocksToSections_Headings(t *testing.T) {
|
||||
blocks := []RawBlock{
|
||||
{Type: "paragraph", Text: "Main Title", Style: "Heading 1"},
|
||||
{Type: "paragraph", Text: "Sub Title", Style: "Heading 2"},
|
||||
{Type: "paragraph", Text: "Deep", Style: "Heading 3"},
|
||||
{Type: "paragraph", Text: "Plain", Style: "Normal"},
|
||||
}
|
||||
sections := blocksToSections(blocks)
|
||||
|
||||
if len(sections) != 4 {
|
||||
t.Fatalf("want 4 sections, got %d", len(sections))
|
||||
}
|
||||
if sections[0].LayoutType != "title" {
|
||||
t.Errorf("[0] LayoutType: got %q, want %q", sections[0].LayoutType, "title")
|
||||
}
|
||||
if sections[1].LayoutType != "title" {
|
||||
t.Errorf("[1] LayoutType: got %q, want %q", sections[1].LayoutType, "title")
|
||||
}
|
||||
if sections[2].LayoutType != "title" {
|
||||
t.Errorf("[2] LayoutType: got %q, want %q", sections[2].LayoutType, "title")
|
||||
}
|
||||
// Normal paragraph is NOT a title
|
||||
if sections[3].LayoutType != "text" {
|
||||
t.Errorf("[3] LayoutType: got %q, want %q", sections[3].LayoutType, "text")
|
||||
}
|
||||
}
|
||||
|
||||
func TestBlocksToSections_Table(t *testing.T) {
|
||||
blocks := []RawBlock{
|
||||
{Type: "table", Rows: [][]string{
|
||||
{"Name", "Age"},
|
||||
{"Alice", "30"},
|
||||
}},
|
||||
}
|
||||
sections := blocksToSections(blocks)
|
||||
|
||||
if len(sections) != 1 {
|
||||
t.Fatalf("want 1 section, got %d", len(sections))
|
||||
}
|
||||
s := sections[0]
|
||||
if s.DocTypeKwd != "table" {
|
||||
t.Errorf("DocTypeKwd: got %q, want %q", s.DocTypeKwd, "table")
|
||||
}
|
||||
if s.TableItem == nil {
|
||||
t.Fatal("TableItem is nil")
|
||||
}
|
||||
if len(s.TableItem.Rows) != 2 {
|
||||
t.Errorf("Rows: want 2, got %d", len(s.TableItem.Rows))
|
||||
}
|
||||
if s.Text == "" {
|
||||
t.Error("Text: expected rendered HTML, got empty string")
|
||||
}
|
||||
}
|
||||
|
||||
func TestBlocksToSections_EmptyInput(t *testing.T) {
|
||||
sections := blocksToSections(nil)
|
||||
if len(sections) != 0 {
|
||||
t.Errorf("want 0 sections, got %d", len(sections))
|
||||
}
|
||||
}
|
||||
|
||||
func TestBlocksToSections_DocumentOrder(t *testing.T) {
|
||||
blocks := []RawBlock{
|
||||
{Type: "paragraph", Text: "first", Style: "Normal"},
|
||||
{Type: "table", Rows: [][]string{{"a"}}},
|
||||
{Type: "paragraph", Text: "second", Style: "Normal"},
|
||||
{Type: "paragraph", Text: "third", Style: "Heading 1"},
|
||||
}
|
||||
sections := blocksToSections(blocks)
|
||||
|
||||
if len(sections) != 4 {
|
||||
t.Fatalf("want 4 sections, got %d", len(sections))
|
||||
}
|
||||
if sections[0].Text != "first" {
|
||||
t.Errorf("order[0]: got %q", sections[0].Text)
|
||||
}
|
||||
if sections[1].DocTypeKwd != "table" {
|
||||
t.Errorf("order[1]: expected table")
|
||||
}
|
||||
if sections[2].Text != "second" {
|
||||
t.Errorf("order[2]: got %q", sections[2].Text)
|
||||
}
|
||||
if sections[3].Text != "third" {
|
||||
t.Errorf("order[3]: got %q", sections[3].Text)
|
||||
}
|
||||
}
|
||||
|
||||
func TestBlocksToSections_CaptionStyle(t *testing.T) {
|
||||
blocks := []RawBlock{
|
||||
{Type: "paragraph", Text: "Table 1: Results", Style: "Caption"},
|
||||
}
|
||||
sections := blocksToSections(blocks)
|
||||
if len(sections) != 1 {
|
||||
t.Fatalf("want 1 section, got %d", len(sections))
|
||||
}
|
||||
if sections[0].LayoutType != "text" {
|
||||
t.Errorf("Caption: LayoutType should be 'text', got %q", sections[0].LayoutType)
|
||||
}
|
||||
}
|
||||
|
||||
func TestBlocksToSections_MixedContent(t *testing.T) {
|
||||
blocks := []RawBlock{
|
||||
{Type: "paragraph", Text: "Title", Style: "Heading 1"},
|
||||
{Type: "paragraph", Text: "Body text.", Style: "Normal"},
|
||||
{Type: "table", Rows: [][]string{{"a", "b"}}},
|
||||
{Type: "paragraph", Text: "More text.", Style: "Normal"},
|
||||
}
|
||||
sections := blocksToSections(blocks)
|
||||
|
||||
if len(sections) != 4 {
|
||||
t.Fatalf("want 4 sections, got %d", len(sections))
|
||||
}
|
||||
if sections[0].LayoutType != "title" {
|
||||
t.Errorf("[0] heading: got %q", sections[0].LayoutType)
|
||||
}
|
||||
if sections[1].LayoutType != "text" {
|
||||
t.Errorf("[1] body: got %q", sections[1].LayoutType)
|
||||
}
|
||||
if sections[2].DocTypeKwd != "table" {
|
||||
t.Errorf("[2] table: got %q", sections[2].DocTypeKwd)
|
||||
}
|
||||
if sections[3].DocTypeKwd != "text" {
|
||||
t.Errorf("[3] text after table: got %q", sections[3].DocTypeKwd)
|
||||
}
|
||||
}
|
||||
|
||||
func TestBlocksToSections_Image(t *testing.T) {
|
||||
blocks := []RawBlock{
|
||||
{Type: "image", Image: "iVBORw0KGgoAAAANSUhEUg=="},
|
||||
}
|
||||
sections := blocksToSections(blocks)
|
||||
|
||||
if len(sections) != 1 {
|
||||
t.Fatalf("want 1 section, got %d", len(sections))
|
||||
}
|
||||
if sections[0].DocTypeKwd != "image" {
|
||||
t.Errorf("DocTypeKwd: got %q, want %q", sections[0].DocTypeKwd, "image")
|
||||
}
|
||||
if sections[0].Image != "iVBORw0KGgoAAAANSUhEUg==" {
|
||||
t.Error("Image base64 not preserved")
|
||||
}
|
||||
}
|
||||
|
||||
func TestBlocksToSections_ImageBetweenText(t *testing.T) {
|
||||
blocks := []RawBlock{
|
||||
{Type: "paragraph", Text: "before", Style: "Normal"},
|
||||
{Type: "image", Image: "b64data"},
|
||||
{Type: "paragraph", Text: "after", Style: "Normal"},
|
||||
}
|
||||
sections := blocksToSections(blocks)
|
||||
|
||||
if len(sections) != 3 {
|
||||
t.Fatalf("want 3 sections, got %d", len(sections))
|
||||
}
|
||||
if sections[0].DocTypeKwd != "text" || sections[0].Text != "before" {
|
||||
t.Error("wrong text section before image")
|
||||
}
|
||||
if sections[1].DocTypeKwd != "image" {
|
||||
t.Errorf("image section: got DocTypeKwd %q", sections[1].DocTypeKwd)
|
||||
}
|
||||
if sections[2].DocTypeKwd != "text" || sections[2].Text != "after" {
|
||||
t.Error("wrong text section after image")
|
||||
}
|
||||
}
|
||||
|
||||
func TestBlocksToSections_NestedHeadings(t *testing.T) {
|
||||
blocks := []RawBlock{
|
||||
{Type: "paragraph", Text: "H1", Style: "Heading 1"},
|
||||
{Type: "paragraph", Text: "H2", Style: "Heading 2"},
|
||||
{Type: "paragraph", Text: "H3", Style: "Heading 3"},
|
||||
}
|
||||
sections := blocksToSections(blocks)
|
||||
for i, want := range []string{"title", "title", "title"} {
|
||||
if sections[i].LayoutType != want {
|
||||
t.Errorf("[%d] got %q, want %q", i, sections[i].LayoutType, want)
|
||||
}
|
||||
}
|
||||
}
|
||||
184
internal/deepdoc/parser/docx/raw_blocks_test.go
Normal file
184
internal/deepdoc/parser/docx/raw_blocks_test.go
Normal file
@@ -0,0 +1,184 @@
|
||||
//go:build cgo && manual
|
||||
|
||||
package docx
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"os"
|
||||
"strings"
|
||||
"testing"
|
||||
)
|
||||
|
||||
func loadPythonBlocks(t *testing.T, path string) []RawBlock {
|
||||
t.Helper()
|
||||
data, err := os.ReadFile(path)
|
||||
if err != nil {
|
||||
t.Fatalf("read %s: %v", path, err)
|
||||
}
|
||||
var blocks []RawBlock
|
||||
if err := json.Unmarshal(data, &blocks); err != nil {
|
||||
t.Fatalf("unmarshal %s: %v", path, err)
|
||||
}
|
||||
return blocks
|
||||
}
|
||||
|
||||
func TestRawBlocksParity_SimpleText(t *testing.T) {
|
||||
data, err := os.ReadFile("testdata/docxs/simple_text.docx")
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
got, err := ExtractRawBlocks(data)
|
||||
if err != nil {
|
||||
t.Fatalf("ExtractRawBlocks: %v", err)
|
||||
}
|
||||
want := loadPythonBlocks(t, "testdata/output/py/docx/simple_text_blocks.json")
|
||||
|
||||
if len(got) != len(want) {
|
||||
t.Errorf("block count: got %d, want %d", len(got), len(want))
|
||||
}
|
||||
for i := 0; i < min(len(got), len(want)); i++ {
|
||||
if got[i].Type != want[i].Type {
|
||||
t.Errorf("block[%d].type: got %q, want %q", i, got[i].Type, want[i].Type)
|
||||
}
|
||||
if got[i].Text != want[i].Text {
|
||||
t.Errorf("block[%d].text: got %q, want %q", i, got[i].Text, want[i].Text)
|
||||
}
|
||||
}
|
||||
if t.Failed() {
|
||||
t.Logf("Go blocks: %+v", got)
|
||||
t.Logf("Py blocks: %+v", want)
|
||||
}
|
||||
}
|
||||
|
||||
func TestRawBlocksParity_WithTable(t *testing.T) {
|
||||
data, err := os.ReadFile("testdata/docxs/with_table.docx")
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
got, err := ExtractRawBlocks(data)
|
||||
if err != nil {
|
||||
t.Fatalf("ExtractRawBlocks: %v", err)
|
||||
}
|
||||
want := loadPythonBlocks(t, "testdata/output/py/docx/with_table_blocks.json")
|
||||
|
||||
if len(got) != len(want) {
|
||||
t.Errorf("block count: got %d, want %d", len(got), len(want))
|
||||
}
|
||||
for i := 0; i < min(len(got), len(want)); i++ {
|
||||
if got[i].Type != want[i].Type {
|
||||
t.Errorf("block[%d].type: got %q, want %q", i, got[i].Type, want[i].Type)
|
||||
}
|
||||
}
|
||||
if t.Failed() {
|
||||
t.Logf("Go blocks: %+v", got)
|
||||
t.Logf("Py blocks: %+v", want)
|
||||
}
|
||||
}
|
||||
|
||||
func TestRawBlocksParity_WithImage(t *testing.T) {
|
||||
data, err := os.ReadFile("testdata/docxs/with_image.docx")
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
got, err := ExtractRawBlocks(data)
|
||||
if err != nil {
|
||||
t.Fatalf("ExtractRawBlocks: %v", err)
|
||||
}
|
||||
// Engine-level difference: python-docx embeds images inside empty
|
||||
// paragraph blocks; office_oxide represents them as separate elements.
|
||||
// Both engines must see "Before" and "After" text and at least one
|
||||
// image-related block.
|
||||
hasBefore, hasAfter, hasImage := false, false, false
|
||||
for _, b := range got {
|
||||
if b.Text != "" {
|
||||
hasBefore = hasBefore || b.Text == "Before the image."
|
||||
hasAfter = hasAfter || b.Text == "After the image."
|
||||
}
|
||||
if b.Image != "" {
|
||||
hasImage = true
|
||||
}
|
||||
}
|
||||
if !hasBefore {
|
||||
t.Error("missing 'Before the image.' text")
|
||||
}
|
||||
if !hasAfter {
|
||||
t.Error("missing 'After the image.' text")
|
||||
}
|
||||
if !hasImage {
|
||||
t.Log("office_oxide IR does not expose embedded images as top-level blocks")
|
||||
}
|
||||
}
|
||||
|
||||
func TestRawBlocksParity_MultiSection(t *testing.T) {
|
||||
data, err := os.ReadFile("testdata/docxs/multi_section.docx")
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
got, err := ExtractRawBlocks(data)
|
||||
if err != nil {
|
||||
t.Fatalf("ExtractRawBlocks: %v", err)
|
||||
}
|
||||
want := loadPythonBlocks(t, "testdata/output/py/docx/multi_section_blocks.json")
|
||||
if len(got) != len(want) {
|
||||
t.Errorf("block count: got %d, want %d", len(got), len(want))
|
||||
}
|
||||
for i := 0; i < min(len(got), len(want)); i++ {
|
||||
if got[i].Type != want[i].Type {
|
||||
t.Errorf("block[%d].type: got %q, want %q", i, got[i].Type, want[i].Type)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestRawBlocksParity_NestedHeadings(t *testing.T) {
|
||||
data, err := os.ReadFile("testdata/docxs/nested_headings.docx")
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
got, err := ExtractRawBlocks(data)
|
||||
if err != nil {
|
||||
t.Fatalf("ExtractRawBlocks: %v", err)
|
||||
}
|
||||
want := loadPythonBlocks(t, "testdata/output/py/docx/nested_headings_blocks.json")
|
||||
if len(got) != len(want) {
|
||||
t.Errorf("block count: got %d, want %d", len(got), len(want))
|
||||
}
|
||||
headings := 0
|
||||
for _, b := range got {
|
||||
if strings.HasPrefix(b.Style, "Heading") {
|
||||
headings++
|
||||
}
|
||||
}
|
||||
if headings != 5 {
|
||||
t.Errorf("expected 5 headings, got %d", headings)
|
||||
}
|
||||
}
|
||||
|
||||
func TestRawBlocksParity_WithCaption(t *testing.T) {
|
||||
data, err := os.ReadFile("testdata/docxs/with_caption.docx")
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
got, err := ExtractRawBlocks(data)
|
||||
if err != nil {
|
||||
t.Fatalf("ExtractRawBlocks: %v", err)
|
||||
}
|
||||
// Verify both engines see the same number of blocks
|
||||
want := loadPythonBlocks(t, "testdata/output/py/docx/with_caption_blocks.json")
|
||||
if len(got) != len(want) {
|
||||
t.Errorf("block count: got %d, want %d", len(got), len(want))
|
||||
}
|
||||
}
|
||||
|
||||
func TestRawBlocksParity_Empty(t *testing.T) {
|
||||
data, err := os.ReadFile("testdata/docxs/empty.docx")
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
got, err := ExtractRawBlocks(data)
|
||||
if err != nil {
|
||||
t.Fatalf("ExtractRawBlocks: %v", err)
|
||||
}
|
||||
if len(got) != 0 {
|
||||
t.Errorf("empty docx: expected 0 blocks, got %d", len(got))
|
||||
}
|
||||
}
|
||||
108
internal/deepdoc/parser/docx/reader.go
Normal file
108
internal/deepdoc/parser/docx/reader.go
Normal file
@@ -0,0 +1,108 @@
|
||||
//go:build cgo
|
||||
|
||||
package docx
|
||||
|
||||
import (
|
||||
"encoding/base64"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"strconv"
|
||||
"strings"
|
||||
|
||||
officeOxide "github.com/yfedoseev/office_oxide/go"
|
||||
)
|
||||
|
||||
// ExtractRawBlocks opens a DOCX via office_oxide and extracts blocks in
|
||||
// document order, matching the format produced by python-docx's
|
||||
// _element.body iteration.
|
||||
func ExtractRawBlocks(data []byte) ([]RawBlock, error) {
|
||||
doc, err := officeOxide.OpenFromBytes(data, "docx")
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("office_oxide open: %w", err)
|
||||
}
|
||||
defer doc.Close()
|
||||
|
||||
irJSON, err := doc.ToIRJSON()
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("ToIRJSON: %w", err)
|
||||
}
|
||||
|
||||
var ir irDocument
|
||||
if err := json.Unmarshal([]byte(irJSON), &ir); err != nil {
|
||||
return nil, fmt.Errorf("parse IR JSON: %w", err)
|
||||
}
|
||||
|
||||
var blocks []RawBlock
|
||||
for _, sec := range ir.Sections {
|
||||
for _, el := range sec.Elements {
|
||||
block := irElementToBlock(el)
|
||||
blocks = append(blocks, block)
|
||||
}
|
||||
}
|
||||
return blocks, nil
|
||||
}
|
||||
|
||||
func irElementToBlock(el irElement) RawBlock {
|
||||
switch el.Type {
|
||||
case "table":
|
||||
rows := make([][]string, len(el.Rows))
|
||||
for ri, row := range el.Rows {
|
||||
cells := make([]string, len(row.Cells))
|
||||
for ci, cell := range row.Cells {
|
||||
cells[ci] = joinElements(cell.Content)
|
||||
}
|
||||
rows[ri] = cells
|
||||
}
|
||||
return RawBlock{Type: "table", Rows: rows}
|
||||
|
||||
case "heading":
|
||||
text := joinRuns(el.Content)
|
||||
level := strconv.Itoa(el.Level)
|
||||
return RawBlock{
|
||||
Type: "paragraph",
|
||||
Text: text,
|
||||
Style: "Heading " + level,
|
||||
}
|
||||
|
||||
case "image":
|
||||
return RawBlock{
|
||||
Type: "image",
|
||||
Image: base64.StdEncoding.EncodeToString(el.Data),
|
||||
}
|
||||
|
||||
default: // "paragraph" and anything else
|
||||
style := el.Style
|
||||
if style == "" {
|
||||
style = "Normal"
|
||||
}
|
||||
return RawBlock{
|
||||
Type: "paragraph",
|
||||
Text: joinRuns(el.Content),
|
||||
Style: style,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func joinRuns(runs []irRun) string {
|
||||
var b strings.Builder
|
||||
for _, r := range runs {
|
||||
if r.Type == "text" {
|
||||
b.WriteString(r.Text)
|
||||
}
|
||||
}
|
||||
return b.String()
|
||||
}
|
||||
|
||||
// joinElements extracts plain text from nested irElements (used for table cells).
|
||||
// When multiple elements are present, a newline is inserted between each one
|
||||
// to match python-docx _Cell.text behavior.
|
||||
func joinElements(els []irElement) string {
|
||||
var b strings.Builder
|
||||
for i, el := range els {
|
||||
if i > 0 {
|
||||
b.WriteByte('\n')
|
||||
}
|
||||
b.WriteString(joinRuns(el.Content))
|
||||
}
|
||||
return b.String()
|
||||
}
|
||||
38
internal/deepdoc/parser/docx/reader_cell_test.go
Normal file
38
internal/deepdoc/parser/docx/reader_cell_test.go
Normal file
@@ -0,0 +1,38 @@
|
||||
//go:build cgo
|
||||
|
||||
package docx
|
||||
|
||||
import "testing"
|
||||
|
||||
func TestJoinElements_MultiParagraphCell(t *testing.T) {
|
||||
// When a table cell contains multiple paragraphs, joinElements must
|
||||
// insert a newline between them to match python-docx _Cell.text behavior.
|
||||
els := []irElement{
|
||||
{Type: "paragraph", Content: []irRun{{Type: "text", Text: "first line"}}},
|
||||
{Type: "paragraph", Content: []irRun{{Type: "text", Text: "second line"}}},
|
||||
}
|
||||
got := joinElements(els)
|
||||
want := "first line\nsecond line"
|
||||
if got != want {
|
||||
t.Errorf("joinElements:\ngot: %q\nwant: %q", got, want)
|
||||
}
|
||||
}
|
||||
|
||||
func TestJoinElements_SingleElement(t *testing.T) {
|
||||
// Single paragraph cell — no separator expected.
|
||||
els := []irElement{
|
||||
{Type: "paragraph", Content: []irRun{{Type: "text", Text: "single paragraph"}}},
|
||||
}
|
||||
got := joinElements(els)
|
||||
want := "single paragraph"
|
||||
if got != want {
|
||||
t.Errorf("joinElements:\ngot: %q\nwant: %q", got, want)
|
||||
}
|
||||
}
|
||||
|
||||
func TestJoinElements_Empty(t *testing.T) {
|
||||
got := joinElements(nil)
|
||||
if got != "" {
|
||||
t.Errorf("joinElements(nil): got %q, want empty", got)
|
||||
}
|
||||
}
|
||||
11
internal/deepdoc/parser/docx/reader_stub.go
Normal file
11
internal/deepdoc/parser/docx/reader_stub.go
Normal file
@@ -0,0 +1,11 @@
|
||||
//go:build !cgo
|
||||
|
||||
package docx
|
||||
|
||||
import "errors"
|
||||
|
||||
// ExtractRawBlocks is not available without cgo because the underlying
|
||||
// office_oxide library requires CGo. Rebuild with CGO_ENABLED=1.
|
||||
func ExtractRawBlocks(_ []byte) ([]RawBlock, error) {
|
||||
return nil, errors.New("office_oxide requires cgo; rebuild with CGO_ENABLED=1")
|
||||
}
|
||||
54
internal/deepdoc/parser/docx/reader_style_test.go
Normal file
54
internal/deepdoc/parser/docx/reader_style_test.go
Normal file
@@ -0,0 +1,54 @@
|
||||
//go:build cgo
|
||||
|
||||
package docx
|
||||
|
||||
import "testing"
|
||||
|
||||
func TestIrElementToBlock_PreservesCustomStyle(t *testing.T) {
|
||||
// irElementToBlock should preserve the Word style name from the IR,
|
||||
// not hard-code "Normal" for every non-heading paragraph.
|
||||
el := irElement{
|
||||
Type: "paragraph",
|
||||
Style: "Caption",
|
||||
Content: []irRun{
|
||||
{Type: "text", Text: "Figure 1: Architecture diagram"},
|
||||
},
|
||||
}
|
||||
block := irElementToBlock(el)
|
||||
|
||||
if block.Style != "Caption" {
|
||||
t.Errorf("irElementToBlock with Style=%q:\ngot Style=%q\nwant Style=%q",
|
||||
el.Style, block.Style, el.Style)
|
||||
}
|
||||
}
|
||||
|
||||
func TestIrElementToBlock_PreservesHeadingStyle(t *testing.T) {
|
||||
// Heading elements should still produce "Heading N" style.
|
||||
el := irElement{
|
||||
Type: "heading",
|
||||
Level: 2,
|
||||
Content: []irRun{
|
||||
{Type: "text", Text: "Section 2.1"},
|
||||
},
|
||||
}
|
||||
block := irElementToBlock(el)
|
||||
|
||||
if block.Style != "Heading 2" {
|
||||
t.Errorf("heading: got Style=%q, want %q", block.Style, "Heading 2")
|
||||
}
|
||||
}
|
||||
|
||||
func TestIrElementToBlock_FallsBackToNormal(t *testing.T) {
|
||||
// When Style is empty, defaults to "Normal".
|
||||
el := irElement{
|
||||
Type: "paragraph",
|
||||
Content: []irRun{
|
||||
{Type: "text", Text: "plain text"},
|
||||
},
|
||||
}
|
||||
block := irElementToBlock(el)
|
||||
|
||||
if block.Style != "Normal" {
|
||||
t.Errorf("empty style: got %q, want %q", block.Style, "Normal")
|
||||
}
|
||||
}
|
||||
46
internal/deepdoc/parser/docx/types.go
Normal file
46
internal/deepdoc/parser/docx/types.go
Normal file
@@ -0,0 +1,46 @@
|
||||
package docx
|
||||
|
||||
// RawBlock is one block extracted from a DOCX file, kept in document order.
// Headings are not a separate type: they are "paragraph" blocks whose Style
// is "Heading N".
type RawBlock struct {
	// Type is one of "paragraph", "table", or "image".
	Type string `json:"type"`
	// Text is the paragraph text; it is left empty for tables.
	Text string `json:"text"`
	// Style is the Word style name (e.g. "Normal", "Heading 1").
	Style string `json:"style"`
	// Image is the base64-encoded image data.
	Image string `json:"image,omitempty"`
	// Rows holds the table rows; it is nil for paragraphs.
	Rows [][]string `json:"rows,omitempty"`
}
|
||||
|
||||
// ── office_oxide IR JSON types ────────────────────────────────────────
//
// These types mirror the parts of the office_oxide IR JSON document that the
// DOCX reader decodes.
type (
	// irDocument is the root of the IR: an ordered list of sections.
	irDocument struct {
		Sections []irSection `json:"sections"`
	}

	// irSection is a titled group of elements.
	irSection struct {
		Title    string      `json:"title"`
		Elements []irElement `json:"elements"`
	}

	// irElement is one block-level element of the IR.
	irElement struct {
		// Type is "paragraph", "heading", "table", or "image".
		Type string `json:"type"`
		// Level is the heading level (1-6).
		Level int `json:"level"`
		// Style is the Word style name (e.g. "Normal", "Caption", "Heading 1").
		Style string `json:"style"`
		// Content holds the rich text runs.
		Content []irRun `json:"content"`
		// Data holds the raw image bytes (for the "image" type).
		Data []byte `json:"data"`
		// Rows holds the table rows.
		Rows []irRow `json:"rows"`
	}

	// irRun is one inline run inside an element.
	irRun struct {
		// Type is "text" or "image".
		Type string `json:"type"`
		// Text is the plain text content.
		Text string `json:"text"`
		// Content holds nested elements (used in table cells).
		Content []irElement `json:"content"`
	}

	// irRow is a single table row.
	irRow struct {
		Cells []irCell `json:"cells"`
	}

	// irCell is a single table cell; Content holds its nested paragraphs.
	irCell struct {
		Content []irElement `json:"content"`
	}
)
|
||||
Reference in New Issue
Block a user