From c8cf0c967d2af7d00365e7d130cff1b0271bc18f Mon Sep 17 00:00:00 2001 From: Jack Date: Thu, 2 Jul 2026 16:31:09 +0800 Subject: [PATCH] Feat: add DOCX parser (#16521) ### Summary Add DOCX parser - go. --- internal/deepdoc/parser/docx/parser.go | 60 ++++++ .../parser/docx/parser_integration_test.go | 161 ++++++++++++++ internal/deepdoc/parser/docx/parser_test.go | 202 ++++++++++++++++++ .../deepdoc/parser/docx/raw_blocks_test.go | 184 ++++++++++++++++ internal/deepdoc/parser/docx/reader.go | 108 ++++++++++ .../deepdoc/parser/docx/reader_cell_test.go | 38 ++++ internal/deepdoc/parser/docx/reader_stub.go | 11 + .../deepdoc/parser/docx/reader_style_test.go | 54 +++++ internal/deepdoc/parser/docx/types.go | 46 ++++ 9 files changed, 864 insertions(+) create mode 100644 internal/deepdoc/parser/docx/parser.go create mode 100644 internal/deepdoc/parser/docx/parser_integration_test.go create mode 100644 internal/deepdoc/parser/docx/parser_test.go create mode 100644 internal/deepdoc/parser/docx/raw_blocks_test.go create mode 100644 internal/deepdoc/parser/docx/reader.go create mode 100644 internal/deepdoc/parser/docx/reader_cell_test.go create mode 100644 internal/deepdoc/parser/docx/reader_stub.go create mode 100644 internal/deepdoc/parser/docx/reader_style_test.go create mode 100644 internal/deepdoc/parser/docx/types.go diff --git a/internal/deepdoc/parser/docx/parser.go b/internal/deepdoc/parser/docx/parser.go new file mode 100644 index 0000000000..a13ed80cb4 --- /dev/null +++ b/internal/deepdoc/parser/docx/parser.go @@ -0,0 +1,60 @@ +package docx + +import ( + "strings" + + "ragflow/internal/deepdoc/parser/pdf/table" + doctype "ragflow/internal/deepdoc/parser/type" +) + +// blocksToSections converts raw DOCX blocks to the shared Section representation +// consumed by the framework layer: tables get DocTypeKwd "table" with a TableItem, +// images get DocTypeKwd "image", and the rest are "text" (LayoutType "title" for headings).
+func blocksToSections(blocks []RawBlock) []doctype.Section { + sections := make([]doctype.Section, 0, len(blocks)) + for _, b := range blocks { + sec := blockToSection(b) + sections = append(sections, sec) + } + return sections +} + +func blockToSection(b RawBlock) doctype.Section { + switch b.Type { + case "table": + return doctype.Section{ + Text: table.SimpleRowsToHTML(b.Rows), + DocTypeKwd: "table", + TableItem: &doctype.TableItem{ + Rows: b.Rows, + }, + } + case "image": + return doctype.Section{ + DocTypeKwd: "image", + Image: b.Image, + } + default: + layoutType := "text" + if strings.HasPrefix(strings.ToLower(b.Style), "heading") { + layoutType = "title" + } + return doctype.Section{ + Text: b.Text, + DocTypeKwd: "text", + LayoutType: layoutType, + } + } +} + +// Parse converts a DOCX file (given as bytes) into a doctype.ParseResult by mapping the +// blocks from ExtractRawBlocks (office_oxide) to Sections. cfg is not read here; errors are returned unwrapped. +func Parse(data []byte, cfg doctype.ParserConfig) (*doctype.ParseResult, error) { + blocks, err := ExtractRawBlocks(data) + if err != nil { + return nil, err + } + return &doctype.ParseResult{ + Sections: blocksToSections(blocks), + }, nil +} diff --git a/internal/deepdoc/parser/docx/parser_integration_test.go b/internal/deepdoc/parser/docx/parser_integration_test.go new file mode 100644 index 0000000000..675e2e196f --- /dev/null +++ b/internal/deepdoc/parser/docx/parser_integration_test.go @@ -0,0 +1,161 @@ +//go:build cgo && manual + +package docx + +import ( + "os" + "path/filepath" + "strings" + "testing" + + doctype "ragflow/internal/deepdoc/parser/type" +) + +// readFixture reads a DOCX fixture file from testdata/docxs/.
+func readFixture(name string) ([]byte, error) { + return os.ReadFile(filepath.Join("testdata", "docxs", name)) +} + +func TestParse_Integration_MultiSection(t *testing.T) { + data, err := readFixture("multi_section.docx") + if err != nil { + t.Skipf("fixture not available: %v", err) + } + result, err := Parse(data, doctype.DefaultParserConfig()) + if err != nil { + t.Fatalf("Parse: %v", err) + } + if len(result.Sections) != 7 { + t.Errorf("multi_section.docx: want 7 sections, got %d", len(result.Sections)) + } + // Verify headings + expected := []string{"Chapter 1", "Section 1.1", "Chapter 2"} + titleIdx := 0 + for _, s := range result.Sections { + if s.LayoutType == "title" { + if titleIdx < len(expected) && s.Text != expected[titleIdx] { + t.Errorf("heading[%d]: got %q, want %q", titleIdx, s.Text, expected[titleIdx]) + } + titleIdx++ + } + } + if titleIdx != 3 { + t.Errorf("expected 3 headings, found %d", titleIdx) + } +} + +func TestParse_Integration_WithTable(t *testing.T) { + data, err := readFixture("with_table.docx") + if err != nil { + t.Skipf("fixture not available: %v", err) + } + result, err := Parse(data, doctype.DefaultParserConfig()) + if err != nil { + t.Fatalf("Parse: %v", err) + } + if len(result.Sections) != 4 { + t.Fatalf("want 4 sections, got %d", len(result.Sections)) + } + if result.Sections[2].DocTypeKwd != "table" { + t.Error("expected table section at index 2") + } + if len(result.Sections[2].TableItem.Rows) != 3 { + t.Errorf("expected 3 rows, got %d", len(result.Sections[2].TableItem.Rows)) + } + if result.Sections[2].TableItem.Rows[0][0] != "Product" { + t.Errorf("cell[0,0]: got %q", result.Sections[2].TableItem.Rows[0][0]) + } + // Verify HTML table is rendered. + if !strings.Contains(result.Sections[2].Text, "") { + t.Error("table Section.Text should contain HTML
") + } + if !strings.Contains(result.Sections[2].Text, "") { + t.Errorf("table HTML missing header: %s", result.Sections[2].Text) + } +} + +func TestParse_Integration_WithImage(t *testing.T) { + data, err := readFixture("with_image.docx") + if err != nil { + t.Skipf("fixture not available: %v", err) + } + result, err := Parse(data, doctype.DefaultParserConfig()) + if err != nil { + t.Fatalf("Parse: %v", err) + } + hasImage := false + for _, s := range result.Sections { + if s.DocTypeKwd == "image" && s.Image != "" { + hasImage = true + } + } + if !hasImage { + t.Error("expected at least one image section") + } +} + +func TestParse_Integration_NestedHeadings(t *testing.T) { + data, err := readFixture("nested_headings.docx") + if err != nil { + t.Skipf("fixture not available: %v", err) + } + result, err := Parse(data, doctype.DefaultParserConfig()) + if err != nil { + t.Fatalf("Parse: %v", err) + } + if len(result.Sections) != 5 { + t.Fatalf("want 5 sections, got %d", len(result.Sections)) + } + titles := 0 + for _, s := range result.Sections { + if s.LayoutType == "title" { + titles++ + } + } + if titles != 5 { + t.Errorf("expected 5 titles, got %d", titles) + } +} + +func TestParse_Integration_WithCaption(t *testing.T) { + data, err := readFixture("with_caption.docx") + if err != nil { + t.Skipf("fixture not available: %v", err) + } + result, err := Parse(data, doctype.DefaultParserConfig()) + if err != nil { + t.Fatalf("Parse: %v", err) + } + if len(result.Sections) != 4 { + t.Fatalf("want 4 sections, got %d", len(result.Sections)) + } + + // Block order: [Figure caption] [body text] [2x2 table] [Table caption] + // Figure caption (index 0) is text, not title. 
+ if result.Sections[0].LayoutType != "text" { + t.Errorf("figure caption: got LayoutType %q", result.Sections[0].LayoutType) + } + if !strings.Contains(result.Sections[0].Text, "Figure 1") { + t.Errorf("figure caption text: %q", result.Sections[0].Text) + } + + // Table section (index 2) must have HTML rendering. + s := result.Sections[2] + if s.DocTypeKwd != "table" { + t.Errorf("table section: DocTypeKwd=%q", s.DocTypeKwd) + } + if !strings.Contains(s.Text, "
Product
") { + t.Fatal("table section missing
HTML") + } + if !strings.Contains(s.Text, "") || !strings.Contains(s.Text, "") { + t.Errorf("table header cells: %s", s.Text) + } + if !strings.Contains(s.Text, "") || !strings.Contains(s.Text, "") { + t.Errorf("table data cells: %s", s.Text) + } + + // Table caption (index 3) follows the table. + if !strings.Contains(result.Sections[3].Text, "Table 1") { + t.Errorf("table caption text: %q", result.Sections[3].Text) + } +} diff --git a/internal/deepdoc/parser/docx/parser_test.go b/internal/deepdoc/parser/docx/parser_test.go new file mode 100644 index 0000000000..60dcf07473 --- /dev/null +++ b/internal/deepdoc/parser/docx/parser_test.go @@ -0,0 +1,202 @@ +package docx + +import ( + "testing" +) + +func TestBlocksToSections_Paragraph(t *testing.T) { + blocks := []RawBlock{ + {Type: "paragraph", Text: "hello world", Style: "Normal"}, + } + sections := blocksToSections(blocks) + + if len(sections) != 1 { + t.Fatalf("want 1 section, got %d", len(sections)) + } + s := sections[0] + if s.Text != "hello world" { + t.Errorf("Text: got %q, want %q", s.Text, "hello world") + } + if s.DocTypeKwd != "text" { + t.Errorf("DocTypeKwd: got %q, want %q", s.DocTypeKwd, "text") + } +} + +func TestBlocksToSections_Headings(t *testing.T) { + blocks := []RawBlock{ + {Type: "paragraph", Text: "Main Title", Style: "Heading 1"}, + {Type: "paragraph", Text: "Sub Title", Style: "Heading 2"}, + {Type: "paragraph", Text: "Deep", Style: "Heading 3"}, + {Type: "paragraph", Text: "Plain", Style: "Normal"}, + } + sections := blocksToSections(blocks) + + if len(sections) != 4 { + t.Fatalf("want 4 sections, got %d", len(sections)) + } + if sections[0].LayoutType != "title" { + t.Errorf("[0] LayoutType: got %q, want %q", sections[0].LayoutType, "title") + } + if sections[1].LayoutType != "title" { + t.Errorf("[1] LayoutType: got %q, want %q", sections[1].LayoutType, "title") + } + if sections[2].LayoutType != "title" { + t.Errorf("[2] LayoutType: got %q, want %q", sections[2].LayoutType, "title") + } 
+ // Normal paragraph is NOT a title + if sections[3].LayoutType != "text" { + t.Errorf("[3] LayoutType: got %q, want %q", sections[3].LayoutType, "text") + } +} + +func TestBlocksToSections_Table(t *testing.T) { + blocks := []RawBlock{ + {Type: "table", Rows: [][]string{ + {"Name", "Age"}, + {"Alice", "30"}, + }}, + } + sections := blocksToSections(blocks) + + if len(sections) != 1 { + t.Fatalf("want 1 section, got %d", len(sections)) + } + s := sections[0] + if s.DocTypeKwd != "table" { + t.Errorf("DocTypeKwd: got %q, want %q", s.DocTypeKwd, "table") + } + if s.TableItem == nil { + t.Fatal("TableItem is nil") + } + if len(s.TableItem.Rows) != 2 { + t.Errorf("Rows: want 2, got %d", len(s.TableItem.Rows)) + } + if s.Text == "" { + t.Error("Text: expected rendered HTML, got empty string") + } +} + +func TestBlocksToSections_EmptyInput(t *testing.T) { + sections := blocksToSections(nil) + if len(sections) != 0 { + t.Errorf("want 0 sections, got %d", len(sections)) + } +} + +func TestBlocksToSections_DocumentOrder(t *testing.T) { + blocks := []RawBlock{ + {Type: "paragraph", Text: "first", Style: "Normal"}, + {Type: "table", Rows: [][]string{{"a"}}}, + {Type: "paragraph", Text: "second", Style: "Normal"}, + {Type: "paragraph", Text: "third", Style: "Heading 1"}, + } + sections := blocksToSections(blocks) + + if len(sections) != 4 { + t.Fatalf("want 4 sections, got %d", len(sections)) + } + if sections[0].Text != "first" { + t.Errorf("order[0]: got %q", sections[0].Text) + } + if sections[1].DocTypeKwd != "table" { + t.Errorf("order[1]: expected table") + } + if sections[2].Text != "second" { + t.Errorf("order[2]: got %q", sections[2].Text) + } + if sections[3].Text != "third" { + t.Errorf("order[3]: got %q", sections[3].Text) + } +} + +func TestBlocksToSections_CaptionStyle(t *testing.T) { + blocks := []RawBlock{ + {Type: "paragraph", Text: "Table 1: Results", Style: "Caption"}, + } + sections := blocksToSections(blocks) + if len(sections) != 1 { + t.Fatalf("want 1 
section, got %d", len(sections)) + } + if sections[0].LayoutType != "text" { + t.Errorf("Caption: LayoutType should be 'text', got %q", sections[0].LayoutType) + } +} + +func TestBlocksToSections_MixedContent(t *testing.T) { + blocks := []RawBlock{ + {Type: "paragraph", Text: "Title", Style: "Heading 1"}, + {Type: "paragraph", Text: "Body text.", Style: "Normal"}, + {Type: "table", Rows: [][]string{{"a", "b"}}}, + {Type: "paragraph", Text: "More text.", Style: "Normal"}, + } + sections := blocksToSections(blocks) + + if len(sections) != 4 { + t.Fatalf("want 4 sections, got %d", len(sections)) + } + if sections[0].LayoutType != "title" { + t.Errorf("[0] heading: got %q", sections[0].LayoutType) + } + if sections[1].LayoutType != "text" { + t.Errorf("[1] body: got %q", sections[1].LayoutType) + } + if sections[2].DocTypeKwd != "table" { + t.Errorf("[2] table: got %q", sections[2].DocTypeKwd) + } + if sections[3].DocTypeKwd != "text" { + t.Errorf("[3] text after table: got %q", sections[3].DocTypeKwd) + } +} + +func TestBlocksToSections_Image(t *testing.T) { + blocks := []RawBlock{ + {Type: "image", Image: "iVBORw0KGgoAAAANSUhEUg=="}, + } + sections := blocksToSections(blocks) + + if len(sections) != 1 { + t.Fatalf("want 1 section, got %d", len(sections)) + } + if sections[0].DocTypeKwd != "image" { + t.Errorf("DocTypeKwd: got %q, want %q", sections[0].DocTypeKwd, "image") + } + if sections[0].Image != "iVBORw0KGgoAAAANSUhEUg==" { + t.Error("Image base64 not preserved") + } +} + +func TestBlocksToSections_ImageBetweenText(t *testing.T) { + blocks := []RawBlock{ + {Type: "paragraph", Text: "before", Style: "Normal"}, + {Type: "image", Image: "b64data"}, + {Type: "paragraph", Text: "after", Style: "Normal"}, + } + sections := blocksToSections(blocks) + + if len(sections) != 3 { + t.Fatalf("want 3 sections, got %d", len(sections)) + } + if sections[0].DocTypeKwd != "text" || sections[0].Text != "before" { + t.Error("wrong text section before image") + } + if 
sections[1].DocTypeKwd != "image" { + t.Errorf("image section: got DocTypeKwd %q", sections[1].DocTypeKwd) + } + if sections[2].DocTypeKwd != "text" || sections[2].Text != "after" { + t.Error("wrong text section after image") + } +} + +func TestBlocksToSections_NestedHeadings(t *testing.T) { + blocks := []RawBlock{ + {Type: "paragraph", Text: "H1", Style: "Heading 1"}, + {Type: "paragraph", Text: "H2", Style: "Heading 2"}, + {Type: "paragraph", Text: "H3", Style: "Heading 3"}, + } + sections := blocksToSections(blocks) + for i, want := range []string{"title", "title", "title"} { + if sections[i].LayoutType != want { + t.Errorf("[%d] got %q, want %q", i, sections[i].LayoutType, want) + } + } +} diff --git a/internal/deepdoc/parser/docx/raw_blocks_test.go b/internal/deepdoc/parser/docx/raw_blocks_test.go new file mode 100644 index 0000000000..c1174f7a21 --- /dev/null +++ b/internal/deepdoc/parser/docx/raw_blocks_test.go @@ -0,0 +1,184 @@ +//go:build cgo && manual + +package docx + +import ( + "encoding/json" + "os" + "strings" + "testing" +) + +func loadPythonBlocks(t *testing.T, path string) []RawBlock { + t.Helper() + data, err := os.ReadFile(path) + if err != nil { + t.Fatalf("read %s: %v", path, err) + } + var blocks []RawBlock + if err := json.Unmarshal(data, &blocks); err != nil { + t.Fatalf("unmarshal %s: %v", path, err) + } + return blocks +} + +func TestRawBlocksParity_SimpleText(t *testing.T) { + data, err := os.ReadFile("testdata/docxs/simple_text.docx") + if err != nil { + t.Fatal(err) + } + got, err := ExtractRawBlocks(data) + if err != nil { + t.Fatalf("ExtractRawBlocks: %v", err) + } + want := loadPythonBlocks(t, "testdata/output/py/docx/simple_text_blocks.json") + + if len(got) != len(want) { + t.Errorf("block count: got %d, want %d", len(got), len(want)) + } + for i := 0; i < min(len(got), len(want)); i++ { + if got[i].Type != want[i].Type { + t.Errorf("block[%d].type: got %q, want %q", i, got[i].Type, want[i].Type) + } + if got[i].Text != 
want[i].Text { + t.Errorf("block[%d].text: got %q, want %q", i, got[i].Text, want[i].Text) + } + } + if t.Failed() { + t.Logf("Go blocks: %+v", got) + t.Logf("Py blocks: %+v", want) + } +} + +func TestRawBlocksParity_WithTable(t *testing.T) { + data, err := os.ReadFile("testdata/docxs/with_table.docx") + if err != nil { + t.Fatal(err) + } + got, err := ExtractRawBlocks(data) + if err != nil { + t.Fatalf("ExtractRawBlocks: %v", err) + } + want := loadPythonBlocks(t, "testdata/output/py/docx/with_table_blocks.json") + + if len(got) != len(want) { + t.Errorf("block count: got %d, want %d", len(got), len(want)) + } + for i := 0; i < min(len(got), len(want)); i++ { + if got[i].Type != want[i].Type { + t.Errorf("block[%d].type: got %q, want %q", i, got[i].Type, want[i].Type) + } + } + if t.Failed() { + t.Logf("Go blocks: %+v", got) + t.Logf("Py blocks: %+v", want) + } +} + +func TestRawBlocksParity_WithImage(t *testing.T) { + data, err := os.ReadFile("testdata/docxs/with_image.docx") + if err != nil { + t.Fatal(err) + } + got, err := ExtractRawBlocks(data) + if err != nil { + t.Fatalf("ExtractRawBlocks: %v", err) + } + // Engine-level difference: python-docx embeds images inside empty + // paragraph blocks; office_oxide represents them as separate elements. + // Both engines must see "Before" and "After" text and at least one + // image-related block. + hasBefore, hasAfter, hasImage := false, false, false + for _, b := range got { + if b.Text != "" { + hasBefore = hasBefore || b.Text == "Before the image." + hasAfter = hasAfter || b.Text == "After the image." + } + if b.Image != "" { + hasImage = true + } + } + if !hasBefore { + t.Error("missing 'Before the image.' text") + } + if !hasAfter { + t.Error("missing 'After the image.' 
text") + } + if !hasImage { + t.Log("office_oxide IR does not expose embedded images as top-level blocks") + } +} + +func TestRawBlocksParity_MultiSection(t *testing.T) { + data, err := os.ReadFile("testdata/docxs/multi_section.docx") + if err != nil { + t.Fatal(err) + } + got, err := ExtractRawBlocks(data) + if err != nil { + t.Fatalf("ExtractRawBlocks: %v", err) + } + want := loadPythonBlocks(t, "testdata/output/py/docx/multi_section_blocks.json") + if len(got) != len(want) { + t.Errorf("block count: got %d, want %d", len(got), len(want)) + } + for i := 0; i < min(len(got), len(want)); i++ { + if got[i].Type != want[i].Type { + t.Errorf("block[%d].type: got %q, want %q", i, got[i].Type, want[i].Type) + } + } +} + +func TestRawBlocksParity_NestedHeadings(t *testing.T) { + data, err := os.ReadFile("testdata/docxs/nested_headings.docx") + if err != nil { + t.Fatal(err) + } + got, err := ExtractRawBlocks(data) + if err != nil { + t.Fatalf("ExtractRawBlocks: %v", err) + } + want := loadPythonBlocks(t, "testdata/output/py/docx/nested_headings_blocks.json") + if len(got) != len(want) { + t.Errorf("block count: got %d, want %d", len(got), len(want)) + } + headings := 0 + for _, b := range got { + if strings.HasPrefix(b.Style, "Heading") { + headings++ + } + } + if headings != 5 { + t.Errorf("expected 5 headings, got %d", headings) + } +} + +func TestRawBlocksParity_WithCaption(t *testing.T) { + data, err := os.ReadFile("testdata/docxs/with_caption.docx") + if err != nil { + t.Fatal(err) + } + got, err := ExtractRawBlocks(data) + if err != nil { + t.Fatalf("ExtractRawBlocks: %v", err) + } + // Verify both engines see the same number of blocks + want := loadPythonBlocks(t, "testdata/output/py/docx/with_caption_blocks.json") + if len(got) != len(want) { + t.Errorf("block count: got %d, want %d", len(got), len(want)) + } +} + +func TestRawBlocksParity_Empty(t *testing.T) { + data, err := os.ReadFile("testdata/docxs/empty.docx") + if err != nil { + t.Fatal(err) + } + got, err 
:= ExtractRawBlocks(data) + if err != nil { + t.Fatalf("ExtractRawBlocks: %v", err) + } + if len(got) != 0 { + t.Errorf("empty docx: expected 0 blocks, got %d", len(got)) + } +} diff --git a/internal/deepdoc/parser/docx/reader.go b/internal/deepdoc/parser/docx/reader.go new file mode 100644 index 0000000000..b3a5c3677d --- /dev/null +++ b/internal/deepdoc/parser/docx/reader.go @@ -0,0 +1,108 @@ +//go:build cgo + +package docx + +import ( + "encoding/base64" + "encoding/json" + "fmt" + "strconv" + "strings" + + officeOxide "github.com/yfedoseev/office_oxide/go" +) + +// ExtractRawBlocks opens a DOCX via office_oxide and extracts blocks in +// document order, matching the format produced by python-docx's +// _element.body iteration. +func ExtractRawBlocks(data []byte) ([]RawBlock, error) { + doc, err := officeOxide.OpenFromBytes(data, "docx") + if err != nil { + return nil, fmt.Errorf("office_oxide open: %w", err) + } + defer doc.Close() + + irJSON, err := doc.ToIRJSON() + if err != nil { + return nil, fmt.Errorf("ToIRJSON: %w", err) + } + + var ir irDocument + if err := json.Unmarshal([]byte(irJSON), &ir); err != nil { + return nil, fmt.Errorf("parse IR JSON: %w", err) + } + + var blocks []RawBlock + for _, sec := range ir.Sections { + for _, el := range sec.Elements { + block := irElementToBlock(el) + blocks = append(blocks, block) + } + } + return blocks, nil +} + +func irElementToBlock(el irElement) RawBlock { + switch el.Type { + case "table": + rows := make([][]string, len(el.Rows)) + for ri, row := range el.Rows { + cells := make([]string, len(row.Cells)) + for ci, cell := range row.Cells { + cells[ci] = joinElements(cell.Content) + } + rows[ri] = cells + } + return RawBlock{Type: "table", Rows: rows} + + case "heading": + text := joinRuns(el.Content) + level := strconv.Itoa(el.Level) + return RawBlock{ + Type: "paragraph", + Text: text, + Style: "Heading " + level, + } + + case "image": + return RawBlock{ + Type: "image", + Image: 
base64.StdEncoding.EncodeToString(el.Data), + } + + default: // "paragraph" and anything else + style := el.Style + if style == "" { + style = "Normal" + } + return RawBlock{ + Type: "paragraph", + Text: joinRuns(el.Content), + Style: style, + } + } +} + +func joinRuns(runs []irRun) string { + var b strings.Builder + for _, r := range runs { + if r.Type == "text" { + b.WriteString(r.Text) + } + } + return b.String() +} + +// joinElements extracts plain text from nested irElements (used for table cells). +// When multiple elements are present, a newline is inserted between each one +// to match python-docx _Cell.text behavior. +func joinElements(els []irElement) string { + var b strings.Builder + for i, el := range els { + if i > 0 { + b.WriteByte('\n') + } + b.WriteString(joinRuns(el.Content)) + } + return b.String() +} diff --git a/internal/deepdoc/parser/docx/reader_cell_test.go b/internal/deepdoc/parser/docx/reader_cell_test.go new file mode 100644 index 0000000000..4183cf378c --- /dev/null +++ b/internal/deepdoc/parser/docx/reader_cell_test.go @@ -0,0 +1,38 @@ +//go:build cgo + +package docx + +import "testing" + +func TestJoinElements_MultiParagraphCell(t *testing.T) { + // When a table cell contains multiple paragraphs, joinElements must + // insert a newline between them to match python-docx _Cell.text behavior. + els := []irElement{ + {Type: "paragraph", Content: []irRun{{Type: "text", Text: "first line"}}}, + {Type: "paragraph", Content: []irRun{{Type: "text", Text: "second line"}}}, + } + got := joinElements(els) + want := "first line\nsecond line" + if got != want { + t.Errorf("joinElements:\ngot: %q\nwant: %q", got, want) + } +} + +func TestJoinElements_SingleElement(t *testing.T) { + // Single paragraph cell — no separator expected. 
+ els := []irElement{ + {Type: "paragraph", Content: []irRun{{Type: "text", Text: "single paragraph"}}}, + } + got := joinElements(els) + want := "single paragraph" + if got != want { + t.Errorf("joinElements:\ngot: %q\nwant: %q", got, want) + } +} + +func TestJoinElements_Empty(t *testing.T) { + got := joinElements(nil) + if got != "" { + t.Errorf("joinElements(nil): got %q, want empty", got) + } +} diff --git a/internal/deepdoc/parser/docx/reader_stub.go b/internal/deepdoc/parser/docx/reader_stub.go new file mode 100644 index 0000000000..ed7c993544 --- /dev/null +++ b/internal/deepdoc/parser/docx/reader_stub.go @@ -0,0 +1,11 @@ +//go:build !cgo + +package docx + +import "errors" + +// ExtractRawBlocks is not available without cgo because the underlying +// office_oxide library requires CGo. Rebuild with CGO_ENABLED=1. +func ExtractRawBlocks(_ []byte) ([]RawBlock, error) { + return nil, errors.New("office_oxide requires cgo; rebuild with CGO_ENABLED=1") +} diff --git a/internal/deepdoc/parser/docx/reader_style_test.go b/internal/deepdoc/parser/docx/reader_style_test.go new file mode 100644 index 0000000000..dd7b054ff6 --- /dev/null +++ b/internal/deepdoc/parser/docx/reader_style_test.go @@ -0,0 +1,54 @@ +//go:build cgo + +package docx + +import "testing" + +func TestIrElementToBlock_PreservesCustomStyle(t *testing.T) { + // irElementToBlock should preserve the Word style name from the IR, + // not hard-code "Normal" for every non-heading paragraph. + el := irElement{ + Type: "paragraph", + Style: "Caption", + Content: []irRun{ + {Type: "text", Text: "Figure 1: Architecture diagram"}, + }, + } + block := irElementToBlock(el) + + if block.Style != "Caption" { + t.Errorf("irElementToBlock with Style=%q:\ngot Style=%q\nwant Style=%q", + el.Style, block.Style, el.Style) + } +} + +func TestIrElementToBlock_PreservesHeadingStyle(t *testing.T) { + // Heading elements should still produce "Heading N" style. 
+ el := irElement{ + Type: "heading", + Level: 2, + Content: []irRun{ + {Type: "text", Text: "Section 2.1"}, + }, + } + block := irElementToBlock(el) + + if block.Style != "Heading 2" { + t.Errorf("heading: got Style=%q, want %q", block.Style, "Heading 2") + } +} + +func TestIrElementToBlock_FallsBackToNormal(t *testing.T) { + // When Style is empty, defaults to "Normal". + el := irElement{ + Type: "paragraph", + Content: []irRun{ + {Type: "text", Text: "plain text"}, + }, + } + block := irElementToBlock(el) + + if block.Style != "Normal" { + t.Errorf("empty style: got %q, want %q", block.Style, "Normal") + } +} diff --git a/internal/deepdoc/parser/docx/types.go b/internal/deepdoc/parser/docx/types.go new file mode 100644 index 0000000000..21c3c56959 --- /dev/null +++ b/internal/deepdoc/parser/docx/types.go @@ -0,0 +1,46 @@ +package docx + +// RawBlock represents a single block extracted from a DOCX file in document order. +// Type is one of "paragraph", "table", or "image". Headings are represented as +// Type "paragraph" with a Style of "Heading N". +type RawBlock struct { + Type string `json:"type"` // "paragraph", "table", or "image" + Text string `json:"text"` // paragraph text; empty for tables and images + Style string `json:"style"` // Word style name (e.g. "Normal", "Heading 1") + Image string `json:"image,omitempty"` // base64-encoded image data + Rows [][]string `json:"rows,omitempty"` // table rows; nil for paragraphs and images +} + +// ── office_oxide IR JSON types ──────────────────────────────────────── + +type irElement struct { + Type string `json:"type"` // "paragraph", "heading", "table", "image" + Level int `json:"level"` // heading level (1-6) + Style string `json:"style"` // Word style name (e.g. 
"Normal", "Caption", "Heading 1") + Content []irRun `json:"content"` // rich text runs + Data []byte `json:"data"` // raw image bytes (for "image" type) + Rows []irRow `json:"rows"` // table rows +} + +type irRun struct { + Type string `json:"type"` // "text", "image" + Text string `json:"text"` // plain text content + Content []irElement `json:"content"` // nested elements (used in table cells) +} + +type irRow struct { + Cells []irCell `json:"cells"` +} + +type irCell struct { + Content []irElement `json:"content"` // nested paragraphs inside table cell +} + +type irSection struct { + Title string `json:"title"` + Elements []irElement `json:"elements"` +} + +type irDocument struct { + Sections []irSection `json:"sections"` +}
AB12