Feat: add DOCX parser (#16521)

### Summary

Add DOCX parser - go.
This commit is contained in:
Jack
2026-07-02 16:31:09 +08:00
committed by GitHub
parent 9c8d8c7b83
commit c8cf0c967d
9 changed files with 864 additions and 0 deletions

View File

@@ -0,0 +1,60 @@
package docx
import (
"strings"
"ragflow/internal/deepdoc/parser/pdf/table"
doctype "ragflow/internal/deepdoc/parser/type"
)
// blocksToSections maps every raw DOCX block, in order, onto the shared
// Section type consumed by the framework layer. The per-block translation is
// delegated to blockToSection, so the result always holds exactly one Section
// per input block (and is an empty, non-nil slice for empty input).
func blocksToSections(blocks []RawBlock) []doctype.Section {
	out := make([]doctype.Section, len(blocks))
	for i := range blocks {
		out[i] = blockToSection(blocks[i])
	}
	return out
}
// blockToSection translates a single RawBlock into a Section.
//
//   - "table" blocks keep their rows in TableItem and carry an HTML rendering
//     of the same rows (table.SimpleRowsToHTML) in Text.
//   - "image" blocks carry only the Image payload.
//   - Every other block type is treated as text. Its LayoutType is "title"
//     when the style name starts with "heading" (case-insensitive) and "text"
//     otherwise.
func blockToSection(b RawBlock) doctype.Section {
	if b.Type == "table" {
		html := table.SimpleRowsToHTML(b.Rows)
		return doctype.Section{
			Text:       html,
			DocTypeKwd: "table",
			TableItem:  &doctype.TableItem{Rows: b.Rows},
		}
	}
	if b.Type == "image" {
		return doctype.Section{DocTypeKwd: "image", Image: b.Image}
	}

	// Anything that is neither a table nor an image is plain text; only the
	// layout type depends on the Word style.
	sec := doctype.Section{
		Text:       b.Text,
		DocTypeKwd: "text",
		LayoutType: "text",
	}
	if style := strings.ToLower(b.Style); strings.HasPrefix(style, "heading") {
		sec.LayoutType = "title"
	}
	return sec
}
// Parse turns the bytes of a DOCX file into a doctype.ParseResult.
// Raw blocks are obtained from ExtractRawBlocks and mapped to Sections by
// blocksToSections. An extraction error is returned unchanged together with
// a nil result. cfg is currently not consulted.
func Parse(data []byte, cfg doctype.ParserConfig) (*doctype.ParseResult, error) {
	rawBlocks, extractErr := ExtractRawBlocks(data)
	if extractErr != nil {
		return nil, extractErr
	}

	result := &doctype.ParseResult{}
	result.Sections = blocksToSections(rawBlocks)
	return result, nil
}

View File

@@ -0,0 +1,161 @@
//go:build cgo && manual
package docx
import (
"os"
"path/filepath"
"strings"
"testing"
doctype "ragflow/internal/deepdoc/parser/type"
)
// readFixture loads the named DOCX fixture from testdata/docxs (resolved
// against the current working directory) and returns its raw bytes.
func readFixture(name string) ([]byte, error) {
	fixturePath := filepath.Join("testdata", "docxs", name)
	return os.ReadFile(fixturePath)
}
// TestParse_Integration_MultiSection parses multi_section.docx end to end and
// checks the total section count plus the text of the title sections, in
// document order. The test is skipped when the fixture cannot be read.
func TestParse_Integration_MultiSection(t *testing.T) {
	data, err := readFixture("multi_section.docx")
	if err != nil {
		t.Skipf("fixture not available: %v", err)
	}
	result, err := Parse(data, doctype.DefaultParserConfig())
	if err != nil {
		t.Fatalf("Parse: %v", err)
	}

	if got := len(result.Sections); got != 7 {
		t.Errorf("multi_section.docx: want 7 sections, got %d", got)
	}

	// Collect the title sections first, then compare them positionally.
	wantHeadings := []string{"Chapter 1", "Section 1.1", "Chapter 2"}
	var gotHeadings []string
	for _, sec := range result.Sections {
		if sec.LayoutType == "title" {
			gotHeadings = append(gotHeadings, sec.Text)
		}
	}
	for i, text := range gotHeadings {
		if i >= len(wantHeadings) {
			// Surplus headings are reported by the count check below.
			break
		}
		if text != wantHeadings[i] {
			t.Errorf("heading[%d]: got %q, want %q", i, text, wantHeadings[i])
		}
	}
	if len(gotHeadings) != 3 {
		t.Errorf("expected 3 headings, found %d", len(gotHeadings))
	}
}
// TestParse_Integration_WithTable parses with_table.docx and verifies that the
// third section is a table with three rows, that its first cell is "Product",
// and that the section text holds the HTML rendering of the table.
// The test is skipped when the fixture cannot be read.
func TestParse_Integration_WithTable(t *testing.T) {
	data, err := readFixture("with_table.docx")
	if err != nil {
		t.Skipf("fixture not available: %v", err)
	}
	result, err := Parse(data, doctype.DefaultParserConfig())
	if err != nil {
		t.Fatalf("Parse: %v", err)
	}
	if len(result.Sections) != 4 {
		t.Fatalf("want 4 sections, got %d", len(result.Sections))
	}

	tbl := result.Sections[2]
	// Stop here when the section is not a usable table: the checks below
	// dereference TableItem and would panic instead of reporting a failure.
	if tbl.DocTypeKwd != "table" {
		t.Fatalf("expected table section at index 2, got DocTypeKwd %q", tbl.DocTypeKwd)
	}
	if tbl.TableItem == nil {
		t.Fatal("table section at index 2 has a nil TableItem")
	}

	rows := tbl.TableItem.Rows
	if len(rows) != 3 {
		t.Errorf("expected 3 rows, got %d", len(rows))
	}
	// Guard the cell lookup so an empty table is a failure, not an index panic.
	switch {
	case len(rows) == 0 || len(rows[0]) == 0:
		t.Error("cell[0,0]: table has no cells")
	case rows[0][0] != "Product":
		t.Errorf("cell[0,0]: got %q", rows[0][0])
	}

	// Verify HTML table is rendered.
	if !strings.Contains(tbl.Text, "<table>") {
		t.Error("table Section.Text should contain HTML <table>")
	}
	if !strings.Contains(tbl.Text, "<th >Product</th>") {
		t.Errorf("table HTML missing header: %s", tbl.Text)
	}
}
// TestParse_Integration_WithImage parses with_image.docx and requires at least
// one section whose DocTypeKwd is "image" and whose Image payload is non-empty.
// The test is skipped when the fixture cannot be read.
func TestParse_Integration_WithImage(t *testing.T) {
	data, err := readFixture("with_image.docx")
	if err != nil {
		t.Skipf("fixture not available: %v", err)
	}
	result, err := Parse(data, doctype.DefaultParserConfig())
	if err != nil {
		t.Fatalf("Parse: %v", err)
	}

	imageSections := 0
	for i := range result.Sections {
		sec := &result.Sections[i]
		if sec.DocTypeKwd != "image" || sec.Image == "" {
			continue
		}
		imageSections++
	}
	if imageSections == 0 {
		t.Error("expected at least one image section")
	}
}
// TestParse_Integration_NestedHeadings parses nested_headings.docx and expects
// exactly five sections, all of them laid out as titles.
// The test is skipped when the fixture cannot be read.
func TestParse_Integration_NestedHeadings(t *testing.T) {
	data, err := readFixture("nested_headings.docx")
	if err != nil {
		t.Skipf("fixture not available: %v", err)
	}
	result, err := Parse(data, doctype.DefaultParserConfig())
	if err != nil {
		t.Fatalf("Parse: %v", err)
	}

	if n := len(result.Sections); n != 5 {
		t.Fatalf("want 5 sections, got %d", n)
	}

	titleCount := 0
	for i := range result.Sections {
		if result.Sections[i].LayoutType != "title" {
			continue
		}
		titleCount++
	}
	if titleCount != 5 {
		t.Errorf("expected 5 titles, got %d", titleCount)
	}
}
// TestParse_Integration_WithCaption parses with_caption.docx, whose expected
// block order is [figure caption] [body text] [2x2 table] [table caption].
// It checks that the figure caption is plain text, that the table section
// carries the rendered HTML cells, and that the table caption follows it.
// The test is skipped when the fixture cannot be read.
func TestParse_Integration_WithCaption(t *testing.T) {
	data, err := readFixture("with_caption.docx")
	if err != nil {
		t.Skipf("fixture not available: %v", err)
	}
	result, err := Parse(data, doctype.DefaultParserConfig())
	if err != nil {
		t.Fatalf("Parse: %v", err)
	}
	secs := result.Sections
	if len(secs) != 4 {
		t.Fatalf("want 4 sections, got %d", len(secs))
	}
	figureCaption, tableSec, tableCaption := secs[0], secs[2], secs[3]

	// The figure caption is ordinary text, never a title.
	if figureCaption.LayoutType != "text" {
		t.Errorf("figure caption: got LayoutType %q", figureCaption.LayoutType)
	}
	if !strings.Contains(figureCaption.Text, "Figure 1") {
		t.Errorf("figure caption text: %q", figureCaption.Text)
	}

	// The table section must be rendered as HTML.
	if tableSec.DocTypeKwd != "table" {
		t.Errorf("table section: DocTypeKwd=%q", tableSec.DocTypeKwd)
	}
	html := tableSec.Text
	if !strings.Contains(html, "<table>") {
		t.Fatal("table section missing <table> HTML")
	}
	// hasAll reports whether every fragment occurs in the rendered HTML.
	hasAll := func(fragments ...string) bool {
		for _, fragment := range fragments {
			if !strings.Contains(html, fragment) {
				return false
			}
		}
		return true
	}
	if !hasAll("<th >A</th>", "<th >B</th>") {
		t.Errorf("table header cells: %s", html)
	}
	if !hasAll("<td >1</td>", "<td >2</td>") {
		t.Errorf("table data cells: %s", html)
	}

	// The table caption comes right after the table.
	if !strings.Contains(tableCaption.Text, "Table 1") {
		t.Errorf("table caption text: %q", tableCaption.Text)
	}
}

View File

@@ -0,0 +1,202 @@
package docx
import (
"testing"
)
func TestBlocksToSections_Paragraph(t *testing.T) {
blocks := []RawBlock{
{Type: "paragraph", Text: "hello world", Style: "Normal"},
}
sections := blocksToSections(blocks)
if len(sections) != 1 {
t.Fatalf("want 1 section, got %d", len(sections))
}
s := sections[0]
if s.Text != "hello world" {
t.Errorf("Text: got %q, want %q", s.Text, "hello world")
}
if s.DocTypeKwd != "text" {
t.Errorf("DocTypeKwd: got %q, want %q", s.DocTypeKwd, "text")
}
}
func TestBlocksToSections_Headings(t *testing.T) {
blocks := []RawBlock{
{Type: "paragraph", Text: "Main Title", Style: "Heading 1"},
{Type: "paragraph", Text: "Sub Title", Style: "Heading 2"},
{Type: "paragraph", Text: "Deep", Style: "Heading 3"},
{Type: "paragraph", Text: "Plain", Style: "Normal"},
}
sections := blocksToSections(blocks)
if len(sections) != 4 {
t.Fatalf("want 4 sections, got %d", len(sections))
}
if sections[0].LayoutType != "title" {
t.Errorf("[0] LayoutType: got %q, want %q", sections[0].LayoutType, "title")
}
if sections[1].LayoutType != "title" {
t.Errorf("[1] LayoutType: got %q, want %q", sections[1].LayoutType, "title")
}
if sections[2].LayoutType != "title" {
t.Errorf("[2] LayoutType: got %q, want %q", sections[2].LayoutType, "title")
}
// Normal paragraph is NOT a title
if sections[3].LayoutType != "text" {
t.Errorf("[3] LayoutType: got %q, want %q", sections[3].LayoutType, "text")
}
}
func TestBlocksToSections_Table(t *testing.T) {
blocks := []RawBlock{
{Type: "table", Rows: [][]string{
{"Name", "Age"},
{"Alice", "30"},
}},
}
sections := blocksToSections(blocks)
if len(sections) != 1 {
t.Fatalf("want 1 section, got %d", len(sections))
}
s := sections[0]
if s.DocTypeKwd != "table" {
t.Errorf("DocTypeKwd: got %q, want %q", s.DocTypeKwd, "table")
}
if s.TableItem == nil {
t.Fatal("TableItem is nil")
}
if len(s.TableItem.Rows) != 2 {
t.Errorf("Rows: want 2, got %d", len(s.TableItem.Rows))
}
if s.Text == "" {
t.Error("Text: expected rendered HTML, got empty string")
}
}
// TestBlocksToSections_EmptyInput checks that nil input yields no sections.
func TestBlocksToSections_EmptyInput(t *testing.T) {
	if n := len(blocksToSections(nil)); n != 0 {
		t.Errorf("want 0 sections, got %d", n)
	}
}
func TestBlocksToSections_DocumentOrder(t *testing.T) {
blocks := []RawBlock{
{Type: "paragraph", Text: "first", Style: "Normal"},
{Type: "table", Rows: [][]string{{"a"}}},
{Type: "paragraph", Text: "second", Style: "Normal"},
{Type: "paragraph", Text: "third", Style: "Heading 1"},
}
sections := blocksToSections(blocks)
if len(sections) != 4 {
t.Fatalf("want 4 sections, got %d", len(sections))
}
if sections[0].Text != "first" {
t.Errorf("order[0]: got %q", sections[0].Text)
}
if sections[1].DocTypeKwd != "table" {
t.Errorf("order[1]: expected table")
}
if sections[2].Text != "second" {
t.Errorf("order[2]: got %q", sections[2].Text)
}
if sections[3].Text != "third" {
t.Errorf("order[3]: got %q", sections[3].Text)
}
}
// TestBlocksToSections_CaptionStyle checks that a paragraph with the "Caption"
// style is laid out as plain text rather than as a title.
func TestBlocksToSections_CaptionStyle(t *testing.T) {
	caption := RawBlock{Type: "paragraph", Text: "Table 1: Results", Style: "Caption"}
	sections := blocksToSections([]RawBlock{caption})
	if n := len(sections); n != 1 {
		t.Fatalf("want 1 section, got %d", n)
	}
	if got := sections[0].LayoutType; got != "text" {
		t.Errorf("Caption: LayoutType should be 'text', got %q", got)
	}
}
func TestBlocksToSections_MixedContent(t *testing.T) {
blocks := []RawBlock{
{Type: "paragraph", Text: "Title", Style: "Heading 1"},
{Type: "paragraph", Text: "Body text.", Style: "Normal"},
{Type: "table", Rows: [][]string{{"a", "b"}}},
{Type: "paragraph", Text: "More text.", Style: "Normal"},
}
sections := blocksToSections(blocks)
if len(sections) != 4 {
t.Fatalf("want 4 sections, got %d", len(sections))
}
if sections[0].LayoutType != "title" {
t.Errorf("[0] heading: got %q", sections[0].LayoutType)
}
if sections[1].LayoutType != "text" {
t.Errorf("[1] body: got %q", sections[1].LayoutType)
}
if sections[2].DocTypeKwd != "table" {
t.Errorf("[2] table: got %q", sections[2].DocTypeKwd)
}
if sections[3].DocTypeKwd != "text" {
t.Errorf("[3] text after table: got %q", sections[3].DocTypeKwd)
}
}
func TestBlocksToSections_Image(t *testing.T) {
blocks := []RawBlock{
{Type: "image", Image: "iVBORw0KGgoAAAANSUhEUg=="},
}
sections := blocksToSections(blocks)
if len(sections) != 1 {
t.Fatalf("want 1 section, got %d", len(sections))
}
if sections[0].DocTypeKwd != "image" {
t.Errorf("DocTypeKwd: got %q, want %q", sections[0].DocTypeKwd, "image")
}
if sections[0].Image != "iVBORw0KGgoAAAANSUhEUg==" {
t.Error("Image base64 not preserved")
}
}
func TestBlocksToSections_ImageBetweenText(t *testing.T) {
blocks := []RawBlock{
{Type: "paragraph", Text: "before", Style: "Normal"},
{Type: "image", Image: "b64data"},
{Type: "paragraph", Text: "after", Style: "Normal"},
}
sections := blocksToSections(blocks)
if len(sections) != 3 {
t.Fatalf("want 3 sections, got %d", len(sections))
}
if sections[0].DocTypeKwd != "text" || sections[0].Text != "before" {
t.Error("wrong text section before image")
}
if sections[1].DocTypeKwd != "image" {
t.Errorf("image section: got DocTypeKwd %q", sections[1].DocTypeKwd)
}
if sections[2].DocTypeKwd != "text" || sections[2].Text != "after" {
t.Error("wrong text section after image")
}
}
// TestBlocksToSections_NestedHeadings checks that heading paragraphs of
// different levels are all laid out as titles.
func TestBlocksToSections_NestedHeadings(t *testing.T) {
	blocks := []RawBlock{
		{Type: "paragraph", Text: "H1", Style: "Heading 1"},
		{Type: "paragraph", Text: "H2", Style: "Heading 2"},
		{Type: "paragraph", Text: "H3", Style: "Heading 3"},
	}
	sections := blocksToSections(blocks)
	// Verify the count first so a regression fails cleanly instead of
	// panicking with an index-out-of-range below.
	if len(sections) != len(blocks) {
		t.Fatalf("want %d sections, got %d", len(blocks), len(sections))
	}
	for i, sec := range sections {
		if sec.LayoutType != "title" {
			t.Errorf("[%d] got %q, want %q", i, sec.LayoutType, "title")
		}
	}
}

View File

@@ -0,0 +1,184 @@
//go:build cgo && manual
package docx
import (
"encoding/json"
"os"
"strings"
"testing"
)
// loadPythonBlocks reads the JSON file at path and decodes it into a slice of
// RawBlock. Any read or decode failure aborts the calling test via t.Fatalf.
func loadPythonBlocks(t *testing.T, path string) []RawBlock {
	t.Helper()

	raw, readErr := os.ReadFile(path)
	if readErr != nil {
		t.Fatalf("read %s: %v", path, readErr)
	}

	var decoded []RawBlock
	if decodeErr := json.Unmarshal(raw, &decoded); decodeErr != nil {
		t.Fatalf("unmarshal %s: %v", path, decodeErr)
	}
	return decoded
}
// TestRawBlocksParity_SimpleText compares the blocks extracted from
// simple_text.docx with the stored Python reference output: block count, then
// type and text of every block both sides have. Both block lists are logged
// when any check fails.
func TestRawBlocksParity_SimpleText(t *testing.T) {
	docx, err := os.ReadFile("testdata/docxs/simple_text.docx")
	if err != nil {
		t.Fatal(err)
	}
	goBlocks, err := ExtractRawBlocks(docx)
	if err != nil {
		t.Fatalf("ExtractRawBlocks: %v", err)
	}
	pyBlocks := loadPythonBlocks(t, "testdata/output/py/docx/simple_text_blocks.json")

	if len(goBlocks) != len(pyBlocks) {
		t.Errorf("block count: got %d, want %d", len(goBlocks), len(pyBlocks))
	}

	// Only compare the positions present on both sides.
	shared := min(len(goBlocks), len(pyBlocks))
	for i, g := range goBlocks[:shared] {
		p := pyBlocks[i]
		if g.Type != p.Type {
			t.Errorf("block[%d].type: got %q, want %q", i, g.Type, p.Type)
		}
		if g.Text != p.Text {
			t.Errorf("block[%d].text: got %q, want %q", i, g.Text, p.Text)
		}
	}

	if !t.Failed() {
		return
	}
	t.Logf("Go blocks: %+v", goBlocks)
	t.Logf("Py blocks: %+v", pyBlocks)
}
// TestRawBlocksParity_WithTable compares the blocks extracted from
// with_table.docx with the stored Python reference output: block count and
// the type of every block both sides have. Both block lists are logged when
// any check fails.
func TestRawBlocksParity_WithTable(t *testing.T) {
	docx, err := os.ReadFile("testdata/docxs/with_table.docx")
	if err != nil {
		t.Fatal(err)
	}
	goBlocks, err := ExtractRawBlocks(docx)
	if err != nil {
		t.Fatalf("ExtractRawBlocks: %v", err)
	}
	pyBlocks := loadPythonBlocks(t, "testdata/output/py/docx/with_table_blocks.json")

	if len(goBlocks) != len(pyBlocks) {
		t.Errorf("block count: got %d, want %d", len(goBlocks), len(pyBlocks))
	}

	shared := min(len(goBlocks), len(pyBlocks))
	for i, g := range goBlocks[:shared] {
		if want := pyBlocks[i].Type; g.Type != want {
			t.Errorf("block[%d].type: got %q, want %q", i, g.Type, want)
		}
	}

	if !t.Failed() {
		return
	}
	t.Logf("Go blocks: %+v", goBlocks)
	t.Logf("Py blocks: %+v", pyBlocks)
}
// TestRawBlocksParity_WithImage extracts with_image.docx and requires the
// "Before the image." and "After the image." paragraphs to be present.
// A missing image block is only logged, not treated as a failure.
func TestRawBlocksParity_WithImage(t *testing.T) {
	docx, err := os.ReadFile("testdata/docxs/with_image.docx")
	if err != nil {
		t.Fatal(err)
	}
	got, err := ExtractRawBlocks(docx)
	if err != nil {
		t.Fatalf("ExtractRawBlocks: %v", err)
	}

	// Engine-level difference: python-docx embeds images inside empty
	// paragraph blocks; office_oxide represents them as separate elements.
	// No block-by-block comparison is made here; the test only looks for the
	// two text paragraphs and for any block carrying image data.
	var sawBefore, sawAfter, sawImage bool
	for i := range got {
		switch got[i].Text {
		case "Before the image.":
			sawBefore = true
		case "After the image.":
			sawAfter = true
		}
		if got[i].Image != "" {
			sawImage = true
		}
	}

	if !sawBefore {
		t.Error("missing 'Before the image.' text")
	}
	if !sawAfter {
		t.Error("missing 'After the image.' text")
	}
	if !sawImage {
		t.Log("office_oxide IR does not expose embedded images as top-level blocks")
	}
}
// TestRawBlocksParity_MultiSection compares the blocks extracted from
// multi_section.docx with the stored Python reference output: block count and
// the type of every block both sides have.
func TestRawBlocksParity_MultiSection(t *testing.T) {
	docx, err := os.ReadFile("testdata/docxs/multi_section.docx")
	if err != nil {
		t.Fatal(err)
	}
	goBlocks, err := ExtractRawBlocks(docx)
	if err != nil {
		t.Fatalf("ExtractRawBlocks: %v", err)
	}
	pyBlocks := loadPythonBlocks(t, "testdata/output/py/docx/multi_section_blocks.json")

	if len(goBlocks) != len(pyBlocks) {
		t.Errorf("block count: got %d, want %d", len(goBlocks), len(pyBlocks))
	}

	shared := min(len(goBlocks), len(pyBlocks))
	for i, g := range goBlocks[:shared] {
		if want := pyBlocks[i].Type; g.Type != want {
			t.Errorf("block[%d].type: got %q, want %q", i, g.Type, want)
		}
	}
}
// TestRawBlocksParity_NestedHeadings extracts nested_headings.docx, compares
// the block count with the stored Python reference output, and expects exactly
// five blocks whose style starts with "Heading".
func TestRawBlocksParity_NestedHeadings(t *testing.T) {
	docx, err := os.ReadFile("testdata/docxs/nested_headings.docx")
	if err != nil {
		t.Fatal(err)
	}
	goBlocks, err := ExtractRawBlocks(docx)
	if err != nil {
		t.Fatalf("ExtractRawBlocks: %v", err)
	}
	pyBlocks := loadPythonBlocks(t, "testdata/output/py/docx/nested_headings_blocks.json")

	if len(goBlocks) != len(pyBlocks) {
		t.Errorf("block count: got %d, want %d", len(goBlocks), len(pyBlocks))
	}

	headingCount := 0
	for i := range goBlocks {
		if !strings.HasPrefix(goBlocks[i].Style, "Heading") {
			continue
		}
		headingCount++
	}
	if headingCount != 5 {
		t.Errorf("expected 5 headings, got %d", headingCount)
	}
}
// TestRawBlocksParity_WithCaption extracts with_caption.docx and checks that
// the number of blocks equals the number in the stored Python reference output.
func TestRawBlocksParity_WithCaption(t *testing.T) {
	docx, err := os.ReadFile("testdata/docxs/with_caption.docx")
	if err != nil {
		t.Fatal(err)
	}
	goBlocks, err := ExtractRawBlocks(docx)
	if err != nil {
		t.Fatalf("ExtractRawBlocks: %v", err)
	}

	// Only the block counts are compared for this fixture.
	pyBlocks := loadPythonBlocks(t, "testdata/output/py/docx/with_caption_blocks.json")
	if gotN, wantN := len(goBlocks), len(pyBlocks); gotN != wantN {
		t.Errorf("block count: got %d, want %d", gotN, wantN)
	}
}
// TestRawBlocksParity_Empty extracts empty.docx and expects no blocks at all.
func TestRawBlocksParity_Empty(t *testing.T) {
	docx, err := os.ReadFile("testdata/docxs/empty.docx")
	if err != nil {
		t.Fatal(err)
	}
	blocks, err := ExtractRawBlocks(docx)
	if err != nil {
		t.Fatalf("ExtractRawBlocks: %v", err)
	}
	if n := len(blocks); n != 0 {
		t.Errorf("empty docx: expected 0 blocks, got %d", n)
	}
}

View File

@@ -0,0 +1,108 @@
//go:build cgo
package docx
import (
"encoding/base64"
"encoding/json"
"fmt"
"strconv"
"strings"
officeOxide "github.com/yfedoseev/office_oxide/go"
)
// ExtractRawBlocks opens a DOCX document from memory via office_oxide,
// decodes the library's IR JSON, and flattens the elements of all IR sections
// into RawBlocks in document order. The block layout is intended to mirror
// what the Python implementation produces by iterating python-docx's
// _element.body.
//
// Failures to open the document, to obtain the IR JSON, or to decode it are
// returned wrapped with a short prefix naming the failing step. A document
// without elements yields a nil slice and a nil error.
func ExtractRawBlocks(data []byte) ([]RawBlock, error) {
	doc, openErr := officeOxide.OpenFromBytes(data, "docx")
	if openErr != nil {
		return nil, fmt.Errorf("office_oxide open: %w", openErr)
	}
	defer doc.Close()

	irJSON, irErr := doc.ToIRJSON()
	if irErr != nil {
		return nil, fmt.Errorf("ToIRJSON: %w", irErr)
	}

	ir := irDocument{}
	if decodeErr := json.Unmarshal([]byte(irJSON), &ir); decodeErr != nil {
		return nil, fmt.Errorf("parse IR JSON: %w", decodeErr)
	}

	// Section boundaries are dropped: elements are appended in the order
	// the sections (and the elements within them) appear in the IR.
	var blocks []RawBlock
	for si := range ir.Sections {
		elements := ir.Sections[si].Elements
		for ei := range elements {
			blocks = append(blocks, irElementToBlock(elements[ei]))
		}
	}
	return blocks, nil
}
// irElementToBlock converts one IR element into a RawBlock.
//
//   - "table": every cell is flattened to plain text with joinElements.
//   - "image": the raw bytes are base64-encoded (standard encoding) into Image.
//   - "heading": becomes a paragraph whose Style is "Heading <level>".
//   - anything else: becomes a paragraph that keeps the element's style, or
//     "Normal" when the element has none.
func irElementToBlock(el irElement) RawBlock {
	if el.Type == "table" {
		rows := make([][]string, 0, len(el.Rows))
		for _, row := range el.Rows {
			cells := make([]string, 0, len(row.Cells))
			for _, cell := range row.Cells {
				cells = append(cells, joinElements(cell.Content))
			}
			rows = append(rows, cells)
		}
		return RawBlock{Type: "table", Rows: rows}
	}
	if el.Type == "image" {
		encoded := base64.StdEncoding.EncodeToString(el.Data)
		return RawBlock{Type: "image", Image: encoded}
	}

	// Headings and ordinary paragraphs both become "paragraph" blocks; they
	// differ only in how Style is chosen.
	para := RawBlock{Type: "paragraph", Text: joinRuns(el.Content)}
	switch {
	case el.Type == "heading":
		para.Style = "Heading " + strconv.Itoa(el.Level)
	case el.Style != "":
		para.Style = el.Style
	default:
		para.Style = "Normal"
	}
	return para
}
// joinRuns concatenates the text of all runs whose Type is "text", in order
// and without any separator. Runs of any other type are skipped.
func joinRuns(runs []irRun) string {
	parts := make([]string, 0, len(runs))
	for i := range runs {
		if runs[i].Type != "text" {
			continue
		}
		parts = append(parts, runs[i].Text)
	}
	return strings.Join(parts, "")
}
// joinElements flattens nested irElements (the content of a table cell) to
// plain text. Each element contributes the text of its runs, and consecutive
// elements are separated by a single newline, which is intended to mirror
// python-docx's _Cell.text. Empty input yields the empty string.
func joinElements(els []irElement) string {
	lines := make([]string, len(els))
	for idx := range els {
		lines[idx] = joinRuns(els[idx].Content)
	}
	return strings.Join(lines, "\n")
}

View File

@@ -0,0 +1,38 @@
//go:build cgo
package docx
import "testing"
// TestJoinElements_MultiParagraphCell checks that the paragraphs of a
// multi-paragraph table cell are joined with a newline between them.
func TestJoinElements_MultiParagraphCell(t *testing.T) {
	// paragraph builds a paragraph element holding a single text run.
	paragraph := func(text string) irElement {
		return irElement{Type: "paragraph", Content: []irRun{{Type: "text", Text: text}}}
	}

	const want = "first line\nsecond line"
	cell := []irElement{paragraph("first line"), paragraph("second line")}
	if got := joinElements(cell); got != want {
		t.Errorf("joinElements:\ngot: %q\nwant: %q", got, want)
	}
}
// TestJoinElements_SingleElement checks that a cell with a single paragraph is
// returned as-is, without any separator.
func TestJoinElements_SingleElement(t *testing.T) {
	const want = "single paragraph"
	cell := []irElement{
		{Type: "paragraph", Content: []irRun{{Type: "text", Text: want}}},
	}
	if got := joinElements(cell); got != want {
		t.Errorf("joinElements:\ngot: %q\nwant: %q", got, want)
	}
}
// TestJoinElements_Empty checks that nil input produces the empty string.
func TestJoinElements_Empty(t *testing.T) {
	if got := joinElements(nil); len(got) != 0 {
		t.Errorf("joinElements(nil): got %q, want empty", got)
	}
}

View File

@@ -0,0 +1,11 @@
//go:build !cgo
package docx
import "errors"
// ExtractRawBlocks is the stand-in used for builds without cgo. The real
// implementation depends on office_oxide, which needs CGo, so this version
// ignores its input and always returns a nil slice together with an error
// telling the caller to rebuild with CGO_ENABLED=1.
func ExtractRawBlocks(_ []byte) ([]RawBlock, error) {
	errNoCgo := errors.New("office_oxide requires cgo; rebuild with CGO_ENABLED=1")
	return nil, errNoCgo
}

View File

@@ -0,0 +1,54 @@
//go:build cgo
package docx
import "testing"
// TestIrElementToBlock_PreservesCustomStyle checks that a paragraph's Word
// style name from the IR is carried over to the block instead of being
// replaced by "Normal".
func TestIrElementToBlock_PreservesCustomStyle(t *testing.T) {
	const style = "Caption"
	el := irElement{
		Type:    "paragraph",
		Style:   style,
		Content: []irRun{{Type: "text", Text: "Figure 1: Architecture diagram"}},
	}
	if got := irElementToBlock(el).Style; got != style {
		t.Errorf("irElementToBlock with Style=%q:\ngot Style=%q\nwant Style=%q",
			el.Style, got, el.Style)
	}
}
// TestIrElementToBlock_PreservesHeadingStyle checks that a heading element of
// level 2 produces the style "Heading 2".
func TestIrElementToBlock_PreservesHeadingStyle(t *testing.T) {
	heading := irElement{
		Type:    "heading",
		Level:   2,
		Content: []irRun{{Type: "text", Text: "Section 2.1"}},
	}
	const want = "Heading 2"
	if got := irElementToBlock(heading).Style; got != want {
		t.Errorf("heading: got Style=%q, want %q", got, want)
	}
}
// TestIrElementToBlock_FallsBackToNormal checks that a paragraph without a
// style name gets the default style "Normal".
func TestIrElementToBlock_FallsBackToNormal(t *testing.T) {
	unstyled := irElement{
		Type:    "paragraph",
		Content: []irRun{{Type: "text", Text: "plain text"}},
	}
	const want = "Normal"
	if got := irElementToBlock(unstyled).Style; got != want {
		t.Errorf("empty style: got %q, want %q", got, want)
	}
}

View File

@@ -0,0 +1,46 @@
package docx
// RawBlock represents a single block extracted from a DOCX file in document order.
// Type is one of "paragraph", "table", or "image". Headings are represented as
// Type "paragraph" with a Style of "Heading N".
// The JSON tags are also used to decode reference block lists in tests.
type RawBlock struct {
Type string `json:"type"` // "paragraph", "table", or "image"
Text string `json:"text"` // paragraph text; empty for tables
Style string `json:"style"` // Word style name (e.g. "Normal", "Heading 1")
Image string `json:"image,omitempty"` // base64-encoded image data; set for "image" blocks
Rows [][]string `json:"rows,omitempty"` // table rows, one slice of cell texts per row; nil for paragraphs
}
// ── office_oxide IR JSON types ────────────────────────────────────────
// irElement is one element of the office_oxide IR JSON. Type selects which of
// the remaining fields are meaningful.
type irElement struct {
Type string `json:"type"` // "paragraph", "heading", "table", "image"
Level int `json:"level"` // heading level (1-6); read for "heading" elements
Style string `json:"style"` // Word style name (e.g. "Normal", "Caption", "Heading 1")
Content []irRun `json:"content"` // rich text runs
Data []byte `json:"data"` // raw image bytes (for "image" type)
Rows []irRow `json:"rows"` // table rows (for "table" type)
}
// irRun is a single run inside an irElement's Content.
type irRun struct {
Type string `json:"type"` // "text", "image"
Text string `json:"text"` // plain text content
// NOTE(review): joinRuns reads only Type and Text; Content is decoded but
// not consumed by the code in this package — confirm whether it is needed.
Content []irElement `json:"content"` // nested elements (used in table cells)
}
// irRow is one row of an IR table element.
type irRow struct {
Cells []irCell `json:"cells"` // cells of the row, in order
}
// irCell is one cell of an IR table row.
type irCell struct {
Content []irElement `json:"content"` // nested paragraphs inside table cell
}
// irSection is one section of the IR document: a title plus the elements it
// contains.
type irSection struct {
Title string `json:"title"` // section title as given in the IR JSON
Elements []irElement `json:"elements"` // elements of the section, in order
}
// irDocument is the top-level value decoded from the office_oxide IR JSON.
type irDocument struct {
Sections []irSection `json:"sections"` // sections of the document, in order
}