diff --git a/internal/deepdoc/parser/docx/parser.go b/internal/deepdoc/parser/docx/parser.go
new file mode 100644
index 0000000000..a13ed80cb4
--- /dev/null
+++ b/internal/deepdoc/parser/docx/parser.go
@@ -0,0 +1,60 @@
+package docx
+
+import (
+ "strings"
+
+ "ragflow/internal/deepdoc/parser/pdf/table"
+ doctype "ragflow/internal/deepdoc/parser/type"
+)
+
+// blocksToSections maps every raw DOCX block, in document order, onto the
+// shared Section type consumed by the framework layer. The per-block rules
+// live in blockToSection: headings become LayoutType "title", tables and
+// images get their own DocTypeKwd, and all remaining blocks are plain "text".
+// The result always has len(blocks) entries and is non-nil for empty input.
+func blocksToSections(blocks []RawBlock) []doctype.Section {
+	out := make([]doctype.Section, len(blocks))
+	for i := range blocks {
+		out[i] = blockToSection(blocks[i])
+	}
+	return out
+}
+
+// blockToSection converts a single RawBlock into a Section.
+//
+//   - Type "table": Text holds the HTML rendering of the rows and TableItem
+//     carries the same rows.
+//   - Type "image": only DocTypeKwd and Image are set.
+//   - any other Type: a "text" section whose LayoutType is "title" when the
+//     style name starts with "heading" (case-insensitive), otherwise "text".
+func blockToSection(b RawBlock) doctype.Section {
+	if b.Type == "table" {
+		html := table.SimpleRowsToHTML(b.Rows)
+		return doctype.Section{
+			Text:       html,
+			DocTypeKwd: "table",
+			TableItem:  &doctype.TableItem{Rows: b.Rows},
+		}
+	}
+	if b.Type == "image" {
+		return doctype.Section{DocTypeKwd: "image", Image: b.Image}
+	}
+
+	sec := doctype.Section{
+		Text:       b.Text,
+		DocTypeKwd: "text",
+		LayoutType: "text",
+	}
+	// Word heading styles are named "Heading 1", "Heading 2", ...
+	if strings.HasPrefix(strings.ToLower(b.Style), "heading") {
+		sec.LayoutType = "title"
+	}
+	return sec
+}
+
+// Parse turns the bytes of a DOCX file into a doctype.ParseResult.
+// Raw blocks are obtained from ExtractRawBlocks and then mapped to Sections;
+// an extraction error is returned unchanged. cfg is accepted for interface
+// compatibility and is not consulted here.
+func Parse(data []byte, cfg doctype.ParserConfig) (*doctype.ParseResult, error) {
+	rawBlocks, err := ExtractRawBlocks(data)
+	if err != nil {
+		return nil, err
+	}
+	result := &doctype.ParseResult{}
+	result.Sections = blocksToSections(rawBlocks)
+	return result, nil
+}
diff --git a/internal/deepdoc/parser/docx/parser_integration_test.go b/internal/deepdoc/parser/docx/parser_integration_test.go
new file mode 100644
index 0000000000..675e2e196f
--- /dev/null
+++ b/internal/deepdoc/parser/docx/parser_integration_test.go
@@ -0,0 +1,161 @@
+//go:build cgo && manual
+
+package docx
+
+import (
+ "os"
+ "path/filepath"
+ "strings"
+ "testing"
+
+ doctype "ragflow/internal/deepdoc/parser/type"
+)
+
+// readFixture loads the named DOCX fixture from the testdata/docxs directory
+// and returns its raw bytes, or the error from reading the file.
+func readFixture(name string) ([]byte, error) {
+	fixturePath := filepath.Join("testdata", "docxs", name)
+	return os.ReadFile(fixturePath)
+}
+
+// TestParse_Integration_MultiSection parses multi_section.docx and checks the
+// section count plus the text and number of title sections.
+func TestParse_Integration_MultiSection(t *testing.T) {
+	raw, err := readFixture("multi_section.docx")
+	if err != nil {
+		t.Skipf("fixture not available: %v", err)
+	}
+	parsed, err := Parse(raw, doctype.DefaultParserConfig())
+	if err != nil {
+		t.Fatalf("Parse: %v", err)
+	}
+	if n := len(parsed.Sections); n != 7 {
+		t.Errorf("multi_section.docx: want 7 sections, got %d", n)
+	}
+
+	// Titles must appear in this order; surplus titles are counted but not
+	// compared against anything.
+	wantTitles := []string{"Chapter 1", "Section 1.1", "Chapter 2"}
+	seen := 0
+	for _, sec := range parsed.Sections {
+		if sec.LayoutType != "title" {
+			continue
+		}
+		if seen < len(wantTitles) && sec.Text != wantTitles[seen] {
+			t.Errorf("heading[%d]: got %q, want %q", seen, sec.Text, wantTitles[seen])
+		}
+		seen++
+	}
+	if seen != 3 {
+		t.Errorf("expected 3 headings, found %d", seen)
+	}
+}
+
+func TestParse_Integration_WithTable(t *testing.T) {
+ data, err := readFixture("with_table.docx")
+ if err != nil {
+ t.Skipf("fixture not available: %v", err)
+ }
+ result, err := Parse(data, doctype.DefaultParserConfig())
+ if err != nil {
+ t.Fatalf("Parse: %v", err)
+ }
+ if len(result.Sections) != 4 {
+ t.Fatalf("want 4 sections, got %d", len(result.Sections))
+ }
+ if result.Sections[2].DocTypeKwd != "table" {
+ t.Error("expected table section at index 2")
+ }
+ if len(result.Sections[2].TableItem.Rows) != 3 {
+ t.Errorf("expected 3 rows, got %d", len(result.Sections[2].TableItem.Rows))
+ }
+ if result.Sections[2].TableItem.Rows[0][0] != "Product" {
+ t.Errorf("cell[0,0]: got %q", result.Sections[2].TableItem.Rows[0][0])
+ }
+ // Verify HTML table is rendered.
+ if !strings.Contains(result.Sections[2].Text, "
") {
+ t.Error("table Section.Text should contain HTML ")
+ }
+ if !strings.Contains(result.Sections[2].Text, "| Product | ") {
+ t.Errorf("table HTML missing header: %s", result.Sections[2].Text)
+ }
+}
+
+// TestParse_Integration_WithImage parses with_image.docx and requires at least
+// one section marked "image" that carries non-empty image data.
+func TestParse_Integration_WithImage(t *testing.T) {
+	raw, err := readFixture("with_image.docx")
+	if err != nil {
+		t.Skipf("fixture not available: %v", err)
+	}
+	parsed, err := Parse(raw, doctype.DefaultParserConfig())
+	if err != nil {
+		t.Fatalf("Parse: %v", err)
+	}
+	for _, sec := range parsed.Sections {
+		if sec.DocTypeKwd == "image" && sec.Image != "" {
+			return // one populated image section is enough
+		}
+	}
+	t.Error("expected at least one image section")
+}
+
+func TestParse_Integration_NestedHeadings(t *testing.T) {
+ data, err := readFixture("nested_headings.docx")
+ if err != nil {
+ t.Skipf("fixture not available: %v", err)
+ }
+ result, err := Parse(data, doctype.DefaultParserConfig())
+ if err != nil {
+ t.Fatalf("Parse: %v", err)
+ }
+ if len(result.Sections) != 5 {
+ t.Fatalf("want 5 sections, got %d", len(result.Sections))
+ }
+ titles := 0
+ for _, s := range result.Sections {
+ if s.LayoutType == "title" {
+ titles++
+ }
+ }
+ if titles != 5 {
+ t.Errorf("expected 5 titles, got %d", titles)
+ }
+}
+
+func TestParse_Integration_WithCaption(t *testing.T) {
+ data, err := readFixture("with_caption.docx")
+ if err != nil {
+ t.Skipf("fixture not available: %v", err)
+ }
+ result, err := Parse(data, doctype.DefaultParserConfig())
+ if err != nil {
+ t.Fatalf("Parse: %v", err)
+ }
+ if len(result.Sections) != 4 {
+ t.Fatalf("want 4 sections, got %d", len(result.Sections))
+ }
+
+ // Block order: [Figure caption] [body text] [2x2 table] [Table caption]
+ // Figure caption (index 0) is text, not title.
+ if result.Sections[0].LayoutType != "text" {
+ t.Errorf("figure caption: got LayoutType %q", result.Sections[0].LayoutType)
+ }
+ if !strings.Contains(result.Sections[0].Text, "Figure 1") {
+ t.Errorf("figure caption text: %q", result.Sections[0].Text)
+ }
+
+ // Table section (index 2) must have HTML rendering.
+ s := result.Sections[2]
+ if s.DocTypeKwd != "table" {
+ t.Errorf("table section: DocTypeKwd=%q", s.DocTypeKwd)
+ }
+ if !strings.Contains(s.Text, "") {
+ t.Fatal("table section missing HTML")
+ }
+ if !strings.Contains(s.Text, "| A | ") || !strings.Contains(s.Text, "B | ") {
+ t.Errorf("table header cells: %s", s.Text)
+ }
+ if !strings.Contains(s.Text, "1 | ") || !strings.Contains(s.Text, "2 | ") {
+ t.Errorf("table data cells: %s", s.Text)
+ }
+
+ // Table caption (index 3) follows the table.
+ if !strings.Contains(result.Sections[3].Text, "Table 1") {
+ t.Errorf("table caption text: %q", result.Sections[3].Text)
+ }
+}
diff --git a/internal/deepdoc/parser/docx/parser_test.go b/internal/deepdoc/parser/docx/parser_test.go
new file mode 100644
index 0000000000..60dcf07473
--- /dev/null
+++ b/internal/deepdoc/parser/docx/parser_test.go
@@ -0,0 +1,202 @@
+package docx
+
+import (
+ "testing"
+)
+
+// TestBlocksToSections_Paragraph checks that a Normal paragraph keeps its text
+// and is tagged as a "text" section.
+func TestBlocksToSections_Paragraph(t *testing.T) {
+	got := blocksToSections([]RawBlock{
+		{Type: "paragraph", Text: "hello world", Style: "Normal"},
+	})
+	if len(got) != 1 {
+		t.Fatalf("want 1 section, got %d", len(got))
+	}
+
+	if text := got[0].Text; text != "hello world" {
+		t.Errorf("Text: got %q, want %q", text, "hello world")
+	}
+	if kwd := got[0].DocTypeKwd; kwd != "text" {
+		t.Errorf("DocTypeKwd: got %q, want %q", kwd, "text")
+	}
+}
+
+// TestBlocksToSections_Headings checks that "Heading N" styles map to
+// LayoutType "title" while a Normal paragraph stays "text".
+func TestBlocksToSections_Headings(t *testing.T) {
+	got := blocksToSections([]RawBlock{
+		{Type: "paragraph", Text: "Main Title", Style: "Heading 1"},
+		{Type: "paragraph", Text: "Sub Title", Style: "Heading 2"},
+		{Type: "paragraph", Text: "Deep", Style: "Heading 3"},
+		{Type: "paragraph", Text: "Plain", Style: "Normal"},
+	})
+	if len(got) != 4 {
+		t.Fatalf("want 4 sections, got %d", len(got))
+	}
+
+	// The first three are headings; the trailing Normal paragraph is not.
+	wantLayouts := []string{"title", "title", "title", "text"}
+	for i, want := range wantLayouts {
+		if layout := got[i].LayoutType; layout != want {
+			t.Errorf("[%d] LayoutType: got %q, want %q", i, layout, want)
+		}
+	}
+}
+
+// TestBlocksToSections_Table checks that a table block yields a "table"
+// section with a populated TableItem and a non-empty rendered Text.
+func TestBlocksToSections_Table(t *testing.T) {
+	rows := [][]string{
+		{"Name", "Age"},
+		{"Alice", "30"},
+	}
+	got := blocksToSections([]RawBlock{{Type: "table", Rows: rows}})
+	if len(got) != 1 {
+		t.Fatalf("want 1 section, got %d", len(got))
+	}
+
+	tbl := got[0]
+	if tbl.DocTypeKwd != "table" {
+		t.Errorf("DocTypeKwd: got %q, want %q", tbl.DocTypeKwd, "table")
+	}
+	if tbl.TableItem == nil {
+		t.Fatal("TableItem is nil")
+	}
+	if n := len(tbl.TableItem.Rows); n != 2 {
+		t.Errorf("Rows: want 2, got %d", n)
+	}
+	if tbl.Text == "" {
+		t.Error("Text: expected rendered HTML, got empty string")
+	}
+}
+
+// TestBlocksToSections_EmptyInput checks that nil input produces no sections.
+func TestBlocksToSections_EmptyInput(t *testing.T) {
+	if n := len(blocksToSections(nil)); n != 0 {
+		t.Errorf("want 0 sections, got %d", n)
+	}
+}
+
+// TestBlocksToSections_DocumentOrder checks that sections come out in the same
+// order as the input blocks, including a table placed between paragraphs.
+func TestBlocksToSections_DocumentOrder(t *testing.T) {
+	input := []RawBlock{
+		{Type: "paragraph", Text: "first", Style: "Normal"},
+		{Type: "table", Rows: [][]string{{"a"}}},
+		{Type: "paragraph", Text: "second", Style: "Normal"},
+		{Type: "paragraph", Text: "third", Style: "Heading 1"},
+	}
+	got := blocksToSections(input)
+	if len(got) != 4 {
+		t.Fatalf("want 4 sections, got %d", len(got))
+	}
+
+	first, second, third, fourth := got[0], got[1], got[2], got[3]
+	if first.Text != "first" {
+		t.Errorf("order[0]: got %q", first.Text)
+	}
+	if second.DocTypeKwd != "table" {
+		t.Errorf("order[1]: expected table")
+	}
+	if third.Text != "second" {
+		t.Errorf("order[2]: got %q", third.Text)
+	}
+	if fourth.Text != "third" {
+		t.Errorf("order[3]: got %q", fourth.Text)
+	}
+}
+
+// TestBlocksToSections_CaptionStyle checks that a "Caption" styled paragraph
+// is treated as ordinary text rather than a title.
+func TestBlocksToSections_CaptionStyle(t *testing.T) {
+	caption := RawBlock{Type: "paragraph", Text: "Table 1: Results", Style: "Caption"}
+	got := blocksToSections([]RawBlock{caption})
+	if len(got) != 1 {
+		t.Fatalf("want 1 section, got %d", len(got))
+	}
+	if layout := got[0].LayoutType; layout != "text" {
+		t.Errorf("Caption: LayoutType should be 'text', got %q", layout)
+	}
+}
+
+// TestBlocksToSections_MixedContent runs a heading, body text, a table and
+// trailing text through the mapper and checks each section's classification.
+func TestBlocksToSections_MixedContent(t *testing.T) {
+	got := blocksToSections([]RawBlock{
+		{Type: "paragraph", Text: "Title", Style: "Heading 1"},
+		{Type: "paragraph", Text: "Body text.", Style: "Normal"},
+		{Type: "table", Rows: [][]string{{"a", "b"}}},
+		{Type: "paragraph", Text: "More text.", Style: "Normal"},
+	})
+	if len(got) != 4 {
+		t.Fatalf("want 4 sections, got %d", len(got))
+	}
+
+	if layout := got[0].LayoutType; layout != "title" {
+		t.Errorf("[0] heading: got %q", layout)
+	}
+	if layout := got[1].LayoutType; layout != "text" {
+		t.Errorf("[1] body: got %q", layout)
+	}
+	if kwd := got[2].DocTypeKwd; kwd != "table" {
+		t.Errorf("[2] table: got %q", kwd)
+	}
+	if kwd := got[3].DocTypeKwd; kwd != "text" {
+		t.Errorf("[3] text after table: got %q", kwd)
+	}
+}
+
+func TestBlocksToSections_Image(t *testing.T) {
+ blocks := []RawBlock{
+ {Type: "image", Image: "iVBORw0KGgoAAAANSUhEUg=="},
+ }
+ sections := blocksToSections(blocks)
+
+ if len(sections) != 1 {
+ t.Fatalf("want 1 section, got %d", len(sections))
+ }
+ if sections[0].DocTypeKwd != "image" {
+ t.Errorf("DocTypeKwd: got %q, want %q", sections[0].DocTypeKwd, "image")
+ }
+ if sections[0].Image != "iVBORw0KGgoAAAANSUhEUg==" {
+ t.Error("Image base64 not preserved")
+ }
+}
+
+// TestBlocksToSections_ImageBetweenText checks that an image sandwiched
+// between two paragraphs keeps its position and its neighbours stay text.
+func TestBlocksToSections_ImageBetweenText(t *testing.T) {
+	got := blocksToSections([]RawBlock{
+		{Type: "paragraph", Text: "before", Style: "Normal"},
+		{Type: "image", Image: "b64data"},
+		{Type: "paragraph", Text: "after", Style: "Normal"},
+	})
+	if len(got) != 3 {
+		t.Fatalf("want 3 sections, got %d", len(got))
+	}
+
+	lead, img, tail := got[0], got[1], got[2]
+	if !(lead.DocTypeKwd == "text" && lead.Text == "before") {
+		t.Error("wrong text section before image")
+	}
+	if img.DocTypeKwd != "image" {
+		t.Errorf("image section: got DocTypeKwd %q", img.DocTypeKwd)
+	}
+	if !(tail.DocTypeKwd == "text" && tail.Text == "after") {
+		t.Error("wrong text section after image")
+	}
+}
+
+// TestBlocksToSections_NestedHeadings checks that heading levels 1 through 3
+// are all classified as LayoutType "title".
+func TestBlocksToSections_NestedHeadings(t *testing.T) {
+	blocks := []RawBlock{
+		{Type: "paragraph", Text: "H1", Style: "Heading 1"},
+		{Type: "paragraph", Text: "H2", Style: "Heading 2"},
+		{Type: "paragraph", Text: "H3", Style: "Heading 3"},
+	}
+	sections := blocksToSections(blocks)
+	// Guard the indexing below: without this a short result would panic with
+	// an index-out-of-range instead of reporting a readable test failure.
+	if len(sections) != len(blocks) {
+		t.Fatalf("want %d sections, got %d", len(blocks), len(sections))
+	}
+	for i, want := range []string{"title", "title", "title"} {
+		if sections[i].LayoutType != want {
+			t.Errorf("[%d] got %q, want %q", i, sections[i].LayoutType, want)
+		}
+	}
+}
diff --git a/internal/deepdoc/parser/docx/raw_blocks_test.go b/internal/deepdoc/parser/docx/raw_blocks_test.go
new file mode 100644
index 0000000000..c1174f7a21
--- /dev/null
+++ b/internal/deepdoc/parser/docx/raw_blocks_test.go
@@ -0,0 +1,184 @@
+//go:build cgo && manual
+
+package docx
+
+import (
+ "encoding/json"
+ "os"
+ "strings"
+ "testing"
+)
+
+// loadPythonBlocks reads the JSON file at path and decodes it into a slice of
+// RawBlock. Any read or decode failure aborts the calling test.
+func loadPythonBlocks(t *testing.T, path string) []RawBlock {
+	t.Helper()
+
+	raw, readErr := os.ReadFile(path)
+	if readErr != nil {
+		t.Fatalf("read %s: %v", path, readErr)
+	}
+
+	var decoded []RawBlock
+	decodeErr := json.Unmarshal(raw, &decoded)
+	if decodeErr != nil {
+		t.Fatalf("unmarshal %s: %v", path, decodeErr)
+	}
+	return decoded
+}
+
+// TestRawBlocksParity_SimpleText compares the blocks extracted from
+// simple_text.docx with the recorded Python output: same count, and matching
+// type and text for every block present on both sides.
+func TestRawBlocksParity_SimpleText(t *testing.T) {
+	docBytes, err := os.ReadFile("testdata/docxs/simple_text.docx")
+	if err != nil {
+		t.Fatal(err)
+	}
+	goBlocks, err := ExtractRawBlocks(docBytes)
+	if err != nil {
+		t.Fatalf("ExtractRawBlocks: %v", err)
+	}
+	pyBlocks := loadPythonBlocks(t, "testdata/output/py/docx/simple_text_blocks.json")
+
+	if len(goBlocks) != len(pyBlocks) {
+		t.Errorf("block count: got %d, want %d", len(goBlocks), len(pyBlocks))
+	}
+	// Only the overlapping prefix can be compared element by element.
+	shared := min(len(goBlocks), len(pyBlocks))
+	for i := range goBlocks[:shared] {
+		g, p := goBlocks[i], pyBlocks[i]
+		if g.Type != p.Type {
+			t.Errorf("block[%d].type: got %q, want %q", i, g.Type, p.Type)
+		}
+		if g.Text != p.Text {
+			t.Errorf("block[%d].text: got %q, want %q", i, g.Text, p.Text)
+		}
+	}
+	if !t.Failed() {
+		return
+	}
+	t.Logf("Go blocks: %+v", goBlocks)
+	t.Logf("Py blocks: %+v", pyBlocks)
+}
+
+// TestRawBlocksParity_WithTable compares block count and per-block type for
+// with_table.docx against the recorded Python output.
+func TestRawBlocksParity_WithTable(t *testing.T) {
+	docBytes, err := os.ReadFile("testdata/docxs/with_table.docx")
+	if err != nil {
+		t.Fatal(err)
+	}
+	goBlocks, err := ExtractRawBlocks(docBytes)
+	if err != nil {
+		t.Fatalf("ExtractRawBlocks: %v", err)
+	}
+	pyBlocks := loadPythonBlocks(t, "testdata/output/py/docx/with_table_blocks.json")
+
+	if len(goBlocks) != len(pyBlocks) {
+		t.Errorf("block count: got %d, want %d", len(goBlocks), len(pyBlocks))
+	}
+	shared := min(len(goBlocks), len(pyBlocks))
+	for i := range goBlocks[:shared] {
+		if gotType, wantType := goBlocks[i].Type, pyBlocks[i].Type; gotType != wantType {
+			t.Errorf("block[%d].type: got %q, want %q", i, gotType, wantType)
+		}
+	}
+	if !t.Failed() {
+		return
+	}
+	t.Logf("Go blocks: %+v", goBlocks)
+	t.Logf("Py blocks: %+v", pyBlocks)
+}
+
+func TestRawBlocksParity_WithImage(t *testing.T) {
+ data, err := os.ReadFile("testdata/docxs/with_image.docx")
+ if err != nil {
+ t.Fatal(err)
+ }
+ got, err := ExtractRawBlocks(data)
+ if err != nil {
+ t.Fatalf("ExtractRawBlocks: %v", err)
+ }
+ // Engine-level difference: python-docx embeds images inside empty
+ // paragraph blocks; office_oxide represents them as separate elements.
+ // Both engines must see "Before" and "After" text and at least one
+ // image-related block.
+ hasBefore, hasAfter, hasImage := false, false, false
+ for _, b := range got {
+ if b.Text != "" {
+ hasBefore = hasBefore || b.Text == "Before the image."
+ hasAfter = hasAfter || b.Text == "After the image."
+ }
+ if b.Image != "" {
+ hasImage = true
+ }
+ }
+ if !hasBefore {
+ t.Error("missing 'Before the image.' text")
+ }
+ if !hasAfter {
+ t.Error("missing 'After the image.' text")
+ }
+ if !hasImage {
+ t.Log("office_oxide IR does not expose embedded images as top-level blocks")
+ }
+}
+
+// TestRawBlocksParity_MultiSection compares block count and per-block type for
+// multi_section.docx against the recorded Python output.
+func TestRawBlocksParity_MultiSection(t *testing.T) {
+	docBytes, err := os.ReadFile("testdata/docxs/multi_section.docx")
+	if err != nil {
+		t.Fatal(err)
+	}
+	goBlocks, err := ExtractRawBlocks(docBytes)
+	if err != nil {
+		t.Fatalf("ExtractRawBlocks: %v", err)
+	}
+	pyBlocks := loadPythonBlocks(t, "testdata/output/py/docx/multi_section_blocks.json")
+
+	if len(goBlocks) != len(pyBlocks) {
+		t.Errorf("block count: got %d, want %d", len(goBlocks), len(pyBlocks))
+	}
+	shared := min(len(goBlocks), len(pyBlocks))
+	for i := range goBlocks[:shared] {
+		if gotType, wantType := goBlocks[i].Type, pyBlocks[i].Type; gotType != wantType {
+			t.Errorf("block[%d].type: got %q, want %q", i, gotType, wantType)
+		}
+	}
+}
+
+func TestRawBlocksParity_NestedHeadings(t *testing.T) {
+ data, err := os.ReadFile("testdata/docxs/nested_headings.docx")
+ if err != nil {
+ t.Fatal(err)
+ }
+ got, err := ExtractRawBlocks(data)
+ if err != nil {
+ t.Fatalf("ExtractRawBlocks: %v", err)
+ }
+ want := loadPythonBlocks(t, "testdata/output/py/docx/nested_headings_blocks.json")
+ if len(got) != len(want) {
+ t.Errorf("block count: got %d, want %d", len(got), len(want))
+ }
+ headings := 0
+ for _, b := range got {
+ if strings.HasPrefix(b.Style, "Heading") {
+ headings++
+ }
+ }
+ if headings != 5 {
+ t.Errorf("expected 5 headings, got %d", headings)
+ }
+}
+
+// TestRawBlocksParity_WithCaption checks that with_caption.docx yields the
+// same number of blocks as the recorded Python output.
+func TestRawBlocksParity_WithCaption(t *testing.T) {
+	docBytes, err := os.ReadFile("testdata/docxs/with_caption.docx")
+	if err != nil {
+		t.Fatal(err)
+	}
+	goBlocks, err := ExtractRawBlocks(docBytes)
+	if err != nil {
+		t.Fatalf("ExtractRawBlocks: %v", err)
+	}
+
+	// Only the block count is compared for this fixture.
+	pyBlocks := loadPythonBlocks(t, "testdata/output/py/docx/with_caption_blocks.json")
+	if gotN, wantN := len(goBlocks), len(pyBlocks); gotN != wantN {
+		t.Errorf("block count: got %d, want %d", gotN, wantN)
+	}
+}
+
+// TestRawBlocksParity_Empty checks that empty.docx produces no blocks.
+func TestRawBlocksParity_Empty(t *testing.T) {
+	docBytes, err := os.ReadFile("testdata/docxs/empty.docx")
+	if err != nil {
+		t.Fatal(err)
+	}
+	goBlocks, err := ExtractRawBlocks(docBytes)
+	if err != nil {
+		t.Fatalf("ExtractRawBlocks: %v", err)
+	}
+	if n := len(goBlocks); n != 0 {
+		t.Errorf("empty docx: expected 0 blocks, got %d", n)
+	}
+}
diff --git a/internal/deepdoc/parser/docx/reader.go b/internal/deepdoc/parser/docx/reader.go
new file mode 100644
index 0000000000..b3a5c3677d
--- /dev/null
+++ b/internal/deepdoc/parser/docx/reader.go
@@ -0,0 +1,108 @@
+//go:build cgo
+
+package docx
+
+import (
+ "encoding/base64"
+ "encoding/json"
+ "fmt"
+ "strconv"
+ "strings"
+
+ officeOxide "github.com/yfedoseev/office_oxide/go"
+)
+
+// ExtractRawBlocks parses DOCX bytes with office_oxide and flattens the
+// resulting IR into RawBlocks. Elements are emitted section by section in the
+// order the IR lists them, which is meant to mirror python-docx's
+// _element.body iteration. A document without any elements yields a nil slice.
+// Errors from opening, IR export, or JSON decoding are wrapped and returned.
+func ExtractRawBlocks(data []byte) ([]RawBlock, error) {
+	handle, err := officeOxide.OpenFromBytes(data, "docx")
+	if err != nil {
+		return nil, fmt.Errorf("office_oxide open: %w", err)
+	}
+	defer handle.Close()
+
+	rawIR, err := handle.ToIRJSON()
+	if err != nil {
+		return nil, fmt.Errorf("ToIRJSON: %w", err)
+	}
+
+	var parsed irDocument
+	err = json.Unmarshal([]byte(rawIR), &parsed)
+	if err != nil {
+		return nil, fmt.Errorf("parse IR JSON: %w", err)
+	}
+
+	var out []RawBlock
+	for si := range parsed.Sections {
+		for _, element := range parsed.Sections[si].Elements {
+			out = append(out, irElementToBlock(element))
+		}
+	}
+	return out, nil
+}
+
+// irElementToBlock converts one IR element into a RawBlock.
+//
+//   - "table": every cell's nested elements are flattened to text with
+//     joinElements, giving one []string per row.
+//   - "heading": becomes a paragraph whose Style is "Heading <level>".
+//   - "image": the element's bytes are base64-encoded (standard encoding).
+//   - any other type: treated as a paragraph; an empty style becomes "Normal".
+func irElementToBlock(el irElement) RawBlock {
+	if el.Type == "table" {
+		grid := make([][]string, 0, len(el.Rows))
+		for _, r := range el.Rows {
+			line := make([]string, 0, len(r.Cells))
+			for _, c := range r.Cells {
+				line = append(line, joinElements(c.Content))
+			}
+			grid = append(grid, line)
+		}
+		return RawBlock{Type: "table", Rows: grid}
+	}
+
+	if el.Type == "heading" {
+		headingText := joinRuns(el.Content)
+		return RawBlock{
+			Type:  "paragraph",
+			Text:  headingText,
+			Style: "Heading " + strconv.Itoa(el.Level),
+		}
+	}
+
+	if el.Type == "image" {
+		encoded := base64.StdEncoding.EncodeToString(el.Data)
+		return RawBlock{Type: "image", Image: encoded}
+	}
+
+	// "paragraph" and every unrecognised element type end up here.
+	styleName := "Normal"
+	if el.Style != "" {
+		styleName = el.Style
+	}
+	return RawBlock{
+		Type:  "paragraph",
+		Text:  joinRuns(el.Content),
+		Style: styleName,
+	}
+}
+
+func joinRuns(runs []irRun) string {
+ var b strings.Builder
+ for _, r := range runs {
+ if r.Type == "text" {
+ b.WriteString(r.Text)
+ }
+ }
+ return b.String()
+}
+
+// joinElements extracts plain text from nested irElements (used for table cells).
+// When multiple elements are present, a newline is inserted between each one
+// to match python-docx _Cell.text behavior.
+func joinElements(els []irElement) string {
+ var b strings.Builder
+ for i, el := range els {
+ if i > 0 {
+ b.WriteByte('\n')
+ }
+ b.WriteString(joinRuns(el.Content))
+ }
+ return b.String()
+}
diff --git a/internal/deepdoc/parser/docx/reader_cell_test.go b/internal/deepdoc/parser/docx/reader_cell_test.go
new file mode 100644
index 0000000000..4183cf378c
--- /dev/null
+++ b/internal/deepdoc/parser/docx/reader_cell_test.go
@@ -0,0 +1,38 @@
+//go:build cgo
+
+package docx
+
+import "testing"
+
+func TestJoinElements_MultiParagraphCell(t *testing.T) {
+ // When a table cell contains multiple paragraphs, joinElements must
+ // insert a newline between them to match python-docx _Cell.text behavior.
+ els := []irElement{
+ {Type: "paragraph", Content: []irRun{{Type: "text", Text: "first line"}}},
+ {Type: "paragraph", Content: []irRun{{Type: "text", Text: "second line"}}},
+ }
+ got := joinElements(els)
+ want := "first line\nsecond line"
+ if got != want {
+ t.Errorf("joinElements:\ngot: %q\nwant: %q", got, want)
+ }
+}
+
+func TestJoinElements_SingleElement(t *testing.T) {
+ // Single paragraph cell — no separator expected.
+ els := []irElement{
+ {Type: "paragraph", Content: []irRun{{Type: "text", Text: "single paragraph"}}},
+ }
+ got := joinElements(els)
+ want := "single paragraph"
+ if got != want {
+ t.Errorf("joinElements:\ngot: %q\nwant: %q", got, want)
+ }
+}
+
+// TestJoinElements_Empty checks that nil input yields the empty string.
+func TestJoinElements_Empty(t *testing.T) {
+	if got := joinElements(nil); got != "" {
+		t.Errorf("joinElements(nil): got %q, want empty", got)
+	}
+}
diff --git a/internal/deepdoc/parser/docx/reader_stub.go b/internal/deepdoc/parser/docx/reader_stub.go
new file mode 100644
index 0000000000..ed7c993544
--- /dev/null
+++ b/internal/deepdoc/parser/docx/reader_stub.go
@@ -0,0 +1,11 @@
+//go:build !cgo
+
+package docx
+
+import "errors"
+
+// ExtractRawBlocks is the stand-in used when the package is built without
+// cgo. The real extractor depends on office_oxide, which needs CGo, so this
+// version ignores its input and always returns an error asking for a rebuild
+// with CGO_ENABLED=1.
+func ExtractRawBlocks(_ []byte) ([]RawBlock, error) {
+	err := errors.New("office_oxide requires cgo; rebuild with CGO_ENABLED=1")
+	return nil, err
+}
diff --git a/internal/deepdoc/parser/docx/reader_style_test.go b/internal/deepdoc/parser/docx/reader_style_test.go
new file mode 100644
index 0000000000..dd7b054ff6
--- /dev/null
+++ b/internal/deepdoc/parser/docx/reader_style_test.go
@@ -0,0 +1,54 @@
+//go:build cgo
+
+package docx
+
+import "testing"
+
+// TestIrElementToBlock_PreservesCustomStyle checks that a paragraph's Word
+// style name from the IR is carried over instead of being replaced by
+// "Normal".
+func TestIrElementToBlock_PreservesCustomStyle(t *testing.T) {
+	el := irElement{Type: "paragraph", Style: "Caption"}
+	el.Content = []irRun{{Type: "text", Text: "Figure 1: Architecture diagram"}}
+
+	block := irElementToBlock(el)
+	if block.Style == "Caption" {
+		return
+	}
+	t.Errorf("irElementToBlock with Style=%q:\ngot Style=%q\nwant Style=%q",
+		el.Style, block.Style, el.Style)
+}
+
+// TestIrElementToBlock_PreservesHeadingStyle checks that a heading element of
+// level 2 is given the style "Heading 2".
+func TestIrElementToBlock_PreservesHeadingStyle(t *testing.T) {
+	heading := irElement{Type: "heading", Level: 2}
+	heading.Content = []irRun{{Type: "text", Text: "Section 2.1"}}
+
+	if got := irElementToBlock(heading).Style; got != "Heading 2" {
+		t.Errorf("heading: got Style=%q, want %q", got, "Heading 2")
+	}
+}
+
+// TestIrElementToBlock_FallsBackToNormal checks that a paragraph without a
+// style name is reported with the style "Normal".
+func TestIrElementToBlock_FallsBackToNormal(t *testing.T) {
+	unstyled := irElement{Type: "paragraph"}
+	unstyled.Content = []irRun{{Type: "text", Text: "plain text"}}
+
+	if got := irElementToBlock(unstyled).Style; got != "Normal" {
+		t.Errorf("empty style: got %q, want %q", got, "Normal")
+	}
+}
diff --git a/internal/deepdoc/parser/docx/types.go b/internal/deepdoc/parser/docx/types.go
new file mode 100644
index 0000000000..21c3c56959
--- /dev/null
+++ b/internal/deepdoc/parser/docx/types.go
@@ -0,0 +1,46 @@
+package docx
+
+// RawBlock represents a single block extracted from a DOCX file in document order.
+// Type is one of "paragraph", "table", or "image". Headings are represented as
+// Type "paragraph" with a Style of "Heading N". Only the fields relevant to
+// the block's Type are populated; the others keep their zero value.
+type RawBlock struct {
+ Type string `json:"type"` // "paragraph", "table", or "image"
+ Text string `json:"text"` // paragraph text; empty for tables and images
+ Style string `json:"style"` // Word style name (e.g. "Normal", "Heading 1"); empty for tables and images
+ Image string `json:"image,omitempty"` // base64-encoded image data; set only for images
+ Rows [][]string `json:"rows,omitempty"` // table rows; nil for paragraphs and images
+}
+
+// ── office_oxide IR JSON types ────────────────────────────────────────
+
+// irElement is one element of an office_oxide IR section as decoded from the
+// IR JSON. Which of the fields are meaningful depends on Type: Level applies
+// to headings, Data to images, Rows to tables, and Content to elements that
+// carry text runs.
+type irElement struct {
+ Type string `json:"type"` // "paragraph", "heading", "table", "image"
+ Level int `json:"level"` // heading level (1-6)
+ Style string `json:"style"` // Word style name (e.g. "Normal", "Caption", "Heading 1")
+ Content []irRun `json:"content"` // rich text runs
+ Data []byte `json:"data"` // raw image bytes (for "image" type)
+ Rows []irRow `json:"rows"` // table rows
+}
+
+// irRun is a single inline run inside an element's content. When text is
+// extracted with joinRuns, only runs whose Type is "text" contribute.
+type irRun struct {
+ Type string `json:"type"` // "text", "image"
+ Text string `json:"text"` // plain text content
+ Content []irElement `json:"content"` // nested elements (used in table cells)
+}
+
+// irRow is one row of an IR table; Cells are kept in the order they appear in
+// the JSON.
+type irRow struct {
+ Cells []irCell `json:"cells"`
+}
+
+// irCell is one cell of an IR table row. Its content is a list of nested
+// elements that joinElements flattens into newline-separated text.
+type irCell struct {
+ Content []irElement `json:"content"` // nested paragraphs inside table cell
+}
+
+// irSection is one section of the IR document. ExtractRawBlocks walks its
+// Elements in order; Title is decoded but not used by the extractor.
+type irSection struct {
+ Title string `json:"title"`
+ Elements []irElement `json:"elements"`
+}
+
+// irDocument is the top-level shape of the IR JSON that ExtractRawBlocks
+// decodes: an ordered list of sections.
+type irDocument struct {
+ Sections []irSection `json:"sections"`
+}