From c8cf0c967d2af7d00365e7d130cff1b0271bc18f Mon Sep 17 00:00:00 2001
From: Jack <xugangqiang@hotmail.com>
Date: Thu, 2 Jul 2026 16:31:09 +0800
Subject: [PATCH] Feat: add DOCX parser (#16521)

### Summary

Add a Go implementation of the DOCX parser.
---
 internal/deepdoc/parser/docx/parser.go        |  60 ++++++
 .../parser/docx/parser_integration_test.go    | 161 ++++++++++++++
 internal/deepdoc/parser/docx/parser_test.go   | 202 ++++++++++++++++++
 .../deepdoc/parser/docx/raw_blocks_test.go    | 184 ++++++++++++++++
 internal/deepdoc/parser/docx/reader.go        | 108 ++++++++++
 .../deepdoc/parser/docx/reader_cell_test.go   |  38 ++++
 internal/deepdoc/parser/docx/reader_stub.go   |  11 +
 .../deepdoc/parser/docx/reader_style_test.go  |  54 +++++
 internal/deepdoc/parser/docx/types.go         |  46 ++++
 9 files changed, 864 insertions(+)
 create mode 100644 internal/deepdoc/parser/docx/parser.go
 create mode 100644 internal/deepdoc/parser/docx/parser_integration_test.go
 create mode 100644 internal/deepdoc/parser/docx/parser_test.go
 create mode 100644 internal/deepdoc/parser/docx/raw_blocks_test.go
 create mode 100644 internal/deepdoc/parser/docx/reader.go
 create mode 100644 internal/deepdoc/parser/docx/reader_cell_test.go
 create mode 100644 internal/deepdoc/parser/docx/reader_stub.go
 create mode 100644 internal/deepdoc/parser/docx/reader_style_test.go
 create mode 100644 internal/deepdoc/parser/docx/types.go

diff --git a/internal/deepdoc/parser/docx/parser.go b/internal/deepdoc/parser/docx/parser.go
new file mode 100644
index 0000000000..a13ed80cb4
--- /dev/null
+++ b/internal/deepdoc/parser/docx/parser.go
@@ -0,0 +1,60 @@
+package docx
+
+import (
+	"strings"
+
+	"ragflow/internal/deepdoc/parser/pdf/table"
+	doctype "ragflow/internal/deepdoc/parser/type"
+)
+
+// blocksToSections maps raw DOCX blocks, in order, onto the shared Section type
+// consumed by the framework layer. Headings become "text" sections with
+// LayoutType "title"; tables and images keep their own DocTypeKwd.
+func blocksToSections(blocks []RawBlock) []doctype.Section {
+	out := make([]doctype.Section, len(blocks))
+	for i := range blocks {
+		// One Section per block, so positions line up with the input.
+		out[i] = blockToSection(blocks[i])
+	}
+	return out
+}
+
+// blockToSection maps one RawBlock to a Section: "table" and "image" blocks
+// keep their own DocTypeKwd, and every other block type becomes "text" with a
+// LayoutType of "title" when the style name starts with "heading" (any case).
+func blockToSection(b RawBlock) doctype.Section {
+	if b.Type == "table" {
+		// Text carries the HTML rendering; TableItem keeps the raw cell grid.
+		return doctype.Section{
+			Text:       table.SimpleRowsToHTML(b.Rows),
+			DocTypeKwd: "table",
+			TableItem:  &doctype.TableItem{Rows: b.Rows},
+		}
+	}
+	if b.Type == "image" {
+		return doctype.Section{DocTypeKwd: "image", Image: b.Image}
+	}
+
+	section := doctype.Section{
+		Text:       b.Text,
+		DocTypeKwd: "text",
+		LayoutType: "text",
+	}
+	isHeading := strings.HasPrefix(strings.ToLower(b.Style), "heading")
+	if isHeading {
+		section.LayoutType = "title"
+	}
+	return section
+}
+
+// Parse converts a DOCX file, supplied as raw bytes, into a doctype.ParseResult.
+// Blocks come from ExtractRawBlocks and are mapped one-to-one onto Sections;
+// extraction errors are returned unchanged. cfg is currently not consulted.
+func Parse(data []byte, cfg doctype.ParserConfig) (*doctype.ParseResult, error) {
+	rawBlocks, err := ExtractRawBlocks(data)
+	if err != nil {
+		return nil, err
+	}
+	result := &doctype.ParseResult{Sections: blocksToSections(rawBlocks)}
+	return result, nil
+}
diff --git a/internal/deepdoc/parser/docx/parser_integration_test.go b/internal/deepdoc/parser/docx/parser_integration_test.go
new file mode 100644
index 0000000000..675e2e196f
--- /dev/null
+++ b/internal/deepdoc/parser/docx/parser_integration_test.go
@@ -0,0 +1,161 @@
+//go:build cgo && manual
+
+package docx
+
+import (
+	"os"
+	"path/filepath"
+	"strings"
+	"testing"
+
+	doctype "ragflow/internal/deepdoc/parser/type"
+)
+
+// readFixture returns the bytes of the named fixture under the relative path testdata/docxs/.
+func readFixture(name string) ([]byte, error) {
+	return os.ReadFile(filepath.Join("testdata", "docxs", name))
+}
+
+func TestParse_Integration_MultiSection(t *testing.T) { // checks section count and heading texts of multi_section.docx
+	data, err := readFixture("multi_section.docx")
+	if err != nil {
+		t.Skipf("fixture not available: %v", err) // a missing fixture skips the test rather than failing it
+	}
+	result, err := Parse(data, doctype.DefaultParserConfig())
+	if err != nil {
+		t.Fatalf("Parse: %v", err)
+	}
+	if len(result.Sections) != 7 {
+		t.Errorf("multi_section.docx: want 7 sections, got %d", len(result.Sections))
+	}
+	// Title sections must appear in this order with exactly these texts.
+	expected := []string{"Chapter 1", "Section 1.1", "Chapter 2"}
+	titleIdx := 0
+	for _, s := range result.Sections {
+		if s.LayoutType == "title" {
+			if titleIdx < len(expected) && s.Text != expected[titleIdx] { // bounds guard: surplus titles are only counted
+				t.Errorf("heading[%d]: got %q, want %q", titleIdx, s.Text, expected[titleIdx])
+			}
+			titleIdx++
+		}
+	}
+	if titleIdx != 3 {
+		t.Errorf("expected 3 headings, found %d", titleIdx)
+	}
+}
+
+func TestParse_Integration_WithTable(t *testing.T) { // checks the table section of with_table.docx: rows and HTML
+	data, err := readFixture("with_table.docx")
+	if err != nil {
+		t.Skipf("fixture not available: %v", err)
+	}
+	result, err := Parse(data, doctype.DefaultParserConfig())
+	if err != nil {
+		t.Fatalf("Parse: %v", err)
+	}
+	if len(result.Sections) != 4 {
+		t.Fatalf("want 4 sections, got %d", len(result.Sections))
+	}
+	if result.Sections[2].DocTypeKwd != "table" || result.Sections[2].TableItem == nil {
+		t.Fatal("expected table section at index 2") // fatal: TableItem is dereferenced below
+	}
+	if len(result.Sections[2].TableItem.Rows) != 3 {
+		t.Fatalf("expected 3 rows, got %d", len(result.Sections[2].TableItem.Rows)) // fatal: Rows[0] is read below
+	}
+	if row := result.Sections[2].TableItem.Rows[0]; len(row) == 0 || row[0] != "Product" {
+		t.Errorf("cell[0,0]: want %q, got row %q", "Product", row)
+	}
+	// Verify HTML table is rendered.
+	if !strings.Contains(result.Sections[2].Text, "<table>") {
+		t.Error("table Section.Text should contain HTML <table>")
+	}
+	if !strings.Contains(result.Sections[2].Text, "<th >Product</th>") {
+		t.Errorf("table HTML missing header: %s", result.Sections[2].Text)
+	}
+}
+
+func TestParse_Integration_WithImage(t *testing.T) { // with_image.docx must yield an image section with data
+	data, err := readFixture("with_image.docx")
+	if err != nil {
+		t.Skipf("fixture not available: %v", err)
+	}
+	result, err := Parse(data, doctype.DefaultParserConfig())
+	if err != nil {
+		t.Fatalf("Parse: %v", err)
+	}
+	hasImage := false
+	for _, s := range result.Sections {
+		if s.DocTypeKwd == "image" && s.Image != "" { // an image section with an empty payload does not count
+			hasImage = true
+		}
+	}
+	if !hasImage {
+		t.Error("expected at least one image section")
+	}
+}
+
+func TestParse_Integration_NestedHeadings(t *testing.T) { // every section of nested_headings.docx must be a title
+	data, err := readFixture("nested_headings.docx")
+	if err != nil {
+		t.Skipf("fixture not available: %v", err)
+	}
+	result, err := Parse(data, doctype.DefaultParserConfig())
+	if err != nil {
+		t.Fatalf("Parse: %v", err)
+	}
+	if len(result.Sections) != 5 {
+		t.Fatalf("want 5 sections, got %d", len(result.Sections))
+	}
+	titles := 0
+	for _, s := range result.Sections {
+		if s.LayoutType == "title" {
+			titles++
+		}
+	}
+	if titles != 5 { // same count as the sections above, so no non-title section is allowed
+		t.Errorf("expected 5 titles, got %d", titles)
+	}
+}
+
+func TestParse_Integration_WithCaption(t *testing.T) { // captions stay plain text around a rendered table
+	data, err := readFixture("with_caption.docx")
+	if err != nil {
+		t.Skipf("fixture not available: %v", err)
+	}
+	result, err := Parse(data, doctype.DefaultParserConfig())
+	if err != nil {
+		t.Fatalf("Parse: %v", err)
+	}
+	if len(result.Sections) != 4 {
+		t.Fatalf("want 4 sections, got %d", len(result.Sections)) // fatal: indexes 0, 2 and 3 are read below
+	}
+
+	// Expected section order: [Figure caption] [body text] [2x2 table] [Table caption]
+	// The figure caption (index 0) must be laid out as text, not as a title.
+	if result.Sections[0].LayoutType != "text" {
+		t.Errorf("figure caption: got LayoutType %q", result.Sections[0].LayoutType)
+	}
+	if !strings.Contains(result.Sections[0].Text, "Figure 1") {
+		t.Errorf("figure caption text: %q", result.Sections[0].Text)
+	}
+
+	// Table section (index 2) must have HTML rendering.
+	s := result.Sections[2]
+	if s.DocTypeKwd != "table" {
+		t.Errorf("table section: DocTypeKwd=%q", s.DocTypeKwd)
+	}
+	if !strings.Contains(s.Text, "<table>") {
+		t.Fatal("table section missing <table> HTML")
+	}
+	if !strings.Contains(s.Text, "<th >A</th>") || !strings.Contains(s.Text, "<th >B</th>") {
+		t.Errorf("table header cells: %s", s.Text)
+	}
+	if !strings.Contains(s.Text, "<td >1</td>") || !strings.Contains(s.Text, "<td >2</td>") {
+		t.Errorf("table data cells: %s", s.Text)
+	}
+
+	// Table caption (index 3) follows the table.
+	if !strings.Contains(result.Sections[3].Text, "Table 1") {
+		t.Errorf("table caption text: %q", result.Sections[3].Text)
+	}
+}
diff --git a/internal/deepdoc/parser/docx/parser_test.go b/internal/deepdoc/parser/docx/parser_test.go
new file mode 100644
index 0000000000..60dcf07473
--- /dev/null
+++ b/internal/deepdoc/parser/docx/parser_test.go
@@ -0,0 +1,202 @@
+package docx
+
+import (
+	"testing"
+)
+
+func TestBlocksToSections_Paragraph(t *testing.T) { // a Normal paragraph becomes one "text" section
+	blocks := []RawBlock{
+		{Type: "paragraph", Text: "hello world", Style: "Normal"},
+	}
+	sections := blocksToSections(blocks)
+
+	if len(sections) != 1 {
+		t.Fatalf("want 1 section, got %d", len(sections)) // fatal: sections[0] is read next
+	}
+	s := sections[0]
+	if s.Text != "hello world" {
+		t.Errorf("Text: got %q, want %q", s.Text, "hello world")
+	}
+	if s.DocTypeKwd != "text" {
+		t.Errorf("DocTypeKwd: got %q, want %q", s.DocTypeKwd, "text")
+	}
+}
+
+func TestBlocksToSections_Headings(t *testing.T) { // "Heading N" styles map to LayoutType "title"
+	blocks := []RawBlock{
+		{Type: "paragraph", Text: "Main Title", Style: "Heading 1"},
+		{Type: "paragraph", Text: "Sub Title", Style: "Heading 2"},
+		{Type: "paragraph", Text: "Deep", Style: "Heading 3"},
+		{Type: "paragraph", Text: "Plain", Style: "Normal"},
+	}
+	sections := blocksToSections(blocks)
+
+	if len(sections) != 4 {
+		t.Fatalf("want 4 sections, got %d", len(sections))
+	}
+	if sections[0].LayoutType != "title" {
+		t.Errorf("[0] LayoutType: got %q, want %q", sections[0].LayoutType, "title")
+	}
+	if sections[1].LayoutType != "title" {
+		t.Errorf("[1] LayoutType: got %q, want %q", sections[1].LayoutType, "title")
+	}
+	if sections[2].LayoutType != "title" {
+		t.Errorf("[2] LayoutType: got %q, want %q", sections[2].LayoutType, "title")
+	}
+	// The trailing Normal paragraph must stay "text", not "title".
+	if sections[3].LayoutType != "text" {
+		t.Errorf("[3] LayoutType: got %q, want %q", sections[3].LayoutType, "text")
+	}
+}
+
+func TestBlocksToSections_Table(t *testing.T) { // a table block keeps its rows and gets non-empty Text
+	blocks := []RawBlock{
+		{Type: "table", Rows: [][]string{
+			{"Name", "Age"},
+			{"Alice", "30"},
+		}},
+	}
+	sections := blocksToSections(blocks)
+
+	if len(sections) != 1 {
+		t.Fatalf("want 1 section, got %d", len(sections))
+	}
+	s := sections[0]
+	if s.DocTypeKwd != "table" {
+		t.Errorf("DocTypeKwd: got %q, want %q", s.DocTypeKwd, "table")
+	}
+	if s.TableItem == nil {
+		t.Fatal("TableItem is nil") // fatal: TableItem.Rows is read next
+	}
+	if len(s.TableItem.Rows) != 2 {
+		t.Errorf("Rows: want 2, got %d", len(s.TableItem.Rows))
+	}
+	if s.Text == "" { // only non-emptiness is checked here, not the HTML itself
+		t.Error("Text: expected rendered HTML, got empty string")
+	}
+}
+
+func TestBlocksToSections_EmptyInput(t *testing.T) { // nil input must produce no sections
+	sections := blocksToSections(nil)
+	if len(sections) != 0 {
+		t.Errorf("want 0 sections, got %d", len(sections))
+	}
+}
+
+func TestBlocksToSections_DocumentOrder(t *testing.T) { // sections come out in the same order as the blocks
+	blocks := []RawBlock{
+		{Type: "paragraph", Text: "first", Style: "Normal"},
+		{Type: "table", Rows: [][]string{{"a"}}},
+		{Type: "paragraph", Text: "second", Style: "Normal"},
+		{Type: "paragraph", Text: "third", Style: "Heading 1"},
+	}
+	sections := blocksToSections(blocks)
+
+	if len(sections) != 4 {
+		t.Fatalf("want 4 sections, got %d", len(sections))
+	}
+	if sections[0].Text != "first" {
+		t.Errorf("order[0]: got %q", sections[0].Text)
+	}
+	if sections[1].DocTypeKwd != "table" { // the table is identified by type, not by its rendered text
+		t.Errorf("order[1]: expected table")
+	}
+	if sections[2].Text != "second" {
+		t.Errorf("order[2]: got %q", sections[2].Text)
+	}
+	if sections[3].Text != "third" {
+		t.Errorf("order[3]: got %q", sections[3].Text)
+	}
+}
+
+func TestBlocksToSections_CaptionStyle(t *testing.T) { // a non-heading style such as "Caption" stays "text"
+	blocks := []RawBlock{
+		{Type: "paragraph", Text: "Table 1: Results", Style: "Caption"},
+	}
+	sections := blocksToSections(blocks)
+	if len(sections) != 1 {
+		t.Fatalf("want 1 section, got %d", len(sections))
+	}
+	if sections[0].LayoutType != "text" {
+		t.Errorf("Caption: LayoutType should be 'text', got %q", sections[0].LayoutType)
+	}
+}
+
+func TestBlocksToSections_MixedContent(t *testing.T) { // heading, body, table and trailing text in one input
+	blocks := []RawBlock{
+		{Type: "paragraph", Text: "Title", Style: "Heading 1"},
+		{Type: "paragraph", Text: "Body text.", Style: "Normal"},
+		{Type: "table", Rows: [][]string{{"a", "b"}}},
+		{Type: "paragraph", Text: "More text.", Style: "Normal"},
+	}
+	sections := blocksToSections(blocks)
+
+	if len(sections) != 4 {
+		t.Fatalf("want 4 sections, got %d", len(sections))
+	}
+	if sections[0].LayoutType != "title" {
+		t.Errorf("[0] heading: got %q", sections[0].LayoutType)
+	}
+	if sections[1].LayoutType != "text" {
+		t.Errorf("[1] body: got %q", sections[1].LayoutType)
+	}
+	if sections[2].DocTypeKwd != "table" {
+		t.Errorf("[2] table: got %q", sections[2].DocTypeKwd)
+	}
+	if sections[3].DocTypeKwd != "text" { // a paragraph after the table must be a text section again
+		t.Errorf("[3] text after table: got %q", sections[3].DocTypeKwd)
+	}
+}
+
+func TestBlocksToSections_Image(t *testing.T) { // an image block keeps its base64 payload unchanged
+	blocks := []RawBlock{
+		{Type: "image", Image: "iVBORw0KGgoAAAANSUhEUg=="},
+	}
+	sections := blocksToSections(blocks)
+
+	if len(sections) != 1 {
+		t.Fatalf("want 1 section, got %d", len(sections))
+	}
+	if sections[0].DocTypeKwd != "image" {
+		t.Errorf("DocTypeKwd: got %q, want %q", sections[0].DocTypeKwd, "image")
+	}
+	if sections[0].Image != "iVBORw0KGgoAAAANSUhEUg==" { // must equal the input string byte for byte
+		t.Error("Image base64 not preserved")
+	}
+}
+
+func TestBlocksToSections_ImageBetweenText(t *testing.T) { // an image keeps its position between two paragraphs
+	blocks := []RawBlock{
+		{Type: "paragraph", Text: "before", Style: "Normal"},
+		{Type: "image", Image: "b64data"},
+		{Type: "paragraph", Text: "after", Style: "Normal"},
+	}
+	sections := blocksToSections(blocks)
+
+	if len(sections) != 3 {
+		t.Fatalf("want 3 sections, got %d", len(sections))
+	}
+	if sections[0].DocTypeKwd != "text" || sections[0].Text != "before" {
+		t.Error("wrong text section before image")
+	}
+	if sections[1].DocTypeKwd != "image" { // only the type is checked; the payload is covered by the Image test
+		t.Errorf("image section: got DocTypeKwd %q", sections[1].DocTypeKwd)
+	}
+	if sections[2].DocTypeKwd != "text" || sections[2].Text != "after" {
+		t.Error("wrong text section after image")
+	}
+}
+
+func TestBlocksToSections_NestedHeadings(t *testing.T) { // heading levels 1-3 all map to "title"
+	blocks := []RawBlock{
+		{Type: "paragraph", Text: "H1", Style: "Heading 1"},
+		{Type: "paragraph", Text: "H2", Style: "Heading 2"},
+		{Type: "paragraph", Text: "H3", Style: "Heading 3"},
+	}
+	sections := blocksToSections(blocks)
+	for i, want := range []string{"title", "title", "title"} {
+		if sections[i].LayoutType != want { // NOTE(review): sections is indexed without a prior length check
+			t.Errorf("[%d] got %q, want %q", i, sections[i].LayoutType, want)
+		}
+	}
+}
diff --git a/internal/deepdoc/parser/docx/raw_blocks_test.go b/internal/deepdoc/parser/docx/raw_blocks_test.go
new file mode 100644
index 0000000000..c1174f7a21
--- /dev/null
+++ b/internal/deepdoc/parser/docx/raw_blocks_test.go
@@ -0,0 +1,184 @@
+//go:build cgo && manual
+
+package docx
+
+import (
+	"encoding/json"
+	"os"
+	"strings"
+	"testing"
+)
+
+func loadPythonBlocks(t *testing.T, path string) []RawBlock { // reads a JSON array of RawBlock from path
+	t.Helper()
+	data, err := os.ReadFile(path)
+	if err != nil {
+		t.Fatalf("read %s: %v", path, err) // a missing reference file fails the calling test
+	}
+	var blocks []RawBlock
+	if err := json.Unmarshal(data, &blocks); err != nil {
+		t.Fatalf("unmarshal %s: %v", path, err)
+	}
+	return blocks
+}
+
+func TestRawBlocksParity_SimpleText(t *testing.T) { // compares block type and text against the reference JSON
+	data, err := os.ReadFile("testdata/docxs/simple_text.docx")
+	if err != nil {
+		t.Fatal(err) // unlike the Parse integration tests, a missing fixture fails here
+	}
+	got, err := ExtractRawBlocks(data)
+	if err != nil {
+		t.Fatalf("ExtractRawBlocks: %v", err)
+	}
+	want := loadPythonBlocks(t, "testdata/output/py/docx/simple_text_blocks.json")
+
+	if len(got) != len(want) {
+		t.Errorf("block count: got %d, want %d", len(got), len(want))
+	}
+	for i := 0; i < min(len(got), len(want)); i++ { // compare only the common prefix when counts differ
+		if got[i].Type != want[i].Type {
+			t.Errorf("block[%d].type: got %q, want %q", i, got[i].Type, want[i].Type)
+		}
+		if got[i].Text != want[i].Text {
+			t.Errorf("block[%d].text: got %q, want %q", i, got[i].Text, want[i].Text)
+		}
+	}
+	if t.Failed() { // dump both block lists only when something above failed
+		t.Logf("Go blocks: %+v", got)
+		t.Logf("Py blocks: %+v", want)
+	}
+}
+
+func TestRawBlocksParity_WithTable(t *testing.T) { // compares block count and types only, not table contents
+	data, err := os.ReadFile("testdata/docxs/with_table.docx")
+	if err != nil {
+		t.Fatal(err)
+	}
+	got, err := ExtractRawBlocks(data)
+	if err != nil {
+		t.Fatalf("ExtractRawBlocks: %v", err)
+	}
+	want := loadPythonBlocks(t, "testdata/output/py/docx/with_table_blocks.json")
+
+	if len(got) != len(want) {
+		t.Errorf("block count: got %d, want %d", len(got), len(want))
+	}
+	for i := 0; i < min(len(got), len(want)); i++ { // compare only the common prefix when counts differ
+		if got[i].Type != want[i].Type {
+			t.Errorf("block[%d].type: got %q, want %q", i, got[i].Type, want[i].Type)
+		}
+	}
+	if t.Failed() { // dump both block lists only when something above failed
+		t.Logf("Go blocks: %+v", got)
+		t.Logf("Py blocks: %+v", want)
+	}
+}
+
+func TestRawBlocksParity_WithImage(t *testing.T) { // checks the text around the image; no reference JSON is loaded
+	data, err := os.ReadFile("testdata/docxs/with_image.docx")
+	if err != nil {
+		t.Fatal(err)
+	}
+	got, err := ExtractRawBlocks(data)
+	if err != nil {
+		t.Fatalf("ExtractRawBlocks: %v", err)
+	}
+	// Engine-level difference: python-docx embeds images inside empty
+	// paragraph blocks; office_oxide represents them as separate elements.
+	// The "Before" and "After" texts are required; a missing image block is
+	// only logged below, it does not fail the test.
+	hasBefore, hasAfter, hasImage := false, false, false
+	for _, b := range got {
+		if b.Text != "" {
+			hasBefore = hasBefore || b.Text == "Before the image."
+			hasAfter = hasAfter || b.Text == "After the image."
+		}
+		if b.Image != "" {
+			hasImage = true
+		}
+	}
+	if !hasBefore {
+		t.Error("missing 'Before the image.' text")
+	}
+	if !hasAfter {
+		t.Error("missing 'After the image.' text")
+	}
+	if !hasImage {
+		t.Log("office_oxide IR does not expose embedded images as top-level blocks")
+	}
+}
+
+func TestRawBlocksParity_MultiSection(t *testing.T) { // compares block count and types against the reference JSON
+	data, err := os.ReadFile("testdata/docxs/multi_section.docx")
+	if err != nil {
+		t.Fatal(err)
+	}
+	got, err := ExtractRawBlocks(data)
+	if err != nil {
+		t.Fatalf("ExtractRawBlocks: %v", err)
+	}
+	want := loadPythonBlocks(t, "testdata/output/py/docx/multi_section_blocks.json")
+	if len(got) != len(want) {
+		t.Errorf("block count: got %d, want %d", len(got), len(want))
+	}
+	for i := 0; i < min(len(got), len(want)); i++ { // compare only the common prefix when counts differ
+		if got[i].Type != want[i].Type {
+			t.Errorf("block[%d].type: got %q, want %q", i, got[i].Type, want[i].Type)
+		}
+	}
+}
+
+func TestRawBlocksParity_NestedHeadings(t *testing.T) { // checks block count parity and the number of headings
+	data, err := os.ReadFile("testdata/docxs/nested_headings.docx")
+	if err != nil {
+		t.Fatal(err)
+	}
+	got, err := ExtractRawBlocks(data)
+	if err != nil {
+		t.Fatalf("ExtractRawBlocks: %v", err)
+	}
+	want := loadPythonBlocks(t, "testdata/output/py/docx/nested_headings_blocks.json")
+	if len(got) != len(want) {
+		t.Errorf("block count: got %d, want %d", len(got), len(want))
+	}
+	headings := 0
+	for _, b := range got {
+		if strings.HasPrefix(b.Style, "Heading") { // case-sensitive match on the style prefix
+			headings++
+		}
+	}
+	if headings != 5 {
+		t.Errorf("expected 5 headings, got %d", headings)
+	}
+}
+
+func TestRawBlocksParity_WithCaption(t *testing.T) { // only the block count is compared for with_caption.docx
+	data, err := os.ReadFile("testdata/docxs/with_caption.docx")
+	if err != nil {
+		t.Fatal(err)
+	}
+	got, err := ExtractRawBlocks(data)
+	if err != nil {
+		t.Fatalf("ExtractRawBlocks: %v", err)
+	}
+	// The Go result must have as many blocks as the Python reference output.
+	want := loadPythonBlocks(t, "testdata/output/py/docx/with_caption_blocks.json")
+	if len(got) != len(want) {
+		t.Errorf("block count: got %d, want %d", len(got), len(want))
+	}
+}
+
+func TestRawBlocksParity_Empty(t *testing.T) { // empty.docx must produce no blocks and no error
+	data, err := os.ReadFile("testdata/docxs/empty.docx")
+	if err != nil {
+		t.Fatal(err)
+	}
+	got, err := ExtractRawBlocks(data)
+	if err != nil {
+		t.Fatalf("ExtractRawBlocks: %v", err)
+	}
+	if len(got) != 0 { // len is used, so a nil and an empty slice both pass
+		t.Errorf("empty docx: expected 0 blocks, got %d", len(got))
+	}
+}
diff --git a/internal/deepdoc/parser/docx/reader.go b/internal/deepdoc/parser/docx/reader.go
new file mode 100644
index 0000000000..b3a5c3677d
--- /dev/null
+++ b/internal/deepdoc/parser/docx/reader.go
@@ -0,0 +1,108 @@
+//go:build cgo
+
+package docx
+
+import (
+	"encoding/base64"
+	"encoding/json"
+	"fmt"
+	"strconv"
+	"strings"
+
+	officeOxide "github.com/yfedoseev/office_oxide/go"
+)
+
+// ExtractRawBlocks opens the DOCX bytes with office_oxide, decodes the IR JSON
+// it produces, and flattens every section's elements into RawBlocks in the
+// order they appear in that IR. Each failing step returns a wrapped error.
+func ExtractRawBlocks(data []byte) ([]RawBlock, error) {
+	doc, err := officeOxide.OpenFromBytes(data, "docx")
+	if err != nil {
+		return nil, fmt.Errorf("office_oxide open: %w", err)
+	}
+	defer doc.Close()
+
+	rawIR, err := doc.ToIRJSON()
+	if err != nil {
+		return nil, fmt.Errorf("ToIRJSON: %w", err)
+	}
+
+	var parsed irDocument
+	if err = json.Unmarshal([]byte(rawIR), &parsed); err != nil {
+		return nil, fmt.Errorf("parse IR JSON: %w", err)
+	}
+
+	// Left nil (not an empty slice) when the document has no elements.
+	var out []RawBlock
+	for i := range parsed.Sections {
+		for _, element := range parsed.Sections[i].Elements {
+			out = append(out, irElementToBlock(element))
+		}
+	}
+	return out, nil
+}
+
+func irElementToBlock(el irElement) RawBlock {
+	switch el.Type {
+	case "table":
+		rows := make([][]string, len(el.Rows))
+		for ri, row := range el.Rows {
+			cells := make([]string, len(row.Cells))
+			for ci, cell := range row.Cells {
+				cells[ci] = joinElements(cell.Content)
+			}
+			rows[ri] = cells
+		}
+		return RawBlock{Type: "table", Rows: rows}
+
+	case "heading":
+		text := joinRuns(el.Content)
+		level := strconv.Itoa(el.Level)
+		return RawBlock{
+			Type:  "paragraph",
+			Text:  text,
+			Style: "Heading " + level,
+		}
+
+	case "image":
+		return RawBlock{
+			Type:  "image",
+			Image: base64.StdEncoding.EncodeToString(el.Data),
+		}
+
+	default: // "paragraph" and anything else
+		style := el.Style
+		if style == "" {
+			style = "Normal"
+		}
+		return RawBlock{
+			Type:  "paragraph",
+			Text:  joinRuns(el.Content),
+			Style: style,
+		}
+	}
+}
+
+func joinRuns(runs []irRun) string {
+	var b strings.Builder
+	for _, r := range runs {
+		if r.Type == "text" {
+			b.WriteString(r.Text)
+		}
+	}
+	return b.String()
+}
+
+// joinElements extracts plain text from nested irElements (used for table cells).
+// When multiple elements are present, a newline is inserted between each one
+// to match python-docx _Cell.text behavior.
+func joinElements(els []irElement) string {
+	var b strings.Builder
+	for i, el := range els {
+		if i > 0 {
+			b.WriteByte('\n')
+		}
+		b.WriteString(joinRuns(el.Content))
+	}
+	return b.String()
+}
diff --git a/internal/deepdoc/parser/docx/reader_cell_test.go b/internal/deepdoc/parser/docx/reader_cell_test.go
new file mode 100644
index 0000000000..4183cf378c
--- /dev/null
+++ b/internal/deepdoc/parser/docx/reader_cell_test.go
@@ -0,0 +1,38 @@
+//go:build cgo
+
+package docx
+
+import "testing"
+
+func TestJoinElements_MultiParagraphCell(t *testing.T) {
+	// Two paragraphs in one cell must come back as two lines separated by a
+	// single newline, which is meant to match python-docx _Cell.text behavior.
+	els := []irElement{
+		{Type: "paragraph", Content: []irRun{{Type: "text", Text: "first line"}}},
+		{Type: "paragraph", Content: []irRun{{Type: "text", Text: "second line"}}},
+	}
+	got := joinElements(els)
+	want := "first line\nsecond line"
+	if got != want {
+		t.Errorf("joinElements:\ngot:  %q\nwant: %q", got, want)
+	}
+}
+
+func TestJoinElements_SingleElement(t *testing.T) {
+	// A cell with one paragraph must return its text with no newline added.
+	els := []irElement{
+		{Type: "paragraph", Content: []irRun{{Type: "text", Text: "single paragraph"}}},
+	}
+	got := joinElements(els)
+	want := "single paragraph"
+	if got != want {
+		t.Errorf("joinElements:\ngot:  %q\nwant: %q", got, want)
+	}
+}
+
+func TestJoinElements_Empty(t *testing.T) { // nil input must give the empty string
+	got := joinElements(nil)
+	if got != "" {
+		t.Errorf("joinElements(nil): got %q, want empty", got)
+	}
+}
diff --git a/internal/deepdoc/parser/docx/reader_stub.go b/internal/deepdoc/parser/docx/reader_stub.go
new file mode 100644
index 0000000000..ed7c993544
--- /dev/null
+++ b/internal/deepdoc/parser/docx/reader_stub.go
@@ -0,0 +1,11 @@
+//go:build !cgo
+
+package docx
+
+import "errors"
+
+// ExtractRawBlocks is the fallback compiled when cgo is disabled (see the
+// !cgo build tag): it ignores its input and always returns an error.
+func ExtractRawBlocks(_ []byte) ([]RawBlock, error) {
+	return nil, errors.New("office_oxide requires cgo; rebuild with CGO_ENABLED=1")
+}
diff --git a/internal/deepdoc/parser/docx/reader_style_test.go b/internal/deepdoc/parser/docx/reader_style_test.go
new file mode 100644
index 0000000000..dd7b054ff6
--- /dev/null
+++ b/internal/deepdoc/parser/docx/reader_style_test.go
@@ -0,0 +1,54 @@
+//go:build cgo
+
+package docx
+
+import "testing"
+
+func TestIrElementToBlock_PreservesCustomStyle(t *testing.T) {
+	// A paragraph's non-empty Style from the IR must be copied to the block
+	// unchanged rather than being replaced by "Normal".
+	el := irElement{
+		Type:  "paragraph",
+		Style: "Caption",
+		Content: []irRun{
+			{Type: "text", Text: "Figure 1: Architecture diagram"},
+		},
+	}
+	block := irElementToBlock(el)
+
+	if block.Style != "Caption" {
+		t.Errorf("irElementToBlock with Style=%q:\ngot  Style=%q\nwant Style=%q",
+			el.Style, block.Style, el.Style)
+	}
+}
+
+func TestIrElementToBlock_PreservesHeadingStyle(t *testing.T) {
+	// A "heading" element with Level 2 must produce the style "Heading 2".
+	el := irElement{
+		Type:  "heading",
+		Level: 2,
+		Content: []irRun{
+			{Type: "text", Text: "Section 2.1"},
+		},
+	}
+	block := irElementToBlock(el)
+
+	if block.Style != "Heading 2" {
+		t.Errorf("heading: got Style=%q, want %q", block.Style, "Heading 2")
+	}
+}
+
+func TestIrElementToBlock_FallsBackToNormal(t *testing.T) {
+	// A paragraph whose Style is left empty must get the style "Normal".
+	el := irElement{
+		Type: "paragraph",
+		Content: []irRun{
+			{Type: "text", Text: "plain text"},
+		},
+	}
+	block := irElementToBlock(el)
+
+	if block.Style != "Normal" {
+		t.Errorf("empty style: got %q, want %q", block.Style, "Normal")
+	}
+}
diff --git a/internal/deepdoc/parser/docx/types.go b/internal/deepdoc/parser/docx/types.go
new file mode 100644
index 0000000000..21c3c56959
--- /dev/null
+++ b/internal/deepdoc/parser/docx/types.go
@@ -0,0 +1,46 @@
+package docx
+
+// RawBlock represents a single block extracted from a DOCX file in document order.
+// Type is one of "paragraph", "table", or "image". Headings are represented as
+// Type "paragraph" with a Style of "Heading N".
+type RawBlock struct {
+	Type  string     `json:"type"`            // "paragraph", "table", or "image"
+	Text  string     `json:"text"`            // paragraph text; empty for tables
+	Style string     `json:"style"`           // Word style name (e.g. "Normal", "Heading 1")
+	Image string     `json:"image,omitempty"` // base64-encoded image data; omitted from JSON when empty
+	Rows  [][]string `json:"rows,omitempty"`  // table rows of cell text; nil for paragraphs
+}
+
+// ── office_oxide IR JSON types ────────────────────────────────────────
+
+type irElement struct { // one element inside an IR section or a table cell
+	Type    string  `json:"type"`    // "paragraph", "heading", "table", "image"
+	Level   int     `json:"level"`   // heading level (1-6)
+	Style   string  `json:"style"`   // Word style name (e.g. "Normal", "Caption", "Heading 1")
+	Content []irRun `json:"content"` // rich text runs
+	Data    []byte  `json:"data"`    // raw image bytes (for "image" type); encoding/json reads []byte from a base64 string
+	Rows    []irRow `json:"rows"`    // table rows
+}
+
+type irRun struct { // one run inside an element's content
+	Type    string      `json:"type"`    // "text", "image"
+	Text    string      `json:"text"`    // plain text content
+	Content []irElement `json:"content"` // nested elements; NOTE(review): not read by reader.go in this patch
+}
+
+type irRow struct { // one table row
+	Cells []irCell `json:"cells"`
+}
+
+type irCell struct { // one table cell
+	Content []irElement `json:"content"` // nested elements inside the cell, flattened to text by joinElements
+}
+
+type irSection struct { // one entry of the IR "sections" array
+	Title    string      `json:"title"`
+	Elements []irElement `json:"elements"`
+}
+
+type irDocument struct { // root object of the IR JSON
+	Sections []irSection `json:"sections"`
+}