From 70c0121b78e5b82402ad96efc2f3c6ecd74e0ef1 Mon Sep 17 00:00:00 2001
From: Manan Bansal <66985466+manan-tech@users.noreply.github.com>
Date: Mon, 22 Jun 2026 07:16:44 +0530
Subject: [PATCH] Fix: preserve tables when parsing DOCX with the laws parser
 (#16008) (#16155)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

## What

Fixes #16008 — tables contained in a DOCX are silently dropped when the
document is parsed with the **laws** chunking method.

## Root cause

`Docx.__call__` in `rag/app/laws.py` iterated `self.doc.paragraphs`,
which only yields paragraph elements. Tables are separate `tbl` blocks
in the document body, so they were never visited and were lost from the
output. (The `naive` parser already handles tables by iterating the
document body.)

## Changes

- Iterate `self.doc._element.body` so tables are visited in document
order alongside paragraphs.
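- Add a `__table_to_html` helper that renders each table to HTML,
including merged-cell `colspan` detection (adjacent cells with identical
text are collapsed into one cell; mirrors the `naive` parser's logic) and
HTML-escaping of cell text so the generated markup stays well-formed.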
- Inject each table into the section tree with a sentinel level deeper
than any heading, so `Node.build_tree` merges it into its **enclosing
section** — keeping the chapter/article title path as retrieval context
rather than producing an orphaned chunk.
- Guard the `h2_level` computation against an empty heading set, so a
tables-only or empty DOCX no longer raises `IndexError`.

This keeps the laws parser's hierarchical chunking **and** adds table
extraction, so users no longer have to choose between losing structure
(naive) or losing tables (laws).

## Tests

Adds `test/unit_test/rag/test_laws_docx_tables.py` covering:
- table content is preserved and carries its section title path,
- merged adjacent cells collapse to `colspan`,
- special characters in cell text are HTML-escaped,
- tables-only document does not crash,
- empty document returns `[]`.

All tests pass; `ruff check` / `ruff format` are clean.
---
 rag/app/laws.py                             |  53 ++++++++-
 test/unit_test/rag/test_laws_docx_tables.py | 124 ++++++++++++++++++++
 2 files changed, 174 insertions(+), 3 deletions(-)
 create mode 100644 test/unit_test/rag/test_laws_docx_tables.py
diff --git a/rag/app/laws.py b/rag/app/laws.py
index 46829d23c2..d365340832 100644
--- a/rag/app/laws.py
+++ b/rag/app/laws.py
@@ -16,8 +16,11 @@
 
 import logging
 import re
+from html import escape as html_escape
 from io import BytesIO
 from docx import Document
+from docx.table import Table as DocxTable
+from docx.text.paragraph import Paragraph
 
 from common.constants import ParserType, MAXIMUM_PAGE_NUMBER
 from deepdoc.parser.utils import get_text
@@ -53,15 +56,56 @@ class Docx(DocxParser):
                     pn += 1
         return [line for line in lines if line]
 
+    def __table_to_html(self, tb):
+        html = "<table>"
+        for r in tb.rows:
+            html += "<tr>"
+            col_idx = 0
+            try:
+                while col_idx < len(r.cells):
+                    span = 1
+                    c = r.cells[col_idx]
+                    for j in range(col_idx + 1, len(r.cells)):
+                        if c.text == r.cells[j].text:
+                            span += 1
+                            col_idx = j
+                        else:
+                            break
+                    col_idx += 1
+                    cell = html_escape(c.text)
+                    html += f"<td>{cell}</td>" if span == 1 else f"<td colspan='{span}'>{cell}</td>"
+            except Exception as e:
+                logging.warning(f"Error parsing table, ignore: {e}")
+            html += "</tr>"
+        html += "</table>"
+        return html
+
     def __call__(self, filename, binary=None, from_page=0, to_page=MAXIMUM_PAGE_NUMBER):
         self.doc = Document(filename) if not binary else Document(BytesIO(binary))
         pn = 0
         lines = []
         level_set = set()
         bull = bullets_category([p.text for p in self.doc.paragraphs])
-        for p in self.doc.paragraphs:
+        # Tables carry no heading level; assign a sentinel deeper than any heading so
+        # build_tree merges them into the enclosing section as leaf content (keeping the
+        # section's title path as retrieval context) instead of dropping them.
+        table_level = 10**6
+        # Iterate over the document body so tables are visited in order alongside
+        # paragraphs (self.doc.paragraphs only yields paragraph elements, skipping tables).
+        for block in self.doc._element.body:
             if pn > to_page:
                 break
+
+            if block.tag.endswith("tbl"):
+                html = self.__table_to_html(DocxTable(block, self.doc))
+                if html:
+                    lines.append((table_level, html))
+                continue
+
+            if not block.tag.endswith("p"):
+                continue
+
+            p = Paragraph(block, self.doc)
             question_level, p_text = docx_question_level(p, bull)
             if not p_text.strip("\n"):
                 continue
@@ -76,8 +120,11 @@ class Docx(DocxParser):
 
         sorted_levels = sorted(level_set)
 
-        h2_level = sorted_levels[1] if len(sorted_levels) > 1 else 1
-        h2_level = sorted_levels[-2] if h2_level == sorted_levels[-1] and len(sorted_levels) > 2 else h2_level
+        if not sorted_levels:
+            h2_level = 1
+        else:
+            h2_level = sorted_levels[1] if len(sorted_levels) > 1 else 1
+            h2_level = sorted_levels[-2] if h2_level == sorted_levels[-1] and len(sorted_levels) > 2 else h2_level
 
         root = Node(level=0, depth=h2_level, texts=[])
         root.build_tree(lines)
diff --git a/test/unit_test/rag/test_laws_docx_tables.py b/test/unit_test/rag/test_laws_docx_tables.py
new file mode 100644
index 0000000000..0a49a791a9
--- /dev/null
+++ b/test/unit_test/rag/test_laws_docx_tables.py
@@ -0,0 +1,124 @@
+#
+#  Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+
+import sys
+import types
+from io import BytesIO
+
+import pytest
+from docx import Document
+
+
+def _stub(name, **attrs):
+    mod = types.ModuleType(name)
+    for key, value in attrs.items():
+        setattr(mod, key, value)
+    sys.modules.setdefault(name, mod)
+    return mod
+
+
+# Stub laws.py's app-layer siblings that the Docx parser never calls, so the module
+# can be imported without pulling in the LLM / vision / storage stacks.
+class _DummyBase:
+    def __init__(self, *a, **k):
+        pass
+
+
+_stub("deepdoc.parser", PdfParser=_DummyBase, DocxParser=_DummyBase, HtmlParser=_DummyBase)
+_stub("deepdoc.parser.utils", get_text=lambda *a, **k: "")
+_stub("rag.app.naive", by_plaintext=lambda *a, **k: ([], [], None), PARSERS={})
+_stub("common.parser_config_utils", normalize_layout_recognizer=lambda x: (x, None))
+
+from rag.app.laws import Docx  # noqa: E402
+
+
+def _build_docx(builder):
+    doc = Document()
+    builder(doc)
+    buf = BytesIO()
+    doc.save(buf)
+    return buf.getvalue()
+
+
+@pytest.mark.p2
+def test_laws_docx_preserves_table():
+    """Regression for #16008: the laws DOCX parser dropped tables entirely."""
+
+    def builder(d):
+        d.add_heading("Chapter 1 General Provisions", level=1)
+        d.add_heading("Article 2 Fee Schedule", level=2)
+        d.add_paragraph("The applicable fees are as follows:")
+        t = d.add_table(rows=2, cols=2)
+        t.cell(0, 0).text = "Item"
+        t.cell(0, 1).text = "Fee"
+        t.cell(1, 0).text = "Registration"
+        t.cell(1, 1).text = "100"
+
+    chunks = Docx()("law.docx", _build_docx(builder))
+
+    assert any("<table>" in c for c in chunks)
+    table_chunk = next(c for c in chunks if "<table>" in c)
+    # Table content is present...
+    assert "Registration" in table_chunk and "100" in table_chunk
+    # ...and it carries its enclosing section's title path for retrieval context.
+    assert "Article 2 Fee Schedule" in table_chunk
+
+
+@pytest.mark.p2
+def test_laws_docx_merged_cells_use_colspan():
+    def builder(d):
+        d.add_heading("Heading", level=1)
+        t = d.add_table(rows=1, cols=3)
+        # Identical adjacent cell text is collapsed into a single colspan cell.
+        t.cell(0, 0).text = "Merged"
+        t.cell(0, 1).text = "Merged"
+        t.cell(0, 2).text = "Other"
+
+    chunks = Docx()("law.docx", _build_docx(builder))
+    table_chunk = next(c for c in chunks if "<table>" in c)
+    assert "colspan='2'" in table_chunk
+    assert "<td>Other</td>" in table_chunk
+
+
+@pytest.mark.p2
+def test_laws_docx_escapes_cell_html():
+    def builder(d):
+        d.add_heading("Heading", level=1)
+        t = d.add_table(rows=1, cols=1)
+        t.cell(0, 0).text = "a < b & c > d"
+
+    chunks = Docx()("law.docx", _build_docx(builder))
+    table_chunk = next(c for c in chunks if "<table>" in c)
+    # Special characters are HTML-escaped so the table markup stays well-formed.
+    assert "a &lt; b &amp; c &gt; d" in table_chunk
+    assert "<td>a < b" not in table_chunk
+
+
+@pytest.mark.p2
+def test_laws_docx_tables_only_does_not_crash():
+    def builder(d):
+        t = d.add_table(rows=1, cols=2)
+        t.cell(0, 0).text = "a"
+        t.cell(0, 1).text = "b"
+
+    chunks = Docx()("law.docx", _build_docx(builder))
+    assert any("<table>" in c for c in chunks)
+
+
+@pytest.mark.p2
+def test_laws_docx_empty_doc_returns_empty():
+    chunks = Docx()("law.docx", _build_docx(lambda d: None))
+    assert chunks == []