Fix: preserve tables when parsing DOCX with the laws parser (#16008) (#16155)

## What

Fixes #16008 — tables contained in a DOCX are silently dropped when the document is parsed with the **laws** chunking method.

## Root cause

`Docx.__call__` in `rag/app/laws.py` iterated `self.doc.paragraphs`, which only yields paragraph elements. Tables are separate `tbl` blocks in the document body, so they were never visited and were lost from the output. (The `naive` parser already handles tables by iterating the document body.)

## Changes

- Iterate `self.doc._element.body` so tables are visited in document order alongside paragraphs.
- Add a `__table_to_html` helper that renders each table to HTML, including merged-cell `colspan` detection (mirrors the `naive` parser's logic).
- Inject each table into the section tree with a sentinel level deeper than any heading, so `Node.build_tree` merges it into its **enclosing section** — keeping the chapter/article title path as retrieval context rather than producing an orphaned chunk.
- Guard the `h2_level` computation against an empty heading set, so a tables-only or empty DOCX no longer raises `IndexError`.

This keeps the laws parser's hierarchical chunking **and** adds table extraction, so users no longer have to choose between losing structure (naive) or losing tables (laws).

## Tests

Adds `test/unit_test/rag/test_laws_docx_tables.py` covering:

- table content is preserved and carries its section title path,
- merged adjacent cells collapse to `colspan`,
- special characters in cell text are HTML-escaped,
- tables-only document does not crash,
- empty document returns `[]`.

All five pass; `ruff check` / `ruff format` are clean.
2026-06-29 15:31:05 +08:00 · 2026-06-22 07:16:44 +05:30
parent 760229d917
commit 70c0121b78
2 changed files with 174 additions and 3 deletions
--- a/test/unit_test/rag/test_laws_docx_tables.py
+++ b/test/unit_test/rag/test_laws_docx_tables.py
@@ -0,0 +1,124 @@
+#
+#  Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+
+import sys
+import types
+from io import BytesIO
+
+import pytest
+from docx import Document
+
+
+def _stub(name, **attrs):
+    mod = types.ModuleType(name)  # empty placeholder module carrying only the given attributes
+    for key, value in attrs.items():
+        setattr(mod, key, value)
+    sys.modules.setdefault(name, mod)  # register only if absent; an already-imported module is left in place
+    return mod  # NOTE(review): this is the new stub even when setdefault kept an existing module
+
+
+# Stub laws.py's app-layer siblings that the Docx parser never calls, so the module
+# can be imported without pulling in the LLM / vision / storage stacks.
+class _DummyBase:  # stand-in bound to PdfParser/DocxParser/HtmlParser in the deepdoc.parser stub
+    def __init__(self, *a, **k):  # accept and discard any constructor arguments
+        pass
+
+
+_stub("deepdoc.parser", PdfParser=_DummyBase, DocxParser=_DummyBase, HtmlParser=_DummyBase)
+_stub("deepdoc.parser.utils", get_text=lambda *a, **k: "")
+_stub("rag.app.naive", by_plaintext=lambda *a, **k: ([], [], None), PARSERS={})
+_stub("common.parser_config_utils", normalize_layout_recognizer=lambda x: (x, None))
+# Import only after the stub registration above, so laws.py's imports can resolve to the stubs.
+from rag.app.laws import Docx  # noqa: E402
+
+
+def _build_docx(builder):  # builder(doc) fills a fresh Document; the saved file's bytes are returned
+    doc = Document()
+    builder(doc)
+    buf = BytesIO()
+    doc.save(buf)  # serialize in memory instead of touching the filesystem
+    return buf.getvalue()
+
+
+@pytest.mark.p2
+def test_laws_docx_preserves_table():
+    """Regression for #16008: the laws DOCX parser dropped tables entirely."""
+
+    def builder(d):
+        d.add_heading("Chapter 1 General Provisions", level=1)
+        d.add_heading("Article 2 Fee Schedule", level=2)
+        d.add_paragraph("The applicable fees are as follows:")
+        t = d.add_table(rows=2, cols=2)
+        t.cell(0, 0).text = "Item"
+        t.cell(0, 1).text = "Fee"
+        t.cell(1, 0).text = "Registration"
+        t.cell(1, 1).text = "100"
+
+    chunks = Docx()("law.docx", _build_docx(builder))  # called with a filename and the raw .docx bytes
+
+    assert any("<table>" in c for c in chunks)
+    table_chunk = next(c for c in chunks if "<table>" in c)
+    # Table content is present...
+    assert "Registration" in table_chunk and "100" in table_chunk
+    # ...and it carries its enclosing section's title path for retrieval context.
+    assert "Article 2 Fee Schedule" in table_chunk
+
+
+@pytest.mark.p2
+def test_laws_docx_merged_cells_use_colspan():
+    def builder(d):
+        d.add_heading("Heading", level=1)
+        t = d.add_table(rows=1, cols=3)
+        # No real cell merge is made here: identical adjacent cell text is collapsed into a single colspan cell.
+        t.cell(0, 0).text = "Merged"
+        t.cell(0, 1).text = "Merged"
+        t.cell(0, 2).text = "Other"
+
+    chunks = Docx()("law.docx", _build_docx(builder))
+    table_chunk = next(c for c in chunks if "<table>" in c)
+    assert "colspan='2'" in table_chunk  # the two "Merged" cells become one cell spanning two columns
+    assert "<td>Other</td>" in table_chunk  # the distinct third cell stays a plain cell
+
+
+@pytest.mark.p2
+def test_laws_docx_escapes_cell_html():
+    def builder(d):
+        d.add_heading("Heading", level=1)
+        t = d.add_table(rows=1, cols=1)
+        t.cell(0, 0).text = "a < b & c > d"  # contains all three characters the assertions below check
+
+    chunks = Docx()("law.docx", _build_docx(builder))
+    table_chunk = next(c for c in chunks if "<table>" in c)
+    # Special characters are HTML-escaped so the table markup stays well-formed.
+    assert "a &lt; b &amp; c &gt; d" in table_chunk
+    assert "<td>a < b" not in table_chunk  # the raw, unescaped text must not appear inside the cell
+
+
+@pytest.mark.p2
+def test_laws_docx_tables_only_does_not_crash():
+    def builder(d):
+        t = d.add_table(rows=1, cols=2)  # no headings or paragraphs are added, only this table
+        t.cell(0, 0).text = "a"
+        t.cell(0, 1).text = "b"
+
+    chunks = Docx()("law.docx", _build_docx(builder))  # must not raise when the document has no headings
+    assert any("<table>" in c for c in chunks)
+
+
+@pytest.mark.p2
+def test_laws_docx_empty_doc_returns_empty():
+    chunks = Docx()("law.docx", _build_docx(lambda d: None))  # the builder adds nothing to the document
+    assert chunks == []