From 70c0121b78e5b82402ad96efc2f3c6ecd74e0ef1 Mon Sep 17 00:00:00 2001 From: Manan Bansal <66985466+manan-tech@users.noreply.github.com> Date: Mon, 22 Jun 2026 07:16:44 +0530 Subject: [PATCH] Fix: preserve tables when parsing DOCX with the laws parser (#16008) (#16155) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## What Fixes #16008 — tables contained in a DOCX are silently dropped when the document is parsed with the **laws** chunking method. ## Root cause `Docx.__call__` in `rag/app/laws.py` iterated `self.doc.paragraphs`, which only yields paragraph elements. Tables are separate `tbl` blocks in the document body, so they were never visited and were lost from the output. (The `naive` parser already handles tables by iterating the document body.) ## Changes - Iterate `self.doc._element.body` so tables are visited in document order alongside paragraphs. - Add a `__table_to_html` helper that renders each table to HTML, including merged-cell `colspan` detection (mirrors the `naive` parser's logic). - Inject each table into the section tree with a sentinel level deeper than any heading, so `Node.build_tree` merges it into its **enclosing section** — keeping the chapter/article title path as retrieval context rather than producing an orphaned chunk. - Guard the `h2_level` computation against an empty heading set, so a tables-only or empty DOCX no longer raises `IndexError`. This keeps the laws parser's hierarchical chunking **and** adds table extraction, so users no longer have to choose between losing structure (naive) or losing tables (laws). ## Tests Adds `test/unit_test/rag/test_laws_docx_tables.py` covering: - table content is preserved and carries its section title path, - merged adjacent cells collapse to `colspan`, - cell text is HTML-escaped so the table markup stays well-formed, - tables-only document does not crash, - empty document returns `[]`. All five pass; `ruff check` / `ruff format` are clean. 
--- rag/app/laws.py | 53 ++++++++- test/unit_test/rag/test_laws_docx_tables.py | 124 ++++++++++++++++++++ 2 files changed, 174 insertions(+), 3 deletions(-) create mode 100644 test/unit_test/rag/test_laws_docx_tables.py diff --git a/rag/app/laws.py b/rag/app/laws.py index 46829d23c2..d365340832 100644 --- a/rag/app/laws.py +++ b/rag/app/laws.py @@ -16,8 +16,11 @@ import logging import re +from html import escape as html_escape from io import BytesIO from docx import Document +from docx.table import Table as DocxTable +from docx.text.paragraph import Paragraph from common.constants import ParserType, MAXIMUM_PAGE_NUMBER from deepdoc.parser.utils import get_text @@ -53,15 +56,56 @@ class Docx(DocxParser): pn += 1 return [line for line in lines if line] + def __table_to_html(self, tb): + html = "<table>" + for r in tb.rows: + html += "<tr>" + col_idx = 0 + try: + while col_idx < len(r.cells): + span = 1 + c = r.cells[col_idx] + for j in range(col_idx + 1, len(r.cells)): + if c.text == r.cells[j].text: + span += 1 + col_idx = j + else: + break + col_idx += 1 + cell = html_escape(c.text) + html += f"<td>{cell}</td>" if span == 1 else f"<td colspan='{span}'>{cell}</td>" + except Exception as e: + logging.warning(f"Error parsing table, ignore: {e}") + html += "</tr>" + html += "</table>" + return html + def __call__(self, filename, binary=None, from_page=0, to_page=MAXIMUM_PAGE_NUMBER): self.doc = Document(filename) if not binary else Document(BytesIO(binary)) pn = 0 lines = [] level_set = set() bull = bullets_category([p.text for p in self.doc.paragraphs]) - for p in self.doc.paragraphs: + # Tables carry no heading level; assign a sentinel deeper than any heading so + # build_tree merges them into the enclosing section as leaf content (keeping the + # section's title path as retrieval context) instead of dropping them. + table_level = 10**6 + # Iterate over the document body so tables are visited in order alongside + # paragraphs (self.doc.paragraphs only yields paragraph elements, skipping tables). + for block in self.doc._element.body: if pn > to_page: break + + if block.tag.endswith("tbl"): + html = self.__table_to_html(DocxTable(block, self.doc)) + if html: + lines.append((table_level, html)) + continue + + if not block.tag.endswith("p"): + continue + + p = Paragraph(block, self.doc) question_level, p_text = docx_question_level(p, bull) if not p_text.strip("\n"): continue @@ -76,8 +120,11 @@ class Docx(DocxParser): sorted_levels = sorted(level_set) - h2_level = sorted_levels[1] if len(sorted_levels) > 1 else 1 - h2_level = sorted_levels[-2] if h2_level == sorted_levels[-1] and len(sorted_levels) > 2 else h2_level + if not sorted_levels: + h2_level = 1 + else: + h2_level = sorted_levels[1] if len(sorted_levels) > 1 else 1 + h2_level = sorted_levels[-2] if h2_level == sorted_levels[-1] and len(sorted_levels) > 2 else h2_level root = Node(level=0, depth=h2_level, texts=[]) root.build_tree(lines) diff --git a/test/unit_test/rag/test_laws_docx_tables.py b/test/unit_test/rag/test_laws_docx_tables.py new file mode 100644 index 0000000000..0a49a791a9 --- /dev/null +++ b/test/unit_test/rag/test_laws_docx_tables.py @@ -0,0 +1,124 @@ +# +# Copyright 2025 The InfiniFlow Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +import sys +import types +from io import BytesIO + +import pytest +from docx import Document + + +def _stub(name, **attrs): + mod = types.ModuleType(name) + for key, value in attrs.items(): + setattr(mod, key, value) + sys.modules.setdefault(name, mod) + return mod + + +# Stub laws.py's app-layer siblings that the Docx parser never calls, so the module +# can be imported without pulling in the LLM / vision / storage stacks. +class _DummyBase: + def __init__(self, *a, **k): + pass + + +_stub("deepdoc.parser", PdfParser=_DummyBase, DocxParser=_DummyBase, HtmlParser=_DummyBase) +_stub("deepdoc.parser.utils", get_text=lambda *a, **k: "") +_stub("rag.app.naive", by_plaintext=lambda *a, **k: ([], [], None), PARSERS={}) +_stub("common.parser_config_utils", normalize_layout_recognizer=lambda x: (x, None)) + +from rag.app.laws import Docx # noqa: E402 + + +def _build_docx(builder): + doc = Document() + builder(doc) + buf = BytesIO() + doc.save(buf) + return buf.getvalue() + + +@pytest.mark.p2 +def test_laws_docx_preserves_table(): + """Regression for #16008: the laws DOCX parser dropped tables entirely.""" + + def builder(d): + d.add_heading("Chapter 1 General Provisions", level=1) + d.add_heading("Article 2 Fee Schedule", level=2) + d.add_paragraph("The applicable fees are as follows:") + t = d.add_table(rows=2, cols=2) + t.cell(0, 0).text = "Item" + t.cell(0, 1).text = "Fee" + t.cell(1, 0).text = "Registration" + t.cell(1, 1).text = "100" + + chunks = Docx()("law.docx", _build_docx(builder)) + + assert any("<table>" in c for c in chunks) + table_chunk = next(c for c in chunks if "<table>" in c) + # Table content is present... + assert "Registration" in table_chunk and "100" in table_chunk + # ...and it carries its enclosing section's title path for retrieval context. + assert "Article 2 Fee Schedule" in table_chunk + + +@pytest.mark.p2 +def test_laws_docx_merged_cells_use_colspan(): + def builder(d): + d.add_heading("Heading", level=1) + t = d.add_table(rows=1, cols=3) + # Identical adjacent cell text is collapsed into a single colspan cell. + t.cell(0, 0).text = "Merged" + t.cell(0, 1).text = "Merged" + t.cell(0, 2).text = "Other" + + chunks = Docx()("law.docx", _build_docx(builder)) + table_chunk = next(c for c in chunks if "<table>" in c) + assert "colspan='2'" in table_chunk + assert "<td>Other</td>" in table_chunk + + +@pytest.mark.p2 +def test_laws_docx_escapes_cell_html(): + def builder(d): + d.add_heading("Heading", level=1) + t = d.add_table(rows=1, cols=1) + t.cell(0, 0).text = "a < b & c > d" + + chunks = Docx()("law.docx", _build_docx(builder)) + table_chunk = next(c for c in chunks if "<table>" in c) + # Special characters are HTML-escaped so the table markup stays well-formed. + assert "a &lt; b &amp; c &gt; d" in table_chunk + assert "<td>a < b" not in table_chunk + + +@pytest.mark.p2 +def test_laws_docx_tables_only_does_not_crash(): + def builder(d): + t = d.add_table(rows=1, cols=2) + t.cell(0, 0).text = "a" + t.cell(0, 1).text = "b" + + chunks = Docx()("law.docx", _build_docx(builder)) + assert any("<table>" in c for c in chunks) + + +@pytest.mark.p2 +def test_laws_docx_empty_doc_returns_empty(): + chunks = Docx()("law.docx", _build_docx(lambda d: None)) + assert chunks == []