Fix: preserve tables when parsing DOCX with the laws parser (#16008) (#16155)

## What

Fixes #16008 — tables contained in a DOCX are silently dropped when the
document is parsed with the **laws** chunking method.

## Root cause

`Docx.__call__` in `rag/app/laws.py` iterated `self.doc.paragraphs`,
which only yields paragraph elements. Tables are separate `tbl` blocks
in the document body, so they were never visited and were lost from the
output. (The `naive` parser already handles tables by iterating the
document body.)

## Changes

- Iterate `self.doc._element.body` so tables are visited in document
order alongside paragraphs.
- Add a `__table_to_html` helper that renders each table to HTML,
including merged-cell `colspan` detection (mirrors the `naive` parser's
logic).
- Inject each table into the section tree with a sentinel level deeper
than any heading, so `Node.build_tree` merges it into its **enclosing
section** — keeping the chapter/article title path as retrieval context
rather than producing an orphaned chunk.
- Guard the `h2_level` computation against an empty heading set, so a
tables-only or empty DOCX no longer raises `IndexError`.

This keeps the laws parser's hierarchical chunking **and** adds table
extraction, so users no longer have to choose between losing structure
(naive) or losing tables (laws).

## Tests

Adds `test/unit_test/rag/test_laws_docx_tables.py` covering:
- table content is preserved and carries its section title path,
- adjacent cells with identical text collapse to a single `colspan` cell,
- special characters in cell text are HTML-escaped,
- tables-only document does not crash,
- empty document returns `[]`.

All five pass; `ruff check` / `ruff format` are clean.
This commit is contained in:
Manan Bansal
2026-06-22 07:16:44 +05:30
committed by GitHub
parent 760229d917
commit 70c0121b78
2 changed files with 174 additions and 3 deletions

View File

@@ -0,0 +1,124 @@
#
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import sys
import types
from io import BytesIO
import pytest
from docx import Document
def _stub(name, **attrs):
    """Register a placeholder module under *name* unless one already exists.

    Args:
        name: Dotted module name to register in ``sys.modules``.
        **attrs: Attributes to set on the freshly created placeholder.

    Returns:
        The module that ``sys.modules`` holds for *name* after the call:
        either the new placeholder, or the module that was already
        registered (which is left untouched and does not receive *attrs*).
    """
    mod = types.ModuleType(name)
    for key, value in attrs.items():
        setattr(mod, key, value)
    # setdefault never overwrites an existing entry; return what is actually
    # registered so callers never hold a module the import system ignores.
    return sys.modules.setdefault(name, mod)
# Stub laws.py's app-layer siblings that the Docx parser never calls, so the module
# can be imported without pulling in the LLM / vision / storage stacks.
class _DummyBase:
    """Placeholder base class whose constructor accepts and ignores any arguments."""

    def __init__(self, *args, **kwargs):
        # Intentionally a no-op: nothing is stored on the instance.
        return None
# Register lightweight stand-ins before importing rag.app.laws. `_stub` uses
# sys.modules.setdefault, so a module that is already imported is left in place.
_stub("deepdoc.parser", PdfParser=_DummyBase, DocxParser=_DummyBase, HtmlParser=_DummyBase)
# get_text stand-in ignores its arguments and returns an empty string.
_stub("deepdoc.parser.utils", get_text=lambda *a, **k: "")
# by_plaintext stand-in ignores its arguments and returns ([], [], None).
_stub("rag.app.naive", by_plaintext=lambda *a, **k: ([], [], None), PARSERS={})
# normalize_layout_recognizer stand-in echoes its argument paired with None.
_stub("common.parser_config_utils", normalize_layout_recognizer=lambda x: (x, None))
# Must come after the stub registrations above, hence the E402 suppression.
from rag.app.laws import Docx # noqa: E402
def _build_docx(builder):
    """Create a new document, let *builder* populate it, and return it as bytes.

    Args:
        builder: Callable invoked with the new ``Document`` instance.

    Returns:
        The bytes written by ``Document.save`` into an in-memory buffer.
    """
    document = Document()
    builder(document)
    with BytesIO() as stream:
        document.save(stream)
        return stream.getvalue()
@pytest.mark.p2
def test_laws_docx_preserves_table():
    """Regression for #16008: the laws DOCX parser dropped tables entirely."""

    def populate(document):
        document.add_heading("Chapter 1 General Provisions", level=1)
        document.add_heading("Article 2 Fee Schedule", level=2)
        document.add_paragraph("The applicable fees are as follows:")
        table = document.add_table(rows=2, cols=2)
        grid = (("Item", "Fee"), ("Registration", "100"))
        for row_idx, row in enumerate(grid):
            for col_idx, text in enumerate(row):
                table.cell(row_idx, col_idx).text = text

    payload = _build_docx(populate)
    chunks = Docx()("law.docx", payload)

    table_chunks = [chunk for chunk in chunks if "<table>" in chunk]
    assert table_chunks
    table_chunk = table_chunks[0]

    # The table's cell text must survive parsing...
    assert "Registration" in table_chunk
    assert "100" in table_chunk
    # ...and the chunk must include the enclosing section title for retrieval context.
    assert "Article 2 Fee Schedule" in table_chunk
@pytest.mark.p2
def test_laws_docx_merged_cells_use_colspan():
    """Expect adjacent cells with identical text to be emitted as one ``colspan`` cell."""

    def builder(d):
        d.add_heading("Heading", level=1)
        t = d.add_table(rows=1, cols=3)
        # Identical adjacent cell text is collapsed into a single colspan cell.
        t.cell(0, 0).text = "Merged"
        t.cell(0, 1).text = "Merged"
        t.cell(0, 2).text = "Other"

    chunks = Docx()("law.docx", _build_docx(builder))
    # Use a default so a missing table fails as a readable assertion rather
    # than a bare StopIteration from next().
    table_chunk = next((c for c in chunks if "<table>" in c), None)
    assert table_chunk is not None, f"no <table> chunk found in {chunks!r}"
    assert "colspan='2'" in table_chunk
    assert "<td>Other</td>" in table_chunk
@pytest.mark.p2
def test_laws_docx_escapes_cell_html():
    """Expect ``<``, ``>`` and ``&`` in cell text to be HTML-escaped in the table chunk."""

    def builder(d):
        d.add_heading("Heading", level=1)
        t = d.add_table(rows=1, cols=1)
        t.cell(0, 0).text = "a < b & c > d"

    chunks = Docx()("law.docx", _build_docx(builder))
    # Use a default so a missing table fails as a readable assertion rather
    # than a bare StopIteration from next().
    table_chunk = next((c for c in chunks if "<table>" in c), None)
    assert table_chunk is not None, f"no <table> chunk found in {chunks!r}"
    # Special characters are HTML-escaped so the table markup stays well-formed.
    assert "a &lt; b &amp; c &gt; d" in table_chunk
    assert "<td>a < b" not in table_chunk
@pytest.mark.p2
def test_laws_docx_tables_only_does_not_crash():
    """A DOCX holding only a table (no headings) must still yield a table chunk."""

    def populate(document):
        table = document.add_table(rows=1, cols=2)
        for col_idx, text in enumerate(("a", "b")):
            table.cell(0, col_idx).text = text

    payload = _build_docx(populate)
    chunks = Docx()("law.docx", payload)
    assert any("<table>" in chunk for chunk in chunks)
@pytest.mark.p2
def test_laws_docx_empty_doc_returns_empty():
    """A document to which the builder adds nothing must parse to an empty list."""

    def leave_untouched(document):
        return None

    payload = _build_docx(leave_untouched)
    assert Docx()("law.docx", payload) == []