From 14565b289af86145f7bbd56f42c9f2e85d5021a0 Mon Sep 17 00:00:00 2001
From: minion1227
Date: Tue, 23 Jun 2026 22:16:16 -0700
Subject: [PATCH] Fix: docx parsing raises ValueError on 'Heading' styles (#16284)

---
Reviewer notes (text placed between the "---" separator and the diff is not
part of the commit message):

* rag/nlp/__init__.py: docx_question_level() no longer calls int() on the
  last space-separated token of a "Heading"-prefixed style name. It now uses
  the first run of digits found anywhere in the style name and falls back to
  level 1 when the name contains no digits.
* test/unit_test/rag/test_docx_question_level.py: new unit tests. The module
  registers stub modules through sys.modules.setdefault() before importing
  rag.nlp, so a module that is already imported is left in place; _stub()
  returns the freshly created module object either way.
* The subject line references #16284 while the code comments reference
  #16163. NOTE(review): presumably PR number vs. issue number -- confirm.

 rag/nlp/__init__.py                           |  7 +-
 .../unit_test/rag/test_docx_question_level.py | 88 +++++++++++++++++++
 2 files changed, 94 insertions(+), 1 deletion(-)
 create mode 100644 test/unit_test/rag/test_docx_question_level.py

diff --git a/rag/nlp/__init__.py b/rag/nlp/__init__.py
index 9e7f332dbd..6e994ac4cc 100644
--- a/rag/nlp/__init__.py
+++ b/rag/nlp/__init__.py
@@ -1201,7 +1201,12 @@ def naive_merge_with_images(texts, images, chunk_token_num=128, delimiter="\n。
 def docx_question_level(p, bull=-1):
     txt = re.sub(r"\u3000", " ", p.text).strip()
     if hasattr(p.style, 'name') and p.style.name and p.style.name.startswith('Heading'):
-        return int(p.style.name.split(' ')[-1]), txt
+        # Heading styles are usually "Heading N", but the base "Heading" style,
+        # custom "Heading"-prefixed styles, or "HeadingN" (no space) have no
+        # space-separated trailing integer. Extract the level digits safely and
+        # fall back to the top heading level instead of raising ValueError (#16163).
+        m = re.search(r"\d+", p.style.name)
+        return (int(m.group()) if m else 1), txt
     else:
         if bull < 0:
             return 0, txt
diff --git a/test/unit_test/rag/test_docx_question_level.py b/test/unit_test/rag/test_docx_question_level.py
new file mode 100644
index 0000000000..7272d15ea2
--- /dev/null
+++ b/test/unit_test/rag/test_docx_question_level.py
@@ -0,0 +1,88 @@
+#
+# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+import sys
+import types
+
+import pytest
+
+
+def _stub(name, **attrs):
+    mod = types.ModuleType(name)
+    for key, value in attrs.items():
+        setattr(mod, key, value)
+    sys.modules.setdefault(name, mod)
+    return mod
+
+
+# Stub heavy module-level imports so rag.nlp can be imported in isolation.
+_stub("common.token_utils", num_tokens_from_string=lambda *a, **k: 0)
+_stub("roman_numbers")
+_stub("word2number", w2n=types.SimpleNamespace())
+_stub("cn2an", cn2an=lambda *a, **k: 0)
+_pil = _stub("PIL")
+_pil.Image = _stub("PIL.Image")
+_stub("chardet")
+
+from rag.nlp import docx_question_level
+
+
+class _Style:
+    def __init__(self, name):
+        self.name = name
+
+
+class _Paragraph:
+    def __init__(self, style_name, text="Some title"):
+        self.style = _Style(style_name)
+        self.text = text
+
+
+@pytest.mark.p2
+@pytest.mark.parametrize(
+    "style_name, expected_level",
+    [
+        ("Heading 1", 1),
+        ("Heading 2", 2),
+        ("Heading 9", 9),
+        ("Heading 10", 10),
+        ("Heading1", 1),  # no space
+        ("Heading", 1),  # base style, no number -> top level
+        ("HeadingTitle", 1),  # custom prefix, no number -> top level
+        ("Heading Title", 1),  # custom prefix with space, no number -> top level
+    ],
+)
+def test_docx_question_level_heading_styles(style_name, expected_level):
+    level, text = docx_question_level(_Paragraph(style_name))
+    assert level == expected_level
+    assert text == "Some title"
+
+
+@pytest.mark.p2
+def test_docx_question_level_no_number_does_not_raise():
+    # Regression for #16163: a "Heading"-prefixed style without a parseable
+    # number used to raise ValueError: invalid literal for int().
+    for name in ("Heading", "HeadingTitle", "Heading Title"):
+        level, _ = docx_question_level(_Paragraph(name))
+        assert level == 1
+
+
+@pytest.mark.p2
+def test_docx_question_level_non_heading_default_bull():
+    # Non-heading paragraph with the default bull=-1 returns level 0 (body text).
+    level, text = docx_question_level(_Paragraph("Normal", text="just a body line"))
+    assert level == 0
+    assert text == "just a body line"