mirror of
https://github.com/infiniflow/ragflow.git
synced 2026-06-29 23:41:12 +08:00
Fix: docx parsing raises ValueError on 'Heading' styles (#16284)
This commit is contained in:
@@ -1201,7 +1201,12 @@ def naive_merge_with_images(texts, images, chunk_token_num=128, delimiter="\n。
|
||||
def docx_question_level(p, bull=-1):
|
||||
txt = re.sub(r"\u3000", " ", p.text).strip()
|
||||
if hasattr(p.style, 'name') and p.style.name and p.style.name.startswith('Heading'):
|
||||
return int(p.style.name.split(' ')[-1]), txt
|
||||
# Heading styles are usually "Heading N", but the base "Heading" style,
|
||||
# custom "Heading"-prefixed styles, or "HeadingN" (no space) have no
|
||||
# space-separated trailing integer. Extract the level digits safely and
|
||||
# fall back to the top heading level instead of raising ValueError (#16163).
|
||||
m = re.search(r"\d+", p.style.name)
|
||||
return (int(m.group()) if m else 1), txt
|
||||
else:
|
||||
if bull < 0:
|
||||
return 0, txt
|
||||
|
||||
88
test/unit_test/rag/test_docx_question_level.py
Normal file
88
test/unit_test/rag/test_docx_question_level.py
Normal file
@@ -0,0 +1,88 @@
|
||||
#
|
||||
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
import sys
|
||||
import types
|
||||
|
||||
import pytest
|
||||
|
||||
|
||||
def _stub(name, **attrs):
    """Build a placeholder module called *name* and register it if absent.

    Each keyword argument becomes an attribute of the new module. The module
    is stored in ``sys.modules`` only when no entry for *name* exists yet; an
    existing entry is left untouched. The freshly built module is returned in
    both cases.
    """
    placeholder = types.ModuleType(name)
    for attr_name in attrs:
        setattr(placeholder, attr_name, attrs[attr_name])
    # Never overwrite a module that is already registered under this name.
    if name not in sys.modules:
        sys.modules[name] = placeholder
    return placeholder
|
||||
|
||||
|
||||
# rag.nlp imports several heavy packages at module level. Register lightweight
# placeholders for them first so that rag.nlp can be imported in isolation.
_stub(
    "common.token_utils",
    num_tokens_from_string=lambda *a, **k: 0,
)
_stub("roman_numbers")
_stub(
    "word2number",
    w2n=types.SimpleNamespace(),
)
_stub(
    "cn2an",
    cn2an=lambda *a, **k: 0,
)
# The PIL placeholder also exposes an ``Image`` submodule attribute.
_pil = _stub("PIL")
setattr(_pil, "Image", _stub("PIL.Image"))
_stub("chardet")
|
||||
|
||||
from rag.nlp import docx_question_level
|
||||
|
||||
|
||||
class _Style:
    """Minimal stand-in for a paragraph style; it only carries ``name``."""

    def __init__(self, name):
        # Stored as given, with no validation or normalisation.
        self.name = name
|
||||
|
||||
|
||||
class _Paragraph:
    """Minimal stand-in for a docx paragraph: a ``text`` and a ``style``.

    ``style`` is a ``_Style`` built from *style_name*; ``text`` defaults to
    ``"Some title"``.
    """

    def __init__(self, style_name, text="Some title"):
        self.text = text
        self.style = _Style(style_name)
|
||||
|
||||
|
||||
@pytest.mark.p2
@pytest.mark.parametrize(
    "style_name, expected_level",
    [
        ("Heading 1", 1),
        ("Heading 2", 2),
        ("Heading 9", 9),
        ("Heading 10", 10),
        # No space between the prefix and the number.
        ("Heading1", 1),
        # Base style without a number -> top level.
        ("Heading", 1),
        # Custom prefix without a number -> top level.
        ("HeadingTitle", 1),
        # Custom prefix with a space but no number -> top level.
        ("Heading Title", 1),
    ],
)
def test_docx_question_level_heading_styles(style_name, expected_level):
    """Each "Heading"-prefixed style maps to the expected level and text."""
    paragraph = _Paragraph(style_name)
    level, text = docx_question_level(paragraph)
    assert (level, text) == (expected_level, "Some title")
|
||||
|
||||
|
||||
@pytest.mark.p2
def test_docx_question_level_no_number_does_not_raise():
    """Regression for #16163.

    A "Heading"-prefixed style without a parseable number used to raise
    ``ValueError: invalid literal for int()``; it must now yield level 1.
    """
    numberless_styles = ("Heading", "HeadingTitle", "Heading Title")
    for style_name in numberless_styles:
        paragraph = _Paragraph(style_name)
        level, _ = docx_question_level(paragraph)
        assert level == 1
|
||||
|
||||
|
||||
@pytest.mark.p2
def test_docx_question_level_non_heading_default_bull():
    """A non-heading paragraph with the default ``bull=-1`` is level 0 (body text)."""
    body = _Paragraph("Normal", text="just a body line")
    level, text = docx_question_level(body)
    assert (level, text) == (0, "just a body line")
|
||||
Reference in New Issue
Block a user