diff --git a/deepdoc/parser/excel_parser.py b/deepdoc/parser/excel_parser.py index acbd98f228..21e9dc1276 100644 --- a/deepdoc/parser/excel_parser.py +++ b/deepdoc/parser/excel_parser.py @@ -229,7 +229,12 @@ class RAGFlowExcelParser: tb_rows_0 += f"{escape(_fmt(t.value))}" tb_rows_0 += "" - for chunk_i in range((len(rows) - 1) // chunk_rows + 1): + # rows[0] is the header; split the remaining data rows into + # ceil(n_data / chunk_rows) chunks. Using +1 here over-counts by one + # when the data-row count is an exact multiple of chunk_rows and emits + # a spurious header-only chunk. + n_data_rows = len(rows) - 1 + for chunk_i in range((n_data_rows + chunk_rows - 1) // chunk_rows): tb = "" tb += f"" tb += tb_rows_0 diff --git a/test/unit_test/deepdoc/parser/test_excel_parser.py b/test/unit_test/deepdoc/parser/test_excel_parser.py new file mode 100644 index 0000000000..9036284643 --- /dev/null +++ b/test/unit_test/deepdoc/parser/test_excel_parser.py @@ -0,0 +1,92 @@ +# +# Copyright 2025 The InfiniFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +import importlib.util +import os +import sys +from io import BytesIO +from unittest import mock + +import pytest + +# Import RAGFlowExcelParser directly by file path to avoid triggering +# deepdoc/parser/__init__.py and rag.nlp, which pull in heavy dependencies. +for _m in ["pandas", "rag.nlp", "rag.utils", "rag.utils.lazy_image"]: + if _m not in sys.modules: + sys.modules[_m] = mock.MagicMock() + + +def _find_project_root(marker="pyproject.toml"): + d = os.path.dirname(os.path.abspath(__file__)) + while d != os.path.dirname(d): + if os.path.exists(os.path.join(d, marker)): + return d + d = os.path.dirname(d) + return None + + +_PROJECT_ROOT = _find_project_root() +_spec = importlib.util.spec_from_file_location( + "deepdoc.parser.excel_parser", + os.path.join(_PROJECT_ROOT, "deepdoc", "parser", "excel_parser.py"), +) +_mod = importlib.util.module_from_spec(_spec) +sys.modules["deepdoc.parser.excel_parser"] = _mod +_spec.loader.exec_module(_mod) + +RAGFlowExcelParser = _mod.RAGFlowExcelParser + + +def _make_xlsx(n_data_rows): + from openpyxl import Workbook + + wb = Workbook() + ws = wb.active + ws.append(["H1", "H2"]) + for i in range(n_data_rows): + ws.append([f"a{i}", f"b{i}"]) + buf = BytesIO() + wb.save(buf) + buf.seek(0) + return buf.read() + + +def _chunk_has_no_data_cells(chunk): + return "" not in chunk + + +@pytest.mark.p2 +def test_exact_multiple_does_not_emit_header_only_chunk(): + # 12 data rows with chunk_rows=12 (the value rag/app/naive.py uses). + chunks = RAGFlowExcelParser().html(_make_xlsx(12), chunk_rows=12) + assert len(chunks) == 1 + assert all(not _chunk_has_no_data_cells(c) for c in chunks) + + +@pytest.mark.p2 +def test_multiple_of_chunk_rows_splits_without_spurious_chunk(): + # 24 data rows with chunk_rows=12 -> exactly 2 data chunks, no trailing header-only chunk. + chunks = RAGFlowExcelParser().html(_make_xlsx(24), chunk_rows=12) + assert len(chunks) == 2 + assert all(not _chunk_has_no_data_cells(c) for c in chunks) + + +@pytest.mark.p2 +def test_non_multiple_unchanged(): + # 13 data rows with chunk_rows=12 -> 2 chunks (12 + 1). + chunks = RAGFlowExcelParser().html(_make_xlsx(13), chunk_rows=12) + assert len(chunks) == 2 + assert all(not _chunk_has_no_data_cells(c) for c in chunks)
{sheetname}
" not in chunk and "