mirror of
https://github.com/infiniflow/ragflow.git
synced 2026-06-29 15:31:05 +08:00
fix: naive_merge splits oversized sections and counts overlap tokens correctly (#15802)
This commit is contained in:
@@ -1090,6 +1090,8 @@ def naive_merge(sections: str | list, chunk_token_num=128, delimiter="\n。;
|
||||
if cks:
|
||||
overlapped = RAGFlowPdfParser.remove_tag(cks[-1])
|
||||
t = overlapped[int(len(overlapped) * (100 - overlapped_percent) / 100.):] + t
|
||||
# Recount with the overlap prefix included, else chunks overshoot chunk_token_num.
|
||||
tnum = num_tokens_from_string(t)
|
||||
if t.find(pos) < 0:
|
||||
t += pos
|
||||
cks.append(t)
|
||||
@@ -1103,6 +1105,7 @@ def naive_merge(sections: str | list, chunk_token_num=128, delimiter="\n。;
|
||||
custom_delimiters = [m.group(1) for m in re.finditer(r"`([^`]+)`", delimiter)]
|
||||
has_custom = bool(custom_delimiters)
|
||||
if has_custom:
|
||||
# Custom delimiters ignore chunk_token_num: each segment is its own chunk.
|
||||
custom_pattern = "|".join(re.escape(t) for t in sorted(set(custom_delimiters), key=len, reverse=True))
|
||||
cks, tk_nums = [], []
|
||||
for sec, pos in sections:
|
||||
@@ -1120,9 +1123,18 @@ def naive_merge(sections: str | list, chunk_token_num=128, delimiter="\n。;
|
||||
tk_nums.append(num_tokens_from_string(text))
|
||||
return cks
|
||||
|
||||
# Split oversized sections at sentence delimiters; add_chunk re-merges to size.
|
||||
dels = get_delimiters(delimiter)
|
||||
for sec, pos in sections:
|
||||
add_chunk("\n" + sec, pos)
|
||||
if not dels or num_tokens_from_string(sec) < chunk_token_num:
|
||||
add_chunk("\n" + sec, pos)
|
||||
continue
|
||||
for sub_sec in re.split(r"(%s)" % dels, sec, flags=re.DOTALL):
|
||||
if not sub_sec or re.fullmatch(dels, sub_sec):
|
||||
continue
|
||||
add_chunk("\n" + sub_sec, pos)
|
||||
|
||||
logging.debug("naive_merge: %d sections -> %d chunks (delimiter=%r)", len(sections), len(cks), delimiter)
|
||||
return cks
|
||||
|
||||
|
||||
@@ -1146,6 +1158,8 @@ def naive_merge_with_images(texts, images, chunk_token_num=128, delimiter="\n。
|
||||
if cks:
|
||||
overlapped = RAGFlowPdfParser.remove_tag(cks[-1])
|
||||
t = overlapped[int(len(overlapped) * (100 - overlapped_percent) / 100.):] + t
|
||||
# Recount with the overlap prefix included, else chunks overshoot chunk_token_num.
|
||||
tnum = num_tokens_from_string(t)
|
||||
if t.find(pos) < 0:
|
||||
t += pos
|
||||
cks.append(t)
|
||||
@@ -1164,6 +1178,7 @@ def naive_merge_with_images(texts, images, chunk_token_num=128, delimiter="\n。
|
||||
custom_delimiters = [m.group(1) for m in re.finditer(r"`([^`]+)`", delimiter)]
|
||||
has_custom = bool(custom_delimiters)
|
||||
if has_custom:
|
||||
# Custom delimiters ignore chunk_token_num: each segment is its own chunk.
|
||||
custom_pattern = "|".join(re.escape(t) for t in sorted(set(custom_delimiters), key=len, reverse=True))
|
||||
cks, result_images, tk_nums = [], [], []
|
||||
for text, image in zip(texts, images):
|
||||
@@ -1186,15 +1201,26 @@ def naive_merge_with_images(texts, images, chunk_token_num=128, delimiter="\n。
|
||||
tk_nums.append(num_tokens_from_string(text_seg))
|
||||
return cks, result_images
|
||||
|
||||
# Split oversized sections at sentence delimiters; the section's image rides
|
||||
# along on every piece (concat_img dedupes when pieces re-merge into a chunk).
|
||||
dels = get_delimiters(delimiter)
|
||||
for text, image in zip(texts, images):
|
||||
# if text is tuple, unpack it
|
||||
if isinstance(text, tuple):
|
||||
text_str = text[0] if text[0] is not None else ""
|
||||
text_pos = text[1] if len(text) > 1 else ""
|
||||
add_chunk("\n" + text_str, image, text_pos)
|
||||
else:
|
||||
add_chunk("\n" + (text or ""), image)
|
||||
text_str = text or ""
|
||||
text_pos = ""
|
||||
if not dels or num_tokens_from_string(text_str) < chunk_token_num:
|
||||
add_chunk("\n" + text_str, image, text_pos)
|
||||
continue
|
||||
for sub_sec in re.split(r"(%s)" % dels, text_str, flags=re.DOTALL):
|
||||
if not sub_sec or re.fullmatch(dels, sub_sec):
|
||||
continue
|
||||
add_chunk("\n" + sub_sec, image, text_pos)
|
||||
|
||||
logging.debug("naive_merge_with_images: %d texts -> %d chunks (delimiter=%r)", len(texts), len(cks), delimiter)
|
||||
return cks, result_images
|
||||
|
||||
|
||||
@@ -1219,6 +1245,11 @@ def docx_question_level(p, bull=-1):
|
||||
def concat_img(img1, img2):
|
||||
from rag.utils.lazy_image import ensure_pil_image, LazyImage
|
||||
|
||||
# Same image must not stack with itself (the LazyImage branch would otherwise
|
||||
# concatenate its blob list); mirrors the PIL branch's same-reference guard.
|
||||
if img1 is img2:
|
||||
return img1
|
||||
|
||||
if (img1 is None or isinstance(img1, LazyImage)) and \
|
||||
(img2 is None or isinstance(img2, LazyImage)):
|
||||
if img1 and not img2:
|
||||
|
||||
229
test/unit_test/rag/test_naive_merge.py
Normal file
229
test/unit_test/rag/test_naive_merge.py
Normal file
@@ -0,0 +1,229 @@
|
||||
#
|
||||
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
"""Regression tests for ``naive_merge`` / ``naive_merge_with_images``.
|
||||
|
||||
Guards against the regression introduced by commit db0f6840d (#11434) where the
|
||||
default (non-custom-delimiter) path stopped splitting oversized sections at
|
||||
sentence boundaries, and the overlap prefix was not counted toward a chunk's
|
||||
token budget.
|
||||
"""
|
||||
|
||||
import re
|
||||
|
||||
import pytest
|
||||
|
||||
import rag.nlp as nlp
|
||||
from rag.nlp import naive_merge, naive_merge_with_images
|
||||
|
||||
DEFAULT_DELIMITER = "\n!?。;!?"
|
||||
|
||||
|
||||
@pytest.fixture(autouse=True)
|
||||
def word_count_tokens(monkeypatch):
|
||||
"""Count tokens as whitespace-delimited words (ignoring ``@@..`` position tags).
|
||||
|
||||
Deterministic and tokenizer-independent so chunk-size assertions are exact.
|
||||
"""
|
||||
|
||||
def fake_num_tokens(s):
|
||||
s = re.sub(r"@@[0-9]+\t[^\t\n]*", "", s or "")
|
||||
return len(s.split())
|
||||
|
||||
monkeypatch.setattr(nlp, "num_tokens_from_string", fake_num_tokens)
|
||||
return fake_num_tokens
|
||||
|
||||
|
||||
def _tok(s):
|
||||
return len(re.sub(r"@@[0-9]+\t[^\t\n]*", "", s or "").split())
|
||||
|
||||
|
||||
def _nonempty(chunks):
|
||||
return [c for c in chunks if c.strip()]
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------- #
|
||||
# naive_merge — text path
|
||||
# --------------------------------------------------------------------------- #
|
||||
|
||||
|
||||
@pytest.mark.p2
|
||||
def test_oversized_section_is_split_at_sentence_boundaries():
|
||||
# One section far larger than chunk_token_num, sentences separated by '\n'.
|
||||
sentence = " ".join(["word"] * 10) # 10 tokens
|
||||
section = "\n".join([sentence] * 20) # 200 tokens, single section
|
||||
assert _tok(section) == 200
|
||||
|
||||
chunks = _nonempty(naive_merge([section], chunk_token_num=50, delimiter=DEFAULT_DELIMITER))
|
||||
|
||||
# Pre-regression behaviour: the section is broken into several chunks
|
||||
# instead of a single oversized one.
|
||||
assert len(chunks) > 1
|
||||
# No chunk should greatly exceed the budget (allow one trailing sentence of slack).
|
||||
assert all(_tok(c) <= 50 + 10 for c in chunks)
|
||||
# Content is preserved.
|
||||
assert "".join(chunks).count("word") == 200
|
||||
|
||||
|
||||
@pytest.mark.p2
|
||||
def test_small_sections_are_merged_not_oversplit():
|
||||
sentences = ["alpha beta gamma delta" for _ in range(8)] # 4 tokens each
|
||||
chunks = _nonempty(naive_merge(sentences, chunk_token_num=50, delimiter=DEFAULT_DELIMITER))
|
||||
# All 32 tokens comfortably fit one chunk.
|
||||
assert len(chunks) == 1
|
||||
assert _tok(chunks[0]) == 32
|
||||
|
||||
|
||||
@pytest.mark.p2
|
||||
def test_default_delimiters_are_honored_without_backticks():
|
||||
# Sentences delimited by '?' and '!' (part of the default set) must split.
|
||||
section = ("q " * 10).strip() + "?" + ("r " * 10).strip() + "!" + ("s " * 10).strip()
|
||||
chunks = _nonempty(naive_merge([section], chunk_token_num=12, delimiter=DEFAULT_DELIMITER))
|
||||
assert len(chunks) >= 2
|
||||
|
||||
|
||||
@pytest.mark.p2
|
||||
def test_empty_delimiter_falls_back_to_token_size_merge():
|
||||
# token_chunker.py calls naive_merge with delimiter="" as a size-only fallback.
|
||||
sections = [f"sentence number {i} here" for i in range(30)] # 4 tokens each
|
||||
chunks = _nonempty(naive_merge(sections, chunk_token_num=20, delimiter=""))
|
||||
assert len(chunks) >= 1
|
||||
# Must not crash and must not explode into per-character chunks.
|
||||
assert len(chunks) < len(sections)
|
||||
|
||||
|
||||
@pytest.mark.p2
|
||||
def test_overlap_prefix_is_counted_in_token_budget():
|
||||
# With overlap, each chunk = overlap-prefix + new content. The fix recomputes
|
||||
# the chunk's token count after prepending the prefix, so chunks stay bounded.
|
||||
# Pre-fix, the prefix tokens were not counted, so the per-chunk budget check
|
||||
# fired late and chunks systematically overshot chunk_token_num.
|
||||
sentences = [" ".join(["w"] * 10) for _ in range(30)]
|
||||
chunks = _nonempty(
|
||||
naive_merge(sentences, chunk_token_num=50, delimiter=DEFAULT_DELIMITER, overlapped_percent=20)
|
||||
)
|
||||
assert len(chunks) > 1
|
||||
# Each 10-token sentence divides chunk_token_num evenly, so a correct
|
||||
# accounting yields chunks of exactly the budget. The buggy version
|
||||
# overshot (observed up to 63). A small tolerance guards tokenizer rounding.
|
||||
assert all(_tok(c) <= 50 + 2 for c in chunks)
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------- #
|
||||
# Custom-delimiter path (intended #11434 behaviour must be preserved)
|
||||
# --------------------------------------------------------------------------- #
|
||||
|
||||
|
||||
@pytest.mark.p2
|
||||
def test_custom_delimiter_ignores_chunk_size():
|
||||
text = "partA##partB##partC"
|
||||
# Backtick-wrapped custom delimiter -> every segment is its own chunk,
|
||||
# regardless of chunk_token_num.
|
||||
chunks = [c.strip() for c in naive_merge([text], chunk_token_num=1000, delimiter="\n。`##`")]
|
||||
assert chunks == ["partA", "partB", "partC"]
|
||||
|
||||
|
||||
@pytest.mark.p2
|
||||
def test_custom_delimiter_does_not_size_merge():
|
||||
parts = [f"seg{i}" for i in range(5)]
|
||||
text = "##".join(parts)
|
||||
chunks = [c.strip() for c in naive_merge([text], chunk_token_num=1000, delimiter="`##`")]
|
||||
assert chunks == parts
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------- #
|
||||
# naive_merge_with_images — image path
|
||||
# --------------------------------------------------------------------------- #
|
||||
|
||||
|
||||
@pytest.mark.p2
|
||||
def test_images_oversized_section_is_split():
|
||||
sentence = " ".join(["word"] * 10)
|
||||
section = "\n".join([sentence] * 20) # 200 tokens
|
||||
texts = [(section, "")]
|
||||
images = [None]
|
||||
|
||||
chunks, imgs = naive_merge_with_images(
|
||||
texts, images, chunk_token_num=50, delimiter=DEFAULT_DELIMITER
|
||||
)
|
||||
nonempty = _nonempty(chunks)
|
||||
assert len(nonempty) > 1
|
||||
# Returned lists stay aligned.
|
||||
assert len(chunks) == len(imgs)
|
||||
assert all(_tok(c) <= 50 + 10 for c in nonempty)
|
||||
|
||||
|
||||
@pytest.mark.p2
|
||||
def test_images_custom_delimiter_preserved():
|
||||
chunks, imgs = naive_merge_with_images(
|
||||
[("x##y##z", "")], [None], chunk_token_num=1000, delimiter="`##`"
|
||||
)
|
||||
assert [c.strip() for c in chunks] == ["x", "y", "z"]
|
||||
assert len(chunks) == len(imgs)
|
||||
|
||||
|
||||
@pytest.mark.p2
|
||||
def test_images_plain_string_input():
|
||||
# texts may be plain strings (not tuples).
|
||||
sentence = " ".join(["word"] * 10)
|
||||
section = "\n".join([sentence] * 20)
|
||||
chunks, imgs = naive_merge_with_images(
|
||||
[section], [None], chunk_token_num=50, delimiter=DEFAULT_DELIMITER
|
||||
)
|
||||
assert len(_nonempty(chunks)) > 1
|
||||
assert len(chunks) == len(imgs)
|
||||
|
||||
|
||||
@pytest.mark.p2
|
||||
def test_images_mismatched_lengths_returns_empty():
|
||||
assert naive_merge_with_images(["a"], [], chunk_token_num=50) == ([], [])
|
||||
|
||||
|
||||
@pytest.mark.p2
|
||||
def test_images_shared_lazyimage_not_stacked_across_split_sentences():
|
||||
# A single section carries one LazyImage. After splitting into sentences that
|
||||
# merge back into one chunk, the shared image must NOT be duplicated/stacked
|
||||
# (concat_img would otherwise concatenate the blob list with itself).
|
||||
from rag.utils.lazy_image import LazyImage
|
||||
|
||||
image = LazyImage([b"FAKEBLOB"])
|
||||
section = "\n".join([" ".join(["word"] * 10)] * 20)
|
||||
|
||||
_, imgs = naive_merge_with_images(
|
||||
[(section, "")], [image], chunk_token_num=50, delimiter=DEFAULT_DELIMITER
|
||||
)
|
||||
for im in imgs:
|
||||
if isinstance(im, LazyImage):
|
||||
assert len(im._blobs) == 1 # never grows beyond the single source blob
|
||||
|
||||
|
||||
@pytest.mark.p2
|
||||
def test_images_distinct_lazyimages_are_concatenated():
|
||||
# Two different sections (small enough to land in one chunk) with distinct
|
||||
# images must still be merged together.
|
||||
from rag.utils.lazy_image import LazyImage
|
||||
|
||||
a = LazyImage([b"BLOB_A"])
|
||||
b = LazyImage([b"BLOB_B"])
|
||||
texts = [("alpha beta gamma", ""), ("delta epsilon zeta", "")]
|
||||
_, imgs = naive_merge_with_images(
|
||||
texts, [a, b], chunk_token_num=100, delimiter=DEFAULT_DELIMITER
|
||||
)
|
||||
nonempty_imgs = [im for im in imgs if im is not None]
|
||||
assert len(nonempty_imgs) == 1
|
||||
merged = nonempty_imgs[0]
|
||||
assert isinstance(merged, LazyImage)
|
||||
assert merged._blobs == [b"BLOB_A", b"BLOB_B"]
|
||||
Reference in New Issue
Block a user