fix: naive_merge splits oversized sections and counts overlap tokens correctly (#15802)

This commit is contained in:
cleanjunc
2026-06-25 14:19:38 +03:00
committed by GitHub
parent 0af5d43e8d
commit e8bb534b90
2 changed files with 263 additions and 3 deletions

View File

@@ -1090,6 +1090,8 @@ def naive_merge(sections: str | list, chunk_token_num=128, delimiter="\n。
if cks:
overlapped = RAGFlowPdfParser.remove_tag(cks[-1])
t = overlapped[int(len(overlapped) * (100 - overlapped_percent) / 100.):] + t
# Recount with the overlap prefix included, else chunks overshoot chunk_token_num.
tnum = num_tokens_from_string(t)
if t.find(pos) < 0:
t += pos
cks.append(t)
@@ -1103,6 +1105,7 @@ def naive_merge(sections: str | list, chunk_token_num=128, delimiter="\n。
custom_delimiters = [m.group(1) for m in re.finditer(r"`([^`]+)`", delimiter)]
has_custom = bool(custom_delimiters)
if has_custom:
# Custom delimiters ignore chunk_token_num: each segment is its own chunk.
custom_pattern = "|".join(re.escape(t) for t in sorted(set(custom_delimiters), key=len, reverse=True))
cks, tk_nums = [], []
for sec, pos in sections:
@@ -1120,9 +1123,18 @@ def naive_merge(sections: str | list, chunk_token_num=128, delimiter="\n。
tk_nums.append(num_tokens_from_string(text))
return cks
# Split oversized sections at sentence delimiters; add_chunk re-merges to size.
dels = get_delimiters(delimiter)
for sec, pos in sections:
add_chunk("\n" + sec, pos)
if not dels or num_tokens_from_string(sec) < chunk_token_num:
add_chunk("\n" + sec, pos)
continue
for sub_sec in re.split(r"(%s)" % dels, sec, flags=re.DOTALL):
if not sub_sec or re.fullmatch(dels, sub_sec):
continue
add_chunk("\n" + sub_sec, pos)
logging.debug("naive_merge: %d sections -> %d chunks (delimiter=%r)", len(sections), len(cks), delimiter)
return cks
@@ -1146,6 +1158,8 @@ def naive_merge_with_images(texts, images, chunk_token_num=128, delimiter="\n。
if cks:
overlapped = RAGFlowPdfParser.remove_tag(cks[-1])
t = overlapped[int(len(overlapped) * (100 - overlapped_percent) / 100.):] + t
# Recount with the overlap prefix included, else chunks overshoot chunk_token_num.
tnum = num_tokens_from_string(t)
if t.find(pos) < 0:
t += pos
cks.append(t)
@@ -1164,6 +1178,7 @@ def naive_merge_with_images(texts, images, chunk_token_num=128, delimiter="\n。
custom_delimiters = [m.group(1) for m in re.finditer(r"`([^`]+)`", delimiter)]
has_custom = bool(custom_delimiters)
if has_custom:
# Custom delimiters ignore chunk_token_num: each segment is its own chunk.
custom_pattern = "|".join(re.escape(t) for t in sorted(set(custom_delimiters), key=len, reverse=True))
cks, result_images, tk_nums = [], [], []
for text, image in zip(texts, images):
@@ -1186,15 +1201,26 @@ def naive_merge_with_images(texts, images, chunk_token_num=128, delimiter="\n。
tk_nums.append(num_tokens_from_string(text_seg))
return cks, result_images
# Split oversized sections at sentence delimiters; the section's image rides
# along on every piece (concat_img dedupes when pieces re-merge into a chunk).
dels = get_delimiters(delimiter)
for text, image in zip(texts, images):
# if text is tuple, unpack it
if isinstance(text, tuple):
text_str = text[0] if text[0] is not None else ""
text_pos = text[1] if len(text) > 1 else ""
add_chunk("\n" + text_str, image, text_pos)
else:
add_chunk("\n" + (text or ""), image)
text_str = text or ""
text_pos = ""
if not dels or num_tokens_from_string(text_str) < chunk_token_num:
add_chunk("\n" + text_str, image, text_pos)
continue
for sub_sec in re.split(r"(%s)" % dels, text_str, flags=re.DOTALL):
if not sub_sec or re.fullmatch(dels, sub_sec):
continue
add_chunk("\n" + sub_sec, image, text_pos)
logging.debug("naive_merge_with_images: %d texts -> %d chunks (delimiter=%r)", len(texts), len(cks), delimiter)
return cks, result_images
@@ -1219,6 +1245,11 @@ def docx_question_level(p, bull=-1):
def concat_img(img1, img2):
from rag.utils.lazy_image import ensure_pil_image, LazyImage
# Same image must not stack with itself (the LazyImage branch would otherwise
# concatenate its blob list); mirrors the PIL branch's same-reference guard.
if img1 is img2:
return img1
if (img1 is None or isinstance(img1, LazyImage)) and \
(img2 is None or isinstance(img2, LazyImage)):
if img1 and not img2:

View File

@@ -0,0 +1,229 @@
#
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
"""Regression tests for ``naive_merge`` / ``naive_merge_with_images``.
Guards against the regression introduced by commit db0f6840d (#11434) where the
default (non-custom-delimiter) path stopped splitting oversized sections at
sentence boundaries, and the overlap prefix was not counted toward a chunk's
token budget.
"""
import re
import pytest
import rag.nlp as nlp
from rag.nlp import naive_merge, naive_merge_with_images
# Sentence delimiters handed to the merge functions by most tests below. No
# entry is wrapped in backticks, so the custom-delimiter path is not triggered.
DEFAULT_DELIMITER = "\n!?。;!?"
@pytest.fixture(autouse=True)
def word_count_tokens(monkeypatch):
    """Swap the tokenizer for a whitespace word counter in every test.

    Position tags of the form ``@@<digits><TAB>...`` are removed before
    counting, which keeps chunk-size assertions exact and independent of any
    real tokenizer. The replacement counter is returned to the test.
    """

    def fake_num_tokens(s):
        text = s if s else ""
        untagged = re.sub(r"@@[0-9]+\t[^\t\n]*", "", text)
        words = untagged.split()
        return len(words)

    # Patch the attribute on the rag.nlp module object imported as ``nlp``.
    monkeypatch.setattr(nlp, "num_tokens_from_string", fake_num_tokens)
    return fake_num_tokens
def _tok(s):
return len(re.sub(r"@@[0-9]+\t[^\t\n]*", "", s or "").split())
def _nonempty(chunks):
return [c for c in chunks if c.strip()]
# --------------------------------------------------------------------------- #
# naive_merge — text path
# --------------------------------------------------------------------------- #
@pytest.mark.p2
def test_oversized_section_is_split_at_sentence_boundaries():
    """A single 200-token section must be broken up when the budget is 50."""
    budget = 50
    words_per_sentence = 10
    # Twenty newline-separated sentences of ten words each, passed as ONE section.
    one_sentence = " ".join("word" for _ in range(words_per_sentence))
    section = "\n".join(one_sentence for _ in range(20))
    assert _tok(section) == 200

    merged = naive_merge([section], chunk_token_num=budget, delimiter=DEFAULT_DELIMITER)
    chunks = _nonempty(merged)

    # The section is spread over several chunks rather than kept whole.
    assert len(chunks) > 1
    # One trailing sentence of slack above the budget is tolerated.
    for chunk in chunks:
        assert _tok(chunk) <= budget + words_per_sentence
    # Splitting must not drop any word.
    assert "".join(chunks).count("word") == 200
@pytest.mark.p2
def test_small_sections_are_merged_not_oversplit():
    """Eight 4-token sections (32 tokens) fit a 50-token budget in one chunk."""
    sentences = ["alpha beta gamma delta"] * 8
    merged = naive_merge(sentences, chunk_token_num=50, delimiter=DEFAULT_DELIMITER)
    chunks = _nonempty(merged)

    assert len(chunks) == 1
    (only_chunk,) = chunks
    assert _tok(only_chunk) == 32
@pytest.mark.p2
def test_default_delimiters_are_honored_without_backticks():
    """'?' and '!' from the delimiter set must act as split points."""
    q_run = " ".join(["q"] * 10)
    r_run = " ".join(["r"] * 10)
    s_run = " ".join(["s"] * 10)
    # Three 10-word runs joined only by '?' and '!', i.e. no newline between them.
    section = q_run + "?" + r_run + "!" + s_run

    merged = naive_merge([section], chunk_token_num=12, delimiter=DEFAULT_DELIMITER)
    assert len(_nonempty(merged)) >= 2
@pytest.mark.p2
def test_empty_delimiter_falls_back_to_token_size_merge():
    """An empty delimiter must still merge sections by token size without crashing."""
    # Thirty sections of four tokens each.
    sections = [f"sentence number {i} here" for i in range(30)]
    merged = naive_merge(sections, chunk_token_num=20, delimiter="")
    chunk_count = len(_nonempty(merged))

    assert chunk_count >= 1
    # Merging happened: there are fewer chunks than input sections.
    assert chunk_count < len(sections)
@pytest.mark.p2
def test_overlap_prefix_is_counted_in_token_budget():
    """With overlap enabled, the prepended prefix must count toward the budget.

    Each new chunk is the overlap prefix plus fresh content. If the prefix
    tokens are left out of the running count, the size check fires late and
    chunks overshoot ``chunk_token_num``.
    """
    budget = 50
    ten_words = " ".join("w" for _ in range(10))
    sentences = [ten_words] * 30

    merged = naive_merge(
        sentences,
        chunk_token_num=budget,
        delimiter=DEFAULT_DELIMITER,
        overlapped_percent=20,
    )
    chunks = _nonempty(merged)

    assert len(chunks) > 1
    # Ten-token sentences divide the budget evenly; allow a tolerance of two
    # tokens for rounding of the overlap slice.
    for chunk in chunks:
        assert _tok(chunk) <= budget + 2
# --------------------------------------------------------------------------- #
# Custom-delimiter path (intended #11434 behaviour must be preserved)
# --------------------------------------------------------------------------- #
@pytest.mark.p2
def test_custom_delimiter_ignores_chunk_size():
    """A backtick-wrapped delimiter makes every segment its own chunk."""
    text = "partA##partB##partC"
    # The budget is far larger than the text, yet no segments may be merged.
    merged = naive_merge([text], chunk_token_num=1000, delimiter="\n。`##`")
    stripped = []
    for chunk in merged:
        stripped.append(chunk.strip())
    assert stripped == ["partA", "partB", "partC"]
@pytest.mark.p2
def test_custom_delimiter_does_not_size_merge():
    """Segments split on a custom delimiter are not re-merged by size."""
    parts = [f"seg{i}" for i in range(5)]
    joined = "##".join(parts)
    merged = naive_merge([joined], chunk_token_num=1000, delimiter="`##`")
    stripped = list(map(str.strip, merged))
    assert stripped == parts
# --------------------------------------------------------------------------- #
# naive_merge_with_images — image path
# --------------------------------------------------------------------------- #
@pytest.mark.p2
def test_images_oversized_section_is_split():
    """The image-aware merge must also split a 200-token section at budget 50."""
    budget = 50
    one_sentence = " ".join("word" for _ in range(10))
    section = "\n".join(one_sentence for _ in range(20))

    chunks, imgs = naive_merge_with_images(
        [(section, "")],
        [None],
        chunk_token_num=budget,
        delimiter=DEFAULT_DELIMITER,
    )
    with_text = _nonempty(chunks)

    assert len(with_text) > 1
    # Chunk list and image list must stay index-aligned.
    assert len(chunks) == len(imgs)
    for chunk in with_text:
        assert _tok(chunk) <= budget + 10
@pytest.mark.p2
def test_images_custom_delimiter_preserved():
    """Custom-delimiter splitting works in the image-aware merge too."""
    texts = [("x##y##z", "")]
    images = [None]
    chunks, imgs = naive_merge_with_images(
        texts, images, chunk_token_num=1000, delimiter="`##`"
    )
    stripped = list(map(str.strip, chunks))
    assert stripped == ["x", "y", "z"]
    assert len(chunks) == len(imgs)
@pytest.mark.p2
def test_images_plain_string_input():
    """Texts given as bare strings (not tuples) are split like tuple input."""
    one_sentence = " ".join("word" for _ in range(10))
    section = "\n".join(one_sentence for _ in range(20))

    chunks, imgs = naive_merge_with_images(
        [section], [None], chunk_token_num=50, delimiter=DEFAULT_DELIMITER
    )
    with_text = _nonempty(chunks)

    assert len(with_text) > 1
    assert len(chunks) == len(imgs)
@pytest.mark.p2
def test_images_mismatched_lengths_returns_empty():
    """One text with zero images yields an empty result pair."""
    result = naive_merge_with_images(["a"], [], chunk_token_num=50)
    assert result == ([], [])
@pytest.mark.p2
def test_images_shared_lazyimage_not_stacked_across_split_sentences():
    """A section's single LazyImage must not be stacked with itself.

    The section is split into sentences that all carry the same image object.
    When those pieces are merged back into chunks, combining the image with
    itself must not grow its blob list.
    """
    from rag.utils.lazy_image import LazyImage

    image = LazyImage([b"FAKEBLOB"])
    section = "\n".join([" ".join(["word"] * 10)] * 20)
    _, imgs = naive_merge_with_images(
        [(section, "")], [image], chunk_token_num=50, delimiter=DEFAULT_DELIMITER
    )

    lazy_imgs = [im for im in imgs if isinstance(im, LazyImage)]
    # Guard against a vacuous pass: without this, the per-image check below
    # would succeed even if no LazyImage were returned at all.
    assert lazy_imgs
    for im in lazy_imgs:
        # Never grows beyond the single source blob.
        assert len(im._blobs) == 1
@pytest.mark.p2
def test_images_distinct_lazyimages_are_concatenated():
    """Two small sections with different images merge into one combined image."""
    from rag.utils.lazy_image import LazyImage

    first = LazyImage([b"BLOB_A"])
    second = LazyImage([b"BLOB_B"])
    # Both sections are small enough to land in the same 100-token chunk.
    texts = [("alpha beta gamma", ""), ("delta epsilon zeta", "")]

    _, imgs = naive_merge_with_images(
        texts,
        [first, second],
        chunk_token_num=100,
        delimiter=DEFAULT_DELIMITER,
    )

    present = []
    for im in imgs:
        if im is not None:
            present.append(im)
    assert len(present) == 1

    merged = present[0]
    assert isinstance(merged, LazyImage)
    # Blobs appear in section order.
    assert merged._blobs == [b"BLOB_A", b"BLOB_B"]