fix: naive_merge splits oversized sections and counts overlap tokens correctly (#15802)

2026-06-29 15:31:05 +08:00 · 2026-06-25 14:19:38 +03:00
parent 0af5d43e8d
commit e8bb534b90
2 changed files with 263 additions and 3 deletions
--- a/rag/nlp/__init__.py
+++ b/rag/nlp/__init__.py
@@ -1090,6 +1090,8 @@ def naive_merge(sections: str | list, chunk_token_num=128, delimiter="\n。；
            if cks:
                overlapped = RAGFlowPdfParser.remove_tag(cks[-1])
                t = overlapped[int(len(overlapped) * (100 - overlapped_percent) / 100.):] + t
+                # Recount with the overlap prefix included, else chunks overshoot chunk_token_num.
+                tnum = num_tokens_from_string(t)
            if t.find(pos) < 0:
                t += pos
            cks.append(t)
@@ -1103,6 +1105,7 @@ def naive_merge(sections: str | list, chunk_token_num=128, delimiter="\n。；
    custom_delimiters = [m.group(1) for m in re.finditer(r"`([^`]+)`", delimiter)]
    has_custom = bool(custom_delimiters)
    if has_custom:
+        # Custom delimiters ignore chunk_token_num: each segment is its own chunk.
        custom_pattern = "|".join(re.escape(t) for t in sorted(set(custom_delimiters), key=len, reverse=True))
        cks, tk_nums = [], []
        for sec, pos in sections:
@@ -1120,9 +1123,18 @@ def naive_merge(sections: str | list, chunk_token_num=128, delimiter="\n。；
                tk_nums.append(num_tokens_from_string(text))
        return cks

+    # Split oversized sections at sentence delimiters; add_chunk re-merges to size.
+    dels = get_delimiters(delimiter)
    for sec, pos in sections:
-        add_chunk("\n" + sec, pos)
+        if not dels or num_tokens_from_string(sec) < chunk_token_num:
+            add_chunk("\n" + sec, pos)
+            continue
+        for sub_sec in re.split(r"(%s)" % dels, sec, flags=re.DOTALL):
+            if not sub_sec or re.fullmatch(dels, sub_sec):
+                continue
+            add_chunk("\n" + sub_sec, pos)

+    logging.debug("naive_merge: %d sections -> %d chunks (delimiter=%r)", len(sections), len(cks), delimiter)
    return cks


@@ -1146,6 +1158,8 @@ def naive_merge_with_images(texts, images, chunk_token_num=128, delimiter="\n。
            if cks:
                overlapped = RAGFlowPdfParser.remove_tag(cks[-1])
                t = overlapped[int(len(overlapped) * (100 - overlapped_percent) / 100.):] + t
+                # Recount with the overlap prefix included, else chunks overshoot chunk_token_num.
+                tnum = num_tokens_from_string(t)
            if t.find(pos) < 0:
                t += pos
            cks.append(t)
@@ -1164,6 +1178,7 @@ def naive_merge_with_images(texts, images, chunk_token_num=128, delimiter="\n。
    custom_delimiters = [m.group(1) for m in re.finditer(r"`([^`]+)`", delimiter)]
    has_custom = bool(custom_delimiters)
    if has_custom:
+        # Custom delimiters ignore chunk_token_num: each segment is its own chunk.
        custom_pattern = "|".join(re.escape(t) for t in sorted(set(custom_delimiters), key=len, reverse=True))
        cks, result_images, tk_nums = [], [], []
        for text, image in zip(texts, images):
@@ -1186,15 +1201,26 @@ def naive_merge_with_images(texts, images, chunk_token_num=128, delimiter="\n。
                tk_nums.append(num_tokens_from_string(text_seg))
        return cks, result_images

+    # Split oversized sections at sentence delimiters; the section's image rides
+    # along on every piece (concat_img dedupes when pieces re-merge into a chunk).
+    dels = get_delimiters(delimiter)
    for text, image in zip(texts, images):
        # if text is tuple, unpack it
        if isinstance(text, tuple):
            text_str = text[0] if text[0] is not None else ""
            text_pos = text[1] if len(text) > 1 else ""
-            add_chunk("\n" + text_str, image, text_pos)
        else:
-            add_chunk("\n" + (text or ""), image)
+            text_str = text or ""
+            text_pos = ""
+        if not dels or num_tokens_from_string(text_str) < chunk_token_num:
+            add_chunk("\n" + text_str, image, text_pos)
+            continue
+        for sub_sec in re.split(r"(%s)" % dels, text_str, flags=re.DOTALL):
+            if not sub_sec or re.fullmatch(dels, sub_sec):
+                continue
+            add_chunk("\n" + sub_sec, image, text_pos)

+    logging.debug("naive_merge_with_images: %d texts -> %d chunks (delimiter=%r)", len(texts), len(cks), delimiter)
    return cks, result_images


@@ -1219,6 +1245,11 @@ def docx_question_level(p, bull=-1):
 def concat_img(img1, img2):
    from rag.utils.lazy_image import ensure_pil_image, LazyImage

+    # Same image must not stack with itself (the LazyImage branch would otherwise
+    # concatenate its blob list); mirrors the PIL branch's same-reference guard.
+    if img1 is img2:
+        return img1
+
    if (img1 is None or isinstance(img1, LazyImage)) and \
       (img2 is None or isinstance(img2, LazyImage)):
        if img1 and not img2:
--- a/test/unit_test/rag/test_naive_merge.py
+++ b/test/unit_test/rag/test_naive_merge.py
@@ -0,0 +1,229 @@
+#
+#  Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+
+"""Regression tests for ``naive_merge`` / ``naive_merge_with_images``.
+
+Guards against the regression introduced by commit db0f6840d (#11434) where the
+default (non-custom-delimiter) path stopped splitting oversized sections at
+sentence boundaries, and the overlap prefix was not counted toward a chunk's
+token budget.
+"""
+
+import re
+
+import pytest
+
+import rag.nlp as nlp
+from rag.nlp import naive_merge, naive_merge_with_images
+
+DEFAULT_DELIMITER = "\n!?。；！？"
+
+
+@pytest.fixture(autouse=True)
+def word_count_tokens(monkeypatch):
+    """Count tokens as whitespace-delimited words (ignoring ``@@..`` position tags).
+
+    Deterministic and tokenizer-independent so chunk-size assertions are exact.
+    """
+
+    def fake_num_tokens(s):
+        s = re.sub(r"@@[0-9]+\t[^\t\n]*", "", s or "")
+        return len(s.split())
+
+    monkeypatch.setattr(nlp, "num_tokens_from_string", fake_num_tokens)
+    return fake_num_tokens
+
+
+def _tok(s):
+    return len(re.sub(r"@@[0-9]+\t[^\t\n]*", "", s or "").split())
+
+
+def _nonempty(chunks):
+    return [c for c in chunks if c.strip()]
+
+
+# --------------------------------------------------------------------------- #
+# naive_merge — text path
+# --------------------------------------------------------------------------- #
+
+
+@pytest.mark.p2
+def test_oversized_section_is_split_at_sentence_boundaries():
+    # One section far larger than chunk_token_num, sentences separated by '\n'.
+    sentence = " ".join(["word"] * 10)  # 10 tokens
+    section = "\n".join([sentence] * 20)  # 200 tokens, single section
+    assert _tok(section) == 200
+
+    chunks = _nonempty(naive_merge([section], chunk_token_num=50, delimiter=DEFAULT_DELIMITER))
+
+    # Pre-regression behaviour: the section is broken into several chunks
+    # instead of a single oversized one.
+    assert len(chunks) > 1
+    # No chunk should greatly exceed the budget (allow one trailing sentence of slack).
+    assert all(_tok(c) <= 50 + 10 for c in chunks)
+    # Content is preserved.
+    assert "".join(chunks).count("word") == 200
+
+
+@pytest.mark.p2
+def test_small_sections_are_merged_not_oversplit():
+    sentences = ["alpha beta gamma delta" for _ in range(8)]  # 4 tokens each
+    chunks = _nonempty(naive_merge(sentences, chunk_token_num=50, delimiter=DEFAULT_DELIMITER))
+    # All 32 tokens comfortably fit one chunk.
+    assert len(chunks) == 1
+    assert _tok(chunks[0]) == 32
+
+
+@pytest.mark.p2
+def test_default_delimiters_are_honored_without_backticks():
+    # Sentences delimited by '?' and '!' (part of the default set) must split.
+    section = ("q " * 10).strip() + "?" + ("r " * 10).strip() + "!" + ("s " * 10).strip()
+    chunks = _nonempty(naive_merge([section], chunk_token_num=12, delimiter=DEFAULT_DELIMITER))
+    assert len(chunks) >= 2
+
+
+@pytest.mark.p2
+def test_empty_delimiter_falls_back_to_token_size_merge():
+    # token_chunker.py calls naive_merge with delimiter="" as a size-only fallback.
+    sections = [f"sentence number {i} here" for i in range(30)]  # 4 tokens each
+    chunks = _nonempty(naive_merge(sections, chunk_token_num=20, delimiter=""))
+    assert len(chunks) >= 1
+    # Must not crash and must not explode into per-character chunks.
+    assert len(chunks) < len(sections)
+
+
+@pytest.mark.p2
+def test_overlap_prefix_is_counted_in_token_budget():
+    # With overlap, each chunk = overlap-prefix + new content. The fix recomputes
+    # the chunk's token count after prepending the prefix, so chunks stay bounded.
+    # Pre-fix, the prefix tokens were not counted, so the per-chunk budget check
+    # fired late and chunks systematically overshot chunk_token_num.
+    sentences = [" ".join(["w"] * 10) for _ in range(30)]
+    chunks = _nonempty(
+        naive_merge(sentences, chunk_token_num=50, delimiter=DEFAULT_DELIMITER, overlapped_percent=20)
+    )
+    assert len(chunks) > 1
+    # Each 10-token sentence divides chunk_token_num evenly, so a correct
+    # accounting yields chunks of exactly the budget. The buggy version
+    # overshot (observed up to 63). The small slack covers the character-based overlap cut.
+    assert all(_tok(c) <= 50 + 2 for c in chunks)
+
+
+# --------------------------------------------------------------------------- #
+# Custom-delimiter path (intended #11434 behaviour must be preserved)
+# --------------------------------------------------------------------------- #
+
+
+@pytest.mark.p2
+def test_custom_delimiter_ignores_chunk_size():
+    text = "partA##partB##partC"
+    # Backtick-wrapped custom delimiter -> every segment is its own chunk,
+    # regardless of chunk_token_num.
+    chunks = [c.strip() for c in naive_merge([text], chunk_token_num=1000, delimiter="\n。`##`")]
+    assert chunks == ["partA", "partB", "partC"]
+
+
+@pytest.mark.p2
+def test_custom_delimiter_does_not_size_merge():
+    parts = [f"seg{i}" for i in range(5)]
+    text = "##".join(parts)
+    chunks = [c.strip() for c in naive_merge([text], chunk_token_num=1000, delimiter="`##`")]
+    assert chunks == parts
+
+
+# --------------------------------------------------------------------------- #
+# naive_merge_with_images — image path
+# --------------------------------------------------------------------------- #
+
+
+@pytest.mark.p2
+def test_images_oversized_section_is_split():
+    sentence = " ".join(["word"] * 10)
+    section = "\n".join([sentence] * 20)  # 200 tokens
+    texts = [(section, "")]
+    images = [None]
+
+    chunks, imgs = naive_merge_with_images(
+        texts, images, chunk_token_num=50, delimiter=DEFAULT_DELIMITER
+    )
+    nonempty = _nonempty(chunks)
+    assert len(nonempty) > 1
+    # Returned lists stay aligned.
+    assert len(chunks) == len(imgs)
+    assert all(_tok(c) <= 50 + 10 for c in nonempty)
+
+
+@pytest.mark.p2
+def test_images_custom_delimiter_preserved():
+    chunks, imgs = naive_merge_with_images(
+        [("x##y##z", "")], [None], chunk_token_num=1000, delimiter="`##`"
+    )
+    assert [c.strip() for c in chunks] == ["x", "y", "z"]
+    assert len(chunks) == len(imgs)
+
+
+@pytest.mark.p2
+def test_images_plain_string_input():
+    # texts may be plain strings (not tuples).
+    sentence = " ".join(["word"] * 10)
+    section = "\n".join([sentence] * 20)
+    chunks, imgs = naive_merge_with_images(
+        [section], [None], chunk_token_num=50, delimiter=DEFAULT_DELIMITER
+    )
+    assert len(_nonempty(chunks)) > 1
+    assert len(chunks) == len(imgs)
+
+
+@pytest.mark.p2
+def test_images_mismatched_lengths_returns_empty():
+    assert naive_merge_with_images(["a"], [], chunk_token_num=50) == ([], [])
+
+
+@pytest.mark.p2
+def test_images_shared_lazyimage_not_stacked_across_split_sentences():
+    # A single section carries one LazyImage. After splitting into sentences that
+    # merge back into one chunk, the shared image must NOT be duplicated/stacked
+    # (concat_img would otherwise concatenate the blob list with itself).
+    from rag.utils.lazy_image import LazyImage
+
+    image = LazyImage([b"FAKEBLOB"])
+    section = "\n".join([" ".join(["word"] * 10)] * 20)
+
+    _, imgs = naive_merge_with_images(
+        [(section, "")], [image], chunk_token_num=50, delimiter=DEFAULT_DELIMITER
+    )
+    for im in imgs:
+        if isinstance(im, LazyImage):
+            assert len(im._blobs) == 1  # never grows beyond the single source blob
+
+
+@pytest.mark.p2
+def test_images_distinct_lazyimages_are_concatenated():
+    # Two different sections (small enough to land in one chunk) with distinct
+    # images must still be merged together.
+    from rag.utils.lazy_image import LazyImage
+
+    a = LazyImage([b"BLOB_A"])
+    b = LazyImage([b"BLOB_B"])
+    texts = [("alpha beta gamma", ""), ("delta epsilon zeta", "")]
+    _, imgs = naive_merge_with_images(
+        texts, [a, b], chunk_token_num=100, delimiter=DEFAULT_DELIMITER
+    )
+    nonempty_imgs = [im for im in imgs if im is not None]
+    assert len(nonempty_imgs) == 1
+    merged = nonempty_imgs[0]
+    assert isinstance(merged, LazyImage)
+    assert merged._blobs == [b"BLOB_A", b"BLOB_B"]