fix(html_parser): preserve original text when splitting oversized blocks (#16052)

### Bug

`RAGFlowHtmlParser.chunk_block()` splits an oversized block by slicing the **tokenized** string and storing the joined tokens:

```python
tks_str = rag_tokenizer.tokenize(block)
...
tokens = tks_str.split(" ")
while start < len(tokens):
    chunks.append(" ".join(tokens[start:start + chunk_token_num]))  # tokenized form, not source
```

On the default (Elasticsearch) backend `rag_tokenizer.tokenize` transforms text: it lowercases/stems Latin words and inserts spaces between CJK characters. So any text block longer than `chunk_token_num` is stored as garbled, lowercased, space-segmented text instead of the source content. The small-block branch correctly stores the original `block`, so only oversized blocks are corrupted. This affects HTML and EPUB ingestion (both go through `chunk_block`), degrading retrieved chunks and the answers generated from them.

### Real tokenizer behavior (infinity-sdk 0.7.0, ES backend)

```
tokenize("Hello World FOO Bar Baz Qux Jumps") -> "hello world foo bar baz qux jump"   # lowercased + stemmed
tokenize("你好世界这是一个测试")                -> "你好 世界 这是 一个 测试"              # spaces inserted
```

### Fix

Split the **original** text: break it into atoms (whitespace-delimited runs for space-separated scripts, per-character for spaceless scripts such as Chinese) and pack them into pieces of at most `chunk_token_num` tokens. This preserves the source characters and still splits scripts that have no whitespace — a plain whitespace split would leave CJK as one un-splittable chunk.

### Proof (real tokenizer, before/after)

Running the old vs new split against the real `infinity.rag_tokenizer`:

```
ENGLISH "Hello World FOO Bar Baz Qux Jumps Over Lazy Dogs" (chunk_token_num=4)
  OLD: ['hello world foo bar', 'baz qux jump over', 'lazi dog']        # lowercased + stemmed
  NEW: ['Hello World FOO Bar ', 'Baz Qux Jumps Over ', 'Lazy Dogs']    # preserved; each <= 4 tokens
  NEW preserves text exactly: True

CHINESE "你好世界这是一个测试用例需要被切分成多个块" (chunk_token_num=3)
  OLD: ['你好 世界 这是', '一个 测试用例 需要', ...]   # spurious spaces
  NEW: ['你好世', '界这是', '一个测', ...]             # preserved; each <= 3 tokens
  NEW preserves text exactly: True
```

### Tests

Added `test/unit_test/deepdoc/parser/test_html_parser.py` (English + Chinese oversized blocks, plus small-block merge). Before the fix the two oversized tests fail (English shows lowercasing, Chinese shows inserted spaces); after the fix all pass. `ruff check` is clean.
2026-06-29 15:31:05 +08:00 · 2026-06-25 04:43:35 -04:00
parent edfa9be67f
commit 091417980e
2 changed files with 216 additions and 11 deletions
--- a/test/unit_test/deepdoc/parser/test_html_parser.py
+++ b/test/unit_test/deepdoc/parser/test_html_parser.py
@@ -0,0 +1,142 @@
+#
+#  Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+
+"""Unit tests for HtmlParser.chunk_block.
+
+These cover the splitting of oversized text blocks, which must preserve the
+original source text (the tokenizer lowercases / stems / segments text, so the
+stored chunk must not be built from the tokenized form) and must split text in
+scripts that have no whitespace word boundaries (e.g. Chinese).
+"""
+
+import importlib.util
+import os
+import sys
+from unittest import mock
+
+# html_parser.py is loaded by file path below, so deepdoc/parser/__init__.py
+# (which pulls in heavy parsers) never runs and the real rag.nlp tokenizer is
+# never imported. The heavy optional modules are replaced by MagicMock stubs;
+# rag and rag.nlp are stubbed as well so that html_parser can be imported, and
+# its tokenizer is swapped for a deterministic fake once the module is loaded.
+_MOCK_MODULES = [
+    "xgboost",
+    "pdfplumber",
+    "huggingface_hub",
+    "PIL",
+    "PIL.Image",
+    "pypdf",
+    "sklearn",
+    "deepdoc.vision",
+    "infinity",
+    "infinity.rag_tokenizer",
+]
+for _m in _MOCK_MODULES:
+    # setdefault only fills gaps: a module that is already imported is left alone.
+    sys.modules.setdefault(_m, mock.MagicMock())
+
+# Same rule for the rag package and its nlp submodule.
+sys.modules.setdefault("rag", mock.MagicMock())
+sys.modules.setdefault("rag.nlp", mock.MagicMock())
+
+
+def _find_project_root(marker="pyproject.toml"):
+    """Return the nearest directory at or above this file's that holds ``marker``, else None."""
+    prev, cur = None, os.path.dirname(os.path.abspath(__file__))
+    while cur != prev and not os.path.exists(os.path.join(cur, marker)):
+        prev, cur = cur, os.path.dirname(cur)
+    # cur == prev only after dirname() stopped shrinking, i.e. the filesystem root was probed too.
+    return None if cur == prev else cur
+
+
+_PROJECT_ROOT = _find_project_root()
+
+# Execute html_parser.py straight from its file under its dotted module name,
+# without importing the deepdoc.parser package that normally contains it.
+_HTML_MODULE_NAME = "deepdoc.parser.html_parser"
+_HTML_MODULE_PATH = os.path.join(_PROJECT_ROOT, "deepdoc", "parser", "html_parser.py")
+_html_spec = importlib.util.spec_from_file_location(_HTML_MODULE_NAME, _HTML_MODULE_PATH)
+_html_mod = importlib.util.module_from_spec(_html_spec)
+sys.modules[_HTML_MODULE_NAME] = _html_mod  # registered before the module body executes
+_html_spec.loader.exec_module(_html_mod)
+RAGFlowHtmlParser = _html_mod.RAGFlowHtmlParser
+
+
+class _FakeTokenizer:
+    """Deterministic replacement for rag.nlp.rag_tokenizer used by these tests.
+
+    It reproduces the two effects of the real tokenizer (default Elasticsearch
+    backend) that the tests rely on: Latin tokens are altered (lowercased here)
+    and CJK ideographs, which carry no whitespace, become one token each. The
+    result has the same shape as the real tokenizer's output: a single string
+    of space-separated tokens.
+    """
+
+    @staticmethod
+    def tokenize(text):
+        """Return the tokens of ``text`` as one space-joined, lowercased string.
+
+        Characters in the range "一".."鿿" are padded with spaces so each one
+        becomes its own token; everything else is split on whitespace.
+        """
+        padded = "".join(" " + ch + " " if "一" <= ch <= "鿿" else ch for ch in text)
+        return " ".join(token.lower() for token in padded.split())
+
+
+# Rebind html_parser's module-level rag_tokenizer to the deterministic fake, however rag.nlp resolved.
+_html_mod.rag_tokenizer = _FakeTokenizer()
+
+
+def _token_count(text):  # delegates to the parser's own private token counter
+    return RAGFlowHtmlParser._token_count(text)
+
+
+def test_oversized_english_block_preserves_original_text():
+    """An over-budget Latin block is split without losing its original casing."""
+    budget = 3
+    source = "Hello World FOO Bar Baz Qux Lazy Dogs"  # eight whitespace-delimited words
+    pieces = RAGFlowHtmlParser.chunk_block([source], chunk_token_num=budget)
+
+    # The block must actually be split, and the pieces must tile the source exactly.
+    assert len(pieces) > 1
+    assert "".join(pieces) == source
+    # The fake tokenizer lowercases, so surviving capitals show that source text was stored.
+    assert "Hello" in pieces[0]
+    lettered = [piece for piece in pieces if any(ch.isalpha() for ch in piece)]
+    assert all(piece != piece.lower() for piece in lettered)
+    # Every piece stays within the token budget.
+    assert all(_token_count(piece) <= budget for piece in pieces)
+
+
+def test_oversized_chinese_block_is_split_and_preserved():
+    """A whitespace-free CJK block is still split, and gains no inserted spaces."""
+    budget = 3
+    source = "你好世界这是一个测试用例需要被切分"
+    pieces = RAGFlowHtmlParser.chunk_block([source], chunk_token_num=budget)
+
+    # A plain whitespace split would leave this block as one un-splittable piece.
+    assert len(pieces) > 1
+    assert "".join(pieces) == source
+    assert not any(" " in piece for piece in pieces)
+    assert all(_token_count(piece) <= budget for piece in pieces)
+
+
+def test_small_blocks_are_merged_unchanged():
+    """Blocks well under the budget keep their original text in the output."""
+    pieces = RAGFlowHtmlParser.chunk_block(["Alpha Beta", "Gamma"], chunk_token_num=512)
+    combined = "".join(pieces)
+    for expected in ("Alpha Beta", "Gamma"):
+        assert expected in combined