mirror of
https://github.com/infiniflow/ragflow.git
synced 2026-06-29 15:31:05 +08:00
fix(html_parser): preserve original text when splitting oversized blocks (#16052)
### Bug
`RAGFlowHtmlParser.chunk_block()` splits an oversized block by slicing
the **tokenized** string and storing the joined tokens:
```python
tks_str = rag_tokenizer.tokenize(block)
...
tokens = tks_str.split(" ")
while start < len(tokens):
chunks.append(" ".join(tokens[start:start + chunk_token_num])) # tokenized form, not source
```
On the default (Elasticsearch) backend `rag_tokenizer.tokenize`
transforms text: it lowercases/stems Latin words and inserts spaces
between CJK characters. So any text block longer than `chunk_token_num`
is stored as garbled, lowercased, space-segmented text instead of the
source content. The small-block branch correctly stores the original
`block`, so only oversized blocks are corrupted. Affects HTML and EPUB
ingestion (both go through `chunk_block`), degrading retrieved chunks
and the answers generated from them.
### Real tokenizer behavior (infinity-sdk 0.7.0, ES backend)
```
tokenize("Hello World FOO Bar Baz Qux Jumps") -> "hello world foo bar baz qux jump" # lowercased + stemmed
tokenize("你好世界这是一个测试") -> "你好世界 这 是 一个 测试" # spaces inserted
```
### Fix
Split the **original** text: break it into atoms (whitespace-delimited
runs for space-separated scripts, per-character for spaceless scripts
such as Chinese) and pack them into pieces of at most `chunk_token_num`
tokens. This preserves the source characters and still splits scripts
that have no whitespace — a plain whitespace split would leave CJK as
one un-splittable chunk.
### Proof (real tokenizer, before/after)
Running the old vs new split against the real `infinity.rag_tokenizer`:
```
ENGLISH "Hello World FOO Bar Baz Qux Lazy Dogs" (chunk_token_num=4)
OLD: ['hello world foo bar', 'baz qux jump over', 'lazi dog'] # lowercased + stemmed
NEW: ['Hello World FOO Bar ', 'Baz Qux Jumps Over ', 'Lazy Dogs'] # preserved; each <= 4 tokens
NEW preserves text exactly: True
CHINESE "你好世界这是一个测试用例需要被切分成多个块" (chunk_token_num=3)
OLD: ['你好世界 这 是', '一个 测试用例 需要', ...] # spurious spaces
NEW: ['你好世', '界这是', '一个测', ...] # preserved; each <= 3 tokens
NEW preserves text exactly: True
```
### Tests
Added `test/unit_test/deepdoc/parser/test_html_parser.py` (English +
Chinese oversized blocks, plus small-block merge). Before the fix the
two oversized tests fail (English shows lowercasing, Chinese shows
inserted spaces); after the fix all pass. `ruff check` clean.
This commit is contained in:
142
test/unit_test/deepdoc/parser/test_html_parser.py
Normal file
142
test/unit_test/deepdoc/parser/test_html_parser.py
Normal file
@@ -0,0 +1,142 @@
|
||||
#
|
||||
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
"""Unit tests for HtmlParser.chunk_block.
|
||||
|
||||
These cover the splitting of oversized text blocks, which must preserve the
|
||||
original source text (the tokenizer lowercases / stems / segments text, so the
|
||||
stored chunk must not be built from the tokenized form) and must split text in
|
||||
scripts that have no whitespace word boundaries (e.g. Chinese).
|
||||
"""
|
||||
|
||||
import importlib.util
|
||||
import os
|
||||
import sys
|
||||
from unittest import mock
|
||||
|
||||
# Load html_parser by file path so we don't trigger deepdoc/parser/__init__.py
|
||||
# (which pulls in heavy parsers) or the real rag.nlp tokenizer. The heavy
|
||||
# optional modules are stubbed; rag.nlp is stubbed so the module imports, and
|
||||
# the tokenizer is replaced after load with a deterministic fake below.
|
||||
_MOCK_MODULES = [
|
||||
"xgboost",
|
||||
"pdfplumber",
|
||||
"huggingface_hub",
|
||||
"PIL",
|
||||
"PIL.Image",
|
||||
"pypdf",
|
||||
"sklearn",
|
||||
"deepdoc.vision",
|
||||
"infinity",
|
||||
"infinity.rag_tokenizer",
|
||||
]
|
||||
for _m in _MOCK_MODULES:
|
||||
if _m not in sys.modules:
|
||||
sys.modules[_m] = mock.MagicMock()
|
||||
|
||||
if "rag" not in sys.modules:
|
||||
sys.modules["rag"] = mock.MagicMock()
|
||||
if "rag.nlp" not in sys.modules:
|
||||
sys.modules["rag.nlp"] = mock.MagicMock()
|
||||
|
||||
|
||||
def _find_project_root(marker="pyproject.toml"):
|
||||
d = os.path.dirname(os.path.abspath(__file__))
|
||||
while d != os.path.dirname(d):
|
||||
if os.path.exists(os.path.join(d, marker)):
|
||||
return d
|
||||
d = os.path.dirname(d)
|
||||
return None
|
||||
|
||||
|
||||
_PROJECT_ROOT = _find_project_root()
|
||||
|
||||
_html_spec = importlib.util.spec_from_file_location(
|
||||
"deepdoc.parser.html_parser",
|
||||
os.path.join(_PROJECT_ROOT, "deepdoc", "parser", "html_parser.py"),
|
||||
)
|
||||
_html_mod = importlib.util.module_from_spec(_html_spec)
|
||||
sys.modules["deepdoc.parser.html_parser"] = _html_mod
|
||||
_html_spec.loader.exec_module(_html_mod)
|
||||
|
||||
RAGFlowHtmlParser = _html_mod.RAGFlowHtmlParser
|
||||
|
||||
|
||||
class _FakeTokenizer:
|
||||
"""Deterministic stand-in for rag.nlp.rag_tokenizer.
|
||||
|
||||
Mirrors the two behaviours the real tokenizer applies on the default
|
||||
(Elasticsearch) backend and that this test depends on: it transforms the
|
||||
text (lowercases Latin tokens) and segments spaceless scripts (CJK) into
|
||||
per-character, space-separated tokens. tokenize() returns the same
|
||||
space-joined string shape the real tokenizer returns.
|
||||
"""
|
||||
|
||||
@staticmethod
|
||||
def tokenize(text):
|
||||
spaced = []
|
||||
for ch in text:
|
||||
if "一" <= ch <= "鿿":
|
||||
spaced.append(" " + ch + " ")
|
||||
else:
|
||||
spaced.append(ch)
|
||||
return " ".join(t.lower() for t in "".join(spaced).split())
|
||||
|
||||
|
||||
# Bind the deterministic tokenizer regardless of how rag.nlp resolved.
|
||||
_html_mod.rag_tokenizer = _FakeTokenizer()
|
||||
|
||||
|
||||
def _token_count(text):
|
||||
return RAGFlowHtmlParser._token_count(text)
|
||||
|
||||
|
||||
def test_oversized_english_block_preserves_original_text():
|
||||
# 8 latin tokens, budget 3 -> must be split into multiple chunks that keep
|
||||
# the original casing (the tokenizer lowercases, so a tokenized-form chunk
|
||||
# would be "hello world ...").
|
||||
block = "Hello World FOO Bar Baz Qux Lazy Dogs"
|
||||
chunks = RAGFlowHtmlParser.chunk_block([block], chunk_token_num=3)
|
||||
|
||||
assert len(chunks) > 1
|
||||
# Original text is preserved exactly (atoms partition the source).
|
||||
assert "".join(chunks) == block
|
||||
# Case is not mangled.
|
||||
assert "Hello" in chunks[0]
|
||||
assert all(c.lower() != c for c in chunks if any(ch.isalpha() for ch in c))
|
||||
# No chunk exceeds the token budget.
|
||||
assert all(_token_count(c) <= 3 for c in chunks)
|
||||
|
||||
|
||||
def test_oversized_chinese_block_is_split_and_preserved():
|
||||
# Chinese has no whitespace; a naive whitespace split would leave this as a
|
||||
# single un-splittable chunk. It must still be split, with no spurious
|
||||
# spaces inserted between characters.
|
||||
block = "你好世界这是一个测试用例需要被切分"
|
||||
chunks = RAGFlowHtmlParser.chunk_block([block], chunk_token_num=3)
|
||||
|
||||
assert len(chunks) > 1
|
||||
assert "".join(chunks) == block
|
||||
assert all(" " not in c for c in chunks)
|
||||
assert all(_token_count(c) <= 3 for c in chunks)
|
||||
|
||||
|
||||
def test_small_blocks_are_merged_unchanged():
|
||||
# Blocks under the budget keep their original text and are merged.
|
||||
chunks = RAGFlowHtmlParser.chunk_block(["Alpha Beta", "Gamma"], chunk_token_num=512)
|
||||
|
||||
assert "Alpha Beta" in "".join(chunks)
|
||||
assert "Gamma" in "".join(chunks)
|
||||
Reference in New Issue
Block a user