mirror of
https://github.com/infiniflow/ragflow.git
synced 2026-06-29 15:31:05 +08:00
fix(html_parser): preserve original text when splitting oversized blocks (#16052)
### Bug
`RAGFlowHtmlParser.chunk_block()` splits an oversized block by slicing
the **tokenized** string and storing the joined tokens:
```python
tks_str = rag_tokenizer.tokenize(block)
...
tokens = tks_str.split(" ")
while start < len(tokens):
chunks.append(" ".join(tokens[start:start + chunk_token_num])) # tokenized form, not source
```
On the default (Elasticsearch) backend `rag_tokenizer.tokenize`
transforms text: it lowercases/stems Latin words and inserts spaces
between CJK characters. So any text block longer than `chunk_token_num`
is stored as garbled, lowercased, space-segmented text instead of the
source content. The small-block branch correctly stores the original
`block`, so only oversized blocks are corrupted. Affects HTML and EPUB
ingestion (both go through `chunk_block`), degrading retrieved chunks
and the answers generated from them.
### Real tokenizer behavior (infinity-sdk 0.7.0, ES backend)
```
tokenize("Hello World FOO Bar Baz Qux Jumps") -> "hello world foo bar baz qux jump" # lowercased + stemmed
tokenize("你好世界这是一个测试") -> "你好世界 这 是 一个 测试" # spaces inserted
```
### Fix
Split the **original** text: break it into atoms (whitespace-delimited
runs for space-separated scripts, per-character for spaceless scripts
such as Chinese) and pack them into pieces of at most `chunk_token_num`
tokens. This preserves the source characters and still splits scripts
that have no whitespace — a plain whitespace split would leave CJK as
one un-splittable chunk.
### Proof (real tokenizer, before/after)
Running the old vs new split against the real `infinity.rag_tokenizer`:
```
ENGLISH "Hello World FOO Bar Baz Qux Jumps Over Lazy Dogs" (chunk_token_num=4)
OLD: ['hello world foo bar', 'baz qux jump over', 'lazi dog'] # lowercased + stemmed
NEW: ['Hello World FOO Bar ', 'Baz Qux Jumps Over ', 'Lazy Dogs'] # preserved; each <= 4 tokens
NEW preserves text exactly: True
CHINESE "你好世界这是一个测试用例需要被切分成多个块" (chunk_token_num=3)
OLD: ['你好世界 这 是', '一个 测试用例 需要', ...] # spurious spaces
NEW: ['你好世', '界这是', '一个测', ...] # preserved; each <= 3 tokens
NEW preserves text exactly: True
```
### Tests
Added `test/unit_test/deepdoc/parser/test_html_parser.py` (English +
Chinese oversized blocks, plus small-block merge). Before the fix the
two oversized tests fail (English shows lowercasing, Chinese shows
inserted spaces); after the fix all pass. `ruff check` clean.
This commit is contained in:
@@ -16,6 +16,8 @@
|
||||
#
|
||||
|
||||
from rag.nlp import find_codec, rag_tokenizer
|
||||
import logging
|
||||
import re
|
||||
import uuid
|
||||
import chardet
|
||||
from bs4 import BeautifulSoup, NavigableString, Tag, Comment
|
||||
@@ -176,6 +178,74 @@ class RAGFlowHtmlParser:
|
||||
block_content.append(current_content)
|
||||
return block_content, table_info_list
|
||||
|
||||
# Characters from scripts written without spaces between words (CJK, kana,
|
||||
# Hangul). These must be split per-character, since whitespace is not a
|
||||
# usable word boundary for them.
|
||||
# Character-class body (ranges only, no surrounding brackets) covering the
# spaceless scripts. Written with explicit \u escapes so the range endpoints
# survive editors, diff viewers and mirrors that drop unassigned code points
# (U+3040, U+FAFF and U+D7AF are block boundaries with no assigned glyph).
_SPACELESS = (
    "\u3040-\u30ff"  # Hiragana, Katakana
    "\u3400-\u4dbf"  # CJK Extension A
    "\u4e00-\u9fff"  # CJK Unified Ideographs
    "\uf900-\ufaff"  # CJK Compatibility Ideographs
    "\uac00-\ud7af"  # Hangul syllables
)
# An atom is one of, in order of preference:
#   1. a single character from a spaceless script,
#   2. a run of characters that are neither whitespace nor spaceless-script,
#   3. a run of whitespace.
# Every character matches exactly one alternative, so joining the atoms
# returned by findall() reproduces the input text.
_ATOM_RE = re.compile(r"[{s}]|[^\s{s}]+|\s+".format(s=_SPACELESS))
|
||||
|
||||
@classmethod
def _token_count(cls, text):
    """Return how many space-separated tokens ``rag_tokenizer.tokenize`` yields.

    Empty or otherwise falsy ``text`` counts as 0 without calling the
    tokenizer, and so does a tokenizer result that is itself empty.
    """
    if not text:
        return 0
    tokenized = rag_tokenizer.tokenize(text)
    if not tokenized:
        return 0
    # The tokenizer output is split on single spaces to get the token list.
    return len(tokenized.split(" "))
|
||||
|
||||
@classmethod
def _split_oversized_block(cls, block, chunk_token_num):
    """Split ``block`` into pieces whose summed atom token counts fit the budget.

    The ORIGINAL text is split, not its tokenized form, so the source
    characters are preserved: joining the returned pieces gives back
    ``block``. The text is broken into atoms with ``cls._ATOM_RE``
    (whitespace-delimited runs for space-separated scripts, single
    characters for spaceless scripts such as Chinese) and the atoms are
    packed greedily.

    Args:
        block: Text to split.
        chunk_token_num: Token budget per piece. The budget is checked
            against the sum of per-atom token counts (whitespace atoms
            count as 0), not against a re-tokenization of the whole piece.

    Returns:
        A list of strings, in source order. An atom whose own token count
        exceeds the budget is emitted as fixed-size character windows.
    """
    pieces = []
    current_atoms = []
    current_tokens = 0
    # Width of the character-window fallback. It must be at least 1:
    # range() raises ValueError for a zero step and yields nothing for a
    # negative one, which would silently drop the atom's text.
    window = max(1, chunk_token_num)
    # Spaceless scripts yield many repeated single-character atoms, so cache
    # the token count per distinct atom to avoid re-tokenizing each one.
    token_cache = {}

    def atom_token_count(atom):
        if atom.isspace():
            return 0
        if atom not in token_cache:
            token_cache[atom] = cls._token_count(atom)
        return token_cache[atom]

    for atom in cls._ATOM_RE.findall(block):
        atom_tokens = atom_token_count(atom)
        # Flush the piece being built if this atom would push it over budget.
        if current_atoms and current_tokens + atom_tokens > chunk_token_num:
            pieces.append("".join(current_atoms))
            current_atoms = []
            current_tokens = 0
        if atom_tokens > chunk_token_num and not atom.isspace():
            # A single atom longer than the budget (e.g. a very long
            # unbroken token): fall back to fixed character windows.
            logging.debug(
                "html_parser: atom of %d chars exceeds chunk_token_num=%d; "
                "falling back to character windows",
                len(atom),
                chunk_token_num,
            )
            pieces.extend(atom[i:i + window] for i in range(0, len(atom), window))
            continue
        current_atoms.append(atom)
        current_tokens += atom_tokens

    if current_atoms:
        pieces.append("".join(current_atoms))
    logging.debug(
        "html_parser: split oversized block of %d chars into %d pieces",
        len(block),
        len(pieces),
    )
    return pieces
|
||||
|
||||
@classmethod
|
||||
def chunk_block(cls, block_txt_list, chunk_token_num=512):
|
||||
chunks = []
|
||||
@@ -183,20 +253,13 @@ class RAGFlowHtmlParser:
|
||||
current_token_count = 0
|
||||
|
||||
for block in block_txt_list:
|
||||
tks_str = rag_tokenizer.tokenize(block)
|
||||
block_token_count = len(tks_str.split(" ")) if tks_str else 0
|
||||
block_token_count = cls._token_count(block)
|
||||
if block_token_count > chunk_token_num:
|
||||
if current_block:
|
||||
chunks.append(current_block)
|
||||
start = 0
|
||||
tokens = tks_str.split(" ")
|
||||
while start < len(tokens):
|
||||
end = start + chunk_token_num
|
||||
split_tokens = tokens[start:end]
|
||||
chunks.append(" ".join(split_tokens))
|
||||
start = end
|
||||
current_block = ""
|
||||
current_token_count = 0
|
||||
current_block = ""
|
||||
current_token_count = 0
|
||||
chunks.extend(cls._split_oversized_block(block, chunk_token_num))
|
||||
else:
|
||||
if current_token_count + block_token_count <= chunk_token_num:
|
||||
current_block += ("\n" if current_block else "") + block
|
||||
|
||||
Reference in New Issue
Block a user