From 091417980e68955ce6ce824694205445a42da0ab Mon Sep 17 00:00:00 2001 From: Yash Raj Pandey <55940078+devYRPauli@users.noreply.github.com> Date: Thu, 25 Jun 2026 04:43:35 -0400 Subject: [PATCH] fix(html_parser): preserve original text when splitting oversized blocks (#16052) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ### Bug `RAGFlowHtmlParser.chunk_block()` splits an oversized block by slicing the **tokenized** string and storing the joined tokens: ```python tks_str = rag_tokenizer.tokenize(block) ... tokens = tks_str.split(" ") while start < len(tokens): chunks.append(" ".join(tokens[start:start + chunk_token_num])) # tokenized form, not source ``` On the default (Elasticsearch) backend `rag_tokenizer.tokenize` transforms text: it lowercases/stems Latin words and inserts spaces between CJK characters. So any text block longer than `chunk_token_num` is stored as garbled, lowercased, space-segmented text instead of the source content. The small-block branch correctly stores the original `block`, so only oversized blocks are corrupted. Affects HTML and EPUB ingestion (both go through `chunk_block`), degrading retrieved chunks and the answers generated from them. ### Real tokenizer behavior (infinity-sdk 0.7.0, ES backend) ``` tokenize("Hello World FOO Bar Baz Qux Jumps") -> "hello world foo bar baz qux jump" # lowercased + stemmed tokenize("你好世界这是一个测试") -> "你好世界 这 是 一个 测试" # spaces inserted ``` ### Fix Split the **original** text: break it into atoms (whitespace-delimited runs for space-separated scripts, per-character for spaceless scripts such as Chinese) and pack them into pieces of at most `chunk_token_num` tokens. This preserves the source characters and still splits scripts that have no whitespace — a plain whitespace split would leave CJK as one un-splittable chunk. 
### Proof (real tokenizer, before/after) Running the old vs new split against the real `infinity.rag_tokenizer`: ``` ENGLISH "Hello World FOO Bar Baz Qux Jumps Over Lazy Dogs" (chunk_token_num=4) OLD: ['hello world foo bar', 'baz qux jump over', 'lazi dog'] # lowercased + stemmed NEW: ['Hello World FOO Bar ', 'Baz Qux Jumps Over ', 'Lazy Dogs'] # preserved; each <= 4 tokens NEW preserves text exactly: True CHINESE "你好世界这是一个测试用例需要被切分成多个块" (chunk_token_num=3) OLD: ['你好世界 这 是', '一个 测试用例 需要', ...] # spurious spaces NEW: ['你好世', '界这是', '一个测', ...] # preserved; each <= 3 tokens NEW preserves text exactly: True ``` ### Tests Added `test/unit_test/deepdoc/parser/test_html_parser.py` (English + Chinese oversized blocks, plus small-block merge). Before the fix the two oversized tests fail (English shows lowercasing, Chinese shows inserted spaces); after the fix all pass. `ruff check` clean. --- deepdoc/parser/html_parser.py | 85 +++++++++-- .../deepdoc/parser/test_html_parser.py | 142 ++++++++++++++++++ 2 files changed, 216 insertions(+), 11 deletions(-) create mode 100644 test/unit_test/deepdoc/parser/test_html_parser.py diff --git a/deepdoc/parser/html_parser.py b/deepdoc/parser/html_parser.py index 7462ad99e9..f8524abf36 100644 --- a/deepdoc/parser/html_parser.py +++ b/deepdoc/parser/html_parser.py @@ -16,6 +16,8 @@ # from rag.nlp import find_codec, rag_tokenizer +import logging +import re import uuid import chardet from bs4 import BeautifulSoup, NavigableString, Tag, Comment @@ -176,6 +178,74 @@ class RAGFlowHtmlParser: block_content.append(current_content) return block_content, table_info_list + # Characters from scripts written without spaces between words (CJK, kana, + # Hangul). These must be split per-character, since whitespace is not a + # usable word boundary for them. 
+ _SPACELESS = ( + "぀-ヿ" # Hiragana, Katakana + "㐀-䶿" # CJK Extension A + "一-鿿" # CJK Unified Ideographs + "豈-﫿" # CJK Compatibility Ideographs + "가-힯" # Hangul syllables + ) + _ATOM_RE = re.compile(r"[{s}]|[^\s{s}]+|\s+".format(s=_SPACELESS)) + + @classmethod + def _token_count(cls, text): + if not text: + return 0 + tks_str = rag_tokenizer.tokenize(text) + return len(tks_str.split(" ")) if tks_str else 0 + + @classmethod + def _split_oversized_block(cls, block, chunk_token_num): + # Split the ORIGINAL text into pieces of at most chunk_token_num tokens, + # preserving the source characters. Break on whitespace for + # space-delimited scripts and per-character for scripts that have no + # spaces (e.g. Chinese), so both are split without mangling the text. + pieces = [] + current = "" + current_tokens = 0 + # Spaceless scripts yield many repeated single-character atoms, so cache + # the token count per distinct atom to avoid re-tokenizing each one. + token_cache = {} + + def atom_token_count(atom): + if atom.isspace(): + return 0 + if atom not in token_cache: + token_cache[atom] = cls._token_count(atom) + return token_cache[atom] + + for atom in cls._ATOM_RE.findall(block): + atom_tokens = atom_token_count(atom) + if current and current_tokens + atom_tokens > chunk_token_num: + pieces.append(current) + current = "" + current_tokens = 0 + if atom_tokens > chunk_token_num and not atom.isspace(): + # A single atom longer than the budget (e.g. a very long + # unbroken token): fall back to fixed character windows. 
+ logging.debug( + "html_parser: atom of %d chars exceeds chunk_token_num=%d; " + "falling back to character windows", + len(atom), + chunk_token_num, + ) + for i in range(0, len(atom), chunk_token_num): + pieces.append(atom[i:i + chunk_token_num]) + continue + current += atom + current_tokens += atom_tokens + if current: + pieces.append(current) + logging.debug( + "html_parser: split oversized block of %d chars into %d pieces", + len(block), + len(pieces), + ) + return pieces + @classmethod def chunk_block(cls, block_txt_list, chunk_token_num=512): chunks = [] @@ -183,20 +253,13 @@ class RAGFlowHtmlParser: current_token_count = 0 for block in block_txt_list: - tks_str = rag_tokenizer.tokenize(block) - block_token_count = len(tks_str.split(" ")) if tks_str else 0 + block_token_count = cls._token_count(block) if block_token_count > chunk_token_num: if current_block: chunks.append(current_block) - start = 0 - tokens = tks_str.split(" ") - while start < len(tokens): - end = start + chunk_token_num - split_tokens = tokens[start:end] - chunks.append(" ".join(split_tokens)) - start = end - current_block = "" - current_token_count = 0 + current_block = "" + current_token_count = 0 + chunks.extend(cls._split_oversized_block(block, chunk_token_num)) else: if current_token_count + block_token_count <= chunk_token_num: current_block += ("\n" if current_block else "") + block diff --git a/test/unit_test/deepdoc/parser/test_html_parser.py b/test/unit_test/deepdoc/parser/test_html_parser.py new file mode 100644 index 0000000000..02aff9c67b --- /dev/null +++ b/test/unit_test/deepdoc/parser/test_html_parser.py @@ -0,0 +1,142 @@ +# +# Copyright 2025 The InfiniFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +"""Unit tests for RAGFlowHtmlParser.chunk_block. + +These cover the splitting of oversized text blocks, which must preserve the +original source text (the tokenizer lowercases / stems / segments text, so the +stored chunk must not be built from the tokenized form) and must split text in +scripts that have no whitespace word boundaries (e.g. Chinese). +""" + +import importlib.util +import os +import sys +from unittest import mock + +# Load html_parser by file path so we don't trigger deepdoc/parser/__init__.py +# (which pulls in heavy parsers) or the real rag.nlp tokenizer. The heavy +# optional modules are stubbed; rag.nlp is stubbed so the module imports, and +# the tokenizer is replaced after load with a deterministic fake below. 
+_MOCK_MODULES = [ + "xgboost", + "pdfplumber", + "huggingface_hub", + "PIL", + "PIL.Image", + "pypdf", + "sklearn", + "deepdoc.vision", + "infinity", + "infinity.rag_tokenizer", +] +for _m in _MOCK_MODULES: + if _m not in sys.modules: + sys.modules[_m] = mock.MagicMock() + +if "rag" not in sys.modules: + sys.modules["rag"] = mock.MagicMock() +if "rag.nlp" not in sys.modules: + sys.modules["rag.nlp"] = mock.MagicMock() + + +def _find_project_root(marker="pyproject.toml"): + d = os.path.dirname(os.path.abspath(__file__)) + while d != os.path.dirname(d): + if os.path.exists(os.path.join(d, marker)): + return d + d = os.path.dirname(d) + return None + + +_PROJECT_ROOT = _find_project_root() + +_html_spec = importlib.util.spec_from_file_location( + "deepdoc.parser.html_parser", + os.path.join(_PROJECT_ROOT, "deepdoc", "parser", "html_parser.py"), +) +_html_mod = importlib.util.module_from_spec(_html_spec) +sys.modules["deepdoc.parser.html_parser"] = _html_mod +_html_spec.loader.exec_module(_html_mod) + +RAGFlowHtmlParser = _html_mod.RAGFlowHtmlParser + + +class _FakeTokenizer: + """Deterministic stand-in for rag.nlp.rag_tokenizer. + + Mirrors the two behaviours the real tokenizer applies on the default + (Elasticsearch) backend and that this test depends on: it transforms the + text (lowercases Latin tokens) and segments spaceless scripts (CJK) into + per-character, space-separated tokens. tokenize() returns the same + space-joined string shape the real tokenizer returns. + """ + + @staticmethod + def tokenize(text): + spaced = [] + for ch in text: + if "一" <= ch <= "鿿": + spaced.append(" " + ch + " ") + else: + spaced.append(ch) + return " ".join(t.lower() for t in "".join(spaced).split()) + + +# Bind the deterministic tokenizer regardless of how rag.nlp resolved. 
+_html_mod.rag_tokenizer = _FakeTokenizer() + + +def _token_count(text): + return RAGFlowHtmlParser._token_count(text) + + +def test_oversized_english_block_preserves_original_text(): + # 8 latin tokens, budget 3 -> must be split into multiple chunks that keep + # the original casing (the tokenizer lowercases, so a tokenized-form chunk + # would be "hello world ..."). + block = "Hello World FOO Bar Baz Qux Lazy Dogs" + chunks = RAGFlowHtmlParser.chunk_block([block], chunk_token_num=3) + + assert len(chunks) > 1 + # Original text is preserved exactly (atoms partition the source). + assert "".join(chunks) == block + # Case is not mangled. + assert "Hello" in chunks[0] + assert all(c.lower() != c for c in chunks if any(ch.isalpha() for ch in c)) + # No chunk exceeds the token budget. + assert all(_token_count(c) <= 3 for c in chunks) + + +def test_oversized_chinese_block_is_split_and_preserved(): + # Chinese has no whitespace; a naive whitespace split would leave this as a + # single un-splittable chunk. It must still be split, with no spurious + # spaces inserted between characters. + block = "你好世界这是一个测试用例需要被切分" + chunks = RAGFlowHtmlParser.chunk_block([block], chunk_token_num=3) + + assert len(chunks) > 1 + assert "".join(chunks) == block + assert all(" " not in c for c in chunks) + assert all(_token_count(c) <= 3 for c in chunks) + + +def test_small_blocks_are_merged_unchanged(): + # Blocks under the budget keep their original text and are merged. + chunks = RAGFlowHtmlParser.chunk_block(["Alpha Beta", "Gamma"], chunk_token_num=512) + + assert "Alpha Beta" in "".join(chunks) + assert "Gamma" in "".join(chunks)