diff --git a/deepdoc/parser/html_parser.py b/deepdoc/parser/html_parser.py
index 7462ad99e9..f8524abf36 100644
--- a/deepdoc/parser/html_parser.py
+++ b/deepdoc/parser/html_parser.py
@@ -16,6 +16,8 @@
#
from rag.nlp import find_codec, rag_tokenizer
+import logging
+import re
import uuid
import chardet
from bs4 import BeautifulSoup, NavigableString, Tag, Comment
@@ -176,6 +178,74 @@ class RAGFlowHtmlParser:
block_content.append(current_content)
return block_content, table_info_list
+ # Characters from scripts written without spaces between words (CJK, kana,
+ # Hangul). These must be split per-character, since whitespace is not a
+ # usable word boundary for them.
+ _SPACELESS = (
+ "-ヿ" # Hiragana, Katakana
+ "㐀-䶿" # CJK Extension A
+ "一-鿿" # CJK Unified Ideographs
+ "豈-" # CJK Compatibility Ideographs
+ "가-" # Hangul syllables
+ )
+ _ATOM_RE = re.compile(r"[{s}]|[^\s{s}]+|\s+".format(s=_SPACELESS))
+
+ @classmethod
+ def _token_count(cls, text):
+ if not text:
+ return 0
+ tks_str = rag_tokenizer.tokenize(text)
+ return len(tks_str.split(" ")) if tks_str else 0
+
+ @classmethod
+ def _split_oversized_block(cls, block, chunk_token_num):
+ # Split the ORIGINAL text into pieces of at most chunk_token_num tokens,
+ # preserving the source characters. Break on whitespace for
+ # space-delimited scripts and per-character for scripts that have no
+ # spaces (e.g. Chinese), so both are split without mangling the text.
+ pieces = []
+ current = ""
+ current_tokens = 0
+ # Spaceless scripts yield many repeated single-character atoms, so cache
+ # the token count per distinct atom to avoid re-tokenizing each one.
+ token_cache = {}
+
+ def atom_token_count(atom):
+ if atom.isspace():
+ return 0
+ if atom not in token_cache:
+ token_cache[atom] = cls._token_count(atom)
+ return token_cache[atom]
+
+ for atom in cls._ATOM_RE.findall(block):
+ atom_tokens = atom_token_count(atom)
+ if current and current_tokens + atom_tokens > chunk_token_num:
+ pieces.append(current)
+ current = ""
+ current_tokens = 0
+ if atom_tokens > chunk_token_num and not atom.isspace():
+ # A single atom longer than the budget (e.g. a very long
+ # unbroken token): fall back to fixed character windows.
+ logging.debug(
+ "html_parser: atom of %d chars exceeds chunk_token_num=%d; "
+ "falling back to character windows",
+ len(atom),
+ chunk_token_num,
+ )
+ for i in range(0, len(atom), chunk_token_num):
+ pieces.append(atom[i:i + chunk_token_num])
+ continue
+ current += atom
+ current_tokens += atom_tokens
+ if current:
+ pieces.append(current)
+ logging.debug(
+ "html_parser: split oversized block of %d chars into %d pieces",
+ len(block),
+ len(pieces),
+ )
+ return pieces
+
@classmethod
def chunk_block(cls, block_txt_list, chunk_token_num=512):
chunks = []
@@ -183,20 +253,13 @@ class RAGFlowHtmlParser:
current_token_count = 0
for block in block_txt_list:
- tks_str = rag_tokenizer.tokenize(block)
- block_token_count = len(tks_str.split(" ")) if tks_str else 0
+ block_token_count = cls._token_count(block)
if block_token_count > chunk_token_num:
if current_block:
chunks.append(current_block)
- start = 0
- tokens = tks_str.split(" ")
- while start < len(tokens):
- end = start + chunk_token_num
- split_tokens = tokens[start:end]
- chunks.append(" ".join(split_tokens))
- start = end
- current_block = ""
- current_token_count = 0
+ current_block = ""
+ current_token_count = 0
+ chunks.extend(cls._split_oversized_block(block, chunk_token_num))
else:
if current_token_count + block_token_count <= chunk_token_num:
current_block += ("\n" if current_block else "") + block
diff --git a/test/unit_test/deepdoc/parser/test_html_parser.py b/test/unit_test/deepdoc/parser/test_html_parser.py
new file mode 100644
index 0000000000..02aff9c67b
--- /dev/null
+++ b/test/unit_test/deepdoc/parser/test_html_parser.py
@@ -0,0 +1,142 @@
+#
+# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+"""Unit tests for HtmlParser.chunk_block.
+
+These cover the splitting of oversized text blocks, which must preserve the
+original source text (the tokenizer lowercases / stems / segments text, so the
+stored chunk must not be built from the tokenized form) and must split text in
+scripts that have no whitespace word boundaries (e.g. Chinese).
+"""
+
+import importlib.util
+import os
+import sys
+from unittest import mock
+
+# Load html_parser by file path so we don't trigger deepdoc/parser/__init__.py
+# (which pulls in heavy parsers) or the real rag.nlp tokenizer. The heavy
+# optional modules are stubbed; rag.nlp is stubbed so the module imports, and
+# the tokenizer is replaced after load with a deterministic fake below.
+_MOCK_MODULES = [
+ "xgboost",
+ "pdfplumber",
+ "huggingface_hub",
+ "PIL",
+ "PIL.Image",
+ "pypdf",
+ "sklearn",
+ "deepdoc.vision",
+ "infinity",
+ "infinity.rag_tokenizer",
+]
+for _m in _MOCK_MODULES:
+ if _m not in sys.modules:
+ sys.modules[_m] = mock.MagicMock()
+
+if "rag" not in sys.modules:
+ sys.modules["rag"] = mock.MagicMock()
+if "rag.nlp" not in sys.modules:
+ sys.modules["rag.nlp"] = mock.MagicMock()
+
+
+def _find_project_root(marker="pyproject.toml"):
+ d = os.path.dirname(os.path.abspath(__file__))
+ while d != os.path.dirname(d):
+ if os.path.exists(os.path.join(d, marker)):
+ return d
+ d = os.path.dirname(d)
+ return None
+
+
+_PROJECT_ROOT = _find_project_root()
+
+_html_spec = importlib.util.spec_from_file_location(
+ "deepdoc.parser.html_parser",
+ os.path.join(_PROJECT_ROOT, "deepdoc", "parser", "html_parser.py"),
+)
+_html_mod = importlib.util.module_from_spec(_html_spec)
+sys.modules["deepdoc.parser.html_parser"] = _html_mod
+_html_spec.loader.exec_module(_html_mod)
+
+RAGFlowHtmlParser = _html_mod.RAGFlowHtmlParser
+
+
+class _FakeTokenizer:
+ """Deterministic stand-in for rag.nlp.rag_tokenizer.
+
+ Mirrors the two behaviours the real tokenizer applies on the default
+ (Elasticsearch) backend and that this test depends on: it transforms the
+ text (lowercases Latin tokens) and segments spaceless scripts (CJK) into
+ per-character, space-separated tokens. tokenize() returns the same
+ space-joined string shape the real tokenizer returns.
+ """
+
+ @staticmethod
+ def tokenize(text):
+ spaced = []
+ for ch in text:
+ if "一" <= ch <= "鿿":
+ spaced.append(" " + ch + " ")
+ else:
+ spaced.append(ch)
+ return " ".join(t.lower() for t in "".join(spaced).split())
+
+
+# Bind the deterministic tokenizer regardless of how rag.nlp resolved.
+_html_mod.rag_tokenizer = _FakeTokenizer()
+
+
+def _token_count(text):
+ return RAGFlowHtmlParser._token_count(text)
+
+
+def test_oversized_english_block_preserves_original_text():
+ # 8 latin tokens, budget 3 -> must be split into multiple chunks that keep
+ # the original casing (the tokenizer lowercases, so a tokenized-form chunk
+ # would be "hello world ...").
+ block = "Hello World FOO Bar Baz Qux Lazy Dogs"
+ chunks = RAGFlowHtmlParser.chunk_block([block], chunk_token_num=3)
+
+ assert len(chunks) > 1
+ # Original text is preserved exactly (atoms partition the source).
+ assert "".join(chunks) == block
+ # Case is not mangled.
+ assert "Hello" in chunks[0]
+ assert all(c.lower() != c for c in chunks if any(ch.isalpha() for ch in c))
+ # No chunk exceeds the token budget.
+ assert all(_token_count(c) <= 3 for c in chunks)
+
+
+def test_oversized_chinese_block_is_split_and_preserved():
+ # Chinese has no whitespace; a naive whitespace split would leave this as a
+ # single un-splittable chunk. It must still be split, with no spurious
+ # spaces inserted between characters.
+ block = "你好世界这是一个测试用例需要被切分"
+ chunks = RAGFlowHtmlParser.chunk_block([block], chunk_token_num=3)
+
+ assert len(chunks) > 1
+ assert "".join(chunks) == block
+ assert all(" " not in c for c in chunks)
+ assert all(_token_count(c) <= 3 for c in chunks)
+
+
+def test_small_blocks_are_merged_unchanged():
+ # Blocks under the budget keep their original text and are merged.
+ chunks = RAGFlowHtmlParser.chunk_block(["Alpha Beta", "Gamma"], chunk_token_num=512)
+
+ assert "Alpha Beta" in "".join(chunks)
+ assert "Gamma" in "".join(chunks)