From 091417980e68955ce6ce824694205445a42da0ab Mon Sep 17 00:00:00 2001
From: Yash Raj Pandey <55940078+devYRPauli@users.noreply.github.com>
Date: Thu, 25 Jun 2026 04:43:35 -0400
Subject: [PATCH] fix(html_parser): preserve original text when splitting
 oversized blocks (#16052)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

### Bug

`RAGFlowHtmlParser.chunk_block()` splits an oversized block by slicing
the **tokenized** string and storing the joined tokens:

```python
tks_str = rag_tokenizer.tokenize(block)
...
tokens = tks_str.split(" ")
while start < len(tokens):
    chunks.append(" ".join(tokens[start:start + chunk_token_num]))  # tokenized form, not source
    start += chunk_token_num
```

On the default (Elasticsearch) backend `rag_tokenizer.tokenize`
transforms text: it lowercases/stems Latin words and inserts spaces
between CJK characters. So any text block longer than `chunk_token_num`
tokens is stored as garbled (lowercased, stemmed, space-segmented) text
instead of the source content. The small-block branch correctly stores the original
`block`, so only oversized blocks are corrupted. Affects HTML and EPUB
ingestion (both go through `chunk_block`), degrading retrieved chunks
and the answers generated from them.

### Real tokenizer behavior (infinity-sdk 0.7.0, ES backend)

```
tokenize("Hello World FOO Bar Baz Qux Jumps")  -> "hello world foo bar baz qux jump"   # lowercased + stemmed
tokenize("你好世界这是一个测试")                 -> "你好世界 这 是 一个 测试"            # spaces inserted
```

### Fix

Split the **original** text: break it into atoms (whitespace-delimited
runs for space-separated scripts, per-character for spaceless scripts
such as Chinese) and pack them into pieces of at most `chunk_token_num`
tokens. This preserves the source characters and still splits scripts
that have no whitespace — a plain whitespace split would leave CJK as
one un-splittable chunk.

### Proof (real tokenizer, before/after)

Running the old vs new split against the real `infinity.rag_tokenizer`:

```
ENGLISH "Hello World FOO Bar Baz Qux Jumps Over Lazy Dogs"  (chunk_token_num=4)
  OLD: ['hello world foo bar', 'baz qux jump over', 'lazi dog']          # lowercased + stemmed
  NEW: ['Hello World FOO Bar ', 'Baz Qux Jumps Over ', 'Lazy Dogs']      # preserved; each <= 4 tokens
  NEW preserves text exactly: True

CHINESE "你好世界这是一个测试用例需要被切分成多个块"  (chunk_token_num=3)
  OLD: ['你好世界 这 是', '一个 测试用例 需要', ...]                      # spurious spaces
  NEW: ['你好世', '界这是', '一个测', ...]                               # preserved; each <= 3 tokens
  NEW preserves text exactly: True
```

### Tests

Added `test/unit_test/deepdoc/parser/test_html_parser.py` (English +
Chinese oversized blocks, plus small-block merge). Before the fix the
two oversized tests fail (English shows lowercasing, Chinese shows
inserted spaces); after the fix all pass. `ruff check` clean.
---
 deepdoc/parser/html_parser.py                 |  85 +++++++++--
 .../deepdoc/parser/test_html_parser.py        | 142 ++++++++++++++++++
 2 files changed, 216 insertions(+), 11 deletions(-)
 create mode 100644 test/unit_test/deepdoc/parser/test_html_parser.py

diff --git a/deepdoc/parser/html_parser.py b/deepdoc/parser/html_parser.py
index 7462ad99e9..f8524abf36 100644
--- a/deepdoc/parser/html_parser.py
+++ b/deepdoc/parser/html_parser.py
@@ -16,6 +16,8 @@
 #
 
 from rag.nlp import find_codec, rag_tokenizer
+import logging
+import re
 import uuid
 import chardet
 from bs4 import BeautifulSoup, NavigableString, Tag, Comment
@@ -176,6 +178,74 @@ class RAGFlowHtmlParser:
             block_content.append(current_content)
         return block_content, table_info_list
 
+    # Characters split per-character. CJK ideographs and kana are written
+    # without spaces between words; Hangul syllables are treated the same way
+    # here, although Korean text does separate words with spaces.
+    _SPACELESS = (
+        "぀-ヿ"  # Hiragana, Katakana
+        "㐀-䶿"  # CJK Extension A
+        "一-鿿"  # CJK Unified Ideographs
+        "豈-﫿"  # CJK Compatibility Ideographs
+        "가-힯"  # Hangul syllables
+    )
+    _ATOM_RE = re.compile(r"[{s}]|[^\s{s}]+|\s+".format(s=_SPACELESS))
+
+    @classmethod
+    def _token_count(cls, text):
+        if not text:
+            return 0
+        tks_str = rag_tokenizer.tokenize(text)
+        return len(tks_str.split(" ")) if tks_str else 0
+
+    @classmethod
+    def _split_oversized_block(cls, block, chunk_token_num):
+        # Split the ORIGINAL text into pieces of at most chunk_token_num tokens,
+        # preserving the source characters. Break on whitespace for
+        # space-delimited scripts and per-character for scripts that have no
+        # spaces (e.g. Chinese), so both are split without mangling the text.
+        pieces = []
+        current = ""
+        current_tokens = 0
+        # Spaceless scripts yield many repeated single-character atoms, so cache
+        # the token count per distinct atom to avoid re-tokenizing each one.
+        token_cache = {}
+
+        def atom_token_count(atom):
+            if atom.isspace():
+                return 0
+            if atom not in token_cache:
+                token_cache[atom] = cls._token_count(atom)
+            return token_cache[atom]
+
+        for atom in cls._ATOM_RE.findall(block):
+            atom_tokens = atom_token_count(atom)
+            if current and current_tokens + atom_tokens > chunk_token_num:
+                pieces.append(current)
+                current = ""
+                current_tokens = 0
+            if atom_tokens > chunk_token_num and not atom.isspace():
+                # A single atom longer than the budget (e.g. a very long
+                # unbroken token): fall back to fixed character windows.
+                logging.debug(
+                    "html_parser: atom of %d chars exceeds chunk_token_num=%d; "
+                    "falling back to character windows",
+                    len(atom),
+                    chunk_token_num,
+                )
+                for i in range(0, len(atom), chunk_token_num):
+                    pieces.append(atom[i:i + chunk_token_num])
+                continue
+            current += atom
+            current_tokens += atom_tokens
+        if current:
+            pieces.append(current)
+        logging.debug(
+            "html_parser: split oversized block of %d chars into %d pieces",
+            len(block),
+            len(pieces),
+        )
+        return pieces
+
     @classmethod
     def chunk_block(cls, block_txt_list, chunk_token_num=512):
         chunks = []
@@ -183,20 +253,13 @@ class RAGFlowHtmlParser:
         current_token_count = 0
 
         for block in block_txt_list:
-            tks_str = rag_tokenizer.tokenize(block)
-            block_token_count = len(tks_str.split(" ")) if tks_str else 0
+            block_token_count = cls._token_count(block)
             if block_token_count > chunk_token_num:
                 if current_block:
                     chunks.append(current_block)
-                start = 0
-                tokens = tks_str.split(" ")
-                while start < len(tokens):
-                    end = start + chunk_token_num
-                    split_tokens = tokens[start:end]
-                    chunks.append(" ".join(split_tokens))
-                    start = end
-                current_block = ""
-                current_token_count = 0
+                    current_block = ""
+                    current_token_count = 0
+                chunks.extend(cls._split_oversized_block(block, chunk_token_num))
             else:
                 if current_token_count + block_token_count <= chunk_token_num:
                     current_block += ("\n" if current_block else "") + block
diff --git a/test/unit_test/deepdoc/parser/test_html_parser.py b/test/unit_test/deepdoc/parser/test_html_parser.py
new file mode 100644
index 0000000000..02aff9c67b
--- /dev/null
+++ b/test/unit_test/deepdoc/parser/test_html_parser.py
@@ -0,0 +1,142 @@
+#
+#  Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+
+"""Unit tests for HtmlParser.chunk_block.
+
+These cover the splitting of oversized text blocks, which must preserve the
+original source text (the tokenizer lowercases / stems / segments text, so the
+stored chunk must not be built from the tokenized form) and must split text in
+scripts that have no whitespace word boundaries (e.g. Chinese).
+"""
+
+import importlib.util
+import os
+import sys
+from unittest import mock
+
+# Load html_parser by file path so we don't trigger deepdoc/parser/__init__.py
+# (which pulls in heavy parsers) or the real rag.nlp tokenizer. The heavy
+# optional modules are stubbed; rag.nlp is stubbed so the module imports, and
+# the tokenizer is replaced after load with a deterministic fake below.
+_MOCK_MODULES = [
+    "xgboost",
+    "pdfplumber",
+    "huggingface_hub",
+    "PIL",
+    "PIL.Image",
+    "pypdf",
+    "sklearn",
+    "deepdoc.vision",
+    "infinity",
+    "infinity.rag_tokenizer",
+]
+for _m in _MOCK_MODULES:
+    if _m not in sys.modules:
+        sys.modules[_m] = mock.MagicMock()
+
+if "rag" not in sys.modules:
+    sys.modules["rag"] = mock.MagicMock()
+if "rag.nlp" not in sys.modules:
+    sys.modules["rag.nlp"] = mock.MagicMock()
+
+
+def _find_project_root(marker="pyproject.toml"):
+    d = os.path.dirname(os.path.abspath(__file__))
+    while d != os.path.dirname(d):
+        if os.path.exists(os.path.join(d, marker)):
+            return d
+        d = os.path.dirname(d)
+    return None
+
+
+_PROJECT_ROOT = _find_project_root()
+
+_html_spec = importlib.util.spec_from_file_location(
+    "deepdoc.parser.html_parser",
+    os.path.join(_PROJECT_ROOT, "deepdoc", "parser", "html_parser.py"),
+)
+_html_mod = importlib.util.module_from_spec(_html_spec)
+sys.modules["deepdoc.parser.html_parser"] = _html_mod
+_html_spec.loader.exec_module(_html_mod)
+
+RAGFlowHtmlParser = _html_mod.RAGFlowHtmlParser
+
+
+class _FakeTokenizer:
+    """Deterministic stand-in for rag.nlp.rag_tokenizer.
+
+    Mirrors the two behaviours the real tokenizer applies on the default
+    (Elasticsearch) backend and that this test depends on: it transforms the
+    text (lowercases Latin tokens) and segments spaceless scripts (CJK) into
+    per-character, space-separated tokens. tokenize() returns the same
+    space-joined string shape the real tokenizer returns.
+    """
+
+    @staticmethod
+    def tokenize(text):
+        spaced = []
+        for ch in text:
+            if "一" <= ch <= "鿿":
+                spaced.append(" " + ch + " ")
+            else:
+                spaced.append(ch)
+        return " ".join(t.lower() for t in "".join(spaced).split())
+
+
+# Bind the deterministic tokenizer regardless of how rag.nlp resolved.
+_html_mod.rag_tokenizer = _FakeTokenizer()
+
+
+def _token_count(text):
+    return RAGFlowHtmlParser._token_count(text)
+
+
+def test_oversized_english_block_preserves_original_text():
+    # 8 latin tokens, budget 3 -> must be split into multiple chunks that keep
+    # the original casing (the tokenizer lowercases, so a tokenized-form chunk
+    # would be "hello world ...").
+    block = "Hello World FOO Bar Baz Qux Lazy Dogs"
+    chunks = RAGFlowHtmlParser.chunk_block([block], chunk_token_num=3)
+
+    assert len(chunks) > 1
+    # Original text is preserved exactly (atoms partition the source).
+    assert "".join(chunks) == block
+    # Case is not mangled.
+    assert "Hello" in chunks[0]
+    assert all(c.lower() != c for c in chunks if any(ch.isalpha() for ch in c))
+    # No chunk exceeds the token budget.
+    assert all(_token_count(c) <= 3 for c in chunks)
+
+
+def test_oversized_chinese_block_is_split_and_preserved():
+    # Chinese has no whitespace; a naive whitespace split would leave this as a
+    # single un-splittable chunk. It must still be split, with no spurious
+    # spaces inserted between characters.
+    block = "你好世界这是一个测试用例需要被切分"
+    chunks = RAGFlowHtmlParser.chunk_block([block], chunk_token_num=3)
+
+    assert len(chunks) > 1
+    assert "".join(chunks) == block
+    assert all(" " not in c for c in chunks)
+    assert all(_token_count(c) <= 3 for c in chunks)
+
+
+def test_small_blocks_are_merged_unchanged():
+    # Blocks under the budget keep their original text and are merged.
+    chunks = RAGFlowHtmlParser.chunk_block(["Alpha Beta", "Gamma"], chunk_token_num=512)
+
+    assert "Alpha Beta" in "".join(chunks)
+    assert "Gamma" in "".join(chunks)